~dricottone/fmg-timesheets

c041ec57595dc0e315a64b892633df30acf57509 — Dominic Ricottone 2 years ago
Initial commit
6 files changed, 288 insertions(+), 0 deletions(-)

A Makefile
A main.py
A notes
A parse.py
A parser/html.py
A parser/pdf.py
A  => Makefile +8 -0
@@ 1,8 @@
# Delete the Python bytecode cache directory in the current directory.
.PHONY: clean
clean:
	rm --force --recursive __pycache__

# Run main.py on every PDF matching ~/web/*.pdf.
.PHONY: process
process:
	python3 main.py ~/web/*.pdf


A  => main.py +23 -0
@@ 1,23 @@
#!/usr/bin/env python3

import sys
import pathlib

import parse

def main(filelist):
    """Parse every timesheet named in `filelist`.

    `filelist` is a sequence of paths to timesheet files. Each one is handed
    to `parse.timesheet` in order. An empty sequence is valid and results in
    no parsing at all.
    """
    print(f"processing {len(filelist)} files")
    # Iterate over the whole list rather than a single hard-coded index, so
    # the count printed above matches the work done and short lists do not
    # raise IndexError.
    for filename in filelist:
        parse.timesheet(filename)

if __name__ == "__main__":
    # Keep only the command-line arguments that name existing paths, and
    # report each one that does not.
    filelist = []
    for filename in sys.argv[1:]:
        filepath = pathlib.Path(filename)
        if not filepath.exists():
            print(f"no such file: '{filename}'")
            continue
        filelist.append(filepath)

    main(filelist)


A  => notes +63 -0
@@ 1,63 @@
index                   20px
timecode (usually "ST") 40px
project                 39px
timetype                201px



day1     571px or 572px
day2     597px
day3     622px
day4     646px or 647px
day5     671px or 672px
day6     696px or 697px
day7     721px or 722px
rowtotal 751px or 753px or 756px

document header:
"Timesheet"
"Mon N, YYYY - Mon N, YYYY"
"Location:"
"[E01] Fors Marsh Group"
"Department:"
"[3200] Advanced Analytics"
"Employee Type:"
"[1] Annual Salary"
"Location (Default"
"[LOCAL] Location"
"Function:"
"Exempt:"
"Status:"
"[1] Full Time"
"Yes"
"Approved"
"Post Status:"
"Validation:"
"Date/Time:"
"Posted"
"Passed"
"Mon N, YYYY HH:MM"
"Total Timesheet:"
"Standard Hours:"
"Total Billable:"
"Percent Billability:"
<total hours on spreadsheet>
<minimum hours for spreadsheet>
<total billable hours on spreadsheet>
<total billable hours / total hours>

left column header:
"ID"
"Time Code"
"Project"
"TimeType"

right column header:
"Mon"
"Tues Wed"
"Thu"
"Fri"
"Sat"
"Sun"
(or some cycled version of this)


A  => parse.py +66 -0
@@ 1,66 @@
#!/usr/bin/env python3

"""The parsers are developed, debugged, and refactored in this file.

When they mature, I refactor them into standalone modules under the `parser`
directory.

Eventually, the entire parse step will mature and be abstracted into a
single function call, which will be appropriate to call in `main.py`.

If you can see this file, then I'm not done yet.
"""

from pprint import pprint

from parser.html import parse as parse_html
from parser.pdf import parse as parse_pdf

def read_timesheet(filename):
    """Read the file at `filename` through the PDF parser, then pass that
    result through the HTML parser and return what it produces.
    """
    return parse_html(parse_pdf(filename))

def has_style_left(attrs):
    """Return True when `attrs` has a `style` entry that itself contains a
    `left` key.
    """
    if "style" not in attrs:
        return False
    return "left" in attrs["style"].keys()

def update_count(counters, key):
    """Increment the tally stored under `key` in `counters`, starting from 1
    when the key is new. The same dictionary is modified and returned.
    """
    counters[key] = counters.get(key, 0) + 1
    return counters

def parse_timesheet(data):
    """Scan parser events and print positioned text.

    Looks for a `div` start tag whose style has a `left` value, immediately
    followed by a `span` start tag, immediately followed by a data event.
    For each such run, prints the `left` value next to the first line of the
    data. Any other event resets the search.

    Always returns an empty list.
    """
    # One of: "idle" (looking for a div), "div" (just saw a qualifying div),
    # "span" (just saw the span that follows it).
    state = "idle"
    left = ""

    for line in data:
        kind = line[0]
        if state == "span":
            if kind == "DATA":
                print(f"{left:10} {line[2][0]}")
            # Whatever followed the span, start over.
            state = "idle"
        elif state == "div":
            if kind == "START" and line[1] == "span":
                state = "span"
            else:
                state = "idle"
        elif kind == "START" and line[1] == "div" and has_style_left(line[2]):
            left = line[2]["style"]["left"]
            state = "div"

    return []

def extract_projects(structured_data):
    """Placeholder: ignores `structured_data` and returns an empty list."""
    return []

def timesheet(filename):
    """Run the read, parse, and extract steps on the timesheet at `filename`
    and return the result of the extract step.
    """
    return extract_projects(parse_timesheet(read_timesheet(filename)))


A  => parser/html.py +86 -0
@@ 1,86 @@
#!/usr/bin/env python3

# Crash Course on html.parser
#
# A SAX-style parser. Hook into tags and data like...
#
# ```
# from html.parser import HTMLParser
# class MyHTMLParser(HTMLParser):
#    def handle_starttag(self, tag, attrs):
#        #do something...
#    def handle_endtag(self, tag):
#        #do something...
#    def handle_data(self, data):
#        #do something...
# ```
#
# Valid HTML is fed into the parser like...
#
# ```
# parser = MyHTMLParser()
# parser.feed(html)
# ```

from html.parser import HTMLParser

def parse_attrs_string(attrs):
    """Parse a string structured like `key1:value1;key2:value2;`.

    Embedded CSS (as in `style` attributes) can look like this.

    Only the first colon in each declaration separates the key from the
    value, so values that themselves contain colons (e.g. `url(http://...)`)
    are kept intact. Keys and values are stripped of surrounding whitespace.

    Empty input yields an empty dictionary; this includes `None`, which the
    HTML parser supplies for an attribute written without a value.
    Declarations that contain no colon at all are skipped.
    """
    attrs_dict = {}
    if not attrs:
        return attrs_dict
    for pair in attrs.split(";"):
        key, separator, value = pair.partition(":")
        if not separator:
            # Blank segment (e.g. after a trailing `;`) or a malformed
            # declaration with no colon; nothing to record.
            continue
        attrs_dict[key.strip()] = value.strip()
    return attrs_dict

def parse_attrs_doubles(attrs):
    """Build an attribute dictionary from an iterable of `(key, value)` pairs.

    This is the shape in which `html.parser.HTMLParser` passes attributes to
    its `handle_starttag` hook. The value of a `style` attribute is expanded
    with `parse_attrs_string`; every other value is stored unchanged. When a
    key repeats, the last occurrence wins.
    """
    return {
        key: parse_attrs_string(value) if key == "style" else value
        for key, value in attrs
    }

class TimesheetHTMLParser(HTMLParser):
    """An `html.parser.HTMLParser` subclass that records parse events for my
    timesheets.

    Each start tag, end tag, and run of text is appended to an internal list
    in the order it is encountered. Retrieve that list with the `dump`
    method.

    Remember to close the parser instance when you are finished with it.
    """

    def __init__(self):
        super().__init__()
        self._data = []

    def handle_starttag(self, tag, _attrs):
        # Stored as ["START", tag, attribute dictionary].
        event = ["START", tag, parse_attrs_doubles(_attrs)]
        self._data.append(event)

    def handle_endtag(self, tag):
        # Stored as ["END", tag].
        event = ["END", tag]
        self._data.append(event)

    def handle_data(self, data):
        # Stored as ["DATA", "", list of the text's lines].
        event = ["DATA", "", data.splitlines()]
        self._data.append(event)

    def dump(self):
        """Return the list of recorded events (the internal list itself)."""
        return self._data

def parse(html):
    """Read an HTML-encoded string into semi-structured data.

    Feeds `html` to a fresh `TimesheetHTMLParser` and returns the event list
    it recorded. The parser is closed whether or not feeding succeeds.
    """
    parser = TimesheetHTMLParser()
    try:
        parser.feed(html)
        # The returned object is the parser's own list, so anything recorded
        # while closing below is still visible to the caller.
        return parser.dump()
    finally:
        parser.close()


A  => parser/pdf.py +42 -0
@@ 1,42 @@
#!/usr/bin/env python3

from io import StringIO

# Crash Course on pdfminer
#
# Extract text from PDFs like...
#
# ```
# from pdfminer.high_level import extract_text
# with open(filename, "rb") as f:
#   text = extract_text(f)
# ```
#
# The alternative is to use something like...
#
# ```
# from io import StringIO
# from pdfminer.high_level import extract_text_to_fp
# from pdfminer.layout import LAParams
# buffer = StringIO()
# with open(filename, "rb") as f:
#   extract_text_to_fp(f, buffer, laparams=LAParams(), output_type="html", codec=None)
# html = buffer.getvalue()
# ```

from pdfminer.high_level import extract_text_to_fp
from pdfminer.layout import LAParams

def parse(filename):
    """Convert the binary PDF file at `filename` into an HTML-encoded string.

    The file is opened in binary mode and rendered by pdfminer into an
    in-memory text buffer, whose contents are returned.
    """
    with StringIO() as buffer:
        with open(filename, "rb") as f:
            extract_text_to_fp(
                f,
                buffer,
                laparams=LAParams(),
                output_type="html",
                codec=None,
            )
        return buffer.getvalue()