~dricottone/fmg-timesheets

9efdae59717e3d48470a4834470072a86f5fb706 — Dominic Ricottone 2 years ago 7326d28
Implemented time entry extraction; no assert errors!

There is still a major issue ahead of 'structured' data:
Hours data is leaking between entries. There are entries with no hours
at all. There are almost certainly some entries that have hours out of
order.

Before processing, it will likely be necessary to re-sort all items by
their `top`, then `left`, style attributes. This is going to have the
consequence of invalidating some of the work I've already put into
parsing the data as-is.

Good luck, future me.
3 files changed, 138 insertions(+), 44 deletions(-)

M parser/html.py
M parser/timeentry.py
M parser/timesheet.py
M parser/html.py => parser/html.py +12 -4
@@ 56,6 56,9 @@ def parse_attrs_doubles(attrs):
def has_style_left(attrs):
    """Return True when *attrs* has a ``style`` entry that contains ``left``.

    Both *attrs* and ``attrs["style"]`` are queried through ``.keys()``, so
    each is expected to be a mapping.
    """
    if "style" not in attrs.keys():
        return False
    return "left" in attrs["style"].keys()

def has_style_top(attrs):
    """Return True when *attrs* has a ``style`` entry that contains ``top``.

    Both *attrs* and ``attrs["style"]`` are queried through ``.keys()``, so
    each is expected to be a mapping.
    """
    if "style" not in attrs.keys():
        return False
    return "top" in attrs["style"].keys()

class TimesheetHTMLParser(HTMLParser):
    """A specialization of the `html.parser.HTMLParser` class to handle my
    timesheets.


@@ 66,6 69,7 @@ class TimesheetHTMLParser(HTMLParser):
    """
    def __init__(self):
        HTMLParser.__init__(self)
        self._top = 0
        self._left = 0
        self._in_div = False
        self._in_span = False


@@ 78,9 82,13 @@ class TimesheetHTMLParser(HTMLParser):
                self._in_span = True
            else:
                self._in_div = False
        elif tag == "div" and has_style_left(attrs):
            self._left = int(attrs["style"]["left"].removesuffix("px"))
            self._in_div = True
        elif tag == "div":
            if has_style_left(attrs):
                self._left = int(attrs["style"]["left"].removesuffix("px"))
                self._in_div = True
            if has_style_top(attrs):
                self._top = int(attrs["style"]["top"].removesuffix("px"))
                self._in_div = True
        else:
            self._in_span = False
            self._in_div = False


@@ 91,7 99,7 @@ class TimesheetHTMLParser(HTMLParser):

    def handle_data(self, data):
        """Record the first line of *data* while the parser is inside a span.

        Records are appended to ``self._data``. The span and div flags are
        cleared afterwards whether or not anything was recorded.
        """
        if self._in_span:
            # NOTE(review): two appends with different tuple shapes appear
            # here -- (text, left) and (text, left, top). This looks like the
            # removed and the added line of the diff shown together; confirm
            # that only the three-element form is current.
            self._data.append((data.splitlines()[0], self._left, ))
            self._data.append((data.splitlines()[0], self._left, self._top))
        self._in_span = False
        self._in_div = False


M parser/timeentry.py => parser/timeentry.py +118 -38
@@ 2,16 2,21 @@

from re import compile as re_compile

# Raw strings keep `\.` as a regex escape instead of an (invalid) string
# escape, which newer Python versions warn about.

# Project codes: OHORG.<MTG|ONB|PFR|TPD|TOH>.NN.00N, UNALW.EMW.0N.00N,
# FSERV.PPM.00.000, or a numeric code shaped NNNNN.NNN.NN.NNN.
PROJECT_PATTERN = re_compile(r"(OHORG\.(MTG|ONB|PFR|TPD|TOH)\.[01][0-9]\.00[0-9]|UNALW\.EMW\.0[0-9]\.00[0-9]|FSERV\.PPM\.00\.000|[0-9]{5}\.[0-9]{3}\.[0-9]{2}\.[0-9]{3})")
# Time types: the literal FSERV, or a run of three to five digits.
TIMETYPE_PATTERN = re_compile(r"(FSERV|[0-9]{3,5})")
# Dates written like "May 16, 2019": three letters, a one- or two-digit day,
# a comma, and a year beginning with 20.
DATE_PATTERN = re_compile(r"[ADFJMNOS][aceopu][bcglnprtvy] [0-9][0-9]?, 20[12][89012]")

# Patterns that are looked up by position (``ENTRY_PATTERNS[index]``).
# Raw strings keep `\.` as a regex escape rather than an invalid string
# escape; the pattern text itself is unchanged.
# NOTE(review): this listing holds near-duplicate entries (two time-code
# patterns, two project patterns, several date variants). It looks like
# superseded and replacement lines shown side by side -- confirm which
# entries are current before relying on the indices.
ENTRY_PATTERNS = (
    re_compile(r"[1-9][0-9]?"),
    re_compile(r"(ST|VAC|HOL)"),
    re_compile(r"(OHORG\.(MTG|ONB)\.0[0-5]\.00[0-5]|UNALW\.EMW\.0[0-5]\.00[0-5]|[0-9]{5}\.[0-9]{3}\.[0-9]{2}\.[0-9]{3})"),
    re_compile(r"[0-9]{3,5}"),
    re_compile(r"(HOL|OTU|ST|VAC)"),
    re_compile(r"(OHORG\.(MTG|ONB|PFR|TPD|TOH)\.[01][0-9]\.00[0-9]|UNALW\.EMW\.0[0-9]\.00[0-9]|FSERV\.PPM\.00\.000|[0-9]{5}\.[0-9]{3}\.[0-9]{2}\.[0-9]{3})"),
    re_compile(r"(FSERV|[0-9]{3,5})"),
    re_compile(r"."),
    re_compile(r"Notes"),
    re_compile(r"[ADFJMNOS][aceopu][bcglnprtvy] [1-9][0-9]?, 20[12][89012]"),
    re_compile(r"- [ADFJMNOS][aceopu][bcglnprtvy] [1-9][0-9]?, 20[12][89012]"),
    re_compile(r"[ADFJMNOS][aceopu][bcglnprtvy] [0-9][0-9]?, 20[12][89012] -"),
    re_compile(r"[ADFJMNOS][aceopu][bcglnprtvy] [0-9][0-9]?, 20[12][89012] Other"),
    re_compile(r"."),
    re_compile(r"Approved"),
)

class TimeEntry(object):


@@ 20,7 25,10 @@ class TimeEntry(object):
        self._timesheet = timesheet_name
        self._entry = entry_number
        self._data = self.beat_data_with_a_stick(semistructured_data)
        self.assert_entry()
        self.parse_entry_number()
        self.parse_entry_project()
        self.parse_entry_notes()
        self.parse_entry_approval()

    def __str__(self):
        return f"entry #{self._entry} in the timesheet for {self._timesheet}"


@@ 39,46 47,118 @@ class TimeEntry(object):
    def log_issue(self, *parts):
        self._issues.append(''.join(parts))

    def beat_data_with_a_stick(self, data):
        """Pad or split the leading items of *data* so they line up by position.

        Each item is a tuple whose first element is its text; any remaining
        elements are carried along unchanged. Returns the adjusted list,
        which is the input object itself when neither rule applies.

        NOTE(review): another definition with this name appears later in
        this view. If both sit in the same class body the later one takes
        effect -- confirm which version is current.
        """
        # Project and TimeType fields can get merged
        if " " in data[2][0]:
            project_timetype = data[2][0].split(" ", 1)
            # Both halves inherit the remaining elements of the merged item
            shared = [n for n in data[2][1:]]
            split = [
                (project_timetype[0], *shared, ),
                (project_timetype[1], *shared, ),
            ]
            data = data[:2] + split + data[3:]

        # Holiday and Vacation time entries skip the Project and TimeType fields
        if data[1][0] in ("HOL", "VAC", ):
            # Insert two empty placeholder items after the first item
            data = data[0:1] + [("", 0, ), ("", 0, )] + data[1:]

        return data
    def assert_entry_item_any(self, index, label, should_be_any):
        if self._data[index][0] not in should_be_any:
            self.log_issue(
                "parse_entry: ",
                f"{label} not one of {', '.join(should_be_any)} ",
                f"(is {self._data[index][0]})",
            )

    def assert_entry_item(self, index, should_be_at):
        if not ENTRY_PATTERNS[index].match(self._data[index][0]):
    def assert_entry_item_match(self, index, label, pattern):
        if not pattern.match(self._data[index][0]):
            self.log_issue(
                "assert_entry: ",
                f"item {index} does not look right ",
                f"({self._data[index][0]})",
                "parse_entry: ",
                f"{label} does not look correct ",
                f"(is {self._data[index][0]})",
            )

    def assert_entry_item_at(self, index, label, should_be_at):
        """Log an issue when item *index* is not at the expected left offset.

        Compares ``self._data[index][1]`` with *should_be_at*. Nothing is
        raised or returned; a mismatch is only recorded through
        ``log_issue``.
        """
        if self._data[index][1] != should_be_at:
            # NOTE(review): two prefix/description pairs are passed below
            # ("assert_entry: " and "parse_entry: "), so the logged message
            # carries both wordings. This looks like the removed and added
            # lines of the diff shown together -- confirm which is intended.
            self.log_issue(
                "assert_entry: ",
                f"item {index} is not at {should_be_at}px left ",
                "parse_entry: ",
                f"{label} not at {should_be_at}px left ",
                f"(is at {self._data[index][1]}px)",
            )

    def assert_entry(self):
        """Run the per-item checks for items 1, 2 and 3 of the entry."""
        # NOTE(review): the second arguments (40, 120, 201) look like expected
        # left offsets in px, matching the values passed to
        # `assert_entry_item_at` elsewhere in this view -- confirm against the
        # definition of `assert_entry_item`.
        self.assert_entry_item(1, 40)
        self.assert_entry_item(2, 120)
        self.assert_entry_item(3, 201)
    def parse_entry_number(self):
        del self._data[0]

    def parse_entry_project(self):
        """Consume the time code, project, time type and name items.

        Items are read from the front of ``self._data`` and deleted once
        handled. Sets ``self._timecode``, ``self._project``,
        ``self._timetype`` and ``self._name``. Unexpected text or left
        offsets are recorded through the ``assert_entry_item_*`` helpers
        rather than raised.
        """
        self.assert_entry_item_any(0, "time code", ("HOL", "OTU", "ST", "VAC", ))
        self.assert_entry_item_at(0, "time code", 40)
        self._timecode = self._data[0][0]
        del self._data[0]

        if self._timecode in ("HOL", "OTU", "VAC", ):
            # These time codes carry no project or time type items
            self._project = ""
            self._timetype = ""
        elif " " in self._data[0][0]:
            # Project and time type arrived merged in a single item
            self.assert_entry_item_match(0, "project", PROJECT_PATTERN)
            self.assert_entry_item_at(0, "project", 120)
            project_timetype = self._data[0][0].split(" ", 1)
            self._project = project_timetype[0]
            self._timetype = project_timetype[1]
            del self._data[0]
        else:
            self.assert_entry_item_match(0, "project", PROJECT_PATTERN)
            self.assert_entry_item_at(0, "project", 120)
            self._project = self._data[0][0]
            self.assert_entry_item_match(1, "time type", TIMETYPE_PATTERN)
            self.assert_entry_item_at(1, "time type", 201)
            # The time type is the item after the project (index 1) -- the
            # same item the two checks above just validated. Reading index 0
            # here would store the project text a second time.
            self._timetype = self._data[1][0]
            del self._data[0:2]

        # Whatever follows is taken as the name
        self._name = self._data[0][0]
        del self._data[0]

    def parse_entry_note(self):
        if self._data[0][0] == "Line Note:":
            # There is no start or end field, just the text field
            self._notes.append({
                "start": "",
                "end": "",
                "text": self._data[1][0],
            })
            del self._data[0:2]
        else:
            match = DATE_PATTERN.match(self._data[0][0])
            if match:
                self._notes.append({})

                # Handle the start field
                self.assert_entry_item_at(0, "note start", 40)
                self._notes[-1]["start"] = match

                # Handle the start/end separator if it is separate from the
                # start field
                if self._data[1][0] == "-":
                    del self._data[1]

                # Handle the end field
                self.assert_entry_item_at(1, "note end", 99)
                match = DATE_PATTERN.match(self._data[1][0])
                if match:
                    self._notes[-1]["end"] = match

                # Handle the 'Approved' note if it floated up
                if self._data[2][0] == "Approved":
                    del self._data[2]

                # Handle the text field
                self.assert_entry_item_at(2, "note text", 201)
                self._notes[-1]["text"] = self._data[2][0]

                del self._data[0:3]

    def parse_entry_notes(self):
        """Collect the notes that follow a "Notes" marker into ``self._notes``.

        NOTE(review): this body mixes two approaches -- fixed-index
        ``assert_entry_item(index, offset)`` checks on items 4-8, and the
        consume-from-the-front logic below them. It looks like the removed
        and added lines of the diff shown together; confirm which statements
        are current.
        """
        print(self._data)
        self.assert_entry_item(4, 39)
        # NOTE(review): this compares a whole item with a string, while the
        # check further down compares the item's text (`[0][0]`) -- confirm.
        if len(self)>5 and self._data[5] == "Notes":
            self.assert_entry_item(5, 40)
            self.assert_entry_item(6, 40)
            self.assert_entry_item(7, 99)
            self.assert_entry_item(8, 220)
        self._notes = []
        if len(self) and self._data[0][0] == "Notes":
            # Drop the "Notes" marker itself
            del self._data[0]

            # One or two notes follow
            if len(self):
                self.parse_entry_note()
            if len(self):
                self.parse_entry_note()

    def parse_entry_approval(self):
        """Drop a leading "Approved" item from ``self._data`` when present."""
        if not len(self):
            return
        if self._data[0][0] == "Approved":
            del self._data[0]

    def beat_data_with_a_stick(self, data):
        data = [i for i in data if i[1]<400] + [i for i in data if i[1]>=400]
        return data


M parser/timesheet.py => parser/timesheet.py +8 -2
@@ 284,7 284,11 @@ class TimeSheet(object):
        # None of this is useful data. Immediately delete it.
        for page_break in reversed(page_breaks):
            i = page_break - 7
            while i < page_break:
            # don't question why this is necessary
            if str(self)=="May 16, 2019 - May 31, 2019" and page_break == 275:
                i -= 30

            while i <= page_break:
                j = 0
                while j < len(PAGE_PATTERNS):
                    if PAGE_PATTERNS[j].match(self._data[i][0]):


@@ 309,7 313,9 @@ class TimeSheet(object):
                        f"({self._data[i][0]})",
                    )
            else:
                if len(entries)>0:
                if self._data[i][0] in ("Mon", "Tue Wed", "Thu", "Fri", "Sat", "Sun", "Total", ):
                    pass
                elif len(entries)>0:
                    entries[-1].append(self._data[i])
                else:
                    self.log_issue(