From 9efdae59717e3d48470a4834470072a86f5fb706 Mon Sep 17 00:00:00 2001 From: Dominic Ricottone Date: Sat, 30 Apr 2022 19:14:57 -0500 Subject: [PATCH] Implemented time entry extraction; no assert errors! There is still a major issue ahead of 'structured' data: Hours data is leaking between entries. There are entries with no hours at all. There are almost certainly some entries that have hours out of order. It will likely be necessary to re-sort all items ahead of processing based on top then left style attributes. This is going to have the consequence of invalidating some of the work I've already put into parsing the data as-is. Good luck, future me. --- parser/html.py | 16 +++-- parser/timeentry.py | 156 +++++++++++++++++++++++++++++++++----------- parser/timesheet.py | 10 ++- 3 files changed, 138 insertions(+), 44 deletions(-) diff --git a/parser/html.py b/parser/html.py index b5e674b..1aa91dc 100644 --- a/parser/html.py +++ b/parser/html.py @@ -56,6 +56,9 @@ def parse_attrs_doubles(attrs): def has_style_left(attrs): return "style" in attrs.keys() and "left" in attrs["style"].keys() +def has_style_top(attrs): + return "style" in attrs.keys() and "top" in attrs["style"].keys() + class TimesheetHTMLParser(HTMLParser): """A specialization of the `html.parser.HTMLParser` class to handle my timesheets. @@ -66,6 +69,7 @@ class TimesheetHTMLParser(HTMLParser): """ def __init__(self): HTMLParser.__init__(self) + self._top = 0 self._left = 0 self._in_div = False self._in_span = False @@ -78,9 +82,13 @@ class TimesheetHTMLParser(HTMLParser): self._in_span = True else: self._in_div = False - elif tag == "div" and has_style_left(attrs): - self._left = int(attrs["style"]["left"].removesuffix("px")) - self._in_div = True + elif tag == "div": + if has_style_left(attrs): + self._left = int(attrs["style"]["left"].removesuffix("px")) + self._in_div = True + if has_style_top(attrs): + self._top = int(attrs["style"]["top"].removesuffix("px")) + self._in_div = True else: self._in_span = False self._in_div = False @@ -91,7 +99,7 @@ class TimesheetHTMLParser(HTMLParser): def handle_data(self, data): if self._in_span: - self._data.append((data.splitlines()[0], self._left, )) + self._data.append((data.splitlines()[0], self._left, self._top)) self._in_span = False self._in_div = False diff --git a/parser/timeentry.py b/parser/timeentry.py index 68254f6..30ff13a 100644 --- a/parser/timeentry.py +++ b/parser/timeentry.py @@ -2,16 +2,21 @@ from re import compile as re_compile +PROJECT_PATTERN = re_compile("(OHORG\.(MTG|ONB|PFR|TPD|TOH)\.[01][0-9]\.00[0-9]|UNALW\.EMW\.0[0-9]\.00[0-9]|FSERV\.PPM\.00\.000|[0-9]{5}\.[0-9]{3}\.[0-9]{2}\.[0-9]{3})") +TIMETYPE_PATTERN = re_compile("(FSERV|[0-9]{3,5})") +DATE_PATTERN = re_compile("[ADFJMNOS][aceopu][bcglnprtvy] [0-9][0-9]?, 20[12][89012]") + ENTRY_PATTERNS = ( re_compile("[1-9][0-9]?"), - re_compile("(ST|VAC|HOL)"), - re_compile("(OHORG\.(MTG|ONB)\.0[0-5]\.00[0-5]|UNALW\.EMW\.0[0-5]\.00[0-5]|[0-9]{5}\.[0-9]{3}\.[0-9]{2}\.[0-9]{3})"), - re_compile("[0-9]{3,5}"), + re_compile("(HOL|OTU|ST|VAC)"), + re_compile("(OHORG\.(MTG|ONB|PFR|TPD|TOH)\.[01][0-9]\.00[0-9]|UNALW\.EMW\.0[0-9]\.00[0-9]|FSERV\.PPM\.00\.000|[0-9]{5}\.[0-9]{3}\.[0-9]{2}\.[0-9]{3})"), + re_compile("(FSERV|[0-9]{3,5})"), re_compile("."), re_compile("Notes"), - re_compile("[ADFJMNOS][aceopu][bcglnprtvy] [1-9][0-9]?, 20[12][89012]"), - re_compile("- [ADFJMNOS][aceopu][bcglnprtvy] [1-9][0-9]?, 20[12][89012]"), + re_compile("[ADFJMNOS][aceopu][bcglnprtvy] [0-9][0-9]?, 20[12][89012] -"), + re_compile("[ADFJMNOS][aceopu][bcglnprtvy] [0-9][0-9]?, 20[12][89012] Other"), re_compile("."), + re_compile("Approved"), ) class TimeEntry(object): @@ -20,7 +25,10 @@ class TimeEntry(object): self._timesheet = timesheet_name self._entry = entry_number self._data = self.beat_data_with_a_stick(semistructured_data) - self.assert_entry() + self.parse_entry_number() + self.parse_entry_project() + self.parse_entry_notes() + self.parse_entry_approval() def __str__(self): return f"entry #{self._entry} in the timesheet for {self._timesheet}" @@ -39,46 +47,118 @@ class TimeEntry(object): def log_issue(self, *parts): self._issues.append(''.join(parts)) - def beat_data_with_a_stick(self, data): - # Project and TimeType fields can get merged - if " " in data[2][0]: - project_timetype = data[2][0].split(" ", 1) - shared = [n for n in data[2][1:]] - split = [ - (project_timetype[0], *shared, ), - (project_timetype[1], *shared, ), - ] - data = data[:2] + split + data[3:] - - # Holiday and Vacation time entries skip the Project and TimeType fields - if data[1][0] in ("HOL", "VAC", ): - data = data[0:1] + [("", 0, ), ("", 0, )] + data[1:] - return data + def assert_entry_item_any(self, index, label, should_be_any): + if self._data[index][0] not in should_be_any: + self.log_issue( + "parse_entry: ", + f"{label} not one of {', '.join(should_be_any)} ", + f"(is {self._data[index][0]})", + ) - def assert_entry_item(self, index, should_be_at): - if not ENTRY_PATTERNS[index].match(self._data[index][0]): + def assert_entry_item_match(self, index, label, pattern): + if not pattern.match(self._data[index][0]): self.log_issue( - "assert_entry: ", - f"item {index} does not look right ", - f"({self._data[index][0]})", + "parse_entry: ", + f"{label} does not look correct ", + f"(is {self._data[index][0]})", ) + + def assert_entry_item_at(self, index, label, should_be_at): if self._data[index][1] != should_be_at: self.log_issue( - "assert_entry: ", - f"item {index} is not at {should_be_at}px left ", + "parse_entry: ", + f"{label} not at {should_be_at}px left ", f"(is at {self._data[index][1]}px)", ) - def assert_entry(self): - self.assert_entry_item(1, 40) - self.assert_entry_item(2, 120) - self.assert_entry_item(3, 201) + def parse_entry_number(self): + del self._data[0] + + def parse_entry_project(self): + self.assert_entry_item_any(0, "time code", ("HOL", "OTU", "ST", "VAC", )) + self.assert_entry_item_at(0, "time code", 40) + self._timecode = self._data[0][0] + del self._data[0] + + if self._timecode in ("HOL", "OTU", "VAC", ): + self._project = "" + self._timetype = "" + elif " " in self._data[0][0]: + self.assert_entry_item_match(0, "project", PROJECT_PATTERN) + self.assert_entry_item_at(0, "project", 120) + project_timetype = self._data[0][0].split(" ", 1) + self._project = project_timetype[0] + self._timetype = project_timetype[1] + del self._data[0] + else: + self.assert_entry_item_match(0, "project", PROJECT_PATTERN) + self.assert_entry_item_at(0, "project", 120) + self._project = self._data[0][0] + self.assert_entry_item_match(1, "time type", TIMETYPE_PATTERN) + self.assert_entry_item_at(1, "time type", 201) + self._timetype = self._data[0][0] + del self._data[0:2] + + self._name = self._data[0][0] + del self._data[0] + + def parse_entry_note(self): + if self._data[0][0] == "Line Note:": + # There is no start or end field, just the text field + self._notes.append({ + "start": "", + "end": "", + "text": self._data[1][0], + }) + del self._data[0:2] + else: + match = DATE_PATTERN.match(self._data[0][0]) + if match: + self._notes.append({}) + + # Handle the start field + self.assert_entry_item_at(0, "note start", 40) + self._notes[-1]["start"] = match + + # Handle the start/end separator if it is separate from the + # start field + if self._data[1][0] == "-": + del self._data[1] + + # Handle the end field + self.assert_entry_item_at(1, "note end", 99) + match = DATE_PATTERN.match(self._data[1][0]) + if match: + self._notes[-1]["end"] = match + + # Handle the 'Approved' note if it floated up + if self._data[2][0] == "Approved": + del self._data[2] + + # Handle the text field + self.assert_entry_item_at(2, "note text", 201) + self._notes[-1]["text"] = self._data[2][0] + + del self._data[0:3] + + def parse_entry_notes(self): print(self._data) - self.assert_entry_item(4, 39) - if len(self)>5 and self._data[5] == "Notes": - self.assert_entry_item(5, 40) - self.assert_entry_item(6, 40) - self.assert_entry_item(7, 99) - self.assert_entry_item(8, 220) + self._notes = [] + if len(self) and self._data[0][0] == "Notes": + del self._data[0] + + # One or two notes follow + if len(self): + self.parse_entry_note() + if len(self): + self.parse_entry_note() + + def parse_entry_approval(self): + if len(self) and self._data[0][0] == "Approved": + del self._data[0] + + def beat_data_with_a_stick(self, data): + data = [i for i in data if i[1]<400] + [i for i in data if i[1]>=400] + return data diff --git a/parser/timesheet.py b/parser/timesheet.py index 6b1b153..f26626f 100644 --- a/parser/timesheet.py +++ b/parser/timesheet.py @@ -284,7 +284,11 @@ class TimeSheet(object): # None of this is useful data. Immediately delete it. for page_break in reversed(page_breaks): i = page_break - 7 - while i < page_break: + # don't question why this is necessary + if str(self)=="May 16, 2019 - May 31, 2019" and page_break == 275: + i -= 30 + + while i <= page_break: j = 0 while j < len(PAGE_PATTERNS): if PAGE_PATTERNS[j].match(self._data[i][0]): @@ -309,7 +313,9 @@ class TimeSheet(object): f"({self._data[i][0]})", ) else: - if len(entries)>0: + if self._data[i][0] in ("Mon", "Tue Wed", "Thu", "Fri", "Sat", "Sun", "Total", ): + pass + elif len(entries)>0: entries[-1].append(self._data[i]) else: self.log_issue( -- 2.45.2