~dricottone/fmg-timesheets: Implemented time entry extraction; no assert errors!

3 files changed, 138 insertions(+), 44 deletions(-)

M parser/html.py
M parser/timeentry.py
M parser/timesheet.py

M parser/html.py => parser/html.py +12 -4

@@ 56,6 56,9 @@ def parse_attrs_doubles(attrs):
 def has_style_left(attrs):
     return "style" in attrs.keys() and "left" in attrs["style"].keys()
 
+def has_style_top(attrs):
+    """Report whether `attrs` has a "style" entry that defines "top"."""
+    if "style" not in attrs.keys():
+        return False
+    return "top" in attrs["style"].keys()
+
 class TimesheetHTMLParser(HTMLParser):
     """A specialization of the `html.parser.HTMLParser` class to handle my
     timesheets.


@@ 66,6 69,7 @@ class TimesheetHTMLParser(HTMLParser):
     """
     def __init__(self):
         HTMLParser.__init__(self)
+        self._top = 0
         self._left = 0
         self._in_div = False
         self._in_span = False


@@ 78,9 82,13 @@ class TimesheetHTMLParser(HTMLParser):
                 self._in_span = True
             else:
                 self._in_div = False
-        elif tag == "div" and has_style_left(attrs):
-            self._left = int(attrs["style"]["left"].removesuffix("px"))
-            self._in_div = True
+        elif tag == "div":
+            if has_style_left(attrs):
+                self._left = int(attrs["style"]["left"].removesuffix("px"))
+                self._in_div = True
+            if has_style_top(attrs):
+                self._top = int(attrs["style"]["top"].removesuffix("px"))
+                self._in_div = True
         else:
             self._in_span = False
             self._in_div = False


@@ 91,7 99,7 @@ class TimesheetHTMLParser(HTMLParser):
 
     def handle_data(self, data):
         if self._in_span:
-            self._data.append((data.splitlines()[0], self._left, ))
+            self._data.append((data.splitlines()[0], self._left, self._top))
         self._in_span = False
         self._in_div = False

M parser/timeentry.py => parser/timeentry.py +118 -38

@@ 2,16 2,21 @@
 
 from re import compile as re_compile
 
+# Raw strings keep `\.` as a regex escape for a literal dot; in a plain
+# string literal it is an invalid escape sequence that Python warns about.
+PROJECT_PATTERN = re_compile(r"(OHORG\.(MTG|ONB|PFR|TPD|TOH)\.[01][0-9]\.00[0-9]|UNALW\.EMW\.0[0-9]\.00[0-9]|FSERV\.PPM\.00\.000|[0-9]{5}\.[0-9]{3}\.[0-9]{2}\.[0-9]{3})")
+TIMETYPE_PATTERN = re_compile(r"(FSERV|[0-9]{3,5})")
+DATE_PATTERN = re_compile(r"[ADFJMNOS][aceopu][bcglnprtvy] [0-9][0-9]?, 20[12][89012]")
+
 ENTRY_PATTERNS = (
     re_compile("[1-9][0-9]?"),
-    re_compile("(ST|VAC|HOL)"),
-    re_compile("(OHORG\.(MTG|ONB)\.0[0-5]\.00[0-5]|UNALW\.EMW\.0[0-5]\.00[0-5]|[0-9]{5}\.[0-9]{3}\.[0-9]{2}\.[0-9]{3})"),
-    re_compile("[0-9]{3,5}"),
+    re_compile("(HOL|OTU|ST|VAC)"),
+    re_compile(r"(OHORG\.(MTG|ONB|PFR|TPD|TOH)\.[01][0-9]\.00[0-9]|UNALW\.EMW\.0[0-9]\.00[0-9]|FSERV\.PPM\.00\.000|[0-9]{5}\.[0-9]{3}\.[0-9]{2}\.[0-9]{3})"),
+    re_compile("(FSERV|[0-9]{3,5})"),
     re_compile("."),
     re_compile("Notes"),
-    re_compile("[ADFJMNOS][aceopu][bcglnprtvy] [1-9][0-9]?, 20[12][89012]"),
-    re_compile("- [ADFJMNOS][aceopu][bcglnprtvy] [1-9][0-9]?, 20[12][89012]"),
+    re_compile("[ADFJMNOS][aceopu][bcglnprtvy] [0-9][0-9]?, 20[12][89012] -"),
+    re_compile("[ADFJMNOS][aceopu][bcglnprtvy] [0-9][0-9]?, 20[12][89012] Other"),
     re_compile("."),
+    re_compile("Approved"),
 )
 
 class TimeEntry(object):


@@ 20,7 25,10 @@ class TimeEntry(object):
         self._timesheet = timesheet_name
         self._entry = entry_number
         self._data = self.beat_data_with_a_stick(semistructured_data)
-        self.assert_entry()
+        self.parse_entry_number()
+        self.parse_entry_project()
+        self.parse_entry_notes()
+        self.parse_entry_approval()
 
     def __str__(self):
         return f"entry #{self._entry} in the timesheet for {self._timesheet}"


@@ 39,46 47,118 @@ class TimeEntry(object):
     def log_issue(self, *parts):
         self._issues.append(''.join(parts))
 
-    def beat_data_with_a_stick(self, data):
-        # Project and TimeType fields can get merged
-        if " " in data[2][0]:
-            project_timetype = data[2][0].split(" ", 1)
-            shared = [n for n in data[2][1:]]
-            split = [
-                (project_timetype[0], *shared, ),
-                (project_timetype[1], *shared, ),
-            ]
-            data = data[:2] + split + data[3:]
-
-        # Holiday and Vacation time entries skip the Project and TimeType fields
-        if data[1][0] in ("HOL", "VAC", ):
-            data = data[0:1] + [("", 0, ), ("", 0, )] + data[1:]
 
-        return data
+    def assert_entry_item_any(self, index, label, should_be_any):
+        """Log an issue unless the text of item `index` is one of
+        `should_be_any`; `label` names the item in the logged message.
+        """
+        found = self._data[index][0]
+        if found in should_be_any:
+            return
+        options = ', '.join(should_be_any)
+        self.log_issue(
+            "parse_entry: ",
+            f"{label} not one of {options} ",
+            f"(is {found})",
+        )
 
-    def assert_entry_item(self, index, should_be_at):
-        if not ENTRY_PATTERNS[index].match(self._data[index][0]):
+    def assert_entry_item_match(self, index, label, pattern):
+        """Log an issue when `pattern.match` rejects the text of item
+        `index`; `label` names the item in the logged message.
+        """
+        if not pattern.match(self._data[index][0]):
             self.log_issue(
-                "assert_entry: ",
-                f"item {index} does not look right ",
-                f"({self._data[index][0]})",
+                "parse_entry: ",
+                f"{label} does not look correct ",
+                f"(is {self._data[index][0]})",
             )
+
+    def assert_entry_item_at(self, index, label, should_be_at):
+        """Log an issue when the second field of item `index` differs from
+        `should_be_at`; the message reports both values as px left offsets.
+        """
         if self._data[index][1] != should_be_at:
             self.log_issue(
-                "assert_entry: ",
-                f"item {index} is not at {should_be_at}px left ",
+                "parse_entry: ",
+                f"{label} not at {should_be_at}px left ",
                 f"(is at {self._data[index][1]}px)",
             )
 
-    def assert_entry(self):
-        self.assert_entry_item(1, 40)
-        self.assert_entry_item(2, 120)
-        self.assert_entry_item(3, 201)
+    def parse_entry_number(self):
+        """Discard the first item of `self._data` without inspecting it."""
+        # presumably this item is the entry number -- confirm against caller
+        self._data.pop(0)
+
+    def parse_entry_project(self):
+        """Consume the time code, project, time type and name items.
+
+        Sets `self._timecode`, `self._project`, `self._timetype` and
+        `self._name`, deleting each consumed item from the front of
+        `self._data`. Unexpected text or positions are logged as issues
+        rather than raised.
+        """
+        self.assert_entry_item_any(0, "time code", ("HOL", "OTU", "ST", "VAC", ))
+        self.assert_entry_item_at(0, "time code", 40)
+        self._timecode = self._data[0][0]
+        del self._data[0]
+
+        if self._timecode in ("HOL", "OTU", "VAC", ):
+            # These time codes carry no project or time type items
+            self._project = ""
+            self._timetype = ""
+        else:
+            self.assert_entry_item_match(0, "project", PROJECT_PATTERN)
+            self.assert_entry_item_at(0, "project", 120)
+            if " " in self._data[0][0]:
+                # Project and time type were merged into a single item
+                self._project, self._timetype = self._data[0][0].split(" ", 1)
+                del self._data[0]
+            else:
+                self._project = self._data[0][0]
+                self.assert_entry_item_match(1, "time type", TIMETYPE_PATTERN)
+                self.assert_entry_item_at(1, "time type", 201)
+                # The time type is the item after the project, not the
+                # project item itself
+                self._timetype = self._data[1][0]
+                del self._data[0:2]
+
+        self._name = self._data[0][0]
+        del self._data[0]
+
+    def parse_entry_note(self):
+        """Consume one note from the front of `self._data`.
+
+        Appends a dict with "start", "end" and "text" keys to `self._notes`.
+        A note is either a "Line Note:" item followed by its text, or a start
+        date, an end date and the text. When the first item is neither,
+        nothing is consumed and nothing is appended.
+        """
+        if self._data[0][0] == "Line Note:":
+            # There is no start or end field, just the text field
+            self._notes.append({
+                "start": "",
+                "end": "",
+                "text": self._data[1][0],
+            })
+            del self._data[0:2]
+            return
+
+        match = DATE_PATTERN.match(self._data[0][0])
+        if not match:
+            return
+
+        # Give every note the same three keys so that readers never have to
+        # check for a missing "end"
+        note = {"start": "", "end": "", "text": ""}
+        self._notes.append(note)
+
+        # Handle the start field; store the matched date text, not the match
+        # object, so that the value is a string as it is for a line note
+        self.assert_entry_item_at(0, "note start", 40)
+        note["start"] = match.group(0)
+
+        # Handle the start/end separator if it is separate from the
+        # start field
+        if self._data[1][0] == "-":
+            del self._data[1]
+
+        # Handle the end field
+        self.assert_entry_item_at(1, "note end", 99)
+        match = DATE_PATTERN.match(self._data[1][0])
+        if match:
+            note["end"] = match.group(0)
+
+        # Handle the 'Approved' note if it floated up
+        if self._data[2][0] == "Approved":
+            del self._data[2]
+
+        # Handle the text field
+        self.assert_entry_item_at(2, "note text", 201)
+        note["text"] = self._data[2][0]
+
+        del self._data[0:3]
+
+    def parse_entry_notes(self):
+        """Collect the notes section, if present, into `self._notes`.
+
+        `self._notes` is always reset to an empty list. When the next item is
+        "Notes" it is consumed and `parse_entry_note` is attempted up to
+        twice, each time only if items remain.
+        """
+        # NOTE(review): prints the remaining items on every call; this looks
+        # like leftover debugging output -- confirm before removing.
         print(self._data)
-        self.assert_entry_item(4, 39)
-        if len(self)>5 and self._data[5] == "Notes":
-            self.assert_entry_item(5, 40)
-            self.assert_entry_item(6, 40)
-            self.assert_entry_item(7, 99)
-            self.assert_entry_item(8, 220)
+        self._notes = []
+        if len(self) and self._data[0][0] == "Notes":
+            del self._data[0]
+
+            # One or two notes follow
+            if len(self):
+                self.parse_entry_note()
+            if len(self):
+                self.parse_entry_note()
+
+    def parse_entry_approval(self):
+        """Drop the next item of `self._data` when its text is "Approved"."""
+        if not len(self):
+            return
+        if self._data[0][0] == "Approved":
+            del self._data[0]
+
+    def beat_data_with_a_stick(self, data):
+        """Return a new list with the items of `data` whose second field is
+        below 400 first and those at 400 or above last, each group keeping
+        its original order.
+        """
+        def is_below(item):
+            return item[1] < 400
+
+        def is_at_or_above(item):
+            return item[1] >= 400
+
+        return list(filter(is_below, data)) + list(filter(is_at_or_above, data))

M parser/timesheet.py => parser/timesheet.py +8 -2

@@ 284,7 284,11 @@ class TimeSheet(object):
         # None of this is useful data. Immediately delete it.
         for page_break in reversed(page_breaks):
             i = page_break - 7
-            while i < page_break:
+            # don't question why this is necessary
+            if str(self)=="May 16, 2019 - May 31, 2019" and page_break == 275:
+                i -= 30
+
+            while i <= page_break:
                 j = 0
                 while j < len(PAGE_PATTERNS):
                     if PAGE_PATTERNS[j].match(self._data[i][0]):


@@ 309,7 313,9 @@ class TimeSheet(object):
                         f"({self._data[i][0]})",
                     )
             else:
-                if len(entries)>0:
+                if self._data[i][0] in ("Mon", "Tue Wed", "Thu", "Fri", "Sat", "Sun", "Total", ):
+                    pass
+                elif len(entries)>0:
                     entries[-1].append(self._data[i])
                 else:
                     self.log_issue(