From 7326d28027df088781be5bf0cb673109a1cc0d13 Mon Sep 17 00:00:00 2001 From: Dominic Ricottone Date: Sat, 30 Apr 2022 14:56:20 -0500 Subject: [PATCH] Started implementing time entry extraction. Time entries are now being parsed and validated, though there are numerous issues still to sort out. I have a feeling that further development will require passing around the `top` style attributes in the same way I'm passing around the `left` style attributes. TBD though. --- parse.py | 4 +-- parser/timeentry.py | 84 +++++++++++++++++++++++++++++++++++++++++++ parser/timesheet.py | 88 ++++++++++++++++++++++++++++++++++++--------- 3 files changed, 158 insertions(+), 18 deletions(-) create mode 100644 parser/timeentry.py diff --git a/parse.py b/parse.py index 34f5224..11b3120 100644 --- a/parse.py +++ b/parse.py @@ -15,7 +15,7 @@ from pprint import pprint from parser.html import parse as parse_html from parser.pdf import parse as parse_pdf -from parser.timesheet import Timesheet +from parser.timesheet import TimeSheet def read_timesheet(filename): unstructured_data = parse_pdf(filename) @@ -23,7 +23,7 @@ def read_timesheet(filename): return semistructured_data def parse_timesheet(data): - t = Timesheet(data) + t = TimeSheet(data) t.report_issues() return [] diff --git a/parser/timeentry.py b/parser/timeentry.py new file mode 100644 index 0000000..68254f6 --- /dev/null +++ b/parser/timeentry.py @@ -0,0 +1,84 @@ +#!/usr/bin/env python3 + +from re import compile as re_compile + +ENTRY_PATTERNS = ( + re_compile("[1-9][0-9]?"), + re_compile("(ST|VAC|HOL)"), + re_compile("(OHORG\.(MTG|ONB)\.0[0-5]\.00[0-5]|UNALW\.EMW\.0[0-5]\.00[0-5]|[0-9]{5}\.[0-9]{3}\.[0-9]{2}\.[0-9]{3})"), + re_compile("[0-9]{3,5}"), + re_compile("."), + re_compile("Notes"), + re_compile("[ADFJMNOS][aceopu][bcglnprtvy] [1-9][0-9]?, 20[12][89012]"), + re_compile("- [ADFJMNOS][aceopu][bcglnprtvy] [1-9][0-9]?, 20[12][89012]"), + re_compile("."), +) + +class TimeEntry(object): + def __init__(self, semistructured_data, timesheet_name, entry_number): + self._issues = [] + self._timesheet = timesheet_name + self._entry = entry_number + self._data = self.beat_data_with_a_stick(semistructured_data) + self.assert_entry() + + def __str__(self): + return f"entry #{self._entry} in the timesheet for {self._timesheet}" + + def __len__(self): + return len(self._data) + + def report_issues(self): + if self._issues: + print(f"There are some issues in {self}...") + for issue in self._issues: + print(issue) + for index, line in enumerate(self._data): + print(index, line) + + def log_issue(self, *parts): + self._issues.append(''.join(parts)) + + def beat_data_with_a_stick(self, data): + # Project and TimeType fields can get merged + if " " in data[2][0]: + project_timetype = data[2][0].split(" ", 1) + shared = [n for n in data[2][1:]] + split = [ + (project_timetype[0], *shared, ), + (project_timetype[1], *shared, ), + ] + data = data[:2] + split + data[3:] + + # Holiday and Vacation time entries skip the Project and TimeType fields + if data[1][0] in ("HOL", "VAC", ): + data = data[0:1] + [("", 0, ), ("", 0, )] + data[1:] + + return data + + def assert_entry_item(self, index, should_be_at): + if not ENTRY_PATTERNS[index].match(self._data[index][0]): + self.log_issue( + "assert_entry: ", + f"item {index} does not look right ", + f"({self._data[index][0]})", + ) + if self._data[index][1] != should_be_at: + self.log_issue( + "assert_entry: ", + f"item {index} is not at {should_be_at}px left ", + f"(is at {self._data[index][1]}px)", + ) + + def assert_entry(self): + self.assert_entry_item(1, 40) + self.assert_entry_item(2, 120) + self.assert_entry_item(3, 201) + print(self._data) + self.assert_entry_item(4, 39) + if len(self)>5 and self._data[5] == "Notes": + self.assert_entry_item(5, 40) + self.assert_entry_item(6, 40) + self.assert_entry_item(7, 99) + self.assert_entry_item(8, 220) + diff --git a/parser/timesheet.py b/parser/timesheet.py index 54b73c1..6b1b153 100644 --- a/parser/timesheet.py +++ b/parser/timesheet.py @@ -2,7 +2,9 @@ from re import compile as re_compile -PATTERNS = ( +from .timeentry import TimeEntry, ENTRY_PATTERNS + +PAGE_PATTERNS = ( re_compile("[ADFJMNOS][aceopu][bcglnprtvy] [1-9][0-9]?, 20[12][89012] [012][0-9]:[0-5][0-9]"), re_compile("\(GMT-0[456]:00\) .*"), re_compile("Page [1-9][0-9]?"), @@ -35,41 +37,55 @@ def is_multiple_8(string): except: return False -class Timesheet(object): +class TimeSheet(object): def __init__(self, semistructured_data): + self._warnings = [] self._issues = [] self._data = self.beat_data_with_a_stick(semistructured_data) self.assert_header() self.parse_header() self.parse_footer() self.parse_pages() + self.parse_entries() def __str__(self): - return self.header.get("dates", "Timesheet(...)") + return self.header.get("dates", "TimeSheet(...)") + + def __len__(self): + return len(self._data) def report_issues(self): + if self._warnings: + print(f"There are some warnings in the timesheet for {self}...") + for warning in self._warnings: + print(warning) if self._issues: print(f"There are some issues in the timesheet for {self}...") for issue in self._issues: print(issue) for index, line in enumerate(self._data): print(index, line) + for entry in self._entries: + entry.report_issues() def log_issue(self, *parts): self._issues.append(''.join(parts)) + def log_warning(self, *parts): + self._warnings.append(''.join(parts)) + def beat_data_with_a_stick(self, data): """A fragile solution to the problems resulting from PDF-to-HTML-to-long data conversions. """ # a small number of timesheets have a 'Doc.No.' field if data[14][0] == "Doc.No." and data[17][0] == "1": - self.log_issue("beat_data_with_a_stick: killing 'Doc.No.' fields") + self.log_warning("beat_data_with_a_stick: killing 'Doc.No.' fields") #Note: subtract 1 due to the other operation realigning indices data = remove_item(data, 14) data = remove_item(data, 17-1) elif data[16][0] == "Doc.No." and data[19][0] == "1": - self.log_issue( + self.log_warning( "beat_data_with_a_stick: ", "killing 'Doc.No.' label and field", ) @@ -78,7 +94,7 @@ class Timesheet(object): # 'Post Status:' can float up if the status is 'Not posted' if data[6][0] == "Post Status:" and data[7][0] == "Not posted": - self.log_issue( + self.log_warning( "beat_data_with_a_stick: ", "re-sorting post status label and field", ) @@ -88,7 +104,7 @@ class Timesheet(object): # 'Function:' and '[1] Full Time' can float up together if data[4][0] == "Function:" and data[5][0] == "[1] Full Time": - self.log_issue( + self.log_warning( "beat_data_with_a_stick: ", "re-sorting function label and field", ) @@ -96,7 +112,7 @@ class Timesheet(object): data = move_item(data, 4, 10+1) data = move_item(data, 5-1, 13) elif data[8][0] == "Function:" and data[11][0] == "[1] Full Time": - self.log_issue( + self.log_warning( "beat_data_with_a_stick: ", "re-sorting function label and field", ) @@ -105,7 +121,7 @@ class Timesheet(object): # 'Percent Billability:' can float down if data[28][0] == "Percent Billability:": - self.log_issue( + self.log_warning( "beat_data_with_a_stick: ", "re-sorting percent billability label", ) @@ -113,7 +129,7 @@ class Timesheet(object): # 'Validation:' and 'Passed' can float up if data[16][0] == "Validation:" and data[19][0] == "Passed": - self.log_issue( + self.log_warning( "beat_data_with_a_stick: ", "re-sorting validation label and field", ) @@ -122,7 +138,7 @@ class Timesheet(object): # 'Posted'/'Not posted' can float up if data[17][0] == "Posted" or data[17][0] == "Not posted": - self.log_issue( + self.log_warning( "beat_data_with_a_stick: ", "re-sorting post status field", ) @@ -130,7 +146,7 @@ class Timesheet(object): # 'ID', 'Time Code', 'Project', and 'TimeType' and float down if data[41][0] == "ID" and data[42][0] == "Time Code" and data[43][0] == "Project" and data[44][0] == "TimeType": - self.log_issue( + self.log_warning( "beat_data_with_a_stick: ", "re-sorting ID, Time Code, Project, and TimeType header", ) @@ -139,7 +155,7 @@ class Timesheet(object): data = move_item(data, 43, 32) data = move_item(data, 44, 33) elif data[43][0] == "ID" and data[44][0] == "Time Code" and data[45][0] == "Project" and data[46][0] == "TimeType": - self.log_issue( + self.log_warning( "beat_data_with_a_stick: ", "re-sorting ID, Time Code, Project, and TimeType header", ) @@ -227,7 +243,7 @@ class Timesheet(object): def parse_footer(self): """Loop though lines to identify the document footer and clear it.""" target = None - for n in range(len(self._data)-1, 0, -1): + for n in range(len(self)-1, 0, -1): if self._data[n][0] == "Hours Distribution by Time Code": target = n break @@ -243,6 +259,7 @@ class Timesheet(object): """Loop through lines to identify page headers and footers, and clear those lines. """ + # Pages begin with "Timesheet" page_breaks = [] for index, line in enumerate(self._data): if line[0] == "Timesheet": @@ -253,15 +270,54 @@ class Timesheet(object): "could not locate any page breaks", ) + # At each page break, there is a page footer sequence before: + # + "Mmm N, YYYY HH:MM" + # + "(GMT-0H:00) TZNAME" + # + "Page N" + # + "of N" + # ...and a page header sequence after: + # + "Mmm N, YYYY - Mmm N, YYYY" + # + "ID" + # + "Time Code" + # + "Project" + # + "TimeType" + # None of this is useful data. Immediately delete it. for page_break in reversed(page_breaks): i = page_break - 7 while i < page_break: j = 0 - while j < len(PATTERNS): - if PATTERNS[j].match(self._data[i][0]): + while j < len(PAGE_PATTERNS): + if PAGE_PATTERNS[j].match(self._data[i][0]): del self._data[i] j = 0 else: j += 1 i += 1 + def parse_entries(self): + """Loop through lines to identify time entries.""" + i = 0 + entries = [] + while i < len(self): + if self._data[i][1] == 20: + if ENTRY_PATTERNS[0].match(self._data[i][0]): + entries.append([self._data[i]]) + else: + self.log_issue( + "parse_entries: ", + "something unexpected in the entry start position ", + f"({self._data[i][0]})", + ) + else: + if len(entries)>0: + entries[-1].append(self._data[i]) + else: + self.log_issue( + "parse_entries: ", + "something unexpected before any entries started ", + f"({self._data[i][0]})", + ) + i += 1 + + self._entries = [TimeEntry(e, str(self), e[0][0]) for e in entries] + -- 2.45.2