From f441822f16af2386abe50dc1d4b8161566843f3e Mon Sep 17 00:00:00 2001 From: Dominic Ricottone Date: Sat, 30 Apr 2022 02:09:25 -0500 Subject: [PATCH] Significant updates Wrote time sheet parser that ingests and validates all semi-structured data. Next step is to interpret left styles as dates, so that hours can be parsed into a time entry object. Updated HTML parser to more completely filter out unhelpful data, and to internally build the array of doubles (data and left style). --- main.py | 3 +- notes | 42 +------ parse.py | 36 +----- parser/html.py | 30 ++++- parser/timesheet.py | 267 ++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 304 insertions(+), 74 deletions(-) create mode 100644 parser/timesheet.py diff --git a/main.py b/main.py index e608ede..e029c1c 100644 --- a/main.py +++ b/main.py @@ -7,7 +7,7 @@ import parse def main(filelist): print(f"processing {len(filelist)} files") - for filename in (filelist[4],): + for filename in (filelist): parse.timesheet(filename) if __name__ == "__main__": @@ -18,6 +18,5 @@ if __name__ == "__main__": filelist.append(filepath) else: print(f"no such file: '{filename}'") - main(filelist) diff --git a/notes b/notes index a04ffcb..fd20e45 100644 --- a/notes +++ b/notes @@ -2,7 +2,10 @@ index 20px timecode (usually "ST") 40px project 39px timetype 201px - +"Notes" 40px +notesbegin 40px +notesend 99px +note 220px day1 571px or 572px @@ -14,43 +17,6 @@ day6 696px or 697px day7 721px or 722px rowtotal 751px or 753px or 756px -document header: -"Timesheet" -"Mon N, YYYY - Mon N, YYYY" -"Location:" -"[E01] Fors Marsh Group" -"Department:" -"[3200] Advanced Analytics" -"Employee Type:" -"[1] Annual Salary" -"Location (Default" -"[LOCAL] Location" -"Function:" -"Exempt:" -"Status:" -"[1] Full Time" -"Yes" -"Approved" -"Post Status:" -"Validation:" -"Date/Time:" -"Posted" -"Passed" -"Mon N, YYYY HH:MM" -"Total Tomesheet:" -"Standard Hours:" -"Total Billable:" -"Percent Billability:" - - - - - -left column header: -"ID" 
-"Time Code" -"Project" -"TimeType" right colummn header: "Mon" diff --git a/parse.py b/parse.py index 5b55629..7b0e12d 100644 --- a/parse.py +++ b/parse.py @@ -15,44 +15,18 @@ from pprint import pprint from parser.html import parse as parse_html from parser.pdf import parse as parse_pdf +from parser.timesheet import Timesheet def read_timesheet(filename): unstructured_data = parse_pdf(filename) semistructured_data = parse_html(unstructured_data) return semistructured_data -def has_style_left(attrs): - return "style" in attrs.keys() and "left" in attrs["style"].keys() - -def update_count(counters, key): - if key in counters.keys(): - counters[key] += 1 - else: - counters[key] = 1 - return counters - def parse_timesheet(data): - - in_div = False - in_span = False - left = "" - - for line in data: - if in_span: - if line[0] == "DATA": - print(f"{left:10} {line[2][0]}") - in_span = False - in_div = False - elif in_div: - if line[0] == "START" and line[1] == "span": - in_span = True - else: - in_div = False - else: - if line[0] == "START" and line[1] == "div" and has_style_left(line[2]): - in_div = True - left = line[2]["style"]["left"] - + t = Timesheet(data) + t.report_issues() + #for index, line in enumerate(t._data): + # print(index, line) return [] def extract_projects(structured_data): diff --git a/parser/html.py b/parser/html.py index ec8cdac..b5e674b 100644 --- a/parser/html.py +++ b/parser/html.py @@ -53,6 +53,9 @@ def parse_attrs_doubles(attrs): attrs_dict[key] = value return attrs_dict +def has_style_left(attrs): + return "style" in attrs.keys() and "left" in attrs["style"].keys() + class TimesheetHTMLParser(HTMLParser): """A specialization of the `html.parser.HTMLParser` class to handle my timesheets. 
@@ -63,14 +66,35 @@ class TimesheetHTMLParser(HTMLParser):
     """
     def __init__(self):
         HTMLParser.__init__(self)
+        self._left = 0
+        self._in_div = False
+        self._in_span = False
         self._data = []
+
     def handle_starttag(self, tag, _attrs):
         attrs = parse_attrs_doubles(_attrs)
-        self._data.append(["START", tag, attrs])
+        if self._in_div:
+            if tag == "span":
+                self._in_span = True
+            else:
+                self._in_div = False
+        elif tag == "div" and has_style_left(attrs):
+            self._left = int(attrs["style"]["left"].removesuffix("px"))
+            self._in_div = True
+        else:
+            self._in_span = False
+            self._in_div = False
+
     def handle_endtag(self, tag):
-        self._data.append(["END", tag])
+        self._in_span = False
+        self._in_div = False
+
     def handle_data(self, data):
-        self._data.append(["DATA", "", data.splitlines()])
+        if self._in_span:  # NOTE(review): extent of this branch is reconstructed; confirm
+            self._data.append((data.splitlines()[0], self._left, ))
+            self._in_span = False
+            self._in_div = False
+
     def dump(self):
         return self._data
 
diff --git a/parser/timesheet.py b/parser/timesheet.py
new file mode 100644
index 0000000..54b73c1
--- /dev/null
+++ b/parser/timesheet.py
@@ -0,0 +1,267 @@
+#!/usr/bin/env python3
+
+from re import compile as re_compile
+
+PATTERNS = (
+    re_compile(r"[ADFJMNOS][aceopu][bcglnprtvy] [1-9][0-9]?, 20[12][89012] [012][0-9]:[0-5][0-9]"),
+    re_compile(r"\(GMT-0[456]:00\) .*"),
+    re_compile(r"Page [1-9][0-9]?"),
+    re_compile(r"of [1-9][0-9]?"),
+    re_compile(r"Timesheet"),
+    re_compile(r"[ADFJMNOS][aceopu][bcglnprtvy] [1-9][0-9]?, 20[12][89012] - [ADFJMNOS][aceopu][bcglnprtvy] [1-9][0-9]?, 20[12][89012]"),
+    re_compile(r"ID"),
+    re_compile(r"Time Code"),
+    re_compile(r"Project"),
+    re_compile(r"TimeType"),
+)
+
+def move_item(data, index, destination):
+    """Re-order a list, moving an item from `index` to `destination`."""
+    if index < destination:
+        return data[:index] + data[index+1:destination+1] + data[index:index+1] + data[destination+1:]
+    elif index > destination:
+        return data[:destination] + data[index:index+1] + data[destination:index] + data[index+1:]
+    else:
+        return data
+
+def remove_item(data, index):
+    """Return a copy of a list with the item at `index` removed."""
+    trimmed = list(data)
+    del trimmed[index]
+    return trimmed
+
+def is_multiple_8(string):
+    """Return True if `string` parses as a number whose integer part is a multiple of 8."""
+    try:
+        return int(float(string)) % 8 == 0
+    except (TypeError, ValueError, OverflowError):
+        return False
+
+class Timesheet(object):
+    """Validate a timesheet's semi-structured data and strip its header, footer and page breaks."""
+    def __init__(self, semistructured_data):
+        self._issues = []
+        self._data = self.beat_data_with_a_stick(semistructured_data)
+        self.assert_header()
+        self.parse_header()
+        self.parse_footer()
+        self.parse_pages()
+
+    def __str__(self):
+        return self.header.get("dates", "Timesheet(...)")
+
+    def report_issues(self):
+        if self._issues:
+            print(f"There are some issues in the timesheet for {self}...")
+            for issue in self._issues:
+                print(issue)
+            for index, line in enumerate(self._data):  # NOTE(review): nesting reconstructed; confirm
+                print(index, line)
+
+    def log_issue(self, *parts):
+        self._issues.append(''.join(parts))
+
+    def beat_data_with_a_stick(self, data):
+        """A fragile solution to the problems resulting from
+        PDF-to-HTML-to-long data conversions.
+        """
+        # a small number of timesheets have a 'Doc.No.' field
+        if data[14][0] == "Doc.No." and data[17][0] == "1":
+            self.log_issue("beat_data_with_a_stick: killing 'Doc.No.' fields")
+            #Note: subtract 1 due to the other operation realigning indices
+            data = remove_item(data, 14)
+            data = remove_item(data, 17-1)
+        elif data[16][0] == "Doc.No." and data[19][0] == "1":
+            self.log_issue(
+                "beat_data_with_a_stick: ",
+                "killing 'Doc.No.' label and field",
+            )
+            data = remove_item(data, 16)
+            data = remove_item(data, 19-1)
+
+        # 'Post Status:' can float up if the status is 'Not posted'
+        if data[6][0] == "Post Status:" and data[7][0] == "Not posted":
+            self.log_issue(
+                "beat_data_with_a_stick: ",
+                "re-sorting post status label and field",
+            )
+            #Note: add/subtract 1 due to the other operation realigning indices
+            data = move_item(data, 6, 17+1)
+            data = move_item(data, 7-1, 20)
+
+        # 'Function:' and '[1] Full Time' can float up together
+        if data[4][0] == "Function:" and data[5][0] == "[1] Full Time":
+            self.log_issue(
+                "beat_data_with_a_stick: ",
+                "re-sorting function label and field",
+            )
+            #Note: add/subtract 1 due to the other operation realigning indices
+            data = move_item(data, 4, 10+1)
+            data = move_item(data, 5-1, 13)
+        elif data[8][0] == "Function:" and data[11][0] == "[1] Full Time":
+            self.log_issue(
+                "beat_data_with_a_stick: ",
+                "re-sorting function label and field",
+            )
+            data = move_item(data, 8, 10)
+            data = move_item(data, 11, 13)
+
+        # 'Percent Billability:' can float down
+        if data[28][0] == "Percent Billability:":
+            self.log_issue(
+                "beat_data_with_a_stick: ",
+                "re-sorting percent billability label",
+            )
+            data = move_item(data, 28, 25)
+
+        # 'Validation:' and 'Passed' can float up
+        if data[16][0] == "Validation:" and data[19][0] == "Passed":
+            self.log_issue(
+                "beat_data_with_a_stick: ",
+                "re-sorting validation label and field",
+            )
+            data = move_item(data, 16, 17)
+            data = move_item(data, 19, 20)
+
+        # 'Posted'/'Not posted' can float up
+        if data[17][0] == "Posted" or data[17][0] == "Not posted":
+            self.log_issue(
+                "beat_data_with_a_stick: ",
+                "re-sorting post status field",
+            )
+            data = move_item(data, 17, 19)
+
+        # 'ID', 'Time Code', 'Project', and 'TimeType' can float down
+        if data[41][0] == "ID" and data[42][0] == "Time Code" and data[43][0] == "Project" and data[44][0] == "TimeType":
+            self.log_issue(
+                "beat_data_with_a_stick: ",
+                "re-sorting ID, Time Code, Project, and TimeType header",
+            )
+            data = move_item(data, 41, 30)
+            data = move_item(data, 42, 31)
+            data = move_item(data, 43, 32)
+            data = move_item(data, 44, 33)
+        elif data[43][0] == "ID" and data[44][0] == "Time Code" and data[45][0] == "Project" and data[46][0] == "TimeType":
+            self.log_issue(
+                "beat_data_with_a_stick: ",
+                "re-sorting ID, Time Code, Project, and TimeType header",
+            )
+            data = move_item(data, 43, 30)
+            data = move_item(data, 44, 31)
+            data = move_item(data, 45, 32)
+            data = move_item(data, 46, 33)
+
+        return data
+
+    def assert_header_item(self, index, should_be):
+        if self._data[index][0] != should_be:
+            self.log_issue(
+                "assert_header: ",
+                f"item {index} is not {should_be} ",
+                f"(is {self._data[index][0]})",
+            )
+
+    def assert_header_item_any(self, index, should_be_any):
+        if self._data[index][0] not in should_be_any:
+            self.log_issue(
+                "assert_header: ",
+                f"item {index} is not one of {', '.join(should_be_any)} ",
+                f"(is {self._data[index][0]})",
+            )
+
+    def assert_header(self):
+        """Validate a document based on the header lines."""
+        self.assert_header_item(0, "Timesheet")
+        self.assert_header_item(2, "Location:")
+        self.assert_header_item(3, "[E01] Fors Marsh Group")
+        self.assert_header_item(4, "Department:")
+        self.assert_header_item_any(5, ("[3200] Advanced Analytics", "[3230] Data Management", ))
+        self.assert_header_item(6, "Employee Type:")
+        self.assert_header_item(7, "[1] Annual Salary")
+        self.assert_header_item(8, "Location (Default")
+        self.assert_header_item(9, "[LOCAL] Location")
+        self.assert_header_item(10, "Function:")
+        self.assert_header_item(11, "Exempt:")
+        self.assert_header_item(12, "Status:")
+        self.assert_header_item(13, "[1] Full Time")
+        self.assert_header_item(14, "Yes")
+        self.assert_header_item_any(15, ("Approved", "Closed", "On Hold [Draft]", ))
+        self.assert_header_item(16, "Post Status:")
+        self.assert_header_item(17, "Validation:")
+        self.assert_header_item(18, "Date/Time:")
+        self.assert_header_item_any(19, ("Posted", "Not posted", ))
+        self.assert_header_item_any(20, ("Passed", "Warnings", ))
+        #21 should be like "Mon N, YYYY HH:MM"
+        self.assert_header_item(22, "Total Timesheet:")
+        self.assert_header_item(23, "Standard Hours:")
+        self.assert_header_item(24, "Total Billable:")
+        self.assert_header_item(25, "Percent Billability:")
+        if not is_multiple_8(self._data[27][0]):
+            self.log_issue(
+                "assert_header: ",
+                "item 27 is not a multiple of 8 ",
+                f"(is {self._data[27][0]})",
+            )
+        self.assert_header_item(30, "ID")
+        self.assert_header_item(31, "Time Code")
+        self.assert_header_item(32, "Project")
+        self.assert_header_item(33, "TimeType")
+
+    def parse_header(self):
+        """Read data from the document header and clear those lines."""
+        self.header = {
+            "dates": self._data[1][0],
+            "employ_location": self._data[3][0],
+            "employ_location_default": self._data[9][0],
+            "employ_dept": self._data[5][0],
+            "employ_type": self._data[7][0],
+            "employ_exempt": self._data[14][0],
+            "status": self._data[15][0],
+            "status_posting": self._data[19][0],
+            "status_validation": self._data[20][0],
+            "status_timestamp": self._data[21][0],
+            "hours": self._data[26][0],
+            "hours_minimum": self._data[27][0],
+            "hours_billable": self._data[28][0],
+            "percent_billable": self._data[29][0],
+        }
+        del self._data[:34]
+
+    def parse_footer(self):
+        """Loop through lines to identify the document footer and clear it."""
+        target = None
+        for n in range(len(self._data)-1, -1, -1):
+            if self._data[n][0] == "Hours Distribution by Time Code":
+                target = n
+                break
+        if target is None:
+            self.log_issue(
+                "parse_footer: ",
+                "could not locate document footer",
+            )
+        else:
+            del self._data[target:]
+
+    def parse_pages(self):
+        """Loop through lines to identify page headers and footers, and clear
+        those lines.
+        """
+        page_breaks = []
+        for index, line in enumerate(self._data):
+            if line[0] == "Timesheet":
+                page_breaks.append(index)
+        if not page_breaks:
+            self.log_issue(
+                "parse_pages: ",
+                "could not locate any page breaks",
+            )
+
+        for page_break in reversed(page_breaks):
+            # clamp the window: a negative start would index from the end of the list
+            i = max(page_break - 7, 0)
+            while i < page_break and i < len(self._data):
+                if any(p.match(self._data[i][0]) for p in PATTERNS):
+                    del self._data[i]
+                else:
+                    i += 1
+
-- 
2.45.2