From cab4c59713aeea180456717c05ba8f8d42e0597b Mon Sep 17 00:00:00 2001 From: Dominic Ricottone Date: Wed, 4 May 2022 16:31:50 -0500 Subject: [PATCH] Data pipeline error Some (sub)total rows were being mistaken for actual hour entries. Previously I cut off all elements ordered after the 'Hours Distribution by Time Code' marker. This is insufficient because (1) some elements can be slightly higher and (2) some elements can float to the previous page. I've fixed (1) by fudging the y-dimension numbers. (2) appears to only impact a single timesheet (2019-06-15). I've added some more debugging to help diagnose and communicate this issue. --- parser/timesheet.py | 43 +++++++++++++++++++++++++++++++++++++------ parser/xml.py | 10 +++++++++- 2 files changed, 46 insertions(+), 7 deletions(-) diff --git a/parser/timesheet.py b/parser/timesheet.py index e6be2f1..ed75b70 100644 --- a/parser/timesheet.py +++ b/parser/timesheet.py @@ -42,6 +42,7 @@ class TimeEntry(object): self.data = {} self.reference_date = None self.in_notes = False + self.final = False def assert_equal(self, value, should_be): if value != should_be: @@ -51,6 +52,13 @@ class TimeEntry(object): """Given a string like '1.25' and a day offset between 0 and 6, set hours into a date. 
""" + if self.final: + printf( + "hours ({0}) set after entry finalized", + hours, + ) + return + if self.reference_date is None: printf( "hours ({0}) set before a reference date", @@ -65,6 +73,13 @@ class TimeEntry(object): def set_total_week_hours(self, total_hours): """Given a string like '1.25', validate set hours for a week.""" + if self.final: + printf( + "total week hours ({0}) set after entry finalized", + total_hours, + ) + return + if self.reference_date is None: printf( "total week hours ({0}) set before a reference date", @@ -81,11 +96,13 @@ class TimeEntry(object): self.assert_equal(sum_hours, decimal.Decimal(total_hours)) + self.advance_reference_date() + def set_total_line_hours(self, total_hours): """Given a string like '1.25', validate set hours for a line entry.""" - if self.reference_date is None: + if self.final: printf( - "total line hours ({0}) set before a reference date", + "total line hours ({0}) set after entry finalized", total_hours, ) return @@ -96,6 +113,8 @@ class TimeEntry(object): self.assert_equal(sum_hours, decimal.Decimal(total_hours)) + self.mark_final() + def advance_reference_date(self): """Mark the reference date as invalid.""" self.reference_date = None @@ -118,7 +137,7 @@ class TimeEntry(object): def set_label(self, label): """Given a string, set the human-readable project label.""" - if not self.label and not self.in_notes: + if self.label is None and not self.in_notes: self.label = label def mark_notes(self): @@ -127,12 +146,18 @@ class TimeEntry(object): """ self.in_notes = True + def mark_final(self): + """Mark that no more hours should be accepted.""" + self.final = True + class TimeSheet(object): def __init__(self, data): self.data = data self.entries = [] for row in range(len(self.data)): - self.parse_row(row) + rc = self.parse_row(row) + if rc: + break def set_hours(self, day, hours): """Given a string like '1.25' and a day offset between 0 and 6, set @@ -171,7 +196,11 @@ class TimeSheet(object): """Given a string 
like '12345.123.12.123', set the official project code. """ - self.entries[-1].set_project(project) + if " " in project: + project_timetype = project.split(" ", 1) + self.entries[-1].set_project(project_timetype[0]) + else: + self.entries[-1].set_project(project) def set_label(self, label): """Given a string, set the human-readable project label.""" @@ -186,7 +215,7 @@ class TimeSheet(object): def parse_row(self, index): """Parse a row of data and dispatch between time entry methods.""" if len(self.data[index])<3: - return + return True if APPROVED_PATTERN.match(self.data[index][2]): pass @@ -234,6 +263,8 @@ class TimeSheet(object): else: self.set_label(self.data[index][2]) + return False + def parse(filename): with open(filename, "r", newline="") as f: reader = csv.reader(f) diff --git a/parser/xml.py b/parser/xml.py index 3b6a92a..2925154 100644 --- a/parser/xml.py +++ b/parser/xml.py @@ -112,7 +112,7 @@ class TimeSheetHandler(handler.ContentHandler): if not handled: text = self.pop_buffer() if text=="Hours Distribution by Time Code": - self.in_hours_distribution = True + self.record_hours_distribution() else: self.record_text(text) self.in_textbox = False @@ -138,6 +138,14 @@ class TimeSheetHandler(handler.ContentHandler): """Helper function to append new text data to the line buffer.""" self.line_buffer[-1].append(data) + def record_hours_distribution(self): + """Helper function to fudge some numbers. Inflate the y-dimension of + the final value in order to better catch some footer elements, + especially (sub)total rows. + """ + self.in_hours_distribution = True + self.line_buffer[-1][1] = str(float(self.line_buffer[-1][1]) + 5) + def append_buffer(self, data): """Helper function to append new character data to the text buffer.""" self.text_buffer += data -- 2.45.2