From 8ebab2a421aa9bb58b0f3fd43e517190ba12e752 Mon Sep 17 00:00:00 2001
From: Dominic Ricottone <me@dominic-ricottone.com>
Date: Tue, 3 May 2022 21:35:27 -0500
Subject: [PATCH] Fully functional timesheet parser.

The timesheet parser is a complete success. Some minor issues were
ironed out in the XML parser as well.

Next steps: writing to a time series database and beginning analysis.
---
 main.py             |  21 +-
 parser/timeentry.py | 163 --------------
 parser/timesheet.py | 536 +++++++++++++++++++-------------------------
 parser/xml.py       |   4 +-
 4 files changed, 245 insertions(+), 479 deletions(-)
 delete mode 100644 parser/timeentry.py

diff --git a/main.py b/main.py
index eb3c0f6..7cfe598 100644
--- a/main.py
+++ b/main.py
@@ -2,12 +2,15 @@
 
 import sys
 import pathlib
+from pprint import pprint
 
 from parser.xml import parse as parse_xml
 from parser.pdf import parse as parse_pdf
-from parser.timesheet import TimeSheet
+from parser.timesheet import parse as parse_timesheet
 
 def main(filelist):
+    projects = {}
+
     print(f"processing {len(filelist)} files")
     for filename in (filelist):
         xml_filename = filename.parent.joinpath(filename.name + ".xml")
@@ -16,8 +19,20 @@ def main(filelist):
         parse_pdf(filename, xml_filename)
         parse_xml(xml_filename, csv_filename)
 
-        #timesheet = TimeSheet(semistructured_data)
-        #timesheet.report_issues()
+        entries = parse_timesheet(csv_filename)
+        for entry in entries:
+            if entry.time_code in ("HOL", "OTU", "VAC", "OPL", ):
+                if entry.time_code not in projects.keys():
+                    projects[entry.time_code] = {}
+                for date, hours in entry.data.items():
+                    projects[entry.time_code][date] = hours
+            else:
+                if entry.project not in projects.keys():
+                    projects[entry.project] = {}
+                for date, hours in entry.data.items():
+                    projects[entry.project][date] = hours
+
+    pprint(projects)
 
 if __name__ == "__main__":
     filelist = []
diff --git a/parser/timeentry.py b/parser/timeentry.py
deleted file mode 100644
index 51faff8..0000000
--- a/parser/timeentry.py
+++ /dev/null
@@ -1,163 +0,0 @@
-#!/usr/bin/env python3
-
-from re import compile as re_compile
-
-PROJECT_PATTERN = re_compile("(OHORG\.(MTG|ONB|PFR|TPD|TOH)\.[01][0-9]\.00[0-9]|UNALW\.EMW\.0[0-9]\.00[0-9]|FSERV\.PPM\.00\.000|[0-9]{5}\.[0-9]{3}\.[0-9]{2}\.[0-9]{3})")
-TIMETYPE_PATTERN = re_compile("(FSERV|[0-9]{3,5})")
-DATE_PATTERN = re_compile("[ADFJMNOS][aceopu][bcglnprtvy] [0-9][0-9]?, 20[12][89012]")
-
-ENTRY_PATTERNS = (
-    re_compile("[1-9][0-9]?"),
-    re_compile("(HOL|OTU|ST|VAC)"),
-    re_compile("(OHORG\.(MTG|ONB|PFR|TPD|TOH)\.[01][0-9]\.00[0-9]|UNALW\.EMW\.0[0-9]\.00[0-9]|FSERV\.PPM\.00\.000|[0-9]{5}\.[0-9]{3}\.[0-9]{2}\.[0-9]{3})"),
-    re_compile("(FSERV|[0-9]{3,5})"),
-    re_compile("."),
-    re_compile("Notes"),
-    re_compile("[ADFJMNOS][aceopu][bcglnprtvy] [0-9][0-9]?, 20[12][89012] -"),
-    re_compile("[ADFJMNOS][aceopu][bcglnprtvy] [0-9][0-9]?, 20[12][89012] Other"),
-    re_compile("."),
-    re_compile("Approved"),
-)
-
-class TimeEntry(object):
-    def __init__(self, semistructured_data, timesheet_name, entry_number):
-        self._issues = []
-        self._timesheet = timesheet_name
-        self._entry = entry_number
-        self._data = self.beat_data_with_a_stick(semistructured_data)
-        self.parse_entry_number()
-        self.parse_entry_project()
-        self.parse_entry_notes()
-        self.parse_entry_approval()
-
-    def __str__(self):
-        return f"entry #{self._entry} in the timesheet for {self._timesheet}"
-
-    def __len__(self):
-        return len(self._data)
-
-    def report_issues(self):
-        if self._issues:
-            print(f"There are some issues in {self}...")
-            for issue in self._issues:
-                print(issue)
-            for index, line in enumerate(self._data):
-                print(index, line)
-
-    def log_issue(self, *parts):
-        self._issues.append(''.join(parts))
-
-
-    def assert_entry_item_any(self, index, label, should_be_any):
-        if self._data[index][0] not in should_be_any:
-            self.log_issue(
-                "parse_entry: ",
-                f"{label} not one of {', '.join(should_be_any)} ",
-                f"(is {self._data[index][0]})",
-            )
-
-    def assert_entry_item_match(self, index, label, pattern):
-        if not pattern.match(self._data[index][0]):
-            self.log_issue(
-                "parse_entry: ",
-                f"{label} does not look correct ",
-                f"(is {self._data[index][0]})",
-            )
-
-    def assert_entry_item_at(self, index, label, should_be_at):
-        if self._data[index][1] != should_be_at:
-            self.log_issue(
-                "parse_entry: ",
-                f"{label} not at {should_be_at}px left ",
-                f"(is at {self._data[index][1]}px)",
-            )
-
-    def parse_entry_number(self):
-        del self._data[0]
-
-    def parse_entry_project(self):
-        self.assert_entry_item_any(0, "time code", ("HOL", "OTU", "ST", "VAC", ))
-        self.assert_entry_item_at(0, "time code", 40)
-        self._timecode = self._data[0][0]
-        del self._data[0]
-
-        if self._timecode in ("HOL", "OTU", "VAC", ):
-            self._project = ""
-            self._timetype = ""
-        elif " " in self._data[0][0]:
-            self.assert_entry_item_match(0, "project", PROJECT_PATTERN)
-            self.assert_entry_item_at(0, "project", 120)
-            project_timetype = self._data[0][0].split(" ", 1)
-            self._project = project_timetype[0]
-            self._timetype = project_timetype[1]
-            del self._data[0]
-        else:
-            self.assert_entry_item_match(0, "project", PROJECT_PATTERN)
-            self.assert_entry_item_at(0, "project", 120)
-            self._project = self._data[0][0]
-            self.assert_entry_item_match(1, "time type", TIMETYPE_PATTERN)
-            self.assert_entry_item_at(1, "time type", 201)
-            self._timetype = self._data[0][0]
-            del self._data[0:2]
-
-        self._name = self._data[0][0]
-        del self._data[0]
-
-    def parse_entry_note(self):
-        if self._data[0][0] == "Line Note:":
-            # There is no start or end field, just the text field
-            self._notes.append({
-                "start": "",
-                "end": "",
-                "text": self._data[1][0],
-            })
-            del self._data[0:2]
-        else:
-            match = DATE_PATTERN.match(self._data[0][0])
-            if match:
-                self._notes.append({})
-
-                # Handle the start field
-                self.assert_entry_item_at(0, "note start", 40)
-                self._notes[-1]["start"] = match
-
-                # Handle the start/end separator if it is separate from the
-                # start field
-                if self._data[1][0] == "-":
-                    del self._data[1]
-
-                # Handle the end field
-                self.assert_entry_item_at(1, "note end", 99)
-                match = DATE_PATTERN.match(self._data[1][0])
-                if match:
-                    self._notes[-1]["end"] = match
-
-                # Handle the 'Approved' note if it floated up
-                if self._data[2][0] == "Approved":
-                    del self._data[2]
-
-                # Handle the text field
-                self.assert_entry_item_at(2, "note text", 201)
-                self._notes[-1]["text"] = self._data[2][0]
-
-                del self._data[0:3]
-
-    def parse_entry_notes(self):
-        self._notes = []
-        if len(self) and self._data[0][0] == "Notes":
-            del self._data[0]
-
-            # One or two notes follow
-            if len(self):
-                self.parse_entry_note()
-            if len(self):
-                self.parse_entry_note()
-
-    def parse_entry_approval(self):
-        if len(self) and self._data[0][0] == "Approved":
-            del self._data[0]
-
-    def beat_data_with_a_stick(self, data):
-        data = [i for i in data if i[1]<400] + [i for i in data if i[1]>=400]
-        return data
-
diff --git a/parser/timesheet.py b/parser/timesheet.py
index f26626f..e6be2f1 100644
--- a/parser/timesheet.py
+++ b/parser/timesheet.py
@@ -1,329 +1,243 @@
 #!/usr/bin/env python3
 
+import datetime
+import decimal
+import csv
+import sys
 from re import compile as re_compile
 
-from .timeentry import TimeEntry, ENTRY_PATTERNS
-
-PAGE_PATTERNS = (
-    re_compile("[ADFJMNOS][aceopu][bcglnprtvy] [1-9][0-9]?, 20[12][89012] [012][0-9]:[0-5][0-9]"),
-    re_compile("\(GMT-0[456]:00\) .*"),
-    re_compile("Page [1-9][0-9]?"),
-    re_compile("of [1-9][0-9]?"),
-    re_compile("Timesheet"),
-    re_compile("[ADFJMNOS][aceopu][bcglnprtvy] [1-9][0-9]?, 20[12][89012] - [ADFJMNOS][aceopu][bcglnprtvy] [1-9][0-9]?, 20[12][89012]"),
-    re_compile("ID"),
-    re_compile("Time Code"),
-    re_compile("Project"),
-    re_compile("TimeType"),
-)
-
-def move_item(data, index, destination):
-    """Re-order a list, moving an item from `index` to `destination`."""
-    if index < destination:
-        return data[:index] + data[index+1:destination+1] + data[index:index+1] + data[destination+1:]
-    elif index > destination:
-        return data[:destination] + data[index:index+1] + data[destination:index] + data[index+1:]
-    else:
-        return data
-
-def remove_item(data, index):
-    """Adjust a list, removing an item from `index`."""
-    del data[index]
-    return data
-
-def is_multiple_8(string):
-    try:
-        return int(float(string)) % 8 == 0
-    except:
-        return False
-
-class TimeSheet(object):
-    def __init__(self, semistructured_data):
-        self._warnings = []
-        self._issues = []
-        self._data = self.beat_data_with_a_stick(semistructured_data)
-        self.assert_header()
-        self.parse_header()
-        self.parse_footer()
-        self.parse_pages()
-        self.parse_entries()
-
-    def __str__(self):
-        return self.header.get("dates", "TimeSheet(...)")
-
-    def __len__(self):
-        return len(self._data)
-
-    def report_issues(self):
-        if self._warnings:
-            print(f"There are some warnings in the timesheet for {self}...")
-            for warning in self._warnings:
-                print(warning)
-        if self._issues:
-            print(f"There are some issues in the timesheet for {self}...")
-            for issue in self._issues:
-                print(issue)
-            for index, line in enumerate(self._data):
-                print(index, line)
-        for entry in self._entries:
-            entry.report_issues()
-
-    def log_issue(self, *parts):
-        self._issues.append(''.join(parts))
-
-    def log_warning(self, *parts):
-        self._warnings.append(''.join(parts))
-
-    def beat_data_with_a_stick(self, data):
-        """A fragile solution to the problems resulting from
-        PDF-to-HTML-to-long data conversions.
+ID_PATTERN = re_compile("[1-2]?[0-9]$")
+TIME_CODE_PATTERN = re_compile("(ST|VAC|HOL|OTU|OPL)")
+PROJECT_PATTERN = re_compile("[A-Z0-9]{5}\.[A-Z0-9]{3}\.[A-Z0-9]{2}\.[A-Z0-9]{3}")
+TIMETYPE_PATTERN = re_compile("[A-Z0-9.]{3,}")
+WEEK_BEGINNING_AND_WEEK_PATTERN = re_compile("Week Beginning: [0-9][0-9] [ADFJMNOS][aceopu][bcglnprtvy], 20[12][89012]")
+WEEK_BEGINNING_PATTERN = re_compile("Week Beginning:")
+WEEK_PATTERN = re_compile("[0-9][0-9] [ADFJMNOS][aceopu][bcglnprtvy], 20[12][89012]")
+HOURS_PATTERN = re_compile("[0-9][0-9]?\.(00|25|50|75)")
+TOTAL_HOURS_PATTERN = re_compile("Total Hours for line ")
+APPROVED_PATTERN = re_compile("Approved")
+NOTES_PATTERN = re_compile("Notes")
+
+def printf(string, *variables):
+    """Print to STDERR with formatting."""
+    sys.stderr.write(string.format(*variables))
+    sys.stderr.write("\n")
+
+def is_approximately(location, target):
+    """Tests if a location is close enough to a target to be considered equal.
+
+    PDFs store the rendered location of a textbox, not the mathematically-
+    ideal location. The net effect is that, while you can rely on the y
+    dimension to identify a row, you cannot rely on the x dimension to
+    identify a column. My solution is to make equivalence a bit fuzzy, to the
+    effect of +/- 8 pixels.
+    """
+    return (target-8 <= location <= target+8)
+
+class TimeEntry(object):
+    def __init__(self):
+        self.label = None
+        self.project = None
+        self.time_code = None
+        self.data = {}
+        self.reference_date = None
+        self.in_notes = False
+
+    def assert_equal(self, value, should_be):
+        if value != should_be:
+            printf("{0} is not {1}", value, should_be)
+
+    def set_hours(self, day_offset, hours):
+        """Given a string like '1.25' and a day offset between 0 and 6, set
+        hours into a date.
         """
-        # a small number of timesheets have a 'Doc.No.' field
-        if data[14][0] == "Doc.No." and data[17][0] == "1":
-            self.log_warning("beat_data_with_a_stick: killing 'Doc.No.' fields")
-            #Note: subtract 1 due to the other operation realigning indices
-            data = remove_item(data, 14)
-            data = remove_item(data, 17-1)
-        elif data[16][0] == "Doc.No." and data[19][0] == "1":
-            self.log_warning(
-                "beat_data_with_a_stick: ",
-                "killing 'Doc.No.' label and field",
-            )
-            data = remove_item(data, 16)
-            data = remove_item(data, 19-1)
-
-        # 'Post Status:' can float up if the status is 'Not posted'
-        if data[6][0] == "Post Status:" and data[7][0] == "Not posted":
-            self.log_warning(
-                "beat_data_with_a_stick: ",
-                "re-sorting post status label and field",
-            )
-            #Note: add/subtract 1 due to the other operation realigning indices
-            data = move_item(data, 6, 17+1)
-            data = move_item(data, 7-1, 20)
-
-        # 'Function:' and '[1] Full Time' can float up together
-        if data[4][0] == "Function:" and data[5][0] == "[1] Full Time":
-            self.log_warning(
-                "beat_data_with_a_stick: ",
-                "re-sorting function label and field",
+        if self.reference_date is None:
+            printf(
+                "hours ({0}) set before a reference date",
+                hours,
             )
-            #Note: add/subtract 1 due to the other operation realigning indices
-            data = move_item(data, 4, 10+1)
-            data = move_item(data, 5-1, 13)
-        elif data[8][0] == "Function:" and data[11][0] == "[1] Full Time":
-            self.log_warning(
-                "beat_data_with_a_stick: ",
-                "re-sorting function label and field",
+            return
+
+        target = self.reference_date
+        if day_offset != 0:
+            target += datetime.timedelta(days=day_offset)
+        self.data[target] = decimal.Decimal(hours)
+
+    def set_total_week_hours(self, total_hours):
+        """Given a string like '1.25', validate set hours for a week."""
+        if self.reference_date is None:
+            printf(
+                "total week hours ({0}) set before a reference date",
+                total_hours,
             )
-            data = move_item(data, 8, 10)
-            data = move_item(data, 11, 13)
-
-        # 'Percent Billability:' can float down
-        if data[28][0] == "Percent Billability:":
-            self.log_warning(
-                "beat_data_with_a_stick: ",
-                "re-sorting percent billability label",
+            return
+
+        target = self.reference_date
+        sum_hours = decimal.Decimal(0)
+        for _ in range(7):
+            if target in self.data:
+                sum_hours += self.data[target]
+            target += datetime.timedelta(days=1)
+
+        self.assert_equal(sum_hours, decimal.Decimal(total_hours))
+
+    def set_total_line_hours(self, total_hours):
+        """Given a string like '1.25', validate set hours for a line entry."""
+        if self.reference_date is None:
+            printf(
+                "total line hours ({0}) set before a reference date",
+                total_hours,
             )
-            data = move_item(data, 28, 25)
+            return
 
-        # 'Validation:' and 'Passed' can float up
-        if data[16][0] == "Validation:" and data[19][0] == "Passed":
-            self.log_warning(
-                "beat_data_with_a_stick: ",
-                "re-sorting validation label and field",
-            )
-            data = move_item(data, 16, 17)
-            data = move_item(data, 19, 20)
-
-        # 'Posted'/'Not posted' can float up
-        if data[17][0] == "Posted" or data[17][0] == "Not posted":
-            self.log_warning(
-                "beat_data_with_a_stick: ",
-                "re-sorting post status field",
-            )
-            data = move_item(data, 17, 19)
+        sum_hours = decimal.Decimal(0)
+        for date, hours in self.data.items():
+            sum_hours += hours
 
-        # 'ID', 'Time Code', 'Project', and 'TimeType' and float down
-        if data[41][0] == "ID" and data[42][0] == "Time Code" and data[43][0] == "Project" and data[44][0] == "TimeType":
-            self.log_warning(
-                "beat_data_with_a_stick: ",
-                "re-sorting ID, Time Code, Project, and TimeType header",
-            )
-            data = move_item(data, 41, 30)
-            data = move_item(data, 42, 31)
-            data = move_item(data, 43, 32)
-            data = move_item(data, 44, 33)
-        elif data[43][0] == "ID" and data[44][0] == "Time Code" and data[45][0] == "Project" and data[46][0] == "TimeType":
-            self.log_warning(
-                "beat_data_with_a_stick: ",
-                "re-sorting ID, Time Code, Project, and TimeType header",
-            )
-            data = move_item(data, 43, 30)
-            data = move_item(data, 44, 31)
-            data = move_item(data, 45, 32)
-            data = move_item(data, 46, 33)
-
-        return data
-
-    def assert_header_item(self, index, should_be):
-        if self._data[index][0] != should_be:
-            self.log_issue(
-                "assert_header: ",
-                f"item {index} is not {should_be} ",
-                f"(is {self._data[index][0]})",
-            )
+        self.assert_equal(sum_hours, decimal.Decimal(total_hours))
 
-    def assert_header_item_any(self, index, should_be_any):
-        if self._data[index][0] not in should_be_any:
-            self.log_issue(
-                "assert_header: ",
-                f"item {index} is not one of {', '.join(should_be_any)} ",
-                f"(is {self._data[index][0]})",
-            )
+    def advance_reference_date(self):
+        """Mark the reference date as invalid."""
+        self.reference_date = None
 
-    def assert_header(self):
-        """Validate a document based on the header lines."""
-        self.assert_header_item(0, "Timesheet")
-        self.assert_header_item(2, "Location:")
-        self.assert_header_item(3, "[E01] Fors Marsh Group")
-        self.assert_header_item(4, "Department:")
-        self.assert_header_item_any(5, ("[3200] Advanced Analytics", "[3230] Data Management", ))
-        self.assert_header_item(6, "Employee Type:")
-        self.assert_header_item(7, "[1] Annual Salary")
-        self.assert_header_item(8, "Location (Default")
-        self.assert_header_item(9, "[LOCAL] Location")
-        self.assert_header_item(10, "Function:")
-        self.assert_header_item(11, "Exempt:")
-        self.assert_header_item(12, "Status:")
-        self.assert_header_item(13, "[1] Full Time")
-        self.assert_header_item(14, "Yes")
-        self.assert_header_item_any(15, ("Approved", "Closed", "On Hold [Draft]", ))
-        self.assert_header_item(16, "Post Status:")
-        self.assert_header_item(17, "Validation:")
-        self.assert_header_item(18, "Date/Time:")
-        self.assert_header_item_any(19, ("Posted", "Not posted", ))
-        self.assert_header_item_any(20, ("Passed", "Warnings", ))
-        #21 should be like "Mon N, YYYY HH:MM"
-        self.assert_header_item(22, "Total Timesheet:")
-        self.assert_header_item(23, "Standard Hours:")
-        self.assert_header_item(24, "Total Billable:")
-        self.assert_header_item(25, "Percent Billability:")
-        if not is_multiple_8(self._data[27][0]):
-            self.log_issue(
-                "assert_header: ",
-                "item 27 is not a multiple of 8 ",
-                f"(is {self._data[27]})",
-            )
-        self.assert_header_item(30, "ID")
-        self.assert_header_item(31, "Time Code")
-        self.assert_header_item(32, "Project")
-        self.assert_header_item(33, "TimeType")
-
-    def parse_header(self):
-        """Read data from the document header and clear those lines."""
-        self.header = {
-            "dates": self._data[1][0],
-            "employ_location": self._data[3][0],
-            "employ_location_default": self._data[9][0],
-            "employ_dept": self._data[5][0],
-            "employ_type": self._data[7][0],
-            "employ_exempt": self._data[14][0],
-            "status": self._data[15][0],
-            "status_posting": self._data[19][0],
-            "status_validation": self._data[20][0],
-            "status_timestamp": self._data[21][0],
-            "hours": self._data[26][0],
-            "hours_minimum": self._data[27][0],
-            "hours_billable": self._data[28][0],
-            "percent_billable": self._data[29][0],
-        }
-        del self._data[:34]
-
-    def parse_footer(self):
-        """Loop though lines to identify the document footer and clear it."""
-        target = None
-        for n in range(len(self)-1, 0, -1):
-            if self._data[n][0] == "Hours Distribution by Time Code":
-                target = n
-                break
-        if target is None:
-            self.log_issue(
-                "parse_footer: ",
-                "could not locate document footer",
-            )
+    def set_reference_date(self, date):
+        """Given a string like '01 Jan, 2022', set the reference date for
+        subsequent method calls.
+        """
+        self.reference_date = datetime.datetime.strptime(date, "%d %b, %Y")
+
+    def set_time_code(self, time_code):
+        """Given a time code such as 'ST', 'HOL', 'VAC', 'OTU', or 'OPL', set it."""
+        self.time_code = time_code
+
+    def set_project(self, project):
+        """Given a string like '12345.123.12.123', set the official project
+        code.
+        """
+        self.project = project
+
+    def set_label(self, label):
+        """Given a string, set the human-readable project label."""
+        if not self.label and not self.in_notes:
+            self.label = label
+
+    def mark_notes(self):
+        """Mark that an entry is receiving notes. Subsequent labels should be
+        ignored.
+        """
+        self.in_notes = True
+
+class TimeSheet(object):
+    def __init__(self, data):
+        self.data = data
+        self.entries = []
+        for row in range(len(self.data)):
+            self.parse_row(row)
+
+    def set_hours(self, day, hours):
+        """Given a day offset between 0 and 6 and a string like '1.25' (or
+        '1.25 2.00' for two consecutive days), set hours into the date(s).
+        """
+        if " " in hours:
+            two_hours = hours.split(" ", 1)
+            self.entries[-1].set_hours(day, two_hours[0])
+            self.entries[-1].set_hours(day+1, two_hours[1])
         else:
-            del self._data[target:]
+            self.entries[-1].set_hours(day, hours)
+
+    def set_total_week_hours(self, total_hours):
+        """Given a string like '1.25', validate set hours for a week."""
+        self.entries[-1].set_total_week_hours(total_hours)
+
+    def set_total_line_hours(self, total_hours):
+        """Given a string like '1.25', validate set hours for a line entry."""
+        self.entries[-1].set_total_line_hours(total_hours)
 
-    def parse_pages(self):
-        """Loop through lines to identify page headers and footers, and clear
-        those lines.
+    def advance_reference_date(self):
+        """Mark the reference date as invalid."""
+        self.entries[-1].advance_reference_date()
+
+    def set_reference_date(self, date):
+        """Given a string like '01 Jan, 2022', set the reference date for
+        subsequent method calls.
         """
-        # Pages begin with "Timesheet"
-        page_breaks = []
-        for index, line in enumerate(self._data):
-            if line[0] == "Timesheet":
-                page_breaks.append(index)
-        if not page_breaks:
-            self.log_issue(
-                "parse_pages: ",
-                "could not locate any page breaks",
-            )
+        self.entries[-1].set_reference_date(date)
+
+    def set_time_code(self, time_code):
+        """Given a time code such as 'ST', 'HOL', 'VAC', 'OTU', or 'OPL', set it."""
+        self.entries[-1].set_time_code(time_code)
+
+    def set_project(self, project):
+        """Given a string like '12345.123.12.123', set the official project
+        code.
+        """
+        self.entries[-1].set_project(project)
+
+    def set_label(self, label):
+        """Given a string, set the human-readable project label."""
+        self.entries[-1].set_label(label)
 
-        # At each page break, there is a page footer sequence before:
-        #  + "Mmm N, YYYY HH:MM"
-        #  + "(GMT-0H:00) TZNAME"
-        #  + "Page N"
-        #  + "of N"
-        # ...and a page header sequence after:
-        #  + "Mmm N, YYYY - Mmm N, YYYY"
-        #  + "ID"
-        #  + "Time Code"
-        #  + "Project"
-        #  + "TimeType"
-        # None of this is useful data. Immediately delete it.
-        for page_break in reversed(page_breaks):
-            i = page_break - 7
-            # don't question why this is necessary
-            if str(self)=="May 16, 2019 - May 31, 2019" and page_break == 275:
-                i -= 30
-
-            while i <= page_break:
-                j = 0
-                while j < len(PAGE_PATTERNS):
-                    if PAGE_PATTERNS[j].match(self._data[i][0]):
-                        del self._data[i]
-                        j = 0
-                    else:
-                        j += 1
-                i += 1
-
-    def parse_entries(self):
-        """Loop through lines to identify time entries."""
-        i = 0
-        entries = []
-        while i < len(self):
-            if self._data[i][1] == 20:
-                if ENTRY_PATTERNS[0].match(self._data[i][0]):
-                    entries.append([self._data[i]])
-                else:
-                    self.log_issue(
-                        "parse_entries: ",
-                        "something unexpected in the entry start position ",
-                        f"({self._data[i][0]})",
-                    )
+    def mark_notes(self):
+        """Mark that an entry is receiving notes. Subsequent labels should be
+        ignored.
+        """
+        self.entries[-1].mark_notes()
+
+    def parse_row(self, index):
+        """Parse a row of data and dispatch between time entry methods."""
+        if len(self.data[index])<3:
+            return
+
+        if APPROVED_PATTERN.match(self.data[index][2]):
+            pass
+        elif WEEK_BEGINNING_AND_WEEK_PATTERN.match(self.data[index][2]):
+            self.set_reference_date(self.data[index][2].split(": ", 1)[1])
+        elif WEEK_BEGINNING_PATTERN.match(self.data[index][2]):
+            self.advance_reference_date()
+        elif TOTAL_HOURS_PATTERN.match(self.data[index][2]):
+            self.set_total_line_hours(self.data[index][2].split(": ", 1)[1])
+        elif WEEK_PATTERN.match(self.data[index][2]):
+            self.set_reference_date(self.data[index][2])
+        elif HOURS_PATTERN.match(self.data[index][2]):
+            x = int(float(self.data[index][0]))
+            if is_approximately(x, 572):
+                self.set_hours(0, self.data[index][2])
+            elif is_approximately(x, 597):
+                self.set_hours(1, self.data[index][2])
+            elif is_approximately(x, 622):
+                self.set_hours(2, self.data[index][2])
+            elif is_approximately(x, 647):
+                self.set_hours(3, self.data[index][2])
+            elif is_approximately(x, 672):
+                self.set_hours(4, self.data[index][2])
+            elif is_approximately(x, 697):
+                self.set_hours(5, self.data[index][2])
+            elif is_approximately(x, 722):
+                self.set_hours(6, self.data[index][2])
+            elif is_approximately(x, 751):
+                self.set_total_week_hours(self.data[index][2])
             else:
-                if self._data[i][0] in ("Mon", "Tue Wed", "Thu", "Fri", "Sat", "Sun", "Total", ):
-                    pass
-                elif len(entries)>0:
-                    entries[-1].append(self._data[i])
-                else:
-                    self.log_issue(
-                        "parse_entries: ",
-                        "something unexpected before any entries started ",
-                        f"({self._data[i][0]})",
-                    )
-            i += 1
-
-        self._entries = [TimeEntry(e, str(self), e[0][0]) for e in entries]
+                printf(
+                    "found hours ({0}) but they fell through all conditions",
+                    self.data[index][2],
+                )
+        elif TIME_CODE_PATTERN.match(self.data[index][2]):
+            self.set_time_code(self.data[index][2])
+        elif PROJECT_PATTERN.match(self.data[index][2]):
+            self.set_project(self.data[index][2])
+        elif TIMETYPE_PATTERN.match(self.data[index][2]):
+            pass
+        elif ID_PATTERN.match(self.data[index][2]):
+            self.entries.append(TimeEntry())
+        elif NOTES_PATTERN.match(self.data[index][2]):
+            self.mark_notes()
+        else:
+            self.set_label(self.data[index][2])
+
+def parse(filename):
+    with open(filename, "r", newline="") as f:
+        reader = csv.reader(f)
+        timesheet = TimeSheet([row for row in reader])
+        entries = timesheet.entries
+        return entries
 
diff --git a/parser/xml.py b/parser/xml.py
index 7ac3b7f..3b6a92a 100644
--- a/parser/xml.py
+++ b/parser/xml.py
@@ -187,11 +187,11 @@ class TimeSheetHandler(handler.ContentHandler):
         loc = (int(float(location_xy[0])), int(float(location_xy[1])), )
 
         if self.pagenum == "1":
-            if is_approximately(loc, (333, 524, )):
+            if is_approximately(loc, (335, 524, )):
                 self.in_header_footer_parts["timesheet_label"] = True
                 return True
 
-            elif is_approximately(loc, (333, 504, )):
+            elif is_approximately(loc, (335, 504, )):
                 self.in_header_footer_parts["timesheet_value"] = True
                 return True
 
-- 
2.45.2