@@ 0,0 1,695 @@
+#!/usr/bin/env python3
+
+import sys
+from xml.sax import handler, make_parser
+import csv
+
+def printf(string, *variables):
+ """Print to STDERR with formatting."""
+ sys.stderr.write(string.format(*variables))
+ sys.stderr.write("\n")
+
+def is_approximately(location, target):
+ """Tests if a location is close enough to a target to be considered equal.
+
+ PDFs store the rendered location of a textbox, not the mathematically-
+ ideal location. The net effect is that, while you can rely on the y
+ dimension to identify a row, you cannot rely on the x dimension to
+ identify a column. My solution is to make equivalence a bit fuzzy, to the
+ effect of +/- 5 pixels.
+ """
+ if (location[1] == target[1]
+ and target[0]-5 <= location[0] <= target[0]+5):
+ return True
+ return False
+
+class TimeSheetHandler(handler.ContentHandler):
+ def __init__(self):
+ handler.ContentHandler.__init__(self)
+
+ self.text_buffer = ""
+ self.line_buffer = []
+ self.page_buffer = []
+
+ self.pagenum = None
+ self.daterange = None
+
+ self.in_textbox = False
+ self.in_text = False
+ self.in_figure = False
+ self.in_hours_distribution = False
+
+ self.in_header_footer_parts = {
+ "timesheet_label": False,
+ "timesheet_value": False,
+ "location_label": False,
+ "location_value": False,
+ "department_label": False,
+ "department_value": False,
+ "employee_type_label": False,
+ "employee_type_value": False,
+ "location_default_label": False,
+ "location_default_value": False,
+ "function_label": False,
+ "function_value": False,
+ "exempt_label": False,
+ "exempt_value": False,
+ "status_label": False,
+ "status_value": False,
+ "post_status_label": False,
+ "post_status_value": False,
+ "validation_label": False,
+ "validation_value": False,
+ "datetime_label": False,
+ "datetime_value": False,
+ "total_timesheet_label": False,
+ "total_timesheet_value": False,
+ "standard_hours_label": False,
+ "standard_hours_value": False,
+ "total_billable_label": False,
+ "total_billable_value": False,
+ "percent_billability_label": False,
+ "percent_billability_value": False,
+ "id_label": False,
+ "time_code_label": False,
+ "project_label": False,
+ "timetype_label": False,
+ "day_label": False,
+ "total_label": False,
+ "_doc_no_label": False,
+ "_doc_no_value": False,
+ "footer_datetime_value": False,
+ "footer_timezone_value": False,
+ "footer_pagenum_value": False,
+ "footer_pagecount_value": False,
+ }
+
+ def startElement(self, name, attrs):
+ if name=="page":
+ self.in_figure = False
+ self.pagenum = attrs["id"]
+ elif name=="figure":
+ self.in_figure = True
+ elif not self.in_figure and not self.in_hours_distribution:
+ if name=="textbox":
+ handled = self.handle_header_footer_start(attrs["bbox"])
+ if not handled:
+ self.record_location(attrs["bbox"])
+ self.in_textbox = True
+ elif self.in_textbox and name=="text":
+ self.in_text = True
+
+ def characters(self, data):
+ if self.in_text:
+ self.append_buffer(data)
+
+ def endElement(self, name):
+ if name=="page":
+ self.sort_lines()
+ elif not self.in_figure and not self.in_hours_distribution:
+ if name=="textbox":
+ handled = self.handle_header_footer_end()
+ if not handled:
+ text = self.pop_buffer()
+ if text=="Hours Distribution by Time Code":
+ self.in_hours_distribution = True
+ else:
+ self.record_text(text)
+ self.in_textbox = False
+ elif name=="text":
+ self.in_text = False
+
+ def sort_lines(self):
+ """Helper function to perform page-level cleaning on line-level data.
+ Manipulates the page buffer and clears the line buffer.
+ """
+ sorted_lines = sorted(
+ self.line_buffer,
+ key=lambda x: (-float(x[1]), float(x[0])),
+ )
+ self.line_buffer = []
+ self.page_buffer.append(sorted_lines)
+
+ def record_location(self, data):
+ """Helper function to append new location data to the line buffer."""
+ self.line_buffer.append(data.split(",")[:2])
+
+ def record_text(self, data):
+ """Helper function to append new text data to the line buffer."""
+ self.line_buffer[-1].append(data)
+
+ def append_buffer(self, data):
+ """Helper function to append new character data to the text buffer."""
+ self.text_buffer += data
+
+ def pop_buffer(self):
+ """Helper function to grab aggregated character data and reset the
+ text buffer.
+ """
+ data = self.text_buffer
+ self.text_buffer = ""
+ return data.strip()
+
+ def debug_assert(self, value, should_be, label=""):
+ """Helper function to manage validation logic and conditional print
+ statements.
+ """
+ if label:
+ printf_label = f"{self.daterange}:{self.pagenum}:{label}:"
+ else:
+ printf_label = f"{self.daterange}:{self.pagenum}:"
+
+ if isinstance(should_be, tuple):
+ if value not in should_be:
+ printf(
+ printf_label + "should be {0}, is {1}",
+ "one of " + ", ".join(should_be),
+ value,
+ )
+ elif value != should_be:
+ printf(printf_label + "should be {0}, is {1}", should_be, value)
+
+ def handle_header_footer_start(self, location):
+ """Handle header and footer content on a page.
+
+ There is some branching based on whether the current page is the first
+ page of a document.
+
+ If a header or footer box is identified (based on its location),
+ the parser state is updated so that box's content can be grabbed,
+ validated, and suppressed from output upon reaching its end.
+
+ If a box is handled, return True. Otherwise return False to signal
+ that further handling is necessary.
+ """
+ location_xy = location.split(",")[:2]
+ identity = ','.join(location_xy)
+ loc = (int(float(location_xy[0])), int(float(location_xy[1])), )
+
+ if self.pagenum == "1":
+ if is_approximately(loc, (333, 524, )):
+ self.in_header_footer_parts["timesheet_label"] = True
+ return True
+
+ elif is_approximately(loc, (333, 504, )):
+ self.in_header_footer_parts["timesheet_value"] = True
+ return True
+
+ elif is_approximately(loc, (20, 481, )):
+ self.in_header_footer_parts["location_label"] = True
+ return True
+
+ elif is_approximately(loc, (93, 481, )):
+ self.in_header_footer_parts["location_value"] = True
+ return True
+
+ elif is_approximately(loc, (20, 466, )):
+ self.in_header_footer_parts["department_label"] = True
+ return True
+
+ elif is_approximately(loc, (93, 466, )):
+ self.in_header_footer_parts["department_value"] = True
+ return True
+
+ elif is_approximately(loc, (20, 452, )):
+ self.in_header_footer_parts["employee_type_label"] = True
+ return True
+
+ elif is_approximately(loc, (93, 452, )):
+ self.in_header_footer_parts["employee_type_value"] = True
+ return True
+
+ elif is_approximately(loc, (20, 437, )):
+ self.in_header_footer_parts["location_default_label"] = True
+ return True
+
+ elif is_approximately(loc, (93, 437, )):
+ self.in_header_footer_parts["location_default_value"] = True
+ return True
+
+ elif is_approximately(loc, (230, 481, )):
+ self.in_header_footer_parts["function_label"] = True
+ return True
+
+ elif is_approximately(loc, (304, 481, )):
+ self.in_header_footer_parts["function_value"] = True
+ return True
+
+ elif is_approximately(loc, (230, 466, )):
+ self.in_header_footer_parts["exempt_label"] = True
+ return True
+
+ elif is_approximately(loc, (304, 466, )):
+ self.in_header_footer_parts["exempt_value"] = True
+ return True
+
+ elif is_approximately(loc, (230, 452, )):
+ self.in_header_footer_parts["status_label"] = True
+ return True
+
+ elif is_approximately(loc, (304, 452, )):
+ self.in_header_footer_parts["status_value"] = True
+ return True
+
+ elif is_approximately(loc, (230, 437, )):
+ self.in_header_footer_parts["_doc_no_label"] = True
+ return True
+
+ elif is_approximately(loc, (304, 437, )):
+ self.in_header_footer_parts["_doc_no_value"] = True
+ return True
+
+ elif is_approximately(loc, (440, 481, )):
+ self.in_header_footer_parts["post_status_label"] = True
+ return True
+
+ elif is_approximately(loc, (513, 481, )):
+ self.in_header_footer_parts["post_status_value"] = True
+ return True
+
+ elif is_approximately(loc, (440, 466, )):
+ self.in_header_footer_parts["validation_label"] = True
+ return True
+
+ elif is_approximately(loc, (513, 466, )):
+ self.in_header_footer_parts["validation_value"] = True
+ return True
+
+ elif is_approximately(loc, (440, 452, )):
+ self.in_header_footer_parts["datetime_label"] = True
+ return True
+
+ elif is_approximately(loc, (513, 452, )):
+ self.in_header_footer_parts["datetime_value"] = True
+ return True
+
+ elif is_approximately(loc, (651, 481, )):
+ self.in_header_footer_parts["total_timesheet_label"] = True
+ return True
+
+ elif is_approximately(loc, (751, 481, )):
+ self.in_header_footer_parts["total_timesheet_value"] = True
+ return True
+
+ elif is_approximately(loc, (651, 466, )):
+ self.in_header_footer_parts["standard_hours_label"] = True
+ return True
+
+ elif is_approximately(loc, (751, 466, )):
+ self.in_header_footer_parts["standard_hours_value"] = True
+ return True
+
+ elif is_approximately(loc, (651, 452, )):
+ self.in_header_footer_parts["total_billable_label"] = True
+ return True
+
+ elif is_approximately(loc, (751, 452, )):
+ self.in_header_footer_parts["total_billable_value"] = True
+ return True
+
+ elif is_approximately(loc, (651, 437, )):
+ self.in_header_footer_parts["percent_billability_label"] = True
+ return True
+
+ elif is_approximately(loc, (751, 437, )):
+ self.in_header_footer_parts["percent_billability_value"] = True
+ return True
+
+ elif is_approximately(loc, (20, 399, )):
+ self.in_header_footer_parts["id_label"] = True
+ return True
+
+ elif is_approximately(loc, (40, 399, )):
+ self.in_header_footer_parts["time_code_label"] = True
+ return True
+
+ elif is_approximately(loc, (120, 399, )):
+ self.in_header_footer_parts["project_label"] = True
+ return True
+
+ elif is_approximately(loc, (200, 399, )):
+ self.in_header_footer_parts["timetype_label"] = True
+ return True
+
+ elif loc[1]==399 and 370 <= loc[0] <= 730:
+ self.in_header_footer_parts["day_label"] = True
+ return True
+
+ elif is_approximately(loc, (754, 399, )):
+ self.in_header_footer_parts["total_label"] = True
+ return True
+
+ elif is_approximately(loc, (20, 27, )):
+ self.in_header_footer_parts["footer_datetime_value"] = True
+ return True
+
+ elif is_approximately(loc, (120, 27, )):
+ self.in_header_footer_parts["footer_timezone_value"] = True
+ return True
+
+ elif is_approximately(loc, (688, 27, )):
+ self.in_header_footer_parts["footer_pagenum_value"] = True
+ return True
+
+ elif is_approximately(loc, (732, 27, )):
+ self.in_header_footer_parts["footer_pagecount_value"] = True
+ return True
+
+ else:
+ if is_approximately(loc, (333, 524, )):
+ self.in_header_footer_parts["timesheet_label"] = True
+ return True
+
+ elif is_approximately(loc, (333, 504, )):
+ self.in_header_footer_parts["timesheet_value"] = True
+ return True
+
+ elif is_approximately(loc, (20, 479, )):
+ self.in_header_footer_parts["id_label"] = True
+ return True
+
+ elif is_approximately(loc, (40, 479, )):
+ self.in_header_footer_parts["time_code_label"] = True
+ return True
+
+ elif is_approximately(loc, (120, 479, )):
+ self.in_header_footer_parts["project_label"] = True
+ return True
+
+ elif is_approximately(loc, (200, 479, )):
+ self.in_header_footer_parts["timetype_label"] = True
+ return True
+
+ elif loc[1]==479 and 370 <= loc[0] <= 730:
+ self.in_header_footer_parts["day_label"] = True
+ return True
+
+ elif is_approximately(loc, (754, 479, )):
+ self.in_header_footer_parts["total_label"] = True
+ return True
+
+ elif is_approximately(loc, (20, 27, )):
+ self.in_header_footer_parts["footer_datetime_value"] = True
+ return True
+
+ elif is_approximately(loc, (120, 27, )):
+ self.in_header_footer_parts["footer_timezone_value"] = True
+ return True
+
+ elif is_approximately(loc, (688, 27, )):
+ self.in_header_footer_parts["footer_pagenum_value"] = True
+ return True
+
+ elif is_approximately(loc, (732, 27, )):
+ self.in_header_footer_parts["footer_pagecount_value"] = True
+ return True
+
+ return False
+
+ def handle_header_footer_end(self):
+ """Handle header and footer content on a page.
+
+ If a box is handled, return True. Otherwise return False to signal
+ that further handling is necessary.
+ """
+ if self.in_header_footer_parts["_doc_no_label"]:
+ value = self.pop_buffer()
+ self.debug_assert(value, "Doc.No.", label="doc no label")
+ self.in_header_footer_parts["_doc_no_label"] = False
+ return True
+
+ elif self.in_header_footer_parts["_doc_no_value"]:
+ value = self.pop_buffer()
+ self.debug_assert(value, "1", label="doc no")
+ self.in_header_footer_parts["_doc_no_value"] = False
+ return True
+
+ elif self.in_header_footer_parts["timesheet_label"]:
+ value = self.pop_buffer()
+ self.debug_assert(
+ value,
+ "Timesheet\n[109015] Ricottone, Dominic",
+ label="timesheet label",
+ )
+ self.in_header_footer_parts["timesheet_label"] = False
+ return True
+
+ elif self.in_header_footer_parts["timesheet_value"]:
+ self.daterange = self.pop_buffer()
+ self.in_header_footer_parts["timesheet_value"] = False
+ return True
+
+ elif self.in_header_footer_parts["location_label"]:
+ value = self.pop_buffer()
+ self.debug_assert(value, "Location:", label="location label")
+ self.in_header_footer_parts["location_label"] = False
+ return True
+
+ elif self.in_header_footer_parts["location_value"]:
+ value = self.pop_buffer()
+ self.debug_assert(value, "[E01] Fors Marsh Group", label="location")
+ self.in_header_footer_parts["location_value"] = False
+ return True
+
+ elif self.in_header_footer_parts["department_label"]:
+ value = self.pop_buffer()
+ self.debug_assert(value, "Department:", label="department label")
+ self.in_header_footer_parts["department_label"] = False
+ return True
+
+ elif self.in_header_footer_parts["department_value"]:
+ value = self.pop_buffer()
+ self.debug_assert(
+ value,
+ ("[3200] Advanced Analytics", "[3230] Data Management", ),
+ label="department",
+ )
+ self.in_header_footer_parts["department_value"] = False
+ return True
+
+ elif self.in_header_footer_parts["employee_type_label"]:
+ value = self.pop_buffer()
+ self.debug_assert(value, "Employee Type:", label="employee type label")
+ self.in_header_footer_parts["employee_type_label"] = False
+ return True
+
+ elif self.in_header_footer_parts["employee_type_value"]:
+ value = self.pop_buffer()
+ self.debug_assert(value, "[1] Annual Salary", label="employee type")
+ self.in_header_footer_parts["employee_type_value"] = False
+ return True
+
+ elif self.in_header_footer_parts["location_default_label"]:
+ value = self.pop_buffer()
+ self.debug_assert(value, "Location (Default", label="location default label")
+ self.in_header_footer_parts["location_default_label"] = False
+ return True
+
+ elif self.in_header_footer_parts["location_default_value"]:
+ value = self.pop_buffer()
+ self.debug_assert(value, "[LOCAL] Location", label="location default")
+ self.in_header_footer_parts["location_default_value"] = False
+ return True
+
+ elif self.in_header_footer_parts["function_label"]:
+ value = self.pop_buffer()
+ self.debug_assert(value, "Function:", label="function label")
+ self.in_header_footer_parts["function_label"] = False
+ return True
+
+ elif self.in_header_footer_parts["function_value"]:
+ value = self.pop_buffer()
+ self.debug_assert(value, "[1] Full Time", label="function")
+ self.in_header_footer_parts["function_value"] = False
+ return True
+
+ elif self.in_header_footer_parts["exempt_label"]:
+ value = self.pop_buffer()
+ self.debug_assert(value, "Exempt:", label="exempt label")
+ self.in_header_footer_parts["exempt_label"] = False
+ return True
+
+ elif self.in_header_footer_parts["exempt_value"]:
+ value = self.pop_buffer()
+ self.debug_assert(value, "Yes", label="exempt")
+ self.in_header_footer_parts["exempt_value"] = False
+ return True
+
+ elif self.in_header_footer_parts["status_label"]:
+ value = self.pop_buffer()
+ self.debug_assert(value, "Status:", label="status label")
+ self.in_header_footer_parts["status_label"] = False
+ return True
+
+ elif self.in_header_footer_parts["status_value"]:
+ value = self.pop_buffer()
+ self.debug_assert(
+ value,
+ ("Approved", "Closed", "On Hold [Draft]", ),
+ label="status",
+ )
+ self.in_header_footer_parts["status_value"] = False
+ return True
+
+ elif self.in_header_footer_parts["post_status_label"]:
+ value = self.pop_buffer()
+ self.debug_assert(value, "Post Status:", label="post status label")
+ self.in_header_footer_parts["post_status_label"] = False
+ return True
+
+ elif self.in_header_footer_parts["post_status_value"]:
+ value = self.pop_buffer()
+ self.debug_assert(
+ value,
+ ("Posted", "Not posted", ),
+ label="post status",
+ )
+ self.in_header_footer_parts["post_status_value"] = False
+ return True
+
+ elif self.in_header_footer_parts["validation_label"]:
+ value = self.pop_buffer()
+ self.debug_assert(value, "Validation:", label="validation label")
+ self.in_header_footer_parts["validation_label"] = False
+ return True
+
+ elif self.in_header_footer_parts["validation_value"]:
+ value = self.pop_buffer()
+ self.debug_assert(
+ value,
+ ("Passed", "Warnings", ),
+ label="validation",
+ )
+ self.in_header_footer_parts["validation_value"] = False
+ return True
+
+ elif self.in_header_footer_parts["datetime_label"]:
+ value = self.pop_buffer()
+ self.debug_assert(value, "Date/Time:", label="datetime label")
+ self.in_header_footer_parts["datetime_label"] = False
+ return True
+
+ elif self.in_header_footer_parts["datetime_value"]:
+ value = self.pop_buffer()
+ self.in_header_footer_parts["datetime_value"] = False
+ return True
+
+ elif self.in_header_footer_parts["total_timesheet_label"]:
+ value = self.pop_buffer()
+ self.debug_assert(value, "Total Timesheet:", label="total timesheet label")
+ self.in_header_footer_parts["total_timesheet_label"] = False
+ return True
+
+ elif self.in_header_footer_parts["total_timesheet_value"]:
+ value = self.pop_buffer()
+ self.in_header_footer_parts["total_timesheet_value"] = False
+ return True
+
+ elif self.in_header_footer_parts["standard_hours_label"]:
+ value = self.pop_buffer()
+ self.debug_assert(value, "Standard Hours:", label="standard hours label")
+ self.in_header_footer_parts["standard_hours_label"] = False
+ return True
+
+ elif self.in_header_footer_parts["standard_hours_value"]:
+ value = self.pop_buffer()
+ self.in_header_footer_parts["standard_hours_value"] = False
+ return True
+
+ elif self.in_header_footer_parts["total_billable_label"]:
+ value = self.pop_buffer()
+ self.debug_assert(value, "Total Billable:", label="total billable label")
+ self.in_header_footer_parts["total_billable_label"] = False
+ return True
+
+ elif self.in_header_footer_parts["total_billable_value"]:
+ value = self.pop_buffer()
+ self.in_header_footer_parts["total_billable_value"] = False
+ return True
+
+ elif self.in_header_footer_parts["percent_billability_label"]:
+ value = self.pop_buffer()
+ self.debug_assert(value, "Percent Billability:", label="percent billability label")
+ self.in_header_footer_parts["percent_billability_label"] = False
+ return True
+
+ elif self.in_header_footer_parts["percent_billability_value"]:
+ value = self.pop_buffer()
+ self.in_header_footer_parts["percent_billability_value"] = False
+ return True
+
+ elif self.in_header_footer_parts["id_label"]:
+ value = self.pop_buffer()
+ self.debug_assert(value, "ID", label="id label")
+ self.in_header_footer_parts["id_label"] = False
+ return True
+
+ elif self.in_header_footer_parts["time_code_label"]:
+ value = self.pop_buffer()
+ self.debug_assert(value, "Time Code", label="time code label")
+ self.in_header_footer_parts["time_code_label"] = False
+ return True
+
+ elif self.in_header_footer_parts["project_label"]:
+ value = self.pop_buffer()
+ self.debug_assert(value, "Project", label="project label")
+ self.in_header_footer_parts["project_label"] = False
+ return True
+
+ elif self.in_header_footer_parts["timetype_label"]:
+ value = self.pop_buffer()
+ self.debug_assert(value, "TimeType", label="timetype label")
+ self.in_header_footer_parts["timetype_label"] = False
+ return True
+
+ elif self.in_header_footer_parts["day_label"]:
+ value = self.pop_buffer()
+ self.debug_assert(
+ value,
+ ("Mon", "Tue Wed", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun", ),
+ label="day label",
+ )
+ self.in_header_footer_parts["day_label"] = False
+ return True
+
+ elif self.in_header_footer_parts["total_label"]:
+ value = self.pop_buffer()
+ self.in_header_footer_parts["total_label"] = False
+ return True
+
+ elif self.in_header_footer_parts["footer_datetime_value"]:
+ value = self.pop_buffer()
+ self.in_header_footer_parts["footer_datetime_value"] = False
+ return True
+
+ elif self.in_header_footer_parts["footer_timezone_value"]:
+ value = self.pop_buffer()
+ self.in_header_footer_parts["footer_timezone_value"] = False
+ return True
+
+ elif self.in_header_footer_parts["footer_pagenum_value"]:
+ value = self.pop_buffer()
+ self.in_header_footer_parts["footer_pagenum_value"] = False
+ return True
+
+ elif self.in_header_footer_parts["footer_pagecount_value"]:
+ value = self.pop_buffer()
+ self.in_header_footer_parts["footer_pagecount_value"] = False
+ return True
+
+ return False
+
+def parse(filename_in, filename_out):
+ """Main routine. Reads an XML file and writes a CSV file."""
+ parser = make_parser()
+ handler = TimeSheetHandler()
+
+ parser.setContentHandler(handler)
+ parser.parse(filename_in)
+
+ with open(filename_out, "w", newline="") as f:
+ writer = csv.writer(f)
+ for page in handler.page_buffer:
+ for line in page:
+ writer.writerow(line)
+