From ae939a2885fa86f68a1456565d379ca233ec3b19 Mon Sep 17 00:00:00 2001 From: Dominic Ricottone Date: Mon, 2 May 2022 15:23:23 -0500 Subject: [PATCH] Goodbye HTML, hello XML Replaced HTML exporting/parsing with XML exporting/parsing. Also replaced the 'high-level' function call with 'low-level' pdfminer usage. The XML parser handled validation and suppression of header/footer content on its own. From the PDF parser, XML is dumped to a file. From the XML parser, CSV is dumped to a file. The new timesheet parser should read in that CSV file. --- main.py | 13 +- parse.py | 38 --- parser/html.py | 118 --------- parser/pdf.py | 54 ++-- parser/xml.py | 695 +++++++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 726 insertions(+), 192 deletions(-) delete mode 100644 parse.py delete mode 100644 parser/html.py create mode 100644 parser/xml.py diff --git a/main.py b/main.py index e029c1c..eb3c0f6 100644 --- a/main.py +++ b/main.py @@ -3,12 +3,21 @@ import sys import pathlib -import parse +from parser.xml import parse as parse_xml +from parser.pdf import parse as parse_pdf +from parser.timesheet import TimeSheet def main(filelist): print(f"processing {len(filelist)} files") for filename in (filelist): - parse.timesheet(filename) + xml_filename = filename.parent.joinpath(filename.name + ".xml") + csv_filename = filename.parent.joinpath(filename.name + ".csv") + + parse_pdf(filename, xml_filename) + parse_xml(xml_filename, csv_filename) + + #timesheet = TimeSheet(semistructured_data) + #timesheet.report_issues() if __name__ == "__main__": filelist = [] diff --git a/parse.py b/parse.py deleted file mode 100644 index 11b3120..0000000 --- a/parse.py +++ /dev/null @@ -1,38 +0,0 @@ -#!/usr/bin/env python3 - -"""The parsers are developed, debugged, and refactored in this file. - -When they mature, I refactor them into standalone modules under the `parser` -directory. 
- -Eventually, the entire parse step will mature and be abstracted into a -single function call, which will be appropriate to call in `main.py`. - -If you can see this file, then I'm not done yet. -""" - -from pprint import pprint - -from parser.html import parse as parse_html -from parser.pdf import parse as parse_pdf -from parser.timesheet import TimeSheet - -def read_timesheet(filename): - unstructured_data = parse_pdf(filename) - semistructured_data = parse_html(unstructured_data) - return semistructured_data - -def parse_timesheet(data): - t = TimeSheet(data) - t.report_issues() - return [] - -def extract_projects(structured_data): - return [] - -def timesheet(filename): - semistructured_data = read_timesheet(filename) - structured_data = parse_timesheet(semistructured_data) - projects = extract_projects(structured_data) - return projects - diff --git a/parser/html.py b/parser/html.py deleted file mode 100644 index 1aa91dc..0000000 --- a/parser/html.py +++ /dev/null @@ -1,118 +0,0 @@ -#!/usr/bin/env python3 - -# Crash Course on html.parser -# -# A SAX-style parser. Hook into tags and data like... -# -# ``` -# from html.parser import HTMLParser -# class MyHTMLParser(HTMLParser): -# def handle_starttag(self, tag, attrs): -# #do something... -# def handle_endtag(self, tag): -# #do something... -# def handle_data(self, data): -# #do something... -# ``` -# -# Valid HTML is fed into the parser like... -# -# ``` -# parser = MyHTMLParser() -# parser.feed(html) -# ``` - -from html.parser import HTMLParser - -def parse_attrs_string(attrs): - """Parse a string structures like `key1:value1;key2:value2;`. - - Embedded CSS (as in `style` attributes) can look like this. 
- """ - attrs_dict = {} - for pair in attrs.split(";"): - if len(pair.strip()) == 0: - continue - key, value = pair.split(":") - - key, value = key.strip(), value.strip() - attrs_dict[key] = value - return attrs_dict - -def parse_attrs_doubles(attrs): - """Parse a dictionary of HTML/CSS attributes from a series of doubles. - - The built-in Python HTML parser (`html.parser.HTMLParser`) hands attributes - to the `handle_starttag` hook like this. - """ - attrs_dict = {} - for pair in attrs: - key, value = pair - if key == "style": - value = parse_attrs_string(value) - attrs_dict[key] = value - return attrs_dict - -def has_style_left(attrs): - return "style" in attrs.keys() and "left" in attrs["style"].keys() - -def has_style_top(attrs): - return "style" in attrs.keys() and "top" in attrs["style"].keys() - -class TimesheetHTMLParser(HTMLParser): - """A specialization of the `html.parser.HTMLParser` class to handle my - timesheets. - - Data is stored internally and can be dumped with the `dump` method. - - Don't forget to close the parser instance! 
- """ - def __init__(self): - HTMLParser.__init__(self) - self._top = 0 - self._left = 0 - self._in_div = False - self._in_span = False - self._data = [] - - def handle_starttag(self, tag, _attrs): - attrs = parse_attrs_doubles(_attrs) - if self._in_div: - if tag == "span": - self._in_span = True - else: - self._in_div = False - elif tag == "div": - if has_style_left(attrs): - self._left = int(attrs["style"]["left"].removesuffix("px")) - self._in_div = True - if has_style_top(attrs): - self._top = int(attrs["style"]["top"].removesuffix("px")) - self._in_div = True - else: - self._in_span = False - self._in_div = False - - def handle_endtag(self, tag): - self._in_span = False - self._in_div = False - - def handle_data(self, data): - if self._in_span: - self._data.append((data.splitlines()[0], self._left, self._top)) - self._in_span = False - self._in_div = False - - def dump(self): - return self._data - -def parse(html): - """Read an HTML-encoded string into semi-structured data.""" - parser = TimesheetHTMLParser() - try: - parser.feed(html) - data = parser.dump() - finally: - parser.close() - return data - diff --git a/parser/pdf.py b/parser/pdf.py index 67aa2bf..4941cdd 100644 --- a/parser/pdf.py +++ b/parser/pdf.py @@ -2,41 +2,27 @@ from io import StringIO -# Crash Course on pdfminer -# -# Extract text from PDFs like... -# -# ``` -# from pdfminer.high_level import extract_text -# with open(filename, "rb") as f: -# text = extract_text(f) -# ``` -# -# The alternative is to use something like... 
from pdfminer.converter import XMLConverter
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.layout import LAParams

def parse(filename_in, filename_out):
    """Main routine. Reads a PDF file and writes an XML file.

    Every page of `filename_in` is rendered through pdfminer's
    `XMLConverter` into an in-memory buffer. The buffer is then written to
    `filename_out` without its first line, followed by one extra newline.

    Parameters:
        filename_in: path of the PDF file to read (opened in binary mode).
        filename_out: path of the XML file to create or overwrite.
    """
    manager = PDFResourceManager(caching=False)

    with StringIO() as buffer:
        converter = XMLConverter(manager, buffer, laparams=LAParams(), codec=None)
        interpreter = PDFPageInterpreter(manager, converter)

        with open(filename_in, "rb") as f:
            for page in PDFPage.get_pages(f, caching=False):
                interpreter.process_page(page)

        # Closing the converter is what emits the closing `</pages>` tag;
        # without it the dumped document is not well-formed XML.
        converter.close()
        xml_lines = buffer.getvalue().splitlines()

    # The first line is dropped, exactly as before (presumably it is the
    # `<?xml ...?>` declaration written by the converter -- TODO confirm).
    # Without a declaration an XML parser assumes UTF-8, so write UTF-8
    # explicitly instead of relying on the locale's default encoding.
    with open(filename_out, "w", encoding="utf-8") as f:
        f.writelines(line + "\n" for line in xml_lines[1:])
        f.write("\n")
#!/usr/bin/env python3
"""SAX-based reader for the XML dump of a timesheet (parser/xml.py).

Header and footer boxes are recognised by their position on the page,
validated against the text they are expected to hold, and suppressed. The
remaining text boxes are written out as CSV rows of `x, y, text`.
"""

import csv
import sys
from xml.sax import handler, make_parser

# Horizontal slack allowed when matching a textbox against a known position.
_X_TOLERANCE = 5


def printf(string, *variables):
    """Print to STDERR with formatting."""
    sys.stderr.write(string.format(*variables))
    sys.stderr.write("\n")


def is_approximately(location, target, tolerance=5):
    """Tests if a location is close enough to a target to be considered equal.

    PDFs store the rendered location of a textbox, not the mathematically-
    ideal location. The net effect is that, while you can rely on the y
    dimension to identify a row, you cannot rely on the x dimension to
    identify a column. Equivalence is therefore fuzzy in x only: the y
    values must be equal and the x values may differ by up to `tolerance`
    (5 by default).

    Parameters:
        location: `(x, y)` pair to test.
        target: `(x, y)` pair to compare against.
        tolerance: maximum allowed difference in x, inclusive.

    Returns:
        True if `location` matches `target`, otherwise False.
    """
    return (location[1] == target[1]
            and target[0] - tolerance <= location[0] <= target[0] + tolerance)


def _rule(part, x, y, tolerance=_X_TOLERANCE):
    """Build one location rule: `part` is found at (x +/- tolerance, y)."""
    return (part, (x, y), tolerance)


def _column_header_rules(y):
    """Location rules for the table's column headers printed on row `y`."""
    return (
        _rule("id_label", 20, y),
        _rule("time_code_label", 40, y),
        _rule("project_label", 120, y),
        _rule("timetype_label", 200, y),
        # The day columns span x = 370..730, i.e. 550 +/- 180.
        _rule("day_label", 550, y, tolerance=180),
        _rule("total_label", 754, y),
    )


# Title boxes, matched on every page.
_TITLE_RULES = (
    _rule("timesheet_label", 333, 524),
    _rule("timesheet_value", 333, 504),
)

# Summary block, matched on the first page only.
_SUMMARY_RULES = (
    _rule("location_label", 20, 481),
    _rule("location_value", 93, 481),
    _rule("department_label", 20, 466),
    _rule("department_value", 93, 466),
    _rule("employee_type_label", 20, 452),
    _rule("employee_type_value", 93, 452),
    _rule("location_default_label", 20, 437),
    _rule("location_default_value", 93, 437),
    _rule("function_label", 230, 481),
    _rule("function_value", 304, 481),
    _rule("exempt_label", 230, 466),
    _rule("exempt_value", 304, 466),
    _rule("status_label", 230, 452),
    _rule("status_value", 304, 452),
    _rule("_doc_no_label", 230, 437),
    _rule("_doc_no_value", 304, 437),
    _rule("post_status_label", 440, 481),
    _rule("post_status_value", 513, 481),
    _rule("validation_label", 440, 466),
    _rule("validation_value", 513, 466),
    _rule("datetime_label", 440, 452),
    _rule("datetime_value", 513, 452),
    _rule("total_timesheet_label", 651, 481),
    _rule("total_timesheet_value", 751, 481),
    _rule("standard_hours_label", 651, 466),
    _rule("standard_hours_value", 751, 466),
    _rule("total_billable_label", 651, 452),
    _rule("total_billable_value", 751, 452),
    _rule("percent_billability_label", 651, 437),
    _rule("percent_billability_value", 751, 437),
)

# Footer boxes, matched on every page.
_FOOTER_RULES = (
    _rule("footer_datetime_value", 20, 27),
    _rule("footer_timezone_value", 120, 27),
    _rule("footer_pagenum_value", 688, 27),
    _rule("footer_pagecount_value", 732, 27),
)

# Rules are tried in order; the column headers sit on row 399 of the first
# page and on row 479 of every other page.
_FIRST_PAGE_RULES = (
    _TITLE_RULES + _SUMMARY_RULES + _column_header_rules(399) + _FOOTER_RULES
)
_OTHER_PAGE_RULES = _TITLE_RULES + _column_header_rules(479) + _FOOTER_RULES

# (part, expected text, label used in diagnostics), in the order in which
# pending parts are resolved. `expected` is a string, a tuple of accepted
# alternatives, or None when the text is consumed without validation.
_HEADER_FOOTER_CHECKS = (
    ("_doc_no_label", "Doc.No.", "doc no label"),
    ("_doc_no_value", "1", "doc no"),
    ("timesheet_label", "Timesheet\n[109015] Ricottone, Dominic", "timesheet label"),
    # Not validated: its text is stored as the handler's date range.
    ("timesheet_value", None, ""),
    ("location_label", "Location:", "location label"),
    ("location_value", "[E01] Fors Marsh Group", "location"),
    ("department_label", "Department:", "department label"),
    (
        "department_value",
        ("[3200] Advanced Analytics", "[3230] Data Management", ),
        "department",
    ),
    ("employee_type_label", "Employee Type:", "employee type label"),
    ("employee_type_value", "[1] Annual Salary", "employee type"),
    ("location_default_label", "Location (Default", "location default label"),
    ("location_default_value", "[LOCAL] Location", "location default"),
    ("function_label", "Function:", "function label"),
    ("function_value", "[1] Full Time", "function"),
    ("exempt_label", "Exempt:", "exempt label"),
    ("exempt_value", "Yes", "exempt"),
    ("status_label", "Status:", "status label"),
    ("status_value", ("Approved", "Closed", "On Hold [Draft]", ), "status"),
    ("post_status_label", "Post Status:", "post status label"),
    ("post_status_value", ("Posted", "Not posted", ), "post status"),
    ("validation_label", "Validation:", "validation label"),
    ("validation_value", ("Passed", "Warnings", ), "validation"),
    ("datetime_label", "Date/Time:", "datetime label"),
    ("datetime_value", None, ""),
    ("total_timesheet_label", "Total Timesheet:", "total timesheet label"),
    ("total_timesheet_value", None, ""),
    ("standard_hours_label", "Standard Hours:", "standard hours label"),
    ("standard_hours_value", None, ""),
    ("total_billable_label", "Total Billable:", "total billable label"),
    ("total_billable_value", None, ""),
    ("percent_billability_label", "Percent Billability:", "percent billability label"),
    ("percent_billability_value", None, ""),
    ("id_label", "ID", "id label"),
    ("time_code_label", "Time Code", "time code label"),
    ("project_label", "Project", "project label"),
    ("timetype_label", "TimeType", "timetype label"),
    (
        "day_label",
        ("Mon", "Tue Wed", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun", ),
        "day label",
    ),
    ("total_label", None, ""),
    ("footer_datetime_value", None, ""),
    ("footer_timezone_value", None, ""),
    ("footer_pagenum_value", None, ""),
    ("footer_pagecount_value", None, ""),
)


class TimeSheetHandler(handler.ContentHandler):
    """SAX content handler that collects the body text of a timesheet.

    After parsing, `page_buffer` holds one list per page; each entry is a
    `[x, y, text]` list of strings, sorted top-to-bottom and then
    left-to-right. `daterange` holds the text found in the title's value box.
    """

    def __init__(self):
        super().__init__()

        self.text_buffer = ""
        self.line_buffer = []
        self.page_buffer = []

        self.pagenum = None
        self.daterange = None

        self.in_textbox = False
        self.in_text = False
        self.in_figure = False
        self.in_hours_distribution = False

        # One flag per known header/footer box; a flag is raised when the
        # box opens and lowered again when the box closes.
        self.in_header_footer_parts = {
            part: False for part, _, _ in _HEADER_FOOTER_CHECKS
        }

    def startElement(self, name, attrs):
        """Track page/figure/textbox/text nesting as elements open."""
        if name == "page":
            self.in_figure = False
            self.pagenum = attrs["id"]
        elif name == "figure":
            # Stays set until the next page starts.
            self.in_figure = True
        elif not self.in_figure and not self.in_hours_distribution:
            if name == "textbox":
                handled = self.handle_header_footer_start(attrs["bbox"])
                if not handled:
                    self.record_location(attrs["bbox"])
                self.in_textbox = True
            elif self.in_textbox and name == "text":
                self.in_text = True

    def characters(self, data):
        """Accumulate character data found inside a `text` element."""
        if self.in_text:
            self.append_buffer(data)

    def endElement(self, name):
        """Flush buffered text when a textbox closes; sort when a page closes."""
        if name == "page":
            self.sort_lines()
        elif not self.in_figure and not self.in_hours_distribution:
            if name == "textbox":
                handled = self.handle_header_footer_end()
                if not handled:
                    text = self.pop_buffer()
                    if text == "Hours Distribution by Time Code":
                        # Everything from this box onwards is suppressed.
                        self.in_hours_distribution = True
                        # The location recorded when this box opened never
                        # receives its text; drop it so no two-field row
                        # reaches the output.
                        if self.line_buffer and len(self.line_buffer[-1]) == 2:
                            self.line_buffer.pop()
                    else:
                        self.record_text(text)
                self.in_textbox = False
            elif name == "text":
                self.in_text = False

    def sort_lines(self):
        """Helper function to perform page-level cleaning on line-level data.
        Manipulates the page buffer and clears the line buffer.
        """
        sorted_lines = sorted(
            self.line_buffer,
            # Descending y (top of the page first), then ascending x.
            key=lambda x: (-float(x[1]), float(x[0])),
        )
        self.line_buffer = []
        self.page_buffer.append(sorted_lines)

    def record_location(self, data):
        """Helper function to append new location data to the line buffer."""
        self.line_buffer.append(data.split(",")[:2])

    def record_text(self, data):
        """Helper function to append new text data to the line buffer."""
        self.line_buffer[-1].append(data)

    def append_buffer(self, data):
        """Helper function to append new character data to the text buffer."""
        self.text_buffer += data

    def pop_buffer(self):
        """Helper function to grab aggregated character data and reset the
        text buffer. The returned text is stripped of surrounding whitespace.
        """
        data = self.text_buffer
        self.text_buffer = ""
        return data.strip()

    def debug_assert(self, value, should_be, label=""):
        """Helper function to manage validation logic and conditional print
        statements.

        Nothing is raised. When `value` does not equal `should_be` (or, if
        `should_be` is a tuple, is not one of its members), a diagnostic
        prefixed with the date range, page number and `label` is printed to
        STDERR.
        """
        prefix = f"{self.daterange}:{self.pagenum}:"
        if label:
            prefix += f"{label}:"

        if isinstance(should_be, tuple):
            if value in should_be:
                return
            expected = "one of " + ", ".join(should_be)
        elif value == should_be:
            return
        else:
            expected = should_be

        # The prefix is passed as an argument rather than spliced into the
        # template, so braces in extracted text cannot break the formatting.
        printf("{0}should be {1}, is {2}", prefix, expected, value)

    def handle_header_footer_start(self, location):
        """Handle header and footer content on a page.

        `location` is the textbox's `bbox` attribute; only its first two
        comma-separated numbers are used, truncated to integers. The first
        page is matched against a different set of positions than every
        other page.

        If a header or footer box is identified (based on its location),
        the parser state is updated so that box's content can be grabbed,
        validated, and suppressed from output upon reaching its end.

        If a box is handled, return True. Otherwise return False to signal
        that further handling is necessary.
        """
        location_xy = location.split(",")[:2]
        loc = (int(float(location_xy[0])), int(float(location_xy[1])), )

        rules = _FIRST_PAGE_RULES if self.pagenum == "1" else _OTHER_PAGE_RULES
        for part, target, tolerance in rules:
            if is_approximately(loc, target, tolerance):
                self.in_header_footer_parts[part] = True
                return True

        return False

    def handle_header_footer_end(self):
        """Handle header and footer content on a page.

        The first raised flag (in table order) is resolved: the buffered
        text is consumed, validated where an expected value is known, and
        the flag is lowered. The title's value box is stored as `daterange`
        instead of being validated.

        If a box is handled, return True. Otherwise return False to signal
        that further handling is necessary.
        """
        for part, expected, label in _HEADER_FOOTER_CHECKS:
            if not self.in_header_footer_parts[part]:
                continue

            value = self.pop_buffer()
            if part == "timesheet_value":
                self.daterange = value
            elif expected is not None:
                self.debug_assert(value, expected, label=label)

            self.in_header_footer_parts[part] = False
            return True

        return False


def parse(filename_in, filename_out):
    """Main routine. Reads an XML file and writes a CSV file.

    Parameters:
        filename_in: path of the XML file to read.
        filename_out: path of the CSV file to create or overwrite.
    """
    sax_parser = make_parser()
    # Named so that it does not shadow the imported `handler` module.
    timesheet_handler = TimeSheetHandler()

    sax_parser.setContentHandler(timesheet_handler)
    sax_parser.parse(filename_in)

    # Explicit encoding keeps the output independent of the locale.
    with open(filename_out, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        for page in timesheet_handler.page_buffer:
            writer.writerows(page)