~dricottone/fmg-timesheets

ae939a2885fa86f68a1456565d379ca233ec3b19 — Dominic Ricottone 2 years ago e4ae39d
Goodbye HTML, hello XML

Replaced HTML exporting/parsing with XML exporting/parsing. Also
replaced the 'high-level' function call with 'low-level' pdfminer
usage.

The XML parser handles validation and suppression of header/footer
content on its own.

From the PDF parser, XML is dumped to a file. From the XML parser, CSV
is dumped to a file. The new timesheet parser should read in that CSV
file.
5 files changed, 726 insertions(+), 192 deletions(-)

M main.py
D parse.py
D parser/html.py
M parser/pdf.py
A parser/xml.py
M main.py => main.py +11 -2
@@ 3,12 3,21 @@
import sys
import pathlib

import parse
from parser.xml import parse as parse_xml
from parser.pdf import parse as parse_pdf
from parser.timesheet import TimeSheet

def main(filelist):
    print(f"processing {len(filelist)} files")
    for filename in (filelist):
        parse.timesheet(filename)
        xml_filename = filename.parent.joinpath(filename.name + ".xml")
        csv_filename = filename.parent.joinpath(filename.name + ".csv")

        parse_pdf(filename, xml_filename)
        parse_xml(xml_filename, csv_filename)

        #timesheet = TimeSheet(semistructured_data)
        #timesheet.report_issues()

if __name__ == "__main__":
    filelist = []

D parse.py => parse.py +0 -38
@@ 1,38 0,0 @@
#!/usr/bin/env python3

"""The parsers are developed, debugged, and refactored in this file.

When they mature, I refactor them into standalone modules under the `parser`
directory.

Eventually, the entire parse step will mature and be abstracted into a
single function call, which will be appropriate to call in `main.py`.

If you can see this file, then I'm not done yet.
"""

from pprint import pprint

from parser.html import parse as parse_html
from parser.pdf import parse as parse_pdf
from parser.timesheet import TimeSheet

def read_timesheet(filename):
    """Run *filename* through the PDF parser, then the HTML parser.

    Returns whatever `parse_html` produces from the output of `parse_pdf`.
    """
    return parse_html(parse_pdf(filename))

def parse_timesheet(data):
    """Build a `TimeSheet` from *data* and have it report its issues.

    Always returns a new empty list.
    """
    sheet = TimeSheet(data)
    sheet.report_issues()
    return list()

def extract_projects(structured_data):
    """Placeholder: ignores *structured_data* and returns a new empty list."""
    return list()

def timesheet(filename):
    """Chain the three parse steps for one file and return the projects.

    The file is read with `read_timesheet`, the result is passed through
    `parse_timesheet`, and that result is passed to `extract_projects`.
    """
    return extract_projects(parse_timesheet(read_timesheet(filename)))


D parser/html.py => parser/html.py +0 -118
@@ 1,118 0,0 @@
#!/usr/bin/env python3

# Crash Course on html.parser
#
# A SAX-style parser. Hook into tags and data like...
#
# ```
# from html.parser import HTMLParser
# class MyHTMLParser(HTMLParser):
#    def handle_starttag(self, tag, attrs):
#        #do something...
#    def handle_endtag(self, tag):
#        #do something...
#    def handle_data(self, data):
#        #do something...
# ```
#
# Valid HTML is fed into the parser like...
#
# ```
# parser = MyHTMLParser()
# parser.feed(html)
# ```

from html.parser import HTMLParser

def parse_attrs_string(attrs):
    """Parse a string structured like `key1:value1;key2:value2;`.

    Embedded CSS (as in `style` attributes) can look like this.

    Each `;`-separated declaration is split on its first colon only, so a
    value may itself contain colons (e.g. `background:url(http://x/y.png)`).
    Blank declarations are skipped. Keys and values are stripped of
    surrounding whitespace. Returns a dictionary mapping key to value; a
    repeated key keeps its last value.

    Raises `ValueError` if a non-blank declaration contains no colon.
    """
    attrs_dict = {}
    for pair in attrs.split(";"):
        if not pair.strip():
            continue
        # Split once: everything after the first colon belongs to the value.
        key, separator, value = pair.partition(":")
        if not separator:
            raise ValueError(f"malformed declaration: {pair!r}")
        attrs_dict[key.strip()] = value.strip()
    return attrs_dict

def parse_attrs_doubles(attrs):
    """Collapse a series of `(name, value)` doubles into a dictionary.

    This is the shape in which `html.parser.HTMLParser` hands attributes to
    the `handle_starttag` hook. The value of a `style` attribute is expanded
    into its own dictionary with `parse_attrs_string`; every other value is
    stored unchanged. A repeated name keeps its last value.
    """
    parsed = {}
    for name, content in attrs:
        parsed[name] = parse_attrs_string(content) if name == "style" else content
    return parsed

def has_style_left(attrs):
    """Tell whether *attrs* has a `style` entry that defines `left`."""
    if "style" not in attrs:
        return False
    return "left" in attrs["style"].keys()

def has_style_top(attrs):
    """Tell whether *attrs* has a `style` entry that defines `top`."""
    if "style" not in attrs:
        return False
    return "top" in attrs["style"].keys()

class TimesheetHTMLParser(HTMLParser):
    """A specialization of the `html.parser.HTMLParser` class to handle my
    timesheets.

    Data is stored internally and can be dumped with the `dump` method. Each
    stored record is a `(text, left, top)` tuple: the first line of a span's
    text together with the most recently seen `left` and `top` offsets.

    Don't forget to close the parser instance!
    """
    def __init__(self):
        HTMLParser.__init__(self)
        # Offsets taken from the `style` of the last positioned div.
        self._top = 0
        self._left = 0
        # State flags: inside a positioned div / inside a span of that div.
        self._in_div = False
        self._in_span = False
        # Collected `(text, left, top)` records.
        self._data = []

    def handle_starttag(self, tag, _attrs):
        """Track entry into a positioned div and into a span inside it."""
        attrs = parse_attrs_doubles(_attrs)
        if self._in_div:
            # Only a span keeps the div context; any other tag drops it.
            if tag == "span":
                self._in_span = True
            else:
                self._in_div = False
        elif tag == "div":
            # A div counts as positioned if its style gives `left` or `top`;
            # the offsets are stored as integers with the "px" suffix removed.
            if has_style_left(attrs):
                self._left = int(attrs["style"]["left"].removesuffix("px"))
                self._in_div = True
            if has_style_top(attrs):
                self._top = int(attrs["style"]["top"].removesuffix("px"))
                self._in_div = True
        else:
            self._in_span = False
            self._in_div = False

    def handle_endtag(self, tag):
        """Clear both state flags on any end tag."""
        self._in_span = False
        self._in_div = False

    def handle_data(self, data):
        """Record the first line of *data* when inside a span, then clear
        both state flags.
        """
        if self._in_span:
            self._data.append((data.splitlines()[0], self._left, self._top))
        self._in_span = False
        self._in_div = False

    def dump(self):
        """Return the list of collected `(text, left, top)` records."""
        return self._data

def parse(html):
    """Feed an HTML-encoded string to a `TimesheetHTMLParser` and return the
    semi-structured data it collected.

    The parser is closed whether or not feeding succeeds.
    """
    html_parser = TimesheetHTMLParser()
    try:
        html_parser.feed(html)
        return html_parser.dump()
    finally:
        html_parser.close()


M parser/pdf.py => parser/pdf.py +20 -34
@@ 2,41 2,27 @@

from io import StringIO

# Crash Course on pdfminer
#
# Extract text from PDFs like...
#
# ```
# from pdfminer.high_level import extract_text
# with open(filename, "rb") as f:
#   text = extract_text(f)
# ```
#
# The alternative is to use something like...
#
# ```
# from io import StringIO
# from pdfminer.high_level import extract_text_to_fp
# from pdfminer.layout import LAParams
# buffer = StringIO()
# with open(filename, "rb") as f:
#   extract_text_to_fp(f, buffer, laparams=LAParams(), output_type="html", codec=None)
# html = buffer.getvalue()
# ```

from pdfminer.high_level import extract_text_to_fp
from pdfminer.converter import XMLConverter
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.layout import LAParams

def parse(filename):
    """Read a binary PDF-encoded file and convert it into an HTML-encoded
    string.
    """
def parse(filename_in, filename_out):
    """Main routine. Reads a PDF file and writes an XML file."""
    buffer = StringIO()
    try:
        with open(filename, "rb") as f:
            extract_text_to_fp(f, buffer, laparams=LAParams(), output_type="html", codec=None)
        data = buffer.getvalue()
    finally:
        buffer.close()
    return data
    manager = PDFResourceManager(caching=False)
    converter = XMLConverter(manager, buffer, laparams=LAParams(), codec=None)
    interpreter = PDFPageInterpreter(manager, converter)

    with open(filename_in, "rb") as f:
        for page in PDFPage.get_pages(f, caching=False):
            interpreter.process_page(page)

    with open(filename_out, "w") as f:
        first = True
        for line in buffer.getvalue().splitlines():
            if not first:
                f.write(line+"\n")
            first = False
        f.write("</pages>\n")


A parser/xml.py => parser/xml.py +695 -0
@@ 0,0 1,695 @@
#!/usr/bin/env python3

import sys
from xml.sax import handler, make_parser
import csv

def printf(string, *variables):
    """Format *string* with *variables* and emit it as one line on STDERR."""
    print(string.format(*variables), file=sys.stderr)

def is_approximately(location, target, tolerance=5):
    """Tests if a location is close enough to a target to be considered equal.

    PDFs store the rendered location of a textbox, not the mathematically-
    ideal location. The net effect is that, while you can rely on the y
    dimension to identify a row, you cannot rely on the x dimension to
    identify a column. My solution is to make equivalence a bit fuzzy, to the
    effect of +/- `tolerance` pixels (5 by default).

    Both `location` and `target` are indexable as `(x, y)`. The y values must
    be exactly equal; the x values may differ by up to `tolerance`, inclusive.
    Returns True or False.
    """
    if location[1] != target[1]:
        return False
    return target[0] - tolerance <= location[0] <= target[0] + tolerance

class TimeSheetHandler(handler.ContentHandler):
    def __init__(self):
        handler.ContentHandler.__init__(self)

        self.text_buffer = ""
        self.line_buffer = []
        self.page_buffer = []

        self.pagenum = None
        self.daterange = None

        self.in_textbox = False
        self.in_text = False
        self.in_figure = False
        self.in_hours_distribution = False

        self.in_header_footer_parts = {
            "timesheet_label": False,
            "timesheet_value": False,
            "location_label": False,
            "location_value": False,
            "department_label": False,
            "department_value": False,
            "employee_type_label": False,
            "employee_type_value": False,
            "location_default_label": False,
            "location_default_value": False,
            "function_label": False,
            "function_value": False,
            "exempt_label": False,
            "exempt_value": False,
            "status_label": False,
            "status_value": False,
            "post_status_label": False,
            "post_status_value": False,
            "validation_label": False,
            "validation_value": False,
            "datetime_label": False,
            "datetime_value": False,
            "total_timesheet_label": False,
            "total_timesheet_value": False,
            "standard_hours_label": False,
            "standard_hours_value": False,
            "total_billable_label": False,
            "total_billable_value": False,
            "percent_billability_label": False,
            "percent_billability_value": False,
            "id_label": False,
            "time_code_label": False,
            "project_label": False,
            "timetype_label": False,
            "day_label": False,
            "total_label": False,
            "_doc_no_label": False,
            "_doc_no_value": False,
            "footer_datetime_value": False,
            "footer_timezone_value": False,
            "footer_pagenum_value": False,
            "footer_pagecount_value": False,
        }

    def startElement(self, name, attrs):
        if name=="page":
            self.in_figure = False
            self.pagenum = attrs["id"]
        elif name=="figure":
            self.in_figure = True
        elif not self.in_figure and not self.in_hours_distribution:
            if name=="textbox":
                handled = self.handle_header_footer_start(attrs["bbox"])
                if not handled:
                    self.record_location(attrs["bbox"])
                self.in_textbox = True
            elif self.in_textbox and name=="text":
                self.in_text = True

    def characters(self, data):
        if self.in_text:
            self.append_buffer(data)

    def endElement(self, name):
        if name=="page":
            self.sort_lines()
        elif not self.in_figure and not self.in_hours_distribution:
            if name=="textbox":
                handled = self.handle_header_footer_end()
                if not handled:
                    text = self.pop_buffer()
                    if text=="Hours Distribution by Time Code":
                        self.in_hours_distribution = True
                    else:
                        self.record_text(text)
                self.in_textbox = False
            elif name=="text":
                self.in_text = False

    def sort_lines(self):
        """Helper function to perform page-level cleaning on line-level data.
        Manipulates the page buffer and clears the line buffer.
        """
        sorted_lines = sorted(
            self.line_buffer,
            key=lambda x: (-float(x[1]), float(x[0])),
        )
        self.line_buffer = []
        self.page_buffer.append(sorted_lines)

    def record_location(self, data):
        """Helper function to append new location data to the line buffer."""
        self.line_buffer.append(data.split(",")[:2])

    def record_text(self, data):
        """Helper function to append new text data to the line buffer."""
        self.line_buffer[-1].append(data)

    def append_buffer(self, data):
        """Helper function to append new character data to the text buffer."""
        self.text_buffer += data

    def pop_buffer(self):
        """Helper function to grab aggregated character data and reset the
        text buffer.
        """
        data = self.text_buffer
        self.text_buffer = ""
        return data.strip()

    def debug_assert(self, value, should_be, label=""):
        """Helper function to manage validation logic and conditional print
        statements.
        """
        if label:
            printf_label = f"{self.daterange}:{self.pagenum}:{label}:"
        else:
            printf_label = f"{self.daterange}:{self.pagenum}:"

        if isinstance(should_be, tuple):
            if value not in should_be:
                printf(
                    printf_label + "should be {0}, is {1}",
                    "one of " + ", ".join(should_be),
                    value,
                )
        elif value != should_be:
            printf(printf_label + "should be {0}, is {1}", should_be, value)

    def handle_header_footer_start(self, location):
        """Handle header and footer content on a page.

        `location` is a comma-separated bounding box string; only its first
        two fields (x and y) are used, each truncated to an integer.

        There is some branching based on whether the current page is the first
        page of a document: the first page carries an extra block of summary
        fields, and its column-label row sits at a different height.

        If a header or footer box is identified (based on its location),
        the parser state is updated so that box's content can be grabbed,
        validated, and suppressed from output upon reaching its end.

        If a box is handled, return True. Otherwise return False to signal
        that further handling is necessary.
        """
        fields = location.split(",")
        loc = (int(float(fields[0])), int(float(fields[1])), )

        first_page = self.pagenum == "1"
        # Height of the column-label row ("ID", "Time Code", ...).
        label_row = 399 if first_page else 479

        # Title block, present on every page.
        targets = [
            ("timesheet_label", (333, 524, )),
            ("timesheet_value", (333, 504, )),
        ]
        if first_page:
            # Summary fields, only printed on the first page.
            targets += [
                ("location_label", (20, 481, )),
                ("location_value", (93, 481, )),
                ("department_label", (20, 466, )),
                ("department_value", (93, 466, )),
                ("employee_type_label", (20, 452, )),
                ("employee_type_value", (93, 452, )),
                ("location_default_label", (20, 437, )),
                ("location_default_value", (93, 437, )),
                ("function_label", (230, 481, )),
                ("function_value", (304, 481, )),
                ("exempt_label", (230, 466, )),
                ("exempt_value", (304, 466, )),
                ("status_label", (230, 452, )),
                ("status_value", (304, 452, )),
                ("_doc_no_label", (230, 437, )),
                ("_doc_no_value", (304, 437, )),
                ("post_status_label", (440, 481, )),
                ("post_status_value", (513, 481, )),
                ("validation_label", (440, 466, )),
                ("validation_value", (513, 466, )),
                ("datetime_label", (440, 452, )),
                ("datetime_value", (513, 452, )),
                ("total_timesheet_label", (651, 481, )),
                ("total_timesheet_value", (751, 481, )),
                ("standard_hours_label", (651, 466, )),
                ("standard_hours_value", (751, 466, )),
                ("total_billable_label", (651, 452, )),
                ("total_billable_value", (751, 452, )),
                ("percent_billability_label", (651, 437, )),
                ("percent_billability_value", (751, 437, )),
            ]
        # Column labels and footer, present on every page.
        targets += [
            ("id_label", (20, label_row, )),
            ("time_code_label", (40, label_row, )),
            ("project_label", (120, label_row, )),
            ("timetype_label", (200, label_row, )),
            ("total_label", (754, label_row, )),
            ("footer_datetime_value", (20, 27, )),
            ("footer_timezone_value", (120, 27, )),
            ("footer_pagenum_value", (688, 27, )),
            ("footer_pagecount_value", (732, 27, )),
        ]

        matched = None
        for part, target in targets:
            if is_approximately(loc, target):
                matched = part
                break
        else:
            # The day-of-week labels are matched as one wide band on the
            # column-label row rather than box by box. The band does not
            # overlap any fuzzy target above.
            if loc[1] == label_row and 370 <= loc[0] <= 730:
                matched = "day_label"

        if matched is None:
            return False
        self.in_header_footer_parts[matched] = True
        return True

    def handle_header_footer_end(self):
        """Handle header and footer content on a page.

        If a box is handled, return True. Otherwise return False to signal
        that further handling is necessary.
        """
        if self.in_header_footer_parts["_doc_no_label"]:
            value = self.pop_buffer()
            self.debug_assert(value, "Doc.No.", label="doc no label")
            self.in_header_footer_parts["_doc_no_label"] = False
            return True

        elif self.in_header_footer_parts["_doc_no_value"]:
            value = self.pop_buffer()
            self.debug_assert(value, "1", label="doc no")
            self.in_header_footer_parts["_doc_no_value"] = False
            return True

        elif self.in_header_footer_parts["timesheet_label"]:
            value = self.pop_buffer()
            self.debug_assert(
                value,
                "Timesheet\n[109015] Ricottone, Dominic",
                label="timesheet label",
            )
            self.in_header_footer_parts["timesheet_label"] = False
            return True

        elif self.in_header_footer_parts["timesheet_value"]:
            self.daterange = self.pop_buffer()
            self.in_header_footer_parts["timesheet_value"] = False
            return True

        elif self.in_header_footer_parts["location_label"]:
            value = self.pop_buffer()
            self.debug_assert(value, "Location:", label="location label")
            self.in_header_footer_parts["location_label"] = False
            return True

        elif self.in_header_footer_parts["location_value"]:
            value = self.pop_buffer()
            self.debug_assert(value, "[E01] Fors Marsh Group", label="location")
            self.in_header_footer_parts["location_value"] = False
            return True

        elif self.in_header_footer_parts["department_label"]:
            value = self.pop_buffer()
            self.debug_assert(value, "Department:", label="department label")
            self.in_header_footer_parts["department_label"] = False
            return True

        elif self.in_header_footer_parts["department_value"]:
            value = self.pop_buffer()
            self.debug_assert(
                value,
                ("[3200] Advanced Analytics", "[3230] Data Management", ),
                label="department",
            )
            self.in_header_footer_parts["department_value"] = False
            return True

        elif self.in_header_footer_parts["employee_type_label"]:
            value = self.pop_buffer()
            self.debug_assert(value, "Employee Type:", label="employee type label")
            self.in_header_footer_parts["employee_type_label"] = False
            return True

        elif self.in_header_footer_parts["employee_type_value"]:
            value = self.pop_buffer()
            self.debug_assert(value, "[1] Annual Salary", label="employee type")
            self.in_header_footer_parts["employee_type_value"] = False
            return True

        elif self.in_header_footer_parts["location_default_label"]:
            value = self.pop_buffer()
            self.debug_assert(value, "Location (Default", label="location default label")
            self.in_header_footer_parts["location_default_label"] = False
            return True

        elif self.in_header_footer_parts["location_default_value"]:
            value = self.pop_buffer()
            self.debug_assert(value, "[LOCAL] Location", label="location default")
            self.in_header_footer_parts["location_default_value"] = False
            return True

        elif self.in_header_footer_parts["function_label"]:
            value = self.pop_buffer()
            self.debug_assert(value, "Function:", label="function label")
            self.in_header_footer_parts["function_label"] = False
            return True

        elif self.in_header_footer_parts["function_value"]:
            value = self.pop_buffer()
            self.debug_assert(value, "[1] Full Time", label="function")
            self.in_header_footer_parts["function_value"] = False
            return True

        elif self.in_header_footer_parts["exempt_label"]:
            value = self.pop_buffer()
            self.debug_assert(value, "Exempt:", label="exempt label")
            self.in_header_footer_parts["exempt_label"] = False
            return True

        elif self.in_header_footer_parts["exempt_value"]:
            value = self.pop_buffer()
            self.debug_assert(value, "Yes", label="exempt")
            self.in_header_footer_parts["exempt_value"] = False
            return True

        elif self.in_header_footer_parts["status_label"]:
            value = self.pop_buffer()
            self.debug_assert(value, "Status:", label="status label")
            self.in_header_footer_parts["status_label"] = False
            return True

        elif self.in_header_footer_parts["status_value"]:
            value = self.pop_buffer()
            self.debug_assert(
                value,
                ("Approved", "Closed", "On Hold [Draft]", ),
                label="status",
            )
            self.in_header_footer_parts["status_value"] = False
            return True

        elif self.in_header_footer_parts["post_status_label"]:
            value = self.pop_buffer()
            self.debug_assert(value, "Post Status:", label="post status label")
            self.in_header_footer_parts["post_status_label"] = False
            return True

        elif self.in_header_footer_parts["post_status_value"]:
            value = self.pop_buffer()
            self.debug_assert(
                value,
                ("Posted", "Not posted", ),
                label="post status",
            )
            self.in_header_footer_parts["post_status_value"] = False
            return True

        elif self.in_header_footer_parts["validation_label"]:
            value = self.pop_buffer()
            self.debug_assert(value, "Validation:", label="validation label")
            self.in_header_footer_parts["validation_label"] = False
            return True

        elif self.in_header_footer_parts["validation_value"]:
            value = self.pop_buffer()
            self.debug_assert(
                value,
                ("Passed", "Warnings", ),
                label="validation",
            )
            self.in_header_footer_parts["validation_value"] = False
            return True

        elif self.in_header_footer_parts["datetime_label"]:
            value = self.pop_buffer()
            self.debug_assert(value, "Date/Time:", label="datetime label")
            self.in_header_footer_parts["datetime_label"] = False
            return True

        elif self.in_header_footer_parts["datetime_value"]:
            value = self.pop_buffer()
            self.in_header_footer_parts["datetime_value"] = False
            return True

        elif self.in_header_footer_parts["total_timesheet_label"]:
            value = self.pop_buffer()
            self.debug_assert(value, "Total Timesheet:", label="total timesheet label")
            self.in_header_footer_parts["total_timesheet_label"] = False
            return True

        elif self.in_header_footer_parts["total_timesheet_value"]:
            value = self.pop_buffer()
            self.in_header_footer_parts["total_timesheet_value"] = False
            return True

        elif self.in_header_footer_parts["standard_hours_label"]:
            value = self.pop_buffer()
            self.debug_assert(value, "Standard Hours:", label="standard hours label")
            self.in_header_footer_parts["standard_hours_label"] = False
            return True

        elif self.in_header_footer_parts["standard_hours_value"]:
            value = self.pop_buffer()
            self.in_header_footer_parts["standard_hours_value"] = False
            return True

        elif self.in_header_footer_parts["total_billable_label"]:
            value = self.pop_buffer()
            self.debug_assert(value, "Total Billable:", label="total billable label")
            self.in_header_footer_parts["total_billable_label"] = False
            return True

        elif self.in_header_footer_parts["total_billable_value"]:
            value = self.pop_buffer()
            self.in_header_footer_parts["total_billable_value"] = False
            return True

        elif self.in_header_footer_parts["percent_billability_label"]:
            value = self.pop_buffer()
            self.debug_assert(value, "Percent Billability:", label="percent billability label")
            self.in_header_footer_parts["percent_billability_label"] = False
            return True

        elif self.in_header_footer_parts["percent_billability_value"]:
            value = self.pop_buffer()
            self.in_header_footer_parts["percent_billability_value"] = False
            return True

        elif self.in_header_footer_parts["id_label"]:
            value = self.pop_buffer()
            self.debug_assert(value, "ID", label="id label")
            self.in_header_footer_parts["id_label"] = False
            return True

        elif self.in_header_footer_parts["time_code_label"]:
            value = self.pop_buffer()
            self.debug_assert(value, "Time Code", label="time code label")
            self.in_header_footer_parts["time_code_label"] = False
            return True

        elif self.in_header_footer_parts["project_label"]:
            value = self.pop_buffer()
            self.debug_assert(value, "Project", label="project label")
            self.in_header_footer_parts["project_label"] = False
            return True

        elif self.in_header_footer_parts["timetype_label"]:
            value = self.pop_buffer()
            self.debug_assert(value, "TimeType", label="timetype label")
            self.in_header_footer_parts["timetype_label"] = False
            return True

        elif self.in_header_footer_parts["day_label"]:
            value = self.pop_buffer()
            self.debug_assert(
                value,
                ("Mon", "Tue Wed", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun", ),
                label="day label",
            )
            self.in_header_footer_parts["day_label"] = False
            return True

        elif self.in_header_footer_parts["total_label"]:
            value = self.pop_buffer()
            self.in_header_footer_parts["total_label"] = False
            return True

        elif self.in_header_footer_parts["footer_datetime_value"]:
            value = self.pop_buffer()
            self.in_header_footer_parts["footer_datetime_value"] = False
            return True

        elif self.in_header_footer_parts["footer_timezone_value"]:
            value = self.pop_buffer()
            self.in_header_footer_parts["footer_timezone_value"] = False
            return True

        elif self.in_header_footer_parts["footer_pagenum_value"]:
            value = self.pop_buffer()
            self.in_header_footer_parts["footer_pagenum_value"] = False
            return True

        elif self.in_header_footer_parts["footer_pagecount_value"]:
            value = self.pop_buffer()
            self.in_header_footer_parts["footer_pagecount_value"] = False
            return True

        return False

def parse(filename_in, filename_out):
    """Main routine. Reads an XML file and writes a CSV file.

    The XML is pushed through a `TimeSheetHandler`; every line of every page
    it collected is then written as one CSV row, page by page.
    """
    sheet_handler = TimeSheetHandler()
    sax_parser = make_parser()
    sax_parser.setContentHandler(sheet_handler)
    sax_parser.parse(filename_in)

    with open(filename_out, "w", newline="") as csv_file:
        csv_writer = csv.writer(csv_file)
        for page_lines in sheet_handler.page_buffer:
            csv_writer.writerows(page_lines)