~dricottone/fmg-timesheets

7326d28027df088781be5bf0cb673109a1cc0d13 — Dominic Ricottone 2 years ago 1c07f9a
Started implementing time entry extraction.

Time entries are now being parsed and validated, though there are
numerous issues still to sort out.

I have a feeling that further development will require passing around
the `top` style attributes in the same way I'm passing around the `left`
style attributes. TBD though.
3 files changed, 158 insertions(+), 18 deletions(-)

M parse.py
A parser/timeentry.py
M parser/timesheet.py
M parse.py => parse.py +2 -2
@@ 15,7 15,7 @@ from pprint import pprint

from parser.html import parse as parse_html
from parser.pdf import parse as parse_pdf
from parser.timesheet import Timesheet
from parser.timesheet import TimeSheet

def read_timesheet(filename):
    unstructured_data = parse_pdf(filename)


@@ 23,7 23,7 @@ def read_timesheet(filename):
    return semistructured_data

def parse_timesheet(data):
    t = Timesheet(data)
    t = TimeSheet(data)
    t.report_issues()
    return []


A parser/timeentry.py => parser/timeentry.py +84 -0
@@ 0,0 1,84 @@
#!/usr/bin/env python3

from re import compile as re_compile

ENTRY_PATTERNS = (
    re_compile("[1-9][0-9]?"),
    re_compile("(ST|VAC|HOL)"),
    re_compile("(OHORG\.(MTG|ONB)\.0[0-5]\.00[0-5]|UNALW\.EMW\.0[0-5]\.00[0-5]|[0-9]{5}\.[0-9]{3}\.[0-9]{2}\.[0-9]{3})"),
    re_compile("[0-9]{3,5}"),
    re_compile("."),
    re_compile("Notes"),
    re_compile("[ADFJMNOS][aceopu][bcglnprtvy] [1-9][0-9]?, 20[12][89012]"),
    re_compile("- [ADFJMNOS][aceopu][bcglnprtvy] [1-9][0-9]?, 20[12][89012]"),
    re_compile("."),
)

class TimeEntry(object):
    def __init__(self, semistructured_data, timesheet_name, entry_number):
        self._issues = []
        self._timesheet = timesheet_name
        self._entry = entry_number
        self._data = self.beat_data_with_a_stick(semistructured_data)
        self.assert_entry()

    def __str__(self):
        return f"entry #{self._entry} in the timesheet for {self._timesheet}"

    def __len__(self):
        return len(self._data)

    def report_issues(self):
        if self._issues:
            print(f"There are some issues in {self}...")
            for issue in self._issues:
                print(issue)
            for index, line in enumerate(self._data):
                print(index, line)

    def log_issue(self, *parts):
        self._issues.append(''.join(parts))

    def beat_data_with_a_stick(self, data):
        # Project and TimeType fields can get merged
        if " " in data[2][0]:
            project_timetype = data[2][0].split(" ", 1)
            shared = [n for n in data[2][1:]]
            split = [
                (project_timetype[0], *shared, ),
                (project_timetype[1], *shared, ),
            ]
            data = data[:2] + split + data[3:]

        # Holiday and Vacation time entries skip the Project and TimeType fields
        if data[1][0] in ("HOL", "VAC", ):
            data = data[0:1] + [("", 0, ), ("", 0, )] + data[1:]

        return data

    def assert_entry_item(self, index, should_be_at):
        if not ENTRY_PATTERNS[index].match(self._data[index][0]):
            self.log_issue(
                "assert_entry: ",
                f"item {index} does not look right ",
                f"({self._data[index][0]})",
            )
        if self._data[index][1] != should_be_at:
            self.log_issue(
                "assert_entry: ",
                f"item {index} is not at {should_be_at}px left ",
                f"(is at {self._data[index][1]}px)",
            )

    def assert_entry(self):
        self.assert_entry_item(1, 40)
        self.assert_entry_item(2, 120)
        self.assert_entry_item(3, 201)
        print(self._data)
        self.assert_entry_item(4, 39)
        if len(self)>5 and self._data[5] == "Notes":
            self.assert_entry_item(5, 40)
            self.assert_entry_item(6, 40)
            self.assert_entry_item(7, 99)
            self.assert_entry_item(8, 220)


M parser/timesheet.py => parser/timesheet.py +72 -16
@@ 2,7 2,9 @@

from re import compile as re_compile

PATTERNS = (
from .timeentry import TimeEntry, ENTRY_PATTERNS

PAGE_PATTERNS = (
    re_compile("[ADFJMNOS][aceopu][bcglnprtvy] [1-9][0-9]?, 20[12][89012] [012][0-9]:[0-5][0-9]"),
    re_compile("\(GMT-0[456]:00\) .*"),
    re_compile("Page [1-9][0-9]?"),


@@ 35,41 37,55 @@ def is_multiple_8(string):
    except:
        return False

class Timesheet(object):
class TimeSheet(object):
    def __init__(self, semistructured_data):
        self._warnings = []
        self._issues = []
        self._data = self.beat_data_with_a_stick(semistructured_data)
        self.assert_header()
        self.parse_header()
        self.parse_footer()
        self.parse_pages()
        self.parse_entries()

    def __str__(self):
        return self.header.get("dates", "Timesheet(...)")
        return self.header.get("dates", "TimeSheet(...)")

    def __len__(self):
        return len(self._data)

    def report_issues(self):
        if self._warnings:
            print(f"There are some warnings in the timesheet for {self}...")
            for warning in self._warnings:
                print(warning)
        if self._issues:
            print(f"There are some issues in the timesheet for {self}...")
            for issue in self._issues:
                print(issue)
            for index, line in enumerate(self._data):
                print(index, line)
        for entry in self._entries:
            entry.report_issues()

    def log_issue(self, *parts):
        self._issues.append(''.join(parts))

    def log_warning(self, *parts):
        self._warnings.append(''.join(parts))

    def beat_data_with_a_stick(self, data):
        """A fragile solution to the problems resulting from
        PDF-to-HTML-to-long data conversions.
        """
        # a small number of timesheets have a 'Doc.No.' field
        if data[14][0] == "Doc.No." and data[17][0] == "1":
            self.log_issue("beat_data_with_a_stick: killing 'Doc.No.' fields")
            self.log_warning("beat_data_with_a_stick: killing 'Doc.No.' fields")
            #Note: subtract 1 due to the other operation realigning indices
            data = remove_item(data, 14)
            data = remove_item(data, 17-1)
        elif data[16][0] == "Doc.No." and data[19][0] == "1":
            self.log_issue(
            self.log_warning(
                "beat_data_with_a_stick: ",
                "killing 'Doc.No.' label and field",
            )


@@ 78,7 94,7 @@ class Timesheet(object):

        # 'Post Status:' can float up if the status is 'Not posted'
        if data[6][0] == "Post Status:" and data[7][0] == "Not posted":
            self.log_issue(
            self.log_warning(
                "beat_data_with_a_stick: ",
                "re-sorting post status label and field",
            )


@@ 88,7 104,7 @@ class Timesheet(object):

        # 'Function:' and '[1] Full Time' can float up together
        if data[4][0] == "Function:" and data[5][0] == "[1] Full Time":
            self.log_issue(
            self.log_warning(
                "beat_data_with_a_stick: ",
                "re-sorting function label and field",
            )


@@ 96,7 112,7 @@ class Timesheet(object):
            data = move_item(data, 4, 10+1)
            data = move_item(data, 5-1, 13)
        elif data[8][0] == "Function:" and data[11][0] == "[1] Full Time":
            self.log_issue(
            self.log_warning(
                "beat_data_with_a_stick: ",
                "re-sorting function label and field",
            )


@@ 105,7 121,7 @@ class Timesheet(object):

        # 'Percent Billability:' can float down
        if data[28][0] == "Percent Billability:":
            self.log_issue(
            self.log_warning(
                "beat_data_with_a_stick: ",
                "re-sorting percent billability label",
            )


@@ 113,7 129,7 @@ class Timesheet(object):

        # 'Validation:' and 'Passed' can float up
        if data[16][0] == "Validation:" and data[19][0] == "Passed":
            self.log_issue(
            self.log_warning(
                "beat_data_with_a_stick: ",
                "re-sorting validation label and field",
            )


@@ 122,7 138,7 @@ class Timesheet(object):

        # 'Posted'/'Not posted' can float up
        if data[17][0] == "Posted" or data[17][0] == "Not posted":
            self.log_issue(
            self.log_warning(
                "beat_data_with_a_stick: ",
                "re-sorting post status field",
            )


@@ 130,7 146,7 @@ class Timesheet(object):

        # 'ID', 'Time Code', 'Project', and 'TimeType' and float down
        if data[41][0] == "ID" and data[42][0] == "Time Code" and data[43][0] == "Project" and data[44][0] == "TimeType":
            self.log_issue(
            self.log_warning(
                "beat_data_with_a_stick: ",
                "re-sorting ID, Time Code, Project, and TimeType header",
            )


@@ 139,7 155,7 @@ class Timesheet(object):
            data = move_item(data, 43, 32)
            data = move_item(data, 44, 33)
        elif data[43][0] == "ID" and data[44][0] == "Time Code" and data[45][0] == "Project" and data[46][0] == "TimeType":
            self.log_issue(
            self.log_warning(
                "beat_data_with_a_stick: ",
                "re-sorting ID, Time Code, Project, and TimeType header",
            )


@@ 227,7 243,7 @@ class Timesheet(object):
    def parse_footer(self):
        """Loop though lines to identify the document footer and clear it."""
        target = None
        for n in range(len(self._data)-1, 0, -1):
        for n in range(len(self)-1, 0, -1):
            if self._data[n][0] == "Hours Distribution by Time Code":
                target = n
                break


@@ 243,6 259,7 @@ class Timesheet(object):
        """Loop through lines to identify page headers and footers, and clear
        those lines.
        """
        # Pages begin with "Timesheet"
        page_breaks = []
        for index, line in enumerate(self._data):
            if line[0] == "Timesheet":


@@ 253,15 270,54 @@ class Timesheet(object):
                "could not locate any page breaks",
            )

        # At each page break, there is a page footer sequence before:
        #  + "Mmm N, YYYY HH:MM"
        #  + "(GMT-0H:00) TZNAME"
        #  + "Page N"
        #  + "of N"
        # ...and a page header sequence after:
        #  + "Mmm N, YYYY - Mmm N, YYYY"
        #  + "ID"
        #  + "Time Code"
        #  + "Project"
        #  + "TimeType"
        # None of this is useful data. Immediately delete it.
        for page_break in reversed(page_breaks):
            i = page_break - 7
            while i < page_break:
                j = 0
                while j < len(PATTERNS):
                    if PATTERNS[j].match(self._data[i][0]):
                while j < len(PAGE_PATTERNS):
                    if PAGE_PATTERNS[j].match(self._data[i][0]):
                        del self._data[i]
                        j = 0
                    else:
                        j += 1
                i += 1

    def parse_entries(self):
        """Loop through lines to identify time entries."""
        i = 0
        entries = []
        while i < len(self):
            if self._data[i][1] == 20:
                if ENTRY_PATTERNS[0].match(self._data[i][0]):
                    entries.append([self._data[i]])
                else:
                    self.log_issue(
                        "parse_entries: ",
                        "something unexpected in the entry start position ",
                        f"({self._data[i][0]})",
                    )
            else:
                if len(entries)>0:
                    entries[-1].append(self._data[i])
                else:
                    self.log_issue(
                        "parse_entries: ",
                        "something unexpected before any entries started ",
                        f"({self._data[i][0]})",
                    )
            i += 1

        self._entries = [TimeEntry(e, str(self), e[0][0]) for e in entries]