From c041ec57595dc0e315a64b892633df30acf57509 Mon Sep 17 00:00:00 2001
From: Dominic Ricottone <me@dominic-ricottone.com>
Date: Fri, 29 Apr 2022 20:26:34 -0500
Subject: [PATCH] Initial commit

---
 Makefile       |  8 +++++
 main.py        | 23 ++++++++++++++
 notes          | 63 ++++++++++++++++++++++++++++++++++++
 parse.py       | 66 ++++++++++++++++++++++++++++++++++++++
 parser/html.py | 86 ++++++++++++++++++++++++++++++++++++++++++++++++++
 parser/pdf.py  | 42 ++++++++++++++++++++++++
 6 files changed, 288 insertions(+)
 create mode 100644 Makefile
 create mode 100644 main.py
 create mode 100644 notes
 create mode 100644 parse.py
 create mode 100644 parser/html.py
 create mode 100644 parser/pdf.py
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..9906ed8
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,12 @@
+# Delete Python bytecode caches, including the one Python creates inside
+# the `parser` directory. Short flags are used because they are the POSIX
+# spelling; the long forms are a GNU extension.
+.PHONY: clean
+clean:
+	rm -rf __pycache__ parser/__pycache__
+
+# Pass every PDF in ~/web to main.py.
+.PHONY: process
+process:
+	python3 main.py ~/web/*.pdf
+
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..e608ede
--- /dev/null
+++ b/main.py
@@ -0,0 +1,34 @@
+#!/usr/bin/env python3
+
+"""Command-line entry point: run the timesheet parser on each file given."""
+
+import sys
+import pathlib
+
+import parse
+
+def main(filelist):
+    """Parse every timesheet in `filelist`.
+
+    `filelist` is a sequence of paths. Each one is passed to
+    `parse.timesheet` in order; the results are not collected here.
+    """
+    print(f"processing {len(filelist)} files")
+    # Walk the whole list rather than a fixed index, so that the number of
+    # files reported above is the number actually parsed and short (or
+    # empty) lists do not raise IndexError.
+    for filename in filelist:
+        parse.timesheet(filename)
+
+if __name__ == "__main__":
+    # Keep the arguments that name existing paths; report the others.
+    filelist = []
+    for filename in sys.argv[1:]:
+        filepath = pathlib.Path(filename)
+        if filepath.exists():
+            filelist.append(filepath)
+        else:
+            print(f"no such file: '{filename}'")
+
+    main(filelist)
+
diff --git a/notes b/notes
new file mode 100644
index 0000000..a04ffcb
--- /dev/null
+++ b/notes
@@ -0,0 +1,63 @@
+index                   20px
+timecode (usually "ST") 40px
+project                 39px
+timetype                201px
+
+
+
+day1     571px or 572px
+day2     597px
+day3     622px
+day4     646px or 647px
+day5     671px or 672px
+day6     696px or 697px
+day7     721px or 722px
+rowtotal 751px or 753px or 756px
+
+document header:
+"Timesheet"
+"Mon N, YYYY - Mon N, YYYY"
+"Location:"
+"[E01] Fors Marsh Group"
+"Department:"
+"[3200] Advanced Analytics"
+"Employee Type:"
+"[1] Annual Salary"
+"Location (Default"
+"[LOCAL] Location"
+"Function:"
+"Exempt:"
+"Status:"
+"[1] Full Time"
+"Yes"
+"Approved"
+"Post Status:"
+"Validation:"
+"Date/Time:"
+"Posted"
+"Passed"
+"Mon N, YYYY HH:MM"
+"Total Timesheet:"
+"Standard Hours:"
+"Total Billable:"
+"Percent Billability:"
+<total hours on spreadsheet>
+<minimum hours for spreadsheet>
+<total billable hours on spreadsheet>
+<total billable hours / total hours>
+
+left column header:
+"ID"
+"Time Code"
+"Project"
+"TimeType"
+
+right column header:
+"Mon"
+"Tues Wed"
+"Thu"
+"Fri"
+"Sat"
+"Sun"
+(or some cycled version of this)
+
diff --git a/parse.py b/parse.py
new file mode 100644
index 0000000..5b55629
--- /dev/null
+++ b/parse.py
@@ -0,0 +1,95 @@
+#!/usr/bin/env python3
+
+"""The parsers are developed, debugged, and refactored in this file.
+
+When they mature, I refactor them into standalone modules under the `parser`
+directory.
+
+Eventually, the entire parse step will mature and be abstracted into a
+single function call, which will be appropriate to call in `main.py`.
+
+If you can see this file, then I'm not done yet.
+"""
+
+from pprint import pprint
+
+# NOTE(review): the standard library shipped its own `parser` module through
+# Python 3.9 (removed in 3.10); confirm that these imports resolve to the
+# local `parser` directory on every interpreter you intend to support.
+from parser.html import parse as parse_html
+from parser.pdf import parse as parse_pdf
+
+def read_timesheet(filename):
+    """Convert the PDF at `filename` to HTML and return the parsed events.
+
+    The return value is whatever `parser.html.parse` produces for the HTML
+    string that `parser.pdf.parse` extracts from the file.
+    """
+    unstructured_data = parse_pdf(filename)
+    semistructured_data = parse_html(unstructured_data)
+    return semistructured_data
+
+def has_style_left(attrs):
+    """Return True if `attrs` has a `style` entry that contains `left`."""
+    return "style" in attrs and "left" in attrs["style"]
+
+def update_count(counters, key):
+    """Add one to the tally for `key` in `counters`, starting from zero.
+
+    The dictionary is modified in place and also returned.
+    """
+    counters[key] = counters.get(key, 0) + 1
+    return counters
+
+def parse_timesheet(data):
+    """Print the `left` offset and first text line of each positioned cell.
+
+    `data` is a sequence of events: lists holding a kind (`START`, `END` or
+    `DATA`), a tag name and, for `START` and `DATA`, a payload. A cell is
+    three consecutive events: a `START` `div` whose `style` has a `left`
+    entry, a `START` `span`, and a `DATA` event. Anything else in the second
+    or third position abandons the candidate and is not itself re-examined
+    as the start of a new one.
+
+    Work in progress: matches are only printed, and the function always
+    returns an empty list.
+    """
+    in_div = False
+    in_span = False
+    left = ""
+
+    for line in data:
+        if in_span:
+            # Third event: print it if it is text, then start over either way.
+            if line[0] == "DATA":
+                print(f"{left:10} {line[2][0]}")
+            in_span = False
+            in_div = False
+        elif in_div:
+            # Second event: only a `span` opening keeps the candidate alive.
+            if line[0] == "START" and line[1] == "span":
+                in_span = True
+            else:
+                in_div = False
+        else:
+            # First event: a `div` opening that carries a `left` style.
+            if line[0] == "START" and line[1] == "div" and has_style_left(line[2]):
+                in_div = True
+                left = line[2]["style"]["left"]
+
+    return []
+
+def extract_projects(structured_data):
+    """Placeholder: ignores `structured_data` and returns an empty list."""
+    return []
+
+def timesheet(filename):
+    """Run the read, parse, and extract steps for the file at `filename`.
+
+    Returns the result of `extract_projects`.
+    """
+    unstructured_data = read_timesheet(filename)
+    structured_data = parse_timesheet(unstructured_data)
+    projects = extract_projects(structured_data)
+    return projects
+
diff --git a/parser/html.py b/parser/html.py
new file mode 100644
index 0000000..ec8cdac
--- /dev/null
+++ b/parser/html.py
@@ -0,0 +1,102 @@
+#!/usr/bin/env python3
+
+# Crash Course on html.parser
+#
+# A SAX-style parser. Hook into tags and data like...
+#
+# ```
+# from html.parser import HTMLParser
+# class MyHTMLParser(HTMLParser):
+#    def handle_starttag(self, tag, attrs):
+#        #do something...
+#    def handle_endtag(self, tag):
+#        #do something...
+#    def handle_data(self, data):
+#        #do something...
+# ```
+#
+# Valid HTML is fed into the parser like...
+#
+# ```
+# parser = MyHTMLParser()
+# parser.feed(html)
+# ```
+
+from html.parser import HTMLParser
+
+def parse_attrs_string(attrs):
+    """Parse a string structured like `key1:value1;key2:value2;`.
+
+    Embedded CSS (as in `style` attributes) can look like this.
+
+    Each declaration is split at its first colon only, so a value that
+    itself contains colons (`background:url(http://host/x.png)`) is kept
+    whole. Blank declarations and declarations without a colon are skipped.
+    Keys and values are stripped of surrounding whitespace; when a key is
+    repeated, the last value wins.
+    """
+    attrs_dict = {}
+    for pair in attrs.split(";"):
+        key, colon, value = pair.partition(":")
+        if not colon:
+            # Blank, or not shaped like `key:value`.
+            continue
+        attrs_dict[key.strip()] = value.strip()
+    return attrs_dict
+
+def parse_attrs_doubles(attrs):
+    """Parse a dictionary of HTML/CSS attributes from a series of doubles.
+
+    The built-in Python HTML parser (`html.parser.HTMLParser`) hands attributes
+    to the `handle_starttag` hook like this.
+
+    The value of a `style` attribute is expanded into a nested dictionary by
+    `parse_attrs_string`; a `style` attribute written without a value (which
+    the parser reports as `None`) becomes an empty dictionary. All other
+    values are stored unchanged.
+    """
+    attrs_dict = {}
+    for key, value in attrs:
+        if key == "style":
+            value = parse_attrs_string(value or "")
+        attrs_dict[key] = value
+    return attrs_dict
+
+class TimesheetHTMLParser(HTMLParser):
+    """A specialization of the `html.parser.HTMLParser` class to handle my
+    timesheets.
+
+    Data is stored internally and can be dumped with the `dump` method.
+
+    Don't forget to close the parser instance!
+    """
+    def __init__(self):
+        super().__init__()
+        # Parse events, appended in the order the hooks fire.
+        self._data = []
+    def handle_starttag(self, tag, _attrs):
+        """Record `["START", tag, attrs]`, with `attrs` as a dictionary."""
+        attrs = parse_attrs_doubles(_attrs)
+        self._data.append(["START", tag, attrs])
+    def handle_endtag(self, tag):
+        """Record `["END", tag]`."""
+        self._data.append(["END", tag])
+    def handle_data(self, data):
+        """Record `["DATA", "", lines]`, with `data` split into lines."""
+        self._data.append(["DATA", "", data.splitlines()])
+    def dump(self):
+        """Return the internal event list itself (not a copy)."""
+        return self._data
+
+def parse(html):
+    """Read an HTML-encoded string into semi-structured data.
+
+    Returns the list of events recorded by `TimesheetHTMLParser`.
+    """
+    parser = TimesheetHTMLParser()
+    parser.feed(html)
+    # `close` makes the parser process any input it is still buffering, so
+    # call it before handing the events back.
+    parser.close()
+    return parser.dump()
+
diff --git a/parser/pdf.py b/parser/pdf.py
new file mode 100644
index 0000000..67aa2bf
--- /dev/null
+++ b/parser/pdf.py
@@ -0,0 +1,49 @@
+#!/usr/bin/env python3
+
+from io import StringIO
+
+# Crash Course on pdfminer
+#
+# Extract text from PDFs like...
+#
+# ```
+# from pdfminer.high_level import extract_text
+# with open(filename, "rb") as f:
+#   text = extract_text(f)
+# ```
+#
+# The alternative is to use something like...
+#
+# ```
+# from io import StringIO
+# from pdfminer.high_level import extract_text_to_fp
+# from pdfminer.layout import LAParams
+# buffer = StringIO()
+# with open(filename, "rb") as f:
+#   extract_text_to_fp(f, buffer, laparams=LAParams(), output_type="html", codec=None)
+# html = buffer.getvalue()
+# ```
+
+from pdfminer.high_level import extract_text_to_fp
+from pdfminer.layout import LAParams
+
+def parse(filename):
+    """Read a binary PDF-encoded file and convert it into an HTML-encoded
+    string.
+
+    `filename` is opened in binary mode and passed to pdfminer's
+    `extract_text_to_fp`, which writes HTML into an in-memory text buffer;
+    the buffer's contents are returned.
+    """
+    # Both context managers release their resource on the way out, whether
+    # or not the conversion raises.
+    with StringIO() as html_buffer, open(filename, "rb") as pdf_file:
+        extract_text_to_fp(
+            pdf_file,
+            html_buffer,
+            laparams=LAParams(),
+            output_type="html",
+            codec=None,
+        )
+        return html_buffer.getvalue()
+
-- 
2.45.2