A => Makefile +8 -0
@@ 1,8 @@
+.PHONY: clean
+clean:
+ rm --force --recursive __pycache__
+
+.PHONY: process
+process:
+ python3 main.py ~/web/*.pdf
+
A => main.py +23 -0
@@ 1,23 @@
+#!/usr/bin/env python3
+
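+"""Entry point: validate the PDF paths given on the command line, then hand
+each one to `parse.timesheet`.  The Makefile's `process` target shows the
+intended invocation: `python3 main.py ~/web/*.pdf`.
+"""
+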
+import sys
+import pathlib
+
+import parse
+
+def main(filelist):
+ print(f"processing {len(filelist)} files")
+    for filename in filelist:
+ parse.timesheet(filename)
+
+if __name__ == "__main__":
+ filelist = []
+ for filename in sys.argv[1:]:
+ filepath = pathlib.Path(filename)
+ if filepath.exists():
+ filelist.append(filepath)
+ else:
+ print(f"no such file: '{filename}'")
+
+ main(filelist)
+
A => notes +63 -0
@@ 1,63 @@
+index 20px
+timecode (usually "ST") 40px
+project 39px
+timetype 201px
+
+
+
+day1 571px or 572px
+day2 597px
+day3 622px
+day4 646px or 647px
+day5 671px or 672px
+day6 696px or 697px
+day7 721px or 722px
+rowtotal 751px or 753px or 756px
+
+document header:
+"Timesheet"
+"Mon N, YYYY - Mon N, YYYY"
+"Location:"
+"[E01] Fors Marsh Group"
+"Department:"
+"[3200] Advanced Analytics"
+"Employee Type:"
+"[1] Annual Salary"
+"Location (Default"
+"[LOCAL] Location"
+"Function:"
+"Exempt:"
+"Status:"
+"[1] Full Time"
+"Yes"
+"Approved"
+"Post Status:"
+"Validation:"
+"Date/Time:"
+"Posted"
+"Passed"
+"Mon N, YYYY HH:MM"
+"Total Tomesheet:"
+"Standard Hours:"
+"Total Billable:"
+"Percent Billability:"
+<total hours on spreadsheet>
+<minimum hours for spreadsheet>
+<total billable hours on spreadsheet>
+<total billable hours / total hours>
+
+left column header:
+"ID"
+"Time Code"
+"Project"
+"TimeType"
+
+right column header:
+"Mon"
+"Tues Wed"
+"Thu"
+"Fri"
+"Sat"
+"Sun"
+(or some cycled version of this)
+
A => parse.py +66 -0
@@ 1,66 @@
+#!/usr/bin/env python3
+
+"""The parsers are developed, debugged, and refactored in this file.
+
+When they mature, I refactor them into standalone modules under the `parser`
+directory.
+
+Eventually, the entire parse step will mature and be abstracted into a
+single function call, which will be appropriate to call in `main.py`.
+
+If you can see this file, then I'm not done yet.
+"""
+
+from pprint import pprint
+
+from parser.html import parse as parse_html
+from parser.pdf import parse as parse_pdf
+
+def read_timesheet(filename):
+ unstructured_data = parse_pdf(filename)
+ semistructured_data = parse_html(unstructured_data)
+ return semistructured_data
+
+def has_style_left(attrs):
+    return "style" in attrs and "left" in attrs["style"]
+
+def update_count(counters, key):
+    if key in counters:
+        counters[key] += 1
+    else:
+        counters[key] = 1
+    return counters
+
+def parse_timesheet(data):
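+    """Walk the event stream produced by `parser.html` and, for now, print the
+    text found inside each `<div style="left:...">`/`<span>` pair.
+
+    Each entry in `data` has one of three shapes:
+
+        ["START", tag, attrs]   e.g. ["START", "div", {"style": {"left": "571px"}}]
+        ["END", tag]
+        ["DATA", "", lines]
+
+    Building structured rows is still to do, so an empty list is returned.
+    """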
+
+ in_div = False
+ in_span = False
+ left = ""
+
+ for line in data:
+ if in_span:
+ if line[0] == "DATA":
+ print(f"{left:10} {line[2][0]}")
+ in_span = False
+ in_div = False
+ elif in_div:
+ if line[0] == "START" and line[1] == "span":
+ in_span = True
+ else:
+ in_div = False
+ else:
+ if line[0] == "START" and line[1] == "div" and has_style_left(line[2]):
+ in_div = True
+ left = line[2]["style"]["left"]
+
+ return []
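+
+# A rough sketch, not wired in yet, of how the `left` offsets recorded in the
+# `notes` file could be mapped back to column names.  The nominal pixel values
+# are copied from `notes`; the tolerance is a guess meant to absorb the few
+# pixels of drift observed between timesheets.
+COLUMN_OFFSETS = {
+    20: "index",
+    39: "project",
+    40: "timecode",
+    201: "timetype",
+    571: "day1",
+    597: "day2",
+    622: "day3",
+    646: "day4",
+    671: "day5",
+    696: "day6",
+    721: "day7",
+    751: "rowtotal",
+}
+
+def column_for_left(left, tolerance=5):
+    """Return the column name whose nominal offset is closest to `left` (a
+    string like "571px"), or None if nothing is within `tolerance` pixels.
+    """
+    pixels = int(left.rstrip("px"))
+    offset, name = min(COLUMN_OFFSETS.items(), key=lambda item: abs(item[0] - pixels))
+    return name if abs(offset - pixels) <= tolerance else None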
+
+def extract_projects(structured_data):
+ return []
+
+def timesheet(filename):
+ unstructured_data = read_timesheet(filename)
+ structured_data = parse_timesheet(unstructured_data)
+ projects = extract_projects(structured_data)
+ return projects
+
A => parser/html.py +86 -0
@@ 1,86 @@
+#!/usr/bin/env python3
+
+# Crash Course on html.parser
+#
+# A SAX-style parser. Hook into tags and data like...
+#
+# ```
+# from html.parser import HTMLParser
+# class MyHTMLParser(HTMLParser):
+# def handle_starttag(self, tag, attrs):
+# #do something...
+# def handle_endtag(self, tag):
+# #do something...
+# def handle_data(self, data):
+# #do something...
+# ```
+#
+# Valid HTML is fed into the parser like...
+#
+# ```
+# parser = MyHTMLParser()
+# parser.feed(html)
+# ```
+
+from html.parser import HTMLParser
+
+def parse_attrs_string(attrs):
+    """Parse a string structured like `key1:value1;key2:value2;`.
+
+ Embedded CSS (as in `style` attributes) can look like this.
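+
+    For example:
+
+        >>> parse_attrs_string("top:118px; left:571px;")
+        {'top': '118px', 'left': '571px'}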
+ """
+ attrs_dict = {}
+ for pair in attrs.split(";"):
+ if len(pair.strip()) == 0:
+ continue
+        key, value = pair.split(":", 1)
+        key, value = key.strip(), value.strip()
+ attrs_dict[key] = value
+ return attrs_dict
+
+def parse_attrs_doubles(attrs):
+ """Parse a dictionary of HTML/CSS attributes from a series of doubles.
+
+ The built-in Python HTML parser (`html.parser.HTMLParser`) hands attributes
+ to the `handle_starttag` hook like this.
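+
+    For example:
+
+        >>> parse_attrs_doubles([("id", "x3"), ("style", "left:571px; top:118px;")])
+        {'id': 'x3', 'style': {'left': '571px', 'top': '118px'}}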
+ """
+ attrs_dict = {}
+ for pair in attrs:
+ key, value = pair
+ if key == "style":
+ value = parse_attrs_string(value)
+ attrs_dict[key] = value
+ return attrs_dict
+
+class TimesheetHTMLParser(HTMLParser):
+ """A specialization of the `html.parser.HTMLParser` class to handle my
+ timesheets.
+
+ Data is stored internally and can be dumped with the `dump` method.
+
+ Don't forget to close the parser instance!
+ """
+ def __init__(self):
+        super().__init__()
+ self._data = []
+ def handle_starttag(self, tag, _attrs):
+ attrs = parse_attrs_doubles(_attrs)
+ self._data.append(["START", tag, attrs])
+ def handle_endtag(self, tag):
+ self._data.append(["END", tag])
+ def handle_data(self, data):
+ self._data.append(["DATA", "", data.splitlines()])
+ def dump(self):
+ return self._data
+
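+# For a fragment like `<div style="left:571px;"><span>8.00</span></div>`,
+# `parse` (below) returns a list of entries like:
+#
+#     ["START", "div", {"style": {"left": "571px"}}]
+#     ["START", "span", {}]
+#     ["DATA", "", ["8.00"]]
+#     ["END", "span"]
+#     ["END", "div"]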
+def parse(html):
+ """Read an HTML-encoded string into semi-structured data."""
+    parser = TimesheetHTMLParser()
+    try:
+        parser.feed(html)
+    finally:
+        # close() flushes any buffered input through the handlers, so dump
+        # only after closing.
+        parser.close()
+    return parser.dump()
+
A => parser/pdf.py +42 -0
@@ 1,42 @@
+#!/usr/bin/env python3
+
+from io import StringIO
+
+# Crash Course on pdfminer
+#
+# Extract text from PDFs like...
+#
+# ```
+# from pdfminer.high_level import extract_text
+# with open(filename, "rb") as f:
+# text = extract_text(f)
+# ```
+#
+# The alternative is to use something like...
+#
+# ```
+# from io import StringIO
+# from pdfminer.high_level import extract_text_to_fp
+# from pdfminer.layout import LAParams
+# buffer = StringIO()
+# with open(filename, "rb") as f:
+# extract_text_to_fp(f, buffer, laparams=LAParams(), output_type="html", codec=None)
+# html = buffer.getvalue()
+# ```
+
+from pdfminer.high_level import extract_text_to_fp
+from pdfminer.layout import LAParams
+
+def parse(filename):
+ """Read a binary PDF-encoded file and convert it into an HTML-encoded
+ string.
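+
+    pdfminer's HTML output positions each piece of text with absolute CSS
+    offsets (e.g. `style="... left:571px; top:118px; ..."`), which is what the
+    downstream HTML parser keys on.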
+ """
+ buffer = StringIO()
+ try:
+ with open(filename, "rb") as f:
+ extract_text_to_fp(f, buffer, laparams=LAParams(), output_type="html", codec=None)
+ data = buffer.getvalue()
+ finally:
+ buffer.close()
+ return data
+