From c041ec57595dc0e315a64b892633df30acf57509 Mon Sep 17 00:00:00 2001 From: Dominic Ricottone Date: Fri, 29 Apr 2022 20:26:34 -0500 Subject: [PATCH] Initial commit --- Makefile | 8 +++++ main.py | 23 ++++++++++++++ notes | 63 ++++++++++++++++++++++++++++++++++++ parse.py | 66 ++++++++++++++++++++++++++++++++++++++ parser/html.py | 86 ++++++++++++++++++++++++++++++++++++++++++++++++++ parser/pdf.py | 42 ++++++++++++++++++++++++ 6 files changed, 288 insertions(+) create mode 100644 Makefile create mode 100644 main.py create mode 100644 notes create mode 100644 parse.py create mode 100644 parser/html.py create mode 100644 parser/pdf.py diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..9906ed8 --- /dev/null +++ b/Makefile @@ -0,0 +1,8 @@ +.PHONY: clean +clean: + rm --force --recursive __pycache__ + +.PHONY: process +process: + python3 main.py ~/web/*.pdf + diff --git a/main.py b/main.py new file mode 100644 index 0000000..e608ede --- /dev/null +++ b/main.py @@ -0,0 +1,23 @@ +#!/usr/bin/env python3 + +import sys +import pathlib + +import parse + +def main(filelist): + print(f"processing {len(filelist)} files") + for filename in (filelist[4],): + parse.timesheet(filename) + +if __name__ == "__main__": + filelist = [] + for filename in sys.argv[1:]: + filepath = pathlib.Path(filename) + if filepath.exists(): + filelist.append(filepath) + else: + print(f"no such file: '{filename}'") + + main(filelist) + diff --git a/notes b/notes new file mode 100644 index 0000000..a04ffcb --- /dev/null +++ b/notes @@ -0,0 +1,63 @@ +index 20px +timecode (usually "ST") 40px +project 39px +timetype 201px + + + +day1 571px or 572px +day2 597px +day3 622px +day4 646px or 647px +day5 671px or 672px +day6 696px or 697px +day7 721px or 722px +rowtotal 751px or 753px or 756px + +document header: +"Timesheet" +"Mon N, YYYY - Mon N, YYYY" +"Location:" +"[E01] Fors Marsh Group" +"Department:" +"[3200] Advanced Analytics" +"Employee Type:" +"[1] Annual Salary" +"Location (Default" 
+"[LOCAL] Location" +"Function:" +"Exempt:" +"Status:" +"[1] Full Time" +"Yes" +"Approved" +"Post Status:" +"Validation:" +"Date/Time:" +"Posted" +"Passed" +"Mon N, YYYY HH:MM" +"Total Tomesheet:" +"Standard Hours:" +"Total Billable:" +"Percent Billability:" + + + + + +left column header: +"ID" +"Time Code" +"Project" +"TimeType" + +right column header: +"Mon" +"Tues Wed" +"Thu" +"Fri" +"Sat" +"Sun" +(or some cycled version of this) + diff --git a/parse.py b/parse.py new file mode 100644 index 0000000..5b55629 --- /dev/null +++ b/parse.py @@ -0,0 +1,66 @@ +#!/usr/bin/env python3 + +"""The parsers are developed, debugged, and refactored in this file. + +When they mature, I refactor them into standalone modules under the `parser` +directory. + +Eventually, the entire parse step will mature and be abstracted into a +single function call, which will be appropriate to call in `main.py`. + +If you can see this file, then I'm not done yet. +""" + +from pprint import pprint + +from parser.html import parse as parse_html +from parser.pdf import parse as parse_pdf + +def read_timesheet(filename): + unstructured_data = parse_pdf(filename) + semistructured_data = parse_html(unstructured_data) + return semistructured_data + +def has_style_left(attrs): + return "style" in attrs.keys() and "left" in attrs["style"].keys() + +def update_count(counters, key): + if key in counters.keys(): + counters[key] += 1 + else: + counters[key] = 1 + return counters + +def parse_timesheet(data): + + in_div = False + in_span = False + left = "" + + for line in data: + if in_span: + if line[0] == "DATA": + print(f"{left:10} {line[2][0]}") + in_span = False + in_div = False + elif in_div: + if line[0] == "START" and line[1] == "span": + in_span = True + else: + in_div = False + else: + if line[0] == "START" and line[1] == "div" and has_style_left(line[2]): + in_div = True + left = line[2]["style"]["left"] + + return [] + +def extract_projects(structured_data): + return [] + +def 
timesheet(filename): + unstructured_data = read_timesheet(filename) + structured_data = parse_timesheet(unstructured_data) + projects = extract_projects(structured_data) + return projects + diff --git a/parser/html.py b/parser/html.py new file mode 100644 index 0000000..ec8cdac --- /dev/null +++ b/parser/html.py @@ -0,0 +1,86 @@ +#!/usr/bin/env python3 + +# Crash Course on html.parser +# +# A SAX-style parser. Hook into tags and data like... +# +# ``` +# from html.parser import HTMLParser +# class MyHTMLParser(HTMLParser): +# def handle_starttag(self, tag, attrs): +# #do something... +# def handle_endtag(self, tag): +# #do something... +# def handle_data(self, data): +# #do something... +# ``` +# +# Valid HTML is fed into the parser like... +# +# ``` +# parser = MyHTMLParser() +# parser.feed(html) +# ``` + +from html.parser import HTMLParser + +def parse_attrs_string(attrs): + """Parse a string structured like `key1:value1;key2:value2;`. + + Embedded CSS (as in `style` attributes) can look like this. + """ + attrs_dict = {} + for pair in attrs.split(";"): + if len(pair.strip()) == 0: + continue + key, value = pair.split(":") + + key, value = key.strip(), value.strip() + attrs_dict[key] = value + return attrs_dict + +def parse_attrs_doubles(attrs): + """Parse a dictionary of HTML/CSS attributes from a series of doubles. + + The built-in Python HTML parser (`html.parser.HTMLParser`) hands attributes + to the `handle_starttag` hook like this. + """ + attrs_dict = {} + for pair in attrs: + key, value = pair + if key == "style": + value = parse_attrs_string(value) + attrs_dict[key] = value + return attrs_dict + +class TimesheetHTMLParser(HTMLParser): + """A specialization of the `html.parser.HTMLParser` class to handle my + timesheets. + + Data is stored internally and can be dumped with the `dump` method. + + Don't forget to close the parser instance! 
+ """ + def __init__(self): + HTMLParser.__init__(self) + self._data = [] + def handle_starttag(self, tag, _attrs): + attrs = parse_attrs_doubles(_attrs) + self._data.append(["START", tag, attrs]) + def handle_endtag(self, tag): + self._data.append(["END", tag]) + def handle_data(self, data): + self._data.append(["DATA", "", data.splitlines()]) + def dump(self): + return self._data + +def parse(html): + """Read an HTML-encoded string into semi-structured data.""" + parser = TimesheetHTMLParser() + try: + parser.feed(html) + data = parser.dump() + finally: + parser.close() + return data + diff --git a/parser/pdf.py b/parser/pdf.py new file mode 100644 index 0000000..67aa2bf --- /dev/null +++ b/parser/pdf.py @@ -0,0 +1,42 @@ +#!/usr/bin/env python3 + +from io import StringIO + +# Crash Course on pdfminer +# +# Extract text from PDFs like... +# +# ``` +# from pdfminer.high_level import extract_text +# with open(filename, "rb") as f: +# text = extract_text(f) +# ``` +# +# The alternative is to use something like... +# +# ``` +# from io import StringIO +# from pdfminer.high_level import extract_text_to_fp +# from pdfminer.layout import LAParams +# buffer = StringIO() +# with open(filename, "rb") as f: +# extract_text_to_fp(f, buffer, laparams=LAParams(), output_type="html", codec=None) +# html = buffer.getvalue() +# ``` + +from pdfminer.high_level import extract_text_to_fp +from pdfminer.layout import LAParams + +def parse(filename): + """Read a binary PDF-encoded file and convert it into an HTML-encoded + string. + """ + buffer = StringIO() + try: + with open(filename, "rb") as f: + extract_text_to_fp(f, buffer, laparams=LAParams(), output_type="html", codec=None) + data = buffer.getvalue() + finally: + buffer.close() + return data + -- 2.45.2