~dricottone/fmg-timesheets

ref: ae939a2885fa86f68a1456565d379ca233ec3b19 fmg-timesheets/parser/pdf.py -rw-r--r-- 920 bytes
ae939a28Dominic Ricottone Goodbye HTML, hello XML 2 years ago
                                                                                
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
#!/usr/bin/env python3

from io import StringIO

from pdfminer.converter import XMLConverter
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.layout import LAParams

def parse(filename_in, filename_out):
    """Main routine. Reads a PDF file and writes an XML file.

    Every page of the PDF at ``filename_in`` is run through a pdfminer
    ``XMLConverter`` that renders into an in-memory text buffer. The
    buffered text is then written to ``filename_out`` with its first line
    removed and a closing ``</pages>`` line appended.

    The output is always encoded as UTF-8. Because the first buffered line
    is dropped, the written document carries no encoding declaration, and
    XML parsers treat such documents as UTF-8; relying on the platform's
    locale encoding could therefore produce a file that cannot be parsed
    (or fail outright on characters the locale cannot encode).

    Args:
        filename_in: Path of the PDF file to read.
        filename_out: Path of the XML file to create or overwrite.

    Raises:
        OSError: If either file cannot be opened.
    """
    buffer = StringIO()
    manager = PDFResourceManager(caching=False)
    # codec=None: the converter is handed a text buffer, not a binary one.
    converter = XMLConverter(manager, buffer, laparams=LAParams(), codec=None)
    interpreter = PDFPageInterpreter(manager, converter)

    with open(filename_in, "rb") as f:
        for page in PDFPage.get_pages(f, caching=False):
            interpreter.process_page(page)

    # Skip the first buffered line (presumably the XML declaration emitted
    # by the converter -- confirm against the installed pdfminer version).
    lines = buffer.getvalue().splitlines()[1:]

    with open(filename_out, "w", encoding="utf-8") as f:
        f.writelines(line + "\n" for line in lines)
        # The converter is never closed here, so the closing root tag is
        # written explicitly to finish the document.
        f.write("</pages>\n")