~dricottone/fmg-timesheets

ref: f441822f16af2386abe50dc1d4b8161566843f3e fmg-timesheets/parser/pdf.py -rw-r--r-- 1.0 KiB
f441822fDominic Ricottone Significant updates 2 years ago
                                                                                
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
#!/usr/bin/env python3

from io import StringIO

# Crash Course on pdfminer
#
# Extract text from PDFs like...
#
# ```
# from pdfminer.high_level import extract_text
# with open(filename, "rb") as f:
#   text = extract_text(f)
# ```
#
# The alternative is to use something like...
#
# ```
# from io import StringIO
# from pdfminer.high_level import extract_text_to_fp
# from pdfminer.layout import LAParams
# buffer = StringIO()
# with open(filename, "rb") as f:
#   extract_text_to_fp(f, buffer, laparams=LAParams(), output_type="html", codec=None)
# html = buffer.getvalue()
# ```

from pdfminer.high_level import extract_text_to_fp
from pdfminer.layout import LAParams

def parse(filename):
    """Convert the PDF file at `filename` into a string of HTML.

    The file is opened in binary mode and handed to pdfminer's
    `extract_text_to_fp`, which writes HTML output into an in-memory text
    buffer. The buffer's contents are returned; the buffer itself is closed
    before returning, whether or not the conversion succeeded.
    """
    # The StringIO context manager closes the in-memory buffer on exit,
    # including when opening the file or the extraction raises.
    with StringIO() as html_sink:
        with open(filename, "rb") as pdf_stream:
            extract_text_to_fp(
                pdf_stream,
                html_sink,
                laparams=LAParams(),
                output_type="html",
                codec=None,
            )
        # Read the accumulated text out before the buffer is closed.
        return html_sink.getvalue()