~dricottone/digestion

ref: de6aac89edaeb2ceda7412983ac035e0f9912a25 digestion/ingest/parse.py -rw-r--r-- 4.4 KiB
de6aac89Dominic Ricottone Initial commit 4 years ago
                                                                                
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
#!/usr/bin/env python3

import sys
import re
from typing import List

from . import message

# --- Structural lines -------------------------------------------------------
# Zero or more hyphens followed by zero or more spaces; note that this also
# matches an empty line.
RE_MESSAGE_BREAK = re.compile(r"^-* *$")
# Line that starts with one of the recognised header names and a colon.
RE_HEADER_LINE = re.compile(
    r"^(?:Date|From|Subject|To|Cc|Message-ID|Content-Type):"
)
# Empty line, or a line holding nothing but spaces.
RE_BLANK_LINE = re.compile(r"^ *$")

# --- Individual header fields -----------------------------------------------
# Each pattern captures everything after the field name and its leading
# spaces in group 1 (the capture is greedy, so trailing spaces stay in it).
RE_SUBJECT_LINE = re.compile(r"^Subject: *(.*) *$")
RE_DATE_LINE = re.compile(r"^Date: *(.*) *$")
RE_FROM_LINE = re.compile(r"^From: *(.*) *$")
RE_TO_LINE = re.compile(r"^To: *(.*) *$")
RE_CC_LINE = re.compile(r"^Cc: *(.*) *$")
RE_ID_LINE = re.compile(r"^Message-ID: *(.*) *$")
RE_CONTENT_LINE = re.compile(r"^Content-Type: *(.*) *$")

# --- Continuations and multipart --------------------------------------------
# Line that begins with spaces/tabs; group 1 is the text after that indent.
RE_RUNON = re.compile(r"^[ \t]+(.*) *$")
# Double-quoted boundary parameter anywhere in a value; group 1 is the text
# between the quotes.
RE_BOUNDARY = re.compile(r'.*boundary="(.*)".*')

def split_messages(blob: List[bytes]) -> List[List[bytes]]:
    """Split a blob of lines into one list of lines per message.

    A line is a *probable* message break when it matches
    ``RE_MESSAGE_BREAK``. A probable break is accepted only when

    1. the line after it matches ``RE_BLANK_LINE``, and
    2. the line after that matches ``RE_HEADER_LINE``.

    Every line is passed through ``str()`` before it is matched.

    Args:
        blob: The lines to split.

    Returns:
        The messages in input order. Each message is a slice of ``blob``;
        the final element is whatever follows the last accepted break (the
        whole blob when no break is accepted).
    """
    messages = []
    message_start = 0
    line_count = len(blob)

    for index, raw_line in enumerate(blob):
        if not RE_MESSAGE_BREAK.match(str(raw_line)):
            continue

        # A break needs two lines after it to be validated. Candidates in
        # the last two lines (e.g. a trailing blank line) can never qualify,
        # and neither can anything after them, so stop looking instead of
        # indexing past the end of the blob.
        if index + 2 >= line_count:
            break

        # If validation fails, move on to the next probable break.
        if not RE_BLANK_LINE.match(str(blob[index + 1])):
            continue
        if not RE_HEADER_LINE.match(str(blob[index + 2])):
            continue

        # The message runs from the known start up to, but not including,
        # the line directly before the break.
        # NOTE(review): that dropped line is presumably the blank line that
        # precedes the separator in digest layouts -- confirm.
        # The end is clamped so that a break on the very first line yields an
        # empty message rather than a negative index that wraps around.
        message_end = max(index - 1, message_start)
        messages.append(blob[message_start:message_end])

        # The next message starts on the first header line.
        message_start = index + 2

    # Whatever is left after the last accepted break is the final message.
    messages.append(blob[message_start:])

    return messages

def split_message_parts(
    blob: List[bytes],
    boundary: str,
) -> List[List[bytes]]:
    """Split a blob of lines into message parts at every boundary line.

    A line is a boundary line when it contains ``boundary`` as a substring.
    Boundary lines themselves are not included in any part.

    Args:
        blob: The content lines to split. Lines may be ``str`` or ``bytes``.
        boundary: The boundary text to search for in each line.

    Returns:
        The parts in input order. The first element holds the lines before
        the first boundary line and the last element holds the lines after
        the final one, so the result always has one more element than there
        are boundary lines (either end may be an empty list).
    """
    # `in` cannot mix str and bytes (it raises TypeError), so keep a bytes
    # form of the boundary around for bytes lines.
    boundary_bytes = boundary.encode()

    parts = []
    part_start = 0

    for index, line in enumerate(blob):
        if isinstance(line, (bytes, bytearray)):
            needle = boundary_bytes
        else:
            needle = boundary
        if needle not in line:
            continue

        # Slice ends are exclusive: stopping at `index` keeps the line just
        # before the boundary and cannot go negative when the boundary is on
        # the very first line.
        parts.append(blob[part_start:index])
        part_start = index + 1

    # Whatever follows the last boundary line is the final part.
    parts.append(blob[part_start:])

    return parts





def parse_message(blob: List[bytes]):
    """Parse a message blob for metadata and parts, then print the result.

    The header runs from the first line up to the first line that matches
    ``RE_BLANK_LINE``. Recognised header fields are stored on a new
    ``message.Message``; lines that start with spaces or tabs are handed to
    its ``append_last`` method. Everything after the blank line is stored as
    the message content. When the content type contains ``"multipart/"`` the
    message is converted with ``into_multipart()`` and its content is split
    on the quoted ``boundary`` parameter.

    Args:
        blob: The lines of a single message. Each line is passed through
            ``str()`` before it is matched.

    Raises:
        ValueError: If the message is multipart but its content type has no
            quoted ``boundary`` parameter.

    NOTE(review): the parsed message is only printed, not returned --
    confirm whether callers are meant to receive it.
    """
    msg = message.Message()

    # Header pattern -> attribute of `msg` that receives capture group 1.
    header_fields = (
        (RE_SUBJECT_LINE, "hdr_subject"),
        (RE_DATE_LINE, "hdr_date"),
        (RE_FROM_LINE, "hdr_from"),
        (RE_TO_LINE, "hdr_to"),
        (RE_CC_LINE, "hdr_cc"),
        (RE_ID_LINE, "hdr_message_id"),
        (RE_CONTENT_LINE, "content_type"),
    )

    # Without a blank line the whole blob is header and the content is empty.
    content_start = len(blob)

    # Parse the header
    for index, raw_line in enumerate(blob):
        line = str(raw_line)

        if RE_BLANK_LINE.match(line):
            # The content begins on the line right after the blank line.
            content_start = index + 1
            break

        if RE_RUNON.match(line):
            # Continuation line: the full line (indent included) is passed on.
            msg.append_last(line)
            continue

        for pattern, attribute in header_fields:
            if match := pattern.match(line):
                setattr(msg, attribute, match.group(1))
                break

    # store content
    msg._content = blob[content_start:]

    # parse for parts
    try:
        if "multipart/" in msg.content_type:
            msg = msg.into_multipart()
    except Exception:
        # Best effort: if the content type cannot be inspected or the
        # conversion fails, carry on with the message as it is. Unlike a
        # bare `except`, this lets KeyboardInterrupt/SystemExit through.
        pass
    if isinstance(msg, message.MultipartMessage):
        match = RE_BOUNDARY.match(msg.content_type)
        if match is None:
            raise ValueError("no boundary for multipart content") from None
        # Remove any whitespace found inside the quoted boundary value.
        boundary = "".join(match.group(1).split())
        msg._parts = split_message_parts(msg._content, boundary)

    print(msg)

def read_input() -> List[bytes]:
    """Collect every line of STDIN into a list.

    Each line has its trailing whitespace (including the line terminator)
    removed with ``rstrip()``; nothing else is altered. Any error raised
    while reading propagates to the caller.

    NOTE(review): ``sys.stdin`` is normally a text stream, so the elements
    are presumably ``str`` despite the ``List[bytes]`` annotation -- confirm
    the intended type.

    Returns:
        The stripped lines, in the order they were read.
    """
    return [raw_line.rstrip() for raw_line in sys.stdin]