14 files changed, 306 insertions(+), 449 deletions(-)
R textwrap/Makefile => Makefile
A go.mod
A go.sum
D ingest/__init__.py
D ingest/__main__.py
D ingest/__pycache__/__init__.cpython-38.pyc
D ingest/__pycache__/__main__.cpython-38.pyc
D ingest/__pycache__/message.cpython-38.pyc
D ingest/__pycache__/parse.cpython-38.pyc
D ingest/message.py
D ingest/parse.py
A main.go
D textwrap/main.go
D textwrap/textwrap
R textwrap/Makefile => Makefile +1 -1
@@ 1,5 1,5 @@
clean:
- rm -rf textwrap
+ rm -rf digestion
build:
go build
A go.mod => go.mod +5 -0
@@ 0,0 1,5 @@
+module git.dominic-ricottone.com/digestion
+
+go 1.15
+
+require git.dominic-ricottone.com/textwrap v0.0.2
A go.sum => go.sum +4 -0
@@ 0,0 1,4 @@
+git.dominic-ricottone.com/textwrap v0.0.1 h1:vOkmBCgYfMcm4zgkI2InIUfxx74Nvma10UFr2I8gKrQ=
+git.dominic-ricottone.com/textwrap v0.0.1/go.mod h1:oMsi+AbzUET9rjr9bDbtRUhD+bUPNjWMCIEhsLK80ZA=
+git.dominic-ricottone.com/textwrap v0.0.2 h1:BoCxezIxM2i/Mvs8gREB4XSLmQh+5DHmiEW/jiwdUbA=
+git.dominic-ricottone.com/textwrap v0.0.2/go.mod h1:oMsi+AbzUET9rjr9bDbtRUhD+bUPNjWMCIEhsLK80ZA=
D ingest/__init__.py => ingest/__init__.py +0 -0
D ingest/__main__.py => ingest/__main__.py +0 -11
@@ 1,11 0,0 @@
-#!/usr/bin/env python3
-
-from . import parse
-
-for msg in parse.split_messages(parse.read_input())[1:]:
- try:
- parse.parse_message(msg)
- except:
- raise
-
-
D ingest/__pycache__/__init__.cpython-38.pyc => ingest/__pycache__/__init__.cpython-38.pyc +0 -0
D ingest/__pycache__/__main__.cpython-38.pyc => ingest/__pycache__/__main__.cpython-38.pyc +0 -0
D ingest/__pycache__/message.cpython-38.pyc => ingest/__pycache__/message.cpython-38.pyc +0 -0
D ingest/__pycache__/parse.cpython-38.pyc => ingest/__pycache__/parse.cpython-38.pyc +0 -0
D ingest/message.py => ingest/message.py +0 -201
@@ 1,201 0,0 @@
-#!/usr/bin/env python3
-
-"""The message object and API."""
-
-from typing import Optional
-
-class Message(object):
- """Container for a message and metadata."""
- def __init__(
- self,
- *,
- hdr_subject: Optional[str] = None,
- hdr_date: Optional[str] = None,
- hdr_from: Optional[str] = None,
- hdr_to: Optional[str] = None,
- hdr_cc: Optional[str] = None,
- hdr_message_id: Optional[str] = None,
- content_type: Optional[str] = None,
- content: Optional[str] = None,
- ) -> None:
- self._subject = hdr_subject
- self._date = hdr_date
- self._from = hdr_from
- self._to = hdr_to
- self._cc = hdr_cc
- self._message_id = hdr_message_id
- self._content_type = content_type
- self._content = content
- self._last_hdr = None
-
- def __str__(self) -> str:
- return (
- f"Subject: {self._subject}\n"
- f"Date: {self._date}\n"
- f"To: {self._to}\n"
- f"From: {self._from}\n"
- f"Cc: {self._cc}\n"
- f"Message-ID: {self._message_id}\n"
- f"Content-Type: {self._content_type}\n"
- )
-
- @property
- def hdr_subject(self) -> str:
- if self._subject is not None:
- return self._subject
- else:
- raise ValueError("no header `subject' set") from None
- @hdr_subject.setter
- def hdr_subject(self, value: str):
- if self._subject is None:
- self._subject = value
- self._last_hdr = "_subject"
- else:
- raise ValueError("header `subject' already set") from None
-
- @property
- def hdr_date(self) -> str:
- if self._date is not None:
- return self._date
- else:
- raise ValueError("no header `date' set") from None
- @hdr_date.setter
- def hdr_date(self, value: str):
- if self._date is None:
- self._date = value
- self._last_hdr = "_date"
- else:
- raise ValueError("header `date' already set") from None
-
- @property
- def hdr_from(self) -> str:
- if self._from is not None:
- return self._from
- else:
- raise ValueError("no header `from' set") from None
- @hdr_from.setter
- def hdr_from(self, value: str):
- if self._from is None:
- self._from = value
- self._last_hdr = "_from"
- else:
- raise ValueError("header `from' already set") from None
-
- @property
- def hdr_to(self) -> str:
- if self._to is not None:
- return self._to
- else:
- raise ValueError("no header `to' set") from None
- @hdr_to.setter
- def hdr_to(self, value: str):
- if self._to is None:
- self._to = value
- self._last_hdr = "_to"
- else:
- raise ValueError("header `to' already set") from None
-
- @property
- def hdr_cc(self) -> str:
- if self._cc is not None:
- return self._cc
- else:
- raise ValueError("no header `cc' set") from None
- @hdr_cc.setter
- def hdr_cc(self, value: str):
- if self._cc is None:
- self._cc = value
- self._last_hdr = "_cc"
- else:
- raise ValueError("header `cc' already set") from None
-
- @property
- def hdr_message_id(self) -> str:
- if self._message_id is not None:
- return self._message_id
- else:
- raise ValueError("no header `message_id' set") from None
- @hdr_message_id.setter
- def hdr_message_id(self, value: str):
- if self._message_id is None:
- self._message_id = value
- self._last_hdr = "_message_id"
- else:
- raise ValueError("header `message_id' already set") from None
-
- @property
- def content_type(self) -> str:
- if self._content_type is not None:
- return self._content_type
- else:
- raise ValueError("no `content_type' set") from None
- @content_type.setter
- def content_type(self, value: str):
- if self._content_type is None:
- self._content_type = value
- self._last_hdr = "_content_type"
- else:
- raise ValueError("`content_type' already set") from None
-
- def append_last(self, value: str):
- if self._last_hdr is not None:
- old = getattr(self, self._last_hdr)
- try:
- new = old + value
- except:
- # test for bad encoding
- raise
- setattr(self, self._last_hdr, new)
- else:
- raise ValueError("no header set") from None
-
- def into_multipart(self):
- return MultipartMessage(
- hdr_subject=self._subject,
- hdr_date=self._date,
- hdr_from=self._from,
- hdr_to=self._to,
- hdr_cc=self._cc,
- hdr_message_id=self._message_id,
- content_type=self._content_type,
- content=self._content,
- )
-
-class MultipartMessage(Message):
- """Container for a multi-part message and metadata."""
- def __init__(
- self,
- *,
- hdr_subject: Optional[str] = None,
- hdr_date: Optional[str] = None,
- hdr_from: Optional[str] = None,
- hdr_to: Optional[str] = None,
- hdr_cc: Optional[str] = None,
- hdr_message_id: Optional[str] = None,
- content_type: Optional[str] = None,
- content: Optional[str] = None,
- ) -> None:
- self._subject = hdr_subject
- self._date = hdr_date
- self._from = hdr_from
- self._to = hdr_to
- self._cc = hdr_cc
- self._message_id = hdr_message_id
- self._content_type = content_type
- self._content = content
- self._last_hdr = None
-
- self._parts = list()
-
- def __str__(self) -> str:
- return (
- f"Subject: {self._subject}\n"
- f"Date: {self._date}\n"
- f"To: {self._to}\n"
- f"From: {self._from}\n"
- f"Cc: {self._cc}\n"
- f"Message-ID: {self._message_id}\n"
- f"Content-Type: {self._content_type}\n"
- f"Parts: {len(self._parts)}\n"
- )
-
D ingest/parse.py => ingest/parse.py +0 -155
@@ 1,155 0,0 @@
-#!/usr/bin/env python3
-
-import sys
-import re
-from typing import List
-
-from . import message
-
-RE_MESSAGE_BREAK = re.compile(r"^-* *$")
-RE_HEADER_LINE = re.compile(r"^(?:Date|From|Subject|To|Cc|Message-ID|Content-Type):")
-RE_BLANK_LINE = re.compile(r"^ *$")
-RE_SUBJECT_LINE = re.compile(r"^Subject: *(.*) *$")
-RE_DATE_LINE = re.compile(r"^Date: *(.*) *$")
-RE_FROM_LINE = re.compile(r"^From: *(.*) *$")
-RE_TO_LINE = re.compile(r"^To: *(.*) *$")
-RE_CC_LINE = re.compile(r"^Cc: *(.*) *$")
-RE_ID_LINE = re.compile(r"^Message-ID: *(.*) *$")
-RE_CONTENT_LINE = re.compile(r"^Content-Type: *(.*) *$")
-RE_RUNON = re.compile(r"^[ \t]+(.*) *$")
-RE_BOUNDARY = re.compile(r'.*boundary="(.*)".*')
-
-def split_messages(blob: List[bytes]) -> List[List[bytes]]:
- """Split a blob into messages."""
- message_breaks = list()
- message_start = 0
- messages = list()
-
- # Find probable message breaks
- for index, bytes_line in enumerate(blob):
- try:
- line = str(bytes_line)
- except:
- # test for bad encodings
- raise
- if RE_MESSAGE_BREAK.match(line):
- message_breaks.append(index)
-
- # Validate message breaks and copy text into split messages
- # NOTE: message breaks are validated by checking for...
- # 1) A blank line following the message break
- # 2) A header line following the blank line
- for index in message_breaks:
- try:
- line1 = str(blob[index+1])
- line2 = str(blob[index+2])
- except:
- # test for bad encodings
- raise
-
- # If fails validation, skip to next probable break
- if not RE_BLANK_LINE.match(line1):
- continue
- elif not RE_HEADER_LINE.match(line2):
- continue
-
- # Message spans from known start to line before break
- messages.append(blob[message_start:index - 1])
-
- # Next message starts on first header line
- message_start = index + 2
-
- # Handle remainder
- messages.append(blob[message_start:])
-
- return messages
-
-def split_message_parts(
- blob: List[bytes],
- boundary: str,
-) -> List[List[bytes]]:
- """Split a blob into message parts."""
- part_breaks = list()
- parts = list()
- part_start = 0
-
- # NOTE: can use `in' operator with bytes and strings
- for index, line in enumerate(blob):
- if boundary in line:
- part_breaks.append(index)
-
- for index in part_breaks:
- parts.append(blob[part_start:index - 1])
- part_start = index + 1
-
- parts.append(blob[part_start:])
-
- return parts
-
-
-
-
-
-def parse_message(blob: List[bytes]):
- """Parse a message blob for metadata and parts."""
- msg = message.Message()
- header_end = 0
-
- # Parse the header
- for bytes_line in blob:
- header_end += 1
- try:
- line = str(bytes_line)
- except:
- # test for bad encodings
- raise
-
- if RE_BLANK_LINE.match(line):
- break
- elif match := RE_RUNON.match(line):
- msg.append_last(line)
-
- elif match := RE_SUBJECT_LINE.match(line):
- msg.hdr_subject = match.group(1)
- elif match := RE_DATE_LINE.match(line):
- msg.hdr_date = match.group(1)
- elif match := RE_FROM_LINE.match(line):
- msg.hdr_from = match.group(1)
- elif match := RE_TO_LINE.match(line):
- msg.hdr_to = match.group(1)
- elif match := RE_CC_LINE.match(line):
- msg.hdr_cc = match.group(1)
- elif match := RE_ID_LINE.match(line):
- msg.hdr_message_id = match.group(1)
- elif match := RE_CONTENT_LINE.match(line):
- msg.content_type = match.group(1)
-
- # store content
- msg._content = blob[header_end + 1:]
-
- # parse for parts
- try:
- if "multipart/" in msg.content_type:
- msg = msg.into_multipart()
- except:
- pass
- if isinstance(msg,message.MultipartMessage):
- if match := RE_BOUNDARY.match(msg.content_type):
- boundary = "".join(match.group(1).split())
- else:
- raise ValueError("no boundary for multipart content") from None
- msg._parts = split_message_parts(msg._content, boundary)
-
- print(msg)
-
-def read_input() -> List[bytes]:
- """Read STDIN into a blob."""
- textblob = list()
- for line in sys.stdin:
- try:
- textblob.append(line.rstrip())
- except:
- # test for bad encoding
- raise
- return textblob
-
A main.go => main.go +296 -0
@@ 0,0 1,296 @@
+package main
+
+import (
+ "fmt"
+ "os"
+ "strings"
+ "io"
+ "bufio"
+ "regexp"
+
+ "git.dominic-ricottone.com/textwrap/common"
+)
+
+// An enumeration of header parts.
+//
+// Each value is a string tag naming one header field. The tags are stored in
+// the LastSet field of MessageHeader and MessagePartHeader by SetHeader and
+// SetPartHeader, and read back by AppendLastHeader and AppendLastPartHeader
+// to decide which field a continuation line belongs to.
+const(
+ HeaderSubject = "HeaderSubject"
+ HeaderDate = "HeaderDate"
+ HeaderFrom = "HeaderFrom"
+ HeaderTo = "HeaderTo"
+ HeaderCc = "HeaderCc"
+ HeaderMessageID = "HeaderMessageID"
+ HeaderContentType = "HeaderContentType"
+ HeaderContentEncoding = "HeaderContentEncoding"
+)
+
+// MessageHeader is a message header container, used within message
+// containers. Each field holds the text that followed the corresponding
+// header name on the input line (see Message.SetHeader); a header that was
+// never seen leaves its field as the empty string.
+type MessageHeader struct {
+ Subject string
+ Date string
+ From string
+ To string
+ Cc string
+ MessageID string
+ ContentType string
+ // LastSet is the Header* tag of the field assigned most recently, or ""
+ // if none has been assigned yet.
+ LastSet string
+}
+
+// NewHeader allocates a MessageHeader whose fields are all empty strings.
+func NewHeader() *MessageHeader {
+	// The zero value of the struct is exactly "every field empty".
+	return new(MessageHeader)
+}
+
+// MessagePartHeader is a message part header container, used within message
+// part containers. It keeps the two per-part headers that
+// Message.SetPartHeader recognises.
+type MessagePartHeader struct {
+ ContentType string
+ ContentEncoding string
+ // LastSet is the Header* tag of the field assigned most recently, or ""
+ // if none has been assigned yet.
+ LastSet string
+}
+
+// NewPartHeader allocates a MessagePartHeader whose fields are all empty
+// strings.
+func NewPartHeader() *MessagePartHeader {
+	// The zero value of the struct is exactly "every field empty".
+	return new(MessagePartHeader)
+}
+
+// MessagePart is a message part container, used within message containers.
+type MessagePart struct {
+ // Header holds the part-level headers set by Message.SetPartHeader.
+ Header *MessagePartHeader
+ // Content holds the body lines added by Message.AppendContent, in order.
+ Content []string
+}
+
+// NewPart builds a MessagePart with a fresh, empty header. Its Content
+// starts out holding a single empty string rather than being an empty slice.
+func NewPart() *MessagePart {
+	part := &MessagePart{Header: NewPartHeader()}
+	part.Content = []string{""}
+	return part
+}
+
+// Message is a message container: one set of top-level headers plus the
+// parts that make up its body.
+type Message struct {
+ // Header holds the top-level headers set by SetHeader.
+ Header *MessageHeader
+ // Parts holds the body parts; the setters and AppendContent always act on
+ // the last element.
+ Parts []*MessagePart
+ // PartBoundary is the pattern built by FindBoundary, or nil when no
+ // boundary has been found; MatchBoundary treats nil as "never matches".
+ PartBoundary *regexp.Regexp
+}
+
+// NewMessage builds a Message with an empty header, exactly one empty part
+// (so the part-level setters always have a part to act on), and no part
+// boundary.
+func NewMessage() *Message {
+	msg := &Message{Header: NewHeader()}
+	msg.Parts = []*MessagePart{NewPart()}
+	return msg
+}
+
+// Message setters
+
+// SetHeader stores the value of a single "Name: value" header line in the
+// matching field of m.Header and records that field in m.Header.LastSet so a
+// following continuation line can be appended to it.
+//
+// The recognised names are Subject, Date, From, To, Cc, Message-ID and
+// Content-Type (matched case-sensitively at the start of line). A line that
+// starts with none of them is ignored and LastSet is left unchanged.
+//
+// The stored value has surrounding whitespace removed, so "Subject: hi"
+// yields "hi" rather than " hi".
+func (m *Message) SetHeader(line string) {
+	// One row per recognised header: its prefix, where the value goes, and
+	// the tag remembered for continuation lines. Slicing by len(prefix)
+	// avoids hand-maintained offsets.
+	fields := []struct {
+		prefix string
+		target *string
+		tag    string
+	}{
+		{"Subject:", &m.Header.Subject, HeaderSubject},
+		{"Date:", &m.Header.Date, HeaderDate},
+		{"From:", &m.Header.From, HeaderFrom},
+		{"To:", &m.Header.To, HeaderTo},
+		{"Cc:", &m.Header.Cc, HeaderCc},
+		{"Message-ID:", &m.Header.MessageID, HeaderMessageID},
+		{"Content-Type:", &m.Header.ContentType, HeaderContentType},
+	}
+
+	for _, f := range fields {
+		if strings.HasPrefix(line, f.prefix) {
+			*f.target = strings.TrimSpace(line[len(f.prefix):])
+			m.Header.LastSet = f.tag
+			return
+		}
+	}
+}
+
+func (m *Message) AppendLastHeader(s string) {
+ switch m.Header.LastSet {
+ case HeaderSubject:
+ m.Header.Subject += " " + s
+ case HeaderDate:
+ m.Header.Date += " " + s
+ case HeaderFrom:
+ m.Header.From += " " + s
+ case HeaderTo:
+ m.Header.To += " " + s
+ case HeaderCc:
+ m.Header.Cc += " " + s
+ case HeaderMessageID:
+ m.Header.MessageID += " " + s
+ case HeaderContentType:
+ m.Header.ContentType += " " + s
+ }
+}
+
+func (m *Message) SetPartHeader(line string) {
+ if strings.HasPrefix(line, "Content-Type:") {
+ m.Parts[len(m.Parts)-1].Header.ContentType = line[13:]
+ m.Parts[len(m.Parts)-1].Header.LastSet = HeaderContentType
+ } else if strings.HasPrefix(line, "Content-Transfer-Encoding:") {
+ m.Parts[len(m.Parts)-1].Header.ContentEncoding = line[26:]
+ m.Parts[len(m.Parts)-1].Header.LastSet = HeaderContentEncoding
+ }
+}
+
+// AppendLastPartHeader appends s, preceded by a single space, to whichever
+// header field of the last part was assigned most recently. It does nothing
+// when that part's LastSet names neither Content-Type nor
+// Content-Transfer-Encoding.
+func (m *Message) AppendLastPartHeader(s string) {
+	hdr := m.Parts[len(m.Parts)-1].Header
+	extra := " " + s
+
+	if hdr.LastSet == HeaderContentType {
+		hdr.ContentType += extra
+	} else if hdr.LastSet == HeaderContentEncoding {
+		hdr.ContentEncoding += extra
+	}
+}
+
+// AppendPart starts a new part: a fresh MessagePart is added to the end of
+// m.Parts and becomes the target of the part-level setters and of
+// AppendContent.
+func (m *Message) AppendPart() {
+	fresh := NewPart()
+	m.Parts = append(m.Parts, fresh)
+}
+
+// AppendContent adds s as one more content line of the last part of m.
+func (m *Message) AppendContent(s string) {
+	current := m.Parts[len(m.Parts)-1]
+	current.Content = append(current.Content, s)
+}
+
+// FindBoundary extracts the multipart boundary from m.Header.ContentType and
+// stores a pattern matching it in m.PartBoundary.
+//
+// re must capture the boundary text in its first group. When re does not
+// match, has no capture group, or captures nothing but spaces,
+// m.PartBoundary is left unchanged.
+func (m *Message) FindBoundary(re *regexp.Regexp) {
+	match := re.FindStringSubmatch(m.Header.ContentType)
+	if len(match) < 2 {
+		return
+	}
+
+	// Spaces are dropped because joining a continuation line onto the
+	// header inserts one (see AppendLastHeader).
+	boundary := strings.ReplaceAll(match[1], " ", "")
+	if boundary == "" {
+		// An empty pattern would match every line and turn each content
+		// line into a part break.
+		return
+	}
+
+	// The boundary is literal text, not a pattern: boundaries may contain
+	// regexp metacharacters such as '+', '(', ')', '.' and '?'. Quoting makes
+	// the match exact and guarantees the pattern compiles, so MustCompile
+	// cannot panic here. MatchString is unanchored, so no ".*" padding is
+	// needed to find the boundary anywhere in a line.
+	m.PartBoundary = regexp.MustCompile(regexp.QuoteMeta(boundary))
+}
+
+// Message logic
+
+// MatchBoundary reports whether line matches the part boundary pattern of m.
+// It always reports false while no boundary pattern is set.
+func (m *Message) MatchBoundary(line string) bool {
+	boundary := m.PartBoundary
+	return boundary != nil && boundary.MatchString(line)
+}
+
+// Dump prints m to standard output: first the top-level headers, one per
+// line, then for every part its index, its two part headers, its content
+// passed through common.WrapArray with a width argument of 80, and a closing
+// "EOF" line. The error returned by WrapArray is discarded.
+func (m *Message) Dump() {
+	hdr := m.Header
+	fmt.Printf("Subject: %s\n", hdr.Subject)
+	fmt.Printf("Date: %s\n", hdr.Date)
+	fmt.Printf("From: %s\n", hdr.From)
+	fmt.Printf("To: %s\n", hdr.To)
+	fmt.Printf("Cc: %s\n", hdr.Cc)
+	fmt.Printf("MessageID: %s\n", hdr.MessageID)
+	fmt.Printf("ContentType: %s\n", hdr.ContentType)
+
+	for idx, part := range m.Parts {
+		fmt.Printf("Part %d:\n", idx)
+		fmt.Printf("ContentType: %s\n", part.Header.ContentType)
+		fmt.Printf("ContentEncoding: %s\n", part.Header.ContentEncoding)
+
+		wrapped, _ := common.WrapArray(part.Content, 80)
+		for _, text := range wrapped {
+			fmt.Printf("%s\n", text)
+		}
+		fmt.Println("EOF")
+	}
+}
+
+// Parser statuses.
+//
+// parse_stream is always in exactly one of these states:
+//   - ParsingPreHeader: skipping lines until one starts with a known header
+//     name.
+//   - ParsingHeader: reading the top-level headers of a message, up to the
+//     first blank line.
+//   - ParsingPartHeader: reading the headers that follow a part boundary, up
+//     to the first blank line.
+//   - ParsingContent: collecting body lines until a part boundary or a
+//     message break is seen.
+const (
+ ParsingPreHeader = "ParsingPreHeader"
+ ParsingHeader = "ParsingHeader"
+ ParsingPartHeader = "ParsingPartHeader"
+ ParsingContent = "ParsingContent"
+)
+
+// parse_stream reads reader line by line, splits it into messages and dumps
+// each message to standard output as soon as it is complete.
+//
+// A message starts at the first line beginning with a known header name,
+// its headers end at the first blank line, and its content runs until a line
+// consisting only of dashes. Within the content, a line matching the
+// message's multipart boundary starts a new part with its own headers.
+//
+// A message still in progress when the input ends is dumped too, so input
+// that does not finish with a dash line does not lose its last message.
+//
+// On a scanner error the error is printed and the process exits with
+// status 1.
+func parse_stream(reader io.Reader) {
+	// Create scanner from reader
+	input := bufio.NewScanner(reader)
+
+	// Compile regular expressions. The patterns are constants, so a failure
+	// to compile would be a programming error, which MustCompile reports by
+	// panicking.
+	re_message_break := regexp.MustCompile("^-+$")
+	re_header := regexp.MustCompile(
+		"^(?:Date|From|Subject|To|Cc|Message-ID|" +
+			"Content-(?:Type|Transfer-Encoding)):",
+	)
+	re_multipart := regexp.MustCompile(".*boundary=\"(.*)\".*")
+
+	parsing := ParsingPreHeader
+	message := NewMessage()
+
+	for input.Scan() {
+		line := input.Text()
+		tline := strings.TrimSpace(line)
+
+		// A header line that begins with whitespace continues the previous
+		// header. Folded headers may be indented with a space as well as
+		// with a tab.
+		continuation := strings.HasPrefix(line, " ") ||
+			strings.HasPrefix(line, "\t")
+
+		switch parsing {
+		case ParsingPreHeader:
+			if re_header.MatchString(tline) {
+				parsing = ParsingHeader
+				message.SetHeader(tline)
+			}
+
+		case ParsingHeader:
+			if tline == "" {
+				// End of headers: Content-Type is complete now, so the
+				// boundary (if any) can be extracted.
+				parsing = ParsingContent
+				message.FindBoundary(re_multipart)
+			} else if continuation {
+				message.AppendLastHeader(tline)
+			} else {
+				message.SetHeader(tline)
+			}
+
+		case ParsingPartHeader:
+			if tline == "" {
+				parsing = ParsingContent
+			} else if continuation {
+				message.AppendLastPartHeader(tline)
+			} else {
+				message.SetPartHeader(tline)
+			}
+
+		case ParsingContent:
+			if re_message_break.MatchString(tline) {
+				parsing = ParsingPreHeader
+				message.Dump()
+				message = NewMessage()
+			} else if message.MatchBoundary(tline) {
+				parsing = ParsingPartHeader
+				message.AppendPart()
+			} else {
+				message.AppendContent(tline)
+			}
+		}
+	}
+
+	// Check for scanner errors
+	if err := input.Err(); err != nil {
+		fmt.Printf("internal error - %v\n", err)
+		os.Exit(1)
+	}
+
+	// Flush the message that was being read when the input ended. In the
+	// pre-header state nothing has been collected, so there is nothing to
+	// print.
+	if parsing != ParsingPreHeader {
+		message.Dump()
+	}
+}
+
+// parse_file opens the named file and runs parse_stream over its contents.
+// If the file cannot be opened, a message naming it is printed and the
+// process exits with status 1.
+func parse_file(filename string) {
+	handle, openErr := os.Open(filename)
+	if openErr != nil {
+		fmt.Printf("cannot read file '%s'\n", filename)
+		os.Exit(1)
+	}
+	defer handle.Close()
+
+	parse_stream(handle)
+}
+
+// main parses standard input as a stream of messages. If standard input
+// cannot be stat'ed, a message is printed and the process exits with
+// status 1.
+func main() {
+	if _, err := os.Stdin.Stat(); err != nil {
+		fmt.Println("cannot read input")
+		os.Exit(1)
+	}
+
+	parse_stream(os.Stdin)
+}
+
D textwrap/main.go => textwrap/main.go +0 -81
@@ 1,81 0,0 @@
-package main
-
-import (
- "fmt"
- "os"
- "strings"
- "bufio"
- "regexp"
- "flag"
-)
-
-const LENGTH = 40
-
-func print_break(length int) {
- fmt.Printf("%s\n", strings.Repeat("-", length))
-}
-
-func print_wrapped(line string, length int, quote string) {
- len_quote := len(quote)
- buffer := quote
- for index, rune := range line[len_quote:] {
- buffer += string(rune)
- if (index + 1) % (length - len_quote) == 0 {
- fmt.Printf("%s\n", buffer)
- buffer = quote
- }
- }
- if buffer != "" {
- fmt.Printf("%s\n", buffer)
- }
-}
-
-func main() {
- // Open STDIN as scanner
- _, err := os.Stdin.Stat()
- if err != nil {
- fmt.Printf("%s\n", "cannot read input")
- os.Exit(1)
- }
- input := bufio.NewScanner(os.Stdin)
-
- // Look for arguments
- var width = flag.Int("width", 80, "target width for output")
- flag.Parse()
-
- // Compile regular expressions
- re_quote, err := regexp.Compile("^([> ]*)")
- if err != nil {
- fmt.Printf("internal error - %v\n", err)
- os.Exit(1)
- }
- re_break, err := regexp.Compile("^(?:-{5,}|={5,})$")
- if err != nil {
- fmt.Printf("internal error - %v\n", err)
- os.Exit(1)
- }
-
- // Scan line by line
- for input.Scan() {
- line := input.Text()
- line = strings.TrimSpace(line)
-
- if len(line) > *width {
- if re_break.MatchString(line) {
- print_break(*width)
- } else {
- quote := re_quote.FindString(line)
- print_wrapped(line, *width, quote)
- }
- } else {
- fmt.Printf("%s\n", line)
- }
- }
-
- // Check for scanner errors
- if err = input.Err(); err != nil {
- fmt.Printf("internal error - %v\n", err)
- os.Exit(1)
- }
-}
-
D textwrap/textwrap => textwrap/textwrap +0 -0