~dricottone/digestion

6664dffbe0572acfb58412a48c25debc609e2f96 — Dominic Ricottone 4 years ago de6aac8
Rewrote in Go; Split textwrap into external library
14 files changed, 306 insertions(+), 449 deletions(-)

R textwrap/Makefile => Makefile
A go.mod
A go.sum
D ingest/__init__.py
D ingest/__main__.py
D ingest/__pycache__/__init__.cpython-38.pyc
D ingest/__pycache__/__main__.cpython-38.pyc
D ingest/__pycache__/message.cpython-38.pyc
D ingest/__pycache__/parse.cpython-38.pyc
D ingest/message.py
D ingest/parse.py
A main.go
D textwrap/main.go
D textwrap/textwrap
R textwrap/Makefile => Makefile +1 -1
@@ 1,5 1,5 @@
# clean: delete the `textwrap` and `digestion` paths from the working directory.
clean:
	rm -rf textwrap
	rm -rf digestion

# build: run `go build` in the current directory.
build:
	go build

A go.mod => go.mod +5 -0
@@ 0,0 1,5 @@
module git.dominic-ricottone.com/digestion

go 1.15

require git.dominic-ricottone.com/textwrap v0.0.2

A go.sum => go.sum +4 -0
@@ 0,0 1,4 @@
git.dominic-ricottone.com/textwrap v0.0.1 h1:vOkmBCgYfMcm4zgkI2InIUfxx74Nvma10UFr2I8gKrQ=
git.dominic-ricottone.com/textwrap v0.0.1/go.mod h1:oMsi+AbzUET9rjr9bDbtRUhD+bUPNjWMCIEhsLK80ZA=
git.dominic-ricottone.com/textwrap v0.0.2 h1:BoCxezIxM2i/Mvs8gREB4XSLmQh+5DHmiEW/jiwdUbA=
git.dominic-ricottone.com/textwrap v0.0.2/go.mod h1:oMsi+AbzUET9rjr9bDbtRUhD+bUPNjWMCIEhsLK80ZA=

D ingest/__init__.py => ingest/__init__.py +0 -0
D ingest/__main__.py => ingest/__main__.py +0 -11
@@ 1,11 0,0 @@
#!/usr/bin/env python3

from . import parse

# Parse every chunk returned by split_messages except the first one.
# Errors raised by parse_message propagate unchanged.
for msg in parse.split_messages(parse.read_input())[1:]:
    parse.parse_message(msg)



D ingest/__pycache__/__init__.cpython-38.pyc => ingest/__pycache__/__init__.cpython-38.pyc +0 -0
D ingest/__pycache__/__main__.cpython-38.pyc => ingest/__pycache__/__main__.cpython-38.pyc +0 -0
D ingest/__pycache__/message.cpython-38.pyc => ingest/__pycache__/message.cpython-38.pyc +0 -0
D ingest/__pycache__/parse.cpython-38.pyc => ingest/__pycache__/parse.cpython-38.pyc +0 -0
D ingest/message.py => ingest/message.py +0 -201
@@ 1,201 0,0 @@
#!/usr/bin/env python3

"""The message object and API."""

from typing import Optional

class Message(object):
    """Holds the header values and content of a single message.

    Each header is exposed as a write-once property: reading a header that
    has not been assigned raises ValueError, and so does assigning a header
    that already has a value. The attribute name of the most recently
    assigned header is remembered so that `append_last` can extend it.
    """

    def __init__(
        self,
        *,
        hdr_subject: Optional[str] = None,
        hdr_date: Optional[str] = None,
        hdr_from: Optional[str] = None,
        hdr_to: Optional[str] = None,
        hdr_cc: Optional[str] = None,
        hdr_message_id: Optional[str] = None,
        content_type: Optional[str] = None,
        content: Optional[str] = None,
    ) -> None:
        self._subject = hdr_subject
        self._date = hdr_date
        self._from = hdr_from
        self._to = hdr_to
        self._cc = hdr_cc
        self._message_id = hdr_message_id
        self._content_type = content_type
        self._content = content
        # Name of the private attribute written by the latest setter call.
        self._last_hdr = None

    def __str__(self) -> str:
        lines = (
            f"Subject: {self._subject}",
            f"Date: {self._date}",
            f"To: {self._to}",
            f"From: {self._from}",
            f"Cc: {self._cc}",
            f"Message-ID: {self._message_id}",
            f"Content-Type: {self._content_type}",
        )
        return "".join(line + "\n" for line in lines)

    def _require(self, attr: str, missing: str) -> str:
        """Return the value stored in `attr`, or raise ValueError(missing)."""
        stored = getattr(self, attr)
        if stored is None:
            raise ValueError(missing) from None
        return stored

    def _assign_once(self, attr: str, value: str, duplicate: str) -> None:
        """Store `value` in `attr` unless it is already set.

        Raises ValueError(duplicate) when `attr` already holds a value;
        otherwise records `attr` as the last header assigned.
        """
        if getattr(self, attr) is not None:
            raise ValueError(duplicate) from None
        setattr(self, attr, value)
        self._last_hdr = attr

    @property
    def hdr_subject(self) -> str:
        return self._require("_subject", "no header `subject' set")

    @hdr_subject.setter
    def hdr_subject(self, value: str):
        self._assign_once("_subject", value, "header `subject' already set")

    @property
    def hdr_date(self) -> str:
        return self._require("_date", "no header `date' set")

    @hdr_date.setter
    def hdr_date(self, value: str):
        self._assign_once("_date", value, "header `date' already set")

    @property
    def hdr_from(self) -> str:
        return self._require("_from", "no header `from' set")

    @hdr_from.setter
    def hdr_from(self, value: str):
        self._assign_once("_from", value, "header `from' already set")

    @property
    def hdr_to(self) -> str:
        return self._require("_to", "no header `to' set")

    @hdr_to.setter
    def hdr_to(self, value: str):
        self._assign_once("_to", value, "header `to' already set")

    @property
    def hdr_cc(self) -> str:
        return self._require("_cc", "no header `cc' set")

    @hdr_cc.setter
    def hdr_cc(self, value: str):
        self._assign_once("_cc", value, "header `cc' already set")

    @property
    def hdr_message_id(self) -> str:
        return self._require("_message_id", "no header `message_id' set")

    @hdr_message_id.setter
    def hdr_message_id(self, value: str):
        self._assign_once(
            "_message_id", value, "header `message_id' already set"
        )

    @property
    def content_type(self) -> str:
        return self._require("_content_type", "no `content_type' set")

    @content_type.setter
    def content_type(self, value: str):
        self._assign_once("_content_type", value, "`content_type' already set")

    def append_last(self, value: str):
        """Concatenate `value` onto the header assigned most recently.

        Raises ValueError when no header has been assigned through a setter.
        """
        if self._last_hdr is None:
            raise ValueError("no header set") from None
        extended = getattr(self, self._last_hdr) + value
        setattr(self, self._last_hdr, extended)

    def into_multipart(self):
        """Return a MultipartMessage carrying this message's values."""
        fields = dict(
            hdr_subject=self._subject,
            hdr_date=self._date,
            hdr_from=self._from,
            hdr_to=self._to,
            hdr_cc=self._cc,
            hdr_message_id=self._message_id,
            content_type=self._content_type,
            content=self._content,
        )
        return MultipartMessage(**fields)

class MultipartMessage(Message):
    """A Message that additionally keeps a list of parts."""

    def __init__(
        self,
        *,
        hdr_subject: Optional[str] = None,
        hdr_date: Optional[str] = None,
        hdr_from: Optional[str] = None,
        hdr_to: Optional[str] = None,
        hdr_cc: Optional[str] = None,
        hdr_message_id: Optional[str] = None,
        content_type: Optional[str] = None,
        content: Optional[str] = None,
    ) -> None:
        # The parent initialiser stores exactly these values and resets the
        # last-header marker.
        super().__init__(
            hdr_subject=hdr_subject,
            hdr_date=hdr_date,
            hdr_from=hdr_from,
            hdr_to=hdr_to,
            hdr_cc=hdr_cc,
            hdr_message_id=hdr_message_id,
            content_type=content_type,
            content=content,
        )

        # Parts start out empty.
        self._parts = list()

    def __str__(self) -> str:
        # Same header summary as Message, followed by the number of parts.
        return super().__str__() + f"Parts: {len(self._parts)}\n"


D ingest/parse.py => ingest/parse.py +0 -155
@@ 1,155 0,0 @@
#!/usr/bin/env python3

import sys
import re
from typing import List

from . import message

def _field_line(name):
    """Compile a pattern that captures the value of header field `name`."""
    return re.compile("^" + name + r": *(.*) *$")


# Structural lines: a run of dashes and/or spaces (also matches an empty
# line), any recognised header field, and a line of spaces only.
RE_MESSAGE_BREAK = re.compile(r"^-* *$")
RE_HEADER_LINE = re.compile(
    r"^(?:Date|From|Subject|To|Cc|Message-ID|Content-Type):"
)
RE_BLANK_LINE = re.compile(r"^ *$")

# One pattern per recognised header field; group 1 is the field value.
RE_SUBJECT_LINE = _field_line("Subject")
RE_DATE_LINE = _field_line("Date")
RE_FROM_LINE = _field_line("From")
RE_TO_LINE = _field_line("To")
RE_CC_LINE = _field_line("Cc")
RE_ID_LINE = _field_line("Message-ID")
RE_CONTENT_LINE = _field_line("Content-Type")

# A line that starts with spaces or tabs; group 1 is the rest of the line.
RE_RUNON = re.compile(r"^[ \t]+(.*) *$")
# The quoted boundary parameter; group 1 is the text between the quotes.
RE_BOUNDARY = re.compile(r'.*boundary="(.*)".*')

def split_messages(blob: List[bytes]) -> List[List[bytes]]:
    """Split a blob of lines into one list of lines per message.

    A line matching RE_MESSAGE_BREAK is accepted as a message break only
    when it is followed by a blank line and then a header line. Each
    accepted break closes the current message and starts the next one at
    that header line. Whatever remains after the last accepted break is
    returned as the final element, so the result always has at least one
    element.
    """
    messages = list()
    message_start = 0

    # Find probable message breaks
    message_breaks = [
        index
        for index, raw_line in enumerate(blob)
        if RE_MESSAGE_BREAK.match(str(raw_line))
    ]

    for index in message_breaks:
        # A break needs two following lines to be validated. Candidates in
        # the last two lines of the blob cannot be breaks; skipping them
        # avoids indexing past the end of the blob.
        if index + 2 >= len(blob):
            continue
        if not RE_BLANK_LINE.match(str(blob[index + 1])):
            continue
        if not RE_HEADER_LINE.match(str(blob[index + 2])):
            continue

        # The closed message ends two lines above the header line.
        # NOTE(review): this also drops the line directly before the break;
        # kept as in the original. The end is clamped so that a break on the
        # very first line yields an empty chunk rather than a negative slice
        # end that would copy almost the whole blob.
        message_end = max(message_start, index - 1)
        messages.append(blob[message_start:message_end])

        # Next message starts on first header line
        message_start = index + 2

    # Handle remainder
    messages.append(blob[message_start:])

    return messages

def split_message_parts(
    blob: List[bytes],
    boundary: str,
) -> List[List[bytes]]:
    """Split a blob of lines into parts at every line containing `boundary`.

    Each line that contains `boundary` ends the current part and starts the
    next part on the following line. The lines after the last boundary are
    returned as the final part, so the result has one more element than
    there are boundary lines.
    """
    parts = list()
    part_start = 0

    for index, line in enumerate(blob):
        if boundary not in line:
            continue
        # NOTE(review): the slice also drops the line directly before the
        # boundary line; kept as in the original. The end is clamped so that
        # a boundary on the first line yields an empty part rather than a
        # negative slice end that would copy almost the whole blob.
        part_end = max(part_start, index - 1)
        parts.append(blob[part_start:part_end])
        part_start = index + 1

    parts.append(blob[part_start:])

    return parts





def parse_message(blob: List[bytes]):
    """Parse one message's lines into a Message and print it.

    Header lines are consumed up to the first blank line. Lines that start
    with spaces or tabs are appended to the header assigned most recently.
    Everything after the blank line is stored as the content. When the
    content type contains "multipart/", the message is converted to a
    MultipartMessage and its content is split into parts on the boundary
    named in the content type.

    Raises ValueError when a header is repeated, when a run-on line appears
    before any header, or when a multipart content type has no quoted
    boundary parameter.
    """
    msg = message.Message()
    header_end = 0

    # Parse the header
    for bytes_line in blob:
        header_end += 1
        line = str(bytes_line)

        if RE_BLANK_LINE.match(line):
            break
        elif RE_RUNON.match(line):
            msg.append_last(line)

        elif match := RE_SUBJECT_LINE.match(line):
            msg.hdr_subject = match.group(1)
        elif match := RE_DATE_LINE.match(line):
            msg.hdr_date = match.group(1)
        elif match := RE_FROM_LINE.match(line):
            msg.hdr_from = match.group(1)
        elif match := RE_TO_LINE.match(line):
            msg.hdr_to = match.group(1)
        elif match := RE_CC_LINE.match(line):
            msg.hdr_cc = match.group(1)
        elif match := RE_ID_LINE.match(line):
            msg.hdr_message_id = match.group(1)
        elif match := RE_CONTENT_LINE.match(line):
            msg.content_type = match.group(1)

    # header_end already counts the blank separator line, so it is the index
    # of the first content line. Adding one more would drop that line.
    msg._content = blob[header_end:]

    # The content_type property raises ValueError when no Content-Type
    # header was seen; such a message is simply not multipart.
    try:
        is_multipart = "multipart/" in msg.content_type
    except ValueError:
        is_multipart = False
    if is_multipart:
        msg = msg.into_multipart()

    if isinstance(msg, message.MultipartMessage):
        if match := RE_BOUNDARY.match(msg.content_type):
            # Remove whitespace from the captured boundary value.
            boundary = "".join(match.group(1).split())
        else:
            raise ValueError("no boundary for multipart content") from None
        msg._parts = split_message_parts(msg._content, boundary)

    print(msg)

def read_input() -> List[bytes]:
    """Return every line of STDIN with trailing whitespace removed."""
    return [raw_line.rstrip() for raw_line in sys.stdin]


A main.go => main.go +296 -0
@@ 0,0 1,296 @@
package main

import (
	"fmt"
	"os"
	"strings"
	"io"
	"bufio"
	"regexp"

	"git.dominic-ricottone.com/textwrap/common"
)

// Names of the header fields tracked by this program. A header container
// stores one of these in its LastSet field to remember which field was
// assigned most recently.
const (
	HeaderSubject         = "HeaderSubject"
	HeaderDate            = "HeaderDate"
	HeaderFrom            = "HeaderFrom"
	HeaderTo              = "HeaderTo"
	HeaderCc              = "HeaderCc"
	HeaderMessageID       = "HeaderMessageID"
	HeaderContentType     = "HeaderContentType"
	HeaderContentEncoding = "HeaderContentEncoding"
)

// MessageHeader holds the message-level header values of one message.
type MessageHeader struct {
	// Values of the individual header fields.
	Subject, Date, From, To, Cc, MessageID, ContentType string

	// LastSet names the field that was assigned most recently, using one of
	// the Header* constants; it is empty when nothing has been assigned.
	LastSet string
}

// NewHeader returns a pointer to a MessageHeader in which every field is the
// empty string.
func NewHeader() *MessageHeader {
	return new(MessageHeader)
}

// MessagePartHeader holds the header values of one message part.
type MessagePartHeader struct {
	// Values of the individual part header fields.
	ContentType, ContentEncoding string

	// LastSet names the field that was assigned most recently, using one of
	// the Header* constants; it is empty when nothing has been assigned.
	LastSet string
}

// NewPartHeader returns a pointer to a MessagePartHeader in which every field
// is the empty string.
func NewPartHeader() *MessagePartHeader {
	return new(MessagePartHeader)
}

// MessagePart is one part of a message: its own header plus the content
// lines collected for it.
type MessagePart struct {
	// Header holds the part-level header values.
	Header *MessagePartHeader
	// Content holds the part's lines in the order they were appended.
	Content []string
}

// NewPart returns a MessagePart with a fresh, empty header and a Content
// slice that already holds a single empty string.
func NewPart() *MessagePart {
	return &MessagePart{
		Header:  NewPartHeader(),
		Content: []string{""},
	}
}

// Message is the container for one parsed message: its header, its parts and,
// when one was found, the pattern that recognises the part boundary.
type Message struct {
	// Header holds the message-level header values.
	Header       *MessageHeader
	// Parts holds the message parts; NewMessage creates it with one part.
	Parts        []*MessagePart
	// PartBoundary matches part boundary lines. It is nil until FindBoundary
	// finds a boundary, and MatchBoundary reports false while it is nil.
	PartBoundary *regexp.Regexp
}

// NewMessage returns a Message with an empty header, exactly one empty part
// and no part boundary.
func NewMessage() *Message {
	return &Message{
		Header:       NewHeader(),
		Parts:        []*MessagePart{NewPart()},
		PartBoundary: nil,
	}
}

// SetHeader stores the value of a recognised header line in m.Header and
// records the field in LastSet, so that folded continuation lines can later
// be appended to it by AppendLastHeader.
//
// line is expected to be a complete "Name: value" header line. Field names
// are matched case-sensitively. The value is stored with surrounding
// whitespace removed, so the space that usually follows the colon does not
// become part of the value.
//
// A line that is not one of the recognised fields clears LastSet. Without
// that, the continuation lines of an unrecognised header would be appended
// to whichever recognised header happened to precede it.
func (m *Message) SetHeader(line string) {
	h := m.Header

	// value returns the text after the given field name, trimmed.
	value := func(name string) string {
		return strings.TrimSpace(line[len(name):])
	}

	switch {
	case strings.HasPrefix(line, "Subject:"):
		h.Subject = value("Subject:")
		h.LastSet = HeaderSubject
	case strings.HasPrefix(line, "Date:"):
		h.Date = value("Date:")
		h.LastSet = HeaderDate
	case strings.HasPrefix(line, "From:"):
		h.From = value("From:")
		h.LastSet = HeaderFrom
	case strings.HasPrefix(line, "To:"):
		h.To = value("To:")
		h.LastSet = HeaderTo
	case strings.HasPrefix(line, "Cc:"):
		h.Cc = value("Cc:")
		h.LastSet = HeaderCc
	case strings.HasPrefix(line, "Message-ID:"):
		h.MessageID = value("Message-ID:")
		h.LastSet = HeaderMessageID
	case strings.HasPrefix(line, "Content-Type:"):
		h.ContentType = value("Content-Type:")
		h.LastSet = HeaderContentType
	default:
		h.LastSet = ""
	}
}

// AppendLastHeader appends s, preceded by a single space, to the header field
// named by m.Header.LastSet. It does nothing when LastSet does not name one
// of the message-level fields.
func (m *Message) AppendLastHeader(s string) {
	h := m.Header

	// Pick the field to extend; target stays nil for an unknown LastSet.
	var target *string
	switch h.LastSet {
	case HeaderSubject:
		target = &h.Subject
	case HeaderDate:
		target = &h.Date
	case HeaderFrom:
		target = &h.From
	case HeaderTo:
		target = &h.To
	case HeaderCc:
		target = &h.Cc
	case HeaderMessageID:
		target = &h.MessageID
	case HeaderContentType:
		target = &h.ContentType
	}

	if target != nil {
		*target += " " + s
	}
}

// SetPartHeader stores the value of a recognised part header line in the
// header of the last part and records the field in that header's LastSet, so
// that folded continuation lines can later be appended to it by
// AppendLastPartHeader.
//
// line is expected to be a complete "Name: value" header line. Field names
// are matched case-sensitively. The value is stored with surrounding
// whitespace removed, so the space that usually follows the colon does not
// become part of the value.
//
// A line that is not one of the recognised fields clears LastSet, so that
// its continuation lines are not appended to an unrelated field. The call
// does nothing when the message has no parts.
func (m *Message) SetPartHeader(line string) {
	if len(m.Parts) == 0 {
		return
	}
	h := m.Parts[len(m.Parts)-1].Header

	switch {
	case strings.HasPrefix(line, "Content-Type:"):
		h.ContentType = strings.TrimSpace(line[len("Content-Type:"):])
		h.LastSet = HeaderContentType
	case strings.HasPrefix(line, "Content-Transfer-Encoding:"):
		h.ContentEncoding = strings.TrimSpace(line[len("Content-Transfer-Encoding:"):])
		h.LastSet = HeaderContentEncoding
	default:
		h.LastSet = ""
	}
}

// AppendLastPartHeader appends s, preceded by a single space, to the field of
// the last part's header named by that header's LastSet. It does nothing when
// LastSet does not name one of the part-level fields.
func (m *Message) AppendLastPartHeader(s string) {
	h := m.Parts[len(m.Parts)-1].Header

	if h.LastSet == HeaderContentType {
		h.ContentType += " " + s
	} else if h.LastSet == HeaderContentEncoding {
		h.ContentEncoding += " " + s
	}
}

// AppendPart adds a new, empty part to the end of m.Parts; later part header
// and content updates apply to this part.
func (m *Message) AppendPart() {
	fresh := NewPart()
	m.Parts = append(m.Parts, fresh)
}

// AppendContent adds s as a new content line of the last part.
func (m *Message) AppendContent(s string) {
	last := m.Parts[len(m.Parts)-1]
	last.Content = append(last.Content, s)
}

// FindBoundary looks for a part boundary in the message's Content-Type value
// and, when one is found, sets m.PartBoundary to a pattern that matches any
// line containing it.
//
// re must have at least one capture group; the first group is taken as the
// boundary. Spaces are removed from the captured text before use
// (presumably because AppendLastHeader joins folded header lines with a
// space). The boundary is matched literally: it is escaped before being
// compiled, so characters such as '+', '.', '(' or '?' in a boundary neither
// change the match nor prevent compilation.
//
// m.PartBoundary is left untouched when re does not match or the boundary is
// empty, and is cleared when the escaped boundary cannot be compiled.
func (m *Message) FindBoundary(re *regexp.Regexp) {
	match := re.FindStringSubmatch(m.Header.ContentType)
	if len(match) < 2 {
		return
	}

	boundary := strings.ReplaceAll(match[1], " ", "")
	if boundary == "" {
		// An empty pattern would match every line of the content.
		return
	}

	compiled, err := regexp.Compile(regexp.QuoteMeta(boundary))
	if err != nil {
		m.PartBoundary = nil
		return
	}
	m.PartBoundary = compiled
}

// MatchBoundary reports whether line matches the message's part boundary. It
// reports false when no boundary has been set.
func (m *Message) MatchBoundary(line string) bool {
	return m.PartBoundary != nil && m.PartBoundary.MatchString(line)
}

// Dump writes the message to standard output: first the message-level header
// values, then for every part its index, its header values and its content
// passed through common.WrapArray with a width of 80, followed by a line
// reading "EOF". The second value returned by WrapArray is discarded.
func (m *Message) Dump() {
	hdr := m.Header
	fmt.Printf("Subject: %s\n", hdr.Subject)
	fmt.Printf("Date: %s\n", hdr.Date)
	fmt.Printf("From: %s\n", hdr.From)
	fmt.Printf("To: %s\n", hdr.To)
	fmt.Printf("Cc: %s\n", hdr.Cc)
	fmt.Printf("MessageID: %s\n", hdr.MessageID)
	fmt.Printf("ContentType: %s\n", hdr.ContentType)

	for idx, part := range m.Parts {
		fmt.Printf("Part %d:\n", idx)
		fmt.Printf("ContentType: %s\n", part.Header.ContentType)
		fmt.Printf("ContentEncoding: %s\n", part.Header.ContentEncoding)

		wrapped, _ := common.WrapArray(part.Content, 80)
		for k := range wrapped {
			fmt.Printf("%s\n", wrapped[k])
		}
		fmt.Println("EOF")
	}
}

// States of the line parser: before the first header line of a message,
// inside the message header, inside a part header, and inside content.
const (
	ParsingPreHeader  = "ParsingPreHeader"
	ParsingHeader     = "ParsingHeader"
	ParsingPartHeader = "ParsingPartHeader"
	ParsingContent    = "ParsingContent"
)

// parse_stream reads reader line by line, splits the input into messages and
// dumps each message to standard output as soon as it is complete.
//
// The parser is a small state machine:
//   - ParsingPreHeader: lines are skipped until one looks like a header.
//   - ParsingHeader: header lines are stored; a line that starts with a
//     space or tab continues the previous header; a blank line ends the
//     header and triggers the search for a multipart boundary.
//   - ParsingPartHeader: the same, for the header of the current part.
//   - ParsingContent: a line of dashes ends the message, a boundary line
//     starts a new part, anything else is content of the current part.
//
// A message that is still open when the input ends is dumped as well. If the
// scanner reports an error, the error is printed and the process exits with
// status 1.
func parse_stream(reader io.Reader) {
	input := bufio.NewScanner(reader)

	// The patterns are constants, so they cannot fail to compile at run time.
	//
	// NOTE(review): lines are trimmed before matching, so a signature
	// separator ("-- ") becomes "--" and is treated as a message break.
	// Confirm the separator length used by the input and tighten if needed.
	re_message_break := regexp.MustCompile("^-+$")
	re_header := regexp.MustCompile(
		"^(?:Date|From|Subject|To|Cc|Message-ID|" +
			"Content-(?:Type|Transfer-Encoding)):",
	)
	// Capture only up to the closing quote, so that a later quoted
	// parameter on the same line does not become part of the boundary.
	re_multipart := regexp.MustCompile(`boundary="([^"]*)"`)

	parsing := ParsingPreHeader
	message := NewMessage()

	for input.Scan() {
		line := input.Text()
		tline := strings.TrimSpace(line)
		// Folded header lines start with a space or a tab.
		folded := strings.HasPrefix(line, " ") || strings.HasPrefix(line, "\t")

		switch parsing {
		case ParsingPreHeader:
			if re_header.MatchString(tline) {
				parsing = ParsingHeader
				message.SetHeader(tline)
			}

		case ParsingHeader:
			switch {
			case tline == "":
				parsing = ParsingContent
				message.FindBoundary(re_multipart)
			case folded:
				message.AppendLastHeader(tline)
			default:
				message.SetHeader(tline)
			}

		case ParsingPartHeader:
			switch {
			case tline == "":
				parsing = ParsingContent
			case folded:
				message.AppendLastPartHeader(tline)
			default:
				message.SetPartHeader(tline)
			}

		case ParsingContent:
			switch {
			case re_message_break.MatchString(tline):
				parsing = ParsingPreHeader
				message.Dump()
				message = NewMessage()
			case message.MatchBoundary(tline):
				parsing = ParsingPartHeader
				message.AppendPart()
			default:
				message.AppendContent(tline)
			}
		}
	}

	// Check for scanner errors
	if err := input.Err(); err != nil {
		fmt.Printf("internal error - %v\n", err)
		os.Exit(1)
	}

	// The input ended without a closing break: the message being built has
	// at least one header, so emit it instead of silently dropping it.
	if parsing != ParsingPreHeader {
		message.Dump()
	}
}

// parse_file opens the named file and parses its contents with parse_stream.
// If the file cannot be opened, a message is printed and the process exits
// with status 1.
func parse_file(filename string) {
	f, openErr := os.Open(filename)
	if openErr != nil {
		fmt.Printf("cannot read file '%s'\n", filename)
		os.Exit(1)
	}
	// Closed when parse_stream returns normally.
	defer f.Close()

	parse_stream(f)
}

// main parses standard input. If standard input cannot be inspected, a
// message is printed and the process exits with status 1.
func main() {
	if _, err := os.Stdin.Stat(); err != nil {
		fmt.Println("cannot read input")
		os.Exit(1)
	}

	parse_stream(os.Stdin)
}


D textwrap/main.go => textwrap/main.go +0 -81
@@ 1,81 0,0 @@
package main

import (
	"fmt"
	"os"
	"strings"
	"bufio"
	"regexp"
	"flag"
)

// LENGTH is not referenced by any other code shown in this file; the output
// width comes from the -width flag parsed in main.
const LENGTH = 40

// print_break writes a line made of length dashes to standard output.
func print_break(length int) {
	rule := strings.Repeat("-", length)
	fmt.Println(rule)
}

// print_wrapped writes line to standard output broken into pieces, each
// prefixed with quote, so that no output line is longer than length
// characters.
//
// quote is expected to be a prefix of line; the text after that prefix is
// what gets wrapped. Width is counted in characters (runes), not bytes, so
// text containing multi-byte characters wraps at the right column and no
// character is ever split.
//
// Edge cases:
//   - When quote leaves no room for text (length minus the quote length is
//     below one), line is written unchanged on a single line.
//   - When nothing follows the quote, the quote alone is written if it is
//     not empty; an empty line produces no output.
//   - No quote-only line is written after the last piece of text.
func print_wrapped(line string, length int, quote string) {
	// Text after the quote prefix, as runes so it can be cut by character.
	var body []rune
	if len(quote) <= len(line) {
		body = []rune(line[len(quote):])
	}

	width := length - len([]rune(quote))
	if width < 1 {
		fmt.Printf("%s\n", line)
		return
	}

	if len(body) == 0 {
		if quote != "" {
			fmt.Printf("%s\n", quote)
		}
		return
	}

	for start := 0; start < len(body); start += width {
		end := start + width
		if end > len(body) {
			end = len(body)
		}
		fmt.Printf("%s%s\n", quote, string(body[start:end]))
	}
}

// main reads standard input line by line and writes each line to standard
// output. Lines are trimmed of surrounding whitespace; a line longer than the
// -width flag (default 80) is either replaced by a row of dashes, when it
// consists only of dashes or only of equals signs, or wrapped with its
// leading quote markers repeated on every output line.
func main() {
	// Open STDIN as scanner
	if _, statErr := os.Stdin.Stat(); statErr != nil {
		fmt.Printf("%s\n", "cannot read input")
		os.Exit(1)
	}
	input := bufio.NewScanner(os.Stdin)

	// Look for arguments
	width := flag.Int("width", 80, "target width for output")
	flag.Parse()

	// Compile regular expressions
	re_quote, err := regexp.Compile("^([> ]*)")
	if err != nil {
		fmt.Printf("internal error - %v\n", err)
		os.Exit(1)
	}
	re_break, err := regexp.Compile("^(?:-{5,}|={5,})$")
	if err != nil {
		fmt.Printf("internal error - %v\n", err)
		os.Exit(1)
	}

	// Scan line by line
	for input.Scan() {
		line := strings.TrimSpace(input.Text())

		switch {
		case len(line) <= *width:
			// Short enough: pass through unchanged.
			fmt.Printf("%s\n", line)
		case re_break.MatchString(line):
			print_break(*width)
		default:
			print_wrapped(line, *width, re_quote.FindString(line))
		}
	}

	// Check for scanner errors
	if err = input.Err(); err != nil {
		fmt.Printf("internal error - %v\n", err)
		os.Exit(1)
	}
}


D textwrap/textwrap => textwrap/textwrap +0 -0