~dricottone/epub2html

ae806b4144a63817e5a1a9c3448540314e2e77ce — Dominic Ricottone 2 years ago 494d6b5 dev
Fixing blockquotes and refactoring

Blockquotes are fixed. Rather than hacking on the marshaler, I am
populating a secondary struct with raw XML. The standard marshaler is
able to work with this struct perfectly.

The code has been refactored. All of the (un)?marshaling structs and
implementations have been moved to domain-specific files. Helper
functions are defined so that data is stored in an intermediary slice
and map. The program still just dumps that data to STDOUT, but the
groundwork for higher-level functionality is in place.
7 files changed, 344 insertions(+), 189 deletions(-)

A epub.go
A html.go
M main.go
A ncx.go
A textnodes.go
A xhtml.go
D xml.go
A epub.go => epub.go +48 -0
@@ 0,0 1,48 @@
// Functions for handling e-pub archives

package main

import (
	"io"
	"fmt"
	"strings"
	"archive/zip"
)

// ReadArchive opens the e-pub (zip) archive at filename and returns its
// useful entries as a map from entry name to entry contents. Bookkeeping
// entries (mimetype, content.opf, META-INF/*, *.css) are skipped.
// Any failure to open or read an entry aborts with an error.
func ReadArchive(filename string) (map[string]string, error) {
	// Open archive
	archive_reader, err := zip.OpenReader(filename)
	if err != nil {
		return nil, err
	}
	defer archive_reader.Close()

	var archive = map[string]string{}

	// Loop over files in archive
	for _, file := range archive_reader.File {

		// Skip these less useful files
		if (file.Name == "mimetype" || file.Name == "content.opf" || strings.HasPrefix(file.Name, "META-INF") || strings.HasSuffix(file.Name, ".css")) {
			continue
		}

		// BUG FIX: errors were previously printed and ignored, after which a
		// nil reader could be used (panic risk), and opened entry readers
		// were never closed.
		contents, err := read_archive_file(file)
		if err != nil {
			return nil, fmt.Errorf("reading %s: %w", file.Name, err)
		}

		// Store final string mapped by the file name
		archive[file.Name] = contents
	}

	return archive, nil
}

// read_archive_file opens a single archive entry, copies its contents into a
// string, and closes the entry reader before returning.
func read_archive_file(file *zip.File) (string, error) {
	file_reader, err := file.Open()
	if err != nil {
		return "", err
	}
	defer file_reader.Close()

	buffer := new(strings.Builder)
	if _, err := io.Copy(buffer, file_reader); err != nil {
		return "", err
	}

	return buffer.String(), nil
}


A html.go => html.go +31 -0
@@ 0,0 1,31 @@
// Structs for marshaling HTML files

package main

import (
	"encoding/xml"
)

// Unlike the nested X(HT)ML division, this division is raw XHTML.
// The ",innerxml" tag makes the marshaler emit Content verbatim,
// without re-escaping the markup.
type HtmlDivision struct {
	Content []byte `xml:",innerxml"`
}

// The HTML structure should be a pair of one head and one body.
// An HTML head will mirror an XML head.
// An HTML body will be much simpler than an XML body.
type HtmlBody struct {
	Title    string       `xml:"h3"`
	Division HtmlDivision `xml:"div"`
}

// HtmlHead carries the document title, emitted as a <title> element.
type HtmlHead struct {
	Title string `xml:"title"`
}

// Html is the root marshaling target for the emitted HTML document.
type Html struct {
	XMLName xml.Name `xml:"html"`
	Head    HtmlHead `xml:"head"`
	Body    HtmlBody `xml:"body"`
}


M main.go => main.go +8 -90
@@ 1,104 1,22 @@
package main

import (
	"io"
	"fmt"
	"strings"
	"sort"
	"archive/zip"
	"encoding/xml"
)

// e-pub XHTML features arbitrary nesting of divisions. To strip the excess
// div tags, we need to recursively extract paragraphs from divisions.
// Recommended usage:
//   Xhtml.Body.Division.Paragraphs = normalize_division(Xhtml.Body.Division)
//   Xhtml.Body.Division.Divisions = []Division{}
func normalize_division(div Division) []Paragraph {
	// Paragraphs directly inside this division win outright.
	if len(div.Paragraphs) != 0 {
		return div.Paragraphs
	}

	var collected []Paragraph

	// Otherwise flatten the paragraphs out of any blockquotes.
	if len(div.BlockQuotes) != 0 {
		for _, quote := range div.BlockQuotes {
			collected = append(collected, quote.Paragraphs...)
		}
		return collected
	}

	// No text at this level: gather recursively from child divisions.
	for _, child := range div.Divisions {
		collected = append(collected, normalize_division(child)...)
	}
	return collected
}

// NOTE(review): the +/- markers of this diff hunk were lost in rendering, so
// lines from the removed and the added implementations of dump_archive are
// interleaved below. The annotations mark which version each run of lines
// appears to belong to — confirm against the repository before relying on
// this reconstruction.
// (removed) old signature: opened the zip archive directly.
func dump_archive(filename string) error {
	// Open archive
	areader, err := zip.OpenReader(filename)
// (added) new signature: delegates archive reading to ReadArchive (epub.go).
func dump_archive(archive_name string) error {
	// Read the archive
	archive, err := ReadArchive(archive_name)
	if err != nil {
		return err
	}
	// (removed) remainder of the old implementation follows.
	defer areader.Close()

	// Loop over files in archive
	for _, file := range areader.File {

		// Skip these less useful files
		if (file.Name == "mimetype" || strings.HasPrefix(file.Name, "META-INF") || strings.HasSuffix(file.Name, ".css") || file.Name == "content.opf") {
			fmt.Printf("Skipping %s...\n", file.Name)
			continue
		}

		// Open file and copy into a string builder
		fmt.Printf("Contents of %s:\n", file.Name)
		freader, err := file.Open()
		if err != nil {
			fmt.Printf("error: %s\n", err)
		}
		buffer := new(strings.Builder)
		if _, err := io.Copy(buffer, freader); err != nil {
			fmt.Printf("error: %s\n", err)
		}

		if (file.Name == "toc.ncx") {
			target := Ncx{}
			if err = xml.Unmarshal([]byte(buffer.String()), &target); err != nil {
				fmt.Printf("error: %s\n", err)
			}

			fmt.Println(target.Title)

			sort.Slice(target.NavPoints, func(i, j int) bool {
				return target.NavPoints[i].Order < target.NavPoints[j].Order
			})

			html, err := xml.MarshalIndent(&target, "", "  ")
			if err != nil {
				fmt.Printf("error: %s\n", err)
			}
			fmt.Println(string(html))
		} else {
			target := Xhtml{}
			if err = xml.Unmarshal([]byte(buffer.String()), &target); err != nil {
				fmt.Printf("error: %s\n", err)
			}

			target.Body.Division.Paragraphs = normalize_division(target.Body.Division)
			target.Body.Division.Divisions = []Division{}
	// (added) new implementation continues here.
	// Get a sorted table of contents
	toc := ReadTableOfContents(archive["toc.ncx"])

			// (removed)
			html, err := xml.MarshalIndent(&target, "", "  ")
			if err != nil {
				fmt.Printf("error: %s\n", err)
			}
			fmt.Println(string(html))
		}
	// (added) new implementation: print files in reading order.
	// Print files according to the table of contents
	for _, file_name := range toc {
		fmt.Println(ReadHtml(archive[file_name]))
	}

	return nil

A ncx.go => ncx.go +65 -0
@@ 0,0 1,65 @@
// Structs and functions for unmarshaling NCX files in e-pub archives

package main

import (
	"fmt"
	"encoding/xml"
	"net/url"
	"sort"
)

// A URI for content
type Content struct {
	Src string `xml:"src,attr"`
}

// A content label and URI pair. Order comes from the NCX playOrder
// attribute and drives the reading sequence.
type NavPoint struct {
	Label   string  `xml:"navLabel>text"`
	Content Content `xml:"content"`
	Order   int     `xml:"playOrder,attr"`
}

// A series of navigation points
type Ncx struct {
	XMLName   xml.Name   `xml:"ncx"`
	Title     string     `xml:"docTitle>text"`
	NavPoints []NavPoint `xml:"navMap>navPoint"`
}

// ParseNcx parses an NCX document and returns its navigation points sorted
// by playOrder. Unmarshaling errors are reported on stdout and whatever was
// populated is returned.
func ParseNcx(content string) *Ncx {
	dest := &Ncx{}

	// Unmarshal XHTML (dest is already a pointer; no need to take &dest)
	if err := xml.Unmarshal([]byte(content), dest); err != nil {
		fmt.Printf("error: %s\n", err)
	}

	// Sort navigation points
	sort.Slice(dest.NavPoints, func(i, j int) bool {
		return dest.NavPoints[i].Order < dest.NavPoints[j].Order
	})

	return dest
}

// ReadTableOfContents builds an ordered list of content file names from an
// NCX document.
func ReadTableOfContents(content string) []string {
	ncx := ParseNcx(content)

	var toc []string

	for _, point := range ncx.NavPoints {
		// BUG FIX: src is a path component of a URI, so it must be decoded
		// with PathUnescape. QueryUnescape additionally turned a literal "+"
		// into a space, corrupting file names containing "+".
		src, err := url.PathUnescape(point.Content.Src)
		if err != nil {
			fmt.Printf("error: %s\n", err)
		}

		toc = append(toc, src)
	}

	return toc
}


A textnodes.go => textnodes.go +37 -0
@@ 0,0 1,37 @@
// Structs for handling text nodes

package main

import (
	"encoding/xml"
)

// Text nodes are either a paragraph or a blockquote.
// Order exposes each node's position within its parent division, so nodes
// of both kinds can be re-sorted back into document order after decoding.
type TextNode interface {
	Order() int
}

// A paragraph contains character data and other text-oriented tags, such as
// b or strong. This data should be retained as raw XML.
type Paragraph struct {
	XMLName xml.Name `xml:"p"`
	Text    string   `xml:",innerxml"`
	// order is assigned during XhtmlDivision's custom unmarshaling.
	// NOTE(review): encoding/xml ignores unexported fields anyway, so the
	// `xml:"-"` tag here is documentation only.
	order   int      `xml:"-"`
}

// Order returns the paragraph's position within its parent division.
func (p Paragraph) Order() int {
	return p.order
}

// A blockquote contains paragraphs. Unlike divisions, the blockquote structure
// must be maintained in order to format the paragraphs correctly.
type BlockQuote struct {
	XMLName    xml.Name    `xml:"blockquote"`
	Paragraphs []Paragraph `xml:"p"`
	// order is assigned during XhtmlDivision's custom unmarshaling.
	order      int         `xml:"-"`
}

// Order returns the blockquote's position within its parent division.
func (b BlockQuote) Order() int {
	return b.order
}


A xhtml.go => xhtml.go +155 -0
@@ 0,0 1,155 @@
// Structs and functions for unmarshaling XHTML files in e-pub archives

package main

import (
	"fmt"
	"io"
	"sort"
	"encoding/xml"
)

// A XHTML division can contain paragraphs, blockquotes, and further nested
// XHTML divisions. Decoding is customized (see UnmarshalXML) so that each
// paragraph and blockquote records its position in document order.
type XhtmlDivision struct {
	Divisions   []XhtmlDivision `xml:"div"`
	Paragraphs  []Paragraph     `xml:"p"`
	BlockQuotes []BlockQuote    `xml:"blockquote"`
}

// UnmarshalXML decodes a <div> element, tagging each direct p/blockquote
// child with its position (order) so document order can be reconstructed.
//
// BUG FIX: the original loop consumed tokens until io.EOF, so a nested
// division's UnmarshalXML swallowed the rest of the document, including its
// parent's remaining children. An Unmarshaler must stop at its own end
// element. DecodeElement errors are now propagated instead of ignored.
func (d *XhtmlDivision) UnmarshalXML(decoder *xml.Decoder, start xml.StartElement) error {
	counter := 0
	// Depth of unknown wrapper elements we have descended into; their end
	// tags must not be mistaken for this division's own close tag.
	depth := 0

	for {
		token, err := decoder.Token()
		if err == io.EOF {
			break
		}
		if err != nil {
			return err
		}

		switch t := token.(type) {
		case xml.StartElement:
			switch t.Name.Local {
			case "p":
				target := Paragraph{}
				if err := decoder.DecodeElement(&target, &t); err != nil {
					return err
				}

				target.order = counter
				counter += 1

				d.Paragraphs = append(d.Paragraphs, target)
			case "blockquote":
				target := BlockQuote{}
				if err := decoder.DecodeElement(&target, &t); err != nil {
					return err
				}

				target.order = counter
				counter += 1

				d.BlockQuotes = append(d.BlockQuotes, target)
			case "div":
				target := XhtmlDivision{}
				if err := decoder.DecodeElement(&target, &t); err != nil {
					return err
				}

				d.Divisions = append(d.Divisions, target)
			default:
				// Unknown element: keep streaming its tokens (as before) but
				// track the depth so its end tag is handled correctly.
				depth += 1
			}
		case xml.EndElement:
			if depth == 0 {
				// DecodeElement consumes each known child completely, so the
				// first end tag at depth zero closes this division.
				return nil
			}
			depth -= 1
		}
	}

	return nil
}

// The XHTML structure should be a pair of one head and one body.
// The XHTML head contains little information that should be retained.
// The XHTML body is a nesting structure.
type XhtmlHead struct {
	Title string `xml:"title"`
}

// XhtmlBody is a source document body: an optional h3 title followed by a
// single top-level division.
type XhtmlBody struct {
	Title    string   `xml:"h3"`
	Division XhtmlDivision `xml:"div"`
}

// Xhtml is the root unmarshaling target for source XHTML files.
type Xhtml struct {
	XMLName xml.Name  `xml:"html"`
	Head    XhtmlHead `xml:"head"`
	Body    XhtmlBody `xml:"body"`
}

// Normalize an XHTML division into raw XHTML: this division's paragraphs and
// blockquotes are marshaled in document order, followed by the flattened
// contents of any nested divisions.
func normalize_division(div XhtmlDivision) []byte {
	// Collect this division's direct text nodes into one list.
	nodes := make([]TextNode, 0, len(div.Paragraphs)+len(div.BlockQuotes))
	for _, paragraph := range div.Paragraphs {
		nodes = append(nodes, paragraph)
	}
	for _, blockquote := range div.BlockQuotes {
		nodes = append(nodes, blockquote)
	}

	// Restore document order using the positions recorded while decoding.
	sort.Slice(nodes, func(a, b int) bool {
		return nodes[a].Order() < nodes[b].Order()
	})

	var raw []byte

	// Marshal each node back to raw XHTML and concatenate.
	for _, node := range nodes {
		fragment, err := xml.Marshal(&node)
		if err != nil {
			fmt.Printf("error: %s\n", err)
			continue
		}
		raw = append(raw, fragment...)
	}

	// Append the flattened contents of every nested division.
	for _, child := range div.Divisions {
		raw = append(raw, normalize_division(child)...)
	}

	return raw
}

// Convert an XHTML structure into an HTML structure
func xhtml_to_html(source Xhtml) *Html {
	dest := &Html{}

	// Copy fields
	// BUG FIX: the second line previously re-copied the body title, leaving
	// the head title empty; it was clearly meant to copy Head.Title.
	dest.Body.Title = source.Body.Title
	dest.Head.Title = source.Head.Title

	// Convert division
	dest.Body.Division.Content = normalize_division(source.Body.Division)

	return dest
}

// Parse an XHTML file
func ParseXhtml(content string) *Html {
	var source Xhtml

	// Unmarshal the raw XHTML; on failure, report it and continue with
	// whatever was populated.
	if err := xml.Unmarshal([]byte(content), &source); err != nil {
		fmt.Printf("error: %s\n", err)
	}

	// Convert the decoded document into the HTML output structure.
	return xhtml_to_html(source)
}

// ReadHtml converts the raw contents of an XHTML file into an indented HTML
// string. Marshaling errors are reported on stdout; in that case the
// returned string is empty.
func ReadHtml(content string) string {
	buffer, err := xml.MarshalIndent(ParseXhtml(content), "", "  ")
	if err != nil {
		fmt.Printf("error: %s\n", err)
	}
	return string(buffer)
}


D xml.go => xml.go +0 -99
@@ 1,99 0,0 @@
// Structs for parsing X(HT)?ML files in e-pub archives

package main

import (
	"io"
	"encoding/xml"
)

// Head is the parsed <head> of a source document.
type Head struct {
	Title string `xml:"title"`
}

// Paragraph retains a <p> element's inner markup verbatim; Order records
// its position within the enclosing division.
type Paragraph struct {
	Text  string `xml:",innerxml"`
	Order int    `xml:"-"`
}

// BlockQuote holds a <blockquote>'s paragraphs; Order records its position
// within the enclosing division.
type BlockQuote struct {
	Paragraphs []Paragraph `xml:"p"`
	Order      int         `xml:"-"`
}

// Division can contain paragraphs, blockquotes, and further nested
// divisions; decoding is customized (see UnmarshalXML below).
type Division struct {
	Divisions   []Division   `xml:"div"`
	Paragraphs  []Paragraph  `xml:"p"`
	BlockQuotes []BlockQuote `xml:"blockquote"`
}

// UnmarshalXML decodes a <div> element, tagging each direct p/blockquote
// child with its position (Order) so document order can be reconstructed.
//
// BUG FIX: the original loop consumed tokens until io.EOF, so a nested
// division's UnmarshalXML swallowed the rest of the document, including its
// parent's remaining children. An Unmarshaler must stop at its own end
// element. DecodeElement errors are now propagated instead of ignored.
func (d *Division) UnmarshalXML(decoder *xml.Decoder, start xml.StartElement) error {
	counter := 0
	// Depth of unknown wrapper elements we have descended into; their end
	// tags must not be mistaken for this division's own close tag.
	depth := 0

	for {
		token, err := decoder.Token()
		if err == io.EOF {
			break
		}
		if err != nil {
			return err
		}

		switch t := token.(type) {
		case xml.StartElement:
			switch t.Name.Local {
			case "p":
				target := Paragraph{}
				if err := decoder.DecodeElement(&target, &t); err != nil {
					return err
				}

				target.Order = counter
				counter += 1

				d.Paragraphs = append(d.Paragraphs, target)
			case "blockquote":
				target := BlockQuote{}
				if err := decoder.DecodeElement(&target, &t); err != nil {
					return err
				}

				target.Order = counter
				counter += 1

				d.BlockQuotes = append(d.BlockQuotes, target)
			case "div":
				target := Division{}
				if err := decoder.DecodeElement(&target, &t); err != nil {
					return err
				}

				d.Divisions = append(d.Divisions, target)
			default:
				// Unknown element: keep streaming its tokens (as before) but
				// track the depth so its end tag is handled correctly.
				depth += 1
			}
		case xml.EndElement:
			if depth == 0 {
				// DecodeElement consumes each known child completely, so the
				// first end tag at depth zero closes this division.
				return nil
			}
			depth -= 1
		}
	}

	return nil
}

// Body is a source document body: an optional h3 title followed by a single
// top-level division.
type Body struct {
	Title    string   `xml:"h3"`
	Division Division `xml:"div"`
}

// Xhtml is the root unmarshaling target for source XHTML files.
type Xhtml struct {
	XMLName xml.Name `xml:"html"`
	Head    Head     `xml:"head"`
	Body    Body     `xml:"body"`
}

// Content is a URI for a content file.
type Content struct {
	Src string `xml:"src,attr"`
}

// NavPoint is a content label and URI pair; Order comes from the NCX
// playOrder attribute.
type NavPoint struct {
	Label   string  `xml:"navLabel>text"`
	Content Content `xml:"content"`
	Order   int     `xml:"playOrder,attr"`
}

// Ncx is the root unmarshaling target for the NCX table of contents.
type Ncx struct {
	XMLName   xml.Name   `xml:"ncx"`
	Title     string     `xml:"docTitle>text"`
	NavPoints []NavPoint `xml:"navMap>navPoint"`
}