~dricottone/epub2html

ae806b4144a63817e5a1a9c3448540314e2e77ce — Dominic Ricottone 2 years ago 494d6b5 dev
Fixing blockquotes and refactoring

Blockquotes are fixed. Rather than hacking on the marshaler, I am
populating a secondary struct with raw XML. The standard marshaler is
able to work with this struct perfectly.

The code has been refactored. All of the (un)?marshaling structs and
implementations have been moved to domain-specific files. Helper
functions are defined so that data is stored in an intermediary slice
and map. The program still just dumps that data to STDOUT, but the
groundwork for higher-level functionality is in place.
7 files changed, 344 insertions(+), 189 deletions(-)

A epub.go
A html.go
M main.go
A ncx.go
A textnodes.go
A xhtml.go
D xml.go
A epub.go => epub.go +48 -0
@@ 0,0 1,48 @@
// Functions for handling e-pub archives

package main

import (
	"io"
	"fmt"
	"strings"
	"archive/zip"
)

// ReadArchive opens the e-pub (zip) archive at filename and returns its
// useful entries as a map from entry name to entry contents. Bookkeeping
// entries (mimetype, content.opf, META-INF/*, *.css) are skipped.
// Any failure to open or read an entry aborts with an error.
func ReadArchive(filename string) (map[string]string, error) {
	// Open archive
	archive_reader, err := zip.OpenReader(filename)
	if err != nil {
		return nil, err
	}
	defer archive_reader.Close()

	var archive = map[string]string{}

	// Loop over files in archive
	for _, file := range archive_reader.File {

		// Skip these less useful files
		if (file.Name == "mimetype" || file.Name == "content.opf" || strings.HasPrefix(file.Name, "META-INF") || strings.HasSuffix(file.Name, ".css")) {
			continue
		}

		// BUG FIX: errors were previously printed and ignored, after which a
		// nil reader could be used (panic risk), and opened entry readers
		// were never closed.
		contents, err := read_archive_file(file)
		if err != nil {
			return nil, fmt.Errorf("reading %s: %w", file.Name, err)
		}

		// Store final string mapped by the file name
		archive[file.Name] = contents
	}

	return archive, nil
}

// read_archive_file opens a single archive entry, copies its contents into a
// string, and closes the entry reader before returning.
func read_archive_file(file *zip.File) (string, error) {
	file_reader, err := file.Open()
	if err != nil {
		return "", err
	}
	defer file_reader.Close()

	buffer := new(strings.Builder)
	if _, err := io.Copy(buffer, file_reader); err != nil {
		return "", err
	}

	return buffer.String(), nil
}


A html.go => html.go +31 -0
@@ 0,0 1,31 @@
// Structs for marshaling HTML files

package main

import (
	"encoding/xml"
)

// Unlike the nested X(HT)ML division, this division is raw XHTML.
// The ",innerxml" tag makes the marshaler emit Content verbatim,
// without re-escaping the markup.
type HtmlDivision struct {
	Content []byte `xml:",innerxml"`
}

// The HTML structure should be a pair of one head and one body.
// An HTML head will mirror an XML head.
// An HTML body will be much simpler than an XML body.
type HtmlBody struct {
	Title    string       `xml:"h3"`
	Division HtmlDivision `xml:"div"`
}

// HtmlHead carries the document title, emitted as a <title> element.
type HtmlHead struct {
	Title string `xml:"title"`
}

// Html is the root marshaling target for the emitted HTML document.
type Html struct {
	XMLName xml.Name `xml:"html"`
	Head    HtmlHead `xml:"head"`
	Body    HtmlBody `xml:"body"`
}


M main.go => main.go +8 -90
@@ 1,104 1,22 @@
package main

import (
	"io"
	"fmt"
	"strings"
	"sort"
	"archive/zip"
	"encoding/xml"
)

// e-pub XHTML features arbitrary nesting of divisions. To strip the excess
// div tags, we need to recursively extract paragraphs from divisions.
// Recommended usage:
//   Xhtml.Body.Division.Paragraphs = normalize_division(Xhtml.Body.Division)
//   Xhtml.Body.Division.Divisions = []Division{}
func normalize_division(div Division) []Paragraph {
	// Paragraphs directly inside this division win outright.
	if len(div.Paragraphs) != 0 {
		return div.Paragraphs
	}

	var collected []Paragraph

	// Otherwise flatten the paragraphs out of any blockquotes.
	if len(div.BlockQuotes) != 0 {
		for _, quote := range div.BlockQuotes {
			collected = append(collected, quote.Paragraphs...)
		}
		return collected
	}

	// No text at this level: gather recursively from child divisions.
	for _, child := range div.Divisions {
		collected = append(collected, normalize_division(child)...)
	}
	return collected
}

// NOTE(review): the +/- markers of this diff hunk were lost in rendering, so
// lines from the removed and the added implementations of dump_archive are
// interleaved below. The annotations mark which version each run of lines
// appears to belong to — confirm against the repository before relying on
// this reconstruction.
// (removed) old signature: opened the zip archive directly.
func dump_archive(filename string) error {
	// Open archive
	areader, err := zip.OpenReader(filename)
// (added) new signature: delegates archive reading to ReadArchive (epub.go).
func dump_archive(archive_name string) error {
	// Read the archive
	archive, err := ReadArchive(archive_name)
	if err != nil {
		return err
	}
	// (removed) remainder of the old implementation follows.
	defer areader.Close()

	// Loop over files in archive
	for _, file := range areader.File {

		// Skip these less useful files
		if (file.Name == "mimetype" || strings.HasPrefix(file.Name, "META-INF") || strings.HasSuffix(file.Name, ".css") || file.Name == "content.opf") {
			fmt.Printf("Skipping %s...\n", file.Name)
			continue
		}

		// Open file and copy into a string builder
		fmt.Printf("Contents of %s:\n", file.Name)
		freader, err := file.Open()
		if err != nil {
			fmt.Printf("error: %s\n", err)
		}
		buffer := new(strings.Builder)
		if _, err := io.Copy(buffer, freader); err != nil {
			fmt.Printf("error: %s\n", err)
		}

		if (file.Name == "toc.ncx") {
			target := Ncx{}
			if err = xml.Unmarshal([]byte(buffer.String()), &target); err != nil {
				fmt.Printf("error: %s\n", err)
			}

			fmt.Println(target.Title)

			sort.Slice(target.NavPoints, func(i, j int) bool {
				return target.NavPoints[i].Order < target.NavPoints[j].Order
			})

			html, err := xml.MarshalIndent(&target, "", "  ")
			if err != nil {
				fmt.Printf("error: %s\n", err)
			}
			fmt.Println(string(html))
		} else {
			target := Xhtml{}
			if err = xml.Unmarshal([]byte(buffer.String()), &target); err != nil {
				fmt.Printf("error: %s\n", err)
			}

			target.Body.Division.Paragraphs = normalize_division(target.Body.Division)
			target.Body.Division.Divisions = []Division{}
	// (added) new implementation continues here.
	// Get a sorted table of contents
	toc := ReadTableOfContents(archive["toc.ncx"])

			// (removed)
			html, err := xml.MarshalIndent(&target, "", "  ")
			if err != nil {
				fmt.Printf("error: %s\n", err)
			}
			fmt.Println(string(html))
		}
	// (added) new implementation: print files in reading order.
	// Print files according to the table of contents
	for _, file_name := range toc {
		fmt.Println(ReadHtml(archive[file_name]))
	}

	return nil

A ncx.go => ncx.go +65 -0
@@ 0,0 1,65 @@
// Structs and functions for unmarshaling NCX files in e-pub archives

package main

import (
	"fmt"
	"encoding/xml"
	"net/url"
	"sort"
)

// A URI for content
type Content struct {
	Src string `xml:"src,attr"`
}

// A content label and URI pair. Order comes from the NCX playOrder
// attribute and drives the reading sequence.
type NavPoint struct {
	Label   string  `xml:"navLabel>text"`
	Content Content `xml:"content"`
	Order   int     `xml:"playOrder,attr"`
}

// A series of navigation points
type Ncx struct {
	XMLName   xml.Name   `xml:"ncx"`
	Title     string     `xml:"docTitle>text"`
	NavPoints []NavPoint `xml:"navMap>navPoint"`
}

// ParseNcx parses an NCX document and returns its navigation points sorted
// by playOrder. Unmarshaling errors are reported on stdout and whatever was
// populated is returned.
func ParseNcx(content string) *Ncx {
	dest := &Ncx{}

	// Unmarshal XHTML (dest is already a pointer; no need to take &dest)
	if err := xml.Unmarshal([]byte(content), dest); err != nil {
		fmt.Printf("error: %s\n", err)
	}

	// Sort navigation points
	sort.Slice(dest.NavPoints, func(i, j int) bool {
		return dest.NavPoints[i].Order < dest.NavPoints[j].Order
	})

	return dest
}

// ReadTableOfContents builds an ordered list of content file names from an
// NCX document.
func ReadTableOfContents(content string) []string {
	ncx := ParseNcx(content)

	var toc []string

	for _, point := range ncx.NavPoints {
		// BUG FIX: src is a path component of a URI, so it must be decoded
		// with PathUnescape. QueryUnescape additionally turned a literal "+"
		// into a space, corrupting file names containing "+".
		src, err := url.PathUnescape(point.Content.Src)
		if err != nil {
			fmt.Printf("error: %s\n", err)
		}

		toc = append(toc, src)
	}

	return toc
}


A textnodes.go => textnodes.go +37 -0
@@ 0,0 1,37 @@
// Structs for handling text nodes

package main

import (
	"encoding/xml"
)

// Text nodes are either a paragraph or a blockquote.
// Order exposes each node's position within its parent division, so nodes
// of both kinds can be re-sorted back into document order after decoding.
type TextNode interface {
	Order() int
}

// A paragraph contains character data and other text-oriented tags, such as
// b or strong. This data should be retained as raw XML.
type Paragraph struct {
	XMLName xml.Name `xml:"p"`
	Text    string   `xml:",innerxml"`
	// order is assigned during XhtmlDivision's custom unmarshaling.
	// NOTE(review): encoding/xml ignores unexported fields anyway, so the
	// `xml:"-"` tag here is documentation only.
	order   int      `xml:"-"`
}

// Order returns the paragraph's position within its parent division.
func (p Paragraph) Order() int {
	return p.order
}

// A blockquote contains paragraphs. Unlike divisions, the blockquote structure
// must be maintained in order to format the paragraphs correctly.
type BlockQuote struct {
	XMLName    xml.Name    `xml:"blockquote"`
	Paragraphs []Paragraph `xml:"p"`
	// order is assigned during XhtmlDivision's custom unmarshaling.
	order      int         `xml:"-"`
}

// Order returns the blockquote's position within its parent division.
func (b BlockQuote) Order() int {
	return b.order
}


A xhtml.go => xhtml.go +155 -0
@@ 0,0 1,155 @@
// Structs and functions for unmarshaling XHTML files in e-pub archives

package main

import (
	"fmt"
	"io"
	"sort"
	"encoding/xml"
)

// A XHTML division can contain paragraphs, blockquotes, and further nested
// XHTML divisions. Decoding is customized (see UnmarshalXML) so that each
// paragraph and blockquote records its position in document order.
type XhtmlDivision struct {
	Divisions   []XhtmlDivision `xml:"div"`
	Paragraphs  []Paragraph     `xml:"p"`
	BlockQuotes []BlockQuote    `xml:"blockquote"`
}

// UnmarshalXML decodes a <div> element, tagging each direct p/blockquote
// child with its position (order) so document order can be reconstructed.
//
// BUG FIX: the original loop consumed tokens until io.EOF, so a nested
// division's UnmarshalXML swallowed the rest of the document, including its
// parent's remaining children. An Unmarshaler must stop at its own end
// element. DecodeElement errors are now propagated instead of ignored.
func (d *XhtmlDivision) UnmarshalXML(decoder *xml.Decoder, start xml.StartElement) error {
	counter := 0
	// Depth of unknown wrapper elements we have descended into; their end
	// tags must not be mistaken for this division's own close tag.
	depth := 0

	for {
		token, err := decoder.Token()
		if err == io.EOF {
			break
		}
		if err != nil {
			return err
		}

		switch t := token.(type) {
		case xml.StartElement:
			switch t.Name.Local {
			case "p":
				target := Paragraph{}
				if err := decoder.DecodeElement(&target, &t); err != nil {
					return err
				}

				target.order = counter
				counter += 1

				d.Paragraphs = append(d.Paragraphs, target)
			case "blockquote":
				target := BlockQuote{}
				if err := decoder.DecodeElement(&target, &t); err != nil {
					return err
				}

				target.order = counter
				counter += 1

				d.BlockQuotes = append(d.BlockQuotes, target)
			case "div":
				target := XhtmlDivision{}
				if err := decoder.DecodeElement(&target, &t); err != nil {
					return err
				}

				d.Divisions = append(d.Divisions, target)
			default:
				// Unknown element: keep streaming its tokens (as before) but
				// track the depth so its end tag is handled correctly.
				depth += 1
			}
		case xml.EndElement:
			if depth == 0 {
				// DecodeElement consumes each known child completely, so the
				// first end tag at depth zero closes this division.
				return nil
			}
			depth -= 1
		}
	}

	return nil
}

// The XHTML structure should be a pair of one head and one body.
// The XHTML head contains little information that should be retained.
// The XHTML body is a nesting structure.
type XhtmlHead struct {
	Title string `xml:"title"`
}

// XhtmlBody is a source document body: an optional h3 title followed by a
// single top-level division.
type XhtmlBody struct {
	Title    string   `xml:"h3"`
	Division XhtmlDivision `xml:"div"`
}

// Xhtml is the root unmarshaling target for source XHTML files.
type Xhtml struct {
	XMLName xml.Name  `xml:"html"`
	Head    XhtmlHead `xml:"head"`
	Body    XhtmlBody `xml:"body"`
}

// Normalize an XHTML division into raw XHTML: this division's paragraphs and
// blockquotes are marshaled in document order, followed by the flattened
// contents of any nested divisions.
func normalize_division(div XhtmlDivision) []byte {
	// Collect this division's direct text nodes into one list.
	nodes := make([]TextNode, 0, len(div.Paragraphs)+len(div.BlockQuotes))
	for _, paragraph := range div.Paragraphs {
		nodes = append(nodes, paragraph)
	}
	for _, blockquote := range div.BlockQuotes {
		nodes = append(nodes, blockquote)
	}

	// Restore document order using the positions recorded while decoding.
	sort.Slice(nodes, func(a, b int) bool {
		return nodes[a].Order() < nodes[b].Order()
	})

	var raw []byte

	// Marshal each node back to raw XHTML and concatenate.
	for _, node := range nodes {
		fragment, err := xml.Marshal(&node)
		if err != nil {
			fmt.Printf("error: %s\n", err)
			continue
		}
		raw = append(raw, fragment...)
	}

	// Append the flattened contents of every nested division.
	for _, child := range div.Divisions {
		raw = append(raw, normalize_division(child)...)
	}

	return raw
}

// Convert an XHTML structure into an HTML structure
func xhtml_to_html(source Xhtml) *Html {
	dest := &Html{}

	// Copy fields
	// BUG FIX: the second line previously re-copied the body title, leaving
	// the head title empty; it was clearly meant to copy Head.Title.
	dest.Body.Title = source.Body.Title
	dest.Head.Title = source.Head.Title

	// Convert division
	dest.Body.Division.Content = normalize_division(source.Body.Division)

	return dest
}

// Parse an XHTML file
func ParseXhtml(content string) *Html {
	var source Xhtml

	// Unmarshal the raw XHTML; on failure, report it and continue with
	// whatever was populated.
	if err := xml.Unmarshal([]byte(content), &source); err != nil {
		fmt.Printf("error: %s\n", err)
	}

	// Convert the decoded document into the HTML output structure.
	return xhtml_to_html(source)
}

// ReadHtml converts the raw contents of an XHTML file into an indented HTML
// string. Marshaling errors are reported on stdout; in that case the
// returned string is empty.
func ReadHtml(content string) string {
	buffer, err := xml.MarshalIndent(ParseXhtml(content), "", "  ")
	if err != nil {
		fmt.Printf("error: %s\n", err)
	}
	return string(buffer)
}


D xml.go => xml.go +0 -99
@@ 1,99 0,0 @@
// Structs for parsing X(HT)?ML files in e-pub archives

package main

import (
	"io"
	"encoding/xml"
)

// Head is the parsed <head> of a source document.
type Head struct {
	Title string `xml:"title"`
}

// Paragraph retains a <p> element's inner markup verbatim; Order records
// its position within the enclosing division.
type Paragraph struct {
	Text  string `xml:",innerxml"`
	Order int    `xml:"-"`
}

// BlockQuote holds a <blockquote>'s paragraphs; Order records its position
// within the enclosing division.
type BlockQuote struct {
	Paragraphs []Paragraph `xml:"p"`
	Order      int         `xml:"-"`
}

// Division can contain paragraphs, blockquotes, and further nested
// divisions; decoding is customized (see UnmarshalXML below).
type Division struct {
	Divisions   []Division   `xml:"div"`
	Paragraphs  []Paragraph  `xml:"p"`
	BlockQuotes []BlockQuote `xml:"blockquote"`
}

// UnmarshalXML decodes a <div> element, tagging each direct p/blockquote
// child with its position (Order) so document order can be reconstructed.
//
// BUG FIX: the original loop consumed tokens until io.EOF, so a nested
// division's UnmarshalXML swallowed the rest of the document, including its
// parent's remaining children. An Unmarshaler must stop at its own end
// element. DecodeElement errors are now propagated instead of ignored.
func (d *Division) UnmarshalXML(decoder *xml.Decoder, start xml.StartElement) error {
	counter := 0
	// Depth of unknown wrapper elements we have descended into; their end
	// tags must not be mistaken for this division's own close tag.
	depth := 0

	for {
		token, err := decoder.Token()
		if err == io.EOF {
			break
		}
		if err != nil {
			return err
		}

		switch t := token.(type) {
		case xml.StartElement:
			switch t.Name.Local {
			case "p":
				target := Paragraph{}
				if err := decoder.DecodeElement(&target, &t); err != nil {
					return err
				}

				target.Order = counter
				counter += 1

				d.Paragraphs = append(d.Paragraphs, target)
			case "blockquote":
				target := BlockQuote{}
				if err := decoder.DecodeElement(&target, &t); err != nil {
					return err
				}

				target.Order = counter
				counter += 1

				d.BlockQuotes = append(d.BlockQuotes, target)
			case "div":
				target := Division{}
				if err := decoder.DecodeElement(&target, &t); err != nil {
					return err
				}

				d.Divisions = append(d.Divisions, target)
			default:
				// Unknown element: keep streaming its tokens (as before) but
				// track the depth so its end tag is handled correctly.
				depth += 1
			}
		case xml.EndElement:
			if depth == 0 {
				// DecodeElement consumes each known child completely, so the
				// first end tag at depth zero closes this division.
				return nil
			}
			depth -= 1
		}
	}

	return nil
}

// Body is a source document body: an optional h3 title followed by a single
// top-level division.
type Body struct {
	Title    string   `xml:"h3"`
	Division Division `xml:"div"`
}

// Xhtml is the root unmarshaling target for source XHTML files.
type Xhtml struct {
	XMLName xml.Name `xml:"html"`
	Head    Head     `xml:"head"`
	Body    Body     `xml:"body"`
}

// Content is a URI for a content file.
type Content struct {
	Src string `xml:"src,attr"`
}

// NavPoint is a content label and URI pair; Order comes from the NCX
// playOrder attribute.
type NavPoint struct {
	Label   string  `xml:"navLabel>text"`
	Content Content `xml:"content"`
	Order   int     `xml:"playOrder,attr"`
}

// Ncx is the root unmarshaling target for the NCX table of contents.
type Ncx struct {
	XMLName   xml.Name   `xml:"ncx"`
	Title     string     `xml:"docTitle>text"`
	NavPoints []NavPoint `xml:"navMap>navPoint"`
}