~dricottone/epub2html (dev): xhtml.go blame

ae806b41 Dominic Ricottone

2 years ago

// Structs and functions for unmarshaling XHTML files in e-pub archives

package main

import (
	"fmt"
	"io"
	"sort"
	"encoding/xml"
)

// XhtmlDivision models an XHTML <div> element.
// An XHTML division can contain paragraphs, blockquotes, and further nested
// XHTML divisions.
type XhtmlDivision struct {
	// Divisions holds the nested <div> elements.
	Divisions   []XhtmlDivision `xml:"div"`
	// Paragraphs holds the <p> elements.
	Paragraphs  []Paragraph     `xml:"p"`
	// BlockQuotes holds the <blockquote> elements.
	BlockQuotes []BlockQuote    `xml:"blockquote"`
}

// UnmarshalXML implements xml.Unmarshaler for XhtmlDivision.
//
// It reads tokens from decoder one at a time and decodes every <p>,
// <blockquote>, and <div> start element it meets, appending the result to the
// matching slice of d. Paragraphs and blockquotes draw their order value from
// one shared counter, so their relative position within the division can be
// recovered later; nested divisions are not numbered. Start elements with any
// other name are not decoded as a unit: the loop carries on with the tokens
// that follow them.
//
// The start parameter is required by the interface but is not used.
//
// Reading ends with a nil error when the decoder reports io.EOF. Any other
// error, whether it comes from reading a token or from decoding a child
// element, is returned to the caller unchanged.
func (d *XhtmlDivision) UnmarshalXML(decoder *xml.Decoder, start xml.StartElement) error {
	// Position counter shared by paragraphs and blockquotes.
	counter := 0

	for {
		token, err := decoder.Token()
		if err == io.EOF {
			return nil
		}
		if err != nil {
			return err
		}

		// Only start elements are of interest; skip text, comments, end tags.
		child, ok := token.(xml.StartElement)
		if !ok {
			continue
		}

		switch child.Name.Local {
		case "p":
			var target Paragraph
			if err := decoder.DecodeElement(&target, &child); err != nil {
				return err
			}

			target.order = counter
			counter++

			d.Paragraphs = append(d.Paragraphs, target)
		case "blockquote":
			var target BlockQuote
			if err := decoder.DecodeElement(&target, &child); err != nil {
				return err
			}

			target.order = counter
			counter++

			d.BlockQuotes = append(d.BlockQuotes, target)
		case "div":
			// Decoding a nested division re-enters this method.
			var target XhtmlDivision
			if err := decoder.DecodeElement(&target, &child); err != nil {
				return err
			}

			d.Divisions = append(d.Divisions, target)
		}
	}
}

// The XHTML structure should be a pair of one head and one body.
// The XHTML head contains little information that should be retained.
// The XHTML body is a nesting structure.

// XhtmlHead holds what is kept from the <head> element: only its <title>
// child is mapped.
type XhtmlHead struct {
	// Title maps to the <title> child element.
	Title string `xml:"title"`
}

// XhtmlBody holds what is kept from the <body> element: an <h3> child used
// as the title and a <div> child carrying the content.
type XhtmlBody struct {
	// Title maps to the <h3> child element.
	Title string `xml:"h3"`
	// Division maps to the <div> child element.
	Division XhtmlDivision `xml:"div"`
}

// Xhtml is the top-level structure an XHTML document is unmarshaled into:
// one head and one body under an <html> root.
type Xhtml struct {
	// XMLName names the expected root element, "html".
	XMLName xml.Name  `xml:"html"`
	// Head maps to the <head> child element.
	Head    XhtmlHead `xml:"head"`
	// Body maps to the <body> child element.
	Body    XhtmlBody `xml:"body"`
}

// normalize_division flattens an XHTML division into raw XHTML bytes.
//
// The division's own paragraphs and blockquotes come first, arranged by
// ascending Order(). The output of each nested division follows, in slice
// order, produced by a recursive call. A node that cannot be marshaled is
// reported on standard output and left out of the result.
func normalize_division(div XhtmlDivision) []byte {
	// Collect paragraphs first, then blockquotes, behind the TextNode interface.
	nodes := make([]TextNode, 0, len(div.Paragraphs)+len(div.BlockQuotes))
	for i := range div.Paragraphs {
		nodes = append(nodes, div.Paragraphs[i])
	}
	for i := range div.BlockQuotes {
		nodes = append(nodes, div.BlockQuotes[i])
	}

	// Put both kinds of node back into one sequence by their order value.
	byOrder := func(a, b int) bool {
		return nodes[a].Order() < nodes[b].Order()
	}
	sort.Slice(nodes, byOrder)

	var out []byte

	// Serialize each node; a failure is reported and that node is skipped.
	for _, node := range nodes {
		encoded, err := xml.Marshal(&node)
		if err != nil {
			fmt.Printf("error: %s\n", err)
			continue
		}
		out = append(out, encoded...)
	}

	// Nested divisions are appended after this division's own nodes.
	for i := range div.Divisions {
		nested := normalize_division(div.Divisions[i])
		out = append(out, nested...)
	}

	return out
}

// xhtml_to_html converts an XHTML structure into an HTML structure.
//
// The body title is copied across unchanged and the body division is
// flattened into raw XHTML by normalize_division. The result is a pointer to
// a newly allocated Html value.
func xhtml_to_html(source Xhtml) *Html {
	dest := &Html{}

	// Copy fields.
	// NOTE(review): only the body title is copied; source.Head.Title is not
	// carried over. Confirm against the Html type whether it should be.
	dest.Body.Title = source.Body.Title

	// Convert division
	dest.Body.Division.Content = normalize_division(source.Body.Division)

	return dest
}

// ParseXhtml unmarshals XHTML content and converts it into an Html value.
//
// An unmarshaling error is printed to standard output and is not returned;
// conversion then proceeds with the Xhtml value as Unmarshal left it.
func ParseXhtml(content string) *Html {
	var doc Xhtml

	// Unmarshal XHTML
	raw := []byte(content)
	err := xml.Unmarshal(raw, &doc)
	if err != nil {
		fmt.Printf("error: %s\n", err)
	}

	return xhtml_to_html(doc)
}

// ReadHtml parses XHTML content and returns the converted HTML as a string,
// marshaled with two-space indentation.
//
// A marshaling error is printed to standard output and is not returned; the
// string is then built from whatever bytes MarshalIndent handed back.
func ReadHtml(content string) string {
	html := ParseXhtml(content)

	out, err := xml.MarshalIndent(html, "", "  ")
	if err != nil {
		fmt.Printf("error: %s\n", err)
	}

	return string(out)
}