// Structs and functions for unmarshaling XHTML files in e-pub archives
package main
import (
"fmt"
"io"
"sort"
"encoding/xml"
)
// XhtmlDivision models an XHTML <div> element. A division can contain
// paragraphs, blockquotes, and further nested divisions.
type XhtmlDivision struct {
	// Divisions maps to nested <div> elements.
	Divisions []XhtmlDivision `xml:"div"`
	// Paragraphs maps to <p> elements.
	Paragraphs []Paragraph `xml:"p"`
	// BlockQuotes maps to <blockquote> elements.
	BlockQuotes []BlockQuote `xml:"blockquote"`
}
// UnmarshalXML implements xml.Unmarshaler for XhtmlDivision.
//
// It reads tokens from decoder until decoder.Token reports io.EOF and decodes
// every <p>, <blockquote> and <div> start element it encounters. Paragraphs
// and blockquotes are stamped with a shared, increasing order value (starting
// at 0) so that their relative document order can be restored after they have
// been stored in separate slices. Nested divisions are appended to Divisions
// and do not consume an order value. Start elements with any other name are
// not decoded themselves; the token walk simply continues with whatever
// follows them.
//
// The start parameter is not used. The first error returned by decoder.Token
// or decoder.DecodeElement stops the walk and is returned to the caller;
// DecodeElement errors are wrapped with the name of the element being decoded.
func (d *XhtmlDivision) UnmarshalXML(decoder *xml.Decoder, start xml.StartElement) error {
	counter := 0
	for {
		token, err := decoder.Token()
		if err == io.EOF {
			return nil
		}
		if err != nil {
			return err
		}

		// Only start elements are of interest; character data, end
		// elements, comments and the like are skipped.
		el, ok := token.(xml.StartElement)
		if !ok {
			continue
		}

		switch el.Name.Local {
		case "p":
			var target Paragraph
			if err := decoder.DecodeElement(&target, &el); err != nil {
				return fmt.Errorf("decoding <p>: %w", err)
			}
			target.order = counter
			counter++
			d.Paragraphs = append(d.Paragraphs, target)
		case "blockquote":
			var target BlockQuote
			if err := decoder.DecodeElement(&target, &el); err != nil {
				return fmt.Errorf("decoding <blockquote>: %w", err)
			}
			target.order = counter
			counter++
			d.BlockQuotes = append(d.BlockQuotes, target)
		case "div":
			// Decoding into an XhtmlDivision re-enters this method for
			// the nested element.
			var target XhtmlDivision
			if err := decoder.DecodeElement(&target, &el); err != nil {
				return fmt.Errorf("decoding <div>: %w", err)
			}
			d.Divisions = append(d.Divisions, target)
		}
	}
}
// XhtmlHead models the <head> element of an XHTML document.
//
// An XHTML document should be a pair of one head and one body. The head
// contains little information that should be retained, so only its title is
// kept; the body, by contrast, is a nesting structure.
type XhtmlHead struct {
	// Title maps to the <title> element.
	Title string `xml:"title"`
}
// XhtmlBody models the <body> element of an XHTML document.
type XhtmlBody struct {
	// Title maps to the <h3> element.
	Title string `xml:"h3"`
	// Division maps to the <div> element; its content is decoded by
	// XhtmlDivision.UnmarshalXML.
	Division XhtmlDivision `xml:"div"`
}
// Xhtml models a whole XHTML document: an <html> root element holding one
// head and one body.
type Xhtml struct {
	// XMLName ties this struct to the <html> element name.
	XMLName xml.Name `xml:"html"`
	// Head maps to the <head> element.
	Head XhtmlHead `xml:"head"`
	// Body maps to the <body> element.
	Body XhtmlBody `xml:"body"`
}
// normalize_division flattens an XHTML division into raw XHTML bytes.
//
// The division's paragraphs and blockquotes are merged into a single list,
// sorted by ascending Order(), and marshaled one after another. The output of
// every nested division is then appended, produced recursively in slice
// order. A node that fails to marshal is reported on standard output and
// left out of the result.
func normalize_division(div XhtmlDivision) []byte {
	// Gather both node kinds into one slice so they can be interleaved.
	merged := make([]TextNode, 0, len(div.Paragraphs)+len(div.BlockQuotes))
	for i := range div.Paragraphs {
		merged = append(merged, div.Paragraphs[i])
	}
	for i := range div.BlockQuotes {
		merged = append(merged, div.BlockQuotes[i])
	}

	// Restore the interleaving of paragraphs and blockquotes.
	sort.Slice(merged, func(a, b int) bool {
		return merged[a].Order() < merged[b].Order()
	})

	// Serialize the ordered nodes back to back.
	var out []byte
	for i := range merged {
		encoded, err := xml.Marshal(&merged[i])
		if err != nil {
			fmt.Printf("error: %s\n", err)
			continue
		}
		out = append(out, encoded...)
	}

	// Nested divisions follow this division's own nodes.
	for i := range div.Divisions {
		out = append(out, normalize_division(div.Divisions[i])...)
	}
	return out
}
// xhtml_to_html converts a parsed XHTML structure into a newly allocated
// Html structure.
//
// The body title is copied as is, and the body division is flattened into raw
// XHTML by normalize_division and stored as the destination division's
// content.
func xhtml_to_html(source Xhtml) *Html {
	dest := &Html{}

	// Copy fields.
	// NOTE(review): source.Head.Title is parsed but never copied to dest.
	// Presumably it should be carried over as well; confirm against the Html
	// type, which is declared outside this part of the file.
	dest.Body.Title = source.Body.Title

	// Convert division.
	dest.Body.Division.Content = normalize_division(source.Body.Division)
	return dest
}
// ParseXhtml unmarshals the XHTML document held in content and converts the
// result into an Html structure.
//
// An unmarshaling error is printed to standard output; the conversion is
// still performed on the Xhtml value as xml.Unmarshal left it.
func ParseXhtml(content string) *Html {
	var doc Xhtml
	err := xml.Unmarshal([]byte(content), &doc)
	if err != nil {
		fmt.Printf("error: %s\n", err)
	}
	return xhtml_to_html(doc)
}
// ReadHtml reads HTML from an XHTML file: it parses content with ParseXhtml
// and returns the resulting structure marshaled as XML, indented with a
// single space per nesting level.
//
// A marshaling error is printed to standard output; the bytes returned by
// xml.MarshalIndent are converted to the result string either way.
func ReadHtml(content string) string {
	parsed := ParseXhtml(content)
	buffer, err := xml.MarshalIndent(parsed, "", " ")
	if err != nil {
		fmt.Printf("error: %s\n", err)
	}
	return string(buffer)
}