~dricottone/epub2html

epub2html/xhtml.go -rw-r--r-- 3.3 KiB
ae806b41Dominic Ricottone Fixing blockquotes and refactoring 2 years ago
                                                                                
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
// Structs and functions for unmarshaling XHTML files in e-pub archives

package main

import (
	"fmt"
	"io"
	"sort"
	"encoding/xml"
)

// An XhtmlDivision can contain paragraphs, blockquotes, and further nested
// XHTML divisions.
//
// The three slices are filled by the UnmarshalXML method defined on
// *XhtmlDivision, which keys on the element names "div", "p", and
// "blockquote".
type XhtmlDivision struct {
	Divisions   []XhtmlDivision `xml:"div"`        // nested <div> elements
	Paragraphs  []Paragraph     `xml:"p"`          // <p> elements
	BlockQuotes []BlockQuote    `xml:"blockquote"` // <blockquote> elements
}

// UnmarshalXML fills the division from the decoder's token stream.
//
// Every start tag named "p", "blockquote", or "div" that the loop encounters
// is decoded as a whole element and appended to Paragraphs, BlockQuotes, or
// Divisions respectively. Start tags with any other name are skipped, as are
// all non-start tokens. Paragraphs and blockquotes share one running counter,
// stored in their order field, so that their relative document order can be
// restored after they have been split into separate slices; nested divisions
// are not numbered.
//
// The start parameter is not used. The method returns nil once decoder.Token
// reports io.EOF, and returns the error unchanged when reading a token or
// decoding a child element fails.
func (d *XhtmlDivision) UnmarshalXML(decoder *xml.Decoder, start xml.StartElement) error {
	counter := 0

	for {
		token, err := decoder.Token()
		if err == io.EOF {
			return nil
		}
		if err != nil {
			return err
		}

		// Only start tags introduce something to decode.
		element, ok := token.(xml.StartElement)
		if !ok {
			continue
		}

		switch element.Name.Local {
		case "p":
			target := Paragraph{}
			if err := decoder.DecodeElement(&target, &element); err != nil {
				return err
			}

			target.order = counter
			counter++

			d.Paragraphs = append(d.Paragraphs, target)
		case "blockquote":
			target := BlockQuote{}
			if err := decoder.DecodeElement(&target, &element); err != nil {
				return err
			}

			target.order = counter
			counter++

			d.BlockQuotes = append(d.BlockQuotes, target)
		case "div":
			target := XhtmlDivision{}
			if err := decoder.DecodeElement(&target, &element); err != nil {
				return err
			}

			d.Divisions = append(d.Divisions, target)
		}
	}
}

// The XHTML structure should be a pair of one head and one body.
// The XHTML head contains little information that should be retained.
// The XHTML body is a nesting structure.

// XhtmlHead is the <head> of an XHTML document; only the title is kept.
type XhtmlHead struct {
	Title string `xml:"title"` // text of the <title> element
}

// XhtmlBody is the <body> of an XHTML document: a title read from an <h3>
// element and a division read from a <div> element.
type XhtmlBody struct {
	Title    string        `xml:"h3"`
	Division XhtmlDivision `xml:"div"`
}

// Xhtml is a whole XHTML document: an <html> element holding one head and
// one body.
type Xhtml struct {
	XMLName xml.Name  `xml:"html"` // element name of the document root
	Head    XhtmlHead `xml:"head"` // the <head> element
	Body    XhtmlBody `xml:"body"` // the <body> element
}

// normalize_division flattens an XHTML division into raw XHTML bytes.
//
// The division's paragraphs and blockquotes are merged into a single list,
// sorted by their Order() value, and marshaled one after another. The output
// of every nested division follows, in slice order, produced by recursion.
// A node that fails to marshal is reported on standard output and skipped.
func normalize_division(div XhtmlDivision) []byte {
	// Gather paragraphs first, then blockquotes, into one list of nodes.
	var nodes []TextNode
	for i := range div.Paragraphs {
		nodes = append(nodes, div.Paragraphs[i])
	}
	for i := range div.BlockQuotes {
		nodes = append(nodes, div.BlockQuotes[i])
	}

	// Bring the merged list into ascending order.
	byOrder := func(a, b int) bool {
		return nodes[a].Order() < nodes[b].Order()
	}
	sort.Slice(nodes, byOrder)

	// Marshal each node and concatenate the results.
	var out []byte
	for i := range nodes {
		encoded, err := xml.Marshal(&nodes[i])
		if err != nil {
			fmt.Printf("error: %s\n", err)
			continue
		}
		out = append(out, encoded...)
	}

	// Nested divisions come after this division's own nodes.
	for i := range div.Divisions {
		nested := normalize_division(div.Divisions[i])
		out = append(out, nested...)
	}

	return out
}

// xhtml_to_html converts an XHTML structure into an HTML structure.
//
// The body title is copied across, and the body division is flattened into
// raw XHTML bytes by normalize_division and stored as the content of the
// destination body division. A pointer to the newly built Html is returned.
func xhtml_to_html(source Xhtml) *Html {
	dest := &Html{}

	// Copy fields.
	// NOTE(review): source.Head.Title is not carried over. If Html has a
	// matching head title field it should probably be copied here as well;
	// confirm against the Html definition.
	dest.Body.Title = source.Body.Title

	// Convert division
	dest.Body.Division.Content = normalize_division(source.Body.Division)

	return dest
}

// ParseXhtml parses the text of an XHTML file and converts it to an Html
// structure.
//
// An unmarshaling error is printed to standard output and does not stop the
// conversion: whatever was decoded into the Xhtml value is still passed on to
// xhtml_to_html.
func ParseXhtml(content string) *Html {
	var parsed Xhtml

	raw := []byte(content)
	err := xml.Unmarshal(raw, &parsed)
	if err != nil {
		fmt.Printf("error: %s\n", err)
	}

	return xhtml_to_html(parsed)
}

// ReadHtml parses the text of an XHTML file and returns the converted
// document as XML text indented by two spaces per level.
//
// A marshaling error is printed to standard output; the buffer returned by
// xml.MarshalIndent is converted to a string and returned either way.
func ReadHtml(content string) string {
	html := ParseXhtml(content)

	rendered, err := xml.MarshalIndent(html, "", "  ")
	if err != nil {
		fmt.Printf("error: %s\n", err)
	}

	return string(rendered)
}