A epub.go => epub.go +48 -0
@@ -0,0 +1,48 @@
+// Functions for handling e-pub archives
+
+package main
+
+import (
+ "io"
+ "fmt"
+ "strings"
+ "archive/zip"
+)
+
+func ReadArchive(filename string) (map[string]string, error) {
+ // Open archive
+ archive_reader, err := zip.OpenReader(filename)
+ if err != nil {
+ return nil, err
+ }
+ defer archive_reader.Close()
+
+ var archive = map[string]string{}
+
+ // Loop over files in archive
+ for _, file := range archive_reader.File {
+
+ // Skip these less useful files
+ if (file.Name == "mimetype" || file.Name == "content.opf" || strings.HasPrefix(file.Name, "META-INF") || strings.HasSuffix(file.Name, ".css")) {
+ continue
+ }
+
+ // Open file, skipping entries that fail to open
+ file_reader, err := file.Open()
+ if err != nil {
+ fmt.Printf("error: %s\n", err)
+ continue
+ }
+ // Copy file contents into a string builder
+ buffer := new(strings.Builder)
+ if _, err := io.Copy(buffer, file_reader); err != nil {
+ fmt.Printf("error: %s\n", err)
+ }
+
+ // Store final string mapped by the file name
+ archive[file.Name] = buffer.String()
+ }
+
+ return archive, nil
+}
+
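For reference, a minimal sketch of how ReadArchive might be exercised on its own, assuming it is built alongside the rest of package main; the "book.epub" path and the listArchive helper are illustrative only, not part of this change.

    // Sketch: list the entries ReadArchive keeps for an e-pub.
    func listArchive() {
        archive, err := ReadArchive("book.epub") // illustrative path
        if err != nil {
            fmt.Printf("error: %s\n", err)
            return
        }
        for name, contents := range archive {
            fmt.Printf("%s: %d bytes\n", name, len(contents))
        }
    }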
A html.go => html.go +31 -0
@@ -0,0 +1,31 @@
+// Structs for marshaling HTML files
+
+package main
+
+import (
+ "encoding/xml"
+)
+
+// Unlike the nested XhtmlDivision, this division holds its content as raw XHTML.
+type HtmlDivision struct {
+ Content []byte `xml:",innerxml"`
+}
+
+// The HTML structure should be a pair of one head and one body.
+// An HTML head mirrors the XHTML head.
+// An HTML body is much simpler than the XHTML body.
+type HtmlBody struct {
+ Title string `xml:"h3"`
+ Division HtmlDivision `xml:"div"`
+}
+
+type HtmlHead struct {
+ Title string `xml:"title"`
+}
+
+type Html struct {
+ XMLName xml.Name `xml:"html"`
+ Head HtmlHead `xml:"head"`
+ Body HtmlBody `xml:"body"`
+}
+
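As a quick illustration of the layout these structs produce (a sketch, assuming fmt is also in scope), marshaling a hand-built Html value shows the head title, body title, and raw division content in place:

    page := Html{
        Head: HtmlHead{Title: "Chapter 1"},
        Body: HtmlBody{
            Title:    "Chapter 1",
            Division: HtmlDivision{Content: []byte("<p>Hello.</p>")},
        },
    }
    out, err := xml.MarshalIndent(&page, "", "  ")
    if err != nil {
        fmt.Printf("error: %s\n", err)
    }
    // The ,innerxml field is written verbatim inside the <div> element.
    fmt.Println(string(out))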
M main.go => main.go +8 -90
@@ -1,104 +1,22 @@
package main
import (
- "io"
"fmt"
- "strings"
- "sort"
- "archive/zip"
- "encoding/xml"
)
-// e-pub XHTML features arbitrary nesting of divisions. To strip the excess
-// div tags, we need to recursively extract paragraphs from divisions.
-// Recommended usage:
-// Xhtml.Body.Division.Paragraphs = normalize_division(Xhtml.Body.Division)
-// Xhtml.Body.Division.Divisions = []Division{}
-func normalize_division(div Division) []Paragraph {
- // If div contains p tags, return those
- if (len(div.Paragraphs) != 0) {
- return div.Paragraphs
- }
-
- var pars []Paragraph
-
- // If div contains blockquote tags, return the nested p tags
- if (len(div.BlockQuotes) != 0) {
- for _, quote := range div.BlockQuotes {
- for _, par := range quote.Paragraphs {
- pars = append(pars, par)
- }
- }
- return pars
- }
-
- // Else recurse on nested div tags
- for _, nested_div := range div.Divisions {
- pars = append(pars, normalize_division(nested_div)...)
- }
- return pars
-}
-
-func dump_archive(filename string) error {
- // Open archive
- areader, err := zip.OpenReader(filename)
+func dump_archive(archive_name string) error {
+ // Read the archive
+ archive, err := ReadArchive(archive_name)
if err != nil {
return err
}
- defer areader.Close()
-
- // Loop over files in archive
- for _, file := range areader.File {
-
- // Skip these less useful files
- if (file.Name == "mimetype" || strings.HasPrefix(file.Name, "META-INF") || strings.HasSuffix(file.Name, ".css") || file.Name == "content.opf") {
- fmt.Printf("Skipping %s...\n", file.Name)
- continue
- }
-
- // Open file and copy into a string builder
- fmt.Printf("Contents of %s:\n", file.Name)
- freader, err := file.Open()
- if err != nil {
- fmt.Printf("error: %s\n", err)
- }
- buffer := new(strings.Builder)
- if _, err := io.Copy(buffer, freader); err != nil {
- fmt.Printf("error: %s\n", err)
- }
-
- if (file.Name == "toc.ncx") {
- target := Ncx{}
- if err = xml.Unmarshal([]byte(buffer.String()), &target); err != nil {
- fmt.Printf("error: %s\n", err)
- }
-
- fmt.Println(target.Title)
-
- sort.Slice(target.NavPoints, func(i, j int) bool {
- return target.NavPoints[i].Order < target.NavPoints[j].Order
- })
-
- html, err := xml.MarshalIndent(&target, "", " ")
- if err != nil {
- fmt.Printf("error: %s\n", err)
- }
- fmt.Println(string(html))
- } else {
- target := Xhtml{}
- if err = xml.Unmarshal([]byte(buffer.String()), &target); err != nil {
- fmt.Printf("error: %s\n", err)
- }
- target.Body.Division.Paragraphs = normalize_division(target.Body.Division)
- target.Body.Division.Divisions = []Division{}
+ // Get a sorted table of contents
+ toc := ReadTableOfContents(archive["toc.ncx"])
- html, err := xml.MarshalIndent(&target, "", " ")
- if err != nil {
- fmt.Printf("error: %s\n", err)
- }
- fmt.Println(string(html))
- }
+ // Print files according to the table of contents
+ for _, file_name := range toc {
+ fmt.Println(ReadHtml(archive[file_name]))
}
return nil
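The remainder of main.go sits outside this hunk. Purely as an illustration of how dump_archive could be driven (assuming "os" were imported; the run helper and the usage string are made up for this sketch):

    // Sketch: pass an e-pub path on the command line.
    func run() {
        if len(os.Args) < 2 {
            fmt.Println("usage: epub-dump <file.epub>")
            return
        }
        if err := dump_archive(os.Args[1]); err != nil {
            fmt.Printf("error: %s\n", err)
        }
    }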
A ncx.go => ncx.go +65 -0
@@ -0,0 +1,65 @@
+// Structs and functions for unmarshaling NCX files in e-pub archives
+
+package main
+
+import (
+ "fmt"
+ "encoding/xml"
+ "net/url"
+ "sort"
+)
+
+// A URI for content
+type Content struct {
+ Src string `xml:"src,attr"`
+}
+
+// A content label and URI pair
+type NavPoint struct {
+ Label string `xml:"navLabel>text"`
+ Content Content `xml:"content"`
+ Order int `xml:"playOrder,attr"`
+}
+
+// A series of navigation points
+type Ncx struct {
+ XMLName xml.Name `xml:"ncx"`
+ Title string `xml:"docTitle>text"`
+ NavPoints []NavPoint `xml:"navMap>navPoint"`
+}
+
+// Parse an NCX file
+func ParseNcx(content string) *Ncx {
+ dest := &Ncx{}
+
+ // Unmarshal the NCX document
+ if err := xml.Unmarshal([]byte(content), dest); err != nil {
+ fmt.Printf("error: %s\n", err)
+ }
+
+ // Sort navigation points
+ sort.Slice(dest.NavPoints, func(i, j int) bool {
+ return dest.NavPoints[i].Order < dest.NavPoints[j].Order
+ })
+
+ return dest
+}
+
+// Build a table of contents from an NCX file
+func ReadTableOfContents(content string) []string {
+ ncx := ParseNcx(content)
+
+ var toc []string
+
+ for _, point := range ncx.NavPoints {
+ src, err := url.QueryUnescape(point.Content.Src)
+ if err != nil {
+ fmt.Printf("error: %s\n", err)
+ }
+
+ toc = append(toc, src)
+ }
+
+ return toc
+}
+
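A small worked example of the parsing above (the inline NCX document is made up for this sketch): ReadTableOfContents returns the src values unescaped and ordered by playOrder.

    const sampleNcx = `<ncx>
      <docTitle><text>Sample</text></docTitle>
      <navMap>
        <navPoint playOrder="2"><navLabel><text>Two</text></navLabel><content src="ch%202.xhtml"/></navPoint>
        <navPoint playOrder="1"><navLabel><text>One</text></navLabel><content src="ch%201.xhtml"/></navPoint>
      </navMap>
    </ncx>`

    // Prints [ch 1.xhtml ch 2.xhtml]: sorted by playOrder, with %20 unescaped.
    fmt.Println(ReadTableOfContents(sampleNcx))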
A textnodes.go => textnodes.go +37 -0
@@ -0,0 +1,37 @@
+// Structs for handling text nodes
+
+package main
+
+import (
+ "encoding/xml"
+)
+
+// Text nodes are either a paragraph or a blockquote.
+type TextNode interface {
+ Order() int
+}
+
+// A paragraph contains character data and other text-oriented tags, such as
+// b or strong. This data should be retained as raw XML.
+type Paragraph struct {
+ XMLName xml.Name `xml:"p"`
+ Text string `xml:",innerxml"`
+ order int `xml:"-"`
+}
+
+func (p Paragraph) Order() int {
+ return p.order
+}
+
+// A blockquote contains paragraphs. Unlike divisions, the blockquote structure
+// must be maintained in order to format the paragraphs correctly.
+type BlockQuote struct {
+ XMLName xml.Name `xml:"blockquote"`
+ Paragraphs []Paragraph `xml:"p"`
+ order int `xml:"-"`
+}
+
+func (b BlockQuote) Order() int {
+ return b.order
+}
+
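For reference (a sketch, assuming encoding/xml and fmt are in scope), the ,innerxml tag is what keeps inline markup inside a paragraph intact:

    var par Paragraph
    if err := xml.Unmarshal([]byte("<p>Some <b>bold</b> text</p>"), &par); err != nil {
        fmt.Printf("error: %s\n", err)
    }
    fmt.Println(par.Text) // Some <b>bold</b> text

The unexported order field is only populated by XhtmlDivision.UnmarshalXML in xhtml.go below, which is why it carries no XML mapping of its own.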
A xhtml.go => xhtml.go +155 -0
@@ -0,0 +1,155 @@
+// Structs and functions for unmarshaling XHTML files in e-pub archives
+
+package main
+
+import (
+ "fmt"
+ "io"
+ "sort"
+ "encoding/xml"
+)
+
+// An XHTML division can contain paragraphs, blockquotes, and further nested
+// XHTML divisions.
+type XhtmlDivision struct {
+ Divisions []XhtmlDivision `xml:"div"`
+ Paragraphs []Paragraph `xml:"p"`
+ BlockQuotes []BlockQuote `xml:"blockquote"`
+}
+
+func (d *XhtmlDivision) UnmarshalXML(decoder *xml.Decoder, start xml.StartElement) error {
+ counter := 0
+
+ for {
+ token, err := decoder.Token()
+ if err == io.EOF {
+ break
+ }
+ if err != nil {
+ return err
+ }
+
+ switch token.(type) {
+ case xml.StartElement:
+ new_start := token.(xml.StartElement)
+ if (new_start.Name.Local == "p") {
+ target := Paragraph{}
+ decoder.DecodeElement(&target, &new_start)
+
+ target.order = counter
+ counter += 1
+
+ d.Paragraphs = append(d.Paragraphs, target)
+ } else if (new_start.Name.Local == "blockquote") {
+ target := BlockQuote{}
+ decoder.DecodeElement(&target, &new_start)
+
+ target.order = counter
+ counter += 1
+
+ d.BlockQuotes = append(d.BlockQuotes, target)
+ } else if (new_start.Name.Local == "div") {
+ target := XhtmlDivision{}
+ decoder.DecodeElement(&target, &new_start)
+
+ d.Divisions = append(d.Divisions, target)
+ }
+ }
+ }
+
+ return nil
+}
+
+// The XHTML structure should be a pair of one head and one body.
+// The XHTML head contains little information that should be retained.
+// The XHTML body is a nesting structure.
+type XhtmlHead struct {
+ Title string `xml:"title"`
+}
+
+type XhtmlBody struct {
+ Title string `xml:"h3"`
+ Division XhtmlDivision `xml:"div"`
+}
+
+type Xhtml struct {
+ XMLName xml.Name `xml:"html"`
+ Head XhtmlHead `xml:"head"`
+ Body XhtmlBody `xml:"body"`
+}
+
+// Normalize an XHTML division into raw XHTML
+func normalize_division(div XhtmlDivision) []byte {
+ var nodes []TextNode
+
+ // Pull all paragraphs into nodes
+ for _, par := range div.Paragraphs {
+ nodes = append(nodes, par)
+ }
+
+ // Pull all blockquotes into nodes
+ for _, quote := range div.BlockQuotes {
+ nodes = append(nodes, quote)
+ }
+
+ // Sort paragraphs and blockquotes
+ sort.Slice(nodes, func(i, j int) bool {
+ return nodes[i].Order() < nodes[j].Order()
+ })
+
+ var xhtml []byte
+
+ // Convert nodes into raw XHTML
+ for _, node := range nodes {
+ xhtml_node, err := xml.Marshal(&node)
+ if err != nil {
+ fmt.Printf("error: %s\n", err)
+ } else {
+ xhtml = append(xhtml, xhtml_node...)
+ }
+ }
+
+ // Recurse with all nested divisions
+ for _, nested_div := range div.Divisions {
+ xhtml = append(xhtml, normalize_division(nested_div)...)
+ }
+
+ return xhtml
+}
+
+// Convert an XHTML structure into an HTML structure
+func xhtml_to_html(source Xhtml) *Html {
+ dest := &Html{}
+
+ // Copy fields
+ dest.Head.Title = source.Head.Title
+ dest.Body.Title = source.Body.Title
+
+ // Convert division
+ dest.Body.Division.Content = normalize_division(source.Body.Division)
+
+ return dest
+}
+
+// Parse an XHTML file
+func ParseXhtml(content string) *Html {
+ xhtml := Xhtml{}
+
+ // Unmarshal XHTML
+ if err := xml.Unmarshal([]byte(content), &xhtml); err != nil {
+ fmt.Printf("error: %s\n", err)
+ }
+
+
+ return xhtml_to_html(xhtml)
+}
+
+// Read HTML from an XHTML file
+func ReadHtml(content string) string {
+ buffer, err := xml.MarshalIndent(ParseXhtml(content), "", " ")
+ if err != nil {
+ fmt.Printf("error: %s\n", err)
+ }
+ return string(buffer)
+}
+
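End to end, a sketch of what ReadHtml produces for a nested document (the inline XHTML below is made up): the inner division's paragraph and blockquote are flattened, in document order, into the single raw div of the output.

    const sampleXhtml = `<html><head><title>Ch. 1</title></head>
    <body><h3>Ch. 1</h3><div><div>
    <p>First.</p><blockquote><p>Quoted.</p></blockquote>
    </div></div></body></html>`

    // Prints an <html> document whose single <div> holds <p>First.</p>
    // followed by <blockquote><p>Quoted.</p></blockquote>.
    fmt.Println(ReadHtml(sampleXhtml))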
D xml.go => xml.go +0 -99
@@ -1,99 +0,0 @@
-// Structs for parsing X(HT)?ML files in e-pub archives
-
-package main
-
-import (
- "io"
- "encoding/xml"
-)
-
-type Head struct {
- Title string `xml:"title"`
-}
-
-type Paragraph struct {
- Text string `xml:",innerxml"`
- Order int `xml:"-"`
-}
-
-type BlockQuote struct {
- Paragraphs []Paragraph `xml:"p"`
- Order int `xml:"-"`
-}
-
-type Division struct {
- Divisions []Division `xml:"div"`
- Paragraphs []Paragraph `xml:"p"`
- BlockQuotes []BlockQuote `xml:"blockquote"`
-}
-
-func (d *Division) UnmarshalXML(decoder *xml.Decoder, start xml.StartElement) error {
- counter := 0
-
- for {
- token, err := decoder.Token()
- if err == io.EOF {
- break
- }
- if err != nil {
- return err
- }
-
- switch token.(type) {
- case xml.StartElement:
- new_start := token.(xml.StartElement)
- if (new_start.Name.Local == "p") {
- target := Paragraph{}
- decoder.DecodeElement(&target, &new_start)
-
- target.Order = counter
- counter += 1
-
- d.Paragraphs = append(d.Paragraphs, target)
- } else if (new_start.Name.Local == "blockquote") {
- target := BlockQuote{}
- decoder.DecodeElement(&target, &new_start)
-
- target.Order = counter
- counter += 1
-
- d.BlockQuotes = append(d.BlockQuotes, target)
- } else if (new_start.Name.Local == "div") {
- target := Division{}
- decoder.DecodeElement(&target, &new_start)
-
- d.Divisions = append(d.Divisions, target)
- }
- }
- }
-
- return nil
-}
-
-type Body struct {
- Title string `xml:"h3"`
- Division Division `xml:"div"`
-}
-
-type Xhtml struct {
- XMLName xml.Name `xml:"html"`
- Head Head `xml:"head"`
- Body Body `xml:"body"`
-}
-
-type Content struct {
- Src string `xml:"src,attr"`
-}
-
-type NavPoint struct {
- Label string `xml:"navLabel>text"`
- Content Content `xml:"content"`
- Order int `xml:"playOrder,attr"`
-}
-
-type Ncx struct {
- XMLName xml.Name `xml:"ncx"`
- Title string `xml:"docTitle>text"`
- NavPoints []NavPoint `xml:"navMap>navPoint"`
-}
-