~dricottone/epub2html: Initial commit - dominic-ricottone.com git

3 files changed, 170 insertions(+), 0 deletions(-)

A go.mod
A main.go
A xml.go

A  => go.mod +3 -0

@@ 1,3 @@
+module git.dominic-ricottone.com/~dricottone/epub2html
+
+go 1.19

A  => main.go +114 -0

@@ 1,114 @@
+package main
+
+import (
+	"archive/zip"
+	"encoding/xml"
+	"fmt"
+	"io"
+	"os"
+	"sort"
+	"strings"
+)
+
+// normalize_division flattens one division into a plain list of paragraphs.
+// e-pub XHTML nests div tags arbitrarily deep, so the excess wrappers are
+// stripped by walking the tree recursively.
+//
+// Exactly one source is used per division, checked in this order:
+//   1. the division's own p tags, returned as-is;
+//   2. the p tags nested inside its blockquote tags;
+//   3. the result of recursing into its nested div tags.
+//
+// Recommended usage:
+//   Xhtml.Body.Division.Paragraphs = normalize_division(Xhtml.Body.Division)
+//   Xhtml.Body.Division.Divisions = []Division{}
+func normalize_division(div Division) []Paragraph {
+	switch {
+	case len(div.Paragraphs) > 0:
+		// Direct paragraphs win; nothing else in this division is inspected.
+		return div.Paragraphs
+
+	case len(div.BlockQuotes) > 0:
+		// Lift the paragraphs out of every blockquote, in order.
+		var lifted []Paragraph
+		for i := range div.BlockQuotes {
+			lifted = append(lifted, div.BlockQuotes[i].Paragraphs...)
+		}
+		return lifted
+	}
+
+	// Only wrapper divs remain: flatten each child and concatenate.
+	var flattened []Paragraph
+	for i := range div.Divisions {
+		flattened = append(flattened, normalize_division(div.Divisions[i])...)
+	}
+	return flattened
+}
+
+// dump_archive opens the e-pub (zip) archive at filename and prints a
+// cleaned-up rendering of each entry to standard output.
+//
+// Entries named "mimetype" or "content.opf", entries whose name starts with
+// "META-INF", and entries ending in ".css" are skipped. The entry named
+// "toc.ncx" is handled by dump_ncx; every other entry is handled by
+// dump_xhtml.
+//
+// Processing is best-effort: a failure on one entry is reported on standard
+// output and the loop moves on to the next entry. Only a failure to open the
+// archive itself is returned to the caller.
+func dump_archive(filename string) error {
+	// Open archive
+	areader, err := zip.OpenReader(filename)
+	if err != nil {
+		return err
+	}
+	defer areader.Close()
+
+	// Loop over files in archive
+	for _, file := range areader.File {
+
+		// Skip these less useful files
+		if file.Name == "mimetype" || strings.HasPrefix(file.Name, "META-INF") || strings.HasSuffix(file.Name, ".css") || file.Name == "content.opf" {
+			fmt.Printf("Skipping %s...\n", file.Name)
+			continue
+		}
+
+		fmt.Printf("Contents of %s:\n", file.Name)
+		data, err := read_entry(file)
+		if err != nil {
+			// Nothing usable was read; do not try to parse it.
+			fmt.Printf("error: %s\n", err)
+			continue
+		}
+
+		if file.Name == "toc.ncx" {
+			err = dump_ncx(data)
+		} else {
+			err = dump_xhtml(data)
+		}
+		if err != nil {
+			fmt.Printf("error: %s\n", err)
+		}
+	}
+
+	return nil
+}
+
+// read_entry returns the full decompressed contents of one archive entry.
+// The entry's reader is closed before returning, so a long archive does not
+// accumulate open readers.
+func read_entry(file *zip.File) ([]byte, error) {
+	freader, err := file.Open()
+	if err != nil {
+		return nil, err
+	}
+	defer freader.Close()
+
+	return io.ReadAll(freader)
+}
+
+// dump_ncx parses data as an Ncx document, prints its title, sorts its nav
+// points by play order, and prints the document re-marshalled with two-space
+// indentation. Nothing is printed if parsing fails.
+func dump_ncx(data []byte) error {
+	target := Ncx{}
+	if err := xml.Unmarshal(data, &target); err != nil {
+		return err
+	}
+
+	fmt.Println(target.Title)
+
+	sort.Slice(target.NavPoints, func(i, j int) bool {
+		return target.NavPoints[i].Order < target.NavPoints[j].Order
+	})
+
+	html, err := xml.MarshalIndent(&target, "", "  ")
+	if err != nil {
+		return err
+	}
+	fmt.Println(string(html))
+	return nil
+}
+
+// dump_xhtml parses data as an Xhtml document, replaces the body's nested
+// divisions with the flat paragraph list from normalize_division, and prints
+// the document re-marshalled with two-space indentation. Nothing is printed
+// if parsing fails.
+func dump_xhtml(data []byte) error {
+	target := Xhtml{}
+	if err := xml.Unmarshal(data, &target); err != nil {
+		return err
+	}
+
+	target.Body.Division.Paragraphs = normalize_division(target.Body.Division)
+	target.Body.Division.Divisions = []Division{}
+
+	html, err := xml.MarshalIndent(&target, "", "  ")
+	if err != nil {
+		return err
+	}
+	fmt.Println(string(html))
+	return nil
+}
+
+// main dumps the e-pub archive named by the first command-line argument,
+// falling back to "the_future_is_female.epub" when no argument is given.
+// It exits with status 1 if dump_archive returns an error.
+func main() {
+	// process arguments
+	filename := "the_future_is_female.epub"
+	if len(os.Args) > 1 {
+		filename = os.Args[1]
+	}
+
+	if err := dump_archive(filename); err != nil {
+		fmt.Printf("fatal error: %s\n", err)
+		// Signal the failure to the calling shell instead of exiting 0.
+		os.Exit(1)
+	}
+}
+

A  => xml.go +53 -0

@@ 1,53 @@
+// Structs for parsing X(HT)?ML files in e-pub archives
+
+package main
+
+import (
+	"encoding/xml"
+)
+
+// Head maps the head element of an XHTML document. Only the title child
+// element is mapped to a field.
+type Head struct {
+	Title string `xml:"title"`
+}
+
+// Paragraph holds the raw inner XML of a p element (the ",innerxml" tag), so
+// whatever markup sits inside the paragraph is kept as text, not parsed.
+type Paragraph struct {
+	Text string `xml:",innerxml"`
+}
+
+// BlockQuote maps a blockquote element. Only its p children are mapped.
+type BlockQuote struct {
+	Paragraphs  []Paragraph  `xml:"p"`
+}
+
+// Division maps a div element and its div, p, and blockquote children. The
+// type is recursive because divisions may nest; normalize_division flattens
+// such a tree into a list of paragraphs.
+type Division struct {
+	Divisions   []Division   `xml:"div"`
+	Paragraphs  []Paragraph  `xml:"p"`
+	BlockQuotes []BlockQuote `xml:"blockquote"`
+}
+
+// Body maps the body element of an XHTML document: Title is taken from the
+// h3 child and Division from the div child.
+// NOTE(review): Division is a single value rather than a slice; confirm how
+// documents with several top-level div elements are meant to be handled.
+type Body struct {
+	Title    string   `xml:"h3"`
+	Division Division `xml:"div"`
+}
+
+// Xhtml maps the root html element of an XHTML document, with its head and
+// body children.
+type Xhtml struct {
+	XMLName xml.Name `xml:"html"`
+	Head    Head     `xml:"head"`
+	Body    Body     `xml:"body"`
+}
+
+// Content maps a content element; Src is its src attribute.
+type Content struct {
+	Src string `xml:"src,attr"`
+}
+
+// NavPoint maps one navPoint element: Label is the text under navLabel,
+// Content is the content child, and Order is the playOrder attribute, which
+// dump_archive uses as the sort key.
+type NavPoint struct {
+	Label   string  `xml:"navLabel>text"`
+	Content Content `xml:"content"`
+	Order   int     `xml:"playOrder,attr"`
+}
+
+// Ncx maps the root ncx element: Title is the text under docTitle and
+// NavPoints are the navPoint elements found directly under navMap.
+type Ncx struct {
+	XMLName   xml.Name   `xml:"ncx"`
+	Title     string     `xml:"docTitle>text"`
+	NavPoints []NavPoint `xml:"navMap>navPoint"`
+}
+