From 565f40feaf047541aac49915ddf89afac6d04362 Mon Sep 17 00:00:00 2001 From: Dominic Ricottone Date: Sun, 16 Oct 2022 14:12:21 -0500 Subject: [PATCH] Initial commit Parses an EPUB archive into HTML. Currently dumps everything to STDOUT, but can push into array and then sort according to table of contents. Blockquotes are not handled correctly. Need to hack on (un)marshaler for that to work correctly. Titles are not always handled correctly. Need to look for more than just h3 tags (i.e. h1, h2) and need to let these bubble up from div tags. --- go.mod | 3 ++ main.go | 114 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ xml.go | 53 ++++++++++++++++++++++++++ 3 files changed, 170 insertions(+) create mode 100644 go.mod create mode 100644 main.go create mode 100644 xml.go diff --git a/go.mod b/go.mod new file mode 100644 index 0000000..d863c0a --- /dev/null +++ b/go.mod @@ -0,0 +1,3 @@ +module git.dominic-ricottone.com/~dricottone/epub2html + +go 1.19 diff --git a/main.go b/main.go new file mode 100644 index 0000000..9483233 --- /dev/null +++ b/main.go @@ -0,0 +1,114 @@ +package main + +import ( + "io" + "fmt" + "strings" + "sort" + "archive/zip" + "encoding/xml" +) + +// e-pub XHTML features arbitrary nesting of divisions. To strip the excess +// div tags, we need to recursively extract paragraphs from divisions. 
// Recommended usage:
//
//	Xhtml.Body.Division.Paragraphs = normalize_division(Xhtml.Body.Division)
//	Xhtml.Body.Division.Divisions = []Division{}
func normalize_division(div Division) []Paragraph {
	// A division holding p tags directly is already flat; return them as-is.
	if len(div.Paragraphs) != 0 {
		return div.Paragraphs
	}

	var pars []Paragraph

	// A division holding blockquote tags yields the paragraphs nested
	// inside each quote.
	if len(div.BlockQuotes) != 0 {
		for _, quote := range div.BlockQuotes {
			pars = append(pars, quote.Paragraphs...)
		}
		return pars
	}

	// Otherwise recurse into the nested div tags and collect everything.
	for _, nested := range div.Divisions {
		pars = append(pars, normalize_division(nested)...)
	}
	return pars
}

// dump_archive opens the e-pub (zip) archive named by filename and dumps its
// contents to STDOUT. The table of contents (toc.ncx) has its navigation
// points sorted by play order before re-marshaling; every other XHTML file
// has its arbitrarily nested divisions flattened via normalize_division.
// Per-file errors are printed and the file is skipped; only a failure to
// open the archive itself is returned to the caller.
func dump_archive(filename string) error {
	areader, err := zip.OpenReader(filename)
	if err != nil {
		return err
	}
	defer areader.Close()

	// Loop over files in archive
	for _, file := range areader.File {

		// Skip these less useful files
		if file.Name == "mimetype" || strings.HasPrefix(file.Name, "META-INF") || strings.HasSuffix(file.Name, ".css") || file.Name == "content.opf" {
			fmt.Printf("Skipping %s...\n", file.Name)
			continue
		}

		// Read the whole entry into memory.
		fmt.Printf("Contents of %s:\n", file.Name)
		freader, err := file.Open()
		if err != nil {
			fmt.Printf("error: %s\n", err)
			continue // fix: previously fell through and read from a nil reader
		}
		data, err := io.ReadAll(freader)
		freader.Close() // fix: the entry reader was never closed (leak)
		if err != nil {
			fmt.Printf("error: %s\n", err)
			continue
		}

		if file.Name == "toc.ncx" {
			target := Ncx{}
			if err = xml.Unmarshal(data, &target); err != nil {
				fmt.Printf("error: %s\n", err)
				continue // fix: do not marshal a zero-value document
			}

			fmt.Println(target.Title)

			// Present navigation points in reading order.
			sort.Slice(target.NavPoints, func(i, j int) bool {
				return target.NavPoints[i].Order < target.NavPoints[j].Order
			})

			html, err := xml.MarshalIndent(&target, "", " ")
			if err != nil {
				fmt.Printf("error: %s\n", err)
				continue
			}
			fmt.Println(string(html))
		} else {
			target := Xhtml{}
			if err = xml.Unmarshal(data, &target); err != nil {
				fmt.Printf("error: %s\n", err)
				continue // fix: do not marshal a zero-value document
			}

			// Strip the excess div nesting down to a single run of paragraphs.
			target.Body.Division.Paragraphs = normalize_division(target.Body.Division)
			target.Body.Division.Divisions = []Division{}

			html, err := xml.MarshalIndent(&target, "", " ")
			if err != nil {
				fmt.Printf("error: %s\n", err)
				continue
			}
			fmt.Println(string(html))
		}
	}

	return nil
}

func main() {
	// TODO: process arguments; the archive name is hard-coded for now.
	if err := dump_archive("the_future_is_female.epub"); err != nil {
		fmt.Printf("fatal error: %s\n", err)
	}
}

// ----- xml.go: structs for parsing X(HT)?ML files in e-pub archives -----
// (In the original patch these declarations live in a separate file,
// xml.go, within the same package.)

// Head models the <head> element of an XHTML document.
type Head struct {
	Title string `xml:"title"`
}

// Paragraph captures the raw inner XML of a <p> element.
type Paragraph struct {
	Text string `xml:",innerxml"`
}

// BlockQuote models a <blockquote> element and the paragraphs inside it.
type BlockQuote struct {
	Paragraphs []Paragraph `xml:"p"`
}

// Division models a <div> element, which may nest further divisions,
// paragraphs, and blockquotes in arbitrary depth.
type Division struct {
	Divisions   []Division   `xml:"div"`
	Paragraphs  []Paragraph  `xml:"p"`
	BlockQuotes []BlockQuote `xml:"blockquote"`
}

// Body models the <body> element of an XHTML document.
// NOTE(review): only h3 is treated as a title; h1/h2 are not yet handled,
// as acknowledged in the commit message.
type Body struct {
	Title    string   `xml:"h3"`
	Division Division `xml:"div"`
}

// Xhtml is the root of a content document in the archive.
type Xhtml struct {
	XMLName xml.Name `xml:"html"`
	Head    Head     `xml:"head"`
	Body    Body     `xml:"body"`
}

// Content holds the source path referenced by an NCX navigation point.
type Content struct {
	Src string `xml:"src,attr"`
}

// NavPoint is a single entry in the NCX navigation map, ordered by its
// playOrder attribute.
type NavPoint struct {
	Label   string  `xml:"navLabel>text"`
	Content Content `xml:"content"`
	Order   int     `xml:"playOrder,attr"`
}

// Ncx is the root of the toc.ncx table-of-contents document.
type Ncx struct {
	XMLName   xml.Name   `xml:"ncx"`
	Title     string     `xml:"docTitle>text"`
	NavPoints []NavPoint `xml:"navMap>navPoint"`
}