From 565f40feaf047541aac49915ddf89afac6d04362 Mon Sep 17 00:00:00 2001 From: Dominic Ricottone Date: Sun, 16 Oct 2022 14:12:21 -0500 Subject: [PATCH] Initial commit Parses an EPUB archive into HTML. Currently dumps everything to STDOUT, but can push into array and then sort according to table of contents. Blockquotes are not handled correctly. Need to hack on (un)marshaler for that to work correctly. Titles are not always handled correctly. Need to look for more than just h3 tags (i.e. h1, h2) and need to let these bubble up from div tags. --- go.mod | 3 ++ main.go | 114 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ xml.go | 53 ++++++++++++++++++++++++++ 3 files changed, 170 insertions(+) create mode 100644 go.mod create mode 100644 main.go create mode 100644 xml.go diff --git a/go.mod b/go.mod new file mode 100644 index 0000000..d863c0a --- /dev/null +++ b/go.mod @@ -0,0 +1,3 @@ +module git.dominic-ricottone.com/~dricottone/epub2html + +go 1.19 diff --git a/main.go b/main.go new file mode 100644 index 0000000..9483233 --- /dev/null +++ b/main.go @@ -0,0 +1,114 @@ +package main + +import ( + "io" + "fmt" + "strings" + "sort" + "archive/zip" + "encoding/xml" +) + +// e-pub XHTML features arbitrary nesting of divisions. To strip the excess +// div tags, we need to recursively extract paragraphs from divisions. 
// Recommended usage:
//
//	Xhtml.Body.Division.Paragraphs = normalize_division(Xhtml.Body.Division)
//	Xhtml.Body.Division.Divisions = []Division{}
func normalize_division(div Division) []Paragraph {
	// A division holding p tags directly is already flat; return them as-is.
	if len(div.Paragraphs) != 0 {
		return div.Paragraphs
	}

	var pars []Paragraph

	// A division holding blockquote tags yields the paragraphs nested
	// inside each quote.
	if len(div.BlockQuotes) != 0 {
		for _, quote := range div.BlockQuotes {
			pars = append(pars, quote.Paragraphs...)
		}
		return pars
	}

	// Otherwise recurse into the nested div tags and collect everything.
	for _, nested := range div.Divisions {
		pars = append(pars, normalize_division(nested)...)
	}
	return pars
}

// dump_archive opens the e-pub (zip) archive named by filename and dumps its
// contents to STDOUT. The table of contents (toc.ncx) has its navigation
// points sorted by play order before re-marshaling; every other XHTML file
// has its arbitrarily nested divisions flattened via normalize_division.
// Per-file errors are printed and the file is skipped; only a failure to
// open the archive itself is returned to the caller.
func dump_archive(filename string) error {
	areader, err := zip.OpenReader(filename)
	if err != nil {
		return err
	}
	defer areader.Close()

	// Loop over files in archive
	for _, file := range areader.File {

		// Skip these less useful files
		if file.Name == "mimetype" || strings.HasPrefix(file.Name, "META-INF") || strings.HasSuffix(file.Name, ".css") || file.Name == "content.opf" {
			fmt.Printf("Skipping %s...\n", file.Name)
			continue
		}

		// Read the whole entry into memory.
		fmt.Printf("Contents of %s:\n", file.Name)
		freader, err := file.Open()
		if err != nil {
			fmt.Printf("error: %s\n", err)
			continue // fix: previously fell through and read from a nil reader
		}
		data, err := io.ReadAll(freader)
		freader.Close() // fix: the entry reader was never closed (leak)
		if err != nil {
			fmt.Printf("error: %s\n", err)
			continue
		}

		if file.Name == "toc.ncx" {
			target := Ncx{}
			if err = xml.Unmarshal(data, &target); err != nil {
				fmt.Printf("error: %s\n", err)
				continue // fix: do not marshal a zero-value document
			}

			fmt.Println(target.Title)

			// Present navigation points in reading order.
			sort.Slice(target.NavPoints, func(i, j int) bool {
				return target.NavPoints[i].Order < target.NavPoints[j].Order
			})

			html, err := xml.MarshalIndent(&target, "", " ")
			if err != nil {
				fmt.Printf("error: %s\n", err)
				continue
			}
			fmt.Println(string(html))
		} else {
			target := Xhtml{}
			if err = xml.Unmarshal(data, &target); err != nil {
				fmt.Printf("error: %s\n", err)
				continue // fix: do not marshal a zero-value document
			}

			// Strip the excess div nesting down to a single run of paragraphs.
			target.Body.Division.Paragraphs = normalize_division(target.Body.Division)
			target.Body.Division.Divisions = []Division{}

			html, err := xml.MarshalIndent(&target, "", " ")
			if err != nil {
				fmt.Printf("error: %s\n", err)
				continue
			}
			fmt.Println(string(html))
		}
	}

	return nil
}

func main() {
	// TODO: process arguments; the archive name is hard-coded for now.
	if err := dump_archive("the_future_is_female.epub"); err != nil {
		fmt.Printf("fatal error: %s\n", err)
	}
}

// ----- xml.go: structs for parsing X(HT)?ML files in e-pub archives -----
// (In the original patch these declarations live in a separate file,
// xml.go, within the same package.)

// Head models the <head> element of an XHTML document.
type Head struct {
	Title string `xml:"title"`
}

// Paragraph captures the raw inner XML of a <p> element.
type Paragraph struct {
	Text string `xml:",innerxml"`
}

// BlockQuote models a <blockquote> element and the paragraphs inside it.
type BlockQuote struct {
	Paragraphs []Paragraph `xml:"p"`
}

// Division models a <div> element, which may nest further divisions,
// paragraphs, and blockquotes in arbitrary depth.
type Division struct {
	Divisions   []Division   `xml:"div"`
	Paragraphs  []Paragraph  `xml:"p"`
	BlockQuotes []BlockQuote `xml:"blockquote"`
}

// Body models the <body> element of an XHTML document.
// NOTE(review): only h3 is treated as a title; h1/h2 are not yet handled,
// as acknowledged in the commit message.
type Body struct {
	Title    string   `xml:"h3"`
	Division Division `xml:"div"`
}

// Xhtml is the root of a content document in the archive.
type Xhtml struct {
	XMLName xml.Name `xml:"html"`
	Head    Head     `xml:"head"`
	Body    Body     `xml:"body"`
}

// Content holds the source path referenced by an NCX navigation point.
type Content struct {
	Src string `xml:"src,attr"`
}

// NavPoint is a single entry in the NCX navigation map, ordered by its
// playOrder attribute.
type NavPoint struct {
	Label   string  `xml:"navLabel>text"`
	Content Content `xml:"content"`
	Order   int     `xml:"playOrder,attr"`
}

// Ncx is the root of the toc.ncx table-of-contents document.
type Ncx struct {
	XMLName   xml.Name   `xml:"ncx"`
	Title     string     `xml:"docTitle>text"`
	NavPoints []NavPoint `xml:"navMap>navPoint"`
}