@@ 1,114 @@
+package main
+
import (
	"archive/zip"
	"encoding/xml"
	"fmt"
	"io"
	"sort"
	"strings"
)
+
+// e-pub XHTML features arbitrary nesting of divisions. To strip the excess
+// div tags, we need to recursively extract paragraphs from divisions.
+// Recommended usage:
+// Xhtml.Body.Division.Paragraphs = normalize_division(Xhtml.Body.Division)
+// Xhtml.Body.Division.Divisions = []Division{}
+func normalize_division(div Division) []Paragraph {
+ // If div contains p tags, return those
+ if (len(div.Paragraphs) != 0) {
+ return div.Paragraphs
+ }
+
+ var pars []Paragraph
+
+ // If div contains blockquote tags, return the nested p tags
+ if (len(div.BlockQuotes) != 0) {
+ for _, quote := range div.BlockQuotes {
+ for _, par := range quote.Paragraphs {
+ pars = append(pars, par)
+ }
+ }
+ return pars
+ }
+
+ // Else recurse on nested div tags
+ for _, nested_div := range div.Divisions {
+ pars = append(pars, normalize_division(nested_div)...)
+ }
+ return pars
+}
+
+func dump_archive(filename string) error {
+ // Open archive
+ areader, err := zip.OpenReader(filename)
+ if err != nil {
+ return err
+ }
+ defer areader.Close()
+
+ // Loop over files in archive
+ for _, file := range areader.File {
+
+ // Skip these less useful files
+ if (file.Name == "mimetype" || strings.HasPrefix(file.Name, "META-INF") || strings.HasSuffix(file.Name, ".css") || file.Name == "content.opf") {
+ fmt.Printf("Skipping %s...\n", file.Name)
+ continue
+ }
+
+ // Open file and copy into a string builder
+ fmt.Printf("Contents of %s:\n", file.Name)
+ freader, err := file.Open()
+ if err != nil {
+ fmt.Printf("error: %s\n", err)
+ }
+ buffer := new(strings.Builder)
+ if _, err := io.Copy(buffer, freader); err != nil {
+ fmt.Printf("error: %s\n", err)
+ }
+
+ if (file.Name == "toc.ncx") {
+ target := Ncx{}
+ if err = xml.Unmarshal([]byte(buffer.String()), &target); err != nil {
+ fmt.Printf("error: %s\n", err)
+ }
+
+ fmt.Println(target.Title)
+
+ sort.Slice(target.NavPoints, func(i, j int) bool {
+ return target.NavPoints[i].Order < target.NavPoints[j].Order
+ })
+
+ html, err := xml.MarshalIndent(&target, "", " ")
+ if err != nil {
+ fmt.Printf("error: %s\n", err)
+ }
+ fmt.Println(string(html))
+ } else {
+ target := Xhtml{}
+ if err = xml.Unmarshal([]byte(buffer.String()), &target); err != nil {
+ fmt.Printf("error: %s\n", err)
+ }
+
+ target.Body.Division.Paragraphs = normalize_division(target.Body.Division)
+ target.Body.Division.Divisions = []Division{}
+
+ html, err := xml.MarshalIndent(&target, "", " ")
+ if err != nil {
+ fmt.Printf("error: %s\n", err)
+ }
+ fmt.Println(string(html))
+ }
+ }
+
+ return nil
+}
+
+func main() {
+ // process arguments
+
+ if err := dump_archive("the_future_is_female.epub"); err != nil {
+ fmt.Printf("fatal error: %s\n", err)
+ }
+}
+
@@ 1,53 @@
+// Structs for parsing X(HT)?ML files in e-pub archives
+
+package main
+
+import (
+ "encoding/xml"
+)
+
// Head models the <head> element of an XHTML file; only the title is kept.
type Head struct {
	Title string `xml:"title"`
}

// Paragraph is a single <p> element. The raw inner XML is preserved so that
// inline markup inside the paragraph survives a marshal round trip.
type Paragraph struct {
	Text string `xml:",innerxml"`
}

// BlockQuote is a <blockquote> element holding its direct <p> children.
type BlockQuote struct {
	Paragraphs []Paragraph `xml:"p"`
}

// Division is a <div> element. e-pub XHTML nests these arbitrarily, so a
// Division may hold further Divisions, direct Paragraphs, or BlockQuotes
// (flattened elsewhere by normalize_division).
type Division struct {
	Divisions []Division `xml:"div"`
	Paragraphs []Paragraph `xml:"p"`
	BlockQuotes []BlockQuote `xml:"blockquote"`
}

// Body models the <body> element: an <h3> title plus the top-level <div>.
type Body struct {
	Title string `xml:"h3"`
	Division Division `xml:"div"`
}

// Xhtml is the root <html> element of one chapter file in the archive.
type Xhtml struct {
	XMLName xml.Name `xml:"html"`
	Head Head `xml:"head"`
	Body Body `xml:"body"`
}

// Content carries the src attribute of an NCX <content> element, i.e. the
// archive-relative path of the chapter a nav point links to.
type Content struct {
	Src string `xml:"src,attr"`
}

// NavPoint is one <navPoint> entry in the NCX navigation map. Order comes
// from the playOrder attribute and is used to sort entries into reading order.
type NavPoint struct {
	Label string `xml:"navLabel>text"`
	Content Content `xml:"content"`
	Order int `xml:"playOrder,attr"`
}

// Ncx is the root element of toc.ncx, the e-pub table of contents.
type Ncx struct {
	XMLName xml.Name `xml:"ncx"`
	Title string `xml:"docTitle>text"`
	NavPoints []NavPoint `xml:"navMap>navPoint"`
}
+