~dricottone/epub2html

565f40feaf047541aac49915ddf89afac6d04362 — Dominic Ricottone 2 years ago
Initial commit

Parses an EPUB archive into HTML. Currently dumps everything to STDOUT,
but can push into array and then sort according to table of contents.

Blockquotes are not handled correctly. Need to hack on (un)marshaler for
that to work correctly.

Titles are not always handled correctly. Need to look for more than just
h3 tags (i.e. h1, h2) and need to let these bubble up from div tags.
3 files changed, 170 insertions(+), 0 deletions(-)

A go.mod
A main.go
A xml.go
A  => go.mod +3 -0
@@ 1,3 @@
module git.dominic-ricottone.com/~dricottone/epub2html

go 1.19

A  => main.go +114 -0
@@ 1,114 @@
package main

import (
	"archive/zip"
	"encoding/xml"
	"fmt"
	"io"
	"os"
	"sort"
	"strings"
)

// normalize_division flattens one (possibly nested) div into a plain list of
// paragraphs, so the surplus div wrappers found in e-pub XHTML can be dropped.
//
// Exactly one source is used, checked in this order:
//   - the div's own p tags, returned as-is;
//   - otherwise, the p tags nested in the div's blockquote tags;
//   - otherwise, the results of recursing into each nested div, in order.
//
// Recommended usage:
//   Xhtml.Body.Division.Paragraphs = normalize_division(Xhtml.Body.Division)
//   Xhtml.Body.Division.Divisions = []Division{}
func normalize_division(div Division) []Paragraph {
	switch {
	case len(div.Paragraphs) > 0:
		// Direct paragraphs win; blockquotes and nested divs are not consulted.
		return div.Paragraphs
	case len(div.BlockQuotes) > 0:
		// Blockquotes win over nested divs; no recursion happens on this path.
		var flattened []Paragraph
		for i := range div.BlockQuotes {
			flattened = append(flattened, div.BlockQuotes[i].Paragraphs...)
		}
		return flattened
	}

	// Neither p nor blockquote children: gather from the nested divs instead.
	var flattened []Paragraph
	for i := range div.Divisions {
		flattened = append(flattened, normalize_division(div.Divisions[i])...)
	}
	return flattened
}

// read_archive_entry returns the complete decompressed contents of one archive
// member. The member's reader is closed before the function returns, so
// looping over a large archive does not accumulate open readers.
func read_archive_entry(file *zip.File) ([]byte, error) {
	freader, err := file.Open()
	if err != nil {
		return nil, err
	}
	defer freader.Close()

	return io.ReadAll(freader)
}

// dump_archive opens the e-pub (zip) archive at filename and prints each of
// its content files to STDOUT as re-marshaled, indented XML.
//
// Files named "mimetype" or "content.opf", files under "META-INF", and ".css"
// files are skipped. A file named "toc.ncx" is parsed as an Ncx, its title is
// printed, and its nav points are sorted by play order before printing. Every
// other file is parsed as Xhtml and its top-level div is flattened with
// normalize_division.
//
// Only a failure to open the archive itself is returned. Per-file problems are
// reported on STDOUT as "error: ..." lines and processing moves on to the next
// file.
func dump_archive(filename string) error {
	// Open archive
	areader, err := zip.OpenReader(filename)
	if err != nil {
		return err
	}
	defer areader.Close()

	// Loop over files in archive
	for _, file := range areader.File {

		// Skip these less useful files
		if file.Name == "mimetype" || strings.HasPrefix(file.Name, "META-INF") || strings.HasSuffix(file.Name, ".css") || file.Name == "content.opf" {
			fmt.Printf("Skipping %s...\n", file.Name)
			continue
		}

		// Read the whole file. An entry that cannot be opened or read (for
		// example, an unsupported compression method) is reported and skipped;
		// there is nothing to parse.
		fmt.Printf("Contents of %s:\n", file.Name)
		data, err := read_archive_entry(file)
		if err != nil {
			fmt.Printf("error: %s\n", err)
			continue
		}

		if file.Name == "toc.ncx" {
			target := Ncx{}
			// Best effort: a parse error is reported, but whatever was decoded
			// before the error is still dumped below.
			if err := xml.Unmarshal(data, &target); err != nil {
				fmt.Printf("error: %s\n", err)
			}

			fmt.Println(target.Title)

			sort.Slice(target.NavPoints, func(i, j int) bool {
				return target.NavPoints[i].Order < target.NavPoints[j].Order
			})

			html, err := xml.MarshalIndent(&target, "", "  ")
			if err != nil {
				fmt.Printf("error: %s\n", err)
				continue
			}
			fmt.Println(string(html))
		} else {
			target := Xhtml{}
			// Best effort, as above.
			if err := xml.Unmarshal(data, &target); err != nil {
				fmt.Printf("error: %s\n", err)
			}

			// Strip the excess div nesting before re-marshaling.
			target.Body.Division.Paragraphs = normalize_division(target.Body.Division)
			target.Body.Division.Divisions = []Division{}

			html, err := xml.MarshalIndent(&target, "", "  ")
			if err != nil {
				fmt.Printf("error: %s\n", err)
				continue
			}
			fmt.Println(string(html))
		}
	}

	return nil
}

// main dumps a single e-pub archive to STDOUT.
//
// The archive path is taken from the first command-line argument. With no
// arguments it falls back to the original sample file name, so existing
// invocations keep working. If the archive cannot be opened, the error is
// written to STDERR and the process exits with status 1.
func main() {
	// process arguments
	filename := "the_future_is_female.epub"
	if len(os.Args) > 1 {
		filename = os.Args[1]
	}

	if err := dump_archive(filename); err != nil {
		fmt.Fprintf(os.Stderr, "fatal error: %s\n", err)
		os.Exit(1)
	}
}


A  => xml.go +53 -0
@@ 1,53 @@
// Structs for parsing X(HT)?ML files in e-pub archives

package main

import (
	"encoding/xml"
)

// Head maps the head element of an XHTML document. Only the text of its title
// child is captured.
type Head struct {
	Title string `xml:"title"`
}

// Paragraph maps a p element. Its content is kept as raw inner XML, so inline
// markup inside the paragraph is carried through unparsed.
type Paragraph struct {
	Text string `xml:",innerxml"`
}

// BlockQuote maps a blockquote element. Only its p children are captured.
type BlockQuote struct {
	Paragraphs []Paragraph `xml:"p"`
}

// Division maps a div element. Nested div, p and blockquote children are
// collected into separate slices, so the relative order between children of
// different kinds is not recorded. Other child elements are not captured.
type Division struct {
	Divisions   []Division   `xml:"div"`
	Paragraphs  []Paragraph  `xml:"p"`
	BlockQuotes []BlockQuote `xml:"blockquote"`
}

// Body maps the body element of an XHTML document: the text of an h3 child
// (used as the title) and a div child.
// NOTE(review): Division is a single struct rather than a slice, so a body with
// several top-level div elements is presumably not fully captured — confirm.
type Body struct {
	Title    string   `xml:"h3"`
	Division Division `xml:"div"`
}

// Xhtml maps a whole XHTML document rooted at an html element, with its head
// and body children.
type Xhtml struct {
	XMLName xml.Name `xml:"html"`
	Head    Head     `xml:"head"`
	Body    Body     `xml:"body"`
}

// Content maps a content element; only its src attribute is captured.
type Content struct {
	Src string `xml:"src,attr"`
}

// NavPoint maps one navPoint element of an NCX table of contents.
type NavPoint struct {
	// Label is the text found under navLabel > text.
	Label   string  `xml:"navLabel>text"`
	// Content is the content child, which carries the src attribute.
	Content Content `xml:"content"`
	// Order is the playOrder attribute, parsed as an integer.
	Order   int     `xml:"playOrder,attr"`
}

// Ncx maps an NCX table-of-contents document rooted at an ncx element.
// NOTE(review): NavPoints is filled from the navMap > navPoint path only;
// navPoint elements nested inside other navPoints are presumably not
// captured — confirm.
type Ncx struct {
	XMLName   xml.Name   `xml:"ncx"`
	// Title is the text found under docTitle > text.
	Title     string     `xml:"docTitle>text"`
	NavPoints []NavPoint `xml:"navMap>navPoint"`
}