~dricottone/epub2html

ref: 494d6b55394b19378dda6b4a3b95960f64c7ee4b epub2html/main.go -rw-r--r-- 2.7 KiB
494d6b55Dominic Ricottone Structure for test documents 2 years ago
                                                                                
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
package main

import (
	"archive/zip"
	"encoding/xml"
	"fmt"
	"io"
	"os"
	"sort"
	"strings"
)

// normalize_division recursively flattens an e-pub XHTML division tree into a
// single slice of paragraphs, stripping the excess nested div tags.
//
// Precedence: a division's own p tags win; otherwise p tags nested inside its
// blockquote tags; otherwise the union of paragraphs recursively extracted
// from its nested divisions. Returns a nil slice when nothing is found.
//
// Recommended usage:
//   Xhtml.Body.Division.Paragraphs = normalize_division(Xhtml.Body.Division)
//   Xhtml.Body.Division.Divisions = []Division{}
func normalize_division(div Division) []Paragraph {
	// If div contains p tags directly, those are the content.
	if len(div.Paragraphs) != 0 {
		return div.Paragraphs
	}

	var pars []Paragraph

	// If div contains blockquote tags, collect their nested p tags.
	if len(div.BlockQuotes) != 0 {
		for _, quote := range div.BlockQuotes {
			pars = append(pars, quote.Paragraphs...)
		}
		return pars
	}

	// Else recurse on nested div tags and concatenate the results.
	for _, nested := range div.Divisions {
		pars = append(pars, normalize_division(nested)...)
	}
	return pars
}

// dump_archive opens the e-pub (zip) archive at filename and prints a
// normalized rendering of each content file to stdout. Boilerplate entries
// (mimetype, META-INF, stylesheets, content.opf) are skipped. The table of
// contents (toc.ncx) is parsed as an Ncx and re-emitted with its navigation
// points sorted by reading order; every other file is parsed as Xhtml with
// its nested divisions flattened into paragraphs.
//
// Per-file parse failures are reported to stdout and that file is skipped;
// only a failure to open the archive itself is returned as an error.
func dump_archive(filename string) error {
	// Open archive
	areader, err := zip.OpenReader(filename)
	if err != nil {
		return err
	}
	defer areader.Close()

	// Loop over files in archive
	for _, file := range areader.File {

		// Skip these less useful files
		if file.Name == "mimetype" || strings.HasPrefix(file.Name, "META-INF") || strings.HasSuffix(file.Name, ".css") || file.Name == "content.opf" {
			fmt.Printf("Skipping %s...\n", file.Name)
			continue
		}

		// Read the file's full contents into memory.
		fmt.Printf("Contents of %s:\n", file.Name)
		freader, err := file.Open()
		if err != nil {
			fmt.Printf("error: %s\n", err)
			// Previously this fell through and handed a nil reader to
			// io.Copy, which would panic; skip the file instead.
			continue
		}
		contents, err := io.ReadAll(freader)
		// Close inside the loop: a defer here would hold every reader
		// open until dump_archive returns.
		freader.Close()
		if err != nil {
			fmt.Printf("error: %s\n", err)
			continue
		}

		if file.Name == "toc.ncx" {
			target := Ncx{}
			if err := xml.Unmarshal(contents, &target); err != nil {
				fmt.Printf("error: %s\n", err)
				continue
			}

			fmt.Println(target.Title)

			// Present navigation points in reading order.
			sort.Slice(target.NavPoints, func(i, j int) bool {
				return target.NavPoints[i].Order < target.NavPoints[j].Order
			})

			html, err := xml.MarshalIndent(&target, "", "  ")
			if err != nil {
				fmt.Printf("error: %s\n", err)
				continue
			}
			fmt.Println(string(html))
		} else {
			target := Xhtml{}
			if err := xml.Unmarshal(contents, &target); err != nil {
				fmt.Printf("error: %s\n", err)
				continue
			}

			// Collapse arbitrarily nested divs into a flat paragraph list.
			target.Body.Division.Paragraphs = normalize_division(target.Body.Division)
			target.Body.Division.Divisions = []Division{}

			html, err := xml.MarshalIndent(&target, "", "  ")
			if err != nil {
				fmt.Printf("error: %s\n", err)
				continue
			}
			fmt.Println(string(html))
		}
	}

	return nil
}

// main dumps a normalized rendering of an e-pub archive to stdout. The
// archive path may be supplied as the first command-line argument; when
// absent, the bundled test document is used (preserving prior behavior).
func main() {
	// process arguments: optional positional archive path
	filename := "test_docs/the_future_is_female.epub"
	if len(os.Args) > 1 {
		filename = os.Args[1]
	}

	if err := dump_archive(filename); err != nil {
		fmt.Printf("fatal error: %s\n", err)
	}
}