From ae806b4144a63817e5a1a9c3448540314e2e77ce Mon Sep 17 00:00:00 2001
From: Dominic Ricottone
Date: Mon, 17 Oct 2022 16:04:16 -0500
Subject: [PATCH] Fixing blockquotes and refactoring

Blockquotes are fixed. Rather than hacking on the marshaler, I am
populating a secondary struct with raw XML. The standard marshaler is
able to work with this struct perfectly.

The code has been refactored. All of the (un)?marshaling structs and
implementations have been moved to domain-specific files.

Helper functions are defined so that data is stored in an intermediary
slice and map. The program is still just dumping that data to STDOUT,
but all the hard work for a higher level functionality is done.
---
 epub.go      |  48 ++++++++++++++++
 html.go      |  31 +++++++++++
 main.go      |  98 +++-----------------------------
 ncx.go       |  65 +++++++++++++++++++++
 textnodes.go |  37 ++++++++++++
 xhtml.go     | 155 +++++++++++++++++++++++++++++++++++++++++++++++++++
 xml.go       |  99 --------------------------------
 7 files changed, 344 insertions(+), 189 deletions(-)
 create mode 100644 epub.go
 create mode 100644 html.go
 create mode 100644 ncx.go
 create mode 100644 textnodes.go
 create mode 100644 xhtml.go
 delete mode 100644 xml.go

diff --git a/epub.go b/epub.go
new file mode 100644
index 0000000..22353da
--- /dev/null
+++ b/epub.go
@@ -0,0 +1,48 @@
+// Functions for handling e-pub archives
+
+package main
+
+import (
+	"io"
+	"fmt"
+	"strings"
+	"archive/zip"
+)
+
+// ReadArchive returns the contents of the files in the e-pub archive at filename,
+// keyed by archive path. Entries that fail to open are reported and left out.
+func ReadArchive(filename string) (map[string]string, error) {
+	archive_reader, err := zip.OpenReader(filename)
+	if err != nil {
+		return nil, err
+	}
+	defer archive_reader.Close()
+
+	archive := map[string]string{}
+	for _, file := range archive_reader.File {
+		// Skip these less useful files
+		if file.Name == "mimetype" || file.Name == "content.opf" || strings.HasPrefix(file.Name, "META-INF") || strings.HasSuffix(file.Name, ".css") {
+			continue
+		}
+
+		// Without a reader there is nothing to copy, so move on to the next file
+		file_reader, err := file.Open()
+		if err != nil {
+			fmt.Printf("error: %s\n", err)
+			continue
+		}
+
+		// Copy file contents into a string builder, then release the reader
+		buffer := new(strings.Builder)
+		_, err = io.Copy(buffer, file_reader)
+		file_reader.Close() // read-only handle; a close error is not actionable
+		if err != nil {
+			fmt.Printf("error: %s\n", err) // whatever was read so far is still stored
+		}
+		// Store final string mapped by the file name
+		archive[file.Name] = buffer.String()
+	}
+
+	return archive, nil
+}
+
diff --git a/html.go b/html.go
new file mode 100644
index 0000000..f07aba4
--- /dev/null
+++ b/html.go
@@ -0,0 +1,31 @@
+// Structs for marshaling HTML files
+
+package main
+
+import (
+	"encoding/xml"
+)
+
+// Unlike the nested X(HT)ML division, this division is raw XHTML.
+type HtmlDivision struct {
+	Content []byte `xml:",innerxml"` // emitted verbatim as the element's inner XML
+}
+
+// The HTML structure should be a pair of one head and one body.
+// An HTML head will mirror an XML head.
+// An HTML body will be much simpler than an XML body.
+type HtmlBody struct {
+	Title    string       `xml:"h3"`
+	Division HtmlDivision `xml:"div"`
+}
+
+type HtmlHead struct {
+	Title string `xml:"title"`
+}
+
+type Html struct {
+	XMLName xml.Name `xml:"html"`
+	Head    HtmlHead `xml:"head"`
+	Body    HtmlBody `xml:"body"`
+}
+
diff --git a/main.go b/main.go
index 6389524..8f7cdec 100644
--- a/main.go
+++ b/main.go
@@ -1,104 +1,22 @@ package main import ( - "io" "fmt" - "strings" - "sort" - "archive/zip" - "encoding/xml" ) -// e-pub XHTML features arbitrary nesting of divisions. To strip the excess -// div tags, we need to recursively extract paragraphs from divisions. 
-// Recommended usage: -// Xhtml.Body.Division.Paragraphs = normalize_division(Xhtml.Body.Division) -// Xhtml.Body.Division.Divisions = []Division{} -func normalize_division(div Division) []Paragraph { - // If div contains p tags, return those - if (len(div.Paragraphs) != 0) { - return div.Paragraphs - } - - var pars []Paragraph - - // If div contains blockquote tags, return the nested p tags - if (len(div.BlockQuotes) != 0) { - for _, quote := range div.BlockQuotes { - for _, par := range quote.Paragraphs { - pars = append(pars, par) - } - } - return pars - } - - // Else recurse on nested div tags - for _, nested_div := range div.Divisions { - pars = append(pars, normalize_division(nested_div)...) - } - return pars -} - -func dump_archive(filename string) error { - // Open archive - areader, err := zip.OpenReader(filename) +func dump_archive(archive_name string) error { + // Read the archive + archive, err := ReadArchive(archive_name) if err != nil { return err } - defer areader.Close() - - // Loop over files in archive - for _, file := range areader.File { - - // Skip these less useful files - if (file.Name == "mimetype" || strings.HasPrefix(file.Name, "META-INF") || strings.HasSuffix(file.Name, ".css") || file.Name == "content.opf") { - fmt.Printf("Skipping %s...\n", file.Name) - continue - } - - // Open file and copy into a string builder - fmt.Printf("Contents of %s:\n", file.Name) - freader, err := file.Open() - if err != nil { - fmt.Printf("error: %s\n", err) - } - buffer := new(strings.Builder) - if _, err := io.Copy(buffer, freader); err != nil { - fmt.Printf("error: %s\n", err) - } - - if (file.Name == "toc.ncx") { - target := Ncx{} - if err = xml.Unmarshal([]byte(buffer.String()), &target); err != nil { - fmt.Printf("error: %s\n", err) - } - - fmt.Println(target.Title) - - sort.Slice(target.NavPoints, func(i, j int) bool { - return target.NavPoints[i].Order < target.NavPoints[j].Order - }) - - html, err := xml.MarshalIndent(&target, "", " ") - if err != 
nil { - fmt.Printf("error: %s\n", err) - } - fmt.Println(string(html)) - } else { - target := Xhtml{} - if err = xml.Unmarshal([]byte(buffer.String()), &target); err != nil { - fmt.Printf("error: %s\n", err) - } - target.Body.Division.Paragraphs = normalize_division(target.Body.Division) - target.Body.Division.Divisions = []Division{} + // Get a sorted table of contents + toc := ReadTableOfContents(archive["toc.ncx"]) - html, err := xml.MarshalIndent(&target, "", " ") - if err != nil { - fmt.Printf("error: %s\n", err) - } - fmt.Println(string(html)) - } + // Print files according to the table of contents + for _, file_name := range toc { + fmt.Println(ReadHtml(archive[file_name])) } return nil
diff --git a/ncx.go b/ncx.go
new file mode 100644
index 0000000..f006b5f
--- /dev/null
+++ b/ncx.go
@@ -0,0 +1,65 @@
+// Structs and functions for unmarshaling NCX files in e-pub archives
+
+package main
+
+import (
+	"fmt"
+	"encoding/xml"
+	"net/url"
+	"sort"
+)
+
+// A URI for content
+type Content struct {
+	Src string `xml:"src,attr"`
+}
+
+// A content label and URI pair
+type NavPoint struct {
+	Label   string  `xml:"navLabel>text"`
+	Content Content `xml:"content"`
+	Order   int     `xml:"playOrder,attr"`
+}
+
+// A series of navigation points
+type Ncx struct {
+	XMLName   xml.Name   `xml:"ncx"`
+	Title     string     `xml:"docTitle>text"`
+	NavPoints []NavPoint `xml:"navMap>navPoint"`
+}
+
+// Parse an NCX file; a parse error is printed and the partial result returned
+func ParseNcx(content string) *Ncx {
+	dest := &Ncx{}
+
+	// Unmarshal NCX; dest is already a pointer, so it is passed as is
+	if err := xml.Unmarshal([]byte(content), dest); err != nil {
+		fmt.Printf("error: %s\n", err)
+	}
+
+	// Sort navigation points by play order
+	sort.Slice(dest.NavPoints, func(i, j int) bool {
+		return dest.NavPoints[i].Order < dest.NavPoints[j].Order
+	})
+
+	return dest
+}
+
+// Build a table of contents (content paths in play order) from an NCX file
+func ReadTableOfContents(content string) []string {
+	ncx := ParseNcx(content)
+
+	var toc []string
+	for _, point := range ncx.NavPoints {
+		src, err := url.PathUnescape(point.Content.Src) // path rules: a literal "+" stays "+"
+		if err != nil {
+			fmt.Printf("error: %s\n", err)
+			continue // do not add an empty entry for a malformed URI
+		}
+
+		toc = append(toc, src)
+	}
+
+	return toc
+}
+
diff --git a/textnodes.go b/textnodes.go
new file mode 100644
index 0000000..63fdf99
--- /dev/null
+++ b/textnodes.go
@@ -0,0 +1,37 @@
+// Structs for handling text nodes
+
+package main
+
+import (
+	"encoding/xml"
+)
+
+// Text nodes are either a paragraph or a blockquote.
+type TextNode interface {
+	Order() int
+}
+
+// A paragraph contains character data and other text-oriented tags, such as
+// b or strong. This data should be retained as raw XML.
+type Paragraph struct {
+	XMLName xml.Name `xml:"p"`
+	Text    string   `xml:",innerxml"`
+	order   int      `xml:"-"`
+}
+
+func (p Paragraph) Order() int {
+	return p.order
+}
+
+// A blockquote contains paragraphs. Unlike divisions, the blockquote structure
+// must be maintained in order to format the paragraphs correctly.
+type BlockQuote struct {
+	XMLName    xml.Name    `xml:"blockquote"`
+	Paragraphs []Paragraph `xml:"p"`
+	order      int         `xml:"-"`
+}
+
+func (b BlockQuote) Order() int {
+	return b.order
+}
+
diff --git a/xhtml.go b/xhtml.go
new file mode 100644
index 0000000..761eb28
--- /dev/null
+++ b/xhtml.go
@@ -0,0 +1,155 @@
+// Structs and functions for unmarshaling XHTML files in e-pub archives
+
+package main
+
+import (
+	"fmt"
+	"io"
+	"sort"
+	"encoding/xml"
+)
+
+// An XHTML division can contain paragraphs, blockquotes, and further nested
+// XHTML divisions.
+type XhtmlDivision struct {
+	Divisions   []XhtmlDivision `xml:"div"`        // nested divisions, in the order encountered
+	Paragraphs  []Paragraph     `xml:"p"`          // paragraphs; order counter shared with BlockQuotes
+	BlockQuotes []BlockQuote    `xml:"blockquote"` // blockquotes; order counter shared with Paragraphs
+}
+
+// UnmarshalXML reads the division token by token so that each paragraph and
+// blockquote records its position; a child that fails to decode aborts with its error.
+func (d *XhtmlDivision) UnmarshalXML(decoder *xml.Decoder, start xml.StartElement) error {
+	counter := 0
+	for {
+		token, err := decoder.Token()
+		if err == io.EOF {
+			return nil // no more tokens to read
+		}
+		if err != nil {
+			return err
+		}
+		child, ok := token.(xml.StartElement)
+		if !ok {
+			continue // only start tags open something to decode
+		}
+		switch child.Name.Local {
+		case "p":
+			target := Paragraph{}
+			if err := decoder.DecodeElement(&target, &child); err != nil {
+				return err
+			}
+			target.order = counter
+			counter++
+			d.Paragraphs = append(d.Paragraphs, target)
+		case "blockquote":
+			target := BlockQuote{}
+			if err := decoder.DecodeElement(&target, &child); err != nil {
+				return err
+			}
+			target.order = counter
+			counter++
+			d.BlockQuotes = append(d.BlockQuotes, target)
+		case "div":
+			target := XhtmlDivision{} // nested divisions are not numbered, only collected
+			if err := decoder.DecodeElement(&target, &child); err != nil {
+				return err
+			}
+			d.Divisions = append(d.Divisions, target)
+		}
+	}
+}
+
+// The XHTML structure should be a pair of one head and one body.
+// The XHTML head contains little information that should be retained.
+// The XHTML body is a nesting structure.
+type XhtmlHead struct {
+	Title string `xml:"title"`
+}
+
+type XhtmlBody struct {
+	Title    string        `xml:"h3"`
+	Division XhtmlDivision `xml:"div"`
+}
+
+type Xhtml struct {
+	XMLName xml.Name  `xml:"html"`
+	Head    XhtmlHead `xml:"head"`
+	Body    XhtmlBody `xml:"body"`
+}
+
+// Normalize an XHTML division into raw XHTML
+func normalize_division(div XhtmlDivision) []byte {
+	var nodes []TextNode
+
+	// Pull all paragraphs into nodes
+	for _, par := range div.Paragraphs {
+		nodes = append(nodes, par)
+	}
+
+	// Pull all blockquotes into nodes
+	for _, quote := range div.BlockQuotes {
+		nodes = append(nodes, quote)
+	}
+
+	// Sort paragraphs and blockquotes back into their recorded order
+	sort.Slice(nodes, func(i, j int) bool {
+		return nodes[i].Order() < nodes[j].Order()
+	})
+
+	var xhtml []byte
+
+	// Convert nodes into raw XHTML; a node that fails to marshal is reported and dropped
+	for _, node := range nodes {
+		xhtml_node, err := xml.Marshal(node)
+		if err != nil {
+			fmt.Printf("error: %s\n", err)
+		} else {
+			xhtml = append(xhtml, xhtml_node...)
+		}
+	}
+
+	// Recurse with all nested divisions; their output follows this division's own nodes
+	for _, nested_div := range div.Divisions {
+		xhtml = append(xhtml, normalize_division(nested_div)...)
+	}
+
+	return xhtml
+}
+
+// Convert an XHTML structure into an HTML structure
+func xhtml_to_html(source Xhtml) *Html {
+	dest := &Html{}
+
+	// Copy fields: the head title and the body title are separate values
+	dest.Head.Title = source.Head.Title
+	dest.Body.Title = source.Body.Title
+
+	// Convert division
+	dest.Body.Division.Content = normalize_division(source.Body.Division)
+
+	return dest
+}
+
+// Parse an XHTML file
+func ParseXhtml(content string) *Html {
+	xhtml := Xhtml{}
+
+	// Unmarshal XHTML
+	if err := xml.Unmarshal([]byte(content), &xhtml); err != nil {
+		fmt.Printf("error: %s\n", err)
+	}
+
+	// A parse failure is only reported; conversion continues with what was decoded
+	return xhtml_to_html(xhtml)
+}
+
+// Read HTML from an XHTML file
+func ReadHtml(content string) string {
+	buffer, err := xml.MarshalIndent(ParseXhtml(content), "", " ")
+	if err != nil {
+		fmt.Printf("error: %s\n", err)
+	}
+	return string(buffer)
+}
+
diff --git a/xml.go b/xml.go
deleted file mode 100644
index e35c01a..0000000
--- a/xml.go
+++ /dev/null
@@ -1,99 +0,0 @@ -// Structs for parsing X(HT)?ML files in e-pub archives - -package main - -import ( - "io" - "encoding/xml" -) - -type Head struct { - Title string `xml:"title"` -} - -type Paragraph struct { - Text string `xml:",innerxml"` - Order int `xml:"-"` -} - -type BlockQuote struct { - Paragraphs []Paragraph `xml:"p"` - Order int `xml:"-"` -} - -type Division struct { - Divisions []Division `xml:"div"` - Paragraphs []Paragraph `xml:"p"` - BlockQuotes []BlockQuote `xml:"blockquote"` -} - -func (d *Division) UnmarshalXML(decoder *xml.Decoder, start xml.StartElement) error { - counter := 0 - - for { - token, err := decoder.Token() - if err == io.EOF { - break - } - if err != nil { - return err - } - - switch token.(type) { - case xml.StartElement: - new_start := token.(xml.StartElement) - if (new_start.Name.Local == "p") { - target := Paragraph{} - decoder.DecodeElement(&target, &new_start) - - target.Order = counter - counter += 1 - - d.Paragraphs = append(d.Paragraphs, target) - } else if (new_start.Name.Local == "blockquote") { - target := BlockQuote{} - decoder.DecodeElement(&target, &new_start) - - target.Order = counter - counter += 1 - - d.BlockQuotes = append(d.BlockQuotes, target) - } else if (new_start.Name.Local == "div") { - target := Division{} - decoder.DecodeElement(&target, &new_start) - - d.Divisions = append(d.Divisions, target) - } - } - } - - return nil -} - -type Body struct { - Title string `xml:"h3"` - Division Division `xml:"div"` -} - -type Xhtml struct { - XMLName xml.Name `xml:"html"` - Head Head `xml:"head"` - Body Body `xml:"body"` -} - -type Content struct { - Src string `xml:"src,attr"` -} - -type NavPoint struct { - Label string `xml:"navLabel>text"` - Content Content `xml:"content"` - Order int `xml:"playOrder,attr"` -} - -type Ncx struct { - XMLName xml.Name `xml:"ncx"` - Title string `xml:"docTitle>text"` - NavPoints []NavPoint `xml:"navMap>navPoint"` -} - -- 2.45.2