[Golang] HTML a, ul, li Element to reStructuredText


Introduction

Convert HTML unordered (bulleted) lists and HTML links to reStructuredText format using the net/html package in Go (the Go programming language). I have not verified whether the rst output can be converted back to the original HTML.

Install net/html Package

$ go get -u golang.org/x/net/html

Source Code

aulli2rst.go | repository | view raw
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
package aulli2rst

import (
	"bufio"
	"fmt"
	"golang.org/x/net/html"
	"io"
	"os"
	"strings"
)

// StringToLines splits s into its lines and returns them without their line
// terminators. A "\r" left over from a "\r\n" ending is removed, and an empty
// s yields no lines.
//
// The scanner's token limit is sized to the input, so a line longer than
// bufio.MaxScanTokenSize is returned intact instead of aborting the scan and
// dropping the remainder of the text. Should the scanner still report an
// error, it is written to os.Stderr and the lines read so far are returned.
func StringToLines(s string) []string {
	var lines []string

	scanner := bufio.NewScanner(strings.NewReader(s))
	// No line can be longer than s itself; the extra byte lets the scanner
	// observe EOF before its buffer counts as full.
	scanner.Buffer(nil, len(s)+1)
	for scanner.Scan() {
		lines = append(lines, scanner.Text())
	}

	if err := scanner.Err(); err != nil {
		fmt.Fprintln(os.Stderr, "splitting string into lines:", err)
	}

	return lines
}

// indentEachLine prefixes every line of s with two spaces and joins the
// result with "\n". Lines are obtained from StringToLines, so the returned
// string carries no trailing newline.
func indentEachLine(s string) string {
	src := StringToLines(s)
	indented := make([]string, len(src))
	for i := range src {
		indented[i] = "  " + src[i]
	}
	return strings.Join(indented, "\n")
}

// isAnchorElement reports whether n is an element node whose tag is "a".
func isAnchorElement(n *html.Node) bool {
	if n.Type != html.ElementNode {
		return false
	}
	return n.Data == "a"
}

// isUlElement reports whether n is an element node whose tag is "ul".
func isUlElement(n *html.Node) bool {
	if n.Type != html.ElementNode {
		return false
	}
	return n.Data == "ul"
}

// isLiElement reports whether n is an element node whose tag is "li".
func isLiElement(n *html.Node) bool {
	if n.Type != html.ElementNode {
		return false
	}
	return n.Data == "li"
}

// isTextNode reports whether n is a text node.
func isTextNode(n *html.Node) bool {
	isText := n.Type == html.TextNode
	return isText
}

// getAttribute looks up the attribute named key on n. It returns the value of
// the first attribute whose Key equals key together with true, or "" and
// false when n has no such attribute.
func getAttribute(n *html.Node, key string) (string, bool) {
	for i := range n.Attr {
		if a := n.Attr[i]; a.Key == key {
			return a.Val, true
		}
	}
	return "", false
}

// textNode2rst returns the rst text for the text node n: a single newline when
// n.Data consists only of whitespace, otherwise n.Data exactly as stored
// (surrounding whitespace included).
func textNode2rst(n *html.Node) string {
	if len(strings.TrimSpace(n.Data)) == 0 {
		return "\n"
	}
	return n.Data
}

// a2rst renders the anchor element n as an anonymous reStructuredText
// hyperlink of the form `text <href>`__.
//
// The link text is the whitespace-trimmed concatenation of every text node
// below n. An anchor without children is therefore handled safely, and markup
// such as <a href="/"><b>x</b></a> yields the text "x" rather than the tag
// name of the first child. When the anchor has no text at all, the
// URI-only form `<href>`__ is emitted. An anchor without an href attribute
// produces the empty string.
func a2rst(n *html.Node) string {
	href, ok := getAttribute(n, "href")
	if !ok {
		return ""
	}

	text := anchorText(n)
	if text == "" {
		return "`<" + href + ">`__"
	}
	return "`" + text + " <" + href + ">`__"
}

// anchorText collects the Data of all text nodes nested under n, in document
// order, and returns the concatenation with surrounding whitespace trimmed.
func anchorText(n *html.Node) string {
	var parts []string

	var walk func(c *html.Node)
	walk = func(c *html.Node) {
		for ; c != nil; c = c.NextSibling {
			if c.Type == html.TextNode {
				parts = append(parts, c.Data)
				continue
			}
			// Non-text nodes contribute only through their descendants.
			walk(c.FirstChild)
		}
	}
	walk(n.FirstChild)

	return strings.TrimSpace(strings.Join(parts, ""))
}

// li2rst renders the list item n as one rst bullet: "- " followed by the
// output for each direct child, terminated by a newline. Text children go
// through textNode2rst, anchors through a2rst, and a nested ul is placed on a
// new line with each of its lines indented. Any other child is ignored.
func li2rst(n *html.Node) string {
	item := "- "
	for child := n.FirstChild; child != nil; child = child.NextSibling {
		switch {
		case isTextNode(child):
			item += textNode2rst(child)
		case isAnchorElement(child):
			item += a2rst(child)
		case isUlElement(child):
			item += "\n" + indentEachLine(ul2rst(child))
		}
	}
	return item + "\n"
}

// ul2rst renders the unordered list n by concatenating the li2rst output of
// each direct li child, in order. Children that are not li elements are
// skipped.
func ul2rst(n *html.Node) string {
	var items string
	for child := n.FirstChild; child != nil; child = child.NextSibling {
		if !isLiElement(child) {
			continue
		}
		items += li2rst(child)
	}
	return items
}

// traverse converts the subtree rooted at n to rst. Text nodes, anchors and
// unordered lists are handed to their dedicated converters; for every other
// node the results of traversing its children are concatenated in order.
func traverse(n *html.Node) string {
	switch {
	case isTextNode(n):
		return textNode2rst(n)
	case isAnchorElement(n):
		return a2rst(n)
	case isUlElement(n):
		return ul2rst(n)
	}

	var out string
	for child := n.FirstChild; child != nil; child = child.NextSibling {
		out += traverse(child)
	}
	return out
}

// HtmlAUlLiToRst parses the HTML read from r and returns the
// reStructuredText produced by traversing the parsed document.
//
// It panics with a string starting with "Fail to parse html" if html.Parse
// returns an error; the underlying error text is appended so the cause is
// not lost.
func HtmlAUlLiToRst(r io.Reader) string {
	doc, err := html.Parse(r)
	if err != nil {
		panic("Fail to parse html: " + err.Error())
	}

	return traverse(doc)
}
aulli2rst_test.go | repository | view raw
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
package aulli2rst

import (
	"strings"
	"testing"
)

// testHtml is the HTML fixture passed to HtmlAUlLiToRst in
// TestHtmlAUlLiToRst: a document with a title, a three-item unordered list
// (the second item contains a link, the third a nested two-item list) and a
// second link placed after the list.
var testHtml = `<!DOCTYPE html><html>
<head><title>a ul li to rst</title></head>
<body>
  <ul>
    <li>item 1</li>
    <li>item 2 <a href="/">link 1</a></li>
    <li>item 3
      <ul>
        <li>item 3-1</li>
        <li>item 3-2</li>
      </ul>
    </li>
  </ul>
  <a href="/">link 2</a>
</body></html>`

// TestHtmlAUlLiToRst converts testHtml and verifies that the top-level
// bullets, the indented nested list and both links appear in the rst output.
// Only fragments are checked, so the test does not depend on the exact
// whitespace emitted between them.
func TestHtmlAUlLiToRst(t *testing.T) {
	got := HtmlAUlLiToRst(strings.NewReader(testHtml))

	wants := []string{
		"- item 1\n",
		"- item 2 `link 1 </>`__\n",
		"  - item 3-1\n  - item 3-2",
		"`link 2 </>`__",
	}
	for _, want := range wants {
		if !strings.Contains(got, want) {
			t.Errorf("output is missing %q; got:\n%s", want, got)
		}
	}
}

Output of test:

=== RUN   TestHtmlAUlLiToRst
a ul li to rst

- item 1
- item 2 `link 1 </>`__
- item 3

  - item 3-1
  - item 3-2


`link 2 </>`__
--- PASS: TestHtmlAUlLiToRst (0.00s)
PASS

Tested on: Ubuntu Linux 16.04, Go 1.6.2.


References:

[1]html - GoDoc
[2]goquery - Replace HTML Link Node with reStructuredText Text Node
[3][Golang] Read Lines From File or String
[4]go string concat - Google search
   How to efficiently concatenate strings in Go? - Stack Overflow

[5]goquery - Convert HTML Unordered List to reStructuredText