[Golang] Unrobust HTML Table to reStructuredText list-table


Introduction

Convert an HTML table to a reStructuredText list-table via the net/html package in Golang (the Go programming language). This is an experimental implementation and is not robust. For a more robust converter, see the Python Beautiful Soup 4 (bs4) implementation [4].

Install net/html package

$ go get -u golang.org/x/net/html

HTML table to reStructuredText list-table

table2rst.go | repository | view raw
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
package table2rst

import (
	"golang.org/x/net/html"
	"strings"
)

// ElementType is the coarse classification of an HTML node that the
// converter cares about.
type ElementType int

// The node classifications, numbered consecutively starting at zero.
const (
	// TextNode marks an HTML text node.
	TextNode = ElementType(iota)
	// tableElementNode marks a <table> element.
	tableElementNode
	// tbodyElementNode marks a <tbody> element.
	tbodyElementNode
	// trElementNode marks a <tr> element.
	trElementNode
	// tdElementNode marks a <td> element.
	tdElementNode
	// NoNeedToKnow marks every other kind of node.
	NoNeedToKnow
)

// getElementType classifies n as a text node, as one of the table-related
// elements (td, tr, tbody, table), or as NoNeedToKnow for anything else.
func getElementType(n *html.Node) ElementType {
	switch n.Type {
	case html.TextNode:
		return TextNode
	case html.ElementNode:
		// Only the tags that make up a simple table are distinguished.
		switch n.Data {
		case "td":
			return tdElementNode
		case "tr":
			return trElementNode
		case "tbody":
			return tbodyElementNode
		case "table":
			return tableElementNode
		}
	}
	return NoNeedToKnow
}

// getTextNodeRst returns the content of a text node with leading and
// trailing white space removed.
func getTextNodeRst(text *html.Node) string {
	trimmed := strings.TrimSpace(text.Data)
	return trimmed
}

// getTdRst renders the content of a <td> element as the text of one
// list-table cell. Each child text node is trimmed and emitted on its own
// line. The result always ends with a newline.
//
// A cell without any children yields a single "\n" so that the list item
// opened by the caller is still terminated; otherwise the next cell would be
// appended to the same output line and the row would lose a column.
//
// It panics if td contains anything other than text nodes.
func getTdRst(td *html.Node) string {
	s := ""
	for c := td.FirstChild; c != nil; c = c.NextSibling {
		if getElementType(c) != TextNode {
			panic("cannot convert this td")
		}
		s += getTextNodeRst(c) + "\n"
	}
	if s == "" {
		// Empty cell: keep the row well formed by ending the item line.
		return "\n"
	}
	return s
}

// getTrRst renders a <tr> element as one list-table row: the first cell is
// introduced by "  * - " and every following cell by "    - ". Text nodes
// that are direct children of the row are appended after trimming.
//
// It panics on any child that is neither a <td> element nor a text node.
func getTrRst(tr *html.Node) string {
	out := ""
	// Prefix for the next cell; switches to the continuation form once the
	// first cell of the row has been written.
	marker := "  * - "
	for child := tr.FirstChild; child != nil; child = child.NextSibling {
		switch getElementType(child) {
		case tdElementNode:
			out += marker + getTdRst(child)
			marker = "    - "
		case TextNode:
			out += getTextNodeRst(child)
		default:
			panic("cannot convert this tr")
		}
	}
	return out
}

// getTbodyRst renders every row of a <tbody> element and concatenates the
// results. Text nodes that are direct children are appended after trimming.
//
// It panics on any child that is neither a <tr> element nor a text node.
func getTbodyRst(tbody *html.Node) string {
	out := ""
	for child := tbody.FirstChild; child != nil; child = child.NextSibling {
		switch getElementType(child) {
		case trElementNode:
			out += getTrRst(child)
		case TextNode:
			out += getTextNodeRst(child)
		default:
			panic("cannot convert this tbody")
		}
	}
	return out
}

// getTableRst renders a <table> element as a reStructuredText list-table:
// the directive line, a blank line, then the output of each <tbody> child.
// Text nodes that are direct children are appended after trimming.
//
// It panics on any child that is neither a <tbody> element nor a text node.
func getTableRst(table *html.Node) string {
	out := ".. list-table::\n\n"
	for child := table.FirstChild; child != nil; child = child.NextSibling {
		switch getElementType(child) {
		case tbodyElementNode:
			out += getTbodyRst(child)
		case TextNode:
			out += getTextNodeRst(child)
		default:
			panic("cannot convert this table")
		}
	}
	return out
}

// traverse walks the children of n in document order. Every <table> element
// it meets is converted with getTableRst; any other node is searched
// recursively. The converted tables are returned concatenated.
func traverse(n *html.Node) string {
	out := ""
	for child := n.FirstChild; child != nil; child = child.NextSibling {
		if getElementType(child) != tableElementNode {
			// Not a table: keep looking further down the tree.
			out += traverse(child)
			continue
		}
		out += getTableRst(child)
	}
	return out
}

// HtmlTableToRstListTable converts the tables found beneath doc into
// reStructuredText list-table markup and returns the resulting text.
func HtmlTableToRstListTable(doc *html.Node) string {
	rst := traverse(doc)
	return rst
}
table2rst_test.go | repository | view raw
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
package table2rst

import (
	"golang.org/x/net/html"
	"strings"
	"testing"
)

// indexHtml is the HTML document used as test input; its body holds a
// single table of two rows with two cells each.
const indexHtml = `<!DOCTYPE html>
<html>
<head><title>[Go] HTML table to reStructuredText list-table</title></head>
<body>
  <table>
    <tr><td>R1, C1</td><td>R1, C2</td></tr>
    <tr><td>R2, C1</td><td>R2, C2</td></tr>
  </table>
</body>
</html>`

// tableRst is the list-table text the converter is expected to produce for
// indexHtml.
const tableRst = `.. list-table::

  * - R1, C1
    - R1, C2
  * - R2, C1
    - R2, C2
`

// TestTable2Rst parses indexHtml and checks that converting it yields
// exactly tableRst.
func TestTable2Rst(t *testing.T) {
	doc, err := html.Parse(strings.NewReader(indexHtml))
	if err != nil {
		// Report through the testing framework rather than panicking, so the
		// failure is attributed to this test and carries the parse error.
		t.Fatalf("parsing test HTML: %v", err)
	}

	if got := HtmlTableToRstListTable(doc); got != tableRst {
		t.Errorf("HtmlTableToRstListTable() = %q, want %q", got, tableRst)
	}
}

Tested on: Ubuntu Linux 15.10, Go 1.6.


References:

[1]jquery iterate over elements - Google search
[2]
[3]github.com/PuerkitoBio/goquery - GoDoc
[4][Python] Convert HTML Table to reStructuredText list-table
[5]html table to rst list-table · twnanda/twnanda@e022835 · GitHub
[6][Golang] HTML Table to reStructuredText list-table via goquery
[7]go - How to convert HTML table to array with golang - Stack Overflow