goquery - Convert HTML Unordered List to reStructuredText


Introduction

Convert an HTML unordered (bulleted) list to reStructuredText format using goquery in Go (Golang).

Install goquery Package

$ go get -u github.com/PuerkitoBio/goquery

Source Code

ulli2rst.go | repository | view raw
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
package ulli2rst

import (
	"bufio"
	"fmt"
	"github.com/PuerkitoBio/goquery"
	"os"
	"strings"
)

// liMark holds the bullet characters emitted for list items. processUl picks
// one by nesting depth, so even depths use "-" and odd depths use "*".
var liMark = []string{"-", "*"}

// StringToLines splits s into lines and returns them without their line
// terminators ("\n" or "\r\n"). A trailing newline does not produce a final
// empty element, and an empty s yields a nil slice.
//
// Lines of any length are supported. If scanning nevertheless fails, the
// error is reported on standard error and the lines collected so far are
// returned.
func StringToLines(s string) []string {
	var lines []string

	scanner := bufio.NewScanner(strings.NewReader(s))
	// By default bufio.Scanner gives up with ErrTooLong on a line of 64 KiB
	// or more, which would silently drop that line and everything after it.
	// No line can be longer than s itself, so allow tokens up to len(s);
	// the extra byte keeps the buffer from being completely full before EOF.
	scanner.Buffer(make([]byte, 0, 4096), len(s)+1)
	for scanner.Scan() {
		lines = append(lines, scanner.Text())
	}

	if err := scanner.Err(); err != nil {
		fmt.Fprintln(os.Stderr, "splitting string into lines:", err)
	}

	return lines
}

// htmlTextEscaper escapes the characters that an HTML parser would otherwise
// interpret as markup, so plain text can be passed to ReplaceWithHtml safely.
var htmlTextEscaper = strings.NewReplacer("&", "&amp;", "<", "&lt;", ">", "&gt;")

// processUl rewrites the <ul> selection in place as reStructuredText bullet
// list text. depth is the nesting level of ul (0 for a top-level list) and
// selects the bullet character from liMark.
//
// Each <li> is replaced by its text: the first line is prefixed with a
// newline, the bullet mark and a space; every following line is indented by
// two spaces. Finally the <ul> itself is replaced by its accumulated text.
func processUl(ul *goquery.Selection, depth int) {
	ul.Find("li").Each(func(_ int, li *goquery.Selection) {
		// Convert nested lists first so that li.Text() below already
		// contains their bullet lines, which then get the continuation
		// indent of this item.
		li.Find("ul").Each(func(_ int, childUl *goquery.Selection) {
			processUl(childUl, depth+1)
		})

		lines := StringToLines(li.Text())
		indentedLines := make([]string, 0, len(lines))
		for i, line := range lines {
			if i == 0 {
				// Cycle through the available marks by nesting depth.
				mark := liMark[depth%len(liMark)]
				indentedLines = append(indentedLines, "\n"+mark+" "+line)
			} else {
				indentedLines = append(indentedLines, "  "+line)
			}
		}
		// Text() yields decoded plain text, while ReplaceWithHtml parses its
		// argument as markup. Escape it so characters such as "<" and "&"
		// in the list text survive instead of being re-parsed as HTML.
		li.ReplaceWithHtml(htmlTextEscaper.Replace(strings.Join(indentedLines, "\n")))
	})

	// Same escaping concern as above: the list text must be inserted as text.
	ul.ReplaceWithHtml(htmlTextEscaper.Replace(ul.Text()))
}

// HtmlUlLiToRst converts every <ul> in doc to reStructuredText bullet-list
// text. The document is modified in place and the same pointer is returned.
func HtmlUlLiToRst(doc *goquery.Document) *goquery.Document {
	for {
		// Re-query on every pass: processUl mutates the tree, so always
		// take whichever <ul> is currently first in the document.
		first := doc.Find("ul").First()
		if first.Length() == 0 {
			return doc
		}
		processUl(first, 0)
	}
}
ulli2rst_test.go | repository | view raw
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
package ulli2rst

import (
	"github.com/PuerkitoBio/goquery"
	"testing"
)

// TestHtmlUlLiToRst fetches a live page, converts its unordered lists and
// checks that no <ul> element is left. The converted body text is logged for
// manual inspection (visible with `go test -v`).
//
// NOTE(review): this test needs network access to the URL below.
func TestHtmlUlLiToRst(t *testing.T) {
	url := "http://nanda.online-dhamma.net/"
	doc, err := goquery.NewDocument(url)
	if err != nil {
		// Report through the testing framework instead of panicking.
		t.Fatalf("fetching %s: %v", url, err)
	}

	doc2 := HtmlUlLiToRst(doc)
	if n := doc2.Find("ul").Length(); n != 0 {
		t.Errorf("%d <ul> element(s) remain after conversion", n)
	}
	t.Log(doc2.Find("body").Text())
}

Tested on: Ubuntu Linux 16.04, Go 1.6.2.


References:

[1]github.com/PuerkitoBio/goquery - GoDoc
[2]goquery - Replace HTML Link Node with reStructuredText Text Node
[3][Golang] Read Lines From File or String
[4]go string concat - Google search

How to efficiently concatenate strings in Go? - Stack Overflow