goquery - Replace HTML Link Node with reStructuredText Text Node


Introduction

Replace HTML link nodes in a webpage with text nodes in reStructuredText format, using goquery in Go (Golang).

Install goquery Package

$ go get -u github.com/PuerkitoBio/goquery

Source Code

link2rst.go | repository | view raw
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
package link2rst

import (
	"bytes"
	"github.com/PuerkitoBio/goquery"
	"strings"
	"text/template"
)

// rstLink is the text/template source used to render one hyperlink in
// reStructuredText form: the link text, then the target in angle brackets,
// wrapped in backquotes and followed by an underscore. It expects a value
// with Text and Href fields.
const rstLink = "`{{.Text}} <{{.Href}}>`_"

// htmlLink carries the data fed to the rstLink template for a single anchor:
// the anchor's visible text and the value of its href attribute.
type htmlLink struct {
	// Field order matters: callers build this with a positional literal,
	// Text first and Href second.
	Text, Href string
}

// HtmlAnchorNodeToRstTextNode replaces every <a> element in doc that carries
// an href attribute with text holding the equivalent reStructuredText
// hyperlink, rendered from the rstLink template (`text <href>`_).
//
// The link text is the anchor's text content with surrounding whitespace
// trimmed. Anchors without an href attribute are left untouched. The document
// is modified in place and the same pointer is returned for convenience.
func HtmlAnchorNodeToRstTextNode(doc *goquery.Document) *goquery.Document {
	tmpl := template.Must(template.New("link2rst").Parse(rstLink))

	doc.Find("a").Each(func(_ int, link *goquery.Selection) {
		href, ok := link.Attr("href")
		if !ok {
			// Not a hyperlink (e.g. a named anchor); nothing to convert.
			return
		}
		text := strings.TrimSpace(link.Text())

		var rstBuf bytes.Buffer
		if err := tmpl.Execute(&rstBuf, &htmlLink{text, href}); err != nil {
			// Kept as a panic to preserve the existing signature; writing to
			// a bytes.Buffer does not fail, so an error here presumably means
			// rstLink and htmlLink have drifted apart.
			panic(err)
		}

		// ReplaceWithHtml parses its argument as HTML. Without escaping, the
		// "<href>" part of the rendered link would be read as an element and
		// the URL would vanish from the document text; '<' or '&' in the link
		// text would be misparsed the same way. Escaping makes the parser
		// produce a plain text node with the exact rendered string.
		link.ReplaceWithHtml(template.HTMLEscapeString(rstBuf.String()))
	})

	return doc
}
link2rst_test.go | repository | view raw
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
package link2rst

import (
	"github.com/PuerkitoBio/goquery"
	"testing"
)

// TestHtmlAnchorNodeToRstTextNode fetches a live web page, converts its links
// with HtmlAnchorNodeToRstTextNode and logs the resulting body text. It makes
// no assertions about the output; it only checks that fetching and converting
// complete without error. Run with -v to see the logged text.
func TestHtmlAnchorNodeToRstTextNode(t *testing.T) {
	// The test depends on an external site, so let `go test -short` skip it.
	if testing.Short() {
		t.Skip("skipping test that requires network access")
	}

	url := "http://nanda.online-dhamma.net/"
	doc, err := goquery.NewDocument(url)
	if err != nil {
		// Report through the testing framework instead of panicking so the
		// failure is attributed to this test and other tests still run.
		t.Fatalf("fetching %s: %v", url, err)
	}

	doc2 := HtmlAnchorNodeToRstTextNode(doc)
	t.Log(doc2.Find("body").Text())
}

Tested on: Ubuntu Linux 16.04, Go 1.6.2.


References:

[1] github.com/PuerkitoBio/goquery - GoDoc
[2] [Golang] Convert All HTML Links to reStructuredText via goquery
[3] [Golang] Extract Title, Image, and URL via goquery
[4] html escape < - Google search