[Golang] Iterate over All DOM Elements in HTML


Introduction

Iterate over all DOM elements in an HTML document using Go. The net/html package is used to parse the HTML and visit every element in the resulting tree. As an example, the program searches for HTML links and outputs them in reStructuredText format.

Another example of iterating over all DOM elements can be found in [4].

Install net/html package

$ go get -u golang.org/x/net/html

Traverse DOM Tree

Traverse the DOM tree (Iterate over all elements in HTML):

html.go | repository | view raw
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
package main

import (
	"flag"
	"fmt"
	"golang.org/x/net/html"
	"os"
)

// traverse visits n and then, recursively, each of its children from first to
// last (a pre-order walk). Every anchor element met along the way is handed to
// printRstLink.
func traverse(n *html.Node) {
	if isAnchorElement(n) {
		printRstLink(n)
	}

	child := n.FirstChild
	for child != nil {
		traverse(child)
		child = child.NextSibling
	}
}

// parseCommandLineArguments registers the -input flag, parses the command
// line, and returns the flag's value.
//
// An empty value is reported on standard error but is still returned, so the
// caller decides how to react to it.
func parseCommandLineArguments() string {
	var inputPath string
	flag.StringVar(&inputPath, "input", "", "Path of HTML file to be processed")
	flag.Parse()

	if len(inputPath) == 0 {
		fmt.Fprint(os.Stderr, "Error: empty path!\n")
	}
	return inputPath
}

// main reads the HTML file named by the -input flag and prints every link in
// it. It exits with status 2 when no input path was given and with status 1
// when the file cannot be opened or parsed.
func main() {
	inputFile := parseCommandLineArguments()
	if inputFile == "" {
		// parseCommandLineArguments has already reported the empty path;
		// stop here instead of trying to open "" and crashing.
		os.Exit(2)
	}

	if err := processFile(inputFile); err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}
}

// processFile opens the HTML file at path, parses it, and traverses the
// resulting tree. It returns an error that includes the underlying cause when
// the file cannot be opened or parsed.
func processFile(path string) error {
	fin, err := os.Open(path)
	if err != nil {
		return fmt.Errorf("Fail to open %s: %v", path, err)
	}
	// Runs on every return path because errors are returned, not os.Exit'ed,
	// from inside this function.
	defer fin.Close()

	doc, err := html.Parse(fin)
	if err != nil {
		return fmt.Errorf("Fail to parse %s: %v", path, err)
	}

	traverse(doc)
	return nil
}

Find HTML links and print them in reStructuredText format:

handleHtmlLink.go | repository | view raw
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
package main

import (
	"errors"
	"fmt"
	"golang.org/x/net/html"
	"os"
	"strings"
)

// isAnchorElement reports whether n is an element node whose tag name is "a".
func isAnchorElement(n *html.Node) bool {
	if n.Type != html.ElementNode {
		return false
	}
	return n.Data == "a"
}

// isTextNode reports whether n is a text node.
func isTextNode(n *html.Node) bool {
	kind := n.Type
	return kind == html.TextNode
}

// isHasOnlyOneChild reports whether n has exactly one child, i.e. its first
// child exists and is also its last child.
func isHasOnlyOneChild(n *html.Node) bool {
	first := n.FirstChild
	if first == nil {
		return false
	}
	return first == n.LastChild
}

// getAttribute returns the value of the first attribute of n whose key equals
// key. If n has no such attribute it returns an empty string and a non-nil
// error naming the missing key.
func getAttribute(n *html.Node, key string) (string, error) {
	for i := range n.Attr {
		if candidate := n.Attr[i]; candidate.Key == key {
			return candidate.Val, nil
		}
	}
	return "", errors.New(key + " not exist in attribute!")
}

// printRstLink prints the anchor n to standard output as a reStructuredText
// link of the form `text <href>`__, where text is the anchor's whitespace-
// trimmed text content.
//
// Only anchors whose single child is a text node are handled. Any other
// anchor, or one without an href attribute, is reported on standard error and
// skipped.
func printRstLink(n *html.Node) {
	if !isHasOnlyOneChild(n) {
		fmt.Fprintf(os.Stderr, "Child number of anchor is not 1\n")
		return
	}

	if !isTextNode(n.FirstChild) {
		fmt.Fprintf(os.Stderr, "Child of anchor is not TextNode\n")
		return
	}

	text := strings.TrimSpace(n.FirstChild.Data)

	href, err := getAttribute(n, "href")
	if err != nil {
		// Print the error as data, not as a format string: a '%' in the
		// message would otherwise be interpreted as a verb. Fprintln also
		// adds the newline the other diagnostics already end with.
		fmt.Fprintln(os.Stderr, err)
		return
	}

	rstLink := "`" + text + " <" + href + ">`__"
	fmt.Println(rstLink)
}

Usage

Download any HTML file and pass its path to the Go program via the -input flag. For example, if index.html is in the current directory together with the Go source files, run the program with the following command:

$ go run html.go handleHtmlLink.go -input=index.html

Tested on: Ubuntu Linux 15.10, Go 1.6.


References:

[1]jquery iterate over elements - Google search
[2]

net/html go - Google search

A Simple Web Scraper in Go | Gregory Schier

golang.org/x/net/html GoDoc

[3]github.com/PuerkitoBio/goquery - GoDoc
[4][Golang] getElementById via net/html Package