[Golang] Auto-Detect and Convert Encoding of HTML to UTF-8


Given a URL, this Go example auto-detects the encoding of the HTML document and converts it to UTF-8 if it is not already UTF-8 encoded. The package golang.org/x/net/html/charset is used to determine the encoding of the HTML document, and the package golang.org/x/text is used for the encoding conversion.

Install the packages first:

$ go get -u golang.org/x/text
$ go get -u golang.org/x/net/html

The following code shows how to determine the encoding of an HTML document given its URL, and how to convert the document to UTF-8 if it is not already UTF-8 encoded:

url.go | repository | view raw
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
package toutf8

import (
	"bufio"
	"bytes"
	"fmt"
	"io"
	"io/ioutil"
	"net/http"

	"golang.org/x/net/html/charset"
	"golang.org/x/text/encoding"
	"golang.org/x/text/transform"
)

func UrlToUtf8Encoding(url string) (r io.Reader, name string, certain bool, err error) {
	resp, err := http.Get(url)
	if err != nil {
		return
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		err = fmt.Errorf("response status code: %d", resp.StatusCode)
		return
	}

	b, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		return
	}
	e, name, certain, err := DetermineEncodingFromReader(bytes.NewReader(b))
	if err != nil {
		return
	}

	r = transform.NewReader(bytes.NewReader(b), e.NewDecoder())
	return
}

// DetermineEncodingFromReader guesses the character encoding of the HTML
// document available from r.
//
// It peeks at up to the first 1024 bytes of r and hands them to
// charset.DetermineEncoding with an empty Content-Type, returning that
// function's encoding, name and certainty results unchanged.
//
// r is wrapped in a fresh bufio.Reader, so bytes pulled from r while peeking
// are not put back; callers that need the content afterwards must supply a
// reader they can recreate or rewind.
//
// An error is returned only when reading from r fails for a reason other than
// reaching the end of the input.
func DetermineEncodingFromReader(r io.Reader) (e encoding.Encoding, name string, certain bool, err error) {
	b, err := bufio.NewReader(r).Peek(1024)
	// Peek reports io.EOF together with the bytes it did get when the input
	// holds fewer than 1024 bytes. A short document is still a valid document,
	// so only other errors are treated as failures.
	if err != nil && err != io.EOF {
		return nil, "", false, err
	}

	e, name, certain = charset.DetermineEncoding(b, "")
	return e, name, certain, nil
}

Note that the resp.Body stream cannot be read twice, so it is first read into a byte slice, which is then used for both detection and conversion. See [4] for more details.

Usage of the above code:

url_test.go | repository | view raw
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
package toutf8

import (
	"io/ioutil"
	"testing"
)

// TestUrlToUtf8Encoding fetches two live pages, checks that the detected
// encoding name matches the expected one, and logs the converted content.
//
// The test needs network access to the listed URLs and stops at the first
// failure.
func TestUrlToUtf8Encoding(t *testing.T) {
	cases := []struct {
		url  string
		want string // expected encoding name
	}{
		{"http://shenfang.com.tw/product-1.htm", "big5"},
		{"https://siongui.github.io/", "utf-8"},
	}

	for _, c := range cases {
		r, name, _, err := UrlToUtf8Encoding(c.url)
		if err != nil {
			t.Error(err)
			return
		}
		if name != c.want {
			t.Error("bad guess!")
			return
		}

		// Draining the reader is what actually runs the decoder, so a read
		// error here means the conversion failed and must not be ignored.
		b, err := ioutil.ReadAll(r)
		if err != nil {
			t.Error(err)
			return
		}
		t.Log(string(b))
	}
}

Tested on: Ubuntu 18.04, Go 1.11.1


References:

[1]
[2]golang 用/x/net/html写的小爬虫,爬小说 - 简书
[3][Golang] Determine Encoding of HTML Document
[4][Golang] Read Twice From the Same io.Reader