[Golang] Unmarshal UTF-16 XML
How to parse a UTF-16 XML file in Go.
The Go standard encoding/xml package provides basic methods for parsing XML files. However, the encoding/xml package parses only UTF-8 encoded XML by default. After spending a lot of time on searching and trial and error [1] [2] [3], I finally got my UTF-16 XML parsed correctly. I do not know exactly why it works, so the code is shown here just for reference.
import (
"encoding/xml"
"io"
"golang.org/x/net/html/charset"
)
// BypassReader is a pass-through charset reader. It ignores label and hands
// input back unchanged together with a nil error.
//
// Its signature matches the CharsetReader hook of xml.Decoder, so it can be
// installed there when the stream given to the decoder has already been
// converted and must not be transcoded a second time.
func BypassReader(label string, input io.Reader) (io.Reader, error) {
	// No conversion is performed regardless of the charset label.
	return input, nil
}
// DecodeUtf16XML decodes the XML document read from r into v, where r is
// expected to carry UTF-16 encoded data (for example the Tipiṭaka table of
// contents at https://www.tipitaka.org/romn/cscd/vin01m.mul.toc.xml).
//
// It works in two steps:
//  1. r is wrapped with charset.NewReader using the label "utf-16"; per the
//     x/net charset documentation this yields a reader that converts the
//     content to UTF-8.
//  2. An xml.Decoder reads from that wrapped reader with CharsetReader set to
//     BypassReader, so that a non-UTF-8 encoding named in the XML declaration
//     does not trigger another conversion (or an error) inside the decoder.
//
// v is passed straight to (*xml.Decoder).Decode. The returned error is
// whatever charset.NewReader or Decode reported, or nil on success.
//
// Background reading:
//   - Google search: golang xml utf-16
//   - https://stackoverflow.com/questions/6002619/unmarshal-an-iso-8859-1-xml-input-in-go
//   - https://groups.google.com/forum/#!topic/golang-nuts/tXcECEKC2rs
func DecodeUtf16XML(r io.Reader, v interface{}) (err error) {
	utf8Reader, err := charset.NewReader(r, "utf-16")
	if err != nil {
		return err
	}

	dec := xml.NewDecoder(utf8Reader)
	// The bytes reaching the decoder went through charset.NewReader above,
	// so the decoder's own charset hook only has to pass them along.
	dec.CharsetReader = BypassReader

	return dec.Decode(v)
}
Usage:
import (
"encoding/xml"
"os"
"testing"
)
// Tree maps one <tree> XML element, including any <tree> elements nested
// inside it, onto a Go value for use with encoding/xml.
type Tree struct {
// XMLName ties this struct to elements named "tree".
XMLName xml.Name `xml:"tree"`
// Trees collects the child <tree> elements, making the type recursive.
Trees []Tree `xml:"tree"`
// Text is filled from the element's "text" attribute.
Text string `xml:"text,attr"`
// Src is filled from the element's "src" attribute.
// NOTE(review): presumably the location of the referenced document - confirm.
Src string `xml:"src,attr"`
// Action is filled from the element's "action" attribute.
Action string `xml:"action,attr"`
}
// TestDecodeUtf16XML exercises DecodeUtf16XML end to end: it obtains the
// Tipiṭaka table-of-contents XML through CheckDownload, opens the local copy,
// decodes it into a Tree and logs the decoded value.
//
// The test fails as soon as the download, the open or the decode step
// reports an error.
func TestDecodeUtf16XML(t *testing.T) {
	dst := "/tmp/romn/cscd/vin01m.mul.toc.xml"

	// CheckDownload is defined elsewhere in the package; presumably it stores
	// the URL's content at dst (verify the meaning of the final flag there).
	if err := CheckDownload("https://www.tipitaka.org/romn/cscd/vin01m.mul.toc.xml", dst, false); err != nil {
		t.Fatal(err)
	}

	f16, err := os.Open(dst)
	if err != nil {
		t.Fatal(err)
	}
	// Release the file handle when the test finishes so it is not leaked.
	defer f16.Close()

	tree := Tree{}
	if err := DecodeUtf16XML(f16, &tree); err != nil {
		t.Fatal(err)
	}
	t.Log(tree)
}
Tested on: Ubuntu Linux 20.04, Go 1.12.17, 1.14.4.
References:
[1] | Google search: golang xml utf-16 |
[2] | utf 8 - Unmarshal an ISO-8859-1 XML input in Go - Stack Overflow |
[3] | https://groups.google.com/forum/#!topic/golang-nuts/tXcECEKC2rs |
[4] | decode utf-16 xml · siongui/gopalilib@370ba1f · GitHub |
[5] | [Golang] Auto-Detect and Convert Encoding of HTML to UTF-8 |