[Golang] Unmarshal UTF-16 XML


How to parse a UTF-16 XML file in Go.

The Go standard encoding/xml package provides basic methods for parsing XML files. However, by default the encoding/xml package parses only UTF-8 encoded XML. After spending a lot of time on searching and trial and error [1] [2] [3], I finally got my UTF-16 XML parsed correctly. I do not know exactly why it works, so I just show the code here for reference.

import (
      "encoding/xml"
      "io"

      "golang.org/x/net/html/charset"
)

// BypassReader is a pass-through charset reader: it ignores label and hands
// input back unchanged together with a nil error.
//
// Its signature lets it be assigned to xml.Decoder.CharsetReader, which is
// how DecodeUtf16XML uses it for input that has already been run through a
// charset-converting reader.
func BypassReader(label string, input io.Reader) (io.Reader, error) {
      // No conversion is performed here; the caller's reader is returned as is.
      return input, nil
}

// DecodeUtf16XML reads an XML document from r and decodes it into v.
//
// r is first wrapped with charset.NewReader, and the wrapped reader is then
// fed to an xml.Decoder whose CharsetReader is BypassReader. v is handed
// directly to Decoder.Decode. An error from charset.NewReader or from Decode
// is returned unchanged.
func DecodeUtf16XML(r io.Reader, v interface{}) (err error) {
      // Sample input (the Tipiṭaka XML is encoded in UTF-16):
      //   https://www.tipitaka.org/romn/cscd/vin01m.mul.toc.xml
      // Background reading (Google search: golang xml utf-16):
      //   https://stackoverflow.com/questions/6002619/unmarshal-an-iso-8859-1-xml-input-in-go
      //   https://groups.google.com/forum/#!topic/golang-nuts/tXcECEKC2rs
      //
      // NOTE(review): the x/net charset docs describe the second argument of
      // NewReader as a content type rather than an encoding label, so the
      // UTF-16 detection presumably comes from the input's byte-order mark --
      // confirm before relying on this for BOM-less input.
      converted, err := charset.NewReader(r, "utf-16")
      if err != nil {
              return err
      }

      // Per the encoding/xml docs, a document that declares a non-UTF-8
      // encoding makes Decode fail unless CharsetReader is set. The stream is
      // already wrapped above, so a pass-through reader is installed.
      xmlDecoder := xml.NewDecoder(converted)
      xmlDecoder.CharsetReader = BypassReader
      return xmlDecoder.Decode(v)
}

Usage:

import (
      "encoding/xml"
      "os"
      "testing"
)

// Tree maps one <tree> XML element, including any <tree> elements nested
// directly inside it, onto a Go value for use with encoding/xml.
type Tree struct {
      // XMLName pins the element name to "tree".
      XMLName xml.Name `xml:"tree"`

      // Trees collects the direct child <tree> elements, in document order.
      Trees []Tree `xml:"tree"`

      // The remaining fields are filled from the element's attributes
      // "text", "src" and "action" respectively.
      Text   string `xml:"text,attr"`
      Src    string `xml:"src,attr"`
      Action string `xml:"action,attr"`
}

// TestDecodeUtf16XML runs DecodeUtf16XML against the Tipiṭaka table-of-contents
// XML file and logs the decoded Tree.
//
// CheckDownload is defined elsewhere; presumably it makes sure the file is
// present at dst, fetching it from the URL when needed -- confirm against its
// definition. The test aborts on the first error.
func TestDecodeUtf16XML(t *testing.T) {
      dst := "/tmp/romn/cscd/vin01m.mul.toc.xml"
      if err := CheckDownload("https://www.tipitaka.org/romn/cscd/vin01m.mul.toc.xml", dst, false); err != nil {
              t.Fatal(err)
      }

      f16, err := os.Open(dst)
      if err != nil {
              t.Fatal(err)
      }
      // Release the file handle when the test ends; deferred calls also run
      // when t.Fatal stops the test.
      defer f16.Close()

      tree := Tree{}
      if err := DecodeUtf16XML(f16, &tree); err != nil {
              t.Fatal(err)
      }
      t.Log(tree)
}

Tested on: Ubuntu Linux 20.04, Go 1.12.17, 1.14.4.


References:

[1]
[2]utf 8 - Unmarshal an ISO-8859-1 XML input in Go - Stack Overflow
[3]
[4]decode utf-16 xml · siongui/gopalilib@370ba1f · GitHub
[5][Golang] Auto-Detect and Convert Encoding of HTML to UTF-8