[Golang] Parse Web Feed - RSS and Atom


This post shows how to parse RSS 2.0 and Atom 1.0 web feeds. (The logic is explained below the source code.)

Source Code

parseFeed.go | repository | view raw
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
package main

import (
	"io/ioutil"
	"encoding/xml"
	"html/template"
	"log"
)


// Rss2 is the decoded form of an RSS document whose root element is <rss>.
// The channel-level fields are read from children of the nested <channel>
// element via the "channel>..." paths in the struct tags.
type Rss2 struct {
	XMLName xml.Name `xml:"rss"`
	// Version holds the root element's version attribute.
	Version string `xml:"version,attr"`

	// Required channel elements.
	Title       string `xml:"channel>title"`
	Link        string `xml:"channel>link"`
	Description string `xml:"channel>description"`

	// Optional channel elements.
	PubDate  string `xml:"channel>pubDate"`
	ItemList []Item `xml:"channel>item"`
}

// Item is one <item> of an RSS channel.
//
// Description and Content are typed as template.HTML, so html/template will
// emit them without escaping.
// NOTE(review): feed bodies come from remote sites; confirm they are
// sanitized before being rendered.
type Item struct {
	// Required item elements.
	Title       string        `xml:"title"`
	Link        string        `xml:"link"`
	Description template.HTML `xml:"description"`

	// Optional item elements. Content is read from a child element whose
	// local name is "encoded" (presumably content:encoded).
	Content  template.HTML `xml:"encoded"`
	PubDate  string        `xml:"pubDate"`
	Comments string        `xml:"comments"`
}


// Atom1 is the decoded form of an Atom document. The XMLName tag restricts
// decoding to a root <feed> element in the http://www.w3.org/2005/Atom
// namespace.
type Atom1 struct {
	XMLName   xml.Name `xml:"http://www.w3.org/2005/Atom feed"`
	Title     string   `xml:"title"`
	Subtitle  string   `xml:"subtitle"`
	Id        string   `xml:"id"`
	Updated   string   `xml:"updated"`
	Rights    string   `xml:"rights"`
	Link      Link     `xml:"link"`
	Author    Author   `xml:"author"`
	EntryList []Entry  `xml:"entry"`
}

// Link captures the href attribute of an Atom <link> element; any other
// attributes on the element are not decoded.
type Link struct {
	Href string `xml:"href,attr"`
}

// Author holds the <name> and <email> children of an Atom <author> element.
type Author struct {
	Name  string `xml:"name"`
	Email string `xml:"email"`
}

// Entry is one <entry> of an Atom feed. Every field is decoded from the
// child element named in its struct tag.
type Entry struct {
	Title   string `xml:"title"`
	Summary string `xml:"summary"`
	Content string `xml:"content"`
	Id      string `xml:"id"`
	Updated string `xml:"updated"`
	Link    Link   `xml:"link"`
	Author  Author `xml:"author"`
}

// atom1ToRss2 converts a decoded Atom feed into the Rss2 shape.
//
// Feed level: Title is copied as is, the link's href becomes Link, Subtitle
// becomes Description and Updated becomes PubDate.
//
// Entry level: each entry yields one Item at the same index. Title and the
// link's href are copied, Description is the entry's Content (or its Summary
// when Content is empty), and Updated becomes PubDate so that item dates are
// carried over the same way the feed date is. Timestamps are copied verbatim;
// they are not reformatted.
//
// The returned ItemList is always non-nil and has len(a.EntryList) elements.
func atom1ToRss2(a Atom1) Rss2 {
	r := Rss2{
		Title:       a.Title,
		Link:        a.Link.Href,
		Description: a.Subtitle,
		PubDate:     a.Updated,
		ItemList:    make([]Item, len(a.EntryList)),
	}
	for i, entry := range a.EntryList {
		// Prefer the full content; fall back to the summary.
		body := entry.Content
		if body == "" {
			body = entry.Summary
		}
		r.ItemList[i] = Item{
			Title:       entry.Title,
			Link:        entry.Link.Href,
			Description: template.HTML(body),
			PubDate:     entry.Updated,
		}
	}
	return r
}


// atomErrStr is the error text that parseFeedContent compares against the
// error from decoding into Rss2; an exact match makes it retry the content
// as Atom 1.0.
const atomErrStr = "expected element type <rss> but have <feed>"

// parseAtom decodes content as an Atom feed and returns it converted to Rss2.
// If decoding fails, the error is logged and an empty Rss2 is returned
// together with false.
func parseAtom(content []byte) (Rss2, bool) {
	var feed Atom1
	if err := xml.Unmarshal(content, &feed); err != nil {
		log.Println(err)
		return Rss2{}, false
	}
	return atom1ToRss2(feed), true
}

// parseFeedContent decodes content as a web feed and reports whether it
// succeeded.
//
// The content is first decoded as RSS. If that fails with exactly the
// atomErrStr message, the content is handed to parseAtom instead; any other
// decoding error is logged and reported as failure. A document that decodes
// as RSS but whose version attribute is not "2.0" is also logged and reported
// as failure. On both failure paths the Rss2 value is returned as the decoder
// left it.
//
// For an accepted RSS 2.0 feed, every item with a non-empty Content has its
// Description replaced by that Content.
func parseFeedContent(content []byte) (Rss2, bool) {
	var feed Rss2
	if err := xml.Unmarshal(content, &feed); err != nil {
		if err.Error() != atomErrStr {
			log.Println(err)
			return feed, false
		}
		// try Atom 1.0
		return parseAtom(content)
	}

	if feed.Version != "2.0" {
		log.Println("not RSS 2.0")
		return feed, false
	}

	// RSS 2.0
	for i := range feed.ItemList {
		item := &feed.ItemList[i]
		if item.Content != "" {
			item.Description = item.Content
		}
	}
	return feed, true
}


// main runs parseFeedContent over the three sample files and logs either the
// parsed feed title or a failure line for each one.
//
// A file that cannot be read is reported with its read error and skipped,
// instead of passing empty content to the parser.
func main() {
	samples := []struct {
		path  string // file to read
		label string // name used in the failure message
	}{
		{"example-6.xml", "example-6"}, // sample rss feed
		{"example-7.xml", "example-7"}, // sample atom feed
		{"example-5.xml", "example-5"}, // opml
	}

	for _, s := range samples {
		content, err := ioutil.ReadFile(s.path)
		if err != nil {
			log.Println(err)
			log.Println("fail to read " + s.label)
			continue
		}

		feed, ok := parseFeedContent(content)
		if !ok {
			log.Println("fail to read " + s.label)
			continue
		}
		log.Println(feed.Title)
	}
}

Main logic of the parseFeedContent function:

  1. Given XML content, the function first tries to parse it as RSS. If that succeeds and the feed is RSS 2.0, it returns the parsed result.
  2. If parsing as RSS fails because the root element is <feed>, the function tries to parse the content as Atom 1.0. If that succeeds, it converts the Atom result to the RSS format and returns it.
  3. If parsing fails as both RSS and Atom, the function logs the error and reports failure.

The sample Atom 1.0 feed comes from kura.io website. The sample RSS 2.0 feed comes from Solidot website. The sample OPML xml comes from my web feeds.

Tested on: Ubuntu Linux 14.10, Go 1.4.


[Golang] XML Parsing Example series:

[1][Golang] XML Parsing Example (1)
[2][Golang] XML Parsing Example (2)
[3][Golang] XML Parsing Example (3)
[4][Golang] XML Parsing Example (4)
[5][Golang] XML Parsing Example (5) - Parse OPML
[6][Golang] XML Parsing Example (6) - Parse OPML Concisely
[7][Golang] XML Parsing Example (7) - Parse RSS 2.0
[8][Golang] XML Parsing Example (8) - Parse Atom 1.0
[9][Golang] Convert Atom to RSS
[10][Golang] Parse Web Feed - RSS and Atom

[a]XML to Go struct : golang