[Golang] Web Scrape Blogger Post via goquery


Fetch a public post on Blogger and extract data via goquery.

We will extract the following fields from the post's HTML (a quick selector check follows the list):

  • PostUrl
  • Title
  • TimeStamp
  • Author
  • Summary
  • Content
  • Tags
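
Before walking through the full program, the sketch below runs a few of these selectors against a hand-written HTML fragment that mimics what a default Blogger template typically emits. Both the fragment and its values are illustrative assumptions, not markup copied from a real post.

selector_check.go (sketch):

package main

import (
	"fmt"
	"strings"

	"github.com/PuerkitoBio/goquery"
)

// sample is a hand-written stand-in for a Blogger post page.
const sample = `
<html><body>
  <h3 class="post-title">Hello Blogger</h3>
  <a class="timestamp-link" href="#"><abbr class="published" title="2017-11-01T10:00:00+08:00">10:00</abbr></a>
  <span class="post-author"><span class="fn">Alice</span></span>
  <div class="post-body"><p>Post content here.</p></div>
  <span class="post-labels"><a href="#">go</a><a href="#">vim</a></span>
</body></html>`

func main() {
	doc, err := goquery.NewDocumentFromReader(strings.NewReader(sample))
	if err != nil {
		panic(err)
	}

	// Each PostData field comes from one selector on the page.
	fmt.Println("Title:    ", strings.TrimSpace(doc.Find("h3.post-title").First().Text()))
	fmt.Println("TimeStamp:", doc.Find("a.timestamp-link > abbr").First().AttrOr("title", ""))
	fmt.Println("Author:   ", doc.Find("span.post-author > span.fn").First().Text())
}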

The following is the complete source code:

parse.go:
package main

import (
	"errors"
	"fmt"
	"strings"

	"github.com/PuerkitoBio/goquery"
)

// PostData holds the fields extracted from a single Blogger post.
type PostData struct {
	PostUrl   string
	Title     string
	TimeStamp string
	Author    string
	Summary   string
	Content   string
	Tags      string
}

// GetBlogspotTimeStamp reads the post time from the title attribute of the
// <abbr> element inside the timestamp link.
func GetBlogspotTimeStamp(doc *goquery.Document) (string, error) {
	abbr := doc.Find("a.timestamp-link > abbr").First()
	t, ok := abbr.Attr("title")
	if ok {
		return t, nil
	}

	return "", errors.New("cannot find timestamp")
}

func GetBlogspotTitle(doc *goquery.Document) (string, error) {
	t := doc.Find("h3.post-title").First()
	return strings.TrimSpace(t.Text()), nil
}

func GetBlogspotContent(doc *goquery.Document) (string, error) {
	c := doc.Find("div.post-body").First()
	return c.Html()
}

func GetBlogspotUrl(doc *goquery.Document) (string, error) {
	meta := doc.Find("meta[property='og:url']").First()
	u, ok := meta.Attr("content")
	if ok {
		return u, nil
	}

	return "", errors.New("cannot find url")
}

func GetBlogspotSummary(doc *goquery.Document) (string, error) {
	meta := doc.Find("meta[property='og:description']").First()
	d, ok := meta.Attr("content")
	if ok {
		return d, nil
	}

	return "", errors.New("cannot find summary")
}

func GetBlogspotAuthor(doc *goquery.Document) (string, error) {
	a := doc.Find("span.post-author > span.fn").First()
	return a.Text(), nil
}

// GetBlogspotTags joins all post labels into one comma-separated string.
func GetBlogspotTags(doc *goquery.Document) (string, error) {
	s := doc.Find("span.post-labels > a")
	labels := ""
	s.Each(func(_ int, l *goquery.Selection) {
		if labels != "" {
			labels += ", "
		}
		labels += l.Text()
	})
	return labels, nil
}

// ParseBlogspotPost fills a PostData from the document, returning on the
// first extraction error.
func ParseBlogspotPost(doc *goquery.Document) (*PostData, error) {
	bs := PostData{}
	var err error

	bs.TimeStamp, err = GetBlogspotTimeStamp(doc)
	if err != nil {
		return &bs, err
	}

	bs.Title, err = GetBlogspotTitle(doc)
	if err != nil {
		return &bs, err
	}

	bs.Content, err = GetBlogspotContent(doc)
	if err != nil {
		return &bs, err
	}

	bs.PostUrl, err = GetBlogspotUrl(doc)
	if err != nil {
		return &bs, err
	}

	bs.Summary, err = GetBlogspotSummary(doc)
	if err != nil {
		return &bs, err
	}

	bs.Author, err = GetBlogspotAuthor(doc)
	if err != nil {
		return &bs, err
	}

	bs.Tags, err = GetBlogspotTags(doc)
	if err != nil {
		return &bs, err
	}

	return &bs, nil
}

func main() {
	//url := "https://oathbystyx.blogspot.tw/2018/01/descartes-rules-of-signs.html"
	url := "https://timrau.blogspot.com/2017/11/avoid-vim-overwriting-indention-settings.html"
	doc, err := goquery.NewDocument(url)
	if err != nil {
		panic(err)
	}

	post, err := ParseBlogspotPost(doc)
	if err != nil {
		panic(err)
	}

	// Print every extracted field.
	fmt.Println(post.TimeStamp)
	fmt.Println(post.Title)
	fmt.Println(post.Content)
	fmt.Println(post.PostUrl)
	fmt.Println(post.Summary)
	fmt.Println(post.Author)
	fmt.Println(post.Tags)
}

Tested on: Ubuntu Linux 17.10, Go 1.9.2.
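
Note that goquery.NewDocument, used in main() above, worked at the version tested here but has since been deprecated in newer goquery releases in favor of fetching the page yourself and parsing the response body with NewDocumentFromReader. A minimal sketch of that equivalent follows; the helper name fetchDocument is just for illustration.

package main

import (
	"fmt"
	"net/http"

	"github.com/PuerkitoBio/goquery"
)

// fetchDocument downloads a page and parses it into a goquery document.
func fetchDocument(url string) (*goquery.Document, error) {
	resp, err := http.Get(url)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("unexpected status: %s", resp.Status)
	}

	return goquery.NewDocumentFromReader(resp.Body)
}

func main() {
	doc, err := fetchDocument("https://timrau.blogspot.com/2017/11/avoid-vim-overwriting-indention-settings.html")
	if err != nil {
		panic(err)
	}
	fmt.Println(doc.Find("h3.post-title").First().Text())
}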

