Online Taobao Item to reStructuredText Image on Google App Engine Go


An online service on Google App Engine Go that extracts the title and image URL from a Taobao item webpage and outputs them in reStructuredText format.

Online Taobao Item to reStructuredText

Source code:

Makefile | repository | view raw
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
# Append the go_appengine directory (resolved four levels above this one) to
# PATH; presumably this is where the SDK tools goapp and appcfg.py live.
export PATH := $(PATH):$(realpath ../../../../go_appengine/)
PROJECT_DIR=$(CURDIR)
PROJECT_ID=golden-operator-130720
PROJECT_VERSION=taobao-item2rst

# These targets are commands, not files. Declaring them phony keeps them
# runnable even if a file or directory with the same name exists.
.PHONY: default fmt deploy install

# Run the local development server for this project directory.
default:
	@echo "\033[92mRun development web server ...\033[0m"
	@cd ../; goapp serve ${PROJECT_DIR}

# Format all Go source files in this directory.
fmt:
	@echo "\033[92mGo fmt source code ...\033[0m"
	@goapp fmt *.go

# Upload the project as version PROJECT_VERSION of application PROJECT_ID.
deploy:
	cd ../; appcfg.py -A ${PROJECT_ID} -V ${PROJECT_VERSION} update ${PROJECT_DIR}
	@echo "\033[92mDeployed URL: http://${PROJECT_VERSION}.${PROJECT_ID}.appspot.com/\033[0m"

# Fetch (or update) the third-party packages the Go sources import.
install:
	@echo "\033[92mInstall golang.org/x/net/html ...\033[0m"
	@goapp get -u golang.org/x/net/html
	@echo "\033[92mInstall google.golang.org/appengine ...\033[0m"
	@goapp get -u google.golang.org/appengine
app.yaml | repository | view raw
1
2
3
4
5
6
# App Engine application configuration: Go runtime, API version go1.
runtime: go
api_version: go1

# The pattern /.* matches every request path, so all requests are routed to
# the _go_app script (presumably the compiled Go application).
handlers:
- url: /.*
  script: _go_app
taobaoitem2rst.go | repository | view raw
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
package taobaoitem2rst

import (
	"html/template"
	"net/http"
)

// TemplateValue is the data passed to the index page template.
type TemplateValue struct {
	// Textarea is the text rendered inside the page's <textarea> element.
	Textarea string
}

// index is the template source of the application's only page. It contains
// a form that POSTs a "url" field to "/", a textarea filled from the
// Textarea field of the template data, a button whose script copies the
// textarea content with document.execCommand('copy') and then overwrites the
// textarea with "Copy OK" or "Copy Fail", and links to two related tools.
var index = `<!doctype html>
<html>
<head>
  <title>Taobao Item to Rst</title>
</head>
<body>
  <form action="/" method="post">
    URL: <input name="url" size="80">
    <button>Send</button>
  </form><br>
  <textarea id="ta" rows="5" cols="80">{{.Textarea}}</textarea><br>
  <button type="button" id="copy">Copy textarea to clipboard</button>

  <br><br>
  <a target="_blank" href="http://html2rst.golden-operator-130720.appspot.com/">HTML to reStructuredText</a>
  <br><br>
  <a target="_blank" href="http://v1.golden-operator-130720.appspot.com/">URL to reStructuredText</a>

<script>
  var textareaElm = document.getElementById("ta");
  var copyElm = document.getElementById("copy");
  copyElm.onclick = function(event) {
    textareaElm.select();
    var isSuccessful = document.execCommand('copy');
    if (isSuccessful) {
      textareaElm.value = "Copy OK";
    } else {
      textareaElm.value = "Copy Fail";
    }
  }
</script>

</body>
</html>`

// tmpl is the index page parsed once at package initialization;
// template.Must panics if the index source fails to parse.
var tmpl = template.Must(template.New("taobaoitem2rst").Parse(index))

// init registers handler for the "/" pattern on the default HTTP mux.
func init() {
	http.HandleFunc("/", handler)
}

// handler serves the application's only page.
//
// For a POST request it reads the "url" form field, passes it to
// getTaobaoItemImgRst, and renders the page with the returned text in the
// textarea. For any other method it renders the page with an empty
// textarea.
//
// A failure while rendering the page is reported to the client with
// http.Error (status 500) rather than by panicking inside the handler.
func handler(w http.ResponseWriter, r *http.Request) {
	var val TemplateValue
	if r.Method == "POST" {
		val.Textarea = getTaobaoItemImgRst(r.PostFormValue("url"), r)
	}

	if err := tmpl.Execute(w, &val); err != nil {
		// The template itself is validated at start-up by template.Must,
		// so an error here most likely comes from writing the response.
		http.Error(w, err.Error(), http.StatusInternalServerError)
	}
}
fetch.go | repository | view raw
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
package taobaoitem2rst

import (
	"bytes"
	"html/template"
	"net/http"

	"google.golang.org/appengine"
	"google.golang.org/appengine/urlfetch"
)

// imgRst is the template text of a reStructuredText image directive. It is
// filled from the ImgURL, Title and URL fields of the data it is executed
// with.
var imgRst = `.. image:: {{ .ImgURL }}
   :alt: {{ .Title }}
   :target: {{ .URL }}
   :align: center`

func getTaobaoItemImgRst(url string, r *http.Request) string {
	nUrl := NormalizeURL(url)
	ctx := appengine.NewContext(r)
	client := urlfetch.Client(ctx)
	resp, err := client.Get(nUrl)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	ii := getTaobaoItemInfo(resp.Body)
	ii.URL = nUrl

	tmpl := template.Must(template.New("imgRst").Parse(imgRst))
	var rst bytes.Buffer
	err = tmpl.Execute(&rst, &ii)
	if err != nil {
		panic(err)
	}

	return rst.String()
}
urlnormalize.go | repository | view raw
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
package taobaoitem2rst

import (
	"net/url"
)

// NormalizeURL reduces a Taobao item URL to its canonical form.
//
// When inputUrl parses successfully and its host is exactly
// "item.taobao.com", every query parameter except "id" is dropped and the
// resulting URL is returned. Any other input, whether a URL with a
// different host or a string that cannot be parsed as a URL, is returned
// unchanged.
func NormalizeURL(inputUrl string) string {
	u, err := url.Parse(inputUrl)
	if err != nil {
		// url.Parse returns a nil *URL on failure, so the error must be
		// checked before any field of u is read.
		return inputUrl
	}
	if u.Host != "item.taobao.com" {
		return inputUrl
	}

	// Re-encode the id so that characters such as "&" or "=" inside the
	// decoded value cannot introduce extra query parameters.
	u.RawQuery = url.Values{"id": {u.Query().Get("id")}}.Encode()
	return u.String()
}
iteminfo.go | repository | view raw
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
package taobaoitem2rst

import (
	"golang.org/x/net/html"
	"io"
)

// ItemInfo holds the values collected from an item page while its parsed
// HTML tree is traversed.
type ItemInfo struct {
	// Title is the data of the first child node of the <title> element.
	Title string
	// URL is the href of a <link rel="canonical"> element.
	URL string
	// ImgURL is the src of the <img id="J_ImgBooth"> element or the content
	// of the <meta property="og:image"> element.
	ImgURL string
}

// GetAttribute looks up the attribute named key on node n. It returns the
// value of the first attribute whose key matches together with true, or the
// empty string and false when n has no such attribute.
func GetAttribute(n *html.Node, key string) (string, bool) {
	for i := range n.Attr {
		if a := n.Attr[i]; a.Key == key {
			return a.Val, true
		}
	}
	return "", false
}

// isTitleElement reports whether n is an element node named "title".
func isTitleElement(n *html.Node) bool {
	if n.Type != html.ElementNode {
		return false
	}
	return n.Data == "title"
}

// isLinkElement reports whether n is an element node named "link".
func isLinkElement(n *html.Node) bool {
	if n.Type != html.ElementNode {
		return false
	}
	return n.Data == "link"
}

// isImgElement reports whether n is an element node named "img".
func isImgElement(n *html.Node) bool {
	if n.Type != html.ElementNode {
		return false
	}
	return n.Data == "img"
}

// isMetaElement reports whether n is an element node named "meta".
func isMetaElement(n *html.Node) bool {
	if n.Type != html.ElementNode {
		return false
	}
	return n.Data == "meta"
}

// traverse walks the HTML tree rooted at n depth-first and records what it
// finds in ii:
//
//   - the text of a <title> element goes to ii.Title;
//   - the href of <link rel="canonical"> goes to ii.URL;
//   - the src of <img id="J_ImgBooth"> (item.taobao.com pages) or the
//     content of <meta property="og:image"> (world.taobao.com pages) goes
//     to ii.ImgURL.
//
// When several nodes match the same rule, the last one visited wins.
func traverse(n *html.Node, ii *ItemInfo) {
	// A node has a single tag name, so at most one case applies.
	switch {
	case isTitleElement(n):
		// An empty <title></title> has no child node; guard against it
		// instead of dereferencing a nil FirstChild.
		if text := n.FirstChild; text != nil && text.Type == html.TextNode {
			ii.Title = text.Data
		}
	case isLinkElement(n):
		if rel, ok := GetAttribute(n, "rel"); ok && rel == "canonical" {
			// A missing href simply yields the empty string.
			ii.URL, _ = GetAttribute(n, "href")
		}
	case isImgElement(n):
		// item.taobao.com
		if id, ok := GetAttribute(n, "id"); ok && id == "J_ImgBooth" {
			// A missing src simply yields the empty string.
			ii.ImgURL, _ = GetAttribute(n, "src")
		}
	case isMetaElement(n):
		// world.taobao.com
		if property, ok := GetAttribute(n, "property"); ok && property == "og:image" {
			// A missing content simply yields the empty string.
			ii.ImgURL, _ = GetAttribute(n, "content")
		}
	}

	for c := n.FirstChild; c != nil; c = c.NextSibling {
		traverse(c, ii)
	}
}

// getTaobaoItemInfo parses the HTML document read from r and returns the
// item information collected by traverse.
//
// It panics if the document cannot be parsed; the panic message includes
// the underlying error so the cause (for example a read failure on r) is
// not lost.
func getTaobaoItemInfo(r io.Reader) ItemInfo {
	var ii ItemInfo

	doc, err := html.Parse(r)
	if err != nil {
		panic("Fail to parse html: " + err.Error())
	}
	traverse(doc, &ii)

	return ii
}

Tested on: Ubuntu Linux 16.04, Google App Engine SDK for Go 1.9.37.


References:

[1]Google App Engine Go - HTML Link to reStructuredText
[2][Golang] Remove Query String From URL
[3][Golang] Hacker News Link to reStructuredText
[4][Golang] getElementById via net/html Package
[5] go http no redirect - Google search
    Query URL without redirect in Go - Stack Overflow