An online service running on Google App Engine (Go) that extracts the title and
image URL from a Taobao item webpage and outputs them in reStructuredText format.
Online Taobao Item to reStructuredText
Source code:
Makefile |
repository |
view raw
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22 | export PATH := $(PATH):$(realpath ../../../../go_appengine/)
# Directory of this project (make's current working directory); passed to
# "goapp serve" and "appcfg.py update" below.
PROJECT_DIR=$(CURDIR)
# Application ID passed to appcfg.py via -A and used in the deployed URL.
PROJECT_ID=golden-operator-130720
# Version name passed to appcfg.py via -V and used in the deployed URL.
PROJECT_VERSION=taobao-item2rst
# Run the local development web server for this project.
# - .PHONY keeps the target runnable even if a file named "default" exists.
# - printf is used for the colored banner because whether echo interprets
#   "\033" depends on the shell.
# - "cd .. &&" prevents goapp from starting when the directory change fails.
.PHONY: default
default:
	@printf '\033[92m%s\033[0m\n' "Run development web server ..."
	@cd .. && goapp serve ${PROJECT_DIR}
# Format the Go source files in this directory with "goapp fmt".
# - .PHONY keeps the target runnable even if a file named "fmt" exists.
# - printf is used for the colored banner because whether echo interprets
#   "\033" depends on the shell.
.PHONY: fmt
fmt:
	@printf '\033[92m%s\033[0m\n' "Go fmt source code ..."
	@goapp fmt *.go
deploy:
cd ../; appcfg.py -A ${PROJECT_ID} -V ${PROJECT_VERSION} update ${PROJECT_DIR}
@echo "\033[92mDeployed URL: http://${PROJECT_VERSION}.${PROJECT_ID}.appspot.com/\033[0m"
# Fetch (or update) the Go packages the application imports.
# - .PHONY keeps the target runnable even if a file named "install" exists.
# - printf is used for the colored banners because whether echo interprets
#   "\033" depends on the shell.
.PHONY: install
install:
	@printf '\033[92m%s\033[0m\n' "Install golang.org/x/net/html ..."
	@goapp get -u golang.org/x/net/html
	@printf '\033[92m%s\033[0m\n' "Install google.golang.org/appengine ..."
	@goapp get -u google.golang.org/appengine
|
app.yaml |
repository |
view raw
| runtime: go
# API version setting that accompanies the "go" runtime declared above.
api_version: go1
# Request routing: every URL path (pattern /.*) is sent to the script _go_app.
handlers:
- url: /.*
script: _go_app
|
taobaoitem2rst.go |
repository |
view raw
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62 | package taobaoitem2rst
import (
"html/template"
"net/http"
)
// TemplateValue holds the data handed to the index page template.
type TemplateValue struct {
// Textarea is the text rendered inside the page's <textarea> element.
Textarea string
}
// index is the HTML template source of the single page served by this app.
// It contains a form that POSTs a "url" field to "/", a textarea filled from
// the template's .Textarea value, a button whose script copies the textarea
// to the clipboard, and links to two related conversion services.
var index = `<!doctype html>
<html>
<head>
<title>Taobao Item to Rst</title>
</head>
<body>
<form action="/" method="post">
URL: <input name="url" size="80">
<button>Send</button>
</form><br>
<textarea id="ta" rows="5" cols="80">{{.Textarea}}</textarea><br>
<button type="button" id="copy">Copy textarea to clipboard</button>
<br><br>
<a target="_blank" href="http://html2rst.golden-operator-130720.appspot.com/">HTML to reStructuredText</a>
<br><br>
<a target="_blank" href="http://v1.golden-operator-130720.appspot.com/">URL to reStructuredText</a>
<script>
var textareaElm = document.getElementById("ta");
var copyElm = document.getElementById("copy");
copyElm.onclick = function(event) {
textareaElm.select();
var isSuccessful = document.execCommand('copy');
if (isSuccessful) {
textareaElm.value = "Copy OK";
} else {
textareaElm.value = "Copy Fail";
}
}
</script>
</body>
</html>`
// tmpl is the index page template, parsed once when the package is
// initialized. template.Must panics if the index source fails to parse.
var tmpl = template.Must(template.New("taobaoitem2rst").Parse(index))
// init wires the page handler to the root pattern "/" on the default
// serve mux, so it receives every request path.
func init() {
	http.Handle("/", http.HandlerFunc(handler))
}
// handler serves the index page.
//
// For a POST request carrying a non-empty "url" form value, the URL is
// converted with getTaobaoItemImgRst and the result is shown in the page's
// textarea. Any other request (including a POST with an empty "url") renders
// the page with an empty textarea, so no fetch is attempted for an empty URL.
//
// If the page template fails to execute, the client receives an HTTP 500
// response carrying the error text instead of the handler panicking.
func handler(w http.ResponseWriter, r *http.Request) {
	val := TemplateValue{}
	if r.Method == "POST" {
		// Skip the fetch for an empty submission: an empty URL cannot be
		// fetched and would only make the conversion panic.
		if itemURL := r.PostFormValue("url"); itemURL != "" {
			val.Textarea = getTaobaoItemImgRst(itemURL, r)
		}
	}
	if err := tmpl.Execute(w, &val); err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
	}
}
|
fetch.go |
repository |
view raw
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38 | package taobaoitem2rst
import (
"bytes"
"html/template"
"net/http"
"google.golang.org/appengine"
"google.golang.org/appengine/urlfetch"
)
// imgRst is the template source for the generated reStructuredText image
// directive. It is filled from the ImgURL, Title and URL fields of the value
// passed to the template.
var imgRst = `.. image:: {{ .ImgURL }}
:alt: {{ .Title }}
:target: {{ .URL }}
:align: center`
// getTaobaoItemImgRst fetches the item page at url and returns a
// reStructuredText image directive built from the imgRst template.
//
// url is first passed through NormalizeURL; the normalized URL is both the
// address that is fetched and the directive's :target: (it replaces any URL
// found in the page itself). r is the incoming request, needed to create the
// App Engine context for the urlfetch client.
//
// The function panics if the page cannot be fetched or the template cannot be
// executed.
func getTaobaoItemImgRst(url string, r *http.Request) string {
	nUrl := NormalizeURL(url)

	ctx := appengine.NewContext(r)
	client := urlfetch.Client(ctx)
	resp, err := client.Get(nUrl)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	ii := getTaobaoItemInfo(resp.Body)
	ii.URL = nUrl

	// The output is plain reStructuredText, not HTML, but imgRst is rendered
	// with html/template, which HTML-escapes ordinary string values (for
	// example "&" becomes "&amp;"). The caller's page template escapes the
	// text again when placing it in the textarea, so escaping here would
	// corrupt titles and URLs. Wrapping the values in template.HTML makes
	// html/template emit them verbatim.
	// NOTE(review): text/template would be the natural package for this
	// output; switching requires changing this file's imports.
	data := struct {
		Title  template.HTML
		URL    template.HTML
		ImgURL template.HTML
	}{
		Title:  template.HTML(ii.Title),
		URL:    template.HTML(ii.URL),
		ImgURL: template.HTML(ii.ImgURL),
	}

	// Named rstTmpl so it does not shadow the package-level tmpl.
	rstTmpl := template.Must(template.New("imgRst").Parse(imgRst))
	var rst bytes.Buffer
	if err := rstTmpl.Execute(&rst, &data); err != nil {
		panic(err)
	}
	return rst.String()
}
|
urlnormalize.go |
repository |
view raw
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19 | package taobaoitem2rst
import (
"net/url"
)
// NormalizeURL reduces an item.taobao.com URL to its canonical form by
// dropping every query parameter except "id".
//
// URLs whose host is not exactly "item.taobao.com" are returned unchanged.
// Input that cannot be parsed as a URL is also returned unchanged rather than
// causing a nil-pointer panic; the subsequent fetch of that string reports
// the problem.
//
// The id value is re-encoded with url.Values.Encode so that characters such
// as "&" or "=" inside the id cannot introduce extra query parameters.
func NormalizeURL(inputUrl string) string {
	u, err := url.Parse(inputUrl)
	// Check err before touching u: url.Parse returns a nil *URL on failure.
	if err != nil || u.Host != "item.taobao.com" {
		return inputUrl
	}

	u.RawQuery = url.Values{"id": {u.Query().Get("id")}}.Encode()
	return u.String()
}
|
iteminfo.go |
repository |
view raw
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79 | package taobaoitem2rst
import (
"golang.org/x/net/html"
"io"
)
// ItemInfo collects the values extracted from an item page by traverse.
type ItemInfo struct {
// Title is the text of the page's <title> element.
Title string
// URL is the href of the page's <link rel="canonical"> element.
URL string
// ImgURL is the src of the <img id="J_ImgBooth"> element or the content of
// the <meta property="og:image"> element, whichever is seen last.
ImgURL string
}
// GetAttribute looks up the attribute named key on node n.
// It returns the value of the first matching attribute and true, or the
// empty string and false when n has no attribute with that key.
func GetAttribute(n *html.Node, key string) (string, bool) {
	for i := range n.Attr {
		if candidate := n.Attr[i]; candidate.Key == key {
			return candidate.Val, true
		}
	}
	return "", false
}
// isTitleElement reports whether n is an element node named "title".
func isTitleElement(n *html.Node) bool {
	if n.Type != html.ElementNode {
		return false
	}
	return n.Data == "title"
}
// isLinkElement reports whether n is an element node named "link".
func isLinkElement(n *html.Node) bool {
	if n.Type != html.ElementNode {
		return false
	}
	return n.Data == "link"
}
// isImgElement reports whether n is an element node named "img".
func isImgElement(n *html.Node) bool {
	if n.Type != html.ElementNode {
		return false
	}
	return n.Data == "img"
}
// isMetaElement reports whether n is an element node named "meta".
func isMetaElement(n *html.Node) bool {
	if n.Type != html.ElementNode {
		return false
	}
	return n.Data == "meta"
}
// traverse walks the node tree rooted at n depth-first and records the item
// details it finds in ii:
//
//   - Title:  text of a <title> element
//   - URL:    href of <link rel="canonical">
//   - ImgURL: src of <img id="J_ImgBooth"> (item.taobao.com pages) or
//             content of <meta property="og:image"> (world.taobao.com pages)
//
// When several matching elements exist, the one visited last wins.
func traverse(n *html.Node, ii *ItemInfo) {
	if isTitleElement(n) {
		// An empty <title></title> has no child node; guard against it
		// instead of dereferencing a nil FirstChild.
		if text := n.FirstChild; text != nil && text.Type == html.TextNode {
			ii.Title = text.Data
		}
	}

	if isLinkElement(n) {
		rel, ok := GetAttribute(n, "rel")
		if ok && rel == "canonical" {
			ii.URL, _ = GetAttribute(n, "href")
		}
	}

	if isImgElement(n) {
		// item.taobao.com
		id, ok := GetAttribute(n, "id")
		if ok && id == "J_ImgBooth" {
			ii.ImgURL, _ = GetAttribute(n, "src")
		}
	}

	if isMetaElement(n) {
		// world.taobao.com
		property, ok := GetAttribute(n, "property")
		if ok && property == "og:image" {
			ii.ImgURL, _ = GetAttribute(n, "content")
		}
	}

	for c := n.FirstChild; c != nil; c = c.NextSibling {
		traverse(c, ii)
	}
}
// getTaobaoItemInfo parses the HTML document read from r and returns the
// item details that traverse finds in it. Fields with no matching element
// in the document are left as empty strings.
//
// It panics if the document cannot be parsed; the panic message includes the
// underlying parse error so the cause is not lost.
func getTaobaoItemInfo(r io.Reader) ItemInfo {
	var ii ItemInfo
	doc, err := html.Parse(r)
	if err != nil {
		panic("Fail to parse html: " + err.Error())
	}
	traverse(doc, &ii)
	return ii
}
|
Tested on: Ubuntu Linux 16.04, Google App Engine SDK for Go 1.9.37.
References: