This repository has been archived by the owner on Oct 19, 2020. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 42
/
Copy pathutil.go
274 lines (231 loc) · 6.6 KB
/
util.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
package zhihu
import (
"fmt"
"io/ioutil"
"net/http"
"os"
"os/exec"
"path/filepath"
"regexp"
"runtime"
"strconv"
"strings"
"github.com/PuerkitoBio/goquery"
"github.com/fatih/color"
)
const (
userAgent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36"
baseZhihuURL = "https://www.zhihu.com"
pageSize = 20
)
var (
reQuestionURL = regexp.MustCompile("^(http|https)://www.zhihu.com/question/[0-9]{8}$")
reCollectionURL = regexp.MustCompile("^(http|https)://www.zhihu.com/collection/[0-9]{8,9}$") // bugfix: for private collection
reTopicURL = regexp.MustCompile("^(http|https)://www.zhihu.com/topic/[0-9]{8}$")
reGetNumber = regexp.MustCompile(`([0-9])+`)
reAvatarReplacer = regexp.MustCompile(`_(s|xs|m|l|xl|hd).(png|jpg)`)
reIsEmail = regexp.MustCompile(`^[a-z0-9._%+\-]+@[a-z0-9.\-]+\.[a-z]{2,4}$`)
logger = Logger{Enabled: true}
)
func validQuestionURL(value string) bool {
return reQuestionURL.MatchString(value)
}
func validCollectionURL(value string) bool {
return reCollectionURL.MatchString(value)
}
func validTopicURL(value string) bool {
return reTopicURL.MatchString(value)
}
func reMatchInt(raw string) int {
matched := reGetNumber.FindStringSubmatch(raw)
if len(matched) == 0 {
return 0
}
rv, _ := strconv.Atoi(matched[0])
return rv
}
func validateAvatarSize(size string) bool {
for _, x := range []string{"s", "xs", "m", "l", "xl", "hd"} {
if size == x {
return true
}
}
return false
}
func replaceAvatarSize(origin string, size string) string {
return reAvatarReplacer.ReplaceAllString(origin, fmt.Sprintf("_%s.$2", size))
}
func isEmail(value string) bool {
return reIsEmail.MatchString(value)
}
func newHTTPHeaders(isXhr bool) http.Header {
headers := make(http.Header)
headers.Set("Accept", "*/*")
headers.Set("Connection", "keep-alive")
headers.Set("Host", "www.zhihu.com")
headers.Set("Origin", "http://www.zhihu.com")
headers.Set("Pragma", "no-cache")
headers.Set("User-Agent", userAgent)
if isXhr {
headers.Set("X-Requested-With", "XMLHttpRequest")
}
return headers
}
func strip(s string) string {
return strings.TrimSpace(s)
}
func minInt(a, b int) int {
if a > b {
return b
}
return a
}
func getCwd() string {
cwd, err := os.Getwd()
if err != nil {
panic("获取 CWD 失败:" + err.Error())
}
return cwd
}
func save(filename string, content []byte) error {
return ioutil.WriteFile(filename, content, 0666)
}
func saveString(filename string, content string) error {
return ioutil.WriteFile(filename, []byte(content), 0666)
}
func openCaptchaFile(filename string) error {
logger.Info("调用外部程序渲染验证码……")
var args []string
switch runtime.GOOS {
case "linux":
args = []string{"xdg-open", filename}
case "darwin":
args = []string{"open", filename}
case "freebsd":
args = []string{"open", filename}
case "netbsd":
args = []string{"open", filename}
case "windows":
var (
cmd = "url.dll,FileProtocolHandler"
runDll32 = filepath.Join(os.Getenv("SYSTEMROOT"), "System32", "rundll32.exe")
)
args = []string{runDll32, cmd, filename}
default:
fmt.Printf("无法确定操作系统,请自行打开验证码 %s 文件,并输入验证码。", filename)
}
logger.Info("Command: %s", strings.Join(args, " "))
err := exec.Command(args[0], args[1:]...).Run()
if err != nil {
return err
}
return nil
}
func readCaptchaInput() string {
var captcha string
fmt.Print(color.CyanString("请输入验证码:"))
fmt.Scanf("%s", &captcha)
return captcha
}
func makeZhihuLink(path string) string {
return urlJoin(baseZhihuURL, path)
}
func urlJoin(base, path string) string {
if strings.HasSuffix(base, "/") {
base = strings.TrimRight(base, "/")
}
if strings.HasPrefix(path, "/") {
path = strings.TrimLeft(path, "/")
}
return base + "/" + path
}
// newDocumentFromUrl 会请求给定的 url,并返回一个 goquery.Document 对象用于解析
func newDocumentFromURL(url string) (*goquery.Document, error) {
resp, err := gSession.Get(url)
if err != nil {
logger.Error("请求 %s 失败:%s", url, err.Error())
return nil, err
}
doc, err := goquery.NewDocumentFromResponse(resp)
if err != nil {
logger.Error("解析页面失败:%s", err.Error())
}
return doc, err
}
// ZhihuPage 是一个知乎页面,User, Question, Answer, Collection 的公共部分
type Page struct {
// Link 是该页面的链接
Link string
// doc 是 HTML document
doc *goquery.Document
// fields 是字段缓存,避免重复解析页面
fields map[string]interface{}
}
// newZhihuPage 是 private 的构造器
func newZhihuPage(link string) *Page {
return &Page{
Link: link,
fields: make(map[string]interface{}),
}
}
// Doc 用于获取当前问题页面的 HTML document,惰性求值
func (page *Page) Doc() *goquery.Document {
if page.doc != nil {
return page.doc
}
err := page.Refresh()
if err != nil {
return nil
}
return page.doc
}
// Refresh 会重新载入当前页面,获取最新的数据
func (page *Page) Refresh() (err error) {
page.fields = make(map[string]interface{}) // 清空缓存
page.doc, err = newDocumentFromURL(page.Link) // 重载页面
return err
}
// GetXsrf 从当前页面内容抓取 xsrf 的值
func (page *Page) GetXSRF() string {
doc := page.Doc()
value, _ := doc.Find(`input[name="_xsrf"]`).Attr("value")
return value
}
// totalPages 获取总页数
func (page *Page) totalPages() int {
return getTotalPages(page.Doc())
}
func (page *Page) setField(field string, value interface{}) {
page.fields[field] = value
}
func (page *Page) getIntField(field string) (value int, exists bool) {
if got, ok := page.fields[field]; ok {
return got.(int), true
}
return 0, false
}
func (page *Page) getStringField(field string) (value string, exists bool) {
if got, ok := page.fields[field]; ok {
return got.(string), true
}
return "", false
}
func getTotalPages(doc *goquery.Document) int {
pager := doc.Find("div.zm-invite-pager")
if pager.Size() == 0 {
return 1
}
text := pager.Find("span").Eq(-2).Text()
pages, _ := strconv.Atoi(text)
return pages
}
// nodeListResult 是形如 /node/XXListV2 这样的 Ajax 请求的 JSON 返回值
type nodeListResult struct {
R int `json:"r"` // 状态码,正确的情况为 0
Msg []string `json:"msg"` // 回答内容,每个元素都是一段 HTML 片段
}
// normalAjaxResult 是页面内,目标 URL 和当前页面 URL 相同的 Ajax 请求返回的 JSON 数据
type normalAjaxResult struct {
R int `json:"r"`
Msg []interface{} `json:"msg"` // 两个元素,第一个为话题数量,第二个是 HTML 片段
}