-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathcnblogs_news.py
58 lines (46 loc) · 1.43 KB
/
cnblogs_news.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
# -*- coding: utf-8 -*-
# @Time : 2021/11/12 20:44
# @Author : zhy
# @File : cnblogs_news.py
# @Software: PyCharm
import requests
import re
import parsel
from cnblogs_db import DataManager
# Request headers shared by the crawler; the User-Agent string presents the
# client as a desktop Chrome 95 browser on Windows 10.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/95.0.4638.69 Safari/537.36'
    ),
}
def get_page(url, timeout=10):
    """Download *url* using the module-level ``headers`` and return its text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so a single stalled
            connection would hang the whole crawl.

    Returns:
        The response body decoded to ``str`` (``response.text``). The HTTP
        status code is not checked, so error pages are returned as-is.

    Raises:
        requests.exceptions.RequestException: If the connection fails or
            the timeout expires.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    return response.text
def parse_page(html):
    """Extract the title and body markup from an article page.

    Args:
        html: Full HTML source of the page.

    Returns:
        A ``(title, content)`` tuple. ``title`` is the text of the first
        node matching ``//h1/a/span`` and ``content`` is the markup captured
        after ``<div class="postBody">``. Either element is ``None`` when
        the page does not contain it.
    """
    s = parsel.Selector(html)
    title_ans = s.xpath('//h1/a/span/text()').get()
    # NOTE: the lazy match ends at the first ``</div>``, so a post body that
    # contains a nested <div> is cut off at that inner closing tag.
    match = re.search(r'<div class="postBody">(.*?)</div>', html, re.S)
    # A page without the postBody div used to fail with AttributeError on
    # ``None.group``; report a missing body the same way as a missing title.
    content_ans = match.group(1) if match else None
    return title_ans, content_ans
def cnblogs_news_spider():
    """Crawl the links listed in the cnblogs sidebar fragment and store them.

    Downloads the ``aggsite/SideRight`` fragment and collects the ``href`` of
    every link under ``//div[1]/ul``. It then calls ``clear_table()`` on a
    ``DataManager('dbase')`` and, for each link, fetches the page, parses
    it with ``parse_page`` and hands a ``{'title', 'content'}`` dict to
    ``trans_to_news_table``. ``close_db()`` is always called at the end.

    Any exception raised while fetching, parsing or storing propagates to
    the caller after the database handle has been closed.
    """
    topviews_url = 'https://www.cnblogs.com/aggsite/SideRight'
    # Fetch the listing through get_page so this request carries the same
    # headers as the article requests instead of the library defaults.
    listing = parsel.Selector(get_page(topviews_url))
    url_lis = listing.xpath('//div[1]/ul//a/@href').getall()

    db_manager = DataManager('dbase')
    try:
        db_manager.clear_table()
        for url in url_lis:
            print(url)
            title, content = parse_page(get_page(url))
            db_manager.trans_to_news_table({'title': title, 'content': content})
    finally:
        # Call close_db() even when a download or parse fails part-way.
        db_manager.close_db()
# Start the crawl only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    cnblogs_news_spider()