-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathnewsScraper.py
28 lines (22 loc) · 1.12 KB
/
newsScraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
import scrapy
class BlogSpider(scrapy.Spider):
    """Collect headline text from a fixed list of news front pages.

    Each response for a URL in ``start_urls`` is run through every selector
    pair in ``_HEADLINE_SELECTORS``. The pairs are not restricted to the site
    they were written for, so a pair simply produces nothing on pages where
    its container selector does not match.
    """

    name = 'blogspider'
    start_urls = [
        'https://blog.scrapinghub.com',
        'https://www.vox.com/',
        'https://fivethirtyeight.com/',
        'https://slate.com/',
        'https://www1.nyc.gov/',
        'https://www.post-gazette.com/',
        'https://www.nytimes.com/',
    ]

    # (container CSS selector, CSS selector for the title text inside it).
    # Order matters: items are yielded pair by pair, in this order.
    # NOTE(review): no pair is labelled for blog.scrapinghub.com or vox.com,
    # although both are in start_urls -- confirm whether that is intended.
    _HEADLINE_SELECTORS = (
        ('div.hentry', 'a ::text'),                         # fivethirtyeight
        ('div.story-teaser__teaser', 'h3 ::text'),          # slate.com
        ('h2.hero-title', 'a ::text'),                      # nyc govt
        ('div.pgevoke-textpack-item-text', 'span ::text'),  # post-gazette
        ('div.css-18y7hud.esl82me2', 'h2 ::text'),          # new york times
    )

    def parse(self, response):
        """Yield one ``{'title': <text>}`` item per matched headline container.

        Args:
            response: The downloaded page; only its ``css()`` method is used.

        Yields:
            dict: ``{'title': text}`` where ``text`` is the first text node
            found by the pair's title selector inside a matched container.
            Containers with no such text node are skipped.
        """
        for container_css, title_css in self._HEADLINE_SELECTORS:
            for container in response.css(container_css):
                title = container.css(title_css).extract_first()
                # extract_first() returns None when nothing matched; skip the
                # container instead of emitting an item with an empty title.
                if title is None:
                    continue
                yield {'title': title}