-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscraper.py
57 lines (45 loc) · 1.71 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
import datetime, tweepy
from tweepy.auth import AppAuthHandler
"""
The Scraper class runs within a new thread each time a job is executed.
It is the component that actually makes the requests to the Twitter API.
"""
class Scraper:
    """Count new tweets mentioning a job's URL and persist the running total.

    Collaborators (all supplied by the caller):

    * ``job``    -- object exposing ``url``, ``maxTweetID`` and
      ``commit(job_db)``.
    * ``budget`` -- maximum number of result pages to request per run; it is
      passed straight to ``Cursor.pages``.
    * ``db``     -- object exposing ``CountDB``, ``JobDB``, ``Whitelist`` and
      ``Users`` stores. ``CountDB`` needs ``get``/``set``; ``Whitelist`` and
      ``Users`` need ``get``.
    """

    # Class-level fallback so the attribute always exists.
    currentCount = 0

    def __init__(self, job, budget, db):
        """Remember the collaborators and load the stored count for the job.

        A missing (falsy) stored count is treated as 0.
        """
        self.job = job
        self.budget = budget
        self.db = db
        self.currentCount = self.db.CountDB.get(self.job.url) or 0

    def updateCount(self):
        """Persist the job state and the current tweet count."""
        self.job.commit(self.db.JobDB)
        self.db.CountDB.set(self.job.url, self.currentCount)

    @staticmethod
    def _tallyPages(pages):
        """Return ``(tweet_count, highest_tweet_id)`` for an iterable of pages.

        Each page is a sequence of objects with an ``id`` attribute.
        ``highest_tweet_id`` is ``None`` when no tweet was seen at all.
        """
        total = 0
        newest_id = None
        for page in pages:
            total += len(page)
            # Take the maximum over every page instead of trusting the
            # position of a tweet inside one particular page; this also
            # copes with empty pages.
            for tweet in page:
                if newest_id is None or tweet.id > newest_id:
                    newest_id = tweet.id
        return total, newest_id

    def run(self):
        """Search for the job's URL and add the number of new tweets found.

        Looks up the API credentials of the URL owner, authenticates,
        pages through the search results newer than ``job.maxTweetID``
        (at most ``budget`` pages), then stores the updated count and the
        highest tweet id seen. Returns ``None`` in every case; problems
        with credentials or authentication are reported with ``print``.
        """
        # Lookup the keys of the URL owner
        ownerKey = self.db.Whitelist.get(self.job.url)
        keys = self.db.Users.get(ownerKey)
        if not keys:
            print("Can't run %s as it has no onwer!" % self.job.url)
            return

        # Credentials are stored as bytes in the form "<key>|<secret>".
        keys = keys.decode('utf-8').split('|')
        if len(keys) < 2:
            # Malformed credential record: there is no secret part.
            print("Error running for URL %s" % self.job.url)
            return

        try:
            self.auth = AppAuthHandler(keys[0].strip(), keys[1].strip())
            self.api = tweepy.API(self.auth)
        except tweepy.error.TweepError:
            print("Error running for URL %s" % self.job.url)
            return

        if not self.api:
            print("Error running job, could not authenticate")
            return

        # Search for some tweets
        search_results = tweepy.Cursor(
            self.api.search,
            q=self.job.url,
            count=100,
            since_id=self.job.maxTweetID,
        ).pages(self.budget)

        addCount, newestID = self._tallyPages(search_results)

        # Add the total exactly once, after all pages have been consumed.
        self.currentCount = int(self.currentCount) + addCount
        if newestID is not None:
            # The next run will only ask for tweets newer than this id.
            self.job.maxTweetID = newestID

        self.updateCount()
        print("%s finished running!" % self.job.url)