extractor.py
from multiprocessing import Pool

import newspaper

# Substrings that mark a URL as out of scope (non-English editions, etc.).
exclude = ['.jp', 'espanol', 'mexico', 'arabic']


def has_exclusion(url, exclude=exclude):
    '''
    Takes a URL string and matches it against a list of
    substrings that should not appear in URLs under
    consideration.
    '''
    for ex in exclude:
        if ex in url.lower():
            return True
    return False
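
# For instance, with the default exclude list:
#   has_exclusion('http://cnn.com/espanol/noticias') -> True   ('espanol' matches)
#   has_exclusion('http://cnn.com/politics')         -> False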


def get_parsed(article, verbose=False):
    '''
    Takes a newspaper.Article object, downloads and parses it,
    and returns its fields as a dictionary.
    '''
    if verbose:
        print('Processing {}'.format(article))
    article.download()
    article.parse()
    return {'title': article.title, 'text': article.text, 'url': article.url}
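

# Engine is presumably here as a picklable callable for multiprocessing.Pool.map,
# which cannot pickle lambdas or nested functions; an instance of a module-level
# class (carrying the verbose flag as state) pickles cleanly.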
class Engine(object):
    def __init__(self, verbose=False):
        self.verbose = verbose

    def __call__(self, article):
        return get_parsed(article, self.verbose)


def article_cleaner(corpus, generator=True):
    '''
    Filters a corpus of parsed-article dictionaries, dropping entries that
    are empty, that look like error pages, or whose URL matches the exclude
    list. Returns a generator by default, or a list when generator=False.
    '''
    def _is_empty(elem):
        for value in elem.values():
            if len(value) == 0:
                return True
        return False

    def _is_invalid(elem):
        if elem['title'] == 'Page not found':
            return True
        if 'page is not available' in elem['text'].lower():
            return True
        return False

    def _is_excluded(elem):
        return has_exclusion(elem['url'], exclude)

    def _is_clean(elem):
        return not (_is_invalid(elem) or _is_empty(elem) or _is_excluded(elem))

    if generator:
        return (article for article in corpus if _is_clean(article))
    return [article for article in corpus if _is_clean(article)]


def DefaultClean(elem):
    '''
    Default cleaner for NewsScraper.polished(): returns True when the article
    dictionary is non-empty, not an error page, and not excluded by URL.
    '''
    def _is_empty(elem):
        for value in elem.values():
            if len(value) == 0:
                return True
        return False

    def _is_invalid(elem):
        if elem['title'] == 'Page not found':
            return True
        if 'page is not available' in elem['text'].lower():
            return True
        return False

    def _is_excluded(elem):
        return has_exclusion(elem['url'], exclude)

    return not (_is_invalid(elem) or _is_empty(elem) or _is_excluded(elem))
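
# DefaultClean repeats article_cleaner's inner checks, presumably so it can
# be passed around as a single self-contained predicate (it is the default
# cleaner for NewsScraper.polished below).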


class NewsScraper(object):
    """NewsScraper -- a generic class for pulling from a news site and
    dumping to a usable format.

    >>> news = NewsScraper('http://cnn.com')
    >>> news.pull()
    >>> news.scrape(10)
    >>> articles = news.polished()
    """

    def __init__(self, website='', nthreads=10, memoize_articles=False):
        super(NewsScraper, self).__init__()
        self.website = website
        self.nthreads = nthreads
        self.memoize_articles = memoize_articles
        self.paper = None
        self.last_processed = 0
        self.corpus = []

    def pull(self, website=None):
        '''
        Builds a cached newspaper from the given website.
        By default, it looks at the website passed to the constructor.
        '''
        if website is not None:
            self.website = website
        if not self.website:
            raise ValueError('NewsScraper does not have a website.')
        self.paper = newspaper.build(self.website,
                                     memoize_articles=self.memoize_articles,
                                     number_threads=20,
                                     fetch_images=False,
                                     verbose=False)

    def scrape(self, num_articles=None):
        '''
        Downloads and parses articles in parallel across a process pool.
        Each call resumes where the last one left off; when num_articles
        is None, the entire paper is processed in one go.
        '''
        pool = Pool(self.nthreads)
        engine = Engine(False)
        try:
            if num_articles is not None:
                batch = self.paper.articles[self.last_processed:(num_articles + self.last_processed)]
                self.corpus += pool.map(engine, batch)
                self.last_processed += num_articles
            else:
                self.corpus += pool.map(engine, self.paper.articles)
        finally:
            pool.close()
            pool.join()

    def polished(self, cleaner=DefaultClean):
        '''
        By default, uses DefaultClean, which drops empty articles and URLs
        that match the default exclude list. The cleaner parameter can be
        any function that takes a dictionary with the fields 'text', 'url',
        and 'title' and returns a boolean indicating whether the article
        should be kept. Returns a generator expression to save memory.
        '''
        if cleaner is not None:
            return (article for article in self.corpus if cleaner(article))
        return self.corpus
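

# A minimal usage sketch, following the class docstring above; it assumes
# network access and that newspaper can build a source from this URL. The
# __main__ guard also matters for multiprocessing on spawn-based platforms.
if __name__ == '__main__':
    news = NewsScraper('http://cnn.com', nthreads=4)
    news.pull()      # build the source and discover article URLs
    news.scrape(10)  # download and parse the first ten articles in parallel
    for article in news.polished():
        print(article['title'])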