Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

refactor: Access class constants through self (instance) to allow overriding #70

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -65,3 +65,5 @@ target/
/.project
/.pydevproject

.idea
venv
85 changes: 46 additions & 39 deletions googlesearch/googlesearch.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,41 @@
from pkg_resources import resource_filename
from contextlib import closing


class SearchResult:
    """A single search hit: a title/URL pair with lazily fetched page content."""

    def __init__(self, title, url):
        self.title = title
        self.url = url
        # Caches for the downloaded page; filled in on first access only.
        self.__text = None
        self.__markup = None

    def get_text(self):
        """Return the visible text of the result page, fetching it on first call."""
        if self.__text is not None:
            return self.__text
        parsed = BeautifulSoup(self.get_markup(), "lxml")
        # Strip non-content elements before extracting the readable text.
        for tag in parsed(['style', 'script', 'head', 'title', 'meta']):
            tag.extract()
        self.__text = parsed.get_text()
        return self.__text

    def get_markup(self):
        """Return the raw HTML of the result page, downloading it on first call."""
        if self.__markup is not None:
            return self.__markup
        url_opener = urllib.build_opener()
        # Use the search client's browser-like headers for the page fetch as well.
        url_opener.addheaders = GoogleSearch.DEFAULT_HEADERS
        self.__markup = url_opener.open(self.url).read()
        return self.__markup

    def __str__(self):
        return str(self.__dict__)

    def __unicode__(self):
        return str(self.__str__())

    def __repr__(self):
        return self.__str__()


class GoogleSearch:
with open(resource_filename('googlesearch', 'browser_agents.txt'), 'r') as file_handle:
USER_AGENTS = file_handle.read().splitlines()
Expand All @@ -29,6 +64,7 @@ class GoogleSearch:
('User-Agent', choice(USER_AGENTS)),
("Accept-Language", "en-US,en;q=0.5"),
]
SEARCH_RESULT_MODEL = SearchResult

def search(self,
query,
Expand All @@ -45,35 +81,35 @@ def search(self,
Time between thread executions in second to void IP block.
'''
search_results = []
pages = int(math.ceil(num_results / float(GoogleSearch.RESULTS_PER_PAGE)))
pages = int(math.ceil(num_results / float(self.RESULTS_PER_PAGE)))
total = None
thread_pool = None
if prefetch_pages:
thread_pool = ThreadPool(num_prefetch_threads)
for i in range(pages) :
start = i * GoogleSearch.RESULTS_PER_PAGE
start = i * self.RESULTS_PER_PAGE
opener = urllib.build_opener()
opener.addheaders = GoogleSearch.DEFAULT_HEADERS
with closing(opener.open(GoogleSearch.SEARCH_URL +
opener.addheaders = self.DEFAULT_HEADERS
with closing(opener.open(self.SEARCH_URL +
"?hl=en&q="+ urllib.quote(query) +
("" if start == 0 else
("&start=" + str(start))))) as response:
soup = BeautifulSoup(response.read(), "lxml")
if total is None:
if sys.version_info[0] > 2:
totalText = soup.select(GoogleSearch.TOTAL_SELECTOR)[0].children.__next__()
totalText = soup.select(self.TOTAL_SELECTOR)[0].children.__next__()
else:
totalText = soup.select(GoogleSearch.TOTAL_SELECTOR)[0].children.next()
totalText = soup.select(self.TOTAL_SELECTOR)[0].children.next()
total = int(re.sub("[', ]", "",
re.search("(([0-9]+[', ])*[0-9]+)",
totalText).group(1)))
selector = GoogleSearch.RESULT_SELECTOR_PAGE1 if i == 0 else GoogleSearch.RESULT_SELECTOR
selector = self.RESULT_SELECTOR_PAGE1 if i == 0 else self.RESULT_SELECTOR
self.results = self.parse_results(soup.select(selector), i)
# if len(search_results) + len(self.results) > num_results:
# del self.results[num_results - len(search_results):]
search_results += self.results
if prefetch_pages:
thread_pool.map_async(SearchResult.get_text, self.results)
thread_pool.map_async(self.SEARCH_RESULT_MODEL.get_text, self.results)
if prefetch_pages:
thread_pool.close()
thread_pool.join()
Expand All @@ -91,44 +127,15 @@ def parse_results(self, results, page):
continue
url = h3.parent["href"]
title = h3.text
search_results.append(SearchResult(title, url))
search_results.append(self.SEARCH_RESULT_MODEL(title, url))
return search_results


class SearchResponse:
    """Container for one search: the parsed results plus the reported total hit count."""

    def __init__(self, results, total):
        # results: list of search-result objects parsed from the result pages.
        self.results = results
        # total: total number of matches the search engine reported for the query.
        self.total = total

    def __repr__(self):
        # Added for debuggability; the default object repr hid both attributes.
        return "%s(results=%r, total=%r)" % (type(self).__name__, self.results, self.total)

class SearchResult:
    """A single search hit: a title/URL pair with lazily fetched page content."""
    def __init__(self, title, url):
        self.title = title
        self.url = url
        # Caches for the downloaded page; filled in on first access only.
        self.__text = None
        self.__markup = None

    def get_text(self):
        """Return the visible text of the result page, fetching it on first call."""
        if self.__text is None:
            soup = BeautifulSoup(self.get_markup(), "lxml")
            # Strip non-content elements before extracting the readable text.
            for junk in soup(['style', 'script', 'head', 'title', 'meta']):
                junk.extract()
            self.__text = soup.get_text()
        return self.__text

    def get_markup(self):
        """Return the raw HTML of the result page, downloading it on first call."""
        if self.__markup is None:
            opener = urllib.build_opener()
            # Use the search client's browser-like headers for the page fetch as well.
            opener.addheaders = GoogleSearch.DEFAULT_HEADERS
            response = opener.open(self.url)
            self.__markup = response.read()
        return self.__markup

    def __str__(self):
        return str(self.__dict__)
    def __unicode__(self):
        return str(self.__str__())
    def __repr__(self):
        return self.__str__()


# Main entry for test and external script use.
if __name__ == "__main__":
Expand Down
13 changes: 9 additions & 4 deletions tests/test_googlesearch.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,21 +6,26 @@
import unittest
from googlesearch.googlesearch import GoogleSearch


class TestGoolgeSearch(unittest.TestCase):

def test_search(self):
num_results = 15
min_results = 11
min_results = 10
max_results = 20
response = GoogleSearch().search("unittest", num_results = num_results)
response = GoogleSearch().search("unittest", num_results=num_results)
self.assertTrue(response.total > 1000, "repsonse.total is way too low")
self.assertTrue(len(response.results) >= min_results, "number of results is " + str(len(response.results)) + ", expected at least " + str(min_results))
self.assertTrue(len(response.results) <= max_results, "number of results is " + str(len(response.results)) + ", expected at most " + str(max_results))
self.assertTrue(len(response.results) >= min_results,
"number of results is " + str(len(response.results)) + ", expected at least " + str(
min_results))
self.assertTrue(len(response.results) <= max_results,
"number of results is " + str(len(response.results)) + ", expected at most " + str(max_results))
for result in response.results:
self.assertTrue(result.url is not None, "result.url is None")
self.assertTrue(result.url.startswith("http"), "result.url is invalid: " + result.url)
for result in response.results:
self.assertTrue(result.get_text() is not None, "result.text is None")


if __name__ == '__main__':
unittest.main()