-
Notifications
You must be signed in to change notification settings - Fork 12
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Daniel Khashabi
committed
Jun 5, 2021
1 parent
4a7258f
commit c6e0f92
Showing
5 changed files
with
491 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,76 @@ | ||
#!/usr/bin/env python3 | ||
|
||
from selenium import webdriver | ||
from selenium.webdriver.common.keys import Keys | ||
from selenium.webdriver.chrome.options import Options | ||
from gcp import connect_to_gcp | ||
from fake_useragent import UserAgent | ||
import psycopg2 | ||
import sys | ||
import time | ||
import random | ||
import threading | ||
import urllib.parse | ||
|
||
# Number of rows each crawler window claims and processes per transaction.
task_batch_size = 10
# Number of browser windows to run in parallel: the first command-line
# argument when given, otherwise 3.
concurrent_sessions = int(sys.argv[1]) if len(sys.argv) >= 2 else 3
# Shared generator of random user-agent strings for the browser windows.
ua = UserAgent()
|
||
class CrawlWindow(threading.Thread):
    """Worker thread that drives one Chrome window to fetch Google result pages.

    Each instance owns its own Selenium driver and its own database
    connection/cursor pair. ``run`` repeatedly claims a batch of rows from
    ``queries`` whose ``html`` is still NULL, stores the HTML of the Google
    search block for each question, and commits once per batch.
    """

    # Number of windows created so far; used to hand out window ids.
    count = 0

    def __init__(self):
        """Start a Chrome window with a random user agent and connect to the DB."""
        threading.Thread.__init__(self)

        CrawlWindow.count += 1
        self.id = CrawlWindow.count

        chrome_options = Options()
        # Chrome's --window-size switch takes "width,height".
        chrome_options.add_argument("--window-size=1024,768")
        # chrome_options.add_argument("--headless")
        chrome_options.add_argument('log-level=3')
        agent = ua.random
        print('Window {0} using user agent {1}'.format(self.id, agent))
        # Apply the same agent that was just logged; ua.random returns a new
        # value on every access.
        chrome_options.add_argument('user-agent={0}'.format(agent))
        self.driver = webdriver.Chrome(options=chrome_options)

        try:
            self.conn, self.cur = connect_to_gcp()
        except Exception:
            # Do not leave an orphaned browser behind when the DB is unreachable.
            self.driver.quit()
            raise
        print('Window {0} successfully connected to DB'.format(self.id))

    def ask_google(self, query):
        """Search Google for *query* and return the outer HTML of the results block."""
        # Search for query
        query = urllib.parse.quote(query)
        self.driver.get('http://www.google.com/search?q=' + query)

        # Get HTML only. The locator strategy is passed as the plain string
        # 'xpath', which find_element accepts without importing By.
        results = self.driver.find_element('xpath', '//div[@id="search"]')
        return results.get_attribute("outerHTML")

    def crawl(self, i, question):
        """Fetch the result HTML for *question* and store it on ``queries`` row *i*."""
        html = self.ask_google(question)
        self.cur.execute('UPDATE queries SET html = %s WHERE id = %s;', [html, i])
        print('Window {2} retrieved HTML for question {0}: {1}'.format(i, question, self.id))

    def do_tasks(self, tasks):
        """Crawl every ``(id, question)`` pair in *tasks*, pausing between requests."""
        for i, question in tasks:
            self.crawl(i, question)
            # Random 2-10 second pause after each request.
            time.sleep(random.randint(2, 10))

    def run(self):
        """Claim and process batches until no unclaimed rows are returned.

        The driver and the database connection are always released when the
        loop ends, whether normally or through an exception. Closing the
        connection discards the uncommitted part of a failed batch and
        releases its row locks.
        """
        try:
            while True:
                self.cur.execute(
                    '''
                    SELECT id, question
                    FROM queries
                    WHERE html IS NULL
                    FOR UPDATE SKIP LOCKED
                    LIMIT %s;
                    ''',
                    [task_batch_size])
                tasks = self.cur.fetchmany(task_batch_size)
                if not tasks:
                    # Nothing left that is not already locked by another
                    # session; stop instead of polling the DB in a tight loop.
                    self.conn.commit()
                    print('Window {0} found no remaining tasks'.format(self.id))
                    break
                self.do_tasks(tasks)
                # "for update skip locked" means that we shouldn't commit until
                # all tasks in a batch are done
                self.conn.commit()
                print('Window {1} finished {0} tasks'.format(len(tasks), self.id))
        finally:
            try:
                self.driver.quit()
            finally:
                self.conn.close()
|
||
# Create and start the crawler windows one after another.
for _session in range(concurrent_sessions):
    window = CrawlWindow()
    window.start()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,229 @@ | ||
#!/usr/bin/env python3 | ||
|
||
import gcp | ||
from bs4 import BeautifulSoup | ||
|
||
# Maximum number of pages selected and processed per transaction.
batch_size = 20
# Extraction version stored with every result; increment it to go through
# pages we are uncertain about again.
version = 12

# Module-wide connection and cursor used by do_batch().
conn, cur = gcp.connect_to_gcp()

print('Connected to DB')
|
||
def handle_featured_snippet(snippet):
    """Pull the short and long answer out of a featured-snippet element.

    Returns a ``(answer_type, short_answer, long_answer)`` tuple:

    * ``'feat_snip'`` - long answer is the text of the heading div's span.
    * ``'rich_list'`` - long answer is ``str()`` of the ``<ol>`` item texts.
    * ``'rich_set'``  - long answer is ``str()`` of the ``<ul>`` item texts.
    * ``'rich_snip'`` - no long answer was found.

    ``short_answer`` is the text of the ``data-tts="answers"`` div, or None.
    Note that the parent of that div is removed from *snippet* as a side
    effect.
    """
    short_answer = None
    answer_div = snippet.find('div', attrs={'data-tts': 'answers'})
    if answer_div:
        short_answer = answer_div.get_text()
        # make it easier to find long answer
        answer_div.parent.decompose()

    heading = snippet.find('div', attrs={'role': 'heading'})
    if heading and heading.span:
        return 'feat_snip', short_answer, heading.span.get_text()

    ordered = snippet.find('ol')
    unordered = snippet.find('ul')
    if ordered and not ordered.has_attr('role'):  # see 6916 and 4143239
        items = [li.get_text() for li in ordered.find_all('li')]
        return 'rich_list', short_answer, str(items)
    if unordered:
        items = [li.get_text() for li in unordered.find_all('li')]
        return 'rich_set', short_answer, str(items)
    return 'rich_snip', short_answer, None
|
||
def get_split(question, delimiter):
    """Return the text after *delimiter* when it occurs exactly once.

    Returns None when *delimiter* is absent, occurs more than once, or is
    the very end of *question* (nothing follows it).
    """
    parts = question.split(delimiter)
    if len(parts) != 2:
        return None
    tail = parts[1]
    return tail if tail else None
|
||
def handle_unit_converter(featured, question):
    """Read the converted amount and unit from a unit-converter widget.

    The amount is the ``value`` of the first ``<input>`` after the '='
    element. The unit is the selected ``<option>`` after that input; when
    none is selected it is guessed from the text following ' how many ' or
    ' equal to ' in *question*.

    Returns ``('unit_conv', short_answer, None)``.
    """
    equals_sign = featured.parent.div(text='=')[0]
    amount_input = equals_sign.find_next('input')
    amount = amount_input.get('value')
    selected = amount_input.find_next('option', {'selected': '1'})

    if selected:
        unit_name = selected.get_text()
    else:  # see 13783 and 19581
        unit_name = get_split(question, ' how many ')
        if unit_name is None:
            unit_name = get_split(question, ' equal to ')

    # sometimes it's just PEBKAC and no units available; see 20802
    if not unit_name:
        return 'unit_conv', amount, None
    return 'unit_conv', '{0} {1}'.format(amount, unit_name), None
|
||
def handle_currency_converter(featured):
    """Read the converted amount and currency from a currency-converter widget.

    Takes the ``value`` of the first ``<input>`` after the widget's first
    ``<select>``, and the text of the selected ``<option>`` following it.

    Returns ``('curr_conv', '<amount> <currency>', None)``.
    """
    first_select = featured.parent.find('select')
    amount_input = first_select.find_next('input')
    amount = amount_input.get('value')
    currency = amount_input.find_next('option', {'selected': '1'}).get_text()
    return 'curr_conv', '{0} {1}'.format(amount, currency), None
|
||
def handle_translation_result(featured):
    """Return the translated text from a translation widget.

    Returns ``('tr_result', text, None)`` where *text* is the content of the
    ``<pre id="tw-target-text">`` element.
    """
    # todo: 8349104 as an example of one with another result
    target = featured.parent.find('pre', {'id': 'tw-target-text'})
    return 'tr_result', target.get_text(), None
|
||
def handle_local_results(featured):
    """Tag a 'local results' page; no answer text is extracted from it."""
    return ('local_rst', None, None)
|
||
def handle_local_time_conversion(featured):
    """Return the converted time shown in a time-conversion widget.

    Returns ``('time_conv', text, None)`` where *text* is the unstripped
    content of the ``div.vk_bk`` element.
    """
    time_div = featured.parent.find('div', {'class': 'vk_bk'})
    return 'time_conv', time_div.get_text(), None
|
||
def handle_local_time(featured):
    """Return the time shown in a local-time widget.

    Returns ``('localtime', text, None)`` where *text* is the stripped
    content of the ``div.vk_bk`` element.
    """
    time_div = featured.parent.find('div', {'class': 'vk_bk'})
    # strip because sometimes there's whitespace at the end due to div spacing
    shown_time = time_div.get_text().strip()
    return 'localtime', shown_time, None
|
||
def handle_weather(featured):
    """Tag a weather-result page; no answer text is extracted from it."""
    return ('weather', None, None)
|
||
def handle_kp_header(header):
    """Extract the short answer from a knowledge-panel header.

    Returns ``('knowledge', text, None)`` where *text* comes from the first
    div inside ``div.gsrt``, or ``(None, None, None)`` when the header has
    no ``div.gsrt``.
    """
    gsrt = header.find('div', {'class': 'gsrt'})
    if not gsrt:
        return None, None, None
    return 'knowledge', gsrt.div.get_text(), None
|
||
def handle_directions(featured):
    """Tag a directions page; no answer text is extracted from it."""
    return ('direction', None, None)
|
||
def handle_description(featured):
    """Tag a description page; no answer text is extracted from it."""
    return ('descript', None, None)
|
||
def handle_overview(doc):
    """Return the text of the document's first link as the short answer.

    Returns ``('overview', text, None)``.
    """
    first_link = doc.a
    return 'overview', first_link.get_text(), None
|
||
def handle_no_snippet(featured):
    """Tag a page that shows no direct answer."""
    # todo: 1119248 and 8349104 as examples of incorrect no_answer extractions
    return ('no_answer', None, None)
|
||
def has_no_other_answer_markers(doc):
    """Return True when *doc* has neither a ``kp-header`` nor an ``answered-question`` div."""
    if doc.find('div', {'class': 'kp-header'}) is not None:
        return False
    return doc.find('div', {'class': 'answered-question'}) is None
|
||
def get_url(snippet):
    """Return the href of the first link inside the snippet's ``div.r``.

    Returns None when *snippet* contains no ``div.r``.
    """
    link_container = snippet.find('div', attrs={'class': 'r'})
    if not link_container:
        return None
    return link_container.a['href']
|
||
def do_batch():
    """Extract answers from one batch of crawled pages and store them.

    Selects up to ``batch_size`` rows of ``queries`` that have HTML but no
    stored answer (or only one from an older ``version``), classifies each
    page by the text of its first ``<h2>``, runs the matching handler and
    upserts the result into ``extractions``. All rows of the batch are
    committed together at the end. A row whose handler raises is reported
    and skipped without writing anything for it.
    """
    cur.execute('''
        SELECT q.id, question, html
        FROM queries AS q
        LEFT JOIN extractions AS e ON q.id = e.id
        WHERE q.html IS NOT NULL
        AND e.answer IS NULL
        AND e.short_answer IS NULL
        AND e.answer_type IS NULL
        AND (e.extract_v < %s OR e.extract_v IS NULL)
        FOR UPDATE OF q SKIP LOCKED
        LIMIT %s;''',
        [version, batch_size])

    # Headings whose handler only needs the <h2> element itself.
    heading_handlers = {
        'currency converter': handle_currency_converter,
        'translation result': handle_translation_result,
        'local results': handle_local_results,
        'local time conversion': handle_local_time_conversion,
        'local time': handle_local_time,
        'weather result': handle_weather,
        'directions': handle_directions,
        'description': handle_description,
    }
    # Headings (or a missing heading) that mean "ordinary results only",
    # provided the page carries no other answer marker.
    plain_result_headings = (
        'web results',
        'people also ask',
        'web result with site links',
        None,
    )

    stored = 0
    for row_id, question, html in cur.fetchall():
        extraction_type = None
        short_answer = None
        long_answer = None
        url = None

        doc = BeautifulSoup(html, 'html.parser')
        featured = doc.h2
        # the casing in the html is inconsistent, so just always lowercase
        featured_type = featured.get_text().lower() if featured else None

        # Examples of ones where featured snippets do not include h2
        # 1389251
        # 1389246
        # 1389247

        # Example of one where it doesn't include 'kp-header' (it does include "answered-question")
        # 41802

        try:
            if featured_type == 'featured snippet from the web':
                snippet = featured.parent.div
                url = get_url(snippet)
                extraction_type, short_answer, long_answer = handle_featured_snippet(snippet)
            elif featured_type == 'unit converter':
                extraction_type, short_answer, long_answer = handle_unit_converter(featured, question)
            elif featured_type in heading_handlers:
                extraction_type, short_answer, long_answer = heading_handlers[featured_type](featured)
            elif featured_type == 'overview':
                extraction_type, short_answer, long_answer = handle_overview(doc)
            elif featured_type in plain_result_headings and has_no_other_answer_markers(doc):
                extraction_type, short_answer, long_answer = handle_no_snippet(featured)
            else:
                answered_div = doc.find('div', {'class': 'answered-question'})
                if answered_div:
                    # The URL must come from this page's answered-question
                    # div; no featured-snippet element exists in this branch.
                    url = get_url(answered_div)
                    extraction_type, short_answer, long_answer = handle_featured_snippet(answered_div)
                else:
                    kp_header = doc.find('div', {'class': 'kp-header'})
                    if kp_header:
                        extraction_type, short_answer, long_answer = handle_kp_header(kp_header)
                    else:
                        print(' Unknown featured display "{0}"'.format(featured_type))
        except Exception as e:
            print('Extraction for {0} failed: {1}'.format(row_id, e))
            continue

        # Shorten long answers for the log line only; the full text is stored.
        long_str = long_answer
        if long_str and len(long_str) > 50:
            long_str = long_str[:24] + '...' + long_str[-23:]
        print('{0:7} {1:10} Short ans: {2}. Long ans: {3}'.format(
            row_id,
            str(extraction_type),
            short_answer,
            long_str))
        if short_answer and len(short_answer) > 100:  # example: 80100
            print('TODO: Fix length for {0}'.format(row_id))
            # Store the row as unanswered: the SELECT above only picks rows
            # whose short_answer, answer and answer_type are all NULL, so
            # clearing all three lets a later version retry this page.
            short_answer = None
            long_answer = None
            extraction_type = None
        cur.execute('''
            INSERT INTO extractions (id, short_answer, answer, answer_url, answer_type, extract_v)
            VALUES (%s, %s, %s, %s, %s, %s)
            ON CONFLICT (id)
            DO UPDATE
            SET
            short_answer = EXCLUDED.short_answer,
            answer = EXCLUDED.answer,
            answer_url = EXCLUDED.answer_url,
            answer_type = EXCLUDED.answer_type,
            extract_v = EXCLUDED.extract_v;
            ''', [row_id, short_answer, long_answer, url, extraction_type, version])
        stored += 1
    conn.commit()
    # Report how many rows were actually written, not the configured limit.
    print('Extracted from {0} pages'.format(stored))
|
||
# do_batch() never signals completion, so this loop only ends through an
# exception or Ctrl-C; the finally clause makes sure the connection is
# closed on the way out.
try:
    while True:
        do_batch()
finally:
    conn.close()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
import psycopg2 | ||
import os | ||
|
||
def connect_to_gcp():
    """Open a PostgreSQL connection and return it together with a cursor.

    Host, port, database name and user are placeholders to be replaced with
    real values; the password is read from the ``DO_DB_PASSWORD``
    environment variable.

    Returns:
        A ``(conn, cur)`` tuple: the psycopg2 connection and a cursor on it.
    """
    settings = {
        'host': 'aa.bb.cc.dd',  # IP to your database
        'port': 1234,  # port to your DB
        'dbname': 'dbname',  # DB name
        'user': 'dbuser',  # username for your database
        'password': os.getenv('DO_DB_PASSWORD'),  # password to your database
    }
    connection = psycopg2.connect(**settings)
    return connection, connection.cursor()
Oops, something went wrong.