Commit
- update the readme.
Daniel Khashabi committed Jun 5, 2021
1 parent 4a7258f commit c6e0f92
Showing 5 changed files with 491 additions and 0 deletions.
5 changes: 5 additions & 0 deletions README.md
@@ -99,6 +99,11 @@ Here are several more examples from the data:
"answer_url": null
}
```
## Question/Answer Extraction Scripts
See the [`extraction/`](extraction) directory, which contains two sub-folders:
1. the question-extraction scripts
2. the answer-extraction scripts (a sketch of the end-to-end flow is below)
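
At a high level, the answer-extraction pipeline is database-driven: `crawl_only.py` fills a Postgres `queries` table with Google result HTML, and `extract_answers.py` parses that HTML into an `extractions` table. The sketch below is illustrative only; the connection parameters and the seed question are assumptions read off the scripts, not a supported entry point.

```python
# Sketch: seed the queries table, then run the two scripts in order.
import psycopg2

conn = psycopg2.connect(host='localhost', dbname='dbname', user='dbuser')  # assumed credentials
cur = conn.cursor()
cur.execute('INSERT INTO queries (id, question) VALUES (%s, %s);',
            [1, 'how many ounces are in a pound?'])
conn.commit()
# Then:  python3 crawl_only.py 3       fills queries.html via Google searches
#        python3 extract_answers.py    parses the HTML into the extractions table
```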


## Baselines
For scripts reproducing our [T5](https://github.com/google-research/text-to-text-transfer-transformer/) baselines, see the [`experiments/`](experiments) directory.
76 changes: 76 additions & 0 deletions extraction/answer_extraction/crawl_only.py
@@ -0,0 +1,76 @@
#!/usr/bin/env python3

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from gcp import connect_to_gcp
from fake_useragent import UserAgent
import psycopg2
import sys
import time
import random
import threading
import urllib.parse

task_batch_size = 10
concurrent_sessions = 3 if len(sys.argv) < 2 else int(sys.argv[1])
ua = UserAgent()

class CrawlWindow(threading.Thread):
    count = 0

    def __init__(self):
        threading.Thread.__init__(self)

        CrawlWindow.count += 1
        self.id = CrawlWindow.count

        chrome_options = Options()
        chrome_options.add_argument("--window-size=1024x768")
        # chrome_options.add_argument("--headless")
        chrome_options.add_argument('log-level=3')
        agent = ua.random
        print('Window {0} using user agent {1}'.format(self.id, agent))
        chrome_options.add_argument('user-agent={0}'.format(agent))
        self.driver = webdriver.Chrome(options=chrome_options)

        self.conn, self.cur = connect_to_gcp()
        print('Window {0} successfully connected to DB'.format(self.id))

    def ask_google(self, query):
        # Search for query
        query = urllib.parse.quote(query)
        self.driver.get('http://www.google.com/search?q=' + query)

        # Get HTML only
        return self.driver.find_element_by_xpath('//div[@id="search"]').get_attribute("outerHTML")

    def crawl(self, i, question):
        html = self.ask_google(question)
        self.cur.execute('UPDATE queries SET html = %s WHERE id = %s;', [html, i])
        print('Window {2} retrieved HTML for question {0}: {1}'.format(i, question, self.id))

    def do_tasks(self, tasks):
        for i, question in tasks:
            self.crawl(i, question)
            time.sleep(random.randint(2, 10))

    def run(self):
        while True:
            self.cur.execute(
                '''
                SELECT id, question
                FROM queries
                WHERE html IS NULL
                LIMIT %s
                FOR UPDATE SKIP LOCKED;
                ''',
                [task_batch_size])
            self.do_tasks(self.cur.fetchmany(task_batch_size))
            # FOR UPDATE SKIP LOCKED keeps the selected rows locked until commit,
            # so other windows skip them; commit only once the whole batch is done.
            self.conn.commit()
            print('Window {1} finished {0} tasks'.format(task_batch_size, self.id))

for _ in range(concurrent_sessions):
    CrawlWindow().start()
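
For reference, `crawl_only.py` assumes a pre-populated `queries` table. A minimal sketch of the schema implied by the SQL above; the column types and connection parameters are assumptions:

```python
# Sketch of the queries table implied by crawl_only.py (types are assumptions).
import psycopg2

conn = psycopg2.connect(host='localhost', dbname='dbname', user='dbuser')  # assumed credentials
cur = conn.cursor()
cur.execute('''
    CREATE TABLE IF NOT EXISTS queries (
        id       integer PRIMARY KEY,
        question text NOT NULL,
        html     text  -- NULL until a crawl window fills it in
    );
''')
conn.commit()
```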
229 changes: 229 additions & 0 deletions extraction/answer_extraction/extract_answers.py
@@ -0,0 +1,229 @@
#!/usr/bin/env python3

import gcp
from bs4 import BeautifulSoup

version = 12 # increment version to go through pages we are uncertain about again
batch_size = 20

conn, cur = gcp.connect_to_gcp()

print('Connected to DB')

def handle_featured_snippet(snippet):
    short_answer_div = snippet.find('div', attrs={'data-tts': 'answers'})
    short_answer = None
    if short_answer_div:
        short_answer = short_answer_div.get_text()
        short_answer_div.parent.decompose()  # make it easier to find long answer
    long_div = snippet.find('div', attrs={'role': 'heading'})
    if long_div and long_div.span:
        long_answer = long_div.span.get_text()
        return 'feat_snip', short_answer, long_answer
    else:
        ol = snippet.find('ol')
        ul = snippet.find('ul')
        if ol and not ol.has_attr('role'):  # see 6916 and 4143239
            long_list = [x.get_text() for x in ol.find_all('li')]
            return 'rich_list', short_answer, str(long_list)
        elif ul:
            long_list = [x.get_text() for x in ul.find_all('li')]
            return 'rich_set', short_answer, str(long_list)
        else:
            return 'rich_snip', short_answer, None
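
# For orientation, handle_featured_snippet expects markup shaped roughly like the
# sketch below -- reconstructed from the lookups above, not Google's exact HTML:
#
#   <div>                                              <!-- snippet -->
#     <div><div data-tts="answers">1848</div></div>    <!-- short answer -->
#     <div role="heading"><span>...</span></div>       <!-- long answer -->
#     <ol><li>step 1</li><li>step 2</li></ol>          <!-- or a rich list -->
#   </div>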

def get_split(question, delimiter):
    split = question.split(delimiter)
    if len(split) == 2 and len(split[1]) > 0:
        return split[1]

def handle_unit_converter(featured, question):
    equals = featured.parent.div(text='=')[0]
    count = equals.find_next('input')
    count_value = count.get('value')
    unit = count.find_next('option', {'selected': '1'})
    unit_value = ''
    if unit:
        unit_value = unit.get_text()
    else:  # see 13783 and 19581
        unit_value = get_split(question, ' how many ')
        if unit_value is None:
            unit_value = get_split(question, ' equal to ')

    short_answer = count_value  # sometimes it's just PEBKAC and no units available; see 20802
    if unit_value:
        short_answer = '{0} {1}'.format(count_value, unit_value)
    return 'unit_conv', short_answer, None

def handle_currency_converter(featured):
    input = featured.parent.find('select')
    count = input.find_next('input')
    count_value = count.get('value')
    unit = count.find_next('option', {'selected': '1'})
    unit_value = unit.get_text()
    short_answer = '{0} {1}'.format(count_value, unit_value)
    return 'curr_conv', short_answer, None

def handle_translation_result(featured):
    # todo: 8349104 as an example of one with another result
    short_answer = featured.parent.find('pre', {'id': 'tw-target-text'}).get_text()
    return 'tr_result', short_answer, None

def handle_local_results(featured):
    return 'local_rst', None, None

def handle_local_time_conversion(featured):
    short_answer = featured.parent.find('div', {'class': 'vk_bk'}).get_text()
    return 'time_conv', short_answer, None

def handle_local_time(featured):
    # strip because sometimes there's whitespace at the end due to div spacing
    short_answer = featured.parent.find('div', {'class': 'vk_bk'}).get_text().strip()
    return 'localtime', short_answer, None

def handle_weather(featured):
    return 'weather', None, None

def handle_kp_header(header):
    gsrt = header.find('div', {'class': 'gsrt'})
    if gsrt:
        short_answer = gsrt.div.get_text()
        return 'knowledge', short_answer, None
    else:
        return None, None, None

def handle_directions(featured):
    return 'direction', None, None

def handle_description(featured):
    return 'descript', None, None

def handle_overview(doc):
    short_ans = doc.a.get_text()
    return 'overview', short_ans, None

def handle_no_snippet(featured):
    # todo: 1119248 and 8349104 as examples of incorrect no_answer extractions
    return 'no_answer', None, None

def has_no_other_answer_markers(doc):
    return doc.find('div', {'class': 'kp-header'}) is None and \
        doc.find('div', {'class': 'answered-question'}) is None

def get_url(snippet):
    r_div = snippet.find('div', attrs={'class': 'r'})
    if r_div:
        return r_div.a['href']

def do_batch():
    cur.execute('''
        SELECT q.id, question, html
        FROM queries AS q
        LEFT JOIN extractions AS e ON q.id = e.id
        WHERE q.html IS NOT NULL
          AND e.answer IS NULL
          AND e.short_answer IS NULL
          AND e.answer_type IS NULL
          AND (e.extract_v < %s OR e.extract_v IS NULL)
        LIMIT %s
        FOR UPDATE OF q SKIP LOCKED;''',
        [version, batch_size])

    for id, question, html in cur.fetchall():
        extraction_type = None
        short_answer = None
        long_answer = None
        url = None

        doc = BeautifulSoup(html, 'html.parser')
        featured = doc.h2
        # the casing in the html is inconsistent, so just always lowercase
        featured_type = featured.get_text().lower() if featured else None

        # Examples of ones where featured snippets do not include h2:
        # 1389251, 1389246, 1389247

        # Example of one where it doesn't include 'kp-header' (it does include "answered-question"):
        # 41802

        try:
            if featured_type == 'featured snippet from the web':
                snippet = featured.parent.div
                url = get_url(snippet)
                extraction_type, short_answer, long_answer = handle_featured_snippet(snippet)
            elif featured_type == 'unit converter':
                extraction_type, short_answer, long_answer = handle_unit_converter(featured, question)
            elif featured_type == 'currency converter':
                extraction_type, short_answer, long_answer = handle_currency_converter(featured)
            elif featured_type == 'translation result':
                extraction_type, short_answer, long_answer = handle_translation_result(featured)
            elif featured_type == 'local results':
                extraction_type, short_answer, long_answer = handle_local_results(featured)
            elif featured_type == 'local time conversion':
                extraction_type, short_answer, long_answer = handle_local_time_conversion(featured)
            elif featured_type == 'local time':
                extraction_type, short_answer, long_answer = handle_local_time(featured)
            elif featured_type == 'weather result':
                extraction_type, short_answer, long_answer = handle_weather(featured)
            elif featured_type == 'directions':
                extraction_type, short_answer, long_answer = handle_directions(featured)
            elif featured_type == 'description':
                extraction_type, short_answer, long_answer = handle_description(featured)
            elif featured_type == 'overview':
                extraction_type, short_answer, long_answer = handle_overview(doc)
            elif has_no_other_answer_markers(doc) and (
                    featured_type == 'web results' or
                    featured_type == 'people also ask' or
                    featured_type == 'web result with site links' or
                    featured_type is None):
                extraction_type, short_answer, long_answer = handle_no_snippet(featured)
            else:
                answered_div = doc.find('div', {'class': 'answered-question'})
                if answered_div:
                    url = get_url(answered_div)
                    extraction_type, short_answer, long_answer = handle_featured_snippet(answered_div)
                else:
                    kp_header = doc.find('div', {'class': 'kp-header'})
                    if kp_header:
                        extraction_type, short_answer, long_answer = handle_kp_header(kp_header)
                    else:
                        print(' Unknown featured display "{0}"'.format(featured_type))
        except Exception as e:
            print('Extraction for {0} failed: {1}'.format(id, e))
            continue

        long_str = long_answer
        if long_str and len(long_str) > 50:
            long_str = long_str[:24] + '...' + long_str[-23:]
        print('{0:7} {1:10} Short ans: {2}. Long ans: {3}'.format(
            id,
            str(extraction_type),
            short_answer,
            long_str))
        if short_answer and len(short_answer) > 100:  # example: 80100
            print('TODO: Fix length for {0}'.format(id))
            short_answer = None
        cur.execute('''
            INSERT INTO extractions (id, short_answer, answer, answer_url, answer_type, extract_v)
            VALUES (%s, %s, %s, %s, %s, %s)
            ON CONFLICT (id)
            DO UPDATE
            SET
                short_answer = EXCLUDED.short_answer,
                answer = EXCLUDED.answer,
                answer_url = EXCLUDED.answer_url,
                answer_type = EXCLUDED.answer_type,
                extract_v = EXCLUDED.extract_v;
            ''', [id, short_answer, long_answer, url, extraction_type, version])
    conn.commit()
    print('Extracted from {0} pages'.format(batch_size))

while True:
    do_batch()

conn.close()
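
The INSERT above implies an `extractions` table roughly like the following. This is a sketch only; the column types and connection parameters are assumptions, and `id` must be unique for `ON CONFLICT (id)` to work:

```python
# Sketch of the extractions table implied by extract_answers.py (types are assumptions).
import psycopg2

conn = psycopg2.connect(host='localhost', dbname='dbname', user='dbuser')  # assumed credentials
cur = conn.cursor()
cur.execute('''
    CREATE TABLE IF NOT EXISTS extractions (
        id           integer PRIMARY KEY,  -- matches queries.id; ON CONFLICT (id) needs this
        short_answer text,
        answer       text,
        answer_url   text,
        answer_type  text,                 -- e.g. 'feat_snip', 'unit_conv'
        extract_v    integer
    );
''')
conn.commit()
```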
14 changes: 14 additions & 0 deletions extraction/answer_extraction/gcp.py
@@ -0,0 +1,14 @@
import psycopg2
import os

def connect_to_gcp():
    host = 'aa.bb.cc.dd'  # IP of your database
    conn = psycopg2.connect(
        host=host,
        port=1234,  # port of your DB
        dbname='dbname',  # DB name
        user='dbuser',  # username for your database
        password=os.getenv('DO_DB_PASSWORD'))  # password to your database
    cur = conn.cursor()

    return conn, cur
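
A minimal smoke test of the connection helper, assuming the placeholders above have been filled in and `DO_DB_PASSWORD` is exported:

```python
# Smoke test: confirm connectivity and count the remaining crawl work.
from gcp import connect_to_gcp

conn, cur = connect_to_gcp()
cur.execute('SELECT COUNT(*) FROM queries WHERE html IS NULL;')
print('{0} questions still need crawling'.format(cur.fetchone()[0]))
conn.close()
```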