- update the readme.

allenai · Jun 5, 2021 · c6e0f92 · c6e0f92
1 parent 4a7258f
commit c6e0f92
Show file tree

Hide file tree

Showing 5 changed files with 491 additions and 0 deletions.
diff --git a/README.md b/README.md
@@ -99,6 +99,11 @@ Here are several more examples from the data:
   "answer_url": null
 }
 ``` 
+## Question/Answer Extraction Scripts 
+See [this directory](extraction), which contains two sub-folders:
+
+1. the question extraction script
+2. the answer extraction scripts
+
 
 ## Baselines 
 To reproduce our [T5](https://github.com/google-research/text-to-text-transfer-transformer/) baselines, see the scripts in the [`experiments/`](experiments) directory.

diff --git a/extraction/answer_extraction/crawl_only.py b/extraction/answer_extraction/crawl_only.py
@@ -0,0 +1,76 @@
+#!/usr/bin/env python3
+
+from selenium import webdriver
+from selenium.webdriver.common.keys import Keys
+from selenium.webdriver.chrome.options import Options
+from gcp import connect_to_gcp
+from fake_useragent import UserAgent
+import psycopg2
+import sys
+import time
+import random
+import threading
+import urllib.parse
+
+# Number of rows requested per batch query.
+task_batch_size = 10
+# Number of crawler threads to start; the first command-line argument
+# overrides the default of 3.
+concurrent_sessions = int(sys.argv[1]) if len(sys.argv) >= 2 else 3
+# Shared source of random User-Agent strings.
+ua = UserAgent()
+
+class CrawlWindow(threading.Thread):
+    """Worker thread that drives one Chrome window and stores result HTML.
+
+    Each instance owns its own WebDriver plus the connection/cursor pair
+    returned by connect_to_gcp(). It repeatedly claims rows of `queries`
+    whose `html` column is NULL and fills that column in.
+    """
+
+    # Number of windows created so far; used to hand out 1-based window ids.
+    count = 0
+
+    def __init__(self):
+        """Start a Chrome session with a random user agent and connect to the DB."""
+        threading.Thread.__init__(self)
+
+        CrawlWindow.count += 1
+        self.id = CrawlWindow.count
+
+        chrome_options = Options()
+        # Chrome's --window-size switch takes "width,height".
+        chrome_options.add_argument("--window-size=1024,768")
+        # chrome_options.add_argument("--headless")
+        chrome_options.add_argument('log-level=3')
+        # Draw the random agent once so the logged value is the one Chrome
+        # is actually started with.
+        agent = ua.random
+        print('Window {0} using user agent {1}'.format(self.id, agent))
+        chrome_options.add_argument('user-agent={0}'.format(agent))
+        self.driver = webdriver.Chrome(options=chrome_options)
+
+        self.conn, self.cur = connect_to_gcp()
+        print('Window {0} successfully connected to DB'.format(self.id))
+
+    def ask_google(self, query):
+        """Load the Google results page for `query` and return its HTML.
+
+        Only the outer HTML of the element `div#search` is returned.
+        """
+        # Search for query
+        query = urllib.parse.quote(query)
+        self.driver.get('http://www.google.com/search?q=' + query)
+
+        # Get HTML only
+        return self.driver.find_element_by_xpath('//div[@id="search"]').get_attribute("outerHTML")
+
+    def crawl(self, i, question):
+        """Fetch the results HTML for `question` and store it on row `i`.
+
+        The UPDATE is executed but not committed here; run() commits once
+        per batch.
+        """
+        html = self.ask_google(question)
+        self.cur.execute('UPDATE queries SET html = %s WHERE id = %s;', [html, i])
+        print('Window {2} retrieved HTML for question {0}: {1}'.format(i, question, self.id))
+
+    def do_tasks(self, tasks):
+        """Crawl every (id, question) pair, pausing 2-10 seconds after each."""
+        for i, question in tasks:
+            self.crawl(i, question)
+            time.sleep(random.randint(2, 10))
+
+    def run(self):
+        """Claim and process batches until no un-crawled rows can be claimed.
+
+        The browser and the DB connection are released when the loop ends,
+        whether it ends normally or because a crawl raised.
+        """
+        try:
+            while True:
+                self.cur.execute(
+                        '''
+                        SELECT id, question 
+                        FROM queries 
+                        WHERE html IS NULL 
+                        FOR UPDATE SKIP LOCKED 
+                        LIMIT %s;
+                        ''',
+                        [task_batch_size])
+                tasks = self.cur.fetchall()
+                if not tasks:
+                    # Nothing left to claim: end the transaction and stop
+                    # instead of re-running the query in a tight loop.
+                    self.conn.commit()
+                    print('Window {0} found no pending tasks; exiting'.format(self.id))
+                    break
+                self.do_tasks(tasks)
+                # "for update skip locked" means that we shouldn't commit until all tasks
+                # in a batch are done
+                self.conn.commit()
+                print('Window {1} finished {0} tasks'.format(len(tasks), self.id))
+        finally:
+            # Closing the connection without a commit discards the open
+            # transaction, so rows claimed by a failed batch are not updated.
+            try:
+                self.driver.quit()
+            finally:
+                self.conn.close()
+
+# Launch one crawler thread (each with its own Chrome window) per session.
+for _session in range(concurrent_sessions):
+    window = CrawlWindow()
+    window.start()
diff --git a/extraction/answer_extraction/extract_answers.py b/extraction/answer_extraction/extract_answers.py
@@ -0,0 +1,229 @@
+#!/usr/bin/env python3
+
+import gcp
+from bs4 import BeautifulSoup
+
+# increment version to go through pages we are uncertain about again
+version = 12
+# Maximum number of pages selected per do_batch() call.
+batch_size = 20
+
+# Connection and cursor shared by the rest of this script.
+conn, cur = gcp.connect_to_gcp()
+
+print('Connected to DB')
+
+def handle_featured_snippet(snippet):
+    """Extract the short and long answer from a featured-snippet element.
+
+    Returns a tuple (answer_type, short_answer, long_answer) where
+    answer_type is 'feat_snip' (heading text), 'rich_list' (ordered list),
+    'rich_set' (unordered list) or 'rich_snip' (no long answer found).
+    List answers are returned as the str() of a Python list of item texts.
+    Note: the short-answer container is removed from `snippet` in place.
+    """
+    short_answer = None
+    answers_div = snippet.find('div', attrs={'data-tts': 'answers'})
+    if answers_div:
+        short_answer = answers_div.get_text()
+        answers_div.parent.decompose() # make it easier to find long answer
+
+    heading = snippet.find('div', attrs={'role': 'heading'})
+    if heading and heading.span:
+        return 'feat_snip', short_answer, heading.span.get_text()
+
+    ordered = snippet.find('ol')
+    unordered = snippet.find('ul')
+    if ordered and not ordered.has_attr('role'): # see 6916 and 4143239
+        items = [li.get_text() for li in ordered.find_all('li')]
+        return 'rich_list', short_answer, str(items)
+    if unordered:
+        items = [li.get_text() for li in unordered.find_all('li')]
+        return 'rich_set', short_answer, str(items)
+    return 'rich_snip', short_answer, None
+
+def get_split(question, delimiter):
+    """Return the text after `delimiter` in `question`, or None.
+
+    A value is returned only when the delimiter occurs exactly once and is
+    followed by at least one character.
+    """
+    parts = question.split(delimiter)
+    if len(parts) != 2 or not parts[1]:
+        return None
+    return parts[1]
+
+def handle_unit_converter(featured, question):
+    """Read the converted value (and unit, if any) from a unit-converter box.
+
+    The value comes from the first <input> after the '=' element. The unit
+    is the selected <option> following it; when there is none, it is guessed
+    from the question text. Returns ('unit_conv', short_answer, None).
+    """
+    equals_sign = featured.parent.div(text='=')[0]
+    count_input = equals_sign.find_next('input')
+    count_value = count_input.get('value')
+
+    selected = count_input.find_next('option', {'selected': '1'})
+    if selected:
+        unit_value = selected.get_text()
+    else: # see 13783 and 19581
+        unit_value = get_split(question, ' how many ')
+        if unit_value is None:
+            unit_value = get_split(question, ' equal to ')
+
+    # sometimes it's just PEBKAC and no units available; see 20802
+    if not unit_value:
+        return 'unit_conv', count_value, None
+    return 'unit_conv', '{0} {1}'.format(count_value, unit_value), None
+
+def handle_currency_converter(featured):
+    """Read the converted amount and currency from a currency-converter box.
+
+    Starting at the first <select> under the header's parent, takes the next
+    <input> value and the next selected <option> text.
+    Returns ('curr_conv', '<amount> <currency>', None).
+    """
+    first_select = featured.parent.find('select')
+    amount_input = first_select.find_next('input')
+    amount = amount_input.get('value')
+    currency = amount_input.find_next('option', {'selected': '1'}).get_text()
+    return 'curr_conv', '{0} {1}'.format(amount, currency), None
+
+def handle_translation_result(featured):
+    """Return ('tr_result', text of <pre id="tw-target-text">, None)."""
+    # todo: 8349104 as an example of one with another result
+    target = featured.parent.find('pre', {'id': 'tw-target-text'})
+    return 'tr_result', target.get_text(), None
+
+def handle_local_results(featured):
+    """Tag the page as 'local_rst'; no answer text is extracted."""
+    return ('local_rst', None, None)
+
+def handle_local_time_conversion(featured):
+    """Return ('time_conv', text of the 'vk_bk' div, None)."""
+    time_div = featured.parent.find('div', {'class': 'vk_bk'})
+    return 'time_conv', time_div.get_text(), None
+
+def handle_local_time(featured):
+    """Return ('localtime', stripped text of the 'vk_bk' div, None)."""
+    time_div = featured.parent.find('div', {'class': 'vk_bk'})
+    # strip because sometimes there's whitespace at the end due to div spacing
+    return 'localtime', time_div.get_text().strip(), None
+
+def handle_weather(featured):
+    """Tag the page as 'weather'; no answer text is extracted."""
+    return ('weather', None, None)
+
+def handle_kp_header(header):
+    """Extract the short answer from a 'kp-header' element.
+
+    Returns ('knowledge', text, None) when the header contains a div with
+    class 'gsrt', otherwise (None, None, None).
+    """
+    gsrt = header.find('div', {'class': 'gsrt'})
+    if not gsrt:
+        return None, None, None
+    return 'knowledge', gsrt.div.get_text(), None
+
+def handle_directions(featured):
+    """Tag the page as 'direction'; no answer text is extracted."""
+    return ('direction', None, None)
+
+def handle_description(featured):
+    """Tag the page as 'descript'; no answer text is extracted."""
+    return ('descript', None, None)
+
+def handle_overview(doc):
+    """Return ('overview', text of the first <a> in `doc`, None)."""
+    first_link_text = doc.a.get_text()
+    return 'overview', first_link_text, None
+
+def handle_no_snippet(featured):
+    """Tag the page as 'no_answer'; no answer text is extracted."""
+    # todo: 1119248 and 8349104 as examples of incorrect no_answer extractions
+    return ('no_answer', None, None)
+
+def has_no_other_answer_markers(doc):
+    """Return True when `doc` has neither a 'kp-header' nor an 'answered-question' div."""
+    for css_class in ('kp-header', 'answered-question'):
+        if doc.find('div', {'class': css_class}) is not None:
+            return False
+    return True
+
+def get_url(snippet):
+    """Return the href of the first link inside the div with class 'r', or None."""
+    result_div = snippet.find('div', attrs={'class': 'r'})
+    if not result_div:
+        return None
+    return result_div.a['href']
+
+def do_batch(only_id=7669997):
+    """Extract answers from one batch of crawled pages and upsert them.
+
+    Selects up to `batch_size` rows of `queries` that have HTML but no
+    recorded answer for the current `version`, classifies each page by the
+    text of its first <h2>, runs the matching handler and upserts the result
+    into `extractions`. The batch is committed once at the end.
+
+    Args:
+        only_id: restrict the batch to this single `queries.id`. The default
+            keeps the id the query has been pinned to so far.
+            NOTE(review): that pin looks like a debugging leftover; pass
+            None to process every pending page.
+
+    Returns:
+        The number of rows fetched for this batch (0 when nothing is pending).
+    """
+    # The optional filter is a fixed SQL fragment; the id itself is still
+    # passed as a bound parameter.
+    id_filter = 'AND q.id = %s' if only_id is not None else ''
+    params = [version, batch_size]
+    if only_id is not None:
+        params.insert(0, only_id)
+
+    cur.execute('''
+        SELECT q.id, question, html
+        FROM queries AS q
+          LEFT JOIN extractions AS e ON q.id = e.id
+        WHERE q.html IS NOT NULL
+          {0}
+          AND e.answer IS NULL
+          AND e.short_answer IS NULL
+          AND e.answer_type IS NULL
+          AND (e.extract_v < %s OR e.extract_v IS NULL)
+        FOR UPDATE OF q SKIP LOCKED
+        LIMIT %s;'''.format(id_filter),
+        params)
+    rows = cur.fetchall()
+
+    # Handlers that only need the <h2> element, keyed by lowercased h2 text.
+    simple_handlers = {
+        'currency converter': handle_currency_converter,
+        'translation result': handle_translation_result,
+        'local results': handle_local_results,
+        'local time conversion': handle_local_time_conversion,
+        'local time': handle_local_time,
+        'weather result': handle_weather,
+        'directions': handle_directions,
+        'description': handle_description,
+    }
+    # h2 texts (or a missing h2) that mean "no answer box" unless another
+    # answer marker is present in the page.
+    no_snippet_types = (
+        'web results',
+        'people also ask',
+        'web result with site links',
+        None,
+    )
+
+    for page_id, question, html in rows:
+        url = None
+
+        doc = BeautifulSoup(html, 'html.parser')
+        featured = doc.h2
+        # the casing in the html is inconsistent, so just always lowercase
+        featured_type = featured.get_text().lower() if featured else None
+
+        # Examples of ones where featured snippets do not include h2
+        # 1389251
+        # 1389246
+        # 1389247
+
+        # Example of one where it doesn't include 'kp-header' (it does include "answered-question")
+        # 41802
+
+        try:
+            result = (None, None, None)
+            if featured_type == 'featured snippet from the web':
+                snippet = featured.parent.div
+                url = get_url(snippet)
+                result = handle_featured_snippet(snippet)
+            elif featured_type == 'unit converter':
+                result = handle_unit_converter(featured, question)
+            elif featured_type in simple_handlers:
+                result = simple_handlers[featured_type](featured)
+            elif featured_type == 'overview':
+                result = handle_overview(doc)
+            elif featured_type in no_snippet_types and has_no_other_answer_markers(doc):
+                result = handle_no_snippet(featured)
+            else:
+                answered_div = doc.find('div', {'class': 'answered-question'})
+                if answered_div:
+                    # Take the URL from the element being parsed in this
+                    # branch; no `snippet` exists here for the current page.
+                    url = get_url(answered_div)
+                    result = handle_featured_snippet(answered_div)
+                else:
+                    kp_header = doc.find('div', {'class': 'kp-header'})
+                    if kp_header:
+                        result = handle_kp_header(kp_header)
+                    else:
+                        print('        Unknown featured display "{0}"'.format(featured_type))
+            extraction_type, short_answer, long_answer = result
+        except Exception as e:
+            # Nothing is written for this page, so a later batch will select
+            # it again.
+            print('Extraction for {0} failed: {1}'.format(page_id, e))
+            continue
+
+        long_str = long_answer
+        if long_str and len(long_str) > 50:
+            long_str = long_str[:24] + '...' + long_str[-23:]
+        print('{0:7} {1:10} Short ans: {2}. Long ans: {3}'.format(
+            page_id,
+            str(extraction_type),
+            short_answer,
+            long_str))
+        if short_answer and len(short_answer) > 100: # example: 80100
+            print('TODO: Fix length for {0}'.format(page_id))
+            # Clear the values that are actually written to the
+            # short_answer / answer / answer_type columns below.
+            short_answer = None
+            long_answer = None
+            extraction_type = None
+        cur.execute('''
+            INSERT INTO extractions (id, short_answer, answer, answer_url, answer_type, extract_v)
+            VALUES (%s, %s, %s, %s, %s, %s)
+            ON CONFLICT (id)
+            DO UPDATE
+              SET
+                short_answer = EXCLUDED.short_answer,
+                answer = EXCLUDED.answer,
+                answer_url = EXCLUDED.answer_url,
+                answer_type = EXCLUDED.answer_type,
+                extract_v = EXCLUDED.extract_v;
+        ''', [page_id, short_answer, long_answer, url, extraction_type, version])
+    conn.commit()
+    print('Extracted from {0} pages'.format(len(rows)))
+    return len(rows)
+
+try:
+    # Keep processing batches until do_batch() explicitly reports that it
+    # fetched 0 rows; any other return value (including None) continues.
+    while do_batch() != 0:
+        pass
+finally:
+    # Runs when the loop ends, on Ctrl-C, or on an unexpected error, so the
+    # connection is always closed.
+    conn.close()
diff --git a/extraction/answer_extraction/gcp.py b/extraction/answer_extraction/gcp.py
@@ -0,0 +1,14 @@
+import psycopg2
+import os
+
+def connect_to_gcp():
+    """Open a PostgreSQL connection and return it with a fresh cursor.
+
+    The host, port, database name and user below are placeholders to be
+    replaced with your own values; the password is read from the
+    DO_DB_PASSWORD environment variable.
+
+    Returns:
+        A (connection, cursor) tuple.
+    """
+    settings = {
+        'host': 'aa.bb.cc.dd', # IP to your database
+        'port': 1234, # port to your DB
+        'dbname': 'dbname', # DB name
+        'user': 'dbuser', # username for your database
+        'password': os.getenv('DO_DB_PASSWORD'), # password to your database
+    }
+    connection = psycopg2.connect(**settings)
+    return connection, connection.cursor()