-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbotutils.py
102 lines (86 loc) · 3.64 KB
/
botutils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
from os.path import exists, join, basename, splitext
from stackapi import StackAPI
from lxml import html, etree
import subprocess as sp
import requests
import json
import time
import glob
import os
# Math-adjacent tags that Cayley does not need to know about. This list is the
# default ``_nottags`` of fetch_stack_posts, which passes the LAST entry to the
# API query and filters the remaining entries locally, so order matters.
_notinclude = [
    "math-software",
    "book-recommendation",
    "mathematica",
    "matlab",
    "soft-question",
    "recreational-mathematics",
    "networkx",
    "python",
    "online-resources",
]
def fetch_stack_posts(_siteToQuery='math', _tags="graph-theory", _nottags=_notinclude, _sort='votes', _accepted=True, log=False):
    '''Fetch questions from a Stack Exchange site and store them as JSON.

    The questions are queried through StackAPI's ``search/advanced`` endpoint,
    reduced with ``clean`` and written to ``./data/<timestamp>_<_tags>.json``.
    Any previous ``./data/*_<_tags>.json`` files are deleted first.

    Currently does not support multiple tags and nottags in the API query:
    only the last entry of ``_nottags`` is sent to the API, the remaining
    entries are filtered out locally.

    :param _siteToQuery: name of the site handed to ``StackAPI``
    :param _tags: tag the questions must carry (also used in the file name)
    :param _nottags: sequence of tags a question must not carry
    :param _sort: sort key passed to the API
    :param _accepted: passed to the API as ``accepted``
    :param log: print progress information when True
    :return: None
    '''
    # Work on a copy. The default ``_nottags`` is the module-level list, and
    # popping from it directly would silently shrink that list on every call.
    excluded = list(_nottags or ())

    search_params = dict(order='desc', min=1, sort=_sort, tagged=_tags, accepted=_accepted)
    if excluded:
        # The last tag is excluded by the API itself; the rest go to clean().
        search_params['nottagged'] = excluded.pop()

    # fetch the data
    SITE = StackAPI(_siteToQuery, max_pages=200)
    queries = SITE.fetch('search/advanced', **search_params)
    questions = queries['items']
    if log:
        print("Original fetch: %d items."%len(questions))

    # clean the data
    questions = clean(questions, nottagged=excluded)
    if log:
        print("After clean: %d items."%len(questions))

    # remove old data of the site if it exists
    for old_file in glob.glob(f"./data/*_{_tags}.json"):
        os.remove(old_file)

    # store the data (create the target directory on first use)
    os.makedirs("./data", exist_ok=True)
    filename = time.strftime("%Y%m%d_%H%M%S") + "_" + _tags + ".json"
    pathtofile = os.path.join("./data", filename)
    with open(pathtofile, 'w') as file:
        json.dump(questions, file)
    if log:
        print(f"Data saved at {os.path.abspath(pathtofile)}")
    return
def clean(data, _to_keep=("tags", "accepted_answer_id", "score", "question_id", "link", "title"), nottagged=()):
    '''Remove unnecessary data such as details of the person who posted the question.

    Items carrying at least one tag from ``nottagged`` are dropped entirely.
    Every remaining item is reduced to the keys listed in ``_to_keep``.

    The input items are not modified; new dicts are returned.

    :param data: iterable of question dicts (may be empty)
    :param _to_keep: keys to keep in every returned item
    :param nottagged: tags that disqualify an item
    :return: list of reduced question dicts, in input order
    '''
    keep = set(_to_keep)
    excluded = set(nottagged)
    cleaned_data = []
    for item in data:
        # An item without a "tags" entry cannot match any excluded tag.
        if not excluded.isdisjoint(item.get("tags", ())):
            continue
        # Filter per item: items do not necessarily share the same key set,
        # so the keys of the first item are not a reliable template.
        cleaned_data.append({k: v for k, v in item.items() if k in keep})
    return cleaned_data
def parse_a_post(url, accepted_answer_id=None, timeout=30):
    '''Download a question page and extract the question and accepted answer text.

    :param url: address of the question page
    :param accepted_answer_id: id of the accepted answer; used to locate the
        ``answer-<id>`` element on the page
    :param timeout: seconds to wait for the server before ``requests`` gives up
    :return: tuple ``(post, answer)`` with the text content of the two elements
    :raises ValueError: when no accepted answer id is given
    '''
    if not accepted_answer_id:
        raise ValueError('Must provide accepted answer ID! Cayley only accepts legit answers')

    # Without a timeout requests may block indefinitely on a stalled connection.
    page = requests.get(url, timeout=timeout)
    tree = etree.HTML(page.content)
    # NOTE(review): the element paths below are hard-coded to a specific page
    # layout - confirm they still match the site markup.
    post = tree.xpath('string(//*[@id="question"]/div[2]/div[2]/div[1])')
    answer = tree.xpath(f'string(//*[@id="answer-{accepted_answer_id}"]/div/div[2]/div[1])')
    return post, answer
def render_tex(to_render, path_to_png='tex/temp.png'):
    '''Render a LaTeX snippet to a PNG image using xelatex and pdftoppm.

    The snippet is appended to the content of ``tex/preamble.tex``, closed
    with ``\\end{document}`` and written to ``tex/temp.tex``. That file is
    compiled with ``xelatex`` and the resulting ``tex/temp.pdf`` is converted
    with ``pdftoppm -png``, whose output is written to ``path_to_png``.

    Failures of the external tools are printed, not raised.

    :param to_render: string with the LaTeX source to render
    :param path_to_png: path of the PNG file to write
    :return: None
    '''
    # create the tex file with the LaTeX source in it
    with open('tex/preamble.tex', encoding='utf-8') as f:
        buffer = f.read()
    buffer += "\n" + to_render + "\n \\end{document}"
    with open('tex/temp.tex', 'w', encoding='utf-8') as f:
        f.write(buffer)

    try:
        # convert the file to pdf; argument lists avoid shell interpretation
        sp.run(['xelatex', 'tex/temp.tex', '-output-directory=./tex'], check=True)
        # convert the pdf to png; pdftoppm writes to stdout, which is sent to the file
        with open(path_to_png, 'wb') as png:
            sp.run(['pdftoppm', '-png', 'tex/temp.pdf'], stdout=png, check=True)
    except (sp.SubprocessError, OSError) as e:
        # Best effort: a missing tool or a failed conversion is only reported.
        print(e)
    return
if __name__ == "__main__":
    # When run as a script, query the questions with the default settings and
    # print progress; comment the call out to skip the query.
    fetch_stack_posts(log=True)