main.py
#!/usr/bin/env python3
import argparse
import logging
import math
import os.path
import sys
import time

import requests
from bs4 import BeautifulSoup
import bibtexparser
from bibtexparser.bibdatabase import BibDatabase

REQUEST_HEADERS = {"User-Agent": "Innocent Browser", "Accept-Charset": "UTF-8,*;q=0.5"}
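# Google Scholar tends to serve a CAPTCHA or an error page to clients without a
# browser-like User-Agent, so the headers above present a minimal disguise.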

parser = argparse.ArgumentParser(description='Retrieve all of your citations from Google Scholar.')
parser.add_argument('google_scholar_uri', type=str,
                    help='URL of your Google Scholar profile page')
parser.add_argument('--request-interval', action='store', type=int, default=100,
                    help='Interval (in seconds) between requests to Google Scholar')
parser.add_argument('--should-download', action='store_true',
                    help='Download PS/PDF files of all citations if set')
parser.add_argument('--download-dir', action='store', default="pdf",
                    help='Directory for downloaded citation PDF files')
parser.add_argument('--citation-name', action='store', default="citation.bib",
                    help='File name for all your citations in BibTeX format')
opts = parser.parse_args()

citation_num = 0
download_num = 0
session = requests.Session()
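# A single shared Session reuses the underlying TCP connection and keeps any
# cookies Scholar sets, so follow-up requests look like one continuous visit.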


def get_start_citation_num():
    """Resume support: find the index of the last citation already saved in the BibTeX file."""
    if not os.path.exists(opts.citation_name):
        with open(opts.citation_name, 'w+') as f:
            f.write("% -*-coding: utf-8 -*-\n\n")
        return 0
    with open(opts.citation_name, 'r') as f:
        citation_list = f.readlines()
    # Scan backwards for the last "% [N]" marker written before a citation entry.
    i = len(citation_list) - 1
    while i > -1:
        if citation_list[i].startswith("% ["):
            break
        i -= 1
    if i < 0:
        return 0
    last_line = citation_list[i].strip()
    logging.debug("Found last citation marker: %s" % last_line)
    start_number = int(last_line[last_line.index('[') + 1: last_line.index(']')])
    return start_number


def get_all_citations():
    global citation_num
    total_citations_num = get_total_citations_num()
    citation_num = get_start_citation_num()
    if citation_num > total_citations_num:
        logging.error("Unexpected start citation number: %d, total citations number: %d"
                      % (citation_num, total_citations_num))
        sys.exit(2)
    logging.info("Total citations number: %d, starting from citation: %d"
                 % (total_citations_num, citation_num))
    papers_per_page = 20
    citation_num_bynow = 0
    page_num = 0
    while True:
        params = {"cstart": papers_per_page * page_num, "pagesize": papers_per_page}
        logging.info("Processing homepage %s, page number: %d" % (opts.google_scholar_uri, page_num))
        soup = create_soup_by_url(opts.google_scholar_uri, params)
        paper_records = soup("tr", {"class": 'gsc_a_tr'})
        for p in paper_records:
            paper_title = p.find('a', {"class": "gsc_a_at"}).getText()
            logging.info("Processing paper: " + paper_title)
            citations_anchor = p.find('a', {"class": 'gsc_a_ac'})
            if citations_anchor['href']:
                citation_num_curr_paper = int(citations_anchor.getText())
                citation_num_bynow += citation_num_curr_paper
                # Skip papers whose citations were already saved in a previous run.
                if citation_num_bynow <= citation_num:
                    continue
                start_index = citation_num_curr_paper - (citation_num_bynow - citation_num)
                if start_index == 0:
                    with open(opts.citation_name, "a+") as f:
                        f.write("%%%%%%%%%%%%\n%% %s\n%%%%%%%%%%%%\n" % paper_title)
                get_citations_by_paper(citations_anchor['href'], citation_num_curr_paper, start_index)
            else:
                logging.warning("Current paper has not been cited yet.")
        # Is there a next page of papers?
        next_button = soup.find('button', {"id": "gsc_bpf_next"})
        if "disabled" in dict(next_button.attrs):
            break
        page_num += 1


def get_total_citations_num():
    """Get the total citation count from the user's Google Scholar homepage."""
    soup = create_soup_by_url(opts.google_scholar_uri)
    total_citations_num = int(soup("td", {"class": "gsc_rsb_std"})[0].getText())
    return total_citations_num


def get_citations_by_paper(citations_uri, count, start_index):
    # Scholar lists 10 citing papers per results page.
    for c in range(0, int(math.ceil((count - start_index) / 10.0))):
        soup = create_soup_by_url(citations_uri, {"start": c * 10 + start_index})
        for citation_record in soup('div', {"class": "gs_r"}):
            save_citation(citation_record)


def save_citation(citation_record):
    cite_anchor = citation_record.find('a', {'class': 'gs_nph', 'href': '#', "role": "button"})
    if not cite_anchor or not cite_anchor['onclick']:
        logging.warning("No Cite anchor for citation: %s" % citation_record)
        return
    # The Scholar citation ID is the second argument of the anchor's onclick handler.
    citation_id = cite_anchor['onclick'].split(',')[1][1:-1]
    logging.info("Getting formatted cite from citation id: " + citation_id)
    params = {"q": "info:%s:scholar.google.com/" % citation_id, "output": "cite"}
    soup = create_soup_by_url("https://scholar.google.com/scholar", params)
    bib_anchor = soup.find('a', {"class": "gs_citi"})
    if not bib_anchor:
        logging.debug("BibTeX page soup is: %s" % soup.getText())
        logging.warning("No BibTeX citation provided for citation: %s" % citation_id)
        return
    soup = create_soup_by_url(bib_anchor['href'])
    global citation_num
    citation_num += 1
    # Tag the BibTeX entry with the Google Scholar citation ID.
    citation_entry = bibtexparser.loads(soup.getText()).entries[0]
    citationID = citation_entry['ID']  # e.g., melville2004review
    citation_entry["gscholar_id"] = citation_id
    db = BibDatabase()
    db.entries = [citation_entry]
    g_bib_entry = bibtexparser.dumps(db)
    bib_entry = "%% [%d]\n%s" % (citation_num, g_bib_entry)
    logging.info(bib_entry.strip())
    with open(opts.citation_name, "ab+") as f:
        f.write(bib_entry.encode('utf-8'))
    if opts.should_download:
        pdf_div = citation_record.find('div', {"class": "gs_ggs gs_fl"})
        if pdf_div:
            download_pdf(pdf_div.a['href'], citationID)


def download_pdf(pdf_url, citationID):
    """Helper to download the PDF of a citing paper."""
    global citation_num, download_num
    try:
        res = requests.get(pdf_url, stream=True, timeout=30)
        with open(os.path.join(opts.download_dir, "%d_%s.pdf" % (citation_num, citationID)), "wb") as mypdf:
            mypdf.write(res.content)
        download_num += 1
        logging.info("Downloaded [%d] pdf file from link %s" % (citation_num, pdf_url))
    except Exception as err:
        logging.error("Can't download pdf file from link: " + pdf_url + " Error: " + str(err))


def create_soup_by_url(page_url, params=None):
    """Helper to fetch a URL (rate-limited) and return a BeautifulSoup object."""
    try:
        # Throttle every request to avoid triggering Scholar's bot detection.
        time.sleep(opts.request_interval)
        res = session.get(page_url, params=params, headers=REQUEST_HEADERS, timeout=10)
        res.encoding = 'UTF-8'
        logging.debug("Creating soup for URL: %s" % res.url)
        if res.status_code != 200:
            logging.debug("Response text: %s" % res.text)
            raise Exception("Bad response status code %d" % res.status_code)
        soup = BeautifulSoup(res.text, "html.parser")
        if soup.h1 and soup.h1.text == "Please show you're not a robot":
            raise Exception("You need to verify manually that you're not a robot.")
        return soup
    except Exception as err:
        logging.error("Can't open link: " + page_url + " Error: " + str(err))
        sys.exit(1)


def main():
    logging.basicConfig(level=logging.DEBUG)
    logging.info(opts)
    if opts.should_download and not os.path.exists(opts.download_dir):
        logging.debug("Creating directory %s for holding pdf files" % opts.download_dir)
        os.mkdir(opts.download_dir)
    get_all_citations()
    logging.info("Found %d citations and downloaded %d files" % (citation_num, download_num))


if __name__ == "__main__":
    sys.exit(main())
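
# Example invocation (the profile user ID below is a placeholder, not a real one):
#   ./main.py "https://scholar.google.com/citations?user=XXXXXXXXXXXX" \
#       --request-interval 120 --should-download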