-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcitation_data_updater.py
176 lines (145 loc) · 5.45 KB
/
citation_data_updater.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Standard library
import os
import time

# Third-party
import click
import requests

# Python 2/3 compatibility: quote_plus moved to urllib.parse in Python 3.
# Catch only ImportError — a bare `except:` would also swallow
# KeyboardInterrupt/SystemExit.
try:
    from urllib import quote_plus  # Python 2
except ImportError:
    from urllib.parse import quote_plus  # Python 3

# Base URL of the Crossref REST API.
CROSSREF_API = "https://api.crossref.org/"
# Page size requested from the API (Crossref 'rows' parameter).
NUMBER_ROWS = 500
@click.command()
@click.option('-d', '--date', 'inputdate', type=str,
              multiple=False, help='Date in the format: year-month-day', )
@click.option('-o', '--output', 'outputfile', type=str,
              multiple=False, help='Path to output file.')
def cli(inputdate, outputfile):
    """Fetch data from the Crossref API into an appropriate output format.

    :param inputdate: (str) date in the format year-month-day,
        e.g. 2018-05-11
    :param outputfile: (str) path to the output file
    :return: whatever ``get_crossref_data`` returns (currently None)
    """
    # Both options are required for a meaningful run; point the user
    # at --help instead of proceeding with missing arguments.
    if inputdate is None or outputfile is None:
        print("Run python %s --help for more information." %
              (os.path.basename(__file__)))
        return
    return get_crossref_data(inputdate, outputfile)
def get_crossref_data(inputdate, outputfile):
    """Page through the Crossref ``/works`` endpoint and dump citation pairs.

    Fetches all works indexed from ``inputdate`` onward that have open
    reference visibility, and appends one tab-separated
    ``citing-DOI<TAB>cited-DOI`` line per citation to ``outputfile``.

    :param inputdate: (str) basic date in the format: year-month-day
        e.g. 2018-05-11
    :param outputfile: (str) path to output file; does not test that the
        file is writable
    :return: None (side-effect: writes to the output file)
    """
    # url parts
    url_root = CROSSREF_API + "works/?"
    # TODO check if the inputdate is valid
    pars = ["filter=from-index-date:%s,reference-visibility:open" % inputdate,
            "rows=%s" % str(NUMBER_ROWS)]
    # TODO check if the filepath is valid
    # Truncate the output file so repeated runs do not accumulate stale rows.
    with open(outputfile, "w"):
        pass
    fetching = True
    icount = 0
    next_cursor = ""
    while fetching:
        print("Processing cursor #%s" % str(icount + 1))
        # First request uses the '*' wildcard cursor; subsequent requests
        # pass the (URL-quoted) cursor returned by the previous response.
        # BUGFIX: quote at the use site instead of reassigning next_cursor,
        # which double-encoded the cursor whenever a later response did not
        # carry a fresh "next-cursor" value.
        if icount > 0 and next_cursor != "":
            url = url_root + "&".join(pars) + \
                "&cursor=%s" % quote_plus(next_cursor)
        else:
            url = url_root + "&".join(pars) + "&cursor=*"
        # fetching the data
        data = fetch_from_url_or_retry(url, json=True)
        citeFrom = []
        citeTo = []
        try:
            data = data.json()
        except Exception:
            print("Request failed...")
            print(url)
            return
        if data.get("status") == "ok":
            if "message" in data and "items" in data["message"]:
                # Sanity check: the API should echo the page size we asked for.
                nitems = int(data["message"]["items-per-page"])
                assert nitems == NUMBER_ROWS
                nentries = 0
                for entry in data["message"]["items"]:
                    nentries += 1
                    if int(entry["reference-count"]) > 0 and "reference" in entry:
                        for subentry in entry["reference"]:
                            # BUGFIX: both DOIs must be present to form a
                            # pair; the citing entry's DOI was previously
                            # accessed unguarded and could raise KeyError.
                            if "DOI" in subentry and "DOI" in entry:
                                citeFrom.append(entry["DOI"])
                                citeTo.append(subentry["DOI"])
                # A short page means we reached the last page of results.
                if nentries < nitems:
                    fetching = False
                if "next-cursor" in data["message"]:
                    next_cursor = data["message"]["next-cursor"]
                    if next_cursor == "":
                        fetching = False
        else:
            # BUGFIX: a non-"ok" status previously left `fetching` True and
            # re-requested the same URL forever; stop instead.
            print("Unexpected response status; stopping.")
            print(url)
            fetching = False
        # write (append) to output -- skipped when the page yielded no pairs,
        # which previously wrote a spurious blank line.
        assert len(citeFrom) == len(citeTo)
        if citeFrom:
            with open(outputfile, "a") as outfile:
                lines = ["%s\t%s" % (i, j) for i, j in zip(citeFrom, citeTo)]
                outfile.write("\n".join(lines) + "\n")
        icount += 1
    return
def fetch_from_url_or_retry(url, json=True, header=None, post=False, data=None,
                            retry_in=None, wait=1, n_retries=10, stream=False,
                            **params):
    """
    Fetch an url using Requests or retry fetching it if the server is
    complaining with retry_in error. There is a limit to the number of
    retries.

    Retry code examples: 429, 500 and 503.

    :param url: url to be fetched as a string
    :param json: json output (sets the Content-Type request header)
    :param header: dictionary of extra HTTP headers
    :param post: boolean; use POST instead of GET
    :param data: dictionary or string body; only used if post is True
    :param retry_in: tuple/list of http codes for retrying
    :param wait: sleeping between tries in seconds
    :param n_retries: number of retry attempts
    :param stream: boolean; stream the response body
    :param params: request.get kwargs (query parameters)
    :return: requests.Response on success, None otherwise
    """
    if retry_in is None:
        retry_in = ()
    else:
        assert isinstance(retry_in, (tuple, list))
    if header is None:
        header = {}
    else:
        assert isinstance(header, dict)
    if json:
        header.update({"Content-Type": "application/json"})
    elif "Content-Type" not in header:
        header.update({"Content-Type": "text/plain"})
    if post:
        if data is None:
            # A POST without a body is unsupported by this helper.
            return None
        assert isinstance(data, (dict, str))
        response = requests.post(url, headers=header, data=data)
    else:
        response = requests.get(url, headers=header, params=params,
                                stream=stream)
    if response.ok:
        return response
    if response.status_code in retry_in and n_retries >= 0:
        # Back off, then recurse with one fewer retry left.
        time.sleep(wait)
        return fetch_from_url_or_retry(url, json, header, post, data,
                                       retry_in, wait, (n_retries - 1),
                                       stream, **params)
    try:
        response.raise_for_status()
    except requests.exceptions.HTTPError as e:
        # BUGFIX: the message was passed logging-style to print(), which
        # printed the raw '%s' format string followed by the arguments as
        # extra values; format it explicitly instead.
        print('%s: Unable to retrieve %s for %s' %
              (response.status_code, url, e))
    return None
# Script entry point: delegate to the click command-line interface.
if __name__ == '__main__':
    cli()