Commit 7485f30

Updated the gl scraper to be more resilient: it now logs each successfully fetched page, inserts page_number into the database, and keeps running even when it encounters an empty page. Changed sfRepos and extractSfGit to work out of the box: the old version only queried objects in the database past a certain ObjectId (because the script had crashed earlier); it now looks for ALL objects in the database after sfRepos has finished running.
1 parent 7d8913e commit 7485f30

File tree

3 files changed: +45 -22 lines changed

extractSfGit.py (+12 -5)

@@ -1,3 +1,7 @@
+'''
+Script used to extract git urls of sourceforge projects
+Uses the 'git ls-remote' command to test if a repo exists
+'''
 import pymongo
 import sys
 import json
@@ -13,11 +17,14 @@
 db = client[dbname]
 coll = db[collName]
 
-gitBase = 'https://git.code.sf.net/p/{}/{}/'
-params = ['git', 'ls-remote']
-last_object = ObjectId("5c2f59fd256eee0016f01e8b")
-cursor = coll.find({"git": None, "_id": {"$gt": last_object}}, no_cursor_timeout=True)
+gitBase = 'https://git.code.sf.net/p/{}/{}/'  # base url
+params = ['git', 'ls-remote']  # subprocess parameters
+# last_object = ObjectId("5c2f59fd256eee0016f01e8b")  # re-enable after a crash
 
+# the query itself (scans every document that has no git url yet)
+cursor = coll.find({"git": None}, no_cursor_timeout=True)
+
+# traverse database and attempt 'git ls-remote' on two options (code vs. git)
 for doc in cursor:
     proj = re.search('projects\/(.+)', doc['url']).group(1)
     bases = [gitBase.format(proj, 'git'), gitBase.format(proj, 'code')]
@@ -29,5 +36,5 @@
         coll.update_one({'_id': doc['_id']}, {'$set': {'http_url_to_repo': base}}, upsert=False)
     except subprocess.CalledProcessError as err:
         continue
-
+# close the cursor since we set it to no timeout
 cursor.close()
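For reference, the existence check this loop performs boils down to running 'git ls-remote' against each candidate url and treating a non-zero exit as "no repo". A minimal sketch of that check, assuming subprocess.check_output and the helper name repo_exists (both illustrative; the script itself only shows the params list and the CalledProcessError handler, and 'myproj' is a placeholder project name):

import subprocess

def repo_exists(url):
    # 'git ls-remote' exits non-zero when the repo does not exist,
    # which check_output surfaces as CalledProcessError
    try:
        subprocess.check_output(['git', 'ls-remote', url],
                                stderr=subprocess.DEVNULL)
        return True
    except subprocess.CalledProcessError:
        return False

# try both layouts SourceForge uses, mirroring the loop above
for base in ['https://git.code.sf.net/p/myproj/git/',
             'https://git.code.sf.net/p/myproj/code/']:
    if repo_exists(base):
        print('found: ' + base)
        break

'git ls-remote' is a cheap probe because it only lists refs; it never clones the repository.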

glRepos.py (+32 -16)

@@ -16,10 +16,10 @@
 db = client[dbname]
 coll = db[collName]
 
-beginurl = "https://gitlab.com/api/v4/projects?archived=false&membership=false&order_by=created_at&owned=false&page=" + begin + \
-    "&per_page=99&simple=false&sort=desc&starred=false&statistics=false&with_custom_attributes=false&with_issues_enabled=false&with_merge_requests_enabled=false"
+beginurl = "https://gitlab.com/api/v4/projects?archived=false&membership=false&order_by=created_at&owned=false&page={}&per_page=99&simple=false&sort=desc&starred=false&statistics=false&with_custom_attributes=false&with_issues_enabled=false&with_merge_requests_enabled=false"
 
 gleft = 0
+success = "Successfully loaded page {}. Got {} repos, current total is {}"
 
 header = {'per_page': str(99)}
 
@@ -30,7 +30,8 @@ def wait(left):
     l = requests.get('https://gitlab.com/api/v4/projects', headers=header)
     if (l.ok):
         left = int(l.headers.get('RateLimit-Remaining'))
-    time .sleep(60)
+    print("Waiting for rate limit...")
+    time.sleep(60)
     return left
 
 # send queries and extract urls
@@ -40,34 +41,41 @@ def get(url, coll):
     global header
     global bginnum
     gleft = wait(gleft)
-    values = []
-    size = 0
+    total = 0
 
     try:
         r = requests .get(url, headers=header)
-        time .sleep(0.5)
+        time.sleep(0.5)
         # got blocked
         if r.status_code == 403:
             return "got blocked", str(bginnum)
         if (r.ok):
-
             gleft = int(r.headers.get('RateLimit-Remaining'))
+
+            # get total number of pages (i.e. get last possible page)
            lll = r.headers.get('Link')
+            ll = lll.replace(';', ',').split(',')
+            url = ll[ll.index(' rel="last"') -
+                     1].replace('<', '').replace('>', '').lstrip()
+            last = re.findall(r'&page=(\d+)&', url)
+            if (len(last) == 1):
+                last = int(last[0])
+
             t = r.text
             array = json.loads(t)
+            total += len(array)
+            print(success.format(begin, len(array), total))
 
             for el in array:
+                el['page_number'] = begin
                 coll.insert(el)
 
-            #next page
-            while ('; rel="next"' in lll):
+            pageNum = int(r.headers.get('X-Next-Page'))
+            while (pageNum <= last):
                 gleft = int(r.headers.get('RateLimit-Remaining'))
                 gleft = wait(gleft)
                 # extract next page url
-                ll = lll.replace(';', ',').split(',')
-                url = ll[ll.index(' rel="next"') -
-                         1].replace('<', '').replace('>', '').lstrip()
-
+                url = beginurl.format(pageNum)
                 try:
                     r = requests .get(url, headers=header)
                     if r.status_code == 403:
@@ -76,11 +84,19 @@ def get(url, coll):
                         lll = r.headers.get('Link')
                         t = r.text
                         array1 = json.loads(t)
+                        total += len(array1)
+                        print(success.format(pageNum, len(array1), total))
+
                         for el in array1:
+                            el['page_number'] = pageNum
                             coll.insert(el)
+
+                        pageNum = int(r.headers.get('X-Next-Page'))
                     else:
-                        sys.stderr.write("url can not found:\n" + url + '\n')
-                        return
+                        sys.stderr.write("Can't find:{}{}{}".format('\n', url, '\n'))
+                        pageNum += 1
+                        continue
+
                 except requests.exceptions.ConnectionError:
                     sys.stderr.write('could not get ' + url + '\n')
 
@@ -94,4 +110,4 @@ def get(url, coll):
                 sys.stderr.write(url + ';' + str(e) + '\n')
 
 #start retrieving
-get(beginurl,coll)
+get(beginurl.format(begin),coll)
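The net effect of these changes is a different pagination strategy: read the last page number once from the Link header, then walk page numbers directly, so a single empty or broken page is skipped instead of ending the crawl. A minimal, self-contained sketch of that pattern against the same GitLab endpoint (the regex, variable names, and trimmed-down query string are assumptions; rate-limit handling and Mongo inserts are omitted):

import re
import requests

# trimmed-down version of the script's beginurl template
base = ("https://gitlab.com/api/v4/projects"
        "?order_by=created_at&sort=desc&per_page=99&page={}")

r = requests.get(base.format(1))
# the Link header's rel="last" entry carries the final page number
m = re.search(r'[?&]page=(\d+)[^>]*>; rel="last"', r.headers.get('Link', ''))
last = int(m.group(1)) if m else 1

page = 1
while page <= last:
    r = requests.get(base.format(page))
    if not r.ok:
        page += 1      # skip a bad page instead of stopping the crawl
        continue
    repos = r.json()
    for repo in repos:
        repo['page_number'] = page   # mirror what glRepos.py now records
    print("page {}: {} repos".format(page, len(repos)))
    page += 1

Driving the loop off an explicit page counter rather than the rel="next" link is what lets the scraper continue past a page that returns no usable Link header at all.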

sfRepos.py (+1 -1)

@@ -62,7 +62,7 @@ def get(page):
 
 # Insert all projects into collection
 for i, proj in enumerate(itertools.islice(projects, len(projects))):
-    coll.insert({"url": proj, "source": "SourceForge"})
+    coll.insert({"url": proj, "source": "SourceForge", "git": None})
 
 # Print how many projects we found
 print("# projects: " + str(len(projects)))
