db = client[dbname]
coll = db[collName]

- beginurl = "https://gitlab.com/api/v4/projects?archived=false&membership=false&order_by=created_at&owned=false&page=" + begin + \
-     "&per_page=99&simple=false&sort=desc&starred=false&statistics=false&with_custom_attributes=false&with_issues_enabled=false&with_merge_requests_enabled=false"
+ beginurl = "https://gitlab.com/api/v4/projects?archived=false&membership=false&order_by=created_at&owned=false&page={}&per_page=99&simple=false&sort=desc&starred=false&statistics=false&with_custom_attributes=false&with_issues_enabled=false&with_merge_requests_enabled=false"

gleft = 0
+ success = "Successfully loaded page {}. Got {} repos, current total is {}"

header = {'per_page': str(99)}

@@ -30,7 +30,8 @@ def wait(left):
        l = requests.get('https://gitlab.com/api/v4/projects', headers=header)
        if (l.ok):
            left = int(l.headers.get('RateLimit-Remaining'))
-         time.sleep(60)
+         print("Waiting for rate limit...")
+         time.sleep(60)
    return left


# send queries and extract urls
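[Reviewer note, not part of the patch: wait() above polls GitLab's RateLimit-Remaining response header until quota is available. A minimal standalone sketch of that check; the endpoint and header name are taken from the patch, while the fallback of 0 for a missing header is an assumption.]

import requests

r = requests.get('https://gitlab.com/api/v4/projects')
# GitLab reports the remaining request quota for the current window;
# the '0' default is a guess for responses that omit the header.
remaining = int(r.headers.get('RateLimit-Remaining', 0))
print('requests left in this window:', remaining)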
@@ -40,34 +41,41 @@ def get(url, coll):
    global header
    global bginnum
    gleft = wait(gleft)
-     values = []
-     size = 0
+     total = 0

    try:
        r = requests.get(url, headers=header)
-         time.sleep(0.5)
+         time.sleep(0.5)
        # got blocked
        if r.status_code == 403:
            return "got blocked", str(bginnum)
        if (r.ok):
-
            gleft = int(r.headers.get('RateLimit-Remaining'))
+
+             # get total number of pages (i.e. get last possible page)
            lll = r.headers.get('Link')
+             ll = lll.replace(';', ',').split(',')
+             url = ll[ll.index(' rel="last"') -
+                      1].replace('<', '').replace('>', '').lstrip()
+             last = re.findall(r'&page=(\d+)&', url)
+             if (len(last) == 1):
+                 last = int(last[0])
+
            t = r.text
            array = json.loads(t)
+             total += len(array)
+             print(success.format(begin, len(array), total))

            for el in array:
+                 el['page_number'] = begin
                coll.insert(el)

-             #next page
-             while ('; rel="next"' in lll):
+             pageNum = int(r.headers.get('X-Next-Page'))
+             while (pageNum <= last):
                gleft = int(r.headers.get('RateLimit-Remaining'))
                gleft = wait(gleft)
                # extract next page url
-                 ll = lll.replace(';', ',').split(',')
-                 url = ll[ll.index(' rel="next"') -
-                          1].replace('<', '').replace('>', '').lstrip()
-
+                 url = beginurl.format(pageNum)
                try:
                    r = requests.get(url, headers=header)
                    if r.status_code == 403:
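[Reviewer note, not part of the patch: the rel="last" lookup added above leans on the shape of GitLab's Link header, a comma-separated list of '<url>; rel="..."' pairs. After swapping ';' for ',' and splitting on ',', each URL lands one slot before its rel label, and the page count is then pulled out of the last-page URL. A self-contained walk-through on a made-up header value:]

import re

# Hypothetical Link header in the shape gitlab.com returns.
link = ('<https://gitlab.com/api/v4/projects?owned=false&page=2&per_page=99>; rel="next", '
        '<https://gitlab.com/api/v4/projects?owned=false&page=1&per_page=99>; rel="first", '
        '<https://gitlab.com/api/v4/projects?owned=false&page=7&per_page=99>; rel="last"')

ll = link.replace(';', ',').split(',')
# After the split, the URL sits one slot before its ' rel="last"' label.
url = ll[ll.index(' rel="last"') - 1].replace('<', '').replace('>', '').lstrip()
print(re.findall(r'&page=(\d+)&', url))  # ['7']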
@@ -76,11 +84,19 @@ def get(url, coll):
                        lll = r.headers.get('Link')
                        t = r.text
                        array1 = json.loads(t)
+                         total += len(array1)
+                         print(success.format(pageNum, len(array1), total))
+
                        for el in array1:
+                             el['page_number'] = pageNum
                            coll.insert(el)
+
+                         pageNum = int(r.headers.get('X-Next-Page'))
                    else:
-                         sys.stderr.write("url can not found:\n" + url + '\n')
-                         return
+                         sys.stderr.write("Can't find:{}{}{}".format('\n', url, '\n'))
+                         pageNum += 1
+                         continue
+
                except requests.exceptions.ConnectionError:
                    sys.stderr.write('could not get ' + url + '\n')

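[Reviewer note, not part of the patch: the loop above re-reads X-Next-Page on every iteration. GitLab sends that header empty on the final page, so the bare int() in the patch would raise ValueError there. A minimal sketch of the same pagination with that case guarded; iter_pages and its arguments are illustrative names, not from the patch.]

import requests

def iter_pages(page_url_template, header):
    # Walk pages until X-Next-Page comes back empty or a request fails.
    page = 1
    while page:
        r = requests.get(page_url_template.format(page), headers=header)
        if not r.ok:
            break
        yield r.json()
        nxt = r.headers.get('X-Next-Page', '')
        page = int(nxt) if nxt else None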
@@ -94,4 +110,4 @@ def get(url, coll):
                    sys.stderr.write(url + ';' + str(e) + '\n')

#start retrieving
- get(beginurl, coll)
+ get(beginurl.format(begin), coll)
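[Reviewer note, not part of the patch: with the page number now a {} placeholder, the same beginurl template serves both this first request and the while loop above. For illustration only; the value of begin is assumed here, the script defines it elsewhere.]

begin = 1  # hypothetical starting page
print(beginurl.format(begin))
# ...&owned=false&page=1&per_page=99&...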