Commit 7485f30

Updated the gl scraper to be more resilient: it now logs each successfully fetched page, inserts page_number into the database, and keeps running even when it encounters an empty page. Changed sfRepos and extractSfGit to work out of the box: the old version only queried objects in the database past a certain ObjectId (because the script had crashed earlier); it now looks for ALL objects in the database after sfRepos has finished running.
1 parent 7d8913e commit 7485f30

File tree

3 files changed: +45 -22 lines changed

extractSfGit.py (+12 -5)

@@ -1,3 +1,7 @@
+'''
+Script used to extract git urls of sourceforge projects
+Uses the 'git ls-remote' command to test if a repo exists
+'''
 import pymongo
 import sys
 import json
@@ -13,11 +17,14 @@
 db = client[dbname]
 coll = db[collName]
 
-gitBase = 'https://git.code.sf.net/p/{}/{}/'
-params = ['git', 'ls-remote']
-last_object = ObjectId("5c2f59fd256eee0016f01e8b")
-cursor = coll.find({"git": None, "_id": {"$gt": last_object}}, no_cursor_timeout=True)
+gitBase = 'https://git.code.sf.net/p/{}/{}/'  # base url
+params = ['git', 'ls-remote']  # subprocess parameters
+# last_object = ObjectId("5c2f59fd256eee0016f01e8b")  # re-enable after a crash
 
+# the query itself (scans every document that has no git url yet)
+cursor = coll.find({"git": None}, no_cursor_timeout=True)
+
+# traverse database and attempt 'git ls-remote' on two options (code vs. git)
 for doc in cursor:
     proj = re.search('projects\/(.+)', doc['url']).group(1)
     bases = [gitBase.format(proj, 'git'), gitBase.format(proj, 'code')]
@@ -29,5 +36,5 @@
         coll.update_one({'_id': doc['_id']}, {'$set': {'http_url_to_repo': base}}, upsert=False)
     except subprocess.CalledProcessError as err:
         continue
-
+# close the cursor since we set it to no timeout
 cursor.close()
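For reference, the existence check this loop performs boils down to running 'git ls-remote' against each candidate url and treating a non-zero exit as "no repo". A minimal sketch of that check, assuming subprocess.check_output and the helper name repo_exists (both illustrative; the script itself only shows the params list and the CalledProcessError handler, and 'myproj' is a placeholder project name):

import subprocess

def repo_exists(url):
    # 'git ls-remote' exits non-zero when the repo does not exist,
    # which check_output surfaces as CalledProcessError
    try:
        subprocess.check_output(['git', 'ls-remote', url],
                                stderr=subprocess.DEVNULL)
        return True
    except subprocess.CalledProcessError:
        return False

# try both layouts SourceForge uses, mirroring the loop above
for base in ['https://git.code.sf.net/p/myproj/git/',
             'https://git.code.sf.net/p/myproj/code/']:
    if repo_exists(base):
        print('found: ' + base)
        break

'git ls-remote' is a cheap probe because it only lists refs; it never clones the repository.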

glRepos.py (+32 -16)

@@ -16,10 +16,10 @@
 db = client[dbname]
 coll = db[collName]
 
-beginurl = "https://gitlab.com/api/v4/projects?archived=false&membership=false&order_by=created_at&owned=false&page=" + begin + \
-    "&per_page=99&simple=false&sort=desc&starred=false&statistics=false&with_custom_attributes=false&with_issues_enabled=false&with_merge_requests_enabled=false"
+beginurl = "https://gitlab.com/api/v4/projects?archived=false&membership=false&order_by=created_at&owned=false&page={}&per_page=99&simple=false&sort=desc&starred=false&statistics=false&with_custom_attributes=false&with_issues_enabled=false&with_merge_requests_enabled=false"
 
 gleft = 0
+success = "Successfully loaded page {}. Got {} repos, current total is {}"
 
 header = {'per_page': str(99)}
 
@@ -30,7 +30,8 @@ def wait(left):
     l = requests.get('https://gitlab.com/api/v4/projects', headers=header)
     if (l.ok):
         left = int(l.headers.get('RateLimit-Remaining'))
-    time .sleep(60)
+    print("Waiting for rate limit...")
+    time.sleep(60)
     return left
 
 # send queries and extract urls
@@ -40,34 +41,41 @@ def get(url, coll):
     global header
     global bginnum
     gleft = wait(gleft)
-    values = []
-    size = 0
+    total = 0
 
     try:
         r = requests .get(url, headers=header)
-        time .sleep(0.5)
+        time.sleep(0.5)
         # got blocked
         if r.status_code == 403:
             return "got blocked", str(bginnum)
         if (r.ok):
-
             gleft = int(r.headers.get('RateLimit-Remaining'))
+
+            # get total number of pages (i.e. get last possible page)
            lll = r.headers.get('Link')
+            ll = lll.replace(';', ',').split(',')
+            url = ll[ll.index(' rel="last"') -
+                     1].replace('<', '').replace('>', '').lstrip()
+            last = re.findall(r'&page=(\d+)&', url)
+            if (len(last) == 1):
+                last = int(last[0])
+
             t = r.text
             array = json.loads(t)
+            total += len(array)
+            print(success.format(begin, len(array), total))
 
             for el in array:
+                el['page_number'] = begin
                 coll.insert(el)
 
-            #next page
-            while ('; rel="next"' in lll):
+            pageNum = int(r.headers.get('X-Next-Page'))
+            while (pageNum <= last):
                 gleft = int(r.headers.get('RateLimit-Remaining'))
                 gleft = wait(gleft)
                 # extract next page url
-                ll = lll.replace(';', ',').split(',')
-                url = ll[ll.index(' rel="next"') -
-                         1].replace('<', '').replace('>', '').lstrip()
-
+                url = beginurl.format(pageNum)
                 try:
                     r = requests .get(url, headers=header)
                     if r.status_code == 403:
@@ -76,11 +84,19 @@ def get(url, coll):
                         lll = r.headers.get('Link')
                         t = r.text
                         array1 = json.loads(t)
+                        total += len(array1)
+                        print(success.format(pageNum, len(array1), total))
+
                         for el in array1:
+                            el['page_number'] = pageNum
                             coll.insert(el)
+
+                        pageNum = int(r.headers.get('X-Next-Page'))
                     else:
-                        sys.stderr.write("url can not found:\n" + url + '\n')
-                        return
+                        sys.stderr.write("Can't find:{}{}{}".format('\n', url, '\n'))
+                        pageNum += 1
+                        continue
+
                 except requests.exceptions.ConnectionError:
                     sys.stderr.write('could not get ' + url + '\n')
 
@@ -94,4 +110,4 @@ def get(url, coll):
                 sys.stderr.write(url + ';' + str(e) + '\n')
 
 #start retrieving
-get(beginurl,coll)
+get(beginurl.format(begin),coll)
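The net effect of these changes is a different pagination strategy: read the last page number once from the Link header, then walk page numbers directly, so a single empty or broken page is skipped instead of ending the crawl. A minimal, self-contained sketch of that pattern against the same GitLab endpoint (the regex, variable names, and trimmed-down query string are assumptions; rate-limit handling and Mongo inserts are omitted):

import re
import requests

# trimmed-down version of the script's beginurl template
base = ("https://gitlab.com/api/v4/projects"
        "?order_by=created_at&sort=desc&per_page=99&page={}")

r = requests.get(base.format(1))
# the Link header's rel="last" entry carries the final page number
m = re.search(r'[?&]page=(\d+)[^>]*>; rel="last"', r.headers.get('Link', ''))
last = int(m.group(1)) if m else 1

page = 1
while page <= last:
    r = requests.get(base.format(page))
    if not r.ok:
        page += 1      # skip a bad page instead of stopping the crawl
        continue
    repos = r.json()
    for repo in repos:
        repo['page_number'] = page   # mirror what glRepos.py now records
    print("page {}: {} repos".format(page, len(repos)))
    page += 1

Driving the loop off an explicit page counter rather than the rel="next" link is what lets the scraper continue past a page that returns no usable Link header at all.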

sfRepos.py (+1 -1)

@@ -62,7 +62,7 @@ def get(page):
 
 # Insert all projects into collection
 for i, proj in enumerate(itertools.islice(projects, len(projects))):
-    coll.insert({"url": proj, "source": "SourceForge"})
+    coll.insert({"url": proj, "source": "SourceForge", "git": None})
 
 # Print how many projects we found
 print("# projects: " + str(len(projects)))
