Skip to content

Commit cc68589

Browse files
committed
updating for the final version of 1910 gather
1 parent fe85b4c commit cc68589

File tree

6 files changed

+22
-18
lines changed

6 files changed

+22
-18
lines changed

bbRepos.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
before = sys.argv[3]
1313
url = url + '&before='+sys.argv[3]
1414

15-
client = pymongo.MongoClient (host="da1")
15+
client = pymongo.MongoClient()
1616
dbname = sys .argv[2]
1717
# Get a reference to a particular database
1818
db = client [dbname]

ghUpdatedRepos.py

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -18,13 +18,13 @@
1818
raise ValueError("Incorrect beginning date format, should be YYYY-MM-DD")
1919

2020
# DB info
21-
client = pymongo.MongoClient(host='da1')
21+
client = pymongo.MongoClient()
2222
dbName = sys.argv[2] # db name as second arg
2323
collName = sys.argv[3] # coll name as third arg
2424
db = client[dbName]
2525
coll = db[collName]
2626

27-
token = '' # PROVIDE YOUR GITHUB API TOKEN HERE
27+
token = '9de7ae1b92c2af1b997c498a5f2605e0e4950300' # PROVIDE YOUR GITHUB API TOKEN HERE
2828
url = 'https://api.github.com/graphql'
2929
headers = {'Authorization': 'token ' + token}
3030
start = begin + 'T00:00:00Z'
@@ -113,10 +113,13 @@ def gatherData(res):
113113

114114
r = requests.post(url=url, json=jsonS, headers=headers)
115115
res = json.loads(r.text)
116-
remaining = res['data']['rateLimit']['remaining']
117-
reset = res['data']['rateLimit']['resetAt']
118-
if remaining == 0:
119-
wait(reset)
116+
try:
117+
remaining = res['data']['rateLimit']['remaining']
118+
reset = res['data']['rateLimit']['resetAt']
119+
if remaining == 0:
120+
wait(reset)
121+
except TypeError as e:
122+
print(e)
120123

121124
repos = res['data']['search']['repositoryCount']
122125
hasNextPage = res['data']['search']['pageInfo']['hasNextPage']

glRepos.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
begin = sys.argv[1] # beginning page index from first arg
1111

1212
# DB info
13-
client = pymongo.MongoClient(host='da1')
13+
client = pymongo.MongoClient()
1414
dbname = sys.argv[2] # expects db name as second arg
1515
collName = sys.argv[3] # expects collection name as third arg
1616
db = client[dbname]

listU.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44

55
jsonDict = {}
66

7-
client = pymongo.MongoClient (host="da1")
7+
client = pymongo.MongoClient ()
88
# Get a reference to a particular database
99
args = list(sys.argv)
1010
args.pop (0)

run1910.sh

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,10 @@
11
#!/bin/bash
22
PDT=201905
3+
PDTdash=2019-05-01
34
DT=201910
45
# Get updated repos only: updated since last gathering
56
#python3 ghUpdatedRepos.py 2018-12-01 gh201813 repos &> ghReposList201813.updt &
6-
python3 ghUpdatedRepos.py 2019-05-01 gh$DT repos &> ghReposList$DT.updt &
7+
cat tokens_date | while read r; do echo $r | python3 ghUpdatedRepos.new.py gh$DT repos &> ghReposList$(echo $r | cut -d ' ' -f2).updt & done
78

89
# BB: need to extract all, no way to check for updated ones
910
#python3 bbRepos.py 1980-01-01 bitbucket$DT 2013-00-01 &> bbRepos${DT}0.out &
@@ -15,7 +16,7 @@ python3 ghUpdatedRepos.py 2019-05-01 gh$DT repos &> ghReposList$DT.updt &
1516
#python3 bbRepos.py 2017-05-03 bitbucket$DT 2018-05-03 &> bbRepos${DT}6.out &
1617
#python3 bbRepos.py 2018-05-03 bitbucket$DT 2022-05-03 &> bbRepos${DT}7.out &
1718
#get only new, use heads for existing repos
18-
python3 bbRepos.py 2019-02-01 bitbucket$DT 2022-05-03 &> bbRepos${DT}0.out &
19+
python3 bbRepos.py $PDTdash bitbucket$DT 2022-05-03 &> bbRepos${DT}0.out &
1920

2021

2122
# SF
@@ -64,9 +65,9 @@ done | gzip > cgit.kde.org.$DT.heads &
6465
# https://gitlab.gnome.org/explore/projects?page=300&sort=latest_activity_desc
6566
# insert username/password to prevent password requests
6667
for p in {1..300}
67-
do wget "https://gitlab.gnome.org/explore/projects?page=$p" -O - 2> /dev/null | perl -ane 'chop();if (m|^<a class="text-plain" href="|){s|<a class="text-plain" href="||;s|".*||;s|^/||;print "https://a:a@gitlab.gnome.org/$_\n"}'
68+
do wget "https://gitlab.gnome.org/explore/projects?page=$p" -O - 2> /dev/null | perl -ane 'chop();if (m|^<a class="text-plain" href="|){s|<a class="text-plain" href="||;s|".*||;s|^/||;print "https://a:a\@gitlab.gnome.org/$_\n"}'
6869
done | sort -u > gitlab.gnome.org.$DT
69-
cat gitlab.gnome.org.$DT | while read r; do a=$(git ls-remote $r | awk '{print ";"$1}'); echo $r$a|sed 's/ //g'; done | gzip > gitlab.gnome.org.heads.$DT &
70+
cat gitlab.gnome.org.$DT | while read r; do a=$(git ls-remote $r | awk '{print ";"$1}'); echo $r$a|sed 's/ //g'; done | gzip > gitlab.gnome.org.$DT.heads &
7071

7172

7273
# pages 1-1530
@@ -132,13 +133,13 @@ cat git.kernel.org.$DT | while read r; do a=$(git ls-remote $r | awk '{print ";"
132133

133134

134135
wget http://git.savannah.gnu.org/cgit -O git.savannah.gnu.org.html
135-
perl -ane "while (m|<td class='sublevel-repo'><a title='[^']*' href='([^']*)'|g){print \"https://git.savannah.gnu.org\$1\n\";}" < git.savannah.gnu.org.html | sed 's|/cgit/|/git/|' | sort -u | > git.savannah.gnu.org.$DT
136+
perl -ane "while (m|<td class='sublevel-repo'><a title='[^']*' href='([^']*)'|g){print \"https://git.savannah.gnu.org\$1\n\";}" < git.savannah.gnu.org.html | sed 's|/cgit/|/git/|' | sort -u > git.savannah.gnu.org.$DT
136137
cat git.savannah.gnu.org.$DT | while read r; do a=$(git ls-remote $r | awk '{print ";"$1}'); echo $r$a|sed 's/ //g'; done | gzip > git.savannah.gnu.org.$DT.heads &
137138

138139
wait
139140

140141
# Get updated repos for GL
141-
python3 listU.py gl$DT repos '{ "last_activity_at" : { "$gt" : "2019-02-01" }}' http_url_to_repo | sed "s|^b'||;s|'$||" > gl$DT.new
142+
python3 listU.py gl$DT repos '{ "last_activity_at" : { "$gt" : "'"$PDTdash"'" }}' http_url_to_repo | sed "s|^b'||;s|'$||" > gl$DT.new
142143
cat gl$DT.new | sed 's|https://gitlab.com/|gl:|' | while read r; do a=$(git ls-remote $r | awk '{print ";"$1}'); echo $r$a|sed 's/ //g';
143144
done | gzip > gl$DT.new.heads &
144145

@@ -152,7 +153,7 @@ do cat gh$DT.u.$j | while read r; do
152153
done
153154

154155
# Get updated bb (do heads on all 2M?)
155-
python3 listU.py bitbucket$DT repos '{ "updated_on" : { "$gt" : "2019-02-01" } }' full_name | \
156+
python3 listU.py bitbucket$DT repos '{ "updated_on" : { "$gt" : "'"$PDTdash"'" } }' full_name | \
156157
sed "s|^b'||;s|'$||" | sort -u > bitbucket$DT.new
157158
split -n l/10 -da1 bitbucket$DT.new bitbucket$DT.new.
158159
for j in {0..8}

sfRepos.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
import re, itertools, os, pymongo, sys
66

77
# DB info
8-
client = pymongo.MongoClient(host='da1')
8+
client = pymongo.MongoClient()
99
dbname = sys.argv[1] # expects db name as first argument
1010
collName = sys.argv[2] # expects collection name as second arg
1111
db = client[dbname]
@@ -65,4 +65,4 @@ def get(page):
6565
coll.insert({"url": proj, "source": "SourceForge", "git": None})
6666

6767
# Print how many projects we found
68-
print("# projects: " + len(projects))
68+
print("# projects: " + str(len(projects)))

0 commit comments

Comments
 (0)