Skip to content

Commit e77873b

Browse files
committed
adding 2022 script
1 parent f14c0a8 commit e77873b

File tree

1 file changed

+386
-0
lines changed

1 file changed

+386
-0
lines changed

run2203.sh

+386
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,386 @@
1+
#!/bin/bash
2+
3+
# add php for next collection http://git.php.net/
4+
# check these
5+
# https://foss.heptapod.net
6+
# git.joeyh.name git 27
7+
# git.gnu.org.ua git 45
8+
# git.eclipse.org git 1,382
9+
# gitweb.torproject.org git 364
10+
# hdiff.luite.com git 15,274
11+
# git.alpinelinux.org git 11
12+
# git.openembedded.org git 9
13+
# git.yoctoproject.org git 165
14+
# git.zx2c4.com git 154
15+
# git.kernel.org git 893
16+
# fedorapeople.org git 860
17+
# git.baserock.org git 1,455
18+
# code.qt.io git 279
19+
# codeberg.org git 5,518
20+
# git.fsfe.org
21+
# gitlab.lip6.fr git 26
22+
# 0xacab.org git 1,106
23+
# gitlab.inria.fr git 1,981
24+
# gitlab.freedesktop.org git 6,514
25+
# gitlab.common-lisp.net git 801
26+
# gitlab.ow2.org git 1,185
27+
# gitlab.gnome.org git 11,061
28+
# gite.lirmm.fr git 490
29+
# framagit.org git 15,523
30+
# launchpad git 20,412
31+
# nixos nixguix
32+
# opam.ocaml.org opam 3,609
33+
# coq.inria.fr opam 366
34+
# pypi pypi 439,042
35+
36+
# on GCP use E2 8cpu 32GB, have at least 100GB of disk, and check periodically that the disk is not overrun
37+
# container swsc/gather
38+
# Once container is created,
39+
# add tokens to /data/gather,
40+
# add id_rsagihub to ~/.ssh and
41+
# copy ssh_config to the docker container's .ssh/config
42+
# do git ls-remote for each of the forges to avoid yes/no question (alternatively, add options to config to prevent that)
43+
# A command line to start container on gcp allow https
44+
# sudo docker run -d -v /home/audris/gather:/data/gather -w /home/audris -p443:22 --name gather audris/gather /bin/startDef.sh audris
45+
46+
47+
# --- Do everything by running on da servers (in tmux/screen) and connected to gcp via
48+
#ssh gc
49+
#
50+
#To forward mongodb have in your ~/.ssh/config
51+
#
52+
#host gc
53+
# user YourID
54+
# hostname GCPIP
55+
# RemoteForward 27017 da1.eecs.utk.edu:27017
56+
# port 443
57+
# IdentityFile ~/.ssh/id_rsa_gcloud
58+
59+
# in the first stage bbRepos.py, glRepos.py, and ghUpdatedRepos.py populate mongodb, which is then used to
60+
# get project list, while the rest populate project list into XXXX.$DT
61+
# all XXXX.$DT need to be copied to da cluster
62+
63+
# in the second stage the check for latest objects is produced via git ls-remote
64+
# second stage typically requires a much larger disk to store *.heads
65+
# all *.heads need to be copied to da cluster
66+
67+
git ls-remote bb:swsc/lookup
68+
git ls-remote gh:fdac20/news
69+
git ls-remote gl:inkscape/inkscape
70+
git ls-remote gl_gnome:gnome/gtk
71+
git ls-remote dr:
72+
git ls-remote deb:
73+
74+
DT=202109
75+
DTdash=2021-09-10
76+
PDT=202102
77+
PDTdash=2021-02-10
78+
79+
PT=$(date -d"$PDTdash" +%s)
80+
T=$(date -d"$DTdash" +%s)
81+
82+
# Get updated repos only: updated since last gathering
83+
# Split the interval [$PT, $T] into one equal slice per line of ./tokens and
# write "<token> <slice start> <slice end>" lines to tokens_date.
ntok=$(wc -l < tokens)
84+
inc=$(( (T-PT)/ntok ))
85+
for i in $(seq 1 "$ntok")
86+
do ptt=$(date -d"@$(( PT+(i-1)*inc ))" +"%Y-%m-%d")
87+
  # integer division can leave PT+ntok*inc a few seconds short of T, which
  # would format as the previous day; pin the last slice end to T itself
  tt=$(date -d"@$(( i == ntok ? T : PT+i*inc ))" +"%Y-%m-%d")
88+
  echo $(head -$i tokens|tail -1) $ptt $tt
89+
done > tokens_date
90+
91+
# One background worker per line of tokens_date. A fixed {1..9} skips slices
# when there are more than 9 tokens and re-runs the last slice when there are
# fewer, so iterate over the actual number of lines instead.
for i in $(seq 1 "$(wc -l < tokens_date)"); do (r=$(head -$i tokens_date|tail -1); echo $r | python3 ghUpdatedRepos.py gh$DT repos &> ghReposList$(echo $r | cut -d ' ' -f2).updt) & done
92+
93+
# BB: need to extract all, no way to check for updated ones
94+
# first slice ends on 2013-01-01 (month "00" is not a valid calendar date)
python3 bbRepos.py 1980-01-01 bitbucket$DT 2013-01-01 &> bbRepos${DT}1.out &
95+
python3 bbRepos.py 2013-01-01 bitbucket$DT 2014-05-03 &> bbRepos${DT}2.out &
96+
python3 bbRepos.py 2014-05-03 bitbucket$DT 2015-05-03 &> bbRepos${DT}3.out &
97+
python3 bbRepos.py 2015-05-03 bitbucket$DT 2016-05-03 &> bbRepos${DT}4.out &
98+
python3 bbRepos.py 2016-05-03 bitbucket$DT 2017-05-03 &> bbRepos${DT}5.out &
99+
python3 bbRepos.py 2017-05-03 bitbucket$DT 2018-05-03 &> bbRepos${DT}6.out &
100+
python3 bbRepos.py 2018-05-03 bitbucket$DT 2019-05-03 &> bbRepos${DT}7.out &
101+
# end on 2020-05-03 so this slice leaves no two-day gap before the 2020-05-03 start of the next one
python3 bbRepos.py 2019-05-03 bitbucket$DT 2020-05-03 &> bbRepos${DT}8.out &
102+
python3 bbRepos.py 2020-05-03 bitbucket$DT 2021-05-01 &> bbRepos${DT}9.out &
103+
#get only new, use heads for existing repos
104+
python3 bbRepos.py 2021-05-01 bitbucket$DT 2022-05-03 &> bbRepos${DT}0.out &
105+
106+
107+
# SF
108+
python3 sfRepos.py sf$DT repos
109+
python3 listU.py sf$DT repos '{}' url | sed "s|b'https://sourceforge.net/projects/||;s|'$||;" | sort -u > sf$DT.prj
110+
#join -v1 sf$DT.prj sf$PDT.prj > sf$DT.prj.new
111+
112+
#python3 extractSfGit.py sf201813 repos &>> sf201813.out
113+
114+
# Gitlab
115+
python3 glRepos.py 1 gl$DT repos &> gl$DT.out &
116+
117+
wait
118+
119+
# Split for parallel processing
120+
split -n l/10 -da1 sf$DT.prj sf$DT.prj.
121+
for i in {0..9}
122+
do cat sf$DT.prj.$i | while read r;
123+
do gg=$(git ls-remote "https://git.code.sf.net/p/$r/git" 2> /dev/null| awk '{print ";"$1}')
124+
cc=$(git ls-remote "https://git.code.sf.net/p/$r/code" 2> /dev/null| awk '{print ";"$1}');
125+
[[ $gg == "" ]] || echo https://git.code.sf.net/p/$r/git$gg |sed 's/ ;/;/g'
126+
[[ $cc == "" ]] || echo https://git.code.sf.net/p/$r/code$cc|sed 's/ ;/;/g';
127+
done | gzip > sf$DT.prj.$i.heads &
128+
done
129+
130+
#now do for existing
131+
zcat sf$PDT.prj.*.heads
132+
133+
# Do other forges git.bioconductor.org,
134+
wget http://git.bioconductor.org -O bio.html
135+
cat bio.html | awk '{print $2}' | grep / | grep -v '\*' | awk '{ print "https://git.bioconductor.org/"$1}' > bioconductor.org.$DT
136+
cat bioconductor.org.$DT | \
137+
while read r; do a=$(git ls-remote $r | awk '{print ";"$1}'); echo $r$a|sed 's/ //g';
138+
done | gzip > bioconductor.org.$DT.heads &
139+
140+
141+
wget "https://blitiri.com.ar/git/" -O blitiri.com.ar.html
142+
grep '<td class="name"><a href="' blitiri.com.ar.html|sed 's|^\s*<td class="name"><a href="||;s|".*||' | sort -u | awk '{print "https://blitiri.com.ar/git/"$1}' > blitiri.com.ar.$DT
143+
144+
u=fedorapeople.org
145+
wget "https://$u" -O $u.html
146+
grep 'Git repositories' $u.html|sed 's|<a href="||;s|".*||' | sort -u > $u.$DT
147+
148+
u=code.qt.io
149+
wget "https://$u/cgit/" -O $u.html
150+
grep 'toplevel-repo' $u.html| sed "s|.*href='/cgit/|/cgit/|;s|'.*||"|sort -u | awk '{print "https://'$u'"$1}' > $u.$DT
151+
152+
u=git.alpinelinux.org
153+
wget "https://$u" -O $u.html
154+
grep 'toplevel-repo' $u.html | sed "s|.*href='/|/|;s|'.*||"|sort -u | awk '{print "https://'$u'"$1}' > $u.$DT
155+
156+
u=git.openembedded.org
157+
wget "https://$u" -O $u.html
158+
grep 'toplevel-repo' $u.html | sed "s|.*' href='/|/|;s|'.*||"|sort -u | awk '{print "https://'$u'"$1}' > $u.$DT
159+
160+
for u in git.torproject.org git.xfce.org git.yoctoproject.org
161+
do wget "https://$u" -O $u.html
162+
grep -E '(sublevel|toplevel)-repo' $u.html | sed "s|.*' href='/|/|;s|'.*||"|sort -u | awk '{print "https://'$u'"$1}' > $u.$DT
163+
done
164+
165+
wget "https://repo.or.cz/?a=project_list" -O cz.html
166+
grep '\.git' cz.html | sed 's|.*"/\([^/"]*\.git\).*|\1|' | uniq | sort -u | awk '{print "https://repo.or.cz/"$1}'> repo.or.cz.$DT
167+
cat repo.or.cz.$DT | \
168+
while read r; do a=$(git ls-remote $r | awk '{print ";"$1}'); echo $r$a|sed 's/ //g';
169+
done | gzip > repo.or.cz.$DT.heads &
170+
171+
172+
wget "https://gitbox.apache.org/repos/asf" -O gitbox.apache.org.html
173+
grep '<td><a href="/repos/asf/[^\?]' gitbox.apache.org.html|sed 's|.*<td><a href="/||;s|".*||' | sort -u | awk '{print "https://gitbox.apache.org/"$1}' > gitbox.apache.org.$DT
174+
175+
echo https://gcc.gnu.org/git/gcc.git > gcc.git.$DT
176+
177+
for i in {1..50}
178+
do wget "https://pagure.io/?page=$i&sorting=None" -O pagure.io.html
179+
grep '^\s*<a href="/' pagure.io.html |sed 's|^\s*<a href="||;s|".*||'|grep -Ev '^/(about|ssh_info)$'
180+
done | uniq | sort -u | awk '{print "https://pagure.io"$1}' > pagure.io.$DT
181+
182+
u=notabug.org
183+
for i in {1..50}
184+
do wget "https://$u/explore/repos?page=$i&q=" -O $u.html
185+
grep '<a class="name" href="/' $u.html |sed 's|<a class="name" href="/|/|;s|".*||'
186+
done |sort -u | awk '{print "https://'$u'"$1}' > $u.$DT
187+
188+
for u in framagit.org gitlab.adullact.net code.ill.fr forgemia.inra.fr git.unicaen.fr git.unistra.fr git.pleroma.social gitlab.fing.edu.uy gitlab.huma-num.fr gitlab.irstea.fr gitlab.cerema.fr gite.lirmm.fr gitlab.common-lisp.net
189+
do for i in {1..50}
190+
do sleep 2; wget "https://$u/explore/projects?non_archived=true&page=$i&sort=name_asc" -O $u.html
191+
grep '<a class="project" href="' $u.html | sed 's|<a class="project" href="||;s|".*||'
192+
done | uniq | sort -u | awk '{print "https://'$u'"$1}' > $u.$DT
193+
done
194+
195+
for u in gitlab.freedesktop.org gitlab.inria.fr gitlab.ow2.org 0xacab.org invent.kde.org
196+
do for i in {1..50}
197+
do for o in latest_activity_desc name_asc name_desc created_desc created_asc
198+
do sleep 2; wget "https://$u/explore/projects?non_archived=true&page=$i&sort=$o" -O $u.html
199+
grep '<a class="project" href="' $u.html | sed 's|<a class="project" href="||;s|".*||'
200+
done
201+
done | uniq | sort -u | awk '{print "https://'$u'"$1}' > $u.a.$DT
202+
for i in {1..50}
203+
do for o in latest_activity_desc name_asc name_desc created_desc created_asc
204+
do sleep 2; wget "https://$u/explore/projects/starred?non_archived=true&page=$i&sort=$o" -O $u.html
205+
grep '<a class="project" href="' $u.html | sed 's|<a class="project" href="||;s|".*||'
206+
done
207+
done | uniq | sort -u | awk '{print "https://'$u'"$1}' > $u.s.$DT
208+
for i in {1..50}
209+
do for o in latest_activity_desc name_asc name_desc created_desc created_asc
210+
do sleep 2; wget "https://$u/explore/projects/trending?non_archived=true&page=$i&sort=$o" -O $u.html
211+
grep '<a class="project" href="' $u.html | sed 's|<a class="project" href="||;s|".*||'
212+
done
213+
done | uniq | sort -u | awk '{print "https://'$u'"$1}' > $u.t.$DT
214+
215+
cat $u.?.$DT | sort -u > $u.$DT
216+
done
217+
218+
cat invent.kde.org.$DT | \
219+
while read r; do r="$r.git";a=$(git ls-remote "$r" | awk '{print ";"$1}'); echo $r$a|sed 's/ //g'; # $r already ends in .git here; do not append the suffix a second time
220+
done | gzip > invent.kde.org.$DT.heads &
221+
222+
223+
#repo.or.cz.$DT gitlab.gnome.org.$DT android.googlesource.com.$DT git.zx2c4.com.$DT git.eclipse.org.$DT git.kernel.org.$DT git.savannah.gnu.org.$DT git.savannah.nongnu.org.$DT
224+
#fedorapeople.org.$DT list peoples git websites: need to append /public_git/ to get their projects
225+
cat fedorapeople.org.$DT|sed 's|^\s*||'|while read r; do wget "$r/public_git" -O -; done > fedorapeople.org.fix.$DT.html
226+
# Turn the href='/cgit/...' links of the fetched public_git pages into full
# cgit URLs (the last s||| expression needs its closing delimiter or sed aborts).
grep /public_git/ fedorapeople.org.fix.$DT.html| sed "s|.* href='/cgit/||;s|'.*||;s|/tree/$||;s|^|https://fedorapeople.org/cgit/|" | sort -u > fedorapeople.org.fix.$DT
227+
228+
229+
#git.pleroma.social.$DT - seems not to allow listing
230+
echo https://git.pleroma.social/pleroma/pleroma > git.pleroma.social.$DT
231+
232+
for i in fedorapeople.org.fix.$DT pagure.io.$DT blitiri.com.ar.$DT code.qt.io.$DT gitlab.common-lisp.net.$DT code.ill.fr.$DT forgemia.inra.fr.$DT git.unicaen.fr.$DT notabug.org.$DT git.unistra.fr.$DT gcc.git.$DT gitlab.fing.edu.uy.$DT gitlab.huma-num.fr.$DT gitlab.adullact.net.$DT gitlab.irstea.fr.$DT git.alpinelinux.org.$DT gitlab.cerema.fr.$DT git.openembedded.org.$DT gite.lirmm.fr.$DT git.torproject.org.$DT git.xfce.org.$DT git.yoctoproject.org.$DT framagit.org.$DT gitlab.freedesktop.org.$DT gitlab.ow2.org.$DT gitbox.apache.org.$DT gitlab.inria.fr.$DT
233+
do (: > $i.err; sed 's|/\.git/$||;s|^\s*||;s|//|//a:a@|;s|/tree/$||;s|/$||;s|blitiri.com.ar/git/r/|blitiri.com.ar/repos/|;' $i | while read r; do a=$(git ls-remote "$r" 2>> $i.err| awk '{print ";"$1}'); echo "$r$a"|sed 's/ //g'; done| gzip > $i.heads; sleep 2) & # truncate $i.err once, then append, so stderr from every repo is kept rather than only the last one
234+
done
235+
236+
237+
# pages 1-300
238+
# https://gitlab.gnome.org/explore/projects?page=300&sort=latest_activity_desc
239+
# insert username/password to prevent password requests
240+
for p in {1..300}
241+
do wget "https://gitlab.gnome.org/explore/projects?page=$p" -O - 2> /dev/null | perl -ane 'chop();if (m|^<a class="text-plain" href="|){s|<a class="text-plain" href="||;s|".*||;s|^/||;print "https://a:a\@gitlab.gnome.org/$_\n"}'
242+
done | sort -u > gitlab.gnome.org.$DT
243+
cat gitlab.gnome.org.$DT | while read r; do a=$(git ls-remote $r | awk '{print ";"$1}'); echo $r$a|sed 's/ //g'; done | gzip > gitlab.gnome.org.$DT.heads &
244+
245+
246+
# pages 1-1530
247+
# git.debian.org -> https://salsa.debian.org/explore/projects?page=1540&sort=latest_activity_desc
248+
for of in {0..9}; do
249+
for p in $(eval "echo {$((1+$of*20))..$((20+$of*20))}")
250+
do wget "https://salsa.debian.org/explore/projects?page=$p" -O - 2> /dev/null | perl -ane 'chop(); while (m|<a class="project" href="([^"]*)"|g){print "https://a:a\@salsa.debian.org$1.git\n"}'
251+
done > git.debian.org.$DT.$of &
252+
done
253+
wait
254+
for of in {0..9}; do
255+
cat git.debian.org.$DT.$of | while read r; do a=$(git ls-remote $r 2> err | awk '{print ";"$1}'); echo $r$a|sed 's/ //g'; sleep 20; done | gzip > git.debian.org.$DT.$of.heads
256+
done
257+
258+
sort=name_desc
259+
# Add following forges as well
260+
# android.git.kernel.org ??
261+
262+
#there are totally 2398 pages in https://git.drupalcode.org/explore/projects
263+
thost="https://git.drupalcode.org/explore/projects?page="
264+
i=0
265+
rm -f drupal.com.$DT  # start from an empty list: the page loop below appends (>>) to this file
266+
for i in {1..50}
267+
do
268+
# Fetch page $i of the project listing once per sort order into drupal0..7.html.
# bash has no "." concatenation operator: adjacent quoted words simply join,
# so a "." between them would end up inside the URL (page=N.&sort=...).
rhost="$thost$i"'&sort=latest_activity_desc';
269+
curl -o drupal0.html "$rhost";
270+
rhost="$thost$i"'&sort=latest_activity_asc';
271+
curl -o drupal1.html "$rhost";
272+
rhost="$thost$i"'&sort=created_desc';
273+
curl -o drupal2.html "$rhost";
274+
rhost="$thost$i"'&sort=created_asc';
275+
curl -o drupal3.html "$rhost";
276+
rhost="$thost$i"'&sort=name_asc';
277+
curl -o drupal4.html "$rhost";
278+
rhost="$thost$i"'&sort=name_desc';
279+
curl -o drupal5.html "$rhost";
280+
rhost="$thost$i"'&sort=stars_desc';
281+
curl -o drupal6.html "$rhost";
282+
rhost="$thost$i"'&sort=stars_asc';
283+
curl -o drupal7.html "$rhost";
284+
285+
#if j==-1,this is invalid page,we have gotten all pages successfully.
286+
for t in {0..7}
287+
do j=$(perl -e '$e=0;while(<STDIN>){if(m|<h5>This user doesn|){$e=-1;last;};}; print "$e\n"' < drupal$t.html);
288+
if [ "$j" -eq "-1" ]; then break; fi;
289+
#all urls will be stored in ./drupal.com
290+
perl -ane 'while(m|<span class="project-name">([^<]*)</span>|g){print "https://git.drupalcode.org/project/$1\n"}' < drupal$t.html >> drupal.com.$DT;
291+
if [ `expr $i % 10` -eq 0 ]; then sleep 2; fi;
292+
done
293+
done
294+
295+
sort -u drupal.com.$DT > drupal.com.$DT.u
296+
mv drupal.com.$DT.u drupal.com.$DT
297+
cat drupal.com.$DT| sed 's|.*/project/|dr:project/|' | \
298+
while read r; do git ls-remote $r | grep -E 'refs/heads|HEAD' | sed 's|\s*refs/heads/|;|;s|\s*HEAD|;HEAD|;s|^|'$r';|';
299+
done | gzip > drupal.com.$DT.heads &
300+
301+
302+
wget https://android.googlesource.com/ -O android.googlesource.com.html
303+
perl -ane 'while(m|class="RepoList-itemName">([^<]*)</|g){print "https://android.googlesource.com/$1\n";}' < android.googlesource.com.html > android.googlesource.com.$DT
304+
cat android.googlesource.com.$DT | \
305+
while read r; do a=$(git ls-remote $r | awk '{print ";"$1}'); echo $r$a|sed 's/ //g';
306+
done | gzip > android.googlesource.com.$DT.heads &
307+
308+
###
309+
310+
wget https://git.zx2c4.com -O git.zx2c4.com.html
311+
perl -ane "while (m|<td class='toplevel-repo'><a title='([^']*)'|g){print \"https://git.zx2c4.com/\$1\n\";}" < git.zx2c4.com.html > git.zx2c4.com.$DT
312+
cat git.zx2c4.com.$DT | \
313+
while read r; do a=$(git ls-remote $r | awk '{print ";"$1}'); echo $r$a|sed 's/ //g';
314+
done | gzip > git.zx2c4.com.$DT.heads &
315+
316+
wget http://git.eclipse.org/ -O git.eclipse.org.html
317+
perl -ane "while (m|<td class='sublevel-repo'><a title='[^']*' href='([^']*)'|g){print \"https://git.eclipse.org\$1\n\";}" < git.eclipse.org.html | sed 's|/c/|/r/|' > git.eclipse.org.$DT
318+
cat git.eclipse.org.$DT | while read r; do a=$(git ls-remote $r | awk '{print ";"$1}'); echo $r$a|sed 's/ //g'; done | gzip > git.eclipse.org.$DT.heads &
319+
320+
321+
wget http://git.postgresql.org -O git.postgresql.org.html
322+
perl -ane 'while(m|<a class="list" href="/gitweb/\?p=([^"]*);a=summary"|g){print "https://git.postgresql.org/git/$1\n"}' < git.postgresql.org.html | sort -u > git.postgresql.org.$DT
323+
cat git.postgresql.org.$DT | while read r; do a=$(git ls-remote $r | awk '{print ";"$1}'); echo $r$a|sed 's/ //g'; done | gzip > git.postgresql.org.$DT.heads &
324+
325+
wget http://git.kernel.org -O git.kernel.org.html
326+
perl -ane "while (m|<td class='sublevel-repo'><a title='[^']*' href='([^']*)'|g){print \"https://git.kernel.org\$1\n\";}" < git.kernel.org.html > git.kernel.org.$DT
327+
cat git.kernel.org.$DT | while read r; do a=$(git ls-remote $r | awk '{print ";"$1}'); echo $r$a|sed 's/ //g'; done | gzip > git.kernel.org.$DT.heads &
328+
329+
330+
wget http://git.savannah.gnu.org/cgit -O git.savannah.gnu.org.html
331+
#perl -ane "while (m|<td class='sublevel-repo'><a title='[^']*' href='([^']*)'|g){print \"https://git.savannah.gnu.org\$1\n\";}" < git.savannah.gnu.org.html | sed 's|/cgit/|/git/|' | sort -u > git.savannah.gnu.org.$DT
332+
perl -ane "while (m|<td class='toplevel-repo'><a title='([^']*)'|g){print \"https://git.savannah.gnu.org/git/\$1\n\";}" < git.savannah.gnu.org.html | sed 's|/cgit/|/git/|' | sort -u > git.savannah.gnu.org.$DT
333+
cat git.savannah.gnu.org.$DT | while read r; do a=$(git ls-remote $r | awk '{print ";"$1}'); echo $r$a|sed 's/ //g'; done | gzip > git.savannah.gnu.org.$DT.heads &
334+
335+
wget http://git.savannah.nongnu.org/cgit -O git.savannah.nongnu.org.html
336+
perl -ane "while (m|<td class='toplevel-repo'><a title='([^']*)'|g){print \"https://git.savannah.nongnu.org/git/\$1\n\";}" < git.savannah.nongnu.org.html | sed 's|/cgit/|/git/|' | sort -u > git.savannah.nongnu.org.$DT
337+
cat git.savannah.nongnu.org.$DT | while read r; do a=$(git ls-remote $r | awk '{print ";"$1}'); echo $r$a|sed 's/ //g'; done | gzip > git.savannah.nongnu.org.$DT.heads &
338+
339+
340+
#get old repos for gh, these may have changed again
341+
python3 listU.py gh$PDT repos '{}' nameWithOwner | sed "s|^b'||;s|'$||" | sort -u > gh$PDT.u
342+
split -n l/50 -da2 gh$PDT.u gh$PDT.u.
343+
for j in {00..49}
344+
do cat gh$PDT.u.$j | while read r; do
345+
a=$(git ls-remote gh:$r | awk '{print ";"$1}'); echo gh:$r$a | sed 's/ //g';
346+
done | gzip > gh$PDT.u.$j.heads &
347+
done
348+
349+
350+
wait
351+
352+
353+
354+
# Get update repos for GL
355+
python3 listU.py gl$DT repos '{ "last_activity_at" : { "$gt" : "'"$PDTdash"'" }}' http_url_to_repo | sed "s|^b'||;s|'$||"|sort -u > gl$DT.new
356+
cat gl$DT.new | sed 's|https://gitlab.com/|gl:|' | while read r; do a=$(git ls-remote $r | awk '{print ";"$1}'); echo $r$a|sed 's/ //g';
357+
done | gzip > gl$DT.new.heads &
358+
359+
# Get updated, no-forks for GH
360+
#python3 listU.py gh$DT repos '{"isFork" : false}' nameWithOwner | sed "s|^b'||;s|'$||" | sort -u > gh$DT.u
361+
# The filter must be balanced JSON: close both the "$gt" object and the outer filter object.
python3 listU.py gh$DT repos '{ "pushed_at" : { "$gt" : "'"$PDTdash"'" } }' nameWithOwner | sed "s|^b'||;s|'$||" | sort -u > gh$DT.u
362+
cat gh$PDT.u.*[0-9] | sort -t\; | join -t\; -v2 - gh$DT.u > gh$DT.new.u
363+
split -n l/50 -da2 gh$DT.new.u gh$DT.u.
364+
for j in {00..49}
365+
do cat gh$DT.u.$j | while read r; do
366+
a=$(git ls-remote gh:$r | awk '{print ";"$1}'); echo gh:$r$a | sed 's/ //g';
367+
done | gzip > gh$DT.u.$j.heads &
368+
done
369+
370+
371+
372+
# Get updated bb (do heads on all 2M?)
373+
python3 listU.py bitbucket$DT repos '{ "updated_on" : { "$gt" : "'"$PDTdash"'" } }' full_name | \
374+
sed "s|^b'||;s|'$||" | sort -u > bitbucket$DT.new
375+
split -n l/10 -da1 bitbucket$DT.new bitbucket$DT.new.
376+
for j in {0..9}
377+
do cat bitbucket$DT.new.$j | while read r; do
378+
a=$(git ls-remote bb:$r | awk '{print ";"$1}'); echo bb:$r$a | sed 's/ //g';
379+
done | gzip > bitbucket$DT.new.$j.heads &
380+
done
381+
382+
wait
383+
384+
#dump all the collected mongo data
385+
#mongodump
386+
#${un[$i]} ${ps[$i]}

0 commit comments

Comments
 (0)