|
| 1 | +#!/bin/bash |
| 2 | + |
| 3 | +# add php for next collection http://git.php.net/ |
| 4 | +# check these |
| 5 | +# https://foss.heptapod.net |
| 6 | +# git.joeyh.name git 27 |
| 7 | +# git.gnu.org.ua git 45 |
| 8 | +# git.eclipse.org git 1,382 |
| 9 | +# gitweb.torproject.org git 364 |
| 10 | +# hdiff.luite.com git 15,274 |
| 11 | +# git.alpinelinux.org git 11 |
| 12 | +# git.openembedded.org git 9 |
| 13 | +# git.yoctoproject.org git 165 |
| 14 | +# git.zx2c4.com git 154 |
| 15 | +# git.kernel.org git 893 |
| 16 | +# fedorapeople.org git 860 |
| 17 | +# git.baserock.org git 1,455 |
| 18 | +# code.qt.io git 279 |
| 19 | +# codeberg.org git 5,518 |
| 20 | +# git.fsfe.org |
| 21 | +# gitlab.lip6.fr git 26   (this and the following inventory notes are comments so the shell does not try to run them as commands) |
| 22 | +# 0xacab.org git 1,106 |
| 23 | +# gitlab.inria.fr git 1,981 |
| 24 | +# gitlab.freedesktop.org git 6,514 |
| 25 | +# gitlab.common-lisp.net git 801 |
| 26 | +# gitlab.ow2.org git 1,185 |
| 27 | +# gitlab.gnome.org git 11,061 |
| 28 | +# gite.lirmm.fr git 490 |
| 29 | +# framagit.org git 15,523 |
| 30 | +# launchpad git 20,412 |
| 31 | +# nixos nixguix |
| 32 | +# opam.ocaml.org opam 3,609 |
| 33 | +# coq.inria.fr opam 366 |
| 34 | +# pypi pypi 439,042 |
| 35 | + |
| 36 | +# on GCP use E2 8cpu 32GB, have at least 100GB of disk and check periodically that it does not run over |
| 37 | +# container swsc/gather |
| 38 | +# Once container is created, |
| 39 | +# add tokens to /data/gather, |
| 40 | +# add id_rsagihub to ~/.ssh and |
| 41 | +# copy ssh_config to docker's .ssh/config |
| 42 | +# do git ls-remote for each of the forges to avoid yes/no question (alternatively, add options to config to prevent that) |
| 43 | +# A command line to start container on gcp allow https |
| 44 | +# sudo docker run -d -v /home/audris/gather:/data/gather -w /home/audris -p443:22 --name gather audris/gather /bin/startDef.sh audris |
| 45 | + |
| 46 | + |
| 47 | +# --- Do everything by running on da servers (in tmux/screen) and connected to gcp via |
| 48 | +#ssh gc |
| 49 | +# |
| 50 | +#To forward mongodb have in your ~/.ssh/config |
| 51 | +# |
| 52 | +#host gc |
| 53 | +# user YourID |
| 54 | +# hostname GCPIP |
| 55 | +# RemoteForward 27017 da1.eecs.utk.edu:27017 |
| 56 | +# port 443 |
| 57 | +# IdentityFile ~/.ssh/id_rsa_gcloud |
| 58 | + |
| 59 | +# in the first stage bbRepos.py, glRepos.py, and ghUpdatedRepos.py populate mongodb, which is then used to |
| 60 | +# get project list, while the rest populate project list into XXXX.$DT |
| 61 | +# all XXXX.$DT need to be copied to da cluster |
| 62 | + |
| 63 | +# in the second stage the check for latest objects is produced via ls-remote |
| 64 | +# second stage typically requires a much larger disk to store *.heads |
| 65 | +# all *.heads need to be copied to da cluster |
| 66 | + |
| 67 | +# Contact each forge alias once up front so the first-connection yes/no question cannot stall the loops further down |
| 68 | +for probe in \ |
| 69 | + bb:swsc/lookup gh:fdac20/news \ |
| 70 | + gl:inkscape/inkscape gl_gnome:gnome/gtk \ |
| 71 | + dr: deb: |
| 72 | +do git ls-remote "$probe"; done |
| 73 | + |
| 74 | +DT=202109 |
| 75 | +DTdash=2021-09-10 |
| 76 | +PDT=202102 |
| 77 | +PDTdash=2021-02-10 |
| 78 | +# DT/DTdash tag this collection, PDT/PDTdash the previous one; PT and T are the same two dates as epoch seconds |
| 79 | +PT=$(date -d "$PDTdash" +%s) |
| 80 | +T=$(date -d "$DTdash" +%s) |
| 81 | +# Get updated repos only: updated since last gathering. |
| 82 | +# The window PT..T is cut into one equal slice per line of ./tokens; tokens_date receives "<token> <from> <to>" per slice. |
| 83 | +ntok=$(wc -l < tokens) |
| 84 | +inc=$(( ntok > 0 ? (T - PT) / ntok : 0 )) # no tokens -> no slices, instead of a division by zero |
| 85 | +for (( i = 1; i <= ntok; i++ )) |
| 86 | +do ptt=$(date -d "@$(( PT + (i - 1) * inc ))" +%Y-%m-%d) |
| 87 | + tt=$(date -d "@$(( PT + i * inc ))" +%Y-%m-%d) |
| 88 | + printf '%s %s %s\n' "$(sed -n "${i}p" tokens)" "$ptt" "$tt" |
| 89 | +done > tokens_date |
| 90 | +# One background ghUpdatedRepos.py worker per tokens_date line (the line is passed on stdin); log name uses the slice start date (field 2) |
| 91 | +while read -r r; do (echo "$r" | python3 ghUpdatedRepos.py gh$DT repos &> ghReposList$(echo "$r" | cut -d ' ' -f2).updt) & done < tokens_date |
| 92 | + |
| 93 | +# BB: need to extract all, no way to check for updated ones. One background job per date range (args: start, target collection, end - presumably creation dates, confirm in bbRepos.py); consecutive ranges share their boundary so no day is skipped |
| 94 | +python3 bbRepos.py 1980-01-01 bitbucket$DT 2013-01-01 &> bbRepos${DT}1.out & |
| 95 | +python3 bbRepos.py 2013-01-01 bitbucket$DT 2014-05-03 &> bbRepos${DT}2.out & |
| 96 | +python3 bbRepos.py 2014-05-03 bitbucket$DT 2015-05-03 &> bbRepos${DT}3.out & |
| 97 | +python3 bbRepos.py 2015-05-03 bitbucket$DT 2016-05-03 &> bbRepos${DT}4.out & |
| 98 | +python3 bbRepos.py 2016-05-03 bitbucket$DT 2017-05-03 &> bbRepos${DT}5.out & |
| 99 | +python3 bbRepos.py 2017-05-03 bitbucket$DT 2018-05-03 &> bbRepos${DT}6.out & |
| 100 | +python3 bbRepos.py 2018-05-03 bitbucket$DT 2019-05-03 &> bbRepos${DT}7.out & |
| 101 | +python3 bbRepos.py 2019-05-03 bitbucket$DT 2020-05-03 &> bbRepos${DT}8.out & |
| 102 | +python3 bbRepos.py 2020-05-03 bitbucket$DT 2021-05-01 &> bbRepos${DT}9.out & |
| 103 | +#get only new, use heads for existing repos |
| 104 | +python3 bbRepos.py 2021-05-01 bitbucket$DT 2022-05-03 &> bbRepos${DT}0.out & |
| 105 | + |
| 106 | + |
| 107 | +# SourceForge: run sfRepos.py on collection sf$DT, then reduce the listU.py output (url field requested) to bare project names in sf$DT.prj |
| 108 | +python3 sfRepos.py "sf$DT" repos |
| 109 | +python3 listU.py "sf$DT" repos '{}' url | sed -e "s|b'https://sourceforge.net/projects/||" -e "s|'$||" | sort -u > "sf$DT.prj" |
| 110 | +#join -v1 sf$DT.prj sf$PDT.prj > sf$DT.prj.new |
| 111 | + |
| 112 | +#python3 extractSfGit.py sf201813 repos &>> sf201813.out |
| 113 | + |
| 114 | +# GitLab.com: glRepos.py runs in the background against gl$DT; stdout and stderr both go to gl$DT.out |
| 115 | +python3 glRepos.py 1 "gl$DT" repos > "gl$DT.out" 2>&1 & |
| 116 | +# Barrier: block until every background job started so far has exited |
| 117 | +wait |
| 118 | + |
| 119 | +# Split for parallel processing: 10 line-based chunks sf$DT.prj.0 .. sf$DT.prj.9, one background job per chunk. Each output line is <repo url>;<sha>;<sha>... and repos that return nothing are omitted |
| 120 | +split -n l/10 -da1 "sf$DT.prj" "sf$DT.prj." |
| 121 | +for i in {0..9} |
| 122 | +do while read -r r |
| 123 | + do for kind in git code; do url="https://git.code.sf.net/p/$r/$kind"   # both paths are probed for every project |
| 124 | +   hh=$(git ls-remote "$url" 2> /dev/null | awk '{printf ";%s", $1}') |
| 125 | +   [[ -z $hh ]] || printf '%s%s\n' "$url" "$hh" |
| 126 | +  done |
| 127 | + done < "sf$DT.prj.$i" | gzip > "sf$DT.prj.$i.heads" & |
| 128 | +done |
| 129 | + |
| 130 | +#now do for existing: NOTE(review) this only prints the previous collection's SourceForge heads to stdout; no re-check of those repos follows here - looks like an unfinished step, confirm |
| 131 | +zcat sf$PDT.prj.*.heads |
| 132 | + |
| 133 | +# Other forges, starting with git.bioconductor.org: second column of the index page gives the repo paths; heads go to one line per repo (<url>;<sha>;<sha>...) |
| 134 | +wget -O bio.html http://git.bioconductor.org |
| 135 | +awk '{print $2}' bio.html | grep / | grep -v '\*' | awk '{ print "https://git.bioconductor.org/"$1}' > bioconductor.org.$DT |
| 136 | +while read repo |
| 137 | +do heads=$(git ls-remote $repo | awk '{print ";"$1}'); echo $repo$heads | sed 's/ //g' |
| 138 | +done < bioconductor.org.$DT | gzip > bioconductor.org.$DT.heads & |
| 139 | + |
| 140 | +# blitiri.com.ar: repo links are the hrefs in the "name" table cells of the git index page |
| 141 | +wget -O blitiri.com.ar.html "https://blitiri.com.ar/git/" |
| 142 | +grep '<td class="name"><a href="' blitiri.com.ar.html | sed -e 's|^\s*<td class="name"><a href="||' -e 's|".*||' | sort -u | awk '{print "https://blitiri.com.ar/git/"$1}' > blitiri.com.ar.$DT |
| 143 | +# fedorapeople.org: keep the link target from every front-page line that mentions "Git repositories" |
| 144 | +u=fedorapeople.org |
| 145 | +wget -O "$u.html" "https://$u" |
| 146 | +grep 'Git repositories' "$u.html" | sed -e 's|<a href="||' -e 's|".*||' | sort -u > "$u.$DT" |
| 147 | +# Index pages whose repository rows are marked toplevel-repo / sublevel-repo: cut the href path out of each row and prefix the host |
| 148 | +u=code.qt.io |
| 149 | +wget -O "$u.html" "https://$u/cgit/" |
| 150 | +grep 'toplevel-repo' "$u.html" | sed -e "s|.*href='/cgit/|/cgit/|" -e "s|'.*||" | sort -u | awk -v host="$u" '{print "https://" host $1}' > "$u.$DT" |
| 151 | + |
| 152 | +u=git.alpinelinux.org |
| 153 | +wget -O "$u.html" "https://$u" |
| 154 | +grep 'toplevel-repo' "$u.html" | sed -e "s|.*href='/|/|" -e "s|'.*||" | sort -u | awk -v host="$u" '{print "https://" host $1}' > "$u.$DT" |
| 155 | + |
| 156 | +u=git.openembedded.org |
| 157 | +wget -O "$u.html" "https://$u" |
| 158 | +grep 'toplevel-repo' "$u.html" | sed -e "s|.*' href='/|/|" -e "s|'.*||" | sort -u | awk -v host="$u" '{print "https://" host $1}' > "$u.$DT" |
| 159 | + |
| 160 | +for u in git.torproject.org git.xfce.org git.yoctoproject.org |
| 161 | +do wget -O "$u.html" "https://$u" |
| 162 | + grep -E '(sublevel|toplevel)-repo' "$u.html" | sed -e "s|.*' href='/|/|" -e "s|'.*||" | sort -u | awk -v host="$u" '{print "https://" host $1}' > "$u.$DT" |
| 163 | +done |
| 164 | +# repo.or.cz: project names ending in .git come from the project list page; heads go to one line per repo (<url>;<sha>;<sha>...) |
| 165 | +wget -O cz.html "https://repo.or.cz/?a=project_list" |
| 166 | +grep '\.git' cz.html | sed 's|.*"/\([^/"]*\.git\).*|\1|' | sort -u | awk '{print "https://repo.or.cz/"$1}' > repo.or.cz.$DT |
| 167 | +while read repo |
| 168 | +do heads=$(git ls-remote $repo | awk '{print ";"$1}'); echo $repo$heads | sed 's/ //g' |
| 169 | +done < repo.or.cz.$DT | gzip > repo.or.cz.$DT.heads & |
| 170 | + |
| 171 | +# gitbox.apache.org: every /repos/asf/<name> link on the index page becomes one repository URL |
| 172 | +wget -O gitbox.apache.org.html "https://gitbox.apache.org/repos/asf" |
| 173 | +grep '<td><a href="/repos/asf/[^\?]' gitbox.apache.org.html | sed -e 's|.*<td><a href="/||' -e 's|".*||' | sort -u | awk '{print "https://gitbox.apache.org/"$1}' > gitbox.apache.org.$DT |
| 174 | +# gcc: a single, fixed repository |
| 175 | +printf '%s\n' https://gcc.gnu.org/git/gcc.git > gcc.git.$DT |
| 176 | +# pagure.io: walk listing pages 1..50 and keep site-relative links, except the /about and /ssh_info pages |
| 177 | +for page in {1..50} |
| 178 | +do wget -O pagure.io.html "https://pagure.io/?page=$page&sorting=None" |
| 179 | + grep '^\s*<a href="/' pagure.io.html | sed -e 's|^\s*<a href="||' -e 's|".*||' | grep -Ev '^/(about|ssh_info)$' |
| 180 | +done | sort -u | awk '{print "https://pagure.io"$1}' > pagure.io.$DT |
| 181 | +# notabug.org: walk explore pages 1..50 and keep the hrefs of the links classed "name" |
| 182 | +u=notabug.org |
| 183 | +for page in {1..50} |
| 184 | +do wget -O "$u.html" "https://$u/explore/repos?page=$page&q=" |
| 185 | + grep '<a class="name" href="/' "$u.html" | sed -e 's|<a class="name" href="/|/|' -e 's|".*||' |
| 186 | +done | sort -u | awk -v host="$u" '{print "https://" host $1}' > "$u.$DT" |
| 187 | +# GitLab instances, name-sorted listing only: explore pages 1..50 per host, 2 s pause before each request, project hrefs collected into <host>.$DT |
| 188 | +for u in framagit.org gitlab.adullact.net code.ill.fr forgemia.inra.fr git.unicaen.fr git.unistra.fr git.pleroma.social gitlab.fing.edu.uy gitlab.huma-num.fr gitlab.irstea.fr gitlab.cerema.fr gite.lirmm.fr gitlab.common-lisp.net |
| 189 | +do for page in {1..50} |
| 190 | + do sleep 2; wget -O "$u.html" "https://$u/explore/projects?non_archived=true&page=$page&sort=name_asc" |
| 191 | +  grep '<a class="project" href="' "$u.html" | sed -e 's|<a class="project" href="||' -e 's|".*||' |
| 192 | + done | sort -u | awk -v host="$u" '{print "https://" host $1}' > "$u.$DT" |
| 193 | +done |
| 194 | +# Larger GitLab instances: pages 1..50 of the plain, starred and trending listings under five sort orders each (<host>.a/.s/.t.$DT), merged into <host>.$DT |
| 195 | +for u in gitlab.freedesktop.org gitlab.inria.fr gitlab.ow2.org 0xacab.org invent.kde.org |
| 196 | +do for page in {1..50} |
| 197 | + do for order in latest_activity_desc name_asc name_desc created_desc created_asc |
| 198 | +  do sleep 2; wget -O "$u.html" "https://$u/explore/projects?non_archived=true&page=$page&sort=$order" |
| 199 | +   grep '<a class="project" href="' "$u.html" | sed -e 's|<a class="project" href="||' -e 's|".*||' |
| 200 | +  done |
| 201 | + done | sort -u | awk -v host="$u" '{print "https://" host $1}' > "$u.a.$DT" |
| 202 | + for page in {1..50} |
| 203 | + do for order in latest_activity_desc name_asc name_desc created_desc created_asc |
| 204 | +  do sleep 2; wget -O "$u.html" "https://$u/explore/projects/starred?non_archived=true&page=$page&sort=$order" |
| 205 | +   grep '<a class="project" href="' "$u.html" | sed -e 's|<a class="project" href="||' -e 's|".*||' |
| 206 | +  done |
| 207 | + done | sort -u | awk -v host="$u" '{print "https://" host $1}' > "$u.s.$DT" |
| 208 | + for page in {1..50} |
| 209 | + do for order in latest_activity_desc name_asc name_desc created_desc created_asc |
| 210 | +  do sleep 2; wget -O "$u.html" "https://$u/explore/projects/trending?non_archived=true&page=$page&sort=$order" |
| 211 | +   grep '<a class="project" href="' "$u.html" | sed -e 's|<a class="project" href="||' -e 's|".*||' |
| 212 | +  done |
| 213 | + done | sort -u | awk -v host="$u" '{print "https://" host $1}' > "$u.t.$DT" |
| 214 | + # Union of the three listings for this host |
| 215 | + cat $u.?.$DT | sort -u > $u.$DT |
| 216 | +done |
| 217 | +# invent.kde.org heads: the listed project URLs get .git appended exactly once; each output line is <url>.git;<sha>;<sha>... |
| 218 | +while read -r r |
| 219 | +do r="$r.git"; a=$(git ls-remote "$r" | awk '{printf ";%s", $1}'); printf '%s%s\n' "$r" "$a" |
| 220 | +done < invent.kde.org.$DT | gzip > invent.kde.org.$DT.heads & |
| 221 | + |
| 222 | + |
| 223 | +#repo.or.cz.$DT gitlab.gnome.org.$DT android.googlesource.com.$DT git.zx2c4.com.$DT git.eclipse.org.$DT git.kernel.org.$DT git.savannah.gnu.org.$DT git.savannah.nongnu.org.$DT |
| 224 | +#fedorapeople.org.$DT lists people's git websites: /public_git has to be appended to get their projects; all fetched pages are concatenated into one html file and the /cgit/ links in it become repository URLs |
| 225 | +sed 's|^\s*||' fedorapeople.org.$DT | while read -r r; do wget -O - "$r/public_git"; done > fedorapeople.org.fix.$DT.html |
| 226 | +grep /public_git/ fedorapeople.org.fix.$DT.html | sed "s|.* href='/cgit/||;s|'.*||;s|/tree/$||;s|^|https://fedorapeople.org/cgit/|" | sort -u > fedorapeople.org.fix.$DT |
| 227 | + |
| 228 | + |
| 229 | +#git.pleroma.social.$DT - the instance seems not to allow listing, so the list is replaced by this one known repository |
| 230 | +printf '%s\n' https://git.pleroma.social/pleroma/pleroma > git.pleroma.social.$DT |
| 231 | +# Heads for the lists collected above, one background job per list: URLs are normalised and get dummy a:a@ credentials to avoid password prompts; stderr of all lookups accumulates in <list>.err; each output line is <url>;<sha>;<sha>... |
| 232 | +for i in fedorapeople.org.fix.$DT pagure.io.$DT blitiri.com.ar.$DT code.qt.io.$DT gitlab.common-lisp.net.$DT code.ill.fr.$DT forgemia.inra.fr.$DT git.unicaen.fr.$DT notabug.org.$DT git.unistra.fr.$DT gcc.git.$DT gitlab.fing.edu.uy.$DT gitlab.huma-num.fr.$DT gitlab.adullact.net.$DT gitlab.irstea.fr.$DT git.alpinelinux.org.$DT gitlab.cerema.fr.$DT git.openembedded.org.$DT gite.lirmm.fr.$DT git.torproject.org.$DT git.xfce.org.$DT git.yoctoproject.org.$DT framagit.org.$DT gitlab.freedesktop.org.$DT gitlab.ow2.org.$DT gitbox.apache.org.$DT gitlab.inria.fr.$DT |
| 233 | +do (: > "$i.err"; sed 's|/\.git/$||;s|^\s*||;s|//|//a:a@|;s|/tree/$||;s|/$||;s|blitiri.com.ar/git/r/|blitiri.com.ar/repos/|;' "$i" | while read -r r; do a=$(git ls-remote "$r" 2>> "$i.err" | awk '{printf ";%s", $1}'); printf '%s%s\n' "$r" "$a"; done | gzip > "$i.heads"; sleep 2) & |
| 234 | +done |
| 235 | + |
| 236 | + |
| 237 | +# gitlab.gnome.org: explore pages 1-300, e.g. |
| 238 | +# https://gitlab.gnome.org/explore/projects?page=300&sort=latest_activity_desc |
| 239 | +# a dummy username/password (a:a) is inserted into each URL to prevent password requests |
| 240 | +for page in {1..300} |
| 241 | +do wget -O - "https://gitlab.gnome.org/explore/projects?page=$page" 2> /dev/null | perl -ane 'chop();if (m|^<a class="text-plain" href="|){s|<a class="text-plain" href="||;s|".*||;s|^/||;print "https://a:a\@gitlab.gnome.org/$_\n"}' |
| 242 | +done | sort -u > gitlab.gnome.org.$DT |
| 243 | +while read repo; do heads=$(git ls-remote $repo | awk '{print ";"$1}'); echo $repo$heads | sed 's/ //g'; done < gitlab.gnome.org.$DT | gzip > gitlab.gnome.org.$DT.heads & |
| 244 | + |
| 245 | + |
| 246 | +# salsa.debian.org (formerly git.debian.org) has on the order of 1530 pages, e.g. https://salsa.debian.org/explore/projects?page=1540&sort=latest_activity_desc |
| 247 | +# NOTE(review): the ten background jobs below fetch 20 pages each, i.e. pages 1..200 only - confirm that this is intended |
| 248 | +for of in {0..9}; do |
| 249 | +for (( p = 1 + of * 20; p <= 20 + of * 20; p++ )) |
| 250 | +do wget -O - "https://salsa.debian.org/explore/projects?page=$p" 2> /dev/null | perl -ane 'chop(); while (m|<a class="project" href="([^"]*)"|g){print "https://a:a\@salsa.debian.org$1.git\n"}' |
| 251 | +done > git.debian.org.$DT.$of & |
| 252 | +done |
| 253 | +wait |
| 254 | +for of in {0..9}; do |
| 255 | +while read repo; do heads=$(git ls-remote $repo 2> err | awk '{print ";"$1}'); echo $repo$heads | sed 's/ //g'; sleep 20; done < git.debian.org.$DT.$of | gzip > git.debian.org.$DT.$of.heads |
| 256 | +done |
| 257 | + |
| 258 | +sort=name_desc |
| 259 | +# Add following forges as well |
| 260 | +# android.git.kernel.org ?? |
| 261 | + |
| 262 | +# Drupal: https://git.drupalcode.org/explore/projects has about 2398 pages in total; only pages 1..50 are |
| 263 | +# fetched, once per sort order (presumably so that the different orderings together reach more projects). |
| 264 | +thost="https://git.drupalcode.org/explore/projects?page=" |
| 265 | +sorts=(latest_activity_desc latest_activity_asc created_desc created_asc name_asc name_desc stars_desc stars_asc) |
| 266 | +# Results are appended below, so clear any list left over from an earlier run. |
| 267 | +rm -f "drupal.com.$DT" |
| 268 | +for i in {1..50} |
| 269 | +do |
| 270 | + # Fetch page $i once per sort order into drupal0.html .. drupal7.html. |
| 271 | + for t in "${!sorts[@]}" |
| 272 | + do curl -o "drupal$t.html" "$thost$i&sort=${sorts[t]}" |
| 273 | + done |
| 274 | + for t in "${!sorts[@]}" |
| 275 | + do # A page carrying this marker is treated as invalid: the remaining orderings of page $i are skipped. |
| 276 | +  if grep -q '<h5>This user doesn' "drupal$t.html"; then break; fi |
| 277 | +  # Every project name found is appended to drupal.com.$DT as a git.drupalcode.org/project/<name> URL. |
| 278 | +  perl -ane 'while(m|<span class="project-name">([^<]*)</span>|g){print "https://git.drupalcode.org/project/$1\n"}' < "drupal$t.html" >> "drupal.com.$DT" |
| 279 | +  if (( i % 10 == 0 )); then sleep 2; fi |
| 280 | + done |
| 281 | +done |
| 294 | +# Deduplicate the drupal list in place, then query each project through the dr: alias; output has one line per ref: <repo>;<sha>;<branch or HEAD> |
| 295 | +sort -u "drupal.com.$DT" > "drupal.com.$DT.u" |
| 296 | +mv "drupal.com.$DT.u" "drupal.com.$DT" |
| 297 | +sed 's|.*/project/|dr:project/|' drupal.com.$DT | \ |
| 298 | +while read r; do git ls-remote $r | grep -E 'refs/heads|HEAD' | sed -e 's|\s*refs/heads/|;|' -e 's|\s*HEAD|;HEAD|' -e 's|^|'$r';|' |
| 299 | +done | gzip > drupal.com.$DT.heads & |
| 300 | + |
| 301 | +# android.googlesource.com: repository names come from the RepoList-itemName elements of the index page; heads go to one line per repo (<url>;<sha>;<sha>...) |
| 302 | +wget -O android.googlesource.com.html https://android.googlesource.com/ |
| 303 | +perl -ane 'while(m|class="RepoList-itemName">([^<]*)</|g){print "https://android.googlesource.com/$1\n";}' < android.googlesource.com.html > android.googlesource.com.$DT |
| 304 | +while read repo |
| 305 | +do heads=$(git ls-remote $repo | awk '{print ";"$1}'); echo $repo$heads | sed 's/ //g' |
| 306 | +done < android.googlesource.com.$DT | gzip > android.googlesource.com.$DT.heads & |
| 307 | + |
| 308 | +### |
| 309 | +# git.zx2c4.com: repository names are the link titles of the toplevel-repo cells; heads go to one line per repo (<url>;<sha>;<sha>...) |
| 310 | +wget -O git.zx2c4.com.html https://git.zx2c4.com |
| 311 | +perl -ane "while (m|<td class='toplevel-repo'><a title='([^']*)'|g){print \"https://git.zx2c4.com/\$1\n\";}" < git.zx2c4.com.html > git.zx2c4.com.$DT |
| 312 | +while read repo |
| 313 | +do heads=$(git ls-remote $repo | awk '{print ";"$1}'); echo $repo$heads | sed 's/ //g' |
| 314 | +done < git.zx2c4.com.$DT | gzip > git.zx2c4.com.$DT.heads & |
| 315 | +# git.eclipse.org: hrefs of the sublevel-repo cells, with the first /c/ rewritten to /r/; heads go to one line per repo |
| 316 | +wget -O git.eclipse.org.html http://git.eclipse.org/ |
| 317 | +perl -ane "while (m|<td class='sublevel-repo'><a title='[^']*' href='([^']*)'|g){print \"https://git.eclipse.org\$1\n\";}" < git.eclipse.org.html | sed 's|/c/|/r/|' > git.eclipse.org.$DT |
| 318 | +while read repo; do heads=$(git ls-remote $repo | awk '{print ";"$1}'); echo $repo$heads | sed 's/ //g'; done < git.eclipse.org.$DT | gzip > git.eclipse.org.$DT.heads & |
| 319 | + |
| 320 | +# git.postgresql.org: project names come from the gitweb summary links (?p=<name>;a=summary); heads go to one line per repo |
| 321 | +wget -O git.postgresql.org.html http://git.postgresql.org |
| 322 | +perl -ane 'while(m|<a class="list" href="/gitweb/\?p=([^"]*);a=summary"|g){print "https://git.postgresql.org/git/$1\n"}' < git.postgresql.org.html | sort -u > git.postgresql.org.$DT |
| 323 | +while read repo; do heads=$(git ls-remote $repo | awk '{print ";"$1}'); echo $repo$heads | sed 's/ //g'; done < git.postgresql.org.$DT | gzip > git.postgresql.org.$DT.heads & |
| 324 | +# git.kernel.org: hrefs of the sublevel-repo cells become the repository URLs; heads go to one line per repo |
| 325 | +wget -O git.kernel.org.html http://git.kernel.org |
| 326 | +perl -ane "while (m|<td class='sublevel-repo'><a title='[^']*' href='([^']*)'|g){print \"https://git.kernel.org\$1\n\";}" < git.kernel.org.html > git.kernel.org.$DT |
| 327 | +while read repo; do heads=$(git ls-remote $repo | awk '{print ";"$1}'); echo $repo$heads | sed 's/ //g'; done < git.kernel.org.$DT | gzip > git.kernel.org.$DT.heads & |
| 328 | + |
| 329 | +# git.savannah.gnu.org: link titles of the toplevel-repo cells, served under /git/; the commented-out variant below used the sublevel-repo hrefs instead |
| 330 | +wget -O git.savannah.gnu.org.html http://git.savannah.gnu.org/cgit |
| 331 | +#perl -ane "while (m|<td class='sublevel-repo'><a title='[^']*' href='([^']*)'|g){print \"https://git.savannah.gnu.org\$1\n\";}" < git.savannah.gnu.org.html | sed 's|/cgit/|/git/|' | sort -u > git.savannah.gnu.org.$DT |
| 332 | +perl -ane "while (m|<td class='toplevel-repo'><a title='([^']*)'|g){print \"https://git.savannah.gnu.org/git/\$1\n\";}" < git.savannah.gnu.org.html | sed 's|/cgit/|/git/|' | sort -u > git.savannah.gnu.org.$DT |
| 333 | +while read repo; do heads=$(git ls-remote $repo | awk '{print ";"$1}'); echo $repo$heads | sed 's/ //g'; done < git.savannah.gnu.org.$DT | gzip > git.savannah.gnu.org.$DT.heads & |
| 334 | +# git.savannah.nongnu.org: same extraction as for git.savannah.gnu.org (toplevel-repo link titles under /git/) |
| 335 | +wget -O git.savannah.nongnu.org.html http://git.savannah.nongnu.org/cgit |
| 336 | +perl -ane "while (m|<td class='toplevel-repo'><a title='([^']*)'|g){print \"https://git.savannah.nongnu.org/git/\$1\n\";}" < git.savannah.nongnu.org.html | sed 's|/cgit/|/git/|' | sort -u > git.savannah.nongnu.org.$DT |
| 337 | +while read repo; do heads=$(git ls-remote $repo | awk '{print ";"$1}'); echo $repo$heads | sed 's/ //g'; done < git.savannah.nongnu.org.$DT | gzip > git.savannah.nongnu.org.$DT.heads & |
| 338 | + |
| 339 | + |
| 340 | +#get old repos for gh, these may have changed again: names from the previous collection gh$PDT, split into 50 chunks, one background ls-remote job per chunk |
| 341 | +python3 listU.py gh$PDT repos '{}' nameWithOwner | sed -e "s|^b'||" -e "s|'$||" | sort -u > gh$PDT.u |
| 342 | +split -n l/50 -da2 gh$PDT.u gh$PDT.u. |
| 343 | +for j in {00..49} |
| 344 | +do while read name; do |
| 345 | +  heads=$(git ls-remote gh:$name | awk '{print ";"$1}'); echo gh:$name$heads | sed 's/ //g' |
| 346 | + done < gh$PDT.u.$j | gzip > gh$PDT.u.$j.heads & |
| 347 | +done |
| 348 | + |
| 349 | +# Barrier: let every background job started above finish before the next stage |
| 350 | +wait |
| 351 | + |
| 352 | + |
| 353 | + |
| 354 | +# GitLab.com repos whose last_activity_at is later than the previous collection date; URLs are rewritten to the gl: alias before ls-remote |
| 355 | +python3 listU.py gl$DT repos '{ "last_activity_at" : { "$gt" : "'"$PDTdash"'" }}' http_url_to_repo | sed -e "s|^b'||" -e "s|'$||" | sort -u > gl$DT.new |
| 356 | +sed 's|https://gitlab.com/|gl:|' gl$DT.new | while read repo; do heads=$(git ls-remote $repo | awk '{print ";"$1}'); echo $repo$heads | sed 's/ //g' |
| 357 | +done | gzip > gl$DT.new.heads & |
| 358 | + |
| 359 | +# GH repos pushed since the previous collection date (the commented-out variant below selects non-forks instead); heads are taken only for names that were not in the previous list |
| 360 | +#python3 listU.py gh$DT repos '{"isFork" : false}' nameWithOwner | sed "s|^b'||;s|'$||" | sort -u > gh$DT.u |
| 361 | +python3 listU.py gh$DT repos '{ "pushed_at" : { "$gt" : "'"$PDTdash"'" } }' nameWithOwner | sed "s|^b'||;s|'$||" | sort -u > gh$DT.u |
| 362 | +cat gh$PDT.u.*[0-9] | sort -t\; | join -t\; -v2 - gh$DT.u > gh$DT.new.u # -v2: keep only lines of gh$DT.u that have no match in the previous list |
| 363 | +split -n l/50 -da2 gh$DT.new.u gh$DT.u. |
| 364 | +for j in {00..49} |
| 365 | +do while read -r r; do |
| 366 | +  a=$(git ls-remote "gh:$r" | awk '{printf ";%s", $1}'); printf 'gh:%s%s\n' "$r" "$a" |
| 367 | + done < gh$DT.u.$j | gzip > gh$DT.u.$j.heads & |
| 368 | +done |
| 369 | + |
| 370 | + |
| 371 | + |
| 372 | +# Get updated bb (do heads on all 2M?): names with updated_on later than the previous collection date, split into 10 chunks, one background ls-remote job per chunk |
| 373 | +python3 listU.py bitbucket$DT repos '{ "updated_on" : { "$gt" : "'"$PDTdash"'" } }' full_name | \ |
| 374 | + sed -e "s|^b'||" -e "s|'$||" | sort -u > bitbucket$DT.new |
| 375 | +split -n l/10 -da1 bitbucket$DT.new bitbucket$DT.new. |
| 376 | +for j in {0..9} |
| 377 | +do while read name; do |
| 378 | +  heads=$(git ls-remote bb:$name | awk '{print ";"$1}'); echo bb:$name$heads | sed 's/ //g' |
| 379 | + done < bitbucket$DT.new.$j | gzip > bitbucket$DT.new.$j.heads & |
| 380 | +done |
| 381 | +# Barrier: wait for all remaining background jobs |
| 382 | +wait |
| 383 | + |
| 384 | +#dump all the collected mongo data |
| 385 | +#mongodump |
| 386 | +#${un[$i]} ${ps[$i]} |
0 commit comments