From 9bd18ca8dde9b79dccf27f481453dabbfce2faf3 Mon Sep 17 00:00:00 2001 From: mkg20001 Date: Fri, 19 May 2017 19:28:30 +0200 Subject: [PATCH 01/10] Add getzim script --- getzim.sh | 92 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 92 insertions(+) create mode 100644 getzim.sh diff --git a/getzim.sh b/getzim.sh new file mode 100644 index 0000000..fcd361a --- /dev/null +++ b/getzim.sh @@ -0,0 +1,92 @@ +#!/bin/bash + +if [ ! -e ".content" ]; then + echo "Downloading content list..." + curl -s http://wiki.kiwix.org/wiki/Content_in_all_languages > .content +fi +html=$(cat .content) +urls=$(echo "$html" | grep "Download" | grep "http://download.kiwix.org/zim/.*_all.zim\"" -o | grep "http://download.kiwix.org/zim/.*_all.zim" -o | uniq) #filter all urls + +textmenu() { #allows the user to choose an id or item from a list + echo + a=($1) #turn $1 into an array + [ -z "${a[1]}" ] && echo "Skipping question \"$2\" because there is only one choice: ${a[0]}" && res="${a[0]}" && return 0 + i=0 + for c in $1; do #build a list + echo "[$i] $c" + i=$(expr $i + 1) + done + [ -z "$3" ] && read -p "[?] $2 > " _id #if no element was specified as cli arg ask the user + [ ! -z "$3" ] && _id="$3" #otherwise use that + id=$(echo "$_id" | sed "s|[^0-9]||g") #only numbers + for e in $1; do [ "$e" == "$_id" ] && res="$e" && echo "< $res" && return 0; done #check if item is in list + res=${a[$id]} + [ -z "$res" ] && [ ! 
-z "$3" ] && echo "Invalid ID or item: $3" && exit 2 #if id/item was specified via cli exit if invalid + [ -z "$id" ] && [ -z "$_id" ] && textmenu "$1" "$2" && return 0 #no input + [ -z "$id" ] && echo "Please enter a number or what you want" && textmenu "$1" "$2" && return 0 + [ -z "$res" ] && echo "INVALID ID" && textmenu "$1" "$2" && return 0 + echo "< $res" #show the choice to the user +} + +#Select Source +srcs=$(echo "$urls" | grep "zim/.*_.*_all.zim" -o | grep "/[a-z]*_" -o | grep "[a-z]*" -o | uniq) +textmenu "$srcs" "Select which source to mirror" "$1" +src="$res" + +#Select Language +langs=$(echo "$urls" | grep "/${res}_.*" -o | grep -o "_.*_" | sed "s|^_||g" | sed "s|_$||g") +textmenu "$langs" "Select which language to mirror" "$2" +lang="$res" + +#Get URL +url="http://download.kiwix.org/zim/${src}_${lang}_all.zim" +urlverify=$(echo "$urls" | grep "$url") +[ -z "$urlverify" ] && echo "INTERNAL ERROR: $url was not found in list but seems to be valid - Please report!" && exit 2 + +echo +echo "Source: $src, Language: $lang, Url: $url" + +[ -z "$*" ] && read -p "Press return to start downloading (this may take a long time)... +" _foo + +md5=$(curl -sL $url.md5) #get the md5 +real=$(curl -sLI $url | grep "^Location:" | sed "s|Location: ||g" | grep "[a-zA-Z0-9\/:\._-]*" -o) #all the redirects +dest=$(basename $(echo "$real" | head -n 1)) #the real filename (includes date in filename, different from the one in the wiki) + +md5check() { + echo + echo "Verify md5 checksum..." + md5sum -c > /dev/null 2> /dev/null << q +$md5 +q + e=$? + if [ $e -ne 0 ]; then + echo "md5sum FAILED!" + if [ -z "$1" ]; then + echo "Trying to continue the download..." + echo + wget --continue "$url" -O $dest + echo + md5check "r" + else + echo + echo "It seems like your file is corrupted" + echo "Please remove it:" #we won't do that because the user might not want this + echo " $ rm $dest" + exit 2 + fi + else + echo + echo "Success! 
File saved as $dest" + fi +} + +if [ -e "$dest" ]; then + echo "Skipping download, file already exists..." + md5check +else + echo + wget $url -O $dest + echo + md5check +fi From 9d000979581b7e55122668ee0b8b28116f3e0c2f Mon Sep 17 00:00:00 2001 From: mkg20001 Date: Fri, 19 May 2017 19:35:31 +0200 Subject: [PATCH 02/10] Replace source with wiki --- getzim.sh | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/getzim.sh b/getzim.sh index fcd361a..2911df7 100644 --- a/getzim.sh +++ b/getzim.sh @@ -28,10 +28,10 @@ textmenu() { #allows the user to choose an id or item from a list echo "< $res" #show the choice to the user } -#Select Source -srcs=$(echo "$urls" | grep "zim/.*_.*_all.zim" -o | grep "/[a-z]*_" -o | grep "[a-z]*" -o | uniq) -textmenu "$srcs" "Select which source to mirror" "$1" -src="$res" +#Select Wiki +wikis=$(echo "$urls" | grep "zim/.*_.*_all.zim" -o | grep "/[a-z]*_" -o | grep "[a-z]*" -o | uniq) +textmenu "$wikis" "Select which wiki to mirror" "$1" +wiki="$res" #Select Language langs=$(echo "$urls" | grep "/${res}_.*" -o | grep -o "_.*_" | sed "s|^_||g" | sed "s|_$||g") @@ -39,12 +39,12 @@ textmenu "$langs" "Select which language to mirror" "$2" lang="$res" #Get URL -url="http://download.kiwix.org/zim/${src}_${lang}_all.zim" +url="http://download.kiwix.org/zim/${wiki}_${lang}_all.zim" urlverify=$(echo "$urls" | grep "$url") [ -z "$urlverify" ] && echo "INTERNAL ERROR: $url was not found in list but seems to be valid - Please report!" && exit 2 echo -echo "Source: $src, Language: $lang, Url: $url" +echo "Wiki: $wiki, Language: $lang, Url: $url" [ -z "$*" ] && read -p "Press return to start downloading (this may take a long time)... 
" _foo From 3d6268b1552d2c1f8a996c445ab845b70b36dbfd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maciej=20Kr=C3=BCger?= Date: Tue, 10 Sep 2019 18:19:07 +0200 Subject: [PATCH 03/10] feat: re-work script --- .gitignore | 1 + getzim.sh | 231 +++++++++++++++++++++++++++++++++++++++-------------- 2 files changed, 174 insertions(+), 58 deletions(-) diff --git a/.gitignore b/.gitignore index e43b0f9..e57ba99 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,2 @@ .DS_Store +.cache diff --git a/getzim.sh b/getzim.sh index 2911df7..4093dd9 100644 --- a/getzim.sh +++ b/getzim.sh @@ -1,18 +1,26 @@ #!/bin/bash -if [ ! -e ".content" ]; then - echo "Downloading content list..." - curl -s http://wiki.kiwix.org/wiki/Content_in_all_languages > .content -fi -html=$(cat .content) -urls=$(echo "$html" | grep "Download" | grep "http://download.kiwix.org/zim/.*_all.zim\"" -o | grep "http://download.kiwix.org/zim/.*_all.zim" -o | uniq) #filter all urls +# internal + +BASE=$(readlink -f $(dirname "$0")) +CACHE="$BASE/.cache" + +# conf + +BASEURL="https://download.kiwix.org/zim/" -textmenu() { #allows the user to choose an id or item from a list +# ui funcs + +log() { + echo "$(date +%s): $*" +} + +textmenu() { # allows the user to choose an id or item from a list echo - a=($1) #turn $1 into an array + a=($1) # turn $1 into an array [ -z "${a[1]}" ] && echo "Skipping question \"$2\" because there is only one choice: ${a[0]}" && res="${a[0]}" && return 0 i=0 - for c in $1; do #build a list + for c in $1; do # build a list echo "[$i] $c" i=$(expr $i + 1) done @@ -23,70 +31,177 @@ textmenu() { #allows the user to choose an id or item from a list res=${a[$id]} [ -z "$res" ] && [ ! 
-z "$3" ] && echo "Invalid ID or item: $3" && exit 2 #if id/item was specified via cli exit if invalid [ -z "$id" ] && [ -z "$_id" ] && textmenu "$1" "$2" && return 0 #no input - [ -z "$id" ] && echo "Please enter a number or what you want" && textmenu "$1" "$2" && return 0 + [ -z "$id" ] && echo "Please enter a number or an item name" && textmenu "$1" "$2" && return 0 [ -z "$res" ] && echo "INVALID ID" && textmenu "$1" "$2" && return 0 echo "< $res" #show the choice to the user } -#Select Wiki -wikis=$(echo "$urls" | grep "zim/.*_.*_all.zim" -o | grep "/[a-z]*_" -o | grep "[a-z]*" -o | uniq) -textmenu "$wikis" "Select which wiki to mirror" "$1" -wiki="$res" +# scraper fncs -#Select Language -langs=$(echo "$urls" | grep "/${res}_.*" -o | grep -o "_.*_" | sed "s|^_||g" | sed "s|_$||g") -textmenu "$langs" "Select which language to mirror" "$2" -lang="$res" +fetch_with_cache() { + OUTFILE=${1/"/"//"_"} + OUTFILE="$CACHE/_page$OUTFILE" -#Get URL -url="http://download.kiwix.org/zim/${wiki}_${lang}_all.zim" -urlverify=$(echo "$urls" | grep "$url") -[ -z "$urlverify" ] && echo "INTERNAL ERROR: $url was not found in list but seems to be valid - Please report!" && exit 2 + if [ -e "$OUTFILE" ]; then + cat "$OUTFILE" + else + OUT=$(curl -sL "$BASEURL$1") + echo "$OUT" > "$OUTFILE" + echo "$OUT" + fi +} + +get_urls() { + grep href | sed -r 's|.*href="(.*)".*|\1|g' | sed "s|/||g" +} -echo -echo "Wiki: $wiki, Language: $lang, Url: $url" +# main funcs -[ -z "$*" ] && read -p "Press return to start downloading (this may take a long time)... -" _foo +cmd_cache_update() { + echo "Updating cache..." + + rm -rf "$CACHE" + mkdir -p "$CACHE" + for url in $(fetch_with_cache | get_urls); do + echo "Updating cache for $url..." 
+ fetch_with_cache "$url" > /dev/null + done +} -md5=$(curl -sL $url.md5) #get the md5 -real=$(curl -sLI $url | grep "^Location:" | sed "s|Location: ||g" | grep "[a-zA-Z0-9\/:\._-]*" -o) #all the redirects -dest=$(basename $(echo "$real" | head -n 1)) #the real filename (includes date in filename, different from the one in the wiki) +filter_group() { + # wikipedia_ru_molcell_nopic_2019-05.zim + # base: + sed -r "s|([a-z0-9-]+)_([a-z0-9-]+)_([a-z0-9-]+)_([a-z0-9-]+)_([a-z0-9-]+)\\.zim|\\$1|g" +} + +cmd_choose() { + # Select wiki + # TODO: there is a special case, "other", where multiple wikis are available + wikis=$(fetch_with_cache | get_urls) + textmenu "$wikis" "Select which wiki to mirror (choose 'other' for more)" "$1" + wiki="$res" + + # https://download.kiwix.org/zim/wikipedia/wikipedia_ar_medicine_nopic_2019-08.zim + # TYPE TYPE LANG CAT EDITION DATE + + fetch_with_cache "$wiki" | get_urls | filter_group 1 | uniq + fetch_with_cache "$wiki" | get_urls | filter_group 1 | uniq | wc -l + + reallist=$(fetch_with_cache "$wiki" | get_urls | filter_group 1 | uniq | wc -l) + + if [ "$reallist" != "1" ]; then + wikireals=$(fetch_with_cache "$wiki" | get_urls | filter_group 1 | sort | uniq) + textmenu "$wikireals" "Select which wiki to mirror" "$1" + wikireal="$res" + else + wikireal="$wiki" + fi + + langs=$(fetch_with_cache "$wiki" | get_urls | grep "^${wikireal}_" | filter_group 2 | sort | uniq) + textmenu "$langs" "Select which language to mirror" "$2" + lang="$res" + + cats=$(fetch_with_cache "$wiki" | get_urls | grep "^${wikireal}_${lang}" | filter_group 3 | sort | uniq) + textmenu "$cats" "Select which category to mirror" "$3" + cat="$res" + + editions=$(fetch_with_cache "$wiki" | get_urls | grep "^${wikireal}_${lang}_${cat}" | filter_group 4 | sort | uniq) + textmenu "$editions" "Select which edition to mirror" "$4" + edition="$res" + + dates=$(fetch_with_cache "$wiki" | get_urls | grep "^${wikireal}_${lang}_${cat}_${edition}" | filter_group 5 | sort | uniq) + 
textmenu "$dates" "Select which date to mirror" "$5" + date="$res" + + if [ "$wikireal" != "$wiki" ]; then + wiki="$wiki $wikireal" + fi + + echo "Download command: $0 download $wiki $lang $cat $edition $date" +} + +cmd_download() { + : +} + +if [ -n "$(LC_ALL=C type -t cmd_$1)" ] && [ "$(LC_ALL=C type -t cmd_$1)" = function ]; then + CMD="$1" + shift + "cmd_$CMD" "$@" + exit 0 +else + echo "Usage: $0 cache_update" + echo " $0 choose" + echo " $0 download []" + exit 2 +fi + + + + +old() { + #urls=$(echo "$html" | grep "Download" | grep "https://download.kiwix.org/zim/.*_all.zim\"" -o | grep "https://download.kiwix.org/zim/.*_all.zim" -o | uniq) #filter all urls + + #Select Wiki + # wikis=$(echo "$urls" | grep "zim/.*_.*_all.zim" -o | grep "/[a-z]*_" -o | grep "[a-z]*" -o | uniq) + + fetch_with_cache "$wiki" + + #Select Language + langs=$(echo "$urls" | grep "/${res}_.*" -o | grep -o "_.*_" | sed "s|^_||g" | sed "s|_$||g") + textmenu "$langs" "Select which language to mirror" "$2" + lang="$res" + + #Get URL + url="https://download.kiwix.org/zim/${wiki}_${lang}_all.zim" + urlverify=$(echo "$urls" | grep "$url") + [ -z "$urlverify" ] && echo "INTERNAL ERROR: $url was not found in list but seems to be valid - Please report!" && exit 2 -md5check() { echo - echo "Verify md5 checksum..." - md5sum -c > /dev/null 2> /dev/null << q -$md5 -q - e=$? - if [ $e -ne 0 ]; then - echo "md5sum FAILED!" - if [ -z "$1" ]; then - echo "Trying to continue the download..." - echo - wget --continue "$url" -O $dest - echo - md5check "r" + echo "Wiki: $wiki, Language: $lang, Url: $url" + + [ -z "$*" ] && read -p "Press return to start downloading (this may take a long time)... 
+ " _foo + + md5=$(curl -sL $url.md5) #get the md5 + real=$(curl -sLI $url | grep "^Location:" | sed "s|Location: ||g" | grep "[a-zA-Z0-9\/:\._-]*" -o) #all the redirects + dest=$(basename $(echo "$real" | head -n 1)) #the real filename (includes date in filename, different from the one in the wiki) + + md5check() { + echo + echo "Verify md5 checksum..." + md5sum -c > /dev/null 2> /dev/null << q + $md5 + q + e=$? + if [ $e -ne 0 ]; then + echo "md5sum FAILED!" + if [ -z "$1" ]; then + echo "Trying to continue the download..." + echo + wget --continue "$url" -O $dest + echo + md5check "r" + else + echo + echo "It seems like your file is corrupted" + echo "Please remove it:" #we won't do that because the user might not want this + echo " $ rm $dest" + exit 2 + fi else echo - echo "It seems like your file is corrupted" - echo "Please remove it:" #we won't do that because the user might not want this - echo " $ rm $dest" - exit 2 + echo "Success! File saved as $dest" fi + } + + if [ -e "$dest" ]; then + echo "Skipping download, file already exists..." + md5check else echo - echo "Success! File saved as $dest" + wget $url -O $dest + echo + md5check fi } - -if [ -e "$dest" ]; then - echo "Skipping download, file already exists..." - md5check -else - echo - wget $url -O $dest - echo - md5check -fi From 60b0915e69f34ce6468d2e2f5c47c2eac2f998e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maciej=20Kr=C3=BCger?= Date: Tue, 10 Sep 2019 18:46:56 +0200 Subject: [PATCH 04/10] feat: finish re-working choose command --- getzim.sh | 116 +++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 93 insertions(+), 23 deletions(-) diff --git a/getzim.sh b/getzim.sh index 4093dd9..3255c2a 100644 --- a/getzim.sh +++ b/getzim.sh @@ -52,7 +52,7 @@ fetch_with_cache() { } get_urls() { - grep href | sed -r 's|.*href="(.*)".*|\1|g' | sed "s|/||g" + grep href | grep -v "
" | sed -r 's|.*href="(.*)".*|\1|g' | sed "s|/||g"
 }
 
 # main funcs
@@ -68,48 +68,117 @@ cmd_cache_update() {
   done
 }
 
-filter_group() {
-  # wikipedia_ru_molcell_nopic_2019-05.zim
-  # base:
-  sed -r "s|([a-z0-9-]+)_([a-z0-9-]+)_([a-z0-9-]+)_([a-z0-9-]+)_([a-z0-9-]+)\\.zim|\\$1|g"
+urlp() {
+  # usage: get var
+  # usage: filter type lang edition tags... date/"any"
+
+  case "$1" in
+    get)
+      get_var="$2"
+      ;;
+    filter)
+      filter_type="$1"
+      shift
+      filter_lang="$1"
+      shift
+      filter_edition="$1"
+      shift
+
+      filter_tags=()
+      while [ ! -z "$2" ]; do
+        filter_tags+=("$1")
+        shift
+      done
+      filter_tags="${filter_tags[*]}"
+
+      if [ "$1" != "any" ]; then
+        filter_date="$1"
+      fi
+      shift
+      ;;
+  esac
+
+  while read url; do
+    type=""
+    lang=""
+    edition=""
+
+    tags=()
+    date=""
+
+    for group in $(echo "$url" | sed "s|.zim||g" | tr "_" "\n"); do
+      if [ -z "$type" ]; then
+        type="$group"
+      elif [ -z "$lang" ]; then
+        lang="$group"
+      elif [ -z "$edition" ]; then
+        edition="$group"
+      elif [[ "$group" == "20"* ]]; then
+        date="$group"
+      else
+        tags+=("$group")
+      fi
+    done
+
+    tags="${tags[*]}"
+
+    if [ ! -z "$get_var" ]; then
+      echo "${!get_var}"
+    else
+      if [ -z "$filter_type" ] || [[ "$filter_type" == "$type" ]]; then
+        if [ -z "$filter_lang" ] || [[ "$filter_lang" == "$lang" ]]; then
+          if [ -z "$filter_edition" ] || [[ "$filter_edition" == "$edition" ]]; then
+            if [ -z "$filter_tags" ] || [[ "$filter_tags" == "$tags" ]]; then
+              if [ -z "$filter_date" ] || [[ "$filter_date" == "$date" ]]; then
+                echo "$url"
+              fi
+            fi
+          fi
+        fi
+      fi
+    fi
+
+    # echo "type=$type, lang=$lang, edition=$edition, date=$date, tags=${tags[*]}"
+  done
 }
 
 cmd_choose() {
   # Select wiki
-  # TODO: there is a special case, "other", where multiple wikis are available
+  log "Getting wiki list..."
   wikis=$(fetch_with_cache | get_urls)
   textmenu "$wikis" "Select which wiki to mirror (choose 'other' for more)" "$1"
   wiki="$res"
 
-  # https://download.kiwix.org/zim/wikipedia/wikipedia_ar_medicine_nopic_2019-08.zim
-  #                               TYPE      TYPE      LANG CAT     EDITION DATE
-
-  fetch_with_cache "$wiki" | get_urls | filter_group 1 | uniq
-  fetch_with_cache "$wiki" | get_urls | filter_group 1 | uniq | wc -l
-
-  reallist=$(fetch_with_cache "$wiki" | get_urls | filter_group 1 | uniq | wc -l)
+  log "Getting sub-wiki list..."
+  # there is a special case, "other", where multiple wikis are available
+  reallist=$(fetch_with_cache "$wiki" | get_urls | urlp get type | uniq | wc -l)
 
   if [ "$reallist" != "1" ]; then
-    wikireals=$(fetch_with_cache "$wiki" | get_urls | filter_group 1 | sort | uniq)
+    wikireals=$(fetch_with_cache "$wiki" | get_urls | urlp get type | sort | uniq)
     textmenu "$wikireals" "Select which wiki to mirror" "$1"
     wikireal="$res"
   else
     wikireal="$wiki"
   fi
 
-  langs=$(fetch_with_cache "$wiki" | get_urls | grep "^${wikireal}_" | filter_group 2 | sort | uniq)
+  log "Getting language list..."
+  langs=$(fetch_with_cache "$wiki" | get_urls | grep "^${wikireal}_" | urlp get lang | sort | uniq)
   textmenu "$langs" "Select which language to mirror" "$2"
   lang="$res"
 
-  cats=$(fetch_with_cache "$wiki" | get_urls | grep "^${wikireal}_${lang}" | filter_group 3 | sort | uniq)
-  textmenu "$cats" "Select which category to mirror" "$3"
-  cat="$res"
-
-  editions=$(fetch_with_cache "$wiki" | get_urls | grep "^${wikireal}_${lang}_${cat}" | filter_group 4 | sort | uniq)
-  textmenu "$editions" "Select which edition to mirror" "$4"
+  log "Getting edition list..."
+  editions=$(fetch_with_cache "$wiki" | get_urls | grep "^${wikireal}_${lang}" | urlp get edition | sort | uniq)
+  textmenu "$editions" "Select which edition to mirror" "$3"
   edition="$res"
 
-  dates=$(fetch_with_cache "$wiki" | get_urls | grep "^${wikireal}_${lang}_${cat}_${edition}" | filter_group 5 | sort | uniq)
+  log "Getting tag list.."
+  tags=$(fetch_with_cache "$wiki" | get_urls | grep "^${wikireal}_${lang}_${edition}" | urlp get tags | sort | uniq)
+  textmenu "$tags" "Select which tags to use" "$4"
+  tag=$(echo "$res" | sed "s| |_|g")
+
+  log "Getting date list..."
+  dates=$(fetch_with_cache "$wiki" | get_urls | grep "^${wikireal}_${lang}_${edition}_${tag}" | urlp get date | sort | uniq)
+  dates="any $dates"
   textmenu "$dates" "Select which date to mirror" "$5"
   date="$res"
 
@@ -117,7 +186,8 @@ cmd_choose() {
     wiki="$wiki $wikireal"
   fi
 
-  echo "Download command: $0 download $wiki $lang $cat $edition $date"
+  echo "Download command:"
+  echo "  \$ $0 download $wiki $lang $edition $tag $date"
 }
 
 cmd_download() {

From 3035649a9e4ba4e5486df776cdd2dbb09c40adf6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Maciej=20Kr=C3=BCger?= 
Date: Tue, 10 Sep 2019 19:16:14 +0200
Subject: [PATCH 05/10] fix: downloading

---
 .gitignore |   1 +
 getzim.sh  | 177 ++++++++++++++++++++++++++++++-----------------------
 2 files changed, 100 insertions(+), 78 deletions(-)

diff --git a/.gitignore b/.gitignore
index e57ba99..665bd40 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,3 @@
 .DS_Store
 .cache
+*.zim
diff --git a/getzim.sh b/getzim.sh
index 3255c2a..1297d7b 100644
--- a/getzim.sh
+++ b/getzim.sh
@@ -77,6 +77,7 @@ urlp() {
       get_var="$2"
       ;;
     filter)
+      shift
       filter_type="$1"
       shift
       filter_lang="$1"
@@ -91,6 +92,10 @@ urlp() {
       done
       filter_tags="${filter_tags[*]}"
 
+      if [ -z "$filter_tags" ]; then
+        filter_tags="notag"
+      fi
+
       if [ "$1" != "any" ]; then
         filter_date="$1"
       fi
@@ -122,6 +127,10 @@ urlp() {
 
     tags="${tags[*]}"
 
+    if [ -z "$tags" ]; then
+      tags="notag"
+    fi
+
     if [ ! -z "$get_var" ]; then
       echo "${!get_var}"
     else
@@ -171,107 +180,119 @@ cmd_choose() {
   textmenu "$editions" "Select which edition to mirror" "$3"
   edition="$res"
 
-  log "Getting tag list.."
-  tags=$(fetch_with_cache "$wiki" | get_urls | grep "^${wikireal}_${lang}_${edition}" | urlp get tags | sort | uniq)
+  log "Getting tag list..."
+  tags=$(fetch_with_cache "$wiki" | get_urls | grep "^${wikireal}_${lang}_${edition}" | urlp get tags | sed "s| |_|g" | sort | uniq)
   textmenu "$tags" "Select which tags to use" "$4"
-  tag=$(echo "$res" | sed "s| |_|g")
+  tag="$res"
+
+  if [ "$tag" != "notag" ]; then
+    tagu="_$tag"
+  fi
 
   log "Getting date list..."
-  dates=$(fetch_with_cache "$wiki" | get_urls | grep "^${wikireal}_${lang}_${edition}_${tag}" | urlp get date | sort | uniq)
+  dates=$(fetch_with_cache "$wiki" | get_urls | grep "^${wikireal}_${lang}_${edition}${tagu}" | urlp get date | sort | uniq)
   dates="any $dates"
   textmenu "$dates" "Select which date to mirror" "$5"
   date="$res"
 
-  if [ "$wikireal" != "$wiki" ]; then
-    wiki="$wiki $wikireal"
-  fi
+  echo
+  echo "  Download command:"
+  echo "    \$ $0 download $wiki $wikireal $lang $edition $tag $date"
+  echo
 
-  echo "Download command:"
-  echo "  \$ $0 download $wiki $lang $edition $tag $date"
+  while true; do
+    read -p "Download [y/N]: " doit
+    case "$doit" in
+      y)
+        cmd_download "$wiki" "$wikireal" "$lang" "$edition" "$tag" "$date"
+        exit $?
+        ;;
+      n)
+        exit 0
+        ;;
+    esac
+  done
 }
 
-cmd_download() {
-  :
-}
+cmd_download_url() {
+  wiki="$1"
+  wikireal="$2"
+  lang="$3"
+  edition="$4"
+  tag="$5"
+  date="$6"
 
-if [ -n "$(LC_ALL=C type -t cmd_$1)" ] && [ "$(LC_ALL=C type -t cmd_$1)" = function ]; then
-  CMD="$1"
-  shift
-  "cmd_$CMD" "$@"
-  exit 0
-else
-  echo "Usage: $0 cache_update"
-  echo "       $0 choose"
-  echo "       $0 download     []"
-  exit 2
-fi
+  tag=$(echo "$tag" | sed "s|_| |g")
+  tag=($tag)
 
+  log "Getting download URL..."
+  URL=$(fetch_with_cache "$1" | get_urls | urlp filter "$wikireal" "$lang" "$edition" "${tag[@]}" "$date" | sort -s -r | head -n 1)
 
+  if [ -z "$URL" ]; then
+    echo "ERROR: Download URL not found. Possibly removed?" >&2
+    exit 2
+  fi
 
+  URL="$BASEURL$wiki/$URL"
 
-old() {
-  #urls=$(echo "$html" | grep "Download" | grep "https://download.kiwix.org/zim/.*_all.zim\"" -o | grep "https://download.kiwix.org/zim/.*_all.zim" -o | uniq) #filter all urls
+  log "URL: $URL"
 
-  #Select Wiki
-  # wikis=$(echo "$urls" | grep "zim/.*_.*_all.zim" -o | grep "/[a-z]*_" -o | grep "[a-z]*" -o | uniq)
+  # below is a mixture of https://stackoverflow.com/a/19841872/3990041, my knowledge and guesswork :P
+  SHA256=$(curl -sI "$URL" | grep digest | grep "SHA-256" | sed "s|digest: SHA-256=||g" | base64 -i -w 0 -d | od -t x8 -An | tr "\n" " " | sed "s| ||g")
 
-  fetch_with_cache "$wiki"
+  log "SHA256: $SHA256"
+}
 
-  #Select Language
-  langs=$(echo "$urls" | grep "/${res}_.*" -o | grep -o "_.*_" | sed "s|^_||g" | sed "s|_$||g")
-  textmenu "$langs" "Select which language to mirror" "$2"
-  lang="$res"
+cmd_download() {
+  cmd_download_url "$@"
 
-  #Get URL
-  url="https://download.kiwix.org/zim/${wiki}_${lang}_all.zim"
-  urlverify=$(echo "$urls" | grep "$url")
-  [ -z "$urlverify" ] && echo "INTERNAL ERROR: $url was not found in list but seems to be valid - Please report!" && exit 2
+  # real=$(curl -sLI $url | grep "^Location:"  | sed "s|Location: ||g" | grep "[a-zA-Z0-9\/:\._-]*" -o) #all the redirects
+  OUTNAME=$(basename "$URL")
 
-  echo
-  echo "Wiki: $wiki, Language: $lang, Url: $url"
-
-  [ -z "$*" ] && read -p "Press return to start downloading (this may take a long time)...
-  " _foo
-
-  md5=$(curl -sL $url.md5) #get the md5
-  real=$(curl -sLI $url | grep "^Location:"  | sed "s|Location: ||g" | grep "[a-zA-Z0-9\/:\._-]*" -o) #all the redirects
-  dest=$(basename $(echo "$real" | head -n 1)) #the real filename (includes date in filename, different from the one in the wiki)
-
-  md5check() {
-    echo
-    echo "Verify md5 checksum..."
-    md5sum -c > /dev/null 2> /dev/null << q
-  $md5
-  q
-    e=$?
-    if [ $e -ne 0 ]; then
-      echo "md5sum FAILED!"
-      if [ -z "$1" ]; then
-        echo "Trying to continue the download..."
-        echo
-        wget --continue "$url" -O $dest
-        echo
-        md5check "r"
-      else
-        echo
+  dl_cycle() {
+    log "Downloading $OUTNAME..."
+    wget --continue "$URL"
+    return $?
+  }
+
+  check_cycle() {
+    log "Verifying $OUTNAME..."
+    sha256="$SHA256  $OUTNAME"
+    echo "$sha256" | sha256sum -c -
+    return $?
+  }
+
+  if [ -e "$OUTNAME" ]; then
+    if ! check_cycle; then
+      if ! dl_cycle; then
+        echo "Download failed! Check your network!"
+      fi
+      if ! check_cycle; then
         echo "It seems like your file is corrupted"
-        echo "Please remove it:" #we won't do that because the user might not want this
-        echo " $ rm $dest"
-        exit 2
+        echo "Please remove it:" # we won't do that because the user might not want this
+        echo " \$ rm $OUTNAME"
       fi
-    else
-      echo
-      echo "Success! File saved as $dest"
     fi
-  }
-
-  if [ -e "$dest" ]; then
-    echo "Skipping download, file already exists..."
-    md5check
   else
-    echo
-    wget $url -O $dest
-    echo
-    md5check
+    if ! dl_cycle; then
+      echo "Download failed! Check your network!"
+    fi
+    if ! check_cycle; then
+      echo "It seems like your file is corrupted"
+      echo "Please remove it:" # we won't do that because the user might not want this
+      echo " \$ rm $OUTNAME"
+    fi
   fi
 }
+
+if [ -n "$(LC_ALL=C type -t cmd_$1)" ] && [ "$(LC_ALL=C type -t cmd_$1)" = function ]; then
+  CMD="$1"
+  shift
+  "cmd_$CMD" "$@"
+  exit 0
+else
+  echo "Usage: $0 cache_update"
+  echo "       $0 choose"
+  echo "       $0 download      "
+  exit 2
+fi

From 9eed20d64e751f648464e110dd2216cb1574db6b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Maciej=20Kr=C3=BCger?= 
Date: Tue, 10 Sep 2019 19:30:45 +0200
Subject: [PATCH 06/10] feat: update README

---
 README.md | 33 ++++++++++++++++++++-------------
 1 file changed, 20 insertions(+), 13 deletions(-)

diff --git a/README.md b/README.md
index d976dc0..4f64273 100644
--- a/README.md
+++ b/README.md
@@ -42,8 +42,27 @@ If you would like to create an updated Wikipedia snapshot on IPFS, you can follo
 
 **Note: This is a work in progress.**. We intend to make it easy for anyone to create their own wikipedia snapshots and add them to IPFS, but our first emphasis has been to get the initial snapshots onto the network. This means some of the steps aren't as easy as we want them to be. If you run into trouble, seek help through a github issue, commenting in the #ipfs channel in IRC, or by posting a thread on https://discuss.ipfs.io.
 
+### Step 0: Clone this repository
+All commands are assumed to be run inside a cloned version of this repository
+
+Clone the distributed-wikipedia-mirror git repository
+
+```sh
+$ git clone git@github.com:ipfs/distributed-wikipedia-mirror.git
+```
+
+then `cd` into that directory
+
+```sh
+$ cd distributed-wikipedia-mirror
+```
+
 ### Step 1: Download the latest snapshot from kiwix.org
-Download the latest snapshot of Wikipedia (in ZIM format) from http://wiki.kiwix.org/wiki/Content_in_all_languages
+For that you can use the getzim.sh script
+
+First, download the latest wiki lists using `bash getzim.sh cache_update`
+
+After that create a download command using `bash getzim.sh choose`
 
 ### Step 2: Unpack the ZIM snapshot
 Unpack the ZIM snapshot using https://github.com/dignifiedquire/zim/commit/a283151105ab4c1905d7f5cb56fb8eb2a854ad67
@@ -68,18 +87,6 @@ Save the last hash of the output from that process. You will use that in the nex
 
 We have provided a script that adds the necessary information. It also adds a decentralized, serverless search utility to the page.
 
-Clone the distributed-wikipedia-mirror git repository
-
-```sh
-$ git clone git@github.com:ipfs/distributed-wikipedia-mirror.git
-```
-
-then `cd` into that directory
-
-```sh
-$ cd distributed-wikipedia-mirror
-```
-
 Write a copy of the snapshot from IPFS to `/root` on your machine
 
 ```sh

From 9a456fd987673488a2d1f948ee36663b291de31e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Maciej=20Kr=C3=BCger?= 
Date: Tue, 10 Sep 2019 19:41:44 +0200
Subject: [PATCH 07/10] fix: hash decoding

---
 getzim.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/getzim.sh b/getzim.sh
index 1297d7b..3d428c6 100644
--- a/getzim.sh
+++ b/getzim.sh
@@ -238,7 +238,7 @@ cmd_download_url() {
   log "URL: $URL"
 
   # below is a mixture of https://stackoverflow.com/a/19841872/3990041, my knowledge and guesswork :P
-  SHA256=$(curl -sI "$URL" | grep digest | grep "SHA-256" | sed "s|digest: SHA-256=||g" | base64 -i -w 0 -d | od -t x8 -An | tr "\n" " " | sed "s| ||g")
+  SHA256=$(curl -sI "$URL" | grep digest | grep "SHA-256" | sed "s|digest: SHA-256=||g" | base64 -i -w 0 -d | od -t x1 -An | tr "\n" " " | sed "s| ||g")
 
   log "SHA256: $SHA256"
 }

From 988c62b93d5f99f5c198cbe607421305ee36528e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Maciej=20Kr=C3=BCger?= 
Date: Tue, 8 Oct 2019 20:23:06 +0200
Subject: [PATCH 08/10] feat: apply changes

---
 getzim.sh | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/getzim.sh b/getzim.sh
index 3d428c6..808f2ef 100644
--- a/getzim.sh
+++ b/getzim.sh
@@ -70,7 +70,7 @@ cmd_cache_update() {
 
 urlp() {
   # usage: get var
-  # usage: filter type lang edition tags... date/"any"
+  # usage: filter type lang edition tags... date/"latest"
 
   case "$1" in
     get)
@@ -96,7 +96,7 @@ urlp() {
         filter_tags="notag"
       fi
 
-      if [ "$1" != "any" ]; then
+      if [ "$1" != "latest" ]; then
         filter_date="$1"
       fi
       shift
@@ -191,7 +191,7 @@ cmd_choose() {
 
   log "Getting date list..."
   dates=$(fetch_with_cache "$wiki" | get_urls | grep "^${wikireal}_${lang}_${edition}${tagu}" | urlp get date | sort | uniq)
-  dates="any $dates"
+  dates="latest $dates"
   textmenu "$dates" "Select which date to mirror" "$5"
   date="$res"
 
@@ -243,6 +243,10 @@ cmd_download_url() {
   log "SHA256: $SHA256"
 }
 
+cmd_url() {
+  cmd_download_url "$@"
+}
+
 cmd_download() {
   cmd_download_url "$@"
 
@@ -293,6 +297,6 @@ if [ -n "$(LC_ALL=C type -t cmd_$1)" ] && [ "$(LC_ALL=C type -t cmd_$1)" = funct
 else
   echo "Usage: $0 cache_update"
   echo "       $0 choose"
-  echo "       $0 download      "
+  echo "       $0 download/url      "
   exit 2
 fi

From 954cbe531907a0f99b24920fd4bcc13da7c2282e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Maciej=20Kr=C3=BCger?= 
Date: Tue, 8 Oct 2019 20:26:13 +0200
Subject: [PATCH 09/10] feat: make url command return json

---
 getzim.sh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/getzim.sh b/getzim.sh
index 808f2ef..0a4ec72 100644
--- a/getzim.sh
+++ b/getzim.sh
@@ -244,7 +244,8 @@ cmd_download_url() {
 }
 
 cmd_url() {
-  cmd_download_url "$@"
+  cmd_download_url "$@" >&2
+  echo '{"url":"'"$URL"'","sha256":"'"$SHA256"'"}'
 }
 
 cmd_download() {

From a9dd6cb8d389631bd5712160394bb56a53db1942 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Maciej=20Kr=C3=BCger?= 
Date: Wed, 9 Oct 2019 13:53:44 +0200
Subject: [PATCH 10/10] Update getzim.sh

Co-Authored-By: Marcin Rataj 
---
 getzim.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/getzim.sh b/getzim.sh
index 0a4ec72..b494b1d 100644
--- a/getzim.sh
+++ b/getzim.sh
@@ -46,6 +46,7 @@ fetch_with_cache() {
     cat "$OUTFILE"
   else
     OUT=$(curl -sL "$BASEURL$1")
+    mkdir -p "$CACHE"
     echo "$OUT" > "$OUTFILE"
     echo "$OUT"
   fi