diff --git a/scripts/retrieve_domains.sh b/scripts/retrieve_domains.sh index d24731ee3..3ce44cdb8 100644 --- a/scripts/retrieve_domains.sh +++ b/scripts/retrieve_domains.sh @@ -62,7 +62,8 @@ main() { mkdir -p data/pending fi - # Install idn2 (requires sudo. -qq doesn not work here) + # Install idn2 here instead of in $FUNCTION to not bias source processing + # time. command -v idn2 > /dev/null || sudo apt-get install idn2 > /dev/null # Download toplist @@ -223,8 +224,7 @@ process_source_results() { sed -i 's/https\?:\/\///; s/\[//; s/\]//' "$source_results" # Convert Unicode to Punycode - idn2 < "$source_results" > temp || exit 1 - mv temp "$source_results" + $FUNCTION --convert-unicode "$source_results" sort -u "$source_results" -o "$source_results" @@ -441,6 +441,7 @@ ${parked_count},${in_toplist_count},${query_count},${status}" >> "$SOURCE_LOG" } # Call a shell wrapper to to log domain processing events into the domain log. +# Input: # $1: domains to log either in a file or variable # $2: event type (dead, whitelisted, etc.) log_domains() { @@ -448,6 +449,8 @@ log_domains() { } # Print error message and exit. 
+# Input: +# $1: error message to print error() { printf "%s\n" "$1" >&2 exit 1 diff --git a/scripts/test_functions.sh b/scripts/test_functions.sh index 987473a14..0b423f890 100644 --- a/scripts/test_functions.sh +++ b/scripts/test_functions.sh @@ -294,8 +294,12 @@ test_url_conversion() { # Test conversion of Unicode to Punycode test_punycode_conversion() { input 'ⴰⵣⵓⵍ.bortzmeyer.fr' + # Test that entries that may cause idn2 to error are handled properly + input 12--5.com output xn--4lj0cra7d.bortzmeyer.fr "$RAW" + output 12--5.com "$RAW" output xn--4lj0cra7d.bortzmeyer.fr "$RAW_LIGHT" + output 12--5.com "$RAW_LIGHT" } # Test removal of known dead domains including subdomains @@ -393,6 +397,7 @@ test_invalid_removal() { input invalid-test.100 input invalid-test.1x input invalid-test.com/subfolder + input invalid-test-.com input i.com # Test that invalid subdomains/root domains are not added into the # subdomains/root domains files @@ -410,6 +415,7 @@ test_invalid_removal() { output invalid,invalid-test.100 "$DOMAIN_LOG" output invalid,invalid-test.1x "$DOMAIN_LOG" output invalid,invalid-test.com/subfolder "$DOMAIN_LOG" + output invalid,invalid-test-.com "$DOMAIN_LOG" output invalid,i.com "$DOMAIN_LOG" # The validate script does not add invalid entries to the review config @@ -421,6 +427,7 @@ test_invalid_removal() { output invalid-test.100,invalid "$REVIEW_CONFIG" output invalid-test.1x,invalid "$REVIEW_CONFIG" output invalid-test.com/subfolder,invalid "$REVIEW_CONFIG" + output invalid-test-.com,invalid "$REVIEW_CONFIG" output i.com,invalid "$REVIEW_CONFIG" } @@ -633,6 +640,8 @@ output() { } # Print error message and exit. +# Input: +# $1: error message to print error() { printf "%s\n" "$1" >&2 exit 1 diff --git a/scripts/tools.sh b/scripts/tools.sh index 442938ca4..754f22e28 100644 --- a/scripts/tools.sh +++ b/scripts/tools.sh @@ -3,6 +3,7 @@ # tools.sh is a shell wrapper that stores commonly used functions. 
# Function 'convert_unicode' converts Unicode to Punycode.
# Input:
#   $1: file to process
# Output:
#   The file is rewritten in place, sorted and deduplicated. If any step
#   fails (including idn2), the file is left untouched and the script exits
#   with status 1.
convert_unicode() {
    local file="$1"
    # Entries with a hyphen directly before a dot, or with '--' preceded by at
    # least two characters, may cause idn2 to error
    # (https://www.rfc-editor.org/rfc/rfc5891#section-4.2.3.1). They are kept
    # as they are instead of being converted. The hyphen is left unescaped to
    # avoid the stray-backslash warning of newer grep versions, which is why
    # the pattern is always passed with -e.
    local exempt_regex='-\.|..--'
    local work_dir
    local status=0

    # Install idn2 (requires sudo. -qq does not work here)
    command -v idn2 > /dev/null || sudo apt-get install idn2 > /dev/null

    work_dir="$(mktemp -d)" || exit 1

    # Split the file into entries to convert and entries to keep as they are.
    # grep exits 1 when it selects no lines, which is valid here. Any higher
    # exit status is a real error.
    grep -vE -e "$exempt_regex" "$file" > "${work_dir}/convertible" \
        || [ "$?" -eq 1 ] || status=1
    grep -E -e "$exempt_regex" "$file" > "${work_dir}/exempt" \
        || [ "$?" -eq 1 ] || status=1

    # Convert outside of a pipeline so that the exit status of idn2 is seen
    # by this shell and not lost in a subshell
    if [ "$status" -eq 0 ]; then
        idn2 < "${work_dir}/convertible" > "${work_dir}/converted" || status=1
    fi

    # Only replace the file once every previous step has succeeded
    if [ "$status" -eq 0 ]; then
        sort -u -o "$file" "${work_dir}/converted" "${work_dir}/exempt" \
            || status=1
    fi

    rm -rf -- "$work_dir"

    if [ "$status" -ne 0 ]; then
        printf "convert_unicode: failed to process %s\n" "$file" >&2
        exit 1
    fi
}
# Entry point: dispatch on the flag given in $1 and hand each function only
# the arguments it expects. "$*" must not be used for this, as it joins every
# argument, including the flag itself, into a single word.

case "$1" in
    --format)
        format_file "$2"
        ;;
    --format-all)
        format_all
        ;;
    --convert-unicode)
        convert_unicode "$2"
        ;;
    --log-domains)
        log_domains "$2" "$3" "$4"
        ;;
    --prune-lines)
        prune_lines "$2" "$3"
        ;;
    --download-toplist)
        download_toplist
        ;;
    # NOTE(review): the label below was not visible in the reviewed change and
    # is assumed from the naming of the other flags. Confirm before merging.
    --download-nrd-feed)
        download_nrd_feed
        ;;
    --send-telegram)
        send_telegram "$2"
        ;;
    *)
        # "$*" is fine here: the whole invocation is only printed
        printf "\n\e[1;31mInvalid argument: %s\e[0m\n\n" "$*" >&2
        exit 1
        ;;
esac
+# Input: # $1: source to process (default is all sources) sum_excluded() { local raw_count white_count dead_count parked_count excluded_count diff --git a/scripts/validate_domains.sh b/scripts/validate_domains.sh index df534c285..8390dc209 100644 --- a/scripts/validate_domains.sh +++ b/scripts/validate_domains.sh @@ -15,9 +15,6 @@ readonly SUBDOMAINS_TO_REMOVE='config/subdomains.txt' readonly DOMAIN_REGEX='[[:alnum:]][[:alnum:].-]*[[:alnum:]]\.[[:alnum:]-]*[a-z]{2,}[[:alnum:]-]*' main() { - # Install idn2 (requires sudo. -qq doesn not work here) - command -v idn2 > /dev/null || sudo apt-get install idn2 > /dev/null - # Download toplist $FUNCTION --download-toplist @@ -83,9 +80,7 @@ validate() { # Convert Unicode to Punycode in raw file and raw light file local file for file in "$RAW" "$RAW_LIGHT"; do - idn2 < "$file" > temp || exit 1 - mv temp "$file" - sort -u "$file" -o "$file" + $FUNCTION --convert-unicode "$file" done # Strip away subdomains