Skip to content

Commit

Permalink
update comments and refactor unicode conversion
Browse files Browse the repository at this point in the history
  • Loading branch information
jarelllama authored Feb 2, 2025
1 parent a54b688 commit 14ff31c
Show file tree
Hide file tree
Showing 5 changed files with 47 additions and 14 deletions.
9 changes: 6 additions & 3 deletions scripts/retrieve_domains.sh
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,8 @@ main() {
mkdir -p data/pending
fi

# Install idn2 (requires sudo. -qq does not work here)
# Install idn2 here instead of in $FUNCTION to not bias source processing
# time.
command -v idn2 > /dev/null || sudo apt-get install idn2 > /dev/null

# Download toplist
Expand Down Expand Up @@ -223,8 +224,7 @@ process_source_results() {
sed -i 's/https\?:\/\///; s/\[//; s/\]//' "$source_results"

# Convert Unicode to Punycode
idn2 < "$source_results" > temp || exit 1
mv temp "$source_results"
$FUNCTION --convert-unicode "$source_results"

sort -u "$source_results" -o "$source_results"

Expand Down Expand Up @@ -441,13 +441,16 @@ ${parked_count},${in_toplist_count},${query_count},${status}" >> "$SOURCE_LOG"
}

# Forward domain processing events to the shell wrapper so they are recorded
# in the domain log. The source is taken from the caller's $source_name.
# Input:
#   $1: domains to log either in a file or variable
#   $2: event type (dead, whitelisted, etc.)
log_domains() {
    local domains="$1"
    local event="$2"

    # $FUNCTION is intentionally left unquoted, as in the rest of the script,
    # so that it can expand to a command plus its arguments.
    $FUNCTION --log-domains "$domains" "$event" "$source_name"
}

# Print error message and exit.
# Input:
# $1: error message to print
error() {
printf "%s\n" "$1" >&2
exit 1
Expand Down
9 changes: 9 additions & 0 deletions scripts/test_functions.sh
Original file line number Diff line number Diff line change
Expand Up @@ -294,8 +294,12 @@ test_url_conversion() {
# Test that Unicode entries are converted to Punycode in both the raw file
# and the raw light file.
test_punycode_conversion() {
    local raw_file

    input 'ⴰⵣⵓⵍ.bortzmeyer.fr'
    # An entry that may cause idn2 to error must pass through unchanged
    input 12--5.com

    # The same results are expected in each raw file
    for raw_file in "$RAW" "$RAW_LIGHT"; do
        output xn--4lj0cra7d.bortzmeyer.fr "$raw_file"
        output 12--5.com "$raw_file"
    done
}

# Test removal of known dead domains including subdomains
Expand Down Expand Up @@ -393,6 +397,7 @@ test_invalid_removal() {
input invalid-test.100
input invalid-test.1x
input invalid-test.com/subfolder
input invalid-test-.com
input i.com
# Test that invalid subdomains/root domains are not added into the
# subdomains/root domains files
Expand All @@ -410,6 +415,7 @@ test_invalid_removal() {
output invalid,invalid-test.100 "$DOMAIN_LOG"
output invalid,invalid-test.1x "$DOMAIN_LOG"
output invalid,invalid-test.com/subfolder "$DOMAIN_LOG"
output invalid,invalid-test-.com "$DOMAIN_LOG"
output invalid,i.com "$DOMAIN_LOG"

# The validate script does not add invalid entries to the review config
Expand All @@ -421,6 +427,7 @@ test_invalid_removal() {
output invalid-test.100,invalid "$REVIEW_CONFIG"
output invalid-test.1x,invalid "$REVIEW_CONFIG"
output invalid-test.com/subfolder,invalid "$REVIEW_CONFIG"
output invalid-test-.com,invalid "$REVIEW_CONFIG"
output i.com,invalid "$REVIEW_CONFIG"
}

Expand Down Expand Up @@ -633,6 +640,8 @@ output() {
}

# Print error message and exit.
# Input:
# $1: error message to print
error() {
printf "%s\n" "$1" >&2
exit 1
Expand Down
33 changes: 28 additions & 5 deletions scripts/tools.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
# tools.sh is a shell wrapper that stores commonly used functions.

# Function 'format_file' standardizes the format of the given file.
# Input:
# $1: file to be formatted
format_file() {
local file="$1"
Expand Down Expand Up @@ -46,7 +47,24 @@ format_all() {
done
}

# Function 'convert_unicode' converts Unicode to Punycode.
# Entries that may cause idn2 to error (a label ending with a hyphen, or '--'
# preceded by at least two characters, see
# https://www.rfc-editor.org/rfc/rfc5891#section-4.2.3.1) are kept as they
# are. The file is only replaced once every step has succeeded, so a failed
# conversion leaves it untouched.
# Input:
#   $1: file to process
# Output:
#   $1 rewritten in place, sorted and deduplicated
#   exit status 1 if the file cannot be read or idn2 errors
convert_unicode() {
    local file="$1"
    local -r unconvertible='\-\.|..--'
    local workdir
    local status=0

    # Install idn2 (requires sudo. -qq does not work here)
    command -v idn2 > /dev/null || sudo apt-get install idn2 > /dev/null

    # Work on temporary copies: an 'exit' placed inside a pipeline only ends
    # that pipeline's subshell, which would let a failed conversion overwrite
    # the file with partial results and still report success.
    workdir="$(mktemp -d)" || exit 1

    # Split the entries. grep exits with 1 when it selects no lines, which is
    # not an error here; only a status above 1 is.
    grep -vE -- "$unconvertible" "$file" > "${workdir}/convertible" \
        || status="$?"
    if [ "$status" -le 1 ]; then
        status=0
        grep -E -- "$unconvertible" "$file" > "${workdir}/kept" \
            || status="$?"
    fi

    # Convert only the entries idn2 is expected to accept
    if [ "$status" -le 1 ]; then
        status=0
        idn2 < "${workdir}/convertible" > "${workdir}/converted" \
            || status="$?"
    fi

    # Merge the converted and kept entries back into the file
    if [ "$status" -eq 0 ]; then
        sort -u -o "$file" "${workdir}/converted" "${workdir}/kept" \
            || status="$?"
    fi

    rm -rf -- "$workdir"
    [ "$status" -eq 0 ] || exit 1
}

# Function 'log_domains' logs domain processing events into the domain log.
# Input:
# $1: domains to log either in a file or variable
# $2: event type (dead, whitelisted, etc.)
# $3: source
Expand All @@ -73,6 +91,7 @@ log_domains() {

# Function 'prune_lines' prunes lines in the given file to keep its number of
# lines within the given threshold.
# Input:
# $1: file to be pruned
# $2: maximum number of lines to keep
prune_lines() {
Expand Down Expand Up @@ -131,6 +150,7 @@ download_nrd_feed() {

# Function 'send_telegram' sends a Telegram notification with the given
# message.
# Input:
# $TELEGRAM_CHAT_ID: Telegram user Chat ID
# $TELEGRAM_BOT_TOKEN: Telegram Bot Token
# $1: message body
Expand All @@ -148,16 +168,19 @@ set -e

case "$1" in
--format)
format_file "$2"
format_file "$*"
;;
--format-all)
format_all
;;
--convert-unicode)
convert_unicode "$*"
;;
--log-domains)
log_domains "$2" "$3" "$4"
log_domains "$*"
;;
--prune-lines)
prune_lines "$2" "$3"
prune_lines "$*"
;;
--download-toplist)
download_toplist
Expand All @@ -166,10 +189,10 @@ case "$1" in
download_nrd_feed
;;
--send-telegram)
send_telegram "$2"
send_telegram "$*"
;;
*)
printf "\n\e[1;31mInvalid argument: %s\e[0m\n\n" "$1" >&2
printf "\n\e[1;31mInvalid argument: %s\e[0m\n\n" "$*" >&2
exit 1
;;
esac
3 changes: 3 additions & 0 deletions scripts/update_readme.sh
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,7 @@ readonly TODAY THIS_MONTH

# Function 'print_stats' is an echo wrapper that returns the formatted
# statistics for the given source.
# Input:
# $1: source to process (default is all sources)
print_stats() {
local this_month total_this_month
Expand All @@ -188,6 +189,7 @@ print_stats() {

# Function 'sum' is an echo wrapper that returns the total sum of filtered
# domains retrieved by the given source for that timeframe.
# Input:
# $1: timeframe to process
# $2: source to process (default is all sources)
sum() {
Expand All @@ -201,6 +203,7 @@ sum() {

# Function 'sum_excluded' is an echo wrapper that returns the percentage of
# excluded domains out of the raw count retrieved by the given source.
# Input:
# $1: source to process (default is all sources)
sum_excluded() {
local raw_count white_count dead_count parked_count excluded_count
Expand Down
7 changes: 1 addition & 6 deletions scripts/validate_domains.sh
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,6 @@ readonly SUBDOMAINS_TO_REMOVE='config/subdomains.txt'
readonly DOMAIN_REGEX='[[:alnum:]][[:alnum:].-]*[[:alnum:]]\.[[:alnum:]-]*[a-z]{2,}[[:alnum:]-]*'

main() {
# Install idn2 (requires sudo. -qq does not work here)
command -v idn2 > /dev/null || sudo apt-get install idn2 > /dev/null

# Download toplist
$FUNCTION --download-toplist

Expand Down Expand Up @@ -83,9 +80,7 @@ validate() {
# Convert Unicode to Punycode in raw file and raw light file
local file
for file in "$RAW" "$RAW_LIGHT"; do
idn2 < "$file" > temp || exit 1
mv temp "$file"
sort -u "$file" -o "$file"
$FUNCTION --convert-unicode "$file"
done

# Strip away subdomains
Expand Down

0 comments on commit 14ff31c

Please sign in to comment.