From e2c8eab6be5e2c995b360c39001a9319e2322389 Mon Sep 17 00:00:00 2001 From: amos Date: Sun, 12 Jan 2025 15:45:57 +0200 Subject: [PATCH 1/5] check_parked: avoid subshell in loop --- scripts/check_parked.sh | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/scripts/check_parked.sh b/scripts/check_parked.sh index 2746d569c..d6e39afc9 100644 --- a/scripts/check_parked.sh +++ b/scripts/check_parked.sh @@ -150,12 +150,16 @@ find_parked() { local count=1 fi + # Count lines + local lines_cnt + lines_cnt="$(wc -l < "$1")" + # Loop through domains while read -r domain; do if [[ "$track" == true ]]; then if (( count % 100 == 0 )); then printf "[progress] Analyzed %s%% of domains\n" \ - "$(( count * 100 / $(wc -l < "$1") ))" + "$(( count * 100 / lines_cnt ))" fi (( count++ )) From 0877484fceff4512033efe7d73e431f0af9b932e Mon Sep 17 00:00:00 2001 From: amos Date: Sun, 12 Jan 2025 17:13:01 +0200 Subject: [PATCH 2/5] check_parked: strip subdomains with gawk --- scripts/check_parked.sh | 34 ++++++++++++++++++++++++++++++---- 1 file changed, 30 insertions(+), 4 deletions(-) diff --git a/scripts/check_parked.sh b/scripts/check_parked.sh index d6e39afc9..d096c8e56 100644 --- a/scripts/check_parked.sh +++ b/scripts/check_parked.sh @@ -204,10 +204,31 @@ remove_parked() { mv temp "$SUBDOMAINS" # Strip subdomains from parked domains - while read -r subdomain; do - sed -i "s/^${subdomain}\.//" parked.tmp - done < "$SUBDOMAINS_TO_REMOVE" - sort -u parked.tmp -o parked.tmp + gawk ' + # store lines from subdomains_to_remove as keys in array "dom" + NR==FNR { dom[$0]; next } + # process parked.tmp + { + # split current line by "." and store strings in array "arr" + n=split($0,arr,".") + # if "arr" has more than 1 element, loop over domains in array "dom" + if (n>1) { + for (d in dom) { + # if string in "dom" matches 1st element of array "arr", remove subdomain from the line and break the loop + if (match(arr[1], d)) { + regex="^" d "." + sub(regex,"") + break + } + } + } + # print out the line + print $0 + } + ' "$SUBDOMAINS_TO_REMOVE" parked.tmp | + sort -u > parked-removed-subdomains.tmp + + mv parked-removed-subdomains.tmp parked.tmp # Remove parked domains from the various files for file in "$RAW" "$RAW_LIGHT" "$ROOT_DOMAINS"; do @@ -236,6 +257,11 @@ cleanup() { # Entry point +command -v "gawk" 1>/dev/null || { + echo "Error: gawk not found." >&2 + exit 1 +} + trap cleanup EXIT $FUNCTION --format-all From 40bfa649ff2d931a7868a89c60c5cdd4b1e0dc88 Mon Sep 17 00:00:00 2001 From: J <91372088+jarelllama@users.noreply.github.com> Date: Mon, 13 Jan 2025 06:39:27 +0000 Subject: [PATCH 3/5] check_parked.sh: only count lines of the first split file --- scripts/check_parked.sh | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/scripts/check_parked.sh b/scripts/check_parked.sh index d096c8e56..04155c85e 100644 --- a/scripts/check_parked.sh +++ b/scripts/check_parked.sh @@ -148,18 +148,16 @@ find_parked() { if [[ "$1" == 'x00' ]]; then local track=true local count=1 + local lines + lines="$(wc -l < "$1")" fi - # Count lines - local lines_cnt - lines_cnt="$(wc -l < "$1")" - # Loop through domains while read -r domain; do if [[ "$track" == true ]]; then if (( count % 100 == 0 )); then printf "[progress] Analyzed %s%% of domains\n" \ - "$(( count * 100 / lines_cnt ))" + "$(( count * 100 / lines ))" fi (( count++ )) From 32445392817eb2f63973da2e33253d3175e23e3c Mon Sep 17 00:00:00 2001 From: amos Date: Mon, 13 Jan 2025 14:19:27 +0200 Subject: [PATCH 4/5] check_parked: remove_parked: improve awk command --- scripts/check_parked.sh | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/scripts/check_parked.sh b/scripts/check_parked.sh index d096c8e56..a796a2ea2 100644 --- a/scripts/check_parked.sh +++ b/scripts/check_parked.sh @@ -211,16 +211,11 @@ remove_parked() { { # split current line by "." and store strings in array "arr" n=split($0,arr,".") - # if "arr" has more than 1 element, loop over domains in array "dom" - if (n>1) { - for (d in dom) { - # if string in "dom" matches 1st element of array "arr", remove subdomain from the line and break the loop - if (match(arr[1], d)) { - regex="^" d "." - sub(regex,"") - break - } - } + # if "arr" has more than 1 element, + # and string in "dom" matches 1st element of array "arr", remove subdomain from the line + if (n>1 && arr[1] in dom) { + regex="^" arr[1] "." + sub(regex,"") } # print out the line print $0 From 9978bda5385025bb7ead5508e451fbae173853f4 Mon Sep 17 00:00:00 2001 From: amos Date: Mon, 13 Jan 2025 14:26:09 +0200 Subject: [PATCH 5/5] check_parked: remove_parked: improve awk arr name --- scripts/check_parked.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/check_parked.sh b/scripts/check_parked.sh index bd25cc1f6..e4070648c 100644 --- a/scripts/check_parked.sh +++ b/scripts/check_parked.sh @@ -203,15 +203,15 @@ remove_parked() { # Strip subdomains from parked domains gawk ' - # store lines from subdomains_to_remove as keys in array "dom" - NR==FNR { dom[$0]; next } + # store lines from subdomains_to_remove as keys in array "subdom" + NR==FNR { subdom[$0]; next } # process parked.tmp { # split current line by "." and store strings in array "arr" n=split($0,arr,".") # if "arr" has more than 1 element, - # and string in "dom" matches 1st element of array "arr", remove subdomain from the line - if (n>1 && arr[1] in dom) { + # and string in "subdom" matches 1st element of array "arr", remove subdomain from the line + if (n>1 && arr[1] in subdom) { regex="^" arr[1] "." sub(regex,"") }