Skip to content

Commit

Permalink
Update
Browse files Browse the repository at this point in the history
  • Loading branch information
jarelllama authored Apr 1, 2024
1 parent 0133cf3 commit 20f0124
Show file tree
Hide file tree
Showing 2 changed files with 38 additions and 40 deletions.
14 changes: 7 additions & 7 deletions functions/check_dead.sh
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ check_dead() {
check_alive() {
find_dead "$DEAD_DOMAINS" || return

# Find resurrected domains in dead domains file
# Get resurrected domains in dead domains file
# (dead domains file is unsorted)
alive_domains="$(comm -23 <(sort "$DEAD_DOMAINS") <(sort dead.tmp))"
[[ -z "$alive_domains" ]] && return
Expand Down Expand Up @@ -135,7 +135,7 @@ check_alive() {
# $1: file to process
# Output:
# dead.tmp (if dead domains found)
# return 1 (if dead domains not found)
# exit status 1 (if dead domains not found)
find_dead() {
sed 's/^/||/; s/$/^/' "$1" > formatted_domains.tmp
dead-domains-linter -i formatted_domains.tmp --export dead.tmp
Expand All @@ -162,11 +162,11 @@ cleanup() {
find . -maxdepth 1 -type f -name "*.tmp" -delete

# Prune old entries from dead domains file
(( $(wc -l < "$DEAD_DOMAINS") > 5000 )) && sed -i '1,100d' "$DEAD_DOMAINS"
if (( $(wc -l < "$DEAD_DOMAINS") > 5000 )); then
sed -i '1,100d' "$DEAD_DOMAINS"
fi
}

main
trap cleanup EXIT

cleanup

exit 0
main
64 changes: 31 additions & 33 deletions functions/check_parked.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
#!/bin/bash
# This script checks for parked/unparked domains and removes/adds them accordingly.

# This script checks for parked/unparked domains and
# removes/adds them accordingly.

readonly RAW='data/raw.txt'
readonly RAW_LIGHT='data/raw_light.txt'
Expand All @@ -14,15 +16,17 @@ main() {

remove_parked_domains
add_unparked_domains
update_light_file

# Remove domains from light raw file that are not found in full raw file
comm -12 "$RAW" "$RAW_LIGHT" > light.tmp
mv light.tmp "$RAW_LIGHT"

# Cache parked domains (done last to skip unparked domains check)
cat parked_domains.tmp >> "$PARKED_DOMAINS"
format_file "$PARKED_DOMAINS"
}

remove_parked_domains() {
# Retrieve parked domains and return if none found
retrieve_parked "$RAW" || return

# Remove parked domains from raw file
Expand All @@ -32,7 +36,6 @@ remove_parked_domains() {
}

add_unparked_domains() {
# Retrieve parked domains and return if none found
retrieve_parked "$PARKED_DOMAINS" || return

# Get unparked domains
Expand All @@ -49,11 +52,13 @@ add_unparked_domains() {
log_event "$unparked_domains" unparked parked_domains_file
}

# Function 'retrieve_parked' efficiently checks for parked domains.
# Function 'retrieve_parked' efficiently checks for parked domains from a
# given file by running the checks in parallel.
# Input:
# $1: list of domains to check
# $1: file to process
# Output:
# exit status 1 if no parked domains were found
# parked_domains.tmp (if parked domains found)
# exit status 1 (if parked domains not found)
retrieve_parked() {
# Truncate temporary files between runs
: > parked_domains.tmp # File needs to exist to avoid not found errors
Expand All @@ -66,13 +71,13 @@ retrieve_parked() {
split -d -l $(( $(wc -l < "$1") / 12 )) "$1"

# Run checks in parallel
check_parked "x00" & check_parked "x01" &
check_parked "x02" & check_parked "x03" &
check_parked "x04" & check_parked "x05" &
check_parked "x06" & check_parked "x07" &
check_parked "x08" & check_parked "x09" &
check_parked "x10" & check_parked "x11" &
check_parked "x12" & check_parked "x13"
find_parked "x00" & find_parked "x01" &
find_parked "x02" & find_parked "x03" &
find_parked "x04" & find_parked "x05" &
find_parked "x06" & find_parked "x07" &
find_parked "x08" & find_parked "x09" &
find_parked "x10" & find_parked "x11" &
find_parked "x12" & find_parked "x13"
wait

# Return 1 if no parked domains were found
Expand All @@ -81,12 +86,13 @@ retrieve_parked() {
format_file parked_domains.tmp
}

# Function 'check_parked' queries sites for parked messages in their HTML.
# Function 'find_parked' queries sites from a given file for parked messages
# in their HTML.
# Input:
# $1: list of domains to check
# $1: file to process
# Output:
# parked_domains.tmp (if parked domains were found)
check_parked() {
# parked_domains.tmp (if parked domains found)
find_parked() {
[[ ! -f "$1" ]] && return

# Track progress only for first split file
Expand Down Expand Up @@ -119,19 +125,6 @@ check_parked() {
fi
}

# Function 'update_light_file' keeps only the entries of the light raw file
# that are also present in the full raw file.
# Globals read:
#   RAW: full raw file
#   RAW_LIGHT: light raw file
#   (comm requires both inputs to be sorted; presumably guaranteed by callers)
# Output:
#   $RAW_LIGHT, rewritten via light.tmp in the current directory
#   non-zero exit status (if comm fails; $RAW_LIGHT is then left untouched)
update_light_file() {
    # Bail out with comm's status so a failed comparison never replaces
    # the light file with partial output
    comm -12 "$RAW" "$RAW_LIGHT" > light.tmp || return
    mv light.tmp "$RAW_LIGHT"
}

# Function 'prune_parked_domains_file' trims the parked domains file once it
# grows past 4000 lines by deleting its first 100 lines in place.
# Globals read:
#   PARKED_DOMAINS: file to prune
# Output:
#   exit status 0 (always, whether or not anything was pruned)
prune_parked_domains_file() {
    if (( $(wc -l < "$PARKED_DOMAINS") > 4000 )); then
        sed -i '1,100d' "$PARKED_DOMAINS"
    fi

    # Always report success, regardless of the outcome above
    return 0
}

# Function 'log_event' logs domain processing events into the domain log.
# $1: domains to log stored in a variable
# $2: event type (dead, whitelisted, etc.)
Expand All @@ -141,7 +134,8 @@ log_event() {
'{print time "," type "," $0 "," source}' >> "$DOMAIN_LOG"
}

# Function 'format_file' calls a shell wrapper to standardize the format of a file.
# Function 'format_file' calls a shell wrapper to
# standardize the format of a file.
# $1: file to format
format_file() {
bash functions/tools.sh format "$1"
Expand All @@ -150,7 +144,11 @@ format_file() {
# Function 'cleanup' removes temporary files from the current directory and
# prunes old entries from the parked domains file. It is installed as the
# script's EXIT trap.
# Globals read:
#   PARKED_DOMAINS: file to prune
# Output:
#   exit status 0 if the parked domains file is missing or within the limit,
#   otherwise the exit status of sed
cleanup() {
    # Delete *.tmp files and the split chunks (x00, x01, ...) left behind
    # in the current directory
    find . -maxdepth 1 -type f -name "*.tmp" -delete
    find . -maxdepth 1 -type f -name "x??" -delete

    # Nothing to prune if the parked domains file does not exist; skipping
    # here avoids a redirect error and an arithmetic syntax error on exit
    [[ -f "$PARKED_DOMAINS" ]] || return 0

    # Prune old entries from parked domains file. This is done exactly once
    # per run: calling a separate prune helper as well would drop 200 lines
    # instead of 100 when the file is large.
    if (( $(wc -l < "$PARKED_DOMAINS") > 4000 )); then
        sed -i '1,100d' "$PARKED_DOMAINS"
    fi
}

trap cleanup EXIT
Expand Down

0 comments on commit 20f0124

Please sign in to comment.