mediacloud · m453h · Feb 26, 2025 · Jan 10, 2025 · Jan 10, 2025 · Jan 13, 2025
diff --git a/bin/run-arch-eraser.sh b/bin/run-arch-eraser.sh
@@ -0,0 +1,49 @@
+#!/bin/sh
+#
+# Deletes documents from Elasticsearch by running indexer.scripts.arch-eraser
+# on a URL list. The first argument is the URL list path; every remaining
+# argument is forwarded unchanged to the Python script.
+
+SCRIPT_DIR="$(dirname "$0")"
+# run_python is not defined in this script; it is expected to come from func.sh
+. "$SCRIPT_DIR/func.sh"
+
+# Print usage, options and examples to stdout.
+print_help(){
+    echo ""
+    echo "Usage: $0 <path_to_url_list_file> [OPTIONS]"
+    echo ""
+    echo "Description:"
+    echo " Deletes documents from Elasticsearch based on original URLs provided in input files"
+    echo ""
+    echo "Options:"
+    echo "  --elasticsearch-hosts    Elasticsearch host URL"
+    echo "  --indices                The name of the Elasticsearch indices to delete from"
+    echo "  --min-delay              The minimum time to wait between delete operations (default: 0.5 seconds)"
+    echo "  --max-delay              The maximum time to wait between delete operations (default: 3.0 seconds)"
+    echo "  --fetch-batch-size       The number of documents to fetch from Elasticsearch in each batch (default: 1000)"
+    echo "  --batch-delete           Enable batch deletion of documents (default: False)"
+    echo "  --buffer                 The maximum number of delete operations to buffer before flushing to Elasticsearch (default: 2000)"
+    echo ""
+    echo " Example:"
+    echo "  $0  arch-lister/url_list --elasticsearch-hosts=http://localhost:9200 --indices=index1,index2 --fetch-batch-size=5000 --min-delay=1 --max-delay=3"
+    echo "  $0  arch-lister/url_list --elasticsearch-hosts=http://localhost:9200 --indices=index1,index2 --fetch-batch-size=5000 --min-delay=1 --max-delay=3 --batch-delete --buffer=1000"
+}
+
+# Handle help flag
+if [ "$1" = "-h" ] || [ "$1" = "--help" ]; then
+    print_help
+    exit 0
+fi
+
+# The URL list path is mandatory
+if [ $# -lt 1 ]; then
+  print_help
+  exit 1
+fi
+
+input_path=$1
+shift 1
+
+# --rabbitmq-url='-' is always appended after the caller's options
+run_python indexer.scripts.arch-eraser "$input_path" "$@" --rabbitmq-url='-'
diff --git a/bin/run-arch-url-lister.sh b/bin/run-arch-url-lister.sh
@@ -0,0 +1,55 @@
+#!/bin/sh
+#
+# Runs indexer.scripts.arch-lister on an indirect file (passed to the script
+# as "@<path>") to write out a URL list, optionally to a caller-chosen file.
+
+SCRIPT_DIR="$(dirname "$0")"
+# run_python is not defined in this script; it is expected to come from func.sh
+. "$SCRIPT_DIR/func.sh"
+
+# Print usage, arguments and examples to stdout.
+print_help(){
+    echo ""
+    echo "Usage: $0 <input_file_path> [output_file_path]"
+    echo ""
+    echo "Description:"
+    echo " Writes a list of URLs in a WARC file to a txt file "
+    echo ""
+    echo "Arguments:"
+    echo " <input_file_path>       Path to an indirect file (a file that contains a list of files to process)."
+    echo " [output_file_path]      Optional. Path to output file for the URL list."
+    echo "                         Default: <PROJECT_DIR>/data/arch-lister/url_list/<WARC_FILE_NAME>.txt"
+    echo ""
+    echo " Example:"
+    echo "  $0 arch-lister/file-1.txt"
+    echo "  $0 arch-lister/file-1.txt url-lists.txt"
+}
+
+# Handle help flag
+if [ "$1" = "-h" ] || [ "$1" = "--help" ]; then
+    print_help
+    exit 0
+fi
+
+# We expect at least (1) argument when we want to process files and list URLs
+if [ $# -lt 1 ]; then
+  print_help
+  exit 1
+fi
+
+# Verify that the input file path exists
+if [ ! -f "$1" ]; then
+   echo "Error: The input file '$1' does not exist, please check the path and try again" >&2
+   exit 1
+fi
+
+# Rebuild the positional parameters as the arch-lister argument list so that
+# an output path containing spaces or glob characters stays a single argument.
+input_file=$1
+output_file=$2
+set -- "@$input_file"
+if [ -n "$output_file" ]; then
+    set -- "$@" -o "$output_file"
+fi
+
+run_python indexer.scripts.arch-lister "$@" --rabbitmq-url='-'
diff --git a/bin/run-arch-warc-lister.sh b/bin/run-arch-warc-lister.sh
@@ -0,0 +1,120 @@
+#!/bin/sh
+#
+# Expands a date range into one archive path per day by substituting each
+# date for "{pattern}" in the given template, then passes the resulting
+# paths to indexer.scripts.arch-lister.
+
+SCRIPT_DIR="$(dirname "$0")"
+# run_python is not defined in this script; it is expected to come from func.sh
+. "$SCRIPT_DIR/func.sh"
+
+# Succeed when $1 has the shape YYYY/MM/DD (digits only; no calendar check).
+is_valid_date() {
+    case "$1" in
+        [0-9][0-9][0-9][0-9]/[0-9][0-9]/[0-9][0-9]) return 0 ;;
+        *) return 1 ;;
+    esac
+}
+
+# Print the day after $1 (YYYY/MM/DD) in the same format.
+# Uses BSD date options on Darwin and GNU date options on Linux; fails with
+# a message on stderr for any other platform.
+increment_date() {
+    date_cmd=$(command -v date)
+    case "$(uname)" in
+        Darwin)
+            "$date_cmd" -j -v+1d -f "%Y/%m/%d" "$1" +"%Y/%m/%d"
+            ;;
+        Linux)
+            "$date_cmd" -d "$1 + 1 day" +"%Y/%m/%d" 2>/dev/null
+            ;;
+        *)
+            echo "Unsupported Environment" >&2
+            return 1
+            ;;
+    esac
+}
+
+# Print $1 (YYYY/MM/DD) as YYYYMMDD so dates can be compared as integers.
+# Errors from date are discarded, so a rejected date produces no output.
+convert_date_to_int() {
+    input_date="$1"
+    if [ "$(uname)" = "Darwin" ]; then
+        date -j -f "%Y/%m/%d" "$input_date" +"%Y%m%d" 2>/dev/null
+    elif [ "$(uname)" = "Linux" ]; then
+        date -d "$input_date" +"%Y%m%d" 2>/dev/null
+    else
+        echo "Unsupported OS" >&2
+        return 1
+    fi
+}
+
+# Print usage, arguments and an example to stdout.
+print_help(){
+    echo ""
+    echo "Usage: $0 <start_date> <end_date> <pattern> <output>"
+    echo ""
+    echo "Description:"
+    echo " Outputs a list of files from archives based on a specified date range and matching pattern."
+    echo ""
+    echo "Arguments:"
+    echo " <start_date>            Start date for filtering records (format: YYYY/MM/DD)."
+    echo " <end_date>              End date for filtering records (format: YYYY/MM/DD)."
+    echo " <pattern>               String pattern used to construct file paths (e.g. 'b2://archives/{pattern}/mchist2022')"
+    echo " <output>                The path to the output file where the archive list will be written"
+    echo ""
+    echo " Example:"
+    echo "  $0 2024/12/15 2024/12/31 'b2://archives/{pattern}/mchist2022' arch-lister/file-1.txt"
+}
+
+# Handle help flag
+if [ "$1" = "-h" ] || [ "$1" = "--help" ]; then
+    print_help
+    exit 0
+fi
+
+# We expect (4) arguments when we want to list files by date
+if [ $# -lt 4 ]; then
+  print_help
+  exit 1
+fi
+
+start_date="$1"
+end_date="$2"
+pattern="$3"
+output_path="$4"
+
+if ! is_valid_date "$start_date" || ! is_valid_date "$end_date"; then
+    echo "Error: Invalid date format. Use YYYY/MM/DD" >&2
+    exit 1
+fi
+
+# A correctly shaped date can still be rejected by date (e.g. month 13), which
+# leaves the integer form empty; stop instead of comparing empty strings.
+start_date_int=$(convert_date_to_int "$start_date")
+end_date_int=$(convert_date_to_int "$end_date")
+if [ -z "$start_date_int" ] || [ -z "$end_date_int" ]; then
+    echo "Error: Could not parse the given dates. Use valid YYYY/MM/DD dates" >&2
+    exit 1
+fi
+
+# Collect one path per day in the positional parameters so each path reaches
+# run_python as exactly one argument (no word splitting or globbing).
+set --
+while [ "$start_date_int" -le "$end_date_int" ]; do
+    current_url=$(printf '%s\n' "$pattern" | sed "s|{pattern}|$start_date|g")
+    set -- "$@" "$current_url"
+    start_date=$(increment_date "$start_date")
+    start_date_int=$(convert_date_to_int "$start_date")
+    if [ -z "$start_date_int" ]; then
+        echo "Error: Could not compute the date following the last processed day" >&2
+        exit 1
+    fi
+done
+
+if [ $# -eq 0 ]; then
+    echo "Error: <start_date> must not be later than <end_date>" >&2
+    exit 1
+fi
+
+run_python indexer.scripts.arch-lister "$@" -o "$output_path" -w --rabbitmq-url='-'
diff --git a/indexer/queuer.py b/indexer/queuer.py
@@ -212,6 +212,17 @@ def maybe_process_file(self, fname: str) -> None:
         args = self.args
         assert args
 
+        if fname[0] == "@":
+            # implement "indirect file" (file containing file names)
+            # NOTE! paths read from indirect files are NOT interpreted
+            # as relative to the path of the indirect file.
+            logger.info("indirect file %s", fname)
+
+            f = self.open_file(fname[1:])
+            for line in f:
+                self.maybe_process_file(line.decode().rstrip())
+            return
+
         if args.test:
             logger.info("maybe_process_file %s", fname)
             return