Add ability to remove archive documents from ElasticSearch using original URL #364

Merged: 24 commits, merged Feb 26, 2025. Changes shown are from 8 of the 24 commits.

Commits (24)
48e9976
Add indexer/scripts/arch-eraser.py
philbudne Jan 10, 2025
6775aec
Add quick "indirect file" support to Queuer class
philbudne Jan 10, 2025
cc525ca
indexer/scripts/arch-eraser.py: update top docstring
philbudne Jan 13, 2025
50cadb2
Add script to invoke arch-eraser
m453h Jan 23, 2025
35916e8
Allow passing of extra args
m453h Jan 23, 2025
f01145d
Add ability to delete documents from ElasticSearch
m453h Jan 27, 2025
3b4c738
Clean up indexer/elastic.py
m453h Jan 27, 2025
c1dcbab
Add option to write output of WARC files to process to txt file
m453h Jan 29, 2025
b34d911
Improve bash script to handle both erase and generate erase list options
m453h Jan 29, 2025
da7080b
Replace f-strings with %-formatting in log messages
m453h Jan 29, 2025
012f837
Clean up run-arch-eraser script
m453h Jan 29, 2025
1fe69e8
Add option to set random delay(s) between elastic search deletions
m453h Feb 3, 2025
08a65e2
Improve type checking in queuer
m453h Feb 3, 2025
3dfea92
Update minimum and maximum delay time help message
m453h Feb 3, 2025
a4288ce
Simplify arch-eraser bash script help and remove redundant .gitignore …
m453h Feb 4, 2025
341f469
Rename ArchEraser to ArchLister and split WARC and URL listing scripts
m453h Feb 7, 2025
16e3e77
Add arch-eraser script
m453h Feb 9, 2025
bc74f94
Update indexer/scripts/arch-lister.py
m453h Feb 14, 2025
4d9b69f
Simplify arch-eraser by using only batch delete and use Ids to query
m453h Feb 18, 2025
35c9680
Merge branch 'arch-eraser-sketch' of github.com:m453h/story-indexer i…
m453h Feb 18, 2025
00cfc17
Fix error in arch-lister due to indentation
m453h Feb 18, 2025
e35bd5b
Improve arch-eraser and ensure arch-lister outputs to specified path
m453h Feb 18, 2025
1192ec7
Update run-arch-eraser.sh and run-arch-warc-lister help messages
m453h Feb 18, 2025
896d0c4
Modify arch-eraser to use delete_by_query API
m453h Feb 20, 2025
1 change: 1 addition & 0 deletions .gitignore
@@ -14,3 +14,4 @@ docker/deploy.log
docker/docker-compose.yml
docker/docker-compose.yml.dump
docker/docker-compose.yml.save-*
*.sqlite3
66 changes: 66 additions & 0 deletions bin/run-arch-eraser.sh
@@ -0,0 +1,66 @@
#!/bin/sh
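# Expand the date range [start_date, end_date] into one archive URL/prefix per
# day by substituting each date for the {pattern} placeholder, then pass the
# resulting list (plus any extra arguments) to indexer.scripts.arch-eraser.
# Example (hypothetical bucket/prefix; extra args are forwarded to arch-eraser):
#   ./bin/run-arch-eraser.sh 2024/05/22 2024/06/27 \
#       "s3://my-archive-bucket/mcrss/{pattern}/" --dry-run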

. bin/func.sh

is_valid_date() {
case "$1" in
[0-9][0-9][0-9][0-9]/[0-9][0-9]/[0-9][0-9]) return 0 ;;
*) return 1 ;;
esac
}
increment_date() {
date_cmd=$(command -v date)
case "$(uname)" in
Darwin)
"$date_cmd" -j -v+1d -f "%Y/%m/%d" "$1" +"%Y/%m/%d"
;;
Linux)
"$date_cmd" -d "$1 + 1 day" +"%Y/%m/%d" 2>/dev/null
;;
*)
echo "Unsupported Environment" >&2
return 1
;;
esac
}

if [ $# -lt 3 ]; then
echo "Usage: $0 [start_date] [end_date] [pattern]" >&2
exit 1
fi

start_date="$1"
end_date="$2"
pattern="$3"
shift 3
other_params="$*"

if ! is_valid_date "$start_date" || ! is_valid_date "$end_date"; then
echo "Error: Invalid date format. Use YYYY/MM/DD" >&2
exit 1
fi

convert_date_to_int() {
input_date="$1"
if [ "$(uname)" = "Darwin" ]; then
date -j -f "%Y/%m/%d" "$input_date" +"%Y%m%d" 2>/dev/null
elif [ "$(uname)" = "Linux" ]; then
date -d "$input_date" +"%Y%m%d" 2>/dev/null
else
echo "Unsupported OS" >&2
return 1
fi
}

output_string=""
start_date_int=$(convert_date_to_int "$start_date")
end_date_int=$(convert_date_to_int "$end_date")

while [ "$start_date_int" -le "$end_date_int" ]; do
current_url=$(echo "$pattern" | sed "s|{pattern}|$start_date|g")
output_string="${output_string}${current_url} "
start_date=$(increment_date "$start_date")
start_date_int=$(convert_date_to_int "$start_date")
done

run_python indexer.scripts.arch-eraser ${output_string} ${other_params}
30 changes: 27 additions & 3 deletions indexer/queuer.py
@@ -27,7 +27,8 @@
import os
import sys
import tempfile
from typing import BinaryIO, cast
from contextlib import nullcontext
from typing import BinaryIO, cast, ContextManager, Optional, TextIO

import requests

@@ -71,6 +72,7 @@ def define_options(self, ap: argparse.ArgumentParser) -> None:
help="clean up old, incompletely processed files",
)
ap.add_argument("input_files", nargs="*", default=None)
ap.add_argument("--output-file", dest="output_file", default=None)

def process_file(self, fname: str, fobj: BinaryIO) -> None:
"""
@@ -189,8 +191,19 @@ def maybe_process_files(self, fname: str) -> None:
# more likely a queuer might actually need
# multiple keys (eg; reading CSVs from one bucket
# that reference objects in another bucket).
for key in sorted(bs.list_objects(prefix), reverse=True):
self.maybe_process_file(f"{scheme}://{bs.bucket}/{key}")
assert self.args
output_file: Optional[str] = self.args.output_file
context: ContextManager[Optional[object]] = nullcontext()
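# If --output-file was given, append each discovered object URL to that
# file (one per line); the resulting list can later be replayed through
# the "@file" indirect-file feature (see maybe_process_file below).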
with (
open(output_file, "a")
if output_file
else context
) as file:
for key in sorted(bs.list_objects(prefix), reverse=True):
if output_file:
assert isinstance(file, io.TextIOWrapper)
file.write(f"{scheme}://{bs.bucket}/{key}\n")
self.maybe_process_file(f"{scheme}://{bs.bucket}/{key}")
break # found working config: for store .... loop

except tuple(bs.EXCEPTIONS) as e:
@@ -212,6 +225,17 @@ def maybe_process_file(self, fname: str) -> None:
args = self.args
assert args

if fname[0] == "@":
# implement "indirect file" (file containing file names)
# NOTE! paths read from indirect files are NOT interpreted
# as relative to the path of the indirect file.
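# Example contents of an indirect file (hypothetical object names),
# selected on the command line by prefixing the file name with "@":
#   s3://my-archive-bucket/mcrss/2024/06/20/mcrss-0001.warc.gz
#   s3://my-archive-bucket/mcrss/2024/06/20/mcrss-0002.warc.gz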
logger.info("indirect file %s", fname)

f = self.open_file(fname[1:])
for line in f:
self.maybe_process_file(line.decode().rstrip())
return

if args.test:
logger.info("maybe_process_file %s", fname)
return
221 changes: 221 additions & 0 deletions indexer/scripts/arch-eraser.py
@@ -0,0 +1,221 @@
"""
Sketch of an "archive eraser"

Reads archive files (possibly from remote "blob stores"), extracts URLs
and removes objects from Elasticsearch.

NOTE!! Does not actually remove the archive files from their
off-site storage location(s)!!!!

This was written to remove stories for late Jan thru early March 2022
(database E) that were initially recovered in 2024 (with some link
rot) thru various means(*), so that stories "blindly" recovered from
S3 (without a known URL), whose canonical URLs were then extracted,
can be loaded in their place.

However, experimentation suggested that loading the canonical URLs
without first removing the first attempt could lead to a 10% duplicate
rate (initial vs. final URLs and other URL differences).

From the below, it appears all of the WARC files are available on S3,
with some available from B2 as well (may be cheaper to fetch from B2).

(*) The different ways the URLs were recovered:
1. From synthetic RSS files written at the time for IA (both by the legacy system
and the then "backup" rss-fetcher).

2. From RSS files blindly extracted from S3 (ignoring the HTML files!) into CSV files of URLs

Stories in index mc_search-00002 and mc_search-00003

All files on S3, some on B2 (starting 2024/05/31)

arch prefix start end archives
mccsv 2024/05/22 -> 2024/06/27 S3/(B2)
mc(rss) 2024/05/27 -> 2024/06/20 S3/(B2) [1]
mcrss 2024/06/20 -> 2024/08/16 S3/B2

(B2) means some of the date range on B2

[1] initial WARC files from RSS files written from 2024/05/27 thru
2024/06/20 start with mc- (see below):

THESE SHOULD BE VERIFIED!!! The "via" field in the metadata
should indicate how the URL was obtained!

dates container name (in WARC filename)
2024/05/27-2024/05/28 cf94b52abe5a S3 [154 files]
2024/05/29-2024/06/04 cefd3fdce464 S3 [882 files]
2024/06/05 0c501ed61cf4 S3 & B2 [497 files]
2024/06/05 446d55936e82 S3 & B2 [27 files]
2024/06/05 cefd3fdce464
2024/06/06-2024/06/09 0c501ed61cf4
2024/06/09 7e1b47c305f1 S3 & B2 [1 file]
2024/06/11-2024/06/20 6c55aaf9daaa

================

This is based on the "Queuer" class, which reads both local and remote
input files, and keeps track of which files have been processed.

No queues are involved (provide any value for --rabbitmq-url or RABBITMQ_URL)

The "tracker" uses SQLite3 (**), and should be multi-process safe,
although this application may experience more contention (SQLite3 does
full-table locks for row creation), and testing should be done (using
--dry-run) with multiple processes running to see if any errors or
exceptions are thrown due to lock contention!

(**) The author doesn't care how you pronounce it, but he says "ess cue ell ite"
(like it's a mineral): https://www.youtube.com/watch?v=Jib2AmRb_rk

Because the files involved span a wide range of dates, and have
various forms, rather than implement fancy wildcard or filtering
support, the idea is to collect all the (full) archive URLs into a
file (or files), and use the "indirect file" feature in the queuer
to read the files of URLs.
"""

import argparse
import logging
from typing import BinaryIO, List, Optional

from elasticsearch import Elasticsearch

from indexer.elastic import ElasticMixin
from indexer.queuer import Queuer
from indexer.story_archive_writer import StoryArchiveReader

logger = logging.getLogger("arch-eraser")


class ArchEraser(ElasticMixin, Queuer):
APP_BLOBSTORE = "HIST" # first choice for blobstore conf vars
HANDLE_GZIP = False # StoryArchiveReader handles compression

# don't want to talk to RabbitMQ, but too much work
# to refactor Queuer into a FileProcessor add-in

def __init__(self, process_name: str, descr: str):
super().__init__(process_name, descr)
self.is_batch_delete: bool = False
self.keep_alive: str = ""
self.fetch_batch_size: Optional[int] = None
self.indices: str = ""

def qconnect(self) -> None:
return

def check_output_queues(self) -> None:
return

def define_options(self, ap: argparse.ArgumentParser) -> None:
super().define_options(ap)
ap.add_argument(
"--fetch-batch-size",
dest="fetch_batch_size",
type=int,
default=1000,
help="The number of documents to fetch from Elasticsearch in each batch (default: 1000)",
)
ap.add_argument(
"--indices",
dest="indices",
help="The name of the Elasticsearch indices to delete",
)
ap.add_argument(
"--keep-alive",
dest="keep_alive",
default="1m",
help="How long should Elasticsearch keep the PIT alive e.g. 1m -> 1 minute",
)
ap.add_argument(
"--batch-delete",
dest="is_batch_delete",
action="store_true",
default=False,
help="Enable batch deletion of documents (default: False)",
)

def process_args(self) -> None:
"""
Process command line arguments and set instance variables.
"""
super().process_args()
assert self.args
self.fetch_batch_size = self.args.fetch_batch_size
self.indices = self.args.indices
self.keep_alive = self.args.keep_alive
self.is_batch_delete = self.args.is_batch_delete

def delete_documents(self, urls: List[Optional[str]]) -> None:
es = self.elasticsearch_client()
pit_id = None
total_deleted = 0
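# Open a point-in-time over the target indices, page through the matching
# documents with search_after, and delete each hit (one at a time, or via
# one bulk request per page when --batch-delete is set).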
try:
pit_id = es.open_point_in_time(
index=self.indices, keep_alive=self.keep_alive
).get("id")
logger.info("Opened Point-in-Time with ID %s", pit_id)
query = {
"size": self.fetch_batch_size,
Contributor (philbudne) commented:
I'm surprised to see a query!

If it's just to find the ID of the document to delete, then you can use mcmetadata.urls.unique_url_hash() on the URL.

Any reason to use the original_url field (below)?

It ended up in ES because it's one of the outputs of mcmetadata, and in normal operation it should be identical to url, and I don't think it's actually used by anything (if it was possible to remove fields in the move to the new cluster, it would be one of two I'd suggest tossing, the other is full_language)!

Contributor Author (m453h) replied:
Thanks for the added context, @philbudne! I have made the change to use the ID constructed from mcmetadata.urls.unique_url_hash().

Regarding why we have a query, it is to retrieve both the index and ID of the document, as Elasticsearch requires this information for deletion. Since the stories could be in either mc_search-00002 or mc_search-00003, I thought retrieving the story by original_url ID to identify its index before constructing the list of stories for bulk_delete would be the most appropriate approach.

An alternative to prefetching the stories to determine their index would be to use the delete_by_query API. However, this can potentially use the scroll API when handling large datasets, adding more load to Elasticsearch. That's why I opted for a query using search_after and PIT.

I’m definitely open to exploring any other approach you think could improve this!

Contributor (philbudne) replied:
@m453h haven't studied delete_by_query, but only one document should EVER match the "id" within a single index, and we try not to index a URL more than once by doing an id query against all indices before trying to import a story, so if the delete_by_query is limited to index ...02 and ...03 the worst case is two documents.

My concern is that the separate lookup and delete will have longer latency (two round trips thru the API) and more impact on the ES servers (locating the document twice). On the other hand I could be worrying about nothing!

Contributor Author (m453h) replied:
@philbudne I’ve updated the implementation to use delete_by_query. Could you take a look and let me know if this approach looks better? 🤞

Also, I noticed from the docs that we can throttle requests, which might help prevent the deletion process from overwhelming the cluster. I was thinking that this could also give us a more accurate way to estimate the total deletion time. I've currently not set any throttling; perhaps determining a realistic rate for document removal and setting it as the default value could be useful.
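For reference, a minimal sketch of a throttled delete_by_query along the lines discussed above (not part of this PR's diff): it assumes the Elasticsearch 8.x Python client, derives document ids with the unique_url_hash() helper named above (import path as given in the thread), and the function name and throttle value are illustrative only.

from typing import List

from elasticsearch import Elasticsearch
from mcmetadata.urls import unique_url_hash


def delete_stories_by_url(es: Elasticsearch, indices: str, urls: List[str]) -> int:
    # Derive each document's _id the same way the importer does, so at most
    # one document per index should match any given id.
    ids = [unique_url_hash(u) for u in urls]
    resp = es.delete_by_query(
        index=indices,  # e.g. "mc_search-00002,mc_search-00003"
        query={"ids": {"values": ids}},
        conflicts="proceed",  # don't abort the whole request on version conflicts
        requests_per_second=100,  # throttle so deletions don't overwhelm the cluster
    )
    return int(resp["deleted"])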

"query": {"terms": {"original_url": urls}},
"pit": {"id": pit_id, "keep_alive": self.keep_alive},
"sort": [{"_doc": "asc"}],
}
search_after = None
while True:
if search_after:
query["search_after"] = search_after
# Fetch the next batch of documents
response = es.search(body=query)
hits = response["hits"]["hits"]
# Each response returns a PIT ID, which may change, so keep it updated
pit_id = response.get("pit_id")
if not hits:
break
bulk_actions = []
for hit in hits:
document_index = hit["_index"]
document_id = hit["_id"]
if self.is_batch_delete:
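# queue a bulk "delete" action; delete actions need no source line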
bulk_actions.append(
{"delete": {"_index": document_index, "_id": document_id}}
)
else:
es.delete(index=document_index, id=document_id)
total_deleted += 1
if bulk_actions:
es.bulk(index=self.indices, body=bulk_actions)
total_deleted += len(bulk_actions)
search_after = hits[-1]["sort"]
except Exception as e:
logger.exception(e)
finally:
if total_deleted != len(urls):
logger.warning(
"Mismatch in document deletion count: [%d] deleted out of [%d] expected.",
total_deleted,
len(urls),
)
else:
logger.info("Deleted [%d/%d] documents.", total_deleted, len(urls))
if isinstance(es, Elasticsearch) and pit_id:
response = es.close_point_in_time(id=pit_id)
if response.get("succeeded"):
logger.info("Successfully closed Point-in-Time with ID %s", pit_id)

def process_file(self, fname: str, fobj: BinaryIO) -> None:
assert self.args
logger.info("process_file %s", fname)
# it may be possible to make this faster by NOT using
# StoryArchiveReader and warcio, but it came for "free":
reader = StoryArchiveReader(fobj)
urls = []
for story in reader.read_stories():
urls.append(story.content_metadata().url)
logger.info("collected %d urls from %s", len(urls), fname)
if not self.args.dry_run:
logger.warning("delete %d urls from %s here!", len(urls), fname)


if __name__ == "__main__":
app = ArchEraser("arch-eraser", "remove stories loaded from archive files from ES")
app.main()