From 6d4c64be0f2bafbc976bfafa81ea4505a7ea0f06 Mon Sep 17 00:00:00 2001 From: Malcolm Greaves Date: Thu, 7 Nov 2024 16:28:06 -0500 Subject: [PATCH 1/2] Refactored `load` & `download_bionemo_data` into bionemo-core (#396) `bionemo-testing` re-exports the same values defined in `__all__`: they are imported from `bionemo-core` as the implementations have moved. Note that the tests have also moved. Now, all sub-packages can use `load` at runtime, not just during tests. All previous imports of `bionemo.testing.data.load` have been changed to `bionemo.core.data.load`. Additionally moves over the YAML resource files from bionemo-testing into bionemo-core and adjusts the `get_all_resources` function. This PR also fixes an error in the naming convention for `bionemo-core`'s tests. --- scripts/protein/esm2/test_esm2_infer.py | 2 +- scripts/protein/esm2/test_pydantic_train.py | 2 +- sub-packages/bionemo-core/pyproject.toml | 16 +- .../src/bionemo/core}/data/README.md | 0 .../src/bionemo/core/data/load.py | 300 ++++++++++++++++++ .../src/bionemo/core/data/resource.py | 107 +++++++ .../bionemo/core}/data/resources/esm2.yaml | 0 .../core}/data/resources/geneformer.yaml | 0 .../bionemo/core}/data/resources/scdl.yaml | 0 .../core}/data/resources/single_cell.yaml | 0 .../tests/bionemo/core}/data/test_load.py | 20 +- .../data/test_multi_epoch_dataset.py | 0 .../{pytorch => core}/data/test_permute.py | 0 .../{pytorch => core}/data/test_resamplers.py | 0 .../tests/bionemo/core}/data/test_resource.py | 2 +- .../{pytorch => core}/utils/test_dtypes.py | 0 .../tests/bionemo/esm2/model/test_model.py | 2 +- .../bionemo/esm2/model/test_stop_and_go.py | 2 +- .../scripts/geneformer_mlm_loss_eval.py | 2 +- .../geneformer/scripts/infer_geneformer.py | 2 +- .../geneformer/scripts/test_pydantic_train.py | 2 +- .../scripts/test_train_geneformer.py | 2 +- .../tests/bionemo/geneformer/test_model.py | 2 +- .../bionemo/geneformer/test_stop_and_go.py | 2 +- .../tests/bionemo/scdl/conftest.py | 2 +- 
sub-packages/bionemo-testing/pyproject.toml | 13 - .../src/bionemo/testing/data/load.py | 288 +---------------- .../src/bionemo/testing/data/resource.py | 90 +----- 28 files changed, 462 insertions(+), 396 deletions(-) rename sub-packages/{bionemo-testing/src/bionemo/testing => bionemo-core/src/bionemo/core}/data/README.md (100%) create mode 100644 sub-packages/bionemo-core/src/bionemo/core/data/load.py create mode 100644 sub-packages/bionemo-core/src/bionemo/core/data/resource.py rename sub-packages/{bionemo-testing/src/bionemo/testing => bionemo-core/src/bionemo/core}/data/resources/esm2.yaml (100%) rename sub-packages/{bionemo-testing/src/bionemo/testing => bionemo-core/src/bionemo/core}/data/resources/geneformer.yaml (100%) rename sub-packages/{bionemo-testing/src/bionemo/testing => bionemo-core/src/bionemo/core}/data/resources/scdl.yaml (100%) rename sub-packages/{bionemo-testing/src/bionemo/testing => bionemo-core/src/bionemo/core}/data/resources/single_cell.yaml (100%) rename sub-packages/{bionemo-testing/tests/bionemo/testing => bionemo-core/tests/bionemo/core}/data/test_load.py (95%) rename sub-packages/bionemo-core/tests/bionemo/{pytorch => core}/data/test_multi_epoch_dataset.py (100%) rename sub-packages/bionemo-core/tests/bionemo/{pytorch => core}/data/test_permute.py (100%) rename sub-packages/bionemo-core/tests/bionemo/{pytorch => core}/data/test_resamplers.py (100%) rename sub-packages/{bionemo-testing/tests/bionemo/testing => bionemo-core/tests/bionemo/core}/data/test_resource.py (98%) rename sub-packages/bionemo-core/tests/bionemo/{pytorch => core}/utils/test_dtypes.py (100%) diff --git a/scripts/protein/esm2/test_esm2_infer.py b/scripts/protein/esm2/test_esm2_infer.py index c5ec1f633d..4c214d2c2f 100644 --- a/scripts/protein/esm2/test_esm2_infer.py +++ b/scripts/protein/esm2/test_esm2_infer.py @@ -21,10 +21,10 @@ from esm2_infer import infer_model from torch.utils.data import DataLoader +from bionemo.core.data.load import load from 
bionemo.esm2.api import ESM2Config from bionemo.esm2.data.tokenizer import get_tokenizer from bionemo.esm2.model.finetune.datamodule import ESM2FineTuneDataModule, InMemoryCSVDataset -from bionemo.testing.data.load import load esm2_650m_checkpoint_path = load("esm2/650m:2.0") diff --git a/scripts/protein/esm2/test_pydantic_train.py b/scripts/protein/esm2/test_pydantic_train.py index 2522e538f8..a7c3ababa8 100644 --- a/scripts/protein/esm2/test_pydantic_train.py +++ b/scripts/protein/esm2/test_pydantic_train.py @@ -21,8 +21,8 @@ import pytest from lightning.fabric.plugins.environments.lightning import find_free_network_port +from bionemo.core.data.load import load from bionemo.testing.data.esm2 import create_mock_parquet_train_val_inputs, create_mock_protein_dataset -from bionemo.testing.data.load import load data_path: Path = load("single_cell/testdata-20240506") / "cellxgene_2023-12-15_small" / "processed_data" diff --git a/sub-packages/bionemo-core/pyproject.toml b/sub-packages/bionemo-core/pyproject.toml index c6e04c7270..079d9c8e58 100644 --- a/sub-packages/bionemo-core/pyproject.toml +++ b/sub-packages/bionemo-core/pyproject.toml @@ -12,14 +12,28 @@ license = { file = "LICENSE" } dynamic = ["version"] dependencies = [ # bionemo sub-packages + # bionemo-core **MUST NOT** depend on any other sub-packages !!!!! # external "numpy", "platformdirs", "torch>=2.2.1", - 'pytorch-lightning>=2.2.1', + 'boto3', 'lightning>=2.2.1', + 'ngcsdk', + 'pooch', + 'pydantic>=2.7.0', + 'pytorch-lightning>=2.2.1', + 'pyyaml', + 'tqdm', ] +[project.scripts] +download_bionemo_data = "bionemo.core.data.load:entrypoint" + +# Make sure that the resource yaml files are being packaged alongside the python files. 
+[tool.setuptools.package-data] +"bionemo.core" = ["**/*.yaml"] + [tool.setuptools.packages.find] where = ["src"] include = ["bionemo.*"] diff --git a/sub-packages/bionemo-testing/src/bionemo/testing/data/README.md b/sub-packages/bionemo-core/src/bionemo/core/data/README.md similarity index 100% rename from sub-packages/bionemo-testing/src/bionemo/testing/data/README.md rename to sub-packages/bionemo-core/src/bionemo/core/data/README.md diff --git a/sub-packages/bionemo-core/src/bionemo/core/data/load.py b/sub-packages/bionemo-core/src/bionemo/core/data/load.py new file mode 100644 index 0000000000..ca737c37a4 --- /dev/null +++ b/sub-packages/bionemo-core/src/bionemo/core/data/load.py @@ -0,0 +1,300 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-Apache2 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import argparse +import contextlib +import shutil +import sys +import tempfile +from dataclasses import dataclass +from pathlib import Path +from typing import Literal, Optional, Sequence, TextIO + +import boto3 +import ngcsdk +import pooch +from botocore.config import Config +from tqdm import tqdm + +from bionemo.core import BIONEMO_CACHE_DIR +from bionemo.core.data.resource import Resource, get_all_resources + + +__all__: Sequence[str] = ( + "load", + "default_ngc_client", + "default_pbss_client", + "NGCDownloader", +) + + +def default_pbss_client(): + """Create a default S3 client for PBSS.""" + retry_config = Config(retries={"max_attempts": 10, "mode": "standard"}) + return boto3.client("s3", endpoint_url="https://pbss.s8k.io", config=retry_config) + + +def _s3_download(url: str, output_file: str | Path, _: pooch.Pooch) -> None: + """Download a file from PBSS.""" + # Parse S3 URL to get bucket and key + parts = url.replace("s3://", "").split("/") + bucket = parts[0] + key = "/".join(parts[1:]) + + with contextlib.closing(default_pbss_client()) as s3: + object_size = s3.head_object(Bucket=bucket, Key=key)["ContentLength"] + progress_bar = tqdm(total=object_size, unit="B", unit_scale=True, desc=url) + + # Define callback + def progress_callback(bytes_transferred): + progress_bar.update(bytes_transferred) + + # Download file from S3 + s3.download_file(bucket, key, output_file, Callback=progress_callback) + + +def default_ngc_client() -> ngcsdk.Client: + """Create a default NGC client. + + This should load the NGC API key from ~/.ngc/config, or from environment variables passed to the docker container. + """ + return ngcsdk.Client() + + +@dataclass +class NGCDownloader: + """A class to download files from NGC in a Pooch-compatible way. + + NGC downloads are typically structured as directories, while pooch expects a single file. This class + downloads a single file from an NGC directory and moves it to the desired location. 
+ """ + + filename: str + ngc_registry: Literal["model", "resource"] + + def __call__(self, url: str, output_file: str | Path, _: pooch.Pooch) -> None: + """Download a file from NGC.""" + client = default_ngc_client() + + download_fns = { + "model": client.registry.model.download_version, + "resource": client.registry.resource.download_version, + } + + output_file = Path(output_file) + output_file.parent.mkdir(parents=True, exist_ok=True) + + # NGC seems to always download to a specific directory that we can't specify ourselves. + ngc_dirname = Path(url).name.replace(":", "_v") + + with tempfile.TemporaryDirectory(dir=output_file.parent) as temp_dir: + download_fns[self.ngc_registry](url, temp_dir, file_patterns=[self.filename]) + shutil.move(Path(temp_dir) / ngc_dirname / self.filename, output_file) + + +def load( + model_or_data_tag: str, + source: Literal["ngc", "pbss"] = "pbss", + resources: dict[str, Resource] | None = None, + cache_dir: Path | None = None, +) -> Path: + """Download a resource from PBSS or NGC. + + Args: + model_or_data_tag: A pointer to the desired resource. Must be a key in the resources dictionary. + source: Either "pbss" (NVIDIA-internal download) or "ngc" (NVIDIA GPU Cloud). Defaults to "pbss". + resources: A custom dictionary of resources. If None, the default resources will be used. (Mostly for testing.) + cache_dir: The directory to store downloaded files. Defaults to BIONEMO_CACHE_DIR. (Mostly for testing.) + + Raises: + ValueError: If the desired tag was not found, or if an NGC url was requested but not provided. + + Returns: + A Path object pointing either at the downloaded file, or at a decompressed folder containing the + file(s). 
+ + Examples: + For a resource specified in 'filename.yaml' with tag 'tag', the following will download the file: + >>> load("filename/tag") + PosixPath(/tmp/bionemo/downloaded-file-name) + """ + if resources is None: + resources = get_all_resources() + + if cache_dir is None: + cache_dir = BIONEMO_CACHE_DIR + + if model_or_data_tag not in resources: + raise ValueError(f"Resource '{model_or_data_tag}' not found.") + + if source == "ngc" and resources[model_or_data_tag].ngc is None: + raise ValueError(f"Resource '{model_or_data_tag}' does not have an NGC URL.") + + resource = resources[model_or_data_tag] + filename = str(resource.pbss).split("/")[-1] + + extension = "".join(Path(filename).suffixes) + processor = _get_processor(extension, resource.unpack, resource.decompress) + + if source == "pbss": + download_fn = _s3_download + url = resource.pbss + + elif source == "ngc": + assert resource.ngc_registry is not None + download_fn = NGCDownloader(filename=filename, ngc_registry=resource.ngc_registry) + url = resource.ngc + + else: + raise ValueError(f"Source '{source}' not supported.") + + download = pooch.retrieve( + url=str(url), + known_hash=resource.sha256, + path=cache_dir, + downloader=download_fn, + processor=processor, + ) + + # Pooch by default returns a list of unpacked files if they unpack a zipped or tarred directory. Instead of that, we + # just want the unpacked, parent folder. + if isinstance(download, list): + return Path(processor.extract_dir) # type: ignore + + else: + return Path(download) + + +def _get_processor(extension: str, unpack: bool | None, decompress: bool | None): + """Get the processor for a given file extension. + + If unpack and decompress are both None, the processor will be inferred from the file extension. + + Args: + extension: The file extension. + unpack: Whether to unpack the file. + decompress: Whether to decompress the file. + + Returns: + A Pooch processor object. 
+ """ + if extension in {".gz", ".bz2", ".xz"} and decompress is None: + return pooch.Decompress() + + elif extension in {".tar", ".tar.gz"} and unpack is None: + return pooch.Untar() + + elif extension == ".zip" and unpack is None: + return pooch.Unzip() + + else: + return None + + +def print_resources(*, output_source: TextIO = sys.stdout) -> None: + """Prints all available downloadable resources & their sources to STDOUT.""" + print("#resource_name\tsource_options", file=output_source) + for resource_name, resource in sorted(get_all_resources().items()): + sources = [] + if resource.ngc is not None: + sources.append("ngc") + if resource.pbss is not None: + sources.append("pbss") + print(f"{resource_name}\t{','.join(sources)}", file=output_source) + + +def entrypoint(): + """Allows a user to get a specific artifact from the command line.""" + parser = argparse.ArgumentParser( + description="Retrieve the local path to the requested artifact name or list resources." + ) + + # Create mutually exclusive group + group = parser.add_mutually_exclusive_group(required=True) + + # Add the argument for artifact name, which is required if --list-resources is not used + group.add_argument("artifact_name", type=str, nargs="?", help="Name of the artifact") + + # Add the --list-resources option + group.add_argument( + "--list-resources", action="store_true", default=False, help="List all available artifacts and then exit." + ) + + # Add the --source option + parser.add_argument( + "--source", + type=str, + choices=["pbss", "ngc"], + default="ngc", + help='Backend to use, Internal NVIDIA users can set this to "pbss".', + ) + + parser.add_argument( + "--all", + action="store_true", + default=False, + help="Download all resources. 
Ignores all other options.", + ) + args = parser.parse_args() + maybe_error = main( + download_all=args.all, + list_resources=args.list_resources, + artifact_name=args.artifact_name, + source=args.source, + ) + if maybe_error is not None: + parser.error(maybe_error) + + +if __name__ == "__main__": + entrypoint() + + +def main( + download_all: bool, list_resources: bool, artifact_name: str, source: Literal["pbss", "ngc"] +) -> Optional[str]: + """Main download script logic: parameters are 1:1 with CLI flags. Returns string describing error on failure.""" + if download_all: + print("Downloading all resources:", file=sys.stderr) + print_resources(output_source=sys.stderr) + print("-" * 80, file=sys.stderr) + + resource_to_local: dict[str, Path] = {} + for resource_name in tqdm( + sorted(get_all_resources()), + desc="Downloading Resources", + ): + with contextlib.redirect_stdout(sys.stderr): + local_path = load(resource_name, source=source) + resource_to_local[resource_name] = local_path + + print("-" * 80, file=sys.stderr) + print("All resources downloaded:", file=sys.stderr) + for resource_name, local_path in sorted(resource_to_local.items()): + print(f" {resource_name}: {str(local_path.absolute())}", file=sys.stderr) + + elif list_resources: + print_resources(output_source=sys.stdout) + + elif artifact_name is not None and len(artifact_name) > 0: + # Get the local path for the provided artifact name + with contextlib.redirect_stdout(sys.stderr): + local_path = load(artifact_name, source=source) + + # Print the result => CLI use assumes that we can get the single downloaded resource's path on STDOUT + print(str(local_path.absolute())) + + else: + return "You must provide an artifact name if --list-resources or --all is not set!" 
diff --git a/sub-packages/bionemo-core/src/bionemo/core/data/resource.py b/sub-packages/bionemo-core/src/bionemo/core/data/resource.py new file mode 100644 index 0000000000..419975854f --- /dev/null +++ b/sub-packages/bionemo-core/src/bionemo/core/data/resource.py @@ -0,0 +1,107 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-Apache2 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import functools +import itertools +from collections import Counter +from importlib.resources import files +from pathlib import Path +from typing import Annotated, Any, Literal, Sequence + +import pydantic +import yaml +from registry.api.utils import RegistryTarget + + +__all__: Sequence[str] = ( + "Resource", + "get_all_resources", +) + + +def _validate_ngc_resource(value: str) -> str: + return str(RegistryTarget(value, "Pattern should be in format [org/[team/]]name[:version]")) + + +class Resource(pydantic.BaseModel): + """Class that represents a remote resource for downloading and caching test data.""" + + model_config = pydantic.ConfigDict(use_attribute_docstrings=True) + + tag: Annotated[str, pydantic.StringConstraints(pattern=r"^[^/]*/[^/]*$")] # Only slash between filename and tag. + """A unique identifier for the resource. 
The file(s) will be accessible via load("filename/tag").""" + + ngc: Annotated[str, pydantic.AfterValidator(_validate_ngc_resource)] | None = None + """The NGC URL for the resource. + + Should be in format [org/[team/]]name[:version]. If None, the resource is not available on NGC. + """ + + ngc_registry: Literal["model", "resource"] | None = None + """The NGC resource type (model or resource) for the data. Must be provided if ngc is not None.""" + + pbss: Annotated[pydantic.AnyUrl, pydantic.UrlConstraints(allowed_schemes=["s3"])] + """The PBSS (NVIDIA-internal) URL of the resource.""" + + sha256: str | None + """The SHA256 checksum of the resource. If None, the SHA will not be checked on download (not recommended).""" + + owner: pydantic.NameEmail + """The owner or primary point of contact for the resource, in the format "Name ".""" + + description: str | None = None + """A description of the file(s).""" + + unpack: Literal[False, None] = None + """Whether the resource should be unpacked after download. If None, will defer to the file extension.""" + + decompress: Literal[False, None] = None + """Whether the resource should be decompressed after download. 
If None, will defer to the file extension.""" + + @pydantic.model_validator(mode="after") + def _validate_ngc_registry(self): + if self.ngc and not self.ngc_registry: + raise ValueError(f"ngc_registry must be provided if ngc is not None: {self.tag}") + return self + + +@functools.cache +def get_all_resources(resource_path: Path | None = None) -> dict[str, Resource]: + """Return a dictionary of all resources.""" + if not resource_path: + resource_path = Path(files("bionemo.core.data").joinpath("resources")) # type: ignore + + resources_files = itertools.chain(resource_path.glob("*.yaml"), resource_path.glob("*.yml")) + + all_resources = [resource for file in resources_files for resource in _parse_resource_file(file)] + + resource_list = pydantic.TypeAdapter(list[Resource]).validate_python(all_resources) + resource_dict = {resource.tag: resource for resource in resource_list} + + if len(resource_dict) != len(resource_list): + # Show the # of and which ones are duplicated so that a user can begin debugging and resolve the issue. 
+ tag_counts = Counter([resource.tag for resource in resource_list]) + raise ValueError(f"Duplicate resource tags found!: {[tag for tag, count in tag_counts.items() if count > 1]}") + + return resource_dict + + +def _parse_resource_file(file) -> list[dict[str, Any]]: + with file.open("r") as f: + resources = yaml.safe_load(f) + for resource in resources: + resource["tag"] = f"{file.stem}/{resource['tag']}" + return resources diff --git a/sub-packages/bionemo-testing/src/bionemo/testing/data/resources/esm2.yaml b/sub-packages/bionemo-core/src/bionemo/core/data/resources/esm2.yaml similarity index 100% rename from sub-packages/bionemo-testing/src/bionemo/testing/data/resources/esm2.yaml rename to sub-packages/bionemo-core/src/bionemo/core/data/resources/esm2.yaml diff --git a/sub-packages/bionemo-testing/src/bionemo/testing/data/resources/geneformer.yaml b/sub-packages/bionemo-core/src/bionemo/core/data/resources/geneformer.yaml similarity index 100% rename from sub-packages/bionemo-testing/src/bionemo/testing/data/resources/geneformer.yaml rename to sub-packages/bionemo-core/src/bionemo/core/data/resources/geneformer.yaml diff --git a/sub-packages/bionemo-testing/src/bionemo/testing/data/resources/scdl.yaml b/sub-packages/bionemo-core/src/bionemo/core/data/resources/scdl.yaml similarity index 100% rename from sub-packages/bionemo-testing/src/bionemo/testing/data/resources/scdl.yaml rename to sub-packages/bionemo-core/src/bionemo/core/data/resources/scdl.yaml diff --git a/sub-packages/bionemo-testing/src/bionemo/testing/data/resources/single_cell.yaml b/sub-packages/bionemo-core/src/bionemo/core/data/resources/single_cell.yaml similarity index 100% rename from sub-packages/bionemo-testing/src/bionemo/testing/data/resources/single_cell.yaml rename to sub-packages/bionemo-core/src/bionemo/core/data/resources/single_cell.yaml diff --git a/sub-packages/bionemo-testing/tests/bionemo/testing/data/test_load.py b/sub-packages/bionemo-core/tests/bionemo/core/data/test_load.py 
similarity index 95% rename from sub-packages/bionemo-testing/tests/bionemo/testing/data/test_load.py rename to sub-packages/bionemo-core/tests/bionemo/core/data/test_load.py index 452604ce3d..ae413d8147 100644 --- a/sub-packages/bionemo-testing/tests/bionemo/testing/data/test_load.py +++ b/sub-packages/bionemo-core/tests/bionemo/core/data/test_load.py @@ -23,8 +23,8 @@ import pytest -from bionemo.testing.data.load import default_ngc_client, default_pbss_client, load -from bionemo.testing.data.resource import get_all_resources +from bionemo.core.data.load import default_ngc_client, default_pbss_client, load +from bionemo.core.data.resource import get_all_resources def test_load_raises_error_on_invalid_tag(tmp_path): @@ -98,7 +98,7 @@ def test_load_raises_with_no_ngc_url(tmp_path): load("foo/bar", source="ngc", resources=get_all_resources(tmp_path), cache_dir=tmp_path) # type: ignore -@patch("bionemo.testing.data.load._s3_download") +@patch("bionemo.core.data.load._s3_download") def test_load_with_file(mocked_s3_download, tmp_path): (tmp_path / "foo.yaml").write_text( """ @@ -115,7 +115,7 @@ def test_load_with_file(mocked_s3_download, tmp_path): assert file_path.read_text() == "test" -@patch("bionemo.testing.data.load._s3_download") +@patch("bionemo.core.data.load._s3_download") def test_load_with_gzipped_file(mocked_s3_download, tmp_path): (tmp_path / "foo.yaml").write_text( """ @@ -137,7 +137,7 @@ def write_compressed_text(_1, output_file: str, _2): assert file_path.read_text() == "test" -@patch("bionemo.testing.data.load._s3_download") +@patch("bionemo.core.data.load._s3_download") def test_load_with_gzipped_file_no_decomp(mocked_s3_download, tmp_path): (tmp_path / "foo.yaml").write_text( """ @@ -163,7 +163,7 @@ def write_compressed_text(_1, output_file: str, _2): assert f.read() == "test" -@patch("bionemo.testing.data.load._s3_download") +@patch("bionemo.core.data.load._s3_download") def test_load_with_tar_directory(mocked_s3_download, tmp_path): (tmp_path / 
"foo.yaml").write_text( """ @@ -195,7 +195,7 @@ def write_compressed_dir(_1, output_file: str, _2): assert (file_path / "test_file").read_text() == "test" -@patch("bionemo.testing.data.load._s3_download") +@patch("bionemo.core.data.load._s3_download") def test_load_with_tar_directory_no_unpack(mocked_s3_download, tmp_path): (tmp_path / "foo.yaml").write_text( """ @@ -232,7 +232,7 @@ def write_tarfile_dir(_1, output_file: str, _2): assert (tmp_path / "extracted/test_file").read_text() == "test" -@patch("bionemo.testing.data.load._s3_download") +@patch("bionemo.core.data.load._s3_download") def test_load_with_targz_directory(mocked_s3_download, tmp_path): (tmp_path / "foo.yaml").write_text( """ @@ -274,7 +274,7 @@ def test_default_ngc_client(): assert clt.api_key is not None -@patch("bionemo.testing.data.load.default_ngc_client") +@patch("bionemo.core.data.load.default_ngc_client") def test_load_with_file_from_ngc_model(mocked_get_ngc_client, tmp_path): (tmp_path / "foo.yaml").write_text( """ @@ -305,7 +305,7 @@ def mocked_ngc_download(url, destination, file_patterns): mocked_ngc_client.registry.model.download_version.assert_called_once() -@patch("bionemo.testing.data.load.default_ngc_client") +@patch("bionemo.core.data.load.default_ngc_client") def test_load_with_file_from_ngc_resource(mocked_get_ngc_client, tmp_path): (tmp_path / "foo.yaml").write_text( """ diff --git a/sub-packages/bionemo-core/tests/bionemo/pytorch/data/test_multi_epoch_dataset.py b/sub-packages/bionemo-core/tests/bionemo/core/data/test_multi_epoch_dataset.py similarity index 100% rename from sub-packages/bionemo-core/tests/bionemo/pytorch/data/test_multi_epoch_dataset.py rename to sub-packages/bionemo-core/tests/bionemo/core/data/test_multi_epoch_dataset.py diff --git a/sub-packages/bionemo-core/tests/bionemo/pytorch/data/test_permute.py b/sub-packages/bionemo-core/tests/bionemo/core/data/test_permute.py similarity index 100% rename from 
sub-packages/bionemo-core/tests/bionemo/pytorch/data/test_permute.py rename to sub-packages/bionemo-core/tests/bionemo/core/data/test_permute.py diff --git a/sub-packages/bionemo-core/tests/bionemo/pytorch/data/test_resamplers.py b/sub-packages/bionemo-core/tests/bionemo/core/data/test_resamplers.py similarity index 100% rename from sub-packages/bionemo-core/tests/bionemo/pytorch/data/test_resamplers.py rename to sub-packages/bionemo-core/tests/bionemo/core/data/test_resamplers.py diff --git a/sub-packages/bionemo-testing/tests/bionemo/testing/data/test_resource.py b/sub-packages/bionemo-core/tests/bionemo/core/data/test_resource.py similarity index 98% rename from sub-packages/bionemo-testing/tests/bionemo/testing/data/test_resource.py rename to sub-packages/bionemo-core/tests/bionemo/core/data/test_resource.py index 2b3370e5ad..ebd81abbb9 100644 --- a/sub-packages/bionemo-testing/tests/bionemo/testing/data/test_resource.py +++ b/sub-packages/bionemo-core/tests/bionemo/core/data/test_resource.py @@ -20,7 +20,7 @@ import pydantic import pytest -from bionemo.testing.data.resource import Resource, get_all_resources +from bionemo.core.data.resource import Resource, get_all_resources def test_get_all_resources_returns_valid_entries(): diff --git a/sub-packages/bionemo-core/tests/bionemo/pytorch/utils/test_dtypes.py b/sub-packages/bionemo-core/tests/bionemo/core/utils/test_dtypes.py similarity index 100% rename from sub-packages/bionemo-core/tests/bionemo/pytorch/utils/test_dtypes.py rename to sub-packages/bionemo-core/tests/bionemo/core/utils/test_dtypes.py diff --git a/sub-packages/bionemo-esm2/tests/bionemo/esm2/model/test_model.py b/sub-packages/bionemo-esm2/tests/bionemo/esm2/model/test_model.py index 4b091c9d77..0438483901 100644 --- a/sub-packages/bionemo-esm2/tests/bionemo/esm2/model/test_model.py +++ b/sub-packages/bionemo-esm2/tests/bionemo/esm2/model/test_model.py @@ -27,6 +27,7 @@ from torch import Tensor from transformers import EsmForMaskedLM +from 
bionemo.core.data.load import load from bionemo.core.utils.dtypes import get_autocast_dtype from bionemo.core.utils.random_utils import random_numpy_context from bionemo.esm2.api import ESM2Config, ESM2Model @@ -36,7 +37,6 @@ from bionemo.llm.model.biobert.model import MegatronBioBertModel from bionemo.llm.utils.weight_utils import nemo1_to_nemo2_biobert_key_mapping from bionemo.testing import megatron_parallel_state_utils -from bionemo.testing.data.load import load nemo1_checkpoint_path: Path = load("esm2/nv_650m:1.0") diff --git a/sub-packages/bionemo-esm2/tests/bionemo/esm2/model/test_stop_and_go.py b/sub-packages/bionemo-esm2/tests/bionemo/esm2/model/test_stop_and_go.py index 18be7eccf3..8be351eb5b 100644 --- a/sub-packages/bionemo-esm2/tests/bionemo/esm2/model/test_stop_and_go.py +++ b/sub-packages/bionemo-esm2/tests/bionemo/esm2/model/test_stop_and_go.py @@ -23,6 +23,7 @@ from nemo.lightning.pytorch.optim import MegatronOptimizerModule from typing_extensions import override +from bionemo.core.data.load import load from bionemo.core.utils.dtypes import get_autocast_dtype from bionemo.esm2.api import ESM2Config from bionemo.esm2.data.datamodule import ESMDataModule @@ -30,7 +31,6 @@ from bionemo.esm2.data.tokenizer import BioNeMoESMTokenizer, get_tokenizer from bionemo.llm.model.biobert.lightning import biobert_lightning_module from bionemo.llm.model.lr_scheduler import WarmupAnnealDecayHoldScheduler -from bionemo.testing.data.load import load from bionemo.testing.harnesses import stop_and_go from bionemo.testing.harnesses.mode import Mode diff --git a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/scripts/geneformer_mlm_loss_eval.py b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/scripts/geneformer_mlm_loss_eval.py index a79ae52269..c8965d092d 100644 --- a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/scripts/geneformer_mlm_loss_eval.py +++ b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/scripts/geneformer_mlm_loss_eval.py @@ 
-42,6 +42,7 @@ from tqdm import trange from transformers import AutoModelForMaskedLM +from bionemo.core.data.load import load from bionemo.core.data.multi_epoch_dataset import EpochIndex from bionemo.core.utils.dtypes import get_autocast_dtype from bionemo.geneformer.api import GeneformerConfig @@ -51,7 +52,6 @@ from bionemo.llm.data import collate from bionemo.llm.model.biobert.model import BioBertConfig from bionemo.testing import megatron_parallel_state_utils -from bionemo.testing.data.load import load class GeneformerHFAdapter(torch.nn.Module): diff --git a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/scripts/infer_geneformer.py b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/scripts/infer_geneformer.py index 1215f864b7..3a72d11e9e 100644 --- a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/scripts/infer_geneformer.py +++ b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/scripts/infer_geneformer.py @@ -20,6 +20,7 @@ from nemo import lightning as nl from nemo.utils import logging +from bionemo.core.data.load import load from bionemo.core.utils.dtypes import PrecisionTypes, get_autocast_dtype from bionemo.geneformer.api import FineTuneSeqLenBioBertConfig, GeneformerConfig from bionemo.geneformer.data.singlecell.datamodule import SingleCellDataModule @@ -28,7 +29,6 @@ from bionemo.llm.model.biobert.lightning import biobert_lightning_module from bionemo.llm.model.biobert.model import BioBertConfig from bionemo.llm.utils.datamodule_utils import infer_global_batch_size -from bionemo.testing.data.load import load def infer_model( diff --git a/sub-packages/bionemo-geneformer/tests/bionemo/geneformer/scripts/test_pydantic_train.py b/sub-packages/bionemo-geneformer/tests/bionemo/geneformer/scripts/test_pydantic_train.py index 7eeb47a613..f76682fbc0 100644 --- a/sub-packages/bionemo-geneformer/tests/bionemo/geneformer/scripts/test_pydantic_train.py +++ 
b/sub-packages/bionemo-geneformer/tests/bionemo/geneformer/scripts/test_pydantic_train.py @@ -20,7 +20,7 @@ from lightning.fabric.plugins.environments.lightning import find_free_network_port -from bionemo.testing.data.load import load +from bionemo.core.data.load import load data_path: Path = load("single_cell/testdata-20240506") / "cellxgene_2023-12-15_small" / "processed_data" diff --git a/sub-packages/bionemo-geneformer/tests/bionemo/geneformer/scripts/test_train_geneformer.py b/sub-packages/bionemo-geneformer/tests/bionemo/geneformer/scripts/test_train_geneformer.py index 14ba0b03c0..5799ec7611 100644 --- a/sub-packages/bionemo-geneformer/tests/bionemo/geneformer/scripts/test_train_geneformer.py +++ b/sub-packages/bionemo-geneformer/tests/bionemo/geneformer/scripts/test_train_geneformer.py @@ -22,11 +22,11 @@ import pytest from lightning.fabric.plugins.environments.lightning import find_free_network_port +from bionemo.core.data.load import load from bionemo.geneformer.scripts.train_geneformer import get_parser, main from bionemo.llm.model.biobert.transformer_specs import BiobertSpecOption from bionemo.llm.utils.datamodule_utils import parse_kwargs_to_arglist from bionemo.testing import megatron_parallel_state_utils -from bionemo.testing.data.load import load data_path: Path = load("single_cell/testdata-20240506") / "cellxgene_2023-12-15_small" / "processed_data" diff --git a/sub-packages/bionemo-geneformer/tests/bionemo/geneformer/test_model.py b/sub-packages/bionemo-geneformer/tests/bionemo/geneformer/test_model.py index a561cb5b87..35621c1f13 100644 --- a/sub-packages/bionemo-geneformer/tests/bionemo/geneformer/test_model.py +++ b/sub-packages/bionemo-geneformer/tests/bionemo/geneformer/test_model.py @@ -38,6 +38,7 @@ from torch.nn import functional as F from tqdm import tqdm +from bionemo.core.data.load import load from bionemo.core.utils.batching_utils import pad_token_ids from bionemo.core.utils.dtypes import get_autocast_dtype from 
bionemo.core.utils.random_utils import random_numpy_context @@ -52,7 +53,6 @@ from bionemo.llm.utils.weight_utils import nemo1_to_nemo2_biobert_key_mapping from bionemo.testing import megatron_parallel_state_utils from bionemo.testing.callbacks import MetricTracker -from bionemo.testing.data.load import load from bionemo.testing.utils import ( assert_matrix_correlation_above_value, assert_matrix_mape_below_value, diff --git a/sub-packages/bionemo-geneformer/tests/bionemo/geneformer/test_stop_and_go.py b/sub-packages/bionemo-geneformer/tests/bionemo/geneformer/test_stop_and_go.py index 5d55514073..3820844131 100644 --- a/sub-packages/bionemo-geneformer/tests/bionemo/geneformer/test_stop_and_go.py +++ b/sub-packages/bionemo-geneformer/tests/bionemo/geneformer/test_stop_and_go.py @@ -36,12 +36,12 @@ from nemo.lightning.pytorch.optim.megatron import MegatronOptimizerModule from typing_extensions import override +from bionemo.core.data.load import load from bionemo.core.utils.dtypes import get_autocast_dtype from bionemo.geneformer.api import GeneformerConfig from bionemo.geneformer.data.singlecell.preprocess import GeneformerPreprocess from bionemo.llm.model.biobert.lightning import biobert_lightning_module from bionemo.testing import testing_callbacks -from bionemo.testing.data.load import load from bionemo.testing.harnesses import stop_and_go from bionemo.testing.harnesses.mode import Mode diff --git a/sub-packages/bionemo-scdl/tests/bionemo/scdl/conftest.py b/sub-packages/bionemo-scdl/tests/bionemo/scdl/conftest.py index a13477c81b..0c128e76e2 100644 --- a/sub-packages/bionemo-scdl/tests/bionemo/scdl/conftest.py +++ b/sub-packages/bionemo-scdl/tests/bionemo/scdl/conftest.py @@ -19,7 +19,7 @@ import pytest -from bionemo.testing.data.load import load +from bionemo.core.data.load import load @pytest.fixture diff --git a/sub-packages/bionemo-testing/pyproject.toml b/sub-packages/bionemo-testing/pyproject.toml index ccac13b4ad..9c503a842e 100644 --- 
a/sub-packages/bionemo-testing/pyproject.toml +++ b/sub-packages/bionemo-testing/pyproject.toml @@ -15,23 +15,10 @@ dependencies = [ 'bionemo-core', 'bionemo-llm', # external - 'boto3', 'email-validator', - 'ngcsdk', - 'pooch', - 'pydantic>=2.7.0', 'pytest', - 'pyyaml', - 'tqdm', ] -[project.scripts] -download_bionemo_data = "bionemo.testing.data.load:entrypoint" - -# Make sure that the resource yaml files are being packaged alongside the python files. -[tool.setuptools.package-data] -"bionemo.testing" = ["**/*.yaml"] - [tool.setuptools.packages.find] where = ["src"] include = ["bionemo.*"] diff --git a/sub-packages/bionemo-testing/src/bionemo/testing/data/load.py b/sub-packages/bionemo-testing/src/bionemo/testing/data/load.py index 58978abec6..cc5ec123cb 100644 --- a/sub-packages/bionemo-testing/src/bionemo/testing/data/load.py +++ b/sub-packages/bionemo-testing/src/bionemo/testing/data/load.py @@ -12,284 +12,20 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from typing import Sequence -import argparse -import contextlib -import shutil -import sys -import tempfile -from dataclasses import dataclass -from pathlib import Path -from typing import Literal, Optional, Sequence, TextIO +from bionemo.core.data.load import default_ngc_client, default_pbss_client, entrypoint, load -import boto3 -import ngcsdk -import pooch -from botocore.config import Config -from tqdm import tqdm -from bionemo.core import BIONEMO_CACHE_DIR -from bionemo.testing.data.resource import Resource, get_all_resources +_ = entrypoint +# This needs to be around so that ruff doesn't automatically remove it as it's unused. +# We don't want to include it in __all__. +# But older installations __may__ be using the old CLI path (bionemo.core.data.load:entrypoint) +# so this is here for backwards compatability. 
-__all__: Sequence[str] = ("load",) - - -def default_pbss_client(): - """Create a default S3 client for PBSS.""" - retry_config = Config(retries={"max_attempts": 10, "mode": "standard"}) - return boto3.client("s3", endpoint_url="https://pbss.s8k.io", config=retry_config) - - -def _s3_download(url: str, output_file: str | Path, _: pooch.Pooch) -> None: - """Download a file from PBSS.""" - # Parse S3 URL to get bucket and key - parts = url.replace("s3://", "").split("/") - bucket = parts[0] - key = "/".join(parts[1:]) - - with contextlib.closing(default_pbss_client()) as s3: - object_size = s3.head_object(Bucket=bucket, Key=key)["ContentLength"] - progress_bar = tqdm(total=object_size, unit="B", unit_scale=True, desc=url) - - # Define callback - def progress_callback(bytes_transferred): - progress_bar.update(bytes_transferred) - - # Download file from S3 - s3.download_file(bucket, key, output_file, Callback=progress_callback) - - -def default_ngc_client() -> ngcsdk.Client: - """Create a default NGC client. - - This should load the NGC API key from ~/.ngc/config, or from environment variables passed to the docker container. - """ - return ngcsdk.Client() - - -@dataclass -class NGCDownloader: - """A class to download files from NGC in a Pooch-compatible way. - - NGC downloads are typically structured as directories, while pooch expects a single file. This class - downloads a single file from an NGC directory and moves it to the desired location. 
- """ - - filename: str - ngc_registry: Literal["model", "resource"] - - def __call__(self, url: str, output_file: str | Path, _: pooch.Pooch) -> None: - """Download a file from NGC.""" - client = default_ngc_client() - - download_fns = { - "model": client.registry.model.download_version, - "resource": client.registry.resource.download_version, - } - - output_file = Path(output_file) - output_file.parent.mkdir(parents=True, exist_ok=True) - - # NGC seems to always download to a specific directory that we can't specify ourselves. - ngc_dirname = Path(url).name.replace(":", "_v") - - with tempfile.TemporaryDirectory(dir=output_file.parent) as temp_dir: - download_fns[self.ngc_registry](url, temp_dir, file_patterns=[self.filename]) - shutil.move(Path(temp_dir) / ngc_dirname / self.filename, output_file) - - -def load( - model_or_data_tag: str, - source: Literal["ngc", "pbss"] = "pbss", - resources: dict[str, Resource] | None = None, - cache_dir: Path | None = None, -) -> Path: - """Download a resource from PBSS or NGC. - - Args: - model_or_data_tag: A pointer to the desired resource. Must be a key in the resources dictionary. - source: Either "pbss" (NVIDIA-internal download) or "ngc" (NVIDIA GPU Cloud). Defaults to "pbss". - resources: A custom dictionary of resources. If None, the default resources will be used. (Mostly for testing.) - cache_dir: The directory to store downloaded files. Defaults to BIONEMO_CACHE_DIR. (Mostly for testing.) - - Raises: - ValueError: If the desired tag was not found, or if an NGC url was requested but not provided. - - Returns: - A Path object pointing either at the downloaded file, or at a decompressed folder containing the - file(s). 
- - Examples: - For a resource specified in 'filename.yaml' with tag 'tag', the following will download the file: - >>> load("filename/tag") - PosixPath(/tmp/bionemo/downloaded-file-name) - """ - if resources is None: - resources = get_all_resources() - - if cache_dir is None: - cache_dir = BIONEMO_CACHE_DIR - - if model_or_data_tag not in resources: - raise ValueError(f"Resource '{model_or_data_tag}' not found.") - - if source == "ngc" and resources[model_or_data_tag].ngc is None: - raise ValueError(f"Resource '{model_or_data_tag}' does not have an NGC URL.") - - resource = resources[model_or_data_tag] - filename = str(resource.pbss).split("/")[-1] - - extension = "".join(Path(filename).suffixes) - processor = _get_processor(extension, resource.unpack, resource.decompress) - - if source == "pbss": - download_fn = _s3_download - url = resource.pbss - - elif source == "ngc": - assert resource.ngc_registry is not None - download_fn = NGCDownloader(filename=filename, ngc_registry=resource.ngc_registry) - url = resource.ngc - - else: - raise ValueError(f"Source '{source}' not supported.") - - download = pooch.retrieve( - url=str(url), - known_hash=resource.sha256, - path=cache_dir, - downloader=download_fn, - processor=processor, - ) - - # Pooch by default returns a list of unpacked files if they unpack a zipped or tarred directory. Instead of that, we - # just want the unpacked, parent folder. - if isinstance(download, list): - return Path(processor.extract_dir) # type: ignore - - else: - return Path(download) - - -def _get_processor(extension: str, unpack: bool | None, decompress: bool | None): - """Get the processor for a given file extension. - - If unpack and decompress are both None, the processor will be inferred from the file extension. - - Args: - extension: The file extension. - unpack: Whether to unpack the file. - decompress: Whether to decompress the file. - - Returns: - A Pooch processor object. 
- """ - if extension in {".gz", ".bz2", ".xz"} and decompress is None: - return pooch.Decompress() - - elif extension in {".tar", ".tar.gz"} and unpack is None: - return pooch.Untar() - - elif extension == ".zip" and unpack is None: - return pooch.Unzip() - - else: - return None - - -def print_resources(*, output_source: TextIO = sys.stdout) -> None: - """Prints all available downloadable resources & their sources to STDOUT.""" - print("#resource_name\tsource_options", file=output_source) - for resource_name, resource in sorted(get_all_resources().items()): - sources = [] - if resource.ngc is not None: - sources.append("ngc") - if resource.pbss is not None: - sources.append("pbss") - print(f"{resource_name}\t{','.join(sources)}", file=output_source) - - -def entrypoint(): - """Allows a user to get a specific artifact from the command line.""" - parser = argparse.ArgumentParser( - description="Retrieve the local path to the requested artifact name or list resources." - ) - - # Create mutually exclusive group - group = parser.add_mutually_exclusive_group(required=True) - - # Add the argument for artifact name, which is required if --list-resources is not used - group.add_argument("artifact_name", type=str, nargs="?", help="Name of the artifact") - - # Add the --list-resources option - group.add_argument( - "--list-resources", action="store_true", default=False, help="List all available artifacts and then exit." - ) - - # Add the --source option - parser.add_argument( - "--source", - type=str, - choices=["pbss", "ngc"], - default="ngc", - help='Backend to use, Internal NVIDIA users can set this to "pbss".', - ) - - parser.add_argument( - "--all", - action="store_true", - default=False, - help="Download all resources. 
Ignores all other options.", - ) - args = parser.parse_args() - maybe_error = main( - download_all=args.all, - list_resources=args.list_resources, - artifact_name=args.artifact_name, - source=args.source, - ) - if maybe_error is not None: - parser.error(maybe_error) - - -if __name__ == "__main__": - entrypoint() - - -def main( - download_all: bool, list_resources: bool, artifact_name: str, source: Literal["pbss", "ngc"] -) -> Optional[str]: - """Main download script logic: parameters are 1:1 with CLI flags. Returns string describing error on failure.""" - if download_all: - print("Downloading all resources:", file=sys.stderr) - print_resources(output_source=sys.stderr) - print("-" * 80, file=sys.stderr) - - resource_to_local: dict[str, Path] = {} - for resource_name in tqdm( - sorted(get_all_resources()), - desc="Downloading Resources", - ): - with contextlib.redirect_stdout(sys.stderr): - local_path = load(resource_name, source=source) - resource_to_local[resource_name] = local_path - - print("-" * 80, file=sys.stderr) - print("All resources downloaded:", file=sys.stderr) - for resource_name, local_path in sorted(resource_to_local.items()): - print(f" {resource_name}: {str(local_path.absolute())}", file=sys.stderr) - - elif list_resources: - print_resources(output_source=sys.stdout) - - elif artifact_name is not None and len(artifact_name) > 0: - # Get the local path for the provided artifact name - with contextlib.redirect_stdout(sys.stderr): - local_path = load(artifact_name, source=source) - - # Print the result => CLI use assumes that we can get the single downloaded resource's path on STDOUT - print(str(local_path.absolute())) - - else: - return "You must provide an artifact name if --list-resources or --all is not set!" 
+__all__: Sequence[str] = ( + "load", + "default_ngc_client", + "default_pbss_client", +) diff --git a/sub-packages/bionemo-testing/src/bionemo/testing/data/resource.py b/sub-packages/bionemo-testing/src/bionemo/testing/data/resource.py index 065af432dc..677f6f49e0 100644 --- a/sub-packages/bionemo-testing/src/bionemo/testing/data/resource.py +++ b/sub-packages/bionemo-testing/src/bionemo/testing/data/resource.py @@ -12,90 +12,12 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from typing import Sequence +from bionemo.core.data.resource import Resource, get_all_resources -import functools -import itertools -from collections import Counter -from importlib.resources import files -from pathlib import Path -from typing import Annotated, Literal -import pydantic -import yaml -from registry.api.utils import RegistryTarget - - -def _validate_ngc_resource(value: str) -> str: - return str(RegistryTarget(value, "Pattern should be in format [org/[team/]]name[:version]")) - - -class Resource(pydantic.BaseModel): - """Class that represents a remote resource for downloading and caching test data.""" - - model_config = pydantic.ConfigDict(use_attribute_docstrings=True) - - tag: Annotated[str, pydantic.StringConstraints(pattern=r"^[^/]*/[^/]*$")] # Only slash between filename and tag. - """A unique identifier for the resource. The file(s) will be accessible via load("filename/tag").""" - - ngc: Annotated[str, pydantic.AfterValidator(_validate_ngc_resource)] | None = None - """The NGC URL for the resource. - - Should be in format [org/[team/]]name[:version]. If None, the resource is not available on NGC. - """ - - ngc_registry: Literal["model", "resource"] | None = None - """The NGC resource type (model or resource) for the data. 
Must be provided if ngc is not None.""" - - pbss: Annotated[pydantic.AnyUrl, pydantic.UrlConstraints(allowed_schemes=["s3"])] - """The PBSS (NVIDIA-internal) URL of the resource.""" - - sha256: str | None - """The SHA256 checksum of the resource. If None, the SHA will not be checked on download (not recommended).""" - - owner: pydantic.NameEmail - """The owner or primary point of contact for the resource, in the format "Name ".""" - - description: str | None = None - """A description of the file(s).""" - - unpack: Literal[False, None] = None - """Whether the resource should be unpacked after download. If None, will defer to the file extension.""" - - decompress: Literal[False, None] = None - """Whether the resource should be decompressed after download. If None, will defer to the file extension.""" - - @pydantic.model_validator(mode="after") - def _validate_ngc_registry(self): - if self.ngc and not self.ngc_registry: - raise ValueError(f"ngc_registry must be provided if ngc is not None: {self.tag}") - return self - - -@functools.cache -def get_all_resources(resource_path: Path | None = None) -> dict[str, Resource]: - """Return a dictionary of all resources.""" - if not resource_path: - resource_path = Path(files("bionemo.testing.data").joinpath("resources")) # type: ignore - - resources_files = itertools.chain(resource_path.glob("*.yaml"), resource_path.glob("*.yml")) - - all_resources = [resource for file in resources_files for resource in _parse_resource_file(file)] - - resource_list = pydantic.TypeAdapter(list[Resource]).validate_python(all_resources) - resource_dict = {resource.tag: resource for resource in resource_list} - - if len(resource_dict) != len(resource_list): - # Show the # of and which ones are duplicated so that a user can begin debugging and resolve the issue. 
- tag_counts = Counter([resource.tag for resource in resource_list]) - raise ValueError(f"Duplicate resource tags found!: {[tag for tag, count in tag_counts.items() if count > 1]}") - - return resource_dict - - -def _parse_resource_file(file) -> list: - with file.open("r") as f: - resources = yaml.safe_load(f) - for resource in resources: - resource["tag"] = f"{file.stem}/{resource['tag']}" - return resources +__all__: Sequence[str] = ( + "Resource", + "get_all_resources", +) From 0c923edff5d93b16d742f02b9b2f870d688a72f0 Mon Sep 17 00:00:00 2001 From: Malcolm Greaves Date: Thu, 7 Nov 2024 16:28:46 -0500 Subject: [PATCH 2/2] Revert "Refactored `load` & `download_bionemo_data` into bionemo-core" (#411) Reverts NVIDIA/bionemo-framework#396 --- scripts/protein/esm2/test_esm2_infer.py | 2 +- scripts/protein/esm2/test_pydantic_train.py | 2 +- sub-packages/bionemo-core/pyproject.toml | 16 +- .../src/bionemo/core/data/load.py | 300 ------------------ .../src/bionemo/core/data/resource.py | 107 ------- .../data/test_multi_epoch_dataset.py | 0 .../{core => pytorch}/data/test_permute.py | 0 .../{core => pytorch}/data/test_resamplers.py | 0 .../{core => pytorch}/utils/test_dtypes.py | 0 .../tests/bionemo/esm2/model/test_model.py | 2 +- .../bionemo/esm2/model/test_stop_and_go.py | 2 +- .../scripts/geneformer_mlm_loss_eval.py | 2 +- .../geneformer/scripts/infer_geneformer.py | 2 +- .../geneformer/scripts/test_pydantic_train.py | 2 +- .../scripts/test_train_geneformer.py | 2 +- .../tests/bionemo/geneformer/test_model.py | 2 +- .../bionemo/geneformer/test_stop_and_go.py | 2 +- .../tests/bionemo/scdl/conftest.py | 2 +- sub-packages/bionemo-testing/pyproject.toml | 13 + .../src/bionemo/testing}/data/README.md | 0 .../src/bionemo/testing/data/load.py | 288 ++++++++++++++++- .../src/bionemo/testing/data/resource.py | 90 +++++- .../bionemo/testing}/data/resources/esm2.yaml | 0 .../testing}/data/resources/geneformer.yaml | 0 .../bionemo/testing}/data/resources/scdl.yaml | 0 
.../testing}/data/resources/single_cell.yaml | 0 .../tests/bionemo/testing}/data/test_load.py | 20 +- .../bionemo/testing}/data/test_resource.py | 2 +- 28 files changed, 396 insertions(+), 462 deletions(-) delete mode 100644 sub-packages/bionemo-core/src/bionemo/core/data/load.py delete mode 100644 sub-packages/bionemo-core/src/bionemo/core/data/resource.py rename sub-packages/bionemo-core/tests/bionemo/{core => pytorch}/data/test_multi_epoch_dataset.py (100%) rename sub-packages/bionemo-core/tests/bionemo/{core => pytorch}/data/test_permute.py (100%) rename sub-packages/bionemo-core/tests/bionemo/{core => pytorch}/data/test_resamplers.py (100%) rename sub-packages/bionemo-core/tests/bionemo/{core => pytorch}/utils/test_dtypes.py (100%) rename sub-packages/{bionemo-core/src/bionemo/core => bionemo-testing/src/bionemo/testing}/data/README.md (100%) rename sub-packages/{bionemo-core/src/bionemo/core => bionemo-testing/src/bionemo/testing}/data/resources/esm2.yaml (100%) rename sub-packages/{bionemo-core/src/bionemo/core => bionemo-testing/src/bionemo/testing}/data/resources/geneformer.yaml (100%) rename sub-packages/{bionemo-core/src/bionemo/core => bionemo-testing/src/bionemo/testing}/data/resources/scdl.yaml (100%) rename sub-packages/{bionemo-core/src/bionemo/core => bionemo-testing/src/bionemo/testing}/data/resources/single_cell.yaml (100%) rename sub-packages/{bionemo-core/tests/bionemo/core => bionemo-testing/tests/bionemo/testing}/data/test_load.py (95%) rename sub-packages/{bionemo-core/tests/bionemo/core => bionemo-testing/tests/bionemo/testing}/data/test_resource.py (98%) diff --git a/scripts/protein/esm2/test_esm2_infer.py b/scripts/protein/esm2/test_esm2_infer.py index 4c214d2c2f..c5ec1f633d 100644 --- a/scripts/protein/esm2/test_esm2_infer.py +++ b/scripts/protein/esm2/test_esm2_infer.py @@ -21,10 +21,10 @@ from esm2_infer import infer_model from torch.utils.data import DataLoader -from bionemo.core.data.load import load from bionemo.esm2.api import 
ESM2Config from bionemo.esm2.data.tokenizer import get_tokenizer from bionemo.esm2.model.finetune.datamodule import ESM2FineTuneDataModule, InMemoryCSVDataset +from bionemo.testing.data.load import load esm2_650m_checkpoint_path = load("esm2/650m:2.0") diff --git a/scripts/protein/esm2/test_pydantic_train.py b/scripts/protein/esm2/test_pydantic_train.py index a7c3ababa8..2522e538f8 100644 --- a/scripts/protein/esm2/test_pydantic_train.py +++ b/scripts/protein/esm2/test_pydantic_train.py @@ -21,8 +21,8 @@ import pytest from lightning.fabric.plugins.environments.lightning import find_free_network_port -from bionemo.core.data.load import load from bionemo.testing.data.esm2 import create_mock_parquet_train_val_inputs, create_mock_protein_dataset +from bionemo.testing.data.load import load data_path: Path = load("single_cell/testdata-20240506") / "cellxgene_2023-12-15_small" / "processed_data" diff --git a/sub-packages/bionemo-core/pyproject.toml b/sub-packages/bionemo-core/pyproject.toml index 079d9c8e58..c6e04c7270 100644 --- a/sub-packages/bionemo-core/pyproject.toml +++ b/sub-packages/bionemo-core/pyproject.toml @@ -12,28 +12,14 @@ license = { file = "LICENSE" } dynamic = ["version"] dependencies = [ # bionemo sub-packages - # bionemo-core **MUST NOT** depend on any other sub-packages !!!!! # external "numpy", "platformdirs", "torch>=2.2.1", - 'boto3', - 'lightning>=2.2.1', - 'ngcsdk', - 'pooch', - 'pydantic>=2.7.0', 'pytorch-lightning>=2.2.1', - 'pyyaml', - 'tqdm', + 'lightning>=2.2.1', ] -[project.scripts] -download_bionemo_data = "bionemo.core.data.load:entrypoint" - -# Make sure that the resource yaml files are being packaged alongside the python files. 
-[tool.setuptools.package-data] -"bionemo.core" = ["**/*.yaml"] - [tool.setuptools.packages.find] where = ["src"] include = ["bionemo.*"] diff --git a/sub-packages/bionemo-core/src/bionemo/core/data/load.py b/sub-packages/bionemo-core/src/bionemo/core/data/load.py deleted file mode 100644 index ca737c37a4..0000000000 --- a/sub-packages/bionemo-core/src/bionemo/core/data/load.py +++ /dev/null @@ -1,300 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: LicenseRef-Apache2 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import argparse -import contextlib -import shutil -import sys -import tempfile -from dataclasses import dataclass -from pathlib import Path -from typing import Literal, Optional, Sequence, TextIO - -import boto3 -import ngcsdk -import pooch -from botocore.config import Config -from tqdm import tqdm - -from bionemo.core import BIONEMO_CACHE_DIR -from bionemo.core.data.resource import Resource, get_all_resources - - -__all__: Sequence[str] = ( - "load", - "default_ngc_client", - "default_pbss_client", - "NGCDownloader", -) - - -def default_pbss_client(): - """Create a default S3 client for PBSS.""" - retry_config = Config(retries={"max_attempts": 10, "mode": "standard"}) - return boto3.client("s3", endpoint_url="https://pbss.s8k.io", config=retry_config) - - -def _s3_download(url: str, output_file: str | Path, _: pooch.Pooch) -> None: - """Download a file from PBSS.""" - # Parse S3 URL to get bucket and key - parts = url.replace("s3://", "").split("/") - bucket = parts[0] - key = "/".join(parts[1:]) - - with contextlib.closing(default_pbss_client()) as s3: - object_size = s3.head_object(Bucket=bucket, Key=key)["ContentLength"] - progress_bar = tqdm(total=object_size, unit="B", unit_scale=True, desc=url) - - # Define callback - def progress_callback(bytes_transferred): - progress_bar.update(bytes_transferred) - - # Download file from S3 - s3.download_file(bucket, key, output_file, Callback=progress_callback) - - -def default_ngc_client() -> ngcsdk.Client: - """Create a default NGC client. - - This should load the NGC API key from ~/.ngc/config, or from environment variables passed to the docker container. - """ - return ngcsdk.Client() - - -@dataclass -class NGCDownloader: - """A class to download files from NGC in a Pooch-compatible way. - - NGC downloads are typically structured as directories, while pooch expects a single file. This class - downloads a single file from an NGC directory and moves it to the desired location. 
- """ - - filename: str - ngc_registry: Literal["model", "resource"] - - def __call__(self, url: str, output_file: str | Path, _: pooch.Pooch) -> None: - """Download a file from NGC.""" - client = default_ngc_client() - - download_fns = { - "model": client.registry.model.download_version, - "resource": client.registry.resource.download_version, - } - - output_file = Path(output_file) - output_file.parent.mkdir(parents=True, exist_ok=True) - - # NGC seems to always download to a specific directory that we can't specify ourselves. - ngc_dirname = Path(url).name.replace(":", "_v") - - with tempfile.TemporaryDirectory(dir=output_file.parent) as temp_dir: - download_fns[self.ngc_registry](url, temp_dir, file_patterns=[self.filename]) - shutil.move(Path(temp_dir) / ngc_dirname / self.filename, output_file) - - -def load( - model_or_data_tag: str, - source: Literal["ngc", "pbss"] = "pbss", - resources: dict[str, Resource] | None = None, - cache_dir: Path | None = None, -) -> Path: - """Download a resource from PBSS or NGC. - - Args: - model_or_data_tag: A pointer to the desired resource. Must be a key in the resources dictionary. - source: Either "pbss" (NVIDIA-internal download) or "ngc" (NVIDIA GPU Cloud). Defaults to "pbss". - resources: A custom dictionary of resources. If None, the default resources will be used. (Mostly for testing.) - cache_dir: The directory to store downloaded files. Defaults to BIONEMO_CACHE_DIR. (Mostly for testing.) - - Raises: - ValueError: If the desired tag was not found, or if an NGC url was requested but not provided. - - Returns: - A Path object pointing either at the downloaded file, or at a decompressed folder containing the - file(s). 
- - Examples: - For a resource specified in 'filename.yaml' with tag 'tag', the following will download the file: - >>> load("filename/tag") - PosixPath(/tmp/bionemo/downloaded-file-name) - """ - if resources is None: - resources = get_all_resources() - - if cache_dir is None: - cache_dir = BIONEMO_CACHE_DIR - - if model_or_data_tag not in resources: - raise ValueError(f"Resource '{model_or_data_tag}' not found.") - - if source == "ngc" and resources[model_or_data_tag].ngc is None: - raise ValueError(f"Resource '{model_or_data_tag}' does not have an NGC URL.") - - resource = resources[model_or_data_tag] - filename = str(resource.pbss).split("/")[-1] - - extension = "".join(Path(filename).suffixes) - processor = _get_processor(extension, resource.unpack, resource.decompress) - - if source == "pbss": - download_fn = _s3_download - url = resource.pbss - - elif source == "ngc": - assert resource.ngc_registry is not None - download_fn = NGCDownloader(filename=filename, ngc_registry=resource.ngc_registry) - url = resource.ngc - - else: - raise ValueError(f"Source '{source}' not supported.") - - download = pooch.retrieve( - url=str(url), - known_hash=resource.sha256, - path=cache_dir, - downloader=download_fn, - processor=processor, - ) - - # Pooch by default returns a list of unpacked files if they unpack a zipped or tarred directory. Instead of that, we - # just want the unpacked, parent folder. - if isinstance(download, list): - return Path(processor.extract_dir) # type: ignore - - else: - return Path(download) - - -def _get_processor(extension: str, unpack: bool | None, decompress: bool | None): - """Get the processor for a given file extension. - - If unpack and decompress are both None, the processor will be inferred from the file extension. - - Args: - extension: The file extension. - unpack: Whether to unpack the file. - decompress: Whether to decompress the file. - - Returns: - A Pooch processor object. 
- """ - if extension in {".gz", ".bz2", ".xz"} and decompress is None: - return pooch.Decompress() - - elif extension in {".tar", ".tar.gz"} and unpack is None: - return pooch.Untar() - - elif extension == ".zip" and unpack is None: - return pooch.Unzip() - - else: - return None - - -def print_resources(*, output_source: TextIO = sys.stdout) -> None: - """Prints all available downloadable resources & their sources to STDOUT.""" - print("#resource_name\tsource_options", file=output_source) - for resource_name, resource in sorted(get_all_resources().items()): - sources = [] - if resource.ngc is not None: - sources.append("ngc") - if resource.pbss is not None: - sources.append("pbss") - print(f"{resource_name}\t{','.join(sources)}", file=output_source) - - -def entrypoint(): - """Allows a user to get a specific artifact from the command line.""" - parser = argparse.ArgumentParser( - description="Retrieve the local path to the requested artifact name or list resources." - ) - - # Create mutually exclusive group - group = parser.add_mutually_exclusive_group(required=True) - - # Add the argument for artifact name, which is required if --list-resources is not used - group.add_argument("artifact_name", type=str, nargs="?", help="Name of the artifact") - - # Add the --list-resources option - group.add_argument( - "--list-resources", action="store_true", default=False, help="List all available artifacts and then exit." - ) - - # Add the --source option - parser.add_argument( - "--source", - type=str, - choices=["pbss", "ngc"], - default="ngc", - help='Backend to use, Internal NVIDIA users can set this to "pbss".', - ) - - parser.add_argument( - "--all", - action="store_true", - default=False, - help="Download all resources. 
Ignores all other options.", - ) - args = parser.parse_args() - maybe_error = main( - download_all=args.all, - list_resources=args.list_resources, - artifact_name=args.artifact_name, - source=args.source, - ) - if maybe_error is not None: - parser.error(maybe_error) - - -if __name__ == "__main__": - entrypoint() - - -def main( - download_all: bool, list_resources: bool, artifact_name: str, source: Literal["pbss", "ngc"] -) -> Optional[str]: - """Main download script logic: parameters are 1:1 with CLI flags. Returns string describing error on failure.""" - if download_all: - print("Downloading all resources:", file=sys.stderr) - print_resources(output_source=sys.stderr) - print("-" * 80, file=sys.stderr) - - resource_to_local: dict[str, Path] = {} - for resource_name in tqdm( - sorted(get_all_resources()), - desc="Downloading Resources", - ): - with contextlib.redirect_stdout(sys.stderr): - local_path = load(resource_name, source=source) - resource_to_local[resource_name] = local_path - - print("-" * 80, file=sys.stderr) - print("All resources downloaded:", file=sys.stderr) - for resource_name, local_path in sorted(resource_to_local.items()): - print(f" {resource_name}: {str(local_path.absolute())}", file=sys.stderr) - - elif list_resources: - print_resources(output_source=sys.stdout) - - elif artifact_name is not None and len(artifact_name) > 0: - # Get the local path for the provided artifact name - with contextlib.redirect_stdout(sys.stderr): - local_path = load(artifact_name, source=source) - - # Print the result => CLI use assumes that we can get the single downloaded resource's path on STDOUT - print(str(local_path.absolute())) - - else: - return "You must provide an artifact name if --list-resources or --all is not set!" 
diff --git a/sub-packages/bionemo-core/src/bionemo/core/data/resource.py b/sub-packages/bionemo-core/src/bionemo/core/data/resource.py deleted file mode 100644 index 419975854f..0000000000 --- a/sub-packages/bionemo-core/src/bionemo/core/data/resource.py +++ /dev/null @@ -1,107 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: LicenseRef-Apache2 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import functools -import itertools -from collections import Counter -from importlib.resources import files -from pathlib import Path -from typing import Annotated, Any, Literal, Sequence - -import pydantic -import yaml -from registry.api.utils import RegistryTarget - - -__all__: Sequence[str] = ( - "Resource", - "get_all_resources", -) - - -def _validate_ngc_resource(value: str) -> str: - return str(RegistryTarget(value, "Pattern should be in format [org/[team/]]name[:version]")) - - -class Resource(pydantic.BaseModel): - """Class that represents a remote resource for downloading and caching test data.""" - - model_config = pydantic.ConfigDict(use_attribute_docstrings=True) - - tag: Annotated[str, pydantic.StringConstraints(pattern=r"^[^/]*/[^/]*$")] # Only slash between filename and tag. - """A unique identifier for the resource. 
The file(s) will be accessible via load("filename/tag").""" - - ngc: Annotated[str, pydantic.AfterValidator(_validate_ngc_resource)] | None = None - """The NGC URL for the resource. - - Should be in format [org/[team/]]name[:version]. If None, the resource is not available on NGC. - """ - - ngc_registry: Literal["model", "resource"] | None = None - """The NGC resource type (model or resource) for the data. Must be provided if ngc is not None.""" - - pbss: Annotated[pydantic.AnyUrl, pydantic.UrlConstraints(allowed_schemes=["s3"])] - """The PBSS (NVIDIA-internal) URL of the resource.""" - - sha256: str | None - """The SHA256 checksum of the resource. If None, the SHA will not be checked on download (not recommended).""" - - owner: pydantic.NameEmail - """The owner or primary point of contact for the resource, in the format "Name <email>".""" - - description: str | None = None - """A description of the file(s).""" - - unpack: Literal[False, None] = None - """Whether the resource should be unpacked after download. If None, will defer to the file extension.""" - - decompress: Literal[False, None] = None - """Whether the resource should be decompressed after download. 
If None, will defer to the file extension.""" - - @pydantic.model_validator(mode="after") - def _validate_ngc_registry(self): - if self.ngc and not self.ngc_registry: - raise ValueError(f"ngc_registry must be provided if ngc is not None: {self.tag}") - return self - - -@functools.cache -def get_all_resources(resource_path: Path | None = None) -> dict[str, Resource]: - """Return a dictionary of all resources.""" - if not resource_path: - resource_path = Path(files("bionemo.core.data").joinpath("resources")) # type: ignore - - resources_files = itertools.chain(resource_path.glob("*.yaml"), resource_path.glob("*.yml")) - - all_resources = [resource for file in resources_files for resource in _parse_resource_file(file)] - - resource_list = pydantic.TypeAdapter(list[Resource]).validate_python(all_resources) - resource_dict = {resource.tag: resource for resource in resource_list} - - if len(resource_dict) != len(resource_list): - # Show the # of and which ones are duplicated so that a user can begin debugging and resolve the issue. 
- tag_counts = Counter([resource.tag for resource in resource_list]) - raise ValueError(f"Duplicate resource tags found!: {[tag for tag, count in tag_counts.items() if count > 1]}") - - return resource_dict - - -def _parse_resource_file(file) -> list[dict[str, Any]]: - with file.open("r") as f: - resources = yaml.safe_load(f) - for resource in resources: - resource["tag"] = f"{file.stem}/{resource['tag']}" - return resources diff --git a/sub-packages/bionemo-core/tests/bionemo/core/data/test_multi_epoch_dataset.py b/sub-packages/bionemo-core/tests/bionemo/pytorch/data/test_multi_epoch_dataset.py similarity index 100% rename from sub-packages/bionemo-core/tests/bionemo/core/data/test_multi_epoch_dataset.py rename to sub-packages/bionemo-core/tests/bionemo/pytorch/data/test_multi_epoch_dataset.py diff --git a/sub-packages/bionemo-core/tests/bionemo/core/data/test_permute.py b/sub-packages/bionemo-core/tests/bionemo/pytorch/data/test_permute.py similarity index 100% rename from sub-packages/bionemo-core/tests/bionemo/core/data/test_permute.py rename to sub-packages/bionemo-core/tests/bionemo/pytorch/data/test_permute.py diff --git a/sub-packages/bionemo-core/tests/bionemo/core/data/test_resamplers.py b/sub-packages/bionemo-core/tests/bionemo/pytorch/data/test_resamplers.py similarity index 100% rename from sub-packages/bionemo-core/tests/bionemo/core/data/test_resamplers.py rename to sub-packages/bionemo-core/tests/bionemo/pytorch/data/test_resamplers.py diff --git a/sub-packages/bionemo-core/tests/bionemo/core/utils/test_dtypes.py b/sub-packages/bionemo-core/tests/bionemo/pytorch/utils/test_dtypes.py similarity index 100% rename from sub-packages/bionemo-core/tests/bionemo/core/utils/test_dtypes.py rename to sub-packages/bionemo-core/tests/bionemo/pytorch/utils/test_dtypes.py diff --git a/sub-packages/bionemo-esm2/tests/bionemo/esm2/model/test_model.py b/sub-packages/bionemo-esm2/tests/bionemo/esm2/model/test_model.py index 0438483901..4b091c9d77 100644 --- 
a/sub-packages/bionemo-esm2/tests/bionemo/esm2/model/test_model.py +++ b/sub-packages/bionemo-esm2/tests/bionemo/esm2/model/test_model.py @@ -27,7 +27,6 @@ from torch import Tensor from transformers import EsmForMaskedLM -from bionemo.core.data.load import load from bionemo.core.utils.dtypes import get_autocast_dtype from bionemo.core.utils.random_utils import random_numpy_context from bionemo.esm2.api import ESM2Config, ESM2Model @@ -37,6 +36,7 @@ from bionemo.llm.model.biobert.model import MegatronBioBertModel from bionemo.llm.utils.weight_utils import nemo1_to_nemo2_biobert_key_mapping from bionemo.testing import megatron_parallel_state_utils +from bionemo.testing.data.load import load nemo1_checkpoint_path: Path = load("esm2/nv_650m:1.0") diff --git a/sub-packages/bionemo-esm2/tests/bionemo/esm2/model/test_stop_and_go.py b/sub-packages/bionemo-esm2/tests/bionemo/esm2/model/test_stop_and_go.py index 8be351eb5b..18be7eccf3 100644 --- a/sub-packages/bionemo-esm2/tests/bionemo/esm2/model/test_stop_and_go.py +++ b/sub-packages/bionemo-esm2/tests/bionemo/esm2/model/test_stop_and_go.py @@ -23,7 +23,6 @@ from nemo.lightning.pytorch.optim import MegatronOptimizerModule from typing_extensions import override -from bionemo.core.data.load import load from bionemo.core.utils.dtypes import get_autocast_dtype from bionemo.esm2.api import ESM2Config from bionemo.esm2.data.datamodule import ESMDataModule @@ -31,6 +30,7 @@ from bionemo.esm2.data.tokenizer import BioNeMoESMTokenizer, get_tokenizer from bionemo.llm.model.biobert.lightning import biobert_lightning_module from bionemo.llm.model.lr_scheduler import WarmupAnnealDecayHoldScheduler +from bionemo.testing.data.load import load from bionemo.testing.harnesses import stop_and_go from bionemo.testing.harnesses.mode import Mode diff --git a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/scripts/geneformer_mlm_loss_eval.py b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/scripts/geneformer_mlm_loss_eval.py index 
c8965d092d..a79ae52269 100644 --- a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/scripts/geneformer_mlm_loss_eval.py +++ b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/scripts/geneformer_mlm_loss_eval.py @@ -42,7 +42,6 @@ from tqdm import trange from transformers import AutoModelForMaskedLM -from bionemo.core.data.load import load from bionemo.core.data.multi_epoch_dataset import EpochIndex from bionemo.core.utils.dtypes import get_autocast_dtype from bionemo.geneformer.api import GeneformerConfig @@ -52,6 +51,7 @@ from bionemo.llm.data import collate from bionemo.llm.model.biobert.model import BioBertConfig from bionemo.testing import megatron_parallel_state_utils +from bionemo.testing.data.load import load class GeneformerHFAdapter(torch.nn.Module): diff --git a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/scripts/infer_geneformer.py b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/scripts/infer_geneformer.py index 3a72d11e9e..1215f864b7 100644 --- a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/scripts/infer_geneformer.py +++ b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/scripts/infer_geneformer.py @@ -20,7 +20,6 @@ from nemo import lightning as nl from nemo.utils import logging -from bionemo.core.data.load import load from bionemo.core.utils.dtypes import PrecisionTypes, get_autocast_dtype from bionemo.geneformer.api import FineTuneSeqLenBioBertConfig, GeneformerConfig from bionemo.geneformer.data.singlecell.datamodule import SingleCellDataModule @@ -29,6 +28,7 @@ from bionemo.llm.model.biobert.lightning import biobert_lightning_module from bionemo.llm.model.biobert.model import BioBertConfig from bionemo.llm.utils.datamodule_utils import infer_global_batch_size +from bionemo.testing.data.load import load def infer_model( diff --git a/sub-packages/bionemo-geneformer/tests/bionemo/geneformer/scripts/test_pydantic_train.py 
b/sub-packages/bionemo-geneformer/tests/bionemo/geneformer/scripts/test_pydantic_train.py index f76682fbc0..7eeb47a613 100644 --- a/sub-packages/bionemo-geneformer/tests/bionemo/geneformer/scripts/test_pydantic_train.py +++ b/sub-packages/bionemo-geneformer/tests/bionemo/geneformer/scripts/test_pydantic_train.py @@ -20,7 +20,7 @@ from lightning.fabric.plugins.environments.lightning import find_free_network_port -from bionemo.core.data.load import load +from bionemo.testing.data.load import load data_path: Path = load("single_cell/testdata-20240506") / "cellxgene_2023-12-15_small" / "processed_data" diff --git a/sub-packages/bionemo-geneformer/tests/bionemo/geneformer/scripts/test_train_geneformer.py b/sub-packages/bionemo-geneformer/tests/bionemo/geneformer/scripts/test_train_geneformer.py index 5799ec7611..14ba0b03c0 100644 --- a/sub-packages/bionemo-geneformer/tests/bionemo/geneformer/scripts/test_train_geneformer.py +++ b/sub-packages/bionemo-geneformer/tests/bionemo/geneformer/scripts/test_train_geneformer.py @@ -22,11 +22,11 @@ import pytest from lightning.fabric.plugins.environments.lightning import find_free_network_port -from bionemo.core.data.load import load from bionemo.geneformer.scripts.train_geneformer import get_parser, main from bionemo.llm.model.biobert.transformer_specs import BiobertSpecOption from bionemo.llm.utils.datamodule_utils import parse_kwargs_to_arglist from bionemo.testing import megatron_parallel_state_utils +from bionemo.testing.data.load import load data_path: Path = load("single_cell/testdata-20240506") / "cellxgene_2023-12-15_small" / "processed_data" diff --git a/sub-packages/bionemo-geneformer/tests/bionemo/geneformer/test_model.py b/sub-packages/bionemo-geneformer/tests/bionemo/geneformer/test_model.py index 35621c1f13..a561cb5b87 100644 --- a/sub-packages/bionemo-geneformer/tests/bionemo/geneformer/test_model.py +++ b/sub-packages/bionemo-geneformer/tests/bionemo/geneformer/test_model.py @@ -38,7 +38,6 @@ from torch.nn import 
functional as F from tqdm import tqdm -from bionemo.core.data.load import load from bionemo.core.utils.batching_utils import pad_token_ids from bionemo.core.utils.dtypes import get_autocast_dtype from bionemo.core.utils.random_utils import random_numpy_context @@ -53,6 +52,7 @@ from bionemo.llm.utils.weight_utils import nemo1_to_nemo2_biobert_key_mapping from bionemo.testing import megatron_parallel_state_utils from bionemo.testing.callbacks import MetricTracker +from bionemo.testing.data.load import load from bionemo.testing.utils import ( assert_matrix_correlation_above_value, assert_matrix_mape_below_value, diff --git a/sub-packages/bionemo-geneformer/tests/bionemo/geneformer/test_stop_and_go.py b/sub-packages/bionemo-geneformer/tests/bionemo/geneformer/test_stop_and_go.py index 3820844131..5d55514073 100644 --- a/sub-packages/bionemo-geneformer/tests/bionemo/geneformer/test_stop_and_go.py +++ b/sub-packages/bionemo-geneformer/tests/bionemo/geneformer/test_stop_and_go.py @@ -36,12 +36,12 @@ from nemo.lightning.pytorch.optim.megatron import MegatronOptimizerModule from typing_extensions import override -from bionemo.core.data.load import load from bionemo.core.utils.dtypes import get_autocast_dtype from bionemo.geneformer.api import GeneformerConfig from bionemo.geneformer.data.singlecell.preprocess import GeneformerPreprocess from bionemo.llm.model.biobert.lightning import biobert_lightning_module from bionemo.testing import testing_callbacks +from bionemo.testing.data.load import load from bionemo.testing.harnesses import stop_and_go from bionemo.testing.harnesses.mode import Mode diff --git a/sub-packages/bionemo-scdl/tests/bionemo/scdl/conftest.py b/sub-packages/bionemo-scdl/tests/bionemo/scdl/conftest.py index 0c128e76e2..a13477c81b 100644 --- a/sub-packages/bionemo-scdl/tests/bionemo/scdl/conftest.py +++ b/sub-packages/bionemo-scdl/tests/bionemo/scdl/conftest.py @@ -19,7 +19,7 @@ import pytest -from bionemo.core.data.load import load +from 
bionemo.testing.data.load import load @pytest.fixture diff --git a/sub-packages/bionemo-testing/pyproject.toml b/sub-packages/bionemo-testing/pyproject.toml index 9c503a842e..ccac13b4ad 100644 --- a/sub-packages/bionemo-testing/pyproject.toml +++ b/sub-packages/bionemo-testing/pyproject.toml @@ -15,10 +15,23 @@ dependencies = [ 'bionemo-core', 'bionemo-llm', # external + 'boto3', 'email-validator', + 'ngcsdk', + 'pooch', + 'pydantic>=2.7.0', 'pytest', + 'pyyaml', + 'tqdm', ] +[project.scripts] +download_bionemo_data = "bionemo.testing.data.load:entrypoint" + +# Make sure that the resource yaml files are being packaged alongside the python files. +[tool.setuptools.package-data] +"bionemo.testing" = ["**/*.yaml"] + [tool.setuptools.packages.find] where = ["src"] include = ["bionemo.*"] diff --git a/sub-packages/bionemo-core/src/bionemo/core/data/README.md b/sub-packages/bionemo-testing/src/bionemo/testing/data/README.md similarity index 100% rename from sub-packages/bionemo-core/src/bionemo/core/data/README.md rename to sub-packages/bionemo-testing/src/bionemo/testing/data/README.md diff --git a/sub-packages/bionemo-testing/src/bionemo/testing/data/load.py b/sub-packages/bionemo-testing/src/bionemo/testing/data/load.py index cc5ec123cb..58978abec6 100644 --- a/sub-packages/bionemo-testing/src/bionemo/testing/data/load.py +++ b/sub-packages/bionemo-testing/src/bionemo/testing/data/load.py @@ -12,20 +12,284 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-from typing import Sequence -from bionemo.core.data.load import default_ngc_client, default_pbss_client, entrypoint, load +import argparse +import contextlib +import shutil +import sys +import tempfile +from dataclasses import dataclass +from pathlib import Path +from typing import Literal, Optional, Sequence, TextIO +import boto3 +import ngcsdk +import pooch +from botocore.config import Config +from tqdm import tqdm -_ = entrypoint -# This needs to be around so that ruff doesn't automatically remove it as it's unused. -# We don't want to include it in __all__. -# But older installations __may__ be using the old CLI path (bionemo.core.data.load:entrypoint) -# so this is here for backwards compatability. +from bionemo.core import BIONEMO_CACHE_DIR +from bionemo.testing.data.resource import Resource, get_all_resources -__all__: Sequence[str] = ( - "load", - "default_ngc_client", - "default_pbss_client", -) +__all__: Sequence[str] = ("load",) + + +def default_pbss_client(): + """Create a default S3 client for PBSS.""" + retry_config = Config(retries={"max_attempts": 10, "mode": "standard"}) + return boto3.client("s3", endpoint_url="https://pbss.s8k.io", config=retry_config) + + +def _s3_download(url: str, output_file: str | Path, _: pooch.Pooch) -> None: + """Download a file from PBSS.""" + # Parse S3 URL to get bucket and key + parts = url.replace("s3://", "").split("/") + bucket = parts[0] + key = "/".join(parts[1:]) + + with contextlib.closing(default_pbss_client()) as s3: + object_size = s3.head_object(Bucket=bucket, Key=key)["ContentLength"] + progress_bar = tqdm(total=object_size, unit="B", unit_scale=True, desc=url) + + # Define callback + def progress_callback(bytes_transferred): + progress_bar.update(bytes_transferred) + + # Download file from S3 + s3.download_file(bucket, key, output_file, Callback=progress_callback) + + +def default_ngc_client() -> ngcsdk.Client: + """Create a default NGC client. 
+ + This should load the NGC API key from ~/.ngc/config, or from environment variables passed to the docker container. + """ + return ngcsdk.Client() + + +@dataclass +class NGCDownloader: + """A class to download files from NGC in a Pooch-compatible way. + + NGC downloads are typically structured as directories, while pooch expects a single file. This class + downloads a single file from an NGC directory and moves it to the desired location. + """ + + filename: str + ngc_registry: Literal["model", "resource"] + + def __call__(self, url: str, output_file: str | Path, _: pooch.Pooch) -> None: + """Download a file from NGC.""" + client = default_ngc_client() + + download_fns = { + "model": client.registry.model.download_version, + "resource": client.registry.resource.download_version, + } + + output_file = Path(output_file) + output_file.parent.mkdir(parents=True, exist_ok=True) + + # NGC seems to always download to a specific directory that we can't specify ourselves. + ngc_dirname = Path(url).name.replace(":", "_v") + + with tempfile.TemporaryDirectory(dir=output_file.parent) as temp_dir: + download_fns[self.ngc_registry](url, temp_dir, file_patterns=[self.filename]) + shutil.move(Path(temp_dir) / ngc_dirname / self.filename, output_file) + + +def load( + model_or_data_tag: str, + source: Literal["ngc", "pbss"] = "pbss", + resources: dict[str, Resource] | None = None, + cache_dir: Path | None = None, +) -> Path: + """Download a resource from PBSS or NGC. + + Args: + model_or_data_tag: A pointer to the desired resource. Must be a key in the resources dictionary. + source: Either "pbss" (NVIDIA-internal download) or "ngc" (NVIDIA GPU Cloud). Defaults to "pbss". + resources: A custom dictionary of resources. If None, the default resources will be used. (Mostly for testing.) + cache_dir: The directory to store downloaded files. Defaults to BIONEMO_CACHE_DIR. (Mostly for testing.) 
+ + Raises: + ValueError: If the desired tag was not found, or if an NGC url was requested but not provided. + + Returns: + A Path object pointing either at the downloaded file, or at a decompressed folder containing the + file(s). + + Examples: + For a resource specified in 'filename.yaml' with tag 'tag', the following will download the file: + >>> load("filename/tag") + PosixPath(/tmp/bionemo/downloaded-file-name) + """ + if resources is None: + resources = get_all_resources() + + if cache_dir is None: + cache_dir = BIONEMO_CACHE_DIR + + if model_or_data_tag not in resources: + raise ValueError(f"Resource '{model_or_data_tag}' not found.") + + if source == "ngc" and resources[model_or_data_tag].ngc is None: + raise ValueError(f"Resource '{model_or_data_tag}' does not have an NGC URL.") + + resource = resources[model_or_data_tag] + filename = str(resource.pbss).split("/")[-1] + + extension = "".join(Path(filename).suffixes) + processor = _get_processor(extension, resource.unpack, resource.decompress) + + if source == "pbss": + download_fn = _s3_download + url = resource.pbss + + elif source == "ngc": + assert resource.ngc_registry is not None + download_fn = NGCDownloader(filename=filename, ngc_registry=resource.ngc_registry) + url = resource.ngc + + else: + raise ValueError(f"Source '{source}' not supported.") + + download = pooch.retrieve( + url=str(url), + known_hash=resource.sha256, + path=cache_dir, + downloader=download_fn, + processor=processor, + ) + + # Pooch by default returns a list of unpacked files if they unpack a zipped or tarred directory. Instead of that, we + # just want the unpacked, parent folder. + if isinstance(download, list): + return Path(processor.extract_dir) # type: ignore + + else: + return Path(download) + + +def _get_processor(extension: str, unpack: bool | None, decompress: bool | None): + """Get the processor for a given file extension. 
+ + If unpack and decompress are both None, the processor will be inferred from the file extension. + + Args: + extension: The file extension. + unpack: Whether to unpack the file. + decompress: Whether to decompress the file. + + Returns: + A Pooch processor object. + """ + if extension in {".gz", ".bz2", ".xz"} and decompress is None: + return pooch.Decompress() + + elif extension in {".tar", ".tar.gz"} and unpack is None: + return pooch.Untar() + + elif extension == ".zip" and unpack is None: + return pooch.Unzip() + + else: + return None + + +def print_resources(*, output_source: TextIO = sys.stdout) -> None: + """Prints all available downloadable resources & their sources to STDOUT.""" + print("#resource_name\tsource_options", file=output_source) + for resource_name, resource in sorted(get_all_resources().items()): + sources = [] + if resource.ngc is not None: + sources.append("ngc") + if resource.pbss is not None: + sources.append("pbss") + print(f"{resource_name}\t{','.join(sources)}", file=output_source) + + +def entrypoint(): + """Allows a user to get a specific artifact from the command line.""" + parser = argparse.ArgumentParser( + description="Retrieve the local path to the requested artifact name or list resources." + ) + + # Create mutually exclusive group + group = parser.add_mutually_exclusive_group(required=True) + + # Add the argument for artifact name, which is required if --list-resources is not used + group.add_argument("artifact_name", type=str, nargs="?", help="Name of the artifact") + + # Add the --list-resources option + group.add_argument( + "--list-resources", action="store_true", default=False, help="List all available artifacts and then exit." 
+ ) + + # Add the --source option + parser.add_argument( + "--source", + type=str, + choices=["pbss", "ngc"], + default="ngc", + help='Backend to use, Internal NVIDIA users can set this to "pbss".', + ) + + parser.add_argument( + "--all", + action="store_true", + default=False, + help="Download all resources. Ignores all other options.", + ) + args = parser.parse_args() + maybe_error = main( + download_all=args.all, + list_resources=args.list_resources, + artifact_name=args.artifact_name, + source=args.source, + ) + if maybe_error is not None: + parser.error(maybe_error) + + +if __name__ == "__main__": + entrypoint() + + +def main( + download_all: bool, list_resources: bool, artifact_name: str, source: Literal["pbss", "ngc"] +) -> Optional[str]: + """Main download script logic: parameters are 1:1 with CLI flags. Returns string describing error on failure.""" + if download_all: + print("Downloading all resources:", file=sys.stderr) + print_resources(output_source=sys.stderr) + print("-" * 80, file=sys.stderr) + + resource_to_local: dict[str, Path] = {} + for resource_name in tqdm( + sorted(get_all_resources()), + desc="Downloading Resources", + ): + with contextlib.redirect_stdout(sys.stderr): + local_path = load(resource_name, source=source) + resource_to_local[resource_name] = local_path + + print("-" * 80, file=sys.stderr) + print("All resources downloaded:", file=sys.stderr) + for resource_name, local_path in sorted(resource_to_local.items()): + print(f" {resource_name}: {str(local_path.absolute())}", file=sys.stderr) + + elif list_resources: + print_resources(output_source=sys.stdout) + + elif artifact_name is not None and len(artifact_name) > 0: + # Get the local path for the provided artifact name + with contextlib.redirect_stdout(sys.stderr): + local_path = load(artifact_name, source=source) + + # Print the result => CLI use assumes that we can get the single downloaded resource's path on STDOUT + print(str(local_path.absolute())) + + else: + return "You 
must provide an artifact name if --list-resources or --all is not set!" diff --git a/sub-packages/bionemo-testing/src/bionemo/testing/data/resource.py b/sub-packages/bionemo-testing/src/bionemo/testing/data/resource.py index 677f6f49e0..065af432dc 100644 --- a/sub-packages/bionemo-testing/src/bionemo/testing/data/resource.py +++ b/sub-packages/bionemo-testing/src/bionemo/testing/data/resource.py @@ -12,12 +12,90 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from typing import Sequence -from bionemo.core.data.resource import Resource, get_all_resources +import functools +import itertools +from collections import Counter +from importlib.resources import files +from pathlib import Path +from typing import Annotated, Literal -__all__: Sequence[str] = ( - "Resource", - "get_all_resources", -) +import pydantic +import yaml +from registry.api.utils import RegistryTarget + + +def _validate_ngc_resource(value: str) -> str: + return str(RegistryTarget(value, "Pattern should be in format [org/[team/]]name[:version]")) + + +class Resource(pydantic.BaseModel): + """Class that represents a remote resource for downloading and caching test data.""" + + model_config = pydantic.ConfigDict(use_attribute_docstrings=True) + + tag: Annotated[str, pydantic.StringConstraints(pattern=r"^[^/]*/[^/]*$")] # Only slash between filename and tag. + """A unique identifier for the resource. The file(s) will be accessible via load("filename/tag").""" + + ngc: Annotated[str, pydantic.AfterValidator(_validate_ngc_resource)] | None = None + """The NGC URL for the resource. + + Should be in format [org/[team/]]name[:version]. If None, the resource is not available on NGC. + """ + + ngc_registry: Literal["model", "resource"] | None = None + """The NGC resource type (model or resource) for the data. 
Must be provided if ngc is not None.""" + + pbss: Annotated[pydantic.AnyUrl, pydantic.UrlConstraints(allowed_schemes=["s3"])] + """The PBSS (NVIDIA-internal) URL of the resource.""" + + sha256: str | None + """The SHA256 checksum of the resource. If None, the SHA will not be checked on download (not recommended).""" + + owner: pydantic.NameEmail + """The owner or primary point of contact for the resource, in the format "Name <email>".""" + + description: str | None = None + """A description of the file(s).""" + + unpack: Literal[False, None] = None + """Whether the resource should be unpacked after download. If None, will defer to the file extension.""" + + decompress: Literal[False, None] = None + """Whether the resource should be decompressed after download. If None, will defer to the file extension.""" + + @pydantic.model_validator(mode="after") + def _validate_ngc_registry(self): + if self.ngc and not self.ngc_registry: + raise ValueError(f"ngc_registry must be provided if ngc is not None: {self.tag}") + return self + + +@functools.cache +def get_all_resources(resource_path: Path | None = None) -> dict[str, Resource]: + """Return a dictionary of all resources.""" + if not resource_path: + resource_path = Path(files("bionemo.testing.data").joinpath("resources")) # type: ignore + + resources_files = itertools.chain(resource_path.glob("*.yaml"), resource_path.glob("*.yml")) + + all_resources = [resource for file in resources_files for resource in _parse_resource_file(file)] + + resource_list = pydantic.TypeAdapter(list[Resource]).validate_python(all_resources) + resource_dict = {resource.tag: resource for resource in resource_list} + + if len(resource_dict) != len(resource_list): + # Show the # of and which ones are duplicated so that a user can begin debugging and resolve the issue. 
+ tag_counts = Counter([resource.tag for resource in resource_list]) + raise ValueError(f"Duplicate resource tags found!: {[tag for tag, count in tag_counts.items() if count > 1]}") + + return resource_dict + + +def _parse_resource_file(file) -> list: + with file.open("r") as f: + resources = yaml.safe_load(f) + for resource in resources: + resource["tag"] = f"{file.stem}/{resource['tag']}" + return resources diff --git a/sub-packages/bionemo-testing/src/bionemo/testing/data/resources/esm2.yaml b/sub-packages/bionemo-core/src/bionemo/core/data/resources/esm2.yaml similarity index 100% rename from sub-packages/bionemo-testing/src/bionemo/testing/data/resources/esm2.yaml rename to sub-packages/bionemo-core/src/bionemo/core/data/resources/esm2.yaml diff --git a/sub-packages/bionemo-testing/src/bionemo/testing/data/resources/geneformer.yaml b/sub-packages/bionemo-core/src/bionemo/core/data/resources/geneformer.yaml similarity index 100% rename from sub-packages/bionemo-testing/src/bionemo/testing/data/resources/geneformer.yaml rename to sub-packages/bionemo-core/src/bionemo/core/data/resources/geneformer.yaml diff --git a/sub-packages/bionemo-testing/src/bionemo/testing/data/resources/scdl.yaml b/sub-packages/bionemo-core/src/bionemo/core/data/resources/scdl.yaml similarity index 100% rename from sub-packages/bionemo-testing/src/bionemo/testing/data/resources/scdl.yaml rename to sub-packages/bionemo-core/src/bionemo/core/data/resources/scdl.yaml diff --git a/sub-packages/bionemo-testing/src/bionemo/testing/data/resources/single_cell.yaml b/sub-packages/bionemo-core/src/bionemo/core/data/resources/single_cell.yaml similarity index 100% rename from sub-packages/bionemo-testing/src/bionemo/testing/data/resources/single_cell.yaml rename to sub-packages/bionemo-core/src/bionemo/core/data/resources/single_cell.yaml diff --git a/sub-packages/bionemo-testing/tests/bionemo/testing/data/test_load.py b/sub-packages/bionemo-core/tests/bionemo/core/data/test_load.py similarity
index 95% rename from sub-packages/bionemo-testing/tests/bionemo/testing/data/test_load.py rename to sub-packages/bionemo-core/tests/bionemo/core/data/test_load.py index ae413d8147..452604ce3d 100644 --- a/sub-packages/bionemo-testing/tests/bionemo/testing/data/test_load.py +++ b/sub-packages/bionemo-core/tests/bionemo/core/data/test_load.py @@ -23,8 +23,8 @@ import pytest -from bionemo.testing.data.load import default_ngc_client, default_pbss_client, load -from bionemo.testing.data.resource import get_all_resources +from bionemo.core.data.load import default_ngc_client, default_pbss_client, load +from bionemo.core.data.resource import get_all_resources def test_load_raises_error_on_invalid_tag(tmp_path): @@ -98,7 +98,7 @@ def test_load_raises_with_no_ngc_url(tmp_path): load("foo/bar", source="ngc", resources=get_all_resources(tmp_path), cache_dir=tmp_path) # type: ignore -@patch("bionemo.testing.data.load._s3_download") +@patch("bionemo.core.data.load._s3_download") def test_load_with_file(mocked_s3_download, tmp_path): (tmp_path / "foo.yaml").write_text( """ @@ -115,7 +115,7 @@ def test_load_with_file(mocked_s3_download, tmp_path): assert file_path.read_text() == "test" -@patch("bionemo.testing.data.load._s3_download") +@patch("bionemo.core.data.load._s3_download") def test_load_with_gzipped_file(mocked_s3_download, tmp_path): (tmp_path / "foo.yaml").write_text( """ @@ -137,7 +137,7 @@ def write_compressed_text(_1, output_file: str, _2): assert file_path.read_text() == "test" -@patch("bionemo.testing.data.load._s3_download") +@patch("bionemo.core.data.load._s3_download") def test_load_with_gzipped_file_no_decomp(mocked_s3_download, tmp_path): (tmp_path / "foo.yaml").write_text( """ @@ -163,7 +163,7 @@ def write_compressed_text(_1, output_file: str, _2): assert f.read() == "test" -@patch("bionemo.testing.data.load._s3_download") +@patch("bionemo.core.data.load._s3_download") def test_load_with_tar_directory(mocked_s3_download, tmp_path): (tmp_path /
"foo.yaml").write_text( """ @@ -195,7 +195,7 @@ def write_compressed_dir(_1, output_file: str, _2): assert (file_path / "test_file").read_text() == "test" -@patch("bionemo.testing.data.load._s3_download") +@patch("bionemo.core.data.load._s3_download") def test_load_with_tar_directory_no_unpack(mocked_s3_download, tmp_path): (tmp_path / "foo.yaml").write_text( """ @@ -232,7 +232,7 @@ def write_tarfile_dir(_1, output_file: str, _2): assert (tmp_path / "extracted/test_file").read_text() == "test" -@patch("bionemo.testing.data.load._s3_download") +@patch("bionemo.core.data.load._s3_download") def test_load_with_targz_directory(mocked_s3_download, tmp_path): (tmp_path / "foo.yaml").write_text( """ @@ -274,7 +274,7 @@ def test_default_ngc_client(): assert clt.api_key is not None -@patch("bionemo.testing.data.load.default_ngc_client") +@patch("bionemo.core.data.load.default_ngc_client") def test_load_with_file_from_ngc_model(mocked_get_ngc_client, tmp_path): (tmp_path / "foo.yaml").write_text( """ @@ -305,7 +305,7 @@ def mocked_ngc_download(url, destination, file_patterns): mocked_ngc_client.registry.model.download_version.assert_called_once() -@patch("bionemo.testing.data.load.default_ngc_client") +@patch("bionemo.core.data.load.default_ngc_client") def test_load_with_file_from_ngc_resource(mocked_get_ngc_client, tmp_path): (tmp_path / "foo.yaml").write_text( """ diff --git a/sub-packages/bionemo-testing/tests/bionemo/testing/data/test_resource.py b/sub-packages/bionemo-core/tests/bionemo/core/data/test_resource.py similarity index 98% rename from sub-packages/bionemo-testing/tests/bionemo/testing/data/test_resource.py rename to sub-packages/bionemo-core/tests/bionemo/core/data/test_resource.py index ebd81abbb9..2b3370e5ad 100644 --- a/sub-packages/bionemo-testing/tests/bionemo/testing/data/test_resource.py +++ b/sub-packages/bionemo-core/tests/bionemo/core/data/test_resource.py @@ -20,7 +20,7 @@ import pydantic import pytest -from bionemo.testing.data.resource import
Resource, get_all_resources +from bionemo.core.data.resource import Resource, get_all_resources def test_get_all_resources_returns_valid_entries():