From 6d4c64be0f2bafbc976bfafa81ea4505a7ea0f06 Mon Sep 17 00:00:00 2001 From: Malcolm Greaves Date: Thu, 7 Nov 2024 16:28:06 -0500 Subject: [PATCH 1/2] Refactored `load` & `download_bionemo_data` into bionemo-core (#396) `bionemo-testing` re-exports the same values defined in `__all__`: they are imported from `bionemo-core` as the implementations have moved. Note that the tests have also moved. Now, all sub-packages can use `load` at runtime, not just during tests. All previous imports of `bionemo.testing.data.load` have been changed to `bionemo.core.data.load`. Additionally moves over the YAML resource files from bionemo-testing into bionemo-core and adjusts the `get_all_resources` function. This PR also fixes an error in the naming convention for `bionemo-core`'s tests. --- scripts/protein/esm2/test_esm2_infer.py | 2 +- scripts/protein/esm2/test_pydantic_train.py | 2 +- sub-packages/bionemo-core/pyproject.toml | 16 +- .../src/bionemo/core}/data/README.md | 0 .../src/bionemo/core/data/load.py | 300 ++++++++++++++++++ .../src/bionemo/core/data/resource.py | 107 +++++++ .../bionemo/core}/data/resources/esm2.yaml | 0 .../core}/data/resources/geneformer.yaml | 0 .../bionemo/core}/data/resources/scdl.yaml | 0 .../core}/data/resources/single_cell.yaml | 0 .../tests/bionemo/core}/data/test_load.py | 20 +- .../data/test_multi_epoch_dataset.py | 0 .../{pytorch => core}/data/test_permute.py | 0 .../{pytorch => core}/data/test_resamplers.py | 0 .../tests/bionemo/core}/data/test_resource.py | 2 +- .../{pytorch => core}/utils/test_dtypes.py | 0 .../tests/bionemo/esm2/model/test_model.py | 2 +- .../bionemo/esm2/model/test_stop_and_go.py | 2 +- .../scripts/geneformer_mlm_loss_eval.py | 2 +- .../geneformer/scripts/infer_geneformer.py | 2 +- .../geneformer/scripts/test_pydantic_train.py | 2 +- .../scripts/test_train_geneformer.py | 2 +- .../tests/bionemo/geneformer/test_model.py | 2 +- .../bionemo/geneformer/test_stop_and_go.py | 2 +- .../tests/bionemo/scdl/conftest.py | 2 +- 
sub-packages/bionemo-testing/pyproject.toml | 13 - .../src/bionemo/testing/data/load.py | 288 +---------------- .../src/bionemo/testing/data/resource.py | 90 +----- 28 files changed, 462 insertions(+), 396 deletions(-) rename sub-packages/{bionemo-testing/src/bionemo/testing => bionemo-core/src/bionemo/core}/data/README.md (100%) create mode 100644 sub-packages/bionemo-core/src/bionemo/core/data/load.py create mode 100644 sub-packages/bionemo-core/src/bionemo/core/data/resource.py rename sub-packages/{bionemo-testing/src/bionemo/testing => bionemo-core/src/bionemo/core}/data/resources/esm2.yaml (100%) rename sub-packages/{bionemo-testing/src/bionemo/testing => bionemo-core/src/bionemo/core}/data/resources/geneformer.yaml (100%) rename sub-packages/{bionemo-testing/src/bionemo/testing => bionemo-core/src/bionemo/core}/data/resources/scdl.yaml (100%) rename sub-packages/{bionemo-testing/src/bionemo/testing => bionemo-core/src/bionemo/core}/data/resources/single_cell.yaml (100%) rename sub-packages/{bionemo-testing/tests/bionemo/testing => bionemo-core/tests/bionemo/core}/data/test_load.py (95%) rename sub-packages/bionemo-core/tests/bionemo/{pytorch => core}/data/test_multi_epoch_dataset.py (100%) rename sub-packages/bionemo-core/tests/bionemo/{pytorch => core}/data/test_permute.py (100%) rename sub-packages/bionemo-core/tests/bionemo/{pytorch => core}/data/test_resamplers.py (100%) rename sub-packages/{bionemo-testing/tests/bionemo/testing => bionemo-core/tests/bionemo/core}/data/test_resource.py (98%) rename sub-packages/bionemo-core/tests/bionemo/{pytorch => core}/utils/test_dtypes.py (100%) diff --git a/scripts/protein/esm2/test_esm2_infer.py b/scripts/protein/esm2/test_esm2_infer.py index c5ec1f633d..4c214d2c2f 100644 --- a/scripts/protein/esm2/test_esm2_infer.py +++ b/scripts/protein/esm2/test_esm2_infer.py @@ -21,10 +21,10 @@ from esm2_infer import infer_model from torch.utils.data import DataLoader +from bionemo.core.data.load import load from 
bionemo.esm2.api import ESM2Config from bionemo.esm2.data.tokenizer import get_tokenizer from bionemo.esm2.model.finetune.datamodule import ESM2FineTuneDataModule, InMemoryCSVDataset -from bionemo.testing.data.load import load esm2_650m_checkpoint_path = load("esm2/650m:2.0") diff --git a/scripts/protein/esm2/test_pydantic_train.py b/scripts/protein/esm2/test_pydantic_train.py index 2522e538f8..a7c3ababa8 100644 --- a/scripts/protein/esm2/test_pydantic_train.py +++ b/scripts/protein/esm2/test_pydantic_train.py @@ -21,8 +21,8 @@ import pytest from lightning.fabric.plugins.environments.lightning import find_free_network_port +from bionemo.core.data.load import load from bionemo.testing.data.esm2 import create_mock_parquet_train_val_inputs, create_mock_protein_dataset -from bionemo.testing.data.load import load data_path: Path = load("single_cell/testdata-20240506") / "cellxgene_2023-12-15_small" / "processed_data" diff --git a/sub-packages/bionemo-core/pyproject.toml b/sub-packages/bionemo-core/pyproject.toml index c6e04c7270..079d9c8e58 100644 --- a/sub-packages/bionemo-core/pyproject.toml +++ b/sub-packages/bionemo-core/pyproject.toml @@ -12,14 +12,28 @@ license = { file = "LICENSE" } dynamic = ["version"] dependencies = [ # bionemo sub-packages + # bionemo-core **MUST NOT** depend on any other sub-packages !!!!! # external "numpy", "platformdirs", "torch>=2.2.1", - 'pytorch-lightning>=2.2.1', + 'boto3', 'lightning>=2.2.1', + 'ngcsdk', + 'pooch', + 'pydantic>=2.7.0', + 'pytorch-lightning>=2.2.1', + 'pyyaml', + 'tqdm', ] +[project.scripts] +download_bionemo_data = "bionemo.core.data.load:entrypoint" + +# Make sure that the resource yaml files are being packaged alongside the python files. 
+[tool.setuptools.package-data] +"bionemo.core" = ["**/*.yaml"] + [tool.setuptools.packages.find] where = ["src"] include = ["bionemo.*"] diff --git a/sub-packages/bionemo-testing/src/bionemo/testing/data/README.md b/sub-packages/bionemo-core/src/bionemo/core/data/README.md similarity index 100% rename from sub-packages/bionemo-testing/src/bionemo/testing/data/README.md rename to sub-packages/bionemo-core/src/bionemo/core/data/README.md diff --git a/sub-packages/bionemo-core/src/bionemo/core/data/load.py b/sub-packages/bionemo-core/src/bionemo/core/data/load.py new file mode 100644 index 0000000000..ca737c37a4 --- /dev/null +++ b/sub-packages/bionemo-core/src/bionemo/core/data/load.py @@ -0,0 +1,300 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-Apache2 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import argparse +import contextlib +import shutil +import sys +import tempfile +from dataclasses import dataclass +from pathlib import Path +from typing import Literal, Optional, Sequence, TextIO + +import boto3 +import ngcsdk +import pooch +from botocore.config import Config +from tqdm import tqdm + +from bionemo.core import BIONEMO_CACHE_DIR +from bionemo.core.data.resource import Resource, get_all_resources + + +__all__: Sequence[str] = ( + "load", + "default_ngc_client", + "default_pbss_client", + "NGCDownloader", +) + + +def default_pbss_client(): + """Create a default S3 client for PBSS.""" + retry_config = Config(retries={"max_attempts": 10, "mode": "standard"}) + return boto3.client("s3", endpoint_url="https://pbss.s8k.io", config=retry_config) + + +def _s3_download(url: str, output_file: str | Path, _: pooch.Pooch) -> None: + """Download a file from PBSS.""" + # Parse S3 URL to get bucket and key + parts = url.replace("s3://", "").split("/") + bucket = parts[0] + key = "/".join(parts[1:]) + + with contextlib.closing(default_pbss_client()) as s3: + object_size = s3.head_object(Bucket=bucket, Key=key)["ContentLength"] + progress_bar = tqdm(total=object_size, unit="B", unit_scale=True, desc=url) + + # Define callback + def progress_callback(bytes_transferred): + progress_bar.update(bytes_transferred) + + # Download file from S3 + s3.download_file(bucket, key, output_file, Callback=progress_callback) + + +def default_ngc_client() -> ngcsdk.Client: + """Create a default NGC client. + + This should load the NGC API key from ~/.ngc/config, or from environment variables passed to the docker container. + """ + return ngcsdk.Client() + + +@dataclass +class NGCDownloader: + """A class to download files from NGC in a Pooch-compatible way. + + NGC downloads are typically structured as directories, while pooch expects a single file. This class + downloads a single file from an NGC directory and moves it to the desired location. 
+ """ + + filename: str + ngc_registry: Literal["model", "resource"] + + def __call__(self, url: str, output_file: str | Path, _: pooch.Pooch) -> None: + """Download a file from NGC.""" + client = default_ngc_client() + + download_fns = { + "model": client.registry.model.download_version, + "resource": client.registry.resource.download_version, + } + + output_file = Path(output_file) + output_file.parent.mkdir(parents=True, exist_ok=True) + + # NGC seems to always download to a specific directory that we can't specify ourselves. + ngc_dirname = Path(url).name.replace(":", "_v") + + with tempfile.TemporaryDirectory(dir=output_file.parent) as temp_dir: + download_fns[self.ngc_registry](url, temp_dir, file_patterns=[self.filename]) + shutil.move(Path(temp_dir) / ngc_dirname / self.filename, output_file) + + +def load( + model_or_data_tag: str, + source: Literal["ngc", "pbss"] = "pbss", + resources: dict[str, Resource] | None = None, + cache_dir: Path | None = None, +) -> Path: + """Download a resource from PBSS or NGC. + + Args: + model_or_data_tag: A pointer to the desired resource. Must be a key in the resources dictionary. + source: Either "pbss" (NVIDIA-internal download) or "ngc" (NVIDIA GPU Cloud). Defaults to "pbss". + resources: A custom dictionary of resources. If None, the default resources will be used. (Mostly for testing.) + cache_dir: The directory to store downloaded files. Defaults to BIONEMO_CACHE_DIR. (Mostly for testing.) + + Raises: + ValueError: If the desired tag was not found, or if an NGC url was requested but not provided. + + Returns: + A Path object pointing either at the downloaded file, or at a decompressed folder containing the + file(s). 
+ + Examples: + For a resource specified in 'filename.yaml' with tag 'tag', the following will download the file: + >>> load("filename/tag") + PosixPath(/tmp/bionemo/downloaded-file-name) + """ + if resources is None: + resources = get_all_resources() + + if cache_dir is None: + cache_dir = BIONEMO_CACHE_DIR + + if model_or_data_tag not in resources: + raise ValueError(f"Resource '{model_or_data_tag}' not found.") + + if source == "ngc" and resources[model_or_data_tag].ngc is None: + raise ValueError(f"Resource '{model_or_data_tag}' does not have an NGC URL.") + + resource = resources[model_or_data_tag] + filename = str(resource.pbss).split("/")[-1] + + extension = "".join(Path(filename).suffixes) + processor = _get_processor(extension, resource.unpack, resource.decompress) + + if source == "pbss": + download_fn = _s3_download + url = resource.pbss + + elif source == "ngc": + assert resource.ngc_registry is not None + download_fn = NGCDownloader(filename=filename, ngc_registry=resource.ngc_registry) + url = resource.ngc + + else: + raise ValueError(f"Source '{source}' not supported.") + + download = pooch.retrieve( + url=str(url), + known_hash=resource.sha256, + path=cache_dir, + downloader=download_fn, + processor=processor, + ) + + # Pooch by default returns a list of unpacked files if they unpack a zipped or tarred directory. Instead of that, we + # just want the unpacked, parent folder. + if isinstance(download, list): + return Path(processor.extract_dir) # type: ignore + + else: + return Path(download) + + +def _get_processor(extension: str, unpack: bool | None, decompress: bool | None): + """Get the processor for a given file extension. + + If unpack and decompress are both None, the processor will be inferred from the file extension. + + Args: + extension: The file extension. + unpack: Whether to unpack the file. + decompress: Whether to decompress the file. + + Returns: + A Pooch processor object. 
+ """ + if extension in {".gz", ".bz2", ".xz"} and decompress is None: + return pooch.Decompress() + + elif extension in {".tar", ".tar.gz"} and unpack is None: + return pooch.Untar() + + elif extension == ".zip" and unpack is None: + return pooch.Unzip() + + else: + return None + + +def print_resources(*, output_source: TextIO = sys.stdout) -> None: + """Prints all available downloadable resources & their sources to STDOUT.""" + print("#resource_name\tsource_options", file=output_source) + for resource_name, resource in sorted(get_all_resources().items()): + sources = [] + if resource.ngc is not None: + sources.append("ngc") + if resource.pbss is not None: + sources.append("pbss") + print(f"{resource_name}\t{','.join(sources)}", file=output_source) + + +def entrypoint(): + """Allows a user to get a specific artifact from the command line.""" + parser = argparse.ArgumentParser( + description="Retrieve the local path to the requested artifact name or list resources." + ) + + # Create mutually exclusive group + group = parser.add_mutually_exclusive_group(required=True) + + # Add the argument for artifact name, which is required if --list-resources is not used + group.add_argument("artifact_name", type=str, nargs="?", help="Name of the artifact") + + # Add the --list-resources option + group.add_argument( + "--list-resources", action="store_true", default=False, help="List all available artifacts and then exit." + ) + + # Add the --source option + parser.add_argument( + "--source", + type=str, + choices=["pbss", "ngc"], + default="ngc", + help='Backend to use, Internal NVIDIA users can set this to "pbss".', + ) + + parser.add_argument( + "--all", + action="store_true", + default=False, + help="Download all resources. 
Ignores all other options.", + ) + args = parser.parse_args() + maybe_error = main( + download_all=args.all, + list_resources=args.list_resources, + artifact_name=args.artifact_name, + source=args.source, + ) + if maybe_error is not None: + parser.error(maybe_error) + + +if __name__ == "__main__": + entrypoint() + + +def main( + download_all: bool, list_resources: bool, artifact_name: str, source: Literal["pbss", "ngc"] +) -> Optional[str]: + """Main download script logic: parameters are 1:1 with CLI flags. Returns string describing error on failure.""" + if download_all: + print("Downloading all resources:", file=sys.stderr) + print_resources(output_source=sys.stderr) + print("-" * 80, file=sys.stderr) + + resource_to_local: dict[str, Path] = {} + for resource_name in tqdm( + sorted(get_all_resources()), + desc="Downloading Resources", + ): + with contextlib.redirect_stdout(sys.stderr): + local_path = load(resource_name, source=source) + resource_to_local[resource_name] = local_path + + print("-" * 80, file=sys.stderr) + print("All resources downloaded:", file=sys.stderr) + for resource_name, local_path in sorted(resource_to_local.items()): + print(f" {resource_name}: {str(local_path.absolute())}", file=sys.stderr) + + elif list_resources: + print_resources(output_source=sys.stdout) + + elif artifact_name is not None and len(artifact_name) > 0: + # Get the local path for the provided artifact name + with contextlib.redirect_stdout(sys.stderr): + local_path = load(artifact_name, source=source) + + # Print the result => CLI use assumes that we can get the single downloaded resource's path on STDOUT + print(str(local_path.absolute())) + + else: + return "You must provide an artifact name if --list-resources or --all is not set!" 
diff --git a/sub-packages/bionemo-core/src/bionemo/core/data/resource.py b/sub-packages/bionemo-core/src/bionemo/core/data/resource.py new file mode 100644 index 0000000000..419975854f --- /dev/null +++ b/sub-packages/bionemo-core/src/bionemo/core/data/resource.py @@ -0,0 +1,107 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-Apache2 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import functools +import itertools +from collections import Counter +from importlib.resources import files +from pathlib import Path +from typing import Annotated, Any, Literal, Sequence + +import pydantic +import yaml +from registry.api.utils import RegistryTarget + + +__all__: Sequence[str] = ( + "Resource", + "get_all_resources", +) + + +def _validate_ngc_resource(value: str) -> str: + return str(RegistryTarget(value, "Pattern should be in format [org/[team/]]name[:version]")) + + +class Resource(pydantic.BaseModel): + """Class that represents a remote resource for downloading and caching test data.""" + + model_config = pydantic.ConfigDict(use_attribute_docstrings=True) + + tag: Annotated[str, pydantic.StringConstraints(pattern=r"^[^/]*/[^/]*$")] # Only slash between filename and tag. + """A unique identifier for the resource. 
The file(s) will be accessible via load("filename/tag").""" + + ngc: Annotated[str, pydantic.AfterValidator(_validate_ngc_resource)] | None = None + """The NGC URL for the resource. + + Should be in format [org/[team/]]name[:version]. If None, the resource is not available on NGC. + """ + + ngc_registry: Literal["model", "resource"] | None = None + """The NGC resource type (model or resource) for the data. Must be provided if ngc is not None.""" + + pbss: Annotated[pydantic.AnyUrl, pydantic.UrlConstraints(allowed_schemes=["s3"])] + """The PBSS (NVIDIA-internal) URL of the resource.""" + + sha256: str | None + """The SHA256 checksum of the resource. If None, the SHA will not be checked on download (not recommended).""" + + owner: pydantic.NameEmail + """The owner or primary point of contact for the resource, in the format "Name ".""" + + description: str | None = None + """A description of the file(s).""" + + unpack: Literal[False, None] = None + """Whether the resource should be unpacked after download. If None, will defer to the file extension.""" + + decompress: Literal[False, None] = None + """Whether the resource should be decompressed after download. 
If None, will defer to the file extension.""" + + @pydantic.model_validator(mode="after") + def _validate_ngc_registry(self): + if self.ngc and not self.ngc_registry: + raise ValueError(f"ngc_registry must be provided if ngc is not None: {self.tag}") + return self + + +@functools.cache +def get_all_resources(resource_path: Path | None = None) -> dict[str, Resource]: + """Return a dictionary of all resources.""" + if not resource_path: + resource_path = Path(files("bionemo.core.data").joinpath("resources")) # type: ignore + + resources_files = itertools.chain(resource_path.glob("*.yaml"), resource_path.glob("*.yml")) + + all_resources = [resource for file in resources_files for resource in _parse_resource_file(file)] + + resource_list = pydantic.TypeAdapter(list[Resource]).validate_python(all_resources) + resource_dict = {resource.tag: resource for resource in resource_list} + + if len(resource_dict) != len(resource_list): + # Show the # of and which ones are duplicated so that a user can begin debugging and resolve the issue. 
+ tag_counts = Counter([resource.tag for resource in resource_list]) + raise ValueError(f"Duplicate resource tags found!: {[tag for tag, count in tag_counts.items() if count > 1]}") + + return resource_dict + + +def _parse_resource_file(file) -> list[dict[str, Any]]: + with file.open("r") as f: + resources = yaml.safe_load(f) + for resource in resources: + resource["tag"] = f"{file.stem}/{resource['tag']}" + return resources diff --git a/sub-packages/bionemo-testing/src/bionemo/testing/data/resources/esm2.yaml b/sub-packages/bionemo-core/src/bionemo/core/data/resources/esm2.yaml similarity index 100% rename from sub-packages/bionemo-testing/src/bionemo/testing/data/resources/esm2.yaml rename to sub-packages/bionemo-core/src/bionemo/core/data/resources/esm2.yaml diff --git a/sub-packages/bionemo-testing/src/bionemo/testing/data/resources/geneformer.yaml b/sub-packages/bionemo-core/src/bionemo/core/data/resources/geneformer.yaml similarity index 100% rename from sub-packages/bionemo-testing/src/bionemo/testing/data/resources/geneformer.yaml rename to sub-packages/bionemo-core/src/bionemo/core/data/resources/geneformer.yaml diff --git a/sub-packages/bionemo-testing/src/bionemo/testing/data/resources/scdl.yaml b/sub-packages/bionemo-core/src/bionemo/core/data/resources/scdl.yaml similarity index 100% rename from sub-packages/bionemo-testing/src/bionemo/testing/data/resources/scdl.yaml rename to sub-packages/bionemo-core/src/bionemo/core/data/resources/scdl.yaml diff --git a/sub-packages/bionemo-testing/src/bionemo/testing/data/resources/single_cell.yaml b/sub-packages/bionemo-core/src/bionemo/core/data/resources/single_cell.yaml similarity index 100% rename from sub-packages/bionemo-testing/src/bionemo/testing/data/resources/single_cell.yaml rename to sub-packages/bionemo-core/src/bionemo/core/data/resources/single_cell.yaml diff --git a/sub-packages/bionemo-testing/tests/bionemo/testing/data/test_load.py b/sub-packages/bionemo-core/tests/bionemo/core/data/test_load.py 
similarity index 95% rename from sub-packages/bionemo-testing/tests/bionemo/testing/data/test_load.py rename to sub-packages/bionemo-core/tests/bionemo/core/data/test_load.py index 452604ce3d..ae413d8147 100644 --- a/sub-packages/bionemo-testing/tests/bionemo/testing/data/test_load.py +++ b/sub-packages/bionemo-core/tests/bionemo/core/data/test_load.py @@ -23,8 +23,8 @@ import pytest -from bionemo.testing.data.load import default_ngc_client, default_pbss_client, load -from bionemo.testing.data.resource import get_all_resources +from bionemo.core.data.load import default_ngc_client, default_pbss_client, load +from bionemo.core.data.resource import get_all_resources def test_load_raises_error_on_invalid_tag(tmp_path): @@ -98,7 +98,7 @@ def test_load_raises_with_no_ngc_url(tmp_path): load("foo/bar", source="ngc", resources=get_all_resources(tmp_path), cache_dir=tmp_path) # type: ignore -@patch("bionemo.testing.data.load._s3_download") +@patch("bionemo.core.data.load._s3_download") def test_load_with_file(mocked_s3_download, tmp_path): (tmp_path / "foo.yaml").write_text( """ @@ -115,7 +115,7 @@ def test_load_with_file(mocked_s3_download, tmp_path): assert file_path.read_text() == "test" -@patch("bionemo.testing.data.load._s3_download") +@patch("bionemo.core.data.load._s3_download") def test_load_with_gzipped_file(mocked_s3_download, tmp_path): (tmp_path / "foo.yaml").write_text( """ @@ -137,7 +137,7 @@ def write_compressed_text(_1, output_file: str, _2): assert file_path.read_text() == "test" -@patch("bionemo.testing.data.load._s3_download") +@patch("bionemo.core.data.load._s3_download") def test_load_with_gzipped_file_no_decomp(mocked_s3_download, tmp_path): (tmp_path / "foo.yaml").write_text( """ @@ -163,7 +163,7 @@ def write_compressed_text(_1, output_file: str, _2): assert f.read() == "test" -@patch("bionemo.testing.data.load._s3_download") +@patch("bionemo.core.data.load._s3_download") def test_load_with_tar_directory(mocked_s3_download, tmp_path): (tmp_path / 
"foo.yaml").write_text( """ @@ -195,7 +195,7 @@ def write_compressed_dir(_1, output_file: str, _2): assert (file_path / "test_file").read_text() == "test" -@patch("bionemo.testing.data.load._s3_download") +@patch("bionemo.core.data.load._s3_download") def test_load_with_tar_directory_no_unpack(mocked_s3_download, tmp_path): (tmp_path / "foo.yaml").write_text( """ @@ -232,7 +232,7 @@ def write_tarfile_dir(_1, output_file: str, _2): assert (tmp_path / "extracted/test_file").read_text() == "test" -@patch("bionemo.testing.data.load._s3_download") +@patch("bionemo.core.data.load._s3_download") def test_load_with_targz_directory(mocked_s3_download, tmp_path): (tmp_path / "foo.yaml").write_text( """ @@ -274,7 +274,7 @@ def test_default_ngc_client(): assert clt.api_key is not None -@patch("bionemo.testing.data.load.default_ngc_client") +@patch("bionemo.core.data.load.default_ngc_client") def test_load_with_file_from_ngc_model(mocked_get_ngc_client, tmp_path): (tmp_path / "foo.yaml").write_text( """ @@ -305,7 +305,7 @@ def mocked_ngc_download(url, destination, file_patterns): mocked_ngc_client.registry.model.download_version.assert_called_once() -@patch("bionemo.testing.data.load.default_ngc_client") +@patch("bionemo.core.data.load.default_ngc_client") def test_load_with_file_from_ngc_resource(mocked_get_ngc_client, tmp_path): (tmp_path / "foo.yaml").write_text( """ diff --git a/sub-packages/bionemo-core/tests/bionemo/pytorch/data/test_multi_epoch_dataset.py b/sub-packages/bionemo-core/tests/bionemo/core/data/test_multi_epoch_dataset.py similarity index 100% rename from sub-packages/bionemo-core/tests/bionemo/pytorch/data/test_multi_epoch_dataset.py rename to sub-packages/bionemo-core/tests/bionemo/core/data/test_multi_epoch_dataset.py diff --git a/sub-packages/bionemo-core/tests/bionemo/pytorch/data/test_permute.py b/sub-packages/bionemo-core/tests/bionemo/core/data/test_permute.py similarity index 100% rename from 
sub-packages/bionemo-core/tests/bionemo/pytorch/data/test_permute.py rename to sub-packages/bionemo-core/tests/bionemo/core/data/test_permute.py diff --git a/sub-packages/bionemo-core/tests/bionemo/pytorch/data/test_resamplers.py b/sub-packages/bionemo-core/tests/bionemo/core/data/test_resamplers.py similarity index 100% rename from sub-packages/bionemo-core/tests/bionemo/pytorch/data/test_resamplers.py rename to sub-packages/bionemo-core/tests/bionemo/core/data/test_resamplers.py diff --git a/sub-packages/bionemo-testing/tests/bionemo/testing/data/test_resource.py b/sub-packages/bionemo-core/tests/bionemo/core/data/test_resource.py similarity index 98% rename from sub-packages/bionemo-testing/tests/bionemo/testing/data/test_resource.py rename to sub-packages/bionemo-core/tests/bionemo/core/data/test_resource.py index 2b3370e5ad..ebd81abbb9 100644 --- a/sub-packages/bionemo-testing/tests/bionemo/testing/data/test_resource.py +++ b/sub-packages/bionemo-core/tests/bionemo/core/data/test_resource.py @@ -20,7 +20,7 @@ import pydantic import pytest -from bionemo.testing.data.resource import Resource, get_all_resources +from bionemo.core.data.resource import Resource, get_all_resources def test_get_all_resources_returns_valid_entries(): diff --git a/sub-packages/bionemo-core/tests/bionemo/pytorch/utils/test_dtypes.py b/sub-packages/bionemo-core/tests/bionemo/core/utils/test_dtypes.py similarity index 100% rename from sub-packages/bionemo-core/tests/bionemo/pytorch/utils/test_dtypes.py rename to sub-packages/bionemo-core/tests/bionemo/core/utils/test_dtypes.py diff --git a/sub-packages/bionemo-esm2/tests/bionemo/esm2/model/test_model.py b/sub-packages/bionemo-esm2/tests/bionemo/esm2/model/test_model.py index 4b091c9d77..0438483901 100644 --- a/sub-packages/bionemo-esm2/tests/bionemo/esm2/model/test_model.py +++ b/sub-packages/bionemo-esm2/tests/bionemo/esm2/model/test_model.py @@ -27,6 +27,7 @@ from torch import Tensor from transformers import EsmForMaskedLM +from 
bionemo.core.data.load import load from bionemo.core.utils.dtypes import get_autocast_dtype from bionemo.core.utils.random_utils import random_numpy_context from bionemo.esm2.api import ESM2Config, ESM2Model @@ -36,7 +37,6 @@ from bionemo.llm.model.biobert.model import MegatronBioBertModel from bionemo.llm.utils.weight_utils import nemo1_to_nemo2_biobert_key_mapping from bionemo.testing import megatron_parallel_state_utils -from bionemo.testing.data.load import load nemo1_checkpoint_path: Path = load("esm2/nv_650m:1.0") diff --git a/sub-packages/bionemo-esm2/tests/bionemo/esm2/model/test_stop_and_go.py b/sub-packages/bionemo-esm2/tests/bionemo/esm2/model/test_stop_and_go.py index 18be7eccf3..8be351eb5b 100644 --- a/sub-packages/bionemo-esm2/tests/bionemo/esm2/model/test_stop_and_go.py +++ b/sub-packages/bionemo-esm2/tests/bionemo/esm2/model/test_stop_and_go.py @@ -23,6 +23,7 @@ from nemo.lightning.pytorch.optim import MegatronOptimizerModule from typing_extensions import override +from bionemo.core.data.load import load from bionemo.core.utils.dtypes import get_autocast_dtype from bionemo.esm2.api import ESM2Config from bionemo.esm2.data.datamodule import ESMDataModule @@ -30,7 +31,6 @@ from bionemo.esm2.data.tokenizer import BioNeMoESMTokenizer, get_tokenizer from bionemo.llm.model.biobert.lightning import biobert_lightning_module from bionemo.llm.model.lr_scheduler import WarmupAnnealDecayHoldScheduler -from bionemo.testing.data.load import load from bionemo.testing.harnesses import stop_and_go from bionemo.testing.harnesses.mode import Mode diff --git a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/scripts/geneformer_mlm_loss_eval.py b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/scripts/geneformer_mlm_loss_eval.py index a79ae52269..c8965d092d 100644 --- a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/scripts/geneformer_mlm_loss_eval.py +++ b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/scripts/geneformer_mlm_loss_eval.py @@ 
-42,6 +42,7 @@ from tqdm import trange from transformers import AutoModelForMaskedLM +from bionemo.core.data.load import load from bionemo.core.data.multi_epoch_dataset import EpochIndex from bionemo.core.utils.dtypes import get_autocast_dtype from bionemo.geneformer.api import GeneformerConfig @@ -51,7 +52,6 @@ from bionemo.llm.data import collate from bionemo.llm.model.biobert.model import BioBertConfig from bionemo.testing import megatron_parallel_state_utils -from bionemo.testing.data.load import load class GeneformerHFAdapter(torch.nn.Module): diff --git a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/scripts/infer_geneformer.py b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/scripts/infer_geneformer.py index 1215f864b7..3a72d11e9e 100644 --- a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/scripts/infer_geneformer.py +++ b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/scripts/infer_geneformer.py @@ -20,6 +20,7 @@ from nemo import lightning as nl from nemo.utils import logging +from bionemo.core.data.load import load from bionemo.core.utils.dtypes import PrecisionTypes, get_autocast_dtype from bionemo.geneformer.api import FineTuneSeqLenBioBertConfig, GeneformerConfig from bionemo.geneformer.data.singlecell.datamodule import SingleCellDataModule @@ -28,7 +29,6 @@ from bionemo.llm.model.biobert.lightning import biobert_lightning_module from bionemo.llm.model.biobert.model import BioBertConfig from bionemo.llm.utils.datamodule_utils import infer_global_batch_size -from bionemo.testing.data.load import load def infer_model( diff --git a/sub-packages/bionemo-geneformer/tests/bionemo/geneformer/scripts/test_pydantic_train.py b/sub-packages/bionemo-geneformer/tests/bionemo/geneformer/scripts/test_pydantic_train.py index 7eeb47a613..f76682fbc0 100644 --- a/sub-packages/bionemo-geneformer/tests/bionemo/geneformer/scripts/test_pydantic_train.py +++ 
b/sub-packages/bionemo-geneformer/tests/bionemo/geneformer/scripts/test_pydantic_train.py @@ -20,7 +20,7 @@ from lightning.fabric.plugins.environments.lightning import find_free_network_port -from bionemo.testing.data.load import load +from bionemo.core.data.load import load data_path: Path = load("single_cell/testdata-20240506") / "cellxgene_2023-12-15_small" / "processed_data" diff --git a/sub-packages/bionemo-geneformer/tests/bionemo/geneformer/scripts/test_train_geneformer.py b/sub-packages/bionemo-geneformer/tests/bionemo/geneformer/scripts/test_train_geneformer.py index 14ba0b03c0..5799ec7611 100644 --- a/sub-packages/bionemo-geneformer/tests/bionemo/geneformer/scripts/test_train_geneformer.py +++ b/sub-packages/bionemo-geneformer/tests/bionemo/geneformer/scripts/test_train_geneformer.py @@ -22,11 +22,11 @@ import pytest from lightning.fabric.plugins.environments.lightning import find_free_network_port +from bionemo.core.data.load import load from bionemo.geneformer.scripts.train_geneformer import get_parser, main from bionemo.llm.model.biobert.transformer_specs import BiobertSpecOption from bionemo.llm.utils.datamodule_utils import parse_kwargs_to_arglist from bionemo.testing import megatron_parallel_state_utils -from bionemo.testing.data.load import load data_path: Path = load("single_cell/testdata-20240506") / "cellxgene_2023-12-15_small" / "processed_data" diff --git a/sub-packages/bionemo-geneformer/tests/bionemo/geneformer/test_model.py b/sub-packages/bionemo-geneformer/tests/bionemo/geneformer/test_model.py index a561cb5b87..35621c1f13 100644 --- a/sub-packages/bionemo-geneformer/tests/bionemo/geneformer/test_model.py +++ b/sub-packages/bionemo-geneformer/tests/bionemo/geneformer/test_model.py @@ -38,6 +38,7 @@ from torch.nn import functional as F from tqdm import tqdm +from bionemo.core.data.load import load from bionemo.core.utils.batching_utils import pad_token_ids from bionemo.core.utils.dtypes import get_autocast_dtype from 
bionemo.core.utils.random_utils import random_numpy_context @@ -52,7 +53,6 @@ from bionemo.llm.utils.weight_utils import nemo1_to_nemo2_biobert_key_mapping from bionemo.testing import megatron_parallel_state_utils from bionemo.testing.callbacks import MetricTracker -from bionemo.testing.data.load import load from bionemo.testing.utils import ( assert_matrix_correlation_above_value, assert_matrix_mape_below_value, diff --git a/sub-packages/bionemo-geneformer/tests/bionemo/geneformer/test_stop_and_go.py b/sub-packages/bionemo-geneformer/tests/bionemo/geneformer/test_stop_and_go.py index 5d55514073..3820844131 100644 --- a/sub-packages/bionemo-geneformer/tests/bionemo/geneformer/test_stop_and_go.py +++ b/sub-packages/bionemo-geneformer/tests/bionemo/geneformer/test_stop_and_go.py @@ -36,12 +36,12 @@ from nemo.lightning.pytorch.optim.megatron import MegatronOptimizerModule from typing_extensions import override +from bionemo.core.data.load import load from bionemo.core.utils.dtypes import get_autocast_dtype from bionemo.geneformer.api import GeneformerConfig from bionemo.geneformer.data.singlecell.preprocess import GeneformerPreprocess from bionemo.llm.model.biobert.lightning import biobert_lightning_module from bionemo.testing import testing_callbacks -from bionemo.testing.data.load import load from bionemo.testing.harnesses import stop_and_go from bionemo.testing.harnesses.mode import Mode diff --git a/sub-packages/bionemo-scdl/tests/bionemo/scdl/conftest.py b/sub-packages/bionemo-scdl/tests/bionemo/scdl/conftest.py index a13477c81b..0c128e76e2 100644 --- a/sub-packages/bionemo-scdl/tests/bionemo/scdl/conftest.py +++ b/sub-packages/bionemo-scdl/tests/bionemo/scdl/conftest.py @@ -19,7 +19,7 @@ import pytest -from bionemo.testing.data.load import load +from bionemo.core.data.load import load @pytest.fixture diff --git a/sub-packages/bionemo-testing/pyproject.toml b/sub-packages/bionemo-testing/pyproject.toml index ccac13b4ad..9c503a842e 100644 --- 
a/sub-packages/bionemo-testing/pyproject.toml +++ b/sub-packages/bionemo-testing/pyproject.toml @@ -15,23 +15,10 @@ dependencies = [ 'bionemo-core', 'bionemo-llm', # external - 'boto3', 'email-validator', - 'ngcsdk', - 'pooch', - 'pydantic>=2.7.0', 'pytest', - 'pyyaml', - 'tqdm', ] -[project.scripts] -download_bionemo_data = "bionemo.testing.data.load:entrypoint" - -# Make sure that the resource yaml files are being packaged alongside the python files. -[tool.setuptools.package-data] -"bionemo.testing" = ["**/*.yaml"] - [tool.setuptools.packages.find] where = ["src"] include = ["bionemo.*"] diff --git a/sub-packages/bionemo-testing/src/bionemo/testing/data/load.py b/sub-packages/bionemo-testing/src/bionemo/testing/data/load.py index 58978abec6..cc5ec123cb 100644 --- a/sub-packages/bionemo-testing/src/bionemo/testing/data/load.py +++ b/sub-packages/bionemo-testing/src/bionemo/testing/data/load.py @@ -12,284 +12,20 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from typing import Sequence -import argparse -import contextlib -import shutil -import sys -import tempfile -from dataclasses import dataclass -from pathlib import Path -from typing import Literal, Optional, Sequence, TextIO +from bionemo.core.data.load import default_ngc_client, default_pbss_client, entrypoint, load -import boto3 -import ngcsdk -import pooch -from botocore.config import Config -from tqdm import tqdm -from bionemo.core import BIONEMO_CACHE_DIR -from bionemo.testing.data.resource import Resource, get_all_resources +_ = entrypoint +# This needs to be around so that ruff doesn't automatically remove it as it's unused. +# We don't want to include it in __all__. +# But older installations __may__ be using the old CLI path (bionemo.core.data.load:entrypoint) +# so this is here for backwards compatability. 
-__all__: Sequence[str] = ("load",) - - -def default_pbss_client(): - """Create a default S3 client for PBSS.""" - retry_config = Config(retries={"max_attempts": 10, "mode": "standard"}) - return boto3.client("s3", endpoint_url="https://pbss.s8k.io", config=retry_config) - - -def _s3_download(url: str, output_file: str | Path, _: pooch.Pooch) -> None: - """Download a file from PBSS.""" - # Parse S3 URL to get bucket and key - parts = url.replace("s3://", "").split("/") - bucket = parts[0] - key = "/".join(parts[1:]) - - with contextlib.closing(default_pbss_client()) as s3: - object_size = s3.head_object(Bucket=bucket, Key=key)["ContentLength"] - progress_bar = tqdm(total=object_size, unit="B", unit_scale=True, desc=url) - - # Define callback - def progress_callback(bytes_transferred): - progress_bar.update(bytes_transferred) - - # Download file from S3 - s3.download_file(bucket, key, output_file, Callback=progress_callback) - - -def default_ngc_client() -> ngcsdk.Client: - """Create a default NGC client. - - This should load the NGC API key from ~/.ngc/config, or from environment variables passed to the docker container. - """ - return ngcsdk.Client() - - -@dataclass -class NGCDownloader: - """A class to download files from NGC in a Pooch-compatible way. - - NGC downloads are typically structured as directories, while pooch expects a single file. This class - downloads a single file from an NGC directory and moves it to the desired location. 
- """ - - filename: str - ngc_registry: Literal["model", "resource"] - - def __call__(self, url: str, output_file: str | Path, _: pooch.Pooch) -> None: - """Download a file from NGC.""" - client = default_ngc_client() - - download_fns = { - "model": client.registry.model.download_version, - "resource": client.registry.resource.download_version, - } - - output_file = Path(output_file) - output_file.parent.mkdir(parents=True, exist_ok=True) - - # NGC seems to always download to a specific directory that we can't specify ourselves. - ngc_dirname = Path(url).name.replace(":", "_v") - - with tempfile.TemporaryDirectory(dir=output_file.parent) as temp_dir: - download_fns[self.ngc_registry](url, temp_dir, file_patterns=[self.filename]) - shutil.move(Path(temp_dir) / ngc_dirname / self.filename, output_file) - - -def load( - model_or_data_tag: str, - source: Literal["ngc", "pbss"] = "pbss", - resources: dict[str, Resource] | None = None, - cache_dir: Path | None = None, -) -> Path: - """Download a resource from PBSS or NGC. - - Args: - model_or_data_tag: A pointer to the desired resource. Must be a key in the resources dictionary. - source: Either "pbss" (NVIDIA-internal download) or "ngc" (NVIDIA GPU Cloud). Defaults to "pbss". - resources: A custom dictionary of resources. If None, the default resources will be used. (Mostly for testing.) - cache_dir: The directory to store downloaded files. Defaults to BIONEMO_CACHE_DIR. (Mostly for testing.) - - Raises: - ValueError: If the desired tag was not found, or if an NGC url was requested but not provided. - - Returns: - A Path object pointing either at the downloaded file, or at a decompressed folder containing the - file(s). 
- - Examples: - For a resource specified in 'filename.yaml' with tag 'tag', the following will download the file: - >>> load("filename/tag") - PosixPath(/tmp/bionemo/downloaded-file-name) - """ - if resources is None: - resources = get_all_resources() - - if cache_dir is None: - cache_dir = BIONEMO_CACHE_DIR - - if model_or_data_tag not in resources: - raise ValueError(f"Resource '{model_or_data_tag}' not found.") - - if source == "ngc" and resources[model_or_data_tag].ngc is None: - raise ValueError(f"Resource '{model_or_data_tag}' does not have an NGC URL.") - - resource = resources[model_or_data_tag] - filename = str(resource.pbss).split("/")[-1] - - extension = "".join(Path(filename).suffixes) - processor = _get_processor(extension, resource.unpack, resource.decompress) - - if source == "pbss": - download_fn = _s3_download - url = resource.pbss - - elif source == "ngc": - assert resource.ngc_registry is not None - download_fn = NGCDownloader(filename=filename, ngc_registry=resource.ngc_registry) - url = resource.ngc - - else: - raise ValueError(f"Source '{source}' not supported.") - - download = pooch.retrieve( - url=str(url), - known_hash=resource.sha256, - path=cache_dir, - downloader=download_fn, - processor=processor, - ) - - # Pooch by default returns a list of unpacked files if they unpack a zipped or tarred directory. Instead of that, we - # just want the unpacked, parent folder. - if isinstance(download, list): - return Path(processor.extract_dir) # type: ignore - - else: - return Path(download) - - -def _get_processor(extension: str, unpack: bool | None, decompress: bool | None): - """Get the processor for a given file extension. - - If unpack and decompress are both None, the processor will be inferred from the file extension. - - Args: - extension: The file extension. - unpack: Whether to unpack the file. - decompress: Whether to decompress the file. - - Returns: - A Pooch processor object. 
- """ - if extension in {".gz", ".bz2", ".xz"} and decompress is None: - return pooch.Decompress() - - elif extension in {".tar", ".tar.gz"} and unpack is None: - return pooch.Untar() - - elif extension == ".zip" and unpack is None: - return pooch.Unzip() - - else: - return None - - -def print_resources(*, output_source: TextIO = sys.stdout) -> None: - """Prints all available downloadable resources & their sources to STDOUT.""" - print("#resource_name\tsource_options", file=output_source) - for resource_name, resource in sorted(get_all_resources().items()): - sources = [] - if resource.ngc is not None: - sources.append("ngc") - if resource.pbss is not None: - sources.append("pbss") - print(f"{resource_name}\t{','.join(sources)}", file=output_source) - - -def entrypoint(): - """Allows a user to get a specific artifact from the command line.""" - parser = argparse.ArgumentParser( - description="Retrieve the local path to the requested artifact name or list resources." - ) - - # Create mutually exclusive group - group = parser.add_mutually_exclusive_group(required=True) - - # Add the argument for artifact name, which is required if --list-resources is not used - group.add_argument("artifact_name", type=str, nargs="?", help="Name of the artifact") - - # Add the --list-resources option - group.add_argument( - "--list-resources", action="store_true", default=False, help="List all available artifacts and then exit." - ) - - # Add the --source option - parser.add_argument( - "--source", - type=str, - choices=["pbss", "ngc"], - default="ngc", - help='Backend to use, Internal NVIDIA users can set this to "pbss".', - ) - - parser.add_argument( - "--all", - action="store_true", - default=False, - help="Download all resources. 
Ignores all other options.", - ) - args = parser.parse_args() - maybe_error = main( - download_all=args.all, - list_resources=args.list_resources, - artifact_name=args.artifact_name, - source=args.source, - ) - if maybe_error is not None: - parser.error(maybe_error) - - -if __name__ == "__main__": - entrypoint() - - -def main( - download_all: bool, list_resources: bool, artifact_name: str, source: Literal["pbss", "ngc"] -) -> Optional[str]: - """Main download script logic: parameters are 1:1 with CLI flags. Returns string describing error on failure.""" - if download_all: - print("Downloading all resources:", file=sys.stderr) - print_resources(output_source=sys.stderr) - print("-" * 80, file=sys.stderr) - - resource_to_local: dict[str, Path] = {} - for resource_name in tqdm( - sorted(get_all_resources()), - desc="Downloading Resources", - ): - with contextlib.redirect_stdout(sys.stderr): - local_path = load(resource_name, source=source) - resource_to_local[resource_name] = local_path - - print("-" * 80, file=sys.stderr) - print("All resources downloaded:", file=sys.stderr) - for resource_name, local_path in sorted(resource_to_local.items()): - print(f" {resource_name}: {str(local_path.absolute())}", file=sys.stderr) - - elif list_resources: - print_resources(output_source=sys.stdout) - - elif artifact_name is not None and len(artifact_name) > 0: - # Get the local path for the provided artifact name - with contextlib.redirect_stdout(sys.stderr): - local_path = load(artifact_name, source=source) - - # Print the result => CLI use assumes that we can get the single downloaded resource's path on STDOUT - print(str(local_path.absolute())) - - else: - return "You must provide an artifact name if --list-resources or --all is not set!" 
+__all__: Sequence[str] = ( + "load", + "default_ngc_client", + "default_pbss_client", +) diff --git a/sub-packages/bionemo-testing/src/bionemo/testing/data/resource.py b/sub-packages/bionemo-testing/src/bionemo/testing/data/resource.py index 065af432dc..677f6f49e0 100644 --- a/sub-packages/bionemo-testing/src/bionemo/testing/data/resource.py +++ b/sub-packages/bionemo-testing/src/bionemo/testing/data/resource.py @@ -12,90 +12,12 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from typing import Sequence +from bionemo.core.data.resource import Resource, get_all_resources -import functools -import itertools -from collections import Counter -from importlib.resources import files -from pathlib import Path -from typing import Annotated, Literal -import pydantic -import yaml -from registry.api.utils import RegistryTarget - - -def _validate_ngc_resource(value: str) -> str: - return str(RegistryTarget(value, "Pattern should be in format [org/[team/]]name[:version]")) - - -class Resource(pydantic.BaseModel): - """Class that represents a remote resource for downloading and caching test data.""" - - model_config = pydantic.ConfigDict(use_attribute_docstrings=True) - - tag: Annotated[str, pydantic.StringConstraints(pattern=r"^[^/]*/[^/]*$")] # Only slash between filename and tag. - """A unique identifier for the resource. The file(s) will be accessible via load("filename/tag").""" - - ngc: Annotated[str, pydantic.AfterValidator(_validate_ngc_resource)] | None = None - """The NGC URL for the resource. - - Should be in format [org/[team/]]name[:version]. If None, the resource is not available on NGC. - """ - - ngc_registry: Literal["model", "resource"] | None = None - """The NGC resource type (model or resource) for the data. 
Must be provided if ngc is not None.""" - - pbss: Annotated[pydantic.AnyUrl, pydantic.UrlConstraints(allowed_schemes=["s3"])] - """The PBSS (NVIDIA-internal) URL of the resource.""" - - sha256: str | None - """The SHA256 checksum of the resource. If None, the SHA will not be checked on download (not recommended).""" - - owner: pydantic.NameEmail - """The owner or primary point of contact for the resource, in the format "Name ".""" - - description: str | None = None - """A description of the file(s).""" - - unpack: Literal[False, None] = None - """Whether the resource should be unpacked after download. If None, will defer to the file extension.""" - - decompress: Literal[False, None] = None - """Whether the resource should be decompressed after download. If None, will defer to the file extension.""" - - @pydantic.model_validator(mode="after") - def _validate_ngc_registry(self): - if self.ngc and not self.ngc_registry: - raise ValueError(f"ngc_registry must be provided if ngc is not None: {self.tag}") - return self - - -@functools.cache -def get_all_resources(resource_path: Path | None = None) -> dict[str, Resource]: - """Return a dictionary of all resources.""" - if not resource_path: - resource_path = Path(files("bionemo.testing.data").joinpath("resources")) # type: ignore - - resources_files = itertools.chain(resource_path.glob("*.yaml"), resource_path.glob("*.yml")) - - all_resources = [resource for file in resources_files for resource in _parse_resource_file(file)] - - resource_list = pydantic.TypeAdapter(list[Resource]).validate_python(all_resources) - resource_dict = {resource.tag: resource for resource in resource_list} - - if len(resource_dict) != len(resource_list): - # Show the # of and which ones are duplicated so that a user can begin debugging and resolve the issue. 
- tag_counts = Counter([resource.tag for resource in resource_list]) - raise ValueError(f"Duplicate resource tags found!: {[tag for tag, count in tag_counts.items() if count > 1]}") - - return resource_dict - - -def _parse_resource_file(file) -> list: - with file.open("r") as f: - resources = yaml.safe_load(f) - for resource in resources: - resource["tag"] = f"{file.stem}/{resource['tag']}" - return resources +__all__: Sequence[str] = ( + "Resource", + "get_all_resources", +) From 0c923edff5d93b16d742f02b9b2f870d688a72f0 Mon Sep 17 00:00:00 2001 From: Malcolm Greaves Date: Thu, 7 Nov 2024 16:28:46 -0500 Subject: [PATCH 2/2] Revert "Refactored `load` & `download_bionemo_data` into bionemo-core" (#411) Reverts NVIDIA/bionemo-framework#396 --- scripts/protein/esm2/test_esm2_infer.py | 2 +- scripts/protein/esm2/test_pydantic_train.py | 2 +- sub-packages/bionemo-core/pyproject.toml | 16 +- .../src/bionemo/core/data/load.py | 300 ------------------ .../src/bionemo/core/data/resource.py | 107 ------- .../data/test_multi_epoch_dataset.py | 0 .../{core => pytorch}/data/test_permute.py | 0 .../{core => pytorch}/data/test_resamplers.py | 0 .../{core => pytorch}/utils/test_dtypes.py | 0 .../tests/bionemo/esm2/model/test_model.py | 2 +- .../bionemo/esm2/model/test_stop_and_go.py | 2 +- .../scripts/geneformer_mlm_loss_eval.py | 2 +- .../geneformer/scripts/infer_geneformer.py | 2 +- .../geneformer/scripts/test_pydantic_train.py | 2 +- .../scripts/test_train_geneformer.py | 2 +- .../tests/bionemo/geneformer/test_model.py | 2 +- .../bionemo/geneformer/test_stop_and_go.py | 2 +- .../tests/bionemo/scdl/conftest.py | 2 +- sub-packages/bionemo-testing/pyproject.toml | 13 + .../src/bionemo/testing}/data/README.md | 0 .../src/bionemo/testing/data/load.py | 288 ++++++++++++++++- .../src/bionemo/testing/data/resource.py | 90 +++++- .../bionemo/testing}/data/resources/esm2.yaml | 0 .../testing}/data/resources/geneformer.yaml | 0 .../bionemo/testing}/data/resources/scdl.yaml | 0 
.../testing}/data/resources/single_cell.yaml | 0 .../tests/bionemo/testing}/data/test_load.py | 20 +- .../bionemo/testing}/data/test_resource.py | 2 +- 28 files changed, 396 insertions(+), 462 deletions(-) delete mode 100644 sub-packages/bionemo-core/src/bionemo/core/data/load.py delete mode 100644 sub-packages/bionemo-core/src/bionemo/core/data/resource.py rename sub-packages/bionemo-core/tests/bionemo/{core => pytorch}/data/test_multi_epoch_dataset.py (100%) rename sub-packages/bionemo-core/tests/bionemo/{core => pytorch}/data/test_permute.py (100%) rename sub-packages/bionemo-core/tests/bionemo/{core => pytorch}/data/test_resamplers.py (100%) rename sub-packages/bionemo-core/tests/bionemo/{core => pytorch}/utils/test_dtypes.py (100%) rename sub-packages/{bionemo-core/src/bionemo/core => bionemo-testing/src/bionemo/testing}/data/README.md (100%) rename sub-packages/{bionemo-core/src/bionemo/core => bionemo-testing/src/bionemo/testing}/data/resources/esm2.yaml (100%) rename sub-packages/{bionemo-core/src/bionemo/core => bionemo-testing/src/bionemo/testing}/data/resources/geneformer.yaml (100%) rename sub-packages/{bionemo-core/src/bionemo/core => bionemo-testing/src/bionemo/testing}/data/resources/scdl.yaml (100%) rename sub-packages/{bionemo-core/src/bionemo/core => bionemo-testing/src/bionemo/testing}/data/resources/single_cell.yaml (100%) rename sub-packages/{bionemo-core/tests/bionemo/core => bionemo-testing/tests/bionemo/testing}/data/test_load.py (95%) rename sub-packages/{bionemo-core/tests/bionemo/core => bionemo-testing/tests/bionemo/testing}/data/test_resource.py (98%) diff --git a/scripts/protein/esm2/test_esm2_infer.py b/scripts/protein/esm2/test_esm2_infer.py index 4c214d2c2f..c5ec1f633d 100644 --- a/scripts/protein/esm2/test_esm2_infer.py +++ b/scripts/protein/esm2/test_esm2_infer.py @@ -21,10 +21,10 @@ from esm2_infer import infer_model from torch.utils.data import DataLoader -from bionemo.core.data.load import load from bionemo.esm2.api import 
ESM2Config from bionemo.esm2.data.tokenizer import get_tokenizer from bionemo.esm2.model.finetune.datamodule import ESM2FineTuneDataModule, InMemoryCSVDataset +from bionemo.testing.data.load import load esm2_650m_checkpoint_path = load("esm2/650m:2.0") diff --git a/scripts/protein/esm2/test_pydantic_train.py b/scripts/protein/esm2/test_pydantic_train.py index a7c3ababa8..2522e538f8 100644 --- a/scripts/protein/esm2/test_pydantic_train.py +++ b/scripts/protein/esm2/test_pydantic_train.py @@ -21,8 +21,8 @@ import pytest from lightning.fabric.plugins.environments.lightning import find_free_network_port -from bionemo.core.data.load import load from bionemo.testing.data.esm2 import create_mock_parquet_train_val_inputs, create_mock_protein_dataset +from bionemo.testing.data.load import load data_path: Path = load("single_cell/testdata-20240506") / "cellxgene_2023-12-15_small" / "processed_data" diff --git a/sub-packages/bionemo-core/pyproject.toml b/sub-packages/bionemo-core/pyproject.toml index 079d9c8e58..c6e04c7270 100644 --- a/sub-packages/bionemo-core/pyproject.toml +++ b/sub-packages/bionemo-core/pyproject.toml @@ -12,28 +12,14 @@ license = { file = "LICENSE" } dynamic = ["version"] dependencies = [ # bionemo sub-packages - # bionemo-core **MUST NOT** depend on any other sub-packages !!!!! # external "numpy", "platformdirs", "torch>=2.2.1", - 'boto3', - 'lightning>=2.2.1', - 'ngcsdk', - 'pooch', - 'pydantic>=2.7.0', 'pytorch-lightning>=2.2.1', - 'pyyaml', - 'tqdm', + 'lightning>=2.2.1', ] -[project.scripts] -download_bionemo_data = "bionemo.core.data.load:entrypoint" - -# Make sure that the resource yaml files are being packaged alongside the python files. 
-[tool.setuptools.package-data] -"bionemo.core" = ["**/*.yaml"] - [tool.setuptools.packages.find] where = ["src"] include = ["bionemo.*"] diff --git a/sub-packages/bionemo-core/src/bionemo/core/data/load.py b/sub-packages/bionemo-core/src/bionemo/core/data/load.py deleted file mode 100644 index ca737c37a4..0000000000 --- a/sub-packages/bionemo-core/src/bionemo/core/data/load.py +++ /dev/null @@ -1,300 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: LicenseRef-Apache2 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import argparse -import contextlib -import shutil -import sys -import tempfile -from dataclasses import dataclass -from pathlib import Path -from typing import Literal, Optional, Sequence, TextIO - -import boto3 -import ngcsdk -import pooch -from botocore.config import Config -from tqdm import tqdm - -from bionemo.core import BIONEMO_CACHE_DIR -from bionemo.core.data.resource import Resource, get_all_resources - - -__all__: Sequence[str] = ( - "load", - "default_ngc_client", - "default_pbss_client", - "NGCDownloader", -) - - -def default_pbss_client(): - """Create a default S3 client for PBSS.""" - retry_config = Config(retries={"max_attempts": 10, "mode": "standard"}) - return boto3.client("s3", endpoint_url="https://pbss.s8k.io", config=retry_config) - - -def _s3_download(url: str, output_file: str | Path, _: pooch.Pooch) -> None: - """Download a file from PBSS.""" - # Parse S3 URL to get bucket and key - parts = url.replace("s3://", "").split("/") - bucket = parts[0] - key = "/".join(parts[1:]) - - with contextlib.closing(default_pbss_client()) as s3: - object_size = s3.head_object(Bucket=bucket, Key=key)["ContentLength"] - progress_bar = tqdm(total=object_size, unit="B", unit_scale=True, desc=url) - - # Define callback - def progress_callback(bytes_transferred): - progress_bar.update(bytes_transferred) - - # Download file from S3 - s3.download_file(bucket, key, output_file, Callback=progress_callback) - - -def default_ngc_client() -> ngcsdk.Client: - """Create a default NGC client. - - This should load the NGC API key from ~/.ngc/config, or from environment variables passed to the docker container. - """ - return ngcsdk.Client() - - -@dataclass -class NGCDownloader: - """A class to download files from NGC in a Pooch-compatible way. - - NGC downloads are typically structured as directories, while pooch expects a single file. This class - downloads a single file from an NGC directory and moves it to the desired location. 
- """ - - filename: str - ngc_registry: Literal["model", "resource"] - - def __call__(self, url: str, output_file: str | Path, _: pooch.Pooch) -> None: - """Download a file from NGC.""" - client = default_ngc_client() - - download_fns = { - "model": client.registry.model.download_version, - "resource": client.registry.resource.download_version, - } - - output_file = Path(output_file) - output_file.parent.mkdir(parents=True, exist_ok=True) - - # NGC seems to always download to a specific directory that we can't specify ourselves. - ngc_dirname = Path(url).name.replace(":", "_v") - - with tempfile.TemporaryDirectory(dir=output_file.parent) as temp_dir: - download_fns[self.ngc_registry](url, temp_dir, file_patterns=[self.filename]) - shutil.move(Path(temp_dir) / ngc_dirname / self.filename, output_file) - - -def load( - model_or_data_tag: str, - source: Literal["ngc", "pbss"] = "pbss", - resources: dict[str, Resource] | None = None, - cache_dir: Path | None = None, -) -> Path: - """Download a resource from PBSS or NGC. - - Args: - model_or_data_tag: A pointer to the desired resource. Must be a key in the resources dictionary. - source: Either "pbss" (NVIDIA-internal download) or "ngc" (NVIDIA GPU Cloud). Defaults to "pbss". - resources: A custom dictionary of resources. If None, the default resources will be used. (Mostly for testing.) - cache_dir: The directory to store downloaded files. Defaults to BIONEMO_CACHE_DIR. (Mostly for testing.) - - Raises: - ValueError: If the desired tag was not found, or if an NGC url was requested but not provided. - - Returns: - A Path object pointing either at the downloaded file, or at a decompressed folder containing the - file(s). 
- - Examples: - For a resource specified in 'filename.yaml' with tag 'tag', the following will download the file: - >>> load("filename/tag") - PosixPath(/tmp/bionemo/downloaded-file-name) - """ - if resources is None: - resources = get_all_resources() - - if cache_dir is None: - cache_dir = BIONEMO_CACHE_DIR - - if model_or_data_tag not in resources: - raise ValueError(f"Resource '{model_or_data_tag}' not found.") - - if source == "ngc" and resources[model_or_data_tag].ngc is None: - raise ValueError(f"Resource '{model_or_data_tag}' does not have an NGC URL.") - - resource = resources[model_or_data_tag] - filename = str(resource.pbss).split("/")[-1] - - extension = "".join(Path(filename).suffixes) - processor = _get_processor(extension, resource.unpack, resource.decompress) - - if source == "pbss": - download_fn = _s3_download - url = resource.pbss - - elif source == "ngc": - assert resource.ngc_registry is not None - download_fn = NGCDownloader(filename=filename, ngc_registry=resource.ngc_registry) - url = resource.ngc - - else: - raise ValueError(f"Source '{source}' not supported.") - - download = pooch.retrieve( - url=str(url), - known_hash=resource.sha256, - path=cache_dir, - downloader=download_fn, - processor=processor, - ) - - # Pooch by default returns a list of unpacked files if they unpack a zipped or tarred directory. Instead of that, we - # just want the unpacked, parent folder. - if isinstance(download, list): - return Path(processor.extract_dir) # type: ignore - - else: - return Path(download) - - -def _get_processor(extension: str, unpack: bool | None, decompress: bool | None): - """Get the processor for a given file extension. - - If unpack and decompress are both None, the processor will be inferred from the file extension. - - Args: - extension: The file extension. - unpack: Whether to unpack the file. - decompress: Whether to decompress the file. - - Returns: - A Pooch processor object. 
- """ - if extension in {".gz", ".bz2", ".xz"} and decompress is None: - return pooch.Decompress() - - elif extension in {".tar", ".tar.gz"} and unpack is None: - return pooch.Untar() - - elif extension == ".zip" and unpack is None: - return pooch.Unzip() - - else: - return None - - -def print_resources(*, output_source: TextIO = sys.stdout) -> None: - """Prints all available downloadable resources & their sources to STDOUT.""" - print("#resource_name\tsource_options", file=output_source) - for resource_name, resource in sorted(get_all_resources().items()): - sources = [] - if resource.ngc is not None: - sources.append("ngc") - if resource.pbss is not None: - sources.append("pbss") - print(f"{resource_name}\t{','.join(sources)}", file=output_source) - - -def entrypoint(): - """Allows a user to get a specific artifact from the command line.""" - parser = argparse.ArgumentParser( - description="Retrieve the local path to the requested artifact name or list resources." - ) - - # Create mutually exclusive group - group = parser.add_mutually_exclusive_group(required=True) - - # Add the argument for artifact name, which is required if --list-resources is not used - group.add_argument("artifact_name", type=str, nargs="?", help="Name of the artifact") - - # Add the --list-resources option - group.add_argument( - "--list-resources", action="store_true", default=False, help="List all available artifacts and then exit." - ) - - # Add the --source option - parser.add_argument( - "--source", - type=str, - choices=["pbss", "ngc"], - default="ngc", - help='Backend to use, Internal NVIDIA users can set this to "pbss".', - ) - - parser.add_argument( - "--all", - action="store_true", - default=False, - help="Download all resources. 
Ignores all other options.", - ) - args = parser.parse_args() - maybe_error = main( - download_all=args.all, - list_resources=args.list_resources, - artifact_name=args.artifact_name, - source=args.source, - ) - if maybe_error is not None: - parser.error(maybe_error) - - -if __name__ == "__main__": - entrypoint() - - -def main( - download_all: bool, list_resources: bool, artifact_name: str, source: Literal["pbss", "ngc"] -) -> Optional[str]: - """Main download script logic: parameters are 1:1 with CLI flags. Returns string describing error on failure.""" - if download_all: - print("Downloading all resources:", file=sys.stderr) - print_resources(output_source=sys.stderr) - print("-" * 80, file=sys.stderr) - - resource_to_local: dict[str, Path] = {} - for resource_name in tqdm( - sorted(get_all_resources()), - desc="Downloading Resources", - ): - with contextlib.redirect_stdout(sys.stderr): - local_path = load(resource_name, source=source) - resource_to_local[resource_name] = local_path - - print("-" * 80, file=sys.stderr) - print("All resources downloaded:", file=sys.stderr) - for resource_name, local_path in sorted(resource_to_local.items()): - print(f" {resource_name}: {str(local_path.absolute())}", file=sys.stderr) - - elif list_resources: - print_resources(output_source=sys.stdout) - - elif artifact_name is not None and len(artifact_name) > 0: - # Get the local path for the provided artifact name - with contextlib.redirect_stdout(sys.stderr): - local_path = load(artifact_name, source=source) - - # Print the result => CLI use assumes that we can get the single downloaded resource's path on STDOUT - print(str(local_path.absolute())) - - else: - return "You must provide an artifact name if --list-resources or --all is not set!" 
diff --git a/sub-packages/bionemo-core/src/bionemo/core/data/resource.py b/sub-packages/bionemo-core/src/bionemo/core/data/resource.py deleted file mode 100644 index 419975854f..0000000000 --- a/sub-packages/bionemo-core/src/bionemo/core/data/resource.py +++ /dev/null @@ -1,107 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: LicenseRef-Apache2 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import functools -import itertools -from collections import Counter -from importlib.resources import files -from pathlib import Path -from typing import Annotated, Any, Literal, Sequence - -import pydantic -import yaml -from registry.api.utils import RegistryTarget - - -__all__: Sequence[str] = ( - "Resource", - "get_all_resources", -) - - -def _validate_ngc_resource(value: str) -> str: - return str(RegistryTarget(value, "Pattern should be in format [org/[team/]]name[:version]")) - - -class Resource(pydantic.BaseModel): - """Class that represents a remote resource for downloading and caching test data.""" - - model_config = pydantic.ConfigDict(use_attribute_docstrings=True) - - tag: Annotated[str, pydantic.StringConstraints(pattern=r"^[^/]*/[^/]*$")] # Only slash between filename and tag. - """A unique identifier for the resource. 
The file(s) will be accessible via load("filename/tag").""" - - ngc: Annotated[str, pydantic.AfterValidator(_validate_ngc_resource)] | None = None - """The NGC URL for the resource. - - Should be in format [org/[team/]]name[:version]. If None, the resource is not available on NGC. - """ - - ngc_registry: Literal["model", "resource"] | None = None - """The NGC resource type (model or resource) for the data. Must be provided if ngc is not None.""" - - pbss: Annotated[pydantic.AnyUrl, pydantic.UrlConstraints(allowed_schemes=["s3"])] - """The PBSS (NVIDIA-internal) URL of the resource.""" - - sha256: str | None - """The SHA256 checksum of the resource. If None, the SHA will not be checked on download (not recommended).""" - - owner: pydantic.NameEmail - """The owner or primary point of contact for the resource, in the format "Name <email>".""" - - description: str | None = None - """A description of the file(s).""" - - unpack: Literal[False, None] = None - """Whether the resource should be unpacked after download. If None, will defer to the file extension.""" - - decompress: Literal[False, None] = None - """Whether the resource should be decompressed after download. 
If None, will defer to the file extension.""" - - @pydantic.model_validator(mode="after") - def _validate_ngc_registry(self): - if self.ngc and not self.ngc_registry: - raise ValueError(f"ngc_registry must be provided if ngc is not None: {self.tag}") - return self - - -@functools.cache -def get_all_resources(resource_path: Path | None = None) -> dict[str, Resource]: - """Return a dictionary of all resources.""" - if not resource_path: - resource_path = Path(files("bionemo.core.data").joinpath("resources")) # type: ignore - - resources_files = itertools.chain(resource_path.glob("*.yaml"), resource_path.glob("*.yml")) - - all_resources = [resource for file in resources_files for resource in _parse_resource_file(file)] - - resource_list = pydantic.TypeAdapter(list[Resource]).validate_python(all_resources) - resource_dict = {resource.tag: resource for resource in resource_list} - - if len(resource_dict) != len(resource_list): - # Show the # of and which ones are duplicated so that a user can begin debugging and resolve the issue. 
- tag_counts = Counter([resource.tag for resource in resource_list]) - raise ValueError(f"Duplicate resource tags found!: {[tag for tag, count in tag_counts.items() if count > 1]}") - - return resource_dict - - -def _parse_resource_file(file) -> list[dict[str, Any]]: - with file.open("r") as f: - resources = yaml.safe_load(f) - for resource in resources: - resource["tag"] = f"{file.stem}/{resource['tag']}" - return resources diff --git a/sub-packages/bionemo-core/tests/bionemo/core/data/test_multi_epoch_dataset.py b/sub-packages/bionemo-core/tests/bionemo/pytorch/data/test_multi_epoch_dataset.py similarity index 100% rename from sub-packages/bionemo-core/tests/bionemo/core/data/test_multi_epoch_dataset.py rename to sub-packages/bionemo-core/tests/bionemo/pytorch/data/test_multi_epoch_dataset.py diff --git a/sub-packages/bionemo-core/tests/bionemo/core/data/test_permute.py b/sub-packages/bionemo-core/tests/bionemo/pytorch/data/test_permute.py similarity index 100% rename from sub-packages/bionemo-core/tests/bionemo/core/data/test_permute.py rename to sub-packages/bionemo-core/tests/bionemo/pytorch/data/test_permute.py diff --git a/sub-packages/bionemo-core/tests/bionemo/core/data/test_resamplers.py b/sub-packages/bionemo-core/tests/bionemo/pytorch/data/test_resamplers.py similarity index 100% rename from sub-packages/bionemo-core/tests/bionemo/core/data/test_resamplers.py rename to sub-packages/bionemo-core/tests/bionemo/pytorch/data/test_resamplers.py diff --git a/sub-packages/bionemo-core/tests/bionemo/core/utils/test_dtypes.py b/sub-packages/bionemo-core/tests/bionemo/pytorch/utils/test_dtypes.py similarity index 100% rename from sub-packages/bionemo-core/tests/bionemo/core/utils/test_dtypes.py rename to sub-packages/bionemo-core/tests/bionemo/pytorch/utils/test_dtypes.py diff --git a/sub-packages/bionemo-esm2/tests/bionemo/esm2/model/test_model.py b/sub-packages/bionemo-esm2/tests/bionemo/esm2/model/test_model.py index 0438483901..4b091c9d77 100644 --- 
a/sub-packages/bionemo-esm2/tests/bionemo/esm2/model/test_model.py +++ b/sub-packages/bionemo-esm2/tests/bionemo/esm2/model/test_model.py @@ -27,7 +27,6 @@ from torch import Tensor from transformers import EsmForMaskedLM -from bionemo.core.data.load import load from bionemo.core.utils.dtypes import get_autocast_dtype from bionemo.core.utils.random_utils import random_numpy_context from bionemo.esm2.api import ESM2Config, ESM2Model @@ -37,6 +36,7 @@ from bionemo.llm.model.biobert.model import MegatronBioBertModel from bionemo.llm.utils.weight_utils import nemo1_to_nemo2_biobert_key_mapping from bionemo.testing import megatron_parallel_state_utils +from bionemo.testing.data.load import load nemo1_checkpoint_path: Path = load("esm2/nv_650m:1.0") diff --git a/sub-packages/bionemo-esm2/tests/bionemo/esm2/model/test_stop_and_go.py b/sub-packages/bionemo-esm2/tests/bionemo/esm2/model/test_stop_and_go.py index 8be351eb5b..18be7eccf3 100644 --- a/sub-packages/bionemo-esm2/tests/bionemo/esm2/model/test_stop_and_go.py +++ b/sub-packages/bionemo-esm2/tests/bionemo/esm2/model/test_stop_and_go.py @@ -23,7 +23,6 @@ from nemo.lightning.pytorch.optim import MegatronOptimizerModule from typing_extensions import override -from bionemo.core.data.load import load from bionemo.core.utils.dtypes import get_autocast_dtype from bionemo.esm2.api import ESM2Config from bionemo.esm2.data.datamodule import ESMDataModule @@ -31,6 +30,7 @@ from bionemo.esm2.data.tokenizer import BioNeMoESMTokenizer, get_tokenizer from bionemo.llm.model.biobert.lightning import biobert_lightning_module from bionemo.llm.model.lr_scheduler import WarmupAnnealDecayHoldScheduler +from bionemo.testing.data.load import load from bionemo.testing.harnesses import stop_and_go from bionemo.testing.harnesses.mode import Mode diff --git a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/scripts/geneformer_mlm_loss_eval.py b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/scripts/geneformer_mlm_loss_eval.py index 
c8965d092d..a79ae52269 100644 --- a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/scripts/geneformer_mlm_loss_eval.py +++ b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/scripts/geneformer_mlm_loss_eval.py @@ -42,7 +42,6 @@ from tqdm import trange from transformers import AutoModelForMaskedLM -from bionemo.core.data.load import load from bionemo.core.data.multi_epoch_dataset import EpochIndex from bionemo.core.utils.dtypes import get_autocast_dtype from bionemo.geneformer.api import GeneformerConfig @@ -52,6 +51,7 @@ from bionemo.llm.data import collate from bionemo.llm.model.biobert.model import BioBertConfig from bionemo.testing import megatron_parallel_state_utils +from bionemo.testing.data.load import load class GeneformerHFAdapter(torch.nn.Module): diff --git a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/scripts/infer_geneformer.py b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/scripts/infer_geneformer.py index 3a72d11e9e..1215f864b7 100644 --- a/sub-packages/bionemo-geneformer/src/bionemo/geneformer/scripts/infer_geneformer.py +++ b/sub-packages/bionemo-geneformer/src/bionemo/geneformer/scripts/infer_geneformer.py @@ -20,7 +20,6 @@ from nemo import lightning as nl from nemo.utils import logging -from bionemo.core.data.load import load from bionemo.core.utils.dtypes import PrecisionTypes, get_autocast_dtype from bionemo.geneformer.api import FineTuneSeqLenBioBertConfig, GeneformerConfig from bionemo.geneformer.data.singlecell.datamodule import SingleCellDataModule @@ -29,6 +28,7 @@ from bionemo.llm.model.biobert.lightning import biobert_lightning_module from bionemo.llm.model.biobert.model import BioBertConfig from bionemo.llm.utils.datamodule_utils import infer_global_batch_size +from bionemo.testing.data.load import load def infer_model( diff --git a/sub-packages/bionemo-geneformer/tests/bionemo/geneformer/scripts/test_pydantic_train.py 
b/sub-packages/bionemo-geneformer/tests/bionemo/geneformer/scripts/test_pydantic_train.py index f76682fbc0..7eeb47a613 100644 --- a/sub-packages/bionemo-geneformer/tests/bionemo/geneformer/scripts/test_pydantic_train.py +++ b/sub-packages/bionemo-geneformer/tests/bionemo/geneformer/scripts/test_pydantic_train.py @@ -20,7 +20,7 @@ from lightning.fabric.plugins.environments.lightning import find_free_network_port -from bionemo.core.data.load import load +from bionemo.testing.data.load import load data_path: Path = load("single_cell/testdata-20240506") / "cellxgene_2023-12-15_small" / "processed_data" diff --git a/sub-packages/bionemo-geneformer/tests/bionemo/geneformer/scripts/test_train_geneformer.py b/sub-packages/bionemo-geneformer/tests/bionemo/geneformer/scripts/test_train_geneformer.py index 5799ec7611..14ba0b03c0 100644 --- a/sub-packages/bionemo-geneformer/tests/bionemo/geneformer/scripts/test_train_geneformer.py +++ b/sub-packages/bionemo-geneformer/tests/bionemo/geneformer/scripts/test_train_geneformer.py @@ -22,11 +22,11 @@ import pytest from lightning.fabric.plugins.environments.lightning import find_free_network_port -from bionemo.core.data.load import load from bionemo.geneformer.scripts.train_geneformer import get_parser, main from bionemo.llm.model.biobert.transformer_specs import BiobertSpecOption from bionemo.llm.utils.datamodule_utils import parse_kwargs_to_arglist from bionemo.testing import megatron_parallel_state_utils +from bionemo.testing.data.load import load data_path: Path = load("single_cell/testdata-20240506") / "cellxgene_2023-12-15_small" / "processed_data" diff --git a/sub-packages/bionemo-geneformer/tests/bionemo/geneformer/test_model.py b/sub-packages/bionemo-geneformer/tests/bionemo/geneformer/test_model.py index 35621c1f13..a561cb5b87 100644 --- a/sub-packages/bionemo-geneformer/tests/bionemo/geneformer/test_model.py +++ b/sub-packages/bionemo-geneformer/tests/bionemo/geneformer/test_model.py @@ -38,7 +38,6 @@ from torch.nn import 
functional as F from tqdm import tqdm -from bionemo.core.data.load import load from bionemo.core.utils.batching_utils import pad_token_ids from bionemo.core.utils.dtypes import get_autocast_dtype from bionemo.core.utils.random_utils import random_numpy_context @@ -53,6 +52,7 @@ from bionemo.llm.utils.weight_utils import nemo1_to_nemo2_biobert_key_mapping from bionemo.testing import megatron_parallel_state_utils from bionemo.testing.callbacks import MetricTracker +from bionemo.testing.data.load import load from bionemo.testing.utils import ( assert_matrix_correlation_above_value, assert_matrix_mape_below_value, diff --git a/sub-packages/bionemo-geneformer/tests/bionemo/geneformer/test_stop_and_go.py b/sub-packages/bionemo-geneformer/tests/bionemo/geneformer/test_stop_and_go.py index 3820844131..5d55514073 100644 --- a/sub-packages/bionemo-geneformer/tests/bionemo/geneformer/test_stop_and_go.py +++ b/sub-packages/bionemo-geneformer/tests/bionemo/geneformer/test_stop_and_go.py @@ -36,12 +36,12 @@ from nemo.lightning.pytorch.optim.megatron import MegatronOptimizerModule from typing_extensions import override -from bionemo.core.data.load import load from bionemo.core.utils.dtypes import get_autocast_dtype from bionemo.geneformer.api import GeneformerConfig from bionemo.geneformer.data.singlecell.preprocess import GeneformerPreprocess from bionemo.llm.model.biobert.lightning import biobert_lightning_module from bionemo.testing import testing_callbacks +from bionemo.testing.data.load import load from bionemo.testing.harnesses import stop_and_go from bionemo.testing.harnesses.mode import Mode diff --git a/sub-packages/bionemo-scdl/tests/bionemo/scdl/conftest.py b/sub-packages/bionemo-scdl/tests/bionemo/scdl/conftest.py index 0c128e76e2..a13477c81b 100644 --- a/sub-packages/bionemo-scdl/tests/bionemo/scdl/conftest.py +++ b/sub-packages/bionemo-scdl/tests/bionemo/scdl/conftest.py @@ -19,7 +19,7 @@ import pytest -from bionemo.core.data.load import load +from 
bionemo.testing.data.load import load @pytest.fixture diff --git a/sub-packages/bionemo-testing/pyproject.toml b/sub-packages/bionemo-testing/pyproject.toml index 9c503a842e..ccac13b4ad 100644 --- a/sub-packages/bionemo-testing/pyproject.toml +++ b/sub-packages/bionemo-testing/pyproject.toml @@ -15,10 +15,23 @@ dependencies = [ 'bionemo-core', 'bionemo-llm', # external + 'boto3', 'email-validator', + 'ngcsdk', + 'pooch', + 'pydantic>=2.7.0', 'pytest', + 'pyyaml', + 'tqdm', ] +[project.scripts] +download_bionemo_data = "bionemo.testing.data.load:entrypoint" + +# Make sure that the resource yaml files are being packaged alongside the python files. +[tool.setuptools.package-data] +"bionemo.testing" = ["**/*.yaml"] + [tool.setuptools.packages.find] where = ["src"] include = ["bionemo.*"] diff --git a/sub-packages/bionemo-core/src/bionemo/core/data/README.md b/sub-packages/bionemo-testing/src/bionemo/testing/data/README.md similarity index 100% rename from sub-packages/bionemo-core/src/bionemo/core/data/README.md rename to sub-packages/bionemo-testing/src/bionemo/testing/data/README.md diff --git a/sub-packages/bionemo-testing/src/bionemo/testing/data/load.py b/sub-packages/bionemo-testing/src/bionemo/testing/data/load.py index cc5ec123cb..58978abec6 100644 --- a/sub-packages/bionemo-testing/src/bionemo/testing/data/load.py +++ b/sub-packages/bionemo-testing/src/bionemo/testing/data/load.py @@ -12,20 +12,284 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-from typing import Sequence -from bionemo.core.data.load import default_ngc_client, default_pbss_client, entrypoint, load +import argparse +import contextlib +import shutil +import sys +import tempfile +from dataclasses import dataclass +from pathlib import Path +from typing import Literal, Optional, Sequence, TextIO +import boto3 +import ngcsdk +import pooch +from botocore.config import Config +from tqdm import tqdm -_ = entrypoint -# This needs to be around so that ruff doesn't automatically remove it as it's unused. -# We don't want to include it in __all__. -# But older installations __may__ be using the old CLI path (bionemo.core.data.load:entrypoint) -# so this is here for backwards compatability. +from bionemo.core import BIONEMO_CACHE_DIR +from bionemo.testing.data.resource import Resource, get_all_resources -__all__: Sequence[str] = ( - "load", - "default_ngc_client", - "default_pbss_client", -) +__all__: Sequence[str] = ("load",) + + +def default_pbss_client(): + """Create a default S3 client for PBSS.""" + retry_config = Config(retries={"max_attempts": 10, "mode": "standard"}) + return boto3.client("s3", endpoint_url="https://pbss.s8k.io", config=retry_config) + + +def _s3_download(url: str, output_file: str | Path, _: pooch.Pooch) -> None: + """Download a file from PBSS.""" + # Parse S3 URL to get bucket and key + parts = url.replace("s3://", "").split("/") + bucket = parts[0] + key = "/".join(parts[1:]) + + with contextlib.closing(default_pbss_client()) as s3: + object_size = s3.head_object(Bucket=bucket, Key=key)["ContentLength"] + progress_bar = tqdm(total=object_size, unit="B", unit_scale=True, desc=url) + + # Define callback + def progress_callback(bytes_transferred): + progress_bar.update(bytes_transferred) + + # Download file from S3 + s3.download_file(bucket, key, output_file, Callback=progress_callback) + + +def default_ngc_client() -> ngcsdk.Client: + """Create a default NGC client. 
+ + This should load the NGC API key from ~/.ngc/config, or from environment variables passed to the docker container. + """ + return ngcsdk.Client() + + +@dataclass +class NGCDownloader: + """A class to download files from NGC in a Pooch-compatible way. + + NGC downloads are typically structured as directories, while pooch expects a single file. This class + downloads a single file from an NGC directory and moves it to the desired location. + """ + + filename: str + ngc_registry: Literal["model", "resource"] + + def __call__(self, url: str, output_file: str | Path, _: pooch.Pooch) -> None: + """Download a file from NGC.""" + client = default_ngc_client() + + download_fns = { + "model": client.registry.model.download_version, + "resource": client.registry.resource.download_version, + } + + output_file = Path(output_file) + output_file.parent.mkdir(parents=True, exist_ok=True) + + # NGC seems to always download to a specific directory that we can't specify ourselves. + ngc_dirname = Path(url).name.replace(":", "_v") + + with tempfile.TemporaryDirectory(dir=output_file.parent) as temp_dir: + download_fns[self.ngc_registry](url, temp_dir, file_patterns=[self.filename]) + shutil.move(Path(temp_dir) / ngc_dirname / self.filename, output_file) + + +def load( + model_or_data_tag: str, + source: Literal["ngc", "pbss"] = "pbss", + resources: dict[str, Resource] | None = None, + cache_dir: Path | None = None, +) -> Path: + """Download a resource from PBSS or NGC. + + Args: + model_or_data_tag: A pointer to the desired resource. Must be a key in the resources dictionary. + source: Either "pbss" (NVIDIA-internal download) or "ngc" (NVIDIA GPU Cloud). Defaults to "pbss". + resources: A custom dictionary of resources. If None, the default resources will be used. (Mostly for testing.) + cache_dir: The directory to store downloaded files. Defaults to BIONEMO_CACHE_DIR. (Mostly for testing.) 
+ + Raises: + ValueError: If the desired tag was not found, or if an NGC url was requested but not provided. + + Returns: + A Path object pointing either at the downloaded file, or at a decompressed folder containing the + file(s). + + Examples: + For a resource specified in 'filename.yaml' with tag 'tag', the following will download the file: + >>> load("filename/tag") + PosixPath(/tmp/bionemo/downloaded-file-name) + """ + if resources is None: + resources = get_all_resources() + + if cache_dir is None: + cache_dir = BIONEMO_CACHE_DIR + + if model_or_data_tag not in resources: + raise ValueError(f"Resource '{model_or_data_tag}' not found.") + + if source == "ngc" and resources[model_or_data_tag].ngc is None: + raise ValueError(f"Resource '{model_or_data_tag}' does not have an NGC URL.") + + resource = resources[model_or_data_tag] + filename = str(resource.pbss).split("/")[-1] + + extension = "".join(Path(filename).suffixes) + processor = _get_processor(extension, resource.unpack, resource.decompress) + + if source == "pbss": + download_fn = _s3_download + url = resource.pbss + + elif source == "ngc": + assert resource.ngc_registry is not None + download_fn = NGCDownloader(filename=filename, ngc_registry=resource.ngc_registry) + url = resource.ngc + + else: + raise ValueError(f"Source '{source}' not supported.") + + download = pooch.retrieve( + url=str(url), + known_hash=resource.sha256, + path=cache_dir, + downloader=download_fn, + processor=processor, + ) + + # Pooch by default returns a list of unpacked files if they unpack a zipped or tarred directory. Instead of that, we + # just want the unpacked, parent folder. + if isinstance(download, list): + return Path(processor.extract_dir) # type: ignore + + else: + return Path(download) + + +def _get_processor(extension: str, unpack: bool | None, decompress: bool | None): + """Get the processor for a given file extension. 
+ + If unpack and decompress are both None, the processor will be inferred from the file extension. + + Args: + extension: The file extension. + unpack: Whether to unpack the file. + decompress: Whether to decompress the file. + + Returns: + A Pooch processor object. + """ + if extension in {".gz", ".bz2", ".xz"} and decompress is None: + return pooch.Decompress() + + elif extension in {".tar", ".tar.gz"} and unpack is None: + return pooch.Untar() + + elif extension == ".zip" and unpack is None: + return pooch.Unzip() + + else: + return None + + +def print_resources(*, output_source: TextIO = sys.stdout) -> None: + """Prints all available downloadable resources & their sources to STDOUT.""" + print("#resource_name\tsource_options", file=output_source) + for resource_name, resource in sorted(get_all_resources().items()): + sources = [] + if resource.ngc is not None: + sources.append("ngc") + if resource.pbss is not None: + sources.append("pbss") + print(f"{resource_name}\t{','.join(sources)}", file=output_source) + + +def entrypoint(): + """Allows a user to get a specific artifact from the command line.""" + parser = argparse.ArgumentParser( + description="Retrieve the local path to the requested artifact name or list resources." + ) + + # Create mutually exclusive group + group = parser.add_mutually_exclusive_group(required=True) + + # Add the argument for artifact name, which is required if --list-resources is not used + group.add_argument("artifact_name", type=str, nargs="?", help="Name of the artifact") + + # Add the --list-resources option + group.add_argument( + "--list-resources", action="store_true", default=False, help="List all available artifacts and then exit." 
+ ) + + # Add the --source option + parser.add_argument( + "--source", + type=str, + choices=["pbss", "ngc"], + default="ngc", + help='Backend to use, Internal NVIDIA users can set this to "pbss".', + ) + + parser.add_argument( + "--all", + action="store_true", + default=False, + help="Download all resources. Ignores all other options.", + ) + args = parser.parse_args() + maybe_error = main( + download_all=args.all, + list_resources=args.list_resources, + artifact_name=args.artifact_name, + source=args.source, + ) + if maybe_error is not None: + parser.error(maybe_error) + + +if __name__ == "__main__": + entrypoint() + + +def main( + download_all: bool, list_resources: bool, artifact_name: str, source: Literal["pbss", "ngc"] +) -> Optional[str]: + """Main download script logic: parameters are 1:1 with CLI flags. Returns string describing error on failure.""" + if download_all: + print("Downloading all resources:", file=sys.stderr) + print_resources(output_source=sys.stderr) + print("-" * 80, file=sys.stderr) + + resource_to_local: dict[str, Path] = {} + for resource_name in tqdm( + sorted(get_all_resources()), + desc="Downloading Resources", + ): + with contextlib.redirect_stdout(sys.stderr): + local_path = load(resource_name, source=source) + resource_to_local[resource_name] = local_path + + print("-" * 80, file=sys.stderr) + print("All resources downloaded:", file=sys.stderr) + for resource_name, local_path in sorted(resource_to_local.items()): + print(f" {resource_name}: {str(local_path.absolute())}", file=sys.stderr) + + elif list_resources: + print_resources(output_source=sys.stdout) + + elif artifact_name is not None and len(artifact_name) > 0: + # Get the local path for the provided artifact name + with contextlib.redirect_stdout(sys.stderr): + local_path = load(artifact_name, source=source) + + # Print the result => CLI use assumes that we can get the single downloaded resource's path on STDOUT + print(str(local_path.absolute())) + + else: + return "You 
must provide an artifact name if --list-resources or --all is not set!" diff --git a/sub-packages/bionemo-testing/src/bionemo/testing/data/resource.py b/sub-packages/bionemo-testing/src/bionemo/testing/data/resource.py index 677f6f49e0..065af432dc 100644 --- a/sub-packages/bionemo-testing/src/bionemo/testing/data/resource.py +++ b/sub-packages/bionemo-testing/src/bionemo/testing/data/resource.py @@ -12,12 +12,90 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from typing import Sequence -from bionemo.core.data.resource import Resource, get_all_resources +import functools +import itertools +from collections import Counter +from importlib.resources import files +from pathlib import Path +from typing import Annotated, Literal -__all__: Sequence[str] = ( - "Resource", - "get_all_resources", -) +import pydantic +import yaml +from registry.api.utils import RegistryTarget + + +def _validate_ngc_resource(value: str) -> str: + return str(RegistryTarget(value, "Pattern should be in format [org/[team/]]name[:version]")) + + +class Resource(pydantic.BaseModel): + """Class that represents a remote resource for downloading and caching test data.""" + + model_config = pydantic.ConfigDict(use_attribute_docstrings=True) + + tag: Annotated[str, pydantic.StringConstraints(pattern=r"^[^/]*/[^/]*$")] # Only slash between filename and tag. + """A unique identifier for the resource. The file(s) will be accessible via load("filename/tag").""" + + ngc: Annotated[str, pydantic.AfterValidator(_validate_ngc_resource)] | None = None + """The NGC URL for the resource. + + Should be in format [org/[team/]]name[:version]. If None, the resource is not available on NGC. + """ + + ngc_registry: Literal["model", "resource"] | None = None + """The NGC resource type (model or resource) for the data. 
Must be provided if ngc is not None.""" + + pbss: Annotated[pydantic.AnyUrl, pydantic.UrlConstraints(allowed_schemes=["s3"])] + """The PBSS (NVIDIA-internal) URL of the resource.""" + + sha256: str | None + """The SHA256 checksum of the resource. If None, the SHA will not be checked on download (not recommended).""" + + owner: pydantic.NameEmail + """The owner or primary point of contact for the resource, in the format "Name <email>".""" + + description: str | None = None + """A description of the file(s).""" + + unpack: Literal[False, None] = None + """Whether the resource should be unpacked after download. If None, will defer to the file extension.""" + + decompress: Literal[False, None] = None + """Whether the resource should be decompressed after download. If None, will defer to the file extension.""" + + @pydantic.model_validator(mode="after") + def _validate_ngc_registry(self): + if self.ngc and not self.ngc_registry: + raise ValueError(f"ngc_registry must be provided if ngc is not None: {self.tag}") + return self + + +@functools.cache +def get_all_resources(resource_path: Path | None = None) -> dict[str, Resource]: + """Return a dictionary of all resources.""" + if not resource_path: + resource_path = Path(files("bionemo.testing.data").joinpath("resources")) # type: ignore + + resources_files = itertools.chain(resource_path.glob("*.yaml"), resource_path.glob("*.yml")) + + all_resources = [resource for file in resources_files for resource in _parse_resource_file(file)] + + resource_list = pydantic.TypeAdapter(list[Resource]).validate_python(all_resources) + resource_dict = {resource.tag: resource for resource in resource_list} + + if len(resource_dict) != len(resource_list): + # Show the # of and which ones are duplicated so that a user can begin debugging and resolve the issue. 
+ tag_counts = Counter([resource.tag for resource in resource_list]) + raise ValueError(f"Duplicate resource tags found!: {[tag for tag, count in tag_counts.items() if count > 1]}") + + return resource_dict + + +def _parse_resource_file(file) -> list: + with file.open("r") as f: + resources = yaml.safe_load(f) + for resource in resources: + resource["tag"] = f"{file.stem}/{resource['tag']}" + return resources diff --git a/sub-packages/bionemo-testing/src/bionemo/testing/data/resources/esm2.yaml b/sub-packages/bionemo-core/src/bionemo/core/data/resources/esm2.yaml similarity index 100% rename from sub-packages/bionemo-testing/src/bionemo/testing/data/resources/esm2.yaml rename to sub-packages/bionemo-core/src/bionemo/core/data/resources/esm2.yaml diff --git a/sub-packages/bionemo-testing/src/bionemo/testing/data/resources/geneformer.yaml b/sub-packages/bionemo-core/src/bionemo/core/data/resources/geneformer.yaml similarity index 100% rename from sub-packages/bionemo-testing/src/bionemo/testing/data/resources/geneformer.yaml rename to sub-packages/bionemo-core/src/bionemo/core/data/resources/geneformer.yaml diff --git a/sub-packages/bionemo-testing/src/bionemo/testing/data/resources/scdl.yaml b/sub-packages/bionemo-core/src/bionemo/core/data/resources/scdl.yaml similarity index 100% rename from sub-packages/bionemo-testing/src/bionemo/testing/data/resources/scdl.yaml rename to sub-packages/bionemo-core/src/bionemo/core/data/resources/scdl.yaml diff --git a/sub-packages/bionemo-testing/src/bionemo/testing/data/resources/single_cell.yaml b/sub-packages/bionemo-core/src/bionemo/core/data/resources/single_cell.yaml similarity index 100% rename from sub-packages/bionemo-testing/src/bionemo/testing/data/resources/single_cell.yaml rename to sub-packages/bionemo-core/src/bionemo/core/data/resources/single_cell.yaml diff --git a/sub-packages/bionemo-testing/tests/bionemo/testing/data/test_load.py b/sub-packages/bionemo-core/tests/bionemo/core/data/test_load.py similarity
index 95% rename from sub-packages/bionemo-testing/tests/bionemo/testing/data/test_load.py rename to sub-packages/bionemo-core/tests/bionemo/core/data/test_load.py index ae413d8147..452604ce3d 100644 --- a/sub-packages/bionemo-testing/tests/bionemo/testing/data/test_load.py +++ b/sub-packages/bionemo-core/tests/bionemo/core/data/test_load.py @@ -23,8 +23,8 @@ import pytest -from bionemo.testing.data.load import default_ngc_client, default_pbss_client, load -from bionemo.testing.data.resource import get_all_resources +from bionemo.core.data.load import default_ngc_client, default_pbss_client, load +from bionemo.core.data.resource import get_all_resources def test_load_raises_error_on_invalid_tag(tmp_path): @@ -98,7 +98,7 @@ def test_load_raises_with_no_ngc_url(tmp_path): load("foo/bar", source="ngc", resources=get_all_resources(tmp_path), cache_dir=tmp_path) # type: ignore -@patch("bionemo.testing.data.load._s3_download") +@patch("bionemo.core.data.load._s3_download") def test_load_with_file(mocked_s3_download, tmp_path): (tmp_path / "foo.yaml").write_text( """ @@ -115,7 +115,7 @@ def test_load_with_file(mocked_s3_download, tmp_path): assert file_path.read_text() == "test" -@patch("bionemo.testing.data.load._s3_download") +@patch("bionemo.core.data.load._s3_download") def test_load_with_gzipped_file(mocked_s3_download, tmp_path): (tmp_path / "foo.yaml").write_text( """ @@ -137,7 +137,7 @@ def write_compressed_text(_1, output_file: str, _2): assert file_path.read_text() == "test" -@patch("bionemo.testing.data.load._s3_download") +@patch("bionemo.core.data.load._s3_download") def test_load_with_gzipped_file_no_decomp(mocked_s3_download, tmp_path): (tmp_path / "foo.yaml").write_text( """ @@ -163,7 +163,7 @@ def write_compressed_text(_1, output_file: str, _2): assert f.read() == "test" -@patch("bionemo.testing.data.load._s3_download") +@patch("bionemo.core.data.load._s3_download") def test_load_with_tar_directory(mocked_s3_download, tmp_path): (tmp_path /
"foo.yaml").write_text( """ @@ -195,7 +195,7 @@ def write_compressed_dir(_1, output_file: str, _2): assert (file_path / "test_file").read_text() == "test" -@patch("bionemo.testing.data.load._s3_download") +@patch("bionemo.core.data.load._s3_download") def test_load_with_tar_directory_no_unpack(mocked_s3_download, tmp_path): (tmp_path / "foo.yaml").write_text( """ @@ -232,7 +232,7 @@ def write_tarfile_dir(_1, output_file: str, _2): assert (tmp_path / "extracted/test_file").read_text() == "test" -@patch("bionemo.testing.data.load._s3_download") +@patch("bionemo.core.data.load._s3_download") def test_load_with_targz_directory(mocked_s3_download, tmp_path): (tmp_path / "foo.yaml").write_text( """ @@ -274,7 +274,7 @@ def test_default_ngc_client(): assert clt.api_key is not None -@patch("bionemo.testing.data.load.default_ngc_client") +@patch("bionemo.core.data.load.default_ngc_client") def test_load_with_file_from_ngc_model(mocked_get_ngc_client, tmp_path): (tmp_path / "foo.yaml").write_text( """ @@ -305,7 +305,7 @@ def mocked_ngc_download(url, destination, file_patterns): mocked_ngc_client.registry.model.download_version.assert_called_once() -@patch("bionemo.testing.data.load.default_ngc_client") +@patch("bionemo.core.data.load.default_ngc_client") def test_load_with_file_from_ngc_resource(mocked_get_ngc_client, tmp_path): (tmp_path / "foo.yaml").write_text( """ diff --git a/sub-packages/bionemo-testing/tests/bionemo/testing/data/test_resource.py b/sub-packages/bionemo-core/tests/bionemo/core/data/test_resource.py similarity index 98% rename from sub-packages/bionemo-testing/tests/bionemo/testing/data/test_resource.py rename to sub-packages/bionemo-core/tests/bionemo/core/data/test_resource.py index ebd81abbb9..2b3370e5ad 100644 --- a/sub-packages/bionemo-testing/tests/bionemo/testing/data/test_resource.py +++ b/sub-packages/bionemo-core/tests/bionemo/core/data/test_resource.py @@ -20,7 +20,7 @@ import pydantic import pytest -from bionemo.testing.data.resource import
Resource, get_all_resources +from bionemo.core.data.resource import Resource, get_all_resources def test_get_all_resources_returns_valid_entries():