diff --git a/.github/workflows/test-notebooks.yaml b/.github/workflows/test-notebooks.yaml index 61272055..ae09a42f 100644 --- a/.github/workflows/test-notebooks.yaml +++ b/.github/workflows/test-notebooks.yaml @@ -11,7 +11,37 @@ concurrency: cancel-in-progress: true jobs: + ensure-data-is-cached: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v5 + with: + filter: blob:none + fetch-depth: 0 + + - name: Install uv + uses: astral-sh/setup-uv@v7 + with: + enable-cache: false + python-version: "3.13" + + - name: Restore data cache + id: data-cache + uses: actions/cache@v4 + with: + path: data # IMPORTANT: this will fail if scanpy.settings.datasetdir default changes + key: data-${{ hashFiles('**/download_data.py') }} + restore-keys: | + data- + enableCrossOsArchive: true + + - name: Download datasets + # Always run to ensure any missing files are downloaded + # (restore-keys may provide partial cache) + run: uvx hatch run data:download + test: + needs: [ensure-data-is-cached] runs-on: ${{ matrix.os }} strategy: fail-fast: false @@ -33,8 +63,20 @@ jobs: enable-cache: true python-version: ${{ matrix.python }} cache-dependency-glob: pyproject.toml + + - name: Restore data cache + id: data-cache + uses: actions/cache@v4 + with: + path: data # IMPORTANT: this will fail if scanpy.settings.datasetdir default changes + key: data-${{ hashFiles('**/download_data.py') }} + restore-keys: | + data- + enableCrossOsArchive: true + - name: Create notebooks environment run: uvx hatch -v env create notebooks + - name: Test notebooks env: MPLBACKEND: agg diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 945ba736..8681b225 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -42,14 +42,15 @@ jobs: id: data-cache uses: actions/cache@v4 with: - path: | - ~/.cache/squidpy/*.h5ad - ~/.cache/squidpy/*.zarr + path: data # IMPORTANT: this will fail if scanpy.settings.datasetdir default changes key: data-${{ 
hashFiles('**/download_data.py') }} + restore-keys: | + data- enableCrossOsArchive: true - name: Download datasets - if: steps.data-cache.outputs.cache-hit != 'true' + # Always run to ensure any missing files are downloaded + # (restore-keys may provide partial cache) run: uvx hatch run data:download # Get the test environment from hatch as defined in pyproject.toml. @@ -122,10 +123,10 @@ jobs: id: data-cache uses: actions/cache@v4 with: - path: | - ~/.cache/squidpy/*.h5ad - ~/.cache/squidpy/*.zarr + path: data # IMPORTANT: this will fail if scanpy.settings.datasetdir default changes key: data-${{ hashFiles('**/download_data.py') }} + restore-keys: | + data- enableCrossOsArchive: true - name: System dependencies (Linux) @@ -181,10 +182,10 @@ jobs: id: coverage-data-cache uses: actions/cache@v4 with: - path: | - ~/.cache/squidpy/*.h5ad - ~/.cache/squidpy/*.zarr + path: data # IMPORTANT: this will fail if scanpy.settings.datasetdir default changes key: data-${{ hashFiles('**/download_data.py') }} + restore-keys: | + data- enableCrossOsArchive: true - name: System dependencies (Linux) diff --git a/.scripts/ci/download_data.py b/.scripts/ci/download_data.py index 92cfc267..c22566ff 100644 --- a/.scripts/ci/download_data.py +++ b/.scripts/ci/download_data.py @@ -1,77 +1,70 @@ #!/usr/bin/env python3 +"""Download datasets to populate CI cache. + +This script downloads all datasets that tests might need. +The downloader handles caching to scanpy.settings.datasetdir. 
+""" + from __future__ import annotations import argparse -from pathlib import Path -from typing import Any -from squidpy.datasets import visium_hne_sdata +from scanpy import settings +from spatialdata._logging import logger _CNT = 0 # increment this when you want to rebuild the CI cache -_ROOT = Path.home() / ".cache" / "squidpy" - - -def _print_message(func_name: str, path: Path, *, dry_run: bool = False) -> None: - prefix = "[DRY RUN]" if dry_run else "" - if path.is_file(): - print(f"{prefix}[Loading] {func_name:>25} <- {str(path):>25}") - else: - print(f"{prefix}[Downloading] {func_name:>25} -> {str(path):>25}") - - -def _maybe_download_data(func_name: str, path: Path) -> Any: - import squidpy as sq - - try: - return getattr(sq.datasets, func_name)(path=path) - except Exception as e: # noqa: BLE001 - print(f"File {str(path):>25} seems to be corrupted: {e}. Removing and retrying") - path.unlink() - - return getattr(sq.datasets, func_name)(path=path) def main(args: argparse.Namespace) -> None: from anndata import AnnData import squidpy as sq + from squidpy.datasets._downloader import get_downloader - all_datasets = sq.datasets._dataset.__all__ + sq.datasets._image.__all__ - all_extensions = ["h5ad"] * len(sq.datasets._dataset.__all__) + ["tiff"] * len(sq.datasets._image.__all__) + downloader = get_downloader() + registry = downloader.registry + + # Visium samples tested in CI + visium_samples_to_cache = [ + "V1_Mouse_Kidney", + "Targeted_Visium_Human_SpinalCord_Neuroscience", + "Visium_FFPE_Human_Breast_Cancer", + ] if args.dry_run: - for func_name, ext in zip(all_datasets, all_extensions): - if func_name == "visium_hne_sdata": - ext = "zarr" - path = _ROOT / f"{func_name}.{ext}" - _print_message(func_name, path, dry_run=True) + logger.info("Cache: %s", settings.datasetdir) + logger.info( + "Would download: %d AnnData, %d images, %d SpatialData, %d Visium", + len(registry.anndata_datasets), + len(registry.image_datasets), + len(registry.spatialdata_datasets), + 
len(visium_samples_to_cache), + ) return - # could be parallelized, but on CI it largely does not matter (usually limited to 2 cores + bandwidth limit) - for func_name, ext in zip(all_datasets, all_extensions): - if func_name == "visium_hne_sdata": - ext = "zarr" - path = _ROOT / f"{func_name}.{ext}" - - _print_message(func_name, path) - obj = visium_hne_sdata(_ROOT) + # Download all datasets - the downloader handles caching + for name in registry.anndata_datasets: + obj = getattr(sq.datasets, name)() + assert isinstance(obj, AnnData) - assert path.is_dir(), f"Expected a .zarr folder at {path}" - continue + for name in registry.image_datasets: + obj = getattr(sq.datasets, name)() + assert isinstance(obj, sq.im.ImageContainer) - path = _ROOT / f"{func_name}.{ext}" - _print_message(func_name, path) - obj = _maybe_download_data(func_name, path) + for name in registry.spatialdata_datasets: + getattr(sq.datasets, name)() - # we could do without the AnnData check as well (1 less req. in tox.ini), but it's better to be safe - assert isinstance(obj, AnnData | sq.im.ImageContainer), type(obj) - assert path.is_file(), path + for sample in visium_samples_to_cache: + obj = sq.datasets.visium(sample, include_hires_tiff=True) + assert isinstance(obj, AnnData) if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Download data used for tutorials/examples.") + parser = argparse.ArgumentParser(description="Download datasets to populate CI cache.") parser.add_argument( - "--dry-run", action="store_true", help="Do not download any data, just print what would be downloaded." 
+ "--dry-run", + action="store_true", + help="Do not download, just print what would be downloaded.", ) main(parser.parse_args()) diff --git a/pyproject.toml b/pyproject.toml index 31ff9baa..795a1e12 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -60,6 +60,8 @@ dependencies = [ "omnipath>=1.0.7", "pandas>=2.1", "pillow>=8", + "pooch>=1.6", + "pyyaml>=6", "scanpy>=1.9.3", "scikit-image>=0.25", # due to https://github.com/scikit-image/scikit-image/issues/6850 breaks rescale ufunc diff --git a/src/squidpy/datasets/_10x_datasets.py b/src/squidpy/datasets/_10x_datasets.py deleted file mode 100644 index bc645f4f..00000000 --- a/src/squidpy/datasets/_10x_datasets.py +++ /dev/null @@ -1,166 +0,0 @@ -from __future__ import annotations - -import tarfile -from pathlib import Path -from typing import Literal, NamedTuple - -import spatialdata as sd -from anndata import AnnData -from scanpy import settings - -from squidpy._constants._constants import TenxVersions -from squidpy.datasets._utils import DEFAULT_CACHE_DIR, PathLike, _get_zipped_dataset, download_file - -__all__ = ["visium"] - - -class VisiumFiles(NamedTuple): - feature_matrix: str - spatial_attrs: str - tif_image: str - - -VisiumDatasets = Literal[ - # spaceranger version 1.1.0 datasets - "V1_Breast_Cancer_Block_A_Section_1", - "V1_Breast_Cancer_Block_A_Section_2", - "V1_Human_Heart", - "V1_Human_Lymph_Node", - "V1_Mouse_Kidney", - "V1_Adult_Mouse_Brain", - "V1_Mouse_Brain_Sagittal_Posterior", - "V1_Mouse_Brain_Sagittal_Posterior_Section_2", - "V1_Mouse_Brain_Sagittal_Anterior", - "V1_Mouse_Brain_Sagittal_Anterior_Section_2", - "V1_Human_Brain_Section_1", - "V1_Human_Brain_Section_2", - "V1_Adult_Mouse_Brain_Coronal_Section_1", - "V1_Adult_Mouse_Brain_Coronal_Section_2", - # spaceranger version 1.2.0 datasets - "Targeted_Visium_Human_Cerebellum_Neuroscience", - "Parent_Visium_Human_Cerebellum", - "Targeted_Visium_Human_SpinalCord_Neuroscience", - "Parent_Visium_Human_SpinalCord", - 
"Targeted_Visium_Human_Glioblastoma_Pan_Cancer", - "Parent_Visium_Human_Glioblastoma", - "Targeted_Visium_Human_BreastCancer_Immunology", - "Parent_Visium_Human_BreastCancer", - "Targeted_Visium_Human_OvarianCancer_Pan_Cancer", - "Targeted_Visium_Human_OvarianCancer_Immunology", - "Parent_Visium_Human_OvarianCancer", - "Targeted_Visium_Human_ColorectalCancer_GeneSignature", - "Parent_Visium_Human_ColorectalCancer", - # spaceranger version 1.3.0 datasets - "Visium_FFPE_Mouse_Brain", - "Visium_FFPE_Mouse_Brain_IF", - "Visium_FFPE_Mouse_Kidney", - "Visium_FFPE_Human_Breast_Cancer", - "Visium_FFPE_Human_Prostate_Acinar_Cell_Carcinoma", - "Visium_FFPE_Human_Prostate_Cancer", - "Visium_FFPE_Human_Prostate_IF", - "Visium_FFPE_Human_Normal_Prostate", -] - - -def visium( - sample_id: VisiumDatasets, - *, - include_hires_tiff: bool = False, - base_dir: PathLike | None = None, -) -> AnnData: - """ - Download Visium `datasets `_ from *10x Genomics*. - - Parameters - ---------- - sample_id - Name of the Visium dataset. - include_hires_tiff - Whether to download the high-resolution tissue section into - :attr:`anndata.AnnData.uns` ``['spatial']['{sample_id}']['metadata']['source_image_path']``. - base_dir - Directory where to download the data. If `None`, use :attr:`scanpy.settings.datasetdir`. - - Returns - ------- - Spatial :class:`anndata.AnnData`. 
- """ - from squidpy.read._read import visium as read_visium - - if sample_id.startswith("V1_"): - spaceranger_version = TenxVersions.V1 - elif sample_id.startswith("Targeted_") or sample_id.startswith("Parent_"): - spaceranger_version = TenxVersions.V2 - else: - spaceranger_version = TenxVersions.V3 - - if base_dir is None: - base_dir = settings.datasetdir - base_dir = Path(base_dir) - sample_dir = base_dir / sample_id - sample_dir.mkdir(exist_ok=True, parents=True) - - url_prefix = f"https://cf.10xgenomics.com/samples/spatial-exp/{spaceranger_version}/{sample_id}/" - visium_files = VisiumFiles( - f"{sample_id}_filtered_feature_bc_matrix.h5", - f"{sample_id}_spatial.tar.gz", - f"{sample_id}_image.tif", - ) - - # download spatial data - tar_pth = sample_dir / visium_files.spatial_attrs - download_file(filename=tar_pth, backup_url=url_prefix + visium_files.spatial_attrs) - with tarfile.open(tar_pth) as f: - for el in f: - if not (sample_dir / el.name).exists(): - f.extract(el, sample_dir) - - # download counts - download_file( - filename=sample_dir / "filtered_feature_bc_matrix.h5", - backup_url=url_prefix + visium_files.feature_matrix, - ) - - if include_hires_tiff: # download image - download_file(filename=sample_dir / "image.tif", backup_url=url_prefix + visium_files.tif_image) - return read_visium( - base_dir / sample_id, - source_image_path=base_dir / sample_id / "image.tif", - ) - - return read_visium(base_dir / sample_id) - - -def visium_hne_sdata(folderpath: Path | str | None = None) -> sd.SpatialData: - """ - Downloads a Visium H&E dataset into a specified folder and returns it as a `SpatialData` object. - - It downloads and extracts the dataset into: - - `/visium_hne_sdata.zip` for the compressed file - - `/visium_hne_sdata.zarr` for the extracted dataset - - Parameters - ---------- - folderpath : Path | str - A folder path where the dataset will be downloaded and extracted. The resulting `.zarr` - folder is used to load the `SpatialData` object. 
- - Returns - ------- - SpatialData - The downloaded and extracted Visium H&E dataset as a `SpatialData` object. - """ - - FIGSHARE_ID = "52370645" - DATASET_NAME = "visium_hne_sdata" - - if folderpath is None: - folderpath = DEFAULT_CACHE_DIR - else: - folderpath = Path(folderpath).expanduser().absolute() - - return _get_zipped_dataset( - folderpath=folderpath, - dataset_name=DATASET_NAME, - figshare_id=FIGSHARE_ID, - ) diff --git a/src/squidpy/datasets/__init__.py b/src/squidpy/datasets/__init__.py index f5fad42a..82bea08b 100644 --- a/src/squidpy/datasets/__init__.py +++ b/src/squidpy/datasets/__init__.py @@ -1,7 +1,57 @@ from __future__ import annotations -from squidpy.datasets._10x_datasets import visium, visium_hne_sdata -from squidpy.datasets._dataset import * # noqa: F403 -from squidpy.datasets._image import * # noqa: F403 +from squidpy.datasets._datasets import ( + # Type aliases for dataset names + AnnDataDatasets, + ImageDatasets, + SpatialDataDatasets, + VisiumDatasets, + # AnnData datasets + four_i, + imc, + merfish, + mibitof, + sc_mouse_cortex, + seqfish, + slideseqv2, + # 10x Genomics Visium + visium, + visium_fluo_adata, + visium_fluo_adata_crop, + # Image datasets + visium_fluo_image_crop, + visium_hne_adata, + visium_hne_adata_crop, + visium_hne_image, + visium_hne_image_crop, + visium_hne_sdata, +) -__all__ = ["visium", "visium_hne_sdata"] +__all__ = [ + # Type aliases + "VisiumDatasets", + "AnnDataDatasets", + "ImageDatasets", + "SpatialDataDatasets", + # Datasets by format: + # AnnData + "four_i", + "imc", + "seqfish", + "visium_hne_adata", + "visium_hne_adata_crop", + "visium_fluo_adata", + "visium_fluo_adata_crop", + "sc_mouse_cortex", + "mibitof", + "merfish", + "slideseqv2", + # AnnData with Image + "visium", + # Image + "visium_fluo_image_crop", + "visium_hne_image_crop", + "visium_hne_image", + # SpatialData + "visium_hne_sdata", +] diff --git a/src/squidpy/datasets/_dataset.py b/src/squidpy/datasets/_dataset.py deleted file mode 100644 
index 75b4d65d..00000000 --- a/src/squidpy/datasets/_dataset.py +++ /dev/null @@ -1,109 +0,0 @@ -from __future__ import annotations - -from copy import copy -from typing import Any, Protocol - -from anndata import AnnData - -from squidpy.datasets._10x_datasets import visium_hne_sdata -from squidpy.datasets._utils import AMetadata, PathLike - - -class Dataset(Protocol): - def __call__(self, path: PathLike | None = ..., **kwargs: Any) -> AnnData: ... - - -_4i = AMetadata( - name="four_i", - doc_header="Pre-processed subset 4i dataset from `Gut et al `__.", - shape=(270876, 43), - url="https://ndownloader.figshare.com/files/26254294", -) -_imc = AMetadata( - name="imc", - doc_header="Pre-processed subset IMC dataset from `Jackson et al " - "`__.", - shape=(4668, 34), - url="https://ndownloader.figshare.com/files/26098406", -) -_seqfish = AMetadata( - name="seqfish", - doc_header="Pre-processed subset seqFISH dataset from `Lohoff et al " - "`__.", - shape=(19416, 351), - url="https://ndownloader.figshare.com/files/26098403", -) -_vha = AMetadata( - name="visium_hne_adata", - doc_header="Pre-processed `10x Genomics Visium H&E dataset " - "`__.", - shape=(2688, 18078), - url="https://ndownloader.figshare.com/files/26098397", -) -_vfa = AMetadata( - name="visium_fluo_adata", - doc_header="Pre-processed `10x Genomics Visium Fluorecent dataset " - "`__.", - shape=(2800, 16562), - url="https://ndownloader.figshare.com/files/26098391", -) -_vhac = AMetadata( - name="visium_hne_adata_crop", - doc_header="Pre-processed subset `10x Genomics Visium H&E dataset " - "`__.", - shape=(684, 18078), - url="https://ndownloader.figshare.com/files/26098382", -) -_vfac = AMetadata( - name="visium_fluo_adata_crop", - doc_header="Pre-processed subset `10x Genomics Visium Fluorescent dataset " - "`__.", - shape=(704, 16562), - url="https://ndownloader.figshare.com/files/26098376", -) -_smc = AMetadata( - name="sc_mouse_cortex", - doc_header="Pre-processed `scRNA-seq mouse cortex `__.", - 
shape=(21697, 36826), - url="https://ndownloader.figshare.com/files/26404781", -) -_mibitof = AMetadata( - name="mibitof", - doc_header="Pre-processed MIBI-TOF dataset from `Hartmann et al `__.", - shape=(3309, 36), - url="https://ndownloader.figshare.com/files/28241139", -) -_merfish = AMetadata( - name="merfish", - doc_header="Pre-processed MERFISH dataset from `Moffitt et al `__.", - shape=(73655, 161), - url="https://ndownloader.figshare.com/files/28169379", -) -_slideseqv2 = AMetadata( - name="slideseqv2", - doc_header="Pre-processed SlideseqV2 dataset from `Stickles et al `__.", - shape=(41786, 4000), - url="https://ndownloader.figshare.com/files/28242783", -) - -for name, var in copy(locals()).items(): - if isinstance(var, AMetadata): - var._create_function(name, globals()) - - -__all__ = [ # noqa: F822 - "four_i", - "imc", - "seqfish", - "visium_hne_adata", - "visium_hne_adata_crop", - "visium_hne_sdata", - "visium_fluo_adata", - "visium_fluo_adata_crop", - "sc_mouse_cortex", - "mibitof", - "merfish", - "slideseqv2", -] diff --git a/src/squidpy/datasets/_datasets.py b/src/squidpy/datasets/_datasets.py new file mode 100644 index 00000000..f219caf6 --- /dev/null +++ b/src/squidpy/datasets/_datasets.py @@ -0,0 +1,246 @@ +"""Public dataset interface functions using hardcoded dataset names. + +This module provides the public API for downloading squidpy datasets. +All functions fetch datasets by their known names from the registry. 
+""" + +from __future__ import annotations + +from dataclasses import dataclass +from pathlib import Path +from typing import TYPE_CHECKING, Any, Literal + +from scanpy import settings + +from squidpy.datasets._downloader import get_downloader +from squidpy.datasets._registry import DatasetType, get_registry +from squidpy.read._utils import PathLike + +if TYPE_CHECKING: + import spatialdata as sd + from anndata import AnnData + + +# ============================================================================= +# Hardcoded dataset name types +# ============================================================================= + +# 10x Genomics Visium datasets (visium_10x type) +VisiumDatasets = Literal[ + # spaceranger version 1.1.0 datasets + "V1_Breast_Cancer_Block_A_Section_1", + "V1_Breast_Cancer_Block_A_Section_2", + "V1_Human_Heart", + "V1_Human_Lymph_Node", + "V1_Mouse_Kidney", + "V1_Adult_Mouse_Brain", + "V1_Mouse_Brain_Sagittal_Posterior", + "V1_Mouse_Brain_Sagittal_Posterior_Section_2", + "V1_Mouse_Brain_Sagittal_Anterior", + "V1_Mouse_Brain_Sagittal_Anterior_Section_2", + "V1_Human_Brain_Section_1", + "V1_Human_Brain_Section_2", + "V1_Adult_Mouse_Brain_Coronal_Section_1", + "V1_Adult_Mouse_Brain_Coronal_Section_2", + # spaceranger version 1.2.0 datasets + "Targeted_Visium_Human_Cerebellum_Neuroscience", + "Parent_Visium_Human_Cerebellum", + "Targeted_Visium_Human_SpinalCord_Neuroscience", + "Parent_Visium_Human_SpinalCord", + "Targeted_Visium_Human_Glioblastoma_Pan_Cancer", + "Parent_Visium_Human_Glioblastoma", + "Targeted_Visium_Human_BreastCancer_Immunology", + "Parent_Visium_Human_BreastCancer", + "Targeted_Visium_Human_OvarianCancer_Pan_Cancer", + "Targeted_Visium_Human_OvarianCancer_Immunology", + "Parent_Visium_Human_OvarianCancer", + "Targeted_Visium_Human_ColorectalCancer_GeneSignature", + "Parent_Visium_Human_ColorectalCancer", + # spaceranger version 1.3.0 datasets + "Visium_FFPE_Mouse_Brain", + "Visium_FFPE_Mouse_Brain_IF", + 
"Visium_FFPE_Mouse_Kidney", + "Visium_FFPE_Human_Breast_Cancer", + "Visium_FFPE_Human_Prostate_Acinar_Cell_Carcinoma", + "Visium_FFPE_Human_Prostate_Cancer", + "Visium_FFPE_Human_Prostate_IF", + "Visium_FFPE_Human_Normal_Prostate", +] + +# AnnData datasets (.h5ad) +AnnDataDatasets = Literal[ + "four_i", + "imc", + "seqfish", + "visium_hne_adata", + "visium_hne_adata_crop", + "visium_fluo_adata", + "visium_fluo_adata_crop", + "sc_mouse_cortex", + "mibitof", + "merfish", + "slideseqv2", +] + +# Image datasets (.tiff) +ImageDatasets = Literal[ + "visium_fluo_image_crop", + "visium_hne_image_crop", + "visium_hne_image", +] + +# SpatialData datasets (.zarr) +SpatialDataDatasets = Literal["visium_hne_sdata",] + + +# ============================================================================= +# 10x Genomics Visium functions +# ============================================================================= + + +def visium( + sample_id: VisiumDatasets, + *, + include_hires_tiff: bool = False, + base_dir: PathLike | None = None, +) -> AnnData: + """ + Download Visium `datasets `_ from *10x Genomics*. + + Parameters + ---------- + sample_id + Name of the Visium dataset. + include_hires_tiff + Whether to download the high-resolution tissue section into + :attr:`anndata.AnnData.uns` ``['spatial']['{sample_id}']['metadata']['source_image_path']``. + base_dir + Directory where to download the data. If `None`, uses :attr:`scanpy.settings.datasetdir`. + + Returns + ------- + :class:`anndata.AnnData` + Spatial AnnData object. + """ + # Validate sample_id against known names + downloader = get_downloader() + + if sample_id not in downloader.registry: + msg = f"Unknown Visium sample: {sample_id}. 
" + msg += f"Available samples: {downloader.registry.visium_datasets}" + raise ValueError(msg) + + # Use scanpy.settings.datasetdir/visium if base_dir not specified + if base_dir is None: + base_dir = Path(settings.datasetdir) / "visium" + + return downloader.download(sample_id, base_dir, include_hires_tiff=include_hires_tiff) + + +def visium_hne_sdata(folderpath: Path | str | None = None) -> sd.SpatialData: + """ + Download a Visium H&E dataset as a SpatialData object. + + Parameters + ---------- + folderpath + A folder path where the dataset will be downloaded and extracted. + If `None`, uses :attr:`scanpy.settings.datasetdir`. + + Returns + ------- + :class:`spatialdata.SpatialData` + The downloaded and extracted Visium H&E dataset. + """ + downloader = get_downloader() + return downloader.download("visium_hne_sdata", folderpath) + + +# ============================================================================= +# Dataset loader factory +# ============================================================================= + + +@dataclass(frozen=True) +class _DocParts: + """Documentation parts for dataset loader functions.""" + + shape_prefix: str + path_desc: str + kwargs_desc: str + return_type: str + + +_ANNDATA_DOC = _DocParts( + shape_prefix="The shape of this :class:`anndata.AnnData` object", + path_desc="Path where to save the dataset.", + kwargs_desc="Keyword arguments for ``anndata.read_h5ad``.", + return_type=":class:`anndata.AnnData`\n The dataset.", +) + +_IMAGE_DOC = _DocParts( + shape_prefix="The shape of this image is", + path_desc="Path where to save the .tiff image.", + kwargs_desc="Keyword arguments for :meth:`squidpy.im.ImageContainer.add_img`.", + return_type=":class:`squidpy.im.ImageContainer`\n The image data.", +) + +_DOC_PARTS_BY_TYPE: dict[DatasetType, _DocParts] = { + DatasetType.ANNDATA: _ANNDATA_DOC, + DatasetType.IMAGE: _IMAGE_DOC, +} + + +def _make_loader(dataset_name: str): + """Factory function to create dataset loader functions. 
+ + Automatically derives documentation from the registry based on dataset type. + """ + entry = get_registry().get(dataset_name) + + if entry is None: + raise ValueError(f"Unknown dataset: {dataset_name}") + + doc_parts = _DOC_PARTS_BY_TYPE.get(entry.type) + if doc_parts is None: + raise ValueError(f"Unsupported type for loader factory: {entry.type}") + + def loader(path: PathLike | None = None, **kwargs: Any): + return get_downloader().download(dataset_name, path, **kwargs) + + loader.__doc__ = f""" + {entry.doc_header} + + {doc_parts.shape_prefix} ``{entry.shape}``. + + Parameters + ---------- + path + {doc_parts.path_desc} + kwargs + {doc_parts.kwargs_desc} + + Returns + ------- + {doc_parts.return_type} + """ + loader.__name__ = dataset_name + return loader + + +# AnnData datasets +four_i = _make_loader("four_i") +imc = _make_loader("imc") +seqfish = _make_loader("seqfish") +visium_hne_adata = _make_loader("visium_hne_adata") +visium_fluo_adata = _make_loader("visium_fluo_adata") +visium_hne_adata_crop = _make_loader("visium_hne_adata_crop") +visium_fluo_adata_crop = _make_loader("visium_fluo_adata_crop") +sc_mouse_cortex = _make_loader("sc_mouse_cortex") +mibitof = _make_loader("mibitof") +merfish = _make_loader("merfish") +slideseqv2 = _make_loader("slideseqv2") +# Image datasets +visium_fluo_image_crop = _make_loader("visium_fluo_image_crop") +visium_hne_image_crop = _make_loader("visium_hne_image_crop") +visium_hne_image = _make_loader("visium_hne_image") diff --git a/src/squidpy/datasets/_downloader.py b/src/squidpy/datasets/_downloader.py new file mode 100644 index 00000000..8492f790 --- /dev/null +++ b/src/squidpy/datasets/_downloader.py @@ -0,0 +1,290 @@ +"""Unified dataset downloader using pooch.""" + +from __future__ import annotations + +import shutil +import tarfile +from functools import lru_cache +from pathlib import Path +from typing import TYPE_CHECKING, Any + +import pooch +from scanpy import settings +from spatialdata._logging import logger 
class DatasetDownloader:
    """Unified downloader for all squidpy datasets.

    Parameters
    ----------
    registry
        Registry describing the known datasets and the files they consist of.
    cache_dir
        Directory to cache downloaded files. Defaults to :attr:`scanpy.settings.datasetdir`.
    s3_base_url
        Base URL for S3 bucket. If None, uses ``registry.s3_base_url``.
    """

    def __init__(
        self,
        registry: DatasetRegistry,
        cache_dir: Path | str | None = None,
        s3_base_url: str | None = None,
    ):
        self.cache_dir = Path(cache_dir or settings.datasetdir)

        self.cache_dir.mkdir(parents=True, exist_ok=True)

        self.registry = registry
        self._s3_base_url = s3_base_url or self.registry.s3_base_url

    def _resolve_path(
        self,
        path: Path | str | None,
        file_entry: FileEntry,
        default_subdir: str,
    ) -> tuple[Path, str]:
        """Resolve target directory and filename from path argument.

        Parameters
        ----------
        path
            User-supplied file path, or None to use the cache directory.
        file_entry
            File whose registered name supplies the default name and suffix.
        default_subdir
            Sub-directory of ``cache_dir`` used when ``path`` is None.

        Returns
        -------
        The directory to download into and the file name to use there.
        """
        if path is None:
            return self.cache_dir / default_subdir, file_entry.name

        path = Path(path)
        if path.suffix:
            return path.parent, path.name
        # A suffix-less path borrows the suffix of the registered file name.
        return path.parent, f"{path.name}{Path(file_entry.name).suffix}"

    def _download_file(
        self,
        file_entry: FileEntry,
        target_dir: Path,
        target_name: str | None = None,
    ) -> Path:
        """Download a single file, reusing an existing local copy if present.

        Parameters
        ----------
        file_entry
            File to fetch; provides the candidate URLs and the optional SHA-256.
        target_dir
            Directory to store the file in (created if missing).
        target_name
            File name to store under. Defaults to ``file_entry.name``.

        Returns
        -------
        Path to the local file.

        Raises
        ------
        ValueError
            If ``file_entry`` yields no URL to download from.
        ExceptionGroup
            If every candidate URL failed; contains one exception per URL.
        """
        target_dir.mkdir(parents=True, exist_ok=True)
        filename = target_name or file_entry.name
        local_path = target_dir / filename

        # An existing file is returned as-is; it is not re-hashed here.
        if local_path.exists():
            logg.debug(f"Using cached file: {local_path}")
            return local_path

        urls = file_entry.get_urls(self._s3_base_url)
        if not urls:
            # ``ExceptionGroup`` rejects an empty exception list with an
            # unrelated ValueError, so report the real problem instead.
            raise ValueError(f"Failed to download {filename}: no download URL is configured")

        errors: list[Exception] = []
        for url in urls:
            try:
                logg.info(f"Downloading {filename} from {url}")
                downloaded = pooch.retrieve(
                    url=url,
                    known_hash=(f"sha256:{file_entry.sha256}" if file_entry.sha256 else None),
                    fname=filename,
                    path=str(target_dir),
                    progressbar=True,
                )
                return Path(downloaded)
            except (OSError, ValueError, RuntimeError) as e:
                errors.append(e)
                logg.warning(f"Failed to download from {url}: {e}")

        msg = f"Failed to download {filename}"
        raise ExceptionGroup(msg, errors)

    def download(self, name: str, path: Path | str | None = None, **kwargs: Any) -> Any:
        """Download a dataset by name and return the appropriate object.

        Parameters
        ----------
        name
            Dataset name from the registry.
        path
            Optional custom path for download.
        **kwargs
            Additional arguments passed to the loader. AnnData and image
            datasets forward all of them; 10x Visium datasets only read
            ``include_hires_tiff``; SpatialData datasets do not use them.

        Returns
        -------
        Loaded dataset.

        Raises
        ------
        ValueError
            If ``name`` is not in the registry or its type has no loader.
        """
        if name not in self.registry:
            raise ValueError(f"Unknown dataset: {name}. Available: {self.registry.all_names}")

        entry = self.registry[name]
        # Lambdas defer the work so only the selected loader actually runs.
        loaders = {
            DatasetType.ANNDATA: lambda: self._load_anndata(entry, path, **kwargs),
            DatasetType.IMAGE: lambda: self._load_image(entry, path, **kwargs),
            DatasetType.SPATIALDATA: lambda: self._load_spatialdata(entry, path),
            DatasetType.VISIUM_10X: lambda: self._load_visium_10x(
                entry,
                path,
                include_hires_tiff=kwargs.pop("include_hires_tiff", False),
            ),
        }

        loader = loaders.get(entry.type)
        if loader is None:
            raise ValueError(f"Unknown dataset type: {entry.type}")
        return loader()

    def _load_anndata(
        self,
        entry: DatasetEntry,
        path: Path | str | None = None,
        **kwargs: Any,
    ) -> AnnData:
        """Download and load an AnnData dataset.

        ``kwargs`` are forwarded to :func:`anndata.read_h5ad`. A shape that
        differs from the registered one is logged as a warning, not raised.
        """
        import anndata

        file_entry = entry.get_file_by_suffix(".h5ad")
        if file_entry is None:
            raise ValueError(f"Dataset {entry.name} has no .h5ad file")
        target_dir, target_name = self._resolve_path(path, file_entry, "anndata")

        local_path = self._download_file(file_entry, target_dir, target_name)
        adata = anndata.read_h5ad(local_path, **kwargs)

        if entry.shape is not None and adata.shape != entry.shape:
            logg.warning(f"Expected shape {entry.shape}, got {adata.shape}")

        return adata

    def _load_image(
        self,
        entry: DatasetEntry,
        path: Path | str | None = None,
        **kwargs: Any,
    ) -> ImageContainer:
        """Download and load an image dataset.

        ``kwargs`` are forwarded to ``ImageContainer.add_img``.
        """
        from squidpy.im import ImageContainer

        file_entry = entry.get_file_by_suffix(".tiff")
        if file_entry is None:
            raise ValueError(f"Dataset {entry.name} has no .tiff file")
        target_dir, target_name = self._resolve_path(path, file_entry, "images")

        local_path = self._download_file(file_entry, target_dir, target_name)

        img = ImageContainer()
        img.add_img(local_path, layer="image", library_id=entry.library_id, **kwargs)
        return img

    def _load_spatialdata(
        self,
        entry: DatasetEntry,
        path: Path | str | None = None,
    ) -> SpatialData:
        """Download and load a SpatialData dataset.

        ``path`` is treated as the folder holding both the downloaded archive
        and the extracted ``<name>.zarr`` store. An existing store is read
        directly without downloading.
        """
        import spatialdata as sd

        file_entry = entry.get_file_by_suffix(".zip")
        if file_entry is None:
            raise ValueError(f"Dataset {entry.name} has no .zip file")
        folder = Path(path or self.cache_dir / "spatialdata")
        folder.mkdir(parents=True, exist_ok=True)

        zarr_path = folder / f"{entry.name}.zarr"

        if zarr_path.exists():
            logg.info(f"Loading existing dataset from {zarr_path}")
            return sd.read_zarr(zarr_path)

        zip_path = self._download_file(file_entry, folder)
        logg.info(f"Extracting {zip_path} to {folder}")
        shutil.unpack_archive(str(zip_path), folder)

        if not zarr_path.exists():
            raise RuntimeError(f"Expected extracted data at {zarr_path}, but not found")

        return sd.read_zarr(zarr_path)

    def _load_visium_10x(
        self,
        entry: DatasetEntry,
        path: Path | str | None = None,
        include_hires_tiff: bool = False,
    ) -> AnnData:
        """Download and load a 10x Genomics Visium dataset.

        Parameters
        ----------
        entry
            Registry entry of the dataset.
        path
            Base directory; files are stored in ``<path>/<entry.name>``.
            Defaults to ``<cache_dir>/visium``.
        include_hires_tiff
            Also try to fetch the high-resolution image. This is best-effort:
            a missing or failing image only logs a warning.

        Raises
        ------
        ValueError
            If the entry lacks the feature matrix or the spatial archive.
        """
        from squidpy.read._read import visium as read_visium

        base_dir = Path(path or self.cache_dir / "visium")
        sample_dir = base_dir / entry.name
        sample_dir.mkdir(parents=True, exist_ok=True)

        # Download feature matrix
        matrix_file = entry.get_file("filtered_feature_bc_matrix.h5")
        if matrix_file is None:
            raise ValueError(f"Dataset {entry.name} missing filtered_feature_bc_matrix.h5")
        self._download_file(matrix_file, sample_dir)

        # Download and extract spatial data
        spatial_file = entry.get_file("spatial.tar.gz")
        if spatial_file is None:
            raise ValueError(f"Dataset {entry.name} missing spatial.tar.gz")

        spatial_path = self._download_file(spatial_file, sample_dir)
        # Use the "data" extraction filter wherever the interpreter offers it,
        # so extraction behaves the same on every supported Python version.
        extract_kwargs: dict[str, Any] = {"filter": "data"} if hasattr(tarfile, "data_filter") else {}
        with tarfile.open(spatial_path) as archive:
            for member in archive:
                # Skip members that are already on disk from an earlier run.
                if not (sample_dir / member.name).exists():
                    archive.extract(member, sample_dir, **extract_kwargs)

        # Optionally download high-res image
        source_image_path = None
        if include_hires_tiff:
            image_file = entry.get_file_by_name_prefix("image.")
            if image_file is None:
                logg.warning(f"High-res image not available for {entry.name}")
            else:
                try:
                    self._download_file(image_file, sample_dir)
                    source_image_path = sample_dir / image_file.name
                # ``_download_file`` reports failed downloads as an
                # ExceptionGroup, which must be caught explicitly here.
                except (OSError, ValueError, RuntimeError, ExceptionGroup) as e:
                    logg.warning(f"Failed to download high-res image: {e}")

        if source_image_path and source_image_path.exists():
            return read_visium(sample_dir, source_image_path=source_image_path)
        return read_visium(sample_dir)


@lru_cache(maxsize=1)
def get_downloader() -> DatasetDownloader:
    """Get the singleton downloader instance (built on first call, then cached)."""
    return DatasetDownloader(registry=get_registry())


def download(name: str, path: Path | str | None = None, **kwargs: Any) -> Any:
    """Download a dataset by name.

    Parameters
    ----------
    name
        Dataset name.
    path
        Optional custom path.
    **kwargs
        Additional arguments passed to the loader.

    Returns
    -------
    Loaded dataset.
    """
    return get_downloader().download(name, path, **kwargs)
+ """ + return get_downloader().download(name, path, **kwargs) diff --git a/src/squidpy/datasets/_image.py b/src/squidpy/datasets/_image.py deleted file mode 100644 index 67549b72..00000000 --- a/src/squidpy/datasets/_image.py +++ /dev/null @@ -1,50 +0,0 @@ -from __future__ import annotations - -from copy import copy -from typing import Any, Protocol - -from squidpy.datasets._utils import ImgMetadata, PathLike -from squidpy.im._container import ImageContainer - - -class ImageDataset(Protocol): - def __call__(self, path: PathLike | None = ..., **kwargs: Any) -> ImageContainer: ... - - -_vfic = ImgMetadata( - name="visium_fluo_image_crop", - doc_header="Cropped Fluorescent image from `10x Genomics Visium dataset " - "`__.", - shape=(7272, 7272), - library_id="V1_Adult_Mouse_Brain_Coronal_Section_2", - url="https://ndownloader.figshare.com/files/26098364", -) -_vhic = ImgMetadata( - name="visium_hne_image_crop", - doc_header="Cropped H&E image from `10x Genomics Visium dataset " - "`__.", - shape=(3527, 3527), - library_id="V1_Adult_Mouse_Brain", - url="https://ndownloader.figshare.com/files/26098328", -) -_vhn = ImgMetadata( - name="visium_hne_image", - doc_header="H&E image from `10x Genomics Visium dataset " - "`__.", - shape=(11757, 11291), - library_id="V1_Adult_Mouse_Brain", - url="https://ndownloader.figshare.com/files/26098124", -) - - -for name, var in copy(locals()).items(): - if isinstance(var, ImgMetadata): - var._create_function(name, glob_ns=globals()) - - -__all__ = [ # noqa: F822 - "visium_fluo_image_crop", - "visium_hne_image_crop", - "visium_hne_image", -] diff --git a/src/squidpy/datasets/_registry.py b/src/squidpy/datasets/_registry.py new file mode 100644 index 00000000..da5cdb67 --- /dev/null +++ b/src/squidpy/datasets/_registry.py @@ -0,0 +1,198 @@ +"""Unified dataset registry loaded from YAML configuration.""" + +from __future__ import annotations + +import importlib.resources +from dataclasses import dataclass, field +from enum import Enum 
def _get_config_traversable() -> Traversable:
    """Get the file-like object to datasets.yaml using importlib.resources for robustness."""
    # Using importlib.resources for robust path resolution across different installation methods
    # (editable installs, zip imports, etc.)
    return importlib.resources.files("squidpy.datasets").joinpath("datasets.yaml")


class DatasetType(Enum):
    """Types of datasets."""

    ANNDATA = "anndata"
    IMAGE = "image"
    SPATIALDATA = "spatialdata"
    VISIUM_10X = "visium_10x"


@dataclass(frozen=True)
class FileEntry:
    """Metadata for a single file within a dataset."""

    name: str
    s3_key: str
    sha256: str | None = None

    def get_urls(self, s3_base_url: str) -> list[str]:
        """Return the list of URLs to try for this file.

        Contains the single S3 URL ``<s3_base_url>/<s3_key>``, or nothing
        when either the base URL or the key is empty.
        """
        if not (s3_base_url and self.s3_key):
            return []
        # Strip trailing slashes so the join never produces "//".
        return [f"{s3_base_url.rstrip('/')}/{self.s3_key}"]


@dataclass
class DatasetEntry:
    """Metadata for a dataset (can have one or multiple files)."""

    name: str
    type: DatasetType
    files: list[FileEntry]
    shape: tuple[int, ...] | None = None
    doc_header: str | None = None
    library_id: str | None = None

    def get_file(self, name: str) -> FileEntry | None:
        """Get a specific file by its exact name, or None if absent."""
        return next((f for f in self.files if f.name == name), None)

    def get_file_by_suffix(self, suffix: str) -> FileEntry | None:
        """Get the first file whose name ends with ``suffix`` (e.g., '.h5ad')."""
        return next((f for f in self.files if f.name.endswith(suffix)), None)

    def get_file_by_name_prefix(self, prefix: str) -> FileEntry | None:
        """Get the first file whose name starts with ``prefix`` (e.g., 'image.' to find image.tif or image.jpg)."""
        return next((f for f in self.files if f.name.startswith(prefix)), None)


@dataclass
class DatasetRegistry:
    """Central registry for all squidpy datasets."""

    s3_base_url: str = ""
    datasets: dict[str, DatasetEntry] = field(default_factory=dict)

    @classmethod
    def from_yaml(cls, config_path: PathLike | None = None) -> DatasetRegistry:
        """Load registry from YAML configuration file.

        Parameters
        ----------
        config_path
            Path to a YAML file. When None, the ``datasets.yaml`` bundled with
            the package is used; an explicit path is meant for testing only.

        Returns
        -------
        The populated registry. An empty document, or null ``datasets`` /
        ``files`` values, yield empty collections instead of an error.
        """
        # This case should be always true
        # only for testing and tinkering config_path should be provided
        if config_path is None:
            with _get_config_traversable().open(encoding="utf-8") as f:
                config = yaml.safe_load(f)
        else:
            with open(config_path, encoding="utf-8") as f:
                config = yaml.safe_load(f)

        # ``safe_load`` returns None for an empty document.
        config = config or {}

        registry = cls(s3_base_url=config.get("s3_base_url") or "")

        # Load all datasets
        for name, data in (config.get("datasets") or {}).items():
            files = [
                FileEntry(
                    name=file_data["name"],
                    s3_key=file_data["s3_key"],
                    sha256=file_data.get("sha256"),
                )
                for file_data in data.get("files") or []
            ]

            # YAML sequences arrive as lists; store them as tuples so they
            # compare equal to array ``.shape`` values. Other values pass through.
            shape = data.get("shape")
            if isinstance(shape, list):
                shape = tuple(shape)

            registry.datasets[name] = DatasetEntry(
                name=name,
                type=DatasetType(data["type"]),
                files=files,
                shape=shape,
                doc_header=data.get("doc_header"),
                library_id=data.get("library_id"),
            )

        return registry

    def get(self, name: str) -> DatasetEntry | None:
        """Get a dataset by name, or None if it is not registered."""
        return self.datasets.get(name)

    def __getitem__(self, name: str) -> DatasetEntry:
        """Get a dataset by name, raises KeyError if not found."""
        if name not in self.datasets:
            raise KeyError(f"Unknown dataset: {name}. Available: {list(self.datasets.keys())}")
        return self.datasets[name]

    def __contains__(self, name: str) -> bool:
        """Check if dataset exists."""
        return name in self.datasets

    def iter_by_type(self, dataset_type: DatasetType) -> Iterator[DatasetEntry]:
        """Iterate over datasets of a specific type."""
        for entry in self.datasets.values():
            if entry.type == dataset_type:
                yield entry

    def _names_of_type(self, dataset_type: DatasetType) -> list[str]:
        """Return the registry keys of all datasets of ``dataset_type``, in insertion order."""
        return [name for name, entry in self.datasets.items() if entry.type == dataset_type]

    @property
    def anndata_datasets(self) -> list[str]:
        """Return names of all AnnData datasets."""
        return self._names_of_type(DatasetType.ANNDATA)

    @property
    def image_datasets(self) -> list[str]:
        """Return names of all image datasets."""
        return self._names_of_type(DatasetType.IMAGE)

    @property
    def spatialdata_datasets(self) -> list[str]:
        """Return names of all SpatialData datasets."""
        return self._names_of_type(DatasetType.SPATIALDATA)

    @property
    def visium_10x_datasets(self) -> list[str]:
        """Return names of all 10x Genomics Visium datasets."""
        return self._names_of_type(DatasetType.VISIUM_10X)

    @property
    def visium_datasets(self) -> list[str]:
        """Return names of all Visium datasets (alias for visium_10x_datasets)."""
        return self.visium_10x_datasets

    @property
    def all_names(self) -> list[str]:
        """Return all dataset names."""
        return list(self.datasets.keys())


@lru_cache(maxsize=1)
def get_registry() -> DatasetRegistry:
    """Get the singleton dataset registry instance.

    Uses lru_cache to ensure a single instance without mutable global state.
    """
    return DatasetRegistry.from_yaml()
+ + Uses lru_cache to ensure a single instance without mutable global state. + """ + return DatasetRegistry.from_yaml() diff --git a/src/squidpy/datasets/_utils.py b/src/squidpy/datasets/_utils.py deleted file mode 100644 index a0d0188e..00000000 --- a/src/squidpy/datasets/_utils.py +++ /dev/null @@ -1,235 +0,0 @@ -from __future__ import annotations - -import os -import shutil -from abc import ABC, abstractmethod -from collections.abc import Callable, Sequence -from dataclasses import dataclass, field -from inspect import Parameter, Signature, signature -from pathlib import Path -from typing import Any, TypeAlias - -import anndata -import pooch -import spatialdata as sd -from anndata import AnnData -from scanpy import logging as logg -from scanpy import read - -from squidpy.im import ImageContainer - -PathLike: TypeAlias = os.PathLike[str] | str -Function_t: TypeAlias = Callable[..., AnnData | Any] -DEFAULT_CACHE_DIR = Path.home() / ".cache" / "squidpy" - - -@dataclass(frozen=True) -class Metadata(ABC): - """Base class handling metadata.""" - - name: str - url: str - - doc_header: str | None = field(default=None, repr=False) - path: PathLike | None = field(default=None, repr=False) - shape: tuple[int, int] | None = field(default=None, repr=False) - library_id: str | Sequence[str] | None = field(default=None, repr=False) - - _DOC_FMT = "" - - def __post_init__(self) -> None: - if self.doc_header is None: - object.__setattr__(self, "doc_header", f"Download `{self.name.title().replace('_', ' ')}` data.") - if self.path is None: - object.__setattr__(self, "path", os.path.expanduser(f"~/.cache/squidpy/{self.name}")) - - @property - @abstractmethod - def _extension(self) -> str: - pass - - @abstractmethod - def _download(self, fpath: PathLike, backup_url: str, **kwargs: Any) -> Any: - pass - - @abstractmethod - def _create_signature(self) -> Signature: - pass - - def _create_function(self, name: str, glob_ns: dict[str, Any]) -> None: - if name in globals(): - raise 
KeyError(f"Function name `{name}` is already present in `{sorted(globals().keys())}`.") - - sig = self._create_signature() - globals()["NoneType"] = type(None) # __post_init__ return annotation - globals()[name] = self - - exec( - f"def {self.name}{sig}:\n" - f' """' - f" {self._DOC_FMT.format(doc_header=self.doc_header, shape=self.shape)}" - f' """\n' - f" return {name}.download(path, **kwargs)".replace(" /,", ""), - globals(), - glob_ns, - ) - - def download(self, fpath: PathLike | None = None, **kwargs: Any) -> Any: - """Download the dataset into ``fpath``.""" - fpath = str(self.path if fpath is None else fpath) - if not fpath.endswith(self._extension): - fpath += self._extension - - if os.path.isfile(fpath): - logg.debug(f"Loading dataset `{self.name}` from `{fpath}`") - else: - logg.debug(f"Downloading dataset `{self.name}` from `{self.url}` as `{fpath}`") - - dirname = Path(fpath).parent - try: - if not dirname.is_dir(): - logg.info(f"Creating directory `{dirname}`") - dirname.mkdir(parents=True, exist_ok=True) - except OSError as e: - logg.error(f"Unable to create directory `{dirname}`. Reason `{e}`") - - data = self._download(fpath=fpath, backup_url=self.url, **kwargs) - - if self.shape is not None and data.shape != self.shape: - raise ValueError(f"Expected the data to have shape `{self.shape}`, found `{data.shape}`.") - - return data - - -class AMetadata(Metadata): - """Metadata class for :class:`anndata.AnnData`.""" - - _DOC_FMT = """ - {doc_header} - - The shape of this :class:`anndata.AnnData` object ``{shape}``. - - Parameters - ---------- - path - Path where to save the dataset. - kwargs - Keyword arguments for :func:`scanpy.read`. 
- - Returns - ------- - The dataset.""" - - def _create_signature(self) -> Signature: - return signature(lambda _: _).replace( - parameters=[ - Parameter("path", kind=Parameter.POSITIONAL_OR_KEYWORD, annotation=PathLike, default=None), - Parameter("kwargs", kind=Parameter.VAR_KEYWORD, annotation=Any), - ], - return_annotation=anndata.AnnData, - ) - - def _download(self, fpath: PathLike, backup_url: str, **kwargs: Any) -> AnnData: - kwargs.setdefault("sparse", True) - kwargs.setdefault("cache", True) - - return read(filename=fpath, backup_url=backup_url, **kwargs) - - @property - def _extension(self) -> str: - return ".h5ad" - - -class ImgMetadata(Metadata): - """Metadata class for :class:`squidpy.im.ImageContainer`.""" - - _DOC_FMT = """ - {doc_header} - - The shape of this image is ``{shape}``. - - Parameters - ---------- - path - Path where to save the .tiff image. - kwargs - Keyword arguments for :meth:`squidpy.im.ImageContainer.add_img`. - - Returns - ------- - :class:`squidpy.im.ImageContainer` The image data.""" - # not the perfect annotation, but better than nothing - _EXT = ".tiff" - - def _create_signature(self) -> Signature: - return signature(lambda _: _).replace( - parameters=[ - Parameter("path", kind=Parameter.POSITIONAL_OR_KEYWORD, annotation=PathLike, default=None), - Parameter("kwargs", kind=Parameter.VAR_KEYWORD, annotation=Any), - ], - ) - - def _download(self, fpath: PathLike, backup_url: str, **kwargs: Any) -> Any: - download_file(filename=Path(fpath), backup_url=backup_url) - - img = ImageContainer() - img.add_img(fpath, layer="image", library_id=self.library_id, **kwargs) - - return img - - @property - def _extension(self) -> str: - return ".tiff" - - -def _get_zipped_dataset(folderpath: Path, dataset_name: str, figshare_id: str) -> sd.SpatialData: - """Returns a specific dataset as SpatialData object. 
If the file is not present on disk, it will be downloaded and extracted.""" - - # Create directory if it doesn't exist - if not folderpath.exists(): - logg.info(f"Creating directory `{folderpath}`") - folderpath.mkdir(parents=True, exist_ok=True) - elif not folderpath.is_dir(): - raise ValueError(f"Expected a directory path for `folderpath`, found: {folderpath}") - - download_zip = folderpath / f"{dataset_name}.zip" - extracted_path = folderpath / f"{dataset_name}.zarr" - - # Return early if data is already extracted - if extracted_path.exists(): - logg.info(f"Loading existing dataset from {extracted_path}") - return sd.read_zarr(extracted_path) - - # Download if necessary - if not download_zip.exists(): - logg.info(f"Downloading Visium H&E SpatialData to {download_zip}") - try: - download_file(filename=download_zip, backup_url=f"https://ndownloader.figshare.com/files/{figshare_id}") - except Exception as e: - raise RuntimeError(f"Failed to download dataset: {e}") from e - - # Extract if necessary - if not extracted_path.exists(): - logg.info(f"Extracting dataset from {download_zip} to {extracted_path}") - try: - shutil.unpack_archive(str(download_zip), folderpath) - except Exception as e: - raise RuntimeError(f"Failed to extract dataset: {e}") from e - - if not extracted_path.exists(): - raise RuntimeError(f"Expected extracted data at {extracted_path}, but not found") - - return sd.read_zarr(extracted_path) - - -def download_file(filename: PathLike, backup_url: str) -> None: - """ - Replacement for scanpy._utils.check_presence_download using Pooch. - Saves to the exact local path specified in 'filename'. 
- """ - pooch.retrieve( - url=backup_url, - known_hash=None, - fname=os.path.basename(filename), - path=os.path.dirname(filename) or ".", # Handles current dir if no folder - ) diff --git a/src/squidpy/datasets/datasets.yaml b/src/squidpy/datasets/datasets.yaml new file mode 100644 index 00000000..18de8f7e --- /dev/null +++ b/src/squidpy/datasets/datasets.yaml @@ -0,0 +1,648 @@ +# Squidpy Dataset Registry +# +# Each dataset has: +# - type: anndata (.h5ad)| image (.tiff) | spatialdata (.zip -> .zarr) | visium_10x (3 files) +# - files: list of files with individual URLs and hashes +# - metadata: shape, doc_header, library_id, etc. + +s3_base_url: https://scverse-exampledata.s3.amazonaws.com/squidpy/ + +datasets: + # =========================================================================== + # AnnData datasets (.h5ad) + # =========================================================================== + four_i: + type: anndata + shape: [270876, 43] + doc_header: "Pre-processed subset 4i dataset from `Gut et al `__." + files: + - name: four_i.h5ad + s3_key: figshare/four_i.h5ad + sha256: 894e54af155c8ce94bbeeac1056431de9cc0e86460e49cd38ca1a5f952e32124 + + imc: + type: anndata + shape: [4668, 34] + doc_header: "Pre-processed subset IMC dataset from `Jackson et al `__." + files: + - name: imc.h5ad + s3_key: figshare/imc.h5ad + sha256: 950c44c785ea86c4262140b0229e0b4f77110a765c3b6874cdb5e0e52973c6fe + + seqfish: + type: anndata + shape: [19416, 351] + doc_header: "Pre-processed subset seqFISH dataset from `Lohoff et al `__." + files: + - name: seqfish.h5ad + s3_key: figshare/seqfish.h5ad + sha256: 7e544c0ede7538067537da69c52748ad01522ef7fc8691e077fd73c9434019f7 + + visium_hne_adata: + type: anndata + shape: [2688, 18078] + doc_header: "Pre-processed `10x Genomics Visium H&E dataset `__." 
+ files: + - name: visium_hne_adata.h5ad + s3_key: figshare/visium_hne_adata.h5ad + sha256: 3571e6b84dd32c73ece8ae0bebbd4758e77c6fdd321b2bbe0d4ffa8ed56e1cbe + + visium_fluo_adata: + type: anndata + shape: [2800, 16562] + doc_header: "Pre-processed `10x Genomics Visium Fluorescent dataset `__." + files: + - name: visium_fluo_adata.h5ad + s3_key: figshare/visium_fluo_adata.h5ad + sha256: 8f92e2a03a669f939a80ba7299678377d43847983f9e0f9846c4462f7c02bdd6 + + visium_hne_adata_crop: + type: anndata + shape: [684, 18078] + doc_header: "Pre-processed subset `10x Genomics Visium H&E dataset `__." + files: + - name: visium_hne_adata_crop.h5ad + s3_key: figshare/visium_hne_adata_crop.h5ad + sha256: 9c9b277bde9f34a022df7f3e35b35ce7ecc80f006d6640b0786f4ace6f6eb5dd + + visium_fluo_adata_crop: + type: anndata + shape: [704, 16562] + doc_header: "Pre-processed subset `10x Genomics Visium Fluorescent dataset `__." + files: + - name: visium_fluo_adata_crop.h5ad + s3_key: figshare/visium_fluo_adata_crop.h5ad + sha256: ea8776c281a364e8c30a8de55a17479aa6d3c9eb90fe756c55f02ffd79e053ec + + sc_mouse_cortex: + type: anndata + shape: [21697, 36826] + doc_header: "Pre-processed `scRNA-seq mouse cortex `__." + files: + - name: sc_mouse_cortex.h5ad + s3_key: figshare/sc_mouse_cortex.h5ad + sha256: 3e0a26e1af06c1ea8f53a808ee683bf950de8cc03ee48bd291f95eeca6056aac + + mibitof: + type: anndata + shape: [3309, 36] + doc_header: "Pre-processed MIBI-TOF dataset from `Hartmann et al `__." + files: + - name: mibitof.h5ad + s3_key: figshare/mibitof.h5ad + sha256: 3f125c51695d78ed1c36d5485dc390ab400154d021f0c7715b89f8ee83978690 + + merfish: + type: anndata + shape: [73655, 161] + doc_header: "Pre-processed MERFISH dataset from `Moffitt et al `__." 
+ files: + - name: merfish.h5ad + s3_key: figshare/merfish.h5ad + sha256: 371723d48413ba76aba49ccf7ea24867b1db940529216fe2902484f5c2a48904 + + slideseqv2: + type: anndata + shape: [41786, 4000] + doc_header: "Pre-processed SlideseqV2 dataset from `Stickles et al `__." + files: + - name: slideseqv2.h5ad + s3_key: figshare/slideseqv2.h5ad + sha256: 224a0f2b3d4f6c0ac1583c93bb6bfa910a986df82302c4a927d61883b8b63d8b + + # =========================================================================== + # Image datasets (.tiff) + # =========================================================================== + visium_fluo_image_crop: + type: image + shape: [7272, 7272] + library_id: V1_Adult_Mouse_Brain_Coronal_Section_2 + doc_header: Cropped Fluorescent image from 10x Genomics Visium dataset. + files: + - name: visium_fluo_image_crop.tiff + s3_key: figshare/visium_fluo_image_crop.tiff + sha256: 2929fdd06e32fa25b38493e67f301fc5b22b1a32bfbe48ab7237d8d85fe8982d + + visium_hne_image_crop: + type: image + shape: [3527, 3527] + library_id: V1_Adult_Mouse_Brain + doc_header: Cropped H&E image from 10x Genomics Visium dataset. + files: + - name: visium_hne_image_crop.tiff + s3_key: figshare/visium_hne_image_crop.tiff + sha256: 56d379d96da859ea963c4349bbc8de07da9b68ce133839ebef5fe1b033c9e7bb + + visium_hne_image: + type: image + shape: [11757, 11291] + library_id: V1_Adult_Mouse_Brain + doc_header: H&E image from 10x Genomics Visium dataset. + files: + - name: visium_hne_image.tiff + s3_key: figshare/visium_hne_image.tiff + sha256: 39d0a85a7cecb0bde9ad2566260d571bb49834d26fc443cb32b96475f30668b2 + + # =========================================================================== + # SpatialData datasets (.zip -> .zarr) + # =========================================================================== + visium_hne_sdata: + type: spatialdata + doc_header: Visium H&E dataset as SpatialData object. 
+ files: + - name: visium_hne_sdata.zip + s3_key: figshare/visium_hne_sdata.zip + sha256: 6f88b1624d072a362cb2b40a12f86b7649d3d2f2cc762dd6be23a078ac3093b6 + + # =========================================================================== + # 10x Genomics Visium datasets (3 files each) + # =========================================================================== + V1_Breast_Cancer_Block_A_Section_1: + type: visium_10x + doc_header: 10x Genomics Visium dataset V1_Breast_Cancer_Block_A_Section_1. + files: + - name: filtered_feature_bc_matrix.h5 + s3_key: 10x_genomics/V1_Breast_Cancer_Block_A_Section_1/V1_Breast_Cancer_Block_A_Section_1_filtered_feature_bc_matrix.h5 + sha256: b3cbb6ead60afad848ebcc07a34835613d9d2d79e416ad8300b210d9de584b65 + - name: spatial.tar.gz + s3_key: 10x_genomics/V1_Breast_Cancer_Block_A_Section_1/V1_Breast_Cancer_Block_A_Section_1_spatial.tar.gz + sha256: 415530a9d31172f63fd358d5d1cefb0798698ef9c0e614f52825894cd45c61bf + - name: image.tif + s3_key: 10x_genomics/V1_Breast_Cancer_Block_A_Section_1/V1_Breast_Cancer_Block_A_Section_1_image.tif + sha256: 73a94ca956d1b5c027dcaf1ac42c15fed4d4a12eefee4b4b597e62f34cc5959e + + V1_Breast_Cancer_Block_A_Section_2: + type: visium_10x + doc_header: 10x Genomics Visium dataset V1_Breast_Cancer_Block_A_Section_2. 
+ files: + - name: filtered_feature_bc_matrix.h5 + s3_key: 10x_genomics/V1_Breast_Cancer_Block_A_Section_2/V1_Breast_Cancer_Block_A_Section_2_filtered_feature_bc_matrix.h5 + sha256: e325b4e53f0036a71904cea6b7cd65945f10c25cf2ef8e2f681de9ac1ff7766c + - name: spatial.tar.gz + s3_key: 10x_genomics/V1_Breast_Cancer_Block_A_Section_2/V1_Breast_Cancer_Block_A_Section_2_spatial.tar.gz + sha256: 15fcf837478bcba847b50e393b016f95f02067ea930e81f96b103f2a14bf6db0 + - name: image.tif + s3_key: 10x_genomics/V1_Breast_Cancer_Block_A_Section_2/V1_Breast_Cancer_Block_A_Section_2_image.tif + sha256: 1ad96042cba8e05b22566ef5b8197714e06d4a5e98d040e926c8591ffdb48cc7 + + V1_Human_Heart: + type: visium_10x + doc_header: 10x Genomics Visium dataset V1_Human_Heart. + files: + - name: filtered_feature_bc_matrix.h5 + s3_key: 10x_genomics/V1_Human_Heart/V1_Human_Heart_filtered_feature_bc_matrix.h5 + sha256: a7274ea2ff2717693b200e614c288c174f02925b6d82c98221871a08e52d415b + - name: spatial.tar.gz + s3_key: 10x_genomics/V1_Human_Heart/V1_Human_Heart_spatial.tar.gz + sha256: f2033b6ec4c13dcaf75386d4de3e86cb3357e15fe0eaa2be7037c3d3f0194188 + - name: image.tif + s3_key: 10x_genomics/V1_Human_Heart/V1_Human_Heart_image.tif + sha256: ff6fc5169a4ab3af5a0799ee1333e2fa0e6df242bc6bc5162e6ea51e6826fa5f + + V1_Human_Lymph_Node: + type: visium_10x + doc_header: 10x Genomics Visium dataset V1_Human_Lymph_Node. 
+ files: + - name: filtered_feature_bc_matrix.h5 + s3_key: 10x_genomics/V1_Human_Lymph_Node/V1_Human_Lymph_Node_filtered_feature_bc_matrix.h5 + sha256: 86fd533eb907450e7125b9820183a0ca73776eeafcc5eddae5695b6aabfd9139 + - name: spatial.tar.gz + s3_key: 10x_genomics/V1_Human_Lymph_Node/V1_Human_Lymph_Node_spatial.tar.gz + sha256: 812808883366ff9623dc8354847a7211b0d922b2bfc4c9359d6e12e993ea6a73 + - name: image.tif + s3_key: 10x_genomics/V1_Human_Lymph_Node/V1_Human_Lymph_Node_image.tif + sha256: 19462aa8f74ed6032738251e2f56931be9047de58bab6ca0373af385aae2ff03 + + V1_Mouse_Kidney: + type: visium_10x + doc_header: 10x Genomics Visium dataset V1_Mouse_Kidney. + files: + - name: filtered_feature_bc_matrix.h5 + s3_key: 10x_genomics/V1_Mouse_Kidney/V1_Mouse_Kidney_filtered_feature_bc_matrix.h5 + sha256: 5e0b1d1c51c4e8759cd623d212573e1c28daf95d66e0d25a8e4488f6bed3831a + - name: spatial.tar.gz + s3_key: 10x_genomics/V1_Mouse_Kidney/V1_Mouse_Kidney_spatial.tar.gz + sha256: 91570548eae3d2bcf738af45e9dc463547a01669841db43ff20afb41b7cc0539 + - name: image.tif + s3_key: 10x_genomics/V1_Mouse_Kidney/V1_Mouse_Kidney_image.tif + sha256: be58cc8b6aec69635ecc813fa9de559fd124a5c265999aff198d76d75f3c6f6a + + V1_Adult_Mouse_Brain: + type: visium_10x + doc_header: 10x Genomics Visium dataset V1_Adult_Mouse_Brain. 
+ files: + - name: filtered_feature_bc_matrix.h5 + s3_key: 10x_genomics/V1_Adult_Mouse_Brain/V1_Adult_Mouse_Brain_filtered_feature_bc_matrix.h5 + sha256: eb78379e02dcf48036abf05b67233e73ecb0d880787feb82f76ff16f6ce01eb3 + - name: spatial.tar.gz + s3_key: 10x_genomics/V1_Adult_Mouse_Brain/V1_Adult_Mouse_Brain_spatial.tar.gz + sha256: 46d6b05ba740f232d6bf4b27b9a8846815851e000985fb878f1364bab04e5bd4 + - name: image.tif + s3_key: 10x_genomics/V1_Adult_Mouse_Brain/V1_Adult_Mouse_Brain_image.tif + sha256: 39d0a85a7cecb0bde9ad2566260d571bb49834d26fc443cb32b96475f30668b2 + + V1_Mouse_Brain_Sagittal_Posterior: + type: visium_10x + doc_header: 10x Genomics Visium dataset V1_Mouse_Brain_Sagittal_Posterior. + files: + - name: filtered_feature_bc_matrix.h5 + s3_key: 10x_genomics/V1_Mouse_Brain_Sagittal_Posterior/V1_Mouse_Brain_Sagittal_Posterior_filtered_feature_bc_matrix.h5 + sha256: df962209143860488bd679ff7a1b3bb14985208d0929fb814934ba7571c488b6 + - name: spatial.tar.gz + s3_key: 10x_genomics/V1_Mouse_Brain_Sagittal_Posterior/V1_Mouse_Brain_Sagittal_Posterior_spatial.tar.gz + sha256: b878d7eaca487ffc17224a870f86599abf9d45a99033c1ef65661ff119cef0d5 + - name: image.tif + s3_key: 10x_genomics/V1_Mouse_Brain_Sagittal_Posterior/V1_Mouse_Brain_Sagittal_Posterior_image.tif + sha256: 18558cd21345f6d590af4f519859a3710b400508ed31ae1e0b1d66b561de5e5f + + V1_Mouse_Brain_Sagittal_Posterior_Section_2: + type: visium_10x + doc_header: 10x Genomics Visium dataset V1_Mouse_Brain_Sagittal_Posterior_Section_2. 
+ files: + - name: filtered_feature_bc_matrix.h5 + s3_key: 10x_genomics/V1_Mouse_Brain_Sagittal_Posterior_Section_2/V1_Mouse_Brain_Sagittal_Posterior_Section_2_filtered_feature_bc_matrix.h5 + sha256: 7e4dbea6a2037b3478719d121eec33a56b96d0c904daade9c8cbeb428f415bc4 + - name: spatial.tar.gz + s3_key: 10x_genomics/V1_Mouse_Brain_Sagittal_Posterior_Section_2/V1_Mouse_Brain_Sagittal_Posterior_Section_2_spatial.tar.gz + sha256: e8be08bf867387a6209c4f37ede4266d1305dfb038c77411b213a28fc2d79107 + - name: image.tif + s3_key: 10x_genomics/V1_Mouse_Brain_Sagittal_Posterior_Section_2/V1_Mouse_Brain_Sagittal_Posterior_Section_2_image.tif + sha256: ccc41944ae7586b1468141f840897c6096e1ce9506e6e0774b9ef3d89054b12c + + V1_Mouse_Brain_Sagittal_Anterior: + type: visium_10x + doc_header: 10x Genomics Visium dataset V1_Mouse_Brain_Sagittal_Anterior. + files: + - name: filtered_feature_bc_matrix.h5 + s3_key: 10x_genomics/V1_Mouse_Brain_Sagittal_Anterior/V1_Mouse_Brain_Sagittal_Anterior_filtered_feature_bc_matrix.h5 + sha256: 56078d8d6fe6c13de248fdb1c518b691cdef78fb00021b659786b4a47c6656d5 + - name: spatial.tar.gz + s3_key: 10x_genomics/V1_Mouse_Brain_Sagittal_Anterior/V1_Mouse_Brain_Sagittal_Anterior_spatial.tar.gz + sha256: 5f41a803e2bd69fa4dfca6abc8fa2d4e0d76aeb6c72d7038a5fdcf9cc50a36f8 + - name: image.tif + s3_key: 10x_genomics/V1_Mouse_Brain_Sagittal_Anterior/V1_Mouse_Brain_Sagittal_Anterior_image.tif + sha256: 5837c44e4e1d0f77b3ade2350a4f2159922f4c388e882f1c38b5683ff7bd086d + + V1_Mouse_Brain_Sagittal_Anterior_Section_2: + type: visium_10x + doc_header: 10x Genomics Visium dataset V1_Mouse_Brain_Sagittal_Anterior_Section_2. 
+ files: + - name: filtered_feature_bc_matrix.h5 + s3_key: 10x_genomics/V1_Mouse_Brain_Sagittal_Anterior_Section_2/V1_Mouse_Brain_Sagittal_Anterior_Section_2_filtered_feature_bc_matrix.h5 + sha256: 6ffcca5979b722f7f112202a9a8273cdad60a2b40ba4f651d465fbba577b5a5c + - name: spatial.tar.gz + s3_key: 10x_genomics/V1_Mouse_Brain_Sagittal_Anterior_Section_2/V1_Mouse_Brain_Sagittal_Anterior_Section_2_spatial.tar.gz + sha256: 0fa49dd44835e0a77c4f24653abe6092c0a164c594696540ee3d9e2eee2e34a2 + - name: image.tif + s3_key: 10x_genomics/V1_Mouse_Brain_Sagittal_Anterior_Section_2/V1_Mouse_Brain_Sagittal_Anterior_Section_2_image.tif + sha256: f6573c85a2e56fd9f70e6ce59134ed03a14140a074e2f85161f8b4778dad5bfb + + V1_Human_Brain_Section_1: + type: visium_10x + doc_header: 10x Genomics Visium dataset V1_Human_Brain_Section_1. + files: + - name: filtered_feature_bc_matrix.h5 + s3_key: 10x_genomics/V1_Human_Brain_Section_1/V1_Human_Brain_Section_1_filtered_feature_bc_matrix.h5 + sha256: 076e5d1fa82243349ee963e069b362c81198e8c4f752ad9680dfb0714d0c240f + - name: spatial.tar.gz + s3_key: 10x_genomics/V1_Human_Brain_Section_1/V1_Human_Brain_Section_1_spatial.tar.gz + sha256: 3979bcbeb13a61d5e1a2ffb066bd76b4c121a1cd1293c66da7578cd1fbb1d247 + - name: image.tif + s3_key: 10x_genomics/V1_Human_Brain_Section_1/V1_Human_Brain_Section_1_image.tif + sha256: 68ac2d8b72514248eeb7666a7409c286d1e7d4af30b700baed36ebe85dcf2961 + + V1_Human_Brain_Section_2: + type: visium_10x + doc_header: 10x Genomics Visium dataset V1_Human_Brain_Section_2. 
+ files: + - name: filtered_feature_bc_matrix.h5 + s3_key: 10x_genomics/V1_Human_Brain_Section_2/V1_Human_Brain_Section_2_filtered_feature_bc_matrix.h5 + sha256: 1ee773e152a1befb9a4fce1b4883f62790e717546d5db092e209c6ac4c1f9d2d + - name: spatial.tar.gz + s3_key: 10x_genomics/V1_Human_Brain_Section_2/V1_Human_Brain_Section_2_spatial.tar.gz + sha256: 74b40c7f38d69b086d47739a565a8b17e558c2c5cf2c83c17c999e9ac7c25f83 + - name: image.tif + s3_key: 10x_genomics/V1_Human_Brain_Section_2/V1_Human_Brain_Section_2_image.tif + sha256: 87881ff452472bc326c6267184a43ad5c53fb4caa39637679a59143228266ef1 + + V1_Adult_Mouse_Brain_Coronal_Section_1: + type: visium_10x + doc_header: 10x Genomics Visium dataset V1_Adult_Mouse_Brain_Coronal_Section_1. + files: + - name: filtered_feature_bc_matrix.h5 + s3_key: 10x_genomics/V1_Adult_Mouse_Brain_Coronal_Section_1/V1_Adult_Mouse_Brain_Coronal_Section_1_filtered_feature_bc_matrix.h5 + sha256: 005cecf7aed6704c8a29f446e6cb092a9db960b39cf312720ff0238ef94cca81 + - name: spatial.tar.gz + s3_key: 10x_genomics/V1_Adult_Mouse_Brain_Coronal_Section_1/V1_Adult_Mouse_Brain_Coronal_Section_1_spatial.tar.gz + sha256: 96c5d31c3c8e9639e5545dcb4494580cbe8f7e0b63ca92476c800d332796e0f2 + - name: image.tif + s3_key: 10x_genomics/V1_Adult_Mouse_Brain_Coronal_Section_1/V1_Adult_Mouse_Brain_Coronal_Section_1_image.tif + sha256: 9ce5353f60c6a19657cdd43db392d0cb9cb253891daa8751c00bd2918af892e3 + + V1_Adult_Mouse_Brain_Coronal_Section_2: + type: visium_10x + doc_header: 10x Genomics Visium dataset V1_Adult_Mouse_Brain_Coronal_Section_2. 
+ files: + - name: filtered_feature_bc_matrix.h5 + s3_key: 10x_genomics/V1_Adult_Mouse_Brain_Coronal_Section_2/V1_Adult_Mouse_Brain_Coronal_Section_2_filtered_feature_bc_matrix.h5 + sha256: 140366bde796174172d7c91b9c03107dfee3912c7cc060df0a47c030b6c72caf + - name: spatial.tar.gz + s3_key: 10x_genomics/V1_Adult_Mouse_Brain_Coronal_Section_2/V1_Adult_Mouse_Brain_Coronal_Section_2_spatial.tar.gz + sha256: 2b1e930d55fe8dff239d6838aa53389fe10ff011558a02ea654dad9d16737b03 + - name: image.tif + s3_key: 10x_genomics/V1_Adult_Mouse_Brain_Coronal_Section_2/V1_Adult_Mouse_Brain_Coronal_Section_2_image.tif + sha256: 4df93ac227d129250ee3ade3d0d33175bd51f0baae0bcf2841be779b3df03b4a + + # SpaceRanger 1.2.0 datasets + Targeted_Visium_Human_Cerebellum_Neuroscience: + type: visium_10x + doc_header: 10x Genomics Visium dataset Targeted_Visium_Human_Cerebellum_Neuroscience. + files: + - name: filtered_feature_bc_matrix.h5 + s3_key: 10x_genomics/Targeted_Visium_Human_Cerebellum_Neuroscience/Targeted_Visium_Human_Cerebellum_Neuroscience_filtered_feature_bc_matrix.h5 + sha256: 93f2846c73c978fda63672b56ea6ccd6805344870030882bcb784c2e5ad4a281 + - name: spatial.tar.gz + s3_key: 10x_genomics/Targeted_Visium_Human_Cerebellum_Neuroscience/Targeted_Visium_Human_Cerebellum_Neuroscience_spatial.tar.gz + sha256: 4c5c78ed12b6af7661bceabc3f819e5668148b5bb330e72d3e0d7bd2031118a8 + - name: image.tif + s3_key: 10x_genomics/Targeted_Visium_Human_Cerebellum_Neuroscience/Targeted_Visium_Human_Cerebellum_Neuroscience_image.tif + sha256: 51855c63d57e4c67f3ec0db22a051067706537c740763306334696305d686eda + + Parent_Visium_Human_Cerebellum: + type: visium_10x + doc_header: 10x Genomics Visium dataset Parent_Visium_Human_Cerebellum. 
+ files: + - name: filtered_feature_bc_matrix.h5 + s3_key: 10x_genomics/Parent_Visium_Human_Cerebellum/Parent_Visium_Human_Cerebellum_filtered_feature_bc_matrix.h5 + sha256: 05c137dd74623e748558c60a99d8e19749cbd073d070ce827aec73cee899f1d0 + - name: spatial.tar.gz + s3_key: 10x_genomics/Parent_Visium_Human_Cerebellum/Parent_Visium_Human_Cerebellum_spatial.tar.gz + sha256: 7a8a42ad53d93776b7b21b31c3727d76a8ed6c332e2f39b6b056b52ef41eeea0 + - name: image.tif + s3_key: 10x_genomics/Parent_Visium_Human_Cerebellum/Parent_Visium_Human_Cerebellum_image.tif + sha256: 51855c63d57e4c67f3ec0db22a051067706537c740763306334696305d686eda + + Targeted_Visium_Human_SpinalCord_Neuroscience: + type: visium_10x + doc_header: 10x Genomics Visium dataset Targeted_Visium_Human_SpinalCord_Neuroscience. + files: + - name: filtered_feature_bc_matrix.h5 + s3_key: 10x_genomics/Targeted_Visium_Human_SpinalCord_Neuroscience/Targeted_Visium_Human_SpinalCord_Neuroscience_filtered_feature_bc_matrix.h5 + sha256: e6e0ced51e4092361cffba849e4fba1c3e641c27a59ce9622c6582ae42b4c98d + - name: spatial.tar.gz + s3_key: 10x_genomics/Targeted_Visium_Human_SpinalCord_Neuroscience/Targeted_Visium_Human_SpinalCord_Neuroscience_spatial.tar.gz + sha256: ae94a184bc207170497a00fd95ae0bc49fda934f7e83113ca286d590c5f85e0e + - name: image.tif + s3_key: 10x_genomics/Targeted_Visium_Human_SpinalCord_Neuroscience/Targeted_Visium_Human_SpinalCord_Neuroscience_image.tif + sha256: 2b81347ba65f3f61a76a2b9d8ff6d05f34d72bc2cbf3e9895374e29f81f78019 + + Parent_Visium_Human_SpinalCord: + type: visium_10x + doc_header: 10x Genomics Visium dataset Parent_Visium_Human_SpinalCord. 
+ files: + - name: filtered_feature_bc_matrix.h5 + s3_key: 10x_genomics/Parent_Visium_Human_SpinalCord/Parent_Visium_Human_SpinalCord_filtered_feature_bc_matrix.h5 + sha256: b731d6bf09c402c9b6f43c88296a08ffcaa508079a20d70f449342c165b80037 + - name: spatial.tar.gz + s3_key: 10x_genomics/Parent_Visium_Human_SpinalCord/Parent_Visium_Human_SpinalCord_spatial.tar.gz + sha256: 0d94e3b8efdb2dbc3be5d912f79828d7ea9ddb333eb7376f31986a8b16a34aeb + - name: image.tif + s3_key: 10x_genomics/Parent_Visium_Human_SpinalCord/Parent_Visium_Human_SpinalCord_image.tif + sha256: 2b81347ba65f3f61a76a2b9d8ff6d05f34d72bc2cbf3e9895374e29f81f78019 + + Targeted_Visium_Human_Glioblastoma_Pan_Cancer: + type: visium_10x + doc_header: 10x Genomics Visium dataset Targeted_Visium_Human_Glioblastoma_Pan_Cancer. + files: + - name: filtered_feature_bc_matrix.h5 + s3_key: 10x_genomics/Targeted_Visium_Human_Glioblastoma_Pan_Cancer/Targeted_Visium_Human_Glioblastoma_Pan_Cancer_filtered_feature_bc_matrix.h5 + sha256: 2d6c3ba9dfb47235eb7fc560fbabe52ddb553a667af6d67f8ba2e1ac43d768f0 + - name: spatial.tar.gz + s3_key: 10x_genomics/Targeted_Visium_Human_Glioblastoma_Pan_Cancer/Targeted_Visium_Human_Glioblastoma_Pan_Cancer_spatial.tar.gz + sha256: c374909d78319f25fb487ed58804124370ee1312b36181fb1a04e9755bccc345 + - name: image.tif + s3_key: 10x_genomics/Targeted_Visium_Human_Glioblastoma_Pan_Cancer/Targeted_Visium_Human_Glioblastoma_Pan_Cancer_image.tif + sha256: cb789cbf1b6b738a89728b3489bbf0c405ddfc12a7d870176ebd555c36212c5c + + Parent_Visium_Human_Glioblastoma: + type: visium_10x + doc_header: 10x Genomics Visium dataset Parent_Visium_Human_Glioblastoma. 
+ files: + - name: filtered_feature_bc_matrix.h5 + s3_key: 10x_genomics/Parent_Visium_Human_Glioblastoma/Parent_Visium_Human_Glioblastoma_filtered_feature_bc_matrix.h5 + sha256: 76d6b36807972a25c856a4bfe9ef3cd4ca71eaf5c4591271b0a7e471457ffb98 + - name: spatial.tar.gz + s3_key: 10x_genomics/Parent_Visium_Human_Glioblastoma/Parent_Visium_Human_Glioblastoma_spatial.tar.gz + sha256: 7be26425284ea6bfb07ea7ad6c001cd9a30b2ffc47547858bd51d219c9fd3a69 + - name: image.tif + s3_key: 10x_genomics/Parent_Visium_Human_Glioblastoma/Parent_Visium_Human_Glioblastoma_image.tif + sha256: cb789cbf1b6b738a89728b3489bbf0c405ddfc12a7d870176ebd555c36212c5c + + Targeted_Visium_Human_BreastCancer_Immunology: + type: visium_10x + doc_header: 10x Genomics Visium dataset Targeted_Visium_Human_BreastCancer_Immunology. + files: + - name: filtered_feature_bc_matrix.h5 + s3_key: 10x_genomics/Targeted_Visium_Human_BreastCancer_Immunology/Targeted_Visium_Human_BreastCancer_Immunology_filtered_feature_bc_matrix.h5 + sha256: f19d185627d64e90f0c3b25ab9afcbb4090b7f76978f02346c7518190ad6d4e1 + - name: spatial.tar.gz + s3_key: 10x_genomics/Targeted_Visium_Human_BreastCancer_Immunology/Targeted_Visium_Human_BreastCancer_Immunology_spatial.tar.gz + sha256: 980d0bad49dfe3ad157968be3408893a9ae9deed4e789a3d0745d45484b8e78b + - name: image.tif + s3_key: 10x_genomics/Targeted_Visium_Human_BreastCancer_Immunology/Targeted_Visium_Human_BreastCancer_Immunology_image.tif + sha256: 9f9963ce937b50891fc2723fbcd45394070c6975a3e2551b0a5db0ef45ca73f2 + + Parent_Visium_Human_BreastCancer: + type: visium_10x + doc_header: 10x Genomics Visium dataset Parent_Visium_Human_BreastCancer. 
+ files: + - name: filtered_feature_bc_matrix.h5 + s3_key: 10x_genomics/Parent_Visium_Human_BreastCancer/Parent_Visium_Human_BreastCancer_filtered_feature_bc_matrix.h5 + sha256: fb38021b00795bcd05720a7d1ac451dc443366cf27cad57b0b5144f4661a9d35 + - name: spatial.tar.gz + s3_key: 10x_genomics/Parent_Visium_Human_BreastCancer/Parent_Visium_Human_BreastCancer_spatial.tar.gz + sha256: 57b97ff5e9b0be325797ffa3d1b043414ff46ca9b13248b4b5d610d8e6806ade + - name: image.tif + s3_key: 10x_genomics/Parent_Visium_Human_BreastCancer/Parent_Visium_Human_BreastCancer_image.tif + sha256: 9f9963ce937b50891fc2723fbcd45394070c6975a3e2551b0a5db0ef45ca73f2 + + Targeted_Visium_Human_OvarianCancer_Pan_Cancer: + type: visium_10x + doc_header: 10x Genomics Visium dataset Targeted_Visium_Human_OvarianCancer_Pan_Cancer. + files: + - name: filtered_feature_bc_matrix.h5 + s3_key: 10x_genomics/Targeted_Visium_Human_OvarianCancer_Pan_Cancer/Targeted_Visium_Human_OvarianCancer_Pan_Cancer_filtered_feature_bc_matrix.h5 + sha256: f2d0fdfae06adbf3f761802a8561d05459d211b25039caa1a6e994720b015551 + - name: spatial.tar.gz + s3_key: 10x_genomics/Targeted_Visium_Human_OvarianCancer_Pan_Cancer/Targeted_Visium_Human_OvarianCancer_Pan_Cancer_spatial.tar.gz + sha256: 469a5a144eb996535b9502382a6668d19e050aeacdc8c719c31c23b338c2c4f5 + - name: image.tif + s3_key: 10x_genomics/Targeted_Visium_Human_OvarianCancer_Pan_Cancer/Targeted_Visium_Human_OvarianCancer_Pan_Cancer_image.tif + sha256: 2e36037b0850399c1fdb07c66c31db5458081febba4a1f80ac505cde6320ca79 + + Targeted_Visium_Human_OvarianCancer_Immunology: + type: visium_10x + doc_header: 10x Genomics Visium dataset Targeted_Visium_Human_OvarianCancer_Immunology. 
+ files: + - name: filtered_feature_bc_matrix.h5 + s3_key: 10x_genomics/Targeted_Visium_Human_OvarianCancer_Immunology/Targeted_Visium_Human_OvarianCancer_Immunology_filtered_feature_bc_matrix.h5 + sha256: b09c4888611195aa33cf9ec1d7b294728726376b76f41b3d3920a400f52fe89d + - name: spatial.tar.gz + s3_key: 10x_genomics/Targeted_Visium_Human_OvarianCancer_Immunology/Targeted_Visium_Human_OvarianCancer_Immunology_spatial.tar.gz + sha256: 8165bec67690db161c9ea298777759bf765637b6ea8fb23a952c26933af65158 + - name: image.tif + s3_key: 10x_genomics/Targeted_Visium_Human_OvarianCancer_Immunology/Targeted_Visium_Human_OvarianCancer_Immunology_image.tif + sha256: 2e36037b0850399c1fdb07c66c31db5458081febba4a1f80ac505cde6320ca79 + + Parent_Visium_Human_OvarianCancer: + type: visium_10x + doc_header: 10x Genomics Visium dataset Parent_Visium_Human_OvarianCancer. + files: + - name: filtered_feature_bc_matrix.h5 + s3_key: 10x_genomics/Parent_Visium_Human_OvarianCancer/Parent_Visium_Human_OvarianCancer_filtered_feature_bc_matrix.h5 + sha256: 5dd7fa0daca816d8700cf275c14c98004a37cc92a8c3b1d4c11f19ed3fedbc97 + - name: spatial.tar.gz + s3_key: 10x_genomics/Parent_Visium_Human_OvarianCancer/Parent_Visium_Human_OvarianCancer_spatial.tar.gz + sha256: 219698516af089fd2dea1b8080c4bdcf1c0c0a3eff05a63ef9f115c3e4b6ffdd + - name: image.tif + s3_key: 10x_genomics/Parent_Visium_Human_OvarianCancer/Parent_Visium_Human_OvarianCancer_image.tif + sha256: 2e36037b0850399c1fdb07c66c31db5458081febba4a1f80ac505cde6320ca79 + + Targeted_Visium_Human_ColorectalCancer_GeneSignature: + type: visium_10x + doc_header: 10x Genomics Visium dataset Targeted_Visium_Human_ColorectalCancer_GeneSignature. 
+ files: + - name: filtered_feature_bc_matrix.h5 + s3_key: 10x_genomics/Targeted_Visium_Human_ColorectalCancer_GeneSignature/Targeted_Visium_Human_ColorectalCancer_GeneSignature_filtered_feature_bc_matrix.h5 + sha256: 3b7f01724c0adf9c469db5250cee6669d354ef1742e31003c7930f5d457f2457 + - name: spatial.tar.gz + s3_key: 10x_genomics/Targeted_Visium_Human_ColorectalCancer_GeneSignature/Targeted_Visium_Human_ColorectalCancer_GeneSignature_spatial.tar.gz + sha256: 077a8f78869e722431233109a0eac89a658847a8e82e3c5973eb193247ee49da + - name: image.tif + s3_key: 10x_genomics/Targeted_Visium_Human_ColorectalCancer_GeneSignature/Targeted_Visium_Human_ColorectalCancer_GeneSignature_image.tif + sha256: 1825e4940d72b363d4676061168235ec3560b8ccb3f1259546cd66d2bfc3729a + + Parent_Visium_Human_ColorectalCancer: + type: visium_10x + doc_header: 10x Genomics Visium dataset Parent_Visium_Human_ColorectalCancer. + files: + - name: filtered_feature_bc_matrix.h5 + s3_key: 10x_genomics/Parent_Visium_Human_ColorectalCancer/Parent_Visium_Human_ColorectalCancer_filtered_feature_bc_matrix.h5 + sha256: 5826da1470ce071757675b563c9036c055f7f800e0930b635968b887546b95e0 + - name: spatial.tar.gz + s3_key: 10x_genomics/Parent_Visium_Human_ColorectalCancer/Parent_Visium_Human_ColorectalCancer_spatial.tar.gz + sha256: ebba3b8bfe212545181ef68c65b6e23eda7e3ddf9e59d02e6edc6b9d96325476 + - name: image.tif + s3_key: 10x_genomics/Parent_Visium_Human_ColorectalCancer/Parent_Visium_Human_ColorectalCancer_image.tif + sha256: 1825e4940d72b363d4676061168235ec3560b8ccb3f1259546cd66d2bfc3729a + + # SpaceRanger 1.3.0 datasets (FFPE) + Visium_FFPE_Mouse_Brain: + type: visium_10x + doc_header: 10x Genomics Visium FFPE dataset Visium_FFPE_Mouse_Brain. 
+ files: + - name: filtered_feature_bc_matrix.h5 + s3_key: 10x_genomics/Visium_FFPE_Mouse_Brain/Visium_FFPE_Mouse_Brain_filtered_feature_bc_matrix.h5 + sha256: f5a5d0fafeab6259ded1c4883b255ef57557b81f32774513594e23a49e8352ce + - name: spatial.tar.gz + s3_key: 10x_genomics/Visium_FFPE_Mouse_Brain/Visium_FFPE_Mouse_Brain_spatial.tar.gz + sha256: e4e1b845fd078946c6f8b61bd8d1927c0ce2395c3730f602cd80ef439d4a9d73 + # Note: image.tif not available for this dataset + + Visium_FFPE_Mouse_Brain_IF: + type: visium_10x + doc_header: 10x Genomics Visium FFPE dataset Visium_FFPE_Mouse_Brain_IF. + files: + - name: filtered_feature_bc_matrix.h5 + s3_key: 10x_genomics/Visium_FFPE_Mouse_Brain_IF/Visium_FFPE_Mouse_Brain_IF_filtered_feature_bc_matrix.h5 + sha256: 14e869d99cdf173bd20c900cc79a5df567ab6630ac50dfd75080afd993e80ff7 + - name: spatial.tar.gz + s3_key: 10x_genomics/Visium_FFPE_Mouse_Brain_IF/Visium_FFPE_Mouse_Brain_IF_spatial.tar.gz + sha256: 3df8b527d6f83970c0ab09872c69442fe5d2fb0a687db7918006b058012096f2 + - name: image.tif + s3_key: 10x_genomics/Visium_FFPE_Mouse_Brain_IF/Visium_FFPE_Mouse_Brain_IF_image.tif + sha256: 1dd8bcad6297c25afdaaf7d0cfd1be3fac6db6bfc21a44ee71ad31dc13ecc9cf + + Visium_FFPE_Mouse_Kidney: + type: visium_10x + doc_header: 10x Genomics Visium FFPE dataset Visium_FFPE_Mouse_Kidney. 
+ files: + - name: filtered_feature_bc_matrix.h5 + s3_key: 10x_genomics/Visium_FFPE_Mouse_Kidney/Visium_FFPE_Mouse_Kidney_filtered_feature_bc_matrix.h5 + sha256: b7c3a904971ece636c79cda1dd49d7f7f183080698516477f96182b46597f977 + - name: spatial.tar.gz + s3_key: 10x_genomics/Visium_FFPE_Mouse_Kidney/Visium_FFPE_Mouse_Kidney_spatial.tar.gz + sha256: cd55f1d7c5665d7f37ea83ab58729149e93e762f4ed69f7f9bde747ec4404c64 + - name: image.tif + s3_key: 10x_genomics/Visium_FFPE_Mouse_Kidney/Visium_FFPE_Mouse_Kidney_image.tif + sha256: 6628054c2ecf68124f319ef154bead8359424fe9ccf9f2e1191aac66de787870 + + Visium_FFPE_Human_Breast_Cancer: + type: visium_10x + doc_header: 10x Genomics Visium FFPE dataset Visium_FFPE_Human_Breast_Cancer. + files: + - name: filtered_feature_bc_matrix.h5 + s3_key: 10x_genomics/Visium_FFPE_Human_Breast_Cancer/Visium_FFPE_Human_Breast_Cancer_filtered_feature_bc_matrix.h5 + sha256: 64321f603f7200b0bedffbe353c04dde72bbe7c7be7390e503d955aa9b2584c5 + - name: spatial.tar.gz + s3_key: 10x_genomics/Visium_FFPE_Human_Breast_Cancer/Visium_FFPE_Human_Breast_Cancer_spatial.tar.gz + sha256: 2937fcc44b7adee70f162a9e09857410dcf22eed89a3e3187950dfc1574fea14 + - name: image.tif + s3_key: 10x_genomics/Visium_FFPE_Human_Breast_Cancer/Visium_FFPE_Human_Breast_Cancer_image.tif + sha256: b3fbbdb5006769ef7d2b614171e208b6caf901bde52272a02da6e3d019626ce0 + + Visium_FFPE_Human_Prostate_Acinar_Cell_Carcinoma: + type: visium_10x + doc_header: 10x Genomics Visium FFPE dataset Visium_FFPE_Human_Prostate_Acinar_Cell_Carcinoma. 
+ files: + - name: filtered_feature_bc_matrix.h5 + s3_key: 10x_genomics/Visium_FFPE_Human_Prostate_Acinar_Cell_Carcinoma/Visium_FFPE_Human_Prostate_Acinar_Cell_Carcinoma_filtered_feature_bc_matrix.h5 + sha256: 61fa385e2f5126a3085aea6836e9bfcae8e0d90face655156dc872e7ea450d6a + - name: spatial.tar.gz + s3_key: 10x_genomics/Visium_FFPE_Human_Prostate_Acinar_Cell_Carcinoma/Visium_FFPE_Human_Prostate_Acinar_Cell_Carcinoma_spatial.tar.gz + sha256: 3929961344a03075ce8012186952c1df7fd084e7cac7f1bccb70744a5eaf9569 + - name: image.tif + s3_key: 10x_genomics/Visium_FFPE_Human_Prostate_Acinar_Cell_Carcinoma/Visium_FFPE_Human_Prostate_Acinar_Cell_Carcinoma_image.tif + sha256: 68b2eccb14f2344ceba2beccf99dace6506768c4e905624fa2e658fbda4dc86c + + Visium_FFPE_Human_Prostate_Cancer: + type: visium_10x + doc_header: 10x Genomics Visium FFPE dataset Visium_FFPE_Human_Prostate_Cancer. + files: + - name: filtered_feature_bc_matrix.h5 + s3_key: 10x_genomics/Visium_FFPE_Human_Prostate_Cancer/Visium_FFPE_Human_Prostate_Cancer_filtered_feature_bc_matrix.h5 + sha256: 08fb4185da5ac375ab3f4782744752bf3aa23fedbe93d0882aaa9fa0afde9abd + - name: spatial.tar.gz + s3_key: 10x_genomics/Visium_FFPE_Human_Prostate_Cancer/Visium_FFPE_Human_Prostate_Cancer_spatial.tar.gz + sha256: 3eec0e63de718a51afd1d11dea3a1db926009399937f93384d14c81f52e37202 + - name: image.tif + s3_key: 10x_genomics/Visium_FFPE_Human_Prostate_Cancer/Visium_FFPE_Human_Prostate_Cancer_image.tif + sha256: c5e967a1c09deef5a1ef0a60b00c823d2daa3640b50ee3cb6683f61c254299f7 + + Visium_FFPE_Human_Prostate_IF: + type: visium_10x + doc_header: 10x Genomics Visium FFPE dataset Visium_FFPE_Human_Prostate_IF. 
+ files: + - name: filtered_feature_bc_matrix.h5 + s3_key: 10x_genomics/Visium_FFPE_Human_Prostate_IF/Visium_FFPE_Human_Prostate_IF_filtered_feature_bc_matrix.h5 + sha256: 14de9f9debec29e3e359d105191911f838478002216c4540a97d323b2fbea17b + - name: spatial.tar.gz + s3_key: 10x_genomics/Visium_FFPE_Human_Prostate_IF/Visium_FFPE_Human_Prostate_IF_spatial.tar.gz + sha256: 508619614fb1d8db763c197df0a234d655c329be355f77b970b6ec29aa001da0 + - name: image.tif + s3_key: 10x_genomics/Visium_FFPE_Human_Prostate_IF/Visium_FFPE_Human_Prostate_IF_image.tif + sha256: a46c35abc58b10b3704a8f8f89d73ec8d2169b8eb4ed1e2bea8d72e6163837ba + + Visium_FFPE_Human_Normal_Prostate: + type: visium_10x + doc_header: 10x Genomics Visium FFPE dataset Visium_FFPE_Human_Normal_Prostate. + files: + - name: filtered_feature_bc_matrix.h5 + s3_key: 10x_genomics/Visium_FFPE_Human_Normal_Prostate/Visium_FFPE_Human_Normal_Prostate_filtered_feature_bc_matrix.h5 + sha256: a62e3f01b30eb0cbf524dbaafb52c086bc2710ff7461284b0b3778ca0e5a355c + - name: spatial.tar.gz + s3_key: 10x_genomics/Visium_FFPE_Human_Normal_Prostate/Visium_FFPE_Human_Normal_Prostate_spatial.tar.gz + sha256: cd9f2616ed155b8dcdd68d0780eda5dd24f1d440b02cfd8f5c5bd1bb6481000e + - name: image.jpg + s3_key: 10x_genomics/Visium_FFPE_Human_Normal_Prostate/Visium_FFPE_Human_Normal_Prostate_image.jpg + sha256: null diff --git a/src/squidpy/gr/_build.py b/src/squidpy/gr/_build.py index 97b754ec..5ed20fc3 100644 --- a/src/squidpy/gr/_build.py +++ b/src/squidpy/gr/_build.py @@ -15,7 +15,6 @@ from anndata.utils import make_index_unique from fast_array_utils import stats as fau_stats from numba import njit, prange -from scanpy import logging as logg from scipy.sparse import ( SparseEfficiencyWarning, block_diag, @@ -30,10 +29,8 @@ from sklearn.neighbors import NearestNeighbors from spatialdata import SpatialData from spatialdata._core.centroids import get_centroids -from spatialdata._core.query.relational_query import ( - get_element_instances, - 
match_element_to_table, -) +from spatialdata._core.query.relational_query import get_element_instances, match_element_to_table +from spatialdata._logging import logger as logg from spatialdata.models import get_table_keys from spatialdata.models.models import ( Labels2DModel, diff --git a/src/squidpy/read/_read.py b/src/squidpy/read/_read.py index 3b3d9474..13791b23 100644 --- a/src/squidpy/read/_read.py +++ b/src/squidpy/read/_read.py @@ -9,12 +9,11 @@ import numpy as np import pandas as pd from anndata import AnnData -from scanpy import logging as logg from scipy.sparse import csr_matrix +from spatialdata._logging import logger as logg from squidpy._constants._pkg_constants import Key -from squidpy.datasets._utils import PathLike -from squidpy.read._utils import _load_image, _read_counts +from squidpy.read._utils import PathLike, _load_image, _read_counts __all__ = ["visium", "vizgen", "nanostring"] diff --git a/src/squidpy/read/_utils.py b/src/squidpy/read/_utils.py index cf29f24e..44ff7bfe 100644 --- a/src/squidpy/read/_utils.py +++ b/src/squidpy/read/_utils.py @@ -1,7 +1,8 @@ from __future__ import annotations +import os from pathlib import Path -from typing import Any +from typing import Any, TypeAlias import numpy as np from anndata import AnnData @@ -12,7 +13,9 @@ from squidpy._constants._pkg_constants import Key from squidpy._utils import NDArrayA -from squidpy.datasets._utils import PathLike + +# Type alias for path-like objects +PathLike: TypeAlias = os.PathLike[str] | str def _read_counts( diff --git a/tests/datasets/conftest.py b/tests/datasets/conftest.py deleted file mode 100644 index ebdc1c49..00000000 --- a/tests/datasets/conftest.py +++ /dev/null @@ -1,11 +0,0 @@ -from __future__ import annotations - -import sys - -import pytest - - -@pytest.fixture(autouse=True) -def _xfail_internet_if_macos(request: pytest.FixtureRequest) -> None: - if request.node.get_closest_marker("internet") and sys.platform == "darwin": - 
request.applymarker(pytest.mark.xfail(reason="Downloads fail on macOS", strict=False)) diff --git a/tests/datasets/test_dataset.py b/tests/datasets/test_dataset.py index f940f509..70ce0d99 100644 --- a/tests/datasets/test_dataset.py +++ b/tests/datasets/test_dataset.py @@ -2,33 +2,55 @@ import warnings from http.client import RemoteDisconnected -from pathlib import Path -from types import FunctionType import pytest from anndata import AnnData, OldFormatWarning import squidpy as sq +# All public dataset functions that should be importable +_DATASET_FUNCTIONS = [ + # AnnData datasets + "four_i", + "imc", + "seqfish", + "visium_hne_adata", + "visium_hne_adata_crop", + "visium_fluo_adata", + "visium_fluo_adata_crop", + "sc_mouse_cortex", + "mibitof", + "merfish", + "slideseqv2", + # Image datasets + "visium_fluo_image_crop", + "visium_hne_image_crop", + "visium_hne_image", + # 10x Visium + "visium", + "visium_hne_sdata", +] + class TestDatasetsImports: - @pytest.mark.parametrize("func", sq.datasets._dataset.__all__ + sq.datasets._image.__all__) + @pytest.mark.parametrize("func", _DATASET_FUNCTIONS) def test_import(self, func): assert hasattr(sq.datasets, func), dir(sq.datasets) fn = getattr(sq.datasets, func) - assert isinstance(fn, FunctionType) + assert callable(fn) # TODO(michalk8): parse the code and xfail iff server issue class TestDatasetsDownload: @pytest.mark.timeout(120) @pytest.mark.internet() - def test_download_imc(self, tmp_path: Path): + def test_download_imc(self): + # Not passing path uses scanpy.settings.datasetdir with warnings.catch_warnings(): warnings.simplefilter("ignore", category=OldFormatWarning) try: - adata = sq.datasets.imc(tmp_path / "foo") + adata = sq.datasets.imc() assert isinstance(adata, AnnData) assert adata.shape == (4668, 34) @@ -37,11 +59,12 @@ def test_download_imc(self, tmp_path: Path): @pytest.mark.timeout(120) @pytest.mark.internet() - def test_download_visium_hne_image_crop(self, tmp_path: Path): + def 
test_download_visium_hne_image_crop(self): + # Not passing path uses scanpy.settings.datasetdir with warnings.catch_warnings(): warnings.simplefilter("ignore", category=OldFormatWarning) try: - img = sq.datasets.visium_hne_image_crop(tmp_path / "foo") + img = sq.datasets.visium_hne_image_crop() assert isinstance(img, sq.im.ImageContainer) assert img.shape == (3527, 3527) diff --git a/tests/datasets/test_download_visium_dataset.py b/tests/datasets/test_download_visium_dataset.py index 330d4865..b2a98809 100644 --- a/tests/datasets/test_download_visium_dataset.py +++ b/tests/datasets/test_download_visium_dataset.py @@ -25,21 +25,23 @@ "Visium_FFPE_Human_Breast_Cancer", ], ) -def test_visium_datasets(tmpdir, sample): - # Tests that reading / downloading datasets works and it does not have any global effects +def test_visium_datasets(sample): + # Tests that reading / downloading datasets works + # and it does not have any global effects sample_dataset = visium(sample) sample_dataset_again = visium(sample) assert_adata_equal(sample_dataset, sample_dataset_again) - # Test that changing the dataset directory doesn't break reading - settings.datasetdir = Path(tmpdir) + # Test that downloading dataset again returns the same data + # (uses cache) sample_dataset_again = visium(sample) assert_adata_equal(sample_dataset, sample_dataset_again) # Test that downloading tissue image works sample_dataset = visium(sample, include_hires_tiff=True) - expected_image_path = settings.datasetdir / sample / "image.tif" - image_path = Path(sample_dataset.uns["spatial"][sample]["metadata"]["source_image_path"]) + expected_image_path = (Path(settings.datasetdir) / "visium" / sample / "image.tif").resolve() + spatial_metadata = sample_dataset.uns["spatial"][sample]["metadata"] + image_path = Path(spatial_metadata["source_image_path"]).resolve() assert image_path == expected_image_path # Test that tissue image exists and is a valid image file @@ -47,14 +49,18 @@ def test_visium_datasets(tmpdir, 
sample): # Test that tissue image is a tif image file (using `file`) process = subprocess.run(["file", "--mime-type", image_path], stdout=subprocess.PIPE) - output = process.stdout.strip().decode() # make process output string + output = process.stdout.strip().decode() assert output == str(image_path) + ": image/tiff" -@pytest.mark.timeout(120) +# since this is 400mb's +# and need to unpack it, +# if downloading on other integration tests, it will timeout +@pytest.mark.timeout(240) @pytest.mark.internet() -def test_visium_sdata_dataset(tmpdir): - sdata = visium_hne_sdata(Path(tmpdir)) +def test_visium_sdata_dataset(): + # Not passing path uses scanpy.settings.datasetdir + sdata = visium_hne_sdata() assert isinstance(sdata, sd.SpatialData) assert list(sdata.shapes.keys()) == ["spots"] assert list(sdata.images.keys()) == ["hne"] diff --git a/tests/datasets/test_downloader.py b/tests/datasets/test_downloader.py new file mode 100644 index 00000000..7eae2714 --- /dev/null +++ b/tests/datasets/test_downloader.py @@ -0,0 +1,144 @@ +"""Tests for the unified dataset downloader.""" + +from __future__ import annotations + +from pathlib import Path + +import pytest +from scanpy import settings + +from squidpy.datasets._downloader import ( + DatasetDownloader, + download, + get_downloader, +) +from squidpy.datasets._registry import get_registry + + +class TestDatasetDownloader: + """Tests for DatasetDownloader class.""" + + def test_init_default_cache_dir(self): + downloader = DatasetDownloader(registry=get_registry()) + assert downloader.cache_dir == Path(settings.datasetdir) + + def test_init_custom_cache_dir(self, tmp_path: Path): + downloader = DatasetDownloader(registry=get_registry(), cache_dir=tmp_path / "custom_cache") + assert downloader.cache_dir == tmp_path / "custom_cache" + assert downloader.cache_dir.exists() + + def test_init_custom_s3_url(self): + s3_url = "https://my-bucket.s3.amazonaws.com" + downloader = DatasetDownloader(registry=get_registry(), 
s3_base_url=s3_url) + assert downloader._s3_base_url == s3_url + + def test_registry_loaded(self): + downloader = DatasetDownloader(registry=get_registry()) + assert downloader.registry is not None + assert len(downloader.registry.datasets) > 0 + + def test_download_unknown_dataset(self, tmp_path: Path): + downloader = DatasetDownloader(registry=get_registry(), cache_dir=tmp_path) + with pytest.raises(ValueError, match="Unknown dataset"): + downloader.download("nonexistent_dataset") + + +class TestGetDownloader: + """Tests for get_downloader singleton function.""" + + def test_returns_downloader(self): + downloader = get_downloader() + assert isinstance(downloader, DatasetDownloader) + + def test_returns_same_instance(self): + # lru_cache ensures singleton behavior + downloader1 = get_downloader() + downloader2 = get_downloader() + assert downloader1 is downloader2 + + +class TestDownloadFunction: + """Tests for download convenience function.""" + + def test_unknown_dataset_raises(self): + with pytest.raises(ValueError, match="Unknown dataset"): + download("nonexistent_dataset") + + +class TestDownloaderIntegration: + """Integration tests that require network access.""" + + @pytest.mark.timeout(120) + @pytest.mark.internet() + def test_download_imc_dataset(self): + """Test downloading a small AnnData dataset.""" + from anndata import AnnData + + # Use scanpy.settings.datasetdir to match what download_data.py uses + downloader = DatasetDownloader(registry=get_registry(), cache_dir=settings.datasetdir) + adata = downloader.download("imc") + + assert isinstance(adata, AnnData) + assert adata.shape == (4668, 34) + + @pytest.mark.timeout(120) + @pytest.mark.internet() + def test_download_caches_file(self): + """Test that downloaded files are cached.""" + cache_dir = Path(settings.datasetdir) + downloader = DatasetDownloader(registry=get_registry(), cache_dir=cache_dir) + + # First download + adata1 = downloader.download("imc") + + # Check file exists in cache + 
cache_files = list((cache_dir / "anndata").glob("*.h5ad")) + # At least one file (may have more from other tests) + assert len(cache_files) >= 1 + + # Second download should use cache (no network) + adata2 = downloader.download("imc") + assert adata1.shape == adata2.shape + + @pytest.mark.timeout(180) + @pytest.mark.internet() + def test_download_visium_sample(self): + """Test downloading a Visium sample.""" + from anndata import AnnData + + downloader = DatasetDownloader(registry=get_registry(), cache_dir=settings.datasetdir) + adata = downloader.download("V1_Mouse_Kidney", include_hires_tiff=False) + + assert isinstance(adata, AnnData) + assert "spatial" in adata.uns + + @pytest.mark.timeout(300) + @pytest.mark.internet() + def test_include_hires_tiff_caching_behavior(self): + """Test include_hires_tiff: cached files persist, return varies. + + On CI, V1_Mouse_Kidney is pre-cached via .scripts/ci/download_data.py + with include_hires_tiff=True, so this tests return behavior. + """ + sample_id = "V1_Mouse_Kidney" + cache_dir = Path(settings.datasetdir) + hires_image_path = cache_dir / "visium" / sample_id / "image.tif" + downloader = DatasetDownloader(registry=get_registry(), cache_dir=cache_dir) + + # include_hires_tiff=False: no source_image_path in metadata + adata = downloader.download(sample_id, include_hires_tiff=False) + metadata = adata.uns["spatial"][sample_id].get("metadata", {}) + assert "source_image_path" not in metadata + + # include_hires_tiff=True: source_image_path in metadata, file cached + adata = downloader.download(sample_id, include_hires_tiff=True) + metadata = adata.uns["spatial"][sample_id].get("metadata", {}) + assert "source_image_path" in metadata + assert Path(metadata["source_image_path"]).exists() + assert hires_image_path.exists() + + # include_hires_tiff=False again: cached file persists, not in metadata + adata = downloader.download(sample_id, include_hires_tiff=False) + metadata = adata.uns["spatial"][sample_id].get("metadata", 
{}) + assert "source_image_path" not in metadata + assert hires_image_path.exists() # file still cached diff --git a/tests/datasets/test_registry.py b/tests/datasets/test_registry.py new file mode 100644 index 00000000..2283b05c --- /dev/null +++ b/tests/datasets/test_registry.py @@ -0,0 +1,211 @@ +"""Tests for the unified dataset registry.""" + +from __future__ import annotations + +import pytest + +from squidpy.datasets._registry import ( + DatasetEntry, + DatasetRegistry, + DatasetType, + FileEntry, + get_registry, +) + + +class TestFileEntry: + """Tests for FileEntry dataclass.""" + + def test_entry_creation(self): + entry = FileEntry( + name="test.h5ad", + s3_key="figshare/test.h5ad", + sha256="abc123", + ) + assert entry.name == "test.h5ad" + assert entry.sha256 == "abc123" + + def test_get_urls_with_s3(self): + entry = FileEntry( + name="test.h5ad", + s3_key="figshare/test.h5ad", + ) + urls = entry.get_urls("https://s3.example.com") + assert len(urls) == 1 + assert urls[0] == "https://s3.example.com/figshare/test.h5ad" + + +class TestDatasetEntry: + """Tests for DatasetEntry dataclass.""" + + def test_single_file_dataset(self): + entry = DatasetEntry( + name="test", + type=DatasetType.ANNDATA, + files=[ + FileEntry( + name="test.h5ad", + s3_key="test.h5ad", + ) + ], + shape=(100, 50), + ) + assert len(entry.files) == 1 + assert entry.shape == (100, 50) + + def test_visium_10x_dataset(self): + entry = DatasetEntry( + name="V1_Test", + type=DatasetType.VISIUM_10X, + files=[ + FileEntry( + name="filtered_feature_bc_matrix.h5", + s3_key="test.h5", + ), + FileEntry(name="spatial.tar.gz", s3_key="test.tar.gz"), + FileEntry(name="image.tif", s3_key="test.tif"), + ], + ) + assert len(entry.files) == 3 + assert entry.type == DatasetType.VISIUM_10X + assert entry.get_file_by_name_prefix("image.") is not None + + def test_get_file(self): + entry = DatasetEntry( + name="test", + type=DatasetType.VISIUM_10X, + files=[ + FileEntry( + name="filtered_feature_bc_matrix.h5", 
+ s3_key="test.h5", + ), + FileEntry(name="spatial.tar.gz", s3_key="test.tar.gz"), + ], + ) + f = entry.get_file("spatial.tar.gz") + assert f is not None + assert f.name == "spatial.tar.gz" + + assert entry.get_file("nonexistent") is None + + +class TestDatasetRegistry: + """Tests for DatasetRegistry class.""" + + def test_from_yaml_loads_config(self): + registry = DatasetRegistry.from_yaml() + assert registry is not None + assert len(registry.datasets) > 0 + + def test_anndata_datasets_loaded(self): + registry = DatasetRegistry.from_yaml() + assert "four_i" in registry + assert "imc" in registry + assert "seqfish" in registry + assert "visium_hne_adata" in registry + + def test_anndata_dataset_fields(self): + registry = DatasetRegistry.from_yaml() + four_i = registry["four_i"] + assert four_i.type == DatasetType.ANNDATA + assert four_i.shape == (270876, 43) + assert len(four_i.files) == 1 + + def test_image_datasets_loaded(self): + registry = DatasetRegistry.from_yaml() + assert "visium_hne_image" in registry + assert "visium_hne_image_crop" in registry + assert "visium_fluo_image_crop" in registry + + def test_image_has_library_id(self): + registry = DatasetRegistry.from_yaml() + img = registry["visium_hne_image"] + assert img.library_id == "V1_Adult_Mouse_Brain" + + def test_spatialdata_loaded(self): + registry = DatasetRegistry.from_yaml() + assert "visium_hne_sdata" in registry + sdata = registry["visium_hne_sdata"] + assert sdata.type == DatasetType.SPATIALDATA + + def test_visium_10x_datasets_loaded(self): + registry = DatasetRegistry.from_yaml() + # Check samples from different versions + assert "V1_Adult_Mouse_Brain" in registry + assert "Parent_Visium_Human_Cerebellum" in registry + assert "Visium_FFPE_Mouse_Brain" in registry + + def test_visium_10x_dataset_structure(self): + registry = DatasetRegistry.from_yaml() + v1_sample = registry["V1_Adult_Mouse_Brain"] + assert v1_sample.type == DatasetType.VISIUM_10X + assert len(v1_sample.files) == 3 # matrix, 
spatial, image + assert v1_sample.get_file_by_name_prefix("image.") is not None + + def test_visium_10x_has_jpg(self): + """Test that Visium_FFPE_Human_Normal_Prostate has jpg image.""" + registry = DatasetRegistry.from_yaml() + sample = registry["Visium_FFPE_Human_Normal_Prostate"] + assert sample.type == DatasetType.VISIUM_10X + # Check it's a jpg + img_file = sample.get_file_by_name_prefix("image.") + assert img_file is not None + assert img_file.name == "image.jpg" + + def test_get_dataset(self): + registry = DatasetRegistry.from_yaml() + entry = registry.get("four_i") + assert entry is not None + assert entry.name == "four_i" + + assert registry.get("nonexistent") is None + + def test_getitem(self): + registry = DatasetRegistry.from_yaml() + entry = registry["four_i"] + assert entry.name == "four_i" + + with pytest.raises(KeyError): + _ = registry["nonexistent"] + + def test_contains(self): + registry = DatasetRegistry.from_yaml() + assert "four_i" in registry + assert "nonexistent" not in registry + + def test_iter_by_type(self): + registry = DatasetRegistry.from_yaml() + anndata_entries = list(registry.iter_by_type(DatasetType.ANNDATA)) + assert len(anndata_entries) == 11 # 11 h5ad datasets + + visium_10x_entries = list(registry.iter_by_type(DatasetType.VISIUM_10X)) + assert len(visium_10x_entries) == 35 # 35 Visium samples + + def test_property_lists(self): + registry = DatasetRegistry.from_yaml() + assert len(registry.anndata_datasets) == 11 + assert len(registry.image_datasets) == 3 + assert len(registry.spatialdata_datasets) == 1 + assert len(registry.visium_datasets) == 35 + + def test_all_names(self): + registry = DatasetRegistry.from_yaml() + names = registry.all_names + assert "four_i" in names + assert "visium_hne_image" in names + assert "V1_Adult_Mouse_Brain" in names + # Total: 11 + 3 + 1 + 35 = 50 + assert len(names) == 50 + + +class TestGetRegistry: + """Tests for get_registry singleton function.""" + + def test_returns_registry(self): + 
registry = get_registry() + assert isinstance(registry, DatasetRegistry) + + def test_returns_same_instance(self): + registry1 = get_registry() + registry2 = get_registry() + assert registry1 is registry2