
Commit cc9dc68

wip code review

1 parent c9674ca commit cc9dc68

3 files changed: +151 -75 lines changed

src/spatialdata_io/_constants/_constants.py

Lines changed: 2 additions & 2 deletions
@@ -409,5 +409,5 @@ class VisiumHDKeys(ModeEnum):
     FILE_FORMAT = "file_format"
 
     # Cell Segmentation keys
-    CELL_SEG_KEY_HD = 'cell_segmentations'
-    NUCLEUS_SEG_KEY_HD = 'nucleus_segmentations'
+    CELL_SEG_KEY_HD = "cell_segmentations"
+    NUCLEUS_SEG_KEY_HD = "nucleus_segmentations"
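Reviewer note: the reader diff below composes these keys into element names via f-strings, which relies on ModeEnum members formatting as their string value. A quick sanity check, with a placeholder dataset_id that is not part of the commit:

    from spatialdata_io._constants._constants import VisiumHDKeys

    dataset_id = "sample"  # placeholder
    shapes_key = f"{dataset_id}_{VisiumHDKeys.CELL_SEG_KEY_HD}"
    # -> "sample_cell_segmentations", assuming ModeEnum formats to its value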

src/spatialdata_io/readers/visium_hd.py

Lines changed: 96 additions & 65 deletions
@@ -35,19 +35,20 @@
 if TYPE_CHECKING:
     from collections.abc import Mapping
 
-    import anndata
+    from anndata import AnnData
     from multiscale_spatial_image import MultiscaleSpatialImage
     from spatial_image import SpatialImage
     from spatialdata._types import ArrayLike
 
 RNG = default_rng(0)
 
+
 @inject_docs(vx=VisiumHDKeys)
 def visium_hd(
     path: str | Path,
     dataset_id: str | None = None,
     filtered_counts_file: bool = True,
-    load_segmentations_only: bool = True,
+    load_segmentations_only: bool = False,
     load_nucleus_segmentations: bool = False,
     bin_size: int | list[int] | None = None,
     bins_as_squares: bool = True,
@@ -74,6 +75,8 @@ def visium_hd(
     load_segmentations_only
         If `True`, only the segmented cell boundaries and their associated counts will be loaded. All binned data
         will be skipped.
+    load_nucleus_segmentations
+        ...
     bin_size
         When specified, load the data of a specific bin size, or a list of bin sizes. By default, it loads all the
         available bin sizes.
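Reviewer note: with the default flipped to load_segmentations_only=False, a call that opts in to nucleus segmentations would look roughly like this (the path and flag values are illustrative, not from the commit):

    from spatialdata_io import visium_hd

    sdata = visium_hd(
        "path/to/visium_hd_run",          # folder containing binned_outputs/ and segmented_outputs/
        load_segmentations_only=False,    # new default: binned data is loaded as well
        load_nucleus_segmentations=True,  # opt in to the nucleus shapes and table
    )

Also, the new load_nucleus_segmentations docstring entry is still a `...` placeholder and should be filled in before merge.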
@@ -115,15 +118,26 @@ def visium_hd(
     CELL_GEOJSON_PATH = SEGMENTED_OUTPUTS_PATH / VisiumHDKeys.CELL_SEGMENTATION_GEOJSON_PATH
     NUCLEUS_GEOJSON_PATH = SEGMENTED_OUTPUTS_PATH / VisiumHDKeys.NUCLEUS_SEGMENTATION_GEOJSON_PATH
     SCALE_FACTORS_PATH = SEGMENTED_OUTPUTS_PATH / VisiumHDKeys.SPATIAL / VisiumHDKeys.SCALEFACTORS_FILE
-    BARCODE_MAPPINGS_PATH = next((file for file in path.rglob("*") if file.name.endswith(VisiumHDKeys.BARCODE_MAPPINGS_FILE)), None)
-    FILTERED_MATRIX_2U_PATH = path / VisiumHDKeys.BINNED_OUTPUTS / f"{VisiumHDKeys.BIN_PREFIX}002um" / VisiumHDKeys.FILTERED_COUNTS_FILE
-    cell_segmentation_files_exist = COUNT_MATRIX_PATH.exists() and CELL_GEOJSON_PATH.exists() and SCALE_FACTORS_PATH.exists()
-    nucleus_segmentation_files_exist = NUCLEUS_GEOJSON_PATH.exists() and (BARCODE_MAPPINGS_PATH is not None and BARCODE_MAPPINGS_PATH.exists()) and FILTERED_MATRIX_2U_PATH.exists()
+    BARCODE_MAPPINGS_PATH = next(
+        (file for file in path.rglob("*") if file.name.endswith(VisiumHDKeys.BARCODE_MAPPINGS_FILE)), None
+    )
+    FILTERED_MATRIX_2U_PATH = (
+        path / VisiumHDKeys.BINNED_OUTPUTS / f"{VisiumHDKeys.BIN_PREFIX}002um" / VisiumHDKeys.FILTERED_COUNTS_FILE
+    )
+    cell_segmentation_files_exist = (
+        COUNT_MATRIX_PATH.exists() and CELL_GEOJSON_PATH.exists() and SCALE_FACTORS_PATH.exists()
+    )
+    nucleus_segmentation_files_exist = (
+        NUCLEUS_GEOJSON_PATH.exists()
+        and (BARCODE_MAPPINGS_PATH is not None and BARCODE_MAPPINGS_PATH.exists())
+        and FILTERED_MATRIX_2U_PATH.exists()
+    )
 
     if dataset_id is None:
         dataset_id = _infer_dataset_id(path)
 
     filename_prefix = _get_filename_prefix(path, dataset_id)
+
     def load_image(path: Path, suffix: str, scale_factors: list[int] | None = None) -> None:
         _load_image(
             path=path,
@@ -147,6 +161,7 @@ def load_image(path: Path, suffix: str, scale_factors: list[int] | None = None)
 
    # Load Binned Data (skipped if load_segmentations_only is True)
    if not load_segmentations_only:
+
        def _get_bins(path_bins: Path) -> list[str]:
            return sorted(
                [
@@ -158,9 +173,7 @@ def _get_bins(path_bins: Path) -> list[str]:
 
        all_path_bins = [path_bin for path_bin in all_files if VisiumHDKeys.BINNED_OUTPUTS in str(path_bin)]
        if len(all_path_bins) != 0:
-            path_bins_parts = all_path_bins[
-                -1
-            ].parts
+            path_bins_parts = all_path_bins[-1].parts
            path_bins = Path(*path_bins_parts[: path_bins_parts.index(VisiumHDKeys.BINNED_OUTPUTS) + 1])
        else:
            path_bins = path
@@ -181,6 +194,7 @@ def _get_bins(path_bins: Path) -> list[str]:
        if bin_size is None or bin_sizes == []:
            bin_sizes = all_bin_sizes
 
+        # iterate over the given bins and load the data
        for bin_size_str in bin_sizes:
            path_bin = path_bins / bin_size_str
            counts_file = VisiumHDKeys.FILTERED_COUNTS_FILE if filtered_counts_file else VisiumHDKeys.RAW_COUNTS_FILE
@@ -195,6 +209,7 @@ def _get_bins(path_bins: Path) -> list[str]:
            with open(path_bin_spatial / VisiumHDKeys.SCALEFACTORS_FILE) as file:
                scalefactors = json.load(file)
 
+            # consistency check
            found_bin_size = re.search(r"\d{3}", bin_size_str)
            assert found_bin_size is not None
            assert float(found_bin_size.group()) == scalefactors[VisiumHDKeys.SCALEFACTORS_BIN_SIZE_UM]
@@ -206,6 +221,7 @@ def _get_bins(path_bins: Path) -> list[str]:
 
            tissue_positions_file = path_bin_spatial / VisiumHDKeys.TISSUE_POSITIONS_FILE
 
+            # read coordinates and set up adata.obs and adata.obsm
            coords = pd.read_parquet(tissue_positions_file)
            assert all(
                coords.columns.values
@@ -221,7 +237,9 @@ def _get_bins(path_bins: Path) -> list[str]:
            coords.set_index(VisiumHDKeys.BARCODE, inplace=True, drop=True)
            coords_filtered = coords.loc[adata.obs.index]
            adata.obs = pd.merge(adata.obs, coords_filtered, how="left", left_index=True, right_index=True)
+            # compatibility to legacy squidpy
            adata.obsm["spatial"] = adata.obs[[VisiumHDKeys.LOCATIONS_X, VisiumHDKeys.LOCATIONS_Y]].values
+            # dropping the spatial coordinates (will be stored in shapes)
            adata.obs.drop(
                columns=[
                    VisiumHDKeys.LOCATIONS_X,
@@ -231,6 +249,8 @@ def _get_bins(path_bins: Path) -> list[str]:
            )
            adata.obs[VisiumHDKeys.INSTANCE_KEY] = np.arange(len(adata))
 
+            # scaling
+            transform_original = Identity()
            transform_lowres = Scale(
                np.array(
                    [
@@ -249,11 +269,13 @@ def _get_bins(path_bins: Path) -> list[str]:
                ),
                axes=("x", "y"),
            )
+            # parse shapes
            shapes_name = dataset_id + "_" + bin_size_str
            radius = scalefactors[VisiumHDKeys.SCALEFACTORS_SPOT_DIAMETER_FULLRES] / 2.0
 
            # Here we ensure that only the correct coordinate systems are created for the binned data
            transformations = {
+                dataset_id: transform_original,
                f"{dataset_id}_downscaled_hires": transform_hires,
                f"{dataset_id}_downscaled_lowres": transform_lowres,
            }
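Reviewer note: the newly added Identity() entry registers the shapes in the dataset's full-resolution coordinate system alongside the existing downscaled ones. A minimal, self-contained sketch of what these transforms do, with an assumed scale factor:

    import numpy as np
    from spatialdata.transformations import Identity, Scale

    hires_scalef = 0.08  # placeholder for scalefactors["tissue_hires_scalef"]
    to_hires = Scale(np.array([hires_scalef, hires_scalef]), axes=("x", "y"))
    # a bin centre at full-res (1000, 2000) maps to (80, 160) in the hires image;
    # Identity() leaves full-res coordinates unchanged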
@@ -272,6 +294,7 @@ def _get_bins(path_bins: Path) -> list[str]:
                GeoDataFrame(geometry=squares_series), transformations=transformations
            )
 
+            # parse table
            adata.obs[VisiumHDKeys.REGION_KEY] = shapes_name
            adata.obs[VisiumHDKeys.REGION_KEY] = adata.obs[VisiumHDKeys.REGION_KEY].astype("category")
 
@@ -290,45 +313,52 @@ def _get_bins(path_bins: Path) -> list[str]:
        cell_adata_hd = sc.read_10x_h5(COUNT_MATRIX_PATH)
        cell_adata_hd.var_names_make_unique()
 
-        shapes_transformations_hd = _make_shapes_transformation(scale_factors_path=SCALE_FACTORS_PATH, dataset_id=dataset_id) # Used for both cell and nucleus segmentations
+        shapes_transformations_hd = _make_shapes_transformation(
+            scale_factors_path=SCALE_FACTORS_PATH, dataset_id=dataset_id
+        ) # Used for both cell and nucleus segmentations
        cell_geojson_features_map = _make_geojson_features_map(CELL_GEOJSON_PATH)
-        cell_shapes_gdf = _extract_geometries_from_geojson(cell_adata_hd, geojson_features_map=cell_geojson_features_map)
+        cell_shapes_gdf = _extract_geometries_from_geojson(
+            cell_adata_hd, geojson_features_map=cell_geojson_features_map
+        )
 
        SHAPES_KEY_HD = f"{dataset_id}_{VisiumHDKeys.CELL_SEG_KEY_HD}"
-        cell_adata_hd.obs['cell_id'] = cell_adata_hd.obs.index
-        cell_adata_hd.obs['region'] = SHAPES_KEY_HD
-        cell_adata_hd.obs['region'] = cell_adata_hd.obs['region'].astype('category')
+        cell_adata_hd.obs["cell_id"] = cell_adata_hd.obs.index
+        cell_adata_hd.obs["region"] = SHAPES_KEY_HD
+        cell_adata_hd.obs["region"] = cell_adata_hd.obs["region"].astype("category")
        cell_adata_hd = cell_adata_hd[cell_shapes_gdf.index].copy()
 
        shapes[SHAPES_KEY_HD] = ShapesModel.parse(cell_shapes_gdf, transformations=shapes_transformations_hd)
        tables[VisiumHDKeys.CELL_SEG_KEY_HD] = TableModel.parse(
-            cell_adata_hd,
-            region=SHAPES_KEY_HD,
-            region_key='region',
-            instance_key='cell_id'
+            cell_adata_hd, region=SHAPES_KEY_HD, region_key="region", instance_key="cell_id"
        )
 
        # load nucleus segmentations if available
        if nucleus_segmentation_files_exist and load_nucleus_segmentations:
            print("Found nucleus segmentation data. Incorporating nucleus_segmentations.")
 
-            nucleus_adata_hd = _make_filtered_nucleus_adata(filtered_matrix_h5_path=FILTERED_MATRIX_2U_PATH,barcode_mappings_parquet_path=BARCODE_MAPPINGS_PATH)
-            geojson_features_map = _make_geojson_features_map(NUCLEUS_GEOJSON_PATH)
-            nucleus_shapes_gdf = _extract_geometries_from_geojson(adata=nucleus_adata_hd, geojson_features_map=geojson_features_map)
-
-            SHAPES_KEY_HD = f"{dataset_id}_{VisiumHDKeys.NUCLEUS_SEG_KEY_HD}"
-            nucleus_adata_hd.obs['cell_id'] = nucleus_adata_hd.obs.index
-            nucleus_adata_hd.obs['region'] = SHAPES_KEY_HD
-            nucleus_adata_hd.obs['region'] = nucleus_adata_hd.obs['region'].astype('category')
-            nucleus_adata_hd = nucleus_adata_hd[nucleus_shapes_gdf.index].copy()
-
-            shapes[SHAPES_KEY_HD] = ShapesModel.parse(nucleus_shapes_gdf, transformations=shapes_transformations_hd)
-            tables[VisiumHDKeys.NUCLEUS_SEG_KEY_HD] = TableModel.parse(
-                nucleus_adata_hd,
-                region=SHAPES_KEY_HD,
-                region_key='region',
-                instance_key='cell_id'
-            )
+            if BARCODE_MAPPINGS_PATH is None:
+                warnings.warn(
+                    "Cannot find the barcode mappings file, skipping nucleus segmentations.", UserWarning, stacklevel=2
+                )
+            else:
+                nucleus_adata_hd = _make_filtered_nucleus_adata(
+                    filtered_matrix_h5_path=FILTERED_MATRIX_2U_PATH, barcode_mappings_parquet_path=BARCODE_MAPPINGS_PATH
+                )
+                geojson_features_map = _make_geojson_features_map(NUCLEUS_GEOJSON_PATH)
+                nucleus_shapes_gdf = _extract_geometries_from_geojson(
+                    adata=nucleus_adata_hd, geojson_features_map=geojson_features_map
+                )
+
+                SHAPES_KEY_HD = f"{dataset_id}_{VisiumHDKeys.NUCLEUS_SEG_KEY_HD}"
+                nucleus_adata_hd.obs["cell_id"] = nucleus_adata_hd.obs.index
+                nucleus_adata_hd.obs["region"] = SHAPES_KEY_HD
+                nucleus_adata_hd.obs["region"] = nucleus_adata_hd.obs["region"].astype("category")
+                nucleus_adata_hd = nucleus_adata_hd[nucleus_shapes_gdf.index].copy()
+
+                shapes[SHAPES_KEY_HD] = ShapesModel.parse(nucleus_shapes_gdf, transformations=shapes_transformations_hd)
+                tables[VisiumHDKeys.NUCLEUS_SEG_KEY_HD] = TableModel.parse(
+                    nucleus_adata_hd, region=SHAPES_KEY_HD, region_key="region", instance_key="cell_id"
+                )
 
    # Read all images and add transformations for both binning and segmentation
    fullres_image_file_paths = []
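Reviewer note: after this block the returned SpatialData object should expose the segmentation elements under the new keys. A hypothetical look-up for a dataset_id of "sample" (key names taken from the diff, access pattern assumed):

    cells = sdata.shapes["sample_cell_segmentations"]    # GeoDataFrame of cell polygons
    cell_table = sdata.tables["cell_segmentations"]      # AnnData of per-cell counts
    nuclei = sdata.tables.get("nucleus_segmentations")   # only present when the GeoJSON and barcode mappings were found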
@@ -472,7 +502,9 @@ def _get_bins(path_bins: Path) -> list[str]:
        )
        warped = np.round(warped * 255).astype(np.uint8)
        if not load_segmentations_only:
-            warped = Image2DModel.parse(warped, dims=("y", "x", "c"), transformations={dataset_id: affine}, rgb=True)
+            warped = Image2DModel.parse(
+                warped, dims=("y", "x", "c"), transformations={dataset_id: affine}, rgb=True
+            )
            images[dataset_id + "_cytassist_image"] = warped
    elif load_all_images:
        warnings.warn(
@@ -645,12 +677,13 @@ def _get_transform_matrices(metadata: dict[str, Any], hd_layout: dict[str, Any])
 
    return transform_matrices
 
+
def _make_filtered_nucleus_adata(
    filtered_matrix_h5_path: Path,
    barcode_mappings_parquet_path: Path,
-    bin_col_name: str = 'square_002um',
-    aggregate_col_name: str = 'cell_id'
-) -> anndata.AnnData:
+    bin_col_name: str = "square_002um",
+    aggregate_col_name: str = "cell_id",
+) -> AnnData:
    """Generate a filtered AnnData object by aggregating 2um binned data based on nucleus segmentation.
 
    Uses a 2um filtered_feature_bc_matrix.h5 file and a barcode_mappings.parquet file containing
@@ -671,7 +704,7 @@ def _make_filtered_nucleus_adata(
 
    Returns:
    --------
-    anndata.AnnData
+    AnnData
        An AnnData object where the observations correspond to filtered cell IDs
        and the variables correspond to the original features from the input data.
    """
@@ -680,19 +713,19 @@ def _make_filtered_nucleus_adata(
    barcode_mappings = pq.read_table(barcode_mappings_parquet_path)
 
    # Filter to only include valid cell IDs that are in both nucleus and cell
-    barcode_mappings = barcode_mappings.filter((barcode_mappings['cell_id'].is_valid()) and barcode_mappings["in_nucleus"])
+    barcode_mappings = barcode_mappings.filter(
+        (barcode_mappings["cell_id"].is_valid()) and barcode_mappings["in_nucleus"]
+    )
 
    # Filter the 2um adata to only include squares present in the barcode mappings
    valid_squares = barcode_mappings[bin_col_name].unique()
    squares_to_keep = np.intersect1d(adata_2um.obs_names, valid_squares)
    adata_filtered = adata_2um[squares_to_keep, :].copy()
 
    # Map each square to its corresponding cell ID
-    square_to_cell_map = dict(zip(
-        barcode_mappings[bin_col_name].to_pylist(),
-        barcode_mappings[aggregate_col_name].to_pylist(), strict=False
-
-    ))
+    square_to_cell_map = dict(
+        zip(barcode_mappings[bin_col_name].to_pylist(), barcode_mappings[aggregate_col_name].to_pylist(), strict=False)
+    )
    ordered_cell_ids = [square_to_cell_map[square] for square in adata_filtered.obs_names]
    unique_cells = list(dict.fromkeys(ordered_cell_ids).keys())
    cell_to_idx = {cell: i for i, cell in enumerate(unique_cells)}
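Reviewer note: the reformatted filter still combines the two conditions with Python's `and`, which is not elementwise on Arrow arrays (it either raises or short-circuits on the left operand's truthiness). The elementwise conjunction is pyarrow.compute.and_, which is presumably what is intended here; a self-contained sketch with toy data:

    import pyarrow as pa
    import pyarrow.compute as pc

    # toy stand-in for barcode_mappings.parquet
    tbl = pa.table({"cell_id": [1, None, 2], "in_nucleus": [True, True, False]})
    mask = pc.and_(pc.is_valid(tbl["cell_id"]), tbl["in_nucleus"])
    print(tbl.filter(mask))  # keeps only the first row (valid cell_id AND in_nucleus)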
@@ -702,10 +735,7 @@ def _make_filtered_nucleus_adata(
    row_indices = np.arange(len(ordered_cell_ids))
    data = np.ones_like(row_indices)
 
-    aggregation_matrix = csc_matrix(
-        (data, (row_indices, col_indices)),
-        shape=(adata_filtered.n_obs, len(unique_cells))
-    )
+    aggregation_matrix = csc_matrix((data, (row_indices, col_indices)), shape=(adata_filtered.n_obs, len(unique_cells)))
 
    # Make the final AnnData object where cell IDs are filtered
    # to the data under the segmented nuclei
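Reviewer note: this builds a sparse bins-by-cells indicator matrix, presumably applied afterwards as aggregation_matrix.T @ X to pool 2um bin counts per cell (that step is outside this hunk). A minimal, self-contained illustration of the technique with toy data:

    import numpy as np
    from scipy.sparse import csc_matrix, csr_matrix

    X = csr_matrix(np.array([[1, 0], [2, 1], [0, 3]]))  # 3 bins x 2 genes
    ordered_cell_ids = ["cellA", "cellA", "cellB"]      # cell assignment per bin
    unique_cells = list(dict.fromkeys(ordered_cell_ids))
    col_indices = [unique_cells.index(c) for c in ordered_cell_ids]
    row_indices = np.arange(len(ordered_cell_ids))
    A = csc_matrix((np.ones(len(row_indices)), (row_indices, col_indices)), shape=(X.shape[0], len(unique_cells)))
    pooled = A.T @ X  # 2 cells x 2 genes: [[3, 1], [0, 3]]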
@@ -716,12 +746,13 @@ def _make_filtered_nucleus_adata(
 
    return adata_nucleus
 
-def _extract_geometries_from_geojson(adata: anndata.AnnData, geojson_features_map: dict[str, Any]) -> GeoDataFrame:
+
+def _extract_geometries_from_geojson(adata: AnnData, geojson_features_map: dict[str, Any]) -> GeoDataFrame:
    """Extract geometries and create a GeoDataFrame from a GeoJSON features map.
 
    Parameters
    ----------
-    cell_adata : anndata.AnnData
+    cell_adata : AnnData
        AnnData object containing cell data.
    geojson_features_map : dict[str, Any]
        Dictionary mapping cell IDs to GeoJSON features.
@@ -737,7 +768,7 @@ def _extract_geometries_from_geojson(adata: anndata.AnnData, geojson_features_ma
    for obs_index_str in adata.obs.index:
        feature = geojson_features_map.get(obs_index_str)
        if feature:
-            polygon_coords = np.array(feature['geometry']['coordinates'][0])
+            polygon_coords = np.array(feature["geometry"]["coordinates"][0])
            geometries.append(Polygon(polygon_coords))
            cell_ids_ordered.append(obs_index_str)
        else:
@@ -748,10 +779,8 @@ def _extract_geometries_from_geojson(adata: anndata.AnnData, geojson_features_ma
    geometries = [geometries[i] for i in valid_indices]
    cell_ids_ordered = [cell_ids_ordered[i] for i in valid_indices]
 
-    return GeoDataFrame({
-        'cell_id': cell_ids_ordered,
-        'geometry': geometries
-    }, index=cell_ids_ordered)
+    return GeoDataFrame({"cell_id": cell_ids_ordered, "geometry": geometries}, index=cell_ids_ordered)
+
 
def _make_shapes_transformation(scale_factors_path: Path, dataset_id: str) -> dict[str, Scale]:
    """Load scale factors for lowres and hires images and create transformations.
@@ -770,18 +799,20 @@ def _make_shapes_transformation(scale_factors_path: Path, dataset_id: str) -> di
    """
    with open(scale_factors_path) as f:
        scale_data_hd = json.load(f)
-    lowres_scale_factor_hd = scale_data_hd['tissue_lowres_scalef']
-    hires_scale_factor_hd = scale_data_hd['tissue_hires_scalef']
+    lowres_scale_factor_hd = scale_data_hd["tissue_lowres_scalef"]
+    hires_scale_factor_hd = scale_data_hd["tissue_hires_scalef"]
 
    return {
-        f"{dataset_id}_downscaled_lowres": Scale(np.array([lowres_scale_factor_hd, lowres_scale_factor_hd]), axes=("x", "y")),
-        f"{dataset_id}_downscaled_hires": Scale(np.array([hires_scale_factor_hd, hires_scale_factor_hd]), axes=("x", "y"))
+        f"{dataset_id}_downscaled_lowres": Scale(
+            np.array([lowres_scale_factor_hd, lowres_scale_factor_hd]), axes=("x", "y")
+        ),
+        f"{dataset_id}_downscaled_hires": Scale(
+            np.array([hires_scale_factor_hd, hires_scale_factor_hd]), axes=("x", "y")
+        ),
    }
 
+
def _make_geojson_features_map(geojson_path: Path) -> dict[str, Any]:
    with open(geojson_path) as f:
        geojson_data = json.load(f)
-    return {
-        f"cellid_{feature['properties']['cell_id']:09d}-1": feature
-        for feature in geojson_data['features']
-    }
+    return {f"cellid_{feature['properties']['cell_id']:09d}-1": feature for feature in geojson_data["features"]}
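Reviewer note: the dictionary key reconstructs the barcode format used as the obs index of the 10x cell matrix, zero-padding the GeoJSON cell_id to nine digits so features can be looked up by adata.obs.index. For example (values illustrative):

    feature = {"properties": {"cell_id": 42}}
    key = f"cellid_{feature['properties']['cell_id']:09d}-1"  # -> "cellid_000000042-1"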
