Commit 30e4076

wip code review visium hd
1 parent 49d3e7e commit 30e4076

File tree

1 file changed: +96 -82 lines changed

src/spatialdata_io/readers/visium_hd.py

Lines changed: 96 additions & 82 deletions
@@ -157,7 +157,27 @@ def load_image(path: Path, suffix: str, scale_factors: list[int] | None = None)
                 stacklevel=2,
             )
 
-    # Load Binned Data (skipped if load_segmentations_only is True)
+    # TODO: load scalefactors independently of the parameter load_segmentations_only
+    transform_lowres = Scale(
+        np.array(
+            [
+                scalefactors[VisiumHDKeys.SCALEFACTORS_LOWRES],
+                scalefactors[VisiumHDKeys.SCALEFACTORS_LOWRES],
+            ]
+        ),
+        axes=("x", "y"),
+    )
+    transform_hires = Scale(
+        np.array(
+            [
+                scalefactors[VisiumHDKeys.SCALEFACTORS_HIRES],
+                scalefactors[VisiumHDKeys.SCALEFACTORS_HIRES],
+            ]
+        ),
+        axes=("x", "y"),
+    )
+
+    # Load Binned Data
     if not load_segmentations_only:
 
         def _get_bins(path_bins: Path) -> list[str]:
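
For reference, a minimal sketch of what such a Scale transformation encodes; the scalefactor value below is made up, and only spatialdata's Scale and to_affine_matrix APIs are assumed:

import numpy as np
from spatialdata.transformations import Scale

# stand-in for an entry of scalefactors_json.json (illustrative value only)
scalefactors = {"tissue_lowres_scalef": 0.025}
transform_lowres = Scale(
    np.array([scalefactors["tissue_lowres_scalef"], scalefactors["tissue_lowres_scalef"]]),
    axes=("x", "y"),
)
# the scale maps full-resolution coordinates onto the downscaled lowres image:
# a full-res point at x=1000 lands at x=25 in lowres pixel coordinates
print(transform_lowres.to_affine_matrix(input_axes=("x", "y"), output_axes=("x", "y")))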
@@ -171,7 +191,9 @@ def _get_bins(path_bins: Path) -> list[str]:
 
         all_path_bins = [path_bin for path_bin in all_files if VisiumHDKeys.BINNED_OUTPUTS in str(path_bin)]
         if len(all_path_bins) != 0:
-            path_bins_parts = all_path_bins[-1].parts
+            path_bins_parts = all_path_bins[
+                -1
+            ].parts  # just choosing the last one here, as users might have a tar file, which would be first
             path_bins = Path(*path_bins_parts[: path_bins_parts.index(VisiumHDKeys.BINNED_OUTPUTS) + 1])
         else:
             path_bins = path
@@ -192,6 +214,7 @@ def _get_bins(path_bins: Path) -> list[str]:
         if bin_size is None or bin_sizes == []:
             bin_sizes = all_bin_sizes
 
+        # iterate over the given bins and load the data
         for bin_size_str in bin_sizes:
             path_bin = path_bins / bin_size_str
             counts_file = VisiumHDKeys.FILTERED_COUNTS_FILE if filtered_counts_file else VisiumHDKeys.RAW_COUNTS_FILE
@@ -206,6 +229,7 @@ def _get_bins(path_bins: Path) -> list[str]:
             with open(path_bin_spatial / VisiumHDKeys.SCALEFACTORS_FILE) as file:
                 scalefactors = json.load(file)
 
+            # consistency check
             found_bin_size = re.search(r"\d{3}", bin_size_str)
             assert found_bin_size is not None
             assert float(found_bin_size.group()) == scalefactors[VisiumHDKeys.SCALEFACTORS_BIN_SIZE_UM]
@@ -217,6 +241,7 @@ def _get_bins(path_bins: Path) -> list[str]:
 
             tissue_positions_file = path_bin_spatial / VisiumHDKeys.TISSUE_POSITIONS_FILE
 
+            # read coordinates and set up adata.obs and adata.obsm
             coords = pd.read_parquet(tissue_positions_file)
             assert all(
                 coords.columns.values
@@ -232,7 +257,9 @@ def _get_bins(path_bins: Path) -> list[str]:
             coords.set_index(VisiumHDKeys.BARCODE, inplace=True, drop=True)
             coords_filtered = coords.loc[adata.obs.index]
             adata.obs = pd.merge(adata.obs, coords_filtered, how="left", left_index=True, right_index=True)
+            # compatibility with legacy squidpy
             adata.obsm["spatial"] = adata.obs[[VisiumHDKeys.LOCATIONS_X, VisiumHDKeys.LOCATIONS_Y]].values
+            # drop the spatial coordinates (they will be stored in shapes)
             adata.obs.drop(
                 columns=[
                     VisiumHDKeys.LOCATIONS_X,
@@ -242,29 +269,13 @@ def _get_bins(path_bins: Path) -> list[str]:
             )
             adata.obs[VisiumHDKeys.INSTANCE_KEY] = np.arange(len(adata))
 
-            transform_lowres = Scale(
-                np.array(
-                    [
-                        scalefactors[VisiumHDKeys.SCALEFACTORS_LOWRES],
-                        scalefactors[VisiumHDKeys.SCALEFACTORS_LOWRES],
-                    ]
-                ),
-                axes=("x", "y"),
-            )
-            transform_hires = Scale(
-                np.array(
-                    [
-                        scalefactors[VisiumHDKeys.SCALEFACTORS_HIRES],
-                        scalefactors[VisiumHDKeys.SCALEFACTORS_HIRES],
-                    ]
-                ),
-                axes=("x", "y"),
-            )
+            # scaling
+            transform_original = Identity()
+            # parse shapes
             shapes_name = dataset_id + "_" + bin_size_str
             radius = scalefactors[VisiumHDKeys.SCALEFACTORS_SPOT_DIAMETER_FULLRES] / 2.0
-
-            # Here we ensure that only the correct coordinate systems are created for the binned data
             transformations = {
+                dataset_id: transform_original,
                 f"{dataset_id}_downscaled_hires": transform_hires,
                 f"{dataset_id}_downscaled_lowres": transform_lowres,
             }
@@ -283,6 +294,7 @@ def _get_bins(path_bins: Path) -> list[str]:
                 GeoDataFrame(geometry=squares_series), transformations=transformations
             )
 
+            # parse table
             adata.obs[VisiumHDKeys.REGION_KEY] = shapes_name
             adata.obs[VisiumHDKeys.REGION_KEY] = adata.obs[VisiumHDKeys.REGION_KEY].astype("category")
 
@@ -295,6 +307,25 @@ def _get_bins(path_bins: Path) -> list[str]:
             if var_names_make_unique:
                 tables[bin_size_str].var_names_make_unique()
 
+        if annotate_table_by_labels:
+            for bin_size_str in bin_sizes:
+                shapes_name = dataset_id + "_" + bin_size_str
+                # add a labels layer (rasterized bins)
+                labels_name = f"{dataset_id}_{bin_size_str}_labels"
+                labels_element = rasterize_bins(
+                    sdata,
+                    bins=shapes_name,
+                    table_name=bin_size_str,
+                    row_key=VisiumHDKeys.ARRAY_ROW,
+                    col_key=VisiumHDKeys.ARRAY_COL,
+                    value_key=None,
+                    return_region_as_labels=True,
+                )
+                sdata[labels_name] = labels_element
+                rasterize_bins_link_table_to_labels(
+                    sdata=sdata, table_name=bin_size_str, rasterized_labels_name=labels_name
+                )
+
     # Integrate the segmentation data (skipped if segmentation files are not found)
     if cell_segmentation_files_exist:
         print("Found segmentation data. Incorporating cell_segmentations.")
@@ -385,62 +416,51 @@ def _get_bins(path_bins: Path) -> list[str]:
             stacklevel=2,
         )
 
+    # hires image
     hires_image_path = [path for path in all_files if VisiumHDKeys.IMAGE_HIRES_FILE in str(path)]
     if len(hires_image_path) > 0:
         load_image(
             path=hires_image_path[0],
             suffix="_hires_image",
         )
-        if not load_segmentations_only and "transform_hires" in locals():
-            set_transformation(
-                images[dataset_id + "_hires_image"],
-                {
-                    f"{dataset_id}_downscaled_hires": Identity(),
-                    dataset_id: transform_hires.inverse(),
-                },
-                set_all=True,
-            )
-        if cell_segmentation_files_exist:
-            set_transformation(
-                images[dataset_id + "_hires_image"],
-                {f"{dataset_id}_downscaled_hires": Identity()},
-                set_all=True,
-            )
+        set_transformation(
+            images[dataset_id + "_hires_image"],
+            {
+                f"{dataset_id}_downscaled_hires": Identity(),
+                dataset_id: transform_hires.inverse(),
+            },
+            set_all=True,
+        )
     else:
         warnings.warn(
             f"No image path found containing the hires image: {VisiumHDKeys.IMAGE_HIRES_FILE}",
             UserWarning,
             stacklevel=2,
         )
 
+    # lowres image
     lowres_image_path = [path for path in all_files if VisiumHDKeys.IMAGE_LOWRES_FILE in str(path)]
     if len(lowres_image_path) > 0:
         load_image(
             path=lowres_image_path[0],
             suffix="_lowres_image",
         )
-        if not load_segmentations_only and "transform_lowres" in locals():
-            set_transformation(
-                images[dataset_id + "_lowres_image"],
-                {
-                    f"{dataset_id}_downscaled_lowres": Identity(),
-                    dataset_id: transform_lowres.inverse(),
-                },
-                set_all=True,
-            )
-        if cell_segmentation_files_exist:
-            set_transformation(
-                images[dataset_id + "_lowres_image"],
-                {f"{dataset_id}_downscaled_lowres": Identity()},
-                set_all=True,
-            )
+        set_transformation(
+            images[dataset_id + "_lowres_image"],
+            {
+                f"{dataset_id}_downscaled_lowres": Identity(),
+                dataset_id: transform_lowres.inverse(),
+            },
+            set_all=True,
+        )
     else:
         warnings.warn(
             f"No image path found containing the lowres image: {VisiumHDKeys.IMAGE_LOWRES_FILE}",
             UserWarning,
             stacklevel=2,
         )
 
+    # cytassist image
     cytassist_path = [path for path in all_files if VisiumHDKeys.IMAGE_CYTASSIST in str(path)]
     if load_all_images and len(cytassist_path) > 0:
         load_image(
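
Why the inverse: the downscaled images live in their own pixel space, so placing them in the full-resolution coordinate system uses the inverse of the full-res-to-downscaled scale. A minimal sketch with an illustrative scalefactor:

import numpy as np
from spatialdata.transformations import Scale

transform_lowres = Scale(np.array([0.025, 0.025]), axes=("x", "y"))  # illustrative value
to_fullres = transform_lowres.inverse()
# a lowres pixel at x=10 corresponds to x=400 at full resolution (1 / 0.025 = 40)
print(to_fullres.to_affine_matrix(input_axes=("x", "y"), output_axes=("x", "y")))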
@@ -455,17 +475,21 @@ def _get_bins(path_bins: Path) -> list[str]:
         projective /= projective[2, 2]
         if _projective_matrix_is_affine(projective):
             affine = Affine(projective, input_axes=("x", "y"), output_axes=("x", "y"))
-            if not load_segmentations_only:
-                set_transformation(image, affine, dataset_id)
+            set_transformation(image, affine, dataset_id)
         else:
+            # the projective matrix is not affine; we separate the affine part from the projective shift
+            # and apply the projective shift to the image
             affine_matrix, projective_shift = _decompose_projective_matrix(projective)
             affine = Affine(affine_matrix, input_axes=("x", "y"), output_axes=("x", "y"))
+
+            # determine the size of the transformed image
             bounding_box = get_extent(image, coordinate_system=dataset_id)
             x0, x1 = bounding_box["x"]
             y0, y1 = bounding_box["y"]
             x1 -= 1
             y1 -= 1
             corners = [(x0, y0), (x1, y0), (x1, y1), (x0, y1)]
+
             transformed_corners = []
             for x, y in corners:
                 px, py = _projective_matrix_transform_point(projective_shift, x, y)
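
_decompose_projective_matrix is private to this module, but the decomposition it refers to can be sketched as follows; the helper name and matrix values here are illustrative, not the module's code:

import numpy as np

def decompose_projective(P: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
    # split a 3x3 projective matrix into P = A @ S, with A the affine part
    # (last row [0, 0, 1]) and S the residual projective shift
    P = P / P[2, 2]
    A = P.copy()
    A[2] = [0.0, 0.0, 1.0]
    S = np.linalg.inv(A) @ P
    return A, S

P = np.array([[1.01, 0.02, 5.0], [0.01, 0.99, -3.0], [1e-6, 2e-6, 1.0]])
A, S = decompose_projective(P)
assert np.allclose(A @ S, P)  # the factors recompose the original matrix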
@@ -477,18 +501,24 @@ def _get_bins(path_bins: Path) -> list[str]:
                 np.max(transformed_corners_array[:, 0]),
                 np.max(transformed_corners_array[:, 1]),
             )
+            # the first two components are <= 0; since the cytassist image has a lot of padding,
+            # we can safely discard pixels with negative coordinates
             transformed_shape = (np.ceil(transformed_bounds[2]), np.ceil(transformed_bounds[3]))
+
+            # flip xy
             transformed_shape = (transformed_shape[1], transformed_shape[0])
+
+            # the cytassist image is a small, single-scale image, so we can compute it in memory
             numpy_data = image.transpose("y", "x", "c").data.compute()
             warped = warp(
                 numpy_data, ProjectiveTransform(projective_shift).inverse, output_shape=transformed_shape, order=1
             )
             warped = np.round(warped * 255).astype(np.uint8)
-            if not load_segmentations_only:
-                warped = Image2DModel.parse(
-                    warped, dims=("y", "x", "c"), transformations={dataset_id: affine}, rgb=True
-                )
-                images[dataset_id + "_cytassist_image"] = warped
+            warped = Image2DModel.parse(
+                warped, dims=("y", "x", "c"), transformations={dataset_id: affine}, rgb=True
+            )
+            # we replace the cytassist image with the warped image
+            images[dataset_id + "_cytassist_image"] = warped
     elif load_all_images:
         warnings.warn(
             f"No image path found containing the cytassist image: {VisiumHDKeys.IMAGE_CYTASSIST}",
@@ -498,23 +528,7 @@ def _get_bins(path_bins: Path) -> list[str]:
 
     sdata = SpatialData(tables=tables, images=images, shapes=shapes, labels=labels)
 
-    if annotate_table_by_labels and not load_segmentations_only:
-        for bin_size_str in bin_sizes:
-            shapes_name = dataset_id + "_" + bin_size_str
-            labels_name = f"{dataset_id}_{bin_size_str}_labels"
-            labels_element = rasterize_bins(
-                sdata,
-                bins=shapes_name,
-                table_name=bin_size_str,
-                row_key=VisiumHDKeys.ARRAY_ROW,
-                col_key=VisiumHDKeys.ARRAY_COL,
-                value_key=None,
-                return_region_as_labels=True,
-            )
-            sdata[labels_name] = labels_element
-            rasterize_bins_link_table_to_labels(
-                sdata=sdata, table_name=bin_size_str, rasterized_labels_name=labels_name
-            )
+
 
     return sdata
 
@@ -667,22 +681,22 @@ def _make_filtered_nucleus_adata(
     bin_col_name: str = "square_002um",
     aggregate_col_name: str = "cell_id",
 ) -> anndata.AnnData:
-    """Generate a filtered AnnData object by aggregating 2um binned data based on nucleus segmentation.
+    """Generate a filtered AnnData object by aggregating binned data (default 2um) based on nucleus segmentation.
 
-    Uses a 2um filtered_feature_bc_matrix.h5 file and a barcode_mappings.parquet file containing
+    Uses a filtered_feature_bc_matrix.h5 file and a barcode_mappings.parquet file containing
     barcode mappings, filters the data to include only valid nucleus mappings,
     and aggregates the data based on specified bin into cell IDs which only contain
     the 2um square data under segmented nuclei.
 
     Parameters:
     -----------
-    filtered_matrix_h5_path : Path
+    filtered_matrix_h5_path
         Path to the 10x Genomics HDF5 matrix file.
-    barcode_mappings_parquet_path : Path
+    barcode_mappings_parquet_path
         Path to the Parquet file containing barcode mappings.
-    bin_col_name : str, optional
+    bin_col_name
         Column name in the barcode mappings that specifies the spatial bin (default is 'square_002um').
-    aggregate_col_name : str, optional
+    aggregate_col_name
         Column name in the barcode mappings that specifies the aggregate cell ID (default is 'cell_id').
 
     Returns:
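
The aggregation the docstring describes can be sketched independently of the reader; all names and values below are illustrative:

import numpy as np
import pandas as pd
from scipy import sparse

counts = sparse.random(6, 4, density=0.5, format="csr")  # bins x genes
mapping = pd.Series(["cell_1", "cell_1", None, "cell_2", "cell_2", None])  # bin -> nucleus
valid = mapping.notna()
codes, cells = pd.factorize(mapping[valid])
# indicator matrix (cells x valid bins): entry 1 where a bin falls under a nucleus
indicator = sparse.csr_matrix(
    (np.ones(codes.size), (codes, np.arange(codes.size))),
    shape=(len(cells), int(valid.sum())),
)
cell_counts = indicator @ counts[valid.values]  # cells x genes, one row per nucleus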