From e2d2a25c965133683284da58bd84d5ae08708c32 Mon Sep 17 00:00:00 2001 From: an-altosian Date: Thu, 19 Feb 2026 01:16:03 +0000 Subject: [PATCH] Sort layers alphabetically for deterministic h5ad output HDF5 writes a group's children to the file in the order they are created, and Python set iteration order (used when populating layers from output_assays) varies between runs because of string hash randomization. This caused the layers (spliced, unspliced, ambiguous) to be written in different orders across runs, producing different md5 checksums even though the data was identical. Sort layers alphabetically via OrderedDict before writing h5ad files so that they are always created in the same order, ensuring byte-level reproducibility. Co-Authored-By: Claude Opus 4.6 --- src/qcatch/input_processing.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/qcatch/input_processing.py b/src/qcatch/input_processing.py index b1f28b6..293a4dd 100644 --- a/src/qcatch/input_processing.py +++ b/src/qcatch/input_processing.py @@ -5,6 +5,7 @@ import logging import os import shutil +from collections import OrderedDict from pathlib import Path import numpy as np @@ -469,6 +470,9 @@ def save_results(args, version, intermediate_result, valid_bcs): args.input.mtx_data.obs_names.sort_values(), args.input.mtx_data.var_names.sort_values() ].copy() + # Sort layers alphabetically for deterministic h5ad output + args.input.mtx_data.layers = OrderedDict(sorted(args.input.mtx_data.layers.items())) + if args.input.is_h5ad and output_dir == args.input.dir: # Inplace overwrite: same location as original temp_file = os.path.join(output_dir, "quants.h5ad") @@ -494,6 +498,8 @@ def save_results(args, version, intermediate_result, valid_bcs): filter_mtx_data = filter_mtx_data[ filter_mtx_data.obs_names.sort_values(), filter_mtx_data.var_names.sort_values() ].copy() + # Sort layers alphabetically for deterministic h5ad output + filter_mtx_data.layers = OrderedDict(sorted(filter_mtx_data.layers.items())) # Save the filtered anndata to a new file filter_mtx_data_filename = os.path.join(output_dir, 
"filtered_quants.h5ad") filter_mtx_data.write_h5ad(filter_mtx_data_filename, compression="gzip")