Skip to content

Commit 037a3e5

Browse files
committed
improved connectivity edges
1 parent 4c28866 commit 037a3e5

File tree

1 file changed

+92
-32
lines changed

1 file changed

+92
-32
lines changed

src/cellucid/prepare_data.py

Lines changed: 92 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -898,63 +898,123 @@ def export_data_for_web(
898898
print("INFO: No gene expression data provided, skipping var export.")
899899

900900
# Process connectivity data if provided
901+
# GPU-optimized edge format for instanced rendering
901902
if connectivities is not None:
902903
connectivity_manifest_path = out_dir / connectivity_manifest_filename
903904
if _file_exists_skip(connectivity_manifest_path, "connectivity manifest", force):
904905
pass
905906
else:
906907
connectivity_binary_dir = out_dir / connectivity_binary_dirname
907908
connectivity_binary_dir.mkdir(parents=True, exist_ok=True)
908-
909+
909910
if not sparse.isspmatrix_csr(connectivities):
910911
connectivities = sparse.csr_matrix(connectivities)
911-
912+
912913
if connectivities.shape[0] != n_cells or connectivities.shape[1] != n_cells:
913914
raise ValueError(
914915
f"Connectivity matrix shape {connectivities.shape} does not match "
915916
f"number of cells {n_cells}."
916917
)
917-
918+
919+
# Symmetrize and binarize the connectivity matrix
918920
connectivities_sym = connectivities + connectivities.T
919921
connectivities_sym.data[:] = 1
920922
connectivities_csr = connectivities_sym.tocsr()
921-
922-
indptr = connectivities_csr.indptr.astype(np.uint32)
923-
indices = connectivities_csr.indices.astype(np.uint32)
924-
925-
n_edges = len(indices) // 2
926-
927-
indptr_fname = "indptr.u32"
928-
indices_fname = "indices.u32"
929-
indptr_path = connectivity_binary_dir / indptr_fname
930-
indices_path = connectivity_binary_dir / indices_fname
931-
932-
_write_binary(indptr_path, indptr, compression)
933-
_write_binary(indices_path, indices, compression)
934-
935-
manifest_indptr = f"{connectivity_binary_dirname}/{indptr_fname}"
936-
manifest_indices = f"{connectivity_binary_dirname}/{indices_fname}"
923+
924+
# Determine optimal dtype based on cell count
925+
# uint16: up to 65,535 cells
926+
# uint32: up to 4,294,967,295 cells
927+
# uint64: up to 18,446,744,073,709,551,615 cells
928+
if n_cells <= 65535:
929+
index_dtype = np.uint16
930+
index_dtype_str = "uint16"
931+
index_bytes = 2
932+
elif n_cells <= 4294967295:
933+
index_dtype = np.uint32
934+
index_dtype_str = "uint32"
935+
index_bytes = 4
936+
else:
937+
index_dtype = np.uint64
938+
index_dtype_str = "uint64"
939+
index_bytes = 8
940+
941+
indptr = connectivities_csr.indptr
942+
indices = connectivities_csr.indices
943+
944+
# Extract unique edges (src < dst to avoid duplicates)
945+
# Using vectorized operations for speed with large datasets
946+
print(f" Extracting unique edges from {n_cells:,} cells...")
947+
948+
edge_sources = []
949+
edge_destinations = []
950+
max_neighbors_found = 0
951+
952+
# Process in chunks for memory efficiency with very large datasets
953+
chunk_size = 100000
954+
for chunk_start in range(0, n_cells, chunk_size):
955+
chunk_end = min(chunk_start + chunk_size, n_cells)
956+
for cell_idx in range(chunk_start, chunk_end):
957+
start = indptr[cell_idx]
958+
end = indptr[cell_idx + 1]
959+
neighbor_count = end - start
960+
if neighbor_count > max_neighbors_found:
961+
max_neighbors_found = neighbor_count
962+
963+
for j in range(start, end):
964+
neighbor_idx = indices[j]
965+
# Only keep edges where src < dst (avoid duplicates)
966+
if cell_idx < neighbor_idx:
967+
edge_sources.append(cell_idx)
968+
edge_destinations.append(neighbor_idx)
969+
970+
edge_sources = np.array(edge_sources, dtype=index_dtype)
971+
edge_destinations = np.array(edge_destinations, dtype=index_dtype)
972+
n_unique_edges = len(edge_sources)
973+
974+
print(f" Found {n_unique_edges:,} unique edges, max {max_neighbors_found} neighbors/cell")
975+
976+
# Sort edges by source, then by destination for optimal gzip compression
977+
print(f" Sorting edges for optimal compression...")
978+
sort_idx = np.lexsort((edge_destinations, edge_sources))
979+
edge_sources = edge_sources[sort_idx]
980+
edge_destinations = edge_destinations[sort_idx]
981+
982+
# Write binary files (column-separated for better compression)
983+
sources_fname = "edges.src.bin"
984+
dests_fname = "edges.dst.bin"
985+
sources_path = connectivity_binary_dir / sources_fname
986+
dests_path = connectivity_binary_dir / dests_fname
987+
988+
_write_binary(sources_path, edge_sources, compression)
989+
_write_binary(dests_path, edge_destinations, compression)
990+
991+
manifest_sources = f"{connectivity_binary_dirname}/{sources_fname}"
992+
manifest_dests = f"{connectivity_binary_dirname}/{dests_fname}"
937993
if compression:
938-
manifest_indptr += ".gz"
939-
manifest_indices += ".gz"
940-
994+
manifest_sources += ".gz"
995+
manifest_dests += ".gz"
996+
997+
# Write manifest
941998
connectivity_manifest_payload = {
942-
"n_points": int(n_cells),
943-
"n_edges": int(n_edges),
944-
"nnz": int(len(indices)),
945-
"format": "csr",
946-
"indptrPath": manifest_indptr,
947-
"indicesPath": manifest_indices,
948-
"dtype": "uint32",
999+
"format": "edge_pairs",
1000+
"n_cells": int(n_cells),
1001+
"n_edges": int(n_unique_edges),
1002+
"max_neighbors": int(max_neighbors_found),
1003+
"index_bytes": index_bytes,
1004+
"index_dtype": index_dtype_str,
1005+
"sourcesPath": manifest_sources,
1006+
"destinationsPath": manifest_dests,
9491007
"compression": compression if compression else None,
9501008
}
1009+
9511010
connectivity_manifest_path.write_text(
9521011
json.dumps(connectivity_manifest_payload), encoding="utf-8"
9531012
)
954-
1013+
9551014
print(
956-
f"✓ Wrote connectivity manifest ({n_edges:,} unique edges, ~{len(indices)//n_cells:.1f} neighbors/cell) "
957-
f"to {connectivity_manifest_path}"
1015+
f"✓ Wrote connectivity ({n_unique_edges:,} edges, "
1016+
f"max {max_neighbors_found} neighbors/cell, {index_dtype_str}) "
1017+
f"to {connectivity_binary_dir}"
9581018
)
9591019
else:
9601020
print("INFO: No connectivity data provided, skipping connectivity export.")

0 commit comments

Comments
 (0)