@@ -898,63 +898,123 @@ def export_data_for_web(
898898 print ("INFO: No gene expression data provided, skipping var export." )
899899
900900 # Process connectivity data if provided
901+ # GPU-optimized edge format for instanced rendering
901902 if connectivities is not None :
902903 connectivity_manifest_path = out_dir / connectivity_manifest_filename
903904 if _file_exists_skip (connectivity_manifest_path , "connectivity manifest" , force ):
904905 pass
905906 else :
906907 connectivity_binary_dir = out_dir / connectivity_binary_dirname
907908 connectivity_binary_dir .mkdir (parents = True , exist_ok = True )
908-
909+
909910 if not sparse .isspmatrix_csr (connectivities ):
910911 connectivities = sparse .csr_matrix (connectivities )
911-
912+
912913 if connectivities .shape [0 ] != n_cells or connectivities .shape [1 ] != n_cells :
913914 raise ValueError (
914915 f"Connectivity matrix shape { connectivities .shape } does not match "
915916 f"number of cells { n_cells } ."
916917 )
917-
918+
919+ # Symmetrize and binarize the connectivity matrix
918920 connectivities_sym = connectivities + connectivities .T
919921 connectivities_sym .data [:] = 1
920922 connectivities_csr = connectivities_sym .tocsr ()
921-
922- indptr = connectivities_csr .indptr .astype (np .uint32 )
923- indices = connectivities_csr .indices .astype (np .uint32 )
924-
925- n_edges = len (indices ) // 2
926-
927- indptr_fname = "indptr.u32"
928- indices_fname = "indices.u32"
929- indptr_path = connectivity_binary_dir / indptr_fname
930- indices_path = connectivity_binary_dir / indices_fname
931-
932- _write_binary (indptr_path , indptr , compression )
933- _write_binary (indices_path , indices , compression )
934-
935- manifest_indptr = f"{ connectivity_binary_dirname } /{ indptr_fname } "
936- manifest_indices = f"{ connectivity_binary_dirname } /{ indices_fname } "
923+
924+ # Determine optimal dtype based on cell count
925+ # uint16: up to 65,535 cells
926+ # uint32: up to 4,294,967,295 cells
927+ # uint64: up to 18,446,744,073,709,551,615 cells
928+ if n_cells <= 65535 :
929+ index_dtype = np .uint16
930+ index_dtype_str = "uint16"
931+ index_bytes = 2
932+ elif n_cells <= 4294967295 :
933+ index_dtype = np .uint32
934+ index_dtype_str = "uint32"
935+ index_bytes = 4
936+ else :
937+ index_dtype = np .uint64
938+ index_dtype_str = "uint64"
939+ index_bytes = 8
940+
941+ indptr = connectivities_csr .indptr
942+ indices = connectivities_csr .indices
943+
944+ # Extract unique edges (src < dst to avoid duplicates)
945+ # Using vectorized operations for speed with large datasets
946+ print (f" Extracting unique edges from { n_cells :,} cells..." )
947+
948+ edge_sources = []
949+ edge_destinations = []
950+ max_neighbors_found = 0
951+
952+ # Process in chunks for memory efficiency with very large datasets
953+ chunk_size = 100000
954+ for chunk_start in range (0 , n_cells , chunk_size ):
955+ chunk_end = min (chunk_start + chunk_size , n_cells )
956+ for cell_idx in range (chunk_start , chunk_end ):
957+ start = indptr [cell_idx ]
958+ end = indptr [cell_idx + 1 ]
959+ neighbor_count = end - start
960+ if neighbor_count > max_neighbors_found :
961+ max_neighbors_found = neighbor_count
962+
963+ for j in range (start , end ):
964+ neighbor_idx = indices [j ]
965+ # Only keep edges where src < dst (avoid duplicates)
966+ if cell_idx < neighbor_idx :
967+ edge_sources .append (cell_idx )
968+ edge_destinations .append (neighbor_idx )
969+
970+ edge_sources = np .array (edge_sources , dtype = index_dtype )
971+ edge_destinations = np .array (edge_destinations , dtype = index_dtype )
972+ n_unique_edges = len (edge_sources )
973+
974+ print (f" Found { n_unique_edges :,} unique edges, max { max_neighbors_found } neighbors/cell" )
975+
976+ # Sort edges by source, then by destination for optimal gzip compression
977+ print (f" Sorting edges for optimal compression..." )
978+ sort_idx = np .lexsort ((edge_destinations , edge_sources ))
979+ edge_sources = edge_sources [sort_idx ]
980+ edge_destinations = edge_destinations [sort_idx ]
981+
982+ # Write binary files (column-separated for better compression)
983+ sources_fname = "edges.src.bin"
984+ dests_fname = "edges.dst.bin"
985+ sources_path = connectivity_binary_dir / sources_fname
986+ dests_path = connectivity_binary_dir / dests_fname
987+
988+ _write_binary (sources_path , edge_sources , compression )
989+ _write_binary (dests_path , edge_destinations , compression )
990+
991+ manifest_sources = f"{ connectivity_binary_dirname } /{ sources_fname } "
992+ manifest_dests = f"{ connectivity_binary_dirname } /{ dests_fname } "
937993 if compression :
938- manifest_indptr += ".gz"
939- manifest_indices += ".gz"
940-
994+ manifest_sources += ".gz"
995+ manifest_dests += ".gz"
996+
997+ # Write manifest
941998 connectivity_manifest_payload = {
942- "n_points" : int (n_cells ),
943- "n_edges" : int (n_edges ),
944- "nnz" : int (len (indices )),
945- "format" : "csr" ,
946- "indptrPath" : manifest_indptr ,
947- "indicesPath" : manifest_indices ,
948- "dtype" : "uint32" ,
999+ "format" : "edge_pairs" ,
1000+ "n_cells" : int (n_cells ),
1001+ "n_edges" : int (n_unique_edges ),
1002+ "max_neighbors" : int (max_neighbors_found ),
1003+ "index_bytes" : index_bytes ,
1004+ "index_dtype" : index_dtype_str ,
1005+ "sourcesPath" : manifest_sources ,
1006+ "destinationsPath" : manifest_dests ,
9491007 "compression" : compression if compression else None ,
9501008 }
1009+
9511010 connectivity_manifest_path .write_text (
9521011 json .dumps (connectivity_manifest_payload ), encoding = "utf-8"
9531012 )
954-
1013+
9551014 print (
956- f"✓ Wrote connectivity manifest ({ n_edges :,} unique edges, ~{ len (indices )// n_cells :.1f} neighbors/cell) "
957- f"to { connectivity_manifest_path } "
1015+ f"✓ Wrote connectivity ({ n_unique_edges :,} edges, "
1016+ f"max { max_neighbors_found } neighbors/cell, { index_dtype_str } ) "
1017+ f"to { connectivity_binary_dir } "
9581018 )
9591019 else :
9601020 print ("INFO: No connectivity data provided, skipping connectivity export." )
0 commit comments