@@ -10,7 +10,7 @@ class SubSample(IngestFiles):
1010 MAX_THRESHOLD = 100_000
1111 SUBSAMPLE_THRESHOLDS = [MAX_THRESHOLD , 20_000 , 10_000 , 1_000 ]
1212
13- def __init__ (self , * , cluster_file = None , cell_metadata_file = None ):
13+ def __init__ (self , cluster_file , cell_metadata_file = None ):
1414 IngestFiles .__init__ (
1515 self , cluster_file , self .ALLOWED_FILE_TYPES , open_as = 'dataframe'
1616 )
@@ -75,6 +75,7 @@ def bin(self, annotation: Tuple[str, str]):
7575 columns = copy .copy (self .coordinates_and_cell_names )
7676 # coordinates, cell names and annotation name
7777 columns .append (annotation [0 ])
78+ # Subset of df where header is [cell_names, x, y, z, <annot_name>]
7879 subset = self .file [columns ].copy ()
7980 subset .sort_values (by = [annotation ], inplace = True )
8081 # Generates 20 bins
@@ -100,10 +101,10 @@ def subsample(self):
100101 # Dict of value lists keyed by cell names and the x, y, and z coordinates
101102 points = {k : [] for k in self .coordinates_and_cell_names }
102103 num_per_group = int (sample_size / group_size )
103-
104+ cells_left = sample_size
104105 # bin is a pair: (unique value in column, dataframe)
105106 for bin in self .return_sorted_bin (anotation_dict , annotation_name ):
106- cells_left = sample_size
107+
107108 amount_of_rows = len (bin [1 ].index )
108109 # If the amount of sampled values is larger
109110 # than the whole array, take the whole array
0 commit comments