broadinstitute
diff --git a/ingest/ingest_pipeline.py b/ingest/ingest_pipeline.py
Lines changed: 1 addition & 2 deletions
diff --git a/ingest/subsample.py b/ingest/subsample.py
Lines changed: 4 additions & 3 deletions
diff --git a/tests/data/subsample_metadata_test.csv b/tests/data/subsample_metadata_test.csv
Lines changed: 1 addition & 0 deletions
diff --git a/tests/data/test_1k_cluster_Data.csv b/tests/data/test_1k_cluster_Data.csv
Lines changed: 0 additions & 1 deletion
diff --git a/tests/data/test_1k_cluster_data.csv b/tests/data/test_1k_cluster_data.csv
Lines changed: 1 addition & 0 deletions
@@ -230,8 +230,7 @@ def has_valid_metadata_convention(self):
         """ Determines if cell metadata file follows metadata convention"""
         with open(self.JSON_CONVENTION, 'r') as f:
             convention = json.load(f)
-
-        validate_input_metadata(self.cell_metadata, convention)
+            validate_input_metadata(self.cell_metadata, convention)
         return not report_issues(self.cell_metadata)
 
     def ingest_expression(self) -> None:
 
@@ -10,7 +10,7 @@ class SubSample(IngestFiles):
     MAX_THRESHOLD = 100_000
     SUBSAMPLE_THRESHOLDS = [MAX_THRESHOLD, 20_000, 10_000, 1_000]
 
-    def __init__(self, *, cluster_file=None, cell_metadata_file=None):
+    def __init__(self, cluster_file, cell_metadata_file=None):
         IngestFiles.__init__(
             self, cluster_file, self.ALLOWED_FILE_TYPES, open_as='dataframe'
         )
@@ -75,6 +75,7 @@ def bin(self, annotation: Tuple[str, str]):
             columns = copy.copy(self.coordinates_and_cell_names)
             # coordinates, cell names and annotation name
             columns.append(annotation[0])
+            # Subset of df where header is [cell_names, x, y, z, <annot_name>]
             subset = self.file[columns].copy()
             subset.sort_values(by=[annotation], inplace=True)
             # Generates 20 bins
@@ -100,10 +101,10 @@ def subsample(self):
                 # Dict of values for the x, y, and z coordinates
                 points = {k: [] for k in self.coordinates_and_cell_names}
                 num_per_group = int(sample_size / group_size)
-
+                cells_left = sample_size
                 # bin = ("unique value in column" : dataframe)
                 for bin in self.return_sorted_bin(anotation_dict, annotation_name):
-                    cells_left = sample_size
+
                     amount_of_rows = len(bin[1].index)
                     # If the amount of sampled values is larger
                     # than the whole array, take the whole array