Skip to content

Commit d0a2e87

Browse files
Merge pull request #38 from broadinstitute/ea-test-subsample
Write QA Test for Subsampling
2 parents 76ebc3d + 593c268 commit d0a2e87

File tree

8 files changed

+2223
-3506
lines changed

8 files changed

+2223
-3506
lines changed

ingest/ingest_pipeline.py

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -230,8 +230,7 @@ def has_valid_metadata_convention(self):
230230
""" Determines if cell metadata file follows metadata convention"""
231231
with open(self.JSON_CONVENTION, 'r') as f:
232232
convention = json.load(f)
233-
234-
validate_input_metadata(self.cell_metadata, convention)
233+
validate_input_metadata(self.cell_metadata, convention)
235234
return not report_issues(self.cell_metadata)
236235

237236
def ingest_expression(self) -> None:

ingest/subsample.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ class SubSample(IngestFiles):
1010
MAX_THRESHOLD = 100_000
1111
SUBSAMPLE_THRESHOLDS = [MAX_THRESHOLD, 20_000, 10_000, 1_000]
1212

13-
def __init__(self, *, cluster_file=None, cell_metadata_file=None):
13+
def __init__(self, cluster_file, cell_metadata_file=None):
1414
IngestFiles.__init__(
1515
self, cluster_file, self.ALLOWED_FILE_TYPES, open_as='dataframe'
1616
)
@@ -75,6 +75,7 @@ def bin(self, annotation: Tuple[str, str]):
7575
columns = copy.copy(self.coordinates_and_cell_names)
7676
# coordinates, cell names and annotation name
7777
columns.append(annotation[0])
78+
# Subset of df where header is [cell_names, x, y, z, <annot_name>]
7879
subset = self.file[columns].copy()
7980
subset.sort_values(by=[annotation], inplace=True)
8081
# Generates 20 bins
@@ -100,10 +101,10 @@ def subsample(self):
100101
# Dict of values for the x, y, and z coordinates
101102
points = {k: [] for k in self.coordinates_and_cell_names}
102103
num_per_group = int(sample_size / group_size)
103-
104+
cells_left = sample_size
104105
# bin = ("unique value in column" : dataframe)
105106
for bin in self.return_sorted_bin(anotation_dict, annotation_name):
106-
cells_left = sample_size
107+
107108
amount_of_rows = len(bin[1].index)
108109
# If the amount of sampled values is larger
109110
# than the whole array, take the whole array

tests/data/subsample_metadata_test.csv

Lines changed: 1 addition & 0 deletions
Large diffs are not rendered by default.

tests/data/test_1k_cluster_Data.csv

Lines changed: 0 additions & 1 deletion
This file was deleted.

tests/data/test_1k_cluster_data.csv

Lines changed: 1 addition & 0 deletions
Large diffs are not rendered by default.

0 commit comments

Comments
 (0)