Skip to content

Commit 52cb0ba

Browse files
authored
Merge pull request #237 from broadinstitute/development
Release 1.14.2
2 parents 8685a99 + 6fff171 commit 52cb0ba

File tree

6 files changed

+67
-12
lines changed

6 files changed

+67
-12
lines changed

ingest/annotations.py

Lines changed: 11 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -105,7 +105,10 @@ def preprocess(self, is_metadata_convention=False):
105105
self.annot_types[0] = self.annot_types[0].upper()
106106
if self.validate_unique_header():
107107
self.create_data_frame()
108-
self.preprocess_numeric_annot(is_metadata_convention)
108+
try:
109+
self.preprocess_numeric_annot(is_metadata_convention)
110+
except ValueError as v:
111+
raise ValueError(v)
109112
else:
110113
msg = (
111114
"Unable to parse file - Duplicate annotation header names are not allowed. \n"
@@ -119,7 +122,13 @@ def preprocess_numeric_annot(self, is_metadata_convention):
119122
# Metadata convention can contain arrays that have numeric or string values.
120123
# Therefore dtypes for numeric annotations are skipped.
121124
if not is_metadata_convention:
122-
self.file = Annotations.coerce_numeric_values(self.file, self.annot_types)
125+
try:
126+
self.file = Annotations.coerce_numeric_values(
127+
self.file, self.annot_types
128+
)
129+
except ValueError as v:
130+
# self.store_validation_issue("error", v, "content:type:not-numeric")
131+
raise ValueError(v)
123132

124133
@staticmethod
125134
def convert_header_to_multi_index(df, header_names: List[Tuple]):

ingest/clusters.py

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -55,7 +55,10 @@ def __init__(
5555
for i, header in enumerate(self.headers):
5656
if header in ["X", "Y", "Z"]:
5757
self.headers[i] = self.headers[i].lower()
58-
self.preprocess()
58+
try:
59+
self.preprocess()
60+
except ValueError as v:
61+
raise ValueError(v)
5962
self.determine_coordinates_and_cell_names()
6063
self.source_file_type = "cluster"
6164
self.cluster_type = (

ingest/config.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -64,7 +64,7 @@ def set_mixpanel_nums(self):
6464
"""Derive count for each type of Mixpanel property
6565
"""
6666
for prop in ["errorTypes", "errors", "warningTypes", "warnings"]:
67-
num_prop = "num" + prop.capitalize()
67+
num_prop = "num" + prop[0].upper() + prop[1:]
6868
if self.__properties.get(prop):
6969
self.__properties[num_prop] = len(self.__properties[prop])
7070

ingest/ingest_pipeline.py

Lines changed: 33 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -176,14 +176,32 @@ def initialize_file_connection(self, file_type, file_path):
176176
File object.
177177
"""
178178
file_connections = {"cell_metadata": CellMetadata, "cluster": Clusters}
179-
180-
return file_connections.get(file_type)(
181-
file_path,
182-
self.study_id,
183-
self.study_file_id,
184-
tracer=self.tracer,
185-
**self.kwargs,
186-
)
179+
try:
180+
return file_connections.get(file_type)(
181+
file_path,
182+
self.study_id,
183+
self.study_file_id,
184+
tracer=self.tracer,
185+
**self.kwargs,
186+
)
187+
except ValueError as v:
188+
# Caution: recording errorTypes in this manner can clobber other collected errors.
189+
# ValueErrors during file connection indicate file cannot be processed
190+
# this logging approach should not lose collected file validation information
191+
if str(v).startswith("could not convert"):
192+
config.get_metric_properties().update(
193+
{"errorTypes": ["content:type:not-numeric"]}
194+
)
195+
elif str(v).startswith("Unable to parse"):
196+
config.get_metric_properties().update(
197+
{"errorTypes": ["format:cap:unique"]}
198+
)
199+
else:
200+
config.get_metric_properties().update(
201+
{"errorTypes": ["parse:unhandled"]}
202+
)
203+
self.report_validation("failure")
204+
raise ValueError(v)
187205

188206
def insert_many(self, collection_name, documents):
189207
if not config.bypass_mongo_writes():
@@ -427,6 +445,13 @@ def subsample(self):
427445
if load_status != 0:
428446
return load_status
429447
else:
448+
# Caution: recording errorTypes in this manner can clobber other collected errors.
449+
# In subsampling, known failure modes are ValueErrors which stop processing so
450+
# this logging approach should not lose file validation information
451+
config.get_metric_properties().update(
452+
{"errorTypes": ["content:missing:values-across-files"]}
453+
)
454+
self.report_validation("failure")
430455
raise ValueError(
431456
"Cluster file has cell names that are not present in cell metadata file."
432457
)

tests/data/cluster_non-numeric.txt

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,7 @@
1+
NAME X Y Z Category Intensity
2+
TYPE numeric numeric numeric group numeric
3+
CELL_0001 34.472 32.211 60.035 C pos
4+
CELL_0002 15.975 10.043 21.424 B pos
5+
CELL_0003 -11.688 -53.645 -58.374 A neg
6+
CELL_0004 30.04 31.138 33.597 B pos
7+
CELL_0005 23.862 33.092 26.904 B pos

tests/test_cluster.py

Lines changed: 11 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -114,3 +114,14 @@ def test_missing_coordinate_column_values_false(self):
114114
"testCluster",
115115
)
116116
self.assertFalse(cluster.require_X_Y_not_nan())
117+
118+
def test_numeric_false(self):
119+
"""Ensures numeric annotations have numeric values
120+
"""
121+
with self.assertRaises(ValueError):
122+
cluster = Clusters(
123+
"../tests/data/cluster_non-numeric.txt",
124+
"dec0dedfeed1111111111111",
125+
"addedfeed000000000000000",
126+
"testCluster",
127+
)

0 commit comments

Comments
 (0)