Merge pull request #237 from broadinstitute/development

jlchang · web-flow · commit 52cb0ba39ef8 · 2022-03-01T12:36:42.000-05:00
Release 1.14.2
diff --git a/ingest/annotations.py b/ingest/annotations.py
@@ -105,7 +105,10 @@ def preprocess(self, is_metadata_convention=False):
         self.annot_types[0] = self.annot_types[0].upper()
         if self.validate_unique_header():
             self.create_data_frame()
-            self.preprocess_numeric_annot(is_metadata_convention)
+            try:
+                self.preprocess_numeric_annot(is_metadata_convention)
+            except ValueError as v:
+                raise ValueError(v)
         else:
             msg = (
                 "Unable to parse file - Duplicate annotation header names are not allowed. \n"
@@ -119,7 +122,13 @@ def preprocess_numeric_annot(self, is_metadata_convention):
         # Metadata convention can contain arrays that have numeric or string values.
         # Therefore dtypes for numeric annotations are skipped.
         if not is_metadata_convention:
-            self.file = Annotations.coerce_numeric_values(self.file, self.annot_types)
+            try:
+                self.file = Annotations.coerce_numeric_values(
+                    self.file, self.annot_types
+                )
+            except ValueError as v:
+                # self.store_validation_issue("error", v, "content:type:not-numeric")
+                raise ValueError(v)
 
     @staticmethod
     def convert_header_to_multi_index(df, header_names: List[Tuple]):
diff --git a/ingest/clusters.py b/ingest/clusters.py
@@ -55,7 +55,10 @@ def __init__(
         for i, header in enumerate(self.headers):
             if header in ["X", "Y", "Z"]:
                 self.headers[i] = self.headers[i].lower()
-        self.preprocess()
+        try:
+            self.preprocess()
+        except ValueError as v:
+            raise ValueError(v)
         self.determine_coordinates_and_cell_names()
         self.source_file_type = "cluster"
         self.cluster_type = (
diff --git a/ingest/config.py b/ingest/config.py
@@ -64,7 +64,7 @@ def set_mixpanel_nums(self):
         """Derive count for each type of Mixpanel property
         """
         for prop in ["errorTypes", "errors", "warningTypes", "warnings"]:
-            num_prop = "num" + prop.capitalize()
+            num_prop = "num" + prop[0].upper() + prop[1:]
             if self.__properties.get(prop):
                 self.__properties[num_prop] = len(self.__properties[prop])
 
diff --git a/ingest/ingest_pipeline.py b/ingest/ingest_pipeline.py
@@ -176,14 +176,32 @@ def initialize_file_connection(self, file_type, file_path):
                 File object.
         """
         file_connections = {"cell_metadata": CellMetadata, "cluster": Clusters}
-
-        return file_connections.get(file_type)(
-            file_path,
-            self.study_id,
-            self.study_file_id,
-            tracer=self.tracer,
-            **self.kwargs,
-        )
+        try:
+            return file_connections.get(file_type)(
+                file_path,
+                self.study_id,
+                self.study_file_id,
+                tracer=self.tracer,
+                **self.kwargs,
+            )
+        except ValueError as v:
+            # Caution: recording errorTypes in this manner can clobber other collected errors.
+            # A ValueError during file connection indicates the file cannot be processed, so
+            # this logging approach should not lose collected file validation information.
+            if str(v).startswith("could not convert"):
+                config.get_metric_properties().update(
+                    {"errorTypes": ["content:type:not-numeric"]}
+                )
+            elif str(v).startswith("Unable to parse"):
+                config.get_metric_properties().update(
+                    {"errorTypes": ["format:cap:unique"]}
+                )
+            else:
+                config.get_metric_properties().update(
+                    {"errorTypes": ["parse:unhandled"]}
+                )
+            self.report_validation("failure")
+            raise ValueError(v)
 
     def insert_many(self, collection_name, documents):
         if not config.bypass_mongo_writes():
@@ -427,6 +445,13 @@ def subsample(self):
                         if load_status != 0:
                             return load_status
                 else:
+                    # Caution: recording errorTypes in this manner can clobber other collected errors.
+                    # In subsampling, the known failure modes are ValueErrors, which stop processing, so
+                    # this logging approach should not lose file validation information.
+                    config.get_metric_properties().update(
+                        {"errorTypes": ["content:missing:values-across-files"]}
+                    )
+                    self.report_validation("failure")
                     raise ValueError(
                         "Cluster file has cell names that are not present in cell metadata file."
                     )
diff --git a/tests/data/cluster_non-numeric.txt b/tests/data/cluster_non-numeric.txt
@@ -0,0 +1,7 @@
+NAME	X	Y	Z	Category	Intensity
+TYPE	numeric	numeric	numeric	group	numeric
+CELL_0001	34.472	32.211	60.035	C	pos
+CELL_0002	15.975	10.043	21.424	B	pos
+CELL_0003	-11.688	-53.645	-58.374	A	neg
+CELL_0004	30.04	31.138	33.597	B	pos
+CELL_0005	23.862	33.092	26.904	B	pos
diff --git a/tests/test_cluster.py b/tests/test_cluster.py
@@ -114,3 +114,14 @@ def test_missing_coordinate_column_values_false(self):
             "testCluster",
         )
         self.assertFalse(cluster.require_X_Y_not_nan())
+
+    def test_numeric_false(self):
+        """Ensures numeric annotations have numeric values
+        """
+        with self.assertRaises(ValueError):
+            cluster = Clusters(
+                "../tests/data/cluster_non-numeric.txt",
+                "dec0dedfeed1111111111111",
+                "addedfeed000000000000000",
+                "testCluster",
+            )