Merge pull request #92 from broadinstitute/ea-validate-annotation-files

knapii-developments · web-flow · commit a5ded6345906 · 2020-05-07T12:46:41.000-04:00
Ea validate annotation files
diff --git a/ingest/annotations.py b/ingest/annotations.py
@@ -240,9 +240,9 @@ def validate_against_header_count(self):
         return valid
 
     def validate_format(self):
-        """Check all metadata file format criteria for file validity
+        """Check common format criteria for annotation files
         """
-        self.is_valid_file = all(
+        return all(
             [
                 self.validate_header_keyword(),
                 self.validate_type_keyword(),
@@ -251,4 +251,3 @@ def validate_format(self):
                 self.validate_against_header_count(),
             ]
         )
-        return self.is_valid_file
diff --git a/ingest/cell_metadata.py b/ingest/cell_metadata.py
@@ -7,13 +7,14 @@
 PREREQUISITES
 Must have python 3.6 or higher.
 """
+import collections
+import ntpath
 from collections import defaultdict
-from typing import Dict, Generator, List, Tuple, Union  # noqa: F401
 from dataclasses import dataclass
-from mypy_extensions import TypedDict
+from typing import Dict, Generator, List, Tuple, Union  # noqa: F401
+
 from bson.objectid import ObjectId
-import ntpath
-import collections
+from mypy_extensions import TypedDict
 
 try:
     # Used when importing internally and in tests
@@ -35,7 +36,7 @@ def __init__(
         study_id: ObjectId,
         study_file_id: ObjectId,
         *args,
-        **kwargs
+        **kwargs,
     ):
 
         self.study_accession = kwargs.pop("study_accession")
@@ -60,6 +61,33 @@ class Model(TypedDict):
         # unique values from "group" type annotations
         values: List
 
+    # Will evolve to do cross file validation
+    def validate(self):
+        """ Runs all validation checks
+        """
+        return all([self.is_valid_format()])
+
+    def is_valid_format(self):
+        """Validates format by calling all format validation methods"""
+        return all(
+            [self.validate_header_for_coordinate_values(), self.validate_format()]
+        )
+
+    def validate_header_for_coordinate_values(self):
+        """Cell metadata files should not have coordinates in header
+        :return: boolean True if coordinates are not in header, otherwise False
+        """
+        lower_cased_headers = [header.lower() for header in self.headers]
+        valid = not any(
+            [coordinate in ('x', 'y', 'z') for coordinate in lower_cased_headers]
+        )
+        if valid:
+            return True
+        else:
+            msg = 'Header names can not be coordinate values x, y, or z (case insensitive)'
+            self.store_validation_issue('error', 'format', msg)
+            return False
+
     def transform(self):
         """ Builds cell metadata model"""
         AnnotationModel = collections.namedtuple(
diff --git a/ingest/clusters.py b/ingest/clusters.py
@@ -1,8 +1,9 @@
-from typing import Dict, Generator, List, Tuple, Union  # noqa: F401
-from dataclasses import dataclass
-from mypy_extensions import TypedDict
 import logging
+from dataclasses import dataclass
+from typing import Dict, Generator, List, Tuple, Union  # noqa: F401
+
 from bson.objectid import ObjectId
+from mypy_extensions import TypedDict
 
 try:
     from ingest_files import DataArray
@@ -75,6 +76,32 @@ def __init__(
         self.extra_log_params = {'study_id': self.study_id, 'duration': None}
         self.preprocess()
 
+    # Will evolve to do cross file validation
+    def validate(self):
+        """ Runs all validation checks
+        """
+        return all([self.is_valid_format()])
+
+    def is_valid_format(self):
+        """Validates format by calling all format validation methods"""
+        return all(
+            [self.validate_header_for_coordinate_values(), self.validate_format()]
+        )
+
+    def validate_header_for_coordinate_values(self):
+        """Cluster files must have coordinates 'x' and 'y' in header
+        :return: boolean True if coordinates are in header, otherwise False
+        """
+        lower_cased_headers = [header.lower() for header in self.headers]
+        for coordinate in ('x', 'y'):
+            if coordinate not in lower_cased_headers:
+                msg = (
+                    "Header must have coordinate values 'x' and 'y' (case insensitive)"
+                )
+                self.store_validation_issue('error', 'format', msg)
+                return False
+        return True
+
     def transform(self):
         """ Builds cluster data model"""
         # Array of Hash objects that describe all extra "annotation" columns
diff --git a/ingest/dense.py b/ingest/dense.py
@@ -10,6 +10,7 @@
 
 import collections
 from typing import List  # noqa: F401
+
 from bson.objectid import ObjectId
 
 try:
diff --git a/ingest/ingest_files.py b/ingest/ingest_files.py
@@ -279,7 +279,9 @@ def open_pandas(self, file_path, file_type, **kwargs):
                 delimiter = ","
             else:
                 delimiter = None
-            dialect = csv.Sniffer().sniff(open_file_object.readline(), delimiters=delimiter)
+            dialect = csv.Sniffer().sniff(
+                open_file_object.readline(), delimiters=delimiter
+            )
             dialect.skipinitialspace = True
             open_file_object.seek(0)
             return pd.read_csv(file_path, dialect=dialect, **kwargs)
diff --git a/ingest/ingest_pipeline.py b/ingest/ingest_pipeline.py
@@ -233,6 +233,7 @@ def load(
             return 1
         return 0
 
+    # @profile
     def load_expression_file(self, models, is_gene_model=False):
         collection_name = self.matrix.COLLECTION_NAME
         # Creates operations to perform for bulk write
@@ -337,9 +338,8 @@ def upload_metadata_to_bq(self):
                 return 1
         return 0
 
-    @trace
-    @my_debug_logger()
-    # @profile
+    # @trace
+    # @my_debug_logger()
     def ingest_expression(self) -> int:
         """Ingests expression files.
         """
@@ -405,7 +405,7 @@ def ingest_expression(self) -> int:
     # @my_debug_logger()
     def ingest_cell_metadata(self):
         """Ingests cell metadata files into Firestore."""
-        if self.cell_metadata.validate_format():
+        if self.cell_metadata.validate():
             self.info_logger.info(
                 f'Cell metadata file format valid', extra=self.extra_log_params
             )
@@ -451,7 +451,7 @@ def ingest_cell_metadata(self):
     @my_debug_logger()
     def ingest_cluster(self):
         """Ingests cluster files."""
-        if self.cluster.validate_format():
+        if self.cluster.validate():
             annotation_model = self.cluster.transform()
             status = self.load(
                 self.cluster.COLLECTION_NAME,
diff --git a/tests/data/cluster_bad.txt b/tests/data/cluster_bad.txt
@@ -1,16 +1 @@
-NAME	X	Y
-CELL_0001	34.472	32.211
-CELL_0002	15.975	10.043
-CELL_0003	-11.688	-53.645
-CELL_0004	30.04	31.138
-CELL_0005	23.862	33.092
-CELL_0006	-39.07	-14.64
-CELL_0007	40.039	27.206
-CELL_0008	28.755	27.187
-CELL_0009	-48.601	-13.512
-CELL_00010	14.653	27.832
-CELL_00011	20.603	32.071
-CELL_00012	-10.333	-51.733
-CELL_00013	-52.966	-12.484
-CELL_00014	38.513	26.969
-CELL_00015	12.838	13.047
+NAME	XCELL_0001	34.472CELL_0002	15.975CELL_0003	-11.688CELL_0004	30.04CELL_0005	23.862CELL_0006	-39.07CELL_0007	40.039CELL_0008	28.755CELL_0009	-48.601CELL_00010	14.653CELL_00011	20.603CELL_00012	-10.333CELL_00013	-52.966CELL_00014	38.513CELL_00015	12.838
diff --git a/tests/data/cluster_bad_missing_coordinate.txt b/tests/data/cluster_bad_missing_coordinate.txt
@@ -0,0 +1 @@
+NAME	XTYPE	groupCELL_0001	34.472CELL_0002	15.975CELL_0003	-11.688CELL_0004	30.04CELL_0005	23.862CELL_0006	-39.07CELL_0007	40.039CELL_0008	28.755CELL_0009	-48.601CELL_00010	14.653CELL_00011	20.603CELL_00012	-10.333CELL_00013	-52.966CELL_00014	38.513CELL_00015	12.838
diff --git a/tests/data/metadata_bad.txt b/tests/data/metadata_bad.txt
@@ -1,16 +1 @@
-NAME	CLUSTER	SUB-CLUSTER
-CELL_0001	CLST_A	CLST_A_1
-CELL_0002	CLST_A	CLST_A_1
-CELL_0003	CLST_A	CLST_A_1
-CELL_0004	CLST_A	CLST_A_2
-CELL_0005	CLST_A	CLST_A_2
-CELL_0006	CLST_B	CLST_B_1
-CELL_0007	CLST_B	CLST_B_1
-CELL_0008	CLST_B	CLST_B_2
-CELL_0009	CLST_B	CLST_B_2
-CELL_00010	CLST_B	CLST_B_2
-CELL_00011	CLST_C	CLST_C_1
-CELL_00012	CLST_C	CLST_C_1
-CELL_00013	CLST_C	CLST_C_1
-CELL_00014	CLST_C	CLST_C_2
-CELL_00015	CLST_C	CLST_C_2
+NAME	CLUSTER	SUB-CLUSTER	XTYPE	GROUP	GROUP	NUMERICCELL_0001	CLST_A	CLST_A_1	1CELL_0002	CLST_A	CLST_A_1	2CELL_0003	CLST_A	CLST_A_1	3CELL_0004	CLST_A	CLST_A_2	4CELL_0005	CLST_A	CLST_A_2	5CELL_0006	CLST_B	CLST_B_1	6CELL_0007	CLST_B	CLST_B_1	7CELL_0008	CLST_B	CLST_B_2	8CELL_0009	CLST_B	CLST_B_2	9CELL_00010	CLST_B	CLST_B_2	10CELL_00011	CLST_C	CLST_C_1	11CELL_00012	CLST_C	CLST_C_1	12CELL_00013	CLST_C	CLST_C_1	13CELL_00014	CLST_C	CLST_C_2	14CELL_00015	CLST_C	CLST_C_2	15			
diff --git a/tests/data/metadata_bad_contains_coordinates.txt b/tests/data/metadata_bad_contains_coordinates.txt
@@ -0,0 +1 @@
+NAME	CLUSTER	SUB-CLUSTER	X		TYPE	group	group	numeric		CELL_0001	CLST_A	CLST_A_1	1		CELL_0002	CLST_A	CLST_A_1	2		CELL_0003	CLST_A	CLST_A_1	3		CELL_0004	CLST_A	CLST_A_2	4		CELL_0005	CLST_A	CLST_A_2	5		CELL_0006	CLST_B	CLST_B_1	6		CELL_0007	CLST_B	CLST_B_1	7		CELL_0008	CLST_B	CLST_B_2	8		CELL_0009	CLST_B	CLST_B_2	9		cCELL_00010	CLST_B	CLST_B_2	10		CELL_00011	CLST_C	CLST_C_1	11		CELL_00012	CLST_C	CLST_C_1	12		CELL_00013	CLST_C	CLST_C_1	13		CELL_00014	CLST_C	CLST_C_2	14		CELL_00015	CLST_C	CLST_C_2	15							
diff --git a/tests/data/metadata_example.txt b/tests/data/metadata_example.txt
@@ -14,4 +14,4 @@ CELL_00011	CLST_C	CLST_C_1	0.638
 CELL_00012	CLST_C	CLST_C_1	8.888
 CELL_00013	CLST_C	CLST_C_1	-2.27
 CELL_00014	CLST_C	CLST_C_2	-2.606
-CELL_00015	CLST_C	CLST_C_2	-9.089z
+CELL_00015	CLST_C	CLST_C_2	-9.089
diff --git a/tests/test_cell_metadata.py b/tests/test_cell_metadata.py
@@ -0,0 +1,33 @@
+import sys
+import unittest
+
+sys.path.append("../ingest")
+from cell_metadata import CellMetadata
+
+
+class TestCellMetadata(unittest.TestCase):
+    def test_validate_header_for_coordinate_values_false(self):
+        """Ensures validate_header_for_coordinate_values returns false when
+        coordinate value is in metadata file
+         """
+        cm = CellMetadata(
+            '../tests/data/metadata_bad_contains_coordinates.txt',
+            '5d276a50421aa9117c982845',
+            '5dd5ae25421aa910a723a337',
+            study_accession='SCP2',
+            tracer=None,
+        )
+        self.assertFalse(cm.validate_header_for_coordinate_values())
+
+    def test_validate_header_for_coordinate_values_true(self):
+        """Ensures validate_header_for_coordinate_values returns true when
+        coordinate value is not in metadata file
+         """
+        cm = CellMetadata(
+            '../tests/data/metadata_example.txt',
+            '5d276a50421aa9117c982845',
+            '5dd5ae25421aa910a723a337',
+            study_accession='SCP2',
+            tracer=None,
+        )
+        self.assertTrue(cm.validate_header_for_coordinate_values())
diff --git a/tests/test_cluster.py b/tests/test_cluster.py
@@ -0,0 +1,31 @@
+import sys
+import unittest
+
+sys.path.append("../ingest")
+from clusters import Clusters
+
+
+class TestCellMetadata(unittest.TestCase):
+    def test_validate_header_for_coordinate_values_false(self):
+        """Ensures validate_header_for_coordinate_values returns false when
+         coordinate is missing in header
+        """
+        cluster = Clusters(
+            '../tests/data/cluster_bad_missing_coordinate.txt',
+            '5d276a50421aa9117c982845',
+            '5dd5ae25421aa910a723a337',
+            'testCluster',
+        )
+        self.assertFalse(cluster.validate_header_for_coordinate_values())
+
+    def test_validate_header_for_coordinate_values_true(self):
+        """Ensures validate_header_for_coordinate_values returns true when
+        coordinate value is in cluster file
+         """
+        cluster = Clusters(
+            '../tests/data/cluster_example.txt',
+            '5d276a50421aa9117c982845',
+            '5dd5ae25421aa910a723a337',
+            'testCluster',
+        )
+        self.assertTrue(cluster.validate_header_for_coordinate_values())
diff --git a/tests/test_ingest.py b/tests/test_ingest.py
@@ -378,6 +378,28 @@ def test_bad_metadata_file(self):
             exit_pipeline(ingest, status, status_cell_metadata, arguments)
         self.assertEqual(cm.exception.code, 1)
 
+    def test_bad_metadata_file_contains_coordinates(self):
+        """Ingest Pipeline should not succeed for metadata file containing
+        coordinates
+        """
+        args = [
+            '--study-id',
+            '5d276a50421aa9117c982845',
+            '--study-file-id',
+            '5dd5ae25421aa910a723a337',
+            'ingest_cell_metadata',
+            '--cell-metadata-file',
+            '../tests/data/metadata_bad_contains_coordinates.txt',
+            '--study-accession',
+            'SCP123',
+            '--ingest-cell-metadata',
+        ]
+        ingest, arguments, status, status_cell_metadata = self.setup_ingest(args)
+
+        with self.assertRaises(SystemExit) as cm:
+            exit_pipeline(ingest, status, status_cell_metadata, arguments)
+        self.assertEqual(cm.exception.code, 1)
+
     def test_good_cluster_file(self):
         """Ingest Pipeline should succeed for properly formatted cluster file
         """
@@ -424,6 +446,27 @@ def test_bad_cluster_file(self):
             exit_pipeline(ingest, status, status_cell_metadata, arguments)
         self.assertEqual(cm.exception.code, 1)
 
+    def test_bad_cluster_missing_coordinate_file(self):
+        """Ingest Pipeline should fail for missing coordinate in cluster file
+        """
+        args = [
+            '--study-id',
+            '5d276a50421aa9117c982845',
+            '--study-file-id',
+            '5dd5ae25421aa910a723a337',
+            'ingest_cluster',
+            '--cluster-file',
+            '../tests/data/cluster_bad_missing_coordinate.txt',
+            '--ingest-cluster',
+            '--name',
+            'cluster1',
+        ]
+        ingest, arguments, status, status_cell_metadata = self.setup_ingest(args)
+
+        with self.assertRaises(SystemExit) as cm:
+            exit_pipeline(ingest, status, status_cell_metadata, arguments)
+        self.assertEqual(cm.exception.code, 1)
+
     # def test_ingest_loom(self):
     #     """Ingest Pipeline should extract and transform loom files
     #     """

Original file line number	Diff line number	Diff line change
`@@ -240,9 +240,9 @@ def validate_against_header_count(self):`
`240`	`240`	`return valid`
`241`	`241`
`242`	`242`	`def validate_format(self):`
`243`		`- """Check all metadata file format criteria for file validity`
	`243`	`+ """Check common format criteria for annotation files`
`244`	`244`	`"""`
`245`		`- self.is_valid_file = all(`
	`245`	`+ return all(`
`246`	`246`	`[`
`247`	`247`	`self.validate_header_keyword(),`
`248`	`248`	`self.validate_type_keyword(),`
`@@ -251,4 +251,3 @@ def validate_format(self):`
`251`	`251`	`self.validate_against_header_count(),`
`252`	`252`	`]`
`253`	`253`	`)`
`254`		`- return self.is_valid_file`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+NAME XTYPE groupCELL_0001 34.472CELL_0002 15.975CELL_0003 -11.688CELL_0004 30.04CELL_0005 23.862CELL_0006 -39.07CELL_0007 40.039CELL_0008 28.755CELL_0009 -48.601CELL_00010 14.653CELL_00011 20.603CELL_00012 -10.333CELL_00013 -52.966CELL_00014 38.513CELL_00015 12.838`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	+NAME CLUSTER SUB-CLUSTER X TYPE group group numeric CELL_0001 CLST_A CLST_A_1 1 CELL_0002 CLST_A CLST_A_1 2 CELL_0003 CLST_A CLST_A_1 3 CELL_0004 CLST_A CLST_A_2 4 CELL_0005 CLST_A CLST_A_2 5 CELL_0006 CLST_B CLST_B_1 6 CELL_0007 CLST_B CLST_B_1 7 CELL_0008 CLST_B CLST_B_2 8 CELL_0009 CLST_B CLST_B_2 9 cCELL_00010 CLST_B CLST_B_2 10 CELL_00011 CLST_C CLST_C_1 11 CELL_00012 CLST_C CLST_C_1 12 CELL_00013 CLST_C CLST_C_1 13 CELL_00014 CLST_C CLST_C_2 14 CELL_00015 CLST_C CLST_C_2 15