Skip to content

Commit eac7584

Browse files
Merge pull request #39 from broadinstitute/ea-metadata-convention-access
Store metadata convention for Ingest Pipeline to access
2 parents 0692a46 + 0dba75b commit eac7584

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

42 files changed

+3982
-4406
lines changed

.circleci/config.yml

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,9 @@
44
#
55
version: 2.1
66

7+
orbs:
8+
codecov: codecov/[email protected]
9+
710
jobs:
811
build:
912
docker:
@@ -61,9 +64,7 @@ jobs:
6164
export FIRESTORE_EMULATOR_HOST=localhost:8081
6265
. venv/bin/activate
6366
cd tests
64-
coverage run -m pytest
65-
coverage report --include *scp-ingest-pipeline/ingest*
66-
coverage html --include *scp-ingest-pipeline/ingest*
67+
pytest --cov-report=xml --cov=../ingest/
6768
68-
- store_artifacts:
69-
path: tests/htmlcov
69+
- codecov/upload:
70+
file: tests/coverage.xml

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
File Ingest Pipeline for Single Cell Portal
33

44
[![Build status](https://img.shields.io/circleci/build/github/broadinstitute/scp-ingest-pipeline.svg)](https://circleci.com/gh/broadinstitute/scp-ingest-pipeline)
5+
[![Code coverage](https://codecov.io/gh/broadinstitute/scp-ingest-pipeline/branch/master/graph/badge.svg)](https://codecov.io/gh/broadinstitute/scp-ingest-pipeline)
56

67
The SCP Ingest Pipeline is an ETL pipeline for single-cell RNA-seq data.
78

ingest/cell_metadata.py

Lines changed: 61 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -46,12 +46,12 @@ def __init__(self, values: List, cell_names: List):
4646

4747

4848
class CellMetadata(IngestFiles):
49-
ALLOWED_FILE_TYPES = ["text/csv", "text/plain", "text/tab-separated-values"]
49+
ALLOWED_FILE_TYPES = ['text/csv', 'text/plain', 'text/tab-separated-values']
5050

5151
def __init__(self, file_path, file_id: str, study_accession: str, *args, **kwargs):
5252

5353
IngestFiles.__init__(
54-
self, file_path, self.ALLOWED_FILE_TYPES, open_as="dataframe"
54+
self, file_path, self.ALLOWED_FILE_TYPES, open_as='dataframe'
5555
)
5656
self.headers = self.file.columns.get_level_values(0)
5757
self.annot_types = self.file.columns.get_level_values(1)
@@ -61,14 +61,13 @@ def __init__(self, file_path, file_id: str, study_accession: str, *args, **kwarg
6161
# lambda below initializes new key with nested dictionary as value and avoids KeyError
6262
self.issues = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
6363
self.ontology = defaultdict(lambda: defaultdict(list))
64-
self.type = defaultdict(list)
6564
self.cells = []
6665
self.is_valid_file = self.validate_format()
6766

6867
@dataclass
6968
class Model:
70-
COLLECTION_NAME = "cell_metadata"
71-
SUBCOLLECTION_NAME = "data"
69+
COLLECTION_NAME = 'cell_metadata'
70+
SUBCOLLECTION_NAME = 'data'
7271
annot_type: str
7372
doc: Document
7473
subdoc: SubDocument
@@ -84,12 +83,12 @@ def preproccess(self):
8483
self.file.rename(columns={name: name.upper(), type: type.upper()}, inplace=True)
8584
# Make sure group annotations are treated as strings
8685
group_columns = self.file.xs(
87-
"group", axis=1, level=1, drop_level=False
86+
'group', axis=1, level=1, drop_level=False
8887
).columns.tolist()
8988
self.file[group_columns] = self.file[group_columns].astype(str)
9089
# Find numeric columns, round to 3 decimals places, and cast to floats
9190
numeric_columns = self.file.xs(
92-
"numeric", axis=1, level=1, drop_level=False
91+
'numeric', axis=1, level=1, drop_level=False
9392
).columns.tolist()
9493
self.file[numeric_columns] = self.file[numeric_columns].round(3).astype(float)
9594

@@ -102,18 +101,18 @@ def transform(self):
102101
yield self.Model(
103102
column_type,
104103
{
105-
"name": col_name,
106-
"study_accession": self.study_accession,
104+
'name': col_name,
105+
'study_accession': self.study_accession,
107106
# save unique values for group type annotations
108-
"unique_values": list(self.file[column].unique())
109-
if column_type == "group"
107+
'unique_values': list(self.file[column].unique())
108+
if column_type == 'group'
110109
else [],
111-
"annotation_type": column_type,
112-
"file_id": self.file_id,
110+
'annotation_type': column_type,
111+
'file_id': self.file_id,
113112
},
114113
{
115-
"cell_names": list(self.file.iloc[:, 0]),
116-
"values": list(self.file[column]),
114+
'cell_names': list(self.file.iloc[:, 0]),
115+
'values': list(self.file[column]),
117116
},
118117
)
119118

@@ -132,8 +131,8 @@ def chunk_subdocuments(self, doc_name: str, doc_path: str, model: Model) -> Dict
132131
Subdocuments that are under 1,048,576 bytes.
133132
"""
134133

135-
size_of_cell_names_field = 10 + 1 # "cell_names" is 10 characters
136-
size_of_values_field = 6 + 1 # "values" is 6 characters
134+
size_of_cell_names_field = 10 + 1 # 'cell_names' is 10 characters
135+
size_of_values_field = 6 + 1 # 'values' is 6 characters
137136
starting_sum = (
138137
+len(doc_name)
139138
+ 1
@@ -149,17 +148,17 @@ def chunk_subdocuments(self, doc_name: str, doc_path: str, model: Model) -> Dict
149148
sum = starting_sum
150149
annot_type = model.annot_type
151150
# All cells names:[] that are in subdoc
152-
cell_names = model.subdoc["cell_names"]
151+
cell_names = model.subdoc['cell_names']
153152
# All values:[] that are in subdoc
154-
values = model.subdoc["values"]
153+
values = model.subdoc['values']
155154

156155
for index, (cell_name, value) in enumerate(zip(cell_names, values)):
157156

158157
cell_name_storage = len(cell_name) + 1 + size_of_cell_names_field
159158

160159
# Check annotation type because float and string values have
161160
# different storage values
162-
if annot_type == "numeric":
161+
if annot_type == 'numeric':
163162
value_storage = size_of_values_field + float_storage
164163
else:
165164
value_storage = len(value) + 1 + size_of_values_field
@@ -175,10 +174,10 @@ def chunk_subdocuments(self, doc_name: str, doc_path: str, model: Model) -> Dict
175174
end_index = index - 1
176175
# TODO: This can turn into a logging statement
177176
# Please do not remove this. It's needed for testing
178-
print(f"{sum}, {index}, {start_index}, {end_index}")
177+
print(f'{sum}, {index}, {start_index}, {end_index}')
179178
yield {
180-
"cell_names": cell_names[start_index:end_index],
181-
"values": values[start_index:end_index],
179+
'cell_names': cell_names[start_index:end_index],
180+
'values': values[start_index:end_index],
182181
}
183182
# Reset sum and add storage size at current index
184183
sum = starting_sum + cell_name_storage + value_storage
@@ -206,30 +205,37 @@ def validate_header_keyword(self):
206205
"""
207206

208207
valid = False
209-
if self.headers[0].upper() == "NAME":
208+
if self.headers[0].upper() == 'NAME':
210209
valid = True
211-
if self.headers[0] != "NAME":
212-
# ToDO - capture warning below in error report
213-
msg = (
214-
f'Warning: metadata file keyword "NAME" provided as '
215-
f"{self.headers[0]}"
216-
)
210+
if self.headers[0] != 'NAME':
211+
msg = f'Metadata file keyword "NAME" provided as ' f"{self.headers[0]}"
217212
self.store_validation_issue('warn', 'format', msg)
218213
else:
219-
msg = 'Error: Metadata file header row malformed, missing NAME. (Case Sensitive)'
214+
msg = 'Malformed metadata file header row, missing NAME. (Case Sensitive)'
220215
self.store_validation_issue('error', 'format', msg)
221216
return valid
222217

223218
def validate_unique_header(self):
224-
"""Check all metadata header names are unique.
219+
"""Check all metadata header names are unique and not empty.
225220
:return: boolean True if valid, False otherwise
226221
"""
227222
valid = False
228223
unique_headers = set(self.headers)
229224
if len(unique_headers) == len(self.headers):
230225
valid = True
231-
if any("Unnamed" in s for s in list(unique_headers)):
232-
msg = "Error: Headers cannot contain empty values"
226+
else:
227+
seen_headers = set()
228+
duplicate_headers = set()
229+
for x in self.headers:
230+
if x in seen_headers or seen_headers.add(x):
231+
duplicate_headers.add(x)
232+
msg = (
233+
f'Duplicated metadata header names are not allowed: {duplicate_headers}'
234+
)
235+
self.store_validation_issue('error', 'format', msg)
236+
valid = False
237+
if any('Unnamed' in s for s in list(unique_headers)):
238+
msg = 'Headers cannot contain empty values'
233239
self.store_validation_issue('error', 'format', msg)
234240
valid = False
235241
return valid
@@ -239,18 +245,13 @@ def validate_type_keyword(self):
239245
:return: boolean True if valid, False otherwise
240246
"""
241247
valid = False
242-
if self.annot_types[0].upper() == "TYPE":
248+
if self.annot_types[0].upper() == 'TYPE':
243249
valid = True
244-
if self.annot_types[0] != "TYPE":
245-
# ToDO - capture warning below in issue report
246-
# investigate f-string formatting here
247-
msg = (
248-
'Warning: Metadata file keyword TYPE provided as '
249-
'{self.metadata_types[0]}'
250-
)
250+
if self.annot_types[0] != 'TYPE':
251+
msg = f'Metadata file keyword "TYPE" provided as {self.annot_types[0]}'
251252
self.store_validation_issue('warn', 'format', msg)
252253
else:
253-
msg = 'Error: Metadata file TYPE row malformed, missing TYPE'
254+
msg = 'Malformed metadata TYPE row, missing TYPE. (Case Sensitive)'
254255
self.store_validation_issue('error', 'format', msg)
255256
return valid
256257

@@ -268,10 +269,17 @@ def validate_type_annotations(self):
268269
# string for error reporting
269270
if 'Unnamed' in t:
270271
invalid_types.append('<empty value>')
272+
# Duplicated metadata header name causes type annotation issue.
273+
# Side effect of Pandas adding a suffix to uniquefy the header.
274+
# These invalid annotations should not be included in invalid
275+
# type annotation count. This exception may cause miscount of
276+
# type annot errors if user-supplied annotation has period.
277+
elif '.' in t:
278+
pass
271279
else:
272280
invalid_types.append(t)
273281
if invalid_types:
274-
msg = 'Error: TYPE declarations should be group or numeric'
282+
msg = 'TYPE row annotations should be "group" or "numeric"'
275283
self.store_validation_issue('error', 'format', msg, invalid_types)
276284
else:
277285
valid = True
@@ -294,7 +302,7 @@ def validate_against_header_count(self):
294302
)
295303
if not len_headers == len_annot_type:
296304
msg = (
297-
f'Error: {len_annot_type} TYPE declarations '
305+
f'Header mismatch: {len_annot_type} TYPE declarations '
298306
f'for {len_headers} column headers'
299307
)
300308
self.store_validation_issue('error', 'format', msg)
@@ -305,10 +313,12 @@ def validate_against_header_count(self):
305313
def validate_format(self):
306314
"""Check all metadata file format criteria for file validity
307315
"""
308-
return (
309-
self.validate_header_keyword()
310-
and self.validate_type_keyword()
311-
and self.validate_type_annotations()
312-
and self.validate_unique_header()
313-
and self.validate_against_header_count()
316+
return all(
317+
[
318+
self.validate_header_keyword(),
319+
self.validate_type_keyword(),
320+
self.validate_type_annotations(),
321+
self.validate_unique_header(),
322+
self.validate_against_header_count(),
323+
]
314324
)

ingest/ingest_files.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,7 @@ def open_file(self, file_path, open_as=None, start_point: int = 0):
8888
file_connections = {
8989
"text/csv": self.open_csv(open_file),
9090
"text/plain": open_file,
91+
"application/json": open_file,
9192
"text/tab-separated-values": self.open_tsv(open_file),
9293
"dataframe": self.open_pandas,
9394
}

ingest/ingest_pipeline.py

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,8 @@
1919
python ingest_pipeline.py --study-accession SCP1 --file-id 123abc ingest_cell_metadata --cell-metadata-file ../tests/data/metadata_valid.tsv --ingest-cell-metadata
2020
2121
# Ingest Cell Metadata file against convention
22-
python ingest_pipeline.py --study-accession SCP1 --file-id 123abc ingest_cell_metadata --cell-metadata-file ../tests/data/metadata_valid.tsv --ingest-cell-metadata --validate-convention
22+
!! Please note that you must have permission to the SCP bucket
23+
python ingest_pipeline.py --study-accession SCP1 --file-id 123abc ingest_cell_metadata --cell-metadata-file ../tests/data/valid_array_v1.1.3.tsv --ingest-cell-metadata --validate-convention
2324
2425
# Ingest dense file
2526
python ingest_pipeline.py --study-accession SCP1 --file-id 123abc ingest_expression --taxon-name 'Homo sapiens' --taxon-common-name human --ncbi-taxid 9606 --matrix-file ../tests/data/dense_matrix_19_genes_100k_cells.txt --matrix-file-type dense
@@ -47,21 +48,18 @@
4748
from google.api_core import exceptions
4849
from google.cloud import firestore
4950
from mtx import Mtx
51+
from ingest_files import IngestFiles
5052
from subsample import SubSample
5153
from loom import Loom
52-
from validation.validate_metadata import (
53-
collect_jsonschema_errors,
54-
validate_collected_ontology_data,
55-
report_issues,
56-
)
54+
from validation.validate_metadata import validate_input_metadata, report_issues
5755

5856
# Ingest file types
5957
EXPRESSION_FILE_TYPES = ["dense", "mtx", "loom"]
6058

6159

6260
class IngestPipeline(object):
6361
# File location for metadata json convention
64-
JSON_CONVENTION = 'DoNotTouch/AMC_v0.8.json'
62+
JSON_CONVENTION = 'gs://fc-bcc55e6c-bec3-4b2e-9fb2-5e1526ddfcd2/metadata_conventions/AMC_v1.1.3/AMC_v1.1.3.json'
6563

6664
def __init__(
6765
self,
@@ -233,10 +231,11 @@ def load_subsample(self, doc):
233231
def has_valid_metadata_convention(self):
234232
""" Determines if cell metadata file follows metadata convention"""
235233
with open(self.JSON_CONVENTION, 'r') as f:
236-
convention = json.load(f)
234+
json_file = IngestFiles(self.JSON_CONVENTION, ['application/json'])
235+
convention = json.load(json_file.file)
236+
validate_input_metadata(self.cell_metadata, convention)
237237

238-
collect_jsonschema_errors(self.cell_metadata, convention)
239-
validate_collected_ontology_data(self.cell_metadata, convention)
238+
f.close()
240239
return not report_issues(self.cell_metadata)
241240

242241
def ingest_expression(self) -> None:
@@ -278,6 +277,7 @@ def ingest_cell_metadata(self):
278277
if self.kwargs['validate_convention'] is not None:
279278
if self.kwargs['validate_convention']:
280279
if self.has_valid_metadata_convention():
280+
print("it works!")
281281
pass
282282
else:
283283
return 1

0 commit comments

Comments
 (0)