Merge pull request #235 from broadinstitute/development

jlchang · web-flow · commit 8685a99f8d60 · 2022-02-18T10:10:20.000-05:00
Release 1.14.1
diff --git a/ingest/cell_metadata.py b/ingest/cell_metadata.py
@@ -58,6 +58,7 @@ def __init__(
         self.ontology = defaultdict(lambda: defaultdict(list))
         self.ordered_ontology = defaultdict(list)
         self.ordered_labels = defaultdict(list)
+        self.synonym_updates = defaultdict(lambda: defaultdict(str))
         self.cells = []
         self.numeric_array_columns = {}
         self.kwargs = kwargs
diff --git a/ingest/validation/validate_metadata.py b/ingest/validation/validate_metadata.py
@@ -37,6 +37,7 @@
 import copy
 import itertools
 import math
+import pandas as pd
 
 import colorama
 from colorama import Fore
@@ -1208,6 +1209,13 @@ def validate_collected_ontology_data(metadata, convention):
                                 (ontology_id, ontology_label)
                             ],
                         )
+                else:
+                    property_header = property_name + "__ontology_label"
+                    matched_label_for_id = label_and_synonyms.get("label")
+                    if ontology_label != matched_label_for_id:
+                        metadata.synonym_updates[property_header][
+                            ontology_label
+                        ] = matched_label_for_id
             except ValueError as value_error:
                 metadata.store_validation_issue(
                     "error",
@@ -1402,6 +1410,81 @@ def detect_excel_drag(metadata, convention):
     return excel_drag
 
 
+def to_1D(series):
+    """ Flatten a pandas Series (or any iterable) of lists into one flat list
+    """
+    return list(itertools.chain.from_iterable(series))
+
+
+def replace_single_value_array(df, metadata_name, synonym, label):
+    """ Synonym replacement (in-place) for single-value array metadata
+    Cells of df[metadata_name] equal to [synonym] become a new [label] list;
+    all other cells are kept as they are. The column is rebuilt in one pass
+    because pandas does not operate well on list-valued cells.
+    """
+    column = df[metadata_name]
+    df[metadata_name] = [[label] if cell == [synonym] else cell for cell in column]
+
+
+def replace_synonym_in_multivalue_array(df, metadata_name, substitutions):
+    """ Synonym replacement (in-place) for multi-value array ontology labels
+
+    Every list-valued cell of df[metadata_name] that contains at least one
+    synonym is replaced by a new list in which each synonym is swapped for
+    its ontology label; element order and list length are preserved.
+
+    df: pandas DataFrame, modified in place
+    metadata_name: name of the column holding lists of ontology labels
+    substitutions: dict mapping synonym -> ontology label to store instead
+    """
+
+    def substitute(cell):
+        # non-list cells (such as a NaN placeholder) are passed through unchanged
+        if not isinstance(cell, list):
+            return cell
+        # cells without synonyms are returned as-is so they stay untouched
+        if not any(term in substitutions for term in cell):
+            return cell
+        # single pass over the terms: a substituted label is never looked up
+        # again, so a label that is also a synonym key is not replaced twice
+        return [substitutions.get(term, term) for term in cell]
+
+    df[metadata_name] = [substitute(cell) for cell in df[metadata_name]]
+
+
+def replace_synonyms(metadata):
+    """
+    Update BigQuery data to store ontology labels and not synonyms
+
+    Rewrites "<metadata.study_file_id>.json" (newline-delimited JSON, resolved
+    against the working directory): for each column named in
+    metadata.synonym_updates, synonyms are swapped for their ontology labels.
+
+    metadata: object providing study_file_id and synonym_updates, a dict of
+        column name -> {synonym: ontology label}
+    """
+    bq_filename = str(metadata.study_file_id) + ".json"
+    # dtype/date inference is disabled so that values in other columns (e.g.
+    # numeric-looking IDs) are written back as they were read
+    df = pd.read_json(bq_filename, lines=True, dtype=False, convert_dates=False)
+    for metadata_name, updates in metadata.synonym_updates.items():
+        column = df[metadata_name]
+        # non-array metadata values are strings (only the first row is checked)
+        if isinstance(column.iloc[0], str):
+            # assign the result back to df: an in-place replace on a selected
+            # column is a chained assignment, which pandas Copy-on-Write ignores
+            df[metadata_name] = column.replace(dict(updates))
+        # Pandas can't hash mutable complex objects like lists
+        # treated as single-value arrays when flattening keeps the length
+        elif len(column) == len(to_1D(column)):
+            for synonym, label in updates.items():
+                replace_single_value_array(df, metadata_name, synonym, label)
+        # at least one non-single array-type metadata
+        else:
+            replace_synonym_in_multivalue_array(df, metadata_name, updates)
+    df.to_json(bq_filename, orient="records", lines=True)
+
+
 def validate_input_metadata(metadata, convention, bq_json=None):
     """Wrapper function to run validation functions
     """
@@ -1415,6 +1498,8 @@ def validate_input_metadata(metadata, convention, bq_json=None):
         # long-compute-time issue (if false positives are possible, bypass will be needed)
         dev_logger.info('Validating ontology content against EBI OLS')
         validate_collected_ontology_data(metadata, convention)
+        if metadata.synonym_updates:
+            replace_synonyms(metadata)
         confirm_uniform_units(metadata, convention)
 
 
diff --git a/tests/data/annotation/metadata/convention/df.json b/tests/data/annotation/metadata/convention/df.json
@@ -0,0 +1,5 @@
+{"CellID": "BM01_16dpp_AAGCAGTGGTAT", "disease__time_since_onset": [12.0, 2.0], "disease__time_since_onset__unit": "UO_0000035", "organ_region": ["MBA_000000944"], "organ_region__ontology_label": ["Folium-tuber vermis (VII)"], "disease__treated": [false, false], "species": "NCBITaxon_9606", "species__ontology_label": "human", "geographical_region": "GAZ_00003181", "geographical_region__ontology_label": "Boston", "library_preparation_protocol": "EFO_0008919", "library_preparation_protocol__ontology_label": "Seq-Well", "organ": "UBERON_0001913", "organ__ontology_label": "milk", "sex": "female", "is_living": "yes", "organism_age__unit": "UO_0000036", "organism_age__unit_label": "year", "ethnicity__ontology_label": ["European"], "ethnicity": ["HANCESTRO_0005"], "organism_age": 31.0, "disease": ["MONDO_0005015", "MONDO_0006849"], "disease__ontology_label": ["diabetes", "mastitis"], "cell_type": "CL_0000066", "cell_type__ontology_label": "epithelial cell", "donor_id": "BM01", "biosample_id": "BM01_16dpp_r3", "biosample_type": "PrimaryBioSample_BodyFluid", "preservation_method": "Fresh", "disease__time_since_onset__unit_label": "month", "study_accession": "SCPdev", "file_id": "dec0dedfeed1111111111111", "metadata_convention_version": "2.2.0", "organism_age__seconds": 978285600.0}
+{"CellID": "BM01_16dpp_TAAGCAGTGGTA", "disease__time_since_onset": [1.0], "disease__time_since_onset__unit": "UO_0000035", "organ_region": ["MBA_000000302", "MBA_000000294", "MBA_000000795"], "organ_region__ontology_label": ["Superior colliculus, sensory related", "Superior colliculus, motor related", "Periaqueductal gray"], "disease__treated": [false], "species": "NCBITaxon_9606", "species__ontology_label": "Homo Sapiens", "geographical_region": "GAZ_00003181", "geographical_region__ontology_label": "Boston", "library_preparation_protocol": "EFO_0008919", "library_preparation_protocol__ontology_label": "Seq-Well", "organ": "UBERON_0001913", "organ__ontology_label": "milk", "sex": "female", "is_living": "yes", "organism_age__unit": "UO_0000036", "organism_age__unit_label": "year", "ethnicity__ontology_label": ["white"], "ethnicity": ["HANCESTRO_0005"], "organism_age": 31.0, "disease": ["MONDO_0005709"], "disease__ontology_label": ["common cold"], "cell_type": "CL_0000066", "cell_type__ontology_label": "epithelial cell", "donor_id": "BM01", "biosample_id": "BM01_16dpp_r3", "biosample_type": "PrimaryBioSample_BodyFluid", "preservation_method": "Fresh", "disease__time_since_onset__unit_label": "month", "study_accession": "SCPdev", "file_id": "dec0dedfeed1111111111111", "metadata_convention_version": "2.2.0", "organism_age__seconds": 978285600.0}
+{"CellID": "BM01_16dpp_CTAAGCAGTGGT", "disease__time_since_onset": [24.0, 2.0], "disease__time_since_onset__unit": "UO_0000035", "organ_region": ["MBA_000000714", "MBA_000000972"], "disease__treated": [true, false], "species": "NCBITaxon_9606", "species__ontology_label": "Homo sapiens", "geographical_region": "GAZ_00003181", "geographical_region__ontology_label": "Boston", "library_preparation_protocol": "EFO_0008919", "library_preparation_protocol__ontology_label": "Seq-Well", "organ": "UBERON_0001913", "organ__ontology_label": "milk", "sex": "female", "is_living": "yes", "organism_age__unit": "UO_0000036", "organism_age__unit_label": "year", "ethnicity__ontology_label": ["British"], "ethnicity": ["HANCESTRO_0462"], "organism_age": 31.0, "disease": ["MONDO_0005015", "MONDO_0005709"], "disease__ontology_label": ["diabetes mellitus", "common cold"], "cell_type": "CL_0000066", "cell_type__ontology_label": "epithelial cell", "donor_id": "BM01", "biosample_id": "BM01_16dpp_r3", "biosample_type": "PrimaryBioSample_BodyFluid", "preservation_method": "Fresh", "disease__time_since_onset__unit_label": "month", "organ_region__ontology_label": ["Orbital area", "Prelimbic area"], "study_accession": "SCPdev", "file_id": "dec0dedfeed1111111111111", "metadata_convention_version": "2.2.0", "organism_age__seconds": 978285600.0}
+{"CellID": "BM01_16dpp_CGGTAAACCATT", "disease__time_since_onset": [36.0, 3.0, 1.0], "disease__time_since_onset__unit": "UO_0000035", "organ_region": ["MBA_000001041"], "organ_region__ontology_label": ["Paraflocculus"], "disease__treated": [true, false, false], "species": "NCBITaxon_9606", "species__ontology_label": "Homo sapiens", "geographical_region": "GAZ_00003181", "geographical_region__ontology_label": "Boston", "library_preparation_protocol": "EFO_0008919", "library_preparation_protocol__ontology_label": "Seq-Well", "organ": "UBERON_0001913", "organ__ontology_label": "milk", "sex": "female", "is_living": "yes", "organism_age__unit": "UO_0000036", "organism_age__unit_label": "year", "ethnicity": ["HANCESTRO_0462"], "organism_age": 31.0, "disease": ["MONDO_0005015", "MONDO_0006849", "MONDO_0005709"], "disease__ontology_label": ["diabetes", "breast infection", "common cold"], "cell_type": "CL_0000066", "cell_type__ontology_label": "epithelial cell", "donor_id": "BM01", "biosample_id": "BM01_16dpp_r3", "biosample_type": "PrimaryBioSample_BodyFluid", "preservation_method": "Fresh", "disease__time_since_onset__unit_label": "month", "ethnicity__ontology_label": ["British"], "study_accession": "SCPdev", "file_id": "dec0dedfeed1111111111111", "metadata_convention_version": "2.2.0", "organism_age__seconds": 978285600.0}
+{"CellID": "BM01_16dpp_CCGAATTCACCG", "disease__time_since_onset": [0.0], "disease__time_since_onset__unit": "UO_0000035", "organ_region": ["MBA_000000909", "MBA_000000502"], "organ_region__ontology_label": ["Entorhinal area", "Subiculum"], "disease__treated": [false], "species": "NCBITaxon_9606", "species__ontology_label": "Homo sapiens", "geographical_region": "GAZ_00003181", "geographical_region__ontology_label": "Boston", "library_preparation_protocol": "EFO_0008919", "library_preparation_protocol__ontology_label": "Seq-Well", "organ": "UBERON_0001913", "organ__ontology_label": "milk", "sex": "female", "is_living": "yes", "organism_age__unit": "UO_0000036", "organism_age__unit_label": "year", "ethnicity__ontology_label": ["Caucasian"], "ethnicity": ["HANCESTRO_0005"], "organism_age": 31.0, "disease": ["MONDO_0000001"], "disease__ontology_label": ["disease or disorder"], "cell_type": "CL_0000066", "cell_type__ontology_label": "epithelial cell", "donor_id": "BM01", "biosample_id": "BM01_16dpp_r3", "biosample_type": "PrimaryBioSample_BodyFluid", "preservation_method": "Fresh", "disease__time_since_onset__unit_label": "month", "study_accession": "SCPdev", "file_id": "dec0dedfeed1111111111111", "metadata_convention_version": "2.2.0", "organism_age__seconds": 978285600.0}
diff --git a/tests/test_validate_metadata.py b/tests/test_validate_metadata.py
@@ -23,6 +23,7 @@
 from unittest.mock import patch
 import io
 import numpy as np
+import pandas as pd
 
 sys.path.append("../ingest")
 sys.path.append("../ingest/validation")
@@ -40,6 +41,8 @@
     MAX_HTTP_ATTEMPTS,
     is_empty_string,
     is_label_or_synonym,
+    replace_single_value_array,
+    replace_synonym_in_multivalue_array,
 )
 
 
@@ -890,12 +893,56 @@ def test_will_allow_synonym_matches(self):
         self.assertTrue(
             metadata.validate_format(), "Valid metadata headers should not elicit error"
         )
-        validate_input_metadata(metadata, convention)
+        validate_input_metadata(metadata, convention, bq_json=True)
         self.assertFalse(
             report_issues(metadata), "Valid ontology content should not elicit error"
         )
         self.teardown_metadata(metadata)
 
+    def test_array_synonym_replacement(self):
+        """Synonyms in array-valued ontology labels should be replaced in place"""
+        data = "../tests/data/annotation/metadata/convention/df.json"
+        df = pd.read_json(data, lines=True)
+
+        metadata_name = "ethnicity__ontology_label"
+        whites_before = df[metadata_name].tolist().count(["white"])
+        replace_single_value_array(df, metadata_name, "white", "European")
+        whites_after = df[metadata_name].tolist().count(["white"])
+        self.assertEqual(
+            whites_before, 1, "original df should have one instance of ['white']"
+        )
+        self.assertEqual(
+            whites_after, 0, "resulting df should have no instances of ['white']"
+        )
+        self.assertEqual(
+            df[metadata_name].tolist(),
+            [["European"], ["European"], ["British"], ["British"], ["Caucasian"]],
+            "['white'] should be replaced with ['European'], other rows unchanged",
+        )
+
+        metadata_name = "disease__ontology_label"
+        orig_values = list(df[metadata_name].transform(tuple).unique())
+        replace = {"diabetes": "diabetes mellitus", "breast infection": "mastitis"}
+        replace_synonym_in_multivalue_array(df, metadata_name, replace)
+        replaced_values = list(df[metadata_name].transform(tuple).unique())
+        expected_result = [
+            ('diabetes mellitus', 'mastitis'),
+            ('common cold',),
+            ('diabetes mellitus', 'common cold'),
+            ('diabetes mellitus', 'mastitis', 'common cold'),
+            ('disease or disorder',),
+        ]
+        self.assertNotEqual(
+            orig_values,
+            replaced_values,
+            "multi-value array names should be different after replacement",
+        )
+        self.assertEqual(
+            replaced_values,
+            expected_result,
+            "multi-value array names should match expected result",
+        )
+
     def test_validate_nonconventional_numeric_content(self):
         """Nonconventional numeric metadata values should all validate as numeric
         """