Skip to content

Commit 8685a99

Browse files
authored
Merge pull request #235 from broadinstitute/development
Release 1.14.1
2 parents 75cbef7 + 2701eb0 commit 8685a99

File tree

4 files changed

+139
-1
lines changed

4 files changed

+139
-1
lines changed

ingest/cell_metadata.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@ def __init__(
5858
self.ontology = defaultdict(lambda: defaultdict(list))
5959
self.ordered_ontology = defaultdict(list)
6060
self.ordered_labels = defaultdict(list)
61+
self.synonym_updates = defaultdict(lambda: defaultdict(str))
6162
self.cells = []
6263
self.numeric_array_columns = {}
6364
self.kwargs = kwargs

ingest/validation/validate_metadata.py

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737
import copy
3838
import itertools
3939
import math
40+
import pandas as pd
4041

4142
import colorama
4243
from colorama import Fore
@@ -1208,6 +1209,13 @@ def validate_collected_ontology_data(metadata, convention):
12081209
(ontology_id, ontology_label)
12091210
],
12101211
)
1212+
else:
1213+
property_header = property_name + "__ontology_label"
1214+
matched_label_for_id = label_and_synonyms.get("label")
1215+
if ontology_label != matched_label_for_id:
1216+
metadata.synonym_updates[property_header][
1217+
ontology_label
1218+
] = matched_label_for_id
12111219
except ValueError as value_error:
12121220
metadata.store_validation_issue(
12131221
"error",
@@ -1402,6 +1410,81 @@ def detect_excel_drag(metadata, convention):
14021410
return excel_drag
14031411

14041412

1413+
def to_1D(series):
    """Flatten one level of nesting from an iterable of iterables.

    Pandas columns whose cells are lists need unnesting before their
    elements can be counted; this returns a new flat list holding, in
    order, every element of every inner iterable in `series`.
    """
    flattened = []
    for nested in series:
        # extend() walks `nested` exactly like an inner comprehension loop
        flattened.extend(nested)
    return flattened
1417+
1418+
1419+
def replace_single_value_array(df, metadata_name, synonym, label):
    """Synonym replacement (in-place) for single-value array metadata.

    Every cell of column `metadata_name` that equals `[synonym]` is replaced
    with a new one-element list `[label]`; all other cells are left as they
    are. `df` is modified in place and nothing is returned.

    Pandas doesn't operate well on cells holding lists (they are unhashable
    and potentially non-homogenous), so the column is rebuilt cell by cell
    rather than through `replace` or a `.loc` assignment.
    """
    target = [synonym]
    # Build a fresh [label] list per matching row so rows never share one
    # mutable object, and assign positionally so no index alignment (or a
    # row-wise apply over the whole frame) is needed.
    df[metadata_name] = [
        [label] if cell == target else cell for cell in df[metadata_name]
    ]
1427+
1428+
1429+
def replace_synonym_in_multivalue_array(df, metadata_name, substitutions):
    """Synonym replacement (in-place) for multi-value array ontology labels.

    `substitutions` maps synonym -> ontology label. Every list cell in column
    `metadata_name` that contains at least one synonym is replaced by a new
    list in which each synonym has been swapped for its label; element order
    is preserved. Cells without synonyms, and cells that are not lists
    (e.g. missing values), are left untouched. `df` is modified in place and
    nothing is returned.
    """

    def _with_labels(cell):
        # Only list cells can hold ontology labels; anything else (such as
        # NaN for a row lacking this metadata) passes through unchanged.
        if not isinstance(cell, list):
            return cell
        if not any(term in substitutions for term in cell):
            return cell
        # Single pass with .get(): each term is looked up once, so a label
        # that also happens to be a key is not substituted a second time,
        # and a defaultdict argument never grows new keys.
        return [substitutions.get(term, term) for term in cell]

    # Positional assignment of the rebuilt column avoids one full-frame
    # apply and one full column scan per distinct array value.
    df[metadata_name] = [_with_labels(cell) for cell in df[metadata_name]]
1453+
1454+
1455+
def replace_synonyms(metadata):
    """Update BigQuery data to store ontology labels and not synonyms.

    Reads the JSON-lines file named "<metadata.study_file_id>.json" into a
    DataFrame, applies every synonym -> label substitution recorded in
    `metadata.synonym_updates` (keyed by metadata column name), and writes
    the result back to the same file in JSON-lines format.
    """
    bq_filename = str(metadata.study_file_id) + ".json"
    df = pd.read_json(bq_filename, lines=True)
    for metadata_name, updates in metadata.synonym_updates.items():
        # NOTE(review): a column absent from the file (or a file with no
        # rows) is skipped here rather than raising KeyError/IndexError --
        # confirm skipping is the desired handling.
        if metadata_name not in df.columns or df.empty:
            continue
        column = df[metadata_name]
        # non-array metadata values are strings
        if isinstance(column.iloc[0], str):
            # Assign the result back to the frame: an in-place replace on
            # the selected column is a chained update that does not reach
            # `df` when pandas Copy-on-Write is active.
            df[metadata_name] = column.replace(dict(updates))
        # Pandas can't hash mutable complex objects like lists, so array
        # columns are handled by dedicated helpers; equal row and element
        # counts are taken to mean every array holds a single value
        elif len(column) == len(to_1D(column)):
            for synonym, label in updates.items():
                replace_single_value_array(df, metadata_name, synonym, label)
        # at least one non-single array-type metadata
        else:
            replace_synonym_in_multivalue_array(df, metadata_name, updates)
    df.to_json(bq_filename, orient="records", lines=True)
1486+
1487+
14051488
def validate_input_metadata(metadata, convention, bq_json=None):
14061489
"""Wrapper function to run validation functions
14071490
"""
@@ -1415,6 +1498,8 @@ def validate_input_metadata(metadata, convention, bq_json=None):
14151498
# long-compute-time issue (if false positives are possible, bypass will be needed)
14161499
dev_logger.info('Validating ontology content against EBI OLS')
14171500
validate_collected_ontology_data(metadata, convention)
1501+
if metadata.synonym_updates:
1502+
replace_synonyms(metadata)
14181503
confirm_uniform_units(metadata, convention)
14191504

14201505

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
{"CellID": "BM01_16dpp_AAGCAGTGGTAT", "disease__time_since_onset": [12.0, 2.0], "disease__time_since_onset__unit": "UO_0000035", "organ_region": ["MBA_000000944"], "organ_region__ontology_label": ["Folium-tuber vermis (VII)"], "disease__treated": [false, false], "species": "NCBITaxon_9606", "species__ontology_label": "human", "geographical_region": "GAZ_00003181", "geographical_region__ontology_label": "Boston", "library_preparation_protocol": "EFO_0008919", "library_preparation_protocol__ontology_label": "Seq-Well", "organ": "UBERON_0001913", "organ__ontology_label": "milk", "sex": "female", "is_living": "yes", "organism_age__unit": "UO_0000036", "organism_age__unit_label": "year", "ethnicity__ontology_label": ["European"], "ethnicity": ["HANCESTRO_0005"], "organism_age": 31.0, "disease": ["MONDO_0005015", "MONDO_0006849"], "disease__ontology_label": ["diabetes", "mastitis"], "cell_type": "CL_0000066", "cell_type__ontology_label": "epithelial cell", "donor_id": "BM01", "biosample_id": "BM01_16dpp_r3", "biosample_type": "PrimaryBioSample_BodyFluid", "preservation_method": "Fresh", "disease__time_since_onset__unit_label": "month", "study_accession": "SCPdev", "file_id": "dec0dedfeed1111111111111", "metadata_convention_version": "2.2.0", "organism_age__seconds": 978285600.0}
2+
{"CellID": "BM01_16dpp_TAAGCAGTGGTA", "disease__time_since_onset": [1.0], "disease__time_since_onset__unit": "UO_0000035", "organ_region": ["MBA_000000302", "MBA_000000294", "MBA_000000795"], "organ_region__ontology_label": ["Superior colliculus, sensory related", "Superior colliculus, motor related", "Periaqueductal gray"], "disease__treated": [false], "species": "NCBITaxon_9606", "species__ontology_label": "Homo Sapiens", "geographical_region": "GAZ_00003181", "geographical_region__ontology_label": "Boston", "library_preparation_protocol": "EFO_0008919", "library_preparation_protocol__ontology_label": "Seq-Well", "organ": "UBERON_0001913", "organ__ontology_label": "milk", "sex": "female", "is_living": "yes", "organism_age__unit": "UO_0000036", "organism_age__unit_label": "year", "ethnicity__ontology_label": ["white"], "ethnicity": ["HANCESTRO_0005"], "organism_age": 31.0, "disease": ["MONDO_0005709"], "disease__ontology_label": ["common cold"], "cell_type": "CL_0000066", "cell_type__ontology_label": "epithelial cell", "donor_id": "BM01", "biosample_id": "BM01_16dpp_r3", "biosample_type": "PrimaryBioSample_BodyFluid", "preservation_method": "Fresh", "disease__time_since_onset__unit_label": "month", "study_accession": "SCPdev", "file_id": "dec0dedfeed1111111111111", "metadata_convention_version": "2.2.0", "organism_age__seconds": 978285600.0}
3+
{"CellID": "BM01_16dpp_CTAAGCAGTGGT", "disease__time_since_onset": [24.0, 2.0], "disease__time_since_onset__unit": "UO_0000035", "organ_region": ["MBA_000000714", "MBA_000000972"], "disease__treated": [true, false], "species": "NCBITaxon_9606", "species__ontology_label": "Homo sapiens", "geographical_region": "GAZ_00003181", "geographical_region__ontology_label": "Boston", "library_preparation_protocol": "EFO_0008919", "library_preparation_protocol__ontology_label": "Seq-Well", "organ": "UBERON_0001913", "organ__ontology_label": "milk", "sex": "female", "is_living": "yes", "organism_age__unit": "UO_0000036", "organism_age__unit_label": "year", "ethnicity__ontology_label": ["British"], "ethnicity": ["HANCESTRO_0462"], "organism_age": 31.0, "disease": ["MONDO_0005015", "MONDO_0005709"], "disease__ontology_label": ["diabetes mellitus", "common cold"], "cell_type": "CL_0000066", "cell_type__ontology_label": "epithelial cell", "donor_id": "BM01", "biosample_id": "BM01_16dpp_r3", "biosample_type": "PrimaryBioSample_BodyFluid", "preservation_method": "Fresh", "disease__time_since_onset__unit_label": "month", "organ_region__ontology_label": ["Orbital area", "Prelimbic area"], "study_accession": "SCPdev", "file_id": "dec0dedfeed1111111111111", "metadata_convention_version": "2.2.0", "organism_age__seconds": 978285600.0}
4+
{"CellID": "BM01_16dpp_CGGTAAACCATT", "disease__time_since_onset": [36.0, 3.0, 1.0], "disease__time_since_onset__unit": "UO_0000035", "organ_region": ["MBA_000001041"], "organ_region__ontology_label": ["Paraflocculus"], "disease__treated": [true, false, false], "species": "NCBITaxon_9606", "species__ontology_label": "Homo sapiens", "geographical_region": "GAZ_00003181", "geographical_region__ontology_label": "Boston", "library_preparation_protocol": "EFO_0008919", "library_preparation_protocol__ontology_label": "Seq-Well", "organ": "UBERON_0001913", "organ__ontology_label": "milk", "sex": "female", "is_living": "yes", "organism_age__unit": "UO_0000036", "organism_age__unit_label": "year", "ethnicity": ["HANCESTRO_0462"], "organism_age": 31.0, "disease": ["MONDO_0005015", "MONDO_0006849", "MONDO_0005709"], "disease__ontology_label": ["diabetes", "breast infection", "common cold"], "cell_type": "CL_0000066", "cell_type__ontology_label": "epithelial cell", "donor_id": "BM01", "biosample_id": "BM01_16dpp_r3", "biosample_type": "PrimaryBioSample_BodyFluid", "preservation_method": "Fresh", "disease__time_since_onset__unit_label": "month", "ethnicity__ontology_label": ["British"], "study_accession": "SCPdev", "file_id": "dec0dedfeed1111111111111", "metadata_convention_version": "2.2.0", "organism_age__seconds": 978285600.0}
5+
{"CellID": "BM01_16dpp_CCGAATTCACCG", "disease__time_since_onset": [0.0], "disease__time_since_onset__unit": "UO_0000035", "organ_region": ["MBA_000000909", "MBA_000000502"], "organ_region__ontology_label": ["Entorhinal area", "Subiculum"], "disease__treated": [false], "species": "NCBITaxon_9606", "species__ontology_label": "Homo sapiens", "geographical_region": "GAZ_00003181", "geographical_region__ontology_label": "Boston", "library_preparation_protocol": "EFO_0008919", "library_preparation_protocol__ontology_label": "Seq-Well", "organ": "UBERON_0001913", "organ__ontology_label": "milk", "sex": "female", "is_living": "yes", "organism_age__unit": "UO_0000036", "organism_age__unit_label": "year", "ethnicity__ontology_label": ["Caucasian"], "ethnicity": ["HANCESTRO_0005"], "organism_age": 31.0, "disease": ["MONDO_0000001"], "disease__ontology_label": ["disease or disorder"], "cell_type": "CL_0000066", "cell_type__ontology_label": "epithelial cell", "donor_id": "BM01", "biosample_id": "BM01_16dpp_r3", "biosample_type": "PrimaryBioSample_BodyFluid", "preservation_method": "Fresh", "disease__time_since_onset__unit_label": "month", "study_accession": "SCPdev", "file_id": "dec0dedfeed1111111111111", "metadata_convention_version": "2.2.0", "organism_age__seconds": 978285600.0}

tests/test_validate_metadata.py

Lines changed: 48 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
from unittest.mock import patch
2424
import io
2525
import numpy as np
26+
import pandas as pd
2627

2728
sys.path.append("../ingest")
2829
sys.path.append("../ingest/validation")
@@ -40,6 +41,8 @@
4041
MAX_HTTP_ATTEMPTS,
4142
is_empty_string,
4243
is_label_or_synonym,
44+
replace_single_value_array,
45+
replace_synonym_in_multivalue_array,
4346
)
4447

4548

@@ -890,12 +893,56 @@ def test_will_allow_synonym_matches(self):
890893
self.assertTrue(
891894
metadata.validate_format(), "Valid metadata headers should not elicit error"
892895
)
893-
validate_input_metadata(metadata, convention)
896+
validate_input_metadata(metadata, convention, bq_json=True)
894897
self.assertFalse(
895898
report_issues(metadata), "Valid ontology content should not elicit error"
896899
)
897900
self.teardown_metadata(metadata)
898901

902+
def test_array_synonym_replacement(self):
    """Array-type ontology labels should have synonyms replaced in place
    """
    fixture = "../tests/data/annotation/metadata/convention/df.json"
    df = pd.read_json(fixture, lines=True)

    # single-value arrays: every ["white"] cell becomes ["European"]
    single_value_column = "ethnicity__ontology_label"
    white_before = np.count_nonzero(
        [cell == ["white"] for cell in df[single_value_column]]
    )
    replace_single_value_array(df, single_value_column, "white", "European")
    white_after = np.count_nonzero(
        [cell == ["white"] for cell in df[single_value_column]]
    )

    self.assertTrue(
        white_before == 1, "original df should have one instance of ['white']"
    )
    self.assertTrue(
        white_after == 0, "resulting df should have no instances of ['white']"
    )

    # multi-value arrays: synonyms are swapped inside each array of labels
    multi_value_column = "disease__ontology_label"
    substitutions = {
        "diabetes": "diabetes mellitus",
        "breast infection": "mastitis",
    }
    # tuples make the list cells hashable so unique() can be applied
    values_before = list(df[multi_value_column].transform(tuple).unique())
    replace_synonym_in_multivalue_array(df, multi_value_column, substitutions)
    values_after = list(df[multi_value_column].transform(tuple).unique())

    expected_result = [
        ('diabetes mellitus', 'mastitis'),
        ('common cold',),
        ('diabetes mellitus', 'common cold'),
        ('diabetes mellitus', 'mastitis', 'common cold'),
        ('disease or disorder',),
    ]

    self.assertFalse(
        values_before == values_after,
        "multi-value array names should be different after replacement",
    )
    self.assertEqual(
        values_after,
        expected_result,
        "multi-value array names should match expected result",
    )
945+
899946
def test_validate_nonconventional_numeric_content(self):
900947
"""Nonconventional numeric metadata values should all validate as numeric
901948
"""

0 commit comments

Comments
 (0)