Skip to content

Commit 64572f3

Browse files
authored
Merge pull request #79 from broadinstitute/jlc_regularize_stored_ontologyID
Regularize stored ontology IDs (SCP-2116)
2 parents 74ee750 + 3ad4b5d commit 64572f3

File tree

4 files changed

+70
-9
lines changed

4 files changed

+70
-9
lines changed

ingest/ingest_pipeline.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -227,8 +227,13 @@ def load(
227227
self.insert_many('data_arrays', documents)
228228
except Exception as e:
229229
self.error_logger.error(e, extra=self.extra_log_params)
230-
if e.details is not None:
230+
if hasattr(e, 'details') and e.details is not None:
231231
self.error_logger.error(e.details, extra=self.extra_log_params)
232+
else:
233+
self.error_logger.error(
234+
'Error loading data to MongoDB (no details available) - check MongoDB access',
235+
extra=self.extra_log_params,
236+
)
232237
return 1
233238
return 0
234239

ingest/validation/validate_metadata.py

Lines changed: 55 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
from colorama import Fore
3131
import jsonschema
3232
from google.cloud import bigquery
33+
import numpy as np
3334

3435
sys.path.append('..')
3536
try:
@@ -292,10 +293,24 @@ def cast_boolean_type(value):
292293
raise ValueError(f'cannot cast {value} as boolean')
293294

294295

296+
def value_is_nan(value):
297+
"""Check if value is nan
298+
nan is a special dataframe value to indicate missing data
299+
"""
300+
try:
301+
return np.isnan(value)
302+
except TypeError:
303+
return False
304+
305+
295306
def cast_integer_type(value):
296307
"""Cast metadata value as integer
297308
"""
298-
return int(value)
309+
if value_is_nan(value):
310+
# nan indicates missing data, has no valid integer value for casting
311+
return value
312+
else:
313+
return int(value)
299314

300315

301316
def cast_float_type(value):
@@ -307,12 +322,27 @@ def cast_float_type(value):
307322
def cast_string_type(value):
308323
"""Cast string type per convention where Pandas autodetected a number
309324
"""
310-
if isinstance(value, numbers.Number):
325+
if value_is_nan(value):
326+
# nan indicates missing data; by type, nan is a numpy float
327+
# so a separate type check is needed for proper handling
328+
return value
329+
elif isinstance(value, numbers.Number):
311330
return str(value)
312331
else:
313332
return value
314333

315334

335+
def regularize_ontology_id(value):
336+
"""Regularize ontology_ids for storage with underscore format
337+
"""
338+
try:
339+
return value.replace(":", "_")
340+
except AttributeError:
341+
# when expected value is not actually an ontology ID
342+
# return the bad value for JSON schema validation
343+
return value
344+
345+
316346
def cast_metadata_type(metadatum, value, id_for_error_detail, convention, metadata):
317347
"""for metadatum, lookup expected type by metadata convention
318348
and cast value as appropriate type for validation
@@ -332,6 +362,12 @@ def cast_metadata_type(metadatum, value, id_for_error_detail, convention, metada
332362
# files that support array-based metadata natively (eg. loom,
333363
# anndata etc) splitting on pipe may become problematic
334364
for element in value.split('|'):
365+
try:
366+
if 'ontology' in convention['properties'][metadatum]:
367+
element = regularize_ontology_id(element)
368+
except KeyError:
369+
# unconventional metadata will trigger this exception
370+
pass
335371
cast_element = metadata_types.get(
336372
lookup_metadata_type(convention, metadatum)
337373
)(element)
@@ -349,8 +385,22 @@ def cast_metadata_type(metadatum, value, id_for_error_detail, convention, metada
349385
# metadata is being cast - the value needs to be passed as an array,
350386
# it is already boolean via Pandas' inference processes
351387
except AttributeError:
352-
cast_metadata[metadatum] = [value]
388+
try:
389+
if 'ontology' in convention['properties'][metadatum]:
390+
value = regularize_ontology_id(value)
391+
except KeyError:
392+
# unconventional metadata will trigger this exception
393+
pass
394+
cast_metadata[metadatum] = [
395+
metadata_types.get(lookup_metadata_type(convention, metadatum))(value)
396+
]
353397
else:
398+
try:
399+
if 'ontology' in convention['properties'][metadatum]:
400+
value = regularize_ontology_id(value)
401+
except KeyError:
402+
# unconventional metadata will trigger this exception
403+
pass
354404
try:
355405
cast_value = metadata_types.get(
356406
lookup_metadata_type(convention, metadatum)
@@ -555,9 +605,9 @@ def retrieve_ontology_term(convention_url, ontology_id, ontologies):
555605
# valid separators are underscore and colon (used by HCA)
556606
try:
557607
ontology_shortname, term_id = re.split('[_:]', ontology_id)
558-
# when ontolgyID is malformed and has no separator -> ValueError
559-
# when ontologyID value is empty string -> TypeError
560608
except (ValueError, TypeError):
609+
# when ontology_id is malformed and has no separator -> ValueError
610+
# when ontology_id value is empty string -> TypeError
561611
return None
562612
# check if we have already retrieved this ontology reference
563613
if ontology_shortname not in ontologies:

tests/data/issues_metadata_v1.1.3.json

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,10 @@
2020
"BM01_16dpp_CGGTAAACCATT",
2121
"BM01_16dpp_AAGCAGTGGTAT"
2222
],
23-
"is_living: 'nan' is not one of ['yes', 'no', 'unknown']": [
23+
"is_living: nan is not one of ['yes', 'no', 'unknown']": [
24+
"BM01_16dpp_TAAGCAGTGGTA"
25+
],
26+
"is_living: nan is not of type 'string'": [
2427
"BM01_16dpp_TAAGCAGTGGTA"
2528
],
2629
"sample_type: 'direct from donr - fresh' is not one of ['cell line', 'organoid', 'direct from donor - fresh', 'direct from donor - frozen', 'cultured primary cells']": [

tests/data/issues_ontology_v1.1.3.json

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,13 @@
11
{
22
"error": {
33
"convention": {
4-
"cell_type: 'nan' does not match '^[A-Za-z]+[_:][0-9]'": [
4+
"cell_type: nan is not of type 'string'": [
55
"BM01_16dpp_AAGCAGTGGTAT"
66
],
7-
"geographical_region: 'nan' does not match '^[A-Za-z]+[_:][0-9]'": [
7+
"geographical_region: nan is not of type 'string'": [
8+
"BM01_16dpp_AAGCAGTGGTAT"
9+
],
10+
"geographical_region__ontology_label: nan is not of type 'string'": [
811
"BM01_16dpp_AAGCAGTGGTAT"
912
],
1013
"library_preparation_protocol: 'EFO0008919' does not match '^[A-Za-z]+[_:][0-9]'": [

0 commit comments

Comments
 (0)