3030from colorama import Fore
3131import jsonschema
3232from google .cloud import bigquery
33+ import numpy as np
3334
3435sys .path .append ('..' )
3536try :
@@ -292,10 +293,24 @@ def cast_boolean_type(value):
292293 raise ValueError (f'cannot cast { value } as boolean' )
293294
294295
def value_is_nan(value):
    """Check if value is nan

    nan is a special dataframe value to indicate missing data

    Returns a plain Python bool. Values numpy cannot test for nan
    (e.g. strings, None) and non-scalar values (lists, arrays) are
    reported as not nan.
    """
    try:
        result = np.isnan(value)
    except TypeError:
        # np.isnan rejects non-numeric input such as str or None
        return False
    # for array-like input np.isnan returns an array whose truth value
    # is ambiguous; only a scalar can itself be the nan marker
    if np.ndim(result) != 0:
        return False
    return bool(result)
304+
305+
def cast_integer_type(value):
    """Cast metadata value as integer

    nan marks missing data and has no valid integer representation,
    so it is handed back unchanged instead of being cast.
    """
    return value if value_is_nan(value) else int(value)
299314
300315
301316def cast_float_type (value ):
@@ -307,12 +322,27 @@ def cast_float_type(value):
def cast_string_type(value):
    """Cast string type per convention where Pandas autodetected a number

    nan (missing data) and values that are not numbers are returned
    unchanged; any other number is converted with str().
    """
    # by type, nan is a numpy float, so it has to be ruled out before
    # the generic number check or it would be stringified
    should_stringify = not value_is_nan(value) and isinstance(
        value, numbers.Number
    )
    if should_stringify:
        return str(value)
    return value
314333
315334
def regularize_ontology_id(value):
    """Regularize ontology_ids for storage with underscore format

    Every colon in a string value is replaced with an underscore.
    Values that cannot be processed that way are returned unchanged
    so that JSON schema validation can report the bad value.
    """
    try:
        return value.replace(":", "_")
    except (AttributeError, TypeError):
        # AttributeError: value has no replace method (e.g. int, None)
        # TypeError: value has a replace method that does not accept
        # string arguments (e.g. bytes)
        # when expected value is not actually an ontology ID
        # return the bad value for JSON schema validation
        return value
344+
345+
316346def cast_metadata_type (metadatum , value , id_for_error_detail , convention , metadata ):
317347 """for metadatum, lookup expected type by metadata convention
318348 and cast value as appropriate type for validation
@@ -332,6 +362,12 @@ def cast_metadata_type(metadatum, value, id_for_error_detail, convention, metada
332362 # files that support array-based metadata navtively (eg. loom,
333363 # anndata etc) splitting on pipe may become problematic
334364 for element in value .split ('|' ):
365+ try :
366+ if 'ontology' in convention ['properties' ][metadatum ]:
367+ element = regularize_ontology_id (element )
368+ except KeyError :
369+ # unconventional metadata will trigger this exception
370+ pass
335371 cast_element = metadata_types .get (
336372 lookup_metadata_type (convention , metadatum )
337373 )(element )
@@ -349,8 +385,22 @@ def cast_metadata_type(metadatum, value, id_for_error_detail, convention, metada
349385 # metadata is being cast - the value needs to be passed as an array,
350386 # it is already boolean via Pandas' inference processes
351387 except AttributeError :
352- cast_metadata [metadatum ] = [value ]
388+ try :
389+ if 'ontology' in convention ['properties' ][metadatum ]:
390+ value = regularize_ontology_id (value )
391+ except KeyError :
392+ # unconventional metadata will trigger this exception
393+ pass
394+ cast_metadata [metadatum ] = [
395+ metadata_types .get (lookup_metadata_type (convention , metadatum ))(value )
396+ ]
353397 else :
398+ try :
399+ if 'ontology' in convention ['properties' ][metadatum ]:
400+ value = regularize_ontology_id (value )
401+ except KeyError :
402+ # unconventional metadata will trigger this exception
403+ pass
354404 try :
355405 cast_value = metadata_types .get (
356406 lookup_metadata_type (convention , metadatum )
@@ -555,9 +605,9 @@ def retrieve_ontology_term(convention_url, ontology_id, ontologies):
555605 # valid separators are underscore and colon (used by HCA)
556606 try :
557607 ontology_shortname , term_id = re .split ('[_:]' , ontology_id )
558- # when ontolgyID is malformed and has no separator -> ValueError
559- # when ontologyID value is empty string -> TypeError
560608 except (ValueError , TypeError ):
609+ # when ontology_id is malformed and has no separator -> ValueError
610+ # when ontology_id value is empty string -> TypeError
561611 return None
562612 # check if we have already retrieved this ontology reference
563613 if ontology_shortname not in ontologies :
0 commit comments