1212from dataclasses import dataclass
1313from typing import Dict , Generator , List , Tuple , Union # noqa: F401
1414import warnings
15+ import scanpy as sc
1516
1617
1718import pandas as pd # NOqa: F821
2122# import google.cloud.logging
2223
2324try :
24- from monitor import setup_logger
25+ from monitor import setup_logger , log_exception
2526except ImportError :
26- from .monitor import setup_logger
27+ from .monitor import setup_logger , log_exception
2728
2829
2930@dataclass
@@ -75,13 +76,16 @@ class IngestFiles:
7576 # General logger for class
7677 # Logger provides more details
7778 dev_logger = setup_logger (__name__ , "log.txt" , format = "support_configs" )
79+ user_logger = setup_logger (__name__ + ".user_logger" , "user_log.txt" )
7880 # Filter out warnings about using end user credentials when running ingest_pipeline as dev
7981 warnings .filterwarnings (
8082 "ignore" , "Your application has authenticated using end user credentials"
8183 )
8284
8385 def __init__ (self , file_path , allowed_file_types ):
8486 self .file_path = file_path
87+ # define filetype for h5ad file extension
88+ mimetypes .add_type ('application/x-hdf5' , '.h5ad' )
8589 # File is remote (in GCS bucket) when running via PAPI,
8690 # and typically local when developing
8791 self .is_remote_file = IngestFiles .is_remote_file (file_path )
@@ -195,6 +199,7 @@ def open_file(self, file_path, open_as=None, start_point: int = 0, **kwargs):
195199 "text/plain" : self .open_txt ,
196200 "text/tab-separated-values" : self .open_tsv ,
197201 "dataframe" : self .open_pandas ,
202+ "application/x-hdf5" : self .open_h5ad ,
198203 }
199204
200205 if start_point != 0 :
@@ -214,6 +219,11 @@ def open_file(self, file_path, open_as=None, start_point: int = 0, **kwargs):
214219 file_connections .get (file_type )(open_file , file_type , ** kwargs ),
215220 open_file ,
216221 )
222+ elif file_type == "application/x-hdf5" :
223+ return (
224+ file_connections .get (file_type )(file_path , ** kwargs ),
225+ open_file ,
226+ )
217227 else :
218228 return (
219229 file_connections .get (file_type )(open_file , ** kwargs ),
@@ -227,9 +237,12 @@ def open_file(self, file_path, open_as=None, start_point: int = 0, **kwargs):
227237 open_file ,
228238 )
229239 else :
230- raise ValueError (
231- f"Unsupported file format. Allowed file types are: { ' ' .join (self .allowed_file_types )} "
240+ msg = (
241+ f"Unsupported file format. Allowed file MIME types are: "
242+ f"{ ' ' .join (self .allowed_file_types )} "
232243 )
244+ log_exception (IngestFiles .dev_logger , IngestFiles .user_logger , msg )
245+ raise ValueError (msg )
233246
234247 # Inherited function
235248 def extract (self ):
@@ -298,6 +311,15 @@ def open_pandas(self, file_path, file_type, **kwargs):
298311 else :
299312 raise ValueError ("File must be tab or comma delimited" )
300313
def open_h5ad(self, file_path, **kwargs):
    """Open an h5ad file as an AnnData object.

    The file is read with ``sc.read_h5ad(file_path, backed='r')``.
    NOTE(review): per the scanpy docs, ``backed='r'`` should keep the
    data on disk instead of loading it fully into memory -- confirm
    against the pinned scanpy version.

    Args:
        file_path: Path of the h5ad file, passed to scanpy unchanged.
        **kwargs: Accepted so the signature matches how ``open_file``
            calls its openers; not forwarded to scanpy.

    Returns:
        Whatever ``sc.read_h5ad`` returns for the file.

    Raises:
        ValueError: If scanpy raises ``OSError`` while reading the file.
            The original ``OSError`` is attached as ``__cause__``.
    """
    try:
        return sc.read_h5ad(file_path, backed='r')
    except OSError as e:
        msg = f"Scanpy cannot read file, \"{file_path}\"."
        log_exception(IngestFiles.dev_logger, IngestFiles.user_logger, msg)
        # Chain explicitly so the underlying I/O error stays visible in
        # the traceback as the direct cause of the ValueError.
        raise ValueError(msg) from e
322+
301323 def open_csv (self , opened_file_object , ** kwargs ):
302324 """Opens csv file"""
303325 csv .register_dialect (
0 commit comments