1+ from email .headerregistry import Group
12import logging
23import numpy as np
34import pandas as pd
@@ -36,19 +37,20 @@ def __init__(
3637 cell_metadata ,
3738 matrix_file_path ,
3839 matrix_file_type ,
39- annotation ,
40+ annotation_name ,
4041 ** kwargs ,
4142 ):
4243 DifferentialExpression .de_logger .info (
4344 "Initializing DifferentialExpression instance"
4445 )
4546 self .cluster = cluster
4647 self .metadata = cell_metadata
47- self .annotation = annotation
48+ self .annotation = annotation_name
4849 self .matrix_file_path = matrix_file_path
4950 self .matrix_file_type = matrix_file_type
5051 self .kwargs = kwargs
5152 self .accession = self .kwargs ["study_accession" ]
53+ self .annot_scope = self .kwargs ["annotation_scope" ]
5254 # only used in output filename, replacing non-alphanumeric with underscores
5355 self .cluster_name = re .sub (r'\W+' , '_' , self .kwargs ["name" ])
5456 self .method = self .kwargs ["method" ]
@@ -175,6 +177,7 @@ def subset_adata(adata, de_cells):
175177 return adata
176178
177179 def execute_de (self ):
180+ print (f'dev_info: Starting DE for { self .accession } ' )
178181 try :
179182 if self .matrix_file_type == "mtx" :
180183 DifferentialExpression .de_logger .info ("preparing DE on sparse matrix" )
@@ -184,6 +187,7 @@ def execute_de(self):
184187 self .matrix_file_path ,
185188 self .matrix_file_type ,
186189 self .annotation ,
190+ self .annot_scope ,
187191 self .accession ,
188192 self .cluster_name ,
189193 self .method ,
@@ -198,6 +202,7 @@ def execute_de(self):
198202 self .matrix_file_path ,
199203 self .matrix_file_type ,
200204 self .annotation ,
205+ self .annot_scope ,
201206 self .accession ,
202207 self .cluster_name ,
203208 self .method ,
@@ -222,18 +227,17 @@ def get_genes(genes_path):
222227 """
223228 genes_df = pd .read_csv (genes_path , sep = "\t " , header = None )
224229 if len (genes_df .columns ) > 1 :
230+ # unclear if falling back to gene_id is useful (SCP-4283)
231+ # print so we're aware of dups during dev testing
225232 if genes_df [1 ].count () == genes_df [1 ].nunique ():
226- return genes_df [ 1 ]. tolist ()
227- elif genes_df [ 0 ]. count () == genes_df [ 0 ]. nunique ():
228- return genes_df [0 ].tolist ()
233+ msg = "dev_info: Features file contains duplicate identifiers (col 2)"
234+ print ( msg )
235+ return genes_df [1 ].tolist ()
229236 else :
230- msg = "Features file contains duplicate identifiers"
231- print (msg )
232- log_exception (
233- DifferentialExpression .dev_logger , DifferentialExpression .de_logger , msg
234- )
235- raise ValueError (msg )
236- return genes
237+ if genes_df [0 ].count () == genes_df [0 ].nunique ():
238+ msg = "dev_info: Features file contains duplicate identifiers (col 1)"
239+ print (msg )
240+ return genes_df [0 ].tolist ()
237241
238242 @staticmethod
239243 def get_barcodes (barcodes_path ):
@@ -264,6 +268,7 @@ def run_scanpy_de(
264268 matrix_file_path ,
265269 matrix_file_type ,
266270 annotation ,
271+ annot_scope ,
267272 study_accession ,
268273 cluster_name ,
269274 method ,
@@ -315,15 +320,23 @@ def run_scanpy_de(
315320 DifferentialExpression .dev_logger , DifferentialExpression .de_logger , msg
316321 )
317322 raise KeyError (msg )
323+ # ToDo - detection and handling of annotations with only one sample (SCP-4282)
324+ except ValueError as e :
325+ print (e )
326+ log_exception (
327+ DifferentialExpression .dev_logger , DifferentialExpression .de_logger , e
328+ )
329+ raise KeyError (e )
318330
319331 DifferentialExpression .de_logger .info ("Gathering DE annotation labels" )
320332 groups = np .unique (adata .obs [annotation ]).tolist ()
321333 for group in groups :
322- group_filename = re .sub (r'\W+' , '_' , group )
334+ clean_group = re .sub (r'\W+' , '_' , group )
335+ clean_annotation = re .sub (r'\W+' , '_' , annotation )
323336 DifferentialExpression .de_logger .info (f"Writing DE output for { group } " )
324337 rank = sc .get .rank_genes_groups_df (adata , key = rank_key , group = group )
325338
326- out_file = f'{ cluster_name } --{ annotation } --{ group_filename } --{ method } .tsv'
339+ out_file = f'{ cluster_name } --{ clean_annotation } --{ clean_group } -- { annot_scope } --{ method } .tsv'
327340 # Round numbers to 4 significant digits while respecting fixed point
328341 # and scientific notation (note: trailing zeros are removed)
329342 rank .to_csv (out_file , sep = '\t ' , float_format = '%.4g' )
0 commit comments