-from email.headerregistry import Group
 import logging
 import numpy as np
 import pandas as pd
 import scanpy as sc
 import re
+import glob

 try:
     from monitor import setup_logger, log_exception
@@ -124,9 +124,11 @@ def process_annots(metadata_file_path, allowed_file_types, headers, dtypes):
         """
         annot_redux = IngestFiles(metadata_file_path, allowed_file_types)
         annot_file_type = annot_redux.get_file_type(metadata_file_path)[0]
-        annot_file_handle = annot_redux.open_file(metadata_file_path)[1]
+        annot_file_handle, local_file_path = IngestFiles.resolve_path(
+            annot_redux, metadata_file_path
+        )
         annots = annot_redux.open_pandas(
-            metadata_file_path,
+            local_file_path,
             annot_file_type,
             open_file_object=annot_file_handle,
             names=headers,
@@ -225,17 +227,24 @@ def get_genes(genes_path):
         If two columns present, check if there are duplicates in 2nd col
         If no duplicates, use as var_names, else use 1st column
         """
-        genes_df = pd.read_csv(genes_path, sep="\t", header=None)
+        genes_object = IngestFiles(genes_path, None)
+        local_genes_path = genes_object.resolve_path(genes_path)[1]
+
+        genes_df = pd.read_csv(local_genes_path, sep="\t", header=None)
         if len(genes_df.columns) > 1:
             # unclear if falling back to gene_id is useful (SCP-4283)
             # print so we're aware of dups during dev testing
-            if genes_df[1].count() == genes_df[1].nunique():
-                msg = "dev_info: Features file contains duplicate identifiers (col 2)"
+            if genes_df[1].count() != genes_df[1].nunique():
+                msg = (
+                    "dev_info: Features file contains duplicate identifiers in column 2"
+                )
                 print(msg)
             return genes_df[1].tolist()
         else:
-            if genes_df[0].count() == genes_df[0].nunique():
-                msg = "dev_info: Features file contains duplicate identifiers (col 1)"
+            if genes_df[0].count() != genes_df[0].nunique():
+                msg = (
+                    "dev_info: Features file contains duplicate identifiers in column 1"
+                )
                 print(msg)
             return genes_df[0].tolist()

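Note for reviewers skimming the hunk above: the inverted condition now warns only when duplicates are actually present. A minimal illustration (toy data, not from this PR) of why count() != nunique() detects duplicates:

import pandas as pd

# toy features table: column 0 is a gene id, column 1 is a gene name
genes_df = pd.DataFrame({0: ["ENSG01", "ENSG02", "ENSG03"], 1: ["TP53", "ACTB", "TP53"]})

# count() tallies non-null values, nunique() tallies distinct values,
# so any mismatch means column 1 carries duplicate identifiers
print(genes_df[1].count() != genes_df[1].nunique())  # True: "TP53" appears twice
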
@@ -252,13 +261,29 @@ def get_barcodes(barcodes_path):
     def adata_from_mtx(matrix_file_path, genes_path, barcodes_path):
         """reconstitute AnnData object from matrix, genes, barcodes files
         """
-        adata = sc.read_mtx(matrix_file_path)
+        # process smaller files before reading larger matrix file
+        barcodes = DifferentialExpression.get_barcodes(barcodes_path)
+        features = DifferentialExpression.get_genes(genes_path)
+        matrix_object = IngestFiles(matrix_file_path, None)
+        local_file_path = matrix_object.resolve_path(matrix_file_path)[1]
+        adata = sc.read_mtx(local_file_path)
         # For AnnData, obs are cells and vars are genes
         # BUT transpose needed for both dense and sparse
         # so transpose step is after this data object composition step
         # therefore the assignments below are the reverse of expected
-        adata.var_names = DifferentialExpression.get_barcodes(barcodes_path)
-        adata.obs_names = DifferentialExpression.get_genes(genes_path)
+        adata.var_names = barcodes
+        adata.obs_names = features
+        return adata
+
+    @staticmethod
+    def remove_single_sample_data(adata, annotation):
+        """identify and remove cells that would constitute an annotation label
+        that has data with only a single sample
+        """
+        counts = adata.obs[annotation].value_counts(dropna=False)
+        for label, count in counts.items():
+            if count == 1:
+                adata = adata[adata.obs[annotation] != label]
         return adata

     @staticmethod
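Side note on the new remove_single_sample_data method: labels backed by a single cell are dropped before DE runs, presumably because a group of one cannot support the statistical comparison. A minimal sketch of that behavior on toy data (names and values below are made up for illustration):

import anndata as ad
import numpy as np
import pandas as pd

# toy AnnData: three cells, two genes, one annotation column
obs = pd.DataFrame(
    {"cell_type": ["B cell", "B cell", "T cell"]},
    index=["AAAC", "AAAG", "AAAT"],
)
adata = ad.AnnData(X=np.ones((3, 2), dtype=np.float32), obs=obs)

counts = adata.obs["cell_type"].value_counts(dropna=False)
for label, count in counts.items():
    if count == 1:
        # "T cell" is backed by a single cell, so it is removed before DE
        adata = adata[adata.obs["cell_type"] != label]

print(adata.obs["cell_type"].tolist())  # ['B cell', 'B cell']
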
@@ -285,7 +310,9 @@ def run_scanpy_de(

         if matrix_file_type == "dense":
             # will need try/except (SCP-4205)
-            adata = sc.read(matrix_file_path)
+            matrix_object = IngestFiles(matrix_file_path, None)
+            local_file_path = matrix_object.resolve_path(matrix_file_path)[1]
+            adata = sc.read(local_file_path)
         else:
             # MTX reconstitution UNTESTED (SCP-4203)
             # will want try/except here to catch failed data object composition
@@ -300,6 +327,8 @@ def run_scanpy_de(
         # will need try/except (SCP-4205)
         adata.obs = DifferentialExpression.order_annots(de_annots, adata.obs_names)

+        adata = DifferentialExpression.remove_single_sample_data(adata, annotation)
+
         sc.pp.normalize_total(adata, target_sum=1e4)
         sc.pp.log1p(adata)
         DifferentialExpression.de_logger.info("calculating DE")
@@ -348,3 +377,24 @@ def run_scanpy_de(

         DifferentialExpression.de_logger.info("DE processing complete")

+    @staticmethod
+    def string_for_output_match(arguments):
+        cleaned_cluster_name = re.sub(r'\W+', '_', arguments["cluster_name"])
+        cleaned_annotation_name = re.sub(r'\W+', '_', arguments["annotation_name"])
+        files_to_match = f"{cleaned_cluster_name}--{cleaned_annotation_name}*.tsv"
+        return files_to_match
+
+    @staticmethod
+    def delocalize_de_files(file_path, study_file_id, files_to_match):
+        """Copy DE output files to study bucket
+        """
+
+        files = glob.glob(files_to_match)
+        for file in files:
+            IngestFiles.delocalize_file(
+                study_file_id,
+                None,
+                file_path,
+                file,
+                f"_scp_internal/differential_expression/{file}",
+            )
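To make the new output-matching step concrete, here is an illustrative run of the pattern-building logic with hypothetical cluster and annotation names (the names are made up, not from this PR):

import re

arguments = {"cluster_name": "UMAP cluster 1", "annotation_name": "cell type"}
cleaned_cluster_name = re.sub(r'\W+', '_', arguments["cluster_name"])
cleaned_annotation_name = re.sub(r'\W+', '_', arguments["annotation_name"])

# non-word characters collapse to underscores, yielding the glob passed to delocalize_de_files
print(f"{cleaned_cluster_name}--{cleaned_annotation_name}*.tsv")
# UMAP_cluster_1--cell_type*.tsv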