
 import pandas as pd
 import numpy as np
+from scipy.stats import spearmanr
 import csv
 import logging

@@ -25,7 +26,9 @@ def sort_comparison(groups):
     """

     if any(i.isdigit() for i in groups):
-        sorted_arr = sorted(groups, key=lambda x: int("".join([i for i in x if i.isdigit()])))
+        sorted_arr = sorted(
+            groups, key=lambda x: int("".join([i for i in x if i.isdigit()]))
+        )
         return sorted_arr
     elif "rest" == groups[1]:
         return groups
@@ -56,6 +59,7 @@ def canonicalize_name_and_order(data, header_refmap):
     data = data.rename(columns=rename_map)

     data = data.astype({"group": "string"})
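+    # Capture the original input row order so downstream steps can restore it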
+    data['order'] = data.index.astype(int)

     # Update headers to expected order
     unsorted_headers = list(data.columns)
@@ -139,7 +143,12 @@ def generate_manifest(stem, clean_val, clean_val_p, qual):

     if len(file_names_pairwise) != 0:
         for value in range(len(file_names_pairwise)):
-            tsv_output.writerow([file_names_pairwise[value][0], file_names_pairwise[value][1],])
+            tsv_output.writerow(
+                [
+                    file_names_pairwise[value][0],
+                    file_names_pairwise[value][1],
+                ]
+            )


 def sort_all_group(all_group):
@@ -188,44 +197,41 @@ def sort_comparison_metrics(comparison_metrics, size, significance):

     # Arrange significance in expected order (ultimately ranked 3rd)
     comparison_metrics = sorted(
-        comparison_metrics,
-        key=lambda x: x.split('--')[-1] == significance
+        comparison_metrics, key=lambda x: x.split('--')[-1] == significance
     )

     # Arrange size in expected order (ultimately ranked 2nd)
     comparison_metrics = sorted(
-        comparison_metrics,
-        key=lambda x: x.split('--')[-1] == size
+        comparison_metrics, key=lambda x: x.split('--')[-1] == size
     )

     # Rank 1st with "gene", "group", then (if present) "comparison_group"
-    comparison_metrics = sorted(comparison_metrics, key=lambda x: x.split('--')[-1] == "comparison_group")
-    comparison_metrics = sorted(comparison_metrics, key=lambda x: x.split('--')[-1] == "group")
-    comparison_metrics = sorted(comparison_metrics, key=lambda x: x.split('--')[-1] == "gene")
+    comparison_metrics = sorted(
+        comparison_metrics, key=lambda x: x.split('--')[-1] == "comparison_group"
+    )
+    comparison_metrics = sorted(
+        comparison_metrics, key=lambda x: x.split('--')[-1] == "group"
+    )
+    comparison_metrics = sorted(
+        comparison_metrics, key=lambda x: x.split('--')[-1] == "gene"
+    )

     comparison_metrics.reverse()

     return comparison_metrics


 def sort_headers(headers, size, significance):
-    """Like `sort_comparison_metrics`, but for bare headers / metrics
-    """
+    """Like `sort_comparison_metrics`, but for bare headers / metrics"""

     # Sort alphabetically
     headers = sorted(headers)

-    # Rank significance 1st (ultimately ranked 4th)
-    headers = sorted(
-        headers,
-        key=lambda x: x == significance
-    )
+    # Rank significance 1st (ultimately ranked 5th)
+    headers = sorted(headers, key=lambda x: x == significance)

     # Rank size 1st (ultimately ranked 4th)
-    headers = sorted(
-        headers,
-        key=lambda x: x == size
-    )
+    headers = sorted(headers, key=lambda x: x == size)

     # Rank 1st with "gene", "group", then (if present) "comparison_group"
     headers = sorted(headers, key=lambda x: x == "comparison_group")
@@ -236,6 +242,7 @@ def sort_headers(headers, size, significance):

     return headers

+
 # note: my initial files had pval, qval, logfoldchanges.
 # David's files have qval, mean, logfoldchanges.
 # For the purposes of this validation I will be using his column values/formatting.
@@ -248,7 +255,9 @@ def validate_size_and_significance(metrics, size, significance, logger):
     - Log to Sentry / Mixpanel
     """
     has_size = any([metric.split('--')[-1] == size for metric in metrics])
-    has_significance = any([metric.split('--')[-1] == significance for metric in metrics])
+    has_significance = any(
+        [metric.split('--')[-1] == significance for metric in metrics]
+    )

     in_headers = f"in headers: {metrics}"

@@ -263,7 +272,9 @@ def validate_size_and_significance(metrics, size, significance, logger):
         logger.error(msg)
         raise ValueError(msg)
     elif has_size and has_significance:
-        logger.info(f'Found size ("{size}") and significance ("{significance}") metrics {in_headers}')
+        logger.info(
+            f'Found size ("{size}") and significance ("{significance}") metrics {in_headers}'
+        )


 def get_groups_and_metrics(raw_column_names, size, significance, logger):
@@ -292,7 +303,9 @@ def get_groups_and_metrics(raw_column_names, size, significance, logger):
         column_items = raw_column_name.split("--")
         split_header = []
         for item in column_items:
-            item = item.replace("'", "")  # Remove quotes in e.g. 'type_0'--'type_1'--qval
+            item = item.replace(
+                "'", ""
+            )  # Remove quotes in e.g. 'type_0'--'type_1'--qval
             if (item != "") and (item != "_"):
                 split_header.append(item.strip("_"))
         split_headers.append(split_header)
@@ -325,13 +338,53 @@ def detect_seurat_findallmarkers(headers):

     These headers were observed in a real user-uploaded DE file.
     """
-    findallmarkers_headers = ['p_val', 'avg_log2FC', 'pct.1', 'pct.2', 'p_val_adj', 'cluster', 'gene']
-    is_seurat_findallmarkers = (
-        len(headers) == len(findallmarkers_headers) and all(headers == findallmarkers_headers)
+    findallmarkers_headers = [
+        'p_val',
+        'avg_log2FC',
+        'pct.1',
+        'pct.2',
+        'p_val_adj',
+        'cluster',
+        'gene',
+    ]
+    is_seurat_findallmarkers = len(headers) == len(findallmarkers_headers) and all(
+        headers == findallmarkers_headers
     )
     return is_seurat_findallmarkers


+def order_not_significant(array_1, array_2):
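+    """Return True when the two rankings are not strongly rank-correlated.
+
+    A Spearman correlation above 0.95 means the input is already effectively
+    ordered by the significance metric, so no re-sorting is needed.
+    """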
+    correlation, pval = spearmanr(array_1, array_2)
+    if correlation > 0.95:
+        return False
+    else:
+        return True
+
+
+def organize_results(df):
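+    """Sort DE results by the significance metric, unless the input row order
+    already tracks that metric (see `order_not_significant`), in which case
+    the original row order is preserved.
+    """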
+    # processing turned values into strings, convert to numeric for sorting
+    df["order"] = df["order"].astype(float)
+    df["order"] = df["order"].astype(int)
+    # sort dataframe by input row order
+    df = df.sort_values(by="order")
+    df = df.set_index('order')
+    # maintain unnamed index column in DE results file
+    df.index.name = None
+    # processing ensures the significance metric is the 3rd column of the df
+    df[df.columns[2]] = df[df.columns[2]].astype(float)
+    input_order = df[df.columns[2]].to_numpy()
+    sig_sorted = df[df.columns[2]].sort_values()
+    sig_array = sig_sorted.to_numpy()
+
+    if order_not_significant(sig_array, input_order):
+        # sort dataframe by significance metric
+        df = df.sort_values(by=df.columns[2])
+        return df
+    else:
+        # leave dataframe sorted by input row order
+        return df
+
+
 class AuthorDifferentialExpression:
     dev_logger = setup_logger(__name__, "log.txt", format="support_configs")
     author_de_logger = setup_logger(
@@ -351,7 +404,7 @@ def __init__(
         annotation_scope,
         method,
         differential_expression_file,
-        header_refmap
+        header_refmap,
     ):
         """
         :param cluster_name (string) Name of cluster, e.g. "All Cells UMAP"
@@ -413,13 +466,17 @@ def execute(self):
             groups, clean_val, metrics = get_groups_and_metrics(
                 one_vs_rest, self.size_metric, self.significance_metric, logger
             )
-            self.generate_result_files(one_vs_rest, genes, rest, groups, clean_val, metrics)
+            self.generate_result_files(
+                one_vs_rest, genes, rest, groups, clean_val, metrics
+            )

         if len(pairwise) != 0:
             groups_p, clean_val_p, metrics = get_groups_and_metrics(
                 pairwise, self.size_metric, self.significance_metric, logger
             )
-            self.generate_result_files(pairwise, genes, rest, groups_p, clean_val_p, metrics)
+            self.generate_result_files(
+                pairwise, genes, rest, groups_p, clean_val_p, metrics
+            )
         generate_manifest(self.stem, clean_val, clean_val_p, metrics)

         print("Author DE transformation succeeded")
@@ -462,7 +519,7 @@ def generate_result_files(self, col, genes, rest, groups, clean_val, metrics):
         for i in all_group_fin:
             for j in range(0, len(i), num_metrics):
                 x = j
-                comparison_metrics = i[x: x + num_metrics]
+                comparison_metrics = i[x : x + num_metrics]
                 sorted_comparison_metrics = sort_comparison_metrics(
                     comparison_metrics, self.size_metric, self.significance_metric
                 )
@@ -510,7 +567,9 @@ def generate_result_files(self, col, genes, rest, groups, clean_val, metrics):
                 sorted_list = sort_comparison([group, comparison_group])
                 comparison = f'{sorted_list[0]}--{sorted_list[1]}'

-                clean_comparison_metric = '--'.join([sanitize_string(group) for group in comparison.split('--')])
+                clean_comparison_metric = '--'.join(
+                    [sanitize_string(group) for group in comparison.split('--')]
+                )

                 tsv_name = f'{self.stem}--{clean_comparison_metric}--{self.annot_scope}--{self.method}.tsv'

@@ -521,14 +580,16 @@ def generate_result_files(self, col, genes, rest, groups, clean_val, metrics):
                 t_arr = arr.transpose()

                 if len(t_arr) == 0:
-                    print(f"No data to output for TSV, skip preparation to write {tsv_name}")
+                    print(
+                        f"No data to output for TSV, skip preparation to write {tsv_name}"
+                    )
                     continue

                 # Drop rows that are all "nan", as seen sometimes in Seurat FindAllMarkers()
                 t_arr = t_arr[~(t_arr == 'nan').any(axis=1)]

                 inner_df = pd.DataFrame(data=t_arr, columns=headers)
-
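+                # Sort by significance unless the input order already reflects it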
+                inner_df = organize_results(inner_df)
                 inner_df.to_csv(tsv_name, sep='\t')

                 print(f"Wrote TSV: {tsv_name}")