@@ -89,6 +89,45 @@ def test_assess_annotation(self):
8989 KeyError , DifferentialExpression .assess_annotation , test_annotation , cm
9090 )
9191
def test_detect_duplicate_gene_names(self):
    """Check that get_genes pipe-joins gene columns only for duplicate names.

    Genes file can have one or two columns of gene information.
    If two columns are present, use the second column containing gene names
    unless there are duplicate gene names in the second column.
    If duplicates, check that 1st plus 2nd column provides uniqueness.
    If unique when joined, join columns with pipe (|) for use as DE input.
    """
    # Features file without duplicate gene names: first entry must not be joined
    no_dup_genes_path = (
        "../tests/data/differential_expression/sparse/sparsemini_features.tsv"
    )
    no_dup_genes = DifferentialExpression.get_genes(no_dup_genes_path)
    self.assertNotIn(
        "|", no_dup_genes[0], f"no delimiter expected in {no_dup_genes[0]}"
    )

    # Features file with duplicate gene names: first entry must be pipe-joined
    dup_genes_path = (
        "../tests/data/differential_expression/sparse/sparsemini_dup_gene_name.tsv"
    )
    dup_genes = DifferentialExpression.get_genes(dup_genes_path)
    # The failure message states the positive expectation; the earlier wording
    # ("no delimiter expected") contradicted this assertIn check.
    self.assertIn("|", dup_genes[0], f"delimiter expected in {dup_genes[0]}")
112+
def test_delimiter_in_gene_name(self):
    """Check delimiter_in_gene_name against names with and without a pipe.

    Builds two small DataFrames with "names" and "scores" columns: one whose
    names contain no pipe (|) and one whose names are pipe-joined. The helper
    is expected to return a falsy value for the first and a truthy value for
    the second.
    """
    # Plain gene names: no pipe delimiter present
    plain_names_data = {"names": ["Tns1", "Gfra1"], "scores": ["10.5", "10.34"]}
    plain_names_df = pd.DataFrame(plain_names_data)
    self.assertFalse(
        DifferentialExpression.delimiter_in_gene_name(plain_names_df),
        "no pipe delimiter should be detected in the input",
    )

    # Pipe-joined names: two distinct IDs that share the gene name Sox17
    piped_names_data = {
        "names": ["ENSMUST00000027035|Sox17", "ENSMUST00000195555|Sox17"],
        "scores": ["41.459137", "-5.058518"],
    }
    piped_names_df = pd.DataFrame(piped_names_data)
    self.assertTrue(
        DifferentialExpression.delimiter_in_gene_name(piped_names_df),
        "expected pipe delimiter undetected",
    )
130+
92131 def test_de_remove_single_sample (self ):
93132 """ Test single sample removal
94133 """
@@ -237,15 +276,15 @@ def test_de_process_sparse(self):
237276 "../tests/data/differential_expression/sparse/sparsemini_cluster.txt" ,
238277 "addedfeed000000000000000" ,
239278 "dec0dedfeed0000000000000" ,
240- "de_sparse_integration " ,
279+ "de_sparse_dup_gene " ,
241280 )
242281
243282 de_kwargs = {
244283 "study_accession" : cm .study_accession ,
245284 "name" : cluster .name ,
246285 "annotation_scope" : test_scope ,
247286 "method" : test_method ,
248- "gene_file" : "../tests/data/differential_expression/sparse/sparsemini_features .tsv" ,
287+ "gene_file" : "../tests/data/differential_expression/sparse/sparsemini_dup_gene_name .tsv" ,
249288 "barcode_file" : "../tests/data/differential_expression/sparse/sparsemini_barcodes.tsv" ,
250289 }
251290
@@ -273,26 +312,29 @@ def test_de_process_sparse(self):
273312 )
274313
275314 expected_file_path = (
276- "../tests/de_sparse_integration --cell_type__ontology_label"
315+ "../tests/de_sparse_dup_gene --cell_type__ontology_label"
277316 "--endothelial_cell--study--wilcoxon.tsv"
278317 )
279318
280319 content = pd .read_csv (expected_file_path , sep = "\t " , index_col = 0 )
281320 # confirm expected gene in DE file at expected position
282321 self .assertEqual (
283322 content .iloc [1 , 0 ],
284- "Mrpl15 " ,
285- "Did not find expected gene, Mrpl15 , at second row in DE file" ,
323+ "Sox17 " ,
324+ "Did not find expected gene, Sox17 , at second row in DE file. " ,
286325 )
287326 # confirm calculated value has expected significant digits
288327 self .assertEqual (
289328 content .iloc [0 , 2 ],
290329 11.63 ,
291- "Did not find expected logfoldchange value for Sox17 in DE file" ,
330+ "Did not find expected logfoldchange value for Sox17 in DE file. " ,
292331 )
332+ # confirm duplicate gene input generates expected gene_id info in output
333+ self .assertIn ('gene_id' , content .columns , "Expected gene_id output not found." )
293334
294335 # md5 checksum calculated using reference file in tests/data/differential_expression/sparse/reference
295- expected_checksum = "07b6c6565430a17f4f048e7b4f53ddac"
336+ # file updated 2022-05-25 to include output for duplicate gene handling
337+ expected_checksum = "ca0c7dcc4048614f22d6bc7dec18a2c0"
296338
297339 # running DifferentialExpression via pytest results in output files in the tests dir
298340 with open (expected_file_path , "rb" ) as f :
@@ -301,16 +343,15 @@ def test_de_process_sparse(self):
301343 self .assertEqual (
302344 de_output_checksum ,
303345 expected_checksum ,
304- "generated output file should match expected checksum" ,
346+ "Generated output file should match expected checksum. " ,
305347 )
306348
307349 arguments = {"cluster_name" : cluster .name , "annotation_name" : test_annotation }
308350 generated_output_match = DifferentialExpression .string_for_output_match (
309351 arguments
310352 )
311353 self .assertEqual (
312- generated_output_match ,
313- "de_sparse_integration--cell_type__ontology_label*.tsv" ,
354+ generated_output_match , "de_sparse_dup_gene--cell_type__ontology_label*.tsv"
314355 )
315356
316357 with patch ('ingest_files.IngestFiles.delocalize_file' ):
@@ -325,9 +366,7 @@ def test_de_process_sparse(self):
325366 )
326367
327368 # clean up DE outputs
328- output_wildcard_match = (
329- f"../tests/de_sparse_integration--{ test_annotation } *.tsv"
330- )
369+ output_wildcard_match = f"../tests/de_sparse_dup_gene--{ test_annotation } *.tsv"
331370 files = glob .glob (output_wildcard_match )
332371
333372 for file in files :
0 commit comments