Skip to content

Commit a8332a3

Browse files
authored
Merge pull request #257 from broadinstitute/development
Release 1.19.0
2 parents 275ae16 + 13214ff commit a8332a3

File tree

5 files changed

+94
-24
lines changed

5 files changed

+94
-24
lines changed

ingest/de.py

Lines changed: 37 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -224,28 +224,41 @@ def execute_de(self):
224224
@staticmethod
225225
def get_genes(genes_path):
226226
""" Genes file can have one or two columns of gene information
227-
If two columns present, check if there are duplicates in 2nd col
228-
If no duplicates, use as var_names, else use 1st column
227+
Preferentially use gene names from second column.
228+
If duplicate gene names, check that 1st plus 2nd column provides uniqueness
229+
If unique when joined, join columns with pipe (|) for use as DE input
229230
"""
230231
genes_object = IngestFiles(genes_path, None)
231232
local_genes_path = genes_object.resolve_path(genes_path)[1]
232233

233234
genes_df = pd.read_csv(local_genes_path, sep="\t", header=None)
234235
if len(genes_df.columns) > 1:
235-
# unclear if falling back to gene_id is useful (SCP-4283)
236+
# if genes are not unique, try combining with gene_id (SCP-4283)
236237
# print so we're aware of dups during dev testing
237238
if genes_df[1].count() != genes_df[1].nunique():
238-
msg = (
239+
warning = (
239240
"dev_info: Features file contains duplicate identifiers in column 2"
240241
)
241-
print(msg)
242-
return genes_df[1].tolist()
242+
print(warning)
243+
genes_df['new_id'] = genes_df[[0, 1]].agg('|'.join, axis=1)
244+
if genes_df['new_id'].count() != genes_df['new_id'].nunique():
245+
msg = "Duplicates in features file even after joining gene_id and gene_name"
246+
log_exception(
247+
DifferentialExpression.dev_logger,
248+
DifferentialExpression.de_logger,
249+
msg,
250+
)
251+
raise ValueError(msg)
252+
else:
253+
return genes_df['new_id'].tolist()
254+
else:
255+
return genes_df[1].tolist()
243256
else:
244257
if genes_df[0].count() != genes_df[0].nunique():
245-
msg = (
258+
warning = (
246259
"dev_info: Features file contains duplicate identifiers in column 1"
247260
)
248-
print(msg)
261+
print(warning)
249262
return genes_df[0].tolist()
250263

251264
@staticmethod
@@ -286,6 +299,20 @@ def remove_single_sample_data(adata, annotation):
286299
adata = adata[adata.obs[annotation] != label]
287300
return adata
288301

302+
@staticmethod
303+
def delimiter_in_gene_name(rank):
304+
""" Check if pipe delimiter occurs in "names" column
305+
"""
306+
return rank['names'].str.contains('|', regex=False).any()
307+
308+
@staticmethod
309+
def extract_gene_id_for_out_file(rank):
310+
""" Separate out gene name from gene ID
311+
"""
312+
rank['gene_id'] = rank['names'].str.split('|').str[0]
313+
rank['names'] = rank['names'].str.split('|').str[1]
314+
return rank
315+
289316
@staticmethod
290317
def run_scanpy_de(
291318
cluster,
@@ -364,6 +391,8 @@ def run_scanpy_de(
364391
clean_annotation = re.sub(r'\W+', '_', annotation)
365392
DifferentialExpression.de_logger.info(f"Writing DE output for {group}")
366393
rank = sc.get.rank_genes_groups_df(adata, key=rank_key, group=group)
394+
if DifferentialExpression.delimiter_in_gene_name(rank):
395+
DifferentialExpression.extract_gene_id_for_out_file(rank)
367396

368397
out_file = f'{cluster_name}--{clean_annotation}--{clean_group}--{annot_scope}--{method}.tsv'
369398
# Round numbers to 4 significant digits while respecting fixed point
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
names scores logfoldchanges pvals pvals_adj pct_nz_group pct_nz_reference gene_id
2+
0 Sox17 41.46 11.63 0 0 0.4833 0.002793 ENSMUST00000027035
3+
1 Sox17 -5.059 -0.8853 4.225e-07 4.225e-07 0.1514 0.1939 ENSMUST00000195555

tests/data/differential_expression/sparse/reference/de_sparse_integration--cell_type__ontology_label--endothelial_cell--study--wilcoxon.tsv

Lines changed: 0 additions & 3 deletions
This file was deleted.
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
ENSMUST00000027035 Sox17
2+
ENSMUST00000195555 Sox17

tests/test_de.py

Lines changed: 52 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,45 @@ def test_assess_annotation(self):
8989
KeyError, DifferentialExpression.assess_annotation, test_annotation, cm
9090
)
9191

92+
def test_detect_duplicate_gene_names(self):
93+
"""Genes file can have one or two columns of gene information
94+
If two columns present, use the second column containing gene names
95+
unless there are duplicate gene names in the second column
96+
If duplicates, check that 1st plus 2nd column provides uniqueness
97+
If unique when joined, join columns with pipe (|) for use as DE input
98+
"""
99+
no_dup_genes_path = (
100+
"../tests/data/differential_expression/sparse/sparsemini_features.tsv"
101+
)
102+
no_dup_genes = DifferentialExpression.get_genes(no_dup_genes_path)
103+
self.assertNotIn(
104+
"|", no_dup_genes[0], f"no delimiter expected in {no_dup_genes[0]}"
105+
)
106+
107+
dup_genes_path = (
108+
"../tests/data/differential_expression/sparse/sparsemini_dup_gene_name.tsv"
109+
)
110+
dup_genes = DifferentialExpression.get_genes(dup_genes_path)
111+
self.assertIn("|", dup_genes[0], f"no delimiter expected in {dup_genes[0]}")
112+
113+
def test_delimiter_in_gene_name(self):
114+
delimited_data = {"names": ["Tns1", "Gfra1"], "scores": ["10.5", "10.34"]}
115+
delimited_df = pd.DataFrame(delimited_data)
116+
self.assertFalse(
117+
DifferentialExpression.delimiter_in_gene_name(delimited_df),
118+
"no pipe delimiter should be detected in the input",
119+
)
120+
121+
undelimited_data = {
122+
"names": ["ENSMUST00000027035|Sox17", "ENSMUST00000195555|Sox17"],
123+
"scores": ["41.459137", "-5.058518"],
124+
}
125+
undelimited_df = pd.DataFrame(undelimited_data)
126+
self.assertTrue(
127+
DifferentialExpression.delimiter_in_gene_name(undelimited_df),
128+
"expected pipe delimiter undetected",
129+
)
130+
92131
def test_de_remove_single_sample(self):
93132
""" Test single sample removal
94133
"""
@@ -237,15 +276,15 @@ def test_de_process_sparse(self):
237276
"../tests/data/differential_expression/sparse/sparsemini_cluster.txt",
238277
"addedfeed000000000000000",
239278
"dec0dedfeed0000000000000",
240-
"de_sparse_integration",
279+
"de_sparse_dup_gene",
241280
)
242281

243282
de_kwargs = {
244283
"study_accession": cm.study_accession,
245284
"name": cluster.name,
246285
"annotation_scope": test_scope,
247286
"method": test_method,
248-
"gene_file": "../tests/data/differential_expression/sparse/sparsemini_features.tsv",
287+
"gene_file": "../tests/data/differential_expression/sparse/sparsemini_dup_gene_name.tsv",
249288
"barcode_file": "../tests/data/differential_expression/sparse/sparsemini_barcodes.tsv",
250289
}
251290

@@ -273,26 +312,29 @@ def test_de_process_sparse(self):
273312
)
274313

275314
expected_file_path = (
276-
"../tests/de_sparse_integration--cell_type__ontology_label"
315+
"../tests/de_sparse_dup_gene--cell_type__ontology_label"
277316
"--endothelial_cell--study--wilcoxon.tsv"
278317
)
279318

280319
content = pd.read_csv(expected_file_path, sep="\t", index_col=0)
281320
# confirm expected gene in DE file at expected position
282321
self.assertEqual(
283322
content.iloc[1, 0],
284-
"Mrpl15",
285-
"Did not find expected gene, Mrpl15, at second row in DE file",
323+
"Sox17",
324+
"Did not find expected gene, Sox17, at second row in DE file.",
286325
)
287326
# confirm calculated value has expected significant digits
288327
self.assertEqual(
289328
content.iloc[0, 2],
290329
11.63,
291-
"Did not find expected logfoldchange value for Sox17 in DE file",
330+
"Did not find expected logfoldchange value for Sox17 in DE file.",
292331
)
332+
# confirm duplicate gene input generates expected gene_id info in output
333+
self.assertIn('gene_id', content.columns, "Expected gene_id output not found.")
293334

294335
# md5 checksum calculated using reference file in tests/data/differential_expression/sparse/reference
295-
expected_checksum = "07b6c6565430a17f4f048e7b4f53ddac"
336+
# file updated 2022-05-25 to include output for duplicate gene handling
337+
expected_checksum = "ca0c7dcc4048614f22d6bc7dec18a2c0"
296338

297339
# running DifferentialExpression via pytest results in output files in the tests dir
298340
with open(expected_file_path, "rb") as f:
@@ -301,16 +343,15 @@ def test_de_process_sparse(self):
301343
self.assertEqual(
302344
de_output_checksum,
303345
expected_checksum,
304-
"generated output file should match expected checksum",
346+
"Generated output file should match expected checksum.",
305347
)
306348

307349
arguments = {"cluster_name": cluster.name, "annotation_name": test_annotation}
308350
generated_output_match = DifferentialExpression.string_for_output_match(
309351
arguments
310352
)
311353
self.assertEqual(
312-
generated_output_match,
313-
"de_sparse_integration--cell_type__ontology_label*.tsv",
354+
generated_output_match, "de_sparse_dup_gene--cell_type__ontology_label*.tsv"
314355
)
315356

316357
with patch('ingest_files.IngestFiles.delocalize_file'):
@@ -325,9 +366,7 @@ def test_de_process_sparse(self):
325366
)
326367

327368
# clean up DE outputs
328-
output_wildcard_match = (
329-
f"../tests/de_sparse_integration--{test_annotation}*.tsv"
330-
)
369+
output_wildcard_match = f"../tests/de_sparse_dup_gene--{test_annotation}*.tsv"
331370
files = glob.glob(output_wildcard_match)
332371

333372
for file in files:

0 commit comments

Comments
 (0)