Merge pull request #257 from broadinstitute/development

jlchang · web-flow · commit a8332a36f636 · 2022-06-02T12:00:04.000-04:00
Release 1.19.0
diff --git a/ingest/de.py b/ingest/de.py
@@ -224,28 +224,41 @@ def execute_de(self):
     @staticmethod
     def get_genes(genes_path):
         """ Genes file can have one or two columns of gene information
-            If two columns present, check if there are duplicates in 2nd col
-            If no duplicates, use as var_names, else use 1st column
+            Preferentially use gene names from second column.
+            If gene names are duplicated, check that joining 1st and 2nd columns provides uniqueness
+            If unique when joined, join columns with pipe (|) for use as DE input; otherwise raise ValueError
         """
         genes_object = IngestFiles(genes_path, None)
         local_genes_path = genes_object.resolve_path(genes_path)[1]
 
         genes_df = pd.read_csv(local_genes_path, sep="\t", header=None)
         if len(genes_df.columns) > 1:
-            # unclear if falling back to gene_id is useful (SCP-4283)
+            # if genes are not unique, try combining with gene_id (SCP-4283)
             # print so we're aware of dups during dev testing
             if genes_df[1].count() != genes_df[1].nunique():
-                msg = (
+                warning = (
                     "dev_info: Features file contains duplicate identifiers in column 2"
                 )
-                print(msg)
-            return genes_df[1].tolist()
+                print(warning)
+                genes_df['new_id'] = genes_df[[0, 1]].agg('|'.join, axis=1)
+                if genes_df['new_id'].count() != genes_df['new_id'].nunique():
+                    msg = "Duplicates in features file even after joining gene_id and gene_name"
+                    log_exception(
+                        DifferentialExpression.dev_logger,
+                        DifferentialExpression.de_logger,
+                        msg,
+                    )
+                    raise ValueError(msg)
+                else:
+                    return genes_df['new_id'].tolist()
+            else:
+                return genes_df[1].tolist()
         else:
             if genes_df[0].count() != genes_df[0].nunique():
-                msg = (
+                warning = (
                     "dev_info: Features file contains duplicate identifiers in column 1"
                 )
-                print(msg)
+                print(warning)
             return genes_df[0].tolist()
 
     @staticmethod
@@ -286,6 +299,20 @@ def remove_single_sample_data(adata, annotation):
                 adata = adata[adata.obs[annotation] != label]
         return adata
 
+    @staticmethod
+    def delimiter_in_gene_name(rank):
+        """ Check if pipe delimiter occurs in "names" column
+        """
+        return rank['names'].str.contains('|', regex=False).any()
+
+    @staticmethod
+    def extract_gene_id_for_out_file(rank):
+        """ Split pipe-joined "names" values: text before the pipe goes to "gene_id", text after it stays in "names"
+        """
+        rank['gene_id'] = rank['names'].str.split('|').str[0]
+        rank['names'] = rank['names'].str.split('|').str[1]
+        return rank
+
     @staticmethod
     def run_scanpy_de(
         cluster,
@@ -364,6 +391,8 @@ def run_scanpy_de(
             clean_annotation = re.sub(r'\W+', '_', annotation)
             DifferentialExpression.de_logger.info(f"Writing DE output for {group}")
             rank = sc.get.rank_genes_groups_df(adata, key=rank_key, group=group)
+            if DifferentialExpression.delimiter_in_gene_name(rank):
+                DifferentialExpression.extract_gene_id_for_out_file(rank)
 
             out_file = f'{cluster_name}--{clean_annotation}--{clean_group}--{annot_scope}--{method}.tsv'
             # Round numbers to 4 significant digits while respecting fixed point
diff --git a/tests/data/differential_expression/sparse/reference/de_sparse_dup_gene--cell_type__ontology_label--endothelial_cell--study--wilcoxon.tsv b/tests/data/differential_expression/sparse/reference/de_sparse_dup_gene--cell_type__ontology_label--endothelial_cell--study--wilcoxon.tsv
@@ -0,0 +1,3 @@
+	names	scores	logfoldchanges	pvals	pvals_adj	pct_nz_group	pct_nz_reference	gene_id
+0	Sox17	41.46	11.63	0	0	0.4833	0.002793	ENSMUST00000027035
+1	Sox17	-5.059	-0.8853	4.225e-07	4.225e-07	0.1514	0.1939	ENSMUST00000195555
diff --git a/tests/data/differential_expression/sparse/reference/de_sparse_integration--cell_type__ontology_label--endothelial_cell--study--wilcoxon.tsv b/tests/data/differential_expression/sparse/reference/de_sparse_integration--cell_type__ontology_label--endothelial_cell--study--wilcoxon.tsv
diff --git a/tests/data/differential_expression/sparse/sparsemini_dup_gene_name.tsv b/tests/data/differential_expression/sparse/sparsemini_dup_gene_name.tsv
@@ -0,0 +1,2 @@
+ENSMUST00000027035	Sox17
+ENSMUST00000195555	Sox17
diff --git a/tests/test_de.py b/tests/test_de.py
@@ -89,6 +89,45 @@ def test_assess_annotation(self):
             KeyError, DifferentialExpression.assess_annotation, test_annotation, cm
         )
 
+    def test_detect_duplicate_gene_names(self):
+        """Genes file can have one or two columns of gene information
+            If two columns present, use the second column containing gene names
+            unless there are duplicate gene names in the second column
+            If duplicates, check that 1st plus 2nd column provides uniqueness
+            If unique when joined, join columns with pipe (|) for use as DE input
+        """
+        no_dup_genes_path = (
+            "../tests/data/differential_expression/sparse/sparsemini_features.tsv"
+        )
+        no_dup_genes = DifferentialExpression.get_genes(no_dup_genes_path)
+        self.assertNotIn(
+            "|", no_dup_genes[0], f"no delimiter expected in {no_dup_genes[0]}"
+        )
+
+        dup_genes_path = (
+            "../tests/data/differential_expression/sparse/sparsemini_dup_gene_name.tsv"
+        )
+        dup_genes = DifferentialExpression.get_genes(dup_genes_path)
+        self.assertIn("|", dup_genes[0], f"delimiter expected in {dup_genes[0]}")
+
+    def test_delimiter_in_gene_name(self):
+        delimited_data = {"names": ["Tns1", "Gfra1"], "scores": ["10.5", "10.34"]}
+        delimited_df = pd.DataFrame(delimited_data)
+        self.assertFalse(
+            DifferentialExpression.delimiter_in_gene_name(delimited_df),
+            "no pipe delimiter should be detected in the input",
+        )
+
+        undelimited_data = {
+            "names": ["ENSMUST00000027035|Sox17", "ENSMUST00000195555|Sox17"],
+            "scores": ["41.459137", "-5.058518"],
+        }
+        undelimited_df = pd.DataFrame(undelimited_data)
+        self.assertTrue(
+            DifferentialExpression.delimiter_in_gene_name(undelimited_df),
+            "expected pipe delimiter undetected",
+        )
+
     def test_de_remove_single_sample(self):
         """ Test single sample removal
         """
@@ -237,15 +276,15 @@ def test_de_process_sparse(self):
             "../tests/data/differential_expression/sparse/sparsemini_cluster.txt",
             "addedfeed000000000000000",
             "dec0dedfeed0000000000000",
-            "de_sparse_integration",
+            "de_sparse_dup_gene",
         )
 
         de_kwargs = {
             "study_accession": cm.study_accession,
             "name": cluster.name,
             "annotation_scope": test_scope,
             "method": test_method,
-            "gene_file": "../tests/data/differential_expression/sparse/sparsemini_features.tsv",
+            "gene_file": "../tests/data/differential_expression/sparse/sparsemini_dup_gene_name.tsv",
             "barcode_file": "../tests/data/differential_expression/sparse/sparsemini_barcodes.tsv",
         }
 
@@ -273,26 +312,29 @@ def test_de_process_sparse(self):
         )
 
         expected_file_path = (
-            "../tests/de_sparse_integration--cell_type__ontology_label"
+            "../tests/de_sparse_dup_gene--cell_type__ontology_label"
             "--endothelial_cell--study--wilcoxon.tsv"
         )
 
         content = pd.read_csv(expected_file_path, sep="\t", index_col=0)
         # confirm expected gene in DE file at expected position
         self.assertEqual(
             content.iloc[1, 0],
-            "Mrpl15",
-            "Did not find expected gene, Mrpl15, at second row in DE file",
+            "Sox17",
+            "Did not find expected gene, Sox17, at second row in DE file.",
         )
         # confirm calculated value has expected significant digits
         self.assertEqual(
             content.iloc[0, 2],
             11.63,
-            "Did not find expected logfoldchange value for Sox17 in DE file",
+            "Did not find expected logfoldchange value for Sox17 in DE file.",
         )
+        # confirm duplicate gene input generates expected gene_id info in output
+        self.assertIn('gene_id', content.columns, "Expected gene_id output not found.")
 
         # md5 checksum calculated using reference file in tests/data/differential_expression/sparse/reference
-        expected_checksum = "07b6c6565430a17f4f048e7b4f53ddac"
+        # file updated 2022-05-25 to include output for duplicate gene handling
+        expected_checksum = "ca0c7dcc4048614f22d6bc7dec18a2c0"
 
         # running DifferentialExpression via pytest results in output files in the tests dir
         with open(expected_file_path, "rb") as f:
@@ -301,16 +343,15 @@ def test_de_process_sparse(self):
         self.assertEqual(
             de_output_checksum,
             expected_checksum,
-            "generated output file should match expected checksum",
+            "Generated output file should match expected checksum.",
         )
 
         arguments = {"cluster_name": cluster.name, "annotation_name": test_annotation}
         generated_output_match = DifferentialExpression.string_for_output_match(
             arguments
         )
         self.assertEqual(
-            generated_output_match,
-            "de_sparse_integration--cell_type__ontology_label*.tsv",
+            generated_output_match, "de_sparse_dup_gene--cell_type__ontology_label*.tsv"
         )
 
         with patch('ingest_files.IngestFiles.delocalize_file'):
@@ -325,9 +366,7 @@ def test_de_process_sparse(self):
             )
 
         # clean up DE outputs
-        output_wildcard_match = (
-            f"../tests/de_sparse_integration--{test_annotation}*.tsv"
-        )
+        output_wildcard_match = f"../tests/de_sparse_dup_gene--{test_annotation}*.tsv"
         files = glob.glob(output_wildcard_match)
 
         for file in files:

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+ names scores logfoldchanges pvals pvals_adj pct_nz_group pct_nz_reference gene_id`
	`2`	`+0 Sox17 41.46 11.63 0 0 0.4833 0.002793 ENSMUST00000027035`
	`3`	`+1 Sox17 -5.059 -0.8853 4.225e-07 4.225e-07 0.1514 0.1939 ENSMUST00000195555`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+ENSMUST00000027035 Sox17`
	`2`	`+ENSMUST00000195555 Sox17`