More flexible transcript to gene maps and better biotype filtering

maltekuehl · maltekuehl · commit 8fe80e72233b · 2024-11-25T01:50:12.000+01:00
diff --git a/CITATION.cff b/CITATION.cff
@@ -64,6 +64,5 @@ keywords:
   - Python
   - scverse
 license: GPL-3.0-or-later
-commit: d9533f5
-version: 0.10.0
-date-released: '2024-11-20'
+version: 0.11.0
+date-released: '2024-11-23'
diff --git a/docs/source/example.ipynb b/docs/source/example.ipynb
diff --git a/pytximport/_cli.py b/pytximport/_cli.py
@@ -1,7 +1,7 @@
 """Expose the tximport function as a command-line tool."""
 
-import os
 from logging import basicConfig, log, warning
+from pathlib import Path
 
 import click
 import numpy as np
@@ -214,6 +214,12 @@ def run(  # type: ignore
     help="The annotation field to use as the target in the mapping file.",
     required=False,
 )
+@click.option(
+    "--keep-biotype",
+    "--keep_biotype",
+    is_flag=True,
+    help="Provide this flag to keep the gene_biotype column as an additional column in the mapping file.",
+)
 def create_map(  # type: ignore
     **kwargs,
 ) -> None:
@@ -224,11 +230,17 @@ def create_map(  # type: ignore
         kwargs["input_file"],
         source_field=kwargs["source_field"] if kwargs["source_field"] else "transcript_id",
         target_field=kwargs["target_field"] if kwargs["target_field"] else "gene_id",
+        keep_biotype=kwargs["keep_biotype"],
     )
     log(25, "Created the transcript-to-gene mapping file. Saving the file...")
 
-    if not os.path.exists(kwargs["output_file"]) or kwargs["output_path_overwrite"]:
-        df.to_csv(kwargs["output_file"], sep="\t", index=False)
+    output_file = Path(kwargs["output_file"])
+    if not output_file.exists() or kwargs["output_path_overwrite"]:
+        df.to_csv(
+            kwargs["output_file"],
+            sep=("," if kwargs["output_file"].endswith(".csv") else "\t"),
+            index=False,
+        )
         log(25, f"Saved the transcript-to-gene mapping file to {kwargs['output_file']}.")
     else:
         warning(
diff --git a/pytximport/core/_tximport.py b/pytximport/core/_tximport.py
@@ -408,7 +408,7 @@ def tximport(
 
     if biotype_filter is not None:
         transcript_data = filter_by_biotype(
-            transcript_data, biotype_filter, id_column=("gene_id" if gene_level else "transcript_id")
+            transcript_data, biotype_filter=biotype_filter, id_column=("gene_id" if gene_level else "transcript_id")
         )
 
     # Remove appended gene names after underscore for RSEM data for both transcript and gene ids
@@ -515,6 +515,13 @@ def tximport(
             )
             output_format = "csv"
 
+        if output_path.suffix == ".h5ad" and output_format == "csv":
+            warning(
+                "The file extension of the `output_path` is `.h5ad` but the output format is `.csv`. "
+                "Changing the output format to `.h5ad`."
+            )
+            output_format = "h5ad"
+
         if output_format == "h5ad" and output_type != "anndata":
             warning(
                 "The output format is h5ad but the output type is not anndata. Changing the output type to anndata."
@@ -630,8 +637,6 @@ def tximport(
                     index=result.get_row_names(),
                     columns=result.get_column_names(),
                 )
-                df_gene_data.sort_index(inplace=True)
-                df_gene_data.to_csv(output_path, index=True, header=True, quoting=2)
             else:
                 if isinstance(result, ad.AnnData):
                     try:
@@ -649,8 +654,9 @@ def tximport(
                     index=(result[result_index] if output_type != "anndata" else result.var.index),
                     columns=(result.coords["file_path"].values if output_type != "anndata" else result.obs.index),
                 )
-                df_gene_data.sort_index(inplace=True)
-                df_gene_data.to_csv(output_path, index=True, header=True, quoting=2)
+
+            df_gene_data.sort_index(inplace=True)
+            df_gene_data.to_csv(output_path, index=True, header=True, quoting=2)
 
     # End the timer
     log(25, f"Finished the import in {time() - start_time:.2f} seconds.")
diff --git a/pytximport/utils/_create_transcript_gene_map.py b/pytximport/utils/_create_transcript_gene_map.py
@@ -1,7 +1,7 @@
 import re
-from logging import warning
+from logging import log, warning
 from pathlib import Path
-from typing import Any, Dict, Literal, Union
+from typing import Any, Dict, List, Literal, Union
 
 import numpy as np
 import pandas as pd
@@ -72,29 +72,81 @@ def create_transcript_gene_map(
 def create_transcript_gene_map_from_annotation(
     file_path: Union[str, Path],
     source_field: Literal["transcript_id", "transcript_name"] = "transcript_id",
-    target_field: Literal["gene_id", "gene_name"] = "gene_id",
+    target_field: Union[
+        Literal["gene_id", "gene_name", "gene_biotype", "transcript_name"],
+        List[Literal["gene_id", "gene_name", "gene_biotype", "transcript_name"]],
+    ] = "gene_id",
+    use_transcript_name_as_replacement_id: bool = True,
+    use_gene_name_as_replacement_id: bool = True,
     chunk_size: int = 100000,
-    keep_biotype: bool = False,
     **kwargs: Dict[str, Any],
 ) -> pd.DataFrame:
     """Create a mapping from transcript ids to gene ids using a GTF annotation file.
 
+    Basic example:
+
+    .. code-block:: python
+
+            from pytximport.utils import create_transcript_gene_map_from_annotation
+
+            # Create a mapping from transcript ids to gene names
+            transcript_gene_map = create_transcript_gene_map_from_annotation(
+                "path/to/annotation.gtf",
+                target_field="gene_name",
+            )
+
+            # Create a mapping from transcript ids to transcript names and include the gene biotype
+            transcript_gene_map = create_transcript_gene_map_from_annotation(
+                "path/to/annotation.gtf",
+                target_field=["transcript_name", "gene_biotype"],
+            )
+
     Args:
         file_path (Union[str, Path]): The path to the GTF annotation file.
-        field (Literal["gene_id", "gene_name"], optional): The identifier to get for each transcript id.
+        source_field (Literal["transcript_id", "transcript_name"], optional): The transcript identifier to map from.
+            Defaults to "transcript_id".
+        target_field (Union[Literal["gene_id", "gene_name", "gene_biotype", "transcript_name"], List[Literal[...]]],
+            optional): The corresponding identifier(s) to get for each transcript. Pass a list to keep several.
             Defaults to "gene_id".
         chunk_size (int, optional): The number of lines to read at a time. Defaults to 100000.
+        use_transcript_name_as_replacement_id (bool, optional): Whether to use the transcript name as the transcript id
+            if the transcript id is missing. Defaults to True.
+        use_gene_name_as_replacement_id (bool, optional): Whether to use the gene name as the gene id if the gene id is
+            missing. Defaults to True.
         keep_biotype (bool, optional): Whether to keep the gene_biotype column. Defaults to False.
 
     Returns:
         pd.DataFrame: The mapping from transcript ids to gene ids.
     """
+    assert source_field != target_field, "The source_field and target_field must be different."
+
     transcript_gene_map = pd.DataFrame(columns=["transcript_id", "gene_id", "gene_name", "gene_biotype"])
 
     if "field" in kwargs:
         warning("The field argument is deprecated. Please use the source_field and target_field arguments instead.")
 
-    for chunk in pd.read_csv(file_path, sep="\t", chunksize=chunk_size, header=None, comment="#"):
+    if "keep_biotype" in kwargs and kwargs["keep_biotype"]:
+        warning("The keep_biotype argument is deprecated. Please use the target_field argument with a list instead.")
+        if target_field != "gene_biotype" and not (isinstance(target_field, list) and "gene_biotype" in target_field):
+            target_field = (
+                [*target_field, "gene_biotype"] if isinstance(target_field, list) else [target_field, "gene_biotype"]
+            )
+
+    if not isinstance(file_path, Path):
+        file_path = Path(file_path)
+
+    if not Path(file_path).exists():
+        raise FileNotFoundError(f"The file {file_path} does not exist.")
+
+    for chunk in pd.read_csv(
+        file_path,
+        sep="\t",
+        chunksize=chunk_size,
+        header=None,
+        comment="#",
+        compression=("gzip" if file_path.suffix == ".gz" else None),
+        engine="c",
+    ):
         # See: https://www.ensembl.org/info/website/upload/gff.html
         chunk.columns = ["seqname", "source", "feature", "start", "end", "score", "strand", "frame", "attribute"]
 
@@ -131,26 +183,46 @@ def create_transcript_gene_map_from_annotation(
         transcript_gene_map["gene_name"],
     )
 
-    if source_field == "transcript_name":
+    # If transcript_name is the source field, drop the transcript_id and rename transcript_name to transcript_id
+    if source_field == "transcript_name" and use_transcript_name_as_replacement_id:
         transcript_gene_map.drop("transcript_id", axis=1, inplace=True)
         transcript_gene_map.rename(columns={"transcript_name": "transcript_id"}, inplace=True)
 
-    if target_field == "gene_name":
+        source_field = "transcript_id"
+
+    # If gene_name is requested but gene_id is not, drop the gene_id and rename the gene_name to gene_id
+    if (
+        (target_field == "gene_name" or (isinstance(target_field, list) and "gene_name" in target_field))
+        and not (target_field == "gene_id" or (isinstance(target_field, list) and "gene_id" in target_field))
+        and use_gene_name_as_replacement_id
+    ):
+        log(
+            25,
+            (
+                "No gene_id target field was provided. Renaming gene_name to gene_id. "
+                "You can disable this behavior by setting use_gene_name_as_replacement_id to False."
+            ),
+        )
+
         transcript_gene_map.drop("gene_id", axis=1, inplace=True)
         transcript_gene_map.rename(columns={"gene_name": "gene_id"}, inplace=True)
 
-    fields_to_keep = ["transcript_id", "gene_id"]
+        if isinstance(target_field, list):
+            target_field = [field if field != "gene_name" else "gene_id" for field in target_field]
+        else:
+            target_field = "gene_id" if target_field == "gene_name" else target_field
 
-    if keep_biotype and "gene_biotype" in transcript_gene_map.columns:
-        fields_to_keep.append("gene_biotype")
+    fields_to_keep = [source_field, *target_field] if isinstance(target_field, list) else [source_field, target_field]
 
     transcript_gene_map = transcript_gene_map[fields_to_keep]
-
-    transcript_gene_map[["gene_id", "transcript_id"]] = transcript_gene_map[["gene_id", "transcript_id"]].replace(
-        "", np.nan
-    )
+    transcript_gene_map.replace("", np.nan, inplace=True)
     transcript_gene_map.dropna(inplace=True)
-    transcript_gene_map.drop_duplicates(subset=["gene_id", "transcript_id"], inplace=True)
+
+    if source_field == "transcript_id" and (
+        target_field == "gene_id" or (isinstance(target_field, list) and "gene_id" in target_field)
+    ):
+        transcript_gene_map.drop_duplicates(subset=["transcript_id", "gene_id"], inplace=True)
+
     transcript_gene_map.reset_index(drop=True, inplace=True)
 
     return transcript_gene_map
diff --git a/pytximport/utils/_filter_by_biotype.py b/pytximport/utils/_filter_by_biotype.py