Skip to content

Commit 8fe80e7

Browse files
committed
More flexible transcript to gene maps and better biotype filtering
1 parent 4c286cf commit 8fe80e7

File tree

6 files changed

+401
-102
lines changed

6 files changed

+401
-102
lines changed

CITATION.cff

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,5 @@ keywords:
6464
- Python
6565
- scverse
6666
license: GPL-3.0-or-later
67-
commit: d9533f5
68-
version: 0.10.0
69-
date-released: '2024-11-20'
67+
version: 0.11.0
68+
date-released: '2024-11-23'

docs/source/example.ipynb

Lines changed: 196 additions & 42 deletions
Large diffs are not rendered by default.

pytximport/_cli.py

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
"""Expose the tximport function as a command-line tool."""
22

3-
import os
43
from logging import basicConfig, log, warning
4+
from pathlib import Path
55

66
import click
77
import numpy as np
@@ -214,6 +214,12 @@ def run( # type: ignore
214214
help="The annotation field to use as the target in the mapping file.",
215215
required=False,
216216
)
217+
@click.option(
218+
"--keep-biotype",
219+
"--keep_biotype",
220+
is_flag=True,
221+
help="Provide this flag to keep the gene_biotype column as an additional column in the mapping file.",
222+
)
217223
def create_map( # type: ignore
218224
**kwargs,
219225
) -> None:
@@ -224,11 +230,17 @@ def create_map( # type: ignore
224230
kwargs["input_file"],
225231
source_field=kwargs["source_field"] if kwargs["source_field"] else "transcript_id",
226232
target_field=kwargs["target_field"] if kwargs["target_field"] else "gene_id",
233+
keep_biotype=kwargs["keep_biotype"],
227234
)
228235
log(25, "Created the transcript-to-gene mapping file. Saving the file...")
229236

230-
if not os.path.exists(kwargs["output_file"]) or kwargs["output_path_overwrite"]:
231-
df.to_csv(kwargs["output_file"], sep="\t", index=False)
237+
output_file = Path(kwargs["output_file"])
238+
if not output_file.exists() or kwargs["output_path_overwrite"]:
239+
df.to_csv(
240+
kwargs["output_file"],
241+
sep=("," if kwargs["output_file"].endswith(".csv") else "\t"),
242+
index=False,
243+
)
232244
log(25, f"Saved the transcript-to-gene mapping file to {kwargs['output_file']}.")
233245
else:
234246
warning(

pytximport/core/_tximport.py

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -408,7 +408,7 @@ def tximport(
408408

409409
if biotype_filter is not None:
410410
transcript_data = filter_by_biotype(
411-
transcript_data, biotype_filter, id_column=("gene_id" if gene_level else "transcript_id")
411+
transcript_data, biotype_filter=biotype_filter, id_column=("gene_id" if gene_level else "transcript_id")
412412
)
413413

414414
# Remove appended gene names after underscore for RSEM data for both transcript and gene ids
@@ -515,6 +515,13 @@ def tximport(
515515
)
516516
output_format = "csv"
517517

518+
if output_path.suffix == ".h5ad" and output_format == "csv":
519+
warning(
520+
"The file extension of the `output_path` is `.h5ad` but the output format is `.csv`. "
521+
"Changing the output format to `.h5ad`."
522+
)
523+
output_format = "h5ad"
524+
518525
if output_format == "h5ad" and output_type != "anndata":
519526
warning(
520527
"The output format is h5ad but the output type is not anndata. Changing the output type to anndata."
@@ -630,8 +637,6 @@ def tximport(
630637
index=result.get_row_names(),
631638
columns=result.get_column_names(),
632639
)
633-
df_gene_data.sort_index(inplace=True)
634-
df_gene_data.to_csv(output_path, index=True, header=True, quoting=2)
635640
else:
636641
if isinstance(result, ad.AnnData):
637642
try:
@@ -649,8 +654,9 @@ def tximport(
649654
index=(result[result_index] if output_type != "anndata" else result.var.index),
650655
columns=(result.coords["file_path"].values if output_type != "anndata" else result.obs.index),
651656
)
652-
df_gene_data.sort_index(inplace=True)
653-
df_gene_data.to_csv(output_path, index=True, header=True, quoting=2)
657+
658+
df_gene_data.sort_index(inplace=True)
659+
df_gene_data.to_csv(output_path, index=True, header=True, quoting=2)
654660

655661
# End the timer
656662
log(25, f"Finished the import in {time() - start_time:.2f} seconds.")

pytximport/utils/_create_transcript_gene_map.py

Lines changed: 88 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import re
2-
from logging import warning
2+
from logging import log, warning
33
from pathlib import Path
4-
from typing import Any, Dict, Literal, Union
4+
from typing import Any, Dict, List, Literal, Union
55

66
import numpy as np
77
import pandas as pd
@@ -72,29 +72,81 @@ def create_transcript_gene_map(
7272
def create_transcript_gene_map_from_annotation(
7373
file_path: Union[str, Path],
7474
source_field: Literal["transcript_id", "transcript_name"] = "transcript_id",
75-
target_field: Literal["gene_id", "gene_name"] = "gene_id",
75+
target_field: Union[
76+
Literal["gene_id", "gene_name", "gene_biotype", "transcript_name"],
77+
List[Literal["gene_id", "gene_name", "gene_biotype", "transcript_name"]],
78+
] = "gene_id",
79+
use_transcript_name_as_replacement_id: bool = True,
80+
use_gene_name_as_replacement_id: bool = True,
7681
chunk_size: int = 100000,
77-
keep_biotype: bool = False,
7882
**kwargs: Dict[str, Any],
7983
) -> pd.DataFrame:
8084
"""Create a mapping from transcript ids to gene ids using a GTF annotation file.
8185
86+
Basic example:
87+
88+
.. code-block:: python
89+
90+
from pytximport.utils import create_transcript_gene_map_from_annotation
91+
92+
# Create a mapping from transcript ids to gene names
93+
transcript_gene_map = create_transcript_gene_map_from_annotation(
94+
"path/to/annotation.gtf",
95+
target_field="gene_name",
96+
)
97+
98+
# Create a mapping from transcript ids to transcript names and include the gene biotype
99+
transcript_gene_map = create_transcript_gene_map_from_annotation(
100+
"path/to/annotation.gtf",
101+
target_field=["transcript_name", "gene_biotype"],
102+
)
103+
82104
Args:
83105
file_path (Union[str, Path]): The path to the GTF annotation file.
84-
field (Literal["gene_id", "gene_name"], optional): The identifier to get for each transcript id.
106+
source_field (Literal["transcript_id", "transcript_name"], optional): The transcript identifier to use as the
107+
source column of the mapping. Defaults to "transcript_id".
108+
target_field (Union[Literal["gene_id", "gene_name", "gene_biotype", "transcript_name"],
109+
List[Literal["gene_id", "gene_name", "gene_biotype", "transcript_name"]]], optional): The corresponding
85110
identifier(s) to get for each transcript. Defaults to "gene_id".
86111
chunk_size (int, optional): The number of lines to read at a time. Defaults to 100000.
112+
use_transcript_name_as_replacement_id (bool, optional): Whether to use the transcript name as the transcript id
113+
if the transcript id is missing. Defaults to True.
114+
use_gene_name_as_replacement_id (bool, optional): Whether to use the gene name as the gene id if the gene id is
115+
missing. Defaults to True.
87116
keep_biotype (bool, optional): Deprecated, pass "gene_biotype" in target_field instead. Defaults to False.
88117
89118
Returns:
90119
pd.DataFrame: The mapping from transcript ids to gene ids.
91120
"""
121+
assert source_field != target_field, "The source_field and target_field must be different."
122+
92123
transcript_gene_map = pd.DataFrame(columns=["transcript_id", "gene_id", "gene_name", "gene_biotype"])
93124

94125
if "field" in kwargs:
95126
warning("The field argument is deprecated. Please use the source_field and target_field arguments instead.")
96127

97-
for chunk in pd.read_csv(file_path, sep="\t", chunksize=chunk_size, header=None, comment="#"):
128+
if "keep_biotype" in kwargs and kwargs["keep_biotype"]:
129+
warning("The keep_biotype argument is deprecated. Please use the target_field argument with a list instead.")
130+
if target_field != "gene_biotype" and not (isinstance(target_field, list) and "gene_biotype" in target_field):
131+
target_field = (
132+
[*target_field, "gene_biotype"] if isinstance(target_field, list) else [target_field, "gene_biotype"]
133+
)
134+
135+
if not isinstance(file_path, Path):
136+
file_path = Path(file_path)
137+
138+
if not Path(file_path).exists():
139+
raise FileNotFoundError(f"The file {file_path} does not exist.")
140+
141+
for chunk in pd.read_csv(
142+
file_path,
143+
sep="\t",
144+
chunksize=chunk_size,
145+
header=None,
146+
comment="#",
147+
compression=("gzip" if file_path.suffix == ".gz" else None),
148+
engine="c",
149+
):
98150
# See: https://www.ensembl.org/info/website/upload/gff.html
99151
chunk.columns = ["seqname", "source", "feature", "start", "end", "score", "strand", "frame", "attribute"]
100152

@@ -131,26 +183,46 @@ def create_transcript_gene_map_from_annotation(
131183
transcript_gene_map["gene_name"],
132184
)
133185

134-
if source_field == "transcript_name":
186+
# If transcript_name is the source field, drop transcript_id and rename transcript_name to transcript_id
187+
if source_field == "transcript_name" and use_transcript_name_as_replacement_id:
135188
transcript_gene_map.drop("transcript_id", axis=1, inplace=True)
136189
transcript_gene_map.rename(columns={"transcript_name": "transcript_id"}, inplace=True)
137190

138-
if target_field == "gene_name":
191+
source_field = "transcript_id"
192+
193+
# If gene_name is a target but gene_id is not, drop the gene_id and rename the gene_name to gene_id
194+
if (
195+
(target_field == "gene_name" or (isinstance(target_field, list) and "gene_name" in target_field))
196+
and not (target_field == "gene_id" or (isinstance(target_field, list) and "gene_id" in target_field))
197+
and use_gene_name_as_replacement_id
198+
):
199+
log(
200+
25,
201+
(
202+
"No gene_id target field was provided. Renaming gene_name to gene_id. "
203+
"You can disable this behavior by setting use_gene_name_as_replacement_id to False."
204+
),
205+
)
206+
139207
transcript_gene_map.drop("gene_id", axis=1, inplace=True)
140208
transcript_gene_map.rename(columns={"gene_name": "gene_id"}, inplace=True)
141209

142-
fields_to_keep = ["transcript_id", "gene_id"]
210+
if isinstance(target_field, list):
211+
target_field = [field if field != "gene_name" else "gene_id" for field in target_field]
212+
else:
213+
target_field = "gene_id" if target_field == "gene_name" else target_field
143214

144-
if keep_biotype and "gene_biotype" in transcript_gene_map.columns:
145-
fields_to_keep.append("gene_biotype")
215+
fields_to_keep = [source_field, *target_field] if isinstance(target_field, list) else [source_field, target_field]
146216

147217
transcript_gene_map = transcript_gene_map[fields_to_keep]
148-
149-
transcript_gene_map[["gene_id", "transcript_id"]] = transcript_gene_map[["gene_id", "transcript_id"]].replace(
150-
"", np.nan
151-
)
218+
transcript_gene_map.replace("", np.nan, inplace=True)
152219
transcript_gene_map.dropna(inplace=True)
153-
transcript_gene_map.drop_duplicates(subset=["gene_id", "transcript_id"], inplace=True)
220+
221+
if source_field == "transcript_id" and (
222+
target_field == "gene_id" or (isinstance(target_field, list) and "gene_id" in target_field)
223+
):
224+
transcript_gene_map.drop_duplicates(subset=["transcript_id", "gene_id"], inplace=True)
225+
154226
transcript_gene_map.reset_index(drop=True, inplace=True)
155227

156228
return transcript_gene_map

0 commit comments

Comments
 (0)