Skip to content

Commit

Permalink
Extend cell cycle gene scoring (#421)
Browse files Browse the repository at this point in the history
* refactor cell cycle gene handling

* add new gene sets for C. elegans and zebrafish and re-parse Tirosh genes from beginning

* add test datasets for C. elegans and zebrafish

* separate function for cell cycle gene set retrieval

* smart use of gene ID or gene name depending on data

* add gene scoring test for zebrafish and c_elegans

* simplify random gene selection for error reporting

* use Literal for organism

* add list of possible organism names to error message

* use scanpy backup_url functionality

* add assertions for gene score columns
  • Loading branch information
mumichae authored Jan 13, 2025
1 parent e614d35 commit 203b94f
Show file tree
Hide file tree
Showing 10 changed files with 559 additions and 29 deletions.
1 change: 1 addition & 0 deletions docs/source/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ Please refer to the `Single Cell Best Practices Book`_ for more details.
hvg_intersect
hvg_batch
score_cell_cycle
get_cell_cycle_genes
reduce_data


Expand Down
142 changes: 114 additions & 28 deletions scib/preprocessing.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
import logging
import re
import tempfile
from typing import Literal

import anndata as ad
import numpy as np
import pandas as pd
import scanpy as sc
Expand Down Expand Up @@ -642,8 +644,24 @@ def reduce_data(
sc.tl.umap(adata)


# Cell Cycle
def score_cell_cycle(adata, organism="mouse"):
def score_cell_cycle(
adata: ad.AnnData,
organism: Literal[
"mouse",
"mus musculus",
"mus_musculus",
"human",
"homo sapiens",
"homo_sapiens",
"c_elegans",
"c elegans",
"caenorhabditis elegans",
"caenorhabditis_elegans",
"zebrafish",
"danio rerio",
"danio_rerio",
] = "mouse",
):
"""Score cell cycle score given an organism
Wrapper function for `scanpy.tl.score_genes_cell_cycle`_
Expand All @@ -653,43 +671,111 @@ def score_cell_cycle(adata, organism="mouse"):
Tirosh et al. cell cycle marker genes downloaded from
https://raw.githubusercontent.com/theislab/scanpy_usage/master/180209_cell_cycle/data/regev_lab_cell_cycle_genes.txt
For human, mouse genes are capitalised and used directly. This is under the assumption that cell cycle genes are
well conserved across species.
See more on gene sets in :func:`~scib.preprocessing.get_cell_cycle_genes`.
This function picks gene IDs or gene names of the cell cycle genes, depending on what is present in the adata object.
:param adata: anndata object containing
:param organism: organism of gene names to match cell cycle genes
:return: tuple of ``(s_genes, g2m_genes)`` of S-phase genes and G2- and M-phase genes scores
"""
import pathlib

root = pathlib.Path(__file__).parent

cc_files = {
"mouse": [
root / "resources/s_genes_tirosh.txt",
root / "resources/g2m_genes_tirosh.txt",
],
"human": [
root / "resources/s_genes_tirosh_hm.txt",
root / "resources/g2m_genes_tirosh_hm.txt",
],
}

with open(cc_files[organism][0]) as f:
s_genes = [x.strip() for x in f.readlines() if x.strip() in adata.var.index]
with open(cc_files[organism][1]) as f:
g2m_genes = [x.strip() for x in f.readlines() if x.strip() in adata.var.index]
def filter_genes(adata: ad.AnnData, df: pd.DataFrame, columns: list = None):
if columns is None:
columns = ["gene_name", "gene_id"]
elif isinstance(columns, str):
columns = [columns]

n_genes = 0
for col in columns:
_genes = [g for g in df[col] if g in adata.var_names]
if len(_genes) > n_genes: # pick largest overlapping set
n_genes = len(_genes)
genes = _genes

if n_genes == 0:
# pick random genes for error message
rand_genes = np.random.choice(adata.var_names, 10)
raise ValueError(
f"cell cycle genes not in adata\n organism: {organism}\n varnames: {rand_genes}\n cell cycle genes:\n {df}"
)
return genes

if (len(s_genes) == 0) or (len(g2m_genes) == 0):
rand_choice = np.random.randint(1, adata.n_vars, 10)
rand_genes = adata.var_names[rand_choice].tolist()
raise ValueError(
f"cell cycle genes not in adata\n organism: {organism}\n varnames: {rand_genes}"
)
# get gene sets
gene_map = get_cell_cycle_genes(organism)

# filter gene sets across data
s_genes = filter_genes(adata, gene_map.query("phase == 'S'"))
g2m_genes = filter_genes(adata, gene_map.query("phase == 'G2/M'"))

# compute scores
sc.tl.score_genes_cell_cycle(adata, s_genes=s_genes, g2m_genes=g2m_genes)


def get_cell_cycle_genes(
organism: Literal[
"mouse",
"mus musculus",
"mus_musculus",
"human",
"homo sapiens",
"homo_sapiens",
"c_elegans",
"c elegans",
"caenorhabditis elegans",
"caenorhabditis_elegans",
"zebrafish",
"danio rerio",
"danio_rerio",
]
):
"""
Get cell cycle genes for a given organism
Tirosh et al. cell cycle marker genes downloaded from
https://raw.githubusercontent.com/theislab/scanpy_usage/master/180209_cell_cycle/data/regev_lab_cell_cycle_genes.txt
For human, mouse genes are capitalised and used directly. This is under the assumption that cell cycle genes are
well conserved across species
For organisms other than human or mouse, orthlogy-mapped datasets from Tinyaltas were used:
https://github.com/hbc/tinyatlas/tree/master/cell_cycle
:param organism: organism of gene names to match cell cycle genes
:param identifier: gene identifier to use. options: "gene_name", "gene_id"
"""
from pathlib import Path

organism_map = {
"mouse": "mus_musculus",
"mus musculus": "mus_musculus",
"human": "homo_sapiens",
"homo sapiens": "homo_sapiens",
"c_elegans": "caenorhabditis_elegans",
"caenorhabditis elegans": "caenorhabditis_elegans",
"c elegans": "caenorhabditis_elegans",
"zebrafish": "danio_rerio",
"danio rerio": "danio_rerio",
}
# additionally map each key to itself to make them available as well
organism_map |= {x: x for x in organism_map.values()}

# get lower-case organism name
organism = organism.lower()

assert (
organism in organism_map
), f"organism '{organism}' not supported. Supported organisms: {list(organism_map.keys())}"

# get organism name needed for retrieving correct file
organism = organism_map[organism]

# read gene sets
gene_file = Path(__file__).parent / "resources" / f"cell_cycle_genes_{organism}.tsv"
assert gene_file.exists(), f"{gene_file} doesn't exist"
return pd.read_table(gene_file)


def save_seurat(adata, path, batch, hvgs=None):
"""Save an ``anndata`` object to file as a Seurat object
Expand Down
18 changes: 18 additions & 0 deletions scib/resources/cell_cycle_genes_caenorhabditis_elegans.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
phase modified gene_id gene_name
G2/M 2024-07-04 WBGene00006974 zen-4
G2/M 2024-07-04 WBGene00000257 bmk-1
G2/M 2024-07-04 WBGene00000405 cdk-1
G2/M 2024-07-04 WBGene00000099 air-2
S 2024-07-04 WBGene00011912 T22C1.1
S 2024-07-04 WBGene00004338 rfc-2
S 2024-07-04 WBGene00004297 rad-51
S 2024-07-04 WBGene00003154 mcm-2
S 2024-07-04 WBGene00013241 ung-1
S 2024-07-04 WBGene00009372 evl-18
S 2024-07-04 WBGene00000382 cdc-6
S 2024-07-04 WBGene00003418 msh-2
S 2024-07-04 WBGene00003156 mcm-4
S 2024-07-04 WBGene00009287 psf-2
S 2024-07-04 WBGene00022141 chaf-2
S 2024-07-04 WBGene00000794 crn-1
S 2024-07-04 WBGene00022455 tyms-1
47 changes: 47 additions & 0 deletions scib/resources/cell_cycle_genes_danio_rerio.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
phase modified gene_id gene_name
G2/M 2018-10-19 ENSDARG00000078654 tpx2
G2/M 2018-10-19 ENSDARG00000075621 birc5a
G2/M 2018-10-19 ENSDARG00000001313 g2e3
G2/M 2018-10-19 ENSDARG00000061187 cbx5
G2/M 2018-10-19 ENSDARG00000056621 ctcf
G2/M 2018-10-19 ENSDARG00000041361 ttk
G2/M 2018-10-19 ENSDARG00000038882 smc4
G2/M 2018-10-19 ENSDARG00000005619 nek2
G2/M 2018-10-19 ENSDARG00000055133 cenpf
G2/M 2018-10-19 ENSDARG00000117089 CKS2
G2/M 2018-10-19 ENSDARG00000024488 top2a
G2/M 2018-10-19 ENSDARG00000043137 cdca8
G2/M 2018-10-19 ENSDARG00000002403 nusap1
G2/M 2018-10-19 ENSDARG00000010948 kif11
G2/M 2018-10-19 ENSDARG00000054804 anp32e
G2/M 2018-10-19 ENSDARG00000014013 lbr
G2/M 2018-10-19 ENSDARG00000036180 ccnb2
G2/M 2018-10-19 ENSDARG00000029722 hmgb2a
G2/M 2018-10-19 ENSDARG00000087554 cdk1
G2/M 2018-10-19 ENSDARG00000007971 cks1b
G2/M 2018-10-19 ENSDARG00000102674 ckap5
S 2018-10-19 ENSDARG00000057683 mcm6
S 2018-10-19 ENSDARG00000043720 cdc45
S 2018-10-19 ENSDARG00000018022 msh2
S 2018-10-19 ENSDARG00000019507 mcm5
S 2018-10-19 ENSDARG00000045308 pola1
S 2018-10-19 ENSDARG00000040041 mcm4
S 2018-10-19 ENSDARG00000035957 gmnn
S 2018-10-19 ENSDARG00000037188 rpa2
S 2018-10-19 ENSDARG00000057738 hells
S 2018-10-19 ENSDARG00000057323 e2f8
S 2018-10-19 ENSDARG00000002304 gins2
S 2018-10-19 ENSDARG00000054155 pcna
S 2018-10-19 ENSDARG00000039208 nasp
S 2018-10-19 ENSDARG00000074410 brip1
S 2018-10-19 ENSDARG00000019907 dscc1
S 2018-10-19 ENSDARG00000023002 dtl
S 2018-10-19 ENSDARG00000077620 cdca7a
S 2018-10-19 ENSDARG00000056473 chaf1b
S 2018-10-19 ENSDARG00000056414 usp1
S 2018-10-19 ENSDARG00000100558 slbp
S 2018-10-19 ENSDARG00000014017 rrm1
S 2018-10-19 ENSDARG00000011404 fen1
S 2018-10-19 ENSDARG00000056832 exo1
S 2018-10-19 ENSDARG00000042894 tyms
S 2018-10-19 ENSDARG00000103409 uhrf1
98 changes: 98 additions & 0 deletions scib/resources/cell_cycle_genes_homo_sapiens.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
gene_name gene_id phase
MCM5 ENSG00000100297 S
PCNA ENSG00000132646 S
TYMS ENSG00000176890 S
FEN1 ENSG00000168496 S
MCM2 ENSG00000073111 S
MCM4 ENSG00000104738 S
RRM1 ENSG00000167325 S
UNG ENSG00000076248 S
GINS2 ENSG00000131153 S
MCM6 ENSG00000076003 S
CDCA7 ENSG00000144354 S
DTL ENSG00000143476 S
PRIM1 ENSG00000198056 S
UHRF1 ENSG00000276043 S
MLF1IP ENSG00000151725 S
HELLS ENSG00000119969 S
RFC2 ENSG00000049541 S
RPA2 ENSG00000117748 S
NASP ENSG00000132780 S
RAD51AP1 ENSG00000111247 S
GMNN ENSG00000112312 S
WDR76 ENSG00000092470 S
SLBP ENSG00000163950 S
CCNE2 ENSG00000175305 S
UBR7 ENSG00000012963 S
POLD3 ENSG00000077514 S
MSH2 ENSG00000095002 S
ATAD2 ENSG00000156802 S
RAD51 ENSG00000051180 S
RRM2 ENSG00000171848 S
CDC45 ENSG00000093009 S
CDC6 ENSG00000094804 S
EXO1 ENSG00000174371 S
TIPIN ENSG00000075131 S
DSCC1 ENSG00000136982 S
BLM ENSG00000197299 S
CASP8AP2 ENSG00000118412 S
USP1 ENSG00000162607 S
CLSPN ENSG00000092853 S
POLA1 ENSG00000101868 S
CHAF1B ENSG00000159259 S
BRIP1 ENSG00000136492 S
E2F8 ENSG00000129173 S
HMGB2 ENSG00000164104 G2/M
CDK1 ENSG00000170312 G2/M
NUSAP1 ENSG00000137804 G2/M
UBE2C ENSG00000175063 G2/M
BIRC5 ENSG00000089685 G2/M
TPX2 ENSG00000088325 G2/M
TOP2A ENSG00000131747 G2/M
NDC80 ENSG00000080986 G2/M
CKS2 ENSG00000123975 G2/M
NUF2 ENSG00000143228 G2/M
CKS1B ENSG00000173207 G2/M
MKI67 ENSG00000148773 G2/M
TMPO ENSG00000120802 G2/M
CENPF ENSG00000117724 G2/M
TACC3 ENSG00000013810 G2/M
FAM64A ENSG00000129195 G2/M
SMC4 ENSG00000113810 G2/M
CCNB2 ENSG00000157456 G2/M
CKAP2L ENSG00000169607 G2/M
CKAP2 ENSG00000136108 G2/M
AURKB ENSG00000178999 G2/M
BUB1 ENSG00000169679 G2/M
KIF11 ENSG00000138160 G2/M
ANP32E ENSG00000143401 G2/M
TUBB4B ENSG00000188229 G2/M
GTSE1 ENSG00000075218 G2/M
KIF20B ENSG00000138182 G2/M
HJURP ENSG00000123485 G2/M
CDCA3 ENSG00000111665 G2/M
HN1 ENSG00000189159 G2/M
CDC20 ENSG00000117399 G2/M
TTK ENSG00000112742 G2/M
CDC25C ENSG00000158402 G2/M
KIF2C ENSG00000142945 G2/M
RANGAP1 ENSG00000100401 G2/M
NCAPD2 ENSG00000010292 G2/M
DLGAP5 ENSG00000126787 G2/M
CDCA2 ENSG00000184661 G2/M
CDCA8 ENSG00000134690 G2/M
ECT2 ENSG00000114346 G2/M
KIF23 ENSG00000137807 G2/M
HMMR ENSG00000072571 G2/M
AURKA ENSG00000087586 G2/M
PSRC1 ENSG00000134222 G2/M
ANLN ENSG00000011426 G2/M
LBR ENSG00000143815 G2/M
CKAP5 ENSG00000175216 G2/M
CENPE ENSG00000138778 G2/M
CTCF ENSG00000102974 G2/M
NEK2 ENSG00000117650 G2/M
G2E3 ENSG00000092140 G2/M
GAS2L3 ENSG00000139354 G2/M
CBX5 ENSG00000094916 G2/M
CENPA ENSG00000115163 G2/M
Loading

0 comments on commit 203b94f

Please sign in to comment.