Extend cell cycle gene scoring (#421)

* refactor cell cycle gene handling * add new gene sets for C. elegans and zebrafish and re-parse Tirosh genes from beginning * add test datasets for C. elegans and zebrafish * separate function for cell cycle gene set retrieval * smart use of gene ID or gene name depending on data * add gene scoring test for zebrafish and c_elegans * simplify random gene selection for error reporting * use Literal for organism * add list of possible organism names to error message * use scanpy backup_url functionality * add assertions for gene score columns
theislab · Jan 13, 2025 · 203b94f · 203b94f
1 parent e614d35
commit 203b94f
Show file tree

Hide file tree

Showing 10 changed files with 559 additions and 29 deletions.
diff --git a/docs/source/api.rst b/docs/source/api.rst
@@ -33,6 +33,7 @@ Please refer to the `Single Cell Best Practices Book`_ for more details.
     hvg_intersect
     hvg_batch
     score_cell_cycle
+    get_cell_cycle_genes
     reduce_data
 
 

diff --git a/scib/preprocessing.py b/scib/preprocessing.py
@@ -1,7 +1,9 @@
 import logging
 import re
 import tempfile
+from typing import Literal
 
+import anndata as ad
 import numpy as np
 import pandas as pd
 import scanpy as sc
@@ -642,8 +644,24 @@ def reduce_data(
         sc.tl.umap(adata)
 
 
-# Cell Cycle
-def score_cell_cycle(adata, organism="mouse"):
+def score_cell_cycle(
+    adata: ad.AnnData,
+    organism: Literal[
+        "mouse",
+        "mus musculus",
+        "mus_musculus",
+        "human",
+        "homo sapiens",
+        "homo_sapiens",
+        "c_elegans",
+        "c elegans",
+        "caenorhabditis elegans",
+        "caenorhabditis_elegans",
+        "zebrafish",
+        "danio rerio",
+        "danio_rerio",
+    ] = "mouse",
+):
     """Score cell cycle score given an organism
 
     Wrapper function for `scanpy.tl.score_genes_cell_cycle`_
@@ -653,43 +671,111 @@ def score_cell_cycle(adata, organism="mouse"):
     Tirosh et al. cell cycle marker genes downloaded from
     https://raw.githubusercontent.com/theislab/scanpy_usage/master/180209_cell_cycle/data/regev_lab_cell_cycle_genes.txt
 
-    For human, mouse genes are capitalised and used directly. This is under the assumption that cell cycle genes are
-    well conserved across species.
+    See more on gene sets in :func:`~scib.preprocessing.get_cell_cycle_genes`.
+
+    This function picks gene IDs or gene names of the cell cycle genes, depending on what is present in the adata object.
 
     :param adata: anndata object containing
     :param organism: organism of gene names to match cell cycle genes
     :return: tuple of ``(s_genes, g2m_genes)`` of S-phase genes and G2- and M-phase genes scores
     """
-    import pathlib
-
-    root = pathlib.Path(__file__).parent
-
-    cc_files = {
-        "mouse": [
-            root / "resources/s_genes_tirosh.txt",
-            root / "resources/g2m_genes_tirosh.txt",
-        ],
-        "human": [
-            root / "resources/s_genes_tirosh_hm.txt",
-            root / "resources/g2m_genes_tirosh_hm.txt",
-        ],
-    }
 
-    with open(cc_files[organism][0]) as f:
-        s_genes = [x.strip() for x in f.readlines() if x.strip() in adata.var.index]
-    with open(cc_files[organism][1]) as f:
-        g2m_genes = [x.strip() for x in f.readlines() if x.strip() in adata.var.index]
+    def filter_genes(adata: ad.AnnData, df: pd.DataFrame, columns: list = None):
+        if columns is None:
+            columns = ["gene_name", "gene_id"]
+        elif isinstance(columns, str):
+            columns = [columns]
+
+        n_genes = 0
+        for col in columns:
+            _genes = [g for g in df[col] if g in adata.var_names]
+            if len(_genes) > n_genes:  # pick largest overlapping set
+                n_genes = len(_genes)
+                genes = _genes
+
+        if n_genes == 0:
+            # pick random genes for error message
+            rand_genes = np.random.choice(adata.var_names, 10)
+            raise ValueError(
+                f"cell cycle genes not in adata\n organism: {organism}\n varnames: {rand_genes}\n cell cycle genes:\n {df}"
+            )
+        return genes
 
-    if (len(s_genes) == 0) or (len(g2m_genes) == 0):
-        rand_choice = np.random.randint(1, adata.n_vars, 10)
-        rand_genes = adata.var_names[rand_choice].tolist()
-        raise ValueError(
-            f"cell cycle genes not in adata\n organism: {organism}\n varnames: {rand_genes}"
-        )
+    # get gene sets
+    gene_map = get_cell_cycle_genes(organism)
 
+    # filter gene sets across data
+    s_genes = filter_genes(adata, gene_map.query("phase == 'S'"))
+    g2m_genes = filter_genes(adata, gene_map.query("phase == 'G2/M'"))
+
+    # compute scores
     sc.tl.score_genes_cell_cycle(adata, s_genes=s_genes, g2m_genes=g2m_genes)
 
 
+def get_cell_cycle_genes(
+    organism: Literal[
+        "mouse",
+        "mus musculus",
+        "mus_musculus",
+        "human",
+        "homo sapiens",
+        "homo_sapiens",
+        "c_elegans",
+        "c elegans",
+        "caenorhabditis elegans",
+        "caenorhabditis_elegans",
+        "zebrafish",
+        "danio rerio",
+        "danio_rerio",
+    ]
+):
+    """
+    Get cell cycle genes for a given organism
+
+    Tirosh et al. cell cycle marker genes downloaded from
+    https://raw.githubusercontent.com/theislab/scanpy_usage/master/180209_cell_cycle/data/regev_lab_cell_cycle_genes.txt
+
+    For human, mouse genes are capitalised and used directly. This is under the assumption that cell cycle genes are
+    well conserved across species
+
+    For organisms other than human or mouse, orthlogy-mapped datasets from Tinyaltas were used:
+    https://github.com/hbc/tinyatlas/tree/master/cell_cycle
+
+    :param organism: organism of gene names to match cell cycle genes
+    :param identifier: gene identifier to use. options: "gene_name", "gene_id"
+    """
+    from pathlib import Path
+
+    organism_map = {
+        "mouse": "mus_musculus",
+        "mus musculus": "mus_musculus",
+        "human": "homo_sapiens",
+        "homo sapiens": "homo_sapiens",
+        "c_elegans": "caenorhabditis_elegans",
+        "caenorhabditis elegans": "caenorhabditis_elegans",
+        "c elegans": "caenorhabditis_elegans",
+        "zebrafish": "danio_rerio",
+        "danio rerio": "danio_rerio",
+    }
+    # additionally map each key to itself to make them available as well
+    organism_map |= {x: x for x in organism_map.values()}
+
+    # get lower-case organism name
+    organism = organism.lower()
+
+    assert (
+        organism in organism_map
+    ), f"organism '{organism}' not supported. Supported organisms: {list(organism_map.keys())}"
+
+    # get organism name needed for retrieving correct file
+    organism = organism_map[organism]
+
+    # read gene sets
+    gene_file = Path(__file__).parent / "resources" / f"cell_cycle_genes_{organism}.tsv"
+    assert gene_file.exists(), f"{gene_file} doesn't exist"
+    return pd.read_table(gene_file)
+
+
 def save_seurat(adata, path, batch, hvgs=None):
     """Save an ``anndata`` object to file as a Seurat object
 

diff --git a/scib/resources/cell_cycle_genes_caenorhabditis_elegans.tsv b/scib/resources/cell_cycle_genes_caenorhabditis_elegans.tsv
@@ -0,0 +1,18 @@
+phase	modified	gene_id	gene_name
+G2/M	2024-07-04	WBGene00006974	zen-4
+G2/M	2024-07-04	WBGene00000257	bmk-1
+G2/M	2024-07-04	WBGene00000405	cdk-1
+G2/M	2024-07-04	WBGene00000099	air-2
+S	2024-07-04	WBGene00011912	T22C1.1
+S	2024-07-04	WBGene00004338	rfc-2
+S	2024-07-04	WBGene00004297	rad-51
+S	2024-07-04	WBGene00003154	mcm-2
+S	2024-07-04	WBGene00013241	ung-1
+S	2024-07-04	WBGene00009372	evl-18
+S	2024-07-04	WBGene00000382	cdc-6
+S	2024-07-04	WBGene00003418	msh-2
+S	2024-07-04	WBGene00003156	mcm-4
+S	2024-07-04	WBGene00009287	psf-2
+S	2024-07-04	WBGene00022141	chaf-2
+S	2024-07-04	WBGene00000794	crn-1
+S	2024-07-04	WBGene00022455	tyms-1
diff --git a/scib/resources/cell_cycle_genes_danio_rerio.tsv b/scib/resources/cell_cycle_genes_danio_rerio.tsv
@@ -0,0 +1,47 @@
+phase	modified	gene_id	gene_name
+G2/M	2018-10-19	ENSDARG00000078654	tpx2
+G2/M	2018-10-19	ENSDARG00000075621	birc5a
+G2/M	2018-10-19	ENSDARG00000001313	g2e3
+G2/M	2018-10-19	ENSDARG00000061187	cbx5
+G2/M	2018-10-19	ENSDARG00000056621	ctcf
+G2/M	2018-10-19	ENSDARG00000041361	ttk
+G2/M	2018-10-19	ENSDARG00000038882	smc4
+G2/M	2018-10-19	ENSDARG00000005619	nek2
+G2/M	2018-10-19	ENSDARG00000055133	cenpf
+G2/M	2018-10-19	ENSDARG00000117089	CKS2
+G2/M	2018-10-19	ENSDARG00000024488	top2a
+G2/M	2018-10-19	ENSDARG00000043137	cdca8
+G2/M	2018-10-19	ENSDARG00000002403	nusap1
+G2/M	2018-10-19	ENSDARG00000010948	kif11
+G2/M	2018-10-19	ENSDARG00000054804	anp32e
+G2/M	2018-10-19	ENSDARG00000014013	lbr
+G2/M	2018-10-19	ENSDARG00000036180	ccnb2
+G2/M	2018-10-19	ENSDARG00000029722	hmgb2a
+G2/M	2018-10-19	ENSDARG00000087554	cdk1
+G2/M	2018-10-19	ENSDARG00000007971	cks1b
+G2/M	2018-10-19	ENSDARG00000102674	ckap5
+S	2018-10-19	ENSDARG00000057683	mcm6
+S	2018-10-19	ENSDARG00000043720	cdc45
+S	2018-10-19	ENSDARG00000018022	msh2
+S	2018-10-19	ENSDARG00000019507	mcm5
+S	2018-10-19	ENSDARG00000045308	pola1
+S	2018-10-19	ENSDARG00000040041	mcm4
+S	2018-10-19	ENSDARG00000035957	gmnn
+S	2018-10-19	ENSDARG00000037188	rpa2
+S	2018-10-19	ENSDARG00000057738	hells
+S	2018-10-19	ENSDARG00000057323	e2f8
+S	2018-10-19	ENSDARG00000002304	gins2
+S	2018-10-19	ENSDARG00000054155	pcna
+S	2018-10-19	ENSDARG00000039208	nasp
+S	2018-10-19	ENSDARG00000074410	brip1
+S	2018-10-19	ENSDARG00000019907	dscc1
+S	2018-10-19	ENSDARG00000023002	dtl
+S	2018-10-19	ENSDARG00000077620	cdca7a
+S	2018-10-19	ENSDARG00000056473	chaf1b
+S	2018-10-19	ENSDARG00000056414	usp1
+S	2018-10-19	ENSDARG00000100558	slbp
+S	2018-10-19	ENSDARG00000014017	rrm1
+S	2018-10-19	ENSDARG00000011404	fen1
+S	2018-10-19	ENSDARG00000056832	exo1
+S	2018-10-19	ENSDARG00000042894	tyms
+S	2018-10-19	ENSDARG00000103409	uhrf1
diff --git a/scib/resources/cell_cycle_genes_homo_sapiens.tsv b/scib/resources/cell_cycle_genes_homo_sapiens.tsv
@@ -0,0 +1,98 @@
+gene_name	gene_id	phase
+MCM5	ENSG00000100297	S
+PCNA	ENSG00000132646	S
+TYMS	ENSG00000176890	S
+FEN1	ENSG00000168496	S
+MCM2	ENSG00000073111	S
+MCM4	ENSG00000104738	S
+RRM1	ENSG00000167325	S
+UNG	ENSG00000076248	S
+GINS2	ENSG00000131153	S
+MCM6	ENSG00000076003	S
+CDCA7	ENSG00000144354	S
+DTL	ENSG00000143476	S
+PRIM1	ENSG00000198056	S
+UHRF1	ENSG00000276043	S
+MLF1IP	ENSG00000151725	S
+HELLS	ENSG00000119969	S
+RFC2	ENSG00000049541	S
+RPA2	ENSG00000117748	S
+NASP	ENSG00000132780	S
+RAD51AP1	ENSG00000111247	S
+GMNN	ENSG00000112312	S
+WDR76	ENSG00000092470	S
+SLBP	ENSG00000163950	S
+CCNE2	ENSG00000175305	S
+UBR7	ENSG00000012963	S
+POLD3	ENSG00000077514	S
+MSH2	ENSG00000095002	S
+ATAD2	ENSG00000156802	S
+RAD51	ENSG00000051180	S
+RRM2	ENSG00000171848	S
+CDC45	ENSG00000093009	S
+CDC6	ENSG00000094804	S
+EXO1	ENSG00000174371	S
+TIPIN	ENSG00000075131	S
+DSCC1	ENSG00000136982	S
+BLM	ENSG00000197299	S
+CASP8AP2	ENSG00000118412	S
+USP1	ENSG00000162607	S
+CLSPN	ENSG00000092853	S
+POLA1	ENSG00000101868	S
+CHAF1B	ENSG00000159259	S
+BRIP1	ENSG00000136492	S
+E2F8	ENSG00000129173	S
+HMGB2	ENSG00000164104	G2/M
+CDK1	ENSG00000170312	G2/M
+NUSAP1	ENSG00000137804	G2/M
+UBE2C	ENSG00000175063	G2/M
+BIRC5	ENSG00000089685	G2/M
+TPX2	ENSG00000088325	G2/M
+TOP2A	ENSG00000131747	G2/M
+NDC80	ENSG00000080986	G2/M
+CKS2	ENSG00000123975	G2/M
+NUF2	ENSG00000143228	G2/M
+CKS1B	ENSG00000173207	G2/M
+MKI67	ENSG00000148773	G2/M
+TMPO	ENSG00000120802	G2/M
+CENPF	ENSG00000117724	G2/M
+TACC3	ENSG00000013810	G2/M
+FAM64A	ENSG00000129195	G2/M
+SMC4	ENSG00000113810	G2/M
+CCNB2	ENSG00000157456	G2/M
+CKAP2L	ENSG00000169607	G2/M
+CKAP2	ENSG00000136108	G2/M
+AURKB	ENSG00000178999	G2/M
+BUB1	ENSG00000169679	G2/M
+KIF11	ENSG00000138160	G2/M
+ANP32E	ENSG00000143401	G2/M
+TUBB4B	ENSG00000188229	G2/M
+GTSE1	ENSG00000075218	G2/M
+KIF20B	ENSG00000138182	G2/M
+HJURP	ENSG00000123485	G2/M
+CDCA3	ENSG00000111665	G2/M
+HN1	ENSG00000189159	G2/M
+CDC20	ENSG00000117399	G2/M
+TTK	ENSG00000112742	G2/M
+CDC25C	ENSG00000158402	G2/M
+KIF2C	ENSG00000142945	G2/M
+RANGAP1	ENSG00000100401	G2/M
+NCAPD2	ENSG00000010292	G2/M
+DLGAP5	ENSG00000126787	G2/M
+CDCA2	ENSG00000184661	G2/M
+CDCA8	ENSG00000134690	G2/M
+ECT2	ENSG00000114346	G2/M
+KIF23	ENSG00000137807	G2/M
+HMMR	ENSG00000072571	G2/M
+AURKA	ENSG00000087586	G2/M
+PSRC1	ENSG00000134222	G2/M
+ANLN	ENSG00000011426	G2/M
+LBR	ENSG00000143815	G2/M
+CKAP5	ENSG00000175216	G2/M
+CENPE	ENSG00000138778	G2/M
+CTCF	ENSG00000102974	G2/M
+NEK2	ENSG00000117650	G2/M
+G2E3	ENSG00000092140	G2/M
+GAS2L3	ENSG00000139354	G2/M
+CBX5	ENSG00000094916	G2/M
+CENPA	ENSG00000115163	G2/M