diff --git a/docs/source/api.rst b/docs/source/api.rst index 4da63fd1..cb2e1892 100644 --- a/docs/source/api.rst +++ b/docs/source/api.rst @@ -33,6 +33,7 @@ Please refer to the `Single Cell Best Practices Book`_ for more details. hvg_intersect hvg_batch score_cell_cycle + get_cell_cycle_genes reduce_data diff --git a/scib/preprocessing.py b/scib/preprocessing.py index 1fd8496e..209f3df2 100644 --- a/scib/preprocessing.py +++ b/scib/preprocessing.py @@ -1,7 +1,9 @@ import logging import re import tempfile +from typing import Literal +import anndata as ad import numpy as np import pandas as pd import scanpy as sc @@ -642,8 +644,24 @@ def reduce_data( sc.tl.umap(adata) -# Cell Cycle -def score_cell_cycle(adata, organism="mouse"): +def score_cell_cycle( + adata: ad.AnnData, + organism: Literal[ + "mouse", + "mus musculus", + "mus_musculus", + "human", + "homo sapiens", + "homo_sapiens", + "c_elegans", + "c elegans", + "caenorhabditis elegans", + "caenorhabditis_elegans", + "zebrafish", + "danio rerio", + "danio_rerio", + ] = "mouse", +): """Score cell cycle score given an organism Wrapper function for `scanpy.tl.score_genes_cell_cycle`_ @@ -653,43 +671,111 @@ def score_cell_cycle(adata, organism="mouse"): Tirosh et al. cell cycle marker genes downloaded from https://raw.githubusercontent.com/theislab/scanpy_usage/master/180209_cell_cycle/data/regev_lab_cell_cycle_genes.txt - For human, mouse genes are capitalised and used directly. This is under the assumption that cell cycle genes are - well conserved across species. + See more on gene sets in :func:`~scib.preprocessing.get_cell_cycle_genes`. + + This function picks gene IDs or gene names of the cell cycle genes, depending on what is present in the adata object. 
:param adata: anndata object containing :param organism: organism of gene names to match cell cycle genes :return: tuple of ``(s_genes, g2m_genes)`` of S-phase genes and G2- and M-phase genes scores """ - import pathlib - - root = pathlib.Path(__file__).parent - - cc_files = { - "mouse": [ - root / "resources/s_genes_tirosh.txt", - root / "resources/g2m_genes_tirosh.txt", - ], - "human": [ - root / "resources/s_genes_tirosh_hm.txt", - root / "resources/g2m_genes_tirosh_hm.txt", - ], - } - with open(cc_files[organism][0]) as f: - s_genes = [x.strip() for x in f.readlines() if x.strip() in adata.var.index] - with open(cc_files[organism][1]) as f: - g2m_genes = [x.strip() for x in f.readlines() if x.strip() in adata.var.index] + def filter_genes(adata: ad.AnnData, df: pd.DataFrame, columns: list = None): + if columns is None: + columns = ["gene_name", "gene_id"] + elif isinstance(columns, str): + columns = [columns] + + n_genes = 0 + for col in columns: + _genes = [g for g in df[col] if g in adata.var_names] + if len(_genes) > n_genes: # pick largest overlapping set + n_genes = len(_genes) + genes = _genes + + if n_genes == 0: + # pick random genes for error message + rand_genes = np.random.choice(adata.var_names, 10) + raise ValueError( + f"cell cycle genes not in adata\n organism: {organism}\n varnames: {rand_genes}\n cell cycle genes:\n {df}" + ) + return genes - if (len(s_genes) == 0) or (len(g2m_genes) == 0): - rand_choice = np.random.randint(1, adata.n_vars, 10) - rand_genes = adata.var_names[rand_choice].tolist() - raise ValueError( - f"cell cycle genes not in adata\n organism: {organism}\n varnames: {rand_genes}" - ) + # get gene sets + gene_map = get_cell_cycle_genes(organism) + # filter gene sets across data + s_genes = filter_genes(adata, gene_map.query("phase == 'S'")) + g2m_genes = filter_genes(adata, gene_map.query("phase == 'G2/M'")) + + # compute scores sc.tl.score_genes_cell_cycle(adata, s_genes=s_genes, g2m_genes=g2m_genes) +def get_cell_cycle_genes( 
+ organism: Literal[ + "mouse", + "mus musculus", + "mus_musculus", + "human", + "homo sapiens", + "homo_sapiens", + "c_elegans", + "c elegans", + "caenorhabditis elegans", + "caenorhabditis_elegans", + "zebrafish", + "danio rerio", + "danio_rerio", + ] +): + """ + Get cell cycle genes for a given organism + + Tirosh et al. cell cycle marker genes downloaded from + https://raw.githubusercontent.com/theislab/scanpy_usage/master/180209_cell_cycle/data/regev_lab_cell_cycle_genes.txt + + For human, mouse genes are capitalised and used directly. This is under the assumption that cell cycle genes are + well conserved across species. + + For organisms other than human or mouse, orthology-mapped datasets from Tinyatlas were used: + https://github.com/hbc/tinyatlas/tree/master/cell_cycle + + :param organism: organism of gene names to match cell cycle genes + :return: DataFrame of cell cycle genes with at least the columns ``gene_name``, ``gene_id`` and ``phase`` + """ + from pathlib import Path + + organism_map = { + "mouse": "mus_musculus", + "mus musculus": "mus_musculus", + "human": "homo_sapiens", + "homo sapiens": "homo_sapiens", + "c_elegans": "caenorhabditis_elegans", + "caenorhabditis elegans": "caenorhabditis_elegans", + "c elegans": "caenorhabditis_elegans", + "zebrafish": "danio_rerio", + "danio rerio": "danio_rerio", + } + # additionally map each canonical name (the values above) to itself so it is accepted as well + organism_map |= {x: x for x in organism_map.values()} + + # get lower-case organism name + organism = organism.lower() + + assert ( + organism in organism_map + ), f"organism '{organism}' not supported. 
Supported organisms: {list(organism_map.keys())}" + + # get organism name needed for retrieving correct file + organism = organism_map[organism] + + # read gene sets + gene_file = Path(__file__).parent / "resources" / f"cell_cycle_genes_{organism}.tsv" + assert gene_file.exists(), f"{gene_file} doesn't exist" + return pd.read_table(gene_file) + + def save_seurat(adata, path, batch, hvgs=None): """Save an ``anndata`` object to file as a Seurat object diff --git a/scib/resources/cell_cycle_genes_caenorhabditis_elegans.tsv b/scib/resources/cell_cycle_genes_caenorhabditis_elegans.tsv new file mode 100644 index 00000000..a8f78d1f --- /dev/null +++ b/scib/resources/cell_cycle_genes_caenorhabditis_elegans.tsv @@ -0,0 +1,18 @@ +phase modified gene_id gene_name +G2/M 2024-07-04 WBGene00006974 zen-4 +G2/M 2024-07-04 WBGene00000257 bmk-1 +G2/M 2024-07-04 WBGene00000405 cdk-1 +G2/M 2024-07-04 WBGene00000099 air-2 +S 2024-07-04 WBGene00011912 T22C1.1 +S 2024-07-04 WBGene00004338 rfc-2 +S 2024-07-04 WBGene00004297 rad-51 +S 2024-07-04 WBGene00003154 mcm-2 +S 2024-07-04 WBGene00013241 ung-1 +S 2024-07-04 WBGene00009372 evl-18 +S 2024-07-04 WBGene00000382 cdc-6 +S 2024-07-04 WBGene00003418 msh-2 +S 2024-07-04 WBGene00003156 mcm-4 +S 2024-07-04 WBGene00009287 psf-2 +S 2024-07-04 WBGene00022141 chaf-2 +S 2024-07-04 WBGene00000794 crn-1 +S 2024-07-04 WBGene00022455 tyms-1 diff --git a/scib/resources/cell_cycle_genes_danio_rerio.tsv b/scib/resources/cell_cycle_genes_danio_rerio.tsv new file mode 100644 index 00000000..92486f05 --- /dev/null +++ b/scib/resources/cell_cycle_genes_danio_rerio.tsv @@ -0,0 +1,47 @@ +phase modified gene_id gene_name +G2/M 2018-10-19 ENSDARG00000078654 tpx2 +G2/M 2018-10-19 ENSDARG00000075621 birc5a +G2/M 2018-10-19 ENSDARG00000001313 g2e3 +G2/M 2018-10-19 ENSDARG00000061187 cbx5 +G2/M 2018-10-19 ENSDARG00000056621 ctcf +G2/M 2018-10-19 ENSDARG00000041361 ttk +G2/M 2018-10-19 ENSDARG00000038882 smc4 +G2/M 2018-10-19 ENSDARG00000005619 nek2 +G2/M 2018-10-19 
ENSDARG00000055133 cenpf +G2/M 2018-10-19 ENSDARG00000117089 CKS2 +G2/M 2018-10-19 ENSDARG00000024488 top2a +G2/M 2018-10-19 ENSDARG00000043137 cdca8 +G2/M 2018-10-19 ENSDARG00000002403 nusap1 +G2/M 2018-10-19 ENSDARG00000010948 kif11 +G2/M 2018-10-19 ENSDARG00000054804 anp32e +G2/M 2018-10-19 ENSDARG00000014013 lbr +G2/M 2018-10-19 ENSDARG00000036180 ccnb2 +G2/M 2018-10-19 ENSDARG00000029722 hmgb2a +G2/M 2018-10-19 ENSDARG00000087554 cdk1 +G2/M 2018-10-19 ENSDARG00000007971 cks1b +G2/M 2018-10-19 ENSDARG00000102674 ckap5 +S 2018-10-19 ENSDARG00000057683 mcm6 +S 2018-10-19 ENSDARG00000043720 cdc45 +S 2018-10-19 ENSDARG00000018022 msh2 +S 2018-10-19 ENSDARG00000019507 mcm5 +S 2018-10-19 ENSDARG00000045308 pola1 +S 2018-10-19 ENSDARG00000040041 mcm4 +S 2018-10-19 ENSDARG00000035957 gmnn +S 2018-10-19 ENSDARG00000037188 rpa2 +S 2018-10-19 ENSDARG00000057738 hells +S 2018-10-19 ENSDARG00000057323 e2f8 +S 2018-10-19 ENSDARG00000002304 gins2 +S 2018-10-19 ENSDARG00000054155 pcna +S 2018-10-19 ENSDARG00000039208 nasp +S 2018-10-19 ENSDARG00000074410 brip1 +S 2018-10-19 ENSDARG00000019907 dscc1 +S 2018-10-19 ENSDARG00000023002 dtl +S 2018-10-19 ENSDARG00000077620 cdca7a +S 2018-10-19 ENSDARG00000056473 chaf1b +S 2018-10-19 ENSDARG00000056414 usp1 +S 2018-10-19 ENSDARG00000100558 slbp +S 2018-10-19 ENSDARG00000014017 rrm1 +S 2018-10-19 ENSDARG00000011404 fen1 +S 2018-10-19 ENSDARG00000056832 exo1 +S 2018-10-19 ENSDARG00000042894 tyms +S 2018-10-19 ENSDARG00000103409 uhrf1 diff --git a/scib/resources/cell_cycle_genes_homo_sapiens.tsv b/scib/resources/cell_cycle_genes_homo_sapiens.tsv new file mode 100644 index 00000000..ae8c97a6 --- /dev/null +++ b/scib/resources/cell_cycle_genes_homo_sapiens.tsv @@ -0,0 +1,98 @@ +gene_name gene_id phase +MCM5 ENSG00000100297 S +PCNA ENSG00000132646 S +TYMS ENSG00000176890 S +FEN1 ENSG00000168496 S +MCM2 ENSG00000073111 S +MCM4 ENSG00000104738 S +RRM1 ENSG00000167325 S +UNG ENSG00000076248 S +GINS2 ENSG00000131153 S +MCM6 ENSG00000076003 S 
+CDCA7 ENSG00000144354 S +DTL ENSG00000143476 S +PRIM1 ENSG00000198056 S +UHRF1 ENSG00000276043 S +MLF1IP ENSG00000151725 S +HELLS ENSG00000119969 S +RFC2 ENSG00000049541 S +RPA2 ENSG00000117748 S +NASP ENSG00000132780 S +RAD51AP1 ENSG00000111247 S +GMNN ENSG00000112312 S +WDR76 ENSG00000092470 S +SLBP ENSG00000163950 S +CCNE2 ENSG00000175305 S +UBR7 ENSG00000012963 S +POLD3 ENSG00000077514 S +MSH2 ENSG00000095002 S +ATAD2 ENSG00000156802 S +RAD51 ENSG00000051180 S +RRM2 ENSG00000171848 S +CDC45 ENSG00000093009 S +CDC6 ENSG00000094804 S +EXO1 ENSG00000174371 S +TIPIN ENSG00000075131 S +DSCC1 ENSG00000136982 S +BLM ENSG00000197299 S +CASP8AP2 ENSG00000118412 S +USP1 ENSG00000162607 S +CLSPN ENSG00000092853 S +POLA1 ENSG00000101868 S +CHAF1B ENSG00000159259 S +BRIP1 ENSG00000136492 S +E2F8 ENSG00000129173 S +HMGB2 ENSG00000164104 G2/M +CDK1 ENSG00000170312 G2/M +NUSAP1 ENSG00000137804 G2/M +UBE2C ENSG00000175063 G2/M +BIRC5 ENSG00000089685 G2/M +TPX2 ENSG00000088325 G2/M +TOP2A ENSG00000131747 G2/M +NDC80 ENSG00000080986 G2/M +CKS2 ENSG00000123975 G2/M +NUF2 ENSG00000143228 G2/M +CKS1B ENSG00000173207 G2/M +MKI67 ENSG00000148773 G2/M +TMPO ENSG00000120802 G2/M +CENPF ENSG00000117724 G2/M +TACC3 ENSG00000013810 G2/M +FAM64A ENSG00000129195 G2/M +SMC4 ENSG00000113810 G2/M +CCNB2 ENSG00000157456 G2/M +CKAP2L ENSG00000169607 G2/M +CKAP2 ENSG00000136108 G2/M +AURKB ENSG00000178999 G2/M +BUB1 ENSG00000169679 G2/M +KIF11 ENSG00000138160 G2/M +ANP32E ENSG00000143401 G2/M +TUBB4B ENSG00000188229 G2/M +GTSE1 ENSG00000075218 G2/M +KIF20B ENSG00000138182 G2/M +HJURP ENSG00000123485 G2/M +CDCA3 ENSG00000111665 G2/M +HN1 ENSG00000189159 G2/M +CDC20 ENSG00000117399 G2/M +TTK ENSG00000112742 G2/M +CDC25C ENSG00000158402 G2/M +KIF2C ENSG00000142945 G2/M +RANGAP1 ENSG00000100401 G2/M +NCAPD2 ENSG00000010292 G2/M +DLGAP5 ENSG00000126787 G2/M +CDCA2 ENSG00000184661 G2/M +CDCA8 ENSG00000134690 G2/M +ECT2 ENSG00000114346 G2/M +KIF23 ENSG00000137807 G2/M +HMMR ENSG00000072571 G2/M +AURKA 
ENSG00000087586 G2/M +PSRC1 ENSG00000134222 G2/M +ANLN ENSG00000011426 G2/M +LBR ENSG00000143815 G2/M +CKAP5 ENSG00000175216 G2/M +CENPE ENSG00000138778 G2/M +CTCF ENSG00000102974 G2/M +NEK2 ENSG00000117650 G2/M +G2E3 ENSG00000092140 G2/M +GAS2L3 ENSG00000139354 G2/M +CBX5 ENSG00000094916 G2/M +CENPA ENSG00000115163 G2/M diff --git a/scib/resources/cell_cycle_genes_mus_musculus.tsv b/scib/resources/cell_cycle_genes_mus_musculus.tsv new file mode 100644 index 00000000..3938d978 --- /dev/null +++ b/scib/resources/cell_cycle_genes_mus_musculus.tsv @@ -0,0 +1,98 @@ +gene_name gene_id phase +Mcm5 ENSMUSG00000005410 S +Pcna ENSMUSG00000027342 S +Tyms ENSMUSG00000025747 S +Fen1 ENSMUSG00000024742 S +Mcm2 ENSMUSG00000002870 S +Mcm4 ENSMUSG00000022673 S +Rrm1 ENSMUSG00000030978 S +Ung ENSMUSG00000029591 S +Gins2 ENSMUSG00000031821 S +Mcm6 ENSMUSG00000026355 S +Cdca7 ENSMUSG00000055612 S +Dtl ENSMUSG00000037474 S +Prim1 ENSMUSG00000025395 S +Uhrf1 ENSMUSG00000001228 S +Mlf1ip ENSMUSG00000031629 S +Hells ENSMUSG00000025001 S +Rfc2 ENSMUSG00000023104 S +Rpa2 ENSMUSG00000028884 S +Nasp ENSMUSG00000028693 S +Rad51ap1 ENSMUSG00000030346 S +Gmnn ENSMUSG00000006715 S +Wdr76 ENSMUSG00000027242 S +Slbp ENSMUSG00000004642 S +Ccne2 ENSMUSG00000028212 S +Ubr7 ENSMUSG00000041712 S +Pold3 ENSMUSG00000030726 S +Msh2 ENSMUSG00000024151 S +Atad2 ENSMUSG00000022360 S +Rad51 ENSMUSG00000027323 S +Rrm2 ENSMUSG00000020649 S +Cdc45 ENSMUSG00000000028 S +Cdc6 ENSMUSG00000017499 S +Exo1 ENSMUSG00000039748 S +Tipin ENSMUSG00000032397 S +Dscc1 ENSMUSG00000022422 S +Blm ENSMUSG00000030528 S +Casp8ap2 ENSMUSG00000028282 S +Usp1 ENSMUSG00000028560 S +Clspn ENSMUSG00000042489 S +Pola1 ENSMUSG00000006678 S +Chaf1b ENSMUSG00000022945 S +Brip1 ENSMUSG00000034329 S +E2f8 ENSMUSG00000046179 S +Hmgb2 ENSMUSG00000054717 G2/M +Cdk1 ENSMUSG00000019942 G2/M +Nusap1 ENSMUSG00000027306 G2/M +Ube2c ENSMUSG00000001403 G2/M +Birc5 ENSMUSG00000017716 G2/M +Tpx2 ENSMUSG00000027469 G2/M +Top2a ENSMUSG00000020914 G2/M 
+Ndc80 ENSMUSG00000024056 G2/M +Cks2 ENSMUSG00000062248 G2/M +Nuf2 ENSMUSG00000026683 G2/M +Cks1b ENSMUSG00000028044 G2/M +Mki67 ENSMUSG00000031004 G2/M +Tmpo ENSMUSG00000019961 G2/M +Cenpf ENSMUSG00000026605 G2/M +Tacc3 ENSMUSG00000037313 G2/M +Fam64a ENSMUSG00000020808 G2/M +Smc4 ENSMUSG00000034349 G2/M +Ccnb2 ENSMUSG00000032218 G2/M +Ckap2l ENSMUSG00000048327 G2/M +Ckap2 ENSMUSG00000037725 G2/M +Aurkb ENSMUSG00000020897 G2/M +Bub1 ENSMUSG00000027379 G2/M +Kif11 ENSMUSG00000012443 G2/M +Anp32e ENSMUSG00000015749 G2/M +Tubb4b ENSMUSG00000036752 G2/M +Gtse1 ENSMUSG00000022385 G2/M +Kif20b ENSMUSG00000024795 G2/M +Hjurp ENSMUSG00000044783 G2/M +Cdca3 ENSMUSG00000023505 G2/M +Hn1 ENSMUSG00000020737 G2/M +Cdc20 ENSMUSG00000006398 G2/M +Ttk ENSMUSG00000038379 G2/M +Cdc25c ENSMUSG00000044201 G2/M +Kif2c ENSMUSG00000028678 G2/M +Rangap1 ENSMUSG00000022391 G2/M +Ncapd2 ENSMUSG00000038252 G2/M +Dlgap5 ENSMUSG00000037544 G2/M +Cdca2 ENSMUSG00000048922 G2/M +Cdca8 ENSMUSG00000028873 G2/M +Ect2 ENSMUSG00000027699 G2/M +Kif23 ENSMUSG00000032254 G2/M +Hmmr ENSMUSG00000020330 G2/M +Aurka ENSMUSG00000027496 G2/M +Psrc1 ENSMUSG00000068744 G2/M +Anln ENSMUSG00000036777 G2/M +Lbr ENSMUSG00000004880 G2/M +Ckap5 ENSMUSG00000040549 G2/M +Cenpe ENSMUSG00000045328 G2/M +Ctcf ENSMUSG00000005698 G2/M +Nek2 ENSMUSG00000026622 G2/M +G2e3 ENSMUSG00000035293 G2/M +Gas2l3 ENSMUSG00000074802 G2/M +Cbx5 ENSMUSG00000009575 G2/M +Cenpa ENSMUSG00000029177 G2/M diff --git a/scib/resources/convert_genes.py b/scib/resources/convert_genes.py new file mode 100644 index 00000000..670a2568 --- /dev/null +++ b/scib/resources/convert_genes.py @@ -0,0 +1,112 @@ +import requests +from tqdm import tqdm + + +def get_gene_name_from_ensembl(gene_ids: list, species: str): + base_url = "https://rest.ensembl.org" + gene_names = [] + + for gene_id in tqdm(gene_ids): + response = requests.get( + f"{base_url}/lookup/id/{gene_id}?expand=1;species={species}", + headers={"Content-Type": "application/json"}, + ) + + if 
response.status_code == 200: + data = response.json() + gene_name = data.get("display_name", gene_id) + gene_names.append(gene_name) + else: + print(f"Error: {response.status_code}, skipping gene {gene_id}...") + gene_names.append(gene_id) + + return gene_names + + +def get_gene_id_from_ensembl(gene_names: list, species: str): + base_url = "https://rest.ensembl.org" + gene_ids = [] + + for gene_name in tqdm(gene_names): + response = requests.get( + f"{base_url}/xrefs/symbol/{species}/{gene_name}?expand=1", + headers={"Content-Type": "application/json"}, + ) + + if response.status_code == 200: + data = response.json() + if data: + gene_id = data[0].get( + "id", gene_name + ) # Get the first result's Ensembl ID + gene_ids.append(gene_id) + else: + gene_ids.append("Not Found") + else: + print(f"Error: {response.status_code}, skipping gene {gene_name}...") + gene_ids.append(gene_name) + + return gene_ids + + +if __name__ == "__main__": + from pathlib import Path + + import pandas as pd + + root = Path(__file__).parent + + cc_files = { + "mus_musculus": "https://raw.githubusercontent.com/theislab/scanpy_usage/master/180209_cell_cycle/data/regev_lab_cell_cycle_genes.txt", + "homo_sapiens": "https://raw.githubusercontent.com/theislab/scanpy_usage/master/180209_cell_cycle/data/regev_lab_cell_cycle_genes.txt", + "caenorhabditis_elegans": "https://raw.githubusercontent.com/hbc/tinyatlas/refs/heads/master/cell_cycle/Caenorhabditis_elegans.csv", + "danio_rerio": "https://raw.githubusercontent.com/hbc/tinyatlas/refs/heads/master/cell_cycle/Danio_rerio.csv", + } + + # Tirosh mouse and human + # processed according to https://github.com/scverse/scanpy_usage/blob/master/180209_cell_cycle/cell_cycle.ipynb + for organism in ["mus_musculus", "homo_sapiens"]: + print(f"Organism: {organism}") + + # read file + gene_names = pd.read_csv(cc_files[organism], header=None)[0] + + if organism == "mus_musculus": + gene_names = gene_names.str.capitalize() + + # convert gene names + gene_ids = 
get_gene_id_from_ensembl(gene_names, species=organism) + + # create gene map + gene_map = pd.DataFrame(dict(gene_name=gene_names, gene_id=gene_ids)) + + # set cell cycle phase + gene_map.loc[:43, "phase"] = "S" + gene_map.loc[43:, "phase"] = "G2/M" + + # write to file + print(gene_map) + gene_map.to_csv( + root / f"cell_cycle_genes_{organism}.tsv", sep="\t", index=False + ) + + # Tinyatlas gene sets + # https://github.com/hbc/tinyatlas/tree/master/cell_cycle + for organism in ["caenorhabditis_elegans", "danio_rerio"]: + print(f"Organism: {organism}") + + # read file + gene_map = pd.read_csv(cc_files[organism]) + gene_map["gene_id"] = gene_map["geneID"] + del gene_map["geneID"] + + # get gene names + gene_map["gene_name"] = get_gene_name_from_ensembl( + gene_map["gene_id"], species=organism + ) + + # write to file + print(gene_map) + gene_map.to_csv( + root / f"cell_cycle_genes_{organism}.tsv", sep="\t", index=False + ) diff --git a/setup.cfg b/setup.cfg index 23e359b4..d33184f4 100644 --- a/setup.cfg +++ b/setup.cfg @@ -66,7 +66,7 @@ zip_safe = False [options.package_data] scib = - resources/*.txt + resources/* knn_graph/* [options.extras_require] diff --git a/tests/conftest.py b/tests/conftest.py index 972cf45a..87faf04d 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -97,3 +97,29 @@ def adata_clustered(adata_neighbors): adata_obj, label_key="celltype", cluster_key="cluster", verbose=True ) yield adata_obj + + +DATASETS = { + "c_elegans": "https://github.com/Munfred/wormcells-data/releases/download/cao2017/cao2017.h5ad", + "zebrafish": "https://figshare.com/ndownloader/files/27265280", + # from https://cellrank.readthedocs.io/en/stable/_modules/cellrank/datasets.html +} + + +@pytest.fixture() +def adata_from_url(request): + dataset_name = request.param + url = DATASETS[dataset_name] + + adata = sc.read(f"{dataset_name}.h5ad", backup_url=url) + assert adata is not None + adata.uns["dataset_name"] = dataset_name + + if "gene_id" in adata.var.columns: + 
adata.var_names = adata.var["gene_id"] + + if dataset_name == "zebrafish": + adata.var_names = adata.var_names.str.lower() + adata = adata[:, ~adata.var_names.duplicated()].copy() + + yield adata diff --git a/tests/preprocessing/test_gene_scoring.py b/tests/preprocessing/test_gene_scoring.py new file mode 100644 index 00000000..e3b26135 --- /dev/null +++ b/tests/preprocessing/test_gene_scoring.py @@ -0,0 +1,44 @@ +import pytest +import scanpy as sc + +import scib + + +def test_mouse(adata_paul15): + + assert "S_score" not in adata_paul15.obs.columns + assert "G2M_score" not in adata_paul15.obs.columns + assert "phase" not in adata_paul15.obs.columns + + scib.pp.score_cell_cycle( + adata_paul15, + organism="mouse", + ) + assert "S_score" in adata_paul15.obs.columns + assert "G2M_score" in adata_paul15.obs.columns + assert "phase" in adata_paul15.obs.columns + + scib.pp.score_cell_cycle( + adata_paul15, + organism="mus musculus", + ) + + +def test_human(adata_paul15): + scib.pp.score_cell_cycle( + sc.datasets.pbmc68k_reduced(), + organism="human", + ) + with pytest.raises(ValueError): + scib.pp.score_cell_cycle( + adata_paul15, + organism="human", + ) + + +@pytest.mark.parametrize("adata_from_url", ["c_elegans", "zebrafish"], indirect=True) +def test_organism(adata_from_url): + scib.pp.score_cell_cycle( + adata_from_url, + organism=adata_from_url.uns["dataset_name"], + )