Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -38,9 +38,11 @@ Crucially, **`DonorData`** ensures that genetic data and single-cell modalities
- **[Variant Preprocessing & Annotation](https://cellink-docs.readthedocs.io/en/latest/tutorials/explore_annotations.html):** Tools for quality control, annotation (VCF export/import), and selection of genetic variants.
- **Specialized Downstream Analysis:** Easily perform complex genetic analyses on single-cell expression data, including:
- [eQTL mapping](https://cellink-docs.readthedocs.io/en/latest/tutorials/pseudobulk_eqtl.html).
<!-- * Colocalization analysis with established disease loci. -->
- [Rare variant association studies](https://cellink-docs.readthedocs.io/en/latest/tutorials/burden_testing.html).
- **Interoperability:** **cellink** enhances standard workflows through data exports compatible with common genetic analysis tools, e.g., for [eQTL analysis with jaxqtl or tensorqtl](https://cellink-docs.readthedocs.io/en/latest/tutorials/pseudobulk_eqtl_jaxqtl_tensorqtl.html) and includes built-in [dataloaders for deep learning](https://cellink-docs.readthedocs.io/en/latest/tutorials/run_dataloader.html).
- [Clumping & pruning](https://cellink-docs.readthedocs.io/en/latest/tutorials/clumping_pruning.html).
- [Colocalization analysis](https://cellink-docs.readthedocs.io/en/latest/tutorials/colocalization.html).
- **Interoperability:** **cellink** enhances standard workflows through data exports compatible with common genetic analysis tools, e.g., for [eQTL analysis with jaxqtl or tensorqtl](https://cellink-docs.readthedocs.io/en/latest/tutorials/pseudobulk_eqtl_jaxqtl_tensorqtl.html), and includes built-in [dataloaders for deep learning](https://cellink-docs.readthedocs.io/en/latest/tutorials/run_dataloader.html).

## 🚀 Getting Started

Expand Down
66 changes: 29 additions & 37 deletions docs/tutorials/burden_testing.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -57,18 +57,7 @@
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/opt/modules/i12g/anaconda/envs/eh-scgenetics_new2/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n",
"/opt/modules/i12g/anaconda/envs/eh-scgenetics_new2/lib/python3.11/site-packages/limix_core/__init__.py:12: UserWarning: pkg_resources is deprecated as an API. See https://setuptools.pypa.io/en/latest/pkg_resources.html. The pkg_resources package is slated for removal as early as 2025-11-30. Refrain from using this package or pin to Setuptools<81.\n",
" from pkg_resources import DistributionNotFound as _DistributionNotFound\n"
]
}
],
"outputs": [],
"source": [
"from pathlib import Path\n",
"import warnings\n",
Expand All @@ -80,11 +69,11 @@
"from tqdm.auto import tqdm\n",
"\n",
"import cellink as cl\n",
"from cellink._core import DAnn, GAnn\n",
"from cellink._core import DAnn\n",
"from cellink.tl._rvat import run_burden_test, run_skat_test, beta_weighting\n",
"from cellink.utils import column_normalize, gaussianize\n",
"from cellink.at.acat import acat_test\n",
"from cellink.resources import get_onek1k"
"from cellink.resources import get_dummy_onek1k"
]
},
{
Expand Down Expand Up @@ -121,7 +110,7 @@
"source": [
"## Load and Prepare Data\n",
"\n",
"Here, we load a prepared dataset (`onek1k`) that includes genotype and expression information from human donors. We also extract gene annotations using Ensembl via `pybiomart`, which are essential for defining cis-windows during eQTL analysis."
"Here, we load a prepared dataset (`onek1k`) that includes genotype and expression information from human donors. This is a subset of the full OneK1K dataset; the full dataset can be downloaded and prepared using `get_onek1k()`. The subset already ships with gene annotations (`start`, `end`, `chrom`), which are essential for defining cis-windows during eQTL analysis; for other datasets, they can be extracted from Ensembl via `pybiomart`."
]
},
{
Expand All @@ -130,28 +119,23 @@
"metadata": {},
"outputs": [
{
"name": "stderr",
"name": "stdout",
"output_type": "stream",
"text": [
"INFO:root:/data/ouga/home/ag_gagneur/hoev/cellink_sample_data/onek1k/onek1k_cellxgene.h5ad already exists\n",
"WARNING:root:No checksum provided, skipping verification\n",
"INFO:root:/data/ouga/home/ag_gagneur/hoev/cellink_sample_data/onek1k/OneK1K.noGP.vcf.gz already exists\n",
"WARNING:root:No checksum provided, skipping verification\n",
"INFO:root:/data/ouga/home/ag_gagneur/hoev/cellink_sample_data/onek1k/OneK1K.noGP.vcf.gz.csi already exists\n",
"WARNING:root:No checksum provided, skipping verification\n",
"INFO:root:/data/ouga/home/ag_gagneur/hoev/cellink_sample_data/onek1k/gene_counts_Ensembl_105_phenotype_metadata.tsv.gz already exists\n",
"WARNING:root:No checksum provided, skipping verification\n"
"[2025-12-29 01:34:14,420] INFO:root: /Users/larnoldt/cellink_data/dummy_onek1k/dummy_onek1k.dd.h5 already exists\n",
"[2025-12-29 01:34:14,420] WARNING:root: No checksum provided, skipping verification\n",
"[2025-12-29 01:34:15,611] INFO:root: Loaded dummy OneK1K dataset: (100, 146939, 125366, 34073)\n"
]
},
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">╔═<span style=\"color: #ff005f; text-decoration-color: #ff005f; font-weight: bold\"> DonorData(n_donors=981, n_cells_per_donor=[333-3,511], donor_id='donor_id') </span>═══════════════════════════════╗\n",
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">╔═<span style=\"color: #ff005f; text-decoration-color: #ff005f; font-weight: bold\"> DonorData(n_donors=100, n_cells_per_donor=[613-2,731], donor_id='donor_id') </span>═══════════════════════════════╗\n",
"║ ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ ║\n",
"║ ┃<span style=\"color: #ff005f; text-decoration-color: #ff005f; font-weight: bold\"> G (donors) </span>┃<span style=\"color: #ff005f; text-decoration-color: #ff005f; font-weight: bold\"> C (cells) </span>┃ ║\n",
"║ ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩ ║\n",
"║ │ AnnData object with n_obs × n_vars = 981 × 136,776 │ AnnData object with n_obs × n_vars = 1,248,980 × │ ║\n",
"║ │ │ 36,469 │ ║\n",
"║ │ AnnData object with n_obs × n_vars = 100 × 146,939View of AnnData object with n_obs × n_vars = │ ║\n",
"║ │ │ 125,366 × 34,073 │ ║\n",
"║ │ var: 'chrom', 'pos', 'a0', 'a1', 'AC', │ obs: 'orig.ident', 'nCount_RNA', │ ║\n",
"║ │ 'AC_Hemi', 'AC_Het', 'AC_Hom', 'AF', 'AN', 'ER2', │ 'nFeature_RNA', 'percent.mt', <span style=\"color: #ff005f; text-decoration-color: #ff005f; font-weight: bold\">'donor_id', </span> │ ║\n",
"║ │ 'ExcHet', 'HWE', 'IMPUTED', 'maf', 'NS', 'R2', │ 'pool_number', 'predicted.celltype.l2', │ ║\n",
Expand All @@ -173,7 +157,8 @@
"║ │ │ 'vst.variance.standardized', 'vst.variable', │ ║\n",
"║ │ │ 'feature_is_filtered', 'feature_name', │ ║\n",
"║ │ │ 'feature_reference', 'feature_biotype', │ ║\n",
"║ │ │ 'feature_length', 'feature_type' │ ║\n",
"║ │ │ 'feature_length', 'feature_type', 'start', 'end', │ ║\n",
"║ │ │ 'chrom' │ ║\n",
"║ │ obsm: 'gPCs' │ uns: 'cell_type_ontology_term_id_colors', │ ║\n",
"║ │ │ 'citation', 'default_embedding', │ ║\n",
"║ │ │ 'schema_reference', 'schema_version', 'title' │ ║\n",
Expand All @@ -185,12 +170,12 @@
"</pre>\n"
],
"text/plain": [
"╔═\u001b[1;38;5;197m DonorData(n_donors=981, n_cells_per_donor=[333-3,511], donor_id='donor_id') \u001b[0m═══════════════════════════════╗\n",
"╔═\u001b[1;38;5;197m DonorData(n_donors=100, n_cells_per_donor=[613-2,731], donor_id='donor_id') \u001b[0m═══════════════════════════════╗\n",
"║ ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ ║\n",
"║ ┃\u001b[1;38;5;197m \u001b[0m\u001b[1;38;5;197mG (donors) \u001b[0m\u001b[1;38;5;197m \u001b[0m┃\u001b[1;38;5;197m \u001b[0m\u001b[1;38;5;197mC (cells) \u001b[0m\u001b[1;38;5;197m \u001b[0m┃ ║\n",
"║ ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩ ║\n",
"║ │ AnnData object with n_obs × n_vars = 981 × 136,776 │ AnnData object with n_obs × n_vars = 1,248,980 × │ ║\n",
"║ │ │ 36,469 │ ║\n",
"║ │ AnnData object with n_obs × n_vars = 100 × 146,939View of AnnData object with n_obs × n_vars = │ ║\n",
"║ │ │ 125,366 × 34,073 │ ║\n",
"║ │ var: 'chrom', 'pos', 'a0', 'a1', 'AC', │ obs: 'orig.ident', 'nCount_RNA', │ ║\n",
"║ │ 'AC_Hemi', 'AC_Het', 'AC_Hom', 'AF', 'AN', 'ER2', │ 'nFeature_RNA', 'percent.mt', \u001b[1;38;5;197m'donor_id', \u001b[0m │ ║\n",
"║ │ 'ExcHet', 'HWE', 'IMPUTED', 'maf', 'NS', 'R2', │ 'pool_number', 'predicted.celltype.l2', │ ║\n",
Expand All @@ -212,7 +197,8 @@
"║ │ │ 'vst.variance.standardized', 'vst.variable', │ ║\n",
"║ │ │ 'feature_is_filtered', 'feature_name', │ ║\n",
"║ │ │ 'feature_reference', 'feature_biotype', │ ║\n",
"║ │ │ 'feature_length', 'feature_type' │ ║\n",
"║ │ │ 'feature_length', 'feature_type', 'start', 'end', │ ║\n",
"║ │ │ 'chrom' │ ║\n",
"║ │ obsm: 'gPCs' │ uns: 'cell_type_ontology_term_id_colors', │ ║\n",
"║ │ │ 'citation', 'default_embedding', │ ║\n",
"║ │ │ 'schema_reference', 'schema_version', 'title' │ ║\n",
Expand All @@ -236,7 +222,7 @@
}
],
"source": [
"dd = get_onek1k(config_path=\"../../src/cellink/resources/config/onek1k.yaml\", verify_checksum=False)\n",
"dd = get_dummy_onek1k(config_path=\"../../src/cellink/resources/config/dummy_onek1k.yaml\", verify_checksum=False)\n",
"dd"
]
},
Expand All @@ -246,6 +232,7 @@
"metadata": {},
"outputs": [],
"source": [
"\"\"\"\n",
"def _get_ensembl_gene_id_start_end_chr():\n",
" from pybiomart import Server\n",
"\n",
Expand All @@ -262,7 +249,8 @@
" \"Chromosome/scaffold name\": GAnn.chrom,\n",
" }\n",
" )\n",
" return ensembl_gene_id_start_end_chr"
" return ensembl_gene_id_start_end_chr\n",
"\"\"\""
]
},
{
Expand Down Expand Up @@ -729,8 +717,10 @@
}
],
"source": [
"\"\"\"\n",
"ensembl_gene_id_start_end_chr = _get_ensembl_gene_id_start_end_chr()\n",
"ensembl_gene_id_start_end_chr"
"ensembl_gene_id_start_end_chr\n",
"\"\"\""
]
},
{
Expand All @@ -739,8 +729,10 @@
"metadata": {},
"outputs": [],
"source": [
"\"\"\"\n",
"dd.C.var = dd.C.var.join(ensembl_gene_id_start_end_chr)\n",
"dd.C.obs[DAnn.donor] = dd.C.obs[original_donor_col]\n",
"\"\"\"\n",
"dd.G.obsm[\"gPCs\"] = dd.G.obsm[\"gPCs\"][dd.G.obsm[\"gPCs\"].columns[:n_gpcs]]"
]
},
Expand Down Expand Up @@ -11703,7 +11695,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "eh-scgenetics_new2",
"display_name": "single_cell_base3",
"language": "python",
"name": "python3"
},
Expand All @@ -11717,7 +11709,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.0"
"version": "3.12.8"
}
},
"nbformat": 4,
Expand Down
Loading
Loading