diff --git a/.github/workflows/basic_checks.yaml b/.github/workflows/basic_checks.yaml deleted file mode 100644 index fd95370..0000000 --- a/.github/workflows/basic_checks.yaml +++ /dev/null @@ -1,168 +0,0 @@ -name: Check, build, and push image - -on: [push, pull_request] - -env: - cache-version: v1 - -jobs: - r-build-and-check: - runs-on: ubuntu-latest - container: bioconductor/bioconductor_docker:devel - env: - R_REMOTES_NO_ERRORS_FROM_WARNINGS: true - GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} - steps: - - uses: actions/checkout@v2 - - - name: Query dependencies and update old packages - run: | - BiocManager::install(ask=FALSE) - saveRDS(remotes::dev_package_deps(dependencies = TRUE), ".github/depends.Rds", version = 2) - shell: Rscript {0} - - - name: Cache R packages - if: runner.os != 'Windows' - uses: actions/cache@v1 - with: - path: /usr/local/lib/R/site-library - key: ${{ env.cache-version }}-${{ runner.os }}-r-${{ hashFiles('.github/depends.Rds') }} - restore-keys: ${{ env.cache-version }}-${{ runner.os }}-r- - - # This lets us augment with additional dependencies - - uses: r-lib/actions/setup-r@v2 - with: - r-version: release - - - uses: r-lib/actions/setup-r-dependencies@v2 - with: - extra-packages: rcmdcheck remotes reticulate BiocManager - - - uses: actions/setup-python@v4 - with: - python-version: "3.x" - - - name: setup r-reticulate venv - shell: Rscript {0} - run: | - python_packages <- - c("numpy") - - library(reticulate) - sessionInfo() - - - uses: r-lib/actions/check-r-package@v2 - -# - name: Install system dependencies -# if: runner.os == 'Linux' -# env: -# RHUB_PLATFORM: linux-x86_64-ubuntu-gcc -# run: | -# python -h -# Rscript -e "library(reticulate);sessionInfo();R.home()" -# Rscript -e "remotes::install_github('r-hub/sysreqs')" -# sysreqs=$(Rscript -e "cat(sysreqs::sysreq_commands('DESCRIPTION'))") -# cat 'DESCRIPTION' -# echo $sysreqs -# sudo -s eval "$sysreqs" - -# - name: Install dependencies -# run: | -# BiocManager::repositories() -# remotes::install_deps(dependencies = TRUE, repos = BiocManager::repositories()) -# remotes::install_cran("rcmdcheck") -# shell: Rscript {0} - - - name: Check - env: - _R_CHECK_CRAN_INCOMING_REMOTE_: false - run: rcmdcheck::rcmdcheck(args = c("--no-manual"), error_on = "warning", check_dir = "check") - shell: Rscript {0} - - - - name: Build pkgdown - run: | - PATH=$PATH:$HOME/bin/ Rscript -e 'pkgdown::build_site(".")' - - # deploy needs rsync? Seems so. - - name: Install deploy dependencies - run: | - apt-get update - apt-get -y install rsync - - - name: Deploy 🚀 - uses: JamesIves/github-pages-deploy-action@releases/v4 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - BRANCH: gh-pages # The branch the action should deploy to. - FOLDER: docs # The folder the action should deploy. - docker-build-and-push: - #needs: r-build-and-check - runs-on: ubuntu-latest - permissions: - contents: read - packages: write - # This is used to complete the identity challenge - # with sigstore/fulcio when running outside of PRs. - id-token: write - - steps: - - name: Checkout repository - uses: actions/checkout@v2 - - - name: Set Environment Variables - run: | - REPO_LOWER="$(echo "${{ github.repository }}" | tr '[:upper:]' '[:lower:]')" - REGISTRY=ghcr.io - echo "BUILD_DATE=$(date +'%Y-%m-%d %H:%M:%S')" >> $GITHUB_ENV - echo "GIT_SHA=$(echo ${{ github.sha }} | cut -c1-7)" >> $GITHUB_ENV - echo "REGISTRY=${REGISTRY}" >> $GITHUB_ENV - echo "IMAGE=${REGISTRY}/${REPO_LOWER}" >> $GITHUB_ENV - - - name: Show environment - run: | - env - - - # Install the cosign tool except on PR - # https://github.com/sigstore/cosign-installer - - name: Install cosign - if: github.event_name != 'pull_request' - uses: sigstore/cosign-installer@1e95c1de343b5b0c23352d6417ee3e48d5bcd422 - with: - cosign-release: 'v1.4.0' - - - # Workaround: https://github.com/docker/build-push-action/issues/461 - - name: Setup Docker buildx - uses: docker/setup-buildx-action@79abd3f86f79a9d68a23c75a09a9a85889262adf - - # Login against a Docker registry except on PR - # https://github.com/docker/login-action - - name: Log into registry ${{ env.REGISTRY }} - if: github.event_name != 'pull_request' - uses: docker/login-action@28218f9b04b4f3f62068d7b6ce6ca5b26e35336c - with: - registry: ${{ env.REGISTRY }} - username: ${{ github.actor }} - password: ${{ secrets.GITHUB_TOKEN }} - - # Extract metadata (tags, labels) for Docker - # https://github.com/docker/metadata-action - - name: Extract Docker metadata - id: meta - uses: docker/metadata-action@98669ae865ea3cffbcbaa878cf57c20bbf1c6c38 - with: - images: ${{ env.IMAGE }} - - # Build and push Docker image with Buildx (don't push on PR) - # https://github.com/docker/build-push-action - - name: Build and push Docker image - id: build-and-push - uses: docker/build-push-action@ad44023a93711e3deb337508980b4b5e9bcdc5dc - with: - context: . - push: ${{ github.event_name != 'pull_request' }} - tags: | - ${{ env.IMAGE }}:latest - ${{ env.IMAGE }}:${{ env.GIT_SHA }} diff --git a/.github/workflows/rworkflows.yml b/.github/workflows/rworkflows.yml new file mode 100644 index 0000000..891b236 --- /dev/null +++ b/.github/workflows/rworkflows.yml @@ -0,0 +1,123 @@ +name: rworkflows +'on': + push: + branches: + - master + - main + - devel + - RELEASE_** + pull_request: + branches: + - master + - main + - devel + - RELEASE_** +jobs: + rworkflows: + permissions: write-all + runs-on: ${{ matrix.config.os }} + name: ${{ matrix.config.os }} (${{ matrix.config.r }}) + container: ${{ matrix.config.cont }} + strategy: + fail-fast: ${{ false }} + matrix: + config: + - os: ubuntu-latest + bioc: devel + r: auto + cont: ghcr.io/bioconductor/bioconductor_docker:devel + rspm: ~ + - os: macOS-latest + bioc: release + r: auto + cont: ~ + rspm: ~ + - os: windows-latest + bioc: release + r: auto + cont: ~ + rspm: ~ + steps: + - uses: neurogenomics/rworkflows@master + with: + run_bioccheck: ${{ false }} + run_rcmdcheck: ${{ true }} + as_cran: ${{ true }} + run_vignettes: ${{ true }} + has_testthat: ${{ true }} + run_covr: ${{ true }} + run_pkgdown: ${{ true }} + has_runit: ${{ false }} + has_latex: ${{ false }} + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run_docker: ${{ false }} + DOCKER_TOKEN: ${{ secrets.DOCKER_TOKEN }} + runner_os: ${{ runner.os }} + cache_version: cache-v1 + docker_registry: ghcr.io + + docker-build-and-push: + #needs: r-build-and-check + runs-on: ubuntu-latest + permissions: + contents: read + packages: write + # This is used to complete the identity challenge + # with sigstore/fulcio when running outside of PRs. + id-token: write + + steps: + - name: Checkout Repository + uses: actions/checkout@v4 + + - name: Set Environment Variables + run: | + REPO_LOWER="$(echo "${{ github.repository }}" | tr '[:upper:]' '[:lower:]')" + REGISTRY=ghcr.io + echo "BUILD_DATE=$(date +'%Y-%m-%d %H:%M:%S')" >> $GITHUB_ENV + echo "GIT_SHA=$(echo ${{ github.sha }} | cut -c1-7)" >> $GITHUB_ENV + echo "REGISTRY=${REGISTRY}" >> $GITHUB_ENV + echo "IMAGE=${REGISTRY}/${REPO_LOWER}" >> $GITHUB_ENV + + - name: Show environment + run: | + env + + # Install the cosign tool except on PR + # https://github.com/sigstore/cosign-installer + - name: Install cosign + if: github.event_name != 'pull_request' + uses: sigstore/cosign-installer@v3 + + - name: Setup Docker buildx + uses: docker/setup-buildx-action@v3 + + # Login against a Docker registry except on PR + # https://github.com/docker/login-action + - name: Log into registry ${{ env.REGISTRY }} + if: github.event_name != 'pull_request' + uses: docker/login-action@v3 + with: + registry: ${{ env.REGISTRY }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + # Extract metadata (tags, labels) for Docker + # https://github.com/docker/metadata-action + - name: Extract Docker metadata + id: meta + uses: docker/metadata-action@v5 + with: + images: ${{ env.IMAGE }} + + # Build and push Docker image with Buildx (don't push on PR) + # https://github.com/docker/build-push-action + - name: Build and push Docker image + id: build-and-push + uses: docker/build-push-action@v5 + with: + context: . + push: ${{ github.event_name != 'pull_request' }} + tags: | + ${{ env.IMAGE }}:latest + ${{ env.IMAGE }}:${{ env.GIT_SHA }} diff --git a/.gitignore b/.gitignore index 1558cc4..d2a4d04 100644 --- a/.gitignore +++ b/.gitignore @@ -7,6 +7,6 @@ TidyGenomicsTranscriptomicsWorkshop_bioc2023.Rproj .DS_Store /doc/ /Meta/ - -vignettes/tidyomicsWorkshopBioc2023_cache/ -vignettes/tidyomicsWorkshopBioc2023_files/ +vignettes/tidySpatialWorkshop_cache/ +vignettes/tidySpatialWorkshop_files/ +tidySpatialWorkshop.Rproj diff --git a/DESCRIPTION b/DESCRIPTION index 2732af4..2a3ba1c 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,55 +1,69 @@ -Package: tidyomicsWorkshopBioc2023 -Title: Tidy genomic and transcriptomic single-cell analyses -Version: 0.16.13 -Authors@R: - c( - person(given = "Stefano", - family = "Mangiola", - role = c("aut"), - email = "mangiola.s@wehi.edu.au", +Package: tidySpatialWorkshop +Title: Workshop Materials for Tidy Spatial Analysis +Version: 0.18.5 +Authors@R: c( + person("Stefano", "Mangiola", email = "mangiola.s@wehi.edu.au", role = c("aut", "cre"), comment = c(ORCID = "0000-0001-7474-836X")), - person(given = "Michael", - family = "Love", - role = c("aut", "cre"), - email = "michaelisaiahlove@gmail.com", - comment = c(ORCID = "0000-0001-8401-0545")) - ) -Description: Workflow for BioC conference showing tidy genomic - and transcriptomic analyses, for a single-cell RNA-seq - application + person("Luciano", "Martelotto", email = "luciano.martelotto@adelaide.edu.au", role = "aut") + ) +Description: Workshop materials for learning tidy analysis of spatial transcriptomics data. + Includes example datasets and vignettes demonstrating the use of tidyomics tools for + spatial data analysis, covering both sequencing-based and imaging-based spatial transcriptomics. License: MIT + file LICENSE Encoding: UTF-8 LazyData: true LazyDataCompression: xz Roxygen: list(markdown = TRUE) -RoxygenNote: 7.2.3 +RoxygenNote: 7.3.2 +biocViews: + Spatial, + Transcriptomics, + SingleCell, + Software, + Visualization Depends: - Biobase, - R (>= 4.2.0) + R (>= 4.3.0) Imports: - SingleCellExperiment, + SpatialExperiment, + tidySpatialExperiment, + tidySummarizedExperiment, + tidySingleCellExperiment, ggplot2, - plotly, dplyr, + tidyr, + stringr, + purrr, + glue, + scuttle, + scater, + scran, + methods +Suggests: + plotly, colorspace, dittoSeq, - tidySingleCellExperiment, - tidySummarizedExperiment, - scales, - batchelor, - scater, - purrr, - plyranges, - ensembldb, - EnsDb.Hsapiens.v86, + ggspavis, + RColorBrewer, + spatialLIBD, + ExperimentHub, + Banksy, + Seurat, + cowplot, + scRNAseq, + SPOTlight, + SubcellularSpatialData, forcats, - nullranges -Suggests: + tibble, + hoodscanR, + scico, + MoleculeExperiment, knitr, rmarkdown, - pkgdown -Biarch: true -URL: https://tidyomics.github.io/tidyomicsWorkshopBioc2023 -BugReports: https://github.com/tidyomics/tidyomicsWorkshopBioc2023/issues/new/choose + here, + igraph, + magick, + scatterpie, + ggcorrplot VignetteBuilder: knitr -DockerImage: ghcr.io/tidyomics/tidyomicsWorkshopBioc2023:latest +URL: https://github.com/tidyomics/tidySpatialWorkshop +BugReports: https://github.com/tidyomics/tidySpatialWorkshop/issues diff --git a/R/data_transcriptomics.R b/R/data_transcriptomics.R index 3f72b65..955ca25 100644 --- a/R/data_transcriptomics.R +++ b/R/data_transcriptomics.R @@ -82,3 +82,62 @@ #' @format GRanges object #' @usage data(pbmc_h3k4me3_hg38) "pbmc_h3k4me3_hg38" + +#' Pre-made gate for spatial transcriptomics data +#' +#' A gate object created using tidygate for the spatial transcriptomics workshop. +#' This gate was drawn on sample "151673" to select specific regions of interest +#' based on tissue morphology. The gate is used to demonstrate reproducible +#' spatial filtering in the workshop materials. +#' +#' @format A gate object compatible with tidygate and tidySpatialExperiment::gate() +#' +#' @source Created during the Tidy Spatial Workshop 2025 using interactive gating +#' on Visium spatial transcriptomics data from the spatialLIBD package +#' +#' @usage data(spatial_data_gated) +#' +"spatial_data_gated" + +#' Small region subset of Xenium spatial transcriptomics data +#' +#' A subset of Xenium spatial transcriptomics data containing molecules from a small +#' region (x between 3700-4200 and y between 5000-5500) for demonstration purposes. +#' This data is derived from the SubcellularSpatialData package's mouse brain dataset. +#' +#' @format A data frame containing single-molecule spatial transcriptomics data with the following columns: +#' \describe{ +#' \item{sample_id}{Sample identifier for the Xenium dataset} +#' \item{cell}{Cell identifier, NA for molecules not assigned to cells} +#' \item{gene}{Gene name} +#' \item{genetype}{Type of gene (e.g., "Gene")} +#' \item{x}{x-coordinate of the molecule} +#' \item{y}{y-coordinate of the molecule} +#' \item{counts}{Count value for the transcript (typically 1 for single-molecule data)} +#' \item{region}{Anatomical region annotation (e.g., "CA1")} +#' \item{technology}{Technology platform used ("Xenium")} +#' \item{level}{Hierarchical level of annotation (e.g., "Level9")} +#' \item{Level0}{Root level annotation ("root")} +#' \item{Level1}{First level annotation (e.g., "grey")} +#' \item{Level2}{Second level annotation (e.g., "CH")} +#' \item{Level3}{Third level annotation (e.g., "CTX")} +#' \item{Level4}{Fourth level annotation (e.g., "CTXpl")} +#' \item{Level5}{Fifth level annotation (e.g., "HPF")} +#' \item{Level6}{Sixth level annotation (e.g., "HIP")} +#' \item{Level7}{Seventh level annotation} +#' \item{Level8}{Eighth level annotation (e.g., "CA")} +#' \item{Level9}{Ninth level annotation (e.g., "CA1")} +#' \item{Level10}{Tenth level annotation} +#' \item{Level11}{Eleventh level annotation} +#' \item{transcript_id}{Unique identifier for the transcript} +#' \item{overlaps_nucleus}{Boolean (0/1) indicating if molecule overlaps with nucleus} +#' \item{z_location}{z-coordinate of the molecule} +#' \item{qv}{Quality value score} +#' } +#' +#' @source Data derived from SubcellularSpatialData package (EH8230 in ExperimentHub), +#' filtered to a small region for demonstration purposes. +#' Original data from Xenium V1 FF Mouse Brain MultiSection. +#' @usage data(tx_small_region) +#' +"tx_small_region" diff --git a/README.md b/README.md index f3f868e..b110807 100644 --- a/README.md +++ b/README.md @@ -1,125 +1,110 @@ -# tidyomicsWorkshopBioc2023 +# tidySpatialWorkshop [![DOI](https://zenodo.org/badge/379767139.svg)](https://zenodo.org/badge/latestdoi/379767139) -[![Check, build, and push image](https://github.com/tidyomics/tidyomicsWorkshopBioc2023/actions/workflows/basic_checks.yaml/badge.svg)](https://github.com/tidyomics/tidyomicsWorkshopBioc2023/actions/workflows/basic_checks.yaml) +[![Check, build, and push image](https://github.com/tidyomics/tidySpatialWorkshop/actions/workflows/basic_checks.yaml/badge.svg)](https://github.com/tidyomics/tidySpatialWorkshop/actions/workflows/basic_checks.yaml) -## Instructor names and contact information +## Instructor names and contact information -* Stefano Mangiola -* Michael Love +* Stefano Mangiola +* Luciano Martelotto ## Syllabus -Material [web page](https://tidyomics.github.io/tidyomicsWorkshopBioc2023/). +Material [web page](https://tidyomics.github.io/tidySpatialWorkshop/) More details on the workshop are below. -## Conference Galaxy platform +## Workshop partner: Physalia -You can find the workshop on the [BioC2023 Galaxy platform](https://workshop.bioconductor.org/) listed as: - -* **Package Demo: tidySingleCellExperiment** showing tidy genomic and transcriptomic analyses, for a single-cell RNA-seq application - -Click [here](https://bioc2023.bioconductor.org/workshops/) for a full list of the BioC2023 workshop demos. +
+Physalia + +
## Workshop package installation If you want to install the packages and material post-workshop, the -instructions are below. The workshop is designed for R `4.3` and -Bioconductor 3.17. +instructions are below. The workshop is designed for R `4.4` and +Bioconductor 3.19. ``` -#install.packages('remotes') - + # Install workshop package +#install.packages('BiocManager') +BiocManager::install("tidyomics/tidySpatialWorkshop", dependencies = TRUE) -remotes::install_github("tidyomics/tidyomicsWorkshopBioc2023", build_vignettes = TRUE) +# Then build the vignettes +BiocManager::install("tidyomics/tidySpatialWorkshop", build_vignettes = TRUE, force=TRUE) # To view vignette -library(tidyomicsWorkshopBioc2023) -vignette("tidyGenomicsTranscriptomics") +library(tidySpatialWorkshop) +vignette("Session_1_sequencing_assays") +``` + +## Interactive execution of the vignettes + +From command line, and enter the tidySpatialWorkshop directory. + ``` +# Open the command line +git clone git@github.com:tidyomics/tidySpatialWorkshop.git + +``` + +Alternatively download the [git zipped package](https://github.com/tidyomics/tidySpatialWorkshop/archive/refs/heads/devel.zip). Uncompress it. And enter the directory. -To run the code, you could then copy and paste the code from the -workshop vignette or -[R markdown file](https://raw.githubusercontent.com/tidyomics/tidyomicsWorkshopBioc2023/master/vignettes/tidyGenomicsTranscriptomics.Rmd) + +To run the code, you could then copy and paste the code from the workshop vignette or +[R markdown file](https://github.com/tidyomics/tidySpatialWorkshop/blob/devel/vignettes/Session_1_sequencing_assays.Rmd) into a new R Markdown file on your computer. ## Workshop Description -This tutorial will present how to perform analysis of single-cell RNA -sequencing data following the tidy data paradigm. The tidy data -paradigm provides a standard way to organise data values within a -dataset, where each variable is a column, each observation is a row, -and data is manipulated using an easy-to-understand vocabulary. Most -importantly, the data structure remains consistent across manipulation -and analysis functions. - -This can be achieved with the integration of packages present in the R -CRAN and Bioconductor ecosystem, including -[tidyseurat](https://stemangiola.github.io/tidyseurat/), -[tidySingleCellExperiment](https://stemangiola.github.io/tidySingleCellExperiment/) -and [tidyverse](https://www.tidyverse.org/). These packages are part -of the tidytranscriptomics suite that introduces a tidy approach to -RNA sequencing data representation and analysis. For more information -see the [tidy transcriptomics -blog](https://stemangiola.github.io/tidytranscriptomics/). - -In addition this workshop will finish with examples of how genomic and -transcriptomic data can be combined, e.g. ChIP-seq and scRNA-seq, also -using tidy data paradigms for genomic ranges. This is enabled with the -[plyranges](https://sa-lee.github.io/plyranges/) package, -with further information provided in the -[tidy-ranges-tutorial](https://tidyomics.github.io/tidy-ranges-tutorial/). +This workshop aims to equip participants with a foundational understanding of spatial omics, exploring its significant technologies, applications, and the distinction between imaging and sequencing approaches. We'll begin with a welcome session, outlining the objectives and structure for the day. The content will delve into the basics of spatial omics, discussing its relevance in modern biology and its impact on scientific research. We'll then compare various spatial omics technologies, focusing on the differences and practical considerations between imaging-based and sequencing-based methodologies. + +Further, we'll examine detailed sequencing techniques, experimental design, and data analysis challenges, providing insights into effective problem-solving strategies. An overview of analysis frameworks, including principles of 'tidy' data in spatial omics, will also be covered. The workshop will conclude with a summary of key takeaways and a Q&A session, ensuring participants leave with a comprehensive understanding of spatial omics. This session promises to be insightful, offering valuable knowledge for attendees to apply in their research fields. ### Pre-requisites -* Basic familiarity with single cell transcriptomic analyses +* Basic familiarity with R * Basic familiarity with tidyverse -* Basic familiarity with genomic ranges +* Basic familiarity with transcriptomic analyses ### Workshop Participation -The workshop format is a 45 minute session consisting of hands-on +The workshop format is 3 days with 3 hour sessions consisting of introduction of the experimental techniques, and hands-on demos, exercises and Q&A. ### _R_ / _Bioconductor_ packages used -* tidySingleCellExperiment -* plyranges +* SpatialExperiment +* MoleculeExperiment + +* SubcellularSpatialData +* ExperimentHub +* spatialLIBD +* CuratedAtlasQueryR + +* tidyverse +* tidySpatialExperiment +* tidySummarizedExperiment +* tidybulk + +* scater +* scran +* scuttle +* Seurat + +* SPOTlight +* Banksy +* hoodscanR + ### Workshop goals and objectives -The tidytranscriptomics approach to single-cell RNA sequencing data -analysis abstracts out the coding-related complexity and provides -tools that use an intuitive and jargon-free vocabulary, enabling focus -on the statistical and biological challenges. - -#### Learning goals - -* To approach data representation and analysis though a tidy data - paradigm, integrating tidyverse with the SingleCellExperiment data - object. -* To explore integration of genomic and transcriptomic data also using - a tidy data paradigm. - -#### What you will learn - -- Basic `tidy` operations possible with `tidySingleCellExperiment` - and `GRanges` -- How to interface `SingleCellExperiment` with tidy manipulation and - visualisation -- A real-world case study that will showcase the power of `tidy` - single-cell methods compared with base/ad-hoc methods -- Examples of how to integrate genomic and transcriptomic data - (ChIP-seq and RNA-seq) - -#### What you will *not* learn - -- The molecular technology of single-cell sequencing -- The fundamentals of single-cell data analysis -- The fundamentals of tidy data analysis -- Detailed data integration methods (multi-view or multi-omics - algorithms) +Provide a foundational understanding of spatial omics, covering different technologies and the distinctions between imaging and +sequencing in experimental and analytical contexts. This with a focus on the tidy R paradigm and tidyomics. + + diff --git a/_pkgdown.yml b/_pkgdown.yml index 558341f..5793c57 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -1,4 +1,4 @@ -url: https://tidyomics.github.io/tidyomicsWorkshopBioc2023 +url: https://tidyomics.github.io/tidySpatialWorkshop template: params: @@ -6,10 +6,10 @@ template: #ganalytics: UA-99999999-9 home: - title: "tidyomicsWorkshopBioc2023" + title: "tidySpatialWorkshop" type: inverse navbar: right: - icon: fa-github - href: https://github.com/tidyomics/tidyomicsWorkshopBioc2023 + href: https://github.com/tidyomics/tidySpatialWorkshop diff --git a/data/spatial_data_gated.rda b/data/spatial_data_gated.rda new file mode 100644 index 0000000..3ead9ca Binary files /dev/null and b/data/spatial_data_gated.rda differ diff --git a/data/tx_small_region.rda b/data/tx_small_region.rda new file mode 100644 index 0000000..d6ca142 Binary files /dev/null and b/data/tx_small_region.rda differ diff --git a/inst/images/curated_atlas_query.png b/inst/images/curated_atlas_query.png new file mode 100755 index 0000000..9cc873b Binary files /dev/null and b/inst/images/curated_atlas_query.png differ diff --git a/inst/images/physalia-min.png b/inst/images/physalia-min.png new file mode 100755 index 0000000..b60eb6d Binary files /dev/null and b/inst/images/physalia-min.png differ diff --git a/inst/images/spatialExperimentClass.png b/inst/images/spatialExperimentClass.png new file mode 100755 index 0000000..4e2c7ee Binary files /dev/null and b/inst/images/spatialExperimentClass.png differ diff --git a/inst/images/stomics.png b/inst/images/stomics.png new file mode 100644 index 0000000..b744865 Binary files /dev/null and b/inst/images/stomics.png differ diff --git a/inst/images/three_technologies.png b/inst/images/three_technologies.png new file mode 100644 index 0000000..2065c9e Binary files /dev/null and b/inst/images/three_technologies.png differ diff --git a/inst/images/tidySPE_gate.png b/inst/images/tidySPE_gate.png new file mode 100755 index 0000000..ccf7ba5 Binary files /dev/null and b/inst/images/tidySPE_gate.png differ diff --git a/inst/images/tidyomics.png b/inst/images/tidyomics.png new file mode 100755 index 0000000..3b38d69 Binary files /dev/null and b/inst/images/tidyomics.png differ diff --git a/inst/images/visium.png b/inst/images/visium.png new file mode 100644 index 0000000..f800eb1 Binary files /dev/null and b/inst/images/visium.png differ diff --git a/inst/images/visiumhd.png b/inst/images/visiumhd.png new file mode 100644 index 0000000..8eeb15f Binary files /dev/null and b/inst/images/visiumhd.png differ diff --git a/inst/vignettes/tidyomics.bib b/inst/vignettes/tidyomics.bib index 4d49972..78eb89a 100644 --- a/inst/vignettes/tidyomics.bib +++ b/inst/vignettes/tidyomics.bib @@ -87,7 +87,7 @@ @Article{ Lee2020 VOLUME = {9}, YEAR = {2020}, NUMBER = {109}, -DOI = {10.12688/f1000research.22259.1} +DOI = {10.12688/f1000research.220259.1} } @article{Love2020, diff --git a/man/spatial_data_gated.Rd b/man/spatial_data_gated.Rd new file mode 100644 index 0000000..a1d179d --- /dev/null +++ b/man/spatial_data_gated.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data_transcriptomics.R +\docType{data} +\name{spatial_data_gated} +\alias{spatial_data_gated} +\title{Pre-made gate for spatial transcriptomics data} +\format{ +A gate object compatible with tidygate and tidySpatialExperiment::gate() +} +\source{ +Created during the Tidy Spatial Workshop 2025 using interactive gating +on Visium spatial transcriptomics data from the spatialLIBD package +} +\usage{ +data(spatial_data_gated) +} +\description{ +A gate object created using tidygate for the spatial transcriptomics workshop. +This gate was drawn on sample "151673" to select specific regions of interest +based on tissue morphology. The gate is used to demonstrate reproducible +spatial filtering in the workshop materials. +} +\keyword{datasets} diff --git a/man/tx_small_region.Rd b/man/tx_small_region.Rd new file mode 100644 index 0000000..de3cef6 --- /dev/null +++ b/man/tx_small_region.Rd @@ -0,0 +1,51 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data_transcriptomics.R +\docType{data} +\name{tx_small_region} +\alias{tx_small_region} +\title{Small region subset of Xenium spatial transcriptomics data} +\format{ +A data frame containing single-molecule spatial transcriptomics data with the following columns: +\describe{ +\item{sample_id}{Sample identifier for the Xenium dataset} +\item{cell}{Cell identifier, NA for molecules not assigned to cells} +\item{gene}{Gene name} +\item{genetype}{Type of gene (e.g., "Gene")} +\item{x}{x-coordinate of the molecule} +\item{y}{y-coordinate of the molecule} +\item{counts}{Count value for the transcript (typically 1 for single-molecule data)} +\item{region}{Anatomical region annotation (e.g., "CA1")} +\item{technology}{Technology platform used ("Xenium")} +\item{level}{Hierarchical level of annotation (e.g., "Level9")} +\item{Level0}{Root level annotation ("root")} +\item{Level1}{First level annotation (e.g., "grey")} +\item{Level2}{Second level annotation (e.g., "CH")} +\item{Level3}{Third level annotation (e.g., "CTX")} +\item{Level4}{Fourth level annotation (e.g., "CTXpl")} +\item{Level5}{Fifth level annotation (e.g., "HPF")} +\item{Level6}{Sixth level annotation (e.g., "HIP")} +\item{Level7}{Seventh level annotation} +\item{Level8}{Eighth level annotation (e.g., "CA")} +\item{Level9}{Ninth level annotation (e.g., "CA1")} +\item{Level10}{Tenth level annotation} +\item{Level11}{Eleventh level annotation} +\item{transcript_id}{Unique identifier for the transcript} +\item{overlaps_nucleus}{Boolean (0/1) indicating if molecule overlaps with nucleus} +\item{z_location}{z-coordinate of the molecule} +\item{qv}{Quality value score} +} +} +\source{ +Data derived from SubcellularSpatialData package (EH8230 in ExperimentHub), +filtered to a small region for demonstration purposes. +Original data from Xenium V1 FF Mouse Brain MultiSection. +} +\usage{ +data(tx_small_region) +} +\description{ +A subset of Xenium spatial transcriptomics data containing molecules from a small +region (x between 3700-4200 and y between 5000-5500) for demonstration purposes. +This data is derived from the SubcellularSpatialData package's mouse brain dataset. +} +\keyword{datasets} diff --git a/vignettes/Introduction.Rmd b/vignettes/Introduction.Rmd new file mode 100644 index 0000000..7b956cb --- /dev/null +++ b/vignettes/Introduction.Rmd @@ -0,0 +1,165 @@ +--- +title: "Introduction to Spatial omic analyses" +author: + - Stefano Mangiola, South Australian immunoGENomics Cancer Institute^[], Walter and Eliza Hall Institute^[] + - Luciano Martellotto, Adelaide Centre for Epigenetics, South Australian immunoGENomics Cancer Institute^[] +output: rmarkdown::html_vignette +# bibliography: "`r file.path(system.file(package='tidySpatialWorkshop', 'bibliography'), 'bibliography.bib')`" +vignette: > + %\VignetteIndexEntry{Introduction to Spatial omic analyses} + %\VignetteEncoding{UTF-8} + %\VignetteEngine{knitr::rmarkdown} +editor_options: + markdown: + wrap: 72 +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = TRUE, cache = FALSE) +``` + +## Instructors + +**Dr. Stefano Mangiola** is leading the Computational Cancer immunology group at the South Australian immunoGENomics Cancer Institute (SAiGENCI). He uses single-cell and spatial technologies to investigate the tumor microenvironment and the immune system. Beyong data production, his focus in on the integration and modelling of large-scale single-cell data resources. He is the author of `tidytranscriptiomics` and co-leads the `tidyomics` endevour. + +- BLUESKY: https://bsky.app/profile/stemang.bsky.social + +- TWITTER/X: https://x.com/steman_research + + +**Dr. Luciano Martelotto** is a key figure in the field of spatial omics technology. He demonstrated his extensive expertise and significant contributions to the fields of single cell and spatial omics technology. Currently, he heads the Martelotto Lab located at the Adelaide Centre for Epigenetics and the South Australian immunoGENomics Cancer Institute (SAiGENCI). His lab is dedicated to the development and evaluation of new tools and methodologies for single cell and spatial omics. + +- TWITTER/X: https://x.com/LGMartelotto + +## Workshop partner: Physalia + +```{r, echo=FALSE, out.width="700px"} +library(here) +knitr::include_graphics(here("inst/images/physalia-min.png")) +``` + +## Workshop goals and objectives + +### What you will learn + +- The basics of spatial profiling technologies +- Analysis and manipulation of sequencing-based spatial data. +- The basics of tidy R analyses of biological data with `tidyomics` +- How to interface `SpatialExperiment` with tidy R manipulation and visualisation +- Analysis and manipulation of imaging-based spatial data. + +## Getting started + +### Local + +You can view the material at the workshop webpage + +[here](https://tidyomics.github.io/tidySpatialWorkshop/index.html). + +## Workshop package installation + +If you want to install the packages and material post-workshop, the +instructions are below. The workshop is designed for R `4.4` and +Bioconductor 3.19. + +```{r, eval=FALSE} + +# Install workshop package +#install.packages('BiocManager') +BiocManager::install("tidyomics/tidySpatialWorkshop", dependencies = TRUE) + +# Then build the vignettes +BiocManager::install("tidyomics/tidySpatialWorkshop", build_vignettes = TRUE, force=TRUE) + +# To view vignette +library(tidySpatialWorkshop) +vignette("Introduction") +``` + +## Interactive execution of the vignettes + +From command line, and enter the tidySpatialWorkshop directory. + +``` +# Open the command line +git clone git@github.com:tidyomics/tidySpatialWorkshop.git + +``` + +Alternatively download the [git zipped package](https://github.com/tidyomics/tidySpatialWorkshop/archive/refs/heads/devel.zip). Uncompress it. And enter the directory. + +# Announcements + +Tidyomics is now published in (Nature Methods)[https://www.nature.com/articles/s41592-024-02299-2]. And availabel for (free) here[https://www.biorxiv.org/content/10.1101/2023.09.10.557072v3]. + +# Introduction to Spatial Omics + +### Objective + +Provide a foundational understanding of spatial omics, covering different technologies and the distinctions between imaging and +sequencing in experimental and analytical contexts. + +### Workshop Structure + +#### Day 1 + +##### 1. Welcome and Introduction + +- Introduction of the instructor +- Introduction of the crowd +- Overview and goals of the workshop. + +##### 2. What is Spatial Omics? + +- Definition and significance in modern biology. +- Key applications and impact. +- Overview of different spatial omics technologies. +- Comparison of imaging-based vs sequencing-based approaches. + +##### 3. Sequencing Spatial Omics + +- Detailed comparison of methodologies. +- Experimental design considerations. +- Data analysis challenges and solutions. + +##### 5. Analysis of sequencing based spatial data + +- Getting Started with SpatialExperiment. +- Data Visualisation and Manipulation. +- Quality control and filtering. +- Dimensionality reduction. +- Spatial Clustering. +- Deconvolution of pixel-based spatial data. + +#### Day 2 + +##### 1. Introduction to tidyomics + +- Use tidyverse on spatial, single-cell, pseudobulk and bulk genomic data + +##### 2. Working with tidySpatialExperiment + +- tidySpatialExperiment package +- Tidyverse commands +- Advanced filtering/gating and pseudobulk +- Work with features +- Summarisation/aggregation +- tidyfying your workflow +- Visualisation + +#### Day 3 + +##### 1. Imaging Spatial Omics + +- Detailed comparison of methodologies. +- Experimental design considerations. +- Data analysis challenges and solutions. + +##### 2. Spatial analyses of imaging data + +- Working with imaging-based data in Bioconductor with MoleculeExperiment +- Aggregation and analysis +- Clustering +- Neighborhood analyses + + diff --git a/vignettes/Session_1_sequencing_assays.Rmd b/vignettes/Session_1_sequencing_assays.Rmd new file mode 100644 index 0000000..c8299aa --- /dev/null +++ b/vignettes/Session_1_sequencing_assays.Rmd @@ -0,0 +1,1124 @@ +--- +title: "Sequencing assays" +author: + - Stefano Mangiola, South Australian immunoGENomics Cancer Institute^[], Walter and Eliza Hall Institute^[] +output: rmarkdown::html_vignette +# bibliography: "`r file.path(system.file(package='tidySpatialWorkshop', 'vignettes'), 'tidyomics.bib')`" +vignette: > + %\VignetteIndexEntry{Sequencing assays} + %\VignetteEncoding{UTF-8} + %\VignetteEngine{knitr::rmarkdown} +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = TRUE, cache = FALSE) + +``` + +# Session 1: Spatial Analysis of Sequencing Data + +## Overview + +This workshop introduces spatial transcriptomics analysis using the Bioconductor framework, with a particular focus on the `SpatialExperiment` package. Participants will learn essential concepts and practical skills for analyzing spatially-resolved genomic data. + +### Learning Objectives + +By the end of this session, participants will be able to: +- Understand the fundamentals of spatial transcriptomics data analysis +- Work with the `SpatialExperiment` package and related tools +- Perform basic data manipulation and visualization of spatial data +- Apply quality control measures to spatial transcriptomics data +- Conduct dimensionality reduction and clustering analyses +- Interpret spatial patterns in gene expression data + +### Prerequisites + +- Basic knowledge of R programming +- Familiarity with genomic data concepts +- Understanding of basic statistical methods + +## Experimental technologies + +**Spatial-omics** encompasses a suite of powerful methods that reveal not only which genes are active in a tissue but also exactly where those genes are switched on. One widely used strategy involves laying a thin slice of tissue onto a specially prepared glass slide that carries an array of microscopic “spots,” each spot marked with its own unique molecular barcode. As the tissue is gently broken down, the messenger RNA molecules released from each cell adhere to the underlying spots and pick up that spot’s barcode. By sequencing the barcodes together with the captured RNA, researchers can reconstruct a two-dimensional map of gene expression. For example, the Visium platform from 10x Genomics uses this barcoded-surface approach to chart gene activity across tumour biopsies, helping oncologists to identify pockets of treatment-resistant cells within a cancerous mass. + +An alternative method, known as **combinatorial FISH** (fluorescence in situ hybridisation), skips the need for physical barcodes by using fluorescent probes that bind directly to RNA molecules within intact tissue. Each probe is tagged with a small coloured label, and by carrying out multiple rounds of staining, imaging and probe removal, a unique sequence of coloured dots is generated for each target gene. It’s akin to reading a barcode of coloured spots: once the entire sequence of images has been captured, computational decoding reveals which gene each pattern corresponds to and pinpoints its exact location. This technique underlies MERFISH (Multiplexed Error-Robust FISH), which neuroscientists often employ to map hundreds of genes simultaneously in brain sections, illuminating the molecular identities of different neuronal subtypes. + +**In-situ sequencing** offers yet another route to spatially resolved transcriptomics by performing the sequencing reactions directly within fixed tissue sections. Rather than relying on pre-made probes, this approach uses a series of enzymatic ligation or polymerisation steps to read out the RNA sequence base by base. At each cycle, fluorescently labelled reagents indicate which nucleotide (A, C, G or T) has been incorporated, and repeated imaging across multiple cycles yields short sequence reads in situ. Once these reads are matched to a reference genome, they reveal precisely where specific transcripts lie. Developmental biologists have harnessed this method—pioneered by technologies such as Fluorescent In Situ Sequencing (FISSEQ)—to follow gene expression patterns during embryo formation, tracking how cells differentiate according to their spatial context. + +```{r, echo=FALSE, out.width="700px"} +library(here) + +knitr::include_graphics(here("inst/images/three_technologies.png")) +``` + +The **Visium CytAssist** platform from 10x Genomics brings the power of spatial transcriptomics into a streamlined, sequencing-based workflow. At its heart lies a standard glass slide bearing an 11 mm by 11 mm capture area patterned with roughly 14 000 microscopic spots (or 5 000 spots on a smaller 6.5 mm by 6.5 mm format). Each spot is densely coated with millions of identical oligonucleotides, each bearing a unique spatial barcode, a unique molecular identifier (UMI) and a poly(dT) tail designed to bind the polyadenylated tails of mRNA. When a fresh‐frozen or FFPE tissue section is mounted onto this slide, RNA molecules released during permeabilisation will hybridise to these oligos, effectively “stamping” each transcript with its precise tissue coordinates. + +The CytAssist instrument automates the critical steps of permeabilisation, RNA digestion and probe release. Rather than capturing native transcripts directly, Visium employs probe hybridisation: a comprehensive set of probes tiles the entire transcriptome (v2 chemistry covers some 18 000 human or 19 000 mouse genes), binding selectively to their target RNAs. Once the tissue has been permeabilised, these probes are enzymatically released and immediately recaptured by the underlying barcoded array. A short extension reaction then attaches the probe insert to the spatial barcode and UMI, before a denaturation step frees the complete construct for library preparation. + +Sequencing libraries are configured so that Read 1 decodes the slide’s spatial barcode and the UMI, while Read 2 reads into the ligated probe insert, revealing the gene identity. To ensure robust detection of both abundant and rare messages, Visium recommends a minimum of 25 000 read‐pairs per covered spot. Optional immunofluorescence staining can be performed in parallel, providing morphological and protein‐level context alongside the transcriptomic data. + +In practice, Visium CytAssist has found widespread use across many fields. Cancer researchers have applied it to map immune cell infiltration and stromal niches within melanoma or breast carcinoma biopsies. Developmental biologists use it to chart gene expression gradients in embryonic tissues, revealing how cells acquire distinct identities in different locations. Even neuroscientists have begun to dissect the molecular architecture of brain regions, linking spatial patterns of gene activity with anatomy and function. By combining a turnkey instrument with a comprehensive probe set and high‐throughput sequencing, Visium offers an accessible route to the spatial “geography” of gene expression in virtually any tissue. + +```{r, echo=FALSE, out.width="700px"} +library(here) + +knitr::include_graphics(here("inst/images/visium.png")) +``` + +The **Visium HD** system represents a next-generation leap in spatial transcriptomics, offering subcellular resolution on a standard CytAssist instrument. Instead of discrete 55 µm spots, the Visium HD slide presents a continuous lawn of capture oligonucleotides across a 6.5 mm × 6.5 mm area, each oligo bearing a unique spatial barcode and UMI. These barcodes are patterned in a fine grid of 2 µm × 2 µm squares, which are digitally binned into 8 µm × 8 µm “pixels” for data analysis. In practice, this means that gene expression can be mapped at roughly one-cell or even subcellular scale—more than a six-fold improvement in resolution compared with the original Visium array. + +As with the standard Visium workflow, fresh-frozen or FFPE tissue sections are first stained (H&E or immunofluorescence, if desired) and imaged for morphological context. The CytAssist then automates permeabilisation, RNA digestion and probe‐release steps: a comprehensive probe set tiles the entire transcriptome, binding each target mRNA; released probes are recaptured by the underlying barcoded oligo lawn; and a short extension reaction fuses the probe insert to its spatial barcode and UMI. After denaturation frees these constructs, they undergo library preparation and high-throughput sequencing. Read 1 decodes the spatial barcode and UMI, while Read 2 reads into the probe insert to identify the gene. To cover the full 6.5 mm capture area at HD resolution, Visium HD recommends approximately 275 million read-pairs per run. + +```{r, echo=FALSE, out.width="700px"} +library(here) + +knitr::include_graphics(here("inst/images/visiumhd.png")) +``` + +**BGI’s STOmics** system brings spatial transcriptomics onto DNA nanoball (DNB) patterned chips that can cover areas from a few square millimetres right up to an entire microscope slide, offering both enormous scale and subcellular resolution. The process begins with the creation of a dense array of molecular “nanoballs,” each just 220 nm across and stamped onto the chip in a precise grid. During chip manufacture, each nanoball is endowed with three key elements: a poly-T tail for capturing polyadenylated mRNA, a unique molecular identifier (UMI) to count individual transcripts, and a coordinate identifier (CID) that records its exact X–Y position on the array. + +Once a freshly frozen or paraformaldehyde-fixed tissue section has been mounted and (optionally) stained for nuclei or protein markers, the chip is brought into contact with the specimen so that mRNA diffuses down into the nanoball layer and hybridises to the poly-T oligos. Reverse transcription then converts these captured RNAs into complementary DNA, preserving both their sequence information and spatial tag. Library construction and high-throughput sequencing follow much as in conventional RNA-seq, but every read now carries the CID and UMI, which bioinformatics pipelines use to reconstruct a high-density map of gene expression. + +The sheer density of the DNB pattern—over 25 000 spots per 100 µm² in the highest-resolution formats—means that STOmics can detect transcripts at nearly subcellular scale, revealing fine-grained differences in gene activity within single cells or across tiny tissue niches. At the same time, chip formats up to 174 cm² in area allow researchers to profile entire organs or large tissue biopsies in one run, without stitching together multiple fields of view. In practice, developmental biologists have used this platform to survey gene expression across whole zebrafish embryos, while tumour biologists have mapped the spatial organisation of immune infiltrates in large cancer resections. By marrying nanometre-scale resolution with slide-wide coverage, BGI’s STOmics empowers scientists to explore biological landscapes from the level of subcellular compartments all the way up to entire tissue architectures. + +```{r, echo=FALSE, out.width="700px"} +library(here) + +knitr::include_graphics(here("inst/images/stomics.png")) +``` + + +## Introduction to Bioconductor + +Bioconductor is an open-source, open-development software project built on the R programming language. It provides powerful tools for analyzing and comprehending high-throughput genomic data. + +### Key Features + +1. **Comprehensive Package Ecosystem** + - Over 2,000 software packages + - Specialized tools for various types of genomic data + - Integration capabilities across different data types + +2. **Quality Standards** + - Rigorous peer review process + - Consistent documentation + - Regular version updates + +3. **Community Support** + - Active developer community + - Extensive documentation + - Regular workshops and conferences + +### Spatial Omics Analysis in Bioconductor + +Spatial omics analysis combines traditional genomic data with spatial information, enabling researchers to understand gene expression patterns within their tissue context. Bioconductor provides several specialized tools for this purpose: + +- Dedicated spatial analysis packages +- Integration with existing genomic analysis workflows +- Visualization tools for spatial data +- Statistical methods for spatial patterns + +#### Support Resources + +- Slack channel: #community-bioc +- Support forum: https://support.bioconductor.org/ + +In the following sections of this workshop, we will explore practical examples and dive deeper into how Bioconductor is used in spatial omics analyses, including hands-on coding examples. Stay tuned for an engaging journey through spatial omics with Bioconductor! + +### 2. Getting Started with SpatialExperiment + +The `SpatialExperiment` package provides a robust framework for handling spatial transcriptomics data. It extends the `SingleCellExperiment` class by adding functionality specific to spatial data: + +- Storage of spatial coordinates +- Management of tissue images +- Integration of spot-based and cell-based data +- Specialized methods for spatial analysis + +Righelli et al. doi: [10.1093/bioinformatics/btac299](https://academic.oup.com/bioinformatics/article/38/11/3128/6575443?login=false) + +```{r, echo=FALSE, out.width="700px"} +library(here) + +knitr::include_graphics(here("inst/images/spatialExperimentClass.png")) +``` + +```{r, eval=FALSE} +# Install BiocManager +if (!requireNamespace("BiocManager", quietly = TRUE)) + install.packages("BiocManager") + +# Install SpatialExperiment package +BiocManager::install("SpatialExperiment") + +``` + +### 3. Downloading Example Dataset + +We'll work with data from the `spatialLIBD` package, which contains spatial transcriptomics data from human dorsolateral prefrontal cortex. This dataset provides an excellent example for learning spatial analysis techniques as it includes: + +- Multiple tissue sections +- Known anatomical layers +- Rich molecular information +- High-quality imaging data + +```{r, message=FALSE, warning=FALSE} + + +# Load the SpatialExperiment package + +# For SpatialExperiment the following might be needed +# 1. bash: module load ImageMagick/6.9.11-22 +# 2. R: devtools::install_github("ropensci/magick") + +library(SpatialExperiment) +``` + +Visualisation functions for spatial transcriptomics data. + +```{r, message=FALSE, warning=FALSE} +library(ggspavis) +``` + +Utility packages for single-cell data. + +```{r, message=FALSE, warning=FALSE} +library(scuttle) +library(scater) +library(scran) +``` + +In this section, we explore the handling and processing of spatial transcriptomics data using the `spatialLIBD` and `ExperimentHub` packages. The following R code block retrieves a specific dataset from the `ExperimentHub`, a Bioconductor project designed to manage and distribute large biological data sets. The code efficiently fetches the data, removes any existing dimensional reductions, and filters the dataset to include only selected samples. This approach is essential for analysing spatial patterns in gene expression across multiple samples, and the code below exemplifies how to manipulate these datasets in preparation for further analysis. This process is adapted from the `Banksy` package's vignette, which provides advanced methods for multi-sample spatial transcriptomics. + +Maynard and Torres et al., doi: [10.1038/s41593-020-00787-0](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8095368/), [tutorial](https://www.bioconductor.org/packages/release/data/experiment/vignettes/spatialLIBD/inst/doc/spatialLIBD.html) + +```{r, message=FALSE, warning=FALSE} +library(spatialLIBD) +library(ExperimentHub) + +# # To avoid error for SPE loading +# # https://support.bioconductor.org/p/9161859/#9161863 +# setClassUnion("ExpData", c("matrix", "SpatialExperiment")) + +spatial_data <- + ExperimentHub::ExperimentHub() |> + spatialLIBD::fetch_data( eh = _, type = "spe") + +names(libd_layer_colors) = gsub("ayer", "", names(libd_layer_colors)) + +# Clear the reductions +reducedDims(spatial_data) = NULL + +# Select only 3 samples +spatial_data = spatial_data[,spatial_data$sample_id %in% c("151673", "151675", "151676")] + +# Display the object +spatial_data + +``` + +From: + +We shows metadata for each cell, helping understand the dataset's structure. + +```{r} +col_data = colData(spatial_data) + +knitr::kable( + head(col_data), + format = "html" +) +``` + +We access and display feature-related information from the dataset. + +```{r} +row_data = rowData(spatial_data) + +knitr::kable( + head(row_data), + format = "html" +) +``` + +Here, we perform a preliminary examination of the assay data contained within the spatial dataset. + +```{r} +assay(spatial_data)[1:20, 75:100] +``` + +### 4. Data Visualisation and Manipulation + +Spatial transcriptomics data requires specialized visualization approaches to understand both molecular and spatial aspects simultaneously. The `ggspavis` package provides powerful tools for this purpose: + +```{r, fig.width=6, fig.height=6} +# image data +imgData(spatial_data) + +# Simple visualization of spatial data +ggspavis::plotSpots(spatial_data) + + facet_wrap(~sample_id) + +``` + +We can enhance our understanding by adding layer annotations. In this dataset, layers L1-6 represent different cortical layers, while WM indicates white matter: + +```{r, fig.width=6, fig.height=6} +# Plot spots with anatomical annotations +ggspavis::plotSpots( + spatial_data, + annotate = "spatialLIBD" +) + + facet_wrap(~sample_id) +``` + +Explore additional visualisation features offered by the Visium platform, exposing the H&E (hematoxylin and eosin) image. + +```{r, fig.width=6, fig.height=6} +ggspavis::plotVisium(spatial_data, point_size = 0.5) +``` + +This visualisation focuses on specific tissue features within the dataset, emphasising areas of interest. +1. **Mitochondrial Content**: High mitochondrial gene expression often indicates stressed or dying cells +2. **Library Size**: Total RNA content per spot +3. **Number of Detected Genes**: Diversity of gene expression per spot + +```{r, fig.width=6, fig.height=6} +ggspavis::plotVisium( + spatial_data, + annotate = "spatialLIBD", + highlight = "in_tissue", + point_size =0.5 +) + + facet_wrap(~sample_id) + +``` + +### 5. Quality control and filtering + +We will use the `scater` package [McCarthy et al. 2017](https://academic.oup.com/bioinformatics/article/33/8/1179/2907823?login=true) to compute the three primary QC metrics we discussed earlWe'llUsing the scater Package for QC Metrics: We'll apply the `scater` package to compute three primary quality control metrics. We'll also use `ggspavis` for visualisation along with some custom plotting techniques. + +Previously, we visualised both on- and off-tissue spots. Moving forward, we focus on on-tissue spots for more relevant analyses. This block shows how to filter out off-tissue spots to refine the dataset. + +Source [OSTAWorkshopBioc2021](https://lmweber.org/OSTAWorkshopBioc2021/articles/Vignette03_Analysis_workflow.html) + +```{r} +## Dataset dimensions before the filtering +dim(spatial_data) +``` + +Filtering Dataset to Retain Only On-Tissue Spots: We then refine our dataset by keeping only those spots that are on-tissue, aligning with our focus for subsequent analysis. The dimensions of the dataset after filtering are displayed to confirm the changes. + +```{r} +## Subset to keep only on-tissue spots +spatial_data <- spatial_data[, colData(spatial_data)$in_tissue == 1] +dim(spatial_data) +``` + +#### Mitochondrial + +Next, we identify mitochondrial genes, as they are indicative of cell health. Cells with high mitochondrial gene expression typically indicate poor health or dying cells, which we aim to exclude. + +```{r} +## Classify genes as "mitochondrial" (is_mito == TRUE) +## or not (is_mito == FALSE) +is_gene_mitochondrial <- grepl("(^MT-)|(^mt-)", rowData(spatial_data)$gene_name) +rowData(spatial_data)$gene_name[is_gene_mitochondrial] +``` + +After identifying mitochondrial genes, we apply quality control metrics to further clean the dataset. This involves adding per-cell QC measures and setting a threshold to exclude cells with high mitochondrial transcription. + +```{r} +## Add per-cell QC metrics to spatial data using identified mitochondrial genes +spatial_data <- scater::addPerCellQC( + spatial_data, + subsets = list(mito = is_gene_mitochondrial) +) + +## Select expressed genes threshold +qc_mitochondrial_transcription <- colData(spatial_data)$subsets_mito_percent > 30 + +## Check how many spots are filtered out +table(qc_mitochondrial_transcription) +``` + +After applying the QC metrics, it's crucial to visually assess their impact. This step involves checking the spatial pattern of the spots removed based on high mitochondrial transcription, helping us understand the distribution and quality of the remaining dataset. + +```{r, fig.width=6, fig.height=6} +## Add threshold in colData +colData(spatial_data)$qc_mitochondrial_transcription <- qc_mitochondrial_transcription + +## Visualize spatial pattern of filtered spots +plotSpotQC( + spatial_data, + plot_type = "spot", + annotate = "qc_mitochondrial_transcription" +) + + facet_wrap(~sample_id) +``` + +#### Library Size Analysis + +This analysis focuses on examining the distribution of library sizes across different spots. It uses a histogram and density plot to visualise the range and commonality of library sizes in the dataset. + +```{r, fig.width=6, fig.height=6} +## Visualize library size distribution +data.frame(colData(spatial_data)) |> + ggplot(aes(x = sum)) + + geom_histogram(aes(y = after_stat(density)), bins = 60) + + geom_density() + + scale_x_log10() + + xlab("Library size") + + ylab("Density") + + theme_classic() +``` + +Setting Library Size Threshold: After examining the library sizes, a threshold is applied to identify spots with library sizes below 700, which are considered for potential exclusion from further analysis. + +```{r} +## Select library size threshold +qc_total_counts <- colData(spatial_data)$sum < 700 + +## Check how many spots are filtered out +table(qc_total_counts) +``` + +Incorporating Library Size Threshold in Dataset: This step involves adding the library size threshold to the dataset's metadata and examining the spatial pattern of the spots that have been removed based on this criterion. + +```{r, fig.width=6, fig.height=6} + +## Add threshold in colData +colData(spatial_data)$qc_total_counts <- qc_total_counts + +## Check for putative spatial pattern of removed spots +plotSpotQC( + spatial_data, + plot_type = "spot", + annotate = "qc_total_counts", +) + + facet_wrap(~sample_id) + + +``` + +#### Detected genes + +This analysis examines how many genes are expressed per spot, using a histogram and density plot to visualise the distribution of gene counts across the dataset. + +```{r, fig.width=6, fig.height=6} +## Density and histogram of library sizes +data.frame(colData(spatial_data) ) |> + ggplot(aes(x = detected)) + + geom_histogram(aes(y = after_stat(density)), bins = 60) + + geom_density() + + scale_x_log10() + + xlab("Number of genes with > 0 counts") + + ylab("Density") + + theme_classic() +``` + + +Setting Gene Expression Threshold: This block applies a threshold to identify spots with fewer than 500 detected genes, considering these for exclusion to ensure data quality. + +```{r} +## Select expressed genes threshold +qc_detected_genes <- colData(spatial_data)$detected < 500 +## Check how many spots are filtered out +table(qc_detected_genes) +``` + +Incorporating Gene Expression Threshold in Dataset: After setting the gene expression threshold, it is added to the dataset's metadata. The spatial pattern of spots removed based on this threshold is then examined. + +```{r, fig.width=6, fig.height=6} +## Add threshold in colData +colData(spatial_data)$qc_detected_genes <- qc_detected_genes + +## Check for putative spatial pattern of removed spots +plotSpotQC( + spatial_data, + plot_type = "spot", + annotate = "qc_detected_genes", +) + + facet_wrap(~sample_id) + +``` + +Exploring the Relationship Between Library Size and Number of Genes Detected: This analysis explores the correlation between library size and the number of genes detected in each spot, providing insights into data quality and sequencing depth. + +```{r, fig.width=6, fig.height=6} +## Density and histogram of library sizes +data.frame(colData(spatial_data)) |> + ggplot(aes(sum, detected)) + + geom_point(shape=".") + + scale_x_log10() + + scale_y_log10() + + xlab("Library size") + + ylab("Number of genes with > 0 counts") + + theme_classic() +``` + +#### Combined filtering + +After applying all QC filters, this block combines them and stores the results in the dataset. The spatial patterns of all discarded spots are then reviewed to ensure comprehensive quality control. + +```{r, fig.width=6, fig.height=6} + +## Store the set in the object +colData(spatial_data)$discard <- qc_total_counts | qc_detected_genes | qc_mitochondrial_transcription + +## Check the spatial pattern of combined set of discarded spots +plotSpotQC( + spatial_data, + plot_type = "spot", + annotate = "discard", +) + + facet_wrap(~sample_id) + +``` + +The final step in data preprocessing involves removing all spots identified as low-quality based on the previously applied thresholds, refining the dataset for subsequent analyses. + +```{r} +spatial_data = spatial_data[,!colData(spatial_data)$discard ] +``` + +### 6. Dimensionality reduction + +Dimensionality reduction is essential in spatial transcriptomics due to the high-dimensional nature of the data, which includes vast gene expression profiles across various spatial locations. Techniques such as PCA (Principal Component Analysis) and UMAP (Uniform Manifold Approximation and Projection) are particularly valuable. PCA helps to reduce noise and highlight the most significant variance in the data, making it simpler to uncover underlying patterns and correlations. UMAP, ofen calculated from principal components (and not directly from features) preserves both global and local data structures, enabling more nuanced visualisations of complex cellular landscapes. Together, these methods facilitate a deeper understanding of spatial gene expression, helping to reveal biological insights such as cellular heterogeneity and tissue structure, which are crucial for both basic biological research and clinical applications. + +#### Variable gene identification + +```{r, message=FALSE, warning=FALSE, fig.width=6, fig.height=6} + +genes <- !grepl(pattern = "^Rp[l|s]|Mt", x = rownames(spatial_data)) +dec = scran::modelGeneVar(spatial_data, subset.row = genes) + + +# Visualisation +plot(dec$mean, dec$total, xlab = "Mean log-expression", ylab = "Variance") +curve(metadata(dec)$trend(x), col = "blue", add = TRUE) + +# Get top variable genes +dec = scran::modelGeneVar(spatial_data, subset.row = genes, block = spatial_data$sample_id) +hvg = scran::getTopHVGs(dec, n = 1000) + +rowData(spatial_data[head(hvg),])[,c("gene_id", "gene_name")] +``` + +#### PCA + +With the highly variable genes, we perform PCA to reduce dimensionality, followed by UMAP to visualise the data in a lower-dimensional space, enhancing our ability to observe clustering and patterns in the data. + +```{r, fig.width=6, fig.height=6} +spatial_data <- + spatial_data |> + scuttle::logNormCounts() |> + scater::runPCA(subset_row = hvg) + +## Check correctness - names +reducedDimNames(spatial_data) + +reducedDim(spatial_data, "PCA")[1:5, 1:5] +``` + +::: {.note} +As for single-cell data, we need to verify that there is not significant batch effect. If so we need to adjust for it (a.k.a. integration) before calculating principal component. Many adjustment methods to output adjusted principal components directly. +::: + + +#### UMAP + +You can appreciate that, in this case, selecting within-sample variable genes, we do not see major batch effects across samples. We see two major pixel clusters. + +We can appreciate that there are no major batch effects across samples, and we don't see grouping driven by sample_id. + +```{r, fig.width=6, fig.height=6} +set.seed(42) +spatial_data <- scater::runUMAP(spatial_data, dimred = "PCA") + +scater::plotUMAP(spatial_data, colour_by = "sample_id", point_size = 0.2) +``` + +::: {.note} +**Exercise 1.1** + +Visualise where the two macro clusters are located spatially. We will take a very pragmatic approach and get cluster label from splitting the UMAP coordinated in two (`colData()` and `reducedDim()` will help us, see above), and then visualise it with `ggspavis`. + +- Modify the `SpatialExperiment` object based on the UMAP1 dimension so to divide those 2 cluster +- Visualise the UMAP colouring by the new labelling +- Visualise the Visium slide colouring by the new labelling +::: + +### 7. Clustering + +Clustering in spatial transcriptomics is crucial for understanding the intricate cellular composition of tissues. This technique groups cells/pixels based on similar gene expression profiles, enabling the identification of distinct cell types and states within a tissue's spatial context. Clustering reveals patterns of cellular organisation and differentiation, and interactions in the microenvironment. + +#### Transcriptome based nearest neighbours + +First, we establish the number of nearest neighbors to use in the k-NN graph. This graph forms the basis for clustering, using the Walktrap algorithm to detect community structures that suggest natural groupings or clusters in the data. `buildSNNGraph` is from the `scan` package. + +```{r} + +## Set number of Nearest-Neighbours (NNs) +k <- 10 + +## Build the k-NN graph +g_walk <- + spatial_data |> + scran::buildSNNGraph( + k = 10, + use.dimred = "PCA" + ) |> + igraph::cluster_walktrap() + +clus <- g_walk$membership +## Check how many +table(clus) +``` + +Applying Clustering Labels and Visualising Results: After determining clusters, we apply these labels back to the spatial data and visualise the results using UMAP. This allows us to observe how the data clusters in a reduced dimension space, and further visualise how these clusters map onto the physical tissue sections for context. + +We can appreciate here that we get two main pixel clusters. + +```{r, fig.width=6, fig.height=6} +colLabels(spatial_data) <- factor(clus) + +scater::plotUMAP(spatial_data, colour_by = "label") + scale_color_brewer(palette = "Paired") +``` + +Those two clusters group the white matter from the rest of the layers. + +```{r, fig.width=6, fig.height=6} +## Plot in tissue map +ggspavis::plotSpots(spatial_data, annotate = "label") + + facet_wrap(~sample_id) + + scale_color_brewer(palette = "Paired") +``` + +As for comparison, we show the manually annotated regions. We can see that while the single cell style clustering catchers, the overall tissue, architecture, a lot of details are not retrieved. We clusters cannot faithfully recapitulate the tissue morphology. However, they might represent specific cell types within morphological regions. + +```{r, fig.width=6, fig.height=6} +## Plot ground truth in tissue map +ggspavis::plotSpots(spatial_data, annotate = "spatialLIBD") + + facet_wrap(~sample_id) + + scale_color_manual(values = libd_layer_colors) + +``` + +#### Spatially-aware clustering + +To cluster spatial regions (i.e. tissue domain) rather than single-cell types, the clustering algorithms need to take spatial context into account. For example what is the transcriptional profile of the neighbouring pixels or neighbouring cells. + +BANKSY combines molecular and spatial information. BANKSY leverages the fact that a cell's state can be more fully represented by considering both its own transcriptome "nd that of its local microenvironment.This algorithm embeds cells within a combined space that incorporates their own transcriptome and that of their locell'svironment, representing both the cell state and the surrounding microenvironment. + +Overview of the algorithm + +- \* Construct a neighborhood graph between cells in physical space (k-nearest neighbors or radius nearest neighbors). + +- \* We use neighborhood graph to compute two matrices: + +-- \*\* Average neighborhood expression matrix + +-- \*\* "Azimuthal Gabor filter" matrix. It represents the transcriptomic microenvironment around each cell. It measures the gradient of gene expression in each cell's neighborhood. + +- \* These matrices are then scaled on the basis of a mixing parameter λ, which controls their relative weighting + +- \* Concatenate these two matrices with the original gene–cell expression matrix + +- \* Combine these three matrices by direct product + +[Singhal et al., 2025](https://www.nature.com/articles/s41588-024-01664-3) + +[Material source](https://bioconductor.org/packages/release/bioc/vignettes/Banksy/inst/doc/multi-sample.html) + +```{r, eval=TRUE, message=FALSE, warning=FALSE} +library(Banksy) + +# scale the counts, without log transformation +spatial_data = spatial_data |> logNormCounts(log=FALSE, name = "normcounts") +``` + +**Highly-variable genes** + +The Banksy documentation, suggest the use of `Seurat` for the detection of highly variable genes. + +```{r, message=FALSE, warning=FALSE} +library(Seurat) + +# Convert to list +spatial_data_list_for_seurat <- lapply(unique(spatial_data$sample_id), function(x) + spatial_data[, spatial_data$sample_id == x ] +) + +seu_list <- lapply(spatial_data_list_for_seurat, function(x) { + x <- as.Seurat(x, data = NULL) + NormalizeData(x, scale.factor = 5000, normalization.method = 'RC') +}) + +# Compute HVGs +hvgs <- lapply(seu_list, function(x) { + VariableFeatures(FindVariableFeatures(x, nfeatures = 2000)) +}) +hvgs <- Reduce(union, hvgs) +rm(seu_list, spatial_data_list_for_seurat) + +rowData(spatial_data[head(hvgs),])[,c("gene_id", "gene_name")] + +``` + +We now split the data by sample, to compute the neighbourhood matrices. + +```{r, message=FALSE, warning=FALSE} +# Convert to list +spatial_data_list <- lapply(unique(spatial_data$sample_id), function(x) + spatial_data[ + hvgs, + spatial_data$sample_id == x + ] +) + +spatial_data_list <- lapply( + spatial_data_list, + computeBanksy, # Compute the component neighborhood matrices + assay_name = "normcounts" +) + +# Rejoin the datasets +spe_joint <- do.call(cbind, spatial_data_list) +``` + +Here, we perform PCA using the BANKSY algorithm on the joint dataset. The group argument specifies how to treat different samples, ensuring that features are scaled separately per sample group to account for variations among them. + +::: {.note} +Note: this step takes long time +::: + +```{r, eval=TRUE, message=FALSE, warning=FALSE} +spe_joint <- runBanksyPCA( # Run PCA on the Banskly matrix + spe_joint, + lambda = 0.2, # spatial weighting parameter. Larger values (e.g. 0.8) incorporate more spatial neighborhood + group = "sample_id", # Features belonging to the grouping variable will be z-scaled separately. + seed = 42 +) +``` + +Once the dimensional reduction is complete, we cluster the spots across all samples and use `connectClusters` to visually compare these new BANKSY clusters against manual annotations. + +```{r, eval=TRUE, message=FALSE, warning=FALSE} +spe_joint <- clusterBanksy( # clustering on the principal components computed on the BANKSY matrix + spe_joint, + lambda = 0.2, # spatial weighting parameter. Larger values (e.g. 0.8) incorporate more spatial neighborhood + resolution = 0.7, # numeric vector specifying resolution used for clustering (louvain / leiden). + seed = 42 +) +colData(spe_joint)$clust_annotation = colData(spe_joint)$Cluster + +spe_joint <- connectClusters(spe_joint) +``` + +As an optional step, we smooth the cluster labels for each sample independently, which can enhance the visual coherence of clusters, especially in heterogeneous biological samples. + +From SpiceMix paper [Chidester et al., 2023](https://www.nature.com/articles/s41588-022-01256-z) + +```{r, eval=TRUE, message=FALSE, warning=FALSE} +spatial_data_list <- lapply( + unique(spe_joint$sample_id), + function(x) + spe_joint[, spe_joint$sample_id == x] +) + +spatial_data_list <- lapply( + spatial_data_list, + smoothLabels, + cluster_names = "clust_M0_lam0.2_k50_res0.7", + k = 6L, + verbose = FALSE +) +names(spatial_data_list) <- paste0("sample_", unique(spe_joint$sample_id)) +``` + +The raw and smoothed cluster labels are stored in the `colData` slot of each `SingleCellExperiment` or `SpatialExperiment` object. + +```{r, eval=TRUE} +cluster_metadata = + colData(spatial_data_list$sample_151673)[, c("clust_M0_lam0.2_k50_res0.7", "clust_M0_lam0.2_k50_res0.7_smooth")] + + +knitr::kable(head(cluster_metadata), format = "html") +``` + +Using cluster comparison metrics like the adjusted Rand index (ARI) we evaluate the performance of our clustering approach. This statistical analysis helps validate the clustering results against known labels or pathologies. + +The Adjusted Rand Index (ARI) is a measure of the similarity between two data clusterings. Measures degree of overlapping between two partitions. + +```{r, eval=TRUE} +compareClusters(spatial_data_list$sample_151673, func = 'ARI') +``` + +We calculate the ARI for each sample to assess the consistency and accuracy of our clustering across different samples. + +```{r, eval=TRUE} +ari <- sapply(spatial_data_list, function(x) as.numeric(tail(compareClusters(x, func = "ARI")[, 1], n = 1))) +ari +``` + +Visualising Clusters and Annotations on Spatial Coordinates: We utilise the ggspavis package to visually map BANKSY clusters and manual annotations onto the spatial coordinates of the dataset, providing a comprehensive visual overview of spatial and clustering data relationships. + +```{r multi-sample-spatial, eval=TRUE, fig.width=6, fig.height=6} +# Use scater:::.get_palette('tableau10medium') +library(cowplot) + +pal <- c( + "#1abc9c", "#3498db", "#9b59b6", "#e74c3c", "#34495e", + "#f39c12", "#d35400", "#7f8c8d", "#2ecc71", "#e67e22" +) + + ggspavis::plotSpots( + do.call(cbind, spatial_data_list), + annotate = sprintf("%s_smooth", "clust_M0_lam0.2_k50_res0.7"), + pal = pal + ) + + facet_wrap(~sample_id) + + theme(legend.position = "none") + + labs(title = "BANKSY clusters") + + ggspavis::plotSpots( + do.call(cbind, spatial_data_list), + annotate = sprintf("%s", "clust_M0_lam0.2_k50_res0.7"), + pal = pal + ) + + facet_wrap(~sample_id) + + theme(legend.position = "none") + + labs(title = "BANKSY clusters") + +ggspavis::plotSpots(spatial_data, annotate = "spatialLIBD") + + facet_wrap(~sample_id) + + scale_color_manual(values = libd_layer_colors) + + theme(legend.position = "none") + + labs(title = "spatialLIBD regions") +``` + +::: {.note} +**Exercise 1.2** + +We have applied cluster smoothing using `smoothLabels`. How much do you think this operation has affected the cluster labels. To find out, + +- Plot the non smoothed cluster +- identify the pixel that have been smoothed, and +- visualise them using `plotSpotQC` that we have used above. +::: + +### 8. Deconvolution of pixel-based spatial data + +One of the popular algorithms for spatial deconvolution is SPOTlight. [Elosua-Bayes et al., 2021](https://academic.oup.com/nar/article/49/9/e50/6129341). + +[Source](https://bioconductor.org/packages/devel/bioc/vignettes/SPOTlight/inst/doc/SPOTlight_kidney.html) + +SPOTlight uses a seeded non-negative matrix factorization regression, initialized using cell-type marker genes and non-negative least squares. + +#### Producing the reference for single-cell databases + +Here, we retrieve and prepare a single-cell RNA reference. The dataset in question, zhong-prefrontal-2018, originates from a study by Zhong et al. (2018), which offers a comprehensive single-cell transcriptomic survey of the human prefrontal cortex during development . Utilising the scRNAseq package, the dataset is fetched and subsequently processed to aggregate counts across cells sharing the same sample and cell type, thereby reducing data complexity and enhancing interpretability. Further filtering steps ensure the removal of empty columns and entries with missing cell type annotations. Finally, the logNormCounts function from the scuttle package is applied to perform log-normalisation, a crucial step for mitigating technical variability and preparing the data for accurate comparative analyses . + +```{r, message=FALSE, warning=FALSE, fig.width=6, fig.height=6} +# Get reference +library(scRNAseq) +brain_reference <- fetchDataset("zhong-prefrontal-2018", "2023-12-22") + +brain_reference = + brain_reference |> + scuttle::aggregateAcrossCells(ids = paste(brain_reference$sample, brain_reference$cell_types, sep = "_")) + +brain_reference = brain_reference[, brain_reference |> assay() |> colSums() > 0] +brain_reference = brain_reference[, !brain_reference$cell_types |> is.na()] + +brain_reference = + brain_reference |> + logNormCounts() +``` + +```{r, message=FALSE} +knitr::kable(head(colData(brain_reference)), format = "html") +``` + +These are the cell types included in our reference, and the number of pseudobulk samples we have for each cell type. + +```{r} + +table(brain_reference$cell_types) + +``` + +These are the number of samples we have for each of the three data sets. + +```{r} + +table(brain_reference$sample) +``` + + +Now, we identify the variable genes within each dataset, to not capture technical effects, and identify the union of variable genes for further analysis. + +```{r, warning=FALSE} +genes <- !grepl(pattern = "^Rp[l|s]|Mt", x = rownames(brain_reference)) + +# Convert to list +brain_reference_list <- lapply(unique(brain_reference$dataset_id), function(x) brain_reference[, brain_reference$dataset_id == x]) + +dec = scran::modelGeneVar(brain_reference, subset.row = genes, block = brain_reference$sample_id) +hvg_CAQ = scran::getTopHVGs(dec, n = 1000) + +hvg_CAQ = unique( unlist(hvg_CAQ)) + +head(hvg_CAQ) +``` + +Initially, the code prepares the spatial data by setting gene names as row identifiers. + +```{r} +spatial_data_gene_name = spatial_data +rownames(spatial_data_gene_name) <- rowData(spatial_data_gene_name)$gene_name +spatial_data_gene_name = logNormCounts(spatial_data_gene_name) +``` + +We then identify and score relevant marker genes based on their expression and significance across different cell types. The result is a curated list of high-potential marker genes, organised and ready for deeper analysis and interpretation in the context of spatial gene expression patterns. + +::: {.note} +This function provides a convenience wrapper for marker gene identification between groups of cells, based on running `pairwiseTTests.` + +This function represents a simpler and more intuitive summary of the differences between the groups. We do this by realizing that the p-values for these types of comparisons are largely meaningless; individual cells are not meaningful units of experimental replication, while the groups themselves are defined from the data. Thus, by discarding the p-values, we can simplify our marker selection by focusing only on the effect sizes between groups. +::: + +```{r} +mgs <- scran::scoreMarkers( + brain_reference, + groups = brain_reference$cell_types, + + # Omit mitochondrial genes and keep all the genes in spatial + subset.row = + grep("(^MT-)|(^mt-)|(\\.)|(-)", rownames(brain_reference), value=TRUE, invert=TRUE) |> + intersect(rownames(spatial_data_gene_name)) +) + +# Select the most informative markers +mgs_df <- lapply(names(mgs), function(i) { + x <- mgs[[i]] + + # Filter and keep relevant marker genes, those with AUC > 0.8 + x <- x[x$mean.AUC > 0.8, ] + + # Sort the genes from highest to lowest weight + x <- x[order(x$mean.AUC, decreasing = TRUE), ] + + # Add gene and cluster id to the dataframe + x$gene <- rownames(x) + x$cluster <- i + data.frame(x) +}) +mgs_df <- do.call(rbind, mgs_df) + +head(mgs_df) +``` + +We now utilise `SPOTlight` to deconvolve tisslet'smposition from our independent mouse brain reference. The result is visualised through a scatter pie plot, which provides a graphical representation of the spatial distribution of cell types within a let'sfic sample. This visualisation aids in understanding the spatial heterogeneity. + +```{r, fig.width=6, fig.height=6, message=FALSE, warning=FALSE} +library(SPOTlight) + +res <- SPOTlight( + x = brain_reference |> assay("logcounts"), + y = spatial_data_gene_name |> assay("logcounts"), + groups = brain_reference$cell_types, + group_id = "cluster", + mgs = mgs_df, + hvg = intersect(hvg_CAQ, rownames(spatial_data_gene_name)), + weight_id = "mean.AUC", + gene_id = "gene" +) + +cell_first_sample = colData(spatial_data_gene_name)$sample_id=="151673" + +plotSpatialScatterpie( + x = spatial_data_gene_name[,cell_first_sample], + y = res$mat[cell_first_sample,], + cell_types = colnames(res$mat[cell_first_sample,]), + img = FALSE, + scatterpie_alpha = 1, + pie_scale = 0.4 +) + +``` + +Let's visualise without pit'syte_cell and endothelial cells, which oclet'sa lot of the visual spectrum. + +```{r, fig.width=6, fig.height=6} + +plotSpatialScatterpie( + x = spatial_data_gene_name[,cell_first_sample], + y = res$mat[cell_first_sample,-c(2,9)], + cell_types = colnames(res$mat[cell_first_sample,-c(2,9)]), + img = FALSE, + scatterpie_alpha = 1, + pie_scale = 0.4 +) + +``` + +Let's also exclude without muscle_cell, which occupy a lot of the visual spectrum. + +```{r, fig.width=6, fig.height=6} + +plotSpatialScatterpie( + x = spatial_data_gene_name[,cell_first_sample], + y = res$mat[cell_first_sample,-c(2, 9, 5)], + cell_types = colnames(res$mat[cell_first_sample,-c(2, 9, 5)]), + img = FALSE, + scatterpie_alpha = 1, + pie_scale = 0.4 +) + +``` + +No, let's look at the correlation matrices to see which cell type are most often occurring rather than mutually exclusive within our data set. + +```{r, fig.width=6, fig.height=6} + +plotCorrelationMatrix(res$mat) +``` +```{r} +mat_df = as.data.frame(res$mat) +``` + +#### Excercise + + + +::: {.note} +**Exercise 1.4** + +Rather than looking at the correlation matrix, overall, let's observe whether the correlation structure amongst cell types is consistent across samples. Do you think it's consistent or noticeably different? +::: + + +::: {.note} +**Exercise 1.5** + +## Exercise 1.5 (adapted to your current cell types) + +Some of the most positive correlations in the new matrix are seen between: + +- **Microglia** and **Neurons** +- **Astrocytes** and **Stem.cells** + +> **Microglia** are the resident immune cells of the central nervous system, constantly surveying the parenchyma and clearing debris. +> **Neurons** are the electrically excitable cells that transmit and process information via synaptic connections. +> **Astrocytes** are star-shaped glia that support neuronal metabolism, regulate extracellular ions and neurotransmitter uptake. +> **Stem.cells** denote undifferentiated progenitors capable of self-renewal and differentiation into multiple neural lineages. + +Let us now **visualise** where these pairs of cell types most co-occur in your spatial map. For **each** pair, carry out the following: + +1. **Label** any pixel where both cell types exceed 10 % abundance (i.e. > 0.1). +2. **Label** any pixel where the _sum_ of their abundances exceeds 40 % (i.e. > 0.4). +3. **Plot** the spatial coordinates of all pixels, **colouring** them by this new label (for example: + - `0` = neither condition met + - `1` = both abundances > 0.1 + - `2` = summed abundance > 0.4 + +You should end up with two analogous visualisations: + +- **Microglia + Neurons** +- **Astrocytes + Stem.cells** + +Feel free to reuse your previous code, simply substituting the cell-type columns and updating the thresholds as above. + +::: + + + +#### Bonus - Alternative reference from the Human Cell Atlas - using cellNexus + +[cellNexus](https://stemangiola.github.io/cellNexus/) is a query interface that allow the programmatic exploration and retrieval of the harmonised, curated and reannotated CELLxGENE single-cell human cell atlas. Data can be retrieved at cell, sample, or dataset levels based on filtering criteria. + +Harmonised data is stored in the ARDC Nectar Research Cloud, and most cellNexus functions interact with Nectar via web requests, so a network connection is required for most functionality. + +Mangiola et al., 2025 doi [doi.org/10.1101/2023.06.08.542671](https://www.biorxiv.org/content/10.1101/2023.06.08.542671v3) + +```{r, echo=FALSE, out.width="700px"} +knitr::include_graphics(here("inst/images/curated_atlas_query.png")) +``` + + +```{r, eval = FALSE, message=FALSE, warning=FALSE, fig.width=3, fig.height=3} +# Get reference +library(cellNexus) +library(HDF5Array) + +tmp_file_path = tempfile() + +brain_reference = + + # Query metadata across 30M cells + get_metadata() |> + + # Filter your data of interest + dplyr::filter(tissue_groups=="cerebral lobes and cortical areas", disease == "Normal") |> + + # Collect pseudobulk as SummarizedExperiment + get_pseudobulk() |> + + # Normalise for Spotlight + scuttle::logNormCounts() |> + + # Save for fast reading + HDF5Array::saveHDF5SummarizedExperiment(tmp_file_path, replace = TRUE) +``` + +```{r, eval = FALSE, message=FALSE} +library(HDF5Array) + +brain_reference = HDF5Array::loadHDF5SummarizedExperiment(tmp_file_path) + +my_metadata = colData(brain_reference) + +knitr::kable(head(my_metadata), format = "html") +``` + +These are the cell types included in our reference, and the number of pseudobulk samples we have for each cell type. + +```{r, eval = FALSE} + +table(brain_reference$cell_type_harmonised) + +``` + +These are the number of samples we have for each of the three data sets. + +```{r, eval = FALSE} + +table(brain_reference$dataset_id) +``` + +The `collection_id` can be used to gather information on the cell database. e.g. + +```{r, eval = FALSE} +table(brain_reference$collection_id) +``` + + + +**Session Information** + +```{r} +sessionInfo() +``` + +**References** + +```{css echo=FALSE} +.note { + margin: 30px; + padding: 1em; + background: #FFF8F0; + border: 1px solid #EFE8E0; + border-radius: 10px; +} +``` diff --git a/vignettes/Session_2_Tidy_spatial_analyses.Rmd b/vignettes/Session_2_Tidy_spatial_analyses.Rmd new file mode 100644 index 0000000..533120e --- /dev/null +++ b/vignettes/Session_2_Tidy_spatial_analyses.Rmd @@ -0,0 +1,913 @@ +--- +title: "Tidy spatial analyses" +author: + - Stefano Mangiola, South Australian immunoGENomics Cancer Institute^[], Walter and Eliza Hall Institute^[] +output: rmarkdown::html_vignette +# bibliography: "`r file.path(system.file(package='tidySpatialWorkshop', 'vignettes'), 'tidyomics.bib')`" +vignette: > + %\VignetteIndexEntry{Tidy spatial analyses} + %\VignetteEncoding{UTF-8} + %\VignetteEngine{knitr::rmarkdown} +editor_options: + markdown: + wrap: 72 +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = TRUE, cache = FALSE) +library(here) +``` + + + +# Session 2: Tidying spatial data + +## Introduction to tidyomics + +`tidyomics` represents a significant advancement in bioinformatics analysis by bridging the gap between Bioconductor and the tidyverse ecosystem. This integration provides several key benefits: + +1. **Unified Analysis Framework**: Combines the power of Bioconductor's specialized biological data structures with tidyverse's intuitive data manipulation +2. **Maintained Compatibility**: Preserves original data containers and methods, ensuring long-term support +3. **Enhanced Workflow Efficiency**: Enables streamlined analysis pipelines using familiar tidyverse syntax + +The ecosystem includes several specialized packages: +- `tidySummarizedExperiment`: For bulk RNA-seq analysis +- `tidySingleCellExperiment`: For single-cell data +- `tidySpatialExperiment`: For spatial transcriptomics +- Additional tools: `plyranges`, `nullranges`, `tidyseurat`, `tidybulk`, `tidytof` + + +[tidySpatialWorkshop](https://github.com/tidyomics/tidySpatialWorkshop) +[tidy transcriptomic manifesto](https://tidyomics.github.io/tidyomicsBlog/post/2021-07-07-tidy-transcriptomics-manifesto/) + +`tidyomics` is an interoperable software ecosystem that bridges Bioconductor and the tidyverse. `tidyomics` is installable with a single homonymous meta-package. This ecosystem includes three new packages: tidySummarizedExperiment, tidySingleCellExperiment, and tidySpatialExperiment, and five publicly available R packages: `plyranges`, `nullranges`, `tidyseurat`, `tidybulk`, `tidytof`. Importantly, `tidyomics` leaves the original data containers and methods unaltered, ensuring compatibility with existing software, maintainability and long-term Bioconductor support. + +`tidyomics` is presented in "The tidyomics ecosystem: Enhancing omic data analyses" [Hutchison and Keyes et al., 2025](https://www.biorxiv.org/content/10.1101/2023.09.10.557072v1) + +```{r, echo=FALSE, out.width="700px"} +knitr::include_graphics(here("inst/images/tidyomics.png")) +``` + +[Slides](https://docs.google.com/gview?url=https://raw.githubusercontent.com/tidytranscriptomics-workshops/LoveMangiola2022_tidytranscriptomics/master/inst/LoveMangiola2022_tidytranscriptomics.pdf) + + + +### Installation + +let's make sure we get the latest packages available on github + +```{r, eval=FALSE} + +# In May 2025, the following packages should be installed from github repositories, to use the latest features. In case you have them pre installed, run the following command +BiocManager::install(c("lmweber/ggspavis", + "stemangiola/tidySummarizedExperiment", + "stemangiola/tidySingleCellExperiment", + "william-hutchison/tidySpatialExperiment", + "stemangiola/tidybulk", + "stemangiola/tidygate"), + update = FALSE) + +``` + +**Important:** Please restart your R session after installation to ensure the updated packages are loaded correctly. + +Let's load the libraries needed for this session + +```{r, message = FALSE} +library(SpatialExperiment) + +# Tidyverse library(tidyverse) +library(ggplot2) +library(plotly) +library(dplyr) +library(tidyr) +library(purrr) +library(glue) # sprintf +library(stringr) + +# Plotting +library(colorspace) +library(dittoSeq) +library(ggspavis) + +# Analysis +library(scuttle) +library(scater) +library(scran) + +``` + +Similarly to **Section 2**, this section uses `spatialLIBD` and `ExperimentHub` packages to gather spatial transcriptomics data. + +doi: [10.1038/s41593-020-00787-0](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8095368/) + + +```{r, message = FALSE} +# From https://bioconductor.org/packages/devel/bioc/vignettes/Banksy/inst/doc/multi-sample.html +library(spatialLIBD) +library(ExperimentHub) + +# To avoid error for SPE loading +# https://support.bioconductor.org/p/9161859/#9161863 +setClassUnion("ExpData", c("matrix", "SummarizedExperiment")) + +spatial_data <- + ExperimentHub::ExperimentHub() |> + spatialLIBD::fetch_data( eh = _, type = "spe") + +# Clear the reductions +reducedDims(spatial_data) = NULL + +# Make cell ID unique +colnames(spatial_data) = paste0(colnames(spatial_data), colData(spatial_data)$sample_id) +rownames(spatialCoords(spatial_data)) = colnames(spatial_data) # Bug? + +# Display the object +spatial_data +``` +::: {.note} +If `ExperimentHub` should not work. The `spatial_data` object from the previous code block can be downloaded from [Zenodo - 10.5281/zenodo.11233385](https://zenodo.org/records/11233385/files/tidySpatialWorkshop_spatial_data.rds?download=1) +::: + +## Working with tidySpatialExperiment + +The `tidySpatialExperiment` package creates a bridge between `SpatialExperiment` objects and the tidyverse ecosystem. It provides: + +1. A tidy data view of `SpatialExperiment` objects +2. Compatible dplyr, tidyr, ggplot and plotly functions +3. Seamless integration with existing SpatialExperiment functionality + +### 1. tidySpatialExperiment package + +`tidySpatialExperiment` provides a bridge between the `SpatialExperiment` single-cell package and the tidyverse [@wickham2019welcome]. It creates an invisible layer that enables viewing the `SpatialExperiment` object as a tidyverse tibble, and provides `SpatialExperiment`-compatible `dplyr`, `tidyr`, `ggplot` +and `plotly` functions. + +If we load the `tidySpatialExperiment` package and then view the single cell data, it now displays as a tibble. + +```{r message = FALSE} +library(tidySpatialExperiment) + +spatial_data +``` + +#### Data interface, display + +If we want to revert to the standard `SpatialExperiment` view we can do that. + +```{r} +options("restore_SpatialExperiment_show" = TRUE) +spatial_data +``` + +If we want to revert back to tidy SpatialExperiment view we can. + +```{r} +options("restore_SpatialExperiment_show" = FALSE) +spatial_data +``` + +::: {.note} +Note that **rows** in this context refers to rows of the abstraction, not **rows** of the SpatialExperiment which correspond to genes **tidySpatialExperiment** prioritizes cells as the units of observation in the abstraction, while the full dataset, including measurements of expression of all genes, is still available "in the background". +::: + +#### Original behaviour is preserved + +The tidy representation behaves exactly as a native `SpatialExperiment`. It can be interacted with using [SpatialExperiment commands](https://www.bioconductor.org/packages/release/bioc/vignettes/SpatialExperiment/inst/doc/SpatialExperiment.html) +such as `assays`. + +```{r} +assays(spatial_data) +``` + +### 2. Tidyverse commands + +We can also interact with our object as we do with any tidyverse tibble. We can use `tidyverse` commands, such as `filter`, `select` and `mutate` to explore the `tidySpatialExperiment` object. Some examples are shown below and more can be seen at the `tidySpatialExperiment` [website](https://stemangiola.github.io/tidySpatialExperiment/articles/introduction.html#tidyverse-commands-1). + +#### Select + +We can use `select` to view columns, for example, to see the filename, total cellular RNA abundance and cell phase. + +If we use `select` we will also get any view-only columns returned, such as the UMAP columns generated during the preprocessing. + +```{r} +spatial_data |> select(.cell, sample_id, in_tissue, spatialLIBD) +``` + +::: {.note} +Note that some columns are always displayed no matter whet. These column include special slots in the objects such as reduced dimensions, spatial coordinates (mandatory for `SpatialExperiment`), and sample identifier (mandatory for `SpatialExperiment`). +::: + +Although the select operation can be used as a display tool, to explore our object, it updates the `SpatialExperiment` metadata, subsetting the desired columns. + +```{r} +spatial_data |> + select(.cell, sample_id, in_tissue, spatialLIBD) |> + colData() +``` + +To select columns of interest, we can use `tidyverse` powerful pattern-matching tools. For example, using the method `contains` to select + +```{r} + +spatial_data |> + select(.cell, contains("sum")) +``` + + +#### Filter + +We can use `filter` to subset rows, for example, to keep our three samples we are going to work with. + +We just display the dimensions of the dataset before filtering + +```{r} +ncol(spatial_data) +``` + +```{r} +spatial_data = + spatial_data |> + filter(sample_id %in% c("151673", "151675", "151676")) + +spatial_data +``` + +Here we confirm that the tidy R manipulation has changed the underlining object. + +```{r} +ncol(spatial_data) +``` + +In comparison the base-R method recalls the variable multiple times + +```{r, eval=FALSE} +spatial_data = spatial_data[,spatial_data$sample_id %in% c("151673", "151675", "151676")] +``` + +Or for example, to see just the rows for the cells in spatialLIBD region L1. + +```{r} +spatial_data |> dplyr::filter(sample_id == "151673", spatialLIBD == "L1") +``` + +Flexible, more powerful filters with `stringr` + +```{r} + +spatial_data |> + dplyr::filter( + subject |> str_detect("Br[0-9]1"), + spatialLIBD == "L1" + ) + +``` + +#### Summarise + +The integration of all spot/pixel/cell-related information in one table abstraction is very powerful to speed-up data exploration ana analysis. + +```{r} + +spatial_data |> + filter(sum_umi < 1000) |> + count(sample_id) + +``` + +#### Mutate + +We can use `mutate` to create a column. For example, we could create a new `Phase_l` column that contains a lower-case version of `Phase`. + +::: {.note} +Note that the special columns `sample_id`, `pxl_col_in_fullres`, `pxl_row_in_fullres`, `PC*` are view only and cannot be mutated. +::: + +```{r message=FALSE} +spatial_data |> + mutate(spatialLIBD_lower = tolower(spatialLIBD)) |> + select(.cell, spatialLIBD, spatialLIBD_lower) +``` + +We can update the underlying `SpatialExperiment` object, for future analyses. And confirm that the `SpatialExperiment` metadata has been mutated. + +```{r message=FALSE} +spatial_data = + spatial_data |> + mutate(spatialLIBD_lower = tolower(spatialLIBD)) + +spatial_data |> + colData() |> + _[,c("spatialLIBD", "spatialLIBD_lower")] +``` + +We can mutate columns for on-the-fly analyses and exploration. Let's suppose one column has capitalisation inconsistencies, and we want to apply a unique filter. + +```{r message=FALSE} +spatial_data |> + mutate(spatialLIBD = tolower(spatialLIBD)) |> + filter(spatialLIBD == "wm") +``` + +#### Extract + +We can use tidyverse commands to polish an annotation column. We will extract the sample, and group information from the file name column into separate columns. + +```{r message=FALSE} + +# Simulate file path +spatial_data = spatial_data |> mutate(file_path = glue("../data/single_cell/{sample_id}/outs/raw_feature_bc_matrix/")) + + +# First take a look at the file column +spatial_data |> select(.cell, file_path) +``` + +Extract specific identifiers from complex data paths, simplifying the dataset by isolating crucial metadata. This process allows for clearer identification of samples based on their file paths, improving data organization. + +```{r} +# Create column for sample +spatial_data <- spatial_data |> + # Extract sample ID from file path and display the updated data + tidyr::extract(file_path, "sample_id_from_file_path", "\\.\\./data/single_cell/([0-9]+)/outs/raw_feature_bc_matrix/", remove = FALSE) + +# Take a look +spatial_data |> select(.cell, sample_id_from_file_path, everything()) +``` + +#### Unite + +We could use tidyverse `unite` to combine columns, for example to create a new column for sample id combining the sample and subject id +(BCB) columns. + +```{r message=FALSE} +spatial_data <- spatial_data |> unite("sample_subject", sample_id, subject, remove = FALSE) + +# Take a look +spatial_data |> select(.cell, sample_id, sample_subject, subject) +``` + +### 3. Advanced filtering/gating and pseudobulk + +`tidySpatialExperiment` provide a interactive advanced tool for gating region of interest for streamlined exploratory analyses. + +This capability is powered by `tidygate`. We show how you can visualise your data and manually drawing gates to select one or more regions of interest using an intuitive tidy grammar. From https://bioconductor.org/packages/devel/bioc/vignettes/tidySpatialExperiment/inst/doc/overview.html + +Let's draw an arbitrary gate interactively + +```{r, eval=FALSE} +spatial_data = + spatial_data |> + + # Filter one sample + filter(in_tissue, sample_id=="151673") |> + + # Gate based on tissue morphology + tidySpatialExperiment::gate(alpha = 0.1, colour = "spatialLIBD") + +spatial_data_gated = tidygate_env$gates +``` + +You can reload a pre-made gate for reproducibility + +```{r} +data(spatial_data_gated) + +spatial_data = + spatial_data |> + + # Filter one sample + filter(in_tissue, sample_id=="151673") |> + + # Gate based on tissue morphology + tidySpatialExperiment::gate(alpha = 0.1, colour = "spatialLIBD", programmatic_gates = tidySpatialWorkshop::spatial_data_gated) + +``` + +`tidySpatialExperiment` added a `.gated` column to the `SpatialExperiment` object. We can see this column in its tibble abstraction. + +```{r} + +spatial_data |> select(.cell, .gated) +``` + +We can count how many pixels we selected with simple `tidyverse` grammar + +```{r} +spatial_data |> count(.gated) +``` + +To have a visual feedback of our selection we can plot the slide annotating by our newly created column. + +```{r, fig.width=7, fig.height=8} +spatial_data |> + ggspavis::plotVisium(annotate = ".gated") +``` + + +```{r, echo=FALSE, out.width="300px"} +knitr::include_graphics(here("inst/images/tidySPE_gate.png")) +``` + +We can also filter, for further analyses + +```{r} +spatial_data |> + filter(.gated == 1) +``` + +::: {.note} +**Exercise 2.1** +Gate roughly the white matter layer of the tissue (bottom-left) and visualise in UMAP reduced dimensions where this manual gate is distributed. + +- Calculate PCA, UMAPs as we did for Session 1 +- Gate the area of white matter +- Plot UMAP dimensions according to the gating +::: + +### 4. Work with features + +By default `tidySpatialExperiment` (as well as `tidySingleCellExperiment`) focus their tidy abstraction on pixels and cells, as this is the key analysis and visualisation unit in spatial and single-cell data. This has proven to be a practical solution to achieve elegant `tidy` analyses and visualisation. + +In contrast, bulk data focuses to features/genes for analysis. In this case its tidy representation with `tidySummarizedExperiment` prioritise features, exposing them to the user. + +If you want to interact with features, the method `join_features` will be helpful. For example, we can add one or more features of interest to our abstraction. + +Let's add the astrocyte marker GFAP + +Find out ENSEMBL ID + +```{r} +rowData(spatial_data) |> + as_tibble() |> + filter( gene_name == "GFAP") +``` + +Join the feature to the metadata + +```{r} +spatial_data = + spatial_data |> + join_features("ENSG00000131095", shape="wide") + +spatial_data |> + select(.cell, ENSG00000131095) + +``` + + +::: {.note} +**Exercise 2.2** +Join the endothelial marker PECAM1 (CD31, look for ENSEMBL ID), and plot in space the pixel that are in the 0.75 percentile of EPCAM1 expression. Are the PECAM1-positive pixels (endothelial?) spatially clustered? + +- Get the ENSEMBL ID +- Join the feature to the tidy data abstraction +- Calculate the 0.75 quantile across all pixels `mutate()` +- Label the cells with high PECAM1 +- Plot the slide colouring for the new label +::: + + +### 5. Summarisation/aggregation + +#### Distinct + +We can quickly explore the elements of a variable with distinct + +```{r} +spatial_data |> + distinct(sample_id) +``` +We can `distinct` across multiple variables + +```{r} +spatial_data |> + distinct(sample_id, Cluster) +``` + +#### Count + +We can gather more information counting the instances of a variable + +```{r} +spatial_data |> + count(Cluster) |> + arrange(desc(n)) +``` + +We calculate summary statistics of a subset of data + +```{r} +spatial_data |> +filter(Cluster==1) |> + count(sample_id) |> + arrange(desc(n)) + +``` + +#### Aggregate + +For summarised analyses, we can aggregate pixels/cells as pseudobulk with the function `aggregate_cells`. This also works for `SingleCellExeriment`.We obtain a `SummarizedExperiment`. + +```{r} +spe_regions_aggregated <- + spatial_data |> + aggregate_cells(c(sample_id, spatialLIBD)) + +spe_regions_aggregated +``` + +`tidyomics` allows to cross spatial, single-cell (Bioconductor and seurat), and bulk keeping a consistent interface. + +```{r} +library(tidySummarizedExperiment) + +spe_regions_aggregated + +``` + +You will be able to apply the familiar `tidyverse` operations + +```{r} +spe_regions_aggregated |> + filter(sample_id == "151673") +``` + + + +### 6. tidyfying your workflow + +We will take workflow used in **Session 2**, performed using mostly base R syntax and convert it to tidy R syntax. We will show you how the readability and modularity of your workflow will improve. + +#### Subset to keep only on-tissue spots. + +**Base R approach:** + +```{r, eval=FALSE} +spatial_data <- spatial_data[, colData(spatial_data)$in_tissue == 1] +``` + +**Tidyverse Approach:** + +```{r} +spatial_data <- + spatial_data |> + filter(in_tissue == 1) +``` + +**Specific Differences and Advantages:** + +The `tidyverse` `filter()` function clearly states the intent to filter the dataset, whereas the Base R approach uses subsetting which might not be immediately clear to someone unfamiliar with the syntax. + +The `tidyverse` approach inherently supports chaining further operations without manually checking dimensions, assuming that users trust the operation to behave as expected. + +#### Manipulating feature information + +::: {.note} +For `SingleCellExperiment` there is no tidy API for manipulating feature wise data yet, on the contrary for `SummarizedExperiment`, because gene-centric the abstraction allow for direct gene information manipulation. Currently, `tidySingleCellExperiment` and `tidySpatialExperiment` do not prioritize the manipulation of features (genes). + +While these functions can employ genes for cell manipulation and visualisation, as demonstrated in `join_features()`, they lack tools for altering feature-related information. Instead, their primary focus is on cell information, which serves as the main observational unit in single-cell data. This contrasts with bulk RNA sequencing data, where features are more central. +::: + +The tidy API for `SingleCellExperiment` has feature-manipulation API among our plans. See [tidyomics challenges](https://github.com/orgs/tidyomics/projects/1) + +**Base R approach:** + +```{r} +is_gene_mitochondrial <- grepl("(^MT-)|(^mt-)", rowData(spatial_data)$gene_name) +rowData(spatial_data)$gene_name[is_gene_mitochondrial] +``` + +#### Quality Control: + +Apply quality control measures to exclude cells based on mitochondrial content and read/gene count, a common indicator of cell health and viability. + +**Base R approach:** + +```{r, eval=FALSE} +spatial_data <- addPerCellQC(spatial_data, subsets = list(mito = is_gene_mitochondrial)) + +## Select expressed genes threshold +qc_mitochondrial_transcription <- colData(spatial_data)$subsets_mito_percent > 30 +colData(spatial_data)$qc_mitochondrial_transcription <- qc_mitochondrial_transcription + +``` + +**Tidyverse Approach:** + +```{r} + +spatial_data <- + spatial_data |> + + # Add QC + addPerCellQC(subsets = list(mito = is_gene_mitochondrial)) |> + + ## Add threshold in colData + mutate( + qc_mitochondrial_transcription = subsets_mito_percent > 30 + ) + +spatial_data |> select(.cell, qc_mitochondrial_transcription) + +``` + +**Specific Differences and Advantages:** + +`tidyverse` pipelines these operations without storing intermediate results, directly updating the dataset. Base R separates these steps, requiring manual tracking of variables and updating the dataset in multiple steps, increasing complexity and potential for errors. + +Direct Data Mutation: Tidyverse directly mutates the dataset within the pipeline, whereas Base R extracts, computes, and then reassigns values, which can be more verbose and less efficient in terms of workflow clarity and execution. + +#### Group-specific analyses + +**Base R approach:** + +```{r, eval=FALSE, fig.width=7, fig.height=8} +# get gene for subset +genes <- !grepl(pattern = "^Rp[l|s]|Mt", x = rownames(spatial_data)) + +# Convert to list +spatial_data_list <- lapply(unique(spatial_data$sample_id), function(x) spatial_data[, spatial_data$sample_id == x]) + +# Detect sample-specific hughly-variable genes +marker_genes = + lapply( spatial_data_list, + function(x){ + dec = scran::modelGeneVar(x, subset.row = genes) + scran::getTopHVGs(dec, n = 1000) + } + ) + +head(unique(unlist(marker_genes))) + +``` + +**Tidyverse Approach: group_split** + +```{r, fig.width=7, fig.height=8} +# get gene for subset +genes <- !grepl(pattern = "^Rp[l|s]|Mt", x = rownames(spatial_data)) + +marker_genes = + spatial_data |> + + # Grouping + group_split(sample_id) |> + + # Loop across the list elements + map(~ .x |> + scran::modelGeneVar(subset.row = genes) |> + scran::getTopHVGs(n = 1000) + ) |> + reduce(union) + +marker_genes |> head() +``` + +**Tidyverse Approach: nest** + +```{r, fig.width=7, fig.height=8} + +spatial_data |> + nest(sample_data = -sample_id) |> + mutate(marker_genes = map(sample_data, ~ + .x |> + scran::modelGeneVar(subset.row = genes) |> + scran::getTopHVGs(n = 1000) + )) + +``` + + + +**Specific Differences and Advantages:** + +`tidyverse` neatly handles grouping and plotting within a single chain, using `nest()` or `group_split()` and `map()` for compartmentalized operations, which organizes the workflow into a coherent sequence. + +tidyverse's `map()` is a powerful functional language tool, which can return arbitrary types, such as `map_int`, `map_char`, `map_lgl`.It is integrated into the data manipulation workflow, making it part of the data pipeline. + +#### Multi-parameter filtering + +**Base R approach:** + +```{r, eval=FALSE} +## # Mitochondrial transcription +qc_mitochondrial_transcription <- colData(spatial_data)$subsets_mito_percent > 30 +colData(spatial_data)$qc_mitochondrial_transcription <- qc_mitochondrial_transcription + +# ## Select library size threshold +qc_total_counts <- colData(spatial_data)$sum < 700 +colData(spatial_data)$qc_total_counts <- qc_total_counts + +# ## Select expressed genes threshold +qc_detected_genes <- colData(spatial_data)$detected < 500 +colData(spatial_data)$qc_detected_genes <- qc_detected_genes + +# ## Find combination to filter +colData(spatial_data)$discard <- qc_total_counts | qc_detected_genes | qc_mitochondrial_transcription + +# # Filter +spatial_data = spatial_data[,!colData(spatial_data)$discard ] +``` + +**Tidyverse Approach:** + +```{r} + +spatial_data_filtered = + spatial_data |> + + mutate( + discard = + subsets_mito_percent > 30 | + sum < 700 | + detected < 500 + ) |> + filter(!discard) +``` + +**Specific Differences and Advantages:** + +**Tidyverse:** The code directly applies multiple filtering conditions within a single filter() function, making it highly readable and concise. The conditions are clearly laid out, and the operation directly modifies the spatial_data dataframe. This approach is more intuitive for managing complex filters as it condenses them into a singular functional expression. + +**Base R:** The approach first calculates each condition and stores them within the colData of the dataset. These conditions are then combined to create a logical vector that flags rows to discard. Finally, it subsets the data by removing rows that meet any of the discard conditions. This method is more verbose and requires manually handling intermediate logical vectors, which can introduce errors and complexity in tracking multiple data transformations. + +**Why tidyverse might be better in this context:** + +**Coding efficiency:** `tidyverse` chains operations, reducing the need for intermediate variables and making the code cleaner and less error-prone. + +**Readability:** The filter conditions are all in one place, which simplifies understanding what the code does at a glance, especially for users familiar with the tidyverse syntax. + +**Maintainability:** Fewer and self-explanatory lines of code and no need for intermediate steps make the code easier to maintain and modify, especially when conditions change or additional filters are needed. + + +### 7. Visualisation + +Here, we will show how to use ad-hoc spatial visualisation, as well as `ggplot` to explore spatial data we will show how `tidySpatialExperiment` allowed to alternate between tidyverse visualisation, and any visualisation compatible with `SpatialExperiment`. + +#### Ad-hoc visualisation: Plotting the regions + +Let's visualise the regions that spatialLIBD labelled across three Visium 10X samples. + +```{r, fig.width=7, fig.height=8} +spatial_data_filtered |> + ggspavis::plotSpots(annotate = "spatialLIBD") + + facet_wrap(~sample_id) + + scale_color_manual(values = libd_layer_colors |> str_remove("ayer")) + + theme(legend.position = "none") + + labs(title = "spatialLIBD regions") +``` + +#### Custom visualisation: Plotting the regions + +```{r, fig.width=7, fig.height=8} +spatial_data |> + ggplot(aes(array_row, array_col)) + + geom_point(aes(color = spatialLIBD)) + + facet_wrap(~sample_id) + + coord_fixed() + + theme(legend.position = "none") + + labs(title = "spatialLIBD regions") +``` + +#### Custom visualisation: Plotting RNA output + +Now, let's observe what is the difference in total transcriptional cell output across regions. We can appreciate that different regions of these Visium slide is characterised by significantly different total RNA output. For example, the region one has a low R&D output, on the contrary regions to an L3, characterised by a high RNA output. + +We could conclude that when we use thresholding to filter "low-quality" pixels we have to be careful about possible biological and spatial effects. + +```{r, fig.width=7, fig.height=4} + +spatial_data_filtered |> + ggplot(aes(sum_umi, color = spatialLIBD)) + + geom_density() + + facet_wrap(~sample_id) + + scale_color_manual(values = libd_layer_colors |> str_remove("ayer")) + + scale_x_log10() + + theme_bw() + +``` + +We provide another example of how the use of tidy. Spatial experiment makes custom visualisation, very easy and intuitive, leveraging `ggplot` functionalities. We will observe the relationship between mitochondrial transcription percentage, and total gene counts. We expect this relationship to be inverse as cells with higher mitochondrial transcription percentage tent to have a more limited transcriptional gene pool (e.g. for dieying or damaged cells). + +```{r, fig.width=7, fig.height=8} + +spatial_data_filtered |> + ggplot(aes(subsets_mito_percent, sum_gene)) + + geom_point(aes(color = spatialLIBD), size=0.2) + + stat_ellipse(aes(group = spatialLIBD), alpha = 0.3) + + scale_color_manual(values = libd_layer_colors |> + str_remove("ayer")) + + scale_y_log10() + + theme_bw() + +``` + +Interestingly, if we plot the correlation between these two quantities we observe heterogeneity among regions, with L1 showing a very small association. + + +```{r, fig.width=7, fig.height=8} + +spatial_data_filtered |> + ggplot(aes(subsets_mito_percent, sum_gene)) + + geom_point(aes(color = spatialLIBD), size=0.2) + + scale_color_manual(values = libd_layer_colors |> str_remove("ayer")) + + geom_smooth(method="lm") + + facet_wrap(~spatialLIBD) + + scale_y_log10() + + theme_bw() + +``` + +Let's take a step further and group the correlations according to samples, to see whether different samples show different correlations. + +```{r, fig.width=7, fig.height=8} +spatial_data_filtered |> + ggplot(aes(subsets_mito_percent, sum_gene)) + + geom_point(aes(color = spatialLIBD), size=0.2) + + scale_color_manual(values = libd_layer_colors |> str_remove("ayer")) + + geom_smooth(aes(group = sample_id), method="lm") + + facet_wrap(~spatialLIBD) + + scale_y_log10() + + theme_bw() +``` + +As you can appreciate, the relationship between the number of genes, probed Purcell and their mitochondrial prescription abundance it's quite consistent. + +::: {.note} +**Excercise 2.3 (assisted)** + +To to practice the use of `tidyomics` on spatial data, we propose a few exercises that connect manipulation, calculations and visualisation. These exercises are just meant to be simple use cases that exploit tidy R streamlined language. + +We assume that the cells we filtered as non-alive or damaged, characterised by being enriched uniquely for mitochondrial, genes, and genes, linked to up apoptosis. It is good practice to check these assumption. This exercise aims to estimate what genes are differentially expressed between filtered and unfiltered cells. Then visualise the results. + +Use `tidyomic`/`tidyverse` tools to label dead cells and perform differential expression within each region. Some of the comments you can use are: `mutate`, `nest`, `map`, `aggregate_cells`, `tidybulk:::test_differential_abundance`, + +A hist: + +- spatial_data |> + mutate( + dead = ... + +- Aggregate by sample, dead status, ad annotated region + +- `nest` by annotated region + +- use `map` to test DE +::: + +::: {.note} +**Excercise 2.4** + +Inspired by our audience, let's try to use `tidyomics` to identify potential Amyloid Plaques. + +Amyloid plaques are extracellular deposits primarily composed of aggregated amyloid-beta (Aβ) peptides. They are a hallmark of Alzheimer's disease (AD) and are also found in certain other neurodegenerative conditions. + +Amyloid plaques can be found in the brains of mice, particularly in transgenic mouse models that are engineered to develop Alzheimer's disease-like pathology. +Although amyloid plaques themselves are extracellular, the presence and formation of these plaques are associated with specific gene expression changes in the surrounding and involved cells. These gene markers are indicative of the processes that contribute to amyloid plaque formation, as well as the cellular response to these plaques ([Ranman et al., 2021](https://molecularneurodegeneration.biomedcentral.com/articles/10.1186/s13024-021-00465-0).) + +- join_features() +- mutate() +- ggspavis::plotSpots() +::: + +```{r} +marker_genes_of_amyloid_plaques = c("APP", "PSEN1", "PSEN2", "CLU", "APOE", "CD68", "ITGAM", "AIF1") + +rownames(spatial_data) = rowData(spatial_data)$gene_name + +``` + +The excercise includes + +- Join the features + +- Rescaling + +- Summarising signature (sum), `mutate()` + +- Plotting colousing by the signature +::: + + +**Session Information** + +```{r} +sessionInfo() +``` + +**References** + +```{css echo=FALSE} +.note { + margin: 30px; + padding: 1em; + background: #FFF8F0; + border: 1px solid #EFE8E0; + border-radius: 10px; +} +``` \ No newline at end of file diff --git a/vignettes/Session_3_imaging_assays.Rmd b/vignettes/Session_3_imaging_assays.Rmd new file mode 100644 index 0000000..f6c5819 --- /dev/null +++ b/vignettes/Session_3_imaging_assays.Rmd @@ -0,0 +1,657 @@ +--- +title: "Imaging assays (tidy)" +author: + - Stefano Mangiola, South Australian immunoGENomics Cancer Institute^[], Walter and Eliza Hall Institute^[] + - Luciano Martellotto, Adelaide Centre for Epigenetics, South Australian immunoGENomics Cancer Institute^[] +output: rmarkdown::html_vignette +# bibliography: "`r file.path(system.file(package='tidySpatialWorkshop', 'vignettes'), 'tidyomics.bib')`" +vignette: > + %\VignetteIndexEntry{Imaging assays (tidy)} + %\VignetteEncoding{UTF-8} + %\VignetteEngine{knitr::rmarkdown} +editor_options: + markdown: + wrap: 72 +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = TRUE, cache = FALSE) + +library(here) +``` + +# Session 3 – Spatial analyses of imaging data + +In this session we will learn the basics of imaging-derived spatial transcriptomic data. We will learn how to visualise, manipulate and analyse single molecule data. + +We will maintain the use of `tidyomics` that we learned in `Session 2`. The programming style, in contrast of `Session 1` will make use of the `|>` (pipe) operator. + +### 1. Working with imaging-based data in Bioconductor with MoleculeExperiment + +```{r, message=FALSE, warning=FALSE} +# https://bioconductor.org/packages/devel/data/experiment/vignettes/SubcellularSpatialData/inst/doc/SubcellularSpatialData.html +# BiocManager::install("stemangiola/SubcellularSpatialData") + +# Tidyverse library(tidyverse) +library(ggplot2) +library(plotly) +library(dplyr) +library(tidyr) +library(purrr) +library(glue) # sprintf +library(stringr) +library(forcats) +library(tibble) + +# Plotting +library(colorspace) +library(dittoSeq) +library(ggspavis) +library(RColorBrewer) +library(ggspavis) + +# Analysis +library(scuttle) +library(scater) +library(scran) + +# Data download +library(ExperimentHub) +library(SubcellularSpatialData) + +# Tidyomics +library(tidySingleCellExperiment) +library(tidySummarizedExperiment) +library(tidySpatialExperiment) + +# Niche analysis +library(hoodscanR) +library(scico) + + +``` + +#### SubcellularSpatialData + +This [data package](https://bioconductor.org/packages/release/data/experiment/html/SubcellularSpatialData.html) contains annotated datasets localized at the sub-cellular level from the STOmics, Xenium, and CosMx platforms, as analyzed in the publication by [Bhuva et al., 2025](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-024-03241-7). It includes raw transcript detections and provides functions to convert these into `SpatialExperiment` objects. + +```{r, eval=FALSE} + +# To avoid error for SPE loading +# https://support.bioconductor.org/p/9161859/#9161863 +setClassUnion("ExpData", c("matrix", "SummarizedExperiment")) + +eh = ExperimentHub() +query(eh, "SubcellularSpatialData") + +# Brain Mouse data +tx = eh[["EH8230"]] +tx |> filter(sample_id=="Xenium_V1_FF_Mouse_Brain_MultiSection_1_outs") |> nrow() +# 62,744,602 +``` + +#### An overview of the data + + +```{r, fig.width=7, fig.height=8, eval=FALSE} +tx_small = tx[sample(seq_len(nrow(tx)), size = nrow(tx)/500),] +``` + +```{r, echo=FALSE} +# To avoid error for SPE loading +# https://support.bioconductor.org/p/9161859/#9161863 +setClassUnion("ExpData", c("matrix", "SummarizedExperiment")) + +options(timeout = max(300, getOption("timeout"))) +tx_small_file = tempfile() +utils:: download.file("https://zenodo.org/records/11213118/files/tx_small.rda?download=1", destfile = tx_small_file) +load(tx_small_file) + +tx_small = tx_small |> as_tibble() +``` + +Let's preview the object. The data is contained in a simple data frame. + +```{r} +tx_small |> + head() |> + knitr::kable( + format = "html" +) +``` + +We can appreciate how, even subsampling the data 1 in 500, we still have a vast amount of data to visualise. + +```{r, fig.width=7, fig.height=8} +tx_small |> + ggplot(aes(x, y, colour = region)) + + geom_point(pch = ".") + + facet_wrap(~sample_id, ncol = 2) + + coord_fixed() + + theme_minimal() + + theme(legend.position = "none") +``` + +This dataset have been annotated for regions. Let's have a look how many regions have been annotated + +```{r} +tx_small |> + distinct(region) +``` + +From this large dataset, we select a small reagion for illustrative purposes + +```{r, eval=FALSE} +tx_small_region = + tx |> + filter(x |> between(3700, 4200), y |> between(5000, 5500)) +``` + +Load the pre-saved data + +```{r, echo=FALSE} +tx_small_region_file = tempfile() +utils::download.file("https://zenodo.org/records/11213155/files/tx_small_region.rda?download=1", destfile = tx_small_region_file) +load(tx_small_region_file) +``` + +### 2. MoleculeExperiment + +The R package MoleculeExperiment includes functions to create and manipulate objects from the newly introduced MoleculeExperiment class, designed for analyzing molecule-based spatial transcriptomics data from platforms such as Xenium by 10X, CosMx SMI by Nanostring, and Merscope by Vizgen, among others. + +Although in this session we will not use `MoleculeExperiment` class, because of the lack of segmentation boundary information (we rather have cell identifiers), we briefly introduce this package because as an important part of Bioconductor. + +We show how we would import our table of probe location into a `MoleculeExperiment`. At the end of the Session, for knowledge, we will navigate the example code given in the [vignette material](https://www.bioconductor.org/packages/release/bioc/vignettes/MoleculeExperiment/inst/doc/MoleculeExperiment.html). + +```{r, fig.width=7, fig.height=8} + +library(MoleculeExperiment) + +repoDir = system.file("extdata", package = "MoleculeExperiment") +repoDir = paste0(repoDir, "/xenium_V1_FF_Mouse_Brain") + +me = readXenium(repoDir, keepCols = "essential") +me +``` +In this object, besides the single molecule location, we have cell segmentation boundaries. We can use these boudaries to understand subcellular localisation of molecules and to aggregate molecules in cells. + +```{r, fig.width=7, fig.height=8} +ggplot_me() + + geom_polygon_me(me, assayName = "cell", fill = "#F8DE7E", color="grey") + + geom_point_me(me) + + # zoom in to selected patch area + coord_cartesian( + xlim = c(4900, 4919.98), + ylim = c(6400.02, 6420) + ) +``` + +In this object we don't only have the cell segmentation but the nucleous segmentation as well. + +```{r, fig.width=7, fig.height=8} +boundaries(me, "nucleus") = readBoundaries( + dataDir = repoDir, + pattern = "nucleus_boundaries.csv", + segmentIDCol = "cell_id", + xCol = "vertex_x", + yCol = "vertex_y", + keepCols = "essential", + boundariesAssay = "nucleus", + scaleFactorVector = 1 +) + +boundaries(me, "cell") +showMolecules(me) + +bds_colours = setNames( + c("#aa0000ff", "#ffaaffff"), + c("Region 1", "Region 2") +) + +ggplot_me() + + # add cell segments and colour by cell id + geom_polygon_me(me, byFill = "segment_id", colour = "black", alpha = 0.1) + + # add molecule points and colour by feature name + geom_point_me(me, byColour = "feature_id", size = 0.1) + + # add nuclei segments and colour the border with red + geom_polygon_me(me, assayName = "nucleus", fill = NA, colour = "red") + + # zoom in to selected patch area + coord_cartesian(xlim = c(4900, 4919.98), ylim = c(6400.02, 6420)) +``` + +```{r} +rm(me) +gc() +``` + +We can organise our large data frame containing single molecules into a more efficient `MoleculeExperiment`. + +```{r} +library(MoleculeExperiment) + +tx_small_me = + tx_small |> + select(sample_id, gene, x, y) |> + dataframeToMEList( + dfType = "molecules", + assayName = "detected", + sampleCol = "sample_id", + factorCol = "gene", + xCol = "x", + yCol = "y" + ) |> + MoleculeExperiment() + +tx_small_me +``` + +Here, we can appreciate the difference in size between the redundant data frame + +```{r} +tx_small |> + object.size() |> + format(units = "auto") +``` + +and the `MoleculeExperiment`. + +```{r} +tx_small_me |> + object.size() |> + format(units = "auto") +``` + +```{r} +rm(tx_small) +rm(tx_small_me) +gc() +``` + +#### A preview of a zoomed in section of the tissue + +Now let's try to visualise just a small section. You can appreciate, coloured by cell, single molecules. You cqn also appreciate the difference in density between regions. An aspect to note, is that not all probes are withiin cells. This depends on the segmentation process. + +```{r, fig.width=10, fig.height=10} +brewer.pal(7, "Set1") + +tx_small_region |> + filter(!is.na(cell)) |> + slice_sample(prop = 0.3) |> + ggplot(aes(x, y, colour = factor(cell))) + + geom_point(shape=".") + + + facet_wrap(~sample_id, ncol = 2) + + scale_color_manual(values = sample(colorRampPalette(brewer.pal(8, "Set2"))(1800))) + + coord_fixed() + + theme_minimal() + + theme(legend.position = "none") +``` + + +Let's have a look to the probes that have not being unassigned to cells. + +```{r, fig.width=7, fig.height=8} + +tx_small_region |> + filter(is.na(cell)) |> + ggplot(aes(x, y, colour = factor(cell))) + + geom_point(shape=".") + + + facet_wrap(~sample_id, ncol = 2) + + scale_color_manual(values = sample(colorRampPalette(brewer.pal(8, "Set2"))(1800))) + + coord_fixed() + + theme_minimal() + + theme(legend.position = "none") +``` + +```{r} +rm(tx_small_region) +gc() +``` + +::: {.note} +**Exercise 3.1** + +We want to understand how much data we are discarding, that does not have a cell identity. + +- Using base R grammar calculate what is the ratio of outside-cell vs within-cell, probes +- Reproduce the same calculation with `tidyverse` + +::: + +### 3. Aggregation and analysis + +We will convert our cell by gene count to a `SpatialExperiment`. This object stores a cell by gene matrix with relative XY coordinates. + +`SubcellularSpatialData` has a utility function that aggregated the single molecules in cells, where these cell ID have been identified with segmentation. + + +```{r, eval=FALSE} +tx_spe = SubcellularSpatialData::tx2spe(tx) + + tx_spe = tx_spe |> mutate(in_tissue = TRUE) +``` + +```{r, echo=FALSE} +tx_spe_file = tempfile() +utils::download.file("https://zenodo.org/records/11213166/files/tx_spe.rda?download=1", destfile = tx_spe_file) +# load("~/Downloads/tx_spe.rda") +load(tx_spe_file) +``` + +Keep just the annotated regions. + +```{r} +tx_spe = tx_spe |> filter(!is.na(region)) +``` + +Let have a look to the `SpatialExperiment`. + +```{r} +tx_spe +``` + +A trivial edit to work with `ggspavis.` + +```{r} +tx_spe = tx_spe |> mutate(in_tissue = TRUE) +``` + +Let's have a look to our `SpatialExperiment`. + +```{r} +tx_spe +``` + +Let's have a look at how many cells have been detected for each region + +```{r, fig.width=7, fig.height=8} +tx_spe |> + add_count(region) |> + ggplot(aes(fct_reorder(region, n, .desc = TRUE))) + + geom_bar() + + theme_bw() + + theme(axis.text.x = element_text(angle=90, hjust=1, size = 2)) +``` + +We normalise the `SpatialExperiment` using `scater`. + +```{r} +tx_spe = + tx_spe |> + + # Scaling and tranformation + scater::logNormCounts() +``` + +We then visualise what is the relationship between variance and total expression across cells. + +```{r, fig.width=3, fig.height=2} +tx_spe |> + + # Gene variance + scran::modelGeneVar(block = tx_spe$sample_id) |> + + # Reformat for plotting + as_tibble(rownames = "feature") |> + + # Plot + ggplot(aes(mean, total)) + + geom_point() + + geom_smooth(color="red")+ + xlab("Mean log-expression") + + ylab("Variance") + + theme_bw() + +``` + +For further analysis, we subset the dataset to allow quicker calculations. + +```{r} +tx_spe_sample_1 = + tx_spe |> + filter(sample_id=="1") |> + slice_sample(prop = 0.2) +``` + +As we have done previously, we calculate variable informative genes, for further analyses. + +```{r} +genes <- !grepl(pattern = "NegControl.+|BLANK.+", x = rownames(tx_spe_sample_1)) + +# Get the top 2000 genes. +top.hvgs = + tx_spe_sample_1 |> + + scran::modelGeneVar(subset.row = genes) |> + + # Model gene variance and select variable genes per sample + getTopHVGs(n=200) + +top.hvgs +``` + +The selected subset of genes can then be passed to the subset.row argument (or equivalent) in downstream steps. + +```{r} +tx_spe_sample_1 = + tx_spe_sample_1 |> + fixedPCA( subset.row=top.hvgs ) +``` + +We then use the gene expression to cluster sales based on their similarity and represent these clusters in a two dimensional embeddings (UMAP) + +::: {.note} + +Louvain clustering is a popular method used in single-cell RNA sequencing (scRNA-seq) data analysis to identify groups of cells with similar gene expression profiles. This method is based on the Louvain algorithm, which is widely used for detecting community structures in large networks. + +The Louvain algorithm is designed to maximize a metric known as modularity, which measures the density of edges inside communities compared to edges outside communities. + +It operates in two phases: + +- first, it looks for small communities by optimizing modularity locally, and +- second it aggregates nodes belonging to the same community and repeats the process. + +[Blondel et al., 2008](https://iopscience.iop.org/article/10.1088/1742-5468/2008/10/P10008) + +::: + +```{r} +cluster_labels = + tx_spe_sample_1 |> + scran::clusterCells( + use.dimred="PCA", + BLUSPARAM=bluster::NNGraphParam(k=20, cluster.fun="louvain") + ) |> + as.character() + +cluster_labels |> + head() +``` + +Now we add this cluster column to our `SpatialExperiment` + +```{r} +tx_spe_sample_1 = + tx_spe_sample_1 |> + mutate(clusters = cluster_labels) + +tx_spe_sample_1 |> select(.cell, clusters) +``` + +As we have done before, we caculate UMAPs for visualisation purposes. + +::: {.note} +This step takes long time. +::: + +```{r} +## Check how many +tx_spe_sample_1 = + tx_spe_sample_1 |> + runUMAP() +``` +Now, let's visualise the clusters in UMAP space. + +```{r, fig.width=7, fig.height=8} +tx_spe_sample_1 |> + plotUMAP(colour_by = "clusters") + + scale_color_discrete( + colorRampPalette(brewer.pal(9, "Set1"))(30) + ) +``` + + +::: {.note} +**Exercise 3.2** + +Let's try to understand the identity of these clusters performing gene marker detection. + +In the previous sections we have seen how to do gene marker selection for sequencing-based spatial data. We just have to adapt it to our current scenario. + +- Score the markers (scran::scoreMarkers or tx_spe_sample_1) + +- Filter top markers (filter mean.AUC > 0.8) + +- Focus on Cluster 1 and try to guess the cell type (subset first element in the list, copy and paste the first 5 genes, and quickly look in public resources about what cell type those gene are markers of) + +- Plot the umap colouring by the top marker of cluster 1 (plotReducedDim()) +::: + + +Too understand whether the cell clusters explain morphology as opposed to merely cell identity, we can color cells according to annotated region. As we can see we have a lot of regions. We have more regions that cell clusters. + +```{r, fig.width=7, fig.height=8} + +tx_spe_sample_1 |> + plotUMAP(colour_by = "region") + + scale_color_discrete( + brewer.pal(n = 30, name = "Set1") + ) + + guides(color="none") + +``` + + +Let's try to understand the morphological distribution of cell clusters in space. + +Plot ground truth in tissue map. + +```{r, fig.width=7, fig.height=8} + +tx_spe_sample_1 |> + ggspavis::plotSpots(annotate = "clusters") + + guides(color = "none") + +# For comparison the annotated regions +tx_spe_sample_1 |> + ggspavis::plotSpots(annotate = "region") + + scale_color_manual(values = colorRampPalette( brewer.pal(9,"Set1") )(150) ) + + guides(color = "none") + +``` + +::: {.note} +**Exercise 3.3** + +**Spatial-aware clustering:** Apply the spatial aware clustering method BANKSY. Taking as example the code run for session 2. +::: + + +### 4. Neighborhood analyses + +hoodscanR [Liu et al., 2025](https://www.bioconductor.org/packages/release/bioc/vignettes/hoodscanR/inst/doc/Quick_start.html) + +[Source](https://divingintogeneticsandgenomics.com/post/neighborhood-cellular-niches-analysis-with-spatial-transcriptome-data-in-seurat-and-bioconductor/) + +Algorithm: + +- Nearest cells detection by Approximate Nearest Neighbor (ANN) search algorithm + +- Calculating euclidean distance matrix between cells and their k-nearest neighbors + +- Cell-level annotations provided by users are used to construct a cell annotation matrix + +- Identify cellular neighborhoods uses the SoftMax function, enhanced by a "shape" parameter that governs the "influence radious". This measures probability of a cell type to be found in a neighbour. + +- The K-means clustering algorithm finds recurring neighbours + +In order to perform neighborhood scanning, we need to firstly identify k (in this example, k = 100) nearest cells for each cells. The searching algorithm is based on Approximate Near Neighbor (ANN) C++ library from the RANN package. + +```{r} +tx_spe_neighbours = + tx_spe_sample_1 |> + readHoodData(anno_col = "clusters") |> + findNearCells(k = 100) + +``` + +The output of findNearCells function includes two matrix, an annotation matrix and a distance matrix. + +```{r} +tx_spe_neighbours$cells[1:10, 1:5] + +tx_spe_neighbours$distance[1:10, 1:5] + +``` + +We can then perform neighborhood analysis using the function scanHoods. This function incldue the modified softmax algorithm, aimming to genereate a matrix with the probability of each cell associating with their 100 nearest cells. + +```{r} + # Calculate neighbours +pm <- scanHoods(tx_spe_neighbours$distance) + + # We can then merge the probabilities by the cell types of the 100 nearest cells. We get the probability distribution of each cell all each neighborhood. +hoods <- mergeByGroup(pm, tx_spe_neighbours$cells) + +hoods[1:2, 1:10] +``` + + +We plot randomly plot 50 cells to see the output of neighborhood scanning using plotHoodMat. In this plot, each value represent the probability of the each cell (each row) located in each cell type neighborhood. The rowSums of the probability maxtrix will always be 1. + +```{r, fig.width=7, fig.height=8} +hoods |> + plotHoodMat(n = 50) +``` + +We can then merge the neighborhood results with the `SpatialExperiment` object using `mergeHoodSpe` so that we can conduct more neighborhood-related analysis. + +```{r} +tx_spe_sample_1 = tx_spe_sample_1 |> mergeHoodSpe(hoods) + +tx_spe_sample_1 +``` + +We can see what are the neighborhood distributions look like in each cluster using `plotProbDist.` + +```{r, fig.width=10, fig.height=10} +tx_spe_sample_1 |> + plotProbDist( + pm_cols = colnames(hoods), + by_cluster = TRUE, + plot_all = TRUE, + show_clusters = as.character(seq(10)) + ) +``` + + + + +**Session Information** + +```{r} +sessionInfo() +``` + +**References** + +```{css echo=FALSE} +.note { + margin: 30px; + padding: 1em; + background: #FFF8F0; + border: 1px solid #EFE8E0; + border-radius: 10px; +} +``` diff --git a/vignettes/Solutions.Rmd b/vignettes/Solutions.Rmd new file mode 100644 index 0000000..a3196f5 --- /dev/null +++ b/vignettes/Solutions.Rmd @@ -0,0 +1,490 @@ +--- +title: "Solutions to exercises" +author: + - Stefano Mangiola, South Australian immunoGENomics Cancer Institute^[], Walter and Eliza Hall Institute^[] +output: rmarkdown::html_vignette +# bibliography: "`r file.path(system.file(package='tidySpatialWorkshop', 'vignettes'), 'tidyomics.bib')`" +vignette: > + %\VignetteIndexEntry{Solutions to exercises} + %\VignetteEncoding{UTF-8} + %\VignetteEngine{knitr::rmarkdown} +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = TRUE, cache = FALSE) + +``` + + +::: {.note} +**Exercise 1.1** +::: + +```{r, fig.width=7, fig.height=8, eval=FALSE} +# Label +colData(spatial_data)$macro_cluster = reducedDim(spatial_data, "UMAP")[,"UMAP1"] > -2.5 + +# Verify +scater::plotUMAP(spatial_data, colour_by = "macro_cluster", point_size = 0.2) + +ggspavis::plotVisium( + spatial_data, + annotate = "macro_cluster", + highlight = "in_tissue" +) +``` + + +::: {.note} +**Exercise 1.2** +::: + +```{r, fig.width=7, fig.height=8, eval=FALSE} + +spe_joint <- do.call(cbind, spatial_data_list) + + + ggspavis::plotSpots(spe_joint, annotate = sprintf("%s", "clust_M0_lam0.2_k50_res0.7"), pal = pal) + + facet_wrap(~sample_id) + + theme(legend.position = "none") + + labs(title = "BANKSY clusters") + +``` + +```{r, fig.width=7, fig.height=8, eval=FALSE} + +spe_joint$has_changed = !spe_joint$clust_M0_lam0.2_k50_res0.7 == spe_joint$clust_M0_lam0.2_k50_res0.7_smooth + +plotSpotQC( + spe_joint, + plot_type = "spot", + annotate = "has_changed", +) + + facet_wrap(~sample_id) +``` + + +::: {.note} +**Excercise 1.3** +::: + + +```{r, fig.width=7, fig.height=8, eval=FALSE} + +res_spatialLIBD = split(data.frame(res$mat), colData(spatial_data_gene_name)$sample_id ) + +lapply(res_spatialLIBD, function(x) plotCorrelationMatrix(as.matrix(x[,-10]))) + +``` + +::: {.note} +**Excercise 1.4** +::: + + +```{r, fig.width=7, fig.height=8, eval=FALSE} + +res_spatialLIBD = split(data.frame(res$mat), colData(spatial_data_gene_name)$spatialLIBD ) + +lapply(res_spatialLIBD, function(x) plotCorrelationMatrix(as.matrix(x[,-10]))) +``` + +::: {.note} +**Excercise 1.5** +::: + + +```{r, fig.width=7, fig.height=8, eval=FALSE} + +# 1. Microglia + Neurons +is_microglia_neuron <- mat_df$Microglia > 0.1 & + mat_df$Neurons > 0.1 & + (mat_df$Microglia + mat_df$Neurons) > 0.4 +spatial_data$is_microglia_neuron <- is_microglia_neuron + +ggspavis::plotSpots(spatial_data, annotate = "is_microglia_neuron") + + facet_wrap(~sample_id) + + scale_color_manual(values = c("TRUE" = "red", "FALSE" = "grey")) + + theme(legend.position = "none") + + labs(title = "Microglia + Neurons") + + +# 2. Astrocytes + Stem cells +# note the space in the column name — use backticks +is_astrocyte_stem <- mat_df$Astrocytes > 0.1 & + mat_df$`Stem cells` > 0.1 & + (mat_df$Astrocytes + mat_df$`Stem cells`) > 0.4 +spatial_data$is_astrocyte_stem <- is_astrocyte_stem + +ggspavis::plotSpots(spatial_data, annotate = "is_astrocyte_stem") + + facet_wrap(~sample_id) + + scale_color_manual(values = c("TRUE" = "blue", "FALSE" = "grey")) + + theme(legend.position = "none") + + labs(title = "Astrocytes + Stem cells") +``` + +**Excercise 1.6** +::: + +```{r, message=FALSE, warning=FALSE, fig.width=3, fig.height=3, eval=FALSE} +# Get reference +library(cellNexus) +library(HDF5Array) + +tmp_file_path = tempfile() + +DelayedArray::setAutoBlockSize(size=1e9) + +brain_reference = + + # Query metadata across 30M cells + get_metadata() |> + + # Filter your data of interest + dplyr::filter(tissue=="dorsolateral prefrontal cortex", disease == "normal") |> + + # Collect pseudobulk as SummarizedExperiment + get_pseudobulk() |> + + # Normalise for Spotlight + scuttle::logNormCounts() |> + + # Save for fast reading + HDF5Array::saveHDF5SummarizedExperiment(tmp_file_path, replace = TRUE, verbose = TRUE) +``` + +```{r, message=FALSE, eval=FALSE} +library(HDF5Array) + +brain_reference = HDF5Array::loadHDF5SummarizedExperiment(tmp_file_path) + +my_metadata = colData(brain_reference) + +knitr::kable(head(my_metadata), format = "html") +``` + +These are the cell types included in our reference, and the number of pseudobulk samples we have for each cell type. + +```{r, eval=FALSE} + +table(brain_reference$cell_types) + +``` + +These are the number of samples we have for each of the three data sets. + +```{r, eval=FALSE} + +table(brain_reference$dataset_id) +``` + +The `collection_id` can be used to gather information on the cell database. e.g. + +```{r, eval=FALSE} +table(brain_reference$collection_id) +``` + + + +::: {.note} +**Exercise 2.1** +::: + +```{r, eval=FALSE} +# Get top variable genes +genes <- !grepl(pattern = "^Rp[l|s]|Mt", x = rownames(spatial_data)) +hvg = + spatial_data |> + scran::modelGeneVar(subset.row = genes, block = spatial_data$sample_id) |> + scran::getTopHVGs(n = 1000) + +# Calculate PCA +spatial_data |> + + scuttle::logNormCounts() |> + scater::runPCA(subset_row = hvg) |> + + # Calculate UMAP + scater::runUMAP(dimred = "PCA") |> + + # Filter one sample + filter(in_tissue, sample_id=="151673") |> + + # Gate based on tissue morphology + tidySpatialExperiment::gate_spatial(alpha = 0.1) |> + + # Plot + scater::plotUMAP(colour_by = ".gate") +``` + + +::: {.note} +**Exercise 2.2** +::: + + +```{r, eval=FALSE} +rowData(spatial_data) |> + as_tibble() |> + filter( gene_name == "PECAM1") + +gene = "ENSG00000261371" + +spatial_data |> + + # Join the feature + join_features(gene, shape="wide") |> + + # Calculate the quantile + mutate(my_quantile = quantile(ENSG00000261371, 0.75)) |> + + # Label the pixels + mutate(PECAM1_positive = ENSG00000261371 > my_quantile) |> + + # Plot + ggspavis::plotSpots(annotate = "PECAM1_positive") + + facet_wrap(~sample_id) + +``` + + +::: {.note} +**Excercise 2.3** +::: + +```{r, eval=FALSE} +library(tidySummarizedExperiment) +library(tidybulk) + +differential_analysis = + spatial_data |> + mutate( + dead = + + # Stringent threshold + subsets_mito_percent > 20 | + sum < 700 | + detected < 500 + ) |> + aggregate_cells(c(sample_id, spatialLIBD, dead)) |> + keep_abundant(factor_of_interest = c(dead)) |> + nest(data = - spatialLIBD) |> + + # filter regions having both alive and dead cells + filter( map_int(data, ~ .x |> distinct(sample_id, dead) |> nrow() ) == 6 ) |> + + # DE analyses + mutate(data = map( + data, + test_differential_abundance, + ~ dead + sample_id, + method = "edgeR_quasi_likelihood", + test_above_log2_fold_change = log(2) + )) + +differential_analysis |> + mutate(data = map(data, pivot_transcript)) |> + unnest(data) |> + filter(FDR<0.05) +``` + + +::: {.note} +**Excercise 2.4** +::: + +```{r, eval=FALSE} +rownames(spatial_data_filtered) = rowData(spatial_data_filtered)$gene_name + +marker_genes_of_amyloid_plaques = c("APP", "PSEN1", "PSEN2", "CLU", "APOE", "CD68", "ITGAM", "AIF1") + +spatial_data_filtered |> + +# Join the features + join_features(marker_genes_of_amyloid_plaques, shape = "wide") |> + + # Rescaling + mutate(across(any_of(marker_genes_of_amyloid_plaques), scales::rescale)) |> + +# Summarising signature + mutate(amyloid_plaques_signature = APP + PSEN1 + PSEN2 + CLU + APOE + CD68 + ITGAM + AIF1) |> + +# Plotting + ggspavis::plotSpots( + annotate = "amyloid_plaques_signature" + ) + + facet_wrap(~sample_id) + +``` + + +::: {.note} +**Exercise 3.2** +::: + +```{r, eval=FALSE} +mgs <- scran::scoreMarkers( + tx_spe_sample_1, + groups = tx_spe_sample_1$clusters, + + # Omit mitochondrial genes and keep all the genes in spatial + subset.row = + grep("NegControl.+|BLANK.+", rownames(tx_spe_sample_1), value=TRUE, invert=TRUE) +) + +# Select the most informative markers +mgs_df <- lapply(names(mgs), function(i) { + x <- mgs[[i]] + + # Filter and keep relevant marker genes, those with AUC > 0.8 + x <- x[x$mean.AUC > 0.8, ] + + # Sort the genes from highest to lowest weight + x <- x[order(x$mean.AUC, decreasing = TRUE), ] + + # Add gene and cluster id to the dataframe + x$gene <- rownames(x) + x$cluster <- i + data.frame(x) +}) + +head(mgs_df[[1]]) + +tx_spe_sample_1 |> + plotReducedDim("UMAP", colour_by="Gjc3") + +``` + + + + + + +::: {.note} +**Exercise 3.3** +::: + +```{r, eval=FALSE, message=FALSE, warning=FALSE} +library(Banksy) + +# scale the counts, without log transformation +tx_spe_sample_1 = tx_spe_sample_1 |> logNormCounts(log=FALSE, name = "normcounts") +``` + +**Highly-variable genes** + +The Banksy documentation, suggest the use of `Seurat` for the detection of highly variable genes. + +```{r, message=FALSE, warning=FALSE, eval=FALSE} +library(Seurat) + +"NegControl.+|BLANK.+" + +tx_spe_sample_1_seu = + as.Seurat(tx_spe_sample_1, data = NULL) + +genes <- !grepl(pattern = "NegControl.+|BLANK.+", x = rownames(tx_spe_sample_1_seu)) + + +tx_spe_sample_1_seu = tx_spe_sample_1_seu[genes,] + +tx_spe_sample_1_seu = tx_spe_sample_1_seu |> + NormalizeData(scale.factor = 5000, normalization.method = 'RC') + + + +# Compute HVGs +hvgs <- VariableFeatures(FindVariableFeatures(tx_spe_sample_1_seu, nfeatures = 2000)) + + +rm(tx_spe_sample_1_seu) + +head(hvgs) +``` + +We now split the data by sample, to compute the neighbourhood matrices. + +```{r, message=FALSE, warning=FALSE, eval=FALSE} +# Convert to list +tx_spe_sample_1_banksy = tx_spe_sample_1[hvgs,] + +# Compute the component neighborhood matrices +tx_spe_sample_1_banksy = tx_spe_sample_1_banksy |> + computeBanksy( + assay_name = "normcounts" + ) + +``` + +Here, we perform PCA using the BANKSY algorithm on the joint dataset. The group argument specifies how to treat different samples, ensuring that features are scaled separately per sample group to account for variations among them. + +```{r, eval=FALSE, message=FALSE, warning=FALSE} +tx_spe_sample_1_banksy <- runBanksyPCA( # Run PCA on the Banskly matrix + tx_spe_sample_1_banksy, + lambda = 0.2, # spatial weighting parameter. Larger values (e.g. 0.8) incorporate more spatial neighborhood + seed = 42 +) +``` + +Once the dimensional reduction is complete, we cluster the spots across all samples and use `connectClusters` to visually compare these new BANKSY clusters against manual annotations. + +```{r, eval=FALSE, message=FALSE, warning=FALSE} +tx_spe_sample_1_banksy <- clusterBanksy( # clustering on the principal components computed on the BANKSY matrix + tx_spe_sample_1_banksy, + lambda = 0.2, # spatial weighting parameter. Larger values (e.g. 0.8) incorporate more spatial neighborhood + resolution = 0.7, # numeric vector specifying resolution used for clustering (louvain / leiden). + seed = 42 +) +``` + +As an optional step, we smooth the cluster labels for each sample independently, which can enhance the visual coherence of clusters, especially in heterogeneous biological samples. + +From SpiceMix paper [Chidester et al., 2023](https://www.nature.com/articles/s41588-022-01256-z) + +```{r, eval=FALSE, message=FALSE, warning=FALSE} +tx_spe_sample_1_banksy_smoothed = tx_spe_sample_1_banksy |> + smoothLabels( + cluster_names = "clust_M0_lam0.2_k50_res0.7", + k = 6L, + verbose = FALSE +) + +``` + +The raw and smoothed cluster labels are stored in the `colData` slot of each `SingleCellExperiment` or `SpatialExperiment` object. + +Visualising Clusters and Annotations on Spatial Coordinates: We utilise the ggspavis package to visually map BANKSY clusters and manual annotations onto the spatial coordinates of the dataset, providing a comprehensive visual overview of spatial and clustering data relationships. + +```{r multi-sample-spatial, eval=TRUE, fig.width=3, fig.height=3, eval=FALSE} +# Use scater:::.get_palette('tableau10medium') + + ggspavis::plotSpots( + tx_spe_sample_1_banksy, + annotate = "clust_M0_lam0.2_k50_res0.7" + ) + + guides(color = "none") + + labs(title = "BANKSY clusters") + + ggspavis::plotSpots( + tx_spe_sample_1_banksy_smoothed, + annotate = "clust_M0_lam0.2_k50_res0.7_smooth" + ) + + guides(color = "none") + + labs(title = "BANKSY clusters smoothed") + +# ggspavis::plotSpots( +# tx_spe_sample_1_banksy, +# annotate = "clusters" +# ) + +# guides(color = "none") + +# labs(title = "cell clustering") + +ggspavis::plotSpots( +tx_spe_sample_1_banksy, +annotate = "region" +) + + scale_color_manual(values = colorRampPalette( brewer.pal(9,"Set1") )(150) ) + + guides(color = "none") + + labs(title = "ground truth") + +``` \ No newline at end of file diff --git a/vignettes/tidyGenomicsTranscriptomics.Rmd b/vignettes/tidyGenomicsTranscriptomics.Rmd deleted file mode 100644 index 9dce859..0000000 --- a/vignettes/tidyGenomicsTranscriptomics.Rmd +++ /dev/null @@ -1,778 +0,0 @@ ---- -title: "Tidy genomic and transcriptomic single-cell analyses" -author: - - Maria Doyle, Peter MacCallum Cancer Centre^[] - - Stefano Mangiola, Walter and Eliza Hall Institute^[] - - Michael Love, UNC-Chapel Hill^[] -output: rmarkdown::html_vignette -bibliography: "`r file.path(system.file(package='tidyomicsWorkshopBioc2023', 'vignettes'), 'tidyomics.bib')`" -vignette: > - %\VignetteIndexEntry{Tidy genomic and transcriptomic single-cell analyses} - %\VignetteEngine{knitr::rmarkdown} - %\VignetteEncoding{UTF-8} ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = TRUE, cache = FALSE) -``` - -## Instructors - -*Dr. Stefano Mangiola* is currently a Postdoctoral researcher in the -laboratory of Prof. Tony Papenfuss at the Walter and Eliza Hall -Institute in Melbourne, Australia. His background spans from -biotechnology to bioinformatics and biostatistics. His research -focuses on prostate and breast tumour microenvironment, the -development of statistical models for the analysis of RNA sequencing -data, and data analysis and visualisation interfaces. - -*Dr. Michael Love* is an Associate Professor at UNC-Chapel Hill in the -Department of Genetics and the Department of Biostatistics. His -background is in Statistics and Computational Biology. His research -is highly collaborative, working with Geneticists, Molecular -Biologists, Epidemiologists, and Computer Scientists on methods for -transcriptomics, epigenetics, GWAS, and high-throughput functional -screens. - -## Workshop goals and objectives - -### What you will learn - -- Basic `tidy` operations possible with `tidySingleCellExperiment` - and `GRanges` -- How to interface `SingleCellExperiment` with tidy manipulation and - visualisation -- A real-world case study that will showcase the power of `tidy` - single-cell methods compared with base/ad-hoc methods -- Examples of how to integrate genomic and transcriptomic data - (ChIP-seq and RNA-seq) - -### What you will *not* learn - -- The molecular technology of single-cell sequencing -- The fundamentals of single-cell data analysis -- The fundamentals of tidy data analysis -- Detailed data integration methods (multi-view or multi-omics - algorithms) - -## Getting started - -### Local - -We will use the Cloud during the workshop and this method is available -if you want to run the material after the workshop. If you want to -install on your own computer, see instructions -[here](https://tidyomics.github.io/tidyomicsWorkshopBioc2023/index.html#workshop-package-installation). - -Alternatively, you can view the material at the workshop webpage -[here](https://tidyomics.github.io/tidyomicsWorkshopBioc2023/articles/main.html). - -## Introduction slides - -We provide two links to introductory material (to consult outside of -the workshop slot): - -1. [Introduction to tidytranscriptomics](https://docs.google.com/gview?url=https://raw.githubusercontent.com/tidyomics/tidyomicsWorkshopBioc2023/master/inst/tidytranscriptomics_slides.pdf) -2. [Introduction to tidy genomics](https://github.com/tidyomics/tidy-genomics-talk/blob/main/tidy-genomics-talk.pdf) - -# Part 1 Introduction to tidySingleCellExperiment - -```{r message = FALSE} -# Load packages -library(SingleCellExperiment) -library(ggplot2) -library(plotly) -library(dplyr) -library(colorspace) -library(dittoSeq) -``` - -SingleCellExperiment is a popular data object in the Bioconductor -ecosystem for representing single-cell datasets, which works -seamlessly with numerous Bioconductor packages and methods written by -different groups [@Amezquita2020]. It can easily be converted to and -from other formats such as SeuratObject and scverse's AnnData. - -Here we load single-cell data in SingleCellExperiment object -format. This data is peripheral blood mononuclear cells (PBMCs) from -metastatic breast cancer patients. - -```{r} -# load single cell RNA sequencing data -sce_obj <- tidyomicsWorkshopBioc2023::sce_obj - -# take a look -sce_obj -``` - -tidySingleCellExperiment provides a bridge between the -SingleCellExperiment single-cell package and the tidyverse -[@wickham2019welcome]. It creates an invisible layer that enables -viewing the SingleCellExperiment object as a tidyverse tibble, and -provides SingleCellExperiment-compatible *dplyr*, *tidyr*, *ggplot* -and *plotly* functions. - -For related work for bulk RNA-seq, see the -[tidybulk](https://stemangiola.github.io/tidybulk/) package [@Mangiola2021]. - -If we load the *tidySingleCellExperiment* package and then view the -single cell data, it now displays as a tibble. - -```{r message = FALSE} -library(tidySingleCellExperiment) - -sce_obj -``` - -If we want to revert to the standard SingleCellExperiment view we can do that. - -```{r} -options("restore_SingleCellExperiment_show" = TRUE) -sce_obj -``` - -If we want to revert back to tidy SingleCellExperiment view we can. - -```{r} -options("restore_SingleCellExperiment_show" = FALSE) -sce_obj -``` - -It can be interacted with using -[SingleCellExperiment commands](https://bioconductor.org/packages/devel/bioc/vignettes/SingleCellExperiment/inst/doc/intro.html) -such as `assays`. - -```{r} -assays(sce_obj) -``` - -We can also interact with our object as we do with any tidyverse tibble. - -## Tidyverse commands - -We can use tidyverse commands, such as `filter`, `select` and `mutate` -to explore the tidySingleCellExperiment object. Some examples are -shown below and more can be seen at the tidySingleCellExperiment -website -[here](https://stemangiola.github.io/tidySingleCellExperiment/articles/introduction.html#tidyverse-commands-1). - -We can use `filter` to choose rows, for example, to see just the rows for the cells in G1 cell-cycle stage. - -```{r} -sce_obj |> filter(Phase == "G1") -``` - -:::: {.note} -Note that *rows* in this context refers to rows of the abstraction, -not *rows* of the SingleCellExperiment which correspond to -genes. *tidySingleCellExperiment* prioritizes cells as the units of -observation in the abstraction, while the full dataset, including -measurements of expression of all genes, is still available "in the -background". -:::: - -We can use `select` to view columns, for example, to see the filename, -total cellular RNA abundance and cell phase. - -If we use `select` we will also get any view-only columns returned, -such as the UMAP columns generated during the preprocessing. - -```{r} -sce_obj |> select(.cell, file, nCount_RNA, Phase) -``` - -We can use `mutate` to create a column. For example, we could create a -new `Phase_l` column that contains a lower-case version of `Phase`. - -```{r message=FALSE} -sce_obj |> - mutate(Phase_l = tolower(Phase)) |> - select(.cell, Phase, Phase_l) -``` - -We can use tidyverse commands to polish an annotation column. We will -extract the sample, and group information from the file name column -into separate columns. - -```{r message=FALSE} -# First take a look at the file column -sce_obj |> select(.cell, file) -``` - -```{r} -# Create column for sample -sce_obj <- sce_obj |> - # Extract sample - extract(file, "sample", "../data/.*/([a-zA-Z0-9_-]+)/outs.+", remove = FALSE) - -# Take a look -sce_obj |> select(.cell, sample, everything()) -``` - -We could use tidyverse `unite` to combine columns, for example to -create a new column for sample id combining the sample and patient id -(BCB) columns. - -```{r message=FALSE} -sce_obj <- sce_obj |> unite("sample_id", sample, BCB, remove = FALSE) - -# Take a look -sce_obj |> select(.cell, sample_id, sample, BCB) -``` - - -# Part 2 Signature visualisation - -## Data pre-processing - -The object `sce_obj` we've been using was created as part of a study -on breast cancer systemic immune response. Peripheral blood -mononuclear cells have been sequenced for RNA at the single-cell -level. The steps used to generate the object are summarised below. - -- `scran`, `scater`, and `DropletsUtils` packages have been used to - eliminate empty droplets and dead cells. Samples were individually - quality checked and cells were filtered for good gene coverage . - -- Variable features were identified using `modelGeneVar`. - -- Read counts were scaled and normalised using logNormCounts from `scuttle`. - -- Data integration was performed using `fastMNN` with default parameters. - -- PCA performed to reduce feature dimensionality. - -- Nearest-neighbor cell networks were calculated using 30 principal components. - -- 2 UMAP dimensions were calculated using 30 principal components. - -- Cells with similar transcriptome profiles were grouped into clusters using Louvain clustering from `scran`. - -## Analyse custom signature - -The researcher analysing this dataset wanted to identify gamma delta T -cells using a gene signature from a published paper -[@Pizzolato2019]. We'll show how that can be done here. - -With tidySingleCellExperiment's `join_features` we can view the counts -for genes in the signature as columns joined to our single cell -tibble. - -```{r} -sce_obj |> - join_features(c("CD3D", "TRDC", "TRGC1", "TRGC2", "CD8A", "CD8B"), shape = "wide") -``` - -We can use tidyverse `mutate` to create a column containing the -signature score. To generate the score, we scale the sum of the 4 -genes, CD3D, TRDC, TRGC1, TRGC2, and subtract the scaled sum of the 2 -genes, CD8A and CD8B. `mutate` is powerful in enabling us to perform -complex arithmetic operations easily. - -```{r} -sce_scored <- sce_obj |> - - join_features(c("CD3D", "TRDC", "TRGC1", "TRGC2", "CD8A", "CD8B"), shape = "wide") |> - - mutate( - signature_score = - scales::rescale(CD3D + TRDC + TRGC1 + TRGC2, to = c(0, 1)) - - scales::rescale(CD8A + CD8B, to = c(0, 1)) - ) - -sce_scored |> - - select(.cell, signature_score, everything()) -``` - -The gamma delta T cells could then be visualised by the signature -score using Bioconductor's visualisation functions. - -```{r} -sce_scored |> - - scater::plotUMAP(colour_by = "signature_score") -``` - -The cells could also be visualised using the popular and powerful -`ggplot2` package, enabling the researcher to use ggplot functions -they were familiar with, and to customise the plot with great -flexibility. - -```{r} -sce_scored |> - - # plot cells with high score last so they're not obscured by other cells - arrange(signature_score) |> - - ggplot(aes(UMAP_1, UMAP_2, color = signature_score)) + - geom_point() + - scale_color_distiller(palette = "Spectral") + - tidyomicsWorkshopBioc2023::theme_multipanel -``` - -For exploratory analyses, we can select the gamma delta T cells, the -red cluster on the left with high signature score. We'll filter for -cells with a signature score > 0.7. - -```{r} -sce_obj_gamma_delta <- - - sce_scored |> - - # Proper cluster selection should be used instead (see supplementary material) - filter(signature_score > 0.7) -``` - -For comparison, we show the alternative using base R and -SingleCellExperiment. Note that the code contains more redundancy and -intermediate objects. - -```{r eval=FALSE} -counts_positive <- - assay(sce_obj, "logcounts")[c("CD3D", "TRDC", "TRGC1", "TRGC2"), ] |> - colSums() |> - scales::rescale(to = c(0, 1)) - -counts_negative <- - assay(sce_obj, "logcounts")[c("CD8A", "CD8B"), ] |> - colSums() |> - scales::rescale(to = c(0, 1)) - -sce_obj$signature_score <- counts_positive - counts_negative - -sce_obj_gamma_delta <- sce_obj[, sce_obj$signature_score > 0.7] -``` - -We can then focus on just these gamma delta T cells and chain -Bioconductor and tidyverse commands together to analyse. - -```{r warning=FALSE, message=FALSE} -library(batchelor) -library(scater) - -sce_obj_gamma_delta <- - - sce_obj_gamma_delta |> - - # Integrate - using batchelor. - multiBatchNorm(batch = colData(sce_obj_gamma_delta)$sample) |> - fastMNN(batch = colData(sce_obj_gamma_delta)$sample) |> - - # Join metadata removed by fastMNN - using tidyverse - left_join(as_tibble(sce_obj_gamma_delta)) |> - - # Dimension reduction - using scater - runUMAP(ncomponents = 2, dimred = "corrected") -``` - -Visualise gamma delta T cells. As we have used rough threshold we are -left with only few cells. Proper cluster selection should be used -instead (see supplementary material). - -```{r} -sce_obj_gamma_delta |> plotUMAP() -``` - - -It is also possible to visualise the cells as a 3D plot using plotly. -The example data used here only contains a few genes, for the sake of -time and size in this demonstration, but below is how you could -generate the 3 dimensions needed for 3D plot with a full dataset. - -```{r eval = FALSE} -single_cell_object |> - RunUMAP(dims = 1:30, n.components = 3L, spread = 0.5, min.dist = 0.01, n.neighbors = 10L) -``` - -We'll demonstrate creating a 3D plot using some data that has 3 UMAP -dimensions. This is a fantastic way to visualise both reduced -dimensions and metadata in the same representation. - -```{r umap plot 2, message = FALSE, warning = FALSE} -pbmc <- tidyomicsWorkshopBioc2023::sce_obj_UMAP3 - -pbmc |> - plot_ly( - x = ~`UMAP_1`, - y = ~`UMAP_2`, - z = ~`UMAP_3`, - color = ~cell_type, - colors = dittoSeq::dittoColors() - ) %>% - add_markers(size = I(1)) -``` - -## Exercises - -Using the `sce_obj`: - -1. What proportion of all cells are gamma-delta T cells? Use - signature_score > 0.7 to identify gamma-delta T cells. - -2. There is a cluster of cells characterised by a low RNA output - (nCount_RNA < 100). Identify the cell composition (cell_type) of - that cluster. - -# Part 3 Genomic and transcriptomic data integration - -So far we have examined expression of genes across our cells, but we -haven't considered the genomic location of the genes. In this section -we will operate on genomic location information using the *plyranges* -[@Lee2019] package, and tie this to the single cell transcriptomics -data we have been examining. - -*plyranges* provides tidyverse-style functionality to genomic ranges -(GRanges objects [@Lawrence2013]) analogous to how -*tidySingleCellExperiment* provides functionality for -SingleCellExperiment objects. For an example of a workflow using -*plyranges* as part of a bulk RNA-seq analysis, see @Lee2020. - -:::: {.note} -In some pipelines (e.g. using *tximeta* [@Love2020] to import bulk or -single-cell quantification data) our SingleCellExperiment or -SummarizedExperiment would already have genomic range information -attached to the rows of the object. That is, we would have information -about the starts and ends of the genes, their strand information, even -the lengths of the chromosomes and the genome build, etc. -:::: - -Before we start our integrative analysis, we will first take some -steps to add hg38 range information on our genes, matching by the gene -symbol provided on the rows of `sce_obj`. We add genes from the -*ensembldb* package [@Rainer2019] (to see how to add a particular -version of Ensembl, see the package vignette). - -```{r message=FALSE} -# what our rownames look like: gene symbols -sce_obj |> rownames() |> head() - -# we recommend Ensembl or GENCODE gene annotations -edb <- EnsDb.Hsapiens.v86::EnsDb.Hsapiens.v86 # hg38 -g <- ensembldb::genes(edb) -head(genome(g)) -``` - -We can examine the first three genes and their metadata: - -```{r message=FALSE} -library(plyranges) -g |> slice(1:3) -``` - -Our first task is to subset `g`, the human genes, to the ones in our -SingleCellExperiment. While Ensembl IDs are unique, gene symbols are -not, so we will have to also remove any duplicate gene symbols -(recommend working with Ensembl IDs as feature identifiers when -possible). - -We subset the columns of `g` to just the `symbol` column, using the -`select()` function. *plyranges* enables us to use familiar tidy verbs -on GRanges objects, e.g. to `filter` rows or to `select` columns of -metadata attached to the ranges. - -```{r} -g <- g |> - - select(symbol) - -g |> slice(1:3) -``` - -Now we filter the genes of our SingleCellExperiment to only those -present as rownames of `sce_obj`, and remove duplicate IDs. We then -sort by genomic location, and use the gene symbols as names of the -ranges. - -```{r} -gene_names <- sce_obj |> rownames() - -g <- g |> - - filter(symbol %in% gene_names) |> - filter(!duplicated(symbol)) |> - sort() # genomic position sorting - -names(g) <- g$symbol -``` - -Now we can add our gene information to the rows of the -`sce_obj`. - -:::: {.note} -Again, these following steps would not be necessary using a -pipeline that imports quantification data to ranged objects, but it -isn't too hard to add manually. If adding ranges manually, make note -of the provenance of where you downloaded the information (Ensembl, -GENCODE or otherwise), and assign a genome build if you plan on -sharing the final object (e.g. with `genome(x) <- "..."`). -:::: - -```{r} -all(names(g) %in% rownames(sce_obj)) -sce_sub <- sce_obj[ names(g), ] # put SCE in order of `g` -rowRanges(sce_sub) <- g # assign ranges `g` to the SCE -sce_sub |> rowRanges() # peek at the ranges -``` - -Now let's do something interesting with the gene ranges: let's see if -genes near peaks of active chromatin marks (H3K4me3 measured with -ChIP-seq) in another experiment involving PBMC have a difference in -their expression level compared to other genes. - -There are many sources of epigenetic tracks, ENCODE, Roadmap, -etc. Here we will use some ENCODE [@ENCODE2012] data available in -*AnnotationHub* on Bioconductor. - -:::: {.note} -We had to do a bit of book-keeping first. These peaks were in -the hg19 genome build, so we have ran the following commands (here -un-evaluated) to 'lift" the peaks to the hg38 genome build, and -provide proper sequence information. -:::: - -```{r eval=FALSE} -### un-evaluated code chunk ### -# AnnotationHub contains many useful genome tracks -library(AnnotationHub) -ah <- AnnotationHub() -# can be queried with keywords -query(ah, c("Peripheral_Blood","h3k4me3")) -# downloading a particular resource -peaks <- ah[["AH44823"]] -library(rtracklayer) -# lifting hg19 to hg38 -new_peaks <- unlist( - liftOver( - peaks, - chain = import.chain("~/Downloads/hg19ToHg38.over.chain") - ) -) -# Ensembl-style chrom names -seqlevelsStyle(new_peaks) <- "NCBI" -# bring over any missing chroms -seqlevels(new_peaks) <- seqlevels(sce_sub) -# bring over chrom lengths -seqinfo(new_peaks) <- seqinfo(sce_sub) -# subset to a few columns -pbmc_h3k4me3_hg38 <- new_peaks |> - select(signalValue, qValue, peak) -# save for reloading below -save(pbmc_h3k4me3_hg38, file="data/pbmc_h3k4me3_hg38.rda", compress="xz") -``` - -Loading those peaks. - -```{r} -# loading those H3K4me3 peaks for PBMCs -peaks <- tidyomicsWorkshopBioc2023::pbmc_h3k4me3_hg38 -plot(peaks$qValue, type="l", ylab="q-value", main="H3K4me3 peaks") -abline(v=5000, lty=2) -``` - -We will arbitrarily chose to take the top 5,000 peaks by p-value. How -to chose the number of epigenetic peaks and genes to consider in a -multi-omics analysis is a more involved question, and is worth -considering the impact of such a choice for enrichment analysis. - -```{r} -peaks <- peaks |> - - slice(1:5000) |> # ordered already by p-value (most to least sig) - sort() # genomic position sorting -``` - -It is easy to compute the distance of the genes in our -SingleCellExperiment to the nearest H3K4me3 peak, using *plyranges* -convenience functions. - -See the -[plyranges package website](https://sa-lee.github.io/plyranges/reference/index.html) -for a full listing of such functions. - -```{r} -dists <- rowRanges(sce_sub) |> - - anchor_5p() |> # anchor 5 prime end - mutate(width=1) |> # shrink range to 1bp - add_nearest_distance(peaks) - -hist(log10(dists$distance + 1), breaks=40, - main="gene-to-peak distance", xlab="log10 distance") -``` - -Let's put the genes into 4 different bins by their distance to a -H3K4me3 peak: 1) overlapping [distance of 0], 2) 1bp-10kb, -3) 10kb-100kb, or 4) farther than 100kb. - -```{r} -bins <- c(0,1,1e4,1e5,Inf) - -dists <- dists |> - - select(symbol, distance, .drop_ranges = TRUE) |> # remove chr/pos/strand etc. - as_tibble() |> - mutate(distance_bin = cut(distance, bins, include.lowest = TRUE)) |> - rename(feature = symbol) # this will help us add information to the SCE -``` - -We can see how many genes are in which category: - -```{r} -dists |> - - ggplot(aes(distance_bin)) + - geom_bar() -``` - -We will now take the SingleCellExperiment data and compress it down to -a SummarizedExperiment using the `aggregate_cells` function from -*tidySingleCellExperiment*. After this function, every column of the -SummarizedExperiment is a cell type. - -We immediately pipe the SummarizedExperiment into a `left_join` where -we add the distance-to-peak information, and then into a `nest` -command where we create a nested (tidy)SummarizedExperiment object, -where we have grouped by the distance bin that the genes fall into. - - -```{r message=FALSE} -library(tidySummarizedExperiment) -library(purrr) -nested <- sce_sub |> - - aggregate_cells(cell_type) |> - left_join(dists, by="feature") |> - nest(se = -distance_bin) - -nested -``` - -We can now operate on the SummarizedExperiment objects that are within -`nested`. For example, we can extract the column means of the counts -(cell-type-specific means of counts over genes). - -We save this as a new tibble called `smry` as it contains summary -information. - -```{r} -smry <- nested |> - - mutate(summary = map(se, \(se) { - tibble(count_mean = colMeans(assay(se)), - cell_type = se$cell_type) - }) - ) |> - select(-se) |> - unnest(summary) |> - mutate(cell = substr(cell_type, 1, 13)) # abbreviate cell_type for plot -``` - -As a final plot, we can look at the mean count over the genes in a -particular distance bin (x-axis), across the cell types (y-axis). The -different lines show how the genes' proximity to H3K4me3 peaks affects -the average count. - -```{r} -library(forcats) -smry |> - - mutate(cell = fct_reorder(cell, count_mean, .fun=median)) |> - ggplot(aes(count_mean, cell, color=distance_bin, group=distance_bin)) + - geom_point() + - geom_line(orientation="y") + - xlab("mean of count over genes in bin") + - ylab("cell type") -``` - -We finish this section with a re-cap of the steps we took: - -1. calculated distances from genes to H3K4me3 peaks -2. added distance information onto the SingleCellExperiment -3. condensed the dataset via pseudobulking -4. computed the mean count (over genes) within 4 bins of genes -5. visualized these mean counts over cell types - -Let's consider a number of issues with this first-pass analysis and -visualization: - -1. we arbitrarily thresholded the peaks at the beginning of the - analysis to take the top 5,000 -2. we just computed the mean count, not taking into account total - reads per cell type (sequencing depth per cell x number of cells) -3. we are comparing cell-type-specific expression to aggregate - ChIP-seq peaks in PBMC -4. the two experiments, while both PBMC, come from different projects, - different labs, and one is from cancer patients, while the other is - from the ENCODE project -5. we only plot the mean count over genes, perhaps it would be better - to show more information about the distribution - -:::: {.note} -As an alternative to (1), we could have counted the number of peaks -that were near each genes, or even computed on the metadata of the -peaks (signal value, etc.). An example follows of how we could have -counted overlaps instead (how many peaks within 20kb). -:::: - -```{r} -overlaps <- rowRanges(sce_sub) |> - - anchor_5p() |> - mutate(width=1) %>% - mutate(num_overlaps = count_overlaps(., peaks, maxgap=2e4)) - -table(overlaps$num_overlaps) -``` - -:::: {.note} -Another question that often arises is whether the distances or -overlaps we observe are more or less than we would expect if there -were no relationship between the positions of genes and peaks. -To answer questions like these, we have developed the *nullranges* -package, which allows for either bootstrapping of ranges throughout -the genome [@Mu2023], or selection of a covariate-matched background -set [@Davis2023], to compute probabilities under the null hypothesis. -A brief example follows of bootstrapping some of the peaks in a window -of chr1:20Mb-30Mb (see *nullranges* website for more comprehensive -examples). -:::: - -```{r} -library(nullranges) - -# make a small range for demonstration of bootstrapping -seg <- data.frame(seqnames=1, start=20e6, width=10e6) |> - as_granges() - -# generate a genome segmentation from this one range -seg <- oneRegionSegment(seg, seqlength=seqlengths(peaks)[[1]]) -seg - -# bootstrap peaks within this segment (just for demo) -peaks |> - filter_by_overlaps(seg[2]) |> - bootRanges(blockLength=1e6, seg=seg, R=10) - -# these bootstrapped ranges could then be used for computing -# generic test statistics, with the `iter` column used for -# building a null distribution -``` - -**Session Information** - -```{r} -sessionInfo() -``` - -**References** - -```{css echo=FALSE} -.note { - margin: 30px; - padding: 1em; - background: #FFF8F0; - border: 1px solid #EFE8E0; - border-radius: 10px; -} -```