Skip to content

Commit

Permalink
Update to MSigDB 2023.1
Browse files Browse the repository at this point in the history
  • Loading branch information
igordot committed Dec 21, 2024
1 parent a7967d3 commit 32130ea
Show file tree
Hide file tree
Showing 6 changed files with 22 additions and 14 deletions.
6 changes: 3 additions & 3 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Type: Package
Package: msigdbr
Title: MSigDB Gene Sets for Multiple Organisms in a Tidy Data Format
Version: 2022.1.1
Version: 2023.1.1
Authors@R:
person("Igor", "Dolgalev", , "[email protected]", role = c("aut", "cre"),
comment = c(ORCID = "0000-0003-4451-126X"))
Expand All @@ -17,7 +17,7 @@ License: MIT + file LICENSE
URL: https://igordot.github.io/msigdbr/
BugReports: https://github.com/igordot/msigdbr/issues
Depends:
R (>= 3.4)
R (>= 3.6)
Imports:
babelgene (>= 22.9),
dplyr (>= 1.1.1),
Expand All @@ -34,4 +34,4 @@ VignetteBuilder:
knitr
Encoding: UTF-8
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.2.1
RoxygenNote: 7.3.2
7 changes: 6 additions & 1 deletion NEWS.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,11 @@
# msigdbr 2023.1.1

* Based on MSigDB v2023.1.Hs release.
* Not on CRAN.

# msigdbr 2022.1.1

* Based on MSigDB v2022.1 release.
* Based on MSigDB v2022.1.Hs release.
* Not on CRAN.

# msigdbr 7.5.1
Expand Down
Binary file modified R/sysdata.rda
Binary file not shown.
19 changes: 11 additions & 8 deletions data-raw/msigdbr-prepare.R
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@

library(dplyr)
library(tidyr)
library(purrr)
Expand All @@ -12,19 +11,22 @@ options(pillar.print_max = 100)
# Import MSigDB gene sets -----

# Set MSigDB version
mdb_version <- "2022.1.Hs"
mdb_version <- "2023.1.Hs"

# Set HGNC version (last quarterly release before MSigDB release)
hgnc_version <- "2022-07-01"
hgnc_version <- "2023-01-01"

# Set MSigDB file paths
mdb_xml <- glue("msigdb_v{mdb_version}.xml")
mdb_xml_zip <- str_glue("{mdb_xml}.zip")
mdb_url_base <- "https://data.broadinstitute.org/gsea-msigdb/msigdb"
mdb_xml_url <- glue("{mdb_url_base}/release/{mdb_version}/{mdb_xml}")
mdb_zip_url <- glue("{mdb_url_base}/release/{mdb_version}/{mdb_xml_zip}")

# Download the MSigDB XML file
# Download and unzip the MSigDB XML file
options(timeout = 300)
download.file(url = mdb_xml_url, destfile = mdb_xml)
download.file(url = mdb_zip_url, destfile = mdb_xml_zip)
unzip(mdb_xml_zip, exdir = ".")
file.remove(mdb_xml_zip)

# Check MSigDB XML file size in bytes
utils:::format.object_size(file.size(mdb_xml), units = "auto")
Expand Down Expand Up @@ -81,6 +83,7 @@ mdb_category_genesets
ensembl_url <- glue("{mdb_url_base}/annotations/human/Human_Ensembl_Gene_ID_MSigDB.v{mdb_version}.chip")
ensembl_tbl <- read_tsv(ensembl_url, progress = FALSE, show_col_types = FALSE)
ensembl_tbl <- distinct(ensembl_tbl, human_ensembl_gene = `Probe Set ID`, human_gene_symbol = `Gene Symbol`)
ensembl_tbl <- filter(ensembl_tbl, str_detect(human_ensembl_gene, "^ENSG000"))
ensembl_tbl <- arrange(ensembl_tbl, human_ensembl_gene)

# Check for multi-mappers (should be many)
Expand All @@ -91,7 +94,7 @@ count(ensembl_tbl, human_gene_symbol, sort = TRUE)

# Download HGNC mappings
# HGNC may not include all MSigDB genes, but it usually has one Ensembl ID per gene
hgnc_url <- glue("https://ftp.ebi.ac.uk/pub/databases/genenames/hgnc/archive/quarterly/tsv/hgnc_complete_set_{hgnc_version}.txt")
hgnc_url <- str_glue("https://storage.googleapis.com/public-download-files/hgnc/archive/archive/quarterly/tsv/hgnc_complete_set_{hgnc_version}.txt")
hgnc_tbl <- read_tsv(hgnc_url, progress = FALSE, show_col_types = FALSE, guess_max = 10000)
hgnc_tbl <- distinct(hgnc_tbl, human_ensembl_gene = ensembl_gene_id, human_entrez_gene = entrez_id)
hgnc_tbl <- mutate(hgnc_tbl, human_entrez_gene = as.integer(human_entrez_gene))
Expand Down Expand Up @@ -244,7 +247,7 @@ genes_members_ratio <- full_join(mdb_geneset_members, count(msigdbr_geneset_gene
genes_members_ratio$ratio <- genes_members_ratio$n_genes / genes_members_ratio$n_members
if (min(genes_members_ratio$n_genes) < 5) stop()
if (max(genes_members_ratio$n_genes) > 2300) stop()
if (max(genes_members_ratio$ratio) > 2) stop()
if (max(genes_members_ratio$ratio) > 2.2) stop()
if (quantile(genes_members_ratio$ratio, 0.99) > 1) stop()
if (quantile(genes_members_ratio$ratio, 0.001) < 0.3) stop()
if (quantile(genes_members_ratio$ratio, 0.1) < 0.7) stop()
Expand Down
2 changes: 1 addition & 1 deletion tests/testthat/test-msigdbr.R
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,7 @@ test_that("number of genes in specific gene sets", {
msigdbr_mm <- msigdbr(species = "Mus musculus")
# H: HALLMARK_APOPTOSIS
expect_equal(nrow(filter(msigdbr_hs_sym, gs_id == "M5902")), 161)
expect_equal(nrow(filter(msigdbr_mm, gs_id == "M5902")), 161)
expect_equal(nrow(filter(msigdbr_mm, gs_id == "M5902")), 160)
expect_equal(nrow(filter(msigdbr_hs_sym, gs_id == "M5903")), 32)
expect_equal(nrow(filter(msigdbr_mm, gs_id == "M5903")), 32)
# C8: HAY_BONE_MARROW_PRE_DENDRITIC
Expand Down
2 changes: 1 addition & 1 deletion vignettes/msigdbr-intro.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,7 @@ gsva(gset.idx.list = msigdbr_list, ...)

**Which version of MSigDB was used?**

This package was generated with MSigDB v2022.1.
This package was generated with MSigDB v2023.1.Hs.
The MSigDB version is used as the base of the msigdbr package version.
You can check the installed version with `packageVersion("msigdbr")`.

Expand Down

0 comments on commit 32130ea

Please sign in to comment.