Slim Fourati 2024-01-31
Load packages
suppressPackageStartupMessages(library(package = "knitr"))
suppressPackageStartupMessages(library(package = "kableExtra"))
suppressPackageStartupMessages(library(package = "readxl"))
suppressPackageStartupMessages(library(package = "EDASeq"))
suppressPackageStartupMessages(library(package = "pvca")) # dleelab/pvca
suppressPackageStartupMessages(library(package = "lme4"))
suppressPackageStartupMessages(library(package = "parallelDist"))
suppressPackageStartupMessages(library(package = "parallel"))
suppressPackageStartupMessages(library(package = "edgeR"))
suppressPackageStartupMessages(library(package = "ggbeeswarm"))
suppressPackageStartupMessages(library(package = "ComplexHeatmap"))
suppressPackageStartupMessages(library(package = "tidyverse"))
opts_chunk$set(echo = TRUE, fig.path = "../figure/")
options(readr.show_col_types = FALSE,
dplyr.summarise.inform = FALSE)
workDir <- dirname(getwd())
Reading raw counts
countDF <- read_csv(file = file.path(workDir, "input/joana2.genecounts.csv")) %>%
as.data.frame() %>%
column_to_rownames(var = "gene_id")
message("First five rows/columns of count matrix")
## First five rows/columns of count matrix
countDF[1:5, 1:5] %>%
kable()
DGA7_D14_CD4 | DGA7_D14_CD4MEM | DGA7_D14_CD4NA | DGA7_D14_CD8 | DGA7_D14_CD8CM | |
---|---|---|---|---|---|
ENSMMUG00000000001 | 31 | 121 | 60 | 71 | 66 |
ENSMMUG00000000002 | 2 | 10 | 11 | 6 | 5 |
ENSMMUG00000000005 | 177 | 395 | 110 | 216 | 297 |
ENSMMUG00000000006 | 0 | 1 | 0 | 0 | 0 |
ENSMMUG00000000007 | 4 | 20 | 36 | 35 | 33 |
message("Dimensions of the count matrix")
## Dimensions of the count matrix
print(dim(countDF))
## [1] 35432 396
Read sample annotation
sampleAnnotFile <- file.path(workDir,
"input",
"JoanaDias_Bulk_RNAseq_sampleTracking_06012021.xlsx")
# read each plates
sampleAnnotDF <- read_excel(path = sampleAnnotFile,
.name_repair = function(x) make.unique(make.names(x)))
# append treatment info
treatFile <- file.path(workDir,
"input",
"RNAseq_II_for_Slim_20210829.slide3.txt")
treatDF <- read_tsv(file = treatFile, name_repair = make.names)
sampleAnnotDF <- merge(x = sampleAnnotDF, y = treatDF, by = "Monkey.ID")
# append sample id
sampleAnnotDF <- sampleAnnotDF %>%
mutate(rowname = paste(Monkey.ID,
Timepoint,
Cell.population.sorted.1,
sep = "_"),
rowname = gsub(pattern = " ", replacement = "", rowname)) %>%
as.data.frame() %>%
column_to_rownames(var = "rowname")
sampleAnnotDF <- sampleAnnotDF[colnames(countDF), ]
# TotalReads counted
sampleAnnotDF$TotalReads <- colSums(countDF)
message("print header of sample annotation")
## print header of sample annotation
sampleAnnotDF[1:5, 1:5] %>%
kable()
Monkey.ID | Collaborator | Sample. | Plate. | Sample.Type.Subset | |
---|---|---|---|---|---|
DGA7_D14_CD4 | DGA7 | Joana Dias/Koup Lab | 117 | 2 | Total RNA from sorted cells in RNAzol |
DGA7_D14_CD4MEM | DGA7 | Joana Dias/Koup Lab | 119 | 2 | Total RNA from sorted cells in RNAzol |
DGA7_D14_CD4NA | DGA7 | Joana Dias/Koup Lab | 118 | 2 | Total RNA from sorted cells in RNAzol |
DGA7_D14_CD8 | DGA7 | Joana Dias/Koup Lab | 111 | 2 | Total RNA from sorted cells in RNAzol |
DGA7_D14_CD8CM | DGA7 | Joana Dias/Koup Lab | 116 | 2 | Total RNA from sorted cells in RNAzol |
message("Number of animal w/ RNA-seq data")
## Number of animal w/ RNA-seq data
sampleAnnotDF %>%
.$Monkey.ID %>%
unique() %>%
length() %>%
print()
## [1] 12
message("Number of timepoints and subsets w/ RNA-seq data")
## Number of timepoints and subsets w/ RNA-seq data
sampleAnnotDF %>%
select(Monkey.ID, Timepoint, Cell.population.sorted.1) %>%
distinct() %>%
group_by(Timepoint, Cell.population.sorted.1) %>%
summarize(n = n()) %>%
ungroup() %>%
mutate(Timepoint = factor(Timepoint, levels = c("pre", "D14", "wk8"))) %>%
pivot_wider(names_from = Cell.population.sorted.1, values_from = n) %>%
arrange(Timepoint) %>%
kable()
Timepoint | CD4 | CD4MEM | CD4NA | CD8 | CD8CM | CD8EM | CD8NA | GCTfh | fCD8 | nonTfh | nonfCD8 |
---|---|---|---|---|---|---|---|---|---|---|---|
pre | 12 | 12 | 12 | 12 | 12 | 12 | 12 | 12 | 12 | 12 | 12 |
D14 | 12 | 12 | 12 | 12 | 12 | 12 | 12 | 12 | 12 | 12 | 12 |
wk8 | 12 | 12 | 12 | 12 | 12 | 12 | 12 | 12 | 12 | 12 | 12 |
Transcriptomic profiles of 12 animals for 11 subsets at 3 timepoints was performed.
Read gene annotation
colNames <- c("seqname",
"source",
"feature",
"start",
"end",
"score",
"strand",
"frame",
"attribute")
colTypes <- paste(c(rep("c", times = 3),
rep("i", times = 2),
rep("c", times = 4)),
collapse = "")
featuresAnnotationFile <- file.path(workDir, "input/joana2.genes.gtf")
skipNb <- read_lines(file = featuresAnnotationFile)
skipNb <- sum(grepl(pattern = "^#", skipNb))
featuresAnnot <- read_tsv(file = featuresAnnotationFile,
skip = skipNb,
col_names = colNames,
col_types = colTypes)
# extract gene_id and transcript_id from attributes
featAnnot <- featuresAnnot %>%
mutate(gene_id = gsub(pattern = ".*gene_id \"([^;]+)\";.+",
replacement = "\\1",
attribute),
gene_name = ifelse(test = grepl(pattern = "gene_name",
attribute),
yes = gsub(pattern = ".+gene_name \"([^;]+)\";.+",
replacement = "\\1",
attribute),
no = NA),
gene_biotype = ifelse(test = grepl(pattern = "gene_biotype",
attribute),
yes = gsub(pattern = ".+gene_biotype \"([^;]+)\";.*",
replacement = "\\1",
attribute),
no = NA)) %>%
select(seqname, strand, gene_id, gene_name, gene_biotype) %>%
distinct() %>%
as.data.frame()
rownames(featAnnot) <- featAnnot$gene_id
featAnnot <- featAnnot[rownames(countDF), ]
cat("Number of genes annotated")
## Number of genes annotated
featAnnot %>%
group_by(gene_biotype) %>%
summarize(n = n()) %>%
arrange(desc(n)) %>%
kable()
gene_biotype | n |
---|---|
protein_coding | 21591 |
lncRNA | 4769 |
misc_RNA | 3419 |
snRNA | 1875 |
snoRNA | 1326 |
pseudogene | 679 |
Y_RNA | 666 |
miRNA | 626 |
rRNA | 145 |
IG_V_gene | 88 |
processed_pseudogene | 88 |
TR_V_gene | 48 |
scaRNA | 43 |
Mt_tRNA | 22 |
TR_J_gene | 21 |
TR_C_gene | 8 |
vault_RNA | 7 |
ribozyme | 4 |
IG_C_gene | 3 |
IG_D_gene | 2 |
Mt_rRNA | 2 |
Build raw SeqExpressionSet
# build raw ExpressionSet
esetRaw <- newSeqExpressionSet(counts = as.matrix(countDF),
featureData = AnnotatedDataFrame(featAnnot),
phenoData = AnnotatedDataFrame(sampleAnnotDF))
save(esetRaw, file = file.path(workDir, "output/joana2.esetRaw.RData"))
print(esetRaw)
## SeqExpressionSet (storageMode: lockedEnvironment)
## assayData: 35432 features, 396 samples
## element names: counts, normalizedCounts, offset
## protocolData: none
## phenoData
## sampleNames: DGA7_D14_CD4 DGA7_D14_CD4MEM ... DHBC_wk8_nonTfh (396
## total)
## varLabels: Monkey.ID Collaborator ... TotalReads (31 total)
## varMetadata: labelDescription
## featureData
## featureNames: ENSMMUG00000000001 ENSMMUG00000000002 ...
## ENSMMUG00000065388 (35432 total)
## fvarLabels: seqname strand ... gene_biotype (5 total)
## fvarMetadata: labelDescription
## experimentData: use 'experimentData(object)'
## Annotation:
Barplot with read alignment statistics
statDF <- read_tsv(file = file.path(workDir, "input/joana2.ReadStats.txt"),
col_names = FALSE) %>%
distinct() %>%
spread(X3, X2)
plotDF <- counts(esetRaw) %>%
colSums() %>%
data.frame(Counted = .) %>%
rownames_to_column() %>%
merge(x = statDF, by.x = "X1", by.y = "rowname")
plotDF <- plotDF %>%
mutate(Trimmed = TotalReads - Surviving,
NotMapped = Surviving - UniqMapped - Multimapped,
NotCounted = UniqMapped + Multimapped - Counted) %>%
select(Trimmed, NotMapped, NotCounted, Counted, X1) %>%
arrange(Counted) %>%
mutate(X1 = factor(X1, levels = unique(X1))) %>%
gather(key, value, -X1)
ggplot(data = plotDF,
mapping = aes(x = X1, y = value, fill = key)) +
geom_bar(stat = "identity") +
scale_y_continuous(expand = c(0, 0)) +
labs(x = "Samples", y = "Number of reads") +
theme_bw() +
theme(axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank())
message("number of samples with more than 2e06 counted reads")
plotDF %>%
filter(key %in% "Counted") %>%
mutate(over10e6 = value >= 2e06) %>%
group_by(over10e6) %>%
summarize(n())
## # A tibble: 2 × 2
## over10e6 `n()`
## <lgl> <int>
## 1 FALSE 6
## 2 TRUE 390
message("6 samples below requirment for DEG analysis:")
pData(esetRaw)[as.vector(filter(plotDF, key %in% "Counted" & value < 2e06)$X1), ] %>%
kable()
Monkey.ID | Collaborator | Sample. | Plate. | Sample.Type.Subset | Study.. | Timepoint | Cell.population.sorted | Cell.population.sorted.1 | Sample.Well.or.tube.label | Number.of.Cells | Sample.Volume.ul. | Method.to.determine.RNA.conc. | RIN | Library.Preparation.Method | Library.Elution.volume | Index.well | Index.Primer.Name | Index.Primer.sequence | Final.library.conc…nM. | Final.avg.library.size..bp. | Date.Received | Pool..66.samples.pool. | Location.VRC | Illumina.Instrument | Flowcell.ID | Lane.sequenced | Date.Sequenced | Notes | NHP.group | TotalReads | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
DGA7_D14_nonfCD8 | DGA7 | Joana Dias/Koup Lab | 114 | 2 | Total RNA from sorted cells in RNAzol | 730.6 | D14 | non-fCD8+ | nonfCD8 | B3 | 5000 | 10 | NA | NA | NEBNext Ultra II RNA Library Preparation Kit | 25 | B3 | P15 | TGGTACAG | 0.2215850 | NA | NA | 2 | NA | NovaSeq 6000 | HGH73DRXY | 2 | 2021-06-29 | NA | UnTx | 55448 |
DGH1_wk8_CD8NA | DGH1 | Joana Dias/Koup Lab | 27 | 1 | Total RNA from sorted cells in RNAzol | 730.6 | wk8 | CD8+ naïve | CD8NA | C4 | 5000 | 10 | NA | NA | NEBNext Ultra II RNA Library Preparation Kit | 25 | C4 | P28 | GTAACGAC | 0.6170620 | NA | NA | 1 | NA | NovaSeq 6000 | HGH73DRXY | 1 | 2021-06-29 | NA | DEL bNAb Tx | 245885 |
DHBC_D14_nonfCD8 | DHBC | Joana Dias/Koup Lab | 378 | 4 | Total RNA from sorted cells in RNAzol | 730.6 | D14 | non-fCD8+ | nonfCD8 | B12 | 5000 | 10 | NA | NA | NEBNext Ultra II RNA Library Preparation Kit | 25 | B12 | P24 | ATTGCGTG | 0.3831800 | NA | NA | 6 | NA | NovaSeq 6000 | HGH5LDRXY | 2 | 2021-06-30 | NA | DEL bNAb Tx | 422567 |
DGA7_pre_CD8 | DGA7 | Joana Dias/Koup Lab | 100 | 2 | Total RNA from sorted cells in RNAzol | 730.6 | pre | total CD8+ | CD8 | D1 | 5000 | 10 | NA | NA | NEBNext Ultra II RNA Library Preparation Kit | 25 | D1 | P37 | AGACCGTA | 1.3271829 | NA | NA | 2 | NA | NovaSeq 6000 | HGH73DRXY | 2 | 2021-06-29 | NA | UnTx | 456720 |
DGG6_wk8_CD8NA | DGG6 | Joana Dias/Koup Lab | 60 | 1 | Total RNA from sorted cells in RNAzol | 730.6 | wk8 | CD8+ naïve | CD8NA | D8 | 5000 | 10 | NA | NA | NEBNext Ultra II RNA Library Preparation Kit | 25 | D8 | P44 | ATCTCGCT | 0.6257944 | NA | NA | 1 | NA | NovaSeq 6000 | HGH73DRXY | 1 | 2021-06-29 | NA | UnTx | 706247 |
DGG6_pre_CD8 | DGG6 | Joana Dias/Koup Lab | 34 | 1 | Total RNA from sorted cells in RNAzol | 730.6 | pre | total CD8+ | CD8 | B5 | 5000 | 10 | NA | NA | NEBNext Ultra II RNA Library Preparation Kit | 25 | B5 | P17 | GCTATCCT | 101.9640219 | NA | NA | 1 | NA | NovaSeq 6000 | HGH73DRXY | 1 | 2021-06-29 | NA | UnTx | 967971 |
All samples except 6 have more than 2 million reads and can be used for diff. expression analysis.
Barplot with mapped reads statistics
plotDF <- counts(esetRaw) %>%
as.data.frame() %>%
rownames_to_column() %>%
merge(y = select(fData(esetRaw), gene_id, gene_biotype),
by.x = "rowname",
by.y = "gene_id") %>%
mutate(gene_biotype = ifelse(test = grepl(pattern = "pseudogene",
gene_biotype),
yes = "pseudogene",
no = gene_biotype),
gene_biotype = ifelse(test = gene_biotype %in%
c("miRNA", "lincRNA", "snoRNA",
"sRNA", "snRNA", "vaultRNA",
"3prime_overlapping_ncrna",
"macro_lncRNA", "scaRNA"),
yes = "ncrna",
no = gene_biotype),
gene_biotype = ifelse(test = gene_biotype %in%
c("antisense",
"ncRNA",
"Mt_rRNA",
"Mt_tRNA",
"protein_coding",
"pseudogene",
"ribozyme",
"rRNA"),
yes = gene_biotype,
no = "other")) %>%
gather(sample, value, -rowname, -gene_biotype) %>%
group_by(sample, gene_biotype) %>%
summarize(eta = sum(value),
mu = mean(value),
q2 = median(value)) %>%
ungroup() %>%
arrange(!grepl("Water", `sample`)) %>%
mutate(`sample` = factor(`sample`, levels = unique(`sample`)))
ggplot(data = plotDF,
mapping = aes(x = sample, y = eta, fill = gene_biotype)) +
geom_bar(stat = "identity") +
scale_y_continuous(expand = c(0, 0)) +
labs(x = "Samples", y = "Number of reads") +
theme_bw() +
theme(axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank())
message("percentage of reads that are from protein coding")
## percentage of reads that are from protein coding
plotDF %>%
group_by(sample) %>%
summarize(n = sum(eta)) %>%
merge(y = plotDF) %>%
filter(gene_biotype %in% "protein_coding") %>%
mutate(perc_prot_cod = eta/n * 100) %>%
summarize(type = "perc_prot_coding",
min = min(.$perc_prot_cod, na.rm = TRUE),
max = max(.$perc_prot_cod, na.rm = TRUE))
## type min max
## 1 perc_prot_coding 35.21459 98.4649
As expected, most counted reads are mapped to protein coding genes.
Principal variance component analysis (raw)
esetTemp <- ExpressionSet(assayData = log2(counts(esetRaw) + 0.25),
phenoData = AnnotatedDataFrame(data = pData(esetRaw)))
esetTemp$TotalReadsQ3 <- cut(esetTemp$TotalReads,
breaks = quantile(esetTemp$TotalReads,
probs = c(0,0.33, 0.66,1)),
include.lowest = TRUE,
right = TRUE)
esetTemp$NumberCellsQ3 <- cut(esetTemp$Number.of.Cells,
breaks = c(0, 1500, 3000, 5000),
include.lowest = TRUE,
right = TRUE)
esetTemp$lowCountedReads <- esetTemp$TotalReads < 2e6
fit <- PVCA(counts = exprs(esetTemp),
meta = pData(esetTemp)[, c("Timepoint",
"NHP.group",
"Monkey.ID",
"Cell.population.sorted.1",
"NumberCellsQ3",
"TotalReadsQ3",
"lowCountedReads")],
threshold = 0.6,
inter = FALSE)
plotDF <- data.frame(effect = as.vector(fit),
label = names(fit)) %>%
arrange(effect) %>%
mutate(label = factor(label, levels = label))
ggplot(data = plotDF,
mapping = aes(x = label, y = effect)) +
geom_bar(stat = "identity") +
labs(x = NULL, y = "Explained effect") +
theme_bw() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
Multidimensional scaling plot based on raw counts
rawMat <- counts(esetRaw)
mat <- rawMat[rowSums(rawMat) > 0, ]
mat <- log2(mat + 0.25)
distMat <- parallelDist(t(mat), threads = detectCores() - 1)
attributes(distMat)$Labels <- colnames(mat)
fit <- cmdscale(distMat, k = 2, eig = TRUE)
plotDF <- as.data.frame(fit$points)
plotDF <- plotDF %>%
rownames_to_column() %>%
merge(y = rownames_to_column(pData(esetRaw)),
by = "rowname") %>%
mutate(lowCountedReads = (TotalReads < 2e6))
ggplot(data = plotDF,
mapping = aes(x = V1,
y = V2,
color = Cell.population.sorted,
shape = lowCountedReads)) +
geom_point(size = 4) +
labs(x = paste0("1st dimension (",
round((fit$eig/sum(fit$eig))[1] * 100),
"%)"),
y = paste0("2nd dimension (",
round((fit$eig/sum(fit$eig))[2] * 100),
"%)")) +
theme_bw()
filter(plotDF,V1 > 300)
## rowname V1 V2 Monkey.ID Collaborator Sample.
## 1 DGA7_D14_nonfCD8 866.8445 -11.33078 DGA7 Joana Dias/Koup Lab 114
## 2 DGA7_pre_CD8 513.3055 310.34905 DGA7 Joana Dias/Koup Lab 100
## 3 DGG6_pre_CD8 351.2199 -38.84293 DGG6 Joana Dias/Koup Lab 34
## 4 DGG6_wk8_CD8NA 703.3137 150.16881 DGG6 Joana Dias/Koup Lab 60
## 5 DGH1_wk8_CD8NA 701.7878 149.36020 DGH1 Joana Dias/Koup Lab 27
## 6 DHBC_D14_nonfCD8 555.1679 -55.52795 DHBC Joana Dias/Koup Lab 378
## Plate. Sample.Type.Subset Study.. Timepoint
## 1 2 Total RNA from sorted cells in RNAzol 730.6 D14
## 2 2 Total RNA from sorted cells in RNAzol 730.6 pre
## 3 1 Total RNA from sorted cells in RNAzol 730.6 pre
## 4 1 Total RNA from sorted cells in RNAzol 730.6 wk8
## 5 1 Total RNA from sorted cells in RNAzol 730.6 wk8
## 6 4 Total RNA from sorted cells in RNAzol 730.6 D14
## Cell.population.sorted Cell.population.sorted.1 Sample.Well.or.tube.label
## 1 non-fCD8+ nonfCD8 B3
## 2 total CD8+ CD8 D1
## 3 total CD8+ CD8 B5
## 4 CD8+ naïve CD8NA D8
## 5 CD8+ naïve CD8NA C4
## 6 non-fCD8+ nonfCD8 B12
## Number.of.Cells Sample.Volume.ul. Method.to.determine.RNA.conc. RIN
## 1 5000 10 NA NA
## 2 5000 10 NA NA
## 3 5000 10 NA NA
## 4 5000 10 NA NA
## 5 5000 10 NA NA
## 6 5000 10 NA NA
## Library.Preparation.Method Library.Elution.volume
## 1 NEBNext Ultra II RNA Library Preparation Kit 25
## 2 NEBNext Ultra II RNA Library Preparation Kit 25
## 3 NEBNext Ultra II RNA Library Preparation Kit 25
## 4 NEBNext Ultra II RNA Library Preparation Kit 25
## 5 NEBNext Ultra II RNA Library Preparation Kit 25
## 6 NEBNext Ultra II RNA Library Preparation Kit 25
## Index.well Index.Primer.Name Index.Primer.sequence Final.library.conc...nM.
## 1 B3 P15 TGGTACAG 0.2215850
## 2 D1 P37 AGACCGTA 1.3271829
## 3 B5 P17 GCTATCCT 101.9640219
## 4 D8 P44 ATCTCGCT 0.6257944
## 5 C4 P28 GTAACGAC 0.6170620
## 6 B12 P24 ATTGCGTG 0.3831800
## Final.avg.library.size..bp. Date.Received Pool..66.samples.pool. Location.VRC
## 1 NA NA 2 NA
## 2 NA NA 2 NA
## 3 NA NA 1 NA
## 4 NA NA 1 NA
## 5 NA NA 1 NA
## 6 NA NA 6 NA
## Illumina.Instrument Flowcell.ID Lane.sequenced Date.Sequenced Notes
## 1 NovaSeq 6000 HGH73DRXY 2 2021-06-29 NA
## 2 NovaSeq 6000 HGH73DRXY 2 2021-06-29 NA
## 3 NovaSeq 6000 HGH73DRXY 1 2021-06-29 NA
## 4 NovaSeq 6000 HGH73DRXY 1 2021-06-29 NA
## 5 NovaSeq 6000 HGH73DRXY 1 2021-06-29 NA
## 6 NovaSeq 6000 HGH5LDRXY 2 2021-06-30 NA
## NHP.group TotalReads lowCountedReads
## 1 UnTx 55448 TRUE
## 2 UnTx 456720 TRUE
## 3 UnTx 967971 TRUE
## 4 UnTx 706247 TRUE
## 5 DEL bNAb Tx 245885 TRUE
## 6 DEL bNAb Tx 422567 TRUE
Samples with low read counts are indeed the main driver of raw gene-expression.
Normalize (TMM) count data
# remove low counts samples
esetTemp <- esetRaw[, esetRaw$TotalReads >= 9e5]
dge <- DGEList(counts = counts(esetTemp),
remove.zeros = TRUE)
# remove low expressed genes
flag <- filterByExpr(dge, group = esetRaw$Cell.population.sorted)
dge <- dge[flag, , keep.lib.sizes = FALSE]
dge <- calcNormFactors(object = dge, method = "TMM")
normalizedCounts <- cpm(dge, normalized.lib.sizes = TRUE, log = TRUE)
eset <- newSeqExpressionSet(
counts = dge$counts,
normalizedCounts = as.matrix(normalizedCounts),
featureData = fData(esetTemp)[rownames(normalizedCounts), ],
phenoData = pData(esetTemp))
save(eset, file = file.path(workDir, "output/joana2.eset.RData"))
print(eset)
## SeqExpressionSet (storageMode: lockedEnvironment)
## assayData: 14986 features, 391 samples
## element names: counts, normalizedCounts, offset
## protocolData: none
## phenoData
## sampleNames: DGA7_D14_CD4 DGA7_D14_CD4MEM ... DHBC_wk8_nonTfh (391
## total)
## varLabels: Monkey.ID Collaborator ... TotalReads (31 total)
## varMetadata: labelDescription
## featureData
## featureNames: ENSMMUG00000000001 ENSMMUG00000000002 ...
## ENSMMUG00000065387 (14986 total)
## fvarLabels: seqname strand ... gene_biotype (5 total)
## fvarMetadata: labelDescription
## experimentData: use 'experimentData(object)'
## Annotation:
Principal variance component analysis (normalized)
esetTemp <- ExpressionSet(assayData = normCounts(eset),
phenoData = AnnotatedDataFrame(data = pData(eset)))
esetTemp$TotalReadsQ3 <- cut(esetTemp$TotalReads,
breaks = quantile(esetTemp$TotalReads,
probs = c(0,0.33, 0.66,1)),
include.lowest = TRUE,
right = TRUE)
fit <- PVCA(counts = exprs(esetTemp),
meta = pData(esetTemp)[, c("Timepoint",
"NHP.group",
"Monkey.ID",
"Cell.population.sorted.1",
"TotalReadsQ3")],
threshold = 0.6,
inter = FALSE)
plotDF <- data.frame(effect = as.vector(fit),
label = names(fit)) %>%
arrange(effect) %>%
mutate(label = factor(label, levels = unique(label)))
ggplot(data = plotDF,
mapping = aes(x = label, y = effect)) +
geom_bar(stat = "identity") +
labs(x = NULL, y = "Explained effect") +
theme_bw() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
Multidimensional scalling plot on normalized counts
# mds normalized/cpm counts
mat <- normCounts(eset)
distMat <- parallelDist(t(mat), threads = detectCores() - 1)
attributes(distMat)$Labels <- colnames(mat)
fit <- cmdscale(distMat, k = 2, eig = TRUE)
plotDF <- as.data.frame(fit$points) %>%
rownames_to_column() %>%
merge(y = rownames_to_column(pData(eset)),
by = "rowname")
plotLabel <- round((fit$eig/sum(fit$eig)) * 100)
plotLabel <- plotLabel[1:2]
plotLabel <- paste0(c("1st dimension (", "2nd dimension ("),
plotLabel,
"%)")
ggplot(data = plotDF,
mapping = aes(x = V1,
y = V2,
color = Cell.population.sorted.1,
shape = NHP.group)) +
geom_point(size = 4) +
labs(x = plotLabel[[1]],
y = plotLabel[[2]]) +
theme_bw() +
theme(legend.key = element_blank(),
plot.title = element_text(hjust = 0.5))
Jitter plot of genes coding for cell surface markers
geneLS <- c("CD4", "CD8A")
plotDF <- counts(eset[fData(eset)$gene_name %in% geneLS, ]) %>%
as.data.frame() %>%
rownames_to_column() %>%
merge(y = select(fData(eset), gene_id, gene_name),
by.x = "rowname",
by.y = "gene_id") %>%
select(-rowname) %>%
gather(SampleID, count, -gene_name) %>%
merge(y = select(rownames_to_column(pData(eset)),
rowname,
Cell.population.sorted.1),
by.x = "SampleID",
by.y = "rowname") %>%
mutate(value = pmax(count, 0.1))
ggplot(plotDF,
mapping = aes(x = Cell.population.sorted.1, y = value)) +
geom_boxplot(outlier.color = "transparent") +
geom_beeswarm(mapping = aes(color = Cell.population.sorted.1)) +
scale_y_log10() +
facet_wrap(facet = ~gene_name, scale = "free", nrow = 3) +
theme_bw() +
theme(axis.text = element_text(size = 5),
legend.pos = "none")
geneLS <- c("CD28", "CXCR5", "PDCD1")
plotDF <- counts(eset[fData(eset)$gene_name %in% geneLS, eset$Cell.population.sorted.1 %in%
c("CD4", "CD4MEM", "CD4NA", "GCTfh", "nonTfh")]) %>%
as.data.frame() %>%
rownames_to_column() %>%
merge(y = select(fData(eset), gene_id, gene_name),
by.x = "rowname",
by.y = "gene_id") %>%
select(-rowname) %>%
gather(SampleID, count, -gene_name) %>%
merge(y = select(rownames_to_column(pData(eset)),
rowname,
Cell.population.sorted.1),
by.x = "SampleID",
by.y = "rowname") %>%
mutate(value = pmax(count, 0.1))
ggplot(plotDF,
mapping = aes(x = Cell.population.sorted.1, y = value)) +
geom_boxplot(outlier.color = "transparent") +
geom_beeswarm(mapping = aes(color = Cell.population.sorted.1)) +
scale_y_log10() +
facet_wrap(facet = ~gene_name, scale = "free", nrow = 3) +
theme_bw() +
theme(axis.text = element_text(size = 5),
legend.pos = "none")
geneLS <- c("CCR7", "CD28", "CXCR5", "PDCD1")
plotDF <- counts(eset[fData(eset)$gene_name %in% geneLS, eset$Cell.population.sorted.1 %in%
c("CD8", "CD8CM", "CD8EM", "CD8NA", "fCD8", "nonfCD8")]) %>%
as.data.frame() %>%
rownames_to_column() %>%
merge(y = select(fData(eset), gene_id, gene_name),
by.x = "rowname",
by.y = "gene_id") %>%
select(-rowname) %>%
gather(SampleID, count, -gene_name) %>%
merge(y = select(rownames_to_column(pData(eset)),
rowname,
Cell.population.sorted.1),
by.x = "SampleID",
by.y = "rowname") %>%
mutate(value = pmax(count, 0.1))
ggplot(plotDF,
mapping = aes(x = Cell.population.sorted.1, y = value)) +
geom_boxplot(outlier.color = "transparent") +
geom_beeswarm(mapping = aes(color = Cell.population.sorted.1)) +
scale_y_log10() +
facet_wrap(facet = ~gene_name, scale = "free", nrow = 3) +
theme_bw() +
theme(axis.text = element_text(size = 5),
legend.pos = "none")
Principal variance component analysis per subset
plotDF <- NULL
for (SUBSET in unique(eset$Cell.population.sorted.1)) {
esetTemp <- eset[, eset$Cell.population.sorted.1 %in% SUBSET]
esetTemp <- ExpressionSet(assayData = normCounts(esetTemp),
phenoData = AnnotatedDataFrame(data = pData(esetTemp)))
fit <- PVCA(counts = exprs(esetTemp),
meta = pData(esetTemp)[, c("Timepoint",
"NHP.group",
"Monkey.ID")],
threshold = 0.6,
inter = FALSE)
plotTemp <- data.frame(effect = as.vector(fit),
label = names(fit)) %>%
arrange(effect) %>%
mutate(label = factor(label, levels = unique(label)),
Cell.population.sorted.1 = SUBSET)
plotDF <- rbind(plotDF, plotTemp)
}
ggplot(data = plotDF,
mapping = aes(x = label, y = effect)) +
geom_bar(stat = "identity") +
labs(x = NULL, y = "Explained effect") +
theme_bw() +
facet_wrap(facets = ~Cell.population.sorted.1) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
Substrack pre expression
# merge replicate samples
flagDF <- pData(eset) %>%
rownames_to_column() %>%
select(Monkey.ID, Timepoint, Cell.population.sorted.1, rowname) %>%
pivot_wider(names_from = Timepoint, values_from = rowname) %>%
pivot_longer(cols = c("D14", "wk8"),
names_to = "Timepoint",
values_to = "rowname") %>%
filter(!is.na(pre) & !is.na(rowname))
esetBaselined <- eset[, flagDF$rowname]
baselinedMat <- normCounts(eset)[, flagDF$rowname] -
normCounts(eset)[, flagDF$pre]
normCounts(esetBaselined) <- baselinedMat
save(esetBaselined, file = file.path(workDir, "output/joana2.esetBaselined.RData"))
print(esetBaselined)
## SeqExpressionSet (storageMode: lockedEnvironment)
## assayData: 14986 features, 258 samples
## element names: counts, normalizedCounts, offset
## protocolData: none
## phenoData
## sampleNames: DGA7_D14_CD4 DGA7_wk8_CD4 ... DHBC_wk8_nonfCD8 (258
## total)
## varLabels: Monkey.ID Collaborator ... TotalReads (31 total)
## varMetadata: labelDescription
## featureData
## featureNames: ENSMMUG00000000001 ENSMMUG00000000002 ...
## ENSMMUG00000065387 (14986 total)
## fvarLabels: seqname strand ... gene_biotype (5 total)
## fvarMetadata: labelDescription
## experimentData: use 'experimentData(object)'
## Annotation:
Principal variance component analysis per subset (on baselined expression)
plotDF <- NULL
for (SUBSET in unique(esetBaselined$Cell.population.sorted.1)) {
esetTemp <- esetBaselined[, esetBaselined$Cell.population.sorted.1 %in% SUBSET]
esetTemp <- ExpressionSet(assayData = normCounts(esetTemp),
phenoData = AnnotatedDataFrame(data = pData(esetTemp)))
fit <- PVCA(counts = exprs(esetTemp),
meta = pData(esetTemp)[, c("Timepoint",
"NHP.group",
"Monkey.ID")],
threshold = 0.6,
inter = FALSE)
plotTemp <- data.frame(effect = as.vector(fit),
label = names(fit)) %>%
arrange(effect) %>%
mutate(label = factor(label, levels = unique(label)),
Cell.population.sorted.1 = SUBSET)
plotDF <- rbind(plotDF, plotTemp)
}
ggplot(data = plotDF,
mapping = aes(x = label, y = effect)) +
geom_bar(stat = "identity") +
labs(x = NULL, y = "Explained effect") +
theme_bw() +
facet_wrap(facets = ~Cell.population.sorted.1) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
Multidimensional scalling plot on baselined counts by subsets
plotDF <- NULL
for (SUBSET in unique(esetBaselined$Cell.population.sorted.1)) {
# mds normalized/cpm counts
esetTemp <- esetBaselined[, esetBaselined$Cell.population.sorted.1 %in% SUBSET]
mat <- normCounts(esetTemp)
distMat <- parallelDist(t(mat), threads = detectCores() - 1)
attributes(distMat)$Labels <- colnames(mat)
fit <- cmdscale(distMat, k = 2, eig = TRUE)
as.data.frame(fit$points) %>%
rownames_to_column() %>%
merge(y = rownames_to_column(pData(eset)),
by = "rowname") %>%
rbind(plotDF, .) -> plotDF
}
ggplot(data = plotDF,
mapping = aes(x = V1,
y = V2,
color = Timepoint,
shape = NHP.group)) +
geom_point(size = 4) +
facet_wrap(facets = ~Cell.population.sorted.1, scale = "free") +
theme_bw() +
theme(legend.key = element_blank(),
plot.title = element_text(hjust = 0.5))
Differential expression between timepoints (paired)
fits <- list()
# vaccine effect
for (SUBSET in unique(eset$Cell.population.sorted.1)) {
esetTemp <- eset[, eset$Cell.population.sorted.1 %in% SUBSET]
dge <- DGEList(counts = counts(esetTemp),
samples = pData(esetTemp),
genes = fData(esetTemp))
goi <- interaction(gsub(pattern = " .+",
replacement = "",
esetTemp$NHP.group),
esetTemp$Timepoint,
drop = TRUE)
donor <- esetTemp$Monkey.ID
designMat <- model.matrix(~0+goi)
rownames(designMat) <- sampleNames(esetTemp)
colnames(designMat) <- gsub(pattern = "goi",
replacement = "",
colnames(designMat))
attr(designMat, "assign") <- attr(designMat, "contrasts") <- NULL
v <- voom(counts = dge, design = designMat)
corfit <- duplicateCorrelation(v$E,
design = designMat,
block = donor)
fit <- lmFit(v,
design = designMat,
block = donor,
correlation = corfit$consensus)
contrastLS <- grep(pattern = "pre", colnames(fit), value = TRUE, invert = TRUE) %>%
paste0(., "-", gsub(pattern = "\\..+", replacement = ".pre", .))
contrastMat <- makeContrasts(contrasts = contrastLS, levels = fit$design)
fit2 <- contrasts.fit(fit = fit, contrasts = contrastMat)
fit2 <- eBayes(fit = fit2)
fits[[paste0(SUBSET, "_vax")]] <- list(fit = fit, fit2 = fit2)
}
# vaccine vs untreated
for (SUBSET in unique(esetBaselined$Cell.population.sorted.1)) {
esetTemp <- esetBaselined[, esetBaselined$Cell.population.sorted.1 %in% SUBSET]
goi <- interaction(gsub(pattern = " .+",
replacement = "",
esetTemp$NHP.group),
esetTemp$Timepoint,
drop = TRUE)
designMat <- model.matrix(~0+goi)
rownames(designMat) <- sampleNames(esetTemp)
colnames(designMat) <- gsub(pattern = "goi",
replacement = "",
colnames(designMat))
attr(designMat, "assign") <- attr(designMat, "contrasts") <- NULL
dge <- ExpressionSet(assayData =normCounts(esetTemp),
phenoData = AnnotatedDataFrame(pData(esetTemp)),
featureData = AnnotatedDataFrame(fData(esetTemp)))
fit <- lmFit(dge,
design = designMat)
contrastLS <- grep(pattern = "UnTx", colnames(fit), value = TRUE, invert = TRUE) %>%
paste0(., "-", gsub(pattern = ".+\\.", replacement = "UnTx.", .))
contrastMat <- makeContrasts(contrasts = contrastLS, levels = fit$design)
fit2 <- contrasts.fit(fit = fit, contrasts = contrastMat)
fit2 <- eBayes(fit = fit2)
fits[[paste0(SUBSET, "_untx")]] <- list(fit = fit, fit2 = fit2)
}
# save fits
save(fits, file = file.path(workDir, "output/joana2.fits.RData"))
# print number of genes differently expressed
degNbDF <- NULL
for (modelName in names(fits)) {
fit2 <- fits[[modelName]][["fit2"]]
for (coefName in colnames(fit2)) {
top <- topTable(fit = fit2, coef = coefName, number = Inf) %>%
as.data.frame() %>%
filter(gene_biotype %in% "protein_coding") %>%
group_by(sign(logFC)) %>%
summarize(p = sum(P.Value <= 0.05),
adj.p = sum(adj.P.Val <= 0.05)) %>%
mutate(modelName = modelName,
contrast = coefName) %>%
rbind(degNbDF) -> degNbDF
}
}
degNbDF %>%
pivot_longer(cols = c(-modelName, -contrast, -`sign(logFC)`),
names_to = "cname",
values_to = "n") %>%
mutate(`sign(logFC)` = c("-1" = "DN", "1" = "UP")[as.character(`sign(logFC)`)],
cname = paste0(cname, `sign(logFC)`)) %>%
select(-`sign(logFC)`) %>%
pivot_wider(names_from = cname, values_from = n) %>%
mutate(p = paste0(pUP, "/", pDN),
adj.p = paste0(adj.pUP, "/", adj.pDN)) %>%
select(modelName, contrast, p, adj.p) %>%
as.data.frame() %>%
kable()
modelName | contrast | p | adj.p |
---|---|---|---|
CD8_untx | WT.wk8-UnTx.wk8 | 542/710 | 0/0 |
CD8_untx | DEL.wk8-UnTx.wk8 | 616/698 | 0/0 |
CD8_untx | WT.D14-UnTx.D14 | 342/577 | 0/0 |
CD8_untx | DEL.D14-UnTx.D14 | 258/499 | 0/0 |
nonfCD8_untx | WT.wk8-UnTx.wk8 | 280/243 | 0/0 |
nonfCD8_untx | DEL.wk8-UnTx.wk8 | 521/373 | 0/0 |
nonfCD8_untx | WT.D14-UnTx.D14 | 381/409 | 0/0 |
nonfCD8_untx | DEL.D14-UnTx.D14 | 225/335 | 0/0 |
nonTfh_untx | WT.wk8-UnTx.wk8 | 325/357 | 0/0 |
nonTfh_untx | DEL.wk8-UnTx.wk8 | 379/346 | 0/0 |
nonTfh_untx | WT.D14-UnTx.D14 | 309/415 | 0/0 |
nonTfh_untx | DEL.D14-UnTx.D14 | 598/554 | 0/0 |
GCTfh_untx | WT.wk8-UnTx.wk8 | 669/726 | 0/0 |
GCTfh_untx | DEL.wk8-UnTx.wk8 | 381/365 | 0/0 |
GCTfh_untx | WT.D14-UnTx.D14 | 358/280 | 0/0 |
GCTfh_untx | DEL.D14-UnTx.D14 | 478/312 | 0/0 |
fCD8_untx | WT.wk8-UnTx.wk8 | 403/526 | 0/1 |
fCD8_untx | DEL.wk8-UnTx.wk8 | 672/857 | 0/0 |
fCD8_untx | WT.D14-UnTx.D14 | 336/363 | 0/0 |
fCD8_untx | DEL.D14-UnTx.D14 | 241/330 | 0/0 |
CD8NA_untx | WT.wk8-UnTx.wk8 | 168/275 | 0/0 |
CD8NA_untx | DEL.wk8-UnTx.wk8 | 281/337 | 0/0 |
CD8NA_untx | WT.D14-UnTx.D14 | 236/381 | 0/0 |
CD8NA_untx | DEL.D14-UnTx.D14 | 282/381 | 0/0 |
CD8EM_untx | WT.wk8-UnTx.wk8 | 325/286 | 0/0 |
CD8EM_untx | DEL.wk8-UnTx.wk8 | 727/1079 | 2/1 |
CD8EM_untx | WT.D14-UnTx.D14 | 640/454 | 0/0 |
CD8EM_untx | DEL.D14-UnTx.D14 | 493/389 | 0/0 |
CD8CM_untx | WT.wk8-UnTx.wk8 | 465/421 | 0/0 |
CD8CM_untx | DEL.wk8-UnTx.wk8 | 463/494 | 0/0 |
CD8CM_untx | WT.D14-UnTx.D14 | 429/257 | 0/0 |
CD8CM_untx | DEL.D14-UnTx.D14 | 333/286 | 0/0 |
CD4NA_untx | WT.wk8-UnTx.wk8 | 305/439 | 0/0 |
CD4NA_untx | DEL.wk8-UnTx.wk8 | 308/367 | 0/0 |
CD4NA_untx | WT.D14-UnTx.D14 | 345/318 | 0/0 |
CD4NA_untx | DEL.D14-UnTx.D14 | 345/341 | 0/0 |
CD4MEM_untx | WT.wk8-UnTx.wk8 | 282/281 | 0/0 |
CD4MEM_untx | DEL.wk8-UnTx.wk8 | 363/414 | 0/0 |
CD4MEM_untx | WT.D14-UnTx.D14 | 497/406 | 0/1 |
CD4MEM_untx | DEL.D14-UnTx.D14 | 895/592 | 0/0 |
CD4_untx | WT.wk8-UnTx.wk8 | 436/426 | 0/0 |
CD4_untx | DEL.wk8-UnTx.wk8 | 307/405 | 0/0 |
CD4_untx | WT.D14-UnTx.D14 | 236/365 | 0/0 |
CD4_untx | DEL.D14-UnTx.D14 | 222/283 | 0/0 |
nonfCD8_vax | WT.wk8-WT.pre | 1180/444 | 6/9 |
nonfCD8_vax | UnTx.wk8-UnTx.pre | 1726/751 | 230/99 |
nonfCD8_vax | DEL.wk8-DEL.pre | 1940/764 | 155/60 |
nonfCD8_vax | WT.D14-WT.pre | 290/345 | 0/0 |
nonfCD8_vax | UnTx.D14-UnTx.pre | 349/394 | 3/0 |
nonfCD8_vax | DEL.D14-DEL.pre | 235/628 | 0/0 |
nonTfh_vax | WT.wk8-WT.pre | 1094/484 | 9/35 |
nonTfh_vax | UnTx.wk8-UnTx.pre | 1708/903 | 334/197 |
nonTfh_vax | DEL.wk8-DEL.pre | 795/542 | 0/4 |
nonTfh_vax | WT.D14-WT.pre | 257/403 | 0/0 |
nonTfh_vax | UnTx.D14-UnTx.pre | 567/554 | 0/0 |
nonTfh_vax | DEL.D14-DEL.pre | 268/362 | 0/0 |
GCTfh_vax | WT.wk8-WT.pre | 1053/608 | 21/54 |
GCTfh_vax | UnTx.wk8-UnTx.pre | 2361/1328 | 984/520 |
GCTfh_vax | DEL.wk8-DEL.pre | 1649/881 | 187/101 |
GCTfh_vax | WT.D14-WT.pre | 317/181 | 2/0 |
GCTfh_vax | UnTx.D14-UnTx.pre | 491/408 | 0/0 |
GCTfh_vax | DEL.D14-DEL.pre | 242/180 | 0/0 |
fCD8_vax | WT.wk8-WT.pre | 1333/1084 | 135/176 |
fCD8_vax | UnTx.wk8-UnTx.pre | 1936/1394 | 739/567 |
fCD8_vax | DEL.wk8-DEL.pre | 1331/583 | 35/22 |
fCD8_vax | WT.D14-WT.pre | 324/774 | 10/0 |
fCD8_vax | UnTx.D14-UnTx.pre | 539/999 | 1/3 |
fCD8_vax | DEL.D14-DEL.pre | 250/744 | 0/0 |
CD8NA_vax | WT.wk8-WT.pre | 655/487 | 12/17 |
CD8NA_vax | UnTx.wk8-UnTx.pre | 1295/618 | 173/88 |
CD8NA_vax | DEL.wk8-DEL.pre | 1053/432 | 18/2 |
CD8NA_vax | WT.D14-WT.pre | 226/323 | 2/0 |
CD8NA_vax | UnTx.D14-UnTx.pre | 473/414 | 0/0 |
CD8NA_vax | DEL.D14-DEL.pre | 977/386 | 1/0 |
CD8EM_vax | WT.wk8-WT.pre | 836/518 | 7/19 |
CD8EM_vax | UnTx.wk8-UnTx.pre | 1469/877 | 178/123 |
CD8EM_vax | DEL.wk8-DEL.pre | 1720/1041 | 140/86 |
CD8EM_vax | WT.D14-WT.pre | 216/310 | 0/0 |
CD8EM_vax | UnTx.D14-UnTx.pre | 626/935 | 0/0 |
CD8EM_vax | DEL.D14-DEL.pre | 138/396 | 0/0 |
CD8CM_vax | WT.wk8-WT.pre | 821/440 | 3/16 |
CD8CM_vax | UnTx.wk8-UnTx.pre | 2309/646 | 421/111 |
CD8CM_vax | DEL.wk8-DEL.pre | 545/270 | 0/2 |
CD8CM_vax | WT.D14-WT.pre | 244/251 | 0/0 |
CD8CM_vax | UnTx.D14-UnTx.pre | 482/585 | 0/0 |
CD8CM_vax | DEL.D14-DEL.pre | 181/259 | 0/0 |
CD8_vax | WT.wk8-WT.pre | 945/558 | 21/45 |
CD8_vax | UnTx.wk8-UnTx.pre | 1499/781 | 121/63 |
CD8_vax | DEL.wk8-DEL.pre | 1041/373 | 13/9 |
CD8_vax | WT.D14-WT.pre | 395/423 | 9/0 |
CD8_vax | UnTx.D14-UnTx.pre | 590/415 | 0/0 |
CD8_vax | DEL.D14-DEL.pre | 821/908 | 46/8 |
CD4NA_vax | WT.wk8-WT.pre | 865/951 | 25/69 |
CD4NA_vax | UnTx.wk8-UnTx.pre | 1256/910 | 181/164 |
CD4NA_vax | DEL.wk8-DEL.pre | 742/446 | 19/19 |
CD4NA_vax | WT.D14-WT.pre | 301/209 | 1/0 |
CD4NA_vax | UnTx.D14-UnTx.pre | 417/412 | 0/0 |
CD4NA_vax | DEL.D14-DEL.pre | 253/465 | 0/0 |
CD4MEM_vax | WT.wk8-WT.pre | 1048/305 | 3/11 |
CD4MEM_vax | UnTx.wk8-UnTx.pre | 1251/758 | 94/69 |
CD4MEM_vax | DEL.wk8-DEL.pre | 1198/348 | 5/2 |
CD4MEM_vax | WT.D14-WT.pre | 237/130 | 2/0 |
CD4MEM_vax | UnTx.D14-UnTx.pre | 524/567 | 0/0 |
CD4MEM_vax | DEL.D14-DEL.pre | 467/323 | 0/0 |
CD4_vax | WT.wk8-WT.pre | 1261/844 | 36/70 |
CD4_vax | UnTx.wk8-UnTx.pre | 1631/802 | 276/152 |
CD4_vax | DEL.wk8-DEL.pre | 885/383 | 4/8 |
CD4_vax | WT.D14-WT.pre | 307/260 | 1/0 |
CD4_vax | UnTx.D14-UnTx.pre | 517/467 | 0/0 |
CD4_vax | DEL.D14-DEL.pre | 441/372 | 0/0 |
Heatmaps with DEGs
for (modelName in grep(pattern = "_vax", names(fits), value = TRUE)) {
fit2 <- fits[[modelName]][["fit2"]]
for (coefName in colnames(fit2)) {
top <- topTable(fit = fit2, coef = coefName, number = Inf) %>%
as.data.frame() %>%
filter(!is.na(gene_name) &
gene_biotype %in% "protein_coding" &
adj.P.Val <= 0.05)
if (nrow(top) > 0) {
top <- top %>%
top_n(50, wt = abs(t)) %>%
arrange(desc(logFC))
sampleLS <- which(fit2$contrast[, coefName] != 0) %>%
fit2$design[, .] %>%
as.data.frame() %>%
rownames_to_column() %>%
gather(group, value, -rowname) %>%
group_by(rowname) %>%
summarize(value = sum(value)) %>%
filter(value > 0) %>%
.$rowname
mat <- normCounts(eset)[top$gene_id, sampleLS, drop = FALSE] %>%
t() %>%
scale() %>%
t()
mat <- mat[complete.cases(mat), , drop = FALSE]
colorLS <- colorRampPalette(colors = c("blue", "white", "red"))(n = 100)
breakLS <- c(-1 * max(abs(mat), na.rm = TRUE),
seq(from = -1 * min(abs(range(mat, na.rm = TRUE))),
to = min(abs(range(mat, na.rm = TRUE))),
length.out = 99),
max(abs(mat), na.rm = TRUE))
matAnnot <- pData(eset) %>%
rownames_to_column() %>%
select(rowname, Timepoint, NHP.group, Cell.population.sorted.1) %>%
column_to_rownames()
matAnnot <- matAnnot[colnames(mat), ]
matAnnot <- matAnnot[, colMeans(is.na(matAnnot)) < 1]
ha <- HeatmapAnnotation(df = matAnnot)
heat <- Heatmap(matrix = mat,
width = unit(0.5, units = "npc"),
height = unit(0.1+0.6*(nrow(mat)/50), units = "npc"),
top_annotation = ha,
name = "zscore",
column_title = paste0(modelName, ": ", coefName),
row_names_gp = gpar(fontsize = 7),
column_labels = pData(eset)[colnames(mat), "Monkey.ID"],
row_labels = top$gene_name,
use_raster = TRUE)
print(heat)
}
}
}
for (modelName in grep(pattern = "_untx", names(fits), value = TRUE)) {
fit2 <- fits[[modelName]][["fit2"]]
for (coefName in colnames(fit2)) {
top <- topTable(fit = fit2, coef = coefName, number = Inf) %>%
as.data.frame() %>%
filter(!is.na(gene_name) &
gene_biotype %in% "protein_coding" &
adj.P.Val <= 0.05)
if (nrow(top) > 0) {
top <- top %>%
top_n(50, wt = abs(t)) %>%
arrange(desc(logFC))
sampleLS <- which(fit2$contrast[, coefName] != 0) %>%
fit2$design[, .] %>%
as.data.frame() %>%
rownames_to_column() %>%
gather(group, value, -rowname) %>%
group_by(rowname) %>%
summarize(value = sum(value)) %>%
filter(value > 0) %>%
.$rowname
mat <- normCounts(esetBaselined)[top$gene_id, sampleLS, drop = FALSE]
mat <- mat[complete.cases(mat), , drop = FALSE]
colorLS <- colorRampPalette(colors = c("blue", "white", "red"))(n = 100)
breakLS <- c(-1 * max(abs(mat), na.rm = TRUE),
seq(from = -1 * min(abs(range(mat, na.rm = TRUE))),
to = min(abs(range(mat, na.rm = TRUE))),
length.out = 99),
max(abs(mat), na.rm = TRUE))
matAnnot <- pData(esetBaselined) %>%
rownames_to_column() %>%
select(rowname, Timepoint, NHP.group, Cell.population.sorted.1) %>%
column_to_rownames()
matAnnot <- matAnnot[colnames(mat), ]
matAnnot <- matAnnot[, colMeans(is.na(matAnnot)) < 1]
ha <- HeatmapAnnotation(df = matAnnot)
heat <- Heatmap(matrix = mat,
width = unit(0.5, units = "npc"),
height = unit(0.1+0.6*(nrow(mat)/50), units = "npc"),
top_annotation = ha,
name = "zscore",
column_title = paste0(modelName, ": ", coefName),
row_names_gp = gpar(fontsize = 7),
column_labels = pData(eset)[colnames(mat), "Monkey.ID"],
row_labels = top$gene_name,
use_raster = TRUE)
print(heat)
}
}
}
Heatmaps with all DEGs across time by subset
message("FDR cutoff of 5%")
for (modelName in grep(pattern = "vax", names(fits), value = TRUE)) {
fit2 <- fits[[modelName]][["fit2"]]
top <- topTable(fit2, number = Inf, p.value = 0.05) %>%
as.data.frame() %>%
filter(gene_biotype %in% "protein_coding")
if (nrow(top) > 0) {
top <- top %>%
arrange(desc(WT.D14.WT.pre)) %>%
mutate(gene_name = ifelse(test = is.na(gene_name),
yes = gene_id,
no = gene_name))
sampleLS <- rownames(fit2$design)
sampleLS <- intersect(sampleLS, sampleNames(esetBaselined))
mat <- normCounts(esetBaselined)[top$gene_id, sampleLS, drop = FALSE]
mat <- mat[complete.cases(mat), ]
matAnnot <- pData(eset) %>%
rownames_to_column() %>%
select(rowname, Timepoint, NHP.group, Cell.population.sorted.1) %>%
column_to_rownames()
matAnnot <- matAnnot[colnames(mat), ]
ha <- HeatmapAnnotation(df = matAnnot,
col = list(NHP.group = c("UnTx" = "grey", "WT bNAb Tx" = "orange", "DEL bNAb Tx" = "green")))
splitCols <- pData(eset)[colnames(mat),
c("Timepoint", "NHP.group")] %>%
apply(MARGIN = 1, FUN = paste, collapse = ".")
heat <- Heatmap(matrix = mat,
width = unit(0.5, units = "npc"),
height = unit(0.75, units = "npc"),
top_annotation = ha,
name = "log2FC",
column_title = modelName,
show_row_names = FALSE,
column_labels = pData(eset)[colnames(mat), "Monkey.ID"],
column_split = splitCols,
use_raster = TRUE)
print(heat)
}
}
message("FDR cutoff of 5%")
for (modelName in grep(pattern = "untx", names(fits), value = TRUE)) {
fit2 <- fits[[modelName]][["fit2"]]
top <- topTable(fit2, number = Inf, p.value = 0.05) %>%
as.data.frame()
if (nrow(top) > 0) {
top <- top %>%
filter(gene_biotype %in% "protein_coding")
}
if (nrow(top) > 0) {
top <- top %>%
arrange(desc(WT.D14.UnTx.D14)) %>%
mutate(gene_name = ifelse(test = is.na(gene_name),
yes = gene_id,
no = gene_name))
sampleLS <- rownames(fit2$design)
sampleLS <- intersect(sampleLS, sampleNames(esetBaselined))
mat <- normCounts(esetBaselined)[top$gene_id, sampleLS, drop = FALSE]
mat <- mat[complete.cases(mat), ]
matAnnot <- pData(eset) %>%
rownames_to_column() %>%
select(rowname, Timepoint, NHP.group, Cell.population.sorted.1) %>%
column_to_rownames()
matAnnot <- matAnnot[colnames(mat), ]
ha <- HeatmapAnnotation(df = matAnnot,
col = list(NHP.group = c("UnTx" = "grey", "WT bNAb Tx" = "orange", "DEL bNAb Tx" = "green")))
splitCols <- pData(eset)[colnames(mat),
c("Timepoint", "NHP.group")] %>%
apply(MARGIN = 1, FUN = paste, collapse = ".")
heat <- Heatmap(matrix = mat,
width = unit(0.5, units = "npc"),
height = unit(0.75, units = "npc"),
top_annotation = ha,
name = "log2FC",
column_title = modelName,
row_labels = top$gene_name,
column_labels = pData(eset)[colnames(mat),
"Monkey.ID"],
column_split = splitCols)
print(heat)
}
}
sessionInfo()
## R version 4.3.2 (2023-10-31)
## Platform: aarch64-apple-darwin23.0.0 (64-bit)
## Running under: macOS Sonoma 14.3
##
## Matrix products: default
## BLAS: /opt/homebrew/Cellar/openblas/0.3.26/lib/libopenblasp-r0.3.26.dylib
## LAPACK: /opt/homebrew/Cellar/r/4.3.2/lib/R/lib/libRlapack.dylib; LAPACK version 3.11.0
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## time zone: America/Chicago
## tzcode source: internal
##
## attached base packages:
## [1] grid parallel stats4 stats graphics grDevices utils
## [8] datasets methods base
##
## other attached packages:
## [1] lubridate_1.9.3 forcats_1.0.0
## [3] stringr_1.5.1 dplyr_1.1.4
## [5] purrr_1.0.2 readr_2.1.5
## [7] tidyr_1.3.1 tibble_3.2.1
## [9] tidyverse_2.0.0 ComplexHeatmap_2.18.0
## [11] ggbeeswarm_0.7.2 ggplot2_3.4.4
## [13] edgeR_4.0.14 limma_3.58.1
## [15] parallelDist_0.2.6 lme4_1.1-35.1
## [17] Matrix_1.6-5 pvca_0.1.0
## [19] EDASeq_2.36.0 ShortRead_1.60.0
## [21] GenomicAlignments_1.38.2 SummarizedExperiment_1.32.0
## [23] MatrixGenerics_1.14.0 matrixStats_1.2.0
## [25] Rsamtools_2.18.0 GenomicRanges_1.54.1
## [27] Biostrings_2.70.2 GenomeInfoDb_1.38.5
## [29] XVector_0.42.0 IRanges_2.36.0
## [31] S4Vectors_0.40.2 BiocParallel_1.36.0
## [33] Biobase_2.62.0 BiocGenerics_0.48.1
## [35] readxl_1.4.3 kableExtra_1.4.0
## [37] knitr_1.45
##
## loaded via a namespace (and not attached):
## [1] RColorBrewer_1.1-3 shape_1.4.6 rstudioapi_0.15.0
## [4] magrittr_2.0.3 magick_2.8.2 GenomicFeatures_1.54.1
## [7] farver_2.1.1 nloptr_2.0.3 rmarkdown_2.25
## [10] GlobalOptions_0.1.2 BiocIO_1.12.0 zlibbioc_1.48.0
## [13] vctrs_0.6.5 memoise_2.0.1 minqa_1.2.6
## [16] RCurl_1.98-1.14 htmltools_0.5.7 S4Arrays_1.2.0
## [19] progress_1.2.3 curl_5.2.0 cellranger_1.1.0
## [22] SparseArray_1.2.3 cachem_1.0.8 iterators_1.0.14
## [25] lifecycle_1.0.4 pkgconfig_2.0.3 R6_2.5.1
## [28] fastmap_1.1.1 clue_0.3-65 GenomeInfoDbData_1.2.11
## [31] digest_0.6.34 colorspace_2.1-0 AnnotationDbi_1.64.1
## [34] RSQLite_2.3.5 hwriter_1.3.2.1 labeling_0.4.3
## [37] filelock_1.0.3 timechange_0.3.0 fansi_1.0.6
## [40] httr_1.4.7 abind_1.4-5 compiler_4.3.2
## [43] doParallel_1.0.17 bit64_4.0.5 withr_3.0.0
## [46] DBI_1.2.1 highr_0.10 R.utils_2.12.3
## [49] biomaRt_2.58.1 MASS_7.3-60.0.1 rappdirs_0.3.3
## [52] DelayedArray_0.28.0 rjson_0.2.21 tools_4.3.2
## [55] vipor_0.4.7 beeswarm_0.4.0 R.oo_1.26.0
## [58] glue_1.7.0 restfulr_0.0.15 nlme_3.1-164
## [61] cluster_2.1.6 generics_0.1.3 gtable_0.3.4
## [64] tzdb_0.4.0 R.methodsS3_1.8.2 hms_1.1.3
## [67] xml2_1.3.6 utf8_1.2.4 foreach_1.5.2
## [70] pillar_1.9.0 vroom_1.6.5 circlize_0.4.15
## [73] splines_4.3.2 BiocFileCache_2.10.1 lattice_0.22-5
## [76] aroma.light_3.32.0 rtracklayer_1.62.0 bit_4.0.5
## [79] deldir_2.0-2 tidyselect_1.2.0 locfit_1.5-9.8
## [82] svglite_2.1.3 xfun_0.41 statmod_1.5.0
## [85] stringi_1.8.3 yaml_2.3.8 boot_1.3-28.1
## [88] evaluate_0.23 codetools_0.2-19 interp_1.1-6
## [91] cli_3.6.2 RcppParallel_5.1.7 systemfonts_1.0.5
## [94] munsell_0.5.0 Rcpp_1.0.12 dbplyr_2.4.0
## [97] png_0.1-8 XML_3.99-0.16.1 blob_1.2.4
## [100] prettyunits_1.2.0 latticeExtra_0.6-30 jpeg_0.1-10
## [103] bitops_1.0-7 viridisLite_0.4.2 scales_1.3.0
## [106] crayon_1.5.2 GetoptLong_1.0.5 rlang_1.1.3
## [109] KEGGREST_1.42.0