Dias/Koup preprocessing of RNA-Seq data (set 2)

Slim Fourati 2024-01-31

Load packages

suppressPackageStartupMessages(library(package = "knitr"))
suppressPackageStartupMessages(library(package = "kableExtra"))
suppressPackageStartupMessages(library(package = "readxl"))
suppressPackageStartupMessages(library(package = "EDASeq"))
suppressPackageStartupMessages(library(package = "pvca")) # dleelab/pvca
suppressPackageStartupMessages(library(package = "lme4"))
suppressPackageStartupMessages(library(package = "parallelDist"))
suppressPackageStartupMessages(library(package = "parallel"))
suppressPackageStartupMessages(library(package = "edgeR"))
suppressPackageStartupMessages(library(package = "ggbeeswarm"))
suppressPackageStartupMessages(library(package = "ComplexHeatmap"))
suppressPackageStartupMessages(library(package = "tidyverse"))

opts_chunk$set(echo = TRUE, fig.path = "../figure/")
options(readr.show_col_types   = FALSE,
        dplyr.summarise.inform = FALSE)
workDir <- dirname(getwd())

Reading raw counts

countDF <- read_csv(file = file.path(workDir, "input/joana2.genecounts.csv")) %>%
  as.data.frame() %>%
  column_to_rownames(var = "gene_id")

message("First five rows/columns of count matrix")

## First five rows/columns of count matrix

countDF[1:5, 1:5] %>%
  kable()

	DGA7_D14_CD4	DGA7_D14_CD4MEM	DGA7_D14_CD4NA	DGA7_D14_CD8	DGA7_D14_CD8CM
ENSMMUG00000000001	31	121	60	71	66
ENSMMUG00000000002	2	10	11	6	5
ENSMMUG00000000005	177	395	110	216	297
ENSMMUG00000000006	0	1	0	0	0
ENSMMUG00000000007	4	20	36	35	33

message("Dimensions of the count matrix")

## Dimensions of the count matrix

print(dim(countDF))

## [1] 35432   396

Read sample annotation

sampleAnnotFile <- file.path(workDir,
                             "input",
                             "JoanaDias_Bulk_RNAseq_sampleTracking_06012021.xlsx")
# read each plates
sampleAnnotDF <- read_excel(path = sampleAnnotFile,
                            .name_repair = function(x) make.unique(make.names(x)))
# append treatment info
treatFile <- file.path(workDir,
                       "input",
                       "RNAseq_II_for_Slim_20210829.slide3.txt")
treatDF <- read_tsv(file = treatFile, name_repair = make.names)
sampleAnnotDF <- merge(x = sampleAnnotDF, y = treatDF, by = "Monkey.ID")
# append sample id
sampleAnnotDF <- sampleAnnotDF %>%
  mutate(rowname = paste(Monkey.ID,
                         Timepoint,
                         Cell.population.sorted.1,
                         sep = "_"),
         rowname = gsub(pattern = " ", replacement = "", rowname)) %>%
   as.data.frame() %>%
  column_to_rownames(var = "rowname")
sampleAnnotDF <- sampleAnnotDF[colnames(countDF), ]
# TotalReads counted
sampleAnnotDF$TotalReads <- colSums(countDF)

message("print header of sample annotation")

## print header of sample annotation

sampleAnnotDF[1:5, 1:5] %>%
  kable()

	Monkey.ID	Collaborator	Sample.	Plate.	Sample.Type.Subset
DGA7_D14_CD4	DGA7	Joana Dias/Koup Lab	117	2	Total RNA from sorted cells in RNAzol
DGA7_D14_CD4MEM	DGA7	Joana Dias/Koup Lab	119	2	Total RNA from sorted cells in RNAzol
DGA7_D14_CD4NA	DGA7	Joana Dias/Koup Lab	118	2	Total RNA from sorted cells in RNAzol
DGA7_D14_CD8	DGA7	Joana Dias/Koup Lab	111	2	Total RNA from sorted cells in RNAzol
DGA7_D14_CD8CM	DGA7	Joana Dias/Koup Lab	116	2	Total RNA from sorted cells in RNAzol

message("Number of animal w/ RNA-seq data")

## Number of animal w/ RNA-seq data

sampleAnnotDF %>%
  .$Monkey.ID %>%
  unique() %>%
  length() %>%
  print()

## [1] 12

message("Number of timepoints and subsets w/ RNA-seq data")

## Number of timepoints and subsets w/ RNA-seq data

sampleAnnotDF %>%
  select(Monkey.ID, Timepoint, Cell.population.sorted.1) %>%
  distinct() %>%
  group_by(Timepoint, Cell.population.sorted.1) %>%
  summarize(n = n()) %>%
  ungroup() %>%
  mutate(Timepoint = factor(Timepoint, levels = c("pre", "D14", "wk8"))) %>%
  pivot_wider(names_from = Cell.population.sorted.1, values_from = n) %>%
  arrange(Timepoint) %>%
  kable()

Timepoint	CD4	CD4MEM	CD4NA	CD8	CD8CM	CD8EM	CD8NA	GCTfh	fCD8	nonTfh	nonfCD8
pre	12	12	12	12	12	12	12	12	12	12	12
D14	12	12	12	12	12	12	12	12	12	12	12
wk8	12	12	12	12	12	12	12	12	12	12	12

Transcriptomic profiles of 12 animals for 11 subsets at 3 timepoints was performed.

Read gene annotation

colNames <- c("seqname",
              "source",
              "feature",
              "start",
              "end",
              "score",
              "strand",
              "frame",
              "attribute")
colTypes <- paste(c(rep("c", times = 3),
                    rep("i", times = 2),
                    rep("c", times = 4)),
                  collapse = "")
featuresAnnotationFile <- file.path(workDir, "input/joana2.genes.gtf")
skipNb <- read_lines(file = featuresAnnotationFile)
skipNb <- sum(grepl(pattern = "^#", skipNb))

featuresAnnot <- read_tsv(file      = featuresAnnotationFile,
                          skip      = skipNb,
                          col_names = colNames,
                          col_types = colTypes)
# extract gene_id and transcript_id from attributes
featAnnot <- featuresAnnot %>%
  mutate(gene_id = gsub(pattern = ".*gene_id \"([^;]+)\";.+",
                        replacement = "\\1",
                        attribute),
         gene_name = ifelse(test = grepl(pattern = "gene_name",
                                         attribute),
                            yes = gsub(pattern = ".+gene_name \"([^;]+)\";.+",
                                       replacement = "\\1",
                                       attribute),
                            no  = NA),
         gene_biotype = ifelse(test = grepl(pattern = "gene_biotype",
                                         attribute),
                            yes = gsub(pattern = ".+gene_biotype \"([^;]+)\";.*",
                                       replacement = "\\1",
                                       attribute),
                            no  = NA)) %>%
  select(seqname, strand, gene_id, gene_name, gene_biotype) %>%
  distinct() %>%
  as.data.frame()
rownames(featAnnot) <- featAnnot$gene_id
featAnnot <- featAnnot[rownames(countDF), ]

cat("Number of genes annotated")

## Number of genes annotated

featAnnot %>%
    group_by(gene_biotype) %>%
    summarize(n = n()) %>%
    arrange(desc(n)) %>%
  kable()

gene_biotype	n
protein_coding	21591
lncRNA	4769
misc_RNA	3419
snRNA	1875
snoRNA	1326
pseudogene	679
Y_RNA	666
miRNA	626
rRNA	145
IG_V_gene	88
processed_pseudogene	88
TR_V_gene	48
scaRNA	43
Mt_tRNA	22
TR_J_gene	21
TR_C_gene	8
vault_RNA	7
ribozyme	4
IG_C_gene	3
IG_D_gene	2
Mt_rRNA	2

Build raw SeqExpressionSet

# build  raw ExpressionSet
esetRaw <- newSeqExpressionSet(counts      = as.matrix(countDF),
                               featureData = AnnotatedDataFrame(featAnnot),
                               phenoData   = AnnotatedDataFrame(sampleAnnotDF))
save(esetRaw, file = file.path(workDir, "output/joana2.esetRaw.RData"))
print(esetRaw)

## SeqExpressionSet (storageMode: lockedEnvironment)
## assayData: 35432 features, 396 samples 
##   element names: counts, normalizedCounts, offset 
## protocolData: none
## phenoData
##   sampleNames: DGA7_D14_CD4 DGA7_D14_CD4MEM ... DHBC_wk8_nonTfh (396
##     total)
##   varLabels: Monkey.ID Collaborator ... TotalReads (31 total)
##   varMetadata: labelDescription
## featureData
##   featureNames: ENSMMUG00000000001 ENSMMUG00000000002 ...
##     ENSMMUG00000065388 (35432 total)
##   fvarLabels: seqname strand ... gene_biotype (5 total)
##   fvarMetadata: labelDescription
## experimentData: use 'experimentData(object)'
## Annotation:

Barplot with read alignment statistics

statDF <- read_tsv(file      = file.path(workDir, "input/joana2.ReadStats.txt"),
                   col_names = FALSE) %>%
  distinct() %>%
  spread(X3, X2)
plotDF <- counts(esetRaw) %>% 
  colSums() %>%
  data.frame(Counted = .) %>%
  rownames_to_column() %>%
  merge(x = statDF, by.x = "X1", by.y = "rowname")

plotDF <- plotDF %>%
  mutate(Trimmed    = TotalReads - Surviving,
         NotMapped  = Surviving - UniqMapped - Multimapped,
         NotCounted = UniqMapped + Multimapped - Counted) %>%
  select(Trimmed, NotMapped, NotCounted, Counted, X1) %>%
  arrange(Counted) %>% 
  mutate(X1 = factor(X1, levels = unique(X1))) %>%
  gather(key, value, -X1)
ggplot(data = plotDF,
       mapping = aes(x = X1, y = value, fill = key)) +
  geom_bar(stat = "identity") +
  scale_y_continuous(expand = c(0, 0)) +
  labs(x = "Samples", y = "Number of reads") +
  theme_bw() +
  theme(axis.text.x = element_blank(),
        axis.ticks.x = element_blank(),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank())

message("number of samples with more than 2e06 counted reads")
plotDF %>%
  filter(key %in% "Counted") %>%
  mutate(over10e6 = value >= 2e06) %>%
  group_by(over10e6) %>%
  summarize(n())

## # A tibble: 2 × 2
##   over10e6 `n()`
##   <lgl>    <int>
## 1 FALSE        6
## 2 TRUE       390

message("6 samples below requirment for DEG analysis:")
pData(esetRaw)[as.vector(filter(plotDF, key %in% "Counted" & value < 2e06)$X1), ] %>%
  kable()

	Monkey.ID	Collaborator	Sample.	Plate.	Sample.Type.Subset	Study..	Timepoint	Cell.population.sorted	Cell.population.sorted.1	Sample.Well.or.tube.label	Number.of.Cells	Sample.Volume.ul.	Method.to.determine.RNA.conc.	RIN	Library.Preparation.Method	Library.Elution.volume	Index.well	Index.Primer.Name	Index.Primer.sequence	Final.library.conc…nM.	Final.avg.library.size..bp.	Date.Received	Pool..66.samples.pool.	Location.VRC	Illumina.Instrument	Flowcell.ID	Lane.sequenced	Date.Sequenced	Notes	NHP.group	TotalReads
DGA7_D14_nonfCD8	DGA7	Joana Dias/Koup Lab	114	2	Total RNA from sorted cells in RNAzol	730.6	D14	non-fCD8+	nonfCD8	B3	5000	10	NA	NA	NEBNext Ultra II RNA Library Preparation Kit	25	B3	P15	TGGTACAG	0.2215850	NA	NA	2	NA	NovaSeq 6000	HGH73DRXY	2	2021-06-29	NA	UnTx	55448
DGH1_wk8_CD8NA	DGH1	Joana Dias/Koup Lab	27	1	Total RNA from sorted cells in RNAzol	730.6	wk8	CD8+ naïve	CD8NA	C4	5000	10	NA	NA	NEBNext Ultra II RNA Library Preparation Kit	25	C4	P28	GTAACGAC	0.6170620	NA	NA	1	NA	NovaSeq 6000	HGH73DRXY	1	2021-06-29	NA	DEL bNAb Tx	245885
DHBC_D14_nonfCD8	DHBC	Joana Dias/Koup Lab	378	4	Total RNA from sorted cells in RNAzol	730.6	D14	non-fCD8+	nonfCD8	B12	5000	10	NA	NA	NEBNext Ultra II RNA Library Preparation Kit	25	B12	P24	ATTGCGTG	0.3831800	NA	NA	6	NA	NovaSeq 6000	HGH5LDRXY	2	2021-06-30	NA	DEL bNAb Tx	422567
DGA7_pre_CD8	DGA7	Joana Dias/Koup Lab	100	2	Total RNA from sorted cells in RNAzol	730.6	pre	total CD8+	CD8	D1	5000	10	NA	NA	NEBNext Ultra II RNA Library Preparation Kit	25	D1	P37	AGACCGTA	1.3271829	NA	NA	2	NA	NovaSeq 6000	HGH73DRXY	2	2021-06-29	NA	UnTx	456720
DGG6_wk8_CD8NA	DGG6	Joana Dias/Koup Lab	60	1	Total RNA from sorted cells in RNAzol	730.6	wk8	CD8+ naïve	CD8NA	D8	5000	10	NA	NA	NEBNext Ultra II RNA Library Preparation Kit	25	D8	P44	ATCTCGCT	0.6257944	NA	NA	1	NA	NovaSeq 6000	HGH73DRXY	1	2021-06-29	NA	UnTx	706247
DGG6_pre_CD8	DGG6	Joana Dias/Koup Lab	34	1	Total RNA from sorted cells in RNAzol	730.6	pre	total CD8+	CD8	B5	5000	10	NA	NA	NEBNext Ultra II RNA Library Preparation Kit	25	B5	P17	GCTATCCT	101.9640219	NA	NA	1	NA	NovaSeq 6000	HGH73DRXY	1	2021-06-29	NA	UnTx	967971

All samples except 6 have more than 2 million reads and can be used for diff. expression analysis.

Barplot with mapped reads statistics

plotDF <- counts(esetRaw) %>%
  as.data.frame() %>%
  rownames_to_column() %>%
  merge(y    = select(fData(esetRaw), gene_id, gene_biotype),
        by.x = "rowname",
        by.y = "gene_id") %>%
  mutate(gene_biotype = ifelse(test = grepl(pattern = "pseudogene",
                                            gene_biotype),
                               yes = "pseudogene",
                               no  = gene_biotype),
         gene_biotype = ifelse(test = gene_biotype %in% 
                                 c("miRNA", "lincRNA", "snoRNA",
                                   "sRNA", "snRNA", "vaultRNA",
                                   "3prime_overlapping_ncrna",
                                   "macro_lncRNA", "scaRNA"),
                               yes = "ncrna",
                               no = gene_biotype),
         gene_biotype = ifelse(test = gene_biotype %in% 
                                 c("antisense",
                                   "ncRNA",
                                   "Mt_rRNA",
                                   "Mt_tRNA",
                                   "protein_coding",
                                   "pseudogene",
                                   "ribozyme",
                                   "rRNA"),
                               yes = gene_biotype,
                               no  = "other")) %>%
  gather(sample, value, -rowname, -gene_biotype) %>%
  group_by(sample, gene_biotype) %>%
  summarize(eta = sum(value),
            mu  = mean(value),
            q2  = median(value)) %>%
  ungroup() %>%
  arrange(!grepl("Water", `sample`)) %>% 
  mutate(`sample` = factor(`sample`, levels = unique(`sample`)))

ggplot(data = plotDF,
       mapping = aes(x = sample, y = eta, fill = gene_biotype)) +
  geom_bar(stat = "identity") +
  scale_y_continuous(expand = c(0, 0)) +
  labs(x = "Samples", y = "Number of reads") +
  theme_bw() +
  theme(axis.text.x = element_blank(),
        axis.ticks.x = element_blank(),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank())

message("percentage of reads that are from protein coding")

## percentage of reads that are from protein coding

plotDF %>% 
  group_by(sample) %>%
  summarize(n = sum(eta)) %>%
  merge(y = plotDF) %>%
  filter(gene_biotype %in% "protein_coding") %>%
  mutate(perc_prot_cod = eta/n * 100) %>%
  summarize(type = "perc_prot_coding",
            min  = min(.$perc_prot_cod, na.rm = TRUE),
            max  = max(.$perc_prot_cod, na.rm = TRUE))

##               type      min     max
## 1 perc_prot_coding 35.21459 98.4649

As expected, most counted reads are mapped to protein coding genes.

Principal variance component analysis (raw)

esetTemp <- ExpressionSet(assayData = log2(counts(esetRaw) + 0.25),
                          phenoData = AnnotatedDataFrame(data = pData(esetRaw)))

esetTemp$TotalReadsQ3 <- cut(esetTemp$TotalReads,
                             breaks         = quantile(esetTemp$TotalReads,
                                                       probs = c(0,0.33, 0.66,1)),
                             include.lowest = TRUE,
                             right          = TRUE)
esetTemp$NumberCellsQ3 <- cut(esetTemp$Number.of.Cells,
                              breaks         = c(0, 1500, 3000, 5000),
                              include.lowest = TRUE,
                              right          = TRUE)
esetTemp$lowCountedReads <- esetTemp$TotalReads < 2e6

fit <- PVCA(counts = exprs(esetTemp),
            meta = pData(esetTemp)[, c("Timepoint",
                                       "NHP.group",
                             "Monkey.ID",
                             "Cell.population.sorted.1",
                             "NumberCellsQ3",
                             "TotalReadsQ3",
                             "lowCountedReads")],
     threshold = 0.6,
     inter = FALSE)

plotDF <- data.frame(effect = as.vector(fit),
                     label  = names(fit)) %>%
  arrange(effect) %>%
  mutate(label = factor(label, levels = label))
ggplot(data = plotDF,
       mapping = aes(x = label, y = effect)) +
  geom_bar(stat = "identity") +
  labs(x = NULL, y = "Explained effect") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))

Subset, samples with low counts reads and animals are the main drivers of the raw gene-expression.

Multidimensional scaling plot based on raw counts

rawMat <- counts(esetRaw)
mat <- rawMat[rowSums(rawMat) > 0, ]
mat <- log2(mat + 0.25)
distMat <- parallelDist(t(mat), threads = detectCores() - 1)
attributes(distMat)$Labels  <-  colnames(mat)
fit <- cmdscale(distMat, k = 2, eig = TRUE)
plotDF <- as.data.frame(fit$points)
plotDF <- plotDF %>%
    rownames_to_column() %>%
    merge(y = rownames_to_column(pData(esetRaw)),
          by = "rowname") %>%
  mutate(lowCountedReads = (TotalReads < 2e6))
ggplot(data    = plotDF,
                   mapping = aes(x     = V1,
                                 y     = V2,
                 color = Cell.population.sorted,
                 shape = lowCountedReads)) +
  geom_point(size = 4) +
  labs(x = paste0("1st dimension (",
                        round((fit$eig/sum(fit$eig))[1] * 100),
                        "%)"),
                    y = paste0("2nd dimension (",
                        round((fit$eig/sum(fit$eig))[2] * 100),
                        "%)")) +
  theme_bw()

filter(plotDF,V1 > 300)

##            rowname       V1        V2 Monkey.ID        Collaborator Sample.
## 1 DGA7_D14_nonfCD8 866.8445 -11.33078      DGA7 Joana Dias/Koup Lab     114
## 2     DGA7_pre_CD8 513.3055 310.34905      DGA7 Joana Dias/Koup Lab     100
## 3     DGG6_pre_CD8 351.2199 -38.84293      DGG6 Joana Dias/Koup Lab      34
## 4   DGG6_wk8_CD8NA 703.3137 150.16881      DGG6 Joana Dias/Koup Lab      60
## 5   DGH1_wk8_CD8NA 701.7878 149.36020      DGH1 Joana Dias/Koup Lab      27
## 6 DHBC_D14_nonfCD8 555.1679 -55.52795      DHBC Joana Dias/Koup Lab     378
##   Plate.                    Sample.Type.Subset Study.. Timepoint
## 1      2 Total RNA from sorted cells in RNAzol   730.6       D14
## 2      2 Total RNA from sorted cells in RNAzol   730.6       pre
## 3      1 Total RNA from sorted cells in RNAzol   730.6       pre
## 4      1 Total RNA from sorted cells in RNAzol   730.6       wk8
## 5      1 Total RNA from sorted cells in RNAzol   730.6       wk8
## 6      4 Total RNA from sorted cells in RNAzol   730.6       D14
##   Cell.population.sorted Cell.population.sorted.1 Sample.Well.or.tube.label
## 1              non-fCD8+                  nonfCD8                        B3
## 2             total CD8+                      CD8                        D1
## 3             total CD8+                      CD8                        B5
## 4             CD8+ naïve                    CD8NA                        D8
## 5             CD8+ naïve                    CD8NA                        C4
## 6              non-fCD8+                  nonfCD8                       B12
##   Number.of.Cells Sample.Volume.ul. Method.to.determine.RNA.conc. RIN
## 1            5000                10                            NA  NA
## 2            5000                10                            NA  NA
## 3            5000                10                            NA  NA
## 4            5000                10                            NA  NA
## 5            5000                10                            NA  NA
## 6            5000                10                            NA  NA
##                     Library.Preparation.Method Library.Elution.volume
## 1 NEBNext Ultra II RNA Library Preparation Kit                     25
## 2 NEBNext Ultra II RNA Library Preparation Kit                     25
## 3 NEBNext Ultra II RNA Library Preparation Kit                     25
## 4 NEBNext Ultra II RNA Library Preparation Kit                     25
## 5 NEBNext Ultra II RNA Library Preparation Kit                     25
## 6 NEBNext Ultra II RNA Library Preparation Kit                     25
##   Index.well Index.Primer.Name Index.Primer.sequence Final.library.conc...nM.
## 1         B3               P15              TGGTACAG                0.2215850
## 2         D1               P37              AGACCGTA                1.3271829
## 3         B5               P17              GCTATCCT              101.9640219
## 4         D8               P44              ATCTCGCT                0.6257944
## 5         C4               P28              GTAACGAC                0.6170620
## 6        B12               P24              ATTGCGTG                0.3831800
##   Final.avg.library.size..bp. Date.Received Pool..66.samples.pool. Location.VRC
## 1                          NA            NA                      2           NA
## 2                          NA            NA                      2           NA
## 3                          NA            NA                      1           NA
## 4                          NA            NA                      1           NA
## 5                          NA            NA                      1           NA
## 6                          NA            NA                      6           NA
##   Illumina.Instrument Flowcell.ID Lane.sequenced Date.Sequenced Notes
## 1        NovaSeq 6000   HGH73DRXY              2     2021-06-29    NA
## 2        NovaSeq 6000   HGH73DRXY              2     2021-06-29    NA
## 3        NovaSeq 6000   HGH73DRXY              1     2021-06-29    NA
## 4        NovaSeq 6000   HGH73DRXY              1     2021-06-29    NA
## 5        NovaSeq 6000   HGH73DRXY              1     2021-06-29    NA
## 6        NovaSeq 6000   HGH5LDRXY              2     2021-06-30    NA
##     NHP.group TotalReads lowCountedReads
## 1        UnTx      55448            TRUE
## 2        UnTx     456720            TRUE
## 3        UnTx     967971            TRUE
## 4        UnTx     706247            TRUE
## 5 DEL bNAb Tx     245885            TRUE
## 6 DEL bNAb Tx     422567            TRUE

Samples with low read counts are indeed the main driver of raw gene-expression.

Normalize (TMM) count data

# remove low counts samples
esetTemp <- esetRaw[, esetRaw$TotalReads >= 9e5]
dge <- DGEList(counts       = counts(esetTemp),
               remove.zeros = TRUE)
# remove low expressed genes
flag <- filterByExpr(dge, group = esetRaw$Cell.population.sorted)
dge <- dge[flag, , keep.lib.sizes = FALSE]
dge <- calcNormFactors(object = dge, method = "TMM")
normalizedCounts <- cpm(dge, normalized.lib.sizes = TRUE, log = TRUE)
eset <-  newSeqExpressionSet(
    counts           = dge$counts,
    normalizedCounts = as.matrix(normalizedCounts),
    featureData      = fData(esetTemp)[rownames(normalizedCounts), ],
    phenoData        = pData(esetTemp))
save(eset, file = file.path(workDir, "output/joana2.eset.RData"))
print(eset)

## SeqExpressionSet (storageMode: lockedEnvironment)
## assayData: 14986 features, 391 samples 
##   element names: counts, normalizedCounts, offset 
## protocolData: none
## phenoData
##   sampleNames: DGA7_D14_CD4 DGA7_D14_CD4MEM ... DHBC_wk8_nonTfh (391
##     total)
##   varLabels: Monkey.ID Collaborator ... TotalReads (31 total)
##   varMetadata: labelDescription
## featureData
##   featureNames: ENSMMUG00000000001 ENSMMUG00000000002 ...
##     ENSMMUG00000065387 (14986 total)
##   fvarLabels: seqname strand ... gene_biotype (5 total)
##   fvarMetadata: labelDescription
## experimentData: use 'experimentData(object)'
## Annotation:

Principal variance component analysis (normalized)

esetTemp <- ExpressionSet(assayData = normCounts(eset),
                          phenoData = AnnotatedDataFrame(data = pData(eset)))
esetTemp$TotalReadsQ3 <- cut(esetTemp$TotalReads,
                             breaks         = quantile(esetTemp$TotalReads,
                                                       probs = c(0,0.33, 0.66,1)),
                         include.lowest = TRUE,
                         right          = TRUE)
fit <- PVCA(counts = exprs(esetTemp),
            meta = pData(esetTemp)[, c("Timepoint",
                                       "NHP.group",
                             "Monkey.ID",
                             "Cell.population.sorted.1",
                             "TotalReadsQ3")],
                           threshold = 0.6,
                           inter = FALSE)
plotDF <- data.frame(effect = as.vector(fit),
                     label  = names(fit)) %>%
  arrange(effect) %>%
  mutate(label = factor(label, levels = unique(label)))
ggplot(data = plotDF,
       mapping = aes(x = label, y = effect)) +
  geom_bar(stat = "identity") +
  labs(x = NULL, y = "Explained effect") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))

After normalization (for total number of reads counted per samples), subset remain the main drivers of expression.

Multidimensional scalling plot on normalized counts

# mds normalized/cpm counts
mat <- normCounts(eset)
distMat <- parallelDist(t(mat), threads = detectCores() - 1)
attributes(distMat)$Labels  <- colnames(mat)
fit <- cmdscale(distMat, k = 2, eig = TRUE)
plotDF <- as.data.frame(fit$points) %>%
  rownames_to_column() %>%
  merge(y = rownames_to_column(pData(eset)),
        by = "rowname")
plotLabel <- round((fit$eig/sum(fit$eig)) * 100)
plotLabel <- plotLabel[1:2]
plotLabel <- paste0(c("1st dimension (", "2nd dimension ("),
                    plotLabel,
                    "%)")
ggplot(data = plotDF,
       mapping = aes(x = V1,
                     y = V2,
                     color = Cell.population.sorted.1,
                     shape = NHP.group)) +
  geom_point(size = 4) +
  labs(x = plotLabel[[1]],
       y = plotLabel[[2]]) +
  theme_bw() +
  theme(legend.key = element_blank(),
        plot.title = element_text(hjust = 0.5))

After normalization (for total number of reads counted per samples), subset remain the main driver of expression

Jitter plot of genes coding for cell surface markers

geneLS <- c("CD4", "CD8A")
plotDF <- counts(eset[fData(eset)$gene_name %in% geneLS, ]) %>%
  as.data.frame() %>%
  rownames_to_column() %>%
  merge(y = select(fData(eset), gene_id, gene_name),
    by.x = "rowname",
    by.y = "gene_id") %>%
  select(-rowname) %>%
  gather(SampleID, count, -gene_name) %>%
  merge(y = select(rownames_to_column(pData(eset)),
           rowname,
           Cell.population.sorted.1),
    by.x = "SampleID",
    by.y = "rowname") %>%
  mutate(value = pmax(count, 0.1))
ggplot(plotDF,
          mapping = aes(x = Cell.population.sorted.1, y = value)) +
  geom_boxplot(outlier.color = "transparent") +
  geom_beeswarm(mapping = aes(color = Cell.population.sorted.1)) +
  scale_y_log10() +
  facet_wrap(facet = ~gene_name, scale = "free", nrow = 3) +
  theme_bw() +
  theme(axis.text  = element_text(size = 5),
    legend.pos = "none")

geneLS <- c("CD28", "CXCR5", "PDCD1")
plotDF <- counts(eset[fData(eset)$gene_name %in% geneLS, eset$Cell.population.sorted.1 %in%
                        c("CD4", "CD4MEM", "CD4NA", "GCTfh", "nonTfh")]) %>%
  as.data.frame() %>%
  rownames_to_column() %>%
  merge(y = select(fData(eset), gene_id, gene_name),
    by.x = "rowname",
    by.y = "gene_id") %>%
  select(-rowname) %>%
  gather(SampleID, count, -gene_name) %>%
  merge(y = select(rownames_to_column(pData(eset)),
           rowname,
           Cell.population.sorted.1),
    by.x = "SampleID",
    by.y = "rowname") %>%
  mutate(value = pmax(count, 0.1))
ggplot(plotDF,
          mapping = aes(x = Cell.population.sorted.1, y = value)) +
  geom_boxplot(outlier.color = "transparent") +
  geom_beeswarm(mapping = aes(color = Cell.population.sorted.1)) +
  scale_y_log10() +
  facet_wrap(facet = ~gene_name, scale = "free", nrow = 3) +
  theme_bw() +
  theme(axis.text  = element_text(size = 5),
    legend.pos = "none")

geneLS <- c("CCR7", "CD28", "CXCR5", "PDCD1")
plotDF <- counts(eset[fData(eset)$gene_name %in% geneLS, eset$Cell.population.sorted.1 %in%
                        c("CD8", "CD8CM", "CD8EM", "CD8NA", "fCD8", "nonfCD8")]) %>%
  as.data.frame() %>%
  rownames_to_column() %>%
  merge(y = select(fData(eset), gene_id, gene_name),
    by.x = "rowname",
    by.y = "gene_id") %>%
  select(-rowname) %>%
  gather(SampleID, count, -gene_name) %>%
  merge(y = select(rownames_to_column(pData(eset)),
           rowname,
           Cell.population.sorted.1),
    by.x = "SampleID",
    by.y = "rowname") %>%
  mutate(value = pmax(count, 0.1))
ggplot(plotDF,
          mapping = aes(x = Cell.population.sorted.1, y = value)) +
  geom_boxplot(outlier.color = "transparent") +
  geom_beeswarm(mapping = aes(color = Cell.population.sorted.1)) +
  scale_y_log10() +
  facet_wrap(facet = ~gene_name, scale = "free", nrow = 3) +
  theme_bw() +
  theme(axis.text  = element_text(size = 5),
    legend.pos = "none")

Principal variance component analysis per subset

plotDF <- NULL
for (SUBSET in unique(eset$Cell.population.sorted.1)) {
  esetTemp <- eset[, eset$Cell.population.sorted.1 %in% SUBSET]
  esetTemp <- ExpressionSet(assayData = normCounts(esetTemp),
                phenoData = AnnotatedDataFrame(data = pData(esetTemp)))
  fit <- PVCA(counts = exprs(esetTemp),
          meta = pData(esetTemp)[, c("Timepoint",
                     "NHP.group",
                     "Monkey.ID")],
                           threshold = 0.6,
                           inter = FALSE)
  plotTemp <- data.frame(effect = as.vector(fit),
             label  = names(fit)) %>%
    arrange(effect) %>%
    mutate(label = factor(label, levels = unique(label)),
       Cell.population.sorted.1 = SUBSET)
  plotDF <- rbind(plotDF, plotTemp)
}

ggplot(data = plotDF,
       mapping = aes(x = label, y = effect)) +
  geom_bar(stat = "identity") +
  labs(x = NULL, y = "Explained effect") +
  theme_bw() +
  facet_wrap(facets = ~Cell.population.sorted.1) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))

Donor effect is observed in all subsets

Substrack pre expression

# merge replicate samples
flagDF <- pData(eset) %>%
  rownames_to_column() %>%
  select(Monkey.ID, Timepoint, Cell.population.sorted.1, rowname) %>%
  pivot_wider(names_from = Timepoint, values_from = rowname) %>%
  pivot_longer(cols = c("D14", "wk8"),
               names_to = "Timepoint",
               values_to = "rowname") %>%
  filter(!is.na(pre) & !is.na(rowname))
esetBaselined <- eset[, flagDF$rowname]
baselinedMat <- normCounts(eset)[, flagDF$rowname] -
  normCounts(eset)[, flagDF$pre]
normCounts(esetBaselined) <- baselinedMat
save(esetBaselined, file = file.path(workDir, "output/joana2.esetBaselined.RData"))
print(esetBaselined)

## SeqExpressionSet (storageMode: lockedEnvironment)
## assayData: 14986 features, 258 samples 
##   element names: counts, normalizedCounts, offset 
## protocolData: none
## phenoData
##   sampleNames: DGA7_D14_CD4 DGA7_wk8_CD4 ... DHBC_wk8_nonfCD8 (258
##     total)
##   varLabels: Monkey.ID Collaborator ... TotalReads (31 total)
##   varMetadata: labelDescription
## featureData
##   featureNames: ENSMMUG00000000001 ENSMMUG00000000002 ...
##     ENSMMUG00000065387 (14986 total)
##   fvarLabels: seqname strand ... gene_biotype (5 total)
##   fvarMetadata: labelDescription
## experimentData: use 'experimentData(object)'
## Annotation:

Principal variance component analysis per subset (on baselined expression)

plotDF <- NULL
for (SUBSET in unique(esetBaselined$Cell.population.sorted.1)) {
  esetTemp <- esetBaselined[, esetBaselined$Cell.population.sorted.1 %in% SUBSET]
  esetTemp <- ExpressionSet(assayData = normCounts(esetTemp),
                            phenoData = AnnotatedDataFrame(data = pData(esetTemp)))
  fit <- PVCA(counts = exprs(esetTemp),
              meta = pData(esetTemp)[, c("Timepoint",
                                         "NHP.group",
                                         "Monkey.ID")],
                           threshold = 0.6,
                           inter = FALSE)
  plotTemp <- data.frame(effect = as.vector(fit),
                         label  = names(fit)) %>%
    arrange(effect) %>%
    mutate(label = factor(label, levels = unique(label)),
           Cell.population.sorted.1 = SUBSET)
  plotDF <- rbind(plotDF, plotTemp)
}

ggplot(data = plotDF,
       mapping = aes(x = label, y = effect)) +
  geom_bar(stat = "identity") +
  labs(x = NULL, y = "Explained effect") +
  theme_bw() +
  facet_wrap(facets = ~Cell.population.sorted.1) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))

Treatment effect is better observed on pre-post expression.

Multidimensional scalling plot on baselined counts by subsets

plotDF <- NULL
for (SUBSET in unique(esetBaselined$Cell.population.sorted.1)) {
  # mds normalized/cpm counts
  esetTemp <- esetBaselined[, esetBaselined$Cell.population.sorted.1 %in% SUBSET]

  mat <- normCounts(esetTemp)
  distMat <- parallelDist(t(mat), threads = detectCores() - 1)
  attributes(distMat)$Labels  <- colnames(mat)
  fit <- cmdscale(distMat, k = 2, eig = TRUE)
  as.data.frame(fit$points) %>%
    rownames_to_column() %>%
    merge(y = rownames_to_column(pData(eset)),
          by = "rowname") %>%
    rbind(plotDF, .) -> plotDF
}

ggplot(data = plotDF,
                   mapping = aes(x = V1,
                                 y = V2,
                                 color = Timepoint,
                                 shape = NHP.group)) +
  geom_point(size = 4) +
  facet_wrap(facets = ~Cell.population.sorted.1, scale = "free") +
  theme_bw() +
  theme(legend.key = element_blank(),
        plot.title = element_text(hjust = 0.5))

Differential expression between timepoints (paired)

fits <- list()
# vaccine effect
for (SUBSET in unique(eset$Cell.population.sorted.1)) {
  esetTemp <- eset[, eset$Cell.population.sorted.1 %in% SUBSET]
  dge <- DGEList(counts  = counts(esetTemp),
                 samples = pData(esetTemp),
                 genes   = fData(esetTemp))
  goi <- interaction(gsub(pattern     = " .+",
                          replacement = "",
                          esetTemp$NHP.group),
                     esetTemp$Timepoint,
                     drop = TRUE)
  donor <- esetTemp$Monkey.ID
  designMat <- model.matrix(~0+goi) 
  rownames(designMat) <- sampleNames(esetTemp)
  colnames(designMat) <- gsub(pattern     = "goi",
                              replacement = "",
                              colnames(designMat))
  attr(designMat, "assign") <- attr(designMat, "contrasts") <- NULL
  v <- voom(counts = dge, design = designMat)
  corfit <- duplicateCorrelation(v$E,
                                 design = designMat,
                                 block  = donor)
  fit <- lmFit(v,
               design      = designMat,
               block       = donor,
               correlation = corfit$consensus)
  contrastLS <- grep(pattern = "pre", colnames(fit), value = TRUE, invert = TRUE) %>%
    paste0(., "-", gsub(pattern = "\\..+", replacement = ".pre", .))
  contrastMat <- makeContrasts(contrasts = contrastLS, levels = fit$design)
  fit2 <- contrasts.fit(fit = fit, contrasts = contrastMat)
  fit2 <- eBayes(fit = fit2)
  fits[[paste0(SUBSET, "_vax")]] <- list(fit = fit, fit2 = fit2)
}

# vaccine vs untreated
for (SUBSET in unique(esetBaselined$Cell.population.sorted.1)) {
  esetTemp <- esetBaselined[, esetBaselined$Cell.population.sorted.1 %in% SUBSET]
  goi <- interaction(gsub(pattern     = " .+",
                          replacement = "",
                          esetTemp$NHP.group),
                     esetTemp$Timepoint,
                     drop = TRUE)
  designMat <- model.matrix(~0+goi) 
  rownames(designMat) <- sampleNames(esetTemp)
  colnames(designMat) <- gsub(pattern     = "goi",
                              replacement = "",
                              colnames(designMat))
  attr(designMat, "assign") <- attr(designMat, "contrasts") <- NULL
  dge <- ExpressionSet(assayData =normCounts(esetTemp),
                 phenoData = AnnotatedDataFrame(pData(esetTemp)),
                 featureData   = AnnotatedDataFrame(fData(esetTemp)))
  fit <- lmFit(dge,
               design      = designMat)
  contrastLS <- grep(pattern = "UnTx", colnames(fit), value = TRUE, invert = TRUE) %>%
    paste0(., "-", gsub(pattern = ".+\\.", replacement = "UnTx.", .))
  contrastMat <- makeContrasts(contrasts = contrastLS, levels = fit$design)
  fit2 <- contrasts.fit(fit = fit, contrasts = contrastMat)
  fit2 <- eBayes(fit = fit2)
  fits[[paste0(SUBSET, "_untx")]] <- list(fit = fit, fit2 = fit2)
}
# save fits
save(fits, file = file.path(workDir, "output/joana2.fits.RData"))

# print number of genes differently expressed
degNbDF <- NULL 
for (modelName in names(fits)) {
  fit2 <- fits[[modelName]][["fit2"]]
  for (coefName in colnames(fit2)) {
    top <- topTable(fit = fit2, coef = coefName, number = Inf) %>%
      as.data.frame() %>%
      filter(gene_biotype %in% "protein_coding") %>%
      group_by(sign(logFC)) %>%
      summarize(p     = sum(P.Value <= 0.05),
                adj.p = sum(adj.P.Val <= 0.05)) %>%
      mutate(modelName = modelName,
             contrast = coefName) %>%
      rbind(degNbDF) -> degNbDF
  }
}

degNbDF %>%
  pivot_longer(cols = c(-modelName, -contrast, -`sign(logFC)`),
               names_to = "cname",
               values_to = "n") %>%
  mutate(`sign(logFC)` = c("-1" = "DN", "1" = "UP")[as.character(`sign(logFC)`)],
         cname         = paste0(cname, `sign(logFC)`)) %>%
  select(-`sign(logFC)`) %>%
  pivot_wider(names_from = cname, values_from = n) %>%
  mutate(p = paste0(pUP, "/", pDN),
         adj.p = paste0(adj.pUP, "/", adj.pDN)) %>%
  select(modelName, contrast, p, adj.p) %>%
  as.data.frame() %>%
  kable()

modelName	contrast	p	adj.p
CD8_untx	WT.wk8-UnTx.wk8	542/710	0/0
CD8_untx	DEL.wk8-UnTx.wk8	616/698	0/0
CD8_untx	WT.D14-UnTx.D14	342/577	0/0
CD8_untx	DEL.D14-UnTx.D14	258/499	0/0
nonfCD8_untx	WT.wk8-UnTx.wk8	280/243	0/0
nonfCD8_untx	DEL.wk8-UnTx.wk8	521/373	0/0
nonfCD8_untx	WT.D14-UnTx.D14	381/409	0/0
nonfCD8_untx	DEL.D14-UnTx.D14	225/335	0/0
nonTfh_untx	WT.wk8-UnTx.wk8	325/357	0/0
nonTfh_untx	DEL.wk8-UnTx.wk8	379/346	0/0
nonTfh_untx	WT.D14-UnTx.D14	309/415	0/0
nonTfh_untx	DEL.D14-UnTx.D14	598/554	0/0
GCTfh_untx	WT.wk8-UnTx.wk8	669/726	0/0
GCTfh_untx	DEL.wk8-UnTx.wk8	381/365	0/0
GCTfh_untx	WT.D14-UnTx.D14	358/280	0/0
GCTfh_untx	DEL.D14-UnTx.D14	478/312	0/0
fCD8_untx	WT.wk8-UnTx.wk8	403/526	0/1
fCD8_untx	DEL.wk8-UnTx.wk8	672/857	0/0
fCD8_untx	WT.D14-UnTx.D14	336/363	0/0
fCD8_untx	DEL.D14-UnTx.D14	241/330	0/0
CD8NA_untx	WT.wk8-UnTx.wk8	168/275	0/0
CD8NA_untx	DEL.wk8-UnTx.wk8	281/337	0/0
CD8NA_untx	WT.D14-UnTx.D14	236/381	0/0
CD8NA_untx	DEL.D14-UnTx.D14	282/381	0/0
CD8EM_untx	WT.wk8-UnTx.wk8	325/286	0/0
CD8EM_untx	DEL.wk8-UnTx.wk8	727/1079	2/1
CD8EM_untx	WT.D14-UnTx.D14	640/454	0/0
CD8EM_untx	DEL.D14-UnTx.D14	493/389	0/0
CD8CM_untx	WT.wk8-UnTx.wk8	465/421	0/0
CD8CM_untx	DEL.wk8-UnTx.wk8	463/494	0/0
CD8CM_untx	WT.D14-UnTx.D14	429/257	0/0
CD8CM_untx	DEL.D14-UnTx.D14	333/286	0/0
CD4NA_untx	WT.wk8-UnTx.wk8	305/439	0/0
CD4NA_untx	DEL.wk8-UnTx.wk8	308/367	0/0
CD4NA_untx	WT.D14-UnTx.D14	345/318	0/0
CD4NA_untx	DEL.D14-UnTx.D14	345/341	0/0
CD4MEM_untx	WT.wk8-UnTx.wk8	282/281	0/0
CD4MEM_untx	DEL.wk8-UnTx.wk8	363/414	0/0
CD4MEM_untx	WT.D14-UnTx.D14	497/406	0/1
CD4MEM_untx	DEL.D14-UnTx.D14	895/592	0/0
CD4_untx	WT.wk8-UnTx.wk8	436/426	0/0
CD4_untx	DEL.wk8-UnTx.wk8	307/405	0/0
CD4_untx	WT.D14-UnTx.D14	236/365	0/0
CD4_untx	DEL.D14-UnTx.D14	222/283	0/0
nonfCD8_vax	WT.wk8-WT.pre	1180/444	6/9
nonfCD8_vax	UnTx.wk8-UnTx.pre	1726/751	230/99
nonfCD8_vax	DEL.wk8-DEL.pre	1940/764	155/60
nonfCD8_vax	WT.D14-WT.pre	290/345	0/0
nonfCD8_vax	UnTx.D14-UnTx.pre	349/394	3/0
nonfCD8_vax	DEL.D14-DEL.pre	235/628	0/0
nonTfh_vax	WT.wk8-WT.pre	1094/484	9/35
nonTfh_vax	UnTx.wk8-UnTx.pre	1708/903	334/197
nonTfh_vax	DEL.wk8-DEL.pre	795/542	0/4
nonTfh_vax	WT.D14-WT.pre	257/403	0/0
nonTfh_vax	UnTx.D14-UnTx.pre	567/554	0/0
nonTfh_vax	DEL.D14-DEL.pre	268/362	0/0
GCTfh_vax	WT.wk8-WT.pre	1053/608	21/54
GCTfh_vax	UnTx.wk8-UnTx.pre	2361/1328	984/520
GCTfh_vax	DEL.wk8-DEL.pre	1649/881	187/101
GCTfh_vax	WT.D14-WT.pre	317/181	2/0
GCTfh_vax	UnTx.D14-UnTx.pre	491/408	0/0
GCTfh_vax	DEL.D14-DEL.pre	242/180	0/0
fCD8_vax	WT.wk8-WT.pre	1333/1084	135/176
fCD8_vax	UnTx.wk8-UnTx.pre	1936/1394	739/567
fCD8_vax	DEL.wk8-DEL.pre	1331/583	35/22
fCD8_vax	WT.D14-WT.pre	324/774	10/0
fCD8_vax	UnTx.D14-UnTx.pre	539/999	1/3
fCD8_vax	DEL.D14-DEL.pre	250/744	0/0
CD8NA_vax	WT.wk8-WT.pre	655/487	12/17
CD8NA_vax	UnTx.wk8-UnTx.pre	1295/618	173/88
CD8NA_vax	DEL.wk8-DEL.pre	1053/432	18/2
CD8NA_vax	WT.D14-WT.pre	226/323	2/0
CD8NA_vax	UnTx.D14-UnTx.pre	473/414	0/0
CD8NA_vax	DEL.D14-DEL.pre	977/386	1/0
CD8EM_vax	WT.wk8-WT.pre	836/518	7/19
CD8EM_vax	UnTx.wk8-UnTx.pre	1469/877	178/123
CD8EM_vax	DEL.wk8-DEL.pre	1720/1041	140/86
CD8EM_vax	WT.D14-WT.pre	216/310	0/0
CD8EM_vax	UnTx.D14-UnTx.pre	626/935	0/0
CD8EM_vax	DEL.D14-DEL.pre	138/396	0/0
CD8CM_vax	WT.wk8-WT.pre	821/440	3/16
CD8CM_vax	UnTx.wk8-UnTx.pre	2309/646	421/111
CD8CM_vax	DEL.wk8-DEL.pre	545/270	0/2
CD8CM_vax	WT.D14-WT.pre	244/251	0/0
CD8CM_vax	UnTx.D14-UnTx.pre	482/585	0/0
CD8CM_vax	DEL.D14-DEL.pre	181/259	0/0
CD8_vax	WT.wk8-WT.pre	945/558	21/45
CD8_vax	UnTx.wk8-UnTx.pre	1499/781	121/63
CD8_vax	DEL.wk8-DEL.pre	1041/373	13/9
CD8_vax	WT.D14-WT.pre	395/423	9/0
CD8_vax	UnTx.D14-UnTx.pre	590/415	0/0
CD8_vax	DEL.D14-DEL.pre	821/908	46/8
CD4NA_vax	WT.wk8-WT.pre	865/951	25/69
CD4NA_vax	UnTx.wk8-UnTx.pre	1256/910	181/164
CD4NA_vax	DEL.wk8-DEL.pre	742/446	19/19
CD4NA_vax	WT.D14-WT.pre	301/209	1/0
CD4NA_vax	UnTx.D14-UnTx.pre	417/412	0/0
CD4NA_vax	DEL.D14-DEL.pre	253/465	0/0
CD4MEM_vax	WT.wk8-WT.pre	1048/305	3/11
CD4MEM_vax	UnTx.wk8-UnTx.pre	1251/758	94/69
CD4MEM_vax	DEL.wk8-DEL.pre	1198/348	5/2
CD4MEM_vax	WT.D14-WT.pre	237/130	2/0
CD4MEM_vax	UnTx.D14-UnTx.pre	524/567	0/0
CD4MEM_vax	DEL.D14-DEL.pre	467/323	0/0
CD4_vax	WT.wk8-WT.pre	1261/844	36/70
CD4_vax	UnTx.wk8-UnTx.pre	1631/802	276/152
CD4_vax	DEL.wk8-DEL.pre	885/383	4/8
CD4_vax	WT.D14-WT.pre	307/260	1/0
CD4_vax	UnTx.D14-UnTx.pre	517/467	0/0
CD4_vax	DEL.D14-DEL.pre	441/372	0/0

Heatmaps with DEGs

for (modelName in grep(pattern = "_vax", names(fits), value = TRUE)) {
  fit2 <- fits[[modelName]][["fit2"]]
  for (coefName in colnames(fit2)) {
    top <- topTable(fit = fit2, coef = coefName, number = Inf) %>%
      as.data.frame() %>%
      filter(!is.na(gene_name) &
           gene_biotype %in% "protein_coding" &
           adj.P.Val <= 0.05)
    if (nrow(top) > 0) {
      top <- top %>%
        top_n(50, wt = abs(t)) %>%
        arrange(desc(logFC))
      sampleLS <- which(fit2$contrast[, coefName] != 0) %>%
        fit2$design[, .] %>%
        as.data.frame() %>%
        rownames_to_column() %>%
        gather(group, value, -rowname) %>%
        group_by(rowname) %>%
        summarize(value = sum(value)) %>%
        filter(value > 0) %>%
        .$rowname
      mat <- normCounts(eset)[top$gene_id, sampleLS, drop = FALSE] %>%
        t() %>%
        scale() %>%
        t()
      mat <- mat[complete.cases(mat), , drop = FALSE]
      colorLS <- colorRampPalette(colors = c("blue", "white", "red"))(n = 100)
      breakLS <- c(-1 * max(abs(mat), na.rm = TRUE),
                   seq(from = -1 * min(abs(range(mat, na.rm = TRUE))),
                       to   = min(abs(range(mat, na.rm = TRUE))),
                       length.out = 99),
                   max(abs(mat), na.rm = TRUE))
      matAnnot <- pData(eset) %>%
        rownames_to_column() %>%
        select(rowname, Timepoint, NHP.group, Cell.population.sorted.1) %>%
        column_to_rownames()
      matAnnot <- matAnnot[colnames(mat), ]
      matAnnot <- matAnnot[, colMeans(is.na(matAnnot)) < 1]
      ha <- HeatmapAnnotation(df = matAnnot)
      heat <- Heatmap(matrix         = mat,
                      width          = unit(0.5, units = "npc"),
                      height         = unit(0.1+0.6*(nrow(mat)/50), units = "npc"),
                      top_annotation = ha,
                      name           = "zscore",
                      column_title   = paste0(modelName, ": ", coefName),
                      row_names_gp   = gpar(fontsize = 7),
                      column_labels  = pData(eset)[colnames(mat), "Monkey.ID"],
                          row_labels     = top$gene_name,
                          use_raster     = TRUE)
      print(heat)
    }
  }
}

for (modelName in grep(pattern = "_untx", names(fits), value = TRUE)) {
  fit2 <- fits[[modelName]][["fit2"]]
  for (coefName in colnames(fit2)) {
    top <- topTable(fit = fit2, coef = coefName, number = Inf) %>%
      as.data.frame() %>%
      filter(!is.na(gene_name) &
           gene_biotype %in% "protein_coding" &
           adj.P.Val <= 0.05)
    if (nrow(top) > 0) {
      top <- top %>%
        top_n(50, wt = abs(t)) %>%
        arrange(desc(logFC))
      sampleLS <- which(fit2$contrast[, coefName] != 0) %>%
        fit2$design[, .] %>%
        as.data.frame() %>%
        rownames_to_column() %>%
        gather(group, value, -rowname) %>%
        group_by(rowname) %>%
        summarize(value = sum(value)) %>%
        filter(value > 0) %>%
        .$rowname
      mat <- normCounts(esetBaselined)[top$gene_id, sampleLS, drop = FALSE]
      mat <- mat[complete.cases(mat), , drop = FALSE]
      colorLS <- colorRampPalette(colors = c("blue", "white", "red"))(n = 100)
      breakLS <- c(-1 * max(abs(mat), na.rm = TRUE),
                   seq(from = -1 * min(abs(range(mat, na.rm = TRUE))),
                       to   = min(abs(range(mat, na.rm = TRUE))),
                       length.out = 99),
                   max(abs(mat), na.rm = TRUE))
      matAnnot <- pData(esetBaselined) %>%
        rownames_to_column() %>%
        select(rowname, Timepoint, NHP.group, Cell.population.sorted.1) %>%
        column_to_rownames()
      matAnnot <- matAnnot[colnames(mat), ]
      matAnnot <- matAnnot[, colMeans(is.na(matAnnot)) < 1]
      ha <- HeatmapAnnotation(df = matAnnot)
      heat <- Heatmap(matrix         = mat,
                      width          = unit(0.5, units = "npc"),
                      height         = unit(0.1+0.6*(nrow(mat)/50), units = "npc"),
                      top_annotation = ha,
                      name           = "zscore",
                      column_title   = paste0(modelName, ": ", coefName),
                      row_names_gp   = gpar(fontsize = 7),
                      column_labels  = pData(eset)[colnames(mat), "Monkey.ID"],
                          row_labels     = top$gene_name,
                          use_raster     = TRUE)
      print(heat)
    }
  }
}

Heatmaps with all DEGs across time by subset

message("FDR cutoff of 5%") 
for (modelName in grep(pattern = "vax", names(fits), value = TRUE)) {
  fit2 <- fits[[modelName]][["fit2"]]
  top <- topTable(fit2, number = Inf, p.value = 0.05) %>%
    as.data.frame() %>%
    filter(gene_biotype %in% "protein_coding")
  if (nrow(top) > 0) {
    top <- top %>%
      arrange(desc(WT.D14.WT.pre)) %>%
      mutate(gene_name = ifelse(test = is.na(gene_name),
                yes  = gene_id,
                no   = gene_name))
    sampleLS <- rownames(fit2$design)
    sampleLS <- intersect(sampleLS, sampleNames(esetBaselined))
    mat <- normCounts(esetBaselined)[top$gene_id, sampleLS, drop = FALSE]
      
    mat <- mat[complete.cases(mat), ]
    matAnnot <- pData(eset) %>%
      rownames_to_column() %>%
      select(rowname, Timepoint, NHP.group, Cell.population.sorted.1) %>%
      column_to_rownames()
    matAnnot <- matAnnot[colnames(mat), ]
    ha <- HeatmapAnnotation(df = matAnnot,
                            col = list(NHP.group = c("UnTx" = "grey", "WT bNAb Tx" = "orange", "DEL bNAb Tx" = "green")))
    splitCols <- pData(eset)[colnames(mat),
                 c("Timepoint", "NHP.group")] %>%
      apply(MARGIN = 1, FUN = paste, collapse = ".") 
    heat <- Heatmap(matrix         = mat,
            width          = unit(0.5, units = "npc"),
            height         = unit(0.75, units = "npc"),
            top_annotation = ha,
            name           = "log2FC",
            column_title   = modelName,
            show_row_names = FALSE,
            column_labels  = pData(eset)[colnames(mat), "Monkey.ID"],
            column_split   = splitCols,
            use_raster     = TRUE)
    print(heat)
  }
}

message("FDR cutoff of 5%") 
for (modelName in grep(pattern = "untx", names(fits), value = TRUE)) {
  fit2 <- fits[[modelName]][["fit2"]]
  top <- topTable(fit2, number = Inf, p.value = 0.05) %>%
    as.data.frame()
  if (nrow(top) > 0) { 
    top <- top %>%
    filter(gene_biotype %in% "protein_coding")
  }
  if (nrow(top) > 0) {
    top <- top %>%
      arrange(desc(WT.D14.UnTx.D14)) %>%
      mutate(gene_name = ifelse(test = is.na(gene_name),
                yes  = gene_id,
                no   = gene_name))
    sampleLS <- rownames(fit2$design)
    sampleLS <- intersect(sampleLS, sampleNames(esetBaselined))
    mat <- normCounts(esetBaselined)[top$gene_id, sampleLS, drop = FALSE]
      
    mat <- mat[complete.cases(mat), ]
    matAnnot <- pData(eset) %>%
      rownames_to_column() %>%
      select(rowname, Timepoint, NHP.group, Cell.population.sorted.1) %>%
      column_to_rownames()
    matAnnot <- matAnnot[colnames(mat), ]
    ha <- HeatmapAnnotation(df = matAnnot,
                            col = list(NHP.group = c("UnTx" = "grey", "WT bNAb Tx" = "orange", "DEL bNAb Tx" = "green")))
    splitCols <- pData(eset)[colnames(mat),
                 c("Timepoint", "NHP.group")] %>%
      apply(MARGIN = 1, FUN = paste, collapse = ".") 
    heat <- Heatmap(matrix         = mat,
            width          = unit(0.5, units = "npc"),
            height         = unit(0.75, units = "npc"),
            top_annotation = ha,
            name           = "log2FC",
            column_title   = modelName,
            row_labels     = top$gene_name,
            column_labels  = pData(eset)[colnames(mat),
                         "Monkey.ID"],
            column_split   = splitCols)
    print(heat)
  }
}

sessionInfo()

## R version 4.3.2 (2023-10-31)
## Platform: aarch64-apple-darwin23.0.0 (64-bit)
## Running under: macOS Sonoma 14.3
## 
## Matrix products: default
## BLAS:   /opt/homebrew/Cellar/openblas/0.3.26/lib/libopenblasp-r0.3.26.dylib 
## LAPACK: /opt/homebrew/Cellar/r/4.3.2/lib/R/lib/libRlapack.dylib;  LAPACK version 3.11.0
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: America/Chicago
## tzcode source: internal
## 
## attached base packages:
##  [1] grid      parallel  stats4    stats     graphics  grDevices utils    
##  [8] datasets  methods   base     
## 
## other attached packages:
##  [1] lubridate_1.9.3             forcats_1.0.0              
##  [3] stringr_1.5.1               dplyr_1.1.4                
##  [5] purrr_1.0.2                 readr_2.1.5                
##  [7] tidyr_1.3.1                 tibble_3.2.1               
##  [9] tidyverse_2.0.0             ComplexHeatmap_2.18.0      
## [11] ggbeeswarm_0.7.2            ggplot2_3.4.4              
## [13] edgeR_4.0.14                limma_3.58.1               
## [15] parallelDist_0.2.6          lme4_1.1-35.1              
## [17] Matrix_1.6-5                pvca_0.1.0                 
## [19] EDASeq_2.36.0               ShortRead_1.60.0           
## [21] GenomicAlignments_1.38.2    SummarizedExperiment_1.32.0
## [23] MatrixGenerics_1.14.0       matrixStats_1.2.0          
## [25] Rsamtools_2.18.0            GenomicRanges_1.54.1       
## [27] Biostrings_2.70.2           GenomeInfoDb_1.38.5        
## [29] XVector_0.42.0              IRanges_2.36.0             
## [31] S4Vectors_0.40.2            BiocParallel_1.36.0        
## [33] Biobase_2.62.0              BiocGenerics_0.48.1        
## [35] readxl_1.4.3                kableExtra_1.4.0           
## [37] knitr_1.45                 
## 
## loaded via a namespace (and not attached):
##   [1] RColorBrewer_1.1-3      shape_1.4.6             rstudioapi_0.15.0      
##   [4] magrittr_2.0.3          magick_2.8.2            GenomicFeatures_1.54.1 
##   [7] farver_2.1.1            nloptr_2.0.3            rmarkdown_2.25         
##  [10] GlobalOptions_0.1.2     BiocIO_1.12.0           zlibbioc_1.48.0        
##  [13] vctrs_0.6.5             memoise_2.0.1           minqa_1.2.6            
##  [16] RCurl_1.98-1.14         htmltools_0.5.7         S4Arrays_1.2.0         
##  [19] progress_1.2.3          curl_5.2.0              cellranger_1.1.0       
##  [22] SparseArray_1.2.3       cachem_1.0.8            iterators_1.0.14       
##  [25] lifecycle_1.0.4         pkgconfig_2.0.3         R6_2.5.1               
##  [28] fastmap_1.1.1           clue_0.3-65             GenomeInfoDbData_1.2.11
##  [31] digest_0.6.34           colorspace_2.1-0        AnnotationDbi_1.64.1   
##  [34] RSQLite_2.3.5           hwriter_1.3.2.1         labeling_0.4.3         
##  [37] filelock_1.0.3          timechange_0.3.0        fansi_1.0.6            
##  [40] httr_1.4.7              abind_1.4-5             compiler_4.3.2         
##  [43] doParallel_1.0.17       bit64_4.0.5             withr_3.0.0            
##  [46] DBI_1.2.1               highr_0.10              R.utils_2.12.3         
##  [49] biomaRt_2.58.1          MASS_7.3-60.0.1         rappdirs_0.3.3         
##  [52] DelayedArray_0.28.0     rjson_0.2.21            tools_4.3.2            
##  [55] vipor_0.4.7             beeswarm_0.4.0          R.oo_1.26.0            
##  [58] glue_1.7.0              restfulr_0.0.15         nlme_3.1-164           
##  [61] cluster_2.1.6           generics_0.1.3          gtable_0.3.4           
##  [64] tzdb_0.4.0              R.methodsS3_1.8.2       hms_1.1.3              
##  [67] xml2_1.3.6              utf8_1.2.4              foreach_1.5.2          
##  [70] pillar_1.9.0            vroom_1.6.5             circlize_0.4.15        
##  [73] splines_4.3.2           BiocFileCache_2.10.1    lattice_0.22-5         
##  [76] aroma.light_3.32.0      rtracklayer_1.62.0      bit_4.0.5              
##  [79] deldir_2.0-2            tidyselect_1.2.0        locfit_1.5-9.8         
##  [82] svglite_2.1.3           xfun_0.41               statmod_1.5.0          
##  [85] stringi_1.8.3           yaml_2.3.8              boot_1.3-28.1          
##  [88] evaluate_0.23           codetools_0.2-19        interp_1.1-6           
##  [91] cli_3.6.2               RcppParallel_5.1.7      systemfonts_1.0.5      
##  [94] munsell_0.5.0           Rcpp_1.0.12             dbplyr_2.4.0           
##  [97] png_0.1-8               XML_3.99-0.16.1         blob_1.2.4             
## [100] prettyunits_1.2.0       latticeExtra_0.6-30     jpeg_0.1-10            
## [103] bitops_1.0-7            viridisLite_0.4.2       scales_1.3.0           
## [106] crayon_1.5.2            GetoptLong_1.0.5        rlang_1.1.3            
## [109] KEGGREST_1.42.0

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

20210909_Joana2.mrnaseq_preproc.md

20210909_Joana2.mrnaseq_preproc.md

Dias/Koup preprocessing of RNA-Seq data (set 2)

Timepoint	CD4	CD4MEM	CD4NA	CD8	CD8CM	CD8EM	CD8NA	GCTfh	fCD8	nonTfh	nonfCD8
pre	12	12	12	12	12	12	12	12	12	12	12
D14	12	12	12	12	12	12	12	12	12	12	12
wk8	12	12	12	12	12	12	12	12	12	12	12

Timepoint	CD4	CD4MEM	CD4NA	CD8	CD8CM	CD8EM	CD8NA	GCTfh	fCD8	nonTfh	nonfCD8
pre	12	12	12	12	12	12	12	12	12	12	12
D14	12	12	12	12	12	12	12	12	12	12	12
wk8	12	12	12	12	12	12	12	12	12	12	12

Files

20210909_Joana2.mrnaseq_preproc.md

Latest commit

History

20210909_Joana2.mrnaseq_preproc.md

File metadata and controls

Dias/Koup preprocessing of RNA-Seq data (set 2)

Timepoint	CD4	CD4MEM	CD4NA	CD8	CD8CM	CD8EM	CD8NA	GCTfh	fCD8	nonTfh	nonfCD8
pre	12	12	12	12	12	12	12	12	12	12	12
D14	12	12	12	12	12	12	12	12	12	12	12
wk8	12	12	12	12	12	12	12	12	12	12	12