Dias/Koup preprocessing of RNA-Seq data (set 1)

Slim Fourati 2024-01-29

suppressPackageStartupMessages(library(package = "knitr"))
suppressPackageStartupMessages(library(package = "readxl"))
suppressPackageStartupMessages(library(package = "zoo"))
suppressPackageStartupMessages(library(package = "ggbeeswarm"))
suppressPackageStartupMessages(library(package = "caTools"))
suppressPackageStartupMessages(library(package = "EDASeq"))
suppressPackageStartupMessages(library(package = "pvca")) # dleelab/pvca
suppressPackageStartupMessages(library(package = "lme4"))
suppressPackageStartupMessages(library(package = "parallelDist"))
suppressPackageStartupMessages(library(package = "parallel"))
suppressPackageStartupMessages(library(package = "edgeR"))
suppressPackageStartupMessages(library(package = "ggrepel"))
suppressPackageStartupMessages(library(package = "ComplexHeatmap"))
suppressPackageStartupMessages(library(package = "tidyverse"))

opts_chunk$set(echo = TRUE, fig.path = "../figure/")
options(readr.show_col_types   = FALSE,
        dplyr.summarise.inform = FALSE)
workDir <- dirname(getwd())

Reading raw counts

countDF <- read_csv(file = file.path(workDir, "input/joana.genecounts.csv")) %>%
  as.data.frame() %>%
  column_to_rownames(var = "gene_id")
message("First five rows/columns of count matrix")

## First five rows/columns of count matrix

print(countDF[1:5, 1:5])

##                    DGAM_wk8_Bcells DGAM_wk8_cDCs DGAM_wk8_monocytes
## ENSMMUG00000000001             305             0                  6
## ENSMMUG00000000002              32             0                  0
## ENSMMUG00000000005             754           169                399
## ENSMMUG00000000006               0             0                  0
## ENSMMUG00000000007              95             0                 51
##                    DGAM_wk8_NKcells DGAM_wk8_pDCs
## ENSMMUG00000000001               33            18
## ENSMMUG00000000002                0             0
## ENSMMUG00000000005               46           927
## ENSMMUG00000000006                2             0
## ENSMMUG00000000007               59           181

message("Dimensions of the count matrix")

## Dimensions of the count matrix

print(dim(countDF))

## [1] 34847   200

Read sample annotation

sampleAnnotFile <- file.path(workDir,
                             "input",
                             "Joana_Dias_bulk_RNAf_Sample_tracking_sheet_03082020.xlsx")
# read each plates
sheetLS <- excel_sheets(path = sampleAnnotFile)
sampleAnnotDF <- lapply(sheetLS, function(sheetName) {
  sampleAnnotTemp <- read_excel(path = sampleAnnotFile, sheet = sheetName) %>%
    mutate(Plate = sheetName)
  return(value = sampleAnnotTemp)
}) %>%
  do.call(what = rbind)
# append sample id
sampleAnnotDF <- sampleAnnotDF %>%
  mutate(rowname = paste(`Monkey ID`,
                         Timepoint,
                         `Cell population sorted`,
                         sep = "_"),
         rowname = gsub(pattern = " ", replacement = "", rowname)) %>%
  setNames(nm = make.names(names = names(.))) %>%
  as.data.frame() %>%
  column_to_rownames(var = "rowname")
sampleAnnotDF <- sampleAnnotDF[colnames(countDF), ]
# TotalReads counted
sampleAnnotDF$TotalReads <- colSums(countDF)
message("print header of sample annotation")

## print header of sample annotation

print(sampleAnnotDF[1:5, 1:5])

##                           Collaborator Sample.
## DGAM_wk8_Bcells    Joana Dias/Koup Lab       7
## DGAM_wk8_cDCs      Joana Dias/Koup Lab      23
## DGAM_wk8_monocytes Joana Dias/Koup Lab       8
## DGAM_wk8_NKcells   Joana Dias/Koup Lab      25
## DGAM_wk8_pDCs      Joana Dias/Koup Lab      24
##                                       Sample.Type.Subset Study.. Monkey.ID
## DGAM_wk8_Bcells    total RNA from sorted cells in RNAzol   730.2      DGAM
## DGAM_wk8_cDCs      total RNA from sorted cells in RNAzol   730.2      DGAM
## DGAM_wk8_monocytes total RNA from sorted cells in RNAzol   730.2      DGAM
## DGAM_wk8_NKcells   total RNA from sorted cells in RNAzol   730.2      DGAM
## DGAM_wk8_pDCs      total RNA from sorted cells in RNAzol   730.2      DGAM

message("Number of animal w/ RNA-seq data")

## Number of animal w/ RNA-seq data

sampleAnnotDF %>%
  .$Monkey.ID %>%
  unique() %>%
  length() %>%
  print()

## [1] 17

message("Number of timepoints and subsets w/ RNA-seq data")

## Number of timepoints and subsets w/ RNA-seq data

sampleAnnotDF %>%
  select(Monkey.ID, Timepoint, Cell.population.sorted) %>%
  distinct() %>%
  group_by(Timepoint, Cell.population.sorted) %>%
  summarize(n = n()) %>%
  ungroup() %>%
  mutate(Timepoint = factor(Timepoint, levels = c("pre", "D14", "wk8"))) %>%
  pivot_wider(names_from = Cell.population.sorted, values_from = n) %>%
  arrange(Timepoint) %>%
  print()

## # A tibble: 3 × 6
##   Timepoint `B cells` `NK cells`  cDCs monocytes  pDCs
##   <fct>         <int>      <int> <int>     <int> <int>
## 1 pre              14         14    14        14    14
## 2 D14              10         10    10        10    10
## 3 wk8              16         16    16        16    16

Transcriptomic profiles of 17 animals for 5 subsets at 3 timepoints was performed

Read viral load

vlFile <- file.path(workDir,
                    "input/Raw_data_for_Slim_JD20200414.xlsx")
vlDF <- read_excel(path      = vlFile,
                   skip      = 2,
                   col_names = paste0("X", 1:8)) %>%
  mutate(Treatment = factor(X1,
                            levels = c("Untreated", "WT bNAb Tx", "DEL bNAb Tx")),
         Treatment = na.locf(Treatment)) %>%
  filter(X1 != Treatment & !is.na(X1))

by(vlDF, INDICES = vlDF$Treatment, FUN = function(x) {
  cNames <- as.vector(unlist(x[1, ]))
  cNames[length(cNames)] <- "Treatment"
  x <- x[-1, ]
  colnames(x) <- cNames
  x <- x %>%
    pivot_longer(cols = cNames[3:8],
                 names_to = "Animal ID",
         values_to = "Plasma viral loads (copies/mL)")
  return(value = x)
}) %>%
  do.call(what = rbind) -> vlDF

plotDF <- vlDF %>%
  select(-`day post-challenge`) %>%
  mutate(`week post-challenge` = as.numeric(`week post-challenge`),
         `Plasma viral loads (copies/mL)` =
           gsub(pattern = "below |Below ",
                replacement = "",
                `Plasma viral loads (copies/mL)`),
         `Plasma viral loads (copies/mL)` =
           as.numeric(`Plasma viral loads (copies/mL)`))

ggplot(data = plotDF,
       mapping = aes(x = `week post-challenge`,
                     y = `Plasma viral loads (copies/mL)`)) +
  geom_line(mapping = aes(group = `Animal ID`, color = Treatment), linewidth = 1.1) +
  scale_y_log10() +
  scale_color_manual(values = c("Untreated" = "grey",
                                "WT bNAb Tx" = "orange",
                "DEL bNAb Tx" = "green")) +
  theme_bw()

maxDF <- plotDF %>%
  group_by(Treatment, `Animal ID`) %>%
  summarize(max.vl = max(`Plasma viral loads (copies/mL)`))

ggplot(data = maxDF,
       mapping = aes(x = Treatment, y = max.vl)) +
  geom_boxplot(fill = "transparent") +
  geom_beeswarm(mapping = aes(color = Treatment), cex = 3, size = 2) +
  scale_y_log10() +
  labs(y = "Max plasma viral loads (copies/mL)") +
  scale_color_manual(values = c("Untreated"   = "grey",
                                "WT bNAb Tx"  = "orange",
                                "DEL bNAb Tx" = "green")) +
  theme_bw()

kruskal.test(x = maxDF$max.vl, g = maxDF$Treatment)

## 
##  Kruskal-Wallis rank sum test
## 
## data:  maxDF$max.vl and maxDF$Treatment
## Kruskal-Wallis chi-squared = 1.0743, df = 2, p-value = 0.5844

pairwise.wilcox.test(x = maxDF$max.vl, g = maxDF$Treatment, p.adjust.method = "none")

## 
##  Pairwise comparisons using Wilcoxon rank sum test with continuity correction 
## 
## data:  maxDF$max.vl and maxDF$Treatment 
## 
##             Untreated WT bNAb Tx
## WT bNAb Tx  0.81      -         
## DEL bNAb Tx 0.29      0.69      
## 
## P value adjustment method: none

# append to sample annotation
sampleAnnotDF %>%
  rownames_to_column() %>%
  merge(y = maxDF, by.x = "Monkey.ID", by.y = "Animal ID") %>%
  column_to_rownames() -> sampleAnnotDF
sampleAnnotDF <- sampleAnnotDF[colnames(countDF), ]

time2maxDF <- plotDF %>%
  arrange(desc(`week post-challenge`)) %>%
  group_by(Treatment, `Animal ID`) %>%
  summarize(time2max.vl = `week post-challenge`[which.max(`Plasma viral loads (copies/mL)`)]) %>%
  ungroup()

ggplot(data = time2maxDF,
       mapping = aes(x = Treatment, y = time2max.vl)) +
  geom_boxplot(fill = "transparent") +
  geom_beeswarm(mapping = aes(color = Treatment), cex = 3, size = 2) +
  labs(y = "Time to max plasma viral loads (weeks)") +
  scale_color_manual(values = c("Untreated"   = "grey",
                                "WT bNAb Tx"  = "orange",
                                "DEL bNAb Tx" = "green")) +
  theme_bw()

kruskal.test(x = time2maxDF$time2max.vl, g = time2maxDF$Treatment)

## 
##  Kruskal-Wallis rank sum test
## 
## data:  time2maxDF$time2max.vl and time2maxDF$Treatment
## Kruskal-Wallis chi-squared = 7.3589, df = 2, p-value = 0.02524

pairwise.wilcox.test(x = time2maxDF$time2max.vl,
                     g = time2maxDF$Treatment,
                     p.adjust.method = "none")

## 
##  Pairwise comparisons using Wilcoxon rank sum test with continuity correction 
## 
## data:  time2maxDF$time2max.vl and time2maxDF$Treatment 
## 
##             Untreated WT bNAb Tx
## WT bNAb Tx  0.033     -         
## DEL bNAb Tx 0.062     0.100     
## 
## P value adjustment method: none

sampleAnnotDF %>%
  rownames_to_column() %>%
  merge(y = select(time2maxDF, -Treatment),
        by.x = "Monkey.ID",
        by.y = "Animal ID") %>%
  column_to_rownames() -> sampleAnnotDF
sampleAnnotDF <- sampleAnnotDF[colnames(countDF), ]

Read bNAb concentration

bnabFile <- file.path(workDir,
                          "input/Raw_data_for_Slim_JD20200414.xlsx")
bnabDF <- read_excel(path      = bnabFile,
                         sheet     = 2,
                         skip      = 2,
                         col_names = paste0("X", 1:8)) %>%
  mutate(bNab = factor(X1,
               levels = c("WT bNAb Tx_[VRC07-523-LS]",
                  "WT bNAb Tx_[PGT121]",
                  "DEL bNAb Tx_[VRC07-523-LS/DEL]",
                  "DEL bNAb Tx_[PGT121/DEL]")),
     bNab = na.locf(bNab)) %>%
  filter(X1 != bNab & !is.na(X1))

by(bnabDF, INDICES = bnabDF$bNab, FUN = function(x) {
  cNames <- as.vector(unlist(x[1, ]))
  cNames[length(cNames)] <- "bNab"
  x <- x[-1, ]
  colnames(x) <- cNames
  x <- x %>%
    pivot_longer(cols = cNames[3:8],
         names_to = "Animal ID",
         values_to = "Plasma bNAb concentration (ug/mL)")
  return(value = x)
}) %>%
  do.call(what = rbind) -> bnabDF

bnabDF <- bnabDF %>%
  mutate(`Plasma bNAb concentration (ug/mL)` =
       as.numeric(`Plasma bNAb concentration (ug/mL)`),
     `week post-challenge` = as.numeric(`week post-challenge`))

ggplot(data = bnabDF,
       mapping = aes(x = `week post-challenge`,
             y = `Plasma bNAb concentration (ug/mL)`)) +
  geom_line(mapping = aes(by = `Animal ID`, color = bNab)) +
  scale_color_manual(values = c("WT bNAb Tx_[VRC07-523-LS]"      = "red",
                                        "WT bNAb Tx_[PGT121]"            = "darkblue",
                                        "DEL bNAb Tx_[VRC07-523-LS/DEL]" = "orange",
                                        "DEL bNAb Tx_[PGT121/DEL]"       = "lightblue"),
                         name = "bNAb") +
  theme_bw()

## 
##  Kruskal-Wallis rank sum test
## 
## data:  aucDF$auc and aucDF$bNab
## Kruskal-Wallis chi-squared = 19.98, df = 3, p-value = 0.0001714

statDF <- sampleAnnotDF %>%
  mutate(log10.max.vl = log10(max.vl))  %>%
  select(Monkey.ID, log10.max.vl, time2max.vl, auc.VRC07.523.LS, auc.PGT121,
     Treatment) %>%
  distinct()
message("correlation between max VL and bNAb in plasma:")

## correlation between max VL and bNAb in plasma:

cor.test(formula = ~log10.max.vl+auc.VRC07.523.LS,
     data = statDF,
     method  = "spearman")

## 
##  Spearman's rank correlation rho
## 
## data:  log10.max.vl and auc.VRC07.523.LS
## S = 261.74, p-value = 0.7933
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##        rho 
## 0.08481042

cor.test(formula = ~log10.max.vl+auc.PGT121,
     data = statDF,
     method  = "spearman")

## 
##  Spearman's rank correlation rho
## 
## data:  log10.max.vl and auc.PGT121
## S = 253.66, p-value = 0.7264
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.1130806

message("correlation between time to max VL and bNAb in plasma:")

## correlation between time to max VL and bNAb in plasma:

cor.test(formula = ~time2max.vl+auc.VRC07.523.LS,
     data = statDF,
     method  = "spearman")

## 
##  Spearman's rank correlation rho
## 
## data:  time2max.vl and auc.VRC07.523.LS
## S = 126.77, p-value = 0.06007
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.5567549

cor.test(formula = ~time2max.vl+auc.PGT121,
     data = statDF,
     method  = "spearman")

## 
##  Spearman's rank correlation rho
## 
## data:  time2max.vl and auc.PGT121
## S = 126.77, p-value = 0.06007
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.5567549

plotDF <- statDF %>%
  pivot_longer(cols = c(auc.VRC07.523.LS, auc.PGT121),
           names_to = "bNAb",
           values_to = "AUC")
ggplot(data = plotDF,
       mapping = aes(x = time2max.vl, y = AUC)) +
  geom_point(mapping = aes(color = bNAb, shape = Treatment)) +
  theme_bw()

non significant positive correlation between time to max VL and bNAb.

Read gene annotation

colNames <- c("seqname",
              "source",
              "feature",
              "start",
              "end",
              "score",
              "strand",
              "frame",
              "attribute")
colTypes <- paste(c(rep("c", times = 3),
                    rep("i", times = 2),
                    rep("c", times = 4)),
                  collapse = "")
featuresAnnotationFile <- file.path(workDir, "input/joana.genes.gtf")
skipNb <- read_lines(file = featuresAnnotationFile)
skipNb <- sum(grepl(pattern = "^#", skipNb))

featuresAnnot <- read_tsv(file      = featuresAnnotationFile,
                          skip      = skipNb,
                          col_names = colNames,
                          col_types = colTypes)
# extract gene_id and transcript_id from attributes
featAnnot <- featuresAnnot %>%
  mutate(gene_id = gsub(pattern = ".*gene_id \"([^;]+)\";.+",
                        replacement = "\\1",
                        attribute),
         gene_name = ifelse(test = grepl(pattern = "gene_name",
                                         attribute),
                            yes = gsub(pattern = ".+gene_name \"([^;]+)\";.+",
                                       replacement = "\\1",
                                       attribute),
                            no  = NA),
         gene_biotype = ifelse(test = grepl(pattern = "gene_biotype",
                                         attribute),
                            yes = gsub(pattern = ".+gene_biotype \"([^;]+)\";.*",
                                       replacement = "\\1",
                                       attribute),
                            no  = NA)) %>%
  select(seqname, strand, gene_id, gene_name, gene_biotype) %>%
  distinct() %>%
  as.data.frame()
rownames(featAnnot) <- featAnnot$gene_id
featAnnot <- featAnnot[rownames(countDF), ]
cat("Number of genes annotated")

## Number of genes annotated

featAnnot %>%
    group_by(gene_biotype) %>%
    summarize(n = n()) %>%
    arrange(desc(n))

## # A tibble: 19 × 2
##    gene_biotype             n
##    <chr>                <int>
##  1 protein_coding       21356
##  2 lncRNA                4596
##  3 misc_RNA              3385
##  4 snRNA                 1839
##  5 snoRNA                1315
##  6 pseudogene             666
##  7 Y_RNA                  661
##  8 miRNA                  622
##  9 rRNA                   103
## 10 IG_V_gene               87
## 11 processed_pseudogene    83
## 12 TR_V_gene               46
## 13 scaRNA                  43
## 14 TR_J_gene               21
## 15 TR_C_gene                8
## 16 vaultRNA                 7
## 17 ribozyme                 4
## 18 IG_C_gene                3
## 19 IG_D_gene                2

Build raw SeqExpressionSet

# build  raw ExpressionSet
esetRaw <- newSeqExpressionSet(counts      = as.matrix(countDF),
                               featureData = AnnotatedDataFrame(featAnnot),
                               phenoData   = AnnotatedDataFrame(sampleAnnotDF))
save(esetRaw, file = file.path(workDir, "output/joana.esetRaw.RData"))
print(esetRaw)

## SeqExpressionSet (storageMode: lockedEnvironment)
## assayData: 34847 features, 200 samples 
##   element names: counts, normalizedCounts, offset 
## protocolData: none
## phenoData
##   sampleNames: DGAM_wk8_Bcells DGAM_wk8_cDCs ... HZ9_wk8_pDCs (200
##     total)
##   varLabels: Monkey.ID Collaborator ... auc.PGT121 (32 total)
##   varMetadata: labelDescription
## featureData
##   featureNames: ENSMMUG00000000001 ENSMMUG00000000002 ...
##     ENSMMUG00000065351 (34847 total)
##   fvarLabels: seqname strand ... gene_biotype (5 total)
##   fvarMetadata: labelDescription
## experimentData: use 'experimentData(object)'
## Annotation:

Barplot with read alignment statistics

## number of samples with more than 2e06 counted reads

## # A tibble: 2 × 2
##   over10e6 `n()`
##   <lgl>    <int>
## 1 FALSE       25
## 2 TRUE       175

All samples except 25 have more than 2 million reads and can be used for diff. expression analysis.

Barplot with mapped reads statistics

plotDF <- counts(esetRaw) %>%
  as.data.frame() %>%
  rownames_to_column() %>%
  merge(y    = select(fData(esetRaw), gene_id, gene_biotype),
        by.x = "rowname",
        by.y = "gene_id") %>%
  mutate(gene_biotype = ifelse(test = grepl(pattern = "pseudogene",
                                            gene_biotype),
                               yes = "pseudogene",
                               no  = gene_biotype),
         gene_biotype = ifelse(test = gene_biotype %in% 
                                 c("miRNA", "lincRNA", "snoRNA",
                                   "sRNA", "snRNA", "vaultRNA",
                                   "3prime_overlapping_ncrna",
                                   "macro_lncRNA", "scaRNA"),
                               yes = "ncrna",
                               no = gene_biotype),
         gene_biotype = ifelse(test = gene_biotype %in% 
                                 c("antisense",
                                   "ncRNA",
                                   "Mt_rRNA",
                                   "Mt_tRNA",
                                   "protein_coding",
                                   "pseudogene",
                                   "ribozyme",
                                   "rRNA"),
                               yes = gene_biotype,
                               no  = "other")) %>%
  gather(sample, value, -rowname, -gene_biotype) %>%
  group_by(sample, gene_biotype) %>%
  summarize(eta = sum(value),
            mu  = mean(value),
            q2  = median(value)) %>%
  ungroup() %>%
  arrange(!grepl("Water", `sample`)) %>% 
  mutate(`sample` = factor(`sample`, levels = unique(`sample`)))

ggplot(data = plotDF,
       mapping = aes(x = sample, y = eta, fill = gene_biotype)) +
  geom_bar(stat = "identity") +
  scale_y_continuous(expand = c(0, 0)) +
  labs(x = "Samples", y = "Number of reads") +
  theme_bw() +
  theme(axis.text.x = element_blank(),
        axis.ticks.x = element_blank(),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank())

message("percentage of reads that are from protein coding")

## percentage of reads that are from protein coding

plotDF %>% 
  group_by(sample) %>%
  summarize(n = sum(eta)) %>%
  merge(y = plotDF) %>%
  filter(gene_biotype %in% "protein_coding") %>%
  mutate(perc_prot_cod = eta/n * 100) %>%
  summarize(type = "perc_prot_coding",
            min  = min(.$perc_prot_cod, na.rm = TRUE),
            max  = max(.$perc_prot_cod, na.rm = TRUE))

##               type      min      max
## 1 perc_prot_coding 76.30473 98.93777

As expected, most counted reads are mapped to protein coding genes.

Principal variance component analysis (raw)

esetTemp <- ExpressionSet(assayData = log2(counts(esetRaw) + 0.25),
                          phenoData = AnnotatedDataFrame(data = pData(esetRaw)))

esetTemp$TotalReadsQ3 <- cut(esetTemp$TotalReads,
                             breaks         = quantile(esetTemp$TotalReads,
                                                       probs = c(0,0.33, 0.66,1)),
                             include.lowest = TRUE,
                             right          = TRUE)
esetTemp$NumberCellsQ3 <- cut(esetTemp$Number.of.Cells,
                              breaks         = c(0, 1500, 3000, 5000),
                              include.lowest = TRUE,
                              right          = TRUE)

fit <- PVCA(counts = exprs(esetTemp),
            meta = pData(esetTemp)[, c("Timepoint",
                                       "Treatment",
                             "Monkey.ID",
                             "Cell.population.sorted",
                             "NumberCellsQ3",
                             "TotalReadsQ3")],
     threshold = 0.6,
     inter = FALSE)

plotDF <- data.frame(effect = as.vector(fit),
                     label  = names(fit)) %>%
  arrange(effect) %>%
  mutate(label = factor(label, levels = label))
ggplot(data = plotDF,
       mapping = aes(x = label, y = effect)) +
  geom_bar(stat = "identity") +
  labs(x = NULL, y = "Explained effect") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))

Subset and total number of reads are the main drivers of the raw gene-expression.

Multidimensional scaling plot based on raw counts

rawMat <- counts(esetRaw)
mat <- rawMat[rowSums(rawMat) > 0, ]
mat <- log2(mat + 0.25)
distMat <- parallelDist(t(mat), threads = detectCores() - 1)
attributes(distMat)$Labels  <-  colnames(mat)
fit <- cmdscale(distMat, k = 2, eig = TRUE)
plotDF <- as.data.frame(fit$points)
plotDF <- plotDF %>%
    rownames_to_column() %>%
    merge(y = rownames_to_column(pData(esetRaw)),
          by = "rowname")
ggplot(data    = plotDF,
                   mapping = aes(x     = V1,
                                 y     = V2,
                 shape = Cell.population.sorted,
                                 color = TotalReads)) +
  geom_point(size = 4) +
  labs(x = paste0("1st dimension (",
                        round((fit$eig/sum(fit$eig))[1] * 100),
                        "%)"),
                    y = paste0("2nd dimension (",
                        round((fit$eig/sum(fit$eig))[2] * 100),
                        "%)")) +
  scale_color_gradient(low = "lightgreen", high = "darkgreen") +
  theme_bw()

Total number of reads counted per sample is indeed the main driver of gene-expression.

Normalize (TMM) count data

# remove low counts samples
esetTemp <- esetRaw[, esetRaw$TotalReads >= 2.1e6]
dge <- DGEList(counts       = counts(esetTemp),
               remove.zeros = TRUE)
# remove low expressed genes
flag <- filterByExpr(dge, group = esetRaw$Cell.population.sorted)
dge <- dge[flag, , keep.lib.sizes = FALSE]
dge <- calcNormFactors(object = dge, method = "TMM")
normalizedCounts <- cpm(dge, normalized.lib.sizes = TRUE, log = TRUE)
eset <-  newSeqExpressionSet(
    counts           = dge$counts,
    normalizedCounts = as.matrix(normalizedCounts),
    featureData      = fData(esetTemp)[rownames(normalizedCounts), ],
    phenoData        = pData(esetTemp))
save(eset, file = file.path(workDir, "output/joana.eset.RData"))
print(eset)

## SeqExpressionSet (storageMode: lockedEnvironment)
## assayData: 13901 features, 175 samples 
##   element names: counts, normalizedCounts, offset 
## protocolData: none
## phenoData
##   sampleNames: DGAM_wk8_Bcells DGAM_wk8_pDCs ... HZ9_wk8_pDCs (175
##     total)
##   varLabels: Monkey.ID Collaborator ... auc.PGT121 (32 total)
##   varMetadata: labelDescription
## featureData
##   featureNames: ENSMMUG00000000001 ENSMMUG00000000002 ...
##     ENSMMUG00000065345 (13901 total)
##   fvarLabels: seqname strand ... gene_biotype (5 total)
##   fvarMetadata: labelDescription
## experimentData: use 'experimentData(object)'
## Annotation:

Principal variance component analysis (normalized)

esetTemp <- ExpressionSet(assayData = normCounts(eset),
                          phenoData = AnnotatedDataFrame(data = pData(eset)))
esetTemp$TotalReadsQ3 <- cut(esetTemp$TotalReads,
                             breaks         = quantile(esetTemp$TotalReads,
                                                       probs = c(0,0.33, 0.66,1)),
                         include.lowest = TRUE,
                         right          = TRUE)
fit <- PVCA(counts = exprs(esetTemp),
            meta = pData(esetTemp)[, c("Timepoint",
                                       "Treatment",
                             "Monkey.ID",
                             "Cell.population.sorted",
                             "TotalReadsQ3")],
                           threshold = 0.6,
                           inter = FALSE)
plotDF <- data.frame(effect = as.vector(fit),
                     label  = names(fit)) %>%
  arrange(effect) %>%
  mutate(label = factor(label, levels = unique(label)))
ggplot(data = plotDF,
       mapping = aes(x = label, y = effect)) +
  geom_bar(stat = "identity") +
  labs(x = NULL, y = "Explained effect") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))

After normalization (for total number of reads counted per samples), subset remain the main drivers of expression.

Multidimensional scaling plot on normalized counts

# mds normalized/cpm counts
mat <- normCounts(eset)
distMat <- parallelDist(t(mat), threads = detectCores() - 1)
attributes(distMat)$Labels  <- colnames(mat)
fit <- cmdscale(distMat, k = 2, eig = TRUE)
plotDF <- as.data.frame(fit$points) %>%
  rownames_to_column() %>%
  merge(y = rownames_to_column(pData(eset)),
        by = "rowname")
plotLabel <- round((fit$eig/sum(fit$eig)) * 100)
plotLabel <- plotLabel[1:2]
plotLabel <- paste0(c("1st dimension (", "2nd dimension ("),
                    plotLabel,
                    "%)")
ggplot(data = plotDF,
       mapping = aes(x = V1,
                     y = V2,
                     color = Cell.population.sorted,
                     shape = Treatment)) +
  geom_point(size = 4) +
  labs(x = plotLabel[[1]],
       y = plotLabel[[2]]) +
  theme_bw() +
  theme(legend.key = element_blank(),
        plot.title = element_text(hjust = 0.5))

message("Potential mislabelled samples")

## Potential mislabelled samples

filter(plotDF,
(Cell.population.sorted %in% "B cells" &
   V1 < 0) |
  (Cell.population.sorted %in% "monocytes" &
     V1 >   0)) %>%
  print()

##             rowname         V1       V2 Monkey.ID        Collaborator Sample.
## 1    HH7_wk8_Bcells -120.70289 46.36394       HH7 Joana Dias/Koup Lab       4
## 2 HH7_wk8_monocytes   79.50156  2.87096       HH7 Joana Dias/Koup Lab       5
##                      Sample.Type.Subset Study.. Timepoint
## 1 total RNA from sorted cells in RNAzol   730.2       wk8
## 2 total RNA from sorted cells in RNAzol   730.2       wk8
##   Cell.population.sorted Sample.Well.or.tube.label Number.of.Cells
## 1                B cells                        D1            5000
## 2              monocytes                        E1            2240
##   Sample.Volume.ul. Method.to.determine.RNA.conc. RIN
## 1                10                            NA  NA
## 2                10                            NA  NA
##                      Library.Preparation.Method Library.Elution.volume i7.index
## 1 NEB Next Ultra II RNA Library Preparation Kit                     NA      P37
## 2 NEB Next Ultra II RNA Library Preparation Kit                     NA      P49
##   Index.Primer.sequence Final.library.conc...nM. Final.avg.library.size..bp.
## 1              AGACCGTA                       NA                         430
## 2              ACGGTCTT                       NA                         430
##   Date.Received Location.VRC Illumina.Instrument Flowcell.ID Lane.sequenced
## 1            NA           NA          HiSeq 4000   HGGCYBBXY              1
## 2            NA           NA          HiSeq 4000   HGGCYBBXY              1
##   Date.Sequenced Notes   Plate TotalReads Treatment  max.vl time2max.vl
## 1     2020-03-25    NA Plate 1    8821941 Untreated 3.3e+07           4
## 2     2020-03-25    NA Plate 1    3877393 Untreated 3.3e+07           4
##   auc.VRC07.523.LS auc.PGT121
## 1               NA         NA
## 2               NA         NA

After normalization (for total number of reads counted per samples), subset remain the main driver of expression.
Potential swap was detected for animal HH7 between wk8 B cells and Monocytes.

Jitter plot of genes coding for cell surface markers of B cells and Monocytes

geneLS <- c("CD14", "MS4A1")
plotDF <- counts(eset[fData(eset)$gene_name %in% geneLS, ]) %>%
  as.data.frame() %>%
  rownames_to_column() %>%
  merge(y = select(fData(eset), gene_id, gene_name),
    by.x = "rowname",
    by.y = "gene_id") %>%
  select(-rowname) %>%
  gather(SampleID, count, -gene_name) %>%
  merge(y = select(rownames_to_column(pData(eset)),
           rowname,
           Cell.population.sorted),
    by.x = "SampleID",
    by.y = "rowname") %>%
  mutate(value = pmax(count, 0.1))
# swaps
swapLS <- c("HH7_wk8_Bcells", "HH7_wk8_monocytes")
swapDF <- filter(plotDF, SampleID %in% swapLS)
ggplot(data = filter(plotDF, count > 0),
          mapping = aes(x = Cell.population.sorted, y = count)) +
  geom_boxplot(outlier.color = "transparent") +
  geom_beeswarm(mapping = aes(color = Cell.population.sorted)) +
  geom_text_repel(data    = swapDF,
                      mapping = aes(label = SampleID)) +
  scale_y_log10() +
  facet_wrap(facet = ~gene_name, scale = "free", nrow = 3) +
  theme_bw() +
  theme(axis.text  = element_text(size = 5),
    legend.pos = "none")

Principal variance component analysis per subset

plotDF <- NULL
for (SUBSET in unique(eset$Cell.population.sorted)) {
  esetTemp <- eset[, eset$Cell.population.sorted %in% SUBSET]
  esetTemp <- ExpressionSet(assayData = normCounts(esetTemp),
                phenoData = AnnotatedDataFrame(data = pData(esetTemp)))
  fit <- PVCA(counts = exprs(esetTemp),
          meta = pData(esetTemp)[, c("Timepoint",
                     "Treatment",
                     "Monkey.ID")],
                           threshold = 0.6,
                           inter = FALSE)
  plotTemp <- data.frame(effect = as.vector(fit),
             label  = names(fit)) %>%
    arrange(effect) %>%
    mutate(label = factor(label, levels = unique(label)),
       Cell.population.sorted = SUBSET)
  plotDF <- rbind(plotDF, plotTemp)
}

ggplot(data = plotDF,
       mapping = aes(x = label, y = effect)) +
  geom_bar(stat = "identity") +
  labs(x = NULL, y = "Explained effect") +
  theme_bw() +
  facet_wrap(facets = ~Cell.population.sorted) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))

Donor effect is observed in all subsets.

Substract pre expression

## SeqExpressionSet (storageMode: lockedEnvironment)
## assayData: 13901 features, 82 samples 
##   element names: counts, normalizedCounts, offset 
## protocolData: none
## phenoData
##   sampleNames: DGBG_D14_Bcells DGBG_wk8_Bcells ... HZ9_wk8_pDCs (82
##     total)
##   varLabels: Monkey.ID Collaborator ... auc.PGT121 (32 total)
##   varMetadata: labelDescription
## featureData
##   featureNames: ENSMMUG00000000001 ENSMMUG00000000002 ...
##     ENSMMUG00000065345 (13901 total)
##   fvarLabels: seqname strand ... gene_biotype (5 total)
##   fvarMetadata: labelDescription
## experimentData: use 'experimentData(object)'
## Annotation:

Principal variance component analysis per subset (on baselined expression)

plotDF <- NULL
for (SUBSET in unique(esetBaselined$Cell.population.sorted)) {
  esetTemp <- esetBaselined[, esetBaselined$Cell.population.sorted %in% SUBSET]
  esetTemp <- ExpressionSet(assayData = normCounts(esetTemp),
                            phenoData = AnnotatedDataFrame(data = pData(esetTemp)))
  fit <- PVCA(counts = exprs(esetTemp),
              meta = pData(esetTemp)[, c("Timepoint",
                                         "Treatment",
                                         "Monkey.ID")],
                           threshold = 0.6,
                           inter = FALSE)
  plotTemp <- data.frame(effect = as.vector(fit),
                         label  = names(fit)) %>%
    arrange(effect) %>%
    mutate(label = factor(label, levels = unique(label)),
           Cell.population.sorted = SUBSET)
  plotDF <- rbind(plotDF, plotTemp)
}

ggplot(data = plotDF,
       mapping = aes(x = label, y = effect)) +
  geom_bar(stat = "identity") +
  labs(x = NULL, y = "Explained effect") +
  theme_bw() +
  facet_wrap(facets = ~Cell.population.sorted) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))

Treatment effect is better observed on pre-post expression.

Multidimensional scalling plot on baselined counts by subsets

plotDF <- NULL
for (SUBSET in unique(esetBaselined$Cell.population.sorted)) {
  # mds normalized/cpm counts
  esetTemp <- esetBaselined[, esetBaselined$Cell.population.sorted %in% SUBSET]

  mat <- normCounts(esetTemp)
  distMat <- parallelDist(t(mat), threads = detectCores() - 1)
  attributes(distMat)$Labels  <- colnames(mat)
  fit <- cmdscale(distMat, k = 2, eig = TRUE)
  as.data.frame(fit$points) %>%
    rownames_to_column() %>%
    merge(y = rownames_to_column(pData(eset)),
          by = "rowname") %>%
    rbind(plotDF, .) -> plotDF
}

ggplot(data = plotDF,
                   mapping = aes(x = V1,
                                 y = V2,
                                 color = Timepoint,
                                 shape = Treatment)) +
  geom_point(size = 4) +
  facet_wrap(facets = ~Cell.population.sorted, scale = "free") +
  theme_bw() +
  theme(legend.key = element_blank(),
        plot.title = element_text(hjust = 0.5))

Differential expression between timepoints (paired)

message("top 50 genes based on p-values (all FDR<=5%")

## top 50 genes based on p-values (all FDR<=5%

fits <- list()
# vaccine effect
for (SUBSET in unique(eset$Cell.population.sorted)) {
  esetTemp <- eset[, eset$Cell.population.sorted %in% SUBSET]
  dge <- DGEList(counts  = counts(esetTemp),
                 samples = pData(esetTemp),
                 genes   = fData(esetTemp))
  goi <- interaction(gsub(pattern     = " .+",
                          replacement = "",
                          esetTemp$Treatment),
                     esetTemp$Timepoint,
                     drop = TRUE)
  donor <- esetTemp$Monkey.ID
  designMat <- model.matrix(~0+goi) 
  rownames(designMat) <- sampleNames(esetTemp)
  colnames(designMat) <- gsub(pattern     = "goi",
                              replacement = "",
                              colnames(designMat))
  attr(designMat, "assign") <- attr(designMat, "contrasts") <- NULL
  v <- voom(counts = dge, design = designMat)
  corfit <- duplicateCorrelation(v$E,
                                 design = designMat,
                                 block  = donor)
  fit <- lmFit(v,
               design      = designMat,
               block       = donor,
               correlation = corfit$consensus)
  contrastLS <- grep(pattern = "pre", colnames(fit), value = TRUE, invert = TRUE) %>%
    paste0(., "-", gsub(pattern = "\\..+", replacement = ".pre", .))
  contrastMat <- makeContrasts(contrasts = contrastLS, levels = fit$design)
  fit2 <- contrasts.fit(fit = fit, contrasts = contrastMat)
  fit2 <- eBayes(fit = fit2)
  fits[[paste0(SUBSET, "_vax")]] <- list(fit = fit, fit2 = fit2)
}
# for NK and Monocytes of DEL at wk8 compare low VL vs max VL
for (SUBSET in setdiff(eset$Cell.population.sorted, "cDCs")) {
  esetTemp <- eset[, eset$Cell.population.sorted %in% SUBSET &
                   eset$Treatment != "Untreated"]
  dge <- DGEList(counts  = counts(esetTemp),
                 samples = pData(esetTemp),
                 genes   = fData(esetTemp))
  goi <- ifelse(test = esetTemp$max.vl >= 1e04,
        yes  = "highMaxVL",
        no   = "lowMaxVL") %>%
      interaction(gsub(pattern = " .+",
                       replacement = "",
                       esetTemp$Treatment),
                  esetTemp$Timepoint,
                  .)
  designMat <- model.matrix(~0+goi) 
  rownames(designMat) <- sampleNames(esetTemp)
  colnames(designMat) <- gsub(pattern     = "goi",
                              replacement = "",
                              colnames(designMat))
  attr(designMat, "assign") <- attr(designMat, "contrasts") <- NULL
  v <- voom(counts = dge, design = designMat)
  fit <- lmFit(v, design      = designMat)
  contrastLS <- grep(pattern = "highMaxVL", colnames(fit), value = TRUE) %>%
      paste0(.,
             "-",
             gsub(pattern = "high", replacement = "low", .))

  contrastMat <- makeContrasts(contrasts = contrastLS,
                   levels = fit$design)
  fit2 <- contrasts.fit(fit = fit, contrasts = contrastMat)
  fit2 <- eBayes(fit = fit2)
  fits[[paste0(SUBSET, "_maxVL")]] <- list(fit = fit, fit2 = fit2)  
}

# for NK and Monocytes of DEL at wk8 compare low VL vs max VL
for (SUBSET in setdiff(esetBaselined$Cell.population.sorted, "cDCs")) {
  esetTemp <- esetBaselined[, esetBaselined$Cell.population.sorted %in% SUBSET &
                   esetBaselined$Treatment %in% "DEL bNAb Tx"]
  goi <- ifelse(test = esetTemp$max.vl >= 1e04,
        yes  = "highMaxVL",
        no   = "lowMaxVL") %>%
      interaction(gsub(pattern = " .+",
                       replacement = "",
                       esetTemp$Treatment),
                  esetTemp$Timepoint,
                  .)
  designMat <- model.matrix(~0+goi) 
  rownames(designMat) <- sampleNames(esetTemp)
  colnames(designMat) <- gsub(pattern     = "goi",
                              replacement = "",
                              colnames(designMat))
  attr(designMat, "assign") <- attr(designMat, "contrasts") <- NULL
  fit <- lmFit(normCounts(esetTemp), design      = designMat)
  contrastLS <- grep(pattern = "highMaxVL", colnames(fit), value = TRUE) %>%
      paste0(.,
             "-",
             gsub(pattern = "high", replacement = "low", .))

  contrastMat <- makeContrasts(contrasts = contrastLS,
                   levels = fit$design)
  fit2 <- contrasts.fit(fit = fit, contrasts = contrastMat)
  fit2 <- eBayes(fit = fit2)
  fits[[paste0(SUBSET, "_baselined_maxVL")]] <- list(fit = fit, fit2 = fit2)  
}

# save fits
save(fits, file = file.path(workDir, "output/joana.fits.RData"))

# print number of genes differently expressed
degNbDF <- NULL 
for (modelName in names(fits)) {
  fit2 <- fits[[modelName]][["fit2"]]
  for (coefName in colnames(fit2)) {
    top <- topTable(fit = fit2, coef = coefName, number = Inf) %>%
      as.data.frame()
    if (!("gene_id" %in% names(top))) {
      top <- top %>%
      rownames_to_column(var = "gene_id") %>%
      merge(y = select(fData(eset), gene_id, gene_name, gene_biotype), by = "gene_id")
    }
    top %>%
      filter(gene_biotype %in% "protein_coding") %>%
      group_by(sign(logFC)) %>%
      summarize(p     = sum(P.Value <= 0.05),
                adj.p = sum(adj.P.Val <= 0.05)) %>%
      mutate(modelName = modelName,
             contrast = coefName) %>%
      rbind(degNbDF) -> degNbDF
  }
}

degNbDF %>%
  pivot_longer(cols = c(-modelName, -contrast, -`sign(logFC)`),
               names_to = "cname",
               values_to = "n") %>%
  mutate(`sign(logFC)` = c("-1" = "DN", "1" = "UP")[as.character(`sign(logFC)`)],
         cname         = paste0(cname, `sign(logFC)`)) %>%
  select(-`sign(logFC)`) %>%
  pivot_wider(names_from = cname, values_from = n) %>%
  mutate(p = paste0(pUP, "/", pDN),
         adj.p = paste0(adj.pUP, "/", adj.pDN)) %>%
  select(modelName, contrast, p, adj.p) %>%
  as.data.frame() %>%
  kable()

modelName	contrast	p	adj.p
monocytes_baselined_maxVL	DEL.wk8.highMaxVL-DEL.wk8.lowMaxVL	866/619	134/8
monocytes_baselined_maxVL	DEL.D14.highMaxVL-DEL.D14.lowMaxVL	269/413	0/0
pDCs_baselined_maxVL	DEL.wk8.highMaxVL-DEL.wk8.lowMaxVL	694/367	0/0
pDCs_baselined_maxVL	DEL.D14.highMaxVL-DEL.D14.lowMaxVL	154/126	0/0
NK cells_baselined_maxVL	DEL.wk8.highMaxVL-DEL.wk8.lowMaxVL	715/785	2/0
NK cells_baselined_maxVL	DEL.D14.highMaxVL-DEL.D14.lowMaxVL	256/420	0/0
B cells_baselined_maxVL	DEL.wk8.highMaxVL-DEL.wk8.lowMaxVL	396/257	0/0
B cells_baselined_maxVL	DEL.D14.highMaxVL-DEL.D14.lowMaxVL	292/403	0/0
monocytes_maxVL	WT.wk8.highMaxVL-WT.wk8.lowMaxVL	412/637	4/56
monocytes_maxVL	DEL.wk8.highMaxVL-DEL.wk8.lowMaxVL	599/928	34/9
monocytes_maxVL	WT.pre.highMaxVL-WT.pre.lowMaxVL	341/133	0/0
monocytes_maxVL	DEL.pre.highMaxVL-DEL.pre.lowMaxVL	449/255	0/0
monocytes_maxVL	WT.D14.highMaxVL-WT.D14.lowMaxVL	1310/675	35/35
monocytes_maxVL	DEL.D14.highMaxVL-DEL.D14.lowMaxVL	238/190	0/0
NK cells_maxVL	WT.wk8.highMaxVL-WT.wk8.lowMaxVL	72/306	0/2
NK cells_maxVL	DEL.wk8.highMaxVL-DEL.wk8.lowMaxVL	868/671	0/1
NK cells_maxVL	WT.pre.highMaxVL-WT.pre.lowMaxVL	248/225	0/0
NK cells_maxVL	DEL.pre.highMaxVL-DEL.pre.lowMaxVL	851/351	0/1
NK cells_maxVL	WT.D14.highMaxVL-WT.D14.lowMaxVL	978/662	7/16
NK cells_maxVL	DEL.D14.highMaxVL-DEL.D14.lowMaxVL	158/167	0/0
pDCs_maxVL	WT.wk8.highMaxVL-WT.wk8.lowMaxVL	287/586	10/9
pDCs_maxVL	DEL.wk8.highMaxVL-DEL.wk8.lowMaxVL	893/314	1/0
pDCs_maxVL	WT.pre.highMaxVL-WT.pre.lowMaxVL	150/418	0/0
pDCs_maxVL	DEL.pre.highMaxVL-DEL.pre.lowMaxVL	198/291	0/0
pDCs_maxVL	WT.D14.highMaxVL-WT.D14.lowMaxVL	926/350	75/15
pDCs_maxVL	DEL.D14.highMaxVL-DEL.D14.lowMaxVL	112/165	0/0
B cells_maxVL	WT.wk8.highMaxVL-WT.wk8.lowMaxVL	166/388	0/2
B cells_maxVL	DEL.wk8.highMaxVL-DEL.wk8.lowMaxVL	377/525	0/1
B cells_maxVL	WT.pre.highMaxVL-WT.pre.lowMaxVL	319/261	0/0
B cells_maxVL	DEL.pre.highMaxVL-DEL.pre.lowMaxVL	401/103	0/0
B cells_maxVL	WT.D14.highMaxVL-WT.D14.lowMaxVL	465/629	2/9
B cells_maxVL	DEL.D14.highMaxVL-DEL.D14.lowMaxVL	312/359	0/0
cDCs_vax	WT.wk8-WT.pre	254/150	0/0
cDCs_vax	Untreated.wk8-Untreated.pre	880/272	22/2
cDCs_vax	DEL.wk8-DEL.pre	236/419	1/0
cDCs_vax	WT.D14-WT.pre	345/303	0/0
cDCs_vax	Untreated.D14-Untreated.pre	1079/1585	290/413
cDCs_vax	DEL.D14-DEL.pre	270/302	0/0
monocytes_vax	WT.wk8-WT.pre	405/337	0/0
monocytes_vax	Untreated.wk8-Untreated.pre	557/871	0/0
monocytes_vax	DEL.wk8-DEL.pre	426/983	2/2
monocytes_vax	WT.D14-WT.pre	550/674	0/0
monocytes_vax	Untreated.D14-Untreated.pre	633/431	0/0
monocytes_vax	DEL.D14-DEL.pre	454/571	17/2
NK cells_vax	WT.wk8-WT.pre	398/405	0/0
NK cells_vax	Untreated.wk8-Untreated.pre	535/406	0/0
NK cells_vax	DEL.wk8-DEL.pre	834/1199	87/67
NK cells_vax	WT.D14-WT.pre	564/1602	71/180
NK cells_vax	Untreated.D14-Untreated.pre	935/371	37/2
NK cells_vax	DEL.D14-DEL.pre	773/1189	169/67
pDCs_vax	WT.wk8-WT.pre	478/684	0/2
pDCs_vax	Untreated.wk8-Untreated.pre	125/177	0/0
pDCs_vax	DEL.wk8-DEL.pre	272/348	0/0
pDCs_vax	WT.D14-WT.pre	721/710	24/4
pDCs_vax	Untreated.D14-Untreated.pre	337/246	1/0
pDCs_vax	DEL.D14-DEL.pre	515/275	18/2
B cells_vax	WT.wk8-WT.pre	268/237	0/0
B cells_vax	Untreated.wk8-Untreated.pre	574/1224	0/0
B cells_vax	DEL.wk8-DEL.pre	387/231	5/1
B cells_vax	WT.D14-WT.pre	427/377	0/0
B cells_vax	Untreated.D14-Untreated.pre	391/197	0/0
B cells_vax	DEL.D14-DEL.pre	519/367	35/2

Heatmaps with DEGs

for (modelName in names(fits)) {
  fit2 <- fits[[modelName]][["fit2"]]
  for (coefName in colnames(fit2)) {
    top <- topTable(fit = fit2, coef = coefName, number = Inf) %>%
      as.data.frame()
    if (!("gene_id" %in% names(top))) {
      top <- top %>%
      rownames_to_column(var = "gene_id") %>%
      merge(y = select(fData(eset), gene_id, gene_name, gene_biotype), by = "gene_id")
    }
    top <- top %>%
      filter(!is.na(gene_name) &
           gene_biotype %in% "protein_coding" &
           adj.P.Val <= 0.05)
    if (nrow(top) > 0) {
      top <- top %>%
        top_n(50, wt = abs(t)) %>%
        arrange(desc(logFC))
      sampleLS <- which(fit2$contrast[, coefName] != 0) %>%
        fit2$design[, .] %>%
        as.data.frame() %>%
        rownames_to_column() %>%
        gather(group, value, -rowname) %>%
        group_by(rowname) %>%
        summarize(value = sum(value)) %>%
        filter(value > 0) %>%
        .$rowname
      mat <- normCounts(eset)[top$gene_id, sampleLS, drop = FALSE] %>%
        t() %>%
        scale() %>%
        t()
      mat <- mat[complete.cases(mat), , drop = FALSE]
      colorLS <- colorRampPalette(colors = c("blue", "white", "red"))(n = 100)
      breakLS <- c(-1 * max(abs(mat), na.rm = TRUE),
                   seq(from = -1 * min(abs(range(mat, na.rm = TRUE))),
                       to   = min(abs(range(mat, na.rm = TRUE))),
                       length.out = 99),
                   max(abs(mat), na.rm = TRUE))
      matAnnot <- pData(eset) %>%
        rownames_to_column() %>%
        mutate(log10.max.vl = log10(max.vl)) %>%
        select(rowname, Timepoint, Treatment, Cell.population.sorted,
               log10.max.vl, time2max.vl, auc.VRC07.523.LS, auc.PGT121) %>%
        column_to_rownames()
      matAnnot <- matAnnot[colnames(mat), ]
      matAnnot <- matAnnot[, colMeans(is.na(matAnnot)) < 1]
      ha <- HeatmapAnnotation(df = matAnnot)
      heat <- Heatmap(matrix         = mat,
                      width          = unit(0.5, units = "npc"),
                      height         = unit(0.1+0.6*(nrow(mat)/50), units = "npc"),
                      top_annotation = ha,
                      name           = "zscore",
                      column_title   = paste0(modelName, ": ", coefName),
                      row_names_gp   = gpar(fontsize = 7),
                      column_labels  = pData(eset)[colnames(mat), "Monkey.ID"],
              row_labels     = top$gene_name)
      print(heat)
    }
  }
}

Heatmaps with all DEGs across time by subset

message("FDR cutoff of 5%")

## FDR cutoff of 5%

for (modelName in grep(pattern = "vax", names(fits), value = TRUE)) {
  fit2 <- fits[[modelName]][["fit2"]]
  top <- topTableF(fit2, number = Inf, p.value = 0.05) %>%
    as.data.frame()
  if (!("gene_id" %in% names(top))) {
      top <- top %>%
      rownames_to_column(var = "gene_id") %>%
      merge(y = select(fData(eset), gene_id, gene_name, gene_biotype), by = "gene_id")
  }
  top <- top %>%
    filter(gene_biotype %in% "protein_coding")
  if (nrow(top) > 0) {
    top <- top %>%
      arrange(desc(WT.D14.WT.pre)) %>%
      mutate(gene_name = ifelse(test = is.na(gene_name),
                yes  = gene_id,
                no   = gene_name))
    sampleLS <- rownames(fit2$design)
    sampleLS <- intersect(sampleLS, sampleNames(esetBaselined))
    mat <- normCounts(esetBaselined)[top$gene_id, sampleLS, drop = FALSE]
      
    mat <- mat[complete.cases(mat), ]
    matAnnot <- pData(eset) %>%
      rownames_to_column() %>%
      mutate(log10.max.vl = log10(max.vl)) %>%
      select(rowname, Timepoint, Treatment, Cell.population.sorted,
         log10.max.vl, time2max.vl, auc.VRC07.523.LS, auc.PGT121) %>%
      column_to_rownames()
    matAnnot <- matAnnot[colnames(mat), ]
    ha <- HeatmapAnnotation(df = matAnnot)
    splitCols <- pData(eset)[colnames(mat),
                 c("Timepoint", "Treatment")] %>%
      apply(MARGIN = 1, FUN = paste, collapse = ".") 
    heat <- Heatmap(matrix         = mat,
            width          = unit(0.5, units = "npc"),
            height         = unit(0.75, units = "npc"),
            top_annotation = ha,
            name           = "log2FC",
            column_title   = modelName,
            show_row_names = FALSE,
            column_labels  = pData(eset)[colnames(mat),
                         "Monkey.ID"],
            column_split   = splitCols)
    print(heat)
  }
}

## topTableF is obsolete and will be removed in a future version of limma. Please considering using topTable instead.

## topTableF is obsolete and will be removed in a future version of limma. Please considering using topTable instead.

## topTableF is obsolete and will be removed in a future version of limma. Please considering using topTable instead.

## topTableF is obsolete and will be removed in a future version of limma. Please considering using topTable instead.

## topTableF is obsolete and will be removed in a future version of limma. Please considering using topTable instead.

sessionInfo()

## R version 4.3.2 (2023-10-31)
## Platform: aarch64-apple-darwin23.0.0 (64-bit)
## Running under: macOS Sonoma 14.3
## 
## Matrix products: default
## BLAS:   /opt/homebrew/Cellar/openblas/0.3.26/lib/libopenblasp-r0.3.26.dylib 
## LAPACK: /opt/homebrew/Cellar/r/4.3.2/lib/R/lib/libRlapack.dylib;  LAPACK version 3.11.0
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: America/Chicago
## tzcode source: internal
## 
## attached base packages:
##  [1] grid      parallel  stats4    stats     graphics  grDevices utils    
##  [8] datasets  methods   base     
## 
## other attached packages:
##  [1] lubridate_1.9.3             forcats_1.0.0              
##  [3] stringr_1.5.1               dplyr_1.1.4                
##  [5] purrr_1.0.2                 readr_2.1.5                
##  [7] tidyr_1.3.1                 tibble_3.2.1               
##  [9] tidyverse_2.0.0             ComplexHeatmap_2.18.0      
## [11] ggrepel_0.9.5               edgeR_4.0.12               
## [13] limma_3.58.1                parallelDist_0.2.6         
## [15] lme4_1.1-35.1               Matrix_1.6-5               
## [17] pvca_0.1.0                  EDASeq_2.36.0              
## [19] ShortRead_1.60.0            GenomicAlignments_1.38.2   
## [21] SummarizedExperiment_1.32.0 MatrixGenerics_1.14.0      
## [23] matrixStats_1.2.0           Rsamtools_2.18.0           
## [25] GenomicRanges_1.54.1        Biostrings_2.70.1          
## [27] GenomeInfoDb_1.38.5         XVector_0.42.0             
## [29] IRanges_2.36.0              S4Vectors_0.40.2           
## [31] BiocParallel_1.36.0         Biobase_2.62.0             
## [33] BiocGenerics_0.48.1         caTools_1.18.2             
## [35] ggbeeswarm_0.7.2            ggplot2_3.4.4              
## [37] zoo_1.8-12                  readxl_1.4.3               
## [39] knitr_1.45                 
## 
## loaded via a namespace (and not attached):
##   [1] RColorBrewer_1.1-3      shape_1.4.6             rstudioapi_0.15.0      
##   [4] magrittr_2.0.3          magick_2.8.2            GenomicFeatures_1.54.1 
##   [7] farver_2.1.1            nloptr_2.0.3            rmarkdown_2.25         
##  [10] GlobalOptions_0.1.2     BiocIO_1.12.0           zlibbioc_1.48.0        
##  [13] vctrs_0.6.5             memoise_2.0.1           minqa_1.2.6            
##  [16] RCurl_1.98-1.14         htmltools_0.5.7         S4Arrays_1.2.0         
##  [19] progress_1.2.3          curl_5.2.0              cellranger_1.1.0       
##  [22] SparseArray_1.2.3       cachem_1.0.8            iterators_1.0.14       
##  [25] lifecycle_1.0.4         pkgconfig_2.0.3         R6_2.5.1               
##  [28] fastmap_1.1.1           clue_0.3-65             GenomeInfoDbData_1.2.11
##  [31] digest_0.6.34           colorspace_2.1-0        AnnotationDbi_1.64.1   
##  [34] RSQLite_2.3.5           hwriter_1.3.2.1         labeling_0.4.3         
##  [37] filelock_1.0.3          timechange_0.3.0        fansi_1.0.6            
##  [40] httr_1.4.7              abind_1.4-5             compiler_4.3.2         
##  [43] doParallel_1.0.17       bit64_4.0.5             withr_3.0.0            
##  [46] DBI_1.2.1               highr_0.10              R.utils_2.12.3         
##  [49] biomaRt_2.58.1          MASS_7.3-60.0.1         rappdirs_0.3.3         
##  [52] DelayedArray_0.28.0     rjson_0.2.21            tools_4.3.2            
##  [55] vipor_0.4.7             beeswarm_0.4.0          R.oo_1.26.0            
##  [58] glue_1.7.0              restfulr_0.0.15         nlme_3.1-164           
##  [61] cluster_2.1.6           generics_0.1.3          gtable_0.3.4           
##  [64] tzdb_0.4.0              R.methodsS3_1.8.2       hms_1.1.3              
##  [67] xml2_1.3.6              utf8_1.2.4              foreach_1.5.2          
##  [70] pillar_1.9.0            vroom_1.6.5             circlize_0.4.15        
##  [73] splines_4.3.2           BiocFileCache_2.10.1    lattice_0.22-5         
##  [76] aroma.light_3.32.0      rtracklayer_1.62.0      bit_4.0.5              
##  [79] deldir_2.0-2            tidyselect_1.2.0        locfit_1.5-9.8         
##  [82] xfun_0.41               statmod_1.5.0           stringi_1.8.3          
##  [85] yaml_2.3.8              boot_1.3-28.1           evaluate_0.23          
##  [88] codetools_0.2-19        interp_1.1-6            cli_3.6.2              
##  [91] RcppParallel_5.1.7      munsell_0.5.0           Rcpp_1.0.12            
##  [94] dbplyr_2.4.0            png_0.1-8               XML_3.99-0.16.1        
##  [97] blob_1.2.4              prettyunits_1.2.0       latticeExtra_0.6-30    
## [100] jpeg_0.1-10             bitops_1.0-7            scales_1.3.0           
## [103] crayon_1.5.2            GetoptLong_1.0.5        rlang_1.1.3            
## [106] KEGGREST_1.42.0

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

20200511_Joana.mrnaseq_preproc.md

20200511_Joana.mrnaseq_preproc.md

Dias/Koup preprocessing of RNA-Seq data (set 1)

Files

20200511_Joana.mrnaseq_preproc.md

Latest commit

History

20200511_Joana.mrnaseq_preproc.md

File metadata and controls

Dias/Koup preprocessing of RNA-Seq data (set 1)