results/2023-0215/rough-draft_evaluate-categories_expression_initial.Rmd

---
title: "rough-draft_evaluate-categories_expression_initial.Rmd"
author: "KA"
email: "kalavatt@fredhutch.org"
output:
    html_notebook:
        toc: yes
        toc_float: true
---
<br />

## Get situated
### Code
<details>
<summary><i>Code: Get situated</i></summary>

```{r Get situated, results='hide', message=FALSE, warning=FALSE}
#!/usr/bin/env Rscript

library(ggplot2)
library(ggpubr)
library(PCAtools)
library(rstatix)
library(tidyverse)
library(treemap)

#  Show large numbers in full rather than in scientific notation, and set
#+ ggrepel's max-overlaps option to Inf
options(scipen = 999, ggrepel.max.overlaps = Inf)

#  Choose the machine-specific project root: one location is used when the
#+ current working directory contains "kalavattam", another otherwise
p_local <- if(grepl("kalavattam", getwd(), fixed = TRUE)) {
    "/Users/kalavattam/Dropbox/FHCC"
} else {
    "/Users/kalavatt/projects-etc"
}
p_wd <- "2022-2023_RRP6-NAB3/results/2023-0215"

#  Move into the results directory for this notebook and report the location
setwd(file.path(p_local, p_wd))
getwd()

#  The path pieces are not needed beyond this point
rm(p_local, p_wd)
```
</details>
<br />
<br />

## Load "comprehensive" `gtf` files
### Code
<details>
<summary><i>Code: Load "comprehensive" `gtf` files</i></summary>

```{r Load "comprehensive" gtf files, results='hide', message=FALSE, warning=FALSE}
#!/usr/bin/env Rscript

#  Directory holding the "comprehensive" gtf files, and the two files to load
p_gtf <- "./outfiles_gtf-gff3/comprehensive/S288C_reference_genome_R64-1-1_20110203"
f_S <- "processed_features-intergenic_sense.gtf"
f_SA <- "processed_features-intergenic_sense-antisense.gtf"

#  Temporary helper (removed below): import one gtf as a tibble sorted by
#+ seqnames and start, without the score and phase columns, and with column
#+ type.1 renamed to category
import_gtf <- function(f_gtf) {
    gtf <- rtracklayer::import(file.path(p_gtf, f_gtf))
    gtf <- dplyr::arrange(tibble::as_tibble(gtf), seqnames, start)
    dplyr::rename(dplyr::select(gtf, -score, -phase), category = type.1)
}

comp_S <- import_gtf(f_S)
comp_SA <- import_gtf(f_SA)

rm(p_gtf, f_S, f_SA, import_gtf)
```
</details>
<br />
<br />

## Load counts matrices against "comprehensive" `gtf`s
### Code
<details>
<summary><i>Code: Load counts matrices against "comprehensive" `gtf`s</i></summary>

```{r Load counts matrices against "comprehensive" gtfs, results='hide', message=FALSE, warning=FALSE}
#!/usr/bin/env Rscript

read_in_counts_matrix <- function(x) {
    #  Read a tab-separated counts matrix into a tibble and name its first
    #+ column "gene_id"
    #
    #  The first column is expected to arrive from readr under the name
    #+ "...1" (the name readr gives to a first column with an empty header)
    #
    # :param x: path to a counts matrix from htseq-count <chr>
    # :return: counts matrix as a tibble whose first column is gene_id
    counts_matrix <- readr::read_tsv(x, show_col_types = FALSE)
    dplyr::rename(counts_matrix, gene_id = ...1)
}


#  Directory holding the htseq-count counts matrices
p_cm <- "./outfiles_htseq-count/comprehensive/S288C_reference_genome_R64-1-1_20110203/UT_prim_UMI"

#  One matrix per combination of htseq-count "nonunique" mode (all, fraction,
#+ none, random) and gtf (S: sense; SA: sense-antisense)
f_S_all <- "all-bams.hc-strd-eq.union.nonunique-all.processed_features-intergenic_sense.tsv"
f_S_frac <- "all-bams.hc-strd-eq.union.nonunique-fraction.processed_features-intergenic_sense.tsv"
f_S_none <- "all-bams.hc-strd-eq.union.nonunique-none.processed_features-intergenic_sense.tsv"
f_S_rand <- "all-bams.hc-strd-eq.union.nonunique-random.processed_features-intergenic_sense.tsv"
f_SA_all <- "all-bams.hc-strd-eq.union.nonunique-all.processed_features-intergenic_sense-antisense.tsv"
f_SA_frac <- "all-bams.hc-strd-eq.union.nonunique-fraction.processed_features-intergenic_sense-antisense.tsv"
f_SA_none <- "all-bams.hc-strd-eq.union.nonunique-none.processed_features-intergenic_sense-antisense.tsv"
f_SA_rand <- "all-bams.hc-strd-eq.union.nonunique-random.processed_features-intergenic_sense-antisense.tsv"

#  Read each matrix in as a tibble
t_S_all <- file.path(p_cm, f_S_all) %>% read_in_counts_matrix()
t_S_frac <- file.path(p_cm, f_S_frac) %>% read_in_counts_matrix()
t_S_none <- file.path(p_cm, f_S_none) %>% read_in_counts_matrix()
t_S_rand <- file.path(p_cm, f_S_rand) %>% read_in_counts_matrix()
t_SA_all <- file.path(p_cm, f_SA_all) %>% read_in_counts_matrix()
t_SA_frac <- file.path(p_cm, f_SA_frac) %>% read_in_counts_matrix()
t_SA_none <- file.path(p_cm, f_SA_none) %>% read_in_counts_matrix()
t_SA_rand <- file.path(p_cm, f_SA_rand) %>% read_in_counts_matrix()

#  Only the tibbles are needed from here on
rm(
    p_cm,
    f_S_all, f_S_frac, f_S_none, f_S_rand,
    f_SA_all, f_SA_frac, f_SA_none, f_SA_rand
)
```
</details>
<br />
<br />

## Evaluate the assignments from `htseq-count`
<b>Goal</b>: All counts/read pairs need to be accounted for; `htseq-count` counts should equal `samtools view` counts (after taking certain things&mdash;*see below*&mdash;into consideration) 

### Perform and assess the foundational work
#### Code
<details>
<summary><i>Code: Evaluate the assignments from `htseq-count`</i></summary>

```{r Evaluate the assignments from htseq-count, results='hide', message=FALSE, warning=FALSE}
#!/usr/bin/env Rscript

#  Create "test" dataframes from t_S_none columns 1 and 2 ---------------------
#  Column 1 holds the feature/summary names; column 2 is the single sample
#+ used for this check
test <- t_S_none[, c(1, 2)]

#  Strip the directory prefix and the bam suffix from the sample column name
colnames(test)[2] <- gsub(
    "\\.UT_prim_UMI\\.bam",
    "",
    gsub("bams_renamed/UT_prim_UMI/", "", colnames(test)[2])
)

#  Rows whose names begin with two underscores are the "summary values"
#+ calculated by htseq-count; they sit at the end of the matrices
underscore <- dplyr::filter(
    test, stringr::str_detect(gene_id, "^__[a-zA-Z0-9_]*$")
)
# View(underscore)

#  All remaining rows are the per-feature counts scored by htseq-count
counts <- dplyr::filter(
    test, !stringr::str_detect(gene_id, "^__[a-zA-Z0-9_]*$")
)
# View(counts)


#  Determine and compare various tallies --------------------------------------
#  Tally the non-underscore-category counts (read pairs)
sum(counts$`n3-d_Q_day7_tcn_N_aux-T_tc-F_rep1_tech1`)  # [1] 6765861
#IMPORTANT

#  Tally the underscore-category counts (read pairs)
sum(underscore$`n3-d_Q_day7_tcn_N_aux-T_tc-F_rep1_tech1`)  # [1] 21827889
# 14968733 + 1607213 + 5251943  # [1] 21827889

#  Tally the number of reads (records) in the bam
# ❯ samtools view -c n3-d_Q_day7_tcn_N_aux-T_tc-F_rep1_tech1.UT_prim_UMI.bam
# 57187500

#  Tally the number of read pairs in the bam (two reads per pair)
# ❯ echo $(( 57187500 / 2 ))
# 28593750

#  Bam read pairs minus underscore-category counts
28593750 - 21827889  # [1] 6765861
#NOTE This is equal to the tally of non-underscore-category counts above

#CONCLUSION  1/2 The counts appear to be consistent between the bam and the
#CONCLUSION  2/2 counts matrix
```

```{bash, eval=FALSE}
#!/bin/bash

#  Record of the interactive session used to reconcile htseq-count tallies
#+ with samtools tallies for one test bam. The value in the comment trailing
#+ each command is the output recorded when the command was run; values
#+ recorded in reads are halved to obtain read pairs.

#  Get situated
grabnode  # 1 CPU, defaults
ml SAMtools/1.16.1-GCC-11.2.0

cd /home/kalavatt/tsukiyamalab/kalavatt/2022-2023_RRP6-NAB3 \
    || echo "cd'ing failed; check on this"

cd results/2023-0215/bams_renamed/UT_prim_UMI \
    || echo "cd'ing failed; check on this"


#  Assess the total number of reads in the bam --------------------------------
samtools view -c \
    n3-d_Q_day7_tcn_N_aux-T_tc-F_rep1_tech1.UT_prim_UMI.bam  # 57187500

echo $(( 57187500 / 2 ))  # 28593750


#  Assess S. cerevisiae mitochondrial and non-S. cerevisiae reads -------------
#  Tally numbers of unimapping reads (records carrying the tag NH:i:1)
samtools view \
    n3-d_Q_day7_tcn_N_aux-T_tc-F_rep1_tech1.UT_prim_UMI.bam \
    Mito A B C D E F 20S \
        | awk '/\<NH:i:1\>/' \
        | wc -l  # 27447604
echo $(( 27447604 / 2 ))  # 13723802 unimapping read pairs

#  Tally numbers of multimapping reads (records not carrying the tag NH:i:1)
samtools view \
    n3-d_Q_day7_tcn_N_aux-T_tc-F_rep1_tech1.UT_prim_UMI.bam \
    Mito A B C D E F 20S \
        | awk '!/\<NH:i:1\>/' \
        | wc -l  # 4233128
echo $(( 4233128 / 2 ))  # 2116564 multimapping read pairs


#  Tally numbers of all reads
samtools view -c \
    n3-d_Q_day7_tcn_N_aux-T_tc-F_rep1_tech1.UT_prim_UMI.bam \
    Mito A B C D E F 20S  # 31680732
echo $(( 31680732 / 2 ))  # 15840366 total read pairs

#  Sanity check: multimapping plus unimapping reads should equal all reads
if [[ $(( 4233128 + 27447604 )) -eq 31680732 ]]; then
    echo "Tally of total reads equals sum of numbers of multimapping and" \
        "unimapping reads"
else
    echo "Tally of total reads does not equal sum of numbers of multimapping" \
        "and unimapping alignments"
fi

#  Assess S. cerevisiae I-XVI reads -------------------------------------------
#  Tally numbers of unimapping reads (records carrying the tag NH:i:1)
samtools view \
    n3-d_Q_day7_tcn_N_aux-T_tc-F_rep1_tech1.UT_prim_UMI.bam \
    I II III IV V VI VII VIII IX X XI XII XIII XIV XV XVI \
        | awk '/\<NH:i:1\>/' \
        | wc -l  # 19236010
echo $(( 19236010 / 2 ))  # 9618005 unimapping read pairs

#  Tally numbers of multimapping reads (records not carrying the tag NH:i:1)
samtools view \
    n3-d_Q_day7_tcn_N_aux-T_tc-F_rep1_tech1.UT_prim_UMI.bam \
    I II III IV V VI VII VIII IX X XI XII XIII XIV XV XVI \
        | awk '!/\<NH:i:1\>/' \
        | wc -l  # 6270758
echo $(( 6270758 / 2 ))  # 3135379 multimapping read pairs

#  Tally numbers of all reads
samtools view -c \
    n3-d_Q_day7_tcn_N_aux-T_tc-F_rep1_tech1.UT_prim_UMI.bam \
    I II III IV V VI VII VIII IX X XI XII XIII XIV XV XVI  # 25506768
echo $(( 25506768 / 2 ))  # 12753384 total read pairs

#  Sanity check: multimapping plus unimapping reads should equal all reads
if [[ $(( 6270758 + 19236010 )) -eq 25506768 ]]; then
    echo "Tally of total reads equals sum of numbers of multimapping" \
        "and unimapping reads"
else
    echo "Tally of total reads does not equal sum of numbers of multimapping" \
        "and unimapping reads"
fi


#  Final assessments and conclusion -------------------------------------------
echo $(( 9618005 + 13723802 ))  # 23341807 total unimapping read pairs
echo $(( 3135379 + 2116564 ))  # 5251943 total multimapping read pairs

#  Contents of dataframe `underscore` for the test sample:
# __no_feature           14968733
# __ambiguous            1607213
# __too_low_aQual        0
# __not_aligned          0
# __alignment_not_unique 5251943

#  Steps for processing  (#CONCLUSION)
#+ 1. From __alignment_not_unique, need to subtract multimappers against S.
#+    cerevisiae Mito, K. lactis A-F, and 20S
#+ 2. From __no_feature, need to subtract unimappers against S. cerevisiae
#+    Mito, K. lactis A-F, and 20S

#  In practice...
hc_anu=$(( 5251943 - 2116564 ))  # 3135379: New value for __alignment_not_unique  # 1
hc_nf=$(( 14968733 - 13723802 ))  # 1244931: New value for __no_feature  # 2
hc_ambi=1607213
hc_val=6765861  # sum(counts$`n3-d_Q_day7_tcn_N_aux-T_tc-F_rep1_tech1`), already in read pairs

echo $(( hc_anu + hc_nf + hc_ambi ))  # 5987523 (sans "valid")
echo $(( hc_val + hc_anu + hc_nf + hc_ambi ))  # 12753384
#  This total equals the 12753384 read pairs tallied for chromosomes I-XVI
#+ above, so the htseq-count and samtools tallies are reconciled.
```
</details>
<br />

#### Notes
<details>
<summary><i>Notes: Evaluate the assignments from `htseq-count`</i></summary>

Per the sum of counts (read pairs) in dataframe `underscore`, there are <u>21,827,889 <b>invalid</b> counts</u> in the test bam.

After calling `samtools view -c n3-d_Q_day7_tcn_N_aux-T_tc-F_rep1_tech1.UT_prim_UMI.bam`, we see that there are <u>28,593,750</u> counts (<u>57,187,500</u> aligned reads) in the file.

Looking at dataframe `t_S_none`&mdash;specifically, the five "summary values" calculated by `htseq-count`&mdash;and the results from calls to `samtools view`, we see that...
- 14,968,733 are classified "`__no_feature`" (read pairs that could not be assigned to any feature),
- 1,607,213 are "`__ambiguous`" (read pairs that could have been assigned to more than one feature and thus are not counted for any of them),
- 0 are "`__too_low_aQual`" (read pairs that were skipped due to the `-a` option&mdash;<i>not applicable to us since we used `STAR` to align reads</i>),
- 0 are "`__not_aligned`" (read pairs in the `bam` file without alignment),
- 5,251,943 are "`__alignment_not_unique`" (read pairs that align to more than one location in the reference as indicated by the `NH` tag), and
- 6,765,861 are "valid" counts (from `counts$n3-d_Q_day7_tcn_N_aux-T_tc-F_rep1_tech1 %>% sum()`).

(<i>Quick check</i>: Does the sum of `__no_feature`, `__ambiguous`, `__alignment_not_unique` equal 21,827,889 as described above? <i>Yes.</i> `14968733 + 1607213 + 5251943  # [1] 21827889`)

<b>Goal</b>: We need to exclude counts associated with <u>*S. cerevisiae* chromosome Mito, *K. lactis* chromosomes A-F, and 20S</u> (<i>"Mito-KL-20S"</i> for short) from the five "summary values" calculated by `htseq-count`. We can achieve this goal by taking the following steps:
1. Calculate and store the <mark>number of <b>multimappers</b> associated with <i>"Mito-KL-20S"</i></mark>: `multi_Mito-KL-20S`.
2. Calculate and store the <mark>number of <b>unimappers</b> associated with <i>"Mito-KL-20S"</i></mark>: `uni_Mito-KL-20S`.
3. Calculate and store the <mark>number of <b>unimappers</b> associated with <i>S. cerevisiae chromosomes I-XVI</i></mark>: `uni_I-XVI`.
4. Subtract `multi_Mito-KL-20S` from "`__alignment_not_unique`"; store the new value: "`__alignment_not_unique_I-XVI`"
5. Subtract `uni_Mito-KL-20S` from "`__no_feature`"; store the new value: "`__no_feature_I-XVI`".
6. Calculate and store <mark>the sum of *(i)* "`__no_feature_I-XVI`", *(ii)* "`__ambiguous`", *(iii)* "`__alignment_not_unique_I-XVI`", and *(iv)* "valid" counts (e.g., from `sum(counts$n3-d_Q_day7_tcn_N_aux-T_tc-F_rep1_tech1)`)</mark>: `summary-updated`.
7. Calculate and store the <mark>number of all counts associated with <i>S. cerevisiae chromosomes I-XVI</i></mark>: `counts_I-XVI`.
8. <mark>Check that the sum from step #6 equals the value from step #7</mark>. If `TRUE`, then everything is OK and we're good to proceed; if `FALSE`, then we need to troubleshoot why.
</details>
<br />

### Perform the steps and update dataframe `underscore`
#### Code
<details>
<summary><i>Code: Perform the steps and update dataframe `underscore`</i></summary>

```{r Perform the steps and update dataframe underscore, results='hide', message=FALSE, warning=FALSE}
#!/usr/bin/env Rscript

#  Perform steps 1-7 as described ---------------------------------------------
#  Read-pair tallies recorded from the samtools view calls in the bash chunk
`uni_Mito-KL-20S` <- 13723802
`multi_Mito-KL-20S` <- 2116564
`uni_I-XVI` <- 9618005
`multi_I-XVI` <- 3135379

#  Look up the htseq-count summary values by name rather than by row
#+ position, so that a different row order (or re-running this chunk after
#+ rows have been added to `underscore`) cannot silently pick up wrong values
if(!all(
    c("__no_feature", "__ambiguous", "__alignment_not_unique") %in%
        underscore$gene_id
)) {
    stop("Expected htseq-count summary rows are missing from `underscore`")
}

`__alignment_not_unique` <- as.numeric(
    underscore[[2]][underscore$gene_id == "__alignment_not_unique"][1]
)
`__alignment_not_unique_I-XVI` <- `__alignment_not_unique` - `multi_Mito-KL-20S`
`__no_feature` <- as.numeric(
    underscore[[2]][underscore$gene_id == "__no_feature"][1]
)
`__no_feature_I-XVI` <- `__no_feature` - `uni_Mito-KL-20S`
`__ambiguous` <- as.numeric(
    underscore[[2]][underscore$gene_id == "__ambiguous"][1]
)
`__valid_counts` <- sum(counts[[2]])

#  Everything that should be attributable to chromosomes I-XVI
`summary-updated` <- sum(
    `__no_feature_I-XVI`,
    `__ambiguous`,
    `__alignment_not_unique_I-XVI`,
    `__valid_counts`
)

#  Total read pairs on chromosomes I-XVI per samtools view -c (bash chunk)
`counts_I-XVI` <- 12753384

`summary-updated` == `counts_I-XVI`  # [1] TRUE


#  Update dataframe underscore ------------------------------------------------
#  Row positions matter: later code slices `underscore` by position. The new
#+ rows go (1) at the very end, (2) directly after the first row, and (3) at
#+ the very top, in that order.
underscore <- underscore %>%
    tibble::add_row(
        gene_id = "__alignment_not_unique_I-XVI",
        `n3-d_Q_day7_tcn_N_aux-T_tc-F_rep1_tech1` = `__alignment_not_unique_I-XVI`
    ) %>%
    tibble::add_row(
        gene_id = "__no_feature_I-XVI",
        `n3-d_Q_day7_tcn_N_aux-T_tc-F_rep1_tech1` = `__no_feature_I-XVI`,
        .after = 1
    ) %>%
    tibble::add_row(
        gene_id = "__valid_counts",
        `n3-d_Q_day7_tcn_N_aux-T_tc-F_rep1_tech1` = `__valid_counts`,
        .before = 1
    )


#  Clean up -------------------------------------------------------------------
rm(
    `uni_Mito-KL-20S`, `multi_Mito-KL-20S`, `uni_I-XVI`, `multi_I-XVI`,
    `__alignment_not_unique`, `__alignment_not_unique_I-XVI`, `__no_feature`,
    `__no_feature_I-XVI`, `__ambiguous`, `__valid_counts`
)
```
</details>
<br />
<br />

## Using the test dataframes, set up/flesh out the analysis
### Code
<details>
<summary><i>Code: Using the test dataframes, set up/flesh out the analysis</i></summary>

```{r Using the test dataframes, set up/flesh out the analysis, results='hide', message=FALSE, warning=FALSE}
#!/usr/bin/env Rscript

#  Join the test sample's counts with the gtf-derived annotations, keeping
#+ only the columns used below
test_joined <- test %>%
    dplyr::full_join(comp_S, by = "gene_id") %>%
    dplyr::select(
        gene_id, `n3-d_Q_day7_tcn_N_aux-T_tc-F_rep1_tech1`, seqnames, start,
        end, width, strand, source, category, orf_classification
    )

#  Per seqnames value: total counts and number of features (rows without a
#+ seqnames value are excluded)
`test_joined_sum-by-seqnames` <- test_joined %>%
    dplyr::filter(!is.na(seqnames)) %>%
    dplyr::group_by(seqnames) %>%
    dplyr::summarize(
        `sum-of-counts` = sum(`n3-d_Q_day7_tcn_N_aux-T_tc-F_rep1_tech1`),
        `number-of-features` = dplyr::n()
    )
#  Recorded result:
# seqnames  sum-of-counts  number-of-features
# <fctr>    <dbl>          <int>
# I         114509         387
# II        540531         1433
# III       144565         649
# IV        918454         2617
# V         396056         1083
# VI        124511         485
# VII       581405         1939
# VIII      268683         1020
# IX        236196         782
# X         371552         1297
# XI        383392         1153
# XII       528894         1838
# XIII      517916         1648
# XIV       478974         1372
# XV        662740         1909
# XVI       497483         1640

#  Per category: total counts and number of features (rows without a category
#+ value are excluded)
df_tmp_1 <- test_joined %>%
    dplyr::filter(!is.na(category)) %>%
    dplyr::group_by(category) %>%
    dplyr::summarize(
        `sum-of-counts` = sum(`n3-d_Q_day7_tcn_N_aux-T_tc-F_rep1_tech1`),
        `number-of-features` = dplyr::n()
    )
#  Recorded result:
# category    sum-of-counts  number-of-features
# <chr>       <dbl>          <int>
# ARS         21264          674
# PG          5142           14
# TE          39744          334
# centromere  0              32
# gene        5154259        6575
# intergenic  1210817        13162
# ncRNA       80248          14
# rRNA        3105           25
# snRNA       163506         6
# snoRNA      83096          77
# tRNA        19             275
# telomere    4661           64

#  Take rows 3, 4, and the last row of `underscore` (see the recorded result
#+ below for which summary values these are) and give them the same three
#+ columns as df_tmp_1; there is no feature count for these rows
df_tmp_2 <- underscore %>%
    dplyr::slice(3, 4, dplyr::n()) %>%
    dplyr::rename(
        category = gene_id,
        `sum-of-counts` = `n3-d_Q_day7_tcn_N_aux-T_tc-F_rep1_tech1`
    ) %>%
    dplyr::mutate(`number-of-features` = NA_integer_)
#  Recorded result:
# category                      sum-of-counts  number-of-features
# <chr>                         <dbl>          <int>
# __no_feature_I-XVI            1244931        NA
# __ambiguous                   1607213        NA
# __alignment_not_unique_I-XVI  3135379        NA

#  Stack the per-category tallies on top of the summary-value tallies
`test_joined_sum-by-category` <- dplyr::bind_rows(df_tmp_1, df_tmp_2)
rm(df_tmp_1, df_tmp_2)

#  Recorded result:
# category                      sum-of-counts  number-of-features
# <chr>                         <dbl>          <int>
# ARS                           21264          674
# PG                            5142           14
# TE                            39744          334
# centromere                    0              32
# gene                          5154259        6575
# intergenic                    1210817        13162
# ncRNA                         80248          14
# rRNA                          3105           25
# snRNA                         163506         6
# snoRNA                        83096          77
# tRNA                          19             275
# telomere                      4661           64
# __no_feature_I-XVI            1244931        NA
# __ambiguous                   1607213        NA
# __alignment_not_unique_I-XVI  3135379        NA

# #  Check that total counts are correct
# sum(`test_joined_sum-by-category`[, 2]) == `counts_I-XVI`  # [1] TRUE
# sum(`test_joined_sum-by-category`[, 2]) == `summary-updated`  # [1] TRUE
```
<br />

```{r}
#!/usr/bin/env Rscript

# #  Refresher: Make stacked bar charts with test data --------------------------
# specie <- c(
#     rep("sorgho", 3), rep("poacee", 3), rep("banana", 3), rep("triticum", 3)
# )
# condition <- rep(c("normal", "stress", "Nitrogen"), 4)
# value <- abs(rnorm(12, 0, 15))
# data <- data.frame(specie, condition, value)
# 
# rm(specie, condition, value)
# 
# #  Grouped
# ggplot(data, aes(fill = condition, y = value, x = specie)) +
#     geom_bar(position = "dodge", stat = "identity")
# 
# #  Stacked
# ggplot(data, aes(fill = condition, y = value, x = specie)) +
#     geom_bar(position = "stack", stat = "identity")
# 
# #  Stacked and percent
# ggplot(data, aes(fill = condition, y = value, x = specie)) +
#     geom_bar(position = "fill", stat = "identity")
# 
# treemap::treemap(
#     data,
#     index = c("specie", "condition"),
#     vSize = "value",
#     type = "index"
# )
# 
# rm(data)


#  Apply above to `test_joined_sum-by-category` -------------------------------
#  Drop the multimapper row, then label every remaining row with the name of
#+ the test sample (placed in front of the category column)
df <- `test_joined_sum-by-category` %>%
    dplyr::filter(!stringr::str_detect(
        category, "__alignment_not_unique_I-XVI"
    )) %>%
    dplyr::mutate(
        sample = "n3-d_Q_day7_tcn_N_aux-T_tc-F_rep1_tech1",
        .before = category
    )

#  Tidy the category labels for plotting: remove "__" and "_I-XVI", and spell
#+ out "PG"
df$category <- gsub(
    "PG", "pseudogene", gsub("_I-XVI", "", gsub("__", "", df$category))
)

#  Bar charts: one viridis color per row of df
#  Grouped
ggplot(df, aes(x = sample, y = `sum-of-counts`, fill = category)) +
    geom_bar(stat = "identity", position = "dodge") +
    scale_fill_manual(values = viridisLite::viridis(nrow(df)))

#  Stacked
ggplot(df, aes(x = sample, y = `sum-of-counts`, fill = category)) +
    geom_bar(stat = "identity", position = "stack") +
    scale_fill_manual(values = viridisLite::viridis(nrow(df)))

#  Stacked and percent
ggplot(df, aes(x = sample, y = `sum-of-counts`, fill = category)) +
    geom_bar(stat = "identity", position = "fill") +
    scale_fill_manual(values = viridisLite::viridis(nrow(df)))

#  Treemap with the palette in its natural order
treemap::treemap(
    df,
    index = "category",
    vSize = "sum-of-counts",
    type = "index",
    title = "",
    palette = viridisLite::viridis(nrow(df)),
    position.legend = "right"
)

#  Treemap with the same palette shuffled; the seed keeps the shuffle
#+ reproducible
set.seed(24)
treemap::treemap(
    df,
    index = "category",
    vSize = "sum-of-counts",
    type = "index",
    title = "",
    palette = sample(viridisLite::viridis(nrow(df))),
    position.legend = "right"
)
```

```{r}
#!/usr/bin/env Rscript

#  Clean up: remove the objects created while working with the test sample
rm(list = c(
    "counts", "df", "test", "test_joined", "test_joined_sum-by-category",
    "test_joined_sum-by-seqnames", "underscore", "counts_I-XVI",
    "summary-updated"
))
```

</details>
<br />
<br />

## Isolate and preprocess relevant WT G1 and Q datasets
### Isolate relevant WT G1 and Q datasets ("Ovation" datasets)
#### Code
<details>
<summary><i>Code: Isolate relevant WT G1 and Q datasets ("Ovation" datasets)</i></summary>

```{r}
#!/usr/bin/env Rscript

#  Create a vector of "relevant" WT Q and G1 dataset names: the columns of
#+ t_S_none whose names contain "ovn"
relevant <- colnames(t_S_none)[stringr::str_detect(colnames(t_S_none), "ovn")]

#  Subset to the relevant datasets once and clean up the column names once,
#+ so that the two tables derived below are guaranteed to share column names
t_S_none_ovn <- t_S_none[, c("gene_id", relevant)]
colnames(t_S_none_ovn) <- colnames(t_S_none_ovn) %>%
    gsub("bams_renamed/UT_prim_UMI/", "", .) %>%
    gsub("\\.UT_prim_UMI\\.bam", "", .) %>%
    gsub("_day._ovn", "", .) %>%  # Regex, not glob: no surrounding "*"
    gsub("aux-F_tc-F_", "", .) %>%
    gsub("_tech1", "", .)

#  Isolate the features for the relevant datasets (rows whose gene_id does not
#+ begin with two underscores)
t_S_none_rel <- t_S_none_ovn %>%
    dplyr::filter(!stringr::str_detect(gene_id, "^__"))
# tail(t_S_none_rel)

#  Isolate htseq-count "summary values" for the relevant datasets (rows whose
#+ gene_id begins with two underscores)
`t_S_none_rel_htseq-count-summary` <- t_S_none_ovn %>%
    dplyr::filter(stringr::str_detect(gene_id, "^__"))
# `t_S_none_rel_htseq-count-summary`

rm(relevant, t_S_none_ovn)
```
</details>
<br />

### Join relevant sample data with "positional" (etc.) information
#### Code
<details>
<summary><i>Code: Join relevant sample data with "positional" (etc.) information</i></summary>

```{r}
#  Attach the gtf-derived positional/annotation columns to the per-feature
#+ counts by gene_id; a full join keeps rows present in only one of the tables
t_S_none_rel_joined <- t_S_none_rel %>%
    dplyr::full_join(comp_S, by = "gene_id")
t_S_none_rel_joined
```
</details>
<br />

### Create a metadata matrix for WT G1 and Q datasets ("Ovation" datasets)
#### Code
<details>
<summary><i>Code: Create a metadata matrix for WT G1 and Q datasets ("Ovation" datasets)</i></summary>

```{r}
#  Sample names are the column names of t_S_none_rel minus the leading gene_id
#+ column; each name is split on "_" into its metadata fields
sample_names <- colnames(t_S_none_rel)[-1]
sample_fields <- stringr::str_split(sample_names, "_")

#  Fail early, and with a clear message, if the names do not have the
#+ expected structure
if(length(sample_names) == 0) {
    stop("No sample columns found in t_S_none_rel")
}
if(any(lengths(sample_fields) != 4)) {
    stop(
        "Each sample name must split into exactly four '_'-delimited fields ",
        "(genotype, state, transcription, replicate)"
    )
}

#  One row per sample, one column per field
metadata <- as.data.frame(do.call(rbind, sample_fields))
rownames(metadata) <- sample_names
colnames(metadata) <- c("genotype", "state", "transcription", "replicate")

rm(sample_names, sample_fields)

metadata
```
</details>
<br />

### Record positional information in a `GRanges` object
#### Code
<details>
<summary><i>Code: Record positional information in a `GRanges` object</i></summary>

```{r Record positional information in a GRanges object, echo=FALSE, results='hide', message=FALSE}
record_positional_info <- function(tibble) {
    #  Build a GRanges object from a table of per-feature annotations
    #
    # :param tibble: table with columns seqnames, start, end, strand, width,
    #                gene_id, transcript_id, category, orf_classification,
    #                and source_id
    # :return: GRanges object with ranges from start and end, plus metadata
    #          columns length (from width), gene_id, transcript_id, category,
    #          orf_classification, and source_id
    #
    #NOTE(review) Another chunk in this file selects a column named `source`
    #             from the same gtf-derived table; confirm that a `source_id`
    #             column actually exists in the input
    feature_ranges <- IRanges::IRanges(start = tibble$start, end = tibble$end)

    GenomicRanges::GRanges(
        seqnames = tibble$seqnames,
        ranges = feature_ranges,
        strand = tibble$strand,
        length = tibble$width,
        gene_id = tibble$gene_id,
        transcript_id = tibble$transcript_id,
        category = tibble$category,
        orf_classification = tibble$orf_classification,
        source_id = tibble$source_id
    )
}


#  Record positional information for the joined counts/annotation table
t_S_none_rel_pos_info <- record_positional_info(t_S_none_rel_joined)
```
</details>
<br />
<br />

## Test replicate consistency via PCA of non-normalized counts
### Initialize necessary functions
#### Code
<details>
<summary><i>Code: Initialize necessary functions</i></summary>

```{r}
#!/usr/bin/env Rscript

get_name_of_var <- function(v) {
    #  Return, as text, the expression that the caller supplied for v; the
    #+ argument itself is never evaluated
    #
    # :param v: any expression, typically a bare variable name
    # :return: the deparsed, unevaluated argument <chr>
    supplied <- substitute(v)
    deparse(supplied)
}


get_top_loadings <- function(x, y, z, a) {
    #  Extract the loadings for one principal component and rank them from
    #+ largest to smallest absolute value, optionally restricted to strictly
    #+ positive or strictly negative loadings
    #
    #  Missing (NA) loadings are sorted last when z is 'all' and are excluded
    #+ when z is 'pos' or 'neg'.
    #
    # :param x: dataframe of PC loadings; rownames are carried through to the
    #           result <data.frame>
    # :param y: character element for column in dataframe x <chr>
    # :param z: whether to select all loadings sorted from largest to smallest
    #           absolute value ('all'), positive loadings sorted from largest
    #           to smallest value ('pos'), or negative loadings sorted from
    #           largest to smallest absolute value ('neg') <str>
    # :param a: whether or not to keep 'sign' and 'abs' columns added in the
    #           course of processing the dataframe <logical>
    # :return b: dataframe with the rownames of x and column y, sorted as
    #            described for param z; if a is TRUE, it also has columns
    #            'sign' ("pos", "zero", or "neg") and 'abs' (absolute value)
    #            <data.frame>

    #  Validate the arguments before doing any work
    if(!is.character(z) || length(z) != 1 || !z %in% c("all", "pos", "neg")) {
        stop(paste0("Stopping: param z must be either 'all', 'pos', or 'neg'"))
    }
    if(!isTRUE(a) && !isFALSE(a)) {
        stop(paste0("Stopping: param a must be either 'TRUE' or 'FALSE'"))
    }
    if(!is.character(y) || length(y) != 1 || !y %in% colnames(x)) {
        stop("Stopping: param y must name a single column in param x")
    }

    b <- as.data.frame(x[[y]])
    rownames(b) <- rownames(x)
    colnames(b) <- y

    b[["sign"]] <- ifelse(
        b[[y]] > 0,
        "pos",
        ifelse(b[[y]] == 0, "zero", "neg")
    )
    b[["abs"]] <- abs(b[[y]])

    #  which() drops NA comparisons, so missing loadings cannot turn into
    #+ all-NA rows in the 'pos' and 'neg' subsets
    keep <- switch(
        z,
        all = seq_len(nrow(b)),
        pos = which(b[[y]] > 0),
        neg = which(b[[y]] < 0)
    )
    b <- b[keep, , drop = FALSE]

    #  Radix ordering is stable, so tied loadings keep their original order
    b <- b[
        order(b[["abs"]], decreasing = TRUE, method = "radix"), ,
        drop = FALSE
    ]

    #  When a is TRUE, the 'sign' and 'abs' helper columns are retained
    if(isFALSE(a)) {
        b <- b[, y, drop = FALSE]
    }

    return(b)
}


plot_biplot <- function(
    pca, PC_x, PC_y,
    loadings_show, loadings_n,
    meta_color, meta_shape,
    x_min, x_max, y_min, y_max
) {
    #  Draw a PCAtools biplot for two principal components, without sample
    #+ labels, encircling, or ellipses, and styled with theme_slick (which
    #+ must be defined in the calling environment)
    #
    # :param pca: "pca" list object obtained by running PCAtools::pca()
    # :param PC_x: PC to plot on the x axis <chr>
    # :param PC_y: PC to plot on the y axis <chr>
    # :param loadings_show: whether to overlay component loadings or not <lgl>
    # :param loadings_n: number of top loadings to show <int >= 0>
    # :param meta_color: column in "pca" list metadata to color by <chr>
    # :param meta_shape: column in "pca" list metadata to shape by <chr>
    # :param x_min: minimum value on x axis <dbl>
    # :param x_max: maximum value on x axis <dbl>
    # :param y_min: minimum value on y axis <dbl>
    # :param y_max: maximum value on y axis <dbl>
    # :return: the biplot returned by PCAtools::biplot() with theme_slick
    #          added
    biplot_base <- PCAtools::biplot(
        pca,
        x = PC_x,
        y = PC_y,
        xlim = c(x_min, x_max),
        ylim = c(y_min, y_max),
        colby = meta_color,
        shape = meta_shape,
        lab = NULL,
        showLoadings = loadings_show,
        ntopLoadings = loadings_n,
        boxedLoadingsNames = TRUE,
        encircle = FALSE,
        ellipse = FALSE,
        max.overlaps = Inf
    )

    biplot_base + theme_slick
}


plot_pos_neg_loadings_each_axis <- function(
    df_all, df_pos, df_neg,
    PC_x, PC_y,
    row_start, row_end,
    x_min, x_max, y_min, y_max,
    x_nudge, y_nudge, x_label, y_label,
    col_line_pos, col_line_neg, col_seg_pos, col_seg_neg
) {
    #  Draw four loadings plots on the PC_x/PC_y plane, one each for the top
    #+ positive and top negative loadings of PC_x and of PC_y
    #
    #  "Top" means rows row_start through row_end of the corresponding
    #+ (already sorted) table in df_pos or df_neg. Each plot is drawn with
    #+ plot_loadings().
    #
    # :param df_all: dataframe: all loadings (from, e.g., PCAtools), with
    #                feature names as rownames and PCs as columns
    # :param df_pos: list of dataframes indexed by PC name: positive loadings
    #                ordered largest to smallest, feature names as rownames
    # :param df_neg: list of dataframes indexed by PC name: negative loadings
    #                ordered largest to smallest absolute value, feature
    #                names as rownames
    # :param PC_x: PC to plot on the x axis <chr>
    # :param PC_y: PC to plot on the y axis <chr>
    # :param row_start: row from which to begin subsetting the PCs on x and y
    # :param row_end: row at which to end subsetting the PCs on x and y
    # :param x_min: minimum value on x axis <dbl>
    # :param x_max: maximum value on x axis <dbl>
    # :param y_min: minimum value on y axis <dbl>
    # :param y_max: maximum value on y axis <dbl>
    # :param x_nudge: amount to nudge labels on the x axis <dbl>
    # :param y_nudge: amount to nudge labels on the y axis <dbl>
    # :param x_label: x axis label <chr>
    # :param y_label: y axis label <chr>
    # :param col_line_pos: color: lines, arrows for positive loadings <chr>
    # :param col_line_neg: color: lines, arrows for negative loadings <chr>
    # :param col_seg_pos: color: segments connecting arrowhead and text bubble
    #                     for positive loadings <chr>
    # :param col_seg_neg: color: segments connecting arrowhead and text bubble
    #                     for negative loadings <chr>
    # :return images: named list of four plots: "PC_x_pos", "PC_y_pos",
    #                 "PC_x_neg", and "PC_y_neg" <list>

    #  Names of the top features for one PC; drop = FALSE keeps the rownames
    #+ even when a table has only a single column
    top_features <- function(df_list, PC) {
        rownames(df_list[[PC]][row_start:row_end, , drop = FALSE])
    }

    #  Rows of df_all for the named features, in the order of df_all
    subset_loadings <- function(features) {
        df_all[rownames(df_all) %in% features, ]
    }

    #  One loadings plot; only the nudges and colors vary between panels
    draw <- function(loadings, nudge_x, nudge_y, col_line, col_seg) {
        plot_loadings(
            loadings,
            loadings[[PC_x]],
            loadings[[PC_y]],
            x_min, x_max, y_min, y_max, nudge_x, nudge_y,
            x_label, y_label, col_line, col_seg
        )
    }

    images <- list()
    images[["PC_x_pos"]] <- draw(
        subset_loadings(top_features(df_pos, PC_x)),
        x_nudge, y_nudge, col_line_pos, col_seg_pos
    )
    images[["PC_y_pos"]] <- draw(
        subset_loadings(top_features(df_pos, PC_y)),
        x_nudge, y_nudge, col_line_pos, col_seg_pos
    )
    #  Negative loadings on x: labels are nudged in the negative x direction
    #+ (the nudges were previously passed as -y_nudge, x_nudge, i.e., swapped)
    images[["PC_x_neg"]] <- draw(
        subset_loadings(top_features(df_neg, PC_x)),
        -x_nudge, y_nudge, col_line_neg, col_seg_neg
    )
    #  Negative loadings on y: labels are nudged in the negative y direction
    images[["PC_y_neg"]] <- draw(
        subset_loadings(top_features(df_neg, PC_y)),
        x_nudge, -y_nudge, col_line_neg, col_seg_neg
    )

    return(images)
}


plot_loadings <- function(x, y, z, a, b, d, e, f, g, h, i, j, k) {
    # Plot PC loadings as arrows that run between the origin (0, 0) and each
    # feature's (y, z) coordinates, and label every feature with its row name
    # from x; the plot is styled with theme_slick_no_legend
    #
    # :param x: dataframe of PC loadings w/gene names as rownames <data.frame>
    # :param y: column in dataframe to plot on x axis <dbl>
    # :param z: column in dataframe to plot on y axis <dbl>
    # :param a: minimum value on x axis <dbl>
    # :param b: maximum value on x axis <dbl>
    # :param d: minimum value on y axis <dbl>
    # :param e: maximum value on y axis <dbl>
    # :param f: amount to nudge labels on the x axis <dbl>
    # :param g: amount to nudge labels on the y axis <dbl>
    # :param h: x axis label <chr>
    # :param i: y axis label <chr>
    # :param j: color of line and arrow <chr>
    # :param k: color of segment connecting arrowhead and text bubble <chr>
    # :return: ggplot object made up of the arrow and label layers <ggplot>
    plot_arrows <- ggplot2::ggplot(x, ggplot2::aes(x = y, y = z)) +
        ggplot2::coord_cartesian(xlim = c(a, b), ylim = c(d, e)) +
        ggplot2::geom_segment(
            mapping = ggplot2::aes(xend = 0, yend = 0),
            #  alpha is a fixed setting, so it is passed outside of aes();
            #+ inside aes() it is treated as a mapped variable and sent
            #+ through an alpha scale
            alpha = 0.5,
            color = j,
            #  ends = "first": the arrowhead is drawn at the (y, z) end of the
            #+ segment, not at the origin
            arrow = ggplot2::arrow(
                ends = "first",
                type = "open",
                length = ggplot2::unit(0.125, "inches")
            )
        ) +
        ggrepel::geom_label_repel(
            mapping = ggplot2::aes(
                fontface = 1, segment.color = k, segment.size = 0.25
            ),
            label = rownames(x),
            label.size = 0.05,
            direction = "both",
            nudge_x = f,  # 0.02
            nudge_y = g,  # 0.04
            force = 4,
            force_pull = 1,
            hjust = 0
        ) +
        ggplot2::xlab(h) +
        ggplot2::ylab(i) +
        theme_slick_no_legend

    return(plot_arrows)
}


draw_scree_plot <- function(pca, horn, elbow) {
    # Draw a PCAtools scree plot for all components of a "pca" object, with
    # vertical lines at the "horn" and "elbow" positions and a text label next
    # to each line; the plot is styled with theme_slick and the x-axis text is
    # rotated 90 degrees
    #
    # :param pca: "pca" list object obtained by running PCAtools::pca()
    # :param horn: x-axis position of the vertical line labeled "Horn's"
    #              (callers pass the number of PCs from Horn's parallel
    #              analysis) <dbl>
    # :param elbow: x-axis position of the vertical line labeled "Elbow"
    #               (callers pass the elbow point of the variance curve) <dbl>
    # :return scree: scree plot with both cutoffs marked <ggplot>
    scree <- PCAtools::screeplot(
        pca,
        components = PCAtools::getComponents(pca),
        vline = c(horn, elbow),
        vlineWidth = 0.25,
        sizeCumulativeSumLine = 0.5,
        sizeCumulativeSumPoints = 1.5
    ) +
        #  annotate() draws each label exactly once; a geom_text() layer with
        #+ constant aesthetics inherits the plot data and draws the label once
        #+ per row, stacking copies on top of each other
        ggplot2::annotate(
            "text", x = horn + 1, y = 50, label = "Horn's", vjust = 2
        ) +
        ggplot2::annotate(
            "text", x = elbow + 1, y = 50, label = "Elbow", vjust = -2
        ) +
        theme_slick +
        ggplot2::theme(
            axis.text.x = ggplot2::element_text(angle = 90, hjust = 1)
        )

    return(scree)
}


#  Set up custom ggplot2 plot themes ------------------------------------------
#+ theme_slick: theme_classic() with major and minor grid lines drawn, thin
#+ axis lines and ticks, black axis text, and an empty font family string
theme_slick <- ggplot2::theme_classic() +
    ggplot2::theme(
        #  Line elements
        panel.grid.major = ggplot2::element_line(linewidth = 0.4),
        panel.grid.minor = ggplot2::element_line(linewidth = 0.2),
        axis.line = ggplot2::element_line(linewidth = 0.2),
        axis.ticks = ggplot2::element_line(linewidth = 0.4),
        #  Text elements
        axis.text = ggplot2::element_text(color = "black"),
        axis.title.x = ggplot2::element_text(),
        axis.title.y = ggplot2::element_text(),
        plot.title = ggplot2::element_text(),
        text = ggplot2::element_text(family = "")
    )

#+ theme_slick_no_legend: theme_slick with the legend suppressed
theme_slick_no_legend <- theme_slick +
    ggplot2::theme(legend.position = "none")
```
</details>
<br />

### Create "`pca`" object and evaluate significant PCs
#### Code
<details>
<summary><i>Code: Create "`pca`" object and evaluate significant PCs</i></summary>

```{r}
#!/usr/bin/env Rscript

#  Run PCA (PCAtools) on the raw counts ---------------------------------------
#+ Counts are taken from every column after the first; the rows of the
#+ loadings table are then named with the values in column gene_id
obj_pca <- PCAtools::pca(
    t_S_none_rel[, 2:ncol(t_S_none_rel)], metadata = metadata
)
rownames(obj_pca$loadings) <- t_S_none_rel[["gene_id"]]


#  Determine "significant" PCs with Horn's parallel analysis ------------------
#+ See Horn, 1965
horn <- PCAtools::parallelPCA(mat = t_S_none_rel[, 2:ncol(t_S_none_rel)])
# horn[["n"]]


#  Determine "significant" principal components with the elbow method ---------
#+ See Buja and Eyuboglu, 1992
elbow <- PCAtools::findElbowPoint(obj_pca[["variance"]])
# elbow


#  Evaluate cumulative proportion of explained variance with a scree plot -----
#+ Both cutoffs determined above are marked on the plot
scree <- draw_scree_plot(pca = obj_pca, horn = horn[["n"]], elbow = elbow)
scree
```
</details>
<br />

### Evaluate positive, negative loadings on axes of biplots
#### Code
<details>
<summary><i>Code: Evaluate positive, negative loadings on axes of biplots</i></summary>

```{r}
#!/usr/bin/env Rscript

#  Save component loading vectors in their own dataframe ----------------------
loadings <- as.data.frame(obj_pca$loadings)

#  Evaluate the component loading vectors for the number of significant PCs
#+ identified via the elbow method plus two
PCs <- paste0("PC", seq_len(as.numeric(elbow) + 2))
top_loadings_all <- lapply(
    PCs, get_top_loadings, x = loadings, z = "all", a = TRUE
)
top_loadings_pos <- lapply(
    PCs, get_top_loadings, x = loadings, z = "pos", a = TRUE
)
top_loadings_neg <- lapply(
    PCs, get_top_loadings, x = loadings, z = "neg", a = TRUE
)

#  Name the elements of each list after the PC they were computed for
names(top_loadings_all) <-
    names(top_loadings_pos) <-
    names(top_loadings_neg) <-
    PCs
# rm(PCs)
# top_loadings_all$PC1 %>% head(n = 20)
# top_loadings_pos$PC1 %>% head(n = 20)
# top_loadings_neg$PC1 %>% head(n = 20)


#  Evaluate positive, negative loadings on axes of biplots --------------------
#+ Look at the top 15 per axis
#+
#+ For every pair of PCs (one column of "mat" per pair), "images" receives two
#+ elements: "PCAtools.<PC_x>.v.<PC_y>" (the result of plot_biplot()) and
#+ "KA.<PC_x>.v.<PC_y>" (the result of plot_pos_neg_loadings_each_axis())
images <- list()
mat <- combn(PCs, 2)
for(i in seq_len(ncol(mat))) {
    # i <- 1
    j <- mat[, i]
    
    #  The PC names double as the axis labels
    PC_x <- x_label <- j[1]
    PC_y <- y_label <- j[2]
    
    images[[paste0("PCAtools.", PC_x, ".v.", PC_y)]] <- plot_biplot(
        pca = obj_pca,
        PC_x = PC_x,
        PC_y = PC_y,
        loadings_show = FALSE,
        loadings_n = 0,
        meta_color = "state",
        meta_shape = "transcription",
        x_min = -350000,
        x_max = 350000,
        y_min = -350000,
        y_max = 350000
    )
    
    images[[paste0("KA.", PC_x, ".v.", PC_y)]] <-
        plot_pos_neg_loadings_each_axis(
            df_all = loadings,
            df_pos = top_loadings_pos,
            df_neg = top_loadings_neg,
            PC_x = PC_x,
            PC_y = PC_y,
            row_start = 1,
            row_end = 15,  # 30
            x_min = -0.5,  # -0.15,  # -1.0,
            x_max = 0.5,  # 0.15,  # 1.0,
            y_min = -0.5,  # -0.1,  # -0.5,
            y_max = 0.5,  # 0.1,  # 0.5,
            x_nudge = 0.02,  # 0.02,  # 0.04,
            y_nudge = 0.04,  # 0.04,  # 0.02,
            x_label = x_label,
            y_label = y_label,
            col_line_pos = "black",
            col_line_neg = "red",
            col_seg_pos = "grey",
            col_seg_neg = "grey"
        )
    #  Nothing is printed here: values are not auto-printed inside a for loop,
    #+ so the plots are displayed after the loop instead
}

#  How do things look?
#+ For each PC pair, print the PCAtools biplot and then the four loadings
#+ panels: top positive and negative loadings for the x-axis PC (PC_x_pos,
#+ PC_x_neg) followed by those for the y-axis PC (PC_y_pos, PC_y_neg)
#NOTE(review) The pairs below are hard-coded for PC1 through PC4; a pair that
#+            is not present in "images" should evaluate to NULL rather than a
#+            plot -- confirm that at least four PCs were evaluated
images$PCAtools.PC1.v.PC2
images$KA.PC1.v.PC2$PC_x_pos
images$KA.PC1.v.PC2$PC_x_neg
images$KA.PC1.v.PC2$PC_y_pos
images$KA.PC1.v.PC2$PC_y_neg

images$PCAtools.PC1.v.PC3
images$KA.PC1.v.PC3$PC_x_pos
images$KA.PC1.v.PC3$PC_x_neg
images$KA.PC1.v.PC3$PC_y_pos
images$KA.PC1.v.PC3$PC_y_neg

images$PCAtools.PC1.v.PC4
images$KA.PC1.v.PC4$PC_x_pos
images$KA.PC1.v.PC4$PC_x_neg
images$KA.PC1.v.PC4$PC_y_pos
images$KA.PC1.v.PC4$PC_y_neg

images$PCAtools.PC2.v.PC3
images$KA.PC2.v.PC3$PC_x_pos
images$KA.PC2.v.PC3$PC_x_neg
images$KA.PC2.v.PC3$PC_y_pos
images$KA.PC2.v.PC3$PC_y_neg

images$PCAtools.PC2.v.PC4
images$KA.PC2.v.PC4$PC_x_pos
images$KA.PC2.v.PC4$PC_x_neg
images$KA.PC2.v.PC4$PC_y_pos
images$KA.PC2.v.PC4$PC_y_neg

images$PCAtools.PC3.v.PC4
images$KA.PC3.v.PC4$PC_x_pos
images$KA.PC3.v.PC4$PC_x_neg
images$KA.PC3.v.PC4$PC_y_pos
images$KA.PC3.v.PC4$PC_y_neg

#  Alternative (disabled): print whole list elements rather than single panels
# images$PCAtools.PC1.v.PC3
# images$KA.PC1.v.PC3
# images$PCAtools.PC1.v.PC4
# images$KA.PC1.v.PC4
# images$PCAtools.PC2.v.PC3
# images$KA.PC2.v.PC3
```
</details>
<br />

### Evaluate the top features and metadata-PC correlation
#### Code
<details>
<summary><i>Code: Evaluate the top features and metadata-PC correlation</i></summary>

```{r}
#!/usr/bin/env Rscript

#  Plot the top features on an axis of component loading range ----------------
#+ ...to visualize the top variables (features) that drive variance among
#+ principal components of interest
#+
#+ Per the PCAtools documentation, rangeRetain is the top/bottom fraction of
#+ the loadings range within which variables are retained; the subtitle is
#+ kept in agreement with the value that is actually used (0.1, i.e., 10%)
p_loadings <- PCAtools::plotloadings(
    obj_pca,
    # components = PCAtools::getComponents(obj_pca, 1),
    components = PCAtools::getComponents(obj_pca, 1:2),
    rangeRetain = 0.1,
    absolute = FALSE,
    col = c("#785EF075", "#FFFFFF75", "#FE610075"),
    title = "Loadings plot",
    subtitle = "Top 10% of variables (i.e., features)",
    # shapeSizeRange = c(4, 16),
    borderColour = "#000000",
    borderWidth = 0.2,
    gridlines.major = TRUE,
    gridlines.minor = TRUE,
    axisLabSize = 10,
    labSize = 3,  # label_size
    drawConnectors = TRUE,
    widthConnectors = 0.2,
    typeConnectors = 'closed',
    colConnectors = 'black'
) +
    # ggplot2::coord_flip() +
    theme_slick_no_legend
p_loadings
#TODO Work up some logic for saving the plot


#  Evaluate correlations between PCs and model variables ----------------------
#+ Answer, "What is driving biologically significant variance in our data?"
#+
#+ Pearson correlations between PCs 1-8 and the metadata variables "state",
#+ "transcription", and "replicate" are plotted as r-squared values, with
#+ Benjamini-Hochberg ("BH") correction for multiple testing
PC_cor <- PCAtools::eigencorplot(
    obj_pca,
    components = PCAtools::getComponents(obj_pca, 1:8),
    metavars = c("state", "transcription", "replicate"),
    col = c("#FFFFFF75", "#7835AC75"),
    scale = FALSE,
    corFUN = "pearson",
    corMultipleTestCorrection = "BH",
    plotRsquared = TRUE,
    colFrame = "#FFFFFF",
    main = bquote(Pearson ~ r^2 ~ correlates),
    # main = "PC Pearson r-squared correlates",
    fontMain = 1,
    titleX = "Principal components",
    fontTitleX = 1,
    fontLabX = 1,
    titleY = "Model variables",
    rotTitleY = 90,
    fontTitleY = 1,
    fontLabY = 1
)
PC_cor
```
</details>
<br />

### Get lists of top loadings for GO analyses
#### Code
<details>
<summary><i>Code: Get lists of top loadings for GO analyses</i></summary>

```{r, eval=FALSE}
#!/usr/bin/env Rscript

#  For each PC of interest, collect the names of (up to) the first 500 rows of
#+ the positive and negative top-loadings tables and compose an outfile name
#+ for each list; writing the lists to disc is currently disabled
#NOTE(review) loadings_pos_PC and loadings_neg_PC are passed to
#+            get_name_of_var(), which is defined elsewhere; presumably it
#+            returns the variable's name as a string, so renaming these
#+            variables would change the file titles -- confirm
# for(i in c("PC1", "PC2", "PC3", "PC4")) {
for(i in c("PC1", "PC2")) {
    # i <- "PC1"
    #  Positive
    #+ head() returns at most 500 names; indexing with [1:500] pads the vector
    #+ with NA whenever fewer than 500 rows are available
    loadings_pos_PC <- utils::head(rownames(top_loadings_pos[[i]]), n = 500)
    save_title_pos_PC <- paste0(
        "top-500.",
        stringr::str_replace_all(get_name_of_var(loadings_pos_PC), "_", "-"),
        ".", i, ".txt"
    )
    # readr::write_tsv(
    #     dplyr::as_tibble(loadings_pos_PC),
    #     paste0(args$directory_out, "/", save_title_pos_PC),
    #     col_names = FALSE
    # )
    #TODO Work up some logic for location(s) for outfiles
    
    #  Negative
    loadings_neg_PC <- utils::head(rownames(top_loadings_neg[[i]]), n = 500)
    save_title_neg_PC <- paste0(
        "top-500.",
        stringr::str_replace_all(get_name_of_var(loadings_neg_PC), "_", "-"),
        ".", i, ".txt"
    )
    # readr::write_tsv(
    #     dplyr::as_tibble(loadings_neg_PC),
    #     paste0(args$directory_out, "/", save_title_neg_PC),
    #     col_names = FALSE
    # )
    #TODO Work up some logic for location(s) for outfiles
}
```
</details>
<br />

### Clean up
#### Code
<details>
<summary><i>Code: Clean up</i></summary>

```{r}
#!/usr/bin/env Rscript

#  Remove the PCA objects and the loop/scratch variables created in the chunks
#+ above. Only names that currently exist are passed to rm(): some of them
#+ (e.g., loadings_pos_PC) are created in a chunk marked eval=FALSE, and rm()
#+ issues a warning for every object it cannot find
rm(list = intersect(
    ls(),
    c(
        #  PCA objects, plots, and loadings tables
        "horn", "images", "loadings", "mat", "obj_pca", "p_loadings",
        "PC_cor", "scree", "top_loadings_all", "top_loadings_neg",
        "top_loadings_pos",
        #  Loop indices and other scratch variables
        "elbow", "i", "j", "loadings_neg_PC", "loadings_pos_PC", "PC_x",
        "PC_y", "PCs", "save_title_neg_PC", "save_title_pos_PC", "x_label",
        "y_label"
    )
))
```

</details>
<br />

### On the PCA results...
#### Notes
<details>
<summary><i>Notes: On the PCA results...</i></summary>

Even without normalization, the per-replicate counts facilitate the clustering of <u>like with like</u> and <u>unlike away from unlike</u>.

PC1 is strongly associated and correlated with "`state`" (i.e., Q versus G1), while PC2 is associated/correlated with "`transcription`".

`#CONCLUSION` I think it's valid to...

1. take an approach in which we calculate chi-square statistics, examining whether or not sets of variables&mdash;where vector "`category`" is the set and the variables are the vector elements&mdash;are likely to be related to each other, or
2. take an approach in which we compare the means of variables via t-tests&mdash;e.g., in this case, the elements in vector "`category`" are the variables, and the means of their between-replicate tallies are what we are comparing. I can see us doing "general" two-tailed t-tests or paired two-tailed t-tests.

</details>
<br />
<br />

## Test consistency, quality via PCA of `rlog`-norm. counts, no model
*The clustering of like with like in the raw data&mdash;how much of it is due to library size rather than library composition?*

### Initialize necessary functions
*See above*

### For the counts, create a "`dds`" object then perform `rlog` transformations
#### Code
<details>
<summary><i>Code: For the counts, create a "`dds`" object then perform `rlog` transformations</i></summary>

```{r}
#!/usr/bin/env Rscript

#  Build a DESeq2 dataset from the counts (every column after the first of
#+ t_S_none_rel) with an intercept-only design (~ 1)
dds_S_none_rel <- DESeq2::DESeqDataSetFromMatrix(
    countData = t_S_none_rel[, 2:ncol(t_S_none_rel)],
    colData = metadata,
    rowRanges = t_S_none_rel_pos_info,
    design = ~ 1
)
# dds_S_none_rel %>% BiocGenerics::counts() %>% head()
# dds_S_none_rel@rowRanges
# dds_S_none_rel@design
# dds_S_none_rel@assays


#  rlog-transform the counts, once with blind = TRUE ("rT") and once with
#+ blind = FALSE ("rF"), and store each transformed assay as a dataframe
rT_S_none_rel <- as.data.frame(
    SummarizedExperiment::assay(DESeq2::rlog(dds_S_none_rel, blind = TRUE))
)
# rT_S_none_rel

rF_S_none_rel <- as.data.frame(
    SummarizedExperiment::assay(DESeq2::rlog(dds_S_none_rel, blind = FALSE))
)
# rF_S_none_rel

#NOTE 1/2 Since the experimental design is ~1, it makes no difference whether
#NOTE 2/2 blind is TRUE or FALSE
```
</details>
<br />

### Create "`pca`" object and evaluate significant PCs
#### Code
<details>
<summary><i>Code: Create "`pca`" object and evaluate significant PCs</i></summary>

```{r}
#!/usr/bin/env Rscript

#  Create a PCAtools "pca" object for the rlog-transformed counts -------------
#+ Assign unique row names too (the values in column gene_id of t_S_none_rel)
obj_pca <- PCAtools::pca(rT_S_none_rel, metadata = metadata)
rownames(obj_pca$loadings) <- dplyr::pull(t_S_none_rel, gene_id)


#  Determine "significant" PCs with Horn's parallel analysis ------------------
#+ See Horn, 1965
#+
#+ The analysis is run on the same rlog-transformed matrix that was passed to
#+ PCAtools::pca() above, so that the cutoff drawn on the scree plot describes
#+ the PCA being plotted (and not a PCA of the raw counts)
horn <- PCAtools::parallelPCA(mat = as.matrix(rT_S_none_rel))
# horn$n


#  Determine "significant" principal components with the elbow method ---------
#+ See Buja and Eyuboglu, 1992
elbow <- PCAtools::findElbowPoint(obj_pca$variance)
# elbow


#  Evaluate cumulative proportion of explained variance with a scree plot -----
scree <- draw_scree_plot(obj_pca, horn = horn$n, elbow = elbow)
scree
```
</details>
<br />

### Evaluate positive, negative loadings on axes of biplots
#### Code
<details>
<summary><i>Code: Evaluate positive, negative loadings on axes of biplots</i></summary>

```{r}
#!/usr/bin/env Rscript

#  Save component loading vectors in their own dataframe ----------------------
loadings <- as.data.frame(obj_pca$loadings)

#  Evaluate the component loading vectors for the number of significant PCs
#+ identified via the elbow method plus two
PCs <- paste0("PC", seq_len(as.numeric(elbow) + 2))
top_loadings_all <- lapply(
    PCs, get_top_loadings, x = loadings, z = "all", a = TRUE
)
top_loadings_pos <- lapply(
    PCs, get_top_loadings, x = loadings, z = "pos", a = TRUE
)
top_loadings_neg <- lapply(
    PCs, get_top_loadings, x = loadings, z = "neg", a = TRUE
)

#  Name the elements of each list after the PC they were computed for
names(top_loadings_all) <-
    names(top_loadings_pos) <-
    names(top_loadings_neg) <-
    PCs
# rm(PCs)
# top_loadings_all$PC1 %>% head(n = 20)
# top_loadings_pos$PC1 %>% head(n = 20)
# top_loadings_neg$PC1 %>% head(n = 20)


#  Evaluate positive, negative loadings on axes of biplots --------------------
#+ Look at the top 15 per axis
#+
#+ For every pair of PCs (one column of "mat" per pair), "images" receives two
#+ elements: "PCAtools.<PC_x>.v.<PC_y>" (the result of plot_biplot()) and
#+ "KA.<PC_x>.v.<PC_y>" (the result of plot_pos_neg_loadings_each_axis())
images <- list()
mat <- combn(PCs, 2)
for(i in seq_len(ncol(mat))) {
    # i <- 1
    j <- mat[, i]
    
    #  The PC names double as the axis labels
    PC_x <- x_label <- j[1]
    PC_y <- y_label <- j[2]
    
    images[[paste0("PCAtools.", PC_x, ".v.", PC_y)]] <- plot_biplot(
        pca = obj_pca,
        PC_x = PC_x,
        PC_y = PC_y,
        loadings_show = FALSE,
        loadings_n = 0,
        meta_color = "state",
        meta_shape = "transcription",
        x_min = -100,
        x_max = 100,
        y_min = -100,
        y_max = 100
    )
    
    images[[paste0("KA.", PC_x, ".v.", PC_y)]] <-
        plot_pos_neg_loadings_each_axis(
            df_all = loadings,
            df_pos = top_loadings_pos,
            df_neg = top_loadings_neg,
            PC_x = PC_x,
            PC_y = PC_y,
            row_start = 1,
            row_end = 15,  # 30
            x_min = -0.1,  # -0.15,  # -1.0,
            x_max = 0.1,  # 0.15,  # 1.0,
            y_min = -0.1,  # -0.1,  # -0.5,
            y_max = 0.1,  # 0.1,  # 0.5,
            x_nudge = 0.02,  # 0.02,  # 0.04,
            y_nudge = 0.04,  # 0.04,  # 0.02,
            x_label = x_label,
            y_label = y_label,
            col_line_pos = "black",
            col_line_neg = "red",
            col_seg_pos = "grey",
            col_seg_neg = "grey"
        )
    #  Nothing is printed here: values are not auto-printed inside a for loop,
    #+ so the plots are displayed after the loop instead
}

#  How do things look?
#+ For each PC pair, print the PCAtools biplot and then the four loadings
#+ panels: top positive and negative loadings for the x-axis PC (PC_x_pos,
#+ PC_x_neg) followed by those for the y-axis PC (PC_y_pos, PC_y_neg)
#NOTE(review) The pairs below are hard-coded for PC1 through PC4; a pair that
#+            is not present in "images" should evaluate to NULL rather than a
#+            plot -- confirm that at least four PCs were evaluated
images$PCAtools.PC1.v.PC2
images$KA.PC1.v.PC2$PC_x_pos
images$KA.PC1.v.PC2$PC_x_neg
images$KA.PC1.v.PC2$PC_y_pos
images$KA.PC1.v.PC2$PC_y_neg

images$PCAtools.PC1.v.PC3
images$KA.PC1.v.PC3$PC_x_pos
images$KA.PC1.v.PC3$PC_x_neg
images$KA.PC1.v.PC3$PC_y_pos
images$KA.PC1.v.PC3$PC_y_neg

images$PCAtools.PC1.v.PC4
images$KA.PC1.v.PC4$PC_x_pos
images$KA.PC1.v.PC4$PC_x_neg
images$KA.PC1.v.PC4$PC_y_pos
images$KA.PC1.v.PC4$PC_y_neg

images$PCAtools.PC2.v.PC3
images$KA.PC2.v.PC3$PC_x_pos
images$KA.PC2.v.PC3$PC_x_neg
images$KA.PC2.v.PC3$PC_y_pos
images$KA.PC2.v.PC3$PC_y_neg

images$PCAtools.PC2.v.PC4
images$KA.PC2.v.PC4$PC_x_pos
images$KA.PC2.v.PC4$PC_x_neg
images$KA.PC2.v.PC4$PC_y_pos
images$KA.PC2.v.PC4$PC_y_neg

images$PCAtools.PC3.v.PC4
images$KA.PC3.v.PC4$PC_x_pos
images$KA.PC3.v.PC4$PC_x_neg
images$KA.PC3.v.PC4$PC_y_pos
images$KA.PC3.v.PC4$PC_y_neg

#  Alternative (disabled): print whole list elements rather than single panels
# images$PCAtools.PC1.v.PC3
# images$KA.PC1.v.PC3
# images$PCAtools.PC1.v.PC4
# images$KA.PC1.v.PC4
# images$PCAtools.PC2.v.PC3
# images$KA.PC2.v.PC3
```
</details>
<br />

### Evaluate the top features and metadata-PC correlation
#### Code
<details>
<summary><i>Code: Evaluate the top features and metadata-PC correlation</i></summary>

```{r}
#!/usr/bin/env Rscript

#  Draw a loadings plot for PCs 1-3 -------------------------------------------
#+ The plot shows the variables (features) retained by the rangeRetain cutoff
#+ (0.025 here) for each of the components of interest
p_loadings <- PCAtools::plotloadings(
    obj_pca,
    #  What to plot
    # components = PCAtools::getComponents(obj_pca, 1),
    components = PCAtools::getComponents(obj_pca, 1:3),
    rangeRetain = 0.025,
    absolute = FALSE,
    #  Titles
    title = "Loadings plot",
    subtitle = "Top 2.5% of variables (i.e., features)",
    #  Points, grid lines, and text sizes
    col = c("#167C2875", "#FFFFFF75", "#7835AC75"),
    # shapeSizeRange = c(4, 16),
    borderColour = "#000000",
    borderWidth = 0.2,
    gridlines.major = TRUE,
    gridlines.minor = TRUE,
    axisLabSize = 10,
    labSize = 3,  # label_size
    #  Connectors between points and labels
    drawConnectors = TRUE,
    widthConnectors = 0.2,
    typeConnectors = "closed",
    colConnectors = "black"
) +
    # ggplot2::coord_flip() +
    theme_slick_no_legend
p_loadings
#TODO Work up some logic for saving the plot


#  Correlate PCs with model variables -----------------------------------------
#+ Answer, "What is driving biologically significant variance in our data?"
#+ Pearson correlations between PCs 1-8 and "state", "transcription", and
#+ "replicate" are plotted as r-squared values with "BH" multiple-testing
#+ correction
PC_cor <- PCAtools::eigencorplot(
    obj_pca,
    #  What to correlate, and how
    components = PCAtools::getComponents(obj_pca, 1:8),
    metavars = c("state", "transcription", "replicate"),
    corFUN = "pearson",
    corMultipleTestCorrection = "BH",
    plotRsquared = TRUE,
    scale = FALSE,
    #  Colors
    col = c("#FFFFFF75", "#7835AC75"),
    colFrame = "#FFFFFF",
    #  Titles and fonts
    main = bquote(Pearson ~ r^2 ~ correlates),
    # main = "PC Pearson r-squared correlates",
    fontMain = 1,
    titleX = "Principal components",
    fontTitleX = 1,
    fontLabX = 1,
    titleY = "Model variables",
    rotTitleY = 90,
    fontTitleY = 1,
    fontLabY = 1
)
PC_cor
```
</details>
<br />

### Get lists of top loadings for GO analyses
#### Code
<details>
<summary><i>Code: Get lists of top loadings for GO analyses</i></summary>

```{r, eval=FALSE}
#!/usr/bin/env Rscript

#  For each PC of interest, collect the names of (up to) the first 500 rows of
#+ the positive and negative top-loadings tables and compose an outfile name
#+ for each list; writing the lists to disc is currently disabled
#NOTE(review) loadings_pos_PC and loadings_neg_PC are passed to
#+            get_name_of_var(), which is defined elsewhere; presumably it
#+            returns the variable's name as a string, so renaming these
#+            variables would change the file titles -- confirm
# for(i in c("PC1", "PC2", "PC3", "PC4")) {
for(i in c("PC1", "PC2", "PC3")) {
    # i <- "PC1"
    #  Positive
    #+ head() returns at most 500 names; indexing with [1:500] pads the vector
    #+ with NA whenever fewer than 500 rows are available
    loadings_pos_PC <- utils::head(rownames(top_loadings_pos[[i]]), n = 500)
    save_title_pos_PC <- paste0(
        "top-500.",
        stringr::str_replace_all(get_name_of_var(loadings_pos_PC), "_", "-"),
        ".", i, ".txt"
    )
    # readr::write_tsv(
    #     dplyr::as_tibble(loadings_pos_PC),
    #     paste0(args$directory_out, "/", save_title_pos_PC),
    #     col_names = FALSE
    # )
    #TODO Work up some logic for location(s) for outfiles
    
    #  Negative
    loadings_neg_PC <- utils::head(rownames(top_loadings_neg[[i]]), n = 500)
    save_title_neg_PC <- paste0(
        "top-500.",
        stringr::str_replace_all(get_name_of_var(loadings_neg_PC), "_", "-"),
        ".", i, ".txt"
    )
    # readr::write_tsv(
    #     dplyr::as_tibble(loadings_neg_PC),
    #     paste0(args$directory_out, "/", save_title_neg_PC),
    #     col_names = FALSE
    # )
    #TODO Work up some logic for location(s) for outfiles
}
```
</details>
<br />

### Clean up
#### Code
<details>
<summary><i>Code: Clean up</i></summary>

```{r}
#!/usr/bin/env Rscript

#  Remove the DESeq2/rlog objects, the PCA objects, and the loop/scratch
#+ variables created in the chunks above. Only names that currently exist are
#+ passed to rm(): some of them (e.g., loadings_pos_PC) are created in a chunk
#+ marked eval=FALSE, and rm() issues a warning for every object it cannot find
rm(list = intersect(
    ls(),
    c(
        #  DESeq2 dataset and rlog-transformed counts
        "dds_S_none_rel", "rF_S_none_rel", "rT_S_none_rel",
        #  PCA objects, plots, and loadings tables
        "horn", "images", "loadings", "mat", "obj_pca", "p_loadings",
        "PC_cor", "scree", "top_loadings_all", "top_loadings_neg",
        "top_loadings_pos",
        #  Loop indices and other scratch variables
        "elbow", "i", "j", "loadings_neg_PC", "loadings_pos_PC", "PC_x",
        "PC_y", "PCs", "save_title_neg_PC", "save_title_pos_PC", "x_label",
        "y_label"
    )
))
```
</details>
<br />
<br />

## Process data, then plot counts proportions
### Read in, process the dataframe for `work_calculate_uni-multimappers-etc.md` metrics
After having written and run the code in [`work_calculate_uni-multimappers-etc.md`](./work_calculate_uni-multimappers-etc.md)...

#### Code
<details>
<summary><i>Code: Read in, process the dataframe for `work_calculate_uni-multimappers-etc.md` metrics</i></summary>

```{r}
#!/usr/bin/env Rscript

#HERE
#  Read in the tab-separated table of uni-/multimapper etc. metrics
p_uni_multi_etc <- "outfiles_htseq-count"
f_uni_multi_etc <- "calculate_uni-multimappers-etc.UT_prim_UMI.txt"
uni_multi_etc <- readr::read_tsv(
    paste(p_uni_multi_etc, f_uni_multi_etc, sep = "/"),
    show_col_types = FALSE
)

rm(f_uni_multi_etc, p_uni_multi_etc)

#  Isolate (and clean up the names of) relevant information ("Ovation" data)
#+ "relevant" holds the column names of t_S_none that contain "ovn", stripped
#+ of the bam directory prefix and the ".UT_prim_UMI.bam" suffix
relevant <- colnames(t_S_none)[  #FIXME t_S_none needs to have been loaded
    stringr::str_detect(colnames(t_S_none), "ovn")
] %>%
    gsub("bams_renamed/UT_prim_UMI/", "", .) %>%
    gsub("\\.UT_prim_UMI\\.bam", "", .)
uni_multi_etc_rel <- uni_multi_etc %>% dplyr::filter(sample %in% relevant)
#NOTE(review) The first pattern below begins with "*", which looks like glob
#+            syntax rather than a regular expression; presumably the intent
#+            is to strip "_day<character>_ovn" -- confirm before changing it
uni_multi_etc_rel$sample <- uni_multi_etc_rel$sample %>%
    gsub("*_day._ovn*", "", .) %>%
    gsub("aux-F_tc-F_", "", .) %>%
    gsub("_tech1", "", .)

#  Transpose the dataframe
#+ t() returns a matrix, so every value (numbers included) is of type
#+ character from this point until the conversion at the end of the chunk
uni_multi_etc_rel <- uni_multi_etc_rel %>%
    t() %>%
    tibble::as_tibble(rownames = "category")

#  Convert row 1 to column names
#+ The row is flattened to a plain character vector before it is assigned
colnames(uni_multi_etc_rel) <- as.character(
    unlist(uni_multi_etc_rel[1, ], use.names = FALSE)
)
uni_multi_etc_rel <- uni_multi_etc_rel[-1, ]
colnames(uni_multi_etc_rel)[1] <- "gene_id"

#  Convert number elements from type character to type numeric
#+ across() converts each column in place and always returns a tibble of the
#+ same shape, whereas the shape of sapply()'s return value depends on the
#+ number of rows
uni_multi_etc_rel <- uni_multi_etc_rel %>%
    dplyr::mutate(dplyr::across(-gene_id, as.numeric))

#  Clean up
rm(relevant)
```
</details>
<br />

### Combine feature counts and `htseq-count`- and `samtools`-derived summary values
#### Code
<details>
<summary><i>Code: Combine feature counts and `htseq-count`- and `samtools`-derived summary values</i></summary>

```{r}
#!/usr/bin/env Rscript

#  Stack three tables row-wise into one table, "ovation": the feature counts
#+ (t_S_none_rel), the htseq-count summary rows
#+ (`t_S_none_rel_htseq-count-summary`), and the uni-/multimapper etc. metrics
#+ (uni_multi_etc_rel)
ovation <- list(
    t_S_none_rel,
    `t_S_none_rel_htseq-count-summary`,
    uni_multi_etc_rel
) %>%
    dplyr::bind_rows()
```
</details>
<br />

### Check on "important rows"
#### Code
<details>
<summary><i>Code: Check on "important rows"</i></summary>

```{r, eval=FALSE}
#!/usr/bin/env Rscript

#  Interactive sanity checks (this chunk is not evaluated when knitting); the
#+ commented values beneath each call are outputs recorded from earlier runs
#NOTE(review) Rows of uni_multi_etc_rel are selected by position (1, 5, 6);
#+            which metric each row holds is taken from the comments below --
#+            confirm against the gene_id column

#  Check the column-wise sum of feature-assigned counts
sapply(t_S_none_rel[, -1], sum)
# WT_G1_N_rep1  WT_G1_N_rep2 WT_G1_SS_rep1 WT_G1_SS_rep2   WT_Q_N_rep1   WT_Q_N_rep2  WT_Q_SS_rep1  WT_Q_SS_rep2
#      9832375       9778987       4386156       4569639       6418902       7202562       3069577       2228669

#  Observe sample_total (all counts/read pairs in bam)
uni_multi_etc_rel[1, -1]
# WT_G1_N_rep1 WT_G1_N_rep2 WT_G1_SS_rep1 WT_G1_SS_rep2 WT_Q_N_rep1 WT_Q_N_rep2 WT_Q_SS_rep1 WT_Q_SS_rep2
#     15634566     15755143      10723500      10009120    18858614    17445726      9515191      8394823

#  Observe SC-I-XVI_all (all counts/read pairs associated with S. cerevisiae
#+ chromosomes I-XVI)
uni_multi_etc_rel[5, -1]
# WT_G1_N_rep1 WT_G1_N_rep2 WT_G1_SS_rep1 WT_G1_SS_rep2 WT_Q_N_rep1 WT_Q_N_rep2 WT_Q_SS_rep1 WT_Q_SS_rep2
#     15219175     15054009       9398774       8659383     9532767    10643767      6350327      5263914

#  Observe SC-I-XVI_uni (unimapping counts/read pairs associated with S.
#+ cerevisiae chromosomes I-XVI)
uni_multi_etc_rel[6, -1]
# WT_G1_N_rep1 WT_G1_N_rep2 WT_G1_SS_rep1 WT_G1_SS_rep2 WT_Q_N_rep1 WT_Q_N_rep2 WT_Q_SS_rep1 WT_Q_SS_rep2
#     13188385     12953728       6619041       6866366     8502909     9697530      4374618      3403427
```
</details>
<br />

### Run the processing steps detailed [above](#evaluate-the-assignments-from-htseq-count)
#### Code
<details>
<summary><i>Code: Run the processing steps detailed above</i></summary>

```{r}
#!/usr/bin/env Rscript

#  Recalculate the htseq-count summary values so that they cover only counts
#+ associated with "SC-I-XVI", then check the total against "SC-I-XVI_all"
#NOTE(review) Rows are selected by pattern match on gene_id (str_detect) and
#+            sample columns by position (2:9); this presumably relies on each
#+            pattern matching exactly one row of "ovation" and on the eight
#+            sample columns sitting in columns 2-9 -- confirm

#  Step 1: Get multimappers associated with "Mito-KL-20S" ---------------------
`Mito-KL-20S_multi` <-
    ovation[stringr::str_detect(ovation$gene_id, "Mito-KL-20S_multi"), ]


#  Step 2: Get unimappers associated with "Mito-KL-20S" -----------------------
`Mito-KL-20S_uni` <-
    ovation[stringr::str_detect(ovation$gene_id, "Mito-KL-20S_uni"), ]


#  Step 3: Get unimappers associated with "SC-I-XVI" --------------------------
`SC-I-XVI_uni` <-
    ovation[stringr::str_detect(ovation$gene_id, "SC-I-XVI_uni"), ]


#  Step 4: Subtract `Mito-KL-20S_multi` from "`__alignment_not_unique` --------
#+ Only the sample columns (2:9) are kept here, so the subtraction is numeric
`__alignment_not_unique` <- ovation[
    stringr::str_detect(ovation$gene_id, "__alignment_not_unique"), 2:9
]

`__alignment_not_unique_I-XVI` <-
    `__alignment_not_unique` - `Mito-KL-20S_multi`[, 2:9]

#  Label the result and move the label in front of the first sample column
`__alignment_not_unique_I-XVI`$gene_id <- "__alignment_not_unique_I-XVI"
`__alignment_not_unique_I-XVI` <- `__alignment_not_unique_I-XVI` %>%
    dplyr::relocate(gene_id, .before = "WT_G1_N_rep1")
# `__alignment_not_unique_I-XVI`


#  Step 5: Subtract `Mito-KL-20S_uni` from "`__no_feature`" -------------------
#+ Same procedure as in Step 4
`__no_feature` <- ovation[
    stringr::str_detect(ovation$gene_id, "__no_feature"), 2:9
]

`__no_feature_I-XVI` <-
    `__no_feature` - `Mito-KL-20S_uni`[, 2:9]

`__no_feature_I-XVI`$gene_id <- "__no_feature_I-XVI"
`__no_feature_I-XVI` <- `__no_feature_I-XVI` %>%
    dplyr::relocate(gene_id, .before = "WT_G1_N_rep1")
# `__no_feature_I-XVI`


#  Step 6: Take the sum of the following --------------------------------------
#+    - "`__no_feature_I-XVI`"
#+    - "`__ambiguous`"
#+    - "`__alignment_not_unique_I-XVI`"
#+    - "valid" counts
#+ "Valid" counts are the per-column sums of t_S_none_rel without its first
#+ column; they are added as a plain vector to the one-row tables
`__ambiguous` <- ovation[stringr::str_detect(ovation$gene_id, "__ambiguous"), ]
valid_counts <- sapply(t_S_none_rel[, -1], sum)

`__sum_I-XVI` <- `__no_feature_I-XVI`[, 2:9] +
    `__ambiguous`[, 2:9] +
    `__alignment_not_unique_I-XVI`[, 2:9] +
    valid_counts
`__sum_I-XVI`$gene_id <- "__sum_I-XVI"
`__sum_I-XVI` <- `__sum_I-XVI` %>%
    dplyr::relocate(gene_id, .before = "WT_G1_N_rep1")
# `__sum_I-XVI`


#  Step 7: Get numbers of all counts associated with "SC-I-XVI" ---------------
`SC-I-XVI_all` <- 
    ovation[stringr::str_detect(ovation$gene_id, "SC-I-XVI_all"), ]
# `SC-I-XVI_all`


#  Step 8: Check that `__sum_I-XVI` and `SC-I-XVI_all` are equal --------------
#+ If not, then there is a problem and troubleshooting needs to occur
#+ The result of identical() is only printed; a FALSE does not stop execution
identical(
    as.numeric(`__sum_I-XVI`[, 2:9]),
    as.numeric(`SC-I-XVI_all`[, 2:9])
)  # [1] TRUE
```
</details>
<br />

### Plot counts proportions
#### Code
<details>
<summary><i>Code: Plot counts proportions</i></summary>

```{r}
#!/usr/bin/env Rscript

#  Define an infix operator that is the negation of %in%
`%notin%` <- Negate(`%in%`)

#  Append the three recalculated "I-XVI" summary rows to the table of feature
#+ counts and summary values
ovation <- list(
    ovation,
    `__alignment_not_unique_I-XVI`,
    `__no_feature_I-XVI`,
    `__sum_I-XVI`
) %>%
    dplyr::bind_rows()
# ovation %>% tail(n = 20)
# ovation %>% tail(n = 15) %>% dplyr::select(gene_id)

#  Keep a copy of the table as it is before any rows are dropped
ovation.pre <- ovation
# ovation <- ovation.pre

#  Retain only relevant, appropriately calculated summary values: rows whose
#+ gene_id is listed in "exclude" are dropped
exclude <- c(
    "__no_feature", "__too_low_aQual", "__not_aligned",
    "__alignment_not_unique", "sample_total", "Mito-KL-20S_all",
    "Mito-KL-20S_uni", "Mito-KL-20S_multi", "SC-I-XVI_all", "SC-I-XVI_uni",
    "SC-I-XVI_multi", "__sum_I-XVI"
)
ovation <- ovation[ovation$gene_id %notin% exclude, ]

#  Set aside the last three rows, join every other row with the feature meta-
#+ and positional data in the gtf-derived dataframe (comp_S), then put the
#+ three rows back beneath the joined rows
#NOTE(review) This presumably relies on the last three rows being the
#+            remaining summary values -- confirm
df_tmp_2 <- ovation[(nrow(ovation) - 2):nrow(ovation), ]
df_tmp_1 <- ovation[1:(nrow(ovation) - 3), ] %>%
    dplyr::full_join(comp_S, by = "gene_id")
ovation <- dplyr::bind_rows(df_tmp_1, df_tmp_2)
rm(df_tmp_1, df_tmp_2)

#  Be sure the summary values have non-NA values in metadata column "category"
#+ by falling back on gene_id wherever category is NA
ovation[["category"]] <- ifelse(
    is.na(ovation[["category"]]),
    ovation[["gene_id"]],
    ovation[["category"]]
)

#  Create a dataframe of counts grouped by category metadata: one row per
#+ category, with the per-sample sums of counts ("sum_<sample>") and the
#+ number of rows in the category
ovation_summarize <- ovation %>%
    dplyr::group_by(category) %>%
    dplyr::summarize(
        dplyr::across(
            dplyr::all_of(c(
                "WT_G1_N_rep1", "WT_G1_N_rep2",
                "WT_G1_SS_rep1", "WT_G1_SS_rep2",
                "WT_Q_N_rep1", "WT_Q_N_rep2",
                "WT_Q_SS_rep1", "WT_Q_SS_rep2"
            )),
            sum,
            .names = "sum_{.col}"
        ),
        number_of_features = dplyr::n()
    )

#  Give the summarized categories clearer, better names
ovation_summarize$category <- replace(
    ovation_summarize$category,
    stringr::str_detect(
        ovation_summarize$category, "__alignment_not_unique_I-XVI"
    ),
    "multimapper (excluded)"
)
ovation_summarize$category <- replace(
    ovation_summarize$category,
    stringr::str_detect(ovation_summarize$category, "__ambiguous"),
    "ambiguous"
)
ovation_summarize$category <- replace(
    ovation_summarize$category,
    stringr::str_detect(ovation_summarize$category, "__no_feature_I-XVI"),
    "no feature"
)
ovation_summarize$category <- replace(
    ovation_summarize$category,
    stringr::str_detect(ovation_summarize$category, "PG"),
    "pseudogene"
)

#  Exclude the "multimapper (excluded)" category, and order the remaining
#+ categories alphabetically without respect to case
ovation_summarize <- ovation_summarize %>%
    dplyr::filter(!stringr::str_detect(
        category, "^multimapper*"
    )) %>%
    dplyr::arrange(tolower(category))

#  Assign NA to "number of features" for the summary-value categories
#NOTE(review) A count of exactly 1 is used to recognize the summary-value
#+            categories, so a feature category made up of a single feature
#+            would be set to NA too -- confirm that this is acceptable
ovation_summarize$number_of_features <- replace(
    ovation_summarize$number_of_features,
    ovation_summarize$number_of_features == 1,
    NA_integer_
)

#  Remove unnecessary string "sum_WT_" from the column names
colnames(ovation_summarize) <- stringr::str_remove(
    colnames(ovation_summarize), "sum_WT_"
)

# #  Test the piping of the dataframe
# ovation_summarize %>%
#     dplyr::select(-number_of_features) %>%
#     tidyr::pivot_longer(cols = c(
#         G1_N_rep1, G1_N_rep2, G1_SS_rep1, G1_SS_rep2,
#         Q_N_rep1, Q_N_rep2, Q_SS_rep1, Q_SS_rep2
#     )) %>%
#     dplyr::rename(sample = name, counts = value)


#  Plot per-replicate counts proportions --------------------------------------
#+ The eight per-replicate columns are pivoted to long format (one row per
#+ category and sample) and drawn as stacked bars scaled to full height
#+ (position = "fill"), one bar per sample, filled by category

#  Version 1: one viridis color per category
# set.seed(24)
ovation_summarize %>%
    tidyr::pivot_longer(
        cols = dplyr::all_of(c(
            "G1_N_rep1", "G1_N_rep2", "G1_SS_rep1", "G1_SS_rep2",
            "Q_N_rep1", "Q_N_rep2", "Q_SS_rep1", "Q_SS_rep2"
        )),
        names_to = "samples",
        values_to = "counts"
    ) %>%
    ggplot2::ggplot(
        ggplot2::aes(x = samples, y = counts, fill = category)
    ) +
    ggplot2::geom_bar(stat = "identity", position = "fill") +
    ggplot2::scale_fill_manual(
        values = viridisLite::viridis(length(ovation_summarize$category))
        # %>% sample()
    ) +
    theme_slick

#  Version 2: ggplot2's default fill colors
ovation_summarize %>%
    tidyr::pivot_longer(
        cols = dplyr::all_of(c(
            "G1_N_rep1", "G1_N_rep2", "G1_SS_rep1", "G1_SS_rep2",
            "Q_N_rep1", "Q_N_rep2", "Q_SS_rep1", "Q_SS_rep2"
        )),
        names_to = "samples",
        values_to = "counts"
    ) %>%
    ggplot2::ggplot(
        ggplot2::aes(x = samples, y = counts, fill = category)
    ) +
    ggplot2::geom_bar(stat = "identity", position = "fill") +
    theme_slick


#  Plot replicate-averaged counts proportions ---------------------------------
ovation_summarize.pre <- ovation_summarize
# ovation_summarize <- ovation_summarize.pre

#  For each condition, add the per-category mean and standard error of the
#+ mean (SEM) across its two replicates; replicate columns are selected by
#+ name rather than by position so that a change in column order cannot
#+ silently pair the wrong columns
ovation_summarize <- local({
    tbl <- ovation_summarize
    for(condition in c("G1_N", "G1_SS", "Q_N", "Q_SS")) {
        replicates <- as.matrix(ovation_summarize.pre[
            , paste(condition, c("rep1", "rep2"), sep = "_")
        ])
        tbl[[paste0("mean_", condition)]] <- apply(replicates, 1, mean)
        tbl[[paste0("SEM_", condition)]] <- apply(
            replicates, 1, function(x) sd(x) / sqrt(length(x))
        )
    }
    tbl
})

#  Stacked bars scaled to proportions, one bar per condition mean; the palette
#+ has one viridis color per category (shuffle it with sample() after
#+ set.seed() if adjacent colors are too similar)
# set.seed(24)
ggplot(
    tidyr::pivot_longer(
        ovation_summarize,
        cols = c(mean_G1_N, mean_G1_SS, mean_Q_N, mean_Q_SS),
        names_to = "samples",
        values_to = "counts"
    ),
    aes(x = samples, y = counts, fill = category)
) +
    geom_bar(stat = "identity", position = "fill") +
    scale_fill_manual(
        values = viridisLite::viridis(length(ovation_summarize$category))
    ) +
    theme_slick

#  Same plot with the default ggplot2 fill palette
ggplot(
    tidyr::pivot_longer(
        ovation_summarize,
        cols = c(mean_G1_N, mean_G1_SS, mean_Q_N, mean_Q_SS),
        names_to = "samples",
        values_to = "counts"
    ),
    aes(x = samples, y = counts, fill = category)
) +
    geom_bar(stat = "identity", position = "fill") +
    theme_slick
```
</details>
<br />

### Clean up
#### Code

<details>
<summary><i>Code: Clean up</i></summary>

```{r}
#!/usr/bin/env Rscript

#  Remove the objects listed below from the environment; names are passed as
#+ strings because several of them are not syntactic R names
rm(list = c(
    "__alignment_not_unique",
    "__alignment_not_unique_I-XVI",
    "__ambiguous",
    "__no_feature",
    "__no_feature_I-XVI",
    "__sum_I-XVI",
    "comp_S",
    "comp_SA",
    "exclude",
    "metadata",
    "Mito-KL-20S_multi",
    "Mito-KL-20S_uni",
    "ovation",
    "ovation.pre",
    "ovation_summarize",
    "ovation_summarize.pre",
    "SC-I-XVI_all",
    "SC-I-XVI_uni",
    "t_S_all",
    "t_S_frac",
    "t_S_none",
    "t_S_none_rel",
    "t_S_none_rel_htseq-count-summary",
    "t_S_none_rel_joined",
    "t_S_none_rel_pos_info",
    "t_S_rand",
    "t_SA_all",
    "t_SA_frac",
    "t_SA_none",
    "t_SA_rand",
    "uni_multi_etc",
    "uni_multi_etc_rel",
    "valid_counts"
))
```
</details>
<br />
<br />

## Perform extensive test analyses
`#NOTE` Functions and themes that are already loaded into the environment:

- `theme_slick`
- `theme_slick_no_legend`
- `%notin%`
- `draw_scree_plot`
- `get_name_of_var`
- `get_top_loadings`
- `plot_biplot`
- `plot_loadings`
- `plot_pos_neg_loadings_each_axis`
- `read_in_counts_matrix`
- `record_positional_info`

Chunk `Get situated` has been run as well.
<br />

### Initialize additional functions
#### Code
<details>
<summary><i>Code: Initialize additional functions</i></summary>

```{r}
#!/usr/bin/env Rscript

read_in_counts_matrix <- function(x) {
    # Read a tab-separated counts matrix and name its first column "gene_id"
    #
    # :param x: path to a counts matrix from htseq-count
    # :return y: counts matrix as tibble; the first column, which readr
    #            auto-names "...1", is renamed to "gene_id"
    tbl_counts <- readr::read_tsv(x, show_col_types = FALSE)
    y <- dplyr::rename(tbl_counts, gene_id = ...1)
    return(y)
}


clean_column_names <- function(x) {
    # Abbreviate the column names of a counts table by deleting path, file
    # extension, and invariant sample-name components
    #
    # :param x: tibble of counts for relevant datasets
    # :return y: "cleaned," abbreviated column names for above tibble
    #            <character vector>

    #  Regular expressions that are deleted from each column name, in order;
    #+ "_day._ovn" is a plain regular expression (the previous "*_day._ovn*"
    #+ was written like a glob: its leading "*" had nothing to repeat and its
    #+ trailing "*" quantified the final "n")
    patterns <- c(
        "bams_renamed/UT_prim_UMI/",
        "\\.UT_prim_UMI\\.bam",
        "_day._ovn",
        "aux-F_tc-F_",
        "_tech1",
        "WT_"
    )

    y <- colnames(x)
    for(pattern in patterns) {
        y <- gsub(pattern, "", y)
    }
    return(y)
}


run_PCA_pipeline <- function(counts, metadata, gene_id, transformed) {
    # Run PCA with PCAtools and assemble the associated diagnostics: scree
    # plot, top loadings, pairwise biplots and loadings plots, a
    # loadings-range plot, and an eigencor plot
    #
    # Relies on helpers defined elsewhere in this notebook: draw_scree_plot,
    # get_top_loadings, plot_biplot, plot_pos_neg_loadings_each_axis, and
    # theme_slick_no_legend
    #
    # :param counts: counts with one column per sample and no gene_id column
    #                <data.frame>
    # :param metadata: sample metadata with row names <data.frame>
    # :param gene_id: feature identifiers, assigned as row names of the
    #                 loadings <character vector>
    # :param transformed: FALSE for raw counts, TRUE for rlog counts <logical>
    # :return results_list: named list with elements "01_pca", "02_horn",
    #                       "03_elbow", "04_p_scree", "05_loadings", "06_PCs",
    #                       "07_top_loadings_all", "08_top_loadings_pos",
    #                       "09_top_loadings_neg", "10_p_images",
    #                       "11_p_loadings", and "12_p_cor" <list>
    stopifnot(is.data.frame(counts))
    stopifnot(is.data.frame(metadata))
    stopifnot(isTRUE(tibble::has_rownames(metadata)))
    stopifnot(is.character(gene_id))
    stopifnot(is.logical(transformed))
    #  A logical NA would otherwise leave the axis limits below undefined
    stopifnot(isTRUE(transformed) || isFALSE(transformed))

    #  Create a PCAtools "pca" S4 object
    pca <- PCAtools::pca(counts, metadata = metadata)
    rownames(pca$loadings) <- gene_id
    
    #  Determine "significant" PCs with Horn's parallel analysis (see
    #+ Horn, 1965); all sample columns are used, i.e., the same matrix that
    #+ was passed to PCAtools::pca() above (previously, the first sample
    #+ column was dropped here)
    horn <- PCAtools::parallelPCA(counts)
    
    #  Determine "significant" principal components with the elbow
    #+ method (see Buja and Eyuboglu, 1992)
    elbow <- PCAtools::findElbowPoint(pca$variance)
    
    #  Evaluate cumulative proportion of explained variance with a
    #+ scree plot
    p_scree <- draw_scree_plot(pca, horn = horn$n, elbow = elbow)
    
    #  Save component loading vectors in their own dataframe
    loadings <- as.data.frame(pca$loadings)
    
    #  Evaluate the component loading vectors for the number of
    #+ "significant" PCs identified via the elbow method plus two
    PCs <- paste0("PC", seq_len(as.numeric(elbow) + 2))
    top_loadings_all <- lapply(
        PCs, get_top_loadings, x = loadings, z = "all", a = TRUE
    )
    top_loadings_pos <- lapply(
        PCs, get_top_loadings, x = loadings, z = "pos", a = TRUE
    )
    top_loadings_neg <- lapply(
        PCs, get_top_loadings, x = loadings, z = "neg", a = TRUE
    )
    names(top_loadings_all) <-
        names(top_loadings_pos) <-
        names(top_loadings_neg) <-
        PCs
    
    #  Axis limits are symmetric about zero and depend only on whether the
    #+ counts were transformed, so they are set once, outside the loop
    if(isFALSE(transformed)) {
        lim_biplot <- 350000
        lim_loadings <- 0.5
    } else {
        lim_biplot <- 100  #ARGUMENT?
        lim_loadings <- 0.1
    }
    
    #  Evaluate positive and negative loadings on axes of biplots for every
    #+ pair of PCs; look at the top 15 per axis
    p_images <- list()
    pairs_PCs <- combn(PCs, 2)
    for(l in seq_len(ncol(pairs_PCs))) {
        PC_x <- pairs_PCs[1, l]
        PC_y <- pairs_PCs[2, l]
        
        p_images[[paste0("PCAtools.", PC_x, ".v.", PC_y)]] <-
            plot_biplot(
                pca = pca,
                PC_x = PC_x,
                PC_y = PC_y,
                loadings_show = FALSE,
                loadings_n = 0,
                meta_color = "state",  #ARGUMENT
                meta_shape = "transcription",  #ARGUMENT
                x_min = -lim_biplot,
                x_max = lim_biplot,
                y_min = -lim_biplot,
                y_max = lim_biplot
            )
        
        p_images[[paste0("KA.", PC_x, ".v.", PC_y)]] <-
            plot_pos_neg_loadings_each_axis(
                df_all = loadings,
                df_pos = top_loadings_pos,
                df_neg = top_loadings_neg,
                PC_x = PC_x,
                PC_y = PC_y,
                row_start = 1,
                row_end = 15,  # 30
                x_min = -lim_loadings,
                x_max = lim_loadings,
                y_min = -lim_loadings,
                y_max = lim_loadings,
                x_nudge = 0.02,  # 0.02,  # 0.04,
                y_nudge = 0.04,  # 0.04,  # 0.02,
                x_label = PC_x,
                y_label = PC_y,
                col_line_pos = "#229E37",
                col_line_neg = "#113275",
                col_seg_pos = "grey",
                col_seg_neg = "grey"
            )
    }
    
    #  Plot the top features on an axis of "component loading range" to
    #+ visualize the top variables (features) that drive variance among
    #+ PCs of interest
    p_loadings <- PCAtools::plotloadings(
        pca,
        components = PCAtools::getComponents(pca, seq_along(PCs)),
        rangeRetain = 0.025,
        absolute = FALSE,
        col = c("#167C2875", "#FFFFFF75", "#7835AC75"),
        title = "Loadings plot",
        subtitle = "Top 2.5% of variables (i.e., features)",
        borderColour = "#000000",
        borderWidth = 0.2,
        gridlines.major = TRUE,
        gridlines.minor = TRUE,
        axisLabSize = 10,
        labSize = 3,  # label_size
        drawConnectors = TRUE,
        widthConnectors = 0.2,
        typeConnectors = "closed",
        colConnectors = "black"
    ) +
        # ggplot2::coord_flip() +
        theme_slick_no_legend
    #TODO Work up some logic for saving the plot
    
    #  Evaluate correlations between PCs and model variables; answer
    #+ the question, "What is driving biologically significant variance
    #+ in our data?"
    #  NOTE(review): the first eight components are requested regardless of
    #+ how many the pca object holds; confirm this for inputs with fewer
    #+ than eight samples
    p_cor <- PCAtools::eigencorplot(
        pca,
        components = PCAtools::getComponents(pca, 1:8),
        metavars = c("state", "transcription", "replicate"),  #ARGUMENT
        # col = viridisLite::viridis(n = 100) %>% rev(),
        col = c("#FFFFFF", "#7835AC"),
        scale = FALSE,
        corFUN = "pearson",
        corMultipleTestCorrection = "BH",
        plotRsquared = TRUE,
        colFrame = "#FFFFFF",
        main = bquote(Pearson ~ r^2 ~ correlates),
        fontMain = 1,
        titleX = "Principal components",
        fontTitleX = 1,
        fontLabX = 1,
        titleY = "Model variables",
        rotTitleY = 90,
        fontTitleY = 1,
        fontLabY = 1
    )
    
    results_list <- list()
    results_list[["01_pca"]] <- pca
    results_list[["02_horn"]] <- horn
    results_list[["03_elbow"]] <- elbow
    results_list[["04_p_scree"]] <- p_scree
    results_list[["05_loadings"]] <- loadings
    results_list[["06_PCs"]] <- PCs
    results_list[["07_top_loadings_all"]] <- top_loadings_all
    results_list[["08_top_loadings_pos"]] <- top_loadings_pos
    results_list[["09_top_loadings_neg"]] <- top_loadings_neg
    results_list[["10_p_images"]] <- p_images
    results_list[["11_p_loadings"]] <- p_loadings
    results_list[["12_p_cor"]] <- p_cor
    
    return(results_list)
}


process_feature_summary_counts <- function(
        counts,
        summary_htseq,
        summary_samtools
) {
    # Combine feature counts with htseq-count and samtools summary values,
    # derive the "I-XVI"-specific summary rows, drop summary rows that are
    # not needed, and join the feature rows with gtf-derived metadata
    #
    # :param counts: dataframe of relevant counts
    # :param summary_htseq: dataframe of relevant htseq-count "summary values"
    # :param summary_samtools: dataframe of relevant samtools "summary values"
    # :return combined_raw_rel: feature rows joined with the gtf-derived
    #                           dataframe, followed by three summary rows
    #
    # NOTE(review): the gtf-derived dataframe `gtf` and the operator
    #               `%notin%` are not arguments; both are looked up in the
    #               enclosing environment and must exist before the call
    
    #  Test
    # counts <- counts_relevant
    # summary_htseq <- summary_values
    # summary_samtools <- uni_multi_etc_rel
    
    #  Sample columns are addressed by position: column 1 is gene_id and
    #+ columns 2-9 are the eight samples
    cols_samples <- 2:9
    
    #  Return the rows of tbl whose gene_id matches pattern
    get_rows <- function(tbl, pattern) {
        tbl[stringr::str_detect(tbl$gene_id, pattern), ]
    }
    
    #  Attach gene_id = id to a dataframe of sample values and move it to
    #+ the front
    label_row <- function(tbl, id) {
        tbl$gene_id <- id
        dplyr::relocate(tbl, gene_id, .before = "G1_N_rep1")
    }
    
    #  ----------------------------------------------------------------
    #  Load in and process combined feature/summary counts
    #  ------------------------
    #  - Step 00: Combine feature counts with htseq-count-/samtools-
    #+           derived summary values
    combined_raw <- dplyr::bind_rows(counts, summary_htseq, summary_samtools)
    
    #  ------------------------
    #  - Steps 01, 02: Get multimappers and unimappers associated with
    #+                "Mito-KL-20S"
    mito_multi <- get_rows(combined_raw, "Mito-KL-20S_multi")
    mito_uni <- get_rows(combined_raw, "Mito-KL-20S_uni")
    
    #  ------------------------
    #  - Step 04: Subtract `Mito-KL-20S_multi` from
    #+           `__alignment_not_unique`
    not_unique <- get_rows(
        combined_raw, "__alignment_not_unique"
    )[, cols_samples]
    not_unique_I_XVI <- label_row(
        not_unique - mito_multi[, cols_samples],
        "__alignment_not_unique_I-XVI"
    )
    
    #  ------------------------
    #  - Step 05: Subtract `Mito-KL-20S_uni` from `__no_feature`
    no_feature <- get_rows(combined_raw, "__no_feature")[, cols_samples]
    no_feature_I_XVI <- label_row(
        no_feature - mito_uni[, cols_samples],
        "__no_feature_I-XVI"
    )
    
    #  ------------------------
    #  - Step 06: Take the sum of the following
    #+     - "`__no_feature_I-XVI`"
    #+     - "`__ambiguous`"
    #+     - "`__alignment_not_unique_I-XVI`"
    #+     - "valid" counts (per-sample sums over the rows of `counts`; the
    #+       argument is used here, not a like-named global object)
    ambiguous <- get_rows(combined_raw, "__ambiguous")
    valid_counts <- vapply(counts[, -1], sum, numeric(1))
    
    sum_I_XVI <- label_row(
        no_feature_I_XVI[, cols_samples] +
            ambiguous[, cols_samples] +
            not_unique_I_XVI[, cols_samples] +
            valid_counts,
        "__sum_I-XVI"
    )
    
    #  ------------------------
    #  - Step 07: Get numbers of all counts associated with "SC-I-XVI"
    SC_all <- get_rows(combined_raw, "SC-I-XVI_all")
    
    #  ------------------------
    #  - Step 08: Check that `__sum_I-XVI` and `SC-I-XVI_all` are equal
    #+           (if not, then there is a problem and troubleshooting
    #+           needs to occur); a mismatch is reported as a warning so
    #+           that it is not silently ignored
    if(!identical(
        as.numeric(sum_I_XVI[, cols_samples]),
        as.numeric(SC_all[, cols_samples])
    )) {
        warning(
            "`__sum_I-XVI` and `SC-I-XVI_all` are not equal; ",
            "troubleshooting is needed",
            call. = FALSE
        )
    }
    
    #  ------------------------
    #  - Step 09: Create a dataframe of only relevant feature counts and
    #+           relevant summary values 
    combined_raw_rel <- dplyr::bind_rows(
        combined_raw,
        not_unique_I_XVI,
        no_feature_I_XVI,
        sum_I_XVI
    )
    
    #  Retain only relevant, appropriately calculated summary values
    exclude <- c(
        "__no_feature", "__too_low_aQual", "__not_aligned",
        "__alignment_not_unique", "sample_total", "Mito-KL-20S_all",
        "Mito-KL-20S_uni", "Mito-KL-20S_multi", "SC-I-XVI_all",
        "SC-I-XVI_uni", "SC-I-XVI_multi", "__sum_I-XVI"
    )
    combined_raw_rel <- combined_raw_rel[
        combined_raw_rel$gene_id %notin% exclude, 
    ]
    
    #  ------------------------
    #  - Step 10: Join the dataframe with feature meta- and positional 
    #+            data in the gtf-derived dataframe; the final three rows
    #+            are treated as summary values and are appended after the
    #+            join instead of taking part in it
    n_rows <- nrow(combined_raw_rel)
    rows_features <- combined_raw_rel[seq_len(n_rows - 3), ]
    rows_summary <- combined_raw_rel[(n_rows - 2):n_rows, ]
    combined_raw_rel <- dplyr::bind_rows(
        dplyr::full_join(rows_features, gtf, by = "gene_id"),
        rows_summary
    )
    
    #  Be sure the summary values have non-NA values in metadata column
    #+ "category"
    combined_raw_rel$category <- ifelse(
        is.na(combined_raw_rel$category),
        combined_raw_rel$gene_id,
        combined_raw_rel$category
    )
    
    return(combined_raw_rel)
}


#HERE
pivot_on_columns <- function(
    tbl,
    vec_col,
    rename_n = "samples",
    rename_v = "counts"
) {
    # Pivot the given columns of a tibble/dataframe into long format
    #
    # :param tbl: tibble/dataframe
    # :param vec_col: a vector of column names to long-pivot on
    #                 <character vector>
    # :param rename_n: what to rename new column "name"
    # :param rename_v: what to rename new column "value"
    # :return tbl: long-format tibble in which the former column names are
    #              held in column rename_n and their values in column
    #              rename_v
    
    # #  Test
    # tbl <- summary
    # vec_col <- col_piv
    # rename_n <- "samples"
    # rename_v <- "counts"
    # # rm(tbl, vec_col, rename_n, rename_v)
    
    if(length(vec_col) <= 1) {
        stop("Vector of column names must be greater than 1")
    } else if(isFALSE(any(vec_col %in% colnames(tbl)))) {
        stop("No column-name elements match tibble/dataframe column names")
    }
    
    #  Name any requested columns that are absent instead of leaving the
    #+ failure to the column selection below
    missing_cols <- setdiff(vec_col, colnames(tbl))
    if(length(missing_cols) > 0) {
        stop(
            "Column names not found in tibble/dataframe: ",
            paste(missing_cols, collapse = ", ")
        )
    }
    
    #  all_of() marks vec_col as an external character vector, which avoids
    #+ the ambiguity (and deprecation warning) of a bare vector in a
    #+ tidyselect context
    tbl <- tidyr::pivot_longer(
        tbl,
        cols = dplyr::all_of(vec_col),
        names_to = rename_n,
        values_to = rename_v
    )
    
    return(tbl)
}


#HERE
`plot-rep-prop_by-sample_stacked` <- function(tbl) {
    # Draw one bar per sample, stacked by category and scaled to proportions
    #
    # :param tbl: Pivoted summary tibble with columns "samples", "counts",
    #             and "category"
    # :return plot: ggplot object
    p <- ggplot(tbl, aes(fill = category, y = counts, x = samples)) +
        geom_bar(stat = "identity", position = "fill") +
        theme_slick +
        ggpubr::rotate_x_text(45) +
        labs(
            title = "Sample-wise proportions of counts",
            subtitle = "Stacked"
        )
    return(p)
}


`plot-rep-prop_by-category_stacked` <- function(tbl) {
    # Draw one bar per category, stacked by sample and scaled to proportions
    #
    # :param tbl: Pivoted summary tibble with columns "samples", "counts",
    #             and "category"
    # :return plot: ggplot object
    p <- ggplot(tbl, aes(fill = samples, y = counts, x = category)) +
        geom_bar(stat = "identity", position = "fill") +
        theme_slick +
        ggpubr::rotate_x_text(45) +
        labs(
            title = "Category-wise proportions of counts",
            subtitle = "Stacked"
        )
    return(p)
}


`plot-rep-prop_by-category_dodged` <- function(tbl) {
    # Draw side-by-side bars of counts / sum(counts), grouped by category
    # and filled by sample
    #
    # :param tbl: Pivoted summary tibble with columns "samples", "counts",
    #             and "category"
    # :return plot: ggplot object
    #
    # NOTE(review): sum(counts) appears to span every row of tbl, so bar
    #               heights would be fractions of the grand total rather
    #               than of each category; confirm that this is intended
    p <- ggplot(
        tbl,
        aes(fill = samples, y = counts / sum(counts), x = category)
    ) +
        geom_bar(stat = "identity", position = "dodge") +
        theme_slick +
        ggpubr::rotate_x_text(45) +
        labs(
            title = "Category-wise proportions of counts",
            subtitle = "Dodged"
        )
    return(p)
}


`plot-rep-prop_by-sample_dodged` <- function(tbl) {
    # Draw side-by-side bars of counts / sum(counts), grouped by sample and
    # filled by category
    #
    # :param tbl: Pivoted summary tibble with columns "samples", "counts",
    #             and "category"
    # :return plot: ggplot object
    #
    # NOTE(review): sum(counts) appears to span every row of tbl, so bar
    #               heights would be fractions of the grand total rather
    #               than of each sample; confirm that this is intended
    p <- ggplot(
        tbl,
        aes(fill = category, y = counts / sum(counts), x = samples)
    ) +
        geom_bar(stat = "identity", position = "dodge") +
        theme_slick +
        ggpubr::rotate_x_text(45) +
        labs(
            title = "Sample-wise proportions of counts",
            subtitle = "Dodged"
        )
    return(p)
}
```
</details>
<br />

### Load "comprehensive" `gtf` files
#### Code
<details>
<summary><i>Code: Load "comprehensive" `gtf` files</i></summary>

```{r TBD 1, results='hide', message=FALSE, warning=FALSE}
#!/usr/bin/env Rscript

p_gtf <- "./outfiles_gtf-gff3/comprehensive/S288C_reference_genome_R64-1-1_20110203"
f_FS <- "processed_features_sense.gtf"
f_FIS <- "processed_features-intergenic_sense.gtf"
f_FIAS <- "processed_features-intergenic_sense-antisense.gtf"

#  Import one gtf from p_gtf as a tibble sorted by seqnames and start, without
#+ columns "score" and "phase", and with column "type.1" renamed "category"
import_comprehensive_gtf <- function(f_gtf) {
    tbl <- tibble::as_tibble(
        rtracklayer::import(paste(p_gtf, f_gtf, sep = "/"))
    )
    tbl %>%
        dplyr::arrange(seqnames, start) %>%
        dplyr::select(-c(score, phase)) %>%
        dplyr::rename(category = type.1)
}

comp_FS <- import_comprehensive_gtf(f_FS)
comp_FIS <- import_comprehensive_gtf(f_FIS)
comp_FIAS <- import_comprehensive_gtf(f_FIAS)

rm(p_gtf, f_FS, f_FIS, f_FIAS, import_comprehensive_gtf)
```
</details>
<br />

### Read in counts matrix information for relevant datasets ("Ovation" samples)
#### Code
<details>
<summary><i>Code: Read in counts matrix information for relevant datasets ("Ovation" samples)</i></summary>

```{r}
#  Directory that holds the htseq-count counts matrices
p_matrices <- "./outfiles_htseq-count/comprehensive/S288C_reference_genome_R64-1-1_20110203/UT_prim_UMI"

#  Components of the counts-matrix file names; a file name is assembled as
#+ <part 1>.<part 2>.<part 3>.<part 4>.<part 5>
f_part_1 <- "all-bams"
f_part_2 <- "hc-strd-eq"
f_part_3 <- c("union", "intersection-nonempty")
f_part_4 <- paste0("nonunique-", c("none", "fraction", "all"))
f_part_5 <- sprintf(
    "processed_features%s.tsv",
    c("_sense", "-intergenic_sense", "-intergenic_sense-antisense")
)
```
</details>
<br />

### Read in, process metrics from `work_calculate_uni-multimappers-etc.md`
#### Code
<details>
<summary><i>Code: Read in, process metrics from `work_calculate_uni-multimappers-etc.md`</i></summary>

```{r}
#!/usr/bin/env Rscript

#  Read in metrics table
p_uni_multi_etc <- "outfiles_htseq-count"
f_uni_multi_etc <- "calculate_uni-multimappers-etc.UT_prim_UMI.txt"
uni_multi_etc <- readr::read_tsv(
    paste(p_uni_multi_etc, f_uni_multi_etc, sep = "/"),
    show_col_types = FALSE
)

rm(f_uni_multi_etc, p_uni_multi_etc)

#  Isolate relevant dataset metrics ("Ovation" samples)
relevant <- c(
    "WT_G1_day1_ovn_N_aux-F_tc-F_rep1_tech1",
    "WT_G1_day1_ovn_N_aux-F_tc-F_rep2_tech1",
    "WT_G1_day1_ovn_SS_aux-F_tc-F_rep1_tech1",
    "WT_G1_day1_ovn_SS_aux-F_tc-F_rep2_tech1",
    "WT_Q_day7_ovn_N_aux-F_tc-F_rep1_tech1",
    "WT_Q_day7_ovn_N_aux-F_tc-F_rep2_tech1",
    "WT_Q_day7_ovn_SS_aux-F_tc-F_rep1_tech1",
    "WT_Q_day7_ovn_SS_aux-F_tc-F_rep2_tech1"
)
uni_multi_etc_rel <- uni_multi_etc %>% dplyr::filter(sample %in% relevant)

#  Abbreviate the sample names; "_day._ovn" is a plain regular expression
#+ (the previous "*_day._ovn*" was written like a glob: its leading "*" had
#+ nothing to repeat and its trailing "*" quantified the final "n")
uni_multi_etc_rel$sample <- uni_multi_etc_rel$sample %>%
    gsub("_day._ovn", "", .) %>%
    gsub("aux-F_tc-F_", "", .) %>%
    gsub("_tech1", "", .) %>%
    gsub("WT_", "", .)

#  Transpose the dataframe; t() returns a character matrix, so all values are
#+ strings at this point
uni_multi_etc_rel <- uni_multi_etc_rel %>%
    t() %>%
    tibble::as_tibble(rownames = "category")

#  Convert row 1 to column names; the row is flattened to a plain character
#+ vector before it is assigned
colnames(uni_multi_etc_rel) <- as.character(unlist(uni_multi_etc_rel[1, ]))
uni_multi_etc_rel <- uni_multi_etc_rel[-1, ]
colnames(uni_multi_etc_rel)[1] <- "gene_id"

#  Convert number elements from type character to type numeric; across()
#+ converts column by column, so the result keeps its shape for any number
#+ of rows
uni_multi_etc_rel <- uni_multi_etc_rel %>%
    dplyr::mutate(dplyr::across(-gene_id, as.numeric))

#  Clean up
rm(relevant)
```
</details>
<br />

### Run the analysis pipeline
#### Code
<details>
<summary><i>Code: Run the analysis pipeline</i></summary>

```{r}
#  Run the analysis pipeline
h <- 0
matrices <- list()
# for(i in f_part_3) {
#     for(j in f_part_4) {
#         for(k in f_part_5) {
            i <- f_part_3[1]  # print(i)
            j <- f_part_4[1]  # print(j)
            k <- f_part_5[1]  # print(k)
            
            #  Run counter, zero-padded to two digits (always a string)
            h <- h + 1
            h_n <- sprintf("%02d", h)

            #  Abbreviate each file-name part; an unrecognized value stops
            #+ the run instead of leaving an abbreviation unset or stale
            i_r <- switch(
                i,
                "union" = "u",
                "intersection-nonempty" = "i",
                stop("Unrecognized value for i: ", i)
            )
            
            j_r <- switch(
                j,
                "nonunique-none" = "n",
                "nonunique-fraction" = "f",
                "nonunique-all" = "a",
                stop("Unrecognized value for j: ", j)
            )
            
            k_r <- switch(
                k,
                "processed_features_sense.tsv" = "fs",
                "processed_features-intergenic_sense.tsv" = "fis",
                "processed_features-intergenic_sense-antisense.tsv" = "fias",
                stop("Unrecognized value for k: ", k)
            )
            
            #  Select the gtf-derived dataframe that matches the counts matrix
            gtf <- switch(
                k_r,
                "fs" = comp_FS,
                "fis" = comp_FIS,
                "fias" = comp_FIAS
            )
            
            #  ----------------------------------------------------------------
            #  Name the list element
            name <- paste(h_n, i_r, j_r, k_r, sep = ".")
            
            #  ----------------------------------------------------------------
            #  Specify the counts matrix file
            file <- paste(
                paste(p_matrices, f_part_1, sep = "/"),
                f_part_2, i, j, k,
                sep = "."
            )
            
            #  ----------------------------------------------------------------
            #  Load the counts matrix
            counts <- read_in_counts_matrix(file)
            
            #  ----------------------------------------------------------------
            #  Isolate relevant WT G1 and Q data: columns with "ovn" in their
            #+ names
            relevant <- stringr::str_subset(colnames(counts), "ovn")
            
            #  ----------------------------------------------------------------
            #  htseq-count counts: rows whose gene_id does not begin with "__"
            counts_relevant <- dplyr::filter(
                counts[, c("gene_id", relevant)],
                !stringr::str_detect(gene_id, "^__")
            )
            colnames(counts_relevant) <- clean_column_names(counts_relevant)
            
            #  ----------------------------------------------------------------
            #  htseq-count "summary values": rows whose gene_id begins with
            #+ "__"
            summary_values <- dplyr::filter(
                counts[, c("gene_id", relevant)],
                stringr::str_detect(gene_id, "^__")
            )
            colnames(summary_values) <- clean_column_names(summary_values)
            
            #  ----------------------------------------------------------------
            #  Join relevant sample data with "positional" (etc.) information
            joined <- dplyr::full_join(counts_relevant, gtf, by = "gene_id")
            
            #  ----------------------------------------------------------------
            #  Create a metadata matrix for WT G1 and Q datasets: each sample
            #+ name is split on "_" into state, transcription, and replicate
            metadata <- colnames(counts_relevant)[-1] %>%
                stringr::str_split("_") %>%
                as.data.frame() %>%
                t() %>%
                as.data.frame() %>%
                dplyr::rename(state = V1, transcription = V2, replicate = V3) %>% 
                dplyr::mutate(
                    state = factor(state, levels = c("G1", "Q")),
                    transcription = factor(
                        transcription, levels = c("SS", "N")
                    ),
                    replicate = factor(replicate, levels = c("rep1", "rep2")),
                    #  Zero-based codes that follow the level order set above:
                    #+ G1 = 0, Q = 1; SS = 0, N = 1; rep1 = 0, rep2 = 1
                    no_state = factor(as.integer(state) - 1L),
                    no_transcription = factor(
                        as.integer(transcription) - 1L
                    ),
                    no_replicate = factor(as.integer(replicate) - 1L)
                )
            rownames(metadata) <- colnames(counts_relevant)[-1]
            
            #  ----------------------------------------------------------------
            #  Record positional information in a GRanges object, one range per
            #+ row of `joined`
            #TODO pos_info$thorough
            pos_info <- GenomicRanges::GRanges(
                seqnames = joined$seqnames,
                ranges = IRanges::IRanges(
                    start = joined$start, end = joined$end
                ),
                strand = joined$strand,
                length = joined$width,
                gene_id = joined$gene_id,
                transcript_id = joined$transcript_id,
                category = joined$category,
                orf_classification = joined$orf_classification,
                source_id = joined$source_id
            )
            
            #  ----------------------------------------------------------------
            #  Test replicate consistency via PCA of non-normalized counts
            #+ (sample columns only; gene_id is passed separately)
            pca_raw <- run_PCA_pipeline(
                counts = counts_relevant[, -1],
                metadata = metadata,
                gene_id = counts_relevant$gene_id,
                transformed = FALSE
            )
            
            #  ----------------------------------------------------------------
            #  Test replicate consistency via PCA of rlog-transformed counts
            #+ (with no model, i.e., vary on intercept)
            #  Create DESeq2 "dds" object from integer-coerced counts
            counts_relevant_dds <- DESeq2::DESeqDataSetFromMatrix(
                countData = sapply(counts_relevant[, -1], as.integer),
                colData = metadata,
                design = ~ 1,
                rowRanges = pos_info
            )
            
            #  Perform DESeq2 rlog transformation; since the model design is
            #+ "~ 1", it makes no difference if rlog blind is TRUE or FALSE;
            #+ go with the default, which is TRUE
            counts_relevant_rlog <- as.data.frame(
                SummarizedExperiment::assay(DESeq2::rlog(counts_relevant_dds))
            ) %>%
                dplyr::mutate(gene_id = counts_relevant$gene_id) %>%
                dplyr::relocate(gene_id, .before = G1_N_rep1)
            
            #  Run PCA pipeline with rlog-transformed counts
            pca_rlog <- run_PCA_pipeline(
                counts = counts_relevant_rlog[, -1],
                metadata = metadata,
                gene_id = counts_relevant_rlog$gene_id,
                transformed = TRUE
            )
            
            #  Check
            pca_raw
            pca_rlog
```

```{r}
            #HERE
            #  ----------------------------------------------------------------
            #  Hand the relevant feature counts and the two sets of summary
            #+ values to process_feature_summary_counts() (defined elsewhere
            #+ in this file) to obtain one combined table
            combined_raw_rel <- process_feature_summary_counts(
                counts = counts_relevant,
                summary_htseq = summary_values,
                summary_samtools = uni_multi_etc_rel
            )

            #  ----------------------------------------------------------------
            #  For plotting raw counts proportions, collapse the combined
            #+ table to one row per category: each sample column is summed
            #+ within the category, and the number of rows contributing to the
            #+ category is recorded as "number_of_features"
            summary <- combined_raw_rel %>%
                dplyr::group_by(category) %>%
                dplyr::summarize(
                    dplyr::across(
                        dplyr::all_of(c(
                            "G1_N_rep1", "G1_N_rep2",
                            "G1_SS_rep1", "G1_SS_rep2",
                            "Q_N_rep1", "Q_N_rep2",
                            "Q_SS_rep1", "Q_SS_rep2"
                        )),
                        sum
                    ),
                    number_of_features = dplyr::n()
                )
            
            #  Relabel categories with clearer names; the replacements are
            #+ applied one after another, each to the result of the previous
            #+ one, and every category matching a pattern is overwritten
            #+ wholesale with the new label
            summary <- summary %>%
                dplyr::mutate(
                    category = replace(
                        category,
                        stringr::str_detect(
                            category, "__alignment_not_unique_I-XVI"
                        ),
                        "multimapper (excluded)"
                    ),
                    category = replace(
                        category,
                        stringr::str_detect(category, "__ambiguous"),
                        "ambiguous"
                    ),
                    category = replace(
                        category,
                        stringr::str_detect(category, "__no_feature_I-XVI"),
                        "no feature"
                    ),
                    category = replace(
                        category,
                        stringr::str_detect(category, "PG"),
                        "pseudogene"
                    )
                )
            
            #  Exclude the categories "multimapper (excluded)", "rRNA"
            #+ (requested by AG, TT, 2023-0509), and "tRNA" (requested by AG,
            #+ TT, 2023-0509)
            #  Note: the multimapper pattern is a regular expression, not a
            #+ glob, so no trailing "*" is used (in a regex, "*" would only
            #+ make the final "r" optional); "^multimapper" drops every
            #+ category that starts with "multimapper"
            #  All conditions in a single filter() call must hold for a row to
            #+ be kept
            summary <- summary %>%
                dplyr::filter(
                    !stringr::str_detect(category, "^multimapper"),
                    !stringr::str_detect(category, "^rRNA$"),
                    !stringr::str_detect(category, "^tRNA$")
                )
            
            #  Blank out "number_of_features" wherever it equals 1, which is
            #+ how the summary-value categories are recognized here
            #  NOTE(review): a genuine feature category that contains exactly
            #+ one feature would be set to NA as well; confirm none exists
            summary <- summary %>%
                dplyr::mutate(
                    number_of_features = dplyr::na_if(number_of_features, 1L)
                )
            
            #  Create a version of tibble "summary" that combines categories
            #+ "ARS", "centromere", "ncRNA", "pseudogene", and "telomere" into
            #+ a single new category, "other" (requested by AG, TT, 2023-0509)
            #  The rows of the five categories are summed column by column
            #+ (all numeric columns, including "number_of_features") into one
            #+ row labeled "other", which is appended below the untouched rows
            cat_other <- c(
                "ARS", "centromere", "ncRNA", "pseudogene", "telomere"
            )
            summary_other <- dplyr::bind_rows(
                summary %>%
                    dplyr::filter(!category %in% cat_other),
                summary %>%
                    dplyr::filter(category %in% cat_other) %>%
                    dplyr::summarize(
                        dplyr::across(where(is.numeric), sum),
                        category = "other"
                    )
            )
            rm(cat_other)
            
            #  Sort tibbles "summary" and "summary_other" such that category
            #+ "gene" is the first row and the remaining rows are ordered
            #+ alphabetically (case-insensitively)
            #  Sorting first on 'category != "gene"' puts "gene" (FALSE) ahead
            #+ of everything else (TRUE) regardless of how many categories
            #+ there are or where "gene" falls alphabetically, so no row
            #+ positions need to be hard-coded
            #  "category" is then converted to a factor whose level order is
            #+ the row order, so that later plots keep this ordering
            summary <- summary %>%
                dplyr::arrange(category != "gene", tolower(category)) %>%
                dplyr::mutate(
                    category = factor(category, levels = category)
                )
            summary_other <- summary_other %>%
                dplyr::arrange(category != "gene", tolower(category)) %>%
                dplyr::mutate(
                    category = factor(category, levels = category)
                )
            
            #  Build proportion versions of "summary" and "summary_other" in
            #+ which every count is divided by the grand total over all eight
            #+ sample columns (i.e., inter-sample proportions); "category" and
            #+ "number_of_features" are carried over unchanged, with
            #+ "category" placed first
            prop_summary <- (summary[, 2:9] / sum(summary[, 2:9])) %>%
                dplyr::mutate(
                    category = summary$category,
                    number_of_features = summary$number_of_features
                ) %>%
                dplyr::select(category, dplyr::everything())

            prop_summary_other <-
                (summary_other[, 2:9] / sum(summary_other[, 2:9])) %>%
                dplyr::mutate(
                    category = summary_other$category,
                    number_of_features = summary_other$number_of_features
                ) %>%
                dplyr::select(category, dplyr::everything())
            
            #HERE
            #  Build per-column (i.e., sample-wise or intra-sample) proportion
            #+ versions of "summary" and "summary_other": every sample column
            #+ is divided by its own column total, so each sample column sums
            #+ to 1; "category" and "number_of_features" are carried over
            #+ unchanged, with "category" placed first
            # (summary[, 2] / sum(summary[, 2])) * 100
            columnwise_prop_summary <- sweep(
                x = summary[, 2:9],
                MARGIN = 2,
                STATS = colSums(summary[, 2:9]),
                FUN = "/"
            ) %>%
                dplyr::mutate(
                    category = summary$category,
                    number_of_features = summary$number_of_features
                ) %>%
                dplyr::select(category, dplyr::everything())

            columnwise_prop_summary_other <- sweep(
                x = summary_other[, 2:9],
                MARGIN = 2,
                STATS = colSums(summary_other[, 2:9]),
                FUN = "/"
            ) %>%
                dplyr::mutate(
                    category = summary_other$category,
                    number_of_features = summary_other$number_of_features
                ) %>%
                dplyr::select(category, dplyr::everything())

            #  Check all six tables
            summary
            summary_other
            prop_summary
            prop_summary_other
            columnwise_prop_summary
            columnwise_prop_summary_other
```

```{r}
            #  ----------------------------------------------------------------
            #  Plot per-replicate counts proportions
            #  Sample columns to pivot on (also used when testing and plotting
            #+ per-sample proportions further below)
            col_piv <- c(
                "G1_N_rep1", "G1_N_rep2",
                "G1_SS_rep1", "G1_SS_rep2",
                "Q_N_rep1", "Q_N_rep2",
                "Q_SS_rep1", "Q_SS_rep2"
            )

            #  Build the stacked plot once from the pivoted "summary_other"
            #+ (pivot_on_columns() and `plot-rep-prop_by-sample_stacked`() are
            #+ defined elsewhere in this file), then show it twice: at full
            #+ scale and zoomed in to the 0-0.35 range of the y axis
            plot_rep_prop <- summary_other %>%
                pivot_on_columns(col_piv) %>%
                `plot-rep-prop_by-sample_stacked`() +
                scale_fill_manual(values = viridisLite::viridis(7)) +
                labs(x = "", y = "proportion")
            plot_rep_prop
            plot_rep_prop + coord_cartesian(ylim = c(0, 0.35))
            rm(plot_rep_prop)
```

```{r}
            #HERE
            #  Pivot the column-wise proportions (category plus the eight
            #+ sample columns) with pivot_on_columns() (defined elsewhere in
            #+ this file), then strip the replicate suffix from "samples" so
            #+ that the two replicates of a sample share one factor level
            `sample-by-category_pivoted` <-
                columnwise_prop_summary_other[, 1:9] %>%
                    pivot_on_columns(col_piv) %>%
                    dplyr::mutate(
                        samples = stringr::str_remove(samples, "_rep1|_rep2") %>%
                            factor()
                    )

            #  Within each category, run two-sided, equal-variance t-tests of
            #+ the proportions ("counts") between samples, with
            #+ Benjamini-Hochberg adjustment, and label the unadjusted p
            #+ values with significance symbols:
            #+     p <= 0.0001 "****", p <= 0.001 "***", p <= 0.01 "**",
            #+     p <= 0.05 "*", otherwise "ns"
            #  The conditions are checked from most to least significant, so
            #+ arbitrarily small p values are labeled "****" (rather than
            #+ falling through to "ns"); missing p values stay NA
            `sample-by-category_stats` <-
                `sample-by-category_pivoted` %>%
                    dplyr::group_by(category) %>%
                    rstatix::t_test(
                        counts ~ samples,
                        alternative = "two.sided",
                        p.adjust.method = "BH",
                        var.equal = TRUE
                    ) %>%
                    dplyr::mutate(
                        p.signif = dplyr::case_when(
                            is.na(p) ~ NA_character_,
                            p <= 0.0001 ~ "****",
                            p <= 0.001 ~ "***",
                            p <= 0.01 ~ "**",
                            p <= 0.05 ~ "*",
                            TRUE ~ "ns"
                        )
                    ) %>%
                    dplyr::relocate(p.signif, .after = p)

            #  Check
            `sample-by-category_pivoted`
            `sample-by-category_stats`
```

```{r}
            #HERE
            #  Draw bar plots of per-sample proportions by category (bars and
            #+ error bars are computed by ggpubr via add = "mean_se") in four
            #+ variants: black or NA outline color ("w-error"/"no-error"),
            #+ each at full scale and zoomed in to 0-0.35 on the y axis
            #  NOTE(review): the NA outline color presumably is what hides
            #+ the error bars in the "no-error" variants; confirm visually
            build_prop_barplot <- function(outline, zoom) {
                bars <- ggpubr::ggbarplot(
                    `sample-by-category_pivoted`,
                    x = "samples",
                    y = "counts",
                    color = outline,
                    fill = "category",
                    palette = viridisLite::viridis(7),
                    label = FALSE,
                    add = "mean_se"
                )
                if (zoom) {
                    bars <- bars + coord_cartesian(ylim = c(0, 0.35))
                }
                bars +
                    xlab("") +
                    ylab("proportion") +
                    theme_slick
            }

            `prop-plot_w-error_full` <- build_prop_barplot("black", zoom = FALSE)
            `prop-plot_w-error_zoom` <- build_prop_barplot("black", zoom = TRUE)
            `prop-plot_no-error_full` <- build_prop_barplot(NA, zoom = FALSE)
            `prop-plot_no-error_zoom` <- build_prop_barplot(NA, zoom = TRUE)

            #  The builder is only needed here; do not leave it behind
            rm(build_prop_barplot)

            #  Check
            `prop-plot_w-error_full`
            `prop-plot_w-error_zoom`
            `prop-plot_no-error_full`
            `prop-plot_no-error_zoom`
```

```{r}
            #  ------------------------
            #  ...for summary_other, prop_summary_other
            #  Row-wise (per-category) mean of the two replicate columns of
            #+ each sample in "summary_other" (columns 2-3, 4-5, 6-7, 8-9)
            mean_G1_N <- apply(X = summary_other[, 2:3], MARGIN = 1, FUN = mean)
            mean_G1_SS <- apply(X = summary_other[, 4:5], MARGIN = 1, FUN = mean)
            mean_Q_N <- apply(X = summary_other[, 6:7], MARGIN = 1, FUN = mean)
            mean_Q_SS <- apply(X = summary_other[, 8:9], MARGIN = 1, FUN = mean)

            #  Row-wise standard error of the mean (sd divided by the square
            #+ root of the number of replicates) for the same column pairs
            SEM_G1_N <- apply(
                X = summary_other[, 2:3], MARGIN = 1,
                FUN = function(reps) sd(reps) / sqrt(length(reps))
            )
            SEM_G1_SS <- apply(
                X = summary_other[, 4:5], MARGIN = 1,
                FUN = function(reps) sd(reps) / sqrt(length(reps))
            )
            SEM_Q_N <- apply(
                X = summary_other[, 6:7], MARGIN = 1,
                FUN = function(reps) sd(reps) / sqrt(length(reps))
            )
            SEM_Q_SS <- apply(
                X = summary_other[, 8:9], MARGIN = 1,
                FUN = function(reps) sd(reps) / sqrt(length(reps))
            )

        # mean_G1_N
        # SEM_G1_N
        # mean_G1_SS
        # SEM_G1_SS
        # mean_Q_N
        # SEM_Q_N
        # mean_Q_SS
        # SEM_Q_SS
        
        #  Build a category-by-sample contingency table from the rounded
        #+ per-sample mean counts; columns are named directly (rather than
        #+ relying on automatic name repair followed by a rename), and
        #+ categories become the row names
        contingency <- tibble::tibble(
            category = summary_other$category,
            G1_N = round(mean_G1_N),
            G1_SS = round(mean_G1_SS),
            Q_N = round(mean_Q_N),
            Q_SS = round(mean_Q_SS)
        ) %>%
            tibble::column_to_rownames("category")

        #  Perform a chi-square test on the contingency table, then express
        #+ each cell's squared residual as a percentage of the test
        #+ statistic (i.e., each cell's contribution to the statistic)
        chisq <- chisq.test(contingency)
        chisq_contrib <- 100 * (chisq$residuals^2 / chisq$statistic)

        #  Visualize the contribution, saving the results
        #+ - sthda.com/english/wiki/chi-square-test-of-independence-in-r
        #+ - github.com/taiyun/corrplot/issues/108
        #
        #  prepare_list_corrplot() packages a matrix for deferred plotting
        #+ with corrplot.
        #+
        #+ :param df: matrix of values to draw (passed to corrplot::corrplot
        #+            with is.cor = FALSE)
        #+ :param p: optional value to store alongside the plot (e.g., a p
        #+           value); defaults to NULL
        #+ :return: list with elements "plot" (a function taking no arguments
        #+          that draws the corrplot when called) and "p" (the value
        #+          supplied via argument p)
        prepare_list_corrplot <- function(df, p = NULL) {
            #  Evaluate the arguments now: without force(), "df" would stay
            #+ an unevaluated promise until plot() is first called, by which
            #+ time the caller's variable may hold different data (e.g.,
            #+ from a later loop iteration)
            force(df)
            force(p)

            plot <- function() {
                #  Note: variable "df" is bound from within the closure
                corrplot::corrplot(
                    df,
                    method = "circle",
                    is.cor = FALSE,
                    col = grDevices::colorRampPalette(
                        c("#FFFFFF", "#7835AC")
                    )(200),
                    tl.col = "#000000",
                    cl.align.text = "l",
                    addgrid.col = "#FFFFFF"
                )
            }
            list(plot = plot, p = p)
        }
        
        #  Package the chi-square contribution matrix for plotting; nothing
        #+ is drawn here; uncomment the call below to render the corrplot
        list_corrplot <- prepare_list_corrplot(df = chisq_contrib)
        # list_corrplot$plot()
```

```{r}
            #  ----------------------------------------------------------------
            #  Save data to list element: store this iteration's inputs and
            #+ PCA results under matrices[[name]], with numbered keys that
            #+ keep the elements in processing order
            #  NOTE(review): "matrices", "name", "file", "gtf", and "counts"
            #+ are defined outside this section; the category summaries,
            #+ t-test results, proportion plots, and chi-square objects
            #+ computed above are not stored here (confirm whether they
            #+ should be)
            matrices[[name]][["01_file"]] <- file
            matrices[[name]][["02_gtf"]] <- gtf
            matrices[[name]][["03_counts"]] <- counts
            matrices[[name]][["04_counts_relevant"]] <- counts_relevant
            matrices[[name]][["05_summary_values"]] <- summary_values
            matrices[[name]][["06_metadata"]] <- metadata
            matrices[[name]][["07_pos_info"]] <- pos_info
            matrices[[name]][["08_pca_raw"]] <- pca_raw
            matrices[[name]][["09_pca_rlog"]] <- pca_rlog
            
            # matrices[["01.u.n.fs"]][["09_pca_rlog"]][["10_p_images"]]
#  The three lines below are the commented-out closers of the enclosing loops
#         }
#     }
# }
#  Remove "gtf" and what appear to be the loop variables
#  NOTE(review): with the loops commented out, any of these that were never
#+ created will make rm() emit a warning; confirm
rm(gtf, h, h_n, i, i_r, j, k)

#  Check: list the names stored in "matrices" and the dot-joined name assembled
#+ from the five f_part_* pieces (defined outside this section)
matrices %>% names()
paste(f_part_1, f_part_2, f_part_3, f_part_4, f_part_5, sep = ".")


```
</details>
<br />
<br />