#  results/2023-0215/rough-draft_estimate-RNA-degredation.R

#!/usr/bin/env Rscript

#  rough-draft_estimate-RNA-degradation.R
#  KA


#  Goal: Estimate RNA degradation using a model based on Vock and Simon, RNA
#+ 2023
#+ 
#+ From the paper (p. 4, partial ¶ 1):
#+ "If RNA levels are assumed to be at steady-state, meaning that the total RNA
#+ concentration during the experiment is the ratio of k_syn to k_deg, the
#+ fraction new (denoted θ) is θ = 1 - e^(-k_deg * t_label)"
#+ 
#+ (Hereafter, denoting "k_deg" as "k" and "t_label" as "t".)
#+ 
#+ Rearrange θ = 1 - e^(-kt) to solve for k:
#+     1. θ = 1 - e^(-kt)
#+     2. θ - 1 = -e^(-kt)
#+     3. 1 - θ = e^(-kt)
#+     4. ln(1 - θ) = ln(e^(-kt))
#+     5. ln(1 - θ) = -kt
#+     6. ln(1 - θ)/t = -k
#+     7. k = -ln(1 - θ)/t
#+ 
#+ ...where
#+     - θ is the fraction of new transcription; we estimate this value by
#+       dividing feature_N by the input (feature_SS)
#+     - k is RNA degradation in absolute units of inverse time, since θ is
#+       dimensionless
#+     - t is labeling time, which we set to 6 minutes per Alison's benchwork
#+ 
#+ Then, draw distributions for the samples G1 rep1, G1 rep2, Q rep1, Q rep2


#  Initialize arguments =======================================================
#  Hard-coded stand-ins for command-line arguments until a parser is written.
#+ Each value is checked against a list of accepted values later in the script
#+     - type: which feature set/counts matrix to analyze
#+     - samples: which group of sample columns to retain
#+     - filtering: which minimum-count filter to apply to features
#TODO Parser
type <- "mRNA"  #ARGUMENT
samples <- "Ovation"  #ARGUMENT
filtering <- "min-4-cts"  #ARGUMENT


#  Load libraries, set options ================================================
# suppressMessages(library(DESeq2))
suppressMessages(library(tidyverse))

#  Set both session options in one call: a large "scipen" penalty so numbers
#+ print in fixed rather than scientific notation, and "ggrepel.max.overlaps"
#+ set to Inf
options(scipen = 999, ggrepel.max.overlaps = Inf)


#  Initialize functions and ggplot2 themes ====================================
#  Infix complement of `%in%`: TRUE for each element of x absent from table
`%notin%` <- function(x, table) !(x %in% table)


filter_process_counts_matrix <- function(
    counts_matrix,
    named_character_vector
) {
    # Subset, reorder, and rename the sample columns of a counts matrix
    #
    # The first column of counts_matrix is always kept as is. The remaining
    # columns are restricted to those listed in named_character_vector, put in
    # the order of that vector, and renamed to the vector's names.
    #
    # :param counts_matrix: counts matrix from htseq-count (data frame or
    #                       tibble) whose first column identifies the features
    # :param named_character_vector: character vector whose values are column
    #                                names present in counts_matrix and whose
    #                                names are the replacement column names
    # :return df: object of the same class as counts_matrix (a tibble when
    #             given a tibble) holding the first column followed by the
    #             selected, reordered, and renamed columns
    
    new_names <- names(named_character_vector)
    if(
        !is.character(named_character_vector) ||
            is.null(new_names) ||
            anyNA(new_names) ||
            !all(nzchar(new_names))
    ) {
        stop(
            "\"named_character_vector\" must be a character vector in which ",
            "every element has a non-empty name",
            call. = FALSE
        )
    }
    
    #  Locate every requested column up front so that a missing sample is
    #+ reported by name rather than surfacing as an NA-index error
    idx <- match(named_character_vector, colnames(counts_matrix))
    if(anyNA(idx)) {
        stop(
            "Column(s) not found in counts_matrix: ",
            paste(named_character_vector[is.na(idx)], collapse = ", "),
            call. = FALSE
        )
    }
    
    #  Column 1 plus the requested columns, in the order they were requested
    df <- counts_matrix[, c(1L, idx), drop = FALSE]
    names(df)[-1] <- new_names
    
    return(df)
}


#  Load custom ggplot2 themes -------------------------------------------------
#  theme_slick: theme_classic() with thin grid lines, axis lines, and ticks,
#+ and black axis text
theme_slick <- ggplot2::theme_classic() +
    ggplot2::theme(
        panel.grid.major = ggplot2::element_line(linewidth = 0.4),
        panel.grid.minor = ggplot2::element_line(linewidth = 0.2),
        axis.line = ggplot2::element_line(linewidth = 0.2),
        axis.ticks = ggplot2::element_line(linewidth = 0.4),
        axis.text = ggplot2::element_text(color = "black"),
        axis.title.x = ggplot2::element_text(),
        axis.title.y = ggplot2::element_text(),
        plot.title = ggplot2::element_text(),
        text = ggplot2::element_text(family = "")
    )


#  theme_AG: theme_classic() with heavy grid lines and large, bold axis text
#+ and titles
theme_AG <- ggplot2::theme_classic() +
    ggplot2::theme(
        panel.grid.major = ggplot2::element_line(linewidth = 3),
        panel.grid.minor = ggplot2::element_line(linewidth = 2),
        axis.line = ggplot2::element_line(linewidth = 0.5),
        axis.ticks = ggplot2::element_line(linewidth = 1.0),
        axis.text = ggplot2::element_text(
            color = "black", size = 20, face = "bold"
        ),
        axis.title.x = ggplot2::element_text(size = 25, face = "bold"),
        axis.title.y = ggplot2::element_text(size = 25, face = "bold"),
        plot.title = ggplot2::element_text(size = 20),
        text = ggplot2::element_text(family = "")
    )

#  theme_AG_boxed: theme_AG with zero-width axis lines and a black border
#+ around the panel
theme_AG_boxed <- theme_AG +
    ggplot2::theme(
        axis.line = ggplot2::element_line(linewidth = 0),
        panel.border = ggplot2::element_rect(
            linewidth = 2, color = "black", fill = NA
        )
    )

#  Legend-free variants of the three themes above
theme_slick_no_legend <- theme_slick +
    ggplot2::theme(legend.position = "none")

theme_AG_no_legend <- theme_AG +
    ggplot2::theme(legend.position = "none")

theme_AG_boxed_no_legend <- theme_AG_boxed +
    ggplot2::theme(legend.position = "none")


#  Get situated, load counts matrix ===========================================
#  Choose the base directory according to whether the current working
#+ directory contains "kalavattam"
p_base <- if(stringr::str_detect(getwd(), "kalavattam")) {
    "/Users/kalavattam/Dropbox/FHCC"
} else {
    "/Users/kalavatt/projects-etc"
}
p_exp <- "2022-2023_RRP6-NAB3/results/2023-0215"

#  Set work dir to the experiment directory; relative paths used later in the
#+ script are resolved against it
setwd(file.path(p_base, p_exp))
# getwd()

#  Determine mRNA counts matrix to work with, then load it
#  Check on "type" option: it must be a single, non-missing string drawn from
#+ the accepted values listed below. (The previous check let a vector of
#+ length > 1 through, and its error message listed "Trinity-Q"/"Trinity-G1"
#+ twice in place of the "-unique" variants.)
if(
    !is.character(type) ||
        length(type) != 1L ||
        is.na(type) ||
        !(type %in% c(
            "mRNA", "pa-ncRNA", "Trinity-Q", "Trinity-G1", "Trinity-Q-unique",
            "Trinity-G1-unique", "representation"
        ))
) {
    stop(paste(
        "Variable \"type\" must be \"mRNA\", \"pa-ncRNA\",",
        "\"Trinity-Q\", \"Trinity-G1\", \"Trinity-Q-unique\",",
        "\"Trinity-G1-unique\", or \"representation\""
    ))
}


#  Read in htseq-count counts matrix ------------------------------------------
#  NOTE(review): input paths are configured only for type "mRNA"; for any
#+ other accepted "type", p_cm, f_cm, p_gtf, and f_gtf are not assigned here
#+ and must already exist for the code below to run
if(type == "mRNA") {
    p_cm <- "outfiles_htseq-count/already/combined-SC-KL-20S/UT_prim_UMI"
    f_cm <- "all-samples.combined-SC-KL-20S.hc-strd-eq.mRNA.tsv"
    
    p_gtf <- "infiles_gtf-gff3/already"
    f_gtf <- "combined_SC_KL_20S.gff3"
}

#  Check that counts matrix exists
run <- FALSE
if(base::isTRUE(run)) {
    file.exists(file.path(p_base, p_exp, p_cm, f_cm))  # [1] TRUE
}

#  Load counts matrix
t_cm <- readr::read_tsv(
    file.path(p_base, p_exp, p_cm, f_cm), show_col_types = FALSE
)
if(type == "mRNA") {
    #  Slice out the first row, which contains file info
    t_cm <- dplyr::slice(t_cm, -1)
}

#  "Clean up" counts matrix column names and "features" elements
#  The prefixes/suffixes being stripped are literal text, so match them with
#+ fixed = TRUE; as regular expressions, each "." would match any character
if(base::isTRUE(type == "mRNA")) {
    colnames(t_cm) <- gsub(
        ".UT_prim_UMI.hc-strd-eq.tsv", "", colnames(t_cm), fixed = TRUE
    )
} else {
    colnames(t_cm)[1] <- "features"
    colnames(t_cm) <- colnames(t_cm) %>%
        gsub("bams_renamed/UT_prim_UMI/", "", ., fixed = TRUE) %>%
        gsub(".UT_prim_UMI.bam", "", ., fixed = TRUE)
}

#  For mRNA, strip the leading "transcript:" and the "_mRNA" tag from the
#+ feature IDs (the same clean-up is applied to the gff3 "ID" field)
if(type == "mRNA") {
    t_cm <- t_cm %>%
        dplyr::mutate(
            features = features %>%
                gsub("^transcript\\:", "", .) %>%
                gsub("_mRNA", "", .)
        )
}

#  Clean up, correct, and abbreviate sample names
#  col_cor is a named character vector built with setNames(): the first c()
#+ holds the sample column names to look up in the counts matrix, and the
#+ second c() holds the abbreviated labels those columns are renamed to. The
#+ two c() calls are paired by position, so entry i of the first is renamed to
#+ entry i of the second.
#+ 
#+ Flags in the trailing comments ("FIXME" on the original names, "DONE" on
#+ the replacement labels):
#+     - * the replicate label is swapped in the replacement (rep1 <-> rep2)
#+     - ‡ the technical-replicate label is changed (tech1 -> tech2)
#+     - † the day7 "tcn" rep2 entries are relabeled with a "WTtest" prefix,
#+       while the day8 rep2 entries they are flagged as duplicating keep the
#+       plain "WT" prefix
#+     - EXCLUDE: flagged for exclusion by the author; the entries are still
#+       part of the mapping here
#+     - OK: flagged by the author as checked
col_cor <- setNames(
    c(
        "n3-d_Q_day7_tcn_N_aux-T_tc-F_rep1_tech1",
        "n3-d_Q_day7_tcn_N_aux-T_tc-F_rep2_tech1",
        "n3-d_Q_day7_tcn_N_aux-T_tc-F_rep3_tech1",       #EXCLUDE
        "n3-d_Q_day7_tcn_SS_aux-T_tc-F_rep1_tech1",
        "n3-d_Q_day7_tcn_SS_aux-T_tc-F_rep2_tech1",
        "n3-d_Q_day7_tcn_SS_aux-T_tc-F_rep3_tech1",      #EXCLUDE
        "o-d_Q_day7_tcn_N_aux-T_tc-F_rep1_tech1",
        "o-d_Q_day7_tcn_N_aux-T_tc-F_rep2_tech1",
        "o-d_Q_day7_tcn_SS_aux-T_tc-F_rep1_tech1",
        "o-d_Q_day7_tcn_SS_aux-T_tc-F_rep2_tech1",
        "r1-n_Q_day8_tcn_N_aux-F_tc-F_rep1_tech1",       #FIXME* ∆ rep1 → rep2
        "r1-n_Q_day8_tcn_N_aux-F_tc-F_rep2_tech1",       #FIXME* ∆ rep2 → rep1
        "r1-n_Q_day8_tcn_SS_aux-F_tc-F_rep1_tech1",      #FIXME* ∆ rep1 → rep2
        "r1-n_Q_day8_tcn_SS_aux-F_tc-F_rep2_tech1",      #FIXME* ∆ rep2 → rep1
        "r6-n_DSm2_day2_tcn_SS_aux-F_tc-T_rep1_tech1",   #FIXME* ∆ rep1 → rep2
        "r6-n_DSm2_day2_tcn_SS_aux-F_tc-T_rep2_tech1",   #FIXME* ∆ rep2 → rep1
        "r6-n_DSp24_day3_tcn_SS_aux-F_tc-T_rep1_tech1",  #FIXME* ∆ rep1 → rep2
        "r6-n_DSp24_day3_tcn_SS_aux-F_tc-T_rep2_tech1",  #FIXME* ∆ rep2 → rep1
        "r6-n_DSp2_day2_tcn_SS_aux-F_tc-T_rep1_tech1",   #FIXME* ∆ rep1 → rep2
        "r6-n_DSp2_day2_tcn_SS_aux-F_tc-T_rep2_tech1",   #FIXME* ∆ rep2 → rep1
        "r6-n_DSp48_day4_tcn_SS_aux-F_tc-T_rep1_tech1",  #FIXME* ∆ rep1 → rep2
        "r6-n_DSp48_day4_tcn_SS_aux-F_tc-T_rep2_tech1",  #FIXME* ∆ rep2 → rep1  #FIXME‡ ∆ tech1 → tech2
        "r6-n_G1_day1_tcn_SS_aux-F_tc-F_rep1_tech1",     #FIXME* ∆ rep1 → rep2  #FIXME‡ ∆ tech1 → tech2
        "r6-n_G1_day1_tcn_SS_aux-F_tc-F_rep2_tech1",     #FIXME* ∆ rep2 → rep1  #FIXME‡ ∆ tech1 → tech2
        "r6-n_Q_day8_tcn_N_aux-F_tc-F_rep1_tech1",       #FIXME* ∆ rep1 → rep2
        "r6-n_Q_day8_tcn_N_aux-F_tc-F_rep2_tech1",       #FIXME* ∆ rep2 → rep1
        "r6-n_Q_day8_tcn_SS_aux-F_tc-F_rep1_tech1",      #FIXME* ∆ rep1 → rep2  #OK
        "r6-n_Q_day8_tcn_SS_aux-F_tc-F_rep1_tech2",      #FIXME* ∆ rep1 → rep2  #OK
        "r6-n_Q_day8_tcn_SS_aux-F_tc-F_rep2_tech1",      #FIXME* ∆ rep2 → rep1
        "t4-n_DSm2_day2_tcn_SS_aux-F_tc-T_rep1_tech1",
        "t4-n_DSm2_day2_tcn_SS_aux-F_tc-T_rep2_tech1",
        "t4-n_DSp24_day3_tcn_SS_aux-F_tc-T_rep1_tech1",
        "t4-n_DSp24_day3_tcn_SS_aux-F_tc-T_rep2_tech1",
        "t4-n_DSp2_day2_tcn_SS_aux-F_tc-T_rep1_tech1",
        "t4-n_DSp2_day2_tcn_SS_aux-F_tc-T_rep2_tech1",
        "t4-n_DSp48_day4_tcn_SS_aux-F_tc-T_rep1_tech1",
        "t4-n_DSp48_day4_tcn_SS_aux-F_tc-T_rep2_tech1",
        "WT_DSm2_day2_tcn_SS_aux-F_tc-T_rep1_tech1",
        "WT_DSm2_day2_tcn_SS_aux-F_tc-T_rep2_tech1",
        "WT_DSp24_day3_tcn_SS_aux-F_tc-T_rep1_tech1",
        "WT_DSp24_day3_tcn_SS_aux-F_tc-T_rep2_tech1",
        "WT_DSp2_day2_tcn_SS_aux-F_tc-T_rep1_tech1",
        "WT_DSp2_day2_tcn_SS_aux-F_tc-T_rep2_tech1",
        "WT_DSp48_day4_tcn_SS_aux-F_tc-T_rep1_tech1",    #OK
        "WT_DSp48_day4_tcn_SS_aux-F_tc-T_rep1_tech2",    #OK
        "WT_DSp48_day4_tcn_SS_aux-F_tc-T_rep2_tech1",
        "WT_G1_day1_ovn_N_aux-F_tc-F_rep1_tech1",
        "WT_G1_day1_ovn_N_aux-F_tc-F_rep2_tech1",
        "WT_G1_day1_ovn_SS_aux-F_tc-F_rep1_tech1",
        "WT_G1_day1_ovn_SS_aux-F_tc-F_rep2_tech1",
        "WT_G1_day1_tcn_SS_aux-F_tc-F_rep1_tech1",       #FIXME‡ ∆ tech1 → tech2
        "WT_G1_day1_tcn_SS_aux-F_tc-F_rep2_tech1",       #FIXME‡ ∆ tech1 → tech2
        "WT_Q_day7_ovn_N_aux-F_tc-F_rep1_tech1",
        "WT_Q_day7_ovn_N_aux-F_tc-F_rep2_tech1", 
        "WT_Q_day7_ovn_SS_aux-F_tc-F_rep1_tech1",
        "WT_Q_day7_ovn_SS_aux-F_tc-F_rep2_tech1",
        "WT_Q_day7_tcn_N_aux-F_tc-F_rep2_tech1",         #FIXME† Duplicated #1
        "WT_Q_day7_tcn_SS_aux-F_tc-F_rep2_tech1",        #FIXME† Duplicated #2
        "WT_Q_day8_tcn_N_aux-F_tc-F_rep1_tech1",
        "WT_Q_day8_tcn_N_aux-F_tc-F_rep2_tech1",         #FIXME† Duplicated #1
        "WT_Q_day8_tcn_SS_aux-F_tc-F_rep1_tech1",
        "WT_Q_day8_tcn_SS_aux-F_tc-F_rep2_tech1"         #FIXME† Duplicated #2
    ),
    c(
        "n3d_Q_N_rep1_tech1", 
        "n3d_Q_N_rep2_tech1", 
        "n3d_Q_N_rep3_tech1",       #EXCLUDE
        "n3d_Q_SS_rep1_tech1", 
        "n3d_Q_SS_rep2_tech1", 
        "n3d_Q_SS_rep3_tech1",      #EXCLUDE
        "od_Q_N_rep1_tech1", 
        "od_Q_N_rep2_tech1", 
        "od_Q_SS_rep1_tech1", 
        "od_Q_SS_rep2_tech1", 
        "r1n_Q_N_rep2_tech1",       #DONE* ∆ rep1 → rep2
        "r1n_Q_N_rep1_tech1",       #DONE* ∆ rep2 → rep1
        "r1n_Q_SS_rep2_tech1",      #DONE* ∆ rep1 → rep2
        "r1n_Q_SS_rep1_tech1",      #DONE* ∆ rep2 → rep1
        "r6n_DSm2_SS_rep2_tech1",   #DONE* ∆ rep1 → rep2
        "r6n_DSm2_SS_rep1_tech1",   #DONE* ∆ rep2 → rep1
        "r6n_DSp24_SS_rep2_tech1",  #DONE* ∆ rep1 → rep2
        "r6n_DSp24_SS_rep1_tech1",  #DONE* ∆ rep2 → rep1
        "r6n_DSp2_SS_rep2_tech1",   #DONE* ∆ rep1 → rep2
        "r6n_DSp2_SS_rep1_tech1",   #DONE* ∆ rep2 → rep1
        "r6n_DSp48_SS_rep2_tech1",  #DONE* ∆ rep1 → rep2
        "r6n_DSp48_SS_rep1_tech2",  #DONE* ∆ rep2 → rep1  #DONE‡ ∆ tech1 → tech2
        "r6n_G1_SS_rep2_tech2",     #DONE* ∆ rep1 → rep2  #DONE‡ ∆ tech1 → tech2
        "r6n_G1_SS_rep1_tech2",     #DONE* ∆ rep2 → rep1  #DONE‡ ∆ tech1 → tech2
        "r6n_Q_N_rep2_tech1",       #DONE* ∆ rep1 → rep2
        "r6n_Q_N_rep1_tech1",       #DONE* ∆ rep2 → rep1
        "r6n_Q_SS_rep2_tech1",      #DONE* ∆ rep1 → rep2  #OK
        "r6n_Q_SS_rep2_tech2",      #DONE* ∆ rep1 → rep2  #OK
        "r6n_Q_SS_rep1_tech1",      #DONE* ∆ rep2 → rep1
        "t4n_DSm2_SS_rep1_tech1", 
        "t4n_DSm2_SS_rep2_tech1", 
        "t4n_DSp24_SS_rep1_tech1", 
        "t4n_DSp24_SS_rep2_tech1", 
        "t4n_DSp2_SS_rep1_tech1", 
        "t4n_DSp2_SS_rep2_tech1", 
        "t4n_DSp48_SS_rep1_tech1", 
        "t4n_DSp48_SS_rep2_tech1", 
        "WT_DSm2_SS_rep1_tech1", 
        "WT_DSm2_SS_rep2_tech1", 
        "WT_DSp24_SS_rep1_tech1", 
        "WT_DSp24_SS_rep2_tech1", 
        "WT_DSp2_SS_rep1_tech1", 
        "WT_DSp2_SS_rep2_tech1", 
        "WT_DSp48_SS_rep1_tech1",   #OK
        "WT_DSp48_SS_rep1_tech2",   #OK
        "WT_DSp48_SS_rep2_tech1", 
        "WTovn_G1_N_rep1_tech1", 
        "WTovn_G1_N_rep2_tech1", 
        "WTovn_G1_SS_rep1_tech1", 
        "WTovn_G1_SS_rep2_tech1", 
        "WT_G1_SS_rep1_tech2",      #DONE‡ ∆ tech1 → tech2
        "WT_G1_SS_rep2_tech2",      #DONE‡ ∆ tech1 → tech2
        "WTovn_Q_N_rep1_tech1", 
        "WTovn_Q_N_rep2_tech1", 
        "WTovn_Q_SS_rep1_tech1", 
        "WTovn_Q_SS_rep2_tech1", 
        "WTtest_Q_N_rep2_tech1",    #DONE† Duplicated #1
        "WTtest_Q_SS_rep2_tech1",   #DONE† Duplicated #2
        "WT_Q_N_rep1_tech1", 
        "WT_Q_N_rep2_tech1",        #DONE† Duplicated #1
        "WT_Q_SS_rep1_tech1", 
        "WT_Q_SS_rep2_tech1"        #DONE† Duplicated #2
    )
)

#  (Optional) Keep a backup copy of the counts matrix before it is processed;
#+ disabled unless "run" is set to TRUE
run <- FALSE
if(base::isTRUE(run)) {
    t_cm.bak <- t_cm
    # t_cm <- t_cm.bak
}
#  Pass the counts matrix and the col_cor mapping to
#+ filter_process_counts_matrix(), replacing t_cm with its return value
t_cm <- filter_process_counts_matrix(t_cm, col_cor)


#  To associate features with metadata, load gff3 or gtf file -----------------
#  (Optional) Check that the annotation file exists; the path is relative, so
#+ it is resolved against the current working directory
run <- FALSE
if(base::isTRUE(run)) {
    paste(p_gtf, f_gtf, sep = "/") %>% file.exists()  # [1] TRUE
}

#  Load in, subset, and "clean up" gff3
if(type == "mRNA") {
    #  Import the annotation, convert it to a tibble, and keep only records
    #+ whose gff3 "type" field is "mRNA". NOTE: inside dplyr::filter(), "type"
    #+ refers to the column of the imported table (data masking), which
    #+ shadows the script-level variable "type"
    #  The "ID" field gets the same clean-up as the counts matrix "features"
    #+ column (strip leading "transcript:" and "_mRNA") so the two can be
    #+ joined; columns are then renamed to chr, names, and features
    t_gtf <- paste(p_gtf, f_gtf, sep = "/") %>%
        rtracklayer::import() %>%
        as.data.frame() %>%
        dplyr::as_tibble() %>%
        dplyr::filter(type == "mRNA") %>%
        dplyr::mutate(
            ID = ID %>%
                gsub("^transcript\\:", "", .) %>%
                gsub("_mRNA", "", .)
        ) %>%
        dplyr::rename(
            c(chr = seqnames, names = Name, features = ID)
        )
} else {
    #  For all other types, import the annotation as a tibble and drop the
    #+ "score" and "phase" columns
    t_gtf <- paste(p_gtf, f_gtf, sep = "/") %>%
        rtracklayer::import() %>%
        as.data.frame() %>%
        dplyr::as_tibble() %>%
        dplyr::select(-c(score, phase))
    
    if(stringr::str_detect(type, "unique")) {
        #  Load dataframe for custom annotations that do not overlap R64 or
        #+ pa-ncRNA collapsed/merged annotations:
        #+ "Trinity_putative-transcripts.2023-0620.unique"
        p_df <- "notebook/KA.2023-0620.Trinity_putative-transcripts.Q_G1"
        
        #  Pick the table matching the state named in "type". "Q" is tested
        #+ first, so a "type" containing both "Q" and "G1" would use the Q
        #+ table
        if(stringr::str_detect(type, "Q")) {
            f_df <- "Trinity_putative-transcripts.2023-0620.unique.Q.tsv"
        } else if(stringr::str_detect(type, "G1")) {
            f_df <- "Trinity_putative-transcripts.2023-0620.unique.G1.tsv"
        }            
        
        df <- readr::read_tsv(
            paste(p_df, f_df, sep = "/"), show_col_types = FALSE
        )
        
        #  Filter gtf to retain only "unique" custom annotations; the counts
        #+ matrix is filtered to the same set of features
        t_gtf <- t_gtf[t_gtf$locus_id %in% df$feature, ]
        t_cm <- t_cm[t_cm$features %in% df$feature, ]
        
        rm(p_df, f_df, df)
    }
}

#  Subset gff3 tibble to keep only relevant columns
if(type == "mRNA") {
    keep <- c(
        "chr", "start", "end",
        "width", "strand", "type",
        "features", "biotype", "names"
    )
    t_gtf <- t_gtf[, colnames(t_gtf) %in% keep]
}

#  Convert column "names" from list to character vector, and replace empty
#+ fields with NA character values. as.character() renders an empty list
#+ element as the string "character(0)", which is what the test looks for
#  NOTE(review): assumes t_gtf has a "names" column; that column is created
#+ only by the rename in the "mRNA" branch above -- confirm for other types
t_gtf$names <- ifelse(
    as.character(t_gtf$names) == "character(0)",
    NA_character_,
    as.character(t_gtf$names)
)

#  Combine "counts matrix tibble" and "gff3 tibble" ---------------------------
#  Full join on "features", so rows present in only one table are retained
t_mat <- t_gtf %>% dplyr::full_join(t_cm, by = "features")

#  Add column of "thorough" names: the entry in "names" when there is one,
#+ otherwise the feature ID; place it directly after "names"
t_mat$thorough <- ifelse(is.na(t_mat$names), t_mat$features, t_mat$names)
t_mat <- dplyr::relocate(t_mat, thorough, .after = names)

#  Sort counts columns by column names: columns 1-10 stay in place, and
#+ columns 11 onward are put in alphabetical order
tmp_counts <- t_mat[, 11:ncol(t_mat)]
tmp_counts <- tmp_counts[, order(colnames(tmp_counts))]
t_mat <- dplyr::bind_cols(t_mat[, 1:10], tmp_counts)

#  Remove unneeded variables
rm(list = ls(pattern = "tmp_"))
rm(list = c(
    "f_gtf", "f_cm", "p_base", "p_exp", "p_gtf", "p_cm", "t_gtf", "t_cm"
))


#  Order and categorize the combined counts matrix/gff3 tibble ----------------
#  Order tibble by chromosome names and feature start positions
chr_SC <- c(
    "I", "II", "III", "IV", "V", "VI", "VII", "VIII",
    "IX", "X", "XI", "XII", "XIII", "XIV", "XV", "XVI",
    "Mito"
)
chr_KL <- c("A", "B", "C", "D", "E", "F")
chr_20S <- "20S"
chr_order <- c(chr_SC, chr_KL, chr_20S)

#  Make "chr" an ordered factor whose levels follow chr_order, then sort rows
t_mat$chr <- ordered(as.factor(t_mat$chr), levels = chr_order)
t_mat <- dplyr::arrange(t_mat, chr, start)

#  Categorize chromosomes by genome of origin. Conditions are tried in order;
#+ "20S" is recognized by feature name rather than by chromosome, and rows
#+ matching none of the conditions get NA
t_mat$genome <- as.factor(dplyr::case_when(
    t_mat$chr %in% chr_SC ~ "S_cerevisiae",
    t_mat$chr %in% chr_KL ~ "K_lactis",
    # t_mat$chr %in% chr_20S ~ "20S",
    t_mat$features %in% chr_20S ~ "20S",
    TRUE ~ NA_character_
))
t_mat <- dplyr::relocate(t_mat, genome, .before = chr)

#  (Optional) Give feature "20S" certain placeholder values
run <- FALSE
if(base::isTRUE(run)) {
    t_mat$end[which(t_mat$genome == "20S")] <- 0
    t_mat$start[which(t_mat$genome == "20S")] <- 0
    t_mat$chr[which(t_mat$genome == "20S")] <- "20S"
}

#  Remove unneeded variables
rm(list = c("chr_20S", "chr_KL", "chr_SC", "chr_order"))


#  For analyses of non-"unique" feat., extract htseq-count summary metrics ----
if(!stringr::str_detect(type, "unique")) {
    #  The summary metrics are rows whose feature names begin with two
    #+ underscore characters. Flag those rows once, store them in
    #+ "underscore", and exclude them from t_mat
    tmp_is_metric <- stringr::str_detect(
        t_mat$features, "^__[a-zA-Z0-9_]*$"
    )
    underscore <- t_mat[tmp_is_metric, ]
    t_mat <- t_mat[!tmp_is_metric, ]
    rm(tmp_is_metric)
}

#  (Optional) Inspect the last rows of t_mat
run <- FALSE
if(base::isTRUE(run)) {
    t_mat %>% tail(10)
}


#  Subset t_mat to include counts only for samples of interest ----------------
#  Keep a backup of the full table before subsetting
run <- TRUE
if(base::isTRUE(run)) {
    t_mat.bak <- t_mat
    # t_mat <- t_mat.bak
    # colnames(t_mat)
}

if(!(samples %in% c("Ovation"))) {
    stop("Variable \"samples\" must be \"Ovation\"")
}

#  Columns 1-11 are kept as is; columns 12 onward are the per-sample columns
#+ that get subset
tmp_meta <- t_mat[, 1:11]
tmp_counts <- t_mat[, 12:ncol(t_mat)]

#  For "Ovation", retain the sample columns whose names contain "ovn"
if(samples == "Ovation") {
    tmp_keep <- tmp_counts[
        , stringr::str_detect(colnames(tmp_counts), "ovn")
    ]
}

t_mat <- dplyr::bind_cols(tmp_meta, tmp_keep)
rm(list = ls(pattern = "tmp_"))


#  (Optional) Filter counts matrix --------------------------------------------
#+ ...to exclude rows without a minimum of 4 counts in the required samples
#+ 
#+ Accepted values for "filtering":
#+     - "none": keep every S. cerevisiae feature
#+     - "min-4-cts-all-but-1-samps": keep features with at least 4 counts in
#+       at least n - 1 of the n samples
#+     - "min-4-cts": keep features with at least 4 counts in every sample of
#+       the group considered (G1 samples, Q samples, or all samples)
#+ 
#+ NOTE(review): df_G1, df_Q, and df_all are created only in the "min-4-cts"
#+ branch; any later code that reads df_all requires that setting
if(filtering %notin% c("none", "min-4-cts-all-but-1-samps", "min-4-cts")) {
    stop(paste(
        "Argument for \"filtering\" must be \"none\",",
        "\"min-4-cts-all-but-1-samps\", or \"min-4-cts\""
    ))
}

#  Use %in% rather than ==: a row whose genome is NA is then dropped instead
#+ of being returned as an all-NA row
t_sub <- t_mat[t_mat$genome %in% "S_cerevisiae", ]

if(filtering == "none") {
    #  Do nothing...
} else if(filtering == "min-4-cts-all-but-1-samps") {
    #  Build the numeric counts matrix with cbind() so that the result is a
    #+ matrix even when t_sub has a single row
    counts <- do.call(cbind, lapply(t_sub[, 12:ncol(t_sub)], as.numeric))
    keep <- rowSums(counts >= 4) >= ncol(counts) - 1
    t_sub <- t_sub[keep, ]
    
    run <- TRUE
    if(base::isTRUE(run)) rm(counts, keep)
} else if(filtering == "min-4-cts") {
    counts <- do.call(cbind, lapply(t_sub[, 12:ncol(t_sub)], as.numeric))
    
    #  NOTE(review): the column indices below assume the eight sample columns
    #+ are in alphabetical order (G1_N rep1, G1_N rep2, G1_SS rep1,
    #+ G1_SS rep2, then the same for Q) -- confirm against colnames(counts)
    sketch_1 <- FALSE
    if(base::isTRUE(sketch_1)) {
        #  Sketch 1 ---------------------------------
        #  Filter each N/SS pair (one pair per state and replicate) separately
        counts_G1_1 <- counts[, c(1, 3), drop = FALSE]
        counts_G1_2 <- counts[, c(2, 4), drop = FALSE]
        counts_Q_1 <- counts[, c(5, 7), drop = FALSE]
        counts_Q_2 <- counts[, c(6, 8), drop = FALSE]
        
        keep_G1_1 <- rowSums(counts_G1_1 >= 4) >= ncol(counts_G1_1)
        keep_G1_2 <- rowSums(counts_G1_2 >= 4) >= ncol(counts_G1_2)
        keep_Q_1 <- rowSums(counts_Q_1 >= 4) >= ncol(counts_Q_1)
        keep_Q_2 <- rowSums(counts_Q_2 >= 4) >= ncol(counts_Q_2)
        
        # table(keep_G1_1)
        # table(keep_G1_2)
        # table(keep_Q_1)
        # table(keep_Q_2)
    }
    
    sketch_2 <- TRUE
    if(base::isTRUE(sketch_2)) {
        #  Sketch 2 ---------------------------------
        #  Filter per state: all four G1 columns, and all four Q columns
        counts_G1 <- counts[, c(1, 3, 2, 4), drop = FALSE]
        counts_Q <- counts[, c(5, 7, 6, 8), drop = FALSE]
        
        keep_G1 <- rowSums(counts_G1 >= 4) >= ncol(counts_G1)
        keep_Q <- rowSums(counts_Q >= 4) >= ncol(counts_Q)
        
        # table(keep_G1)
        # table(keep_Q)
        
        #  drop = FALSE keeps a one-row result as a matrix, so its columns
        #+ are bound as columns
        df_G1 <- dplyr::bind_cols(
            t_sub[keep_G1, 1:11],
            counts_G1[keep_G1, , drop = FALSE]
        )
        #  Bind the Q counts here (this previously bound counts_G1, pairing
        #+ the Q-filtered features with G1 counts)
        df_Q <- dplyr::bind_cols(
            t_sub[keep_Q, 1:11],
            counts_Q[keep_Q, , drop = FALSE]
        )
    }
    
    sketch_3 <- TRUE
    if(base::isTRUE(sketch_3)) {
        #  Sketch 3 ---------------------------------
        #  Filter across all eight columns at once
        counts_all <- counts[, c(1, 3, 2, 4, 5, 7, 6, 8), drop = FALSE]
        keep_all <- rowSums(counts_all >= 4) >= ncol(counts_all)
        # table(keep_all)
        
        df_all <- dplyr::bind_cols(
            t_sub[keep_all, 1:11],
            counts_all[keep_all, , drop = FALSE]
        )
    }
}


#  Calculate k = -ln(1 - θ)/t -------------------------------------------------
#+ ...where
#+     - θ is the fraction of new transcription; we estimate this value by
#+       dividing feature_N ÷ feature_SS
#+     - k is RNA degradation in absolute units of inverse time, since θ is
#+       dimensionless
#+     - t is labeling time, which we set to 6 minutes per Alison's benchwork
calculate_k <- function(N, SS, t = 6) {
    # Calculate k = -ln(1 - θ)/t, estimating θ as N/SS
    #
    # :param N: numeric vector of feature_N values (numerator of θ)
    # :param SS: numeric vector of feature_SS values (denominator of θ)
    # :param t: labeling time, a single positive number (default: 6)
    # :return k: numeric vector of k values in units of 1/t, one per element
    #            of N/SS; NaN (with a warning) where N > SS, and Inf where
    #            N equals SS
    
    stopifnot(
        is.numeric(N),
        is.numeric(SS),
        is.numeric(t), length(t) == 1L, !is.na(t), t > 0
    )
    
    theta <- N / SS
    
    #  log1p(-θ) is log(1 - θ) computed without first rounding 1 - θ, so k
    #+ stays accurate when θ is very close to 0
    k <- -log1p(-theta) / t
    
    return(k)
}


calculate_k_updated <- function(N, SS, t = 6) {
    # Calculate k = -ln(1 - θ)/t, estimating θ as N/(N + SS)
    #
    # :param N: numeric vector of feature_N values
    # :param SS: numeric vector of feature_SS values
    # :param t: labeling time, a single positive number (default: 6)
    # :return k: numeric vector of k values in units of 1/t, one per element
    #            of N/(N + SS); Inf where SS is 0 and N is positive, and NaN
    #            where both N and SS are 0
    
    stopifnot(
        is.numeric(N),
        is.numeric(SS),
        is.numeric(t), length(t) == 1L, !is.na(t), t > 0
    )
    
    theta <- N / (N + SS)
    
    #  log1p(-θ) is log(1 - θ) computed without first rounding 1 - θ, so k
    #+ stays accurate when θ is very close to 0
    k <- -log1p(-theta) / t
    
    return(k)
}


#  Convert the sample columns of df_all (column 12 onward) to numeric
df_all[, 12:ncol(df_all)] <- sapply(df_all[, 12:ncol(df_all)], as.numeric)

#  Pull out the N and SS columns for each state (G = G1, Q) and replicate;
#+ each N vector is later paired with the SS vector of the same state and
#+ replicate
G_N_1 <- df_all$WTovn_G1_N_rep1_tech1
G_S_1 <- df_all$WTovn_G1_SS_rep1_tech1

G_N_2 <- df_all$WTovn_G1_N_rep2_tech1
G_S_2 <- df_all$WTovn_G1_SS_rep2_tech1

Q_N_1 <- df_all$WTovn_Q_N_rep1_tech1
Q_S_1 <- df_all$WTovn_Q_SS_rep1_tech1

Q_N_2 <- df_all$WTovn_Q_N_rep2_tech1
Q_S_2 <- df_all$WTovn_Q_SS_rep2_tech1

#  Labeling time passed to the k calculations below
#  NOTE: this creates a variable named "t" alongside the base function t();
#+ calls of the form t(x) still find the function
t <- 6

#  Earlier approach, θ = N/SS (kept for reference)
# k_G_1 <- calculate_k(N = G_N_1, SS = G_S_1, t = t)
# k_G_2 <- calculate_k(N = G_N_2, SS = G_S_2, t = t)
# k_Q_1 <- calculate_k(N = Q_N_1, SS = Q_S_1, t = t)
# k_Q_2 <- calculate_k(N = Q_N_2, SS = Q_S_2, t = t)

#  Current approach, θ = N/(N + SS)
k_G_1 <- calculate_k_updated(N = G_N_1, SS = G_S_1, t = t)
k_G_2 <- calculate_k_updated(N = G_N_2, SS = G_S_2, t = t)
k_Q_1 <- calculate_k_updated(N = Q_N_1, SS = Q_S_1, t = t)
k_Q_2 <- calculate_k_updated(N = Q_N_2, SS = Q_S_2, t = t)

#  Collect the per-sample k vectors as columns of one tibble (one row per
#+ feature in df_all)
k_all <- tibble::tibble(
    k_G_1 = k_G_1,
    k_G_2 = k_G_2,
    k_Q_1 = k_Q_1,
    k_Q_2 = k_Q_2
)

#  Smallest and largest k across all four samples
min(k_all)
max(k_all)


#  Reshape k_all to long format (columns "sample" and "k"), then draw the
#+ distribution of k for each sample as a violin with an inset boxplot
k_all_long <- reshape2::melt(k_all)
colnames(k_all_long) <- c("sample", "k")
ggplot2::ggplot(k_all_long, aes(x = sample, y = k, fill = sample)) +
    geom_violin(trim = FALSE, color = "black") +
    geom_boxplot(
        width = 0.2,
        fill = "white",
        color = "black",
        outlier.shape = NA
    ) +
    labs(x = "", y = "k_deg") +
    ggtitle(
        "Distributions of k_deg values",
        subtitle = "theta = N / (N + SS)"
    ) +
    #  theme_minimal() is a complete theme and replaces any theme settings
    #+ added before it, so it must come first for the title centering to
    #+ take effect
    theme_minimal() +
    theme(plot.title = element_text(hjust = 0.5))

#  Minimal reproducible example ###############################################
suppressMessages(library(tidyverse))

#  Load functions for two ways to calculate k
calculate_k <- function(N, SS, t = 6) {
    # k = -ln(1 - theta)/t, with only SS in the theta denominator
    theta <- N / SS
    -log(1 - theta) / t
}


calculate_k_updated <- function(N, SS, t = 6) {
    # k = -ln(1 - theta)/t, with N plus SS in the theta denominator
    theta <- N / (N + SS)
    -log(1 - theta) / t
}


#  Fill 100-row dataframe with random integers between 0 and 1000
num_rows <- 100
max_value <- 1000

#  Draw the eight columns one after another from a fixed seed, so the
#+ simulated values are reproducible
set.seed(24)
df <- as.data.frame(vapply(
    seq_len(8),
    function(i) sample(0:max_value, num_rows, replace = TRUE),
    integer(num_rows)
))

#  Give dataframe sample-like column names
colnames(df) <- c(
    "G1_N_1", "G1_SS_1", "G1_N_2", "G1_SS_2",
    "Q_N_1", "Q_SS_1", "Q_N_2", "Q_SS_2"
)

#  Make dataframe of k values when only feature_SS is in the denominator
df_k_denom_SS <- with(df, data.frame(
    k_G1_1 = calculate_k(G1_N_1, G1_SS_1, 6),
    k_G1_2 = calculate_k(G1_N_2, G1_SS_2, 6),
    k_Q_1 = calculate_k(Q_N_1, Q_SS_1, 6),
    k_Q_2 = calculate_k(Q_N_2, Q_SS_2, 6)
))  # Get NaN warnings

#  Make dataframe of k values when feature_N + feature_SS is in the denominator
df_k_denom_N_SS <- with(df, data.frame(
    k_G1_1 = calculate_k_updated(G1_N_1, G1_SS_1, 6),
    k_G1_2 = calculate_k_updated(G1_N_2, G1_SS_2, 6),
    k_Q_1 = calculate_k_updated(Q_N_1, Q_SS_1, 6),
    k_Q_2 = calculate_k_updated(Q_N_2, Q_SS_2, 6)
))

#  Show the first rows of each result
head(df_k_denom_SS)
head(df_k_denom_N_SS)

#  Plot the distributions of the simulated samples
#  Reshape to long format (columns "sample" and "k"), then draw the
#+ distribution of k for each sample as a violin with an inset boxplot
df_k_denom_N_SS_long <- reshape2::melt(df_k_denom_N_SS)
colnames(df_k_denom_N_SS_long) <- c("sample", "k")
ggplot2::ggplot(df_k_denom_N_SS_long, aes(x = sample, y = k, fill = sample)) +
    geom_violin(trim = FALSE, color = "black") +
    geom_boxplot(
        width = 0.2,
        fill = "white",
        color = "black",
        outlier.shape = NA
    ) +
    labs(x = "", y = "k_deg") +
    ggtitle(
        "Distributions of k_deg values",
        subtitle = "theta = N / (N + SS)"
    ) +
    #  theme_minimal() is a complete theme and replaces any theme settings
    #+ added before it, so it must come first for the title centering to
    #+ take effect
    theme_minimal() +
    theme(plot.title = element_text(hjust = 0.5))