Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add pre-processing vignette, new preProcessAdat() function #134

Open
wants to merge 6 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ Depends:
Imports:
cli,
dplyr (>= 1.0.6),
ggplot2,
lifecycle (>= 1.0.0),
magrittr (>= 2.0.1),
methods,
Expand All @@ -33,7 +34,6 @@ Imports:
tidyr (>= 1.1.3)
Suggests:
Biobase,
ggplot2,
knitr,
purrr,
recipes,
Expand Down
36 changes: 36 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,10 @@ S3method(left_join,soma_adat)
S3method(median,soma_adat)
S3method(merge,soma_adat)
S3method(mutate,soma_adat)
S3method(plot,Map)
S3method(plot,outlier_map)
S3method(print,adat_summary)
S3method(print,outlier_map)
S3method(print,soma_adat)
S3method(print,target_map)
S3method(rename,soma_adat)
Expand All @@ -68,6 +71,7 @@ export(anti_join)
export(antilog)
export(apt2seqid)
export(arrange)
export(calcOutlierMap)
export(calc_eLOD)
export(checkSomaScanVersion)
export(cleanNames)
Expand All @@ -82,6 +86,7 @@ export(getAnalyteInfo)
export(getAnalytes)
export(getFeatureData)
export(getFeatures)
export(getFlaggedIds)
export(getMeta)
export(getSeqId)
export(getSeqIdMatches)
Expand Down Expand Up @@ -111,6 +116,7 @@ export(merge_clin)
export(mutate)
export(parseHeader)
export(pivotExpressionSet)
export(preProcessAdat)
export(read.adat)
export(read_adat)
export(read_annotations)
Expand Down Expand Up @@ -144,6 +150,7 @@ importFrom(dplyr,left_join)
importFrom(dplyr,mutate)
importFrom(dplyr,rename)
importFrom(dplyr,right_join)
importFrom(dplyr,row_number)
importFrom(dplyr,sample_frac)
importFrom(dplyr,sample_n)
importFrom(dplyr,select)
Expand All @@ -153,6 +160,34 @@ importFrom(dplyr,slice_sample)
importFrom(dplyr,starts_with)
importFrom(dplyr,summarise)
importFrom(dplyr,ungroup)
importFrom(ggplot2,aes)
importFrom(ggplot2,element_blank)
importFrom(ggplot2,element_rect)
importFrom(ggplot2,element_text)
importFrom(ggplot2,facet_wrap)
importFrom(ggplot2,geom_boxplot)
importFrom(ggplot2,geom_hline)
importFrom(ggplot2,geom_point)
importFrom(ggplot2,geom_raster)
importFrom(ggplot2,geom_vline)
importFrom(ggplot2,ggplot)
importFrom(ggplot2,ggsave)
importFrom(ggplot2,guide_axis)
importFrom(ggplot2,guide_colorbar)
importFrom(ggplot2,guide_legend)
importFrom(ggplot2,guides)
importFrom(ggplot2,labs)
importFrom(ggplot2,scale_color_manual)
importFrom(ggplot2,scale_fill_gradientn)
importFrom(ggplot2,scale_fill_manual)
importFrom(ggplot2,scale_x_continuous)
importFrom(ggplot2,scale_x_discrete)
importFrom(ggplot2,scale_y_reverse)
importFrom(ggplot2,sec_axis)
importFrom(ggplot2,theme)
importFrom(ggplot2,theme_bw)
importFrom(ggplot2,unit)
importFrom(ggplot2,ylim)
importFrom(lifecycle,deprecate_soft)
importFrom(lifecycle,deprecate_stop)
importFrom(lifecycle,deprecate_warn)
Expand All @@ -173,6 +208,7 @@ importFrom(tibble,deframe)
importFrom(tibble,enframe)
importFrom(tibble,is_tibble)
importFrom(tibble,tibble)
importFrom(tidyr,gather)
importFrom(tidyr,pivot_longer)
importFrom(tidyr,separate)
importFrom(tidyr,unite)
Expand Down
15 changes: 14 additions & 1 deletion R/0-declare-global-variables.R
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,26 @@ utils::globalVariables(
"AptName",
"array_id",
"blank_col",
"ColCheck",
"Dilution",
"eLOD",
"feature",
"Group",
"GroupCount",
"Normalization Scale Factor",
"Organism",
"prefix",
"Response",
"RowCheck",
"rn",
"SampleId",
"SampleType",
"SeqId",
"seqid",
"value"
"SubjectCount",
"SubjectId",
"Type",
"value",
"Value"
)
)
199 changes: 199 additions & 0 deletions R/calcOutlierMap.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,199 @@
#' Calculate MAD Outlier Map
#'
#' Calculate the median absolute deviation (statistical) outliers measurements
#' and fold-change criteria from an ADAT. Two values are required for the
#' calculation: median absolute deviation (MAD) and fold-change (FC). Outliers
#' are determined based on the result of _both_ `6*MAD` and `x*FC` , where `x`
#' is the number of fold changes defined.
#'
#' For the S3 plotting method, see [plot.Map()].
#'
#' @family Calc Map
#' @param data A `soma_adat` object containing RFU feature data.
#' @param anno_tbl An annotations table produced via [getAnalyteInfo()].
#' Used to calculate analyte dilutions for the matrix column ordering.
#' If `NULL`, a table is generated internally from `data` (if possible), and
#' the analytes are plotted in dilution order.
#' @param apt.order Character. How should the columns/features be ordered?
#' Options include: by dilution mix ("dilution"), by median overall signal
#' ("signal"), or as-is in `data` (default).
#' @param sample.order Either a character string indicating the column name
#' with entries to be used to order the data frame rows, or a numeric vector
#' representing the order of the data frame rows. The
#' default (`NULL`) leaves the row ordering as it is in `data`.
#' @param fc.crit Integer. The fold change criterion to evaluate. Defaults to 5x.
#' @return A list of class `c("outlier_map", "Map")` containing:
#' \item{matrix}{A boolean matrix of `TRUE/FALSE` whether each sample is an
#' outlier according the the stated criteria.}
#' \item{x.lab}{A character string containing the plot x-axis label.}
#' \item{title}{A character string containing the plot title.}
#' \item{rows.by.freq}{A logical indicating if the samples are ordered
#' by outlier frequency.}
#' \item{class.tab}{A table containing the frequencies of each class if input
#' `sample.order` is defined as a categorical variable.}
#' \item{sample.order}{A numeric vector representing the order of the data
#' frame rows.}
#' \item{legend.sub}{A character string containing the plot legend subtitle.}
#' @author Stu Field
#' @examples
#' om <- calcOutlierMap(example_data)
#' class(om)
#'
#' # S3 print method
#' om
#'
#' # `sample.order = "frequency"` orders samples by outlier frequency
#' om <- calcOutlierMap(example_data, sample.order = "frequency")
#' om$rows.by.freq
#' om$sample.order
#'
#' # order samples by user specified indices
#' om <- calcOutlierMap(example_data, sample.order = 192:1)
#' om$sample.order
#'
#' # order samples field in Adat
#' om <- calcOutlierMap(example_data, sample.order = "Sex")
#' om$sample.order
#' @importFrom stats median

#' @export
calcOutlierMap <- function(data, anno_tbl = NULL,
apt.order = c(NA, "dilution", "signal"),
sample.order = NULL, fc.crit = 5) {

apt.order <- match.arg(apt.order)
data <- .refactorData(data)
sampleL <- length(sample.order)
freq <- sampleL == 1L && tolower(sample.order) %in% "frequency"
class_tab <- NA
ord <- seq_len(nrow(data))
ret <- list(matrix = matrix(0)) # placeholder: reserve position 1

if ( is.null(anno_tbl) ) {
anno_tbl <- getAnalyteInfo(data)
}

# Order of the rows in the Map
if ( !is.null(sample.order) && !freq ) {
if ( sampleL > 1L && is.numeric(sample.order) ) {
if ( sampleL != nrow(data) ) {
stop(
"Incorrect number of row indices: ", value(nrow(data)),
"rows vs. ", value(sampleL), " indices.", call. = FALSE
)
} else {
data <- data[sample.order, ]
ord <- sample.order
ret$y.lab <- "Samples (User Specified Order)"
}
} else if ( sampleL == 1L && is.character(sample.order) ) {
stopifnot(sample.order %in% names(data))
ord <- order(data[[sample.order]])
data <- data[ord, ]
class_tab <- table(data[[sample.order]])
ret$y.lab <- sprintf("Samples (by %s)", sample.order)
} else {
stop(
"Something wrong with `sample.order =` argument: ",
value(sample.order), call. = FALSE
)
}
}

data_mat <- data.matrix(data[, getAnalytes(data)])

# calc statistical outliers matrix (boolean matrix of TRUE/FALSE)
mat <- apply(data_mat, 2, function(.apt) {
seq_along(.apt) %in% .getOutliers(.apt, fc.crit)
})
rownames(mat) <- rownames(data_mat) # rownames stripped by apply()

if ( sum(mat) == 0 ) {
warning("No outliers detected in outlier map!", call. = FALSE)
}

if ( freq ) {
mat <- mat[ names(sort(rowSums(mat))), ]
ret$y.lab <- "Samples Ordered by Outlier Frequency"
}

if ( is.na(apt.order) ) {

ret$x.lab <- "Proteins Ordered in Adat"

} else if ( apt.order == "dilution" ) {

apt.dils <- .getDilList(anno_tbl)
mat <- mat[, unlist(apt.dils)]
ret$dil.nums <- lengths(apt.dils)
ret$col.order <- "dilution"
ret$x.lab <- sprintf("Dilution Ordered Proteins (%s)",
paste(names(apt.dils), collapse = " | "))

} else if ( apt.order == "signal" ) {

signal.order <- sort(apply(data_mat, 2, stats::median))
mat <- mat[, names(signal.order)]
ret$col.order <- "signal"
ret$x.lab <- "Proteins by Median Signal"

} else {
stop("Problem with `apt.order =` argument: ",
value(apt.order), call. = FALSE)
}

ret$title <- paste0(
"Outlier Map: | x - median(x) | > 6 * mad(x) & FC > ", fc.crit, "x"
)
ret$rows.by.freq <- freq
ret$class.tab <- class_tab
ret$sample.order <- ord
ret$matrix <- mat
ret$legend.sub <- "Proteins"
invisible(
structure(
ret,
class = c("outlier_map", "Map", "list")
)
)
}


#' @describeIn calcOutlierMap
#' There is a S3 print method for class `"outlier_map"`.
#' @param x An object of class `"outlier_map"`.
#' @param ... Arguments for S3 print methods.
#' @export
print.outlier_map <- function(x, ...) {
writeLines(
cli_rule("SomaLogic Outlier Map", line = "double", line_col = "magenta")
)
key <- c(
"Outlier Map dimensions",
"Title",
"Class Table",
"Rows by Frequency",
"Sample Order",
"x-label",
"Legend Sub-title") |> .pad(25)
value <- c(
.value(paste(dim(x$matrix), collapse = " x ")),
.value(x$title),
c(x$class.tab),
x$rows.by.freq,
.value(x$x.lab),
.value(x$sample.order),
.value(x$legend.sub)
)
writeLines(paste(" ", key, value))
writeLines(cli_rule(line = "double", line_col = "green"))
invisible(x)
}


#' S3 plot methods for class outlier_map
#' @noRd
#' @export
plot.outlier_map <- function(x, ...) {
NextMethod("plot", type = "outlier")
}
61 changes: 61 additions & 0 deletions R/getFlaggedIds.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
#' Get Flagged Ids From MAD Outlier Map
#'
#' Return the IDs of flagged samples for objects of the `outlier_map` class.
#' Samples are flagged based on the percent analytes (RFU columns) for a given
#' sample that were identified as outliers using the median absolute
#' deviation (MAD).
#'
#' @family Calc Map
#' @inheritParams plot.Map
#' @param x An object of class:
#' * `outlier_map` - from [calcOutlierMap()]
#' @param data Optional. The data originally used to create the map `x`. If
#' omitted, a single column data frame is returned.
#' @param include Optional. Character vector of column name(s) in `data` to
#' include in the resulting data frame. Ignored if `data = NULL`.
#' @return A `data.frame` of the indices (`idx`) of flagged samples, along
#' with any additional variables as specified by `include`.
#' @author Caleb Scheidel
#' @examples
#' # flagged outliers
#' # create a single sample outlier (12)
#' out_adat <- example_data
#' apts <- getAnalytes(out_adat)
#' out_adat[12, apts] <- out_adat[12, apts] * 10
#'
#' om <- calcOutlierMap(out_adat)
#' getFlaggedIds(om, out_adat, flags = 0.05, include = c("Sex", "Subarray"))
#' @export
getFlaggedIds <- function(x, flags = 0.05, data = NULL, include = NULL) {

if ( !inherits(x, "outlier_map") ) {
stop("Input `x` object must be class `outlier_map`!",
call. = FALSE)
}

# ensure that flags value is between 0 & 1
if ( flags < 0 || flags > 1 ) {
stop("`flags =` argument must be between 0 and 1!", call. = FALSE)
}

flagged <- which(rowSums(x$matrix) >= ncol(x$matrix) * flags) |> unname()
df_idx <- data.frame(idx = flagged) # default 1-col df

if ( !length(flagged) ) {
.todo("No observations were flagged at this flagging proportion: {.val {flags}}")
}

if ( is.null(data) ) {
df_idx
} else {
stopifnot(
"The `data` argument must be a `data.frame` object." = is.data.frame(data),
"All `include` must be in `data`." = all(include %in% names(data))
)
df <- as.data.frame(data) # strip soma_adat class
cbind(
df_idx,
rm_rn(df[flagged, include, drop = FALSE]) # ensure no rn
)
}
}
Loading
Loading