SomaLogic · scheidec · Jun 6, 2024 · Sep 19, 2024 · Oct 7, 2024 · Oct 7, 2024
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -25,6 +25,7 @@ Depends:
 Imports:
     cli,
     dplyr (>= 1.0.6),
+    ggplot2,
     lifecycle (>= 1.0.0),
     magrittr (>= 2.0.1),
     methods,
@@ -33,7 +34,6 @@ Imports:
     tidyr (>= 1.1.3)
 Suggests:
     Biobase,
-    ggplot2,
     knitr,
     purrr,
     recipes,

diff --git a/NAMESPACE b/NAMESPACE
@@ -43,7 +43,10 @@ S3method(left_join,soma_adat)
 S3method(median,soma_adat)
 S3method(merge,soma_adat)
 S3method(mutate,soma_adat)
+S3method(plot,Map)
+S3method(plot,outlier_map)
 S3method(print,adat_summary)
+S3method(print,outlier_map)
 S3method(print,soma_adat)
 S3method(print,target_map)
 S3method(rename,soma_adat)
@@ -68,6 +71,7 @@ export(anti_join)
 export(antilog)
 export(apt2seqid)
 export(arrange)
+export(calcOutlierMap)
 export(calc_eLOD)
 export(checkSomaScanVersion)
 export(cleanNames)
@@ -82,6 +86,7 @@ export(getAnalyteInfo)
 export(getAnalytes)
 export(getFeatureData)
 export(getFeatures)
+export(getFlaggedIds)
 export(getMeta)
 export(getSeqId)
 export(getSeqIdMatches)
@@ -111,6 +116,7 @@ export(merge_clin)
 export(mutate)
 export(parseHeader)
 export(pivotExpressionSet)
+export(preProcessAdat)
 export(read.adat)
 export(read_adat)
 export(read_annotations)
@@ -144,6 +150,7 @@ importFrom(dplyr,left_join)
 importFrom(dplyr,mutate)
 importFrom(dplyr,rename)
 importFrom(dplyr,right_join)
+importFrom(dplyr,row_number)
 importFrom(dplyr,sample_frac)
 importFrom(dplyr,sample_n)
 importFrom(dplyr,select)
@@ -153,6 +160,34 @@ importFrom(dplyr,slice_sample)
 importFrom(dplyr,starts_with)
 importFrom(dplyr,summarise)
 importFrom(dplyr,ungroup)
+importFrom(ggplot2,aes)
+importFrom(ggplot2,element_blank)
+importFrom(ggplot2,element_rect)
+importFrom(ggplot2,element_text)
+importFrom(ggplot2,facet_wrap)
+importFrom(ggplot2,geom_boxplot)
+importFrom(ggplot2,geom_hline)
+importFrom(ggplot2,geom_point)
+importFrom(ggplot2,geom_raster)
+importFrom(ggplot2,geom_vline)
+importFrom(ggplot2,ggplot)
+importFrom(ggplot2,ggsave)
+importFrom(ggplot2,guide_axis)
+importFrom(ggplot2,guide_colorbar)
+importFrom(ggplot2,guide_legend)
+importFrom(ggplot2,guides)
+importFrom(ggplot2,labs)
+importFrom(ggplot2,scale_color_manual)
+importFrom(ggplot2,scale_fill_gradientn)
+importFrom(ggplot2,scale_fill_manual)
+importFrom(ggplot2,scale_x_continuous)
+importFrom(ggplot2,scale_x_discrete)
+importFrom(ggplot2,scale_y_reverse)
+importFrom(ggplot2,sec_axis)
+importFrom(ggplot2,theme)
+importFrom(ggplot2,theme_bw)
+importFrom(ggplot2,unit)
+importFrom(ggplot2,ylim)
 importFrom(lifecycle,deprecate_soft)
 importFrom(lifecycle,deprecate_stop)
 importFrom(lifecycle,deprecate_warn)
@@ -173,6 +208,7 @@ importFrom(tibble,deframe)
 importFrom(tibble,enframe)
 importFrom(tibble,is_tibble)
 importFrom(tibble,tibble)
+importFrom(tidyr,gather)
 importFrom(tidyr,pivot_longer)
 importFrom(tidyr,separate)
 importFrom(tidyr,unite)

diff --git a/R/0-declare-global-variables.R b/R/0-declare-global-variables.R
@@ -11,13 +11,26 @@ utils::globalVariables(
     "AptName",
     "array_id",
     "blank_col",
+    "ColCheck",
     "Dilution",
     "eLOD",
     "feature",
+    "Group",
+    "GroupCount",
+    "Normalization Scale Factor",
+    "Organism",
     "prefix",
+    "Response",
+    "RowCheck",
     "rn",
+    "SampleId",
+    "SampleType",
     "SeqId",
     "seqid",
-    "value"
+    "SubjectCount",
+    "SubjectId",
+    "Type",
+    "value",
+    "Value"
   )
 )
diff --git a/R/calcOutlierMap.R b/R/calcOutlierMap.R
@@ -0,0 +1,199 @@
+#' Calculate MAD Outlier Map
+#'
+#' Calculate the median absolute deviation (statistical) outliers measurements
+#' and fold-change criteria from an ADAT. Two values are required for the
+#' calculation: median absolute deviation (MAD) and fold-change (FC). Outliers
+#' are determined based on the result of _both_ `6*MAD` and `x*FC` , where `x`
+#' is the number of fold changes defined.
+#'
+#' For the S3 plotting method, see [plot.Map()].
+#'
+#' @family Calc Map
+#' @param data A `soma_adat` object containing RFU feature data.
+#' @param anno_tbl An annotations table produced via [getAnalyteInfo()].
+#'   Used to calculate analyte dilutions for the matrix column ordering.
+#'   If `NULL`, a table is generated internally from `data` (if possible), and
+#'   the analytes are plotted in dilution order.
+#' @param apt.order Character. How should the columns/features be ordered?
+#'   Options include: by dilution mix ("dilution"), by median overall signal
+#'   ("signal"), or as-is in `data` (default).
+#' @param sample.order Either a character string indicating the column name
+#'   with entries to be used to order the data frame rows, or a numeric vector
+#'   representing the order of the data frame rows. The
+#'   default (`NULL`) leaves the row ordering as it is in `data`.
+#' @param fc.crit Integer. The fold change criterion to evaluate. Defaults to 5x.
+#' @return A list of class `c("outlier_map", "Map")` containing:
+#'   \item{matrix}{A boolean matrix of `TRUE/FALSE` whether each sample is an
+#'     outlier according the the stated criteria.}
+#'   \item{x.lab}{A character string containing the plot x-axis label.}
+#'   \item{title}{A character string containing the plot title.}
+#'   \item{rows.by.freq}{A logical indicating if the samples are ordered
+#'     by outlier frequency.}
+#'   \item{class.tab}{A table containing the frequencies of each class if input
+#'     `sample.order` is defined as a categorical variable.}
+#'   \item{sample.order}{A numeric vector representing the order of the data
+#'     frame rows.}
+#'   \item{legend.sub}{A character string containing the plot legend subtitle.}
+#' @author Stu Field
+#' @examples
+#' om <- calcOutlierMap(example_data)
+#' class(om)
+#'
+#' # S3 print method
+#' om
+#'
+#' # `sample.order = "frequency"` orders samples by outlier frequency
+#' om <- calcOutlierMap(example_data, sample.order = "frequency")
+#' om$rows.by.freq
+#' om$sample.order
+#'
+#' # order samples by user specified indices
+#' om <- calcOutlierMap(example_data, sample.order = 192:1)
+#' om$sample.order
+#'
+#' # order samples field in Adat
+#' om <- calcOutlierMap(example_data, sample.order = "Sex")
+#' om$sample.order
+#' @importFrom stats median
+
+#' @export
+calcOutlierMap <- function(data, anno_tbl = NULL,
+                           apt.order = c(NA, "dilution", "signal"),
+                           sample.order = NULL, fc.crit = 5) {
+
+  apt.order  <- match.arg(apt.order)
+  data       <- .refactorData(data)
+  sampleL    <- length(sample.order)
+  freq       <- sampleL == 1L && tolower(sample.order) %in% "frequency"
+  class_tab  <- NA
+  ord        <- seq_len(nrow(data))
+  ret        <- list(matrix = matrix(0)) # placeholder: reserve position 1
+
+  if ( is.null(anno_tbl) ) {
+    anno_tbl <- getAnalyteInfo(data)
+  }
+
+  # Order of the rows in the Map
+  if ( !is.null(sample.order) && !freq ) {
+    if ( sampleL > 1L && is.numeric(sample.order) ) {
+      if ( sampleL != nrow(data) ) {
+        stop(
+          "Incorrect number of row indices: ", value(nrow(data)),
+          "rows vs. ", value(sampleL), " indices.", call. = FALSE
+        )
+      } else {
+        data      <- data[sample.order, ]
+        ord       <- sample.order
+        ret$y.lab <- "Samples (User Specified Order)"
+      }
+    } else if ( sampleL == 1L && is.character(sample.order) ) {
+      stopifnot(sample.order %in% names(data))
+      ord       <- order(data[[sample.order]])
+      data      <- data[ord, ]
+      class_tab <- table(data[[sample.order]])
+      ret$y.lab <- sprintf("Samples (by %s)", sample.order)
+    } else {
+      stop(
+        "Something wrong with `sample.order =` argument: ",
+        value(sample.order), call. = FALSE
+      )
+    }
+  }
+
+  data_mat <- data.matrix(data[, getAnalytes(data)])
+
+  # calc statistical outliers matrix (boolean matrix of TRUE/FALSE)
+  mat <- apply(data_mat, 2, function(.apt) {
+    seq_along(.apt) %in% .getOutliers(.apt, fc.crit)
+  })
+  rownames(mat) <- rownames(data_mat)   # rownames stripped by apply()
+
+  if ( sum(mat) == 0 ) {
+    warning("No outliers detected in outlier map!", call. = FALSE)
+  }
+
+  if ( freq ) {
+    mat       <- mat[ names(sort(rowSums(mat))), ]
+    ret$y.lab <- "Samples Ordered by Outlier Frequency"
+  }
+
+  if ( is.na(apt.order) ) {
+
+    ret$x.lab <- "Proteins Ordered in Adat"
+
+  } else if ( apt.order == "dilution" ) {
+
+    apt.dils      <- .getDilList(anno_tbl)
+    mat           <- mat[, unlist(apt.dils)]
+    ret$dil.nums  <- lengths(apt.dils)
+    ret$col.order <- "dilution"
+    ret$x.lab     <- sprintf("Dilution Ordered Proteins (%s)",
+                             paste(names(apt.dils), collapse = " | "))
+
+  } else if ( apt.order == "signal" ) {
+
+    signal.order  <- sort(apply(data_mat, 2, stats::median))
+    mat           <- mat[, names(signal.order)]
+    ret$col.order <- "signal"
+    ret$x.lab     <- "Proteins by Median Signal"
+
+  } else {
+    stop("Problem with `apt.order =` argument: ",
+         value(apt.order), call. = FALSE)
+  }
+
+  ret$title <- paste0(
+    "Outlier Map: | x - median(x) | > 6 * mad(x) & FC > ", fc.crit, "x"
+  )
+  ret$rows.by.freq <- freq
+  ret$class.tab    <- class_tab
+  ret$sample.order <- ord
+  ret$matrix       <- mat
+  ret$legend.sub   <- "Proteins"
+  invisible(
+    structure(
+      ret,
+      class = c("outlier_map", "Map", "list")
+    )
+  )
+}
+
+
+#' @describeIn calcOutlierMap
+#' There is a S3 print method for class `"outlier_map"`.
+#' @param x An object of class `"outlier_map"`.
+#' @param ... Arguments for S3 print methods.
+#' @export
+print.outlier_map <- function(x, ...) {
+  writeLines(
+    cli_rule("SomaLogic Outlier Map", line = "double", line_col = "magenta")
+  )
+  key <- c(
+    "Outlier Map dimensions",
+    "Title",
+    "Class Table",
+    "Rows by Frequency",
+    "Sample Order",
+    "x-label",
+    "Legend Sub-title") |> .pad(25)
+  value <- c(
+    .value(paste(dim(x$matrix), collapse = " x ")),
+    .value(x$title),
+    c(x$class.tab),
+    x$rows.by.freq,
+    .value(x$x.lab),
+    .value(x$sample.order),
+    .value(x$legend.sub)
+  )
+  writeLines(paste(" ", key, value))
+  writeLines(cli_rule(line = "double", line_col = "green"))
+  invisible(x)
+}
+
+
+#' S3 plot methods for class outlier_map
+#' @noRd
+#' @export
+plot.outlier_map <- function(x, ...) {
+  NextMethod("plot", type = "outlier")
+}
diff --git a/R/getFlaggedIds.R b/R/getFlaggedIds.R
@@ -0,0 +1,61 @@
+#' Get Flagged Ids From MAD Outlier Map
+#'
+#' Return the IDs of flagged samples for objects of the `outlier_map` class.
+#' Samples are flagged based on the percent analytes (RFU columns) for a given
+#' sample that were identified as outliers using the median absolute
+#' deviation (MAD).
+#'
+#' @family Calc Map
+#' @inheritParams plot.Map
+#' @param x An object of class:
+#'   * `outlier_map` - from [calcOutlierMap()]
+#' @param data Optional. The data originally used to create the map `x`. If
+#'   omitted, a single column data frame is returned.
+#' @param include Optional. Character vector of column name(s) in `data` to
+#'   include in the resulting data frame. Ignored if `data = NULL`.
+#' @return A `data.frame` of the indices (`idx`) of flagged samples, along
+#'   with any additional variables as specified by `include`.
+#' @author Caleb Scheidel
+#' @examples
+#' # flagged outliers
+#' # create a single sample outlier (12)
+#' out_adat <- example_data
+#' apts     <- getAnalytes(out_adat)
+#' out_adat[12, apts] <- out_adat[12, apts] * 10
+#'
+#' om <- calcOutlierMap(out_adat)
+#' getFlaggedIds(om, out_adat, flags = 0.05, include = c("Sex", "Subarray"))
+#' @export
+getFlaggedIds <- function(x, flags = 0.05, data = NULL, include = NULL) {
+
+  if ( !inherits(x, "outlier_map") ) {
+    stop("Input `x` object must be class `outlier_map`!",
+         call. = FALSE)
+  }
+
+  # ensure that flags value is between 0 & 1
+  if ( flags < 0 || flags > 1 ) {
+    stop("`flags =` argument must be between 0 and 1!", call. = FALSE)
+  }
+
+  flagged <- which(rowSums(x$matrix) >= ncol(x$matrix) * flags) |> unname()
+  df_idx  <- data.frame(idx = flagged)  # default 1-col df
+
+  if ( !length(flagged) ) {
+    .todo("No observations were flagged at this flagging proportion: {.val {flags}}")
+  }
+
+  if ( is.null(data) ) {
+    df_idx
+  } else {
+    stopifnot(
+      "The `data` argument must be a `data.frame` object." = is.data.frame(data),
+      "All `include` must be in `data`." = all(include %in% names(data))
+    )
+    df <- as.data.frame(data)  # strip soma_adat class
+    cbind(
+      df_idx,
+      rm_rn(df[flagged, include, drop = FALSE])   # ensure no rn
+    )
+  }
+}