jpquast · elena-krismer · Sep 24, 2024 · Oct 10, 2024 · Oct 10, 2024 · Oct 10, 2024
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -42,8 +42,9 @@ Imports:
     httr,
     methods,
     R.utils,
-    stats
-RoxygenNote: 7.3.1
+    stats,
+    missForest
+RoxygenNote: 7.3.2
 Suggests: 
     testthat,
     covr,
@@ -67,7 +68,9 @@ Suggests:
     iq,
     scales,
     farver,
-    ggforce
+    ggforce,
+    xml2,
+    jsonlite
 Depends: 
     R (>= 4.0)
 URL: https://github.com/jpquast/protti, https://jpquast.github.io/protti/

diff --git a/NAMESPACE b/NAMESPACE
@@ -38,6 +38,7 @@ export(find_peptide_in_structure)
 export(fit_drc_4p)
 export(go_enrichment)
 export(impute)
+export(impute_randomforest)
 export(kegg_enrichment)
 export(map_peptides_on_structure)
 export(median_normalisation)
@@ -78,6 +79,7 @@ export(volcano_protti)
 export(woods_plot)
 import(dplyr)
 import(ggplot2)
+import(missForest)
 import(progress)
 import(purrr)
 import(stringr)
@@ -134,6 +136,7 @@ importFrom(purrr,pluck)
 importFrom(purrr,pmap)
 importFrom(purrr,reduce)
 importFrom(purrr,set_names)
+importFrom(readr,read_csv)
 importFrom(readr,read_tsv)
 importFrom(readr,write_csv)
 importFrom(readr,write_tsv)

diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,7 @@
+# protti ***
+## New features
+* Added `impute_randomforest()`, a new imputation method based on random forests that uses the `missForest` package.
+
 # protti 0.9.0
 
 ## New features 

diff --git a/R/impute.R b/R/impute.R
@@ -102,8 +102,10 @@ impute <- function(data,
                    noise = NULL,
                    method = "ludovic",
                    skip_log2_transform_error = FALSE,
-                   retain_columns = NULL) {
+                   retain_columns = NULL,
+                   ...) {
   noise_missing <- missing(noise) # check if argument noise was provided or not
+
   result <- data %>%
     dplyr::distinct(
       {{ sample }},

diff --git a/R/impute_randomforest.R b/R/impute_randomforest.R
@@ -0,0 +1,157 @@
+#' Imputation of Missing Values Using Random Forest Imputation
+#'
+#' \code{impute_randomforest} performs imputation for missing values in the data using the random
+#' forest-based method implemented in the \code{missForest} package.
+#'
+#' The function imputes missing values by building random forests, where missing values are
+#' predicted based on other available values within the dataset. For each variable with missing
+#' data, the function trains a random forest model using the available (non-missing) data in
+#' that variable, and subsequently predicts the missing values.
+#'
+#' Internally the data is pivoted to a wide table with one row per sample and one column per
+#' \code{grouping} value. Combinations of sample and grouping that are absent from the input are
+#' therefore treated as missing values and are imputed as well. Every combination of sample and
+#' grouping may occur at most once in \code{data}; otherwise the function stops with an error.
+#'
+#' In addition to the imputed values, users can choose to retain additional columns from the
+#' original input data frame that were not part of the imputation process.
+#'
+#' This function allows passing additional parameters to the underlying \code{missForest} function,
+#' such as controlling the number of trees used in the random forest models or specifying the
+#' stopping criteria. For a full list of parameters, refer to the \code{missForest} documentation.
+#'
+#' To enable parallelization, ensure that the `doParallel` package is installed and loaded:
+#' ```
+#' install.packages("doParallel")
+#' library(doParallel)
+#' ```
+#' Then register the desired number of cores for parallel processing:
+#' ```
+#' registerDoParallel(cores = 6)
+#' ```
+#' To leverage parallelization during the imputation, pass `parallelize = "variables"`
+#' through `...`; it is forwarded to the `missForest` function.
+#'
+#' @param data A data frame that contains the input variables. This should include columns for
+#' the sample names, precursor or peptide identifiers, and intensity values.
+#' @param sample A character column in the \code{data} data frame that contains the sample names.
+#' @param grouping A character column in the \code{data} data frame that contains the precursor or
+#' peptide identifiers.
+#' @param intensity_log2 A numeric column in the \code{data} data frame that contains the intensity
+#' values.
+#' @param retain_columns A character vector indicating which columns should be retained from the
+#' input data frame. These columns will be preserved in the output alongside the imputed values.
+#' By default, no additional columns are retained (\code{retain_columns = NULL}), but specific
+#' columns can be retained by providing their names as a vector.
+#' @param ... Additional parameters to pass to the \code{missForest} function. These parameters
+#' can control aspects such as the number of trees (\code{ntree}) and the stopping criteria
+#' (\code{maxiter}).
+#'
+#' @return A data frame in long format that contains the \code{sample} column, the
+#' \code{grouping} column and the \code{intensity_log2} column, each named as in the input. The
+#' intensity column holds the observed values together with the imputed values that replace
+#' missing ones. Columns requested via \code{retain_columns} are joined by sample and grouping;
+#' they are \code{NA} for combinations that were not present in the input.
+#'
+#' @references
+#' Stekhoven, D.J., & Bühlmann, P. (2012). MissForest—non-parametric missing value imputation
+#' for mixed-type data. Bioinformatics, 28(1), 112-118. https://doi.org/10.1093/bioinformatics/btr597
+#'
+#' @import dplyr
+#' @importFrom rlang .data
+#' @importFrom magrittr %>%
+#' @importFrom stringr str_sort
+#' @import missForest
+#' @export
+#'
+#' @examples
+#' set.seed(123) # Makes example reproducible
+#'
+#' # Create example data
+#' data <- create_synthetic_data(
+#'   n_proteins = 10,
+#'   frac_change = 0.5,
+#'   n_replicates = 4,
+#'   n_conditions = 2,
+#'   method = "effect_random",
+#'   additional_metadata = FALSE
+#' )
+#'
+#' head(data, n = 24)
+#'
+#' # Perform imputation
+#' data_imputed <- impute_randomforest(
+#'   data,
+#'   sample = sample,
+#'   grouping = peptide,
+#'   intensity_log2 = peptide_intensity_missing
+#' )
+#'
+#' head(data_imputed, n = 24)
+impute_randomforest <- function(
+    data,
+    sample,
+    grouping,
+    intensity_log2,
+    retain_columns = NULL,
+    ...) {
+  # Capture the column arguments once, as symbols and as plain strings
+  sample_sym <- rlang::ensym(sample)
+  grouping_sym <- rlang::ensym(grouping)
+  intensity_sym <- rlang::ensym(intensity_log2)
+  sample_name <- rlang::as_string(sample_sym)
+  grouping_name <- rlang::as_string(grouping_sym)
+  intensity_name <- rlang::as_string(intensity_sym)
+
+  # Duplicated sample/grouping pairs would make pivot_wider() return list-columns. The numeric
+  # coercion below would then turn measured values into NA, and they would silently be replaced
+  # by imputed values. Fail early with an explicit message instead.
+  key_columns <- as.data.frame(data)[, c(sample_name, grouping_name), drop = FALSE]
+  if (anyDuplicated(key_columns) > 0) {
+    stop(
+      "The input data contains more than one row for at least one combination of \"",
+      sample_name, "\" and \"", grouping_name,
+      "\". Each combination must occur only once.",
+      call. = FALSE
+    )
+  }
+
+  # Pivot to wide format once: one row per sample, one column per grouping value
+  data_wide <- data %>%
+    tidyr::pivot_wider(
+      id_cols = !!sample_sym,
+      names_from = !!grouping_sym,
+      values_from = !!intensity_sym,
+      names_repair = "minimal"
+    )
+
+  # Keep the sample identifiers aside; they must not take part in the imputation
+  sample_values <- data_wide[[sample_name]]
+
+  # Convert the intensity columns to numeric and suppress warnings for coercion
+  intensity_wide <- suppressWarnings(data.frame(
+    lapply(
+      dplyr::select(data_wide, -!!sample_sym),
+      function(x) as.numeric(as.character(x))
+    ),
+    check.names = FALSE
+  ))
+
+  # Perform the random forest imputation and extract the imputed values
+  data_imputed <- missForest::missForest(intensity_wide, ...)$ximp
+
+  # Add the sample column back; the row order is unchanged from the wide table
+  data_imputed[[sample_name]] <- sample_values
+
+  # Convert back to long format
+  data_imputed <- data_imputed %>%
+    as.data.frame() %>%
+    tidyr::pivot_longer(
+      cols = -dplyr::all_of(sample_name),
+      names_to = grouping_name,
+      values_to = intensity_name
+    )
+
+  if (is.null(retain_columns)) {
+    return(data_imputed)
+  }
+
+  # Join the retained columns; the original intensity column is excluded so that it does not
+  # clash with the imputed one
+  data_to_join <- data %>%
+    dplyr::select(dplyr::all_of(retain_columns), !!grouping_sym, !!sample_sym, -!!intensity_sym) %>%
+    dplyr::distinct()
+
+  data_imputed %>%
+    dplyr::left_join(
+      data_to_join,
+      by = c(sample_name, grouping_name)
+    )
+}
diff --git a/R/qc_cvs.R b/R/qc_cvs.R
@@ -122,7 +122,7 @@ The function does not handle log2 transformed data.",
         dplyr::mutate({{ condition }} := forcats::fct_expand({{ condition }}, "combined")) %>%
         dplyr::mutate({{ condition }} := replace({{ condition }}, .data$type == "cv_combined", "combined")) %>%
         dplyr::mutate({{ condition }} := forcats::fct_relevel({{ condition }}, "combined")) %>%
-        dplyr::select(-.data$type) %>%
+        dplyr::select(-"type") %>%
         dplyr::group_by({{ condition }}) %>%
         dplyr::mutate(median = stats::median(.data$values)) %>%
         dplyr::distinct()

diff --git a/R/try_query.R b/R/try_query.R
@@ -13,7 +13,6 @@
 #' @param type a character value that specifies the type of data at the target URL. Options are
 #' all options that can be supplied to httr::content, these include e.g.
 #' "text/tab-separated-values", "application/json" and "txt/csv". Default is "text/tab-separated-values".
-#' Default is "tab-separated-values".
 #' @param timeout a numeric value that specifies the maximum request time. Default is 60 seconds.
 #' @param accept a character value that specifies the type of data that should be sent by the API if
 #' it uses content negotiation. The default is NULL and it should only be set for APIs that use
@@ -22,6 +21,7 @@
 #'
 #' @importFrom curl has_internet
 #' @importFrom httr GET timeout http_error message_for_status http_status content accept
+#' @importFrom readr read_tsv read_csv
 #'
 #' @return A data frame that contains the table from the url.
 try_query <-
@@ -88,7 +88,38 @@ try_query <-
     # Change variable to not show progress if readr is used
     options(readr.show_progress = FALSE)
 
-    result <- suppressMessages(httr::content(query_result, type = type, encoding = "UTF-8", ...))
+    # Retrieve the content as raw bytes using httr::content
+    raw_content <- httr::content(query_result, type = "raw")
+    # Check for gzip magic number (1f 8b) before decompression
+    compressed <- length(raw_content) >= 2 && raw_content[1] == as.raw(0x1f) && raw_content[2] == as.raw(0x8b)
+
+    # Check if the content is gzip compressed
+    if (!is.null(query_result$headers[["content-encoding"]]) && query_result$headers[["content-encoding"]] == "gzip" && compressed) {
+      # Decompress the raw content using base R's `memDecompress`
+      decompressed_content <- memDecompress(raw_content, type = "gzip")
+
+      # Convert the raw bytes to a character string
+      text_content <- rawToChar(decompressed_content)
+
+      # Read the decompressed content based on the specified type
+      if (type == "text/tab-separated-values") {
+        result <- readr::read_tsv(text_content, ...)
+      } else if (type == "text/html") {
+        result <- xml2::read_html(text_content, ...)
+      } else if (type == "text/xml") {
+        result <- xml2::read_xml(text_content, ...)
+      } else if (type == "text/csv" || type == "txt/csv") {
+        result <- readr::read_csv(text_content, ...)
+      } else if (type == "application/json") {
+        result <- jsonlite::fromJSON(text_content, ...) # Using jsonlite for JSON parsing
+      } else if (type == "text") {
+        result <- text_content # Return raw text as-is
+      } else {
+        stop("Unsupported content type: ", type)
+      }
+    } else {
+      result <- suppressMessages(httr::content(query_result, type = type, encoding = "UTF-8", ...))
+    }
 
     return(result)
   }
diff --git a/man/impute.Rd b/man/impute.Rd