diff --git a/DESCRIPTION b/DESCRIPTION index 0faf4c66..bdf7eb33 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -42,8 +42,9 @@ Imports: httr, methods, R.utils, - stats -RoxygenNote: 7.3.1 + stats, + missForest +RoxygenNote: 7.3.2 Suggests: testthat, covr, @@ -67,7 +68,9 @@ Suggests: iq, scales, farver, - ggforce + ggforce, + xml2, + jsonlite Depends: R (>= 4.0) URL: https://github.com/jpquast/protti, https://jpquast.github.io/protti/ diff --git a/NAMESPACE b/NAMESPACE index 09754811..128dacbb 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -38,6 +38,7 @@ export(find_peptide_in_structure) export(fit_drc_4p) export(go_enrichment) export(impute) +export(impute_randomforest) export(kegg_enrichment) export(map_peptides_on_structure) export(median_normalisation) @@ -78,6 +79,7 @@ export(volcano_protti) export(woods_plot) import(dplyr) import(ggplot2) +import(missForest) import(progress) import(purrr) import(stringr) @@ -134,6 +136,7 @@ importFrom(purrr,pluck) importFrom(purrr,pmap) importFrom(purrr,reduce) importFrom(purrr,set_names) +importFrom(readr,read_csv) importFrom(readr,read_tsv) importFrom(readr,write_csv) importFrom(readr,write_tsv) diff --git a/NEWS.md b/NEWS.md index f947cda2..c49a71e1 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,7 @@ +# protti *** +## New features +* `impute_randomforest()` new imputation method random forest using `missForest`. + # protti 0.9.0 ## New features diff --git a/R/impute.R b/R/impute.R index 5bfd7393..5dd02fab 100644 --- a/R/impute.R +++ b/R/impute.R @@ -102,8 +102,10 @@ impute <- function(data, noise = NULL, method = "ludovic", skip_log2_transform_error = FALSE, - retain_columns = NULL) { + retain_columns = NULL, + ...) 
{ noise_missing <- missing(noise) # check if argument noise was provided or not + result <- data %>% dplyr::distinct( {{ sample }}, diff --git a/R/impute_randomforest.R b/R/impute_randomforest.R new file mode 100644 index 00000000..a78d4b34 --- /dev/null +++ b/R/impute_randomforest.R @@ -0,0 +1,157 @@ +#' Imputation of Missing Values Using Random Forest Imputation +#' +#' \code{impute_randomforest} performs imputation for missing values in the data using the random +#' forest-based method implemented in the \code{missForest} package. +#' +#' The function imputes missing values by building random forests, where missing values are +#' predicted based on other available values within the dataset. For each variable with missing +#' data, the function trains a random forest model using the available (non-missing) data in +#' that variable, and subsequently predicts the missing values. +#' +#' In addition to the imputed values, users can choose to retain additional columns from the +#' original input data frame that were not part of the imputation process. +#' +#' This function allows passing additional parameters to the underlying \code{missForest} function, +#' such as controlling the number of trees used in the random forest models or specifying the +#' stopping criteria. For a full list of parameters, refer to the \code{missForest} documentation. +#' +#' To enable parallelization, ensure that the `doParallel` package is installed and loaded: +#' ``` +#' install.packages("doParallel") +#' library(doParallel) +#' ``` +#' Then register the desired number of cores for parallel processing: +#' ``` +#' registerDoParallel(cores = 6) +#' ``` +#' To leverage parallelization during the imputation, pass `parallelize = "variables"` +#' as an argument to the `missForest` function. +# ` +#' +#' Stekhoven, D.J., & Bühlmann, P. (2012). MissForest—non-parametric missing value imputation +#' for mixed-type data. Bioinformatics, 28(1), 112-118. 
https://doi.org/10.1093/bioinformatics/btr597 +#' +#' @param data A data frame that contains the input variables. This should include columns for +#' the sample names, precursor or peptide identifiers, and intensity values. +#' @param sample A character column in the \code{data} data frame that contains the sample names. +#' @param grouping A character column in the \code{data} data frame that contains the precursor or +#' peptide identifiers. +#' @param intensity_log2 A numeric column in the \code{data} data frame that contains the intensity +#' values. +#' @param retain_columns A character vector indicating which columns should be retained from the +#' input data frame. These columns will be preserved in the output alongside the imputed values. +#' By default, no additional columns are retained (\code{retain_columns = NULL}), but specific +#' columns can be retained by providing their names as a vector. +#' @param ... Additional parameters to pass to the \code{missForest} function. These parameters +#' can control aspects such as the number of trees (\code{ntree}) and the stopping criteria +#' (\code{maxiter}). +#' +#' @return A data frame in long format with one row per combination of sample and grouping. The +#' sample, grouping and intensity columns keep the names they have in \code{data}; the intensity +#' column holds the observed values with missing values replaced by the random forest +#' predictions. No separate \code{imputed_intensity} or \code{imputed} column is created. Any +#' columns requested via \code{retain_columns} are joined to the output. 
#'
#' @import dplyr
#' @importFrom rlang .data
#' @importFrom magrittr %>%
#' @importFrom stringr str_sort
#' @import missForest
#' @export
#'
#' @examples
#' set.seed(123) # Makes example reproducible
#'
#' # Create example data
#' data <- create_synthetic_data(
#'   n_proteins = 10,
#'   frac_change = 0.5,
#'   n_replicates = 4,
#'   n_conditions = 2,
#'   method = "effect_random",
#'   additional_metadata = FALSE
#' )
#'
#' head(data, n = 24)
#'
#' # Perform imputation
#' data_imputed <- impute_randomforest(
#'   data,
#'   sample = sample,
#'   grouping = peptide,
#'   intensity_log2 = peptide_intensity_missing
#' )
#'
#' head(data_imputed, n = 24)
impute_randomforest <- function(
    data,
    sample,
    grouping,
    intensity_log2,
    retain_columns = NULL,
    ...) {
  # Capture the bare column names once. The string forms are reused for
  # subsetting, tidyselect and as join keys.
  sample_sym <- rlang::ensym(sample)
  grouping_sym <- rlang::ensym(grouping)
  intensity_sym <- rlang::ensym(intensity_log2)

  sample_name <- as.character(sample_sym)
  grouping_name <- as.character(grouping_sym)
  intensity_name <- as.character(intensity_sym)

  # Every sample/grouping pair must map to exactly one intensity. Duplicates
  # would make pivot_wider() return list-columns, which the numeric coercion
  # below would turn into NA, so measured values would silently be replaced by
  # imputed ones.
  key_columns <- as.data.frame(data)[c(sample_name, grouping_name)]
  if (anyDuplicated(key_columns) > 0) {
    stop(
      "Each combination of sample and grouping must occur only once in the data. ",
      "Duplicated combinations cannot be arranged into a sample x grouping table ",
      "for imputation.",
      call. = FALSE
    )
  }

  # Pivot once: one row per sample, one column per grouping value. The sample
  # column is kept aside so it can be re-attached after imputation.
  data_wide <- data %>%
    tidyr::pivot_wider(
      id_cols = !!sample_sym,
      names_from = !!grouping_sym,
      values_from = !!intensity_sym,
      names_repair = "minimal"
    )

  sample_values <- dplyr::pull(data_wide, !!sample_sym)
  intensity_wide <- dplyr::select(data_wide, -!!sample_sym)

  # missForest needs a plain data frame of numeric columns. Numeric input is
  # passed through as is: a round trip through as.character() keeps only 15
  # significant digits and would alter the observed values. Non-numeric input
  # (e.g. character or factor) is still coerced on a best-effort basis, with
  # unparsable entries becoming NA.
  intensity_wide <- data.frame(
    lapply(intensity_wide, function(x) {
      if (is.numeric(x)) {
        as.numeric(x)
      } else {
        suppressWarnings(as.numeric(as.character(x)))
      }
    }),
    check.names = FALSE
  )

  # Run the random forest imputation; `...` is forwarded unchanged.
  forest_fit <- missForest::missForest(intensity_wide, ...)

  # `ximp` is the completed table. Put the sample identifiers back in the
  # same row order as the wide table they were taken from.
  imputed_wide <- as.data.frame(forest_fit$ximp)
  imputed_wide[[sample_name]] <- sample_values

  # Back to long format, restoring the original column names.
  data_imputed <- tidyr::pivot_longer(
    imputed_wide,
    cols = -dplyr::all_of(sample_name),
    names_to = grouping_name,
    values_to = intensity_name
  )

  if (is.null(retain_columns)) {
    return(data_imputed)
  }

  # Attach the requested annotation columns. The original intensity column is
  # dropped so it cannot clash with the imputed one.
  data_to_join <- data %>%
    dplyr::select(
      dplyr::all_of(retain_columns),
      !!grouping_sym,
      !!sample_sym,
      -!!intensity_sym
    ) %>%
    dplyr::distinct()

  dplyr::left_join(
    data_imputed,
    data_to_join,
    by = c(sample_name, grouping_name)
  )
}

#' @references
#' Stekhoven, D.J., & Bühlmann, P. (2012). MissForest—non-parametric missing value imputation
#' for mixed-type data. Bioinformatics, 28(1), 112-118.
https://doi.org/10.1093/bioinformatics/btr597 diff --git a/R/qc_cvs.R b/R/qc_cvs.R index 713303d3..3ffa1ff2 100644 --- a/R/qc_cvs.R +++ b/R/qc_cvs.R @@ -122,7 +122,7 @@ The function does not handle log2 transformed data.", dplyr::mutate({{ condition }} := forcats::fct_expand({{ condition }}, "combined")) %>% dplyr::mutate({{ condition }} := replace({{ condition }}, .data$type == "cv_combined", "combined")) %>% dplyr::mutate({{ condition }} := forcats::fct_relevel({{ condition }}, "combined")) %>% - dplyr::select(-.data$type) %>% + dplyr::select(-"type") %>% dplyr::group_by({{ condition }}) %>% dplyr::mutate(median = stats::median(.data$values)) %>% dplyr::distinct() diff --git a/R/try_query.R b/R/try_query.R index 8016a00b..ad72c9ca 100644 --- a/R/try_query.R +++ b/R/try_query.R @@ -13,7 +13,6 @@ #' @param type a character value that specifies the type of data at the target URL. Options are #' all options that can be supplied to httr::content, these include e.g. #' "text/tab-separated-values", "application/json" and "txt/csv". Default is "text/tab-separated-values". -#' Default is "tab-separated-values". #' @param timeout a numeric value that specifies the maximum request time. Default is 60 seconds. #' @param accept a character value that specifies the type of data that should be sent by the API if #' it uses content negotiation. The default is NULL and it should only be set for APIs that use @@ -22,6 +21,7 @@ #' #' @importFrom curl has_internet #' @importFrom httr GET timeout http_error message_for_status http_status content accept +#' @importFrom readr read_tsv read_csv #' #' @return A data frame that contains the table from the url. 
try_query <- @@ -88,7 +88,38 @@ try_query <- # Change variable to not show progress if readr is used options(readr.show_progress = FALSE) - result <- suppressMessages(httr::content(query_result, type = type, encoding = "UTF-8", ...)) + # Retrieve the content as raw bytes using httr::content + raw_content <- httr::content(query_result, type = "raw") + # Check for gzip magic number (1f 8b) before decompression + compressed <- length(raw_content) >= 2 && raw_content[1] == as.raw(0x1f) && raw_content[2] == as.raw(0x8b) + + # Check if the content is gzip compressed + if (!is.null(query_result$headers[["content-encoding"]]) && query_result$headers[["content-encoding"]] == "gzip" && compressed) { + # Decompress the raw content using base R's `memDecompress` + decompressed_content <- memDecompress(raw_content, type = "gzip") + + # Convert the raw bytes to a character string + text_content <- rawToChar(decompressed_content) + + # Read the decompressed content based on the specified type + if (type == "text/tab-separated-values") { + result <- readr::read_tsv(text_content, ...) + } else if (type == "text/html") { + result <- xml2::read_html(text_content, ...) + } else if (type == "text/xml") { + result <- xml2::read_xml(text_content, ...) + } else if (type == "text/csv" || type == "txt/csv") { + result <- readr::read_csv(text_content, ...) + } else if (type == "application/json") { + result <- jsonlite::fromJSON(text_content, ...) 
# Using jsonlite for JSON parsing + } else if (type == "text") { + result <- text_content # Return raw text as-is + } else { + stop("Unsupported content type: ", type) + } + } else { + result <- suppressMessages(httr::content(query_result, type = type, encoding = "UTF-8", ...)) + } return(result) } diff --git a/man/impute.Rd b/man/impute.Rd index 188820db..8bef561b 100644 --- a/man/impute.Rd +++ b/man/impute.Rd @@ -15,7 +15,8 @@ impute( noise = NULL, method = "ludovic", skip_log2_transform_error = FALSE, - retain_columns = NULL + retain_columns = NULL, + ... ) } \arguments{ diff --git a/man/impute_randomforest.Rd b/man/impute_randomforest.Rd new file mode 100644 index 00000000..59e22895 --- /dev/null +++ b/man/impute_randomforest.Rd @@ -0,0 +1,100 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/impute_randomforest.R +\name{impute_randomforest} +\alias{impute_randomforest} +\title{Imputation of Missing Values Using Random Forest Imputation} +\usage{ +impute_randomforest( + data, + sample, + grouping, + intensity_log2, + retain_columns = NULL, + ... +) +} +\arguments{ +\item{data}{A data frame that contains the input variables. This should include columns for +the sample names, precursor or peptide identifiers, and intensity values.} + +\item{sample}{A character column in the \code{data} data frame that contains the sample names.} + +\item{grouping}{A character column in the \code{data} data frame that contains the precursor or +peptide identifiers.} + +\item{intensity_log2}{A numeric column in the \code{data} data frame that contains the intensity +values.} + +\item{retain_columns}{A character vector indicating which columns should be retained from the +input data frame. These columns will be preserved in the output alongside the imputed values. 
+By default, no additional columns are retained (\code{retain_columns = NULL}), but specific +columns can be retained by providing their names as a vector.} + +\item{...}{Additional parameters to pass to the \code{missForest} function. These parameters +can control aspects such as the number of trees (\code{ntree}) and the stopping criteria +(\code{maxiter}).} +} +\value{ +A data frame that contains an \code{imputed_intensity} column with the imputed values +and an \code{imputed} column indicating whether each value was imputed (\code{TRUE}) or not +(\code{FALSE}), in addition to any columns retained via \code{retain_columns}. +} +\description{ +\code{impute_randomforest} performs imputation for missing values in the data using the random +forest-based method implemented in the \code{missForest} package. +} +\details{ +The function imputes missing values by building random forests, where missing values are +predicted based on other available values within the dataset. For each variable with missing +data, the function trains a random forest model using the available (non-missing) data in +that variable, and subsequently predicts the missing values. + +In addition to the imputed values, users can choose to retain additional columns from the +original input data frame that were not part of the imputation process. + +This function allows passing additional parameters to the underlying \code{missForest} function, +such as controlling the number of trees used in the random forest models or specifying the +stopping criteria. For a full list of parameters, refer to the \code{missForest} documentation. + +To enable parallelization, ensure that the \code{doParallel} package is installed and loaded: + +\if{html}{\out{