Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
3feb11f
add randomforest
elena-krismer Sep 24, 2024
4f2a737
Style code (GHA)
elena-krismer Oct 10, 2024
86d58ae
add random forest imputation
elena-krismer Oct 10, 2024
cce817c
Merge branch 'random_forest_imputation' of https://github.com/jpquast…
elena-krismer Oct 10, 2024
2feb1a4
Style code (GHA)
elena-krismer Oct 10, 2024
3065a3b
fix vroom problem
jpquast Sep 29, 2024
e1c9f04
Merge branch 'random_forest_imputation' of https://github.com/jpquast…
elena-krismer Oct 10, 2024
c113f21
Style code (GHA)
elena-krismer Oct 10, 2024
e4ad95e
Add xml2 and jsonlite to suggests
jpquast Sep 29, 2024
91fbd34
Fixed another bug in try_query
jpquast Sep 29, 2024
93eb66a
Fix some issues
jpquast Sep 30, 2024
70dd1ca
Merge branch 'random_forest_imputation' of https://github.com/jpquast…
elena-krismer Oct 10, 2024
d704131
Style code (GHA)
elena-krismer Oct 10, 2024
2ae279e
fix syntax
elena-krismer Oct 10, 2024
76950ae
Merge branch 'random_forest_imputation' of https://github.com/jpquast…
elena-krismer Oct 10, 2024
1db1f18
Style code (GHA)
elena-krismer Oct 10, 2024
5e61077
update tests
elena-krismer Oct 13, 2024
5e6403a
Style code (GHA)
elena-krismer Oct 13, 2024
947fc64
update getch_mobidb results
elena-krismer Oct 13, 2024
fe991c4
Merge branch 'random_forest_imputation' of https://github.com/jpquast…
elena-krismer Oct 13, 2024
67a8f35
update function description
elena-krismer Oct 13, 2024
0616d45
Style code (GHA)
elena-krismer Oct 13, 2024
122f73a
set names repair to minimal
elena-krismer Oct 14, 2024
989fe45
Merge branch 'random_forest_imputation' of https://github.com/jpquast…
elena-krismer Oct 14, 2024
3df6bb5
fix select all warning
elena-krismer Nov 4, 2024
d46ba11
fix column renaiming
elena-krismer Nov 4, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 6 additions & 3 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,9 @@ Imports:
httr,
methods,
R.utils,
stats
RoxygenNote: 7.3.1
stats,
missForest
RoxygenNote: 7.3.2
Suggests:
testthat,
covr,
Expand All @@ -67,7 +68,9 @@ Suggests:
iq,
scales,
farver,
ggforce
ggforce,
xml2,
jsonlite
Depends:
R (>= 4.0)
URL: https://github.com/jpquast/protti, https://jpquast.github.io/protti/
Expand Down
3 changes: 3 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ export(find_peptide_in_structure)
export(fit_drc_4p)
export(go_enrichment)
export(impute)
export(impute_randomforest)
export(kegg_enrichment)
export(map_peptides_on_structure)
export(median_normalisation)
Expand Down Expand Up @@ -78,6 +79,7 @@ export(volcano_protti)
export(woods_plot)
import(dplyr)
import(ggplot2)
import(missForest)
import(progress)
import(purrr)
import(stringr)
Expand Down Expand Up @@ -134,6 +136,7 @@ importFrom(purrr,pluck)
importFrom(purrr,pmap)
importFrom(purrr,reduce)
importFrom(purrr,set_names)
importFrom(readr,read_csv)
importFrom(readr,read_tsv)
importFrom(readr,write_csv)
importFrom(readr,write_tsv)
Expand Down
4 changes: 4 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
# protti ***
## New features
* `impute_randomforest()` was added. It imputes missing values with a random forest approach based on the `missForest` package.

# protti 0.9.0

## New features
Expand Down
4 changes: 3 additions & 1 deletion R/impute.R
Original file line number Diff line number Diff line change
Expand Up @@ -102,8 +102,10 @@ impute <- function(data,
noise = NULL,
method = "ludovic",
skip_log2_transform_error = FALSE,
retain_columns = NULL) {
retain_columns = NULL,
...) {
noise_missing <- missing(noise) # check if argument noise was provided or not

result <- data %>%
dplyr::distinct(
{{ sample }},
Expand Down
157 changes: 157 additions & 0 deletions R/impute_randomforest.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,157 @@
#' Imputation of Missing Values Using Random Forests
#'
#' \code{impute_randomforest} performs imputation for missing values in the data using the random
#' forest-based method implemented in the \code{missForest} package.
#'
#' The function imputes missing values by building random forests, where missing values are
#' predicted based on other available values within the dataset. For each variable with missing
#' data, the function trains a random forest model using the available (non-missing) data in
#' that variable, and subsequently predicts the missing values.
#'
#' Internally the data is reshaped to a wide table with one row per sample and one column per
#' \code{grouping} value, imputed with \code{missForest::missForest()} and reshaped back to long
#' format. As a consequence, combinations of sample and grouping that have no row in \code{data}
#' are treated as missing values and are imputed as well.
#'
#' In addition to the imputed values, users can choose to retain additional columns from the
#' original input data frame that were not part of the imputation process.
#'
#' This function allows passing additional parameters to the underlying \code{missForest} function,
#' such as controlling the number of trees used in the random forest models or specifying the
#' stopping criteria. For a full list of parameters, refer to the \code{missForest} documentation.
#'
#' To enable parallelization, ensure that the `doParallel` package is installed and loaded:
#' ```
#' install.packages("doParallel")
#' library(doParallel)
#' ```
#' Then register the desired number of cores for parallel processing:
#' ```
#' registerDoParallel(cores = 6)
#' ```
#' To leverage parallelization during the imputation, pass `parallelize = "variables"`
#' to `impute_randomforest()`. It is forwarded to `missForest` through `...`.
#'
#' @param data A data frame that contains the input variables. This should include columns for
#' the sample names, precursor or peptide identifiers, and intensity values. Each combination of
#' sample and grouping must map to a single intensity value.
#' @param sample A character column in the \code{data} data frame that contains the sample names.
#' @param grouping A character column in the \code{data} data frame that contains the precursor or
#' peptide identifiers.
#' @param intensity_log2 A numeric column in the \code{data} data frame that contains the intensity
#' values.
#' @param retain_columns A character vector indicating which columns should be retained from the
#' input data frame. These columns will be preserved in the output alongside the imputed values.
#' By default, no additional columns are retained (\code{retain_columns = NULL}), but specific
#' columns can be retained by providing their names as a vector.
#' @param ... Additional parameters to pass to the \code{missForest} function. These parameters
#' can control aspects such as the number of trees (\code{ntree}) and the stopping criteria
#' (\code{maxiter}).
#'
#' @return A data frame in long format that contains the \code{sample} column, the
#' \code{grouping} column and a column named like the \code{intensity_log2} input. In that column
#' missing values are replaced by the imputed values while observed values are returned unchanged.
#' The output has one row per combination of sample and grouping and does not contain a separate
#' column that flags imputed values. Columns requested via \code{retain_columns} are joined to the
#' output by sample and grouping.
#'
#' @references
#' Stekhoven, D.J., & Bühlmann, P. (2012). MissForest—non-parametric missing value imputation
#' for mixed-type data. Bioinformatics, 28(1), 112-118. https://doi.org/10.1093/bioinformatics/btr597
#'
#' @import dplyr
#' @importFrom rlang .data
#' @importFrom magrittr %>%
#' @importFrom stringr str_sort
#' @import missForest
#' @export
#'
#' @examples
#' set.seed(123) # Makes example reproducible
#'
#' # Create example data
#' data <- create_synthetic_data(
#'   n_proteins = 10,
#'   frac_change = 0.5,
#'   n_replicates = 4,
#'   n_conditions = 2,
#'   method = "effect_random",
#'   additional_metadata = FALSE
#' )
#'
#' head(data, n = 24)
#'
#' # Perform imputation
#' data_imputed <- impute_randomforest(
#'   data,
#'   sample = sample,
#'   grouping = peptide,
#'   intensity_log2 = peptide_intensity_missing
#' )
#'
#' head(data_imputed, n = 24)
impute_randomforest <- function(
    data,
    sample,
    grouping,
    intensity_log2,
    retain_columns = NULL,
    ...) {
  # Capture the column arguments once. The string forms are needed wherever a
  # column has to be addressed by name instead of by symbol.
  sample_sym <- rlang::ensym(sample)
  grouping_sym <- rlang::ensym(grouping)
  intensity_sym <- rlang::ensym(intensity_log2)
  sample_name <- rlang::as_string(sample_sym)
  grouping_name <- rlang::as_string(grouping_sym)
  intensity_name <- rlang::as_string(intensity_sym)

  # Collapse exact duplicate rows. distinct() keeps first-appearance order, so
  # the wide layout below is unchanged for inputs that are already unique.
  intensity_long <- data %>%
    dplyr::distinct(!!sample_sym, !!grouping_sym, !!intensity_sym)

  # Conflicting intensities for one sample/grouping pair would become
  # list-columns in pivot_wider() and end up being imputed as if they were
  # missing, so refuse them explicitly.
  key_columns <- data.frame(
    sample = intensity_long[[sample_name]],
    grouping = intensity_long[[grouping_name]]
  )
  if (anyDuplicated(key_columns) > 0) {
    stop(
      "Each combination of \"", sample_name, "\" and \"", grouping_name,
      "\" must have exactly one \"", intensity_name, "\" value.",
      call. = FALSE
    )
  }

  # Pivot to wide format once: one row per sample, one column per grouping
  data_wide <- intensity_long %>%
    tidyr::pivot_wider(
      id_cols = !!sample_sym,
      names_from = !!grouping_sym,
      values_from = !!intensity_sym,
      names_repair = "minimal"
    )

  # Keep the sample column aside; it must not take part in the imputation
  sample_values <- data_wide[[sample_name]]
  intensity_wide <- data_wide %>%
    dplyr::select(-!!sample_sym)

  # missForest needs a plain data frame of numeric columns. Numeric columns
  # are passed on as they are because a round trip through as.character()
  # can change the observed values. Only other column types are coerced, and
  # the resulting "NAs introduced by coercion" warnings are silenced.
  to_numeric <- function(x) {
    if (is.numeric(x)) {
      as.numeric(x)
    } else {
      suppressWarnings(as.numeric(as.character(x)))
    }
  }
  intensity_wide <- data.frame(
    lapply(intensity_wide, to_numeric),
    check.names = FALSE
  )

  # Perform the random forest imputation and extract the completed table
  data_imputed <- missForest::missForest(intensity_wide, ...)$ximp

  # Add the sample column back and convert to long format
  data_imputed[[sample_name]] <- sample_values
  data_imputed <- data_imputed %>%
    as.data.frame() %>%
    tidyr::pivot_longer(
      cols = -dplyr::all_of(sample_name),
      names_to = grouping_name,
      values_to = intensity_name
    )

  if (is.null(retain_columns)) {
    return(data_imputed)
  }

  # Join the retained columns. The original intensity column is dropped so it
  # cannot clash with the imputed one.
  data_to_join <- data %>%
    dplyr::select(
      dplyr::all_of(retain_columns),
      !!grouping_sym,
      !!sample_sym,
      -!!intensity_sym
    ) %>%
    dplyr::distinct()

  data_imputed %>%
    dplyr::left_join(
      data_to_join,
      by = c(sample_name, grouping_name)
    )
}
2 changes: 1 addition & 1 deletion R/qc_cvs.R
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@ The function does not handle log2 transformed data.",
dplyr::mutate({{ condition }} := forcats::fct_expand({{ condition }}, "combined")) %>%
dplyr::mutate({{ condition }} := replace({{ condition }}, .data$type == "cv_combined", "combined")) %>%
dplyr::mutate({{ condition }} := forcats::fct_relevel({{ condition }}, "combined")) %>%
dplyr::select(-.data$type) %>%
dplyr::select(-"type") %>%
dplyr::group_by({{ condition }}) %>%
dplyr::mutate(median = stats::median(.data$values)) %>%
dplyr::distinct()
Expand Down
35 changes: 33 additions & 2 deletions R/try_query.R
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
#' @param type a character value that specifies the type of data at the target URL. Options are
#' all options that can be supplied to httr::content, these include e.g.
#' "text/tab-separated-values", "application/json" and "txt/csv". Default is "text/tab-separated-values".
#' Default is "tab-separated-values".
#' @param timeout a numeric value that specifies the maximum request time. Default is 60 seconds.
#' @param accept a character value that specifies the type of data that should be sent by the API if
#' it uses content negotiation. The default is NULL and it should only be set for APIs that use
Expand All @@ -22,6 +21,7 @@
#'
#' @importFrom curl has_internet
#' @importFrom httr GET timeout http_error message_for_status http_status content accept
#' @importFrom readr read_tsv read_csv
#'
#' @return A data frame that contains the table from the url.
try_query <-
Expand Down Expand Up @@ -88,7 +88,38 @@ try_query <-
# Change variable to not show progress if readr is used
options(readr.show_progress = FALSE)

result <- suppressMessages(httr::content(query_result, type = type, encoding = "UTF-8", ...))
# Retrieve the content as raw bytes using httr::content
raw_content <- httr::content(query_result, type = "raw")
# Check for gzip magic number (1f 8b) before decompression
compressed <- length(raw_content) >= 2 && raw_content[1] == as.raw(0x1f) && raw_content[2] == as.raw(0x8b)

# Check if the content is gzip compressed
if (!is.null(query_result$headers[["content-encoding"]]) && query_result$headers[["content-encoding"]] == "gzip" && compressed) {
# Decompress the raw content using base R's `memDecompress`
decompressed_content <- memDecompress(raw_content, type = "gzip")

# Convert the raw bytes to a character string
text_content <- rawToChar(decompressed_content)

# Read the decompressed content based on the specified type
if (type == "text/tab-separated-values") {
result <- readr::read_tsv(text_content, ...)
} else if (type == "text/html") {
result <- xml2::read_html(text_content, ...)
} else if (type == "text/xml") {
result <- xml2::read_xml(text_content, ...)
} else if (type == "text/csv" || type == "txt/csv") {
result <- readr::read_csv(text_content, ...)
} else if (type == "application/json") {
result <- jsonlite::fromJSON(text_content, ...) # Using jsonlite for JSON parsing
} else if (type == "text") {
result <- text_content # Return raw text as-is
} else {
stop("Unsupported content type: ", type)
}
} else {
result <- suppressMessages(httr::content(query_result, type = type, encoding = "UTF-8", ...))
}

return(result)
}
3 changes: 2 additions & 1 deletion man/impute.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading