Commit 6619e69

Update version to 2.1.1 and enhance scale_abundance function with chunk processing
- Bumped package version to 2.1.1.
- Added chunk_sample_size parameter to scale_abundance for memory-efficient processing of large datasets.
- Improved documentation to include new parameter details.
- Refactored scaling logic to support chunked processing with BiocParallel for better performance.
1 parent 2520947 commit 6619e69
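
A minimal usage sketch of the new parameter (hypothetical: assumes a SummarizedExperiment `se` with a raw-count assay; the call signature follows this commit):

# Optionally register a parallel backend first, as the function's own message suggests
BiocParallel::register(BiocParallel::MulticoreParam(workers = 4, progressbar = TRUE))

# Chunked, memory-efficient scaling: 50 samples per chunk.
# The default chunk_sample_size = Inf keeps the previous unchunked behaviour.
se_scaled <- tidybulk::scale_abundance(se, chunk_sample_size = 50)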

File tree

5 files changed: +207 -139 lines


DESCRIPTION

Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,7 @@
 Type: Package
 Package: tidybulk
 Title: Brings transcriptomics to the tidyverse
-Version: 2.1.0
+Version: 2.1.1
 Authors@R: c(person("Stefano", "Mangiola", email = "[email protected]",
     role = c("aut", "cre")),
     person("Maria", "Doyle", email = "[email protected]",

NAMESPACE

Lines changed: 2 additions & 0 deletions
@@ -96,6 +96,7 @@ importFrom(ggplot2,scale_x_continuous)
 importFrom(ggplot2,scale_y_continuous)
 importFrom(lifecycle,deprecate_soft)
 importFrom(lifecycle,deprecate_warn)
+importFrom(lifecycle,is_present)
 importFrom(magrittr,"%$%")
 importFrom(magrittr,divide_by)
 importFrom(magrittr,equals)
@@ -127,6 +128,7 @@ importFrom(rlang,enquo)
 importFrom(rlang,inform)
 importFrom(rlang,is_quosure)
 importFrom(rlang,quo)
+importFrom(rlang,quo_get_expr)
 importFrom(rlang,quo_is_missing)
 importFrom(rlang,quo_is_null)
 importFrom(rlang,quo_is_symbol)

R/scale_abundance.R

Lines changed: 116 additions & 132 deletions
@@ -17,6 +17,7 @@
 #' @param reference_sample A character string. The name of the reference sample. If NULL the sample with highest total read count will be selected as reference.
 #' @param .subset_for_scaling A gene-wise quosure condition. This will be used to filter rows (features/genes) of the dataset. For example
 #' @param suffix A character string to append to the scaled abundance column name. Default is "_scaled".
+#' @param chunk_sample_size An integer indicating how many samples to process per chunk. Default is `Inf` (no chunking). For HDF5-backed data or large datasets, set to a finite value (e.g., 50) to enable memory-efficient chunked processing with BiocParallel parallelization.
 #'
 #' @param reference_selection_function DEPRECATED. please use reference_sample.
 #' @param ... Further arguments.
@@ -68,6 +69,7 @@ setGeneric("scale_abundance", function(.data,
     reference_sample = NULL,
     .subset_for_scaling = NULL,
     suffix = "_scaled",
+    chunk_sample_size = Inf,
     # DEPRECATED
     reference_selection_function = NULL,
     ...,
@@ -107,7 +109,7 @@ setGeneric("scale_abundance", function(.data,
     reference_sample = NULL,
     .subset_for_scaling = NULL,
     suffix = "_scaled",
-    chunk_size = Inf,
+    chunk_sample_size = Inf,
     # DEPRECATED
     reference_selection_function = NULL,
     ...,
@@ -161,12 +163,15 @@ setGeneric("scale_abundance", function(.data,
   if (nrow(.data_filtered) == 0)
     stop("tidybulk says: there are 0 genes that passes the filters (.abundant and/or .subset_for_scaling). Please check your filtering or your data.")
 
-  # Determine the correct assay name
-  my_counts_filtered = assays(.data_filtered)[[my_assay]] |> na.omit()
-  library_size_filtered = my_counts_filtered |> colSums(na.rm = TRUE)
-
+  # Get the rownames of features that passed filtering
+  features_to_use <- rownames(.data_filtered)
+
+  # We just carry the gene names, no need to stress the memory with the full data frame
+  rm(.data_filtered)
+
+
   # If not enough genes, warning
-  if(nrow(my_counts_filtered)<100) warning(warning_for_scaling_with_few_genes)
+  if(length(features_to_use)<100) warning(warning_for_scaling_with_few_genes)
 
   # Set column name for value scaled
   value_scaled = paste0(my_assay, suffix)
@@ -175,152 +180,102 @@ setGeneric("scale_abundance", function(.data,
   reference <-
     reference_sample
 
-  if (is.null(reference))
+  if (is.null(reference)){
+    # Get filtered counts for reference selection
+    library_size_filtered = assays(.data[features_to_use, ])[[my_assay]] |> colSums(na.rm = TRUE)
     reference = library_size_filtered |>
       sort() |>
       tail(1) |>
       names()
+  }
+
 
 
   # Communicate the reference if chosen by default
   if(is.null(reference_sample)) message(sprintf("tidybulk says: the sample with largest library size %s was chosen as reference for scaling", reference))
 
-  # Calculate TMM
-  nf <-
-    edgeR::calcNormFactors(
-      my_counts_filtered,
-      refColumn = reference,
-      method = method
-    )
-
-  # Calculate multiplier
-  multiplier =
-    # Relecting the ratio of effective library size of the reference sample to the effective library size of each sample
-    (library_size_filtered[reference] * nf[reference]) |>
-    divide_by(library_size_filtered * nf)
-
-  # Add to sample info
-  colData(.data)$TMM = nf
-  colData(.data)$multiplier = multiplier
-
-  # Check if input is DelayedArray to preserve memory efficiency
-  input_assay <- assay(.data, my_assay)
-  is_delayed <- is(input_assay, "DelayedArray")
-
-  if (is_delayed) {
-    # For DelayedArray, use sweep to create a delayed operation
-    check_and_install_packages("DelayedArray")
-    my_counts_scaled <- list(
-      DelayedArray::sweep(input_assay, 2, multiplier, "*")
-    ) |> setNames(value_scaled)
-  } else {
-    # For regular matrices, use matrix multiplication
-    my_counts_scaled =
-      list(
-        input_assay %*%
-          diag(multiplier)
-
-      ) |>
-      setNames(value_scaled)
-  }
+
+  # Calculate TMM normalization factors once for all samples
+  chunk_counts_filtered <- assays(.data[features_to_use, ])[[my_assay]] |> na.omit()
+  chunk_library_size <- chunk_counts_filtered |> colSums(na.rm = TRUE)
 
-  colnames(my_counts_scaled[[1]]) = colnames(input_assay)
+  nf <- edgeR::calcNormFactors(
+    chunk_counts_filtered,
+    refColumn = reference,
+    method = method
+  )
 
-  scaled_symbol <- rlang::sym(value_scaled)
-  scaled_quosure <- drop_enquo_env(rlang::new_quosure(scaled_symbol))
+  # Calculate multiplier for all samples
+  multiplier <-
+    (chunk_library_size[reference] * nf[reference]) |>
+    divide_by(chunk_library_size * nf)
 
-  # Add the assay
-  assays(.data, withDimnames=FALSE) = assays(.data) |> c(my_counts_scaled)
+  # Add to sample info
+  colData(.data)$TMM <- nf
+  colData(.data)$multiplier <- multiplier
 
-  .data |>
+  # If chunk_sample_size is finite, process in chunks with parallelization
+  if (!is.finite(chunk_sample_size) || chunk_sample_size >= ncol(.data)) {
 
-    # Add methods
-    memorise_methods_used(c("edger", "tmm")) |>
+    # Standard processing without chunking - just apply the multipliers
+    .data <- apply_scaling_only(.data, my_assay, multiplier, value_scaled)
 
-    # Attach column internals
-    add_tt_columns(.abundance_scaled = !!scaled_quosure )
-
-}
+  } else {
 
-#' Scale transcript abundance for HDF5-backed SummarizedExperiment
-#'
-#' @description A memory-efficient variant of `scale_abundance()` that chunks the
-#' `SummarizedExperiment` by sample and applies the base `scale_abundance()` subroutine to
-#' each partition. When no reference sample is supplied, the sample whose library size is
-#' closest to the mean library size is selected automatically. Each chunk includes that
-#' reference, the results are column-bound, and the scaled assay is appended to the original
-#' object. Processing is parallelized using BiocParallel; configure parallelization with
-#' `BiocParallel::register()` before calling this function.
-#'
-#' @inheritParams scale_abundance
-#' @param chunk_sample_size An integer indicating how many samples to process per partition before column-binding the scaled result. Default is `50`.
-#'
-#' @return A `SummarizedExperiment` object with an additional scaled assay.
-#' @export
-scale_abundance_h5 <- function(.data,
-                               abundance = SummarizedExperiment::assayNames(.data)[1],
-                               method = "TMM",
-                               reference_sample = NULL,
-                               suffix = "_scaled",
-                               chunk_sample_size = 50,
-                               ...) {
-
-  my_assay <- abundance
-
+    sample_names <- colnames(.data)
+    sample_indices <- seq_along(sample_names)
+    sample_groups <- split(sample_names, ceiling(sample_indices / chunk_sample_size))
+
+    # Check if BiocParallel is available, otherwise install
+    check_and_install_packages("BiocParallel")
+
+    # Get the current BiocParallel backend
+    bp_param <- BiocParallel::bpparam()
+
+    # Inform user about parallelization settings
+    if (is(bp_param, "SerialParam")) {
+      message("tidybulk says: Processing chunks serially. For parallel processing, configure BiocParallel with BiocParallel::register() before calling this function. For example: BiocParallel::register(BiocParallel::MulticoreParam(workers = 4, progressbar = TRUE))")
+    } else {
+      message(sprintf("tidybulk says: Processing %d chunks in parallel using %s with %d workers",
+                      length(sample_groups), class(bp_param)[1], BiocParallel::bpnworkers(bp_param)))
    }
+
+    chunk_results <-
+      BiocParallel::bplapply(
+        sample_groups,
+        function(current_samples) {
+          # Extract chunk
+          chunk_se <- .data[, current_samples, drop = FALSE]
+
+          # Get multipliers for this chunk
+          chunk_multiplier <- multiplier[current_samples]
+
+          # Apply scaling to chunk (TMM already calculated)
+          chunk_scaled <- apply_scaling_only(
+            chunk_se,
+            my_assay,
+            chunk_multiplier,
+            value_scaled
+          )
+
+          chunk_scaled
+        },
+        BPPARAM = bp_param
+      )
+
+    # Combine chunks - use SummarizedExperiment::cbind for proper S4 dispatch
+    message("tidybulk says: Combining chunks")
+    .data <- do.call(SummarizedExperiment::cbind, args = chunk_results)
 
-
-  value_scaled <- paste0(my_assay, suffix)
-
-  chunk_sample_size <- as.integer(chunk_sample_size)
-  if (is.na(chunk_sample_size) || chunk_sample_size < 1) {
-    stop("tidybulk says: chunk_size must be a positive integer")
-  }
-
-  sample_names <- colnames(.data)
-  sample_indices <- seq_along(sample_names)
-  sample_groups <- split(sample_names, ceiling(sample_indices / chunk_sample_size))
-
-  # Check if BiocParallel is available, otherwise install
-  check_and_install_packages("BiocParallel")
-
-  # Get the current BiocParallel backend
-  bp_param <- BiocParallel::bpparam()
-
-  # Inform user about parallelization settings
-  if (is(bp_param, "SerialParam")) {
-    message("tidybulk says: Processing chunks serially. For parallel processing, configure BiocParallel with BiocParallel::register() before calling this function. For example: BiocParallel::register(BiocParallel::MulticoreParam(workers = 4, progressbar = TRUE))")
-  } else {
-    message(sprintf("tidybulk says: Processing %d chunks in parallel using %s with %d workers",
-                    length(sample_groups), class(bp_param)[1], BiocParallel::bpnworkers(bp_param)))
   }
-
-  chunk_results <-
-    BiocParallel::bplapply(
-      sample_groups,
-      function(current_samples) {
-        chunk_samples <- unique(c(reference_sample, current_samples))
-        chunk_se <- .data[, chunk_samples, drop = FALSE]
-
 
-
-        scale_abundance(
-          chunk_se,
-          abundance = my_assay,
-          method = method,
-          reference_sample = reference_sample,
-          suffix = suffix,
-          ...
-        )
-      },
-      BPPARAM = bp_param
-    )
-
-  .data = do.call(cbind, args = chunk_results)
-  .data |>
+  scaled_symbol <- rlang::sym(value_scaled)
+  scaled_quosure <- drop_enquo_env(rlang::new_quosure(scaled_symbol))
 
-    memorise_methods_used(c("edger", "tmm")) |>
-
-    add_tt_columns(.abundance_scaled = !!(((function(x, v) enquo(v))(x,!!as.symbol(value_scaled))) |> drop_enquo_env()) )
+  .data |>
+    memorise_methods_used(c("edger", "tmm")) |>
+    add_tt_columns(.abundance_scaled = !!scaled_quosure)
+
 }
 
 #' scale_abundance
@@ -344,3 +299,32 @@ setMethod("scale_abundance",
 setMethod("scale_abundance",
           "RangedSummarizedExperiment",
           .scale_abundance_se)
+
+
+# Core scaling function - applies pre-calculated multipliers to create scaled assay
+apply_scaling_only <- function(data_obj, assay_name, multipliers, scaled_name) {
+
+  # Get the assay data and apply scaling
+  chunk_assay <- assay(data_obj, assay_name)
+  is_delayed <- is(chunk_assay, "DelayedArray")
+
+  if (is_delayed) {
+    # For DelayedArray, use sweep to create a delayed operation
+    check_and_install_packages("DelayedArray")
+    my_counts_scaled <- list(
+      DelayedArray::sweep(chunk_assay, 2, multipliers, "*")
+    ) |> setNames(scaled_name)
+  } else {
+    # For regular matrices, use matrix multiplication
+    my_counts_scaled <- list(
+      chunk_assay %*% diag(multipliers)
+    ) |> setNames(scaled_name)
+  }
+
+  colnames(my_counts_scaled[[1]]) <- colnames(chunk_assay)
+
+  # Add the scaled assay
+  assays(data_obj, withDimnames = FALSE) <- assays(data_obj) |> c(my_counts_scaled)
+  data_obj
+}
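
For context, the scaling math this refactor keeps intact can be reproduced on a toy matrix. This is an illustrative sketch, not code from the commit (toy data; the names(nf) assignment is a defensive addition so name-based indexing works on a plain matrix): the multiplier is the reference sample's effective library size divided by each sample's effective library size.

library(edgeR)

# Toy counts: 4 genes x 3 samples (illustrative only)
counts <- matrix(c(10, 20, 30, 40,
                   15, 25, 35, 45,
                   100, 200, 300, 400),
                 nrow = 4,
                 dimnames = list(paste0("g", 1:4), paste0("s", 1:3)))

lib_size <- colSums(counts)                      # raw library sizes
nf <- edgeR::calcNormFactors(counts, refColumn = 3, method = "TMM")
names(nf) <- colnames(counts)                    # ensure name-based indexing works

# Effective library size of the reference over each sample's effective library size
multiplier <- (lib_size["s3"] * nf["s3"]) / (lib_size * nf)

# Scale every column by its sample's multiplier, which is what apply_scaling_only()
# does; for DelayedArray-backed assays the commit uses DelayedArray::sweep to stay lazy
scaled <- sweep(counts, 2, multiplier, "*")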

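And a small sketch of how the chunked path partitions samples (hypothetical sample names; the split(ceiling(...)) idiom and the bplapply call are taken from the diff above):

sample_names <- paste0("sample", 1:120)
chunk_sample_size <- 50

# ceiling(index / size) maps samples 1-50 to group "1", 51-100 to "2", 101-120 to "3"
sample_groups <- split(sample_names, ceiling(seq_along(sample_names) / chunk_sample_size))
lengths(sample_groups)
# -> 50 50 20

# Each group is then processed independently, here with a serial backend for illustration
chunk_results <- BiocParallel::bplapply(
  sample_groups,
  function(current_samples) length(current_samples),
  BPPARAM = BiocParallel::SerialParam()
)
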
man/scale_abundance-methods.Rd

Lines changed: 8 additions & 3 deletions
Some generated files are not rendered by default.
