diff --git a/snapatac2-core/src/utils/mod.rs b/snapatac2-core/src/utils/mod.rs index bbd5ff45..066a7828 100644 --- a/snapatac2-core/src/utils/mod.rs +++ b/snapatac2-core/src/utils/mod.rs @@ -54,6 +54,24 @@ pub fn clip_peak(mut peak: NarrowPeak, chrom_sizes: &crate::genome::ChromSizes) peak } +pub fn score_per_million(mut peaks: Vec) -> Result> { + + let total_signal: f64 = peaks.iter().filter_map(|p| p.p_value).sum(); + + if total_signal == 0.0 { + // Prevent division by zero; just return peaks as-is. + return Ok(peaks); + } + let factor = 1_000_000.0 / total_signal; + for peak in &mut peaks { + if let Some(score) = peak.p_value.as_mut() { + *score *= factor; + } + } + + Ok(peaks) +} + #[derive(Debug, Clone, Copy)] pub enum Compression { Gzip, diff --git a/snapatac2/tools/_call_peaks.py b/snapatac2/tools/_call_peaks.py index 600ae13f..345e65e9 100644 --- a/snapatac2/tools/_call_peaks.py +++ b/snapatac2/tools/_call_peaks.py @@ -194,6 +194,7 @@ def merge_peaks( peaks: dict[str, 'polars.DataFrame'], chrom_sizes: dict[str, int] | Genome, half_width: int = 250, + normalize: bool = False, ) -> 'polars.DataFrame': """Merge peaks from different groups. @@ -218,6 +219,8 @@ def merge_peaks( chromosome sizes will be obtained from the genome. half_width Half width of the merged peaks. + normalize + Score per million normalization. Maybe useful when sequencing depth varies, DOI: 10.1126/science.aav1898. Returns ------- @@ -232,7 +235,7 @@ def merge_peaks( import polars as pl chrom_sizes = chrom_sizes.chrom_sizes if isinstance(chrom_sizes, Genome) else chrom_sizes peaks = { k: pl.from_pandas(v) if isinstance(v, pd.DataFrame) else v for k, v in peaks.items()} - return _snapatac2.py_merge_peaks(peaks, chrom_sizes, half_width) + return _snapatac2.py_merge_peaks(peaks, chrom_sizes, half_width, normalize) def _par_map(mapper, args, nprocs): import time diff --git a/src/call_peaks.rs b/src/call_peaks.rs index ff12c6cd..2ef9a3a9 100644 --- a/src/call_peaks.rs +++ b/src/call_peaks.rs @@ -9,7 +9,7 @@ use pyo3::ffi::c_str; use snapatac2_core::utils::{self, Compression}; use snapatac2_core::{ preprocessing::Fragment, - utils::{clip_peak, merge_peaks, open_file_for_write}, + utils::{clip_peak, merge_peaks, open_file_for_write,score_per_million}, SnapData, }; @@ -35,6 +35,7 @@ pub fn py_merge_peaks<'py>( peaks: HashMap, chrom_sizes: HashMap, half_width: u64, + normalize: bool, ) -> Result { let peak_list: Vec<_> = peaks .into_iter() @@ -43,7 +44,14 @@ pub fn py_merge_peaks<'py>( Ok((key, ps)) }) .collect::>()?; - + let peak_list = if normalize { + peak_list + .into_iter() + .map(|(key, peaks)| Ok((key, score_per_million(peaks)?))) + .collect::>>()? + } else { + peak_list + }; let chrom_sizes = chrom_sizes.into_iter().collect(); let peaks: Vec<_> = merge_peaks(peak_list.iter().flat_map(|x| x.1.clone()), half_width) .flatten()