From 35d0e97cfe3d83510ece6a247ccd992922e06e0c Mon Sep 17 00:00:00 2001 From: ivan-aksamentov Date: Fri, 25 Jul 2025 07:24:29 +0000 Subject: [PATCH 01/29] refactor: reorganize reconsensus This is an idea for improving code structure in reconsensus module: - split away blocks requiring realignment early - separate logic for mutation-only reconsensus and full reconsensus Additionally: - make `find_majority*` functions into methods of `Block` - they look very much like fancy accessors --- packages/pangraph/src/pangraph/edits.rs | 88 +++++ .../pangraph/src/pangraph/graph_merging.rs | 2 +- .../pangraph/src/pangraph/pangraph_block.rs | 70 +++- .../pangraph/src/reconsensus/reconsensus.rs | 344 ++++++++---------- 4 files changed, 312 insertions(+), 192 deletions(-) diff --git a/packages/pangraph/src/pangraph/edits.rs b/packages/pangraph/src/pangraph/edits.rs index 1c0e7aed..dfcc6995 100644 --- a/packages/pangraph/src/pangraph/edits.rs +++ b/packages/pangraph/src/pangraph/edits.rs @@ -127,6 +127,26 @@ impl Edit { self.subs.is_empty() && self.dels.is_empty() && self.inss.is_empty() } + /// Returns true if this edit contains any insertions or deletions (indels) + pub fn has_indels(&self) -> bool { + self.has_dels() || self.has_inss() + } + + /// Returns true if this edit contains any deletions + pub fn has_dels(&self) -> bool { + !self.dels.is_empty() + } + + /// Returns true if this edit contains any insertions + pub fn has_inss(&self) -> bool { + !self.inss.is_empty() + } + + /// Returns true if this edit contains any substitutions + pub fn has_subs(&self) -> bool { + !self.subs.is_empty() + } + /// Construct edit which consists of a deletion of length `len` pub fn deleted(len: usize) -> Self { Self { @@ -1045,4 +1065,72 @@ mod tests { assert_eq!(mean_shift, 3); assert_eq!(bandwidth, Some(4)); } + + #[test] + fn test_has_indels() { + // Edit with no indels (only substitutions) + let edit_no_indels = Edit::new(vec![], vec![], vec![Sub::new(1, 'A')]); + assert!(!edit_no_indels.has_indels()); + + // Edit with deletions + let edit_with_dels = Edit::new(vec![], vec![Del::new(5, 2)], vec![]); + assert!(edit_with_dels.has_indels()); + + // Edit with insertions + let edit_with_inss = Edit::new(vec![Ins::new(10, "ATG")], vec![], vec![]); + assert!(edit_with_inss.has_indels()); + + // Edit with both insertions and deletions + let edit_with_both = Edit::new(vec![Ins::new(10, "ATG")], vec![Del::new(5, 2)], vec![Sub::new(1, 'A')]); + assert!(edit_with_both.has_indels()); + + // Empty edit + let edit_empty = Edit::empty(); + assert!(!edit_empty.has_indels()); + } + + #[test] + fn test_has_dels() { + // Edit with no deletions + let edit_no_dels = Edit::new(vec![Ins::new(10, "ATG")], vec![], vec![Sub::new(1, 'A')]); + assert!(!edit_no_dels.has_dels()); + + // Edit with deletions + let edit_with_dels = Edit::new(vec![], vec![Del::new(5, 2)], vec![]); + assert!(edit_with_dels.has_dels()); + + // Empty edit + let edit_empty = Edit::empty(); + assert!(!edit_empty.has_dels()); + } + + #[test] + fn test_has_inss() { + // Edit with no insertions + let edit_no_inss = Edit::new(vec![], vec![Del::new(5, 2)], vec![Sub::new(1, 'A')]); + assert!(!edit_no_inss.has_inss()); + + // Edit with insertions + let edit_with_inss = Edit::new(vec![Ins::new(10, "ATG")], vec![], vec![]); + assert!(edit_with_inss.has_inss()); + + // Empty edit + let edit_empty = Edit::empty(); + assert!(!edit_empty.has_inss()); + } + + #[test] + fn test_has_subs() { + // Edit with no substitutions + let edit_no_subs = Edit::new(vec![Ins::new(10, "ATG")], vec![Del::new(5, 2)], vec![]); + assert!(!edit_no_subs.has_subs()); + + // Edit with substitutions + let edit_with_subs = Edit::new(vec![], vec![], vec![Sub::new(1, 'A')]); + assert!(edit_with_subs.has_subs()); + + // Empty edit + let edit_empty = Edit::empty(); + assert!(!edit_empty.has_subs()); + } } diff --git a/packages/pangraph/src/pangraph/graph_merging.rs b/packages/pangraph/src/pangraph/graph_merging.rs index b875b0db..8ccf37b2 100644 --- a/packages/pangraph/src/pangraph/graph_merging.rs +++ b/packages/pangraph/src/pangraph/graph_merging.rs @@ -166,7 +166,7 @@ pub fn self_merge(graph: Pangraph, args: &PangraphBuildArgs) -> Result<(Pangraph // update consensus and alignment of merged blocks. let merge_block_ids = new_blocks_dict.keys().copied().collect_vec(); - reconsensus_graph(&mut graph, merge_block_ids, args).wrap_err("During reconsensus")?; + reconsensus_graph(&mut graph, &merge_block_ids, args).wrap_err("During reconsensus")?; Ok((graph, true)) } diff --git a/packages/pangraph/src/pangraph/pangraph_block.rs b/packages/pangraph/src/pangraph/pangraph_block.rs index 81071d6a..f95d9a63 100644 --- a/packages/pangraph/src/pangraph/pangraph_block.rs +++ b/packages/pangraph/src/pangraph/pangraph_block.rs @@ -1,16 +1,18 @@ use crate::io::fasta::FastaRecord; use crate::io::json::{JsonPretty, json_write_str}; use crate::io::seq::reverse_complement; -use crate::pangraph::edits::Edit; +use crate::pangraph::edits::{Del, Edit, Ins, Sub}; use crate::pangraph::pangraph::Pangraph; use crate::pangraph::pangraph_node::NodeId; use crate::pangraph::pangraph_path::PathId; use crate::representation::seq::Seq; use crate::representation::seq_char::AsciiChar; use crate::utils::collections::has_duplicates; +use crate::utils::interval::positions_to_intervals; use derive_more::{Display, From}; use eyre::{Report, WrapErr}; use getset::{CopyGetters, Getters}; +use itertools::Itertools; use maplit::btreemap; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; @@ -195,6 +197,72 @@ impl PangraphBlock { }) }) } + + /// Finds all majority edits (substitutions, deletions, insertions) in this block + pub fn find_majority_edits(&self) -> Edit { + Edit::new( + self.find_majority_insertions(), + self.find_majority_deletions(), + self.find_majority_substitutions(), + ) + } + + /// Helper method to check if a count represents a majority + fn is_majority(&self, count: usize) -> bool { + count > self.depth() / 2 + } + + /// Finds majority substitutions in this block + pub fn find_majority_substitutions(&self) -> Vec { + let mut substitutions: Vec<_> = self + .alignments() + .values() + .flat_map(|edit| &edit.subs) + .map(|sub| (sub.pos, sub.alt)) + .into_group_map() + .into_iter() + .filter_map(|(pos, alts)| { + let (alt, count) = alts.into_iter().counts().into_iter().max_by_key(|(_, count)| *count)?; + self.is_majority(count).then_some(Sub::new(pos, alt)) + }) + .collect(); + + substitutions.sort_by_key(|sub| sub.pos); + substitutions + } + + /// Finds majority deletions in this block + pub fn find_majority_deletions(&self) -> Vec { + let majority_positions: Vec = self + .alignments() + .values() + .flat_map(|edit| edit.dels.iter().flat_map(|del| del.range())) + .counts() + .into_iter() + .filter_map(|(pos, count)| self.is_majority(count).then_some(pos)) + .collect(); + + positions_to_intervals(&majority_positions) + .into_iter() + .map(|interval| Del::new(interval.start, interval.len())) + .collect() + } + + /// Finds majority insertions in this block + pub fn find_majority_insertions(&self) -> Vec { + let mut insertions: Vec<_> = self + .alignments() + .values() + .flat_map(|edit| &edit.inss) + .map(|insertion| (insertion.pos, insertion.seq.clone())) + .counts() + .into_iter() + .filter_map(|((pos, seq), count)| self.is_majority(count).then_some(Ins::new(pos, seq))) + .collect(); + + insertions.sort_by_key(|ins| ins.pos); + insertions + } } #[derive(Copy, Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)] diff --git a/packages/pangraph/src/reconsensus/reconsensus.rs b/packages/pangraph/src/reconsensus/reconsensus.rs index 104df332..43f520a6 100644 --- a/packages/pangraph/src/reconsensus/reconsensus.rs +++ b/packages/pangraph/src/reconsensus/reconsensus.rs @@ -1,20 +1,18 @@ use crate::align::map_variations::{BandParameters, map_variations}; use crate::commands::build::build_args::PangraphBuildArgs; +use crate::make_error; use crate::pangraph::detach_unaligned::detach_unaligned_nodes; -use crate::pangraph::edits::{Del, Edit}; -use crate::pangraph::edits::{Ins, Sub}; +use crate::pangraph::edits::Edit; +use crate::pangraph::edits::Sub; use crate::pangraph::pangraph::Pangraph; use crate::pangraph::pangraph_block::{BlockId, PangraphBlock}; use crate::pangraph::pangraph_node::NodeId; -use crate::utils::interval::positions_to_intervals; -use crate::{make_error, make_report}; // use crate::reconsensus::remove_nodes::remove_emtpy_nodes; use crate::reconsensus::remove_nodes::find_empty_nodes; use crate::representation::seq::Seq; use crate::representation::seq_char::AsciiChar; use eyre::Report; -use itertools::Itertools; -use maplit::btreemap; +use itertools::{Either, Itertools}; use rayon::prelude::*; use std::collections::BTreeMap; @@ -32,205 +30,148 @@ use std::collections::BTreeMap; /// - if the consensus has updated indels, then re-aligns all the sequences to the new consensus pub fn reconsensus_graph( graph: &mut Pangraph, - ids_updated_blocks: Vec, + ids_updated_blocks: &[BlockId], args: &PangraphBuildArgs, ) -> Result<(), Report> { // there should be no empty nodes in the graph debug_assert!( - find_empty_nodes(graph, &ids_updated_blocks).is_empty(), + find_empty_nodes(graph, ids_updated_blocks).is_empty(), "Empty nodes found in the graph" ); - // reconsensus each block - // i.e. update the consensus with majority variants - // ids of blocks that undergo re-alignment are collected in realigned_block_ids - let mut realigned_block_ids = Vec::new(); - for block_id in ids_updated_blocks { - let block = graph - .blocks - .get_mut(&block_id) - .ok_or_else(|| make_report!("Block {} not found in graph during reconsensus", block_id))?; - let realigned = reconsensus(block, args)?; - if realigned { - realigned_block_ids.push(block_id); - } - } + // Analyze blocks and determine which need realignment + let (blocks_with_mutations_only, blocks_needing_realignment) = + analyze_blocks_for_reconsensus(graph, ids_updated_blocks); + + // Apply mutation-only reconsensus (no realignment needed) + blocks_with_mutations_only.into_iter().try_for_each(|block_id| { + let block = graph.blocks.get_mut(&block_id).unwrap(); + let majority_edits = block.find_majority_edits(); + apply_mutation_reconsensus(block, &majority_edits.subs) + })?; - // For realigned blocks, pop them from graph.blocks, apply detach_unaligned_nodes to the list, and re-add them - if !realigned_block_ids.is_empty() { + // Handle blocks requiring realignment + if !blocks_needing_realignment.is_empty() { // Pop the realigned blocks from graph.blocks - let mut realigned_blocks = realigned_block_ids + let mut realigned_blocks: Vec<_> = blocks_needing_realignment .iter() .filter_map(|block_id| graph.blocks.remove(block_id)) - .collect_vec(); + .collect(); + + // Apply full reconsensus with realignment + realigned_blocks.iter_mut().try_for_each(|block| { + let majority_edits = block.find_majority_edits(); + apply_full_reconsensus(block, &majority_edits, args) + })?; // Apply detach_unaligned_nodes. This removes unaligned nodes and re-adds them to the list as new blocks. detach_unaligned_nodes(&mut realigned_blocks, &mut graph.nodes)?; // Re-add all the blocks (including potentially new singleton blocks) to graph.blocks - graph - .blocks - .extend(realigned_blocks.into_iter().map(|block| (block.id(), block))); + realigned_blocks.into_iter().for_each(|block| { + graph.blocks.insert(block.id(), block); + }); } Ok(()) } -/// Performs the reconsensus operation inplace on a block. -/// Returns true or false, depending on whether sequences were re-aligned or not. -/// - if a position is mutated in > N/2 sites, it adds the mutation to the consensus and updates the alignment. -/// - if an in/del is present in > N/2 sites, it adds it to the consensus and re-aligns the sequences to the updated consensus. -fn reconsensus(block: &mut PangraphBlock, args: &PangraphBuildArgs) -> Result { - reconsensus_mutations(block)?; - let ins = majority_insertions(block); - let dels = majority_deletions(block); - let re_align = !ins.is_empty() || !dels.is_empty(); - if re_align { - let consensus = block.consensus(); - let consensus = apply_indels(consensus, &dels, &ins); - // average shift of the new consensus with respect to the old one - // once the indels are applied - // used to help re-align the sequences - let new_consensus_edits = Edit { - subs: vec![], - dels: positions_to_intervals(&dels) - .iter() - .map(|d| Del::new(d.start, d.len())) - .collect(), - inss: ins, - }; - - let new_consensus_band_params = BandParameters::from_edits(&new_consensus_edits, block.consensus_len())?; - - // debug assert: consensus is not empty - debug_assert!(!consensus.is_empty(), "Consensus is empty after indels"); +/// Analyzes blocks to determine which need realignment vs. mutation-only reconsensus +fn analyze_blocks_for_reconsensus(graph: &Pangraph, block_ids: &[BlockId]) -> (Vec, Vec) { + let (mutations_only, need_realignment): (Vec<_>, Vec<_>) = block_ids + .iter() + .filter_map(|&block_id| { + let block = &graph.blocks[&block_id]; + let majority_edits = block.find_majority_edits(); + + if majority_edits.has_indels() { + Some(Either::Right(block_id)) + } else if majority_edits.has_subs() { + Some(Either::Left(block_id)) + } else { + None // Blocks with no variants are skipped + } + }) + .partition_map(|either| either); - update_block_consensus(block, &consensus, new_consensus_band_params, args)?; - } - Ok(re_align) + (mutations_only, need_realignment) } -/// Re-computes the consensus for a block if a position is mutated in > N/2 sites. -fn reconsensus_mutations(block: &mut PangraphBlock) -> Result<(), Report> { - let n = block.depth(); - let mut muts = btreemap! {}; - - // count mutations - for edit in block.alignments().values() { - for s in &edit.subs { - *muts - .entry(s.pos) - .or_insert_with(BTreeMap::new) - .entry(s.alt) - .or_insert(0) += 1; - } - } - - // change positions that are different in more than N/2 sites. - let mut changes = Vec::new(); - for (pos, ct) in muts { - let (alt, count) = ct.iter().max_by_key(|&(_, v)| v).unwrap(); - if *count > n / 2 { - changes.push((pos, *alt)); - } - } - - // apply change - for (pos, alt) in changes { - let original = block.consensus()[pos]; - - // update consensus - block.set_consensus_char(pos, alt); - - // change mutations - for edit in &mut block.alignments_mut().values_mut() { - let subs_at_pos: Vec<_> = edit.subs.iter().filter(|s| s.pos == pos).cloned().collect(); - match subs_at_pos.len() { - 0 => { - // if the position is not in a deletion, append the mutation - if !edit.dels.iter().any(|d| d.contains(pos)) { - edit.subs.push(Sub::new(pos, original)); - edit.subs.sort_by_key(|s| s.pos); - } - }, - 1 => { - let s = &subs_at_pos[0]; - if s.alt == alt { - edit.subs.retain(|sub| !(sub.pos == pos && sub.alt == alt)); - } - }, - _ => { - return make_error!( - "At block {}: at position {pos}: sequence states disagree: {:}", - block.id(), - subs_at_pos - .iter() - .map(|sub| sub.alt.to_string()) - .collect_vec() - .join(", ") - ); - }, +/// Updates alignment for a single mutation +fn update_alignment_for_mutation( + edit: &mut Edit, + pos: usize, + alt: AsciiChar, + original: AsciiChar, +) -> Result<(), Report> { + let subs_at_pos: Vec<_> = edit.subs.iter().filter(|s| s.pos == pos).cloned().collect(); + + match subs_at_pos.len() { + 0 => { + // if the position is not in a deletion, append the mutation + if !edit.dels.iter().any(|d| d.contains(pos)) { + edit.subs.push(Sub::new(pos, original)); + edit.subs.sort_by_key(|s| s.pos); } - } + }, + 1 => { + let s = &subs_at_pos[0]; + if s.alt == alt { + edit.subs.retain(|sub| !(sub.pos == pos && sub.alt == alt)); + } + }, + _ => { + return make_error!( + "At position {pos}: sequence states disagree: {:}", + subs_at_pos + .iter() + .map(|sub| sub.alt.to_string()) + .collect_vec() + .join(", ") + ); + }, } - Ok(()) } -/// Returns a list of positions to be removed from the consensus, because they are deleted in > N/2 sites. -fn majority_deletions(block: &PangraphBlock) -> Vec { - let n = block.depth(); - let mut n_dels = btreemap! {}; - // for each deleted position, increment the counter - for edit in block.alignments().values() { - for d in &edit.dels { - for pos in d.range() { - *n_dels.entry(pos).or_insert(0) += 1; - } - } - } - // return the positions that are deleted in more than N/2 sites - n_dels - .into_iter() - .filter(|&(_, count)| count > n / 2) - .map(|(pos, _)| pos) - .collect() -} +/// Applies only mutation reconsensus without realignment +fn apply_mutation_reconsensus(block: &mut PangraphBlock, subs: &[Sub]) -> Result<(), Report> { + subs.iter().try_for_each(|sub| { + let original = block.consensus()[sub.pos]; -/// Returns a list of insertions to be added to the consensus, because they are inserted in > N/2 sites. -fn majority_insertions(block: &PangraphBlock) -> Vec { - let n = block.depth(); - let mut n_ins = btreemap! {}; - for edit in block.alignments().values() { - for i in &edit.inss { - *n_ins.entry((i.pos, i.seq.clone())).or_insert(0) += 1; - } - } + // Update consensus + block.set_consensus_char(sub.pos, sub.alt); - // return the positions that are inserted in more than N/2 sites - n_ins - .into_iter() - .filter(|&(_, count)| count > n / 2) - .map(|((pos, ins), _)| Ins::new(pos, ins)) - .collect() + // Update alignments + block + .alignments_mut() + .values_mut() + .try_for_each(|edit| update_alignment_for_mutation(edit, sub.pos, sub.alt, original)) + }) } -/// Updates the consensus sequence with the deletions and insertions. -fn apply_indels(cons: impl Into, dels: &[usize], inss: &[Ins]) -> Seq { - let mut cons = cons.into(); +/// Applies full reconsensus including indels and realignment +fn apply_full_reconsensus( + block: &mut PangraphBlock, + majority_edits: &Edit, + args: &PangraphBuildArgs, +) -> Result<(), Report> { + // First apply mutations + apply_mutation_reconsensus(block, &majority_edits.subs)?; + + // Then apply indels and realign if present + if majority_edits.has_indels() { + let consensus = block.consensus(); + let new_consensus = majority_edits.apply(consensus)?; - for &pos in dels { - cons[pos] = AsciiChar(b'\0'); // Using '\0' to temporarily denote deleted positions - } + let band_params = BandParameters::from_edits(majority_edits, block.consensus_len())?; - // Reverse to maintain correct insertion indexes after each insert - for Ins { pos, seq } in inss.iter().rev() { - cons.insert_seq(*pos, seq); - } + debug_assert!(!new_consensus.is_empty(), "Consensus is empty after indels"); - cons.retain(|&c| c != AsciiChar(b'\0')); + update_block_consensus(block, &new_consensus, band_params, args)?; + } - cons + Ok(()) } /// Updates the consensus sequence of the block and re-aligns the sequences to the new consensus. @@ -258,16 +199,14 @@ fn update_block_consensus( // debug assets: all sequences are non-empty #[cfg(any(debug_assertions, test))] { - for (nid, (seq, _bp)) in &seqs { - if seq.is_empty() { - return make_error!( - "node is empty!\nblock: {}\nnode: {}\nedits: {:?}\nconsensus: {}", - block.id(), - nid, - block.alignments().get(nid), - block.consensus() - ); - } + if let Some((nid, (_seq, _))) = seqs.iter().find(|(_, (seq, _))| seq.is_empty()) { + return make_error!( + "node is empty!\nblock: {}\nnode: {}\nedits: {:?}\nconsensus: {}", + block.id(), + nid, + block.alignments().get(nid), + block.consensus() + ); } } @@ -425,28 +364,34 @@ mod tests { fn test_reconsensus_mutations() { let mut block = block_1(); let expected_block = block_1_mut_reconsensus(); - reconsensus_mutations(&mut block).unwrap(); + let subs = block.find_majority_substitutions(); + apply_mutation_reconsensus(&mut block, &subs).unwrap(); assert_eq!(block, expected_block); } #[test] fn test_majority_deletions() { - let dels = majority_deletions(&block_2()); + let dels: Vec = block_2() + .find_majority_deletions() + .iter() + .flat_map(|d| d.range()) + .collect(); assert_eq!(dels, vec![5, 6, 20]); } #[test] fn test_majority_insertions() { - let ins = majority_insertions(&block_2()); + let ins = block_2().find_majority_insertions(); assert_eq!(ins, vec![Ins::new(0, "G"), Ins::new(13, "AA"), Ins::new(23, "TT")]); } #[test] - fn test_apply_indels() { + fn test_apply_edits() { let consensus = "AGGACTTCGATCTATTCGGAGAA"; - let dels = vec![5, 6, 20]; + let dels = vec![Del::new(5, 2), Del::new(20, 1)]; let ins = vec![Ins::new(0, "G"), Ins::new(13, "AA"), Ins::new(23, "TT")]; - let cons = apply_indels(consensus, &dels, &ins); + let edit = Edit::new(ins, dels, vec![]); + let cons = edit.apply(consensus).unwrap(); assert_eq!(cons, "GAGGACCGATCTAAATTCGGAAATT"); } @@ -454,8 +399,22 @@ mod tests { fn test_reconsensus() { let mut block = block_3(); let expected_block = block_3_reconsensus(); - let re_aligned = reconsensus(&mut block, &PangraphBuildArgs::default()).unwrap(); - assert!(re_aligned); + + let majority_edits = block.find_majority_edits(); + + // Apply mutations + apply_mutation_reconsensus(&mut block, &majority_edits.subs).unwrap(); + + // Apply indels if present + let has_indels = majority_edits.has_indels(); + if has_indels { + let consensus = block.consensus(); + let new_consensus = majority_edits.apply(consensus).unwrap(); + let band_params = BandParameters::from_edits(&majority_edits, block.consensus_len()).unwrap(); + update_block_consensus(&mut block, &new_consensus, band_params, &PangraphBuildArgs::default()).unwrap(); + } + + assert!(has_indels); // This test expects realignment to have occurred assert_eq!(block.consensus(), expected_block.consensus()); assert_eq!(block.alignments(), expected_block.alignments()); } @@ -464,10 +423,15 @@ mod tests { fn test_reconsensus_mutations_only_no_realignment() { let mut block = block_mutations_only(); let expected_block = block_mutations_only_after_reconsensus(); - let re_aligned = reconsensus(&mut block, &PangraphBuildArgs::default()).unwrap(); - // Should return false because no indels require re-alignment - assert!(!re_aligned); + let majority_edits = block.find_majority_edits(); + + // Apply mutations + apply_mutation_reconsensus(&mut block, &majority_edits.subs).unwrap(); + + // Check that no indels require re-alignment + let has_indels = majority_edits.has_indels(); + assert!(!has_indels); // Should return false because no indels require re-alignment // But consensus should be updated and mutations healed assert_eq!(block.consensus(), expected_block.consensus()); @@ -544,7 +508,7 @@ mod tests { let mut graph = Pangraph { paths, blocks, nodes }; // Apply reconsensus_graph - let result = reconsensus_graph(&mut graph, vec![initial_block.id()], &PangraphBuildArgs::default()); + let result = reconsensus_graph(&mut graph, &[initial_block.id()], &PangraphBuildArgs::default()); // Check that the operation succeeded result.unwrap(); From 7c6a0a31f8cea6356de95edae9add46894f8bb6e Mon Sep 17 00:00:00 2001 From: ivan-aksamentov Date: Fri, 25 Jul 2025 10:10:49 +0200 Subject: [PATCH 02/29] refactor: return struct instead of tuple --- .../pangraph/src/reconsensus/reconsensus.rs | 31 +++++++++++++------ 1 file changed, 22 insertions(+), 9 deletions(-) diff --git a/packages/pangraph/src/reconsensus/reconsensus.rs b/packages/pangraph/src/reconsensus/reconsensus.rs index 43f520a6..e5519750 100644 --- a/packages/pangraph/src/reconsensus/reconsensus.rs +++ b/packages/pangraph/src/reconsensus/reconsensus.rs @@ -14,8 +14,18 @@ use crate::representation::seq_char::AsciiChar; use eyre::Report; use itertools::{Either, Itertools}; use rayon::prelude::*; +use serde::{Deserialize, Serialize}; use std::collections::BTreeMap; +/// Result of analyzing blocks for reconsensus operation +#[derive(Clone, Debug, Serialize, Deserialize)] +struct BlockAnalysis { + /// Blocks that need only mutation-based reconsensus (no realignment) + mutations_only: Vec, + /// Blocks that need full reconsensus with realignment due to indels + need_realignment: Vec, +} + /// Applies the reconsensus operation to each updated block in the graph: /// - updates the block consensus following a merge /// - removes potentially empty nodes @@ -40,20 +50,20 @@ pub fn reconsensus_graph( ); // Analyze blocks and determine which need realignment - let (blocks_with_mutations_only, blocks_needing_realignment) = - analyze_blocks_for_reconsensus(graph, ids_updated_blocks); + let analysis = analyze_blocks_for_reconsensus(graph, ids_updated_blocks); // Apply mutation-only reconsensus (no realignment needed) - blocks_with_mutations_only.into_iter().try_for_each(|block_id| { + analysis.mutations_only.into_iter().try_for_each(|block_id| { let block = graph.blocks.get_mut(&block_id).unwrap(); let majority_edits = block.find_majority_edits(); apply_mutation_reconsensus(block, &majority_edits.subs) })?; // Handle blocks requiring realignment - if !blocks_needing_realignment.is_empty() { + if !analysis.need_realignment.is_empty() { // Pop the realigned blocks from graph.blocks - let mut realigned_blocks: Vec<_> = blocks_needing_realignment + let mut realigned_blocks: Vec<_> = analysis + .need_realignment .iter() .filter_map(|block_id| graph.blocks.remove(block_id)) .collect(); @@ -77,7 +87,7 @@ pub fn reconsensus_graph( } /// Analyzes blocks to determine which need realignment vs. mutation-only reconsensus -fn analyze_blocks_for_reconsensus(graph: &Pangraph, block_ids: &[BlockId]) -> (Vec, Vec) { +fn analyze_blocks_for_reconsensus(graph: &Pangraph, block_ids: &[BlockId]) -> BlockAnalysis { let (mutations_only, need_realignment): (Vec<_>, Vec<_>) = block_ids .iter() .filter_map(|&block_id| { @@ -94,7 +104,10 @@ fn analyze_blocks_for_reconsensus(graph: &Pangraph, block_ids: &[BlockId]) -> (V }) .partition_map(|either| either); - (mutations_only, need_realignment) + BlockAnalysis { + mutations_only, + need_realignment, + } } /// Updates alignment for a single mutation @@ -467,7 +480,7 @@ mod tests { PangraphBlock::new(BlockId(20), consensus, aln) } - fn singleton_block_expected() -> PangraphBlock { + fn signleton_block_expected() -> PangraphBlock { // This is the singleton block that should be created for NodeId(1) after reconsensus // this should be the reverse-complement of CGCTTGAGGC, because NodeId(1) was in Reverse strand let consensus = Seq::from("GCCTCAAGCG"); @@ -479,7 +492,7 @@ mod tests { // Create a block that will be modified by reconsensus let initial_block = block_for_graph_test(); let expected_block = block_for_graph_test_expected(); - let singleton_block_exp = singleton_block_expected(); + let singleton_block_exp = signleton_block_expected(); // Create nodes for the block with lengths reflecting actual sequence lengths let nodes = btreemap! { From 1893ba3e3a946f0e1cbe99d452770a6268ec3eba Mon Sep 17 00:00:00 2001 From: ivan-aksamentov Date: Fri, 25 Jul 2025 10:14:10 +0200 Subject: [PATCH 03/29] perf: add inline hints to small utility methods --- packages/pangraph/src/pangraph/edits.rs | 4 ++++ packages/pangraph/src/pangraph/pangraph_block.rs | 3 ++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/packages/pangraph/src/pangraph/edits.rs b/packages/pangraph/src/pangraph/edits.rs index dfcc6995..82ed6d41 100644 --- a/packages/pangraph/src/pangraph/edits.rs +++ b/packages/pangraph/src/pangraph/edits.rs @@ -128,21 +128,25 @@ impl Edit { } /// Returns true if this edit contains any insertions or deletions (indels) + #[inline] pub fn has_indels(&self) -> bool { self.has_dels() || self.has_inss() } /// Returns true if this edit contains any deletions + #[inline] pub fn has_dels(&self) -> bool { !self.dels.is_empty() } /// Returns true if this edit contains any insertions + #[inline] pub fn has_inss(&self) -> bool { !self.inss.is_empty() } /// Returns true if this edit contains any substitutions + #[inline] pub fn has_subs(&self) -> bool { !self.subs.is_empty() } diff --git a/packages/pangraph/src/pangraph/pangraph_block.rs b/packages/pangraph/src/pangraph/pangraph_block.rs index f95d9a63..b93b0967 100644 --- a/packages/pangraph/src/pangraph/pangraph_block.rs +++ b/packages/pangraph/src/pangraph/pangraph_block.rs @@ -208,7 +208,8 @@ impl PangraphBlock { } /// Helper method to check if a count represents a majority - fn is_majority(&self, count: usize) -> bool { + #[inline] + pub fn is_majority(&self, count: usize) -> bool { count > self.depth() / 2 } From bdd3f52f8f7958bb665e679b6338468c9e7013af Mon Sep 17 00:00:00 2001 From: ivan-aksamentov Date: Fri, 25 Jul 2025 10:25:30 +0200 Subject: [PATCH 04/29] refactor: decompose update_alignment_for_mutation into smaller functions --- packages/pangraph/src/pangraph/edits.rs | 25 ++++++++++++ .../pangraph/src/reconsensus/reconsensus.rs | 40 +++++++------------ 2 files changed, 40 insertions(+), 25 deletions(-) diff --git a/packages/pangraph/src/pangraph/edits.rs b/packages/pangraph/src/pangraph/edits.rs index 82ed6d41..251beb30 100644 --- a/packages/pangraph/src/pangraph/edits.rs +++ b/packages/pangraph/src/pangraph/edits.rs @@ -151,6 +151,31 @@ impl Edit { !self.subs.is_empty() } + /// Checks if a position is deleted in this edit + #[inline] + pub fn is_position_deleted(&self, pos: usize) -> bool { + self.dels.iter().any(|d| d.contains(pos)) + } + + /// Adds reversion mutation when no substitutions exist at a position + pub fn add_reversion_if_not_deleted(&mut self, pos: usize, original: AsciiChar) { + if !self.is_position_deleted(pos) { + self.subs.push(Sub::new(pos, original)); + self.subs.sort_by_key(|s| s.pos); + } + } + + /// Removes substitution if it matches the new consensus character + pub fn remove_matching_substitution(&mut self, substitution: &Sub) { + if let Some(existing_sub) = self.subs.iter().find(|s| s.pos == substitution.pos) { + if existing_sub.alt == substitution.alt { + self + .subs + .retain(|sub| !(sub.pos == substitution.pos && sub.alt == substitution.alt)); + } + } + } + /// Construct edit which consists of a deletion of length `len` pub fn deleted(len: usize) -> Self { Self { diff --git a/packages/pangraph/src/reconsensus/reconsensus.rs b/packages/pangraph/src/reconsensus/reconsensus.rs index e5519750..874fa65c 100644 --- a/packages/pangraph/src/reconsensus/reconsensus.rs +++ b/packages/pangraph/src/reconsensus/reconsensus.rs @@ -88,7 +88,7 @@ pub fn reconsensus_graph( /// Analyzes blocks to determine which need realignment vs. mutation-only reconsensus fn analyze_blocks_for_reconsensus(graph: &Pangraph, block_ids: &[BlockId]) -> BlockAnalysis { - let (mutations_only, need_realignment): (Vec<_>, Vec<_>) = block_ids + let (blocks_requiring_mutations_only, blocks_requiring_full_realignment): (Vec<_>, Vec<_>) = block_ids .iter() .filter_map(|&block_id| { let block = &graph.blocks[&block_id]; @@ -105,37 +105,27 @@ fn analyze_blocks_for_reconsensus(graph: &Pangraph, block_ids: &[BlockId]) -> Bl .partition_map(|either| either); BlockAnalysis { - mutations_only, - need_realignment, + mutations_only: blocks_requiring_mutations_only, + need_realignment: blocks_requiring_full_realignment, } } /// Updates alignment for a single mutation -fn update_alignment_for_mutation( - edit: &mut Edit, - pos: usize, - alt: AsciiChar, - original: AsciiChar, -) -> Result<(), Report> { - let subs_at_pos: Vec<_> = edit.subs.iter().filter(|s| s.pos == pos).cloned().collect(); +fn update_alignment_for_mutation(edit: &mut Edit, substitution: &Sub, original: AsciiChar) -> Result<(), Report> { + let subs_at_pos: Vec<_> = edit + .subs + .iter() + .filter(|s| s.pos == substitution.pos) + .cloned() + .collect(); match subs_at_pos.len() { - 0 => { - // if the position is not in a deletion, append the mutation - if !edit.dels.iter().any(|d| d.contains(pos)) { - edit.subs.push(Sub::new(pos, original)); - edit.subs.sort_by_key(|s| s.pos); - } - }, - 1 => { - let s = &subs_at_pos[0]; - if s.alt == alt { - edit.subs.retain(|sub| !(sub.pos == pos && sub.alt == alt)); - } - }, + 0 => edit.add_reversion_if_not_deleted(substitution.pos, original), + 1 => edit.remove_matching_substitution(substitution), _ => { return make_error!( - "At position {pos}: sequence states disagree: {:}", + "At position {}: sequence states disagree: {:}", + substitution.pos, subs_at_pos .iter() .map(|sub| sub.alt.to_string()) @@ -159,7 +149,7 @@ fn apply_mutation_reconsensus(block: &mut PangraphBlock, subs: &[Sub]) -> Result block .alignments_mut() .values_mut() - .try_for_each(|edit| update_alignment_for_mutation(edit, sub.pos, sub.alt, original)) + .try_for_each(|edit| update_alignment_for_mutation(edit, sub, original)) }) } From 9ec5e92998de17998af9d514e71dbc26baf46c21 Mon Sep 17 00:00:00 2001 From: ivan-aksamentov Date: Fri, 25 Jul 2025 10:30:19 +0200 Subject: [PATCH 05/29] refactor: move update_alignment_for_mutation to Edit method --- packages/pangraph/src/pangraph/edits.rs | 28 +++++++++++++++++ .../pangraph/src/reconsensus/reconsensus.rs | 30 +------------------ 2 files changed, 29 insertions(+), 29 deletions(-) diff --git a/packages/pangraph/src/pangraph/edits.rs b/packages/pangraph/src/pangraph/edits.rs index 251beb30..4711baa2 100644 --- a/packages/pangraph/src/pangraph/edits.rs +++ b/packages/pangraph/src/pangraph/edits.rs @@ -1,4 +1,5 @@ use crate::io::seq::{complement, reverse_complement}; +use crate::make_error; use crate::representation::seq::Seq; use crate::representation::seq_char::AsciiChar; use crate::utils::interval::Interval; @@ -176,6 +177,33 @@ impl Edit { } } + /// Updates alignment for a single mutation during reconsensus + pub fn update_alignment_for_mutation(&mut self, substitution: &Sub, original: AsciiChar) -> Result<(), Report> { + let subs_at_pos: Vec<_> = self + .subs + .iter() + .filter(|s| s.pos == substitution.pos) + .cloned() + .collect(); + + match subs_at_pos.len() { + 0 => self.add_reversion_if_not_deleted(substitution.pos, original), + 1 => self.remove_matching_substitution(substitution), + _ => { + return make_error!( + "At position {}: sequence states disagree: {:}", + substitution.pos, + subs_at_pos + .iter() + .map(|sub| sub.alt.to_string()) + .collect_vec() + .join(", ") + ); + }, + } + Ok(()) + } + /// Construct edit which consists of a deletion of length `len` pub fn deleted(len: usize) -> Self { Self { diff --git a/packages/pangraph/src/reconsensus/reconsensus.rs b/packages/pangraph/src/reconsensus/reconsensus.rs index 874fa65c..5c2f4cef 100644 --- a/packages/pangraph/src/reconsensus/reconsensus.rs +++ b/packages/pangraph/src/reconsensus/reconsensus.rs @@ -10,7 +10,6 @@ use crate::pangraph::pangraph_node::NodeId; // use crate::reconsensus::remove_nodes::remove_emtpy_nodes; use crate::reconsensus::remove_nodes::find_empty_nodes; use crate::representation::seq::Seq; -use crate::representation::seq_char::AsciiChar; use eyre::Report; use itertools::{Either, Itertools}; use rayon::prelude::*; @@ -110,33 +109,6 @@ fn analyze_blocks_for_reconsensus(graph: &Pangraph, block_ids: &[BlockId]) -> Bl } } -/// Updates alignment for a single mutation -fn update_alignment_for_mutation(edit: &mut Edit, substitution: &Sub, original: AsciiChar) -> Result<(), Report> { - let subs_at_pos: Vec<_> = edit - .subs - .iter() - .filter(|s| s.pos == substitution.pos) - .cloned() - .collect(); - - match subs_at_pos.len() { - 0 => edit.add_reversion_if_not_deleted(substitution.pos, original), - 1 => edit.remove_matching_substitution(substitution), - _ => { - return make_error!( - "At position {}: sequence states disagree: {:}", - substitution.pos, - subs_at_pos - .iter() - .map(|sub| sub.alt.to_string()) - .collect_vec() - .join(", ") - ); - }, - } - Ok(()) -} - /// Applies only mutation reconsensus without realignment fn apply_mutation_reconsensus(block: &mut PangraphBlock, subs: &[Sub]) -> Result<(), Report> { subs.iter().try_for_each(|sub| { @@ -149,7 +121,7 @@ fn apply_mutation_reconsensus(block: &mut PangraphBlock, subs: &[Sub]) -> Result block .alignments_mut() .values_mut() - .try_for_each(|edit| update_alignment_for_mutation(edit, sub, original)) + .try_for_each(|edit| edit.update_alignment_for_mutation(sub, original)) }) } From ab428de63d7244131f28856e33e5e686f14fe073 Mon Sep 17 00:00:00 2001 From: ivan-aksamentov Date: Fri, 25 Jul 2025 10:34:37 +0200 Subject: [PATCH 06/29] refactor: extract substitutions_at_position method --- packages/pangraph/src/pangraph/edits.rs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/packages/pangraph/src/pangraph/edits.rs b/packages/pangraph/src/pangraph/edits.rs index 4711baa2..22564d60 100644 --- a/packages/pangraph/src/pangraph/edits.rs +++ b/packages/pangraph/src/pangraph/edits.rs @@ -177,14 +177,14 @@ impl Edit { } } + /// Returns all substitutions at a specific position + fn substitutions_at_position(&self, pos: usize) -> Vec { + self.subs.iter().filter(|s| s.pos == pos).cloned().collect() + } + /// Updates alignment for a single mutation during reconsensus pub fn update_alignment_for_mutation(&mut self, substitution: &Sub, original: AsciiChar) -> Result<(), Report> { - let subs_at_pos: Vec<_> = self - .subs - .iter() - .filter(|s| s.pos == substitution.pos) - .cloned() - .collect(); + let subs_at_pos = self.substitutions_at_position(substitution.pos); match subs_at_pos.len() { 0 => self.add_reversion_if_not_deleted(substitution.pos, original), From a52114f3024a635a502b579cd114d52aeb5a7479 Mon Sep 17 00:00:00 2001 From: ivan-aksamentov Date: Fri, 25 Jul 2025 10:46:35 +0200 Subject: [PATCH 07/29] refactor: rename update_alignment_for_mutation --- packages/pangraph/src/pangraph/edits.rs | 10 ++++++++-- packages/pangraph/src/reconsensus/reconsensus.rs | 2 +- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/packages/pangraph/src/pangraph/edits.rs b/packages/pangraph/src/pangraph/edits.rs index 22564d60..58ba89fc 100644 --- a/packages/pangraph/src/pangraph/edits.rs +++ b/packages/pangraph/src/pangraph/edits.rs @@ -182,13 +182,19 @@ impl Edit { self.subs.iter().filter(|s| s.pos == pos).cloned().collect() } - /// Updates alignment for a single mutation during reconsensus - pub fn update_alignment_for_mutation(&mut self, substitution: &Sub, original: AsciiChar) -> Result<(), Report> { + /// Reconciles genome alignment when consensus changes due to a substitution. + /// + /// During reconsensus, when a position in the consensus sequence is updated with a new + /// character, this method adjusts the genome's edit to maintain correct alignment. + pub fn reconcile_substitution_with_consensus(&mut self, substitution: &Sub, original: AsciiChar) -> Result<(), Report> { let subs_at_pos = self.substitutions_at_position(substitution.pos); match subs_at_pos.len() { + // If genome has no mutation at this position: adds reversion to original character 0 => self.add_reversion_if_not_deleted(substitution.pos, original), + // If genome has matching mutation: removes it (now matches new consensus) 1 => self.remove_matching_substitution(substitution), + // If genome has conflicting mutations: returns error for inconsistent state _ => { return make_error!( "At position {}: sequence states disagree: {:}", diff --git a/packages/pangraph/src/reconsensus/reconsensus.rs b/packages/pangraph/src/reconsensus/reconsensus.rs index 5c2f4cef..f572de6c 100644 --- a/packages/pangraph/src/reconsensus/reconsensus.rs +++ b/packages/pangraph/src/reconsensus/reconsensus.rs @@ -121,7 +121,7 @@ fn apply_mutation_reconsensus(block: &mut PangraphBlock, subs: &[Sub]) -> Result block .alignments_mut() .values_mut() - .try_for_each(|edit| edit.update_alignment_for_mutation(sub, original)) + .try_for_each(|edit| edit.reconcile_substitution_with_consensus(sub, original)) }) } From c333bc442cf075d430186f43076025c325b338de Mon Sep 17 00:00:00 2001 From: ivan-aksamentov Date: Fri, 25 Jul 2025 11:24:53 +0200 Subject: [PATCH 08/29] perf: avoid unnecessary vector allocation in substitution reconciliation - Use iterator count instead of collecting to vector for performance - Only create vector when needed for error message formatting - Reduces heap allocations in common case paths --- packages/pangraph/src/pangraph/edits.rs | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/packages/pangraph/src/pangraph/edits.rs b/packages/pangraph/src/pangraph/edits.rs index 58ba89fc..3060ecf8 100644 --- a/packages/pangraph/src/pangraph/edits.rs +++ b/packages/pangraph/src/pangraph/edits.rs @@ -186,16 +186,21 @@ impl Edit { /// /// During reconsensus, when a position in the consensus sequence is updated with a new /// character, this method adjusts the genome's edit to maintain correct alignment. - pub fn reconcile_substitution_with_consensus(&mut self, substitution: &Sub, original: AsciiChar) -> Result<(), Report> { - let subs_at_pos = self.substitutions_at_position(substitution.pos); - - match subs_at_pos.len() { + pub fn reconcile_substitution_with_consensus( + &mut self, + substitution: &Sub, + original: AsciiChar, + ) -> Result<(), Report> { + let subs_count = self.subs.iter().filter(|s| s.pos == substitution.pos).count(); + + match subs_count { // If genome has no mutation at this position: adds reversion to original character 0 => self.add_reversion_if_not_deleted(substitution.pos, original), // If genome has matching mutation: removes it (now matches new consensus) 1 => self.remove_matching_substitution(substitution), // If genome has conflicting mutations: returns error for inconsistent state _ => { + let subs_at_pos = self.substitutions_at_position(substitution.pos); return make_error!( "At position {}: sequence states disagree: {:}", substitution.pos, From 6b4144fbc841c550cb8656d5c1a1b4f3d404a2af Mon Sep 17 00:00:00 2001 From: ivan-aksamentov Date: Fri, 25 Jul 2025 11:25:57 +0200 Subject: [PATCH 09/29] refactor: rename substitutions_at_position to subs_at_position - Improves consistency with field name 'subs' - More concise while maintaining clarity - Follows existing naming patterns in codebase --- packages/pangraph/src/pangraph/edits.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/packages/pangraph/src/pangraph/edits.rs b/packages/pangraph/src/pangraph/edits.rs index 3060ecf8..d23f64c6 100644 --- a/packages/pangraph/src/pangraph/edits.rs +++ b/packages/pangraph/src/pangraph/edits.rs @@ -178,7 +178,7 @@ impl Edit { } /// Returns all substitutions at a specific position - fn substitutions_at_position(&self, pos: usize) -> Vec { + fn subs_at_position(&self, pos: usize) -> Vec { self.subs.iter().filter(|s| s.pos == pos).cloned().collect() } @@ -200,7 +200,7 @@ impl Edit { 1 => self.remove_matching_substitution(substitution), // If genome has conflicting mutations: returns error for inconsistent state _ => { - let subs_at_pos = self.substitutions_at_position(substitution.pos); + let subs_at_pos = self.subs_at_position(substitution.pos); return make_error!( "At position {}: sequence states disagree: {:}", substitution.pos, From e0616a47410741983c6b792b79793f092da0918d Mon Sep 17 00:00:00 2001 From: ivan-aksamentov Date: Fri, 25 Jul 2025 11:59:02 +0200 Subject: [PATCH 10/29] refactor: simplify var names --- packages/pangraph/src/reconsensus/reconsensus.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/packages/pangraph/src/reconsensus/reconsensus.rs b/packages/pangraph/src/reconsensus/reconsensus.rs index f572de6c..e6d33bcf 100644 --- a/packages/pangraph/src/reconsensus/reconsensus.rs +++ b/packages/pangraph/src/reconsensus/reconsensus.rs @@ -87,7 +87,7 @@ pub fn reconsensus_graph( /// Analyzes blocks to determine which need realignment vs. mutation-only reconsensus fn analyze_blocks_for_reconsensus(graph: &Pangraph, block_ids: &[BlockId]) -> BlockAnalysis { - let (blocks_requiring_mutations_only, blocks_requiring_full_realignment): (Vec<_>, Vec<_>) = block_ids + let (mutations_only, need_realignment): (Vec<_>, Vec<_>) = block_ids .iter() .filter_map(|&block_id| { let block = &graph.blocks[&block_id]; @@ -104,8 +104,8 @@ fn analyze_blocks_for_reconsensus(graph: &Pangraph, block_ids: &[BlockId]) -> Bl .partition_map(|either| either); BlockAnalysis { - mutations_only: blocks_requiring_mutations_only, - need_realignment: blocks_requiring_full_realignment, + mutations_only, + need_realignment, } } From 0ce37bd7d03ab9bdd3b5511c5cdec670a9b198f6 Mon Sep 17 00:00:00 2001 From: ivan-aksamentov Date: Fri, 25 Jul 2025 12:04:23 +0200 Subject: [PATCH 11/29] feat: add extra error context --- packages/pangraph/src/reconsensus/reconsensus.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/packages/pangraph/src/reconsensus/reconsensus.rs b/packages/pangraph/src/reconsensus/reconsensus.rs index e6d33bcf..3a08e860 100644 --- a/packages/pangraph/src/reconsensus/reconsensus.rs +++ b/packages/pangraph/src/reconsensus/reconsensus.rs @@ -10,7 +10,7 @@ use crate::pangraph::pangraph_node::NodeId; // use crate::reconsensus::remove_nodes::remove_emtpy_nodes; use crate::reconsensus::remove_nodes::find_empty_nodes; use crate::representation::seq::Seq; -use eyre::Report; +use eyre::{Context, Report}; use itertools::{Either, Itertools}; use rayon::prelude::*; use serde::{Deserialize, Serialize}; @@ -56,6 +56,7 @@ pub fn reconsensus_graph( let block = graph.blocks.get_mut(&block_id).unwrap(); let majority_edits = block.find_majority_edits(); apply_mutation_reconsensus(block, &majority_edits.subs) + .wrap_err_with(|| format!("When processing block {}", block.id())) })?; // Handle blocks requiring realignment @@ -71,6 +72,7 @@ pub fn reconsensus_graph( realigned_blocks.iter_mut().try_for_each(|block| { let majority_edits = block.find_majority_edits(); apply_full_reconsensus(block, &majority_edits, args) + .wrap_err_with(|| format!("When processing block {}", block.id())) })?; // Apply detach_unaligned_nodes. This removes unaligned nodes and re-adds them to the list as new blocks. From dd970fc4c56198730ee640331fdff58356af3beb Mon Sep 17 00:00:00 2001 From: ivan-aksamentov Date: Fri, 25 Jul 2025 12:16:19 +0200 Subject: [PATCH 12/29] fix: handle .unwrap() --- packages/pangraph/src/reconsensus/reconsensus.rs | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/packages/pangraph/src/reconsensus/reconsensus.rs b/packages/pangraph/src/reconsensus/reconsensus.rs index 3a08e860..94956582 100644 --- a/packages/pangraph/src/reconsensus/reconsensus.rs +++ b/packages/pangraph/src/reconsensus/reconsensus.rs @@ -1,12 +1,12 @@ use crate::align::map_variations::{BandParameters, map_variations}; use crate::commands::build::build_args::PangraphBuildArgs; -use crate::make_error; use crate::pangraph::detach_unaligned::detach_unaligned_nodes; use crate::pangraph::edits::Edit; use crate::pangraph::edits::Sub; use crate::pangraph::pangraph::Pangraph; use crate::pangraph::pangraph_block::{BlockId, PangraphBlock}; use crate::pangraph::pangraph_node::NodeId; +use crate::{make_error, make_report}; // use crate::reconsensus::remove_nodes::remove_emtpy_nodes; use crate::reconsensus::remove_nodes::find_empty_nodes; use crate::representation::seq::Seq; @@ -53,7 +53,10 @@ pub fn reconsensus_graph( // Apply mutation-only reconsensus (no realignment needed) analysis.mutations_only.into_iter().try_for_each(|block_id| { - let block = graph.blocks.get_mut(&block_id).unwrap(); + let block = graph + .blocks + .get_mut(&block_id) + .ok_or_else(|| make_report!("Block {} not found in graph", block_id))?; let majority_edits = block.find_majority_edits(); apply_mutation_reconsensus(block, &majority_edits.subs) .wrap_err_with(|| format!("When processing block {}", block.id())) From aa1e4b12ebbf26bcfdc3e3f518ba6c8fcd4cb7e5 Mon Sep 17 00:00:00 2001 From: Marco Molari Date: Thu, 31 Jul 2025 15:35:27 +0200 Subject: [PATCH 13/29] test: add unit tests for majority edits in PangraphBlock --- .../pangraph/src/pangraph/pangraph_block.rs | 326 ++++++++++++++++++ 1 file changed, 326 insertions(+) diff --git a/packages/pangraph/src/pangraph/pangraph_block.rs b/packages/pangraph/src/pangraph/pangraph_block.rs index b93b0967..267532d5 100644 --- a/packages/pangraph/src/pangraph/pangraph_block.rs +++ b/packages/pangraph/src/pangraph/pangraph_block.rs @@ -272,3 +272,329 @@ pub enum RecordNaming { Node, Path, } + +#[cfg(test)] +mod tests { + use super::*; + use crate::pangraph::edits::{Del, Edit, Ins, Sub}; + use crate::pangraph::pangraph_node::NodeId; + use maplit::btreemap; + use pretty_assertions::assert_eq; + + fn s(pos: usize, alt: char) -> Sub { + Sub::new(pos, alt) + } + + fn e_subs(subs: Vec) -> Edit { + Edit::new(vec![], vec![], subs) + } + + fn d(pos: usize, len: usize) -> Del { + Del::new(pos, len) + } + + fn e_dels(dels: Vec) -> Edit { + Edit::new(vec![], dels, vec![]) + } + + fn i(pos: usize, seq: &str) -> Ins { + Ins::new(pos, seq) + } + + fn e_inss(inss: Vec) -> Edit { + Edit::new(inss, vec![], vec![]) + } + + #[test] + fn test_find_majority_substitutions_single_node() { + let block = PangraphBlock::new( + BlockId(1), + "ATCG", + btreemap! { + NodeId(1) => e_subs(vec![s(0, 'G'), s(2, 'A')]) + }, + ); + let result = block.find_majority_substitutions(); + // Single node is always majority (1 > 0) + assert_eq!(result, vec![s(0, 'G'), s(2, 'A')]); + } + + #[test] + fn test_find_majority_substitutions_no_majority() { + let block = PangraphBlock::new( + BlockId(1), + "ATCG", + btreemap! { + NodeId(1) => e_subs(vec![s(0, 'G')]), + NodeId(2) => e_subs(vec![s(0, 'C')]), + NodeId(3) => e_subs(vec![s(0, 'T')]), + }, + ); + let result = block.find_majority_substitutions(); + // No substitution has majority (1 is not > 3/2 = 1) + assert!(result.is_empty()); + } + + #[test] + fn test_find_majority_substitutions_clear_majority() { + let block = PangraphBlock::new( + BlockId(1), + "ATCG", + btreemap! { + NodeId(1) => e_subs(vec![s(0, 'G'), s(2, 'A')]), + NodeId(2) => e_subs(vec![s(0, 'G'), s(3, 'A')]), + NodeId(3) => e_subs(vec![s(0, 'C'), s(2, 'A')]), + }, + ); + let result = block.find_majority_substitutions(); + assert_eq!(result, vec![s(0, 'G'), s(2, 'A')]); + } + + #[test] + fn test_find_majority_substitutions_tie_no_majority() { + let block = PangraphBlock::new( + BlockId(1), + "ATCG", + btreemap! { + NodeId(1) => e_subs(vec![]), + NodeId(2) => e_subs(vec![]), + NodeId(3) => e_subs(vec![s(0, 'C')]), + NodeId(4) => e_subs(vec![s(0, 'C')]), + }, + ); + let result = block.find_majority_substitutions(); + assert!(result.is_empty()); + } + + #[test] + fn test_find_majority_deletions_single_node() { + let block = PangraphBlock::new( + BlockId(1), + "ATCGAA", + btreemap! { + NodeId(1) => e_dels(vec![d(1, 2), d(4, 1)]) + }, + ); + let result = block.find_majority_deletions(); + // Single node is always majority (1 > 0) + assert_eq!(result, vec![d(1, 2), d(4, 1)]); + } + + #[test] + fn test_find_majority_deletions_no_majority() { + let block = PangraphBlock::new( + BlockId(1), + "ATCGAA", + btreemap! { + NodeId(1) => e_dels(vec![d(0, 1)]), + NodeId(2) => e_dels(vec![d(1, 1)]), + NodeId(3) => e_dels(vec![d(2, 1)]), + }, + ); + let result = block.find_majority_deletions(); + assert!(result.is_empty()); + } + + #[test] + fn test_find_majority_deletions_clear_majority() { + let block = PangraphBlock::new( + BlockId(1), + "ATCGAA", + btreemap! { + NodeId(1) => e_dels(vec![d(1, 2), d(4, 1)]), + NodeId(2) => e_dels(vec![d(1, 2), d(5, 1)]), + NodeId(3) => e_dels(vec![d(0, 1), d(4, 1)]), + }, + ); + let result = block.find_majority_deletions(); + assert_eq!(result, vec![d(1, 2), d(4, 1)]); + } + + #[test] + fn test_find_majority_deletions_overlapping_intervals() { + let block = PangraphBlock::new( + BlockId(1), + "ATCGAATT", + btreemap! { + NodeId(1) => e_dels(vec![d(1, 3)]), // deletes positions 1,2,3 + NodeId(2) => e_dels(vec![d(2, 3)]), // deletes positions 2,3,4 + NodeId(3) => e_dels(vec![d(3, 2)]), // deletes positions 3,4 + NodeId(4) => e_dels(vec![d(6, 1)]), // deletes position 6 + NodeId(5) => e_dels(vec![d(6, 2)]), // deletes positions 6,7 + }, + ); + let result = block.find_majority_deletions(); + assert_eq!(result, vec![d(3, 1)]); + } + + #[test] + fn test_find_majority_deletions_contiguous_intervals() { + let block = PangraphBlock::new( + BlockId(1), + "ATCGAATT", + btreemap! { + NodeId(1) => e_dels(vec![d(1, 1), d(2, 1), d(3, 1)]), // deletes 1,2,3 separately + NodeId(2) => e_dels(vec![d(1, 3)]), // deletes 1,2,3 as one interval + NodeId(3) => e_dels(vec![d(1, 1), d(2, 2)]), // deletes 1, then 2,3 + NodeId(4) => e_dels(vec![d(5, 1)]), // deletes 5 + NodeId(5) => e_dels(vec![d(5, 1), d(6, 1)]), // deletes 5,6 separately + }, + ); + let result = block.find_majority_deletions(); + assert_eq!(result, vec![d(1, 3)]); + } + + #[test] + fn test_find_majority_insertions_empty_block() { + let block = PangraphBlock::new(BlockId(1), "ATCG", btreemap! {}); + let result = block.find_majority_insertions(); + assert!(result.is_empty()); + } + + #[test] + fn test_find_majority_insertions_single_node() { + let block = PangraphBlock::new( + BlockId(1), + "ATCG", + btreemap! { + NodeId(1) => e_inss(vec![i(1, "GG"), i(3, "AA")]) + }, + ); + let result = block.find_majority_insertions(); + // Single node is always majority (1 > 0) + assert_eq!(result, vec![i(1, "GG"), i(3, "AA")]); + } + + #[test] + fn test_find_majority_insertions_no_majority() { + let block = PangraphBlock::new( + BlockId(1), + "ATCG", + btreemap! { + NodeId(1) => e_inss(vec![i(1, "A")]), + NodeId(2) => e_inss(vec![i(1, "T")]), + NodeId(3) => e_inss(vec![i(1, "G")]), + }, + ); + let result = block.find_majority_insertions(); + // No insertion has majority (1 is not > 3/2 = 1) + assert!(result.is_empty()); + } + + #[test] + fn test_find_majority_insertions_clear_majority() { + let block = PangraphBlock::new( + BlockId(1), + "ATCG", + btreemap! { + NodeId(1) => e_inss(vec![i(1, "GGG"), i(3, "A")]), + NodeId(2) => e_inss(vec![i(1, "GGG"), i(2, "TT")]), + NodeId(3) => e_inss(vec![i(1, "CC"), i(3, "A")]), + }, + ); + let result = block.find_majority_insertions(); + assert_eq!(result, vec![i(1, "GGG"), i(3, "A")]); + } + + #[test] + fn test_find_majority_insertions_exact_sequence_match() { + let block = PangraphBlock::new( + BlockId(1), + "ATCG", + btreemap! { + NodeId(1) => e_inss(vec![i(1, "ATG")]), + NodeId(2) => e_inss(vec![i(1, "ATG")]), + NodeId(3) => e_inss(vec![i(1, "ATG")]), + NodeId(4) => e_inss(vec![i(1, "GTA")]), + NodeId(5) => e_inss(vec![i(1, "GTA")]), + }, + ); + let result = block.find_majority_insertions(); + assert_eq!(result, vec![i(1, "ATG")]); + } + + #[test] + fn test_find_majority_insertions_different_positions() { + let block = PangraphBlock::new( + BlockId(1), + "ATCGAA", + btreemap! { + NodeId(1) => e_inss(vec![i(0, "G"), i(2, "T"), i(4, "C")]), + NodeId(2) => e_inss(vec![i(0, "G"), i(3, "A"), i(5, "T")]), + NodeId(3) => e_inss(vec![i(1, "A"), i(2, "T"), i(4, "C")]), + NodeId(4) => e_inss(vec![i(0, "C"), i(2, "T"), i(6, "G")]), + NodeId(5) => e_inss(vec![i(0, "G"), i(3, "A"), i(4, "C")]), + }, + ); + let result = block.find_majority_insertions(); + assert_eq!(result, vec![i(0, "G"), i(2, "T"), i(4, "C")]); + } + + #[test] + fn test_find_majority_insertions_tie_no_majority() { + let block = PangraphBlock::new( + BlockId(1), + "ATCG", + btreemap! { + NodeId(1) => e_inss(vec![]), + NodeId(2) => e_inss(vec![]), + NodeId(3) => e_inss(vec![i(1, "AA")]), + NodeId(4) => e_inss(vec![i(1, "AA")]), + }, + ); + let result = block.find_majority_insertions(); + assert!(result.is_empty()); + } + + #[test] + fn test_find_majority_edits() { + let block = PangraphBlock::new( + BlockId(1), + "ATCG", + btreemap! { + NodeId(1) => Edit::empty(), + NodeId(2) => Edit::empty(), + NodeId(3) => Edit::empty(), + }, + ); + let result = block.find_majority_edits(); + assert!(result.is_empty()); + } + + #[test] + fn test_find_majority_edits_comprehensive() { + let block = PangraphBlock::new( + BlockId(1), + "ATCGAATT", + btreemap! { + NodeId(1) => Edit::new(vec![i(1, "GG"), i(4, "C")], vec![d(2, 1), d(6, 1)], vec![s(0, 'G'), s(5, 'C')]), + NodeId(2) => Edit::new(vec![i(1, "GG"), i(3, "A")], vec![d(2, 1), d(7, 1)], vec![s(0, 'G'), s(5, 'T')]), + NodeId(3) => Edit::new(vec![i(1, "AA"), i(4, "C")], vec![d(2, 1), d(6, 1)], vec![s(0, 'C'), s(5, 'C')]), + NodeId(4) => Edit::new(vec![i(1, "GG"), i(4, "C")], vec![d(1, 1), d(6, 1)], vec![s(0, 'G'), s(4, 'A')]), + NodeId(5) => Edit::new(vec![i(1, "GG"), i(4, "C")], vec![d(2, 1), d(5, 1)], vec![s(0, 'G'), s(5, 'C')]), + }, + ); + let result = block.find_majority_edits(); + + // Depth = 5, majority threshold = 5/2 = 2, so need > 2 = at least 3 + + // Insertions: + // Position 1: GG appears 4 times (majority), AA appears 1 time + // Position 4: C appears 4 times (majority) + // Position 3: A appears 1 time (not majority) + + // Deletions: + // Position 2: deleted by 4 nodes (majority) + // Position 6: deleted by 4 nodes (majority) + // Position 1,5,7: deleted by 1 node each (not majority) + + // Substitutions: + // Position 0: G appears 4 times (majority), C appears 1 time + // Position 5: C appears 3 times (majority), T appears 1 time + // Position 4: A appears 1 time (not majority) + + assert_eq!(result.inss, vec![i(1, "GG"), i(4, "C")]); + assert_eq!(result.dels, vec![d(2, 1), d(6, 1)]); + assert_eq!(result.subs, vec![s(0, 'G'), s(5, 'C')]); + } +} From 76f834be945c871bb4c53c31a7f014abc9934df7 Mon Sep 17 00:00:00 2001 From: Marco Molari Date: Fri, 1 Aug 2025 13:34:46 +0200 Subject: [PATCH 14/29] refactor: update BlockAnalysis to avoid recomputing edits --- .../pangraph/src/reconsensus/reconsensus.rs | 48 ++++++++++--------- 1 file changed, 26 insertions(+), 22 deletions(-) diff --git a/packages/pangraph/src/reconsensus/reconsensus.rs b/packages/pangraph/src/reconsensus/reconsensus.rs index 94956582..31365d77 100644 --- a/packages/pangraph/src/reconsensus/reconsensus.rs +++ b/packages/pangraph/src/reconsensus/reconsensus.rs @@ -20,9 +20,9 @@ use std::collections::BTreeMap; #[derive(Clone, Debug, Serialize, Deserialize)] struct BlockAnalysis { /// Blocks that need only mutation-based reconsensus (no realignment) - mutations_only: Vec, + mutations_only: Vec<(BlockId, Edit)>, /// Blocks that need full reconsensus with realignment due to indels - need_realignment: Vec, + need_realignment: Vec<(BlockId, Edit)>, } /// Applies the reconsensus operation to each updated block in the graph: @@ -52,33 +52,37 @@ pub fn reconsensus_graph( let analysis = analyze_blocks_for_reconsensus(graph, ids_updated_blocks); // Apply mutation-only reconsensus (no realignment needed) - analysis.mutations_only.into_iter().try_for_each(|block_id| { + analysis.mutations_only.into_iter().try_for_each(|(block_id, edits)| { let block = graph .blocks .get_mut(&block_id) .ok_or_else(|| make_report!("Block {} not found in graph", block_id))?; - let majority_edits = block.find_majority_edits(); - apply_mutation_reconsensus(block, &majority_edits.subs) - .wrap_err_with(|| format!("When processing block {}", block.id())) + apply_mutation_reconsensus(block, &edits.subs).wrap_err_with(|| format!("When processing block {}", block.id())) })?; // Handle blocks requiring realignment if !analysis.need_realignment.is_empty() { - // Pop the realigned blocks from graph.blocks - let mut realigned_blocks: Vec<_> = analysis + let mut realigned_blocks: Vec = analysis .need_realignment - .iter() - .filter_map(|block_id| graph.blocks.remove(block_id)) - .collect(); - - // Apply full reconsensus with realignment - realigned_blocks.iter_mut().try_for_each(|block| { - let majority_edits = block.find_majority_edits(); - apply_full_reconsensus(block, &majority_edits, args) - .wrap_err_with(|| format!("When processing block {}", block.id())) - })?; - - // Apply detach_unaligned_nodes. This removes unaligned nodes and re-adds them to the list as new blocks. + .into_iter() + .map(|(block_id, edits)| { + // Pop the realigned blocks from graph.blocks + let mut block = graph + .blocks + .remove(&block_id) + .ok_or_else(|| make_report!("Block {} not found in graph", block_id))?; + + // Apply full reconsensus with realignment + apply_full_reconsensus(&mut block, &edits, args) + .wrap_err_with(|| format!("When processing block {}", block.id()))?; + // Return the realigned block if successful + Ok::(block) + }) + .collect::, _>>()?; + + // Apply detach_unaligned_nodes. This removes unaligned nodes and re-adds them to + // the `realigned_blocks` list as new blocks. + // it also modifies the nodes dictionary accordingly. detach_unaligned_nodes(&mut realigned_blocks, &mut graph.nodes)?; // Re-add all the blocks (including potentially new singleton blocks) to graph.blocks @@ -99,9 +103,9 @@ fn analyze_blocks_for_reconsensus(graph: &Pangraph, block_ids: &[BlockId]) -> Bl let majority_edits = block.find_majority_edits(); if majority_edits.has_indels() { - Some(Either::Right(block_id)) + Some(Either::Right((block_id, majority_edits))) } else if majority_edits.has_subs() { - Some(Either::Left(block_id)) + Some(Either::Left((block_id, majority_edits))) } else { None // Blocks with no variants are skipped } From b610badda0b8866cf058f7ad5948c5e804b3aed7 Mon Sep 17 00:00:00 2001 From: Marco Molari Date: Fri, 1 Aug 2025 15:31:11 +0200 Subject: [PATCH 15/29] feat: add block method to change consensus nucleotide and update alignments --- .../pangraph/src/pangraph/pangraph_block.rs | 132 ++++++++++++++++-- 1 file changed, 120 insertions(+), 12 deletions(-) diff --git a/packages/pangraph/src/pangraph/pangraph_block.rs b/packages/pangraph/src/pangraph/pangraph_block.rs index 267532d5..881e2c8c 100644 --- a/packages/pangraph/src/pangraph/pangraph_block.rs +++ b/packages/pangraph/src/pangraph/pangraph_block.rs @@ -1,6 +1,7 @@ use crate::io::fasta::FastaRecord; use crate::io::json::{JsonPretty, json_write_str}; use crate::io::seq::reverse_complement; +use crate::make_internal_error; use crate::pangraph::edits::{Del, Edit, Ins, Sub}; use crate::pangraph::pangraph::Pangraph; use crate::pangraph::pangraph_node::NodeId; @@ -80,18 +81,6 @@ impl PangraphBlock { &self.consensus } - pub fn consensus_mut(&mut self) -> &mut Seq { - &mut self.consensus - } - - pub fn set_consensus(&mut self, consensus: Seq) { - self.consensus = consensus; - } - - pub fn set_consensus_char(&mut self, pos: usize, c: AsciiChar) { - self.consensus[pos] = c; - } - pub fn consensus_len(&self) -> usize { self.consensus.len() } @@ -264,6 +253,38 @@ impl PangraphBlock { insertions.sort_by_key(|ins| ins.pos); insertions } + + /// Change a nucleotide in the consensus sequence at a specific position + /// and update the alignments accordingly, without changing the block sequences. + pub fn change_consensus_nucleotide_at_pos(&mut self, pos: usize, c: AsciiChar) -> Result<(), Report> { + if pos >= self.consensus_len() { + return make_internal_error!( + "Position {pos} is out of bounds for consensus of length {}", + self.consensus_len() + ); + } + + // get the original character + let original_char = self.consensus[pos]; + // check: the two must be different + if original_char == c { + return make_internal_error!( + "Cannot change consensus character at position {pos} to '{c}' because it is already '{original_char}'" + ); + } + + // update the consensus + self.consensus[pos] = c; + + // Update the alignments + self.alignments_mut().values_mut().try_for_each(|edit| { + edit + .reconcile_substitution_with_consensus(&Sub::new(pos, c), original_char) + .wrap_err_with(|| format!("When reconciling substitution at position {pos} with character '{c}'")) + })?; + + Ok(()) + } } #[derive(Copy, Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)] @@ -597,4 +618,91 @@ mod tests { assert_eq!(result.dels, vec![d(2, 1), d(6, 1)]); assert_eq!(result.subs, vec![s(0, 'G'), s(5, 'C')]); } + + #[test] + fn test_change_consensus_nucleotide_at_pos_basic() { + let mut block = PangraphBlock::new( + BlockId(1), + "ATCG", + btreemap! { + NodeId(1) => Edit::empty(), + NodeId(2) => e_subs(vec![s(1, 'G'), s(2, 'C')]), + NodeId(3) => e_subs(vec![s(1, 'A')]), + }, + ); + + let expected_block = PangraphBlock::new( + BlockId(1), + "AGCG", + btreemap! { + NodeId(1) => e_subs(vec![s(1, 'T')]), + NodeId(2) => e_subs(vec![s(2, 'C')]), + NodeId(3) => e_subs(vec![s(1, 'A')]), + }, + ); + + // Change position 1 from T to G + block.change_consensus_nucleotide_at_pos(1, 'G'.into()).unwrap(); + assert_eq!(block, expected_block); + } + + #[test] + fn test_change_consensus_nucleotide_at_pos_with_deletion() { + let mut block = PangraphBlock::new( + BlockId(1), + "ATCG", + btreemap! { + NodeId(1) => e_dels(vec![d(1, 2)]), // Node 1 deletes positions 1-2 (TC) + NodeId(2) => Edit::empty(), + NodeId(3) => e_subs(vec![s(1, 'A')]), + NodeId(4) => e_subs(vec![s(1, 'G')]), + }, + ); + + let expected_block = PangraphBlock::new( + BlockId(1), + "AGCG", + btreemap! { + NodeId(1) => e_dels(vec![d(1, 2)]), + NodeId(2) => e_subs(vec![s(1, 'T')]), // Node 2 should get a reversion substitution T at position 1 + NodeId(3) => e_subs(vec![s(1, 'A')]), // Node 3's substitution remains unchanged + NodeId(4) => Edit::empty(), // Node 4's substitution is removed (now matches consensus) + }, + ); + + // Change position 1 from T to G + block.change_consensus_nucleotide_at_pos(1, 'G'.into()).unwrap(); + assert_eq!(block, expected_block); + } + + #[test] + fn test_change_consensus_nucleotide_at_pos_out_of_bounds() { + let mut block = PangraphBlock::new( + BlockId(1), + "ATCG", + btreemap! { + NodeId(1) => Edit::empty(), + }, + ); + + // Try to change position 4 (out of bounds for length 4) + let result = block.change_consensus_nucleotide_at_pos(4, 'A'.into()); + assert!(result.is_err()); + } + + #[test] + fn test_change_consensus_nucleotide_at_pos_same_character() { + let mut block = PangraphBlock::new( + BlockId(1), + "ATCG", + btreemap! { + NodeId(1) => Edit::empty(), + }, + ); + + // Try to change position 1 to the same character (T) + let result = block.change_consensus_nucleotide_at_pos(1, 'T'.into()); + assert!(result.is_err()); + assert!(result.unwrap_err().to_string().contains("already")); + } } From 0f8496d77fffd8dc65a63251744fa9ab10c08f8c Mon Sep 17 00:00:00 2001 From: Marco Molari Date: Sat, 2 Aug 2025 17:42:02 +0200 Subject: [PATCH 16/29] test: add unit tests for is_position_deleted --- packages/pangraph/src/pangraph/edits.rs | 43 +++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/packages/pangraph/src/pangraph/edits.rs b/packages/pangraph/src/pangraph/edits.rs index d23f64c6..1a6f9ea9 100644 --- a/packages/pangraph/src/pangraph/edits.rs +++ b/packages/pangraph/src/pangraph/edits.rs @@ -1201,4 +1201,47 @@ mod tests { let edit_empty = Edit::empty(); assert!(!edit_empty.has_subs()); } + + #[test] + fn test_is_position_deleted() { + // Edit with no deletions + let edit_no_dels = Edit::new(vec![Ins::new(10, "ATG")], vec![], vec![Sub::new(1, 'A')]); + assert!(!edit_no_dels.is_position_deleted(0)); + assert!(!edit_no_dels.is_position_deleted(5)); + assert!(!edit_no_dels.is_position_deleted(10)); + + // Edit with single deletion at positions 5-7 (length 3) + let edit_single_del = Edit::new(vec![], vec![Del::new(5, 3)], vec![]); + assert!(!edit_single_del.is_position_deleted(4)); // position before deletion + assert!(edit_single_del.is_position_deleted(5)); // first position of deletion + assert!(edit_single_del.is_position_deleted(6)); // middle position of deletion + assert!(edit_single_del.is_position_deleted(7)); // last position of deletion + assert!(!edit_single_del.is_position_deleted(8)); // position after deletion + + // Edit with multiple deletions + let edit_multiple_dels = Edit::new( + vec![], + vec![Del::new(2, 2), Del::new(8, 2)], // deletions at 2-3 and 8-9 + vec![], + ); + assert!(!edit_multiple_dels.is_position_deleted(1)); // before first deletion + assert!(edit_multiple_dels.is_position_deleted(2)); // in first deletion + assert!(edit_multiple_dels.is_position_deleted(3)); // in first deletion + assert!(!edit_multiple_dels.is_position_deleted(4)); // between deletions + assert!(!edit_multiple_dels.is_position_deleted(7)); // between deletions + assert!(edit_multiple_dels.is_position_deleted(8)); // in second deletion + assert!(edit_multiple_dels.is_position_deleted(9)); // in second deletion + assert!(!edit_multiple_dels.is_position_deleted(10)); // after last deletion + + // Edit with single-position deletion + let edit_single_pos_del = Edit::new(vec![], vec![Del::new(10, 1)], vec![]); + assert!(!edit_single_pos_del.is_position_deleted(9)); // position before + assert!(edit_single_pos_del.is_position_deleted(10)); // deleted position + assert!(!edit_single_pos_del.is_position_deleted(11)); // position after + + // Empty edit + let edit_empty = Edit::empty(); + assert!(!edit_empty.is_position_deleted(0)); + assert!(!edit_empty.is_position_deleted(100)); + } } From dd9abae44048230119143f1c25a27f389357ee90 Mon Sep 17 00:00:00 2001 From: Marco Molari Date: Sat, 2 Aug 2025 17:42:53 +0200 Subject: [PATCH 17/29] fix: correct apply_full_reconsensus function logic and change names --- .../pangraph/src/reconsensus/reconsensus.rs | 60 ++++++++----------- 1 file changed, 26 insertions(+), 34 deletions(-) diff --git a/packages/pangraph/src/reconsensus/reconsensus.rs b/packages/pangraph/src/reconsensus/reconsensus.rs index 31365d77..7ae9626f 100644 --- a/packages/pangraph/src/reconsensus/reconsensus.rs +++ b/packages/pangraph/src/reconsensus/reconsensus.rs @@ -57,7 +57,8 @@ pub fn reconsensus_graph( .blocks .get_mut(&block_id) .ok_or_else(|| make_report!("Block {} not found in graph", block_id))?; - apply_mutation_reconsensus(block, &edits.subs).wrap_err_with(|| format!("When processing block {}", block.id())) + block_reconsensus_via_substitutions(block, &edits.subs) + .wrap_err_with(|| format!("When processing block {} for reconsensus via substitutions", block.id())) })?; // Handle blocks requiring realignment @@ -73,8 +74,8 @@ pub fn reconsensus_graph( .ok_or_else(|| make_report!("Block {} not found in graph", block_id))?; // Apply full reconsensus with realignment - apply_full_reconsensus(&mut block, &edits, args) - .wrap_err_with(|| format!("When processing block {}", block.id()))?; + block_reconsensus_via_realignment(&mut block, &edits, args) + .wrap_err_with(|| format!("When processing block {} for reconsensus via realignment", block.id()))?; // Return the realigned block if successful Ok::(block) }) @@ -118,48 +119,38 @@ fn analyze_blocks_for_reconsensus(graph: &Pangraph, block_ids: &[BlockId]) -> Bl } } -/// Applies only mutation reconsensus without realignment -fn apply_mutation_reconsensus(block: &mut PangraphBlock, subs: &[Sub]) -> Result<(), Report> { +/// Applies a set of substitutions to the block's consensus sequence. +/// Does not re-align the sequences. +fn block_reconsensus_via_substitutions(block: &mut PangraphBlock, subs: &[Sub]) -> Result<(), Report> { subs.iter().try_for_each(|sub| { - let original = block.consensus()[sub.pos]; - - // Update consensus - block.set_consensus_char(sub.pos, sub.alt); - - // Update alignments block - .alignments_mut() - .values_mut() - .try_for_each(|edit| edit.reconcile_substitution_with_consensus(sub, original)) + .change_consensus_nucleotide_at_pos(sub.pos, sub.alt) + .wrap_err_with(|| format!("Failed to apply mutation at pos {}: {}", sub.pos, sub.alt)) }) } -/// Applies full reconsensus including indels and realignment -fn apply_full_reconsensus( +/// Applies full set of edits (substitutions and indels) +/// to the block's consensus sequence, and then re-aligns the sequences +/// to the new consensus. +fn block_reconsensus_via_realignment( block: &mut PangraphBlock, majority_edits: &Edit, args: &PangraphBuildArgs, ) -> Result<(), Report> { - // First apply mutations - apply_mutation_reconsensus(block, &majority_edits.subs)?; - - // Then apply indels and realign if present - if majority_edits.has_indels() { - let consensus = block.consensus(); - let new_consensus = majority_edits.apply(consensus)?; + // original consensus + let consensus = block.consensus(); + // apply edits to the consensus + let new_consensus = majority_edits.apply(consensus)?; - let band_params = BandParameters::from_edits(majority_edits, block.consensus_len())?; - - debug_assert!(!new_consensus.is_empty(), "Consensus is empty after indels"); - - update_block_consensus(block, &new_consensus, band_params, args)?; - } + let band_params = BandParameters::from_edits(majority_edits, block.consensus_len())?; + debug_assert!(!new_consensus.is_empty(), "Consensus is empty after indels"); + update_block_alignments_to_new_consensus(block, &new_consensus, band_params, args)?; Ok(()) } /// Updates the consensus sequence of the block and re-aligns the sequences to the new consensus. -fn update_block_consensus( +fn update_block_alignments_to_new_consensus( block: &mut PangraphBlock, consensus: &Seq, new_consensus_band_params: BandParameters, @@ -349,7 +340,7 @@ mod tests { let mut block = block_1(); let expected_block = block_1_mut_reconsensus(); let subs = block.find_majority_substitutions(); - apply_mutation_reconsensus(&mut block, &subs).unwrap(); + block_reconsensus_via_substitutions(&mut block, &subs).unwrap(); assert_eq!(block, expected_block); } @@ -387,7 +378,7 @@ mod tests { let majority_edits = block.find_majority_edits(); // Apply mutations - apply_mutation_reconsensus(&mut block, &majority_edits.subs).unwrap(); + block_reconsensus_via_substitutions(&mut block, &majority_edits.subs).unwrap(); // Apply indels if present let has_indels = majority_edits.has_indels(); @@ -395,7 +386,8 @@ mod tests { let consensus = block.consensus(); let new_consensus = majority_edits.apply(consensus).unwrap(); let band_params = BandParameters::from_edits(&majority_edits, block.consensus_len()).unwrap(); - update_block_consensus(&mut block, &new_consensus, band_params, &PangraphBuildArgs::default()).unwrap(); + update_block_alignments_to_new_consensus(&mut block, &new_consensus, band_params, &PangraphBuildArgs::default()) + .unwrap(); } assert!(has_indels); // This test expects realignment to have occurred @@ -411,7 +403,7 @@ mod tests { let majority_edits = block.find_majority_edits(); // Apply mutations - apply_mutation_reconsensus(&mut block, &majority_edits.subs).unwrap(); + block_reconsensus_via_substitutions(&mut block, &majority_edits.subs).unwrap(); // Check that no indels require re-alignment let has_indels = majority_edits.has_indels(); From c96826baf5b52664f4056b44adc4331912af2888 Mon Sep 17 00:00:00 2001 From: Marco Molari Date: Sun, 3 Aug 2025 12:19:14 +0200 Subject: [PATCH 18/29] feat: move consensus update to block method edit_consensus_and_realign --- .../pangraph/src/pangraph/pangraph_block.rs | 37 ++++++ .../pangraph/src/reconsensus/reconsensus.rs | 105 +++--------------- 2 files changed, 50 insertions(+), 92 deletions(-) diff --git a/packages/pangraph/src/pangraph/pangraph_block.rs b/packages/pangraph/src/pangraph/pangraph_block.rs index 881e2c8c..41e49267 100644 --- a/packages/pangraph/src/pangraph/pangraph_block.rs +++ b/packages/pangraph/src/pangraph/pangraph_block.rs @@ -1,3 +1,5 @@ +use crate::align::map_variations::{BandParameters, map_variations}; +use crate::commands::build::build_args::PangraphBuildArgs; use crate::io::fasta::FastaRecord; use crate::io::json::{JsonPretty, json_write_str}; use crate::io::seq::reverse_complement; @@ -285,6 +287,41 @@ impl PangraphBlock { Ok(()) } + + /// Applies a set of edits to the block's consensus sequence and re-aligns the sequences + /// to the new consensus. Returns a new `PangraphBlock` object with the same BlockId. + pub fn edit_consensus_and_realign(self, edits: &Edit, args: &PangraphBuildArgs) -> Result { + // apply the edits to the consensus + let new_consensus = edits.apply(&self.consensus)?; + debug_assert!(!new_consensus.is_empty(), "Consensus cannot be empty"); + + // calculate alignment band parameters + let band_params = BandParameters::from_edits(edits, self.consensus_len())?; + + // realign the block sequences to the new consensus + let new_alignments = self + .alignments() + .iter() + .map(|(&nid, edit)| { + let seq = edit.apply(&self.consensus)?; + debug_assert!(!seq.is_empty(), "Aligned sequence cannot be empty"); + let old_band_params = BandParameters::from_edits(edit, self.consensus_len())?; + let updated_band_params = BandParameters::new( + old_band_params.mean_shift() - band_params.mean_shift(), + old_band_params.band_width() + band_params.band_width(), + ); + let new_edits = map_variations(&new_consensus, &seq, updated_band_params, args)?; + Ok((nid, new_edits)) + }) + .collect::, Report>>()?; + + // create a new block with the updated alignments + Ok(PangraphBlock { + id: self.id(), + consensus: new_consensus, + alignments: new_alignments, + }) + } } #[derive(Copy, Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)] diff --git a/packages/pangraph/src/reconsensus/reconsensus.rs b/packages/pangraph/src/reconsensus/reconsensus.rs index 7ae9626f..1c64c8db 100644 --- a/packages/pangraph/src/reconsensus/reconsensus.rs +++ b/packages/pangraph/src/reconsensus/reconsensus.rs @@ -1,20 +1,14 @@ -use crate::align::map_variations::{BandParameters, map_variations}; use crate::commands::build::build_args::PangraphBuildArgs; +use crate::make_report; use crate::pangraph::detach_unaligned::detach_unaligned_nodes; use crate::pangraph::edits::Edit; use crate::pangraph::edits::Sub; use crate::pangraph::pangraph::Pangraph; use crate::pangraph::pangraph_block::{BlockId, PangraphBlock}; -use crate::pangraph::pangraph_node::NodeId; -use crate::{make_error, make_report}; -// use crate::reconsensus::remove_nodes::remove_emtpy_nodes; use crate::reconsensus::remove_nodes::find_empty_nodes; -use crate::representation::seq::Seq; use eyre::{Context, Report}; use itertools::{Either, Itertools}; -use rayon::prelude::*; use serde::{Deserialize, Serialize}; -use std::collections::BTreeMap; /// Result of analyzing blocks for reconsensus operation #[derive(Clone, Debug, Serialize, Deserialize)] @@ -68,16 +62,18 @@ pub fn reconsensus_graph( .into_iter() .map(|(block_id, edits)| { // Pop the realigned blocks from graph.blocks - let mut block = graph + let block = graph .blocks .remove(&block_id) .ok_or_else(|| make_report!("Block {} not found in graph", block_id))?; // Apply full reconsensus with realignment - block_reconsensus_via_realignment(&mut block, &edits, args) - .wrap_err_with(|| format!("When processing block {} for reconsensus via realignment", block.id()))?; + // block_reconsensus_via_realignment(&mut block, &edits, args) + let new_block = block + .edit_consensus_and_realign(&edits, args) + .wrap_err_with(|| "When processing block for reconsensus via realignment")?; // Return the realigned block if successful - Ok::(block) + Ok::(new_block) }) .collect::, _>>()?; @@ -129,73 +125,6 @@ fn block_reconsensus_via_substitutions(block: &mut PangraphBlock, subs: &[Sub]) }) } -/// Applies full set of edits (substitutions and indels) -/// to the block's consensus sequence, and then re-aligns the sequences -/// to the new consensus. -fn block_reconsensus_via_realignment( - block: &mut PangraphBlock, - majority_edits: &Edit, - args: &PangraphBuildArgs, -) -> Result<(), Report> { - // original consensus - let consensus = block.consensus(); - // apply edits to the consensus - let new_consensus = majority_edits.apply(consensus)?; - - let band_params = BandParameters::from_edits(majority_edits, block.consensus_len())?; - debug_assert!(!new_consensus.is_empty(), "Consensus is empty after indels"); - - update_block_alignments_to_new_consensus(block, &new_consensus, band_params, args)?; - Ok(()) -} - -/// Updates the consensus sequence of the block and re-aligns the sequences to the new consensus. -fn update_block_alignments_to_new_consensus( - block: &mut PangraphBlock, - consensus: &Seq, - new_consensus_band_params: BandParameters, - args: &PangraphBuildArgs, -) -> Result<(), Report> { - // Reconstruct block sequences - let seqs = block - .alignments() - .iter() - .map(|(&nid, edit)| { - let seq = edit.apply(block.consensus())?; - let old_band_params = BandParameters::from_edits(edit, block.consensus_len())?; - let updated_band_params = BandParameters::new( - old_band_params.mean_shift() - new_consensus_band_params.mean_shift(), - old_band_params.band_width() + new_consensus_band_params.band_width(), - ); - Ok((nid, (seq, updated_band_params))) - }) - .collect::, Report>>()?; - - // debug assets: all sequences are non-empty - #[cfg(any(debug_assertions, test))] - { - if let Some((nid, (_seq, _))) = seqs.iter().find(|(_, (seq, _))| seq.is_empty()) { - return make_error!( - "node is empty!\nblock: {}\nnode: {}\nedits: {:?}\nconsensus: {}", - block.id(), - nid, - block.alignments().get(nid), - block.consensus() - ); - } - } - - // Re-align sequences - let alignments = seqs - .into_par_iter() - .map(|(nid, (seq, band_params))| Ok((nid, map_variations(consensus, &seq, band_params, args)?))) - .collect::>()?; - - *block = PangraphBlock::new(block.id(), consensus, alignments); - - Ok(()) -} - #[cfg(test)] mod tests { use super::*; @@ -205,6 +134,7 @@ mod tests { use crate::pangraph::pangraph_node::PangraphNode; use crate::pangraph::pangraph_path::{PangraphPath, PathId}; use crate::pangraph::strand::Strand::{Forward, Reverse}; + use crate::representation::seq::Seq; use crate::utils::id::id; use maplit::btreemap; @@ -372,25 +302,16 @@ mod tests { #[test] fn test_reconsensus() { - let mut block = block_3(); + let block = block_3(); let expected_block = block_3_reconsensus(); let majority_edits = block.find_majority_edits(); - // Apply mutations - block_reconsensus_via_substitutions(&mut block, &majority_edits.subs).unwrap(); + let block = block + .edit_consensus_and_realign(&majority_edits, &PangraphBuildArgs::default()) + .unwrap(); - // Apply indels if present - let has_indels = majority_edits.has_indels(); - if has_indels { - let consensus = block.consensus(); - let new_consensus = majority_edits.apply(consensus).unwrap(); - let band_params = BandParameters::from_edits(&majority_edits, block.consensus_len()).unwrap(); - update_block_alignments_to_new_consensus(&mut block, &new_consensus, band_params, &PangraphBuildArgs::default()) - .unwrap(); - } - - assert!(has_indels); // This test expects realignment to have occurred + assert!(majority_edits.has_indels()); // This test expects realignment to have occurred assert_eq!(block.consensus(), expected_block.consensus()); assert_eq!(block.alignments(), expected_block.alignments()); } From 267f5b76227d88695008eb3e0d93c8de8a2c731d Mon Sep 17 00:00:00 2001 From: Marco Molari Date: Sun, 3 Aug 2025 12:51:33 +0200 Subject: [PATCH 19/29] docs: improve reconsensus.rs inline comments --- .../pangraph/src/reconsensus/reconsensus.rs | 34 +++++++++++-------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/packages/pangraph/src/reconsensus/reconsensus.rs b/packages/pangraph/src/reconsensus/reconsensus.rs index 1c64c8db..f3f4ffcc 100644 --- a/packages/pangraph/src/reconsensus/reconsensus.rs +++ b/packages/pangraph/src/reconsensus/reconsensus.rs @@ -24,13 +24,11 @@ struct BlockAnalysis { /// - removes potentially empty nodes /// /// Reconsensus function: -/// - for blocks that: -/// - originate from a new merger -/// - have depth > 2 -/// - goes through each block and re-defines the consensus -/// - for mutations, changes any mutated position. Also update the alignment (this is straightforward) -/// - for any in/del present in > M/2 sites, appends it to the consensus -/// - if the consensus has updated indels, then re-aligns all the sequences to the new consensus +/// for blocks that originate from a new merger: +/// - check whether blocks have majority substitutions, deletions, or insertions +/// - for blocks with only majority substitutions, updates the consensus +/// and transfers the substitutions to the alignment +/// - for blocks with majority indels, updates the consensus and realigns the sequences. pub fn reconsensus_graph( graph: &mut Pangraph, ids_updated_blocks: &[BlockId], @@ -42,16 +40,18 @@ pub fn reconsensus_graph( "Empty nodes found in the graph" ); - // Analyze blocks and determine which need realignment + // Analyze blocks, find majority edits and determine which need realignment let analysis = analyze_blocks_for_reconsensus(graph, ids_updated_blocks); // Apply mutation-only reconsensus (no realignment needed) analysis.mutations_only.into_iter().try_for_each(|(block_id, edits)| { + // get mutable block let block = graph .blocks .get_mut(&block_id) .ok_or_else(|| make_report!("Block {} not found in graph", block_id))?; - block_reconsensus_via_substitutions(block, &edits.subs) + // apply the substitions to the block consensus and alignment + apply_substitutions_to_block(block, &edits.subs) .wrap_err_with(|| format!("When processing block {} for reconsensus via substitutions", block.id())) })?; @@ -67,8 +67,8 @@ pub fn reconsensus_graph( .remove(&block_id) .ok_or_else(|| make_report!("Block {} not found in graph", block_id))?; - // Apply full reconsensus with realignment - // block_reconsensus_via_realignment(&mut block, &edits, args) + // apply edits to the block consensus and re-align sequences + // returns a new block let new_block = block .edit_consensus_and_realign(&edits, args) .wrap_err_with(|| "When processing block for reconsensus via realignment")?; @@ -80,6 +80,8 @@ pub fn reconsensus_graph( // Apply detach_unaligned_nodes. This removes unaligned nodes and re-adds them to // the `realigned_blocks` list as new blocks. // it also modifies the nodes dictionary accordingly. + // Nb: This is done to handle the edge-case when re-alignment could generate + // nodes that are not empty, but have zero aligned nucleotides to the consensus detach_unaligned_nodes(&mut realigned_blocks, &mut graph.nodes)?; // Re-add all the blocks (including potentially new singleton blocks) to graph.blocks @@ -100,8 +102,10 @@ fn analyze_blocks_for_reconsensus(graph: &Pangraph, block_ids: &[BlockId]) -> Bl let majority_edits = block.find_majority_edits(); if majority_edits.has_indels() { + // the block needs full realignment Some(Either::Right((block_id, majority_edits))) } else if majority_edits.has_subs() { + // no re-alignment needed, substitutions are sufficient Some(Either::Left((block_id, majority_edits))) } else { None // Blocks with no variants are skipped @@ -115,9 +119,9 @@ fn analyze_blocks_for_reconsensus(graph: &Pangraph, block_ids: &[BlockId]) -> Bl } } -/// Applies a set of substitutions to the block's consensus sequence. +/// Applies a set of substitutions to the block's consensus sequence and alignment. /// Does not re-align the sequences. -fn block_reconsensus_via_substitutions(block: &mut PangraphBlock, subs: &[Sub]) -> Result<(), Report> { +fn apply_substitutions_to_block(block: &mut PangraphBlock, subs: &[Sub]) -> Result<(), Report> { subs.iter().try_for_each(|sub| { block .change_consensus_nucleotide_at_pos(sub.pos, sub.alt) @@ -270,7 +274,7 @@ mod tests { let mut block = block_1(); let expected_block = block_1_mut_reconsensus(); let subs = block.find_majority_substitutions(); - block_reconsensus_via_substitutions(&mut block, &subs).unwrap(); + apply_substitutions_to_block(&mut block, &subs).unwrap(); assert_eq!(block, expected_block); } @@ -324,7 +328,7 @@ mod tests { let majority_edits = block.find_majority_edits(); // Apply mutations - block_reconsensus_via_substitutions(&mut block, &majority_edits.subs).unwrap(); + apply_substitutions_to_block(&mut block, &majority_edits.subs).unwrap(); // Check that no indels require re-alignment let has_indels = majority_edits.has_indels(); From 493a7fe1aa064892b99477aa73ea8da85af4b7d0 Mon Sep 17 00:00:00 2001 From: Marco Molari Date: Sun, 3 Aug 2025 13:03:53 +0200 Subject: [PATCH 20/29] docs: inline comments in pangraph_block.rs --- packages/pangraph/src/pangraph/pangraph_block.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/packages/pangraph/src/pangraph/pangraph_block.rs b/packages/pangraph/src/pangraph/pangraph_block.rs index 41e49267..4343f5ff 100644 --- a/packages/pangraph/src/pangraph/pangraph_block.rs +++ b/packages/pangraph/src/pangraph/pangraph_block.rs @@ -303,13 +303,19 @@ impl PangraphBlock { .alignments() .iter() .map(|(&nid, edit)| { + // reconstruct the alignment sequence let seq = edit.apply(&self.consensus)?; debug_assert!(!seq.is_empty(), "Aligned sequence cannot be empty"); + + // calculate the alignment band parameters from the orgiginal alignment plus the displacement + // given by the edits applied to the original consensus let old_band_params = BandParameters::from_edits(edit, self.consensus_len())?; let updated_band_params = BandParameters::new( old_band_params.mean_shift() - band_params.mean_shift(), old_band_params.band_width() + band_params.band_width(), ); + + // re-align the sequence and returns the new set of edits let new_edits = map_variations(&new_consensus, &seq, updated_band_params, args)?; Ok((nid, new_edits)) }) From 947304532ad40501ff1e71b35629c550a16cd9a6 Mon Sep 17 00:00:00 2001 From: Marco Molari Date: Sun, 3 Aug 2025 13:21:19 +0200 Subject: [PATCH 21/29] test: add unit test for reverse_complement method in PangraphBlock --- .../pangraph/src/pangraph/pangraph_block.rs | 26 +++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/packages/pangraph/src/pangraph/pangraph_block.rs b/packages/pangraph/src/pangraph/pangraph_block.rs index 4343f5ff..65a37929 100644 --- a/packages/pangraph/src/pangraph/pangraph_block.rs +++ b/packages/pangraph/src/pangraph/pangraph_block.rs @@ -748,4 +748,30 @@ mod tests { assert!(result.is_err()); assert!(result.unwrap_err().to_string().contains("already")); } + + #[test] + fn test_reverse_complement() { + let block = PangraphBlock::new( + BlockId(1), + "ATCG", + btreemap! { + NodeId(1) => Edit::new(vec![i(1, "AA")], vec![d(2, 1)], vec![s(0, 'G')]), + NodeId(2) => Edit::new(vec![], vec![], vec![s(1, 'G'), s(3, 'A')]), + NodeId(3) => Edit::empty(), + }, + ); + + let expected_block = PangraphBlock::new( + BlockId(1), + "CGAT", + btreemap! { + NodeId(1) => Edit::new(vec![i(3, "TT")], vec![d(1, 1)], vec![s(3, 'C')]), + NodeId(2) => Edit::new(vec![], vec![], vec![s(0, 'T'), s(2, 'C')]), + NodeId(3) => Edit::empty(), + }, + ); + + let rev_block = block.reverse_complement().unwrap(); + assert_eq!(rev_block, expected_block); + } } From 8064456c464c9599937877ded4bcc732fc03b942 Mon Sep 17 00:00:00 2001 From: Marco Molari Date: Sun, 3 Aug 2025 13:21:46 +0200 Subject: [PATCH 22/29] feat: sort reverse complement results for subs, dels, and inss in Edit struct --- packages/pangraph/src/pangraph/edits.rs | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/packages/pangraph/src/pangraph/edits.rs b/packages/pangraph/src/pangraph/edits.rs index 1a6f9ea9..891c6ab9 100644 --- a/packages/pangraph/src/pangraph/edits.rs +++ b/packages/pangraph/src/pangraph/edits.rs @@ -233,19 +233,22 @@ impl Edit { } pub fn reverse_complement(&self, len: usize) -> Result { - let subs = self + let mut subs = self .subs .iter() .map(|s| s.reverse_complement(len)) .collect::, Report>>()?; + subs.sort_by_key(|s| s.pos); - let dels = self.dels.iter().map(|d| d.reverse_complement(len)).collect_vec(); + let mut dels = self.dels.iter().map(|d| d.reverse_complement(len)).collect_vec(); + dels.sort_by_key(|d| d.pos); - let inss = self + let mut inss = self .inss .iter() .map(|i| i.reverse_complement(len)) .collect::, Report>>()?; + inss.sort_by_key(|i| i.pos); Ok(Self { subs, dels, inss }) } From 86ad1bc2df26f226c7da6209019f4dd1961f1ee7 Mon Sep 17 00:00:00 2001 From: Marco Molari Date: Sun, 3 Aug 2025 16:14:21 +0200 Subject: [PATCH 23/29] test: add unit tests for `reconcile_substitution_with_consensus` --- packages/pangraph/src/pangraph/edits.rs | 88 ++++++++++++++++++++++++- 1 file changed, 85 insertions(+), 3 deletions(-) diff --git a/packages/pangraph/src/pangraph/edits.rs b/packages/pangraph/src/pangraph/edits.rs index 891c6ab9..80b5bd39 100644 --- a/packages/pangraph/src/pangraph/edits.rs +++ b/packages/pangraph/src/pangraph/edits.rs @@ -167,7 +167,7 @@ impl Edit { } /// Removes substitution if it matches the new consensus character - pub fn remove_matching_substitution(&mut self, substitution: &Sub) { + pub fn remove_substitution_if_matching(&mut self, substitution: &Sub) { if let Some(existing_sub) = self.subs.iter().find(|s| s.pos == substitution.pos) { if existing_sub.alt == substitution.alt { self @@ -183,9 +183,16 @@ impl Edit { } /// Reconciles genome alignment when consensus changes due to a substitution. + /// Takes as argument the substitution and the original nucleotide. /// /// During reconsensus, when a position in the consensus sequence is updated with a new /// character, this method adjusts the genome's edit to maintain correct alignment. + /// - if the position already has a substitution: + /// - removes it if matching the new consensus + /// - keeps it if non-matching the new consensus + /// - if the position does not already have a substitution: + /// - if it's not deleted, a reversion substitution is added. + /// - otherwise if the position is deleted nothing is done pub fn reconcile_substitution_with_consensus( &mut self, substitution: &Sub, @@ -196,8 +203,17 @@ impl Edit { match subs_count { // If genome has no mutation at this position: adds reversion to original character 0 => self.add_reversion_if_not_deleted(substitution.pos, original), - // If genome has matching mutation: removes it (now matches new consensus) - 1 => self.remove_matching_substitution(substitution), + // If genome has matching mutation: removes it if it matches the new consensus + 1 => { + if self.is_position_deleted(substitution.pos) { + return make_error!( + "At position {}: sequence has both a substitution and a deletion {:?}", + substitution.pos, + self + ); + } + self.remove_substitution_if_matching(substitution); + }, // If genome has conflicting mutations: returns error for inconsistent state _ => { let subs_at_pos = self.subs_at_position(substitution.pos); @@ -306,6 +322,8 @@ impl Edit { Ok(qry) } + /// Apply the edits to the query sequence to obtain the aligned query + /// Nb: insertions are missing. pub fn apply_aligned(&self, reff: impl Into) -> Result { let mut qry = reff.into(); @@ -1247,4 +1265,68 @@ mod tests { assert!(!edit_empty.is_position_deleted(0)); assert!(!edit_empty.is_position_deleted(100)); } + + #[test] + fn test_reconcile_substitution_with_consensus_no_existing_sub_not_deleted() { + // Case: No existing substitution at position, position not deleted + // Should add reversion to original character + let mut edit = Edit::new(vec![], vec![], vec![Sub::new(1, 'G')]); + let substitution = Sub::new(5, 'T'); + let original = AsciiChar(b'A'); + let expected_edit = Edit::new(vec![], vec![], vec![Sub::new(1, 'G'), Sub::new(5, 'A')]); + + edit + .reconcile_substitution_with_consensus(&substitution, original) + .unwrap(); + + assert_eq!(edit, expected_edit); + } + + #[test] + fn test_reconcile_substitution_with_consensus_no_existing_sub_deleted() { + // Case: No existing substitution at position, position is deleted + // Should do nothing (no reversion added) + let mut edit = Edit::new(vec![], vec![Del::new(5, 3)], vec![Sub::new(1, 'G')]); // deletion covers positions 5-7 + let substitution = Sub::new(7, 'T'); // position 6 is within deletion + let original = AsciiChar(b'A'); + let expected_edit = Edit::new(vec![], vec![Del::new(5, 3)], vec![Sub::new(1, 'G')]); + + edit + .reconcile_substitution_with_consensus(&substitution, original) + .unwrap(); + + assert_eq!(edit, expected_edit); + } + + #[test] + fn test_reconcile_substitution_with_consensus_matching_existing_sub() { + // Case: One existing substitution that matches the new consensus + // Should remove the existing substitution + let mut edit = Edit::new(vec![], vec![], vec![Sub::new(5, 'T')]); + let substitution = Sub::new(5, 'T'); // matches existing substitution + let original = AsciiChar(b'A'); + let expected_edit = Edit::empty(); + + edit + .reconcile_substitution_with_consensus(&substitution, original) + .unwrap(); + + assert_eq!(edit, expected_edit); + } + + #[test] + fn test_reconcile_substitution_with_consensus_non_matching_existing_sub() { + // Case: One existing substitution that doesn't match the new consensus + // Should keep the existing substitution unchanged + let mut edit = Edit::new(vec![], vec![], vec![Sub::new(5, 'G')]); + let substitution = Sub::new(5, 'T'); // different from existing substitution + let original = AsciiChar(b'A'); + let expected_edit = Edit::new(vec![], vec![], vec![Sub::new(5, 'G')]); + + edit + .reconcile_substitution_with_consensus(&substitution, original) + .unwrap(); + + assert_eq!(edit, expected_edit); + } } From ca15a1df924218544dbec329c0b2178e5fa0adf5 Mon Sep 17 00:00:00 2001 From: Marco Molari Date: Sun, 3 Aug 2025 16:50:00 +0200 Subject: [PATCH 24/29] test: add unit test for `edit_consensus_and_realign` --- .../pangraph/src/pangraph/pangraph_block.rs | 48 ++++++++++++++++++- 1 file changed, 47 insertions(+), 1 deletion(-) diff --git a/packages/pangraph/src/pangraph/pangraph_block.rs b/packages/pangraph/src/pangraph/pangraph_block.rs index 65a37929..c4040ca8 100644 --- a/packages/pangraph/src/pangraph/pangraph_block.rs +++ b/packages/pangraph/src/pangraph/pangraph_block.rs @@ -307,7 +307,7 @@ impl PangraphBlock { let seq = edit.apply(&self.consensus)?; debug_assert!(!seq.is_empty(), "Aligned sequence cannot be empty"); - // calculate the alignment band parameters from the orgiginal alignment plus the displacement + // calculate the alignment band parameters from the original alignment plus the displacement // given by the edits applied to the original consensus let old_band_params = BandParameters::from_edits(edit, self.consensus_len())?; let updated_band_params = BandParameters::new( @@ -340,6 +340,7 @@ pub enum RecordNaming { #[cfg(test)] mod tests { use super::*; + use crate::commands::build::build_args::PangraphBuildArgs; use crate::pangraph::edits::{Del, Edit, Ins, Sub}; use crate::pangraph::pangraph_node::NodeId; use maplit::btreemap; @@ -774,4 +775,49 @@ mod tests { let rev_block = block.reverse_complement().unwrap(); assert_eq!(rev_block, expected_block); } + + #[test] + fn test_edit_consensus_and_realign() { + // Create a simple block with consensus "ATCG" and some alignments + let block = PangraphBlock::new( + BlockId(1), + "ATCGGCGATG", + btreemap! { + NodeId(1) => Edit::empty(), + NodeId(2) => Edit::new(vec![i(10, "AAA")], vec![], vec![s(0, 'G')]), + NodeId(2) => Edit::new(vec![], vec![d(6,2)], vec![s(2, 'G')]), + }, + ); + // Apply a substitution edit to change consensus position 0 from A to G + let edits = Edit::new(vec![i(10, "AAA")], vec![d(6, 2)], vec![s(0, 'G')]); + + let expected_block = PangraphBlock::new( + BlockId(1), + "GTCGGCTGAAA", + btreemap! { + NodeId(1) => Edit::new( + vec![i(6, "GA")], + vec![d(8, 3)], + vec![s(0, 'A')] + ), + NodeId(2) => Edit::new( + vec![i(6, "GA")], + vec![], + vec![] + ), + NodeId(2) => Edit::new( + vec![], + vec![d(8, 3)], + vec![s(0, 'A'), s(2, 'G')] + ), + }, + ); + + // Create build args with default values for testing + let args = PangraphBuildArgs::default(); + // Apply the edits and realign + let result_block = block.edit_consensus_and_realign(&edits, &args).unwrap(); + + assert_eq!(result_block, expected_block); + } } From fbe9c0d2f96dae907064eeacde42c5d7b0c9e7c0 Mon Sep 17 00:00:00 2001 From: Marco Molari Date: Sun, 3 Aug 2025 18:49:29 +0200 Subject: [PATCH 25/29] test: reconsensus unit tests update --- .../pangraph/src/reconsensus/reconsensus.rs | 312 +++++++++++++----- 1 file changed, 222 insertions(+), 90 deletions(-) diff --git a/packages/pangraph/src/reconsensus/reconsensus.rs b/packages/pangraph/src/reconsensus/reconsensus.rs index f3f4ffcc..436b9e56 100644 --- a/packages/pangraph/src/reconsensus/reconsensus.rs +++ b/packages/pangraph/src/reconsensus/reconsensus.rs @@ -144,44 +144,105 @@ mod tests { use maplit::btreemap; use pretty_assertions::assert_eq; + fn block_0() -> PangraphBlock { + let consensus = "ATGCGATCGATCGA"; + // 01234567890123 + // node 1) .C............ (mutation at pos 1: T->C) + // node 2) .C............ (mutation at pos 1: T->C) + // node 3) .C............ (mutation at pos 1: T->C) + // node 4) ..........G... (mutation at pos 10: C->G) + // node 5) ..........G... (mutation at pos 10: C->G) + // L = 14, N = 5 + // Position 1: T->C appears in 3/5 sequences (majority) + // Position 10: C->G appears in 2/5 sequences (not majority) + let aln = btreemap! { + NodeId(1) => Edit::new(vec![], vec![], vec![Sub::new(1, 'C')]), + NodeId(2) => Edit::new(vec![], vec![], vec![Sub::new(1, 'C')]), + NodeId(3) => Edit::new(vec![], vec![], vec![Sub::new(1, 'C')]), + NodeId(4) => Edit::new(vec![], vec![], vec![Sub::new(10, 'G')]), + NodeId(5) => Edit::new(vec![], vec![], vec![Sub::new(10, 'G')]), + }; + PangraphBlock::new(BlockId(0), consensus, aln) + } + + fn block_0_reconsensus() -> PangraphBlock { + let consensus = "ACGCGATCGATCGA"; + // 01234567890123 + // node 1) .............. (no mutations after consensus update) + // node 2) .............. (no mutations after consensus update) + // node 3) .............. (no mutations after consensus update) + // node 4) .T........G... (gets T at pos 1, keeps G at pos 10) + // node 5) .T........G... (gets T at pos 1, keeps G at pos 10) + // L = 14, N = 5 + // After reconsensus: consensus[1] = 'C' (majority), original T becomes mutation in nodes 4,5 + let aln = btreemap! { + NodeId(1) => Edit::new(vec![], vec![], vec![]), + NodeId(2) => Edit::new(vec![], vec![], vec![]), + NodeId(3) => Edit::new(vec![], vec![], vec![]), + NodeId(4) => Edit::new(vec![], vec![], vec![Sub::new(1, 'T'), Sub::new(10, 'G')]), + NodeId(5) => Edit::new(vec![], vec![], vec![Sub::new(1, 'T'), Sub::new(10, 'G')]), + }; + PangraphBlock::new(BlockId(0), consensus, aln) + } + fn block_1() -> PangraphBlock { let consensus = "AGGACTTCGATCTATTCGGAGAA"; // 0 1 2 // 01234567890123456789012 - // node 1) .T...--..........A..... - // node 2) .T...--...C......|..... + // node 1) .T...--.........|A..... + // node 2) .T...--...C............ // node 3) .T...--...C.....--..... // node 4) .C.......---.....A..... - // node 5) .....|...........A..... + // node 5) .....|-..........A..... // L = 23, N = 5 let aln = btreemap! { - NodeId(1) => Edit::new(vec![], vec![Del::new(5, 2)], vec![Sub::new(1, 'T'), Sub::new(17, 'A')]), - NodeId(2) => Edit::new(vec![Ins::new(17, "CAAT")], vec![Del::new(5, 2)], vec![Sub::new(1, 'T'), Sub::new(10, 'C')]), - NodeId(3) => Edit::new(vec![], vec![Del::new(5, 2), Del::new(16,2)], vec![Sub::new(1, 'T'), Sub::new(10, 'C')]), - NodeId(4) => Edit::new(vec![], vec![Del::new(9, 3)], vec![Sub::new(1, 'C'), Sub::new(17, 'A')]), - NodeId(5) => Edit::new(vec![], vec![Del::new(5, 2)], vec![Sub::new(17, 'A')]), + NodeId(1) => Edit::new(vec![Ins::new(17, "TTTT")], vec![Del::new(5, 2)], vec![Sub::new(1, 'T'), Sub::new(17, 'A')]), + NodeId(2) => Edit::new(vec![], vec![Del::new(5, 2)], vec![Sub::new(1, 'T'), Sub::new(10, 'C')]), + NodeId(3) => Edit::new(vec![], vec![Del::new(5, 2), Del::new(16,2)], vec![Sub::new(1, 'T'), Sub::new(10, 'C')]), + NodeId(4) => Edit::new(vec![], vec![Del::new(9, 3)], vec![Sub::new(1, 'C'), Sub::new(17, 'A')]), + NodeId(5) => Edit::new(vec![Ins::new(5, "AA")], vec![Del::new(5, 2)], vec![Sub::new(17, 'A')]), }; - PangraphBlock::new(BlockId(0), consensus, aln) + PangraphBlock::new(BlockId(1), consensus, aln) } fn block_1_mut_reconsensus() -> PangraphBlock { let consensus = "ATGACTTCGATCTATTCAGAGAA"; // 0 1 2 // 01234567890123456789012 - // node 1) .....--................ - // node 2) .....--...C......G|.... + // node 1) .....--..........|..... + // node 2) .....--...C......G..... // node 3) .....--...C.....--..... // node 4) .C.......---........... - // node 5) .G...|................. + // node 5) .G...|-................ // L = 23, N = 5 let aln = btreemap! { - NodeId(1) => Edit::new(vec![], vec![Del::new(5, 2)], vec![]), - NodeId(2) => Edit::new(vec![Ins::new(17, "CAAT")], vec![Del::new(5, 2)], vec![Sub::new(10, 'C'), Sub::new(17, 'G')]), - NodeId(3) => Edit::new(vec![], vec![Del::new(5, 2), Del::new(16,2)], vec![Sub::new(10, 'C')]), - NodeId(4) => Edit::new(vec![], vec![Del::new(9, 3)], vec![Sub::new(1, 'C')]), - NodeId(5) => Edit::new(vec![], vec![Del::new(5, 2)], vec![Sub::new(1, 'G')]), + NodeId(1) => Edit::new(vec![Ins::new(17, "TTTT")], vec![Del::new(5, 2)], vec![]), + NodeId(2) => Edit::new(vec![], vec![Del::new(5, 2)], vec![Sub::new(10, 'C'), Sub::new(17, 'G')]), + NodeId(3) => Edit::new(vec![], vec![Del::new(5, 2), Del::new(16,2)], vec![Sub::new(10, 'C')]), + NodeId(4) => Edit::new(vec![], vec![Del::new(9, 3)], vec![Sub::new(1, 'C')]), + NodeId(5) => Edit::new(vec![Ins::new(5, "AA")], vec![Del::new(5, 2)], vec![Sub::new(1, 'G')]), }; - PangraphBlock::new(BlockId(0), consensus, aln) + PangraphBlock::new(BlockId(1), consensus, aln) + } + + fn block_1_reconsensus() -> PangraphBlock { + let consensus = "ATGACCGATCTATTCAGAGAA"; + // 0 1 2 + // 012345678901234567890 + // node 1) .....|.........|..... + // node 2) .....|..C......G..... + // node 3) .....|..C.....--..... + // node 4) .C...|.---........... + // node 5) .G...|............... + // L = 21, N = 5 + let aln = btreemap! { + NodeId(1) => Edit::new(vec![Ins::new(15, "TTTT")], vec![], vec![]), + NodeId(2) => Edit::new(vec![], vec![], vec![Sub::new(8, 'C'), Sub::new(15, 'G')]), + NodeId(3) => Edit::new(vec![], vec![Del::new(14,2)], vec![Sub::new(8, 'C')]), + NodeId(4) => Edit::new(vec![Ins::new(5, "TT")], vec![Del::new(7, 3)], vec![Sub::new(1, 'C')]), + NodeId(5) => Edit::new(vec![Ins::new(5, "AA")], vec![], vec![Sub::new(1, 'G')]), + }; + PangraphBlock::new(BlockId(1), consensus, aln) } fn block_2() -> PangraphBlock { @@ -201,11 +262,39 @@ mod tests { NodeId(4) => Edit::new(vec![Ins::new(3, "C"), Ins::new(23, "TT")], vec![Del::new(9, 3)], vec![Sub::new(1, 'C'), Sub::new(17, 'A')]), NodeId(5) => Edit::new(vec![Ins::new(0, "G"), Ins::new(3, "C"), Ins::new(13, "AA")], vec![Del::new(19, 2)], vec![Sub::new(17, 'A')]) }; - PangraphBlock::new(BlockId(0), consensus, aln) + PangraphBlock::new(BlockId(2), consensus, aln) } + // fn block_2_reconsensus() -> PangraphBlock { + // let consensus = "GATGACTTCGATCTATTCGGAGAA"; + // // 0 1 2 + // // 01234567890123456789012 + // // node 1) ....|.--......|...A..-.. + // // node 2) ......--...C..|......--.| + // // node 3) -....----..C............| + // // node 4) -.C.|.....---.....A.....| + // // node 5) ..G.|.........|...A.--.. + // // L = 23, N = 5 + // let aln = btreemap! { + // NodeId(1) => Edit::new(vec![Ins::new(4, "AA"), Ins::new(13, "AA")], vec![Del::new(5, 2), Del::new(20, 1)], vec![Sub::new(17, 'A')]), + // NodeId(2) => Edit::new(vec![Ins::new(13, "AA"), Ins::new(23, "TT")], vec![Del::new(5, 2), Del::new(20, 2)], vec![Sub::new(10, 'C')]), + // NodeId(3) => Edit::new(vec![Ins::new(23, "TT")], vec![Del::new(0,1), Del::new(4, 4)], vec![Sub::new(10, 'C')]), + // NodeId(4) => Edit::new(vec![Ins::new(4, "C"), Ins::new(23, "TT")], vec![Del::new(0,1), Del::new(9, 3)], vec![Sub::new(2, 'C'), Sub::new(17, 'A')]), + // NodeId(5) => Edit::new(vec![Ins::new(4, "C"), Ins::new(13, "AA")], vec![Del::new(19, 2)], vec![Sub::new(2, 'G'), Sub::new(17, 'A')]), + // }; + // PangraphBlock::new(BlockId(2), consensus, aln) + // } + fn block_3() -> PangraphBlock { let consensus = "GCCTCTTCCCGACCACGCGTTACAACATGGGACAGGCCTGCGCTTGAGGC"; + // 0 1 2 3 4 + // 01234567890123456789012345678901234567890123456789 + // node 1) .....A.............----........................... + // node 2) .....A..............---.....................|....| + // node 3) ..............G............G...................... + // node 4) .....A..............---..........................| + // node 5) .................................................| + // L = 50, N = 5 let aln = btreemap! { NodeId(1) => Edit::new(vec![], vec![Del::new(19, 4)], vec![Sub::new(5, 'A')]), NodeId(2) => Edit::new(vec![Ins::new(35, "AA"), Ins::new(50, "TT")], vec![Del::new(20, 3)], vec![Sub::new(5, 'A')]), @@ -218,6 +307,14 @@ mod tests { fn block_3_reconsensus() -> PangraphBlock { let consensus = "GCCTCATCCCGACCACGCGTAACATGGGACAGGCCTGCGCTTGAGGCTT"; + // 0 1 2 3 4 + // 0123456789012345678901234567890123456789012345678 + // node 1) ...................-...........................-- + // node 2) ................................|................ + // node 3) .....T........G.....|..G.......................-- + // node 4) ................................................. + // node 5) .....T..............|............................ + // L = 49, N = 5 let aln = btreemap! { NodeId(1) => Edit::new(vec![], vec![Del::new(19, 1), Del::new(47, 2)], vec![]), NodeId(2) => Edit::new(vec![Ins::new(32, "AA")], vec![], vec![]), @@ -228,70 +325,65 @@ mod tests { PangraphBlock::new(BlockId(3), consensus, aln) } - fn block_mutations_only() -> PangraphBlock { - let consensus = "ATGCGATCGATCGA"; - // 01234567890123 - // node 1) .C............ (mutation at pos 1: T->C) - // node 2) .C............ (mutation at pos 1: T->C) - // node 3) .C............ (mutation at pos 1: T->C) - // node 4) ..........G... (mutation at pos 10: C->G) - // node 5) ..........G... (mutation at pos 10: C->G) - // L = 14, N = 5 - // Position 1: T->C appears in 3/5 sequences (majority) - // Position 10: C->G appears in 2/5 sequences (not majority) - let aln = btreemap! { - NodeId(1) => Edit::new(vec![], vec![], vec![Sub::new(1, 'C')]), - NodeId(2) => Edit::new(vec![], vec![], vec![Sub::new(1, 'C')]), - NodeId(3) => Edit::new(vec![], vec![], vec![Sub::new(1, 'C')]), - NodeId(4) => Edit::new(vec![], vec![], vec![Sub::new(10, 'G')]), - NodeId(5) => Edit::new(vec![], vec![], vec![Sub::new(10, 'G')]), + // TODO: + // - test majority substitutions, insertions, deletions on the examples + // x test analyze_blocks_for_reconsensus + // - test only-substitutions reconsensus + // - test full reconsensus + + #[test] + fn test_analyze_block_reconsensus() { + let graph = Pangraph { + blocks: btreemap! ( + BlockId(0) => block_0(), + BlockId(1) => block_1(), + BlockId(2) => block_2(), + BlockId(3) => block_3(), + ), + paths: btreemap! {}, + nodes: btreemap! {}, }; - PangraphBlock::new(BlockId(10), consensus, aln) + + let block_ids = vec![BlockId(0), BlockId(1), BlockId(2), BlockId(3)]; + let results = analyze_blocks_for_reconsensus(&graph, &block_ids); + + let subs_blockids: Vec = results.mutations_only.iter().map(|(bid, _)| *bid).collect(); + let realign_blockids: Vec = results.need_realignment.iter().map(|(bid, _)| *bid).collect(); + + assert_eq!(subs_blockids, vec![BlockId(0)]); + assert_eq!(realign_blockids, vec![BlockId(1), BlockId(2), BlockId(3)]); } - fn block_mutations_only_after_reconsensus() -> PangraphBlock { - let consensus = "ACGCGATCGATCGA"; - // 01234567890123 - // node 1) .............. (no mutations after consensus update) - // node 2) .............. (no mutations after consensus update) - // node 3) .............. (no mutations after consensus update) - // node 4) .T........G... (gets T at pos 1, keeps G at pos 10) - // node 5) .T........G... (gets T at pos 1, keeps G at pos 10) - // L = 14, N = 5 - // After reconsensus: consensus[1] = 'C' (majority), original T becomes mutation in nodes 4,5 - let aln = btreemap! { - NodeId(1) => Edit::new(vec![], vec![], vec![]), - NodeId(2) => Edit::new(vec![], vec![], vec![]), - NodeId(3) => Edit::new(vec![], vec![], vec![]), - NodeId(4) => Edit::new(vec![], vec![], vec![Sub::new(1, 'T'), Sub::new(10, 'G')]), - NodeId(5) => Edit::new(vec![], vec![], vec![Sub::new(1, 'T'), Sub::new(10, 'G')]), - }; - PangraphBlock::new(BlockId(10), consensus, aln) + #[test] + fn test_find_majority_edits_block0() { + let edits = block_0().find_majority_edits(); + let expected_edits = Edit::new(vec![], vec![], vec![Sub::new(1, 'C')]); + assert_eq!(edits, expected_edits); } #[test] - fn test_reconsensus_mutations() { - let mut block = block_1(); - let expected_block = block_1_mut_reconsensus(); - let subs = block.find_majority_substitutions(); - apply_substitutions_to_block(&mut block, &subs).unwrap(); - assert_eq!(block, expected_block); + fn test_find_majority_edits_block1() { + let edits = block_1().find_majority_edits(); + let expected_edits = Edit::new(vec![], vec![Del::new(5, 2)], vec![Sub::new(1, 'T'), Sub::new(17, 'A')]); + assert_eq!(edits, expected_edits); } #[test] - fn test_majority_deletions() { - let dels: Vec = block_2() - .find_majority_deletions() - .iter() - .flat_map(|d| d.range()) - .collect(); - assert_eq!(dels, vec![5, 6, 20]); + fn test_find_majority_edits_block2() { + let edits = block_2().find_majority_edits(); + let expected_edits = Edit::new( + vec![Ins::new(0, "G"), Ins::new(13, "AA"), Ins::new(23, "TT")], + vec![Del::new(5, 2), Del::new(20, 1)], + vec![Sub::new(1, 'T'), Sub::new(17, 'A')], + ); + assert_eq!(edits, expected_edits); } #[test] - fn test_majority_insertions() { - let ins = block_2().find_majority_insertions(); - assert_eq!(ins, vec![Ins::new(0, "G"), Ins::new(13, "AA"), Ins::new(23, "TT")]); + fn test_find_majority_edits_block3() { + let edits = block_3().find_majority_edits(); + let expected_edits = Edit::new(vec![Ins::new(50, "TT")], vec![Del::new(20, 3)], vec![Sub::new(5, 'A')]); + assert_eq!(edits, expected_edits); } #[test] @@ -305,38 +397,78 @@ mod tests { } #[test] - fn test_reconsensus() { - let block = block_3(); - let expected_block = block_3_reconsensus(); + fn test_mutations_only_reconsensus_block0() { + let mut block = block_0(); + let expected_block = block_0_reconsensus(); + // Apply mutations let majority_edits = block.find_majority_edits(); + assert!(!majority_edits.has_indels()); // This block has no indels requiring re-alignment + apply_substitutions_to_block(&mut block, &majority_edits.subs).unwrap(); + // But consensus should be updated and mutations healed + assert_eq!(block, expected_block); + } + + #[test] + fn test_mutations_only_reconsensus_block1() { + let mut block = block_1(); + let expected_block = block_1_mut_reconsensus(); + + // Apply mutations + let majority_edits = block.find_majority_edits(); + apply_substitutions_to_block(&mut block, &majority_edits.subs).unwrap(); + + // But consensus should be updated and mutations healed + assert_eq!(block, expected_block); + } + + #[test] + fn test_realign_reconsensus_block1() { + let block = block_1(); + let expected_block = block_1_reconsensus(); + + // Apply majority edits + let majority_edits = block.find_majority_edits(); + assert!(majority_edits.has_indels()); // This block has indels requiring re-alignment let block = block .edit_consensus_and_realign(&majority_edits, &PangraphBuildArgs::default()) .unwrap(); - assert!(majority_edits.has_indels()); // This test expects realignment to have occurred - assert_eq!(block.consensus(), expected_block.consensus()); - assert_eq!(block.alignments(), expected_block.alignments()); + // Check that the re-alignment produced the expected result + assert_eq!(block, expected_block); } - #[test] - fn test_reconsensus_mutations_only_no_realignment() { - let mut block = block_mutations_only(); - let expected_block = block_mutations_only_after_reconsensus(); + // #[test] + // fn test_realign_reconsensus_block2() { + // let block = block_2(); + // let expected_block = block_2_reconsensus(); - let majority_edits = block.find_majority_edits(); + // // Apply majority edits + // let majority_edits = block.find_majority_edits(); + // assert!(majority_edits.has_indels()); // This block has indels requiring re-alignment + // let block = block + // .edit_consensus_and_realign(&majority_edits, &PangraphBuildArgs::default()) + // .unwrap(); - // Apply mutations - apply_substitutions_to_block(&mut block, &majority_edits.subs).unwrap(); + // // Check that the re-alignment produced the expected result + // assert_eq!(block, expected_block); + // } + + #[test] + fn test_realign_reconsensus_block3() { + let block = block_3(); + let expected_block = block_3_reconsensus(); - // Check that no indels require re-alignment - let has_indels = majority_edits.has_indels(); - assert!(!has_indels); // Should return false because no indels require re-alignment + // Apply majority edits + let majority_edits = block.find_majority_edits(); + assert!(majority_edits.has_indels()); // This block has indels requiring re-alignment + let block = block + .edit_consensus_and_realign(&majority_edits, &PangraphBuildArgs::default()) + .unwrap(); - // But consensus should be updated and mutations healed - assert_eq!(block.consensus(), expected_block.consensus()); - assert_eq!(block.alignments(), expected_block.alignments()); + // Check that the re-alignment produced the expected result + assert_eq!(block, expected_block); } fn block_for_graph_test() -> PangraphBlock { From 1087faaa86214d51b9b55999a249113ab5f5086f Mon Sep 17 00:00:00 2001 From: Marco Molari Date: Sun, 3 Aug 2025 19:40:53 +0200 Subject: [PATCH 26/29] test: reconsensus unit test update --- .../pangraph/src/reconsensus/reconsensus.rs | 88 +++++++++---------- 1 file changed, 41 insertions(+), 47 deletions(-) diff --git a/packages/pangraph/src/reconsensus/reconsensus.rs b/packages/pangraph/src/reconsensus/reconsensus.rs index 436b9e56..8a1ec4e5 100644 --- a/packages/pangraph/src/reconsensus/reconsensus.rs +++ b/packages/pangraph/src/reconsensus/reconsensus.rs @@ -265,26 +265,6 @@ mod tests { PangraphBlock::new(BlockId(2), consensus, aln) } - // fn block_2_reconsensus() -> PangraphBlock { - // let consensus = "GATGACTTCGATCTATTCGGAGAA"; - // // 0 1 2 - // // 01234567890123456789012 - // // node 1) ....|.--......|...A..-.. - // // node 2) ......--...C..|......--.| - // // node 3) -....----..C............| - // // node 4) -.C.|.....---.....A.....| - // // node 5) ..G.|.........|...A.--.. - // // L = 23, N = 5 - // let aln = btreemap! { - // NodeId(1) => Edit::new(vec![Ins::new(4, "AA"), Ins::new(13, "AA")], vec![Del::new(5, 2), Del::new(20, 1)], vec![Sub::new(17, 'A')]), - // NodeId(2) => Edit::new(vec![Ins::new(13, "AA"), Ins::new(23, "TT")], vec![Del::new(5, 2), Del::new(20, 2)], vec![Sub::new(10, 'C')]), - // NodeId(3) => Edit::new(vec![Ins::new(23, "TT")], vec![Del::new(0,1), Del::new(4, 4)], vec![Sub::new(10, 'C')]), - // NodeId(4) => Edit::new(vec![Ins::new(4, "C"), Ins::new(23, "TT")], vec![Del::new(0,1), Del::new(9, 3)], vec![Sub::new(2, 'C'), Sub::new(17, 'A')]), - // NodeId(5) => Edit::new(vec![Ins::new(4, "C"), Ins::new(13, "AA")], vec![Del::new(19, 2)], vec![Sub::new(2, 'G'), Sub::new(17, 'A')]), - // }; - // PangraphBlock::new(BlockId(2), consensus, aln) - // } - fn block_3() -> PangraphBlock { let consensus = "GCCTCTTCCCGACCACGCGTTACAACATGGGACAGGCCTGCGCTTGAGGC"; // 0 1 2 3 4 @@ -325,12 +305,6 @@ mod tests { PangraphBlock::new(BlockId(3), consensus, aln) } - // TODO: - // - test majority substitutions, insertions, deletions on the examples - // x test analyze_blocks_for_reconsensus - // - test only-substitutions reconsensus - // - test full reconsensus - #[test] fn test_analyze_block_reconsensus() { let graph = Pangraph { @@ -439,22 +413,6 @@ mod tests { assert_eq!(block, expected_block); } - // #[test] - // fn test_realign_reconsensus_block2() { - // let block = block_2(); - // let expected_block = block_2_reconsensus(); - - // // Apply majority edits - // let majority_edits = block.find_majority_edits(); - // assert!(majority_edits.has_indels()); // This block has indels requiring re-alignment - // let block = block - // .edit_consensus_and_realign(&majority_edits, &PangraphBuildArgs::default()) - // .unwrap(); - - // // Check that the re-alignment produced the expected result - // assert_eq!(block, expected_block); - // } - #[test] fn test_realign_reconsensus_block3() { let block = block_3(); @@ -471,7 +429,41 @@ mod tests { assert_eq!(block, expected_block); } - fn block_for_graph_test() -> PangraphBlock { + #[test] + fn reconsensus_test() { + let block = block_1(); + let block_id = block.id(); + let expected_block = block_1_reconsensus(); + + let nodes = btreemap! { + NodeId(1) => PangraphNode::new(Some(NodeId(1)), block.id(), PathId(1), Forward, (0, 23)), + NodeId(2) => PangraphNode::new(Some(NodeId(2)), block.id(), PathId(2), Forward, (0, 23)), + NodeId(3) => PangraphNode::new(Some(NodeId(3)), block.id(), PathId(3), Forward, (0, 23)), + NodeId(4) => PangraphNode::new(Some(NodeId(4)), block.id(), PathId(4), Forward, (0, 23)), + NodeId(5) => PangraphNode::new(Some(NodeId(5)), block.id(), PathId(5), Forward, (0, 23)), + }; + let paths = btreemap! { + PathId(1) => PangraphPath::new(Some(PathId(1)), [NodeId(1)], 23, false, None, None), + PathId(2) => PangraphPath::new(Some(PathId(2)), [NodeId(2)], 23, false, None, None), + PathId(3) => PangraphPath::new(Some(PathId(3)), [NodeId(3)], 23, false, None, None), + PathId(4) => PangraphPath::new(Some(PathId(4)), [NodeId(4)], 23, false, None, None), + PathId(5) => PangraphPath::new(Some(PathId(5)), [NodeId(5)], 23, false, None, None), + }; + let mut graph = Pangraph { + blocks: btreemap! { + block_id => block, + }, + nodes, + paths, + }; + + let result = reconsensus_graph(&mut graph, &[block_id], &PangraphBuildArgs::default()); + result.unwrap(); + + assert_eq!(graph.blocks[&block_id], expected_block); + } + + fn block_for_edge_case_test() -> PangraphBlock { let consensus = "GCCTCTTCCCGACCACGCGTTACAACATGGGACAGGCCTGCGCTTGAGGC"; // 0 1 2 3 4 // 01234567890123456789012345678901234567890123456789 @@ -485,7 +477,7 @@ mod tests { PangraphBlock::new(BlockId(20), consensus, aln) } - fn block_for_graph_test_expected() -> PangraphBlock { + fn block_for_edge_case_test_expected() -> PangraphBlock { // After reconsensus, the majority deletions (positions 35-49) should be applied // Original consensus: "GCCTCTTCCCGACCACGCGTTACAACATGGGACAGGCCTGCGCTTGAGGC" (50 chars) // Expected consensus: "GCCTCTTCCCGACCACGCGTTACAACATGGGACAG" (35 chars) @@ -508,10 +500,12 @@ mod tests { } #[test] - fn test_reconsensus_graph() { + fn test_edge_case_reconsensus_graph() { + // test the edge case in which a node needs to be detached and separated in a new block + // Create a block that will be modified by reconsensus - let initial_block = block_for_graph_test(); - let expected_block = block_for_graph_test_expected(); + let initial_block = block_for_edge_case_test(); + let expected_block = block_for_edge_case_test_expected(); let singleton_block_exp = signleton_block_expected(); // Create nodes for the block with lengths reflecting actual sequence lengths From 0b170e22dc0e55e09c18cf68d54c8ef7eb77df83 Mon Sep 17 00:00:00 2001 From: Marco Molari Date: Mon, 4 Aug 2025 10:59:47 +0200 Subject: [PATCH 27/29] refactor: update change_consensus_nucleotide_at_pos to accept Sub --- .../pangraph/src/pangraph/pangraph_block.rs | 38 +++++++++++-------- .../pangraph/src/reconsensus/reconsensus.rs | 2 +- 2 files changed, 23 insertions(+), 17 deletions(-) diff --git a/packages/pangraph/src/pangraph/pangraph_block.rs b/packages/pangraph/src/pangraph/pangraph_block.rs index c4040ca8..11a755d1 100644 --- a/packages/pangraph/src/pangraph/pangraph_block.rs +++ b/packages/pangraph/src/pangraph/pangraph_block.rs @@ -9,7 +9,6 @@ use crate::pangraph::pangraph::Pangraph; use crate::pangraph::pangraph_node::NodeId; use crate::pangraph::pangraph_path::PathId; use crate::representation::seq::Seq; -use crate::representation::seq_char::AsciiChar; use crate::utils::collections::has_duplicates; use crate::utils::interval::positions_to_intervals; use derive_more::{Display, From}; @@ -256,33 +255,36 @@ impl PangraphBlock { insertions } - /// Change a nucleotide in the consensus sequence at a specific position - /// and update the alignments accordingly, without changing the block sequences. - pub fn change_consensus_nucleotide_at_pos(&mut self, pos: usize, c: AsciiChar) -> Result<(), Report> { - if pos >= self.consensus_len() { + /// Change a nucleotide in the consensus sequence at a specific position by applying a substitution. + /// Updates the alignment substitutions accordingly, without changing the underlying sequences. + pub fn change_consensus_nucleotide_at_pos(&mut self, s: &Sub) -> Result<(), Report> { + if s.pos >= self.consensus_len() { return make_internal_error!( - "Position {pos} is out of bounds for consensus of length {}", + "Position {} is out of bounds for consensus of length {}", + s.pos, self.consensus_len() ); } // get the original character - let original_char = self.consensus[pos]; + let original_char = self.consensus[s.pos]; // check: the two must be different - if original_char == c { + if original_char == s.alt { return make_internal_error!( - "Cannot change consensus character at position {pos} to '{c}' because it is already '{original_char}'" + "Cannot change consensus character at position {} to '{}' because it is already '{original_char}'", + s.pos, + s.alt ); } // update the consensus - self.consensus[pos] = c; + self.consensus[s.pos] = s.alt; // Update the alignments self.alignments_mut().values_mut().try_for_each(|edit| { edit - .reconcile_substitution_with_consensus(&Sub::new(pos, c), original_char) - .wrap_err_with(|| format!("When reconciling substitution at position {pos} with character '{c}'")) + .reconcile_substitution_with_consensus(s, original_char) + .wrap_err_with(|| format!("When reconciling substitution {s:?} with original character '{original_char}'")) })?; Ok(()) @@ -686,7 +688,8 @@ mod tests { ); // Change position 1 from T to G - block.change_consensus_nucleotide_at_pos(1, 'G'.into()).unwrap(); + let sub = s(1, 'G'); + block.change_consensus_nucleotide_at_pos(&sub).unwrap(); assert_eq!(block, expected_block); } @@ -715,7 +718,8 @@ mod tests { ); // Change position 1 from T to G - block.change_consensus_nucleotide_at_pos(1, 'G'.into()).unwrap(); + let sub = s(1, 'G'); + block.change_consensus_nucleotide_at_pos(&sub).unwrap(); assert_eq!(block, expected_block); } @@ -730,7 +734,8 @@ mod tests { ); // Try to change position 4 (out of bounds for length 4) - let result = block.change_consensus_nucleotide_at_pos(4, 'A'.into()); + let sub = s(4, 'A'); + let result = block.change_consensus_nucleotide_at_pos(&sub); assert!(result.is_err()); } @@ -745,7 +750,8 @@ mod tests { ); // Try to change position 1 to the same character (T) - let result = block.change_consensus_nucleotide_at_pos(1, 'T'.into()); + let sub = s(1, 'T'); + let result = block.change_consensus_nucleotide_at_pos(&sub); assert!(result.is_err()); assert!(result.unwrap_err().to_string().contains("already")); } diff --git a/packages/pangraph/src/reconsensus/reconsensus.rs b/packages/pangraph/src/reconsensus/reconsensus.rs index 8a1ec4e5..8b410b45 100644 --- a/packages/pangraph/src/reconsensus/reconsensus.rs +++ b/packages/pangraph/src/reconsensus/reconsensus.rs @@ -124,7 +124,7 @@ fn analyze_blocks_for_reconsensus(graph: &Pangraph, block_ids: &[BlockId]) -> Bl fn apply_substitutions_to_block(block: &mut PangraphBlock, subs: &[Sub]) -> Result<(), Report> { subs.iter().try_for_each(|sub| { block - .change_consensus_nucleotide_at_pos(sub.pos, sub.alt) + .change_consensus_nucleotide_at_pos(sub) .wrap_err_with(|| format!("Failed to apply mutation at pos {}: {}", sub.pos, sub.alt)) }) } From b09c759bba8b6cb0d0f090fdf04e47d4c8d6c602 Mon Sep 17 00:00:00 2001 From: Marco Molari Date: Mon, 4 Aug 2025 14:42:47 +0200 Subject: [PATCH 28/29] docs: update changelog --- CHANGELOG.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ca2bad09..423ac684 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,7 @@ ## Unreleased -- Fix an edge-case bug: when updating the consensus of a block with a majority deletion, very rarely an unaligned node could be created. This is now fixed by extracting such nodes into a separate singleton block. +- Fix an edge-case bug: when updating the consensus of a block with a majority deletion, very rarely an unaligned node could be created. This is now fixed by extracting such nodes into a separate singleton block, see #158. +- Internal refactoring: improved logic of the `reconsensus` function, see #164. ## 1.2.0 From 243b2264d0e5ebd2736ea93fb1a4b4591f7563c5 Mon Sep 17 00:00:00 2001 From: Marco Molari Date: Tue, 5 Aug 2025 15:15:49 +0200 Subject: [PATCH 29/29] refactor: reconcile substitution method --- packages/pangraph/src/pangraph/edits.rs | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/packages/pangraph/src/pangraph/edits.rs b/packages/pangraph/src/pangraph/edits.rs index 80b5bd39..8ee48ec8 100644 --- a/packages/pangraph/src/pangraph/edits.rs +++ b/packages/pangraph/src/pangraph/edits.rs @@ -158,16 +158,16 @@ impl Edit { self.dels.iter().any(|d| d.contains(pos)) } - /// Adds reversion mutation when no substitutions exist at a position - pub fn add_reversion_if_not_deleted(&mut self, pos: usize, original: AsciiChar) { - if !self.is_position_deleted(pos) { - self.subs.push(Sub::new(pos, original)); + /// Adds a substitution if a position is not deleted. + fn add_substitution_if_not_deleted(&mut self, sub: Sub) { + if !self.is_position_deleted(sub.pos) { + self.subs.push(sub); self.subs.sort_by_key(|s| s.pos); } } - /// Removes substitution if it matches the new consensus character - pub fn remove_substitution_if_matching(&mut self, substitution: &Sub) { + /// Removes an existing substitution if it exactly matches the given substitution. + fn remove_substitution_if_matching(&mut self, substitution: &Sub) { if let Some(existing_sub) = self.subs.iter().find(|s| s.pos == substitution.pos) { if existing_sub.alt == substitution.alt { self @@ -201,9 +201,15 @@ impl Edit { let subs_count = self.subs.iter().filter(|s| s.pos == substitution.pos).count(); match subs_count { - // If genome has no mutation at this position: adds reversion to original character - 0 => self.add_reversion_if_not_deleted(substitution.pos, original), - // If genome has matching mutation: removes it if it matches the new consensus + // If genome has no mutation at this position: + // adds reversion to original character if not deleted + 0 => { + let reversion = Sub::new(substitution.pos, original); + self.add_substitution_if_not_deleted(reversion); + }, + // If genome already has a mutation: + // - removes it if it matches the new consensus exactly (same substitution) + // - keeps it if non-matching the new consensus 1 => { if self.is_position_deleted(substitution.pos) { return make_error!(