diff --git a/docs/README.md b/docs/README.md index dab71202..0102b811 100644 --- a/docs/README.md +++ b/docs/README.md @@ -55,7 +55,7 @@ The `docs/docs/reference.md` file is generated using script `generate-reference- ```bash cargo build --bin=pangraph cd docs -./generate-reference-docs "../target/debug/pangraph" "docs/docs/reference.md" +./generate-reference-docs "../target/debug/pangraph" "docs/reference.md" ``` > ⚠️ Do not edit the generated file manually! All manual changes will be overwritten by automation. diff --git a/docs/docs/reference.md b/docs/docs/reference.md index 2218b275..4ea30211 100644 --- a/docs/docs/reference.md +++ b/docs/docs/reference.md @@ -115,6 +115,7 @@ Align genomes into a multiple sequence alignment graph Default value: `10` * `-K`, `--kmer-length ` — Sets kmer length for mmseqs2 aligner +* `-S`, `--strict-max-divergence` — Set strict maximal divergence. When toggled, proposed mergers with maximal divergence above 1/b are rejected * `-c`, `--circular` — Toggle if input genomes are circular * `-x`, `--max-self-map ` — Maximum number of alignment rounds to consider per pairwise graph merger diff --git a/packages/pangraph/src/align/alignment_args.rs b/packages/pangraph/src/align/alignment_args.rs index 490a84b8..f0ccd143 100644 --- a/packages/pangraph/src/align/alignment_args.rs +++ b/packages/pangraph/src/align/alignment_args.rs @@ -32,4 +32,8 @@ pub struct AlignmentArgs { #[clap(long, short = 'K')] #[clap(value_hint = ValueHint::Other)] pub kmer_length: Option, + + /// Set strict maximal divergence. When toggled, proposed mergers with maximal divergence above 1/b are rejected. + #[clap(long, short = 'S')] + pub strict_max_divergence: bool, } diff --git a/packages/pangraph/src/align/nextclade/align/params.rs b/packages/pangraph/src/align/nextclade/align/params.rs index 0035d0a3..398e2441 100644 --- a/packages/pangraph/src/align/nextclade/align/params.rs +++ b/packages/pangraph/src/align/nextclade/align/params.rs @@ -168,7 +168,7 @@ impl Default for NextalignParams { min_match_length: 40, // Experimentally determined, to keep off-target matches reasonably low allowed_mismatches: 8, // Ns count as mismatches window_size: 30, - max_alignment_attempts: 3, + max_alignment_attempts: 5, // The following args are deprecated and are kept for backwards compatibility (to emit errors if they are set) max_indel: None, diff --git a/packages/pangraph/src/pangraph/edits.rs b/packages/pangraph/src/pangraph/edits.rs index 1e17c2f2..ae017c43 100644 --- a/packages/pangraph/src/pangraph/edits.rs +++ b/packages/pangraph/src/pangraph/edits.rs @@ -36,6 +36,10 @@ impl Sub { alt: self.alt, } } + + pub fn to_n(&self) -> bool { + self.alt.is_ambiguous() + } } #[must_use] @@ -249,6 +253,23 @@ impl Edit { seq_len == 0 } + // Return the average divergence from the consensus sequence, + // given as number of mutations (excluding `N`) per non-deleted base. + pub fn divergence(&self, cons_len: usize, exclude_positions: &[usize]) -> Option { + let deleted_len = self.dels.iter().map(|d| d.len).sum::(); + let excluded_deleted = exclude_positions + .iter() + .filter(|&pos| self.dels.iter().any(|d| d.contains(*pos))) + .count(); + let n_bases = cons_len - deleted_len - exclude_positions.len() + excluded_deleted; + let n_mutations = self + .subs + .iter() + .filter(|s| !s.to_n() && !exclude_positions.contains(&s.pos)) + .count(); + (n_bases > 0).then_some(n_mutations as f64 / n_bases as f64) + } + #[cfg(any(test, debug_assertions))] pub fn sanity_check(&self, len: usize) -> Result<(), Report> { let block_interval = Interval::new(0, len); diff --git a/packages/pangraph/src/pangraph/graph_merging.rs b/packages/pangraph/src/pangraph/graph_merging.rs index 49115cb9..278a199e 100644 --- a/packages/pangraph/src/pangraph/graph_merging.rs +++ b/packages/pangraph/src/pangraph/graph_merging.rs @@ -117,7 +117,13 @@ pub fn self_merge(graph: Pangraph, args: &PangraphBuildArgs) -> Result<(Pangraph // - calculate energy and keep only matches with E < 0 // - sort them by energy // - discard incompatible matches (the ones that have overlapping regions) - let mut matches = filter_matches(&matches, &args.aln_args); + + let block_div = args + .aln_args + .strict_max_divergence + .then(|| graph.blocks_max_divergence()); + + let mut matches = filter_matches(&matches, &args.aln_args, &block_div); debug!("Matches after filtering: {}", matches.len()); trace!("{matches:#?}"); @@ -179,15 +185,20 @@ pub fn find_matches( .wrap_err_with(|| format!("When trying to align sequences using {}", &args.alignment_kernel)) } -pub fn filter_matches(alns: &[Alignment], args: &AlignmentArgs) -> Vec { +pub fn filter_matches( + alns: &[Alignment], + args: &AlignmentArgs, + block_divergence: &Option>>, +) -> Vec { // - evaluates the energy of the alignments // - keeps only matches with E < 0 + // - (optionally) keeps only matches with max divergence < threshold // - sorts them by energy // - discards incompatible matches (the ones that have overlapping regions) // TODO: energy is calculated for each alignment. // Consider calculating it earlier and making it a property to simplify filtering and sorting. - let alns = alns + let mut alns = alns .iter() .map(|aln| (aln, alignment_energy2(aln, args))) .filter(|(_, energy)| energy < &0.0) @@ -195,6 +206,30 @@ pub fn filter_matches(alns: &[Alignment], args: &AlignmentArgs) -> Vec 0, then + // filter alignment, only keep those with max divergence below the threshold + if args.strict_max_divergence && (args.beta > 0.0) { + let bd = block_divergence.as_ref().unwrap(); + alns = alns + .iter() + .filter(|aln| { + let div_q = bd.get(&aln.qry.name).unwrap(); + let div_r = bd.get(&aln.reff.name).unwrap(); + let aln_div = aln.divergence.unwrap(); + + // if any of these options is not set, remove the alignment + match (div_q, div_r) { + (Some(div_q), Some(div_r)) => { + let total_div = div_q + div_r + aln_div; + total_div < 1. / args.beta + }, + _ => false, + } + }) + .copied() + .collect_vec(); + } + // Iteratively accept alignments if they do not overlap with previously accepted ones let mut accepted_alns = vec![]; let mut accepted_intervals = btreemap![]; @@ -364,7 +399,7 @@ mod tests { let alns = [aln_0.clone(), aln_1.clone(), aln_2, aln_3]; - assert_eq!(filter_matches(&alns, &args), vec![aln_1, aln_0]); + assert_eq!(filter_matches(&alns, &args, &None), vec![aln_1, aln_0]); Ok(()) } diff --git a/packages/pangraph/src/pangraph/pangraph.rs b/packages/pangraph/src/pangraph/pangraph.rs index b86b32f5..ffdab27a 100644 --- a/packages/pangraph/src/pangraph/pangraph.rs +++ b/packages/pangraph/src/pangraph/pangraph.rs @@ -130,6 +130,15 @@ impl Pangraph { } } + // Returns a map of block ids to the maximum divergence in the block alignment. + pub fn blocks_max_divergence(&self) -> BTreeMap> { + self + .blocks + .iter() + .map(|(bid, block)| (*bid, block.max_divergence())) + .collect() + } + #[cfg(any(test, debug_assertions))] pub fn sanity_check(&self) -> Result<(), Report> { for (node_id, node) in &self.nodes { diff --git a/packages/pangraph/src/pangraph/pangraph_block.rs b/packages/pangraph/src/pangraph/pangraph_block.rs index 81071d6a..b5196d86 100644 --- a/packages/pangraph/src/pangraph/pangraph_block.rs +++ b/packages/pangraph/src/pangraph/pangraph_block.rs @@ -12,6 +12,7 @@ use derive_more::{Display, From}; use eyre::{Report, WrapErr}; use getset::{CopyGetters, Getters}; use maplit::btreemap; +use ordered_float::OrderedFloat; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; use serde_json::json; @@ -195,6 +196,35 @@ impl PangraphBlock { }) }) } + + pub fn max_divergence(&self) -> Option { + // length of consensus sequence minus ambiguous nucleotides + let consensus_len = self.consensus_len(); + let ambiguous_positions = self.consensus.ambiguous_positions(); + + // if any of the divergences cannot be calculated, return None + let divs = self + .alignments + .values() + .map(|edit| edit.divergence(consensus_len, &ambiguous_positions)) + .collect::>(); + + // if any of the divergences cannot be calculated, return None + if divs.iter().any(|div| div.is_none()) { + None + } else { + Some( + divs + .iter() + .map(|div| OrderedFloat(div.unwrap())) + .max() + .unwrap() + .into_inner(), + ) + } + + // return the maximum divergence + } } #[derive(Copy, Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)] diff --git a/packages/pangraph/src/representation/seq.rs b/packages/pangraph/src/representation/seq.rs index 30c40b21..8a407376 100644 --- a/packages/pangraph/src/representation/seq.rs +++ b/packages/pangraph/src/representation/seq.rs @@ -184,6 +184,19 @@ impl Seq { pub fn rotate_right(&mut self, mid: usize) { self.data.rotate_right(mid); } + + pub fn n_ambiguous_bases(&self) -> usize { + self.data.iter().filter(|&c| c.is_ambiguous()).count() + } + + pub fn ambiguous_positions(&self) -> Vec { + self + .data + .iter() + .enumerate() + .filter_map(|(i, c)| c.is_ambiguous().then_some(i)) + .collect() + } } impl PartialEq for Seq { diff --git a/packages/pangraph/src/representation/seq_char.rs b/packages/pangraph/src/representation/seq_char.rs index e43be676..2b303f48 100644 --- a/packages/pangraph/src/representation/seq_char.rs +++ b/packages/pangraph/src/representation/seq_char.rs @@ -19,6 +19,34 @@ impl AsciiChar { debug_assert!(s.len() == 1); Self(s.as_bytes()[0]) } + + pub fn is_ambiguous(&self) -> bool { + matches!( + self.0, + b'N' + | b'n' + | b'R' + | b'r' + | b'Y' + | b'y' + | b'S' + | b's' + | b'W' + | b'w' + | b'K' + | b'k' + | b'M' + | b'm' + | b'B' + | b'b' + | b'D' + | b'd' + | b'H' + | b'h' + | b'V' + | b'v' + ) + } } impl core::fmt::Display for AsciiChar { diff --git a/packages/pangraph/src/tree/neighbor_joining.rs b/packages/pangraph/src/tree/neighbor_joining.rs index b274e473..8563d54e 100644 --- a/packages/pangraph/src/tree/neighbor_joining.rs +++ b/packages/pangraph/src/tree/neighbor_joining.rs @@ -3,7 +3,7 @@ use crate::distance::mash::mash_distance::mash_distance; use crate::distance::mash::minimizer::MinimizersParams; use crate::pangraph::pangraph::Pangraph; -use crate::tree::balance::balance; +// use crate::tree::balance::balance; use crate::tree::clade::Clade; use crate::utils::lock::Lock; use crate::utils::ndarray::broadcast; @@ -28,7 +28,7 @@ pub fn build_tree_using_neighbor_joining(graphs: Vec) -> Result