Skip to content

Commit 79dc8b2

Browse files
committed
feat(sort): add CB (cellular barcode) to template-coordinate sort key
Port fgbio PR #1142 to include the CB tag in the template-coordinate sort order for single-cell data. Without CB in the sort, templates from different cells at the same locus are interleaved, breaking grouper adjacency assumptions. Comparison order is now: tid1 → tid2 → pos1 → pos2 → neg1 → neg2 → CB → MI → name → library → is_upper Changes: - Add cb_hash (u64) to TemplateKey between secondary and tertiary, expanding packed key from 32 to 40 bytes and inline header from 40 to 48 bytes - Add cid (String) to TemplateCoordinateKey with length-first then lexicographic comparison matching fgbio - Add cell_tag parameter to compare_template_coordinate_raw() for raw byte comparison path - Add cell_tag field and builder method to RawExternalSorter - Add --cell-tag / -c CLI argument to sort command (default "CB") - Update group and dedup help text to mention --cell-tag in sort suggestions
1 parent bf4b24f commit 79dc8b2

7 files changed

Lines changed: 320 additions & 97 deletions

File tree

crates/fgumi-raw-bam/src/sort.rs

Lines changed: 59 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ use std::cmp::Ordering;
33
use crate::cigar::mate_unclipped_5prime;
44
use crate::cigar::unclipped_5prime_sort;
55
use crate::fields::{self, flags, mate_pos, mate_ref_id, pos, read_name, ref_id};
6-
use crate::tags::{find_mc_tag_in_record, find_mi_tag_in_record};
6+
use crate::tags::{find_mc_tag_in_record, find_mi_tag_in_record, find_string_tag_in_record};
77

88
#[must_use]
99
pub fn compare_coordinate_raw(a: &[u8], b: &[u8]) -> Ordering {
@@ -57,9 +57,15 @@ pub fn compare_queryname_raw(a: &[u8], b: &[u8]) -> Ordering {
5757
/// Compare for template-coordinate ordering using raw bytes.
5858
///
5959
/// This matches samtools' template-coordinate sorting which uses unclipped 5' positions.
60+
/// When `cell_tag` is `Some`, the CB (cellular barcode) tag is included in the comparison
61+
/// between neg2 and MI, matching fgbio's sort order.
6062
#[inline]
6163
#[must_use]
62-
pub fn compare_template_coordinate_raw(a: &[u8], b: &[u8]) -> Ordering {
64+
pub fn compare_template_coordinate_raw(
65+
a: &[u8],
66+
b: &[u8],
67+
cell_tag: Option<&[u8; 2]>,
68+
) -> Ordering {
6369
// Extract all needed fields from both records
6470
let a_tid = ref_id(a);
6571
let a_pos = pos(a);
@@ -133,6 +139,16 @@ pub fn compare_template_coordinate_raw(a: &[u8], b: &[u8]) -> Ordering {
133139
(false, true) => Ordering::Greater,
134140
_ => Ordering::Equal,
135141
})
142+
.then_with(|| {
143+
// CB (cellular barcode): length-first, then lexicographic (matching fgbio)
144+
if let Some(tag) = cell_tag {
145+
let a_cb = find_string_tag_in_record(a, tag).unwrap_or(b"");
146+
let b_cb = find_string_tag_in_record(b, tag).unwrap_or(b"");
147+
a_cb.len().cmp(&b_cb.len()).then_with(|| a_cb.cmp(b_cb))
148+
} else {
149+
Ordering::Equal
150+
}
151+
})
136152
.then_with(|| compare_mi_tags_raw(a, b))
137153
.then_with(|| compare_names_raw(a, b))
138154
.then_with(|| a_upper.cmp(&b_upper))
@@ -201,6 +217,7 @@ fn compare_mi_tags_raw(a: &[u8], b: &[u8]) -> Ordering {
201217
mod tests {
202218
use super::*;
203219
use crate::testutil::*;
220+
use rstest::rstest;
204221
use std::cmp::Ordering;
205222

206223
// ========================================================================
@@ -319,7 +336,7 @@ mod tests {
319336
-1,
320337
&[],
321338
);
322-
assert_eq!(compare_template_coordinate_raw(&rec, &rec), Ordering::Equal);
339+
assert_eq!(compare_template_coordinate_raw(&rec, &rec, None), Ordering::Equal);
323340
}
324341

325342
#[test]
@@ -329,8 +346,8 @@ mod tests {
329346
let cigar = &[(10 << 4) | 0]; // 10M
330347
let a = make_bam_bytes(0, 100, 0, b"rea", cigar, 10, -1, -1, &[]);
331348
let b = make_bam_bytes(2, 100, 0, b"rea", cigar, 10, -1, -1, &[]);
332-
assert_eq!(compare_template_coordinate_raw(&a, &b), Ordering::Less);
333-
assert_eq!(compare_template_coordinate_raw(&b, &a), Ordering::Greater);
349+
assert_eq!(compare_template_coordinate_raw(&a, &b, None), Ordering::Less);
350+
assert_eq!(compare_template_coordinate_raw(&b, &a, None), Ordering::Greater);
334351
}
335352

336353
#[test]
@@ -339,7 +356,7 @@ mod tests {
339356
let a = make_bam_bytes(-1, -1, flags::UNMAPPED, b"aaa", &[], 0, -1, -1, &[]);
340357
let b = make_bam_bytes(-1, -1, flags::UNMAPPED, b"zzz", &[], 0, -1, -1, &[]);
341358
// Both fully unmapped, compare by name
342-
assert_eq!(compare_template_coordinate_raw(&a, &b), Ordering::Less);
359+
assert_eq!(compare_template_coordinate_raw(&a, &b, None), Ordering::Less);
343360
}
344361

345362
#[test]
@@ -350,7 +367,7 @@ mod tests {
350367
let b = make_bam_bytes(0, 200, flags::PAIRED, b"rea", cigar, 10, 0, 100, &[]);
351368
// a is unmapped with mate at tid=0,pos=100; b is mapped at tid=0,pos=200
352369
// a sorts by mate position (100) which is < b's position (200)
353-
let cmp = compare_template_coordinate_raw(&a, &b);
370+
let cmp = compare_template_coordinate_raw(&a, &b, None);
354371
assert_ne!(cmp, Ordering::Equal);
355372
}
356373

@@ -381,7 +398,7 @@ mod tests {
381398
&[],
382399
);
383400
// Both mapped, mates unmapped. a at pos 100 < b at pos 200
384-
assert_eq!(compare_template_coordinate_raw(&a, &b), Ordering::Less);
401+
assert_eq!(compare_template_coordinate_raw(&a, &b, None), Ordering::Less);
385402
}
386403

387404
#[test]
@@ -394,7 +411,7 @@ mod tests {
394411
let b = make_bam_bytes(0, 100, flags::PAIRED, b"rea", cigar, 10, 0, 200, &[]);
395412
// After canonical ordering, both should produce same (tid1=0,tid2=0,pos1=~100,pos2=~200)
396413
// so they compare equal on positions, differentiated by is_upper
397-
let cmp = compare_template_coordinate_raw(&a, &b);
414+
let cmp = compare_template_coordinate_raw(&a, &b, None);
398415
// a is_upper=true, b is_upper=false -> a > b (true > false)
399416
assert_eq!(cmp, Ordering::Greater);
400417
}
@@ -428,7 +445,7 @@ mod tests {
428445
&[],
429446
);
430447
// Reverse strand sorts before forward in samtools convention (neg1=true < neg1=false)
431-
let cmp = compare_template_coordinate_raw(&a, &b);
448+
let cmp = compare_template_coordinate_raw(&a, &b, None);
432449
assert_ne!(cmp, Ordering::Equal);
433450
}
434451

@@ -467,7 +484,38 @@ mod tests {
467484
&aux_b,
468485
);
469486
// MI 10 < MI 20
470-
assert_eq!(compare_template_coordinate_raw(&a, &b), Ordering::Less);
487+
assert_eq!(compare_template_coordinate_raw(&a, &b, None), Ordering::Less);
488+
}
489+
490+
// ========================================================================
491+
// compare_template_coordinate_raw: CB (cell barcode) tag tests
492+
// ========================================================================
493+
494+
/// CB ordering: parameterized cases for compare_template_coordinate_raw.
495+
#[rstest]
496+
// Same-length CBs: AAAA < BBBB lexicographically
497+
#[case(b"CBZ\x41\x41\x41\x41\x00".as_slice(), b"CBZ\x42\x42\x42\x42\x00".as_slice(), Some(b"CB"), Ordering::Less, "AAAA < BBBB")]
498+
// CB ignored when cell_tag is None
499+
#[case(b"CBZ\x41\x41\x41\x41\x00".as_slice(), b"CBZ\x42\x42\x42\x42\x00".as_slice(), None, Ordering::Equal, "CB ignored without cell_tag")]
500+
// Missing CB (empty) < present CB by length
501+
#[case(b"".as_slice(), b"CBZ\x41\x41\x41\x41\x00".as_slice(), Some(b"CB"), Ordering::Less, "no CB < has CB")]
502+
// Shorter CB < longer CB by length
503+
#[case(b"CBZA\x00".as_slice(), b"CBZAA\x00".as_slice(), Some(b"CB"), Ordering::Less, "A < AA by length")]
504+
fn test_compare_template_coordinate_raw_cb_ordering(
505+
#[case] aux_a: &[u8],
506+
#[case] aux_b: &[u8],
507+
#[case] cell_tag: Option<&[u8; 2]>,
508+
#[case] expected: Ordering,
509+
#[case] msg: &str,
510+
) {
511+
let cigar = &[(10 << 4) | 0]; // 10M
512+
let a = make_bam_bytes(
513+
0, 100, flags::PAIRED | flags::MATE_UNMAPPED, b"rea", cigar, 10, -1, -1, aux_a,
514+
);
515+
let b = make_bam_bytes(
516+
0, 100, flags::PAIRED | flags::MATE_UNMAPPED, b"rea", cigar, 10, -1, -1, aux_b,
517+
);
518+
assert_eq!(compare_template_coordinate_raw(&a, &b, cell_tag), expected, "{msg}");
471519
}
472520

473521
// ========================================================================

src/commands/dedup.rs

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1039,7 +1039,7 @@ is selected as the representative; all others are marked as duplicates.
10391039
# Input Requirements
10401040
10411041
- Must be processed with `fgumi zipper` (adds `pa` tag for secondary/supplementary reads)
1042-
- Must be sorted with `fgumi sort --order template-coordinate`
1042+
- Must be sorted with `fgumi sort --order template-coordinate --cell-tag CB`
10431043
- UMI tags on reads (default: RX tag)
10441044
10451045
Note: Using `samtools sort` will NOT work correctly because it doesn't use the
@@ -1188,7 +1188,9 @@ impl Command for MarkDuplicates {
11881188
"Input BAM must be template-coordinate sorted.\n\n\
11891189
To prepare your BAM file, run:\n \
11901190
fgumi zipper -u unmapped.bam -r reference.fa -o merged.bam\n \
1191-
fgumi sort -i merged.bam -o sorted.bam --order template-coordinate"
1191+
fgumi sort -i merged.bam -o sorted.bam --order template-coordinate \
1192+
--cell-tag {}",
1193+
self.cell_tag
11921194
);
11931195
}
11941196
info!("Template-coordinate sorted");
@@ -1398,9 +1400,11 @@ impl Command for MarkDuplicates {
13981400
`fgumi zipper` during the merge of unmapped and mapped BAMs.\n\n\
13991401
To fix this, re-run your pipeline starting from `fgumi zipper`:\n \
14001402
fgumi zipper --unmapped unmapped.bam --mapped aligned.bam -o merged.bam\n \
1401-
fgumi sort -i merged.bam -o sorted.bam --order template-coordinate\n \
1403+
fgumi sort -i merged.bam -o sorted.bam --order template-coordinate \
1404+
--cell-tag {}\n \
14021405
fgumi dedup -i sorted.bam -o deduped.bam",
1403-
final_metrics.missing_pa_tag
1406+
final_metrics.missing_pa_tag,
1407+
self.cell_tag
14041408
);
14051409
}
14061410

src/commands/group.rs

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -675,8 +675,9 @@ Accepts reads in any order (including unsorted) and outputs reads sorted by:
675675
4. Read Name
676676
677677
It is recommended to sort the reads into template-coordinate order prior to running
678-
this tool to avoid re-sorting the input. Use `fgumi sort --order template-coordinate` for
679-
the pre-sorting. The output will always be written in template-coordinate order.
678+
this tool to avoid re-sorting the input. Use `fgumi sort --order template-coordinate`
679+
(with matching `--cell-tag`) for the pre-sorting. The output will always be written
680+
in template-coordinate order.
680681
681682
During grouping, reads and templates are filtered out as follows:
682683
@@ -924,7 +925,9 @@ impl Command for GroupReadsByUmi {
924925
bail!(
925926
"Input BAM must be template-coordinate sorted.\n\n\
926927
To sort your BAM file, run:\n \
927-
fgumi sort -i input.bam -o sorted.bam --order template-coordinate"
928+
fgumi sort -i input.bam -o sorted.bam --order template-coordinate \
929+
--cell-tag {}",
930+
self.cell_tag
928931
);
929932
}
930933
info!("template coordinate sorted");

src/commands/sort.rs

Lines changed: 36 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ use clap::{Parser, ValueEnum};
2525
use fgumi_lib::bam_io::create_bam_reader;
2626
use fgumi_lib::logging::OperationTimer;
2727
use fgumi_lib::sort::{RawExternalSorter, SortOrder};
28-
use fgumi_lib::validation::validate_file_exists;
28+
use fgumi_lib::validation::{string_to_tag, validate_file_exists};
2929
use log::info;
3030
use std::path::PathBuf;
3131

@@ -176,6 +176,15 @@ pub struct Sort {
176176
/// position tracking.
177177
#[arg(long = "write-index", default_value = "false")]
178178
pub write_index: bool,
179+
180+
/// Cell barcode tag for template-coordinate sort.
181+
///
182+
/// When sorting in template-coordinate order, this tag is included in the
183+
/// sort key so that templates from different cells at the same locus are
184+
/// not interleaved. Use the same value as `--cell-tag` in `fgumi group`
185+
/// and `fgumi dedup`. Only used for template-coordinate sort.
186+
#[arg(short = 'c', long = "cell-tag", default_value = "CB")]
187+
pub cell_tag: String,
179188
}
180189

181190
/// Parse memory size string (e.g., "512M", "1G", "2G").
@@ -264,6 +273,17 @@ impl Command for Sort {
264273
}
265274

266275
impl Sort {
276+
/// Parse the cell tag for template-coordinate sort/verify, returning `None`
277+
/// for other sort orders.
278+
fn parse_cell_tag(&self) -> Result<Option<[u8; 2]>> {
279+
if matches!(self.order, SortOrderArg::TemplateCoordinate) {
280+
let tag = string_to_tag(&self.cell_tag, "cell-tag")?;
281+
Ok(Some([tag.as_ref()[0], tag.as_ref()[1]]))
282+
} else {
283+
Ok(None)
284+
}
285+
}
286+
267287
/// Execute sort mode: read, sort, and write output.
268288
fn execute_sort(&self, command_line: &str) -> Result<()> {
269289
let output = self.output.as_ref().expect("output required for sort mode");
@@ -286,10 +306,15 @@ impl Sort {
286306
self.max_memory
287307
};
288308

309+
let cell_tag = self.parse_cell_tag()?;
310+
289311
info!("Starting Sort");
290312
info!("Input: {}", self.input.display());
291313
info!("Output: {}", output.display());
292314
info!("Sort order: {:?}", self.order);
315+
if let Some(ct) = cell_tag {
316+
info!("Cell tag: {}{}", ct[0] as char, ct[1] as char);
317+
}
293318
if self.memory_per_thread {
294319
info!(
295320
"Max memory: {} MB ({} MB/thread × {} threads)",
@@ -318,6 +343,10 @@ impl Sort {
318343
.write_index(self.write_index)
319344
.pg_info(crate::version::VERSION.to_string(), command_line.to_string());
320345

346+
if let Some(ct) = cell_tag {
347+
sorter = sorter.cell_tag(ct);
348+
}
349+
321350
if let Some(ref tmp) = self.tmp_dir {
322351
sorter = sorter.temp_dir(tmp.clone());
323352
}
@@ -349,11 +378,16 @@ impl Sort {
349378
use std::cmp::Ordering;
350379
use std::fs::File;
351380

381+
let cell_tag = self.parse_cell_tag()?;
382+
352383
let timer = OperationTimer::new("Verifying BAM sort order");
353384

354385
info!("Starting Sort Verification");
355386
info!("Input: {}", self.input.display());
356387
info!("Expected order: {:?}", self.order);
388+
if let Some(ct) = cell_tag {
389+
info!("Cell tag: {}{}", ct[0] as char, ct[1] as char);
390+
}
357391

358392
// Get header using noodles reader, then use raw reader for records
359393
let (_, header) = create_bam_reader(&self.input, 1)?;
@@ -383,7 +417,7 @@ impl Sort {
383417
let lib_lookup = LibraryLookup::from_header(&header);
384418
verify_sort_order(
385419
raw_reader,
386-
|bam| extract_template_key_inline(bam, &lib_lookup),
420+
|bam| extract_template_key_inline(bam, &lib_lookup, cell_tag.as_ref()),
387421
// Use core_cmp to ignore name_hash tie-breaker differences
388422
// This allows both fgumi and samtools sorted files to pass
389423
|key, prev| key.core_cmp(prev) == Ordering::Less,

0 commit comments

Comments
 (0)