Skip to content

Commit 7725f3a

Browse files
committed
Fix clipping issues when reads overlap
1 parent 9634745 commit 7725f3a

3 files changed

Lines changed: 245 additions & 4 deletions

File tree

src/main/scala/com/fulcrumgenomics/bam/ClipBam.scala

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,8 @@ class ClipBam
126126
case (Some(r1), Some(r2)) =>
127127
clipPair(r1=r1, r2=r2, r1Metric=metricsMap.get(ReadType.ReadOne), r2Metric=metricsMap.get(ReadType.ReadTwo))
128128
SamPairUtil.setMateInfo(r1.asSam, r2.asSam, true)
129+
// SamPairUtil.setMateInfo does not clear the proper-pair (0x2) flag when one read becomes unmapped
130+
if (r1.unmapped || r2.unmapped) { r1.properlyPaired = false; r2.properlyPaired = false }
129131
template.r1Supplementals.foreach(s => SamPairUtil.setMateInformationOnSupplementalAlignment(s.asSam, r2.asSam, true))
130132
template.r2Supplementals.foreach(s => SamPairUtil.setMateInformationOnSupplementalAlignment(s.asSam, r1.asSam, true))
131133
case (Some(frag), None) =>

src/main/scala/com/fulcrumgenomics/bam/SamRecordClipper.scala

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -314,9 +314,14 @@ class SamRecordClipper(val mode: ClippingMode, val autoClipAttributes: Boolean)
314314
else retval
315315
}
316316
val readEnd = rec.readPosAtRefPos(pos=midPoint, returnLastBaseIfDeleted=true)
317-
val mateStart = { // NB: need to be careful if the midpoint falls in a deletion
318-
val retval = mate.readPosAtRefPos(pos=midPoint + 1, returnLastBaseIfDeleted=false)
319-
if (retval != 0) retval else mate.readPosAtRefPos(pos=midPoint + 1, returnLastBaseIfDeleted=true) + 1
317+
val mateStart = { // NB: need to be careful if the midpoint falls in a deletion or past the end of the mate
318+
if (midPoint >= mate.end) {
319+
// The midpoint is at or past the mate's last aligned base: clip all of the mate.
320+
mate.length + 1
321+
} else {
322+
val retval = mate.readPosAtRefPos(pos=midPoint + 1, returnLastBaseIfDeleted=false)
323+
if (retval != 0) retval else mate.readPosAtRefPos(pos=midPoint + 1, returnLastBaseIfDeleted=true) + 1
324+
}
320325
}
321326
val numOverlappingBasesRead = this.clip3PrimeEndOfRead(rec, rec.cigar.trailingHardClippedBases + rec.length - readEnd)
322327
val numOverlappingBasesMate = this.clip3PrimeEndOfRead(mate, mate.cigar.leadingHardClippedBases + mateStart - 1)
@@ -354,7 +359,7 @@ class SamRecordClipper(val mode: ClippingMode, val autoClipAttributes: Boolean)
354359
* @return the additional number of bases clipped (3' end in sequencing order) for the read and mate respectively
355360
*/
356361
def clipExtendingPastMateEnds(rec: SamRecord, mate: SamRecord): (Int, Int) = {
357-
if (rec.isFrPair) {
362+
if (rec.isFrPair && rec.mapped && mate.mapped) {
358363
val basesClipped1 = clipExtendingPastMateEnd(
359364
rec = rec,
360365
mateUnSoftClippedStart = mate.unSoftClippedStart,

src/test/scala/com/fulcrumgenomics/bam/ClipBamTest.scala

Lines changed: 234 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -618,6 +618,240 @@ class ClipBamTest extends UnitSpec with ErrorLogLevel with OptionValues {
618618
r2.end shouldBe 90 + 200 - 1 // 289: the original end of R2
619619
}
620620

621+
/** Helper that asserts two reads have the same start, end, and CIGAR. */
622+
private def assertSameClipping(got: SamRecord, expected: SamRecord, label: String): Unit = {
623+
withClue(s"$label start:") { got.start shouldBe expected.start }
624+
withClue(s"$label end:") { got.end shouldBe expected.end }
625+
withClue(s"$label cigar:") { got.cigar.toString shouldBe expected.cigar.toString }
626+
()
627+
}
628+
629+
// Tests for the ordering issue raised in https://github.com/fulcrumgenomics/fgbio/issues/1141#issuecomment-3998369518
630+
// The order of --clip-overlapping-reads and --clip-bases-past-mate should not affect the final result.
631+
// Each test runs the two operations in both orderings and asserts they produce identical results.
632+
633+
private val samClipper = new SamRecordClipper(mode = ClippingMode.Hard, autoClipAttributes = false)
634+
635+
it should "produce the same result for clip-overlapping-reads then clip-bases-past-mate as the reverse order for symmetrically dovetailed reads" in {
636+
// R1 and R2 dovetail symmetrically: each extends 10 bp past the other's 5' start
637+
val Seq(r1a, r2a) = new SamBuilder(readLength = 100).addPair(start1 = 100, start2 = 90)
638+
val Seq(r1b, r2b) = new SamBuilder(readLength = 100).addPair(start1 = 100, start2 = 90)
639+
640+
// Order A (current production order): overlapping → past-mate
641+
samClipper.clipOverlappingReads(r1a, r2a)
642+
samClipper.clipExtendingPastMateEnds(r1a, r2a)
643+
644+
// Order B (reversed): past-mate → overlapping
645+
samClipper.clipExtendingPastMateEnds(r1b, r2b)
646+
samClipper.clipOverlappingReads(r1b, r2b)
647+
648+
assertSameClipping(r1a, r1b, "R1")
649+
assertSameClipping(r2a, r2b, "R2")
650+
}
651+
652+
it should "produce the same result for clip-overlapping-reads then clip-bases-past-mate as the reverse order for heavily dovetailed reads" in {
653+
// R1 and R2 heavily dovetail: R2 starts 50 bp before R1's 5' start
654+
val Seq(r1a, r2a) = new SamBuilder(readLength = 100).addPair(start1 = 100, start2 = 50)
655+
val Seq(r1b, r2b) = new SamBuilder(readLength = 100).addPair(start1 = 100, start2 = 50)
656+
657+
// Order A: overlapping → past-mate
658+
samClipper.clipOverlappingReads(r1a, r2a)
659+
samClipper.clipExtendingPastMateEnds(r1a, r2a)
660+
661+
// Order B: past-mate → overlapping
662+
samClipper.clipExtendingPastMateEnds(r1b, r2b)
663+
samClipper.clipOverlappingReads(r1b, r2b)
664+
665+
assertSameClipping(r1a, r1b, "R1")
666+
assertSameClipping(r2a, r2b, "R2")
667+
}
668+
669+
it should "produce the same result for clip-overlapping-reads then clip-bases-past-mate as the reverse order when R2 is much longer and extends past both ends of R1" in {
670+
// R2 is much longer than R1 and dovetails, extending past both R1.start and R1.end.
671+
// This is the scenario eboyden described where the shorter read could be clipped to zero length.
672+
// R1 is 20 bases long; R2 is 100 bases long.
673+
val bases20 = "A" * 20
674+
val quals20 = "F" * 20
675+
val bases100 = "A" * 100
676+
val quals100 = "F" * 100
677+
val Seq(r1a, r2a) = new SamBuilder(readLength = 100).addPair(
678+
start1 = 100, start2 = 80, cigar1 = "20M", cigar2 = "100M",
679+
bases1 = bases20, quals1 = quals20, bases2 = bases100, quals2 = quals100,
680+
)
681+
val Seq(r1b, r2b) = new SamBuilder(readLength = 100).addPair(
682+
start1 = 100, start2 = 80, cigar1 = "20M", cigar2 = "100M",
683+
bases1 = bases20, quals1 = quals20, bases2 = bases100, quals2 = quals100,
684+
)
685+
686+
// Order A: overlapping → past-mate
687+
samClipper.clipOverlappingReads(r1a, r2a)
688+
samClipper.clipExtendingPastMateEnds(r1a, r2a)
689+
690+
// Order B: past-mate → overlapping
691+
samClipper.clipExtendingPastMateEnds(r1b, r2b)
692+
samClipper.clipOverlappingReads(r1b, r2b)
693+
694+
assertSameClipping(r1a, r1b, "R1")
695+
assertSameClipping(r2a, r2b, "R2")
696+
}
697+
698+
it should "produce the same result for clip-overlapping-reads then clip-bases-past-mate as the reverse order when R1 is much longer and contains a short R2" in {
699+
// R1 is much longer; R2 is short and starts late within R1's range.
700+
// The concern is that clipOverlappingReads clips the midpoint to well before R2.start,
701+
// dramatically shortening R1 but leaving R2 nearly intact.
702+
val bases100 = "A" * 100
703+
val quals100 = "F" * 100
704+
val bases20 = "A" * 20
705+
val quals20 = "F" * 20
706+
val Seq(r1a, r2a) = new SamBuilder(readLength = 100).addPair(
707+
start1 = 100, start2 = 180, cigar1 = "100M", cigar2 = "20M",
708+
bases1 = bases100, quals1 = quals100, bases2 = bases20, quals2 = quals20,
709+
)
710+
val Seq(r1b, r2b) = new SamBuilder(readLength = 100).addPair(
711+
start1 = 100, start2 = 180, cigar1 = "100M", cigar2 = "20M",
712+
bases1 = bases100, quals1 = quals100, bases2 = bases20, quals2 = quals20,
713+
)
714+
715+
// Order A: overlapping → past-mate
716+
samClipper.clipOverlappingReads(r1a, r2a)
717+
samClipper.clipExtendingPastMateEnds(r1a, r2a)
718+
719+
// Order B: past-mate → overlapping
720+
samClipper.clipExtendingPastMateEnds(r1b, r2b)
721+
samClipper.clipOverlappingReads(r1b, r2b)
722+
723+
assertSameClipping(r1a, r1b, "R1")
724+
assertSameClipping(r2a, r2b, "R2")
725+
}
726+
727+
it should "produce the same result for clip-overlapping-reads then clip-bases-past-mate as the reverse order with 5-prime hard-clipped reads (issue 1141 scenario)" in {
728+
// Reproduces the asymmetric hard-clipped scenario from issue #1141:
729+
// - R1 has 25H at its 5' (left) end
730+
// - R2 has 25H at its 5' (right/trailing-CIGAR) end
731+
// - Both reads dovetail: R2 starts before R1's reference start
732+
// In this scenario the mate's hard-clipped 5' region must be ignored when determining
733+
// clipping boundaries (fixed in PR #1065). The order of the two operations should not matter.
734+
val readLen = 89 // bases after removing 25 hard-clipped
735+
val Seq(r1a, r2a) = new SamBuilder(readLength = readLen).addPair(
736+
start1 = 100,
737+
start2 = 75,
738+
cigar1 = s"25H${readLen}M",
739+
cigar2 = s"${readLen}M25H",
740+
)
741+
val Seq(r1b, r2b) = new SamBuilder(readLength = readLen).addPair(
742+
start1 = 100,
743+
start2 = 75,
744+
cigar1 = s"25H${readLen}M",
745+
cigar2 = s"${readLen}M25H",
746+
)
747+
748+
// Order A: overlapping → past-mate
749+
samClipper.clipOverlappingReads(r1a, r2a)
750+
samClipper.clipExtendingPastMateEnds(r1a, r2a)
751+
752+
// Order B: past-mate → overlapping
753+
samClipper.clipExtendingPastMateEnds(r1b, r2b)
754+
samClipper.clipOverlappingReads(r1b, r2b)
755+
756+
assertSameClipping(r1a, r1b, "R1")
757+
assertSameClipping(r2a, r2b, "R2")
758+
}
759+
760+
it should "produce the same result for clip-overlapping-reads then clip-bases-past-mate as the reverse order for soft-clipped dovetailed reads" in {
761+
// R2 has 10 soft-clipped bases at its 3' (left reference) end extending the footprint to
762+
// the left. Both reads dovetail: R2's soft-clipped bases reach R1.start.
763+
// cigar2 = "10S90M" has lengthOnQuery=100, matching readLength=100.
764+
val Seq(r1a, r2a) = new SamBuilder(readLength = 100).addPair(
765+
start1 = 100,
766+
start2 = 110, // R2 aligned start is 110; the leading 10S extend to ref position 100
767+
cigar1 = "100M",
768+
cigar2 = "10S90M",
769+
)
770+
val Seq(r1b, r2b) = new SamBuilder(readLength = 100).addPair(
771+
start1 = 100,
772+
start2 = 110,
773+
cigar1 = "100M",
774+
cigar2 = "10S90M",
775+
)
776+
777+
// Order A: overlapping → past-mate
778+
samClipper.clipOverlappingReads(r1a, r2a)
779+
samClipper.clipExtendingPastMateEnds(r1a, r2a)
780+
781+
// Order B: past-mate → overlapping
782+
samClipper.clipExtendingPastMateEnds(r1b, r2b)
783+
samClipper.clipOverlappingReads(r1b, r2b)
784+
785+
assertSameClipping(r1a, r1b, "R1")
786+
assertSameClipping(r2a, r2b, "R2")
787+
}
788+
789+
it should "produce a 1bp R1 and an unmapped R2 when both 5-prime ends are at the same reference position and R2 has leading soft-clips" in {
790+
// R1 is 1bp at ref 100. R2 has 5 soft-clipped bases followed by 1 aligned base, also starting at ref 100.
791+
// The midpoint falls at mate.end (ref 100), so clipOverlappingReads clips all of R2, unmapping it.
792+
// clipExtendingPastMateEnds is then a no-op because the mate is already unmapped.
793+
val builder = new SamBuilder(readLength = 1)
794+
val Seq(r1, r2) = builder.addPair(
795+
start1 = 100,
796+
start2 = 100,
797+
cigar1 = "1M",
798+
cigar2 = "5S1M",
799+
bases1 = "A",
800+
quals1 = "F",
801+
bases2 = "AAAAAA",
802+
quals2 = "FFFFFF",
803+
)
804+
805+
samClipper.clipOverlappingReads(r1, r2)
806+
samClipper.clipExtendingPastMateEnds(r1, r2)
807+
808+
r1.start shouldBe 100
809+
r1.end shouldBe 100
810+
r2.unmapped shouldBe true
811+
}
812+
813+
it should "produce a 1bp R1 and an unmapped R2 when both reads are 1bp and fully co-located" in {
814+
// When two 1bp reads are at the same reference position, the midpoint equals mate.end,
815+
// so clipOverlappingReads clips all of R2, resulting in R2 being unmapped.
816+
val Seq(r1a, r2a) = new SamBuilder(readLength = 1).addPair(start1 = 100, start2 = 100)
817+
val Seq(r1b, r2b) = new SamBuilder(readLength = 1).addPair(start1 = 100, start2 = 100)
818+
819+
// Order A: overlapping → past-mate
820+
samClipper.clipOverlappingReads(r1a, r2a)
821+
samClipper.clipExtendingPastMateEnds(r1a, r2a)
822+
823+
// Order B: past-mate → overlapping
824+
samClipper.clipExtendingPastMateEnds(r1b, r2b)
825+
samClipper.clipOverlappingReads(r1b, r2b)
826+
827+
r1a.start shouldBe 100
828+
r1a.end shouldBe 100
829+
r2a.unmapped shouldBe true
830+
831+
r1b.start shouldBe 100
832+
r1b.end shouldBe 100
833+
r2b.unmapped shouldBe true
834+
}
835+
836+
it should "clear the proper-pair flag on the surviving read when its mate is clipped to unmapped" in {
837+
val builder = new SamBuilder(readLength = 1, sort = Some(Queryname))
838+
builder.addPair(start1 = 100, start2 = 100)
839+
840+
val out = makeTempFile("out.", ".bam")
841+
new ClipBam(input = builder.toTempFile(), output = out, ref = ref, clipOverlappingReads = true).execute()
842+
843+
val clipped = SamSource(out).toSeq
844+
clipped should have length 2
845+
846+
val mapped = clipped.find(_.mapped).value
847+
val unmapped = clipped.find(_.unmapped).value
848+
849+
unmapped.unmapped shouldBe true
850+
mapped.mateUnmapped shouldBe true
851+
mapped.properlyPaired shouldBe false
852+
unmapped.properlyPaired shouldBe false
853+
}
854+
621855
it should "unmap reads when the hard clipping length requested is greater than the length of the reads" in {
622856
val builder = new SamBuilder(readLength = 100, sort = Some(Queryname))
623857
val clipper = new ClipBam(

0 commit comments

Comments
 (0)