@@ -618,6 +618,240 @@ class ClipBamTest extends UnitSpec with ErrorLogLevel with OptionValues {
618618 r2.end shouldBe 90 + 200 - 1 // 289: the original end of R2
619619 }
620620
621+ /** Helper that asserts two reads have the same start, end, and CIGAR. */
622+ private def assertSameClipping (got : SamRecord , expected : SamRecord , label : String ): Unit = {
623+ withClue(s " $label start: " ) { got.start shouldBe expected.start }
624+ withClue(s " $label end: " ) { got.end shouldBe expected.end }
625+ withClue(s " $label cigar: " ) { got.cigar.toString shouldBe expected.cigar.toString }
626+ ()
627+ }
628+
629+ // Tests for the ordering issue raised in https://github.com/fulcrumgenomics/fgbio/issues/1141#issuecomment-3998369518
630+ // The order of --clip-overlapping-reads and --clip-bases-past-mate should not affect the final result.
631+ // Each test runs the two operations in both orderings and asserts they produce identical results.
632+
633+ private val samClipper = new SamRecordClipper (mode = ClippingMode .Hard , autoClipAttributes = false )
634+
635+ it should " produce the same result for clip-overlapping-reads then clip-bases-past-mate as the reverse order for symmetrically dovetailed reads" in {
636+ // R1 and R2 dovetail symmetrically: each extends 10 bp past the other's 5' start
637+ val Seq (r1a, r2a) = new SamBuilder (readLength = 100 ).addPair(start1 = 100 , start2 = 90 )
638+ val Seq (r1b, r2b) = new SamBuilder (readLength = 100 ).addPair(start1 = 100 , start2 = 90 )
639+
640+ // Order A (current production order): overlapping → past-mate
641+ samClipper.clipOverlappingReads(r1a, r2a)
642+ samClipper.clipExtendingPastMateEnds(r1a, r2a)
643+
644+ // Order B (reversed): past-mate → overlapping
645+ samClipper.clipExtendingPastMateEnds(r1b, r2b)
646+ samClipper.clipOverlappingReads(r1b, r2b)
647+
648+ assertSameClipping(r1a, r1b, " R1" )
649+ assertSameClipping(r2a, r2b, " R2" )
650+ }
651+
652+ it should " produce the same result for clip-overlapping-reads then clip-bases-past-mate as the reverse order for heavily dovetailed reads" in {
653+ // R1 and R2 heavily dovetail: R2 starts 50 bp before R1's 5' start
654+ val Seq (r1a, r2a) = new SamBuilder (readLength = 100 ).addPair(start1 = 100 , start2 = 50 )
655+ val Seq (r1b, r2b) = new SamBuilder (readLength = 100 ).addPair(start1 = 100 , start2 = 50 )
656+
657+ // Order A: overlapping → past-mate
658+ samClipper.clipOverlappingReads(r1a, r2a)
659+ samClipper.clipExtendingPastMateEnds(r1a, r2a)
660+
661+ // Order B: past-mate → overlapping
662+ samClipper.clipExtendingPastMateEnds(r1b, r2b)
663+ samClipper.clipOverlappingReads(r1b, r2b)
664+
665+ assertSameClipping(r1a, r1b, " R1" )
666+ assertSameClipping(r2a, r2b, " R2" )
667+ }
668+
669+ it should " produce the same result for clip-overlapping-reads then clip-bases-past-mate as the reverse order when R2 is much longer and extends past both ends of R1" in {
670+ // R2 is much longer than R1 and dovetails, extending past both R1.start and R1.end.
671+ // This is the scenario eboyden described where the shorter read could be clipped to zero length.
672+ // R1 is 20 bases long; R2 is 100 bases long.
673+ val bases20 = " A" * 20
674+ val quals20 = " F" * 20
675+ val bases100 = " A" * 100
676+ val quals100 = " F" * 100
677+ val Seq (r1a, r2a) = new SamBuilder (readLength = 100 ).addPair(
678+ start1 = 100 , start2 = 80 , cigar1 = " 20M" , cigar2 = " 100M" ,
679+ bases1 = bases20, quals1 = quals20, bases2 = bases100, quals2 = quals100,
680+ )
681+ val Seq (r1b, r2b) = new SamBuilder (readLength = 100 ).addPair(
682+ start1 = 100 , start2 = 80 , cigar1 = " 20M" , cigar2 = " 100M" ,
683+ bases1 = bases20, quals1 = quals20, bases2 = bases100, quals2 = quals100,
684+ )
685+
686+ // Order A: overlapping → past-mate
687+ samClipper.clipOverlappingReads(r1a, r2a)
688+ samClipper.clipExtendingPastMateEnds(r1a, r2a)
689+
690+ // Order B: past-mate → overlapping
691+ samClipper.clipExtendingPastMateEnds(r1b, r2b)
692+ samClipper.clipOverlappingReads(r1b, r2b)
693+
694+ assertSameClipping(r1a, r1b, " R1" )
695+ assertSameClipping(r2a, r2b, " R2" )
696+ }
697+
698+ it should " produce the same result for clip-overlapping-reads then clip-bases-past-mate as the reverse order when R1 is much longer and contains a short R2" in {
699+ // R1 is much longer; R2 is short and starts late within R1's range.
700+ // The concern is that clipOverlappingReads clips the midpoint to well before R2.start,
701+ // dramatically shortening R1 but leaving R2 nearly intact.
702+ val bases100 = " A" * 100
703+ val quals100 = " F" * 100
704+ val bases20 = " A" * 20
705+ val quals20 = " F" * 20
706+ val Seq (r1a, r2a) = new SamBuilder (readLength = 100 ).addPair(
707+ start1 = 100 , start2 = 180 , cigar1 = " 100M" , cigar2 = " 20M" ,
708+ bases1 = bases100, quals1 = quals100, bases2 = bases20, quals2 = quals20,
709+ )
710+ val Seq (r1b, r2b) = new SamBuilder (readLength = 100 ).addPair(
711+ start1 = 100 , start2 = 180 , cigar1 = " 100M" , cigar2 = " 20M" ,
712+ bases1 = bases100, quals1 = quals100, bases2 = bases20, quals2 = quals20,
713+ )
714+
715+ // Order A: overlapping → past-mate
716+ samClipper.clipOverlappingReads(r1a, r2a)
717+ samClipper.clipExtendingPastMateEnds(r1a, r2a)
718+
719+ // Order B: past-mate → overlapping
720+ samClipper.clipExtendingPastMateEnds(r1b, r2b)
721+ samClipper.clipOverlappingReads(r1b, r2b)
722+
723+ assertSameClipping(r1a, r1b, " R1" )
724+ assertSameClipping(r2a, r2b, " R2" )
725+ }
726+
727+ it should " produce the same result for clip-overlapping-reads then clip-bases-past-mate as the reverse order with 5-prime hard-clipped reads (issue 1141 scenario)" in {
728+ // Reproduces the asymmetric hard-clipped scenario from issue #1141:
729+ // - R1 has 25H at its 5' (left) end
730+ // - R2 has 25H at its 5' (right/trailing-CIGAR) end
731+ // - Both reads dovetail: R2 starts before R1's reference start
732+ // In this scenario the mate's hard-clipped 5' region must be ignored when determining
733+ // clipping boundaries (fixed in PR #1065). The order of the two operations should not matter.
734+ val readLen = 89 // bases after removing 25 hard-clipped
735+ val Seq (r1a, r2a) = new SamBuilder (readLength = readLen).addPair(
736+ start1 = 100 ,
737+ start2 = 75 ,
738+ cigar1 = s " 25H ${readLen}M " ,
739+ cigar2 = s " ${readLen}M25H " ,
740+ )
741+ val Seq (r1b, r2b) = new SamBuilder (readLength = readLen).addPair(
742+ start1 = 100 ,
743+ start2 = 75 ,
744+ cigar1 = s " 25H ${readLen}M " ,
745+ cigar2 = s " ${readLen}M25H " ,
746+ )
747+
748+ // Order A: overlapping → past-mate
749+ samClipper.clipOverlappingReads(r1a, r2a)
750+ samClipper.clipExtendingPastMateEnds(r1a, r2a)
751+
752+ // Order B: past-mate → overlapping
753+ samClipper.clipExtendingPastMateEnds(r1b, r2b)
754+ samClipper.clipOverlappingReads(r1b, r2b)
755+
756+ assertSameClipping(r1a, r1b, " R1" )
757+ assertSameClipping(r2a, r2b, " R2" )
758+ }
759+
760+ it should " produce the same result for clip-overlapping-reads then clip-bases-past-mate as the reverse order for soft-clipped dovetailed reads" in {
761+ // R2 has 10 soft-clipped bases at its 3' (left reference) end extending the footprint to
762+ // the left. Both reads dovetail: R2's soft-clipped bases reach R1.start.
763+ // cigar2 = "10S90M" has lengthOnQuery=100, matching readLength=100.
764+ val Seq (r1a, r2a) = new SamBuilder (readLength = 100 ).addPair(
765+ start1 = 100 ,
766+ start2 = 110 , // R2 aligned start is 110; the leading 10S extend to ref position 100
767+ cigar1 = " 100M" ,
768+ cigar2 = " 10S90M" ,
769+ )
770+ val Seq (r1b, r2b) = new SamBuilder (readLength = 100 ).addPair(
771+ start1 = 100 ,
772+ start2 = 110 ,
773+ cigar1 = " 100M" ,
774+ cigar2 = " 10S90M" ,
775+ )
776+
777+ // Order A: overlapping → past-mate
778+ samClipper.clipOverlappingReads(r1a, r2a)
779+ samClipper.clipExtendingPastMateEnds(r1a, r2a)
780+
781+ // Order B: past-mate → overlapping
782+ samClipper.clipExtendingPastMateEnds(r1b, r2b)
783+ samClipper.clipOverlappingReads(r1b, r2b)
784+
785+ assertSameClipping(r1a, r1b, " R1" )
786+ assertSameClipping(r2a, r2b, " R2" )
787+ }
788+
789+ it should " produce a 1bp R1 and an unmapped R2 when both 5-prime ends are at the same reference position and R2 has leading soft-clips" in {
790+ // R1 is 1bp at ref 100. R2 has 5 soft-clipped bases followed by 1 aligned base, also starting at ref 100.
791+ // The midpoint falls at mate.end (ref 100), so clipOverlappingReads clips all of R2, unmapping it.
792+ // clipExtendingPastMateEnds is then a no-op because the mate is already unmapped.
793+ val builder = new SamBuilder (readLength = 1 )
794+ val Seq (r1, r2) = builder.addPair(
795+ start1 = 100 ,
796+ start2 = 100 ,
797+ cigar1 = " 1M" ,
798+ cigar2 = " 5S1M" ,
799+ bases1 = " A" ,
800+ quals1 = " F" ,
801+ bases2 = " AAAAAA" ,
802+ quals2 = " FFFFFF" ,
803+ )
804+
805+ samClipper.clipOverlappingReads(r1, r2)
806+ samClipper.clipExtendingPastMateEnds(r1, r2)
807+
808+ r1.start shouldBe 100
809+ r1.end shouldBe 100
810+ r2.unmapped shouldBe true
811+ }
812+
813+ it should " produce a 1bp R1 and an unmapped R2 when both reads are 1bp and fully co-located" in {
814+ // When two 1bp reads are at the same reference position, the midpoint equals mate.end,
815+ // so clipOverlappingReads clips all of R2, resulting in R2 being unmapped.
816+ val Seq (r1a, r2a) = new SamBuilder (readLength = 1 ).addPair(start1 = 100 , start2 = 100 )
817+ val Seq (r1b, r2b) = new SamBuilder (readLength = 1 ).addPair(start1 = 100 , start2 = 100 )
818+
819+ // Order A: overlapping → past-mate
820+ samClipper.clipOverlappingReads(r1a, r2a)
821+ samClipper.clipExtendingPastMateEnds(r1a, r2a)
822+
823+ // Order B: past-mate → overlapping
824+ samClipper.clipExtendingPastMateEnds(r1b, r2b)
825+ samClipper.clipOverlappingReads(r1b, r2b)
826+
827+ r1a.start shouldBe 100
828+ r1a.end shouldBe 100
829+ r2a.unmapped shouldBe true
830+
831+ r1b.start shouldBe 100
832+ r1b.end shouldBe 100
833+ r2b.unmapped shouldBe true
834+ }
835+
836+ it should " clear the proper-pair flag on the surviving read when its mate is clipped to unmapped" in {
837+ val builder = new SamBuilder (readLength = 1 , sort = Some (Queryname ))
838+ builder.addPair(start1 = 100 , start2 = 100 )
839+
840+ val out = makeTempFile(" out." , " .bam" )
841+ new ClipBam (input = builder.toTempFile(), output = out, ref = ref, clipOverlappingReads = true ).execute()
842+
843+ val clipped = SamSource (out).toSeq
844+ clipped should have length 2
845+
846+ val mapped = clipped.find(_.mapped).value
847+ val unmapped = clipped.find(_.unmapped).value
848+
849+ unmapped.unmapped shouldBe true
850+ mapped.mateUnmapped shouldBe true
851+ mapped.properlyPaired shouldBe false
852+ unmapped.properlyPaired shouldBe false
853+ }
854+
621855 it should " unmap reads when the hard clipping length requested is greater than the length of the reads" in {
622856 val builder = new SamBuilder (readLength = 100 , sort = Some (Queryname ))
623857 val clipper = new ClipBam (
0 commit comments