Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions src/main/scala/com/fulcrumgenomics/bam/SortBam.scala
Original file line number Diff line number Diff line change
Expand Up @@ -42,9 +42,9 @@ import com.fulcrumgenomics.util.Io
|4. **RandomQuery**: sorts the reads into a random order but keeps reads with the same
| queryname together. The ordering is deterministic for any given input.
|5. **TemplateCoordinate**: The sort order used by `GroupReadByUmi`. Sorts reads by
| the earlier unclipped 5' coordinate of the read pair, the higher unclipped 5' coordinate of the
| read pair, library, the molecular identifier (MI tag), read name, and if R1 has the lower
| coordinates of the pair.
| the earlier un-soft-clipped 5' coordinate of the read pair, the higher un-soft-clipped 5' coordinate of the
| read pair, the cellular barcode (CB tag), the molecular identifier (MI tag), read name, library,
| and if R1 has the lower coordinates of the pair.
|
|Uses a temporary directory to buffer sets of sorted reads to disk. The number of reads kept in memory
|affects memory use and can be changed with the `--max-records-in-ram` option. The temporary directory
Expand Down
17 changes: 11 additions & 6 deletions src/main/scala/com/fulcrumgenomics/bam/api/SamOrder.scala
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ package com.fulcrumgenomics.bam.api
import com.fulcrumgenomics.umi.ConsensusTags
import htsjdk.samtools.SAMFileHeader.{GroupOrder, SortOrder}
import htsjdk.samtools.util.Murmur3
import htsjdk.samtools.SAMFileHeader
import htsjdk.samtools.{SAMFileHeader, SAMTag}

/** Trait for specifying BAM orderings. */
sealed trait SamOrder extends Product {
Expand Down Expand Up @@ -163,9 +163,10 @@ object SamOrder {
}

/**
* The sort order used by GroupReadByUmi. Sorts reads by the earlier unclipped 5' coordinate of the read
* pair, the higher un-soft-clipped 5' coordinate of the read pair, library, the molecular identifier (see
* [[com.fulcrumgenomics.umi.ConsensusTags.MolecularId]]), read name, and if R1 has the lower coordinates of the pair.
* The sort order used by GroupReadByUmi. Sorts reads by the earlier un-soft-clipped 5' coordinate of the read
* pair, the higher un-soft-clipped 5' coordinate of the read pair, cellular barcode, the molecular identifier
* (see [[com.fulcrumgenomics.umi.ConsensusTags.MolecularId]]), read name, library, and if R1 has the lower
* coordinates of the pair.
*/
case object TemplateCoordinate extends SamOrder {
override type A = TemplateCoordinateKey
Expand All @@ -184,17 +185,18 @@ object SamOrder {
else { rec.mateUnSoftClippedStart.getOrElse(throw new IllegalArgumentException("Record does not have a mate cigar!")) }
}
val lib = Option(rec.readGroup).flatMap(rg => Option(rg.getLibrary)).getOrElse("Unknown")
val cid = rec.getOrElse[String](SAMTag.CB.name, "")
val mid = rec.get[String](ConsensusTags.MolecularId).map { m =>
val index: Int = m.lastIndexOf('/')
if (index >= 0) m.substring(0, index) else m
}.getOrElse("")

if (readChrom < mateChrom || (readChrom == mateChrom && readPos < matePos) ||
(readChrom == mateChrom && readPos == matePos && !readNeg)) {
TemplateCoordinateKey(readChrom, mateChrom, readPos, matePos, readNeg, mateNeg, lib, mid, rec.name, false)
TemplateCoordinateKey(readChrom, mateChrom, readPos, matePos, readNeg, mateNeg, lib, cid, mid, rec.name, false)
}
else {
TemplateCoordinateKey(mateChrom, readChrom, matePos, readPos, mateNeg, readNeg, lib, mid, rec.name, true)
TemplateCoordinateKey(mateChrom, readChrom, matePos, readPos, mateNeg, readNeg, lib, cid, mid, rec.name, true)
}
}
}
Expand All @@ -207,6 +209,7 @@ object SamOrder {
neg1: Boolean,
neg2: Boolean,
library: String,
cid: String,
mid: String,
name : String,
isUpperOfPair: Boolean) extends Ordered[TemplateCoordinateKey] {
Expand All @@ -217,6 +220,8 @@ object SamOrder {
if (retval == 0) retval = Integer.compare(this.pos2, that.pos2)
if (retval == 0) retval = this.neg1.compare(that.neg1)
if (retval == 0) retval = this.neg2.compare(that.neg2)
if (retval == 0) retval = this.cid.length.compare(that.cid.length)
if (retval == 0) retval = this.cid.compare(that.cid)
if (retval == 0) retval = this.mid.length.compareTo(that.mid.length)
if (retval == 0) retval = this.mid.compare(that.mid)
if (retval == 0) retval = this.name.compareTo(that.name)
Expand Down
31 changes: 31 additions & 0 deletions src/test/scala/com/fulcrumgenomics/bam/api/SamOrderTest.scala
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,37 @@ class SamOrderTest extends UnitSpec {
}
}

it should "group molecules within cells when the CB tag is defined" in {
// Two cells (AAAA, BBBB), each with two molecules sharing the same coordinates.
// Without CB the sort degenerates to MI-only ordering, which interleaves molecules
// across cells (AAAA/mol1 adjacent to BBBB/mol1, etc.). With CB present, reads
// must be grouped by cell first, then by molecule within the cell.
val addFuncs: Seq[SamBuilder => Unit] = Seq(
b => { val _ = b.addPair(name="aa1", start1=100, start2=200, attrs=Map("MI" -> "1/A", "CB" -> "AAAA"), bases1="AAAAAAAAAA", bases2="AAAAAAAAAA") },
b => { val _ = b.addPair(name="bb1", start1=100, start2=200, attrs=Map("MI" -> "1/A", "CB" -> "BBBB"), bases1="AAAAAAAAAA", bases2="AAAAAAAAAA") },
b => { val _ = b.addPair(name="aa2", start1=100, start2=200, attrs=Map("MI" -> "2/A", "CB" -> "AAAA"), bases1="AAAAAAAAAA", bases2="AAAAAAAAAA") },
b => { val _ = b.addPair(name="bb2", start1=100, start2=200, attrs=Map("MI" -> "2/A", "CB" -> "BBBB"), bases1="AAAAAAAAAA", bases2="AAAAAAAAAA") },
)

def seq(n: Int, str: String): Seq[String] = IndexedSeq.fill[String](n)(str)

Range.inclusive(start=1, end=10).foreach { _ =>
val builder = new SamBuilder(readLength=10)
scala.util.Random.shuffle(addFuncs).foreach { func => func(builder) }
val f = SamOrder.TemplateCoordinate.sortkey
val recs = builder.iterator.toSeq.sortBy(f(_))
recs should have length 8

// All reads from cell AAAA must precede all reads from cell BBBB.
recs.take(4).map(_.apply[String]("CB")).foreach(_ shouldBe "AAAA")
recs.drop(4).map(_.apply[String]("CB")).foreach(_ shouldBe "BBBB")

// Within each cell, molecules are ordered by MI.
recs.take(4).map(_.apply[String]("MI")) should contain theSameElementsInOrderAs (seq(2, "1/A") ++ seq(2, "2/A"))
recs.drop(4).map(_.apply[String]("MI")) should contain theSameElementsInOrderAs (seq(2, "1/A") ++ seq(2, "2/A"))
}
}

it should "sort pairs by the 'lower' 5' position of the pair" in {
val builder = new SamBuilder(readLength=100, sort=Some(SamOrder.Coordinate))
val exp = ListBuffer[SamRecord]()
Expand Down