diff --git a/pysam/libcalignmentfile.pxd b/pysam/libcalignmentfile.pxd index cd0ebf83..b59cbdef 100644 --- a/pysam/libcalignmentfile.pxd +++ b/pysam/libcalignmentfile.pxd @@ -99,6 +99,13 @@ cdef class IteratorRowSelection(IteratorRow): cdef int cnext(self) +cdef class IteratorRowFilter(IteratorRow): + cdef int flag_filter + cdef int flag_require + cdef bam1_t * getCurrent(self) + cdef int cnext(self) + + cdef class IteratorColumn: # result of the last plbuf_push diff --git a/pysam/libcalignmentfile.pyi b/pysam/libcalignmentfile.pyi index 6f106af9..228f4f98 100644 --- a/pysam/libcalignmentfile.pyi +++ b/pysam/libcalignmentfile.pyi @@ -165,6 +165,12 @@ class AlignmentFile(HTSFile): def find_introns( self, read_iterator: Iterable[AlignedSegment] ) -> Dict[Tuple[int, int], int]: ... + def filter( + self, + flag_filter: int = ..., + flag_require: int = ..., + multiple_iterators: bool = ..., + ) -> IteratorRowFilter: ... def close(self) -> None: ... def write(self, read: AlignedSegment) -> int: ... def __enter__(self) -> AlignmentFile: ... @@ -202,6 +208,7 @@ class IteratorRowAllRefs(IteratorRow): ... class IteratorRowHead(IteratorRow): ... class IteratorRowRegion(IteratorRow): ... class IteratorRowSelection(IteratorRow): ... +class IteratorRowFilter(IteratorRow): ... class IteratorColumn: def __iter__(self) -> IteratorColumn: ... diff --git a/pysam/libcalignmentfile.pyx b/pysam/libcalignmentfile.pyx index dc8cadd0..61eae02d 100644 --- a/pysam/libcalignmentfile.pyx +++ b/pysam/libcalignmentfile.pyx @@ -1663,6 +1663,30 @@ cdef class AlignmentFile(HTSFile): res[(junc_start, base_position)] += 1 return res + def filter(self, **kwargs): + """iterate and filter segments in the file. + + Parameters + ---------- + + flag_filter : int + + ignore reads where any of the bits in the flag are set. The default + is BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP. + + flag_require : int + + only use reads where certain flags are set. The default is 0. + + Returns + ------- + + an iterator over filtered rows. : IteratorRowFilter + + """ + if not self.is_open: + raise ValueError("I/O operation on closed file") + return IteratorRowFilter(self, **kwargs) def close(self): '''closes the :class:`pysam.AlignmentFile`.''' @@ -2323,6 +2347,59 @@ cdef class IteratorRowSelection(IteratorRow): raise IOError(read_failure_reason(ret)) +cdef class IteratorRowFilter(IteratorRow): + """*(AlignmentFile samfile, int flag_filter = BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP, int flag_require = 0, bool multiple_iterators=False)* + + iterates over reads that have none of the given flags set. + + .. note:: + It is usually not necessary to create an object of this class + explicitly. It is returned as a result of call to a :meth:`AlignmentFile.filter`. + """ + def __init__( + self, + AlignmentFile samfile, + int flag_filter = BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP, + int flag_require = 0, + bint multiple_iterators=True + ): + super().__init__(samfile, multiple_iterators=multiple_iterators) + + self.flag_filter = flag_filter + self.flag_require = flag_require + + def __iter__(self): + return self + + cdef bam1_t* getCurrent(self): + return self.b + + cdef int cnext(self): + '''cversion of iterator. Used by IteratorColumn''' + cdef int ret + cdef bam_hdr_t * hdr = self.header.ptr + with nogil: + while 1: + ret = sam_read1(self.htsfile, hdr, self.b) + if ret < 0: + break + if self.b.core.flag & self.flag_filter: + continue + elif self.flag_require and not (self.b.core.flag & self.flag_require): + continue + break + return ret + + def __next__(self): + cdef int ret = self.cnext() + if ret >= 0: + return makeAlignedSegment(self.b, self.header) + elif ret == -1: + raise StopIteration + else: + raise IOError(read_failure_reason(ret)) + + cdef int __advance_nofilter(void *data, bam1_t *b): '''advance without any read filtering. ''' diff --git a/tests/AlignmentFile_test.py b/tests/AlignmentFile_test.py index 0d599df3..dc5340ba 100644 --- a/tests/AlignmentFile_test.py +++ b/tests/AlignmentFile_test.py @@ -2411,7 +2411,52 @@ def test_reading_writing_cram(self): return read = self.build_read() self.check_read(read, mode="cram") - + + +class TestAlignmentFileFilter(unittest.TestCase): + + filename = os.path.join(BAM_DATADIR, 'ex1.bam') + mode = "rb" + + def testNoFilter(self): + with pysam.AlignmentFile(self.filename, self.mode) as samfile1: + total = 0 + for read in samfile1: + total += 1 + with pysam.AlignmentFile(self.filename, self.mode) as samfile2: + n2 = 0 + for read in samfile2.filter(flag_filter=0): + n2 += 1 + self.assertEqual(total, n2) + + def testMapped(self): + with pysam.AlignmentFile(self.filename, self.mode) as samfile1: + n1 = 0 + total = 0 + for read in samfile1: + n1 += read.is_mapped + total += 1 + self.assertLessEqual(n1, total) + with pysam.AlignmentFile(self.filename, self.mode) as samfile2: + n2 = 0 + for read in samfile2.filter(flag_filter=pysam.FUNMAP): + n2 += 1 + self.assertEqual(n1, n2) + + def testUnmapped(self): + with pysam.AlignmentFile(self.filename, self.mode) as samfile1: + n1 = 0 + total = 0 + for read in samfile1: + n1 += not read.is_mapped + total += 1 + self.assertLessEqual(n1, total) + with pysam.AlignmentFile(self.filename, self.mode) as samfile2: + n2 = 0 + for read in samfile2.filter(flag_filter=0, flag_require=pysam.FUNMAP): + n2 += 1 + self.assertEqual(n1, n2) + # SAM writing fails, as query length is 0 # class TestSanityCheckingSAM(TestSanityCheckingSAM): # mode = "w"