Skip to content

Add support for filtering rows by flags #1337

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions pysam/libcalignmentfile.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,13 @@ cdef class IteratorRowSelection(IteratorRow):
cdef int cnext(self)


cdef class IteratorRowFilter(IteratorRow):
cdef int flag_filter
cdef int flag_require
cdef bam1_t * getCurrent(self)
cdef int cnext(self)


cdef class IteratorColumn:

# result of the last plbuf_push
Expand Down
7 changes: 7 additions & 0 deletions pysam/libcalignmentfile.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,12 @@ class AlignmentFile(HTSFile):
def find_introns(
self, read_iterator: Iterable[AlignedSegment]
) -> Dict[Tuple[int, int], int]: ...
def filter(
self,
flag_filter: int = ...,
flag_require: int = ...,
multiple_iterators: bool = ...,
) -> IteratorRowFilter: ...
def close(self) -> None: ...
def write(self, read: AlignedSegment) -> int: ...
def __enter__(self) -> AlignmentFile: ...
Expand Down Expand Up @@ -202,6 +208,7 @@ class IteratorRowAllRefs(IteratorRow): ...
class IteratorRowHead(IteratorRow): ...
class IteratorRowRegion(IteratorRow): ...
class IteratorRowSelection(IteratorRow): ...
class IteratorRowFilter(IteratorRow): ...

class IteratorColumn:
def __iter__(self) -> IteratorColumn: ...
Expand Down
77 changes: 77 additions & 0 deletions pysam/libcalignmentfile.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -1663,6 +1663,30 @@ cdef class AlignmentFile(HTSFile):
res[(junc_start, base_position)] += 1
return res

def filter(self, **kwargs):
"""iterate and filter segments in the file.

Parameters
----------

flag_filter : int

ignore reads where any of the bits in the flag are set. The default
is BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP.

flag_require : int

only use reads where certain flags are set. The default is 0.

Returns
-------

an iterator over filtered rows. : IteratorRowFilter

"""
if not self.is_open:
raise ValueError("I/O operation on closed file")
return IteratorRowFilter(self, **kwargs)

def close(self):
'''closes the :class:`pysam.AlignmentFile`.'''
Expand Down Expand Up @@ -2323,6 +2347,59 @@ cdef class IteratorRowSelection(IteratorRow):
raise IOError(read_failure_reason(ret))


cdef class IteratorRowFilter(IteratorRow):
"""*(AlignmentFile samfile, int flag_filter = BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP, int flag_require = 0, bool multiple_iterators=False)*

iterates over reads that have none of the given flags set.

.. note::
It is usually not necessary to create an object of this class
explicitly. It is returned as a result of call to a :meth:`AlignmentFile.filter`.
"""
def __init__(
self,
AlignmentFile samfile,
int flag_filter = BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP,
int flag_require = 0,
bint multiple_iterators=True
):
super().__init__(samfile, multiple_iterators=multiple_iterators)

self.flag_filter = flag_filter
self.flag_require = flag_require

def __iter__(self):
return self

cdef bam1_t* getCurrent(self):
return self.b

cdef int cnext(self):
'''cversion of iterator. Used by IteratorColumn'''
cdef int ret
cdef bam_hdr_t * hdr = self.header.ptr
with nogil:
while 1:
ret = sam_read1(self.htsfile, hdr, self.b)
if ret < 0:
break
if self.b.core.flag & self.flag_filter:
continue
elif self.flag_require and not (self.b.core.flag & self.flag_require):
continue
break
return ret

def __next__(self):
cdef int ret = self.cnext()
if ret >= 0:
return makeAlignedSegment(self.b, self.header)
elif ret == -1:
raise StopIteration
else:
raise IOError(read_failure_reason(ret))


cdef int __advance_nofilter(void *data, bam1_t *b):
'''advance without any read filtering.
'''
Expand Down
47 changes: 46 additions & 1 deletion tests/AlignmentFile_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -2411,7 +2411,52 @@ def test_reading_writing_cram(self):
return
read = self.build_read()
self.check_read(read, mode="cram")



class TestAlignmentFileFilter(unittest.TestCase):

filename = os.path.join(BAM_DATADIR, 'ex1.bam')
mode = "rb"

def testNoFilter(self):
with pysam.AlignmentFile(self.filename, self.mode) as samfile1:
total = 0
for read in samfile1:
total += 1
with pysam.AlignmentFile(self.filename, self.mode) as samfile2:
n2 = 0
for read in samfile2.filter(flag_filter=0):
n2 += 1
self.assertEqual(total, n2)

def testMapped(self):
with pysam.AlignmentFile(self.filename, self.mode) as samfile1:
n1 = 0
total = 0
for read in samfile1:
n1 += read.is_mapped
total += 1
self.assertLessEqual(n1, total)
with pysam.AlignmentFile(self.filename, self.mode) as samfile2:
n2 = 0
for read in samfile2.filter(flag_filter=pysam.FUNMAP):
n2 += 1
self.assertEqual(n1, n2)

def testUnmapped(self):
with pysam.AlignmentFile(self.filename, self.mode) as samfile1:
n1 = 0
total = 0
for read in samfile1:
n1 += not read.is_mapped
total += 1
self.assertLessEqual(n1, total)
with pysam.AlignmentFile(self.filename, self.mode) as samfile2:
n2 = 0
for read in samfile2.filter(flag_filter=0, flag_require=pysam.FUNMAP):
n2 += 1
self.assertEqual(n1, n2)

# SAM writing fails, as query length is 0
# class TestSanityCheckingSAM(TestSanityCheckingSAM):
# mode = "w"
Expand Down
Loading