Skip to content

feat: add class to generate pileup over AlignedSegments #1353

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 2 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 13 additions & 1 deletion pysam/libcalignmentfile.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,6 @@ cdef class IteratorColumn:
# backwards compatibility
cdef char * getSequence(self)


cdef class IteratorColumnRegion(IteratorColumn):
cdef int start
cdef int stop
Expand All @@ -144,6 +143,19 @@ cdef class IteratorColumnAllRefs(IteratorColumn):
cdef class IteratorColumnAll(IteratorColumn):
pass

cdef class IteratorColumnRecords:
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this cannot extend IteratorColumn since IteratorColumn requires a SAM file.

cdef int cnext(self)
cdef bam_plp_t plp_iter
cdef int tid
cdef int pos
cdef int n_plp
cdef uint32_t min_base_quality
cdef const bam_pileup1_t * plp
cdef AlignmentHeader header
cdef char * seq
cdef int seq_len
cdef faidx_t * fastafile


cdef class IndexedReads:
cdef AlignmentFile samfile
Expand Down
7 changes: 7 additions & 0 deletions pysam/libcalignmentfile.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,13 @@ class IteratorColumn:
class IteratorColumnAll(IteratorColumn): ...
class IteratorColumnAllRefs(IteratorColumn): ...
class IteratorColumnRegion(IteratorColumn): ...
class IteratorColumnRecords():
def __iter__(self) -> IteratorColumn: ...
def __next__(self) -> PileupColumn: ...
@property
def seq_len(self) -> int: ...
def add_reference(self, fastafile: FastaFile) -> None: ...
def has_reference(self) -> bool: ...

class SNPCall:
@property
Expand Down
141 changes: 139 additions & 2 deletions pysam/libcalignmentfile.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
# class IteratorColumnRegion
# class IteratorColumnAll
# class IteratorColumnAllRefs
# class IteratorColumnRecords
#
########################################################
#
Expand Down Expand Up @@ -57,6 +58,8 @@
########################################################
import os
import collections
from typing import Iterable, Optional

try:
from collections.abc import Sequence, Mapping # noqa
except ImportError:
Expand All @@ -67,7 +70,8 @@ import array
from libc.errno cimport errno, EPIPE
from libc.string cimport strcmp, strpbrk
from libc.stdint cimport INT32_MAX

import faulthandler
faulthandler.enable()
from cpython cimport array as c_array

from pysam.libcutils cimport force_bytes, force_str, charptr_to_str
Expand All @@ -86,6 +90,7 @@ __all__ = [
"AlignmentHeader",
"IteratorRow",
"IteratorColumn",
"IteratorColumnRecords",
"IndexedReads"]

IndexStats = collections.namedtuple("IndexStats",
Expand Down Expand Up @@ -2688,7 +2693,7 @@ cdef class IteratorColumn:
return self.get_sequence()
def addReference(self, FastaFile fastafile):
return self.add_reference(fastafile)


cdef class IteratorColumnRegion(IteratorColumn):
'''iterates over a region only.
Expand Down Expand Up @@ -2811,6 +2816,138 @@ cdef class IteratorColumnAll(IteratorColumn):
self.samfile.header)



cdef class IteratorColumnRecords:
'''Iterator over columns when given a collection of :class:`~pysam.AlignedSegment`s.

For reasons of efficiency, the iterator requires the given
:class:`~pysam.AlignedSegment`s to be in coordinate sorted order.
For implementation simplicity, all the records will be consumed
from the given iterator.

For example:

f = AlignmentFile("file.bam", "rb")
result = list(IteratorColumnRecords([rec for rec in f]))

Here, `result`` will be a list of ``n`` lists of objects of type
:class:`~pysam.PileupRead`.

If the iterator is associated with a :class:`~pysam.Fastafile`
using the :meth:`add_reference` method, then the iterator will
export the current sequence via the methods :meth:`get_sequence`
and :meth:`seq_len`.

See :class:`~AlignmentFile.pileup` for kwargs to the iterator.
'''

def __cinit__(self, recs: Iterable[AlignedSegment], **kwargs):
cdef FastaFile fastafile = kwargs.get("fastafile", None)
if fastafile is None:
self.fastafile = NULL
else:
self.fastafile = fastafile.fastafile
self.min_base_quality = kwargs.get("min_base_quality", 13)
self.plp_iter = <bam_plp_t>bam_plp_init(NULL, NULL)
rec: AlignedSegment
self.header: Optional[AlignmentHeader] = None
for rec in recs:
if self.header is None:
self.header = rec.header
if bam_plp_push(self.plp_iter, rec._delegate) != 0:
raise Exception("Could not add record to the iteator: {}".format(str(rec)))

def __dealloc__(self):
bam_plp_destroy(self.plp_iter)
self.plp_iter = <bam_plp_t>NULL
if self.seq != NULL:
free(self.seq)
self.seq = NULL

def __iter__(self):
return self

cdef int cnext(self):
'''perform next iteration.
'''
self.plp = <bam_pileup1_t*>bam_plp_next(
self.plp_iter,
&self.tid,
&self.pos,
&self.n_plp
)
if self.plp == NULL:
return 0
# if self.plp_iter.error > 0:
# return -1
# else:
# return 0
else:
return 1

def __next__(self):
cdef int n
cdef int tid
n = self.cnext()
if n < 0:
raise ValueError("error during iteration")

if n == 0:
raise StopIteration

# reload sequence
cdef bam1_t *b = self.plp[0].b
if self.fastafile != NULL and self.tid != b.core.tid:
if self.seq != NULL:
free(self.seq)
self.tid = b.core.tid
tid = self.tid
assert self.header is not None
with nogil:
self.seq = faidx_fetch_seq(
self.fastafile,
self.header.ptr.target_name[tid],
0, MAX_POS,
&self.seq_len)

if self.seq == NULL:
raise ValueError(
"reference sequence for '{}' (tid={}) not found".format(
self.header.target_name[self.tid], self.tid))

return makePileupColumn(&self.plp,
self.tid,
self.pos,
self.n_plp,
self.min_base_quality,
self.seq,
self.header)

cdef char * get_sequence(self):
'''return current reference sequence underlying the iterator.
'''
return self.seq

property seq_len:
'''current sequence length.'''
def __get__(self):
return self.seq_len

def add_reference(self, FastaFile fastafile):
'''add reference sequences in `fastafile` to iterator.'''
self.fastafile = fastafile.fastafile
if self.seq != NULL:
free(self.seq)
self.tid = -1


def has_reference(self):
'''
return true if iterator is associated with a reference'''
return self.fastafile != NULL



cdef class SNPCall:
'''the results of a SNP call.'''
cdef int _tid
Expand Down
3 changes: 2 additions & 1 deletion pysam/libchtslib.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -1222,7 +1222,8 @@ cdef extern from "htslib/sam.h" nogil:
ctypedef int (*bam_plp_auto_f)(void *data, bam1_t *b)
ctypedef int (*bam_test_f)()

ctypedef struct __bam_plp_t
ctypedef struct __bam_plp_t:
int error
ctypedef __bam_plp_t *bam_plp_t

ctypedef struct __bam_mplp_t
Expand Down
Loading