diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 7ccc0f5c..57967a0e 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -28,7 +28,7 @@ jobs: python-version: ${{ matrix.python-version }} - name: Install prerequisite Python libraries - run: pip install cython mypy pytest setuptools + run: pip install cython mypy pytest setuptools parameterized - name: Install Linux build prerequisites if: runner.os == 'Linux' @@ -79,7 +79,7 @@ jobs: python-version: ${{ matrix.python-version }} - name: Install prerequisite Python libraries - run: pip install cython pytest + run: pip install cython pytest parameterized - name: Install build prerequisites if: runner.os == 'Linux' @@ -147,7 +147,7 @@ jobs: run: python setup.py install - name: Install test prerequisites via Conda - run: conda install "samtools>=1.11" "bcftools>=1.11" "htslib>=1.11" pytest + run: conda install "samtools>=1.11" "bcftools>=1.11" "htslib>=1.11" pytest parameterized - name: Run tests run: REF_PATH=':' pytest diff --git a/pysam/libcalignmentfile.pxd b/pysam/libcalignmentfile.pxd index cd0ebf83..845d9d61 100644 --- a/pysam/libcalignmentfile.pxd +++ b/pysam/libcalignmentfile.pxd @@ -130,7 +130,6 @@ cdef class IteratorColumn: # backwards compatibility cdef char * getSequence(self) - cdef class IteratorColumnRegion(IteratorColumn): cdef int start cdef int stop @@ -144,6 +143,20 @@ cdef class IteratorColumnAllRefs(IteratorColumn): cdef class IteratorColumnAll(IteratorColumn): pass +cdef class IteratorColumnRecords: + cdef int cnext(self) + cdef bam_plp_t plp_iter + cdef int tid + cdef int pos + cdef int n_plp + cdef uint32_t min_base_quality + cdef const bam_pileup1_t * plp + cdef AlignmentHeader header + cdef char * seq + cdef int seq_len + cdef faidx_t * fastafile + cdef char * get_sequence(self) + cdef class IndexedReads: cdef AlignmentFile samfile diff --git a/pysam/libcalignmentfile.pyi b/pysam/libcalignmentfile.pyi index 5723a5af..15cf7173 100644 --- a/pysam/libcalignmentfile.pyi +++ b/pysam/libcalignmentfile.pyi @@ -210,6 +210,13 @@ class IteratorColumn: class IteratorColumnAll(IteratorColumn): ... class IteratorColumnAllRefs(IteratorColumn): ... class IteratorColumnRegion(IteratorColumn): ... +class IteratorColumnRecords(): + def __iter__(self) -> IteratorColumn: ... + def __next__(self) -> PileupColumn: ... + @property + def seq_len(self) -> int: ... + def add_reference(self, fastafile: FastaFile) -> None: ... + def has_reference(self) -> bool: ... class SNPCall: @property diff --git a/pysam/libcalignmentfile.pyx b/pysam/libcalignmentfile.pyx index 4373a75f..ea2c8cb2 100644 --- a/pysam/libcalignmentfile.pyx +++ b/pysam/libcalignmentfile.pyx @@ -29,6 +29,7 @@ # class IteratorColumnRegion # class IteratorColumnAll # class IteratorColumnAllRefs +# class IteratorColumnRecords # ######################################################## # @@ -57,6 +58,8 @@ ######################################################## import os import collections +from typing import Iterable, Optional + try: from collections.abc import Sequence, Mapping # noqa except ImportError: @@ -67,7 +70,8 @@ import array from libc.errno cimport errno, EPIPE from libc.string cimport strcmp, strpbrk from libc.stdint cimport INT32_MAX - +import faulthandler +faulthandler.enable() from cpython cimport array as c_array from pysam.libcutils cimport force_bytes, force_str, charptr_to_str @@ -86,6 +90,7 @@ __all__ = [ "AlignmentHeader", "IteratorRow", "IteratorColumn", + "IteratorColumnRecords", "IndexedReads"] IndexStats = collections.namedtuple("IndexStats", @@ -2688,7 +2693,7 @@ cdef class IteratorColumn: return self.get_sequence() def addReference(self, FastaFile fastafile): return self.add_reference(fastafile) - + cdef class IteratorColumnRegion(IteratorColumn): '''iterates over a region only. @@ -2811,6 +2816,138 @@ cdef class IteratorColumnAll(IteratorColumn): self.samfile.header) + +cdef class IteratorColumnRecords: + '''Iterator over columns when given a collection of :class:`~pysam.AlignedSegment`s. + + For reasons of efficiency, the iterator requires the given + :class:`~pysam.AlignedSegment`s to be in coordinate sorted order. + For implementation simplicity, all the records will be consumed + from the given iterator. + + For example: + + f = AlignmentFile("file.bam", "rb") + result = list(IteratorColumnRecords([rec for rec in f])) + + Here, `result`` will be a list of ``n`` lists of objects of type + :class:`~pysam.PileupRead`. + + If the iterator is associated with a :class:`~pysam.Fastafile` + using the :meth:`add_reference` method, then the iterator will + export the current sequence via the methods :meth:`get_sequence` + and :meth:`seq_len`. + + See :class:`~AlignmentFile.pileup` for kwargs to the iterator. + ''' + + def __cinit__(self, recs: Iterable[AlignedSegment], **kwargs): + cdef FastaFile fastafile = kwargs.get("fastafile", None) + if fastafile is None: + self.fastafile = NULL + else: + self.fastafile = fastafile.fastafile + self.min_base_quality = kwargs.get("min_base_quality", 13) + self.plp_iter = bam_plp_init(NULL, NULL) + rec: AlignedSegment + self.header: Optional[AlignmentHeader] = None + for rec in recs: + if self.header is None: + self.header = rec.header + if bam_plp_push(self.plp_iter, rec._delegate) != 0: + raise Exception("Could not add record to the iteator: {}".format(str(rec))) + + def __dealloc__(self): + bam_plp_destroy(self.plp_iter) + self.plp_iter = NULL + if self.seq != NULL: + free(self.seq) + self.seq = NULL + + def __iter__(self): + return self + + cdef int cnext(self): + '''perform next iteration. + ''' + self.plp = bam_plp_next( + self.plp_iter, + &self.tid, + &self.pos, + &self.n_plp + ) + if self.plp == NULL: + return 0 + # if self.plp_iter.error > 0: + # return -1 + # else: + # return 0 + else: + return 1 + + def __next__(self): + cdef int n + cdef int tid + n = self.cnext() + if n < 0: + raise ValueError("error during iteration") + + if n == 0: + raise StopIteration + + # reload sequence + cdef bam1_t *b = self.plp[0].b + if self.fastafile != NULL and self.tid != b.core.tid: + if self.seq != NULL: + free(self.seq) + self.tid = b.core.tid + tid = self.tid + assert self.header is not None + with nogil: + self.seq = faidx_fetch_seq( + self.fastafile, + self.header.ptr.target_name[tid], + 0, MAX_POS, + &self.seq_len) + + if self.seq == NULL: + raise ValueError( + "reference sequence for '{}' (tid={}) not found".format( + self.header.target_name[self.tid], self.tid)) + + return makePileupColumn(&self.plp, + self.tid, + self.pos, + self.n_plp, + self.min_base_quality, + self.seq, + self.header) + + cdef char * get_sequence(self): + '''return current reference sequence underlying the iterator. + ''' + return self.seq + + property seq_len: + '''current sequence length.''' + def __get__(self): + return self.seq_len + + def add_reference(self, FastaFile fastafile): + '''add reference sequences in `fastafile` to iterator.''' + self.fastafile = fastafile.fastafile + if self.seq != NULL: + free(self.seq) + self.tid = -1 + + + def has_reference(self): + ''' + return true if iterator is associated with a reference''' + return self.fastafile != NULL + + + cdef class SNPCall: '''the results of a SNP call.''' cdef int _tid diff --git a/pysam/libchtslib.pxd b/pysam/libchtslib.pxd index 99ea1576..d2c2704b 100644 --- a/pysam/libchtslib.pxd +++ b/pysam/libchtslib.pxd @@ -1222,7 +1222,8 @@ cdef extern from "htslib/sam.h" nogil: ctypedef int (*bam_plp_auto_f)(void *data, bam1_t *b) ctypedef int (*bam_test_f)() - ctypedef struct __bam_plp_t + ctypedef struct __bam_plp_t: + int error ctypedef __bam_plp_t *bam_plp_t ctypedef struct __bam_mplp_t diff --git a/requirements-dev.txt b/requirements-dev.txt index 420c17e9..f3be92f1 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1 +1,2 @@ Cython>=0.29.12,<4 +parameterized>=0.9.0,<1 diff --git a/tests/AlignmentFilePileup_test.py b/tests/AlignmentFilePileup_test.py index 7b56b1a5..66e8bc61 100644 --- a/tests/AlignmentFilePileup_test.py +++ b/tests/AlignmentFilePileup_test.py @@ -1,5 +1,8 @@ """Benchmarking module for AlignmentFile functionality""" import os + +from parameterized import parameterized_class + import pysam import sys import unittest @@ -352,15 +355,18 @@ def testStr(self): s = str(pcolumn) self.assertEqual(len(s.split("\n")), 2) - +@parameterized_class(("from_file", ), [ + (True, ), (False, ) +]) class PileUpColumnTests(unittest.TestCase): + from_file: bool fn = os.path.join(BAM_DATADIR, "ex2.bam") fn_fasta = os.path.join(BAM_DATADIR, "ex1.fa") def test_pileup_depths_are_equal(self): samtools_result = PileupTestUtils.build_depth_with_samtoolspipe(self.fn) - pysam_result = PileupTestUtils.build_depth_with_filter_with_pysam(self.fn) + pysam_result = PileupTestUtils.build_depth_with_filter_with_pysam(self.fn, from_file=self.from_file) self.assertEqual(pysam_result, samtools_result) def test_pileup_query_bases_without_reference_are_equal(self): diff --git a/tests/PileupTestUtils.py b/tests/PileupTestUtils.py index 33acc9ec..1c963001 100644 --- a/tests/PileupTestUtils.py +++ b/tests/PileupTestUtils.py @@ -3,6 +3,7 @@ import pysam from TestUtils import force_str +from pysam.libcalignmentfile import IteratorColumnRecords def build_pileup_with_samtoolsshell(fn): @@ -40,9 +41,12 @@ def build_depth_with_samtoolspipe(fn): return [int(x[3]) for x in data] -def build_depth_with_filter_with_pysam(*args, **kwargs): +def build_depth_with_filter_with_pysam(*args, from_file: bool = True, **kwargs): with pysam.AlignmentFile(*args, **kwargs) as inf: - return [x.get_num_aligned() for x in inf.pileup(stepper="samtools")] + if from_file: + return [x.get_num_aligned() for x in inf.pileup(stepper="samtools")] + else: + return [x.get_num_aligned() for x in IteratorColumnRecords(inf)] def build_depth_with_pysam(*args, **kwargs):