Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ jobs:
python-version: ${{ matrix.python-version }}

- name: Install prerequisite Python libraries
run: pip install cython mypy pytest setuptools
run: pip install cython mypy pytest setuptools parameterized

- name: Install Linux build prerequisites
if: runner.os == 'Linux'
Expand Down Expand Up @@ -79,7 +79,7 @@ jobs:
python-version: ${{ matrix.python-version }}

- name: Install prerequisite Python libraries
run: pip install cython pytest
run: pip install cython pytest parameterized

- name: Install build prerequisites
if: runner.os == 'Linux'
Expand Down Expand Up @@ -147,7 +147,7 @@ jobs:
run: python setup.py install

- name: Install test prerequisites via Conda
run: conda install "samtools>=1.11" "bcftools>=1.11" "htslib>=1.11" pytest
run: conda install "samtools>=1.11" "bcftools>=1.11" "htslib>=1.11" pytest parameterized

- name: Run tests
run: REF_PATH=':' pytest
15 changes: 14 additions & 1 deletion pysam/libcalignmentfile.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,6 @@ cdef class IteratorColumn:
# backwards compatibility
cdef char * getSequence(self)


cdef class IteratorColumnRegion(IteratorColumn):
cdef int start
cdef int stop
Expand All @@ -144,6 +143,20 @@ cdef class IteratorColumnAllRefs(IteratorColumn):
cdef class IteratorColumnAll(IteratorColumn):
pass

cdef class IteratorColumnRecords:
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this cannot extend IteratorColumn since IteratorColumn requires a SAM file.

cdef int cnext(self)
cdef bam_plp_t plp_iter
cdef int tid
cdef int pos
cdef int n_plp
cdef uint32_t min_base_quality
cdef const bam_pileup1_t * plp
cdef AlignmentHeader header
cdef char * seq
cdef int seq_len
cdef faidx_t * fastafile
cdef char * get_sequence(self)


cdef class IndexedReads:
cdef AlignmentFile samfile
Expand Down
7 changes: 7 additions & 0 deletions pysam/libcalignmentfile.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,13 @@ class IteratorColumn:
class IteratorColumnAll(IteratorColumn): ...
class IteratorColumnAllRefs(IteratorColumn): ...
class IteratorColumnRegion(IteratorColumn): ...
class IteratorColumnRecords():
def __iter__(self) -> IteratorColumn: ...
def __next__(self) -> PileupColumn: ...
@property
def seq_len(self) -> int: ...
def add_reference(self, fastafile: FastaFile) -> None: ...
def has_reference(self) -> bool: ...

class SNPCall:
@property
Expand Down
141 changes: 139 additions & 2 deletions pysam/libcalignmentfile.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
# class IteratorColumnRegion
# class IteratorColumnAll
# class IteratorColumnAllRefs
# class IteratorColumnRecords
#
########################################################
#
Expand Down Expand Up @@ -57,6 +58,8 @@
########################################################
import os
import collections
from typing import Iterable, Optional

try:
from collections.abc import Sequence, Mapping # noqa
except ImportError:
Expand All @@ -67,7 +70,8 @@ import array
from libc.errno cimport errno, EPIPE
from libc.string cimport strcmp, strpbrk
from libc.stdint cimport INT32_MAX

import faulthandler
faulthandler.enable()
from cpython cimport array as c_array

from pysam.libcutils cimport force_bytes, force_str, charptr_to_str
Expand All @@ -86,6 +90,7 @@ __all__ = [
"AlignmentHeader",
"IteratorRow",
"IteratorColumn",
"IteratorColumnRecords",
"IndexedReads"]

IndexStats = collections.namedtuple("IndexStats",
Expand Down Expand Up @@ -2688,7 +2693,7 @@ cdef class IteratorColumn:
return self.get_sequence()
def addReference(self, FastaFile fastafile):
return self.add_reference(fastafile)


cdef class IteratorColumnRegion(IteratorColumn):
'''iterates over a region only.
Expand Down Expand Up @@ -2811,6 +2816,138 @@ cdef class IteratorColumnAll(IteratorColumn):
self.samfile.header)



cdef class IteratorColumnRecords:
'''Iterator over columns when given a collection of :class:`~pysam.AlignedSegment`s.

For reasons of efficiency, the iterator requires the given
:class:`~pysam.AlignedSegment`s to be in coordinate sorted order.
For implementation simplicity, all the records will be consumed
from the given iterator.

For example:

f = AlignmentFile("file.bam", "rb")
result = list(IteratorColumnRecords([rec for rec in f]))

Here, `result`` will be a list of ``n`` lists of objects of type
:class:`~pysam.PileupRead`.

If the iterator is associated with a :class:`~pysam.Fastafile`
using the :meth:`add_reference` method, then the iterator will
export the current sequence via the methods :meth:`get_sequence`
and :meth:`seq_len`.

See :class:`~AlignmentFile.pileup` for kwargs to the iterator.
'''

def __cinit__(self, recs: Iterable[AlignedSegment], **kwargs):
cdef FastaFile fastafile = kwargs.get("fastafile", None)
if fastafile is None:
self.fastafile = NULL
else:
self.fastafile = fastafile.fastafile
self.min_base_quality = kwargs.get("min_base_quality", 13)
self.plp_iter = <bam_plp_t>bam_plp_init(NULL, NULL)
rec: AlignedSegment
self.header: Optional[AlignmentHeader] = None
for rec in recs:
if self.header is None:
self.header = rec.header
if bam_plp_push(self.plp_iter, rec._delegate) != 0:
raise Exception("Could not add record to the iteator: {}".format(str(rec)))

def __dealloc__(self):
bam_plp_destroy(self.plp_iter)
self.plp_iter = <bam_plp_t>NULL
if self.seq != NULL:
free(self.seq)
self.seq = NULL

def __iter__(self):
return self

cdef int cnext(self):
'''perform next iteration.
'''
self.plp = <bam_pileup1_t*>bam_plp_next(
self.plp_iter,
&self.tid,
&self.pos,
&self.n_plp
)
if self.plp == NULL:
return 0
# if self.plp_iter.error > 0:
# return -1
# else:
# return 0
else:
return 1

def __next__(self):
cdef int n
cdef int tid
n = self.cnext()
if n < 0:
raise ValueError("error during iteration")

if n == 0:
raise StopIteration

# reload sequence
cdef bam1_t *b = self.plp[0].b
if self.fastafile != NULL and self.tid != b.core.tid:
if self.seq != NULL:
free(self.seq)
self.tid = b.core.tid
tid = self.tid
assert self.header is not None
with nogil:
self.seq = faidx_fetch_seq(
self.fastafile,
self.header.ptr.target_name[tid],
0, MAX_POS,
&self.seq_len)

if self.seq == NULL:
raise ValueError(
"reference sequence for '{}' (tid={}) not found".format(
self.header.target_name[self.tid], self.tid))

return makePileupColumn(&self.plp,
self.tid,
self.pos,
self.n_plp,
self.min_base_quality,
self.seq,
self.header)

cdef char * get_sequence(self):
'''return current reference sequence underlying the iterator.
'''
return self.seq

property seq_len:
'''current sequence length.'''
def __get__(self):
return self.seq_len

def add_reference(self, FastaFile fastafile):
'''add reference sequences in `fastafile` to iterator.'''
self.fastafile = fastafile.fastafile
if self.seq != NULL:
free(self.seq)
self.tid = -1


def has_reference(self):
'''
return true if iterator is associated with a reference'''
return self.fastafile != NULL



cdef class SNPCall:
'''the results of a SNP call.'''
cdef int _tid
Expand Down
3 changes: 2 additions & 1 deletion pysam/libchtslib.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -1222,7 +1222,8 @@ cdef extern from "htslib/sam.h" nogil:
ctypedef int (*bam_plp_auto_f)(void *data, bam1_t *b)
ctypedef int (*bam_test_f)()

ctypedef struct __bam_plp_t
ctypedef struct __bam_plp_t:
int error
ctypedef __bam_plp_t *bam_plp_t

ctypedef struct __bam_mplp_t
Expand Down
1 change: 1 addition & 0 deletions requirements-dev.txt
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
Cython>=0.29.12,<4
parameterized>=0.9.0,<1
10 changes: 8 additions & 2 deletions tests/AlignmentFilePileup_test.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
"""Benchmarking module for AlignmentFile functionality"""
import os

from parameterized import parameterized_class

import pysam
import sys
import unittest
Expand Down Expand Up @@ -352,15 +355,18 @@ def testStr(self):
s = str(pcolumn)
self.assertEqual(len(s.split("\n")), 2)


@parameterized_class(("from_file", ), [
(True, ), (False, )
])
class PileUpColumnTests(unittest.TestCase):
from_file: bool

fn = os.path.join(BAM_DATADIR, "ex2.bam")
fn_fasta = os.path.join(BAM_DATADIR, "ex1.fa")

def test_pileup_depths_are_equal(self):
samtools_result = PileupTestUtils.build_depth_with_samtoolspipe(self.fn)
pysam_result = PileupTestUtils.build_depth_with_filter_with_pysam(self.fn)
pysam_result = PileupTestUtils.build_depth_with_filter_with_pysam(self.fn, from_file=self.from_file)
self.assertEqual(pysam_result, samtools_result)

def test_pileup_query_bases_without_reference_are_equal(self):
Expand Down
8 changes: 6 additions & 2 deletions tests/PileupTestUtils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import pysam

from TestUtils import force_str
from pysam.libcalignmentfile import IteratorColumnRecords


def build_pileup_with_samtoolsshell(fn):
Expand Down Expand Up @@ -40,9 +41,12 @@ def build_depth_with_samtoolspipe(fn):
return [int(x[3]) for x in data]


def build_depth_with_filter_with_pysam(*args, **kwargs):
def build_depth_with_filter_with_pysam(*args, from_file: bool = True, **kwargs):
with pysam.AlignmentFile(*args, **kwargs) as inf:
return [x.get_num_aligned() for x in inf.pileup(stepper="samtools")]
if from_file:
return [x.get_num_aligned() for x in inf.pileup(stepper="samtools")]
else:
return [x.get_num_aligned() for x in IteratorColumnRecords(inf)]


def build_depth_with_pysam(*args, **kwargs):
Expand Down
Loading