From 2653c898b22468bfce62193e80ab43e42d8441fe Mon Sep 17 00:00:00 2001 From: Spill-Tea Date: Mon, 15 Sep 2025 12:58:28 -0700 Subject: [PATCH 1/5] feat(headers.common): Update common header to use unsigned char instead. --- src/designer_dna/headers/common.pxd | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/src/designer_dna/headers/common.pxd b/src/designer_dna/headers/common.pxd index 38b68cf..9aa7f6a 100644 --- a/src/designer_dna/headers/common.pxd +++ b/src/designer_dna/headers/common.pxd @@ -42,7 +42,7 @@ cdef extern from "Python.h": ctypedef struct StringView: - char* ptr + unsigned char* ptr Py_ssize_t size bint origin @@ -53,7 +53,7 @@ cdef inline StringView construct(bytes s, Py_ssize_t length, bint isbytes): char* buffer = PyBytes_AS_STRING(s) StringView view - view.ptr = malloc((length + 1) * sizeof(char)) + view.ptr = malloc((length + 1) * sizeof(unsigned char)) memcpy(view.ptr, buffer, length + 1) view.ptr[length] = "\0" # c string terminator view.size = length @@ -63,7 +63,7 @@ cdef inline StringView construct(bytes s, Py_ssize_t length, bint isbytes): cdef inline StringView bytes_to_view(bytes b): - """Construct StringView from python bytes object""" + """Construct StringView from python bytes object.""" cdef Py_ssize_t length = PyBytes_GET_SIZE(b) return construct(b, length, True) @@ -80,7 +80,7 @@ cdef inline StringView str_to_view(str s): cdef inline str to_str(StringView view): """Convert StringView back into a python string object, safely releasing memory.""" - cdef str obj = PyUnicode_DecodeUTF8Stateful(view.ptr, view.size, NULL, NULL) + cdef str obj = PyUnicode_DecodeUTF8Stateful( view.ptr, view.size, NULL, NULL) free(view.ptr) return obj @@ -88,7 +88,15 @@ cdef inline str to_str(StringView view): cdef inline bytes to_bytes(StringView view): """Convert StringView back into a python bytes object, safely releasing memory.""" - cdef bytes obj = PyBytes_FromStringAndSize(view.ptr, view.size) + cdef bytes obj = PyBytes_FromStringAndSize( view.ptr, view.size) free(view.ptr) return obj + + +cdef inline object to_object(StringView view): + """Convert StringView back into a python object, safely releasing memory.""" + if view.origin: + return to_bytes(view) + + return to_str(view) From b5eb6b6e5783a1219c2b555d47f52ea4b61787a4 Mon Sep 17 00:00:00 2001 From: Spill-Tea Date: Mon, 15 Sep 2025 13:44:24 -0700 Subject: [PATCH 2/5] feat(_oligos): Update cython oligos tools to use unsigned char and array representation functions. --- src/designer_dna/_oligos.pxd | 8 +- src/designer_dna/_oligos.pyi | 57 +++++++- src/designer_dna/_oligos.pyx | 248 +++++++++++++++++++++++++++++------ 3 files changed, 261 insertions(+), 52 deletions(-) diff --git a/src/designer_dna/_oligos.pxd b/src/designer_dna/_oligos.pxd index 0eff996..c19c047 100644 --- a/src/designer_dna/_oligos.pxd +++ b/src/designer_dna/_oligos.pxd @@ -30,11 +30,13 @@ from common cimport StringView cdef: - void c_reverse(char*, Py_ssize_t) + void c_reverse(unsigned char*, Py_ssize_t) void v_reverse(StringView*) - void c_complement(char*, Py_ssize_t, unsigned char*) + void c_complement(unsigned char*, Py_ssize_t, unsigned char*) void v_complement(StringView*, bint) - void c_reverse_complement(char*, Py_ssize_t, unsigned char*) + void c_reverse_complement(unsigned char*, Py_ssize_t, unsigned char*) void v_reverse_complement(StringView*, bint) + + int c_stretch(unsigned char*, Py_ssize_t) diff --git a/src/designer_dna/_oligos.pyi b/src/designer_dna/_oligos.pyi index 4710980..32e09a0 100644 --- a/src/designer_dna/_oligos.pyi +++ b/src/designer_dna/_oligos.pyi @@ -27,7 +27,19 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -"""Cythonized oligonucleotide functions.""" +from array import array +from typing import Any + +def m_reverse(sequence: array[int]) -> Any: + """Reverse a nucleotide sequence. + + Args: + sequence (uchar[]): Nucleotide sequence writeable memory view. + + Returns: + (void) Reverse a sequence in place. + + """ def reverse(sequence: str) -> str: """Reverse a nucleotide sequence. @@ -46,6 +58,18 @@ def reverse(sequence: str) -> str: """ +def m_complement(sequence: array[int], dna: bool = ...) -> Any: + """Complement a nucleotide sequence. + + Args: + sequence (uchar[]): Nucleotide sequence writeable memory view. + dna (bool): Sequence is DNA, else RNA. + + Returns: + (void) Complement nucleotide sequence in place. + + """ + def complement(sequence: str, dna: bool = ...) -> str: """Complement a nucleotide sequence. @@ -64,6 +88,18 @@ def complement(sequence: str, dna: bool = ...) -> str: """ +def m_reverse_complement(sequence: array[int], dna: bool = ...) -> Any: + """Reverse complement a nucleotide sequence. + + Args: + sequence (uchar[]): Nucleotide sequence writeable memory view. + dna (bool): Sequence is DNA, else RNA. + + Returns: + (void) Reverse complement nucleotide sequence in place. + + """ + def reverse_complement(sequence: str, dna: bool = ...) -> str: """Reverse complement a nucleotide sequence. @@ -90,7 +126,7 @@ def palindrome(sequence: str, dna: bool = ...) -> str: dna (bool): Sequence is DNA, else RNA. Returns: - (str): longest palindromic subsequence within sequence. + (str) longest palindromic subsequence within sequence. Examples: .. code-block:: python @@ -98,15 +134,24 @@ def palindrome(sequence: str, dna: bool = ...) -> str: palindrome("ATAT") == "ATAT" palindrome("GATATG") == "ATAT" palindrome("ANT") == "ANT" # Handles degenerate bases - palindrome("UGCA", False) == "UGCA" # Handles RNA sequences Notes: - * Algorithmic time complexity O(NlogN). * If a sequence contains two or more palindromic substrings of equal size, the first leftmost palindrome is prioritized. """ +def m_stretch(sequence: array[int]) -> int: + """Return the maximum length of a single letter (nucleotide) repeat in a string. + + Args: + sequence (uchar[]): Nucleotide sequence writeable memory view. + + Returns: + (int) Length of maximum run of a single letter. + + """ + def stretch(sequence: str) -> int: """Return the maximum length of a single letter (nucleotide) repeat in a string. @@ -114,7 +159,7 @@ def stretch(sequence: str) -> int: sequence (str): Nucleotide sequence string. Returns: - (int): Length of maximum run of a single letter. + (int) Length of maximum run of a single letter. Examples: .. code-block:: python @@ -136,7 +181,7 @@ def nrepeats(sequence: str, n: int) -> int: of length n characters. Raises: - ValueError: if value of n is less than 1. + ZeroDivisionError: if value of n is 0. Examples: .. code-block:: python diff --git a/src/designer_dna/_oligos.pyx b/src/designer_dna/_oligos.pyx index c9e2adb..3601eb6 100644 --- a/src/designer_dna/_oligos.pyx +++ b/src/designer_dna/_oligos.pyx @@ -43,26 +43,52 @@ cdef extern from "oligos.h": const unsigned char RNA[0x100] -cdef inline void c_reverse(char* seq, Py_ssize_t length) noexcept: +cdef inline void c_reverse(unsigned char* sequence, Py_ssize_t length) noexcept: """Reverse a C string in place. Args: - seq (char*): buffer sequence. - length (Py_ssize_t): length of seq. + sequence (uchar*): Buffer sequence. + length (Py_ssize_t): Length of sequence. + + Returns: + (void) Reverse sequence in place. """ cdef Py_ssize_t start, end, x = length // 2 for start in range(x): end = length - start - 1 - seq[start], seq[end] = seq[end], seq[start] + sequence[start], sequence[end] = sequence[end], sequence[start] cdef inline void v_reverse(StringView* view) noexcept: - """Handle reverse in place on StringView directly.""" + """Handle reverse in place on StringView directly. + + Args: + view (StringView*): Nucleotide sequence view. + + Returns: + (void) Reverse char in place. + + """ c_reverse(view[0].ptr, view[0].size) +cpdef void m_reverse(unsigned char[:] sequence): + """Reverse a nucleotide sequence. + + Args: + sequence (uchar[]): Nucleotide sequence writeable memory view. + + Returns: + (void) Reverse a sequence in place. + + """ + cdef Py_ssize_t length = sequence.shape[0] + + c_reverse(&sequence[0], length) + + cpdef str reverse(str sequence): """Reverse a nucleotide sequence. @@ -83,16 +109,19 @@ cpdef str reverse(str sequence): cdef void c_complement( - char* sequence, + unsigned char* sequence, Py_ssize_t length, unsigned char* table ) noexcept: """Complement sequence C string in place. Args: - seq (char*): buffer sequence. + seq (uchar*): buffer sequence. length (Py_ssize_t): length of seq. - table (unsigned char*): translation table. + table (uchar*): translation table. + + Returns: + (void) Complement seq in place. """ cdef: @@ -108,13 +137,42 @@ cdef void c_complement( cdef inline void v_complement(StringView* view, bint dna) noexcept: - """Handle complement in place on StringView directly.""" + """Handle complement in place on StringView directly. + + Args: + view (StringView*): Nucleotide sequence view. + + Returns: + (void) Complement char in place. + + """ if dna: c_complement(view[0].ptr, view[0].size, &DNA[0]) else: c_complement(view[0].ptr, view[0].size, &RNA[0]) +cpdef void m_complement(unsigned char[:] sequence, bint dna = True): + """Complement a nucleotide sequence. + + Args: + sequence (uchar[]): Nucleotide sequence writeable memory view. + dna (bool): Sequence is DNA, else RNA. + + Returns: + (void) Complement nucleotide sequence in place. + + """ + cdef: + Py_ssize_t length = sequence.shape[0] + unsigned char* c_string = &sequence[0] + + if dna: + c_complement(c_string, length, &DNA[0]) + else: + c_complement(c_string, length, &RNA[0]) + + cpdef str complement(str sequence, bint dna = True): """Complement a nucleotide sequence. @@ -139,20 +197,23 @@ cpdef str complement(str sequence, bint dna = True): cdef void c_reverse_complement( - char* sequence, + unsigned char* sequence, Py_ssize_t length, unsigned char* table ) noexcept: """Reverse complement sequence C string in place. Args: - sequence (char*): buffer pointer to nucleotide char sequence. - length (Py_ssize_t): length of seq. - table (unsigned char*): translation table. + sequence (uchar*): Buffer pointer to nucleotide char sequence. + length (Py_ssize_t): Length of sequence. + table (uchar*): Translation table. + + Returns: + (void) Reverse complement sequence in place. """ cdef: - char* end_ptr = sequence + (length - 1) + unsigned char* end_ptr = sequence + (length - 1) while end_ptr > sequence: sequence[0], end_ptr[0] = ( @@ -167,13 +228,42 @@ cdef void c_reverse_complement( cdef inline void v_reverse_complement(StringView* view, bint dna) noexcept: - """Handle reverse complement in place on StringView directly.""" + """Handle reverse complement in place on StringView directly. + + Args: + view (StringView*): Nucleotide sequence view. + + Returns: + (void) Reverse complement char in place. + + """ if dna: c_reverse_complement(view[0].ptr, view[0].size, &DNA[0]) else: c_reverse_complement(view[0].ptr, view[0].size, &RNA[0]) +cpdef void m_reverse_complement(unsigned char[:] sequence, bint dna = True): + """Reverse complement a nucleotide sequence. + + Args: + sequence (uchar[]): Nucleotide sequence writeable memory view. + dna (bool): Sequence is DNA, else RNA. + + Returns: + (void) Reverse complement nucleotide sequence in place. + + """ + cdef: + Py_ssize_t length = sequence.shape[0] + unsigned char* c_string = &sequence[0] + + if dna: + c_reverse_complement(c_string, length, &DNA[0]) + else: + c_reverse_complement(c_string, length, &RNA[0]) + + cpdef str reverse_complement(str sequence, bint dna = True): """Reverse complement a nucleotide sequence. @@ -198,8 +288,8 @@ cpdef str reverse_complement(str sequence, bint dna = True): cdef inline void _center( - char* seq, - char* comp, + unsigned char* seq, + unsigned char* comp, Py_ssize_t* left, Py_ssize_t* right, Py_ssize_t length, @@ -235,7 +325,7 @@ cpdef str palindrome(str sequence, bint dna = True): dna (bool): Sequence is DNA, else RNA. Returns: - (str): longest palindromic subsequence within sequence. + (str) longest palindromic subsequence within sequence. Examples: .. code-block:: python @@ -243,10 +333,8 @@ cpdef str palindrome(str sequence, bint dna = True): palindrome("ATAT") == "ATAT" palindrome("GATATG") == "ATAT" palindrome("ANT") == "ANT" # Handles degenerate bases - palindrome("UGCA", False) == "UGCA" # Handles RNA sequences Notes: - * Algorithmic time complexity O(NlogN). * If a sequence contains two or more palindromic substrings of equal size, the first leftmost palindrome is prioritized. @@ -280,6 +368,53 @@ cpdef str palindrome(str sequence, bint dna = True): return sequence[start: end] +cdef inline int c_stretch( + unsigned char* sequence, + Py_ssize_t length, +) noexcept: + """Compute nucleotide stretch of a single character repeat. + + Args: + sequence (uchar*): buffer pointer to nucleotide char sequence. + length (Py_ssize_t): length of sequence. + + Returns: + (int) length of longest single character repeat subsequence. + + """ + cdef: + unsigned char prev = sequence[0] + Py_ssize_t j + int current = 0, longest = 0 + + for j in range(1, length): + if sequence[j] == prev: + current += 1 + if current > longest: + longest = current + else: + prev = sequence[j] + current = 0 + + return longest + + +cpdef int m_stretch(unsigned char[:] sequence): + """Return the maximum length of a single letter (nucleotide) repeat in a string. + + Args: + sequence (uchar[]): Nucleotide sequence writeable memory view. + + Returns: + (int) Length of maximum run of a single letter. + + """ + cdef: + Py_ssize_t length = sequence.shape[0] + + return c_stretch(&sequence[0], length) + + cpdef int stretch(str sequence): """Return the maximum length of a single letter (nucleotide) repeat in a string. @@ -287,7 +422,7 @@ cpdef int stretch(str sequence): sequence (str): Nucleotide sequence string. Returns: - (int): Length of maximum run of a single letter. + (int) Length of maximum run of a single letter. Examples: .. code-block:: python @@ -300,7 +435,7 @@ cpdef int stretch(str sequence): StringView view = str_to_view(sequence) Py_ssize_t j int longest = 0, current = 0 - char prev = view.ptr[0] + unsigned char prev = view.ptr[0] for j in range(1, view.size): if view.ptr[j] == prev: @@ -316,8 +451,8 @@ cpdef int stretch(str sequence): cdef inline bint _compare( - char* p, - char* q, + unsigned char* p, + unsigned char* q, Py_ssize_t start, Py_ssize_t end ) noexcept: @@ -333,7 +468,12 @@ cdef inline bint _compare( return True -cdef inline void _assign(char* src, char* dest, Py_ssize_t start, Py_ssize_t end): +cdef inline void _assign( + unsigned char* src, + unsigned char* dest, + Py_ssize_t start, + Py_ssize_t end +): """Overcome slice assignment between two char variables of different sizes.""" cdef: Py_ssize_t j, count = 0 @@ -343,7 +483,7 @@ cdef inline void _assign(char* src, char* dest, Py_ssize_t start, Py_ssize_t end count += 1 -cpdef int nrepeats(str sequence, int n): +cdef int c_nrepeats(unsigned char* sequence, int length, int n) noexcept: """Calculate the maximum observed repeats of composite pattern size n characters. Args: @@ -354,38 +494,60 @@ cpdef int nrepeats(str sequence, int n): (int) The longest tandem run of nucleotides comprised of a composite pattern of length n characters. - Raises: - ValueError: if value of n is less than 1. - - Examples: - .. code-block:: python - - nrepeats("AAAA", 1) == 3 # True - nrepeats("AAAA", 2) == 1 # True - nrepeats("ACAACAACA", 3) == 2 # True - """ cdef: - StringView view = str_to_view(sequence) Py_ssize_t t = n - Py_ssize_t v = view.size // t + Py_ssize_t v = length // t Py_ssize_t j, k int current, max_val = 0 - char* previous = malloc((t + 1) * sizeof(char)) + unsigned char* previous = malloc( + (t + 1) * sizeof(unsigned char) + ) for k in range(t): - _assign(view.ptr, previous, k, t + k) + _assign(sequence, previous, k, t + k) current = 0 for j in range(1, v): - if _compare(view.ptr, previous, j * t + k, j * t + k + t): + if _compare(sequence, previous, j * t + k, j * t + k + t): current += 1 if current > max_val: max_val = current else: current = 0 - _assign(view.ptr, previous, j * t + k, j * t + k + t) + _assign(sequence, previous, j * t + k, j * t + k + t) - free(view.ptr) free(previous) return max_val + + +cpdef int nrepeats(str sequence, int n): + """Calculate the maximum observed repeats of composite pattern size n characters. + + Args: + sequence (str): Nucleotide sequence string. + n (int): Size of k-mers (composite pattern) to observe. + + Returns: + (int) The longest tandem run of nucleotides comprised of a composite pattern + of length n characters. + + Raises: + ZeroDivisionError: if value of n is 0. + + Examples: + .. code-block:: python + + nrepeats("AAAA", 1) == 3 # True + nrepeats("AAAA", 2) == 1 # True + nrepeats("ACAACAACA", 3) == 2 # True + + """ + cdef: + StringView view = str_to_view(sequence) + int max_val + + max_val = c_nrepeats(view.ptr, view.size, n) + free(view.ptr) + + return max_val From c4d60c85ec67bcf983d4de5c1ae535a8e1ffa91c Mon Sep 17 00:00:00 2001 From: Spill-Tea Date: Mon, 15 Sep 2025 13:47:22 -0700 Subject: [PATCH 3/5] feat(_oligonucleotides): Update manacher's algorithm to use unisgned char. --- src/designer_dna/_oligonucleotides.pyi | 5 +--- src/designer_dna/_oligonucleotides.pyx | 36 +++++++++++++++++--------- src/designer_dna/_oligos.pxd | 1 + 3 files changed, 26 insertions(+), 16 deletions(-) diff --git a/src/designer_dna/_oligonucleotides.pyi b/src/designer_dna/_oligonucleotides.pyi index 6bb0a89..74dd7a4 100644 --- a/src/designer_dna/_oligonucleotides.pyi +++ b/src/designer_dna/_oligonucleotides.pyi @@ -37,12 +37,9 @@ def manacher(sequence: str, dna: bool = ...) -> str: dna (bool): Sequence is DNA, else RNA. Returns: - (str): longest palindromic substring within sequence. + (str) Longest palindromic substring within a sequence. Notes: * This is a cython/c++ implementation of the O(n) Manacher's algorithm. - * This algorithm is typically slower than the O(nlogn) palindrome function for - strings up to 2^23 characters (not benchmarked beyond this limit). - * This function here is primarily here for demonstration purposes. """ diff --git a/src/designer_dna/_oligonucleotides.pyx b/src/designer_dna/_oligonucleotides.pyx index f9bf877..2df65f4 100644 --- a/src/designer_dna/_oligonucleotides.pyx +++ b/src/designer_dna/_oligonucleotides.pyx @@ -30,22 +30,36 @@ # distutils: language = c++ """Oligonucleotide functions with the help of C++.""" -from narray cimport NumericArray -from designer_dna._oligos cimport v_complement -from common cimport StringView, str_to_view -from libc.stdlib cimport free +from libc.stdlib cimport free cdef extern from "Python.h": str PyUnicode_Join(str, str) +from common cimport StringView, str_to_view +from narray cimport NumericArray + +from designer_dna._oligos cimport v_complement + cdef inline void _compute( - char* s, - char* c, + unsigned char* s, + unsigned char* c, NumericArray[int]* arr, ssize_t n, ): + """Primary computation behind manacher's algorithm. + + Args: + s (uchar*): nucleotide sequence + c (uchar*): complement of nucleotide sequence + arr (NumericArray[int]*): an array of integers + n (ssize_t): length of input sequence, s. + + Returns: + (void) relevant data saved in place to NumericArray + + """ cdef: ssize_t mirror, a, b, i, stemp, center = 0, radius = 0 int temp, zero = 0 @@ -92,13 +106,10 @@ cpdef str manacher(str sequence, bint dna = True): dna (bool): Sequence is DNA, else RNA. Returns: - (str): longest palindromic substring within sequence. + (str) Longest palindromic substring within a sequence. Notes: * This is a cython/c++ implementation of the O(n) Manacher's algorithm. - * This algorithm is typically slower than the O(nlogn) palindrome function for - strings up to 2^23 characters (not benchmarked beyond this limit). - * This function here is primarily here for demonstration purposes. """ cdef: @@ -117,11 +128,12 @@ cpdef str manacher(str sequence, bint dna = True): free(ref.ptr) free(com.ptr) - # Enumerate, capturing index (center) and value of max (radius) + # Enumerate, capturing index (center) at value of max (radius) for i in range(1, ref.size - 1): if arr[0][i] > radius: radius = arr[0][i] center = i del arr - return k[center - radius + 1: center + radius: 2] + # By nature, a palindrome is symmetrical around center (+/- radius) + return sequence[(center - radius + 1) // 2 - 1: (center + radius) // 2] diff --git a/src/designer_dna/_oligos.pxd b/src/designer_dna/_oligos.pxd index c19c047..3dec69e 100644 --- a/src/designer_dna/_oligos.pxd +++ b/src/designer_dna/_oligos.pxd @@ -40,3 +40,4 @@ cdef: void v_reverse_complement(StringView*, bint) int c_stretch(unsigned char*, Py_ssize_t) + int c_nrepeats(unsigned char*, int, int) From 3c8da576233ef339c4160fba676e8009e1fb6a86 Mon Sep 17 00:00:00 2001 From: Spill-Tea Date: Mon, 15 Sep 2025 14:30:52 -0700 Subject: [PATCH 4/5] tests(oligos): Include unit tests for memoryview equivalent of functions offered through cython _oligos module. --- tests/unit/test_oligos.py | 66 +++++++++++++++++++++++++++++++++++---- 1 file changed, 60 insertions(+), 6 deletions(-) diff --git a/tests/unit/test_oligos.py b/tests/unit/test_oligos.py index cf588eb..ccc1675 100644 --- a/tests/unit/test_oligos.py +++ b/tests/unit/test_oligos.py @@ -29,14 +29,52 @@ """Unit test oligos module.""" -from typing import Callable +from functools import wraps +from typing import Any, Callable import pytest -from designer_dna import oligos +from designer_dna import _oligos, oligos -@pytest.mark.parametrize("function", [oligos.reverse, oligos.reverse_py]) +def wrapper(f: Callable[[str], str]) -> Callable[[str], str]: + """Wrap a function which acts inplace on string.""" + + @wraps(f) + def inner(s: str, *args) -> str: + b: bytes = s.encode("utf8") + ba: bytearray = bytearray(b) + m: memoryview = memoryview(ba) + + f(m, *args) + + return ba.decode("utf8") + + return inner + + +def wrap_out(f: Callable[[str], Any]) -> Callable[[str], Any]: + """Wrap a function which uses a string, but provides a different output.""" + + @wraps(f) + def inner(s: str, *args) -> Any: + b: bytes = s.encode("utf8") + ba: bytearray = bytearray(b) + m: memoryview = memoryview(ba) + + return f(m, *args) + + return inner + + +@pytest.mark.parametrize( + "function", + [ + oligos.reverse, + wrapper(_oligos.m_reverse), + oligos.reverse_py, + ], +) @pytest.mark.parametrize( ["seq", "expected"], [ @@ -67,7 +105,11 @@ def test_reverse(seq: str, expected: str, function: Callable[[str], str]) -> Non @pytest.mark.parametrize( "function", - [oligos.complement, oligos.complement_py], + [ + oligos.complement, + wrapper(_oligos.m_complement), + oligos.complement_py, + ], ) @pytest.mark.parametrize( ["seq", "dna", "expected"], @@ -108,7 +150,12 @@ def test_complement( @pytest.mark.parametrize( - "function", [oligos.reverse_complement, oligos.reverse_complement_py] + "function", + [ + oligos.reverse_complement, + wrapper(_oligos.m_reverse_complement), + oligos.reverse_complement_py, + ], ) @pytest.mark.parametrize( ["seq", "dna", "expected"], @@ -137,7 +184,14 @@ def test_reverse_complement( assert result == expected, "Unexpected reverse complement." -@pytest.mark.parametrize("function", [oligos.stretch, oligos.stretch_py]) +@pytest.mark.parametrize( + "function", + [ + oligos.stretch, + wrap_out(_oligos.m_stretch), + oligos.stretch_py, + ], +) @pytest.mark.parametrize( ["seq", "expected"], [ From 482596034050759f04d802625d01825f596bae49 Mon Sep 17 00:00:00 2001 From: Spill-Tea Date: Mon, 15 Sep 2025 19:31:10 -0700 Subject: [PATCH 5/5] feat(oligos): Implement c extraction of palindrome function, and further organization improvements of code to reduce duplication. --- src/designer_dna/_oligos.pxd | 5 +- src/designer_dna/_oligos.pyi | 16 ++++ src/designer_dna/_oligos.pyx | 151 +++++++++++++++++++++++++---------- tests/unit/test_oligos.py | 9 ++- 4 files changed, 135 insertions(+), 46 deletions(-) diff --git a/src/designer_dna/_oligos.pxd b/src/designer_dna/_oligos.pxd index 3dec69e..82ea680 100644 --- a/src/designer_dna/_oligos.pxd +++ b/src/designer_dna/_oligos.pxd @@ -33,11 +33,12 @@ cdef: void c_reverse(unsigned char*, Py_ssize_t) void v_reverse(StringView*) - void c_complement(unsigned char*, Py_ssize_t, unsigned char*) + void c_complement(unsigned char*, Py_ssize_t, bint) void v_complement(StringView*, bint) - void c_reverse_complement(unsigned char*, Py_ssize_t, unsigned char*) + void c_reverse_complement(unsigned char*, Py_ssize_t, bint) void v_reverse_complement(StringView*, bint) + (Py_ssize_t, Py_ssize_t) c_palindrome(unsigned char*, Py_ssize_t, bint) int c_stretch(unsigned char*, Py_ssize_t) int c_nrepeats(unsigned char*, int, int) diff --git a/src/designer_dna/_oligos.pyi b/src/designer_dna/_oligos.pyi index 32e09a0..3d87e04 100644 --- a/src/designer_dna/_oligos.pyi +++ b/src/designer_dna/_oligos.pyi @@ -169,6 +169,22 @@ def stretch(sequence: str) -> int: """ +def m_nrepeats(sequence: array[int], n: int) -> int: + """Calculate the maximum observed repeats of composite pattern size n characters. + + Args: + sequence (uchar[]): Nucleotide sequence string. + n (int): Size of k-mers (composite pattern) to observe. + + Returns: + (int) The longest tandem run of nucleotides comprised of a composite pattern + of length n characters. + + Raises: + ZeroDivisionError: if value of n is 0. + + """ + def nrepeats(sequence: str, n: int) -> int: """Calculate the maximum observed repeats of composite pattern size n characters. diff --git a/src/designer_dna/_oligos.pyx b/src/designer_dna/_oligos.pyx index 3601eb6..d654895 100644 --- a/src/designer_dna/_oligos.pyx +++ b/src/designer_dna/_oligos.pyx @@ -30,6 +30,7 @@ # cython: boundscheck=False, wraparound=False, nonecheck=False """Cythonized oligonucleotide functions.""" +from libc.string cimport memcpy from libc.stdlib cimport free, malloc from common cimport ( @@ -108,7 +109,7 @@ cpdef str reverse(str sequence): return sequence[::-1] -cdef void c_complement( +cdef inline void _c_complement( unsigned char* sequence, Py_ssize_t length, unsigned char* table @@ -116,8 +117,8 @@ cdef void c_complement( """Complement sequence C string in place. Args: - seq (uchar*): buffer sequence. - length (Py_ssize_t): length of seq. + sequence (uchar*): Nucleotide sequence pointer. + length (Py_ssize_t): Length of seq. table (uchar*): translation table. Returns: @@ -136,6 +137,28 @@ cdef void c_complement( sequence[idx] = table[ sequence[idx]] +cdef void c_complement( + unsigned char* sequence, + Py_ssize_t length, + bint dna +) noexcept: + """Complement sequence C string in place. + + Args: + sequence (uchar*): Nucleotide sequence pointer. + length (Py_ssize_t): Length of seq. + dna (bint): DNA else RNA. + + Returns: + (void) Complement sequence in place. + + """ + if dna: + _c_complement(sequence, length, &DNA[0]) + else: + _c_complement(sequence, length, &RNA[0]) + + cdef inline void v_complement(StringView* view, bint dna) noexcept: """Handle complement in place on StringView directly. @@ -146,10 +169,7 @@ cdef inline void v_complement(StringView* view, bint dna) noexcept: (void) Complement char in place. """ - if dna: - c_complement(view[0].ptr, view[0].size, &DNA[0]) - else: - c_complement(view[0].ptr, view[0].size, &RNA[0]) + c_complement(view[0].ptr, view[0].size, dna) cpdef void m_complement(unsigned char[:] sequence, bint dna = True): @@ -167,10 +187,7 @@ cpdef void m_complement(unsigned char[:] sequence, bint dna = True): Py_ssize_t length = sequence.shape[0] unsigned char* c_string = &sequence[0] - if dna: - c_complement(c_string, length, &DNA[0]) - else: - c_complement(c_string, length, &RNA[0]) + c_complement(c_string, length, dna) cpdef str complement(str sequence, bint dna = True): @@ -196,7 +213,7 @@ cpdef str complement(str sequence, bint dna = True): return to_str(view) -cdef void c_reverse_complement( +cdef inline void _c_reverse_complement( unsigned char* sequence, Py_ssize_t length, unsigned char* table @@ -204,7 +221,7 @@ cdef void c_reverse_complement( """Reverse complement sequence C string in place. Args: - sequence (uchar*): Buffer pointer to nucleotide char sequence. + sequence (uchar*): Nucleotide sequence pointer. length (Py_ssize_t): Length of sequence. table (uchar*): Translation table. @@ -227,6 +244,28 @@ cdef void c_reverse_complement( sequence[0] = table[ sequence[0]] +cdef void c_reverse_complement( + unsigned char* sequence, + Py_ssize_t length, + bint dna +) noexcept: + """Reverse complement sequence C string in place. + + Args: + sequence (uchar*): Nucleotide sequence pointer. + length (Py_ssize_t): Length of seq. + dna (bint): Sequence is DNA, else RNA. + + Returns: + (void) Complement seq in place. + + """ + if dna: + _c_reverse_complement(sequence, length, &DNA[0]) + else: + _c_reverse_complement(sequence, length, &RNA[0]) + + cdef inline void v_reverse_complement(StringView* view, bint dna) noexcept: """Handle reverse complement in place on StringView directly. @@ -237,10 +276,7 @@ cdef inline void v_reverse_complement(StringView* view, bint dna) noexcept: (void) Reverse complement char in place. """ - if dna: - c_reverse_complement(view[0].ptr, view[0].size, &DNA[0]) - else: - c_reverse_complement(view[0].ptr, view[0].size, &RNA[0]) + c_reverse_complement(view[0].ptr, view[0].size, dna) cpdef void m_reverse_complement(unsigned char[:] sequence, bint dna = True): @@ -256,12 +292,8 @@ cpdef void m_reverse_complement(unsigned char[:] sequence, bint dna = True): """ cdef: Py_ssize_t length = sequence.shape[0] - unsigned char* c_string = &sequence[0] - if dna: - c_reverse_complement(c_string, length, &DNA[0]) - else: - c_reverse_complement(c_string, length, &RNA[0]) + c_reverse_complement(&sequence[0], length, dna) cpdef str reverse_complement(str sequence, bint dna = True): @@ -317,6 +349,40 @@ cdef inline void _update_bounds( end[0] = right +cdef inline (Py_ssize_t, Py_ssize_t) c_palindrome( + unsigned char* seq, + Py_ssize_t size, + bint dna +) noexcept: + cdef: + unsigned char* com = malloc((size + 1) * sizeof(unsigned char)) + Py_ssize_t i, left, right, current, length = 0, start = 0, end = 0 + + memcpy(com, seq, size) + com[size] = "\0" + c_complement(com, size, dna) + + for i in range(size - 1): + # Check even length palindromes first (more common for ATGC based sequences) + left = i + right = i + 1 + _center(seq, com, &left, &right, size) + _update_bounds(left, right, ¤t, &length, &start, &end) + + # Only check odd length palindromes in case of (center) degenerate bases + if seq[i] != com[i]: + continue + + left = i - 1 + right = i + 1 + _center(seq, com, &left, &right, size) + _update_bounds(left, right, ¤t, &length, &start, &end) + + free(com) + + return start, end + + cpdef str palindrome(str sequence, bint dna = True): """Find the longest palindromic substring within a nucleotide sequence. @@ -341,29 +407,10 @@ cpdef str palindrome(str sequence, bint dna = True): """ cdef: StringView seq = str_to_view(sequence) - StringView com = str_to_view(sequence) - Py_ssize_t i, left, right, current, length = 0, start = 0, end = 0 - - v_complement(&com, dna) - - for i in range(seq.size - 1): - # Check even length palindromes first (more common for ATGC based sequences) - left = i - right = i + 1 - _center(seq.ptr, com.ptr, &left, &right, seq.size) - _update_bounds(left, right, ¤t, &length, &start, &end) - - # Only check odd length palindromes in case of (center) degenerate bases - if seq.ptr[i] != com.ptr[i]: - continue - - left = i - 1 - right = i + 1 - _center(seq.ptr, com.ptr, &left, &right, seq.size) - _update_bounds(left, right, ¤t, &length, &start, &end) + Py_ssize_t start, end + start, end = c_palindrome(seq.ptr, seq.size, dna) free(seq.ptr) - free(com.ptr) return sequence[start: end] @@ -521,6 +568,24 @@ cdef int c_nrepeats(unsigned char* sequence, int length, int n) noexcept: return max_val +cpdef int m_nrepeats(unsigned char[:] sequence, int n): + """Calculate the maximum observed repeats of composite pattern size n characters. + + Args: + sequence (uchar[]): Nucleotide sequence string. + n (int): Size of k-mers (composite pattern) to observe. + + Returns: + (int) The longest tandem run of nucleotides comprised of a composite pattern + of length n characters. + + Raises: + ZeroDivisionError: if value of n is 0. + + """ + return c_nrepeats(&sequence[0], sequence.shape[0], n) + + cpdef int nrepeats(str sequence, int n): """Calculate the maximum observed repeats of composite pattern size n characters. diff --git a/tests/unit/test_oligos.py b/tests/unit/test_oligos.py index ccc1675..6da7734 100644 --- a/tests/unit/test_oligos.py +++ b/tests/unit/test_oligos.py @@ -210,7 +210,14 @@ def test_stretch(seq, expected: int, function: Callable[[str], int]) -> None: assert result == expected, f"Unexpected stretch calculation: {result}" -@pytest.mark.parametrize("function", [oligos.nrepeats, oligos.nrepeats_py]) +@pytest.mark.parametrize( + "function", + [ + oligos.nrepeats, + wrap_out(_oligos.m_nrepeats), + oligos.nrepeats_py, + ], +) @pytest.mark.parametrize( ["seq", "n", "expected"], [