cschu · cschu · Dec 15, 2024 · Dec 15, 2024 · Dec 15, 2024 · Dec 16, 2024
diff --git a/Dockerfile b/Dockerfile
@@ -1,7 +1,7 @@
 FROM ubuntu:22.04
 
 LABEL maintainer="cschu1981@gmail.com"
-LABEL version="2.18.0"
+LABEL version="2.19.0"
 LABEL description="gffquant - functional profiling of metagenomic/transcriptomic wgs samples"
 
 

diff --git a/gffquant/__init__.py b/gffquant/__init__.py
@@ -5,7 +5,7 @@
 from enum import Enum, auto, unique
 
 
-__version__ = "2.18.0"
+__version__ = "2.19.0"
 __tool__ = "gffquant"
 
 

diff --git a/gffquant/annotation/count_annotator.py b/gffquant/annotation/count_annotator.py
@@ -8,6 +8,7 @@
 
 import numpy as np
 
+from ..counters.count_manager import CountManager
 
 logger = logging.getLogger(__name__)
 
@@ -198,17 +199,18 @@ def __init__(self, strand_specific, report_scaling_factors=True):
         CountAnnotator.__init__(self, strand_specific, report_scaling_factors=report_scaling_factors)
 
     # pylint: disable=R0914,W0613
-    def annotate(self, refmgr, db, count_manager, gene_group_db=False):
+    def annotate(self, refmgr, db, count_manager: CountManager, gene_group_db=False):
         """
         Annotate a set of region counts via db-lookup.
         input:
         - bam: bamr.BamFile to use as lookup table for reference names
         - db: GffDatabaseManager holding functional annotation database
         - count_manager: count_data
         """
-        for rid in set(count_manager.uniq_regioncounts).union(
-            count_manager.ambig_regioncounts
-        ):
+        # for rid in set(count_manager.uniq_regioncounts).union(
+        #     count_manager.ambig_regioncounts
+        # ):
+        for rid in count_manager.get_all_regions(region_counts=True):
             ref = refmgr.get(rid[0] if isinstance(rid, tuple) else rid)[0]
 
             for region in count_manager.get_regions(rid):
@@ -273,7 +275,7 @@ class GeneCountAnnotator(CountAnnotator):
     def __init__(self, strand_specific, report_scaling_factors=True):
         CountAnnotator.__init__(self, strand_specific, report_scaling_factors=report_scaling_factors)
 
-    def annotate(self, refmgr, db, count_manager, gene_group_db=False):
+    def annotate(self, refmgr, db, count_manager: CountManager, gene_group_db=False):
         """
         Annotate a set of gene counts via db-iteration.
         input:
@@ -286,9 +288,7 @@ def annotate(self, refmgr, db, count_manager, gene_group_db=False):
             if self.strand_specific else None
         )
 
-        for rid in set(count_manager.uniq_seqcounts).union(
-            count_manager.ambig_seqcounts
-        ):
+        for rid in count_manager.get_all_regions():
             ref, region_length = refmgr.get(rid[0] if isinstance(rid, tuple) else rid)
 
             uniq_counts, ambig_counts = count_manager.get_counts(

diff --git a/gffquant/counters/__init__.py b/gffquant/counters/__init__.py
@@ -3,7 +3,7 @@
 
 """module docstring"""
 
-from .alignment_counter import AlignmentCounter
+from .alignment_counter2 import AlignmentCounter
 from .region_counter import RegionCounter
 from .seq_counter import UniqueSeqCounter, AmbiguousSeqCounter
 from .count_manager import CountManager
diff --git a/gffquant/counters/alignment_counter2.py b/gffquant/counters/alignment_counter2.py
@@ -0,0 +1,91 @@
+from collections import Counter
+
+import numpy as np
+
+from .. import DistributionMode
+
+
+class AlignmentCounter:
+    """Per-reference alignment counter backed by a growable numpy array.
+
+    `index` maps a reference key to a row of `counts`. Column 0 of a row
+    holds the unique counts and column 1 the ambiguous counts for that key.
+    """
+
+    COUNT_HEADER_ELEMENTS = ("raw", "lnorm", "scaled")
+    # Number of rows allocated up front and appended whenever the table is full.
+    INITIAL_SIZE = 1000
+
+    @staticmethod
+    def normalise_counts(counts, feature_len, scaling_factor):
+        """Returns raw, length-normalised, and scaled feature counts.
+
+        Raises ValueError if feature_len is not positive or if
+        scaling_factor is negative.
+        """
+        if feature_len <= 0:
+            raise ValueError("Feature length must be positive")
+        if scaling_factor < 0:
+            raise ValueError("Scaling factor cannot be negative")
+        normalised = counts / feature_len
+        scaled = normalised * scaling_factor
+        return counts, normalised, scaled
+
+    def __init__(self, distribution_mode=DistributionMode.ONE_OVER_N, strand_specific=False):
+        self.distribution_mode = distribution_mode
+        self.strand_specific = strand_specific
+        self.unannotated_reads = 0
+
+        # key -> row number in self.counts
+        self.index = {}
+        # one row per key, columns: [unique, ambiguous]
+        self.counts = np.zeros(
+            (AlignmentCounter.INITIAL_SIZE, 2),
+        )
+
+    def get_increment(self, n_aln, increment):
+        """Return increment / n_aln in ONE_OVER_N mode, otherwise increment unchanged."""
+        # 1overN = lavern. Maya <3
+        return (increment / n_aln) if self.distribution_mode == DistributionMode.ONE_OVER_N else increment
+
+    def dump(self, prefix, refmgr):
+        # Placeholder: no dump is written yet.
+        ...
+
+    def get(self, key, default_val):
+        """Return a Counter mapping key to its count row; an empty Counter if key is unknown."""
+        # NOTE(review): default_val is accepted for dict-like call
+        # compatibility but is currently ignored.
+        key_index = self.index.get(key)
+        if key_index is None:
+            return Counter()
+        return Counter({key: self.counts[key_index]})
+
+    def setdefault(self, key, default_val):
+        # Placeholder: not implemented yet.
+        ...
+
+    def has_ambig_counts(self):
+        """Return True if the ambiguous column sums to a non-zero value."""
+        return bool(self.counts[:, 1].sum() != 0)
+
+    def __iter__(self):
+        """Iterate over all keys that have a row in the count table."""
+        yield from self.index.keys()
+
+    def __getitem__(self, key):
+        """Return the [unique, ambiguous] row for key, or a zero row if key is unknown."""
+        key_index = self.index.get(key)
+        if key_index is None:
+            # Same shape as a stored row, so callers can index [0] / [1] either way.
+            return np.zeros(self.counts.shape[1])
+        return self.counts[key_index]
+
+    def __setitem__(self, key, value):
+        """Overwrite the count row of an existing key; raise KeyError for unknown keys."""
+        key_index = self.index.get(key)
+        if key_index is not None:
+            self.counts[key_index] = value
+        else:
+            raise KeyError(f"{key=} not found.")
+
+    def update_counts(self, count_stream, increment=1, ambiguous_counts=False):
+        """Add the alignments from count_stream to the count table.
+
+        count_stream yields (hits, aln_count) pairs; only the first hit of
+        each pair is counted. The increment is added to the ambiguous column
+        when ambiguous_counts is true, otherwise to the unique column.
+        Returns the sum of all increments that were added.
+        """
+        column = int(ambiguous_counts)
+        contributed_counts = 0
+        for hits, aln_count in count_stream:
+            hit = hits[0]
+            # A single alignment always contributes the full increment.
+            inc = increment if aln_count == 1 else self.get_increment(aln_count, increment)
+            # Strand-specific counting keeps one row per (reference, strand);
+            # otherwise both strands share the row of the reference.
+            key = (hit.rid, hit.rev_strand) if self.strand_specific else hit.rid
+
+            key_index = self.index.get(key)
+            if key_index is None:
+                # Grow the table by another INITIAL_SIZE zero rows once it is full.
+                if len(self.index) == self.counts.shape[0]:
+                    self.counts = np.pad(
+                        self.counts,
+                        ((0, AlignmentCounter.INITIAL_SIZE), (0, 0),),
+                    )
+                key_index = self.index[key] = len(self.index)
+            self.counts[key_index, column] += inc
+            contributed_counts += inc
+
+        return contributed_counts
+
diff --git a/gffquant/counters/count_manager.py b/gffquant/counters/count_manager.py
@@ -3,7 +3,7 @@
 from collections import Counter
 
 from .. import DistributionMode
-from .alignment_counter import AlignmentCounter
+from .alignment_counter2 import AlignmentCounter
 from .region_counter import RegionCounter
 
 
@@ -50,32 +50,47 @@ def __init__(
         self.increments = [1.0, 1.0]
         self.increments_auto_detect = [1.0, self.paired_end_count / 2.0]
 
-        self.uniq_seqcounts, self.ambig_seqcounts = None, None
-        self.uniq_regioncounts, self.ambig_regioncounts = None, None
+        # self.uniq_seqcounts, self.ambig_seqcounts = None, None
+        # self.uniq_regioncounts, self.ambig_regioncounts = None, None
+        self.seqcounts, self.regioncounts = None, None
-        # self.uniq_seqcounts, self.ambig_seqcounts = None, None
-        # self.uniq_regioncounts, self.ambig_regioncounts = None, None
-        self.seqcounts, self.regioncounts = None, None
+        """
+        Initialize counters that handle both unique and ambiguous counts.
+        This consolidation improves maintainability and reduces code duplication.
+        """
+        self.seqcounts, self.regioncounts = None, None
-        # self.uniq_seqcounts, self.ambig_seqcounts = None, None
-        # self.uniq_regioncounts, self.ambig_regioncounts = None, None
-        self.seqcounts, self.regioncounts = None, None
+        """
+        Initialize counters that handle both unique and ambiguous counts.
+        This consolidation improves maintainability and reduces code duplication.
+        """
+        self.seqcounts, self.regioncounts = None, None
 
         if region_counts:
-            self.uniq_regioncounts = RegionCounter(strand_specific=strand_specific)
-            self.ambig_regioncounts = RegionCounter(
+            # self.uniq_regioncounts = RegionCounter(strand_specific=strand_specific)
+            # self.ambig_regioncounts = RegionCounter(
+            #     strand_specific=strand_specific,
+            #     distribution_mode=distribution_mode,
+            # )
+            self.regioncounts = RegionCounter(
                 strand_specific=strand_specific,
                 distribution_mode=distribution_mode,
             )
 
         else:
-            self.uniq_seqcounts = AlignmentCounter(strand_specific=strand_specific)
-            self.ambig_seqcounts = AlignmentCounter(
+            # self.uniq_seqcounts = AlignmentCounter(strand_specific=strand_specific)
+            # self.ambig_seqcounts = AlignmentCounter(
+            #     strand_specific=strand_specific,
+            #     distribution_mode=distribution_mode
+            # )
+            self.seqcounts = AlignmentCounter(
                 strand_specific=strand_specific,
-                distribution_mode=distribution_mode
+                distribution_mode=distribution_mode,
             )
 
     def has_ambig_counts(self):
-        return self.ambig_regioncounts or self.ambig_seqcounts
+        """Report whether any configured counter has recorded ambiguous counts."""
+        return any(
+            counter and counter.has_ambig_counts()
+            for counter in (self.seqcounts, self.regioncounts)
+        )
 
     def update_counts(self, count_stream, ambiguous_counts=False, pair=False, pe_library=None):
-        seq_counter, region_counter = (
-            (self.uniq_seqcounts, self.uniq_regioncounts)
-            if not ambiguous_counts
-            else (self.ambig_seqcounts, self.ambig_regioncounts)
-        )
+        # seq_counter, region_counter = (
+        #     (self.uniq_seqcounts, self.uniq_regioncounts)
+        #     if not ambiguous_counts
+        #     else (self.ambig_seqcounts, self.ambig_regioncounts)
+        # )
 
         if pe_library is not None:
             # this is the case when the alignment has a read group tag
@@ -91,40 +106,51 @@ def update_counts(self, count_stream, ambiguous_counts=False, pair=False, pe_lib
             increment = self.increments[pair]
 
         contributed_counts = 0
-        if seq_counter is not None:
-            contributed_counts = seq_counter.update_counts(count_stream, increment=increment)
-        elif region_counter is not None:
-            contributed_counts = region_counter.update_counts(count_stream, increment=increment)
+        if self.seqcounts is not None:
+            contributed_counts = self.seqcounts.update_counts(count_stream, increment=increment, ambiguous_counts=ambiguous_counts,)
+        elif self.regioncounts is not None:
+            contributed_counts = self.regioncounts.update_counts(count_stream, increment=increment, ambiguous_counts=ambiguous_counts,)
+        # if seq_counter is not None:
+            # contributed_counts = seq_counter.update_counts(count_stream, increment=increment)
+        # elif region_counter is not None:
+            # contributed_counts = region_counter.update_counts(count_stream, increment=increment)
 
         return contributed_counts
 
     def dump_raw_counters(self, prefix, refmgr):
-        if self.uniq_seqcounts is not None:
-            self.uniq_seqcounts.dump(prefix, refmgr)
-        if self.ambig_seqcounts is not None:
-            self.ambig_seqcounts.dump(prefix, refmgr)
-        if self.uniq_regioncounts is not None:
-            self.uniq_regioncounts.dump(prefix, refmgr)
-        if self.ambig_regioncounts is not None:
-            self.ambig_regioncounts.dump(prefix, refmgr)
+        # NOTE: currently a no-op. The per-counter dump calls below are disabled,
+        # so prefix and refmgr are unused.
+        # if self.uniq_seqcounts is not None:
+        #     self.uniq_seqcounts.dump(prefix, refmgr)
+        # if self.ambig_seqcounts is not None:
+        #     self.ambig_seqcounts.dump(prefix, refmgr)
+        # if self.uniq_regioncounts is not None:
+        #     self.uniq_regioncounts.dump(prefix, refmgr)
+        # if self.ambig_regioncounts is not None:
+        #     self.ambig_regioncounts.dump(prefix, refmgr)
+        ...
 
     def get_unannotated_reads(self):
+        """Return the number of unannotated reads summed over the configured counters."""
         unannotated_reads = 0
 
-        if self.uniq_regioncounts is not None:
-            unannotated_reads += self.uniq_regioncounts.unannotated_reads
-        if self.ambig_regioncounts is not None:
-            unannotated_reads += self.ambig_regioncounts.unannotated_reads
-        if self.uniq_seqcounts is not None:
-            unannotated_reads += self.uniq_seqcounts.unannotated_reads
-        if self.ambig_seqcounts is not None:
-            unannotated_reads += self.ambig_seqcounts.unannotated_reads
+        # Each consolidated counter keeps one tally; add the tally, not the counter object.
+        if self.regioncounts is not None:
+            unannotated_reads += self.regioncounts.unannotated_reads
+        if self.seqcounts is not None:
+            unannotated_reads += self.seqcounts.unannotated_reads
 
         return unannotated_reads
 
     def get_counts(self, seqid, region_counts=False, strand_specific=False):
         if region_counts:
+            raise NotImplementedError()
             rid, seqid = seqid[0], seqid[1:]
+
             uniq_counter = self.uniq_regioncounts.get(rid, Counter())
             ambig_counter = self.ambig_regioncounts.get(rid, Counter())
 
@@ -135,9 +161,11 @@ def get_counts(self, seqid, region_counts=False, strand_specific=False):
                 return [uniq_counter[seqid]], [ambig_counter[seqid]]
 
         else:
-            uniq_counter, ambig_counter = self.uniq_seqcounts, self.ambig_seqcounts
+            # uniq_counter, ambig_counter = self.uniq_seqcounts, self.ambig_seqcounts
+
 
             if strand_specific:
+                raise NotImplementedError()
                 uniq_counts, ambig_counts = [0.0, 0.0], [0.0, 0.0]
                 uniq_counts[seqid[1]] = uniq_counter[seqid]
                 ambig_counts[seqid[1]] = ambig_counter[seqid]
@@ -152,11 +180,29 @@ def get_counts(self, seqid, region_counts=False, strand_specific=False):
                 #     ambig_counter[(rid, CountManager.MINUS_STRAND)],
                 # ]
             else:
-                uniq_counts, ambig_counts = [uniq_counter[seqid]], [ambig_counter[seqid]]
+                # uniq_counts, ambig_counts = [uniq_counter[seqid]], [ambig_counter[seqid]]
+                uniq_counts, ambig_counts = [self.seqcounts[seqid][0]], [self.seqcounts[seqid][1]]
 
             return uniq_counts, ambig_counts
 
     def get_regions(self, rid):
-        return set(self.uniq_regioncounts.get(rid, set())).union(
-            self.ambig_regioncounts.get(rid, set())
+        """Return the set built from the region counter's entry for rid.
+
+        Returns an empty set when no region counter is configured.
+        """
+        # The separate uniq/ambig region counters no longer exist; query the
+        # consolidated counter instead.
+        if self.regioncounts is None:
+            return set()
+        return set(
+            self.regioncounts.get(rid, Counter())
         )
+
+    def get_all_regions(self, region_counts=False):
+        """Yield every key of the selected counter.
+
+        Iterates the region counter when region_counts is true, otherwise
+        the sequence counter. Yields nothing if that counter is not configured.
+        """
+        counts = self.regioncounts if region_counts else self.seqcounts
+
+        if counts is None:
+            return
+        yield from counts
diff --git a/gffquant/counters/region_counter.py b/gffquant/counters/region_counter.py
@@ -5,7 +5,7 @@
 from collections import Counter
 
 from .. import DistributionMode
-from .alignment_counter import AlignmentCounter
+from .alignment_counter2 import AlignmentCounter
 
 
 class RegionCounter(AlignmentCounter):

diff --git a/gffquant/counters/seq_counter.py b/gffquant/counters/seq_counter.py
@@ -3,7 +3,7 @@
 """ module docstring """
 
 from .. import DistributionMode
-from .alignment_counter import AlignmentCounter
+from .alignment_counter2 import AlignmentCounter
 
 
 class UniqueSeqCounter(AlignmentCounter):