From 61b93da99defeb80d4b6f155d671fc4662b73d9c Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Sun, 15 Dec 2024 23:37:09 +0100
Subject: [PATCH 001/128] initial numpy-backed AlignmentCounter

---
 gffquant/annotation/count_annotator.py  | 19 +++---
 gffquant/counters/__init__.py           |  2 +-
 gffquant/counters/alignment_counter2.py | 81 +++++++++++++++++++++++++
 gffquant/counters/count_manager.py      | 17 +++++-
 gffquant/counters/region_counter.py     |  2 +-
 gffquant/counters/seq_counter.py        |  2 +-
 6 files changed, 109 insertions(+), 14 deletions(-)
 create mode 100644 gffquant/counters/alignment_counter2.py

diff --git a/gffquant/annotation/count_annotator.py b/gffquant/annotation/count_annotator.py
index 760250dc..4d91f8cd 100644
--- a/gffquant/annotation/count_annotator.py
+++ b/gffquant/annotation/count_annotator.py
@@ -8,6 +8,7 @@
 
 import numpy as np
 
+from ..counters.count_manager import CountManager
 
 logger = logging.getLogger(__name__)
 
@@ -198,7 +199,7 @@ def __init__(self, strand_specific, report_scaling_factors=True):
         CountAnnotator.__init__(self, strand_specific, report_scaling_factors=report_scaling_factors)
 
     # pylint: disable=R0914,W0613
-    def annotate(self, refmgr, db, count_manager, gene_group_db=False):
+    def annotate(self, refmgr, db, count_manager: CountManager, gene_group_db=False):
         """
         Annotate a set of region counts via db-lookup.
         input:
@@ -206,9 +207,10 @@ def annotate(self, refmgr, db, count_manager, gene_group_db=False):
         - db: GffDatabaseManager holding functional annotation database
         - count_manager: count_data
         """
-        for rid in set(count_manager.uniq_regioncounts).union(
-            count_manager.ambig_regioncounts
-        ):
+        # for rid in set(count_manager.uniq_regioncounts).union(
+        #     count_manager.ambig_regioncounts
+        # ):
+        for rid in count_manager.get_all_regions(region_counts=True):
             ref = refmgr.get(rid[0] if isinstance(rid, tuple) else rid)[0]
 
             for region in count_manager.get_regions(rid):
@@ -273,7 +275,7 @@ class GeneCountAnnotator(CountAnnotator):
     def __init__(self, strand_specific, report_scaling_factors=True):
         CountAnnotator.__init__(self, strand_specific, report_scaling_factors=report_scaling_factors)
 
-    def annotate(self, refmgr, db, count_manager, gene_group_db=False):
+    def annotate(self, refmgr, db, count_manager: CountManager, gene_group_db=False):
         """
         Annotate a set of gene counts via db-iteration.
         input:
@@ -286,9 +288,10 @@ def annotate(self, refmgr, db, count_manager, gene_group_db=False):
             if self.strand_specific else None
         )
 
-        for rid in set(count_manager.uniq_seqcounts).union(
-            count_manager.ambig_seqcounts
-        ):
+        # for rid in set(count_manager.uniq_seqcounts).union(
+        #     count_manager.ambig_seqcounts
+        # ):
+        for rid in count_manager.get_all_regions():
             ref, region_length = refmgr.get(rid[0] if isinstance(rid, tuple) else rid)
 
             uniq_counts, ambig_counts = count_manager.get_counts(
diff --git a/gffquant/counters/__init__.py b/gffquant/counters/__init__.py
index 7641c957..774cd03c 100644
--- a/gffquant/counters/__init__.py
+++ b/gffquant/counters/__init__.py
@@ -3,7 +3,7 @@
 
 """module docstring"""
 
-from .alignment_counter import AlignmentCounter
+from .alignment_counter2 import AlignmentCounter
 from .region_counter import RegionCounter
 from .seq_counter import UniqueSeqCounter, AmbiguousSeqCounter
 from .count_manager import CountManager
diff --git a/gffquant/counters/alignment_counter2.py b/gffquant/counters/alignment_counter2.py
new file mode 100644
index 00000000..e6f82111
--- /dev/null
+++ b/gffquant/counters/alignment_counter2.py
@@ -0,0 +1,81 @@
+from collections import Counter
+
+import numpy as np
+
+from .. import DistributionMode
+
+
+class AlignmentCounter:
+    COUNT_HEADER_ELEMENTS = ("raw", "lnorm", "scaled")
+    INITIAL_SIZE = 1000
+
+    @staticmethod
+    def normalise_counts(counts, feature_len, scaling_factor):
+        """Returns raw, length-normalised, and scaled feature counts."""
+        normalised = counts / feature_len
+        scaled = normalised * scaling_factor
+        return counts, normalised, scaled
+
+    def get_increment(self, n_aln, increment):
+        # 1overN = lavern. Maya <3
+        return (increment / n_aln) if self.distribution_mode == DistributionMode.ONE_OVER_N else increment
+
+    def __init__(self, distribution_mode=DistributionMode.ONE_OVER_N, strand_specific=False):
+        self.distribution_mode = distribution_mode
+        self.strand_specific = strand_specific
+        self.unannotated_reads = 0
+
+        self.index = {}
+        self.counts = np.zeros(
+            (AlignmentCounter.INITIAL_SIZE, 1),
+        )
+    def dump(self, prefix, refmgr):
+        ...
+    def get(self, key, default_val):
+        key_index = self.index.get(key)
+        if key_index is None:
+            return Counter()
+        return Counter({key: self.counts[key_index]})
+    
+    def setdefault(self, key, default_val):
+        ...
+    
+    def __iter__(self):
+        yield from self.index.keys()
+    def __getitem__(self, key):
+        return self.counts.get(self.index.get(key), 0.0)
+    def __setitem__(self, key, value):
+        key_index = self.index.get(key)
+        if key_index is not None:
+            self.counts[key_index] = value
+        raise KeyError(f"{key=} not found.")
+        
+    def update_counts(self, count_stream, increment=1):
+        contributed_counts = 0
+        for hits, aln_count in count_stream:
+            hit = hits[0]
+            inc = (
+                (
+                    self.get_increment(aln_count, increment),
+                    increment,
+                )
+            )[aln_count == 1]
+            key = (
+                (
+                    (hit.rid, hit.rev_strand),
+                    hit.rid
+                )
+            )[self.strand_specific]
+
+            key_index = self.index.get(key)
+            if key_index is None:
+                nrows = self.counts.shape[0]
+                if len(self.index) == nrows:
+                    self.counts = np.pad(
+                        self.counts,
+                        ((0, AlignmentCounter.INITIAL_SIZE), (0, 0),),
+                    )
+                key_index = self.index.setdefault(key, len(self.index))
+                self.counts[key_index] += inc
+                contributed_counts += inc
+
diff --git a/gffquant/counters/count_manager.py b/gffquant/counters/count_manager.py
index 40ae72a6..0eabc67e 100644
--- a/gffquant/counters/count_manager.py
+++ b/gffquant/counters/count_manager.py
@@ -3,7 +3,7 @@
 from collections import Counter
 
 from .. import DistributionMode
-from .alignment_counter import AlignmentCounter
+from .alignment_counter2 import AlignmentCounter
 from .region_counter import RegionCounter
 
 
@@ -157,6 +157,17 @@ def get_counts(self, seqid, region_counts=False, strand_specific=False):
             return uniq_counts, ambig_counts
 
     def get_regions(self, rid):
-        return set(self.uniq_regioncounts.get(rid, set())).union(
-            self.ambig_regioncounts.get(rid, set())
+        # return set(self.uniq_regioncounts.get(rid, set())).union(
+        #     self.ambig_regioncounts.get(rid, set())
+        # )
+        return set(self.uniq_regioncounts.get(rid, Counter())).union(
+            self.ambig_regioncounts.get(rid, Counter())
         )
+    
+    def get_all_regions(self, region_counts=False):
+        uniq_counts, ambig_counts = (
+            (self.uniq_seqcounts, self.ambig_seqcounts,),
+            (self.uniq_regioncounts, self.ambig_regioncounts,),
+        )[region_counts]
+        yield from set(uniq_counts).union(ambig_counts)
+
diff --git a/gffquant/counters/region_counter.py b/gffquant/counters/region_counter.py
index 7a617056..a7a74756 100644
--- a/gffquant/counters/region_counter.py
+++ b/gffquant/counters/region_counter.py
@@ -5,7 +5,7 @@
 from collections import Counter
 
 from .. import DistributionMode
-from .alignment_counter import AlignmentCounter
+from .alignment_counter2 import AlignmentCounter
 
 
 class RegionCounter(AlignmentCounter):
diff --git a/gffquant/counters/seq_counter.py b/gffquant/counters/seq_counter.py
index bc71c7fb..91e28628 100644
--- a/gffquant/counters/seq_counter.py
+++ b/gffquant/counters/seq_counter.py
@@ -3,7 +3,7 @@
 """ module docstring """
 
 from .. import DistributionMode
-from .alignment_counter import AlignmentCounter
+from .alignment_counter2 import AlignmentCounter
 
 
 class UniqueSeqCounter(AlignmentCounter):

From 6c80f0aacc7c92ff27f6f0753732fbbbe0fed5ab Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Sun, 15 Dec 2024 23:41:02 +0100
Subject: [PATCH 002/128] version: bump to 2.19.0

---
 Dockerfile           | 2 +-
 gffquant/__init__.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index ac8b542d..aa88185f 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,7 +1,7 @@
 FROM ubuntu:22.04
 
 LABEL maintainer="cschu1981@gmail.com"
-LABEL version="2.18.0"
+LABEL version="2.19.0"
 LABEL description="gffquant - functional profiling of metagenomic/transcriptomic wgs samples"
 
 
diff --git a/gffquant/__init__.py b/gffquant/__init__.py
index 128d76bf..31f4177f 100644
--- a/gffquant/__init__.py
+++ b/gffquant/__init__.py
@@ -5,7 +5,7 @@
 from enum import Enum, auto, unique
 
 
-__version__ = "2.18.0"
+__version__ = "2.19.0"
 __tool__ = "gffquant"
 
 

From 2be12b462dbc14edeaf87a5a90673f47c84457fa Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Sun, 15 Dec 2024 23:53:30 +0100
Subject: [PATCH 003/128] fix: getitem implementation

---
 gffquant/counters/alignment_counter2.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/gffquant/counters/alignment_counter2.py b/gffquant/counters/alignment_counter2.py
index e6f82111..14d77d96 100644
--- a/gffquant/counters/alignment_counter2.py
+++ b/gffquant/counters/alignment_counter2.py
@@ -43,7 +43,10 @@ def setdefault(self, key, default_val):
     def __iter__(self):
         yield from self.index.keys()
     def __getitem__(self, key):
-        return self.counts.get(self.index.get(key), 0.0)
+        key_index = self.index.get(key)
+        if key_index is None:
+            return 0.0
+        return self.counts[self.index.get(key)]
     def __setitem__(self, key, value):
         key_index = self.index.get(key)
         if key_index is not None:

From e143eb34b49511b16e08c759f3d2031fadfbd153 Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Mon, 16 Dec 2024 21:08:21 +0100
Subject: [PATCH 004/128] fix?: missing counts for already-indexed keys

---
 gffquant/counters/alignment_counter2.py | 9 ++++++---
 gffquant/counters/count_manager.py      | 4 ++++
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/gffquant/counters/alignment_counter2.py b/gffquant/counters/alignment_counter2.py
index 14d77d96..4723a410 100644
--- a/gffquant/counters/alignment_counter2.py
+++ b/gffquant/counters/alignment_counter2.py
@@ -78,7 +78,10 @@ def update_counts(self, count_stream, increment=1):
                         self.counts,
                         ((0, AlignmentCounter.INITIAL_SIZE), (0, 0),),
                     )
-                key_index = self.index.setdefault(key, len(self.index))
-                self.counts[key_index] += inc
-                contributed_counts += inc
+                # key_index = self.index.setdefault(key, len(self.index))
+                key_index = self.index[key] = len(self.index)
+            self.counts[key_index] += inc
+            contributed_counts += inc
+
+        return contributed_counts
 
diff --git a/gffquant/counters/count_manager.py b/gffquant/counters/count_manager.py
index 0eabc67e..c7d4718e 100644
--- a/gffquant/counters/count_manager.py
+++ b/gffquant/counters/count_manager.py
@@ -66,6 +66,10 @@ def __init__(
                 strand_specific=strand_specific,
                 distribution_mode=distribution_mode
             )
+            # self.seqcounts = AlignmentCounter(
+            #     strand_specific=strand_specific,
+            #     distribution_mode=distribution_mode,
+            # )
 
     def has_ambig_counts(self):
         return self.ambig_regioncounts or self.ambig_seqcounts

From 32ce2bd090998d175546bc4c35511ccfc8b0b9f1 Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Mon, 16 Dec 2024 21:18:09 +0100
Subject: [PATCH 005/128] fix: fixing AlignmentCounter __getitem__/__setitem__
 methods

---
 gffquant/counters/alignment_counter2.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/gffquant/counters/alignment_counter2.py b/gffquant/counters/alignment_counter2.py
index 4723a410..c5e61b36 100644
--- a/gffquant/counters/alignment_counter2.py
+++ b/gffquant/counters/alignment_counter2.py
@@ -46,12 +46,13 @@ def __getitem__(self, key):
         key_index = self.index.get(key)
         if key_index is None:
             return 0.0
-        return self.counts[self.index.get(key)]
+        return self.counts[key_index]
     def __setitem__(self, key, value):
         key_index = self.index.get(key)
         if key_index is not None:
             self.counts[key_index] = value
-        raise KeyError(f"{key=} not found.")
+        else:
+            raise KeyError(f"{key=} not found.")
         
     def update_counts(self, count_stream, increment=1):
         contributed_counts = 0

From ed0eaab8eaaef2cc21c21a02eb9c47a56fd77e56 Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Mon, 16 Dec 2024 23:33:56 +0100
Subject: [PATCH 006/128] merge uniq/ambig seqcounters

---
 gffquant/annotation/count_annotator.py  |   3 -
 gffquant/counters/alignment_counter2.py |   9 +-
 gffquant/counters/count_manager.py      | 115 +++++++++++++++---------
 3 files changed, 78 insertions(+), 49 deletions(-)

diff --git a/gffquant/annotation/count_annotator.py b/gffquant/annotation/count_annotator.py
index 4d91f8cd..70bc6c86 100644
--- a/gffquant/annotation/count_annotator.py
+++ b/gffquant/annotation/count_annotator.py
@@ -288,9 +288,6 @@ def annotate(self, refmgr, db, count_manager: CountManager, gene_group_db=False)
             if self.strand_specific else None
         )
 
-        # for rid in set(count_manager.uniq_seqcounts).union(
-        #     count_manager.ambig_seqcounts
-        # ):
         for rid in count_manager.get_all_regions():
             ref, region_length = refmgr.get(rid[0] if isinstance(rid, tuple) else rid)
 
diff --git a/gffquant/counters/alignment_counter2.py b/gffquant/counters/alignment_counter2.py
index c5e61b36..6b54a845 100644
--- a/gffquant/counters/alignment_counter2.py
+++ b/gffquant/counters/alignment_counter2.py
@@ -27,7 +27,7 @@ def __init__(self, distribution_mode=DistributionMode.ONE_OVER_N, strand_specifi
 
         self.index = {}
         self.counts = np.zeros(
-            (AlignmentCounter.INITIAL_SIZE, 1),
+            (AlignmentCounter.INITIAL_SIZE, 2),
         )
     def dump(self, prefix, refmgr):
         ...
@@ -39,6 +39,9 @@ def get(self, key, default_val):
     
     def setdefault(self, key, default_val):
         ...
+
+    def has_ambig_counts(self):
+        ...
     
     def __iter__(self):
         yield from self.index.keys()
@@ -54,7 +57,7 @@ def __setitem__(self, key, value):
         else:
             raise KeyError(f"{key=} not found.")
         
-    def update_counts(self, count_stream, increment=1):
+    def update_counts(self, count_stream, increment=1, ambiguous_counts=False):
         contributed_counts = 0
         for hits, aln_count in count_stream:
             hit = hits[0]
@@ -81,7 +84,7 @@ def update_counts(self, count_stream, increment=1):
                     )
                 # key_index = self.index.setdefault(key, len(self.index))
                 key_index = self.index[key] = len(self.index)
-            self.counts[key_index] += inc
+            self.counts[key_index][int(ambiguous_counts)] += inc
             contributed_counts += inc
 
         return contributed_counts
diff --git a/gffquant/counters/count_manager.py b/gffquant/counters/count_manager.py
index c7d4718e..37d31f6b 100644
--- a/gffquant/counters/count_manager.py
+++ b/gffquant/counters/count_manager.py
@@ -50,36 +50,45 @@ def __init__(
         self.increments = [1.0, 1.0]
         self.increments_auto_detect = [1.0, self.paired_end_count / 2.0]
 
-        self.uniq_seqcounts, self.ambig_seqcounts = None, None
-        self.uniq_regioncounts, self.ambig_regioncounts = None, None
+        # self.uniq_seqcounts, self.ambig_seqcounts = None, None
+        # self.uniq_regioncounts, self.ambig_regioncounts = None, None
+        self.seqcounts, self.regioncounts = None, None
 
         if region_counts:
-            self.uniq_regioncounts = RegionCounter(strand_specific=strand_specific)
-            self.ambig_regioncounts = RegionCounter(
+            # self.uniq_regioncounts = RegionCounter(strand_specific=strand_specific)
+            # self.ambig_regioncounts = RegionCounter(
+            #     strand_specific=strand_specific,
+            #     distribution_mode=distribution_mode,
+            # )
+            self.regioncounts = RegionCounter(
                 strand_specific=strand_specific,
                 distribution_mode=distribution_mode,
             )
 
         else:
-            self.uniq_seqcounts = AlignmentCounter(strand_specific=strand_specific)
-            self.ambig_seqcounts = AlignmentCounter(
-                strand_specific=strand_specific,
-                distribution_mode=distribution_mode
-            )
-            # self.seqcounts = AlignmentCounter(
+            # self.uniq_seqcounts = AlignmentCounter(strand_specific=strand_specific)
+            # self.ambig_seqcounts = AlignmentCounter(
             #     strand_specific=strand_specific,
-            #     distribution_mode=distribution_mode,
+            #     distribution_mode=distribution_mode
             # )
+            self.seqcounts = AlignmentCounter(
+                strand_specific=strand_specific,
+                distribution_mode=distribution_mode,
+            )
 
     def has_ambig_counts(self):
-        return self.ambig_regioncounts or self.ambig_seqcounts
+        return any(
+            self.seqcounts and self.seqcounts.has_ambig_counts(),
+            self.regioncounts and self.regioncounts.has_ambig_counts(),
+        )
+        # return self.ambig_regioncounts or self.ambig_seqcounts
 
     def update_counts(self, count_stream, ambiguous_counts=False, pair=False, pe_library=None):
-        seq_counter, region_counter = (
-            (self.uniq_seqcounts, self.uniq_regioncounts)
-            if not ambiguous_counts
-            else (self.ambig_seqcounts, self.ambig_regioncounts)
-        )
+        # seq_counter, region_counter = (
+        #     (self.uniq_seqcounts, self.uniq_regioncounts)
+        #     if not ambiguous_counts
+        #     else (self.ambig_seqcounts, self.ambig_regioncounts)
+        # )
 
         if pe_library is not None:
             # this is the case when the alignment has a read group tag
@@ -95,40 +104,51 @@ def update_counts(self, count_stream, ambiguous_counts=False, pair=False, pe_lib
             increment = self.increments[pair]
 
         contributed_counts = 0
-        if seq_counter is not None:
-            contributed_counts = seq_counter.update_counts(count_stream, increment=increment)
-        elif region_counter is not None:
-            contributed_counts = region_counter.update_counts(count_stream, increment=increment)
+        if self.seqcounts is not None:
+            contributed_counts = self.seqcounts.update_counts(count_stream, increment=increment, ambiguous_counts=ambiguous_counts,)
+        elif self.regioncounts is not None:
+            contributed_counts = self.regioncounts.update_counts(count_stream, increment=increment, ambiguous_counts=ambiguous_counts,)
+        # if seq_counter is not None:
+            # contributed_counts = seq_counter.update_counts(count_stream, increment=increment)
+        # elif region_counter is not None:
+            # contributed_counts = region_counter.update_counts(count_stream, increment=increment)
 
         return contributed_counts
 
     def dump_raw_counters(self, prefix, refmgr):
-        if self.uniq_seqcounts is not None:
-            self.uniq_seqcounts.dump(prefix, refmgr)
-        if self.ambig_seqcounts is not None:
-            self.ambig_seqcounts.dump(prefix, refmgr)
-        if self.uniq_regioncounts is not None:
-            self.uniq_regioncounts.dump(prefix, refmgr)
-        if self.ambig_regioncounts is not None:
-            self.ambig_regioncounts.dump(prefix, refmgr)
+        # if self.uniq_seqcounts is not None:
+        #     self.uniq_seqcounts.dump(prefix, refmgr)
+        # if self.ambig_seqcounts is not None:
+        #     self.ambig_seqcounts.dump(prefix, refmgr)
+        # if self.uniq_regioncounts is not None:
+        #     self.uniq_regioncounts.dump(prefix, refmgr)
+        # if self.ambig_regioncounts is not None:
+        #     self.ambig_regioncounts.dump(prefix, refmgr)
+        ...
 
     def get_unannotated_reads(self):
         unannotated_reads = 0
 
-        if self.uniq_regioncounts is not None:
-            unannotated_reads += self.uniq_regioncounts.unannotated_reads
-        if self.ambig_regioncounts is not None:
-            unannotated_reads += self.ambig_regioncounts.unannotated_reads
-        if self.uniq_seqcounts is not None:
-            unannotated_reads += self.uniq_seqcounts.unannotated_reads
-        if self.ambig_seqcounts is not None:
-            unannotated_reads += self.ambig_seqcounts.unannotated_reads
+        # if self.uniq_regioncounts is not None:
+        #     unannotated_reads += self.uniq_regioncounts.unannotated_reads
+        # if self.ambig_regioncounts is not None:
+        #     unannotated_reads += self.ambig_regioncounts.unannotated_reads
+        # if self.uniq_seqcounts is not None:
+        #     unannotated_reads += self.uniq_seqcounts.unannotated_reads
+        # if self.ambig_seqcounts is not None:
+        #     unannotated_reads += self.ambig_seqcounts.unannotated_reads
+        if self.regioncounts is not None:
+            unannotated_reads += self.regioncounts
+        if self.seqcounts is not None:
+            unannotated_reads += self.seqcounts
 
         return unannotated_reads
 
     def get_counts(self, seqid, region_counts=False, strand_specific=False):
         if region_counts:
+            raise NotImplementedError()
             rid, seqid = seqid[0], seqid[1:]
+            
             uniq_counter = self.uniq_regioncounts.get(rid, Counter())
             ambig_counter = self.ambig_regioncounts.get(rid, Counter())
 
@@ -139,9 +159,11 @@ def get_counts(self, seqid, region_counts=False, strand_specific=False):
                 return [uniq_counter[seqid]], [ambig_counter[seqid]]
 
         else:
-            uniq_counter, ambig_counter = self.uniq_seqcounts, self.ambig_seqcounts
+            # uniq_counter, ambig_counter = self.uniq_seqcounts, self.ambig_seqcounts
+
 
             if strand_specific:
+                raise NotImplementedError()
                 uniq_counts, ambig_counts = [0.0, 0.0], [0.0, 0.0]
                 uniq_counts[seqid[1]] = uniq_counter[seqid]
                 ambig_counts[seqid[1]] = ambig_counter[seqid]
@@ -156,7 +178,8 @@ def get_counts(self, seqid, region_counts=False, strand_specific=False):
                 #     ambig_counter[(rid, CountManager.MINUS_STRAND)],
                 # ]
             else:
-                uniq_counts, ambig_counts = [uniq_counter[seqid]], [ambig_counter[seqid]]
+                # uniq_counts, ambig_counts = [uniq_counter[seqid]], [ambig_counter[seqid]]
+                uniq_counts, ambig_counts = [self.seqcounts[seqid][0]], [self.seqcounts[seqid][1]]
 
             return uniq_counts, ambig_counts
 
@@ -169,9 +192,15 @@ def get_regions(self, rid):
         )
     
     def get_all_regions(self, region_counts=False):
-        uniq_counts, ambig_counts = (
-            (self.uniq_seqcounts, self.ambig_seqcounts,),
-            (self.uniq_regioncounts, self.ambig_regioncounts,),
+        # uniq_counts, ambig_counts = (
+        #     (self.uniq_seqcounts, self.ambig_seqcounts,),
+        #     (self.uniq_regioncounts, self.ambig_regioncounts,),
+        # )[region_counts]
+        # yield from set(uniq_counts).union(ambig_counts)
+        counts = (
+            self.seqcounts,
+            self.regioncounts,
         )[region_counts]
-        yield from set(uniq_counts).union(ambig_counts)
+
+        yield from counts
 

From 6c32072c15b54aa20b51a560ad5cbab4ae99122d Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Mon, 16 Dec 2024 23:47:41 +0100
Subject: [PATCH 007/128] fix: AlignmentCounter.has_ambig_counts(),
 CountManager.has_ambig_counts()

---
 gffquant/counters/alignment_counter2.py | 2 +-
 gffquant/counters/count_manager.py      | 6 ++++--
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/gffquant/counters/alignment_counter2.py b/gffquant/counters/alignment_counter2.py
index 6b54a845..9e9e7ce1 100644
--- a/gffquant/counters/alignment_counter2.py
+++ b/gffquant/counters/alignment_counter2.py
@@ -41,7 +41,7 @@ def setdefault(self, key, default_val):
         ...
 
     def has_ambig_counts(self):
-        ...
+        return bool(self.counts[:, 1].sum() != 0)
     
     def __iter__(self):
         yield from self.index.keys()
diff --git a/gffquant/counters/count_manager.py b/gffquant/counters/count_manager.py
index 37d31f6b..b25cf21b 100644
--- a/gffquant/counters/count_manager.py
+++ b/gffquant/counters/count_manager.py
@@ -78,8 +78,10 @@ def __init__(
 
     def has_ambig_counts(self):
         return any(
-            self.seqcounts and self.seqcounts.has_ambig_counts(),
-            self.regioncounts and self.regioncounts.has_ambig_counts(),
+            (
+                self.seqcounts and self.seqcounts.has_ambig_counts(),
+                self.regioncounts and self.regioncounts.has_ambig_counts(),
+            )
         )
         # return self.ambig_regioncounts or self.ambig_seqcounts
 

From 28f84e8a8aed979cad28b575647bcc7319a3bd6d Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Mon, 16 Dec 2024 23:58:25 +0100
Subject: [PATCH 008/128] fix: minor - sum counters' unannotated_reads

---
 gffquant/counters/count_manager.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/gffquant/counters/count_manager.py b/gffquant/counters/count_manager.py
index b25cf21b..22fa9988 100644
--- a/gffquant/counters/count_manager.py
+++ b/gffquant/counters/count_manager.py
@@ -140,9 +140,9 @@ def get_unannotated_reads(self):
         # if self.ambig_seqcounts is not None:
         #     unannotated_reads += self.ambig_seqcounts.unannotated_reads
         if self.regioncounts is not None:
-            unannotated_reads += self.regioncounts
+            unannotated_reads += self.regioncounts.unannotated_reads
         if self.seqcounts is not None:
-            unannotated_reads += self.seqcounts
+            unannotated_reads += self.seqcounts.unannotated_reads
 
         return unannotated_reads
 

From b344e9f7cb3e8bedb9cc5b5e7a134d7f759d8726 Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Thu, 19 Dec 2024 21:59:24 +0100
Subject: [PATCH 009/128] updating count annotation

---
 gffquant/annotation/count_annotator.py     |  6 ++-
 gffquant/annotation/genecount_annotator.py | 48 ++++++++++++++++++++++
 gffquant/counters/alignment_counter2.py    | 30 +++++++++++++-
 gffquant/counters/count_manager.py         | 14 ++++++-
 4 files changed, 93 insertions(+), 5 deletions(-)
 create mode 100644 gffquant/annotation/genecount_annotator.py

diff --git a/gffquant/annotation/count_annotator.py b/gffquant/annotation/count_annotator.py
index 70bc6c86..67f6ce7a 100644
--- a/gffquant/annotation/count_annotator.py
+++ b/gffquant/annotation/count_annotator.py
@@ -95,7 +95,8 @@ def calculate_scaling_factors(self, default_scaling_factor=0):
         def calc_scaling_factor(raw, normed, default=0):
             return (raw / normed) if normed else default
 
-        total_uniq, total_uniq_normed, total_ambi, total_ambi_normed = self.total_counts
+        # total_uniq, total_uniq_normed, total_ambi, total_ambi_normed = self.total_counts
+        total_uniq, total_ambi, total_uniq_normed, total_ambi_normed = self.total_counts
         logger.info(
             "TOTAL COUNTS: uraw=%s unorm=%s araw=%s anorm=%s",
             total_uniq, total_uniq_normed, total_ambi, total_ambi_normed
@@ -109,7 +110,8 @@ def calc_scaling_factor(raw, normed, default=0):
             total_ambi, total_ambi_normed, default_scaling_factor
         )
 
-        total_uniq, total_uniq_normed, total_ambi, total_ambi_normed = self.total_gene_counts
+        # total_uniq, total_uniq_normed, total_ambi, total_ambi_normed = self.total_gene_counts
+        total_uniq, total_ambi, total_uniq_normed, total_ambi_normed = self.total_gene_counts
         logger.info(
             "TOTAL GENE COUNTS: uraw=%s unorm=%s araw=%s anorm=%s",
             total_uniq, total_uniq_normed, total_ambi, total_ambi_normed
diff --git a/gffquant/annotation/genecount_annotator.py b/gffquant/annotation/genecount_annotator.py
new file mode 100644
index 00000000..7fab2d11
--- /dev/null
+++ b/gffquant/annotation/genecount_annotator.py
@@ -0,0 +1,48 @@
+import logging 
+
+import numpy as np
+
+from .count_annotator import CountAnnotator
+from ..counters import CountManager
+
+
+logger = logging.getLogger(__name__)
+
+
+class GeneCountAnnotator(CountAnnotator):
+	""" CountAnnotator subclass for gene-based counting. """
+
+	def __init__(self, strand_specific, report_scaling_factors=True):
+		CountAnnotator.__init__(self, strand_specific, report_scaling_factors=report_scaling_factors)
+
+	def annotate(self, refmgr, db, count_manager: CountManager, gene_group_db=False):
+		self.total_gene_counts = count_manager.transform_counts(refmgr)
+		self.total_counts = self.total_gene_counts  # ?
+
+		for rid in count_manager.get_all_regions():
+			counts = count_manager.get_counts(rid)
+			ref, _ = refmgr.get(rid[0] if isinstance(rid, tuple) else rid)
+
+			if gene_group_db:
+				ref_tokens = ref.split(".")
+				gene_id, ggroup_id = ".".join(ref_tokens[:-1]), ref_tokens[-1]
+			else:
+				gene_id, ggroup_id = ref, ref
+
+			region_annotation = db.query_sequence(ggroup_id)
+			if region_annotation is not None:
+				_, _, region_annotation = region_annotation
+				logger.info(
+					"GCAnnotator: Distributing counts of Gene %s (group=%s) %s %s",
+					gene_id, ggroup_id, counts[0], counts[2],
+				)
+				self.distribute_feature_counts(counts, region_annotation)
+
+			else:
+				logger.info("GCAnnotator: Gene %s (group=%s) has no information in database.", gene_id, ggroup_id)
+				self.unannotated_counts += counts[:4]
+
+		self.calculate_scaling_factors()
+
+
+			
diff --git a/gffquant/counters/alignment_counter2.py b/gffquant/counters/alignment_counter2.py
index 9e9e7ce1..4d91b9ae 100644
--- a/gffquant/counters/alignment_counter2.py
+++ b/gffquant/counters/alignment_counter2.py
@@ -27,7 +27,8 @@ def __init__(self, distribution_mode=DistributionMode.ONE_OVER_N, strand_specifi
 
         self.index = {}
         self.counts = np.zeros(
-            (AlignmentCounter.INITIAL_SIZE, 2),
+            (AlignmentCounter.INITIAL_SIZE, 2,),
+            dtype='float64',
         )
     def dump(self, prefix, refmgr):
         ...
@@ -88,4 +89,31 @@ def update_counts(self, count_stream, increment=1, ambiguous_counts=False):
             contributed_counts += inc
 
         return contributed_counts
+    
+    def transform(self, refmgr):
+        # transform 2-column uniq/ambig count matrix
+        # into 4 columns
+        # uniq_raw, combined_raw, uniq_lnorm, combined_lnorm
+
+        # obtain gene lengths
+        lengths = np.array(
+            (refmgr.get(key[0] if isinstance(key, tuple) else key))[1]
+            for key in self.index
+        )
+
+        # calculate combined_raw
+        self.counts[:, 1:2] += self.counts[:, 0:1]
+
+        # duplicate the raw counts
+        self.counts = np.concatenate(
+            (self.counts, self.counts,),
+            axis=1,
+        )
+
+        # length-normalise the lnorm columns
+        self.counts[:, 2:4] /= lengths[:, None]
+
+        # return count sums
+        return self.counts.sum(axis=0)
+
 
diff --git a/gffquant/counters/count_manager.py b/gffquant/counters/count_manager.py
index 22fa9988..4488babb 100644
--- a/gffquant/counters/count_manager.py
+++ b/gffquant/counters/count_manager.py
@@ -2,6 +2,8 @@
 
 from collections import Counter
 
+import numpy as np
+
 from .. import DistributionMode
 from .alignment_counter2 import AlignmentCounter
 from .region_counter import RegionCounter
@@ -181,9 +183,11 @@ def get_counts(self, seqid, region_counts=False, strand_specific=False):
                 # ]
             else:
                 # uniq_counts, ambig_counts = [uniq_counter[seqid]], [ambig_counter[seqid]]
-                uniq_counts, ambig_counts = [self.seqcounts[seqid][0]], [self.seqcounts[seqid][1]]
+                # uniq_counts, ambig_counts = [self.seqcounts[seqid][0]], [self.seqcounts[seqid][1]]
+                counts = self.seqcounts[seqid]
 
-            return uniq_counts, ambig_counts
+            # return uniq_counts, ambig_counts
+            return np.array((counts[0], counts[2], counts[1], counts[3]))
 
     def get_regions(self, rid):
         # return set(self.uniq_regioncounts.get(rid, set())).union(
@@ -206,3 +210,9 @@ def get_all_regions(self, region_counts=False):
 
         yield from counts
 
+    def transform_counts(self, refmgr):
+        if self.seqcounts is not None:
+            self.seqcounts.transform(refmgr)
+        if self.regioncounts is not None:
+            self.regioncounts.transform(refmgr)
+

From 5ed8cfc95c7b76a1e58a1bfc20d264a2bff78002 Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Thu, 19 Dec 2024 22:15:37 +0100
Subject: [PATCH 010/128] fixed import

---
 gffquant/annotation/__init__.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/gffquant/annotation/__init__.py b/gffquant/annotation/__init__.py
index 4ae04f44..1649dcb4 100644
--- a/gffquant/annotation/__init__.py
+++ b/gffquant/annotation/__init__.py
@@ -2,5 +2,7 @@
 
 """ module docstring """
 
-from .count_annotator import GeneCountAnnotator, RegionCountAnnotator
+# from .count_annotator import GeneCountAnnotator, RegionCountAnnotator
+from .count_annotator import RegionCountAnnotator
 from .count_writer import CountWriter
+from .genecount_annotator import GeneCountAnnotator

From a92722ae8a2181417d87e41a92c9260357fc7021 Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Thu, 19 Dec 2024 22:34:16 +0100
Subject: [PATCH 011/128] added debug message

---
 gffquant/counters/alignment_counter2.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/gffquant/counters/alignment_counter2.py b/gffquant/counters/alignment_counter2.py
index 4d91b9ae..02d53b15 100644
--- a/gffquant/counters/alignment_counter2.py
+++ b/gffquant/counters/alignment_counter2.py
@@ -1,3 +1,5 @@
+import logging
+
 from collections import Counter
 
 import numpy as np
@@ -5,6 +7,9 @@
 from .. import DistributionMode
 
 
+logger = logging.getLogger(__name__)
+
+
 class AlignmentCounter:
     COUNT_HEADER_ELEMENTS = ("raw", "lnorm", "scaled")
     INITIAL_SIZE = 1000
@@ -100,6 +105,7 @@ def transform(self, refmgr):
             (refmgr.get(key[0] if isinstance(key, tuple) else key))[1]
             for key in self.index
         )
+        logger.info("LENGTHS ARRAY = %s", lengths.shape)
 
         # calculate combined_raw
         self.counts[:, 1:2] += self.counts[:, 0:1]

From 515e71f2c0346524fe4febd64d35ee13a6cbeb64 Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Thu, 19 Dec 2024 22:46:07 +0100
Subject: [PATCH 012/128] added debug message

---
 gffquant/counters/alignment_counter2.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/gffquant/counters/alignment_counter2.py b/gffquant/counters/alignment_counter2.py
index 02d53b15..5df5de16 100644
--- a/gffquant/counters/alignment_counter2.py
+++ b/gffquant/counters/alignment_counter2.py
@@ -106,6 +106,7 @@ def transform(self, refmgr):
             for key in self.index
         )
         logger.info("LENGTHS ARRAY = %s", lengths.shape)
+        logger.info("INDEX SIZE = %s", len(self.index))
 
         # calculate combined_raw
         self.counts[:, 1:2] += self.counts[:, 0:1]

From 6efbca61156509d13466c0c4acaeaa61b5ad85de Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Thu, 19 Dec 2024 22:57:08 +0100
Subject: [PATCH 013/128] fixing empty length vector issue?

---
 gffquant/counters/alignment_counter2.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/gffquant/counters/alignment_counter2.py b/gffquant/counters/alignment_counter2.py
index 5df5de16..bdeebf08 100644
--- a/gffquant/counters/alignment_counter2.py
+++ b/gffquant/counters/alignment_counter2.py
@@ -102,8 +102,10 @@ def transform(self, refmgr):
 
         # obtain gene lengths
         lengths = np.array(
-            (refmgr.get(key[0] if isinstance(key, tuple) else key))[1]
-            for key in self.index
+            tuple(
+                (refmgr.get(key[0] if isinstance(key, tuple) else key))[1]
+                for key in self.index
+            )
         )
         logger.info("LENGTHS ARRAY = %s", lengths.shape)
         logger.info("INDEX SIZE = %s", len(self.index))

From ef34cde79fc030e73decf66319f29c9059521213 Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Thu, 19 Dec 2024 23:12:14 +0100
Subject: [PATCH 014/128] fixing empty length vector issue?

---
 gffquant/counters/alignment_counter2.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/gffquant/counters/alignment_counter2.py b/gffquant/counters/alignment_counter2.py
index bdeebf08..8516006c 100644
--- a/gffquant/counters/alignment_counter2.py
+++ b/gffquant/counters/alignment_counter2.py
@@ -110,6 +110,9 @@ def transform(self, refmgr):
         logger.info("LENGTHS ARRAY = %s", lengths.shape)
         logger.info("INDEX SIZE = %s", len(self.index))
 
+        # remove the un-indexed rows
+        self.counts = self.counts[0:len(self.index), :]
+
         # calculate combined_raw
         self.counts[:, 1:2] += self.counts[:, 0:1]
 

From aa09c1affee68a415701bfa016f03f83aec994a9 Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Thu, 19 Dec 2024 23:22:44 +0100
Subject: [PATCH 015/128] added debug message

---
 gffquant/annotation/genecount_annotator.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/gffquant/annotation/genecount_annotator.py b/gffquant/annotation/genecount_annotator.py
index 7fab2d11..d34f3032 100644
--- a/gffquant/annotation/genecount_annotator.py
+++ b/gffquant/annotation/genecount_annotator.py
@@ -17,6 +17,7 @@ def __init__(self, strand_specific, report_scaling_factors=True):
 
 	def annotate(self, refmgr, db, count_manager: CountManager, gene_group_db=False):
 		self.total_gene_counts = count_manager.transform_counts(refmgr)
+		logger.info("TOTAL_GENE_COUNTS = %s", self.total_gene_counts)
 		self.total_counts = self.total_gene_counts  # ?
 
 		for rid in count_manager.get_all_regions():

From d3649c28cd2334522d620d4387748d79826c52b2 Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Thu, 19 Dec 2024 23:30:58 +0100
Subject: [PATCH 016/128] fixing empty total counts?

---
 gffquant/counters/count_manager.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/gffquant/counters/count_manager.py b/gffquant/counters/count_manager.py
index 4488babb..97bb2cc0 100644
--- a/gffquant/counters/count_manager.py
+++ b/gffquant/counters/count_manager.py
@@ -212,7 +212,7 @@ def get_all_regions(self, region_counts=False):
 
     def transform_counts(self, refmgr):
         if self.seqcounts is not None:
-            self.seqcounts.transform(refmgr)
+            return self.seqcounts.transform(refmgr)
         if self.regioncounts is not None:
-            self.regioncounts.transform(refmgr)
+            return self.regioncounts.transform(refmgr)
 

From e3bee67e54facd5d831ba309cc11f8e85327248f Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Thu, 19 Dec 2024 23:55:54 +0100
Subject: [PATCH 017/128] fixing total count issue?

---
 gffquant/annotation/genecount_annotator.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gffquant/annotation/genecount_annotator.py b/gffquant/annotation/genecount_annotator.py
index d34f3032..ed161d40 100644
--- a/gffquant/annotation/genecount_annotator.py
+++ b/gffquant/annotation/genecount_annotator.py
@@ -18,7 +18,7 @@ def __init__(self, strand_specific, report_scaling_factors=True):
 	def annotate(self, refmgr, db, count_manager: CountManager, gene_group_db=False):
 		self.total_gene_counts = count_manager.transform_counts(refmgr)
 		logger.info("TOTAL_GENE_COUNTS = %s", self.total_gene_counts)
-		self.total_counts = self.total_gene_counts  # ?
+		# self.total_counts = self.total_gene_counts  # ?
 
 		for rid in count_manager.get_all_regions():
 			counts = count_manager.get_counts(rid)

From b466381cc2e6312c7a11031474edf39e875dcbd7 Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Fri, 20 Dec 2024 22:12:22 +0100
Subject: [PATCH 018/128] debug messaging

---
 gffquant/annotation/count_annotator.py | 6 +++---
 gffquant/annotation/count_writer.py    | 1 +
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/gffquant/annotation/count_annotator.py b/gffquant/annotation/count_annotator.py
index 67f6ce7a..b3f74d18 100644
--- a/gffquant/annotation/count_annotator.py
+++ b/gffquant/annotation/count_annotator.py
@@ -95,8 +95,8 @@ def calculate_scaling_factors(self, default_scaling_factor=0):
         def calc_scaling_factor(raw, normed, default=0):
             return (raw / normed) if normed else default
 
-        # total_uniq, total_uniq_normed, total_ambi, total_ambi_normed = self.total_counts
-        total_uniq, total_ambi, total_uniq_normed, total_ambi_normed = self.total_counts
+        total_uniq, total_uniq_normed, total_ambi, total_ambi_normed = self.total_counts
+        # total_uniq, total_ambi, total_uniq_normed, total_ambi_normed = self.total_counts
         logger.info(
             "TOTAL COUNTS: uraw=%s unorm=%s araw=%s anorm=%s",
             total_uniq, total_uniq_normed, total_ambi, total_ambi_normed
@@ -141,7 +141,7 @@ def calc_scaling_factor(raw, normed, default=0):
                     total_ambi, total_ambi_normed, default_scaling_factor
                 )
             )
-
+            
             if self.report_scaling_factors:
                 logger.info(
                     "Calculating scaling factors for category=%s: uraw=%s unorm=%s araw=%s anorm=%s -> factors=%s",
diff --git a/gffquant/annotation/count_writer.py b/gffquant/annotation/count_writer.py
index fa67c3fc..1827a2b8 100644
--- a/gffquant/annotation/count_writer.py
+++ b/gffquant/annotation/count_writer.py
@@ -142,6 +142,7 @@ def write_feature_counts(self, db, featcounts, unannotated_reads=None, report_un
 
                 if "category" in self.publish_reports:
                     cat_counts = counts.get(f"cat:::{category_id}")
+                    logger.info("CAT %s: %s", category_id, str(cat_counts))
                     if cat_counts is not None:
                         cat_row = self.compile_output_row(
                             cat_counts,

From 8249d9f9d6c4f91f177101228ca61cbe9275807e Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Fri, 20 Dec 2024 22:58:45 +0100
Subject: [PATCH 019/128] fixed gene writing?

---
 gffquant/annotation/count_writer.py | 21 ++++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)

diff --git a/gffquant/annotation/count_writer.py b/gffquant/annotation/count_writer.py
index 1827a2b8..f1092192 100644
--- a/gffquant/annotation/count_writer.py
+++ b/gffquant/annotation/count_writer.py
@@ -8,6 +8,8 @@
 
 import numpy as np
 
+from ..counters import CountManager
+
 
 logger = logging.getLogger(__name__)
 
@@ -161,16 +163,25 @@ def write_feature_counts(self, db, featcounts, unannotated_reads=None, report_un
                         )
                         CountWriter.write_row(feature.name, out_row, stream=feat_out)
 
-    def write_gene_counts(self, gene_counts, uniq_scaling_factor, ambig_scaling_factor):
+    def write_gene_counts(self, gene_counts: CountManager, uniq_scaling_factor, ambig_scaling_factor):
         if "scaled" in self.publish_reports:
             logger.info("SCALING_FACTORS %s %s", uniq_scaling_factor, ambig_scaling_factor)
         with gzip.open(f"{self.out_prefix}.gene_counts.txt.gz", "wt") as gene_out:
             print("gene", *self.get_header(), sep="\t", file=gene_out, flush=True)
 
-            for gene, g_counts in sorted(gene_counts.items()):
+            # for gene, g_counts in sorted(gene_counts.items()):
+            #     out_row = self.compile_output_row(
+            #         g_counts,
+            #         scaling_factor=uniq_scaling_factor,
+            #         ambig_scaling_factor=ambig_scaling_factor
+            #     )
+            #     CountWriter.write_row(gene, out_row, stream=gene_out)
+            for rid in gene_counts.get_all_regions():
+                counts = gene_counts.get_counts(rid)
                 out_row = self.compile_output_row(
-                    g_counts,
+                    counts,
                     scaling_factor=uniq_scaling_factor,
-                    ambig_scaling_factor=ambig_scaling_factor
+                    ambig_scaling_factor=ambig_scaling_factor,
                 )
-                CountWriter.write_row(gene, out_row, stream=gene_out)
+                CountWriter.write_row(rid, out_row, stream=gene_out,)
+

From 7fb5c56e65b8fdc6709bf167f4a400b1d8632f62 Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Fri, 20 Dec 2024 23:15:45 +0100
Subject: [PATCH 020/128] fixed gene writing?

---
 gffquant/profilers/feature_quantifier.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/gffquant/profilers/feature_quantifier.py b/gffquant/profilers/feature_quantifier.py
index bd5c5c96..a8897ffa 100644
--- a/gffquant/profilers/feature_quantifier.py
+++ b/gffquant/profilers/feature_quantifier.py
@@ -186,7 +186,8 @@ def process_counters(
         )
 
         count_writer.write_gene_counts(
-            count_annotator.gene_counts,
+            # count_annotator.gene_counts,
+            self.count_manager,
             count_annotator.scaling_factors["total_gene_uniq"],
             count_annotator.scaling_factors["total_gene_ambi"]
         )

From d2226acb79cfa7f894c9ef650cf389220fcf4a7e Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Fri, 20 Dec 2024 23:38:34 +0100
Subject: [PATCH 021/128] fixed gene writing?

---
 gffquant/annotation/count_writer.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/gffquant/annotation/count_writer.py b/gffquant/annotation/count_writer.py
index f1092192..7a73406c 100644
--- a/gffquant/annotation/count_writer.py
+++ b/gffquant/annotation/count_writer.py
@@ -163,7 +163,7 @@ def write_feature_counts(self, db, featcounts, unannotated_reads=None, report_un
                         )
                         CountWriter.write_row(feature.name, out_row, stream=feat_out)
 
-    def write_gene_counts(self, gene_counts: CountManager, uniq_scaling_factor, ambig_scaling_factor):
+    def write_gene_counts(self, gene_counts: CountManager, refmgr, uniq_scaling_factor, ambig_scaling_factor):
         if "scaled" in self.publish_reports:
             logger.info("SCALING_FACTORS %s %s", uniq_scaling_factor, ambig_scaling_factor)
         with gzip.open(f"{self.out_prefix}.gene_counts.txt.gz", "wt") as gene_out:
@@ -183,5 +183,6 @@ def write_gene_counts(self, gene_counts: CountManager, uniq_scaling_factor, ambi
                     scaling_factor=uniq_scaling_factor,
                     ambig_scaling_factor=ambig_scaling_factor,
                 )
-                CountWriter.write_row(rid, out_row, stream=gene_out,)
+                ref = refmgr.get(rid[0] if isinstance(rid, tuple) else rid)[0]
+                CountWriter.write_row(ref, out_row, stream=gene_out,)
 

From a1e54c29d0834a92e0704cfec0f2359a17b31cc4 Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Sat, 21 Dec 2024 00:01:14 +0100
Subject: [PATCH 022/128] fixed gene writing?

---
 gffquant/profilers/feature_quantifier.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/gffquant/profilers/feature_quantifier.py b/gffquant/profilers/feature_quantifier.py
index a8897ffa..358d3fd8 100644
--- a/gffquant/profilers/feature_quantifier.py
+++ b/gffquant/profilers/feature_quantifier.py
@@ -188,6 +188,7 @@ def process_counters(
         count_writer.write_gene_counts(
             # count_annotator.gene_counts,
             self.count_manager,
+            self.reference_manager,
             count_annotator.scaling_factors["total_gene_uniq"],
             count_annotator.scaling_factors["total_gene_ambi"]
         )

From f3364934328765a84ad6ad1635ab33f1c494ada9 Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Sat, 21 Dec 2024 09:43:34 +0100
Subject: [PATCH 023/128] fixed gene writing?

---
 gffquant/annotation/count_writer.py      | 11 +++++++++--
 gffquant/profilers/feature_quantifier.py |  3 ++-
 2 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/gffquant/annotation/count_writer.py b/gffquant/annotation/count_writer.py
index 7a73406c..2b6168aa 100644
--- a/gffquant/annotation/count_writer.py
+++ b/gffquant/annotation/count_writer.py
@@ -163,7 +163,7 @@ def write_feature_counts(self, db, featcounts, unannotated_reads=None, report_un
                         )
                         CountWriter.write_row(feature.name, out_row, stream=feat_out)
 
-    def write_gene_counts(self, gene_counts: CountManager, refmgr, uniq_scaling_factor, ambig_scaling_factor):
+    def write_gene_counts(self, gene_counts: CountManager, refmgr, uniq_scaling_factor, ambig_scaling_factor, gene_group_db=False):
         if "scaled" in self.publish_reports:
             logger.info("SCALING_FACTORS %s %s", uniq_scaling_factor, ambig_scaling_factor)
         with gzip.open(f"{self.out_prefix}.gene_counts.txt.gz", "wt") as gene_out:
@@ -184,5 +184,12 @@ def write_gene_counts(self, gene_counts: CountManager, refmgr, uniq_scaling_fact
                     ambig_scaling_factor=ambig_scaling_factor,
                 )
                 ref = refmgr.get(rid[0] if isinstance(rid, tuple) else rid)[0]
-                CountWriter.write_row(ref, out_row, stream=gene_out,)
+
+                if gene_group_db:
+                    ref_tokens = ref.split(".")
+                    gene_id, _ = ".".join(ref_tokens[:-1]), ref_tokens[-1]
+                else:
+                    gene_id = ref
+
+                CountWriter.write_row(gene_id, out_row, stream=gene_out,)
 
diff --git a/gffquant/profilers/feature_quantifier.py b/gffquant/profilers/feature_quantifier.py
index 358d3fd8..7ecdd55c 100644
--- a/gffquant/profilers/feature_quantifier.py
+++ b/gffquant/profilers/feature_quantifier.py
@@ -190,7 +190,8 @@ def process_counters(
             self.count_manager,
             self.reference_manager,
             count_annotator.scaling_factors["total_gene_uniq"],
-            count_annotator.scaling_factors["total_gene_ambi"]
+            count_annotator.scaling_factors["total_gene_ambi"],
+            gene_group_db=gene_group_db,
         )
 
         self.adm.clear_caches()

From cb9ea29420fde0d7e879c373ec7e871ace873596 Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Sat, 21 Dec 2024 20:48:16 +0100
Subject: [PATCH 024/128] dump seqcounters for debugging

---
 gffquant/counters/alignment_counter2.py  | 8 ++++++++
 gffquant/counters/count_manager.py       | 5 +++++
 gffquant/profilers/feature_quantifier.py | 5 +++++
 3 files changed, 18 insertions(+)

diff --git a/gffquant/counters/alignment_counter2.py b/gffquant/counters/alignment_counter2.py
index 8516006c..d5c8ad3d 100644
--- a/gffquant/counters/alignment_counter2.py
+++ b/gffquant/counters/alignment_counter2.py
@@ -36,6 +36,14 @@ def __init__(self, distribution_mode=DistributionMode.ONE_OVER_N, strand_specifi
             dtype='float64',
         )
     def dump(self, prefix, refmgr):
+        import gzip
+        with gzip.open(f"{prefix}.{self.__class__.__name__}.txt.gz", "wt") as _out:
+            for key in self:
+                ref, reflen = refmgr.get(key[0] if isinstance(key, tuple) else key)
+                print(key, ref, reflen, self.counts[key], sep="\t", file=_out)
+            # for k, v in self.items():
+            # ref, reflen = refmgr.get(k[0] if isinstance(k, tuple) else k)
+            # print(k, ref, reflen, v, sep="\t", file=_out)
         ...
     def get(self, key, default_val):
         key_index = self.index.get(key)
diff --git a/gffquant/counters/count_manager.py b/gffquant/counters/count_manager.py
index 97bb2cc0..8f8517f8 100644
--- a/gffquant/counters/count_manager.py
+++ b/gffquant/counters/count_manager.py
@@ -216,3 +216,8 @@ def transform_counts(self, refmgr):
         if self.regioncounts is not None:
             return self.regioncounts.transform(refmgr)
 
+    def dump(self, prefix, refmgr):
+        if self.seqcounts is not None:
+            self.seqcounts.dump(prefix, refmgr)
+        if self.regioncounts is not None:
+            self.regioncounts.dump(prefix, refmgr)
\ No newline at end of file
diff --git a/gffquant/profilers/feature_quantifier.py b/gffquant/profilers/feature_quantifier.py
index 7ecdd55c..fa1ce0d9 100644
--- a/gffquant/profilers/feature_quantifier.py
+++ b/gffquant/profilers/feature_quantifier.py
@@ -164,8 +164,13 @@ def process_counters(
 
         Annotator = (GeneCountAnnotator, RegionCountAnnotator)[self.run_mode.overlap_required]
         count_annotator = Annotator(self.strand_specific, report_scaling_factors=report_scaling_factors)
+
+        self.count_manager.dump("pre_annotate", self.reference_manager)
+
         count_annotator.annotate(self.reference_manager, self.adm, self.count_manager, gene_group_db=gene_group_db,)
 
+        self.count_manager.dump("post_annotate", self.reference_manager)
+
         count_writer = CountWriter(
             self.out_prefix,
             has_ambig_counts=self.count_manager.has_ambig_counts(),

From 6ae5c5a8aec81d6bef12b7737728c4ebf8f0ef80 Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Sat, 21 Dec 2024 21:03:39 +0100
Subject: [PATCH 025/128] dump seqcounters for debugging

---
 gffquant/counters/alignment_counter2.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gffquant/counters/alignment_counter2.py b/gffquant/counters/alignment_counter2.py
index d5c8ad3d..34db48a1 100644
--- a/gffquant/counters/alignment_counter2.py
+++ b/gffquant/counters/alignment_counter2.py
@@ -38,7 +38,7 @@ def __init__(self, distribution_mode=DistributionMode.ONE_OVER_N, strand_specifi
     def dump(self, prefix, refmgr):
         import gzip
         with gzip.open(f"{prefix}.{self.__class__.__name__}.txt.gz", "wt") as _out:
-            for key in self:
+            for key in self.index.keys():
                 ref, reflen = refmgr.get(key[0] if isinstance(key, tuple) else key)
                 print(key, ref, reflen, self.counts[key], sep="\t", file=_out)
             # for k, v in self.items():

From fbb6a0edb6b6a3e928811d36b9c3e450e18032e2 Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Sat, 21 Dec 2024 21:21:53 +0100
Subject: [PATCH 026/128] dump seqcounters for debugging

---
 gffquant/counters/alignment_counter2.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/gffquant/counters/alignment_counter2.py b/gffquant/counters/alignment_counter2.py
index 34db48a1..679790f2 100644
--- a/gffquant/counters/alignment_counter2.py
+++ b/gffquant/counters/alignment_counter2.py
@@ -38,9 +38,9 @@ def __init__(self, distribution_mode=DistributionMode.ONE_OVER_N, strand_specifi
     def dump(self, prefix, refmgr):
         import gzip
         with gzip.open(f"{prefix}.{self.__class__.__name__}.txt.gz", "wt") as _out:
-            for key in self.index.keys():
+            for key, key_index in self.index.items():
                 ref, reflen = refmgr.get(key[0] if isinstance(key, tuple) else key)
-                print(key, ref, reflen, self.counts[key], sep="\t", file=_out)
+                print(key, ref, reflen, self.counts[key_index], sep="\t", file=_out)
             # for k, v in self.items():
             # ref, reflen = refmgr.get(k[0] if isinstance(k, tuple) else k)
             # print(k, ref, reflen, v, sep="\t", file=_out)

From a70f1a8ec19fc1b7ee1962d426f9d16d542dbf63 Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Sat, 21 Dec 2024 21:45:37 +0100
Subject: [PATCH 027/128] changed strand specific order

---
 gffquant/counters/alignment_counter2.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gffquant/counters/alignment_counter2.py b/gffquant/counters/alignment_counter2.py
index 679790f2..3650dc18 100644
--- a/gffquant/counters/alignment_counter2.py
+++ b/gffquant/counters/alignment_counter2.py
@@ -83,8 +83,8 @@ def update_counts(self, count_stream, increment=1, ambiguous_counts=False):
             )[aln_count == 1]
             key = (
                 (
-                    (hit.rid, hit.rev_strand),
                     hit.rid
+                    (hit.rid, hit.rev_strand),
                 )
             )[self.strand_specific]
 

From bb13e149a9c2da412a50196ebb8155e49fd362cc Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Sat, 21 Dec 2024 21:57:59 +0100
Subject: [PATCH 028/128] debug log

---
 gffquant/counters/alignment_counter2.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/gffquant/counters/alignment_counter2.py b/gffquant/counters/alignment_counter2.py
index 3650dc18..48c5ece0 100644
--- a/gffquant/counters/alignment_counter2.py
+++ b/gffquant/counters/alignment_counter2.py
@@ -74,6 +74,7 @@ def __setitem__(self, key, value):
     def update_counts(self, count_stream, increment=1, ambiguous_counts=False):
         contributed_counts = 0
         for hits, aln_count in count_stream:
+            logger.info("update_counts:: HITS: %s", hits)
             hit = hits[0]
             inc = (
                 (

From 4579affeb5f16d48022c3cbea57080d415d0355e Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Sat, 21 Dec 2024 22:11:14 +0100
Subject: [PATCH 029/128] debug log

---
 gffquant/counters/alignment_counter2.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/gffquant/counters/alignment_counter2.py b/gffquant/counters/alignment_counter2.py
index 48c5ece0..0a417edb 100644
--- a/gffquant/counters/alignment_counter2.py
+++ b/gffquant/counters/alignment_counter2.py
@@ -74,8 +74,10 @@ def __setitem__(self, key, value):
     def update_counts(self, count_stream, increment=1, ambiguous_counts=False):
         contributed_counts = 0
         for hits, aln_count in count_stream:
+            # [2024-12-21 22:05:40,032] update_counts:: HITS: [258011	None	None	False	None	None	None	None	True	2	1]
             logger.info("update_counts:: HITS: %s", hits)
             hit = hits[0]
+            logger.info("update_counts:: HIT %s (%s)", hit, type(hit))
             inc = (
                 (
                     self.get_increment(aln_count, increment),

From a5f7fb1eef16e9cd985c112fb6fccdafb9d7409f Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Sat, 21 Dec 2024 22:19:42 +0100
Subject: [PATCH 030/128] debug log

---
 gffquant/counters/alignment_counter2.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gffquant/counters/alignment_counter2.py b/gffquant/counters/alignment_counter2.py
index 0a417edb..1aa3fde4 100644
--- a/gffquant/counters/alignment_counter2.py
+++ b/gffquant/counters/alignment_counter2.py
@@ -86,7 +86,7 @@ def update_counts(self, count_stream, increment=1, ambiguous_counts=False):
             )[aln_count == 1]
             key = (
                 (
-                    hit.rid
+                    hit.rid,
                     (hit.rid, hit.rev_strand),
                 )
             )[self.strand_specific]

From 68fa1942b5c814066f58327de6e4949094a83a52 Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Sat, 21 Dec 2024 22:56:51 +0100
Subject: [PATCH 031/128] fixed gene writing?

---
 gffquant/annotation/count_writer.py     | 38 ++++++++++++++++++++-----
 gffquant/counters/alignment_counter2.py |  3 --
 2 files changed, 31 insertions(+), 10 deletions(-)

diff --git a/gffquant/annotation/count_writer.py b/gffquant/annotation/count_writer.py
index 2b6168aa..06952cf7 100644
--- a/gffquant/annotation/count_writer.py
+++ b/gffquant/annotation/count_writer.py
@@ -176,20 +176,44 @@ def write_gene_counts(self, gene_counts: CountManager, refmgr, uniq_scaling_fact
             #         ambig_scaling_factor=ambig_scaling_factor
             #     )
             #     CountWriter.write_row(gene, out_row, stream=gene_out)
-            for rid in gene_counts.get_all_regions():
-                counts = gene_counts.get_counts(rid)
-                out_row = self.compile_output_row(
-                    counts,
-                    scaling_factor=uniq_scaling_factor,
-                    ambig_scaling_factor=ambig_scaling_factor,
+            ref_stream = (
+                (
+                    refmgr.get(rid[0] if isinstance(rid, tuple) else rid)[0],
+                    rid,
                 )
-                ref = refmgr.get(rid[0] if isinstance(rid, tuple) else rid)[0]
+                for rid in gene_counts.get_all_regions()
+            )
 
+            for ref, rid in sorted(ref_stream):
+                counts = gene_counts.get_counts(rid)
                 if gene_group_db:
                     ref_tokens = ref.split(".")
                     gene_id, _ = ".".join(ref_tokens[:-1]), ref_tokens[-1]
                 else:
                     gene_id = ref
 
+                out_row = self.compile_output_row(
+                    counts,
+                    scaling_factor=uniq_scaling_factor,
+                    ambig_scaling_factor=ambig_scaling_factor,
+                )
+
                 CountWriter.write_row(gene_id, out_row, stream=gene_out,)
 
+            # for rid in gene_counts.get_all_regions():
+            #     counts = gene_counts.get_counts(rid)
+            #     out_row = self.compile_output_row(
+            #         counts,
+            #         scaling_factor=uniq_scaling_factor,
+            #         ambig_scaling_factor=ambig_scaling_factor,
+            #     )
+            #     ref = refmgr.get(rid[0] if isinstance(rid, tuple) else rid)[0]
+
+            #     if gene_group_db:
+            #         ref_tokens = ref.split(".")
+            #         gene_id, _ = ".".join(ref_tokens[:-1]), ref_tokens[-1]
+            #     else:
+            #         gene_id = ref
+
+            #     CountWriter.write_row(gene_id, out_row, stream=gene_out,)
+
diff --git a/gffquant/counters/alignment_counter2.py b/gffquant/counters/alignment_counter2.py
index 1aa3fde4..545deafd 100644
--- a/gffquant/counters/alignment_counter2.py
+++ b/gffquant/counters/alignment_counter2.py
@@ -74,10 +74,7 @@ def __setitem__(self, key, value):
     def update_counts(self, count_stream, increment=1, ambiguous_counts=False):
         contributed_counts = 0
         for hits, aln_count in count_stream:
-            # [2024-12-21 22:05:40,032] update_counts:: HITS: [258011	None	None	False	None	None	None	None	True	2	1]
-            logger.info("update_counts:: HITS: %s", hits)
             hit = hits[0]
-            logger.info("update_counts:: HIT %s (%s)", hit, type(hit))
             inc = (
                 (
                     self.get_increment(aln_count, increment),

From 313a0615cced87eed056f45ace560c81943aa865 Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Sat, 21 Dec 2024 22:57:38 +0100
Subject: [PATCH 032/128] fixed gene writing?

---
 gffquant/profilers/feature_quantifier.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/gffquant/profilers/feature_quantifier.py b/gffquant/profilers/feature_quantifier.py
index fa1ce0d9..e685f955 100644
--- a/gffquant/profilers/feature_quantifier.py
+++ b/gffquant/profilers/feature_quantifier.py
@@ -165,11 +165,11 @@ def process_counters(
         Annotator = (GeneCountAnnotator, RegionCountAnnotator)[self.run_mode.overlap_required]
         count_annotator = Annotator(self.strand_specific, report_scaling_factors=report_scaling_factors)
 
-        self.count_manager.dump("pre_annotate", self.reference_manager)
+        # self.count_manager.dump("pre_annotate", self.reference_manager)
 
         count_annotator.annotate(self.reference_manager, self.adm, self.count_manager, gene_group_db=gene_group_db,)
 
-        self.count_manager.dump("post_annotate", self.reference_manager)
+        # self.count_manager.dump("post_annotate", self.reference_manager)
 
         count_writer = CountWriter(
             self.out_prefix,

From e917fe2c93d0a0ce87f00eea33d7fa585e8898fd Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Sat, 21 Dec 2024 23:55:51 +0100
Subject: [PATCH 033/128] starting to replace CountManager

---
 gffquant/alignment/__init__.py             |  1 +
 gffquant/alignment/reference_hit.py        | 34 ++++++++++
 gffquant/annotation/count_annotator.py     | 12 ++--
 gffquant/annotation/count_writer.py        |  4 +-
 gffquant/annotation/genecount_annotator.py | 10 +--
 gffquant/counters/alignment_counter2.py    | 62 +++++++++++++++++-
 gffquant/profilers/feature_quantifier.py   | 74 +++++++---------------
 7 files changed, 134 insertions(+), 63 deletions(-)
 create mode 100644 gffquant/alignment/reference_hit.py

diff --git a/gffquant/alignment/__init__.py b/gffquant/alignment/__init__.py
index 9c07f61f..77c59f0a 100644
--- a/gffquant/alignment/__init__.py
+++ b/gffquant/alignment/__init__.py
@@ -6,6 +6,7 @@
 
 from .aln_group import AlignmentGroup
 from .pysam_alignment_processor import AlignmentProcessor
+from .reference_hit import ReferenceHit
 from .samflags import SamFlags
 from .cigarops import CigarOps
 
diff --git a/gffquant/alignment/reference_hit.py b/gffquant/alignment/reference_hit.py
new file mode 100644
index 00000000..29c5a0f6
--- /dev/null
+++ b/gffquant/alignment/reference_hit.py
@@ -0,0 +1,34 @@
+from dataclasses import dataclass, asdict
+
+
+@dataclass(slots=True)
+class ReferenceHit:
+    rid: int = None
+    start: int = None
+    end: int = None
+    rev_strand: bool = None
+    cov_start: int = None
+    cov_end: int = None
+    has_annotation: bool = None
+    n_aln: int = None
+    is_ambiguous: bool = None
+    library_mod: int = None
+    mate_id: int = None
+
+    def __hash__(self):
+        return hash(tuple(asdict(self).values()))
+
+    def __eq__(self, other):
+        return all(
+            item[0][1] == item[1][1]
+            for item in zip(
+                sorted(asdict(self).items()),
+                sorted(asdict(other).items())
+            )
+        )
+
+    def __str__(self):
+        return "\t".join(map(str, asdict(self).values()))
+
+    def __repr__(self):
+        return str(self)
diff --git a/gffquant/annotation/count_annotator.py b/gffquant/annotation/count_annotator.py
index b3f74d18..8352300e 100644
--- a/gffquant/annotation/count_annotator.py
+++ b/gffquant/annotation/count_annotator.py
@@ -9,6 +9,8 @@
 import numpy as np
 
 from ..counters.count_manager import CountManager
+from ..counters.alignment_counter2 import AlignmentCounter
+
 
 logger = logging.getLogger(__name__)
 
@@ -277,23 +279,23 @@ class GeneCountAnnotator(CountAnnotator):
     def __init__(self, strand_specific, report_scaling_factors=True):
         CountAnnotator.__init__(self, strand_specific, report_scaling_factors=report_scaling_factors)
 
-    def annotate(self, refmgr, db, count_manager: CountManager, gene_group_db=False):
+    def annotate(self, refmgr, db, counter: AlignmentCounter, gene_group_db=False):
         """
         Annotate a set of gene counts via db-iteration.
         input:
         - bam: bamr.BamFile to use as reverse lookup table for reference ids
         - db: GffDatabaseManager holding functional annotation database
-        - count_manager: count_data
+        - counter: count_data
         """
         strand_specific_counts = (
-            (count_manager.PLUS_STRAND, count_manager.MINUS_STRAND)
+            (counter.PLUS_STRAND, counter.MINUS_STRAND)
             if self.strand_specific else None
         )
 
-        for rid in count_manager.get_all_regions():
+        for rid in counter.get_all_regions():
             ref, region_length = refmgr.get(rid[0] if isinstance(rid, tuple) else rid)
 
-            uniq_counts, ambig_counts = count_manager.get_counts(
+            uniq_counts, ambig_counts = counter.get_counts(
                 rid, region_counts=False, strand_specific=self.strand_specific
             )
 
diff --git a/gffquant/annotation/count_writer.py b/gffquant/annotation/count_writer.py
index 06952cf7..fcff0f2e 100644
--- a/gffquant/annotation/count_writer.py
+++ b/gffquant/annotation/count_writer.py
@@ -8,7 +8,7 @@
 
 import numpy as np
 
-from ..counters import CountManager
+from ..counters import CountManager, AlignmentCounter
 
 
 logger = logging.getLogger(__name__)
@@ -163,7 +163,7 @@ def write_feature_counts(self, db, featcounts, unannotated_reads=None, report_un
                         )
                         CountWriter.write_row(feature.name, out_row, stream=feat_out)
 
-    def write_gene_counts(self, gene_counts: CountManager, refmgr, uniq_scaling_factor, ambig_scaling_factor, gene_group_db=False):
+    def write_gene_counts(self, gene_counts: AlignmentCounter, refmgr, uniq_scaling_factor, ambig_scaling_factor, gene_group_db=False):
         if "scaled" in self.publish_reports:
             logger.info("SCALING_FACTORS %s %s", uniq_scaling_factor, ambig_scaling_factor)
         with gzip.open(f"{self.out_prefix}.gene_counts.txt.gz", "wt") as gene_out:
diff --git a/gffquant/annotation/genecount_annotator.py b/gffquant/annotation/genecount_annotator.py
index ed161d40..9ef4455e 100644
--- a/gffquant/annotation/genecount_annotator.py
+++ b/gffquant/annotation/genecount_annotator.py
@@ -3,7 +3,7 @@
 import numpy as np
 
 from .count_annotator import CountAnnotator
-from ..counters import CountManager
+from ..counters import CountManager, AlignmentCounter
 
 
 logger = logging.getLogger(__name__)
@@ -15,13 +15,13 @@ class GeneCountAnnotator(CountAnnotator):
 	def __init__(self, strand_specific, report_scaling_factors=True):
 		CountAnnotator.__init__(self, strand_specific, report_scaling_factors=report_scaling_factors)
 
-	def annotate(self, refmgr, db, count_manager: CountManager, gene_group_db=False):
-		self.total_gene_counts = count_manager.transform_counts(refmgr)
+	def annotate(self, refmgr, db, counter: AlignmentCounter, gene_group_db=False):
+		self.total_gene_counts = counter.transform(refmgr)  # count_manager.transform_counts(refmgr)
 		logger.info("TOTAL_GENE_COUNTS = %s", self.total_gene_counts)
 		# self.total_counts = self.total_gene_counts  # ?
 
-		for rid in count_manager.get_all_regions():
-			counts = count_manager.get_counts(rid)
+		for rid in counter.get_all_regions():
+			counts = counter.get_counts(rid)
 			ref, _ = refmgr.get(rid[0] if isinstance(rid, tuple) else rid)
 
 			if gene_group_db:
diff --git a/gffquant/counters/alignment_counter2.py b/gffquant/counters/alignment_counter2.py
index 545deafd..2e7bc3b8 100644
--- a/gffquant/counters/alignment_counter2.py
+++ b/gffquant/counters/alignment_counter2.py
@@ -13,6 +13,7 @@
 class AlignmentCounter:
     COUNT_HEADER_ELEMENTS = ("raw", "lnorm", "scaled")
     INITIAL_SIZE = 1000
+    PLUS_STRAND, MINUS_STRAND = False, True
 
     @staticmethod
     def normalise_counts(counts, feature_len, scaling_factor):
@@ -24,10 +25,39 @@ def normalise_counts(counts, feature_len, scaling_factor):
     def get_increment(self, n_aln, increment):
         # 1overN = lavern. Maya <3
         return (increment / n_aln) if self.distribution_mode == DistributionMode.ONE_OVER_N else increment
+    
+    def toggle_single_read_handling(self, unmarked_orphans):
+        # precalculate count-increment for single-end, paired-end reads
+        # for mixed input (i.e., paired-end data with single-end reads = orphans from preprocessing),
+        # properly attribute fractional counts to the orphans
+        # Increments:
+        # alignment from single end library read: 1
+        # alignment from paired-end library read: 0.5 / mate (pe_count = 1) or 1 / mate (pe_count = 2)
+        # alignment from paired-end library orphan: 0.5 (pe_count = 1) or 1 (pe_count = 2)
+
+        # old code:
+        # increment = 1 if (not pair or self.paired_end_count == 2) else 0.5
+
+        # if pair:
+        #     increment = 1 if self.paired_end_count == 2 else 0.5
+        # else:
+        #     increment = 0.5 if self.unmarked_orphans else 1
+        self.increments = (
+            (self.paired_end_count / 2.0) if unmarked_orphans else 1.0,
+            self.paired_end_count / 2.0,
+        )
 
-    def __init__(self, distribution_mode=DistributionMode.ONE_OVER_N, strand_specific=False):
+    def __init__(
+        self,
+        distribution_mode=DistributionMode.ONE_OVER_N,
+        strand_specific=False,
+        paired_end_count=1,
+    ):
         self.distribution_mode = distribution_mode
         self.strand_specific = strand_specific
+        self.paired_end_count = paired_end_count
+        self.increments = (1.0, 1.0,)
+        self.increments_auto_detect = (1.0, self.paired_end_count / 2.0,)
         self.unannotated_reads = 0
 
         self.index = {}
@@ -70,6 +100,36 @@ def __setitem__(self, key, value):
             self.counts[key_index] = value
         else:
             raise KeyError(f"{key=} not found.")
+    
+    def update(self, count_stream, ambiguous_counts=False, pair=False, pe_library=None,):
+        if pe_library is not None:
+            # this is the case when the alignment has a read group tag
+            # if pe_library is True (RG tag '2') -> take paired-end increment (also for orphans)
+            # else (RG tag '1') -> take single-end increment
+            increment = self.increments_auto_detect[pe_library]
+        else:
+            # if the alignment has no (appropriate) read group tag
+            # use the paired-end information instead
+            # if orphan reads are present in the input sam/bam,
+            # the flag `--unmarked_orphans` should be set
+            # otherwise orphan reads will be assigned a count of 1.
+            increment = self.increments[pair]
+
+        contributed_counts = self.update_counts(count_stream, increment=increment, ambiguous_counts=ambiguous_counts,)
+
+        return contributed_counts
+    
+    def get_unannotated_reads(self):
+        return self.unannotated_reads
+    
+    def get_counts(self, seqid, strand_specific=False):
+        if strand_specific:
+            raise NotImplementedError()
+        counts = self[seqid]
+        return np.array((counts[0], counts[2], counts[1], counts[3]))
+    
+    def get_all_regions(self):
+        yield from self 
         
     def update_counts(self, count_stream, increment=1, ambiguous_counts=False):
         contributed_counts = 0
diff --git a/gffquant/profilers/feature_quantifier.py b/gffquant/profilers/feature_quantifier.py
index e685f955..2af697f4 100644
--- a/gffquant/profilers/feature_quantifier.py
+++ b/gffquant/profilers/feature_quantifier.py
@@ -13,9 +13,9 @@
 from dataclasses import dataclass, asdict
 
 from .panda_coverage_profiler import PandaCoverageProfiler
-from ..alignment import AlignmentGroup, AlignmentProcessor, SamFlags
+from ..alignment import AlignmentGroup, AlignmentProcessor, ReferenceHit, SamFlags
 from ..annotation import GeneCountAnnotator, RegionCountAnnotator, CountWriter
-from ..counters import CountManager
+from ..counters import CountManager, AlignmentCounter
 from ..db.annotation_db import AnnotationDatabaseManager
 
 from .. import __tool__, DistributionMode, RunMode
@@ -24,39 +24,6 @@
 logger = logging.getLogger(__name__)
 
 
-@dataclass(slots=True)
-class ReferenceHit:
-    rid: int = None
-    start: int = None
-    end: int = None
-    rev_strand: bool = None
-    cov_start: int = None
-    cov_end: int = None
-    has_annotation: bool = None
-    n_aln: int = None
-    is_ambiguous: bool = None
-    library_mod: int = None
-    mate_id: int = None
-
-    def __hash__(self):
-        return hash(tuple(asdict(self).values()))
-
-    def __eq__(self, other):
-        return all(
-            item[0][1] == item[1][1]
-            for item in zip(
-                sorted(asdict(self).items()),
-                sorted(asdict(other).items())
-            )
-        )
-
-    def __str__(self):
-        return "\t".join(map(str, asdict(self).values()))
-
-    def __repr__(self):
-        return str(self)
-
-
 class FeatureQuantifier(ABC):
     """
         Three groups of alignments:
@@ -93,10 +60,15 @@ def __init__(
         self.db = db
         self.adm = None
         self.run_mode = run_mode
-        self.count_manager = CountManager(
+        # self.count_manager = CountManager(
+        #     distribution_mode=distribution_mode,
+        #     region_counts=run_mode.overlap_required,
+        #     strand_specific=strand_specific and not run_mode.overlap_required,
+        #     paired_end_count=paired_end_count,
+        # )
+        self.counter = AlignmentCounter(
             distribution_mode=distribution_mode,
-            region_counts=run_mode.overlap_required,
-            strand_specific=strand_specific and not run_mode.overlap_required,
+            strand_specific=strand_specific,
             paired_end_count=paired_end_count,
         )
         self.out_prefix = out_prefix
@@ -158,22 +130,20 @@ def process_counters(
             self.adm = AnnotationDatabaseManager.from_db(self.db, in_memory=in_memory)
 
         if dump_counters:
-            self.count_manager.dump_raw_counters(self.out_prefix, self.reference_manager)
+            # self.count_manager.dump_raw_counters(self.out_prefix, self.reference_manager)
+            self.counter.dump(self.out_prefix, self.reference_manager,)
 
         report_scaling_factors = restrict_reports is None or "scaled" in restrict_reports
 
         Annotator = (GeneCountAnnotator, RegionCountAnnotator)[self.run_mode.overlap_required]
         count_annotator = Annotator(self.strand_specific, report_scaling_factors=report_scaling_factors)
 
-        # self.count_manager.dump("pre_annotate", self.reference_manager)
-
-        count_annotator.annotate(self.reference_manager, self.adm, self.count_manager, gene_group_db=gene_group_db,)
-
-        # self.count_manager.dump("post_annotate", self.reference_manager)
-
+        count_annotator.annotate(self.reference_manager, self.adm, self.counter, gene_group_db=gene_group_db,)
+        
         count_writer = CountWriter(
             self.out_prefix,
-            has_ambig_counts=self.count_manager.has_ambig_counts(),
+            # has_ambig_counts=self.count_manager.has_ambig_counts(),
+            has_ambig_counts=self.counter.has_ambig_counts(),
             strand_specific=self.strand_specific,
             restrict_reports=restrict_reports,
             report_category=report_category,
@@ -181,7 +151,8 @@ def process_counters(
             filtered_readcount=self.aln_counter["filtered_read_count"],
         )
 
-        unannotated_reads = self.count_manager.get_unannotated_reads()
+        # unannotated_reads = self.count_manager.get_unannotated_reads()
+        unannotated_reads = self.counter.get_unannotated_reads()
         unannotated_reads += self.aln_counter["unannotated_ambig"]
 
         count_writer.write_feature_counts(
@@ -192,7 +163,8 @@ def process_counters(
 
         count_writer.write_gene_counts(
             # count_annotator.gene_counts,
-            self.count_manager,
+            # self.count_manager,
+            self.counter,
             self.reference_manager,
             count_annotator.scaling_factors["total_gene_uniq"],
             count_annotator.scaling_factors["total_gene_ambi"],
@@ -234,7 +206,8 @@ def process_alignments(
             filtered_sam=debug_samfile,
         )
 
-        self.count_manager.toggle_single_read_handling(unmarked_orphans)
+        # self.count_manager.toggle_single_read_handling(unmarked_orphans)
+        self.counter.toggle_single_read_handling(unmarked_orphans)
         ac = self.aln_counter
 
         read_count = 0
@@ -455,7 +428,8 @@ def process_alignment_group(self, aln_group, aln_reader):
                 )
             )
 
-            contributed_counts = self.count_manager.update_counts(
+            # contributed_counts = self.count_manager.update_counts(
+            contributed_counts = self.counter.update(
                 count_stream,
                 ambiguous_counts=is_ambiguous_group,
                 pair=aln_group.is_paired(),

From 37b624e300d12df3bef2a7a91a416cec7e95db2d Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Sun, 22 Dec 2024 15:59:11 +0100
Subject: [PATCH 034/128] pleasing linters

---
 gffquant/alignment/reference_hit.py        |  4 +
 gffquant/annotation/count_annotator.py     | 62 +---------------
 gffquant/annotation/count_writer.py        | 12 ++-
 gffquant/annotation/genecount_annotator.py | 86 +++++++++++-----------
 gffquant/counters/alignment_counter2.py    | 51 +++++++++----
 gffquant/counters/count_manager.py         |  7 +-
 gffquant/counters/region_counter.py        | 15 ++++
 gffquant/counters/seq_counter.py           |  4 +-
 gffquant/profilers/feature_quantifier.py   |  5 +-
 gffquant/profilers/panda_profiler.py       |  3 -
 10 files changed, 118 insertions(+), 131 deletions(-)

diff --git a/gffquant/alignment/reference_hit.py b/gffquant/alignment/reference_hit.py
index 29c5a0f6..968da602 100644
--- a/gffquant/alignment/reference_hit.py
+++ b/gffquant/alignment/reference_hit.py
@@ -1,3 +1,7 @@
+# pylint: disable=R0902
+
+""" module docstring """
+
 from dataclasses import dataclass, asdict
 
 
diff --git a/gffquant/annotation/count_annotator.py b/gffquant/annotation/count_annotator.py
index 8352300e..cc146006 100644
--- a/gffquant/annotation/count_annotator.py
+++ b/gffquant/annotation/count_annotator.py
@@ -9,7 +9,6 @@
 import numpy as np
 
 from ..counters.count_manager import CountManager
-from ..counters.alignment_counter2 import AlignmentCounter
 
 
 logger = logging.getLogger(__name__)
@@ -143,7 +142,7 @@ def calc_scaling_factor(raw, normed, default=0):
                     total_ambi, total_ambi_normed, default_scaling_factor
                 )
             )
-            
+
             if self.report_scaling_factors:
                 logger.info(
                     "Calculating scaling factors for category=%s: uraw=%s unorm=%s araw=%s anorm=%s -> factors=%s",
@@ -271,62 +270,3 @@ def annotate(self, refmgr, db, count_manager: CountManager, gene_group_db=False)
                     self.total_gene_counts += counts[:4]
 
         self.calculate_scaling_factors()
-
-
-class GeneCountAnnotator(CountAnnotator):
-    """ CountAnnotator subclass for gene-based counting. """
-
-    def __init__(self, strand_specific, report_scaling_factors=True):
-        CountAnnotator.__init__(self, strand_specific, report_scaling_factors=report_scaling_factors)
-
-    def annotate(self, refmgr, db, counter: AlignmentCounter, gene_group_db=False):
-        """
-        Annotate a set of gene counts via db-iteration.
-        input:
-        - bam: bamr.BamFile to use as reverse lookup table for reference ids
-        - db: GffDatabaseManager holding functional annotation database
-        - counter: count_data
-        """
-        strand_specific_counts = (
-            (counter.PLUS_STRAND, counter.MINUS_STRAND)
-            if self.strand_specific else None
-        )
-
-        for rid in counter.get_all_regions():
-            ref, region_length = refmgr.get(rid[0] if isinstance(rid, tuple) else rid)
-
-            uniq_counts, ambig_counts = counter.get_counts(
-                rid, region_counts=False, strand_specific=self.strand_specific
-            )
-
-            counts = self.compute_count_vector(
-                uniq_counts,
-                ambig_counts,
-                region_length,
-                strand_specific_counts=strand_specific_counts,
-            )
-
-            if gene_group_db:
-                ref_tokens = ref.split(".")
-                gene_id, ggroup_id = ".".join(ref_tokens[:-1]), ref_tokens[-1]
-            else:
-                ggroup_id, gene_id = ref, ref
-
-            gcounts = self.gene_counts.setdefault(gene_id, np.zeros(self.bins))
-            gcounts += counts
-            self.total_gene_counts += counts[:4]
-
-            region_annotation = db.query_sequence(ggroup_id)
-            if region_annotation is not None:
-                _, _, region_annotation = region_annotation
-                logger.info(
-                    "GCAnnotator: Distributing counts of Gene %s (group=%s) %s %s",
-                    gene_id, ggroup_id, counts[0], counts[2],
-                )
-                self.distribute_feature_counts(counts, region_annotation)
-
-            else:
-                logger.info("GCAnnotator: Gene %s (group=%s) has no information in database.", gene_id, ggroup_id)
-                self.unannotated_counts += counts[:4]
-
-        self.calculate_scaling_factors()
diff --git a/gffquant/annotation/count_writer.py b/gffquant/annotation/count_writer.py
index fcff0f2e..e5959598 100644
--- a/gffquant/annotation/count_writer.py
+++ b/gffquant/annotation/count_writer.py
@@ -8,7 +8,7 @@
 
 import numpy as np
 
-from ..counters import CountManager, AlignmentCounter
+from ..counters import AlignmentCounter
 
 
 logger = logging.getLogger(__name__)
@@ -163,7 +163,14 @@ def write_feature_counts(self, db, featcounts, unannotated_reads=None, report_un
                         )
                         CountWriter.write_row(feature.name, out_row, stream=feat_out)
 
-    def write_gene_counts(self, gene_counts: AlignmentCounter, refmgr, uniq_scaling_factor, ambig_scaling_factor, gene_group_db=False):
+    def write_gene_counts(
+        self,
+        gene_counts: AlignmentCounter,
+        refmgr,
+        uniq_scaling_factor,
+        ambig_scaling_factor,
+        gene_group_db=False
+    ):
         if "scaled" in self.publish_reports:
             logger.info("SCALING_FACTORS %s %s", uniq_scaling_factor, ambig_scaling_factor)
         with gzip.open(f"{self.out_prefix}.gene_counts.txt.gz", "wt") as gene_out:
@@ -216,4 +223,3 @@ def write_gene_counts(self, gene_counts: AlignmentCounter, refmgr, uniq_scaling_
             #         gene_id = ref
 
             #     CountWriter.write_row(gene_id, out_row, stream=gene_out,)
-
diff --git a/gffquant/annotation/genecount_annotator.py b/gffquant/annotation/genecount_annotator.py
index 9ef4455e..bbd7581d 100644
--- a/gffquant/annotation/genecount_annotator.py
+++ b/gffquant/annotation/genecount_annotator.py
@@ -1,49 +1,53 @@
-import logging 
-
-import numpy as np
+""" module docstring """
+import logging
 
 from .count_annotator import CountAnnotator
-from ..counters import CountManager, AlignmentCounter
+from ..counters import AlignmentCounter
 
 
 logger = logging.getLogger(__name__)
 
 
 class GeneCountAnnotator(CountAnnotator):
-	""" CountAnnotator subclass for gene-based counting. """
-
-	def __init__(self, strand_specific, report_scaling_factors=True):
-		CountAnnotator.__init__(self, strand_specific, report_scaling_factors=report_scaling_factors)
-
-	def annotate(self, refmgr, db, counter: AlignmentCounter, gene_group_db=False):
-		self.total_gene_counts = counter.transform(refmgr)  # count_manager.transform_counts(refmgr)
-		logger.info("TOTAL_GENE_COUNTS = %s", self.total_gene_counts)
-		# self.total_counts = self.total_gene_counts  # ?
-
-		for rid in counter.get_all_regions():
-			counts = counter.get_counts(rid)
-			ref, _ = refmgr.get(rid[0] if isinstance(rid, tuple) else rid)
-
-			if gene_group_db:
-				ref_tokens = ref.split(".")
-				gene_id, ggroup_id = ".".join(ref_tokens[:-1]), ref_tokens[-1]
-			else:
-				gene_id, ggroup_id = ref, ref
-
-			region_annotation = db.query_sequence(ggroup_id)
-			if region_annotation is not None:
-				_, _, region_annotation = region_annotation
-				logger.info(
-					"GCAnnotator: Distributing counts of Gene %s (group=%s) %s %s",
-					gene_id, ggroup_id, counts[0], counts[2],
-				)
-				self.distribute_feature_counts(counts, region_annotation)
-
-			else:
-				logger.info("GCAnnotator: Gene %s (group=%s) has no information in database.", gene_id, ggroup_id)
-				self.unannotated_counts += counts[:4]
-
-		self.calculate_scaling_factors()
-
-
-			
+    """ CountAnnotator subclass for gene-based counting. """
+
+    def __init__(self, strand_specific, report_scaling_factors=True):
+        """ __init__() """
+        CountAnnotator.__init__(self, strand_specific, report_scaling_factors=report_scaling_factors)
+
+    def annotate(self, refmgr, db, counter: AlignmentCounter, gene_group_db=False):
+        """ Annotate a set of gene counts with functional annotations. """
+        self.total_gene_counts = counter.transform(refmgr)  # count_manager.transform_counts(refmgr)
+        logger.info("TOTAL_GENE_COUNTS = %s", self.total_gene_counts)
+        # self.total_counts = self.total_gene_counts  # ?
+
+        # formerly used in compute_count_vector
+        strand_specific_counts = (
+            (counter.PLUS_STRAND, counter.MINUS_STRAND)
+            if self.strand_specific else None
+        )
+
+        for rid in counter.get_all_regions():
+            counts = counter.get_counts(rid, strand_specific=self.strand_specific)
+            ref, _ = refmgr.get(rid[0] if isinstance(rid, tuple) else rid)
+
+            if gene_group_db:
+                ref_tokens = ref.split(".")
+                gene_id, ggroup_id = ".".join(ref_tokens[:-1]), ref_tokens[-1]
+            else:
+                gene_id, ggroup_id = ref, ref
+
+            region_annotation = db.query_sequence(ggroup_id)
+            if region_annotation is not None:
+                _, _, region_annotation = region_annotation
+                logger.info(
+                    "GCAnnotator: Distributing counts of Gene %s (group=%s) %s %s",
+                    gene_id, ggroup_id, counts[0], counts[2],
+                )
+                self.distribute_feature_counts(counts, region_annotation)
+
+            else:
+                logger.info("GCAnnotator: Gene %s (group=%s) has no information in database.", gene_id, ggroup_id)
+                self.unannotated_counts += counts[:4]
+
+        self.calculate_scaling_factors()
diff --git a/gffquant/counters/alignment_counter2.py b/gffquant/counters/alignment_counter2.py
index 2e7bc3b8..a0c84716 100644
--- a/gffquant/counters/alignment_counter2.py
+++ b/gffquant/counters/alignment_counter2.py
@@ -1,3 +1,8 @@
+# pylint: disable=R0902
+
+""" module docstring """
+
+import gzip
 import logging
 
 from collections import Counter
@@ -13,6 +18,9 @@
 class AlignmentCounter:
     COUNT_HEADER_ELEMENTS = ("raw", "lnorm", "scaled")
     INITIAL_SIZE = 1000
+    # this may be counter-intuitive
+    # but originates from the samflags 0x10, 0x20,
+    # which explicitly identify the reverse-strandness of the read
     PLUS_STRAND, MINUS_STRAND = False, True
 
     @staticmethod
@@ -25,7 +33,7 @@ def normalise_counts(counts, feature_len, scaling_factor):
     def get_increment(self, n_aln, increment):
         # 1overN = lavern. Maya <3
         return (increment / n_aln) if self.distribution_mode == DistributionMode.ONE_OVER_N else increment
-    
+
     def toggle_single_read_handling(self, unmarked_orphans):
         # precalculate count-increment for single-end, paired-end reads
         # for mixed input (i.e., paired-end data with single-end reads = orphans from preprocessing),
@@ -65,8 +73,8 @@ def __init__(
             (AlignmentCounter.INITIAL_SIZE, 2,),
             dtype='float64',
         )
+
     def dump(self, prefix, refmgr):
-        import gzip
         with gzip.open(f"{prefix}.{self.__class__.__name__}.txt.gz", "wt") as _out:
             for key, key_index in self.index.items():
                 ref, reflen = refmgr.get(key[0] if isinstance(key, tuple) else key)
@@ -74,33 +82,35 @@ def dump(self, prefix, refmgr):
             # for k, v in self.items():
             # ref, reflen = refmgr.get(k[0] if isinstance(k, tuple) else k)
             # print(k, ref, reflen, v, sep="\t", file=_out)
-        ...
+
     def get(self, key, default_val):
         key_index = self.index.get(key)
         if key_index is None:
             return Counter()
         return Counter({key: self.counts[key_index]})
-    
+
     def setdefault(self, key, default_val):
         ...
 
     def has_ambig_counts(self):
         return bool(self.counts[:, 1].sum() != 0)
-    
+
     def __iter__(self):
         yield from self.index.keys()
+
     def __getitem__(self, key):
         key_index = self.index.get(key)
         if key_index is None:
             return 0.0
         return self.counts[key_index]
+
     def __setitem__(self, key, value):
         key_index = self.index.get(key)
         if key_index is not None:
             self.counts[key_index] = value
         else:
             raise KeyError(f"{key=} not found.")
-    
+
     def update(self, count_stream, ambiguous_counts=False, pair=False, pe_library=None,):
         if pe_library is not None:
             # this is the case when the alignment has a read group tag
@@ -118,19 +128,32 @@ def update(self, count_stream, ambiguous_counts=False, pair=False, pe_library=No
         contributed_counts = self.update_counts(count_stream, increment=increment, ambiguous_counts=ambiguous_counts,)
 
         return contributed_counts
-    
+
     def get_unannotated_reads(self):
         return self.unannotated_reads
-    
+
     def get_counts(self, seqid, strand_specific=False):
         if strand_specific:
-            raise NotImplementedError()
+                raise NotImplementedError()
+                uniq_counts, ambig_counts = [0.0, 0.0], [0.0, 0.0]
+                uniq_counts[seqid[1]] = uniq_counter[seqid]
+                ambig_counts[seqid[1]] = ambig_counter[seqid]
+
+                # rid = seqid[0] if isinstance(seqid, tuple) else seqid
+                # uniq_counts = [
+                #     uniq_counter[(rid, CountManager.PLUS_STRAND)],
+                #     uniq_counter[(rid, CountManager.MINUS_STRAND)],
+                # ]
+                # ambig_counts = [
+                #     ambig_counter[(rid, CountManager.PLUS_STRAND)],
+                #     ambig_counter[(rid, CountManager.MINUS_STRAND)],
+                # ]
         counts = self[seqid]
         return np.array((counts[0], counts[2], counts[1], counts[3]))
-    
+
     def get_all_regions(self):
-        yield from self 
-        
+        yield from self
+
     def update_counts(self, count_stream, increment=1, ambiguous_counts=False):
         contributed_counts = 0
         for hits, aln_count in count_stream:
@@ -162,7 +185,7 @@ def update_counts(self, count_stream, increment=1, ambiguous_counts=False):
             contributed_counts += inc
 
         return contributed_counts
-    
+
     def transform(self, refmgr):
         # transform 2-column uniq/ambig count matrix
         # into 4 columns
@@ -195,5 +218,3 @@ def transform(self, refmgr):
 
         # return count sums
         return self.counts.sum(axis=0)
-
-
diff --git a/gffquant/counters/count_manager.py b/gffquant/counters/count_manager.py
index 8f8517f8..d3eb059f 100644
--- a/gffquant/counters/count_manager.py
+++ b/gffquant/counters/count_manager.py
@@ -152,7 +152,7 @@ def get_counts(self, seqid, region_counts=False, strand_specific=False):
         if region_counts:
             raise NotImplementedError()
             rid, seqid = seqid[0], seqid[1:]
-            
+
             uniq_counter = self.uniq_regioncounts.get(rid, Counter())
             ambig_counter = self.ambig_regioncounts.get(rid, Counter())
 
@@ -165,7 +165,6 @@ def get_counts(self, seqid, region_counts=False, strand_specific=False):
         else:
             # uniq_counter, ambig_counter = self.uniq_seqcounts, self.ambig_seqcounts
 
-
             if strand_specific:
                 raise NotImplementedError()
                 uniq_counts, ambig_counts = [0.0, 0.0], [0.0, 0.0]
@@ -196,7 +195,7 @@ def get_regions(self, rid):
         return set(self.uniq_regioncounts.get(rid, Counter())).union(
             self.ambig_regioncounts.get(rid, Counter())
         )
-    
+
     def get_all_regions(self, region_counts=False):
         # uniq_counts, ambig_counts = (
         #     (self.uniq_seqcounts, self.ambig_seqcounts,),
@@ -220,4 +219,4 @@ def dump(self, prefix, refmgr):
         if self.seqcounts is not None:
             self.seqcounts.dump(prefix, refmgr)
         if self.regioncounts is not None:
-            self.regioncounts.dump(prefix, refmgr)
\ No newline at end of file
+            self.regioncounts.dump(prefix, refmgr)
diff --git a/gffquant/counters/region_counter.py b/gffquant/counters/region_counter.py
index a7a74756..3056288a 100644
--- a/gffquant/counters/region_counter.py
+++ b/gffquant/counters/region_counter.py
@@ -8,6 +8,21 @@
 from .alignment_counter2 import AlignmentCounter
 
 
+# from count_manager.get_counts()
+# if region_counts:
+#     raise NotImplementedError()
+#     rid, seqid = seqid[0], seqid[1:]
+
+#     uniq_counter = self.uniq_regioncounts.get(rid, Counter())
+#     ambig_counter = self.ambig_regioncounts.get(rid, Counter())
+
+#     # pylint: disable=R1720
+#     if strand_specific:
+#         raise NotImplementedError
+#     else:
+#         return [uniq_counter[seqid]], [ambig_counter[seqid]]
+
+
 class RegionCounter(AlignmentCounter):
     """This counter class can be used in overlap mode, i.e.
     when reads are aligned against long references (e.g. contigs)
diff --git a/gffquant/counters/seq_counter.py b/gffquant/counters/seq_counter.py
index 91e28628..261ed575 100644
--- a/gffquant/counters/seq_counter.py
+++ b/gffquant/counters/seq_counter.py
@@ -1,9 +1,11 @@
 # pylint: disable=W0223
 
+# deprecated
+
 """ module docstring """
 
 from .. import DistributionMode
-from .alignment_counter2 import AlignmentCounter
+from .alignment_counter import AlignmentCounter
 
 
 class UniqueSeqCounter(AlignmentCounter):
diff --git a/gffquant/profilers/feature_quantifier.py b/gffquant/profilers/feature_quantifier.py
index 2af697f4..6477c1be 100644
--- a/gffquant/profilers/feature_quantifier.py
+++ b/gffquant/profilers/feature_quantifier.py
@@ -10,12 +10,11 @@
 
 from abc import ABC
 from collections import Counter
-from dataclasses import dataclass, asdict
 
 from .panda_coverage_profiler import PandaCoverageProfiler
 from ..alignment import AlignmentGroup, AlignmentProcessor, ReferenceHit, SamFlags
 from ..annotation import GeneCountAnnotator, RegionCountAnnotator, CountWriter
-from ..counters import CountManager, AlignmentCounter
+from ..counters import AlignmentCounter
 from ..db.annotation_db import AnnotationDatabaseManager
 
 from .. import __tool__, DistributionMode, RunMode
@@ -139,7 +138,7 @@ def process_counters(
         count_annotator = Annotator(self.strand_specific, report_scaling_factors=report_scaling_factors)
 
         count_annotator.annotate(self.reference_manager, self.adm, self.counter, gene_group_db=gene_group_db,)
-        
+
         count_writer = CountWriter(
             self.out_prefix,
             # has_ambig_counts=self.count_manager.has_ambig_counts(),
diff --git a/gffquant/profilers/panda_profiler.py b/gffquant/profilers/panda_profiler.py
index 53dacee6..4a5ee057 100644
--- a/gffquant/profilers/panda_profiler.py
+++ b/gffquant/profilers/panda_profiler.py
@@ -28,7 +28,6 @@ def __init__(
         self._buffer_size = 0
         self._max_buffer_size = 400_000_000
 
-
     def get_gene_coords(self):
         if self.with_overlap:
             for rid, start, end in zip(
@@ -283,7 +282,6 @@ def add_records(self, hits, last_update=False):
         self._buffer += hits
         self._buffer_size += hits_size
 
-
     def merge_dataframes(self):
         print("BUFFER:", len(self._buffer), self._buffer[:1])
         hits_df = pd.DataFrame(self._buffer)
@@ -319,7 +317,6 @@ def merge_dataframes(self):
                 .groupby(by=self.index_columns, as_index=False) \
                 .sum(numeric_only=True)
 
-
     def add_records_old(self, hits):
 
         # [2024-02-08 14:51:17,846] count_stream:

From 9e078ce9b8665257c7d26c6ade602a56b9333b64 Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Sun, 22 Dec 2024 21:47:16 +0100
Subject: [PATCH 035/128] removed seq_counter.py

---
 gffquant/annotation/count_annotator.py   |  4 +-
 gffquant/counters/alignment_counter2.py  |  8 +--
 gffquant/counters/seq_counter.py         | 65 ------------------------
 gffquant/profilers/feature_quantifier.py |  7 ---
 4 files changed, 6 insertions(+), 78 deletions(-)
 delete mode 100644 gffquant/counters/seq_counter.py

diff --git a/gffquant/annotation/count_annotator.py b/gffquant/annotation/count_annotator.py
index cc146006..12d811ae 100644
--- a/gffquant/annotation/count_annotator.py
+++ b/gffquant/annotation/count_annotator.py
@@ -8,7 +8,7 @@
 
 import numpy as np
 
-from ..counters.count_manager import CountManager
+from ..counters.count_manager import CountManager, AlignmentCounter
 
 
 logger = logging.getLogger(__name__)
@@ -202,7 +202,7 @@ def __init__(self, strand_specific, report_scaling_factors=True):
         CountAnnotator.__init__(self, strand_specific, report_scaling_factors=report_scaling_factors)
 
     # pylint: disable=R0914,W0613
-    def annotate(self, refmgr, db, count_manager: CountManager, gene_group_db=False):
+    def annotate(self, refmgr, db, counter: AlignmentCounter, gene_group_db=False):
         """
         Annotate a set of region counts via db-lookup.
         input:
diff --git a/gffquant/counters/alignment_counter2.py b/gffquant/counters/alignment_counter2.py
index a0c84716..8fc96d2c 100644
--- a/gffquant/counters/alignment_counter2.py
+++ b/gffquant/counters/alignment_counter2.py
@@ -141,12 +141,12 @@ def get_counts(self, seqid, strand_specific=False):
 
                 # rid = seqid[0] if isinstance(seqid, tuple) else seqid
                 # uniq_counts = [
-                #     uniq_counter[(rid, CountManager.PLUS_STRAND)],
-                #     uniq_counter[(rid, CountManager.MINUS_STRAND)],
+                #     uniq_counter[(rid, AlignmentCounter.PLUS_STRAND)],
+                #     uniq_counter[(rid, AlignmentCounter.MINUS_STRAND)],
                 # ]
                 # ambig_counts = [
-                #     ambig_counter[(rid, CountManager.PLUS_STRAND)],
-                #     ambig_counter[(rid, CountManager.MINUS_STRAND)],
+                #     ambig_counter[(rid, AlignmentCounter.PLUS_STRAND)],
+                #     ambig_counter[(rid, AlignmentCounter.MINUS_STRAND)],
                 # ]
         counts = self[seqid]
         return np.array((counts[0], counts[2], counts[1], counts[3]))
diff --git a/gffquant/counters/seq_counter.py b/gffquant/counters/seq_counter.py
deleted file mode 100644
index 261ed575..00000000
--- a/gffquant/counters/seq_counter.py
+++ /dev/null
@@ -1,65 +0,0 @@
-# pylint: disable=W0223
-
-# deprecated
-
-""" module docstring """
-
-from .. import DistributionMode
-from .alignment_counter import AlignmentCounter
-
-
-class UniqueSeqCounter(AlignmentCounter):
-    def __init__(self, strand_specific=False):
-        AlignmentCounter.__init__(self, strand_specific=strand_specific)
-
-    def get_counts(self, seq_ids):
-        """
-        Given a list of sequence ids, return the total number of reads that mapped to each of those
-        sequences
-
-        :param seq_ids: a list of sequence ids to count
-        :return: A list of counts for each sequence ID.
-        """
-        if self.strand_specific:
-            return sum(
-                self[(seq_id, strand)] for seq_id in seq_ids for strand in (True, False)
-            )
-        return sum(self[seq_id] for seq_id in seq_ids)
-
-    def update_counts(self, count_stream, increment=1):
-        for counts, _, _ in count_stream:
-
-            for rid, hits in counts.items():
-
-                if self.strand_specific:
-                    strands = tuple(int(strand) for _, _, strand, _, _ in hits)
-
-                    self[(rid, True)] += sum(strands) * increment
-                    self[(rid, False)] += (len(hits) - sum(strands)) * increment
-
-                else:
-                    self[rid] += len(hits) * increment
-
-
-class AmbiguousSeqCounter(AlignmentCounter):
-    def __init__(self, strand_specific=False, distribution_mode=DistributionMode.ONE_OVER_N):
-        AlignmentCounter.__init__(
-            self, distribution_mode=distribution_mode, strand_specific=strand_specific
-        )
-
-    def update_counts(self, count_stream, increment=1):
-
-        for counts, aln_count, _ in count_stream:
-
-            inc = self.get_increment(aln_count, increment)
-
-            for rid, hits in counts.items():
-
-                if self.strand_specific:
-                    strands = tuple(int(strand) for _, _, strand, _, _ in hits)
-
-                    self[(rid, True)] += sum(strands) * inc
-                    self[(rid, False)] += (len(hits) - sum(strands)) * inc
-
-                else:
-                    self[rid] += len(hits) * inc
diff --git a/gffquant/profilers/feature_quantifier.py b/gffquant/profilers/feature_quantifier.py
index 6477c1be..b3b08a9f 100644
--- a/gffquant/profilers/feature_quantifier.py
+++ b/gffquant/profilers/feature_quantifier.py
@@ -59,12 +59,6 @@ def __init__(
         self.db = db
         self.adm = None
         self.run_mode = run_mode
-        # self.count_manager = CountManager(
-        #     distribution_mode=distribution_mode,
-        #     region_counts=run_mode.overlap_required,
-        #     strand_specific=strand_specific and not run_mode.overlap_required,
-        #     paired_end_count=paired_end_count,
-        # )
         self.counter = AlignmentCounter(
             distribution_mode=distribution_mode,
             strand_specific=strand_specific,
@@ -74,7 +68,6 @@ def __init__(
         self.distribution_mode = distribution_mode
         self.reference_manager = {}
         self.strand_specific = strand_specific
-        # self.coverage_counter = {}
         self.debug = debug
         self.panda_cv = PandaCoverageProfiler(dump_dataframes=self.debug) if calculate_coverage else None
 

From 7976a4da982df4750706be468a41995fd3f3c03e Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Sun, 22 Dec 2024 21:49:22 +0100
Subject: [PATCH 036/128] removed Unique- and AmbiguousRegionCounter classes

---
 gffquant/counters/region_counter.py | 65 -----------------------------
 1 file changed, 65 deletions(-)

diff --git a/gffquant/counters/region_counter.py b/gffquant/counters/region_counter.py
index 3056288a..5b36a876 100644
--- a/gffquant/counters/region_counter.py
+++ b/gffquant/counters/region_counter.py
@@ -49,68 +49,3 @@ def update_counts(self, count_stream, increment=1):
                 )
                 contributed_counts += inc
         return contributed_counts
-
-
-class UniqueRegionCounter(RegionCounter):
-    """This counter class can be used in overlap mode, i.e.
-    when reads are aligned against long references (e.g. contigs)
-    with multiple regions of interest (features).
-    """
-
-    def __init__(self, distribution_mode=DistributionMode.ONE_OVER_N, strand_specific=False):
-        RegionCounter.__init__(
-            self, distribution_mode=distribution_mode, strand_specific=strand_specific,
-        )
-
-    # pylint: disable=W0613
-    def update_counts(self, count_stream, increment=1):
-        """Update counter with alignments against the same reference.
-
-        input: count_stream
-        - counts: set of overlaps with the reference
-        - aln_count: 1 if overlaps else 0
-        - unaligned: 1 - aln_count
-        (redundant input due to streamlining uniq/ambig dataflows)
-        """
-        for counts, aln_count, unaligned in count_stream:
-            if aln_count:
-                for rid, hits in counts.items():
-                    for hit in hits:
-                        self._update_region(
-                            rid, *hit, increment=increment
-                        )
-            else:
-                self.unannotated_reads += unaligned
-
-
-class AmbiguousRegionCounter(RegionCounter):
-    """This counter class can be used in overlap mode, i.e.
-    when reads are aligned against long references (e.g. contigs)
-    with multiple regions of interest (features).
-    """
-
-    def __init__(self, distribution_mode=DistributionMode.ONE_OVER_N, strand_specific=False):
-        RegionCounter.__init__(
-            self, distribution_mode=distribution_mode, strand_specific=strand_specific,
-        )
-
-    # pylint: disable=W0613
-    def update_counts(self, count_stream, increment=1):
-        """Update counter with alignments against the same reference.
-
-        input: count_stream
-        - counts: set of overlaps with the reference
-        - aln_count: 1 if overlaps else 0
-        - unaligned: 1 - aln_count
-        (redundant input due to streamlining uniq/ambig dataflows)
-        """
-        for counts, aln_count, unaligned in count_stream:
-            if aln_count:
-                inc = self.get_increment(aln_count, increment)
-                for rid, hits in counts.items():
-                    for hit in hits:
-                        self._update_region(
-                            rid, *hit, increment=inc
-                        )
-            else:
-                self.unannotated_reads += unaligned

From f754653f4e2a0e4142bc881fca71cd47f843eb6c Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Sun, 22 Dec 2024 21:57:53 +0100
Subject: [PATCH 037/128] updated alignment_counter, removed alignment_counter2

---
 gffquant/counters/__init__.py           |   3 +-
 gffquant/counters/alignment_counter.py  | 202 ++++++++++++++++++++--
 gffquant/counters/alignment_counter2.py | 220 ------------------------
 gffquant/counters/count_manager.py      |   2 +-
 gffquant/counters/region_counter.py     |   2 +-
 5 files changed, 188 insertions(+), 241 deletions(-)
 delete mode 100644 gffquant/counters/alignment_counter2.py

diff --git a/gffquant/counters/__init__.py b/gffquant/counters/__init__.py
index 774cd03c..34d30242 100644
--- a/gffquant/counters/__init__.py
+++ b/gffquant/counters/__init__.py
@@ -3,7 +3,6 @@
 
 """module docstring"""
 
-from .alignment_counter2 import AlignmentCounter
+from .alignment_counter import AlignmentCounter
 from .region_counter import RegionCounter
-from .seq_counter import UniqueSeqCounter, AmbiguousSeqCounter
 from .count_manager import CountManager
diff --git a/gffquant/counters/alignment_counter.py b/gffquant/counters/alignment_counter.py
index 3c42b254..8fc96d2c 100644
--- a/gffquant/counters/alignment_counter.py
+++ b/gffquant/counters/alignment_counter.py
@@ -1,18 +1,27 @@
-# pylint: disable=W0223
-# pylint: disable=C0103
-# pylint: disable=W1514
+# pylint: disable=R0902
 
-"""module docstring"""
+""" module docstring """
 
 import gzip
+import logging
 
 from collections import Counter
 
+import numpy as np
+
 from .. import DistributionMode
 
 
-class AlignmentCounter(Counter):
-    COUNT_HEADER_ELEMENTS = ["raw", "lnorm", "scaled"]
+logger = logging.getLogger(__name__)
+
+
+class AlignmentCounter:
+    COUNT_HEADER_ELEMENTS = ("raw", "lnorm", "scaled")
+    INITIAL_SIZE = 1000
+    # this may be counter-intuitive
+    # but originates from the samflags 0x10, 0x20,
+    # which explicitly identify the reverse-strandness of the read
+    PLUS_STRAND, MINUS_STRAND = False, True
 
     @staticmethod
     def normalise_counts(counts, feature_len, scaling_factor):
@@ -25,28 +34,187 @@ def get_increment(self, n_aln, increment):
         # 1overN = lavern. Maya <3
         return (increment / n_aln) if self.distribution_mode == DistributionMode.ONE_OVER_N else increment
 
-    def __init__(self, distribution_mode=DistributionMode.ONE_OVER_N, strand_specific=False):
-        Counter.__init__(self)
+    def toggle_single_read_handling(self, unmarked_orphans):
+        # precalculate count-increment for single-end, paired-end reads
+        # for mixed input (i.e., paired-end data with single-end reads = orphans from preprocessing),
+        # properly attribute fractional counts to the orphans
+        # Increments:
+        # alignment from single end library read: 1
+        # alignment from paired-end library read: 0.5 / mate (pe_count = 1) or 1 / mate (pe_count = 2)
+        # alignment from paired-end library orphan: 0.5 (pe_count = 1) or 1 (pe_count = 2)
+
+        # old code:
+        # increment = 1 if (not pair or self.paired_end_count == 2) else 0.5
+
+        # if pair:
+        #     increment = 1 if self.paired_end_count == 2 else 0.5
+        # else:
+        #     increment = 0.5 if self.unmarked_orphans else 1
+        self.increments = (
+            (self.paired_end_count / 2.0) if unmarked_orphans else 1.0,
+            self.paired_end_count / 2.0,
+        )
+
+    def __init__(
+        self,
+        distribution_mode=DistributionMode.ONE_OVER_N,
+        strand_specific=False,
+        paired_end_count=1,
+    ):
         self.distribution_mode = distribution_mode
         self.strand_specific = strand_specific
+        self.paired_end_count = paired_end_count
+        self.increments = (1.0, 1.0,)
+        self.increments_auto_detect = (1.0, self.paired_end_count / 2.0,)
         self.unannotated_reads = 0
 
+        self.index = {}
+        self.counts = np.zeros(
+            (AlignmentCounter.INITIAL_SIZE, 2,),
+            dtype='float64',
+        )
+
     def dump(self, prefix, refmgr):
         with gzip.open(f"{prefix}.{self.__class__.__name__}.txt.gz", "wt") as _out:
-            for k, v in self.items():
-                ref, reflen = refmgr.get(k[0] if isinstance(k, tuple) else k)
-                print(k, ref, reflen, v, sep="\t", file=_out)
+            for key, key_index in self.index.items():
+                ref, reflen = refmgr.get(key[0] if isinstance(key, tuple) else key)
+                print(key, ref, reflen, self.counts[key_index], sep="\t", file=_out)
+            # for k, v in self.items():
+            # ref, reflen = refmgr.get(k[0] if isinstance(k, tuple) else k)
+            # print(k, ref, reflen, v, sep="\t", file=_out)
+
+    def get(self, key, default_val):
+        key_index = self.index.get(key)
+        if key_index is None:
+            return Counter()
+        return Counter({key: self.counts[key_index]})
+
+    def setdefault(self, key, default_val):
+        ...
+
+    def has_ambig_counts(self):
+        return bool(self.counts[:, 1].sum() != 0)
+
+    def __iter__(self):
+        yield from self.index.keys()
+
+    def __getitem__(self, key):
+        key_index = self.index.get(key)
+        if key_index is None:
+            return 0.0
+        return self.counts[key_index]
+
+    def __setitem__(self, key, value):
+        key_index = self.index.get(key)
+        if key_index is not None:
+            self.counts[key_index] = value
+        else:
+            raise KeyError(f"{key=} not found.")
+
+    def update(self, count_stream, ambiguous_counts=False, pair=False, pe_library=None,):
+        if pe_library is not None:
+            # this is the case when the alignment has a read group tag
+            # if pe_library is True (RG tag '2') -> take paired-end increment (also for orphans)
+            # else (RG tag '1') -> take single-end increment
+            increment = self.increments_auto_detect[pe_library]
+        else:
+            # if the alignment has no (appropriate) read group tag
+            # use the paired-end information instead
+            # if orphan reads are present in the input sam/bam,
+            # the flag `--unmarked_orphans` should be set
+            # otherwise orphan reads will be assigned a count of 1.
+            increment = self.increments[pair]
 
-    def update_counts(self, count_stream, increment=1):
+        contributed_counts = self.update_counts(count_stream, increment=increment, ambiguous_counts=ambiguous_counts,)
+
+        return contributed_counts
+
+    def get_unannotated_reads(self):
+        return self.unannotated_reads
+
+    def get_counts(self, seqid, strand_specific=False):
+        if strand_specific:
+                raise NotImplementedError()
+                uniq_counts, ambig_counts = [0.0, 0.0], [0.0, 0.0]
+                uniq_counts[seqid[1]] = uniq_counter[seqid]
+                ambig_counts[seqid[1]] = ambig_counter[seqid]
+
+                # rid = seqid[0] if isinstance(seqid, tuple) else seqid
+                # uniq_counts = [
+                #     uniq_counter[(rid, AlignmentCounter.PLUS_STRAND)],
+                #     uniq_counter[(rid, AlignmentCounter.MINUS_STRAND)],
+                # ]
+                # ambig_counts = [
+                #     ambig_counter[(rid, AlignmentCounter.PLUS_STRAND)],
+                #     ambig_counter[(rid, AlignmentCounter.MINUS_STRAND)],
+                # ]
+        counts = self[seqid]
+        return np.array((counts[0], counts[2], counts[1], counts[3]))
+
+    def get_all_regions(self):
+        yield from self
+
+    def update_counts(self, count_stream, increment=1, ambiguous_counts=False):
         contributed_counts = 0
         for hits, aln_count in count_stream:
             hit = hits[0]
-            inc = increment if aln_count == 1 else self.get_increment(aln_count, increment)
-            if self.strand_specific:
-                self[(hit.rid, hit.rev_strand)] += inc
-            else:
-                self[hit.rid] += inc
+            inc = (
+                (
+                    self.get_increment(aln_count, increment),
+                    increment,
+                )
+            )[aln_count == 1]
+            key = (
+                (
+                    hit.rid,
+                    (hit.rid, hit.rev_strand),
+                )
+            )[self.strand_specific]
 
+            key_index = self.index.get(key)
+            if key_index is None:
+                nrows = self.counts.shape[0]
+                if len(self.index) == nrows:
+                    self.counts = np.pad(
+                        self.counts,
+                        ((0, AlignmentCounter.INITIAL_SIZE), (0, 0),),
+                    )
+                # key_index = self.index.setdefault(key, len(self.index))
+                key_index = self.index[key] = len(self.index)
+            self.counts[key_index][int(ambiguous_counts)] += inc
             contributed_counts += inc
 
         return contributed_counts
+
+    def transform(self, refmgr):
+        # transform 2-column uniq/ambig count matrix
+        # into 4 columns
+        # uniq_raw, combined_raw, uniq_lnorm, combined_lnorm
+
+        # obtain gene lengths
+        lengths = np.array(
+            tuple(
+                (refmgr.get(key[0] if isinstance(key, tuple) else key))[1]
+                for key in self.index
+            )
+        )
+        logger.info("LENGTHS ARRAY = %s", lengths.shape)
+        logger.info("INDEX SIZE = %s", len(self.index))
+
+        # remove the un-indexed rows
+        self.counts = self.counts[0:len(self.index), :]
+
+        # calculate combined_raw
+        self.counts[:, 1:2] += self.counts[:, 0:1]
+
+        # duplicate the raw counts
+        self.counts = np.concatenate(
+            (self.counts, self.counts,),
+            axis=1,
+        )
+
+        # length-normalise the lnorm columns
+        self.counts[:, 2:4] /= lengths[:, None]
+
+        # return count sums
+        return self.counts.sum(axis=0)
diff --git a/gffquant/counters/alignment_counter2.py b/gffquant/counters/alignment_counter2.py
deleted file mode 100644
index 8fc96d2c..00000000
--- a/gffquant/counters/alignment_counter2.py
+++ /dev/null
@@ -1,220 +0,0 @@
-# pylint: disable=R0902
-
-""" module docstring """
-
-import gzip
-import logging
-
-from collections import Counter
-
-import numpy as np
-
-from .. import DistributionMode
-
-
-logger = logging.getLogger(__name__)
-
-
-class AlignmentCounter:
-    COUNT_HEADER_ELEMENTS = ("raw", "lnorm", "scaled")
-    INITIAL_SIZE = 1000
-    # this may be counter-intuitive
-    # but originates from the samflags 0x10, 0x20,
-    # which explicitly identify the reverse-strandness of the read
-    PLUS_STRAND, MINUS_STRAND = False, True
-
-    @staticmethod
-    def normalise_counts(counts, feature_len, scaling_factor):
-        """Returns raw, length-normalised, and scaled feature counts."""
-        normalised = counts / feature_len
-        scaled = normalised * scaling_factor
-        return counts, normalised, scaled
-
-    def get_increment(self, n_aln, increment):
-        # 1overN = lavern. Maya <3
-        return (increment / n_aln) if self.distribution_mode == DistributionMode.ONE_OVER_N else increment
-
-    def toggle_single_read_handling(self, unmarked_orphans):
-        # precalculate count-increment for single-end, paired-end reads
-        # for mixed input (i.e., paired-end data with single-end reads = orphans from preprocessing),
-        # properly attribute fractional counts to the orphans
-        # Increments:
-        # alignment from single end library read: 1
-        # alignment from paired-end library read: 0.5 / mate (pe_count = 1) or 1 / mate (pe_count = 2)
-        # alignment from paired-end library orphan: 0.5 (pe_count = 1) or 1 (pe_count = 2)
-
-        # old code:
-        # increment = 1 if (not pair or self.paired_end_count == 2) else 0.5
-
-        # if pair:
-        #     increment = 1 if self.paired_end_count == 2 else 0.5
-        # else:
-        #     increment = 0.5 if self.unmarked_orphans else 1
-        self.increments = (
-            (self.paired_end_count / 2.0) if unmarked_orphans else 1.0,
-            self.paired_end_count / 2.0,
-        )
-
-    def __init__(
-        self,
-        distribution_mode=DistributionMode.ONE_OVER_N,
-        strand_specific=False,
-        paired_end_count=1,
-    ):
-        self.distribution_mode = distribution_mode
-        self.strand_specific = strand_specific
-        self.paired_end_count = paired_end_count
-        self.increments = (1.0, 1.0,)
-        self.increments_auto_detect = (1.0, self.paired_end_count / 2.0,)
-        self.unannotated_reads = 0
-
-        self.index = {}
-        self.counts = np.zeros(
-            (AlignmentCounter.INITIAL_SIZE, 2,),
-            dtype='float64',
-        )
-
-    def dump(self, prefix, refmgr):
-        with gzip.open(f"{prefix}.{self.__class__.__name__}.txt.gz", "wt") as _out:
-            for key, key_index in self.index.items():
-                ref, reflen = refmgr.get(key[0] if isinstance(key, tuple) else key)
-                print(key, ref, reflen, self.counts[key_index], sep="\t", file=_out)
-            # for k, v in self.items():
-            # ref, reflen = refmgr.get(k[0] if isinstance(k, tuple) else k)
-            # print(k, ref, reflen, v, sep="\t", file=_out)
-
-    def get(self, key, default_val):
-        key_index = self.index.get(key)
-        if key_index is None:
-            return Counter()
-        return Counter({key: self.counts[key_index]})
-
-    def setdefault(self, key, default_val):
-        ...
-
-    def has_ambig_counts(self):
-        return bool(self.counts[:, 1].sum() != 0)
-
-    def __iter__(self):
-        yield from self.index.keys()
-
-    def __getitem__(self, key):
-        key_index = self.index.get(key)
-        if key_index is None:
-            return 0.0
-        return self.counts[key_index]
-
-    def __setitem__(self, key, value):
-        key_index = self.index.get(key)
-        if key_index is not None:
-            self.counts[key_index] = value
-        else:
-            raise KeyError(f"{key=} not found.")
-
-    def update(self, count_stream, ambiguous_counts=False, pair=False, pe_library=None,):
-        if pe_library is not None:
-            # this is the case when the alignment has a read group tag
-            # if pe_library is True (RG tag '2') -> take paired-end increment (also for orphans)
-            # else (RG tag '1') -> take single-end increment
-            increment = self.increments_auto_detect[pe_library]
-        else:
-            # if the alignment has no (appropriate) read group tag
-            # use the paired-end information instead
-            # if orphan reads are present in the input sam/bam,
-            # the flag `--unmarked_orphans` should be set
-            # otherwise orphan reads will be assigned a count of 1.
-            increment = self.increments[pair]
-
-        contributed_counts = self.update_counts(count_stream, increment=increment, ambiguous_counts=ambiguous_counts,)
-
-        return contributed_counts
-
-    def get_unannotated_reads(self):
-        return self.unannotated_reads
-
-    def get_counts(self, seqid, strand_specific=False):
-        if strand_specific:
-                raise NotImplementedError()
-                uniq_counts, ambig_counts = [0.0, 0.0], [0.0, 0.0]
-                uniq_counts[seqid[1]] = uniq_counter[seqid]
-                ambig_counts[seqid[1]] = ambig_counter[seqid]
-
-                # rid = seqid[0] if isinstance(seqid, tuple) else seqid
-                # uniq_counts = [
-                #     uniq_counter[(rid, AlignmentCounter.PLUS_STRAND)],
-                #     uniq_counter[(rid, AlignmentCounter.MINUS_STRAND)],
-                # ]
-                # ambig_counts = [
-                #     ambig_counter[(rid, AlignmentCounter.PLUS_STRAND)],
-                #     ambig_counter[(rid, AlignmentCounter.MINUS_STRAND)],
-                # ]
-        counts = self[seqid]
-        return np.array((counts[0], counts[2], counts[1], counts[3]))
-
-    def get_all_regions(self):
-        yield from self
-
-    def update_counts(self, count_stream, increment=1, ambiguous_counts=False):
-        contributed_counts = 0
-        for hits, aln_count in count_stream:
-            hit = hits[0]
-            inc = (
-                (
-                    self.get_increment(aln_count, increment),
-                    increment,
-                )
-            )[aln_count == 1]
-            key = (
-                (
-                    hit.rid,
-                    (hit.rid, hit.rev_strand),
-                )
-            )[self.strand_specific]
-
-            key_index = self.index.get(key)
-            if key_index is None:
-                nrows = self.counts.shape[0]
-                if len(self.index) == nrows:
-                    self.counts = np.pad(
-                        self.counts,
-                        ((0, AlignmentCounter.INITIAL_SIZE), (0, 0),),
-                    )
-                # key_index = self.index.setdefault(key, len(self.index))
-                key_index = self.index[key] = len(self.index)
-            self.counts[key_index][int(ambiguous_counts)] += inc
-            contributed_counts += inc
-
-        return contributed_counts
-
-    def transform(self, refmgr):
-        # transform 2-column uniq/ambig count matrix
-        # into 4 columns
-        # uniq_raw, combined_raw, uniq_lnorm, combined_lnorm
-
-        # obtain gene lengths
-        lengths = np.array(
-            tuple(
-                (refmgr.get(key[0] if isinstance(key, tuple) else key))[1]
-                for key in self.index
-            )
-        )
-        logger.info("LENGTHS ARRAY = %s", lengths.shape)
-        logger.info("INDEX SIZE = %s", len(self.index))
-
-        # remove the un-indexed rows
-        self.counts = self.counts[0:len(self.index), :]
-
-        # calculate combined_raw
-        self.counts[:, 1:2] += self.counts[:, 0:1]
-
-        # duplicate the raw counts
-        self.counts = np.concatenate(
-            (self.counts, self.counts,),
-            axis=1,
-        )
-
-        # length-normalise the lnorm columns
-        self.counts[:, 2:4] /= lengths[:, None]
-
-        # return count sums
-        return self.counts.sum(axis=0)
diff --git a/gffquant/counters/count_manager.py b/gffquant/counters/count_manager.py
index d3eb059f..4a37ad9a 100644
--- a/gffquant/counters/count_manager.py
+++ b/gffquant/counters/count_manager.py
@@ -5,7 +5,7 @@
 import numpy as np
 
 from .. import DistributionMode
-from .alignment_counter2 import AlignmentCounter
+from .alignment_counter import AlignmentCounter
 from .region_counter import RegionCounter
 
 
diff --git a/gffquant/counters/region_counter.py b/gffquant/counters/region_counter.py
index 5b36a876..41f2b574 100644
--- a/gffquant/counters/region_counter.py
+++ b/gffquant/counters/region_counter.py
@@ -5,7 +5,7 @@
 from collections import Counter
 
 from .. import DistributionMode
-from .alignment_counter2 import AlignmentCounter
+from .alignment_counter import AlignmentCounter
 
 
 # from count_manager.get_counts()

From 26fbb556dfdefa0e2d962fc666a57b2673fc3c25 Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Sun, 22 Dec 2024 22:52:23 +0100
Subject: [PATCH 038/128] throwing out old code, splitting of
 regioncount_annotator

---
 gffquant/annotation/__init__.py              |  3 +-
 gffquant/annotation/count_annotator.py       | 79 +------------------
 gffquant/annotation/regioncount_annotator.py | 81 ++++++++++++++++++++
 gffquant/counters/alignment_counter.py       | 28 +++----
 4 files changed, 98 insertions(+), 93 deletions(-)
 create mode 100644 gffquant/annotation/regioncount_annotator.py

diff --git a/gffquant/annotation/__init__.py b/gffquant/annotation/__init__.py
index 1649dcb4..2f8e1c0c 100644
--- a/gffquant/annotation/__init__.py
+++ b/gffquant/annotation/__init__.py
@@ -3,6 +3,7 @@
 """ module docstring """
 
 # from .count_annotator import GeneCountAnnotator, RegionCountAnnotator
-from .count_annotator import RegionCountAnnotator
+from .count_annotator import CountAnnotator
 from .count_writer import CountWriter
 from .genecount_annotator import GeneCountAnnotator
+from .regioncount_annotator import RegionCountAnnotator
diff --git a/gffquant/annotation/count_annotator.py b/gffquant/annotation/count_annotator.py
index 12d811ae..669eb325 100644
--- a/gffquant/annotation/count_annotator.py
+++ b/gffquant/annotation/count_annotator.py
@@ -8,7 +8,7 @@
 
 import numpy as np
 
-from ..counters.count_manager import CountManager, AlignmentCounter
+from ..counters.count_manager import AlignmentCounter
 
 
 logger = logging.getLogger(__name__)
@@ -193,80 +193,3 @@ def compute_count_vector(
         counts[1::2] /= float(length)
 
         return counts
-
-
-class RegionCountAnnotator(CountAnnotator):
-    """ CountAnnotator subclass for contig/region-based counting. """
-
-    def __init__(self, strand_specific, report_scaling_factors=True):
-        CountAnnotator.__init__(self, strand_specific, report_scaling_factors=report_scaling_factors)
-
-    # pylint: disable=R0914,W0613
-    def annotate(self, refmgr, db, counter: AlignmentCounter, gene_group_db=False):
-        """
-        Annotate a set of region counts via db-lookup.
-        input:
-        - bam: bamr.BamFile to use as lookup table for reference names
-        - db: GffDatabaseManager holding functional annotation database
-        - count_manager: count_data
-        """
-        # for rid in set(count_manager.uniq_regioncounts).union(
-        #     count_manager.ambig_regioncounts
-        # ):
-        for rid in count_manager.get_all_regions(region_counts=True):
-            ref = refmgr.get(rid[0] if isinstance(rid, tuple) else rid)[0]
-
-            for region in count_manager.get_regions(rid):
-                if self.strand_specific:
-                    (start, end), rev_strand = region
-                else:
-                    (start, end), rev_strand = region, None
-                # the region_annotation is a tuple of key-value pairs:
-                # (strand, func_category1: subcategories, func_category2: subcategories, ...)
-                # the first is the strand, the second is the gene id, the rest are the features
-
-                region_annotation = db.query_sequence(ref, start=start, end=end)
-                if region_annotation is not None:
-                    region_strand, feature_id, region_annotation = region_annotation
-                    if feature_id is None:
-                        feature_id = ref
-
-                    on_other_strand = (region_strand == "+" and rev_strand) \
-                        or (region_strand == "-" and not rev_strand)
-
-                    antisense_region = self.strand_specific and on_other_strand
-
-                    uniq_counts, ambig_counts = count_manager.get_counts(
-                        (rid, start, end), region_counts=True, strand_specific=self.strand_specific
-                    )
-
-                    if self.strand_specific:
-                        # if the region is antisense, 'sense-counts' (relative to the) region come from the
-                        # negative strand and 'antisense-counts' from the positive strand
-                        # vice-versa for a sense-region
-                        strand_specific_counts = (
-                            (count_manager.MINUS_STRAND, count_manager.PLUS_STRAND)
-                            if antisense_region
-                            else (count_manager.PLUS_STRAND, count_manager.MINUS_STRAND)
-                        )
-                    else:
-                        strand_specific_counts = None
-
-                    region_length = end - start + 1
-                    counts = self.compute_count_vector(
-                        uniq_counts,
-                        ambig_counts,
-                        region_length,
-                        strand_specific_counts=strand_specific_counts,
-                        region_counts=True,
-                    )
-
-                    self.distribute_feature_counts(counts, region_annotation)
-
-                    gcounts = self.gene_counts.setdefault(
-                        feature_id, np.zeros(self.bins)
-                    )
-                    gcounts += counts
-                    self.total_gene_counts += counts[:4]
-
-        self.calculate_scaling_factors()
diff --git a/gffquant/annotation/regioncount_annotator.py b/gffquant/annotation/regioncount_annotator.py
new file mode 100644
index 00000000..8db52e1c
--- /dev/null
+++ b/gffquant/annotation/regioncount_annotator.py
@@ -0,0 +1,81 @@
+import numpy as np
+
+from . import CountAnnotator
+from ..counters import AlignmentCounter
+
+
+class RegionCountAnnotator(CountAnnotator):
+    """ CountAnnotator subclass for contig/region-based counting. """
+
+    def __init__(self, strand_specific, report_scaling_factors=True):
+        CountAnnotator.__init__(self, strand_specific, report_scaling_factors=report_scaling_factors)
+
+    # pylint: disable=R0914,W0613
+    def annotate(self, refmgr, db, counter: AlignmentCounter, gene_group_db=False):
+        """
+        Annotate a set of region counts via db-lookup.
+        input:
+        - bam: bamr.BamFile to use as lookup table for reference names
+        - db: GffDatabaseManager holding functional annotation database
+        - count_manager: count_data
+        """
+        # for rid in set(count_manager.uniq_regioncounts).union(
+        #     count_manager.ambig_regioncounts
+        # ):
+        for rid in counter.get_all_regions(region_counts=True):
+            ref = refmgr.get(rid[0] if isinstance(rid, tuple) else rid)[0]
+
+            for region in counter.get_regions(rid):
+                if self.strand_specific:
+                    (start, end), rev_strand = region
+                else:
+                    (start, end), rev_strand = region, None
+                # the region_annotation is a tuple of key-value pairs:
+                # (strand, func_category1: subcategories, func_category2: subcategories, ...)
+                # the first is the strand, the second is the gene id, the rest are the features
+
+                region_annotation = db.query_sequence(ref, start=start, end=end)
+                if region_annotation is not None:
+                    region_strand, feature_id, region_annotation = region_annotation
+                    if feature_id is None:
+                        feature_id = ref
+
+                    on_other_strand = (region_strand == "+" and rev_strand) \
+                        or (region_strand == "-" and not rev_strand)
+
+                    antisense_region = self.strand_specific and on_other_strand
+
+                    uniq_counts, ambig_counts = counter.get_counts(
+                        (rid, start, end), region_counts=True, strand_specific=self.strand_specific
+                    )
+
+                    if self.strand_specific:
+                        # if the region is antisense, 'sense-counts' (relative to the) region come from the
+                        # negative strand and 'antisense-counts' from the positive strand
+                        # vice-versa for a sense-region
+                        strand_specific_counts = (
+                            (counter.MINUS_STRAND, counter.PLUS_STRAND)
+                            if antisense_region
+                            else (counter.PLUS_STRAND, counter.MINUS_STRAND)
+                        )
+                    else:
+                        strand_specific_counts = None
+
+                    region_length = end - start + 1
+                    counts = self.compute_count_vector(
+                        uniq_counts,
+                        ambig_counts,
+                        region_length,
+                        strand_specific_counts=strand_specific_counts,
+                        region_counts=True,
+                    )
+
+                    self.distribute_feature_counts(counts, region_annotation)
+
+                    gcounts = self.gene_counts.setdefault(
+                        feature_id, np.zeros(self.bins)
+                    )
+                    gcounts += counts
+                    self.total_gene_counts += counts[:4]
+
+        self.calculate_scaling_factors()
diff --git a/gffquant/counters/alignment_counter.py b/gffquant/counters/alignment_counter.py
index 8fc96d2c..16e53b54 100644
--- a/gffquant/counters/alignment_counter.py
+++ b/gffquant/counters/alignment_counter.py
@@ -134,20 +134,20 @@ def get_unannotated_reads(self):
 
     def get_counts(self, seqid, strand_specific=False):
         if strand_specific:
-                raise NotImplementedError()
-                uniq_counts, ambig_counts = [0.0, 0.0], [0.0, 0.0]
-                uniq_counts[seqid[1]] = uniq_counter[seqid]
-                ambig_counts[seqid[1]] = ambig_counter[seqid]
-
-                # rid = seqid[0] if isinstance(seqid, tuple) else seqid
-                # uniq_counts = [
-                #     uniq_counter[(rid, AlignmentCounter.PLUS_STRAND)],
-                #     uniq_counter[(rid, AlignmentCounter.MINUS_STRAND)],
-                # ]
-                # ambig_counts = [
-                #     ambig_counter[(rid, AlignmentCounter.PLUS_STRAND)],
-                #     ambig_counter[(rid, AlignmentCounter.MINUS_STRAND)],
-                # ]
+            raise NotImplementedError()
+            # uniq_counts, ambig_counts = [0.0, 0.0], [0.0, 0.0]
+            # uniq_counts[seqid[1]] = uniq_counter[seqid]
+            # ambig_counts[seqid[1]] = ambig_counter[seqid]
+
+            # rid = seqid[0] if isinstance(seqid, tuple) else seqid
+            # uniq_counts = [
+            #     uniq_counter[(rid, AlignmentCounter.PLUS_STRAND)],
+            #     uniq_counter[(rid, AlignmentCounter.MINUS_STRAND)],
+            # ]
+            # ambig_counts = [
+            #     ambig_counter[(rid, AlignmentCounter.PLUS_STRAND)],
+            #     ambig_counter[(rid, AlignmentCounter.MINUS_STRAND)],
+            # ]
         counts = self[seqid]
         return np.array((counts[0], counts[2], counts[1], counts[3]))
 

From bd1436f4812d9f7ac3505e7896a3fb50a836347e Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Sun, 22 Dec 2024 22:58:35 +0100
Subject: [PATCH 039/128] removed count_manager

---
 gffquant/counters/__init__.py      |   1 -
 gffquant/counters/count_manager.py | 222 -----------------------------
 2 files changed, 223 deletions(-)
 delete mode 100644 gffquant/counters/count_manager.py

diff --git a/gffquant/counters/__init__.py b/gffquant/counters/__init__.py
index 34d30242..d325316e 100644
--- a/gffquant/counters/__init__.py
+++ b/gffquant/counters/__init__.py
@@ -5,4 +5,3 @@
 
 from .alignment_counter import AlignmentCounter
 from .region_counter import RegionCounter
-from .count_manager import CountManager
diff --git a/gffquant/counters/count_manager.py b/gffquant/counters/count_manager.py
deleted file mode 100644
index 4a37ad9a..00000000
--- a/gffquant/counters/count_manager.py
+++ /dev/null
@@ -1,222 +0,0 @@
-"""count_manager"""
-
-from collections import Counter
-
-import numpy as np
-
-from .. import DistributionMode
-from .alignment_counter import AlignmentCounter
-from .region_counter import RegionCounter
-
-
-# pylint: disable=R0902
-class CountManager:
-    # this may be counter-intuitive
-    # but originates from the samflags 0x10, 0x20,
-    # which also identify the reverse-strandness of the read
-    # and not the forward-strandness
-    PLUS_STRAND, MINUS_STRAND = False, True
-
-    def toggle_single_read_handling(self, unmarked_orphans):
-        # precalculate count-increment for single-end, paired-end reads
-        # for mixed input (i.e., paired-end data with single-end reads = orphans from preprocessing),
-        # properly attribute fractional counts to the orphans
-        # Increments:
-        # alignment from single end library read: 1
-        # alignment from paired-end library read: 0.5 / mate (pe_count = 1) or 1 / mate (pe_count = 2)
-        # alignment from paired-end library orphan: 0.5 (pe_count = 1) or 1 (pe_count = 2)
-
-        # old code:
-        # increment = 1 if (not pair or self.paired_end_count == 2) else 0.5
-
-        # if pair:
-        #     increment = 1 if self.paired_end_count == 2 else 0.5
-        # else:
-        #     increment = 0.5 if self.unmarked_orphans else 1
-        self.increments = [
-            (self.paired_end_count / 2.0) if unmarked_orphans else 1.0,
-            self.paired_end_count / 2.0
-        ]
-
-    def __init__(
-        # pylint: disable=W0613,R0913
-        self,
-        distribution_mode=DistributionMode.ONE_OVER_N,
-        region_counts=True,
-        strand_specific=False,
-        paired_end_count=1,
-    ):
-        self.distribution_mode = distribution_mode
-        self.strand_specific = strand_specific
-        self.paired_end_count = paired_end_count
-        self.increments = [1.0, 1.0]
-        self.increments_auto_detect = [1.0, self.paired_end_count / 2.0]
-
-        # self.uniq_seqcounts, self.ambig_seqcounts = None, None
-        # self.uniq_regioncounts, self.ambig_regioncounts = None, None
-        self.seqcounts, self.regioncounts = None, None
-
-        if region_counts:
-            # self.uniq_regioncounts = RegionCounter(strand_specific=strand_specific)
-            # self.ambig_regioncounts = RegionCounter(
-            #     strand_specific=strand_specific,
-            #     distribution_mode=distribution_mode,
-            # )
-            self.regioncounts = RegionCounter(
-                strand_specific=strand_specific,
-                distribution_mode=distribution_mode,
-            )
-
-        else:
-            # self.uniq_seqcounts = AlignmentCounter(strand_specific=strand_specific)
-            # self.ambig_seqcounts = AlignmentCounter(
-            #     strand_specific=strand_specific,
-            #     distribution_mode=distribution_mode
-            # )
-            self.seqcounts = AlignmentCounter(
-                strand_specific=strand_specific,
-                distribution_mode=distribution_mode,
-            )
-
-    def has_ambig_counts(self):
-        return any(
-            (
-                self.seqcounts and self.seqcounts.has_ambig_counts(),
-                self.regioncounts and self.regioncounts.has_ambig_counts(),
-            )
-        )
-        # return self.ambig_regioncounts or self.ambig_seqcounts
-
-    def update_counts(self, count_stream, ambiguous_counts=False, pair=False, pe_library=None):
-        # seq_counter, region_counter = (
-        #     (self.uniq_seqcounts, self.uniq_regioncounts)
-        #     if not ambiguous_counts
-        #     else (self.ambig_seqcounts, self.ambig_regioncounts)
-        # )
-
-        if pe_library is not None:
-            # this is the case when the alignment has a read group tag
-            # if pe_library is True (RG tag '2') -> take paired-end increment (also for orphans)
-            # else (RG tag '1') -> take single-end increment
-            increment = self.increments_auto_detect[pe_library]
-        else:
-            # if the alignment has no (appropriate) read group tag
-            # use the paired-end information instead
-            # if orphan reads are present in the input sam/bam,
-            # the flag `--unmarked_orphans` should be set
-            # otherwise orphan reads will be assigned a count of 1.
-            increment = self.increments[pair]
-
-        contributed_counts = 0
-        if self.seqcounts is not None:
-            contributed_counts = self.seqcounts.update_counts(count_stream, increment=increment, ambiguous_counts=ambiguous_counts,)
-        elif self.regioncounts is not None:
-            contributed_counts = self.regioncounts.update_counts(count_stream, increment=increment, ambiguous_counts=ambiguous_counts,)
-        # if seq_counter is not None:
-            # contributed_counts = seq_counter.update_counts(count_stream, increment=increment)
-        # elif region_counter is not None:
-            # contributed_counts = region_counter.update_counts(count_stream, increment=increment)
-
-        return contributed_counts
-
-    def dump_raw_counters(self, prefix, refmgr):
-        # if self.uniq_seqcounts is not None:
-        #     self.uniq_seqcounts.dump(prefix, refmgr)
-        # if self.ambig_seqcounts is not None:
-        #     self.ambig_seqcounts.dump(prefix, refmgr)
-        # if self.uniq_regioncounts is not None:
-        #     self.uniq_regioncounts.dump(prefix, refmgr)
-        # if self.ambig_regioncounts is not None:
-        #     self.ambig_regioncounts.dump(prefix, refmgr)
-        ...
-
-    def get_unannotated_reads(self):
-        unannotated_reads = 0
-
-        # if self.uniq_regioncounts is not None:
-        #     unannotated_reads += self.uniq_regioncounts.unannotated_reads
-        # if self.ambig_regioncounts is not None:
-        #     unannotated_reads += self.ambig_regioncounts.unannotated_reads
-        # if self.uniq_seqcounts is not None:
-        #     unannotated_reads += self.uniq_seqcounts.unannotated_reads
-        # if self.ambig_seqcounts is not None:
-        #     unannotated_reads += self.ambig_seqcounts.unannotated_reads
-        if self.regioncounts is not None:
-            unannotated_reads += self.regioncounts.unannotated_reads
-        if self.seqcounts is not None:
-            unannotated_reads += self.seqcounts.unannotated_reads
-
-        return unannotated_reads
-
-    def get_counts(self, seqid, region_counts=False, strand_specific=False):
-        if region_counts:
-            raise NotImplementedError()
-            rid, seqid = seqid[0], seqid[1:]
-
-            uniq_counter = self.uniq_regioncounts.get(rid, Counter())
-            ambig_counter = self.ambig_regioncounts.get(rid, Counter())
-
-            # pylint: disable=R1720
-            if strand_specific:
-                raise NotImplementedError
-            else:
-                return [uniq_counter[seqid]], [ambig_counter[seqid]]
-
-        else:
-            # uniq_counter, ambig_counter = self.uniq_seqcounts, self.ambig_seqcounts
-
-            if strand_specific:
-                raise NotImplementedError()
-                uniq_counts, ambig_counts = [0.0, 0.0], [0.0, 0.0]
-                uniq_counts[seqid[1]] = uniq_counter[seqid]
-                ambig_counts[seqid[1]] = ambig_counter[seqid]
-
-                # rid = seqid[0] if isinstance(seqid, tuple) else seqid
-                # uniq_counts = [
-                #     uniq_counter[(rid, CountManager.PLUS_STRAND)],
-                #     uniq_counter[(rid, CountManager.MINUS_STRAND)],
-                # ]
-                # ambig_counts = [
-                #     ambig_counter[(rid, CountManager.PLUS_STRAND)],
-                #     ambig_counter[(rid, CountManager.MINUS_STRAND)],
-                # ]
-            else:
-                # uniq_counts, ambig_counts = [uniq_counter[seqid]], [ambig_counter[seqid]]
-                # uniq_counts, ambig_counts = [self.seqcounts[seqid][0]], [self.seqcounts[seqid][1]]
-                counts = self.seqcounts[seqid]
-
-            # return uniq_counts, ambig_counts
-            return np.array((counts[0], counts[2], counts[1], counts[3]))
-
-    def get_regions(self, rid):
-        # return set(self.uniq_regioncounts.get(rid, set())).union(
-        #     self.ambig_regioncounts.get(rid, set())
-        # )
-        return set(self.uniq_regioncounts.get(rid, Counter())).union(
-            self.ambig_regioncounts.get(rid, Counter())
-        )
-
-    def get_all_regions(self, region_counts=False):
-        # uniq_counts, ambig_counts = (
-        #     (self.uniq_seqcounts, self.ambig_seqcounts,),
-        #     (self.uniq_regioncounts, self.ambig_regioncounts,),
-        # )[region_counts]
-        # yield from set(uniq_counts).union(ambig_counts)
-        counts = (
-            self.seqcounts,
-            self.regioncounts,
-        )[region_counts]
-
-        yield from counts
-
-    def transform_counts(self, refmgr):
-        if self.seqcounts is not None:
-            return self.seqcounts.transform(refmgr)
-        if self.regioncounts is not None:
-            return self.regioncounts.transform(refmgr)
-
-    def dump(self, prefix, refmgr):
-        if self.seqcounts is not None:
-            self.seqcounts.dump(prefix, refmgr)
-        if self.regioncounts is not None:
-            self.regioncounts.dump(prefix, refmgr)

From b460ebdd38437f8889f3429e76a641eb5f7933d6 Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Sun, 22 Dec 2024 23:00:35 +0100
Subject: [PATCH 040/128] removed count_manager references

---
 gffquant/annotation/count_annotator.py       | 2 +-
 gffquant/annotation/genecount_annotator.py   | 3 +--
 gffquant/annotation/regioncount_annotator.py | 4 ----
 gffquant/profilers/feature_quantifier.py     | 7 -------
 4 files changed, 2 insertions(+), 14 deletions(-)

diff --git a/gffquant/annotation/count_annotator.py b/gffquant/annotation/count_annotator.py
index 669eb325..f92b6472 100644
--- a/gffquant/annotation/count_annotator.py
+++ b/gffquant/annotation/count_annotator.py
@@ -8,7 +8,7 @@
 
 import numpy as np
 
-from ..counters.count_manager import AlignmentCounter
+from ..counters import AlignmentCounter
 
 
 logger = logging.getLogger(__name__)
diff --git a/gffquant/annotation/genecount_annotator.py b/gffquant/annotation/genecount_annotator.py
index bbd7581d..6e84cf95 100644
--- a/gffquant/annotation/genecount_annotator.py
+++ b/gffquant/annotation/genecount_annotator.py
@@ -17,9 +17,8 @@ def __init__(self, strand_specific, report_scaling_factors=True):
 
     def annotate(self, refmgr, db, counter: AlignmentCounter, gene_group_db=False):
         """ Annotate a set of gene counts with functional annotations. """
-        self.total_gene_counts = counter.transform(refmgr)  # count_manager.transform_counts(refmgr)
+        self.total_gene_counts = counter.transform(refmgr)
         logger.info("TOTAL_GENE_COUNTS = %s", self.total_gene_counts)
-        # self.total_counts = self.total_gene_counts  # ?
 
         # formerly used in compute_count_vector
         strand_specific_counts = (
diff --git a/gffquant/annotation/regioncount_annotator.py b/gffquant/annotation/regioncount_annotator.py
index 8db52e1c..6d719413 100644
--- a/gffquant/annotation/regioncount_annotator.py
+++ b/gffquant/annotation/regioncount_annotator.py
@@ -17,11 +17,7 @@ def annotate(self, refmgr, db, counter: AlignmentCounter, gene_group_db=False):
         input:
         - bam: bamr.BamFile to use as lookup table for reference names
         - db: GffDatabaseManager holding functional annotation database
-        - count_manager: count_data
         """
-        # for rid in set(count_manager.uniq_regioncounts).union(
-        #     count_manager.ambig_regioncounts
-        # ):
         for rid in counter.get_all_regions(region_counts=True):
             ref = refmgr.get(rid[0] if isinstance(rid, tuple) else rid)[0]
 
diff --git a/gffquant/profilers/feature_quantifier.py b/gffquant/profilers/feature_quantifier.py
index b3b08a9f..a26e9508 100644
--- a/gffquant/profilers/feature_quantifier.py
+++ b/gffquant/profilers/feature_quantifier.py
@@ -122,7 +122,6 @@ def process_counters(
             self.adm = AnnotationDatabaseManager.from_db(self.db, in_memory=in_memory)
 
         if dump_counters:
-            # self.count_manager.dump_raw_counters(self.out_prefix, self.reference_manager)
             self.counter.dump(self.out_prefix, self.reference_manager,)
 
         report_scaling_factors = restrict_reports is None or "scaled" in restrict_reports
@@ -134,7 +133,6 @@ def process_counters(
 
         count_writer = CountWriter(
             self.out_prefix,
-            # has_ambig_counts=self.count_manager.has_ambig_counts(),
             has_ambig_counts=self.counter.has_ambig_counts(),
             strand_specific=self.strand_specific,
             restrict_reports=restrict_reports,
@@ -143,7 +141,6 @@ def process_counters(
             filtered_readcount=self.aln_counter["filtered_read_count"],
         )
 
-        # unannotated_reads = self.count_manager.get_unannotated_reads()
         unannotated_reads = self.counter.get_unannotated_reads()
         unannotated_reads += self.aln_counter["unannotated_ambig"]
 
@@ -154,8 +151,6 @@ def process_counters(
         )
 
         count_writer.write_gene_counts(
-            # count_annotator.gene_counts,
-            # self.count_manager,
             self.counter,
             self.reference_manager,
             count_annotator.scaling_factors["total_gene_uniq"],
@@ -198,7 +193,6 @@ def process_alignments(
             filtered_sam=debug_samfile,
         )
 
-        # self.count_manager.toggle_single_read_handling(unmarked_orphans)
         self.counter.toggle_single_read_handling(unmarked_orphans)
         ac = self.aln_counter
 
@@ -420,7 +414,6 @@ def process_alignment_group(self, aln_group, aln_reader):
                 )
             )
 
-            # contributed_counts = self.count_manager.update_counts(
             contributed_counts = self.counter.update(
                 count_stream,
                 ambiguous_counts=is_ambiguous_group,

From 178955119ce0cc3eeeda52909b7664ea44fecb3f Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Mon, 23 Dec 2024 21:05:15 +0100
Subject: [PATCH 041/128] modified gene_count write behaviour in prep of ggroup
 annotation

---
 gffquant/alignment/aln_group.py            |  2 ++
 gffquant/annotation/count_annotator.py     | 26 +++++++++----------
 gffquant/annotation/genecount_annotator.py | 12 +++++++--
 gffquant/counters/alignment_counter.py     | 17 ++++++++++---
 gffquant/profilers/feature_quantifier.py   | 29 +++++++++++++++-------
 5 files changed, 59 insertions(+), 27 deletions(-)

diff --git a/gffquant/alignment/aln_group.py b/gffquant/alignment/aln_group.py
index 057b5a3f..b25f5758 100644
--- a/gffquant/alignment/aln_group.py
+++ b/gffquant/alignment/aln_group.py
@@ -79,6 +79,8 @@ def get_all_hits(self, as_ambiguous=False):
                     except TypeError as err:
                         raise TypeError(f"Cannot derive sequencing library from tags: {aln.tags}") from err
 
+                # in region mode, there can be more hits (if the alignment overlaps multiple features of the target sequence)
+                # in gene mode, each alignment is a hit, i.e. there is at most 1 hit / alignment
                 yield aln.hits, n_aln
 
     def get_ambig_align_counts(self):
diff --git a/gffquant/annotation/count_annotator.py b/gffquant/annotation/count_annotator.py
index f92b6472..3598d285 100644
--- a/gffquant/annotation/count_annotator.py
+++ b/gffquant/annotation/count_annotator.py
@@ -112,19 +112,19 @@ def calc_scaling_factor(raw, normed, default=0):
         )
 
         # total_uniq, total_uniq_normed, total_ambi, total_ambi_normed = self.total_gene_counts
-        total_uniq, total_ambi, total_uniq_normed, total_ambi_normed = self.total_gene_counts
-        logger.info(
-            "TOTAL GENE COUNTS: uraw=%s unorm=%s araw=%s anorm=%s",
-            total_uniq, total_uniq_normed, total_ambi, total_ambi_normed
-        )
-
-        self.scaling_factors["total_gene_uniq"] = calc_scaling_factor(
-            total_uniq, total_uniq_normed, default_scaling_factor
-        )
-
-        self.scaling_factors["total_gene_ambi"] = calc_scaling_factor(
-            total_ambi, total_ambi_normed, default_scaling_factor
-        )
+        # total_uniq, total_ambi, total_uniq_normed, total_ambi_normed = self.total_gene_counts
+        # logger.info(
+        #     "TOTAL GENE COUNTS: uraw=%s unorm=%s araw=%s anorm=%s",
+        #     total_uniq, total_uniq_normed, total_ambi, total_ambi_normed
+        # )
+
+        # self.scaling_factors["total_gene_uniq"] = calc_scaling_factor(
+        #     total_uniq, total_uniq_normed, default_scaling_factor
+        # )
+
+        # self.scaling_factors["total_gene_ambi"] = calc_scaling_factor(
+        #     total_ambi, total_ambi_normed, default_scaling_factor
+        # )
 
         fc_items = self.feature_count_sums.items()
         for category, (
diff --git a/gffquant/annotation/genecount_annotator.py b/gffquant/annotation/genecount_annotator.py
index 6e84cf95..dd405f6c 100644
--- a/gffquant/annotation/genecount_annotator.py
+++ b/gffquant/annotation/genecount_annotator.py
@@ -2,6 +2,7 @@
 import logging
 
 from .count_annotator import CountAnnotator
+from .count_writer import CountWriter
 from ..counters import AlignmentCounter
 
 
@@ -17,8 +18,15 @@ def __init__(self, strand_specific, report_scaling_factors=True):
 
     def annotate(self, refmgr, db, counter: AlignmentCounter, gene_group_db=False):
         """ Annotate a set of gene counts with functional annotations. """
-        self.total_gene_counts = counter.transform(refmgr)
-        logger.info("TOTAL_GENE_COUNTS = %s", self.total_gene_counts)
+        # self.total_gene_counts, u_sf, a_sf = counter.generate_gene_count_matrix(refmgr)
+        # logger.info("TOTAL_GENE_COUNTS = %s", self.total_gene_counts)
+
+        # writer.write_gene_counts(
+        #     counter,
+        #     refmgr,
+        #     u_sf, a_sf,
+        #     gene_group_db=gene_group_db,
+        # )
 
         # formerly used in compute_count_vector
         strand_specific_counts = (
diff --git a/gffquant/counters/alignment_counter.py b/gffquant/counters/alignment_counter.py
index 16e53b54..e377d67e 100644
--- a/gffquant/counters/alignment_counter.py
+++ b/gffquant/counters/alignment_counter.py
@@ -186,7 +186,7 @@ def update_counts(self, count_stream, increment=1, ambiguous_counts=False):
 
         return contributed_counts
 
-    def transform(self, refmgr):
+    def generate_gene_count_matrix(self, refmgr):
         # transform 2-column uniq/ambig count matrix
         # into 4 columns
         # uniq_raw, combined_raw, uniq_lnorm, combined_lnorm
@@ -216,5 +216,16 @@ def transform(self, refmgr):
         # length-normalise the lnorm columns
         self.counts[:, 2:4] /= lengths[:, None]
 
-        # return count sums
-        return self.counts.sum(axis=0)
+        count_sums = self.counts.sum(axis=0)
+
+        uniq_scaling_factor = (count_sums[0] / count_sums[2], 1.0)[count_sums[2] == 0]
+        ambig_scaling_factor = (count_sums[1] / count_sums[3], 1.0)[count_sums[3] == 0]
+
+        logger.info(
+            "AC:: TOTAL GENE COUNTS: uraw=%s unorm=%s araw=%s anorm=%s => SF: %s %s",
+            count_sums[0], count_sums[2], count_sums[1], count_sums[3],
+            uniq_scaling_factor, ambig_scaling_factor,            
+        )
+
+        # return count sums and scaling factors
+        return count_sums, uniq_scaling_factor, ambig_scaling_factor
diff --git a/gffquant/profilers/feature_quantifier.py b/gffquant/profilers/feature_quantifier.py
index a26e9508..6f9a4558 100644
--- a/gffquant/profilers/feature_quantifier.py
+++ b/gffquant/profilers/feature_quantifier.py
@@ -129,8 +129,6 @@ def process_counters(
         Annotator = (GeneCountAnnotator, RegionCountAnnotator)[self.run_mode.overlap_required]
         count_annotator = Annotator(self.strand_specific, report_scaling_factors=report_scaling_factors)
 
-        count_annotator.annotate(self.reference_manager, self.adm, self.counter, gene_group_db=gene_group_db,)
-
         count_writer = CountWriter(
             self.out_prefix,
             has_ambig_counts=self.counter.has_ambig_counts(),
@@ -141,6 +139,19 @@ def process_counters(
             filtered_readcount=self.aln_counter["filtered_read_count"],
         )
 
+        count_annotator.annotate(self.reference_manager, self.adm, self.counter, gene_group_db=gene_group_db,)
+
+        total_gene_counts, u_sf, a_sf = self.counter.generate_gene_count_matrix(self.reference_manager)
+        logger.info("TOTAL_GENE_COUNTS = %s", total_gene_counts)
+
+        count_writer.write_gene_counts(
+            self.counter,
+            self.reference_manager,
+            u_sf, a_sf,
+            gene_group_db=gene_group_db,
+        )
+
+
         unannotated_reads = self.counter.get_unannotated_reads()
         unannotated_reads += self.aln_counter["unannotated_ambig"]
 
@@ -150,13 +161,13 @@ def process_counters(
             (None, unannotated_reads)[report_unannotated],
         )
 
-        count_writer.write_gene_counts(
-            self.counter,
-            self.reference_manager,
-            count_annotator.scaling_factors["total_gene_uniq"],
-            count_annotator.scaling_factors["total_gene_ambi"],
-            gene_group_db=gene_group_db,
-        )
+        # count_writer.write_gene_counts(
+        #     self.counter,
+        #     self.reference_manager,
+        #     count_annotator.scaling_factors["total_gene_uniq"],
+        #     count_annotator.scaling_factors["total_gene_ambi"],
+        #     gene_group_db=gene_group_db,
+        # )
 
         self.adm.clear_caches()
 

From fdd7aafa0408de2c2879cc20c94c26cafde6dccf Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Mon, 23 Dec 2024 21:16:33 +0100
Subject: [PATCH 042/128] modified gene_count write behaviour in prep of ggroup
 annotation

---
 gffquant/profilers/feature_quantifier.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/gffquant/profilers/feature_quantifier.py b/gffquant/profilers/feature_quantifier.py
index 6f9a4558..f6ec4ea3 100644
--- a/gffquant/profilers/feature_quantifier.py
+++ b/gffquant/profilers/feature_quantifier.py
@@ -139,11 +139,11 @@ def process_counters(
             filtered_readcount=self.aln_counter["filtered_read_count"],
         )
 
-        count_annotator.annotate(self.reference_manager, self.adm, self.counter, gene_group_db=gene_group_db,)
-
         total_gene_counts, u_sf, a_sf = self.counter.generate_gene_count_matrix(self.reference_manager)
         logger.info("TOTAL_GENE_COUNTS = %s", total_gene_counts)
 
+        count_annotator.annotate(self.reference_manager, self.adm, self.counter, gene_group_db=gene_group_db,)
+
         count_writer.write_gene_counts(
             self.counter,
             self.reference_manager,

From 4af2f1449079376aae279c2195e1c59fb20eb95d Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Tue, 24 Dec 2024 00:14:24 +0100
Subject: [PATCH 043/128] change gene group handling during annotation

---
 gffquant/annotation/genecount_annotator.py |  6 ++++--
 gffquant/counters/alignment_counter.py     | 23 ++++++++++++++++++++++
 2 files changed, 27 insertions(+), 2 deletions(-)

diff --git a/gffquant/annotation/genecount_annotator.py b/gffquant/annotation/genecount_annotator.py
index dd405f6c..aac0b11a 100644
--- a/gffquant/annotation/genecount_annotator.py
+++ b/gffquant/annotation/genecount_annotator.py
@@ -36,11 +36,13 @@ def annotate(self, refmgr, db, counter: AlignmentCounter, gene_group_db=False):
 
         for rid in counter.get_all_regions():
             counts = counter.get_counts(rid, strand_specific=self.strand_specific)
+            
             ref, _ = refmgr.get(rid[0] if isinstance(rid, tuple) else rid)
 
             if gene_group_db:
-                ref_tokens = ref.split(".")
-                gene_id, ggroup_id = ".".join(ref_tokens[:-1]), ref_tokens[-1]
+                # ref_tokens = ref.split(".")
+                # gene_id, ggroup_id = ".".join(ref_tokens[:-1]), ref_tokens[-1]
+                gene_id, ggroup_id = rid, rid
             else:
                 gene_id, ggroup_id = ref, ref
 
diff --git a/gffquant/counters/alignment_counter.py b/gffquant/counters/alignment_counter.py
index e377d67e..bbcd062e 100644
--- a/gffquant/counters/alignment_counter.py
+++ b/gffquant/counters/alignment_counter.py
@@ -229,3 +229,26 @@ def generate_gene_count_matrix(self, refmgr):
 
         # return count sums and scaling factors
         return count_sums, uniq_scaling_factor, ambig_scaling_factor
+    
+    def group_gene_count_matrix(self, refmgr):
+        ggroup_index = {}
+        for key, key_index in self.index.items():
+            ref = (refmgr.get(key[0] if isinstance(key, tuple) else key))[0]
+            ref_tokens = ref.split(".")
+            _, ggroup_id = ".".join(ref_tokens[:-1]), ref_tokens[-1]
+            g_key_index = ggroup_index.get(ggroup_id)
+            if g_key_index is None:
+                g_key_index = ggroup_index[ggroup_id] = len(ggroup_index)
+            else:
+                # only add counts if group has been encountered before
+                # else there will be duplicates
+                self.counts[g_key_index] += self.counts[key_index]
+
+        # replace index with grouped index
+        self.index = ggroup_index
+
+        # remove the un-indexed (ungrouped) rows
+        self.counts = self.counts[0:len(self.index), :]
+
+
+    

From 39ebd15f7619289e7292b698b5ca37c79a4352ea Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Tue, 24 Dec 2024 00:35:45 +0100
Subject: [PATCH 044/128] change gene group handling during annotation

---
 gffquant/profilers/feature_quantifier.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/gffquant/profilers/feature_quantifier.py b/gffquant/profilers/feature_quantifier.py
index f6ec4ea3..988202d3 100644
--- a/gffquant/profilers/feature_quantifier.py
+++ b/gffquant/profilers/feature_quantifier.py
@@ -142,7 +142,6 @@ def process_counters(
         total_gene_counts, u_sf, a_sf = self.counter.generate_gene_count_matrix(self.reference_manager)
         logger.info("TOTAL_GENE_COUNTS = %s", total_gene_counts)
 
-        count_annotator.annotate(self.reference_manager, self.adm, self.counter, gene_group_db=gene_group_db,)
 
         count_writer.write_gene_counts(
             self.counter,
@@ -151,6 +150,9 @@ def process_counters(
             gene_group_db=gene_group_db,
         )
 
+        self.counter.group_gene_count_matrix(self.reference_manager)
+
+        count_annotator.annotate(self.reference_manager, self.adm, self.counter, gene_group_db=gene_group_db,)
 
         unannotated_reads = self.counter.get_unannotated_reads()
         unannotated_reads += self.aln_counter["unannotated_ambig"]

From c0b466479e9e4df4867c9593954403f1f3c883f1 Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Tue, 24 Dec 2024 00:45:11 +0100
Subject: [PATCH 045/128] change gene group handling during annotation

---
 gffquant/annotation/genecount_annotator.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gffquant/annotation/genecount_annotator.py b/gffquant/annotation/genecount_annotator.py
index aac0b11a..6f3fceed 100644
--- a/gffquant/annotation/genecount_annotator.py
+++ b/gffquant/annotation/genecount_annotator.py
@@ -37,13 +37,13 @@ def annotate(self, refmgr, db, counter: AlignmentCounter, gene_group_db=False):
         for rid in counter.get_all_regions():
             counts = counter.get_counts(rid, strand_specific=self.strand_specific)
             
-            ref, _ = refmgr.get(rid[0] if isinstance(rid, tuple) else rid)
 
             if gene_group_db:
                 # ref_tokens = ref.split(".")
                 # gene_id, ggroup_id = ".".join(ref_tokens[:-1]), ref_tokens[-1]
                 gene_id, ggroup_id = rid, rid
             else:
+                ref, _ = refmgr.get(rid[0] if isinstance(rid, tuple) else rid)
                 gene_id, ggroup_id = ref, ref
 
             region_annotation = db.query_sequence(ggroup_id)

From b935a24cf04a07851f7973a7cfe91f102fcdb971 Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Tue, 24 Dec 2024 09:49:58 +0100
Subject: [PATCH 046/128] added debug messaging

---
 gffquant/counters/alignment_counter.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/gffquant/counters/alignment_counter.py b/gffquant/counters/alignment_counter.py
index bbcd062e..323d37d5 100644
--- a/gffquant/counters/alignment_counter.py
+++ b/gffquant/counters/alignment_counter.py
@@ -235,14 +235,16 @@ def group_gene_count_matrix(self, refmgr):
         for key, key_index in self.index.items():
             ref = (refmgr.get(key[0] if isinstance(key, tuple) else key))[0]
             ref_tokens = ref.split(".")
-            _, ggroup_id = ".".join(ref_tokens[:-1]), ref_tokens[-1]
+            gene_id, ggroup_id = ".".join(ref_tokens[:-1]), ref_tokens[-1]
             g_key_index = ggroup_index.get(ggroup_id)
             if g_key_index is None:
                 g_key_index = ggroup_index[ggroup_id] = len(ggroup_index)
+                logger.info("AC: group_gene_count_matrix - gene=%s new group=%s (%s) base counts=%s -> %s", gene_id, ggroup_id, g_key_index, str(self.counts[key_index]), str(self.counts[g_key_index]),)
             else:
                 # only add counts if group has been encountered before
                 # else there will be duplicates
                 self.counts[g_key_index] += self.counts[key_index]
+                logger.info("AC: group_gene_count_matrix - gene=%s group=%s (%s) adding counts=%s -> %s", gene_id, ggroup_id, g_key_index, str(self.counts[key_index]), str(self.counts[g_key_index]),)
 
         # replace index with grouped index
         self.index = ggroup_index

From 69709f881c17655b365161f4a7f4a00007187aa6 Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Tue, 24 Dec 2024 14:36:40 +0100
Subject: [PATCH 047/128] solved?

---
 gffquant/counters/alignment_counter.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/gffquant/counters/alignment_counter.py b/gffquant/counters/alignment_counter.py
index 323d37d5..f47f0581 100644
--- a/gffquant/counters/alignment_counter.py
+++ b/gffquant/counters/alignment_counter.py
@@ -237,13 +237,15 @@ def group_gene_count_matrix(self, refmgr):
             ref_tokens = ref.split(".")
             gene_id, ggroup_id = ".".join(ref_tokens[:-1]), ref_tokens[-1]
             g_key_index = ggroup_index.get(ggroup_id)
+            gene_counts = self.counts[key_index]
             if g_key_index is None:
                 g_key_index = ggroup_index[ggroup_id] = len(ggroup_index)
+                self.counts[g_key_index] = gene_counts
                 logger.info("AC: group_gene_count_matrix - gene=%s new group=%s (%s) base counts=%s -> %s", gene_id, ggroup_id, g_key_index, str(self.counts[key_index]), str(self.counts[g_key_index]),)
             else:
                 # only add counts if group has been encountered before
                 # else there will be duplicates
-                self.counts[g_key_index] += self.counts[key_index]
+                self.counts[g_key_index] += gene_counts
                 logger.info("AC: group_gene_count_matrix - gene=%s group=%s (%s) adding counts=%s -> %s", gene_id, ggroup_id, g_key_index, str(self.counts[key_index]), str(self.counts[g_key_index]),)
 
         # replace index with grouped index

From 62c0a523982ceabd793f69dead0a538f766ee352 Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Tue, 24 Dec 2024 23:19:15 +0100
Subject: [PATCH 048/128] disabling various logger calls

---
 gffquant/annotation/genecount_annotator.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/gffquant/annotation/genecount_annotator.py b/gffquant/annotation/genecount_annotator.py
index 6f3fceed..cbc74ff5 100644
--- a/gffquant/annotation/genecount_annotator.py
+++ b/gffquant/annotation/genecount_annotator.py
@@ -49,14 +49,14 @@ def annotate(self, refmgr, db, counter: AlignmentCounter, gene_group_db=False):
             region_annotation = db.query_sequence(ggroup_id)
             if region_annotation is not None:
                 _, _, region_annotation = region_annotation
-                logger.info(
-                    "GCAnnotator: Distributing counts of Gene %s (group=%s) %s %s",
-                    gene_id, ggroup_id, counts[0], counts[2],
-                )
+                # logger.info(
+                #     "GCAnnotator: Distributing counts of Gene %s (group=%s) %s %s",
+                #     gene_id, ggroup_id, counts[0], counts[2],
+                # )
                 self.distribute_feature_counts(counts, region_annotation)
 
             else:
-                logger.info("GCAnnotator: Gene %s (group=%s) has no information in database.", gene_id, ggroup_id)
+                # logger.info("GCAnnotator: Gene %s (group=%s) has no information in database.", gene_id, ggroup_id)
                 self.unannotated_counts += counts[:4]
 
         self.calculate_scaling_factors()

From 2239d29673bb1a186609a7ef1e68b815099473b0 Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Tue, 24 Dec 2024 23:29:58 +0100
Subject: [PATCH 049/128] disabling various logger calls

---
 gffquant/counters/alignment_counter.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/gffquant/counters/alignment_counter.py b/gffquant/counters/alignment_counter.py
index f47f0581..51a2dd28 100644
--- a/gffquant/counters/alignment_counter.py
+++ b/gffquant/counters/alignment_counter.py
@@ -241,12 +241,12 @@ def group_gene_count_matrix(self, refmgr):
             if g_key_index is None:
                 g_key_index = ggroup_index[ggroup_id] = len(ggroup_index)
                 self.counts[g_key_index] = gene_counts
-                logger.info("AC: group_gene_count_matrix - gene=%s new group=%s (%s) base counts=%s -> %s", gene_id, ggroup_id, g_key_index, str(self.counts[key_index]), str(self.counts[g_key_index]),)
+                # logger.info("AC: group_gene_count_matrix - gene=%s new group=%s (%s) base counts=%s -> %s", gene_id, ggroup_id, g_key_index, str(self.counts[key_index]), str(self.counts[g_key_index]),)
             else:
                 # only add counts if group has been encountered before
                 # else there will be duplicates
                 self.counts[g_key_index] += gene_counts
-                logger.info("AC: group_gene_count_matrix - gene=%s group=%s (%s) adding counts=%s -> %s", gene_id, ggroup_id, g_key_index, str(self.counts[key_index]), str(self.counts[g_key_index]),)
+                # logger.info("AC: group_gene_count_matrix - gene=%s group=%s (%s) adding counts=%s -> %s", gene_id, ggroup_id, g_key_index, str(self.counts[key_index]), str(self.counts[g_key_index]),)
 
         # replace index with grouped index
         self.index = ggroup_index

From 413684abfb2b1c437151f9c2ebd598efe9e0c7e7 Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Wed, 25 Dec 2024 14:51:11 +0100
Subject: [PATCH 050/128] trying to update feature count processing

---
 gffquant/annotation/count_writer.py        | 77 +++++++++++++++-------
 gffquant/annotation/genecount_annotator.py | 61 ++++++++++++++++-
 gffquant/profilers/feature_quantifier.py   | 21 +++---
 3 files changed, 126 insertions(+), 33 deletions(-)

diff --git a/gffquant/annotation/count_writer.py b/gffquant/annotation/count_writer.py
index e5959598..bd9d7788 100644
--- a/gffquant/annotation/count_writer.py
+++ b/gffquant/annotation/count_writer.py
@@ -109,6 +109,59 @@ def compile_block(raw, lnorm, scaling_factors):
     def write_row(header, data, stream=sys.stdout):
         print(header, *(f"{c:.5f}" for c in data), flush=True, sep="\t", file=stream)
 
+    def write_category(self, category, counts, index, names, unique_sf, ambig_sf, unannotated_reads=None, report_unseen=True):
+        # category, c_counts, c_index, c_names, u_sf, a_sf
+        if "scaled" in self.publish_reports:
+            logger.info(
+                "SCALING FACTORS %s %s %s",
+                category, unique_sf, ambig_sf,
+            )
+        with gzip.open(f"{self.out_prefix}.{category}.txt.gz", "wt") as feat_out:
+            header = self.get_header()
+            print("feature", *header, sep="\t", file=feat_out)
+
+            if unannotated_reads is not None:
+                print("unannotated", unannotated_reads, sep="\t", file=feat_out)
+
+            if "total_readcount" in self.publish_reports:
+                CountWriter.write_row(
+                    "total_reads",
+                    np.zeros(len(header)) + self.total_readcount,
+                    stream=feat_out,
+                )
+
+            if "filtered_readcount" in self.publish_reports:
+                CountWriter.write_row(
+                    "filtered_reads",
+                    np.zeros(len(header)) + self.filtered_readcount,
+                    stream=feat_out,
+                )
+
+            if "category" in self.publish_reports:
+                # cat_counts = counts.get(f"cat:::{category_id}")
+                cat_counts = counts.get(0)
+                logger.info("CAT %s: %s", category, str(cat_counts))
+                if cat_counts is not None:
+                    cat_row = self.compile_output_row(
+                        cat_counts,
+                        # scaling_factor=featcounts.scaling_factors["total_uniq"],
+                        # ambig_scaling_factor=featcounts.scaling_factors["total_ambi"],
+                        scaling_factor=unique_sf,
+                        ambig_scaling_factor=ambig_sf,
+                    )
+                    CountWriter.write_row("category", cat_row, stream=feat_out)
+
+            for fid, i in index.items():
+                f_counts = np.array((counts[i][0], counts[i][2], counts[i][1], counts[i][3]))  #counts[fid]
+                if report_unseen or f_counts.sum():
+                    out_row = self.compile_output_row(
+                        f_counts,
+                        scaling_factor=unique_sf,
+                        ambig_scaling_factor=ambig_sf,
+                    )
+                    CountWriter.write_row(names[fid], out_row, stream=feat_out)
+
+
     # pylint: disable=R0914
     def write_feature_counts(self, db, featcounts, unannotated_reads=None, report_unseen=True):
         for category_id, counts in sorted(featcounts.items()):
@@ -176,13 +229,6 @@ def write_gene_counts(
         with gzip.open(f"{self.out_prefix}.gene_counts.txt.gz", "wt") as gene_out:
             print("gene", *self.get_header(), sep="\t", file=gene_out, flush=True)
 
-            # for gene, g_counts in sorted(gene_counts.items()):
-            #     out_row = self.compile_output_row(
-            #         g_counts,
-            #         scaling_factor=uniq_scaling_factor,
-            #         ambig_scaling_factor=ambig_scaling_factor
-            #     )
-            #     CountWriter.write_row(gene, out_row, stream=gene_out)
             ref_stream = (
                 (
                     refmgr.get(rid[0] if isinstance(rid, tuple) else rid)[0],
@@ -206,20 +252,3 @@ def write_gene_counts(
                 )
 
                 CountWriter.write_row(gene_id, out_row, stream=gene_out,)
-
-            # for rid in gene_counts.get_all_regions():
-            #     counts = gene_counts.get_counts(rid)
-            #     out_row = self.compile_output_row(
-            #         counts,
-            #         scaling_factor=uniq_scaling_factor,
-            #         ambig_scaling_factor=ambig_scaling_factor,
-            #     )
-            #     ref = refmgr.get(rid[0] if isinstance(rid, tuple) else rid)[0]
-
-            #     if gene_group_db:
-            #         ref_tokens = ref.split(".")
-            #         gene_id, _ = ".".join(ref_tokens[:-1]), ref_tokens[-1]
-            #     else:
-            #         gene_id = ref
-
-            #     CountWriter.write_row(gene_id, out_row, stream=gene_out,)
diff --git a/gffquant/annotation/genecount_annotator.py b/gffquant/annotation/genecount_annotator.py
index cbc74ff5..65d22b54 100644
--- a/gffquant/annotation/genecount_annotator.py
+++ b/gffquant/annotation/genecount_annotator.py
@@ -1,9 +1,12 @@
 """ module docstring """
 import logging
 
+import numpy as np
+
 from .count_annotator import CountAnnotator
 from .count_writer import CountWriter
 from ..counters import AlignmentCounter
+from ..db.annotation_db import AnnotationDatabaseManager
 
 
 logger = logging.getLogger(__name__)
@@ -16,7 +19,63 @@ def __init__(self, strand_specific, report_scaling_factors=True):
         """ __init__() """
         CountAnnotator.__init__(self, strand_specific, report_scaling_factors=report_scaling_factors)
 
-    def annotate(self, refmgr, db, counter: AlignmentCounter, gene_group_db=False):
+    def annotate2(self, refmgr, db: AnnotationDatabaseManager, counter: AlignmentCounter, gene_group_db=False):
+        for it, category in enumerate(db.get_categories()):
+            features = tuple(db.get_features(category.id))
+            # total_reads     483808.00000    483808.00000    483808.00000    483808.00000    483808.00000    483808.00000
+            # filtered_reads  454437.00000    454437.00000    454437.00000    454437.00000    454437.00000    454437.00000
+            # category        45359.50000     47.10706        42266.81963     152875.83896    224.72779       149853.25971
+            category_counts = np.zeros(
+                (len(features) + 1, 2,),
+                dtype='float64',
+            )
+            category_index = {
+                feature.id: i
+                for i, feature in enumerate(features, start=1)
+            }
+            category_names = {
+                feature.id: feature.name
+                for feature in features
+            }
+            for rid in counter.get_all_regions():
+                counts = counter.get_counts(rid, strand_specific=self.strand_specific)
+                if gene_group_db:
+                    gene_id, ggroup_id = rid, rid
+                else:
+                    ref, _ = refmgr.get(rid[0] if isinstance(rid, tuple) else rid)
+                    gene_id, ggroup_id = ref, ref
+
+                region_annotation = db.query_sequence(ggroup_id)
+                if region_annotation is not None:
+                    category_features = dict(region_annotation).get(category.id)
+                    if category_features is not None:
+                        category_counts[0] += counts  # category row
+                        for cf in category_features:
+                            category_counts[category_index.get(cf)] += counts
+
+                elif it == 0:
+                    self.unannotated_counts += counts[:4]
+            
+            count_sums = self.counts.sum(axis=0)
+
+            uniq_scaling_factor = (count_sums[0] / count_sums[2], 1.0)[count_sums[2] == 0]
+            ambig_scaling_factor = (count_sums[1] / count_sums[3], 1.0)[count_sums[3] == 0]
+
+            logger.info(
+                "GCA:: %s CATEGORY COUNTS: uraw=%s unorm=%s araw=%s anorm=%s => SF: %s %s",
+                category.name,
+                count_sums[0], count_sums[2], count_sums[1], count_sums[3],
+                uniq_scaling_factor, ambig_scaling_factor,            
+            )
+
+            yield category.name, category_counts, category_index, category_names, uniq_scaling_factor, ambig_scaling_factor
+
+                
+
+
+
+
+    def annotate(self, refmgr, db: AnnotationDatabaseManager, counter: AlignmentCounter, gene_group_db=False):
         """ Annotate a set of gene counts with functional annotations. """
         # self.total_gene_counts, u_sf, a_sf = counter.generate_gene_count_matrix(refmgr)
         # logger.info("TOTAL_GENE_COUNTS = %s", self.total_gene_counts)
diff --git a/gffquant/profilers/feature_quantifier.py b/gffquant/profilers/feature_quantifier.py
index 988202d3..81403321 100644
--- a/gffquant/profilers/feature_quantifier.py
+++ b/gffquant/profilers/feature_quantifier.py
@@ -154,14 +154,19 @@ def process_counters(
 
         count_annotator.annotate(self.reference_manager, self.adm, self.counter, gene_group_db=gene_group_db,)
 
-        unannotated_reads = self.counter.get_unannotated_reads()
-        unannotated_reads += self.aln_counter["unannotated_ambig"]
-
-        count_writer.write_feature_counts(
-            self.adm,
-            count_annotator,
-            (None, unannotated_reads)[report_unannotated],
-        )
+        # category.name, category_counts, category_index, uniq_scaling_factor, ambig_scaling_factor
+        for category, c_counts, c_index, c_names, u_sf, a_sf in count_annotator.annotate2(self.reference_manager, self.adm, self.counter, gene_group_db=gene_group_db,):
+            unannotated_reads = self.counter.get_unannotated_reads() + self.aln_counter["unannotated_ambig"]
+            count_writer.write_category(category, c_counts, c_index, c_names, u_sf, a_sf, unannotated_reads=(None, unannotated_reads)[report_unannotated],)
+
+        # unannotated_reads = self.counter.get_unannotated_reads()
+        # unannotated_reads += self.aln_counter["unannotated_ambig"]
+
+        # count_writer.write_feature_counts(
+        #     self.adm,
+        #     count_annotator,
+        #     (None, unannotated_reads)[report_unannotated],
+        # )
 
         # count_writer.write_gene_counts(
         #     self.counter,

From 1b5cf9c602effebe268ef136c3df38feafd35c9a Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Wed, 25 Dec 2024 15:01:42 +0100
Subject: [PATCH 051/128] trying to update feature count processing

---
 gffquant/profilers/feature_quantifier.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gffquant/profilers/feature_quantifier.py b/gffquant/profilers/feature_quantifier.py
index 81403321..38be7233 100644
--- a/gffquant/profilers/feature_quantifier.py
+++ b/gffquant/profilers/feature_quantifier.py
@@ -152,7 +152,7 @@ def process_counters(
 
         self.counter.group_gene_count_matrix(self.reference_manager)
 
-        count_annotator.annotate(self.reference_manager, self.adm, self.counter, gene_group_db=gene_group_db,)
+        # count_annotator.annotate(self.reference_manager, self.adm, self.counter, gene_group_db=gene_group_db,)
 
         # category.name, category_counts, category_index, uniq_scaling_factor, ambig_scaling_factor
         for category, c_counts, c_index, c_names, u_sf, a_sf in count_annotator.annotate2(self.reference_manager, self.adm, self.counter, gene_group_db=gene_group_db,):

From cb6ac1a58f29725d85813c935f47498c26e152bb Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Wed, 25 Dec 2024 15:02:37 +0100
Subject: [PATCH 052/128] trying to update feature count processing

---
 gffquant/profilers/feature_quantifier.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/gffquant/profilers/feature_quantifier.py b/gffquant/profilers/feature_quantifier.py
index 38be7233..4a6cc96e 100644
--- a/gffquant/profilers/feature_quantifier.py
+++ b/gffquant/profilers/feature_quantifier.py
@@ -156,6 +156,7 @@ def process_counters(
 
         # category.name, category_counts, category_index, uniq_scaling_factor, ambig_scaling_factor
         for category, c_counts, c_index, c_names, u_sf, a_sf in count_annotator.annotate2(self.reference_manager, self.adm, self.counter, gene_group_db=gene_group_db,):
+            logger.info("PROCESSING CATEGORY=%s", category)
             unannotated_reads = self.counter.get_unannotated_reads() + self.aln_counter["unannotated_ambig"]
             count_writer.write_category(category, c_counts, c_index, c_names, u_sf, a_sf, unannotated_reads=(None, unannotated_reads)[report_unannotated],)
 

From 8977c4d71bba5855a4dba757cce9d0813f7e655d Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Wed, 25 Dec 2024 16:23:15 +0100
Subject: [PATCH 053/128] trying to update feature count processing

---
 gffquant/annotation/genecount_annotator.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/gffquant/annotation/genecount_annotator.py b/gffquant/annotation/genecount_annotator.py
index 65d22b54..3304b97f 100644
--- a/gffquant/annotation/genecount_annotator.py
+++ b/gffquant/annotation/genecount_annotator.py
@@ -46,7 +46,9 @@ def annotate2(self, refmgr, db: AnnotationDatabaseManager, counter: AlignmentCou
                     gene_id, ggroup_id = ref, ref
 
                 region_annotation = db.query_sequence(ggroup_id)
+                # logger.info("REGION_ANNOTATION: %s (%s)", str(region_annotation))
                 if region_annotation is not None:
+                    _, _, region_annotation = region_annotation
                     category_features = dict(region_annotation).get(category.id)
                     if category_features is not None:
                         category_counts[0] += counts  # category row

From f96e1b3b7a8baafa7054706d1bef0f8fd05c281e Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Wed, 25 Dec 2024 16:49:54 +0100
Subject: [PATCH 054/128] trying to update feature count processing

---
 gffquant/annotation/genecount_annotator.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gffquant/annotation/genecount_annotator.py b/gffquant/annotation/genecount_annotator.py
index 3304b97f..22a73ed1 100644
--- a/gffquant/annotation/genecount_annotator.py
+++ b/gffquant/annotation/genecount_annotator.py
@@ -58,7 +58,7 @@ def annotate2(self, refmgr, db: AnnotationDatabaseManager, counter: AlignmentCou
                 elif it == 0:
                     self.unannotated_counts += counts[:4]
             
-            count_sums = self.counts.sum(axis=0)
+            count_sums = counter.sum(axis=0)
 
             uniq_scaling_factor = (count_sums[0] / count_sums[2], 1.0)[count_sums[2] == 0]
             ambig_scaling_factor = (count_sums[1] / count_sums[3], 1.0)[count_sums[3] == 0]

From b4fb4d8fd30d329da0d77200dfbbadfe06e8c944 Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Wed, 25 Dec 2024 17:08:03 +0100
Subject: [PATCH 055/128] trying to update feature count processing

---
 gffquant/annotation/genecount_annotator.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gffquant/annotation/genecount_annotator.py b/gffquant/annotation/genecount_annotator.py
index 22a73ed1..85bbe569 100644
--- a/gffquant/annotation/genecount_annotator.py
+++ b/gffquant/annotation/genecount_annotator.py
@@ -58,7 +58,7 @@ def annotate2(self, refmgr, db: AnnotationDatabaseManager, counter: AlignmentCou
                 elif it == 0:
                     self.unannotated_counts += counts[:4]
             
-            count_sums = counter.sum(axis=0)
+            count_sums = counter.counts.sum(axis=0)
 
             uniq_scaling_factor = (count_sums[0] / count_sums[2], 1.0)[count_sums[2] == 0]
             ambig_scaling_factor = (count_sums[1] / count_sums[3], 1.0)[count_sums[3] == 0]

From 3594dfff13e72e475e394466044e002881354e70 Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Wed, 25 Dec 2024 17:19:38 +0100
Subject: [PATCH 056/128] write_category: read category counts from row 0, swapping columns 1/2

---
 gffquant/annotation/count_writer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gffquant/annotation/count_writer.py b/gffquant/annotation/count_writer.py
index bd9d7788..f4272cd7 100644
--- a/gffquant/annotation/count_writer.py
+++ b/gffquant/annotation/count_writer.py
@@ -139,7 +139,7 @@ def write_category(self, category, counts, index, names, unique_sf, ambig_sf, un
 
             if "category" in self.publish_reports:
                 # cat_counts = counts.get(f"cat:::{category_id}")
-                cat_counts = counts.get(0)
+                cat_counts = np.array((counts[0][0], counts[0][2], counts[0][1], counts[0][3]))
                 logger.info("CAT %s: %s", category, str(cat_counts))
                 if cat_counts is not None:
                     cat_row = self.compile_output_row(

From 0dbd4ae5e723b3396ef9f0fdde4a0752bdaae778 Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Wed, 25 Dec 2024 17:45:44 +0100
Subject: [PATCH 057/128] annotate2: widen category_counts from 2 to 4 columns

---
 gffquant/annotation/genecount_annotator.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gffquant/annotation/genecount_annotator.py b/gffquant/annotation/genecount_annotator.py
index 85bbe569..a7768e56 100644
--- a/gffquant/annotation/genecount_annotator.py
+++ b/gffquant/annotation/genecount_annotator.py
@@ -26,7 +26,7 @@ def annotate2(self, refmgr, db: AnnotationDatabaseManager, counter: AlignmentCou
             # filtered_reads  454437.00000    454437.00000    454437.00000    454437.00000    454437.00000    454437.00000
             # category        45359.50000     47.10706        42266.81963     152875.83896    224.72779       149853.25971
             category_counts = np.zeros(
-                (len(features) + 1, 2,),
+                (len(features) + 1, 4,),
                 dtype='float64',
             )
             category_index = {

From 7b8e6bf18e9c7d8d8d178512f8f74278d49a6d0b Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Wed, 25 Dec 2024 20:15:28 +0100
Subject: [PATCH 058/128] debug log

---
 gffquant/annotation/genecount_annotator.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gffquant/annotation/genecount_annotator.py b/gffquant/annotation/genecount_annotator.py
index a7768e56..b6c6e5c6 100644
--- a/gffquant/annotation/genecount_annotator.py
+++ b/gffquant/annotation/genecount_annotator.py
@@ -46,7 +46,7 @@ def annotate2(self, refmgr, db: AnnotationDatabaseManager, counter: AlignmentCou
                     gene_id, ggroup_id = ref, ref
 
                 region_annotation = db.query_sequence(ggroup_id)
-                # logger.info("REGION_ANNOTATION: %s (%s)", str(region_annotation))
+                logger.info("REGION_ANNOTATION: %s (%s)", str(region_annotation), ggroup_id)
                 if region_annotation is not None:
                     _, _, region_annotation = region_annotation
                     category_features = dict(region_annotation).get(category.id)

From 563b4023855e703b43b9ef7a84d515643cc348fb Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Wed, 25 Dec 2024 20:29:07 +0100
Subject: [PATCH 059/128] annotate2: look up category features by str(category.id)

---
 gffquant/annotation/genecount_annotator.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gffquant/annotation/genecount_annotator.py b/gffquant/annotation/genecount_annotator.py
index b6c6e5c6..bbe4da36 100644
--- a/gffquant/annotation/genecount_annotator.py
+++ b/gffquant/annotation/genecount_annotator.py
@@ -49,7 +49,7 @@ def annotate2(self, refmgr, db: AnnotationDatabaseManager, counter: AlignmentCou
                 logger.info("REGION_ANNOTATION: %s (%s)", str(region_annotation), ggroup_id)
                 if region_annotation is not None:
                     _, _, region_annotation = region_annotation
-                    category_features = dict(region_annotation).get(category.id)
+                    category_features = dict(region_annotation).get(str(category.id))
                     if category_features is not None:
                         category_counts[0] += counts  # category row
                         for cf in category_features:

From e9e171565bfb8b5a3bc1630d30b7a19db12bc3d2 Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Wed, 25 Dec 2024 21:16:12 +0100
Subject: [PATCH 060/128] turn off annotate2 log

---
 gffquant/annotation/genecount_annotator.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gffquant/annotation/genecount_annotator.py b/gffquant/annotation/genecount_annotator.py
index bbe4da36..a0967e66 100644
--- a/gffquant/annotation/genecount_annotator.py
+++ b/gffquant/annotation/genecount_annotator.py
@@ -46,7 +46,7 @@ def annotate2(self, refmgr, db: AnnotationDatabaseManager, counter: AlignmentCou
                     gene_id, ggroup_id = ref, ref
 
                 region_annotation = db.query_sequence(ggroup_id)
-                logger.info("REGION_ANNOTATION: %s (%s)", str(region_annotation), ggroup_id)
+                # logger.info("REGION_ANNOTATION: %s (%s)", str(region_annotation), ggroup_id)
                 if region_annotation is not None:
                     _, _, region_annotation = region_annotation
                     category_features = dict(region_annotation).get(str(category.id))

From 07c674377fbc17b1a886ad26fc088b629782bdb6 Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Wed, 25 Dec 2024 22:00:08 +0100
Subject: [PATCH 061/128] annotate2: cast feature ids to int for category_index lookup

---
 gffquant/annotation/genecount_annotator.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gffquant/annotation/genecount_annotator.py b/gffquant/annotation/genecount_annotator.py
index a0967e66..4de54ad9 100644
--- a/gffquant/annotation/genecount_annotator.py
+++ b/gffquant/annotation/genecount_annotator.py
@@ -53,7 +53,7 @@ def annotate2(self, refmgr, db: AnnotationDatabaseManager, counter: AlignmentCou
                     if category_features is not None:
                         category_counts[0] += counts  # category row
                         for cf in category_features:
-                            category_counts[category_index.get(cf)] += counts
+                            category_counts[category_index.get(int(cf))] += counts
 
                 elif it == 0:
                     self.unannotated_counts += counts[:4]

From da8e93baa8c7145f78e037f016bf787020d58f42 Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Thu, 26 Dec 2024 00:27:17 +0100
Subject: [PATCH 062/128] write_category: use count rows as stored, without column reordering

---
 gffquant/annotation/count_writer.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/gffquant/annotation/count_writer.py b/gffquant/annotation/count_writer.py
index f4272cd7..4b4fc4e0 100644
--- a/gffquant/annotation/count_writer.py
+++ b/gffquant/annotation/count_writer.py
@@ -139,7 +139,7 @@ def write_category(self, category, counts, index, names, unique_sf, ambig_sf, un
 
             if "category" in self.publish_reports:
                 # cat_counts = counts.get(f"cat:::{category_id}")
-                cat_counts = np.array((counts[0][0], counts[0][2], counts[0][1], counts[0][3]))
+                cat_counts = counts[0]  # np.array((counts[0][0], counts[0][2], counts[0][1], counts[0][3]))
                 logger.info("CAT %s: %s", category, str(cat_counts))
                 if cat_counts is not None:
                     cat_row = self.compile_output_row(
@@ -152,7 +152,7 @@ def write_category(self, category, counts, index, names, unique_sf, ambig_sf, un
                     CountWriter.write_row("category", cat_row, stream=feat_out)
 
             for fid, i in index.items():
-                f_counts = np.array((counts[i][0], counts[i][2], counts[i][1], counts[i][3]))  #counts[fid]
+                f_counts = counts[i]  # np.array((counts[i][0], counts[i][2], counts[i][1], counts[i][3]))  #counts[fid]
                 if report_unseen or f_counts.sum():
                     out_row = self.compile_output_row(
                         f_counts,

From a94d9e7e782a99da3f445b5ba0a3b7bee3df7282 Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Thu, 26 Dec 2024 01:47:39 +0100
Subject: [PATCH 063/128] annotate2: exclude row 0 of counter.counts from count sums

---
 gffquant/annotation/genecount_annotator.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gffquant/annotation/genecount_annotator.py b/gffquant/annotation/genecount_annotator.py
index 4de54ad9..e77c15ef 100644
--- a/gffquant/annotation/genecount_annotator.py
+++ b/gffquant/annotation/genecount_annotator.py
@@ -58,7 +58,7 @@ def annotate2(self, refmgr, db: AnnotationDatabaseManager, counter: AlignmentCou
                 elif it == 0:
                     self.unannotated_counts += counts[:4]
             
-            count_sums = counter.counts.sum(axis=0)
+            count_sums = counter.counts[1:].sum(axis=0)
 
             uniq_scaling_factor = (count_sums[0] / count_sums[2], 1.0)[count_sums[2] == 0]
             ambig_scaling_factor = (count_sums[1] / count_sums[3], 1.0)[count_sums[3] == 0]

From 164cc6d51074938c43cf35a21924d12c9c29d853 Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Thu, 26 Dec 2024 21:42:26 +0100
Subject: [PATCH 064/128] annotate2: derive count sums from category_counts feature rows

---
 gffquant/annotation/genecount_annotator.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gffquant/annotation/genecount_annotator.py b/gffquant/annotation/genecount_annotator.py
index e77c15ef..363efa17 100644
--- a/gffquant/annotation/genecount_annotator.py
+++ b/gffquant/annotation/genecount_annotator.py
@@ -58,7 +58,7 @@ def annotate2(self, refmgr, db: AnnotationDatabaseManager, counter: AlignmentCou
                 elif it == 0:
                     self.unannotated_counts += counts[:4]
             
-            count_sums = counter.counts[1:].sum(axis=0)
+            count_sums = category_counts[1:].sum(axis=0)
 
             uniq_scaling_factor = (count_sums[0] / count_sums[2], 1.0)[count_sums[2] == 0]
             ambig_scaling_factor = (count_sums[1] / count_sums[3], 1.0)[count_sums[3] == 0]

From 0cdf49af70c2a47bd4c6e6e489602f6bc3ea5013 Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Thu, 26 Dec 2024 22:56:21 +0100
Subject: [PATCH 065/128] annotate2: use columns 0/1 and 2/3 for uniq/ambig scaling factors

---
 gffquant/annotation/genecount_annotator.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/gffquant/annotation/genecount_annotator.py b/gffquant/annotation/genecount_annotator.py
index 363efa17..36b597eb 100644
--- a/gffquant/annotation/genecount_annotator.py
+++ b/gffquant/annotation/genecount_annotator.py
@@ -60,13 +60,14 @@ def annotate2(self, refmgr, db: AnnotationDatabaseManager, counter: AlignmentCou
             
             count_sums = category_counts[1:].sum(axis=0)
 
-            uniq_scaling_factor = (count_sums[0] / count_sums[2], 1.0)[count_sums[2] == 0]
-            ambig_scaling_factor = (count_sums[1] / count_sums[3], 1.0)[count_sums[3] == 0]
+            uniq_scaling_factor = (count_sums[0] / count_sums[1], 1.0)[count_sums[1] == 0]
+            ambig_scaling_factor = (count_sums[2] / count_sums[3], 1.0)[count_sums[3] == 0]
 
             logger.info(
                 "GCA:: %s CATEGORY COUNTS: uraw=%s unorm=%s araw=%s anorm=%s => SF: %s %s",
                 category.name,
-                count_sums[0], count_sums[2], count_sums[1], count_sums[3],
+                # count_sums[0], count_sums[1], count_sums[1], count_sums[3],
+                *count_sums,
                 uniq_scaling_factor, ambig_scaling_factor,            
             )
 

From 3b2b5cb6b50c1ff984b4157be9fa804368c6967e Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Fri, 27 Dec 2024 01:40:57 +0100
Subject: [PATCH 066/128] annotate2: compute scaling factors from the category row (row 0)

---
 gffquant/annotation/genecount_annotator.py | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/gffquant/annotation/genecount_annotator.py b/gffquant/annotation/genecount_annotator.py
index 36b597eb..63150c46 100644
--- a/gffquant/annotation/genecount_annotator.py
+++ b/gffquant/annotation/genecount_annotator.py
@@ -60,8 +60,15 @@ def annotate2(self, refmgr, db: AnnotationDatabaseManager, counter: AlignmentCou
             
             count_sums = category_counts[1:].sum(axis=0)
 
-            uniq_scaling_factor = (count_sums[0] / count_sums[1], 1.0)[count_sums[1] == 0]
-            ambig_scaling_factor = (count_sums[2] / count_sums[3], 1.0)[count_sums[3] == 0]
+            # should scaled counts use a factor derived from all counts or should multi-feature counts only contribute once?
+            # uniq_scaling_factor = (count_sums[0] / count_sums[1], 1.0)[count_sums[1] == 0]
+            # ambig_scaling_factor = (count_sums[2] / count_sums[3], 1.0)[count_sums[3] == 0]
+
+            uniq_scaling_factor, ambig_scaling_factor = 1.0, 1.0
+            if category_counts[0][1]:
+                uniq_scaling_factor = category_counts[0][0] / category_counts[0][1]
+            if category_counts[0][3]:
+                ambig_scaling_factor = category_counts[0][2] / category_counts[0][3]
 
             logger.info(
                 "GCA:: %s CATEGORY COUNTS: uraw=%s unorm=%s araw=%s anorm=%s => SF: %s %s",

From cb9934e9157166804b3b26b3f41bb4a65df9b133 Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Fri, 27 Dec 2024 11:31:24 +0100
Subject: [PATCH 067/128] added category scaling comment

---
 gffquant/annotation/genecount_annotator.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gffquant/annotation/genecount_annotator.py b/gffquant/annotation/genecount_annotator.py
index 63150c46..277a11a8 100644
--- a/gffquant/annotation/genecount_annotator.py
+++ b/gffquant/annotation/genecount_annotator.py
@@ -63,7 +63,7 @@ def annotate2(self, refmgr, db: AnnotationDatabaseManager, counter: AlignmentCou
             # should scaled counts use a factor derived from all counts or should multi-feature counts only contribute once?
             # uniq_scaling_factor = (count_sums[0] / count_sums[1], 1.0)[count_sums[1] == 0]
             # ambig_scaling_factor = (count_sums[2] / count_sums[3], 1.0)[count_sums[3] == 0]
-
+            # pre 2.19 category count scaling was based on total counts
             uniq_scaling_factor, ambig_scaling_factor = 1.0, 1.0
             if category_counts[0][1]:
                 uniq_scaling_factor = category_counts[0][0] / category_counts[0][1]

From 4e119a4b32a49c226d7da2a8ea9c8d554aae5f8f Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Fri, 27 Dec 2024 23:20:55 +0100
Subject: [PATCH 068/128] linting + obsolete code removal

---
 gffquant/alignment/aln_group.py              |  3 +-
 gffquant/annotation/count_annotator.py       |  2 -
 gffquant/annotation/count_writer.py          | 76 ++++----------------
 gffquant/annotation/genecount_annotator.py   | 46 ++++++------
 gffquant/annotation/regioncount_annotator.py |  2 +
 gffquant/counters/alignment_counter.py       | 19 +++--
 gffquant/profilers/feature_quantifier.py     | 40 +++++------
 7 files changed, 67 insertions(+), 121 deletions(-)

diff --git a/gffquant/alignment/aln_group.py b/gffquant/alignment/aln_group.py
index b25f5758..b465df46 100644
--- a/gffquant/alignment/aln_group.py
+++ b/gffquant/alignment/aln_group.py
@@ -79,7 +79,8 @@ def get_all_hits(self, as_ambiguous=False):
                     except TypeError as err:
                         raise TypeError(f"Cannot derive sequencing library from tags: {aln.tags}") from err
 
-                # in region mode, there can be more hits (if the alignment overlaps multiple features of the target sequence)
+                # in region mode, there can be more hits
+                # (if the alignment overlaps multiple features of the target sequence)
                 # in gene mode, each alignment is a hit, i.e. there is at most 1 hit / alignment
                 yield aln.hits, n_aln
 
diff --git a/gffquant/annotation/count_annotator.py b/gffquant/annotation/count_annotator.py
index 3598d285..601eb865 100644
--- a/gffquant/annotation/count_annotator.py
+++ b/gffquant/annotation/count_annotator.py
@@ -8,8 +8,6 @@
 
 import numpy as np
 
-from ..counters import AlignmentCounter
-
 
 logger = logging.getLogger(__name__)
 
diff --git a/gffquant/annotation/count_writer.py b/gffquant/annotation/count_writer.py
index 4b4fc4e0..417404e2 100644
--- a/gffquant/annotation/count_writer.py
+++ b/gffquant/annotation/count_writer.py
@@ -1,4 +1,4 @@
-# pylint: disable=C0103,W1514,R0913,R0917
+# pylint: disable=C0103,W1514,R0913,R0917,R0914
 
 """ module docstring """
 
@@ -109,7 +109,17 @@ def compile_block(raw, lnorm, scaling_factors):
     def write_row(header, data, stream=sys.stdout):
         print(header, *(f"{c:.5f}" for c in data), flush=True, sep="\t", file=stream)
 
-    def write_category(self, category, counts, index, names, unique_sf, ambig_sf, unannotated_reads=None, report_unseen=True):
+    def write_category(
+        self,
+        category,
+        counts,
+        index,
+        names,
+        unique_sf,
+        ambig_sf,
+        unannotated_reads=None,
+        report_unseen=True,
+    ):
         # category, c_counts, c_index, c_names, u_sf, a_sf
         if "scaled" in self.publish_reports:
             logger.info(
@@ -138,21 +148,18 @@ def write_category(self, category, counts, index, names, unique_sf, ambig_sf, un
                 )
 
             if "category" in self.publish_reports:
-                # cat_counts = counts.get(f"cat:::{category_id}")
-                cat_counts = counts[0]  # np.array((counts[0][0], counts[0][2], counts[0][1], counts[0][3]))
+                cat_counts = counts[0]
                 logger.info("CAT %s: %s", category, str(cat_counts))
                 if cat_counts is not None:
                     cat_row = self.compile_output_row(
                         cat_counts,
-                        # scaling_factor=featcounts.scaling_factors["total_uniq"],
-                        # ambig_scaling_factor=featcounts.scaling_factors["total_ambi"],
                         scaling_factor=unique_sf,
                         ambig_scaling_factor=ambig_sf,
                     )
                     CountWriter.write_row("category", cat_row, stream=feat_out)
 
             for fid, i in index.items():
-                f_counts = counts[i]  # np.array((counts[i][0], counts[i][2], counts[i][1], counts[i][3]))  #counts[fid]
+                f_counts = counts[i]
                 if report_unseen or f_counts.sum():
                     out_row = self.compile_output_row(
                         f_counts,
@@ -161,61 +168,6 @@ def write_category(self, category, counts, index, names, unique_sf, ambig_sf, un
                     )
                     CountWriter.write_row(names[fid], out_row, stream=feat_out)
 
-
-    # pylint: disable=R0914
-    def write_feature_counts(self, db, featcounts, unannotated_reads=None, report_unseen=True):
-        for category_id, counts in sorted(featcounts.items()):
-            scaling_factor, ambig_scaling_factor = featcounts.scaling_factors[
-                category_id
-            ]
-            category = db.query_category(category_id).name
-            if "scaled" in self.publish_reports:
-                logger.info(
-                    "SCALING FACTORS %s %s %s",
-                    category, scaling_factor, ambig_scaling_factor
-                )
-            with gzip.open(f"{self.out_prefix}.{category}.txt.gz", "wt") as feat_out:
-                header = self.get_header()
-                print("feature", *header, sep="\t", file=feat_out)
-
-                if unannotated_reads is not None:
-                    print("unannotated", unannotated_reads, sep="\t", file=feat_out)
-
-                if "total_readcount" in self.publish_reports:
-                    CountWriter.write_row(
-                        "total_reads",
-                        np.zeros(len(header)) + self.total_readcount,
-                        stream=feat_out,
-                    )
-
-                if "filtered_readcount" in self.publish_reports:
-                    CountWriter.write_row(
-                        "filtered_reads",
-                        np.zeros(len(header)) + self.filtered_readcount,
-                        stream=feat_out,
-                    )
-
-                if "category" in self.publish_reports:
-                    cat_counts = counts.get(f"cat:::{category_id}")
-                    logger.info("CAT %s: %s", category_id, str(cat_counts))
-                    if cat_counts is not None:
-                        cat_row = self.compile_output_row(
-                            cat_counts,
-                            scaling_factor=featcounts.scaling_factors["total_uniq"],
-                            ambig_scaling_factor=featcounts.scaling_factors["total_ambi"],
-                        )
-                        CountWriter.write_row("category", cat_row, stream=feat_out)
-
-                for feature in db.get_features(category_id):
-                    f_counts = counts.get(str(feature.id), np.zeros(len(header)))
-                    if report_unseen or f_counts.sum():
-                        out_row = self.compile_output_row(
-                            f_counts,
-                            scaling_factor=scaling_factor,
-                            ambig_scaling_factor=ambig_scaling_factor,
-                        )
-                        CountWriter.write_row(feature.name, out_row, stream=feat_out)
-
     def write_gene_counts(
         self,
         gene_counts: AlignmentCounter,
diff --git a/gffquant/annotation/genecount_annotator.py b/gffquant/annotation/genecount_annotator.py
index 277a11a8..304a9e6a 100644
--- a/gffquant/annotation/genecount_annotator.py
+++ b/gffquant/annotation/genecount_annotator.py
@@ -1,10 +1,11 @@
+# pylint: disable=R0914
+
 """ module docstring """
 import logging
 
 import numpy as np
 
 from .count_annotator import CountAnnotator
-from .count_writer import CountWriter
 from ..counters import AlignmentCounter
 from ..db.annotation_db import AnnotationDatabaseManager
 
@@ -20,11 +21,8 @@ def __init__(self, strand_specific, report_scaling_factors=True):
         CountAnnotator.__init__(self, strand_specific, report_scaling_factors=report_scaling_factors)
 
     def annotate2(self, refmgr, db: AnnotationDatabaseManager, counter: AlignmentCounter, gene_group_db=False):
-        for it, category in enumerate(db.get_categories()):
+        for category in db.get_categories():
             features = tuple(db.get_features(category.id))
-            # total_reads     483808.00000    483808.00000    483808.00000    483808.00000    483808.00000    483808.00000
-            # filtered_reads  454437.00000    454437.00000    454437.00000    454437.00000    454437.00000    454437.00000
-            # category        45359.50000     47.10706        42266.81963     152875.83896    224.72779       149853.25971
             category_counts = np.zeros(
                 (len(features) + 1, 4,),
                 dtype='float64',
@@ -40,10 +38,12 @@ def annotate2(self, refmgr, db: AnnotationDatabaseManager, counter: AlignmentCou
             for rid in counter.get_all_regions():
                 counts = counter.get_counts(rid, strand_specific=self.strand_specific)
                 if gene_group_db:
-                    gene_id, ggroup_id = rid, rid
+                    # gene_id, ggroup_id = rid, rid
+                    ggroup_id = rid
                 else:
                     ref, _ = refmgr.get(rid[0] if isinstance(rid, tuple) else rid)
-                    gene_id, ggroup_id = ref, ref
+                    # gene_id, ggroup_id = ref, ref
+                    ggroup_id = ref
 
                 region_annotation = db.query_sequence(ggroup_id)
                 # logger.info("REGION_ANNOTATION: %s (%s)", str(region_annotation), ggroup_id)
@@ -55,12 +55,13 @@ def annotate2(self, refmgr, db: AnnotationDatabaseManager, counter: AlignmentCou
                         for cf in category_features:
                             category_counts[category_index.get(int(cf))] += counts
 
-                elif it == 0:
-                    self.unannotated_counts += counts[:4]
-            
+                # elif it == 0:
+                #     self.unannotated_counts += counts[:4]
+
             count_sums = category_counts[1:].sum(axis=0)
 
-            # should scaled counts use a factor derived from all counts or should multi-feature counts only contribute once?
+            # should scaled counts use a factor derived from all counts
+            # or should multi-feature counts only contribute once?
             # uniq_scaling_factor = (count_sums[0] / count_sums[1], 1.0)[count_sums[1] == 0]
             # ambig_scaling_factor = (count_sums[2] / count_sums[3], 1.0)[count_sums[3] == 0]
             # pre 2.19 category count scaling was based on total counts
@@ -75,15 +76,17 @@ def annotate2(self, refmgr, db: AnnotationDatabaseManager, counter: AlignmentCou
                 category.name,
                 # count_sums[0], count_sums[1], count_sums[1], count_sums[3],
                 *count_sums,
-                uniq_scaling_factor, ambig_scaling_factor,            
+                uniq_scaling_factor, ambig_scaling_factor,
             )
 
-            yield category.name, category_counts, category_index, category_names, uniq_scaling_factor, ambig_scaling_factor
-
-                
-
-
-
+            yield (
+                category.name,
+                category_counts,
+                category_index,
+                category_names,
+                uniq_scaling_factor,
+                ambig_scaling_factor,
+            )
 
     def annotate(self, refmgr, db: AnnotationDatabaseManager, counter: AlignmentCounter, gene_group_db=False):
         """ Annotate a set of gene counts with functional annotations. """
@@ -105,15 +108,16 @@ def annotate(self, refmgr, db: AnnotationDatabaseManager, counter: AlignmentCoun
 
         for rid in counter.get_all_regions():
             counts = counter.get_counts(rid, strand_specific=self.strand_specific)
-            
 
             if gene_group_db:
                 # ref_tokens = ref.split(".")
                 # gene_id, ggroup_id = ".".join(ref_tokens[:-1]), ref_tokens[-1]
-                gene_id, ggroup_id = rid, rid
+                # gene_id, ggroup_id = rid, rid
+                ggroup_id = rid
             else:
                 ref, _ = refmgr.get(rid[0] if isinstance(rid, tuple) else rid)
-                gene_id, ggroup_id = ref, ref
+                # gene_id, ggroup_id = ref, ref
+                ggroup_id = ref
 
             region_annotation = db.query_sequence(ggroup_id)
             if region_annotation is not None:
diff --git a/gffquant/annotation/regioncount_annotator.py b/gffquant/annotation/regioncount_annotator.py
index 6d719413..84f85163 100644
--- a/gffquant/annotation/regioncount_annotator.py
+++ b/gffquant/annotation/regioncount_annotator.py
@@ -1,3 +1,5 @@
+""" module docstring """
+
 import numpy as np
 
 from . import CountAnnotator
diff --git a/gffquant/counters/alignment_counter.py b/gffquant/counters/alignment_counter.py
index 51a2dd28..aad80de5 100644
--- a/gffquant/counters/alignment_counter.py
+++ b/gffquant/counters/alignment_counter.py
@@ -130,7 +130,11 @@ def update(self, count_stream, ambiguous_counts=False, pair=False, pe_library=No
         return contributed_counts
 
     def get_unannotated_reads(self):
-        return self.unannotated_reads
+        # return self.unannotated_reads
+        no_annotation = self.index.get("c591b65a0f4cd46d5125745a40c8c056")
+        if no_annotation is not None:
+            return self.counts[no_annotation][0]
+        return 0.0
 
     def get_counts(self, seqid, strand_specific=False):
         if strand_specific:
@@ -224,35 +228,28 @@ def generate_gene_count_matrix(self, refmgr):
         logger.info(
             "AC:: TOTAL GENE COUNTS: uraw=%s unorm=%s araw=%s anorm=%s => SF: %s %s",
             count_sums[0], count_sums[2], count_sums[1], count_sums[3],
-            uniq_scaling_factor, ambig_scaling_factor,            
+            uniq_scaling_factor, ambig_scaling_factor,
         )
 
         # return count sums and scaling factors
         return count_sums, uniq_scaling_factor, ambig_scaling_factor
-    
+
     def group_gene_count_matrix(self, refmgr):
         ggroup_index = {}
         for key, key_index in self.index.items():
             ref = (refmgr.get(key[0] if isinstance(key, tuple) else key))[0]
             ref_tokens = ref.split(".")
-            gene_id, ggroup_id = ".".join(ref_tokens[:-1]), ref_tokens[-1]
+            _, ggroup_id = ".".join(ref_tokens[:-1]), ref_tokens[-1]
             g_key_index = ggroup_index.get(ggroup_id)
             gene_counts = self.counts[key_index]
             if g_key_index is None:
                 g_key_index = ggroup_index[ggroup_id] = len(ggroup_index)
                 self.counts[g_key_index] = gene_counts
-                # logger.info("AC: group_gene_count_matrix - gene=%s new group=%s (%s) base counts=%s -> %s", gene_id, ggroup_id, g_key_index, str(self.counts[key_index]), str(self.counts[g_key_index]),)
             else:
-                # only add counts if group has been encountered before
-                # else there will be duplicates
                 self.counts[g_key_index] += gene_counts
-                # logger.info("AC: group_gene_count_matrix - gene=%s group=%s (%s) adding counts=%s -> %s", gene_id, ggroup_id, g_key_index, str(self.counts[key_index]), str(self.counts[g_key_index]),)
 
         # replace index with grouped index
         self.index = ggroup_index
 
         # remove the un-indexed (ungrouped) rows
         self.counts = self.counts[0:len(self.index), :]
-
-
-    
diff --git a/gffquant/profilers/feature_quantifier.py b/gffquant/profilers/feature_quantifier.py
index 4a6cc96e..d1480f57 100644
--- a/gffquant/profilers/feature_quantifier.py
+++ b/gffquant/profilers/feature_quantifier.py
@@ -142,7 +142,6 @@ def process_counters(
         total_gene_counts, u_sf, a_sf = self.counter.generate_gene_count_matrix(self.reference_manager)
         logger.info("TOTAL_GENE_COUNTS = %s", total_gene_counts)
 
-
         count_writer.write_gene_counts(
             self.counter,
             self.reference_manager,
@@ -151,31 +150,24 @@ def process_counters(
         )
 
         self.counter.group_gene_count_matrix(self.reference_manager)
+        unannotated_reads = self.counter.get_unannotated_reads() + self.aln_counter["unannotated_ambig"]
 
-        # count_annotator.annotate(self.reference_manager, self.adm, self.counter, gene_group_db=gene_group_db,)
-
-        # category.name, category_counts, category_index, uniq_scaling_factor, ambig_scaling_factor
-        for category, c_counts, c_index, c_names, u_sf, a_sf in count_annotator.annotate2(self.reference_manager, self.adm, self.counter, gene_group_db=gene_group_db,):
+        for category, c_counts, c_index, c_names, u_sf, a_sf in count_annotator.annotate2(
+            self.reference_manager,
+            self.adm,
+            self.counter,
+            gene_group_db=gene_group_db,
+        ):
             logger.info("PROCESSING CATEGORY=%s", category)
-            unannotated_reads = self.counter.get_unannotated_reads() + self.aln_counter["unannotated_ambig"]
-            count_writer.write_category(category, c_counts, c_index, c_names, u_sf, a_sf, unannotated_reads=(None, unannotated_reads)[report_unannotated],)
-
-        # unannotated_reads = self.counter.get_unannotated_reads()
-        # unannotated_reads += self.aln_counter["unannotated_ambig"]
-
-        # count_writer.write_feature_counts(
-        #     self.adm,
-        #     count_annotator,
-        #     (None, unannotated_reads)[report_unannotated],
-        # )
-
-        # count_writer.write_gene_counts(
-        #     self.counter,
-        #     self.reference_manager,
-        #     count_annotator.scaling_factors["total_gene_uniq"],
-        #     count_annotator.scaling_factors["total_gene_ambi"],
-        #     gene_group_db=gene_group_db,
-        # )
+            count_writer.write_category(
+                category,
+                c_counts,
+                c_index,
+                c_names,
+                u_sf,
+                a_sf,
+                unannotated_reads=(None, unannotated_reads)[report_unannotated],
+            )
 
         self.adm.clear_caches()
 

From 90eb4b504300da27456acfd06a9f3820a1c18bcf Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Mon, 30 Dec 2024 00:55:46 +0100
Subject: [PATCH 069/128] trying to optimise scaling factors, temp. disabled
 feature counts

---
 gffquant/annotation/count_writer.py        | 17 ++++++-----
 gffquant/annotation/genecount_annotator.py | 14 ---------
 gffquant/counters/alignment_counter.py     | 33 +++++++++++++++++-----
 gffquant/profilers/feature_quantifier.py   | 32 ++++++++++-----------
 4 files changed, 52 insertions(+), 44 deletions(-)

diff --git a/gffquant/annotation/count_writer.py b/gffquant/annotation/count_writer.py
index 417404e2..a33cef3f 100644
--- a/gffquant/annotation/count_writer.py
+++ b/gffquant/annotation/count_writer.py
@@ -76,6 +76,7 @@ def compile_block(raw, lnorm, scaling_factors):
 
         p, row = 0, []
         rpkm_factor = 1e9 / self.filtered_readcount
+
         # unique counts
         row += compile_block(*counts[p:p + 2], (scaling_factor, rpkm_factor,))
         p += 2
@@ -190,17 +191,19 @@ def write_gene_counts(
             )
 
             for ref, rid in sorted(ref_stream):
-                counts = gene_counts.get_counts(rid)
+                # counts = gene_counts.get_counts(rid)
+                counts = gene_counts[rid]
                 if gene_group_db:
                     ref_tokens = ref.split(".")
                     gene_id, _ = ".".join(ref_tokens[:-1]), ref_tokens[-1]
                 else:
                     gene_id = ref
 
-                out_row = self.compile_output_row(
-                    counts,
-                    scaling_factor=uniq_scaling_factor,
-                    ambig_scaling_factor=ambig_scaling_factor,
-                )
+                # out_row = self.compile_output_row(
+                #     counts,
+                #     scaling_factor=uniq_scaling_factor,
+                #     ambig_scaling_factor=ambig_scaling_factor,
+                # )
 
-                CountWriter.write_row(gene_id, out_row, stream=gene_out,)
+                # CountWriter.write_row(gene_id, out_row, stream=gene_out,)
+                CountWriter.write_row(gene_id, counts, stream=gene_out,)
diff --git a/gffquant/annotation/genecount_annotator.py b/gffquant/annotation/genecount_annotator.py
index 304a9e6a..80af4515 100644
--- a/gffquant/annotation/genecount_annotator.py
+++ b/gffquant/annotation/genecount_annotator.py
@@ -74,7 +74,6 @@ def annotate2(self, refmgr, db: AnnotationDatabaseManager, counter: AlignmentCou
             logger.info(
                 "GCA:: %s CATEGORY COUNTS: uraw=%s unorm=%s araw=%s anorm=%s => SF: %s %s",
                 category.name,
-                # count_sums[0], count_sums[1], count_sums[1], count_sums[3],
                 *count_sums,
                 uniq_scaling_factor, ambig_scaling_factor,
             )
@@ -90,15 +89,6 @@ def annotate2(self, refmgr, db: AnnotationDatabaseManager, counter: AlignmentCou
 
     def annotate(self, refmgr, db: AnnotationDatabaseManager, counter: AlignmentCounter, gene_group_db=False):
         """ Annotate a set of gene counts with functional annotations. """
-        # self.total_gene_counts, u_sf, a_sf = counter.generate_gene_count_matrix(refmgr)
-        # logger.info("TOTAL_GENE_COUNTS = %s", self.total_gene_counts)
-
-        # writer.write_gene_counts(
-        #     counter,
-        #     refmgr,
-        #     u_sf, a_sf,
-        #     gene_group_db=gene_group_db,
-        # )
 
         # formerly used in compute_count_vector
         strand_specific_counts = (
@@ -122,10 +112,6 @@ def annotate(self, refmgr, db: AnnotationDatabaseManager, counter: AlignmentCoun
             region_annotation = db.query_sequence(ggroup_id)
             if region_annotation is not None:
                 _, _, region_annotation = region_annotation
-                # logger.info(
-                #     "GCAnnotator: Distributing counts of Gene %s (group=%s) %s %s",
-                #     gene_id, ggroup_id, counts[0], counts[2],
-                # )
                 self.distribute_feature_counts(counts, region_annotation)
 
             else:
diff --git a/gffquant/counters/alignment_counter.py b/gffquant/counters/alignment_counter.py
index aad80de5..c136759f 100644
--- a/gffquant/counters/alignment_counter.py
+++ b/gffquant/counters/alignment_counter.py
@@ -213,26 +213,45 @@ def generate_gene_count_matrix(self, refmgr):
 
         # duplicate the raw counts
         self.counts = np.concatenate(
-            (self.counts, self.counts,),
+            #(self.counts, self.counts, self.counts,),
+            (
+                self.counts[:, 0], self.counts[:, 0], self.counts[:, 0],  # 0, 1, 2
+                self.counts[:, 1], self.counts[:, 1], self.counts[:, 1],  # 3, 4, 5
+            ),
             axis=1,
         )
 
         # length-normalise the lnorm columns
-        self.counts[:, 2:4] /= lengths[:, None]
+        # self.counts[:, 2:4] /= lengths[:, None]
+        self.counts[:, 1::2] /= lengths[:, None]
 
         count_sums = self.counts.sum(axis=0)
 
-        uniq_scaling_factor = (count_sums[0] / count_sums[2], 1.0)[count_sums[2] == 0]
-        ambig_scaling_factor = (count_sums[1] / count_sums[3], 1.0)[count_sums[3] == 0]
+        # uniq_scaling_factor = (count_sums[0] / count_sums[2], 1.0)[count_sums[2] == 0]
+        # ambig_scaling_factor = (count_sums[1] / count_sums[3], 1.0)[count_sums[3] == 0]
+        uniq_scaling_factor, combined_scaling_factor = (
+            AlignmentCounter.calculate_scaling_factor(*count_sums[0:2]),
+            AlignmentCounter.calculate_scaling_factor(*count_sums[3:5]),
+        )
 
         logger.info(
-            "AC:: TOTAL GENE COUNTS: uraw=%s unorm=%s araw=%s anorm=%s => SF: %s %s",
+            "AC:: TOTAL GENE COUNTS: uraw=%s unorm=%s craw=%s cnorm=%s => SF: %s %s",
             count_sums[0], count_sums[2], count_sums[1], count_sums[3],
-            uniq_scaling_factor, ambig_scaling_factor,
+            uniq_scaling_factor, combined_scaling_factor,
         )
 
+        # apply scaling factors
+        self.counts[:, 2] *= uniq_scaling_factor
+        self.counts[:, 5] *= combined_scaling_factor
+
         # return count sums and scaling factors
-        return count_sums, uniq_scaling_factor, ambig_scaling_factor
+        return count_sums, uniq_scaling_factor, combined_scaling_factor
+    
+    @staticmethod
+    def calculate_scaling_factor(raw, norm):
+        if norm == 0.0:
+            return 1.0
+        return raw / norm
 
     def group_gene_count_matrix(self, refmgr):
         ggroup_index = {}
diff --git a/gffquant/profilers/feature_quantifier.py b/gffquant/profilers/feature_quantifier.py
index d1480f57..01ffbdd3 100644
--- a/gffquant/profilers/feature_quantifier.py
+++ b/gffquant/profilers/feature_quantifier.py
@@ -152,22 +152,22 @@ def process_counters(
         self.counter.group_gene_count_matrix(self.reference_manager)
         unannotated_reads = self.counter.get_unannotated_reads() + self.aln_counter["unannotated_ambig"]
 
-        for category, c_counts, c_index, c_names, u_sf, a_sf in count_annotator.annotate2(
-            self.reference_manager,
-            self.adm,
-            self.counter,
-            gene_group_db=gene_group_db,
-        ):
-            logger.info("PROCESSING CATEGORY=%s", category)
-            count_writer.write_category(
-                category,
-                c_counts,
-                c_index,
-                c_names,
-                u_sf,
-                a_sf,
-                unannotated_reads=(None, unannotated_reads)[report_unannotated],
-            )
+        # for category, c_counts, c_index, c_names, u_sf, a_sf in count_annotator.annotate2(
+        #     self.reference_manager,
+        #     self.adm,
+        #     self.counter,
+        #     gene_group_db=gene_group_db,
+        # ):
+        #     logger.info("PROCESSING CATEGORY=%s", category)
+        #     count_writer.write_category(
+        #         category,
+        #         c_counts,
+        #         c_index,
+        #         c_names,
+        #         u_sf,
+        #         a_sf,
+        #         unannotated_reads=(None, unannotated_reads)[report_unannotated],
+        #     )
 
         self.adm.clear_caches()
 

From 5b0286f5c7b3fa4f307ceb8a141de08d458aeed2 Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Mon, 30 Dec 2024 01:11:44 +0100
Subject: [PATCH 070/128] trying to optimise scaling factors, temp. disabled
 feature counts

---
 gffquant/counters/alignment_counter.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/gffquant/counters/alignment_counter.py b/gffquant/counters/alignment_counter.py
index c136759f..7f3b871a 100644
--- a/gffquant/counters/alignment_counter.py
+++ b/gffquant/counters/alignment_counter.py
@@ -212,13 +212,13 @@ def generate_gene_count_matrix(self, refmgr):
         self.counts[:, 1:2] += self.counts[:, 0:1]
 
         # duplicate the raw counts
-        self.counts = np.concatenate(
+        self.counts = np.column_stack(
             #(self.counts, self.counts, self.counts,),
             (
                 self.counts[:, 0], self.counts[:, 0], self.counts[:, 0],  # 0, 1, 2
                 self.counts[:, 1], self.counts[:, 1], self.counts[:, 1],  # 3, 4, 5
             ),
-            axis=1,
+            # axis=1,
         )
 
         # length-normalise the lnorm columns

From 5baafe04fe6b334944cfbb633b17c002407d1afa Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Mon, 30 Dec 2024 01:23:15 +0100
Subject: [PATCH 071/128] trying to optimise scaling factors, temp. disabled
 feature counts

---
 gffquant/counters/alignment_counter.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/gffquant/counters/alignment_counter.py b/gffquant/counters/alignment_counter.py
index 7f3b871a..6bc70794 100644
--- a/gffquant/counters/alignment_counter.py
+++ b/gffquant/counters/alignment_counter.py
@@ -223,7 +223,7 @@ def generate_gene_count_matrix(self, refmgr):
 
         # length-normalise the lnorm columns
         # self.counts[:, 2:4] /= lengths[:, None]
-        self.counts[:, 1::2] /= lengths[:, None]
+        self.counts[:, 1::3] /= lengths[:, None]
 
         count_sums = self.counts.sum(axis=0)
 
@@ -241,8 +241,8 @@ def generate_gene_count_matrix(self, refmgr):
         )
 
         # apply scaling factors
-        self.counts[:, 2] *= uniq_scaling_factor
-        self.counts[:, 5] *= combined_scaling_factor
+        self.counts[:, 2] = self.counts[:, 1] * uniq_scaling_factor
+        self.counts[:, 5] = self.counts[:, 4] * combined_scaling_factor
 
         # return count sums and scaling factors
         return count_sums, uniq_scaling_factor, combined_scaling_factor

From 49e11e367054bc4b41a3cbb9f2a3618b87fa431d Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Mon, 30 Dec 2024 01:46:39 +0100
Subject: [PATCH 072/128] re-enable feature counts

---
 gffquant/annotation/count_writer.py        | 24 ++++++++--------
 gffquant/annotation/genecount_annotator.py | 28 +++++++++++--------
 gffquant/counters/alignment_counter.py     |  2 +-
 gffquant/profilers/feature_quantifier.py   | 32 +++++++++++-----------
 4 files changed, 46 insertions(+), 40 deletions(-)

diff --git a/gffquant/annotation/count_writer.py b/gffquant/annotation/count_writer.py
index a33cef3f..98225835 100644
--- a/gffquant/annotation/count_writer.py
+++ b/gffquant/annotation/count_writer.py
@@ -152,22 +152,22 @@ def write_category(
                 cat_counts = counts[0]
                 logger.info("CAT %s: %s", category, str(cat_counts))
                 if cat_counts is not None:
-                    cat_row = self.compile_output_row(
-                        cat_counts,
-                        scaling_factor=unique_sf,
-                        ambig_scaling_factor=ambig_sf,
-                    )
-                    CountWriter.write_row("category", cat_row, stream=feat_out)
+                    # cat_row = self.compile_output_row(
+                    #     cat_counts,
+                    #     scaling_factor=unique_sf,
+                    #     ambig_scaling_factor=ambig_sf,
+                    # )
+                    CountWriter.write_row("category", counts[0], stream=feat_out)
 
             for fid, i in index.items():
                 f_counts = counts[i]
                 if report_unseen or f_counts.sum():
-                    out_row = self.compile_output_row(
-                        f_counts,
-                        scaling_factor=unique_sf,
-                        ambig_scaling_factor=ambig_sf,
-                    )
-                    CountWriter.write_row(names[fid], out_row, stream=feat_out)
+                    # out_row = self.compile_output_row(
+                    #     f_counts,
+                    #     scaling_factor=unique_sf,
+                    #     ambig_scaling_factor=ambig_sf,
+                    # )
+                    CountWriter.write_row(names[fid], counts[i], stream=feat_out)
 
     def write_gene_counts(
         self,
diff --git a/gffquant/annotation/genecount_annotator.py b/gffquant/annotation/genecount_annotator.py
index 80af4515..b4f25689 100644
--- a/gffquant/annotation/genecount_annotator.py
+++ b/gffquant/annotation/genecount_annotator.py
@@ -24,7 +24,7 @@ def annotate2(self, refmgr, db: AnnotationDatabaseManager, counter: AlignmentCou
         for category in db.get_categories():
             features = tuple(db.get_features(category.id))
             category_counts = np.zeros(
-                (len(features) + 1, 4,),
+                (len(features) + 1, 6,),
                 dtype='float64',
             )
             category_index = {
@@ -35,8 +35,9 @@ def annotate2(self, refmgr, db: AnnotationDatabaseManager, counter: AlignmentCou
                 feature.id: feature.name
                 for feature in features
             }
-            for rid in counter.get_all_regions():
-                counts = counter.get_counts(rid, strand_specific=self.strand_specific)
+            for rid in counter:
+                # counts = counter.get_counts(rid, strand_specific=self.strand_specific)
+                counts = counter[rid]
                 if gene_group_db:
                     # gene_id, ggroup_id = rid, rid
                     ggroup_id = rid
@@ -65,17 +66,22 @@ def annotate2(self, refmgr, db: AnnotationDatabaseManager, counter: AlignmentCou
             # uniq_scaling_factor = (count_sums[0] / count_sums[1], 1.0)[count_sums[1] == 0]
             # ambig_scaling_factor = (count_sums[2] / count_sums[3], 1.0)[count_sums[3] == 0]
             # pre 2.19 category count scaling was based on total counts
-            uniq_scaling_factor, ambig_scaling_factor = 1.0, 1.0
-            if category_counts[0][1]:
-                uniq_scaling_factor = category_counts[0][0] / category_counts[0][1]
-            if category_counts[0][3]:
-                ambig_scaling_factor = category_counts[0][2] / category_counts[0][3]
+            # uniq_scaling_factor, ambig_scaling_factor = 1.0, 1.0
+            # if category_counts[0][1]:
+            #     uniq_scaling_factor = category_counts[0][0] / category_counts[0][1]
+            # if category_counts[0][3]:
+            #     ambig_scaling_factor = category_counts[0][2] / category_counts[0][3]
+            uniq_scaling_factor, combined_scaling_factor = (
+            AlignmentCounter.calculate_scaling_factor(*count_sums[0:2]),
+            AlignmentCounter.calculate_scaling_factor(*count_sums[3:5]),
+        )
 
             logger.info(
                 "GCA:: %s CATEGORY COUNTS: uraw=%s unorm=%s araw=%s anorm=%s => SF: %s %s",
                 category.name,
-                *count_sums,
-                uniq_scaling_factor, ambig_scaling_factor,
+                # *count_sums,
+                count_sums[0], count_sums[1], count_sums[3], count_sums[4],
+                uniq_scaling_factor, combined_scaling_factor,
             )
 
             yield (
@@ -84,7 +90,7 @@ def annotate2(self, refmgr, db: AnnotationDatabaseManager, counter: AlignmentCou
                 category_index,
                 category_names,
                 uniq_scaling_factor,
-                ambig_scaling_factor,
+                combined_scaling_factor,
             )
 
     def annotate(self, refmgr, db: AnnotationDatabaseManager, counter: AlignmentCounter, gene_group_db=False):
diff --git a/gffquant/counters/alignment_counter.py b/gffquant/counters/alignment_counter.py
index 6bc70794..04276477 100644
--- a/gffquant/counters/alignment_counter.py
+++ b/gffquant/counters/alignment_counter.py
@@ -236,7 +236,7 @@ def generate_gene_count_matrix(self, refmgr):
 
         logger.info(
             "AC:: TOTAL GENE COUNTS: uraw=%s unorm=%s craw=%s cnorm=%s => SF: %s %s",
-            count_sums[0], count_sums[2], count_sums[1], count_sums[3],
+            count_sums[0], count_sums[1], count_sums[3], count_sums[4],
             uniq_scaling_factor, combined_scaling_factor,
         )
 
diff --git a/gffquant/profilers/feature_quantifier.py b/gffquant/profilers/feature_quantifier.py
index 01ffbdd3..d1480f57 100644
--- a/gffquant/profilers/feature_quantifier.py
+++ b/gffquant/profilers/feature_quantifier.py
@@ -152,22 +152,22 @@ def process_counters(
         self.counter.group_gene_count_matrix(self.reference_manager)
         unannotated_reads = self.counter.get_unannotated_reads() + self.aln_counter["unannotated_ambig"]
 
-        # for category, c_counts, c_index, c_names, u_sf, a_sf in count_annotator.annotate2(
-        #     self.reference_manager,
-        #     self.adm,
-        #     self.counter,
-        #     gene_group_db=gene_group_db,
-        # ):
-        #     logger.info("PROCESSING CATEGORY=%s", category)
-        #     count_writer.write_category(
-        #         category,
-        #         c_counts,
-        #         c_index,
-        #         c_names,
-        #         u_sf,
-        #         a_sf,
-        #         unannotated_reads=(None, unannotated_reads)[report_unannotated],
-        #     )
+        for category, c_counts, c_index, c_names, u_sf, a_sf in count_annotator.annotate2(
+            self.reference_manager,
+            self.adm,
+            self.counter,
+            gene_group_db=gene_group_db,
+        ):
+            logger.info("PROCESSING CATEGORY=%s", category)
+            count_writer.write_category(
+                category,
+                c_counts,
+                c_index,
+                c_names,
+                u_sf,
+                a_sf,
+                unannotated_reads=(None, unannotated_reads)[report_unannotated],
+            )
 
         self.adm.clear_caches()
 

From b39b4b10613ec9d2baf027cc77541b5ae7132516 Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Mon, 30 Dec 2024 22:43:17 +0100
Subject: [PATCH 073/128] trying to fix scaling factor issue

---
 gffquant/annotation/genecount_annotator.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/gffquant/annotation/genecount_annotator.py b/gffquant/annotation/genecount_annotator.py
index b4f25689..63f0279d 100644
--- a/gffquant/annotation/genecount_annotator.py
+++ b/gffquant/annotation/genecount_annotator.py
@@ -59,7 +59,8 @@ def annotate2(self, refmgr, db: AnnotationDatabaseManager, counter: AlignmentCou
                 # elif it == 0:
                 #     self.unannotated_counts += counts[:4]
 
-            count_sums = category_counts[1:].sum(axis=0)
+            # count_sums = category_counts[1:].sum(axis=0)
+            count_sums = category_counts[0]
 
             # should scaled counts use a factor derived from all counts
             # or should multi-feature counts only contribute once?
@@ -72,9 +73,9 @@ def annotate2(self, refmgr, db: AnnotationDatabaseManager, counter: AlignmentCou
             # if category_counts[0][3]:
             #     ambig_scaling_factor = category_counts[0][2] / category_counts[0][3]
             uniq_scaling_factor, combined_scaling_factor = (
-            AlignmentCounter.calculate_scaling_factor(*count_sums[0:2]),
-            AlignmentCounter.calculate_scaling_factor(*count_sums[3:5]),
-        )
+                AlignmentCounter.calculate_scaling_factor(*count_sums[0:2]),
+                AlignmentCounter.calculate_scaling_factor(*count_sums[3:5]),
+            )
 
             logger.info(
                 "GCA:: %s CATEGORY COUNTS: uraw=%s unorm=%s araw=%s anorm=%s => SF: %s %s",

From b76f168edceff3b05a3bb4209884261faa7149e2 Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Mon, 30 Dec 2024 23:13:07 +0100
Subject: [PATCH 074/128] trying to fix scaling factor issue

---
 gffquant/annotation/genecount_annotator.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/gffquant/annotation/genecount_annotator.py b/gffquant/annotation/genecount_annotator.py
index 63f0279d..5d676dc2 100644
--- a/gffquant/annotation/genecount_annotator.py
+++ b/gffquant/annotation/genecount_annotator.py
@@ -77,6 +77,10 @@ def annotate2(self, refmgr, db: AnnotationDatabaseManager, counter: AlignmentCou
                 AlignmentCounter.calculate_scaling_factor(*count_sums[3:5]),
             )
 
+            # apply scaling factors
+            category_counts[:, 2] = category_counts[:, 1] * uniq_scaling_factor
+            category_counts[:, 5] = category_counts[:, 4] * combined_scaling_factor
+
             logger.info(
                 "GCA:: %s CATEGORY COUNTS: uraw=%s unorm=%s araw=%s anorm=%s => SF: %s %s",
                 category.name,

From 992040427c4bac10909b5cb5b75424f0c86cd4c6 Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Tue, 31 Dec 2024 03:00:12 +0100
Subject: [PATCH 075/128] trying to implement count matrix

---
 gffquant/annotation/count_writer.py          |   2 +-
 gffquant/annotation/genecount_annotator.py   |  79 +++---
 gffquant/annotation/regioncount_annotator.py |   1 +
 gffquant/counters/__init__.py                |   1 +
 gffquant/counters/alignment_counter.py       | 259 ++++++++++---------
 gffquant/profilers/feature_quantifier.py     |  32 +--
 6 files changed, 187 insertions(+), 187 deletions(-)

diff --git a/gffquant/annotation/count_writer.py b/gffquant/annotation/count_writer.py
index 98225835..7fd1bab6 100644
--- a/gffquant/annotation/count_writer.py
+++ b/gffquant/annotation/count_writer.py
@@ -187,7 +187,7 @@ def write_gene_counts(
                     refmgr.get(rid[0] if isinstance(rid, tuple) else rid)[0],
                     rid,
                 )
-                for rid in gene_counts.get_all_regions()
+                for rid, _ in gene_counts  #.get_all_regions()
             )
 
             for ref, rid in sorted(ref_stream):
diff --git a/gffquant/annotation/genecount_annotator.py b/gffquant/annotation/genecount_annotator.py
index 5d676dc2..c750f98b 100644
--- a/gffquant/annotation/genecount_annotator.py
+++ b/gffquant/annotation/genecount_annotator.py
@@ -20,7 +20,35 @@ def __init__(self, strand_specific, report_scaling_factors=True):
         """ __init__() """
         CountAnnotator.__init__(self, strand_specific, report_scaling_factors=report_scaling_factors)
 
-    def annotate2(self, refmgr, db: AnnotationDatabaseManager, counter: AlignmentCounter, gene_group_db=False):
+    # def annotate_gene_counts(self, refmgr, db: AnnotationDatabaseManager, counter: AlignmentCounter, gene_group_db=False):
+    #     category_sums = np.zeros((len(db.get_categories()), 6))
+    #     functional_counts = np.zeros(())
+    #     for rid in counter:
+    #         counts = counter[rid]
+    #         if gene_group_db:
+    #             ggroup_id = rid
+    #         else:
+    #             ref, _ = refmgr.get(rid[0] if isinstance(rid, tuple) else rid)
+    #             ggroup_id = ref
+
+    #         region_annotation = db.query_sequence(ggroup_id)
+    #         if region_annotation is not None:
+    #             _, _, region_annotation = region_annotation
+    #             for category_id, features in region_annotation:
+    #                 category_sums[int(category_id)] += counts
+
+
+    #                 category_features = dict(region_annotation).get(str(category.id))
+    #                 if category_features is not None:
+    #                     category_counts[0] += counts  # category row
+    #                     for cf in category_features:
+    #                         category_counts[category_index.get(int(cf))] += counts
+
+
+
+    def annotate(self, refmgr, db: AnnotationDatabaseManager, counter: AlignmentCounter, gene_group_db=False):
+        """ Annotate a set of gene counts with functional annotations. """
+
         for category in db.get_categories():
             features = tuple(db.get_features(category.id))
             category_counts = np.zeros(
@@ -36,18 +64,14 @@ def annotate2(self, refmgr, db: AnnotationDatabaseManager, counter: AlignmentCou
                 for feature in features
             }
             for rid in counter:
-                # counts = counter.get_counts(rid, strand_specific=self.strand_specific)
                 counts = counter[rid]
                 if gene_group_db:
-                    # gene_id, ggroup_id = rid, rid
                     ggroup_id = rid
                 else:
                     ref, _ = refmgr.get(rid[0] if isinstance(rid, tuple) else rid)
-                    # gene_id, ggroup_id = ref, ref
                     ggroup_id = ref
 
                 region_annotation = db.query_sequence(ggroup_id)
-                # logger.info("REGION_ANNOTATION: %s (%s)", str(region_annotation), ggroup_id)
                 if region_annotation is not None:
                     _, _, region_annotation = region_annotation
                     category_features = dict(region_annotation).get(str(category.id))
@@ -56,22 +80,11 @@ def annotate2(self, refmgr, db: AnnotationDatabaseManager, counter: AlignmentCou
                         for cf in category_features:
                             category_counts[category_index.get(int(cf))] += counts
 
-                # elif it == 0:
-                #     self.unannotated_counts += counts[:4]
-
-            # count_sums = category_counts[1:].sum(axis=0)
             count_sums = category_counts[0]
 
             # should scaled counts use a factor derived from all counts
             # or should multi-feature counts only contribute once?
-            # uniq_scaling_factor = (count_sums[0] / count_sums[1], 1.0)[count_sums[1] == 0]
-            # ambig_scaling_factor = (count_sums[2] / count_sums[3], 1.0)[count_sums[3] == 0]
             # pre 2.19 category count scaling was based on total counts
-            # uniq_scaling_factor, ambig_scaling_factor = 1.0, 1.0
-            # if category_counts[0][1]:
-            #     uniq_scaling_factor = category_counts[0][0] / category_counts[0][1]
-            # if category_counts[0][3]:
-            #     ambig_scaling_factor = category_counts[0][2] / category_counts[0][3]
             uniq_scaling_factor, combined_scaling_factor = (
                 AlignmentCounter.calculate_scaling_factor(*count_sums[0:2]),
                 AlignmentCounter.calculate_scaling_factor(*count_sums[3:5]),
@@ -84,7 +97,6 @@ def annotate2(self, refmgr, db: AnnotationDatabaseManager, counter: AlignmentCou
             logger.info(
                 "GCA:: %s CATEGORY COUNTS: uraw=%s unorm=%s araw=%s anorm=%s => SF: %s %s",
                 category.name,
-                # *count_sums,
                 count_sums[0], count_sums[1], count_sums[3], count_sums[4],
                 uniq_scaling_factor, combined_scaling_factor,
             )
@@ -97,36 +109,3 @@ def annotate2(self, refmgr, db: AnnotationDatabaseManager, counter: AlignmentCou
                 uniq_scaling_factor,
                 combined_scaling_factor,
             )
-
-    def annotate(self, refmgr, db: AnnotationDatabaseManager, counter: AlignmentCounter, gene_group_db=False):
-        """ Annotate a set of gene counts with functional annotations. """
-
-        # formerly used in compute_count_vector
-        strand_specific_counts = (
-            (counter.PLUS_STRAND, counter.MINUS_STRAND)
-            if self.strand_specific else None
-        )
-
-        for rid in counter.get_all_regions():
-            counts = counter.get_counts(rid, strand_specific=self.strand_specific)
-
-            if gene_group_db:
-                # ref_tokens = ref.split(".")
-                # gene_id, ggroup_id = ".".join(ref_tokens[:-1]), ref_tokens[-1]
-                # gene_id, ggroup_id = rid, rid
-                ggroup_id = rid
-            else:
-                ref, _ = refmgr.get(rid[0] if isinstance(rid, tuple) else rid)
-                # gene_id, ggroup_id = ref, ref
-                ggroup_id = ref
-
-            region_annotation = db.query_sequence(ggroup_id)
-            if region_annotation is not None:
-                _, _, region_annotation = region_annotation
-                self.distribute_feature_counts(counts, region_annotation)
-
-            else:
-                # logger.info("GCAnnotator: Gene %s (group=%s) has no information in database.", gene_id, ggroup_id)
-                self.unannotated_counts += counts[:4]
-
-        self.calculate_scaling_factors()
diff --git a/gffquant/annotation/regioncount_annotator.py b/gffquant/annotation/regioncount_annotator.py
index 84f85163..57931cff 100644
--- a/gffquant/annotation/regioncount_annotator.py
+++ b/gffquant/annotation/regioncount_annotator.py
@@ -10,6 +10,7 @@ class RegionCountAnnotator(CountAnnotator):
     """ CountAnnotator subclass for contig/region-based counting. """
 
     def __init__(self, strand_specific, report_scaling_factors=True):
+        raise NotImplementedError()
         CountAnnotator.__init__(self, strand_specific, report_scaling_factors=report_scaling_factors)
 
     # pylint: disable=R0914,W0613
diff --git a/gffquant/counters/__init__.py b/gffquant/counters/__init__.py
index d325316e..e82470ea 100644
--- a/gffquant/counters/__init__.py
+++ b/gffquant/counters/__init__.py
@@ -4,4 +4,5 @@
 """module docstring"""
 
 from .alignment_counter import AlignmentCounter
+from .count_matrix import CountMatrix
 from .region_counter import RegionCounter
diff --git a/gffquant/counters/alignment_counter.py b/gffquant/counters/alignment_counter.py
index 04276477..72faf1d9 100644
--- a/gffquant/counters/alignment_counter.py
+++ b/gffquant/counters/alignment_counter.py
@@ -9,6 +9,7 @@
 
 import numpy as np
 
+from . import CountMatrix
 from .. import DistributionMode
 
 
@@ -68,13 +69,15 @@ def __init__(
         self.increments_auto_detect = (1.0, self.paired_end_count / 2.0,)
         self.unannotated_reads = 0
 
-        self.index = {}
-        self.counts = np.zeros(
-            (AlignmentCounter.INITIAL_SIZE, 2,),
-            dtype='float64',
-        )
+        # self.index = {}
+        # self.counts = np.zeros(
+        #     (AlignmentCounter.INITIAL_SIZE, 2,),
+        #     dtype='float64',
+        # )
+        self.counts = CountMatrix(2, nrows=AlignmentCounter.INITIAL_SIZE)
 
     def dump(self, prefix, refmgr):
+        raise NotImplementedError()
         with gzip.open(f"{prefix}.{self.__class__.__name__}.txt.gz", "wt") as _out:
             for key, key_index in self.index.items():
                 ref, reflen = refmgr.get(key[0] if isinstance(key, tuple) else key)
@@ -83,33 +86,34 @@ def dump(self, prefix, refmgr):
             # ref, reflen = refmgr.get(k[0] if isinstance(k, tuple) else k)
             # print(k, ref, reflen, v, sep="\t", file=_out)
 
-    def get(self, key, default_val):
-        key_index = self.index.get(key)
-        if key_index is None:
-            return Counter()
-        return Counter({key: self.counts[key_index]})
+    # def get(self, key, default_val):
+    #     key_index = self.index.get(key)
+    #     if key_index is None:
+    #         return Counter()
+    #     return Counter({key: self.counts[key_index]})
 
-    def setdefault(self, key, default_val):
-        ...
+    # def setdefault(self, key, default_val):
+    #     ...
 
     def has_ambig_counts(self):
-        return bool(self.counts[:, 1].sum() != 0)
+        # return bool(self.counts[:, 1].sum() != 0)
+        return bool(self.counts.colsum(1) != 0)
 
-    def __iter__(self):
-        yield from self.index.keys()
+    # def __iter__(self):
+    #     yield from self.index.keys()
 
-    def __getitem__(self, key):
-        key_index = self.index.get(key)
-        if key_index is None:
-            return 0.0
-        return self.counts[key_index]
+    # def __getitem__(self, key):
+    #     key_index = self.index.get(key)
+    #     if key_index is None:
+    #         return 0.0
+    #     return self.counts[key_index]
 
-    def __setitem__(self, key, value):
-        key_index = self.index.get(key)
-        if key_index is not None:
-            self.counts[key_index] = value
-        else:
-            raise KeyError(f"{key=} not found.")
+    # def __setitem__(self, key, value):
+    #     key_index = self.index.get(key)
+    #     if key_index is not None:
+    #         self.counts[key_index] = value
+    #     else:
+    #         raise KeyError(f"{key=} not found.")
 
     def update(self, count_stream, ambiguous_counts=False, pair=False, pe_library=None,):
         if pe_library is not None:
@@ -131,32 +135,33 @@ def update(self, count_stream, ambiguous_counts=False, pair=False, pe_library=No
 
     def get_unannotated_reads(self):
         # return self.unannotated_reads
-        no_annotation = self.index.get("c591b65a0f4cd46d5125745a40c8c056")
-        if no_annotation is not None:
-            return self.counts[no_annotation][0]
-        return 0.0
-
-    def get_counts(self, seqid, strand_specific=False):
-        if strand_specific:
-            raise NotImplementedError()
-            # uniq_counts, ambig_counts = [0.0, 0.0], [0.0, 0.0]
-            # uniq_counts[seqid[1]] = uniq_counter[seqid]
-            # ambig_counts[seqid[1]] = ambig_counter[seqid]
-
-            # rid = seqid[0] if isinstance(seqid, tuple) else seqid
-            # uniq_counts = [
-            #     uniq_counter[(rid, AlignmentCounter.PLUS_STRAND)],
-            #     uniq_counter[(rid, AlignmentCounter.MINUS_STRAND)],
-            # ]
-            # ambig_counts = [
-            #     ambig_counter[(rid, AlignmentCounter.PLUS_STRAND)],
-            #     ambig_counter[(rid, AlignmentCounter.MINUS_STRAND)],
-            # ]
-        counts = self[seqid]
-        return np.array((counts[0], counts[2], counts[1], counts[3]))
-
-    def get_all_regions(self):
-        yield from self
+        return self.counts["c591b65a0f4cd46d5125745a40c8c056"][0]
+        # no_annotation = self.index.get("c591b65a0f4cd46d5125745a40c8c056")
+        # if no_annotation is not None:
+        #     return self.counts[no_annotation][0]
+        # return 0.0
+
+    # def get_counts(self, seqid, strand_specific=False):
+    #     if strand_specific:
+    #         raise NotImplementedError()
+    #         # uniq_counts, ambig_counts = [0.0, 0.0], [0.0, 0.0]
+    #         # uniq_counts[seqid[1]] = uniq_counter[seqid]
+    #         # ambig_counts[seqid[1]] = ambig_counter[seqid]
+
+    #         # rid = seqid[0] if isinstance(seqid, tuple) else seqid
+    #         # uniq_counts = [
+    #         #     uniq_counter[(rid, AlignmentCounter.PLUS_STRAND)],
+    #         #     uniq_counter[(rid, AlignmentCounter.MINUS_STRAND)],
+    #         # ]
+    #         # ambig_counts = [
+    #         #     ambig_counter[(rid, AlignmentCounter.PLUS_STRAND)],
+    #         #     ambig_counter[(rid, AlignmentCounter.MINUS_STRAND)],
+    #         # ]
+    #     counts = self[seqid]
+    #     return np.array((counts[0], counts[2], counts[1], counts[3]))
+
+    # def get_all_regions(self):
+    #     yield from self
 
     def update_counts(self, count_stream, increment=1, ambiguous_counts=False):
         contributed_counts = 0
@@ -175,17 +180,18 @@ def update_counts(self, count_stream, increment=1, ambiguous_counts=False):
                 )
             )[self.strand_specific]
 
-            key_index = self.index.get(key)
-            if key_index is None:
-                nrows = self.counts.shape[0]
-                if len(self.index) == nrows:
-                    self.counts = np.pad(
-                        self.counts,
-                        ((0, AlignmentCounter.INITIAL_SIZE), (0, 0),),
-                    )
-                # key_index = self.index.setdefault(key, len(self.index))
-                key_index = self.index[key] = len(self.index)
-            self.counts[key_index][int(ambiguous_counts)] += inc
+            # key_index = self.index.get(key)
+            # if key_index is None:
+            #     nrows = self.counts.shape[0]
+            #     if len(self.index) == nrows:
+            #         self.counts = np.pad(
+            #             self.counts,
+            #             ((0, AlignmentCounter.INITIAL_SIZE), (0, 0),),
+            #         )
+            #     # key_index = self.index.setdefault(key, len(self.index))
+            #     key_index = self.index[key] = len(self.index)
+            # self.counts[key_index][int(ambiguous_counts)] += inc
+            self.counts[key][int(ambiguous_counts)] += inc
             contributed_counts += inc
 
         return contributed_counts
@@ -196,56 +202,61 @@ def generate_gene_count_matrix(self, refmgr):
         # uniq_raw, combined_raw, uniq_lnorm, combined_lnorm
 
         # obtain gene lengths
-        lengths = np.array(
+        gene_lengths = np.array(
             tuple(
                 (refmgr.get(key[0] if isinstance(key, tuple) else key))[1]
                 for key in self.index
             )
         )
-        logger.info("LENGTHS ARRAY = %s", lengths.shape)
-        logger.info("INDEX SIZE = %s", len(self.index))
-
-        # remove the un-indexed rows
-        self.counts = self.counts[0:len(self.index), :]
-
-        # calculate combined_raw
-        self.counts[:, 1:2] += self.counts[:, 0:1]
-
-        # duplicate the raw counts
-        self.counts = np.column_stack(
-            #(self.counts, self.counts, self.counts,),
-            (
-                self.counts[:, 0], self.counts[:, 0], self.counts[:, 0],  # 0, 1, 2
-                self.counts[:, 1], self.counts[:, 1], self.counts[:, 1],  # 3, 4, 5
-            ),
-            # axis=1,
-        )
 
-        # length-normalise the lnorm columns
-        # self.counts[:, 2:4] /= lengths[:, None]
-        self.counts[:, 1::3] /= lengths[:, None]
+        self.counts = self.counts.generate_gene_counts(gene_lengths)
 
-        count_sums = self.counts.sum(axis=0)
+        return self.counts.sum()
 
-        # uniq_scaling_factor = (count_sums[0] / count_sums[2], 1.0)[count_sums[2] == 0]
-        # ambig_scaling_factor = (count_sums[1] / count_sums[3], 1.0)[count_sums[3] == 0]
-        uniq_scaling_factor, combined_scaling_factor = (
-            AlignmentCounter.calculate_scaling_factor(*count_sums[0:2]),
-            AlignmentCounter.calculate_scaling_factor(*count_sums[3:5]),
-        )
+        # logger.info("LENGTHS ARRAY = %s", lengths.shape)
+        # logger.info("INDEX SIZE = %s", len(self.index))
 
-        logger.info(
-            "AC:: TOTAL GENE COUNTS: uraw=%s unorm=%s craw=%s cnorm=%s => SF: %s %s",
-            count_sums[0], count_sums[1], count_sums[3], count_sums[4],
-            uniq_scaling_factor, combined_scaling_factor,
-        )
+        # # remove the un-indexed rows
+        # self.counts = self.counts[0:len(self.index), :]
+
+        # # calculate combined_raw
+        # self.counts[:, 1:2] += self.counts[:, 0:1]
+
+        # # duplicate the raw counts
+        # self.counts = np.column_stack(
+        #     #(self.counts, self.counts, self.counts,),
+        #     (
+        #         self.counts[:, 0], self.counts[:, 0], self.counts[:, 0],  # 0, 1, 2
+        #         self.counts[:, 1], self.counts[:, 1], self.counts[:, 1],  # 3, 4, 5
+        #     ),
+        #     # axis=1,
+        # )
 
-        # apply scaling factors
-        self.counts[:, 2] = self.counts[:, 1] * uniq_scaling_factor
-        self.counts[:, 5] = self.counts[:, 4] * combined_scaling_factor
+        # # length-normalise the lnorm columns
+        # # self.counts[:, 2:4] /= lengths[:, None]
+        # self.counts[:, 1::3] /= lengths[:, None]
 
-        # return count sums and scaling factors
-        return count_sums, uniq_scaling_factor, combined_scaling_factor
+        # count_sums = self.counts.sum(axis=0)
+
+        # # uniq_scaling_factor = (count_sums[0] / count_sums[2], 1.0)[count_sums[2] == 0]
+        # # ambig_scaling_factor = (count_sums[1] / count_sums[3], 1.0)[count_sums[3] == 0]
+        # uniq_scaling_factor, combined_scaling_factor = (
+        #     AlignmentCounter.calculate_scaling_factor(*count_sums[0:2]),
+        #     AlignmentCounter.calculate_scaling_factor(*count_sums[3:5]),
+        # )
+
+        # logger.info(
+        #     "AC:: TOTAL GENE COUNTS: uraw=%s unorm=%s craw=%s cnorm=%s => SF: %s %s",
+        #     count_sums[0], count_sums[1], count_sums[3], count_sums[4],
+        #     uniq_scaling_factor, combined_scaling_factor,
+        # )
+
+        # # apply scaling factors
+        # self.counts[:, 2] = self.counts[:, 1] * uniq_scaling_factor
+        # self.counts[:, 5] = self.counts[:, 4] * combined_scaling_factor
+
+        # # return count sums and scaling factors
+        # return count_sums, uniq_scaling_factor, combined_scaling_factor
     
     @staticmethod
     def calculate_scaling_factor(raw, norm):
@@ -254,21 +265,29 @@ def calculate_scaling_factor(raw, norm):
         return raw / norm
 
     def group_gene_count_matrix(self, refmgr):
-        ggroup_index = {}
-        for key, key_index in self.index.items():
-            ref = (refmgr.get(key[0] if isinstance(key, tuple) else key))[0]
-            ref_tokens = ref.split(".")
-            _, ggroup_id = ".".join(ref_tokens[:-1]), ref_tokens[-1]
-            g_key_index = ggroup_index.get(ggroup_id)
-            gene_counts = self.counts[key_index]
-            if g_key_index is None:
-                g_key_index = ggroup_index[ggroup_id] = len(ggroup_index)
-                self.counts[g_key_index] = gene_counts
-            else:
-                self.counts[g_key_index] += gene_counts
-
-        # replace index with grouped index
-        self.index = ggroup_index
-
-        # remove the un-indexed (ungrouped) rows
-        self.counts = self.counts[0:len(self.index), :]
+
+        ggroups = (
+            (refmgr.get(key[0] if isinstance(key, tuple) else key))[0].split(".")[-1]
+            for key, _ in self.counts
+        )
+
+        self.counts = self.counts.group_gene_counts(ggroups)
+
+        # ggroup_index = {}
+        # for key, key_index in self.index.items():
+        #     ref = (refmgr.get(key[0] if isinstance(key, tuple) else key))[0]
+        #     ref_tokens = ref.split(".")
+        #     _, ggroup_id = ".".join(ref_tokens[:-1]), ref_tokens[-1]
+        #     g_key_index = ggroup_index.get(ggroup_id)
+        #     gene_counts = self.counts[key_index]
+        #     if g_key_index is None:
+        #         g_key_index = ggroup_index[ggroup_id] = len(ggroup_index)
+        #         self.counts[g_key_index] = gene_counts
+        #     else:
+        #         self.counts[g_key_index] += gene_counts
+
+        # # replace index with grouped index
+        # self.index = ggroup_index
+
+        # # remove the un-indexed (ungrouped) rows
+        # self.counts = self.counts[0:len(self.index), :]
diff --git a/gffquant/profilers/feature_quantifier.py b/gffquant/profilers/feature_quantifier.py
index d1480f57..858b1670 100644
--- a/gffquant/profilers/feature_quantifier.py
+++ b/gffquant/profilers/feature_quantifier.py
@@ -152,22 +152,22 @@ def process_counters(
         self.counter.group_gene_count_matrix(self.reference_manager)
         unannotated_reads = self.counter.get_unannotated_reads() + self.aln_counter["unannotated_ambig"]
 
-        for category, c_counts, c_index, c_names, u_sf, a_sf in count_annotator.annotate2(
-            self.reference_manager,
-            self.adm,
-            self.counter,
-            gene_group_db=gene_group_db,
-        ):
-            logger.info("PROCESSING CATEGORY=%s", category)
-            count_writer.write_category(
-                category,
-                c_counts,
-                c_index,
-                c_names,
-                u_sf,
-                a_sf,
-                unannotated_reads=(None, unannotated_reads)[report_unannotated],
-            )
+        # for category, c_counts, c_index, c_names, u_sf, a_sf in count_annotator.annotate(
+        #     self.reference_manager,
+        #     self.adm,
+        #     self.counter,
+        #     gene_group_db=gene_group_db,
+        # ):
+        #     logger.info("PROCESSING CATEGORY=%s", category)
+        #     count_writer.write_category(
+        #         category,
+        #         c_counts,
+        #         c_index,
+        #         c_names,
+        #         u_sf,
+        #         a_sf,
+        #         unannotated_reads=(None, unannotated_reads)[report_unannotated],
+        #     )
 
         self.adm.clear_caches()
 

From 945cf8e89cd8727d0d2de1d6b3be13b968a65984 Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Tue, 31 Dec 2024 03:12:24 +0100
Subject: [PATCH 076/128] trying to implement count matrix

---
 gffquant/counters/__init__.py          | 2 +-
 gffquant/counters/alignment_counter.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/gffquant/counters/__init__.py b/gffquant/counters/__init__.py
index e82470ea..2b5426c3 100644
--- a/gffquant/counters/__init__.py
+++ b/gffquant/counters/__init__.py
@@ -4,5 +4,5 @@
 """module docstring"""
 
 from .alignment_counter import AlignmentCounter
-from .count_matrix import CountMatrix
+# from .count_matrix import CountMatrix
 from .region_counter import RegionCounter
diff --git a/gffquant/counters/alignment_counter.py b/gffquant/counters/alignment_counter.py
index 72faf1d9..b13297f9 100644
--- a/gffquant/counters/alignment_counter.py
+++ b/gffquant/counters/alignment_counter.py
@@ -9,7 +9,7 @@
 
 import numpy as np
 
-from . import CountMatrix
+from .count_matrix import CountMatrix
 from .. import DistributionMode
 
 

From 07f85f096118b944aad34627aaccfce42a8df594 Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Tue, 31 Dec 2024 03:15:32 +0100
Subject: [PATCH 077/128] trying to implement count matrix

---
 gffquant/counters/alignment_counter.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gffquant/counters/alignment_counter.py b/gffquant/counters/alignment_counter.py
index b13297f9..8ec11878 100644
--- a/gffquant/counters/alignment_counter.py
+++ b/gffquant/counters/alignment_counter.py
@@ -9,7 +9,7 @@
 
 import numpy as np
 
-from .count_matrix import CountMatrix
+from count_matrix import CountMatrix
 from .. import DistributionMode
 
 

From dbb36da3b59c7fc849633ab09c7265afb57d92a3 Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Tue, 31 Dec 2024 03:16:49 +0100
Subject: [PATCH 078/128] trying to implement count matrix

---
 gffquant/counters/alignment_counter.py |   2 +-
 gffquant/counters/count_matrix.py      | 110 +++++++++++++++++++++++++
 2 files changed, 111 insertions(+), 1 deletion(-)
 create mode 100644 gffquant/counters/count_matrix.py

diff --git a/gffquant/counters/alignment_counter.py b/gffquant/counters/alignment_counter.py
index 8ec11878..b13297f9 100644
--- a/gffquant/counters/alignment_counter.py
+++ b/gffquant/counters/alignment_counter.py
@@ -9,7 +9,7 @@
 
 import numpy as np
 
-from count_matrix import CountMatrix
+from .count_matrix import CountMatrix
 from .. import DistributionMode
 
 
diff --git a/gffquant/counters/count_matrix.py b/gffquant/counters/count_matrix.py
new file mode 100644
index 00000000..7fcf86e3
--- /dev/null
+++ b/gffquant/counters/count_matrix.py
@@ -0,0 +1,110 @@
+import logging
+
+import numpy as np
+
+
+logger = logging.getLogger(__name__)
+
+class CountMatrix:
+
+    @staticmethod
+    def calculate_scaling_factor(raw, norm):
+        if norm == 0.0:
+            return 1.0
+        return raw / norm
+
+    def __init__(self, ncols, nrows=1000):
+        self.index = {}
+        self.counts = np.zeros(
+            (nrows, ncols,),
+            dtype='float64',
+        )
+
+    def _resize(self):
+        nrows = self.counts.shape[0]
+        if len(self.index) == nrows:
+            self.counts = np.pad(
+                self.counts,
+                ((0, nrows * 2), (0, 0),),
+            )
+        return len(self.index)
+    
+    def __getitem__(self, key):
+        key_index = self.index.get(key)
+        if key_index is None:
+            key_index = self.index[key] = self._resize()
+        return self.counts[key_index]
+    
+    def __setitem__(self, key, value):
+        key_index = self.index.get(key)
+        if key_index is None:			
+            key_index = self.index[key] = self._resize()
+        self.counts[key_index] = value
+
+    def __iter__(self):
+        yield from zip(self.index.keys(), self.counts)
+
+    def sum(self):
+        return self.counts.sum(axis=0)
+
+    def generate_gene_counts(self, lengths):
+        logger.info("LENGTHS ARRAY = %s", lengths.shape)
+        logger.info("INDEX SIZE = %s", len(self.index))
+
+        # remove the un-indexed rows
+        counts = self.counts[0:len(self.index), :]
+
+        # calculate combined_raw
+        counts[:, 1:2] += counts[:, 0:1]
+
+        # duplicate the raw counts
+        counts = np.column_stack(
+            (
+                counts[:, 0], counts[:, 0], counts[:, 0],  # 0, 1, 2
+                counts[:, 1], counts[:, 1], counts[:, 1],  # 3, 4, 5
+            ),
+        )
+
+        # length-normalise the lnorm columns
+        counts[:, 1::3] /= lengths[:, None]
+
+        count_sums = counts.sum(axis=0)
+
+        uniq_scaling_factor, combined_scaling_factor = (
+            CountMatrix.calculate_scaling_factor(*count_sums[0:2]),
+            CountMatrix.calculate_scaling_factor(*count_sums[3:5]),
+        )
+
+        logger.info(
+            "AC:: TOTAL GENE COUNTS: uraw=%s unorm=%s craw=%s cnorm=%s => SF: %s %s",
+            count_sums[0], count_sums[1], count_sums[3], count_sums[4],
+            uniq_scaling_factor, combined_scaling_factor,
+        )
+
+        # apply scaling factors
+        counts[:, 2] = counts[:, 1] * uniq_scaling_factor
+        counts[:, 5] = counts[:, 4] * combined_scaling_factor
+
+        return self
+    
+    def group_gene_counts(self, ggroups):
+        ggroup_index = {}
+        for (key, key_index), ggroup_id in zip(self.index.items(), ggroups):
+            g_key_index = ggroup_index.get(ggroup_id)
+            gene_counts = self.counts[self.index[key]]
+            if g_key_index is None:
+                g_key_index = ggroup_index[ggroup_id] = len(ggroup_index)
+                self.counts[g_key_index] = gene_counts
+            else:
+                self.counts[g_key_index] += gene_counts
+
+        # replace index with grouped index
+        self.index = ggroup_index
+
+        # remove the un-indexed (ungrouped) rows
+        self.counts = self.counts[0:len(self.index), :]
+
+        return self
+    
+    def colsum(self, col):
+        return self.counts[:, col].sum()
\ No newline at end of file

From b281a16c32decf6439a91f372d9af8bebef27dc8 Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Tue, 31 Dec 2024 03:25:00 +0100
Subject: [PATCH 079/128] trying to implement count matrix

---
 gffquant/counters/alignment_counter.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gffquant/counters/alignment_counter.py b/gffquant/counters/alignment_counter.py
index b13297f9..5606a43c 100644
--- a/gffquant/counters/alignment_counter.py
+++ b/gffquant/counters/alignment_counter.py
@@ -205,7 +205,7 @@ def generate_gene_count_matrix(self, refmgr):
         gene_lengths = np.array(
             tuple(
                 (refmgr.get(key[0] if isinstance(key, tuple) else key))[1]
-                for key in self.index
+                for key, _ in self.counts
             )
         )
 

From 0b64813d13748d21f36ca2e5f56fde53ac23b9c5 Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Tue, 31 Dec 2024 10:44:45 +0100
Subject: [PATCH 080/128] trying to implement count matrix

---
 gffquant/profilers/feature_quantifier.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gffquant/profilers/feature_quantifier.py b/gffquant/profilers/feature_quantifier.py
index 858b1670..381a950f 100644
--- a/gffquant/profilers/feature_quantifier.py
+++ b/gffquant/profilers/feature_quantifier.py
@@ -139,7 +139,7 @@ def process_counters(
             filtered_readcount=self.aln_counter["filtered_read_count"],
         )
 
-        total_gene_counts, u_sf, a_sf = self.counter.generate_gene_count_matrix(self.reference_manager)
+        total_gene_counts = self.counter.generate_gene_count_matrix(self.reference_manager)
         logger.info("TOTAL_GENE_COUNTS = %s", total_gene_counts)
 
         count_writer.write_gene_counts(

From 7f93e353c9e19090eb7832883d1f875e8bcf7a54 Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Tue, 31 Dec 2024 11:06:27 +0100
Subject: [PATCH 081/128] trying to implement count matrix

---
 gffquant/annotation/count_writer.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/gffquant/annotation/count_writer.py b/gffquant/annotation/count_writer.py
index 7fd1bab6..5fd43f25 100644
--- a/gffquant/annotation/count_writer.py
+++ b/gffquant/annotation/count_writer.py
@@ -173,12 +173,12 @@ def write_gene_counts(
         self,
         gene_counts: AlignmentCounter,
         refmgr,
-        uniq_scaling_factor,
-        ambig_scaling_factor,
+        # uniq_scaling_factor,
+        # ambig_scaling_factor,
         gene_group_db=False
     ):
-        if "scaled" in self.publish_reports:
-            logger.info("SCALING_FACTORS %s %s", uniq_scaling_factor, ambig_scaling_factor)
+        # if "scaled" in self.publish_reports:
+        #     logger.info("SCALING_FACTORS %s %s", uniq_scaling_factor, ambig_scaling_factor)
         with gzip.open(f"{self.out_prefix}.gene_counts.txt.gz", "wt") as gene_out:
             print("gene", *self.get_header(), sep="\t", file=gene_out, flush=True)
 

From b8bdf080fc8690d219870143d9a335e598c6c282 Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Tue, 31 Dec 2024 11:15:49 +0100
Subject: [PATCH 082/128] trying to implement count matrix

---
 gffquant/profilers/feature_quantifier.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gffquant/profilers/feature_quantifier.py b/gffquant/profilers/feature_quantifier.py
index 381a950f..7142f5ef 100644
--- a/gffquant/profilers/feature_quantifier.py
+++ b/gffquant/profilers/feature_quantifier.py
@@ -145,7 +145,7 @@ def process_counters(
         count_writer.write_gene_counts(
             self.counter,
             self.reference_manager,
-            u_sf, a_sf,
+            # u_sf, a_sf,
             gene_group_db=gene_group_db,
         )
 

From 85d1def473517f6805030f0ffb40814d4dbed026 Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Tue, 31 Dec 2024 11:34:20 +0100
Subject: [PATCH 083/128] trying to implement count matrix

---
 gffquant/counters/alignment_counter.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/gffquant/counters/alignment_counter.py b/gffquant/counters/alignment_counter.py
index 5606a43c..3da911bf 100644
--- a/gffquant/counters/alignment_counter.py
+++ b/gffquant/counters/alignment_counter.py
@@ -99,8 +99,9 @@ def has_ambig_counts(self):
         # return bool(self.counts[:, 1].sum() != 0)
         return bool(self.counts.colsum(1) != 0)
 
-    # def __iter__(self):
-    #     yield from self.index.keys()
+    def __iter__(self):
+        # yield from self.index.keys()
+        yield from self.counts
 
     # def __getitem__(self, key):
     #     key_index = self.index.get(key)

From e325127d57578ed03ced12464599ce30b395cdc6 Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Tue, 31 Dec 2024 11:53:13 +0100
Subject: [PATCH 084/128] trying to implement count matrix

---
 gffquant/counters/alignment_counter.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/gffquant/counters/alignment_counter.py b/gffquant/counters/alignment_counter.py
index 3da911bf..9ca23e83 100644
--- a/gffquant/counters/alignment_counter.py
+++ b/gffquant/counters/alignment_counter.py
@@ -103,6 +103,9 @@ def __iter__(self):
         # yield from self.index.keys()
         yield from self.counts
 
+    def __getitem__(self, key):
+        return self.counts[key]
+
     # def __getitem__(self, key):
     #     key_index = self.index.get(key)
     #     if key_index is None:

From c1c93d14f50f1f6b974d633e775af0d1101871ee Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Tue, 31 Dec 2024 12:05:02 +0100
Subject: [PATCH 085/128] trying to implement count matrix

---
 gffquant/counters/count_matrix.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/gffquant/counters/count_matrix.py b/gffquant/counters/count_matrix.py
index 7fcf86e3..620bf165 100644
--- a/gffquant/counters/count_matrix.py
+++ b/gffquant/counters/count_matrix.py
@@ -85,6 +85,8 @@ def generate_gene_counts(self, lengths):
         counts[:, 2] = counts[:, 1] * uniq_scaling_factor
         counts[:, 5] = counts[:, 4] * combined_scaling_factor
 
+        self.counts = counts
+
         return self
     
     def group_gene_counts(self, ggroups):

From 21fd9631fe9a183a50cc6d96864d00d7962a68b5 Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Wed, 1 Jan 2025 02:39:51 +0100
Subject: [PATCH 086/128] reactivated feature output

---
 gffquant/annotation/count_writer.py        | 43 ++++++++++++
 gffquant/annotation/genecount_annotator.py | 76 +++++++++++++++-------
 gffquant/counters/count_matrix.py          | 16 ++++-
 gffquant/profilers/feature_quantifier.py   | 23 +++++++
 4 files changed, 134 insertions(+), 24 deletions(-)

diff --git a/gffquant/annotation/count_writer.py b/gffquant/annotation/count_writer.py
index 5fd43f25..b65afa45 100644
--- a/gffquant/annotation/count_writer.py
+++ b/gffquant/annotation/count_writer.py
@@ -110,6 +110,49 @@ def compile_block(raw, lnorm, scaling_factors):
     def write_row(header, data, stream=sys.stdout):
         print(header, *(f"{c:.5f}" for c in data), flush=True, sep="\t", file=stream)
 
+
+    def write_category2(
+        self,
+        category_id,
+        category_name,
+        category_sum,
+        counts,
+        feature_names,
+        unannotated_reads=None,
+        report_unseen=True,
+    ):
+        with gzip.open(f"{self.out_prefix}.{category_name}.txt.gz", "wt") as feat_out:
+            header = self.get_header()
+            print("feature", *header, sep="\t", file=feat_out)
+
+            if unannotated_reads is not None:
+                print("unannotated", unannotated_reads, sep="\t", file=feat_out)
+
+            if "total_readcount" in self.publish_reports:
+                CountWriter.write_row(
+                    "total_reads",
+                    np.zeros(len(header)) + self.total_readcount,
+                    stream=feat_out,
+                )
+
+            if "filtered_readcount" in self.publish_reports:
+                CountWriter.write_row(
+                    "filtered_reads",
+                    np.zeros(len(header)) + self.filtered_readcount,
+                    stream=feat_out,
+                )
+
+            if "category" in self.publish_reports:
+                cat_counts = counts[0]
+                logger.info("CAT %s: %s", category_name, str(cat_counts))
+                if cat_counts is not None:
+                    CountWriter.write_row("category", category_sum, stream=feat_out)
+
+            for (cid, fid), fcounts in counts:
+                if (report_unseen or fcounts.sum()) and cid == category_id:
+                    CountWriter.write_row(feature_names[fid], fcounts, stream=feat_out,)
+
+
     def write_category(
         self,
         category,
diff --git a/gffquant/annotation/genecount_annotator.py b/gffquant/annotation/genecount_annotator.py
index c750f98b..af63b220 100644
--- a/gffquant/annotation/genecount_annotator.py
+++ b/gffquant/annotation/genecount_annotator.py
@@ -7,6 +7,7 @@
 
 from .count_annotator import CountAnnotator
 from ..counters import AlignmentCounter
+from ..counters.count_matrix import CountMatrix
 from ..db.annotation_db import AnnotationDatabaseManager
 
 
@@ -20,29 +21,58 @@ def __init__(self, strand_specific, report_scaling_factors=True):
         """ __init__() """
         CountAnnotator.__init__(self, strand_specific, report_scaling_factors=report_scaling_factors)
 
-    # def annotate_gene_counts(self, refmgr, db: AnnotationDatabaseManager, counter: AlignmentCounter, gene_group_db=False):
-    #     category_sums = np.zeros((len(db.get_categories()), 6))
-    #     functional_counts = np.zeros(())
-    #     for rid in counter:
-    #         counts = counter[rid]
-    #         if gene_group_db:
-    #             ggroup_id = rid
-    #         else:
-    #             ref, _ = refmgr.get(rid[0] if isinstance(rid, tuple) else rid)
-    #             ggroup_id = ref
-
-    #         region_annotation = db.query_sequence(ggroup_id)
-    #         if region_annotation is not None:
-    #             _, _, region_annotation = region_annotation
-    #             for category_id, features in region_annotation:
-    #                 category_sums[int(category_id)] += counts
-
-
-    #                 category_features = dict(region_annotation).get(str(category.id))
-    #                 if category_features is not None:
-    #                     category_counts[0] += counts  # category row
-    #                     for cf in category_features:
-    #                         category_counts[category_index.get(int(cf))] += counts
+    def annotate_gene_counts(self, refmgr, db: AnnotationDatabaseManager, counter: AlignmentCounter, gene_group_db=False):
+        categories = list(db.get_categories())
+        category_sums = np.zeros((len(categories), 6))
+        functional_counts = CountMatrix(6)
+
+        for category_id in categories:
+            features = db.get_features(category_id)
+            for feature_id in sorted(features):
+                _ = functional_counts[(category_id, feature_id)]
+
+        for rid in counter:
+            counts = counter[rid]
+            if gene_group_db:
+                ggroup_id = rid
+            else:
+                ref, _ = refmgr.get(rid[0] if isinstance(rid, tuple) else rid)
+                ggroup_id = ref
+
+            region_annotation = db.query_sequence(ggroup_id)
+            if region_annotation is not None:
+                _, _, region_annotation = region_annotation
+                for category_id, features in region_annotation:
+                    category_id = int(category_id)
+                    category_sums[category_id] += counts
+                    for feature_id in features:
+                        feature_id = int(feature_id)
+                        functional_counts[(category_id, feature_id)] += counts[:4]
+        
+        for i, category_id in enumerate(categories):
+            u_sf, c_sf = (
+                CountMatrix.calculate_scaling_factor(*category_sums[i][0:2]),
+                CountMatrix.calculate_scaling_factor(*category_sums[i][2:4]),
+            )
+
+            category_id = int(category_id)
+
+            rows = tuple(
+                key[0] == category_id
+                for key, _ in functional_counts
+            )
+
+            functional_counts.scale_column(1, u_sf, rows=rows)
+            functional_counts.scale_column(4, c_sf, rows=rows)
+
+            category_sums[i, 2] = category_sums[i, 1] * u_sf
+            category_sums[i, 5] = category_sums[i, 4] * c_sf
+
+        return functional_counts, category_sums
+            
+
+
+                    
 
 
 
diff --git a/gffquant/counters/count_matrix.py b/gffquant/counters/count_matrix.py
index 620bf165..347af127 100644
--- a/gffquant/counters/count_matrix.py
+++ b/gffquant/counters/count_matrix.py
@@ -47,6 +47,13 @@ def __iter__(self):
     def sum(self):
         return self.counts.sum(axis=0)
 
+    def scale_column(self, col_index, factor, rows=None):
+        # apply scaling factors
+        if rows is None:
+            self.counts[:, col_index + 1] = self.counts[:, col_index] * factor
+        else:
+            self.counts[rows, col_index + 1] = self.counts[rows, col_index] * factor
+
     def generate_gene_counts(self, lengths):
         logger.info("LENGTHS ARRAY = %s", lengths.shape)
         logger.info("INDEX SIZE = %s", len(self.index))
@@ -109,4 +116,11 @@ def group_gene_counts(self, ggroups):
         return self
     
     def colsum(self, col):
-        return self.counts[:, col].sum()
\ No newline at end of file
+        return self.counts[:, col].sum()
+    
+    def get_category(self, category_id):
+        rows = tuple(
+            cid == category_id
+            for (cid, _), _ in self
+        )
+        return self
\ No newline at end of file
diff --git a/gffquant/profilers/feature_quantifier.py b/gffquant/profilers/feature_quantifier.py
index 7142f5ef..e0745ea7 100644
--- a/gffquant/profilers/feature_quantifier.py
+++ b/gffquant/profilers/feature_quantifier.py
@@ -152,6 +152,29 @@ def process_counters(
         self.counter.group_gene_count_matrix(self.reference_manager)
         unannotated_reads = self.counter.get_unannotated_reads() + self.aln_counter["unannotated_ambig"]
 
+        functional_counts, category_sums = count_annotator.annotate_gene_counts(
+            self.reference_manager,
+            self.adm,
+            self.counter,
+            gene_group_db=gene_group_db,
+        )
+
+        categories = self.adm.get_categories()
+        for category, category_sum in zip(categories, category_sums):
+            feature_names = {
+                feature.id: feature.name
+                for feature in self.adm.get_features(category.id)
+            }
+            logger.info("PROCESSING CATEGORY=%s", category)
+            count_writer.write_category2(
+                category.id,
+                category.name,
+                category_sum,
+                functional_counts,
+                feature_names,
+                unannotated_reads=(None, unannotated_reads)[report_unannotated],
+            )
+
         # for category, c_counts, c_index, c_names, u_sf, a_sf in count_annotator.annotate(
         #     self.reference_manager,
         #     self.adm,

From a47b87e77bea0c8a25ae5a513ef9d980193e85b9 Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Wed, 1 Jan 2025 02:50:06 +0100
Subject: [PATCH 087/128] reactivated feature output

---
 gffquant/annotation/genecount_annotator.py | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/gffquant/annotation/genecount_annotator.py b/gffquant/annotation/genecount_annotator.py
index af63b220..827ca452 100644
--- a/gffquant/annotation/genecount_annotator.py
+++ b/gffquant/annotation/genecount_annotator.py
@@ -26,10 +26,10 @@ def annotate_gene_counts(self, refmgr, db: AnnotationDatabaseManager, counter: A
         category_sums = np.zeros((len(categories), 6))
         functional_counts = CountMatrix(6)
 
-        for category_id in categories:
-            features = db.get_features(category_id)
-            for feature_id in sorted(features):
-                _ = functional_counts[(category_id, feature_id)]
+        for category in categories:
+            features = db.get_features(category.id)
+            for feature in sorted(features):
+                _ = functional_counts[(category.id, feature.id)]
 
         for rid in counter:
             counts = counter[rid]
@@ -49,16 +49,14 @@ def annotate_gene_counts(self, refmgr, db: AnnotationDatabaseManager, counter: A
                         feature_id = int(feature_id)
                         functional_counts[(category_id, feature_id)] += counts[:4]
         
-        for i, category_id in enumerate(categories):
+        for i, category in enumerate(categories):
             u_sf, c_sf = (
                 CountMatrix.calculate_scaling_factor(*category_sums[i][0:2]),
                 CountMatrix.calculate_scaling_factor(*category_sums[i][2:4]),
             )
 
-            category_id = int(category_id)
-
             rows = tuple(
-                key[0] == category_id
+                key[0] == category.id
                 for key, _ in functional_counts
             )
 

From 7723314943a52c0759585c6e9c4f075da3fdc48f Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Wed, 1 Jan 2025 02:58:55 +0100
Subject: [PATCH 088/128] reactivated feature output

---
 gffquant/annotation/genecount_annotator.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/gffquant/annotation/genecount_annotator.py b/gffquant/annotation/genecount_annotator.py
index 827ca452..69f15481 100644
--- a/gffquant/annotation/genecount_annotator.py
+++ b/gffquant/annotation/genecount_annotator.py
@@ -27,8 +27,8 @@ def annotate_gene_counts(self, refmgr, db: AnnotationDatabaseManager, counter: A
         functional_counts = CountMatrix(6)
 
         for category in categories:
-            features = db.get_features(category.id)
-            for feature in sorted(features):
+            features = ((feature.id, feature) for feature in db.get_features(category.id))
+            for _, feature in sorted(features, key=lambda x:x[0]):
                 _ = functional_counts[(category.id, feature.id)]
 
         for rid in counter:

From 27e9e386c19e4f26e83590c9024ae0aae4fc45b0 Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Wed, 1 Jan 2025 11:15:53 +0100
Subject: [PATCH 089/128] reactivated feature output

---
 gffquant/annotation/genecount_annotator.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gffquant/annotation/genecount_annotator.py b/gffquant/annotation/genecount_annotator.py
index 69f15481..5b47483b 100644
--- a/gffquant/annotation/genecount_annotator.py
+++ b/gffquant/annotation/genecount_annotator.py
@@ -31,7 +31,7 @@ def annotate_gene_counts(self, refmgr, db: AnnotationDatabaseManager, counter: A
             for _, feature in sorted(features, key=lambda x:x[0]):
                 _ = functional_counts[(category.id, feature.id)]
 
-        for rid in counter:
+        for rid, counts in counter:
             counts = counter[rid]
             if gene_group_db:
                 ggroup_id = rid

From 2f938f627943496240e75775438a26c5f693ff46 Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Wed, 1 Jan 2025 11:30:29 +0100
Subject: [PATCH 090/128] reactivated feature output

---
 gffquant/annotation/genecount_annotator.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gffquant/annotation/genecount_annotator.py b/gffquant/annotation/genecount_annotator.py
index 5b47483b..64deadaf 100644
--- a/gffquant/annotation/genecount_annotator.py
+++ b/gffquant/annotation/genecount_annotator.py
@@ -47,7 +47,7 @@ def annotate_gene_counts(self, refmgr, db: AnnotationDatabaseManager, counter: A
                     category_sums[category_id] += counts
                     for feature_id in features:
                         feature_id = int(feature_id)
-                        functional_counts[(category_id, feature_id)] += counts[:4]
+                        functional_counts[(category_id, feature_id)] += counts
         
         for i, category in enumerate(categories):
             u_sf, c_sf = (

From 03a4a9621d7122a6c5250371756aee922fb782f1 Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Wed, 1 Jan 2025 11:53:36 +0100
Subject: [PATCH 091/128] reactivated feature output

---
 gffquant/annotation/genecount_annotator.py | 2 ++
 gffquant/counters/count_matrix.py          | 3 +++
 2 files changed, 5 insertions(+)

diff --git a/gffquant/annotation/genecount_annotator.py b/gffquant/annotation/genecount_annotator.py
index 64deadaf..ec4abb74 100644
--- a/gffquant/annotation/genecount_annotator.py
+++ b/gffquant/annotation/genecount_annotator.py
@@ -48,6 +48,8 @@ def annotate_gene_counts(self, refmgr, db: AnnotationDatabaseManager, counter: A
                     for feature_id in features:
                         feature_id = int(feature_id)
                         functional_counts[(category_id, feature_id)] += counts
+
+        functional_counts.drop_unindexed()
         
         for i, category in enumerate(categories):
             u_sf, c_sf = (
diff --git a/gffquant/counters/count_matrix.py b/gffquant/counters/count_matrix.py
index 347af127..89775b3e 100644
--- a/gffquant/counters/count_matrix.py
+++ b/gffquant/counters/count_matrix.py
@@ -54,6 +54,9 @@ def scale_column(self, col_index, factor, rows=None):
         else:
             self.counts[rows, col_index + 1] = self.counts[rows, col_index] * factor
 
+    def drop_unindexed(self):
+        self.counts = self.counts[0:len(self.index), :]
+
     def generate_gene_counts(self, lengths):
         logger.info("LENGTHS ARRAY = %s", lengths.shape)
         logger.info("INDEX SIZE = %s", len(self.index))

From 74c2992cbc01204ff3683f31764816521290e946 Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Wed, 1 Jan 2025 12:11:06 +0100
Subject: [PATCH 092/128] reactivated feature output

---
 gffquant/profilers/feature_quantifier.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/gffquant/profilers/feature_quantifier.py b/gffquant/profilers/feature_quantifier.py
index e0745ea7..08981551 100644
--- a/gffquant/profilers/feature_quantifier.py
+++ b/gffquant/profilers/feature_quantifier.py
@@ -159,6 +159,9 @@ def process_counters(
             gene_group_db=gene_group_db,
         )
 
+        logger.info("FC-index: %s", str(list(functional_counts.index.keys())[:10]))
+        logger.info("FC-counts: %s", str(functional_counts.counts[0:10,:]))
+
         categories = self.adm.get_categories()
         for category, category_sum in zip(categories, category_sums):
             feature_names = {

From 26a94d804e01d67a6733108cb19200ffd5d2b8a1 Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Wed, 1 Jan 2025 12:30:26 +0100
Subject: [PATCH 093/128] reactivated feature output

---
 gffquant/annotation/count_writer.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/gffquant/annotation/count_writer.py b/gffquant/annotation/count_writer.py
index b65afa45..23439f59 100644
--- a/gffquant/annotation/count_writer.py
+++ b/gffquant/annotation/count_writer.py
@@ -148,6 +148,11 @@ def write_category2(
                 if cat_counts is not None:
                     CountWriter.write_row("category", category_sum, stream=feat_out)
 
+            for item in counts:
+                logger.info("ITEM: %s", str(item))
+
+
+
             for (cid, fid), fcounts in counts:
                 if (report_unseen or fcounts.sum()) and cid == category_id:
                     CountWriter.write_row(feature_names[fid], fcounts, stream=feat_out,)

From fb73a0a82756c79bae7e376374fcf9ca48889f59 Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Wed, 1 Jan 2025 12:58:52 +0100
Subject: [PATCH 094/128] reactivated feature output

---
 gffquant/annotation/count_writer.py | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/gffquant/annotation/count_writer.py b/gffquant/annotation/count_writer.py
index 23439f59..0ffa00e0 100644
--- a/gffquant/annotation/count_writer.py
+++ b/gffquant/annotation/count_writer.py
@@ -149,13 +149,18 @@ def write_category2(
                     CountWriter.write_row("category", category_sum, stream=feat_out)
 
             for item in counts:
-                logger.info("ITEM: %s", str(item))
+                if not isinstance(item[0], tuple):
+                    logger.info("ITEM: %s", str(item))
+                    raise TypeError(f"Weird key: {str(item)}")
+                (cid, fid), fcounts = item
+                if (report_unseen or fcounts.sum()) and cid == category_id:
+                    CountWriter.write_row(feature_names[fid], fcounts, stream=feat_out,)
 
 
 
-            for (cid, fid), fcounts in counts:
-                if (report_unseen or fcounts.sum()) and cid == category_id:
-                    CountWriter.write_row(feature_names[fid], fcounts, stream=feat_out,)
+            # for (cid, fid), fcounts in counts:
+            #     if (report_unseen or fcounts.sum()) and cid == category_id:
+            #         CountWriter.write_row(feature_names[fid], fcounts, stream=feat_out,)
 
 
     def write_category(

From d1a272060b10a8ab499d21bb804102402dfc64cc Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Wed, 1 Jan 2025 14:12:00 +0100
Subject: [PATCH 095/128] reactivated feature output

---
 gffquant/annotation/count_writer.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/gffquant/annotation/count_writer.py b/gffquant/annotation/count_writer.py
index 0ffa00e0..6034bb90 100644
--- a/gffquant/annotation/count_writer.py
+++ b/gffquant/annotation/count_writer.py
@@ -143,7 +143,8 @@ def write_category2(
                 )
 
             if "category" in self.publish_reports:
-                cat_counts = counts[0]
+                # cat_counts = counts[0]
+                cat_counts = category_sum
                 logger.info("CAT %s: %s", category_name, str(cat_counts))
                 if cat_counts is not None:
                     CountWriter.write_row("category", category_sum, stream=feat_out)

From d54e3f096dbcacd60dc401a085e8c0cbc52363dc Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Wed, 1 Jan 2025 14:40:25 +0100
Subject: [PATCH 096/128] reactivated feature output

---
 gffquant/annotation/genecount_annotator.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gffquant/annotation/genecount_annotator.py b/gffquant/annotation/genecount_annotator.py
index ec4abb74..9db9fff6 100644
--- a/gffquant/annotation/genecount_annotator.py
+++ b/gffquant/annotation/genecount_annotator.py
@@ -54,7 +54,7 @@ def annotate_gene_counts(self, refmgr, db: AnnotationDatabaseManager, counter: A
         for i, category in enumerate(categories):
             u_sf, c_sf = (
                 CountMatrix.calculate_scaling_factor(*category_sums[i][0:2]),
-                CountMatrix.calculate_scaling_factor(*category_sums[i][2:4]),
+                CountMatrix.calculate_scaling_factor(*category_sums[i][3:5]),
             )
 
             rows = tuple(

From 44a773d72d0b8b55d206292c7874fe36b137e26a Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Wed, 1 Jan 2025 14:42:37 +0100
Subject: [PATCH 097/128] reactivated feature output

---
 gffquant/annotation/genecount_annotator.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gffquant/annotation/genecount_annotator.py b/gffquant/annotation/genecount_annotator.py
index 9db9fff6..017c5af8 100644
--- a/gffquant/annotation/genecount_annotator.py
+++ b/gffquant/annotation/genecount_annotator.py
@@ -27,7 +27,7 @@ def annotate_gene_counts(self, refmgr, db: AnnotationDatabaseManager, counter: A
         functional_counts = CountMatrix(6)
 
         for category in categories:
-            features = ((feature.id, feature) for feature in db.get_features(category.id))
+            features = ((feature.name, feature) for feature in db.get_features(category.id))
             for _, feature in sorted(features, key=lambda x:x[0]):
                 _ = functional_counts[(category.id, feature.id)]
 

From 6b06a52f3e492dec430f835fc1ab97715f1827d7 Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Wed, 1 Jan 2025 22:02:04 +0100
Subject: [PATCH 098/128] reactivated feature output

---
 gffquant/annotation/genecount_annotator.py | 20 +++++++++++---------
 gffquant/profilers/feature_quantifier.py   |  2 +-
 2 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/gffquant/annotation/genecount_annotator.py b/gffquant/annotation/genecount_annotator.py
index 017c5af8..964f58bb 100644
--- a/gffquant/annotation/genecount_annotator.py
+++ b/gffquant/annotation/genecount_annotator.py
@@ -39,15 +39,17 @@ def annotate_gene_counts(self, refmgr, db: AnnotationDatabaseManager, counter: A
                 ref, _ = refmgr.get(rid[0] if isinstance(rid, tuple) else rid)
                 ggroup_id = ref
 
-            region_annotation = db.query_sequence(ggroup_id)
-            if region_annotation is not None:
-                _, _, region_annotation = region_annotation
-                for category_id, features in region_annotation:
-                    category_id = int(category_id)
-                    category_sums[category_id] += counts
-                    for feature_id in features:
-                        feature_id = int(feature_id)
-                        functional_counts[(category_id, feature_id)] += counts
+            with open("GGROUP_DATA.txt", "wt") as _out:
+                region_annotation = db.query_sequence(ggroup_id)
+                if region_annotation is not None:
+                    _, _, region_annotation = region_annotation
+                    print(ggroup_id, *(f"{category_id}={features}" for category_id, features in region_annotation), sep="\t", file=_out)
+                    for category_id, features in region_annotation:
+                        category_id = int(category_id)
+                        category_sums[category_id] += counts
+                        for feature_id in features:
+                            feature_id = int(feature_id)
+                            functional_counts[(category_id, feature_id)] += counts
 
         functional_counts.drop_unindexed()
         
diff --git a/gffquant/profilers/feature_quantifier.py b/gffquant/profilers/feature_quantifier.py
index 08981551..a6323c2b 100644
--- a/gffquant/profilers/feature_quantifier.py
+++ b/gffquant/profilers/feature_quantifier.py
@@ -168,7 +168,7 @@ def process_counters(
                 feature.id: feature.name
                 for feature in self.adm.get_features(category.id)
             }
-            logger.info("PROCESSING CATEGORY=%s", category)
+            logger.info("PROCESSING CATEGORY=%s", category.name)
             count_writer.write_category2(
                 category.id,
                 category.name,

From a0d1032925c0dd5087445b7904a17a09a0bafc8c Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Wed, 1 Jan 2025 22:24:51 +0100
Subject: [PATCH 099/128] reactivated feature output

---
 gffquant/annotation/genecount_annotator.py | 21 +++++++++++----------
 gffquant/counters/count_matrix.py          |  2 +-
 2 files changed, 12 insertions(+), 11 deletions(-)

diff --git a/gffquant/annotation/genecount_annotator.py b/gffquant/annotation/genecount_annotator.py
index 964f58bb..2dd44278 100644
--- a/gffquant/annotation/genecount_annotator.py
+++ b/gffquant/annotation/genecount_annotator.py
@@ -31,19 +31,20 @@ def annotate_gene_counts(self, refmgr, db: AnnotationDatabaseManager, counter: A
             for _, feature in sorted(features, key=lambda x:x[0]):
                 _ = functional_counts[(category.id, feature.id)]
 
-        for rid, counts in counter:
-            counts = counter[rid]
-            if gene_group_db:
-                ggroup_id = rid
-            else:
-                ref, _ = refmgr.get(rid[0] if isinstance(rid, tuple) else rid)
-                ggroup_id = ref
-
-            with open("GGROUP_DATA.txt", "wt") as _out:
+        with open("GGROUP_DATA.txt", "wt") as _out:
+
+            for rid, counts in counter:
+                counts = counter[rid]
+                if gene_group_db:
+                    ggroup_id = rid
+                else:
+                    ref, _ = refmgr.get(rid[0] if isinstance(rid, tuple) else rid)
+                    ggroup_id = ref
+
                 region_annotation = db.query_sequence(ggroup_id)
                 if region_annotation is not None:
                     _, _, region_annotation = region_annotation
-                    print(ggroup_id, *(f"{category_id}={features}" for category_id, features in region_annotation), sep="\t", file=_out)
+                    print(ggroup_id, *(f"{category_id}={','.join(features)}" for category_id, features in region_annotation), sep="\t", file=_out)
                     for category_id, features in region_annotation:
                         category_id = int(category_id)
                         category_sums[category_id] += counts
diff --git a/gffquant/counters/count_matrix.py b/gffquant/counters/count_matrix.py
index 89775b3e..9e56e71d 100644
--- a/gffquant/counters/count_matrix.py
+++ b/gffquant/counters/count_matrix.py
@@ -25,7 +25,7 @@ def _resize(self):
         if len(self.index) == nrows:
             self.counts = np.pad(
                 self.counts,
-                ((0, nrows * 2), (0, 0),),
+                ((0, nrows + 1000), (0, 0),),
             )
         return len(self.index)
     

From 8a56e33c553f4cfba5d96d741d2f7b921e1cd527 Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Wed, 1 Jan 2025 23:15:34 +0100
Subject: [PATCH 100/128] pleasing linters, cleanup

---
 gffquant/annotation/count_writer.py        |  97 ++------------
 gffquant/annotation/genecount_annotator.py | 119 ++++--------------
 gffquant/counters/alignment_counter.py     | 139 +--------------------
 gffquant/counters/count_matrix.py          |  22 ++--
 gffquant/counters/region_counter.py        |   2 +-
 gffquant/profilers/feature_quantifier.py   |  22 +---
 6 files changed, 53 insertions(+), 348 deletions(-)

diff --git a/gffquant/annotation/count_writer.py b/gffquant/annotation/count_writer.py
index 6034bb90..fe47a4c7 100644
--- a/gffquant/annotation/count_writer.py
+++ b/gffquant/annotation/count_writer.py
@@ -110,8 +110,7 @@ def compile_block(raw, lnorm, scaling_factors):
     def write_row(header, data, stream=sys.stdout):
         print(header, *(f"{c:.5f}" for c in data), flush=True, sep="\t", file=stream)
 
-
-    def write_category2(
+    def write_category(
         self,
         category_id,
         category_name,
@@ -149,90 +148,24 @@ def write_category2(
                 if cat_counts is not None:
                     CountWriter.write_row("category", category_sum, stream=feat_out)
 
-            for item in counts:
-                if not isinstance(item[0], tuple):
-                    logger.info("ITEM: %s", str(item))
-                    raise TypeError(f"Weird key: {str(item)}")
-                (cid, fid), fcounts = item
-                if (report_unseen or fcounts.sum()) and cid == category_id:
-                    CountWriter.write_row(feature_names[fid], fcounts, stream=feat_out,)
-
-
-
-            # for (cid, fid), fcounts in counts:
+            # for item in counts:
+            #     if not isinstance(item[0], tuple):
+            #         logger.info("ITEM: %s", str(item))
+            #         raise TypeError(f"Weird key: {str(item)}")
+            #     (cid, fid), fcounts = item
             #     if (report_unseen or fcounts.sum()) and cid == category_id:
             #         CountWriter.write_row(feature_names[fid], fcounts, stream=feat_out,)
 
-
-    def write_category(
-        self,
-        category,
-        counts,
-        index,
-        names,
-        unique_sf,
-        ambig_sf,
-        unannotated_reads=None,
-        report_unseen=True,
-    ):
-        # category, c_counts, c_index, c_names, u_sf, a_sf
-        if "scaled" in self.publish_reports:
-            logger.info(
-                "SCALING FACTORS %s %s %s",
-                category, unique_sf, ambig_sf,
-            )
-        with gzip.open(f"{self.out_prefix}.{category}.txt.gz", "wt") as feat_out:
-            header = self.get_header()
-            print("feature", *header, sep="\t", file=feat_out)
-
-            if unannotated_reads is not None:
-                print("unannotated", unannotated_reads, sep="\t", file=feat_out)
-
-            if "total_readcount" in self.publish_reports:
-                CountWriter.write_row(
-                    "total_reads",
-                    np.zeros(len(header)) + self.total_readcount,
-                    stream=feat_out,
-                )
-
-            if "filtered_readcount" in self.publish_reports:
-                CountWriter.write_row(
-                    "filtered_reads",
-                    np.zeros(len(header)) + self.filtered_readcount,
-                    stream=feat_out,
-                )
-
-            if "category" in self.publish_reports:
-                cat_counts = counts[0]
-                logger.info("CAT %s: %s", category, str(cat_counts))
-                if cat_counts is not None:
-                    # cat_row = self.compile_output_row(
-                    #     cat_counts,
-                    #     scaling_factor=unique_sf,
-                    #     ambig_scaling_factor=ambig_sf,
-                    # )
-                    CountWriter.write_row("category", counts[0], stream=feat_out)
-
-            for fid, i in index.items():
-                f_counts = counts[i]
-                if report_unseen or f_counts.sum():
-                    # out_row = self.compile_output_row(
-                    #     f_counts,
-                    #     scaling_factor=unique_sf,
-                    #     ambig_scaling_factor=ambig_sf,
-                    # )
-                    CountWriter.write_row(names[fid], counts[i], stream=feat_out)
+            for (cid, fid), fcounts in counts:
+                if (report_unseen or fcounts.sum()) and cid == category_id:
+                    CountWriter.write_row(feature_names[fid], fcounts, stream=feat_out,)
 
     def write_gene_counts(
         self,
         gene_counts: AlignmentCounter,
         refmgr,
-        # uniq_scaling_factor,
-        # ambig_scaling_factor,
-        gene_group_db=False
+        gene_group_db=False,
     ):
-        # if "scaled" in self.publish_reports:
-        #     logger.info("SCALING_FACTORS %s %s", uniq_scaling_factor, ambig_scaling_factor)
         with gzip.open(f"{self.out_prefix}.gene_counts.txt.gz", "wt") as gene_out:
             print("gene", *self.get_header(), sep="\t", file=gene_out, flush=True)
 
@@ -241,11 +174,10 @@ def write_gene_counts(
                     refmgr.get(rid[0] if isinstance(rid, tuple) else rid)[0],
                     rid,
                 )
-                for rid, _ in gene_counts  #.get_all_regions()
+                for rid, _ in gene_counts
             )
 
             for ref, rid in sorted(ref_stream):
-                # counts = gene_counts.get_counts(rid)
                 counts = gene_counts[rid]
                 if gene_group_db:
                     ref_tokens = ref.split(".")
@@ -253,11 +185,4 @@ def write_gene_counts(
                 else:
                     gene_id = ref
 
-                # out_row = self.compile_output_row(
-                #     counts,
-                #     scaling_factor=uniq_scaling_factor,
-                #     ambig_scaling_factor=ambig_scaling_factor,
-                # )
-
-                # CountWriter.write_row(gene_id, out_row, stream=gene_out,)
                 CountWriter.write_row(gene_id, counts, stream=gene_out,)
diff --git a/gffquant/annotation/genecount_annotator.py b/gffquant/annotation/genecount_annotator.py
index 2dd44278..3fbd8a8d 100644
--- a/gffquant/annotation/genecount_annotator.py
+++ b/gffquant/annotation/genecount_annotator.py
@@ -21,39 +21,42 @@ def __init__(self, strand_specific, report_scaling_factors=True):
         """ __init__() """
         CountAnnotator.__init__(self, strand_specific, report_scaling_factors=report_scaling_factors)
 
-    def annotate_gene_counts(self, refmgr, db: AnnotationDatabaseManager, counter: AlignmentCounter, gene_group_db=False):
+    def annotate_gene_counts(
+        self,
+        refmgr,
+        db: AnnotationDatabaseManager,
+        counter: AlignmentCounter,
+        gene_group_db=False
+    ):
         categories = list(db.get_categories())
         category_sums = np.zeros((len(categories), 6))
         functional_counts = CountMatrix(6)
 
         for category in categories:
             features = ((feature.name, feature) for feature in db.get_features(category.id))
-            for _, feature in sorted(features, key=lambda x:x[0]):
+            for _, feature in sorted(features, key=lambda x: x[0]):
                 _ = functional_counts[(category.id, feature.id)]
 
-        with open("GGROUP_DATA.txt", "wt") as _out:
-
-            for rid, counts in counter:
-                counts = counter[rid]
-                if gene_group_db:
-                    ggroup_id = rid
-                else:
-                    ref, _ = refmgr.get(rid[0] if isinstance(rid, tuple) else rid)
-                    ggroup_id = ref
-
-                region_annotation = db.query_sequence(ggroup_id)
-                if region_annotation is not None:
-                    _, _, region_annotation = region_annotation
-                    print(ggroup_id, *(f"{category_id}={','.join(features)}" for category_id, features in region_annotation), sep="\t", file=_out)
-                    for category_id, features in region_annotation:
-                        category_id = int(category_id)
-                        category_sums[category_id] += counts
-                        for feature_id in features:
-                            feature_id = int(feature_id)
-                            functional_counts[(category_id, feature_id)] += counts
+        for rid, counts in counter:
+            counts = counter[rid]
+            if gene_group_db:
+                ggroup_id = rid
+            else:
+                ref, _ = refmgr.get(rid[0] if isinstance(rid, tuple) else rid)
+                ggroup_id = ref
+
+            region_annotation = db.query_sequence(ggroup_id)
+            if region_annotation is not None:
+                _, _, region_annotation = region_annotation
+                for category_id, features in region_annotation:
+                    category_id = int(category_id)
+                    category_sums[category_id] += counts
+                    for feature_id in features:
+                        feature_id = int(feature_id)
+                        functional_counts[(category_id, feature_id)] += counts
 
         functional_counts.drop_unindexed()
-        
+
         for i, category in enumerate(categories):
             u_sf, c_sf = (
                 CountMatrix.calculate_scaling_factor(*category_sums[i][0:2]),
@@ -72,73 +75,3 @@ def annotate_gene_counts(self, refmgr, db: AnnotationDatabaseManager, counter: A
             category_sums[i, 5] = category_sums[i, 4] * c_sf
 
         return functional_counts, category_sums
-            
-
-
-                    
-
-
-
-    def annotate(self, refmgr, db: AnnotationDatabaseManager, counter: AlignmentCounter, gene_group_db=False):
-        """ Annotate a set of gene counts with functional annotations. """
-
-        for category in db.get_categories():
-            features = tuple(db.get_features(category.id))
-            category_counts = np.zeros(
-                (len(features) + 1, 6,),
-                dtype='float64',
-            )
-            category_index = {
-                feature.id: i
-                for i, feature in enumerate(features, start=1)
-            }
-            category_names = {
-                feature.id: feature.name
-                for feature in features
-            }
-            for rid in counter:
-                counts = counter[rid]
-                if gene_group_db:
-                    ggroup_id = rid
-                else:
-                    ref, _ = refmgr.get(rid[0] if isinstance(rid, tuple) else rid)
-                    ggroup_id = ref
-
-                region_annotation = db.query_sequence(ggroup_id)
-                if region_annotation is not None:
-                    _, _, region_annotation = region_annotation
-                    category_features = dict(region_annotation).get(str(category.id))
-                    if category_features is not None:
-                        category_counts[0] += counts  # category row
-                        for cf in category_features:
-                            category_counts[category_index.get(int(cf))] += counts
-
-            count_sums = category_counts[0]
-
-            # should scaled counts use a factor derived from all counts
-            # or should multi-feature counts only contribute once?
-            # pre 2.19 category count scaling was based on total counts
-            uniq_scaling_factor, combined_scaling_factor = (
-                AlignmentCounter.calculate_scaling_factor(*count_sums[0:2]),
-                AlignmentCounter.calculate_scaling_factor(*count_sums[3:5]),
-            )
-
-            # apply scaling factors
-            category_counts[:, 2] = category_counts[:, 1] * uniq_scaling_factor
-            category_counts[:, 5] = category_counts[:, 4] * combined_scaling_factor
-
-            logger.info(
-                "GCA:: %s CATEGORY COUNTS: uraw=%s unorm=%s araw=%s anorm=%s => SF: %s %s",
-                category.name,
-                count_sums[0], count_sums[1], count_sums[3], count_sums[4],
-                uniq_scaling_factor, combined_scaling_factor,
-            )
-
-            yield (
-                category.name,
-                category_counts,
-                category_index,
-                category_names,
-                uniq_scaling_factor,
-                combined_scaling_factor,
-            )
diff --git a/gffquant/counters/alignment_counter.py b/gffquant/counters/alignment_counter.py
index 9ca23e83..b467eb0e 100644
--- a/gffquant/counters/alignment_counter.py
+++ b/gffquant/counters/alignment_counter.py
@@ -5,8 +5,6 @@
 import gzip
 import logging
 
-from collections import Counter
-
 import numpy as np
 
 from .count_matrix import CountMatrix
@@ -31,9 +29,10 @@ def normalise_counts(counts, feature_len, scaling_factor):
         scaled = normalised * scaling_factor
         return counts, normalised, scaled
 
-    def get_increment(self, n_aln, increment):
+    @staticmethod
+    def get_increment(n_aln, increment, distribution_mode):
         # 1overN = lavern. Maya <3
-        return (increment / n_aln) if self.distribution_mode == DistributionMode.ONE_OVER_N else increment
+        return (increment, (increment / n_aln))[distribution_mode == DistributionMode.ONE_OVER_N]
 
     def toggle_single_read_handling(self, unmarked_orphans):
         # precalculate count-increment for single-end, paired-end reads
@@ -69,11 +68,6 @@ def __init__(
         self.increments_auto_detect = (1.0, self.paired_end_count / 2.0,)
         self.unannotated_reads = 0
 
-        # self.index = {}
-        # self.counts = np.zeros(
-        #     (AlignmentCounter.INITIAL_SIZE, 2,),
-        #     dtype='float64',
-        # )
         self.counts = CountMatrix(2, nrows=AlignmentCounter.INITIAL_SIZE)
 
     def dump(self, prefix, refmgr):
@@ -86,39 +80,16 @@ def dump(self, prefix, refmgr):
             # ref, reflen = refmgr.get(k[0] if isinstance(k, tuple) else k)
             # print(k, ref, reflen, v, sep="\t", file=_out)
 
-    # def get(self, key, default_val):
-    #     key_index = self.index.get(key)
-    #     if key_index is None:
-    #         return Counter()
-    #     return Counter({key: self.counts[key_index]})
-
-    # def setdefault(self, key, default_val):
-    #     ...
-
     def has_ambig_counts(self):
         # return bool(self.counts[:, 1].sum() != 0)
         return bool(self.counts.colsum(1) != 0)
 
     def __iter__(self):
-        # yield from self.index.keys()
         yield from self.counts
 
     def __getitem__(self, key):
         return self.counts[key]
 
-    # def __getitem__(self, key):
-    #     key_index = self.index.get(key)
-    #     if key_index is None:
-    #         return 0.0
-    #     return self.counts[key_index]
-
-    # def __setitem__(self, key, value):
-    #     key_index = self.index.get(key)
-    #     if key_index is not None:
-    #         self.counts[key_index] = value
-    #     else:
-    #         raise KeyError(f"{key=} not found.")
-
     def update(self, count_stream, ambiguous_counts=False, pair=False, pe_library=None,):
         if pe_library is not None:
             # this is the case when the alignment has a read group tag
@@ -138,34 +109,7 @@ def update(self, count_stream, ambiguous_counts=False, pair=False, pe_library=No
         return contributed_counts
 
     def get_unannotated_reads(self):
-        # return self.unannotated_reads
         return self.counts["c591b65a0f4cd46d5125745a40c8c056"][0]
-        # no_annotation = self.index.get("c591b65a0f4cd46d5125745a40c8c056")
-        # if no_annotation is not None:
-        #     return self.counts[no_annotation][0]
-        # return 0.0
-
-    # def get_counts(self, seqid, strand_specific=False):
-    #     if strand_specific:
-    #         raise NotImplementedError()
-    #         # uniq_counts, ambig_counts = [0.0, 0.0], [0.0, 0.0]
-    #         # uniq_counts[seqid[1]] = uniq_counter[seqid]
-    #         # ambig_counts[seqid[1]] = ambig_counter[seqid]
-
-    #         # rid = seqid[0] if isinstance(seqid, tuple) else seqid
-    #         # uniq_counts = [
-    #         #     uniq_counter[(rid, AlignmentCounter.PLUS_STRAND)],
-    #         #     uniq_counter[(rid, AlignmentCounter.MINUS_STRAND)],
-    #         # ]
-    #         # ambig_counts = [
-    #         #     ambig_counter[(rid, AlignmentCounter.PLUS_STRAND)],
-    #         #     ambig_counter[(rid, AlignmentCounter.MINUS_STRAND)],
-    #         # ]
-    #     counts = self[seqid]
-    #     return np.array((counts[0], counts[2], counts[1], counts[3]))
-
-    # def get_all_regions(self):
-    #     yield from self
 
     def update_counts(self, count_stream, increment=1, ambiguous_counts=False):
         contributed_counts = 0
@@ -173,7 +117,7 @@ def update_counts(self, count_stream, increment=1, ambiguous_counts=False):
             hit = hits[0]
             inc = (
                 (
-                    self.get_increment(aln_count, increment),
+                    AlignmentCounter.get_increment(aln_count, increment, self.distribution_mode),
                     increment,
                 )
             )[aln_count == 1]
@@ -184,17 +128,6 @@ def update_counts(self, count_stream, increment=1, ambiguous_counts=False):
                 )
             )[self.strand_specific]
 
-            # key_index = self.index.get(key)
-            # if key_index is None:
-            #     nrows = self.counts.shape[0]
-            #     if len(self.index) == nrows:
-            #         self.counts = np.pad(
-            #             self.counts,
-            #             ((0, AlignmentCounter.INITIAL_SIZE), (0, 0),),
-            #         )
-            #     # key_index = self.index.setdefault(key, len(self.index))
-            #     key_index = self.index[key] = len(self.index)
-            # self.counts[key_index][int(ambiguous_counts)] += inc
             self.counts[key][int(ambiguous_counts)] += inc
             contributed_counts += inc
 
@@ -217,51 +150,6 @@ def generate_gene_count_matrix(self, refmgr):
 
         return self.counts.sum()
 
-        # logger.info("LENGTHS ARRAY = %s", lengths.shape)
-        # logger.info("INDEX SIZE = %s", len(self.index))
-
-        # # remove the un-indexed rows
-        # self.counts = self.counts[0:len(self.index), :]
-
-        # # calculate combined_raw
-        # self.counts[:, 1:2] += self.counts[:, 0:1]
-
-        # # duplicate the raw counts
-        # self.counts = np.column_stack(
-        #     #(self.counts, self.counts, self.counts,),
-        #     (
-        #         self.counts[:, 0], self.counts[:, 0], self.counts[:, 0],  # 0, 1, 2
-        #         self.counts[:, 1], self.counts[:, 1], self.counts[:, 1],  # 3, 4, 5
-        #     ),
-        #     # axis=1,
-        # )
-
-        # # length-normalise the lnorm columns
-        # # self.counts[:, 2:4] /= lengths[:, None]
-        # self.counts[:, 1::3] /= lengths[:, None]
-
-        # count_sums = self.counts.sum(axis=0)
-
-        # # uniq_scaling_factor = (count_sums[0] / count_sums[2], 1.0)[count_sums[2] == 0]
-        # # ambig_scaling_factor = (count_sums[1] / count_sums[3], 1.0)[count_sums[3] == 0]
-        # uniq_scaling_factor, combined_scaling_factor = (
-        #     AlignmentCounter.calculate_scaling_factor(*count_sums[0:2]),
-        #     AlignmentCounter.calculate_scaling_factor(*count_sums[3:5]),
-        # )
-
-        # logger.info(
-        #     "AC:: TOTAL GENE COUNTS: uraw=%s unorm=%s craw=%s cnorm=%s => SF: %s %s",
-        #     count_sums[0], count_sums[1], count_sums[3], count_sums[4],
-        #     uniq_scaling_factor, combined_scaling_factor,
-        # )
-
-        # # apply scaling factors
-        # self.counts[:, 2] = self.counts[:, 1] * uniq_scaling_factor
-        # self.counts[:, 5] = self.counts[:, 4] * combined_scaling_factor
-
-        # # return count sums and scaling factors
-        # return count_sums, uniq_scaling_factor, combined_scaling_factor
-    
     @staticmethod
     def calculate_scaling_factor(raw, norm):
         if norm == 0.0:
@@ -276,22 +164,3 @@ def group_gene_count_matrix(self, refmgr):
         )
 
         self.counts = self.counts.group_gene_counts(ggroups)
-
-        # ggroup_index = {}
-        # for key, key_index in self.index.items():
-        #     ref = (refmgr.get(key[0] if isinstance(key, tuple) else key))[0]
-        #     ref_tokens = ref.split(".")
-        #     _, ggroup_id = ".".join(ref_tokens[:-1]), ref_tokens[-1]
-        #     g_key_index = ggroup_index.get(ggroup_id)
-        #     gene_counts = self.counts[key_index]
-        #     if g_key_index is None:
-        #         g_key_index = ggroup_index[ggroup_id] = len(ggroup_index)
-        #         self.counts[g_key_index] = gene_counts
-        #     else:
-        #         self.counts[g_key_index] += gene_counts
-
-        # # replace index with grouped index
-        # self.index = ggroup_index
-
-        # # remove the un-indexed (ungrouped) rows
-        # self.counts = self.counts[0:len(self.index), :]
diff --git a/gffquant/counters/count_matrix.py b/gffquant/counters/count_matrix.py
index 9e56e71d..e9c876cf 100644
--- a/gffquant/counters/count_matrix.py
+++ b/gffquant/counters/count_matrix.py
@@ -1,3 +1,5 @@
+""" module docstring """
+
 import logging
 
 import numpy as np
@@ -5,6 +7,7 @@
 
 logger = logging.getLogger(__name__)
 
+
 class CountMatrix:
 
     @staticmethod
@@ -28,16 +31,16 @@ def _resize(self):
                 ((0, nrows + 1000), (0, 0),),
             )
         return len(self.index)
-    
+
     def __getitem__(self, key):
         key_index = self.index.get(key)
         if key_index is None:
             key_index = self.index[key] = self._resize()
         return self.counts[key_index]
-    
+
     def __setitem__(self, key, value):
         key_index = self.index.get(key)
-        if key_index is None:			
+        if key_index is None:
             key_index = self.index[key] = self._resize()
         self.counts[key_index] = value
 
@@ -98,10 +101,10 @@ def generate_gene_counts(self, lengths):
         self.counts = counts
 
         return self
-    
+
     def group_gene_counts(self, ggroups):
         ggroup_index = {}
-        for (key, key_index), ggroup_id in zip(self.index.items(), ggroups):
+        for (key, _), ggroup_id in zip(self.index.items(), ggroups):
             g_key_index = ggroup_index.get(ggroup_id)
             gene_counts = self.counts[self.index[key]]
             if g_key_index is None:
@@ -117,13 +120,6 @@ def group_gene_counts(self, ggroups):
         self.counts = self.counts[0:len(self.index), :]
 
         return self
-    
+
     def colsum(self, col):
         return self.counts[:, col].sum()
-    
-    def get_category(self, category_id):
-        rows = tuple(
-            cid == category_id
-            for (cid, _), _ in self
-        )
-        return self
\ No newline at end of file
diff --git a/gffquant/counters/region_counter.py b/gffquant/counters/region_counter.py
index 41f2b574..ffab7718 100644
--- a/gffquant/counters/region_counter.py
+++ b/gffquant/counters/region_counter.py
@@ -42,7 +42,7 @@ def _update_region(self, region_id, ostart, oend, rev_strand, cstart=None, cend=
     def update_counts(self, count_stream, increment=1):
         contributed_counts = 0
         for hits, aln_count in count_stream:
-            inc = increment if aln_count == 1 else self.get_increment(aln_count, increment)
+            inc = increment if aln_count == 1 else AlignmentCounter.get_increment(aln_count, increment, self.distribution_mode)
             for hit in hits:
                 self._update_region(
                     hit.rid, hit.start, hit.end, hit.rev_strand, increment=inc,
diff --git a/gffquant/profilers/feature_quantifier.py b/gffquant/profilers/feature_quantifier.py
index a6323c2b..3e3f7505 100644
--- a/gffquant/profilers/feature_quantifier.py
+++ b/gffquant/profilers/feature_quantifier.py
@@ -145,7 +145,6 @@ def process_counters(
         count_writer.write_gene_counts(
             self.counter,
             self.reference_manager,
-            # u_sf, a_sf,
             gene_group_db=gene_group_db,
         )
 
@@ -160,7 +159,7 @@ def process_counters(
         )
 
         logger.info("FC-index: %s", str(list(functional_counts.index.keys())[:10]))
-        logger.info("FC-counts: %s", str(functional_counts.counts[0:10,:]))
+        logger.info("FC-counts: %s", str(functional_counts.counts[0:10, :]))
 
         categories = self.adm.get_categories()
         for category, category_sum in zip(categories, category_sums):
@@ -169,7 +168,7 @@ def process_counters(
                 for feature in self.adm.get_features(category.id)
             }
             logger.info("PROCESSING CATEGORY=%s", category.name)
-            count_writer.write_category2(
+            count_writer.write_category(
                 category.id,
                 category.name,
                 category_sum,
@@ -178,23 +177,6 @@ def process_counters(
                 unannotated_reads=(None, unannotated_reads)[report_unannotated],
             )
 
-        # for category, c_counts, c_index, c_names, u_sf, a_sf in count_annotator.annotate(
-        #     self.reference_manager,
-        #     self.adm,
-        #     self.counter,
-        #     gene_group_db=gene_group_db,
-        # ):
-        #     logger.info("PROCESSING CATEGORY=%s", category)
-        #     count_writer.write_category(
-        #         category,
-        #         c_counts,
-        #         c_index,
-        #         c_names,
-        #         u_sf,
-        #         a_sf,
-        #         unannotated_reads=(None, unannotated_reads)[report_unannotated],
-        #     )
-
         self.adm.clear_caches()
 
     def register_reference(self, rid, aln_reader):

From 98c6cf4e02c0d77a92d34baea99f06cf29cbfacf Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Thu, 2 Jan 2025 02:07:53 +0100
Subject: [PATCH 101/128] trying to reduce memory footprint

---
 gffquant/annotation/genecount_annotator.py |  8 +++---
 gffquant/counters/count_matrix.py          | 33 ++++++++++++++++++----
 gffquant/profilers/feature_quantifier.py   | 27 ++++++++++++++++--
 3 files changed, 56 insertions(+), 12 deletions(-)

diff --git a/gffquant/annotation/genecount_annotator.py b/gffquant/annotation/genecount_annotator.py
index 3fbd8a8d..4ae066c4 100644
--- a/gffquant/annotation/genecount_annotator.py
+++ b/gffquant/annotation/genecount_annotator.py
@@ -32,10 +32,10 @@ def annotate_gene_counts(
         category_sums = np.zeros((len(categories), 6))
         functional_counts = CountMatrix(6)
 
-        for category in categories:
-            features = ((feature.name, feature) for feature in db.get_features(category.id))
-            for _, feature in sorted(features, key=lambda x: x[0]):
-                _ = functional_counts[(category.id, feature.id)]
+        # for category in categories:
+        #     features = ((feature.name, feature) for feature in db.get_features(category.id))
+        #     for _, feature in sorted(features, key=lambda x: x[0]):
+        #         _ = functional_counts[(category.id, feature.id)]
 
         for rid, counts in counter:
             counts = counter[rid]
diff --git a/gffquant/counters/count_matrix.py b/gffquant/counters/count_matrix.py
index e9c876cf..6fb098e3 100644
--- a/gffquant/counters/count_matrix.py
+++ b/gffquant/counters/count_matrix.py
@@ -10,18 +10,39 @@
 
 class CountMatrix:
 
+    @classmethod
+    def from_count_matrix(cls, cmatrix, rows=None):
+        if rows is None:
+            counts = np.array(cmatrix.counts)
+            index = dict(counts.index.items())
+        else:
+            counts = cmatrix.counts[rows, :]
+            index = {
+                key: value
+                for (key, value), keep in zip(counts.index.items(), rows)
+                if keep
+            }
+        return cls(index=index, counts=counts)        
+
     @staticmethod
     def calculate_scaling_factor(raw, norm):
         if norm == 0.0:
             return 1.0
         return raw / norm
 
-    def __init__(self, ncols, nrows=1000):
-        self.index = {}
-        self.counts = np.zeros(
-            (nrows, ncols,),
-            dtype='float64',
-        )
+    def __init__(self, ncols=2, nrows=1000, index=None, counts=None,):
+        if index is not None and counts is not None:
+            self.index = dict(index.items())
+            self.counts = counts
+        else:
+            self.index = {}
+            self.counts = np.zeros(
+                (nrows, ncols,),
+                dtype='float64',
+            )
+
+    def has_record(self, key):
+        return self.index.get(key) is not None
 
     def _resize(self):
         nrows = self.counts.shape[0]
diff --git a/gffquant/profilers/feature_quantifier.py b/gffquant/profilers/feature_quantifier.py
index 3e3f7505..b8855bb2 100644
--- a/gffquant/profilers/feature_quantifier.py
+++ b/gffquant/profilers/feature_quantifier.py
@@ -15,6 +15,7 @@
 from ..alignment import AlignmentGroup, AlignmentProcessor, ReferenceHit, SamFlags
 from ..annotation import GeneCountAnnotator, RegionCountAnnotator, CountWriter
 from ..counters import AlignmentCounter
+from ..counters.count_matrix import CountMatrix
 from ..db.annotation_db import AnnotationDatabaseManager
 
 from .. import __tool__, DistributionMode, RunMode
@@ -163,16 +164,38 @@ def process_counters(
 
         categories = self.adm.get_categories()
         for category, category_sum in zip(categories, category_sums):
+            features = tuple(self.adm.get_features(category.id))
             feature_names = {
                 feature.id: feature.name
-                for feature in self.adm.get_features(category.id)
+                for feature in features
             }
+            # rows = tuple(
+            #     key[0] == category.id
+            #     for key, _ in functional_counts
+            # )
+
+            # cat_counts = CountMatrix.from_count_matrix(functional_counts, rows=rows)
+            cat_counts = CountMatrix(ncols=6, nrows=len(feature_names))
+            for feature in features:
+                key = (category.id, feature.id)
+                if functional_counts.has_record(key):
+                    cat_counts[key] += functional_counts[key]
+                else:
+                    _ = cat_counts[key]            
+            
+            # for category in categories:
+            # features = ((feature.name, feature) for feature in db.get_features(category.id))
+            # for _, feature in sorted(features, key=lambda x: x[0]):
+            #     _ = functional_counts[(category.id, feature.id)]
+
+
             logger.info("PROCESSING CATEGORY=%s", category.name)
             count_writer.write_category(
                 category.id,
                 category.name,
                 category_sum,
-                functional_counts,
+                # functional_counts,
+                cat_counts,
                 feature_names,
                 unannotated_reads=(None, unannotated_reads)[report_unannotated],
             )

From 7fa8339c4ee511b39e3c21b7e5bfe57659f31d9e Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Thu, 2 Jan 2025 03:01:23 +0100
Subject: [PATCH 102/128] trying to reduce memory footprint

---
 gffquant/annotation/count_writer.py        | 20 ++++++++++++----
 gffquant/annotation/genecount_annotator.py |  2 +-
 gffquant/profilers/feature_quantifier.py   | 27 +++++++++++-----------
 3 files changed, 31 insertions(+), 18 deletions(-)

diff --git a/gffquant/annotation/count_writer.py b/gffquant/annotation/count_writer.py
index fe47a4c7..e1980ad1 100644
--- a/gffquant/annotation/count_writer.py
+++ b/gffquant/annotation/count_writer.py
@@ -116,7 +116,8 @@ def write_category(
         category_name,
         category_sum,
         counts,
-        feature_names,
+        # feature_names,
+        features,
         unannotated_reads=None,
         report_unseen=True,
     ):
@@ -156,9 +157,20 @@ def write_category(
             #     if (report_unseen or fcounts.sum()) and cid == category_id:
             #         CountWriter.write_row(feature_names[fid], fcounts, stream=feat_out,)
 
-            for (cid, fid), fcounts in counts:
-                if (report_unseen or fcounts.sum()) and cid == category_id:
-                    CountWriter.write_row(feature_names[fid], fcounts, stream=feat_out,)
+            empty_row = np.zeros((1, 6), dtype='float64')
+            for feature in features:
+                key = (category_id, feature.id)
+                if counts.has_record(key):
+                    row = counts[key]
+                else:
+                    row = empty_row
+                if (report_unseen or row.sum()):
+                    CountWriter.write_row(feature.name, row, stream=feat_out,)
+                
+
+            # for (cid, fid), fcounts in counts:
+            #     if (report_unseen or fcounts.sum()) and cid == category_id:
+            #         CountWriter.write_row(feature_names[fid], fcounts, stream=feat_out,)
 
     def write_gene_counts(
         self,
diff --git a/gffquant/annotation/genecount_annotator.py b/gffquant/annotation/genecount_annotator.py
index 4ae066c4..c7697536 100644
--- a/gffquant/annotation/genecount_annotator.py
+++ b/gffquant/annotation/genecount_annotator.py
@@ -38,7 +38,7 @@ def annotate_gene_counts(
         #         _ = functional_counts[(category.id, feature.id)]
 
         for rid, counts in counter:
-            counts = counter[rid]
+            # counts = counter[rid]
             if gene_group_db:
                 ggroup_id = rid
             else:
diff --git a/gffquant/profilers/feature_quantifier.py b/gffquant/profilers/feature_quantifier.py
index b8855bb2..7ac6433d 100644
--- a/gffquant/profilers/feature_quantifier.py
+++ b/gffquant/profilers/feature_quantifier.py
@@ -169,19 +169,19 @@ def process_counters(
                 feature.id: feature.name
                 for feature in features
             }
-            # rows = tuple(
-            #     key[0] == category.id
-            #     for key, _ in functional_counts
-            # )
+            rows = tuple(
+                key[0] == category.id
+                for key, _ in functional_counts
+            )
 
-            # cat_counts = CountMatrix.from_count_matrix(functional_counts, rows=rows)
-            cat_counts = CountMatrix(ncols=6, nrows=len(feature_names))
-            for feature in features:
-                key = (category.id, feature.id)
-                if functional_counts.has_record(key):
-                    cat_counts[key] += functional_counts[key]
-                else:
-                    _ = cat_counts[key]            
+            cat_counts = CountMatrix.from_count_matrix(functional_counts, rows=rows)
+            # cat_counts = CountMatrix(ncols=6, nrows=len(feature_names))
+            # for feature in features:
+            #     key = (category.id, feature.id)
+            #     if functional_counts.has_record(key):
+            #         cat_counts[key] += functional_counts[key]
+            #     else:
+            #         _ = cat_counts[key]            
             
             # for category in categories:
             # features = ((feature.name, feature) for feature in db.get_features(category.id))
@@ -196,7 +196,8 @@ def process_counters(
                 category_sum,
                 # functional_counts,
                 cat_counts,
-                feature_names,
+                # feature_names,
+                features,
                 unannotated_reads=(None, unannotated_reads)[report_unannotated],
             )
 

From 3fde96f31e4b5ec38c34d44ee01843691cac9527 Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Thu, 2 Jan 2025 03:21:20 +0100
Subject: [PATCH 103/128] trying to reduce memory footprint

---
 gffquant/counters/count_matrix.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/gffquant/counters/count_matrix.py b/gffquant/counters/count_matrix.py
index 6fb098e3..92f0af74 100644
--- a/gffquant/counters/count_matrix.py
+++ b/gffquant/counters/count_matrix.py
@@ -14,12 +14,12 @@ class CountMatrix:
     def from_count_matrix(cls, cmatrix, rows=None):
         if rows is None:
             counts = np.array(cmatrix.counts)
-            index = dict(counts.index.items())
+            index = dict(cmatrix.index.items())
         else:
             counts = cmatrix.counts[rows, :]
             index = {
                 key: value
-                for (key, value), keep in zip(counts.index.items(), rows)
+                for (key, value), keep in zip(cmatrix.index.items(), rows)
                 if keep
             }
         return cls(index=index, counts=counts)        

From 64845ec0e9755ea8a0a30747aa44371826529897 Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Thu, 2 Jan 2025 03:34:37 +0100
Subject: [PATCH 104/128] trying to reduce memory footprint

---
 gffquant/counters/count_matrix.py | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/gffquant/counters/count_matrix.py b/gffquant/counters/count_matrix.py
index 92f0af74..6b573b8d 100644
--- a/gffquant/counters/count_matrix.py
+++ b/gffquant/counters/count_matrix.py
@@ -17,11 +17,15 @@ def from_count_matrix(cls, cmatrix, rows=None):
             index = dict(cmatrix.index.items())
         else:
             counts = cmatrix.counts[rows, :]
-            index = {
-                key: value
-                for (key, value), keep in zip(cmatrix.index.items(), rows)
-                if keep
-            }
+            index = {}
+            for (key, _), keep in zip(cmatrix.index.items(), rows):
+                if keep:
+                    index[key] = len(index)
+            # index = {
+            #     key: value
+            #     for (key, value), keep in zip(cmatrix.index.items(), rows)
+            #     if keep
+            # }
         return cls(index=index, counts=counts)        
 
     @staticmethod

From 4745be954f3df1e319f910a4688a1c8a0f669495 Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Thu, 2 Jan 2025 03:45:10 +0100
Subject: [PATCH 105/128] trying to reduce memory footprint

---
 gffquant/annotation/count_writer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gffquant/annotation/count_writer.py b/gffquant/annotation/count_writer.py
index e1980ad1..b54e9ab2 100644
--- a/gffquant/annotation/count_writer.py
+++ b/gffquant/annotation/count_writer.py
@@ -157,7 +157,7 @@ def write_category(
             #     if (report_unseen or fcounts.sum()) and cid == category_id:
             #         CountWriter.write_row(feature_names[fid], fcounts, stream=feat_out,)
 
-            empty_row = np.zeros((1, 6), dtype='float64')
+            empty_row = np.zeros(6, dtype='float64')
             for feature in features:
                 key = (category_id, feature.id)
                 if counts.has_record(key):

From fa19a80a19866bae9abd7b997bba01e80cf4133b Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Thu, 2 Jan 2025 10:24:35 +0100
Subject: [PATCH 106/128] trying to reduce memory footprint

---
 gffquant/annotation/count_writer.py | 3 ++-
 gffquant/counters/count_matrix.py   | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/gffquant/annotation/count_writer.py b/gffquant/annotation/count_writer.py
index b54e9ab2..1e222689 100644
--- a/gffquant/annotation/count_writer.py
+++ b/gffquant/annotation/count_writer.py
@@ -9,6 +9,7 @@
 import numpy as np
 
 from ..counters import AlignmentCounter
+from ..counters.count_matrix import CountMatrix
 
 
 logger = logging.getLogger(__name__)
@@ -157,7 +158,7 @@ def write_category(
             #     if (report_unseen or fcounts.sum()) and cid == category_id:
             #         CountWriter.write_row(feature_names[fid], fcounts, stream=feat_out,)
 
-            empty_row = np.zeros(6, dtype='float64')
+            empty_row = np.zeros(6, dtype=CountMatrix.NUMPY_DTYPE)
             for feature in features:
                 key = (category_id, feature.id)
                 if counts.has_record(key):
diff --git a/gffquant/counters/count_matrix.py b/gffquant/counters/count_matrix.py
index 6b573b8d..36651bb6 100644
--- a/gffquant/counters/count_matrix.py
+++ b/gffquant/counters/count_matrix.py
@@ -9,6 +9,7 @@
 
 
 class CountMatrix:
+    NUMPY_DTYPE = 'float32'
 
     @classmethod
     def from_count_matrix(cls, cmatrix, rows=None):
@@ -42,7 +43,7 @@ def __init__(self, ncols=2, nrows=1000, index=None, counts=None,):
             self.index = {}
             self.counts = np.zeros(
                 (nrows, ncols,),
-                dtype='float64',
+                dtype=CountMatrix.NUMPY_DTYPE,
             )
 
     def has_record(self, key):

From 54b88a55292da47e806e996eedf89dfcd339cc09 Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Thu, 2 Jan 2025 11:46:41 +0100
Subject: [PATCH 107/128] trying to reduce memory footprint

---
 gffquant/counters/count_matrix.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gffquant/counters/count_matrix.py b/gffquant/counters/count_matrix.py
index 36651bb6..91ccaf70 100644
--- a/gffquant/counters/count_matrix.py
+++ b/gffquant/counters/count_matrix.py
@@ -9,7 +9,7 @@
 
 
 class CountMatrix:
-    NUMPY_DTYPE = 'float32'
+    NUMPY_DTYPE = 'float16'
 
     @classmethod
     def from_count_matrix(cls, cmatrix, rows=None):

From 3d4a4f5ca00b25dbd7c688f3391bf8ce481ae80a Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Thu, 2 Jan 2025 12:54:35 +0100
Subject: [PATCH 108/128] trying to reduce memory footprint

---
 gffquant/counters/count_matrix.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gffquant/counters/count_matrix.py b/gffquant/counters/count_matrix.py
index 91ccaf70..032774b2 100644
--- a/gffquant/counters/count_matrix.py
+++ b/gffquant/counters/count_matrix.py
@@ -9,7 +9,7 @@
 
 
 class CountMatrix:
-    NUMPY_DTYPE = 'float16'
+    NUMPY_DTYPE = 'float64'  # float16 causes some overflow issue during testing
 
     @classmethod
     def from_count_matrix(cls, cmatrix, rows=None):

From 2ffa0c5dad11392364f171af21618930f632d899 Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Fri, 3 Jan 2025 00:02:25 +0100
Subject: [PATCH 109/128] trying category-wise processing

---
 gffquant/profilers/feature_quantifier.py | 119 ++++++++++++++++-------
 1 file changed, 83 insertions(+), 36 deletions(-)

diff --git a/gffquant/profilers/feature_quantifier.py b/gffquant/profilers/feature_quantifier.py
index 7ac6433d..a57a4c02 100644
--- a/gffquant/profilers/feature_quantifier.py
+++ b/gffquant/profilers/feature_quantifier.py
@@ -11,6 +11,8 @@
 from abc import ABC
 from collections import Counter
 
+import numpy as np
+
 from .panda_coverage_profiler import PandaCoverageProfiler
 from ..alignment import AlignmentGroup, AlignmentProcessor, ReferenceHit, SamFlags
 from ..annotation import GeneCountAnnotator, RegionCountAnnotator, CountWriter
@@ -152,54 +154,99 @@ def process_counters(
         self.counter.group_gene_count_matrix(self.reference_manager)
         unannotated_reads = self.counter.get_unannotated_reads() + self.aln_counter["unannotated_ambig"]
 
-        functional_counts, category_sums = count_annotator.annotate_gene_counts(
-            self.reference_manager,
-            self.adm,
-            self.counter,
-            gene_group_db=gene_group_db,
-        )
-
-        logger.info("FC-index: %s", str(list(functional_counts.index.keys())[:10]))
-        logger.info("FC-counts: %s", str(functional_counts.counts[0:10, :]))
 
         categories = self.adm.get_categories()
-        for category, category_sum in zip(categories, category_sums):
-            features = tuple(self.adm.get_features(category.id))
-            feature_names = {
-                feature.id: feature.name
-                for feature in features
-            }
-            rows = tuple(
-                key[0] == category.id
-                for key, _ in functional_counts
-            )
+        category_sum = np.array(6)
 
-            cat_counts = CountMatrix.from_count_matrix(functional_counts, rows=rows)
-            # cat_counts = CountMatrix(ncols=6, nrows=len(feature_names))
-            # for feature in features:
-            #     key = (category.id, feature.id)
-            #     if functional_counts.has_record(key):
-            #         cat_counts[key] += functional_counts[key]
-            #     else:
-            #         _ = cat_counts[key]            
-            
-            # for category in categories:
-            # features = ((feature.name, feature) for feature in db.get_features(category.id))
-            # for _, feature in sorted(features, key=lambda x: x[0]):
-            #     _ = functional_counts[(category.id, feature.id)]
+        for category in categories:
+            logger.info("PROCESSING CATEGORY=%s", category.name)
+            category_counts = CountMatrix(ncols=6)
+            for rid, counts in self.counter:
+                if gene_group_db:
+                    ggroup_id = rid
+                else:
+                    ref, _ = self.reference_manager.get(rid[0] if isinstance(rid, tuple) else rid)
+                    ggroup_id = ref
+
+                region_annotation = self.adm.query_sequence(ggroup_id)
+                if region_annotation is not None:
+                    _, _, region_annotation = region_annotation
+                    for category_id, features in region_annotation:
+                        if int(category_id) == category.id:
+                            category_sum += counts
+                            for feature_id in features:
+                                category_counts[(category.id, int(feature_id))] += counts
+                            break
+
+            u_sf, c_sf = (
+                CountMatrix.calculate_scaling_factor(*category_sum[0:2]),
+                CountMatrix.calculate_scaling_factor(*category_sum[3:5]),
+            )
 
+            category_counts.scale_column(1, u_sf)
+            category_counts.scale_column(4, c_sf)
 
-            logger.info("PROCESSING CATEGORY=%s", category.name)
+            features = tuple(self.adm.get_features(category.id))
             count_writer.write_category(
                 category.id,
                 category.name,
                 category_sum,
-                # functional_counts,
-                cat_counts,
-                # feature_names,
+                category_counts,
                 features,
                 unannotated_reads=(None, unannotated_reads)[report_unannotated],
             )
+        
+
+
+
+        # functional_counts, category_sums = count_annotator.annotate_gene_counts(
+        #     self.reference_manager,
+        #     self.adm,
+        #     self.counter,
+        #     gene_group_db=gene_group_db,
+        # )
+
+        # logger.info("FC-index: %s", str(list(functional_counts.index.keys())[:10]))
+        # logger.info("FC-counts: %s", str(functional_counts.counts[0:10, :]))
+
+        # categories = self.adm.get_categories()
+        # for category, category_sum in zip(categories, category_sums):
+        #     features = tuple(self.adm.get_features(category.id))
+        #     feature_names = {
+        #         feature.id: feature.name
+        #         for feature in features
+        #     }
+        #     rows = tuple(
+        #         key[0] == category.id
+        #         for key, _ in functional_counts
+        #     )
+
+        #     cat_counts = CountMatrix.from_count_matrix(functional_counts, rows=rows)
+        #     # cat_counts = CountMatrix(ncols=6, nrows=len(feature_names))
+        #     # for feature in features:
+        #     #     key = (category.id, feature.id)
+        #     #     if functional_counts.has_record(key):
+        #     #         cat_counts[key] += functional_counts[key]
+        #     #     else:
+        #     #         _ = cat_counts[key]            
+            
+        #     # for category in categories:
+        #     # features = ((feature.name, feature) for feature in db.get_features(category.id))
+        #     # for _, feature in sorted(features, key=lambda x: x[0]):
+        #     #     _ = functional_counts[(category.id, feature.id)]
+
+
+        #     logger.info("PROCESSING CATEGORY=%s", category.name)
+        #     count_writer.write_category(
+        #         category.id,
+        #         category.name,
+        #         category_sum,
+        #         # functional_counts,
+        #         cat_counts,
+        #         # feature_names,
+        #         features,
+        #         unannotated_reads=(None, unannotated_reads)[report_unannotated],
+        #     )
 
         self.adm.clear_caches()
 

From 688a081deaa4a485ab791d6b21d5268beb3a3d89 Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Fri, 3 Jan 2025 00:12:34 +0100
Subject: [PATCH 110/128] trying category-wise processing

---
 gffquant/profilers/feature_quantifier.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gffquant/profilers/feature_quantifier.py b/gffquant/profilers/feature_quantifier.py
index a57a4c02..04251c2d 100644
--- a/gffquant/profilers/feature_quantifier.py
+++ b/gffquant/profilers/feature_quantifier.py
@@ -156,7 +156,7 @@ def process_counters(
 
 
         categories = self.adm.get_categories()
-        category_sum = np.array(6)
+        category_sum = np.array(6, dtype='float64')
 
         for category in categories:
             logger.info("PROCESSING CATEGORY=%s", category.name)

From 64f814931c2fecce5777530debc7bd84bf8cc3b1 Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Fri, 3 Jan 2025 00:27:37 +0100
Subject: [PATCH 111/128] trying category-wise processing

---
 gffquant/profilers/feature_quantifier.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gffquant/profilers/feature_quantifier.py b/gffquant/profilers/feature_quantifier.py
index 04251c2d..3af4ba2c 100644
--- a/gffquant/profilers/feature_quantifier.py
+++ b/gffquant/profilers/feature_quantifier.py
@@ -156,10 +156,10 @@ def process_counters(
 
 
         categories = self.adm.get_categories()
-        category_sum = np.array(6, dtype='float64')
 
         for category in categories:
             logger.info("PROCESSING CATEGORY=%s", category.name)
+            category_sum = np.zeros(6, dtype='float64')
             category_counts = CountMatrix(ncols=6)
             for rid, counts in self.counter:
                 if gene_group_db:

From f7c16c3d7d5b7f47e4b99a6ff6721e6641396959 Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Fri, 3 Jan 2025 14:03:36 +0100
Subject: [PATCH 112/128] truncated unannotated hash

---
 gffquant/counters/alignment_counter.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/gffquant/counters/alignment_counter.py b/gffquant/counters/alignment_counter.py
index b467eb0e..5884c76a 100644
--- a/gffquant/counters/alignment_counter.py
+++ b/gffquant/counters/alignment_counter.py
@@ -109,7 +109,8 @@ def update(self, count_stream, ambiguous_counts=False, pair=False, pe_library=No
         return contributed_counts
 
     def get_unannotated_reads(self):
-        return self.counts["c591b65a0f4cd46d5125745a40c8c056"][0]
+        # return self.counts["c591b65a0f4cd46d5125745a40c8c056"][0]
+        return self.counts["c591b65a0f4cd"][0] 
 
     def update_counts(self, count_stream, increment=1, ambiguous_counts=False):
         contributed_counts = 0

From 2fe0bdc1a1ba48efe924e525117b17d0dced3b6e Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Tue, 7 Jan 2025 12:10:38 +0100
Subject: [PATCH 113/128] making adjustments for new db format

---
 gffquant/annotation/count_writer.py    | 11 ++++++-----
 gffquant/counters/alignment_counter.py |  5 +++--
 2 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/gffquant/annotation/count_writer.py b/gffquant/annotation/count_writer.py
index 1e222689..155bac39 100644
--- a/gffquant/annotation/count_writer.py
+++ b/gffquant/annotation/count_writer.py
@@ -192,10 +192,11 @@ def write_gene_counts(
 
             for ref, rid in sorted(ref_stream):
                 counts = gene_counts[rid]
-                if gene_group_db:
-                    ref_tokens = ref.split(".")
-                    gene_id, _ = ".".join(ref_tokens[:-1]), ref_tokens[-1]
-                else:
-                    gene_id = ref
+                # if gene_group_db:
+                #     ref_tokens = ref.split(".")
+                #     gene_id, _ = ".".join(ref_tokens[:-1]), ref_tokens[-1]
+                # else:
+                #     gene_id = ref
+                gene_id = ref
 
                 CountWriter.write_row(gene_id, counts, stream=gene_out,)
diff --git a/gffquant/counters/alignment_counter.py b/gffquant/counters/alignment_counter.py
index 5884c76a..3f716955 100644
--- a/gffquant/counters/alignment_counter.py
+++ b/gffquant/counters/alignment_counter.py
@@ -110,7 +110,8 @@ def update(self, count_stream, ambiguous_counts=False, pair=False, pe_library=No
 
     def get_unannotated_reads(self):
         # return self.counts["c591b65a0f4cd46d5125745a40c8c056"][0]
-        return self.counts["c591b65a0f4cd"][0] 
+        # return self.counts["c591b65a0f4cd"][0] 
+        return self.counts["00000000"][0]
 
     def update_counts(self, count_stream, increment=1, ambiguous_counts=False):
         contributed_counts = 0
@@ -160,7 +161,7 @@ def calculate_scaling_factor(raw, norm):
     def group_gene_count_matrix(self, refmgr):
 
         ggroups = (
-            (refmgr.get(key[0] if isinstance(key, tuple) else key))[0].split(".")[-1]
+            (refmgr.get(key[0] if isinstance(key, tuple) else key))[0].split(".")[0]
             for key, _ in self.counts
         )
 

From 9a0f88805380e8123757a18296be2d4bd2ba03a2 Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Tue, 7 Jan 2025 13:18:10 +0100
Subject: [PATCH 114/128] making adjustments for new db format

---
 gffquant/counters/count_matrix.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/gffquant/counters/count_matrix.py b/gffquant/counters/count_matrix.py
index 032774b2..602c352e 100644
--- a/gffquant/counters/count_matrix.py
+++ b/gffquant/counters/count_matrix.py
@@ -130,9 +130,12 @@ def generate_gene_counts(self, lengths):
 
     def group_gene_counts(self, ggroups):
         ggroup_index = {}
-        for (key, _), ggroup_id in zip(self.index.items(), ggroups):
+        # for (key, _), ggroup_id in zip(self.index.items(), ggroups):
+        #     g_key_index = ggroup_index.get(ggroup_id)
+        #     gene_counts = self.counts[self.index[key]]
+        for gene_id, gene_counts in self:
+            ggroup_id = gene_id.split(".")[-1]
             g_key_index = ggroup_index.get(ggroup_id)
-            gene_counts = self.counts[self.index[key]]
             if g_key_index is None:
                 g_key_index = ggroup_index[ggroup_id] = len(ggroup_index)
                 self.counts[g_key_index] = gene_counts

From b51b48ddc0236b745c3b1d2593b7a11d6ea2b0ea Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Tue, 7 Jan 2025 13:27:28 +0100
Subject: [PATCH 115/128] making adjustments for new db format

---
 gffquant/counters/count_matrix.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/gffquant/counters/count_matrix.py b/gffquant/counters/count_matrix.py
index 602c352e..ab1e450f 100644
--- a/gffquant/counters/count_matrix.py
+++ b/gffquant/counters/count_matrix.py
@@ -130,12 +130,12 @@ def generate_gene_counts(self, lengths):
 
     def group_gene_counts(self, ggroups):
         ggroup_index = {}
-        # for (key, _), ggroup_id in zip(self.index.items(), ggroups):
+        # for gene_id, gene_counts in self:
+        #     ggroup_id = gene_id.split(".")[-1]
         #     g_key_index = ggroup_index.get(ggroup_id)
-        #     gene_counts = self.counts[self.index[key]]
-        for gene_id, gene_counts in self:
-            ggroup_id = gene_id.split(".")[-1]
+        for (_, gene_counts), ggroup_id in zip(self, ggroups):
             g_key_index = ggroup_index.get(ggroup_id)
+            # gene_counts = self.counts[self.index[key]]
             if g_key_index is None:
                 g_key_index = ggroup_index[ggroup_id] = len(ggroup_index)
                 self.counts[g_key_index] = gene_counts

From 0e1edae4785ab0f5cc0bf682fcd42f1eac72cadf Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Tue, 7 Jan 2025 13:55:09 +0100
Subject: [PATCH 116/128] making adjustments for new db format

---
 gffquant/counters/count_matrix.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/gffquant/counters/count_matrix.py b/gffquant/counters/count_matrix.py
index ab1e450f..88b64db2 100644
--- a/gffquant/counters/count_matrix.py
+++ b/gffquant/counters/count_matrix.py
@@ -139,8 +139,10 @@ def group_gene_counts(self, ggroups):
             if g_key_index is None:
                 g_key_index = ggroup_index[ggroup_id] = len(ggroup_index)
                 self.counts[g_key_index] = gene_counts
+                logger.info("CM.group_gene_counts: Adding %s to new group %s (%s).", str(gene_counts), ggroup_id, g_key_index)
             else:
                 self.counts[g_key_index] += gene_counts
+                logger.info("CM.group_gene_counts: Adding %s to group %s (%s).", str(gene_counts), ggroup_id, g_key_index)
 
         # replace index with grouped index
         self.index = ggroup_index

From c065163da196659bdcb2201c61f44dd775b79578 Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Tue, 7 Jan 2025 16:00:28 +0100
Subject: [PATCH 117/128] making adjustments for new db format

---
 gffquant/counters/count_matrix.py        | 4 ++--
 gffquant/profilers/feature_quantifier.py | 1 +
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/gffquant/counters/count_matrix.py b/gffquant/counters/count_matrix.py
index 88b64db2..1963c812 100644
--- a/gffquant/counters/count_matrix.py
+++ b/gffquant/counters/count_matrix.py
@@ -139,10 +139,10 @@ def group_gene_counts(self, ggroups):
             if g_key_index is None:
                 g_key_index = ggroup_index[ggroup_id] = len(ggroup_index)
                 self.counts[g_key_index] = gene_counts
-                logger.info("CM.group_gene_counts: Adding %s to new group %s (%s).", str(gene_counts), ggroup_id, g_key_index)
+                # logger.info("CM.group_gene_counts: Adding %s to new group %s (%s).", str(gene_counts), ggroup_id, g_key_index)
             else:
                 self.counts[g_key_index] += gene_counts
-                logger.info("CM.group_gene_counts: Adding %s to group %s (%s).", str(gene_counts), ggroup_id, g_key_index)
+                # logger.info("CM.group_gene_counts: Adding %s to group %s (%s).", str(gene_counts), ggroup_id, g_key_index)
 
         # replace index with grouped index
         self.index = ggroup_index
diff --git a/gffquant/profilers/feature_quantifier.py b/gffquant/profilers/feature_quantifier.py
index 3af4ba2c..c590cc69 100644
--- a/gffquant/profilers/feature_quantifier.py
+++ b/gffquant/profilers/feature_quantifier.py
@@ -164,6 +164,7 @@ def process_counters(
             for rid, counts in self.counter:
                 if gene_group_db:
                     ggroup_id = rid
+                    logger.info("GGROUP %s: %s", ggroup_id, str(counts))
                 else:
                     ref, _ = self.reference_manager.get(rid[0] if isinstance(rid, tuple) else rid)
                     ggroup_id = ref

From 21d42644e736870a8630289afdcfaf31f54b29a3 Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Fri, 10 Jan 2025 10:43:05 +0100
Subject: [PATCH 118/128] added count matrix state dump

---
 gffquant/counters/count_matrix.py        | 6 ++++++
 gffquant/profilers/feature_quantifier.py | 6 ++++++
 2 files changed, 12 insertions(+)

diff --git a/gffquant/counters/count_matrix.py b/gffquant/counters/count_matrix.py
index 1963c812..ad3150f5 100644
--- a/gffquant/counters/count_matrix.py
+++ b/gffquant/counters/count_matrix.py
@@ -127,6 +127,12 @@ def generate_gene_counts(self, lengths):
         self.counts = counts
 
         return self
+    
+    def dump(self, state="genes"):
+        with open(f"CountMatrix.{state}.txt", "wt") as _out:
+            for index, counts in self:
+                print(index, *counts, sep="\t", file=_out)
+
 
     def group_gene_counts(self, ggroups):
         ggroup_index = {}
diff --git a/gffquant/profilers/feature_quantifier.py b/gffquant/profilers/feature_quantifier.py
index c590cc69..ad5dd242 100644
--- a/gffquant/profilers/feature_quantifier.py
+++ b/gffquant/profilers/feature_quantifier.py
@@ -151,9 +151,12 @@ def process_counters(
             gene_group_db=gene_group_db,
         )
 
+        self.counter.counts.dump()
+
         self.counter.group_gene_count_matrix(self.reference_manager)
         unannotated_reads = self.counter.get_unannotated_reads() + self.aln_counter["unannotated_ambig"]
 
+        self.counter.counts.dump(state="ggroup")
 
         categories = self.adm.get_categories()
 
@@ -187,6 +190,9 @@ def process_counters(
             category_counts.scale_column(1, u_sf)
             category_counts.scale_column(4, c_sf)
 
+            category_sum[2] = category_sum[1] / u_sf
+            category_sum[5] = category_sum[4] / c_sf
+
             features = tuple(self.adm.get_features(category.id))
             count_writer.write_category(
                 category.id,

From 929e2116c79c09a2bd5930ecd329ef2a4389d66f Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Fri, 10 Jan 2025 11:03:16 +0100
Subject: [PATCH 119/128] added count matrix state dump

---
 gffquant/counters/count_matrix.py        | 9 ++++++---
 gffquant/profilers/feature_quantifier.py | 8 +++++++-
 2 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/gffquant/counters/count_matrix.py b/gffquant/counters/count_matrix.py
index ad3150f5..0831d111 100644
--- a/gffquant/counters/count_matrix.py
+++ b/gffquant/counters/count_matrix.py
@@ -128,10 +128,13 @@ def generate_gene_counts(self, lengths):
 
         return self
     
-    def dump(self, state="genes"):
+    def dump(self, state="genes", labels=None,):
         with open(f"CountMatrix.{state}.txt", "wt") as _out:
-            for index, counts in self:
-                print(index, *counts, sep="\t", file=_out)
+            if labels is None:
+                for index, counts in self:
+                    print(index, *counts, sep="\t", file=_out)
+                for (index, counts), label in zip(self, labels):
+                    print(label, *counts, sep="\t", file=_out)
 
 
     def group_gene_counts(self, ggroups):
diff --git a/gffquant/profilers/feature_quantifier.py b/gffquant/profilers/feature_quantifier.py
index ad5dd242..7d84a32d 100644
--- a/gffquant/profilers/feature_quantifier.py
+++ b/gffquant/profilers/feature_quantifier.py
@@ -151,7 +151,13 @@ def process_counters(
             gene_group_db=gene_group_db,
         )
 
-        self.counter.counts.dump()
+        ggroups = (
+            (self.reference_manager.get(key[0] if isinstance(key, tuple) else key))[0].split(".")[0]
+            for key, _ in self.counter.counts
+        )
+
+
+        self.counter.counts.dump(labels=ggroups)
 
         self.counter.group_gene_count_matrix(self.reference_manager)
         unannotated_reads = self.counter.get_unannotated_reads() + self.aln_counter["unannotated_ambig"]

From 86f5f78ea440f76e6e2098477fcdb577e45290cf Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Fri, 10 Jan 2025 11:16:33 +0100
Subject: [PATCH 120/128] added count matrix state dump

---
 gffquant/counters/count_matrix.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/gffquant/counters/count_matrix.py b/gffquant/counters/count_matrix.py
index 0831d111..72d38c02 100644
--- a/gffquant/counters/count_matrix.py
+++ b/gffquant/counters/count_matrix.py
@@ -133,6 +133,7 @@ def dump(self, state="genes", labels=None,):
             if labels is None:
                 for index, counts in self:
                     print(index, *counts, sep="\t", file=_out)
+            else:
                 for (index, counts), label in zip(self, labels):
                     print(label, *counts, sep="\t", file=_out)
 

From daac8f2cece125baf9dc5b660e1f58d979b82820 Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Fri, 10 Jan 2025 11:17:05 +0100
Subject: [PATCH 121/128] added count matrix state dump

---
 gffquant/profilers/feature_quantifier.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/gffquant/profilers/feature_quantifier.py b/gffquant/profilers/feature_quantifier.py
index 7d84a32d..40196377 100644
--- a/gffquant/profilers/feature_quantifier.py
+++ b/gffquant/profilers/feature_quantifier.py
@@ -157,12 +157,12 @@ def process_counters(
         )
 
 
-        self.counter.counts.dump(labels=ggroups)
+        self.counter.counts.dump()
 
         self.counter.group_gene_count_matrix(self.reference_manager)
         unannotated_reads = self.counter.get_unannotated_reads() + self.aln_counter["unannotated_ambig"]
 
-        self.counter.counts.dump(state="ggroup")
+        self.counter.counts.dump(state="ggroup", labels=ggroups)
 
         categories = self.adm.get_categories()
 

From 88fa6f51e71b3ad8bb9557f28c93eb8401c4984c Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Fri, 10 Jan 2025 11:38:36 +0100
Subject: [PATCH 122/128] added count matrix state dump

---
 gffquant/profilers/feature_quantifier.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/gffquant/profilers/feature_quantifier.py b/gffquant/profilers/feature_quantifier.py
index 40196377..d5faee4f 100644
--- a/gffquant/profilers/feature_quantifier.py
+++ b/gffquant/profilers/feature_quantifier.py
@@ -151,9 +151,9 @@ def process_counters(
             gene_group_db=gene_group_db,
         )
 
-        ggroups = (
+        ggroups = tuple(
             (self.reference_manager.get(key[0] if isinstance(key, tuple) else key))[0].split(".")[0]
-            for key, _ in self.counter.counts
+            for key, _ in self.counter
         )
 
 

From 12dbfb3b1103dad84553d54900ebc70c16410d22 Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Fri, 10 Jan 2025 11:50:24 +0100
Subject: [PATCH 123/128] added count matrix state dump

---
 gffquant/profilers/feature_quantifier.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/gffquant/profilers/feature_quantifier.py b/gffquant/profilers/feature_quantifier.py
index d5faee4f..1442ae98 100644
--- a/gffquant/profilers/feature_quantifier.py
+++ b/gffquant/profilers/feature_quantifier.py
@@ -157,12 +157,12 @@ def process_counters(
         )
 
 
-        self.counter.counts.dump()
+        self.counter.counts.dump(labels=ggroups)
 
         self.counter.group_gene_count_matrix(self.reference_manager)
         unannotated_reads = self.counter.get_unannotated_reads() + self.aln_counter["unannotated_ambig"]
 
-        self.counter.counts.dump(state="ggroup", labels=ggroups)
+        self.counter.counts.dump(state="ggroup")
 
         categories = self.adm.get_categories()
 

From 166aab47c76e3a7affa119d9779b70dac2fca8ca Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Fri, 10 Jan 2025 12:06:21 +0100
Subject: [PATCH 124/128] added count matrix state dump

---
 gffquant/profilers/feature_quantifier.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gffquant/profilers/feature_quantifier.py b/gffquant/profilers/feature_quantifier.py
index 1442ae98..b30a3b9f 100644
--- a/gffquant/profilers/feature_quantifier.py
+++ b/gffquant/profilers/feature_quantifier.py
@@ -152,7 +152,7 @@ def process_counters(
         )
 
         ggroups = tuple(
-            (self.reference_manager.get(key[0] if isinstance(key, tuple) else key))[0].split(".")[0]
+            (self.reference_manager.get(key[0] if isinstance(key, tuple) else key))[0]  # .split(".")[0]
             for key, _ in self.counter
         )
 

From a822d12dd350028fa80e4175c73bc9d1b8ce7e4b Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Sat, 11 Jan 2025 22:59:01 +0100
Subject: [PATCH 125/128] refactor group_gene_counts to be not in-place

---
 gffquant/counters/count_matrix.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/gffquant/counters/count_matrix.py b/gffquant/counters/count_matrix.py
index 72d38c02..78cdb100 100644
--- a/gffquant/counters/count_matrix.py
+++ b/gffquant/counters/count_matrix.py
@@ -139,6 +139,15 @@ def dump(self, state="genes", labels=None,):
 
 
     def group_gene_counts(self, ggroups):
+
+        ggroup_counts = CountMatrix(ncols=6)
+        for (_, gene_counts), ggroup_id in zip(self, ggroups):
+            ggroup_counts[ggroup_id] +=gene_counts
+        
+        return ggroup_counts
+
+
+
         ggroup_index = {}
         # for gene_id, gene_counts in self:
         #     ggroup_id = gene_id.split(".")[-1]

From e3c86c1d887859937e0f0ea01bec051ce71c3891 Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Sat, 11 Jan 2025 23:54:33 +0100
Subject: [PATCH 126/128] refactor group_gene_counts to be not in-place

---
 gffquant/counters/count_matrix.py        | 2 +-
 gffquant/profilers/feature_quantifier.py | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/gffquant/counters/count_matrix.py b/gffquant/counters/count_matrix.py
index 78cdb100..dda3d8a6 100644
--- a/gffquant/counters/count_matrix.py
+++ b/gffquant/counters/count_matrix.py
@@ -142,7 +142,7 @@ def group_gene_counts(self, ggroups):
 
         ggroup_counts = CountMatrix(ncols=6)
         for (_, gene_counts), ggroup_id in zip(self, ggroups):
-            ggroup_counts[ggroup_id] +=gene_counts
+            ggroup_counts[ggroup_id] += gene_counts
         
         return ggroup_counts
 
diff --git a/gffquant/profilers/feature_quantifier.py b/gffquant/profilers/feature_quantifier.py
index b30a3b9f..7287546d 100644
--- a/gffquant/profilers/feature_quantifier.py
+++ b/gffquant/profilers/feature_quantifier.py
@@ -208,6 +208,7 @@ def process_counters(
                 features,
                 unannotated_reads=(None, unannotated_reads)[report_unannotated],
             )
+            break
         
 
 

From c4a8fad1e5954e924a184320a4965bd29ba97f47 Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Sun, 12 Jan 2025 00:02:38 +0100
Subject: [PATCH 127/128] refactor group_gene_counts to be not in-place

---
 gffquant/profilers/feature_quantifier.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gffquant/profilers/feature_quantifier.py b/gffquant/profilers/feature_quantifier.py
index 7287546d..c27fa19d 100644
--- a/gffquant/profilers/feature_quantifier.py
+++ b/gffquant/profilers/feature_quantifier.py
@@ -208,7 +208,7 @@ def process_counters(
                 features,
                 unannotated_reads=(None, unannotated_reads)[report_unannotated],
             )
-            break
+
         
 
 

From 1f295bc469fea435db26b4c696692843015a4e61 Mon Sep 17 00:00:00 2001
From: Christian Schudoma <christian.schudoma@embl.de>
Date: Sun, 12 Jan 2025 00:07:54 +0100
Subject: [PATCH 128/128] refactor group_gene_counts to be not in-place

---
 gffquant/profilers/feature_quantifier.py | 164 +++++++++++------------
 1 file changed, 82 insertions(+), 82 deletions(-)

diff --git a/gffquant/profilers/feature_quantifier.py b/gffquant/profilers/feature_quantifier.py
index c27fa19d..53a5c2e6 100644
--- a/gffquant/profilers/feature_quantifier.py
+++ b/gffquant/profilers/feature_quantifier.py
@@ -164,103 +164,103 @@ def process_counters(
 
         self.counter.counts.dump(state="ggroup")
 
-        categories = self.adm.get_categories()
+        # categories = self.adm.get_categories()
 
-        for category in categories:
-            logger.info("PROCESSING CATEGORY=%s", category.name)
-            category_sum = np.zeros(6, dtype='float64')
-            category_counts = CountMatrix(ncols=6)
-            for rid, counts in self.counter:
-                if gene_group_db:
-                    ggroup_id = rid
-                    logger.info("GGROUP %s: %s", ggroup_id, str(counts))
-                else:
-                    ref, _ = self.reference_manager.get(rid[0] if isinstance(rid, tuple) else rid)
-                    ggroup_id = ref
-
-                region_annotation = self.adm.query_sequence(ggroup_id)
-                if region_annotation is not None:
-                    _, _, region_annotation = region_annotation
-                    for category_id, features in region_annotation:
-                        if int(category_id) == category.id:
-                            category_sum += counts
-                            for feature_id in features:
-                                category_counts[(category.id, int(feature_id))] += counts
-                            break
-
-            u_sf, c_sf = (
-                CountMatrix.calculate_scaling_factor(*category_sum[0:2]),
-                CountMatrix.calculate_scaling_factor(*category_sum[3:5]),
-            )
+        # for category in categories:
+        #     logger.info("PROCESSING CATEGORY=%s", category.name)
+        #     category_sum = np.zeros(6, dtype='float64')
+        #     category_counts = CountMatrix(ncols=6)
+        #     for rid, counts in self.counter:
+        #         if gene_group_db:
+        #             ggroup_id = rid
+        #             logger.info("GGROUP %s: %s", ggroup_id, str(counts))
+        #         else:
+        #             ref, _ = self.reference_manager.get(rid[0] if isinstance(rid, tuple) else rid)
+        #             ggroup_id = ref
+
+        #         region_annotation = self.adm.query_sequence(ggroup_id)
+        #         if region_annotation is not None:
+        #             _, _, region_annotation = region_annotation
+        #             for category_id, features in region_annotation:
+        #                 if int(category_id) == category.id:
+        #                     category_sum += counts
+        #                     for feature_id in features:
+        #                         category_counts[(category.id, int(feature_id))] += counts
+        #                     break
+
+        #     u_sf, c_sf = (
+        #         CountMatrix.calculate_scaling_factor(*category_sum[0:2]),
+        #         CountMatrix.calculate_scaling_factor(*category_sum[3:5]),
+        #     )
 
-            category_counts.scale_column(1, u_sf)
-            category_counts.scale_column(4, c_sf)
+        #     category_counts.scale_column(1, u_sf)
+        #     category_counts.scale_column(4, c_sf)
 
-            category_sum[2] = category_sum[1] / u_sf
-            category_sum[5] = category_sum[4] / c_sf
+        #     category_sum[2] = category_sum[1] / u_sf
+        #     category_sum[5] = category_sum[4] / c_sf
 
-            features = tuple(self.adm.get_features(category.id))
-            count_writer.write_category(
-                category.id,
-                category.name,
-                category_sum,
-                category_counts,
-                features,
-                unannotated_reads=(None, unannotated_reads)[report_unannotated],
-            )
+        #     features = tuple(self.adm.get_features(category.id))
+        #     count_writer.write_category(
+        #         category.id,
+        #         category.name,
+        #         category_sum,
+        #         category_counts,
+        #         features,
+        #         unannotated_reads=(None, unannotated_reads)[report_unannotated],
+        #     )
 
         
 
 
 
-        # functional_counts, category_sums = count_annotator.annotate_gene_counts(
-        #     self.reference_manager,
-        #     self.adm,
-        #     self.counter,
-        #     gene_group_db=gene_group_db,
-        # )
+        functional_counts, category_sums = count_annotator.annotate_gene_counts(
+            self.reference_manager,
+            self.adm,
+            self.counter,
+            gene_group_db=gene_group_db,
+        )
 
-        # logger.info("FC-index: %s", str(list(functional_counts.index.keys())[:10]))
-        # logger.info("FC-counts: %s", str(functional_counts.counts[0:10, :]))
+        logger.info("FC-index: %s", str(list(functional_counts.index.keys())[:10]))
+        logger.info("FC-counts: %s", str(functional_counts.counts[0:10, :]))
 
-        # categories = self.adm.get_categories()
-        # for category, category_sum in zip(categories, category_sums):
-        #     features = tuple(self.adm.get_features(category.id))
-        #     feature_names = {
-        #         feature.id: feature.name
-        #         for feature in features
-        #     }
-        #     rows = tuple(
-        #         key[0] == category.id
-        #         for key, _ in functional_counts
-        #     )
+        categories = self.adm.get_categories()
+        for category, category_sum in zip(categories, category_sums):
+            features = tuple(self.adm.get_features(category.id))
+            feature_names = {
+                feature.id: feature.name
+                for feature in features
+            }
+            rows = tuple(
+                key[0] == category.id
+                for key, _ in functional_counts
+            )
 
-        #     cat_counts = CountMatrix.from_count_matrix(functional_counts, rows=rows)
-        #     # cat_counts = CountMatrix(ncols=6, nrows=len(feature_names))
-        #     # for feature in features:
-        #     #     key = (category.id, feature.id)
-        #     #     if functional_counts.has_record(key):
-        #     #         cat_counts[key] += functional_counts[key]
-        #     #     else:
-        #     #         _ = cat_counts[key]            
+            cat_counts = CountMatrix.from_count_matrix(functional_counts, rows=rows)
+            # cat_counts = CountMatrix(ncols=6, nrows=len(feature_names))
+            # for feature in features:
+            #     key = (category.id, feature.id)
+            #     if functional_counts.has_record(key):
+            #         cat_counts[key] += functional_counts[key]
+            #     else:
+            #         _ = cat_counts[key]            
             
-        #     # for category in categories:
-        #     # features = ((feature.name, feature) for feature in db.get_features(category.id))
-        #     # for _, feature in sorted(features, key=lambda x: x[0]):
-        #     #     _ = functional_counts[(category.id, feature.id)]
+            # for category in categories:
+            # features = ((feature.name, feature) for feature in db.get_features(category.id))
+            # for _, feature in sorted(features, key=lambda x: x[0]):
+            #     _ = functional_counts[(category.id, feature.id)]
 
 
-        #     logger.info("PROCESSING CATEGORY=%s", category.name)
-        #     count_writer.write_category(
-        #         category.id,
-        #         category.name,
-        #         category_sum,
-        #         # functional_counts,
-        #         cat_counts,
-        #         # feature_names,
-        #         features,
-        #         unannotated_reads=(None, unannotated_reads)[report_unannotated],
-        #     )
+            logger.info("PROCESSING CATEGORY=%s", category.name)
+            count_writer.write_category(
+                category.id,
+                category.name,
+                category_sum,
+                # functional_counts,
+                cat_counts,
+                # feature_names,
+                features,
+                unannotated_reads=(None, unannotated_reads)[report_unannotated],
+            )
 
         self.adm.clear_caches()