From 61b93da99defeb80d4b6f155d671fc4662b73d9c Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Sun, 15 Dec 2024 23:37:09 +0100 Subject: [PATCH 001/128] initial --- gffquant/annotation/count_annotator.py | 19 +++--- gffquant/counters/__init__.py | 2 +- gffquant/counters/alignment_counter2.py | 81 +++++++++++++++++++++++++ gffquant/counters/count_manager.py | 17 +++++- gffquant/counters/region_counter.py | 2 +- gffquant/counters/seq_counter.py | 2 +- 6 files changed, 109 insertions(+), 14 deletions(-) create mode 100644 gffquant/counters/alignment_counter2.py diff --git a/gffquant/annotation/count_annotator.py b/gffquant/annotation/count_annotator.py index 760250dc..4d91f8cd 100644 --- a/gffquant/annotation/count_annotator.py +++ b/gffquant/annotation/count_annotator.py @@ -8,6 +8,7 @@ import numpy as np +from ..counters.count_manager import CountManager logger = logging.getLogger(__name__) @@ -198,7 +199,7 @@ def __init__(self, strand_specific, report_scaling_factors=True): CountAnnotator.__init__(self, strand_specific, report_scaling_factors=report_scaling_factors) # pylint: disable=R0914,W0613 - def annotate(self, refmgr, db, count_manager, gene_group_db=False): + def annotate(self, refmgr, db, count_manager: CountManager, gene_group_db=False): """ Annotate a set of region counts via db-lookup. input: @@ -206,9 +207,10 @@ def annotate(self, refmgr, db, count_manager, gene_group_db=False): - db: GffDatabaseManager holding functional annotation database - count_manager: count_data """ - for rid in set(count_manager.uniq_regioncounts).union( - count_manager.ambig_regioncounts - ): + # for rid in set(count_manager.uniq_regioncounts).union( + # count_manager.ambig_regioncounts + # ): + for rid in count_manager.get_all_regions(region_counts=True): ref = refmgr.get(rid[0] if isinstance(rid, tuple) else rid)[0] for region in count_manager.get_regions(rid): @@ -273,7 +275,7 @@ class GeneCountAnnotator(CountAnnotator): def __init__(self, strand_specific, report_scaling_factors=True): CountAnnotator.__init__(self, strand_specific, report_scaling_factors=report_scaling_factors) - def annotate(self, refmgr, db, count_manager, gene_group_db=False): + def annotate(self, refmgr, db, count_manager: CountManager, gene_group_db=False): """ Annotate a set of gene counts via db-iteration. input: @@ -286,9 +288,10 @@ def annotate(self, refmgr, db, count_manager, gene_group_db=False): if self.strand_specific else None ) - for rid in set(count_manager.uniq_seqcounts).union( - count_manager.ambig_seqcounts - ): + # for rid in set(count_manager.uniq_seqcounts).union( + # count_manager.ambig_seqcounts + # ): + for rid in count_manager.get_all_regions(): ref, region_length = refmgr.get(rid[0] if isinstance(rid, tuple) else rid) uniq_counts, ambig_counts = count_manager.get_counts( diff --git a/gffquant/counters/__init__.py b/gffquant/counters/__init__.py index 7641c957..774cd03c 100644 --- a/gffquant/counters/__init__.py +++ b/gffquant/counters/__init__.py @@ -3,7 +3,7 @@ """module docstring""" -from .alignment_counter import AlignmentCounter +from .alignment_counter2 import AlignmentCounter from .region_counter import RegionCounter from .seq_counter import UniqueSeqCounter, AmbiguousSeqCounter from .count_manager import CountManager diff --git a/gffquant/counters/alignment_counter2.py b/gffquant/counters/alignment_counter2.py new file mode 100644 index 00000000..e6f82111 --- /dev/null +++ b/gffquant/counters/alignment_counter2.py @@ -0,0 +1,81 @@ +from collections import Counter + +import numpy as np + +from .. import DistributionMode + + +class AlignmentCounter: + COUNT_HEADER_ELEMENTS = ("raw", "lnorm", "scaled") + INITIAL_SIZE = 1000 + + @staticmethod + def normalise_counts(counts, feature_len, scaling_factor): + """Returns raw, length-normalised, and scaled feature counts.""" + normalised = counts / feature_len + scaled = normalised * scaling_factor + return counts, normalised, scaled + + def get_increment(self, n_aln, increment): + # 1overN = lavern. Maya <3 + return (increment / n_aln) if self.distribution_mode == DistributionMode.ONE_OVER_N else increment + + def __init__(self, distribution_mode=DistributionMode.ONE_OVER_N, strand_specific=False): + self.distribution_mode = distribution_mode + self.strand_specific = strand_specific + self.unannotated_reads = 0 + + self.index = {} + self.counts = np.zeros( + (AlignmentCounter.INITIAL_SIZE, 1), + ) + def dump(self, prefix, refmgr): + ... + def get(self, key, default_val): + key_index = self.index.get(key) + if key_index is None: + return Counter() + return Counter({key: self.counts[key_index]}) + + def setdefault(self, key, default_val): + ... + + def __iter__(self): + yield from self.index.keys() + def __getitem__(self, key): + return self.counts.get(self.index.get(key), 0.0) + def __setitem__(self, key, value): + key_index = self.index.get(key) + if key_index is not None: + self.counts[key_index] = value + raise KeyError(f"{key=} not found.") + + def update_counts(self, count_stream, increment=1): + contributed_counts = 0 + for hits, aln_count in count_stream: + hit = hits[0] + inc = ( + ( + self.get_increment(aln_count, increment), + increment, + ) + )[aln_count == 1] + key = ( + ( + (hit.rid, hit.rev_strand), + hit.rid + ) + )[self.strand_specific] + + key_index = self.index.get(key) + if key_index is None: + nrows = self.counts.shape[0] + if len(self.index) == nrows: + self.counts = np.pad( + self.counts, + ((0, AlignmentCounter.INITIAL_SIZE), (0, 0),), + ) + key_index = self.index.setdefault(key, len(self.index)) + self.counts[key_index] += inc + contributed_counts += inc + diff --git a/gffquant/counters/count_manager.py b/gffquant/counters/count_manager.py index 40ae72a6..0eabc67e 100644 --- a/gffquant/counters/count_manager.py +++ b/gffquant/counters/count_manager.py @@ -3,7 +3,7 @@ from collections import Counter from .. import DistributionMode -from .alignment_counter import AlignmentCounter +from .alignment_counter2 import AlignmentCounter from .region_counter import RegionCounter @@ -157,6 +157,17 @@ def get_counts(self, seqid, region_counts=False, strand_specific=False): return uniq_counts, ambig_counts def get_regions(self, rid): - return set(self.uniq_regioncounts.get(rid, set())).union( - self.ambig_regioncounts.get(rid, set()) + # return set(self.uniq_regioncounts.get(rid, set())).union( + # self.ambig_regioncounts.get(rid, set()) + # ) + return set(self.uniq_regioncounts.get(rid, Counter())).union( + self.ambig_regioncounts.get(rid, Counter()) ) + + def get_all_regions(self, region_counts=False): + uniq_counts, ambig_counts = ( + (self.uniq_seqcounts, self.ambig_seqcounts,), + (self.uniq_regioncounts, self.ambig_regioncounts,), + )[region_counts] + yield from set(uniq_counts).union(ambig_counts) + diff --git a/gffquant/counters/region_counter.py b/gffquant/counters/region_counter.py index 7a617056..a7a74756 100644 --- a/gffquant/counters/region_counter.py +++ b/gffquant/counters/region_counter.py @@ -5,7 +5,7 @@ from collections import Counter from .. import DistributionMode -from .alignment_counter import AlignmentCounter +from .alignment_counter2 import AlignmentCounter class RegionCounter(AlignmentCounter): diff --git a/gffquant/counters/seq_counter.py b/gffquant/counters/seq_counter.py index bc71c7fb..91e28628 100644 --- a/gffquant/counters/seq_counter.py +++ b/gffquant/counters/seq_counter.py @@ -3,7 +3,7 @@ """ module docstring """ from .. import DistributionMode -from .alignment_counter import AlignmentCounter +from .alignment_counter2 import AlignmentCounter class UniqueSeqCounter(AlignmentCounter): From 6c80f0aacc7c92ff27f6f0753732fbbbe0fed5ab Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Sun, 15 Dec 2024 23:41:02 +0100 Subject: [PATCH 002/128] version --- Dockerfile | 2 +- gffquant/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Dockerfile b/Dockerfile index ac8b542d..aa88185f 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,7 +1,7 @@ FROM ubuntu:22.04 LABEL maintainer="cschu1981@gmail.com" -LABEL version="2.18.0" +LABEL version="2.19.0" LABEL description="gffquant - functional profiling of metagenomic/transcriptomic wgs samples" diff --git a/gffquant/__init__.py b/gffquant/__init__.py index 128d76bf..31f4177f 100644 --- a/gffquant/__init__.py +++ b/gffquant/__init__.py @@ -5,7 +5,7 @@ from enum import Enum, auto, unique -__version__ = "2.18.0" +__version__ = "2.19.0" __tool__ = "gffquant" From 2be12b462dbc14edeaf87a5a90673f47c84457fa Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Sun, 15 Dec 2024 23:53:30 +0100 Subject: [PATCH 003/128] fix: getitem implementation --- gffquant/counters/alignment_counter2.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/gffquant/counters/alignment_counter2.py b/gffquant/counters/alignment_counter2.py index e6f82111..14d77d96 100644 --- a/gffquant/counters/alignment_counter2.py +++ b/gffquant/counters/alignment_counter2.py @@ -43,7 +43,10 @@ def setdefault(self, key, default_val): def __iter__(self): yield from self.index.keys() def __getitem__(self, key): - return self.counts.get(self.index.get(key), 0.0) + key_index = self.index.get(key) + if key_index is None: + return 0.0 + return self.counts[self.index.get(key)] def __setitem__(self, key, value): key_index = self.index.get(key) if key_index is not None: From e143eb34b49511b16e08c759f3d2031fadfbd153 Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Mon, 16 Dec 2024 21:08:21 +0100 Subject: [PATCH 004/128] fix?: missing counts --- gffquant/counters/alignment_counter2.py | 9 ++++++--- gffquant/counters/count_manager.py | 4 ++++ 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/gffquant/counters/alignment_counter2.py b/gffquant/counters/alignment_counter2.py index 14d77d96..4723a410 100644 --- a/gffquant/counters/alignment_counter2.py +++ b/gffquant/counters/alignment_counter2.py @@ -78,7 +78,10 @@ def update_counts(self, count_stream, increment=1): self.counts, ((0, AlignmentCounter.INITIAL_SIZE), (0, 0),), ) - key_index = self.index.setdefault(key, len(self.index)) - self.counts[key_index] += inc - contributed_counts += inc + # key_index = self.index.setdefault(key, len(self.index)) + key_index = self.index[key] = len(self.index) + self.counts[key_index] += inc + contributed_counts += inc + + return contributed_counts diff --git a/gffquant/counters/count_manager.py b/gffquant/counters/count_manager.py index 0eabc67e..c7d4718e 100644 --- a/gffquant/counters/count_manager.py +++ b/gffquant/counters/count_manager.py @@ -66,6 +66,10 @@ def __init__( strand_specific=strand_specific, distribution_mode=distribution_mode ) + # self.seqcounts = AlignmentCounter( + # strand_specific=strand_specific, + # distribution_mode=distribution_mode, + # ) def has_ambig_counts(self): return self.ambig_regioncounts or self.ambig_seqcounts From 32ce2bd090998d175546bc4c35511ccfc8b0b9f1 Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Mon, 16 Dec 2024 21:18:09 +0100 Subject: [PATCH 005/128] fix: fixing AlignmentCounter __getitem__/__setitem__ methods --- gffquant/counters/alignment_counter2.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/gffquant/counters/alignment_counter2.py b/gffquant/counters/alignment_counter2.py index 4723a410..c5e61b36 100644 --- a/gffquant/counters/alignment_counter2.py +++ b/gffquant/counters/alignment_counter2.py @@ -46,12 +46,13 @@ def __getitem__(self, key): key_index = self.index.get(key) if key_index is None: return 0.0 - return self.counts[self.index.get(key)] + return self.counts[key_index] def __setitem__(self, key, value): key_index = self.index.get(key) if key_index is not None: self.counts[key_index] = value - raise KeyError(f"{key=} not found.") + else: + raise KeyError(f"{key=} not found.") def update_counts(self, count_stream, increment=1): contributed_counts = 0 From ed0eaab8eaaef2cc21c21a02eb9c47a56fd77e56 Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Mon, 16 Dec 2024 23:33:56 +0100 Subject: [PATCH 006/128] merge uniq/ambig seqcounters --- gffquant/annotation/count_annotator.py | 3 - gffquant/counters/alignment_counter2.py | 9 +- gffquant/counters/count_manager.py | 115 +++++++++++++++--------- 3 files changed, 78 insertions(+), 49 deletions(-) diff --git a/gffquant/annotation/count_annotator.py b/gffquant/annotation/count_annotator.py index 4d91f8cd..70bc6c86 100644 --- a/gffquant/annotation/count_annotator.py +++ b/gffquant/annotation/count_annotator.py @@ -288,9 +288,6 @@ def annotate(self, refmgr, db, count_manager: CountManager, gene_group_db=False) if self.strand_specific else None ) - # for rid in set(count_manager.uniq_seqcounts).union( - # count_manager.ambig_seqcounts - # ): for rid in count_manager.get_all_regions(): ref, region_length = refmgr.get(rid[0] if isinstance(rid, tuple) else rid) diff --git a/gffquant/counters/alignment_counter2.py b/gffquant/counters/alignment_counter2.py index c5e61b36..6b54a845 100644 --- a/gffquant/counters/alignment_counter2.py +++ b/gffquant/counters/alignment_counter2.py @@ -27,7 +27,7 @@ def __init__(self, distribution_mode=DistributionMode.ONE_OVER_N, strand_specifi self.index = {} self.counts = np.zeros( - (AlignmentCounter.INITIAL_SIZE, 1), + (AlignmentCounter.INITIAL_SIZE, 2), ) def dump(self, prefix, refmgr): ... @@ -39,6 +39,9 @@ def get(self, key, default_val): def setdefault(self, key, default_val): ... + + def has_ambig_counts(self): + ... def __iter__(self): yield from self.index.keys() @@ -54,7 +57,7 @@ def __setitem__(self, key, value): else: raise KeyError(f"{key=} not found.") - def update_counts(self, count_stream, increment=1): + def update_counts(self, count_stream, increment=1, ambiguous_counts=False): contributed_counts = 0 for hits, aln_count in count_stream: hit = hits[0] @@ -81,7 +84,7 @@ def update_counts(self, count_stream, increment=1): ) # key_index = self.index.setdefault(key, len(self.index)) key_index = self.index[key] = len(self.index) - self.counts[key_index] += inc + self.counts[key_index][int(ambiguous_counts)] += inc contributed_counts += inc return contributed_counts diff --git a/gffquant/counters/count_manager.py b/gffquant/counters/count_manager.py index c7d4718e..37d31f6b 100644 --- a/gffquant/counters/count_manager.py +++ b/gffquant/counters/count_manager.py @@ -50,36 +50,45 @@ def __init__( self.increments = [1.0, 1.0] self.increments_auto_detect = [1.0, self.paired_end_count / 2.0] - self.uniq_seqcounts, self.ambig_seqcounts = None, None - self.uniq_regioncounts, self.ambig_regioncounts = None, None + # self.uniq_seqcounts, self.ambig_seqcounts = None, None + # self.uniq_regioncounts, self.ambig_regioncounts = None, None + self.seqcounts, self.regioncounts = None, None if region_counts: - self.uniq_regioncounts = RegionCounter(strand_specific=strand_specific) - self.ambig_regioncounts = RegionCounter( + # self.uniq_regioncounts = RegionCounter(strand_specific=strand_specific) + # self.ambig_regioncounts = RegionCounter( + # strand_specific=strand_specific, + # distribution_mode=distribution_mode, + # ) + self.regioncounts = RegionCounter( strand_specific=strand_specific, distribution_mode=distribution_mode, ) else: - self.uniq_seqcounts = AlignmentCounter(strand_specific=strand_specific) - self.ambig_seqcounts = AlignmentCounter( - strand_specific=strand_specific, - distribution_mode=distribution_mode - ) - # self.seqcounts = AlignmentCounter( + # self.uniq_seqcounts = AlignmentCounter(strand_specific=strand_specific) + # self.ambig_seqcounts = AlignmentCounter( # strand_specific=strand_specific, - # distribution_mode=distribution_mode, + # distribution_mode=distribution_mode # ) + self.seqcounts = AlignmentCounter( + strand_specific=strand_specific, + distribution_mode=distribution_mode, + ) def has_ambig_counts(self): - return self.ambig_regioncounts or self.ambig_seqcounts + return any( + self.seqcounts and self.seqcounts.has_ambig_counts(), + self.regioncounts and self.regioncounts.has_ambig_counts(), + ) + # return self.ambig_regioncounts or self.ambig_seqcounts def update_counts(self, count_stream, ambiguous_counts=False, pair=False, pe_library=None): - seq_counter, region_counter = ( - (self.uniq_seqcounts, self.uniq_regioncounts) - if not ambiguous_counts - else (self.ambig_seqcounts, self.ambig_regioncounts) - ) + # seq_counter, region_counter = ( + # (self.uniq_seqcounts, self.uniq_regioncounts) + # if not ambiguous_counts + # else (self.ambig_seqcounts, self.ambig_regioncounts) + # ) if pe_library is not None: # this is the case when the alignment has a read group tag @@ -95,40 +104,51 @@ def update_counts(self, count_stream, ambiguous_counts=False, pair=False, pe_lib increment = self.increments[pair] contributed_counts = 0 - if seq_counter is not None: - contributed_counts = seq_counter.update_counts(count_stream, increment=increment) - elif region_counter is not None: - contributed_counts = region_counter.update_counts(count_stream, increment=increment) + if self.seqcounts is not None: + contributed_counts = self.seqcounts.update_counts(count_stream, increment=increment, ambiguous_counts=ambiguous_counts,) + elif self.regioncounts is not None: + contributed_counts = self.regioncounts.update_counts(count_stream, increment=increment, ambiguous_counts=ambiguous_counts,) + # if seq_counter is not None: + # contributed_counts = seq_counter.update_counts(count_stream, increment=increment) + # elif region_counter is not None: + # contributed_counts = region_counter.update_counts(count_stream, increment=increment) return contributed_counts def dump_raw_counters(self, prefix, refmgr): - if self.uniq_seqcounts is not None: - self.uniq_seqcounts.dump(prefix, refmgr) - if self.ambig_seqcounts is not None: - self.ambig_seqcounts.dump(prefix, refmgr) - if self.uniq_regioncounts is not None: - self.uniq_regioncounts.dump(prefix, refmgr) - if self.ambig_regioncounts is not None: - self.ambig_regioncounts.dump(prefix, refmgr) + # if self.uniq_seqcounts is not None: + # self.uniq_seqcounts.dump(prefix, refmgr) + # if self.ambig_seqcounts is not None: + # self.ambig_seqcounts.dump(prefix, refmgr) + # if self.uniq_regioncounts is not None: + # self.uniq_regioncounts.dump(prefix, refmgr) + # if self.ambig_regioncounts is not None: + # self.ambig_regioncounts.dump(prefix, refmgr) + ... def get_unannotated_reads(self): unannotated_reads = 0 - if self.uniq_regioncounts is not None: - unannotated_reads += self.uniq_regioncounts.unannotated_reads - if self.ambig_regioncounts is not None: - unannotated_reads += self.ambig_regioncounts.unannotated_reads - if self.uniq_seqcounts is not None: - unannotated_reads += self.uniq_seqcounts.unannotated_reads - if self.ambig_seqcounts is not None: - unannotated_reads += self.ambig_seqcounts.unannotated_reads + # if self.uniq_regioncounts is not None: + # unannotated_reads += self.uniq_regioncounts.unannotated_reads + # if self.ambig_regioncounts is not None: + # unannotated_reads += self.ambig_regioncounts.unannotated_reads + # if self.uniq_seqcounts is not None: + # unannotated_reads += self.uniq_seqcounts.unannotated_reads + # if self.ambig_seqcounts is not None: + # unannotated_reads += self.ambig_seqcounts.unannotated_reads + if self.regioncounts is not None: + unannotated_reads += self.regioncounts + if self.seqcounts is not None: + unannotated_reads += self.seqcounts return unannotated_reads def get_counts(self, seqid, region_counts=False, strand_specific=False): if region_counts: + raise NotImplementedError() rid, seqid = seqid[0], seqid[1:] + uniq_counter = self.uniq_regioncounts.get(rid, Counter()) ambig_counter = self.ambig_regioncounts.get(rid, Counter()) @@ -139,9 +159,11 @@ def get_counts(self, seqid, region_counts=False, strand_specific=False): return [uniq_counter[seqid]], [ambig_counter[seqid]] else: - uniq_counter, ambig_counter = self.uniq_seqcounts, self.ambig_seqcounts + # uniq_counter, ambig_counter = self.uniq_seqcounts, self.ambig_seqcounts + if strand_specific: + raise NotImplementedError() uniq_counts, ambig_counts = [0.0, 0.0], [0.0, 0.0] uniq_counts[seqid[1]] = uniq_counter[seqid] ambig_counts[seqid[1]] = ambig_counter[seqid] @@ -156,7 +178,8 @@ def get_counts(self, seqid, region_counts=False, strand_specific=False): # ambig_counter[(rid, CountManager.MINUS_STRAND)], # ] else: - uniq_counts, ambig_counts = [uniq_counter[seqid]], [ambig_counter[seqid]] + # uniq_counts, ambig_counts = [uniq_counter[seqid]], [ambig_counter[seqid]] + uniq_counts, ambig_counts = [self.seqcounts[seqid][0]], [self.seqcounts[seqid][1]] return uniq_counts, ambig_counts @@ -169,9 +192,15 @@ def get_regions(self, rid): ) def get_all_regions(self, region_counts=False): - uniq_counts, ambig_counts = ( - (self.uniq_seqcounts, self.ambig_seqcounts,), - (self.uniq_regioncounts, self.ambig_regioncounts,), + # uniq_counts, ambig_counts = ( + # (self.uniq_seqcounts, self.ambig_seqcounts,), + # (self.uniq_regioncounts, self.ambig_regioncounts,), + # )[region_counts] + # yield from set(uniq_counts).union(ambig_counts) + counts = ( + self.seqcounts, + self.regioncounts, )[region_counts] - yield from set(uniq_counts).union(ambig_counts) + + yield from counts From 6c32072c15b54aa20b51a560ad5cbab4ae99122d Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Mon, 16 Dec 2024 23:47:41 +0100 Subject: [PATCH 007/128] fix: AlignmentCounter.has_ambig_counts(), CountManager.has_ambig_counts() --- gffquant/counters/alignment_counter2.py | 2 +- gffquant/counters/count_manager.py | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/gffquant/counters/alignment_counter2.py b/gffquant/counters/alignment_counter2.py index 6b54a845..9e9e7ce1 100644 --- a/gffquant/counters/alignment_counter2.py +++ b/gffquant/counters/alignment_counter2.py @@ -41,7 +41,7 @@ def setdefault(self, key, default_val): ... def has_ambig_counts(self): - ... + return bool(self.counts[:, 1].sum() != 0) def __iter__(self): yield from self.index.keys() diff --git a/gffquant/counters/count_manager.py b/gffquant/counters/count_manager.py index 37d31f6b..b25cf21b 100644 --- a/gffquant/counters/count_manager.py +++ b/gffquant/counters/count_manager.py @@ -78,8 +78,10 @@ def __init__( def has_ambig_counts(self): return any( - self.seqcounts and self.seqcounts.has_ambig_counts(), - self.regioncounts and self.regioncounts.has_ambig_counts(), + ( + self.seqcounts and self.seqcounts.has_ambig_counts(), + self.regioncounts and self.regioncounts.has_ambig_counts(), + ) ) # return self.ambig_regioncounts or self.ambig_seqcounts From 28f84e8a8aed979cad28b575647bcc7319a3bd6d Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Mon, 16 Dec 2024 23:58:25 +0100 Subject: [PATCH 008/128] fix: minor --- gffquant/counters/count_manager.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gffquant/counters/count_manager.py b/gffquant/counters/count_manager.py index b25cf21b..22fa9988 100644 --- a/gffquant/counters/count_manager.py +++ b/gffquant/counters/count_manager.py @@ -140,9 +140,9 @@ def get_unannotated_reads(self): # if self.ambig_seqcounts is not None: # unannotated_reads += self.ambig_seqcounts.unannotated_reads if self.regioncounts is not None: - unannotated_reads += self.regioncounts + unannotated_reads += self.regioncounts.unannotated_reads if self.seqcounts is not None: - unannotated_reads += self.seqcounts + unannotated_reads += self.seqcounts.unannotated_reads return unannotated_reads From b344e9f7cb3e8bedb9cc5b5e7a134d7f759d8726 Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Thu, 19 Dec 2024 21:59:24 +0100 Subject: [PATCH 009/128] updating count annotation --- gffquant/annotation/count_annotator.py | 6 ++- gffquant/annotation/genecount_annotator.py | 48 ++++++++++++++++++++++ gffquant/counters/alignment_counter2.py | 30 +++++++++++++- gffquant/counters/count_manager.py | 14 ++++++- 4 files changed, 93 insertions(+), 5 deletions(-) create mode 100644 gffquant/annotation/genecount_annotator.py diff --git a/gffquant/annotation/count_annotator.py b/gffquant/annotation/count_annotator.py index 70bc6c86..67f6ce7a 100644 --- a/gffquant/annotation/count_annotator.py +++ b/gffquant/annotation/count_annotator.py @@ -95,7 +95,8 @@ def calculate_scaling_factors(self, default_scaling_factor=0): def calc_scaling_factor(raw, normed, default=0): return (raw / normed) if normed else default - total_uniq, total_uniq_normed, total_ambi, total_ambi_normed = self.total_counts + # total_uniq, total_uniq_normed, total_ambi, total_ambi_normed = self.total_counts + total_uniq, total_ambi, total_uniq_normed, total_ambi_normed = self.total_counts logger.info( "TOTAL COUNTS: uraw=%s unorm=%s araw=%s anorm=%s", total_uniq, total_uniq_normed, total_ambi, total_ambi_normed @@ -109,7 +110,8 @@ def calc_scaling_factor(raw, normed, default=0): total_ambi, total_ambi_normed, default_scaling_factor ) - total_uniq, total_uniq_normed, total_ambi, total_ambi_normed = self.total_gene_counts + # total_uniq, total_uniq_normed, total_ambi, total_ambi_normed = self.total_gene_counts + total_uniq, total_ambi, total_uniq_normed, total_ambi_normed = self.total_gene_counts logger.info( "TOTAL GENE COUNTS: uraw=%s unorm=%s araw=%s anorm=%s", total_uniq, total_uniq_normed, total_ambi, total_ambi_normed diff --git a/gffquant/annotation/genecount_annotator.py b/gffquant/annotation/genecount_annotator.py new file mode 100644 index 00000000..7fab2d11 --- /dev/null +++ b/gffquant/annotation/genecount_annotator.py @@ -0,0 +1,48 @@ +import logging + +import numpy as np + +from .count_annotator import CountAnnotator +from ..counters import CountManager + + +logger = logging.getLogger(__name__) + + +class GeneCountAnnotator(CountAnnotator): + """ CountAnnotator subclass for gene-based counting. """ + + def __init__(self, strand_specific, report_scaling_factors=True): + CountAnnotator.__init__(self, strand_specific, report_scaling_factors=report_scaling_factors) + + def annotate(self, refmgr, db, count_manager: CountManager, gene_group_db=False): + self.total_gene_counts = count_manager.transform_counts(refmgr) + self.total_counts = self.total_gene_counts # ? + + for rid in count_manager.get_all_regions(): + counts = count_manager.get_counts(rid) + ref, _ = refmgr.get(rid[0] if isinstance(rid, tuple) else rid) + + if gene_group_db: + ref_tokens = ref.split(".") + gene_id, ggroup_id = ".".join(ref_tokens[:-1]), ref_tokens[-1] + else: + gene_id, ggroup_id = ref, ref + + region_annotation = db.query_sequence(ggroup_id) + if region_annotation is not None: + _, _, region_annotation = region_annotation + logger.info( + "GCAnnotator: Distributing counts of Gene %s (group=%s) %s %s", + gene_id, ggroup_id, counts[0], counts[2], + ) + self.distribute_feature_counts(counts, region_annotation) + + else: + logger.info("GCAnnotator: Gene %s (group=%s) has no information in database.", gene_id, ggroup_id) + self.unannotated_counts += counts[:4] + + self.calculate_scaling_factors() + + + diff --git a/gffquant/counters/alignment_counter2.py b/gffquant/counters/alignment_counter2.py index 9e9e7ce1..4d91b9ae 100644 --- a/gffquant/counters/alignment_counter2.py +++ b/gffquant/counters/alignment_counter2.py @@ -27,7 +27,8 @@ def __init__(self, distribution_mode=DistributionMode.ONE_OVER_N, strand_specifi self.index = {} self.counts = np.zeros( - (AlignmentCounter.INITIAL_SIZE, 2), + (AlignmentCounter.INITIAL_SIZE, 2,), + dtype='float64', ) def dump(self, prefix, refmgr): ... @@ -88,4 +89,31 @@ def update_counts(self, count_stream, increment=1, ambiguous_counts=False): contributed_counts += inc return contributed_counts + + def transform(self, refmgr): + # transform 2-column uniq/ambig count matrix + # into 4 columns + # uniq_raw, combined_raw, uniq_lnorm, combined_lnorm + + # obtain gene lengths + lengths = np.array( + (refmgr.get(key[0] if isinstance(key, tuple) else key))[1] + for key in self.index + ) + + # calculate combined_raw + self.counts[:, 1:2] += self.counts[:, 0:1] + + # duplicate the raw counts + self.counts = np.concatenate( + (self.counts, self.counts,), + axis=1, + ) + + # length-normalise the lnorm columns + self.counts[:, 2:4] /= lengths[:, None] + + # return count sums + return self.counts.sum(axis=0) + diff --git a/gffquant/counters/count_manager.py b/gffquant/counters/count_manager.py index 22fa9988..4488babb 100644 --- a/gffquant/counters/count_manager.py +++ b/gffquant/counters/count_manager.py @@ -2,6 +2,8 @@ from collections import Counter +import numpy as np + from .. import DistributionMode from .alignment_counter2 import AlignmentCounter from .region_counter import RegionCounter @@ -181,9 +183,11 @@ def get_counts(self, seqid, region_counts=False, strand_specific=False): # ] else: # uniq_counts, ambig_counts = [uniq_counter[seqid]], [ambig_counter[seqid]] - uniq_counts, ambig_counts = [self.seqcounts[seqid][0]], [self.seqcounts[seqid][1]] + # uniq_counts, ambig_counts = [self.seqcounts[seqid][0]], [self.seqcounts[seqid][1]] + counts = self.seqcounts[seqid] - return uniq_counts, ambig_counts + # return uniq_counts, ambig_counts + return np.array((counts[0], counts[2], counts[1], counts[3])) def get_regions(self, rid): # return set(self.uniq_regioncounts.get(rid, set())).union( @@ -206,3 +210,9 @@ def get_all_regions(self, region_counts=False): yield from counts + def transform_counts(self, refmgr): + if self.seqcounts is not None: + self.seqcounts.transform(refmgr) + if self.regioncounts is not None: + self.regioncounts.transform(refmgr) + From 5ed8cfc95c7b76a1e58a1bfc20d264a2bff78002 Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Thu, 19 Dec 2024 22:15:37 +0100 Subject: [PATCH 010/128] fixed import --- gffquant/annotation/__init__.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/gffquant/annotation/__init__.py b/gffquant/annotation/__init__.py index 4ae04f44..1649dcb4 100644 --- a/gffquant/annotation/__init__.py +++ b/gffquant/annotation/__init__.py @@ -2,5 +2,7 @@ """ module docstring """ -from .count_annotator import GeneCountAnnotator, RegionCountAnnotator +# from .count_annotator import GeneCountAnnotator, RegionCountAnnotator +from .count_annotator import RegionCountAnnotator from .count_writer import CountWriter +from .genecount_annotator import GeneCountAnnotator From a92722ae8a2181417d87e41a92c9260357fc7021 Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Thu, 19 Dec 2024 22:34:16 +0100 Subject: [PATCH 011/128] added debug message --- gffquant/counters/alignment_counter2.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/gffquant/counters/alignment_counter2.py b/gffquant/counters/alignment_counter2.py index 4d91b9ae..02d53b15 100644 --- a/gffquant/counters/alignment_counter2.py +++ b/gffquant/counters/alignment_counter2.py @@ -1,3 +1,5 @@ +import logging + from collections import Counter import numpy as np @@ -5,6 +7,9 @@ from .. import DistributionMode +logger = logging.getLogger(__name__) + + class AlignmentCounter: COUNT_HEADER_ELEMENTS = ("raw", "lnorm", "scaled") INITIAL_SIZE = 1000 @@ -100,6 +105,7 @@ def transform(self, refmgr): (refmgr.get(key[0] if isinstance(key, tuple) else key))[1] for key in self.index ) + logger.info("LENGTHS ARRAY = %s", lengths.shape) # calculate combined_raw self.counts[:, 1:2] += self.counts[:, 0:1] From 515e71f2c0346524fe4febd64d35ee13a6cbeb64 Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Thu, 19 Dec 2024 22:46:07 +0100 Subject: [PATCH 012/128] added debug message --- gffquant/counters/alignment_counter2.py | 1 + 1 file changed, 1 insertion(+) diff --git a/gffquant/counters/alignment_counter2.py b/gffquant/counters/alignment_counter2.py index 02d53b15..5df5de16 100644 --- a/gffquant/counters/alignment_counter2.py +++ b/gffquant/counters/alignment_counter2.py @@ -106,6 +106,7 @@ def transform(self, refmgr): for key in self.index ) logger.info("LENGTHS ARRAY = %s", lengths.shape) + logger.info("INDEX SIZE = %s", len(self.index)) # calculate combined_raw self.counts[:, 1:2] += self.counts[:, 0:1] From 6efbca61156509d13466c0c4acaeaa61b5ad85de Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Thu, 19 Dec 2024 22:57:08 +0100 Subject: [PATCH 013/128] fixing empty length vector issue? --- gffquant/counters/alignment_counter2.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/gffquant/counters/alignment_counter2.py b/gffquant/counters/alignment_counter2.py index 5df5de16..bdeebf08 100644 --- a/gffquant/counters/alignment_counter2.py +++ b/gffquant/counters/alignment_counter2.py @@ -102,8 +102,10 @@ def transform(self, refmgr): # obtain gene lengths lengths = np.array( - (refmgr.get(key[0] if isinstance(key, tuple) else key))[1] - for key in self.index + tuple( + (refmgr.get(key[0] if isinstance(key, tuple) else key))[1] + for key in self.index + ) ) logger.info("LENGTHS ARRAY = %s", lengths.shape) logger.info("INDEX SIZE = %s", len(self.index)) From ef34cde79fc030e73decf66319f29c9059521213 Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Thu, 19 Dec 2024 23:12:14 +0100 Subject: [PATCH 014/128] fixing empty length vector issue? --- gffquant/counters/alignment_counter2.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/gffquant/counters/alignment_counter2.py b/gffquant/counters/alignment_counter2.py index bdeebf08..8516006c 100644 --- a/gffquant/counters/alignment_counter2.py +++ b/gffquant/counters/alignment_counter2.py @@ -110,6 +110,9 @@ def transform(self, refmgr): logger.info("LENGTHS ARRAY = %s", lengths.shape) logger.info("INDEX SIZE = %s", len(self.index)) + # remove the un-indexed rows + self.counts = self.counts[0:len(self.index), :] + # calculate combined_raw self.counts[:, 1:2] += self.counts[:, 0:1] From aa09c1affee68a415701bfa016f03f83aec994a9 Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Thu, 19 Dec 2024 23:22:44 +0100 Subject: [PATCH 015/128] added debug message --- gffquant/annotation/genecount_annotator.py | 1 + 1 file changed, 1 insertion(+) diff --git a/gffquant/annotation/genecount_annotator.py b/gffquant/annotation/genecount_annotator.py index 7fab2d11..d34f3032 100644 --- a/gffquant/annotation/genecount_annotator.py +++ b/gffquant/annotation/genecount_annotator.py @@ -17,6 +17,7 @@ def __init__(self, strand_specific, report_scaling_factors=True): def annotate(self, refmgr, db, count_manager: CountManager, gene_group_db=False): self.total_gene_counts = count_manager.transform_counts(refmgr) + logger.info("TOTAL_GENE_COUNTS = %s", self.total_gene_counts) self.total_counts = self.total_gene_counts # ? for rid in count_manager.get_all_regions(): From d3649c28cd2334522d620d4387748d79826c52b2 Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Thu, 19 Dec 2024 23:30:58 +0100 Subject: [PATCH 016/128] fixing empty total counts? --- gffquant/counters/count_manager.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gffquant/counters/count_manager.py b/gffquant/counters/count_manager.py index 4488babb..97bb2cc0 100644 --- a/gffquant/counters/count_manager.py +++ b/gffquant/counters/count_manager.py @@ -212,7 +212,7 @@ def get_all_regions(self, region_counts=False): def transform_counts(self, refmgr): if self.seqcounts is not None: - self.seqcounts.transform(refmgr) + return self.seqcounts.transform(refmgr) if self.regioncounts is not None: - self.regioncounts.transform(refmgr) + return self.regioncounts.transform(refmgr) From e3bee67e54facd5d831ba309cc11f8e85327248f Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Thu, 19 Dec 2024 23:55:54 +0100 Subject: [PATCH 017/128] fixing total count issue? --- gffquant/annotation/genecount_annotator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gffquant/annotation/genecount_annotator.py b/gffquant/annotation/genecount_annotator.py index d34f3032..ed161d40 100644 --- a/gffquant/annotation/genecount_annotator.py +++ b/gffquant/annotation/genecount_annotator.py @@ -18,7 +18,7 @@ def __init__(self, strand_specific, report_scaling_factors=True): def annotate(self, refmgr, db, count_manager: CountManager, gene_group_db=False): self.total_gene_counts = count_manager.transform_counts(refmgr) logger.info("TOTAL_GENE_COUNTS = %s", self.total_gene_counts) - self.total_counts = self.total_gene_counts # ? + # self.total_counts = self.total_gene_counts # ? for rid in count_manager.get_all_regions(): counts = count_manager.get_counts(rid) From b466381cc2e6312c7a11031474edf39e875dcbd7 Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Fri, 20 Dec 2024 22:12:22 +0100 Subject: [PATCH 018/128] debug messaging --- gffquant/annotation/count_annotator.py | 6 +++--- gffquant/annotation/count_writer.py | 1 + 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/gffquant/annotation/count_annotator.py b/gffquant/annotation/count_annotator.py index 67f6ce7a..b3f74d18 100644 --- a/gffquant/annotation/count_annotator.py +++ b/gffquant/annotation/count_annotator.py @@ -95,8 +95,8 @@ def calculate_scaling_factors(self, default_scaling_factor=0): def calc_scaling_factor(raw, normed, default=0): return (raw / normed) if normed else default - # total_uniq, total_uniq_normed, total_ambi, total_ambi_normed = self.total_counts - total_uniq, total_ambi, total_uniq_normed, total_ambi_normed = self.total_counts + total_uniq, total_uniq_normed, total_ambi, total_ambi_normed = self.total_counts + # total_uniq, total_ambi, total_uniq_normed, total_ambi_normed = self.total_counts logger.info( "TOTAL COUNTS: uraw=%s unorm=%s araw=%s anorm=%s", total_uniq, total_uniq_normed, total_ambi, total_ambi_normed @@ -141,7 +141,7 @@ def calc_scaling_factor(raw, normed, default=0): total_ambi, total_ambi_normed, default_scaling_factor ) ) - + if self.report_scaling_factors: logger.info( "Calculating scaling factors for category=%s: uraw=%s unorm=%s araw=%s anorm=%s -> factors=%s", diff --git a/gffquant/annotation/count_writer.py b/gffquant/annotation/count_writer.py index fa67c3fc..1827a2b8 100644 --- a/gffquant/annotation/count_writer.py +++ b/gffquant/annotation/count_writer.py @@ -142,6 +142,7 @@ def write_feature_counts(self, db, featcounts, unannotated_reads=None, report_un if "category" in self.publish_reports: cat_counts = counts.get(f"cat:::{category_id}") + logger.info("CAT %s: %s", category_id, str(cat_counts)) if cat_counts is not None: cat_row = self.compile_output_row( cat_counts, From 8249d9f9d6c4f91f177101228ca61cbe9275807e Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Fri, 20 Dec 2024 22:58:45 +0100 Subject: [PATCH 019/128] fixed gene writing? --- gffquant/annotation/count_writer.py | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/gffquant/annotation/count_writer.py b/gffquant/annotation/count_writer.py index 1827a2b8..f1092192 100644 --- a/gffquant/annotation/count_writer.py +++ b/gffquant/annotation/count_writer.py @@ -8,6 +8,8 @@ import numpy as np +from ..counters import CountManager + logger = logging.getLogger(__name__) @@ -161,16 +163,25 @@ def write_feature_counts(self, db, featcounts, unannotated_reads=None, report_un ) CountWriter.write_row(feature.name, out_row, stream=feat_out) - def write_gene_counts(self, gene_counts, uniq_scaling_factor, ambig_scaling_factor): + def write_gene_counts(self, gene_counts: CountManager, uniq_scaling_factor, ambig_scaling_factor): if "scaled" in self.publish_reports: logger.info("SCALING_FACTORS %s %s", uniq_scaling_factor, ambig_scaling_factor) with gzip.open(f"{self.out_prefix}.gene_counts.txt.gz", "wt") as gene_out: print("gene", *self.get_header(), sep="\t", file=gene_out, flush=True) - for gene, g_counts in sorted(gene_counts.items()): + # for gene, g_counts in sorted(gene_counts.items()): + # out_row = self.compile_output_row( + # g_counts, + # scaling_factor=uniq_scaling_factor, + # ambig_scaling_factor=ambig_scaling_factor + # ) + # CountWriter.write_row(gene, out_row, stream=gene_out) + for rid in gene_counts.get_all_regions(): + counts = gene_counts.get_counts(rid) out_row = self.compile_output_row( - g_counts, + counts, scaling_factor=uniq_scaling_factor, - ambig_scaling_factor=ambig_scaling_factor + ambig_scaling_factor=ambig_scaling_factor, ) - CountWriter.write_row(gene, out_row, stream=gene_out) + CountWriter.write_row(rid, out_row, stream=gene_out,) + From 7fb5c56e65b8fdc6709bf167f4a400b1d8632f62 Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Fri, 20 Dec 2024 23:15:45 +0100 Subject: [PATCH 020/128] fixed gene writing? --- gffquant/profilers/feature_quantifier.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/gffquant/profilers/feature_quantifier.py b/gffquant/profilers/feature_quantifier.py index bd5c5c96..a8897ffa 100644 --- a/gffquant/profilers/feature_quantifier.py +++ b/gffquant/profilers/feature_quantifier.py @@ -186,7 +186,8 @@ def process_counters( ) count_writer.write_gene_counts( - count_annotator.gene_counts, + # count_annotator.gene_counts, + self.count_manager, count_annotator.scaling_factors["total_gene_uniq"], count_annotator.scaling_factors["total_gene_ambi"] ) From d2226acb79cfa7f894c9ef650cf389220fcf4a7e Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Fri, 20 Dec 2024 23:38:34 +0100 Subject: [PATCH 021/128] fixed gene writing? --- gffquant/annotation/count_writer.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/gffquant/annotation/count_writer.py b/gffquant/annotation/count_writer.py index f1092192..7a73406c 100644 --- a/gffquant/annotation/count_writer.py +++ b/gffquant/annotation/count_writer.py @@ -163,7 +163,7 @@ def write_feature_counts(self, db, featcounts, unannotated_reads=None, report_un ) CountWriter.write_row(feature.name, out_row, stream=feat_out) - def write_gene_counts(self, gene_counts: CountManager, uniq_scaling_factor, ambig_scaling_factor): + def write_gene_counts(self, gene_counts: CountManager, refmgr, uniq_scaling_factor, ambig_scaling_factor): if "scaled" in self.publish_reports: logger.info("SCALING_FACTORS %s %s", uniq_scaling_factor, ambig_scaling_factor) with gzip.open(f"{self.out_prefix}.gene_counts.txt.gz", "wt") as gene_out: @@ -183,5 +183,6 @@ def write_gene_counts(self, gene_counts: CountManager, uniq_scaling_factor, ambi scaling_factor=uniq_scaling_factor, ambig_scaling_factor=ambig_scaling_factor, ) - CountWriter.write_row(rid, out_row, stream=gene_out,) + ref = refmgr.get(rid[0] if isinstance(rid, tuple) else rid)[0] + CountWriter.write_row(ref, out_row, stream=gene_out,) From a1e54c29d0834a92e0704cfec0f2359a17b31cc4 Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Sat, 21 Dec 2024 00:01:14 +0100 Subject: [PATCH 022/128] fixed gene writing? --- gffquant/profilers/feature_quantifier.py | 1 + 1 file changed, 1 insertion(+) diff --git a/gffquant/profilers/feature_quantifier.py b/gffquant/profilers/feature_quantifier.py index a8897ffa..358d3fd8 100644 --- a/gffquant/profilers/feature_quantifier.py +++ b/gffquant/profilers/feature_quantifier.py @@ -188,6 +188,7 @@ def process_counters( count_writer.write_gene_counts( # count_annotator.gene_counts, self.count_manager, + self.reference_manager, count_annotator.scaling_factors["total_gene_uniq"], count_annotator.scaling_factors["total_gene_ambi"] ) From f3364934328765a84ad6ad1635ab33f1c494ada9 Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Sat, 21 Dec 2024 09:43:34 +0100 Subject: [PATCH 023/128] fixed gene writing? --- gffquant/annotation/count_writer.py | 11 +++++++++-- gffquant/profilers/feature_quantifier.py | 3 ++- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/gffquant/annotation/count_writer.py b/gffquant/annotation/count_writer.py index 7a73406c..2b6168aa 100644 --- a/gffquant/annotation/count_writer.py +++ b/gffquant/annotation/count_writer.py @@ -163,7 +163,7 @@ def write_feature_counts(self, db, featcounts, unannotated_reads=None, report_un ) CountWriter.write_row(feature.name, out_row, stream=feat_out) - def write_gene_counts(self, gene_counts: CountManager, refmgr, uniq_scaling_factor, ambig_scaling_factor): + def write_gene_counts(self, gene_counts: CountManager, refmgr, uniq_scaling_factor, ambig_scaling_factor, gene_group_db=False): if "scaled" in self.publish_reports: logger.info("SCALING_FACTORS %s %s", uniq_scaling_factor, ambig_scaling_factor) with gzip.open(f"{self.out_prefix}.gene_counts.txt.gz", "wt") as gene_out: @@ -184,5 +184,12 @@ def write_gene_counts(self, gene_counts: CountManager, refmgr, uniq_scaling_fact ambig_scaling_factor=ambig_scaling_factor, ) ref = refmgr.get(rid[0] if isinstance(rid, tuple) else rid)[0] - CountWriter.write_row(ref, out_row, stream=gene_out,) + + if gene_group_db: + ref_tokens = ref.split(".") + gene_id, _ = ".".join(ref_tokens[:-1]), ref_tokens[-1] + else: + gene_id = ref + + CountWriter.write_row(gene_id, out_row, stream=gene_out,) diff --git a/gffquant/profilers/feature_quantifier.py b/gffquant/profilers/feature_quantifier.py index 358d3fd8..7ecdd55c 100644 --- a/gffquant/profilers/feature_quantifier.py +++ b/gffquant/profilers/feature_quantifier.py @@ -190,7 +190,8 @@ def process_counters( self.count_manager, self.reference_manager, count_annotator.scaling_factors["total_gene_uniq"], - count_annotator.scaling_factors["total_gene_ambi"] + count_annotator.scaling_factors["total_gene_ambi"], + gene_group_db=gene_group_db, ) self.adm.clear_caches() From cb9ea29420fde0d7e879c373ec7e871ace873596 Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Sat, 21 Dec 2024 20:48:16 +0100 Subject: [PATCH 024/128] dump seqcounters for debugging --- gffquant/counters/alignment_counter2.py | 8 ++++++++ gffquant/counters/count_manager.py | 5 +++++ gffquant/profilers/feature_quantifier.py | 5 +++++ 3 files changed, 18 insertions(+) diff --git a/gffquant/counters/alignment_counter2.py b/gffquant/counters/alignment_counter2.py index 8516006c..d5c8ad3d 100644 --- a/gffquant/counters/alignment_counter2.py +++ b/gffquant/counters/alignment_counter2.py @@ -36,6 +36,14 @@ def __init__(self, distribution_mode=DistributionMode.ONE_OVER_N, strand_specifi dtype='float64', ) def dump(self, prefix, refmgr): + import gzip + with gzip.open(f"{prefix}.{self.__class__.__name__}.txt.gz", "wt") as _out: + for key in self: + ref, reflen = refmgr.get(key[0] if isinstance(key, tuple) else key) + print(key, ref, reflen, self.counts[key], sep="\t", file=_out) + # for k, v in self.items(): + # ref, reflen = refmgr.get(k[0] if isinstance(k, tuple) else k) + # print(k, ref, reflen, v, sep="\t", file=_out) ... def get(self, key, default_val): key_index = self.index.get(key) diff --git a/gffquant/counters/count_manager.py b/gffquant/counters/count_manager.py index 97bb2cc0..8f8517f8 100644 --- a/gffquant/counters/count_manager.py +++ b/gffquant/counters/count_manager.py @@ -216,3 +216,8 @@ def transform_counts(self, refmgr): if self.regioncounts is not None: return self.regioncounts.transform(refmgr) + def dump(self, prefix, refmgr): + if self.seqcounts is not None: + self.seqcounts.dump(prefix, refmgr) + if self.regioncounts is not None: + self.regioncounts.dump(prefix, refmgr) \ No newline at end of file diff --git a/gffquant/profilers/feature_quantifier.py b/gffquant/profilers/feature_quantifier.py index 7ecdd55c..fa1ce0d9 100644 --- a/gffquant/profilers/feature_quantifier.py +++ b/gffquant/profilers/feature_quantifier.py @@ -164,8 +164,13 @@ def process_counters( Annotator = (GeneCountAnnotator, RegionCountAnnotator)[self.run_mode.overlap_required] count_annotator = Annotator(self.strand_specific, report_scaling_factors=report_scaling_factors) + + self.count_manager.dump("pre_annotate", self.reference_manager) + count_annotator.annotate(self.reference_manager, self.adm, self.count_manager, gene_group_db=gene_group_db,) + self.count_manager.dump("post_annotate", self.reference_manager) + count_writer = CountWriter( self.out_prefix, has_ambig_counts=self.count_manager.has_ambig_counts(), From 6ae5c5a8aec81d6bef12b7737728c4ebf8f0ef80 Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Sat, 21 Dec 2024 21:03:39 +0100 Subject: [PATCH 025/128] dump seqcounters for debugging --- gffquant/counters/alignment_counter2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gffquant/counters/alignment_counter2.py b/gffquant/counters/alignment_counter2.py index d5c8ad3d..34db48a1 100644 --- a/gffquant/counters/alignment_counter2.py +++ b/gffquant/counters/alignment_counter2.py @@ -38,7 +38,7 @@ def __init__(self, distribution_mode=DistributionMode.ONE_OVER_N, strand_specifi def dump(self, prefix, refmgr): import gzip with gzip.open(f"{prefix}.{self.__class__.__name__}.txt.gz", "wt") as _out: - for key in self: + for key in self.index.keys(): ref, reflen = refmgr.get(key[0] if isinstance(key, tuple) else key) print(key, ref, reflen, self.counts[key], sep="\t", file=_out) # for k, v in self.items(): From fbb6a0edb6b6a3e928811d36b9c3e450e18032e2 Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Sat, 21 Dec 2024 21:21:53 +0100 Subject: [PATCH 026/128] dump seqcounters for debugging --- gffquant/counters/alignment_counter2.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gffquant/counters/alignment_counter2.py b/gffquant/counters/alignment_counter2.py index 34db48a1..679790f2 100644 --- a/gffquant/counters/alignment_counter2.py +++ b/gffquant/counters/alignment_counter2.py @@ -38,9 +38,9 @@ def __init__(self, distribution_mode=DistributionMode.ONE_OVER_N, strand_specifi def dump(self, prefix, refmgr): import gzip with gzip.open(f"{prefix}.{self.__class__.__name__}.txt.gz", "wt") as _out: - for key in self.index.keys(): + for key, key_index in self.index.items(): ref, reflen = refmgr.get(key[0] if isinstance(key, tuple) else key) - print(key, ref, reflen, self.counts[key], sep="\t", file=_out) + print(key, ref, reflen, self.counts[key_index], sep="\t", file=_out) # for k, v in self.items(): # ref, reflen = refmgr.get(k[0] if isinstance(k, tuple) else k) # print(k, ref, reflen, v, sep="\t", file=_out) From a70f1a8ec19fc1b7ee1962d426f9d16d542dbf63 Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Sat, 21 Dec 2024 21:45:37 +0100 Subject: [PATCH 027/128] changed strand specific order --- gffquant/counters/alignment_counter2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gffquant/counters/alignment_counter2.py b/gffquant/counters/alignment_counter2.py index 679790f2..3650dc18 100644 --- a/gffquant/counters/alignment_counter2.py +++ b/gffquant/counters/alignment_counter2.py @@ -83,8 +83,8 @@ def update_counts(self, count_stream, increment=1, ambiguous_counts=False): )[aln_count == 1] key = ( ( - (hit.rid, hit.rev_strand), hit.rid + (hit.rid, hit.rev_strand), ) )[self.strand_specific] From bb13e149a9c2da412a50196ebb8155e49fd362cc Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Sat, 21 Dec 2024 21:57:59 +0100 Subject: [PATCH 028/128] debug log --- gffquant/counters/alignment_counter2.py | 1 + 1 file changed, 1 insertion(+) diff --git a/gffquant/counters/alignment_counter2.py b/gffquant/counters/alignment_counter2.py index 3650dc18..48c5ece0 100644 --- a/gffquant/counters/alignment_counter2.py +++ b/gffquant/counters/alignment_counter2.py @@ -74,6 +74,7 @@ def __setitem__(self, key, value): def update_counts(self, count_stream, increment=1, ambiguous_counts=False): contributed_counts = 0 for hits, aln_count in count_stream: + logger.info("update_counts:: HITS: %s", hits) hit = hits[0] inc = ( ( From 4579affeb5f16d48022c3cbea57080d415d0355e Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Sat, 21 Dec 2024 22:11:14 +0100 Subject: [PATCH 029/128] debug log --- gffquant/counters/alignment_counter2.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/gffquant/counters/alignment_counter2.py b/gffquant/counters/alignment_counter2.py index 48c5ece0..0a417edb 100644 --- a/gffquant/counters/alignment_counter2.py +++ b/gffquant/counters/alignment_counter2.py @@ -74,8 +74,10 @@ def __setitem__(self, key, value): def update_counts(self, count_stream, increment=1, ambiguous_counts=False): contributed_counts = 0 for hits, aln_count in count_stream: + # [2024-12-21 22:05:40,032] update_counts:: HITS: [258011 None None False None None None None True 2 1] logger.info("update_counts:: HITS: %s", hits) hit = hits[0] + logger.info("update_counts:: HIT %s (%s)", hit, type(hit)) inc = ( ( self.get_increment(aln_count, increment), From a5f7fb1eef16e9cd985c112fb6fccdafb9d7409f Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Sat, 21 Dec 2024 22:19:42 +0100 Subject: [PATCH 030/128] debug log --- gffquant/counters/alignment_counter2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gffquant/counters/alignment_counter2.py b/gffquant/counters/alignment_counter2.py index 0a417edb..1aa3fde4 100644 --- a/gffquant/counters/alignment_counter2.py +++ b/gffquant/counters/alignment_counter2.py @@ -86,7 +86,7 @@ def update_counts(self, count_stream, increment=1, ambiguous_counts=False): )[aln_count == 1] key = ( ( - hit.rid + hit.rid, (hit.rid, hit.rev_strand), ) )[self.strand_specific] From 68fa1942b5c814066f58327de6e4949094a83a52 Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Sat, 21 Dec 2024 22:56:51 +0100 Subject: [PATCH 031/128] fixed gene writing? --- gffquant/annotation/count_writer.py | 38 ++++++++++++++++++++----- gffquant/counters/alignment_counter2.py | 3 -- 2 files changed, 31 insertions(+), 10 deletions(-) diff --git a/gffquant/annotation/count_writer.py b/gffquant/annotation/count_writer.py index 2b6168aa..06952cf7 100644 --- a/gffquant/annotation/count_writer.py +++ b/gffquant/annotation/count_writer.py @@ -176,20 +176,44 @@ def write_gene_counts(self, gene_counts: CountManager, refmgr, uniq_scaling_fact # ambig_scaling_factor=ambig_scaling_factor # ) # CountWriter.write_row(gene, out_row, stream=gene_out) - for rid in gene_counts.get_all_regions(): - counts = gene_counts.get_counts(rid) - out_row = self.compile_output_row( - counts, - scaling_factor=uniq_scaling_factor, - ambig_scaling_factor=ambig_scaling_factor, + ref_stream = ( + ( + refmgr.get(rid[0] if isinstance(rid, tuple) else rid)[0], + rid, ) - ref = refmgr.get(rid[0] if isinstance(rid, tuple) else rid)[0] + for rid in gene_counts.get_all_regions() + ) + for ref, rid in sorted(ref_stream): + counts = gene_counts.get_counts(rid) if gene_group_db: ref_tokens = ref.split(".") gene_id, _ = ".".join(ref_tokens[:-1]), ref_tokens[-1] else: gene_id = ref + out_row = self.compile_output_row( + counts, + scaling_factor=uniq_scaling_factor, + ambig_scaling_factor=ambig_scaling_factor, + ) + CountWriter.write_row(gene_id, out_row, stream=gene_out,) + # for rid in gene_counts.get_all_regions(): + # counts = gene_counts.get_counts(rid) + # out_row = self.compile_output_row( + # counts, + # scaling_factor=uniq_scaling_factor, + # ambig_scaling_factor=ambig_scaling_factor, + # ) + # ref = refmgr.get(rid[0] if isinstance(rid, tuple) else rid)[0] + + # if gene_group_db: + # ref_tokens = ref.split(".") + # gene_id, _ = ".".join(ref_tokens[:-1]), ref_tokens[-1] + # else: + # gene_id = ref + + # CountWriter.write_row(gene_id, out_row, stream=gene_out,) + diff --git a/gffquant/counters/alignment_counter2.py b/gffquant/counters/alignment_counter2.py index 1aa3fde4..545deafd 100644 --- a/gffquant/counters/alignment_counter2.py +++ b/gffquant/counters/alignment_counter2.py @@ -74,10 +74,7 @@ def __setitem__(self, key, value): def update_counts(self, count_stream, increment=1, ambiguous_counts=False): contributed_counts = 0 for hits, aln_count in count_stream: - # [2024-12-21 22:05:40,032] update_counts:: HITS: [258011 None None False None None None None True 2 1] - logger.info("update_counts:: HITS: %s", hits) hit = hits[0] - logger.info("update_counts:: HIT %s (%s)", hit, type(hit)) inc = ( ( self.get_increment(aln_count, increment), From 313a0615cced87eed056f45ace560c81943aa865 Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Sat, 21 Dec 2024 22:57:38 +0100 Subject: [PATCH 032/128] fixed gene writing? --- gffquant/profilers/feature_quantifier.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gffquant/profilers/feature_quantifier.py b/gffquant/profilers/feature_quantifier.py index fa1ce0d9..e685f955 100644 --- a/gffquant/profilers/feature_quantifier.py +++ b/gffquant/profilers/feature_quantifier.py @@ -165,11 +165,11 @@ def process_counters( Annotator = (GeneCountAnnotator, RegionCountAnnotator)[self.run_mode.overlap_required] count_annotator = Annotator(self.strand_specific, report_scaling_factors=report_scaling_factors) - self.count_manager.dump("pre_annotate", self.reference_manager) + # self.count_manager.dump("pre_annotate", self.reference_manager) count_annotator.annotate(self.reference_manager, self.adm, self.count_manager, gene_group_db=gene_group_db,) - self.count_manager.dump("post_annotate", self.reference_manager) + # self.count_manager.dump("post_annotate", self.reference_manager) count_writer = CountWriter( self.out_prefix, From e917fe2c93d0a0ce87f00eea33d7fa585e8898fd Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Sat, 21 Dec 2024 23:55:51 +0100 Subject: [PATCH 033/128] starting to replace CountManager --- gffquant/alignment/__init__.py | 1 + gffquant/alignment/reference_hit.py | 34 ++++++++++ gffquant/annotation/count_annotator.py | 12 ++-- gffquant/annotation/count_writer.py | 4 +- gffquant/annotation/genecount_annotator.py | 10 +-- gffquant/counters/alignment_counter2.py | 62 +++++++++++++++++- gffquant/profilers/feature_quantifier.py | 74 +++++++--------------- 7 files changed, 134 insertions(+), 63 deletions(-) create mode 100644 gffquant/alignment/reference_hit.py diff --git a/gffquant/alignment/__init__.py b/gffquant/alignment/__init__.py index 9c07f61f..77c59f0a 100644 --- a/gffquant/alignment/__init__.py +++ b/gffquant/alignment/__init__.py @@ -6,6 +6,7 @@ from .aln_group import AlignmentGroup from .pysam_alignment_processor import AlignmentProcessor +from .reference_hit import ReferenceHit from .samflags import SamFlags from .cigarops import CigarOps diff --git a/gffquant/alignment/reference_hit.py b/gffquant/alignment/reference_hit.py new file mode 100644 index 00000000..29c5a0f6 --- /dev/null +++ b/gffquant/alignment/reference_hit.py @@ -0,0 +1,34 @@ +from dataclasses import dataclass, asdict + + +@dataclass(slots=True) +class ReferenceHit: + rid: int = None + start: int = None + end: int = None + rev_strand: bool = None + cov_start: int = None + cov_end: int = None + has_annotation: bool = None + n_aln: int = None + is_ambiguous: bool = None + library_mod: int = None + mate_id: int = None + + def __hash__(self): + return hash(tuple(asdict(self).values())) + + def __eq__(self, other): + return all( + item[0][1] == item[1][1] + for item in zip( + sorted(asdict(self).items()), + sorted(asdict(other).items()) + ) + ) + + def __str__(self): + return "\t".join(map(str, asdict(self).values())) + + def __repr__(self): + return str(self) diff --git a/gffquant/annotation/count_annotator.py b/gffquant/annotation/count_annotator.py index b3f74d18..8352300e 100644 --- a/gffquant/annotation/count_annotator.py +++ b/gffquant/annotation/count_annotator.py @@ -9,6 +9,8 @@ import numpy as np from ..counters.count_manager import CountManager +from ..counters.alignment_counter2 import AlignmentCounter + logger = logging.getLogger(__name__) @@ -277,23 +279,23 @@ class GeneCountAnnotator(CountAnnotator): def __init__(self, strand_specific, report_scaling_factors=True): CountAnnotator.__init__(self, strand_specific, report_scaling_factors=report_scaling_factors) - def annotate(self, refmgr, db, count_manager: CountManager, gene_group_db=False): + def annotate(self, refmgr, db, counter: AlignmentCounter, gene_group_db=False): """ Annotate a set of gene counts via db-iteration. input: - bam: bamr.BamFile to use as reverse lookup table for reference ids - db: GffDatabaseManager holding functional annotation database - - count_manager: count_data + - counter: count_data """ strand_specific_counts = ( - (count_manager.PLUS_STRAND, count_manager.MINUS_STRAND) + (counter.PLUS_STRAND, counter.MINUS_STRAND) if self.strand_specific else None ) - for rid in count_manager.get_all_regions(): + for rid in counter.get_all_regions(): ref, region_length = refmgr.get(rid[0] if isinstance(rid, tuple) else rid) - uniq_counts, ambig_counts = count_manager.get_counts( + uniq_counts, ambig_counts = counter.get_counts( rid, region_counts=False, strand_specific=self.strand_specific ) diff --git a/gffquant/annotation/count_writer.py b/gffquant/annotation/count_writer.py index 06952cf7..fcff0f2e 100644 --- a/gffquant/annotation/count_writer.py +++ b/gffquant/annotation/count_writer.py @@ -8,7 +8,7 @@ import numpy as np -from ..counters import CountManager +from ..counters import CountManager, AlignmentCounter logger = logging.getLogger(__name__) @@ -163,7 +163,7 @@ def write_feature_counts(self, db, featcounts, unannotated_reads=None, report_un ) CountWriter.write_row(feature.name, out_row, stream=feat_out) - def write_gene_counts(self, gene_counts: CountManager, refmgr, uniq_scaling_factor, ambig_scaling_factor, gene_group_db=False): + def write_gene_counts(self, gene_counts: AlignmentCounter, refmgr, uniq_scaling_factor, ambig_scaling_factor, gene_group_db=False): if "scaled" in self.publish_reports: logger.info("SCALING_FACTORS %s %s", uniq_scaling_factor, ambig_scaling_factor) with gzip.open(f"{self.out_prefix}.gene_counts.txt.gz", "wt") as gene_out: diff --git a/gffquant/annotation/genecount_annotator.py b/gffquant/annotation/genecount_annotator.py index ed161d40..9ef4455e 100644 --- a/gffquant/annotation/genecount_annotator.py +++ b/gffquant/annotation/genecount_annotator.py @@ -3,7 +3,7 @@ import numpy as np from .count_annotator import CountAnnotator -from ..counters import CountManager +from ..counters import CountManager, AlignmentCounter logger = logging.getLogger(__name__) @@ -15,13 +15,13 @@ class GeneCountAnnotator(CountAnnotator): def __init__(self, strand_specific, report_scaling_factors=True): CountAnnotator.__init__(self, strand_specific, report_scaling_factors=report_scaling_factors) - def annotate(self, refmgr, db, count_manager: CountManager, gene_group_db=False): - self.total_gene_counts = count_manager.transform_counts(refmgr) + def annotate(self, refmgr, db, counter: AlignmentCounter, gene_group_db=False): + self.total_gene_counts = counter.transform(refmgr) # count_manager.transform_counts(refmgr) logger.info("TOTAL_GENE_COUNTS = %s", self.total_gene_counts) # self.total_counts = self.total_gene_counts # ? - for rid in count_manager.get_all_regions(): - counts = count_manager.get_counts(rid) + for rid in counter.get_all_regions(): + counts = counter.get_counts(rid) ref, _ = refmgr.get(rid[0] if isinstance(rid, tuple) else rid) if gene_group_db: diff --git a/gffquant/counters/alignment_counter2.py b/gffquant/counters/alignment_counter2.py index 545deafd..2e7bc3b8 100644 --- a/gffquant/counters/alignment_counter2.py +++ b/gffquant/counters/alignment_counter2.py @@ -13,6 +13,7 @@ class AlignmentCounter: COUNT_HEADER_ELEMENTS = ("raw", "lnorm", "scaled") INITIAL_SIZE = 1000 + PLUS_STRAND, MINUS_STRAND = False, True @staticmethod def normalise_counts(counts, feature_len, scaling_factor): @@ -24,10 +25,39 @@ def normalise_counts(counts, feature_len, scaling_factor): def get_increment(self, n_aln, increment): # 1overN = lavern. Maya <3 return (increment / n_aln) if self.distribution_mode == DistributionMode.ONE_OVER_N else increment + + def toggle_single_read_handling(self, unmarked_orphans): + # precalculate count-increment for single-end, paired-end reads + # for mixed input (i.e., paired-end data with single-end reads = orphans from preprocessing), + # properly attribute fractional counts to the orphans + # Increments: + # alignment from single end library read: 1 + # alignment from paired-end library read: 0.5 / mate (pe_count = 1) or 1 / mate (pe_count = 2) + # alignment from paired-end library orphan: 0.5 (pe_count = 1) or 1 (pe_count = 2) + + # old code: + # increment = 1 if (not pair or self.paired_end_count == 2) else 0.5 + + # if pair: + # increment = 1 if self.paired_end_count == 2 else 0.5 + # else: + # increment = 0.5 if self.unmarked_orphans else 1 + self.increments = ( + (self.paired_end_count / 2.0) if unmarked_orphans else 1.0, + self.paired_end_count / 2.0, + ) - def __init__(self, distribution_mode=DistributionMode.ONE_OVER_N, strand_specific=False): + def __init__( + self, + distribution_mode=DistributionMode.ONE_OVER_N, + strand_specific=False, + paired_end_count=1, + ): self.distribution_mode = distribution_mode self.strand_specific = strand_specific + self.paired_end_count = paired_end_count + self.increments = (1.0, 1.0,) + self.increments_auto_detect = (1.0, self.paired_end_count / 2.0,) self.unannotated_reads = 0 self.index = {} @@ -70,6 +100,36 @@ def __setitem__(self, key, value): self.counts[key_index] = value else: raise KeyError(f"{key=} not found.") + + def update(self, count_stream, ambiguous_counts=False, pair=False, pe_library=None,): + if pe_library is not None: + # this is the case when the alignment has a read group tag + # if pe_library is True (RG tag '2') -> take paired-end increment (also for orphans) + # else (RG tag '1') -> take single-end increment + increment = self.increments_auto_detect[pe_library] + else: + # if the alignment has no (appropriate) read group tag + # use the paired-end information instead + # if orphan reads are present in the input sam/bam, + # the flag `--unmarked_orphans` should be set + # otherwise orphan reads will be assigned a count of 1. + increment = self.increments[pair] + + contributed_counts = self.update_counts(count_stream, increment=increment, ambiguous_counts=ambiguous_counts,) + + return contributed_counts + + def get_unannotated_reads(self): + return self.unannotated_reads + + def get_counts(self, seqid, strand_specific=False): + if strand_specific: + raise NotImplementedError() + counts = self[seqid] + return np.array((counts[0], counts[2], counts[1], counts[3])) + + def get_all_regions(self): + yield from self def update_counts(self, count_stream, increment=1, ambiguous_counts=False): contributed_counts = 0 diff --git a/gffquant/profilers/feature_quantifier.py b/gffquant/profilers/feature_quantifier.py index e685f955..2af697f4 100644 --- a/gffquant/profilers/feature_quantifier.py +++ b/gffquant/profilers/feature_quantifier.py @@ -13,9 +13,9 @@ from dataclasses import dataclass, asdict from .panda_coverage_profiler import PandaCoverageProfiler -from ..alignment import AlignmentGroup, AlignmentProcessor, SamFlags +from ..alignment import AlignmentGroup, AlignmentProcessor, ReferenceHit, SamFlags from ..annotation import GeneCountAnnotator, RegionCountAnnotator, CountWriter -from ..counters import CountManager +from ..counters import CountManager, AlignmentCounter from ..db.annotation_db import AnnotationDatabaseManager from .. import __tool__, DistributionMode, RunMode @@ -24,39 +24,6 @@ logger = logging.getLogger(__name__) -@dataclass(slots=True) -class ReferenceHit: - rid: int = None - start: int = None - end: int = None - rev_strand: bool = None - cov_start: int = None - cov_end: int = None - has_annotation: bool = None - n_aln: int = None - is_ambiguous: bool = None - library_mod: int = None - mate_id: int = None - - def __hash__(self): - return hash(tuple(asdict(self).values())) - - def __eq__(self, other): - return all( - item[0][1] == item[1][1] - for item in zip( - sorted(asdict(self).items()), - sorted(asdict(other).items()) - ) - ) - - def __str__(self): - return "\t".join(map(str, asdict(self).values())) - - def __repr__(self): - return str(self) - - class FeatureQuantifier(ABC): """ Three groups of alignments: @@ -93,10 +60,15 @@ def __init__( self.db = db self.adm = None self.run_mode = run_mode - self.count_manager = CountManager( + # self.count_manager = CountManager( + # distribution_mode=distribution_mode, + # region_counts=run_mode.overlap_required, + # strand_specific=strand_specific and not run_mode.overlap_required, + # paired_end_count=paired_end_count, + # ) + self.counter = AlignmentCounter( distribution_mode=distribution_mode, - region_counts=run_mode.overlap_required, - strand_specific=strand_specific and not run_mode.overlap_required, + strand_specific=strand_specific, paired_end_count=paired_end_count, ) self.out_prefix = out_prefix @@ -158,22 +130,20 @@ def process_counters( self.adm = AnnotationDatabaseManager.from_db(self.db, in_memory=in_memory) if dump_counters: - self.count_manager.dump_raw_counters(self.out_prefix, self.reference_manager) + # self.count_manager.dump_raw_counters(self.out_prefix, self.reference_manager) + self.counter.dump(self.out_prefix, self.reference_manager,) report_scaling_factors = restrict_reports is None or "scaled" in restrict_reports Annotator = (GeneCountAnnotator, RegionCountAnnotator)[self.run_mode.overlap_required] count_annotator = Annotator(self.strand_specific, report_scaling_factors=report_scaling_factors) - # self.count_manager.dump("pre_annotate", self.reference_manager) - - count_annotator.annotate(self.reference_manager, self.adm, self.count_manager, gene_group_db=gene_group_db,) - - # self.count_manager.dump("post_annotate", self.reference_manager) - + count_annotator.annotate(self.reference_manager, self.adm, self.counter, gene_group_db=gene_group_db,) + count_writer = CountWriter( self.out_prefix, - has_ambig_counts=self.count_manager.has_ambig_counts(), + # has_ambig_counts=self.count_manager.has_ambig_counts(), + has_ambig_counts=self.counter.has_ambig_counts(), strand_specific=self.strand_specific, restrict_reports=restrict_reports, report_category=report_category, @@ -181,7 +151,8 @@ def process_counters( filtered_readcount=self.aln_counter["filtered_read_count"], ) - unannotated_reads = self.count_manager.get_unannotated_reads() + # unannotated_reads = self.count_manager.get_unannotated_reads() + unannotated_reads = self.counter.get_unannotated_reads() unannotated_reads += self.aln_counter["unannotated_ambig"] count_writer.write_feature_counts( @@ -192,7 +163,8 @@ def process_counters( count_writer.write_gene_counts( # count_annotator.gene_counts, - self.count_manager, + # self.count_manager, + self.counter, self.reference_manager, count_annotator.scaling_factors["total_gene_uniq"], count_annotator.scaling_factors["total_gene_ambi"], @@ -234,7 +206,8 @@ def process_alignments( filtered_sam=debug_samfile, ) - self.count_manager.toggle_single_read_handling(unmarked_orphans) + # self.count_manager.toggle_single_read_handling(unmarked_orphans) + self.counter.toggle_single_read_handling(unmarked_orphans) ac = self.aln_counter read_count = 0 @@ -455,7 +428,8 @@ def process_alignment_group(self, aln_group, aln_reader): ) ) - contributed_counts = self.count_manager.update_counts( + # contributed_counts = self.count_manager.update_counts( + contributed_counts = self.counter.update( count_stream, ambiguous_counts=is_ambiguous_group, pair=aln_group.is_paired(), From 37b624e300d12df3bef2a7a91a416cec7e95db2d Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Sun, 22 Dec 2024 15:59:11 +0100 Subject: [PATCH 034/128] pleasing linters --- gffquant/alignment/reference_hit.py | 4 + gffquant/annotation/count_annotator.py | 62 +--------------- gffquant/annotation/count_writer.py | 12 ++- gffquant/annotation/genecount_annotator.py | 86 +++++++++++----------- gffquant/counters/alignment_counter2.py | 51 +++++++++---- gffquant/counters/count_manager.py | 7 +- gffquant/counters/region_counter.py | 15 ++++ gffquant/counters/seq_counter.py | 4 +- gffquant/profilers/feature_quantifier.py | 5 +- gffquant/profilers/panda_profiler.py | 3 - 10 files changed, 118 insertions(+), 131 deletions(-) diff --git a/gffquant/alignment/reference_hit.py b/gffquant/alignment/reference_hit.py index 29c5a0f6..968da602 100644 --- a/gffquant/alignment/reference_hit.py +++ b/gffquant/alignment/reference_hit.py @@ -1,3 +1,7 @@ +# pylint: disable=R0902 + +""" module docstring """ + from dataclasses import dataclass, asdict diff --git a/gffquant/annotation/count_annotator.py b/gffquant/annotation/count_annotator.py index 8352300e..cc146006 100644 --- a/gffquant/annotation/count_annotator.py +++ b/gffquant/annotation/count_annotator.py @@ -9,7 +9,6 @@ import numpy as np from ..counters.count_manager import CountManager -from ..counters.alignment_counter2 import AlignmentCounter logger = logging.getLogger(__name__) @@ -143,7 +142,7 @@ def calc_scaling_factor(raw, normed, default=0): total_ambi, total_ambi_normed, default_scaling_factor ) ) - + if self.report_scaling_factors: logger.info( "Calculating scaling factors for category=%s: uraw=%s unorm=%s araw=%s anorm=%s -> factors=%s", @@ -271,62 +270,3 @@ def annotate(self, refmgr, db, count_manager: CountManager, gene_group_db=False) self.total_gene_counts += counts[:4] self.calculate_scaling_factors() - - -class GeneCountAnnotator(CountAnnotator): - """ CountAnnotator subclass for gene-based counting. """ - - def __init__(self, strand_specific, report_scaling_factors=True): - CountAnnotator.__init__(self, strand_specific, report_scaling_factors=report_scaling_factors) - - def annotate(self, refmgr, db, counter: AlignmentCounter, gene_group_db=False): - """ - Annotate a set of gene counts via db-iteration. - input: - - bam: bamr.BamFile to use as reverse lookup table for reference ids - - db: GffDatabaseManager holding functional annotation database - - counter: count_data - """ - strand_specific_counts = ( - (counter.PLUS_STRAND, counter.MINUS_STRAND) - if self.strand_specific else None - ) - - for rid in counter.get_all_regions(): - ref, region_length = refmgr.get(rid[0] if isinstance(rid, tuple) else rid) - - uniq_counts, ambig_counts = counter.get_counts( - rid, region_counts=False, strand_specific=self.strand_specific - ) - - counts = self.compute_count_vector( - uniq_counts, - ambig_counts, - region_length, - strand_specific_counts=strand_specific_counts, - ) - - if gene_group_db: - ref_tokens = ref.split(".") - gene_id, ggroup_id = ".".join(ref_tokens[:-1]), ref_tokens[-1] - else: - ggroup_id, gene_id = ref, ref - - gcounts = self.gene_counts.setdefault(gene_id, np.zeros(self.bins)) - gcounts += counts - self.total_gene_counts += counts[:4] - - region_annotation = db.query_sequence(ggroup_id) - if region_annotation is not None: - _, _, region_annotation = region_annotation - logger.info( - "GCAnnotator: Distributing counts of Gene %s (group=%s) %s %s", - gene_id, ggroup_id, counts[0], counts[2], - ) - self.distribute_feature_counts(counts, region_annotation) - - else: - logger.info("GCAnnotator: Gene %s (group=%s) has no information in database.", gene_id, ggroup_id) - self.unannotated_counts += counts[:4] - - self.calculate_scaling_factors() diff --git a/gffquant/annotation/count_writer.py b/gffquant/annotation/count_writer.py index fcff0f2e..e5959598 100644 --- a/gffquant/annotation/count_writer.py +++ b/gffquant/annotation/count_writer.py @@ -8,7 +8,7 @@ import numpy as np -from ..counters import CountManager, AlignmentCounter +from ..counters import AlignmentCounter logger = logging.getLogger(__name__) @@ -163,7 +163,14 @@ def write_feature_counts(self, db, featcounts, unannotated_reads=None, report_un ) CountWriter.write_row(feature.name, out_row, stream=feat_out) - def write_gene_counts(self, gene_counts: AlignmentCounter, refmgr, uniq_scaling_factor, ambig_scaling_factor, gene_group_db=False): + def write_gene_counts( + self, + gene_counts: AlignmentCounter, + refmgr, + uniq_scaling_factor, + ambig_scaling_factor, + gene_group_db=False + ): if "scaled" in self.publish_reports: logger.info("SCALING_FACTORS %s %s", uniq_scaling_factor, ambig_scaling_factor) with gzip.open(f"{self.out_prefix}.gene_counts.txt.gz", "wt") as gene_out: @@ -216,4 +223,3 @@ def write_gene_counts(self, gene_counts: AlignmentCounter, refmgr, uniq_scaling_ # gene_id = ref # CountWriter.write_row(gene_id, out_row, stream=gene_out,) - diff --git a/gffquant/annotation/genecount_annotator.py b/gffquant/annotation/genecount_annotator.py index 9ef4455e..bbd7581d 100644 --- a/gffquant/annotation/genecount_annotator.py +++ b/gffquant/annotation/genecount_annotator.py @@ -1,49 +1,53 @@ -import logging - -import numpy as np +""" module docstring """ +import logging from .count_annotator import CountAnnotator -from ..counters import CountManager, AlignmentCounter +from ..counters import AlignmentCounter logger = logging.getLogger(__name__) class GeneCountAnnotator(CountAnnotator): - """ CountAnnotator subclass for gene-based counting. """ - - def __init__(self, strand_specific, report_scaling_factors=True): - CountAnnotator.__init__(self, strand_specific, report_scaling_factors=report_scaling_factors) - - def annotate(self, refmgr, db, counter: AlignmentCounter, gene_group_db=False): - self.total_gene_counts = counter.transform(refmgr) # count_manager.transform_counts(refmgr) - logger.info("TOTAL_GENE_COUNTS = %s", self.total_gene_counts) - # self.total_counts = self.total_gene_counts # ? - - for rid in counter.get_all_regions(): - counts = counter.get_counts(rid) - ref, _ = refmgr.get(rid[0] if isinstance(rid, tuple) else rid) - - if gene_group_db: - ref_tokens = ref.split(".") - gene_id, ggroup_id = ".".join(ref_tokens[:-1]), ref_tokens[-1] - else: - gene_id, ggroup_id = ref, ref - - region_annotation = db.query_sequence(ggroup_id) - if region_annotation is not None: - _, _, region_annotation = region_annotation - logger.info( - "GCAnnotator: Distributing counts of Gene %s (group=%s) %s %s", - gene_id, ggroup_id, counts[0], counts[2], - ) - self.distribute_feature_counts(counts, region_annotation) - - else: - logger.info("GCAnnotator: Gene %s (group=%s) has no information in database.", gene_id, ggroup_id) - self.unannotated_counts += counts[:4] - - self.calculate_scaling_factors() - - - + """ CountAnnotator subclass for gene-based counting. """ + + def __init__(self, strand_specific, report_scaling_factors=True): + """ __init__() """ + CountAnnotator.__init__(self, strand_specific, report_scaling_factors=report_scaling_factors) + + def annotate(self, refmgr, db, counter: AlignmentCounter, gene_group_db=False): + """ Annotate a set of gene counts with functional annotations. """ + self.total_gene_counts = counter.transform(refmgr) # count_manager.transform_counts(refmgr) + logger.info("TOTAL_GENE_COUNTS = %s", self.total_gene_counts) + # self.total_counts = self.total_gene_counts # ? + + # formerly used in compute_count_vector + strand_specific_counts = ( + (counter.PLUS_STRAND, counter.MINUS_STRAND) + if self.strand_specific else None + ) + + for rid in counter.get_all_regions(): + counts = counter.get_counts(rid, strand_specific=self.strand_specific) + ref, _ = refmgr.get(rid[0] if isinstance(rid, tuple) else rid) + + if gene_group_db: + ref_tokens = ref.split(".") + gene_id, ggroup_id = ".".join(ref_tokens[:-1]), ref_tokens[-1] + else: + gene_id, ggroup_id = ref, ref + + region_annotation = db.query_sequence(ggroup_id) + if region_annotation is not None: + _, _, region_annotation = region_annotation + logger.info( + "GCAnnotator: Distributing counts of Gene %s (group=%s) %s %s", + gene_id, ggroup_id, counts[0], counts[2], + ) + self.distribute_feature_counts(counts, region_annotation) + + else: + logger.info("GCAnnotator: Gene %s (group=%s) has no information in database.", gene_id, ggroup_id) + self.unannotated_counts += counts[:4] + + self.calculate_scaling_factors() diff --git a/gffquant/counters/alignment_counter2.py b/gffquant/counters/alignment_counter2.py index 2e7bc3b8..a0c84716 100644 --- a/gffquant/counters/alignment_counter2.py +++ b/gffquant/counters/alignment_counter2.py @@ -1,3 +1,8 @@ +# pylint: disable=R0902 + +""" module docstring """ + +import gzip import logging from collections import Counter @@ -13,6 +18,9 @@ class AlignmentCounter: COUNT_HEADER_ELEMENTS = ("raw", "lnorm", "scaled") INITIAL_SIZE = 1000 + # this may be counter-intuitive + # but originates from the samflags 0x10, 0x20, + # which explicitly identify the reverse-strandness of the read PLUS_STRAND, MINUS_STRAND = False, True @staticmethod @@ -25,7 +33,7 @@ def normalise_counts(counts, feature_len, scaling_factor): def get_increment(self, n_aln, increment): # 1overN = lavern. Maya <3 return (increment / n_aln) if self.distribution_mode == DistributionMode.ONE_OVER_N else increment - + def toggle_single_read_handling(self, unmarked_orphans): # precalculate count-increment for single-end, paired-end reads # for mixed input (i.e., paired-end data with single-end reads = orphans from preprocessing), @@ -65,8 +73,8 @@ def __init__( (AlignmentCounter.INITIAL_SIZE, 2,), dtype='float64', ) + def dump(self, prefix, refmgr): - import gzip with gzip.open(f"{prefix}.{self.__class__.__name__}.txt.gz", "wt") as _out: for key, key_index in self.index.items(): ref, reflen = refmgr.get(key[0] if isinstance(key, tuple) else key) @@ -74,33 +82,35 @@ def dump(self, prefix, refmgr): # for k, v in self.items(): # ref, reflen = refmgr.get(k[0] if isinstance(k, tuple) else k) # print(k, ref, reflen, v, sep="\t", file=_out) - ... + def get(self, key, default_val): key_index = self.index.get(key) if key_index is None: return Counter() return Counter({key: self.counts[key_index]}) - + def setdefault(self, key, default_val): ... def has_ambig_counts(self): return bool(self.counts[:, 1].sum() != 0) - + def __iter__(self): yield from self.index.keys() + def __getitem__(self, key): key_index = self.index.get(key) if key_index is None: return 0.0 return self.counts[key_index] + def __setitem__(self, key, value): key_index = self.index.get(key) if key_index is not None: self.counts[key_index] = value else: raise KeyError(f"{key=} not found.") - + def update(self, count_stream, ambiguous_counts=False, pair=False, pe_library=None,): if pe_library is not None: # this is the case when the alignment has a read group tag @@ -118,19 +128,32 @@ def update(self, count_stream, ambiguous_counts=False, pair=False, pe_library=No contributed_counts = self.update_counts(count_stream, increment=increment, ambiguous_counts=ambiguous_counts,) return contributed_counts - + def get_unannotated_reads(self): return self.unannotated_reads - + def get_counts(self, seqid, strand_specific=False): if strand_specific: - raise NotImplementedError() + raise NotImplementedError() + uniq_counts, ambig_counts = [0.0, 0.0], [0.0, 0.0] + uniq_counts[seqid[1]] = uniq_counter[seqid] + ambig_counts[seqid[1]] = ambig_counter[seqid] + + # rid = seqid[0] if isinstance(seqid, tuple) else seqid + # uniq_counts = [ + # uniq_counter[(rid, CountManager.PLUS_STRAND)], + # uniq_counter[(rid, CountManager.MINUS_STRAND)], + # ] + # ambig_counts = [ + # ambig_counter[(rid, CountManager.PLUS_STRAND)], + # ambig_counter[(rid, CountManager.MINUS_STRAND)], + # ] counts = self[seqid] return np.array((counts[0], counts[2], counts[1], counts[3])) - + def get_all_regions(self): - yield from self - + yield from self + def update_counts(self, count_stream, increment=1, ambiguous_counts=False): contributed_counts = 0 for hits, aln_count in count_stream: @@ -162,7 +185,7 @@ def update_counts(self, count_stream, increment=1, ambiguous_counts=False): contributed_counts += inc return contributed_counts - + def transform(self, refmgr): # transform 2-column uniq/ambig count matrix # into 4 columns @@ -195,5 +218,3 @@ def transform(self, refmgr): # return count sums return self.counts.sum(axis=0) - - diff --git a/gffquant/counters/count_manager.py b/gffquant/counters/count_manager.py index 8f8517f8..d3eb059f 100644 --- a/gffquant/counters/count_manager.py +++ b/gffquant/counters/count_manager.py @@ -152,7 +152,7 @@ def get_counts(self, seqid, region_counts=False, strand_specific=False): if region_counts: raise NotImplementedError() rid, seqid = seqid[0], seqid[1:] - + uniq_counter = self.uniq_regioncounts.get(rid, Counter()) ambig_counter = self.ambig_regioncounts.get(rid, Counter()) @@ -165,7 +165,6 @@ def get_counts(self, seqid, region_counts=False, strand_specific=False): else: # uniq_counter, ambig_counter = self.uniq_seqcounts, self.ambig_seqcounts - if strand_specific: raise NotImplementedError() uniq_counts, ambig_counts = [0.0, 0.0], [0.0, 0.0] @@ -196,7 +195,7 @@ def get_regions(self, rid): return set(self.uniq_regioncounts.get(rid, Counter())).union( self.ambig_regioncounts.get(rid, Counter()) ) - + def get_all_regions(self, region_counts=False): # uniq_counts, ambig_counts = ( # (self.uniq_seqcounts, self.ambig_seqcounts,), @@ -220,4 +219,4 @@ def dump(self, prefix, refmgr): if self.seqcounts is not None: self.seqcounts.dump(prefix, refmgr) if self.regioncounts is not None: - self.regioncounts.dump(prefix, refmgr) \ No newline at end of file + self.regioncounts.dump(prefix, refmgr) diff --git a/gffquant/counters/region_counter.py b/gffquant/counters/region_counter.py index a7a74756..3056288a 100644 --- a/gffquant/counters/region_counter.py +++ b/gffquant/counters/region_counter.py @@ -8,6 +8,21 @@ from .alignment_counter2 import AlignmentCounter +# from count_manager.get_counts() +# if region_counts: +# raise NotImplementedError() +# rid, seqid = seqid[0], seqid[1:] + +# uniq_counter = self.uniq_regioncounts.get(rid, Counter()) +# ambig_counter = self.ambig_regioncounts.get(rid, Counter()) + +# # pylint: disable=R1720 +# if strand_specific: +# raise NotImplementedError +# else: +# return [uniq_counter[seqid]], [ambig_counter[seqid]] + + class RegionCounter(AlignmentCounter): """This counter class can be used in overlap mode, i.e. when reads are aligned against long references (e.g. contigs) diff --git a/gffquant/counters/seq_counter.py b/gffquant/counters/seq_counter.py index 91e28628..261ed575 100644 --- a/gffquant/counters/seq_counter.py +++ b/gffquant/counters/seq_counter.py @@ -1,9 +1,11 @@ # pylint: disable=W0223 +# deprecated + """ module docstring """ from .. import DistributionMode -from .alignment_counter2 import AlignmentCounter +from .alignment_counter import AlignmentCounter class UniqueSeqCounter(AlignmentCounter): diff --git a/gffquant/profilers/feature_quantifier.py b/gffquant/profilers/feature_quantifier.py index 2af697f4..6477c1be 100644 --- a/gffquant/profilers/feature_quantifier.py +++ b/gffquant/profilers/feature_quantifier.py @@ -10,12 +10,11 @@ from abc import ABC from collections import Counter -from dataclasses import dataclass, asdict from .panda_coverage_profiler import PandaCoverageProfiler from ..alignment import AlignmentGroup, AlignmentProcessor, ReferenceHit, SamFlags from ..annotation import GeneCountAnnotator, RegionCountAnnotator, CountWriter -from ..counters import CountManager, AlignmentCounter +from ..counters import AlignmentCounter from ..db.annotation_db import AnnotationDatabaseManager from .. import __tool__, DistributionMode, RunMode @@ -139,7 +138,7 @@ def process_counters( count_annotator = Annotator(self.strand_specific, report_scaling_factors=report_scaling_factors) count_annotator.annotate(self.reference_manager, self.adm, self.counter, gene_group_db=gene_group_db,) - + count_writer = CountWriter( self.out_prefix, # has_ambig_counts=self.count_manager.has_ambig_counts(), diff --git a/gffquant/profilers/panda_profiler.py b/gffquant/profilers/panda_profiler.py index 53dacee6..4a5ee057 100644 --- a/gffquant/profilers/panda_profiler.py +++ b/gffquant/profilers/panda_profiler.py @@ -28,7 +28,6 @@ def __init__( self._buffer_size = 0 self._max_buffer_size = 400_000_000 - def get_gene_coords(self): if self.with_overlap: for rid, start, end in zip( @@ -283,7 +282,6 @@ def add_records(self, hits, last_update=False): self._buffer += hits self._buffer_size += hits_size - def merge_dataframes(self): print("BUFFER:", len(self._buffer), self._buffer[:1]) hits_df = pd.DataFrame(self._buffer) @@ -319,7 +317,6 @@ def merge_dataframes(self): .groupby(by=self.index_columns, as_index=False) \ .sum(numeric_only=True) - def add_records_old(self, hits): # [2024-02-08 14:51:17,846] count_stream: From 9e078ce9b8665257c7d26c6ade602a56b9333b64 Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Sun, 22 Dec 2024 21:47:16 +0100 Subject: [PATCH 035/128] removed seq_counter.py --- gffquant/annotation/count_annotator.py | 4 +- gffquant/counters/alignment_counter2.py | 8 +-- gffquant/counters/seq_counter.py | 65 ------------------------ gffquant/profilers/feature_quantifier.py | 7 --- 4 files changed, 6 insertions(+), 78 deletions(-) delete mode 100644 gffquant/counters/seq_counter.py diff --git a/gffquant/annotation/count_annotator.py b/gffquant/annotation/count_annotator.py index cc146006..12d811ae 100644 --- a/gffquant/annotation/count_annotator.py +++ b/gffquant/annotation/count_annotator.py @@ -8,7 +8,7 @@ import numpy as np -from ..counters.count_manager import CountManager +from ..counters.count_manager import CountManager, AlignmentCounter logger = logging.getLogger(__name__) @@ -202,7 +202,7 @@ def __init__(self, strand_specific, report_scaling_factors=True): CountAnnotator.__init__(self, strand_specific, report_scaling_factors=report_scaling_factors) # pylint: disable=R0914,W0613 - def annotate(self, refmgr, db, count_manager: CountManager, gene_group_db=False): + def annotate(self, refmgr, db, counter: AlignmentCounter, gene_group_db=False): """ Annotate a set of region counts via db-lookup. input: diff --git a/gffquant/counters/alignment_counter2.py b/gffquant/counters/alignment_counter2.py index a0c84716..8fc96d2c 100644 --- a/gffquant/counters/alignment_counter2.py +++ b/gffquant/counters/alignment_counter2.py @@ -141,12 +141,12 @@ def get_counts(self, seqid, strand_specific=False): # rid = seqid[0] if isinstance(seqid, tuple) else seqid # uniq_counts = [ - # uniq_counter[(rid, CountManager.PLUS_STRAND)], - # uniq_counter[(rid, CountManager.MINUS_STRAND)], + # uniq_counter[(rid, AlignmentCounter.PLUS_STRAND)], + # uniq_counter[(rid, AlignmentCounter.MINUS_STRAND)], # ] # ambig_counts = [ - # ambig_counter[(rid, CountManager.PLUS_STRAND)], - # ambig_counter[(rid, CountManager.MINUS_STRAND)], + # ambig_counter[(rid, AlignmentCounter.PLUS_STRAND)], + # ambig_counter[(rid, AlignmentCounter.MINUS_STRAND)], # ] counts = self[seqid] return np.array((counts[0], counts[2], counts[1], counts[3])) diff --git a/gffquant/counters/seq_counter.py b/gffquant/counters/seq_counter.py deleted file mode 100644 index 261ed575..00000000 --- a/gffquant/counters/seq_counter.py +++ /dev/null @@ -1,65 +0,0 @@ -# pylint: disable=W0223 - -# deprecated - -""" module docstring """ - -from .. import DistributionMode -from .alignment_counter import AlignmentCounter - - -class UniqueSeqCounter(AlignmentCounter): - def __init__(self, strand_specific=False): - AlignmentCounter.__init__(self, strand_specific=strand_specific) - - def get_counts(self, seq_ids): - """ - Given a list of sequence ids, return the total number of reads that mapped to each of those - sequences - - :param seq_ids: a list of sequence ids to count - :return: A list of counts for each sequence ID. - """ - if self.strand_specific: - return sum( - self[(seq_id, strand)] for seq_id in seq_ids for strand in (True, False) - ) - return sum(self[seq_id] for seq_id in seq_ids) - - def update_counts(self, count_stream, increment=1): - for counts, _, _ in count_stream: - - for rid, hits in counts.items(): - - if self.strand_specific: - strands = tuple(int(strand) for _, _, strand, _, _ in hits) - - self[(rid, True)] += sum(strands) * increment - self[(rid, False)] += (len(hits) - sum(strands)) * increment - - else: - self[rid] += len(hits) * increment - - -class AmbiguousSeqCounter(AlignmentCounter): - def __init__(self, strand_specific=False, distribution_mode=DistributionMode.ONE_OVER_N): - AlignmentCounter.__init__( - self, distribution_mode=distribution_mode, strand_specific=strand_specific - ) - - def update_counts(self, count_stream, increment=1): - - for counts, aln_count, _ in count_stream: - - inc = self.get_increment(aln_count, increment) - - for rid, hits in counts.items(): - - if self.strand_specific: - strands = tuple(int(strand) for _, _, strand, _, _ in hits) - - self[(rid, True)] += sum(strands) * inc - self[(rid, False)] += (len(hits) - sum(strands)) * inc - - else: - self[rid] += len(hits) * inc diff --git a/gffquant/profilers/feature_quantifier.py b/gffquant/profilers/feature_quantifier.py index 6477c1be..b3b08a9f 100644 --- a/gffquant/profilers/feature_quantifier.py +++ b/gffquant/profilers/feature_quantifier.py @@ -59,12 +59,6 @@ def __init__( self.db = db self.adm = None self.run_mode = run_mode - # self.count_manager = CountManager( - # distribution_mode=distribution_mode, - # region_counts=run_mode.overlap_required, - # strand_specific=strand_specific and not run_mode.overlap_required, - # paired_end_count=paired_end_count, - # ) self.counter = AlignmentCounter( distribution_mode=distribution_mode, strand_specific=strand_specific, @@ -74,7 +68,6 @@ def __init__( self.distribution_mode = distribution_mode self.reference_manager = {} self.strand_specific = strand_specific - # self.coverage_counter = {} self.debug = debug self.panda_cv = PandaCoverageProfiler(dump_dataframes=self.debug) if calculate_coverage else None From 7976a4da982df4750706be468a41995fd3f3c03e Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Sun, 22 Dec 2024 21:49:22 +0100 Subject: [PATCH 036/128] removed Unique- and AmbiguousRegionCounter classes --- gffquant/counters/region_counter.py | 65 ----------------------------- 1 file changed, 65 deletions(-) diff --git a/gffquant/counters/region_counter.py b/gffquant/counters/region_counter.py index 3056288a..5b36a876 100644 --- a/gffquant/counters/region_counter.py +++ b/gffquant/counters/region_counter.py @@ -49,68 +49,3 @@ def update_counts(self, count_stream, increment=1): ) contributed_counts += inc return contributed_counts - - -class UniqueRegionCounter(RegionCounter): - """This counter class can be used in overlap mode, i.e. - when reads are aligned against long references (e.g. contigs) - with multiple regions of interest (features). - """ - - def __init__(self, distribution_mode=DistributionMode.ONE_OVER_N, strand_specific=False): - RegionCounter.__init__( - self, distribution_mode=distribution_mode, strand_specific=strand_specific, - ) - - # pylint: disable=W0613 - def update_counts(self, count_stream, increment=1): - """Update counter with alignments against the same reference. - - input: count_stream - - counts: set of overlaps with the reference - - aln_count: 1 if overlaps else 0 - - unaligned: 1 - aln_count - (redundant input due to streamlining uniq/ambig dataflows) - """ - for counts, aln_count, unaligned in count_stream: - if aln_count: - for rid, hits in counts.items(): - for hit in hits: - self._update_region( - rid, *hit, increment=increment - ) - else: - self.unannotated_reads += unaligned - - -class AmbiguousRegionCounter(RegionCounter): - """This counter class can be used in overlap mode, i.e. - when reads are aligned against long references (e.g. contigs) - with multiple regions of interest (features). - """ - - def __init__(self, distribution_mode=DistributionMode.ONE_OVER_N, strand_specific=False): - RegionCounter.__init__( - self, distribution_mode=distribution_mode, strand_specific=strand_specific, - ) - - # pylint: disable=W0613 - def update_counts(self, count_stream, increment=1): - """Update counter with alignments against the same reference. - - input: count_stream - - counts: set of overlaps with the reference - - aln_count: 1 if overlaps else 0 - - unaligned: 1 - aln_count - (redundant input due to streamlining uniq/ambig dataflows) - """ - for counts, aln_count, unaligned in count_stream: - if aln_count: - inc = self.get_increment(aln_count, increment) - for rid, hits in counts.items(): - for hit in hits: - self._update_region( - rid, *hit, increment=inc - ) - else: - self.unannotated_reads += unaligned From f754653f4e2a0e4142bc881fca71cd47f843eb6c Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Sun, 22 Dec 2024 21:57:53 +0100 Subject: [PATCH 037/128] updated alignment_counter, removed alignment_counter2 --- gffquant/counters/__init__.py | 3 +- gffquant/counters/alignment_counter.py | 202 ++++++++++++++++++++-- gffquant/counters/alignment_counter2.py | 220 ------------------------ gffquant/counters/count_manager.py | 2 +- gffquant/counters/region_counter.py | 2 +- 5 files changed, 188 insertions(+), 241 deletions(-) delete mode 100644 gffquant/counters/alignment_counter2.py diff --git a/gffquant/counters/__init__.py b/gffquant/counters/__init__.py index 774cd03c..34d30242 100644 --- a/gffquant/counters/__init__.py +++ b/gffquant/counters/__init__.py @@ -3,7 +3,6 @@ """module docstring""" -from .alignment_counter2 import AlignmentCounter +from .alignment_counter import AlignmentCounter from .region_counter import RegionCounter -from .seq_counter import UniqueSeqCounter, AmbiguousSeqCounter from .count_manager import CountManager diff --git a/gffquant/counters/alignment_counter.py b/gffquant/counters/alignment_counter.py index 3c42b254..8fc96d2c 100644 --- a/gffquant/counters/alignment_counter.py +++ b/gffquant/counters/alignment_counter.py @@ -1,18 +1,27 @@ -# pylint: disable=W0223 -# pylint: disable=C0103 -# pylint: disable=W1514 +# pylint: disable=R0902 -"""module docstring""" +""" module docstring """ import gzip +import logging from collections import Counter +import numpy as np + from .. import DistributionMode -class AlignmentCounter(Counter): - COUNT_HEADER_ELEMENTS = ["raw", "lnorm", "scaled"] +logger = logging.getLogger(__name__) + + +class AlignmentCounter: + COUNT_HEADER_ELEMENTS = ("raw", "lnorm", "scaled") + INITIAL_SIZE = 1000 + # this may be counter-intuitive + # but originates from the samflags 0x10, 0x20, + # which explicitly identify the reverse-strandness of the read + PLUS_STRAND, MINUS_STRAND = False, True @staticmethod def normalise_counts(counts, feature_len, scaling_factor): @@ -25,28 +34,187 @@ def get_increment(self, n_aln, increment): # 1overN = lavern. Maya <3 return (increment / n_aln) if self.distribution_mode == DistributionMode.ONE_OVER_N else increment - def __init__(self, distribution_mode=DistributionMode.ONE_OVER_N, strand_specific=False): - Counter.__init__(self) + def toggle_single_read_handling(self, unmarked_orphans): + # precalculate count-increment for single-end, paired-end reads + # for mixed input (i.e., paired-end data with single-end reads = orphans from preprocessing), + # properly attribute fractional counts to the orphans + # Increments: + # alignment from single end library read: 1 + # alignment from paired-end library read: 0.5 / mate (pe_count = 1) or 1 / mate (pe_count = 2) + # alignment from paired-end library orphan: 0.5 (pe_count = 1) or 1 (pe_count = 2) + + # old code: + # increment = 1 if (not pair or self.paired_end_count == 2) else 0.5 + + # if pair: + # increment = 1 if self.paired_end_count == 2 else 0.5 + # else: + # increment = 0.5 if self.unmarked_orphans else 1 + self.increments = ( + (self.paired_end_count / 2.0) if unmarked_orphans else 1.0, + self.paired_end_count / 2.0, + ) + + def __init__( + self, + distribution_mode=DistributionMode.ONE_OVER_N, + strand_specific=False, + paired_end_count=1, + ): self.distribution_mode = distribution_mode self.strand_specific = strand_specific + self.paired_end_count = paired_end_count + self.increments = (1.0, 1.0,) + self.increments_auto_detect = (1.0, self.paired_end_count / 2.0,) self.unannotated_reads = 0 + self.index = {} + self.counts = np.zeros( + (AlignmentCounter.INITIAL_SIZE, 2,), + dtype='float64', + ) + def dump(self, prefix, refmgr): with gzip.open(f"{prefix}.{self.__class__.__name__}.txt.gz", "wt") as _out: - for k, v in self.items(): - ref, reflen = refmgr.get(k[0] if isinstance(k, tuple) else k) - print(k, ref, reflen, v, sep="\t", file=_out) + for key, key_index in self.index.items(): + ref, reflen = refmgr.get(key[0] if isinstance(key, tuple) else key) + print(key, ref, reflen, self.counts[key_index], sep="\t", file=_out) + # for k, v in self.items(): + # ref, reflen = refmgr.get(k[0] if isinstance(k, tuple) else k) + # print(k, ref, reflen, v, sep="\t", file=_out) + + def get(self, key, default_val): + key_index = self.index.get(key) + if key_index is None: + return Counter() + return Counter({key: self.counts[key_index]}) + + def setdefault(self, key, default_val): + ... + + def has_ambig_counts(self): + return bool(self.counts[:, 1].sum() != 0) + + def __iter__(self): + yield from self.index.keys() + + def __getitem__(self, key): + key_index = self.index.get(key) + if key_index is None: + return 0.0 + return self.counts[key_index] + + def __setitem__(self, key, value): + key_index = self.index.get(key) + if key_index is not None: + self.counts[key_index] = value + else: + raise KeyError(f"{key=} not found.") + + def update(self, count_stream, ambiguous_counts=False, pair=False, pe_library=None,): + if pe_library is not None: + # this is the case when the alignment has a read group tag + # if pe_library is True (RG tag '2') -> take paired-end increment (also for orphans) + # else (RG tag '1') -> take single-end increment + increment = self.increments_auto_detect[pe_library] + else: + # if the alignment has no (appropriate) read group tag + # use the paired-end information instead + # if orphan reads are present in the input sam/bam, + # the flag `--unmarked_orphans` should be set + # otherwise orphan reads will be assigned a count of 1. + increment = self.increments[pair] - def update_counts(self, count_stream, increment=1): + contributed_counts = self.update_counts(count_stream, increment=increment, ambiguous_counts=ambiguous_counts,) + + return contributed_counts + + def get_unannotated_reads(self): + return self.unannotated_reads + + def get_counts(self, seqid, strand_specific=False): + if strand_specific: + raise NotImplementedError() + uniq_counts, ambig_counts = [0.0, 0.0], [0.0, 0.0] + uniq_counts[seqid[1]] = uniq_counter[seqid] + ambig_counts[seqid[1]] = ambig_counter[seqid] + + # rid = seqid[0] if isinstance(seqid, tuple) else seqid + # uniq_counts = [ + # uniq_counter[(rid, AlignmentCounter.PLUS_STRAND)], + # uniq_counter[(rid, AlignmentCounter.MINUS_STRAND)], + # ] + # ambig_counts = [ + # ambig_counter[(rid, AlignmentCounter.PLUS_STRAND)], + # ambig_counter[(rid, AlignmentCounter.MINUS_STRAND)], + # ] + counts = self[seqid] + return np.array((counts[0], counts[2], counts[1], counts[3])) + + def get_all_regions(self): + yield from self + + def update_counts(self, count_stream, increment=1, ambiguous_counts=False): contributed_counts = 0 for hits, aln_count in count_stream: hit = hits[0] - inc = increment if aln_count == 1 else self.get_increment(aln_count, increment) - if self.strand_specific: - self[(hit.rid, hit.rev_strand)] += inc - else: - self[hit.rid] += inc + inc = ( + ( + self.get_increment(aln_count, increment), + increment, + ) + )[aln_count == 1] + key = ( + ( + hit.rid, + (hit.rid, hit.rev_strand), + ) + )[self.strand_specific] + key_index = self.index.get(key) + if key_index is None: + nrows = self.counts.shape[0] + if len(self.index) == nrows: + self.counts = np.pad( + self.counts, + ((0, AlignmentCounter.INITIAL_SIZE), (0, 0),), + ) + # key_index = self.index.setdefault(key, len(self.index)) + key_index = self.index[key] = len(self.index) + self.counts[key_index][int(ambiguous_counts)] += inc contributed_counts += inc return contributed_counts + + def transform(self, refmgr): + # transform 2-column uniq/ambig count matrix + # into 4 columns + # uniq_raw, combined_raw, uniq_lnorm, combined_lnorm + + # obtain gene lengths + lengths = np.array( + tuple( + (refmgr.get(key[0] if isinstance(key, tuple) else key))[1] + for key in self.index + ) + ) + logger.info("LENGTHS ARRAY = %s", lengths.shape) + logger.info("INDEX SIZE = %s", len(self.index)) + + # remove the un-indexed rows + self.counts = self.counts[0:len(self.index), :] + + # calculate combined_raw + self.counts[:, 1:2] += self.counts[:, 0:1] + + # duplicate the raw counts + self.counts = np.concatenate( + (self.counts, self.counts,), + axis=1, + ) + + # length-normalise the lnorm columns + self.counts[:, 2:4] /= lengths[:, None] + + # return count sums + return self.counts.sum(axis=0) diff --git a/gffquant/counters/alignment_counter2.py b/gffquant/counters/alignment_counter2.py deleted file mode 100644 index 8fc96d2c..00000000 --- a/gffquant/counters/alignment_counter2.py +++ /dev/null @@ -1,220 +0,0 @@ -# pylint: disable=R0902 - -""" module docstring """ - -import gzip -import logging - -from collections import Counter - -import numpy as np - -from .. import DistributionMode - - -logger = logging.getLogger(__name__) - - -class AlignmentCounter: - COUNT_HEADER_ELEMENTS = ("raw", "lnorm", "scaled") - INITIAL_SIZE = 1000 - # this may be counter-intuitive - # but originates from the samflags 0x10, 0x20, - # which explicitly identify the reverse-strandness of the read - PLUS_STRAND, MINUS_STRAND = False, True - - @staticmethod - def normalise_counts(counts, feature_len, scaling_factor): - """Returns raw, length-normalised, and scaled feature counts.""" - normalised = counts / feature_len - scaled = normalised * scaling_factor - return counts, normalised, scaled - - def get_increment(self, n_aln, increment): - # 1overN = lavern. Maya <3 - return (increment / n_aln) if self.distribution_mode == DistributionMode.ONE_OVER_N else increment - - def toggle_single_read_handling(self, unmarked_orphans): - # precalculate count-increment for single-end, paired-end reads - # for mixed input (i.e., paired-end data with single-end reads = orphans from preprocessing), - # properly attribute fractional counts to the orphans - # Increments: - # alignment from single end library read: 1 - # alignment from paired-end library read: 0.5 / mate (pe_count = 1) or 1 / mate (pe_count = 2) - # alignment from paired-end library orphan: 0.5 (pe_count = 1) or 1 (pe_count = 2) - - # old code: - # increment = 1 if (not pair or self.paired_end_count == 2) else 0.5 - - # if pair: - # increment = 1 if self.paired_end_count == 2 else 0.5 - # else: - # increment = 0.5 if self.unmarked_orphans else 1 - self.increments = ( - (self.paired_end_count / 2.0) if unmarked_orphans else 1.0, - self.paired_end_count / 2.0, - ) - - def __init__( - self, - distribution_mode=DistributionMode.ONE_OVER_N, - strand_specific=False, - paired_end_count=1, - ): - self.distribution_mode = distribution_mode - self.strand_specific = strand_specific - self.paired_end_count = paired_end_count - self.increments = (1.0, 1.0,) - self.increments_auto_detect = (1.0, self.paired_end_count / 2.0,) - self.unannotated_reads = 0 - - self.index = {} - self.counts = np.zeros( - (AlignmentCounter.INITIAL_SIZE, 2,), - dtype='float64', - ) - - def dump(self, prefix, refmgr): - with gzip.open(f"{prefix}.{self.__class__.__name__}.txt.gz", "wt") as _out: - for key, key_index in self.index.items(): - ref, reflen = refmgr.get(key[0] if isinstance(key, tuple) else key) - print(key, ref, reflen, self.counts[key_index], sep="\t", file=_out) - # for k, v in self.items(): - # ref, reflen = refmgr.get(k[0] if isinstance(k, tuple) else k) - # print(k, ref, reflen, v, sep="\t", file=_out) - - def get(self, key, default_val): - key_index = self.index.get(key) - if key_index is None: - return Counter() - return Counter({key: self.counts[key_index]}) - - def setdefault(self, key, default_val): - ... - - def has_ambig_counts(self): - return bool(self.counts[:, 1].sum() != 0) - - def __iter__(self): - yield from self.index.keys() - - def __getitem__(self, key): - key_index = self.index.get(key) - if key_index is None: - return 0.0 - return self.counts[key_index] - - def __setitem__(self, key, value): - key_index = self.index.get(key) - if key_index is not None: - self.counts[key_index] = value - else: - raise KeyError(f"{key=} not found.") - - def update(self, count_stream, ambiguous_counts=False, pair=False, pe_library=None,): - if pe_library is not None: - # this is the case when the alignment has a read group tag - # if pe_library is True (RG tag '2') -> take paired-end increment (also for orphans) - # else (RG tag '1') -> take single-end increment - increment = self.increments_auto_detect[pe_library] - else: - # if the alignment has no (appropriate) read group tag - # use the paired-end information instead - # if orphan reads are present in the input sam/bam, - # the flag `--unmarked_orphans` should be set - # otherwise orphan reads will be assigned a count of 1. - increment = self.increments[pair] - - contributed_counts = self.update_counts(count_stream, increment=increment, ambiguous_counts=ambiguous_counts,) - - return contributed_counts - - def get_unannotated_reads(self): - return self.unannotated_reads - - def get_counts(self, seqid, strand_specific=False): - if strand_specific: - raise NotImplementedError() - uniq_counts, ambig_counts = [0.0, 0.0], [0.0, 0.0] - uniq_counts[seqid[1]] = uniq_counter[seqid] - ambig_counts[seqid[1]] = ambig_counter[seqid] - - # rid = seqid[0] if isinstance(seqid, tuple) else seqid - # uniq_counts = [ - # uniq_counter[(rid, AlignmentCounter.PLUS_STRAND)], - # uniq_counter[(rid, AlignmentCounter.MINUS_STRAND)], - # ] - # ambig_counts = [ - # ambig_counter[(rid, AlignmentCounter.PLUS_STRAND)], - # ambig_counter[(rid, AlignmentCounter.MINUS_STRAND)], - # ] - counts = self[seqid] - return np.array((counts[0], counts[2], counts[1], counts[3])) - - def get_all_regions(self): - yield from self - - def update_counts(self, count_stream, increment=1, ambiguous_counts=False): - contributed_counts = 0 - for hits, aln_count in count_stream: - hit = hits[0] - inc = ( - ( - self.get_increment(aln_count, increment), - increment, - ) - )[aln_count == 1] - key = ( - ( - hit.rid, - (hit.rid, hit.rev_strand), - ) - )[self.strand_specific] - - key_index = self.index.get(key) - if key_index is None: - nrows = self.counts.shape[0] - if len(self.index) == nrows: - self.counts = np.pad( - self.counts, - ((0, AlignmentCounter.INITIAL_SIZE), (0, 0),), - ) - # key_index = self.index.setdefault(key, len(self.index)) - key_index = self.index[key] = len(self.index) - self.counts[key_index][int(ambiguous_counts)] += inc - contributed_counts += inc - - return contributed_counts - - def transform(self, refmgr): - # transform 2-column uniq/ambig count matrix - # into 4 columns - # uniq_raw, combined_raw, uniq_lnorm, combined_lnorm - - # obtain gene lengths - lengths = np.array( - tuple( - (refmgr.get(key[0] if isinstance(key, tuple) else key))[1] - for key in self.index - ) - ) - logger.info("LENGTHS ARRAY = %s", lengths.shape) - logger.info("INDEX SIZE = %s", len(self.index)) - - # remove the un-indexed rows - self.counts = self.counts[0:len(self.index), :] - - # calculate combined_raw - self.counts[:, 1:2] += self.counts[:, 0:1] - - # duplicate the raw counts - self.counts = np.concatenate( - (self.counts, self.counts,), - axis=1, - ) - - # length-normalise the lnorm columns - self.counts[:, 2:4] /= lengths[:, None] - - # return count sums - return self.counts.sum(axis=0) diff --git a/gffquant/counters/count_manager.py b/gffquant/counters/count_manager.py index d3eb059f..4a37ad9a 100644 --- a/gffquant/counters/count_manager.py +++ b/gffquant/counters/count_manager.py @@ -5,7 +5,7 @@ import numpy as np from .. import DistributionMode -from .alignment_counter2 import AlignmentCounter +from .alignment_counter import AlignmentCounter from .region_counter import RegionCounter diff --git a/gffquant/counters/region_counter.py b/gffquant/counters/region_counter.py index 5b36a876..41f2b574 100644 --- a/gffquant/counters/region_counter.py +++ b/gffquant/counters/region_counter.py @@ -5,7 +5,7 @@ from collections import Counter from .. import DistributionMode -from .alignment_counter2 import AlignmentCounter +from .alignment_counter import AlignmentCounter # from count_manager.get_counts() From 26fbb556dfdefa0e2d962fc666a57b2673fc3c25 Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Sun, 22 Dec 2024 22:52:23 +0100 Subject: [PATCH 038/128] throwing out old code, splitting of regioncount_annotator --- gffquant/annotation/__init__.py | 3 +- gffquant/annotation/count_annotator.py | 79 +------------------ gffquant/annotation/regioncount_annotator.py | 81 ++++++++++++++++++++ gffquant/counters/alignment_counter.py | 28 +++---- 4 files changed, 98 insertions(+), 93 deletions(-) create mode 100644 gffquant/annotation/regioncount_annotator.py diff --git a/gffquant/annotation/__init__.py b/gffquant/annotation/__init__.py index 1649dcb4..2f8e1c0c 100644 --- a/gffquant/annotation/__init__.py +++ b/gffquant/annotation/__init__.py @@ -3,6 +3,7 @@ """ module docstring """ # from .count_annotator import GeneCountAnnotator, RegionCountAnnotator -from .count_annotator import RegionCountAnnotator +from .count_annotator import CountAnnotator from .count_writer import CountWriter from .genecount_annotator import GeneCountAnnotator +from .regioncount_annotator import RegionCountAnnotator diff --git a/gffquant/annotation/count_annotator.py b/gffquant/annotation/count_annotator.py index 12d811ae..669eb325 100644 --- a/gffquant/annotation/count_annotator.py +++ b/gffquant/annotation/count_annotator.py @@ -8,7 +8,7 @@ import numpy as np -from ..counters.count_manager import CountManager, AlignmentCounter +from ..counters.count_manager import AlignmentCounter logger = logging.getLogger(__name__) @@ -193,80 +193,3 @@ def compute_count_vector( counts[1::2] /= float(length) return counts - - -class RegionCountAnnotator(CountAnnotator): - """ CountAnnotator subclass for contig/region-based counting. """ - - def __init__(self, strand_specific, report_scaling_factors=True): - CountAnnotator.__init__(self, strand_specific, report_scaling_factors=report_scaling_factors) - - # pylint: disable=R0914,W0613 - def annotate(self, refmgr, db, counter: AlignmentCounter, gene_group_db=False): - """ - Annotate a set of region counts via db-lookup. - input: - - bam: bamr.BamFile to use as lookup table for reference names - - db: GffDatabaseManager holding functional annotation database - - count_manager: count_data - """ - # for rid in set(count_manager.uniq_regioncounts).union( - # count_manager.ambig_regioncounts - # ): - for rid in count_manager.get_all_regions(region_counts=True): - ref = refmgr.get(rid[0] if isinstance(rid, tuple) else rid)[0] - - for region in count_manager.get_regions(rid): - if self.strand_specific: - (start, end), rev_strand = region - else: - (start, end), rev_strand = region, None - # the region_annotation is a tuple of key-value pairs: - # (strand, func_category1: subcategories, func_category2: subcategories, ...) - # the first is the strand, the second is the gene id, the rest are the features - - region_annotation = db.query_sequence(ref, start=start, end=end) - if region_annotation is not None: - region_strand, feature_id, region_annotation = region_annotation - if feature_id is None: - feature_id = ref - - on_other_strand = (region_strand == "+" and rev_strand) \ - or (region_strand == "-" and not rev_strand) - - antisense_region = self.strand_specific and on_other_strand - - uniq_counts, ambig_counts = count_manager.get_counts( - (rid, start, end), region_counts=True, strand_specific=self.strand_specific - ) - - if self.strand_specific: - # if the region is antisense, 'sense-counts' (relative to the) region come from the - # negative strand and 'antisense-counts' from the positive strand - # vice-versa for a sense-region - strand_specific_counts = ( - (count_manager.MINUS_STRAND, count_manager.PLUS_STRAND) - if antisense_region - else (count_manager.PLUS_STRAND, count_manager.MINUS_STRAND) - ) - else: - strand_specific_counts = None - - region_length = end - start + 1 - counts = self.compute_count_vector( - uniq_counts, - ambig_counts, - region_length, - strand_specific_counts=strand_specific_counts, - region_counts=True, - ) - - self.distribute_feature_counts(counts, region_annotation) - - gcounts = self.gene_counts.setdefault( - feature_id, np.zeros(self.bins) - ) - gcounts += counts - self.total_gene_counts += counts[:4] - - self.calculate_scaling_factors() diff --git a/gffquant/annotation/regioncount_annotator.py b/gffquant/annotation/regioncount_annotator.py new file mode 100644 index 00000000..8db52e1c --- /dev/null +++ b/gffquant/annotation/regioncount_annotator.py @@ -0,0 +1,81 @@ +import numpy as np + +from . import CountAnnotator +from ..counters import AlignmentCounter + + +class RegionCountAnnotator(CountAnnotator): + """ CountAnnotator subclass for contig/region-based counting. """ + + def __init__(self, strand_specific, report_scaling_factors=True): + CountAnnotator.__init__(self, strand_specific, report_scaling_factors=report_scaling_factors) + + # pylint: disable=R0914,W0613 + def annotate(self, refmgr, db, counter: AlignmentCounter, gene_group_db=False): + """ + Annotate a set of region counts via db-lookup. + input: + - bam: bamr.BamFile to use as lookup table for reference names + - db: GffDatabaseManager holding functional annotation database + - count_manager: count_data + """ + # for rid in set(count_manager.uniq_regioncounts).union( + # count_manager.ambig_regioncounts + # ): + for rid in counter.get_all_regions(region_counts=True): + ref = refmgr.get(rid[0] if isinstance(rid, tuple) else rid)[0] + + for region in counter.get_regions(rid): + if self.strand_specific: + (start, end), rev_strand = region + else: + (start, end), rev_strand = region, None + # the region_annotation is a tuple of key-value pairs: + # (strand, func_category1: subcategories, func_category2: subcategories, ...) + # the first is the strand, the second is the gene id, the rest are the features + + region_annotation = db.query_sequence(ref, start=start, end=end) + if region_annotation is not None: + region_strand, feature_id, region_annotation = region_annotation + if feature_id is None: + feature_id = ref + + on_other_strand = (region_strand == "+" and rev_strand) \ + or (region_strand == "-" and not rev_strand) + + antisense_region = self.strand_specific and on_other_strand + + uniq_counts, ambig_counts = counter.get_counts( + (rid, start, end), region_counts=True, strand_specific=self.strand_specific + ) + + if self.strand_specific: + # if the region is antisense, 'sense-counts' (relative to the) region come from the + # negative strand and 'antisense-counts' from the positive strand + # vice-versa for a sense-region + strand_specific_counts = ( + (counter.MINUS_STRAND, counter.PLUS_STRAND) + if antisense_region + else (counter.PLUS_STRAND, counter.MINUS_STRAND) + ) + else: + strand_specific_counts = None + + region_length = end - start + 1 + counts = self.compute_count_vector( + uniq_counts, + ambig_counts, + region_length, + strand_specific_counts=strand_specific_counts, + region_counts=True, + ) + + self.distribute_feature_counts(counts, region_annotation) + + gcounts = self.gene_counts.setdefault( + feature_id, np.zeros(self.bins) + ) + gcounts += counts + self.total_gene_counts += counts[:4] + + self.calculate_scaling_factors() diff --git a/gffquant/counters/alignment_counter.py b/gffquant/counters/alignment_counter.py index 8fc96d2c..16e53b54 100644 --- a/gffquant/counters/alignment_counter.py +++ b/gffquant/counters/alignment_counter.py @@ -134,20 +134,20 @@ def get_unannotated_reads(self): def get_counts(self, seqid, strand_specific=False): if strand_specific: - raise NotImplementedError() - uniq_counts, ambig_counts = [0.0, 0.0], [0.0, 0.0] - uniq_counts[seqid[1]] = uniq_counter[seqid] - ambig_counts[seqid[1]] = ambig_counter[seqid] - - # rid = seqid[0] if isinstance(seqid, tuple) else seqid - # uniq_counts = [ - # uniq_counter[(rid, AlignmentCounter.PLUS_STRAND)], - # uniq_counter[(rid, AlignmentCounter.MINUS_STRAND)], - # ] - # ambig_counts = [ - # ambig_counter[(rid, AlignmentCounter.PLUS_STRAND)], - # ambig_counter[(rid, AlignmentCounter.MINUS_STRAND)], - # ] + raise NotImplementedError() + # uniq_counts, ambig_counts = [0.0, 0.0], [0.0, 0.0] + # uniq_counts[seqid[1]] = uniq_counter[seqid] + # ambig_counts[seqid[1]] = ambig_counter[seqid] + + # rid = seqid[0] if isinstance(seqid, tuple) else seqid + # uniq_counts = [ + # uniq_counter[(rid, AlignmentCounter.PLUS_STRAND)], + # uniq_counter[(rid, AlignmentCounter.MINUS_STRAND)], + # ] + # ambig_counts = [ + # ambig_counter[(rid, AlignmentCounter.PLUS_STRAND)], + # ambig_counter[(rid, AlignmentCounter.MINUS_STRAND)], + # ] counts = self[seqid] return np.array((counts[0], counts[2], counts[1], counts[3])) From bd1436f4812d9f7ac3505e7896a3fb50a836347e Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Sun, 22 Dec 2024 22:58:35 +0100 Subject: [PATCH 039/128] removed count_manager --- gffquant/counters/__init__.py | 1 - gffquant/counters/count_manager.py | 222 ----------------------------- 2 files changed, 223 deletions(-) delete mode 100644 gffquant/counters/count_manager.py diff --git a/gffquant/counters/__init__.py b/gffquant/counters/__init__.py index 34d30242..d325316e 100644 --- a/gffquant/counters/__init__.py +++ b/gffquant/counters/__init__.py @@ -5,4 +5,3 @@ from .alignment_counter import AlignmentCounter from .region_counter import RegionCounter -from .count_manager import CountManager diff --git a/gffquant/counters/count_manager.py b/gffquant/counters/count_manager.py deleted file mode 100644 index 4a37ad9a..00000000 --- a/gffquant/counters/count_manager.py +++ /dev/null @@ -1,222 +0,0 @@ -"""count_manager""" - -from collections import Counter - -import numpy as np - -from .. import DistributionMode -from .alignment_counter import AlignmentCounter -from .region_counter import RegionCounter - - -# pylint: disable=R0902 -class CountManager: - # this may be counter-intuitive - # but originates from the samflags 0x10, 0x20, - # which also identify the reverse-strandness of the read - # and not the forward-strandness - PLUS_STRAND, MINUS_STRAND = False, True - - def toggle_single_read_handling(self, unmarked_orphans): - # precalculate count-increment for single-end, paired-end reads - # for mixed input (i.e., paired-end data with single-end reads = orphans from preprocessing), - # properly attribute fractional counts to the orphans - # Increments: - # alignment from single end library read: 1 - # alignment from paired-end library read: 0.5 / mate (pe_count = 1) or 1 / mate (pe_count = 2) - # alignment from paired-end library orphan: 0.5 (pe_count = 1) or 1 (pe_count = 2) - - # old code: - # increment = 1 if (not pair or self.paired_end_count == 2) else 0.5 - - # if pair: - # increment = 1 if self.paired_end_count == 2 else 0.5 - # else: - # increment = 0.5 if self.unmarked_orphans else 1 - self.increments = [ - (self.paired_end_count / 2.0) if unmarked_orphans else 1.0, - self.paired_end_count / 2.0 - ] - - def __init__( - # pylint: disable=W0613,R0913 - self, - distribution_mode=DistributionMode.ONE_OVER_N, - region_counts=True, - strand_specific=False, - paired_end_count=1, - ): - self.distribution_mode = distribution_mode - self.strand_specific = strand_specific - self.paired_end_count = paired_end_count - self.increments = [1.0, 1.0] - self.increments_auto_detect = [1.0, self.paired_end_count / 2.0] - - # self.uniq_seqcounts, self.ambig_seqcounts = None, None - # self.uniq_regioncounts, self.ambig_regioncounts = None, None - self.seqcounts, self.regioncounts = None, None - - if region_counts: - # self.uniq_regioncounts = RegionCounter(strand_specific=strand_specific) - # self.ambig_regioncounts = RegionCounter( - # strand_specific=strand_specific, - # distribution_mode=distribution_mode, - # ) - self.regioncounts = RegionCounter( - strand_specific=strand_specific, - distribution_mode=distribution_mode, - ) - - else: - # self.uniq_seqcounts = AlignmentCounter(strand_specific=strand_specific) - # self.ambig_seqcounts = AlignmentCounter( - # strand_specific=strand_specific, - # distribution_mode=distribution_mode - # ) - self.seqcounts = AlignmentCounter( - strand_specific=strand_specific, - distribution_mode=distribution_mode, - ) - - def has_ambig_counts(self): - return any( - ( - self.seqcounts and self.seqcounts.has_ambig_counts(), - self.regioncounts and self.regioncounts.has_ambig_counts(), - ) - ) - # return self.ambig_regioncounts or self.ambig_seqcounts - - def update_counts(self, count_stream, ambiguous_counts=False, pair=False, pe_library=None): - # seq_counter, region_counter = ( - # (self.uniq_seqcounts, self.uniq_regioncounts) - # if not ambiguous_counts - # else (self.ambig_seqcounts, self.ambig_regioncounts) - # ) - - if pe_library is not None: - # this is the case when the alignment has a read group tag - # if pe_library is True (RG tag '2') -> take paired-end increment (also for orphans) - # else (RG tag '1') -> take single-end increment - increment = self.increments_auto_detect[pe_library] - else: - # if the alignment has no (appropriate) read group tag - # use the paired-end information instead - # if orphan reads are present in the input sam/bam, - # the flag `--unmarked_orphans` should be set - # otherwise orphan reads will be assigned a count of 1. - increment = self.increments[pair] - - contributed_counts = 0 - if self.seqcounts is not None: - contributed_counts = self.seqcounts.update_counts(count_stream, increment=increment, ambiguous_counts=ambiguous_counts,) - elif self.regioncounts is not None: - contributed_counts = self.regioncounts.update_counts(count_stream, increment=increment, ambiguous_counts=ambiguous_counts,) - # if seq_counter is not None: - # contributed_counts = seq_counter.update_counts(count_stream, increment=increment) - # elif region_counter is not None: - # contributed_counts = region_counter.update_counts(count_stream, increment=increment) - - return contributed_counts - - def dump_raw_counters(self, prefix, refmgr): - # if self.uniq_seqcounts is not None: - # self.uniq_seqcounts.dump(prefix, refmgr) - # if self.ambig_seqcounts is not None: - # self.ambig_seqcounts.dump(prefix, refmgr) - # if self.uniq_regioncounts is not None: - # self.uniq_regioncounts.dump(prefix, refmgr) - # if self.ambig_regioncounts is not None: - # self.ambig_regioncounts.dump(prefix, refmgr) - ... - - def get_unannotated_reads(self): - unannotated_reads = 0 - - # if self.uniq_regioncounts is not None: - # unannotated_reads += self.uniq_regioncounts.unannotated_reads - # if self.ambig_regioncounts is not None: - # unannotated_reads += self.ambig_regioncounts.unannotated_reads - # if self.uniq_seqcounts is not None: - # unannotated_reads += self.uniq_seqcounts.unannotated_reads - # if self.ambig_seqcounts is not None: - # unannotated_reads += self.ambig_seqcounts.unannotated_reads - if self.regioncounts is not None: - unannotated_reads += self.regioncounts.unannotated_reads - if self.seqcounts is not None: - unannotated_reads += self.seqcounts.unannotated_reads - - return unannotated_reads - - def get_counts(self, seqid, region_counts=False, strand_specific=False): - if region_counts: - raise NotImplementedError() - rid, seqid = seqid[0], seqid[1:] - - uniq_counter = self.uniq_regioncounts.get(rid, Counter()) - ambig_counter = self.ambig_regioncounts.get(rid, Counter()) - - # pylint: disable=R1720 - if strand_specific: - raise NotImplementedError - else: - return [uniq_counter[seqid]], [ambig_counter[seqid]] - - else: - # uniq_counter, ambig_counter = self.uniq_seqcounts, self.ambig_seqcounts - - if strand_specific: - raise NotImplementedError() - uniq_counts, ambig_counts = [0.0, 0.0], [0.0, 0.0] - uniq_counts[seqid[1]] = uniq_counter[seqid] - ambig_counts[seqid[1]] = ambig_counter[seqid] - - # rid = seqid[0] if isinstance(seqid, tuple) else seqid - # uniq_counts = [ - # uniq_counter[(rid, CountManager.PLUS_STRAND)], - # uniq_counter[(rid, CountManager.MINUS_STRAND)], - # ] - # ambig_counts = [ - # ambig_counter[(rid, CountManager.PLUS_STRAND)], - # ambig_counter[(rid, CountManager.MINUS_STRAND)], - # ] - else: - # uniq_counts, ambig_counts = [uniq_counter[seqid]], [ambig_counter[seqid]] - # uniq_counts, ambig_counts = [self.seqcounts[seqid][0]], [self.seqcounts[seqid][1]] - counts = self.seqcounts[seqid] - - # return uniq_counts, ambig_counts - return np.array((counts[0], counts[2], counts[1], counts[3])) - - def get_regions(self, rid): - # return set(self.uniq_regioncounts.get(rid, set())).union( - # self.ambig_regioncounts.get(rid, set()) - # ) - return set(self.uniq_regioncounts.get(rid, Counter())).union( - self.ambig_regioncounts.get(rid, Counter()) - ) - - def get_all_regions(self, region_counts=False): - # uniq_counts, ambig_counts = ( - # (self.uniq_seqcounts, self.ambig_seqcounts,), - # (self.uniq_regioncounts, self.ambig_regioncounts,), - # )[region_counts] - # yield from set(uniq_counts).union(ambig_counts) - counts = ( - self.seqcounts, - self.regioncounts, - )[region_counts] - - yield from counts - - def transform_counts(self, refmgr): - if self.seqcounts is not None: - return self.seqcounts.transform(refmgr) - if self.regioncounts is not None: - return self.regioncounts.transform(refmgr) - - def dump(self, prefix, refmgr): - if self.seqcounts is not None: - self.seqcounts.dump(prefix, refmgr) - if self.regioncounts is not None: - self.regioncounts.dump(prefix, refmgr) From b460ebdd38437f8889f3429e76a641eb5f7933d6 Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Sun, 22 Dec 2024 23:00:35 +0100 Subject: [PATCH 040/128] removed count_manager references --- gffquant/annotation/count_annotator.py | 2 +- gffquant/annotation/genecount_annotator.py | 3 +-- gffquant/annotation/regioncount_annotator.py | 4 ---- gffquant/profilers/feature_quantifier.py | 7 ------- 4 files changed, 2 insertions(+), 14 deletions(-) diff --git a/gffquant/annotation/count_annotator.py b/gffquant/annotation/count_annotator.py index 669eb325..f92b6472 100644 --- a/gffquant/annotation/count_annotator.py +++ b/gffquant/annotation/count_annotator.py @@ -8,7 +8,7 @@ import numpy as np -from ..counters.count_manager import AlignmentCounter +from ..counters import AlignmentCounter logger = logging.getLogger(__name__) diff --git a/gffquant/annotation/genecount_annotator.py b/gffquant/annotation/genecount_annotator.py index bbd7581d..6e84cf95 100644 --- a/gffquant/annotation/genecount_annotator.py +++ b/gffquant/annotation/genecount_annotator.py @@ -17,9 +17,8 @@ def __init__(self, strand_specific, report_scaling_factors=True): def annotate(self, refmgr, db, counter: AlignmentCounter, gene_group_db=False): """ Annotate a set of gene counts with functional annotations. """ - self.total_gene_counts = counter.transform(refmgr) # count_manager.transform_counts(refmgr) + self.total_gene_counts = counter.transform(refmgr) logger.info("TOTAL_GENE_COUNTS = %s", self.total_gene_counts) - # self.total_counts = self.total_gene_counts # ? # formerly used in compute_count_vector strand_specific_counts = ( diff --git a/gffquant/annotation/regioncount_annotator.py b/gffquant/annotation/regioncount_annotator.py index 8db52e1c..6d719413 100644 --- a/gffquant/annotation/regioncount_annotator.py +++ b/gffquant/annotation/regioncount_annotator.py @@ -17,11 +17,7 @@ def annotate(self, refmgr, db, counter: AlignmentCounter, gene_group_db=False): input: - bam: bamr.BamFile to use as lookup table for reference names - db: GffDatabaseManager holding functional annotation database - - count_manager: count_data """ - # for rid in set(count_manager.uniq_regioncounts).union( - # count_manager.ambig_regioncounts - # ): for rid in counter.get_all_regions(region_counts=True): ref = refmgr.get(rid[0] if isinstance(rid, tuple) else rid)[0] diff --git a/gffquant/profilers/feature_quantifier.py b/gffquant/profilers/feature_quantifier.py index b3b08a9f..a26e9508 100644 --- a/gffquant/profilers/feature_quantifier.py +++ b/gffquant/profilers/feature_quantifier.py @@ -122,7 +122,6 @@ def process_counters( self.adm = AnnotationDatabaseManager.from_db(self.db, in_memory=in_memory) if dump_counters: - # self.count_manager.dump_raw_counters(self.out_prefix, self.reference_manager) self.counter.dump(self.out_prefix, self.reference_manager,) report_scaling_factors = restrict_reports is None or "scaled" in restrict_reports @@ -134,7 +133,6 @@ def process_counters( count_writer = CountWriter( self.out_prefix, - # has_ambig_counts=self.count_manager.has_ambig_counts(), has_ambig_counts=self.counter.has_ambig_counts(), strand_specific=self.strand_specific, restrict_reports=restrict_reports, @@ -143,7 +141,6 @@ def process_counters( filtered_readcount=self.aln_counter["filtered_read_count"], ) - # unannotated_reads = self.count_manager.get_unannotated_reads() unannotated_reads = self.counter.get_unannotated_reads() unannotated_reads += self.aln_counter["unannotated_ambig"] @@ -154,8 +151,6 @@ def process_counters( ) count_writer.write_gene_counts( - # count_annotator.gene_counts, - # self.count_manager, self.counter, self.reference_manager, count_annotator.scaling_factors["total_gene_uniq"], @@ -198,7 +193,6 @@ def process_alignments( filtered_sam=debug_samfile, ) - # self.count_manager.toggle_single_read_handling(unmarked_orphans) self.counter.toggle_single_read_handling(unmarked_orphans) ac = self.aln_counter @@ -420,7 +414,6 @@ def process_alignment_group(self, aln_group, aln_reader): ) ) - # contributed_counts = self.count_manager.update_counts( contributed_counts = self.counter.update( count_stream, ambiguous_counts=is_ambiguous_group, From 178955119ce0cc3eeeda52909b7664ea44fecb3f Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Mon, 23 Dec 2024 21:05:15 +0100 Subject: [PATCH 041/128] modified gene_count write behaviour in prep of ggroup annotation --- gffquant/alignment/aln_group.py | 2 ++ gffquant/annotation/count_annotator.py | 26 +++++++++---------- gffquant/annotation/genecount_annotator.py | 12 +++++++-- gffquant/counters/alignment_counter.py | 17 ++++++++++--- gffquant/profilers/feature_quantifier.py | 29 +++++++++++++++------- 5 files changed, 59 insertions(+), 27 deletions(-) diff --git a/gffquant/alignment/aln_group.py b/gffquant/alignment/aln_group.py index 057b5a3f..b25f5758 100644 --- a/gffquant/alignment/aln_group.py +++ b/gffquant/alignment/aln_group.py @@ -79,6 +79,8 @@ def get_all_hits(self, as_ambiguous=False): except TypeError as err: raise TypeError(f"Cannot derive sequencing library from tags: {aln.tags}") from err + # in region mode, there can be more hits (if the alignment overlaps multiple features of the target sequence) + # in gene mode, each alignment is a hit, i.e. there is at most 1 hit / alignment yield aln.hits, n_aln def get_ambig_align_counts(self): diff --git a/gffquant/annotation/count_annotator.py b/gffquant/annotation/count_annotator.py index f92b6472..3598d285 100644 --- a/gffquant/annotation/count_annotator.py +++ b/gffquant/annotation/count_annotator.py @@ -112,19 +112,19 @@ def calc_scaling_factor(raw, normed, default=0): ) # total_uniq, total_uniq_normed, total_ambi, total_ambi_normed = self.total_gene_counts - total_uniq, total_ambi, total_uniq_normed, total_ambi_normed = self.total_gene_counts - logger.info( - "TOTAL GENE COUNTS: uraw=%s unorm=%s araw=%s anorm=%s", - total_uniq, total_uniq_normed, total_ambi, total_ambi_normed - ) - - self.scaling_factors["total_gene_uniq"] = calc_scaling_factor( - total_uniq, total_uniq_normed, default_scaling_factor - ) - - self.scaling_factors["total_gene_ambi"] = calc_scaling_factor( - total_ambi, total_ambi_normed, default_scaling_factor - ) + # total_uniq, total_ambi, total_uniq_normed, total_ambi_normed = self.total_gene_counts + # logger.info( + # "TOTAL GENE COUNTS: uraw=%s unorm=%s araw=%s anorm=%s", + # total_uniq, total_uniq_normed, total_ambi, total_ambi_normed + # ) + + # self.scaling_factors["total_gene_uniq"] = calc_scaling_factor( + # total_uniq, total_uniq_normed, default_scaling_factor + # ) + + # self.scaling_factors["total_gene_ambi"] = calc_scaling_factor( + # total_ambi, total_ambi_normed, default_scaling_factor + # ) fc_items = self.feature_count_sums.items() for category, ( diff --git a/gffquant/annotation/genecount_annotator.py b/gffquant/annotation/genecount_annotator.py index 6e84cf95..dd405f6c 100644 --- a/gffquant/annotation/genecount_annotator.py +++ b/gffquant/annotation/genecount_annotator.py @@ -2,6 +2,7 @@ import logging from .count_annotator import CountAnnotator +from .count_writer import CountWriter from ..counters import AlignmentCounter @@ -17,8 +18,15 @@ def __init__(self, strand_specific, report_scaling_factors=True): def annotate(self, refmgr, db, counter: AlignmentCounter, gene_group_db=False): """ Annotate a set of gene counts with functional annotations. """ - self.total_gene_counts = counter.transform(refmgr) - logger.info("TOTAL_GENE_COUNTS = %s", self.total_gene_counts) + # self.total_gene_counts, u_sf, a_sf = counter.generate_gene_count_matrix(refmgr) + # logger.info("TOTAL_GENE_COUNTS = %s", self.total_gene_counts) + + # writer.write_gene_counts( + # counter, + # refmgr, + # u_sf, a_sf, + # gene_group_db=gene_group_db, + # ) # formerly used in compute_count_vector strand_specific_counts = ( diff --git a/gffquant/counters/alignment_counter.py b/gffquant/counters/alignment_counter.py index 16e53b54..e377d67e 100644 --- a/gffquant/counters/alignment_counter.py +++ b/gffquant/counters/alignment_counter.py @@ -186,7 +186,7 @@ def update_counts(self, count_stream, increment=1, ambiguous_counts=False): return contributed_counts - def transform(self, refmgr): + def generate_gene_count_matrix(self, refmgr): # transform 2-column uniq/ambig count matrix # into 4 columns # uniq_raw, combined_raw, uniq_lnorm, combined_lnorm @@ -216,5 +216,16 @@ def transform(self, refmgr): # length-normalise the lnorm columns self.counts[:, 2:4] /= lengths[:, None] - # return count sums - return self.counts.sum(axis=0) + count_sums = self.counts.sum(axis=0) + + uniq_scaling_factor = (count_sums[0] / count_sums[2], 1.0)[count_sums[2] == 0] + ambig_scaling_factor = (count_sums[1] / count_sums[3], 1.0)[count_sums[3] == 0] + + logger.info( + "AC:: TOTAL GENE COUNTS: uraw=%s unorm=%s araw=%s anorm=%s => SF: %s %s", + count_sums[0], count_sums[2], count_sums[1], count_sums[3], + uniq_scaling_factor, ambig_scaling_factor, + ) + + # return count sums and scaling factors + return count_sums, uniq_scaling_factor, ambig_scaling_factor diff --git a/gffquant/profilers/feature_quantifier.py b/gffquant/profilers/feature_quantifier.py index a26e9508..6f9a4558 100644 --- a/gffquant/profilers/feature_quantifier.py +++ b/gffquant/profilers/feature_quantifier.py @@ -129,8 +129,6 @@ def process_counters( Annotator = (GeneCountAnnotator, RegionCountAnnotator)[self.run_mode.overlap_required] count_annotator = Annotator(self.strand_specific, report_scaling_factors=report_scaling_factors) - count_annotator.annotate(self.reference_manager, self.adm, self.counter, gene_group_db=gene_group_db,) - count_writer = CountWriter( self.out_prefix, has_ambig_counts=self.counter.has_ambig_counts(), @@ -141,6 +139,19 @@ def process_counters( filtered_readcount=self.aln_counter["filtered_read_count"], ) + count_annotator.annotate(self.reference_manager, self.adm, self.counter, gene_group_db=gene_group_db,) + + total_gene_counts, u_sf, a_sf = self.counter.generate_gene_count_matrix(self.reference_manager) + logger.info("TOTAL_GENE_COUNTS = %s", total_gene_counts) + + count_writer.write_gene_counts( + self.counter, + self.reference_manager, + u_sf, a_sf, + gene_group_db=gene_group_db, + ) + + unannotated_reads = self.counter.get_unannotated_reads() unannotated_reads += self.aln_counter["unannotated_ambig"] @@ -150,13 +161,13 @@ def process_counters( (None, unannotated_reads)[report_unannotated], ) - count_writer.write_gene_counts( - self.counter, - self.reference_manager, - count_annotator.scaling_factors["total_gene_uniq"], - count_annotator.scaling_factors["total_gene_ambi"], - gene_group_db=gene_group_db, - ) + # count_writer.write_gene_counts( + # self.counter, + # self.reference_manager, + # count_annotator.scaling_factors["total_gene_uniq"], + # count_annotator.scaling_factors["total_gene_ambi"], + # gene_group_db=gene_group_db, + # ) self.adm.clear_caches() From fdd7aafa0408de2c2879cc20c94c26cafde6dccf Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Mon, 23 Dec 2024 21:16:33 +0100 Subject: [PATCH 042/128] modified gene_count write behaviour in prep of ggroup annotation --- gffquant/profilers/feature_quantifier.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gffquant/profilers/feature_quantifier.py b/gffquant/profilers/feature_quantifier.py index 6f9a4558..f6ec4ea3 100644 --- a/gffquant/profilers/feature_quantifier.py +++ b/gffquant/profilers/feature_quantifier.py @@ -139,11 +139,11 @@ def process_counters( filtered_readcount=self.aln_counter["filtered_read_count"], ) - count_annotator.annotate(self.reference_manager, self.adm, self.counter, gene_group_db=gene_group_db,) - total_gene_counts, u_sf, a_sf = self.counter.generate_gene_count_matrix(self.reference_manager) logger.info("TOTAL_GENE_COUNTS = %s", total_gene_counts) + count_annotator.annotate(self.reference_manager, self.adm, self.counter, gene_group_db=gene_group_db,) + count_writer.write_gene_counts( self.counter, self.reference_manager, From 4af2f1449079376aae279c2195e1c59fb20eb95d Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Tue, 24 Dec 2024 00:14:24 +0100 Subject: [PATCH 043/128] change gene group handling during annotation --- gffquant/annotation/genecount_annotator.py | 6 ++++-- gffquant/counters/alignment_counter.py | 23 ++++++++++++++++++++++ 2 files changed, 27 insertions(+), 2 deletions(-) diff --git a/gffquant/annotation/genecount_annotator.py b/gffquant/annotation/genecount_annotator.py index dd405f6c..aac0b11a 100644 --- a/gffquant/annotation/genecount_annotator.py +++ b/gffquant/annotation/genecount_annotator.py @@ -36,11 +36,13 @@ def annotate(self, refmgr, db, counter: AlignmentCounter, gene_group_db=False): for rid in counter.get_all_regions(): counts = counter.get_counts(rid, strand_specific=self.strand_specific) + ref, _ = refmgr.get(rid[0] if isinstance(rid, tuple) else rid) if gene_group_db: - ref_tokens = ref.split(".") - gene_id, ggroup_id = ".".join(ref_tokens[:-1]), ref_tokens[-1] + # ref_tokens = ref.split(".") + # gene_id, ggroup_id = ".".join(ref_tokens[:-1]), ref_tokens[-1] + gene_id, ggroup_id = rid, rid else: gene_id, ggroup_id = ref, ref diff --git a/gffquant/counters/alignment_counter.py b/gffquant/counters/alignment_counter.py index e377d67e..bbcd062e 100644 --- a/gffquant/counters/alignment_counter.py +++ b/gffquant/counters/alignment_counter.py @@ -229,3 +229,26 @@ def generate_gene_count_matrix(self, refmgr): # return count sums and scaling factors return count_sums, uniq_scaling_factor, ambig_scaling_factor + + def group_gene_count_matrix(self, refmgr): + ggroup_index = {} + for key, key_index in self.index.items(): + ref = (refmgr.get(key[0] if isinstance(key, tuple) else key))[0] + ref_tokens = ref.split(".") + _, ggroup_id = ".".join(ref_tokens[:-1]), ref_tokens[-1] + g_key_index = ggroup_index.get(ggroup_id) + if g_key_index is None: + g_key_index = ggroup_index[ggroup_id] = len(ggroup_index) + else: + # only add counts if group has been encountered before + # else there will be duplicates + self.counts[g_key_index] += self.counts[key_index] + + # replace index with grouped index + self.index = ggroup_index + + # remove the un-indexed (ungrouped) rows + self.counts = self.counts[0:len(self.index), :] + + + From 39ebd15f7619289e7292b698b5ca37c79a4352ea Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Tue, 24 Dec 2024 00:35:45 +0100 Subject: [PATCH 044/128] change gene group handling during annotation --- gffquant/profilers/feature_quantifier.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/gffquant/profilers/feature_quantifier.py b/gffquant/profilers/feature_quantifier.py index f6ec4ea3..988202d3 100644 --- a/gffquant/profilers/feature_quantifier.py +++ b/gffquant/profilers/feature_quantifier.py @@ -142,7 +142,6 @@ def process_counters( total_gene_counts, u_sf, a_sf = self.counter.generate_gene_count_matrix(self.reference_manager) logger.info("TOTAL_GENE_COUNTS = %s", total_gene_counts) - count_annotator.annotate(self.reference_manager, self.adm, self.counter, gene_group_db=gene_group_db,) count_writer.write_gene_counts( self.counter, @@ -151,6 +150,9 @@ def process_counters( gene_group_db=gene_group_db, ) + self.counter.group_gene_count_matrix(self.reference_manager) + + count_annotator.annotate(self.reference_manager, self.adm, self.counter, gene_group_db=gene_group_db,) unannotated_reads = self.counter.get_unannotated_reads() unannotated_reads += self.aln_counter["unannotated_ambig"] From c0b466479e9e4df4867c9593954403f1f3c883f1 Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Tue, 24 Dec 2024 00:45:11 +0100 Subject: [PATCH 045/128] change gene group handling during annotation --- gffquant/annotation/genecount_annotator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gffquant/annotation/genecount_annotator.py b/gffquant/annotation/genecount_annotator.py index aac0b11a..6f3fceed 100644 --- a/gffquant/annotation/genecount_annotator.py +++ b/gffquant/annotation/genecount_annotator.py @@ -37,13 +37,13 @@ def annotate(self, refmgr, db, counter: AlignmentCounter, gene_group_db=False): for rid in counter.get_all_regions(): counts = counter.get_counts(rid, strand_specific=self.strand_specific) - ref, _ = refmgr.get(rid[0] if isinstance(rid, tuple) else rid) if gene_group_db: # ref_tokens = ref.split(".") # gene_id, ggroup_id = ".".join(ref_tokens[:-1]), ref_tokens[-1] gene_id, ggroup_id = rid, rid else: + ref, _ = refmgr.get(rid[0] if isinstance(rid, tuple) else rid) gene_id, ggroup_id = ref, ref region_annotation = db.query_sequence(ggroup_id) From b935a24cf04a07851f7973a7cfe91f102fcdb971 Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Tue, 24 Dec 2024 09:49:58 +0100 Subject: [PATCH 046/128] added debug messaging --- gffquant/counters/alignment_counter.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/gffquant/counters/alignment_counter.py b/gffquant/counters/alignment_counter.py index bbcd062e..323d37d5 100644 --- a/gffquant/counters/alignment_counter.py +++ b/gffquant/counters/alignment_counter.py @@ -235,14 +235,16 @@ def group_gene_count_matrix(self, refmgr): for key, key_index in self.index.items(): ref = (refmgr.get(key[0] if isinstance(key, tuple) else key))[0] ref_tokens = ref.split(".") - _, ggroup_id = ".".join(ref_tokens[:-1]), ref_tokens[-1] + gene_id, ggroup_id = ".".join(ref_tokens[:-1]), ref_tokens[-1] g_key_index = ggroup_index.get(ggroup_id) if g_key_index is None: g_key_index = ggroup_index[ggroup_id] = len(ggroup_index) + logger.info("AC: group_gene_count_matrix - gene=%s new group=%s (%s) base counts=%s -> %s", gene_id, ggroup_id, g_key_index, str(self.counts[key_index]), str(self.counts[g_key_index]),) else: # only add counts if group has been encountered before # else there will be duplicates self.counts[g_key_index] += self.counts[key_index] + logger.info("AC: group_gene_count_matrix - gene=%s group=%s (%s) adding counts=%s -> %s", gene_id, ggroup_id, g_key_index, str(self.counts[key_index]), str(self.counts[g_key_index]),) # replace index with grouped index self.index = ggroup_index From 69709f881c17655b365161f4a7f4a00007187aa6 Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Tue, 24 Dec 2024 14:36:40 +0100 Subject: [PATCH 047/128] solved? --- gffquant/counters/alignment_counter.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/gffquant/counters/alignment_counter.py b/gffquant/counters/alignment_counter.py index 323d37d5..f47f0581 100644 --- a/gffquant/counters/alignment_counter.py +++ b/gffquant/counters/alignment_counter.py @@ -237,13 +237,15 @@ def group_gene_count_matrix(self, refmgr): ref_tokens = ref.split(".") gene_id, ggroup_id = ".".join(ref_tokens[:-1]), ref_tokens[-1] g_key_index = ggroup_index.get(ggroup_id) + gene_counts = self.counts[key_index] if g_key_index is None: g_key_index = ggroup_index[ggroup_id] = len(ggroup_index) + self.counts[g_key_index] = gene_counts logger.info("AC: group_gene_count_matrix - gene=%s new group=%s (%s) base counts=%s -> %s", gene_id, ggroup_id, g_key_index, str(self.counts[key_index]), str(self.counts[g_key_index]),) else: # only add counts if group has been encountered before # else there will be duplicates - self.counts[g_key_index] += self.counts[key_index] + self.counts[g_key_index] += gene_counts logger.info("AC: group_gene_count_matrix - gene=%s group=%s (%s) adding counts=%s -> %s", gene_id, ggroup_id, g_key_index, str(self.counts[key_index]), str(self.counts[g_key_index]),) # replace index with grouped index From 62c0a523982ceabd793f69dead0a538f766ee352 Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Tue, 24 Dec 2024 23:19:15 +0100 Subject: [PATCH 048/128] disabling various logger calls --- gffquant/annotation/genecount_annotator.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/gffquant/annotation/genecount_annotator.py b/gffquant/annotation/genecount_annotator.py index 6f3fceed..cbc74ff5 100644 --- a/gffquant/annotation/genecount_annotator.py +++ b/gffquant/annotation/genecount_annotator.py @@ -49,14 +49,14 @@ def annotate(self, refmgr, db, counter: AlignmentCounter, gene_group_db=False): region_annotation = db.query_sequence(ggroup_id) if region_annotation is not None: _, _, region_annotation = region_annotation - logger.info( - "GCAnnotator: Distributing counts of Gene %s (group=%s) %s %s", - gene_id, ggroup_id, counts[0], counts[2], - ) + # logger.info( + # "GCAnnotator: Distributing counts of Gene %s (group=%s) %s %s", + # gene_id, ggroup_id, counts[0], counts[2], + # ) self.distribute_feature_counts(counts, region_annotation) else: - logger.info("GCAnnotator: Gene %s (group=%s) has no information in database.", gene_id, ggroup_id) + # logger.info("GCAnnotator: Gene %s (group=%s) has no information in database.", gene_id, ggroup_id) self.unannotated_counts += counts[:4] self.calculate_scaling_factors() From 2239d29673bb1a186609a7ef1e68b815099473b0 Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Tue, 24 Dec 2024 23:29:58 +0100 Subject: [PATCH 049/128] disabling various logger calls --- gffquant/counters/alignment_counter.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gffquant/counters/alignment_counter.py b/gffquant/counters/alignment_counter.py index f47f0581..51a2dd28 100644 --- a/gffquant/counters/alignment_counter.py +++ b/gffquant/counters/alignment_counter.py @@ -241,12 +241,12 @@ def group_gene_count_matrix(self, refmgr): if g_key_index is None: g_key_index = ggroup_index[ggroup_id] = len(ggroup_index) self.counts[g_key_index] = gene_counts - logger.info("AC: group_gene_count_matrix - gene=%s new group=%s (%s) base counts=%s -> %s", gene_id, ggroup_id, g_key_index, str(self.counts[key_index]), str(self.counts[g_key_index]),) + # logger.info("AC: group_gene_count_matrix - gene=%s new group=%s (%s) base counts=%s -> %s", gene_id, ggroup_id, g_key_index, str(self.counts[key_index]), str(self.counts[g_key_index]),) else: # only add counts if group has been encountered before # else there will be duplicates self.counts[g_key_index] += gene_counts - logger.info("AC: group_gene_count_matrix - gene=%s group=%s (%s) adding counts=%s -> %s", gene_id, ggroup_id, g_key_index, str(self.counts[key_index]), str(self.counts[g_key_index]),) + # logger.info("AC: group_gene_count_matrix - gene=%s group=%s (%s) adding counts=%s -> %s", gene_id, ggroup_id, g_key_index, str(self.counts[key_index]), str(self.counts[g_key_index]),) # replace index with grouped index self.index = ggroup_index From 413684abfb2b1c437151f9c2ebd598efe9e0c7e7 Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Wed, 25 Dec 2024 14:51:11 +0100 Subject: [PATCH 050/128] trying to update feature count processing --- gffquant/annotation/count_writer.py | 77 +++++++++++++++------- gffquant/annotation/genecount_annotator.py | 61 ++++++++++++++++- gffquant/profilers/feature_quantifier.py | 21 +++--- 3 files changed, 126 insertions(+), 33 deletions(-) diff --git a/gffquant/annotation/count_writer.py b/gffquant/annotation/count_writer.py index e5959598..bd9d7788 100644 --- a/gffquant/annotation/count_writer.py +++ b/gffquant/annotation/count_writer.py @@ -109,6 +109,59 @@ def compile_block(raw, lnorm, scaling_factors): def write_row(header, data, stream=sys.stdout): print(header, *(f"{c:.5f}" for c in data), flush=True, sep="\t", file=stream) + def write_category(self, category, counts, index, names, unique_sf, ambig_sf, unannotated_reads=None, report_unseen=True): + # category, c_counts, c_index, c_names, u_sf, a_sf + if "scaled" in self.publish_reports: + logger.info( + "SCALING FACTORS %s %s %s", + category, unique_sf, ambig_sf, + ) + with gzip.open(f"{self.out_prefix}.{category}.txt.gz", "wt") as feat_out: + header = self.get_header() + print("feature", *header, sep="\t", file=feat_out) + + if unannotated_reads is not None: + print("unannotated", unannotated_reads, sep="\t", file=feat_out) + + if "total_readcount" in self.publish_reports: + CountWriter.write_row( + "total_reads", + np.zeros(len(header)) + self.total_readcount, + stream=feat_out, + ) + + if "filtered_readcount" in self.publish_reports: + CountWriter.write_row( + "filtered_reads", + np.zeros(len(header)) + self.filtered_readcount, + stream=feat_out, + ) + + if "category" in self.publish_reports: + # cat_counts = counts.get(f"cat:::{category_id}") + cat_counts = counts.get(0) + logger.info("CAT %s: %s", category, str(cat_counts)) + if cat_counts is not None: + cat_row = self.compile_output_row( + cat_counts, + # scaling_factor=featcounts.scaling_factors["total_uniq"], + # ambig_scaling_factor=featcounts.scaling_factors["total_ambi"], + scaling_factor=unique_sf, + ambig_scaling_factor=ambig_sf, + ) + CountWriter.write_row("category", cat_row, stream=feat_out) + + for fid, i in index.items(): + f_counts = np.array((counts[i][0], counts[i][2], counts[i][1], counts[i][3])) #counts[fid] + if report_unseen or f_counts.sum(): + out_row = self.compile_output_row( + f_counts, + scaling_factor=unique_sf, + ambig_scaling_factor=ambig_sf, + ) + CountWriter.write_row(names[fid], out_row, stream=feat_out) + + # pylint: disable=R0914 def write_feature_counts(self, db, featcounts, unannotated_reads=None, report_unseen=True): for category_id, counts in sorted(featcounts.items()): @@ -176,13 +229,6 @@ def write_gene_counts( with gzip.open(f"{self.out_prefix}.gene_counts.txt.gz", "wt") as gene_out: print("gene", *self.get_header(), sep="\t", file=gene_out, flush=True) - # for gene, g_counts in sorted(gene_counts.items()): - # out_row = self.compile_output_row( - # g_counts, - # scaling_factor=uniq_scaling_factor, - # ambig_scaling_factor=ambig_scaling_factor - # ) - # CountWriter.write_row(gene, out_row, stream=gene_out) ref_stream = ( ( refmgr.get(rid[0] if isinstance(rid, tuple) else rid)[0], @@ -206,20 +252,3 @@ def write_gene_counts( ) CountWriter.write_row(gene_id, out_row, stream=gene_out,) - - # for rid in gene_counts.get_all_regions(): - # counts = gene_counts.get_counts(rid) - # out_row = self.compile_output_row( - # counts, - # scaling_factor=uniq_scaling_factor, - # ambig_scaling_factor=ambig_scaling_factor, - # ) - # ref = refmgr.get(rid[0] if isinstance(rid, tuple) else rid)[0] - - # if gene_group_db: - # ref_tokens = ref.split(".") - # gene_id, _ = ".".join(ref_tokens[:-1]), ref_tokens[-1] - # else: - # gene_id = ref - - # CountWriter.write_row(gene_id, out_row, stream=gene_out,) diff --git a/gffquant/annotation/genecount_annotator.py b/gffquant/annotation/genecount_annotator.py index cbc74ff5..65d22b54 100644 --- a/gffquant/annotation/genecount_annotator.py +++ b/gffquant/annotation/genecount_annotator.py @@ -1,9 +1,12 @@ """ module docstring """ import logging +import numpy as np + from .count_annotator import CountAnnotator from .count_writer import CountWriter from ..counters import AlignmentCounter +from ..db.annotation_db import AnnotationDatabaseManager logger = logging.getLogger(__name__) @@ -16,7 +19,63 @@ def __init__(self, strand_specific, report_scaling_factors=True): """ __init__() """ CountAnnotator.__init__(self, strand_specific, report_scaling_factors=report_scaling_factors) - def annotate(self, refmgr, db, counter: AlignmentCounter, gene_group_db=False): + def annotate2(self, refmgr, db: AnnotationDatabaseManager, counter: AlignmentCounter, gene_group_db=False): + for it, category in enumerate(db.get_categories()): + features = tuple(db.get_features(category.id)) + # total_reads 483808.00000 483808.00000 483808.00000 483808.00000 483808.00000 483808.00000 + # filtered_reads 454437.00000 454437.00000 454437.00000 454437.00000 454437.00000 454437.00000 + # category 45359.50000 47.10706 42266.81963 152875.83896 224.72779 149853.25971 + category_counts = np.zeros( + (len(features) + 1, 2,), + dtype='float64', + ) + category_index = { + feature.id: i + for i, feature in enumerate(features, start=1) + } + category_names = { + feature.id: feature.name + for feature in features + } + for rid in counter.get_all_regions(): + counts = counter.get_counts(rid, strand_specific=self.strand_specific) + if gene_group_db: + gene_id, ggroup_id = rid, rid + else: + ref, _ = refmgr.get(rid[0] if isinstance(rid, tuple) else rid) + gene_id, ggroup_id = ref, ref + + region_annotation = db.query_sequence(ggroup_id) + if region_annotation is not None: + category_features = dict(region_annotation).get(category.id) + if category_features is not None: + category_counts[0] += counts # category row + for cf in category_features: + category_counts[category_index.get(cf)] += counts + + elif it == 0: + self.unannotated_counts += counts[:4] + + count_sums = self.counts.sum(axis=0) + + uniq_scaling_factor = (count_sums[0] / count_sums[2], 1.0)[count_sums[2] == 0] + ambig_scaling_factor = (count_sums[1] / count_sums[3], 1.0)[count_sums[3] == 0] + + logger.info( + "GCA:: %s CATEGORY COUNTS: uraw=%s unorm=%s araw=%s anorm=%s => SF: %s %s", + category.name, + count_sums[0], count_sums[2], count_sums[1], count_sums[3], + uniq_scaling_factor, ambig_scaling_factor, + ) + + yield category.name, category_counts, category_index, category_names, uniq_scaling_factor, ambig_scaling_factor + + + + + + + def annotate(self, refmgr, db: AnnotationDatabaseManager, counter: AlignmentCounter, gene_group_db=False): """ Annotate a set of gene counts with functional annotations. """ # self.total_gene_counts, u_sf, a_sf = counter.generate_gene_count_matrix(refmgr) # logger.info("TOTAL_GENE_COUNTS = %s", self.total_gene_counts) diff --git a/gffquant/profilers/feature_quantifier.py b/gffquant/profilers/feature_quantifier.py index 988202d3..81403321 100644 --- a/gffquant/profilers/feature_quantifier.py +++ b/gffquant/profilers/feature_quantifier.py @@ -154,14 +154,19 @@ def process_counters( count_annotator.annotate(self.reference_manager, self.adm, self.counter, gene_group_db=gene_group_db,) - unannotated_reads = self.counter.get_unannotated_reads() - unannotated_reads += self.aln_counter["unannotated_ambig"] - - count_writer.write_feature_counts( - self.adm, - count_annotator, - (None, unannotated_reads)[report_unannotated], - ) + # category.name, category_counts, category_index, uniq_scaling_factor, ambig_scaling_factor + for category, c_counts, c_index, c_names, u_sf, a_sf in count_annotator.annotate2(self.reference_manager, self.adm, self.counter, gene_group_db=gene_group_db,): + unannotated_reads = self.counter.get_unannotated_reads() + self.aln_counter["unannotated_ambig"] + count_writer.write_category(category, c_counts, c_index, c_names, u_sf, a_sf, unannotated_reads=(None, unannotated_reads)[report_unannotated],) + + # unannotated_reads = self.counter.get_unannotated_reads() + # unannotated_reads += self.aln_counter["unannotated_ambig"] + + # count_writer.write_feature_counts( + # self.adm, + # count_annotator, + # (None, unannotated_reads)[report_unannotated], + # ) # count_writer.write_gene_counts( # self.counter, From 1b5cf9c602effebe268ef136c3df38feafd35c9a Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Wed, 25 Dec 2024 15:01:42 +0100 Subject: [PATCH 051/128] trying to update feature count processing --- gffquant/profilers/feature_quantifier.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gffquant/profilers/feature_quantifier.py b/gffquant/profilers/feature_quantifier.py index 81403321..38be7233 100644 --- a/gffquant/profilers/feature_quantifier.py +++ b/gffquant/profilers/feature_quantifier.py @@ -152,7 +152,7 @@ def process_counters( self.counter.group_gene_count_matrix(self.reference_manager) - count_annotator.annotate(self.reference_manager, self.adm, self.counter, gene_group_db=gene_group_db,) + # count_annotator.annotate(self.reference_manager, self.adm, self.counter, gene_group_db=gene_group_db,) # category.name, category_counts, category_index, uniq_scaling_factor, ambig_scaling_factor for category, c_counts, c_index, c_names, u_sf, a_sf in count_annotator.annotate2(self.reference_manager, self.adm, self.counter, gene_group_db=gene_group_db,): From cb6ac1a58f29725d85813c935f47498c26e152bb Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Wed, 25 Dec 2024 15:02:37 +0100 Subject: [PATCH 052/128] trying to update feature count processing --- gffquant/profilers/feature_quantifier.py | 1 + 1 file changed, 1 insertion(+) diff --git a/gffquant/profilers/feature_quantifier.py b/gffquant/profilers/feature_quantifier.py index 38be7233..4a6cc96e 100644 --- a/gffquant/profilers/feature_quantifier.py +++ b/gffquant/profilers/feature_quantifier.py @@ -156,6 +156,7 @@ def process_counters( # category.name, category_counts, category_index, uniq_scaling_factor, ambig_scaling_factor for category, c_counts, c_index, c_names, u_sf, a_sf in count_annotator.annotate2(self.reference_manager, self.adm, self.counter, gene_group_db=gene_group_db,): + logger.info("PROCESSING CATEGORY=%s", category) unannotated_reads = self.counter.get_unannotated_reads() + self.aln_counter["unannotated_ambig"] count_writer.write_category(category, c_counts, c_index, c_names, u_sf, a_sf, unannotated_reads=(None, unannotated_reads)[report_unannotated],) From 8977c4d71bba5855a4dba757cce9d0813f7e655d Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Wed, 25 Dec 2024 16:23:15 +0100 Subject: [PATCH 053/128] trying to update feature count processing --- gffquant/annotation/genecount_annotator.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/gffquant/annotation/genecount_annotator.py b/gffquant/annotation/genecount_annotator.py index 65d22b54..3304b97f 100644 --- a/gffquant/annotation/genecount_annotator.py +++ b/gffquant/annotation/genecount_annotator.py @@ -46,7 +46,9 @@ def annotate2(self, refmgr, db: AnnotationDatabaseManager, counter: AlignmentCou gene_id, ggroup_id = ref, ref region_annotation = db.query_sequence(ggroup_id) + # logger.info("REGION_ANNOTATION: %s (%s)", str(region_annotation)) if region_annotation is not None: + _, _, region_annotation = region_annotation category_features = dict(region_annotation).get(category.id) if category_features is not None: category_counts[0] += counts # category row From f96e1b3b7a8baafa7054706d1bef0f8fd05c281e Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Wed, 25 Dec 2024 16:49:54 +0100 Subject: [PATCH 054/128] trying to update feature count processing --- gffquant/annotation/genecount_annotator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gffquant/annotation/genecount_annotator.py b/gffquant/annotation/genecount_annotator.py index 3304b97f..22a73ed1 100644 --- a/gffquant/annotation/genecount_annotator.py +++ b/gffquant/annotation/genecount_annotator.py @@ -58,7 +58,7 @@ def annotate2(self, refmgr, db: AnnotationDatabaseManager, counter: AlignmentCou elif it == 0: self.unannotated_counts += counts[:4] - count_sums = self.counts.sum(axis=0) + count_sums = counter.sum(axis=0) uniq_scaling_factor = (count_sums[0] / count_sums[2], 1.0)[count_sums[2] == 0] ambig_scaling_factor = (count_sums[1] / count_sums[3], 1.0)[count_sums[3] == 0] From b4fb4d8fd30d329da0d77200dfbbadfe06e8c944 Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Wed, 25 Dec 2024 17:08:03 +0100 Subject: [PATCH 055/128] trying to update feature count processing --- gffquant/annotation/genecount_annotator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gffquant/annotation/genecount_annotator.py b/gffquant/annotation/genecount_annotator.py index 22a73ed1..85bbe569 100644 --- a/gffquant/annotation/genecount_annotator.py +++ b/gffquant/annotation/genecount_annotator.py @@ -58,7 +58,7 @@ def annotate2(self, refmgr, db: AnnotationDatabaseManager, counter: AlignmentCou elif it == 0: self.unannotated_counts += counts[:4] - count_sums = counter.sum(axis=0) + count_sums = counter.counts.sum(axis=0) uniq_scaling_factor = (count_sums[0] / count_sums[2], 1.0)[count_sums[2] == 0] ambig_scaling_factor = (count_sums[1] / count_sums[3], 1.0)[count_sums[3] == 0] From 3594dfff13e72e475e394466044e002881354e70 Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Wed, 25 Dec 2024 17:19:38 +0100 Subject: [PATCH 056/128] trying to update feature count processing --- gffquant/annotation/count_writer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gffquant/annotation/count_writer.py b/gffquant/annotation/count_writer.py index bd9d7788..f4272cd7 100644 --- a/gffquant/annotation/count_writer.py +++ b/gffquant/annotation/count_writer.py @@ -139,7 +139,7 @@ def write_category(self, category, counts, index, names, unique_sf, ambig_sf, un if "category" in self.publish_reports: # cat_counts = counts.get(f"cat:::{category_id}") - cat_counts = counts.get(0) + cat_counts = np.array((counts[0][0], counts[0][2], counts[0][1], counts[0][3])) logger.info("CAT %s: %s", category, str(cat_counts)) if cat_counts is not None: cat_row = self.compile_output_row( From 0dbd4ae5e723b3396ef9f0fdde4a0752bdaae778 Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Wed, 25 Dec 2024 17:45:44 +0100 Subject: [PATCH 057/128] trying to update feature count processing --- gffquant/annotation/genecount_annotator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gffquant/annotation/genecount_annotator.py b/gffquant/annotation/genecount_annotator.py index 85bbe569..a7768e56 100644 --- a/gffquant/annotation/genecount_annotator.py +++ b/gffquant/annotation/genecount_annotator.py @@ -26,7 +26,7 @@ def annotate2(self, refmgr, db: AnnotationDatabaseManager, counter: AlignmentCou # filtered_reads 454437.00000 454437.00000 454437.00000 454437.00000 454437.00000 454437.00000 # category 45359.50000 47.10706 42266.81963 152875.83896 224.72779 149853.25971 category_counts = np.zeros( - (len(features) + 1, 2,), + (len(features) + 1, 4,), dtype='float64', ) category_index = { From 7b8e6bf18e9c7d8d8d178512f8f74278d49a6d0b Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Wed, 25 Dec 2024 20:15:28 +0100 Subject: [PATCH 058/128] debug log --- gffquant/annotation/genecount_annotator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gffquant/annotation/genecount_annotator.py b/gffquant/annotation/genecount_annotator.py index a7768e56..b6c6e5c6 100644 --- a/gffquant/annotation/genecount_annotator.py +++ b/gffquant/annotation/genecount_annotator.py @@ -46,7 +46,7 @@ def annotate2(self, refmgr, db: AnnotationDatabaseManager, counter: AlignmentCou gene_id, ggroup_id = ref, ref region_annotation = db.query_sequence(ggroup_id) - # logger.info("REGION_ANNOTATION: %s (%s)", str(region_annotation)) + logger.info("REGION_ANNOTATION: %s (%s)", str(region_annotation), ggroup_id) if region_annotation is not None: _, _, region_annotation = region_annotation category_features = dict(region_annotation).get(category.id) From 563b4023855e703b43b9ef7a84d515643cc348fb Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Wed, 25 Dec 2024 20:29:07 +0100 Subject: [PATCH 059/128] trying to fix annotate2 --- gffquant/annotation/genecount_annotator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gffquant/annotation/genecount_annotator.py b/gffquant/annotation/genecount_annotator.py index b6c6e5c6..bbe4da36 100644 --- a/gffquant/annotation/genecount_annotator.py +++ b/gffquant/annotation/genecount_annotator.py @@ -49,7 +49,7 @@ def annotate2(self, refmgr, db: AnnotationDatabaseManager, counter: AlignmentCou logger.info("REGION_ANNOTATION: %s (%s)", str(region_annotation), ggroup_id) if region_annotation is not None: _, _, region_annotation = region_annotation - category_features = dict(region_annotation).get(category.id) + category_features = dict(region_annotation).get(str(category.id)) if category_features is not None: category_counts[0] += counts # category row for cf in category_features: From e9e171565bfb8b5a3bc1630d30b7a19db12bc3d2 Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Wed, 25 Dec 2024 21:16:12 +0100 Subject: [PATCH 060/128] turn off annotate2 log --- gffquant/annotation/genecount_annotator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gffquant/annotation/genecount_annotator.py b/gffquant/annotation/genecount_annotator.py index bbe4da36..a0967e66 100644 --- a/gffquant/annotation/genecount_annotator.py +++ b/gffquant/annotation/genecount_annotator.py @@ -46,7 +46,7 @@ def annotate2(self, refmgr, db: AnnotationDatabaseManager, counter: AlignmentCou gene_id, ggroup_id = ref, ref region_annotation = db.query_sequence(ggroup_id) - logger.info("REGION_ANNOTATION: %s (%s)", str(region_annotation), ggroup_id) + # logger.info("REGION_ANNOTATION: %s (%s)", str(region_annotation), ggroup_id) if region_annotation is not None: _, _, region_annotation = region_annotation category_features = dict(region_annotation).get(str(category.id)) From 07c674377fbc17b1a886ad26fc088b629782bdb6 Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Wed, 25 Dec 2024 22:00:08 +0100 Subject: [PATCH 061/128] trying to update feature count processing --- gffquant/annotation/genecount_annotator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gffquant/annotation/genecount_annotator.py b/gffquant/annotation/genecount_annotator.py index a0967e66..4de54ad9 100644 --- a/gffquant/annotation/genecount_annotator.py +++ b/gffquant/annotation/genecount_annotator.py @@ -53,7 +53,7 @@ def annotate2(self, refmgr, db: AnnotationDatabaseManager, counter: AlignmentCou if category_features is not None: category_counts[0] += counts # category row for cf in category_features: - category_counts[category_index.get(cf)] += counts + category_counts[category_index.get(int(cf))] += counts elif it == 0: self.unannotated_counts += counts[:4] From da8e93baa8c7145f78e037f016bf787020d58f42 Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Thu, 26 Dec 2024 00:27:17 +0100 Subject: [PATCH 062/128] trying to update feature count processing --- gffquant/annotation/count_writer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gffquant/annotation/count_writer.py b/gffquant/annotation/count_writer.py index f4272cd7..4b4fc4e0 100644 --- a/gffquant/annotation/count_writer.py +++ b/gffquant/annotation/count_writer.py @@ -139,7 +139,7 @@ def write_category(self, category, counts, index, names, unique_sf, ambig_sf, un if "category" in self.publish_reports: # cat_counts = counts.get(f"cat:::{category_id}") - cat_counts = np.array((counts[0][0], counts[0][2], counts[0][1], counts[0][3])) + cat_counts = counts[0] # np.array((counts[0][0], counts[0][2], counts[0][1], counts[0][3])) logger.info("CAT %s: %s", category, str(cat_counts)) if cat_counts is not None: cat_row = self.compile_output_row( @@ -152,7 +152,7 @@ def write_category(self, category, counts, index, names, unique_sf, ambig_sf, un CountWriter.write_row("category", cat_row, stream=feat_out) for fid, i in index.items(): - f_counts = np.array((counts[i][0], counts[i][2], counts[i][1], counts[i][3])) #counts[fid] + f_counts = counts[i] # np.array((counts[i][0], counts[i][2], counts[i][1], counts[i][3])) #counts[fid] if report_unseen or f_counts.sum(): out_row = self.compile_output_row( f_counts, From a94d9e7e782a99da3f445b5ba0a3b7bee3df7282 Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Thu, 26 Dec 2024 01:47:39 +0100 Subject: [PATCH 063/128] trying to update feature count processing --- gffquant/annotation/genecount_annotator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gffquant/annotation/genecount_annotator.py b/gffquant/annotation/genecount_annotator.py index 4de54ad9..e77c15ef 100644 --- a/gffquant/annotation/genecount_annotator.py +++ b/gffquant/annotation/genecount_annotator.py @@ -58,7 +58,7 @@ def annotate2(self, refmgr, db: AnnotationDatabaseManager, counter: AlignmentCou elif it == 0: self.unannotated_counts += counts[:4] - count_sums = counter.counts.sum(axis=0) + count_sums = counter.counts[1:].sum(axis=0) uniq_scaling_factor = (count_sums[0] / count_sums[2], 1.0)[count_sums[2] == 0] ambig_scaling_factor = (count_sums[1] / count_sums[3], 1.0)[count_sums[3] == 0] From 164cc6d51074938c43cf35a21924d12c9c29d853 Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Thu, 26 Dec 2024 21:42:26 +0100 Subject: [PATCH 064/128] trying to update feature count processing --- gffquant/annotation/genecount_annotator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gffquant/annotation/genecount_annotator.py b/gffquant/annotation/genecount_annotator.py index e77c15ef..363efa17 100644 --- a/gffquant/annotation/genecount_annotator.py +++ b/gffquant/annotation/genecount_annotator.py @@ -58,7 +58,7 @@ def annotate2(self, refmgr, db: AnnotationDatabaseManager, counter: AlignmentCou elif it == 0: self.unannotated_counts += counts[:4] - count_sums = counter.counts[1:].sum(axis=0) + count_sums = category_counts[1:].sum(axis=0) uniq_scaling_factor = (count_sums[0] / count_sums[2], 1.0)[count_sums[2] == 0] ambig_scaling_factor = (count_sums[1] / count_sums[3], 1.0)[count_sums[3] == 0] From 0cdf49af70c2a47bd4c6e6e489602f6bc3ea5013 Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Thu, 26 Dec 2024 22:56:21 +0100 Subject: [PATCH 065/128] trying to update feature count processing --- gffquant/annotation/genecount_annotator.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/gffquant/annotation/genecount_annotator.py b/gffquant/annotation/genecount_annotator.py index 363efa17..36b597eb 100644 --- a/gffquant/annotation/genecount_annotator.py +++ b/gffquant/annotation/genecount_annotator.py @@ -60,13 +60,14 @@ def annotate2(self, refmgr, db: AnnotationDatabaseManager, counter: AlignmentCou count_sums = category_counts[1:].sum(axis=0) - uniq_scaling_factor = (count_sums[0] / count_sums[2], 1.0)[count_sums[2] == 0] - ambig_scaling_factor = (count_sums[1] / count_sums[3], 1.0)[count_sums[3] == 0] + uniq_scaling_factor = (count_sums[0] / count_sums[1], 1.0)[count_sums[1] == 0] + ambig_scaling_factor = (count_sums[2] / count_sums[3], 1.0)[count_sums[3] == 0] logger.info( "GCA:: %s CATEGORY COUNTS: uraw=%s unorm=%s araw=%s anorm=%s => SF: %s %s", category.name, - count_sums[0], count_sums[2], count_sums[1], count_sums[3], + # count_sums[0], count_sums[1], count_sums[1], count_sums[3], + *count_sums, uniq_scaling_factor, ambig_scaling_factor, ) From 3b2b5cb6b50c1ff984b4157be9fa804368c6967e Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Fri, 27 Dec 2024 01:40:57 +0100 Subject: [PATCH 066/128] trying to update feature count processing --- gffquant/annotation/genecount_annotator.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/gffquant/annotation/genecount_annotator.py b/gffquant/annotation/genecount_annotator.py index 36b597eb..63150c46 100644 --- a/gffquant/annotation/genecount_annotator.py +++ b/gffquant/annotation/genecount_annotator.py @@ -60,8 +60,15 @@ def annotate2(self, refmgr, db: AnnotationDatabaseManager, counter: AlignmentCou count_sums = category_counts[1:].sum(axis=0) - uniq_scaling_factor = (count_sums[0] / count_sums[1], 1.0)[count_sums[1] == 0] - ambig_scaling_factor = (count_sums[2] / count_sums[3], 1.0)[count_sums[3] == 0] + # should scaled counts use a factor derived from all counts or should multi-feature counts only contribute once? + # uniq_scaling_factor = (count_sums[0] / count_sums[1], 1.0)[count_sums[1] == 0] + # ambig_scaling_factor = (count_sums[2] / count_sums[3], 1.0)[count_sums[3] == 0] + + uniq_scaling_factor, ambig_scaling_factor = 1.0, 1.0 + if category_counts[0][1]: + uniq_scaling_factor = category_counts[0][0] / category_counts[0][1] + if category_counts[0][3]: + ambig_scaling_factor = category_counts[0][2] / category_counts[0][3] logger.info( "GCA:: %s CATEGORY COUNTS: uraw=%s unorm=%s araw=%s anorm=%s => SF: %s %s", From cb9934e9157166804b3b26b3f41bb4a65df9b133 Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Fri, 27 Dec 2024 11:31:24 +0100 Subject: [PATCH 067/128] added category scaling comment --- gffquant/annotation/genecount_annotator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gffquant/annotation/genecount_annotator.py b/gffquant/annotation/genecount_annotator.py index 63150c46..277a11a8 100644 --- a/gffquant/annotation/genecount_annotator.py +++ b/gffquant/annotation/genecount_annotator.py @@ -63,7 +63,7 @@ def annotate2(self, refmgr, db: AnnotationDatabaseManager, counter: AlignmentCou # should scaled counts use a factor derived from all counts or should multi-feature counts only contribute once? # uniq_scaling_factor = (count_sums[0] / count_sums[1], 1.0)[count_sums[1] == 0] # ambig_scaling_factor = (count_sums[2] / count_sums[3], 1.0)[count_sums[3] == 0] - + # pre 2.19 category count scaling was based on total counts uniq_scaling_factor, ambig_scaling_factor = 1.0, 1.0 if category_counts[0][1]: uniq_scaling_factor = category_counts[0][0] / category_counts[0][1] From 4e119a4b32a49c226d7da2a8ea9c8d554aae5f8f Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Fri, 27 Dec 2024 23:20:55 +0100 Subject: [PATCH 068/128] linting + obsolete code removal --- gffquant/alignment/aln_group.py | 3 +- gffquant/annotation/count_annotator.py | 2 - gffquant/annotation/count_writer.py | 76 ++++---------------- gffquant/annotation/genecount_annotator.py | 46 ++++++------ gffquant/annotation/regioncount_annotator.py | 2 + gffquant/counters/alignment_counter.py | 19 +++-- gffquant/profilers/feature_quantifier.py | 40 +++++------ 7 files changed, 67 insertions(+), 121 deletions(-) diff --git a/gffquant/alignment/aln_group.py b/gffquant/alignment/aln_group.py index b25f5758..b465df46 100644 --- a/gffquant/alignment/aln_group.py +++ b/gffquant/alignment/aln_group.py @@ -79,7 +79,8 @@ def get_all_hits(self, as_ambiguous=False): except TypeError as err: raise TypeError(f"Cannot derive sequencing library from tags: {aln.tags}") from err - # in region mode, there can be more hits (if the alignment overlaps multiple features of the target sequence) + # in region mode, there can be more hits + # (if the alignment overlaps multiple features of the target sequence) # in gene mode, each alignment is a hit, i.e. there is at most 1 hit / alignment yield aln.hits, n_aln diff --git a/gffquant/annotation/count_annotator.py b/gffquant/annotation/count_annotator.py index 3598d285..601eb865 100644 --- a/gffquant/annotation/count_annotator.py +++ b/gffquant/annotation/count_annotator.py @@ -8,8 +8,6 @@ import numpy as np -from ..counters import AlignmentCounter - logger = logging.getLogger(__name__) diff --git a/gffquant/annotation/count_writer.py b/gffquant/annotation/count_writer.py index 4b4fc4e0..417404e2 100644 --- a/gffquant/annotation/count_writer.py +++ b/gffquant/annotation/count_writer.py @@ -1,4 +1,4 @@ -# pylint: disable=C0103,W1514,R0913,R0917 +# pylint: disable=C0103,W1514,R0913,R0917,R0914 """ module docstring """ @@ -109,7 +109,17 @@ def compile_block(raw, lnorm, scaling_factors): def write_row(header, data, stream=sys.stdout): print(header, *(f"{c:.5f}" for c in data), flush=True, sep="\t", file=stream) - def write_category(self, category, counts, index, names, unique_sf, ambig_sf, unannotated_reads=None, report_unseen=True): + def write_category( + self, + category, + counts, + index, + names, + unique_sf, + ambig_sf, + unannotated_reads=None, + report_unseen=True, + ): # category, c_counts, c_index, c_names, u_sf, a_sf if "scaled" in self.publish_reports: logger.info( @@ -138,21 +148,18 @@ def write_category(self, category, counts, index, names, unique_sf, ambig_sf, un ) if "category" in self.publish_reports: - # cat_counts = counts.get(f"cat:::{category_id}") - cat_counts = counts[0] # np.array((counts[0][0], counts[0][2], counts[0][1], counts[0][3])) + cat_counts = counts[0] logger.info("CAT %s: %s", category, str(cat_counts)) if cat_counts is not None: cat_row = self.compile_output_row( cat_counts, - # scaling_factor=featcounts.scaling_factors["total_uniq"], - # ambig_scaling_factor=featcounts.scaling_factors["total_ambi"], scaling_factor=unique_sf, ambig_scaling_factor=ambig_sf, ) CountWriter.write_row("category", cat_row, stream=feat_out) for fid, i in index.items(): - f_counts = counts[i] # np.array((counts[i][0], counts[i][2], counts[i][1], counts[i][3])) #counts[fid] + f_counts = counts[i] if report_unseen or f_counts.sum(): out_row = self.compile_output_row( f_counts, @@ -161,61 +168,6 @@ def write_category(self, category, counts, index, names, unique_sf, ambig_sf, un ) CountWriter.write_row(names[fid], out_row, stream=feat_out) - - # pylint: disable=R0914 - def write_feature_counts(self, db, featcounts, unannotated_reads=None, report_unseen=True): - for category_id, counts in sorted(featcounts.items()): - scaling_factor, ambig_scaling_factor = featcounts.scaling_factors[ - category_id - ] - category = db.query_category(category_id).name - if "scaled" in self.publish_reports: - logger.info( - "SCALING FACTORS %s %s %s", - category, scaling_factor, ambig_scaling_factor - ) - with gzip.open(f"{self.out_prefix}.{category}.txt.gz", "wt") as feat_out: - header = self.get_header() - print("feature", *header, sep="\t", file=feat_out) - - if unannotated_reads is not None: - print("unannotated", unannotated_reads, sep="\t", file=feat_out) - - if "total_readcount" in self.publish_reports: - CountWriter.write_row( - "total_reads", - np.zeros(len(header)) + self.total_readcount, - stream=feat_out, - ) - - if "filtered_readcount" in self.publish_reports: - CountWriter.write_row( - "filtered_reads", - np.zeros(len(header)) + self.filtered_readcount, - stream=feat_out, - ) - - if "category" in self.publish_reports: - cat_counts = counts.get(f"cat:::{category_id}") - logger.info("CAT %s: %s", category_id, str(cat_counts)) - if cat_counts is not None: - cat_row = self.compile_output_row( - cat_counts, - scaling_factor=featcounts.scaling_factors["total_uniq"], - ambig_scaling_factor=featcounts.scaling_factors["total_ambi"], - ) - CountWriter.write_row("category", cat_row, stream=feat_out) - - for feature in db.get_features(category_id): - f_counts = counts.get(str(feature.id), np.zeros(len(header))) - if report_unseen or f_counts.sum(): - out_row = self.compile_output_row( - f_counts, - scaling_factor=scaling_factor, - ambig_scaling_factor=ambig_scaling_factor, - ) - CountWriter.write_row(feature.name, out_row, stream=feat_out) - def write_gene_counts( self, gene_counts: AlignmentCounter, diff --git a/gffquant/annotation/genecount_annotator.py b/gffquant/annotation/genecount_annotator.py index 277a11a8..304a9e6a 100644 --- a/gffquant/annotation/genecount_annotator.py +++ b/gffquant/annotation/genecount_annotator.py @@ -1,10 +1,11 @@ +# pylint: disable=R0914 + """ module docstring """ import logging import numpy as np from .count_annotator import CountAnnotator -from .count_writer import CountWriter from ..counters import AlignmentCounter from ..db.annotation_db import AnnotationDatabaseManager @@ -20,11 +21,8 @@ def __init__(self, strand_specific, report_scaling_factors=True): CountAnnotator.__init__(self, strand_specific, report_scaling_factors=report_scaling_factors) def annotate2(self, refmgr, db: AnnotationDatabaseManager, counter: AlignmentCounter, gene_group_db=False): - for it, category in enumerate(db.get_categories()): + for category in db.get_categories(): features = tuple(db.get_features(category.id)) - # total_reads 483808.00000 483808.00000 483808.00000 483808.00000 483808.00000 483808.00000 - # filtered_reads 454437.00000 454437.00000 454437.00000 454437.00000 454437.00000 454437.00000 - # category 45359.50000 47.10706 42266.81963 152875.83896 224.72779 149853.25971 category_counts = np.zeros( (len(features) + 1, 4,), dtype='float64', @@ -40,10 +38,12 @@ def annotate2(self, refmgr, db: AnnotationDatabaseManager, counter: AlignmentCou for rid in counter.get_all_regions(): counts = counter.get_counts(rid, strand_specific=self.strand_specific) if gene_group_db: - gene_id, ggroup_id = rid, rid + # gene_id, ggroup_id = rid, rid + ggroup_id = rid else: ref, _ = refmgr.get(rid[0] if isinstance(rid, tuple) else rid) - gene_id, ggroup_id = ref, ref + # gene_id, ggroup_id = ref, ref + ggroup_id = ref region_annotation = db.query_sequence(ggroup_id) # logger.info("REGION_ANNOTATION: %s (%s)", str(region_annotation), ggroup_id) @@ -55,12 +55,13 @@ def annotate2(self, refmgr, db: AnnotationDatabaseManager, counter: AlignmentCou for cf in category_features: category_counts[category_index.get(int(cf))] += counts - elif it == 0: - self.unannotated_counts += counts[:4] - + # elif it == 0: + # self.unannotated_counts += counts[:4] + count_sums = category_counts[1:].sum(axis=0) - # should scaled counts use a factor derived from all counts or should multi-feature counts only contribute once? + # should scaled counts use a factor derived from all counts + # or should multi-feature counts only contribute once? # uniq_scaling_factor = (count_sums[0] / count_sums[1], 1.0)[count_sums[1] == 0] # ambig_scaling_factor = (count_sums[2] / count_sums[3], 1.0)[count_sums[3] == 0] # pre 2.19 category count scaling was based on total counts @@ -75,15 +76,17 @@ def annotate2(self, refmgr, db: AnnotationDatabaseManager, counter: AlignmentCou category.name, # count_sums[0], count_sums[1], count_sums[1], count_sums[3], *count_sums, - uniq_scaling_factor, ambig_scaling_factor, + uniq_scaling_factor, ambig_scaling_factor, ) - yield category.name, category_counts, category_index, category_names, uniq_scaling_factor, ambig_scaling_factor - - - - - + yield ( + category.name, + category_counts, + category_index, + category_names, + uniq_scaling_factor, + ambig_scaling_factor, + ) def annotate(self, refmgr, db: AnnotationDatabaseManager, counter: AlignmentCounter, gene_group_db=False): """ Annotate a set of gene counts with functional annotations. """ @@ -105,15 +108,16 @@ def annotate(self, refmgr, db: AnnotationDatabaseManager, counter: AlignmentCoun for rid in counter.get_all_regions(): counts = counter.get_counts(rid, strand_specific=self.strand_specific) - if gene_group_db: # ref_tokens = ref.split(".") # gene_id, ggroup_id = ".".join(ref_tokens[:-1]), ref_tokens[-1] - gene_id, ggroup_id = rid, rid + # gene_id, ggroup_id = rid, rid + ggroup_id = rid else: ref, _ = refmgr.get(rid[0] if isinstance(rid, tuple) else rid) - gene_id, ggroup_id = ref, ref + # gene_id, ggroup_id = ref, ref + ggroup_id = ref region_annotation = db.query_sequence(ggroup_id) if region_annotation is not None: diff --git a/gffquant/annotation/regioncount_annotator.py b/gffquant/annotation/regioncount_annotator.py index 6d719413..84f85163 100644 --- a/gffquant/annotation/regioncount_annotator.py +++ b/gffquant/annotation/regioncount_annotator.py @@ -1,3 +1,5 @@ +""" module docstring """ + import numpy as np from . import CountAnnotator diff --git a/gffquant/counters/alignment_counter.py b/gffquant/counters/alignment_counter.py index 51a2dd28..aad80de5 100644 --- a/gffquant/counters/alignment_counter.py +++ b/gffquant/counters/alignment_counter.py @@ -130,7 +130,11 @@ def update(self, count_stream, ambiguous_counts=False, pair=False, pe_library=No return contributed_counts def get_unannotated_reads(self): - return self.unannotated_reads + # return self.unannotated_reads + no_annotation = self.index.get("c591b65a0f4cd46d5125745a40c8c056") + if no_annotation is not None: + return self.counts[no_annotation][0] + return 0.0 def get_counts(self, seqid, strand_specific=False): if strand_specific: @@ -224,35 +228,28 @@ def generate_gene_count_matrix(self, refmgr): logger.info( "AC:: TOTAL GENE COUNTS: uraw=%s unorm=%s araw=%s anorm=%s => SF: %s %s", count_sums[0], count_sums[2], count_sums[1], count_sums[3], - uniq_scaling_factor, ambig_scaling_factor, + uniq_scaling_factor, ambig_scaling_factor, ) # return count sums and scaling factors return count_sums, uniq_scaling_factor, ambig_scaling_factor - + def group_gene_count_matrix(self, refmgr): ggroup_index = {} for key, key_index in self.index.items(): ref = (refmgr.get(key[0] if isinstance(key, tuple) else key))[0] ref_tokens = ref.split(".") - gene_id, ggroup_id = ".".join(ref_tokens[:-1]), ref_tokens[-1] + _, ggroup_id = ".".join(ref_tokens[:-1]), ref_tokens[-1] g_key_index = ggroup_index.get(ggroup_id) gene_counts = self.counts[key_index] if g_key_index is None: g_key_index = ggroup_index[ggroup_id] = len(ggroup_index) self.counts[g_key_index] = gene_counts - # logger.info("AC: group_gene_count_matrix - gene=%s new group=%s (%s) base counts=%s -> %s", gene_id, ggroup_id, g_key_index, str(self.counts[key_index]), str(self.counts[g_key_index]),) else: - # only add counts if group has been encountered before - # else there will be duplicates self.counts[g_key_index] += gene_counts - # logger.info("AC: group_gene_count_matrix - gene=%s group=%s (%s) adding counts=%s -> %s", gene_id, ggroup_id, g_key_index, str(self.counts[key_index]), str(self.counts[g_key_index]),) # replace index with grouped index self.index = ggroup_index # remove the un-indexed (ungrouped) rows self.counts = self.counts[0:len(self.index), :] - - - diff --git a/gffquant/profilers/feature_quantifier.py b/gffquant/profilers/feature_quantifier.py index 4a6cc96e..d1480f57 100644 --- a/gffquant/profilers/feature_quantifier.py +++ b/gffquant/profilers/feature_quantifier.py @@ -142,7 +142,6 @@ def process_counters( total_gene_counts, u_sf, a_sf = self.counter.generate_gene_count_matrix(self.reference_manager) logger.info("TOTAL_GENE_COUNTS = %s", total_gene_counts) - count_writer.write_gene_counts( self.counter, self.reference_manager, @@ -151,31 +150,24 @@ def process_counters( ) self.counter.group_gene_count_matrix(self.reference_manager) + unannotated_reads = self.counter.get_unannotated_reads() + self.aln_counter["unannotated_ambig"] - # count_annotator.annotate(self.reference_manager, self.adm, self.counter, gene_group_db=gene_group_db,) - - # category.name, category_counts, category_index, uniq_scaling_factor, ambig_scaling_factor - for category, c_counts, c_index, c_names, u_sf, a_sf in count_annotator.annotate2(self.reference_manager, self.adm, self.counter, gene_group_db=gene_group_db,): + for category, c_counts, c_index, c_names, u_sf, a_sf in count_annotator.annotate2( + self.reference_manager, + self.adm, + self.counter, + gene_group_db=gene_group_db, + ): logger.info("PROCESSING CATEGORY=%s", category) - unannotated_reads = self.counter.get_unannotated_reads() + self.aln_counter["unannotated_ambig"] - count_writer.write_category(category, c_counts, c_index, c_names, u_sf, a_sf, unannotated_reads=(None, unannotated_reads)[report_unannotated],) - - # unannotated_reads = self.counter.get_unannotated_reads() - # unannotated_reads += self.aln_counter["unannotated_ambig"] - - # count_writer.write_feature_counts( - # self.adm, - # count_annotator, - # (None, unannotated_reads)[report_unannotated], - # ) - - # count_writer.write_gene_counts( - # self.counter, - # self.reference_manager, - # count_annotator.scaling_factors["total_gene_uniq"], - # count_annotator.scaling_factors["total_gene_ambi"], - # gene_group_db=gene_group_db, - # ) + count_writer.write_category( + category, + c_counts, + c_index, + c_names, + u_sf, + a_sf, + unannotated_reads=(None, unannotated_reads)[report_unannotated], + ) self.adm.clear_caches() From 90eb4b504300da27456acfd06a9f3820a1c18bcf Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Mon, 30 Dec 2024 00:55:46 +0100 Subject: [PATCH 069/128] trying to optimise scaling factors, temp. disabled feature counts --- gffquant/annotation/count_writer.py | 17 ++++++----- gffquant/annotation/genecount_annotator.py | 14 --------- gffquant/counters/alignment_counter.py | 33 +++++++++++++++++----- gffquant/profilers/feature_quantifier.py | 32 ++++++++++----------- 4 files changed, 52 insertions(+), 44 deletions(-) diff --git a/gffquant/annotation/count_writer.py b/gffquant/annotation/count_writer.py index 417404e2..a33cef3f 100644 --- a/gffquant/annotation/count_writer.py +++ b/gffquant/annotation/count_writer.py @@ -76,6 +76,7 @@ def compile_block(raw, lnorm, scaling_factors): p, row = 0, [] rpkm_factor = 1e9 / self.filtered_readcount + # unique counts row += compile_block(*counts[p:p + 2], (scaling_factor, rpkm_factor,)) p += 2 @@ -190,17 +191,19 @@ def write_gene_counts( ) for ref, rid in sorted(ref_stream): - counts = gene_counts.get_counts(rid) + # counts = gene_counts.get_counts(rid) + counts = gene_counts[rid] if gene_group_db: ref_tokens = ref.split(".") gene_id, _ = ".".join(ref_tokens[:-1]), ref_tokens[-1] else: gene_id = ref - out_row = self.compile_output_row( - counts, - scaling_factor=uniq_scaling_factor, - ambig_scaling_factor=ambig_scaling_factor, - ) + # out_row = self.compile_output_row( + # counts, + # scaling_factor=uniq_scaling_factor, + # ambig_scaling_factor=ambig_scaling_factor, + # ) - CountWriter.write_row(gene_id, out_row, stream=gene_out,) + # CountWriter.write_row(gene_id, out_row, stream=gene_out,) + CountWriter.write_row(gene_id, counts, stream=gene_out,) diff --git a/gffquant/annotation/genecount_annotator.py b/gffquant/annotation/genecount_annotator.py index 304a9e6a..80af4515 100644 --- a/gffquant/annotation/genecount_annotator.py +++ b/gffquant/annotation/genecount_annotator.py @@ -74,7 +74,6 @@ def annotate2(self, refmgr, db: AnnotationDatabaseManager, counter: AlignmentCou logger.info( "GCA:: %s CATEGORY COUNTS: uraw=%s unorm=%s araw=%s anorm=%s => SF: %s %s", category.name, - # count_sums[0], count_sums[1], count_sums[1], count_sums[3], *count_sums, uniq_scaling_factor, ambig_scaling_factor, ) @@ -90,15 +89,6 @@ def annotate2(self, refmgr, db: AnnotationDatabaseManager, counter: AlignmentCou def annotate(self, refmgr, db: AnnotationDatabaseManager, counter: AlignmentCounter, gene_group_db=False): """ Annotate a set of gene counts with functional annotations. """ - # self.total_gene_counts, u_sf, a_sf = counter.generate_gene_count_matrix(refmgr) - # logger.info("TOTAL_GENE_COUNTS = %s", self.total_gene_counts) - - # writer.write_gene_counts( - # counter, - # refmgr, - # u_sf, a_sf, - # gene_group_db=gene_group_db, - # ) # formerly used in compute_count_vector strand_specific_counts = ( @@ -122,10 +112,6 @@ def annotate(self, refmgr, db: AnnotationDatabaseManager, counter: AlignmentCoun region_annotation = db.query_sequence(ggroup_id) if region_annotation is not None: _, _, region_annotation = region_annotation - # logger.info( - # "GCAnnotator: Distributing counts of Gene %s (group=%s) %s %s", - # gene_id, ggroup_id, counts[0], counts[2], - # ) self.distribute_feature_counts(counts, region_annotation) else: diff --git a/gffquant/counters/alignment_counter.py b/gffquant/counters/alignment_counter.py index aad80de5..c136759f 100644 --- a/gffquant/counters/alignment_counter.py +++ b/gffquant/counters/alignment_counter.py @@ -213,26 +213,45 @@ def generate_gene_count_matrix(self, refmgr): # duplicate the raw counts self.counts = np.concatenate( - (self.counts, self.counts,), + #(self.counts, self.counts, self.counts,), + ( + self.counts[:, 0], self.counts[:, 0], self.counts[:, 0], # 0, 1, 2 + self.counts[:, 1], self.counts[:, 1], self.counts[:, 1], # 3, 4, 5 + ), axis=1, ) # length-normalise the lnorm columns - self.counts[:, 2:4] /= lengths[:, None] + # self.counts[:, 2:4] /= lengths[:, None] + self.counts[:, 1::2] /= lengths[:, None] count_sums = self.counts.sum(axis=0) - uniq_scaling_factor = (count_sums[0] / count_sums[2], 1.0)[count_sums[2] == 0] - ambig_scaling_factor = (count_sums[1] / count_sums[3], 1.0)[count_sums[3] == 0] + # uniq_scaling_factor = (count_sums[0] / count_sums[2], 1.0)[count_sums[2] == 0] + # ambig_scaling_factor = (count_sums[1] / count_sums[3], 1.0)[count_sums[3] == 0] + uniq_scaling_factor, combined_scaling_factor = ( + AlignmentCounter.calculate_scaling_factor(*count_sums[0:2]), + AlignmentCounter.calculate_scaling_factor(*count_sums[3:5]), + ) logger.info( - "AC:: TOTAL GENE COUNTS: uraw=%s unorm=%s araw=%s anorm=%s => SF: %s %s", + "AC:: TOTAL GENE COUNTS: uraw=%s unorm=%s craw=%s cnorm=%s => SF: %s %s", count_sums[0], count_sums[2], count_sums[1], count_sums[3], - uniq_scaling_factor, ambig_scaling_factor, + uniq_scaling_factor, combined_scaling_factor, ) + # apply scaling factors + self.counts[:, 2] *= uniq_scaling_factor + self.counts[:, 5] *= combined_scaling_factor + # return count sums and scaling factors - return count_sums, uniq_scaling_factor, ambig_scaling_factor + return count_sums, uniq_scaling_factor, combined_scaling_factor + + @staticmethod + def calculate_scaling_factor(raw, norm): + if norm == 0.0: + return 1.0 + return raw / norm def group_gene_count_matrix(self, refmgr): ggroup_index = {} diff --git a/gffquant/profilers/feature_quantifier.py b/gffquant/profilers/feature_quantifier.py index d1480f57..01ffbdd3 100644 --- a/gffquant/profilers/feature_quantifier.py +++ b/gffquant/profilers/feature_quantifier.py @@ -152,22 +152,22 @@ def process_counters( self.counter.group_gene_count_matrix(self.reference_manager) unannotated_reads = self.counter.get_unannotated_reads() + self.aln_counter["unannotated_ambig"] - for category, c_counts, c_index, c_names, u_sf, a_sf in count_annotator.annotate2( - self.reference_manager, - self.adm, - self.counter, - gene_group_db=gene_group_db, - ): - logger.info("PROCESSING CATEGORY=%s", category) - count_writer.write_category( - category, - c_counts, - c_index, - c_names, - u_sf, - a_sf, - unannotated_reads=(None, unannotated_reads)[report_unannotated], - ) + # for category, c_counts, c_index, c_names, u_sf, a_sf in count_annotator.annotate2( + # self.reference_manager, + # self.adm, + # self.counter, + # gene_group_db=gene_group_db, + # ): + # logger.info("PROCESSING CATEGORY=%s", category) + # count_writer.write_category( + # category, + # c_counts, + # c_index, + # c_names, + # u_sf, + # a_sf, + # unannotated_reads=(None, unannotated_reads)[report_unannotated], + # ) self.adm.clear_caches() From 5b0286f5c7b3fa4f307ceb8a141de08d458aeed2 Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Mon, 30 Dec 2024 01:11:44 +0100 Subject: [PATCH 070/128] trying to optimise scaling factors, temp. disabled feature counts --- gffquant/counters/alignment_counter.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gffquant/counters/alignment_counter.py b/gffquant/counters/alignment_counter.py index c136759f..7f3b871a 100644 --- a/gffquant/counters/alignment_counter.py +++ b/gffquant/counters/alignment_counter.py @@ -212,13 +212,13 @@ def generate_gene_count_matrix(self, refmgr): self.counts[:, 1:2] += self.counts[:, 0:1] # duplicate the raw counts - self.counts = np.concatenate( + self.counts = np.column_stack( #(self.counts, self.counts, self.counts,), ( self.counts[:, 0], self.counts[:, 0], self.counts[:, 0], # 0, 1, 2 self.counts[:, 1], self.counts[:, 1], self.counts[:, 1], # 3, 4, 5 ), - axis=1, + # axis=1, ) # length-normalise the lnorm columns From 5baafe04fe6b334944cfbb633b17c002407d1afa Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Mon, 30 Dec 2024 01:23:15 +0100 Subject: [PATCH 071/128] trying to optimise scaling factors, temp. disabled feature counts --- gffquant/counters/alignment_counter.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/gffquant/counters/alignment_counter.py b/gffquant/counters/alignment_counter.py index 7f3b871a..6bc70794 100644 --- a/gffquant/counters/alignment_counter.py +++ b/gffquant/counters/alignment_counter.py @@ -223,7 +223,7 @@ def generate_gene_count_matrix(self, refmgr): # length-normalise the lnorm columns # self.counts[:, 2:4] /= lengths[:, None] - self.counts[:, 1::2] /= lengths[:, None] + self.counts[:, 1::3] /= lengths[:, None] count_sums = self.counts.sum(axis=0) @@ -241,8 +241,8 @@ def generate_gene_count_matrix(self, refmgr): ) # apply scaling factors - self.counts[:, 2] *= uniq_scaling_factor - self.counts[:, 5] *= combined_scaling_factor + self.counts[:, 2] = self.counts[:, 1] * uniq_scaling_factor + self.counts[:, 5] = self.counts[:, 4] * combined_scaling_factor # return count sums and scaling factors return count_sums, uniq_scaling_factor, combined_scaling_factor From 49e11e367054bc4b41a3cbb9f2a3618b87fa431d Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Mon, 30 Dec 2024 01:46:39 +0100 Subject: [PATCH 072/128] re-enable feature counts --- gffquant/annotation/count_writer.py | 24 ++++++++-------- gffquant/annotation/genecount_annotator.py | 28 +++++++++++-------- gffquant/counters/alignment_counter.py | 2 +- gffquant/profilers/feature_quantifier.py | 32 +++++++++++----------- 4 files changed, 46 insertions(+), 40 deletions(-) diff --git a/gffquant/annotation/count_writer.py b/gffquant/annotation/count_writer.py index a33cef3f..98225835 100644 --- a/gffquant/annotation/count_writer.py +++ b/gffquant/annotation/count_writer.py @@ -152,22 +152,22 @@ def write_category( cat_counts = counts[0] logger.info("CAT %s: %s", category, str(cat_counts)) if cat_counts is not None: - cat_row = self.compile_output_row( - cat_counts, - scaling_factor=unique_sf, - ambig_scaling_factor=ambig_sf, - ) - CountWriter.write_row("category", cat_row, stream=feat_out) + # cat_row = self.compile_output_row( + # cat_counts, + # scaling_factor=unique_sf, + # ambig_scaling_factor=ambig_sf, + # ) + CountWriter.write_row("category", counts[0], stream=feat_out) for fid, i in index.items(): f_counts = counts[i] if report_unseen or f_counts.sum(): - out_row = self.compile_output_row( - f_counts, - scaling_factor=unique_sf, - ambig_scaling_factor=ambig_sf, - ) - CountWriter.write_row(names[fid], out_row, stream=feat_out) + # out_row = self.compile_output_row( + # f_counts, + # scaling_factor=unique_sf, + # ambig_scaling_factor=ambig_sf, + # ) + CountWriter.write_row(names[fid], counts[i], stream=feat_out) def write_gene_counts( self, diff --git a/gffquant/annotation/genecount_annotator.py b/gffquant/annotation/genecount_annotator.py index 80af4515..b4f25689 100644 --- a/gffquant/annotation/genecount_annotator.py +++ b/gffquant/annotation/genecount_annotator.py @@ -24,7 +24,7 @@ def annotate2(self, refmgr, db: AnnotationDatabaseManager, counter: AlignmentCou for category in db.get_categories(): features = tuple(db.get_features(category.id)) category_counts = np.zeros( - (len(features) + 1, 4,), + (len(features) + 1, 6,), dtype='float64', ) category_index = { @@ -35,8 +35,9 @@ def annotate2(self, refmgr, db: AnnotationDatabaseManager, counter: AlignmentCou feature.id: feature.name for feature in features } - for rid in counter.get_all_regions(): - counts = counter.get_counts(rid, strand_specific=self.strand_specific) + for rid in counter: + # counts = counter.get_counts(rid, strand_specific=self.strand_specific) + counts = counter[rid] if gene_group_db: # gene_id, ggroup_id = rid, rid ggroup_id = rid @@ -65,17 +66,22 @@ def annotate2(self, refmgr, db: AnnotationDatabaseManager, counter: AlignmentCou # uniq_scaling_factor = (count_sums[0] / count_sums[1], 1.0)[count_sums[1] == 0] # ambig_scaling_factor = (count_sums[2] / count_sums[3], 1.0)[count_sums[3] == 0] # pre 2.19 category count scaling was based on total counts - uniq_scaling_factor, ambig_scaling_factor = 1.0, 1.0 - if category_counts[0][1]: - uniq_scaling_factor = category_counts[0][0] / category_counts[0][1] - if category_counts[0][3]: - ambig_scaling_factor = category_counts[0][2] / category_counts[0][3] + # uniq_scaling_factor, ambig_scaling_factor = 1.0, 1.0 + # if category_counts[0][1]: + # uniq_scaling_factor = category_counts[0][0] / category_counts[0][1] + # if category_counts[0][3]: + # ambig_scaling_factor = category_counts[0][2] / category_counts[0][3] + uniq_scaling_factor, combined_scaling_factor = ( + AlignmentCounter.calculate_scaling_factor(*count_sums[0:2]), + AlignmentCounter.calculate_scaling_factor(*count_sums[3:5]), + ) logger.info( "GCA:: %s CATEGORY COUNTS: uraw=%s unorm=%s araw=%s anorm=%s => SF: %s %s", category.name, - *count_sums, - uniq_scaling_factor, ambig_scaling_factor, + # *count_sums, + count_sums[0], count_sums[1], count_sums[3], count_sums[4], + uniq_scaling_factor, combined_scaling_factor, ) yield ( @@ -84,7 +90,7 @@ def annotate2(self, refmgr, db: AnnotationDatabaseManager, counter: AlignmentCou category_index, category_names, uniq_scaling_factor, - ambig_scaling_factor, + combined_scaling_factor, ) def annotate(self, refmgr, db: AnnotationDatabaseManager, counter: AlignmentCounter, gene_group_db=False): diff --git a/gffquant/counters/alignment_counter.py b/gffquant/counters/alignment_counter.py index 6bc70794..04276477 100644 --- a/gffquant/counters/alignment_counter.py +++ b/gffquant/counters/alignment_counter.py @@ -236,7 +236,7 @@ def generate_gene_count_matrix(self, refmgr): logger.info( "AC:: TOTAL GENE COUNTS: uraw=%s unorm=%s craw=%s cnorm=%s => SF: %s %s", - count_sums[0], count_sums[2], count_sums[1], count_sums[3], + count_sums[0], count_sums[1], count_sums[3], count_sums[4], uniq_scaling_factor, combined_scaling_factor, ) diff --git a/gffquant/profilers/feature_quantifier.py b/gffquant/profilers/feature_quantifier.py index 01ffbdd3..d1480f57 100644 --- a/gffquant/profilers/feature_quantifier.py +++ b/gffquant/profilers/feature_quantifier.py @@ -152,22 +152,22 @@ def process_counters( self.counter.group_gene_count_matrix(self.reference_manager) unannotated_reads = self.counter.get_unannotated_reads() + self.aln_counter["unannotated_ambig"] - # for category, c_counts, c_index, c_names, u_sf, a_sf in count_annotator.annotate2( - # self.reference_manager, - # self.adm, - # self.counter, - # gene_group_db=gene_group_db, - # ): - # logger.info("PROCESSING CATEGORY=%s", category) - # count_writer.write_category( - # category, - # c_counts, - # c_index, - # c_names, - # u_sf, - # a_sf, - # unannotated_reads=(None, unannotated_reads)[report_unannotated], - # ) + for category, c_counts, c_index, c_names, u_sf, a_sf in count_annotator.annotate2( + self.reference_manager, + self.adm, + self.counter, + gene_group_db=gene_group_db, + ): + logger.info("PROCESSING CATEGORY=%s", category) + count_writer.write_category( + category, + c_counts, + c_index, + c_names, + u_sf, + a_sf, + unannotated_reads=(None, unannotated_reads)[report_unannotated], + ) self.adm.clear_caches() From b39b4b10613ec9d2baf027cc77541b5ae7132516 Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Mon, 30 Dec 2024 22:43:17 +0100 Subject: [PATCH 073/128] trying to fix scaling factor issue --- gffquant/annotation/genecount_annotator.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/gffquant/annotation/genecount_annotator.py b/gffquant/annotation/genecount_annotator.py index b4f25689..63f0279d 100644 --- a/gffquant/annotation/genecount_annotator.py +++ b/gffquant/annotation/genecount_annotator.py @@ -59,7 +59,8 @@ def annotate2(self, refmgr, db: AnnotationDatabaseManager, counter: AlignmentCou # elif it == 0: # self.unannotated_counts += counts[:4] - count_sums = category_counts[1:].sum(axis=0) + # count_sums = category_counts[1:].sum(axis=0) + count_sums = category_counts[0] # should scaled counts use a factor derived from all counts # or should multi-feature counts only contribute once? @@ -72,9 +73,9 @@ def annotate2(self, refmgr, db: AnnotationDatabaseManager, counter: AlignmentCou # if category_counts[0][3]: # ambig_scaling_factor = category_counts[0][2] / category_counts[0][3] uniq_scaling_factor, combined_scaling_factor = ( - AlignmentCounter.calculate_scaling_factor(*count_sums[0:2]), - AlignmentCounter.calculate_scaling_factor(*count_sums[3:5]), - ) + AlignmentCounter.calculate_scaling_factor(*count_sums[0:2]), + AlignmentCounter.calculate_scaling_factor(*count_sums[3:5]), + ) logger.info( "GCA:: %s CATEGORY COUNTS: uraw=%s unorm=%s araw=%s anorm=%s => SF: %s %s", From b76f168edceff3b05a3bb4209884261faa7149e2 Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Mon, 30 Dec 2024 23:13:07 +0100 Subject: [PATCH 074/128] trying to fix scaling factor issue --- gffquant/annotation/genecount_annotator.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/gffquant/annotation/genecount_annotator.py b/gffquant/annotation/genecount_annotator.py index 63f0279d..5d676dc2 100644 --- a/gffquant/annotation/genecount_annotator.py +++ b/gffquant/annotation/genecount_annotator.py @@ -77,6 +77,10 @@ def annotate2(self, refmgr, db: AnnotationDatabaseManager, counter: AlignmentCou AlignmentCounter.calculate_scaling_factor(*count_sums[3:5]), ) + # apply scaling factors + category_counts[:, 2] = category_counts[:, 1] * uniq_scaling_factor + category_counts[:, 5] = category_counts[:, 4] * combined_scaling_factor + logger.info( "GCA:: %s CATEGORY COUNTS: uraw=%s unorm=%s araw=%s anorm=%s => SF: %s %s", category.name, From 992040427c4bac10909b5cb5b75424f0c86cd4c6 Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Tue, 31 Dec 2024 03:00:12 +0100 Subject: [PATCH 075/128] trying to implement count matrix --- gffquant/annotation/count_writer.py | 2 +- gffquant/annotation/genecount_annotator.py | 79 +++--- gffquant/annotation/regioncount_annotator.py | 1 + gffquant/counters/__init__.py | 1 + gffquant/counters/alignment_counter.py | 259 ++++++++++--------- gffquant/profilers/feature_quantifier.py | 32 +-- 6 files changed, 187 insertions(+), 187 deletions(-) diff --git a/gffquant/annotation/count_writer.py b/gffquant/annotation/count_writer.py index 98225835..7fd1bab6 100644 --- a/gffquant/annotation/count_writer.py +++ b/gffquant/annotation/count_writer.py @@ -187,7 +187,7 @@ def write_gene_counts( refmgr.get(rid[0] if isinstance(rid, tuple) else rid)[0], rid, ) - for rid in gene_counts.get_all_regions() + for rid, _ in gene_counts #.get_all_regions() ) for ref, rid in sorted(ref_stream): diff --git a/gffquant/annotation/genecount_annotator.py b/gffquant/annotation/genecount_annotator.py index 5d676dc2..c750f98b 100644 --- a/gffquant/annotation/genecount_annotator.py +++ b/gffquant/annotation/genecount_annotator.py @@ -20,7 +20,35 @@ def __init__(self, strand_specific, report_scaling_factors=True): """ __init__() """ CountAnnotator.__init__(self, strand_specific, report_scaling_factors=report_scaling_factors) - def annotate2(self, refmgr, db: AnnotationDatabaseManager, counter: AlignmentCounter, gene_group_db=False): + # def annotate_gene_counts(self, refmgr, db: AnnotationDatabaseManager, counter: AlignmentCounter, gene_group_db=False): + # category_sums = np.zeros((len(db.get_categories()), 6)) + # functional_counts = np.zeros(()) + # for rid in counter: + # counts = counter[rid] + # if gene_group_db: + # ggroup_id = rid + # else: + # ref, _ = refmgr.get(rid[0] if isinstance(rid, tuple) else rid) + # ggroup_id = ref + + # region_annotation = db.query_sequence(ggroup_id) + # if region_annotation is not None: + # _, _, region_annotation = region_annotation + # for category_id, features in region_annotation: + # category_sums[int(category_id)] += counts + + + # category_features = dict(region_annotation).get(str(category.id)) + # if category_features is not None: + # category_counts[0] += counts # category row + # for cf in category_features: + # category_counts[category_index.get(int(cf))] += counts + + + + def annotate(self, refmgr, db: AnnotationDatabaseManager, counter: AlignmentCounter, gene_group_db=False): + """ Annotate a set of gene counts with functional annotations. """ + for category in db.get_categories(): features = tuple(db.get_features(category.id)) category_counts = np.zeros( @@ -36,18 +64,14 @@ def annotate2(self, refmgr, db: AnnotationDatabaseManager, counter: AlignmentCou for feature in features } for rid in counter: - # counts = counter.get_counts(rid, strand_specific=self.strand_specific) counts = counter[rid] if gene_group_db: - # gene_id, ggroup_id = rid, rid ggroup_id = rid else: ref, _ = refmgr.get(rid[0] if isinstance(rid, tuple) else rid) - # gene_id, ggroup_id = ref, ref ggroup_id = ref region_annotation = db.query_sequence(ggroup_id) - # logger.info("REGION_ANNOTATION: %s (%s)", str(region_annotation), ggroup_id) if region_annotation is not None: _, _, region_annotation = region_annotation category_features = dict(region_annotation).get(str(category.id)) @@ -56,22 +80,11 @@ def annotate2(self, refmgr, db: AnnotationDatabaseManager, counter: AlignmentCou for cf in category_features: category_counts[category_index.get(int(cf))] += counts - # elif it == 0: - # self.unannotated_counts += counts[:4] - - # count_sums = category_counts[1:].sum(axis=0) count_sums = category_counts[0] # should scaled counts use a factor derived from all counts # or should multi-feature counts only contribute once? - # uniq_scaling_factor = (count_sums[0] / count_sums[1], 1.0)[count_sums[1] == 0] - # ambig_scaling_factor = (count_sums[2] / count_sums[3], 1.0)[count_sums[3] == 0] # pre 2.19 category count scaling was based on total counts - # uniq_scaling_factor, ambig_scaling_factor = 1.0, 1.0 - # if category_counts[0][1]: - # uniq_scaling_factor = category_counts[0][0] / category_counts[0][1] - # if category_counts[0][3]: - # ambig_scaling_factor = category_counts[0][2] / category_counts[0][3] uniq_scaling_factor, combined_scaling_factor = ( AlignmentCounter.calculate_scaling_factor(*count_sums[0:2]), AlignmentCounter.calculate_scaling_factor(*count_sums[3:5]), @@ -84,7 +97,6 @@ def annotate2(self, refmgr, db: AnnotationDatabaseManager, counter: AlignmentCou logger.info( "GCA:: %s CATEGORY COUNTS: uraw=%s unorm=%s araw=%s anorm=%s => SF: %s %s", category.name, - # *count_sums, count_sums[0], count_sums[1], count_sums[3], count_sums[4], uniq_scaling_factor, combined_scaling_factor, ) @@ -97,36 +109,3 @@ def annotate2(self, refmgr, db: AnnotationDatabaseManager, counter: AlignmentCou uniq_scaling_factor, combined_scaling_factor, ) - - def annotate(self, refmgr, db: AnnotationDatabaseManager, counter: AlignmentCounter, gene_group_db=False): - """ Annotate a set of gene counts with functional annotations. """ - - # formerly used in compute_count_vector - strand_specific_counts = ( - (counter.PLUS_STRAND, counter.MINUS_STRAND) - if self.strand_specific else None - ) - - for rid in counter.get_all_regions(): - counts = counter.get_counts(rid, strand_specific=self.strand_specific) - - if gene_group_db: - # ref_tokens = ref.split(".") - # gene_id, ggroup_id = ".".join(ref_tokens[:-1]), ref_tokens[-1] - # gene_id, ggroup_id = rid, rid - ggroup_id = rid - else: - ref, _ = refmgr.get(rid[0] if isinstance(rid, tuple) else rid) - # gene_id, ggroup_id = ref, ref - ggroup_id = ref - - region_annotation = db.query_sequence(ggroup_id) - if region_annotation is not None: - _, _, region_annotation = region_annotation - self.distribute_feature_counts(counts, region_annotation) - - else: - # logger.info("GCAnnotator: Gene %s (group=%s) has no information in database.", gene_id, ggroup_id) - self.unannotated_counts += counts[:4] - - self.calculate_scaling_factors() diff --git a/gffquant/annotation/regioncount_annotator.py b/gffquant/annotation/regioncount_annotator.py index 84f85163..57931cff 100644 --- a/gffquant/annotation/regioncount_annotator.py +++ b/gffquant/annotation/regioncount_annotator.py @@ -10,6 +10,7 @@ class RegionCountAnnotator(CountAnnotator): """ CountAnnotator subclass for contig/region-based counting. """ def __init__(self, strand_specific, report_scaling_factors=True): + raise NotImplementedError() CountAnnotator.__init__(self, strand_specific, report_scaling_factors=report_scaling_factors) # pylint: disable=R0914,W0613 diff --git a/gffquant/counters/__init__.py b/gffquant/counters/__init__.py index d325316e..e82470ea 100644 --- a/gffquant/counters/__init__.py +++ b/gffquant/counters/__init__.py @@ -4,4 +4,5 @@ """module docstring""" from .alignment_counter import AlignmentCounter +from .count_matrix import CountMatrix from .region_counter import RegionCounter diff --git a/gffquant/counters/alignment_counter.py b/gffquant/counters/alignment_counter.py index 04276477..72faf1d9 100644 --- a/gffquant/counters/alignment_counter.py +++ b/gffquant/counters/alignment_counter.py @@ -9,6 +9,7 @@ import numpy as np +from . import CountMatrix from .. import DistributionMode @@ -68,13 +69,15 @@ def __init__( self.increments_auto_detect = (1.0, self.paired_end_count / 2.0,) self.unannotated_reads = 0 - self.index = {} - self.counts = np.zeros( - (AlignmentCounter.INITIAL_SIZE, 2,), - dtype='float64', - ) + # self.index = {} + # self.counts = np.zeros( + # (AlignmentCounter.INITIAL_SIZE, 2,), + # dtype='float64', + # ) + self.counts = CountMatrix(2, nrows=AlignmentCounter.INITIAL_SIZE) def dump(self, prefix, refmgr): + raise NotImplementedError() with gzip.open(f"{prefix}.{self.__class__.__name__}.txt.gz", "wt") as _out: for key, key_index in self.index.items(): ref, reflen = refmgr.get(key[0] if isinstance(key, tuple) else key) @@ -83,33 +86,34 @@ def dump(self, prefix, refmgr): # ref, reflen = refmgr.get(k[0] if isinstance(k, tuple) else k) # print(k, ref, reflen, v, sep="\t", file=_out) - def get(self, key, default_val): - key_index = self.index.get(key) - if key_index is None: - return Counter() - return Counter({key: self.counts[key_index]}) + # def get(self, key, default_val): + # key_index = self.index.get(key) + # if key_index is None: + # return Counter() + # return Counter({key: self.counts[key_index]}) - def setdefault(self, key, default_val): - ... + # def setdefault(self, key, default_val): + # ... def has_ambig_counts(self): - return bool(self.counts[:, 1].sum() != 0) + # return bool(self.counts[:, 1].sum() != 0) + return bool(self.counts.colsum(1) != 0) - def __iter__(self): - yield from self.index.keys() + # def __iter__(self): + # yield from self.index.keys() - def __getitem__(self, key): - key_index = self.index.get(key) - if key_index is None: - return 0.0 - return self.counts[key_index] + # def __getitem__(self, key): + # key_index = self.index.get(key) + # if key_index is None: + # return 0.0 + # return self.counts[key_index] - def __setitem__(self, key, value): - key_index = self.index.get(key) - if key_index is not None: - self.counts[key_index] = value - else: - raise KeyError(f"{key=} not found.") + # def __setitem__(self, key, value): + # key_index = self.index.get(key) + # if key_index is not None: + # self.counts[key_index] = value + # else: + # raise KeyError(f"{key=} not found.") def update(self, count_stream, ambiguous_counts=False, pair=False, pe_library=None,): if pe_library is not None: @@ -131,32 +135,33 @@ def update(self, count_stream, ambiguous_counts=False, pair=False, pe_library=No def get_unannotated_reads(self): # return self.unannotated_reads - no_annotation = self.index.get("c591b65a0f4cd46d5125745a40c8c056") - if no_annotation is not None: - return self.counts[no_annotation][0] - return 0.0 - - def get_counts(self, seqid, strand_specific=False): - if strand_specific: - raise NotImplementedError() - # uniq_counts, ambig_counts = [0.0, 0.0], [0.0, 0.0] - # uniq_counts[seqid[1]] = uniq_counter[seqid] - # ambig_counts[seqid[1]] = ambig_counter[seqid] - - # rid = seqid[0] if isinstance(seqid, tuple) else seqid - # uniq_counts = [ - # uniq_counter[(rid, AlignmentCounter.PLUS_STRAND)], - # uniq_counter[(rid, AlignmentCounter.MINUS_STRAND)], - # ] - # ambig_counts = [ - # ambig_counter[(rid, AlignmentCounter.PLUS_STRAND)], - # ambig_counter[(rid, AlignmentCounter.MINUS_STRAND)], - # ] - counts = self[seqid] - return np.array((counts[0], counts[2], counts[1], counts[3])) - - def get_all_regions(self): - yield from self + return self.counts["c591b65a0f4cd46d5125745a40c8c056"][0] + # no_annotation = self.index.get("c591b65a0f4cd46d5125745a40c8c056") + # if no_annotation is not None: + # return self.counts[no_annotation][0] + # return 0.0 + + # def get_counts(self, seqid, strand_specific=False): + # if strand_specific: + # raise NotImplementedError() + # # uniq_counts, ambig_counts = [0.0, 0.0], [0.0, 0.0] + # # uniq_counts[seqid[1]] = uniq_counter[seqid] + # # ambig_counts[seqid[1]] = ambig_counter[seqid] + + # # rid = seqid[0] if isinstance(seqid, tuple) else seqid + # # uniq_counts = [ + # # uniq_counter[(rid, AlignmentCounter.PLUS_STRAND)], + # # uniq_counter[(rid, AlignmentCounter.MINUS_STRAND)], + # # ] + # # ambig_counts = [ + # # ambig_counter[(rid, AlignmentCounter.PLUS_STRAND)], + # # ambig_counter[(rid, AlignmentCounter.MINUS_STRAND)], + # # ] + # counts = self[seqid] + # return np.array((counts[0], counts[2], counts[1], counts[3])) + + # def get_all_regions(self): + # yield from self def update_counts(self, count_stream, increment=1, ambiguous_counts=False): contributed_counts = 0 @@ -175,17 +180,18 @@ def update_counts(self, count_stream, increment=1, ambiguous_counts=False): ) )[self.strand_specific] - key_index = self.index.get(key) - if key_index is None: - nrows = self.counts.shape[0] - if len(self.index) == nrows: - self.counts = np.pad( - self.counts, - ((0, AlignmentCounter.INITIAL_SIZE), (0, 0),), - ) - # key_index = self.index.setdefault(key, len(self.index)) - key_index = self.index[key] = len(self.index) - self.counts[key_index][int(ambiguous_counts)] += inc + # key_index = self.index.get(key) + # if key_index is None: + # nrows = self.counts.shape[0] + # if len(self.index) == nrows: + # self.counts = np.pad( + # self.counts, + # ((0, AlignmentCounter.INITIAL_SIZE), (0, 0),), + # ) + # # key_index = self.index.setdefault(key, len(self.index)) + # key_index = self.index[key] = len(self.index) + # self.counts[key_index][int(ambiguous_counts)] += inc + self.counts[key][int(ambiguous_counts)] += inc contributed_counts += inc return contributed_counts @@ -196,56 +202,61 @@ def generate_gene_count_matrix(self, refmgr): # uniq_raw, combined_raw, uniq_lnorm, combined_lnorm # obtain gene lengths - lengths = np.array( + gene_lengths = np.array( tuple( (refmgr.get(key[0] if isinstance(key, tuple) else key))[1] for key in self.index ) ) - logger.info("LENGTHS ARRAY = %s", lengths.shape) - logger.info("INDEX SIZE = %s", len(self.index)) - - # remove the un-indexed rows - self.counts = self.counts[0:len(self.index), :] - - # calculate combined_raw - self.counts[:, 1:2] += self.counts[:, 0:1] - - # duplicate the raw counts - self.counts = np.column_stack( - #(self.counts, self.counts, self.counts,), - ( - self.counts[:, 0], self.counts[:, 0], self.counts[:, 0], # 0, 1, 2 - self.counts[:, 1], self.counts[:, 1], self.counts[:, 1], # 3, 4, 5 - ), - # axis=1, - ) - # length-normalise the lnorm columns - # self.counts[:, 2:4] /= lengths[:, None] - self.counts[:, 1::3] /= lengths[:, None] + self.counts = self.counts.generate_gene_counts(gene_lengths) - count_sums = self.counts.sum(axis=0) + return self.counts.sum() - # uniq_scaling_factor = (count_sums[0] / count_sums[2], 1.0)[count_sums[2] == 0] - # ambig_scaling_factor = (count_sums[1] / count_sums[3], 1.0)[count_sums[3] == 0] - uniq_scaling_factor, combined_scaling_factor = ( - AlignmentCounter.calculate_scaling_factor(*count_sums[0:2]), - AlignmentCounter.calculate_scaling_factor(*count_sums[3:5]), - ) + # logger.info("LENGTHS ARRAY = %s", lengths.shape) + # logger.info("INDEX SIZE = %s", len(self.index)) - logger.info( - "AC:: TOTAL GENE COUNTS: uraw=%s unorm=%s craw=%s cnorm=%s => SF: %s %s", - count_sums[0], count_sums[1], count_sums[3], count_sums[4], - uniq_scaling_factor, combined_scaling_factor, - ) + # # remove the un-indexed rows + # self.counts = self.counts[0:len(self.index), :] + + # # calculate combined_raw + # self.counts[:, 1:2] += self.counts[:, 0:1] + + # # duplicate the raw counts + # self.counts = np.column_stack( + # #(self.counts, self.counts, self.counts,), + # ( + # self.counts[:, 0], self.counts[:, 0], self.counts[:, 0], # 0, 1, 2 + # self.counts[:, 1], self.counts[:, 1], self.counts[:, 1], # 3, 4, 5 + # ), + # # axis=1, + # ) - # apply scaling factors - self.counts[:, 2] = self.counts[:, 1] * uniq_scaling_factor - self.counts[:, 5] = self.counts[:, 4] * combined_scaling_factor + # # length-normalise the lnorm columns + # # self.counts[:, 2:4] /= lengths[:, None] + # self.counts[:, 1::3] /= lengths[:, None] - # return count sums and scaling factors - return count_sums, uniq_scaling_factor, combined_scaling_factor + # count_sums = self.counts.sum(axis=0) + + # # uniq_scaling_factor = (count_sums[0] / count_sums[2], 1.0)[count_sums[2] == 0] + # # ambig_scaling_factor = (count_sums[1] / count_sums[3], 1.0)[count_sums[3] == 0] + # uniq_scaling_factor, combined_scaling_factor = ( + # AlignmentCounter.calculate_scaling_factor(*count_sums[0:2]), + # AlignmentCounter.calculate_scaling_factor(*count_sums[3:5]), + # ) + + # logger.info( + # "AC:: TOTAL GENE COUNTS: uraw=%s unorm=%s craw=%s cnorm=%s => SF: %s %s", + # count_sums[0], count_sums[1], count_sums[3], count_sums[4], + # uniq_scaling_factor, combined_scaling_factor, + # ) + + # # apply scaling factors + # self.counts[:, 2] = self.counts[:, 1] * uniq_scaling_factor + # self.counts[:, 5] = self.counts[:, 4] * combined_scaling_factor + + # # return count sums and scaling factors + # return count_sums, uniq_scaling_factor, combined_scaling_factor @staticmethod def calculate_scaling_factor(raw, norm): @@ -254,21 +265,29 @@ def calculate_scaling_factor(raw, norm): return raw / norm def group_gene_count_matrix(self, refmgr): - ggroup_index = {} - for key, key_index in self.index.items(): - ref = (refmgr.get(key[0] if isinstance(key, tuple) else key))[0] - ref_tokens = ref.split(".") - _, ggroup_id = ".".join(ref_tokens[:-1]), ref_tokens[-1] - g_key_index = ggroup_index.get(ggroup_id) - gene_counts = self.counts[key_index] - if g_key_index is None: - g_key_index = ggroup_index[ggroup_id] = len(ggroup_index) - self.counts[g_key_index] = gene_counts - else: - self.counts[g_key_index] += gene_counts - - # replace index with grouped index - self.index = ggroup_index - - # remove the un-indexed (ungrouped) rows - self.counts = self.counts[0:len(self.index), :] + + ggroups = ( + (refmgr.get(key[0] if isinstance(key, tuple) else key))[0].split(".")[-1] + for key, _ in self.counts + ) + + self.counts = self.counts.group_gene_counts(ggroups) + + # ggroup_index = {} + # for key, key_index in self.index.items(): + # ref = (refmgr.get(key[0] if isinstance(key, tuple) else key))[0] + # ref_tokens = ref.split(".") + # _, ggroup_id = ".".join(ref_tokens[:-1]), ref_tokens[-1] + # g_key_index = ggroup_index.get(ggroup_id) + # gene_counts = self.counts[key_index] + # if g_key_index is None: + # g_key_index = ggroup_index[ggroup_id] = len(ggroup_index) + # self.counts[g_key_index] = gene_counts + # else: + # self.counts[g_key_index] += gene_counts + + # # replace index with grouped index + # self.index = ggroup_index + + # # remove the un-indexed (ungrouped) rows + # self.counts = self.counts[0:len(self.index), :] diff --git a/gffquant/profilers/feature_quantifier.py b/gffquant/profilers/feature_quantifier.py index d1480f57..858b1670 100644 --- a/gffquant/profilers/feature_quantifier.py +++ b/gffquant/profilers/feature_quantifier.py @@ -152,22 +152,22 @@ def process_counters( self.counter.group_gene_count_matrix(self.reference_manager) unannotated_reads = self.counter.get_unannotated_reads() + self.aln_counter["unannotated_ambig"] - for category, c_counts, c_index, c_names, u_sf, a_sf in count_annotator.annotate2( - self.reference_manager, - self.adm, - self.counter, - gene_group_db=gene_group_db, - ): - logger.info("PROCESSING CATEGORY=%s", category) - count_writer.write_category( - category, - c_counts, - c_index, - c_names, - u_sf, - a_sf, - unannotated_reads=(None, unannotated_reads)[report_unannotated], - ) + # for category, c_counts, c_index, c_names, u_sf, a_sf in count_annotator.annotate( + # self.reference_manager, + # self.adm, + # self.counter, + # gene_group_db=gene_group_db, + # ): + # logger.info("PROCESSING CATEGORY=%s", category) + # count_writer.write_category( + # category, + # c_counts, + # c_index, + # c_names, + # u_sf, + # a_sf, + # unannotated_reads=(None, unannotated_reads)[report_unannotated], + # ) self.adm.clear_caches() From 945cf8e89cd8727d0d2de1d6b3be13b968a65984 Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Tue, 31 Dec 2024 03:12:24 +0100 Subject: [PATCH 076/128] trying to implement count matrix --- gffquant/counters/__init__.py | 2 +- gffquant/counters/alignment_counter.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/gffquant/counters/__init__.py b/gffquant/counters/__init__.py index e82470ea..2b5426c3 100644 --- a/gffquant/counters/__init__.py +++ b/gffquant/counters/__init__.py @@ -4,5 +4,5 @@ """module docstring""" from .alignment_counter import AlignmentCounter -from .count_matrix import CountMatrix +# from .count_matrix import CountMatrix from .region_counter import RegionCounter diff --git a/gffquant/counters/alignment_counter.py b/gffquant/counters/alignment_counter.py index 72faf1d9..b13297f9 100644 --- a/gffquant/counters/alignment_counter.py +++ b/gffquant/counters/alignment_counter.py @@ -9,7 +9,7 @@ import numpy as np -from . import CountMatrix +from .count_matrix import CountMatrix from .. import DistributionMode From 07f85f096118b944aad34627aaccfce42a8df594 Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Tue, 31 Dec 2024 03:15:32 +0100 Subject: [PATCH 077/128] trying to implement count matrix --- gffquant/counters/alignment_counter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gffquant/counters/alignment_counter.py b/gffquant/counters/alignment_counter.py index b13297f9..8ec11878 100644 --- a/gffquant/counters/alignment_counter.py +++ b/gffquant/counters/alignment_counter.py @@ -9,7 +9,7 @@ import numpy as np -from .count_matrix import CountMatrix +from count_matrix import CountMatrix from .. import DistributionMode From dbb36da3b59c7fc849633ab09c7265afb57d92a3 Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Tue, 31 Dec 2024 03:16:49 +0100 Subject: [PATCH 078/128] trying to implement count matrix --- gffquant/counters/alignment_counter.py | 2 +- gffquant/counters/count_matrix.py | 110 +++++++++++++++++++++++++ 2 files changed, 111 insertions(+), 1 deletion(-) create mode 100644 gffquant/counters/count_matrix.py diff --git a/gffquant/counters/alignment_counter.py b/gffquant/counters/alignment_counter.py index 8ec11878..b13297f9 100644 --- a/gffquant/counters/alignment_counter.py +++ b/gffquant/counters/alignment_counter.py @@ -9,7 +9,7 @@ import numpy as np -from count_matrix import CountMatrix +from .count_matrix import CountMatrix from .. import DistributionMode diff --git a/gffquant/counters/count_matrix.py b/gffquant/counters/count_matrix.py new file mode 100644 index 00000000..7fcf86e3 --- /dev/null +++ b/gffquant/counters/count_matrix.py @@ -0,0 +1,110 @@ +import logging + +import numpy as np + + +logger = logging.getLogger(__name__) + +class CountMatrix: + + @staticmethod + def calculate_scaling_factor(raw, norm): + if norm == 0.0: + return 1.0 + return raw / norm + + def __init__(self, ncols, nrows=1000): + self.index = {} + self.counts = np.zeros( + (nrows, ncols,), + dtype='float64', + ) + + def _resize(self): + nrows = self.counts.shape[0] + if len(self.index) == nrows: + self.counts = np.pad( + self.counts, + ((0, nrows * 2), (0, 0),), + ) + return len(self.index) + + def __getitem__(self, key): + key_index = self.index.get(key) + if key_index is None: + key_index = self.index[key] = self._resize() + return self.counts[key_index] + + def __setitem__(self, key, value): + key_index = self.index.get(key) + if key_index is None: + key_index = self.index[key] = self._resize() + self.counts[key_index] = value + + def __iter__(self): + yield from zip(self.index.keys(), self.counts) + + def sum(self): + return self.counts.sum(axis=0) + + def generate_gene_counts(self, lengths): + logger.info("LENGTHS ARRAY = %s", lengths.shape) + logger.info("INDEX SIZE = %s", len(self.index)) + + # remove the un-indexed rows + counts = self.counts[0:len(self.index), :] + + # calculate combined_raw + counts[:, 1:2] += counts[:, 0:1] + + # duplicate the raw counts + counts = np.column_stack( + ( + counts[:, 0], counts[:, 0], counts[:, 0], # 0, 1, 2 + counts[:, 1], counts[:, 1], counts[:, 1], # 3, 4, 5 + ), + ) + + # length-normalise the lnorm columns + counts[:, 1::3] /= lengths[:, None] + + count_sums = counts.sum(axis=0) + + uniq_scaling_factor, combined_scaling_factor = ( + CountMatrix.calculate_scaling_factor(*count_sums[0:2]), + CountMatrix.calculate_scaling_factor(*count_sums[3:5]), + ) + + logger.info( + "AC:: TOTAL GENE COUNTS: uraw=%s unorm=%s craw=%s cnorm=%s => SF: %s %s", + count_sums[0], count_sums[1], count_sums[3], count_sums[4], + uniq_scaling_factor, combined_scaling_factor, + ) + + # apply scaling factors + counts[:, 2] = counts[:, 1] * uniq_scaling_factor + counts[:, 5] = counts[:, 4] * combined_scaling_factor + + return self + + def group_gene_counts(self, ggroups): + ggroup_index = {} + for (key, key_index), ggroup_id in zip(self.index.items(), ggroups): + g_key_index = ggroup_index.get(ggroup_id) + gene_counts = self.counts[self.index[key]] + if g_key_index is None: + g_key_index = ggroup_index[ggroup_id] = len(ggroup_index) + self.counts[g_key_index] = gene_counts + else: + self.counts[g_key_index] += gene_counts + + # replace index with grouped index + self.index = ggroup_index + + # remove the un-indexed (ungrouped) rows + self.counts = self.counts[0:len(self.index), :] + + return self + + def colsum(self, col): + return self.counts[:, col].sum() \ No newline at end of file From b281a16c32decf6439a91f372d9af8bebef27dc8 Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Tue, 31 Dec 2024 03:25:00 +0100 Subject: [PATCH 079/128] trying to implement count matrix --- gffquant/counters/alignment_counter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gffquant/counters/alignment_counter.py b/gffquant/counters/alignment_counter.py index b13297f9..5606a43c 100644 --- a/gffquant/counters/alignment_counter.py +++ b/gffquant/counters/alignment_counter.py @@ -205,7 +205,7 @@ def generate_gene_count_matrix(self, refmgr): gene_lengths = np.array( tuple( (refmgr.get(key[0] if isinstance(key, tuple) else key))[1] - for key in self.index + for key, _ in self.counts ) ) From 0b64813d13748d21f36ca2e5f56fde53ac23b9c5 Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Tue, 31 Dec 2024 10:44:45 +0100 Subject: [PATCH 080/128] trying to implement count matrix --- gffquant/profilers/feature_quantifier.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gffquant/profilers/feature_quantifier.py b/gffquant/profilers/feature_quantifier.py index 858b1670..381a950f 100644 --- a/gffquant/profilers/feature_quantifier.py +++ b/gffquant/profilers/feature_quantifier.py @@ -139,7 +139,7 @@ def process_counters( filtered_readcount=self.aln_counter["filtered_read_count"], ) - total_gene_counts, u_sf, a_sf = self.counter.generate_gene_count_matrix(self.reference_manager) + total_gene_counts = self.counter.generate_gene_count_matrix(self.reference_manager) logger.info("TOTAL_GENE_COUNTS = %s", total_gene_counts) count_writer.write_gene_counts( From 7f93e353c9e19090eb7832883d1f875e8bcf7a54 Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Tue, 31 Dec 2024 11:06:27 +0100 Subject: [PATCH 081/128] trying to implement count matrix --- gffquant/annotation/count_writer.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/gffquant/annotation/count_writer.py b/gffquant/annotation/count_writer.py index 7fd1bab6..5fd43f25 100644 --- a/gffquant/annotation/count_writer.py +++ b/gffquant/annotation/count_writer.py @@ -173,12 +173,12 @@ def write_gene_counts( self, gene_counts: AlignmentCounter, refmgr, - uniq_scaling_factor, - ambig_scaling_factor, + # uniq_scaling_factor, + # ambig_scaling_factor, gene_group_db=False ): - if "scaled" in self.publish_reports: - logger.info("SCALING_FACTORS %s %s", uniq_scaling_factor, ambig_scaling_factor) + # if "scaled" in self.publish_reports: + # logger.info("SCALING_FACTORS %s %s", uniq_scaling_factor, ambig_scaling_factor) with gzip.open(f"{self.out_prefix}.gene_counts.txt.gz", "wt") as gene_out: print("gene", *self.get_header(), sep="\t", file=gene_out, flush=True) From b8bdf080fc8690d219870143d9a335e598c6c282 Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Tue, 31 Dec 2024 11:15:49 +0100 Subject: [PATCH 082/128] trying to implement count matrix --- gffquant/profilers/feature_quantifier.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gffquant/profilers/feature_quantifier.py b/gffquant/profilers/feature_quantifier.py index 381a950f..7142f5ef 100644 --- a/gffquant/profilers/feature_quantifier.py +++ b/gffquant/profilers/feature_quantifier.py @@ -145,7 +145,7 @@ def process_counters( count_writer.write_gene_counts( self.counter, self.reference_manager, - u_sf, a_sf, + # u_sf, a_sf, gene_group_db=gene_group_db, ) From 85d1def473517f6805030f0ffb40814d4dbed026 Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Tue, 31 Dec 2024 11:34:20 +0100 Subject: [PATCH 083/128] trying to implement count matrix --- gffquant/counters/alignment_counter.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/gffquant/counters/alignment_counter.py b/gffquant/counters/alignment_counter.py index 5606a43c..3da911bf 100644 --- a/gffquant/counters/alignment_counter.py +++ b/gffquant/counters/alignment_counter.py @@ -99,8 +99,9 @@ def has_ambig_counts(self): # return bool(self.counts[:, 1].sum() != 0) return bool(self.counts.colsum(1) != 0) - # def __iter__(self): - # yield from self.index.keys() + def __iter__(self): + # yield from self.index.keys() + yield from self.counts # def __getitem__(self, key): # key_index = self.index.get(key) From e325127d57578ed03ced12464599ce30b395cdc6 Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Tue, 31 Dec 2024 11:53:13 +0100 Subject: [PATCH 084/128] trying to implement count matrix --- gffquant/counters/alignment_counter.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/gffquant/counters/alignment_counter.py b/gffquant/counters/alignment_counter.py index 3da911bf..9ca23e83 100644 --- a/gffquant/counters/alignment_counter.py +++ b/gffquant/counters/alignment_counter.py @@ -103,6 +103,9 @@ def __iter__(self): # yield from self.index.keys() yield from self.counts + def __getitem__(self, key): + return self.counts[key] + # def __getitem__(self, key): # key_index = self.index.get(key) # if key_index is None: From c1c93d14f50f1f6b974d633e775af0d1101871ee Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Tue, 31 Dec 2024 12:05:02 +0100 Subject: [PATCH 085/128] trying to implement count matrix --- gffquant/counters/count_matrix.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/gffquant/counters/count_matrix.py b/gffquant/counters/count_matrix.py index 7fcf86e3..620bf165 100644 --- a/gffquant/counters/count_matrix.py +++ b/gffquant/counters/count_matrix.py @@ -85,6 +85,8 @@ def generate_gene_counts(self, lengths): counts[:, 2] = counts[:, 1] * uniq_scaling_factor counts[:, 5] = counts[:, 4] * combined_scaling_factor + self.counts = counts + return self def group_gene_counts(self, ggroups): From 21fd9631fe9a183a50cc6d96864d00d7962a68b5 Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Wed, 1 Jan 2025 02:39:51 +0100 Subject: [PATCH 086/128] reactivated feature output --- gffquant/annotation/count_writer.py | 43 ++++++++++++ gffquant/annotation/genecount_annotator.py | 76 +++++++++++++++------- gffquant/counters/count_matrix.py | 16 ++++- gffquant/profilers/feature_quantifier.py | 23 +++++++ 4 files changed, 134 insertions(+), 24 deletions(-) diff --git a/gffquant/annotation/count_writer.py b/gffquant/annotation/count_writer.py index 5fd43f25..b65afa45 100644 --- a/gffquant/annotation/count_writer.py +++ b/gffquant/annotation/count_writer.py @@ -110,6 +110,49 @@ def compile_block(raw, lnorm, scaling_factors): def write_row(header, data, stream=sys.stdout): print(header, *(f"{c:.5f}" for c in data), flush=True, sep="\t", file=stream) + + def write_category2( + self, + category_id, + category_name, + category_sum, + counts, + feature_names, + unannotated_reads=None, + report_unseen=True, + ): + with gzip.open(f"{self.out_prefix}.{category_name}.txt.gz", "wt") as feat_out: + header = self.get_header() + print("feature", *header, sep="\t", file=feat_out) + + if unannotated_reads is not None: + print("unannotated", unannotated_reads, sep="\t", file=feat_out) + + if "total_readcount" in self.publish_reports: + CountWriter.write_row( + "total_reads", + np.zeros(len(header)) + self.total_readcount, + stream=feat_out, + ) + + if "filtered_readcount" in self.publish_reports: + CountWriter.write_row( + "filtered_reads", + np.zeros(len(header)) + self.filtered_readcount, + stream=feat_out, + ) + + if "category" in self.publish_reports: + cat_counts = counts[0] + logger.info("CAT %s: %s", category_name, str(cat_counts)) + if cat_counts is not None: + CountWriter.write_row("category", category_sum, stream=feat_out) + + for (cid, fid), fcounts in counts: + if (report_unseen or fcounts.sum()) and cid == category_id: + CountWriter.write_row(feature_names[fid], fcounts, stream=feat_out,) + + def write_category( self, category, diff --git a/gffquant/annotation/genecount_annotator.py b/gffquant/annotation/genecount_annotator.py index c750f98b..af63b220 100644 --- a/gffquant/annotation/genecount_annotator.py +++ b/gffquant/annotation/genecount_annotator.py @@ -7,6 +7,7 @@ from .count_annotator import CountAnnotator from ..counters import AlignmentCounter +from ..counters.count_matrix import CountMatrix from ..db.annotation_db import AnnotationDatabaseManager @@ -20,29 +21,58 @@ def __init__(self, strand_specific, report_scaling_factors=True): """ __init__() """ CountAnnotator.__init__(self, strand_specific, report_scaling_factors=report_scaling_factors) - # def annotate_gene_counts(self, refmgr, db: AnnotationDatabaseManager, counter: AlignmentCounter, gene_group_db=False): - # category_sums = np.zeros((len(db.get_categories()), 6)) - # functional_counts = np.zeros(()) - # for rid in counter: - # counts = counter[rid] - # if gene_group_db: - # ggroup_id = rid - # else: - # ref, _ = refmgr.get(rid[0] if isinstance(rid, tuple) else rid) - # ggroup_id = ref - - # region_annotation = db.query_sequence(ggroup_id) - # if region_annotation is not None: - # _, _, region_annotation = region_annotation - # for category_id, features in region_annotation: - # category_sums[int(category_id)] += counts - - - # category_features = dict(region_annotation).get(str(category.id)) - # if category_features is not None: - # category_counts[0] += counts # category row - # for cf in category_features: - # category_counts[category_index.get(int(cf))] += counts + def annotate_gene_counts(self, refmgr, db: AnnotationDatabaseManager, counter: AlignmentCounter, gene_group_db=False): + categories = list(db.get_categories()) + category_sums = np.zeros((len(categories), 6)) + functional_counts = CountMatrix(6) + + for category_id in categories: + features = db.get_features(category_id) + for feature_id in sorted(features): + _ = functional_counts[(category_id, feature_id)] + + for rid in counter: + counts = counter[rid] + if gene_group_db: + ggroup_id = rid + else: + ref, _ = refmgr.get(rid[0] if isinstance(rid, tuple) else rid) + ggroup_id = ref + + region_annotation = db.query_sequence(ggroup_id) + if region_annotation is not None: + _, _, region_annotation = region_annotation + for category_id, features in region_annotation: + category_id = int(category_id) + category_sums[category_id] += counts + for feature_id in features: + feature_id = int(feature_id) + functional_counts[(category_id, feature_id)] += counts[:4] + + for i, category_id in enumerate(categories): + u_sf, c_sf = ( + CountMatrix.calculate_scaling_factor(*category_sums[i][0:2]), + CountMatrix.calculate_scaling_factor(*category_sums[i][2:4]), + ) + + category_id = int(category_id) + + rows = tuple( + key[0] == category_id + for key, _ in functional_counts + ) + + functional_counts.scale_column(1, u_sf, rows=rows) + functional_counts.scale_column(4, c_sf, rows=rows) + + category_sums[i, 2] = category_sums[i, 1] * u_sf + category_sums[i, 5] = category_sums[i, 4] * c_sf + + return functional_counts, category_sums + + + + diff --git a/gffquant/counters/count_matrix.py b/gffquant/counters/count_matrix.py index 620bf165..347af127 100644 --- a/gffquant/counters/count_matrix.py +++ b/gffquant/counters/count_matrix.py @@ -47,6 +47,13 @@ def __iter__(self): def sum(self): return self.counts.sum(axis=0) + def scale_column(self, col_index, factor, rows=None): + # apply scaling factors + if rows is None: + self.counts[:, col_index + 1] = self.counts[:, col_index] * factor + else: + self.counts[rows, col_index + 1] = self.counts[rows, col_index] * factor + def generate_gene_counts(self, lengths): logger.info("LENGTHS ARRAY = %s", lengths.shape) logger.info("INDEX SIZE = %s", len(self.index)) @@ -109,4 +116,11 @@ def group_gene_counts(self, ggroups): return self def colsum(self, col): - return self.counts[:, col].sum() \ No newline at end of file + return self.counts[:, col].sum() + + def get_category(self, category_id): + rows = tuple( + cid == category_id + for (cid, _), _ in self + ) + return self \ No newline at end of file diff --git a/gffquant/profilers/feature_quantifier.py b/gffquant/profilers/feature_quantifier.py index 7142f5ef..e0745ea7 100644 --- a/gffquant/profilers/feature_quantifier.py +++ b/gffquant/profilers/feature_quantifier.py @@ -152,6 +152,29 @@ def process_counters( self.counter.group_gene_count_matrix(self.reference_manager) unannotated_reads = self.counter.get_unannotated_reads() + self.aln_counter["unannotated_ambig"] + functional_counts, category_sums = count_annotator.annotate_gene_counts( + self.reference_manager, + self.adm, + self.counter, + gene_group_db=gene_group_db, + ) + + categories = self.adm.get_categories() + for category, category_sum in zip(categories, category_sums): + feature_names = { + feature.id: feature.name + for feature in self.adm.get_features(category.id) + } + logger.info("PROCESSING CATEGORY=%s", category) + count_writer.write_category2( + category.id, + category.name, + category_sum, + functional_counts, + feature_names, + unannotated_reads=(None, unannotated_reads)[report_unannotated], + ) + # for category, c_counts, c_index, c_names, u_sf, a_sf in count_annotator.annotate( # self.reference_manager, # self.adm, From a47b87e77bea0c8a25ae5a513ef9d980193e85b9 Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Wed, 1 Jan 2025 02:50:06 +0100 Subject: [PATCH 087/128] reactivated feature output --- gffquant/annotation/genecount_annotator.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/gffquant/annotation/genecount_annotator.py b/gffquant/annotation/genecount_annotator.py index af63b220..827ca452 100644 --- a/gffquant/annotation/genecount_annotator.py +++ b/gffquant/annotation/genecount_annotator.py @@ -26,10 +26,10 @@ def annotate_gene_counts(self, refmgr, db: AnnotationDatabaseManager, counter: A category_sums = np.zeros((len(categories), 6)) functional_counts = CountMatrix(6) - for category_id in categories: - features = db.get_features(category_id) - for feature_id in sorted(features): - _ = functional_counts[(category_id, feature_id)] + for category in categories: + features = db.get_features(category.id) + for feature in sorted(features): + _ = functional_counts[(category.id, feature.id)] for rid in counter: counts = counter[rid] @@ -49,16 +49,14 @@ def annotate_gene_counts(self, refmgr, db: AnnotationDatabaseManager, counter: A feature_id = int(feature_id) functional_counts[(category_id, feature_id)] += counts[:4] - for i, category_id in enumerate(categories): + for i, category in enumerate(categories): u_sf, c_sf = ( CountMatrix.calculate_scaling_factor(*category_sums[i][0:2]), CountMatrix.calculate_scaling_factor(*category_sums[i][2:4]), ) - category_id = int(category_id) - rows = tuple( - key[0] == category_id + key[0] == category.id for key, _ in functional_counts ) From 7723314943a52c0759585c6e9c4f075da3fdc48f Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Wed, 1 Jan 2025 02:58:55 +0100 Subject: [PATCH 088/128] reactivated feature output --- gffquant/annotation/genecount_annotator.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gffquant/annotation/genecount_annotator.py b/gffquant/annotation/genecount_annotator.py index 827ca452..69f15481 100644 --- a/gffquant/annotation/genecount_annotator.py +++ b/gffquant/annotation/genecount_annotator.py @@ -27,8 +27,8 @@ def annotate_gene_counts(self, refmgr, db: AnnotationDatabaseManager, counter: A functional_counts = CountMatrix(6) for category in categories: - features = db.get_features(category.id) - for feature in sorted(features): + features = ((feature.id, feature) for feature in db.get_features(category.id)) + for _, feature in sorted(features, key=lambda x:x[0]): _ = functional_counts[(category.id, feature.id)] for rid in counter: From 27e9e386c19e4f26e83590c9024ae0aae4fc45b0 Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Wed, 1 Jan 2025 11:15:53 +0100 Subject: [PATCH 089/128] reactivated feature output --- gffquant/annotation/genecount_annotator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gffquant/annotation/genecount_annotator.py b/gffquant/annotation/genecount_annotator.py index 69f15481..5b47483b 100644 --- a/gffquant/annotation/genecount_annotator.py +++ b/gffquant/annotation/genecount_annotator.py @@ -31,7 +31,7 @@ def annotate_gene_counts(self, refmgr, db: AnnotationDatabaseManager, counter: A for _, feature in sorted(features, key=lambda x:x[0]): _ = functional_counts[(category.id, feature.id)] - for rid in counter: + for rid, counts in counter: counts = counter[rid] if gene_group_db: ggroup_id = rid From 2f938f627943496240e75775438a26c5f693ff46 Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Wed, 1 Jan 2025 11:30:29 +0100 Subject: [PATCH 090/128] reactivated feature output --- gffquant/annotation/genecount_annotator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gffquant/annotation/genecount_annotator.py b/gffquant/annotation/genecount_annotator.py index 5b47483b..64deadaf 100644 --- a/gffquant/annotation/genecount_annotator.py +++ b/gffquant/annotation/genecount_annotator.py @@ -47,7 +47,7 @@ def annotate_gene_counts(self, refmgr, db: AnnotationDatabaseManager, counter: A category_sums[category_id] += counts for feature_id in features: feature_id = int(feature_id) - functional_counts[(category_id, feature_id)] += counts[:4] + functional_counts[(category_id, feature_id)] += counts for i, category in enumerate(categories): u_sf, c_sf = ( From 03a4a9621d7122a6c5250371756aee922fb782f1 Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Wed, 1 Jan 2025 11:53:36 +0100 Subject: [PATCH 091/128] reactivated feature output --- gffquant/annotation/genecount_annotator.py | 2 ++ gffquant/counters/count_matrix.py | 3 +++ 2 files changed, 5 insertions(+) diff --git a/gffquant/annotation/genecount_annotator.py b/gffquant/annotation/genecount_annotator.py index 64deadaf..ec4abb74 100644 --- a/gffquant/annotation/genecount_annotator.py +++ b/gffquant/annotation/genecount_annotator.py @@ -48,6 +48,8 @@ def annotate_gene_counts(self, refmgr, db: AnnotationDatabaseManager, counter: A for feature_id in features: feature_id = int(feature_id) functional_counts[(category_id, feature_id)] += counts + + functional_counts.drop_unindexed() for i, category in enumerate(categories): u_sf, c_sf = ( diff --git a/gffquant/counters/count_matrix.py b/gffquant/counters/count_matrix.py index 347af127..89775b3e 100644 --- a/gffquant/counters/count_matrix.py +++ b/gffquant/counters/count_matrix.py @@ -54,6 +54,9 @@ def scale_column(self, col_index, factor, rows=None): else: self.counts[rows, col_index + 1] = self.counts[rows, col_index] * factor + def drop_unindexed(self): + self.counts = self.counts[0:len(self.index), :] + def generate_gene_counts(self, lengths): logger.info("LENGTHS ARRAY = %s", lengths.shape) logger.info("INDEX SIZE = %s", len(self.index)) From 74c2992cbc01204ff3683f31764816521290e946 Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Wed, 1 Jan 2025 12:11:06 +0100 Subject: [PATCH 092/128] reactivated feature output --- gffquant/profilers/feature_quantifier.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/gffquant/profilers/feature_quantifier.py b/gffquant/profilers/feature_quantifier.py index e0745ea7..08981551 100644 --- a/gffquant/profilers/feature_quantifier.py +++ b/gffquant/profilers/feature_quantifier.py @@ -159,6 +159,9 @@ def process_counters( gene_group_db=gene_group_db, ) + logger.info("FC-index: %s", str(list(functional_counts.index.keys())[:10])) + logger.info("FC-counts: %s", str(functional_counts.counts[0:10,:])) + categories = self.adm.get_categories() for category, category_sum in zip(categories, category_sums): feature_names = { From 26a94d804e01d67a6733108cb19200ffd5d2b8a1 Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Wed, 1 Jan 2025 12:30:26 +0100 Subject: [PATCH 093/128] reactivated feature output --- gffquant/annotation/count_writer.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/gffquant/annotation/count_writer.py b/gffquant/annotation/count_writer.py index b65afa45..23439f59 100644 --- a/gffquant/annotation/count_writer.py +++ b/gffquant/annotation/count_writer.py @@ -148,6 +148,11 @@ def write_category2( if cat_counts is not None: CountWriter.write_row("category", category_sum, stream=feat_out) + for item in counts: + logger.info("ITEM: %s", str(item)) + + + for (cid, fid), fcounts in counts: if (report_unseen or fcounts.sum()) and cid == category_id: CountWriter.write_row(feature_names[fid], fcounts, stream=feat_out,) From fb73a0a82756c79bae7e376374fcf9ca48889f59 Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Wed, 1 Jan 2025 12:58:52 +0100 Subject: [PATCH 094/128] reactivated feature output --- gffquant/annotation/count_writer.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/gffquant/annotation/count_writer.py b/gffquant/annotation/count_writer.py index 23439f59..0ffa00e0 100644 --- a/gffquant/annotation/count_writer.py +++ b/gffquant/annotation/count_writer.py @@ -149,13 +149,18 @@ def write_category2( CountWriter.write_row("category", category_sum, stream=feat_out) for item in counts: - logger.info("ITEM: %s", str(item)) + if not isinstance(item[0], tuple): + logger.info("ITEM: %s", str(item)) + raise TypeError(f"Weird key: {str(item)}") + (cid, fid), fcounts = item + if (report_unseen or fcounts.sum()) and cid == category_id: + CountWriter.write_row(feature_names[fid], fcounts, stream=feat_out,) - for (cid, fid), fcounts in counts: - if (report_unseen or fcounts.sum()) and cid == category_id: - CountWriter.write_row(feature_names[fid], fcounts, stream=feat_out,) + # for (cid, fid), fcounts in counts: + # if (report_unseen or fcounts.sum()) and cid == category_id: + # CountWriter.write_row(feature_names[fid], fcounts, stream=feat_out,) def write_category( From d1a272060b10a8ab499d21bb804102402dfc64cc Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Wed, 1 Jan 2025 14:12:00 +0100 Subject: [PATCH 095/128] reactivated feature output --- gffquant/annotation/count_writer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/gffquant/annotation/count_writer.py b/gffquant/annotation/count_writer.py index 0ffa00e0..6034bb90 100644 --- a/gffquant/annotation/count_writer.py +++ b/gffquant/annotation/count_writer.py @@ -143,7 +143,8 @@ def write_category2( ) if "category" in self.publish_reports: - cat_counts = counts[0] + # cat_counts = counts[0] + cat_counts = category_sum logger.info("CAT %s: %s", category_name, str(cat_counts)) if cat_counts is not None: CountWriter.write_row("category", category_sum, stream=feat_out) From d54e3f096dbcacd60dc401a085e8c0cbc52363dc Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Wed, 1 Jan 2025 14:40:25 +0100 Subject: [PATCH 096/128] reactivated feature output --- gffquant/annotation/genecount_annotator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gffquant/annotation/genecount_annotator.py b/gffquant/annotation/genecount_annotator.py index ec4abb74..9db9fff6 100644 --- a/gffquant/annotation/genecount_annotator.py +++ b/gffquant/annotation/genecount_annotator.py @@ -54,7 +54,7 @@ def annotate_gene_counts(self, refmgr, db: AnnotationDatabaseManager, counter: A for i, category in enumerate(categories): u_sf, c_sf = ( CountMatrix.calculate_scaling_factor(*category_sums[i][0:2]), - CountMatrix.calculate_scaling_factor(*category_sums[i][2:4]), + CountMatrix.calculate_scaling_factor(*category_sums[i][3:5]), ) rows = tuple( From 44a773d72d0b8b55d206292c7874fe36b137e26a Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Wed, 1 Jan 2025 14:42:37 +0100 Subject: [PATCH 097/128] reactivated feature output --- gffquant/annotation/genecount_annotator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gffquant/annotation/genecount_annotator.py b/gffquant/annotation/genecount_annotator.py index 9db9fff6..017c5af8 100644 --- a/gffquant/annotation/genecount_annotator.py +++ b/gffquant/annotation/genecount_annotator.py @@ -27,7 +27,7 @@ def annotate_gene_counts(self, refmgr, db: AnnotationDatabaseManager, counter: A functional_counts = CountMatrix(6) for category in categories: - features = ((feature.id, feature) for feature in db.get_features(category.id)) + features = ((feature.name, feature) for feature in db.get_features(category.id)) for _, feature in sorted(features, key=lambda x:x[0]): _ = functional_counts[(category.id, feature.id)] From 6b06a52f3e492dec430f835fc1ab97715f1827d7 Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Wed, 1 Jan 2025 22:02:04 +0100 Subject: [PATCH 098/128] reactivated feature output --- gffquant/annotation/genecount_annotator.py | 20 +++++++++++--------- gffquant/profilers/feature_quantifier.py | 2 +- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/gffquant/annotation/genecount_annotator.py b/gffquant/annotation/genecount_annotator.py index 017c5af8..964f58bb 100644 --- a/gffquant/annotation/genecount_annotator.py +++ b/gffquant/annotation/genecount_annotator.py @@ -39,15 +39,17 @@ def annotate_gene_counts(self, refmgr, db: AnnotationDatabaseManager, counter: A ref, _ = refmgr.get(rid[0] if isinstance(rid, tuple) else rid) ggroup_id = ref - region_annotation = db.query_sequence(ggroup_id) - if region_annotation is not None: - _, _, region_annotation = region_annotation - for category_id, features in region_annotation: - category_id = int(category_id) - category_sums[category_id] += counts - for feature_id in features: - feature_id = int(feature_id) - functional_counts[(category_id, feature_id)] += counts + with open("GGROUP_DATA.txt", "wt") as _out: + region_annotation = db.query_sequence(ggroup_id) + if region_annotation is not None: + _, _, region_annotation = region_annotation + print(ggroup_id, *(f"{category_id}={features}" for category_id, features in region_annotation), sep="\t", file=_out) + for category_id, features in region_annotation: + category_id = int(category_id) + category_sums[category_id] += counts + for feature_id in features: + feature_id = int(feature_id) + functional_counts[(category_id, feature_id)] += counts functional_counts.drop_unindexed() diff --git a/gffquant/profilers/feature_quantifier.py b/gffquant/profilers/feature_quantifier.py index 08981551..a6323c2b 100644 --- a/gffquant/profilers/feature_quantifier.py +++ b/gffquant/profilers/feature_quantifier.py @@ -168,7 +168,7 @@ def process_counters( feature.id: feature.name for feature in self.adm.get_features(category.id) } - logger.info("PROCESSING CATEGORY=%s", category) + logger.info("PROCESSING CATEGORY=%s", category.name) count_writer.write_category2( category.id, category.name, From a0d1032925c0dd5087445b7904a17a09a0bafc8c Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Wed, 1 Jan 2025 22:24:51 +0100 Subject: [PATCH 099/128] reactivated feature output --- gffquant/annotation/genecount_annotator.py | 21 +++++++++++---------- gffquant/counters/count_matrix.py | 2 +- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/gffquant/annotation/genecount_annotator.py b/gffquant/annotation/genecount_annotator.py index 964f58bb..2dd44278 100644 --- a/gffquant/annotation/genecount_annotator.py +++ b/gffquant/annotation/genecount_annotator.py @@ -31,19 +31,20 @@ def annotate_gene_counts(self, refmgr, db: AnnotationDatabaseManager, counter: A for _, feature in sorted(features, key=lambda x:x[0]): _ = functional_counts[(category.id, feature.id)] - for rid, counts in counter: - counts = counter[rid] - if gene_group_db: - ggroup_id = rid - else: - ref, _ = refmgr.get(rid[0] if isinstance(rid, tuple) else rid) - ggroup_id = ref - - with open("GGROUP_DATA.txt", "wt") as _out: + with open("GGROUP_DATA.txt", "wt") as _out: + + for rid, counts in counter: + counts = counter[rid] + if gene_group_db: + ggroup_id = rid + else: + ref, _ = refmgr.get(rid[0] if isinstance(rid, tuple) else rid) + ggroup_id = ref + region_annotation = db.query_sequence(ggroup_id) if region_annotation is not None: _, _, region_annotation = region_annotation - print(ggroup_id, *(f"{category_id}={features}" for category_id, features in region_annotation), sep="\t", file=_out) + print(ggroup_id, *(f"{category_id}={','.join(features)}" for category_id, features in region_annotation), sep="\t", file=_out) for category_id, features in region_annotation: category_id = int(category_id) category_sums[category_id] += counts diff --git a/gffquant/counters/count_matrix.py b/gffquant/counters/count_matrix.py index 89775b3e..9e56e71d 100644 --- a/gffquant/counters/count_matrix.py +++ b/gffquant/counters/count_matrix.py @@ -25,7 +25,7 @@ def _resize(self): if len(self.index) == nrows: self.counts = np.pad( self.counts, - ((0, nrows * 2), (0, 0),), + ((0, nrows + 1000), (0, 0),), ) return len(self.index) From 8a56e33c553f4cfba5d96d741d2f7b921e1cd527 Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Wed, 1 Jan 2025 23:15:34 +0100 Subject: [PATCH 100/128] pleasing linters, cleanup --- gffquant/annotation/count_writer.py | 97 ++------------ gffquant/annotation/genecount_annotator.py | 119 ++++-------------- gffquant/counters/alignment_counter.py | 139 +-------------------- gffquant/counters/count_matrix.py | 22 ++-- gffquant/counters/region_counter.py | 2 +- gffquant/profilers/feature_quantifier.py | 22 +--- 6 files changed, 53 insertions(+), 348 deletions(-) diff --git a/gffquant/annotation/count_writer.py b/gffquant/annotation/count_writer.py index 6034bb90..fe47a4c7 100644 --- a/gffquant/annotation/count_writer.py +++ b/gffquant/annotation/count_writer.py @@ -110,8 +110,7 @@ def compile_block(raw, lnorm, scaling_factors): def write_row(header, data, stream=sys.stdout): print(header, *(f"{c:.5f}" for c in data), flush=True, sep="\t", file=stream) - - def write_category2( + def write_category( self, category_id, category_name, @@ -149,90 +148,24 @@ def write_category2( if cat_counts is not None: CountWriter.write_row("category", category_sum, stream=feat_out) - for item in counts: - if not isinstance(item[0], tuple): - logger.info("ITEM: %s", str(item)) - raise TypeError(f"Weird key: {str(item)}") - (cid, fid), fcounts = item - if (report_unseen or fcounts.sum()) and cid == category_id: - CountWriter.write_row(feature_names[fid], fcounts, stream=feat_out,) - - - - # for (cid, fid), fcounts in counts: + # for item in counts: + # if not isinstance(item[0], tuple): + # logger.info("ITEM: %s", str(item)) + # raise TypeError(f"Weird key: {str(item)}") + # (cid, fid), fcounts = item # if (report_unseen or fcounts.sum()) and cid == category_id: # CountWriter.write_row(feature_names[fid], fcounts, stream=feat_out,) - - def write_category( - self, - category, - counts, - index, - names, - unique_sf, - ambig_sf, - unannotated_reads=None, - report_unseen=True, - ): - # category, c_counts, c_index, c_names, u_sf, a_sf - if "scaled" in self.publish_reports: - logger.info( - "SCALING FACTORS %s %s %s", - category, unique_sf, ambig_sf, - ) - with gzip.open(f"{self.out_prefix}.{category}.txt.gz", "wt") as feat_out: - header = self.get_header() - print("feature", *header, sep="\t", file=feat_out) - - if unannotated_reads is not None: - print("unannotated", unannotated_reads, sep="\t", file=feat_out) - - if "total_readcount" in self.publish_reports: - CountWriter.write_row( - "total_reads", - np.zeros(len(header)) + self.total_readcount, - stream=feat_out, - ) - - if "filtered_readcount" in self.publish_reports: - CountWriter.write_row( - "filtered_reads", - np.zeros(len(header)) + self.filtered_readcount, - stream=feat_out, - ) - - if "category" in self.publish_reports: - cat_counts = counts[0] - logger.info("CAT %s: %s", category, str(cat_counts)) - if cat_counts is not None: - # cat_row = self.compile_output_row( - # cat_counts, - # scaling_factor=unique_sf, - # ambig_scaling_factor=ambig_sf, - # ) - CountWriter.write_row("category", counts[0], stream=feat_out) - - for fid, i in index.items(): - f_counts = counts[i] - if report_unseen or f_counts.sum(): - # out_row = self.compile_output_row( - # f_counts, - # scaling_factor=unique_sf, - # ambig_scaling_factor=ambig_sf, - # ) - CountWriter.write_row(names[fid], counts[i], stream=feat_out) + for (cid, fid), fcounts in counts: + if (report_unseen or fcounts.sum()) and cid == category_id: + CountWriter.write_row(feature_names[fid], fcounts, stream=feat_out,) def write_gene_counts( self, gene_counts: AlignmentCounter, refmgr, - # uniq_scaling_factor, - # ambig_scaling_factor, - gene_group_db=False + gene_group_db=False, ): - # if "scaled" in self.publish_reports: - # logger.info("SCALING_FACTORS %s %s", uniq_scaling_factor, ambig_scaling_factor) with gzip.open(f"{self.out_prefix}.gene_counts.txt.gz", "wt") as gene_out: print("gene", *self.get_header(), sep="\t", file=gene_out, flush=True) @@ -241,11 +174,10 @@ def write_gene_counts( refmgr.get(rid[0] if isinstance(rid, tuple) else rid)[0], rid, ) - for rid, _ in gene_counts #.get_all_regions() + for rid, _ in gene_counts ) for ref, rid in sorted(ref_stream): - # counts = gene_counts.get_counts(rid) counts = gene_counts[rid] if gene_group_db: ref_tokens = ref.split(".") @@ -253,11 +185,4 @@ def write_gene_counts( else: gene_id = ref - # out_row = self.compile_output_row( - # counts, - # scaling_factor=uniq_scaling_factor, - # ambig_scaling_factor=ambig_scaling_factor, - # ) - - # CountWriter.write_row(gene_id, out_row, stream=gene_out,) CountWriter.write_row(gene_id, counts, stream=gene_out,) diff --git a/gffquant/annotation/genecount_annotator.py b/gffquant/annotation/genecount_annotator.py index 2dd44278..3fbd8a8d 100644 --- a/gffquant/annotation/genecount_annotator.py +++ b/gffquant/annotation/genecount_annotator.py @@ -21,39 +21,42 @@ def __init__(self, strand_specific, report_scaling_factors=True): """ __init__() """ CountAnnotator.__init__(self, strand_specific, report_scaling_factors=report_scaling_factors) - def annotate_gene_counts(self, refmgr, db: AnnotationDatabaseManager, counter: AlignmentCounter, gene_group_db=False): + def annotate_gene_counts( + self, + refmgr, + db: AnnotationDatabaseManager, + counter: AlignmentCounter, + gene_group_db=False + ): categories = list(db.get_categories()) category_sums = np.zeros((len(categories), 6)) functional_counts = CountMatrix(6) for category in categories: features = ((feature.name, feature) for feature in db.get_features(category.id)) - for _, feature in sorted(features, key=lambda x:x[0]): + for _, feature in sorted(features, key=lambda x: x[0]): _ = functional_counts[(category.id, feature.id)] - with open("GGROUP_DATA.txt", "wt") as _out: - - for rid, counts in counter: - counts = counter[rid] - if gene_group_db: - ggroup_id = rid - else: - ref, _ = refmgr.get(rid[0] if isinstance(rid, tuple) else rid) - ggroup_id = ref - - region_annotation = db.query_sequence(ggroup_id) - if region_annotation is not None: - _, _, region_annotation = region_annotation - print(ggroup_id, *(f"{category_id}={','.join(features)}" for category_id, features in region_annotation), sep="\t", file=_out) - for category_id, features in region_annotation: - category_id = int(category_id) - category_sums[category_id] += counts - for feature_id in features: - feature_id = int(feature_id) - functional_counts[(category_id, feature_id)] += counts + for rid, counts in counter: + counts = counter[rid] + if gene_group_db: + ggroup_id = rid + else: + ref, _ = refmgr.get(rid[0] if isinstance(rid, tuple) else rid) + ggroup_id = ref + + region_annotation = db.query_sequence(ggroup_id) + if region_annotation is not None: + _, _, region_annotation = region_annotation + for category_id, features in region_annotation: + category_id = int(category_id) + category_sums[category_id] += counts + for feature_id in features: + feature_id = int(feature_id) + functional_counts[(category_id, feature_id)] += counts functional_counts.drop_unindexed() - + for i, category in enumerate(categories): u_sf, c_sf = ( CountMatrix.calculate_scaling_factor(*category_sums[i][0:2]), @@ -72,73 +75,3 @@ def annotate_gene_counts(self, refmgr, db: AnnotationDatabaseManager, counter: A category_sums[i, 5] = category_sums[i, 4] * c_sf return functional_counts, category_sums - - - - - - - - def annotate(self, refmgr, db: AnnotationDatabaseManager, counter: AlignmentCounter, gene_group_db=False): - """ Annotate a set of gene counts with functional annotations. """ - - for category in db.get_categories(): - features = tuple(db.get_features(category.id)) - category_counts = np.zeros( - (len(features) + 1, 6,), - dtype='float64', - ) - category_index = { - feature.id: i - for i, feature in enumerate(features, start=1) - } - category_names = { - feature.id: feature.name - for feature in features - } - for rid in counter: - counts = counter[rid] - if gene_group_db: - ggroup_id = rid - else: - ref, _ = refmgr.get(rid[0] if isinstance(rid, tuple) else rid) - ggroup_id = ref - - region_annotation = db.query_sequence(ggroup_id) - if region_annotation is not None: - _, _, region_annotation = region_annotation - category_features = dict(region_annotation).get(str(category.id)) - if category_features is not None: - category_counts[0] += counts # category row - for cf in category_features: - category_counts[category_index.get(int(cf))] += counts - - count_sums = category_counts[0] - - # should scaled counts use a factor derived from all counts - # or should multi-feature counts only contribute once? - # pre 2.19 category count scaling was based on total counts - uniq_scaling_factor, combined_scaling_factor = ( - AlignmentCounter.calculate_scaling_factor(*count_sums[0:2]), - AlignmentCounter.calculate_scaling_factor(*count_sums[3:5]), - ) - - # apply scaling factors - category_counts[:, 2] = category_counts[:, 1] * uniq_scaling_factor - category_counts[:, 5] = category_counts[:, 4] * combined_scaling_factor - - logger.info( - "GCA:: %s CATEGORY COUNTS: uraw=%s unorm=%s araw=%s anorm=%s => SF: %s %s", - category.name, - count_sums[0], count_sums[1], count_sums[3], count_sums[4], - uniq_scaling_factor, combined_scaling_factor, - ) - - yield ( - category.name, - category_counts, - category_index, - category_names, - uniq_scaling_factor, - combined_scaling_factor, - ) diff --git a/gffquant/counters/alignment_counter.py b/gffquant/counters/alignment_counter.py index 9ca23e83..b467eb0e 100644 --- a/gffquant/counters/alignment_counter.py +++ b/gffquant/counters/alignment_counter.py @@ -5,8 +5,6 @@ import gzip import logging -from collections import Counter - import numpy as np from .count_matrix import CountMatrix @@ -31,9 +29,10 @@ def normalise_counts(counts, feature_len, scaling_factor): scaled = normalised * scaling_factor return counts, normalised, scaled - def get_increment(self, n_aln, increment): + @staticmethod + def get_increment(n_aln, increment, distribution_mode): # 1overN = lavern. Maya <3 - return (increment / n_aln) if self.distribution_mode == DistributionMode.ONE_OVER_N else increment + return (increment, (increment / n_aln))[distribution_mode == DistributionMode.ONE_OVER_N] def toggle_single_read_handling(self, unmarked_orphans): # precalculate count-increment for single-end, paired-end reads @@ -69,11 +68,6 @@ def __init__( self.increments_auto_detect = (1.0, self.paired_end_count / 2.0,) self.unannotated_reads = 0 - # self.index = {} - # self.counts = np.zeros( - # (AlignmentCounter.INITIAL_SIZE, 2,), - # dtype='float64', - # ) self.counts = CountMatrix(2, nrows=AlignmentCounter.INITIAL_SIZE) def dump(self, prefix, refmgr): @@ -86,39 +80,16 @@ def dump(self, prefix, refmgr): # ref, reflen = refmgr.get(k[0] if isinstance(k, tuple) else k) # print(k, ref, reflen, v, sep="\t", file=_out) - # def get(self, key, default_val): - # key_index = self.index.get(key) - # if key_index is None: - # return Counter() - # return Counter({key: self.counts[key_index]}) - - # def setdefault(self, key, default_val): - # ... - def has_ambig_counts(self): # return bool(self.counts[:, 1].sum() != 0) return bool(self.counts.colsum(1) != 0) def __iter__(self): - # yield from self.index.keys() yield from self.counts def __getitem__(self, key): return self.counts[key] - # def __getitem__(self, key): - # key_index = self.index.get(key) - # if key_index is None: - # return 0.0 - # return self.counts[key_index] - - # def __setitem__(self, key, value): - # key_index = self.index.get(key) - # if key_index is not None: - # self.counts[key_index] = value - # else: - # raise KeyError(f"{key=} not found.") - def update(self, count_stream, ambiguous_counts=False, pair=False, pe_library=None,): if pe_library is not None: # this is the case when the alignment has a read group tag @@ -138,34 +109,7 @@ def update(self, count_stream, ambiguous_counts=False, pair=False, pe_library=No return contributed_counts def get_unannotated_reads(self): - # return self.unannotated_reads return self.counts["c591b65a0f4cd46d5125745a40c8c056"][0] - # no_annotation = self.index.get("c591b65a0f4cd46d5125745a40c8c056") - # if no_annotation is not None: - # return self.counts[no_annotation][0] - # return 0.0 - - # def get_counts(self, seqid, strand_specific=False): - # if strand_specific: - # raise NotImplementedError() - # # uniq_counts, ambig_counts = [0.0, 0.0], [0.0, 0.0] - # # uniq_counts[seqid[1]] = uniq_counter[seqid] - # # ambig_counts[seqid[1]] = ambig_counter[seqid] - - # # rid = seqid[0] if isinstance(seqid, tuple) else seqid - # # uniq_counts = [ - # # uniq_counter[(rid, AlignmentCounter.PLUS_STRAND)], - # # uniq_counter[(rid, AlignmentCounter.MINUS_STRAND)], - # # ] - # # ambig_counts = [ - # # ambig_counter[(rid, AlignmentCounter.PLUS_STRAND)], - # # ambig_counter[(rid, AlignmentCounter.MINUS_STRAND)], - # # ] - # counts = self[seqid] - # return np.array((counts[0], counts[2], counts[1], counts[3])) - - # def get_all_regions(self): - # yield from self def update_counts(self, count_stream, increment=1, ambiguous_counts=False): contributed_counts = 0 @@ -173,7 +117,7 @@ def update_counts(self, count_stream, increment=1, ambiguous_counts=False): hit = hits[0] inc = ( ( - self.get_increment(aln_count, increment), + AlignmentCounter.get_increment(aln_count, increment, self.distribution_mode), increment, ) )[aln_count == 1] @@ -184,17 +128,6 @@ def update_counts(self, count_stream, increment=1, ambiguous_counts=False): ) )[self.strand_specific] - # key_index = self.index.get(key) - # if key_index is None: - # nrows = self.counts.shape[0] - # if len(self.index) == nrows: - # self.counts = np.pad( - # self.counts, - # ((0, AlignmentCounter.INITIAL_SIZE), (0, 0),), - # ) - # # key_index = self.index.setdefault(key, len(self.index)) - # key_index = self.index[key] = len(self.index) - # self.counts[key_index][int(ambiguous_counts)] += inc self.counts[key][int(ambiguous_counts)] += inc contributed_counts += inc @@ -217,51 +150,6 @@ def generate_gene_count_matrix(self, refmgr): return self.counts.sum() - # logger.info("LENGTHS ARRAY = %s", lengths.shape) - # logger.info("INDEX SIZE = %s", len(self.index)) - - # # remove the un-indexed rows - # self.counts = self.counts[0:len(self.index), :] - - # # calculate combined_raw - # self.counts[:, 1:2] += self.counts[:, 0:1] - - # # duplicate the raw counts - # self.counts = np.column_stack( - # #(self.counts, self.counts, self.counts,), - # ( - # self.counts[:, 0], self.counts[:, 0], self.counts[:, 0], # 0, 1, 2 - # self.counts[:, 1], self.counts[:, 1], self.counts[:, 1], # 3, 4, 5 - # ), - # # axis=1, - # ) - - # # length-normalise the lnorm columns - # # self.counts[:, 2:4] /= lengths[:, None] - # self.counts[:, 1::3] /= lengths[:, None] - - # count_sums = self.counts.sum(axis=0) - - # # uniq_scaling_factor = (count_sums[0] / count_sums[2], 1.0)[count_sums[2] == 0] - # # ambig_scaling_factor = (count_sums[1] / count_sums[3], 1.0)[count_sums[3] == 0] - # uniq_scaling_factor, combined_scaling_factor = ( - # AlignmentCounter.calculate_scaling_factor(*count_sums[0:2]), - # AlignmentCounter.calculate_scaling_factor(*count_sums[3:5]), - # ) - - # logger.info( - # "AC:: TOTAL GENE COUNTS: uraw=%s unorm=%s craw=%s cnorm=%s => SF: %s %s", - # count_sums[0], count_sums[1], count_sums[3], count_sums[4], - # uniq_scaling_factor, combined_scaling_factor, - # ) - - # # apply scaling factors - # self.counts[:, 2] = self.counts[:, 1] * uniq_scaling_factor - # self.counts[:, 5] = self.counts[:, 4] * combined_scaling_factor - - # # return count sums and scaling factors - # return count_sums, uniq_scaling_factor, combined_scaling_factor - @staticmethod def calculate_scaling_factor(raw, norm): if norm == 0.0: @@ -276,22 +164,3 @@ def group_gene_count_matrix(self, refmgr): ) self.counts = self.counts.group_gene_counts(ggroups) - - # ggroup_index = {} - # for key, key_index in self.index.items(): - # ref = (refmgr.get(key[0] if isinstance(key, tuple) else key))[0] - # ref_tokens = ref.split(".") - # _, ggroup_id = ".".join(ref_tokens[:-1]), ref_tokens[-1] - # g_key_index = ggroup_index.get(ggroup_id) - # gene_counts = self.counts[key_index] - # if g_key_index is None: - # g_key_index = ggroup_index[ggroup_id] = len(ggroup_index) - # self.counts[g_key_index] = gene_counts - # else: - # self.counts[g_key_index] += gene_counts - - # # replace index with grouped index - # self.index = ggroup_index - - # # remove the un-indexed (ungrouped) rows - # self.counts = self.counts[0:len(self.index), :] diff --git a/gffquant/counters/count_matrix.py b/gffquant/counters/count_matrix.py index 9e56e71d..e9c876cf 100644 --- a/gffquant/counters/count_matrix.py +++ b/gffquant/counters/count_matrix.py @@ -1,3 +1,5 @@ +""" module docstring """ + import logging import numpy as np @@ -5,6 +7,7 @@ logger = logging.getLogger(__name__) + class CountMatrix: @staticmethod @@ -28,16 +31,16 @@ def _resize(self): ((0, nrows + 1000), (0, 0),), ) return len(self.index) - + def __getitem__(self, key): key_index = self.index.get(key) if key_index is None: key_index = self.index[key] = self._resize() return self.counts[key_index] - + def __setitem__(self, key, value): key_index = self.index.get(key) - if key_index is None: + if key_index is None: key_index = self.index[key] = self._resize() self.counts[key_index] = value @@ -98,10 +101,10 @@ def generate_gene_counts(self, lengths): self.counts = counts return self - + def group_gene_counts(self, ggroups): ggroup_index = {} - for (key, key_index), ggroup_id in zip(self.index.items(), ggroups): + for (key, _), ggroup_id in zip(self.index.items(), ggroups): g_key_index = ggroup_index.get(ggroup_id) gene_counts = self.counts[self.index[key]] if g_key_index is None: @@ -117,13 +120,6 @@ def group_gene_counts(self, ggroups): self.counts = self.counts[0:len(self.index), :] return self - + def colsum(self, col): return self.counts[:, col].sum() - - def get_category(self, category_id): - rows = tuple( - cid == category_id - for (cid, _), _ in self - ) - return self \ No newline at end of file diff --git a/gffquant/counters/region_counter.py b/gffquant/counters/region_counter.py index 41f2b574..ffab7718 100644 --- a/gffquant/counters/region_counter.py +++ b/gffquant/counters/region_counter.py @@ -42,7 +42,7 @@ def _update_region(self, region_id, ostart, oend, rev_strand, cstart=None, cend= def update_counts(self, count_stream, increment=1): contributed_counts = 0 for hits, aln_count in count_stream: - inc = increment if aln_count == 1 else self.get_increment(aln_count, increment) + inc = increment if aln_count == 1 else AlignmentCounter.get_increment(aln_count, increment, self.distribution_mode) for hit in hits: self._update_region( hit.rid, hit.start, hit.end, hit.rev_strand, increment=inc, diff --git a/gffquant/profilers/feature_quantifier.py b/gffquant/profilers/feature_quantifier.py index a6323c2b..3e3f7505 100644 --- a/gffquant/profilers/feature_quantifier.py +++ b/gffquant/profilers/feature_quantifier.py @@ -145,7 +145,6 @@ def process_counters( count_writer.write_gene_counts( self.counter, self.reference_manager, - # u_sf, a_sf, gene_group_db=gene_group_db, ) @@ -160,7 +159,7 @@ def process_counters( ) logger.info("FC-index: %s", str(list(functional_counts.index.keys())[:10])) - logger.info("FC-counts: %s", str(functional_counts.counts[0:10,:])) + logger.info("FC-counts: %s", str(functional_counts.counts[0:10, :])) categories = self.adm.get_categories() for category, category_sum in zip(categories, category_sums): @@ -169,7 +168,7 @@ def process_counters( for feature in self.adm.get_features(category.id) } logger.info("PROCESSING CATEGORY=%s", category.name) - count_writer.write_category2( + count_writer.write_category( category.id, category.name, category_sum, @@ -178,23 +177,6 @@ def process_counters( unannotated_reads=(None, unannotated_reads)[report_unannotated], ) - # for category, c_counts, c_index, c_names, u_sf, a_sf in count_annotator.annotate( - # self.reference_manager, - # self.adm, - # self.counter, - # gene_group_db=gene_group_db, - # ): - # logger.info("PROCESSING CATEGORY=%s", category) - # count_writer.write_category( - # category, - # c_counts, - # c_index, - # c_names, - # u_sf, - # a_sf, - # unannotated_reads=(None, unannotated_reads)[report_unannotated], - # ) - self.adm.clear_caches() def register_reference(self, rid, aln_reader): From 98c6cf4e02c0d77a92d34baea99f06cf29cbfacf Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Thu, 2 Jan 2025 02:07:53 +0100 Subject: [PATCH 101/128] trying to reduce memory footprint --- gffquant/annotation/genecount_annotator.py | 8 +++--- gffquant/counters/count_matrix.py | 33 ++++++++++++++++++---- gffquant/profilers/feature_quantifier.py | 27 ++++++++++++++++-- 3 files changed, 56 insertions(+), 12 deletions(-) diff --git a/gffquant/annotation/genecount_annotator.py b/gffquant/annotation/genecount_annotator.py index 3fbd8a8d..4ae066c4 100644 --- a/gffquant/annotation/genecount_annotator.py +++ b/gffquant/annotation/genecount_annotator.py @@ -32,10 +32,10 @@ def annotate_gene_counts( category_sums = np.zeros((len(categories), 6)) functional_counts = CountMatrix(6) - for category in categories: - features = ((feature.name, feature) for feature in db.get_features(category.id)) - for _, feature in sorted(features, key=lambda x: x[0]): - _ = functional_counts[(category.id, feature.id)] + # for category in categories: + # features = ((feature.name, feature) for feature in db.get_features(category.id)) + # for _, feature in sorted(features, key=lambda x: x[0]): + # _ = functional_counts[(category.id, feature.id)] for rid, counts in counter: counts = counter[rid] diff --git a/gffquant/counters/count_matrix.py b/gffquant/counters/count_matrix.py index e9c876cf..6fb098e3 100644 --- a/gffquant/counters/count_matrix.py +++ b/gffquant/counters/count_matrix.py @@ -10,18 +10,39 @@ class CountMatrix: + @classmethod + def from_count_matrix(cls, cmatrix, rows=None): + if rows is None: + counts = np.array(cmatrix.counts) + index = dict(counts.index.items()) + else: + counts = cmatrix.counts[rows, :] + index = { + key: value + for (key, value), keep in zip(counts.index.items(), rows) + if keep + } + return cls(index=index, counts=counts) + @staticmethod def calculate_scaling_factor(raw, norm): if norm == 0.0: return 1.0 return raw / norm - def __init__(self, ncols, nrows=1000): - self.index = {} - self.counts = np.zeros( - (nrows, ncols,), - dtype='float64', - ) + def __init__(self, ncols=2, nrows=1000, index=None, counts=None,): + if index is not None and counts is not None: + self.index = dict(index.items()) + self.counts = counts + else: + self.index = {} + self.counts = np.zeros( + (nrows, ncols,), + dtype='float64', + ) + + def has_record(self, key): + return self.index.get(key) is not None def _resize(self): nrows = self.counts.shape[0] diff --git a/gffquant/profilers/feature_quantifier.py b/gffquant/profilers/feature_quantifier.py index 3e3f7505..b8855bb2 100644 --- a/gffquant/profilers/feature_quantifier.py +++ b/gffquant/profilers/feature_quantifier.py @@ -15,6 +15,7 @@ from ..alignment import AlignmentGroup, AlignmentProcessor, ReferenceHit, SamFlags from ..annotation import GeneCountAnnotator, RegionCountAnnotator, CountWriter from ..counters import AlignmentCounter +from ..counters.count_matrix import CountMatrix from ..db.annotation_db import AnnotationDatabaseManager from .. import __tool__, DistributionMode, RunMode @@ -163,16 +164,38 @@ def process_counters( categories = self.adm.get_categories() for category, category_sum in zip(categories, category_sums): + features = tuple(self.adm.get_features(category.id)) feature_names = { feature.id: feature.name - for feature in self.adm.get_features(category.id) + for feature in features } + # rows = tuple( + # key[0] == category.id + # for key, _ in functional_counts + # ) + + # cat_counts = CountMatrix.from_count_matrix(functional_counts, rows=rows) + cat_counts = CountMatrix(ncols=6, nrows=len(feature_names)) + for feature in features: + key = (category.id, feature.id) + if functional_counts.has_record(key): + cat_counts[key] += functional_counts[key] + else: + _ = cat_counts[key] + + # for category in categories: + # features = ((feature.name, feature) for feature in db.get_features(category.id)) + # for _, feature in sorted(features, key=lambda x: x[0]): + # _ = functional_counts[(category.id, feature.id)] + + logger.info("PROCESSING CATEGORY=%s", category.name) count_writer.write_category( category.id, category.name, category_sum, - functional_counts, + # functional_counts, + cat_counts, feature_names, unannotated_reads=(None, unannotated_reads)[report_unannotated], ) From 7fa8339c4ee511b39e3c21b7e5bfe57659f31d9e Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Thu, 2 Jan 2025 03:01:23 +0100 Subject: [PATCH 102/128] trying to reduce memory footprint --- gffquant/annotation/count_writer.py | 20 ++++++++++++---- gffquant/annotation/genecount_annotator.py | 2 +- gffquant/profilers/feature_quantifier.py | 27 +++++++++++----------- 3 files changed, 31 insertions(+), 18 deletions(-) diff --git a/gffquant/annotation/count_writer.py b/gffquant/annotation/count_writer.py index fe47a4c7..e1980ad1 100644 --- a/gffquant/annotation/count_writer.py +++ b/gffquant/annotation/count_writer.py @@ -116,7 +116,8 @@ def write_category( category_name, category_sum, counts, - feature_names, + # feature_names, + features, unannotated_reads=None, report_unseen=True, ): @@ -156,9 +157,20 @@ def write_category( # if (report_unseen or fcounts.sum()) and cid == category_id: # CountWriter.write_row(feature_names[fid], fcounts, stream=feat_out,) - for (cid, fid), fcounts in counts: - if (report_unseen or fcounts.sum()) and cid == category_id: - CountWriter.write_row(feature_names[fid], fcounts, stream=feat_out,) + empty_row = np.zeros((1, 6), dtype='float64') + for feature in features: + key = (category_id, feature.id) + if counts.has_record(key): + row = counts[key] + else: + row = empty_row + if (report_unseen or row.sum()): + CountWriter.write_row(feature.name, row, stream=feat_out,) + + + # for (cid, fid), fcounts in counts: + # if (report_unseen or fcounts.sum()) and cid == category_id: + # CountWriter.write_row(feature_names[fid], fcounts, stream=feat_out,) def write_gene_counts( self, diff --git a/gffquant/annotation/genecount_annotator.py b/gffquant/annotation/genecount_annotator.py index 4ae066c4..c7697536 100644 --- a/gffquant/annotation/genecount_annotator.py +++ b/gffquant/annotation/genecount_annotator.py @@ -38,7 +38,7 @@ def annotate_gene_counts( # _ = functional_counts[(category.id, feature.id)] for rid, counts in counter: - counts = counter[rid] + # counts = counter[rid] if gene_group_db: ggroup_id = rid else: diff --git a/gffquant/profilers/feature_quantifier.py b/gffquant/profilers/feature_quantifier.py index b8855bb2..7ac6433d 100644 --- a/gffquant/profilers/feature_quantifier.py +++ b/gffquant/profilers/feature_quantifier.py @@ -169,19 +169,19 @@ def process_counters( feature.id: feature.name for feature in features } - # rows = tuple( - # key[0] == category.id - # for key, _ in functional_counts - # ) + rows = tuple( + key[0] == category.id + for key, _ in functional_counts + ) - # cat_counts = CountMatrix.from_count_matrix(functional_counts, rows=rows) - cat_counts = CountMatrix(ncols=6, nrows=len(feature_names)) - for feature in features: - key = (category.id, feature.id) - if functional_counts.has_record(key): - cat_counts[key] += functional_counts[key] - else: - _ = cat_counts[key] + cat_counts = CountMatrix.from_count_matrix(functional_counts, rows=rows) + # cat_counts = CountMatrix(ncols=6, nrows=len(feature_names)) + # for feature in features: + # key = (category.id, feature.id) + # if functional_counts.has_record(key): + # cat_counts[key] += functional_counts[key] + # else: + # _ = cat_counts[key] # for category in categories: # features = ((feature.name, feature) for feature in db.get_features(category.id)) @@ -196,7 +196,8 @@ def process_counters( category_sum, # functional_counts, cat_counts, - feature_names, + # feature_names, + features, unannotated_reads=(None, unannotated_reads)[report_unannotated], ) From 3fde96f31e4b5ec38c34d44ee01843691cac9527 Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Thu, 2 Jan 2025 03:21:20 +0100 Subject: [PATCH 103/128] trying to reduce memory footprint --- gffquant/counters/count_matrix.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gffquant/counters/count_matrix.py b/gffquant/counters/count_matrix.py index 6fb098e3..92f0af74 100644 --- a/gffquant/counters/count_matrix.py +++ b/gffquant/counters/count_matrix.py @@ -14,12 +14,12 @@ class CountMatrix: def from_count_matrix(cls, cmatrix, rows=None): if rows is None: counts = np.array(cmatrix.counts) - index = dict(counts.index.items()) + index = dict(cmatrix.index.items()) else: counts = cmatrix.counts[rows, :] index = { key: value - for (key, value), keep in zip(counts.index.items(), rows) + for (key, value), keep in zip(cmatrix.index.items(), rows) if keep } return cls(index=index, counts=counts) From 64845ec0e9755ea8a0a30747aa44371826529897 Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Thu, 2 Jan 2025 03:34:37 +0100 Subject: [PATCH 104/128] trying to reduce memory footprint --- gffquant/counters/count_matrix.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/gffquant/counters/count_matrix.py b/gffquant/counters/count_matrix.py index 92f0af74..6b573b8d 100644 --- a/gffquant/counters/count_matrix.py +++ b/gffquant/counters/count_matrix.py @@ -17,11 +17,15 @@ def from_count_matrix(cls, cmatrix, rows=None): index = dict(cmatrix.index.items()) else: counts = cmatrix.counts[rows, :] - index = { - key: value - for (key, value), keep in zip(cmatrix.index.items(), rows) - if keep - } + index = {} + for (key, _), keep in zip(cmatrix.index.items(), rows): + if keep: + index[key] = len(index) + # index = { + # key: value + # for (key, value), keep in zip(cmatrix.index.items(), rows) + # if keep + # } return cls(index=index, counts=counts) @staticmethod From 4745be954f3df1e319f910a4688a1c8a0f669495 Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Thu, 2 Jan 2025 03:45:10 +0100 Subject: [PATCH 105/128] trying to reduce memory footprint --- gffquant/annotation/count_writer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gffquant/annotation/count_writer.py b/gffquant/annotation/count_writer.py index e1980ad1..b54e9ab2 100644 --- a/gffquant/annotation/count_writer.py +++ b/gffquant/annotation/count_writer.py @@ -157,7 +157,7 @@ def write_category( # if (report_unseen or fcounts.sum()) and cid == category_id: # CountWriter.write_row(feature_names[fid], fcounts, stream=feat_out,) - empty_row = np.zeros((1, 6), dtype='float64') + empty_row = np.zeros(6, dtype='float64') for feature in features: key = (category_id, feature.id) if counts.has_record(key): From fa19a80a19866bae9abd7b997bba01e80cf4133b Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Thu, 2 Jan 2025 10:24:35 +0100 Subject: [PATCH 106/128] trying to reduce memory footprint --- gffquant/annotation/count_writer.py | 3 ++- gffquant/counters/count_matrix.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/gffquant/annotation/count_writer.py b/gffquant/annotation/count_writer.py index b54e9ab2..1e222689 100644 --- a/gffquant/annotation/count_writer.py +++ b/gffquant/annotation/count_writer.py @@ -9,6 +9,7 @@ import numpy as np from ..counters import AlignmentCounter +from ..counters.count_matrix import CountMatrix logger = logging.getLogger(__name__) @@ -157,7 +158,7 @@ def write_category( # if (report_unseen or fcounts.sum()) and cid == category_id: # CountWriter.write_row(feature_names[fid], fcounts, stream=feat_out,) - empty_row = np.zeros(6, dtype='float64') + empty_row = np.zeros(6, dtype=CountMatrix.NUMPY_DTYPE) for feature in features: key = (category_id, feature.id) if counts.has_record(key): diff --git a/gffquant/counters/count_matrix.py b/gffquant/counters/count_matrix.py index 6b573b8d..36651bb6 100644 --- a/gffquant/counters/count_matrix.py +++ b/gffquant/counters/count_matrix.py @@ -9,6 +9,7 @@ class CountMatrix: + NUMPY_DTYPE = 'float32' @classmethod def from_count_matrix(cls, cmatrix, rows=None): @@ -42,7 +43,7 @@ def __init__(self, ncols=2, nrows=1000, index=None, counts=None,): self.index = {} self.counts = np.zeros( (nrows, ncols,), - dtype='float64', + dtype=CountMatrix.NUMPY_DTYPE, ) def has_record(self, key): From 54b88a55292da47e806e996eedf89dfcd339cc09 Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Thu, 2 Jan 2025 11:46:41 +0100 Subject: [PATCH 107/128] trying to reduce memory footprint --- gffquant/counters/count_matrix.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gffquant/counters/count_matrix.py b/gffquant/counters/count_matrix.py index 36651bb6..91ccaf70 100644 --- a/gffquant/counters/count_matrix.py +++ b/gffquant/counters/count_matrix.py @@ -9,7 +9,7 @@ class CountMatrix: - NUMPY_DTYPE = 'float32' + NUMPY_DTYPE = 'float16' @classmethod def from_count_matrix(cls, cmatrix, rows=None): From 3d4a4f5ca00b25dbd7c688f3391bf8ce481ae80a Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Thu, 2 Jan 2025 12:54:35 +0100 Subject: [PATCH 108/128] trying to reduce memory footprint --- gffquant/counters/count_matrix.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gffquant/counters/count_matrix.py b/gffquant/counters/count_matrix.py index 91ccaf70..032774b2 100644 --- a/gffquant/counters/count_matrix.py +++ b/gffquant/counters/count_matrix.py @@ -9,7 +9,7 @@ class CountMatrix: - NUMPY_DTYPE = 'float16' + NUMPY_DTYPE = 'float64' # float16 causes some overflow issue during testing @classmethod def from_count_matrix(cls, cmatrix, rows=None): From 2ffa0c5dad11392364f171af21618930f632d899 Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Fri, 3 Jan 2025 00:02:25 +0100 Subject: [PATCH 109/128] trying category-wise processing --- gffquant/profilers/feature_quantifier.py | 119 ++++++++++++++++------- 1 file changed, 83 insertions(+), 36 deletions(-) diff --git a/gffquant/profilers/feature_quantifier.py b/gffquant/profilers/feature_quantifier.py index 7ac6433d..a57a4c02 100644 --- a/gffquant/profilers/feature_quantifier.py +++ b/gffquant/profilers/feature_quantifier.py @@ -11,6 +11,8 @@ from abc import ABC from collections import Counter +import numpy as np + from .panda_coverage_profiler import PandaCoverageProfiler from ..alignment import AlignmentGroup, AlignmentProcessor, ReferenceHit, SamFlags from ..annotation import GeneCountAnnotator, RegionCountAnnotator, CountWriter @@ -152,54 +154,99 @@ def process_counters( self.counter.group_gene_count_matrix(self.reference_manager) unannotated_reads = self.counter.get_unannotated_reads() + self.aln_counter["unannotated_ambig"] - functional_counts, category_sums = count_annotator.annotate_gene_counts( - self.reference_manager, - self.adm, - self.counter, - gene_group_db=gene_group_db, - ) - - logger.info("FC-index: %s", str(list(functional_counts.index.keys())[:10])) - logger.info("FC-counts: %s", str(functional_counts.counts[0:10, :])) categories = self.adm.get_categories() - for category, category_sum in zip(categories, category_sums): - features = tuple(self.adm.get_features(category.id)) - feature_names = { - feature.id: feature.name - for feature in features - } - rows = tuple( - key[0] == category.id - for key, _ in functional_counts - ) + category_sum = np.array(6) - cat_counts = CountMatrix.from_count_matrix(functional_counts, rows=rows) - # cat_counts = CountMatrix(ncols=6, nrows=len(feature_names)) - # for feature in features: - # key = (category.id, feature.id) - # if functional_counts.has_record(key): - # cat_counts[key] += functional_counts[key] - # else: - # _ = cat_counts[key] - - # for category in categories: - # features = ((feature.name, feature) for feature in db.get_features(category.id)) - # for _, feature in sorted(features, key=lambda x: x[0]): - # _ = functional_counts[(category.id, feature.id)] + for category in categories: + logger.info("PROCESSING CATEGORY=%s", category.name) + category_counts = CountMatrix(ncols=6) + for rid, counts in self.counter: + if gene_group_db: + ggroup_id = rid + else: + ref, _ = self.reference_manager.get(rid[0] if isinstance(rid, tuple) else rid) + ggroup_id = ref + + region_annotation = self.adm.query_sequence(ggroup_id) + if region_annotation is not None: + _, _, region_annotation = region_annotation + for category_id, features in region_annotation: + if int(category_id) == category.id: + category_sum += counts + for feature_id in features: + category_counts[(category.id, int(feature_id))] += counts + break + + u_sf, c_sf = ( + CountMatrix.calculate_scaling_factor(*category_sum[0:2]), + CountMatrix.calculate_scaling_factor(*category_sum[3:5]), + ) + category_counts.scale_column(1, u_sf) + category_counts.scale_column(4, c_sf) - logger.info("PROCESSING CATEGORY=%s", category.name) + features = tuple(self.adm.get_features(category.id)) count_writer.write_category( category.id, category.name, category_sum, - # functional_counts, - cat_counts, - # feature_names, + category_counts, features, unannotated_reads=(None, unannotated_reads)[report_unannotated], ) + + + + + # functional_counts, category_sums = count_annotator.annotate_gene_counts( + # self.reference_manager, + # self.adm, + # self.counter, + # gene_group_db=gene_group_db, + # ) + + # logger.info("FC-index: %s", str(list(functional_counts.index.keys())[:10])) + # logger.info("FC-counts: %s", str(functional_counts.counts[0:10, :])) + + # categories = self.adm.get_categories() + # for category, category_sum in zip(categories, category_sums): + # features = tuple(self.adm.get_features(category.id)) + # feature_names = { + # feature.id: feature.name + # for feature in features + # } + # rows = tuple( + # key[0] == category.id + # for key, _ in functional_counts + # ) + + # cat_counts = CountMatrix.from_count_matrix(functional_counts, rows=rows) + # # cat_counts = CountMatrix(ncols=6, nrows=len(feature_names)) + # # for feature in features: + # # key = (category.id, feature.id) + # # if functional_counts.has_record(key): + # # cat_counts[key] += functional_counts[key] + # # else: + # # _ = cat_counts[key] + + # # for category in categories: + # # features = ((feature.name, feature) for feature in db.get_features(category.id)) + # # for _, feature in sorted(features, key=lambda x: x[0]): + # # _ = functional_counts[(category.id, feature.id)] + + + # logger.info("PROCESSING CATEGORY=%s", category.name) + # count_writer.write_category( + # category.id, + # category.name, + # category_sum, + # # functional_counts, + # cat_counts, + # # feature_names, + # features, + # unannotated_reads=(None, unannotated_reads)[report_unannotated], + # ) self.adm.clear_caches() From 688a081deaa4a485ab791d6b21d5268beb3a3d89 Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Fri, 3 Jan 2025 00:12:34 +0100 Subject: [PATCH 110/128] trying category-wise processing --- gffquant/profilers/feature_quantifier.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gffquant/profilers/feature_quantifier.py b/gffquant/profilers/feature_quantifier.py index a57a4c02..04251c2d 100644 --- a/gffquant/profilers/feature_quantifier.py +++ b/gffquant/profilers/feature_quantifier.py @@ -156,7 +156,7 @@ def process_counters( categories = self.adm.get_categories() - category_sum = np.array(6) + category_sum = np.array(6, dtype='float64') for category in categories: logger.info("PROCESSING CATEGORY=%s", category.name) From 64f814931c2fecce5777530debc7bd84bf8cc3b1 Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Fri, 3 Jan 2025 00:27:37 +0100 Subject: [PATCH 111/128] trying category-wise processing --- gffquant/profilers/feature_quantifier.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gffquant/profilers/feature_quantifier.py b/gffquant/profilers/feature_quantifier.py index 04251c2d..3af4ba2c 100644 --- a/gffquant/profilers/feature_quantifier.py +++ b/gffquant/profilers/feature_quantifier.py @@ -156,10 +156,10 @@ def process_counters( categories = self.adm.get_categories() - category_sum = np.array(6, dtype='float64') for category in categories: logger.info("PROCESSING CATEGORY=%s", category.name) + category_sum = np.zeros(6, dtype='float64') category_counts = CountMatrix(ncols=6) for rid, counts in self.counter: if gene_group_db: From f7c16c3d7d5b7f47e4b99a6ff6721e6641396959 Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Fri, 3 Jan 2025 14:03:36 +0100 Subject: [PATCH 112/128] truncated unannotated hash --- gffquant/counters/alignment_counter.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/gffquant/counters/alignment_counter.py b/gffquant/counters/alignment_counter.py index b467eb0e..5884c76a 100644 --- a/gffquant/counters/alignment_counter.py +++ b/gffquant/counters/alignment_counter.py @@ -109,7 +109,8 @@ def update(self, count_stream, ambiguous_counts=False, pair=False, pe_library=No return contributed_counts def get_unannotated_reads(self): - return self.counts["c591b65a0f4cd46d5125745a40c8c056"][0] + # return self.counts["c591b65a0f4cd46d5125745a40c8c056"][0] + return self.counts["c591b65a0f4cd"][0] def update_counts(self, count_stream, increment=1, ambiguous_counts=False): contributed_counts = 0 From 2fe0bdc1a1ba48efe924e525117b17d0dced3b6e Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Tue, 7 Jan 2025 12:10:38 +0100 Subject: [PATCH 113/128] making adjustments for new db format --- gffquant/annotation/count_writer.py | 11 ++++++----- gffquant/counters/alignment_counter.py | 5 +++-- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/gffquant/annotation/count_writer.py b/gffquant/annotation/count_writer.py index 1e222689..155bac39 100644 --- a/gffquant/annotation/count_writer.py +++ b/gffquant/annotation/count_writer.py @@ -192,10 +192,11 @@ def write_gene_counts( for ref, rid in sorted(ref_stream): counts = gene_counts[rid] - if gene_group_db: - ref_tokens = ref.split(".") - gene_id, _ = ".".join(ref_tokens[:-1]), ref_tokens[-1] - else: - gene_id = ref + # if gene_group_db: + # ref_tokens = ref.split(".") + # gene_id, _ = ".".join(ref_tokens[:-1]), ref_tokens[-1] + # else: + # gene_id = ref + gene_id = ref CountWriter.write_row(gene_id, counts, stream=gene_out,) diff --git a/gffquant/counters/alignment_counter.py b/gffquant/counters/alignment_counter.py index 5884c76a..3f716955 100644 --- a/gffquant/counters/alignment_counter.py +++ b/gffquant/counters/alignment_counter.py @@ -110,7 +110,8 @@ def update(self, count_stream, ambiguous_counts=False, pair=False, pe_library=No def get_unannotated_reads(self): # return self.counts["c591b65a0f4cd46d5125745a40c8c056"][0] - return self.counts["c591b65a0f4cd"][0] + # return self.counts["c591b65a0f4cd"][0] + return self.counts["00000000"][0] def update_counts(self, count_stream, increment=1, ambiguous_counts=False): contributed_counts = 0 @@ -160,7 +161,7 @@ def calculate_scaling_factor(raw, norm): def group_gene_count_matrix(self, refmgr): ggroups = ( - (refmgr.get(key[0] if isinstance(key, tuple) else key))[0].split(".")[-1] + (refmgr.get(key[0] if isinstance(key, tuple) else key))[0].split(".")[0] for key, _ in self.counts ) From 9a0f88805380e8123757a18296be2d4bd2ba03a2 Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Tue, 7 Jan 2025 13:18:10 +0100 Subject: [PATCH 114/128] making adjustments for new db format --- gffquant/counters/count_matrix.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/gffquant/counters/count_matrix.py b/gffquant/counters/count_matrix.py index 032774b2..602c352e 100644 --- a/gffquant/counters/count_matrix.py +++ b/gffquant/counters/count_matrix.py @@ -130,9 +130,12 @@ def generate_gene_counts(self, lengths): def group_gene_counts(self, ggroups): ggroup_index = {} - for (key, _), ggroup_id in zip(self.index.items(), ggroups): + # for (key, _), ggroup_id in zip(self.index.items(), ggroups): + # g_key_index = ggroup_index.get(ggroup_id) + # gene_counts = self.counts[self.index[key]] + for gene_id, gene_counts in self: + ggroup_id = gene_id.split(".")[-1] g_key_index = ggroup_index.get(ggroup_id) - gene_counts = self.counts[self.index[key]] if g_key_index is None: g_key_index = ggroup_index[ggroup_id] = len(ggroup_index) self.counts[g_key_index] = gene_counts From b51b48ddc0236b745c3b1d2593b7a11d6ea2b0ea Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Tue, 7 Jan 2025 13:27:28 +0100 Subject: [PATCH 115/128] making adjustments for new db format --- gffquant/counters/count_matrix.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/gffquant/counters/count_matrix.py b/gffquant/counters/count_matrix.py index 602c352e..ab1e450f 100644 --- a/gffquant/counters/count_matrix.py +++ b/gffquant/counters/count_matrix.py @@ -130,12 +130,12 @@ def generate_gene_counts(self, lengths): def group_gene_counts(self, ggroups): ggroup_index = {} - # for (key, _), ggroup_id in zip(self.index.items(), ggroups): + # for gene_id, gene_counts in self: + # ggroup_id = gene_id.split(".")[-1] # g_key_index = ggroup_index.get(ggroup_id) - # gene_counts = self.counts[self.index[key]] - for gene_id, gene_counts in self: - ggroup_id = gene_id.split(".")[-1] + for (_, gene_counts), ggroup_id in zip(self, ggroups): g_key_index = ggroup_index.get(ggroup_id) + # gene_counts = self.counts[self.index[key]] if g_key_index is None: g_key_index = ggroup_index[ggroup_id] = len(ggroup_index) self.counts[g_key_index] = gene_counts From 0e1edae4785ab0f5cc0bf682fcd42f1eac72cadf Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Tue, 7 Jan 2025 13:55:09 +0100 Subject: [PATCH 116/128] making adjustments for new db format --- gffquant/counters/count_matrix.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/gffquant/counters/count_matrix.py b/gffquant/counters/count_matrix.py index ab1e450f..88b64db2 100644 --- a/gffquant/counters/count_matrix.py +++ b/gffquant/counters/count_matrix.py @@ -139,8 +139,10 @@ def group_gene_counts(self, ggroups): if g_key_index is None: g_key_index = ggroup_index[ggroup_id] = len(ggroup_index) self.counts[g_key_index] = gene_counts + logger.info("CM.group_gene_counts: Adding %s to new group %s (%s).", str(gene_counts), ggroup_id, g_key_index) else: self.counts[g_key_index] += gene_counts + logger.info("CM.group_gene_counts: Adding %s to group %s (%s).", str(gene_counts), ggroup_id, g_key_index) # replace index with grouped index self.index = ggroup_index From c065163da196659bdcb2201c61f44dd775b79578 Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Tue, 7 Jan 2025 16:00:28 +0100 Subject: [PATCH 117/128] making adjustments for new db format --- gffquant/counters/count_matrix.py | 4 ++-- gffquant/profilers/feature_quantifier.py | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/gffquant/counters/count_matrix.py b/gffquant/counters/count_matrix.py index 88b64db2..1963c812 100644 --- a/gffquant/counters/count_matrix.py +++ b/gffquant/counters/count_matrix.py @@ -139,10 +139,10 @@ def group_gene_counts(self, ggroups): if g_key_index is None: g_key_index = ggroup_index[ggroup_id] = len(ggroup_index) self.counts[g_key_index] = gene_counts - logger.info("CM.group_gene_counts: Adding %s to new group %s (%s).", str(gene_counts), ggroup_id, g_key_index) + # logger.info("CM.group_gene_counts: Adding %s to new group %s (%s).", str(gene_counts), ggroup_id, g_key_index) else: self.counts[g_key_index] += gene_counts - logger.info("CM.group_gene_counts: Adding %s to group %s (%s).", str(gene_counts), ggroup_id, g_key_index) + # logger.info("CM.group_gene_counts: Adding %s to group %s (%s).", str(gene_counts), ggroup_id, g_key_index) # replace index with grouped index self.index = ggroup_index diff --git a/gffquant/profilers/feature_quantifier.py b/gffquant/profilers/feature_quantifier.py index 3af4ba2c..c590cc69 100644 --- a/gffquant/profilers/feature_quantifier.py +++ b/gffquant/profilers/feature_quantifier.py @@ -164,6 +164,7 @@ def process_counters( for rid, counts in self.counter: if gene_group_db: ggroup_id = rid + logger.info("GGROUP %s: %s", ggroup_id, str(counts)) else: ref, _ = self.reference_manager.get(rid[0] if isinstance(rid, tuple) else rid) ggroup_id = ref From 21d42644e736870a8630289afdcfaf31f54b29a3 Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Fri, 10 Jan 2025 10:43:05 +0100 Subject: [PATCH 118/128] added count matrix state dump --- gffquant/counters/count_matrix.py | 6 ++++++ gffquant/profilers/feature_quantifier.py | 6 ++++++ 2 files changed, 12 insertions(+) diff --git a/gffquant/counters/count_matrix.py b/gffquant/counters/count_matrix.py index 1963c812..ad3150f5 100644 --- a/gffquant/counters/count_matrix.py +++ b/gffquant/counters/count_matrix.py @@ -127,6 +127,12 @@ def generate_gene_counts(self, lengths): self.counts = counts return self + + def dump(self, state="genes"): + with open(f"CountMatrix.{state}.txt", "wt") as _out: + for index, counts in self: + print(index, *counts, sep="\t", file=_out) + def group_gene_counts(self, ggroups): ggroup_index = {} diff --git a/gffquant/profilers/feature_quantifier.py b/gffquant/profilers/feature_quantifier.py index c590cc69..ad5dd242 100644 --- a/gffquant/profilers/feature_quantifier.py +++ b/gffquant/profilers/feature_quantifier.py @@ -151,9 +151,12 @@ def process_counters( gene_group_db=gene_group_db, ) + self.counter.counts.dump() + self.counter.group_gene_count_matrix(self.reference_manager) unannotated_reads = self.counter.get_unannotated_reads() + self.aln_counter["unannotated_ambig"] + self.counter.counts.dump(state="ggroup") categories = self.adm.get_categories() @@ -187,6 +190,9 @@ def process_counters( category_counts.scale_column(1, u_sf) category_counts.scale_column(4, c_sf) + category_sum[2] = category_sum[1] / u_sf + category_sum[5] = category_sum[4] / c_sf + features = tuple(self.adm.get_features(category.id)) count_writer.write_category( category.id, From 929e2116c79c09a2bd5930ecd329ef2a4389d66f Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Fri, 10 Jan 2025 11:03:16 +0100 Subject: [PATCH 119/128] added count matrix state dump --- gffquant/counters/count_matrix.py | 9 ++++++--- gffquant/profilers/feature_quantifier.py | 8 +++++++- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/gffquant/counters/count_matrix.py b/gffquant/counters/count_matrix.py index ad3150f5..0831d111 100644 --- a/gffquant/counters/count_matrix.py +++ b/gffquant/counters/count_matrix.py @@ -128,10 +128,13 @@ def generate_gene_counts(self, lengths): return self - def dump(self, state="genes"): + def dump(self, state="genes", labels=None,): with open(f"CountMatrix.{state}.txt", "wt") as _out: - for index, counts in self: - print(index, *counts, sep="\t", file=_out) + if labels is None: + for index, counts in self: + print(index, *counts, sep="\t", file=_out) + for (index, counts), label in zip(self, labels): + print(label, *counts, sep="\t", file=_out) def group_gene_counts(self, ggroups): diff --git a/gffquant/profilers/feature_quantifier.py b/gffquant/profilers/feature_quantifier.py index ad5dd242..7d84a32d 100644 --- a/gffquant/profilers/feature_quantifier.py +++ b/gffquant/profilers/feature_quantifier.py @@ -151,7 +151,13 @@ def process_counters( gene_group_db=gene_group_db, ) - self.counter.counts.dump() + ggroups = ( + (self.reference_manager.get(key[0] if isinstance(key, tuple) else key))[0].split(".")[0] + for key, _ in self.counter.counts + ) + + + self.counter.counts.dump(labels=ggroups) self.counter.group_gene_count_matrix(self.reference_manager) unannotated_reads = self.counter.get_unannotated_reads() + self.aln_counter["unannotated_ambig"] From 86f5f78ea440f76e6e2098477fcdb577e45290cf Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Fri, 10 Jan 2025 11:16:33 +0100 Subject: [PATCH 120/128] added count matrix state dump --- gffquant/counters/count_matrix.py | 1 + 1 file changed, 1 insertion(+) diff --git a/gffquant/counters/count_matrix.py b/gffquant/counters/count_matrix.py index 0831d111..72d38c02 100644 --- a/gffquant/counters/count_matrix.py +++ b/gffquant/counters/count_matrix.py @@ -133,6 +133,7 @@ def dump(self, state="genes", labels=None,): if labels is None: for index, counts in self: print(index, *counts, sep="\t", file=_out) + else: for (index, counts), label in zip(self, labels): print(label, *counts, sep="\t", file=_out) From daac8f2cece125baf9dc5b660e1f58d979b82820 Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Fri, 10 Jan 2025 11:17:05 +0100 Subject: [PATCH 121/128] added count matrix state dump --- gffquant/profilers/feature_quantifier.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gffquant/profilers/feature_quantifier.py b/gffquant/profilers/feature_quantifier.py index 7d84a32d..40196377 100644 --- a/gffquant/profilers/feature_quantifier.py +++ b/gffquant/profilers/feature_quantifier.py @@ -157,12 +157,12 @@ def process_counters( ) - self.counter.counts.dump(labels=ggroups) + self.counter.counts.dump() self.counter.group_gene_count_matrix(self.reference_manager) unannotated_reads = self.counter.get_unannotated_reads() + self.aln_counter["unannotated_ambig"] - self.counter.counts.dump(state="ggroup") + self.counter.counts.dump(state="ggroup", labels=ggroups) categories = self.adm.get_categories() From 88fa6f51e71b3ad8bb9557f28c93eb8401c4984c Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Fri, 10 Jan 2025 11:38:36 +0100 Subject: [PATCH 122/128] added count matrix state dump --- gffquant/profilers/feature_quantifier.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gffquant/profilers/feature_quantifier.py b/gffquant/profilers/feature_quantifier.py index 40196377..d5faee4f 100644 --- a/gffquant/profilers/feature_quantifier.py +++ b/gffquant/profilers/feature_quantifier.py @@ -151,9 +151,9 @@ def process_counters( gene_group_db=gene_group_db, ) - ggroups = ( + ggroups = tuple( (self.reference_manager.get(key[0] if isinstance(key, tuple) else key))[0].split(".")[0] - for key, _ in self.counter.counts + for key, _ in self.counter ) From 12dbfb3b1103dad84553d54900ebc70c16410d22 Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Fri, 10 Jan 2025 11:50:24 +0100 Subject: [PATCH 123/128] added count matrix state dump --- gffquant/profilers/feature_quantifier.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gffquant/profilers/feature_quantifier.py b/gffquant/profilers/feature_quantifier.py index d5faee4f..1442ae98 100644 --- a/gffquant/profilers/feature_quantifier.py +++ b/gffquant/profilers/feature_quantifier.py @@ -157,12 +157,12 @@ def process_counters( ) - self.counter.counts.dump() + self.counter.counts.dump(labels=ggroups) self.counter.group_gene_count_matrix(self.reference_manager) unannotated_reads = self.counter.get_unannotated_reads() + self.aln_counter["unannotated_ambig"] - self.counter.counts.dump(state="ggroup", labels=ggroups) + self.counter.counts.dump(state="ggroup") categories = self.adm.get_categories() From 166aab47c76e3a7affa119d9779b70dac2fca8ca Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Fri, 10 Jan 2025 12:06:21 +0100 Subject: [PATCH 124/128] added count matrix state dump --- gffquant/profilers/feature_quantifier.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gffquant/profilers/feature_quantifier.py b/gffquant/profilers/feature_quantifier.py index 1442ae98..b30a3b9f 100644 --- a/gffquant/profilers/feature_quantifier.py +++ b/gffquant/profilers/feature_quantifier.py @@ -152,7 +152,7 @@ def process_counters( ) ggroups = tuple( - (self.reference_manager.get(key[0] if isinstance(key, tuple) else key))[0].split(".")[0] + (self.reference_manager.get(key[0] if isinstance(key, tuple) else key))[0] # .split(".")[0] for key, _ in self.counter ) From a822d12dd350028fa80e4175c73bc9d1b8ce7e4b Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Sat, 11 Jan 2025 22:59:01 +0100 Subject: [PATCH 125/128] refactor group_gene_counts to be not in-place --- gffquant/counters/count_matrix.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/gffquant/counters/count_matrix.py b/gffquant/counters/count_matrix.py index 72d38c02..78cdb100 100644 --- a/gffquant/counters/count_matrix.py +++ b/gffquant/counters/count_matrix.py @@ -139,6 +139,15 @@ def dump(self, state="genes", labels=None,): def group_gene_counts(self, ggroups): + + ggroup_counts = CountMatrix(ncols=6) + for (_, gene_counts), ggroup_id in zip(self, ggroups): + ggroup_counts[ggroup_id] +=gene_counts + + return ggroup_counts + + + ggroup_index = {} # for gene_id, gene_counts in self: # ggroup_id = gene_id.split(".")[-1] From e3c86c1d887859937e0f0ea01bec051ce71c3891 Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Sat, 11 Jan 2025 23:54:33 +0100 Subject: [PATCH 126/128] refactor group_gene_counts to be not in-place --- gffquant/counters/count_matrix.py | 2 +- gffquant/profilers/feature_quantifier.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/gffquant/counters/count_matrix.py b/gffquant/counters/count_matrix.py index 78cdb100..dda3d8a6 100644 --- a/gffquant/counters/count_matrix.py +++ b/gffquant/counters/count_matrix.py @@ -142,7 +142,7 @@ def group_gene_counts(self, ggroups): ggroup_counts = CountMatrix(ncols=6) for (_, gene_counts), ggroup_id in zip(self, ggroups): - ggroup_counts[ggroup_id] +=gene_counts + ggroup_counts[ggroup_id] += gene_counts return ggroup_counts diff --git a/gffquant/profilers/feature_quantifier.py b/gffquant/profilers/feature_quantifier.py index b30a3b9f..7287546d 100644 --- a/gffquant/profilers/feature_quantifier.py +++ b/gffquant/profilers/feature_quantifier.py @@ -208,6 +208,7 @@ def process_counters( features, unannotated_reads=(None, unannotated_reads)[report_unannotated], ) + break From c4a8fad1e5954e924a184320a4965bd29ba97f47 Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Sun, 12 Jan 2025 00:02:38 +0100 Subject: [PATCH 127/128] refactor group_gene_counts to be not in-place --- gffquant/profilers/feature_quantifier.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gffquant/profilers/feature_quantifier.py b/gffquant/profilers/feature_quantifier.py index 7287546d..c27fa19d 100644 --- a/gffquant/profilers/feature_quantifier.py +++ b/gffquant/profilers/feature_quantifier.py @@ -208,7 +208,7 @@ def process_counters( features, unannotated_reads=(None, unannotated_reads)[report_unannotated], ) - break + From 1f295bc469fea435db26b4c696692843015a4e61 Mon Sep 17 00:00:00 2001 From: Christian Schudoma Date: Sun, 12 Jan 2025 00:07:54 +0100 Subject: [PATCH 128/128] refactor group_gene_counts to be not in-place --- gffquant/profilers/feature_quantifier.py | 164 +++++++++++------------ 1 file changed, 82 insertions(+), 82 deletions(-) diff --git a/gffquant/profilers/feature_quantifier.py b/gffquant/profilers/feature_quantifier.py index c27fa19d..53a5c2e6 100644 --- a/gffquant/profilers/feature_quantifier.py +++ b/gffquant/profilers/feature_quantifier.py @@ -164,103 +164,103 @@ def process_counters( self.counter.counts.dump(state="ggroup") - categories = self.adm.get_categories() + # categories = self.adm.get_categories() - for category in categories: - logger.info("PROCESSING CATEGORY=%s", category.name) - category_sum = np.zeros(6, dtype='float64') - category_counts = CountMatrix(ncols=6) - for rid, counts in self.counter: - if gene_group_db: - ggroup_id = rid - logger.info("GGROUP %s: %s", ggroup_id, str(counts)) - else: - ref, _ = self.reference_manager.get(rid[0] if isinstance(rid, tuple) else rid) - ggroup_id = ref - - region_annotation = self.adm.query_sequence(ggroup_id) - if region_annotation is not None: - _, _, region_annotation = region_annotation - for category_id, features in region_annotation: - if int(category_id) == category.id: - category_sum += counts - for feature_id in features: - category_counts[(category.id, int(feature_id))] += counts - break - - u_sf, c_sf = ( - CountMatrix.calculate_scaling_factor(*category_sum[0:2]), - CountMatrix.calculate_scaling_factor(*category_sum[3:5]), - ) + # for category in categories: + # logger.info("PROCESSING CATEGORY=%s", category.name) + # category_sum = np.zeros(6, dtype='float64') + # category_counts = CountMatrix(ncols=6) + # for rid, counts in self.counter: + # if gene_group_db: + # ggroup_id = rid + # logger.info("GGROUP %s: %s", ggroup_id, str(counts)) + # else: + # ref, _ = self.reference_manager.get(rid[0] if isinstance(rid, tuple) else rid) + # ggroup_id = ref + + # region_annotation = self.adm.query_sequence(ggroup_id) + # if region_annotation is not None: + # _, _, region_annotation = region_annotation + # for category_id, features in region_annotation: + # if int(category_id) == category.id: + # category_sum += counts + # for feature_id in features: + # category_counts[(category.id, int(feature_id))] += counts + # break + + # u_sf, c_sf = ( + # CountMatrix.calculate_scaling_factor(*category_sum[0:2]), + # CountMatrix.calculate_scaling_factor(*category_sum[3:5]), + # ) - category_counts.scale_column(1, u_sf) - category_counts.scale_column(4, c_sf) + # category_counts.scale_column(1, u_sf) + # category_counts.scale_column(4, c_sf) - category_sum[2] = category_sum[1] / u_sf - category_sum[5] = category_sum[4] / c_sf + # category_sum[2] = category_sum[1] / u_sf + # category_sum[5] = category_sum[4] / c_sf - features = tuple(self.adm.get_features(category.id)) - count_writer.write_category( - category.id, - category.name, - category_sum, - category_counts, - features, - unannotated_reads=(None, unannotated_reads)[report_unannotated], - ) + # features = tuple(self.adm.get_features(category.id)) + # count_writer.write_category( + # category.id, + # category.name, + # category_sum, + # category_counts, + # features, + # unannotated_reads=(None, unannotated_reads)[report_unannotated], + # ) - # functional_counts, category_sums = count_annotator.annotate_gene_counts( - # self.reference_manager, - # self.adm, - # self.counter, - # gene_group_db=gene_group_db, - # ) + functional_counts, category_sums = count_annotator.annotate_gene_counts( + self.reference_manager, + self.adm, + self.counter, + gene_group_db=gene_group_db, + ) - # logger.info("FC-index: %s", str(list(functional_counts.index.keys())[:10])) - # logger.info("FC-counts: %s", str(functional_counts.counts[0:10, :])) + logger.info("FC-index: %s", str(list(functional_counts.index.keys())[:10])) + logger.info("FC-counts: %s", str(functional_counts.counts[0:10, :])) - # categories = self.adm.get_categories() - # for category, category_sum in zip(categories, category_sums): - # features = tuple(self.adm.get_features(category.id)) - # feature_names = { - # feature.id: feature.name - # for feature in features - # } - # rows = tuple( - # key[0] == category.id - # for key, _ in functional_counts - # ) + categories = self.adm.get_categories() + for category, category_sum in zip(categories, category_sums): + features = tuple(self.adm.get_features(category.id)) + feature_names = { + feature.id: feature.name + for feature in features + } + rows = tuple( + key[0] == category.id + for key, _ in functional_counts + ) - # cat_counts = CountMatrix.from_count_matrix(functional_counts, rows=rows) - # # cat_counts = CountMatrix(ncols=6, nrows=len(feature_names)) - # # for feature in features: - # # key = (category.id, feature.id) - # # if functional_counts.has_record(key): - # # cat_counts[key] += functional_counts[key] - # # else: - # # _ = cat_counts[key] + cat_counts = CountMatrix.from_count_matrix(functional_counts, rows=rows) + # cat_counts = CountMatrix(ncols=6, nrows=len(feature_names)) + # for feature in features: + # key = (category.id, feature.id) + # if functional_counts.has_record(key): + # cat_counts[key] += functional_counts[key] + # else: + # _ = cat_counts[key] - # # for category in categories: - # # features = ((feature.name, feature) for feature in db.get_features(category.id)) - # # for _, feature in sorted(features, key=lambda x: x[0]): - # # _ = functional_counts[(category.id, feature.id)] + # for category in categories: + # features = ((feature.name, feature) for feature in db.get_features(category.id)) + # for _, feature in sorted(features, key=lambda x: x[0]): + # _ = functional_counts[(category.id, feature.id)] - # logger.info("PROCESSING CATEGORY=%s", category.name) - # count_writer.write_category( - # category.id, - # category.name, - # category_sum, - # # functional_counts, - # cat_counts, - # # feature_names, - # features, - # unannotated_reads=(None, unannotated_reads)[report_unannotated], - # ) + logger.info("PROCESSING CATEGORY=%s", category.name) + count_writer.write_category( + category.id, + category.name, + category_sum, + # functional_counts, + cat_counts, + # feature_names, + features, + unannotated_reads=(None, unannotated_reads)[report_unannotated], + ) self.adm.clear_caches()