Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
128 commits
Select commit Hold shift + click to select a range
61b93da
initial
cschu Dec 15, 2024
6c80f0a
version
cschu Dec 15, 2024
2be12b4
fix: getitem implementation
cschu Dec 15, 2024
e143eb3
fix?: missing counts
cschu Dec 16, 2024
32ce2bd
fix: fixing AlignmentCounter __getitem__/__setitem__ methods
cschu Dec 16, 2024
ed0eaab
merge uniq/ambig seqcounters
cschu Dec 16, 2024
6c32072
fix: AlignmentCounter.has_ambig_counts(), CountManager.has_ambig_coun…
cschu Dec 16, 2024
28f84e8
fix: minor
cschu Dec 16, 2024
b344e9f
updating count annotation
cschu Dec 19, 2024
5ed8cfc
fixed import
cschu Dec 19, 2024
a92722a
added debug message
cschu Dec 19, 2024
515e71f
added debug message
cschu Dec 19, 2024
6efbca6
fixing empty length vector issue?
cschu Dec 19, 2024
ef34cde
fixing empty length vector issue?
cschu Dec 19, 2024
aa09c1a
added debug message
cschu Dec 19, 2024
d3649c2
fixing empty total counts?
cschu Dec 19, 2024
e3bee67
fixing total count issue?
cschu Dec 19, 2024
b466381
debug messaging
cschu Dec 20, 2024
8249d9f
fixed gene writing?
cschu Dec 20, 2024
7fb5c56
fixed gene writing?
cschu Dec 20, 2024
d2226ac
fixed gene writing?
cschu Dec 20, 2024
a1e54c2
fixed gene writing?
cschu Dec 20, 2024
f336493
fixed gene writing?
cschu Dec 21, 2024
cb9ea29
dump seqcounters for debugging
cschu Dec 21, 2024
6ae5c5a
dump seqcounters for debugging
cschu Dec 21, 2024
fbb6a0e
dump seqcounters for debugging
cschu Dec 21, 2024
a70f1a8
changed strand specific order
cschu Dec 21, 2024
bb13e14
debug log
cschu Dec 21, 2024
4579aff
debug log
cschu Dec 21, 2024
a5f7fb1
debug log
cschu Dec 21, 2024
68fa194
fixed gene writing?
cschu Dec 21, 2024
313a061
fixed gene writing?
cschu Dec 21, 2024
e917fe2
starting to replace CountManager
cschu Dec 21, 2024
37b624e
pleasing linters
cschu Dec 22, 2024
9e078ce
removed seq_counter.py
cschu Dec 22, 2024
7976a4d
removed Unique- and AmbiguousRegionCounter classes
cschu Dec 22, 2024
f754653
updated alignment_counter, removed alignment_counter2
cschu Dec 22, 2024
26fbb55
throwing out old code, splitting of regioncount_annotator
cschu Dec 22, 2024
bd1436f
removed count_manager
cschu Dec 22, 2024
b460ebd
removed count_manager references
cschu Dec 22, 2024
1789551
modified gene_count write behaviour in prep of ggroup annotation
cschu Dec 23, 2024
fdd7aaf
modified gene_count write behaviour in prep of ggroup annotation
cschu Dec 23, 2024
4af2f14
change gene group handling during annotation
cschu Dec 23, 2024
39ebd15
change gene group handling during annotation
cschu Dec 23, 2024
c0b4664
change gene group handling during annotation
cschu Dec 23, 2024
b935a24
added debug messaging
cschu Dec 24, 2024
69709f8
solved?
cschu Dec 24, 2024
62c0a52
disabling various logger calls
cschu Dec 24, 2024
2239d29
disabling various logger calls
cschu Dec 24, 2024
413684a
trying to update feature count processing
cschu Dec 25, 2024
1b5cf9c
trying to update feature count processing
cschu Dec 25, 2024
cb6ac1a
trying to update feature count processing
cschu Dec 25, 2024
8977c4d
trying to update feature count processing
cschu Dec 25, 2024
f96e1b3
trying to update feature count processing
cschu Dec 25, 2024
b4fb4d8
trying to update feature count processing
cschu Dec 25, 2024
3594dff
trying to update feature count processing
cschu Dec 25, 2024
0dbd4ae
trying to update feature count processing
cschu Dec 25, 2024
7b8e6bf
debug log
cschu Dec 25, 2024
563b402
trying to fix annotate2
cschu Dec 25, 2024
e9e1715
turn off annotate2 log
cschu Dec 25, 2024
07c6743
trying to update feature count processing
cschu Dec 25, 2024
da8e93b
trying to update feature count processing
cschu Dec 25, 2024
a94d9e7
trying to update feature count processing
cschu Dec 26, 2024
164cc6d
trying to update feature count processing
cschu Dec 26, 2024
0cdf49a
trying to update feature count processing
cschu Dec 26, 2024
3b2b5cb
trying to update feature count processing
cschu Dec 27, 2024
cb9934e
added category scaling comment
cschu Dec 27, 2024
4e119a4
linting + obsolete code removal
cschu Dec 27, 2024
90eb4b5
trying to optimise scaling factors, temp. disabled feature counts
cschu Dec 29, 2024
5b0286f
trying to optimise scaling factors, temp. disabled feature counts
cschu Dec 30, 2024
5baafe0
trying to optimise scaling factors, temp. disabled feature counts
cschu Dec 30, 2024
49e11e3
re-enable feature counts
cschu Dec 30, 2024
b39b4b1
trying to fix scaling factor issue
cschu Dec 30, 2024
b76f168
trying to fix scaling factor issue
cschu Dec 30, 2024
9920404
trying to implement count matrix
cschu Dec 31, 2024
945cf8e
trying to implement count matrix
cschu Dec 31, 2024
07f85f0
trying to implement count matrix
cschu Dec 31, 2024
dbb36da
trying to implement count matrix
cschu Dec 31, 2024
b281a16
trying to implement count matrix
cschu Dec 31, 2024
0b64813
trying to implement count matrix
cschu Dec 31, 2024
7f93e35
trying to implement count matrix
cschu Dec 31, 2024
b8bdf08
trying to implement count matrix
cschu Dec 31, 2024
85d1def
trying to implement count matrix
cschu Dec 31, 2024
e325127
trying to implement count matrix
cschu Dec 31, 2024
c1c93d1
trying to implement count matrix
cschu Dec 31, 2024
21fd963
reactivated feature output
cschu Jan 1, 2025
a47b87e
reactivated feature output
cschu Jan 1, 2025
7723314
reactivated feature output
cschu Jan 1, 2025
27e9e38
reactivated feature output
cschu Jan 1, 2025
2f938f6
reactivated feature output
cschu Jan 1, 2025
03a4a96
reactivated feature output
cschu Jan 1, 2025
74c2992
reactivated feature output
cschu Jan 1, 2025
26a94d8
reactivated feature output
cschu Jan 1, 2025
fb73a0a
reactivated feature output
cschu Jan 1, 2025
d1a2720
reactivated feature output
cschu Jan 1, 2025
d54e3f0
reactivated feature output
cschu Jan 1, 2025
44a773d
reactivated feature output
cschu Jan 1, 2025
6b06a52
reactivated feature output
cschu Jan 1, 2025
a0d1032
reactivated feature output
cschu Jan 1, 2025
8a56e33
pleasing linters, cleanup
cschu Jan 1, 2025
98c6cf4
trying to reduce memory footprint
cschu Jan 2, 2025
7fa8339
trying to reduce memory footprint
cschu Jan 2, 2025
3fde96f
trying to reduce memory footprint
cschu Jan 2, 2025
64845ec
trying to reduce memory footprint
cschu Jan 2, 2025
4745be9
trying to reduce memory footprint
cschu Jan 2, 2025
fa19a80
trying to reduce memory footprint
cschu Jan 2, 2025
54b88a5
trying to reduce memory footprint
cschu Jan 2, 2025
3d4a4f5
trying to reduce memory footprint
cschu Jan 2, 2025
2ffa0c5
trying category-wise processing
cschu Jan 2, 2025
688a081
trying category-wise processing
cschu Jan 2, 2025
64f8149
trying category-wise processing
cschu Jan 2, 2025
f7c16c3
truncated unannotated hash
cschu Jan 3, 2025
2fe0bdc
making adjustments for new db format
cschu Jan 7, 2025
9a0f888
making adjustments for new db format
cschu Jan 7, 2025
b51b48d
making adjustments for new db format
cschu Jan 7, 2025
0e1edae
making adjustments for new db format
cschu Jan 7, 2025
c065163
making adjustments for new db format
cschu Jan 7, 2025
21d4264
added count matrix state dump
cschu Jan 10, 2025
929e211
added count matrix state dump
cschu Jan 10, 2025
86f5f78
added count matrix state dump
cschu Jan 10, 2025
daac8f2
added count matrix state dump
cschu Jan 10, 2025
88fa6f5
added count matrix state dump
cschu Jan 10, 2025
12dbfb3
added count matrix state dump
cschu Jan 10, 2025
166aab4
added count matrix state dump
cschu Jan 10, 2025
a822d12
refactor group_gene_counts to be not in-place
cschu Jan 11, 2025
e3c86c1
refactor group_gene_counts to be not in-place
cschu Jan 11, 2025
c4a8fad
refactor group_gene_counts to be not in-place
cschu Jan 11, 2025
1f295bc
refactor group_gene_counts to be not in-place
cschu Jan 11, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
FROM ubuntu:22.04

LABEL maintainer="cschu1981@gmail.com"
LABEL version="2.18.0"
LABEL version="2.19.0"
LABEL description="gffquant - functional profiling of metagenomic/transcriptomic wgs samples"


Expand Down
2 changes: 1 addition & 1 deletion gffquant/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from enum import Enum, auto, unique


__version__ = "2.18.0"
__version__ = "2.19.0"
__tool__ = "gffquant"


Expand Down
1 change: 1 addition & 0 deletions gffquant/alignment/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

from .aln_group import AlignmentGroup
from .pysam_alignment_processor import AlignmentProcessor
from .reference_hit import ReferenceHit
from .samflags import SamFlags
from .cigarops import CigarOps

Expand Down
3 changes: 3 additions & 0 deletions gffquant/alignment/aln_group.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,9 @@ def get_all_hits(self, as_ambiguous=False):
except TypeError as err:
raise TypeError(f"Cannot derive sequencing library from tags: {aln.tags}") from err

# in region mode, there can be more hits
# (if the alignment overlaps multiple features of the target sequence)
# in gene mode, each alignment is a hit, i.e. there is at most 1 hit / alignment
yield aln.hits, n_aln

def get_ambig_align_counts(self):
Expand Down
38 changes: 38 additions & 0 deletions gffquant/alignment/reference_hit.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# pylint: disable=R0902

""" module docstring """

from dataclasses import dataclass, asdict


@dataclass(slots=True)
class ReferenceHit:
rid: int = None
start: int = None
end: int = None
rev_strand: bool = None
cov_start: int = None
cov_end: int = None
has_annotation: bool = None
n_aln: int = None
is_ambiguous: bool = None
library_mod: int = None
mate_id: int = None

def __hash__(self):
return hash(tuple(asdict(self).values()))

def __eq__(self, other):
return all(
item[0][1] == item[1][1]
for item in zip(
sorted(asdict(self).items()),
sorted(asdict(other).items())
)
)

def __str__(self):
return "\t".join(map(str, asdict(self).values()))

def __repr__(self):
return str(self)
5 changes: 4 additions & 1 deletion gffquant/annotation/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,8 @@

""" module docstring """

from .count_annotator import GeneCountAnnotator, RegionCountAnnotator
# from .count_annotator import GeneCountAnnotator, RegionCountAnnotator
from .count_annotator import CountAnnotator
from .count_writer import CountWriter
from .genecount_annotator import GeneCountAnnotator
from .regioncount_annotator import RegionCountAnnotator
161 changes: 13 additions & 148 deletions gffquant/annotation/count_annotator.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,7 @@ def calc_scaling_factor(raw, normed, default=0):
return (raw / normed) if normed else default

total_uniq, total_uniq_normed, total_ambi, total_ambi_normed = self.total_counts
# total_uniq, total_ambi, total_uniq_normed, total_ambi_normed = self.total_counts
logger.info(
"TOTAL COUNTS: uraw=%s unorm=%s araw=%s anorm=%s",
total_uniq, total_uniq_normed, total_ambi, total_ambi_normed
Expand All @@ -108,19 +109,20 @@ def calc_scaling_factor(raw, normed, default=0):
total_ambi, total_ambi_normed, default_scaling_factor
)

total_uniq, total_uniq_normed, total_ambi, total_ambi_normed = self.total_gene_counts
logger.info(
"TOTAL GENE COUNTS: uraw=%s unorm=%s araw=%s anorm=%s",
total_uniq, total_uniq_normed, total_ambi, total_ambi_normed
)
# total_uniq, total_uniq_normed, total_ambi, total_ambi_normed = self.total_gene_counts
# total_uniq, total_ambi, total_uniq_normed, total_ambi_normed = self.total_gene_counts
# logger.info(
# "TOTAL GENE COUNTS: uraw=%s unorm=%s araw=%s anorm=%s",
# total_uniq, total_uniq_normed, total_ambi, total_ambi_normed
# )

self.scaling_factors["total_gene_uniq"] = calc_scaling_factor(
total_uniq, total_uniq_normed, default_scaling_factor
)
# self.scaling_factors["total_gene_uniq"] = calc_scaling_factor(
# total_uniq, total_uniq_normed, default_scaling_factor
# )

self.scaling_factors["total_gene_ambi"] = calc_scaling_factor(
total_ambi, total_ambi_normed, default_scaling_factor
)
# self.scaling_factors["total_gene_ambi"] = calc_scaling_factor(
# total_ambi, total_ambi_normed, default_scaling_factor
# )

fc_items = self.feature_count_sums.items()
for category, (
Expand Down Expand Up @@ -189,140 +191,3 @@ def compute_count_vector(
counts[1::2] /= float(length)

return counts


class RegionCountAnnotator(CountAnnotator):
""" CountAnnotator subclass for contig/region-based counting. """

def __init__(self, strand_specific, report_scaling_factors=True):
CountAnnotator.__init__(self, strand_specific, report_scaling_factors=report_scaling_factors)

# pylint: disable=R0914,W0613
def annotate(self, refmgr, db, count_manager, gene_group_db=False):
"""
Annotate a set of region counts via db-lookup.
input:
- bam: bamr.BamFile to use as lookup table for reference names
- db: GffDatabaseManager holding functional annotation database
- count_manager: count_data
"""
for rid in set(count_manager.uniq_regioncounts).union(
count_manager.ambig_regioncounts
):
ref = refmgr.get(rid[0] if isinstance(rid, tuple) else rid)[0]

for region in count_manager.get_regions(rid):
if self.strand_specific:
(start, end), rev_strand = region
else:
(start, end), rev_strand = region, None
# the region_annotation is a tuple of key-value pairs:
# (strand, func_category1: subcategories, func_category2: subcategories, ...)
# the first is the strand, the second is the gene id, the rest are the features

region_annotation = db.query_sequence(ref, start=start, end=end)
if region_annotation is not None:
region_strand, feature_id, region_annotation = region_annotation
if feature_id is None:
feature_id = ref

on_other_strand = (region_strand == "+" and rev_strand) \
or (region_strand == "-" and not rev_strand)

antisense_region = self.strand_specific and on_other_strand

uniq_counts, ambig_counts = count_manager.get_counts(
(rid, start, end), region_counts=True, strand_specific=self.strand_specific
)

if self.strand_specific:
# if the region is antisense, 'sense-counts' (relative to the) region come from the
# negative strand and 'antisense-counts' from the positive strand
# vice-versa for a sense-region
strand_specific_counts = (
(count_manager.MINUS_STRAND, count_manager.PLUS_STRAND)
if antisense_region
else (count_manager.PLUS_STRAND, count_manager.MINUS_STRAND)
)
else:
strand_specific_counts = None

region_length = end - start + 1
counts = self.compute_count_vector(
uniq_counts,
ambig_counts,
region_length,
strand_specific_counts=strand_specific_counts,
region_counts=True,
)

self.distribute_feature_counts(counts, region_annotation)

gcounts = self.gene_counts.setdefault(
feature_id, np.zeros(self.bins)
)
gcounts += counts
self.total_gene_counts += counts[:4]

self.calculate_scaling_factors()


class GeneCountAnnotator(CountAnnotator):
""" CountAnnotator subclass for gene-based counting. """

def __init__(self, strand_specific, report_scaling_factors=True):
CountAnnotator.__init__(self, strand_specific, report_scaling_factors=report_scaling_factors)

def annotate(self, refmgr, db, count_manager, gene_group_db=False):
"""
Annotate a set of gene counts via db-iteration.
input:
- bam: bamr.BamFile to use as reverse lookup table for reference ids
- db: GffDatabaseManager holding functional annotation database
- count_manager: count_data
"""
strand_specific_counts = (
(count_manager.PLUS_STRAND, count_manager.MINUS_STRAND)
if self.strand_specific else None
)

for rid in set(count_manager.uniq_seqcounts).union(
count_manager.ambig_seqcounts
):
ref, region_length = refmgr.get(rid[0] if isinstance(rid, tuple) else rid)

uniq_counts, ambig_counts = count_manager.get_counts(
rid, region_counts=False, strand_specific=self.strand_specific
)

counts = self.compute_count_vector(
uniq_counts,
ambig_counts,
region_length,
strand_specific_counts=strand_specific_counts,
)

if gene_group_db:
ref_tokens = ref.split(".")
gene_id, ggroup_id = ".".join(ref_tokens[:-1]), ref_tokens[-1]
else:
ggroup_id, gene_id = ref, ref

gcounts = self.gene_counts.setdefault(gene_id, np.zeros(self.bins))
gcounts += counts
self.total_gene_counts += counts[:4]

region_annotation = db.query_sequence(ggroup_id)
if region_annotation is not None:
_, _, region_annotation = region_annotation
logger.info(
"GCAnnotator: Distributing counts of Gene %s (group=%s) %s %s",
gene_id, ggroup_id, counts[0], counts[2],
)
self.distribute_feature_counts(counts, region_annotation)

else:
logger.info("GCAnnotator: Gene %s (group=%s) has no information in database.", gene_id, ggroup_id)
self.unannotated_counts += counts[:4]

self.calculate_scaling_factors()
Loading