diff --git a/doc/source/changelog.rst b/doc/source/changelog.rst index 7f078bf..26b7c8b 100644 --- a/doc/source/changelog.rst +++ b/doc/source/changelog.rst @@ -9,9 +9,11 @@ Changes in v0.11 This is largely a bugfix release, many thanks to contributors Rory Kirchner, Stefano Rivera, Daniel Lowengrub, Nolan Woods, Stefen Moeller, and Husen Umer. -- Avoid deadlocks in tests under Python 3.8 (`#155 `_, thanks Stefano Rivera) +- Avoid deadlocks in tests under Python 3.8 (`#155 + `_, thanks Stefano Rivera) - Fix deprecation warning for invalid escape sequence (`#168 - `_, Stefen Moeller, and `#165 `_, thanks Rory Kirchner) + `_, Stefen Moeller, and `#165 + `_, thanks Rory Kirchner) - Fix ResourceWarning about unclosed file (`#169 `_, thanks Daniel Lowengrub) - Allow database creation when there is an empty string in the transcript ID @@ -32,6 +34,9 @@ Stefano Rivera, Daniel Lowengrub, Nolan Woods, Stefen Moeller, and Husen Umer. attributes. This solves things like `#128 `_ where some dialect components are otherwise ambiguous. +- Fix bug in :meth:`FeatureDB.children_bp`, `#157 + `_, where the `ignore_strand` + argument is deprecated. Changes in v0.10.1 ------------------ diff --git a/gffutils/helpers.py b/gffutils/helpers.py index 5d36807..8ab1a97 100644 --- a/gffutils/helpers.py +++ b/gffutils/helpers.py @@ -485,9 +485,14 @@ def to_unicode(obj, encoding="utf-8"): def canonical_transcripts(db, fasta_filename): + """ + WARNING: this function is currently not well ttested and will likely be + replaced with a more modular approach. + """ import pyfaidx - fasta = pyfaidx.Fasta(fasta_filename, as_raw=True) + + fasta = pyfaidx.Fasta(fasta_filename, as_raw=False) for gene in db.features_of_type("gene"): # exons_list will contain (CDS_length, total_length, transcript, [exons]) tuples. @@ -502,20 +507,20 @@ def canonical_transcripts(db, fasta_filename): cds_len += exon_length total_len += exon_length - exon_list.append((cds_len, total_len, transcript, exons)) + exon_list.append((cds_len, total_len, transcript, exons if cds_len == 0 else [e for e in exons if e.featuretype in ['CDS', 'five_prime_UTR', 'three_prime_UTR']])) # If we have CDS, then use the longest coding transcript if max(i[0] for i in exon_list) > 0: - best = sorted(exon_list)[0] + best = sorted(exon_list, key=lambda x: x[0], reverse=True)[0] # Otherwise, just choose the longest else: - best = sorted(exon_list, lambda x: x[1])[0] + best = sorted(exon_list, key=lambda x: x[1])[0] print(best) canonical_exons = best[-1] transcript = best[-2] - seqs = [i.sequence(fasta) for i in canonical_exons] + seqs = [i.sequence(fasta) for i in sorted(canonical_exons, key=lambda x: x.start, reverse=transcript.strand != '+')] yield transcript, "".join(seqs) diff --git a/gffutils/test/test_issues.py b/gffutils/test/test_issues.py index 3fe285d..70689e4 100644 --- a/gffutils/test/test_issues.py +++ b/gffutils/test/test_issues.py @@ -330,3 +330,10 @@ def test_issue_157(): # The way to do it now is the following (we can omit the mc.feature_type # since we're preselecting for exons anyway): db.children_bp(gene, child_featuretype='exon', merge=True, merge_criteria=(mc.overlap_end_inclusive)) + + +def test_issue_159(): + db = gffutils.create_db(gffutils.example_filename('FBgn0031208.gff'), ":memory:") + fasta = gffutils.example_filename('dm6-chr2L.fa') + for transcript, seq in gffutils.helpers.canonical_transcripts(db, fasta): + pass