From 15ac120553a6b4a5dfd882442b61d7c6632835c8 Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Fri, 12 Dec 2025 13:46:00 +0100 Subject: [PATCH 1/4] Process VEP response for variants on the mitochondrial chromosome. --- src/gpsea/preprocessing/_vep.py | 34 +- .../data/vep_response/MT_11778_11778_G_A.json | 506 ++++++++++++++++++ tests/preprocessing/test_vep.py | 21 + 3 files changed, 551 insertions(+), 10 deletions(-) create mode 100644 tests/preprocessing/data/vep_response/MT_11778_11778_G_A.json diff --git a/src/gpsea/preprocessing/_vep.py b/src/gpsea/preprocessing/_vep.py index 40a736e1e..75ef6de3a 100644 --- a/src/gpsea/preprocessing/_vep.py +++ b/src/gpsea/preprocessing/_vep.py @@ -85,9 +85,11 @@ def _process_item(self, item: typing.Dict) -> typing.Optional[TranscriptAnnotati Parse one transcript annotation from the JSON response. """ trans_id = item.get("transcript_id") - if not self._include_computational_txs and not trans_id.startswith("NM_"): - # Skipping a computational transcript - return None + assert isinstance(trans_id, str) + if VepFunctionalAnnotator._seems_like_refseq_tx(trans_id): + if not self._include_computational_txs and not trans_id.startswith("NM_"): + # Skipping a computational transcript + return None is_preferred = True if ("canonical" in item and item["canonical"] == 1) else False hgvs_cdna = item.get("hgvsc") var_effects = [] @@ -97,12 +99,12 @@ def _process_item(self, item: typing.Dict) -> typing.Optional[TranscriptAnnotati if var_effect is not None: var_effects.append(var_effect) gene_name = item.get("gene_symbol") - exons_effected = item.get("exon") - if exons_effected is not None: - exons_effected = exons_effected.split("/")[0].split("-") - if len(exons_effected) == 2: - exons_effected = range(int(exons_effected[0]), int(exons_effected[1]) + 1) - exons_effected = (int(x) for x in exons_effected) + exons_affected = item.get("exon") + if exons_affected is not None: + exons_affected = exons_affected.split("/")[0].split("-") + if len(exons_affected) == 2: + exons_affected = range(int(exons_affected[0]), int(exons_affected[1]) + 1) + exons_affected = (int(x) for x in exons_affected) protein_id = item.get("protein_id") hgvsp = item.get("hgvsp") @@ -123,7 +125,15 @@ def _process_item(self, item: typing.Dict) -> typing.Optional[TranscriptAnnotati protein_effect = Region(protein_effect_start, protein_effect_end) return TranscriptAnnotation( - gene_name, trans_id, hgvs_cdna, is_preferred, var_effects, exons_effected, protein_id, hgvsp, protein_effect + gene_name, + trans_id, + hgvs_cdna, + is_preferred, + var_effects, + exons_affected, + protein_id, + hgvsp, + protein_effect, ) def fetch_response( @@ -190,3 +200,7 @@ def format_coordinates_for_vep_query(vc: VariantCoordinates) -> str: # MNV return f"{chrom}:{start}-{end}/{alt}" + + @staticmethod + def _seems_like_refseq_tx(tx_id: str) -> bool: + return tx_id.startswith('NM_') or tx_id.startswith('XM_') diff --git a/tests/preprocessing/data/vep_response/MT_11778_11778_G_A.json b/tests/preprocessing/data/vep_response/MT_11778_11778_G_A.json new file mode 100644 index 000000000..09b3580e3 --- /dev/null +++ b/tests/preprocessing/data/vep_response/MT_11778_11778_G_A.json @@ -0,0 +1,506 @@ +{ + "id": "MT_11778_G/A", + "variant_class": "SNV", + "input": "MT 11778 11778 G/A 1", + "most_severe_consequence": "missense_variant", + "transcript_consequences": [ + { + "gene_id": "4508", + "consequence_terms": [ + "downstream_gene_variant" + ], + "transcript_id": "ATP6.1", + "strand": 1, + "used_ref": "G", + "protein_id": "YP_003024031.1", + "variant_allele": "A", + "canonical": 1, + "distance": 2571, + "given_ref": "G", + "gene_symbol": "ATP6", + "gene_symbol_source": "EntrezGene", + "impact": "MODIFIER", + "biotype": "protein_coding" + }, + { + "given_ref": "G", + "gene_symbol": "ATP8", + "gene_symbol_source": "EntrezGene", + "impact": "MODIFIER", + "biotype": "protein_coding", + "used_ref": "G", + "protein_id": "YP_003024030.1", + "canonical": 1, + "variant_allele": "A", + "distance": 3206, + "strand": 1, + "gene_id": "4509", + "transcript_id": "ATP8.1", + "consequence_terms": [ + "downstream_gene_variant" + ] + }, + { + "transcript_id": "COX1.1", + "consequence_terms": [ + "downstream_gene_variant" + ], + "gene_id": "4512", + "strand": 1, + "variant_allele": "A", + "canonical": 1, + "distance": 4333, + "used_ref": "G", + "protein_id": "YP_003024028.1", + "impact": "MODIFIER", + "biotype": "protein_coding", + "given_ref": "G", + "gene_symbol": "COX1", + "gene_symbol_source": "EntrezGene" + }, + { + "strand": 1, + "gene_id": "4513", + "consequence_terms": [ + "downstream_gene_variant" + ], + "transcript_id": "COX2.1", + "gene_symbol": "COX2", + "gene_symbol_source": "EntrezGene", + "given_ref": "G", + "biotype": "protein_coding", + "impact": "MODIFIER", + "used_ref": "G", + "protein_id": "YP_003024029.1", + "distance": 3509, + "variant_allele": "A", + "canonical": 1 + }, + { + "strand": 1, + "consequence_terms": [ + "downstream_gene_variant" + ], + "transcript_id": "COX3.1", + "gene_id": "4514", + "impact": "MODIFIER", + "biotype": "protein_coding", + "given_ref": "G", + "gene_symbol_source": "EntrezGene", + "gene_symbol": "COX3", + "variant_allele": "A", + "canonical": 1, + "distance": 1788, + "protein_id": "YP_003024032.1", + "used_ref": "G" + }, + { + "gene_symbol_source": "EntrezGene", + "gene_symbol": "CYTB", + "given_ref": "G", + "biotype": "protein_coding", + "impact": "MODIFIER", + "protein_id": "YP_003024038.1", + "used_ref": "G", + "canonical": 1, + "variant_allele": "A", + "distance": 2969, + "strand": 1, + "gene_id": "4519", + "consequence_terms": [ + "upstream_gene_variant" + ], + "transcript_id": "CYTB.1" + }, + { + "impact": "MODIFIER", + "biotype": "protein_coding", + "given_ref": "G", + "gene_symbol": "ND3", + "gene_symbol_source": "EntrezGene", + "distance": 1374, + "variant_allele": "A", + "canonical": 1, + "protein_id": "YP_003024033.1", + "used_ref": "G", + "strand": 1, + "transcript_id": "ND3.1", + "consequence_terms": [ + "downstream_gene_variant" + ], + "gene_id": "4537" + }, + { + "variant_allele": "A", + "distance": 1012, + "canonical": 1, + "protein_id": "YP_003024034.1", + "used_ref": "G", + "biotype": "protein_coding", + "impact": "MODIFIER", + "gene_symbol": "ND4L", + "gene_symbol_source": "EntrezGene", + "given_ref": "G", + "consequence_terms": [ + "downstream_gene_variant" + ], + "transcript_id": "ND4L.1", + "gene_id": "4539", + "strand": 1 + }, + { + "variant_allele": "A", + "cdna_end": 1019, + "protein_id": "YP_003024035.1", + "used_ref": "G", + "impact": "MODERATE", + "cds_start": 1019, + "transcript_id": "ND4.1", + "sift_prediction": "deleterious_low_confidence", + "gene_id": "4538", + "codons": "cGc/cAc", + "polyphen_score": 0.996, + "canonical": 1, + "hgvsp": "YP_003024035.1:p.Arg340His", + "amino_acids": "R/H", + "mutfunc": { + "mod": { + "dG_mt": 346.5258, + "ddG_sd": 0.009126, + "dG_mt_sd": 0.095423, + "dG_wt_sd": 0.097037, + "ddG": 0.3596, + "dG_wt": 346.1662 + } + }, + "protein_end": 340, + "cds_end": 1019, + "biotype": "protein_coding", + "exon": "1/1", + "gene_symbol": "ND4", + "gene_symbol_source": "EntrezGene", + "given_ref": "G", + "sift_score": 0, + "consequence_terms": [ + "missense_variant" + ], + "hgvsc": "ND4.1:c.1019G>A", + "cdna_start": 1019, + "polyphen_prediction": "probably_damaging", + "protein_start": 340, + "strand": 1 + }, + { + "impact": "MODIFIER", + "biotype": "protein_coding", + "given_ref": "G", + "gene_symbol_source": "EntrezGene", + "gene_symbol": "ND5", + "variant_allele": "A", + "distance": 559, + "canonical": 1, + "used_ref": "G", + "protein_id": "YP_003024036.1", + "strand": 1, + "consequence_terms": [ + "upstream_gene_variant" + ], + "transcript_id": "ND5.1", + "gene_id": "4540" + }, + { + "transcript_id": "ND6.1", + "consequence_terms": [ + "downstream_gene_variant" + ], + "gene_id": "4541", + "strand": -1, + "canonical": 1, + "variant_allele": "A", + "distance": 2371, + "used_ref": "G", + "protein_id": "YP_003024037.1", + "impact": "MODIFIER", + "biotype": "protein_coding", + "given_ref": "G", + "gene_symbol": "ND6", + "gene_symbol_source": "EntrezGene" + }, + { + "variant_allele": "A", + "distance": 4193, + "canonical": 1, + "used_ref": "G", + "biotype": "tRNA", + "impact": "MODIFIER", + "gene_symbol_source": "EntrezGene", + "gene_symbol": "TRND", + "given_ref": "G", + "consequence_terms": [ + "downstream_gene_variant" + ], + "transcript_id": "TRND.1", + "gene_id": "4555", + "strand": 1 + }, + { + "impact": "MODIFIER", + "biotype": "tRNA", + "given_ref": "G", + "gene_symbol_source": "EntrezGene", + "gene_symbol": "TRNE", + "variant_allele": "A", + "distance": 2896, + "canonical": 1, + "used_ref": "G", + "strand": -1, + "transcript_id": "TRNE.1", + "consequence_terms": [ + "downstream_gene_variant" + ], + "gene_id": "4556" + }, + { + "transcript_id": "TRNG.1", + "consequence_terms": [ + "downstream_gene_variant" + ], + "gene_id": "4563", + "strand": 1, + "distance": 1720, + "variant_allele": "A", + "canonical": 1, + "used_ref": "G", + "impact": "MODIFIER", + "biotype": "tRNA", + "given_ref": "G", + "gene_symbol": "TRNG", + "gene_symbol_source": "EntrezGene" + }, + { + "distance": 360, + "variant_allele": "A", + "canonical": 1, + "used_ref": "G", + "impact": "MODIFIER", + "biotype": "tRNA", + "given_ref": "G", + "gene_symbol": "TRNH", + "gene_symbol_source": "EntrezGene", + "transcript_id": "TRNH.1", + "consequence_terms": [ + "upstream_gene_variant" + ], + "gene_id": "4564", + "strand": 1 + }, + { + "consequence_terms": [ + "downstream_gene_variant" + ], + "transcript_id": "TRNK.1", + "gene_id": "4566", + "strand": 1, + "canonical": 1, + "variant_allele": "A", + "distance": 3414, + "used_ref": "G", + "impact": "MODIFIER", + "biotype": "tRNA", + "given_ref": "G", + "gene_symbol": "TRNK", + "gene_symbol_source": "EntrezGene" + }, + { + "strand": 1, + "consequence_terms": [ + "upstream_gene_variant" + ], + "transcript_id": "TRNL2.1", + "gene_id": "4568", + "biotype": "tRNA", + "impact": "MODIFIER", + "gene_symbol": "TRNL2", + "gene_symbol_source": "EntrezGene", + "given_ref": "G", + "variant_allele": "A", + "canonical": 1, + "distance": 488, + "used_ref": "G" + }, + { + "biotype": "tRNA", + "impact": "MODIFIER", + "gene_symbol": "TRNP", + "gene_symbol_source": "EntrezGene", + "given_ref": "G", + "distance": 4178, + "variant_allele": "A", + "canonical": 1, + "used_ref": "G", + "strand": -1, + "transcript_id": "TRNP.1", + "consequence_terms": [ + "downstream_gene_variant" + ], + "gene_id": "4571" + }, + { + "biotype": "tRNA", + "impact": "MODIFIER", + "gene_symbol_source": "EntrezGene", + "gene_symbol": "TRNR", + "given_ref": "G", + "variant_allele": "A", + "canonical": 1, + "distance": 1309, + "used_ref": "G", + "strand": 1, + "transcript_id": "TRNR.1", + "consequence_terms": [ + "downstream_gene_variant" + ], + "gene_id": "4573" + }, + { + "gene_symbol": "TRNS1", + "gene_symbol_source": "EntrezGene", + "given_ref": "G", + "biotype": "tRNA", + "impact": "MODIFIER", + "used_ref": "G", + "canonical": 1, + "variant_allele": "A", + "distance": 4264, + "strand": -1, + "gene_id": "4574", + "consequence_terms": [ + "upstream_gene_variant" + ], + "transcript_id": "TRNS1.1" + }, + { + "used_ref": "G", + "distance": 429, + "variant_allele": "A", + "canonical": 1, + "given_ref": "G", + "gene_symbol_source": "EntrezGene", + "gene_symbol": "TRNS2", + "impact": "MODIFIER", + "biotype": "tRNA", + "gene_id": "4575", + "transcript_id": "TRNS2.1", + "consequence_terms": [ + "upstream_gene_variant" + ], + "strand": 1 + }, + { + "given_ref": "G", + "gene_symbol": "TRNT", + "gene_symbol_source": "EntrezGene", + "impact": "MODIFIER", + "biotype": "tRNA", + "used_ref": "G", + "variant_allele": "A", + "canonical": 1, + "distance": 4110, + "strand": 1, + "gene_id": "4576", + "consequence_terms": [ + "upstream_gene_variant" + ], + "transcript_id": "TRNT.1" + } + ], + "allele_string": "G/A", + "start": 11778, + "seq_region_name": "MT", + "assembly_name": "GRCh38", + "strand": 1, + "colocated_variants": [ + { + "start": 11778, + "id": "rs199476112", + "strand": 1, + "clin_sig_allele": "A:pathogenic", + "end": 11778, + "var_synonyms": { + "OMIM": [ + 516003.0001 + ], + "UniProt": [ + "VAR_004760" + ], + "ClinVar": [ + "RCV002260593", + "RCV002288481", + "RCV002285007", + "RCV004814876", + "RCV004814875", + "RCV000010354", + "VCV000009708", + "RCV000224219" + ] + }, + "seq_region_name": "MT", + "pubmed": [ + 25741868, + 1900003, + 20301353, + 34002094, + 12560876, + 1417830, + 8755941, + 19026397, + 1346348, + 1352537, + 1734726, + 1763894, + 1770533, + 1770665, + 1866007, + 1937476, + 1959619, + 1959931, + 2039048, + 2222273, + 2286378, + 2346190, + 2346203, + 2390098, + 2566021, + 2566116, + 2575667, + 2817063, + 3201231, + 8101084, + 8240101, + 8240102, + 8448903, + 8449667, + 8457609, + 8474822, + 8489402, + 8489411, + 9150158, + 11169561, + 11854175, + 12402249, + 16431939, + 16477364, + 16532388, + 18771762, + 29774306, + 39578757 + ], + "clin_sig": [ + "pathogenic" + ], + "allele_string": "G/A", + "phenotype_or_disease": 1 + } + ], + "end": 11778 +} \ No newline at end of file diff --git a/tests/preprocessing/test_vep.py b/tests/preprocessing/test_vep.py index f04660990..0dfa29d03 100644 --- a/tests/preprocessing/test_vep.py +++ b/tests/preprocessing/test_vep.py @@ -12,6 +12,7 @@ LMNA_MANE_TX_ID = "NM_170707.4" ANKRD11_MANE_TX_ID = "NM_013275.6" +ND4_TX_ID = "ND4.1" @pytest.mark.parametrize( @@ -188,6 +189,18 @@ def test_process_response_deletion( 803, 804, ), + ( # `MT_11778_11778_G_A` + "MT", + 11_777, + 11_778, + "G", + "A", + 0, + ND4_TX_ID, + "YP_003024035.1", + 339, + 340, + ), ], ) def test_parse_response( @@ -274,6 +287,14 @@ def test_parse_response( "T", 0, ), # `X_31180437_31180437_C_T` + ( + "MT", + 11_777, + 11_778, + "G", + "A", + 0, + ), # `MT_11778_11778_G_A` ], ) def test_fetch_response( From 3283f25ea9951c6e8a3affbff0e00a49874bed8c Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Fri, 12 Dec 2025 13:57:24 +0100 Subject: [PATCH 2/4] Handle refseq transcripts. --- src/gpsea/preprocessing/_vep.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/gpsea/preprocessing/_vep.py b/src/gpsea/preprocessing/_vep.py index 75ef6de3a..4a22f0fcc 100644 --- a/src/gpsea/preprocessing/_vep.py +++ b/src/gpsea/preprocessing/_vep.py @@ -33,6 +33,11 @@ class VepFunctionalAnnotator(FunctionalAnnotator): Non-coding variant effects where we do not complain if the functional annotation lacks the protein effects. """ + _REFSEQ_TX_ID_PREFIXES = ( + 'NM_', 'NR_', 'NC_', + 'XM_', 'XR_', 'XC_', + ) + def __init__(self, include_computational_txs: bool = False, timeout: float = 10.0): self._logger = logging.getLogger(__name__) self._url = ( @@ -203,4 +208,4 @@ def format_coordinates_for_vep_query(vc: VariantCoordinates) -> str: @staticmethod def _seems_like_refseq_tx(tx_id: str) -> bool: - return tx_id.startswith('NM_') or tx_id.startswith('XM_') + return len(tx_id) >= 3 and tx_id[:3] in VepFunctionalAnnotator._REFSEQ_TX_ID_PREFIXES From 2572b58c37f02995dceab7e9fa78adbee9b47445 Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Fri, 12 Dec 2025 14:06:03 +0100 Subject: [PATCH 3/4] Regenerate caches, format code. --- .../6_32040421_32040421_C_T.json | 126 ++++++++ .../variant_cache/MT_11778_11778_G_A.json | 293 ++++++++++++++++++ src/gpsea/preprocessing/_vep.py | 10 +- tests/test_random.py | 33 ++ 4 files changed, 459 insertions(+), 3 deletions(-) create mode 100644 .gpsea_ci_cachedir/variant_cache/6_32040421_32040421_C_T.json create mode 100644 .gpsea_ci_cachedir/variant_cache/MT_11778_11778_G_A.json create mode 100644 tests/test_random.py diff --git a/.gpsea_ci_cachedir/variant_cache/6_32040421_32040421_C_T.json b/.gpsea_ci_cachedir/variant_cache/6_32040421_32040421_C_T.json new file mode 100644 index 000000000..c3883bb63 --- /dev/null +++ b/.gpsea_ci_cachedir/variant_cache/6_32040421_32040421_C_T.json @@ -0,0 +1,126 @@ +[ + { + "gene_symbol": "CYP21A2", + "transcript_id": "NM_000500.9", + "hgvs_cdna": "NM_000500.9:c.955C>T", + "is_preferred": true, + "variant_effects": [ + "STOP_GAINED" + ], + "overlapping_exons": [ + 8 + ], + "protein_id": "NP_000491.4", + "hgvsp": "NP_000491.4:p.Gln319Ter", + "protein_effect_location": { + "start": 318, + "end": 319 + } + }, + { + "gene_symbol": "CYP21A2", + "transcript_id": "NM_001128590.4", + "hgvs_cdna": "NM_001128590.4:c.865C>T", + "is_preferred": false, + "variant_effects": [ + "STOP_GAINED" + ], + "overlapping_exons": [ + 7 + ], + "protein_id": "NP_001122062.3", + "hgvsp": "NP_001122062.3:p.Gln289Ter", + "protein_effect_location": { + "start": 288, + "end": 289 + } + }, + { + "gene_symbol": "TNXB", + "transcript_id": "NM_001365276.2", + "hgvs_cdna": null, + "is_preferred": true, + "variant_effects": [ + "DOWNSTREAM_GENE_VARIANT" + ], + "overlapping_exons": null, + "protein_id": "NP_001352205.1", + "hgvsp": null, + "protein_effect_location": null + }, + { + "gene_symbol": "CYP21A2", + "transcript_id": "NM_001368143.2", + "hgvs_cdna": "NM_001368143.2:c.550C>T", + "is_preferred": false, + "variant_effects": [ + "STOP_GAINED" + ], + "overlapping_exons": [ + 8 + ], + "protein_id": "NP_001355072.1", + "hgvsp": "NP_001355072.1:p.Gln184Ter", + "protein_effect_location": { + "start": 183, + "end": 184 + } + }, + { + "gene_symbol": "CYP21A2", + "transcript_id": "NM_001368144.2", + "hgvs_cdna": "NM_001368144.2:c.550C>T", + "is_preferred": false, + "variant_effects": [ + "STOP_GAINED" + ], + "overlapping_exons": [ + 7 + ], + "protein_id": "NP_001355073.1", + "hgvsp": "NP_001355073.1:p.Gln184Ter", + "protein_effect_location": { + "start": 183, + "end": 184 + } + }, + { + "gene_symbol": "TNXB", + "transcript_id": "NM_001428335.1", + "hgvs_cdna": null, + "is_preferred": false, + "variant_effects": [ + "DOWNSTREAM_GENE_VARIANT" + ], + "overlapping_exons": null, + "protein_id": "NP_001415264.1", + "hgvsp": null, + "protein_effect_location": null + }, + { + "gene_symbol": "TNXB", + "transcript_id": "NM_019105.8", + "hgvs_cdna": null, + "is_preferred": false, + "variant_effects": [ + "DOWNSTREAM_GENE_VARIANT" + ], + "overlapping_exons": null, + "protein_id": "NP_061978.6", + "hgvsp": null, + "protein_effect_location": null + }, + { + "gene_symbol": "TNXB", + "transcript_id": "NM_032470.4", + "hgvs_cdna": null, + "is_preferred": false, + "variant_effects": [ + "DOWNSTREAM_GENE_VARIANT" + ], + "overlapping_exons": null, + "protein_id": "NP_115859.2", + "hgvsp": null, + "protein_effect_location": null + } +] \ No newline at end of file diff --git a/.gpsea_ci_cachedir/variant_cache/MT_11778_11778_G_A.json b/.gpsea_ci_cachedir/variant_cache/MT_11778_11778_G_A.json new file mode 100644 index 000000000..5acdea931 --- /dev/null +++ b/.gpsea_ci_cachedir/variant_cache/MT_11778_11778_G_A.json @@ -0,0 +1,293 @@ +[ + { + "gene_symbol": "ATP6", + "transcript_id": "ATP6.1", + "hgvs_cdna": null, + "is_preferred": true, + "variant_effects": [ + "DOWNSTREAM_GENE_VARIANT" + ], + "overlapping_exons": null, + "protein_id": "YP_003024031.1", + "hgvsp": null, + "protein_effect_location": null + }, + { + "gene_symbol": "ATP8", + "transcript_id": "ATP8.1", + "hgvs_cdna": null, + "is_preferred": true, + "variant_effects": [ + "DOWNSTREAM_GENE_VARIANT" + ], + "overlapping_exons": null, + "protein_id": "YP_003024030.1", + "hgvsp": null, + "protein_effect_location": null + }, + { + "gene_symbol": "COX1", + "transcript_id": "COX1.1", + "hgvs_cdna": null, + "is_preferred": true, + "variant_effects": [ + "DOWNSTREAM_GENE_VARIANT" + ], + "overlapping_exons": null, + "protein_id": "YP_003024028.1", + "hgvsp": null, + "protein_effect_location": null + }, + { + "gene_symbol": "COX2", + "transcript_id": "COX2.1", + "hgvs_cdna": null, + "is_preferred": true, + "variant_effects": [ + "DOWNSTREAM_GENE_VARIANT" + ], + "overlapping_exons": null, + "protein_id": "YP_003024029.1", + "hgvsp": null, + "protein_effect_location": null + }, + { + "gene_symbol": "COX3", + "transcript_id": "COX3.1", + "hgvs_cdna": null, + "is_preferred": true, + "variant_effects": [ + "DOWNSTREAM_GENE_VARIANT" + ], + "overlapping_exons": null, + "protein_id": "YP_003024032.1", + "hgvsp": null, + "protein_effect_location": null + }, + { + "gene_symbol": "CYTB", + "transcript_id": "CYTB.1", + "hgvs_cdna": null, + "is_preferred": true, + "variant_effects": [ + "UPSTREAM_GENE_VARIANT" + ], + "overlapping_exons": null, + "protein_id": "YP_003024038.1", + "hgvsp": null, + "protein_effect_location": null + }, + { + "gene_symbol": "ND3", + "transcript_id": "ND3.1", + "hgvs_cdna": null, + "is_preferred": true, + "variant_effects": [ + "DOWNSTREAM_GENE_VARIANT" + ], + "overlapping_exons": null, + "protein_id": "YP_003024033.1", + "hgvsp": null, + "protein_effect_location": null + }, + { + "gene_symbol": "ND4L", + "transcript_id": "ND4L.1", + "hgvs_cdna": null, + "is_preferred": true, + "variant_effects": [ + "DOWNSTREAM_GENE_VARIANT" + ], + "overlapping_exons": null, + "protein_id": "YP_003024034.1", + "hgvsp": null, + "protein_effect_location": null + }, + { + "gene_symbol": "ND4", + "transcript_id": "ND4.1", + "hgvs_cdna": "ND4.1:c.1019G>A", + "is_preferred": true, + "variant_effects": [ + "MISSENSE_VARIANT" + ], + "overlapping_exons": [ + 1 + ], + "protein_id": "YP_003024035.1", + "hgvsp": "YP_003024035.1:p.Arg340His", + "protein_effect_location": { + "start": 339, + "end": 340 + } + }, + { + "gene_symbol": "ND5", + "transcript_id": "ND5.1", + "hgvs_cdna": null, + "is_preferred": true, + "variant_effects": [ + "UPSTREAM_GENE_VARIANT" + ], + "overlapping_exons": null, + "protein_id": "YP_003024036.1", + "hgvsp": null, + "protein_effect_location": null + }, + { + "gene_symbol": "ND6", + "transcript_id": "ND6.1", + "hgvs_cdna": null, + "is_preferred": true, + "variant_effects": [ + "DOWNSTREAM_GENE_VARIANT" + ], + "overlapping_exons": null, + "protein_id": "YP_003024037.1", + "hgvsp": null, + "protein_effect_location": null + }, + { + "gene_symbol": "TRND", + "transcript_id": "TRND.1", + "hgvs_cdna": null, + "is_preferred": true, + "variant_effects": [ + "DOWNSTREAM_GENE_VARIANT" + ], + "overlapping_exons": null, + "protein_id": null, + "hgvsp": null, + "protein_effect_location": null + }, + { + "gene_symbol": "TRNE", + "transcript_id": "TRNE.1", + "hgvs_cdna": null, + "is_preferred": true, + "variant_effects": [ + "DOWNSTREAM_GENE_VARIANT" + ], + "overlapping_exons": null, + "protein_id": null, + "hgvsp": null, + "protein_effect_location": null + }, + { + "gene_symbol": "TRNG", + "transcript_id": "TRNG.1", + "hgvs_cdna": null, + "is_preferred": true, + "variant_effects": [ + "DOWNSTREAM_GENE_VARIANT" + ], + "overlapping_exons": null, + "protein_id": null, + "hgvsp": null, + "protein_effect_location": null + }, + { + "gene_symbol": "TRNH", + "transcript_id": "TRNH.1", + "hgvs_cdna": null, + "is_preferred": true, + "variant_effects": [ + "UPSTREAM_GENE_VARIANT" + ], + "overlapping_exons": null, + "protein_id": null, + "hgvsp": null, + "protein_effect_location": null + }, + { + "gene_symbol": "TRNK", + "transcript_id": "TRNK.1", + "hgvs_cdna": null, + "is_preferred": true, + "variant_effects": [ + "DOWNSTREAM_GENE_VARIANT" + ], + "overlapping_exons": null, + "protein_id": null, + "hgvsp": null, + "protein_effect_location": null + }, + { + "gene_symbol": "TRNL2", + "transcript_id": "TRNL2.1", + "hgvs_cdna": null, + "is_preferred": true, + "variant_effects": [ + "UPSTREAM_GENE_VARIANT" + ], + "overlapping_exons": null, + "protein_id": null, + "hgvsp": null, + "protein_effect_location": null + }, + { + "gene_symbol": "TRNP", + "transcript_id": "TRNP.1", + "hgvs_cdna": null, + "is_preferred": true, + "variant_effects": [ + "DOWNSTREAM_GENE_VARIANT" + ], + "overlapping_exons": null, + "protein_id": null, + "hgvsp": null, + "protein_effect_location": null + }, + { + "gene_symbol": "TRNR", + "transcript_id": "TRNR.1", + "hgvs_cdna": null, + "is_preferred": true, + "variant_effects": [ + "DOWNSTREAM_GENE_VARIANT" + ], + "overlapping_exons": null, + "protein_id": null, + "hgvsp": null, + "protein_effect_location": null + }, + { + "gene_symbol": "TRNS1", + "transcript_id": "TRNS1.1", + "hgvs_cdna": null, + "is_preferred": true, + "variant_effects": [ + "UPSTREAM_GENE_VARIANT" + ], + "overlapping_exons": null, + "protein_id": null, + "hgvsp": null, + "protein_effect_location": null + }, + { + "gene_symbol": "TRNS2", + "transcript_id": "TRNS2.1", + "hgvs_cdna": null, + "is_preferred": true, + "variant_effects": [ + "UPSTREAM_GENE_VARIANT" + ], + "overlapping_exons": null, + "protein_id": null, + "hgvsp": null, + "protein_effect_location": null + }, + { + "gene_symbol": "TRNT", + "transcript_id": "TRNT.1", + "hgvs_cdna": null, + "is_preferred": true, + "variant_effects": [ + "UPSTREAM_GENE_VARIANT" + ], + "overlapping_exons": null, + "protein_id": null, + "hgvsp": null, + "protein_effect_location": null + } +] \ No newline at end of file diff --git a/src/gpsea/preprocessing/_vep.py b/src/gpsea/preprocessing/_vep.py index 4a22f0fcc..a605a13c9 100644 --- a/src/gpsea/preprocessing/_vep.py +++ b/src/gpsea/preprocessing/_vep.py @@ -34,8 +34,12 @@ class VepFunctionalAnnotator(FunctionalAnnotator): """ _REFSEQ_TX_ID_PREFIXES = ( - 'NM_', 'NR_', 'NC_', - 'XM_', 'XR_', 'XC_', + "NM_", + "NR_", + "NC_", + "XM_", + "XR_", + "XC_", ) def __init__(self, include_computational_txs: bool = False, timeout: float = 10.0): @@ -205,7 +209,7 @@ def format_coordinates_for_vep_query(vc: VariantCoordinates) -> str: # MNV return f"{chrom}:{start}-{end}/{alt}" - + @staticmethod def _seems_like_refseq_tx(tx_id: str) -> bool: return len(tx_id) >= 3 and tx_id[:3] in VepFunctionalAnnotator._REFSEQ_TX_ID_PREFIXES diff --git a/tests/test_random.py b/tests/test_random.py new file mode 100644 index 000000000..10a37fe91 --- /dev/null +++ b/tests/test_random.py @@ -0,0 +1,33 @@ +# Tests of random stuff asked for by the users. + +import hpotk +import pytest + +from gpsea.preprocessing import CohortCreator, configure_caching_cohort_creator, load_phenopacket_files + + +@pytest.fixture(scope="module") +def hpo() -> hpotk.MinimalOntology: + store = hpotk.configure_ontology_store() + return store.load_minimal_hpo(release="v2025-10-22") + + +@pytest.fixture(scope="module") +def cohort_creator( + hpo: hpotk.MinimalOntology, +) -> CohortCreator: + return configure_caching_cohort_creator( + hpo=hpo, + ) + + +@pytest.mark.skip(reason="Just for interactive debugging") +def test_load_phenopacket( + cohort_creator: CohortCreator, +): + pps = ("dev/Mito/1-10011778Ff.json",) + _cohort, qc = load_phenopacket_files( + pp_files=pps, + cohort_creator=cohort_creator, + ) + qc.summarize() From 739d4f5bc80c81d4f3eea57d5c54e026ab141cc7 Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Fri, 12 Dec 2025 14:18:21 +0100 Subject: [PATCH 4/4] Clear previously set env variable. --- tests/test_config.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/test_config.py b/tests/test_config.py index e9a341c3b..6d613e16d 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -30,5 +30,7 @@ def test_create_using_environment_variable( assert cd == target - if previous is not None: + if previous is None: + del os.environ[CACHE_ENV] + else: os.environ[CACHE_ENV] = previous