From 3ecd46e4a87bec0fd516b7b93dedc58af1f0e2bf Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Wed, 5 Nov 2025 15:07:15 +0100 Subject: [PATCH 1/4] Update Custom components section. --- docs/user-guide/custom-components.rst | 82 +++++++++++++++++---------- 1 file changed, 53 insertions(+), 29 deletions(-) diff --git a/docs/user-guide/custom-components.rst b/docs/user-guide/custom-components.rst index af851a2c5..6cd64803f 100644 --- a/docs/user-guide/custom-components.rst +++ b/docs/user-guide/custom-components.rst @@ -36,9 +36,9 @@ for using body mass index (BMI) as a phenotype score. >>> from gpsea.analysis.pscore import PhenotypeScorer >>> class BmiScorer(PhenotypeScorer): # ❶ ... -... def __init__( # ❷ +... def __init__( ... self, -... id2bmi: typing.Mapping[str, float], +... id2bmi: typing.Mapping[str, float], # ❷ ... ): ... self._id2bmi = id2bmi ... @@ -60,19 +60,22 @@ for using body mass index (BMI) as a phenotype score. ... except KeyError: ... return float('nan') -❶ The ``BmiScorer`` must extend :class:`~gpsea.analysis.pscore.PhenotypeScorer` -to be used as a phenotype scorer. -❷ The scorer needs a ``dict`` with `label` → `BMI` for the analyzed individuals. -We assume the user will pre-compute the corresponding ``dict``. +The ``BmiScorer`` must extend :class:`~gpsea.analysis.pscore.PhenotypeScorer` +to be used as a phenotype scorer (❶). +The scorer needs a mapping (e.g. a Python ``dict``) with `label` → `BMI` for the analyzed individuals (❷). +We assume the user will pre-compute the BMI values. + +Then, the scorer must expose several properties, including ``name``, ``description``, +and the ``variable_name`` it operates on (❸❹❺). +GPSEA uses the properties to describe the scorer in reports or visualizations. +We should always aim for short and concise descriptions. -Then, the scorer must expose several properties, including ❸ ``name``, ❹ ``description``, -and the ❺ ``variable_name`` it operates on. -The properties provide bookkeeping metadata to use in e.g. 
visualizations. -Try to choose short and concise names. +The most important part of the scorer is the `score` method (❻). +As stated above, the scorer is expected to compute a numerical value or `NaN` +if the individual should be excluded from the analysis. +In the case of the BMI scorer, the BMI is retrieved from the ``id2bmi`` dictionary. +If the BMI is missing, `NaN` is returned and the individual is omitted from the analysis. -The most important part of the scorer is the ❻ `score` method -which retrieves the BMI for an individual or returns `NaN` if the value is not available -and the individual should be omitted from the analysis. .. _custom-variant-predicate: @@ -80,16 +83,17 @@ and the individual should be omitted from the analysis. Variant predicate ***************** -The purpose of a :class:`~gpsea.analysis.predicate.VariantPredicate` is to test -if a variant meets a certain criterion and GPSEA ships with an array -of builtin predicates (see :mod:`gpsea.analysis.predicate` module). -However, chances are a custom predicate will be needed in future, -so we show how to how to extend -the :class:`~gpsea.analysis.predicate.VariantPredicate` class -to create one's own predicate. +A :class:`~gpsea.analysis.predicate.VariantPredicate` tests +if a variant meets a certain criterion (e.g. variant is a deletion, variant is annotated with respect to a transcript of interest) +in order to assign the individual harboring the variant into a genotype class. +GPSEA ships with an array of builtin predicates (see :mod:`gpsea.analysis.predicate` module) +that should cover the most commonly needed cases. + +However, since it is unlikely that the builtin predicates cover *all* cases, +GPSEA allows defining custom variant predicates. Here we show how to create one. -Specifically, we show how to create a predicate to test if the variant affects a glycine residue -of the transcript of interest. 
+As an example, we show how to create a predicate for checking if the variant affects a glycine residue +in a transcript of interest. >>> from gpsea.model import Variant, VariantEffect >>> from gpsea.analysis.predicate import VariantPredicate @@ -133,10 +137,30 @@ of the transcript of interest. ... def __str__(self) -> str: # ➓ ... return f"AffectsGlycinePredicate(tx_id={self._tx_id})" -❶ The ``AffectsGlycinePredicate`` must extend :class:`~gpsea.analysis.predicate.VariantPredicate`. -❷ We ask the user to provide the transcript accession `str` and we set the target aminoacid code to glycine ``Gly``. -Like in the :ref:`custom-phenotype-scorer` above, ❸❹❺ provide metadata required for the bookkeeping. -The ❻ ``test`` method includes the most interesting part - we retrieve the :class:`~gpsea.model.TranscriptAnnotation` -with the functional annotation data for the transcript of interest, and we test if the HGVS protein indicates -that the reference aminoacid is glycine. -Last, we override ➐ ``__eq__()`` and ❽ ``__hash__()`` (required) as well as ❾ ``__repr__()`` and ➓ ``__str__()`` (recommended). + +The ``AffectsGlycinePredicate`` must extend :class:`~gpsea.analysis.predicate.VariantPredicate` to work with GPSEA (❶). +We ask the user to provide the transcript accession `str` and we set the target aminoacid code to glycine ``Gly`` (❷). + +.. note:: + + Clearly, the predicate can be generalized to test for a change of *any* aminoacid + with only a slight rewrite of the predicate's constructor. + We will leave this as an exercise for the interested readers. + +Like in the :ref:`custom-phenotype-scorer` above, we provide metadata required for reports and visualizations (❸❹❺). + +The ``test`` method includes the most important logic of the predicate (❻). +In this specific case, we retrieve the :class:`~gpsea.model.TranscriptAnnotation` +with the functional annotation data for the transcript of interest, +and we test if the HGVS protein indicates that the reference aminoacid is glycine. + +.. 
note:: + + We recommend using an Integrated Development Environment (IDE) such as PyCharm or VS Code to design the predicate. + On top of autocompletion and syntax checking features, an IDE simplifies accessing the properties and methods of objects. + In the case of :class:`~gpsea.model.Variant`, an IDE will help us discover its ``get_tx_anno_by_tx_id`` method, + realize that it returns either :class:`~gpsea.model.TranscriptAnnotation` or ``None``, + and retrieve the functional annotation of the variant with respect to the transcript's protein sequence + from the ``hgvsp`` field. + +Last, we override ``__eq__()`` and ``__hash__()`` (required, ➐❽) as well as ``__repr__()`` and ``__str__()`` (recommended, ❾➓). From 2cc3c77149a158d095199b3a70e5be78314244c4 Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Thu, 6 Nov 2025 09:22:26 +0100 Subject: [PATCH 2/4] Add numpy compatibility notes. --- pyproject.toml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index 6e7f58c37..9ed153b18 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -41,6 +41,8 @@ dependencies = [ "requests>=2.25.0,<3.0", "scipy>=1.10,<2.0", "statsmodels>=0.13.0", + # numpy==1.23 was released on June 23rd, 2022. + # numpy==2.0.0 does not break any APIs used by GPSEA. "numpy>=1.23", "matplotlib>=3.2.0,<4.0", "ratelimit>=2.2.1,<3", From 2244dfaa8f57ace1ad6ff9996df69e5e610bf9bf Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Fri, 7 Nov 2025 14:28:47 +0100 Subject: [PATCH 3/4] Remove typo. --- src/gpsea/analysis/predicate/_predicates.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gpsea/analysis/predicate/_predicates.py b/src/gpsea/analysis/predicate/_predicates.py index 9537c213c..98ab8150b 100644 --- a/src/gpsea/analysis/predicate/_predicates.py +++ b/src/gpsea/analysis/predicate/_predicates.py @@ -194,7 +194,7 @@ class VariantTranscriptPredicate(VariantPredicate): is annotated to affect a transcript with `tx_id` accession. 
Args: - tx_id (str): the accessiono of the transcript of interest, e.g. `NM_123456.7` + tx_id (str): the accession of the transcript of interest, e.g. `NM_123456.7` """ def __init__(self, tx_id: str) -> None: From 59ef4c39c8e6452176a05e350e57d00b10413a0a Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Fri, 12 Dec 2025 14:34:49 +0100 Subject: [PATCH 4/4] Add a TODO. --- src/gpsea/analysis/pscore/_api.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/gpsea/analysis/pscore/_api.py b/src/gpsea/analysis/pscore/_api.py index 5db1e9934..3564f321b 100644 --- a/src/gpsea/analysis/pscore/_api.py +++ b/src/gpsea/analysis/pscore/_api.py @@ -356,6 +356,7 @@ def compare_genotype_vs_phenotype_score( # Sort by PatientCategory.cat_id and unpack. # For now, we only allow to have up to 2 groups. + # TODO: fails if one of the categories includes no individuals. x_key, y_key = sorted(data[MonoPhenotypeAnalysisResult.GT_COL].dropna().unique()) x = data.loc[ data[MonoPhenotypeAnalysisResult.GT_COL] == x_key,