diff --git a/docs/include/example.feature.parquet b/docs/include/example.feature.parquet index 8d85b8d2..5c5fe40a 100644 Binary files a/docs/include/example.feature.parquet and b/docs/include/example.feature.parquet differ diff --git a/docs/include/example.psm.parquet b/docs/include/example.psm.parquet index 95e7ef09..9725e375 100644 Binary files a/docs/include/example.psm.parquet and b/docs/include/example.psm.parquet differ diff --git a/docs/spec/psm.md b/docs/spec/psm.md index ba001658..9005158a 100644 --- a/docs/spec/psm.md +++ b/docs/spec/psm.md @@ -35,16 +35,16 @@ The PSM (Peptide Spectrum Match) view captures spectrum-level identification res These fields are optional and may not exist in the file at all. They are included based on conversion settings or user preference. -| Field | Description | Type | Required | -|-------|-------------|------|----------| +| Field | Description | Type | Required | +|-------|---------------------------------------------------------------------------------------------------------------------------------------------------------|------|----------| | `protein_accessions` | Protein accessions of all proteins that the peptide maps to. Optional because protein mapping can be recovered from the feature and protein group views | array[string], null | no | -| `cross_links` | Cross-link information for XL-MS experiments. Each entry describes one cross-link site. `null` for non-cross-linked PSMs | array[struct], null | no | -| `ion_mobility` | Ion mobility value for the precursor ion | float32, null | no | -| `mz_array` | Array of m/z values for the spectrum | array[float32], null | no | -| `intensity_array` | Array of intensity values for the spectrum | array[float32], null | no | -| `charge_array` | Array of fragment ion charge values | array[int32], null | no | -| `ion_type_array` | Array of fragment ion type annotations (e.g., b, y, a) | array[string], null | no | -| `ion_mobility_array` | Array of fragment ion mobility values | array[float32], null | no | +| `cross_links` | Cross-link information for XL-MS experiments. Each entry describes one cross-link site. `null` for non-cross-linked PSMs | array[struct], null | no | +| `ion_mobility` | Ion mobility value for the precursor ion | float32, null | no | +| `mz_array` | Array of m/z values for the spectrum | array[float32], null | no | +| `intensity_array` | Array of intensity values for the spectrum | array[float32], null | no | +| `charge_array` | Array of fragment ion charge values | array[int32], null | no | +| `ion_type_array` | Array of fragment ion type annotations (e.g., b1, y2, a2) | array[string], null | no | +| `ion_mobility_array` | Array of fragment ion mobility values | array[float32], null | no | !!! note "Nullable vs Optional" Core fields marked as "not required" are **nullable** -- the column always exists in the file but individual values may be null. Optional fields (protein accessions, spectral data) may be **absent from the file entirely**, depending on conversion settings. Protein mappings can be recovered by joining with the feature and protein group views. @@ -101,7 +101,7 @@ Several fields in the PSM view use structures shared across other QPX views: ```json { "sequence": "AAAAAAAAAAGAAGGR", - "peptidoform": "_(Acetyl (Protein N-term))AAAAAAAAAAGAAGGR_", + "peptidoform": "[Acetyl]-AAAAAAAAAAGAAGGR", "charge": 2, "scan": [42164], "rt": 5140.98, @@ -156,7 +156,7 @@ When spectral arrays are included, the record also contains peak-level data: ```json { "sequence": "AAAAAAAAAAGAAGGR", - "peptidoform": "_(Acetyl (Protein N-term))AAAAAAAAAAGAAGGR_", + "peptidoform": "[Acetyl]-AAAAAAAAAAGAAGGR", "charge": 2, "scan": [42164], "rt": 5140.98, diff --git a/qpx/converters/diann/constants.py b/qpx/converters/diann/constants.py index d6bdbf3d..f5bd6111 100644 --- a/qpx/converters/diann/constants.py +++ b/qpx/converters/diann/constants.py @@ -53,7 +53,7 @@ def to_proforma(modified_sequence: str) -> str: return build_proforma(plain_seq, mods) -def to_modifications(modified_sequence: str, sequence: str) -> list[dict] | None: +def to_modifications(modified_sequence: str, sequence: str) -> tuple[str, list[dict] | None]: """Parse modifications from a DIA-NN Modified.Sequence string. Converts to ProForma first, then delegates to the shared ``from_proforma`` @@ -64,7 +64,8 @@ def to_modifications(modified_sequence: str, sequence: str) -> list[dict] | None sequence: Stripped peptide sequence (no modification annotations). Returns: - List of modification dicts per QPX schema, or ``None`` if unmodified. + Tuple of (peptidoform, modifications) where modifications is a list + of modification dicts per QPX schema, or ``None`` if unmodified. """ proforma = to_proforma(modified_sequence) return from_proforma(proforma, sequence, meta=None) diff --git a/qpx/converters/diann/feature_adapter.py b/qpx/converters/diann/feature_adapter.py index 539e8459..fd0d0e10 100644 --- a/qpx/converters/diann/feature_adapter.py +++ b/qpx/converters/diann/feature_adapter.py @@ -231,7 +231,7 @@ def _register_precursor_lookup(self, enzyme_name: str | None) -> dict: charge = int(charge) peptidoform = to_proforma(modified_seq) - modifications = to_modifications(modified_seq, sequence) + _, modifications = to_modifications(modified_seq, sequence) cache_key = (modified_seq, charge) if cache_key not in self._mz_cache: diff --git a/qpx/converters/fragpipe/constants.py b/qpx/converters/fragpipe/constants.py index ca03cf85..df0cd57b 100644 --- a/qpx/converters/fragpipe/constants.py +++ b/qpx/converters/fragpipe/constants.py @@ -70,7 +70,7 @@ def to_proforma(assigned_mods: str, sequence: str) -> str: return build_proforma(sequence, mods) -def to_modifications(assigned_mods: str, sequence: str) -> list[dict] | None: +def to_modifications(assigned_mods: str, sequence: str) -> tuple[str, list[dict] | None]: """Parse modifications from FragPipe Assigned Modifications format. Args: @@ -78,7 +78,8 @@ def to_modifications(assigned_mods: str, sequence: str) -> list[dict] | None: sequence: Stripped peptide sequence. Returns: - List of modification dicts per QPX schema, or ``None`` if unmodified. + Tuple of (peptidoform, modifications) where modifications is a list + of modification dicts per QPX schema, or ``None`` if unmodified. """ proforma = to_proforma(assigned_mods, sequence) return from_proforma(proforma, sequence, meta=None) diff --git a/qpx/converters/fragpipe/feature_adapter.py b/qpx/converters/fragpipe/feature_adapter.py index 6f6b0425..50f82ed8 100644 --- a/qpx/converters/fragpipe/feature_adapter.py +++ b/qpx/converters/fragpipe/feature_adapter.py @@ -384,7 +384,7 @@ def _transform_row( # Modifications (reuse assigned_mods_str already extracted for peptidoform) modifications = None if assigned_mods_str: - modifications = to_modifications(assigned_mods_str, sequence) + _, modifications = to_modifications(assigned_mods_str, sequence) # M/Z (from feature file — used as fallback) mz = safe_float(row.get(r.get("observed_mz", "M/Z"))) or 0.0 diff --git a/qpx/converters/fragpipe/psm_adapter.py b/qpx/converters/fragpipe/psm_adapter.py index bbfa5037..f8885975 100644 --- a/qpx/converters/fragpipe/psm_adapter.py +++ b/qpx/converters/fragpipe/psm_adapter.py @@ -244,7 +244,7 @@ def _transform_row(self, row) -> Optional[dict]: modifications = None assigned_mods = row.get("Assigned Modifications") if pd.notna(assigned_mods) and assigned_mods: - modifications = to_modifications(str(assigned_mods), sequence) + _, modifications = to_modifications(str(assigned_mods), sequence) return { "sequence": sequence, diff --git a/qpx/converters/maxquant/feature_adapter.py b/qpx/converters/maxquant/feature_adapter.py index 1b8ff1a4..d8d9fffc 100644 --- a/qpx/converters/maxquant/feature_adapter.py +++ b/qpx/converters/maxquant/feature_adapter.py @@ -286,7 +286,7 @@ def _transform_row( peptidoform = to_proforma( str(row.get(r.get("modified_sequence", "Modified sequence"), "")), ) - modifications = from_proforma(peptidoform, sequence) if peptidoform else None + _, modifications = from_proforma(peptidoform, sequence) if peptidoform else (None, None) charge = int(row.get(r.get("charge", "Charge"), 0)) run_file_name = str(row.get(r.get("run_file_name", "Raw file"), "")) diff --git a/qpx/converters/maxquant/psm_adapter.py b/qpx/converters/maxquant/psm_adapter.py index 5ff56e01..777cf847 100644 --- a/qpx/converters/maxquant/psm_adapter.py +++ b/qpx/converters/maxquant/psm_adapter.py @@ -162,7 +162,7 @@ def _transform_row(self, row, spectral_data: bool) -> Optional[dict]: if pd.notna(phospho_raw) and phospho_raw: site_scores = parse_phospho_probabilities(str(phospho_raw)) - modifications = ( + peptidoform, modifications = ( from_proforma( peptidoform, sequence, diff --git a/qpx/converters/ptm.py b/qpx/converters/ptm.py index f987efd7..0cf16377 100644 --- a/qpx/converters/ptm.py +++ b/qpx/converters/ptm.py @@ -13,7 +13,7 @@ import re from functools import lru_cache -from typing import Optional +from typing import Optional, Tuple # --------------------------------------------------------------------------- # UNIMOD mass registry @@ -155,6 +155,7 @@ def _normalize_peptidoform(peptidoform: str) -> str: if "(" not in peptidoform: return peptidoform out: list[str] = [] + peptidoform = peptidoform.removeprefix(".") n = len(peptidoform) i = 0 while i < n: @@ -179,7 +180,16 @@ def _normalize_peptidoform(peptidoform: str) -> str: else: out.append(peptidoform[i]) i += 1 - return "".join(out) + + result = "".join(out) + + # For N-term + if result.startswith("["): + idx = result.find("]") + if idx != -1 and idx + 1 < len(result) and result[idx + 1] != "-": + result = result[: idx + 1] + "-" + result[idx + 1 :] + + return result def _from_proforma_impl( @@ -187,7 +197,7 @@ def _from_proforma_impl( sequence: str, meta: Optional[dict] = None, site_scores: Optional[dict[int, list[dict]]] = None, -) -> Optional[list[dict]]: +) -> Tuple[str, Optional[list[dict]]]: """Core implementation of ProForma modification parsing. See :func:`from_proforma` for full documentation. @@ -195,11 +205,12 @@ def _from_proforma_impl( # Normalise mzTab parenthetical notation to ProForma brackets peptidoform = _normalize_peptidoform(peptidoform) if not peptidoform or peptidoform == sequence: - return None + return peptidoform, None mods: dict[str, dict] = {} seq_pos = 0 n = len(peptidoform) + last_aa = None i = 0 while i < n: @@ -207,11 +218,17 @@ def _from_proforma_impl( try: end = peptidoform.index("]", i) except ValueError: - return None # Malformed ProForma + return peptidoform, None # Malformed ProForma mod_str = peptidoform[i + 1 : end] - position = seq_pos - aa = sequence[seq_pos - 1] if 0 < seq_pos <= len(sequence) else None + if seq_pos == 0: + # N-term + position = 0 + aa = None + else: + # 1-based position + position = seq_pos + aa = last_aa name = mod_str accession = None @@ -246,14 +263,17 @@ def _from_proforma_impl( elif peptidoform[i] == "-": i += 1 else: + last_aa = peptidoform[i] seq_pos += 1 i += 1 - return list(mods.values()) if mods else None + mods = list(mods.values()) if mods else None + + return peptidoform, mods @lru_cache(maxsize=8192) -def _from_proforma_cached(peptidoform: str, sequence: str) -> Optional[list[dict]]: +def _from_proforma_cached(peptidoform: str, sequence: str) -> Tuple[str, Optional[list[dict]]]: """Cached fast path for from_proforma when no meta or site_scores.""" return _from_proforma_impl(peptidoform, sequence) @@ -263,7 +283,7 @@ def from_proforma( sequence: str, meta: Optional[dict] = None, site_scores: Optional[dict[int, list[dict]]] = None, -) -> Optional[list[dict]]: +) -> Tuple[str, Optional[list[dict]]]: """Parse modifications from a ProForma-style peptidoform string. Handles: ``M[UNIMOD:35]PEPTIDEK``, ``M[+15.9949]PEPTIDEK``, @@ -287,8 +307,10 @@ def from_proforma( Used for phospho site localization probabilities. Returns: - List of modification dicts (``{name, accession, positions}``) per QPX - schema, or ``None`` if no modifications. + Tuple of (peptidoform, modifications) where peptidoform is the + normalised ProForma string and modifications is a list of dicts + (``{name, accession, positions}``) per QPX schema, or ``None`` + if no modifications. """ if meta is None and site_scores is None: return _from_proforma_cached(peptidoform, sequence) diff --git a/qpx/converters/quantms/feature_adapter.py b/qpx/converters/quantms/feature_adapter.py index 800e8acd..3bdabcd9 100644 --- a/qpx/converters/quantms/feature_adapter.py +++ b/qpx/converters/quantms/feature_adapter.py @@ -195,8 +195,13 @@ def _convert_lfq_fast( sql = sql_build( """ SELECT - m.$pf_col AS peptidoform, - regexp_replace(upper(CAST(m.$pf_col AS VARCHAR)), '[^A-Z]', '', 'g') AS sequence, + pf.peptidoform AS peptidoform, + regexp_replace( + regexp_replace(upper(CAST(pf.peptidoform AS VARCHAR)), '\\[.*?\\]', '', 'g'), + '[^A-Z]', + '', + 'g' + ) AS sequence, split_part(CAST(m.$ref_col AS VARCHAR), '.', 1) AS run_file_name, COALESCE(TRY_CAST(m.$chg_col AS INTEGER), 0) AS charge, COALESCE(TRY_CAST(m.$int_col AS DOUBLE), 0.0) AS intensity, @@ -226,7 +231,7 @@ def _convert_lfq_fast( LEFT JOIN _protein_genes pg ON split_part(CAST(m.$prot_col AS VARCHAR), ';', 1) = pg.accession LEFT JOIN _proforma_lookup pf - ON CAST(m.$pf_col AS VARCHAR) = pf.peptidoform + ON CAST(m.$pf_col AS VARCHAR) = pf.raw_peptidoform """, pf_col=q_pf, ref_col=q_ref, @@ -480,22 +485,35 @@ def _load_proforma_lookup(self, pf_col: str) -> None: continue sequence = re.sub(r"[^A-Z]", "", peptidoform.upper()) if peptidoform != sequence: - mods = from_proforma(peptidoform, sequence, meta=mods_meta) + peptidoform_profoma, mods = from_proforma( + peptidoform, + sequence, + meta=mods_meta, + ) mods_json = json.dumps(mods) if mods else None else: mods_json = None - records.append((peptidoform, mods_json)) + peptidoform_profoma = peptidoform + records.append((peptidoform, peptidoform_profoma, mods_json)) # Load into DuckDB if records: import pandas as _pd - df = _pd.DataFrame(records, columns=["peptidoform", "modifications_json"]) + df = _pd.DataFrame( + records, + columns=["raw_peptidoform", "peptidoform", "modifications_json"], + ) self._conn.execute("DROP TABLE IF EXISTS _proforma_lookup") self._conn.from_df(df).create("_proforma_lookup") else: - self._conn.execute("CREATE OR REPLACE TABLE _proforma_lookup (peptidoform VARCHAR, modifications_json VARCHAR)") - + self._conn.execute(""" + CREATE OR REPLACE TABLE _proforma_lookup ( + raw_peptidoform VARCHAR, + peptidoform VARCHAR, + modifications_json VARCHAR + ) + """) self.logger.info("ProForma lookup table: %d entries", len(records)) def _rows_to_feature_records(self, rows: list[tuple]) -> list[dict]: @@ -978,10 +996,14 @@ def _transform_batch_lfq( if peptidoform and peptidoform != sequence: _cache_key = (peptidoform, sequence) if _cache_key in _proforma_cache: - modifications = _proforma_cache[_cache_key] + peptidoform, modifications = _proforma_cache[_cache_key] else: - modifications = _from_proforma(peptidoform, sequence, meta=mods_meta) - _proforma_cache[_cache_key] = modifications + peptidoform, modifications = _from_proforma( + peptidoform, + sequence, + meta=mods_meta, + ) + _proforma_cache[_cache_key] = (peptidoform, modifications) else: modifications = None @@ -1108,10 +1130,14 @@ def _transform_batch_isobaric( if peptidoform and peptidoform != sequence: _cache_key = (peptidoform, sequence) if _cache_key in _proforma_cache: - modifications = _proforma_cache[_cache_key] + peptidoform, modifications = _proforma_cache[_cache_key] else: - modifications = _from_proforma(peptidoform, sequence, meta=mods_meta) - _proforma_cache[_cache_key] = modifications + peptidoform, modifications = _from_proforma( + peptidoform, + sequence, + meta=mods_meta, + ) + _proforma_cache[_cache_key] = (peptidoform, modifications) else: modifications = None diff --git a/qpx/converters/quantms/psm_adapter.py b/qpx/converters/quantms/psm_adapter.py index 5b24e40a..d701ee12 100644 --- a/qpx/converters/quantms/psm_adapter.py +++ b/qpx/converters/quantms/psm_adapter.py @@ -370,7 +370,7 @@ def _is_valid(val): ) # --- Modifications (structured) --- - modifications = from_proforma( + peptidoform, modifications = from_proforma( peptidoform, sequence, meta=modifications_meta, diff --git a/qpx/core/data/schemas/psm.yaml b/qpx/core/data/schemas/psm.yaml index f1609279..42ececf3 100644 --- a/qpx/core/data/schemas/psm.yaml +++ b/qpx/core/data/schemas/psm.yaml @@ -87,7 +87,7 @@ fields: doc: "Fragment charge values" ion_type_array: type: "list" - doc: "Fragment ion type annotations (b, y, a, etc.)" + doc: "Fragment ion type annotations (e.g., b1, y2, a2)" ion_mobility_array: type: "list" doc: "Fragment ion mobility values" diff --git a/tests/converters/test_converters.py b/tests/converters/test_converters.py index 7528a559..a9cc62de 100644 --- a/tests/converters/test_converters.py +++ b/tests/converters/test_converters.py @@ -350,7 +350,7 @@ def test_parse_modifications_from_peptidoform_unimod(self): "UNIMOD:35": ("Oxidation", ["M"], ["Anywhere"]), "UNIMOD:4": ("Carbamidomethyl", ["C"], ["Anywhere"]), } - result = from_proforma( + _, result = from_proforma( peptidoform="M[UNIMOD:35]PEPTIDEC[UNIMOD:4]K", sequence="MPEPTIDECK", meta=mods_meta, @@ -365,14 +365,14 @@ def test_parse_modifications_from_peptidoform_unimod(self): def test_parse_modifications_no_mods(self): from qpx.converters.ptm import from_proforma - result = from_proforma("PEPTIDEK", "PEPTIDEK", meta=None) + _, result = from_proforma("PEPTIDEK", "PEPTIDEK", meta=None) assert result is None def test_parse_modifications_mass_shift(self): from qpx.converters.ptm import from_proforma mods_meta = {"UNIMOD:35": ("Oxidation", ["M"], ["Anywhere"])} - result = from_proforma( + _, result = from_proforma( peptidoform="M[+15.9949]PEPTIDEK", sequence="MPEPTIDEK", meta=mods_meta, @@ -386,7 +386,7 @@ def test_parse_modifications_nterm(self): from qpx.converters.ptm import from_proforma mods_meta = {"UNIMOD:1": ("Acetyl", ["X"], ["N-term"])} - result = from_proforma( + _, result = from_proforma( peptidoform="[UNIMOD:1]-PEPTIDEK", sequence="PEPTIDEK", meta=mods_meta, diff --git a/tests/converters/test_ptm.py b/tests/converters/test_ptm.py index 0a8624e4..b10708e2 100644 --- a/tests/converters/test_ptm.py +++ b/tests/converters/test_ptm.py @@ -41,18 +41,20 @@ def test_empty_sequence(self): class TestFromProforma: def test_unimod_tag(self): meta = {"UNIMOD:35": ("Oxidation", ["M"], ["Anywhere"])} - result = from_proforma("M[UNIMOD:35]PEPTIDEK", "MPEPTIDEK", meta=meta) + _, result = from_proforma("M[UNIMOD:35]PEPTIDEK", "MPEPTIDEK", meta=meta) assert result is not None assert len(result) == 1 assert result[0]["name"] == "Oxidation" assert result[0]["accession"] == "UNIMOD:35" def test_no_mods(self): - assert from_proforma("PEPTIDEK", "PEPTIDEK", meta=None) is None + peptidoform, result = from_proforma("PEPTIDEK", "PEPTIDEK", meta=None) + assert result is None + assert peptidoform == "PEPTIDEK" def test_nterm(self): meta = {"UNIMOD:1": ("Acetyl", ["X"], ["N-term"])} - result = from_proforma("[UNIMOD:1]-PEPTIDEK", "PEPTIDEK", meta=meta) + _, result = from_proforma("[UNIMOD:1]-PEPTIDEK", "PEPTIDEK", meta=meta) assert result is not None assert result[0]["positions"][0]["position"] == 0 @@ -67,7 +69,7 @@ def test_site_scores_attached(self): } ], } - result = from_proforma( + _, result = from_proforma( "M[UNIMOD:35]PEPTIDEK", "MPEPTIDEK", meta=None, @@ -81,7 +83,7 @@ def test_site_scores_attached(self): def test_site_scores_none_when_not_provided(self): """Without site_scores, positions still have scores=None.""" - result = from_proforma("M[UNIMOD:35]PEPTIDEK", "MPEPTIDEK", meta=None) + _, result = from_proforma("M[UNIMOD:35]PEPTIDEK", "MPEPTIDEK", meta=None) assert result is not None assert result[0]["positions"][0]["scores"] is None @@ -97,7 +99,7 @@ def test_site_scores_partial_positions(self): ], } # Two mods: position 1 and position 5; only position 5 has scores - result = from_proforma( + _, result = from_proforma( "M[UNIMOD:35]PEPTS[UNIMOD:21]IDEK", "MPEPTSIDEK", meta=None, @@ -133,13 +135,13 @@ def test_nested_parens(self): raise AssertionError(f"Unexpected: {result!r}") def test_nterm_mod(self): - result = _normalize_peptidoform("(Acetyl)PEPTIDEK") - if result != "[Acetyl]PEPTIDEK": + result = _normalize_peptidoform(".(Acetyl)PEPTIDEK") + if result != "[Acetyl]-PEPTIDEK": raise AssertionError(f"Unexpected: {result!r}") def test_multiple_mods(self): - result = _normalize_peptidoform("(Acetyl)M(Oxidation)PEPTC(Carbamidomethyl)K") - if result != "[Acetyl]M[Oxidation]PEPTC[Carbamidomethyl]K": + result = _normalize_peptidoform(".(Acetyl)M(Oxidation)PEPTC(Carbamidomethyl)K") + if result != "[Acetyl]-M[Oxidation]PEPTC[Carbamidomethyl]K": raise AssertionError(f"Unexpected: {result!r}") def test_unmatched_open_paren(self): @@ -158,7 +160,7 @@ class TestFromProformaMzTab: def test_mztab_simple(self): meta = {"UNIMOD:4": ("Carbamidomethyl", ["C"], ["Anywhere"])} - result = from_proforma("C(UNIMOD:4)PEPTIDEK", "CPEPTIDEK", meta=meta) + _, result = from_proforma("C(UNIMOD:4)PEPTIDEK", "CPEPTIDEK", meta=meta) if result is None: raise AssertionError("Expected mods, got None") if result[0]["accession"] != "UNIMOD:4": @@ -166,7 +168,7 @@ def test_mztab_simple(self): def test_mztab_nterm(self): meta = {"UNIMOD:1": ("Acetyl", ["X"], ["N-term"])} - result = from_proforma("(UNIMOD:1)-PEPTIDEK", "PEPTIDEK", meta=meta) + _, result = from_proforma("(UNIMOD:1)-PEPTIDEK", "PEPTIDEK", meta=meta) if result is None: raise AssertionError("Expected mods, got None") if result[0]["positions"][0]["position"] != 0: @@ -174,7 +176,7 @@ def test_mztab_nterm(self): def test_mztab_nested_parens(self): # Normalization preserves inner parens as-is - result = from_proforma("C(Carbamidomethyl (C))PEPTIDEK", "CPEPTIDEK", meta=None) + _, result = from_proforma("C(Carbamidomethyl (C))PEPTIDEK", "CPEPTIDEK", meta=None) if result is None: raise AssertionError("Expected mods, got None") mod_name = result[0]["name"]