Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file modified docs/include/example.feature.parquet
Binary file not shown.
Binary file modified docs/include/example.psm.parquet
Binary file not shown.
22 changes: 11 additions & 11 deletions docs/spec/psm.md
Original file line number Diff line number Diff line change
Expand Up @@ -35,16 +35,16 @@ The PSM (Peptide Spectrum Match) view captures spectrum-level identification res

These fields are optional and may not exist in the file at all. They are included based on conversion settings or user preference.

| Field | Description | Type | Required |
|-------|-------------|------|----------|
| Field | Description | Type | Required |
|-------|---------------------------------------------------------------------------------------------------------------------------------------------------------|------|----------|
| `protein_accessions` | Protein accessions of all proteins that the peptide maps to. Optional because protein mapping can be recovered from the feature and protein group views | array[string], null | no |
| `cross_links` | Cross-link information for XL-MS experiments. Each entry describes one cross-link site. `null` for non-cross-linked PSMs | array[struct], null | no |
| `ion_mobility` | Ion mobility value for the precursor ion | float32, null | no |
| `mz_array` | Array of m/z values for the spectrum | array[float32], null | no |
| `intensity_array` | Array of intensity values for the spectrum | array[float32], null | no |
| `charge_array` | Array of fragment ion charge values | array[int32], null | no |
| `ion_type_array` | Array of fragment ion type annotations (e.g., b, y, a) | array[string], null | no |
| `ion_mobility_array` | Array of fragment ion mobility values | array[float32], null | no |
| `cross_links` | Cross-link information for XL-MS experiments. Each entry describes one cross-link site. `null` for non-cross-linked PSMs | array[struct], null | no |
| `ion_mobility` | Ion mobility value for the precursor ion | float32, null | no |
| `mz_array` | Array of m/z values for the spectrum | array[float32], null | no |
| `intensity_array` | Array of intensity values for the spectrum | array[float32], null | no |
| `charge_array` | Array of fragment ion charge values | array[int32], null | no |
| `ion_type_array` | Array of fragment ion type annotations (e.g., b1, y2, a2) | array[string], null | no |
| `ion_mobility_array` | Array of fragment ion mobility values | array[float32], null | no |

!!! note "Nullable vs Optional"
Core fields marked as "not required" are **nullable** -- the column always exists in the file but individual values may be null. Optional fields (protein accessions, spectral data) may be **absent from the file entirely**, depending on conversion settings. Protein mappings can be recovered by joining with the feature and protein group views.
Expand Down Expand Up @@ -101,7 +101,7 @@ Several fields in the PSM view use structures shared across other QPX views:
```json
{
"sequence": "AAAAAAAAAAGAAGGR",
"peptidoform": "_(Acetyl (Protein N-term))AAAAAAAAAAGAAGGR_",
"peptidoform": "[Acetyl]-AAAAAAAAAAGAAGGR",
"charge": 2,
"scan": [42164],
"rt": 5140.98,
Expand Down Expand Up @@ -156,7 +156,7 @@ When spectral arrays are included, the record also contains peak-level data:
```json
{
"sequence": "AAAAAAAAAAGAAGGR",
"peptidoform": "_(Acetyl (Protein N-term))AAAAAAAAAAGAAGGR_",
"peptidoform": "[Acetyl]-AAAAAAAAAAGAAGGR",
"charge": 2,
"scan": [42164],
"rt": 5140.98,
Expand Down
5 changes: 3 additions & 2 deletions qpx/converters/diann/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ def to_proforma(modified_sequence: str) -> str:
return build_proforma(plain_seq, mods)


def to_modifications(modified_sequence: str, sequence: str) -> list[dict] | None:
def to_modifications(modified_sequence: str, sequence: str) -> tuple[str, list[dict] | None]:
"""Parse modifications from a DIA-NN Modified.Sequence string.

Converts to ProForma first, then delegates to the shared ``from_proforma``
Expand All @@ -64,7 +64,8 @@ def to_modifications(modified_sequence: str, sequence: str) -> list[dict] | None
sequence: Stripped peptide sequence (no modification annotations).

Returns:
List of modification dicts per QPX schema, or ``None`` if unmodified.
Tuple of (peptidoform, modifications) where modifications is a list
of modification dicts per QPX schema, or ``None`` if unmodified.
"""
proforma = to_proforma(modified_sequence)
return from_proforma(proforma, sequence, meta=None)
2 changes: 1 addition & 1 deletion qpx/converters/diann/feature_adapter.py
Original file line number Diff line number Diff line change
Expand Up @@ -231,7 +231,7 @@ def _register_precursor_lookup(self, enzyme_name: str | None) -> dict:
charge = int(charge)

peptidoform = to_proforma(modified_seq)
modifications = to_modifications(modified_seq, sequence)
_, modifications = to_modifications(modified_seq, sequence)

cache_key = (modified_seq, charge)
if cache_key not in self._mz_cache:
Expand Down
5 changes: 3 additions & 2 deletions qpx/converters/fragpipe/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,15 +70,16 @@ def to_proforma(assigned_mods: str, sequence: str) -> str:
return build_proforma(sequence, mods)


def to_modifications(assigned_mods: str, sequence: str) -> list[dict] | None:
def to_modifications(assigned_mods: str, sequence: str) -> tuple[str, list[dict] | None]:
"""Parse modifications from FragPipe Assigned Modifications format.

Args:
assigned_mods: Assigned Modifications string, e.g. ``5M(15.9949)``.
sequence: Stripped peptide sequence.

Returns:
List of modification dicts per QPX schema, or ``None`` if unmodified.
Tuple of (peptidoform, modifications) where modifications is a list
of modification dicts per QPX schema, or ``None`` if unmodified.
"""
proforma = to_proforma(assigned_mods, sequence)
return from_proforma(proforma, sequence, meta=None)
2 changes: 1 addition & 1 deletion qpx/converters/fragpipe/feature_adapter.py
Original file line number Diff line number Diff line change
Expand Up @@ -384,7 +384,7 @@ def _transform_row(
# Modifications (reuse assigned_mods_str already extracted for peptidoform)
modifications = None
if assigned_mods_str:
modifications = to_modifications(assigned_mods_str, sequence)
_, modifications = to_modifications(assigned_mods_str, sequence)

# M/Z (from feature file — used as fallback)
mz = safe_float(row.get(r.get("observed_mz", "M/Z"))) or 0.0
Expand Down
2 changes: 1 addition & 1 deletion qpx/converters/fragpipe/psm_adapter.py
Original file line number Diff line number Diff line change
Expand Up @@ -244,7 +244,7 @@ def _transform_row(self, row) -> Optional[dict]:
modifications = None
assigned_mods = row.get("Assigned Modifications")
if pd.notna(assigned_mods) and assigned_mods:
modifications = to_modifications(str(assigned_mods), sequence)
_, modifications = to_modifications(str(assigned_mods), sequence)

return {
"sequence": sequence,
Expand Down
2 changes: 1 addition & 1 deletion qpx/converters/maxquant/feature_adapter.py
Original file line number Diff line number Diff line change
Expand Up @@ -286,7 +286,7 @@ def _transform_row(
peptidoform = to_proforma(
str(row.get(r.get("modified_sequence", "Modified sequence"), "")),
)
modifications = from_proforma(peptidoform, sequence) if peptidoform else None
_, modifications = from_proforma(peptidoform, sequence) if peptidoform else (None, None)
Comment on lines 286 to +289
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

Inconsistent peptidoform handling vs. maxquant/psm_adapter.py.

Here the normalized peptidoform returned by from_proforma is discarded (_, modifications = ...), so the emitted peptidoform field keeps the raw to_proforma(...) output. In the sibling PSM adapter (and in quantms/psm_adapter.py) the pattern peptidoform, modifications = from_proforma(...) is used, meaning features and PSMs for the same peptide may now carry slightly different peptidoform strings. Consider aligning the feature adapter:

-        _, modifications = from_proforma(peptidoform, sequence) if peptidoform else (None, None)
+        peptidoform, modifications = (
+            from_proforma(peptidoform, sequence) if peptidoform else (peptidoform, None)
+        )

Note the fallback preserves the empty/original peptidoform instead of nulling it, since downstream fields (Line 382) assume it is a string.

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@qpx/converters/maxquant/feature_adapter.py` around lines 286 - 289, The
current code calls peptidoform = to_proforma(...) then discards the normalized
peptidoform returned by from_proforma by doing "_, modifications =
from_proforma(...)" which leaves the emitted peptidoform unnormalized and
diverges from the PSM adapter; change the assignment to capture both the
normalized peptidoform and modifications (peptidoform, modifications =
from_proforma(peptidoform, sequence)); when peptidoform is empty and
from_proforma is therefore not called, fall back to (peptidoform, None) so the
original to_proforma value is kept and peptidoform stays a string; update the
usage sites that expect a string (e.g., later logic that assumes peptidoform is
a string) accordingly.

charge = int(row.get(r.get("charge", "Charge"), 0))
run_file_name = str(row.get(r.get("run_file_name", "Raw file"), ""))

Expand Down
2 changes: 1 addition & 1 deletion qpx/converters/maxquant/psm_adapter.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,7 +162,7 @@ def _transform_row(self, row, spectral_data: bool) -> Optional[dict]:
if pd.notna(phospho_raw) and phospho_raw:
site_scores = parse_phospho_probabilities(str(phospho_raw))

modifications = (
peptidoform, modifications = (
from_proforma(
peptidoform,
sequence,
Expand Down
46 changes: 34 additions & 12 deletions qpx/converters/ptm.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@

import re
from functools import lru_cache
from typing import Optional
from typing import Optional, Tuple

# ---------------------------------------------------------------------------
# UNIMOD mass registry
Expand Down Expand Up @@ -155,6 +155,7 @@ def _normalize_peptidoform(peptidoform: str) -> str:
if "(" not in peptidoform:
return peptidoform
out: list[str] = []
peptidoform = peptidoform.removeprefix(".")
n = len(peptidoform)
i = 0
while i < n:
Expand All @@ -179,39 +180,55 @@ def _normalize_peptidoform(peptidoform: str) -> str:
else:
out.append(peptidoform[i])
i += 1
return "".join(out)

result = "".join(out)

# For N-term
if result.startswith("["):
idx = result.find("]")
if idx != -1 and idx + 1 < len(result) and result[idx + 1] != "-":
result = result[: idx + 1] + "-" + result[idx + 1 :]

return result


def _from_proforma_impl(
peptidoform: str,
sequence: str,
meta: Optional[dict] = None,
site_scores: Optional[dict[int, list[dict]]] = None,
) -> Optional[list[dict]]:
) -> Tuple[str, Optional[list[dict]]]:
"""Core implementation of ProForma modification parsing.

See :func:`from_proforma` for full documentation.
"""
# Normalise mzTab parenthetical notation to ProForma brackets
peptidoform = _normalize_peptidoform(peptidoform)
if not peptidoform or peptidoform == sequence:
return None
return peptidoform, None

mods: dict[str, dict] = {}
seq_pos = 0
n = len(peptidoform)
last_aa = None

i = 0
while i < n:
if peptidoform[i] == "[":
try:
end = peptidoform.index("]", i)
except ValueError:
return None # Malformed ProForma
return peptidoform, None # Malformed ProForma
mod_str = peptidoform[i + 1 : end]

position = seq_pos
aa = sequence[seq_pos - 1] if 0 < seq_pos <= len(sequence) else None
if seq_pos == 0:
# N-term
position = 0
aa = None
else:
# 1-based position
position = seq_pos
aa = last_aa

name = mod_str
accession = None
Expand Down Expand Up @@ -246,14 +263,17 @@ def _from_proforma_impl(
elif peptidoform[i] == "-":
i += 1
else:
last_aa = peptidoform[i]
seq_pos += 1
i += 1

return list(mods.values()) if mods else None
mods = list(mods.values()) if mods else None

return peptidoform, mods


@lru_cache(maxsize=8192)
def _from_proforma_cached(peptidoform: str, sequence: str) -> Optional[list[dict]]:
def _from_proforma_cached(peptidoform: str, sequence: str) -> Tuple[str, Optional[list[dict]]]:
"""Cached fast path for from_proforma when no meta or site_scores."""
return _from_proforma_impl(peptidoform, sequence)

Expand All @@ -263,7 +283,7 @@ def from_proforma(
sequence: str,
meta: Optional[dict] = None,
site_scores: Optional[dict[int, list[dict]]] = None,
) -> Optional[list[dict]]:
) -> Tuple[str, Optional[list[dict]]]:
"""Parse modifications from a ProForma-style peptidoform string.

Handles: ``M[UNIMOD:35]PEPTIDEK``, ``M[+15.9949]PEPTIDEK``,
Expand All @@ -287,8 +307,10 @@ def from_proforma(
Used for phospho site localization probabilities.

Returns:
List of modification dicts (``{name, accession, positions}``) per QPX
schema, or ``None`` if no modifications.
Tuple of (peptidoform, modifications) where peptidoform is the
normalised ProForma string and modifications is a list of dicts
(``{name, accession, positions}``) per QPX schema, or ``None``
if no modifications.
"""
if meta is None and site_scores is None:
return _from_proforma_cached(peptidoform, sequence)
Expand Down
54 changes: 40 additions & 14 deletions qpx/converters/quantms/feature_adapter.py
Original file line number Diff line number Diff line change
Expand Up @@ -195,8 +195,13 @@ def _convert_lfq_fast(
sql = sql_build(
"""
SELECT
m.$pf_col AS peptidoform,
regexp_replace(upper(CAST(m.$pf_col AS VARCHAR)), '[^A-Z]', '', 'g') AS sequence,
pf.peptidoform AS peptidoform,
regexp_replace(
regexp_replace(upper(CAST(pf.peptidoform AS VARCHAR)), '\\[.*?\\]', '', 'g'),
'[^A-Z]',
'',
'g'
) AS sequence,
split_part(CAST(m.$ref_col AS VARCHAR), '.', 1) AS run_file_name,
COALESCE(TRY_CAST(m.$chg_col AS INTEGER), 0) AS charge,
COALESCE(TRY_CAST(m.$int_col AS DOUBLE), 0.0) AS intensity,
Expand Down Expand Up @@ -226,7 +231,7 @@ def _convert_lfq_fast(
LEFT JOIN _protein_genes pg
ON split_part(CAST(m.$prot_col AS VARCHAR), ';', 1) = pg.accession
LEFT JOIN _proforma_lookup pf
ON CAST(m.$pf_col AS VARCHAR) = pf.peptidoform
ON CAST(m.$pf_col AS VARCHAR) = pf.raw_peptidoform
""",
pf_col=q_pf,
ref_col=q_ref,
Expand Down Expand Up @@ -480,22 +485,35 @@ def _load_proforma_lookup(self, pf_col: str) -> None:
continue
sequence = re.sub(r"[^A-Z]", "", peptidoform.upper())
if peptidoform != sequence:
mods = from_proforma(peptidoform, sequence, meta=mods_meta)
peptidoform_profoma, mods = from_proforma(
peptidoform,
sequence,
meta=mods_meta,
)
mods_json = json.dumps(mods) if mods else None
else:
mods_json = None
records.append((peptidoform, mods_json))
peptidoform_profoma = peptidoform
records.append((peptidoform, peptidoform_profoma, mods_json))

# Load into DuckDB
if records:
import pandas as _pd

df = _pd.DataFrame(records, columns=["peptidoform", "modifications_json"])
df = _pd.DataFrame(
records,
columns=["raw_peptidoform", "peptidoform", "modifications_json"],
)
self._conn.execute("DROP TABLE IF EXISTS _proforma_lookup")
self._conn.from_df(df).create("_proforma_lookup")
else:
self._conn.execute("CREATE OR REPLACE TABLE _proforma_lookup (peptidoform VARCHAR, modifications_json VARCHAR)")

self._conn.execute("""
CREATE OR REPLACE TABLE _proforma_lookup (
raw_peptidoform VARCHAR,
peptidoform VARCHAR,
modifications_json VARCHAR
)
""")
self.logger.info("ProForma lookup table: %d entries", len(records))

def _rows_to_feature_records(self, rows: list[tuple]) -> list[dict]:
Expand Down Expand Up @@ -978,10 +996,14 @@ def _transform_batch_lfq(
if peptidoform and peptidoform != sequence:
_cache_key = (peptidoform, sequence)
if _cache_key in _proforma_cache:
modifications = _proforma_cache[_cache_key]
peptidoform, modifications = _proforma_cache[_cache_key]
else:
modifications = _from_proforma(peptidoform, sequence, meta=mods_meta)
_proforma_cache[_cache_key] = modifications
peptidoform, modifications = _from_proforma(
peptidoform,
sequence,
meta=mods_meta,
)
_proforma_cache[_cache_key] = (peptidoform, modifications)
else:
modifications = None

Expand Down Expand Up @@ -1108,10 +1130,14 @@ def _transform_batch_isobaric(
if peptidoform and peptidoform != sequence:
_cache_key = (peptidoform, sequence)
if _cache_key in _proforma_cache:
modifications = _proforma_cache[_cache_key]
peptidoform, modifications = _proforma_cache[_cache_key]
else:
modifications = _from_proforma(peptidoform, sequence, meta=mods_meta)
_proforma_cache[_cache_key] = modifications
peptidoform, modifications = _from_proforma(
peptidoform,
sequence,
meta=mods_meta,
)
_proforma_cache[_cache_key] = (peptidoform, modifications)
else:
modifications = None

Expand Down
2 changes: 1 addition & 1 deletion qpx/converters/quantms/psm_adapter.py
Original file line number Diff line number Diff line change
Expand Up @@ -370,7 +370,7 @@ def _is_valid(val):
)

# --- Modifications (structured) ---
modifications = from_proforma(
peptidoform, modifications = from_proforma(
peptidoform,
sequence,
meta=modifications_meta,
Expand Down
2 changes: 1 addition & 1 deletion qpx/core/data/schemas/psm.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ fields:
doc: "Fragment charge values"
ion_type_array:
type: "list<string>"
doc: "Fragment ion type annotations (b, y, a, etc.)"
doc: "Fragment ion type annotations (e.g., b1, y2, a2)"
ion_mobility_array:
type: "list<float32>"
doc: "Fragment ion mobility values"
Loading
Loading