bigbio · ypriverol · Apr 18, 2026 · Nov 29, 2023 · Mar 14, 2026 · Mar 16, 2026
diff --git a/docs/include/example.feature.parquet b/docs/include/example.feature.parquet
diff --git a/docs/include/example.psm.parquet b/docs/include/example.psm.parquet
diff --git a/docs/spec/psm.md b/docs/spec/psm.md
@@ -35,16 +35,16 @@ The PSM (Peptide Spectrum Match) view captures spectrum-level identification res
 
 These fields are optional and may not exist in the file at all. They are included based on conversion settings or user preference.
 
-| Field | Description | Type | Required |
-|-------|-------------|------|----------|
+| Field | Description                                                                                                                                             | Type | Required |
+|-------|---------------------------------------------------------------------------------------------------------------------------------------------------------|------|----------|
 | `protein_accessions` | Protein accessions of all proteins that the peptide maps to. Optional because protein mapping can be recovered from the feature and protein group views | array[string], null | no |
-| `cross_links` | Cross-link information for XL-MS experiments. Each entry describes one cross-link site. `null` for non-cross-linked PSMs | array[struct], null | no |
-| `ion_mobility` | Ion mobility value for the precursor ion | float32, null | no |
-| `mz_array` | Array of m/z values for the spectrum | array[float32], null | no |
-| `intensity_array` | Array of intensity values for the spectrum | array[float32], null | no |
-| `charge_array` | Array of fragment ion charge values | array[int32], null | no |
-| `ion_type_array` | Array of fragment ion type annotations (e.g., b, y, a) | array[string], null | no |
-| `ion_mobility_array` | Array of fragment ion mobility values | array[float32], null | no |
+| `cross_links` | Cross-link information for XL-MS experiments. Each entry describes one cross-link site. `null` for non-cross-linked PSMs                                | array[struct], null | no |
+| `ion_mobility` | Ion mobility value for the precursor ion                                                                                                                | float32, null | no |
+| `mz_array` | Array of m/z values for the spectrum                                                                                                                    | array[float32], null | no |
+| `intensity_array` | Array of intensity values for the spectrum                                                                                                              | array[float32], null | no |
+| `charge_array` | Array of fragment ion charge values                                                                                                                     | array[int32], null | no |
+| `ion_type_array` | Array of fragment ion type annotations (e.g., b1, y2, a2)                                                                                               | array[string], null | no |
+| `ion_mobility_array` | Array of fragment ion mobility values                                                                                                                   | array[float32], null | no |
 
 !!! note "Nullable vs Optional"
     Core fields marked as "not required" are **nullable** -- the column always exists in the file but individual values may be null. Optional fields (protein accessions, spectral data) may be **absent from the file entirely**, depending on conversion settings. Protein mappings can be recovered by joining with the feature and protein group views.
@@ -101,7 +101,7 @@ Several fields in the PSM view use structures shared across other QPX views:
 ```json
 {
   "sequence": "AAAAAAAAAAGAAGGR",
-  "peptidoform": "_(Acetyl (Protein N-term))AAAAAAAAAAGAAGGR_",
+  "peptidoform": "[Acetyl]-AAAAAAAAAAGAAGGR",
   "charge": 2,
   "scan": [42164],
   "rt": 5140.98,
@@ -156,7 +156,7 @@ When spectral arrays are included, the record also contains peak-level data:
 ```json
 {
   "sequence": "AAAAAAAAAAGAAGGR",
-  "peptidoform": "_(Acetyl (Protein N-term))AAAAAAAAAAGAAGGR_",
+  "peptidoform": "[Acetyl]-AAAAAAAAAAGAAGGR",
   "charge": 2,
   "scan": [42164],
   "rt": 5140.98,

diff --git a/qpx/converters/diann/constants.py b/qpx/converters/diann/constants.py
@@ -53,7 +53,7 @@ def to_proforma(modified_sequence: str) -> str:
     return build_proforma(plain_seq, mods)
 
 
-def to_modifications(modified_sequence: str, sequence: str) -> list[dict] | None:
+def to_modifications(modified_sequence: str, sequence: str) -> tuple[str, list[dict] | None]:
     """Parse modifications from a DIA-NN Modified.Sequence string.
 
     Converts to ProForma first, then delegates to the shared ``from_proforma``
@@ -64,7 +64,8 @@ def to_modifications(modified_sequence: str, sequence: str) -> list[dict] | None
         sequence: Stripped peptide sequence (no modification annotations).
 
     Returns:
-        List of modification dicts per QPX schema, or ``None`` if unmodified.
+        Tuple ``(peptidoform, modifications)`` as returned by ``from_proforma``: the
+        ProForma string, and the list of QPX modification dicts (``None`` if unmodified).
     """
     proforma = to_proforma(modified_sequence)
     return from_proforma(proforma, sequence, meta=None)
diff --git a/qpx/converters/diann/feature_adapter.py b/qpx/converters/diann/feature_adapter.py
@@ -231,7 +231,7 @@ def _register_precursor_lookup(self, enzyme_name: str | None) -> dict:
             charge = int(charge)
 
             peptidoform = to_proforma(modified_seq)
-            modifications = to_modifications(modified_seq, sequence)
+            _, modifications = to_modifications(modified_seq, sequence)
 
             cache_key = (modified_seq, charge)
             if cache_key not in self._mz_cache:

diff --git a/qpx/converters/fragpipe/constants.py b/qpx/converters/fragpipe/constants.py
@@ -70,15 +70,16 @@ def to_proforma(assigned_mods: str, sequence: str) -> str:
     return build_proforma(sequence, mods)
 
 
-def to_modifications(assigned_mods: str, sequence: str) -> list[dict] | None:
+def to_modifications(assigned_mods: str, sequence: str) -> tuple[str, list[dict] | None]:
     """Parse modifications from FragPipe Assigned Modifications format.
 
     Args:
         assigned_mods: Assigned Modifications string, e.g. ``5M(15.9949)``.
         sequence: Stripped peptide sequence.
 
     Returns:
-        List of modification dicts per QPX schema, or ``None`` if unmodified.
+        Tuple ``(peptidoform, modifications)`` as returned by ``from_proforma``: the
+        ProForma string, and the list of QPX modification dicts (``None`` if unmodified).
     """
     proforma = to_proforma(assigned_mods, sequence)
     return from_proforma(proforma, sequence, meta=None)
diff --git a/qpx/converters/fragpipe/feature_adapter.py b/qpx/converters/fragpipe/feature_adapter.py
@@ -384,7 +384,7 @@ def _transform_row(
         # Modifications (reuse assigned_mods_str already extracted for peptidoform)
         modifications = None
         if assigned_mods_str:
-            modifications = to_modifications(assigned_mods_str, sequence)
+            _, modifications = to_modifications(assigned_mods_str, sequence)
 
         # M/Z (from feature file — used as fallback)
         mz = safe_float(row.get(r.get("observed_mz", "M/Z"))) or 0.0

diff --git a/qpx/converters/fragpipe/psm_adapter.py b/qpx/converters/fragpipe/psm_adapter.py
@@ -244,7 +244,7 @@ def _transform_row(self, row) -> Optional[dict]:
         modifications = None
         assigned_mods = row.get("Assigned Modifications")
         if pd.notna(assigned_mods) and assigned_mods:
-            modifications = to_modifications(str(assigned_mods), sequence)
+            _, modifications = to_modifications(str(assigned_mods), sequence)
 
         return {
             "sequence": sequence,

diff --git a/qpx/converters/maxquant/feature_adapter.py b/qpx/converters/maxquant/feature_adapter.py
@@ -286,7 +286,7 @@ def _transform_row(
         peptidoform = to_proforma(
             str(row.get(r.get("modified_sequence", "Modified sequence"), "")),
         )
-        modifications = from_proforma(peptidoform, sequence) if peptidoform else None
+        _, modifications = from_proforma(peptidoform, sequence) if peptidoform else (None, None)
         charge = int(row.get(r.get("charge", "Charge"), 0))
         run_file_name = str(row.get(r.get("run_file_name", "Raw file"), ""))
 

diff --git a/qpx/converters/maxquant/psm_adapter.py b/qpx/converters/maxquant/psm_adapter.py
@@ -162,7 +162,7 @@ def _transform_row(self, row, spectral_data: bool) -> Optional[dict]:
         if pd.notna(phospho_raw) and phospho_raw:
             site_scores = parse_phospho_probabilities(str(phospho_raw))
 
-        modifications = (
+        peptidoform, modifications = (
             from_proforma(
                 peptidoform,
                 sequence,

diff --git a/qpx/converters/ptm.py b/qpx/converters/ptm.py
@@ -13,7 +13,7 @@
 
 import re
 from functools import lru_cache
-from typing import Optional
+from typing import Optional, Tuple
 
 # ---------------------------------------------------------------------------
 # UNIMOD mass registry
@@ -155,6 +155,7 @@ def _normalize_peptidoform(peptidoform: str) -> str:
     if "(" not in peptidoform:
         return peptidoform
     out: list[str] = []
+    peptidoform = peptidoform.removeprefix(".")
     n = len(peptidoform)
     i = 0
     while i < n:
@@ -179,39 +180,55 @@ def _normalize_peptidoform(peptidoform: str) -> str:
         else:
             out.append(peptidoform[i])
             i += 1
-    return "".join(out)
+
+    result = "".join(out)
+
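+    # If the string opens with a bracketed (N-terminal) modification, make sure a "-" separates it from the sequence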
+    if result.startswith("["):
+        idx = result.find("]")
+        if idx != -1 and idx + 1 < len(result) and result[idx + 1] != "-":
+            result = result[: idx + 1] + "-" + result[idx + 1 :]
+
+    return result
 
 
 def _from_proforma_impl(
     peptidoform: str,
     sequence: str,
     meta: Optional[dict] = None,
     site_scores: Optional[dict[int, list[dict]]] = None,
-) -> Optional[list[dict]]:
+) -> Tuple[str, Optional[list[dict]]]:
     """Core implementation of ProForma modification parsing.
 
     See :func:`from_proforma` for full documentation.
     """
     # Normalise mzTab parenthetical notation to ProForma brackets
     peptidoform = _normalize_peptidoform(peptidoform)
     if not peptidoform or peptidoform == sequence:
-        return None
+        return peptidoform, None
 
     mods: dict[str, dict] = {}
     seq_pos = 0
     n = len(peptidoform)
+    last_aa = None
 
     i = 0
     while i < n:
         if peptidoform[i] == "[":
             try:
                 end = peptidoform.index("]", i)
             except ValueError:
-                return None  # Malformed ProForma
+                return peptidoform, None  # Malformed ProForma
             mod_str = peptidoform[i + 1 : end]
 
-            position = seq_pos
-            aa = sequence[seq_pos - 1] if 0 < seq_pos <= len(sequence) else None
+            if seq_pos == 0:
+                # No residue consumed yet: N-terminal modification, stored at position 0 with no amino acid
+                position = 0
+                aa = None
+            else:
+                # Modification follows a residue: use that residue's 1-based position and letter
+                position = seq_pos
+                aa = last_aa
 
             name = mod_str
             accession = None
@@ -246,14 +263,17 @@ def _from_proforma_impl(
         elif peptidoform[i] == "-":
             i += 1
         else:
+            last_aa = peptidoform[i]
             seq_pos += 1
             i += 1
 
-    return list(mods.values()) if mods else None
+    mods = list(mods.values()) if mods else None
+
+    return peptidoform, mods
 
 
 @lru_cache(maxsize=8192)
-def _from_proforma_cached(peptidoform: str, sequence: str) -> Optional[list[dict]]:
+def _from_proforma_cached(peptidoform: str, sequence: str) -> Tuple[str, Optional[list[dict]]]:
     """Cached fast path for from_proforma when no meta or site_scores."""
     return _from_proforma_impl(peptidoform, sequence)
 
@@ -263,7 +283,7 @@ def from_proforma(
     sequence: str,
     meta: Optional[dict] = None,
     site_scores: Optional[dict[int, list[dict]]] = None,
-) -> Optional[list[dict]]:
+) -> Tuple[str, Optional[list[dict]]]:
     """Parse modifications from a ProForma-style peptidoform string.
 
     Handles: ``M[UNIMOD:35]PEPTIDEK``, ``M[+15.9949]PEPTIDEK``,
@@ -287,8 +307,10 @@ def from_proforma(
             Used for phospho site localization probabilities.
 
     Returns:
-        List of modification dicts (``{name, accession, positions}``) per QPX
-        schema, or ``None`` if no modifications.
+        Tuple of (peptidoform, modifications) where peptidoform is the
+        normalised ProForma string and modifications is a list of dicts
+        (``{name, accession, positions}``) per QPX schema, or ``None``
+        if no modifications.
     """
     if meta is None and site_scores is None:
         return _from_proforma_cached(peptidoform, sequence)

diff --git a/qpx/converters/quantms/feature_adapter.py b/qpx/converters/quantms/feature_adapter.py
@@ -195,8 +195,13 @@ def _convert_lfq_fast(
         sql = sql_build(
             """
             SELECT
-                m.$pf_col AS peptidoform,
-                regexp_replace(upper(CAST(m.$pf_col AS VARCHAR)), '[^A-Z]', '', 'g') AS sequence,
+                pf.peptidoform AS peptidoform,
+                regexp_replace(
+                    regexp_replace(upper(CAST(pf.peptidoform AS VARCHAR)), '\\[.*?\\]', '', 'g'),
+                    '[^A-Z]',
+                    '',
+                    'g'
+                ) AS sequence,
                 split_part(CAST(m.$ref_col AS VARCHAR), '.', 1) AS run_file_name,
                 COALESCE(TRY_CAST(m.$chg_col AS INTEGER), 0) AS charge,
                 COALESCE(TRY_CAST(m.$int_col AS DOUBLE), 0.0) AS intensity,
@@ -226,7 +231,7 @@ def _convert_lfq_fast(
             LEFT JOIN _protein_genes pg
                 ON split_part(CAST(m.$prot_col AS VARCHAR), ';', 1) = pg.accession
             LEFT JOIN _proforma_lookup pf
-                ON CAST(m.$pf_col AS VARCHAR) = pf.peptidoform
+                ON CAST(m.$pf_col AS VARCHAR) = pf.raw_peptidoform
             """,
             pf_col=q_pf,
             ref_col=q_ref,
@@ -480,22 +485,35 @@ def _load_proforma_lookup(self, pf_col: str) -> None:
                 continue
             sequence = re.sub(r"[^A-Z]", "", peptidoform.upper())
             if peptidoform != sequence:
-                mods = from_proforma(peptidoform, sequence, meta=mods_meta)
+                peptidoform_profoma, mods = from_proforma(
+                    peptidoform,
+                    sequence,
+                    meta=mods_meta,
+                )
                 mods_json = json.dumps(mods) if mods else None
             else:
                 mods_json = None
-            records.append((peptidoform, mods_json))
+                peptidoform_profoma = peptidoform
+            records.append((peptidoform, peptidoform_profoma, mods_json))
 
         # Load into DuckDB
         if records:
             import pandas as _pd
 
-            df = _pd.DataFrame(records, columns=["peptidoform", "modifications_json"])
+            df = _pd.DataFrame(
+                records,
+                columns=["raw_peptidoform", "peptidoform", "modifications_json"],
+            )
             self._conn.execute("DROP TABLE IF EXISTS _proforma_lookup")
             self._conn.from_df(df).create("_proforma_lookup")
         else:
-            self._conn.execute("CREATE OR REPLACE TABLE _proforma_lookup (peptidoform VARCHAR, modifications_json VARCHAR)")
-
+            self._conn.execute("""
+            CREATE OR REPLACE TABLE _proforma_lookup (
+                raw_peptidoform VARCHAR,
+                peptidoform VARCHAR,
+                modifications_json VARCHAR
+            )
+            """)
         self.logger.info("ProForma lookup table: %d entries", len(records))
 
     def _rows_to_feature_records(self, rows: list[tuple]) -> list[dict]:
@@ -978,10 +996,14 @@ def _transform_batch_lfq(
                 if peptidoform and peptidoform != sequence:
                     _cache_key = (peptidoform, sequence)
                     if _cache_key in _proforma_cache:
-                        modifications = _proforma_cache[_cache_key]
+                        peptidoform, modifications = _proforma_cache[_cache_key]
                     else:
-                        modifications = _from_proforma(peptidoform, sequence, meta=mods_meta)
-                        _proforma_cache[_cache_key] = modifications
+                        peptidoform, modifications = _from_proforma(
+                            peptidoform,
+                            sequence,
+                            meta=mods_meta,
+                        )
+                        _proforma_cache[_cache_key] = (peptidoform, modifications)
                 else:
                     modifications = None
 
@@ -1108,10 +1130,14 @@ def _transform_batch_isobaric(
                 if peptidoform and peptidoform != sequence:
                     _cache_key = (peptidoform, sequence)
                     if _cache_key in _proforma_cache:
-                        modifications = _proforma_cache[_cache_key]
+                        peptidoform, modifications = _proforma_cache[_cache_key]
                     else:
-                        modifications = _from_proforma(peptidoform, sequence, meta=mods_meta)
-                        _proforma_cache[_cache_key] = modifications
+                        peptidoform, modifications = _from_proforma(
+                            peptidoform,
+                            sequence,
+                            meta=mods_meta,
+                        )
+                        _proforma_cache[_cache_key] = (peptidoform, modifications)
                 else:
                     modifications = None
 

diff --git a/qpx/converters/quantms/psm_adapter.py b/qpx/converters/quantms/psm_adapter.py
@@ -370,7 +370,7 @@ def _is_valid(val):
                 )
 
         # --- Modifications (structured) ---
-        modifications = from_proforma(
+        peptidoform, modifications = from_proforma(
             peptidoform,
             sequence,
             meta=modifications_meta,

diff --git a/qpx/core/data/schemas/psm.yaml b/qpx/core/data/schemas/psm.yaml
@@ -87,7 +87,7 @@ fields:
     doc: "Fragment charge values"
   ion_type_array:
     type: "list<string>"
-    doc: "Fragment ion type annotations (b, y, a, etc.)"
+    doc: "Fragment ion type annotations (e.g., b1, y2, a2)"
   ion_mobility_array:
     type: "list<float32>"
     doc: "Fragment ion mobility values"