diff --git a/notebooks/structure_prediction.ipynb b/notebooks/structure_prediction.ipynb
index ae68613..7fd067d 100644
--- a/notebooks/structure_prediction.ipynb
+++ b/notebooks/structure_prediction.ipynb
@@ -80,7 +80,7 @@
{
"data": {
"application/vnd.jupyter.widget-view+json": {
- "model_id": "7fd300b3f2364146bc2b1066cf3e3f06",
+ "model_id": "675085da034f4139976b9f860b5670e8",
"version_major": 2,
"version_minor": 0
},
@@ -255,31 +255,31 @@
"
0 | \n",
" x00011-1 | \n",
" NS(=O)(=O)C=1C=CC=2CCCC2C1 | \n",
- " <rdkit.Chem.rdchem.Mol object at 0x7c7374ab4740> | \n",
+ " <rdkit.Chem.rdchem.Mol object at 0x7b69c12ac430> | \n",
" \n",
" \n",
" | 1 | \n",
" x00035-1 | \n",
" NC1=NC=2C=CC=CC2S1 | \n",
- " <rdkit.Chem.rdchem.Mol object at 0x7c7374ab47b0> | \n",
+ " <rdkit.Chem.rdchem.Mol object at 0x7b69c12ac2e0> | \n",
"
\n",
" \n",
" | 2 | \n",
" x00046-1 | \n",
" NC1=NC=2C=C(Cl)C=CC2O1 | \n",
- " <rdkit.Chem.rdchem.Mol object at 0x7c7374ab4820> | \n",
+ " <rdkit.Chem.rdchem.Mol object at 0x7b69c12ac740> | \n",
"
\n",
" \n",
" | 3 | \n",
" x00052-1 | \n",
" NC(=O)C=1C=CC=C(Cl)C1 | \n",
- " <rdkit.Chem.rdchem.Mol object at 0x7c7374ab4890> | \n",
+ " <rdkit.Chem.rdchem.Mol object at 0x7b69c12ac7b0> | \n",
"
\n",
" \n",
" | 4 | \n",
" x00086-1 | \n",
" ClC=1C=CC=2OC(=O)NC2C1 | \n",
- " <rdkit.Chem.rdchem.Mol object at 0x7c7374ab4900> | \n",
+ " <rdkit.Chem.rdchem.Mol object at 0x7b69c12ac820> | \n",
"
\n",
" \n",
"\n",
@@ -294,11 +294,11 @@
"4 x00086-1 ClC=1C=CC=2OC(=O)NC2C1 \n",
"\n",
" mol \n",
- "0 \n",
- "1 \n",
- "2 \n",
- "3 \n",
- "4 "
+ "0 \n",
+ "1 \n",
+ "2 \n",
+ "3 \n",
+ "4 "
]
},
"execution_count": 5,
@@ -683,7 +683,7 @@
{
"data": {
"application/vnd.jupyter.widget-view+json": {
- "model_id": "71197becbdd541ef86a336453c28aac4",
+ "model_id": "067c76e2aa9842599d3e2d8b484e510b",
"version_major": 2,
"version_minor": 0
},
@@ -870,9 +870,12 @@
"\n",
"print(f\"Created zip archive: {zip_path} ({len(pdb_files)} files)\")\n",
"\n",
+ "expected_smiles_dict = dict(zip(structure_df[\"id\"], structure_df[\"SMILES\"]))\n",
+ "\n",
"is_valid, validation_errors = validate_structure_submission(\n",
" zip_path,\n",
" expected_ids=set(structure_df[\"id\"]),\n",
+ " expected_ligand_smiles=expected_smiles_dict,\n",
" require_lig_resname=True,\n",
")\n",
"\n",
diff --git a/validation/structure_validation.py b/validation/structure_validation.py
index bdbf9cb..3fdd651 100644
--- a/validation/structure_validation.py
+++ b/validation/structure_validation.py
@@ -2,16 +2,26 @@
import zipfile
import tempfile
+from collections import Counter
import MDAnalysis as mda
from pathlib import Path
from typing import Union
+from rdkit import Chem
STRUCTURE_DATASET_SIZE = 78
+
+def _heavy_atom_counts_from_smiles(smiles: str) -> Counter | None:
+    """Count the non-hydrogen atoms of a SMILES string per element symbol.
+
+    Args:
+        smiles: SMILES string to parse with RDKit.
+
+    Returns:
+        A ``Counter`` mapping element symbol to atom count, or ``None`` when
+        RDKit cannot parse ``smiles``.
+    """
+    mol = Chem.MolFromSmiles(smiles)
+    if mol is None:
+        return None
+    # Hydrogens written explicitly in the SMILES (e.g. isotopic [2H]) may
+    # survive parsing as real atoms; skip them so the result only holds
+    # heavy atoms, as the function name promises.
+    return Counter(
+        atom.GetSymbol() for atom in mol.GetAtoms() if atom.GetAtomicNum() != 1
+    )
+
+
def validate_structure_submission(
structure_predictions_file: Union[str, Path],
expected_ids: set[str] | None = None,
- expected_ligand_smiles: set[str] | None = None,
+ expected_ligand_smiles: dict[str, str] | None = None,
require_lig_resname: bool = True,
) -> tuple[bool, list[str]]:
@@ -68,6 +78,28 @@ def validate_structure_submission(
if len(u.segments) > 2:
errors.append(f"{name}: Found {len(u.segments)} chains, expected 2 or fewer")
+ # 4. Check ligand matches expected SMILES by heavy-atom composition
+ if expected_ligand_smiles is not None:
+ pdb_id = Path(name).stem
+ expected_smi = expected_ligand_smiles.get(pdb_id)
+ if expected_smi is not None:
+ expected_counts = _heavy_atom_counts_from_smiles(expected_smi)
+ if expected_counts is None:
+ errors.append(
+ f"{name}: Could not parse expected SMILES for '{pdb_id}'"
+ )
+ else:
+ obs_counts = Counter(
+ elem for elem in ligands.elements
+ if elem.strip() not in ("H", "h", "")
+ )
+ if obs_counts != expected_counts:
+ errors.append(
+ f"{name}: Ligand heavy-atom composition mismatch. "
+ f"Expected {dict(sorted(expected_counts.items()))}, "
+ f"got {dict(sorted(obs_counts.items()))}"
+ )
+
except Exception as e:
errors.append(f"{name}: MDAnalysis failed to parse file: {e}")