From 4a6f3006b7ad41e80e80217f75e0837424fe0e29 Mon Sep 17 00:00:00 2001 From: hmacdope Date: Wed, 1 Apr 2026 13:12:21 +1100 Subject: [PATCH] add HAC validation --- notebooks/structure_prediction.ipynb | 27 ++++++++++++---------- validation/structure_validation.py | 34 +++++++++++++++++++++++++++- 2 files changed, 48 insertions(+), 13 deletions(-) diff --git a/notebooks/structure_prediction.ipynb b/notebooks/structure_prediction.ipynb index ae68613..7fd067d 100644 --- a/notebooks/structure_prediction.ipynb +++ b/notebooks/structure_prediction.ipynb @@ -80,7 +80,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "7fd300b3f2364146bc2b1066cf3e3f06", + "model_id": "675085da034f4139976b9f860b5670e8", "version_major": 2, "version_minor": 0 }, @@ -255,31 +255,31 @@ " 0\n", " x00011-1\n", " NS(=O)(=O)C=1C=CC=2CCCC2C1\n", - " <rdkit.Chem.rdchem.Mol object at 0x7c7374ab4740>\n", + " <rdkit.Chem.rdchem.Mol object at 0x7b69c12ac430>\n", " \n", " \n", " 1\n", " x00035-1\n", " NC1=NC=2C=CC=CC2S1\n", - " <rdkit.Chem.rdchem.Mol object at 0x7c7374ab47b0>\n", + " <rdkit.Chem.rdchem.Mol object at 0x7b69c12ac2e0>\n", " \n", " \n", " 2\n", " x00046-1\n", " NC1=NC=2C=C(Cl)C=CC2O1\n", - " <rdkit.Chem.rdchem.Mol object at 0x7c7374ab4820>\n", + " <rdkit.Chem.rdchem.Mol object at 0x7b69c12ac740>\n", " \n", " \n", " 3\n", " x00052-1\n", " NC(=O)C=1C=CC=C(Cl)C1\n", - " <rdkit.Chem.rdchem.Mol object at 0x7c7374ab4890>\n", + " <rdkit.Chem.rdchem.Mol object at 0x7b69c12ac7b0>\n", " \n", " \n", " 4\n", " x00086-1\n", " ClC=1C=CC=2OC(=O)NC2C1\n", - " <rdkit.Chem.rdchem.Mol object at 0x7c7374ab4900>\n", + " <rdkit.Chem.rdchem.Mol object at 0x7b69c12ac820>\n", " \n", " \n", "\n", @@ -294,11 +294,11 @@ "4 x00086-1 ClC=1C=CC=2OC(=O)NC2C1 \n", "\n", " mol \n", - "0 \n", - "1 \n", - "2 \n", - "3 \n", - "4 " + "0 \n", + "1 \n", + "2 \n", + "3 \n", + "4 " ] }, "execution_count": 5, @@ -683,7 +683,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "71197becbdd541ef86a336453c28aac4", + "model_id": "067c76e2aa9842599d3e2d8b484e510b", "version_major": 2, "version_minor": 0 }, @@ -870,9 +870,12 @@ "\n", "print(f\"Created zip archive: {zip_path} ({len(pdb_files)} files)\")\n", "\n", + "expected_smiles_dict = dict(zip(structure_df[\"id\"], structure_df[\"SMILES\"]))\n", + "\n", "is_valid, validation_errors = validate_structure_submission(\n", " zip_path,\n", " expected_ids=set(structure_df[\"id\"]),\n", + " expected_ligand_smiles=expected_smiles_dict,\n", " require_lig_resname=True,\n", ")\n", "\n", diff --git a/validation/structure_validation.py b/validation/structure_validation.py index bdbf9cb..3fdd651 100644 --- a/validation/structure_validation.py +++ b/validation/structure_validation.py @@ -2,16 +2,26 @@ import zipfile import tempfile +from collections import Counter import MDAnalysis as mda from pathlib import Path from typing import Union +from rdkit import Chem STRUCTURE_DATASET_SIZE = 78 + +def _heavy_atom_counts_from_smiles(smiles: str) -> Counter | None: + mol = Chem.MolFromSmiles(smiles) + if mol is None: + return None + return Counter(atom.GetSymbol() for atom in mol.GetAtoms()) + + def validate_structure_submission( structure_predictions_file: Union[str, Path], expected_ids: set[str] | None = None, - expected_ligand_smiles: set[str] | None = None, + expected_ligand_smiles: dict[str, str] | None = None, require_lig_resname: bool = True, ) -> tuple[bool, list[str]]: @@ -68,6 +78,28 @@ def validate_structure_submission( if len(u.segments) > 2: errors.append(f"{name}: Found {len(u.segments)} chains, expected 2 or fewer") + # 4. Check ligand matches expected SMILES by heavy-atom composition + if expected_ligand_smiles is not None: + pdb_id = Path(name).stem + expected_smi = expected_ligand_smiles.get(pdb_id) + if expected_smi is not None: + expected_counts = _heavy_atom_counts_from_smiles(expected_smi) + if expected_counts is None: + errors.append( + f"{name}: Could not parse expected SMILES for '{pdb_id}'" + ) + else: + obs_counts = Counter( + elem for elem in ligands.elements + if elem.strip() not in ("H", "h", "") + ) + if obs_counts != expected_counts: + errors.append( + f"{name}: Ligand heavy-atom composition mismatch. " + f"Expected {dict(sorted(expected_counts.items()))}, " + f"got {dict(sorted(obs_counts.items()))}" + ) + except Exception as e: errors.append(f"{name}: MDAnalysis failed to parse file: {e}")