Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 15 additions & 12 deletions notebooks/structure_prediction.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "7fd300b3f2364146bc2b1066cf3e3f06",
"model_id": "675085da034f4139976b9f860b5670e8",
"version_major": 2,
"version_minor": 0
},
Expand Down Expand Up @@ -255,31 +255,31 @@
" <th>0</th>\n",
" <td>x00011-1</td>\n",
" <td>NS(=O)(=O)C=1C=CC=2CCCC2C1</td>\n",
" <td>&lt;rdkit.Chem.rdchem.Mol object at 0x7c7374ab4740&gt;</td>\n",
" <td>&lt;rdkit.Chem.rdchem.Mol object at 0x7b69c12ac430&gt;</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>x00035-1</td>\n",
" <td>NC1=NC=2C=CC=CC2S1</td>\n",
" <td>&lt;rdkit.Chem.rdchem.Mol object at 0x7c7374ab47b0&gt;</td>\n",
" <td>&lt;rdkit.Chem.rdchem.Mol object at 0x7b69c12ac2e0&gt;</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>x00046-1</td>\n",
" <td>NC1=NC=2C=C(Cl)C=CC2O1</td>\n",
" <td>&lt;rdkit.Chem.rdchem.Mol object at 0x7c7374ab4820&gt;</td>\n",
" <td>&lt;rdkit.Chem.rdchem.Mol object at 0x7b69c12ac740&gt;</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>x00052-1</td>\n",
" <td>NC(=O)C=1C=CC=C(Cl)C1</td>\n",
" <td>&lt;rdkit.Chem.rdchem.Mol object at 0x7c7374ab4890&gt;</td>\n",
" <td>&lt;rdkit.Chem.rdchem.Mol object at 0x7b69c12ac7b0&gt;</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>x00086-1</td>\n",
" <td>ClC=1C=CC=2OC(=O)NC2C1</td>\n",
" <td>&lt;rdkit.Chem.rdchem.Mol object at 0x7c7374ab4900&gt;</td>\n",
" <td>&lt;rdkit.Chem.rdchem.Mol object at 0x7b69c12ac820&gt;</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
Expand All @@ -294,11 +294,11 @@
"4 x00086-1 ClC=1C=CC=2OC(=O)NC2C1 \n",
"\n",
" mol \n",
"0 <rdkit.Chem.rdchem.Mol object at 0x7c7374ab4740> \n",
"1 <rdkit.Chem.rdchem.Mol object at 0x7c7374ab47b0> \n",
"2 <rdkit.Chem.rdchem.Mol object at 0x7c7374ab4820> \n",
"3 <rdkit.Chem.rdchem.Mol object at 0x7c7374ab4890> \n",
"4 <rdkit.Chem.rdchem.Mol object at 0x7c7374ab4900> "
"0 <rdkit.Chem.rdchem.Mol object at 0x7b69c12ac430> \n",
"1 <rdkit.Chem.rdchem.Mol object at 0x7b69c12ac2e0> \n",
"2 <rdkit.Chem.rdchem.Mol object at 0x7b69c12ac740> \n",
"3 <rdkit.Chem.rdchem.Mol object at 0x7b69c12ac7b0> \n",
"4 <rdkit.Chem.rdchem.Mol object at 0x7b69c12ac820> "
]
},
"execution_count": 5,
Expand Down Expand Up @@ -683,7 +683,7 @@
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "71197becbdd541ef86a336453c28aac4",
"model_id": "067c76e2aa9842599d3e2d8b484e510b",
"version_major": 2,
"version_minor": 0
},
Expand Down Expand Up @@ -870,9 +870,12 @@
"\n",
"print(f\"Created zip archive: {zip_path} ({len(pdb_files)} files)\")\n",
"\n",
"expected_smiles_dict = dict(zip(structure_df[\"id\"], structure_df[\"SMILES\"]))\n",
"\n",
"is_valid, validation_errors = validate_structure_submission(\n",
" zip_path,\n",
" expected_ids=set(structure_df[\"id\"]),\n",
" expected_ligand_smiles=expected_smiles_dict,\n",
" require_lig_resname=True,\n",
")\n",
"\n",
Expand Down
34 changes: 33 additions & 1 deletion validation/structure_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,26 @@

import zipfile
import tempfile
from collections import Counter
import MDAnalysis as mda
from pathlib import Path
from typing import Union
from rdkit import Chem

STRUCTURE_DATASET_SIZE = 78


def _heavy_atom_counts_from_smiles(smiles: str) -> Counter | None:
    """Count the non-hydrogen atoms of a SMILES string by element symbol.

    Args:
        smiles: SMILES string to parse with RDKit.

    Returns:
        A ``Counter`` mapping each element symbol to the number of atoms of
        that element, or ``None`` when RDKit cannot parse ``smiles``.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None
    # RDKit can keep hydrogens as explicit atoms after parsing (for example
    # isotope-labelled ones such as "[2H]"). Skip them so the result really is
    # a heavy-atom composition and never carries an "H" entry.
    return Counter(
        atom.GetSymbol() for atom in mol.GetAtoms() if atom.GetAtomicNum() != 1
    )


def validate_structure_submission(
structure_predictions_file: Union[str, Path],
expected_ids: set[str] | None = None,
expected_ligand_smiles: set[str] | None = None,
expected_ligand_smiles: dict[str, str] | None = None,
require_lig_resname: bool = True,
) -> tuple[bool, list[str]]:

Expand Down Expand Up @@ -68,6 +78,28 @@ def validate_structure_submission(
if len(u.segments) > 2:
errors.append(f"{name}: Found {len(u.segments)} chains, expected 2 or fewer")

# 4. Check ligand matches expected SMILES by heavy-atom composition
if expected_ligand_smiles is not None:
pdb_id = Path(name).stem
expected_smi = expected_ligand_smiles.get(pdb_id)
if expected_smi is not None:
expected_counts = _heavy_atom_counts_from_smiles(expected_smi)
if expected_counts is None:
errors.append(
f"{name}: Could not parse expected SMILES for '{pdb_id}'"
)
else:
obs_counts = Counter(
elem for elem in ligands.elements
if elem.strip() not in ("H", "h", "")
)
if obs_counts != expected_counts:
errors.append(
f"{name}: Ligand heavy-atom composition mismatch. "
f"Expected {dict(sorted(expected_counts.items()))}, "
f"got {dict(sorted(obs_counts.items()))}"
)

except Exception as e:
errors.append(f"{name}: MDAnalysis failed to parse file: {e}")

Expand Down
Loading