Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 22 additions & 19 deletions validation/structure_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,22 +2,15 @@

import zipfile
import tempfile
from collections import Counter
import MDAnalysis as mda
from pathlib import Path
from typing import Union
from rdkit import Chem
from rdkit import Chem, RDLogger
from rdkit.Chem import AllChem

STRUCTURE_DATASET_SIZE = 78


def _heavy_atom_counts_from_smiles(smiles: str) -> Counter | None:
mol = Chem.MolFromSmiles(smiles)
if mol is None:
return None
return Counter(atom.GetSymbol() for atom in mol.GetAtoms())


def validate_structure_submission(
structure_predictions_file: Union[str, Path],
expected_ids: set[str] | None = None,
Expand Down Expand Up @@ -78,27 +71,37 @@ def validate_structure_submission(
if len(u.segments) > 2:
errors.append(f"{name}: Found {len(u.segments)} chains, expected 2 or fewer")

# 4. Check ligand matches expected SMILES by heavy-atom composition
# 4. Check ligand graph matches expected SMILES connectivity
if expected_ligand_smiles is not None:
pdb_id = Path(name).stem
expected_smi = expected_ligand_smiles.get(pdb_id)
if expected_smi is not None:
expected_counts = _heavy_atom_counts_from_smiles(expected_smi)
if expected_counts is None:
ref_mol = Chem.MolFromSmiles(expected_smi)
if ref_mol is None:
errors.append(
f"{name}: Could not parse expected SMILES for '{pdb_id}'"
)
else:
obs_counts = Counter(
elem for elem in ligands.elements
if elem.strip() not in ("H", "h", "")
lig_pdb_path = Path(tmpdir) / f"{pdb_id}_lig.pdb"
ligands.write(str(lig_pdb_path))
lig_mol = Chem.MolFromPDBFile(
str(lig_pdb_path), removeHs=True, sanitize=False
)
if obs_counts != expected_counts:
if lig_mol is None:
errors.append(
f"{name}: Ligand heavy-atom composition mismatch. "
f"Expected {dict(sorted(expected_counts.items()))}, "
f"got {dict(sorted(obs_counts.items()))}"
f"{name}: RDKit could not parse LIG residue"
)
else:
try:
RDLogger.DisableLog("rdApp.*")
AllChem.AssignBondOrdersFromTemplate(ref_mol, lig_mol)
RDLogger.EnableLog("rdApp.*")
except ValueError:
RDLogger.EnableLog("rdApp.*")
errors.append(
f"{name}: Ligand connectivity does not match "
f"expected SMILES '{expected_smi}'"
)

except Exception as e:
errors.append(f"{name}: MDAnalysis failed to parse file: {e}")
Expand Down
Loading