Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
157 changes: 69 additions & 88 deletions notebooks/structure_prediction.ipynb

Large diffs are not rendered by default.

112 changes: 110 additions & 2 deletions validation/structure_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,79 @@
import tempfile
import MDAnalysis as mda
from pathlib import Path
from typing import Union
from typing import Mapping, Union

from rdkit import Chem
from rdkit.Chem import AllChem

STRUCTURE_DATASET_SIZE = 110


def _canonical_no_stereo_smiles(mol: Chem.Mol) -> str:
"""Return a canonical, non-isomeric SMILES string for robust matching."""
return Chem.MolToSmiles(Chem.RemoveHs(mol), canonical=True, isomericSmiles=False)


def _validate_ligand_bond_orders(
pdb_path: str,
expected_smiles: str,
) -> tuple[bool, str | None]:
"""
Validate ligand chemistry by assigning bond orders from a SMILES template.

This mirrors the template-based approach used during prep/inspection workflows,
but is embedded here so validation does not rely on external helper scripts.
"""
# Parse ligand-only PDB generated from a single LIG residue
lig_mol = Chem.MolFromPDBFile(pdb_path, removeHs=True)
if lig_mol is None:
return False, "RDKit could not parse extracted LIG residue"

template = Chem.MolFromSmiles(expected_smiles)
if template is None:
return False, f"Invalid expected SMILES provided: {expected_smiles!r}"

try:
assigned = AllChem.AssignBondOrdersFromTemplate(template, lig_mol)
except Exception as exc:
return False, f"Bond-order assignment from template failed: {exc}"

expected_canon = _canonical_no_stereo_smiles(template)
assigned_canon = _canonical_no_stereo_smiles(assigned)
if expected_canon != assigned_canon:
return (
False,
f"Ligand chemistry mismatch after bond-order assignment: expected {expected_canon}, got {assigned_canon}",
)

return True, None


def _smiles_for_structure(
expected_ligand_smiles: Mapping[str, str] | set[str] | list[str] | None,
structure_id: str,
) -> str | None:
"""
Resolve expected SMILES for a given structure ID.

Supported formats:
- Mapping[str, str]: preferred; keys are structure IDs.
- set/list[str]: fallback; accepted SMILES pool (ID-agnostic).
"""
if expected_ligand_smiles is None:
return None

if isinstance(expected_ligand_smiles, Mapping):
return expected_ligand_smiles.get(structure_id)

# Fallback mode: if a non-mapping iterable of SMILES is provided,
# we cannot resolve ID-specific templates here.
return None

def validate_structure_submission(
structure_predictions_file: Union[str, Path],
expected_ids: set[str] | None = None,
expected_ligand_smiles: set[str] | None = None,
expected_ligand_smiles: Mapping[str, str] | set[str] | list[str] | None = None,
require_lig_resname: bool = True,
) -> tuple[bool, list[str]]:

Expand Down Expand Up @@ -50,6 +115,8 @@ def validate_structure_submission(
# Extract to temp file so MDAnalysis can read it
tmp_path = zip_file.extract(name, path=tmpdir)

structure_id = Path(name).stem

try:
# Suppress warnings for missing chain IDs/elements if necessary
u = mda.Universe(tmp_path)
Expand All @@ -68,6 +135,47 @@ def validate_structure_submission(
if len(u.segments) > 2:
errors.append(f"{name}: Found {len(u.segments)} chains, expected 2 or fewer")

# 4. Optional ligand bond-order/chemistry check against expected SMILES
expected_smiles = _smiles_for_structure(expected_ligand_smiles, structure_id)
if expected_smiles is not None:
lig_res = ligands.residues[0]
lig_atoms = lig_res.atoms

# Write ligand-only PDB for robust template assignment
lig_tmp_path = Path(tmpdir) / f"{structure_id}_LIG.pdb"
lig_atoms.write(str(lig_tmp_path))

is_ok, msg = _validate_ligand_bond_orders(str(lig_tmp_path), expected_smiles)
if not is_ok:
errors.append(f"{name}: {msg}")
elif isinstance(expected_ligand_smiles, (set, list)):
# Backward-compatible pool mode: validate assigned chemistry against any provided SMILES.
lig_res = ligands.residues[0]
lig_atoms = lig_res.atoms
lig_tmp_path = Path(tmpdir) / f"{structure_id}_LIG.pdb"
lig_atoms.write(str(lig_tmp_path))

lig_mol = Chem.MolFromPDBFile(str(lig_tmp_path), removeHs=True)
if lig_mol is None:
errors.append(f"{name}: RDKit could not parse extracted LIG residue")
else:
matched = False
for smiles in expected_ligand_smiles:
template = Chem.MolFromSmiles(smiles)
if template is None:
continue
try:
assigned = AllChem.AssignBondOrdersFromTemplate(template, lig_mol)
except Exception:
continue
if _canonical_no_stereo_smiles(assigned) == _canonical_no_stereo_smiles(template):
matched = True
break
if not matched:
errors.append(
f"{name}: Ligand did not match any expected SMILES after bond-order assignment"
)

except Exception as e:
errors.append(f"{name}: MDAnalysis failed to parse file: {e}")

Expand Down
Loading