diff --git a/validation/structure_validation.py b/validation/structure_validation.py index 3fdd651..2ca2fa5 100644 --- a/validation/structure_validation.py +++ b/validation/structure_validation.py @@ -2,22 +2,15 @@ import zipfile import tempfile -from collections import Counter import MDAnalysis as mda from pathlib import Path from typing import Union -from rdkit import Chem +from rdkit import Chem, RDLogger +from rdkit.Chem import AllChem STRUCTURE_DATASET_SIZE = 78 -def _heavy_atom_counts_from_smiles(smiles: str) -> Counter | None: - mol = Chem.MolFromSmiles(smiles) - if mol is None: - return None - return Counter(atom.GetSymbol() for atom in mol.GetAtoms()) - - def validate_structure_submission( structure_predictions_file: Union[str, Path], expected_ids: set[str] | None = None, @@ -78,27 +71,37 @@ def validate_structure_submission( if len(u.segments) > 2: errors.append(f"{name}: Found {len(u.segments)} chains, expected 2 or fewer") - # 4. Check ligand matches expected SMILES by heavy-atom composition + # 4. Check ligand graph matches expected SMILES connectivity if expected_ligand_smiles is not None: pdb_id = Path(name).stem expected_smi = expected_ligand_smiles.get(pdb_id) if expected_smi is not None: - expected_counts = _heavy_atom_counts_from_smiles(expected_smi) - if expected_counts is None: + ref_mol = Chem.MolFromSmiles(expected_smi) + if ref_mol is None: errors.append( f"{name}: Could not parse expected SMILES for '{pdb_id}'" ) else: - obs_counts = Counter( - elem for elem in ligands.elements - if elem.strip() not in ("H", "h", "") + lig_pdb_path = Path(tmpdir) / f"{pdb_id}_lig.pdb" + ligands.write(str(lig_pdb_path)) + lig_mol = Chem.MolFromPDBFile( + str(lig_pdb_path), removeHs=True, sanitize=False ) - if obs_counts != expected_counts: + if lig_mol is None: errors.append( - f"{name}: Ligand heavy-atom composition mismatch. " - f"Expected {dict(sorted(expected_counts.items()))}, " - f"got {dict(sorted(obs_counts.items()))}" + f"{name}: RDKit could not parse LIG residue" ) + else: + try: + RDLogger.DisableLog("rdApp.*") + AllChem.AssignBondOrdersFromTemplate(ref_mol, lig_mol) + RDLogger.EnableLog("rdApp.*") + except ValueError: + RDLogger.EnableLog("rdApp.*") + errors.append( + f"{name}: Ligand connectivity does not match " + f"expected SMILES '{expected_smi}'" + ) except Exception as e: errors.append(f"{name}: MDAnalysis failed to parse file: {e}")