From dd00e3288e1b9651db53a93f3664cfd4bbb1fc64 Mon Sep 17 00:00:00 2001 From: Patrick Kunzmann Date: Fri, 24 Jan 2025 15:12:06 +0100 Subject: [PATCH 01/12] Add `explicit_hydrogen` parameter --- src/biotite/interface/rdkit/mol.py | 27 +++++++++++++++++++++++---- tests/interface/test_rdkit.py | 10 ++++------ 2 files changed, 27 insertions(+), 10 deletions(-) diff --git a/src/biotite/interface/rdkit/mol.py b/src/biotite/interface/rdkit/mol.py index eef433179..500807cb0 100644 --- a/src/biotite/interface/rdkit/mol.py +++ b/src/biotite/interface/rdkit/mol.py @@ -76,6 +76,7 @@ def to_mol( kekulize=False, use_dative_bonds=False, include_extra_annotations=(), + explicit_hydrogen=True, ): """ Convert an :class:`.AtomArray` or :class:`.AtomArrayStack` into a @@ -105,6 +106,11 @@ def to_mol( are always included per default. These standard annotations can be accessed with :meth:`rdkit.Chem.rdchem.Atom.GetPDBResidueInfo()` for each atom in the returned :class:`rdkit.Chem.rdchem.Mol`. + explicit_hydrogen : bool, optional + If set to true, the conversion process expects that all hydrogen atoms are + explicit, i.e. each each hydrogen atom must be part of the :class:`AtomArray`. + If set to false, the conversion process treats all hydrogen atoms as implicit + and all hydrogen atoms in the :class:`AtomArray` are removed. Returns ------- @@ -141,12 +147,24 @@ def to_mol( HB3 HXT """ + hydrogen_mask = atoms.element == "H" + if explicit_hydrogen: + if not hydrogen_mask.any(): + warnings.warn( + "No hydrogen found in the input, although 'explicit_hydrogen' is 'True'" + ) + else: + atoms = atoms[..., ~hydrogen_mask] + mol = EditableMol(Mol()) has_annot = frozenset(atoms.get_annotation_categories()) extra_annot = set(include_extra_annotations) - _STANDARD_ANNOTATIONS + for i in range(atoms.array_length()): rdkit_atom = Atom(atoms.element[i].capitalize()) + if explicit_hydrogen: + rdkit_atom.SetNoImplicit(True) if "charge" in has_annot: rdkit_atom.SetFormalCharge(atoms.charge[i].item()) @@ -176,11 +194,12 @@ def to_mol( if atoms.bonds is None: raise BadStructureError("An AtomArray with associated BondList is required") - bonds = atoms.bonds.as_array() if kekulize: - bonds = bonds.copy() + bonds = atoms.bonds.copy() bonds.remove_aromaticity() - for atom_i, atom_j, bond_type in atoms.bonds.as_array(): + else: + bonds = atoms.bonds + for atom_i, atom_j, bond_type in bonds.as_array(): if not use_dative_bonds and bond_type == BondType.COORDINATION: bond_type = BondType.SINGLE mol.AddBond( @@ -367,7 +386,7 @@ def from_mol(mol, conformer_id=None, add_hydrogen=None): except KekulizeException: warnings.warn( "Kekulization failed, " - "using 'BondType.ANY' instead for aromatic bonds instead", + "using 'BondType.AROMATIC' instead for aromatic bonds instead", LossyConversionWarning, ) rdkit_bonds = list(mol.GetBonds()) diff --git a/tests/interface/test_rdkit.py b/tests/interface/test_rdkit.py index 2e342ae92..f9fc6245d 100644 --- a/tests/interface/test_rdkit.py +++ b/tests/interface/test_rdkit.py @@ -20,11 +20,9 @@ def _load_smiles(): return file.read().splitlines() -@pytest.mark.filterwarnings( - "ignore:" - "The coordinates are missing for some atoms. " - "The fallback coordinates will be used instead" -) +@pytest.mark.filterwarnings("ignore:Missing coordinates.*") +@pytest.mark.filterwarnings("ignore:.*coordinates are missing.*") +@pytest.mark.filterwarnings("ignore::biotite.interface.LossyConversionWarning") @pytest.mark.parametrize( "res_name", np.random.default_rng(0).choice(info.all_residues(), size=200).tolist() ) @@ -46,7 +44,7 @@ def test_conversion_from_biotite(res_name): # Some compounds in the CCD have missing coordinates assert np.allclose(test_atoms.coord, ref_atoms.coord, equal_nan=True) - # There should be now undefined bonds + # There should be no undefined bonds assert (test_atoms.bonds.as_array()[:, 2] != struc.BondType.ANY).all() # Kekulization returns one of multiple resonance structures, so the returned one # might not be the same as the input From 30b8eb3be23483e147021de90014c478448cf53c Mon Sep 17 00:00:00 2001 From: Patrick Kunzmann Date: Sun, 2 Feb 2025 11:15:24 +0100 Subject: [PATCH 02/12] Use canonical rdkit import --- doc/tutorial/interface/rdkit.rst | 20 ++++----- src/biotite/interface/rdkit/mol.py | 66 +++++++++++++----------------- 2 files changed, 36 insertions(+), 50 deletions(-) diff --git a/doc/tutorial/interface/rdkit.rst b/doc/tutorial/interface/rdkit.rst index ff9954a52..e91ee55d2 100644 --- a/doc/tutorial/interface/rdkit.rst +++ b/doc/tutorial/interface/rdkit.rst @@ -27,15 +27,14 @@ For a proper structural formula, we need to compute proper 2D coordinates first. import biotite.interface.rdkit as rdkit_interface import biotite.structure.info as struc + import rdkit.Chem.AllChem as Chem from rdkit.Chem.Draw import MolToImage - from rdkit.Chem.rdDepictor import Compute2DCoords - from rdkit.Chem.rdmolops import RemoveHs penicillin = struc.residue("PNN") mol = rdkit_interface.to_mol(penicillin) # We do not want to include explicit hydrogen atoms in the structural formula - mol = RemoveHs(mol) - Compute2DCoords(mol) + mol = Chem.RemoveHs(mol) + Chem.Compute2DCoords(mol) image = MolToImage(mol, size=(600, 400)) display(image) @@ -49,18 +48,13 @@ One way to to obtain them as :class:`.AtomArray` is passing a *SMILES* string to .. jupyter-execute:: - from rdkit.Chem import MolFromSmiles - from rdkit.Chem.rdDistGeom import EmbedMolecule - from rdkit.Chem.rdForceFieldHelpers import UFFOptimizeMolecule - from rdkit.Chem.rdmolops import AddHs - ERTAPENEM_SMILES = "C[C@@H]1[C@@H]2[C@H](C(=O)N2C(=C1S[C@H]3C[C@H](NC3)C(=O)NC4=CC=CC(=C4)C(=O)O)C(=O)O)[C@@H](C)O" - mol = MolFromSmiles(ERTAPENEM_SMILES) + mol = Chem.MolFromSmiles(ERTAPENEM_SMILES) # RDKit uses implicit hydrogen atoms by default, but Biotite requires explicit ones - mol = AddHs(mol) + mol = Chem.AddHs(mol) # Create a 3D conformer - conformer_id = EmbedMolecule(mol) - UFFOptimizeMolecule(mol) + conformer_id = Chem.EmbedMolecule(mol) + Chem.UFFOptimizeMolecule(mol) ertapenem = rdkit_interface.from_mol(mol, conformer_id) print(ertapenem) \ No newline at end of file diff --git a/src/biotite/interface/rdkit/mol.py b/src/biotite/interface/rdkit/mol.py index 500807cb0..eb31a0854 100644 --- a/src/biotite/interface/rdkit/mol.py +++ b/src/biotite/interface/rdkit/mol.py @@ -9,16 +9,8 @@ import warnings from collections import defaultdict import numpy as np -from rdkit.Chem.rdchem import ( - Atom, - AtomPDBResidueInfo, - Conformer, - EditableMol, - KekulizeException, - Mol, -) -from rdkit.Chem.rdchem import BondType as RDKitBondType -from rdkit.Chem.rdmolops import AddHs, Kekulize, SanitizeFlags, SanitizeMol +import rdkit.Chem.AllChem as Chem +from rdkit.Chem import SanitizeFlags from biotite.interface.version import requires_version from biotite.interface.warning import LossyConversionWarning from biotite.structure.atoms import AtomArray, AtomArrayStack @@ -31,26 +23,26 @@ BondType.TRIPLE: BondType.AROMATIC_TRIPLE, } _BIOTITE_TO_RDKIT_BOND_TYPE = { - BondType.ANY: RDKitBondType.UNSPECIFIED, - BondType.SINGLE: RDKitBondType.SINGLE, - BondType.DOUBLE: RDKitBondType.DOUBLE, - BondType.TRIPLE: RDKitBondType.TRIPLE, - BondType.QUADRUPLE: RDKitBondType.QUADRUPLE, - BondType.AROMATIC_SINGLE: RDKitBondType.AROMATIC, - BondType.AROMATIC_DOUBLE: RDKitBondType.AROMATIC, - BondType.AROMATIC_TRIPLE: RDKitBondType.AROMATIC, - BondType.AROMATIC: RDKitBondType.AROMATIC, + BondType.ANY: Chem.BondType.UNSPECIFIED, + BondType.SINGLE: Chem.BondType.SINGLE, + BondType.DOUBLE: Chem.BondType.DOUBLE, + BondType.TRIPLE: Chem.BondType.TRIPLE, + BondType.QUADRUPLE: Chem.BondType.QUADRUPLE, + BondType.AROMATIC_SINGLE: Chem.BondType.AROMATIC, + BondType.AROMATIC_DOUBLE: Chem.BondType.AROMATIC, + BondType.AROMATIC_TRIPLE: Chem.BondType.AROMATIC, + BondType.AROMATIC: Chem.BondType.AROMATIC, # Dative bonds may lead to a KekulizeException and may potentially be deprecated # in the future (https://github.com/rdkit/rdkit/discussions/6995) - BondType.COORDINATION: RDKitBondType.SINGLE, + BondType.COORDINATION: Chem.BondType.SINGLE, } _RDKIT_TO_BIOTITE_BOND_TYPE = { - RDKitBondType.UNSPECIFIED: BondType.ANY, - RDKitBondType.SINGLE: BondType.SINGLE, - RDKitBondType.DOUBLE: BondType.DOUBLE, - RDKitBondType.TRIPLE: BondType.TRIPLE, - RDKitBondType.QUADRUPLE: BondType.QUADRUPLE, - RDKitBondType.DATIVE: BondType.COORDINATION, + Chem.BondType.UNSPECIFIED: BondType.ANY, + Chem.BondType.SINGLE: BondType.SINGLE, + Chem.BondType.DOUBLE: BondType.DOUBLE, + Chem.BondType.TRIPLE: BondType.TRIPLE, + Chem.BondType.QUADRUPLE: BondType.QUADRUPLE, + Chem.BondType.DATIVE: BondType.COORDINATION, } _STANDARD_ANNOTATIONS = frozenset( { @@ -156,20 +148,20 @@ def to_mol( else: atoms = atoms[..., ~hydrogen_mask] - mol = EditableMol(Mol()) + mol = Chem.EditableMol(Chem.Mol()) has_annot = frozenset(atoms.get_annotation_categories()) extra_annot = set(include_extra_annotations) - _STANDARD_ANNOTATIONS for i in range(atoms.array_length()): - rdkit_atom = Atom(atoms.element[i].capitalize()) + rdkit_atom = Chem.Atom(atoms.element[i].capitalize()) if explicit_hydrogen: rdkit_atom.SetNoImplicit(True) if "charge" in has_annot: rdkit_atom.SetFormalCharge(atoms.charge[i].item()) # add standard pdb annotations - rdkit_atom_res_info = AtomPDBResidueInfo( + rdkit_atom_res_info = Chem.AtomPDBResidueInfo( atomName=atoms.atom_name[i].item(), residueName=atoms.res_name[i].item(), chainId=atoms.chain_id[i].item(), @@ -213,7 +205,7 @@ def to_mol( # Handle AtomArray and AtomArrayStack consistently coord = coord[None, :, :] for model_coord in coord: - conformer = Conformer(mol.GetNumAtoms()) + conformer = Chem.Conformer(mol.GetNumAtoms()) # RDKit silently expects the data to be in C-contiguous order # Otherwise the coordinates would be completely misassigned # (https://github.com/rdkit/rdkit/issues/8221) @@ -290,8 +282,8 @@ def from_mol(mol, conformer_id=None, add_hydrogen=None): if add_hydrogen is None: add_hydrogen = not _has_explicit_hydrogen(mol) if add_hydrogen: - SanitizeMol(mol, SanitizeFlags.SANITIZE_ADJUSTHS) - mol = AddHs(mol, addCoords=False, addResidueInfo=False) + Chem.SanitizeMol(mol, SanitizeFlags.SANITIZE_ADJUSTHS) + mol = Chem.AddHs(mol, addCoords=False, addResidueInfo=False) rdkit_atoms = mol.GetAtoms() if rdkit_atoms is None: @@ -328,7 +320,7 @@ def from_mol(mol, conformer_id=None, add_hydrogen=None): residue_info = rdkit_atom.GetPDBResidueInfo() if residue_info is None: # ... default values for atoms with missing residue information - residue_info = AtomPDBResidueInfo( + residue_info = Chem.AtomPDBResidueInfo( atomName="", occupancy=0.0, tempFactor=float("nan"), @@ -375,15 +367,15 @@ def from_mol(mol, conformer_id=None, add_hydrogen=None): rdkit_bonds = list(mol.GetBonds()) is_aromatic = np.array( - [bond.GetBondType() == RDKitBondType.AROMATIC for bond in rdkit_bonds] + [bond.GetBondType() == Chem.BondType.AROMATIC for bond in rdkit_bonds] ) if np.any(is_aromatic): # Determine the kekulized order of aromatic bonds # Copy as 'Kekulize()' modifies the molecule in-place - mol = Mol(mol) + mol = Chem.Mol(mol) try: - Kekulize(mol) - except KekulizeException: + Chem.Kekulize(mol) + except Chem.KekulizeException: warnings.warn( "Kekulization failed, " "using 'BondType.AROMATIC' instead for aromatic bonds instead", From 6ced9ea37092089053e97eb72882a839f324e1bf Mon Sep 17 00:00:00 2001 From: Simon Mathis Date: Wed, 12 Feb 2025 12:05:21 +0000 Subject: [PATCH 03/12] fix: enable returning molecules without conformers (with nan coords). Also set coordinates of non-3d conformers to nan even when explicitly requested, as they are meaningless. --- src/biotite/interface/rdkit/mol.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/src/biotite/interface/rdkit/mol.py b/src/biotite/interface/rdkit/mol.py index eb31a0854..8d632d3ae 100644 --- a/src/biotite/interface/rdkit/mol.py +++ b/src/biotite/interface/rdkit/mol.py @@ -290,14 +290,20 @@ def from_mol(mol, conformer_id=None, add_hydrogen=None): raise BadStructureError("Could not obtains atoms from Mol") if conformer_id is None: - conformers = [conf for conf in mol.GetConformers() if conf.Is3D()] + conformers = [conf for conf in mol.GetConformers()] atoms = AtomArrayStack(len(conformers), len(rdkit_atoms)) for i, conformer in enumerate(conformers): - atoms.coord[i] = np.array(conformer.GetPositions()) + if conformer.Is3D(): + atoms.coord[i] = np.array(conformer.GetPositions()) + else: + atoms.coord[i] = np.full((len(rdkit_atoms), 3), np.nan) else: conformer = mol.GetConformer(conformer_id) atoms = AtomArray(len(rdkit_atoms)) - atoms.coord = np.array(conformer.GetPositions()) + if conformer.Is3D(): + atoms.coord = np.array(conformer.GetPositions()) + else: + atoms.coord = np.full((len(rdkit_atoms), 3), np.nan) extra_annotations = defaultdict( # Use 'object' dtype first, as the maximum string length is unknown From c111e7911c52fdec91739503380307af286b91c4 Mon Sep 17 00:00:00 2001 From: Simon Mathis Date: Wed, 12 Feb 2025 12:06:07 +0000 Subject: [PATCH 04/12] feat: attempt to type-cast extra annotations instead of always leaving them as 'str' objects. Also type-cast the rdkit internal 'isImplicit' flag. --- src/biotite/interface/rdkit/mol.py | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/src/biotite/interface/rdkit/mol.py b/src/biotite/interface/rdkit/mol.py index 8d632d3ae..09bc66ab6 100644 --- a/src/biotite/interface/rdkit/mol.py +++ b/src/biotite/interface/rdkit/mol.py @@ -369,7 +369,25 @@ def from_mol(mol, conformer_id=None, add_hydrogen=None): extra_annotations[annot_name][_atom_idx] = rdkit_atom.GetProp(annot_name) for annot_name, array in extra_annotations.items(): - atoms.set_annotation(annot_name, array.astype(str)) + # ... handle special case of implicit hydrogen atom flags + if annot_name == "isImplicit": + atoms.set_annotation(annot_name, array.astype(bool)) + continue + + # ... for all custom annotations, try numeric conversions in order of preference + for dtype in (bool, int, float): + try: + converted = array.astype(dtype) + # Verify the conversion was lossless by converting back + if np.array_equal( + converted.astype(str), array.astype(str), equal_nan=True + ): + atoms.set_annotation(annot_name, converted) + break + except (ValueError, TypeError): + continue + else: # ... if no numeric conversion succeeded + atoms.set_annotation(annot_name, array.astype(str)) rdkit_bonds = list(mol.GetBonds()) is_aromatic = np.array( From 88cdfee7331e78cc1100deabbbf7a373d5593b39 Mon Sep 17 00:00:00 2001 From: Simon Mathis Date: Wed, 12 Feb 2025 12:15:25 +0000 Subject: [PATCH 05/12] fix: suggestion for default case & warnings for explicit hydrogens --- src/biotite/interface/rdkit/mol.py | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/src/biotite/interface/rdkit/mol.py b/src/biotite/interface/rdkit/mol.py index 09bc66ab6..21ccf9561 100644 --- a/src/biotite/interface/rdkit/mol.py +++ b/src/biotite/interface/rdkit/mol.py @@ -68,7 +68,7 @@ def to_mol( kekulize=False, use_dative_bonds=False, include_extra_annotations=(), - explicit_hydrogen=True, + explicit_hydrogen=False, ): """ Convert an :class:`.AtomArray` or :class:`.AtomArrayStack` into a @@ -103,6 +103,9 @@ def to_mol( explicit, i.e. each each hydrogen atom must be part of the :class:`AtomArray`. If set to false, the conversion process treats all hydrogen atoms as implicit and all hydrogen atoms in the :class:`AtomArray` are removed. + By default, explicit_hydrogen=False, i.e. hydrogen atoms are never assumed, to + avoid accidentally introducing radicals into the molecule upon calling + :func:`Chem.SanitizeMol()`. Returns ------- @@ -140,12 +143,24 @@ def to_mol( HXT """ hydrogen_mask = atoms.element == "H" + _has_hydrogen = hydrogen_mask.any() if explicit_hydrogen: - if not hydrogen_mask.any(): + if not _has_hydrogen: warnings.warn( - "No hydrogen found in the input, although 'explicit_hydrogen' is 'True'" + "No hydrogen found in the input, although 'explicit_hydrogen' is 'True'. " + "This may lead to unexpected results, as hydrogen atoms are not " + "implicitly added, which often results in radicals after sanitization " + "in RDKit. Make sure this is what you want.", + UserWarning, ) else: + if _has_hydrogen: + warnings.warn( + "Hydrogen atoms are present in the input, although 'explicit_hydrogen' " + "is 'False'. All hydrogen atoms will be removed and then re-inferred " + "by RDKit. Make sure this is what you want.", + UserWarning, + ) atoms = atoms[..., ~hydrogen_mask] mol = Chem.EditableMol(Chem.Mol()) From c67831723d61fb100a9939ff34109392be26c64c Mon Sep 17 00:00:00 2001 From: Patrick Kunzmann Date: Sat, 15 Feb 2025 22:23:33 +0100 Subject: [PATCH 06/12] Update src/biotite/interface/rdkit/mol.py Co-authored-by: Simon Mathis --- src/biotite/interface/rdkit/mol.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/biotite/interface/rdkit/mol.py b/src/biotite/interface/rdkit/mol.py index 21ccf9561..40a2d5b26 100644 --- a/src/biotite/interface/rdkit/mol.py +++ b/src/biotite/interface/rdkit/mol.py @@ -171,6 +171,7 @@ def to_mol( for i in range(atoms.array_length()): rdkit_atom = Chem.Atom(atoms.element[i].capitalize()) if explicit_hydrogen: + # ... tell RDKit to not assume any implicit hydrogens rdkit_atom.SetNoImplicit(True) if "charge" in has_annot: rdkit_atom.SetFormalCharge(atoms.charge[i].item()) From 8235f0152e4fd773b160543af99c62570c2b2614 Mon Sep 17 00:00:00 2001 From: Patrick Kunzmann Date: Sat, 15 Feb 2025 22:44:20 +0100 Subject: [PATCH 07/12] Infer `explicit_hydrogen` from presence of hydrogen atoms --- src/biotite/interface/rdkit/mol.py | 34 +++++++++++++++++------------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/src/biotite/interface/rdkit/mol.py b/src/biotite/interface/rdkit/mol.py index 40a2d5b26..a2e855230 100644 --- a/src/biotite/interface/rdkit/mol.py +++ b/src/biotite/interface/rdkit/mol.py @@ -68,7 +68,7 @@ def to_mol( kekulize=False, use_dative_bonds=False, include_extra_annotations=(), - explicit_hydrogen=False, + explicit_hydrogen=None, ): """ Convert an :class:`.AtomArray` or :class:`.AtomArrayStack` into a @@ -78,6 +78,7 @@ def to_mol( ---------- atoms : AtomArray or AtomArrayStack The molecule to be converted. + Must have an associated :class:`BondList`. kekulize : bool, optional If set to true, aromatic bonds are represented by single, double and triple bonds. @@ -101,11 +102,9 @@ def to_mol( explicit_hydrogen : bool, optional If set to true, the conversion process expects that all hydrogen atoms are explicit, i.e. each each hydrogen atom must be part of the :class:`AtomArray`. - If set to false, the conversion process treats all hydrogen atoms as implicit - and all hydrogen atoms in the :class:`AtomArray` are removed. - By default, explicit_hydrogen=False, i.e. hydrogen atoms are never assumed, to - avoid accidentally introducing radicals into the molecule upon calling - :func:`Chem.SanitizeMol()`. + If set to false, the conversion process treats all hydrogen atoms as implicit. + By default, explicit hydrogen atoms are only assumed of any hydrogen atoms are + present in `atoms`. Returns ------- @@ -114,6 +113,13 @@ def to_mol( If the input `atoms` is an :class:`AtomArrayStack`, all models are included as conformers with conformer IDs starting from ``0``. + Raised + ------ + BadStructureError + If the input `atoms` does not have an associated :class:`BondList`. + Also raises a :class:`BadStructureError`, if `explicit_hydrogen` is set to + ``False`` despite hydrogen atoms being present in `atoms`. + Examples -------- @@ -144,22 +150,20 @@ def to_mol( """ hydrogen_mask = atoms.element == "H" _has_hydrogen = hydrogen_mask.any() - if explicit_hydrogen: + if explicit_hydrogen is None: + explicit_hydrogen = _has_hydrogen + elif explicit_hydrogen: if not _has_hydrogen: warnings.warn( - "No hydrogen found in the input, although 'explicit_hydrogen' is 'True'. " - "This may lead to unexpected results, as hydrogen atoms are not " - "implicitly added, which often results in radicals after sanitization " - "in RDKit. Make sure this is what you want.", + "No hydrogen found, although 'explicit_hydrogen' is 'True'. " + "This may lead to radicals after sanitization in RDKit.", UserWarning, ) else: if _has_hydrogen: - warnings.warn( + raise BadStructureError( "Hydrogen atoms are present in the input, although 'explicit_hydrogen' " - "is 'False'. All hydrogen atoms will be removed and then re-inferred " - "by RDKit. Make sure this is what you want.", - UserWarning, + "is set to 'False'" ) atoms = atoms[..., ~hydrogen_mask] From 81fa25741849310a591bfbaab672af532488a9cc Mon Sep 17 00:00:00 2001 From: Patrick Kunzmann Date: Sat, 15 Feb 2025 22:45:32 +0100 Subject: [PATCH 08/12] Add author --- src/biotite/interface/rdkit/mol.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/biotite/interface/rdkit/mol.py b/src/biotite/interface/rdkit/mol.py index a2e855230..7484842b7 100644 --- a/src/biotite/interface/rdkit/mol.py +++ b/src/biotite/interface/rdkit/mol.py @@ -3,7 +3,7 @@ # information. __name__ = "biotite.interface.rdkit" -__author__ = "Patrick Kunzmann" +__author__ = "Patrick Kunzmann, Simon Mathis" __all__ = ["to_mol", "from_mol"] import warnings From 2964417879fedd61238939e5a2558a1ef1414916 Mon Sep 17 00:00:00 2001 From: Patrick Kunzmann Date: Sat, 15 Feb 2025 23:38:09 +0100 Subject: [PATCH 09/12] Make 2D and 3D conformers selectable --- src/biotite/interface/rdkit/mol.py | 41 +++++++++++++++---------- tests/interface/test_rdkit.py | 49 +++++++++++++++++++++--------- 2 files changed, 59 insertions(+), 31 deletions(-) diff --git a/src/biotite/interface/rdkit/mol.py b/src/biotite/interface/rdkit/mol.py index 7484842b7..9540a4210 100644 --- a/src/biotite/interface/rdkit/mol.py +++ b/src/biotite/interface/rdkit/mol.py @@ -246,9 +246,12 @@ def from_mol(mol, conformer_id=None, add_hydrogen=None): ---------- mol : rdkit.Chem.rdchem.Mol The molecule to be converted. - conformer_id : int, optional - The conformer to be converted. - By default, an :class:`AtomArrayStack` with all conformers is returned. + conformer_id : int or {"2D", "3D"}, optional + The ID of the conformer to be converted. + If set to "2D" or "3D", an :class:`AtomArrayStack` with only the 2D or 3D + conformer is returned, respectively. + By default, an :class:`AtomArrayStack` with all conformers (2D and 3D) is + returned. add_hydrogen : bool, optional If set to true, explicit hydrogen atoms are always added. If set to false, explicit hydrogen atoms are never added. @@ -259,8 +262,10 @@ def from_mol(mol, conformer_id=None, add_hydrogen=None): ------- atoms : AtomArray or AtomArrayStack The converted atoms. - An :class:`AtomArrayStack` is only returned, if the `conformer_id` parameter - is not set. + An :class:`AtomArray` is returned if an integer `conformer_id` is given. + Otherwise, an :class:`AtomArrayStack` is returned. + If the input `mol` does not have a conformer, an `AtomArrayStack` with a + single model, where all coordinates are *NaN*, is returned. Notes ----- @@ -309,21 +314,25 @@ def from_mol(mol, conformer_id=None, add_hydrogen=None): if rdkit_atoms is None: raise BadStructureError("Could not obtains atoms from Mol") - if conformer_id is None: + if conformer_id in (None, "2D", "3D"): conformers = [conf for conf in mol.GetConformers()] - atoms = AtomArrayStack(len(conformers), len(rdkit_atoms)) - for i, conformer in enumerate(conformers): - if conformer.Is3D(): - atoms.coord[i] = np.array(conformer.GetPositions()) - else: - atoms.coord[i] = np.full((len(rdkit_atoms), 3), np.nan) + if conformer_id == "2D": + conformers = [conf for conf in conformers if not conf.Is3D()] + elif conformer_id == "3D": + conformers = [conf for conf in conformers if conf.Is3D()] + if len(conformers) == 0: + # No conformer in 'Mol' that fulfills the criteria + # -> create a single model with all coordinates set to NaN + atoms = AtomArrayStack(1, len(rdkit_atoms)) + atoms.coord = np.full((1, len(rdkit_atoms), 3), np.nan) + else: + atoms = AtomArrayStack(len(conformers), len(rdkit_atoms)) + for i, conformer in enumerate(conformers): + atoms.coord[i] = np.array(conformer.GetPositions(), dtype=np.float32) else: conformer = mol.GetConformer(conformer_id) atoms = AtomArray(len(rdkit_atoms)) - if conformer.Is3D(): - atoms.coord = np.array(conformer.GetPositions()) - else: - atoms.coord = np.full((len(rdkit_atoms), 3), np.nan) + atoms.coord = np.array(conformer.GetPositions(), dtype=np.float32) extra_annotations = defaultdict( # Use 'object' dtype first, as the maximum string length is unknown diff --git a/tests/interface/test_rdkit.py b/tests/interface/test_rdkit.py index f9fc6245d..15d907842 100644 --- a/tests/interface/test_rdkit.py +++ b/tests/interface/test_rdkit.py @@ -1,13 +1,7 @@ from pathlib import Path import numpy as np import pytest -from rdkit.Chem import MolFromSmiles, MolToSmiles -from rdkit.Chem.rdchem import Atom, EditableMol, Mol -from rdkit.Chem.rdchem import BondType as RDKitBondType -from rdkit.Chem.rdmolops import ( - AddHs, - RemoveStereochemistry, -) +import rdkit.Chem.AllChem as Chem import biotite.interface.rdkit as rdkit_interface import biotite.structure as struc import biotite.structure.info as info @@ -102,19 +96,44 @@ def test_conversion_from_rdkit(smiles): Start from SMILES string to ensure that built-in functionality of RDKit is used to create the initial molecule. """ - ref_mol = MolFromSmiles(smiles) + ref_mol = Chem.MolFromSmiles(smiles) atoms = rdkit_interface.from_mol(ref_mol) test_mol = rdkit_interface.to_mol(atoms) # The intermediate AtomArray has explicit hydrogen atoms so add them explicitly # to the reference as well for fair comparison - ref_mol = AddHs(ref_mol) + ref_mol = Chem.AddHs(ref_mol) # The intermediate AtomArray does not have stereochemistry information, # so this info cannot be preserved in the comparison - RemoveStereochemistry(ref_mol) + Chem.RemoveStereochemistry(ref_mol) # RDKit does not support equality checking -> Use SMILES string as proxy - assert MolToSmiles(test_mol) == MolToSmiles(ref_mol) + assert Chem.MolToSmiles(test_mol) == Chem.MolToSmiles(ref_mol) + + +@pytest.mark.parametrize("selected_conformer_type", ["2D", "3D"]) +@pytest.mark.parametrize("present_conformer_type", ["2D", "3D"]) +def test_conformer_selection(present_conformer_type, selected_conformer_type): + """ + Check if 2D and 3D conformers can be selected and that a model with *NaN* + coordinates is returned if an absent conformer type is selected. + """ + mol = Chem.MolFromSmiles("C1=CC=CC=C1") + conformer = Chem.Conformer(mol.GetNumAtoms()) + conformer.Set3D(present_conformer_type=="3D") + mol.AddConformer(conformer) + + atoms = rdkit_interface.from_mol(mol, conformer_id=selected_conformer_type) + + # In any case there should be a single model + assert atoms.stack_depth() == 1 + if selected_conformer_type != present_conformer_type: + # In case the selected conformer type is not present, expect a model + # with all coordinates set to NaN + assert np.all(np.isnan(atoms.coord)) + else: + # The default conformer set above contains zeros + assert np.all(atoms.coord == 0) def test_kekulization(): @@ -147,11 +166,11 @@ def test_unmappable_bond_type(): """ Test that a warning is raised when a bond type cannot be mapped to Biotite. """ - mol = EditableMol(Mol()) - mol.AddAtom(Atom("F")) - mol.AddAtom(Atom("F")) + mol = Chem.EditableMol(Chem.Mol()) + mol.AddAtom(Chem.Atom("F")) + mol.AddAtom(Chem.Atom("F")) # 'HEXTUPLE' has no corresponding Biotite bond type - mol.AddBond(0, 1, RDKitBondType.HEXTUPLE) + mol.AddBond(0, 1, Chem.BondType.HEXTUPLE) mol = mol.GetMol() with pytest.warns(LossyConversionWarning): From f30ca157b3197ece67a074e9343a08bc4b81a1fc Mon Sep 17 00:00:00 2001 From: Patrick Kunzmann Date: Tue, 18 Feb 2025 09:32:01 +0100 Subject: [PATCH 10/12] Allow multiple types for extra annotations --- src/biotite/interface/rdkit/mol.py | 63 ++++++++++++++++-------------- tests/interface/test_rdkit.py | 28 ++++++++++++- 2 files changed, 61 insertions(+), 30 deletions(-) diff --git a/src/biotite/interface/rdkit/mol.py b/src/biotite/interface/rdkit/mol.py index 9540a4210..a357bb3d8 100644 --- a/src/biotite/interface/rdkit/mol.py +++ b/src/biotite/interface/rdkit/mol.py @@ -6,6 +6,7 @@ __author__ = "Patrick Kunzmann, Simon Mathis" __all__ = ["to_mol", "from_mol"] +import numbers import warnings from collections import defaultdict import numpy as np @@ -199,7 +200,9 @@ def to_mol( # add extra annotations for annot_name in extra_annot: - rdkit_atom.SetProp(annot_name, atoms.get_annotation(annot_name)[i].item()) + _set_property( + rdkit_atom, annot_name, atoms.get_annotation(annot_name)[i].item() + ) # add atom to molecule mol.AddAtom(rdkit_atom) @@ -270,8 +273,8 @@ def from_mol(mol, conformer_id=None, add_hydrogen=None): Notes ----- All atom-level properties of `mol` - (obtainable with :meth:`rdkit.Chem.rdchem.Mol.GetProp()`) are added as string-type - annotation array with the same name. + (obtainable with :meth:`rdkit.Chem.rdchem.Mol.GetProp()`) are added as annotation + array with the same name. ``element`` and ``charge`` are not inferred from properties but from the dedicated attributes in the :class:`rdkit.Chem.rdchem.Mol` object. @@ -335,8 +338,8 @@ def from_mol(mol, conformer_id=None, add_hydrogen=None): atoms.coord = np.array(conformer.GetPositions(), dtype=np.float32) extra_annotations = defaultdict( - # Use 'object' dtype first, as the maximum string length is unknown - lambda: np.full(atoms.array_length(), "", dtype=object) + # The dtype of each annotation array is inferred later + lambda: [None] * atoms.array_length() ) atoms.add_annotation("charge", int) atoms.add_annotation("b_factor", float) @@ -393,30 +396,17 @@ def from_mol(mol, conformer_id=None, add_hydrogen=None): atoms.atom_name[_atom_idx] = residue_info.GetName().strip() # ... add extra annotations - annot_names = rdkit_atom.GetPropNames() - for annot_name in annot_names: - extra_annotations[annot_name][_atom_idx] = rdkit_atom.GetProp(annot_name) - - for annot_name, array in extra_annotations.items(): - # ... handle special case of implicit hydrogen atom flags - if annot_name == "isImplicit": - atoms.set_annotation(annot_name, array.astype(bool)) - continue - - # ... for all custom annotations, try numeric conversions in order of preference - for dtype in (bool, int, float): - try: - converted = array.astype(dtype) - # Verify the conversion was lossless by converting back - if np.array_equal( - converted.astype(str), array.astype(str), equal_nan=True - ): - atoms.set_annotation(annot_name, converted) - break - except (ValueError, TypeError): - continue - else: # ... if no numeric conversion succeeded - atoms.set_annotation(annot_name, array.astype(str)) + for annot, value in rdkit_atom.GetPropsAsDict(includePrivate=False).items(): + extra_annotations[annot][_atom_idx] = value + + for annot, array in extra_annotations.items(): + # Handle special case of implicit hydrogen atom flags, + # that is set by 'AddHs()' to hydrogen atoms + if annot == "isImplicit": + annotation_array = np.array(array, dtype=bool) + else: + annotation_array = np.array(array) + atoms.set_annotation(annot, annotation_array) rdkit_bonds = list(mol.GetBonds()) is_aromatic = np.array( @@ -465,3 +455,18 @@ def from_mol(mol, conformer_id=None, add_hydrogen=None): def _has_explicit_hydrogen(mol): return mol.GetNumAtoms() > mol.GetNumHeavyAtoms() + + +def _set_property(atom, annot_name, value): + if isinstance(value, bool): + atom.SetBoolProp(annot_name, value) + elif isinstance(value, numbers.Integral): + atom.SetIntProp(annot_name, value) + elif isinstance(value, numbers.Real): + atom.SetDoubleProp(annot_name, value) + elif isinstance(value, str): + atom.SetProp(annot_name, value) + else: + raise TypeError( + f"Unsupported dtype '{type(value).__name__}' for annotation '{annot_name}'" + ) diff --git a/tests/interface/test_rdkit.py b/tests/interface/test_rdkit.py index 15d907842..58bd35272 100644 --- a/tests/interface/test_rdkit.py +++ b/tests/interface/test_rdkit.py @@ -1,3 +1,4 @@ +import itertools from pathlib import Path import numpy as np import pytest @@ -120,7 +121,7 @@ def test_conformer_selection(present_conformer_type, selected_conformer_type): """ mol = Chem.MolFromSmiles("C1=CC=CC=C1") conformer = Chem.Conformer(mol.GetNumAtoms()) - conformer.Set3D(present_conformer_type=="3D") + conformer.Set3D(present_conformer_type == "3D") mol.AddConformer(conformer) atoms = rdkit_interface.from_mol(mol, conformer_id=selected_conformer_type) @@ -162,6 +163,31 @@ def test_kekulization(): ) +@pytest.mark.parametrize( + "values", [(False, True), (1, 2, 3), (1.0, 1.5, 2.0), ("foo", "bar", "baz")] +) +def test_extra_annotations(values): + """ + Check if extra annotations can be added to a :class:`Mol` and recovered. + """ + EXTRA_ANNOT_NAME = "extra" + + ref_atoms = info.residue("ALA") + ref_annot = np.array( + list(itertools.islice(itertools.cycle(values), ref_atoms.array_length())) + ) + ref_atoms.set_annotation(EXTRA_ANNOT_NAME, ref_annot) + + mol = rdkit_interface.to_mol( + ref_atoms, include_extra_annotations=[EXTRA_ANNOT_NAME] + ) + test_atoms = rdkit_interface.from_mol(mol) + test_annot = test_atoms.get_annotation(EXTRA_ANNOT_NAME) + + assert test_annot.dtype == ref_annot.dtype + assert test_annot.tolist() == ref_annot.tolist() + + def test_unmappable_bond_type(): """ Test that a warning is raised when a bond type cannot be mapped to Biotite. From aed49b533a0e574491743a19f0d8fd5503da6254 Mon Sep 17 00:00:00 2001 From: Patrick Kunzmann Date: Sat, 22 Feb 2025 08:10:41 +0100 Subject: [PATCH 11/12] Note that the atoms are in the same order --- src/biotite/interface/rdkit/mol.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/src/biotite/interface/rdkit/mol.py b/src/biotite/interface/rdkit/mol.py index a357bb3d8..75e1f2fd5 100644 --- a/src/biotite/interface/rdkit/mol.py +++ b/src/biotite/interface/rdkit/mol.py @@ -114,13 +114,19 @@ def to_mol( If the input `atoms` is an :class:`AtomArrayStack`, all models are included as conformers with conformer IDs starting from ``0``. - Raised + Raises ------ BadStructureError If the input `atoms` does not have an associated :class:`BondList`. Also raises a :class:`BadStructureError`, if `explicit_hydrogen` is set to ``False`` despite hydrogen atoms being present in `atoms`. + Notes + ----- + The atoms in the return value are in the same order as the input `atoms`, + i.e. indices pointing to the :class:`rdkit.Chem.rdchem.Mol` can be used to point to + the same atoms in the :class:`.AtomArray`. + Examples -------- @@ -272,6 +278,10 @@ def from_mol(mol, conformer_id=None, add_hydrogen=None): Notes ----- + The atoms in the return value are in the same order as the input `mol`, + i.e. indices pointing to the :class:`rdkit.Chem.rdchem.Mol` can be used to point to + the same atoms in the :class:`.AtomArray`. + All atom-level properties of `mol` (obtainable with :meth:`rdkit.Chem.rdchem.Mol.GetProp()`) are added as annotation array with the same name. From e42473968e2f72b3c98bfce3ab4286c85151709f Mon Sep 17 00:00:00 2001 From: Patrick Kunzmann Date: Mon, 24 Feb 2025 14:55:36 +0100 Subject: [PATCH 12/12] Fix typo Co-authored-by: Simon Mathis --- src/biotite/interface/rdkit/mol.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/biotite/interface/rdkit/mol.py b/src/biotite/interface/rdkit/mol.py index 75e1f2fd5..06d4d38a2 100644 --- a/src/biotite/interface/rdkit/mol.py +++ b/src/biotite/interface/rdkit/mol.py @@ -104,7 +104,7 @@ def to_mol( If set to true, the conversion process expects that all hydrogen atoms are explicit, i.e. each each hydrogen atom must be part of the :class:`AtomArray`. If set to false, the conversion process treats all hydrogen atoms as implicit. - By default, explicit hydrogen atoms are only assumed of any hydrogen atoms are + By default, explicit hydrogen atoms are only assumed if any hydrogen atoms are present in `atoms`. Returns