From 9da275841e0085138e37cf9cea30a88e997f4171 Mon Sep 17 00:00:00 2001 From: Josh Horton Date: Wed, 12 Feb 2025 18:04:59 +0000 Subject: [PATCH 1/3] add basic name sanitization --- gufe/components/explicitmoleculecomponent.py | 13 ++++++++++++- gufe/tests/test_smallmoleculecomponent.py | 11 ++++++++++- 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/gufe/components/explicitmoleculecomponent.py b/gufe/components/explicitmoleculecomponent.py index 827c65b85..091a434a8 100644 --- a/gufe/components/explicitmoleculecomponent.py +++ b/gufe/components/explicitmoleculecomponent.py @@ -14,7 +14,8 @@ def _ensure_ofe_name(mol: RDKitMol, name: str) -> str: """ Determine the correct name from the rdkit.Chem.Mol and the user-provided - name; ensure that is set in the rdkit representation. + name; ensure that is set in the rdkit representation. We also perform some sanitization of the name + to help downstream tools. """ try: rdkit_name = mol.GetProp("_Name") @@ -31,6 +32,16 @@ def _ensure_ofe_name(mol: RDKitMol, name: str) -> str: elif name == "": name = rdkit_name + # sanitize the name before we set it + # list of characters to replace + to_replace = [" ", "/"] + if any([i in name for i in to_replace]): + # strip leading and trailing whitespace + name = name.strip() + for i in to_replace: + name = name.replace(i, "-") + warnings.warn(f"Component name sanitized to: {name}") + mol.SetProp("ofe-name", name) return name diff --git a/gufe/tests/test_smallmoleculecomponent.py b/gufe/tests/test_smallmoleculecomponent.py index 9287c79db..8b4cfa101 100644 --- a/gufe/tests/test_smallmoleculecomponent.py +++ b/gufe/tests/test_smallmoleculecomponent.py @@ -52,6 +52,10 @@ def named_ethane(): ("bar", "", "foo", "foo"), ("baz", "bar", "foo", "foo"), ("foo", "", "", "foo"), + ("foo/bar", "", "", "foo-bar"), + (" foo bar ", "", "", "foo-bar"), + ("foo", "foo/bar", "", "foo-bar"), + ("", "", "foo/bar", "foo-bar") ], ) def test_ensure_ofe_name(internal, rdkit_name, name, expected, 
recwarn): @@ -64,7 +68,12 @@ def test_ensure_ofe_name(internal, rdkit_name, name, expected, recwarn): out_name = _ensure_ofe_name(rdkit, name) - if {rdkit_name, internal} - {"foo", ""}: + if "-" in expected: + # we should warn if we have sanitized the name + assert len(recwarn) == 1 + assert f"Component name sanitized to: {expected}" in recwarn[0].message.args[0] + + elif {rdkit_name, internal} - {"foo", ""}: # we should warn if rdkit properties are anything other than 'foo' # (expected) or the empty string (not set) assert len(recwarn) == 1 From 89415129d470c96aa6db38dc66c6a57caec3a889 Mon Sep 17 00:00:00 2001 From: Josh Horton Date: Fri, 14 Feb 2025 12:44:18 +0000 Subject: [PATCH 2/3] replace characters and add news --- gufe/components/explicitmoleculecomponent.py | 4 ++-- gufe/tests/test_smallmoleculecomponent.py | 11 +++++----- news/ligand_name_cleanup.rst | 23 ++++++++++++++++++++ 3 files changed, 31 insertions(+), 7 deletions(-) create mode 100644 news/ligand_name_cleanup.rst diff --git a/gufe/components/explicitmoleculecomponent.py b/gufe/components/explicitmoleculecomponent.py index 091a434a8..e23c5f96e 100644 --- a/gufe/components/explicitmoleculecomponent.py +++ b/gufe/components/explicitmoleculecomponent.py @@ -34,12 +34,12 @@ def _ensure_ofe_name(mol: RDKitMol, name: str) -> str: # sanitize the name before we set it # list of characters to replace - to_replace = [" ", "/"] + to_replace = [" ", "/", "*"] if any([i in name for i in to_replace]): # strip leading and trailing whitespace name = name.strip() for i in to_replace: - name = name.replace(i, "-") + name = name.replace(i, "") warnings.warn(f"Component name sanitized to: {name}") mol.SetProp("ofe-name", name) diff --git a/gufe/tests/test_smallmoleculecomponent.py b/gufe/tests/test_smallmoleculecomponent.py index 8b4cfa101..0991963f7 100644 --- a/gufe/tests/test_smallmoleculecomponent.py +++ b/gufe/tests/test_smallmoleculecomponent.py @@ -52,10 +52,11 @@ def named_ethane(): ("bar", "", "foo", "foo"), 
("baz", "bar", "foo", "foo"), ("foo", "", "", "foo"), - ("foo/bar", "", "", "foo-bar"), - (" foo bar ", "", "", "foo-bar"), - ("foo", "foo/bar", "", "foo-bar"), - ("", "", "foo/bar", "foo-bar") + ("foo/bar", "", "", "foobar"), + (" foo bar ", "", "", "foobar"), + ("foo*bar", "", "", "foobar"), + ("foo", "foo/bar", "", "foobar"), + ("", "", "foo/bar", "foobar") ], ) def test_ensure_ofe_name(internal, rdkit_name, name, expected, recwarn): @@ -68,7 +69,7 @@ def test_ensure_ofe_name(internal, rdkit_name, name, expected, recwarn): out_name = _ensure_ofe_name(rdkit, name) - if "-" in expected: + if "bar" in expected: # we should warn if we have sanitized the name assert len(recwarn) == 1 assert f"Component name sanitized to: {expected}" in recwarn[0].message.args[0] diff --git a/news/ligand_name_cleanup.rst b/news/ligand_name_cleanup.rst new file mode 100644 index 000000000..00d1de15b --- /dev/null +++ b/news/ligand_name_cleanup.rst @@ -0,0 +1,23 @@ +**Added:** + +* + +**Changed:** + +* The name of an ``ExplicitMoleculeComponent`` and its subclasses will now be sanitized to remove whitespace, ``/`` and ``*`` characters; a warning will be displayed when this happens. 
+ +**Deprecated:** + +* + +**Removed:** + +* + +**Fixed:** + +* + +**Security:** + +* From b2a418b44dc693553d6c242bec6ff0383e91e03c Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 14 Feb 2025 12:51:13 +0000 Subject: [PATCH 3/3] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- gufe/tests/test_smallmoleculecomponent.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gufe/tests/test_smallmoleculecomponent.py b/gufe/tests/test_smallmoleculecomponent.py index 0991963f7..a6f8b2c30 100644 --- a/gufe/tests/test_smallmoleculecomponent.py +++ b/gufe/tests/test_smallmoleculecomponent.py @@ -56,7 +56,7 @@ def named_ethane(): (" foo bar ", "", "", "foobar"), ("foo*bar", "", "", "foobar"), ("foo", "foo/bar", "", "foobar"), - ("", "", "foo/bar", "foobar") + ("", "", "foo/bar", "foobar"), ], ) def test_ensure_ofe_name(internal, rdkit_name, name, expected, recwarn):