From 9bc0f766cb75831f7bc4681e8ab9d39711825d7e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcel=20M=C3=BCller?= Date: Wed, 16 Apr 2025 13:14:52 +0200 Subject: [PATCH 1/8] enhance filtering script with verbosity and output file MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Marcel Müller --- scripts/filter_by_composition.py | 39 ++++++++++++++++++++++++-------- 1 file changed, 29 insertions(+), 10 deletions(-) diff --git a/scripts/filter_by_composition.py b/scripts/filter_by_composition.py index c595e9f..87788f8 100644 --- a/scripts/filter_by_composition.py +++ b/scripts/filter_by_composition.py @@ -9,15 +9,20 @@ from mindlessgen.molecules import Molecule # type: ignore -def get_molecules_from_filesystem(keyword: str) -> list[Molecule]: +def get_molecules_from_filesystem(keyword: str, verbosity: int) -> list[Molecule]: """ Get a list of molecules from the filesystem. """ # check if the file exists - if not Path(keyword).exists(): - raise FileNotFoundError(f"File '{keyword}' does not exist.") + file_object = Path(keyword).resolve() + if not file_object.exists(): + raise FileNotFoundError(f"File/Directory '{keyword}' does not exist.") + if not file_object.is_file(): + raise NotImplementedError("Reading from directories is not implemented yet.") # read the file - with open(keyword, encoding="utf-8") as file: + if verbosity > 0: + print(f"Reading file: {file_object}") + with open(file_object, encoding="utf-8") as file: mol_names = file.readlines() # get the molecules and return them mol_list: list[Molecule] = [] @@ -37,6 +42,9 @@ def get_args() -> argparse.Namespace: parser = argparse.ArgumentParser( description="Detect fragments for a given list of molecules." ) + parser.add_argument( + "--verbosity", "-v", type=int, default=1, help="Verbosity level." + ) parser.add_argument( "--keyword", type=str, @@ -51,6 +59,13 @@ def get_args() -> argparse.Namespace: default=None, help="Allowed elements for the molecules. Format example: `--allowed-elements '57-71, 81-*'", ) + parser.add_argument( + "--output-file", + type=str, + required=False, + default="selected_elements_molecules.list", + help="Output file for the selected elements.", + ) return parser.parse_args() @@ -90,15 +105,19 @@ def main() -> int: from the command line. """ args = get_args() - mols = get_molecules_from_filesystem(keyword=args.keyword) allowed_elements = parse_allowed_elements(args.allowed_elements) - with open( - "selected_elements_molecules.list", "w", encoding="utf8" - ) as sel_elem_file: - for mol in tqdm(mols, desc="Detecting fragments...", unit="molecule"): + output_file = Path(args.output_file).resolve() + if args.verbosity > 0: + print(f"Allowed elements: {allowed_elements}") + print(f"Output file: {output_file}") + + mols = get_molecules_from_filesystem(keyword=args.keyword, verbosity=args.verbosity) + with open(output_file, "w", encoding="utf8") as sel_elem_file: + for mol in tqdm(mols, desc="Checking composition...", unit="molecule"): # check if all elements in the molecule are allowed if all(ati in allowed_elements for ati in mol.ati): - print(f"Molecule {mol.name} has only allowed elements.") + if args.verbosity > 1: + print(f"Molecule {mol.name} has only allowed elements.") sel_elem_file.write(mol.name + "\n") return 0 From 3c0ee8f738d50a20859a748363bb18dc8ac29f3e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcel=20M=C3=BCller?= Date: Wed, 16 Apr 2025 14:38:48 +0200 Subject: [PATCH 2/8] option for required elements MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Marcel Müller --- scripts/filter_by_composition.py | 98 +++++++++++++++++++++++++++++--- 1 file changed, 89 insertions(+), 9 deletions(-) diff --git a/scripts/filter_by_composition.py b/scripts/filter_by_composition.py index 87788f8..13bbc08 100644 --- a/scripts/filter_by_composition.py +++ b/scripts/filter_by_composition.py @@ -55,9 +55,27 @@ def get_args() -> argparse.Namespace: parser.add_argument( "--allowed-elements", type=str, - required=True, + required=False, + default=None, + help="Allowed elements for the molecules. " + + "Format example: `--allowed-elements '57-71, 81-*'", + ) + parser.add_argument( + "--required-elements-all", + type=str, + required=False, + default=None, + help="Required element(s) that MUST be in each molecule (ALL of them must be contained). " + + "Format example: `--required-elements-all '57-71, 81-*'", + ) + parser.add_argument( + "--required-elements-one", + type=str, + required=False, default=None, - help="Allowed elements for the molecules. Format example: `--allowed-elements '57-71, 81-*'", + help="Required element(s) that MUST be in each molecule " + + "(at least one of them must be contained). " + + "Format example: `--required-elements-one '57-71, 81-*'", ) parser.add_argument( "--output-file", @@ -69,7 +87,7 @@ def get_args() -> argparse.Namespace: return parser.parse_args() -def parse_allowed_elements(allowed_elements: str) -> list[int]: +def parse_element_list(allowed_elements: str) -> list[int]: """ Parse the allowed elements from a string. """ @@ -105,20 +123,82 @@ def main() -> int: from the command line. """ args = get_args() - allowed_elements = parse_allowed_elements(args.allowed_elements) + if ( + not args.allowed_elements + and not args.required_elements_all + and not args.required_elements_one + ): + raise ValueError( + "Either --allowed-elements or --required-elements_XXX must be provided." + ) + if args.required_elements_all and args.required_elements_one: + raise ValueError( + "Both --required-elements-all and --required-elements-one cannot be provided at the same time." + ) + if args.allowed_elements: + allowed_elements = parse_element_list(args.allowed_elements) + if args.required_elements_all: + required_elements_all = parse_element_list(args.required_elements_all) + if args.required_elements_one: + required_elements_one = parse_element_list(args.required_elements_one) + output_file = Path(args.output_file).resolve() if args.verbosity > 0: - print(f"Allowed elements: {allowed_elements}") + if args.allowed_elements: + print(f"Allowed elements: {allowed_elements}") print(f"Output file: {output_file}") + # required elements is a list of tuples + # one tuple per set of required elements that must be contained at the same time + # e.g. [(55, 56)] means that both 55 and 56 must be contained in the molecule + # [(54),(55)] means that either 54 or 55 must be contained in the molecule + required_elements: list[tuple] = [] + if args.required_elements_all: + required_elements.append(tuple(required_elements_all)) + if args.required_elements_one: + for elem in required_elements_one: + required_elements.append(tuple([elem])) + if args.verbosity > 0: + print(f"Required elements: {required_elements}") + mols = get_molecules_from_filesystem(keyword=args.keyword, verbosity=args.verbosity) with open(output_file, "w", encoding="utf8") as sel_elem_file: for mol in tqdm(mols, desc="Checking composition...", unit="molecule"): # check if all elements in the molecule are allowed - if all(ati in allowed_elements for ati in mol.ati): - if args.verbosity > 1: - print(f"Molecule {mol.name} has only allowed elements.") - sel_elem_file.write(mol.name + "\n") + if args.allowed_elements: + if all(ati in allowed_elements for ati in mol.ati): + if args.verbosity > 1: + print(f"Molecule {mol.name} has only allowed elements.") + else: + if args.verbosity > 1: + print(f"Molecule {mol.name} has forbidden elements.") + continue + if required_elements: + # check if all required elements are in the molecule + # loop over all tuples of required element combinations + contained_combinations: list[bool] = [False] * len(required_elements) + for k, req_elem in enumerate(required_elements): + # list of boolean values with the same length as the number of req_elem + contained: list[bool] = [False] * len(req_elem) + for i, ati in enumerate(req_elem): + # check if the required element is in the molecule + if ati in mol.ati: + contained[i] = True + # check if all elements of the respective required element combination are found + if all(contained): + contained_combinations[k] = True + # check if any of the combinations is True + if any(contained_combinations): + if args.verbosity > 1: + print(f"Molecule {mol.name} has the required elements.") + else: + if args.verbosity > 1: + print( + f"Molecule {mol.name} does not have the required elements." + ) + continue + + sel_elem_file.write(mol.name + "\n") return 0 From 70186f32ca840f02be50c30afc82fe8476537635 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcel=20M=C3=BCller?= Date: Wed, 16 Apr 2025 14:48:58 +0200 Subject: [PATCH 3/8] put required elements in separate function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Marcel Müller --- scripts/filter_by_composition.py | 56 ++++++++++++++++++-------------- 1 file changed, 32 insertions(+), 24 deletions(-) diff --git a/scripts/filter_by_composition.py b/scripts/filter_by_composition.py index 13bbc08..356373a 100644 --- a/scripts/filter_by_composition.py +++ b/scripts/filter_by_composition.py @@ -117,6 +117,34 @@ def parse_element_list(allowed_elements: str) -> list[int]: return sorted(list(set_allowed_elements)) +def molecule_has_required_elements( + mol: Molecule, required_elements: list[tuple], verbosity: int +) -> bool: + """ + Check whether a molecule contains the required elements. + """ + # loop over all tuples of required element combinations + contained_combinations: list[bool] = [False] * len(required_elements) + for k, req_elem in enumerate(required_elements): + # list of boolean values with the same length as the number of req_elem + contained: list[bool] = [False] * len(req_elem) + for i, ati in enumerate(req_elem): + # check if the required element is in the molecule + if ati in mol.ati: + contained[i] = True + # check if all elements of the respective required element combination are found + if all(contained): + contained_combinations[k] = True + # check if any of the combinations is True + if any(contained_combinations): + if verbosity > 1: + print(f"Molecule {mol.name} has the required elements.") + return True + if verbosity > 1: + print(f"Molecule {mol.name} does not have the required elements.") + return False + + def main() -> int: """ Main function that is called when the script is executed @@ -173,30 +201,10 @@ def main() -> int: if args.verbosity > 1: print(f"Molecule {mol.name} has forbidden elements.") continue - if required_elements: - # check if all required elements are in the molecule - # loop over all tuples of required element combinations - contained_combinations: list[bool] = [False] * len(required_elements) - for k, req_elem in enumerate(required_elements): - # list of boolean values with the same length as the number of req_elem - contained: list[bool] = [False] * len(req_elem) - for i, ati in enumerate(req_elem): - # check if the required element is in the molecule - if ati in mol.ati: - contained[i] = True - # check if all elements of the respective required element combination are found - if all(contained): - contained_combinations[k] = True - # check if any of the combinations is True - if any(contained_combinations): - if args.verbosity > 1: - print(f"Molecule {mol.name} has the required elements.") - else: - if args.verbosity > 1: - print( - f"Molecule {mol.name} does not have the required elements." - ) - continue + if not molecule_has_required_elements( + mol, required_elements, args.verbosity + ): + continue sel_elem_file.write(mol.name + "\n") From 29732d3ea18a1857fdb3060b69d77a810b9caf5b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcel=20M=C3=BCller?= Date: Wed, 16 Apr 2025 15:12:50 +0200 Subject: [PATCH 4/8] investigate directory structures MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Marcel Müller --- scripts/filter_by_composition.py | 55 +++++++++++++++++++++++++------- 1 file changed, 44 insertions(+), 11 deletions(-) diff --git a/scripts/filter_by_composition.py b/scripts/filter_by_composition.py index 356373a..26fccd4 100644 --- a/scripts/filter_by_composition.py +++ b/scripts/filter_by_composition.py @@ -19,20 +19,51 @@ def get_molecules_from_filesystem(keyword: str, verbosity: int) -> list[Molecule raise FileNotFoundError(f"File/Directory '{keyword}' does not exist.") if not file_object.is_file(): raise NotImplementedError("Reading from directories is not implemented yet.") - # read the file + ### Process molecules from list of files (molecules) if verbosity > 0: print(f"Reading file: {file_object}") with open(file_object, encoding="utf-8") as file: mol_names = file.readlines() - # get the molecules and return them + ### Get the molecules and return them + # Test directory structure first + if Path(mol_names[0].strip() + ".xyz").exists(): + format: str = "xyz" + elif Path(mol_names[0].strip()).is_dir(): + format = "dir" + # search all XYZ files in the test directory + xyz_files = list(Path(mol_names[0].strip()).glob("*.xyz")) + # if more than one file is found, raise an error + if len(xyz_files) > 1: + raise ValueError( + "More than one XYZ file found in the directory. " + + "Please specify the file name." + ) mol_list: list[Molecule] = [] - for mol_name in tqdm( - mol_names, desc="Processing molecules from files...", unit="molecule" - ): - mol_name = mol_name.strip() - mol = Molecule.read_mol_from_file(mol_name + ".xyz") - mol_list.append(mol) - return mol_list + if format == "xyz": + for mol_name in tqdm( + mol_names, desc="Processing molecules from files...", unit="molecule" + ): + mol_name = mol_name.strip() + mol = Molecule.read_mol_from_file(mol_name + ".xyz") + mol_list.append(mol) + return mol_list + elif format == "dir": + # read all XYZ files in the directory + for mol_name in tqdm( + mol_names, desc="Processing molecules from files...", unit="molecule" + ): + # path to the xyz file is obtained by grepping for "*.xyz" in the mol_name directory + mol_name = mol_name.strip() + xyz_file = list(Path(mol_name).glob("*.xyz")) + mol = Molecule.read_mol_from_file(xyz_file[0]) + mol.name = mol_name + mol_list.append(mol) + return mol_list + else: + raise ValueError( + "File format not supported. " + + "Please specify the file name with the extension." + ) def get_args() -> argparse.Namespace: @@ -201,8 +232,10 @@ def main() -> int: if args.verbosity > 1: print(f"Molecule {mol.name} has forbidden elements.") continue - if not molecule_has_required_elements( - mol, required_elements, args.verbosity + if required_elements and ( + not molecule_has_required_elements( + mol, required_elements, args.verbosity + ) ): continue From 4bbbee6f492186809e3b9a9ff84e5f6d07a39a64 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcel=20M=C3=BCller?= Date: Tue, 22 Apr 2025 14:06:26 +0200 Subject: [PATCH 5/8] add stereoisomer filtering MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Marcel Müller --- scripts/filter_by_composition.py | 81 ++++++------------- scripts/filter_stereoisomers.py | 81 +++++++++++++++++++ scripts/fragment_detection.py | 26 +----- src/mindlessgen/molecules/__init__.py | 5 +- src/mindlessgen/molecules/miscellaneous.py | 93 +++++++++++++++++++++- src/mindlessgen/molecules/refinement.py | 62 ++++++++++----- 6 files changed, 244 insertions(+), 104 deletions(-) create mode 100644 scripts/filter_stereoisomers.py diff --git a/scripts/filter_by_composition.py b/scripts/filter_by_composition.py index 26fccd4..0fa68be 100644 --- a/scripts/filter_by_composition.py +++ b/scripts/filter_by_composition.py @@ -7,63 +7,7 @@ from pathlib import Path from tqdm import tqdm from mindlessgen.molecules import Molecule # type: ignore - - -def get_molecules_from_filesystem(keyword: str, verbosity: int) -> list[Molecule]: - """ - Get a list of molecules from the filesystem. - """ - # check if the file exists - file_object = Path(keyword).resolve() - if not file_object.exists(): - raise FileNotFoundError(f"File/Directory '{keyword}' does not exist.") - if not file_object.is_file(): - raise NotImplementedError("Reading from directories is not implemented yet.") - ### Process molecules from list of files (molecules) - if verbosity > 0: - print(f"Reading file: {file_object}") - with open(file_object, encoding="utf-8") as file: - mol_names = file.readlines() - ### Get the molecules and return them - # Test directory structure first - if Path(mol_names[0].strip() + ".xyz").exists(): - format: str = "xyz" - elif Path(mol_names[0].strip()).is_dir(): - format = "dir" - # search all XYZ files in the test directory - xyz_files = list(Path(mol_names[0].strip()).glob("*.xyz")) - # if more than one file is found, raise an error - if len(xyz_files) > 1: - raise ValueError( - "More than one XYZ file found in the directory. " - + "Please specify the file name." - ) - mol_list: list[Molecule] = [] - if format == "xyz": - for mol_name in tqdm( - mol_names, desc="Processing molecules from files...", unit="molecule" - ): - mol_name = mol_name.strip() - mol = Molecule.read_mol_from_file(mol_name + ".xyz") - mol_list.append(mol) - return mol_list - elif format == "dir": - # read all XYZ files in the directory - for mol_name in tqdm( - mol_names, desc="Processing molecules from files...", unit="molecule" - ): - # path to the xyz file is obtained by grepping for "*.xyz" in the mol_name directory - mol_name = mol_name.strip() - xyz_file = list(Path(mol_name).glob("*.xyz")) - mol = Molecule.read_mol_from_file(xyz_file[0]) - mol.name = mol_name - mol_list.append(mol) - return mol_list - else: - raise ValueError( - "File format not supported. " - + "Please specify the file name with the extension." - ) +from mindlessgen.molecules import get_molecules_from_filesystem # type: ignore def get_args() -> argparse.Namespace: @@ -108,6 +52,20 @@ def get_args() -> argparse.Namespace: + "(at least one of them must be contained). " + "Format example: `--required-elements-one '57-71, 81-*'", ) + parser.add_argument( + "--min-charge", + type=int, + required=False, + default=None, + help="Minimum charge for the molecules." + "Format example: `--min-charge -1`", + ) + parser.add_argument( + "--max-charge", + type=int, + required=False, + default=None, + help="Maximum charge for the molecules." + "Format example: `--max-charge 2`", + ) parser.add_argument( "--output-file", type=str, @@ -239,6 +197,15 @@ def main() -> int: ): continue + if args.min_charge is not None and mol.charge < args.min_charge: + if args.verbosity > 1: + print(f"Molecule {mol.name} has charge {mol.charge}.") + continue + if args.max_charge is not None and mol.charge > args.max_charge: + if args.verbosity > 1: + print(f"Molecule {mol.name} has charge {mol.charge}.") + continue + sel_elem_file.write(mol.name + "\n") return 0 diff --git a/scripts/filter_stereoisomers.py b/scripts/filter_stereoisomers.py new file mode 100644 index 0000000..c90af9f --- /dev/null +++ b/scripts/filter_stereoisomers.py @@ -0,0 +1,81 @@ +""" +Python script that is based on MindlessGen +and filters compounds that are redundant stereoisomers. +""" + +import argparse +from pathlib import Path +from collections import defaultdict + +from tqdm import tqdm +import networkx as nx # type: ignore + +from mindlessgen.molecules import get_molecules_from_filesystem # type: ignore +from mindlessgen.molecules import get_molecular_graph # type: ignore + + +def get_args() -> argparse.Namespace: + """ + Get the command line arguments. + """ + parser = argparse.ArgumentParser( + description="Detect stereoisomers for a given list of molecules." + ) + parser.add_argument( + "--verbosity", "-v", type=int, default=1, help="Verbosity level." + ) + parser.add_argument( + "--keyword", + type=str, + required=False, + default="molecules.list", + help="Keyword for the file that contains the list of molecules.", + ) + parser.add_argument( + "--output-file", + type=str, + required=False, + default="selected_elements_molecules.list", + help="Output file for the selected elements.", + ) + return parser.parse_args() + + +def main() -> int: + """ + Main function that is called when the script is executed. + """ + args = get_args() + output_file = Path(args.output_file).resolve() + if args.verbosity > 0: + print(f"Output file: {output_file}") + mols = get_molecules_from_filesystem(keyword=args.keyword, verbosity=args.verbosity) + + seen_hashes: defaultdict[str, list[str]] = defaultdict( + list + ) # maps graph hashes to list of mol indices or names + + with open(output_file, "w", encoding="utf8") as sel_elem_file: + for i, mol in enumerate( + tqdm(mols, desc="Checking composition...", unit="molecule") + ): + graph = get_molecular_graph(mol, 1.25, verbosity=args.verbosity) + + # Get WL hash with atom type info + g_hash = nx.weisfeiler_lehman_graph_hash(graph, node_attr="element") + + if g_hash in seen_hashes.keys(): + if args.verbosity > 0: + print( + f"Found stereoisomer: {seen_hashes[g_hash]} and {mol.name} with hash {g_hash}" + ) + seen_hashes[g_hash].append(mol.name) + continue + seen_hashes[g_hash].append(mol.name) + sel_elem_file.write(mol.name + "\n") + + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/scripts/fragment_detection.py b/scripts/fragment_detection.py index 34e3219..5fa1b1d 100644 --- a/scripts/fragment_detection.py +++ b/scripts/fragment_detection.py @@ -6,28 +6,8 @@ import argparse from pathlib import Path from tqdm import tqdm -from mindlessgen.molecules import Molecule, detect_fragments # type: ignore - - -def get_molecules_from_filesystem(keyword: str) -> list[Molecule]: - """ - Get a list of molecules from the filesystem. - """ - # check if the file exists - if not Path(keyword).exists(): - raise FileNotFoundError(f"File '{keyword}' does not exist.") - # read the file - with open(keyword, encoding="utf-8") as file: - mol_names = file.readlines() - # get the molecules and return them - mol_list: list[Molecule] = [] - for mol_name in tqdm( - mol_names, desc="Processing molecules from files...", unit="molecule" - ): - mol_name = mol_name.strip() - mol = Molecule.read_mol_from_file(mol_name + ".xyz") - mol_list.append(mol) - return mol_list +from mindlessgen.molecules import detect_fragments # type: ignore +from mindlessgen.molecules import get_molecules_from_filesystem # type: ignore def get_args() -> argparse.Namespace: @@ -61,7 +41,7 @@ def main() -> int: from the command line. """ args = get_args() - mols = get_molecules_from_filesystem(keyword=args.keyword) + mols = get_molecules_from_filesystem(keyword=args.keyword, verbosity=0) # create new directory "new_single_molecules" if it does not exist newmoldir = Path("fragments").resolve() newmoldir.mkdir(exist_ok=True, parents=True) diff --git a/src/mindlessgen/molecules/__init__.py b/src/mindlessgen/molecules/__init__.py index a167c46..95e3365 100644 --- a/src/mindlessgen/molecules/__init__.py +++ b/src/mindlessgen/molecules/__init__.py @@ -13,7 +13,7 @@ generate_atom_list, check_distances, ) -from .refinement import iterative_optimization, detect_fragments +from .refinement import iterative_optimization, detect_fragments, get_molecular_graph from .postprocess import postprocess_mol from .miscellaneous import ( get_cov_radii, @@ -25,6 +25,7 @@ get_actinides, get_alkali_metals, get_alkaline_earth_metals, + get_molecules_from_filesystem, ) __all__ = [ @@ -34,6 +35,7 @@ "generate_atom_list", "iterative_optimization", "detect_fragments", + "get_molecular_graph", "get_cov_radii", "set_random_charge", "check_distances", @@ -44,6 +46,7 @@ "get_actinides", "get_alkali_metals", "get_alkaline_earth_metals", + "get_molecules_from_filesystem", "ati_to_atlist", "atlist_to_ati", "postprocess_mol", diff --git a/src/mindlessgen/molecules/miscellaneous.py b/src/mindlessgen/molecules/miscellaneous.py index 4a59c5e..9952b2f 100644 --- a/src/mindlessgen/molecules/miscellaneous.py +++ b/src/mindlessgen/molecules/miscellaneous.py @@ -2,9 +2,12 @@ Molecule-related helper tools. """ +from pathlib import Path +from tqdm import tqdm + import numpy as np -from .molecule import ati_to_atlist +from .molecule import ati_to_atlist, Molecule from ..data.parameters import COV_RADII_PYYKKO, COV_RADII_MLMGEN @@ -173,3 +176,91 @@ def get_actinides() -> list[int]: """ actinides = list(range(88, 103)) return actinides + + +# NOTE: Required by external helper scripts. +def get_molecules_from_filesystem(keyword: str, verbosity: int) -> list[Molecule]: + """ + Get a list of molecules from the filesystem. + """ + # check if the file exists + file_object = Path(keyword).resolve() + if not file_object.exists(): + raise FileNotFoundError(f"File/Directory '{keyword}' does not exist.") + if not file_object.is_file(): + raise NotImplementedError("Reading from directories is not implemented yet.") + ### Process molecules from list of files (molecules) + if verbosity > 0: + print(f"Reading file: {file_object}") + with open(file_object, encoding="utf-8") as file: + mol_names = file.readlines() + ### Get the molecules and return them + # Test directory structure first + if Path(mol_names[0].strip() + ".xyz").exists(): + xyzformat: str = "xyz" + elif Path(mol_names[0].strip()).is_dir(): + xyzformat = "dir" + # search all XYZ files in the test directory + xyz_files = list(Path(mol_names[0].strip()).glob("*.xyz")) + # if more than one file is found, raise an error + if len(xyz_files) > 1: + raise ValueError( + "More than one XYZ file found in the directory. " + + "Please specify the file name." + ) + else: + raise ValueError( + "File format not supported. " + + "Please specify the file name with the extension." + ) + mol_list: list[Molecule] = [] + if xyzformat == "xyz": + for mol_name in tqdm( + mol_names, desc="Processing molecules from files...", unit="molecule" + ): + mol_name = mol_name.strip() + mol = Molecule.read_mol_from_file(mol_name + ".xyz") + mol_list.append(mol) + return mol_list + elif xyzformat == "dir": + # read all XYZ files in the directory + for mol_name in tqdm( + mol_names, desc="Processing molecules from files...", unit="molecule" + ): + # path to the xyz file is obtained by grepping for "*.xyz" in the mol_name directory + mol_name = mol_name.strip() + xyz_file = list(Path(mol_name).glob("*.xyz")) + mol = Molecule.read_mol_from_file(xyz_file[0]) + mol.name = mol_name + # check if the directory contains a ".CHRG" file + chrg_file = Path(mol_name).parent / ".CHRG" + if chrg_file.exists(): + with open(chrg_file, encoding="utf-8") as chrg: + chrg_lines = chrg.readlines() + # check if the file contains a charge + if len(chrg_lines) > 0: + try: + mol.charge = int(chrg_lines[0].strip()) + except ValueError as e: + raise ValueError( + f"Charge in file {chrg_file} is not an integer." + ) from e + uhf_file = Path(mol_name).parent / ".UHF" + if uhf_file.exists(): + with open(uhf_file, encoding="utf-8") as uhf: + uhf_lines = uhf.readlines() + # check if the file contains a charge + if len(uhf_lines) > 0: + try: + mol.charge = int(uhf_lines[0].strip()) + except ValueError as e: + raise ValueError( + f"Charge in file {uhf_file} is not an integer." + ) from e + mol_list.append(mol) + return mol_list + else: + raise ValueError( + "File format not supported. " + + "Please specify the file name with the extension." + ) diff --git a/src/mindlessgen/molecules/refinement.py b/src/mindlessgen/molecules/refinement.py index c851139..1fd3116 100644 --- a/src/mindlessgen/molecules/refinement.py +++ b/src/mindlessgen/molecules/refinement.py @@ -203,29 +203,8 @@ def detect_fragments( Returns: - A list of fragments, where each fragment is a Molecule object. """ - graph = nx.Graph() - # Add nodes (atoms) to the graph - graph.add_nodes_from(range(mol.num_atoms)) - - # Calculate pairwise distances and add edges if atoms are bonded - for i in range(mol.num_atoms - 1): - for j in range(i + 1, mol.num_atoms): - distance = np.linalg.norm(mol.xyz[i] - mol.xyz[j]) - sum_radii = get_cov_radii(mol.ati[i], COV_RADII) + get_cov_radii( - mol.ati[j], COV_RADII - ) - if verbosity > 2: - print(f"Distance between atom {i} and {j}: {distance:6.3f}") - print( - f"Covalent radii of atom {i} and {j}, " - + "and the effective threshold: " - + f"{get_cov_radii(mol.ati[i], COV_RADII):6.3f}, " - + f"{get_cov_radii(mol.ati[j], COV_RADII):6.3f}, " - + f"{(sum_radii * vdw_scaling):6.3f}" - ) - if distance <= sum_radii * vdw_scaling: - graph.add_edge(i, j) + graph = get_molecular_graph(mol, vdw_scaling, verbosity) # Detect connected components (fragments) fragments = [list(component) for component in nx.connected_components(graph)] @@ -268,3 +247,42 @@ def detect_fragments( fragment_molecules.append(fragment_molecule) return fragment_molecules + + +def get_molecular_graph( + mol: Molecule, + vdw_scaling: float, + verbosity: int = 1, +) -> nx.Graph: + """ + Generate a molecular graph from the molecule object. + The graph is undirected and contains nodes for each atom. + Edges are added between atoms that are bonded based on their distances + and covalent radii. + """ + graph = nx.Graph() + + # Add nodes (atoms) to the graph with atomic type as attribute + for i in range(mol.num_atoms): + graph.add_node(i, element=mol.ati[i]) + + # Calculate pairwise distances and add edges if atoms are bonded + for i in range(mol.num_atoms - 1): + for j in range(i + 1, mol.num_atoms): + distance = np.linalg.norm(mol.xyz[i] - mol.xyz[j]) + sum_radii = get_cov_radii(mol.ati[i], COV_RADII) + get_cov_radii( + mol.ati[j], COV_RADII + ) + if verbosity > 2: + print(f"Distance between atom {i} and {j}: {distance:6.3f}") + print( + f"Covalent radii of atom {i} and {j}, " + + "and the effective threshold: " + + f"{get_cov_radii(mol.ati[i], COV_RADII):6.3f}, " + + f"{get_cov_radii(mol.ati[j], COV_RADII):6.3f}, " + + f"{(sum_radii * vdw_scaling):6.3f}" + ) + if distance <= sum_radii * vdw_scaling: + graph.add_edge(i, j) + + return graph From a6043b03d3ee842ebef50ebddd253d1e10b78a60 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcel=20M=C3=BCller?= Date: Thu, 24 Apr 2025 09:56:59 +0200 Subject: [PATCH 6/8] add UHF max support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Marcel Müller --- scripts/filter_by_composition.py | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/scripts/filter_by_composition.py b/scripts/filter_by_composition.py index 0fa68be..d1d7776 100644 --- a/scripts/filter_by_composition.py +++ b/scripts/filter_by_composition.py @@ -66,6 +66,14 @@ def get_args() -> argparse.Namespace: default=None, help="Maximum charge for the molecules." + "Format example: `--max-charge 2`", ) + parser.add_argument( + "--max-uhf", + type=int, + required=False, + default=None, + help="Maximum number of unpaired electrons (UHF) for the molecules." + + " Format example: `--max-uhf 2`", + ) parser.add_argument( "--output-file", type=str, @@ -144,13 +152,18 @@ def main() -> int: not args.allowed_elements and not args.required_elements_all and not args.required_elements_one + and not args.min_charge + and not args.max_charge + and not args.max_uhf ): raise ValueError( - "Either --allowed-elements or --required-elements_XXX must be provided." + "Either --allowed-elements, --required-elements_XXX, --min-charge, " + + "--max-charge, or --max-uhf must be provided." ) if args.required_elements_all and args.required_elements_one: raise ValueError( - "Both --required-elements-all and --required-elements-one cannot be provided at the same time." + "Both --required-elements-all and " + + "--required-elements-one cannot be provided at the same time." ) if args.allowed_elements: allowed_elements = parse_element_list(args.allowed_elements) @@ -205,6 +218,10 @@ def main() -> int: if args.verbosity > 1: print(f"Molecule {mol.name} has charge {mol.charge}.") continue + if args.max_uhf is not None and mol.uhf > args.max_uhf: + if args.verbosity > 1: + print(f"Molecule {mol.name} has UHF {mol.uhf}.") + continue sel_elem_file.write(mol.name + "\n") From 5ce08ac4b7c383303d110c563e3e1bf76a0e3491 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcel=20M=C3=BCller?= Date: Thu, 24 Apr 2025 09:57:39 +0200 Subject: [PATCH 7/8] increase verbosity level for print MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Marcel Müller --- scripts/filter_stereoisomers.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/scripts/filter_stereoisomers.py b/scripts/filter_stereoisomers.py index c90af9f..c3968d1 100644 --- a/scripts/filter_stereoisomers.py +++ b/scripts/filter_stereoisomers.py @@ -65,9 +65,10 @@ def main() -> int: g_hash = nx.weisfeiler_lehman_graph_hash(graph, node_attr="element") if g_hash in seen_hashes.keys(): - if args.verbosity > 0: + if args.verbosity > 1: print( - f"Found stereoisomer: {seen_hashes[g_hash]} and {mol.name} with hash {g_hash}" + f"Found stereoisomer: {seen_hashes[g_hash]} " + + f"and {mol.name} with hash {g_hash}" ) seen_hashes[g_hash].append(mol.name) continue From 38333cec402328946217e2acd21ede9b729dbf53 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcel=20M=C3=BCller?= Date: Thu, 24 Apr 2025 10:12:07 +0200 Subject: [PATCH 8/8] correct wrong assignment MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Marcel Müller --- src/mindlessgen/molecules/miscellaneous.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/mindlessgen/molecules/miscellaneous.py b/src/mindlessgen/molecules/miscellaneous.py index 9952b2f..66e138e 100644 --- a/src/mindlessgen/molecules/miscellaneous.py +++ b/src/mindlessgen/molecules/miscellaneous.py @@ -252,10 +252,10 @@ def get_molecules_from_filesystem(keyword: str, verbosity: int) -> list[Molecule # check if the file contains a charge if len(uhf_lines) > 0: try: - mol.charge = int(uhf_lines[0].strip()) + mol.uhf = int(uhf_lines[0].strip()) except ValueError as e: raise ValueError( - f"Charge in file {uhf_file} is not an integer." + f"UHF in file {uhf_file} is not an integer." ) from e mol_list.append(mol) return mol_list