grimme-lab · marcelmbn · Apr 16, 2025 · Apr 16, 2025 · Apr 16, 2025 · Apr 16, 2025
@@ -7,27 +7,7 @@
 from pathlib import Path
 from tqdm import tqdm
 from mindlessgen.molecules import Molecule  # type: ignore
-
-
-def get_molecules_from_filesystem(keyword: str) -> list[Molecule]:
-    """
-    Get a list of molecules from the filesystem.
-    """
-    # check if the file exists
-    if not Path(keyword).exists():
-        raise FileNotFoundError(f"File '{keyword}' does not exist.")
-    # read the file
-    with open(keyword, encoding="utf-8") as file:
-        mol_names = file.readlines()
-    # get the molecules and return them
-    mol_list: list[Molecule] = []
-    for mol_name in tqdm(
-        mol_names, desc="Processing molecules from files...", unit="molecule"
-    ):
-        mol_name = mol_name.strip()
-        mol = Molecule.read_mol_from_file(mol_name + ".xyz")
-        mol_list.append(mol)
-    return mol_list
+from mindlessgen.molecules import get_molecules_from_filesystem  # type: ignore
 
 
 def get_args() -> argparse.Namespace:
@@ -37,6 +17,9 @@ def get_args() -> argparse.Namespace:
     parser = argparse.ArgumentParser(
         description="Detect fragments for a given list of molecules."
     )
+    parser.add_argument(
+        "--verbosity", "-v", type=int, default=1, help="Verbosity level."
+    )
     parser.add_argument(
         "--keyword",
         type=str,
@@ -47,14 +30,61 @@ def get_args() -> argparse.Namespace:
     parser.add_argument(
         "--allowed-elements",
         type=str,
-        required=True,
+        required=False,
+        default=None,
+        help="Allowed elements for the molecules. "
+        + "Format example: `--allowed-elements '57-71, 81-*'",
+    )
+    parser.add_argument(
+        "--required-elements-all",
+        type=str,
+        required=False,
+        default=None,
+        help="Required element(s) that MUST be in each molecule (ALL of them must be contained). "
+        + "Format example: `--required-elements-all '57-71, 81-*'",
+    )
+    parser.add_argument(
+        "--required-elements-one",
+        type=str,
+        required=False,
+        default=None,
+        help="Required element(s) that MUST be in each molecule "
+        + "(at least one of them must be contained). "
+        + "Format example: `--required-elements-one '57-71, 81-*'",
+    )
+    parser.add_argument(
+        "--min-charge",
+        type=int,
+        required=False,
         default=None,
-        help="Allowed elements for the molecules. Format example: `--allowed-elements '57-71, 81-*'",
+        help="Minimum charge for the molecules. " + "Format example: `--min-charge -1`",
+    )
+    parser.add_argument(
+        "--max-charge",
+        type=int,
+        required=False,
+        default=None,
+        help="Maximum charge for the molecules. " + "Format example: `--max-charge 2`",
+    )
+    parser.add_argument(
+        "--max-uhf",
+        type=int,
+        required=False,
+        default=None,
+        help="Maximum number of unpaired electrons (UHF) for the molecules."
+        + " Format example: `--max-uhf 2`",
+    )
+    parser.add_argument(
+        "--output-file",
+        type=str,
+        required=False,
+        default="selected_elements_molecules.list",
+        help="Output file for the selected elements.",
     )
     return parser.parse_args()
 
 
-def parse_allowed_elements(allowed_elements: str) -> list[int]:
+def parse_element_list(allowed_elements: str) -> list[int]:
     """
     Parse the allowed elements from a string.
     """
@@ -84,22 +114,116 @@ def parse_allowed_elements(allowed_elements: str) -> list[int]:
     return sorted(list(set_allowed_elements))
 
 
+def molecule_has_required_elements(
+    mol: Molecule, required_elements: list[tuple], verbosity: int
+) -> bool:
+    """
+    Report whether a molecule satisfies at least one required element combination.
+
+    Each tuple in ``required_elements`` is one combination; a combination is
+    satisfied when every one of its entries is found in ``mol.ati``.
+    The outcome is printed (using ``mol.name``) if ``verbosity`` is larger than 1.
+    """
+    # one flag per combination: True only if none of its elements is missing
+    matches = [
+        all(element in mol.ati for element in combination)
+        for combination in required_elements
+    ]
+    satisfied = any(matches)
+    if verbosity > 1:
+        if satisfied:
+            print(f"Molecule {mol.name} has the required elements.")
+        else:
+            print(f"Molecule {mol.name} does not have the required elements.")
+    return satisfied
+
+
+
+
 def main() -> int:
     """
     Main function that is called when the script is executed
     from the command line.
+
+    Loads the molecules via ``get_molecules_from_filesystem`` and writes the
+    names of those passing all requested filters (allowed elements, required
+    elements, charge window, maximum UHF) to the output file. Returns 0.
+
+    Raises ValueError if no filter option is given or if both
+    --required-elements-all and --required-elements-one are given.
     """
     args = get_args()
-    mols = get_molecules_from_filesystem(keyword=args.keyword)
-    allowed_elements = parse_allowed_elements(args.allowed_elements)
-    with open(
-        "selected_elements_molecules.list", "w", encoding="utf8"
-    ) as sel_elem_file:
-        for mol in tqdm(mols, desc="Detecting fragments...", unit="molecule"):
+    if (
+        not args.allowed_elements
+        and not args.required_elements_all
+        and not args.required_elements_one
+        and args.min_charge is None  # 0 is a valid bound; only None means "not given"
+        and args.max_charge is None
+        and args.max_uhf is None
+    ):
+        raise ValueError(
+            "Either --allowed-elements, --required-elements-all/-one, --min-charge, "
+            + "--max-charge, or --max-uhf must be provided."
+        )
+    if args.required_elements_all and args.required_elements_one:
+        raise ValueError(
+            "Both --required-elements-all and "
+            + "--required-elements-one cannot be provided at the same time."
+        )
+    if args.allowed_elements:
+        allowed_elements = parse_element_list(args.allowed_elements)
+    if args.required_elements_all:
+        required_elements_all = parse_element_list(args.required_elements_all)
+    if args.required_elements_one:
+        required_elements_one = parse_element_list(args.required_elements_one)
+
+    output_file = Path(args.output_file).resolve()
+    if args.verbosity > 0:
+        if args.allowed_elements:
+            print(f"Allowed elements: {allowed_elements}")
+        print(f"Output file: {output_file}")
+
+    # required elements is a list of tuples
+    # one tuple per set of required elements that must be contained at the same time
+    # e.g. [(55, 56)] means that both 55 and 56 must be contained in the molecule
+    # [(54,), (55,)] means that either 54 or 55 must be contained in the molecule
+    required_elements: list[tuple] = []
+    if args.required_elements_all:
+        required_elements.append(tuple(required_elements_all))
+    if args.required_elements_one:
+        for elem in required_elements_one:
+            required_elements.append((elem,))
+    if args.verbosity > 0:
+        print(f"Required elements: {required_elements}")
+
+    mols = get_molecules_from_filesystem(keyword=args.keyword, verbosity=args.verbosity)
+    with open(output_file, "w", encoding="utf8") as sel_elem_file:
+        for mol in tqdm(mols, desc="Checking composition...", unit="molecule"):
             # check if all elements in the molecule are allowed
-            if all(ati in allowed_elements for ati in mol.ati):
-                print(f"Molecule {mol.name} has only allowed elements.")
-                sel_elem_file.write(mol.name + "\n")
+            if args.allowed_elements:
+                if all(ati in allowed_elements for ati in mol.ati):
+                    if args.verbosity > 1:
+                        print(f"Molecule {mol.name} has only allowed elements.")
+                else:
+                    if args.verbosity > 1:
+                        print(f"Molecule {mol.name} has forbidden elements.")
+                    continue
+            if required_elements and (
+                not molecule_has_required_elements(
+                    mol, required_elements, args.verbosity
+                )
+            ):
+                continue
+
+            if args.min_charge is not None and mol.charge < args.min_charge:
+                if args.verbosity > 1:
+                    print(f"Molecule {mol.name} has charge {mol.charge}.")
+                continue
+            if args.max_charge is not None and mol.charge > args.max_charge:
+                if args.verbosity > 1:
+                    print(f"Molecule {mol.name} has charge {mol.charge}.")
+                continue
+            if args.max_uhf is not None and mol.uhf > args.max_uhf:
+                if args.verbosity > 1:
+                    print(f"Molecule {mol.name} has UHF {mol.uhf}.")
+                continue
+
+            # only molecules that passed every active filter reach this point
+            sel_elem_file.write(mol.name + "\n")
 
     return 0
 

@@ -0,0 +1,82 @@
+"""
+Python script that is based on MindlessGen
+and filters compounds that are redundant stereoisomers.
+"""
+
+import argparse
+from pathlib import Path
+from collections import defaultdict
+
+from tqdm import tqdm
+import networkx as nx  # type: ignore
+
+from mindlessgen.molecules import get_molecules_from_filesystem  # type: ignore
+from mindlessgen.molecules import get_molecular_graph  # type: ignore
+
+
+def get_args() -> argparse.Namespace:
+    """
+    Parse the command line arguments of the stereoisomer filter.
+
+    Returns the namespace produced by ``argparse`` with the attributes
+    ``verbosity``, ``keyword`` and ``output_file``.
+    """
+    parser = argparse.ArgumentParser(
+        description="Detect stereoisomers for a given list of molecules."
+    )
+    parser.add_argument(
+        "--verbosity", "-v", type=int, default=1, help="Verbosity level."
+    )
+    parser.add_argument(
+        "--keyword",
+        type=str,
+        required=False,
+        default="molecules.list",
+        help="Keyword for the file that contains the list of molecules.",
+    )
+    # NOTE(review): the default file name is the same one the element filter
+    # script uses; it is kept unchanged here for compatibility - confirm intended.
+    parser.add_argument(
+        "--output-file",
+        type=str,
+        required=False,
+        default="selected_elements_molecules.list",
+        help="Output file for the molecules that are not redundant stereoisomers.",
+    )
+    return parser.parse_args()
+
+
+def main() -> int:
+    """
+    Write the names of all molecules whose graph hash has not been seen before.
+
+    The molecules are loaded via ``get_molecules_from_filesystem``; for each one
+    a molecular graph is built and hashed with the Weisfeiler-Lehman graph hash
+    using the "element" node attribute. Only the first molecule per hash is
+    written to the output file; later ones are reported as stereoisomers
+    (if verbosity > 1) and skipped. Returns 0.
+    """
+    args = get_args()
+    output_file = Path(args.output_file).resolve()
+    if args.verbosity > 0:
+        print(f"Output file: {output_file}")
+    mols = get_molecules_from_filesystem(keyword=args.keyword, verbosity=args.verbosity)
+
+    # maps each graph hash to the names of all molecules that produced it
+    seen_hashes: defaultdict[str, list[str]] = defaultdict(list)
+
+    with open(output_file, "w", encoding="utf8") as sel_elem_file:
+        for mol in tqdm(mols, desc="Checking composition...", unit="molecule"):
+            # NOTE(review): 1.25 is presumably a distance scaling factor for
+            # bond detection - confirm against get_molecular_graph
+            graph = get_molecular_graph(mol, 1.25, verbosity=args.verbosity)
+
+            # Get WL hash with atom type info
+            g_hash = nx.weisfeiler_lehman_graph_hash(graph, node_attr="element")
+
+            # the defaultdict creates an empty list on first access, so an
+            # empty list means this hash has not occurred before
+            same_hash = seen_hashes[g_hash]
+            is_duplicate = bool(same_hash)
+            if is_duplicate and args.verbosity > 1:
+                print(
+                    f"Found stereoisomer: {same_hash} "
+                    + f"and {mol.name} with hash {g_hash}"
+                )
+            same_hash.append(mol.name)
+            if not is_duplicate:
+                sel_elem_file.write(mol.name + "\n")
+
+    return 0
+
+
+# Run main() only when executed as a script; its return value becomes the exit status.
+if __name__ == "__main__":
+    raise SystemExit(main())
@@ -6,28 +6,8 @@
 import argparse
 from pathlib import Path
 from tqdm import tqdm
-from mindlessgen.molecules import Molecule, detect_fragments  # type: ignore
-
-
-def get_molecules_from_filesystem(keyword: str) -> list[Molecule]:
-    """
-    Get a list of molecules from the filesystem.
-    """
-    # check if the file exists
-    if not Path(keyword).exists():
-        raise FileNotFoundError(f"File '{keyword}' does not exist.")
-    # read the file
-    with open(keyword, encoding="utf-8") as file:
-        mol_names = file.readlines()
-    # get the molecules and return them
-    mol_list: list[Molecule] = []
-    for mol_name in tqdm(
-        mol_names, desc="Processing molecules from files...", unit="molecule"
-    ):
-        mol_name = mol_name.strip()
-        mol = Molecule.read_mol_from_file(mol_name + ".xyz")
-        mol_list.append(mol)
-    return mol_list
+from mindlessgen.molecules import detect_fragments  # type: ignore
+from mindlessgen.molecules import get_molecules_from_filesystem  # type: ignore
 
 
 def get_args() -> argparse.Namespace:
@@ -61,7 +41,7 @@ def main() -> int:
     from the command line.
     """
     args = get_args()
-    mols = get_molecules_from_filesystem(keyword=args.keyword)
+    mols = get_molecules_from_filesystem(keyword=args.keyword, verbosity=0)
     # create new directory "new_single_molecules" if it does not exist
     newmoldir = Path("fragments").resolve()
     newmoldir.mkdir(exist_ok=True, parents=True)

@@ -13,7 +13,7 @@
     generate_atom_list,
     check_distances,
 )
-from .refinement import iterative_optimization, detect_fragments
+from .refinement import iterative_optimization, detect_fragments, get_molecular_graph
 from .postprocess import postprocess_mol
 from .miscellaneous import (
     get_cov_radii,
@@ -25,6 +25,7 @@
     get_actinides,
     get_alkali_metals,
     get_alkaline_earth_metals,
+    get_molecules_from_filesystem,
 )
 
 __all__ = [
@@ -34,6 +35,7 @@
     "generate_atom_list",
     "iterative_optimization",
     "detect_fragments",
+    "get_molecular_graph",
     "get_cov_radii",
     "set_random_charge",
     "check_distances",
@@ -44,6 +46,7 @@
     "get_actinides",
     "get_alkali_metals",
     "get_alkaline_earth_metals",
+    "get_molecules_from_filesystem",
     "ati_to_atlist",
     "atlist_to_ati",
     "postprocess_mol",