From 9bc0f766cb75831f7bc4681e8ab9d39711825d7e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marcel=20M=C3=BCller?= <marcel.mueller@thch.uni-bonn.de>
Date: Wed, 16 Apr 2025 13:14:52 +0200
Subject: [PATCH 1/8] enhance filtering script with verbosity and output file
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Marcel Müller <marcel.mueller@thch.uni-bonn.de>
---
 scripts/filter_by_composition.py | 39 ++++++++++++++++++++++++--------
 1 file changed, 29 insertions(+), 10 deletions(-)

diff --git a/scripts/filter_by_composition.py b/scripts/filter_by_composition.py
index c595e9f..87788f8 100644
--- a/scripts/filter_by_composition.py
+++ b/scripts/filter_by_composition.py
@@ -9,15 +9,20 @@
 from mindlessgen.molecules import Molecule  # type: ignore
 
 
-def get_molecules_from_filesystem(keyword: str) -> list[Molecule]:
+def get_molecules_from_filesystem(keyword: str, verbosity: int) -> list[Molecule]:
     """
     Get a list of molecules from the filesystem.
     """
     # check if the file exists
-    if not Path(keyword).exists():
-        raise FileNotFoundError(f"File '{keyword}' does not exist.")
+    file_object = Path(keyword).resolve()
+    if not file_object.exists():
+        raise FileNotFoundError(f"File/Directory '{keyword}' does not exist.")
+    if not file_object.is_file():
+        raise NotImplementedError("Reading from directories is not implemented yet.")
     # read the file
-    with open(keyword, encoding="utf-8") as file:
+    if verbosity > 0:
+        print(f"Reading file: {file_object}")
+    with open(file_object, encoding="utf-8") as file:
         mol_names = file.readlines()
     # get the molecules and return them
     mol_list: list[Molecule] = []
@@ -37,6 +42,9 @@ def get_args() -> argparse.Namespace:
     parser = argparse.ArgumentParser(
         description="Detect fragments for a given list of molecules."
     )
+    parser.add_argument(
+        "--verbosity", "-v", type=int, default=1, help="Verbosity level."
+    )
     parser.add_argument(
         "--keyword",
         type=str,
@@ -51,6 +59,13 @@ def get_args() -> argparse.Namespace:
         default=None,
         help="Allowed elements for the molecules. Format example: `--allowed-elements '57-71, 81-*'",
     )
+    parser.add_argument(
+        "--output-file",
+        type=str,
+        required=False,
+        default="selected_elements_molecules.list",
+        help="Output file for the selected elements.",
+    )
     return parser.parse_args()
 
 
@@ -90,15 +105,19 @@ def main() -> int:
     from the command line.
     """
     args = get_args()
-    mols = get_molecules_from_filesystem(keyword=args.keyword)
     allowed_elements = parse_allowed_elements(args.allowed_elements)
-    with open(
-        "selected_elements_molecules.list", "w", encoding="utf8"
-    ) as sel_elem_file:
-        for mol in tqdm(mols, desc="Detecting fragments...", unit="molecule"):
+    output_file = Path(args.output_file).resolve()
+    if args.verbosity > 0:
+        print(f"Allowed elements: {allowed_elements}")
+        print(f"Output file: {output_file}")
+
+    mols = get_molecules_from_filesystem(keyword=args.keyword, verbosity=args.verbosity)
+    with open(output_file, "w", encoding="utf8") as sel_elem_file:
+        for mol in tqdm(mols, desc="Checking composition...", unit="molecule"):
             # check if all elements in the molecule are allowed
             if all(ati in allowed_elements for ati in mol.ati):
-                print(f"Molecule {mol.name} has only allowed elements.")
+                if args.verbosity > 1:
+                    print(f"Molecule {mol.name} has only allowed elements.")
                 sel_elem_file.write(mol.name + "\n")
 
     return 0

From 3c0ee8f738d50a20859a748363bb18dc8ac29f3e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marcel=20M=C3=BCller?= <marcel.mueller@thch.uni-bonn.de>
Date: Wed, 16 Apr 2025 14:38:48 +0200
Subject: [PATCH 2/8] option for required elements
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Marcel Müller <marcel.mueller@thch.uni-bonn.de>
---
 scripts/filter_by_composition.py | 98 +++++++++++++++++++++++++++++---
 1 file changed, 89 insertions(+), 9 deletions(-)

diff --git a/scripts/filter_by_composition.py b/scripts/filter_by_composition.py
index 87788f8..13bbc08 100644
--- a/scripts/filter_by_composition.py
+++ b/scripts/filter_by_composition.py
@@ -55,9 +55,27 @@ def get_args() -> argparse.Namespace:
     parser.add_argument(
         "--allowed-elements",
         type=str,
-        required=True,
+        required=False,
+        default=None,
+        help="Allowed elements for the molecules. "
+        + "Format example: `--allowed-elements '57-71, 81-*'",
+    )
+    parser.add_argument(
+        "--required-elements-all",
+        type=str,
+        required=False,
+        default=None,
+        help="Required element(s) that MUST be in each molecule (ALL of them must be contained). "
+        + "Format example: `--required-elements-all '57-71, 81-*'",
+    )
+    parser.add_argument(
+        "--required-elements-one",
+        type=str,
+        required=False,
         default=None,
-        help="Allowed elements for the molecules. Format example: `--allowed-elements '57-71, 81-*'",
+        help="Required element(s) that MUST be in each molecule "
+        + "(at least one of them must be contained). "
+        + "Format example: `--required-elements-one '57-71, 81-*'",
     )
     parser.add_argument(
         "--output-file",
@@ -69,7 +87,7 @@ def get_args() -> argparse.Namespace:
     return parser.parse_args()
 
 
-def parse_allowed_elements(allowed_elements: str) -> list[int]:
+def parse_element_list(allowed_elements: str) -> list[int]:
     """
     Parse the allowed elements from a string.
     """
@@ -105,20 +123,82 @@ def main() -> int:
     from the command line.
     """
     args = get_args()
-    allowed_elements = parse_allowed_elements(args.allowed_elements)
+    if (
+        not args.allowed_elements
+        and not args.required_elements_all
+        and not args.required_elements_one
+    ):
+        raise ValueError(
+            "Either --allowed-elements or --required-elements_XXX must be provided."
+        )
+    if args.required_elements_all and args.required_elements_one:
+        raise ValueError(
+            "Both --required-elements-all and --required-elements-one cannot be provided at the same time."
+        )
+    if args.allowed_elements:
+        allowed_elements = parse_element_list(args.allowed_elements)
+    if args.required_elements_all:
+        required_elements_all = parse_element_list(args.required_elements_all)
+    if args.required_elements_one:
+        required_elements_one = parse_element_list(args.required_elements_one)
+
     output_file = Path(args.output_file).resolve()
     if args.verbosity > 0:
-        print(f"Allowed elements: {allowed_elements}")
+        if args.allowed_elements:
+            print(f"Allowed elements: {allowed_elements}")
         print(f"Output file: {output_file}")
 
+    # required elements is a list of tuples
+    # one tuple per set of required elements that must be contained at the same time
+    # e.g. [(55, 56)] means that both 55 and 56 must be contained in the molecule
+    # [(54),(55)] means that either 54 or 55 must be contained in the molecule
+    required_elements: list[tuple] = []
+    if args.required_elements_all:
+        required_elements.append(tuple(required_elements_all))
+    if args.required_elements_one:
+        for elem in required_elements_one:
+            required_elements.append(tuple([elem]))
+    if args.verbosity > 0:
+        print(f"Required elements: {required_elements}")
+
     mols = get_molecules_from_filesystem(keyword=args.keyword, verbosity=args.verbosity)
     with open(output_file, "w", encoding="utf8") as sel_elem_file:
         for mol in tqdm(mols, desc="Checking composition...", unit="molecule"):
             # check if all elements in the molecule are allowed
-            if all(ati in allowed_elements for ati in mol.ati):
-                if args.verbosity > 1:
-                    print(f"Molecule {mol.name} has only allowed elements.")
-                sel_elem_file.write(mol.name + "\n")
+            if args.allowed_elements:
+                if all(ati in allowed_elements for ati in mol.ati):
+                    if args.verbosity > 1:
+                        print(f"Molecule {mol.name} has only allowed elements.")
+                else:
+                    if args.verbosity > 1:
+                        print(f"Molecule {mol.name} has forbidden elements.")
+                    continue
+            if required_elements:
+                # check if all required elements are in the molecule
+                # loop over all tuples of required element combinations
+                contained_combinations: list[bool] = [False] * len(required_elements)
+                for k, req_elem in enumerate(required_elements):
+                    # list of boolean values with the same length as the number of req_elem
+                    contained: list[bool] = [False] * len(req_elem)
+                    for i, ati in enumerate(req_elem):
+                        # check if the required element is in the molecule
+                        if ati in mol.ati:
+                            contained[i] = True
+                    # check if all elements of the respective required element combination are found
+                    if all(contained):
+                        contained_combinations[k] = True
+                # check if any of the combinations is True
+                if any(contained_combinations):
+                    if args.verbosity > 1:
+                        print(f"Molecule {mol.name} has the required elements.")
+                else:
+                    if args.verbosity > 1:
+                        print(
+                            f"Molecule {mol.name} does not have the required elements."
+                        )
+                    continue
+
+            sel_elem_file.write(mol.name + "\n")
 
     return 0
 

From 70186f32ca840f02be50c30afc82fe8476537635 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marcel=20M=C3=BCller?= <marcel.mueller@thch.uni-bonn.de>
Date: Wed, 16 Apr 2025 14:48:58 +0200
Subject: [PATCH 3/8] put required elements in separate function
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Marcel Müller <marcel.mueller@thch.uni-bonn.de>
---
 scripts/filter_by_composition.py | 56 ++++++++++++++++++--------------
 1 file changed, 32 insertions(+), 24 deletions(-)

diff --git a/scripts/filter_by_composition.py b/scripts/filter_by_composition.py
index 13bbc08..356373a 100644
--- a/scripts/filter_by_composition.py
+++ b/scripts/filter_by_composition.py
@@ -117,6 +117,34 @@ def parse_element_list(allowed_elements: str) -> list[int]:
     return sorted(list(set_allowed_elements))
 
 
+def molecule_has_required_elements(
+    mol: Molecule, required_elements: list[tuple], verbosity: int
+) -> bool:
+    """
+    Check whether a molecule contains the required elements.
+    """
+    # loop over all tuples of required element combinations
+    contained_combinations: list[bool] = [False] * len(required_elements)
+    for k, req_elem in enumerate(required_elements):
+        # list of boolean values with the same length as the number of req_elem
+        contained: list[bool] = [False] * len(req_elem)
+        for i, ati in enumerate(req_elem):
+            # check if the required element is in the molecule
+            if ati in mol.ati:
+                contained[i] = True
+        # check if all elements of the respective required element combination are found
+        if all(contained):
+            contained_combinations[k] = True
+    # check if any of the combinations is True
+    if any(contained_combinations):
+        if verbosity > 1:
+            print(f"Molecule {mol.name} has the required elements.")
+        return True
+    if verbosity > 1:
+        print(f"Molecule {mol.name} does not have the required elements.")
+    return False
+
+
 def main() -> int:
     """
     Main function that is called when the script is executed
@@ -173,30 +201,10 @@ def main() -> int:
                     if args.verbosity > 1:
                         print(f"Molecule {mol.name} has forbidden elements.")
                     continue
-            if required_elements:
-                # check if all required elements are in the molecule
-                # loop over all tuples of required element combinations
-                contained_combinations: list[bool] = [False] * len(required_elements)
-                for k, req_elem in enumerate(required_elements):
-                    # list of boolean values with the same length as the number of req_elem
-                    contained: list[bool] = [False] * len(req_elem)
-                    for i, ati in enumerate(req_elem):
-                        # check if the required element is in the molecule
-                        if ati in mol.ati:
-                            contained[i] = True
-                    # check if all elements of the respective required element combination are found
-                    if all(contained):
-                        contained_combinations[k] = True
-                # check if any of the combinations is True
-                if any(contained_combinations):
-                    if args.verbosity > 1:
-                        print(f"Molecule {mol.name} has the required elements.")
-                else:
-                    if args.verbosity > 1:
-                        print(
-                            f"Molecule {mol.name} does not have the required elements."
-                        )
-                    continue
+            if not molecule_has_required_elements(
+                mol, required_elements, args.verbosity
+            ):
+                continue
 
             sel_elem_file.write(mol.name + "\n")
 

From 29732d3ea18a1857fdb3060b69d77a810b9caf5b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marcel=20M=C3=BCller?= <marcel.mueller@thch.uni-bonn.de>
Date: Wed, 16 Apr 2025 15:12:50 +0200
Subject: [PATCH 4/8] investigate directory structures
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Marcel Müller <marcel.mueller@thch.uni-bonn.de>
---
 scripts/filter_by_composition.py | 55 +++++++++++++++++++++++++-------
 1 file changed, 44 insertions(+), 11 deletions(-)

diff --git a/scripts/filter_by_composition.py b/scripts/filter_by_composition.py
index 356373a..26fccd4 100644
--- a/scripts/filter_by_composition.py
+++ b/scripts/filter_by_composition.py
@@ -19,20 +19,51 @@ def get_molecules_from_filesystem(keyword: str, verbosity: int) -> list[Molecule
         raise FileNotFoundError(f"File/Directory '{keyword}' does not exist.")
     if not file_object.is_file():
         raise NotImplementedError("Reading from directories is not implemented yet.")
-    # read the file
+    ### Process molecules from list of files (molecules)
     if verbosity > 0:
         print(f"Reading file: {file_object}")
     with open(file_object, encoding="utf-8") as file:
         mol_names = file.readlines()
-    # get the molecules and return them
+    ### Get the molecules and return them
+    # Test directory structure first
+    if Path(mol_names[0].strip() + ".xyz").exists():
+        format: str = "xyz"
+    elif Path(mol_names[0].strip()).is_dir():
+        format = "dir"
+        # search all XYZ files in the test directory
+        xyz_files = list(Path(mol_names[0].strip()).glob("*.xyz"))
+        # if more than one file is found, raise an error
+        if len(xyz_files) > 1:
+            raise ValueError(
+                "More than one XYZ file found in the directory. "
+                + "Please specify the file name."
+            )
     mol_list: list[Molecule] = []
-    for mol_name in tqdm(
-        mol_names, desc="Processing molecules from files...", unit="molecule"
-    ):
-        mol_name = mol_name.strip()
-        mol = Molecule.read_mol_from_file(mol_name + ".xyz")
-        mol_list.append(mol)
-    return mol_list
+    if format == "xyz":
+        for mol_name in tqdm(
+            mol_names, desc="Processing molecules from files...", unit="molecule"
+        ):
+            mol_name = mol_name.strip()
+            mol = Molecule.read_mol_from_file(mol_name + ".xyz")
+            mol_list.append(mol)
+        return mol_list
+    elif format == "dir":
+        # read all XYZ files in the directory
+        for mol_name in tqdm(
+            mol_names, desc="Processing molecules from files...", unit="molecule"
+        ):
+            # path to the xyz file is obtained by grepping for "*.xyz" in the mol_name directory
+            mol_name = mol_name.strip()
+            xyz_file = list(Path(mol_name).glob("*.xyz"))
+            mol = Molecule.read_mol_from_file(xyz_file[0])
+            mol.name = mol_name
+            mol_list.append(mol)
+        return mol_list
+    else:
+        raise ValueError(
+            "File format not supported. "
+            + "Please specify the file name with the extension."
+        )
 
 
 def get_args() -> argparse.Namespace:
@@ -201,8 +232,10 @@ def main() -> int:
                     if args.verbosity > 1:
                         print(f"Molecule {mol.name} has forbidden elements.")
                     continue
-            if not molecule_has_required_elements(
-                mol, required_elements, args.verbosity
+            if required_elements and (
+                not molecule_has_required_elements(
+                    mol, required_elements, args.verbosity
+                )
             ):
                 continue
 

From 4bbbee6f492186809e3b9a9ff84e5f6d07a39a64 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marcel=20M=C3=BCller?= <marcel.mueller@thch.uni-bonn.de>
Date: Tue, 22 Apr 2025 14:06:26 +0200
Subject: [PATCH 5/8] add stereoisomer filtering
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Marcel Müller <marcel.mueller@thch.uni-bonn.de>
---
 scripts/filter_by_composition.py           | 81 ++++++-------------
 scripts/filter_stereoisomers.py            | 81 +++++++++++++++++++
 scripts/fragment_detection.py              | 26 +-----
 src/mindlessgen/molecules/__init__.py      |  5 +-
 src/mindlessgen/molecules/miscellaneous.py | 93 +++++++++++++++++++++-
 src/mindlessgen/molecules/refinement.py    | 62 ++++++++++-----
 6 files changed, 244 insertions(+), 104 deletions(-)
 create mode 100644 scripts/filter_stereoisomers.py

diff --git a/scripts/filter_by_composition.py b/scripts/filter_by_composition.py
index 26fccd4..0fa68be 100644
--- a/scripts/filter_by_composition.py
+++ b/scripts/filter_by_composition.py
@@ -7,63 +7,7 @@
 from pathlib import Path
 from tqdm import tqdm
 from mindlessgen.molecules import Molecule  # type: ignore
-
-
-def get_molecules_from_filesystem(keyword: str, verbosity: int) -> list[Molecule]:
-    """
-    Get a list of molecules from the filesystem.
-    """
-    # check if the file exists
-    file_object = Path(keyword).resolve()
-    if not file_object.exists():
-        raise FileNotFoundError(f"File/Directory '{keyword}' does not exist.")
-    if not file_object.is_file():
-        raise NotImplementedError("Reading from directories is not implemented yet.")
-    ### Process molecules from list of files (molecules)
-    if verbosity > 0:
-        print(f"Reading file: {file_object}")
-    with open(file_object, encoding="utf-8") as file:
-        mol_names = file.readlines()
-    ### Get the molecules and return them
-    # Test directory structure first
-    if Path(mol_names[0].strip() + ".xyz").exists():
-        format: str = "xyz"
-    elif Path(mol_names[0].strip()).is_dir():
-        format = "dir"
-        # search all XYZ files in the test directory
-        xyz_files = list(Path(mol_names[0].strip()).glob("*.xyz"))
-        # if more than one file is found, raise an error
-        if len(xyz_files) > 1:
-            raise ValueError(
-                "More than one XYZ file found in the directory. "
-                + "Please specify the file name."
-            )
-    mol_list: list[Molecule] = []
-    if format == "xyz":
-        for mol_name in tqdm(
-            mol_names, desc="Processing molecules from files...", unit="molecule"
-        ):
-            mol_name = mol_name.strip()
-            mol = Molecule.read_mol_from_file(mol_name + ".xyz")
-            mol_list.append(mol)
-        return mol_list
-    elif format == "dir":
-        # read all XYZ files in the directory
-        for mol_name in tqdm(
-            mol_names, desc="Processing molecules from files...", unit="molecule"
-        ):
-            # path to the xyz file is obtained by grepping for "*.xyz" in the mol_name directory
-            mol_name = mol_name.strip()
-            xyz_file = list(Path(mol_name).glob("*.xyz"))
-            mol = Molecule.read_mol_from_file(xyz_file[0])
-            mol.name = mol_name
-            mol_list.append(mol)
-        return mol_list
-    else:
-        raise ValueError(
-            "File format not supported. "
-            + "Please specify the file name with the extension."
-        )
+from mindlessgen.molecules import get_molecules_from_filesystem  # type: ignore
 
 
 def get_args() -> argparse.Namespace:
@@ -108,6 +52,20 @@ def get_args() -> argparse.Namespace:
         + "(at least one of them must be contained). "
         + "Format example: `--required-elements-one '57-71, 81-*'",
     )
+    parser.add_argument(
+        "--min-charge",
+        type=int,
+        required=False,
+        default=None,
+        help="Minimum charge for the molecules." + "Format example: `--min-charge -1`",
+    )
+    parser.add_argument(
+        "--max-charge",
+        type=int,
+        required=False,
+        default=None,
+        help="Maximum charge for the molecules." + "Format example: `--max-charge 2`",
+    )
     parser.add_argument(
         "--output-file",
         type=str,
@@ -239,6 +197,15 @@ def main() -> int:
             ):
                 continue
 
+            if args.min_charge is not None and mol.charge < args.min_charge:
+                if args.verbosity > 1:
+                    print(f"Molecule {mol.name} has charge {mol.charge}.")
+                continue
+            if args.max_charge is not None and mol.charge > args.max_charge:
+                if args.verbosity > 1:
+                    print(f"Molecule {mol.name} has charge {mol.charge}.")
+                continue
+
             sel_elem_file.write(mol.name + "\n")
 
     return 0
diff --git a/scripts/filter_stereoisomers.py b/scripts/filter_stereoisomers.py
new file mode 100644
index 0000000..c90af9f
--- /dev/null
+++ b/scripts/filter_stereoisomers.py
@@ -0,0 +1,81 @@
+"""
+Python script that is based on MindlessGen
+and filters compounds that are redundant stereoisomers.
+"""
+
+import argparse
+from pathlib import Path
+from collections import defaultdict
+
+from tqdm import tqdm
+import networkx as nx  # type: ignore
+
+from mindlessgen.molecules import get_molecules_from_filesystem  # type: ignore
+from mindlessgen.molecules import get_molecular_graph  # type: ignore
+
+
+def get_args() -> argparse.Namespace:
+    """
+    Get the command line arguments.
+    """
+    parser = argparse.ArgumentParser(
+        description="Detect stereoisomers for a given list of molecules."
+    )
+    parser.add_argument(
+        "--verbosity", "-v", type=int, default=1, help="Verbosity level."
+    )
+    parser.add_argument(
+        "--keyword",
+        type=str,
+        required=False,
+        default="molecules.list",
+        help="Keyword for the file that contains the list of molecules.",
+    )
+    parser.add_argument(
+        "--output-file",
+        type=str,
+        required=False,
+        default="selected_elements_molecules.list",
+        help="Output file for the selected elements.",
+    )
+    return parser.parse_args()
+
+
+def main() -> int:
+    """
+    Main function that is called when the script is executed.
+    """
+    args = get_args()
+    output_file = Path(args.output_file).resolve()
+    if args.verbosity > 0:
+        print(f"Output file: {output_file}")
+    mols = get_molecules_from_filesystem(keyword=args.keyword, verbosity=args.verbosity)
+
+    seen_hashes: defaultdict[str, list[str]] = defaultdict(
+        list
+    )  # maps graph hashes to list of mol indices or names
+
+    with open(output_file, "w", encoding="utf8") as sel_elem_file:
+        for i, mol in enumerate(
+            tqdm(mols, desc="Checking composition...", unit="molecule")
+        ):
+            graph = get_molecular_graph(mol, 1.25, verbosity=args.verbosity)
+
+            # Get WL hash with atom type info
+            g_hash = nx.weisfeiler_lehman_graph_hash(graph, node_attr="element")
+
+            if g_hash in seen_hashes.keys():
+                if args.verbosity > 0:
+                    print(
+                        f"Found stereoisomer: {seen_hashes[g_hash]} and {mol.name} with hash {g_hash}"
+                    )
+                seen_hashes[g_hash].append(mol.name)
+                continue
+            seen_hashes[g_hash].append(mol.name)
+            sel_elem_file.write(mol.name + "\n")
+
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/scripts/fragment_detection.py b/scripts/fragment_detection.py
index 34e3219..5fa1b1d 100644
--- a/scripts/fragment_detection.py
+++ b/scripts/fragment_detection.py
@@ -6,28 +6,8 @@
 import argparse
 from pathlib import Path
 from tqdm import tqdm
-from mindlessgen.molecules import Molecule, detect_fragments  # type: ignore
-
-
-def get_molecules_from_filesystem(keyword: str) -> list[Molecule]:
-    """
-    Get a list of molecules from the filesystem.
-    """
-    # check if the file exists
-    if not Path(keyword).exists():
-        raise FileNotFoundError(f"File '{keyword}' does not exist.")
-    # read the file
-    with open(keyword, encoding="utf-8") as file:
-        mol_names = file.readlines()
-    # get the molecules and return them
-    mol_list: list[Molecule] = []
-    for mol_name in tqdm(
-        mol_names, desc="Processing molecules from files...", unit="molecule"
-    ):
-        mol_name = mol_name.strip()
-        mol = Molecule.read_mol_from_file(mol_name + ".xyz")
-        mol_list.append(mol)
-    return mol_list
+from mindlessgen.molecules import detect_fragments  # type: ignore
+from mindlessgen.molecules import get_molecules_from_filesystem  # type: ignore
 
 
 def get_args() -> argparse.Namespace:
@@ -61,7 +41,7 @@ def main() -> int:
     from the command line.
     """
     args = get_args()
-    mols = get_molecules_from_filesystem(keyword=args.keyword)
+    mols = get_molecules_from_filesystem(keyword=args.keyword, verbosity=0)
     # create new directory "new_single_molecules" if it does not exist
     newmoldir = Path("fragments").resolve()
     newmoldir.mkdir(exist_ok=True, parents=True)
diff --git a/src/mindlessgen/molecules/__init__.py b/src/mindlessgen/molecules/__init__.py
index a167c46..95e3365 100644
--- a/src/mindlessgen/molecules/__init__.py
+++ b/src/mindlessgen/molecules/__init__.py
@@ -13,7 +13,7 @@
     generate_atom_list,
     check_distances,
 )
-from .refinement import iterative_optimization, detect_fragments
+from .refinement import iterative_optimization, detect_fragments, get_molecular_graph
 from .postprocess import postprocess_mol
 from .miscellaneous import (
     get_cov_radii,
@@ -25,6 +25,7 @@
     get_actinides,
     get_alkali_metals,
     get_alkaline_earth_metals,
+    get_molecules_from_filesystem,
 )
 
 __all__ = [
@@ -34,6 +35,7 @@
     "generate_atom_list",
     "iterative_optimization",
     "detect_fragments",
+    "get_molecular_graph",
     "get_cov_radii",
     "set_random_charge",
     "check_distances",
@@ -44,6 +46,7 @@
     "get_actinides",
     "get_alkali_metals",
     "get_alkaline_earth_metals",
+    "get_molecules_from_filesystem",
     "ati_to_atlist",
     "atlist_to_ati",
     "postprocess_mol",
diff --git a/src/mindlessgen/molecules/miscellaneous.py b/src/mindlessgen/molecules/miscellaneous.py
index 4a59c5e..9952b2f 100644
--- a/src/mindlessgen/molecules/miscellaneous.py
+++ b/src/mindlessgen/molecules/miscellaneous.py
@@ -2,9 +2,12 @@
 Molecule-related helper tools.
 """
 
+from pathlib import Path
+from tqdm import tqdm
+
 import numpy as np
 
-from .molecule import ati_to_atlist
+from .molecule import ati_to_atlist, Molecule
 from ..data.parameters import COV_RADII_PYYKKO, COV_RADII_MLMGEN
 
 
@@ -173,3 +176,91 @@ def get_actinides() -> list[int]:
     """
     actinides = list(range(88, 103))
     return actinides
+
+
+# NOTE: Required by external helper scripts.
+def get_molecules_from_filesystem(keyword: str, verbosity: int) -> list[Molecule]:
+    """
+    Get a list of molecules from the filesystem.
+    """
+    # check if the file exists
+    file_object = Path(keyword).resolve()
+    if not file_object.exists():
+        raise FileNotFoundError(f"File/Directory '{keyword}' does not exist.")
+    if not file_object.is_file():
+        raise NotImplementedError("Reading from directories is not implemented yet.")
+    ### Process molecules from list of files (molecules)
+    if verbosity > 0:
+        print(f"Reading file: {file_object}")
+    with open(file_object, encoding="utf-8") as file:
+        mol_names = file.readlines()
+    ### Get the molecules and return them
+    # Test directory structure first
+    if Path(mol_names[0].strip() + ".xyz").exists():
+        xyzformat: str = "xyz"
+    elif Path(mol_names[0].strip()).is_dir():
+        xyzformat = "dir"
+        # search all XYZ files in the test directory
+        xyz_files = list(Path(mol_names[0].strip()).glob("*.xyz"))
+        # if more than one file is found, raise an error
+        if len(xyz_files) > 1:
+            raise ValueError(
+                "More than one XYZ file found in the directory. "
+                + "Please specify the file name."
+            )
+    else:
+        raise ValueError(
+            "File format not supported. "
+            + "Please specify the file name with the extension."
+        )
+    mol_list: list[Molecule] = []
+    if xyzformat == "xyz":
+        for mol_name in tqdm(
+            mol_names, desc="Processing molecules from files...", unit="molecule"
+        ):
+            mol_name = mol_name.strip()
+            mol = Molecule.read_mol_from_file(mol_name + ".xyz")
+            mol_list.append(mol)
+        return mol_list
+    elif xyzformat == "dir":
+        # read all XYZ files in the directory
+        for mol_name in tqdm(
+            mol_names, desc="Processing molecules from files...", unit="molecule"
+        ):
+            # path to the xyz file is obtained by grepping for "*.xyz" in the mol_name directory
+            mol_name = mol_name.strip()
+            xyz_file = list(Path(mol_name).glob("*.xyz"))
+            mol = Molecule.read_mol_from_file(xyz_file[0])
+            mol.name = mol_name
+            # check if the directory contains a ".CHRG" file
+            chrg_file = Path(mol_name).parent / ".CHRG"
+            if chrg_file.exists():
+                with open(chrg_file, encoding="utf-8") as chrg:
+                    chrg_lines = chrg.readlines()
+                # check if the file contains a charge
+                if len(chrg_lines) > 0:
+                    try:
+                        mol.charge = int(chrg_lines[0].strip())
+                    except ValueError as e:
+                        raise ValueError(
+                            f"Charge in file {chrg_file} is not an integer."
+                        ) from e
+            uhf_file = Path(mol_name).parent / ".UHF"
+            if uhf_file.exists():
+                with open(uhf_file, encoding="utf-8") as uhf:
+                    uhf_lines = uhf.readlines()
+                # check if the file contains a charge
+                if len(uhf_lines) > 0:
+                    try:
+                        mol.charge = int(uhf_lines[0].strip())
+                    except ValueError as e:
+                        raise ValueError(
+                            f"Charge in file {uhf_file} is not an integer."
+                        ) from e
+            mol_list.append(mol)
+        return mol_list
+    else:
+        raise ValueError(
+            "File format not supported. "
+            + "Please specify the file name with the extension."
+        )
diff --git a/src/mindlessgen/molecules/refinement.py b/src/mindlessgen/molecules/refinement.py
index c851139..1fd3116 100644
--- a/src/mindlessgen/molecules/refinement.py
+++ b/src/mindlessgen/molecules/refinement.py
@@ -203,29 +203,8 @@ def detect_fragments(
     Returns:
     - A list of fragments, where each fragment is a Molecule object.
     """
-    graph = nx.Graph()
 
-    # Add nodes (atoms) to the graph
-    graph.add_nodes_from(range(mol.num_atoms))
-
-    # Calculate pairwise distances and add edges if atoms are bonded
-    for i in range(mol.num_atoms - 1):
-        for j in range(i + 1, mol.num_atoms):
-            distance = np.linalg.norm(mol.xyz[i] - mol.xyz[j])
-            sum_radii = get_cov_radii(mol.ati[i], COV_RADII) + get_cov_radii(
-                mol.ati[j], COV_RADII
-            )
-            if verbosity > 2:
-                print(f"Distance between atom {i} and {j}: {distance:6.3f}")
-                print(
-                    f"Covalent radii of atom {i} and {j}, "
-                    + "and the effective threshold: "
-                    + f"{get_cov_radii(mol.ati[i], COV_RADII):6.3f}, "
-                    + f"{get_cov_radii(mol.ati[j], COV_RADII):6.3f}, "
-                    + f"{(sum_radii * vdw_scaling):6.3f}"
-                )
-            if distance <= sum_radii * vdw_scaling:
-                graph.add_edge(i, j)
+    graph = get_molecular_graph(mol, vdw_scaling, verbosity)
 
     # Detect connected components (fragments)
     fragments = [list(component) for component in nx.connected_components(graph)]
@@ -268,3 +247,42 @@ def detect_fragments(
         fragment_molecules.append(fragment_molecule)
 
     return fragment_molecules
+
+
+def get_molecular_graph(
+    mol: Molecule,
+    vdw_scaling: float,
+    verbosity: int = 1,
+) -> nx.Graph:
+    """
+    Generate a molecular graph from the molecule object.
+    The graph is undirected and contains nodes for each atom.
+    Edges are added between atoms that are bonded based on their distances
+    and covalent radii.
+    """
+    graph = nx.Graph()
+
+    # Add nodes (atoms) to the graph with atomic type as attribute
+    for i in range(mol.num_atoms):
+        graph.add_node(i, element=mol.ati[i])
+
+    # Calculate pairwise distances and add edges if atoms are bonded
+    for i in range(mol.num_atoms - 1):
+        for j in range(i + 1, mol.num_atoms):
+            distance = np.linalg.norm(mol.xyz[i] - mol.xyz[j])
+            sum_radii = get_cov_radii(mol.ati[i], COV_RADII) + get_cov_radii(
+                mol.ati[j], COV_RADII
+            )
+            if verbosity > 2:
+                print(f"Distance between atom {i} and {j}: {distance:6.3f}")
+                print(
+                    f"Covalent radii of atom {i} and {j}, "
+                    + "and the effective threshold: "
+                    + f"{get_cov_radii(mol.ati[i], COV_RADII):6.3f}, "
+                    + f"{get_cov_radii(mol.ati[j], COV_RADII):6.3f}, "
+                    + f"{(sum_radii * vdw_scaling):6.3f}"
+                )
+            if distance <= sum_radii * vdw_scaling:
+                graph.add_edge(i, j)
+
+    return graph

From a6043b03d3ee842ebef50ebddd253d1e10b78a60 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marcel=20M=C3=BCller?= <marcel.mueller@thch.uni-bonn.de>
Date: Thu, 24 Apr 2025 09:56:59 +0200
Subject: [PATCH 6/8] add UHF max support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Marcel Müller <marcel.mueller@thch.uni-bonn.de>
---
 scripts/filter_by_composition.py | 21 +++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

diff --git a/scripts/filter_by_composition.py b/scripts/filter_by_composition.py
index 0fa68be..d1d7776 100644
--- a/scripts/filter_by_composition.py
+++ b/scripts/filter_by_composition.py
@@ -66,6 +66,14 @@ def get_args() -> argparse.Namespace:
         default=None,
         help="Maximum charge for the molecules." + "Format example: `--max-charge 2`",
     )
+    parser.add_argument(
+        "--max-uhf",
+        type=int,
+        required=False,
+        default=None,
+        help="Maximum number of unpaired electrons (UHF) for the molecules."
+        + " Format example: `--max-uhf 2`",
+    )
     parser.add_argument(
         "--output-file",
         type=str,
@@ -144,13 +152,18 @@ def main() -> int:
         not args.allowed_elements
         and not args.required_elements_all
         and not args.required_elements_one
+        and not args.min_charge
+        and not args.max_charge
+        and not args.max_uhf
     ):
         raise ValueError(
-            "Either --allowed-elements or --required-elements_XXX must be provided."
+            "Either --allowed-elements, --required-elements_XXX, --min-charge, "
+            + "--max-charge, or --max-uhf must be provided."
         )
     if args.required_elements_all and args.required_elements_one:
         raise ValueError(
-            "Both --required-elements-all and --required-elements-one cannot be provided at the same time."
+            "Both --required-elements-all and "
+            + "--required-elements-one cannot be provided at the same time."
         )
     if args.allowed_elements:
         allowed_elements = parse_element_list(args.allowed_elements)
@@ -205,6 +218,10 @@ def main() -> int:
                 if args.verbosity > 1:
                     print(f"Molecule {mol.name} has charge {mol.charge}.")
                 continue
+            if args.max_uhf is not None and mol.uhf > args.max_uhf:
+                if args.verbosity > 1:
+                    print(f"Molecule {mol.name} has UHF {mol.uhf}.")
+                continue
 
             sel_elem_file.write(mol.name + "\n")
 

From 5ce08ac4b7c383303d110c563e3e1bf76a0e3491 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marcel=20M=C3=BCller?= <marcel.mueller@thch.uni-bonn.de>
Date: Thu, 24 Apr 2025 09:57:39 +0200
Subject: [PATCH 7/8] increase verbosity level for print
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Marcel Müller <marcel.mueller@thch.uni-bonn.de>
---
 scripts/filter_stereoisomers.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/scripts/filter_stereoisomers.py b/scripts/filter_stereoisomers.py
index c90af9f..c3968d1 100644
--- a/scripts/filter_stereoisomers.py
+++ b/scripts/filter_stereoisomers.py
@@ -65,9 +65,10 @@ def main() -> int:
             g_hash = nx.weisfeiler_lehman_graph_hash(graph, node_attr="element")
 
             if g_hash in seen_hashes.keys():
-                if args.verbosity > 0:
+                if args.verbosity > 1:
                     print(
-                        f"Found stereoisomer: {seen_hashes[g_hash]} and {mol.name} with hash {g_hash}"
+                        f"Found stereoisomer: {seen_hashes[g_hash]} "
+                        + f"and {mol.name} with hash {g_hash}"
                     )
                 seen_hashes[g_hash].append(mol.name)
                 continue

From 38333cec402328946217e2acd21ede9b729dbf53 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marcel=20M=C3=BCller?= <marcel.mueller@thch.uni-bonn.de>
Date: Thu, 24 Apr 2025 10:12:07 +0200
Subject: [PATCH 8/8] correct wrong assignment
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Marcel Müller <marcel.mueller@thch.uni-bonn.de>
---
 src/mindlessgen/molecules/miscellaneous.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/mindlessgen/molecules/miscellaneous.py b/src/mindlessgen/molecules/miscellaneous.py
index 9952b2f..66e138e 100644
--- a/src/mindlessgen/molecules/miscellaneous.py
+++ b/src/mindlessgen/molecules/miscellaneous.py
@@ -252,10 +252,10 @@ def get_molecules_from_filesystem(keyword: str, verbosity: int) -> list[Molecule
                 # check if the file contains a charge
                 if len(uhf_lines) > 0:
                     try:
-                        mol.charge = int(uhf_lines[0].strip())
+                        mol.uhf = int(uhf_lines[0].strip())
                     except ValueError as e:
                         raise ValueError(
-                            f"Charge in file {uhf_file} is not an integer."
+                            f"UHF in file {uhf_file} is not an integer."
                         ) from e
             mol_list.append(mol)
         return mol_list