Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
190 changes: 157 additions & 33 deletions scripts/filter_by_composition.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,27 +7,7 @@
from pathlib import Path
from tqdm import tqdm
from mindlessgen.molecules import Molecule # type: ignore


def get_molecules_from_filesystem(keyword: str) -> list[Molecule]:
    """
    Get a list of molecules from the filesystem.

    The file ``keyword`` is read as a plain-text list with one molecule name
    per line. For every name, ``<name>.xyz`` is handed to
    ``Molecule.read_mol_from_file``.

    Args:
        keyword: Path of the file that contains the list of molecule names.

    Returns:
        The molecules, in the order in which their names appear in the file.

    Raises:
        FileNotFoundError: If the list file does not exist.
    """
    # check if the file exists
    if not Path(keyword).exists():
        raise FileNotFoundError(f"File '{keyword}' does not exist.")
    # read the file; blank lines (e.g. a stray empty line at the end) are
    # skipped because they would otherwise become a read of the file ".xyz"
    with open(keyword, encoding="utf-8") as file:
        stripped_lines = (line.strip() for line in file)
        mol_names = [name for name in stripped_lines if name]
    # get the molecules and return them
    mol_list: list[Molecule] = []
    for mol_name in tqdm(
        mol_names, desc="Processing molecules from files...", unit="molecule"
    ):
        mol_list.append(Molecule.read_mol_from_file(mol_name + ".xyz"))
    return mol_list
from mindlessgen.molecules import get_molecules_from_filesystem # type: ignore


def get_args() -> argparse.Namespace:
Expand All @@ -37,6 +17,9 @@ def get_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(
description="Detect fragments for a given list of molecules."
)
parser.add_argument(
"--verbosity", "-v", type=int, default=1, help="Verbosity level."
)
parser.add_argument(
"--keyword",
type=str,
Expand All @@ -47,14 +30,61 @@ def get_args() -> argparse.Namespace:
parser.add_argument(
"--allowed-elements",
type=str,
required=True,
required=False,
default=None,
help="Allowed elements for the molecules. "
+ "Format example: `--allowed-elements '57-71, 81-*'",
)
parser.add_argument(
"--required-elements-all",
type=str,
required=False,
default=None,
help="Required element(s) that MUST be in each molecule (ALL of them must be contained). "
+ "Format example: `--required-elements-all '57-71, 81-*'",
)
parser.add_argument(
"--required-elements-one",
type=str,
required=False,
default=None,
help="Required element(s) that MUST be in each molecule "
+ "(at least one of them must be contained). "
+ "Format example: `--required-elements-one '57-71, 81-*'",
)
parser.add_argument(
"--min-charge",
type=int,
required=False,
default=None,
help="Allowed elements for the molecules. Format example: `--allowed-elements '57-71, 81-*'",
help="Minimum charge for the molecules." + "Format example: `--min-charge -1`",
)
parser.add_argument(
"--max-charge",
type=int,
required=False,
default=None,
help="Maximum charge for the molecules." + "Format example: `--max-charge 2`",
)
parser.add_argument(
"--max-uhf",
type=int,
required=False,
default=None,
help="Maximum number of unpaired electrons (UHF) for the molecules."
+ " Format example: `--max-uhf 2`",
)
parser.add_argument(
"--output-file",
type=str,
required=False,
default="selected_elements_molecules.list",
help="Output file for the selected elements.",
)
return parser.parse_args()


def parse_allowed_elements(allowed_elements: str) -> list[int]:
def parse_element_list(allowed_elements: str) -> list[int]:
"""
Parse the allowed elements from a string.
"""
Expand Down Expand Up @@ -84,22 +114,116 @@ def parse_allowed_elements(allowed_elements: str) -> list[int]:
return sorted(list(set_allowed_elements))


def molecule_has_required_elements(
    mol: Molecule, required_elements: list[tuple], verbosity: int
) -> bool:
    """
    Check whether a molecule contains the required elements.

    Args:
        mol: Molecule to check. Its ``ati`` attribute is searched for the
            required values and its ``name`` is used in the messages.
        required_elements: Alternative element combinations. Each tuple lists
            values that must ALL occur in ``mol.ati``; the molecule is
            accepted as soon as ONE tuple is completely contained.
        verbosity: The result is printed only for values greater than 1.

    Returns:
        True if at least one combination is completely contained in the
        molecule, otherwise False. An empty ``required_elements`` list never
        matches.
    """
    # any(...) over the combinations, all(...) within one combination;
    # both short-circuit, so no bookkeeping lists are needed
    has_required = any(
        all(ati in mol.ati for ati in combination)
        for combination in required_elements
    )
    if verbosity > 1:
        if has_required:
            print(f"Molecule {mol.name} has the required elements.")
        else:
            print(f"Molecule {mol.name} does not have the required elements.")
    return has_required


def main() -> int:
    """
    Main function that is called when the script is executed
    from the command line.

    Reads the molecules named in ``args.keyword``, applies every filter that
    was requested on the command line and writes the names of the molecules
    that pass all of them to ``args.output_file`` (one name per line).

    Returns:
        0 on success.

    Raises:
        ValueError: If no filter option was given at all, or if
            ``--required-elements-all`` and ``--required-elements-one`` are
            combined.
    """
    args = get_args()

    # The numeric options are compared against None explicitly: 0 is a
    # legitimate charge / UHF limit and must count as "filter provided".
    element_filter_given = bool(
        args.allowed_elements
        or args.required_elements_all
        or args.required_elements_one
    )
    numeric_filter_given = (
        args.min_charge is not None
        or args.max_charge is not None
        or args.max_uhf is not None
    )
    if not element_filter_given and not numeric_filter_given:
        raise ValueError(
            "At least one of --allowed-elements, --required-elements-all, "
            + "--required-elements-one, --min-charge, "
            + "--max-charge, or --max-uhf must be provided."
        )
    if args.required_elements_all and args.required_elements_one:
        raise ValueError(
            "Both --required-elements-all and "
            + "--required-elements-one cannot be provided at the same time."
        )

    allowed_elements: list[int] = []
    required_elements_all: list[int] = []
    required_elements_one: list[int] = []
    if args.allowed_elements:
        allowed_elements = parse_element_list(args.allowed_elements)
    if args.required_elements_all:
        required_elements_all = parse_element_list(args.required_elements_all)
    if args.required_elements_one:
        required_elements_one = parse_element_list(args.required_elements_one)

    output_file = Path(args.output_file).resolve()
    if args.verbosity > 0:
        if args.allowed_elements:
            print(f"Allowed elements: {allowed_elements}")
        print(f"Output file: {output_file}")

    # required elements is a list of tuples
    # one tuple per set of required elements that must be contained at the same time
    # e.g. [(55, 56)] means that both 55 and 56 must be contained in the molecule
    # [(54,), (55,)] means that either 54 or 55 must be contained in the molecule
    required_elements: list[tuple] = []
    if args.required_elements_all:
        required_elements.append(tuple(required_elements_all))
    if args.required_elements_one:
        required_elements.extend((elem,) for elem in required_elements_one)
    if args.verbosity > 0:
        print(f"Required elements: {required_elements}")

    mols = get_molecules_from_filesystem(keyword=args.keyword, verbosity=args.verbosity)
    with open(output_file, "w", encoding="utf8") as sel_elem_file:
        for mol in tqdm(mols, desc="Checking composition...", unit="molecule"):
            # check if all elements in the molecule are allowed
            if args.allowed_elements:
                if all(ati in allowed_elements for ati in mol.ati):
                    if args.verbosity > 1:
                        print(f"Molecule {mol.name} has only allowed elements.")
                else:
                    if args.verbosity > 1:
                        print(f"Molecule {mol.name} has forbidden elements.")
                    continue
            # check if one of the required element combinations is contained
            if required_elements and (
                not molecule_has_required_elements(
                    mol, required_elements, args.verbosity
                )
            ):
                continue

            # check the charge and UHF limits (each one only if requested)
            if args.min_charge is not None and mol.charge < args.min_charge:
                if args.verbosity > 1:
                    print(f"Molecule {mol.name} has charge {mol.charge}.")
                continue
            if args.max_charge is not None and mol.charge > args.max_charge:
                if args.verbosity > 1:
                    print(f"Molecule {mol.name} has charge {mol.charge}.")
                continue
            if args.max_uhf is not None and mol.uhf > args.max_uhf:
                if args.verbosity > 1:
                    print(f"Molecule {mol.name} has UHF {mol.uhf}.")
                continue

            # the molecule passed every requested filter
            sel_elem_file.write(mol.name + "\n")

    return 0

Expand Down
82 changes: 82 additions & 0 deletions scripts/filter_stereoisomers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
"""
Python script that is based on MindlessGen
and filters compounds that are redundant stereoisomers.
"""

import argparse
from pathlib import Path
from collections import defaultdict

from tqdm import tqdm
import networkx as nx # type: ignore

from mindlessgen.molecules import get_molecules_from_filesystem # type: ignore
from mindlessgen.molecules import get_molecular_graph # type: ignore


def get_args() -> argparse.Namespace:
    """
    Get the command line arguments.

    Returns:
        Namespace with the attributes ``verbosity`` (int, default 1),
        ``keyword`` (str, default "molecules.list") and ``output_file``
        (str, default "selected_elements_molecules.list").
    """
    parser = argparse.ArgumentParser(
        description="Detect stereoisomers for a given list of molecules."
    )
    parser.add_argument(
        "--verbosity", "-v", type=int, default=1, help="Verbosity level."
    )
    parser.add_argument(
        "--keyword",
        type=str,
        required=False,
        default="molecules.list",
        help="Keyword for the file that contains the list of molecules.",
    )
    parser.add_argument(
        "--output-file",
        type=str,
        required=False,
        # default name kept unchanged so that existing workflows that rely
        # on it keep working
        default="selected_elements_molecules.list",
        help="Output file for the list of molecules "
        + "that are not redundant stereoisomers.",
    )
    return parser.parse_args()


def main() -> int:
    """
    Main function that is called when the script is executed.

    Builds a molecular graph for every molecule named in ``args.keyword`` and
    hashes it with the Weisfeiler-Lehman graph hash (node attribute
    ``element``). Only the first molecule of each hash value is written to
    ``args.output_file`` (one name per line); later molecules with the same
    hash are treated as redundant stereoisomers and skipped.

    Returns:
        0 on success.
    """
    args = get_args()
    output_file = Path(args.output_file).resolve()
    if args.verbosity > 0:
        print(f"Output file: {output_file}")
    mols = get_molecules_from_filesystem(keyword=args.keyword, verbosity=args.verbosity)

    # maps graph hashes to the names of all molecules that produced them
    seen_hashes: defaultdict[str, list[str]] = defaultdict(list)

    with open(output_file, "w", encoding="utf8") as sel_elem_file:
        for mol in tqdm(mols, desc="Checking stereoisomers...", unit="molecule"):
            # NOTE(review): 1.25 is presumably a scaling factor used for the
            # bond detection in get_molecular_graph -- confirm there.
            graph = get_molecular_graph(mol, 1.25, verbosity=args.verbosity)

            # Get WL hash with atom type info
            # NOTE(review): equal WL hashes are treated as "same connectivity"
            # here; the hash is not a strict isomorphism proof -- confirm that
            # rare collisions are acceptable for this filter.
            g_hash = nx.weisfeiler_lehman_graph_hash(graph, node_attr="element")

            # membership test on the dict itself does not create a new entry
            already_seen = g_hash in seen_hashes
            if already_seen and args.verbosity > 1:
                print(
                    f"Found stereoisomer: {seen_hashes[g_hash]} "
                    + f"and {mol.name} with hash {g_hash}"
                )
            seen_hashes[g_hash].append(mol.name)
            if not already_seen:
                # first molecule with this connectivity: keep it
                sel_elem_file.write(mol.name + "\n")

    return 0


if __name__ == "__main__":
    # Run the script and hand main()'s return value to the interpreter
    # as the process exit status.
    exit_status = main()
    raise SystemExit(exit_status)
26 changes: 3 additions & 23 deletions scripts/fragment_detection.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,28 +6,8 @@
import argparse
from pathlib import Path
from tqdm import tqdm
from mindlessgen.molecules import Molecule, detect_fragments # type: ignore


def get_molecules_from_filesystem(keyword: str) -> list[Molecule]:
    """
    Get a list of molecules from the filesystem.

    The file ``keyword`` is read as a plain-text list with one molecule name
    per line. For every name, ``<name>.xyz`` is handed to
    ``Molecule.read_mol_from_file``.

    Args:
        keyword: Path of the file that contains the list of molecule names.

    Returns:
        The molecules, in the order in which their names appear in the file.

    Raises:
        FileNotFoundError: If the list file does not exist.
    """
    # check if the file exists
    if not Path(keyword).exists():
        raise FileNotFoundError(f"File '{keyword}' does not exist.")
    # read the file; blank lines (e.g. a stray empty line at the end) are
    # skipped because they would otherwise become a read of the file ".xyz"
    with open(keyword, encoding="utf-8") as file:
        stripped_lines = (line.strip() for line in file)
        mol_names = [name for name in stripped_lines if name]
    # get the molecules and return them
    mol_list: list[Molecule] = []
    for mol_name in tqdm(
        mol_names, desc="Processing molecules from files...", unit="molecule"
    ):
        mol_list.append(Molecule.read_mol_from_file(mol_name + ".xyz"))
    return mol_list
from mindlessgen.molecules import detect_fragments # type: ignore
from mindlessgen.molecules import get_molecules_from_filesystem # type: ignore


def get_args() -> argparse.Namespace:
Expand Down Expand Up @@ -61,7 +41,7 @@ def main() -> int:
from the command line.
"""
args = get_args()
mols = get_molecules_from_filesystem(keyword=args.keyword)
mols = get_molecules_from_filesystem(keyword=args.keyword, verbosity=0)
# create new directory "fragments" if it does not exist
newmoldir = Path("fragments").resolve()
newmoldir.mkdir(exist_ok=True, parents=True)
Expand Down
5 changes: 4 additions & 1 deletion src/mindlessgen/molecules/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
generate_atom_list,
check_distances,
)
from .refinement import iterative_optimization, detect_fragments
from .refinement import iterative_optimization, detect_fragments, get_molecular_graph
from .postprocess import postprocess_mol
from .miscellaneous import (
get_cov_radii,
Expand All @@ -25,6 +25,7 @@
get_actinides,
get_alkali_metals,
get_alkaline_earth_metals,
get_molecules_from_filesystem,
)

__all__ = [
Expand All @@ -34,6 +35,7 @@
"generate_atom_list",
"iterative_optimization",
"detect_fragments",
"get_molecular_graph",
"get_cov_radii",
"set_random_charge",
"check_distances",
Expand All @@ -44,6 +46,7 @@
"get_actinides",
"get_alkali_metals",
"get_alkaline_earth_metals",
"get_molecules_from_filesystem",
"ati_to_atlist",
"atlist_to_ati",
"postprocess_mol",
Expand Down
Loading