diff --git a/docs/source/user_guide/benchmarks/molecular.rst b/docs/source/user_guide/benchmarks/molecular.rst index 7b1857d9e..ab3e4937c 100644 --- a/docs/source/user_guide/benchmarks/molecular.rst +++ b/docs/source/user_guide/benchmarks/molecular.rst @@ -174,3 +174,60 @@ Input structures: Binary and Ternary Mixtures of Bis(2-hydroxyethyl)ammonium Acetate with Methanol, N,N-Dimethylformamide, and Water at Several Temperatures. J. Chem. Eng. Data 62, 3958-3966 (2017). https://doi.org/10.1021/acs.jced.7b00654 + + + +BDEs +==== + +Summary +------- + +Performance in predicting C-H bond dissociation energies (BDEs) for 60 CYP3A4 drug-like +substrates (CHO elements only), comprising 1117 sp3 C-H bonds across all molecules. + + +Metrics +------- + +1. Direct BDE + +Mean absolute error (MAE) of predicted BDEs against DFT reference values, in kcal/mol. + +For each molecule, BDEs are computed as: BDE = E(radical) + E(H) − E(molecule), where +energies are evaluated on DFT-optimised geometries. + +2. BDE rank + +Mean Kendall's τ rank correlation of predicted BDE rankings against reference rankings, +evaluated per molecule and averaged across all 60 compounds. + +3. Direct BDE (MLFF opt) + +Same as (1), but geometries are first relaxed using the MLFF before evaluating energies. + +4. BDE rank (MLFF opt) + +Same as (2), but using MLFF-optimised geometries. + + +Computational cost +------------------ + +Medium: the DFT geometry tests are fast, but the MLFF geometry optimisation tests may +take several minutes per model on CPU. + + +Data availability +----------------- + +Input structures: + +* Gelzinyte, E. et al. Transferable Machine Learning Interatomic Potential for Bond + Dissociation Energy Estimation. J. Chem. Theory Comput. 20, 164-177 (2024). + DOI: 10.1021/acs.jctc.3c00710 + +Reference data: + +* Same as input data +* B3LYP-D3BJ/def2-SV(P) diff --git a/ml_peg/analysis/molecular/BDEs/analyse_BDEs.py b/ml_peg/analysis/molecular/BDEs/analyse_BDEs.py new file mode 100644 index 000000000..44ca3315c --- /dev/null +++ b/ml_peg/analysis/molecular/BDEs/analyse_BDEs.py @@ -0,0 +1,641 @@ +"""Analyse bond dissociation energy benchmark.""" + +from __future__ import annotations + +from pathlib import Path + +from ase import units +from ase.io import read, write +import numpy as np +import pytest +from scipy import stats + +from ml_peg.analysis.utils.decorators import build_table, plot_parity +from ml_peg.analysis.utils.utils import ( + build_dispersion_name_map, + load_metrics_config, + mae, +) +from ml_peg.app import APP_ROOT +from ml_peg.calcs import CALCS_ROOT +from ml_peg.models.get_models import get_model_names +from ml_peg.models.models import current_models + +MODELS = get_model_names(current_models) +D3_MODEL_NAMES = build_dispersion_name_map(MODELS) +CALC_PATH = CALCS_ROOT / "molecular" / "BDEs" / "outputs" +OUT_PATH = APP_ROOT / "data" / "molecular" / "BDEs" + +METRICS_CONFIG_PATH = Path(__file__).with_name("metrics.yml") +DEFAULT_THRESHOLDS, DEFAULT_TOOLTIPS, DEFAULT_WEIGHTS = load_metrics_config( + METRICS_CONFIG_PATH +) + +# Unit conversion +EV_TO_KCAL_PER_MOL = units.mol / units.kcal + +# Populated by dft_geometry_bdes / mlff_geometry_bdes; shared with downstream +# fixtures without appearing in the returned dict (which would add a scatter trace). +_COMPOUND_LABELS: list[str] = [] +_MLFF_COMPOUND_LABELS: list[str] = [] + + +def into_dict_of_labels(atoms, key): + """ + Group atoms based on their atoms.info[key] labels. + + Parameters + ---------- + atoms : list[Atoms] + Atoms objects with info[key] labels to group. + key : str + Key in atoms.info to group by. + + Returns + ------- + dict[str, list[Atoms]] + Dictionary mapping each label value to a list of Atoms objects. + """ + dict_out = {} + for at in atoms: + label = at.info[key] + if label not in dict_out: + dict_out[label] = [] + dict_out[label].append(at) + return dict_out + + +def get_bde(mol_energy, rad_energy, isolated_h_energy): + """ + Compute bond dissociation energy. + + Parameters + ---------- + mol_energy : float + Energy of the intact molecule. + rad_energy : float + Energy of the molecule with the hydrogen atom removed. + isolated_h_energy : float + Energy of the isolated hydrogen atom. + + Returns + ------- + float + Bond dissociation energy. + """ + return rad_energy + isolated_h_energy - mol_energy + + +def get_hover_data_labels(key) -> list[str]: + """ + Get labels based on the key. + + Parameters + ---------- + key : str + Key in atoms.info to retrieve labels from. + + Returns + ------- + list[str] + List of system names from atoms.info entries. + """ + labels = [] + for model_name in MODELS: + model_dir = CALC_PATH / model_name + if model_dir.exists(): + xyz_files = sorted(model_dir.glob("*.xyz")) + if xyz_files: + for xyz_file in xyz_files: + atoms = read(xyz_file, ":") + labels += [at.info[key] for at in atoms] + break + return labels + + +def group_values_by_labels( + values: list[float], labels: list[str] +) -> dict[str, list[float]]: + """ + Group values by their corresponding string labels. + + Parameters + ---------- + values : list[float] + Values to group. + labels : list[str] + Labels corresponding to each value. + + Returns + ------- + dict[str, list[float]] + Dictionary mapping each label to a list of its values. + """ + results = {} + for label, value in zip(labels, values, strict=False): + if label not in results: + results[label] = [] + results[label].append(value) + + return results + + +def process_bdes_per_compound(all_atoms, prefix): + """ + Compute BDEs for given atoms and property prefix. + + Parameters + ---------- + all_atoms + List of Atoms with appropriate labels for radicals, + molecules and isolated atoms. + prefix + "dft_" or "pred_" to fetch the appropriate energy. + + Returns + ------- + dict[str: list[float]] + List of computed bond dissociation energies, for each compound. + """ + compound_dict = into_dict_of_labels(all_atoms, key="compound") + + isolated_atoms = compound_dict.pop("isolated_atom") + isolated_atoms_dict = into_dict_of_labels(isolated_atoms, "element") + isolated_h_energy = isolated_atoms_dict["H"][0].info[f"{prefix}energy"] + + all_bdes = {} + for compound_name, compound_atoms in compound_dict.items(): + mol_rad_dict = into_dict_of_labels(compound_atoms, "mol_or_rad") + if "rad" not in mol_rad_dict: + continue + + all_bdes[compound_name] = [] + + mol = mol_rad_dict["mol"] + assert len(mol) == 1 + mol = mol[0] + mol_energy = mol.info[f"{prefix}energy"] + + for rad in mol_rad_dict["rad"]: + rad_energy = rad.info[f"{prefix}energy"] + bde = get_bde( + mol_energy=mol_energy, + rad_energy=rad_energy, + isolated_h_energy=isolated_h_energy, + ) + all_bdes[compound_name].append(bde * EV_TO_KCAL_PER_MOL) + + return all_bdes + + +def mean_kendalls_tau(ref_by_label, pred_by_label): + """ + Compute mean Kendall's tau across labels. + + Parameters + ---------- + ref_by_label : dict[str, list] + Reference ranks grouped by compound label. + pred_by_label : dict[str, list] + Predicted ranks grouped by compound label. + + Returns + ------- + float + Mean Kendall's tau correlation across all labels. + """ + kendalls_taus = [] + for label in ref_by_label.keys(): + ref = ref_by_label[label] + pred = pred_by_label[label] + correlation = stats.kendalltau(ref, pred).correlation + kendalls_taus.append(correlation) + return np.mean(kendalls_taus) + + +# --------------------------------------------------------------------------- +# Shared helpers +# --------------------------------------------------------------------------- + + +def _load_bdes(xyz_suffix: str, labels_out: list) -> dict[str, list]: + """ + Load BDEs from output xyz files for all models. + + Parameters + ---------- + xyz_suffix + Suffix of the xyz filename, e.g. "dft_opt" or "mlff_opt". + labels_out + List to populate in-place with compound labels for each BDE entry. + + Returns + ------- + dict[str, list] + Dictionary of reference and predicted BDEs keyed by model name and "ref". + """ + xyz_filename = f"cytochrome_p450_substrates.{xyz_suffix}.xyz" + results = {"ref": []} | {mlip: [] for mlip in MODELS} + labels_out.clear() + ref_stored = False + + for model_name in MODELS: + model_dir = CALC_PATH / model_name + + if not model_dir.exists(): + continue + + xyz_files = sorted(model_dir.glob(xyz_filename)) + if not xyz_files: + continue + + for xyz_file in xyz_files: + all_atoms = read(xyz_file, index=":") + pred_bdes_by_label = process_bdes_per_compound(all_atoms, "pred_") + for compound_label, pred_bdes in pred_bdes_by_label.items(): + if not ref_stored: + labels_out += [compound_label] * len(pred_bdes) + results[model_name] += pred_bdes + + if not ref_stored: + dft_bdes_by_label = process_bdes_per_compound(all_atoms, "dft_") + for dft_bdes in dft_bdes_by_label.values(): + results["ref"] += dft_bdes + ref_stored = True + + return results + + +def _compute_ranks(bdes: dict, labels: list) -> dict[str, list]: + """ + Compute BDE ranks per compound for all models and reference. + + Parameters + ---------- + bdes + Dictionary of reference and predicted BDEs keyed by model name and "ref". + labels + Compound label for each BDE entry. + + Returns + ------- + dict[str, list] + Dictionary of reference and predicted BDE ranks. + """ + results = {"ref": []} | {mlip: [] for mlip in MODELS} + for model_name in results.keys(): + bdes_by_compound = group_values_by_labels( + values=bdes[model_name], + labels=labels, + ) + for compound_bdes in bdes_by_compound.values(): + ranks = np.argsort(np.argsort(compound_bdes)) + results[model_name] += list(ranks) + return results + + +def _compute_bde_errors(bdes: dict) -> dict[str, float]: + """ + Compute mean absolute error of predicted BDEs against reference. + + Parameters + ---------- + bdes + Dictionary of reference and predicted BDEs keyed by model name and "ref". + + Returns + ------- + dict[str, float] + MAE for each model. + """ + return { + model_name: mae(bdes["ref"], bdes[model_name]) if bdes[model_name] else None + for model_name in MODELS + } + + +def _save_struct_files(xyz_suffix: str) -> None: + """ + Save individual radical XYZ files for app visualisation. + + Writes one file per BDE data point (one per radical) in the same order + as ``_load_bdes``, so that scatter-point indices map to structure files. + Saves for every model that has the relevant output file. + + Parameters + ---------- + xyz_suffix + Suffix of the xyz filename, e.g. ``"dft_opt"`` or ``"mlff_opt"``. + """ + xyz_filename = f"cytochrome_p450_substrates.{xyz_suffix}.xyz" + for model_name in MODELS: + model_dir = CALC_PATH / model_name + if not model_dir.exists(): + continue + xyz_files = sorted(model_dir.glob(xyz_filename)) + if not xyz_files: + continue + + struct_dir = OUT_PATH / model_name / xyz_suffix + struct_dir.mkdir(parents=True, exist_ok=True) + + for xyz_file in xyz_files: + all_atoms = read(xyz_file, index=":") + compound_dict = into_dict_of_labels(all_atoms, key="compound") + compound_dict.pop("isolated_atom", None) + + idx = 0 + for compound_atoms in compound_dict.values(): + mol_rad_dict = into_dict_of_labels(compound_atoms, "mol_or_rad") + if "rad" not in mol_rad_dict: + continue + for rad in mol_rad_dict["rad"]: + write(struct_dir / f"{idx}.xyz", rad) + idx += 1 + + +def _compute_rank_correlations(ranks: dict, labels: list) -> dict[str, float]: + """ + Compute mean Kendall's tau rank correlation against reference ranks. + + Parameters + ---------- + ranks + Dictionary of reference and predicted BDE ranks keyed by model name and "ref". + labels + Compound label for each rank entry. + + Returns + ------- + dict[str, float] + Mean Kendall's tau for each model. + """ + ref_ranks_by_compound = group_values_by_labels( + values=ranks["ref"], + labels=labels, + ) + results = {} + for model_name in MODELS: + if ranks[model_name]: + pred_ranks_by_compound = group_values_by_labels( + values=ranks[model_name], + labels=labels, + ) + results[model_name] = mean_kendalls_tau( + ref_by_label=ref_ranks_by_compound, + pred_by_label=pred_ranks_by_compound, + ) + else: + results[model_name] = None + return results + + +# --------------------------------------------------------------------------- +# DFT-geometry fixtures +# --------------------------------------------------------------------------- + + +@pytest.fixture +@plot_parity( + filename=OUT_PATH / "figure.CYP3A4.dft_opt_geometry.BDEs.json", + title="Bond Dissociation Energies on DFT Geometries", + x_label="Predicted BDE / kcal/mol", + y_label="Reference BDE / kcal/mol", + hoverdata={ + "Compound": get_hover_data_labels("compound"), + }, +) +def dft_geometry_bdes() -> dict[str, list]: + """ + Compute BDEs for all sp3 H atoms on DFT optimised geometries. + + Returns + ------- + dict[str, list] + Dictionary of reference and predicted BDEs and labels of the + compound for the corresponding BDE. + """ + result = _load_bdes("dft_opt", _COMPOUND_LABELS) + _save_struct_files("dft_opt") + return result + + +@pytest.fixture +@plot_parity( + filename=OUT_PATH / "figure.CYP3A4.dft_opt_geometry.BDE_ranks.json", + title="Bond Dissociation Energy ranks on DFT Geometries", + x_label="Predicted BDE rank", + y_label="Reference BDE rank", + hoverdata={ + "Compound": get_hover_data_labels("compound"), + }, +) +def dft_geometry_bde_ranks(dft_geometry_bdes) -> dict[str, list]: + """ + Compute BDE ranks for all sp3 H atoms using DFT optimised geometries. + + Parameters + ---------- + dft_geometry_bdes : dict[str, list] + Dictionary of reference and predicted BDEs. + + Returns + ------- + dict[str, list] + Dictionary of reference and predicted BDE ranks. + """ + return _compute_ranks(dft_geometry_bdes, _COMPOUND_LABELS) + + +@pytest.fixture +def dft_geometry_bde_errors(dft_geometry_bdes) -> dict[str, float]: + """ + Get mean absolute error for bond dissociation energies. + + Parameters + ---------- + dft_geometry_bdes + Dictionary of reference and predicted BDEs. + + Returns + ------- + dict[str, float] + Dictionary of predicted BDE errors for all models. + """ + return _compute_bde_errors(dft_geometry_bdes) + + +@pytest.fixture +def dft_geometry_bde_rank_correlations(dft_geometry_bde_ranks) -> dict[str, float]: + """ + Compute mean Kendall's tau rank correlation across all molecules. + + For each molecule, sp3 C-H bond strengths are ranked from lowest to + highest by both DFT and MLIP, and the correlation is measured by + Kendall's tau. The final metric is the average across all molecules. + + Parameters + ---------- + dft_geometry_bde_ranks : dict[str, list] + Dictionary of reference and predicted BDE ranks. + + Returns + ------- + dict[str, float] + Dictionary of predicted BDE rank correlation for all models. + """ + return _compute_rank_correlations(dft_geometry_bde_ranks, _COMPOUND_LABELS) + + +# --------------------------------------------------------------------------- +# MLFF-geometry fixtures +# --------------------------------------------------------------------------- + + +@pytest.fixture +@plot_parity( + filename=OUT_PATH / "figure.CYP3A4.mlff_opt_geometry.BDEs.json", + title="Bond Dissociation Energies on MLFF Geometries", + x_label="Predicted BDE / kcal/mol", + y_label="Reference BDE / kcal/mol", + hoverdata={ + "Compound": get_hover_data_labels("compound"), + }, +) +def mlff_geometry_bdes() -> dict[str, list]: + """ + Compute BDEs for all sp3 H atoms on MLFF optimised geometries. + + Returns + ------- + dict[str, list] + Dictionary of reference and predicted BDEs and labels of the + compound for the corresponding BDE. + """ + result = _load_bdes("mlff_opt", _MLFF_COMPOUND_LABELS) + _save_struct_files("mlff_opt") + return result + + +@pytest.fixture +@plot_parity( + filename=OUT_PATH / "figure.CYP3A4.mlff_opt_geometry.BDE_ranks.json", + title="Bond Dissociation Energy ranks on MLFF Geometries", + x_label="Predicted BDE rank", + y_label="Reference BDE rank", + hoverdata={ + "Compound": get_hover_data_labels("compound"), + }, +) +def mlff_geometry_bde_ranks(mlff_geometry_bdes) -> dict[str, list]: + """ + Compute BDE ranks for all sp3 H atoms using MLFF optimised geometries. + + Parameters + ---------- + mlff_geometry_bdes : dict[str, list] + Dictionary of reference and predicted BDEs. + + Returns + ------- + dict[str, list] + Dictionary of reference and predicted BDE ranks. + """ + return _compute_ranks(mlff_geometry_bdes, _MLFF_COMPOUND_LABELS) + + +@pytest.fixture +def mlff_geometry_bde_errors(mlff_geometry_bdes) -> dict[str, float]: + """ + Get mean absolute error for bond dissociation energies on MLFF geometries. + + Parameters + ---------- + mlff_geometry_bdes + Dictionary of reference and predicted BDEs. + + Returns + ------- + dict[str, float] + Dictionary of predicted BDE errors for all models. + """ + return _compute_bde_errors(mlff_geometry_bdes) + + +@pytest.fixture +def mlff_geometry_bde_rank_correlations(mlff_geometry_bde_ranks) -> dict[str, float]: + """ + Compute mean Kendall's tau rank correlation across all molecules on MLFF geometries. + + For each molecule, sp3 C-H bond strengths are ranked from lowest to + highest by both DFT and MLIP, and the correlation is measured by + Kendall's tau. The final metric is the average across all molecules. + + Parameters + ---------- + mlff_geometry_bde_ranks : dict[str, list] + Dictionary of reference and predicted BDE ranks. + + Returns + ------- + dict[str, float] + Dictionary of predicted BDE rank correlation for all models. + """ + return _compute_rank_correlations(mlff_geometry_bde_ranks, _MLFF_COMPOUND_LABELS) + + +# --------------------------------------------------------------------------- +# Metrics table +# --------------------------------------------------------------------------- + + +@pytest.fixture +@build_table( + filename=OUT_PATH / "metrics_table.CYP3A4.dft_opt_geometry.BDEs.json", + metric_tooltips=DEFAULT_TOOLTIPS, + thresholds=DEFAULT_THRESHOLDS, + mlip_name_map=D3_MODEL_NAMES, +) +def metrics( + dft_geometry_bde_errors: dict[str, float], + dft_geometry_bde_rank_correlations: dict[str, float], + mlff_geometry_bde_errors: dict[str, float], + mlff_geometry_bde_rank_correlations: dict[str, float], +) -> dict[str, dict]: + """ + Get all BDE metrics. + + Parameters + ---------- + dft_geometry_bde_errors + Mean absolute errors on DFT-relaxed structures. + dft_geometry_bde_rank_correlations + Mean Kendall's tau across predicted and reference BDE ranks on DFT geometries. + mlff_geometry_bde_errors + Mean absolute errors on MLFF-relaxed structures. + mlff_geometry_bde_rank_correlations + Mean Kendall's tau across predicted and reference BDE ranks on MLFF geometries. + + Returns + ------- + dict[str, dict] + Metric names and values for all models. + """ + return { + "Direct BDE": dft_geometry_bde_errors, + "BDE rank": dft_geometry_bde_rank_correlations, + "Direct BDE (MLFF opt)": mlff_geometry_bde_errors, + "BDE rank (MLFF opt)": mlff_geometry_bde_rank_correlations, + } + + +def test_bdes(metrics: dict[str, dict]) -> None: + """ + Run BDEs test. + + Parameters + ---------- + metrics + All BDEs metrics. + """ + return diff --git a/ml_peg/analysis/molecular/BDEs/metrics.yml b/ml_peg/analysis/molecular/BDEs/metrics.yml new file mode 100644 index 000000000..3d6af0b25 --- /dev/null +++ b/ml_peg/analysis/molecular/BDEs/metrics.yml @@ -0,0 +1,25 @@ +metrics: + Direct BDE: + good: 0.5 + bad: 50.0 + unit: kcal/mol + tooltip: "Mean Absolute Error on DFT-relaxed structures" + level of theory: B3LYP-D3BJ/def2-SV(P) + Direct BDE (MLFF opt): + good: 0.5 + bad: 50.0 + unit: kcal/mol + tooltip: "Mean Absolute Error on MLFF-relaxed structures" + level of theory: B3LYP-D3BJ/def2-SV(P) + BDE rank: + good: 1.0 + bad: 0.5 + unit: null + tooltip: "Mean Kendal rank correlation coefficient" + level of theory: B3LYP-D3BJ/def2-SV(P) + BDE rank (MLFF opt): + good: 1.0 + bad: 0.5 + unit: null + tooltip: "Mean Kendall rank correlation coefficient on MLFF-relaxed structures" + level of theory: B3LYP-D3BJ/def2-SV(P) diff --git a/ml_peg/app/molecular/BDEs/app_BDEs.py b/ml_peg/app/molecular/BDEs/app_BDEs.py new file mode 100644 index 000000000..9ba1716c6 --- /dev/null +++ b/ml_peg/app/molecular/BDEs/app_BDEs.py @@ -0,0 +1,240 @@ +"""Run BDEs app.""" + +from __future__ import annotations + +from dash import Dash, Input, Output, callback +from dash.html import Div, Iframe + +from ml_peg.app import APP_ROOT +from ml_peg.app.base_app import BaseApp +from ml_peg.app.utils.build_callbacks import ( + plot_from_table_column, + struct_from_scatter, +) +from ml_peg.app.utils.load import read_plot +from ml_peg.app.utils.weas import generate_weas_html +from ml_peg.models.get_models import get_model_names +from ml_peg.models.models import current_models + +# Get all models +MODELS = get_model_names(current_models) +BENCHMARK_NAME = "Bond Dissociation Energies" +DOCS_URL = "https://ddmms.github.io/ml-peg/user_guide/benchmarks/molecular.html#bdes" +DATA_PATH = APP_ROOT / "data" / "molecular" / "BDEs" + + +class BDEsApp(BaseApp): + """BDEs benchmark app layout and callbacks.""" + + def register_callbacks(self) -> None: + """Register callbacks to app.""" + scatter_bdes = read_plot( + DATA_PATH / "figure.CYP3A4.dft_opt_geometry.BDEs.json", + id=f"{BENCHMARK_NAME}-figure", + ) + scatter_ranks = read_plot( + DATA_PATH / "figure.CYP3A4.dft_opt_geometry.BDE_ranks.json", + id=f"{BENCHMARK_NAME}-ranks-figure", + ) + scatter_mlff_bdes = read_plot( + DATA_PATH / "figure.CYP3A4.mlff_opt_geometry.BDEs.json", + id=f"{BENCHMARK_NAME}-mlff-figure", + ) + scatter_mlff_ranks = read_plot( + DATA_PATH / "figure.CYP3A4.mlff_opt_geometry.BDE_ranks.json", + id=f"{BENCHMARK_NAME}-mlff-ranks-figure", + ) + + def _get_structs(suffix: str) -> list[str]: + """ + Return asset paths for the first model with saved structure files. + + Parameters + ---------- + suffix + Geometry suffix, e.g. ``"dft_opt"`` or ``"mlff_opt"``. + + Returns + ------- + list[str] + Asset URL paths for each structure file, or empty list if none found. + """ + for model_name in MODELS: + structs_dir = DATA_PATH / model_name / suffix + xyz_files = sorted(structs_dir.glob("*.xyz"), key=lambda p: int(p.stem)) + if xyz_files: + return [ + f"assets/molecular/BDEs/{model_name}/{suffix}/{f.stem}.xyz" + for f in xyz_files + ] + return [] + + def _get_structs_per_model(suffix: str) -> dict[str, list[str]]: + """ + Return asset paths for each model that has saved structure files. + + Parameters + ---------- + suffix + Geometry suffix, e.g. ``"dft_opt"`` or ``"mlff_opt"``. + + Returns + ------- + dict[str, list[str]] + Mapping of model name to asset URL paths for its structure files. + """ + result = {} + for model_name in MODELS: + structs_dir = DATA_PATH / model_name / suffix + xyz_files = sorted(structs_dir.glob("*.xyz"), key=lambda p: int(p.stem)) + if xyz_files: + result[model_name] = [ + f"assets/molecular/BDEs/{model_name}/{suffix}/{f.stem}.xyz" + for f in xyz_files + ] + return result + + structs_dft = _get_structs("dft_opt") + structs_mlff_by_model = _get_structs_per_model("mlff_opt") + + plot_from_table_column( + table_id=self.table_id, + plot_id=f"{BENCHMARK_NAME}-figure-placeholder", + column_to_plot={ + "Direct BDE": scatter_bdes, + "BDE rank": scatter_ranks, + "Direct BDE (MLFF opt)": scatter_mlff_bdes, + "BDE rank (MLFF opt)": scatter_mlff_ranks, + }, + ) + + struct_from_scatter( + scatter_id=f"{BENCHMARK_NAME}-figure", + struct_id=f"{BENCHMARK_NAME}-struct-placeholder", + structs=structs_dft, + mode="struct", + ) + + struct_from_scatter( + scatter_id=f"{BENCHMARK_NAME}-ranks-figure", + struct_id=f"{BENCHMARK_NAME}-struct-placeholder", + structs=structs_dft, + mode="struct", + ) + + def _show_mlff_struct_from_click(click_data): + """ + Return structure viewer for the clicked MLFF scatter point. + + Parameters + ---------- + click_data + Clicked data point in scatter plot. + + Returns + ------- + Div + Visualised structure on plot click. + """ + if not click_data: + return Div("Click on a metric to view the structure.") + point = click_data["points"][0] + idx = point["pointNumber"] + curve_num = point["curveNumber"] + if curve_num >= len(MODELS): + return Div("No structures available for this model.") + model_name = MODELS[curve_num] + structs = structs_mlff_by_model.get(model_name, []) + if not structs: + return Div("No structures available for this model.") + return Div( + Iframe( + srcDoc=generate_weas_html(structs[idx], "struct", 0), + style={ + "height": "550px", + "width": "100%", + "border": "1px solid #ddd", + "borderRadius": "5px", + }, + ) + ) + + @callback( + Output( + f"{BENCHMARK_NAME}-struct-placeholder", "children", allow_duplicate=True + ), + Input(f"{BENCHMARK_NAME}-mlff-figure", "clickData"), + prevent_initial_call="initial_duplicate", + ) + def show_mlff_struct(click_data): + """ + Show structure for the clicked MLFF BDE scatter point. + + Parameters + ---------- + click_data + Clicked data point in scatter plot. + + Returns + ------- + Div + Visualised structure on plot click. + """ + return _show_mlff_struct_from_click(click_data) + + @callback( + Output( + f"{BENCHMARK_NAME}-struct-placeholder", "children", allow_duplicate=True + ), + Input(f"{BENCHMARK_NAME}-mlff-ranks-figure", "clickData"), + prevent_initial_call="initial_duplicate", + ) + def show_mlff_rank_struct(click_data): + """ + Show structure for the clicked MLFF BDE rank scatter point. + + Parameters + ---------- + click_data + Clicked data point in scatter plot. + + Returns + ------- + Div + Visualised structure on plot click. + """ + return _show_mlff_struct_from_click(click_data) + + +def get_app() -> BDEsApp: + """ + Get bond dissociation energy benchmark app. + + Returns + ------- + BDEsApp + Benchmark layout and callback registration. + """ + return BDEsApp( + name=BENCHMARK_NAME, + description="Bond Dissociation Energies", + docs_url=DOCS_URL, + table_path=DATA_PATH / "metrics_table.CYP3A4.dft_opt_geometry.BDEs.json", + extra_components=[ + Div(id=f"{BENCHMARK_NAME}-figure-placeholder"), + Div(id=f"{BENCHMARK_NAME}-struct-placeholder"), + ], + ) + + +if __name__ == "__main__": + # Create Dash app + full_app = Dash(__name__, assets_folder=DATA_PATH.parent.parent) + + # Construct layout and register callbacks + bdes_app = get_app() + full_app.layout = bdes_app.layout + bdes_app.register_callbacks() + + # Run app + full_app.run(port=8055, debug=True) diff --git a/ml_peg/calcs/molecular/BDEs/calc_BDEs.py b/ml_peg/calcs/molecular/BDEs/calc_BDEs.py new file mode 100644 index 000000000..e8b06f633 --- /dev/null +++ b/ml_peg/calcs/molecular/BDEs/calc_BDEs.py @@ -0,0 +1,102 @@ +""" +Calculate bond dissociation energies of drug-like molecules. + +Journal of Chemical Theory and Computation 2024 20 (1), 164-177 +DOI: 10.1021/acs.jctc.3c00710 +""" + +from __future__ import annotations + +from copy import copy +from pathlib import Path +from typing import Any + +from ase.io import read, write +from janus_core.calculations.geom_opt import GeomOpt +import pytest + +from ml_peg.calcs.utils.utils import download_s3_data +from ml_peg.models.get_models import load_models +from ml_peg.models.models import current_models + +MODELS = load_models(current_models) + +OUT_PATH = Path(__file__).parent / "outputs" + +DFT_OPT_FILENAME = "cytochrome_p450_substrates.dft_opt.xyz" + + +def evaluate_bde_structures( + model_name: str, + model: Any, + bde_dir: Path, + out_filename: str, + geom_opt: bool = False, +) -> None: + """ + Evaluate MLFF energies on BDE structures, optionally after geometry optimisation. + + Parameters + ---------- + model_name + Name of the model, used to determine the output directory. + model + Model object providing get_calculator() and add_d3_calculator(). + bde_dir + Directory containing the input xyz file. + out_filename + Name of the output xyz file. + geom_opt + If True, geometry-optimise each structure with the MLFF before + evaluating energies and forces. + """ + model.default_dtype = "float64" + model.dispersion_kwargs["dtype"] = "float64" + calc = model.get_calculator() + calc = model.add_d3_calculator(calc) + + mols_rads = read(bde_dir / DFT_OPT_FILENAME, ":") + + ats_out = [] + for at in mols_rads: + at.calc = copy(calc) + if geom_opt: + geomopt = GeomOpt(struct=at, filter_class=None, write_results=False) + geomopt.run() + at = geomopt.struct + at.info["pred_energy"] = at.get_potential_energy() + at.arrays["pred_forces"] = at.get_forces() + ats_out.append(at) + + write_dir = OUT_PATH / model_name + write_dir.mkdir(parents=True, exist_ok=True) + write(write_dir / out_filename, ats_out) + + +@pytest.mark.parametrize("mlip", MODELS.items()) +def test_bond_dissociation_energy(mlip: tuple[str, Any]) -> None: + """ + Calculate C-H bond dissociation energy of drug-like molecules. + + Parameters + ---------- + mlip + Name of model use and model to get calculator. + """ + model_name, model = mlip + + bde_dir = ( + download_s3_data( + key="inputs/molecular/BDEs/BDEs.zip", + filename="BDEs.zip", + ) + / "BDEs" + ) + + evaluate_bde_structures( + model_name=model_name, + model=model, + bde_dir=bde_dir, + out_filename=DFT_OPT_FILENAME, + geom_opt=False, + ) diff --git a/ml_peg/calcs/molecular/BDEs/calc_BDEs_mlff_opt.py b/ml_peg/calcs/molecular/BDEs/calc_BDEs_mlff_opt.py new file mode 100644 index 000000000..6f68638c2 --- /dev/null +++ b/ml_peg/calcs/molecular/BDEs/calc_BDEs_mlff_opt.py @@ -0,0 +1,55 @@ +""" +Calculate BDEs of drug-like molecules on MLFF-optimised geometries. + +Journal of Chemical Theory and Computation 2024 20 (1), 164-177 +DOI: 10.1021/acs.jctc.3c00710 +""" + +from __future__ import annotations + +from typing import Any + +import pytest + +from ml_peg.calcs.molecular.BDEs.calc_BDEs import evaluate_bde_structures +from ml_peg.calcs.utils.utils import download_s3_data +from ml_peg.models.get_models import load_models +from ml_peg.models.models import current_models + +MODELS = load_models(current_models) + +MLFF_OPT_FILENAME = "cytochrome_p450_substrates.mlff_opt.xyz" + + +@pytest.mark.parametrize("mlip", MODELS.items()) +def test_bond_dissociation_energy_mlff_opt(mlip: tuple[str, Any]) -> None: + """ + Calculate C-H BDEs of drug-like molecules on MLFF-optimised geometries. + + Geometry-optimises each structure (molecule, radical, isolated atom) using the + MLFF before evaluating energies and forces. DFT reference energies are preserved + from the input structures. Reference BDEs are computed from DFT energies at DFT + geometries. + + Parameters + ---------- + mlip + Name of model use and model to get calculator. + """ + model_name, model = mlip + + bde_dir = ( + download_s3_data( + key="inputs/molecular/BDEs/BDEs.zip", + filename="BDEs.zip", + ) + / "BDEs" + ) + + evaluate_bde_structures( + model_name=model_name, + model=model, + bde_dir=bde_dir, + out_filename=MLFF_OPT_FILENAME, + geom_opt=True, + )