From 3b0f613145819341ee207576681fffde6c1b6d7c Mon Sep 17 00:00:00 2001 From: Thomas Warford Date: Tue, 3 Feb 2026 13:40:47 +0100 Subject: [PATCH 01/20] WIP calculations --- .../calc_split_vacancies.py | 77 +++++++++++++++++++ 1 file changed, 77 insertions(+) create mode 100644 ml_peg/calcs/bulk_crystal/calc_split_vacancies/calc_split_vacancies.py diff --git a/ml_peg/calcs/bulk_crystal/calc_split_vacancies/calc_split_vacancies.py b/ml_peg/calcs/bulk_crystal/calc_split_vacancies/calc_split_vacancies.py new file mode 100644 index 000000000..68fa13242 --- /dev/null +++ b/ml_peg/calcs/bulk_crystal/calc_split_vacancies/calc_split_vacancies.py @@ -0,0 +1,77 @@ +"""Run calculations for elasticity benchmark.""" + +from __future__ import annotations + +from copy import deepcopy +from pathlib import Path +from typing import Any + +# from pymatgen.analysis import StructureMatcher +from ase.io import read, write +from ase.optimize import LBFGS +import pytest + +from ml_peg.models.get_models import load_models +from ml_peg.models.models import current_models + +MODELS = load_models(current_models) +# TODO: DATA_PATH = download_github_data(filename, github_uri) +DATA_PATH = Path("/u/twarford/dev/defect_data/out") +OUT_PATH = Path(__file__).parent / "outputs" + + +# def rmsd(atoms_1, atoms_2): + + +@pytest.mark.parametrize("mlip", MODELS.items()) +def test_relax_and_calculate_energy(mlip: tuple[str, Any]): + """ + Run calculations required for split vacancy formation energies. + + Parameters + ---------- + mlip + Name of model use and model to get calculator. + """ + model_name, model = mlip + calc = model.get_calculator() + + fmax = 0.03 + steps = 200 + + for material_dir in DATA_PATH.iterdir(): + for cation_dir in material_dir.iterdir(): + if not cation_dir.is_dir(): + continue # skip pristine supercell.xyz files (not used) + + nv_xyz_path = cation_dir / "normal_vacancy.xyz" + sv_xyz_path = cation_dir / "split_vacancy.xyz" + sv_from_nv_xyz_path = cation_dir / "normal_vacancy_to_split_vac.xyz" + + for atoms_path in [nv_xyz_path, sv_xyz_path, sv_from_nv_xyz_path]: + relaxed_atoms = [] + atoms_list = read(atoms_path, ":") + + for atoms in atoms_list: + atoms.calc = deepcopy(calc) + atoms.info["initial_energy"] = atoms.get_potential_energy() + + opt = LBFGS(atoms, logfile=None) + opt.run(fmax=fmax, steps=steps) + + atoms.info["relaxed_energy"] = atoms.get_potential_energy() + + relaxed_atoms.append(atoms) + break + + atoms_out_path = ( + OUT_PATH + / model_name + / material_dir.stem + / cation_dir.stem + / f"{atoms_path.stem}.xyz.gz" + ) + atoms_out_path.parent.mkdir(exist_ok=True, parents=True) + + write(relaxed_atoms, atoms_out_path) + break From 3c890aa7979937f35f0ad564d03715f302b84e1e Mon Sep 17 00:00:00 2001 From: Thomas Warford Date: Tue, 3 Feb 2026 13:38:13 +0000 Subject: [PATCH 02/20] bugfixes --- .../calc_split_vacancies.py | 28 +++++++++++-------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/ml_peg/calcs/bulk_crystal/calc_split_vacancies/calc_split_vacancies.py b/ml_peg/calcs/bulk_crystal/calc_split_vacancies/calc_split_vacancies.py index 68fa13242..3aa21df59 100644 --- a/ml_peg/calcs/bulk_crystal/calc_split_vacancies/calc_split_vacancies.py +++ b/ml_peg/calcs/bulk_crystal/calc_split_vacancies/calc_split_vacancies.py @@ -10,13 +10,14 @@ from ase.io import read, write from ase.optimize import LBFGS import pytest +from tqdm.auto import tqdm from ml_peg.models.get_models import load_models from ml_peg.models.models import current_models MODELS = load_models(current_models) # TODO: DATA_PATH = download_github_data(filename, github_uri) -DATA_PATH = Path("/u/twarford/dev/defect_data/out") +DATA_PATH = Path("/Users/tw/Downloads/out") OUT_PATH = Path(__file__).parent / "outputs" @@ -39,20 +40,25 @@ def test_relax_and_calculate_energy(mlip: tuple[str, Any]): fmax = 0.03 steps = 200 - for material_dir in DATA_PATH.iterdir(): - for cation_dir in material_dir.iterdir(): - if not cation_dir.is_dir(): - continue # skip pristine supercell.xyz files (not used) - + for material_dir in tqdm(list(DATA_PATH.iterdir())): + cation_dirs = [ + p for p in material_dir.iterdir() if p.is_dir() + ] # skip pristine supercell.xyz files (not used) + for cation_dir in tqdm(cation_dirs, leave=False): nv_xyz_path = cation_dir / "normal_vacancy.xyz" sv_xyz_path = cation_dir / "split_vacancy.xyz" sv_from_nv_xyz_path = cation_dir / "normal_vacancy_to_split_vac.xyz" - for atoms_path in [nv_xyz_path, sv_xyz_path, sv_from_nv_xyz_path]: + for atoms_path in tqdm( + [nv_xyz_path, sv_xyz_path, sv_from_nv_xyz_path], leave=False + ): + if not atoms_path.exists(): + continue + relaxed_atoms = [] atoms_list = read(atoms_path, ":") - for atoms in atoms_list: + for atoms in tqdm(atoms_list, leave=False): atoms.calc = deepcopy(calc) atoms.info["initial_energy"] = atoms.get_potential_energy() @@ -62,7 +68,7 @@ def test_relax_and_calculate_energy(mlip: tuple[str, Any]): atoms.info["relaxed_energy"] = atoms.get_potential_energy() relaxed_atoms.append(atoms) - break + # break atoms_out_path = ( OUT_PATH @@ -73,5 +79,5 @@ def test_relax_and_calculate_energy(mlip: tuple[str, Any]): ) atoms_out_path.parent.mkdir(exist_ok=True, parents=True) - write(relaxed_atoms, atoms_out_path) - break + write(atoms_out_path, relaxed_atoms) + # break From 4e3d27c27c459035e53cdff55afe659e9a88f7ad Mon Sep 17 00:00:00 2001 From: Thomas Warford Date: Tue, 3 Feb 2026 14:05:56 +0000 Subject: [PATCH 03/20] float32 to float64 --- .../bulk_crystal/calc_split_vacancies/calc_split_vacancies.py | 1 + 1 file changed, 1 insertion(+) diff --git a/ml_peg/calcs/bulk_crystal/calc_split_vacancies/calc_split_vacancies.py b/ml_peg/calcs/bulk_crystal/calc_split_vacancies/calc_split_vacancies.py index 3aa21df59..e9a54d9a2 100644 --- a/ml_peg/calcs/bulk_crystal/calc_split_vacancies/calc_split_vacancies.py +++ b/ml_peg/calcs/bulk_crystal/calc_split_vacancies/calc_split_vacancies.py @@ -35,6 +35,7 @@ def test_relax_and_calculate_energy(mlip: tuple[str, Any]): Name of model use and model to get calculator. """ model_name, model = mlip + model.default_dtype = "float64" calc = model.get_calculator() fmax = 0.03 From ac29957e87beefdcbf1263598f829c4afcd6bf7c Mon Sep 17 00:00:00 2001 From: Thomas Warford Date: Wed, 4 Feb 2026 09:50:10 +0000 Subject: [PATCH 04/20] very WIP --- .../analyse_split_vacancy.py | 133 ++++++++++++++++++ 1 file changed, 133 insertions(+) create mode 100644 ml_peg/analysis/bulk_crystal/analyse_split_vacancy/analyse_split_vacancy.py diff --git a/ml_peg/analysis/bulk_crystal/analyse_split_vacancy/analyse_split_vacancy.py b/ml_peg/analysis/bulk_crystal/analyse_split_vacancy/analyse_split_vacancy.py new file mode 100644 index 000000000..2f16b9635 --- /dev/null +++ b/ml_peg/analysis/bulk_crystal/analyse_split_vacancy/analyse_split_vacancy.py @@ -0,0 +1,133 @@ +from __future__ import annotations + +from pathlib import Path + +from ase.io import read +from pymatgen.analysis.structure_matcher import StructureMatcher +from pymatgen.core import Structure +import pytest +from scipy.stats import spearmanr +from tqdm.auto import tqdm + +from ml_peg.analysis.utils.utils import ( + load_metrics_config, +) +from ml_peg.calcs import CALCS_ROOT +from ml_peg.models.get_models import get_model_names +from ml_peg.models.models import current_models + +MODELS = get_model_names(current_models) +CALC_PATH = CALCS_ROOT / "bulk_crystal" / "split_vacancy" / "outputs" +# OUT_PATH = APP_ROOT / "data" / "bulk_crystal" / "split_vacancy" + +METRICS_CONFIG_PATH = Path(__file__).with_name("metrics.yml") +DEFAULT_THRESHOLDS, DEFAULT_TOOLTIPS, DEFAULT_WEIGHTS = load_metrics_config( + METRICS_CONFIG_PATH +) + +# same setting as MatBench +# https://github.com/janosh/matbench-discovery/blob/93cc6907ac08b4adaa8391ccc4adf9c015c0dd61/matbench_discovery/structure/symmetry.py#L124 +STRUCTURE_MATCHER = StructureMatcher(stol=1.0, scale=False) + + +def get_rmsd(atoms_1, atoms_2): + rmsd, max_dist = STRUCTURE_MATCHER.get_rms_dist( + Structure.from_ase_atoms(atoms_1), Structure.from_ase_atoms(atoms_2) + ) + + return rmsd + + +def get_hoverdata() -> tuple[list, list, list]: + # TODO: RMSD could be good hoverdata - think about this. Only needed for formation energy parity plot. + # TODO: add cell charge? + mp_ids = [] + formulae = [] + vacant_cations = [] + + model_dir = model_dir = CALC_PATH / MODELS[0] + for material_dir in tqdm(list(model_dir.iterdir())): + split_dir_name = material_dir.stem.split("-") + bulk_formula = split_dir_name[0] + mp_id = f"mp-{split_dir_name[-1]}" + + cation_dirs = [ + p for p in material_dir.iterdir() if p.is_dir() + ] # skip pristine supercell.xyz files if present (not used) + + for cation_dir in cation_dirs: + cation = cation_dir.stem + + mp_ids.append(mp_id) + formulae.append(bulk_formula) + vacant_cations.append(cation) + + return mp_ids, formulae, vacant_cations + + +@pytest.fixture # cache outputs +def build_results() -> tuple[dict[str, list], dict[str, list], dict[str, list]]: + preference_energy_threshold = 0 # TODO: confirm + + result_formation_energy = {"ref": []} | {mlip: [] for mlip in MODELS} + result_spearmans_coefficient = {"ref": []} | {mlip: [] for mlip in MODELS} + # TODO: investigate Kendall rank correlation + result_rmsd = {mlip: [] for mlip in MODELS} + + ref_stored = False + for model_name in MODELS: + model_dir = CALC_PATH / model_name + + if not model_dir.exists(): + continue + + for material_dir in tqdm(list(model_dir.iterdir())): + cation_dirs = [ + p for p in material_dir.iterdir() if p.is_dir() + ] # skip pristine supercell.xyz files if present (not used) + + for cation_dir in tqdm(cation_dirs, leave=False): + cation = cation_dir.stem + + nv_xyz_path = cation_dir / "normal_vacancy.xyz.gz" + sv_xyz_path = cation_dir / "split_vacancy.xyz.gz" + # sv_from_nv_xyz_path = cation_dir / "normal_vacancy_to_split_vac.xyz.gz" + + nv_atoms_list = read(nv_xyz_path, ":") + sv_atoms_list = read(sv_xyz_path, ":") + + nv_energies = [at.info["relaxed_energy"] for at in nv_atoms_list] + sv_energies = [at.info["relaxed_energy"] for at in sv_atoms_list] + + # TODO (later): result_rmsd[model_name] + # load original structure + # use get_rmsd + + formula = nv_atoms_list[0].info["name"] + cell_charge = nv_atoms_list[0][-1].info["cell_charge"] # TODO + + sv_formation_energy = min(sv_energies) - min(nv_energies) + sv_preferred = sv_formation_energy < preference_energy_threshold + + if not ref_stored: + ref_nv_energies = [at.info["ref_energy"] for at in nv_atoms_list] + ref_sv_energies = [at.info["ref_energy"] for at in sv_atoms_list] + + ref_sv_formation_energy = min(ref_sv_energies) - min( + ref_nv_energies + ) + ref_sv_preferred = ( + ref_sv_formation_energy < preference_energy_threshold + ) + + result_formation_energy["ref"] = ref_sv_formation_energy + + # calculate metrics + spearmans_coefficient = spearmanr( + nv_energies + sv_energies, ref_sv_energies + ref_nv_energies + ) + result_rmsd = get_rmsd() # TODO: RMSD of what?? + + ref_stored = False + + return result_formation_energy, result_spearmans_coefficient, result_rmsd From 358b32b067974f52c8a85e525bcb47c4e23d5e51 Mon Sep 17 00:00:00 2001 From: Thomas Warford Date: Wed, 4 Feb 2026 10:42:43 +0000 Subject: [PATCH 05/20] passed, still many TODOs --- .../analyse_split_vacancy.py | 109 ++++++++++++++++-- 1 file changed, 101 insertions(+), 8 deletions(-) diff --git a/ml_peg/analysis/bulk_crystal/analyse_split_vacancy/analyse_split_vacancy.py b/ml_peg/analysis/bulk_crystal/analyse_split_vacancy/analyse_split_vacancy.py index 2f16b9635..cedd9f532 100644 --- a/ml_peg/analysis/bulk_crystal/analyse_split_vacancy/analyse_split_vacancy.py +++ b/ml_peg/analysis/bulk_crystal/analyse_split_vacancy/analyse_split_vacancy.py @@ -10,15 +10,17 @@ from tqdm.auto import tqdm from ml_peg.analysis.utils.utils import ( - load_metrics_config, + load_metrics_config, mae ) from ml_peg.calcs import CALCS_ROOT from ml_peg.models.get_models import get_model_names from ml_peg.models.models import current_models +from ml_peg.analysis.utils.decorators import build_table, plot_parity +from ml_peg.app import APP_ROOT MODELS = get_model_names(current_models) CALC_PATH = CALCS_ROOT / "bulk_crystal" / "split_vacancy" / "outputs" -# OUT_PATH = APP_ROOT / "data" / "bulk_crystal" / "split_vacancy" +OUT_PATH = APP_ROOT / "data" / "bulk_crystal" / "split_vacancy" METRICS_CONFIG_PATH = Path(__file__).with_name("metrics.yml") DEFAULT_THRESHOLDS, DEFAULT_TOOLTIPS, DEFAULT_WEIGHTS = load_metrics_config( @@ -64,13 +66,15 @@ def get_hoverdata() -> tuple[list, list, list]: return mp_ids, formulae, vacant_cations +MP_IDS, BULK_FORMULAE, VACANT_CATIONS = get_hoverdata() @pytest.fixture # cache outputs def build_results() -> tuple[dict[str, list], dict[str, list], dict[str, list]]: preference_energy_threshold = 0 # TODO: confirm - result_formation_energy = {"ref": []} | {mlip: [] for mlip in MODELS} - result_spearmans_coefficient = {"ref": []} | {mlip: [] for mlip in MODELS} + result_formation_energy = {"ref": []} | {mlip: [] for mlip in MODELS} # formation energy for every material-cation pair + result_spearmans_coefficient = {mlip: [] for mlip in MODELS} # spearmans coefficient for every material-cation pair + result_rmsd = {mlip: [] for mlip in MODELS} # RMSD error for every material-cation pair # TODO: investigate Kendall rank correlation result_rmsd = {mlip: [] for mlip in MODELS} @@ -91,6 +95,7 @@ def build_results() -> tuple[dict[str, list], dict[str, list], dict[str, list]]: nv_xyz_path = cation_dir / "normal_vacancy.xyz.gz" sv_xyz_path = cation_dir / "split_vacancy.xyz.gz" + if not (nv_xyz_path.exists() and sv_xyz_path.exists()): continue # TODO: remove!!! # sv_from_nv_xyz_path = cation_dir / "normal_vacancy_to_split_vac.xyz.gz" nv_atoms_list = read(nv_xyz_path, ":") @@ -103,8 +108,8 @@ def build_results() -> tuple[dict[str, list], dict[str, list], dict[str, list]]: # load original structure # use get_rmsd - formula = nv_atoms_list[0].info["name"] - cell_charge = nv_atoms_list[0][-1].info["cell_charge"] # TODO + # formula = nv_atoms_list[0].info["name"] + # cell_charge = nv_atoms_list[0][-1].info["cell_charge"] # TODO sv_formation_energy = min(sv_energies) - min(nv_energies) sv_preferred = sv_formation_energy < preference_energy_threshold @@ -120,14 +125,102 @@ def build_results() -> tuple[dict[str, list], dict[str, list], dict[str, list]]: ref_sv_formation_energy < preference_energy_threshold ) - result_formation_energy["ref"] = ref_sv_formation_energy + result_formation_energy["ref"].append(ref_sv_formation_energy) # calculate metrics spearmans_coefficient = spearmanr( nv_energies + sv_energies, ref_sv_energies + ref_nv_energies ) - result_rmsd = get_rmsd() # TODO: RMSD of what?? + # result_rmsd[model_name] = get_rmsd() # TODO: RMSD of what?? + result_formation_energy[model_name].append(sv_formation_energy) + result_spearmans_coefficient[model_name].append(spearmans_coefficient) + ref_stored = False return result_formation_energy, result_spearmans_coefficient, result_rmsd + +@pytest.fixture +@plot_parity( + filename=OUT_PATH / "figure_formation_energies_dft.json", + title="Split Vacancy Formation Energy (from Normal Vacancy)", + x_label="Predicted Split Vacancy Formation Energy / eV", + y_label="DFT Split Vacancy Formation Energy / eV", + hoverdata={ + "Materials Project ID": MP_IDS, + "Formula": BULK_FORMULAE, + "Vacant Cation": VACANT_CATIONS, + }, +) +def formation_energies_dft(build_results) -> dict[str, list]: + """ + Get DFT and predicted lattice constant for all crystals. + + Returns + ------- + dict[str, list] + Dictionary of DFT and predicted lattice constants. + """ + result_formation_energies, _, _ = build_results + return result_formation_energies + +@pytest.fixture +def formation_energy_dft_mae(formation_energies_dft) -> dict[str, float]: + """ + Get mean absolute error for split-vancacy formation energies compared to DFT. + + Parameters + ---------- + lattice_constants_dft + Dictionary of DFT and predicted lattice constants. + + Returns + ------- + dict[str, float] + Dictionary of predicted lattice constant errors for all models. + """ + results = {} + for model_name in MODELS: + results[model_name] = mae( + formation_energies_dft["ref"], formation_energies_dft[model_name] + ) + return results + +@pytest.fixture +@build_table( + filename=OUT_PATH / "lattice_constants_metrics_table.json", + metric_tooltips=DEFAULT_TOOLTIPS, + thresholds=DEFAULT_THRESHOLDS, +) +def metrics( + formation_energy_dft_mae: dict[str, float]#, metric_2: dict[str, float] +) -> dict[str, dict]: + """ + Get all new benchmark metrics. + + Parameters + ---------- + metric_1 + Metric 1 value for all models. + metric_2 + Metric 2 value for all models. + + Returns + ------- + dict[str, dict] + Metric names and values for all models. + """ + return { + "Metric 1": formation_energy_dft_mae, + } + +def test_new_benchmark(metrics: dict[str, dict]) -> None: + """ + Run new benchmark analysis. + + Parameters + ---------- + metrics + All new benchmark metric names and dictionary of values for each model. + """ + return \ No newline at end of file From 819918b821281f720ff5e3583c42fd23815aa872 Mon Sep 17 00:00:00 2001 From: Thomas Warford Date: Wed, 4 Feb 2026 11:02:17 +0000 Subject: [PATCH 06/20] rename --- .../calc_split_vacancies.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) rename ml_peg/calcs/bulk_crystal/{calc_split_vacancies => split_vacancy}/calc_split_vacancies.py (88%) diff --git a/ml_peg/calcs/bulk_crystal/calc_split_vacancies/calc_split_vacancies.py b/ml_peg/calcs/bulk_crystal/split_vacancy/calc_split_vacancies.py similarity index 88% rename from ml_peg/calcs/bulk_crystal/calc_split_vacancies/calc_split_vacancies.py rename to ml_peg/calcs/bulk_crystal/split_vacancy/calc_split_vacancies.py index e9a54d9a2..24be70214 100644 --- a/ml_peg/calcs/bulk_crystal/calc_split_vacancies/calc_split_vacancies.py +++ b/ml_peg/calcs/bulk_crystal/split_vacancy/calc_split_vacancies.py @@ -50,12 +50,14 @@ def test_relax_and_calculate_energy(mlip: tuple[str, Any]): sv_xyz_path = cation_dir / "split_vacancy.xyz" sv_from_nv_xyz_path = cation_dir / "normal_vacancy_to_split_vac.xyz" - for atoms_path in tqdm( - [nv_xyz_path, sv_xyz_path, sv_from_nv_xyz_path], leave=False - ): - if not atoms_path.exists(): - continue + if not (nv_xyz_path.exists() and sv_xyz_path.exists()): + continue + atoms_paths = [nv_xyz_path, sv_xyz_path] + if sv_from_nv_xyz_path.exists(): + atoms_paths += sv_from_nv_xyz_path + + for atoms_path in tqdm(atoms_paths, leave=False): relaxed_atoms = [] atoms_list = read(atoms_path, ":") From b1f9098d90a72ae658b26e5e86e87ad3eabbdb1c Mon Sep 17 00:00:00 2001 From: Thomas Warford Date: Wed, 4 Feb 2026 11:10:52 +0000 Subject: [PATCH 07/20] remove sv_from_nv_xyz_path for simplicity --- .../calcs/bulk_crystal/split_vacancy/calc_split_vacancies.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/ml_peg/calcs/bulk_crystal/split_vacancy/calc_split_vacancies.py b/ml_peg/calcs/bulk_crystal/split_vacancy/calc_split_vacancies.py index 24be70214..a0bd415eb 100644 --- a/ml_peg/calcs/bulk_crystal/split_vacancy/calc_split_vacancies.py +++ b/ml_peg/calcs/bulk_crystal/split_vacancy/calc_split_vacancies.py @@ -48,14 +48,11 @@ def test_relax_and_calculate_energy(mlip: tuple[str, Any]): for cation_dir in tqdm(cation_dirs, leave=False): nv_xyz_path = cation_dir / "normal_vacancy.xyz" sv_xyz_path = cation_dir / "split_vacancy.xyz" - sv_from_nv_xyz_path = cation_dir / "normal_vacancy_to_split_vac.xyz" if not (nv_xyz_path.exists() and sv_xyz_path.exists()): continue atoms_paths = [nv_xyz_path, sv_xyz_path] - if sv_from_nv_xyz_path.exists(): - atoms_paths += sv_from_nv_xyz_path for atoms_path in tqdm(atoms_paths, leave=False): relaxed_atoms = [] @@ -71,7 +68,6 @@ def test_relax_and_calculate_energy(mlip: tuple[str, Any]): atoms.info["relaxed_energy"] = atoms.get_potential_energy() relaxed_atoms.append(atoms) - # break atoms_out_path = ( OUT_PATH @@ -83,4 +79,3 @@ def test_relax_and_calculate_energy(mlip: tuple[str, Any]): atoms_out_path.parent.mkdir(exist_ok=True, parents=True) write(atoms_out_path, relaxed_atoms) - # break From ef2ddbc360251cfbe823230af8d2ea86f39c25e1 Mon Sep 17 00:00:00 2001 From: Thomas Warford Date: Wed, 4 Feb 2026 15:51:22 +0000 Subject: [PATCH 08/20] Working app --- .../split_vacancy/app_split_vacancy.py | 92 +++++++++++++++++++ 1 file changed, 92 insertions(+) create mode 100644 ml_peg/app/bulk_crystal/split_vacancy/app_split_vacancy.py diff --git a/ml_peg/app/bulk_crystal/split_vacancy/app_split_vacancy.py b/ml_peg/app/bulk_crystal/split_vacancy/app_split_vacancy.py new file mode 100644 index 000000000..f211c6387 --- /dev/null +++ b/ml_peg/app/bulk_crystal/split_vacancy/app_split_vacancy.py @@ -0,0 +1,92 @@ +"""Run lattice constants benchmark app.""" + +from __future__ import annotations + +from dash import Dash +from dash.html import Div + +from ml_peg.app import APP_ROOT +from ml_peg.app.base_app import BaseApp +from ml_peg.app.utils.build_callbacks import plot_from_table_column, struct_from_scatter +from ml_peg.app.utils.load import read_plot +from ml_peg.models.get_models import get_model_names +from ml_peg.models.models import current_models + +# Get all models +MODELS = get_model_names(current_models) +BENCHMARK_NAME = "Split vacancy" +DOCS_URL = "https://ddmms.github.io/ml-peg/user_guide/benchmarks/bulk_crystal.html#lattice-constants" # TODO: change +DATA_PATH = APP_ROOT / "data" / "bulk_crystal" / "split_vacancy" + + +class SplitVacancyApp(BaseApp): + """Lattice constants benchmark app layout and callbacks.""" + + def register_callbacks(self) -> None: + """Register callbacks to app.""" + scatter_dft = read_plot( + DATA_PATH / "figure_formation_energies_dft.json", id=f"{BENCHMARK_NAME}-figure" + ) + + # structs_dir = DATA_PATH / MODELS[0] + # structs = [] + # for struct_file in sorted(structs_dir.glob("*.xyz")): + # if struct_file.stem == "SiC": + # structs.extend( + # [ + # f"assets/bulk_crystal/lattice_constants/{MODELS[0]}/{struct_file.stem}.xyz" + # ] + # * 2 + # ) + # else: + # structs.append( + # f"assets/bulk_crystal/lattice_constants/{MODELS[0]}/{struct_file.stem}.xyz" + # ) + + plot_from_table_column( + table_id=self.table_id, + plot_id=f"{BENCHMARK_NAME}-figure-placeholder", + column_to_plot={ + "Metric 1": scatter_dft, + # "MAE (PBE)": scatter_dft, + }, + ) + + # struct_from_scatter( + # scatter_id=f"{BENCHMARK_NAME}-figure", + # struct_id=f"{BENCHMARK_NAME}-struct-placeholder", + # structs=structs, + # mode="struct", + # ) + + +def get_app() -> SplitVacancyApp: + """ + Get lattice constants benchmark app layout and callback registration. + + Returns + ------- + LatticeConstantsApp + Benchmark layout and callback registration. + """ + return SplitVacancyApp( + name=BENCHMARK_NAME, + description=( + "Performance when predicting lattice constants for 23 solids, including " + "pure elements, binary compounds and semiconductors." + ), + docs_url=DOCS_URL, + table_path=DATA_PATH / "split_vacancy_metrics_table.json", + extra_components=[ + Div(id=f"{BENCHMARK_NAME}-figure-placeholder"), + Div(id=f"{BENCHMARK_NAME}-struct-placeholder"), + ], + ) + + +if __name__ == "__main__": + full_app = Dash(__name__, assets_folder=DATA_PATH.parent.parent) + split_vacancy_app = get_app() + full_app.layout = split_vacancy_app.layout + split_vacancy_app.register_callbacks() + full_app.run(port=8054, debug=True) From c1684d23ff07b27ab17af66c220ef271a68f555a Mon Sep 17 00:00:00 2001 From: Thomas Warford Date: Wed, 4 Feb 2026 16:08:16 +0000 Subject: [PATCH 09/20] update metric names --- .../analyse_split_vacancy.py | 87 ++++++++++++++----- .../analyse_split_vacancy/metrics.yml | 13 +++ .../split_vacancy/app_split_vacancy.py | 11 +-- 3 files changed, 85 insertions(+), 26 deletions(-) create mode 100644 ml_peg/analysis/bulk_crystal/analyse_split_vacancy/metrics.yml diff --git a/ml_peg/analysis/bulk_crystal/analyse_split_vacancy/analyse_split_vacancy.py b/ml_peg/analysis/bulk_crystal/analyse_split_vacancy/analyse_split_vacancy.py index cedd9f532..35214c579 100644 --- a/ml_peg/analysis/bulk_crystal/analyse_split_vacancy/analyse_split_vacancy.py +++ b/ml_peg/analysis/bulk_crystal/analyse_split_vacancy/analyse_split_vacancy.py @@ -3,20 +3,19 @@ from pathlib import Path from ase.io import read +import numpy as np from pymatgen.analysis.structure_matcher import StructureMatcher from pymatgen.core import Structure import pytest from scipy.stats import spearmanr from tqdm.auto import tqdm -from ml_peg.analysis.utils.utils import ( - load_metrics_config, mae -) +from ml_peg.analysis.utils.decorators import build_table, plot_parity +from ml_peg.analysis.utils.utils import load_metrics_config, mae +from ml_peg.app import APP_ROOT from ml_peg.calcs import CALCS_ROOT from ml_peg.models.get_models import get_model_names from ml_peg.models.models import current_models -from ml_peg.analysis.utils.decorators import build_table, plot_parity -from ml_peg.app import APP_ROOT MODELS = get_model_names(current_models) CALC_PATH = CALCS_ROOT / "bulk_crystal" / "split_vacancy" / "outputs" @@ -66,15 +65,23 @@ def get_hoverdata() -> tuple[list, list, list]: return mp_ids, formulae, vacant_cations + MP_IDS, BULK_FORMULAE, VACANT_CATIONS = get_hoverdata() + @pytest.fixture # cache outputs def build_results() -> tuple[dict[str, list], dict[str, list], dict[str, list]]: preference_energy_threshold = 0 # TODO: confirm - result_formation_energy = {"ref": []} | {mlip: [] for mlip in MODELS} # formation energy for every material-cation pair - result_spearmans_coefficient = {mlip: [] for mlip in MODELS} # spearmans coefficient for every material-cation pair - result_rmsd = {mlip: [] for mlip in MODELS} # RMSD error for every material-cation pair + result_formation_energy = {"ref": []} | { + mlip: [] for mlip in MODELS + } # formation energy for every material-cation pair + result_spearmans_coefficient = { + mlip: [] for mlip in MODELS + } # spearmans coefficient for every material-cation pair + result_rmsd = { + mlip: [] for mlip in MODELS + } # RMSD error for every material-cation pair # TODO: investigate Kendall rank correlation result_rmsd = {mlip: [] for mlip in MODELS} @@ -95,14 +102,16 @@ def build_results() -> tuple[dict[str, list], dict[str, list], dict[str, list]]: nv_xyz_path = cation_dir / "normal_vacancy.xyz.gz" sv_xyz_path = cation_dir / "split_vacancy.xyz.gz" - if not (nv_xyz_path.exists() and sv_xyz_path.exists()): continue # TODO: remove!!! + + if not (nv_xyz_path.exists() and sv_xyz_path.exists()): + raise ValueError # TODO: remove # sv_from_nv_xyz_path = cation_dir / "normal_vacancy_to_split_vac.xyz.gz" nv_atoms_list = read(nv_xyz_path, ":") sv_atoms_list = read(sv_xyz_path, ":") - nv_energies = [at.info["relaxed_energy"] for at in nv_atoms_list] - sv_energies = [at.info["relaxed_energy"] for at in sv_atoms_list] + nv_energies = [float(at.info["relaxed_energy"]) for at in nv_atoms_list] + sv_energies = [float(at.info["relaxed_energy"]) for at in sv_atoms_list] # TODO (later): result_rmsd[model_name] # load original structure @@ -115,8 +124,12 @@ def build_results() -> tuple[dict[str, list], dict[str, list], dict[str, list]]: sv_preferred = sv_formation_energy < preference_energy_threshold if not ref_stored: - ref_nv_energies = [at.info["ref_energy"] for at in nv_atoms_list] - ref_sv_energies = [at.info["ref_energy"] for at in sv_atoms_list] + ref_nv_energies = [ + float(at.info["ref_energy"]) for at in nv_atoms_list + ] + ref_sv_energies = [ + float(at.info["ref_energy"]) for at in sv_atoms_list + ] ref_sv_formation_energy = min(ref_sv_energies) - min( ref_nv_energies @@ -130,20 +143,23 @@ def build_results() -> tuple[dict[str, list], dict[str, list], dict[str, list]]: # calculate metrics spearmans_coefficient = spearmanr( nv_energies + sv_energies, ref_sv_energies + ref_nv_energies - ) + ).statistic # result_rmsd[model_name] = get_rmsd() # TODO: RMSD of what?? result_formation_energy[model_name].append(sv_formation_energy) result_spearmans_coefficient[model_name].append(spearmans_coefficient) - + # print(result_formation_energy) + # print(result_spearmans_coefficient) + # print(result_rmsd) ref_stored = False - + # print(result_formation_energy) return result_formation_energy, result_spearmans_coefficient, result_rmsd + @pytest.fixture @plot_parity( filename=OUT_PATH / "figure_formation_energies_dft.json", - title="Split Vacancy Formation Energy (from Normal Vacancy)", + title="Split vacancy", x_label="Predicted Split Vacancy Formation Energy / eV", y_label="DFT Split Vacancy Formation Energy / eV", hoverdata={ @@ -164,6 +180,7 @@ def formation_energies_dft(build_results) -> dict[str, list]: result_formation_energies, _, _ = build_results return result_formation_energies + @pytest.fixture def formation_energy_dft_mae(formation_energies_dft) -> dict[str, float]: """ @@ -186,14 +203,39 @@ def formation_energy_dft_mae(formation_energies_dft) -> dict[str, float]: ) return results + +@pytest.fixture +def spearmans_coefficient_dft_mean(build_results) -> dict[str, float]: + """ + Get mean spearmans coefficient between DFT and MLIP relaxed structures.. + + Parameters + ---------- + build_results + Dictionary of DFT and predicted lattice constants. + + Returns + ------- + dict[str, float] + Dictionary of predicted lattice constant errors for all models. + """ + _, result_spearmans_coefficient, _ = build_results + + results = {} + for model_name in MODELS: + results[model_name] = float(np.mean(result_spearmans_coefficient[model_name])) + return results + + @pytest.fixture @build_table( - filename=OUT_PATH / "lattice_constants_metrics_table.json", + filename=OUT_PATH / "split_vacancy_metrics_table.json", metric_tooltips=DEFAULT_TOOLTIPS, thresholds=DEFAULT_THRESHOLDS, ) def metrics( - formation_energy_dft_mae: dict[str, float]#, metric_2: dict[str, float] + formation_energy_dft_mae: dict[str, float], + spearmans_coefficient_dft_mean: dict[str, float], ) -> dict[str, dict]: """ Get all new benchmark metrics. @@ -210,10 +252,13 @@ def metrics( dict[str, dict] Metric names and values for all models. """ + # print(formation_energy_dft_mae) return { - "Metric 1": formation_energy_dft_mae, + "MAE": formation_energy_dft_mae, + "Mean Spearman's Coefficient": spearmans_coefficient_dft_mean, } + def test_new_benchmark(metrics: dict[str, dict]) -> None: """ Run new benchmark analysis. @@ -223,4 +268,4 @@ def test_new_benchmark(metrics: dict[str, dict]) -> None: metrics All new benchmark metric names and dictionary of values for each model. """ - return \ No newline at end of file + return diff --git a/ml_peg/analysis/bulk_crystal/analyse_split_vacancy/metrics.yml b/ml_peg/analysis/bulk_crystal/analyse_split_vacancy/metrics.yml new file mode 100644 index 000000000..0504ba4b4 --- /dev/null +++ b/ml_peg/analysis/bulk_crystal/analyse_split_vacancy/metrics.yml @@ -0,0 +1,13 @@ +metrics: + Mean Spearman's Coefficient: + good: 0.0 + bad: 1.0 + unit: null + tooltip: Measure of the accuracy of the energy ranking of DFT relaxed structures by MLIPs. + level_of_theory: PBE + MAE: + good: 0.0 + bad: 1.0 + unit: eV + tooltip: Formation energy of split vacancy, considering lowest energy relaxed structures. + level_of_theory: PBE diff --git a/ml_peg/app/bulk_crystal/split_vacancy/app_split_vacancy.py b/ml_peg/app/bulk_crystal/split_vacancy/app_split_vacancy.py index f211c6387..426ada79d 100644 --- a/ml_peg/app/bulk_crystal/split_vacancy/app_split_vacancy.py +++ b/ml_peg/app/bulk_crystal/split_vacancy/app_split_vacancy.py @@ -7,7 +7,7 @@ from ml_peg.app import APP_ROOT from ml_peg.app.base_app import BaseApp -from ml_peg.app.utils.build_callbacks import plot_from_table_column, struct_from_scatter +from ml_peg.app.utils.build_callbacks import plot_from_table_column from ml_peg.app.utils.load import read_plot from ml_peg.models.get_models import get_model_names from ml_peg.models.models import current_models @@ -15,7 +15,7 @@ # Get all models MODELS = get_model_names(current_models) BENCHMARK_NAME = "Split vacancy" -DOCS_URL = "https://ddmms.github.io/ml-peg/user_guide/benchmarks/bulk_crystal.html#lattice-constants" # TODO: change +DOCS_URL = "https://ddmms.github.io/ml-peg/user_guide/benchmarks/bulk_crystal.html#lattice-constants" # TODO: change DATA_PATH = APP_ROOT / "data" / "bulk_crystal" / "split_vacancy" @@ -25,7 +25,8 @@ class SplitVacancyApp(BaseApp): def register_callbacks(self) -> None: """Register callbacks to app.""" scatter_dft = read_plot( - DATA_PATH / "figure_formation_energies_dft.json", id=f"{BENCHMARK_NAME}-figure" + DATA_PATH / "figure_formation_energies_dft.json", + id=f"{BENCHMARK_NAME}-figure", ) # structs_dir = DATA_PATH / MODELS[0] @@ -47,8 +48,8 @@ def register_callbacks(self) -> None: table_id=self.table_id, plot_id=f"{BENCHMARK_NAME}-figure-placeholder", column_to_plot={ - "Metric 1": scatter_dft, - # "MAE (PBE)": scatter_dft, + "MAE": scatter_dft, + "Mean Spearman's Coefficient": scatter_dft, }, ) From 8b0f0f47a6fc3e8052b5379a103c6424788cc4ec Mon Sep 17 00:00:00 2001 From: Thomas Warford Date: Wed, 4 Feb 2026 16:41:49 +0000 Subject: [PATCH 10/20] copy reference structures --- .../split_vacancy/calc_split_vacancies.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/ml_peg/calcs/bulk_crystal/split_vacancy/calc_split_vacancies.py b/ml_peg/calcs/bulk_crystal/split_vacancy/calc_split_vacancies.py index a0bd415eb..f6f926ef9 100644 --- a/ml_peg/calcs/bulk_crystal/split_vacancy/calc_split_vacancies.py +++ b/ml_peg/calcs/bulk_crystal/split_vacancy/calc_split_vacancies.py @@ -21,9 +21,6 @@ OUT_PATH = Path(__file__).parent / "outputs" -# def rmsd(atoms_1, atoms_2): - - @pytest.mark.parametrize("mlip", MODELS.items()) def test_relax_and_calculate_energy(mlip: tuple[str, Any]): """ @@ -58,6 +55,18 @@ def test_relax_and_calculate_energy(mlip: tuple[str, Any]): relaxed_atoms = [] atoms_list = read(atoms_path, ":") + ref_atoms_out_path = ( + OUT_PATH + / "ref" + / material_dir.stem + / cation_dir.stem + / f"{atoms_path.stem}.xyz.gz" + ) + # Copy ref structures once; subsequent runs skip if already present. + if not ref_atoms_out_path.exists(): + ref_atoms_out_path.parent.mkdir(exist_ok=True, parents=True) + write(ref_atoms_out_path, atoms_list) + for atoms in tqdm(atoms_list, leave=False): atoms.calc = deepcopy(calc) atoms.info["initial_energy"] = atoms.get_potential_energy() From 8dd25b888d6fbf5f6cf1d5813672b3713d6ad96d Mon Sep 17 00:00:00 2001 From: Thomas Warford Date: Wed, 4 Feb 2026 17:00:51 +0000 Subject: [PATCH 11/20] add rmsd --- .../analyse_split_vacancy.py | 58 ++++++++++++++----- .../analyse_split_vacancy/metrics.yml | 6 ++ .../split_vacancy/app_split_vacancy.py | 1 + 3 files changed, 51 insertions(+), 14 deletions(-) diff --git a/ml_peg/analysis/bulk_crystal/analyse_split_vacancy/analyse_split_vacancy.py b/ml_peg/analysis/bulk_crystal/analyse_split_vacancy/analyse_split_vacancy.py index 35214c579..e08fd70d0 100644 --- a/ml_peg/analysis/bulk_crystal/analyse_split_vacancy/analyse_split_vacancy.py +++ b/ml_peg/analysis/bulk_crystal/analyse_split_vacancy/analyse_split_vacancy.py @@ -110,19 +110,6 @@ def build_results() -> tuple[dict[str, list], dict[str, list], dict[str, list]]: nv_atoms_list = read(nv_xyz_path, ":") sv_atoms_list = read(sv_xyz_path, ":") - nv_energies = [float(at.info["relaxed_energy"]) for at in nv_atoms_list] - sv_energies = [float(at.info["relaxed_energy"]) for at in sv_atoms_list] - - # TODO (later): result_rmsd[model_name] - # load original structure - # use get_rmsd - - # formula = nv_atoms_list[0].info["name"] - # cell_charge = nv_atoms_list[0][-1].info["cell_charge"] # TODO - - sv_formation_energy = min(sv_energies) - min(nv_energies) - sv_preferred = sv_formation_energy < preference_energy_threshold - if not ref_stored: ref_nv_energies = [ float(at.info["ref_energy"]) for at in nv_atoms_list @@ -139,14 +126,33 @@ def build_results() -> tuple[dict[str, list], dict[str, list], dict[str, list]]: ) result_formation_energy["ref"].append(ref_sv_formation_energy) + + ref_cation_dir = CALC_PATH / 'ref' / material_dir.stem / cation_dir.stem + ref_nv_atoms_list = read(ref_cation_dir / "normal_vacancy.xyz.gz", ':') + ref_sv_atoms_list = read(ref_cation_dir / "split_vacancy.xyz.gz", ':') + + + nv_energies = [float(at.info["relaxed_energy"]) for at in nv_atoms_list] + sv_energies = [float(at.info["relaxed_energy"]) for at in sv_atoms_list] # calculate metrics + sv_formation_energy = min(sv_energies) - min(nv_energies) + sv_preferred = sv_formation_energy < preference_energy_threshold + spearmans_coefficient = spearmanr( nv_energies + sv_energies, ref_sv_energies + ref_nv_energies ).statistic - # result_rmsd[model_name] = get_rmsd() # TODO: RMSD of what?? + + rmsd_list = [] + for ref_atoms, mlip_atoms in zip(ref_nv_atoms_list, nv_atoms_list): + rmsd_list.append(get_rmsd(ref_atoms, mlip_atoms)) + for ref_atoms, mlip_atoms in zip(ref_sv_atoms_list, sv_atoms_list): + rmsd_list.append(get_rmsd(ref_atoms, mlip_atoms)) + + # add metrics to dicts result_formation_energy[model_name].append(sv_formation_energy) result_spearmans_coefficient[model_name].append(spearmans_coefficient) + result_rmsd[model_name].extend(rmsd_list) # print(result_formation_energy) # print(result_spearmans_coefficient) @@ -226,6 +232,28 @@ def spearmans_coefficient_dft_mean(build_results) -> dict[str, float]: results[model_name] = float(np.mean(result_spearmans_coefficient[model_name])) return results +@pytest.fixture +def rmsd_dft_mean(build_results) -> dict[str, float]: + """ + Get RMSD between DFT and MLIP relaxed structures. + + Parameters + ---------- + build_results + Dictionary of DFT and predicted lattice constants. + + Returns + ------- + dict[str, float] + Dictionary of predicted lattice constant errors for all models. + """ + _, _, result_rmsd = build_results + + results = {} + for model_name in MODELS: + results[model_name] = float(np.mean(result_rmsd[model_name])) + return results + @pytest.fixture @build_table( @@ -236,6 +264,7 @@ def spearmans_coefficient_dft_mean(build_results) -> dict[str, float]: def metrics( formation_energy_dft_mae: dict[str, float], spearmans_coefficient_dft_mean: dict[str, float], + rmsd_dft_mean: dict[str, float], ) -> dict[str, dict]: """ Get all new benchmark metrics. @@ -256,6 +285,7 @@ def metrics( return { "MAE": formation_energy_dft_mae, "Mean Spearman's Coefficient": spearmans_coefficient_dft_mean, + "RMSD": rmsd_dft_mean, } diff --git a/ml_peg/analysis/bulk_crystal/analyse_split_vacancy/metrics.yml b/ml_peg/analysis/bulk_crystal/analyse_split_vacancy/metrics.yml index 0504ba4b4..2219d4698 100644 --- a/ml_peg/analysis/bulk_crystal/analyse_split_vacancy/metrics.yml +++ b/ml_peg/analysis/bulk_crystal/analyse_split_vacancy/metrics.yml @@ -11,3 +11,9 @@ metrics: unit: eV tooltip: Formation energy of split vacancy, considering lowest energy relaxed structures. level_of_theory: PBE + RMSD: + good: 0.0 + bad: 1.0 + unit: Å + tooltip: RMSD between DFT relaxed structures, pre- and post- MLIP relaxation. + level_of_theory: PBE diff --git a/ml_peg/app/bulk_crystal/split_vacancy/app_split_vacancy.py b/ml_peg/app/bulk_crystal/split_vacancy/app_split_vacancy.py index 426ada79d..0ab51d18b 100644 --- a/ml_peg/app/bulk_crystal/split_vacancy/app_split_vacancy.py +++ b/ml_peg/app/bulk_crystal/split_vacancy/app_split_vacancy.py @@ -50,6 +50,7 @@ def register_callbacks(self) -> None: column_to_plot={ "MAE": scatter_dft, "Mean Spearman's Coefficient": scatter_dft, + "RMSD": scatter_dft, }, ) From a784828d181fb1c4927414e8f7705e333eb719c4 Mon Sep 17 00:00:00 2001 From: Thomas Warford Date: Wed, 4 Feb 2026 17:50:46 +0000 Subject: [PATCH 12/20] tidy --- .../analyse_split_vacancy.py | 113 ++++++++++++------ .../split_vacancy/app_split_vacancy.py | 37 ++---- .../split_vacancy/calc_split_vacancies.py | 2 +- 3 files changed, 88 insertions(+), 64 deletions(-) diff --git a/ml_peg/analysis/bulk_crystal/analyse_split_vacancy/analyse_split_vacancy.py b/ml_peg/analysis/bulk_crystal/analyse_split_vacancy/analyse_split_vacancy.py index e08fd70d0..737576861 100644 --- a/ml_peg/analysis/bulk_crystal/analyse_split_vacancy/analyse_split_vacancy.py +++ b/ml_peg/analysis/bulk_crystal/analyse_split_vacancy/analyse_split_vacancy.py @@ -1,3 +1,5 @@ +"""Analyse split vacancy benchmark.""" + from __future__ import annotations from pathlib import Path @@ -31,7 +33,20 @@ STRUCTURE_MATCHER = StructureMatcher(stol=1.0, scale=False) -def get_rmsd(atoms_1, atoms_2): +def get_rmsd(atoms_1, atoms_2) -> float: + """ + Calculate the RMSD between two ASE Atoms objects. + + Parameters + ---------- + atoms_1, atoms_2 + ASE Atoms objects. + + Returns + ------- + float + Root mean square displacement. + """ rmsd, max_dist = STRUCTURE_MATCHER.get_rms_dist( Structure.from_ase_atoms(atoms_1), Structure.from_ase_atoms(atoms_2) ) @@ -40,7 +55,15 @@ def get_rmsd(atoms_1, atoms_2): def get_hoverdata() -> tuple[list, list, list]: - # TODO: RMSD could be good hoverdata - think about this. Only needed for formation energy parity plot. + """ + Get hover data. + + Returns + ------- + tuple[list, list, list ] + Tuple of Materials Project IDs, bulk formulae and vacant cations. + """ + # TODO: RMSD could be good hoverdata? # TODO: add cell charge? mp_ids = [] formulae = [] @@ -71,7 +94,15 @@ def get_hoverdata() -> tuple[list, list, list]: @pytest.fixture # cache outputs def build_results() -> tuple[dict[str, list], dict[str, list], dict[str, list]]: - preference_energy_threshold = 0 # TODO: confirm + """ + Iterate through bulk-cation pairs calculating results. + + Returns + ------- + tuple[dict[str, float], dict[str, float], dict[str, float]] + Tuple of metrics. + """ + # preference_energy_threshold = 0 # TODO: confirm result_formation_energy = {"ref": []} | { mlip: [] for mlip in MODELS @@ -98,14 +129,13 @@ def build_results() -> tuple[dict[str, list], dict[str, list], dict[str, list]]: ] # skip pristine supercell.xyz files if present (not used) for cation_dir in tqdm(cation_dirs, leave=False): - cation = cation_dir.stem + # cation = cation_dir.stem TODO: save structures for visualization nv_xyz_path = cation_dir / "normal_vacancy.xyz.gz" sv_xyz_path = cation_dir / "split_vacancy.xyz.gz" if not (nv_xyz_path.exists() and sv_xyz_path.exists()): raise ValueError # TODO: remove - # sv_from_nv_xyz_path = cation_dir / "normal_vacancy_to_split_vac.xyz.gz" nv_atoms_list = read(nv_xyz_path, ":") sv_atoms_list = read(sv_xyz_path, ":") @@ -121,32 +151,43 @@ def build_results() -> tuple[dict[str, list], dict[str, list], dict[str, list]]: ref_sv_formation_energy = min(ref_sv_energies) - min( ref_nv_energies ) - ref_sv_preferred = ( - ref_sv_formation_energy < preference_energy_threshold - ) + # ref_sv_preferred = ( + # ref_sv_formation_energy < preference_energy_threshold + # ) # TODO: F1 score result_formation_energy["ref"].append(ref_sv_formation_energy) - - ref_cation_dir = CALC_PATH / 'ref' / material_dir.stem / cation_dir.stem - ref_nv_atoms_list = read(ref_cation_dir / "normal_vacancy.xyz.gz", ':') - ref_sv_atoms_list = read(ref_cation_dir / "split_vacancy.xyz.gz", ':') + ref_cation_dir = ( + CALC_PATH / "ref" / material_dir.stem / cation_dir.stem + ) + ref_nv_atoms_list = read( + ref_cation_dir / "normal_vacancy.xyz.gz", ":" + ) + ref_sv_atoms_list = read( + ref_cation_dir / "split_vacancy.xyz.gz", ":" + ) nv_energies = [float(at.info["relaxed_energy"]) for at in nv_atoms_list] sv_energies = [float(at.info["relaxed_energy"]) for at in sv_atoms_list] # calculate metrics sv_formation_energy = min(sv_energies) - min(nv_energies) - sv_preferred = sv_formation_energy < preference_energy_threshold + # sv_preferred = sv_formation_energy < preference_energy_threshold spearmans_coefficient = spearmanr( - nv_energies + sv_energies, ref_sv_energies + ref_nv_energies + [float(at.info["initial_energy"]) for at in nv_atoms_list] + + [float(at.info["initial_energy"]) for at in sv_atoms_list], + ref_sv_energies + ref_nv_energies, ).statistic rmsd_list = [] - for ref_atoms, mlip_atoms in zip(ref_nv_atoms_list, nv_atoms_list): + for ref_atoms, mlip_atoms in zip( + ref_nv_atoms_list, nv_atoms_list, strict=False + ): rmsd_list.append(get_rmsd(ref_atoms, mlip_atoms)) - for ref_atoms, mlip_atoms in zip(ref_sv_atoms_list, sv_atoms_list): + for ref_atoms, mlip_atoms in zip( + ref_sv_atoms_list, sv_atoms_list, strict=False + ): rmsd_list.append(get_rmsd(ref_atoms, mlip_atoms)) # add metrics to dicts @@ -154,11 +195,7 @@ def build_results() -> tuple[dict[str, list], dict[str, list], dict[str, list]]: result_spearmans_coefficient[model_name].append(spearmans_coefficient) result_rmsd[model_name].extend(rmsd_list) - # print(result_formation_energy) - # print(result_spearmans_coefficient) - # print(result_rmsd) ref_stored = False - # print(result_formation_energy) return result_formation_energy, result_spearmans_coefficient, result_rmsd @@ -176,12 +213,17 @@ def build_results() -> tuple[dict[str, list], dict[str, list], dict[str, list]]: ) def formation_energies_dft(build_results) -> dict[str, list]: """ - Get DFT and predicted lattice constant for all crystals. + Get DFT and predicted formation energies. + + Parameters + ---------- + build_results + Tuple of results dictionaries. Returns ------- dict[str, list] - Dictionary of DFT and predicted lattice constants. + Dictionary of DFT and predicted formation energies. """ result_formation_energies, _, _ = build_results return result_formation_energies @@ -194,13 +236,13 @@ def formation_energy_dft_mae(formation_energies_dft) -> dict[str, float]: Parameters ---------- - lattice_constants_dft - Dictionary of DFT and predicted lattice constants. + formation_energies_dft + Dictionary of DFT and predicted formation energies. Returns ------- dict[str, float] - Dictionary of predicted lattice constant errors for all models. + Dictionary of formation energy MAEs for all models. """ results = {} for model_name in MODELS: @@ -213,17 +255,17 @@ def formation_energy_dft_mae(formation_energies_dft) -> dict[str, float]: @pytest.fixture def spearmans_coefficient_dft_mean(build_results) -> dict[str, float]: """ - Get mean spearmans coefficient between DFT and MLIP relaxed structures.. + Get mean spearmans coefficient between DFT and MLIP relaxed structures. Parameters ---------- build_results - Dictionary of DFT and predicted lattice constants. + Tuple of results dictionaries. Returns ------- dict[str, float] - Dictionary of predicted lattice constant errors for all models. + Dictionary of mean Spearman's coefficients for all models. """ _, result_spearmans_coefficient, _ = build_results @@ -232,6 +274,7 @@ def spearmans_coefficient_dft_mean(build_results) -> dict[str, float]: results[model_name] = float(np.mean(result_spearmans_coefficient[model_name])) return results + @pytest.fixture def rmsd_dft_mean(build_results) -> dict[str, float]: """ @@ -240,12 +283,12 @@ def rmsd_dft_mean(build_results) -> dict[str, float]: Parameters ---------- build_results - Dictionary of DFT and predicted lattice constants. + Tuple of results dictionaries. Returns ------- dict[str, float] - Dictionary of predicted lattice constant errors for all models. + Dictionary of mean relaxed structure RMSDs for all models. """ _, _, result_rmsd = build_results @@ -271,10 +314,12 @@ def metrics( Parameters ---------- - metric_1 - Metric 1 value for all models. - metric_2 - Metric 2 value for all models. + formation_energy_dft_mae + Split vancancy formation energy MAE for all models. + spearmans_coefficient_dft_mean + Spearman's coefficient mean for all models. + rmsd_dft_mean + RMSD for all models. Returns ------- diff --git a/ml_peg/app/bulk_crystal/split_vacancy/app_split_vacancy.py b/ml_peg/app/bulk_crystal/split_vacancy/app_split_vacancy.py index 0ab51d18b..634da0370 100644 --- a/ml_peg/app/bulk_crystal/split_vacancy/app_split_vacancy.py +++ b/ml_peg/app/bulk_crystal/split_vacancy/app_split_vacancy.py @@ -1,4 +1,4 @@ -"""Run lattice constants benchmark app.""" +"""Run split vacancy benchmark app.""" from __future__ import annotations @@ -15,12 +15,13 @@ # Get all models MODELS = get_model_names(current_models) BENCHMARK_NAME = "Split vacancy" -DOCS_URL = "https://ddmms.github.io/ml-peg/user_guide/benchmarks/bulk_crystal.html#lattice-constants" # TODO: change +# TODO: change DOCS_URL +DOCS_URL = "https://ddmms.github.io/ml-peg/user_guide/benchmarks/bulk_crystal.html#lattice-constants" DATA_PATH = APP_ROOT / "data" / "bulk_crystal" / "split_vacancy" class SplitVacancyApp(BaseApp): - """Lattice constants benchmark app layout and callbacks.""" + """Split vacancy benchmark app layout and callbacks.""" def register_callbacks(self) -> None: """Register callbacks to app.""" @@ -29,21 +30,6 @@ def register_callbacks(self) -> None: id=f"{BENCHMARK_NAME}-figure", ) - # structs_dir = DATA_PATH / MODELS[0] - # structs = [] - # for struct_file in sorted(structs_dir.glob("*.xyz")): - # if struct_file.stem == "SiC": - # structs.extend( - # [ - # f"assets/bulk_crystal/lattice_constants/{MODELS[0]}/{struct_file.stem}.xyz" - # ] - # * 2 - # ) - # else: - # structs.append( - # f"assets/bulk_crystal/lattice_constants/{MODELS[0]}/{struct_file.stem}.xyz" - # ) - plot_from_table_column( table_id=self.table_id, plot_id=f"{BENCHMARK_NAME}-figure-placeholder", @@ -54,28 +40,21 @@ def register_callbacks(self) -> None: }, ) - # struct_from_scatter( - # scatter_id=f"{BENCHMARK_NAME}-figure", - # struct_id=f"{BENCHMARK_NAME}-struct-placeholder", - # structs=structs, - # mode="struct", - # ) - def get_app() -> SplitVacancyApp: """ - Get lattice constants benchmark app layout and callback registration. + Get split vacancy benchmark app layout and callback registration. Returns ------- - LatticeConstantsApp + SplitVacancyApp Benchmark layout and callback registration. """ return SplitVacancyApp( name=BENCHMARK_NAME, description=( - "Performance when predicting lattice constants for 23 solids, including " - "pure elements, binary compounds and semiconductors." + "Performance predicting the formation energy of split " + "vacancies from regular vacancies in XX systems." # TODO: add number ), docs_url=DOCS_URL, table_path=DATA_PATH / "split_vacancy_metrics_table.json", diff --git a/ml_peg/calcs/bulk_crystal/split_vacancy/calc_split_vacancies.py b/ml_peg/calcs/bulk_crystal/split_vacancy/calc_split_vacancies.py index f6f926ef9..b5c342409 100644 --- a/ml_peg/calcs/bulk_crystal/split_vacancy/calc_split_vacancies.py +++ b/ml_peg/calcs/bulk_crystal/split_vacancy/calc_split_vacancies.py @@ -1,4 +1,4 @@ -"""Run calculations for elasticity benchmark.""" +"""Run calculations for split vacancy benchmark.""" from __future__ import annotations From f489af9be692e51eb59427b4871e57fc531d71a5 Mon Sep 17 00:00:00 2001 From: Thomas Warford Date: Sat, 7 Feb 2026 11:26:21 +0000 Subject: [PATCH 13/20] split by functional --- .../split_vacancy/calc_split_vacancies.py | 105 +++++++++--------- 1 file changed, 54 insertions(+), 51 deletions(-) diff --git a/ml_peg/calcs/bulk_crystal/split_vacancy/calc_split_vacancies.py b/ml_peg/calcs/bulk_crystal/split_vacancy/calc_split_vacancies.py index b5c342409..c613dd674 100644 --- a/ml_peg/calcs/bulk_crystal/split_vacancy/calc_split_vacancies.py +++ b/ml_peg/calcs/bulk_crystal/split_vacancy/calc_split_vacancies.py @@ -17,7 +17,7 @@ MODELS = load_models(current_models) # TODO: DATA_PATH = download_github_data(filename, github_uri) -DATA_PATH = Path("/Users/tw/Downloads/out") +DATA_PATH = Path("/Users/tw/Downloads/split_vacancy_data") OUT_PATH = Path(__file__).parent / "outputs" @@ -38,53 +38,56 @@ def test_relax_and_calculate_energy(mlip: tuple[str, Any]): fmax = 0.03 steps = 200 - for material_dir in tqdm(list(DATA_PATH.iterdir())): - cation_dirs = [ - p for p in material_dir.iterdir() if p.is_dir() - ] # skip pristine supercell.xyz files (not used) - for cation_dir in tqdm(cation_dirs, leave=False): - nv_xyz_path = cation_dir / "normal_vacancy.xyz" - sv_xyz_path = cation_dir / "split_vacancy.xyz" - - if not (nv_xyz_path.exists() and sv_xyz_path.exists()): - continue - - atoms_paths = [nv_xyz_path, sv_xyz_path] - - for atoms_path in tqdm(atoms_paths, leave=False): - relaxed_atoms = [] - atoms_list = read(atoms_path, ":") - - ref_atoms_out_path = ( - OUT_PATH - / "ref" - / material_dir.stem - / cation_dir.stem - / f"{atoms_path.stem}.xyz.gz" - ) - # Copy ref structures once; subsequent runs skip if already present. - if not ref_atoms_out_path.exists(): - ref_atoms_out_path.parent.mkdir(exist_ok=True, parents=True) - write(ref_atoms_out_path, atoms_list) - - for atoms in tqdm(atoms_list, leave=False): - atoms.calc = deepcopy(calc) - atoms.info["initial_energy"] = atoms.get_potential_energy() - - opt = LBFGS(atoms, logfile=None) - opt.run(fmax=fmax, steps=steps) - - atoms.info["relaxed_energy"] = atoms.get_potential_energy() - - relaxed_atoms.append(atoms) - - atoms_out_path = ( - OUT_PATH - / model_name - / material_dir.stem - / cation_dir.stem - / f"{atoms_path.stem}.xyz.gz" - ) - atoms_out_path.parent.mkdir(exist_ok=True, parents=True) - - write(atoms_out_path, relaxed_atoms) + for functional in ["pbe", "pbesol"]: + for material_dir in tqdm(list((DATA_PATH / functional).iterdir())): + cation_dirs = [ + p for p in material_dir.iterdir() if p.is_dir() + ] # skip pristine supercell.xyz files (not used) + for cation_dir in tqdm(cation_dirs, leave=False): + nv_xyz_path = cation_dir / "normal_vacancy.xyz" + sv_xyz_path = cation_dir / "split_vacancy.xyz" + + if not (nv_xyz_path.exists() and sv_xyz_path.exists()): + continue + + atoms_paths = [nv_xyz_path, sv_xyz_path] + + for atoms_path in tqdm(atoms_paths, leave=False): + relaxed_atoms = [] + atoms_list = read(atoms_path, ":") + + ref_atoms_out_path = ( + OUT_PATH + / functional + / "ref" + / material_dir.stem + / cation_dir.stem + / f"{atoms_path.stem}.xyz.gz" + ) + # Copy ref structures once; subsequent runs skip if already present. + if not ref_atoms_out_path.exists(): + ref_atoms_out_path.parent.mkdir(exist_ok=True, parents=True) + write(ref_atoms_out_path, atoms_list) + + for atoms in tqdm(atoms_list, leave=False): + atoms.calc = deepcopy(calc) + atoms.info["initial_energy"] = atoms.get_potential_energy() + + opt = LBFGS(atoms, logfile=None) + opt.run(fmax=fmax, steps=steps) + + atoms.info["relaxed_energy"] = atoms.get_potential_energy() + + relaxed_atoms.append(atoms) + + atoms_out_path = ( + OUT_PATH + / functional + / model_name + / material_dir.stem + / cation_dir.stem + / f"{atoms_path.stem}.xyz.gz" + ) + atoms_out_path.parent.mkdir(exist_ok=True, parents=True) + + write(atoms_out_path, relaxed_atoms) From fb72af9374716221fdbc648037a2705ffffc34dd Mon Sep 17 00:00:00 2001 From: Thomas Warford Date: Sun, 8 Feb 2026 16:26:19 +0000 Subject: [PATCH 14/20] slow --- .../analyse_split_vacancy.py | 205 +++++++++++++++--- 1 file changed, 172 insertions(+), 33 deletions(-) diff --git a/ml_peg/analysis/bulk_crystal/analyse_split_vacancy/analyse_split_vacancy.py b/ml_peg/analysis/bulk_crystal/analyse_split_vacancy/analyse_split_vacancy.py index 737576861..7a6acfe58 100644 --- a/ml_peg/analysis/bulk_crystal/analyse_split_vacancy/analyse_split_vacancy.py +++ b/ml_peg/analysis/bulk_crystal/analyse_split_vacancy/analyse_split_vacancy.py @@ -21,6 +21,8 @@ MODELS = get_model_names(current_models) CALC_PATH = CALCS_ROOT / "bulk_crystal" / "split_vacancy" / "outputs" +CALC_PATH_PBESOL = CALC_PATH / "pbesol" # oxides +CALC_PATH_PBE = CALC_PATH / "pbe" # nitrides OUT_PATH = APP_ROOT / "data" / "bulk_crystal" / "split_vacancy" METRICS_CONFIG_PATH = Path(__file__).with_name("metrics.yml") @@ -54,7 +56,7 @@ def get_rmsd(atoms_1, atoms_2) -> float: return rmsd -def get_hoverdata() -> tuple[list, list, list]: +def get_hoverdata(functional_path) -> tuple[list, list, list]: """ Get hover data. @@ -69,7 +71,7 @@ def get_hoverdata() -> tuple[list, list, list]: formulae = [] vacant_cations = [] - model_dir = model_dir = CALC_PATH / MODELS[0] + model_dir = model_dir = functional_path / MODELS[0] for material_dir in tqdm(list(model_dir.iterdir())): split_dir_name = material_dir.stem.split("-") bulk_formula = split_dir_name[0] @@ -89,11 +91,15 @@ def get_hoverdata() -> tuple[list, list, list]: return mp_ids, formulae, vacant_cations -MP_IDS, BULK_FORMULAE, VACANT_CATIONS = get_hoverdata() +MP_IDS_PBE, BULK_FORMULAE_PBE, VACANT_CATIONS_PBE = get_hoverdata(CALC_PATH_PBE) +MP_IDS_PBESOL, BULK_FORMULAE_PBESOL, VACANT_CATIONS_PBESOL = get_hoverdata( + CALC_PATH_PBESOL +) -@pytest.fixture # cache outputs -def build_results() -> tuple[dict[str, list], dict[str, list], dict[str, list]]: +def build_results( + functional_path, +) -> tuple[dict[str, list], dict[str, list], dict[str, list]]: """ Iterate through bulk-cation pairs calculating results. @@ -104,6 +110,8 @@ def build_results() -> tuple[dict[str, list], dict[str, list], dict[str, list]]: """ # preference_energy_threshold = 0 # TODO: confirm + print(f"Analysing {functional_path.stem} calculations.") + result_formation_energy = {"ref": []} | { mlip: [] for mlip in MODELS } # formation energy for every material-cation pair @@ -117,13 +125,13 @@ def build_results() -> tuple[dict[str, list], dict[str, list], dict[str, list]]: result_rmsd = {mlip: [] for mlip in MODELS} ref_stored = False - for model_name in MODELS: - model_dir = CALC_PATH / model_name + for model_name in tqdm(MODELS): + model_dir = functional_path / model_name if not model_dir.exists(): continue - for material_dir in tqdm(list(model_dir.iterdir())): + for material_dir in tqdm(list(model_dir.iterdir()), leave=False): cation_dirs = [ p for p in material_dir.iterdir() if p.is_dir() ] # skip pristine supercell.xyz files if present (not used) @@ -158,7 +166,7 @@ def build_results() -> tuple[dict[str, list], dict[str, list], dict[str, list]]: result_formation_energy["ref"].append(ref_sv_formation_energy) ref_cation_dir = ( - CALC_PATH / "ref" / material_dir.stem / cation_dir.stem + functional_path / "ref" / material_dir.stem / cation_dir.stem ) ref_nv_atoms_list = read( ref_cation_dir / "normal_vacancy.xyz.gz", ":" @@ -199,21 +207,47 @@ def build_results() -> tuple[dict[str, list], dict[str, list], dict[str, list]]: return result_formation_energy, result_spearmans_coefficient, result_rmsd +@pytest.fixture # cache outputs +def build_results_pbesol(): + """ + Get PBEsol (oxide) results. + + Returns + ------- + tuple[dict[str, float], dict[str, float], dict[str, float]] + Tuple of metrics. + """ + return build_results(CALC_PATH_PBESOL) + + +@pytest.fixture # cache outputs +def build_results_pbe(): + """ + Get PBE (nitride) results. + + Returns + ------- + tuple[dict[str, float], dict[str, float], dict[str, float]] + Tuple of metrics. + """ + return build_results(CALC_PATH_PBE) + + @pytest.fixture @plot_parity( - filename=OUT_PATH / "figure_formation_energies_dft.json", - title="Split vacancy", + filename=OUT_PATH / "figure_formation_energies_pbesol.json", + title="Split vacancy (Oxides, PBEsol)", x_label="Predicted Split Vacancy Formation Energy / eV", y_label="DFT Split Vacancy Formation Energy / eV", hoverdata={ - "Materials Project ID": MP_IDS, - "Formula": BULK_FORMULAE, - "Vacant Cation": VACANT_CATIONS, + "Materials Project ID": MP_IDS_PBESOL, + "Formula": BULK_FORMULAE_PBESOL, + "Vacant Cation": VACANT_CATIONS_PBESOL, }, ) -def formation_energies_dft(build_results) -> dict[str, list]: +def formation_energies_pbesol(build_results_pbesol) -> dict[str, list]: """ - Get DFT and predicted formation energies. + Get DFT and predicted formation energies for oxides (PBEsol). Parameters ---------- @@ -225,18 +259,48 @@ def formation_energies_dft(build_results) -> dict[str, list]: dict[str, list] Dictionary of DFT and predicted formation energies. """ - result_formation_energies, _, _ = build_results + result_formation_energies, _, _ = build_results_pbesol() return result_formation_energies @pytest.fixture -def formation_energy_dft_mae(formation_energies_dft) -> dict[str, float]: +@plot_parity( + filename=OUT_PATH / "figure_formation_energies_pbe.json", + title="Split vacancy (Nitrides, PBE(+U))", + x_label="Predicted Split Vacancy Formation Energy / eV", + y_label="DFT Split Vacancy Formation Energy / eV", + hoverdata={ + "Materials Project ID": MP_IDS_PBE, + "Formula": BULK_FORMULAE_PBE, + "Vacant Cation": VACANT_CATIONS_PBE, + }, +) +def formation_energies_pbe(build_results_pbe) -> dict[str, list]: + """ + Get DFT and predicted formation energies for nitrides (PBE(+U)). + + Parameters + ---------- + build_results + Tuple of results dictionaries. + + Returns + ------- + dict[str, list] + Dictionary of DFT and predicted formation energies. """ - Get mean absolute error for split-vancacy formation energies compared to DFT. + result_formation_energies, _, _ = build_results_pbe() + return result_formation_energies + + +@pytest.fixture +def formation_energy_pbesol_mae(formation_energies_pbesol) -> dict[str, float]: + """ + Get mean absolute error for split-vancacy formation energies compared to PBEsol. Parameters ---------- - formation_energies_dft + formation_energies_pbesol Dictionary of DFT and predicted formation energies. Returns @@ -247,15 +311,61 @@ def formation_energy_dft_mae(formation_energies_dft) -> dict[str, float]: results = {} for model_name in MODELS: results[model_name] = mae( - formation_energies_dft["ref"], formation_energies_dft[model_name] + formation_energies_pbesol["ref"], formation_energies_pbesol[model_name] ) return results @pytest.fixture -def spearmans_coefficient_dft_mean(build_results) -> dict[str, float]: +def formation_energy_pbe_mae(formation_energies_pbe) -> dict[str, float]: """ - Get mean spearmans coefficient between DFT and MLIP relaxed structures. + Get mean absolute error for split-vancacy formation energies compared to PBE(+U). + + Parameters + ---------- + formation_energies_pbesol + Dictionary of DFT and predicted formation energies. + + Returns + ------- + dict[str, float] + Dictionary of formation energy MAEs for all models. + """ + results = {} + for model_name in MODELS: + results[model_name] = mae( + formation_energies_pbe["ref"], formation_energies_pbe[model_name] + ) + return results + + +@pytest.fixture +def spearmans_coefficient_pbesol_mean(build_results_pbe) -> dict[str, float]: + """ + Energy ranking score of PBEsol relaxed structures (oxides). + + Parameters + ---------- + build_results + Tuple of results dictionaries. + + Returns + ------- + dict[str, float] + Dictionary of mean Spearman's coefficients for all models. + """ + _, result_spearmans_coefficient, _ = build_results_pbe + + results = {} + for model_name in MODELS: + results[model_name] = float(np.mean(result_spearmans_coefficient[model_name])) + return results + + +@pytest.fixture +def spearmans_coefficient_pbe_mean(build_results_pbe) -> dict[str, float]: + """ + Energy ranking score of PBE relaxed structures (nitrides). Parameters ---------- @@ -267,7 +377,7 @@ def spearmans_coefficient_dft_mean(build_results) -> dict[str, float]: dict[str, float] Dictionary of mean Spearman's coefficients for all models. """ - _, result_spearmans_coefficient, _ = build_results + _, result_spearmans_coefficient, _ = build_results_pbe results = {} for model_name in MODELS: @@ -276,9 +386,32 @@ def spearmans_coefficient_dft_mean(build_results) -> dict[str, float]: @pytest.fixture -def rmsd_dft_mean(build_results) -> dict[str, float]: +def rmsd_pbesol_mean(build_results_pbesol) -> dict[str, float]: + """ + Get RMSD between PBEsol and MLIP relaxed structures (oxides). + + Parameters + ---------- + build_results + Tuple of results dictionaries. + + Returns + ------- + dict[str, float] + Dictionary of mean relaxed structure RMSDs for all models. + """ + _, _, result_rmsd = build_results_pbesol + + results = {} + for model_name in MODELS: + results[model_name] = float(np.mean(result_rmsd[model_name])) + return results + + +@pytest.fixture +def rmsd_pbesol_mean(build_results_pbesol) -> dict[str, float]: """ - Get RMSD between DFT and MLIP relaxed structures. + Get RMSD between PBE and MLIP relaxed structures (nitrides). Parameters ---------- @@ -290,7 +423,7 @@ def rmsd_dft_mean(build_results) -> dict[str, float]: dict[str, float] Dictionary of mean relaxed structure RMSDs for all models. """ - _, _, result_rmsd = build_results + _, _, result_rmsd = build_results_pbesol() results = {} for model_name in MODELS: @@ -305,9 +438,12 @@ def rmsd_dft_mean(build_results) -> dict[str, float]: thresholds=DEFAULT_THRESHOLDS, ) def metrics( - formation_energy_dft_mae: dict[str, float], - spearmans_coefficient_dft_mean: dict[str, float], - rmsd_dft_mean: dict[str, float], + formation_energy_pbesol_mae: dict[str, float], + spearmans_coefficient_pbesol_mean: dict[str, float], + rmsd_pbesol_mean: dict[str, float], + formation_energy_pbe_mae: dict[str, float], + spearmans_coefficient_pbe_mean: dict[str, float], + rmsd_pbe_mean: dict[str, float], ) -> dict[str, dict]: """ Get all new benchmark metrics. @@ -328,9 +464,12 @@ def metrics( """ # print(formation_energy_dft_mae) return { - "MAE": formation_energy_dft_mae, - "Mean Spearman's Coefficient": spearmans_coefficient_dft_mean, - "RMSD": rmsd_dft_mean, + "MAE (PBEsol)": formation_energy_pbesol_mae, + "Spearman's (PBEsol)": spearmans_coefficient_pbesol_mean, + "RMSD (PBEsol)": rmsd_pbesol_mean, + "MAE (PBE)": formation_energy_pbe_mae, + "Spearman's (PBE)": spearmans_coefficient_pbe_mean, + "RMSD (PBE)": rmsd_pbe_mean, } From 172c814e64a33ba7a75868e9477bab060a03601c Mon Sep 17 00:00:00 2001 From: Thomas Warford Date: Sun, 8 Feb 2026 17:00:53 +0000 Subject: [PATCH 15/20] calculate rmsd (expensive) in calculate stage --- .../split_vacancy/calc_split_vacancies.py | 37 ++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) diff --git a/ml_peg/calcs/bulk_crystal/split_vacancy/calc_split_vacancies.py b/ml_peg/calcs/bulk_crystal/split_vacancy/calc_split_vacancies.py index c613dd674..493ac1c07 100644 --- a/ml_peg/calcs/bulk_crystal/split_vacancy/calc_split_vacancies.py +++ b/ml_peg/calcs/bulk_crystal/split_vacancy/calc_split_vacancies.py @@ -9,6 +9,8 @@ # from pymatgen.analysis import StructureMatcher from ase.io import read, write from ase.optimize import LBFGS +from pymatgen.analysis.structure_matcher import StructureMatcher +from pymatgen.core import Structure import pytest from tqdm.auto import tqdm @@ -20,6 +22,29 @@ DATA_PATH = Path("/Users/tw/Downloads/split_vacancy_data") OUT_PATH = Path(__file__).parent / "outputs" +# same setting as MatBench +# https://github.com/janosh/matbench-discovery/blob/93cc6907ac08b4adaa8391ccc4adf9c015c0dd61/matbench_discovery/structure/symmetry.py#L124 +STRUCTURE_MATCHER = StructureMatcher(stol=1.0, scale=False) + + +def get_rms_dist(atoms_1, atoms_2) -> tuple[float, float] | None: + """ + Calculate the RMSD between two ASE Atoms objects. + + Parameters + ---------- + atoms_1, atoms_2 + ASE Atoms objects. + + Returns + ------- + tuple[float, float] | None + (Root mean square displacement, max di) or None if no match. + """ + return STRUCTURE_MATCHER.get_rms_dist( + Structure.from_ase_atoms(atoms_1), Structure.from_ase_atoms(atoms_2) + ) + @pytest.mark.parametrize("mlip", MODELS.items()) def test_relax_and_calculate_energy(mlip: tuple[str, Any]): @@ -69,7 +94,8 @@ def test_relax_and_calculate_energy(mlip: tuple[str, Any]): ref_atoms_out_path.parent.mkdir(exist_ok=True, parents=True) write(ref_atoms_out_path, atoms_list) - for atoms in tqdm(atoms_list, leave=False): + for initial_atoms in tqdm(atoms_list, leave=False): + atoms = deepcopy(initial_atoms) atoms.calc = deepcopy(calc) atoms.info["initial_energy"] = atoms.get_potential_energy() @@ -78,6 +104,15 @@ def test_relax_and_calculate_energy(mlip: tuple[str, Any]): atoms.info["relaxed_energy"] = atoms.get_potential_energy() + rmsd_max_dist = get_rms_dist(atoms, initial_atoms) + if rmsd_max_dist is None: + atoms.info["ref_structure_match"] = False + else: + rmsd, max_dist = rmsd_max_dist + atoms.info["ref_structure_match"] = True + atoms.info["ref_rmsd"] = rmsd + atoms.info["ref_max_distance"] = max_dist + relaxed_atoms.append(atoms) atoms_out_path = ( From 2a84003f9628bad230737846ca5edd3f67c44e98 Mon Sep 17 00:00:00 2001 From: Thomas Warford Date: Sun, 8 Feb 2026 17:45:41 +0000 Subject: [PATCH 16/20] fix some bugs --- .../analyse_split_vacancy.py | 122 +++++++++--------- 1 file changed, 62 insertions(+), 60 deletions(-) diff --git a/ml_peg/analysis/bulk_crystal/analyse_split_vacancy/analyse_split_vacancy.py b/ml_peg/analysis/bulk_crystal/analyse_split_vacancy/analyse_split_vacancy.py index 7a6acfe58..5e5beedbf 100644 --- a/ml_peg/analysis/bulk_crystal/analyse_split_vacancy/analyse_split_vacancy.py +++ b/ml_peg/analysis/bulk_crystal/analyse_split_vacancy/analyse_split_vacancy.py @@ -7,7 +7,6 @@ from ase.io import read import numpy as np from pymatgen.analysis.structure_matcher import StructureMatcher -from pymatgen.core import Structure import pytest from scipy.stats import spearmanr from tqdm.auto import tqdm @@ -20,7 +19,7 @@ from ml_peg.models.models import current_models MODELS = get_model_names(current_models) -CALC_PATH = CALCS_ROOT / "bulk_crystal" / "split_vacancy" / "outputs" +CALC_PATH = CALCS_ROOT / "bulk_crystal" / "split_vacancy" / "outputs_big" CALC_PATH_PBESOL = CALC_PATH / "pbesol" # oxides CALC_PATH_PBE = CALC_PATH / "pbe" # nitrides OUT_PATH = APP_ROOT / "data" / "bulk_crystal" / "split_vacancy" @@ -49,17 +48,18 @@ def get_rmsd(atoms_1, atoms_2) -> float: float Root mean square displacement. """ - rmsd, max_dist = STRUCTURE_MATCHER.get_rms_dist( - Structure.from_ase_atoms(atoms_1), Structure.from_ase_atoms(atoms_2) - ) + return np.random.rand() - return rmsd - -def get_hoverdata(functional_path) -> tuple[list, list, list]: +def get_hoverdata(functional_path: Path) -> tuple[list, list, list]: """ Get hover data. + Parameters + ---------- + functional_path + Path to data. + Returns ------- tuple[list, list, list ] @@ -71,8 +71,8 @@ def get_hoverdata(functional_path) -> tuple[list, list, list]: formulae = [] vacant_cations = [] - model_dir = model_dir = functional_path / MODELS[0] - for material_dir in tqdm(list(model_dir.iterdir())): + model_dir = functional_path / MODELS[0] + for material_dir in model_dir.iterdir(): split_dir_name = material_dir.stem.split("-") bulk_formula = split_dir_name[0] mp_id = f"mp-{split_dir_name[-1]}" @@ -103,6 +103,11 @@ def build_results( """ Iterate through bulk-cation pairs calculating results. + Parameters + ---------- + functional_path + Path to data. + Returns ------- tuple[dict[str, float], dict[str, float], dict[str, float]] @@ -122,9 +127,7 @@ def build_results( mlip: [] for mlip in MODELS } # RMSD error for every material-cation pair # TODO: investigate Kendall rank correlation - result_rmsd = {mlip: [] for mlip in MODELS} - ref_stored = False for model_name in tqdm(MODELS): model_dir = functional_path / model_name @@ -143,37 +146,29 @@ def build_results( sv_xyz_path = cation_dir / "split_vacancy.xyz.gz" if not (nv_xyz_path.exists() and sv_xyz_path.exists()): + continue raise ValueError # TODO: remove nv_atoms_list = read(nv_xyz_path, ":") sv_atoms_list = read(sv_xyz_path, ":") - if not ref_stored: - ref_nv_energies = [ - float(at.info["ref_energy"]) for at in nv_atoms_list - ] - ref_sv_energies = [ - float(at.info["ref_energy"]) for at in sv_atoms_list - ] - - ref_sv_formation_energy = min(ref_sv_energies) - min( - ref_nv_energies - ) - # ref_sv_preferred = ( - # ref_sv_formation_energy < preference_energy_threshold - # ) # TODO: F1 score - - result_formation_energy["ref"].append(ref_sv_formation_energy) - - ref_cation_dir = ( - functional_path / "ref" / material_dir.stem / cation_dir.stem - ) - ref_nv_atoms_list = read( - ref_cation_dir / "normal_vacancy.xyz.gz", ":" - ) - ref_sv_atoms_list = read( - ref_cation_dir / "split_vacancy.xyz.gz", ":" - ) + # Load reference data + + ref_nv_energies = [float(at.info["ref_energy"]) for at in nv_atoms_list] + ref_sv_energies = [float(at.info["ref_energy"]) for at in sv_atoms_list] + + ref_sv_formation_energy = min(ref_sv_energies) - min(ref_nv_energies) + # ref_sv_preferred = ( + # ref_sv_formation_energy < preference_energy_threshold + # ) # TODO: F1 score + + result_formation_energy["ref"].append(ref_sv_formation_energy) + + ref_cation_dir = ( + functional_path / "ref" / material_dir.stem / cation_dir.stem + ) + ref_nv_atoms_list = read(ref_cation_dir / "normal_vacancy.xyz.gz", ":") + ref_sv_atoms_list = read(ref_cation_dir / "split_vacancy.xyz.gz", ":") nv_energies = [float(at.info["relaxed_energy"]) for at in nv_atoms_list] sv_energies = [float(at.info["relaxed_energy"]) for at in sv_atoms_list] @@ -185,7 +180,7 @@ def build_results( spearmans_coefficient = spearmanr( [float(at.info["initial_energy"]) for at in nv_atoms_list] + [float(at.info["initial_energy"]) for at in sv_atoms_list], - ref_sv_energies + ref_nv_energies, + ref_nv_energies + ref_sv_energies, ).statistic rmsd_list = [] @@ -203,7 +198,6 @@ def build_results( result_spearmans_coefficient[model_name].append(spearmans_coefficient) result_rmsd[model_name].extend(rmsd_list) - ref_stored = False return result_formation_energy, result_spearmans_coefficient, result_rmsd @@ -251,7 +245,7 @@ def formation_energies_pbesol(build_results_pbesol) -> dict[str, list]: Parameters ---------- - build_results + build_results_pbesol Tuple of results dictionaries. Returns @@ -259,7 +253,7 @@ def formation_energies_pbesol(build_results_pbesol) -> dict[str, list]: dict[str, list] Dictionary of DFT and predicted formation energies. """ - result_formation_energies, _, _ = build_results_pbesol() + result_formation_energies, _, _ = build_results_pbesol return result_formation_energies @@ -281,7 +275,7 @@ def formation_energies_pbe(build_results_pbe) -> dict[str, list]: Parameters ---------- - build_results + build_results_pbe Tuple of results dictionaries. Returns @@ -289,7 +283,7 @@ def formation_energies_pbe(build_results_pbe) -> dict[str, list]: dict[str, list] Dictionary of DFT and predicted formation energies. """ - result_formation_energies, _, _ = build_results_pbe() + result_formation_energies, _, _ = build_results_pbe return result_formation_energies @@ -323,7 +317,7 @@ def formation_energy_pbe_mae(formation_energies_pbe) -> dict[str, float]: Parameters ---------- - formation_energies_pbesol + formation_energies_pbe Dictionary of DFT and predicted formation energies. Returns @@ -340,13 +334,13 @@ def formation_energy_pbe_mae(formation_energies_pbe) -> dict[str, float]: @pytest.fixture -def spearmans_coefficient_pbesol_mean(build_results_pbe) -> dict[str, float]: +def spearmans_coefficient_pbesol_mean(build_results_pbesol) -> dict[str, float]: """ Energy ranking score of PBEsol relaxed structures (oxides). Parameters ---------- - build_results + build_results_pbesol Tuple of results dictionaries. Returns @@ -354,7 +348,7 @@ def spearmans_coefficient_pbesol_mean(build_results_pbe) -> dict[str, float]: dict[str, float] Dictionary of mean Spearman's coefficients for all models. """ - _, result_spearmans_coefficient, _ = build_results_pbe + _, result_spearmans_coefficient, _ = build_results_pbesol results = {} for model_name in MODELS: @@ -369,7 +363,7 @@ def spearmans_coefficient_pbe_mean(build_results_pbe) -> dict[str, float]: Parameters ---------- - build_results + build_results_pbe Tuple of results dictionaries. Returns @@ -392,7 +386,7 @@ def rmsd_pbesol_mean(build_results_pbesol) -> dict[str, float]: Parameters ---------- - build_results + build_results_pbesol Tuple of results dictionaries. Returns @@ -409,13 +403,13 @@ def rmsd_pbesol_mean(build_results_pbesol) -> dict[str, float]: @pytest.fixture -def rmsd_pbesol_mean(build_results_pbesol) -> dict[str, float]: +def rmsd_pbe_mean(build_results_pbe) -> dict[str, float]: """ - Get RMSD between PBE and MLIP relaxed structures (nitrides). + Get RMSD between PBE and MLIP relaxed structures (oxides). Parameters ---------- - build_results + build_results_pbe Tuple of results dictionaries. Returns @@ -423,7 +417,7 @@ def rmsd_pbesol_mean(build_results_pbesol) -> dict[str, float]: dict[str, float] Dictionary of mean relaxed structure RMSDs for all models. """ - _, _, result_rmsd = build_results_pbesol() + _, _, result_rmsd = build_results_pbe results = {} for model_name in MODELS: @@ -450,12 +444,20 @@ def metrics( Parameters ---------- - formation_energy_dft_mae - Split vancancy formation energy MAE for all models. - spearmans_coefficient_dft_mean - Spearman's coefficient mean for all models. - rmsd_dft_mean - RMSD for all models. + formation_energy_pbesol_mae + Split vacancy formation energy MAE (oxides, PBEsol) for all models. + spearmans_coefficient_pbesol_mean + Mean Spearman's rank correlation (oxides, PBEsol) for all models. + rmsd_pbesol_mean + Mean RMSD between DFT and MLIP relaxed structures (oxides, PBEsol) for all + models. + formation_energy_pbe_mae + Split vacancy formation energy MAE (nitrides, PBE(+U)) for all models. + spearmans_coefficient_pbe_mean + Mean Spearman's rank correlation (nitrides, PBE(+U)) for all models. + rmsd_pbe_mean + Mean RMSD between DFT and MLIP relaxed structures (nitrides, PBE(+U)) for all + models. Returns ------- From 0eb4a5e3fe46c07daa1b2460e21c2c724a053fa6 Mon Sep 17 00:00:00 2001 From: Thomas Warford Date: Sun, 8 Feb 2026 18:24:38 +0000 Subject: [PATCH 17/20] clean --- .../analyse_split_vacancy.py | 38 ++++++++----------- .../split_vacancy/app_split_vacancy.py | 17 ++++++--- 2 files changed, 28 insertions(+), 27 deletions(-) diff --git a/ml_peg/analysis/bulk_crystal/analyse_split_vacancy/analyse_split_vacancy.py b/ml_peg/analysis/bulk_crystal/analyse_split_vacancy/analyse_split_vacancy.py index 5e5beedbf..1a9d8b20b 100644 --- a/ml_peg/analysis/bulk_crystal/analyse_split_vacancy/analyse_split_vacancy.py +++ b/ml_peg/analysis/bulk_crystal/analyse_split_vacancy/analyse_split_vacancy.py @@ -128,6 +128,8 @@ def build_results( } # RMSD error for every material-cation pair # TODO: investigate Kendall rank correlation + ref_stored = False + for model_name in tqdm(MODELS): model_dir = functional_path / model_name @@ -146,7 +148,7 @@ def build_results( sv_xyz_path = cation_dir / "split_vacancy.xyz.gz" if not (nv_xyz_path.exists() and sv_xyz_path.exists()): - continue + # continue raise ValueError # TODO: remove nv_atoms_list = read(nv_xyz_path, ":") @@ -157,18 +159,15 @@ def build_results( ref_nv_energies = [float(at.info["ref_energy"]) for at in nv_atoms_list] ref_sv_energies = [float(at.info["ref_energy"]) for at in sv_atoms_list] - ref_sv_formation_energy = min(ref_sv_energies) - min(ref_nv_energies) - # ref_sv_preferred = ( - # ref_sv_formation_energy < preference_energy_threshold - # ) # TODO: F1 score - - result_formation_energy["ref"].append(ref_sv_formation_energy) + if not ref_stored: + ref_sv_formation_energy = min(ref_sv_energies) - min( + ref_nv_energies + ) + # ref_sv_preferred = ( + # ref_sv_formation_energy < preference_energy_threshold + # ) # TODO: F1 score - ref_cation_dir = ( - functional_path / "ref" / material_dir.stem / cation_dir.stem - ) - ref_nv_atoms_list = read(ref_cation_dir / "normal_vacancy.xyz.gz", ":") - ref_sv_atoms_list = read(ref_cation_dir / "split_vacancy.xyz.gz", ":") + result_formation_energy["ref"].append(ref_sv_formation_energy) nv_energies = [float(at.info["relaxed_energy"]) for at in nv_atoms_list] sv_energies = [float(at.info["relaxed_energy"]) for at in sv_atoms_list] @@ -183,21 +182,16 @@ def build_results( ref_nv_energies + ref_sv_energies, ).statistic - rmsd_list = [] - for ref_atoms, mlip_atoms in zip( - ref_nv_atoms_list, nv_atoms_list, strict=False - ): - rmsd_list.append(get_rmsd(ref_atoms, mlip_atoms)) - for ref_atoms, mlip_atoms in zip( - ref_sv_atoms_list, sv_atoms_list, strict=False - ): - rmsd_list.append(get_rmsd(ref_atoms, mlip_atoms)) - # add metrics to dicts result_formation_energy[model_name].append(sv_formation_energy) result_spearmans_coefficient[model_name].append(spearmans_coefficient) + rmsd_list = [ + -1 for i in range(len(ref_nv_energies) + len(ref_sv_energies)) + ] result_rmsd[model_name].extend(rmsd_list) + ref_stored = True + return result_formation_energy, result_spearmans_coefficient, result_rmsd diff --git a/ml_peg/app/bulk_crystal/split_vacancy/app_split_vacancy.py b/ml_peg/app/bulk_crystal/split_vacancy/app_split_vacancy.py index 634da0370..86e98f328 100644 --- a/ml_peg/app/bulk_crystal/split_vacancy/app_split_vacancy.py +++ b/ml_peg/app/bulk_crystal/split_vacancy/app_split_vacancy.py @@ -25,8 +25,12 @@ class SplitVacancyApp(BaseApp): def register_callbacks(self) -> None: """Register callbacks to app.""" - scatter_dft = read_plot( - DATA_PATH / "figure_formation_energies_dft.json", + scatter_pbesol = read_plot( + DATA_PATH / "figure_formation_energies_pbesol.json", + id=f"{BENCHMARK_NAME}-figure", + ) + scatter_pbe = read_plot( + DATA_PATH / "figure_formation_energies_pbe.json", id=f"{BENCHMARK_NAME}-figure", ) @@ -34,9 +38,12 @@ def register_callbacks(self) -> None: table_id=self.table_id, plot_id=f"{BENCHMARK_NAME}-figure-placeholder", column_to_plot={ - "MAE": scatter_dft, - "Mean Spearman's Coefficient": scatter_dft, - "RMSD": scatter_dft, + "MAE (PBEsol)": scatter_pbesol, + "Spearman's (PBEsol)": scatter_pbesol, + "RMSD (PBEsol)": scatter_pbesol, + "MAE (PBE)": scatter_pbe, + "Spearman's (PBE)": scatter_pbe, + "RMSD (PBE)": scatter_pbe, }, ) From 32d25757415fc575fd621f37540f5bbc8a02acad Mon Sep 17 00:00:00 2001 From: Thomas Warford Date: Mon, 16 Feb 2026 12:45:22 +0000 Subject: [PATCH 18/20] structure matching --- .../analyse_split_vacancy.py | 157 ++++++++---- .../analyse_split_vacancy/metrics.yml | 42 ++- ml_peg/models/models.yml | 240 +++++++++--------- 3 files changed, 262 insertions(+), 177 deletions(-) diff --git a/ml_peg/analysis/bulk_crystal/analyse_split_vacancy/analyse_split_vacancy.py b/ml_peg/analysis/bulk_crystal/analyse_split_vacancy/analyse_split_vacancy.py index 1a9d8b20b..4d8f9f7be 100644 --- a/ml_peg/analysis/bulk_crystal/analyse_split_vacancy/analyse_split_vacancy.py +++ b/ml_peg/analysis/bulk_crystal/analyse_split_vacancy/analyse_split_vacancy.py @@ -6,7 +6,6 @@ from ase.io import read import numpy as np -from pymatgen.analysis.structure_matcher import StructureMatcher import pytest from scipy.stats import spearmanr from tqdm.auto import tqdm @@ -19,7 +18,7 @@ from ml_peg.models.models import current_models MODELS = get_model_names(current_models) -CALC_PATH = CALCS_ROOT / "bulk_crystal" / "split_vacancy" / "outputs_big" +CALC_PATH = CALCS_ROOT / "bulk_crystal" / "split_vacancy" / "outputs" CALC_PATH_PBESOL = CALC_PATH / "pbesol" # oxides CALC_PATH_PBE = CALC_PATH / "pbe" # nitrides OUT_PATH = APP_ROOT / "data" / "bulk_crystal" / "split_vacancy" @@ -29,27 +28,6 @@ METRICS_CONFIG_PATH ) -# same setting as MatBench -# https://github.com/janosh/matbench-discovery/blob/93cc6907ac08b4adaa8391ccc4adf9c015c0dd61/matbench_discovery/structure/symmetry.py#L124 -STRUCTURE_MATCHER = StructureMatcher(stol=1.0, scale=False) - - -def get_rmsd(atoms_1, atoms_2) -> float: - """ - Calculate the RMSD between two ASE Atoms objects. - - Parameters - ---------- - atoms_1, atoms_2 - ASE Atoms objects. - - Returns - ------- - float - Root mean square displacement. - """ - return np.random.rand() - def get_hoverdata(functional_path: Path) -> tuple[list, list, list]: """ @@ -126,6 +104,7 @@ def build_results( result_rmsd = { mlip: [] for mlip in MODELS } # RMSD error for every material-cation pair + result_match = {mlip: [] for mlip in MODELS} # if structures relaxing to same state # TODO: investigate Kendall rank correlation ref_stored = False @@ -169,30 +148,55 @@ def build_results( result_formation_energy["ref"].append(ref_sv_formation_energy) - nv_energies = [float(at.info["relaxed_energy"]) for at in nv_atoms_list] - sv_energies = [float(at.info["relaxed_energy"]) for at in sv_atoms_list] - - # calculate metrics - sv_formation_energy = min(sv_energies) - min(nv_energies) - # sv_preferred = sv_formation_energy < preference_energy_threshold - + match_list = [] + rmsd_list = [] + + nv_initial_energies = [] + nv_relaxed_energies = [] + for nv_atoms in nv_atoms_list: + if nv_atoms.info["ref_structure_match"]: + rmsd_list.append(nv_atoms.info["ref_rmsd"]) + nv_relaxed_energies.append(nv_atoms.info["relaxed_energy"]) + else: + rmsd_list.append(np.nan) + + match_list.append(nv_atoms.info["ref_structure_match"]) + nv_initial_energies.append(nv_atoms.info["initial_energy"]) + + sv_initial_energies = [] + sv_relaxed_energies = [] + for sv_atoms in sv_atoms_list: + if sv_atoms.info["ref_structure_match"]: + rmsd_list.append(sv_atoms.info["ref_rmsd"]) + sv_relaxed_energies.append(sv_atoms.info["relaxed_energy"]) + else: + rmsd_list.append(np.nan) + + match_list.append(sv_atoms.info["ref_structure_match"]) + sv_initial_energies.append(sv_atoms.info["initial_energy"]) + + sv_formation_energy = min(sv_relaxed_energies, default=np.nan) - min( + nv_relaxed_energies, default=np.nan + ) spearmans_coefficient = spearmanr( - [float(at.info["initial_energy"]) for at in nv_atoms_list] - + [float(at.info["initial_energy"]) for at in sv_atoms_list], + nv_initial_energies + sv_initial_energies, ref_nv_energies + ref_sv_energies, ).statistic # add metrics to dicts result_formation_energy[model_name].append(sv_formation_energy) result_spearmans_coefficient[model_name].append(spearmans_coefficient) - rmsd_list = [ - -1 for i in range(len(ref_nv_energies) + len(ref_sv_energies)) - ] result_rmsd[model_name].extend(rmsd_list) + result_match[model_name].extend(match_list) ref_stored = True - return result_formation_energy, result_spearmans_coefficient, result_rmsd + return ( + result_formation_energy, + result_spearmans_coefficient, + result_rmsd, + result_match, + ) @pytest.fixture # cache outputs @@ -247,7 +251,7 @@ def formation_energies_pbesol(build_results_pbesol) -> dict[str, list]: dict[str, list] Dictionary of DFT and predicted formation energies. """ - result_formation_energies, _, _ = build_results_pbesol + result_formation_energies, _, _, _ = build_results_pbesol return result_formation_energies @@ -277,7 +281,7 @@ def formation_energies_pbe(build_results_pbe) -> dict[str, list]: dict[str, list] Dictionary of DFT and predicted formation energies. """ - result_formation_energies, _, _ = build_results_pbe + result_formation_energies, _, _, _ = build_results_pbe return result_formation_energies @@ -342,7 +346,7 @@ def spearmans_coefficient_pbesol_mean(build_results_pbesol) -> dict[str, float]: dict[str, float] Dictionary of mean Spearman's coefficients for all models. """ - _, result_spearmans_coefficient, _ = build_results_pbesol + _, result_spearmans_coefficient, _, _ = build_results_pbesol results = {} for model_name in MODELS: @@ -365,7 +369,7 @@ def spearmans_coefficient_pbe_mean(build_results_pbe) -> dict[str, float]: dict[str, float] Dictionary of mean Spearman's coefficients for all models. """ - _, result_spearmans_coefficient, _ = build_results_pbe + _, result_spearmans_coefficient, _, _ = build_results_pbe results = {} for model_name in MODELS: @@ -386,13 +390,13 @@ def rmsd_pbesol_mean(build_results_pbesol) -> dict[str, float]: Returns ------- dict[str, float] - Dictionary of mean relaxed structure RMSDs for all models. + Mean RMSD between MLIP and DFT relaxed structure that match. """ - _, _, result_rmsd = build_results_pbesol + _, _, result_rmsd, _ = build_results_pbesol results = {} for model_name in MODELS: - results[model_name] = float(np.mean(result_rmsd[model_name])) + results[model_name] = float(np.nanmean(result_rmsd[model_name])) return results @@ -409,13 +413,59 @@ def rmsd_pbe_mean(build_results_pbe) -> dict[str, float]: Returns ------- dict[str, float] - Dictionary of mean relaxed structure RMSDs for all models. + Mean RMSD between MLIP and DFT relaxed structures that match. + """ + _, _, result_rmsd, _ = build_results_pbe + + results = {} + for model_name in MODELS: + results[model_name] = float(np.nanmean(result_rmsd[model_name])) + return results + + +@pytest.fixture +def match_pbesol_rate(build_results_pbesol) -> dict[str, float]: + """ + Get RMSD between PBEsol and MLIP relaxed structures (oxides). + + Parameters + ---------- + build_results_pbesol + Tuple of results dictionaries. + + Returns + ------- + dict[str, float] + Rate of MLIP relaxing to same structure as DFT. + """ + _, _, _, result_match = build_results_pbesol + + results = {} + for model_name in MODELS: + results[model_name] = np.mean(result_match[model_name]) + return results + + +@pytest.fixture +def match_pbe_rate(build_results_pbe) -> dict[str, float]: + """ + Get RMSD between PBE and MLIP relaxed structures (oxides). + + Parameters + ---------- + build_results_pbe + Tuple of results dictionaries. + + Returns + ------- + dict[str, float] + Rate of MLIP relaxing to same structure as DFT. """ - _, _, result_rmsd = build_results_pbe + _, _, _, result_match = build_results_pbe results = {} for model_name in MODELS: - results[model_name] = float(np.mean(result_rmsd[model_name])) + results[model_name] = np.mean(result_match[model_name]) return results @@ -429,9 +479,11 @@ def metrics( formation_energy_pbesol_mae: dict[str, float], spearmans_coefficient_pbesol_mean: dict[str, float], rmsd_pbesol_mean: dict[str, float], + match_pbesol_rate: dict[str, float], formation_energy_pbe_mae: dict[str, float], spearmans_coefficient_pbe_mean: dict[str, float], rmsd_pbe_mean: dict[str, float], + match_pbe_rate: dict[str, float], ) -> dict[str, dict]: """ Get all new benchmark metrics. @@ -443,29 +495,32 @@ def metrics( spearmans_coefficient_pbesol_mean Mean Spearman's rank correlation (oxides, PBEsol) for all models. rmsd_pbesol_mean - Mean RMSD between DFT and MLIP relaxed structures (oxides, PBEsol) for all - models. + Mean RMSD between MLIP and PBEsol relaxed structure that match. + match_pbesol_rate + Rate of MLIP relaxing to same structure as PBEsol. formation_energy_pbe_mae Split vacancy formation energy MAE (nitrides, PBE(+U)) for all models. spearmans_coefficient_pbe_mean Mean Spearman's rank correlation (nitrides, PBE(+U)) for all models. rmsd_pbe_mean - Mean RMSD between DFT and MLIP relaxed structures (nitrides, PBE(+U)) for all - models. + Mean RMSD between MLIP and PBE relaxed structure that match. + match_pbe_rate + Rate of MLIP relaxing to same structure as PBE. Returns ------- dict[str, dict] Metric names and values for all models. """ - # print(formation_energy_dft_mae) return { "MAE (PBEsol)": formation_energy_pbesol_mae, "Spearman's (PBEsol)": spearmans_coefficient_pbesol_mean, "RMSD (PBEsol)": rmsd_pbesol_mean, + "Match Rate (PBEsol)": match_pbesol_rate, "MAE (PBE)": formation_energy_pbe_mae, "Spearman's (PBE)": spearmans_coefficient_pbe_mean, "RMSD (PBE)": rmsd_pbe_mean, + "Match Rate (PBE)": match_pbe_rate, } diff --git a/ml_peg/analysis/bulk_crystal/analyse_split_vacancy/metrics.yml b/ml_peg/analysis/bulk_crystal/analyse_split_vacancy/metrics.yml index 2219d4698..ba528368f 100644 --- a/ml_peg/analysis/bulk_crystal/analyse_split_vacancy/metrics.yml +++ b/ml_peg/analysis/bulk_crystal/analyse_split_vacancy/metrics.yml @@ -1,19 +1,49 @@ metrics: - Mean Spearman's Coefficient: + Spearman's (PBEsol): good: 0.0 bad: 1.0 unit: null - tooltip: Measure of the accuracy of the energy ranking of DFT relaxed structures by MLIPs. + tooltip: Measure of the accuracy of the energy ranking of PBEsol relaxed oxide structures by MLIPs. + level_of_theory: PBEsol + MAE (PBEsol): + good: 0.0 + bad: 1.0 + unit: eV + tooltip: Formation energy of split vacancy, considering lowest energy relaxed oxide structures. + level_of_theory: PBEsol + RMSD (PBEsol): + good: 0.0 + bad: 1.0 + unit: Å + tooltip: RMSD between MLIP and PBEsol relaxed oxide structures that match. + level_of_theory: PBEsol + Match Rate (PBEsol): + good: 0.0 + bad: 1.0 + unit: null + tooltip: Fraction of MLIP relaxations that converge to the same structure as PBEsol. + level_of_theory: PBEsol + Spearman's (PBE): + good: 0.0 + bad: 1.0 + unit: null + tooltip: Measure of the accuracy of the energy ranking of PBE relaxed nitride structures by MLIPs. level_of_theory: PBE - MAE: + MAE (PBE): good: 0.0 bad: 1.0 unit: eV - tooltip: Formation energy of split vacancy, considering lowest energy relaxed structures. + tooltip: Formation energy of split vacancy, considering lowest energy relaxed nitride structures. level_of_theory: PBE - RMSD: + RMSD (PBE): good: 0.0 bad: 1.0 unit: Å - tooltip: RMSD between DFT relaxed structures, pre- and post- MLIP relaxation. + tooltip: RMSD between MLIP and PBE relaxed nitride structures that match. + level_of_theory: PBE + Match Rate (PBE): + good: 0.0 + bad: 1.0 + unit: null + tooltip: Fraction of MLIP relaxations that converge to the same structure as PBE. level_of_theory: PBE diff --git a/ml_peg/models/models.yml b/ml_peg/models/models.yml index 77e54a16b..f83c40c50 100644 --- a/ml_peg/models/models.yml +++ b/ml_peg/models/models.yml @@ -1,12 +1,12 @@ -mace-mp-0a: - module: mace.calculators - class_name: mace_mp - device: "auto" - default_dtype: float32 - trained_on_d3: false - level_of_theory: PBE - kwargs: - model: "medium" +# mace-mp-0a: +# module: mace.calculators +# class_name: mace_mp +# device: "auto" +# default_dtype: float32 +# trained_on_d3: false +# level_of_theory: PBE +# kwargs: +# model: "medium" mace-mp-0b3: module: mace.calculators @@ -51,122 +51,122 @@ mace-matpes-r2scan: d3_kwargs: xc: r2scan -# mace-mh-1-omat: -# module: mace.calculators -# class_name: mace_mp -# device: "auto" -# default_dtype: float32 -# trained_on_d3: false -# level_of_theory: PBE -# kwargs: -# model: "mh-1" -# head: omat_pbe - -orb-v3-consv-inf-omat: - module: orb_models.forcefield - class_name: OrbCalc - device: "cpu" - default_dtype: float32-high - trained_on_d3: false - level_of_theory: PBE - kwargs: - name: "orb_v3_conservative_inf_omat" - -# mattersim-5M: -# module: mattersim.forcefield -# class_name: MatterSimCalculator +# # mace-mh-1-omat: +# # module: mace.calculators +# # class_name: mace_mp +# # device: "auto" +# # default_dtype: float32 +# # trained_on_d3: false +# # level_of_theory: PBE +# # kwargs: +# # model: "mh-1" +# # head: omat_pbe + +# orb-v3-consv-inf-omat: +# module: orb_models.forcefield +# class_name: OrbCalc # device: "cpu" -# load_path: "mattersim-v1.0.0-5m" +# default_dtype: float32-high # trained_on_d3: false # level_of_theory: PBE - -# uma-m-1p1-omat: -# module: fairchem.core -# class_name: FAIRChemCalculator -# device: "cpu" -# level_of_theory: PBE -# trained_on_d3: false # kwargs: -# model_name: "uma-m-1p1" -# task_name: "omat" - -# uma-s-1p1-omat: -# module: fairchem.core -# class_name: FAIRChemCalculator -# device: "cpu" -# level_of_theory: PBE -# trained_on_d3: false -# kwargs: -# model_name: "uma-s-1p1" -# task_name: "omat" - -# GRACE-2L-OAM: -# module: tensorpotential.calculator -# class_name: TPCalculator +# name: "orb_v3_conservative_inf_omat" + +# # mattersim-5M: +# # module: mattersim.forcefield +# # class_name: MatterSimCalculator +# # device: "cpu" +# # load_path: "mattersim-v1.0.0-5m" +# # trained_on_d3: false +# # level_of_theory: PBE + +# # uma-m-1p1-omat: +# # module: fairchem.core +# # class_name: FAIRChemCalculator +# # device: "cpu" +# # level_of_theory: PBE +# # trained_on_d3: false +# # kwargs: +# # model_name: "uma-m-1p1" +# # task_name: "omat" + +# # uma-s-1p1-omat: +# # module: fairchem.core +# # class_name: FAIRChemCalculator +# # device: "cpu" +# # level_of_theory: PBE +# # trained_on_d3: false +# # kwargs: +# # model_name: "uma-s-1p1" +# # task_name: "omat" + +# # GRACE-2L-OAM: +# # module: tensorpotential.calculator +# # class_name: TPCalculator +# # device: "cpu" +# # default_dtype: float32 +# # trained_on_d3: false +# # kwargs: +# # model: "/Users/joehart/Desktop/0_Cambridge/0_MPhil_Scientific_Computing/MPhil_project/mlipx_testing/models/GRACE-2L-OAM_28Jan25" + +# # MACE-OFF23(L): +# # module: mace.calculators +# # class_name: mace_off +# # device: "auto" +# # default_dtype: float64 +# # trained_on_d3: true +# # kwargs: +# # name: large + +# # mace-omol: +# # module: mace.calculators +# # class_name: mace_omol +# # device: "auto" +# # trained_on_d3: true +# # level_of_theory: ωB97M-V/def2-TZVPD +# # default_dtype: float64 +# # kwargs: +# # model: "extra_large" + +# # mace-mh-1-omol: +# # module: mace.calculators +# # class_name: mace_mp +# # device: "auto" +# # default_dtype: float64 +# # trained_on_d3: true +# # level_of_theory: ωB97M-V/def2-TZVPD +# # kwargs: +# # model: "mh-1" +# # head: omol + +# # uma-m-1p1-omol: +# # module: fairchem.core +# # class_name: FAIRChemCalculator +# # device: "cpu" +# # trained_on_d3: true +# # level_of_theory: ωB97M-V/def2-TZVPD +# # kwargs: +# # model_name: "uma-m-1p1" +# # task_name: "omol" + +# # uma-s-1p1-omol: +# # module: fairchem.core +# # class_name: FAIRChemCalculator +# # device: "cpu" +# # trained_on_d3: true +# # level_of_theory: ωB97M-V/def2-TZVPD +# # kwargs: +# # model_name: "uma-s-1p1" +# # task_name: "omol" + +# pet-mad: +# module: pet_mad.calculator +# class_name: PETMADCalculator # device: "cpu" # default_dtype: float32 # trained_on_d3: false +# level_of_theory: PBEsol # kwargs: -# model: "/Users/joehart/Desktop/0_Cambridge/0_MPhil_Scientific_Computing/MPhil_project/mlipx_testing/models/GRACE-2L-OAM_28Jan25" - -# MACE-OFF23(L): -# module: mace.calculators -# class_name: mace_off -# device: "auto" -# default_dtype: float64 -# trained_on_d3: true -# kwargs: -# name: large - -# mace-omol: -# module: mace.calculators -# class_name: mace_omol -# device: "auto" -# trained_on_d3: true -# level_of_theory: ωB97M-V/def2-TZVPD -# default_dtype: float64 -# kwargs: -# model: "extra_large" - -# mace-mh-1-omol: -# module: mace.calculators -# class_name: mace_mp -# device: "auto" -# default_dtype: float64 -# trained_on_d3: true -# level_of_theory: ωB97M-V/def2-TZVPD -# kwargs: -# model: "mh-1" -# head: omol - -# uma-m-1p1-omol: -# module: fairchem.core -# class_name: FAIRChemCalculator -# device: "cpu" -# trained_on_d3: true -# level_of_theory: ωB97M-V/def2-TZVPD -# kwargs: -# model_name: "uma-m-1p1" -# task_name: "omol" - -# uma-s-1p1-omol: -# module: fairchem.core -# class_name: FAIRChemCalculator -# device: "cpu" -# trained_on_d3: true -# level_of_theory: ωB97M-V/def2-TZVPD -# kwargs: -# model_name: "uma-s-1p1" -# task_name: "omol" - -pet-mad: - module: pet_mad.calculator - class_name: PETMADCalculator - device: "cpu" - default_dtype: float32 - trained_on_d3: false - level_of_theory: PBEsol - kwargs: - version: "v1.0.2" - d3_kwargs: - xc: pbesol +# version: "v1.0.2" +# d3_kwargs: +# xc: pbesol From 1d0ddd99dcffb6df4e511699063ea02a26d7edb4 Mon Sep 17 00:00:00 2001 From: Thomas Warford Date: Mon, 23 Mar 2026 22:13:59 +0000 Subject: [PATCH 19/20] update description --- ml_peg/app/bulk_crystal/split_vacancy/app_split_vacancy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ml_peg/app/bulk_crystal/split_vacancy/app_split_vacancy.py b/ml_peg/app/bulk_crystal/split_vacancy/app_split_vacancy.py index 86e98f328..59d542230 100644 --- a/ml_peg/app/bulk_crystal/split_vacancy/app_split_vacancy.py +++ b/ml_peg/app/bulk_crystal/split_vacancy/app_split_vacancy.py @@ -61,7 +61,7 @@ def get_app() -> SplitVacancyApp: name=BENCHMARK_NAME, description=( "Performance predicting the formation energy of split " - "vacancies from regular vacancies in XX systems." # TODO: add number + "vacancies from fully ionised vacancies in nitrides and oxides." ), docs_url=DOCS_URL, table_path=DATA_PATH / "split_vacancy_metrics_table.json", From 76c55d4c3f47b560be076e85eada5709a3095aa9 Mon Sep 17 00:00:00 2001 From: Thomas Warford Date: Mon, 23 Mar 2026 22:16:01 +0000 Subject: [PATCH 20/20] remove structure match metric (always 1) --- .../analyse_split_vacancy.py | 10 +---- .../analyse_split_vacancy/metrics.yml | 40 +++++++++---------- .../split_vacancy/calc_split_vacancies.py | 7 +++- 3 files changed, 27 insertions(+), 30 deletions(-) diff --git a/ml_peg/analysis/bulk_crystal/analyse_split_vacancy/analyse_split_vacancy.py b/ml_peg/analysis/bulk_crystal/analyse_split_vacancy/analyse_split_vacancy.py index 4d8f9f7be..69031c315 100644 --- a/ml_peg/analysis/bulk_crystal/analyse_split_vacancy/analyse_split_vacancy.py +++ b/ml_peg/analysis/bulk_crystal/analyse_split_vacancy/analyse_split_vacancy.py @@ -479,11 +479,9 @@ def metrics( formation_energy_pbesol_mae: dict[str, float], spearmans_coefficient_pbesol_mean: dict[str, float], rmsd_pbesol_mean: dict[str, float], - match_pbesol_rate: dict[str, float], formation_energy_pbe_mae: dict[str, float], spearmans_coefficient_pbe_mean: dict[str, float], rmsd_pbe_mean: dict[str, float], - match_pbe_rate: dict[str, float], ) -> dict[str, dict]: """ Get all new benchmark metrics. @@ -496,16 +494,12 @@ def metrics( Mean Spearman's rank correlation (oxides, PBEsol) for all models. rmsd_pbesol_mean Mean RMSD between MLIP and PBEsol relaxed structure that match. - match_pbesol_rate - Rate of MLIP relaxing to same structure as PBEsol. formation_energy_pbe_mae Split vacancy formation energy MAE (nitrides, PBE(+U)) for all models. spearmans_coefficient_pbe_mean Mean Spearman's rank correlation (nitrides, PBE(+U)) for all models. rmsd_pbe_mean Mean RMSD between MLIP and PBE relaxed structure that match. - match_pbe_rate - Rate of MLIP relaxing to same structure as PBE. Returns ------- @@ -516,11 +510,11 @@ def metrics( "MAE (PBEsol)": formation_energy_pbesol_mae, "Spearman's (PBEsol)": spearmans_coefficient_pbesol_mean, "RMSD (PBEsol)": rmsd_pbesol_mean, - "Match Rate (PBEsol)": match_pbesol_rate, + # "Match Rate (PBEsol)": match_pbesol_rate, # not included since always 1 "MAE (PBE)": formation_energy_pbe_mae, "Spearman's (PBE)": spearmans_coefficient_pbe_mean, "RMSD (PBE)": rmsd_pbe_mean, - "Match Rate (PBE)": match_pbe_rate, + # "Match Rate (PBE)": match_pbe_rate, # not included since always 1 } diff --git a/ml_peg/analysis/bulk_crystal/analyse_split_vacancy/metrics.yml b/ml_peg/analysis/bulk_crystal/analyse_split_vacancy/metrics.yml index ba528368f..185e4df3c 100644 --- a/ml_peg/analysis/bulk_crystal/analyse_split_vacancy/metrics.yml +++ b/ml_peg/analysis/bulk_crystal/analyse_split_vacancy/metrics.yml @@ -1,49 +1,49 @@ metrics: Spearman's (PBEsol): - good: 0.0 - bad: 1.0 + good: 1.0 + bad: 0.6 unit: null tooltip: Measure of the accuracy of the energy ranking of PBEsol relaxed oxide structures by MLIPs. level_of_theory: PBEsol MAE (PBEsol): good: 0.0 - bad: 1.0 + bad: 2.0 unit: eV tooltip: Formation energy of split vacancy, considering lowest energy relaxed oxide structures. level_of_theory: PBEsol RMSD (PBEsol): good: 0.0 - bad: 1.0 + bad: 0.2 unit: Å tooltip: RMSD between MLIP and PBEsol relaxed oxide structures that match. level_of_theory: PBEsol - Match Rate (PBEsol): - good: 0.0 - bad: 1.0 - unit: null - tooltip: Fraction of MLIP relaxations that converge to the same structure as PBEsol. - level_of_theory: PBEsol + # Match Rate (PBEsol): # always 1 atm + # good: 0.0 + # bad: 1.0 + # unit: null + # tooltip: Fraction of MLIP relaxations that converge to the same structure as PBEsol. + # level_of_theory: PBEsol Spearman's (PBE): - good: 0.0 - bad: 1.0 + good: 1.0 + bad: 0.6 unit: null tooltip: Measure of the accuracy of the energy ranking of PBE relaxed nitride structures by MLIPs. level_of_theory: PBE MAE (PBE): good: 0.0 - bad: 1.0 + bad: 2.0 unit: eV tooltip: Formation energy of split vacancy, considering lowest energy relaxed nitride structures. level_of_theory: PBE RMSD (PBE): good: 0.0 - bad: 1.0 + bad: 0.2 unit: Å tooltip: RMSD between MLIP and PBE relaxed nitride structures that match. level_of_theory: PBE - Match Rate (PBE): - good: 0.0 - bad: 1.0 - unit: null - tooltip: Fraction of MLIP relaxations that converge to the same structure as PBE. - level_of_theory: PBE + # Match Rate (PBE): # always 1 atm + # good: 0.0 + # bad: 1.0 + # unit: null + # tooltip: Fraction of MLIP relaxations that converge to the same structure as PBE. + # level_of_theory: PBE diff --git a/ml_peg/calcs/bulk_crystal/split_vacancy/calc_split_vacancies.py b/ml_peg/calcs/bulk_crystal/split_vacancy/calc_split_vacancies.py index 493ac1c07..11921c17c 100644 --- a/ml_peg/calcs/bulk_crystal/split_vacancy/calc_split_vacancies.py +++ b/ml_peg/calcs/bulk_crystal/split_vacancy/calc_split_vacancies.py @@ -14,12 +14,15 @@ import pytest from tqdm.auto import tqdm +from ml_peg.calcs.utils.utils import download_github_data from ml_peg.models.get_models import load_models from ml_peg.models.models import current_models MODELS = load_models(current_models) -# TODO: DATA_PATH = download_github_data(filename, github_uri) -DATA_PATH = Path("/Users/tw/Downloads/split_vacancy_data") +github_uri = "https://github.com/ThomasWarford/defect_data/raw/refs/heads/main/" +filename = "split_vacancy_data.zip" +DATA_PATH = download_github_data(filename, github_uri) +DATA_PATH = Path(DATA_PATH) / "split_vacancy_data" OUT_PATH = Path(__file__).parent / "outputs" # same setting as MatBench