diff --git a/ml_peg/analysis/bulk_crystal/analyse_split_vacancy/analyse_split_vacancy.py b/ml_peg/analysis/bulk_crystal/analyse_split_vacancy/analyse_split_vacancy.py
new file mode 100644
index 000000000..69031c315
--- /dev/null
+++ b/ml_peg/analysis/bulk_crystal/analyse_split_vacancy/analyse_split_vacancy.py
@@ -0,0 +1,530 @@
+"""Analyse split vacancy benchmark."""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+from ase.io import read
+import numpy as np
+import pytest
+from scipy.stats import spearmanr
+from tqdm.auto import tqdm
+
+from ml_peg.analysis.utils.decorators import build_table, plot_parity
+from ml_peg.analysis.utils.utils import load_metrics_config, mae
+from ml_peg.app import APP_ROOT
+from ml_peg.calcs import CALCS_ROOT
+from ml_peg.models.get_models import get_model_names
+from ml_peg.models.models import current_models
+
+MODELS = get_model_names(current_models)
+CALC_PATH = CALCS_ROOT / "bulk_crystal" / "split_vacancy" / "outputs"
+CALC_PATH_PBESOL = CALC_PATH / "pbesol"  # oxides
+CALC_PATH_PBE = CALC_PATH / "pbe"  # nitrides
+OUT_PATH = APP_ROOT / "data" / "bulk_crystal" / "split_vacancy"
+
+METRICS_CONFIG_PATH = Path(__file__).with_name("metrics.yml")
+DEFAULT_THRESHOLDS, DEFAULT_TOOLTIPS, DEFAULT_WEIGHTS = load_metrics_config(
+    METRICS_CONFIG_PATH
+)
+
+
+def get_hoverdata(functional_path: Path) -> tuple[list, list, list]:
+    """
+    Get hover data.
+
+    Parameters
+    ----------
+    functional_path
+        Path to data.
+
+    Returns
+    -------
+    tuple[list, list, list ]
+        Tuple of Materials Project IDs, bulk formulae and vacant cations.
+    """
+    # TODO: RMSD could be good hoverdata?
+    # TODO: add cell charge?
+    mp_ids = []
+    formulae = []
+    vacant_cations = []
+
+    model_dir = functional_path / MODELS[0]
+    for material_dir in model_dir.iterdir():
+        split_dir_name = material_dir.stem.split("-")
+        bulk_formula = split_dir_name[0]
+        mp_id = f"mp-{split_dir_name[-1]}"
+
+        cation_dirs = [
+            p for p in material_dir.iterdir() if p.is_dir()
+        ]  # skip pristine supercell.xyz files if present (not used)
+
+        for cation_dir in cation_dirs:
+            cation = cation_dir.stem
+
+            mp_ids.append(mp_id)
+            formulae.append(bulk_formula)
+            vacant_cations.append(cation)
+
+    return mp_ids, formulae, vacant_cations
+
+
+MP_IDS_PBE, BULK_FORMULAE_PBE, VACANT_CATIONS_PBE = get_hoverdata(CALC_PATH_PBE)
+MP_IDS_PBESOL, BULK_FORMULAE_PBESOL, VACANT_CATIONS_PBESOL = get_hoverdata(
+    CALC_PATH_PBESOL
+)
+
+
+def build_results(
+    functional_path,
+) -> tuple[dict[str, list], dict[str, list], dict[str, list]]:
+    """
+    Iterate through bulk-cation pairs calculating results.
+
+    Parameters
+    ----------
+    functional_path
+        Path to data.
+
+    Returns
+    -------
+    tuple[dict[str, float], dict[str, float], dict[str, float]]
+        Tuple of metrics.
+    """
+    # preference_energy_threshold = 0  # TODO: confirm
+
+    print(f"Analysing {functional_path.stem} calculations.")
+
+    result_formation_energy = {"ref": []} | {
+        mlip: [] for mlip in MODELS
+    }  # formation energy for every material-cation pair
+    result_spearmans_coefficient = {
+        mlip: [] for mlip in MODELS
+    }  # spearmans coefficient for every material-cation pair
+    result_rmsd = {
+        mlip: [] for mlip in MODELS
+    }  # RMSD error for every material-cation pair
+    result_match = {mlip: [] for mlip in MODELS}  # if structures relaxing to same state
+    # TODO: investigate Kendall rank correlation
+
+    ref_stored = False
+
+    for model_name in tqdm(MODELS):
+        model_dir = functional_path / model_name
+
+        if not model_dir.exists():
+            continue
+
+        for material_dir in tqdm(list(model_dir.iterdir()), leave=False):
+            cation_dirs = [
+                p for p in material_dir.iterdir() if p.is_dir()
+            ]  # skip pristine supercell.xyz files if present (not used)
+
+            for cation_dir in tqdm(cation_dirs, leave=False):
+                # cation = cation_dir.stem TODO: save structures for visualization
+
+                nv_xyz_path = cation_dir / "normal_vacancy.xyz.gz"
+                sv_xyz_path = cation_dir / "split_vacancy.xyz.gz"
+
+                if not (nv_xyz_path.exists() and sv_xyz_path.exists()):
+                    # continue
+                    raise ValueError  # TODO: remove
+
+                nv_atoms_list = read(nv_xyz_path, ":")
+                sv_atoms_list = read(sv_xyz_path, ":")
+
+                # Load reference data
+
+                ref_nv_energies = [float(at.info["ref_energy"]) for at in nv_atoms_list]
+                ref_sv_energies = [float(at.info["ref_energy"]) for at in sv_atoms_list]
+
+                if not ref_stored:
+                    ref_sv_formation_energy = min(ref_sv_energies) - min(
+                        ref_nv_energies
+                    )
+                    # ref_sv_preferred = (
+                    #     ref_sv_formation_energy < preference_energy_threshold
+                    # ) # TODO: F1 score
+
+                    result_formation_energy["ref"].append(ref_sv_formation_energy)
+
+                match_list = []
+                rmsd_list = []
+
+                nv_initial_energies = []
+                nv_relaxed_energies = []
+                for nv_atoms in nv_atoms_list:
+                    if nv_atoms.info["ref_structure_match"]:
+                        rmsd_list.append(nv_atoms.info["ref_rmsd"])
+                        nv_relaxed_energies.append(nv_atoms.info["relaxed_energy"])
+                    else:
+                        rmsd_list.append(np.nan)
+
+                    match_list.append(nv_atoms.info["ref_structure_match"])
+                    nv_initial_energies.append(nv_atoms.info["initial_energy"])
+
+                sv_initial_energies = []
+                sv_relaxed_energies = []
+                for sv_atoms in sv_atoms_list:
+                    if sv_atoms.info["ref_structure_match"]:
+                        rmsd_list.append(sv_atoms.info["ref_rmsd"])
+                        sv_relaxed_energies.append(sv_atoms.info["relaxed_energy"])
+                    else:
+                        rmsd_list.append(np.nan)
+
+                    match_list.append(sv_atoms.info["ref_structure_match"])
+                    sv_initial_energies.append(sv_atoms.info["initial_energy"])
+
+                sv_formation_energy = min(sv_relaxed_energies, default=np.nan) - min(
+                    nv_relaxed_energies, default=np.nan
+                )
+                spearmans_coefficient = spearmanr(
+                    nv_initial_energies + sv_initial_energies,
+                    ref_nv_energies + ref_sv_energies,
+                ).statistic
+
+                # add metrics to dicts
+                result_formation_energy[model_name].append(sv_formation_energy)
+                result_spearmans_coefficient[model_name].append(spearmans_coefficient)
+                result_rmsd[model_name].extend(rmsd_list)
+                result_match[model_name].extend(match_list)
+
+        ref_stored = True
+
+    return (
+        result_formation_energy,
+        result_spearmans_coefficient,
+        result_rmsd,
+        result_match,
+    )
+
+
+@pytest.fixture  # cache outputs
+def build_results_pbesol():
+    """
+    Get PBEsol (oxide) results.
+
+    Returns
+    -------
+    tuple[dict[str, float], dict[str, float], dict[str, float]]
+        Tuple of metrics.
+    """
+    return build_results(CALC_PATH_PBESOL)
+
+
+@pytest.fixture  # cache outputs
+def build_results_pbe():
+    """
+    Get PBE (nitride) results.
+
+    Returns
+    -------
+    tuple[dict[str, float], dict[str, float], dict[str, float]]
+        Tuple of metrics.
+    """
+    return build_results(CALC_PATH_PBE)
+
+
+@pytest.fixture
+@plot_parity(
+    filename=OUT_PATH / "figure_formation_energies_pbesol.json",
+    title="Split vacancy (Oxides, PBEsol)",
+    x_label="Predicted Split Vacancy Formation Energy / eV",
+    y_label="DFT Split Vacancy Formation Energy / eV",
+    hoverdata={
+        "Materials Project ID": MP_IDS_PBESOL,
+        "Formula": BULK_FORMULAE_PBESOL,
+        "Vacant Cation": VACANT_CATIONS_PBESOL,
+    },
+)
+def formation_energies_pbesol(build_results_pbesol) -> dict[str, list]:
+    """
+    Get DFT and predicted formation energies for oxides (PBEsol).
+
+    Parameters
+    ----------
+    build_results_pbesol
+        Tuple of results dictionaries.
+
+    Returns
+    -------
+    dict[str, list]
+        Dictionary of DFT and predicted formation energies.
+    """
+    result_formation_energies, _, _, _ = build_results_pbesol
+    return result_formation_energies
+
+
+@pytest.fixture
+@plot_parity(
+    filename=OUT_PATH / "figure_formation_energies_pbe.json",
+    title="Split vacancy (Nitrides, PBE(+U))",
+    x_label="Predicted Split Vacancy Formation Energy / eV",
+    y_label="DFT Split Vacancy Formation Energy / eV",
+    hoverdata={
+        "Materials Project ID": MP_IDS_PBE,
+        "Formula": BULK_FORMULAE_PBE,
+        "Vacant Cation": VACANT_CATIONS_PBE,
+    },
+)
+def formation_energies_pbe(build_results_pbe) -> dict[str, list]:
+    """
+    Get DFT and predicted formation energies for nitrides (PBE(+U)).
+
+    Parameters
+    ----------
+    build_results_pbe
+        Tuple of results dictionaries.
+
+    Returns
+    -------
+    dict[str, list]
+        Dictionary of DFT and predicted formation energies.
+    """
+    result_formation_energies, _, _, _ = build_results_pbe
+    return result_formation_energies
+
+
+@pytest.fixture
+def formation_energy_pbesol_mae(formation_energies_pbesol) -> dict[str, float]:
+    """
+    Get mean absolute error for split-vancacy formation energies compared to PBEsol.
+
+    Parameters
+    ----------
+    formation_energies_pbesol
+        Dictionary of DFT and predicted formation energies.
+
+    Returns
+    -------
+    dict[str, float]
+        Dictionary of formation energy MAEs for all models.
+    """
+    results = {}
+    for model_name in MODELS:
+        results[model_name] = mae(
+            formation_energies_pbesol["ref"], formation_energies_pbesol[model_name]
+        )
+    return results
+
+
+@pytest.fixture
+def formation_energy_pbe_mae(formation_energies_pbe) -> dict[str, float]:
+    """
+    Get mean absolute error for split-vancacy formation energies compared to PBE(+U).
+
+    Parameters
+    ----------
+    formation_energies_pbe
+        Dictionary of DFT and predicted formation energies.
+
+    Returns
+    -------
+    dict[str, float]
+        Dictionary of formation energy MAEs for all models.
+    """
+    results = {}
+    for model_name in MODELS:
+        results[model_name] = mae(
+            formation_energies_pbe["ref"], formation_energies_pbe[model_name]
+        )
+    return results
+
+
+@pytest.fixture
+def spearmans_coefficient_pbesol_mean(build_results_pbesol) -> dict[str, float]:
+    """
+    Energy ranking score of PBEsol relaxed structures (oxides).
+
+    Parameters
+    ----------
+    build_results_pbesol
+        Tuple of results dictionaries.
+
+    Returns
+    -------
+    dict[str, float]
+        Dictionary of mean Spearman's coefficients for all models.
+    """
+    _, result_spearmans_coefficient, _, _ = build_results_pbesol
+
+    results = {}
+    for model_name in MODELS:
+        results[model_name] = float(np.mean(result_spearmans_coefficient[model_name]))
+    return results
+
+
+@pytest.fixture
+def spearmans_coefficient_pbe_mean(build_results_pbe) -> dict[str, float]:
+    """
+    Energy ranking score of PBE relaxed structures (nitrides).
+
+    Parameters
+    ----------
+    build_results_pbe
+        Tuple of results dictionaries.
+
+    Returns
+    -------
+    dict[str, float]
+        Dictionary of mean Spearman's coefficients for all models.
+    """
+    _, result_spearmans_coefficient, _, _ = build_results_pbe
+
+    results = {}
+    for model_name in MODELS:
+        results[model_name] = float(np.mean(result_spearmans_coefficient[model_name]))
+    return results
+
+
+@pytest.fixture
+def rmsd_pbesol_mean(build_results_pbesol) -> dict[str, float]:
+    """
+    Get RMSD between PBEsol and MLIP relaxed structures (oxides).
+
+    Parameters
+    ----------
+    build_results_pbesol
+        Tuple of results dictionaries.
+
+    Returns
+    -------
+    dict[str, float]
+        Mean RMSD between MLIP and DFT relaxed structure that match.
+    """
+    _, _, result_rmsd, _ = build_results_pbesol
+
+    results = {}
+    for model_name in MODELS:
+        results[model_name] = float(np.nanmean(result_rmsd[model_name]))
+    return results
+
+
+@pytest.fixture
+def rmsd_pbe_mean(build_results_pbe) -> dict[str, float]:
+    """
+    Get RMSD between PBE and MLIP relaxed structures (oxides).
+
+    Parameters
+    ----------
+    build_results_pbe
+        Tuple of results dictionaries.
+
+    Returns
+    -------
+    dict[str, float]
+        Mean RMSD between MLIP and DFT relaxed structures that match.
+    """
+    _, _, result_rmsd, _ = build_results_pbe
+
+    results = {}
+    for model_name in MODELS:
+        results[model_name] = float(np.nanmean(result_rmsd[model_name]))
+    return results
+
+
+@pytest.fixture
+def match_pbesol_rate(build_results_pbesol) -> dict[str, float]:
+    """
+    Get RMSD between PBEsol and MLIP relaxed structures (oxides).
+
+    Parameters
+    ----------
+    build_results_pbesol
+        Tuple of results dictionaries.
+
+    Returns
+    -------
+    dict[str, float]
+        Rate of MLIP relaxing to same structure as DFT.
+    """
+    _, _, _, result_match = build_results_pbesol
+
+    results = {}
+    for model_name in MODELS:
+        results[model_name] = np.mean(result_match[model_name])
+    return results
+
+
+@pytest.fixture
+def match_pbe_rate(build_results_pbe) -> dict[str, float]:
+    """
+    Get RMSD between PBE and MLIP relaxed structures (oxides).
+
+    Parameters
+    ----------
+    build_results_pbe
+        Tuple of results dictionaries.
+
+    Returns
+    -------
+    dict[str, float]
+        Rate of MLIP relaxing to same structure as DFT.
+    """
+    _, _, _, result_match = build_results_pbe
+
+    results = {}
+    for model_name in MODELS:
+        results[model_name] = np.mean(result_match[model_name])
+    return results
+
+
+@pytest.fixture
+@build_table(
+    filename=OUT_PATH / "split_vacancy_metrics_table.json",
+    metric_tooltips=DEFAULT_TOOLTIPS,
+    thresholds=DEFAULT_THRESHOLDS,
+)
+def metrics(
+    formation_energy_pbesol_mae: dict[str, float],
+    spearmans_coefficient_pbesol_mean: dict[str, float],
+    rmsd_pbesol_mean: dict[str, float],
+    formation_energy_pbe_mae: dict[str, float],
+    spearmans_coefficient_pbe_mean: dict[str, float],
+    rmsd_pbe_mean: dict[str, float],
+) -> dict[str, dict]:
+    """
+    Get all new benchmark metrics.
+
+    Parameters
+    ----------
+    formation_energy_pbesol_mae
+        Split vacancy formation energy MAE (oxides, PBEsol) for all models.
+    spearmans_coefficient_pbesol_mean
+        Mean Spearman's rank correlation (oxides, PBEsol) for all models.
+    rmsd_pbesol_mean
+        Mean RMSD between MLIP and PBEsol relaxed structure that match.
+    formation_energy_pbe_mae
+        Split vacancy formation energy MAE (nitrides, PBE(+U)) for all models.
+    spearmans_coefficient_pbe_mean
+        Mean Spearman's rank correlation (nitrides, PBE(+U)) for all models.
+    rmsd_pbe_mean
+        Mean RMSD between MLIP and PBE relaxed structure that match.
+
+    Returns
+    -------
+    dict[str, dict]
+        Metric names and values for all models.
+    """
+    return {
+        "MAE (PBEsol)": formation_energy_pbesol_mae,
+        "Spearman's (PBEsol)": spearmans_coefficient_pbesol_mean,
+        "RMSD (PBEsol)": rmsd_pbesol_mean,
+        # "Match Rate (PBEsol)": match_pbesol_rate, # not included since always 1
+        "MAE (PBE)": formation_energy_pbe_mae,
+        "Spearman's (PBE)": spearmans_coefficient_pbe_mean,
+        "RMSD (PBE)": rmsd_pbe_mean,
+        # "Match Rate (PBE)": match_pbe_rate, # not included since always 1
+    }
+
+
+def test_new_benchmark(metrics: dict[str, dict]) -> None:
+    """
+    Run new benchmark analysis.
+
+    Parameters
+    ----------
+    metrics
+        All new benchmark metric names and dictionary of values for each model.
+    """
+    return
diff --git a/ml_peg/analysis/bulk_crystal/analyse_split_vacancy/metrics.yml b/ml_peg/analysis/bulk_crystal/analyse_split_vacancy/metrics.yml
new file mode 100644
index 000000000..185e4df3c
--- /dev/null
+++ b/ml_peg/analysis/bulk_crystal/analyse_split_vacancy/metrics.yml
@@ -0,0 +1,49 @@
+metrics:
+  Spearman's (PBEsol):
+    good: 1.0
+    bad: 0.6
+    unit: null
+    tooltip: Measure of the accuracy of the energy ranking of PBEsol relaxed oxide structures by MLIPs.
+    level_of_theory: PBEsol
+  MAE (PBEsol):
+    good: 0.0
+    bad: 2.0
+    unit: eV
+    tooltip: Formation energy of split vacancy, considering lowest energy relaxed oxide structures.
+    level_of_theory: PBEsol
+  RMSD (PBEsol):
+    good: 0.0
+    bad: 0.2
+    unit: Å
+    tooltip: RMSD between MLIP and PBEsol relaxed oxide structures that match.
+    level_of_theory: PBEsol
+  # Match Rate (PBEsol): # always 1 atm
+    # good: 0.0
+    # bad: 1.0
+  #   unit: null
+  #   tooltip: Fraction of MLIP relaxations that converge to the same structure as PBEsol.
+  #   level_of_theory: PBEsol
+  Spearman's (PBE):
+    good: 1.0
+    bad: 0.6
+    unit: null
+    tooltip: Measure of the accuracy of the energy ranking of PBE relaxed nitride structures by MLIPs.
+    level_of_theory: PBE
+  MAE (PBE):
+    good: 0.0
+    bad: 2.0
+    unit: eV
+    tooltip: Formation energy of split vacancy, considering lowest energy relaxed nitride structures.
+    level_of_theory: PBE
+  RMSD (PBE):
+    good: 0.0
+    bad: 0.2
+    unit: Å
+    tooltip: RMSD between MLIP and PBE relaxed nitride structures that match.
+    level_of_theory: PBE
+  # Match Rate (PBE): # always 1 atm
+  #   good: 0.0
+  #   bad: 1.0
+  #   unit: null
+  #   tooltip: Fraction of MLIP relaxations that converge to the same structure as PBE.
+  #   level_of_theory: PBE
diff --git a/ml_peg/app/bulk_crystal/split_vacancy/app_split_vacancy.py b/ml_peg/app/bulk_crystal/split_vacancy/app_split_vacancy.py
new file mode 100644
index 000000000..59d542230
--- /dev/null
+++ b/ml_peg/app/bulk_crystal/split_vacancy/app_split_vacancy.py
@@ -0,0 +1,80 @@
+"""Run split vacancy benchmark app."""
+
+from __future__ import annotations
+
+from dash import Dash
+from dash.html import Div
+
+from ml_peg.app import APP_ROOT
+from ml_peg.app.base_app import BaseApp
+from ml_peg.app.utils.build_callbacks import plot_from_table_column
+from ml_peg.app.utils.load import read_plot
+from ml_peg.models.get_models import get_model_names
+from ml_peg.models.models import current_models
+
+# Get all models
+MODELS = get_model_names(current_models)
+BENCHMARK_NAME = "Split vacancy"
+# TODO: change DOCS_URL
+DOCS_URL = "https://ddmms.github.io/ml-peg/user_guide/benchmarks/bulk_crystal.html#lattice-constants"
+DATA_PATH = APP_ROOT / "data" / "bulk_crystal" / "split_vacancy"
+
+
+class SplitVacancyApp(BaseApp):
+    """Split vacancy benchmark app layout and callbacks."""
+
+    def register_callbacks(self) -> None:
+        """Register callbacks to app."""
+        scatter_pbesol = read_plot(
+            DATA_PATH / "figure_formation_energies_pbesol.json",
+            id=f"{BENCHMARK_NAME}-figure",
+        )
+        scatter_pbe = read_plot(
+            DATA_PATH / "figure_formation_energies_pbe.json",
+            id=f"{BENCHMARK_NAME}-figure",
+        )
+
+        plot_from_table_column(
+            table_id=self.table_id,
+            plot_id=f"{BENCHMARK_NAME}-figure-placeholder",
+            column_to_plot={
+                "MAE (PBEsol)": scatter_pbesol,
+                "Spearman's (PBEsol)": scatter_pbesol,
+                "RMSD (PBEsol)": scatter_pbesol,
+                "MAE (PBE)": scatter_pbe,
+                "Spearman's (PBE)": scatter_pbe,
+                "RMSD (PBE)": scatter_pbe,
+            },
+        )
+
+
+def get_app() -> SplitVacancyApp:
+    """
+    Get split vacancy benchmark app layout and callback registration.
+
+    Returns
+    -------
+    SplitVacancyApp
+        Benchmark layout and callback registration.
+    """
+    return SplitVacancyApp(
+        name=BENCHMARK_NAME,
+        description=(
+            "Performance predicting the formation energy of split "
+            "vacancies from fully ionised vacancies in nitrides and oxides."
+        ),
+        docs_url=DOCS_URL,
+        table_path=DATA_PATH / "split_vacancy_metrics_table.json",
+        extra_components=[
+            Div(id=f"{BENCHMARK_NAME}-figure-placeholder"),
+            Div(id=f"{BENCHMARK_NAME}-struct-placeholder"),
+        ],
+    )
+
+
+if __name__ == "__main__":
+    full_app = Dash(__name__, assets_folder=DATA_PATH.parent.parent)
+    split_vacancy_app = get_app()
+    full_app.layout = split_vacancy_app.layout
+    split_vacancy_app.register_callbacks()
+    full_app.run(port=8054, debug=True)
diff --git a/ml_peg/calcs/bulk_crystal/split_vacancy/calc_split_vacancies.py b/ml_peg/calcs/bulk_crystal/split_vacancy/calc_split_vacancies.py
new file mode 100644
index 000000000..11921c17c
--- /dev/null
+++ b/ml_peg/calcs/bulk_crystal/split_vacancy/calc_split_vacancies.py
@@ -0,0 +1,131 @@
+"""Run calculations for split vacancy benchmark."""
+
+from __future__ import annotations
+
+from copy import deepcopy
+from pathlib import Path
+from typing import Any
+
+# from pymatgen.analysis import StructureMatcher
+from ase.io import read, write
+from ase.optimize import LBFGS
+from pymatgen.analysis.structure_matcher import StructureMatcher
+from pymatgen.core import Structure
+import pytest
+from tqdm.auto import tqdm
+
+from ml_peg.calcs.utils.utils import download_github_data
+from ml_peg.models.get_models import load_models
+from ml_peg.models.models import current_models
+
+MODELS = load_models(current_models)
+github_uri = "https://github.com/ThomasWarford/defect_data/raw/refs/heads/main/"
+filename = "split_vacancy_data.zip"
+DATA_PATH = download_github_data(filename, github_uri)
+DATA_PATH = Path(DATA_PATH) / "split_vacancy_data"
+OUT_PATH = Path(__file__).parent / "outputs"
+
+# same setting as MatBench
+# https://github.com/janosh/matbench-discovery/blob/93cc6907ac08b4adaa8391ccc4adf9c015c0dd61/matbench_discovery/structure/symmetry.py#L124
+STRUCTURE_MATCHER = StructureMatcher(stol=1.0, scale=False)
+
+
+def get_rms_dist(atoms_1, atoms_2) -> tuple[float, float] | None:
+    """
+    Calculate the RMSD between two ASE Atoms objects.
+
+    Parameters
+    ----------
+    atoms_1, atoms_2
+        ASE Atoms objects.
+
+    Returns
+    -------
+    tuple[float, float] | None
+        (Root mean square displacement, max di) or None if no match.
+    """
+    return STRUCTURE_MATCHER.get_rms_dist(
+        Structure.from_ase_atoms(atoms_1), Structure.from_ase_atoms(atoms_2)
+    )
+
+
+@pytest.mark.parametrize("mlip", MODELS.items())
+def test_relax_and_calculate_energy(mlip: tuple[str, Any]):
+    """
+    Run calculations required for split vacancy formation energies.
+
+    Parameters
+    ----------
+    mlip
+        Name of model use and model to get calculator.
+    """
+    model_name, model = mlip
+    model.default_dtype = "float64"
+    calc = model.get_calculator()
+
+    fmax = 0.03
+    steps = 200
+
+    for functional in ["pbe", "pbesol"]:
+        for material_dir in tqdm(list((DATA_PATH / functional).iterdir())):
+            cation_dirs = [
+                p for p in material_dir.iterdir() if p.is_dir()
+            ]  # skip pristine supercell.xyz files (not used)
+            for cation_dir in tqdm(cation_dirs, leave=False):
+                nv_xyz_path = cation_dir / "normal_vacancy.xyz"
+                sv_xyz_path = cation_dir / "split_vacancy.xyz"
+
+                if not (nv_xyz_path.exists() and sv_xyz_path.exists()):
+                    continue
+
+                atoms_paths = [nv_xyz_path, sv_xyz_path]
+
+                for atoms_path in tqdm(atoms_paths, leave=False):
+                    relaxed_atoms = []
+                    atoms_list = read(atoms_path, ":")
+
+                    ref_atoms_out_path = (
+                        OUT_PATH
+                        / functional
+                        / "ref"
+                        / material_dir.stem
+                        / cation_dir.stem
+                        / f"{atoms_path.stem}.xyz.gz"
+                    )
+                    # Copy ref structures once; subsequent runs skip if already present.
+                    if not ref_atoms_out_path.exists():
+                        ref_atoms_out_path.parent.mkdir(exist_ok=True, parents=True)
+                        write(ref_atoms_out_path, atoms_list)
+
+                    for initial_atoms in tqdm(atoms_list, leave=False):
+                        atoms = deepcopy(initial_atoms)
+                        atoms.calc = deepcopy(calc)
+                        atoms.info["initial_energy"] = atoms.get_potential_energy()
+
+                        opt = LBFGS(atoms, logfile=None)
+                        opt.run(fmax=fmax, steps=steps)
+
+                        atoms.info["relaxed_energy"] = atoms.get_potential_energy()
+
+                        rmsd_max_dist = get_rms_dist(atoms, initial_atoms)
+                        if rmsd_max_dist is None:
+                            atoms.info["ref_structure_match"] = False
+                        else:
+                            rmsd, max_dist = rmsd_max_dist
+                            atoms.info["ref_structure_match"] = True
+                            atoms.info["ref_rmsd"] = rmsd
+                            atoms.info["ref_max_distance"] = max_dist
+
+                        relaxed_atoms.append(atoms)
+
+                    atoms_out_path = (
+                        OUT_PATH
+                        / functional
+                        / model_name
+                        / material_dir.stem
+                        / cation_dir.stem
+                        / f"{atoms_path.stem}.xyz.gz"
+                    )
+                    atoms_out_path.parent.mkdir(exist_ok=True, parents=True)
+
+                    write(atoms_out_path, relaxed_atoms)