From 27d149867f55cf9246a5cf1fd5271e1d070ca5a6 Mon Sep 17 00:00:00 2001 From: elena Date: Mon, 23 Feb 2026 19:33:12 +0000 Subject: [PATCH 1/9] Initial BDE test on DFT geometries --- .../user_guide/benchmarks/molecular.rst | 42 +++ .../analysis/molecular/BDEs/analyse_BDEs.py | 258 ++++++++++++++++++ ml_peg/analysis/molecular/BDEs/metrics.yml | 7 + ml_peg/app/molecular/BDEs/app_BDEs.py | 73 +++++ ml_peg/calcs/molecular/BDEs/calc_BDEs.py | 67 +++++ 5 files changed, 447 insertions(+) create mode 100644 ml_peg/analysis/molecular/BDEs/analyse_BDEs.py create mode 100644 ml_peg/analysis/molecular/BDEs/metrics.yml create mode 100644 ml_peg/app/molecular/BDEs/app_BDEs.py create mode 100644 ml_peg/calcs/molecular/BDEs/calc_BDEs.py diff --git a/docs/source/user_guide/benchmarks/molecular.rst b/docs/source/user_guide/benchmarks/molecular.rst index 7b1857d9e..a6d3d7bdd 100644 --- a/docs/source/user_guide/benchmarks/molecular.rst +++ b/docs/source/user_guide/benchmarks/molecular.rst @@ -174,3 +174,45 @@ Input structures: Binary and Ternary Mixtures of Bis(2-hydroxyethyl)ammonium Acetate with Methanol, N,N-Dimethylformamide, and Water at Several Temperatures. J. Chem. Eng. Data 62, 3958-3966 (2017). https://doi.org/10.1021/acs.jced.7b00654 + + + +BDEs +========= + +Summary +------- + +Performance in [for now] predicting energies of 60 CYP substrates and derived radicals. +Only [CHO] elements + + +Metrics +------- + +1. Energy RMSE + +Accuracy of energy prediction. Measured as binding energy per atom. + +2. Force RMSE + + +Computational cost +------------------ + +Low: tests are likely to take less than a minute to run on CPU. + + +Data availability +----------------- + +Input structures: + +* TODO Input structures were evaluated as part of "Gelzinyte et al., Journal of Chemical Theory and Computation + 2024 20 (1), 164-177, DOI: 10.1021/acs.jctc.3c00710" but I didn't include the xyzs there... Just the SMILES strings. + + +Reference data: + +* (TODO ??Same as input data) +* B3LYP-D3BJ/def2-SV(P) diff --git a/ml_peg/analysis/molecular/BDEs/analyse_BDEs.py b/ml_peg/analysis/molecular/BDEs/analyse_BDEs.py new file mode 100644 index 000000000..9006156b9 --- /dev/null +++ b/ml_peg/analysis/molecular/BDEs/analyse_BDEs.py @@ -0,0 +1,258 @@ +"""Analyse X23 benchmark.""" + +from __future__ import annotations + +from pathlib import Path + +from ase import units +from ase.io import read +import pytest + +from ml_peg.analysis.utils.decorators import build_table, plot_parity +from ml_peg.analysis.utils.utils import build_d3_name_map, load_metrics_config, mae +from ml_peg.app import APP_ROOT +from ml_peg.calcs import CALCS_ROOT +from ml_peg.models.get_models import get_model_names +from ml_peg.models.models import current_models + +MODELS = get_model_names(current_models) +D3_MODEL_NAMES = build_d3_name_map(MODELS) +CALC_PATH = CALCS_ROOT / "molecular" / "BDEs" / "outputs" +OUT_PATH = APP_ROOT / "data" / "molecular" / "BDEs" + +METRICS_CONFIG_PATH = Path(__file__).with_name("metrics.yml") +DEFAULT_THRESHOLDS, DEFAULT_TOOLTIPS, DEFAULT_WEIGHTS = load_metrics_config( + METRICS_CONFIG_PATH +) + +# Unit conversion +EV_TO_KJ_PER_MOL = units.mol / units.kJ + + +def into_dict_of_labels(atoms, key): + """ + Group atoms based on their atoms.info[key] labels. + + Parameters + ---------- + atoms : list[Atoms] + Atoms objects with info[key] labels to group. + key : str + Key in atoms.info to group by. + + Returns + ------- + dict[str, list[Atoms]] + Dictionary mapping each label value to a list of Atoms objects. + """ + dict_out = {} + for at in atoms: + label = at.info[key] + if label not in dict_out: + dict_out[label] = [] + dict_out[label].append(at) + return dict_out + + +def get_bde(mol_energy, rad_energy, isolated_h_energy): + """ + Compute bond dissociation energy. + + Parameters + ---------- + mol_energy : float + Energy of the intact molecule. + rad_energy : float + Energy of the molecule with the hydrogen atom removed. + isolated_h_energy : float + Energy of the isolated hydrogen atom. + + Returns + ------- + float + Bond dissociation energy. + """ + return rad_energy + isolated_h_energy - mol_energy + + +def get_hover_data_labels(key) -> list[str]: + """ + Get labels based on the key. + + Parameters + ---------- + key : str + Key in atoms.info to retrieve labels from. + + Returns + ------- + list[str] + List of system names from atoms.info entries. + """ + labels = [] + for model_name in MODELS: + model_dir = CALC_PATH / model_name + if model_dir.exists(): + xyz_files = sorted(model_dir.glob("*.xyz")) + if xyz_files: + for xyz_file in xyz_files: + atoms = read(xyz_file, ":") + labels += [at.info[key] for at in atoms] + break + return labels + + +@pytest.fixture +@plot_parity( + filename=OUT_PATH / "figure.CYP3A4.dft_opt_geometry.BDEs.json", + title="Bond Dissociation Energies on DFT Geometries", + x_label="Predicted BDE / kJ/mol", + y_label="Reference BDE / kJ/mol", + hoverdata={ + "Compound": get_hover_data_labels("compound"), + }, +) +def dft_geometry_bdes() -> dict[str, list]: + """ + Compute BDEs for all sp3 H atoms using DFT optimised geometries. + + Returns + ------- + dict[str, list] + Dictionary of reference and predicted BDEs. + """ + results = {"ref": []} | {mlip: [] for mlip in MODELS} + ref_stored = False + + for model_name in MODELS: + model_dir = CALC_PATH / model_name + + if not model_dir.exists(): + continue + + xyz_files = sorted(model_dir.glob("*xyz")) + if not xyz_files: + continue + + for xyz_file in xyz_files: + all_atoms = read(xyz_file, index=":") + pred_bdes = process_bdes(all_atoms, "pred_") + results[model_name] += pred_bdes + + if not ref_stored: + dft_bdes = process_bdes(all_atoms, "dft_") + results["ref"] += dft_bdes + ref_stored = True + + return results + + +def process_bdes(all_atoms, prefix): + """ + Compute BDEs for given atoms and property prefix. + + Parameters + ---------- + all_atoms + List of Atoms with appropriate labels for radicals, + molecules and isolated atoms. + prefix + "dft_" or "pred_" to fetch the appropriate energy. + + Returns + ------- + list[float] + List of computed bond dissociation energies. + """ + compound_dict = into_dict_of_labels(all_atoms, key="compound") + + isolated_atoms = compound_dict.pop("isolated_atom") + isolated_atoms_dict = into_dict_of_labels(isolated_atoms, "element") + isolated_h_energy = isolated_atoms_dict["H"][0].info[f"{prefix}energy"] + + all_bdes = [] + for compound_atoms in compound_dict.values(): + mol_rad_dict = into_dict_of_labels(compound_atoms, "mol_or_rad") + if "rad" not in mol_rad_dict: + continue + + mol = mol_rad_dict["mol"] + assert len(mol) == 1 + mol = mol[0] + mol_energy = mol.info[f"{prefix}energy"] + + for rad in mol_rad_dict["rad"]: + rad_energy = rad.info[f"{prefix}energy"] + bde = get_bde( + mol_energy=mol_energy, + rad_energy=rad_energy, + isolated_h_energy=isolated_h_energy, + ) + all_bdes.append(bde * EV_TO_KJ_PER_MOL) + + return all_bdes + + +@pytest.fixture +def dft_geometry_bde_errors(dft_geometry_bdes) -> dict[str, float]: + """ + Get mean absolute error for bond dissociation energies. + + Parameters + ---------- + dft_geometry_bdes + Dictionary of reference and predicted BDEs. + + Returns + ------- + dict[str, float] + Dictionary of predicted BDE errors for all models. + """ + results = {} + for model_name in MODELS: + if dft_geometry_bdes[model_name]: + results[model_name] = mae( + dft_geometry_bdes["ref"], + dft_geometry_bdes[model_name], + ) + else: + results[model_name] = None + return results + + +@pytest.fixture +@build_table( + filename=OUT_PATH / "metrics_table.CYP3A4.dft_opt_geometry.BDEs.json", + metric_tooltips=DEFAULT_TOOLTIPS, + thresholds=DEFAULT_THRESHOLDS, + mlip_name_map=D3_MODEL_NAMES, +) +def metrics(dft_geometry_bde_errors: dict[str, float]) -> dict[str, dict]: + """ + Get all total rad energy metrics. + + Parameters + ---------- + dft_geometry_bde_errors + Mean absolute errors for all systems. + + Returns + ------- + dict[str, dict] + Metric names and values for all models. + """ + return { + "MAE": dft_geometry_bde_errors, + } + + +def test_bdes(metrics: dict[str, dict]) -> None: + """ + Run BDEs test. + + Parameters + ---------- + metrics + All BDEs metrics. + """ + return diff --git a/ml_peg/analysis/molecular/BDEs/metrics.yml b/ml_peg/analysis/molecular/BDEs/metrics.yml new file mode 100644 index 000000000..0c147b1b1 --- /dev/null +++ b/ml_peg/analysis/molecular/BDEs/metrics.yml @@ -0,0 +1,7 @@ +metrics: + MAE: + good: 0.0 + bad: 100.0 + unit: kJ/mol + tooltip: "Mean Absolute Error for all BDEs" + level of theory: B3LYP-D3BJ/def2-SV(P) diff --git a/ml_peg/app/molecular/BDEs/app_BDEs.py b/ml_peg/app/molecular/BDEs/app_BDEs.py new file mode 100644 index 000000000..3db90d030 --- /dev/null +++ b/ml_peg/app/molecular/BDEs/app_BDEs.py @@ -0,0 +1,73 @@ +"""Run BDEs app.""" + +from __future__ import annotations + +from dash.html import Div + +from ml_peg.app import APP_ROOT +from ml_peg.app.base_app import BaseApp +from ml_peg.app.utils.build_callbacks import ( + plot_from_table_column, + struct_from_scatter, +) +from ml_peg.app.utils.load import read_plot +from ml_peg.models.get_models import get_model_names +from ml_peg.models.models import current_models + +# Get all models +MODELS = get_model_names(current_models) +BENCHMARK_NAME = "Bond Dissociation Energies" +DOCS_URL = "https://ddmms.github.io/ml-peg/user_guide/benchmarks/molecular.html#bdes" +DATA_PATH = APP_ROOT / "data" / "molecular" / "BDEs" + + +class BDEsApp(BaseApp): + """BDEs benchmark app layout and callbacks.""" + + def register_callbacks(self) -> None: + """Register callbacks to app.""" + scatter = read_plot( + DATA_PATH / "figure.CYP3A4.dft_opt_geometry.BDEs.json", + id=f"{BENCHMARK_NAME}-figure", + ) + + # Assets dir will be parent directory - individual files for each system + structs_dir = DATA_PATH / MODELS[0] + structs = [ + f"assets/molecular/BDEs/{MODELS[0]}/{struct_file.stem}.xyz" + for struct_file in sorted(structs_dir.glob("*.xyz")) + ] + + plot_from_table_column( + table_id=self.table_id, + plot_id=f"{BENCHMARK_NAME}-figure-placeholder", + column_to_plot={"MAE": scatter}, + ) + + struct_from_scatter( + scatter_id=f"{BENCHMARK_NAME}-figure", + struct_id=f"{BENCHMARK_NAME}-struct-placeholder", + structs=structs, + mode="struct", + ) + + +def get_app() -> BDEsApp: + """ + Get bond dissociation energy benchmark app. + + Returns + ------- + BDEsApp + Benchmark layout and callback registration. + """ + return BDEsApp( + name=BENCHMARK_NAME, + description="Bond Dissociation Energies", + docs_url=DOCS_URL, + table_path=DATA_PATH / "metrics_table.CYP3A4.dft_opt_geometry.BDEs.json", + extra_components=[ + Div(id=f"{BENCHMARK_NAME}-figure-placeholder"), + Div(id=f"{BENCHMARK_NAME}-struct-placeholder"), + ], + ) diff --git a/ml_peg/calcs/molecular/BDEs/calc_BDEs.py b/ml_peg/calcs/molecular/BDEs/calc_BDEs.py new file mode 100644 index 000000000..40bc93090 --- /dev/null +++ b/ml_peg/calcs/molecular/BDEs/calc_BDEs.py @@ -0,0 +1,67 @@ +""" +Calculate bond dissociation energies of drug-like molecules. + +Journal of Chemical Theory and Computation 2024 20 (1), 164-177 +DOI: 10.1021/acs.jctc.3c00710 +""" + +from __future__ import annotations + +from copy import copy +from pathlib import Path +from typing import Any + +from ase import units +from ase.io import read, write +import pytest + +from ml_peg.calcs.utils.utils import download_s3_data +from ml_peg.models.get_models import load_models +from ml_peg.models.models import current_models + +MODELS = load_models(current_models) + +DATA_PATH = Path(__file__).parent / "data" +OUT_PATH = Path(__file__).parent / "outputs" + +# Unit conversion +EV_TO_KJ_PER_MOL = units.mol / units.kJ + + +@pytest.mark.parametrize("mlip", MODELS.items()) +def test_bond_dissociation_energy(mlip: tuple[str, Any]) -> None: + """ + Calculate C-H bond dissociation energy of drug-like molecules. + + Parameters + ---------- + mlip + Name of model use and model to get calculator. + """ + model_name, model = mlip + calc = model.get_calculator() + + # Add D3 calculator for this test (for models where applicable) + calc = model.add_d3_calculator(calc) + + bde_dir = ( + download_s3_data( + key="inputs/molecular/BDEs/BDEs.zip", + filename="BDEs.zip", + ) + / "BDEs" + ) + + structures_filename = "cytochrome_p450_substrates.dft_opt.xyz" + mols_rads = read(Path(bde_dir) / structures_filename, ":") + + ats_out = [] + for at in mols_rads: + at.calc = copy(calc) + at.info["pred_energy"] = at.get_potential_energy() + at.arrays["pred_forces"] = at.get_forces() + ats_out.append(at) + + write_dir = OUT_PATH / model_name + write_dir.mkdir(parents=True, exist_ok=True) + write(write_dir / structures_filename, ats_out) From 866eeb10924fd0c371acadd5f8b798c3a63536a2 Mon Sep 17 00:00:00 2001 From: elena Date: Thu, 26 Feb 2026 18:17:20 +0000 Subject: [PATCH 2/9] correct units to kcal/mol --- ml_peg/analysis/molecular/BDEs/analyse_BDEs.py | 8 ++++---- ml_peg/analysis/molecular/BDEs/metrics.yml | 2 +- ml_peg/calcs/molecular/BDEs/calc_BDEs.py | 4 ---- 3 files changed, 5 insertions(+), 9 deletions(-) diff --git a/ml_peg/analysis/molecular/BDEs/analyse_BDEs.py b/ml_peg/analysis/molecular/BDEs/analyse_BDEs.py index 9006156b9..cb5330c55 100644 --- a/ml_peg/analysis/molecular/BDEs/analyse_BDEs.py +++ b/ml_peg/analysis/molecular/BDEs/analyse_BDEs.py @@ -26,7 +26,7 @@ ) # Unit conversion -EV_TO_KJ_PER_MOL = units.mol / units.kJ +EV_TO_KCAL_PER_MOL = units.mol / units.kcal def into_dict_of_labels(atoms, key): @@ -106,8 +106,8 @@ def get_hover_data_labels(key) -> list[str]: @plot_parity( filename=OUT_PATH / "figure.CYP3A4.dft_opt_geometry.BDEs.json", title="Bond Dissociation Energies on DFT Geometries", - x_label="Predicted BDE / kJ/mol", - y_label="Reference BDE / kJ/mol", + x_label="Predicted BDE / kcal/mol", + y_label="Reference BDE / kcal/mol", hoverdata={ "Compound": get_hover_data_labels("compound"), }, @@ -188,7 +188,7 @@ def process_bdes(all_atoms, prefix): rad_energy=rad_energy, isolated_h_energy=isolated_h_energy, ) - all_bdes.append(bde * EV_TO_KJ_PER_MOL) + all_bdes.append(bde * EV_TO_KCAL_PER_MOL) return all_bdes diff --git a/ml_peg/analysis/molecular/BDEs/metrics.yml b/ml_peg/analysis/molecular/BDEs/metrics.yml index 0c147b1b1..1f151995d 100644 --- a/ml_peg/analysis/molecular/BDEs/metrics.yml +++ b/ml_peg/analysis/molecular/BDEs/metrics.yml @@ -2,6 +2,6 @@ metrics: MAE: good: 0.0 bad: 100.0 - unit: kJ/mol + unit: kcal/mol tooltip: "Mean Absolute Error for all BDEs" level of theory: B3LYP-D3BJ/def2-SV(P) diff --git a/ml_peg/calcs/molecular/BDEs/calc_BDEs.py b/ml_peg/calcs/molecular/BDEs/calc_BDEs.py index 40bc93090..828dfcdb3 100644 --- a/ml_peg/calcs/molecular/BDEs/calc_BDEs.py +++ b/ml_peg/calcs/molecular/BDEs/calc_BDEs.py @@ -11,7 +11,6 @@ from pathlib import Path from typing import Any -from ase import units from ase.io import read, write import pytest @@ -24,9 +23,6 @@ DATA_PATH = Path(__file__).parent / "data" OUT_PATH = Path(__file__).parent / "outputs" -# Unit conversion -EV_TO_KJ_PER_MOL = units.mol / units.kJ - @pytest.mark.parametrize("mlip", MODELS.items()) def test_bond_dissociation_energy(mlip: tuple[str, Any]) -> None: From e56aaeb201697169165522583634dd6c5183288a Mon Sep 17 00:00:00 2001 From: elena Date: Tue, 3 Mar 2026 21:52:07 +0000 Subject: [PATCH 3/9] initial bde rank plots --- .../analysis/molecular/BDEs/analyse_BDEs.py | 186 ++++++++++++++++-- ml_peg/analysis/molecular/BDEs/metrics.yml | 18 +- ml_peg/app/molecular/BDEs/app_BDEs.py | 8 +- 3 files changed, 189 insertions(+), 23 deletions(-) diff --git a/ml_peg/analysis/molecular/BDEs/analyse_BDEs.py b/ml_peg/analysis/molecular/BDEs/analyse_BDEs.py index cb5330c55..662bfc177 100644 --- a/ml_peg/analysis/molecular/BDEs/analyse_BDEs.py +++ b/ml_peg/analysis/molecular/BDEs/analyse_BDEs.py @@ -1,4 +1,4 @@ -"""Analyse X23 benchmark.""" +"""Analyse bond dissociation energy benchmark.""" from __future__ import annotations @@ -6,7 +6,9 @@ from ase import units from ase.io import read +import numpy as np import pytest +from scipy import stats from ml_peg.analysis.utils.decorators import build_table, plot_parity from ml_peg.analysis.utils.utils import build_d3_name_map, load_metrics_config, mae @@ -28,6 +30,10 @@ # Unit conversion EV_TO_KCAL_PER_MOL = units.mol / units.kcal +# Populated by dft_geometry_bdes; shared with downstream fixtures without +# appearing in the returned dict (which would add it as a scatter trace). +_COMPOUND_LABELS: list[str] = [] + def into_dict_of_labels(atoms, key): """ @@ -114,14 +120,17 @@ def get_hover_data_labels(key) -> list[str]: ) def dft_geometry_bdes() -> dict[str, list]: """ - Compute BDEs for all sp3 H atoms using DFT optimised geometries. + Compute BDEs for all sp3 H atoms on DFT optimised geometries. Returns ------- dict[str, list] - Dictionary of reference and predicted BDEs. + Dictionary of reference and predicted BDEs and labels of the + compound for the corresponding BDE. """ + global _COMPOUND_LABELS results = {"ref": []} | {mlip: [] for mlip in MODELS} + _COMPOUND_LABELS = [] ref_stored = False for model_name in MODELS: @@ -136,18 +145,88 @@ def dft_geometry_bdes() -> dict[str, list]: for xyz_file in xyz_files: all_atoms = read(xyz_file, index=":") - pred_bdes = process_bdes(all_atoms, "pred_") - results[model_name] += pred_bdes + pred_bdes_by_label = process_bdes_per_compound(all_atoms, "pred_") + for compound_label, pred_bdes in pred_bdes_by_label.items(): + if not ref_stored: + _COMPOUND_LABELS += [compound_label] * len(pred_bdes) + results[model_name] += pred_bdes if not ref_stored: - dft_bdes = process_bdes(all_atoms, "dft_") - results["ref"] += dft_bdes + dft_bdes_by_label = process_bdes_per_compound(all_atoms, "dft_") + for dft_bdes in dft_bdes_by_label.values(): + results["ref"] += dft_bdes ref_stored = True return results -def process_bdes(all_atoms, prefix): +@pytest.fixture +@plot_parity( + filename=OUT_PATH / "figure.CYP3A4.dft_opt_geometry.BDE_ranks.json", + title="Bond Dissociation Energy ranks on DFT Geometries", + x_label="Predicted BDE rank", + y_label="Reference BDE rank", + hoverdata={ + "Compound": get_hover_data_labels("compound"), + }, +) +def dft_geometry_bde_ranks(dft_geometry_bdes) -> dict[str, list]: + """ + Compute BDE ranks for all sp3 H atoms using DFT optimised geometries. + + Parameters + ---------- + dft_geometry_bdes : dict[str, list] + Dictionary of reference and predicted BDEs and labels of the + compound for the corresponding BDE. + + Returns + ------- + dict[str, list] + Dictionary of reference and predicted BDE ranks. + """ + results = {"ref": []} | {mlip: [] for mlip in MODELS} + for model_name in results.keys(): + bdes_by_compound = group_values_by_labels( + values=dft_geometry_bdes[model_name], + labels=_COMPOUND_LABELS, + ) + + for compound_bdes in bdes_by_compound.values(): + ranks = np.argsort(np.argsort(compound_bdes)) + results[model_name] += list(ranks) + + return results + + +def group_values_by_labels( + values: list[float], labels: list[str] +) -> dict[str, list[float]]: + """ + Group values by their corresponding string labels. + + Parameters + ---------- + values : list[float] + Values to group. + labels : list[str] + Labels corresponding to each value. + + Returns + ------- + dict[str, list[float]] + Dictionary mapping each label to a list of its values. + """ + results = {} + for label, value in zip(labels, values, strict=False): + if label not in results: + results[label] = [] + results[label].append(value) + + return results + + +def process_bdes_per_compound(all_atoms, prefix): """ Compute BDEs for given atoms and property prefix. @@ -161,8 +240,8 @@ def process_bdes(all_atoms, prefix): Returns ------- - list[float] - List of computed bond dissociation energies. + dict[str: list[float]] + List of computed bond dissociation energies, for each compound. """ compound_dict = into_dict_of_labels(all_atoms, key="compound") @@ -170,12 +249,14 @@ def process_bdes(all_atoms, prefix): isolated_atoms_dict = into_dict_of_labels(isolated_atoms, "element") isolated_h_energy = isolated_atoms_dict["H"][0].info[f"{prefix}energy"] - all_bdes = [] - for compound_atoms in compound_dict.values(): + all_bdes = {} + for compound_name, compound_atoms in compound_dict.items(): mol_rad_dict = into_dict_of_labels(compound_atoms, "mol_or_rad") if "rad" not in mol_rad_dict: continue + all_bdes[compound_name] = [] + mol = mol_rad_dict["mol"] assert len(mol) == 1 mol = mol[0] @@ -188,7 +269,7 @@ def process_bdes(all_atoms, prefix): rad_energy=rad_energy, isolated_h_energy=isolated_h_energy, ) - all_bdes.append(bde * EV_TO_KCAL_PER_MOL) + all_bdes[compound_name].append(bde * EV_TO_KCAL_PER_MOL) return all_bdes @@ -220,6 +301,74 @@ def dft_geometry_bde_errors(dft_geometry_bdes) -> dict[str, float]: return results +@pytest.fixture +def dft_geometry_bde_rank_correlations(dft_geometry_bde_ranks) -> dict[str, float]: + """ + Compute mean Kendall's tau rank correlation across all molecules. + + For each molecule, sp3 C-H bond strengths are ranked from lowest to + highest by both DFT and MLIP, and the correlation is measured by + Kendall's tau. The final metric is the average across all molecules. + + Parameters + ---------- + dft_geometry_bde_ranks : dict[str, list] + Dictionary of reference and predicted BDE ranks. + + Returns + ------- + dict[str, float] + Dictionary of predicted BDE rank correlation for all models. + """ + ref_ranks_by_compound = group_values_by_labels( + values=dft_geometry_bde_ranks["ref"], + labels=_COMPOUND_LABELS, + ) + + results = {} + for model_name in MODELS: + if dft_geometry_bde_ranks[model_name]: + pred_ranks_by_compound = group_values_by_labels( + values=dft_geometry_bde_ranks[model_name], + labels=_COMPOUND_LABELS, + ) + + # equivalent to mae, rmse + results[model_name] = mean_kendalls_tau( + ref_by_label=ref_ranks_by_compound, + pred_by_label=pred_ranks_by_compound, + ) + + else: + results[model_name] = None + return results + + +def mean_kendalls_tau(ref_by_label, pred_by_label): + """ + Compute mean Kendall's tau across labels. + + Parameters + ---------- + ref_by_label : dict[str, list] + Reference ranks grouped by compound label. + pred_by_label : dict[str, list] + Predicted ranks grouped by compound label. + + Returns + ------- + float + Mean Kendall's tau correlation across all labels. + """ + kendalls_taus = [] + for label in ref_by_label.keys(): + ref = ref_by_label[label] + pred = pred_by_label[label] + correlation = stats.kendalltau(ref, pred).correlation + kendalls_taus.append(correlation) + return np.mean(kendalls_taus) + + @pytest.fixture @build_table( filename=OUT_PATH / "metrics_table.CYP3A4.dft_opt_geometry.BDEs.json", @@ -227,7 +376,10 @@ def dft_geometry_bde_errors(dft_geometry_bdes) -> dict[str, float]: thresholds=DEFAULT_THRESHOLDS, mlip_name_map=D3_MODEL_NAMES, ) -def metrics(dft_geometry_bde_errors: dict[str, float]) -> dict[str, dict]: +def metrics( + dft_geometry_bde_errors: dict[str, float], + dft_geometry_bde_rank_correlations: dict[str, float], +) -> dict[str, dict]: """ Get all total rad energy metrics. @@ -235,6 +387,9 @@ def metrics(dft_geometry_bde_errors: dict[str, float]) -> dict[str, dict]: ---------- dft_geometry_bde_errors Mean absolute errors for all systems. + dft_geometry_bde_rank_correlations + Mean Kendal's tau across predicted and reference + BDE ranks within a given molecule. Returns ------- @@ -242,7 +397,8 @@ def metrics(dft_geometry_bde_errors: dict[str, float]) -> dict[str, dict]: Metric names and values for all models. """ return { - "MAE": dft_geometry_bde_errors, + "Direct BDE": dft_geometry_bde_errors, + "BDE rank": dft_geometry_bde_rank_correlations, } diff --git a/ml_peg/analysis/molecular/BDEs/metrics.yml b/ml_peg/analysis/molecular/BDEs/metrics.yml index 1f151995d..3b6469219 100644 --- a/ml_peg/analysis/molecular/BDEs/metrics.yml +++ b/ml_peg/analysis/molecular/BDEs/metrics.yml @@ -1,7 +1,13 @@ metrics: - MAE: - good: 0.0 - bad: 100.0 - unit: kcal/mol - tooltip: "Mean Absolute Error for all BDEs" - level of theory: B3LYP-D3BJ/def2-SV(P) + Direct BDE: + good: 0.0 + bad: 50.0 + unit: kcal/mol + tooltip: "Mean Absolute Error on DFT-relaxed structures" + level of theory: B3LYP-D3BJ/def2-SV(P) + BDE rank: + good: 1.0 + bad: 0.0 + unit: null + tooltip: "Mean Kendal rank correlation coefficient" + level of theory: B3LYP-D3BJ/def2-SV(P) diff --git a/ml_peg/app/molecular/BDEs/app_BDEs.py b/ml_peg/app/molecular/BDEs/app_BDEs.py index 3db90d030..b97fcbfc4 100644 --- a/ml_peg/app/molecular/BDEs/app_BDEs.py +++ b/ml_peg/app/molecular/BDEs/app_BDEs.py @@ -26,10 +26,14 @@ class BDEsApp(BaseApp): def register_callbacks(self) -> None: """Register callbacks to app.""" - scatter = read_plot( + scatter_bdes = read_plot( DATA_PATH / "figure.CYP3A4.dft_opt_geometry.BDEs.json", id=f"{BENCHMARK_NAME}-figure", ) + scatter_ranks = read_plot( + DATA_PATH / "figure.CYP3A4.dft_opt_geometry.BDE_ranks.json", + id=f"{BENCHMARK_NAME}-ranks-figure", + ) # Assets dir will be parent directory - individual files for each system structs_dir = DATA_PATH / MODELS[0] @@ -41,7 +45,7 @@ def register_callbacks(self) -> None: plot_from_table_column( table_id=self.table_id, plot_id=f"{BENCHMARK_NAME}-figure-placeholder", - column_to_plot={"MAE": scatter}, + column_to_plot={"Direct BDE": scatter_bdes, "BDE rank": scatter_ranks}, ) struct_from_scatter( From e34e40b72e5ad148a1f22e70b93458078f4ad340 Mon Sep 17 00:00:00 2001 From: elena Date: Tue, 3 Mar 2026 22:43:48 +0000 Subject: [PATCH 4/9] Tests with geometry optimisation --- .../analysis/molecular/BDEs/analyse_BDEs.py | 451 ++++++++++++------ ml_peg/analysis/molecular/BDEs/metrics.yml | 16 +- ml_peg/calcs/molecular/BDEs/calc_BDEs.py | 73 ++- .../molecular/BDEs/calc_BDEs_mlff_opt.py | 55 +++ 4 files changed, 439 insertions(+), 156 deletions(-) create mode 100644 ml_peg/calcs/molecular/BDEs/calc_BDEs_mlff_opt.py diff --git a/ml_peg/analysis/molecular/BDEs/analyse_BDEs.py b/ml_peg/analysis/molecular/BDEs/analyse_BDEs.py index 662bfc177..e3ae8d65e 100644 --- a/ml_peg/analysis/molecular/BDEs/analyse_BDEs.py +++ b/ml_peg/analysis/molecular/BDEs/analyse_BDEs.py @@ -30,9 +30,10 @@ # Unit conversion EV_TO_KCAL_PER_MOL = units.mol / units.kcal -# Populated by dft_geometry_bdes; shared with downstream fixtures without -# appearing in the returned dict (which would add it as a scatter trace). +# Populated by dft_geometry_bdes / mlff_geometry_bdes; shared with downstream +# fixtures without appearing in the returned dict (which would add a scatter trace). _COMPOUND_LABELS: list[str] = [] +_MLFF_COMPOUND_LABELS: list[str] = [] def into_dict_of_labels(atoms, key): @@ -108,29 +109,130 @@ def get_hover_data_labels(key) -> list[str]: return labels -@pytest.fixture -@plot_parity( - filename=OUT_PATH / "figure.CYP3A4.dft_opt_geometry.BDEs.json", - title="Bond Dissociation Energies on DFT Geometries", - x_label="Predicted BDE / kcal/mol", - y_label="Reference BDE / kcal/mol", - hoverdata={ - "Compound": get_hover_data_labels("compound"), - }, -) -def dft_geometry_bdes() -> dict[str, list]: +def group_values_by_labels( + values: list[float], labels: list[str] +) -> dict[str, list[float]]: """ - Compute BDEs for all sp3 H atoms on DFT optimised geometries. + Group values by their corresponding string labels. + + Parameters + ---------- + values : list[float] + Values to group. + labels : list[str] + Labels corresponding to each value. + + Returns + ------- + dict[str, list[float]] + Dictionary mapping each label to a list of its values. + """ + results = {} + for label, value in zip(labels, values, strict=False): + if label not in results: + results[label] = [] + results[label].append(value) + + return results + + +def process_bdes_per_compound(all_atoms, prefix): + """ + Compute BDEs for given atoms and property prefix. + + Parameters + ---------- + all_atoms + List of Atoms with appropriate labels for radicals, + molecules and isolated atoms. + prefix + "dft_" or "pred_" to fetch the appropriate energy. + + Returns + ------- + dict[str: list[float]] + List of computed bond dissociation energies, for each compound. + """ + compound_dict = into_dict_of_labels(all_atoms, key="compound") + + isolated_atoms = compound_dict.pop("isolated_atom") + isolated_atoms_dict = into_dict_of_labels(isolated_atoms, "element") + isolated_h_energy = isolated_atoms_dict["H"][0].info[f"{prefix}energy"] + + all_bdes = {} + for compound_name, compound_atoms in compound_dict.items(): + mol_rad_dict = into_dict_of_labels(compound_atoms, "mol_or_rad") + if "rad" not in mol_rad_dict: + continue + + all_bdes[compound_name] = [] + + mol = mol_rad_dict["mol"] + assert len(mol) == 1 + mol = mol[0] + mol_energy = mol.info[f"{prefix}energy"] + + for rad in mol_rad_dict["rad"]: + rad_energy = rad.info[f"{prefix}energy"] + bde = get_bde( + mol_energy=mol_energy, + rad_energy=rad_energy, + isolated_h_energy=isolated_h_energy, + ) + all_bdes[compound_name].append(bde * EV_TO_KCAL_PER_MOL) + + return all_bdes + + +def mean_kendalls_tau(ref_by_label, pred_by_label): + """ + Compute mean Kendall's tau across labels. + + Parameters + ---------- + ref_by_label : dict[str, list] + Reference ranks grouped by compound label. + pred_by_label : dict[str, list] + Predicted ranks grouped by compound label. + + Returns + ------- + float + Mean Kendall's tau correlation across all labels. + """ + kendalls_taus = [] + for label in ref_by_label.keys(): + ref = ref_by_label[label] + pred = pred_by_label[label] + correlation = stats.kendalltau(ref, pred).correlation + kendalls_taus.append(correlation) + return np.mean(kendalls_taus) + + +# --------------------------------------------------------------------------- +# Shared helpers +# --------------------------------------------------------------------------- + + +def _load_bdes(xyz_suffix: str, labels_out: list) -> dict[str, list]: + """ + Load BDEs from output xyz files for all models. + + Parameters + ---------- + xyz_suffix + Suffix of the xyz filename, e.g. "dft_opt" or "mlff_opt". + labels_out + List to populate in-place with compound labels for each BDE entry. Returns ------- dict[str, list] - Dictionary of reference and predicted BDEs and labels of the - compound for the corresponding BDE. + Dictionary of reference and predicted BDEs keyed by model name and "ref". """ - global _COMPOUND_LABELS + xyz_filename = f"cytochrome_p450_substrates.{xyz_suffix}.xyz" results = {"ref": []} | {mlip: [] for mlip in MODELS} - _COMPOUND_LABELS = [] + labels_out.clear() ref_stored = False for model_name in MODELS: @@ -139,7 +241,7 @@ def dft_geometry_bdes() -> dict[str, list]: if not model_dir.exists(): continue - xyz_files = sorted(model_dir.glob("*xyz")) + xyz_files = sorted(model_dir.glob(xyz_filename)) if not xyz_files: continue @@ -148,7 +250,7 @@ def dft_geometry_bdes() -> dict[str, list]: pred_bdes_by_label = process_bdes_per_compound(all_atoms, "pred_") for compound_label, pred_bdes in pred_bdes_by_label.items(): if not ref_stored: - _COMPOUND_LABELS += [compound_label] * len(pred_bdes) + labels_out += [compound_label] * len(pred_bdes) results[model_name] += pred_bdes if not ref_stored: @@ -160,25 +262,16 @@ def dft_geometry_bdes() -> dict[str, list]: return results -@pytest.fixture -@plot_parity( - filename=OUT_PATH / "figure.CYP3A4.dft_opt_geometry.BDE_ranks.json", - title="Bond Dissociation Energy ranks on DFT Geometries", - x_label="Predicted BDE rank", - y_label="Reference BDE rank", - hoverdata={ - "Compound": get_hover_data_labels("compound"), - }, -) -def dft_geometry_bde_ranks(dft_geometry_bdes) -> dict[str, list]: +def _compute_ranks(bdes: dict, labels: list) -> dict[str, list]: """ - Compute BDE ranks for all sp3 H atoms using DFT optimised geometries. + Compute BDE ranks per compound for all models and reference. Parameters ---------- - dft_geometry_bdes : dict[str, list] - Dictionary of reference and predicted BDEs and labels of the - compound for the corresponding BDE. + bdes + Dictionary of reference and predicted BDEs keyed by model name and "ref". + labels + Compound label for each BDE entry. Returns ------- @@ -188,90 +281,124 @@ def dft_geometry_bde_ranks(dft_geometry_bdes) -> dict[str, list]: results = {"ref": []} | {mlip: [] for mlip in MODELS} for model_name in results.keys(): bdes_by_compound = group_values_by_labels( - values=dft_geometry_bdes[model_name], - labels=_COMPOUND_LABELS, + values=bdes[model_name], + labels=labels, ) - for compound_bdes in bdes_by_compound.values(): ranks = np.argsort(np.argsort(compound_bdes)) results[model_name] += list(ranks) - return results -def group_values_by_labels( - values: list[float], labels: list[str] -) -> dict[str, list[float]]: +def _compute_bde_errors(bdes: dict) -> dict[str, float]: """ - Group values by their corresponding string labels. + Compute mean absolute error of predicted BDEs against reference. Parameters ---------- - values : list[float] - Values to group. - labels : list[str] - Labels corresponding to each value. + bdes + Dictionary of reference and predicted BDEs keyed by model name and "ref". Returns ------- - dict[str, list[float]] - Dictionary mapping each label to a list of its values. + dict[str, float] + MAE for each model. """ - results = {} - for label, value in zip(labels, values, strict=False): - if label not in results: - results[label] = [] - results[label].append(value) - - return results + return { + model_name: mae(bdes["ref"], bdes[model_name]) if bdes[model_name] else None + for model_name in MODELS + } -def process_bdes_per_compound(all_atoms, prefix): +def _compute_rank_correlations(ranks: dict, labels: list) -> dict[str, float]: """ - Compute BDEs for given atoms and property prefix. + Compute mean Kendall's tau rank correlation against reference ranks. Parameters ---------- - all_atoms - List of Atoms with appropriate labels for radicals, - molecules and isolated atoms. - prefix - "dft_" or "pred_" to fetch the appropriate energy. + ranks + Dictionary of reference and predicted BDE ranks keyed by model name and "ref". + labels + Compound label for each rank entry. Returns ------- - dict[str: list[float]] - List of computed bond dissociation energies, for each compound. + dict[str, float] + Mean Kendall's tau for each model. """ - compound_dict = into_dict_of_labels(all_atoms, key="compound") + ref_ranks_by_compound = group_values_by_labels( + values=ranks["ref"], + labels=labels, + ) + results = {} + for model_name in MODELS: + if ranks[model_name]: + pred_ranks_by_compound = group_values_by_labels( + values=ranks[model_name], + labels=labels, + ) + results[model_name] = mean_kendalls_tau( + ref_by_label=ref_ranks_by_compound, + pred_by_label=pred_ranks_by_compound, + ) + else: + results[model_name] = None + return results - isolated_atoms = compound_dict.pop("isolated_atom") - isolated_atoms_dict = into_dict_of_labels(isolated_atoms, "element") - isolated_h_energy = isolated_atoms_dict["H"][0].info[f"{prefix}energy"] - all_bdes = {} - for compound_name, compound_atoms in compound_dict.items(): - mol_rad_dict = into_dict_of_labels(compound_atoms, "mol_or_rad") - if "rad" not in mol_rad_dict: - continue +# --------------------------------------------------------------------------- +# DFT-geometry fixtures +# --------------------------------------------------------------------------- - all_bdes[compound_name] = [] - mol = mol_rad_dict["mol"] - assert len(mol) == 1 - mol = mol[0] - mol_energy = mol.info[f"{prefix}energy"] +@pytest.fixture +@plot_parity( + filename=OUT_PATH / "figure.CYP3A4.dft_opt_geometry.BDEs.json", + title="Bond Dissociation Energies on DFT Geometries", + x_label="Predicted BDE / kcal/mol", + y_label="Reference BDE / kcal/mol", + hoverdata={ + "Compound": get_hover_data_labels("compound"), + }, +) +def dft_geometry_bdes() -> dict[str, list]: + """ + Compute BDEs for all sp3 H atoms on DFT optimised geometries. - for rad in mol_rad_dict["rad"]: - rad_energy = rad.info[f"{prefix}energy"] - bde = get_bde( - mol_energy=mol_energy, - rad_energy=rad_energy, - isolated_h_energy=isolated_h_energy, - ) - all_bdes[compound_name].append(bde * EV_TO_KCAL_PER_MOL) + Returns + ------- + dict[str, list] + Dictionary of reference and predicted BDEs and labels of the + compound for the corresponding BDE. + """ + return _load_bdes("dft_opt", _COMPOUND_LABELS) - return all_bdes + +@pytest.fixture +@plot_parity( + filename=OUT_PATH / "figure.CYP3A4.dft_opt_geometry.BDE_ranks.json", + title="Bond Dissociation Energy ranks on DFT Geometries", + x_label="Predicted BDE rank", + y_label="Reference BDE rank", + hoverdata={ + "Compound": get_hover_data_labels("compound"), + }, +) +def dft_geometry_bde_ranks(dft_geometry_bdes) -> dict[str, list]: + """ + Compute BDE ranks for all sp3 H atoms using DFT optimised geometries. + + Parameters + ---------- + dft_geometry_bdes : dict[str, list] + Dictionary of reference and predicted BDEs. + + Returns + ------- + dict[str, list] + Dictionary of reference and predicted BDE ranks. + """ + return _compute_ranks(dft_geometry_bdes, _COMPOUND_LABELS) @pytest.fixture @@ -289,16 +416,7 @@ def dft_geometry_bde_errors(dft_geometry_bdes) -> dict[str, float]: dict[str, float] Dictionary of predicted BDE errors for all models. """ - results = {} - for model_name in MODELS: - if dft_geometry_bdes[model_name]: - results[model_name] = mae( - dft_geometry_bdes["ref"], - dft_geometry_bdes[model_name], - ) - else: - results[model_name] = None - return results + return _compute_bde_errors(dft_geometry_bdes) @pytest.fixture @@ -320,53 +438,107 @@ def dft_geometry_bde_rank_correlations(dft_geometry_bde_ranks) -> dict[str, floa dict[str, float] Dictionary of predicted BDE rank correlation for all models. """ - ref_ranks_by_compound = group_values_by_labels( - values=dft_geometry_bde_ranks["ref"], - labels=_COMPOUND_LABELS, - ) + return _compute_rank_correlations(dft_geometry_bde_ranks, _COMPOUND_LABELS) - results = {} - for model_name in MODELS: - if dft_geometry_bde_ranks[model_name]: - pred_ranks_by_compound = group_values_by_labels( - values=dft_geometry_bde_ranks[model_name], - labels=_COMPOUND_LABELS, - ) - # equivalent to mae, rmse - results[model_name] = mean_kendalls_tau( - ref_by_label=ref_ranks_by_compound, - pred_by_label=pred_ranks_by_compound, - ) +# --------------------------------------------------------------------------- +# MLFF-geometry fixtures +# --------------------------------------------------------------------------- - else: - results[model_name] = None - return results +@pytest.fixture +@plot_parity( + filename=OUT_PATH / "figure.CYP3A4.mlff_opt_geometry.BDEs.json", + title="Bond Dissociation Energies on MLFF Geometries", + x_label="Predicted BDE / kcal/mol", + y_label="Reference BDE / kcal/mol", + hoverdata={ + "Compound": get_hover_data_labels("compound"), + }, +) +def mlff_geometry_bdes() -> dict[str, list]: + """ + Compute BDEs for all sp3 H atoms on MLFF optimised geometries. -def mean_kendalls_tau(ref_by_label, pred_by_label): + Returns + ------- + dict[str, list] + Dictionary of reference and predicted BDEs and labels of the + compound for the corresponding BDE. """ - Compute mean Kendall's tau across labels. + return _load_bdes("mlff_opt", _MLFF_COMPOUND_LABELS) + + +@pytest.fixture +@plot_parity( + filename=OUT_PATH / "figure.CYP3A4.mlff_opt_geometry.BDE_ranks.json", + title="Bond Dissociation Energy ranks on MLFF Geometries", + x_label="Predicted BDE rank", + y_label="Reference BDE rank", + hoverdata={ + "Compound": get_hover_data_labels("compound"), + }, +) +def mlff_geometry_bde_ranks(mlff_geometry_bdes) -> dict[str, list]: + """ + Compute BDE ranks for all sp3 H atoms using MLFF optimised geometries. Parameters ---------- - ref_by_label : dict[str, list] - Reference ranks grouped by compound label. - pred_by_label : dict[str, list] - Predicted ranks grouped by compound label. + mlff_geometry_bdes : dict[str, list] + Dictionary of reference and predicted BDEs. Returns ------- - float - Mean Kendall's tau correlation across all labels. + dict[str, list] + Dictionary of reference and predicted BDE ranks. """ - kendalls_taus = [] - for label in ref_by_label.keys(): - ref = ref_by_label[label] - pred = pred_by_label[label] - correlation = stats.kendalltau(ref, pred).correlation - kendalls_taus.append(correlation) - return np.mean(kendalls_taus) + return _compute_ranks(mlff_geometry_bdes, _MLFF_COMPOUND_LABELS) + + +@pytest.fixture +def mlff_geometry_bde_errors(mlff_geometry_bdes) -> dict[str, float]: + """ + Get mean absolute error for bond dissociation energies on MLFF geometries. + + Parameters + ---------- + mlff_geometry_bdes + Dictionary of reference and predicted BDEs. + + Returns + ------- + dict[str, float] + Dictionary of predicted BDE errors for all models. + """ + return _compute_bde_errors(mlff_geometry_bdes) + + +@pytest.fixture +def mlff_geometry_bde_rank_correlations(mlff_geometry_bde_ranks) -> dict[str, float]: + """ + Compute mean Kendall's tau rank correlation across all molecules on MLFF geometries. + + For each molecule, sp3 C-H bond strengths are ranked from lowest to + highest by both DFT and MLIP, and the correlation is measured by + Kendall's tau. The final metric is the average across all molecules. + + Parameters + ---------- + mlff_geometry_bde_ranks : dict[str, list] + Dictionary of reference and predicted BDE ranks. + + Returns + ------- + dict[str, float] + Dictionary of predicted BDE rank correlation for all models. + """ + return _compute_rank_correlations(mlff_geometry_bde_ranks, _MLFF_COMPOUND_LABELS) + + +# --------------------------------------------------------------------------- +# Metrics table +# --------------------------------------------------------------------------- @pytest.fixture @@ -379,17 +551,22 @@ def mean_kendalls_tau(ref_by_label, pred_by_label): def metrics( dft_geometry_bde_errors: dict[str, float], dft_geometry_bde_rank_correlations: dict[str, float], + mlff_geometry_bde_errors: dict[str, float], + mlff_geometry_bde_rank_correlations: dict[str, float], ) -> dict[str, dict]: """ - Get all total rad energy metrics. + Get all BDE metrics. Parameters ---------- dft_geometry_bde_errors - Mean absolute errors for all systems. + Mean absolute errors on DFT-relaxed structures. dft_geometry_bde_rank_correlations - Mean Kendal's tau across predicted and reference - BDE ranks within a given molecule. + Mean Kendall's tau across predicted and reference BDE ranks on DFT geometries. + mlff_geometry_bde_errors + Mean absolute errors on MLFF-relaxed structures. + mlff_geometry_bde_rank_correlations + Mean Kendall's tau across predicted and reference BDE ranks on MLFF geometries. Returns ------- @@ -399,6 +576,8 @@ def metrics( return { "Direct BDE": dft_geometry_bde_errors, "BDE rank": dft_geometry_bde_rank_correlations, + "Direct BDE (MLFF opt)": mlff_geometry_bde_errors, + "BDE rank (MLFF opt)": mlff_geometry_bde_rank_correlations, } diff --git a/ml_peg/analysis/molecular/BDEs/metrics.yml b/ml_peg/analysis/molecular/BDEs/metrics.yml index 3b6469219..3d6af0b25 100644 --- a/ml_peg/analysis/molecular/BDEs/metrics.yml +++ b/ml_peg/analysis/molecular/BDEs/metrics.yml @@ -1,13 +1,25 @@ metrics: Direct BDE: - good: 0.0 + good: 0.5 bad: 50.0 unit: kcal/mol tooltip: "Mean Absolute Error on DFT-relaxed structures" level of theory: B3LYP-D3BJ/def2-SV(P) + Direct BDE (MLFF opt): + good: 0.5 + bad: 50.0 + unit: kcal/mol + tooltip: "Mean Absolute Error on MLFF-relaxed structures" + level of theory: B3LYP-D3BJ/def2-SV(P) BDE rank: good: 1.0 - bad: 0.0 + bad: 0.5 unit: null tooltip: "Mean Kendal rank correlation coefficient" level of theory: B3LYP-D3BJ/def2-SV(P) + BDE rank (MLFF opt): + good: 1.0 + bad: 0.5 + unit: null + tooltip: "Mean Kendall rank correlation coefficient on MLFF-relaxed structures" + level of theory: B3LYP-D3BJ/def2-SV(P) diff --git a/ml_peg/calcs/molecular/BDEs/calc_BDEs.py b/ml_peg/calcs/molecular/BDEs/calc_BDEs.py index 828dfcdb3..ed0ea1456 100644 --- a/ml_peg/calcs/molecular/BDEs/calc_BDEs.py +++ b/ml_peg/calcs/molecular/BDEs/calc_BDEs.py @@ -12,6 +12,7 @@ from typing import Any from ase.io import read, write +from janus_core.calculations.geom_opt import GeomOpt import pytest from ml_peg.calcs.utils.utils import download_s3_data @@ -20,9 +21,55 @@ MODELS = load_models(current_models) -DATA_PATH = Path(__file__).parent / "data" OUT_PATH = Path(__file__).parent / "outputs" +DFT_OPT_FILENAME = "cytochrome_p450_substrates.dft_opt.xyz" + + +def evaluate_bde_structures( + model_name: str, + model: Any, + bde_dir: Path, + out_filename: str, + geom_opt: bool = False, +) -> None: + """ + Evaluate MLFF energies on BDE structures, optionally after geometry optimisation. + + Parameters + ---------- + model_name + Name of the model, used to determine the output directory. + model + Model object providing get_calculator() and add_d3_calculator(). + bde_dir + Directory containing the input xyz file. + out_filename + Name of the output xyz file. + geom_opt + If True, geometry-optimise each structure with the MLFF before + evaluating energies and forces. + """ + calc = model.get_calculator() + calc = model.add_d3_calculator(calc) + + mols_rads = read(bde_dir / DFT_OPT_FILENAME, ":") + + ats_out = [] + for at in mols_rads: + at.calc = copy(calc) + if geom_opt: + geomopt = GeomOpt(struct=at, filter_class=None, write_results=False) + geomopt.run() + at = geomopt.struct + at.info["pred_energy"] = at.get_potential_energy() + at.arrays["pred_forces"] = at.get_forces() + ats_out.append(at) + + write_dir = OUT_PATH / model_name + write_dir.mkdir(parents=True, exist_ok=True) + write(write_dir / out_filename, ats_out) + @pytest.mark.parametrize("mlip", MODELS.items()) def test_bond_dissociation_energy(mlip: tuple[str, Any]) -> None: @@ -35,10 +82,6 @@ def test_bond_dissociation_energy(mlip: tuple[str, Any]) -> None: Name of model use and model to get calculator. """ model_name, model = mlip - calc = model.get_calculator() - - # Add D3 calculator for this test (for models where applicable) - calc = model.add_d3_calculator(calc) bde_dir = ( download_s3_data( @@ -48,16 +91,10 @@ def test_bond_dissociation_energy(mlip: tuple[str, Any]) -> None: / "BDEs" ) - structures_filename = "cytochrome_p450_substrates.dft_opt.xyz" - mols_rads = read(Path(bde_dir) / structures_filename, ":") - - ats_out = [] - for at in mols_rads: - at.calc = copy(calc) - at.info["pred_energy"] = at.get_potential_energy() - at.arrays["pred_forces"] = at.get_forces() - ats_out.append(at) - - write_dir = OUT_PATH / model_name - write_dir.mkdir(parents=True, exist_ok=True) - write(write_dir / structures_filename, ats_out) + evaluate_bde_structures( + model_name=model_name, + model=model, + bde_dir=bde_dir, + out_filename=DFT_OPT_FILENAME, + geom_opt=False, + ) diff --git a/ml_peg/calcs/molecular/BDEs/calc_BDEs_mlff_opt.py b/ml_peg/calcs/molecular/BDEs/calc_BDEs_mlff_opt.py new file mode 100644 index 000000000..6f68638c2 --- /dev/null +++ b/ml_peg/calcs/molecular/BDEs/calc_BDEs_mlff_opt.py @@ -0,0 +1,55 @@ +""" +Calculate BDEs of drug-like molecules on MLFF-optimised geometries. + +Journal of Chemical Theory and Computation 2024 20 (1), 164-177 +DOI: 10.1021/acs.jctc.3c00710 +""" + +from __future__ import annotations + +from typing import Any + +import pytest + +from ml_peg.calcs.molecular.BDEs.calc_BDEs import evaluate_bde_structures +from ml_peg.calcs.utils.utils import download_s3_data +from ml_peg.models.get_models import load_models +from ml_peg.models.models import current_models + +MODELS = load_models(current_models) + +MLFF_OPT_FILENAME = "cytochrome_p450_substrates.mlff_opt.xyz" + + +@pytest.mark.parametrize("mlip", MODELS.items()) +def test_bond_dissociation_energy_mlff_opt(mlip: tuple[str, Any]) -> None: + """ + Calculate C-H BDEs of drug-like molecules on MLFF-optimised geometries. + + Geometry-optimises each structure (molecule, radical, isolated atom) using the + MLFF before evaluating energies and forces. DFT reference energies are preserved + from the input structures. Reference BDEs are computed from DFT energies at DFT + geometries. + + Parameters + ---------- + mlip + Name of model use and model to get calculator. + """ + model_name, model = mlip + + bde_dir = ( + download_s3_data( + key="inputs/molecular/BDEs/BDEs.zip", + filename="BDEs.zip", + ) + / "BDEs" + ) + + evaluate_bde_structures( + model_name=model_name, + model=model, + bde_dir=bde_dir, + out_filename=MLFF_OPT_FILENAME, + geom_opt=True, + ) From 6f41ef8f187a3ac7e03aaec542410ce34c83e982 Mon Sep 17 00:00:00 2001 From: elena Date: Wed, 11 Mar 2026 20:42:49 +0000 Subject: [PATCH 5/9] fix interactie visualisation --- .../analysis/molecular/BDEs/analyse_BDEs.py | 50 +++++- ml_peg/app/molecular/BDEs/app_BDEs.py | 168 +++++++++++++++++- 2 files changed, 206 insertions(+), 12 deletions(-) diff --git a/ml_peg/analysis/molecular/BDEs/analyse_BDEs.py b/ml_peg/analysis/molecular/BDEs/analyse_BDEs.py index e3ae8d65e..dbd0ed195 100644 --- a/ml_peg/analysis/molecular/BDEs/analyse_BDEs.py +++ b/ml_peg/analysis/molecular/BDEs/analyse_BDEs.py @@ -5,7 +5,7 @@ from pathlib import Path from ase import units -from ase.io import read +from ase.io import read, write import numpy as np import pytest from scipy import stats @@ -310,6 +310,46 @@ def _compute_bde_errors(bdes: dict) -> dict[str, float]: } +def _save_struct_files(xyz_suffix: str) -> None: + """ + Save individual radical XYZ files for app visualisation. + + Writes one file per BDE data point (one per radical) in the same order + as ``_load_bdes``, so that scatter-point indices map to structure files. + Saves for every model that has the relevant output file. + + Parameters + ---------- + xyz_suffix + Suffix of the xyz filename, e.g. ``"dft_opt"`` or ``"mlff_opt"``. + """ + xyz_filename = f"cytochrome_p450_substrates.{xyz_suffix}.xyz" + for model_name in MODELS: + model_dir = CALC_PATH / model_name + if not model_dir.exists(): + continue + xyz_files = sorted(model_dir.glob(xyz_filename)) + if not xyz_files: + continue + + struct_dir = OUT_PATH / model_name / xyz_suffix + struct_dir.mkdir(parents=True, exist_ok=True) + + for xyz_file in xyz_files: + all_atoms = read(xyz_file, index=":") + compound_dict = into_dict_of_labels(all_atoms, key="compound") + compound_dict.pop("isolated_atom", None) + + idx = 0 + for compound_atoms in compound_dict.values(): + mol_rad_dict = into_dict_of_labels(compound_atoms, "mol_or_rad") + if "rad" not in mol_rad_dict: + continue + for rad in mol_rad_dict["rad"]: + write(struct_dir / f"{idx}.xyz", rad) + idx += 1 + + def _compute_rank_correlations(ranks: dict, labels: list) -> dict[str, float]: """ Compute mean Kendall's tau rank correlation against reference ranks. @@ -371,7 +411,9 @@ def dft_geometry_bdes() -> dict[str, list]: Dictionary of reference and predicted BDEs and labels of the compound for the corresponding BDE. """ - return _load_bdes("dft_opt", _COMPOUND_LABELS) + result = _load_bdes("dft_opt", _COMPOUND_LABELS) + _save_struct_files("dft_opt") + return result @pytest.fixture @@ -466,7 +508,9 @@ def mlff_geometry_bdes() -> dict[str, list]: Dictionary of reference and predicted BDEs and labels of the compound for the corresponding BDE. """ - return _load_bdes("mlff_opt", _MLFF_COMPOUND_LABELS) + result = _load_bdes("mlff_opt", _MLFF_COMPOUND_LABELS) + _save_struct_files("mlff_opt") + return result @pytest.fixture diff --git a/ml_peg/app/molecular/BDEs/app_BDEs.py b/ml_peg/app/molecular/BDEs/app_BDEs.py index b97fcbfc4..3a3115d1b 100644 --- a/ml_peg/app/molecular/BDEs/app_BDEs.py +++ b/ml_peg/app/molecular/BDEs/app_BDEs.py @@ -2,7 +2,8 @@ from __future__ import annotations -from dash.html import Div +from dash import Input, Output, callback +from dash.html import Div, Iframe from ml_peg.app import APP_ROOT from ml_peg.app.base_app import BaseApp @@ -11,6 +12,7 @@ struct_from_scatter, ) from ml_peg.app.utils.load import read_plot +from ml_peg.app.utils.weas import generate_weas_html from ml_peg.models.get_models import get_model_names from ml_peg.models.models import current_models @@ -34,27 +36,175 @@ def register_callbacks(self) -> None: DATA_PATH / "figure.CYP3A4.dft_opt_geometry.BDE_ranks.json", id=f"{BENCHMARK_NAME}-ranks-figure", ) + scatter_mlff_bdes = read_plot( + DATA_PATH / "figure.CYP3A4.mlff_opt_geometry.BDEs.json", + id=f"{BENCHMARK_NAME}-mlff-figure", + ) + scatter_mlff_ranks = read_plot( + DATA_PATH / "figure.CYP3A4.mlff_opt_geometry.BDE_ranks.json", + id=f"{BENCHMARK_NAME}-mlff-ranks-figure", + ) + + def _get_structs(suffix: str) -> list[str]: + """ + Return asset paths for the first model with saved structure files. + + Parameters + ---------- + suffix + Geometry suffix, e.g. ``"dft_opt"`` or ``"mlff_opt"``. + + Returns + ------- + list[str] + Asset URL paths for each structure file, or empty list if none found. + """ + for model_name in MODELS: + structs_dir = DATA_PATH / model_name / suffix + xyz_files = sorted(structs_dir.glob("*.xyz"), key=lambda p: int(p.stem)) + if xyz_files: + return [ + f"assets/molecular/BDEs/{model_name}/{suffix}/{f.stem}.xyz" + for f in xyz_files + ] + return [] + + def _get_structs_per_model(suffix: str) -> dict[str, list[str]]: + """ + Return asset paths for each model that has saved structure files. - # Assets dir will be parent directory - individual files for each system - structs_dir = DATA_PATH / MODELS[0] - structs = [ - f"assets/molecular/BDEs/{MODELS[0]}/{struct_file.stem}.xyz" - for struct_file in sorted(structs_dir.glob("*.xyz")) - ] + Parameters + ---------- + suffix + Geometry suffix, e.g. ``"dft_opt"`` or ``"mlff_opt"``. + + Returns + ------- + dict[str, list[str]] + Mapping of model name to asset URL paths for its structure files. + """ + result = {} + for model_name in MODELS: + structs_dir = DATA_PATH / model_name / suffix + xyz_files = sorted(structs_dir.glob("*.xyz"), key=lambda p: int(p.stem)) + if xyz_files: + result[model_name] = [ + f"assets/molecular/BDEs/{model_name}/{suffix}/{f.stem}.xyz" + for f in xyz_files + ] + return result + + structs_dft = _get_structs("dft_opt") + structs_mlff_by_model = _get_structs_per_model("mlff_opt") plot_from_table_column( table_id=self.table_id, plot_id=f"{BENCHMARK_NAME}-figure-placeholder", - column_to_plot={"Direct BDE": scatter_bdes, "BDE rank": scatter_ranks}, + column_to_plot={ + "Direct BDE": scatter_bdes, + "BDE rank": scatter_ranks, + "Direct BDE (MLFF opt)": scatter_mlff_bdes, + "BDE rank (MLFF opt)": scatter_mlff_ranks, + }, ) struct_from_scatter( scatter_id=f"{BENCHMARK_NAME}-figure", struct_id=f"{BENCHMARK_NAME}-struct-placeholder", - structs=structs, + structs=structs_dft, + mode="struct", + ) + + struct_from_scatter( + scatter_id=f"{BENCHMARK_NAME}-ranks-figure", + struct_id=f"{BENCHMARK_NAME}-struct-placeholder", + structs=structs_dft, mode="struct", ) + def _show_mlff_struct_from_click(click_data): + """ + Return structure viewer for the clicked MLFF scatter point. + + Parameters + ---------- + click_data + Clicked data point in scatter plot. + + Returns + ------- + Div + Visualised structure on plot click. + """ + if not click_data: + return Div("Click on a metric to view the structure.") + point = click_data["points"][0] + idx = point["pointNumber"] + curve_num = point["curveNumber"] + if curve_num >= len(MODELS): + return Div("No structures available for this model.") + model_name = MODELS[curve_num] + structs = structs_mlff_by_model.get(model_name, []) + if not structs: + return Div("No structures available for this model.") + return Div( + Iframe( + srcDoc=generate_weas_html(structs[idx], "struct", 0), + style={ + "height": "550px", + "width": "100%", + "border": "1px solid #ddd", + "borderRadius": "5px", + }, + ) + ) + + @callback( + Output( + f"{BENCHMARK_NAME}-struct-placeholder", "children", allow_duplicate=True + ), + Input(f"{BENCHMARK_NAME}-mlff-figure", "clickData"), + prevent_initial_call="initial_duplicate", + ) + def show_mlff_struct(click_data): + """ + Show structure for the clicked MLFF BDE scatter point. + + Parameters + ---------- + click_data + Clicked data point in scatter plot. + + Returns + ------- + Div + Visualised structure on plot click. + """ + return _show_mlff_struct_from_click(click_data) + + @callback( + Output( + f"{BENCHMARK_NAME}-struct-placeholder", "children", allow_duplicate=True + ), + Input(f"{BENCHMARK_NAME}-mlff-ranks-figure", "clickData"), + prevent_initial_call="initial_duplicate", + ) + def show_mlff_rank_struct(click_data): + """ + Show structure for the clicked MLFF BDE rank scatter point. + + Parameters + ---------- + click_data + Clicked data point in scatter plot. + + Returns + ------- + Div + Visualised structure on plot click. + """ + return _show_mlff_struct_from_click(click_data) + def get_app() -> BDEsApp: """ From b87fc886f4d311de348f37b87db17937ad91406c Mon Sep 17 00:00:00 2001 From: elena Date: Wed, 11 Mar 2026 21:41:07 +0000 Subject: [PATCH 6/9] float64 for BDE tests --- ml_peg/calcs/molecular/BDEs/calc_BDEs.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ml_peg/calcs/molecular/BDEs/calc_BDEs.py b/ml_peg/calcs/molecular/BDEs/calc_BDEs.py index ed0ea1456..e10f540f7 100644 --- a/ml_peg/calcs/molecular/BDEs/calc_BDEs.py +++ b/ml_peg/calcs/molecular/BDEs/calc_BDEs.py @@ -50,6 +50,8 @@ def evaluate_bde_structures( If True, geometry-optimise each structure with the MLFF before evaluating energies and forces. """ + model.default_dtype = "float64" + model.d3_kwargs["dtype"] = "float64" calc = model.get_calculator() calc = model.add_d3_calculator(calc) From 616793ad8764ed39573ea945c559683a628347a2 Mon Sep 17 00:00:00 2001 From: elena Date: Wed, 11 Mar 2026 21:53:27 +0000 Subject: [PATCH 7/9] update docs --- .../user_guide/benchmarks/molecular.rst | 37 +++++++++++++------ 1 file changed, 26 insertions(+), 11 deletions(-) diff --git a/docs/source/user_guide/benchmarks/molecular.rst b/docs/source/user_guide/benchmarks/molecular.rst index a6d3d7bdd..ab3e4937c 100644 --- a/docs/source/user_guide/benchmarks/molecular.rst +++ b/docs/source/user_guide/benchmarks/molecular.rst @@ -178,29 +178,44 @@ Input structures: BDEs -========= +==== Summary ------- -Performance in [for now] predicting energies of 60 CYP substrates and derived radicals. -Only [CHO] elements +Performance in predicting C-H bond dissociation energies (BDEs) for 60 CYP3A4 drug-like +substrates (CHO elements only), comprising 1117 sp3 C-H bonds across all molecules. Metrics ------- -1. Energy RMSE +1. Direct BDE + +Mean absolute error (MAE) of predicted BDEs against DFT reference values, in kcal/mol. + +For each molecule, BDEs are computed as: BDE = E(radical) + E(H) − E(molecule), where +energies are evaluated on DFT-optimised geometries. + +2. BDE rank + +Mean Kendall's τ rank correlation of predicted BDE rankings against reference rankings, +evaluated per molecule and averaged across all 60 compounds. -Accuracy of energy prediction. Measured as binding energy per atom. +3. Direct BDE (MLFF opt) -2. Force RMSE +Same as (1), but geometries are first relaxed using the MLFF before evaluating energies. + +4. BDE rank (MLFF opt) + +Same as (2), but using MLFF-optimised geometries. Computational cost ------------------ -Low: tests are likely to take less than a minute to run on CPU. +Medium: the DFT geometry tests are fast, but the MLFF geometry optimisation tests may +take several minutes per model on CPU. Data availability @@ -208,11 +223,11 @@ Data availability Input structures: -* TODO Input structures were evaluated as part of "Gelzinyte et al., Journal of Chemical Theory and Computation - 2024 20 (1), 164-177, DOI: 10.1021/acs.jctc.3c00710" but I didn't include the xyzs there... Just the SMILES strings. - +* Gelzinyte, E. et al. Transferable Machine Learning Interatomic Potential for Bond + Dissociation Energy Estimation. J. Chem. Theory Comput. 20, 164-177 (2024). + DOI: 10.1021/acs.jctc.3c00710 Reference data: -* (TODO ??Same as input data) +* Same as input data * B3LYP-D3BJ/def2-SV(P) From 8d3175e0b9da47427d27a0fac95989ec57ec8f67 Mon Sep 17 00:00:00 2001 From: elena Date: Wed, 11 Mar 2026 21:58:17 +0000 Subject: [PATCH 8/9] make standalone app --- ml_peg/app/molecular/BDEs/app_BDEs.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/ml_peg/app/molecular/BDEs/app_BDEs.py b/ml_peg/app/molecular/BDEs/app_BDEs.py index 3a3115d1b..9ba1716c6 100644 --- a/ml_peg/app/molecular/BDEs/app_BDEs.py +++ b/ml_peg/app/molecular/BDEs/app_BDEs.py @@ -2,7 +2,7 @@ from __future__ import annotations -from dash import Input, Output, callback +from dash import Dash, Input, Output, callback from dash.html import Div, Iframe from ml_peg.app import APP_ROOT @@ -225,3 +225,16 @@ def get_app() -> BDEsApp: Div(id=f"{BENCHMARK_NAME}-struct-placeholder"), ], ) + + +if __name__ == "__main__": + # Create Dash app + full_app = Dash(__name__, assets_folder=DATA_PATH.parent.parent) + + # Construct layout and register callbacks + bdes_app = get_app() + full_app.layout = bdes_app.layout + bdes_app.register_callbacks() + + # Run app + full_app.run(port=8055, debug=True) From 8262e91ffdb82a1cff862b8ea1789883c72e17e4 Mon Sep 17 00:00:00 2001 From: elena Date: Thu, 12 Mar 2026 21:51:43 +0000 Subject: [PATCH 9/9] changes to work with main --- ml_peg/analysis/molecular/BDEs/analyse_BDEs.py | 8 ++++++-- ml_peg/calcs/molecular/BDEs/calc_BDEs.py | 2 +- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/ml_peg/analysis/molecular/BDEs/analyse_BDEs.py b/ml_peg/analysis/molecular/BDEs/analyse_BDEs.py index dbd0ed195..44ca3315c 100644 --- a/ml_peg/analysis/molecular/BDEs/analyse_BDEs.py +++ b/ml_peg/analysis/molecular/BDEs/analyse_BDEs.py @@ -11,14 +11,18 @@ from scipy import stats from ml_peg.analysis.utils.decorators import build_table, plot_parity -from ml_peg.analysis.utils.utils import build_d3_name_map, load_metrics_config, mae +from ml_peg.analysis.utils.utils import ( + build_dispersion_name_map, + load_metrics_config, + mae, +) from ml_peg.app import APP_ROOT from ml_peg.calcs import CALCS_ROOT from ml_peg.models.get_models import get_model_names from ml_peg.models.models import current_models MODELS = get_model_names(current_models) -D3_MODEL_NAMES = build_d3_name_map(MODELS) +D3_MODEL_NAMES = build_dispersion_name_map(MODELS) CALC_PATH = CALCS_ROOT / "molecular" / "BDEs" / "outputs" OUT_PATH = APP_ROOT / "data" / "molecular" / "BDEs" diff --git a/ml_peg/calcs/molecular/BDEs/calc_BDEs.py b/ml_peg/calcs/molecular/BDEs/calc_BDEs.py index e10f540f7..e8b06f633 100644 --- a/ml_peg/calcs/molecular/BDEs/calc_BDEs.py +++ b/ml_peg/calcs/molecular/BDEs/calc_BDEs.py @@ -51,7 +51,7 @@ def evaluate_bde_structures( evaluating energies and forces. """ model.default_dtype = "float64" - model.d3_kwargs["dtype"] = "float64" + model.dispersion_kwargs["dtype"] = "float64" calc = model.get_calculator() calc = model.add_d3_calculator(calc)