From f3d36f898508926ac59d9d03e9efbc65ef9d44e7 Mon Sep 17 00:00:00 2001 From: Rohit Goswami Date: Mon, 16 Feb 2026 21:02:50 +0100 Subject: [PATCH 1/5] feat(pet): add neighbor_atom_indices to systems_to_batch Return the neighbor atom indices in NEF format from systems_to_batch. This tensor maps each edge to its neighbor atom index, needed to scatter edge-level force gradients to per-atom forces in the compiled forward path. --- src/metatrain/pet/model.py | 1 + src/metatrain/pet/modules/structures.py | 5 +++++ 2 files changed, 6 insertions(+) diff --git a/src/metatrain/pet/model.py b/src/metatrain/pet/model.py index 757e778140..f9b7d716ac 100644 --- a/src/metatrain/pet/model.py +++ b/src/metatrain/pet/model.py @@ -416,6 +416,7 @@ def forward( reverse_neighbor_index, cutoff_factors, system_indices, + _neighbor_atom_indices, sample_labels, ) = systems_to_batch( systems, diff --git a/src/metatrain/pet/modules/structures.py b/src/metatrain/pet/modules/structures.py index 8438a7c67e..291556f9d8 100644 --- a/src/metatrain/pet/modules/structures.py +++ b/src/metatrain/pet/modules/structures.py @@ -154,6 +154,7 @@ def systems_to_batch( torch.Tensor, torch.Tensor, torch.Tensor, + torch.Tensor, Labels, ]: """ @@ -181,6 +182,8 @@ def systems_to_batch( - `reverse_neighbor_index`: The reversed neighbor list for each central atom - `cutoff_factors`: The cutoff function values for each edge - `system_indices`: The system index for each atom in the batch + - `neighbor_atom_indices`: The atom index of each neighbor in NEF format, + used to scatter edge-level gradients to per-atom forces in the compiled path - `sample_labels`: Labels indicating the system and atom indices for each atom """ @@ -296,6 +299,7 @@ def systems_to_batch( reversed_neighbor_list = compute_reversed_neighbor_list( nef_indices, corresponding_edges, nef_mask ) + neighbor_atom_indices = edge_array_to_nef(neighbors, nef_indices, nef_mask, 0.0) neighbors_index = edge_array_to_nef(neighbors, nef_indices).to(torch.int64) # Here, we compute the array that allows indexing into a flattened @@ -320,5 +324,6 @@ def systems_to_batch( reverse_neighbor_index, cutoff_factors, system_indices, + neighbor_atom_indices, sample_labels, ) From fda7130cc763df78b01575fc9b67dd0378be84c4 Mon Sep 17 00:00:00 2001 From: Rohit Goswami Date: Mon, 16 Feb 2026 21:03:17 +0100 Subject: [PATCH 2/5] feat(pet): add pure-tensor _forward_from_batch method Add a pure-tensor forward path that bypasses metatensor wrapping, returning Dict[str, Dict[str, Tensor]] (target -> block_key -> tensor). Always uses SDPA attention since forces will be computed via autograd.grad(create_graph=False) in the compiled graph, avoiding double backward entirely. --- src/metatrain/pet/model.py | 72 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) diff --git a/src/metatrain/pet/model.py b/src/metatrain/pet/model.py index f9b7d716ac..9ecfac551b 100644 --- a/src/metatrain/pet/model.py +++ b/src/metatrain/pet/model.py @@ -571,6 +571,78 @@ def forward( return return_dict + def _forward_from_batch( + self, + element_indices_nodes: torch.Tensor, + element_indices_neighbors: torch.Tensor, + edge_vectors: torch.Tensor, + edge_distances: torch.Tensor, + padding_mask: torch.Tensor, + reverse_neighbor_index: torch.Tensor, + cutoff_factors: torch.Tensor, + ) -> Dict[str, Dict[str, torch.Tensor]]: + """ + Pure-tensor forward pass for FX compilation. + + Takes batch tensors and returns raw per-atom predictions as nested + dictionaries (target_name -> block_key -> tensor). 
Always uses SDPA + attention (no manual attention needed since forces are computed via + ``autograd.grad(create_graph=False)`` in the compiled graph, avoiding + double backward). + + :param element_indices_nodes: Atomic species of central atoms [n_atoms]. + :param element_indices_neighbors: Atomic species of neighbors + [n_atoms, max_neighbors]. + :param edge_vectors: Edge vectors [n_atoms, max_neighbors, 3]. + :param edge_distances: Edge distances [n_atoms, max_neighbors]. + :param padding_mask: Boolean mask for real neighbors + [n_atoms, max_neighbors]. + :param reverse_neighbor_index: Reversed neighbor index for message + passing [n_atoms, max_neighbors]. + :param cutoff_factors: Cutoff function values [n_atoms, max_neighbors]. + :return: Nested dict mapping target_name -> block_key -> per-atom + prediction tensor. + """ + featurizer_inputs: Dict[str, torch.Tensor] = dict( + element_indices_nodes=element_indices_nodes, + element_indices_neighbors=element_indices_neighbors, + edge_vectors=edge_vectors, + edge_distances=edge_distances, + reverse_neighbor_index=reverse_neighbor_index, + padding_mask=padding_mask, + cutoff_factors=cutoff_factors, + ) + + # Always use SDPA (no double backward in compiled path) + node_features_list, edge_features_list = self._calculate_features( + featurizer_inputs, use_manual_attention=False + ) + + node_ll_dict, edge_ll_dict = self._calculate_last_layer_features( + node_features_list, edge_features_list + ) + + outputs_all: Dict[str, ModelOutput] = { + name: ModelOutput(per_atom=True) for name in self.target_names + } + node_preds, edge_preds = self._calculate_atomic_predictions( + node_ll_dict, edge_ll_dict, padding_mask, cutoff_factors, outputs_all + ) + + # Sum across GNN layers for each target/block + results: Dict[str, Dict[str, torch.Tensor]] = {} + for target_name in self.target_names: + block_results: Dict[str, torch.Tensor] = {} + node_layers = node_preds[target_name] + edge_layers = edge_preds[target_name] + for j, key in enumerate(self.output_shapes[target_name]): + total = node_layers[0][j] + edge_layers[0][j] + for i in range(1, len(node_layers)): + total = total + node_layers[i][j] + edge_layers[i][j] + block_results[key] = total + results[target_name] = block_results + return results + def _calculate_features( self, inputs: Dict[str, torch.Tensor], use_manual_attention: bool ) -> Tuple[List[torch.Tensor], List[torch.Tensor]]: From 3c4e684294b82d851111fecd915de54fd2b73f07 Mon Sep 17 00:00:00 2001 From: Rohit Goswami Date: Mon, 16 Feb 2026 21:03:44 +0100 Subject: [PATCH 3/5] feat(pet): full-graph FX compilation for training Trace the entire PET forward pass (including force/stress computation via autograd.grad) into a single FX graph using make_fx, then compile with torch.compile(dynamic=True, fullgraph=True). This gives maximum kernel fusion, zero compiled/eager boundary crossings, and always uses SDPA since forces use create_graph=False (single backward, no double backward). 
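
In miniature, the recipe looks like this (a toy model standing in for PET;
the real implementation in modules/compile.py additionally swaps parameters
via NamedMemberAccessor, allows non-fake inputs, and disables duck shaping):

    import torch
    from torch.fx.experimental.proxy_tensor import make_fx

    net = torch.nn.Sequential(
        torch.nn.Linear(3, 16), torch.nn.Tanh(), torch.nn.Linear(16, 1)
    )
    names = [n for n, _ in net.named_parameters()]

    def energy_and_forces(positions, *params):
        # run the model functionally so parameters are graph inputs
        energy = torch.func.functional_call(
            net, dict(zip(names, params)), (positions,)
        ).sum()
        # single backward, traced into the same graph as the forward
        (grad,) = torch.autograd.grad(energy, positions, create_graph=False)
        return energy, -grad

    positions = torch.randn(5, 3, requires_grad=True)
    traced = make_fx(energy_and_forces, tracing_mode="symbolic")(
        positions, *net.parameters()
    )
    compiled = torch.compile(traced, dynamic=True, fullgraph=True)
    energy, forces = compiled(positions, *net.parameters())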
Key components: - modules/compile.py: _PETBatchForward wrapper, _make_pet_compiled_forward traceable function with NamedMemberAccessor param swapping, and compile_pet_model orchestrator - trainer.py: _wrap_compiled_output converts compiled outputs to Dict[str, TensorMap], training loop branches between compiled and eager paths - DecomposedSiLU (x * sigmoid(x)) replaces nn.SiLU before tracing since inductor can't differentiate silu_backward nodes --- pyproject.toml | 7 +- src/metatrain/pet/checkpoints.py | 9 + src/metatrain/pet/documentation.py | 11 + src/metatrain/pet/model.py | 9 +- src/metatrain/pet/modules/compile.py | 293 +++++++++++++++++++++++++ src/metatrain/pet/modules/utilities.py | 31 +++ src/metatrain/pet/trainer.py | 278 +++++++++++++++++++++-- 7 files changed, 613 insertions(+), 25 deletions(-) create mode 100644 src/metatrain/pet/modules/compile.py diff --git a/pyproject.toml b/pyproject.toml index 4025b408c3..18ea6df8cf 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -163,6 +163,7 @@ filterwarnings = [ "ignore:`torch.jit.script` is deprecated. Please switch to `torch.compile` or `torch.export`:DeprecationWarning", "ignore:`torch.jit.save` is deprecated. Please switch to `torch.export`:DeprecationWarning", "ignore:`torch.jit.load` is deprecated. Please switch to `torch.export`:DeprecationWarning", + "ignore:`torch.jit.script_method` is deprecated. Please switch to `torch.compile` or `torch.export`:DeprecationWarning", "ignore:`torch.jit.trace_method` is deprecated. Please switch to `torch.compile` or `torch.export`:DeprecationWarning", "ignore:`torch.jit.trace` is deprecated. Please switch to `torch.compile` or `torch.export`:DeprecationWarning", # PyTorch does not want these, but mypy requires them @@ -177,7 +178,11 @@ filterwarnings = [ # Multi-threaded tests clash with multi-process data-loading "ignore:This process \\(pid=\\d+\\) is multi-threaded, use of fork\\(\\) may lead to deadlocks in the child.:DeprecationWarning", # MACE warning with newer versions of pytorch (because they use e3nn==0.4.4) - "ignore:Environment variable TORCH_FORCE_NO_WEIGHTS_ONLY_LOAD detected, since the`weights_only` argument was not explicitly passed:UserWarning" + "ignore:Environment variable TORCH_FORCE_NO_WEIGHTS_ONLY_LOAD detected, since the`weights_only` argument was not explicitly passed:UserWarning", + # compiled_autograd + non-leaf tensors: dynamo incorrectly converts this C++ + # warning to an error when tracing backward graphs through non-leaf tensors + # (e.g. edge_distances computed from edge_vectors via sqrt) + "ignore:The .grad attribute of a Tensor that is not a leaf Tensor is being accessed:UserWarning" ] addopts = ["-p", "mtt_plugin"] pythonpath = "src/metatrain/utils/testing" diff --git a/src/metatrain/pet/checkpoints.py b/src/metatrain/pet/checkpoints.py index 020b1e6590..f09c9afbb3 100644 --- a/src/metatrain/pet/checkpoints.py +++ b/src/metatrain/pet/checkpoints.py @@ -420,3 +420,12 @@ def trainer_update_v11_v12(checkpoint: dict) -> None: :param checkpoint: The checkpoint to update. """ checkpoint["train_hypers"]["batch_atom_bounds"] = [None, None] + + +def trainer_update_v12_v13(checkpoint: dict) -> None: + """ + Update trainer checkpoint from version 12 to version 13. + + :param checkpoint: The checkpoint to update. 
+ """ + checkpoint["train_hypers"]["compile"] = False diff --git a/src/metatrain/pet/documentation.py b/src/metatrain/pet/documentation.py index 8eaec9c03e..629e02c0ce 100644 --- a/src/metatrain/pet/documentation.py +++ b/src/metatrain/pet/documentation.py @@ -257,3 +257,14 @@ class TrainerHypers(TypedDict): See :ref:`label_fine_tuning_concept` for more details. """ + compile: bool = False + """Whether to use full-graph FX compilation during training. + + When enabled, the entire PET model (including force/stress computation via + ``autograd.grad``) is traced into a single FX graph using ``make_fx`` and + then compiled with ``torch.compile(dynamic=True, fullgraph=True)``. This + gives maximum kernel fusion, zero compiled/eager boundary crossings, and + always uses ``scaled_dot_product_attention`` (SDPA). Expect a one-time + compilation cost at the start of training, followed by speedups on every + subsequent step. + """ diff --git a/src/metatrain/pet/model.py b/src/metatrain/pet/model.py index 9ecfac551b..c5cb7e5b5a 100644 --- a/src/metatrain/pet/model.py +++ b/src/metatrain/pet/model.py @@ -1413,7 +1413,12 @@ def upgrade_checkpoint(cls, checkpoint: Dict) -> Dict: return checkpoint def get_checkpoint(self) -> Dict: - model_state_dict = self.state_dict() + # Get state dict, handling compiled modules by removing _orig_mod prefix + state_dict = { + k.replace("._orig_mod", ""): v + for k, v in self.state_dict().items() + } + model_state_dict = dict(state_dict) model_state_dict["finetune_config"] = self.finetune_config checkpoint = { "architecture_name": "pet", @@ -1426,7 +1431,7 @@ def get_checkpoint(self) -> Dict: "epoch": None, "best_epoch": None, "model_state_dict": model_state_dict, - "best_model_state_dict": self.state_dict(), + "best_model_state_dict": state_dict, } return checkpoint diff --git a/src/metatrain/pet/modules/compile.py b/src/metatrain/pet/modules/compile.py new file mode 100644 index 0000000000..56287e1c44 --- /dev/null +++ b/src/metatrain/pet/modules/compile.py @@ -0,0 +1,293 @@ +"""Full-graph FX compilation for PET. + +Traces the entire PET forward pass (including force/stress computation via +``autograd.grad``) into a single FX graph, then compiles it with +``torch.compile(dynamic=True, fullgraph=True)``. This gives maximum kernel +fusion, zero compiled/eager boundary crossings, and always uses SDPA +(``scaled_dot_product_attention``) since forces use +``create_graph=False`` (no double backward). +""" + +import logging +from typing import Dict, List, Optional, Tuple + +import torch +from torch.nn.utils._named_member_accessor import NamedMemberAccessor + +from .utilities import replace_silu_modules + + +class _PETBatchForward(torch.nn.Module): + """Thin wrapper whose ``forward()`` delegates to ``pet._forward_from_batch``. + + PET is registered as a submodule so its parameters/buffers are visible + to ``functional_call`` / ``NamedMemberAccessor``. 
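+
+    A minimal sketch of how this wrapper is used further down in this
+    module::
+
+        batch_model = _PETBatchForward(pet)
+        params = dict(batch_model.named_parameters())  # keys start with "pet."
+        buffers = dict(batch_model.named_buffers())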
+ """ + + def __init__(self, pet: torch.nn.Module) -> None: + super().__init__() + self.pet = pet + + def forward( + self, + element_indices_nodes: torch.Tensor, + element_indices_neighbors: torch.Tensor, + edge_vectors: torch.Tensor, + edge_distances: torch.Tensor, + padding_mask: torch.Tensor, + reverse_neighbor_index: torch.Tensor, + cutoff_factors: torch.Tensor, + ) -> Dict[str, Dict[str, torch.Tensor]]: + return self.pet._forward_from_batch( + element_indices_nodes, + element_indices_neighbors, + edge_vectors, + edge_distances, + padding_mask, + reverse_neighbor_index, + cutoff_factors, + ) + + +def _make_pet_compiled_forward( + batch_model: _PETBatchForward, + param_names: List[str], + buffer_names: List[str], + target_names: List[str], + output_shapes: Dict[str, Dict[str, List[int]]], + compute_forces: bool, + compute_stress: bool, +): + """Build the traceable forward function for ``make_fx``. + + The returned function accepts all batch tensors and the model's + parameters/buffers as positional arguments (required by + ``make_fx`` with ``functional_call``). It returns + ``(per_structure_preds, forces, stress, raw_predictions)``. + """ + n_params = len(param_names) + accessor = NamedMemberAccessor(batch_model) + + # Identify which target is the energy target (quantity == "energy") + # For force/stress we need to aggregate per-atom energy to per-structure. + energy_target_name: Optional[str] = None + energy_block_key: Optional[str] = None + pet = batch_model.pet + for tname in target_names: + if hasattr(pet, "outputs") and tname in pet.outputs: + if pet.outputs[tname].quantity == "energy": + energy_target_name = tname + # First block key for this target + energy_block_key = next(iter(output_shapes[tname])) + break + + if (compute_forces or compute_stress) and energy_target_name is None: + raise ValueError( + "Force/stress compilation requested but no energy target found." 
+ ) + + def forward_fn( + edge_vectors, + element_indices_nodes, + element_indices_neighbors, + padding_mask, + reverse_neighbor_index, + cutoff_factors, + system_indices, + neighbor_atom_indices, + n_structures, + *params_and_buffers, + ): + # Swap in the provided params/buffers via NamedMemberAccessor + params_buffers = {} + for i, name in enumerate(param_names): + params_buffers[name] = params_and_buffers[i] + for i, name in enumerate(buffer_names): + params_buffers[name] = params_and_buffers[n_params + i] + + orig_values, _ = accessor.swap_tensors_dict( + params_buffers, allow_missing=True + ) + + # Compute edge_distances inside compiled graph (differentiable) + edge_distances = torch.sqrt((edge_vectors**2).sum(-1) + 1e-15) + + raw_predictions = batch_model( + element_indices_nodes, + element_indices_neighbors, + edge_vectors, + edge_distances, + padding_mask, + reverse_neighbor_index, + cutoff_factors, + ) + + # Restore original params/buffers + accessor.swap_tensors_dict(orig_values, allow_missing=True) + + # Aggregate per-atom predictions to per-structure for the energy target + n_atoms = edge_vectors.shape[0] + # +1 for padding structure index (scatter needs valid indices) + n_struct = n_structures + 1 + + energy: Optional[torch.Tensor] = None + forces: Optional[torch.Tensor] = None + stress: Optional[torch.Tensor] = None + + if energy_target_name is not None and energy_block_key is not None: + per_atom_energy = raw_predictions[energy_target_name][energy_block_key] + energy = torch.zeros( + n_struct, dtype=edge_vectors.dtype, device=edge_vectors.device + ) + energy.scatter_add_( + 0, system_indices, per_atom_energy.squeeze(-1) + ) + + if (compute_forces or compute_stress) and energy is not None: + (dE_dR,) = torch.autograd.grad( + energy[:n_structures].sum(), + edge_vectors, + create_graph=False, + ) + dE_dR = dE_dR * padding_mask[:, :, None].float() + + if compute_forces: + # d(E)/d(pos[i]): + # as center: -sum_j dE_dR[i, j] + # as neighbor: +sum_{(k,j): neighbor_atom=i} dE_dR[k, j] + grad_as_center = -dE_dR.sum(dim=1) # [n_atoms, 3] + flat_dE = dE_dR.reshape(-1, 3) + flat_idx = neighbor_atom_indices.reshape(-1, 1).expand(-1, 3).long() + grad_as_neighbor = torch.zeros( + n_atoms, 3, dtype=edge_vectors.dtype, device=edge_vectors.device + ) + grad_as_neighbor.scatter_add_(0, flat_idx, flat_dE) + forces = grad_as_center + grad_as_neighbor + + if compute_stress: + # Virial: sigma = (1/V) sum r otimes (dE/dr) + virial_per_atom = torch.einsum("ema,emb->eab", edge_vectors, dE_dR) + stress_buf = torch.zeros( + n_struct, 3, 3, + dtype=edge_vectors.dtype, device=edge_vectors.device, + ) + stress_buf.scatter_add_( + 0, + system_indices[:, None, None].expand(-1, 3, 3), + virial_per_atom, + ) + stress = stress_buf[:n_structures] + + if energy is not None: + energy = energy[:n_structures] + + return energy, forces, stress, raw_predictions + + return forward_fn + + +def compile_pet_model( + model: torch.nn.Module, + train_dataloader, + compute_forces: bool, + compute_stress: bool, +) -> Tuple[torch.nn.Module, List[str], List[str]]: + """Trace and compile the PET model as a single FX graph. + + :param model: The PET model instance. + :param train_dataloader: A dataloader to get a sample batch for tracing. + :param compute_forces: Whether force computation is included. + :param compute_stress: Whether stress computation is included. + :return: Tuple of (compiled_module, param_names, buffer_names). 
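+
+    A rough usage sketch, mirroring the compiled training path in
+    ``trainer.py`` (batch tensors as returned by ``systems_to_batch``)::
+
+        compiled_fn, _, _ = compile_pet_model(
+            model, train_dataloader, compute_forces=True, compute_stress=False
+        )
+        edge_vectors.requires_grad_(True)  # needed for the traced force path
+        energy, forces, stress, raw_preds = compiled_fn(
+            edge_vectors, element_indices_nodes, element_indices_neighbors,
+            padding_mask, reverse_neighbor_index, cutoff_factors,
+            system_indices, neighbor_atom_indices, n_structures,
+            *model.parameters(), *model.buffers(),
+        )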
+ """ + from torch.fx.experimental.proxy_tensor import make_fx + + from metatrain.utils.data import unpack_batch + from metatrain.utils.transfer import batch_to + + from ..modules.structures import systems_to_batch + + batch_model = _PETBatchForward(model) + replace_silu_modules(batch_model) + + params = dict(batch_model.named_parameters()) + buffers = dict(batch_model.named_buffers()) + param_names = list(params.keys()) + buffer_names = list(buffers.keys()) + + forward_fn = _make_pet_compiled_forward( + batch_model, + param_names, + buffer_names, + model.target_names, + model.output_shapes, + compute_forces, + compute_stress, + ) + + # Get a sample batch for tracing + batch = next(iter(train_dataloader)) + systems, _targets, _extra_data = unpack_batch(batch) + device = next(model.parameters()).device + dtype = next(model.parameters()).dtype + systems, _, _ = batch_to(systems, {}, {}, dtype=dtype, device=device) + + ( + element_indices_nodes, + element_indices_neighbors, + edge_vectors, + edge_distances, + padding_mask, + reverse_neighbor_index, + cutoff_factors, + system_indices, + neighbor_atom_indices, + _sample_labels, + ) = systems_to_batch( + systems, + model.requested_nl, + model.atomic_types, + model.species_to_species_index, + model.cutoff_function, + model.cutoff_width, + model.num_neighbors_adaptive, + ) + + n_structures = int(system_indices.max().item()) + 1 + + # edge_vectors needs grad for force tracing + tracing_edge_vectors = edge_vectors.clone().requires_grad_(True) + + logging.info( + "Tracing PET model with make_fx (symbolic tracing)..." + ) + + old_duck = torch.fx.experimental._config.use_duck_shape + torch.fx.experimental._config.use_duck_shape = False + try: + fx_graph = make_fx( + forward_fn, + tracing_mode="symbolic", + _allow_non_fake_inputs=True, + )( + tracing_edge_vectors, + element_indices_nodes, + element_indices_neighbors, + padding_mask, + reverse_neighbor_index, + cutoff_factors, + system_indices, + neighbor_atom_indices, + n_structures, + *list(params.values()), + *list(buffers.values()), + ) + finally: + torch.fx.experimental._config.use_duck_shape = old_duck + + logging.info("Compiling traced FX graph with torch.compile...") + compiled = torch.compile( + fx_graph, dynamic=True, fullgraph=True + ) + + return compiled, param_names, buffer_names diff --git a/src/metatrain/pet/modules/utilities.py b/src/metatrain/pet/modules/utilities.py index 5795c5aed8..2414f2f1ca 100644 --- a/src/metatrain/pet/modules/utilities.py +++ b/src/metatrain/pet/modules/utilities.py @@ -52,6 +52,37 @@ def cutoff_func_cosine( return f +class DecomposedSiLU(torch.nn.Module): + """SiLU activation implemented as ``x * sigmoid(x)``. + + Unlike ``torch.nn.SiLU``, this decomposes into primitive ops so that + ``make_fx`` produces a backward graph without ``silu_backward`` nodes. + This is needed for ``torch.compile(inductor)`` to differentiate through + the inlined backward when using the FX compilation path for force training. + """ + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return x * torch.sigmoid(x) + + +def replace_silu_modules(module: torch.nn.Module) -> None: + """Replace all ``torch.nn.SiLU`` instances with :class:`DecomposedSiLU`. + + Recurses through the module tree, including inside ``nn.Sequential``. 
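+
+    For example, with a small throwaway module::
+
+        mlp = torch.nn.Sequential(torch.nn.Linear(8, 8), torch.nn.SiLU())
+        replace_silu_modules(mlp)
+        assert isinstance(mlp[1], DecomposedSiLU)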
+ """ + for name, child in module.named_children(): + if isinstance(child, torch.nn.SiLU): + setattr(module, name, DecomposedSiLU()) + elif isinstance(child, torch.nn.Sequential): + for i, layer in enumerate(child): + if isinstance(layer, torch.nn.SiLU): + child[i] = DecomposedSiLU() + else: + replace_silu_modules(layer) + else: + replace_silu_modules(child) + + class DummyModule(torch.nn.Module): """Dummy torch module to make torchscript happy. This model should never be run""" diff --git a/src/metatrain/pet/trainer.py b/src/metatrain/pet/trainer.py index 8194adc7de..e7264601ca 100644 --- a/src/metatrain/pet/trainer.py +++ b/src/metatrain/pet/trainer.py @@ -37,10 +37,145 @@ from metatrain.utils.scaler import get_remove_scale_transform from metatrain.utils.transfer import batch_to +from metatensor.torch import Labels, TensorBlock, TensorMap +from metatomic.torch import System + from . import checkpoints from .documentation import TrainerHypers from .model import PET from .modules.finetuning import apply_finetuning_strategy +from .modules.structures import systems_to_batch + + +def _wrap_compiled_output( + energy: torch.Tensor, + forces: torch.Tensor, + stress: torch.Tensor, + raw_predictions: Dict[str, Dict[str, torch.Tensor]], + model: PET, + systems: List[System], + sample_labels: Labels, + system_indices: torch.Tensor, + train_targets: Dict, +) -> Dict[str, TensorMap]: + """Convert compiled function outputs to Dict[str, TensorMap]. + + Produces the same format as ``evaluate_model`` so the loss function + and metric accumulators work unchanged. + """ + from metatrain.utils.sum_over_atoms import sum_over_atoms + + device = system_indices.device + predictions: Dict[str, TensorMap] = {} + + # Identify the energy target + energy_target_name = None + for tname in model.target_names: + if tname in model.outputs and model.outputs[tname].quantity == "energy": + energy_target_name = tname + break + + # Build energy TensorMap (per-structure) with optional gradient blocks + if energy_target_name is not None and energy is not None: + n_structures = energy.shape[0] + energy_block = TensorBlock( + values=energy.unsqueeze(-1), + samples=Labels( + "system", + torch.arange( + n_structures, device=device, dtype=torch.int32 + ).unsqueeze(-1), + assume_unique=True, + ), + components=[], + properties=Labels( + "energy", torch.tensor([[0]], device=device) + ), + ) + + if forces is not None: + # Position gradient block: samples are ["sample", "atom"] + # matching evaluate_model's _position_gradients_to_block format + grad_samples = Labels( + names=["sample", "atom"], + values=sample_labels.values.to(torch.int32), + assume_unique=True, + ).to(device) + xyz_labels = Labels( + "xyz", torch.tensor([[0], [1], [2]], device=device) + ) + forces_block = TensorBlock( + values=forces.unsqueeze(-1), + samples=grad_samples, + components=[xyz_labels], + properties=Labels( + "energy", torch.tensor([[0]], device=device) + ), + ) + energy_block.add_gradient("positions", forces_block) + + if stress is not None: + stress_samples = Labels( + "sample", + torch.arange( + n_structures, device=device, dtype=torch.int32 + ).unsqueeze(-1), + assume_unique=True, + ) + xyz1 = Labels( + "xyz_1", torch.tensor([[0], [1], [2]], device=device) + ) + xyz2 = Labels( + "xyz_2", torch.tensor([[0], [1], [2]], device=device) + ) + stress_block = TensorBlock( + values=stress.unsqueeze(-1), + samples=stress_samples, + components=[xyz1, xyz2], + properties=Labels( + "energy", torch.tensor([[0]], device=device) + ), + ) + 
energy_block.add_gradient("strain", stress_block) + + predictions[energy_target_name] = TensorMap( + keys=model.single_label, + blocks=[energy_block], + ) + + # Non-energy targets: wrap per-atom raw predictions into TensorMaps + for target_name in model.target_names: + if target_name == energy_target_name: + continue + if target_name not in raw_predictions: + continue + if target_name not in train_targets: + continue + + target_preds = raw_predictions[target_name] + blocks = [] + for key, shape, components, properties in zip( + model.output_shapes[target_name].keys(), + model.output_shapes[target_name].values(), + model.component_labels[target_name], + model.property_labels[target_name], + strict=True, + ): + values = target_preds[key].reshape([-1] + shape) + block = TensorBlock( + values=values, + samples=sample_labels, + components=components, + properties=properties, + ) + blocks.append(block) + + tmap = TensorMap(keys=model.key_labels[target_name], blocks=blocks) + if not train_targets[target_name].per_atom: + tmap = sum_over_atoms(tmap) + predictions[target_name] = tmap + + return predictions def get_scheduler( @@ -77,7 +212,7 @@ def lr_lambda(current_step: int) -> float: class Trainer(TrainerInterface[TrainerHypers]): - __checkpoint_version__ = 12 + __checkpoint_version__ = 13 def __init__(self, hypers: TrainerHypers) -> None: super().__init__(hypers) @@ -160,6 +295,26 @@ def train( additive_model.to(dtype=torch.float64) model.scaler.to(dtype=torch.float64) + # torch.compile: full-graph FX compilation of the entire model + # (including force/stress computation via autograd.grad). + compile_enabled = self.hypers.get("compile", False) + has_gradients = any( + len(target_info.gradients) > 0 + for target_info in model.dataset_info.targets.values() + ) + has_strain_gradients = any( + "strain" in target_info.gradients + for target_info in model.dataset_info.targets.values() + ) + if compile_enabled: + torch._dynamo.reset() + if is_distributed: + logging.warning( + "torch.compile with DDP is not yet supported. " + "Disabling compilation for distributed training." + ) + compile_enabled = False + logging.info("Calculating composition weights") model.additive_models[0].train_model( # this is the composition model train_datasets, @@ -357,6 +512,23 @@ def train( # Log the initial learning rate: logging.info(f"Base learning rate: {self.hypers['learning_rate']}") + # Full-graph FX compilation (after dataloaders are ready for tracing). 
+ compiled_fn = None + if compile_enabled: + from .modules.compile import compile_pet_model + + compiled_fn, _compiled_param_names, _compiled_buffer_names = ( + compile_pet_model( + model, + train_dataloader, + has_gradients, + has_strain_gradients, + ) + ) + logging.info( + "FX compilation complete (will optimize on first call)" + ) + start_epoch = 0 if self.epoch is None else self.epoch + 1 # Train the model: @@ -389,27 +561,85 @@ def train( systems, targets, extra_data = batch_to( systems, targets, extra_data, dtype=dtype, device=device ) - predictions = evaluate_model( - model, - systems, - {key: train_targets[key] for key in targets.keys()}, - is_training=True, - ) - # average by the number of atoms - predictions = average_by_num_atoms( - predictions, systems, per_structure_targets - ) - targets = average_by_num_atoms(targets, systems, per_structure_targets) - train_loss_batch = loss_fn(predictions, targets, extra_data) + if compile_enabled and compiled_fn is not None: + # FX-compiled path: call systems_to_batch directly, + # run the compiled function, and wrap outputs. + ( + c_element_indices_nodes, + c_element_indices_neighbors, + c_edge_vectors, + _c_edge_distances, + c_padding_mask, + c_reverse_neighbor_index, + c_cutoff_factors, + c_system_indices, + c_neighbor_atom_indices, + c_sample_labels, + ) = systems_to_batch( + systems, + model.requested_nl, + model.atomic_types, + model.species_to_species_index, + model.cutoff_function, + model.cutoff_width, + model.num_neighbors_adaptive, + ) + if has_gradients: + c_edge_vectors = c_edge_vectors.requires_grad_(True) + n_structures = len(systems) + energy, forces, stress, raw_preds = compiled_fn( + c_edge_vectors, + c_element_indices_nodes, + c_element_indices_neighbors, + c_padding_mask, + c_reverse_neighbor_index, + c_cutoff_factors, + c_system_indices, + c_neighbor_atom_indices, + n_structures, + *list(model.parameters()), + *list(model.buffers()), + ) + predictions = _wrap_compiled_output( + energy, + forces, + stress, + raw_preds, + model, + systems, + c_sample_labels, + c_system_indices, + train_targets, + ) + predictions = average_by_num_atoms( + predictions, systems, per_structure_targets + ) + targets = average_by_num_atoms( + targets, systems, per_structure_targets + ) + train_loss_batch = loss_fn(predictions, targets, extra_data) + train_loss_batch.backward() + else: + predictions = evaluate_model( + model, + systems, + {key: train_targets[key] for key in targets.keys()}, + is_training=True, + ) + predictions = average_by_num_atoms( + predictions, systems, per_structure_targets + ) + targets = average_by_num_atoms( + targets, systems, per_structure_targets + ) + train_loss_batch = loss_fn(predictions, targets, extra_data) - if is_distributed: - # make sure all parameters contribute to the gradient calculation - # to make torch DDP happy - for param in model.parameters(): - train_loss_batch += 0.0 * param.sum() + if is_distributed: + for param in model.parameters(): + train_loss_batch += 0.0 * param.sum() - train_loss_batch.backward() + train_loss_batch.backward() torch.nn.utils.clip_grad_norm_( model.parameters(), self.hypers["grad_clip_norm"] ) @@ -538,9 +768,13 @@ def train( ) if val_metric < self.best_metric: self.best_metric = val_metric - self.best_model_state_dict = copy.deepcopy( - (model.module if is_distributed else model).state_dict() - ) + raw_state_dict = ( + model.module if is_distributed else model + ).state_dict() + self.best_model_state_dict = { + k.replace("._orig_mod", ""): v.clone() + for k, v in 
raw_state_dict.items() + } self.best_epoch = epoch self.best_optimizer_state_dict = copy.deepcopy(optimizer.state_dict()) From 133bedc335b1c2fcd960f92f3040014e868a2e42 Mon Sep 17 00:00:00 2001 From: Rohit Goswami Date: Mon, 16 Feb 2026 21:04:06 +0100 Subject: [PATCH 4/5] test(pet): add tests for FX compilation and _forward_from_batch - test_forward_from_batch: verifies numerical equivalence between _forward_from_batch and the standard forward path - TestTrainingCompile: exercises the full FX compilation path during training (energy + forces), including restart and finetune scenarios - Existing CartesianTransformer compile unit tests retained --- .../checkpoints/model-v11_trainer-v13.ckpt.gz | Bin 0 -> 16997 bytes src/metatrain/pet/tests/test_compile.py | 226 ++++++++++++++++++ 2 files changed, 226 insertions(+) create mode 100644 src/metatrain/pet/tests/checkpoints/model-v11_trainer-v13.ckpt.gz create mode 100644 src/metatrain/pet/tests/test_compile.py diff --git a/src/metatrain/pet/tests/checkpoints/model-v11_trainer-v13.ckpt.gz b/src/metatrain/pet/tests/checkpoints/model-v11_trainer-v13.ckpt.gz new file mode 100644 index 0000000000000000000000000000000000000000..c5c58fb9f2a6b0c3bb8c43f860ac6ed8c5779318 GIT binary patch literal 16997 zcmZ8{1yoe+8tx3;-HmifgVG>KE7INF77a6$w4?$eDN4ggH$zGa(jg(;Aw9(04gY`c zIrpvwJHGwK^FD8Ud$VAS!NU54k?e$mbn~#abLI0A6tsBr%F4;z?iFw?{Lse5^9^V( z6Vr!mg7Pd`laOBFKuN;NyOu>$QSPHXCk9cfW~C~{==%OU2K23ItrtYPT12|hk?81L z(fIPZit=(`VkM>TxtutQgEKQ#!jj${KV!46kgduA8Z*0>EgLQ8KQq%!>?0aTNLH4$ zbyjSV+g%fp1AHgz&vNpHo+wAy=WhReIreN^0Y`Im<9 zQ%EbCx^#B!R;ozrrv{qAlL=-SS4H(A!zQcik>GPy{-Y~3^=UMN53c6T)3k?&B(B!( zIVq@L35VHQE8!!bvFQXdHMa#~G=IwUw7m55!`-*wdG4D7OSv$4>mul7y0N6*pu`}H zR48e09<&5ct6jKcO%W* z>S)#Adq2Yt(w7EyhF4=awTyQMhEk|(bvC{`J)YSzY(0FSTrw)t#NBH6s3w-@Wvw&1 zNm-zm(D^HO-y9Ly?6QS6?>);vhpCm;loZM)_2$-)4!UyBdAe*Kn5ubgPr@z@)yhwP z{=T(=#Ip3En#_8cwQeBsBP0s~GpUrEwl3!fZ?)w(URwP99q^=ZB^vWm$~$)|`G@^d z1)J$7SF`;o=R=p)sf<($HFkd$29tPyQB9Ti@qepM8w`ZP45Wggbz#qcn60Mw`@6z# zzBeBJ2q6h*$b~=TV(ZR&uqZ-k=$Mk((ehE$=YnY8u2zujOT=q&u8R%!g!#k1*4xmh zm6}yMe<4aO(7H4pUBX3_Y5N=h8_u+{zQ8{;dQsT}ZPV(%!R4DX>slC{k=`RjwdiE_ zt*P-Rnz$`r?7YZ*M0cCW>14)UMNd!SGJnq-O+pb@et1~v#y#y4LcYa!v`XeXQ;VA$ z+kazT=UrLA8Shp{M8FIAKJ_Gk4c+zXwk2pF0^(Wy>?GCqRg3%bX)48s&?$l&f3wXt zE}PjR@XXtKaZk*a)RYet^73-07c=k6^Z&jobZSk!YysC*u(qY_rQo_H`vkD1)0OmQ zW4DAKW}URQABey5&~%V|t^6t#N32SEn(yyNGBlj|HMUmJoM;BW%p_LLDuLKrP8wMk znjb=YW=G|=2GKkdxOx<}p^=reC3>S_?vH)gPGqWnv* z7jH@VaSP+-O?m9={y-&PQ-7|bRFZR6_RPJRR_Xe~*9Rw_6Eo_Yrw9#vp9o9~bq9Zk zAFn+GE2rcod--@AuJHELs4Oem`pAFW2&C@qL}w2yzy!qWstbE@Bl2YfIr~-+CC#le zt#i9m%rSe6mYBBR#qGUYji*p%?A}8?LZ$6?;l8Cjn6pGC`+tlBEV(5MM{pvf#;2BC zQ7v}AOfzeJ$RqejZ|oSB*U!a0l-WyekQ?E$)X6=akwcB?(!b|LJ5%~{*VCZ+heg&3 z;ce8x`%vlWNE|O)-)E2M#1|?@e=B&U@DMDdtej&0ZclahLXWSrLVs$VzZJ7Z=?ZEoN6b*4*DDXabD{JejYlWKM+UnOg#y{^*1+D06=9N0!W_q$ zuF#B7;DU_gLjPadr5)S9#j$~T$7(%J($y~%@67` z=&r@s3MEA0&jzdu93(}GLk6fxGzLjhqaU>s7 zbK1JUSM2TiiH|cx8n%9HL4g)dfibOQw3|7TnMch1kQ-O^H(MgC${UkypO|%jt&NvJ z?Kh?6!LT)BeKFcdgAEJocf|*PEE!i>@xW)y<{j{v=~iae-q z|3)fy^~unX`Qj?~KdFxaHEjS1!T znXcC67U)WYqDiL7W?aG2E>yRt%N?)Zm1KL>=yqT|r|UJ?OP%5>z-vGS+ru2}jl%!) 
zX#M59PcIx-Td1?OH?e`VU0ohE>Ug@WTyX(DAdlbydznx)NjLefz3D~ham@|xW7)Up z^!iaE?YtANjG=+jG4sjuyh#}*#-=@Hs)w>TTy4zV#?#`V9>A z=iyu;6JiYHFA^iIuiC01u(+ZP%e>*mx;5=RM?Sb3^bmOoVmS=NZnM%0m(`}*Y<`hCL9d>HV_2pB0qG_f$Z(--2K>0eG zqZ?sROjsx8`=?X#x24KEjUNiz7;i~X^Rsd~&zbUNIFDpS<;vdWOkh>h92;MkQI>sq zplPe1KS~zeXKaNsxk6F?aaLO#_bZ`L%J+HtPiK44+*WCL+iMLiK5Iyl zxyFak%fVQ%yq3{vxo{N1-+t_yQKd{7z%wv3$nA*bv_s*-$RGUR+gaCOc|>;=Cs<8PTJF!Rd^eM2$wbUgxDx(A=W zLXAEp_1}qKz{pvOV|qk{X2$ICM*dJth27yn=nHq1={c?TIf0Z$?EFeOXCJ!-*uvA^ zE-E3C?x#^L^RWw+FFZxC-h!G zEN=@ctjsYjfq+Gd%1<0)wM6{R8!=@RITH%w=fSsVBiX$>-jVu@pDFvs%krrv)H{Bv zqi@(14u%MU?I696$zG-WL=|$QLQi`-f7G~mL@2CjVwmD2nd6!i$eEr$5%_7L`vDQ?i9`OXQPLEx zC3N=LD?&#?hR#fvef;RxvcqgK=%-g&H$hqu(0Jz1Z49e< zI+dE#5`5|GvOwt*OjYb}ezEjotx;d|v7X!7zA&b$6f;Z_E9acTR&rX2{>vr5=cMb8 zF+LeO`%cT7t?*aB$CuLUL0i&~i&cOA&he^9M`)YKZTlwp1*e8s>F#65YMC#u&8n!j zRzRD3Z*oJLy8`(l{uop7M$<7oF!^J~|Jq`X@_phr8|Dg2iMfs~SISXe^0684)^6Av zucNI^8k`4BEb&e;ADSr!S()X`DTV%ELVFo@Uo5~~7s6Z@LR=Ttt9}VGC@v(fS+n>H zrg}eV7-YK&)1Hab?8cX0@2O%R$o5xju5vUB zUKx$NoBM77qrQ$tP|6|2N08EQTI2juqD@nZgjWf5k;3^hsiq{Kk>UN2(7$$CtsOii z^sNtGzEgQA93>tWJ#R@w55|6%aH{x(Gnf@MTdp$rIaBym@hiia{QgxwR{XUwe8U|* zJoy8?7mYM32AEqk&2Mx&&GNr#{8DqKR`5=&`Q^DRr5!O9cGZWcTZUVFDpw;@LDMZj z<-{~ro%n9E^2M~BO}icGxC@TokX%KBe0^SVfKP070x%TRpA^d12u1c#8`CPR;JIo2 zrActbP1?ij&{VbSw9jwR7?GFAX?t%4$wY%>z9_|wvc9CTo$!KvgOxqSrps+I z)@?E*Nk5@FIKSl;q{A$MMSH}a^%>bN#hG{6QiWvnDd>9^)8qh?X*(xjTjGKc7ij~* zPI)|7GIR&MsFm|As>&&9(QUJ?UFR!Xs755Sc-{ycmY_~V6Q&RN_Y^4V@+oYz)Ff|k@vOyAQBmpcyitBHJ{ zTcov$N3+I&npEKj7Mbx^zl<2RedYNXPjK)I=tp@{ie@4S=l+CwN`B^i#G~-lT8kU} zRl8H~-ATW#gja`kAaST;{!l!oA*iNb-i5?bQ%qMZUtEcOl}YD?<0nH$37jFhTpXfC z{4jTMMd5sL9rjJuDj3=UkvDzS3r_-^pXlwrOpg{lPnlcL6JL3;7)8E{=^*zi(6aK* z9iL}@u<&vDHmujV=y zYH=6Lm5S(@YDn|Fs73I$_C7kz^|=(@G)`|bR3k99+prNH!MAf56HojOddKN)`HS0V zy;`4pwlaQLL|sG8xo(y3Ojy7aa^tupsv(TdgoOPh%)R}QgT&rs6f0yHl_V+J7f5>k z;>L0MQn3BATH$C(meBpJ)(}J`Rtz2qS-v8*P!dOs#c|L)IqdNYx_g z5=r&ymB1=3>idO27p(Y&K#s!fC#MJUH<((o9$KP)=Uo+Ln zPYrZiK*epr+XwH|o*q0Ljh6I%p^IZ_ zGn1KF$}cs6NLO70p3#X`|5yW>(HT|}zwnh^@nd#eK3iBnzchHDBAQ8${A+Qr?%Rao z_P|Vd{J16-!MGh-ss`c_Bx$WZpFO?^!=-QOphH4mn#uidBJVVx`yNgM5pJCVTIx9X#xo&?=YT^D(Zzl%N*sRXhSQmE#t1u zZ{G#eVcpS$t79UchQew%J}w`JZ?wvn9Hzzc4%~`Pi2ES&G2vfg3gva5Yx5wVXp!J$ z$9@CL8ZGq3aPp*l>hr%SRJcRunt#+nkUZH=0&KEQ1nqnmON}tojQR#^9>Ve!0(s+C z5C&!ABah+-WMiw^LWvbIlS}MMo;tfnNn$yM%0Ar(e8-1hmkG)-Sf}J}#*^L;QTR7} z>SuHF1sf9XA2klBI59Z=XzSqL;A6^?@^(+Gz;jPh@#kC~`820E^7l#=Q(!F7BWV(6 z6q^PMsoYAP{8K8%LfvThJn7*zf1jZ!s}9PlK`4obxi74>i5-ik+x$7UAUl@FlFnaW z_4;MOVBTanKOEmUbh3LYS+bzInRWR2pjHTl%opY-&6YwUUYeyu?xC)s{hy}-w2d`( zwd(uY$^V9%`aV)9T1}-SCwAx!6F|)4%T2wb38@IPD4@YIw@=tV{J{{Aq-7S+w!8BF zBr6~sjj&_15?Vnu$r^((;4g3L(fbgaqU^8RwS-R%A+k8;>W?3cUi z9opw5d@nM33ACnnw(DFY|WzNYAag5i`G`N{i?N~8ivW4~JVX{T`bYpUaJsWre zo1@0rwSu=sITS?9xXY?n+WK%%zrP^1zE|XxEr}$3 zH$fbVq-maX@cZo5?Q3aQv5bi74Rx$k@8|Ghg^$6rOB{J}C)mZD?tSzyJq~ zoyp|^EE=kZcZ)}!uh^cQo|4xlR3uCB2<=f&qbx4AL^Sr`$j z+rER_=+G7{SSlH>aj$+@3*Ebx z2k=^ES4p@z`ZNJ7%~alt+o(FpEOMT&=7I)ljyb}B9wYYmse)t5?hx9q>wXiTi{`lW z7F2i(D8QioTjuVQr69pWX*348r7O@i2+9~Tk%IF|tFJgTAWWZ7wJZoP=Q-$&VS8~h zUHhPnCcOQ}7X@^>&eHd&yTZ`GXLU|ONu*ek)bBD$EG?LJMc}Py)6}puWBFEXm?O|) zzm|=Px;;QAe9+b}CL)5XbnhVi`*;v|hirWM4Hi5bbWs(eDSU|@+!h^7NpwMr0>NBT z0^Nf3$GCg5)%5vRN%D|M#y7Nbso*zC?ZqFu*xq%lkREvov10H*7Z0ey*x3KV@)a3A zepab-V&TG4$L}0PGnD8k29K|BvhVS zJ7U$k=Q+xhnp*=08T&~P6dydT(*7-3@j#pa6| zyRp>x;X`Lp3hC#$Z653iHb1FHk-#Az!LHiIgFOTdhsjgnjS2n@2qslw91_TV&f zUj4Q5;Sao@qUgU96e)2)8^N2#=rI(d(t_hQCn{t^$|o9lz@cN%5A{$2boK(~#5TnO ztDy2hMY&>z$$~IJhfiU~Wg&RFmCh^=t0h#N-wHtAfgB4%^Z@Svh==nqsTV^3q)lK! 
z!hWH+;&t?(>{3AEP-7-|ddt)(GP~lYkF;YLKx-h*RQF%5tiDwH znq+TdFQanbe+LQkpyd!kNjJrY*oBCLm8>lVQPhbHao|FrO3Rw3=*&3!LwH9l5==G5 z1UagpPo_t&6$w5t;?cc(GzlWt;P4j}iFgxr~T?ydpA>E(I zZL^Y;c+%si(Y0f6!PQuD-rR=OiA{%EjgrK}>*rWT^D#zz7}J5ECX4pGOerXg&1yOn zWtTaO4}Dq)Dvd|sqL;9yMnP|jZXBTKeDne&jgj*Kha%IK-2T+(Q3*#_slJWA3+Rj$ zJ~_%~9BWJ8k%iNlMPWJ2>YivQaiUq3^j2-z2Q0O*kx%&21Q7a0pZ-gQU>m{A8{YeNJdW zTo_sMO9oiiG*Acd1-@aKiF~Ub%vm()U}XDo(Zm7x1Qq&otjt(*RT&S-^_%OKR~OEo zHcKtH#mew5>8&U`VX|n`*;q|zur|<{xM8V*{6iiikz}68IQ1mM*ykZP2C6V5 zca-}t1(IZi2yN{o$H(%ov^P&aCXw~?l0%(98qf3Qs*|{}O1UxgpIE*{n*>>Ef+p3L z%O)$BYLdxHXH_!u6DIJACK^G-m^la>{k?p!{iN1sBbLEZLHkKGs3v$?qrEs(MA;Ho z5XOdL!VD7y&4CVSV6!#DigoJ^yW9MU7aW;@x+YSf)Wj z7zp0qZk>!Hq}rZ0ugK2g6+OPJ3(`a#dDEyQ`NZDRC%pd9qqC!pUN{3AZUkz6(*7-d z@aYnrpyxxB4Q5Xn5GBZy+0e~co`TD$CP9)oFm-%;9@K~_cg}5ClhjQWWKoZ@PszOm z9(2ZrGoVdB0qW6~tK(z16(KHc4a)=!QwE)J8oKJsGw4)0a_|s0sW~6L1WAK)CUAf@ z!+$5s_3*mJ@M1@xssMB$0?K{?+hW}zu9wIA9SPUO3ZQ^yfG_9_-OS}{ISt=Z^&LkB z$vdAafm+aV%HEU^kor=0T3{Q%#d1+e2`YqXchdmh>HEG&z)9i|uiWo*q;W)$r7|Q4_Z)81~7^kPoa+ zYxNfN5e#c=?p_+}yFlmY-bc|kHecsBO}Y%HJWak_!@>7C7w9umRC%(WdYk)$zN7yg zod0bepW}uvs8IY+LAP$ z1@-4S?s^zkeSRO_!;gCvIPnWER0y88U*phmx^N;KJp<$Tj0`Po3?$skx`Ka1>-hcl z3jMNT{79mahptX+l;v?XNxnm+V~u*ns#g3cEn(dbXc`kb#Y5NRHA)#Q9vOT?G|d={ z3i4qAVY1^6IGC#G)7ugm)AQCypvgS}ivXiD`FB{l9otG?>K}y)A(Sc_Y}gA>Q@V4q zAdRY6$I}Qa8?Byeg)RE>GI`~8 z6^go*K#inw-gjGEli+ohr~WyP5noT~KI-%h$X~8KFPT?9B z6r-hDz#C+eGSOAx__7Vi7&}MrO$p4h7$8_v*;zU-iDcxBMnYM1Iu5 z{^-y-P`z3Ew-3$HaC+;dXK4EhOYWdn4ER(sV8cJ9bhek)@l>;Dq7HN-23>dobLQ|D ztnyBN*=m`s1-=q2IR|-xEDw05-y{$como+NX+ZJSxVTOPJVtMlSR9XZ zin_mmBrtMT-o(Tv2E6-|9V6LA3T*X)RXU(+6llSg7&Yl7Nw7l~yn^UL6lECYkQNk_ z!ONR}`K+D0GvRgQt-13lu-##f!~<_(s~mX9#{rLS9_Yd=(k9TMHyE%*a#!PCrLY^~ zcQ;({hs?YD&>MW%KwpQ#CmuD5GS-LalvnvGyfoTAS_(LXY8tsF)rt8jBOt$5(T?-# zYHGm28AoOwAG!r9O;V_H;?N^D)Wj=&52wK@#fBS$N;4nu<`*RcRUj`k;%rtl;Rh-v z%)!9X&&+3jC$5Hl;P9OItjb1T6FiA$IkY7<^mG)(imwxzIf!1geQ;b&Y zz#PRA&m#AjYNm1iI~Nd+%ZE~974PcVC(bn-P(N^CU5Fk}13~9i7|)VlE(*e0UmR4*2uJWt zS0{q8ngsD^IpL;Qv}mw)5N#3=19dU*?)8rOjJ>Fv7le;Ff`uMKAVkus{53NB)5nnK znw^7BQ9bcMKe(sgB$*IBdWP3b9_)_x9y13M`uNVhDS8P7OW%`f17}f2Idl+rzDZ45 zBw;5<7>{M17Fa1wPY|vK(pRfxU08UUG6F93k#MQftpY)tH^nc`{=@k@H^MBVV79IM@TrxBn3d< zpwpKZ_dA^8;G=Ofu+gJ9eh*QSY#^*^l>Iz`s-UkfP=X0QBZ7bA2D;4-%l{Fc$wRn4 z(=IVo8VB}6J1ks;=L4@qA9wXd{rEB|$QVCI^i7ODi!aG@4;LN99rH=6hzm0NGhnX= zFZnfUmPEPAEj<Dq*uRwv@bhQHMWPPwJ^?YHEh*!O?X41_$Woxn$}d@=F@To}KvJIo8XkYMosfV3 zWnwsAUlwx&8;o$+}QuJSVNMGuYt`Tv}w?&MEm67QtpXKw4-3}fsbEPWcm#lE(`Wi1>U=^ z!{Ke+@Bysm5O}>V166iEDt-}QB}^BnhNLu_%1>T4a}L-52drJsMWJt&NakP z8q~t55Isz%=s!c+N|byji0M(Uti6iXNMl4qkB!BdgJZp70B!F%9oFqB>g(9Z`@ANps{aCyhsJD9wkRde0VJCV`wD6!vl8(RRMfZFUzF~Pw+~J2o%N+;t;j$p%CSoWmcvBnrG7Tkq zWQa!>0?);w!+>Xi(mrqlJymWy@$@N^U;g+uCuklo#}&QAONgfP2hbCxTHrcM{S}ZG z5BxWd7`?4jhm@7!d(eQAAp^V_H1M39Vw8*%7?BEHh@J+!iOk~Df<3|g1#~jf> zFR>TG?L5FTfV<|RtXt{>8xo=A@!fPp3FE`-q`-zr5_?g?_c5lW?^5Xh0cbVrt_6-7 zA6X~^XoTdh6q(-1vOc{2YIrdqP!$2Lr-Sm4xz6^nUgnq5ZB{C<53Tz0Op3A z*-8j1F@8N)D=SAIkpM}5wVY7R>N#}iAcvV3Lh;iko7`e_)vCz z!&*QvO7NL>F)YEpBt)qosyTMrAfKWR>?jE}|7!9^$QJY60U71(L*DE8v>6vr06o;7 z%=J?rEmnXIUI1>e2g@NnXdiv4=`Mmm;4dY%vU!Wi#?qjibMQ$pFc2?^ItkzLEV1OG z5LoFmqVAGH>49;|qD*w=MCGLmrGGO05*mmB8@?+ZjuhV6KGm))pc7E!3cx2{LzFNd zhyJmM!bQ69URR~fh=Z;vpo?6ATWsnNJ!QEh|3#Uv z4?!I{05z)AIE#JiWu&@ws7HB-O@2see<}`th`Nh2@olCzpxF+dY$G1P` z(<0Nj85sUnV;A8fq#--@+M^eCSVd0FMtvda*w;e9@252E>KXx3%bNi-!@ok{j}02o zw|u0Q#K3K%5O}gcBY{^Xoyasi>Ejt;g#yZ-D?F?@KN(+0{^;p#f2f9I$+*4EUmne^QP#3gs$55_LUlvZ{-|+lJM^Q4DiD;01 zcDxsA;jXVrP_tYfcda{9-V+qKYfQTIK?u!j;H<$(=aVV%?bla99DW4|VfFrRJwGO5 
zGgz8=PHuvkSuhD{=^4ej-2Gy_4*!JjHP%qgwPrS%ZBXnrUh(cpym1$NeeXPjw6}XY`*zs7I)>ffW`klDFNe4&%Da5Gj8rI*S?td_St@zan|8ejNU`h^ zpS}sl)0&SDOjk9eFW&Sj1%8ha(l%aa17{n&ZNiah-b4DWkj&~eGYiZ6nGO=aB4bQ( zu|KkSqO595w#eXsqo=D(QbQ^`r!y+S0uC_20t>N%TAT7~VpUKq2y?r3fKRE=1afD) zv=|E*4P|QYjd%LhPubo${^XSL7&eRjjchtN(!z*Q&tg(J9dTOpWao=Q94nds8aHw| z{G2-M0K3Cmrb#fh_A@Q~Mrw&$x=`mM{a8i3o(Z8uG~Vj{`5TCre~LPrhCUD;AqLn4IyTRc-z`Q-`}$M`RR}9t+}^bQ{*S9 zj5ho(W8Ryi-X4;5uNS8+y#GFCoj;+J1z}0&H?9m9U0&=MJ3Jc>H&_q7{U}!Q(88?E z>+_QZ$(N~G6q?v?3{A=V7rpHrR!UMKZd>HGT66guh?kmU)JrXW3&}Gyr_|C2Cx?Z7 zb7XRqWb4Ap*5%7&nGwoCm4zcFJ=4MBUcW;)d(n!sD;0m{}8r9#dbkQ_VpL zmUw~_mvQgSUll{AZ8hOCKDV>KU(oC<7S-HRUe!v}A!=JYb_j>0=Cm_pzBC|tt~={p zPKqwfly=1TOlw=2Q##j~CuL^eLwfShF6#7Z_8Yy_aUD??`HKD9o+4xQk+fnyO^d09 zv;p2*zt2mb23G0c9Ek6O4%FT2#|=!b9NwHls%TCY?B;Gg`3;c#UO`P_)aBh-z0W?E ze!neq-jO|clWLyX=I3DW!PsNZc`cB(KTm2uOONK%&G~er41t1JocpW)(TNvgcC$Fd znExcb!uXdL;K=MIyGo6Z^LW{SLXhS()9=K)b{D(`d7YeL;4&sod%F3>%3m^?9q2@0Fe1#tSP- zkSod?2=@ilw^}@&_64=aZAQnZi^%6I=O%4eCB9@U=OtlKJ!J1ii_-bU&7k~5fCEPM zf@sMaLi_pbY15oY8KkrKsC=G!Cjk3Bs!`KdcG5n2={)L13tmOuR<*ZlewV{_p2yk4 zIkn5Y+yMnYaCLOJ@-S`}ecD)~J=#jb-T~t`+k-YHTq5=TdBvpkf~+Xw=r-ul0>m?n zC74m4D%sFCeo|lN)1aANO>~}ehL?q?%jamwhd3nV!o2Uv#1Ev?O+|Np4RQ_25$|^n zf86WitWN57(-wYNf%KSZ^>|Vx9DNH3CW0+7ZXq=ruaGYw(z0_aoSST-W0zCq*uPGW zf9b~@CMp>td_`|ZF9VP}b9VGRi(-^manE!Xf7V4LlvaDR>3A>72q9Wq$j|nI(bC&Nut@e0We+T&!xIa4>mNnV? zo(-v$uog$UzU}e4ow1MIyRI4{ySj9sC7ru;$hd^q0EE_ag!+E&CA4fBYW>``lsw1{r5~rtt245D{$w` zD*EQSYNhQm0=b@z%z)f_++HFxE*&B+kzax19a4fF$PE&5Jd7NYMT#S@ksa4noo8)g z$X%GrL7VsadW}V!;0AHrIbq}e;xk0h!lzG*OTa%CoC zzPH1@k~>~2r)G=zzM3Visu8z?6!_HUi-A735@m>3f*VNQ^^n8~<;GNS5Hj)O+;a7{ z$(v0ecvQp8dq2JyBU+NRa>ThZp$22#`goh)m5O|e8 z@ACy(UgQgLx>M%~@$T|#O+Q52DF;LFZGqWwS4Q>r&K8nk$i-vr)FrE}=8Jxyy6Y8V za@MWKSa_@FP1Q+V!|QvQr*7$}Q?dsX9s!0Wt*tKYS=JbD1vYe@^`! zACc?sUEXlDy(7Ne_4ey($7y)x_^P&z5^b_gr-)5~AAg37ZzUQcm(YcOzT!nTBq4(P z29k44EjME^QXOpPI>`>)a5Ag2A@jmD=o0Z5n0 zF{B%DIYdmH_99I`Q2J((uj6$1Ta(M}H$B;&^EP_`w*!I9yqlt_CthBB8B&?%%A#uW zYVrNSnP}Ue3RD}Stn8)S{ZA)0KKI5ZUNS6FJeox9M>d8>WdeJnV+(R<;X42FDmXdk zdSkd)zf@25-FX`Y;A+la+#}HC!2SxVvo?2?%^l(A=-G`nAS- z-d1%0j>)`YTwkJapmdq2yS|!2-l?B~Md>Ju%feoH>`7+?JfXeHa{yknE)D zs-$;g94+>Ux4N2oeIpK2{jv8=Z}`cj!}+d|X$hp@DUw4QVbo8vh_Ytk0T%~0ACtK6 zJdl#Cowlp+ooxTCj8UDtG!1i`BCD&;egmihI`jcLOp<8c^r!tbj|eQ|cshoANdpN|9vPO;yh7zb9K|EH+C%>52)aIG-jifFD@-=wqY&S=geL#9cSa_kf{bCOBj&PwsZ}IXV zwM}zsqwVt+zTJX@qFLKrzATK6sReFfrxhmKP80D*Hx-)CkyH6=wrT!!ja%?I&U*?8$Frd83VZHIkbvysGA|JKoiPA_)B1F8)mVi{2vf%HOHi zNPo-GYXje#5A+$jYAOFxDG7l79cacmAQ8p1B+I@LhsE87p3N2^JmmC{sPQg zSwsL0wN4;hdyF95Yw`Me&rHj5r@*?e*_8!9FnhY|uK^|?}fV`P} zVghqW=+4sPGK63j^6mB((zqr%;&!8D_2$6wzF&eioHikidQwKpB|)(VkW65-cr*p! ztyzd%e~7pr8$tvRBX$bb?=ztD99iIoG{}4;%tlzwxLYhn=yuh%i6Fp7AQdkG zBCGe1d6y0sflS3WE^|nwDe><7YihGA*>PZf2Y7ye{&9Aj`l|-{Wd4@F5h%mw_vgEr z6uEWF|8(k>@aC$GO2_p8LY>U@4Y&mKPY#gQ8tYW4ZEWknLbB-Hvd-?LtZ?HQ3uvCRCerTB9`eHB3?g{VkU+JMw7!sL`YwUoo<-(Dkd*(i`uJ{? 
zC~E+Q0W6z~$ZkVxVD<=hzxl5S8z4fVJ_v|VxZDYTq*_24?+V_@(h1JrX#fL8^mcd` zaM8jf`1>B&?cNRqz;c@oAfF%^yF1c!o_WMv-tzZ2#{*3DY)4vMO8!rwlT=Fyv#$v=O6ri|w;t zZ8ff6-1SQCcCvDPR?>~(LKgeln4)^)uOojzF@3fU=disYE-57Y^xm zq%jY`Cc})`^!x#GCM>FlU!iCT=@x3y(%?F+i^R|7$uk}bN;A404MnE?N0@3+1@N79 zwSWGq5UDt{_p*^0`R8pPF=IL%-!M*+rDl7}Ay8nP`W`;ivv3RI%1`rKx5 ziGOS!x;>^u$VoU|WRI=>`PO^*i*eXrusT(prq#7k>T?vH$L~{8S?T6JJm31qLf1uxl4%ym)FJ@9F9WUz{F>j-~cYuRqk zD-0)#$(e6}zLB7W!58U5$3?k4gM3CHdDW#kHy z;SPC}ET%aPa_=UEhtcSY6!P9*5qS!_lkdtdrAtJT{3~NbgPasnYi_6<(02+r>rC-c zx0@;>zT_|HGT~|R{Lmw-5L2h;wZn-UGzS^SUMT+JfA^erJGp*AF-$Z7aGAmG+NH1 zZ+C(HT22Q}98${n&}J=h*%n9uH6i1BXh9}-+2_c@P3&O=Dtk8jg4%iXf|y?i$nEoM zJQ1hsm&VP)h-(kuaG?OsHYB-ps0Ixr@-D~A4v|Wm9zd=B+JJoiXTet~1?hF4xa@B7 zKiL!>aWY8a3hjGayro!ycU2daZAYHl7D{puDT91c4U`S3th$;6rHD0$+qKaosGBe} z35DPV_**}nqAe^2BSBRDXeI4|P!7uKIaA&Smy#_k0|Tb$@rQ_<$t8TbfLh7+kOrU0 z<=yfRVpXtmA;|xPqSB%OY>WTmS}rFN(6i-xajlpgiFY<)FBJRBMjNp6I5p0Tdxc5pCB6p&U z21GCZRT2l3%#Ly(RZ~T{y`5a>E&pF^#o!-2?&v!E1KYy}38wCNBE@)9F{zE){ zA|RAkrl14FrFd5!KF_TcF)T*zrF5wPshstNOObm|Zh`;tluDV{o(g#K|0gKPl?OtP zp4?O6V)73a$B`9|JJfc3FPa;8AE5pddglY7i%%F@uf4wARc5m+&x6pa`{allD%r}R z7e11Vzj${~4MjmD-hCY$yol~yGNF+hYWXL_W(vgA0m>%SJV4F3$=tzl{`qsm@vdY% zs-t@UDZyvyk$9~I;PKCyMIaZuV;5rTYdO?u?hMOhl=nJ!EYy820MmWY{H_ELxm4=< zp6@{@qmxUi-sjAC9iaW}XC65J;h=#Qxqjly18|5sY-Bru)a^*!?i^0`+s^`lg#Q!B zF8BIu&{r@WaJm=7t)4(UC4cRv3svnlo%8r78HUr;^+ zD2soLWmS~sDd(YR0C&zKpxC>pcF$XxywqO$mwT#2&wt`Ofu7#QahJ8p zNPzYO4Rk}mwCI0m7d`%|>G)sP%bcY4w&J;gIUxYh=JFI(;2%pYb3;y`nETumTbGxi z;{%7t7I=$;)Q3uVhr!+&({GLQx=)qDfr4Q Date: Mon, 16 Feb 2026 21:24:02 +0100 Subject: [PATCH 5/5] chore(lint): for pet and torch.compile --- docs/src/dev-docs/changelog.rst | 8 +++ src/metatrain/pet/model.py | 3 +- src/metatrain/pet/modules/compile.py | 76 ++++++++++++++++--------- src/metatrain/pet/modules/utilities.py | 2 + src/metatrain/pet/tests/test_compile.py | 4 +- src/metatrain/pet/trainer.py | 56 +++++++++--------- 6 files changed, 87 insertions(+), 62 deletions(-) diff --git a/docs/src/dev-docs/changelog.rst b/docs/src/dev-docs/changelog.rst index e3656642b0..7aa1bb6ae7 100644 --- a/docs/src/dev-docs/changelog.rst +++ b/docs/src/dev-docs/changelog.rst @@ -24,6 +24,14 @@ changelog `_ format. This project follows Unreleased ---------- +Added +##### + +- The PET architecture now supports full-graph FX compilation for training via the + ``compile`` hyperparameter. When enabled, the entire model (including force/stress + computation) is traced into a single FX graph and compiled with ``torch.compile``, + providing maximum kernel fusion and consistently using scaled dot-product attention. 
+ Version 2026.1 - 2026-01-07 --------------------------- diff --git a/src/metatrain/pet/model.py b/src/metatrain/pet/model.py index c5cb7e5b5a..be8a146cfd 100644 --- a/src/metatrain/pet/model.py +++ b/src/metatrain/pet/model.py @@ -1415,8 +1415,7 @@ def upgrade_checkpoint(cls, checkpoint: Dict) -> Dict: def get_checkpoint(self) -> Dict: # Get state dict, handling compiled modules by removing _orig_mod prefix state_dict = { - k.replace("._orig_mod", ""): v - for k, v in self.state_dict().items() + k.replace("._orig_mod", ""): v for k, v in self.state_dict().items() } model_state_dict = dict(state_dict) model_state_dict["finetune_config"] = self.finetune_config diff --git a/src/metatrain/pet/modules/compile.py b/src/metatrain/pet/modules/compile.py index 56287e1c44..f27122cf93 100644 --- a/src/metatrain/pet/modules/compile.py +++ b/src/metatrain/pet/modules/compile.py @@ -9,7 +9,7 @@ """ import logging -from typing import Dict, List, Optional, Tuple +from typing import Any, Callable, Dict, List, Optional, Tuple import torch from torch.nn.utils._named_member_accessor import NamedMemberAccessor @@ -22,6 +22,8 @@ class _PETBatchForward(torch.nn.Module): PET is registered as a submodule so its parameters/buffers are visible to ``functional_call`` / ``NamedMemberAccessor``. + + :param pet: The PET model whose ``_forward_from_batch`` is called. """ def __init__(self, pet: torch.nn.Module) -> None: @@ -57,13 +59,31 @@ def _make_pet_compiled_forward( output_shapes: Dict[str, Dict[str, List[int]]], compute_forces: bool, compute_stress: bool, -): +) -> Callable[ + ..., + Tuple[ + Optional[torch.Tensor], + Optional[torch.Tensor], + Optional[torch.Tensor], + Dict[str, Dict[str, torch.Tensor]], + ], +]: """Build the traceable forward function for ``make_fx``. The returned function accepts all batch tensors and the model's parameters/buffers as positional arguments (required by ``make_fx`` with ``functional_call``). It returns ``(per_structure_preds, forces, stress, raw_predictions)``. + + :param batch_model: Wrapper module whose ``forward`` delegates to + ``pet._forward_from_batch``. + :param param_names: Ordered parameter names for the batch model. + :param buffer_names: Ordered buffer names for the batch model. + :param target_names: Names of the prediction targets. + :param output_shapes: Mapping of target name to block key to shape. + :param compute_forces: Whether to include force computation in the graph. + :param compute_stress: Whether to include stress computation in the graph. + :return: A callable that can be traced by ``make_fx``. 
""" n_params = len(param_names) accessor = NamedMemberAccessor(batch_model) @@ -87,17 +107,22 @@ def _make_pet_compiled_forward( ) def forward_fn( - edge_vectors, - element_indices_nodes, - element_indices_neighbors, - padding_mask, - reverse_neighbor_index, - cutoff_factors, - system_indices, - neighbor_atom_indices, - n_structures, - *params_and_buffers, - ): + edge_vectors: torch.Tensor, + element_indices_nodes: torch.Tensor, + element_indices_neighbors: torch.Tensor, + padding_mask: torch.Tensor, + reverse_neighbor_index: torch.Tensor, + cutoff_factors: torch.Tensor, + system_indices: torch.Tensor, + neighbor_atom_indices: torch.Tensor, + n_structures: int, + *params_and_buffers: torch.Tensor, + ) -> Tuple[ + Optional[torch.Tensor], + Optional[torch.Tensor], + Optional[torch.Tensor], + Dict[str, Dict[str, torch.Tensor]], + ]: # Swap in the provided params/buffers via NamedMemberAccessor params_buffers = {} for i, name in enumerate(param_names): @@ -105,9 +130,7 @@ def forward_fn( for i, name in enumerate(buffer_names): params_buffers[name] = params_and_buffers[n_params + i] - orig_values, _ = accessor.swap_tensors_dict( - params_buffers, allow_missing=True - ) + orig_values, _ = accessor.swap_tensors_dict(params_buffers, allow_missing=True) # Compute edge_distances inside compiled graph (differentiable) edge_distances = torch.sqrt((edge_vectors**2).sum(-1) + 1e-15) @@ -139,9 +162,7 @@ def forward_fn( energy = torch.zeros( n_struct, dtype=edge_vectors.dtype, device=edge_vectors.device ) - energy.scatter_add_( - 0, system_indices, per_atom_energy.squeeze(-1) - ) + energy.scatter_add_(0, system_indices, per_atom_energy.squeeze(-1)) if (compute_forces or compute_stress) and energy is not None: (dE_dR,) = torch.autograd.grad( @@ -168,8 +189,11 @@ def forward_fn( # Virial: sigma = (1/V) sum r otimes (dE/dr) virial_per_atom = torch.einsum("ema,emb->eab", edge_vectors, dE_dR) stress_buf = torch.zeros( - n_struct, 3, 3, - dtype=edge_vectors.dtype, device=edge_vectors.device, + n_struct, + 3, + 3, + dtype=edge_vectors.dtype, + device=edge_vectors.device, ) stress_buf.scatter_add_( 0, @@ -188,7 +212,7 @@ def forward_fn( def compile_pet_model( model: torch.nn.Module, - train_dataloader, + train_dataloader: Any, compute_forces: bool, compute_stress: bool, ) -> Tuple[torch.nn.Module, List[str], List[str]]: @@ -258,9 +282,7 @@ def compile_pet_model( # edge_vectors needs grad for force tracing tracing_edge_vectors = edge_vectors.clone().requires_grad_(True) - logging.info( - "Tracing PET model with make_fx (symbolic tracing)..." - ) + logging.info("Tracing PET model with make_fx (symbolic tracing)...") old_duck = torch.fx.experimental._config.use_duck_shape torch.fx.experimental._config.use_duck_shape = False @@ -286,8 +308,6 @@ def compile_pet_model( torch.fx.experimental._config.use_duck_shape = old_duck logging.info("Compiling traced FX graph with torch.compile...") - compiled = torch.compile( - fx_graph, dynamic=True, fullgraph=True - ) + compiled = torch.compile(fx_graph, dynamic=True, fullgraph=True) return compiled, param_names, buffer_names diff --git a/src/metatrain/pet/modules/utilities.py b/src/metatrain/pet/modules/utilities.py index 2414f2f1ca..2a3e88f4a3 100644 --- a/src/metatrain/pet/modules/utilities.py +++ b/src/metatrain/pet/modules/utilities.py @@ -69,6 +69,8 @@ def replace_silu_modules(module: torch.nn.Module) -> None: """Replace all ``torch.nn.SiLU`` instances with :class:`DecomposedSiLU`. Recurses through the module tree, including inside ``nn.Sequential``. 
+ + :param module: The module to recursively modify in-place. """ for name, child in module.named_children(): if isinstance(child, torch.nn.SiLU): diff --git a/src/metatrain/pet/tests/test_compile.py b/src/metatrain/pet/tests/test_compile.py index bf88f03403..7ca028b824 100644 --- a/src/metatrain/pet/tests/test_compile.py +++ b/src/metatrain/pet/tests/test_compile.py @@ -70,7 +70,7 @@ def _make_inputs(n_atoms=5, max_neighbors=10, d_model=8, dim_node_features=16): def test_compile_cartesian_transformer(): - """Test that CartesianTransformer compiles with fullgraph=True and SDPA attention.""" + """Test CartesianTransformer with fullgraph=True and SDPA attention.""" ct = _make_cartesian_transformer() compiled_ct = torch.compile(ct, fullgraph=True) @@ -145,8 +145,8 @@ def test_forward_from_batch(): from metatrain.utils.data.target_info import get_energy_target_info from metatrain.utils.neighbor_lists import get_system_with_neighbor_lists - from . import DATASET_PATH, MODEL_HYPERS from ..modules.structures import systems_to_batch + from . import DATASET_PATH, MODEL_HYPERS torch.manual_seed(42) diff --git a/src/metatrain/pet/trainer.py b/src/metatrain/pet/trainer.py index e7264601ca..c114be5dcb 100644 --- a/src/metatrain/pet/trainer.py +++ b/src/metatrain/pet/trainer.py @@ -5,6 +5,8 @@ from typing import Any, Dict, List, Literal, Optional, Union, cast import torch +from metatensor.torch import Labels, TensorBlock, TensorMap +from metatomic.torch import System from torch.optim.lr_scheduler import LambdaLR from torch.utils.data import DataLoader, DistributedSampler @@ -37,9 +39,6 @@ from metatrain.utils.scaler import get_remove_scale_transform from metatrain.utils.transfer import batch_to -from metatensor.torch import Labels, TensorBlock, TensorMap -from metatomic.torch import System - from . import checkpoints from .documentation import TrainerHypers from .model import PET @@ -62,6 +61,17 @@ def _wrap_compiled_output( Produces the same format as ``evaluate_model`` so the loss function and metric accumulators work unchanged. + + :param energy: Per-structure energy tensor from the compiled function. + :param forces: Per-atom force tensor, or ``None``. + :param stress: Per-structure stress tensor, or ``None``. + :param raw_predictions: Per-atom predictions keyed by target and block. + :param model: The PET model instance. + :param systems: The input systems for this batch. + :param sample_labels: Labels indicating system and atom indices. + :param system_indices: System index for each atom in the batch. + :param train_targets: Target information dict from the training config. + :return: Predictions as ``Dict[str, TensorMap]``. 
""" from metatrain.utils.sum_over_atoms import sum_over_atoms @@ -82,15 +92,13 @@ def _wrap_compiled_output( values=energy.unsqueeze(-1), samples=Labels( "system", - torch.arange( - n_structures, device=device, dtype=torch.int32 - ).unsqueeze(-1), + torch.arange(n_structures, device=device, dtype=torch.int32).unsqueeze( + -1 + ), assume_unique=True, ), components=[], - properties=Labels( - "energy", torch.tensor([[0]], device=device) - ), + properties=Labels("energy", torch.tensor([[0]], device=device)), ) if forces is not None: @@ -101,40 +109,30 @@ def _wrap_compiled_output( values=sample_labels.values.to(torch.int32), assume_unique=True, ).to(device) - xyz_labels = Labels( - "xyz", torch.tensor([[0], [1], [2]], device=device) - ) + xyz_labels = Labels("xyz", torch.tensor([[0], [1], [2]], device=device)) forces_block = TensorBlock( values=forces.unsqueeze(-1), samples=grad_samples, components=[xyz_labels], - properties=Labels( - "energy", torch.tensor([[0]], device=device) - ), + properties=Labels("energy", torch.tensor([[0]], device=device)), ) energy_block.add_gradient("positions", forces_block) if stress is not None: stress_samples = Labels( "sample", - torch.arange( - n_structures, device=device, dtype=torch.int32 - ).unsqueeze(-1), + torch.arange(n_structures, device=device, dtype=torch.int32).unsqueeze( + -1 + ), assume_unique=True, ) - xyz1 = Labels( - "xyz_1", torch.tensor([[0], [1], [2]], device=device) - ) - xyz2 = Labels( - "xyz_2", torch.tensor([[0], [1], [2]], device=device) - ) + xyz1 = Labels("xyz_1", torch.tensor([[0], [1], [2]], device=device)) + xyz2 = Labels("xyz_2", torch.tensor([[0], [1], [2]], device=device)) stress_block = TensorBlock( values=stress.unsqueeze(-1), samples=stress_samples, components=[xyz1, xyz2], - properties=Labels( - "energy", torch.tensor([[0]], device=device) - ), + properties=Labels("energy", torch.tensor([[0]], device=device)), ) energy_block.add_gradient("strain", stress_block) @@ -525,9 +523,7 @@ def train( has_strain_gradients, ) ) - logging.info( - "FX compilation complete (will optimize on first call)" - ) + logging.info("FX compilation complete (will optimize on first call)") start_epoch = 0 if self.epoch is None else self.epoch + 1