1 change: 1 addition & 0 deletions .gitignore
@@ -29,6 +29,7 @@ htmlcov
 .DS_Store
 *.swp
 .envrc
+uv.lock
 
 checkpoints/
 mlflow_tmp/
4 changes: 4 additions & 0 deletions aurora/model/aurora.py
@@ -92,6 +92,7 @@ def __init__(
         positive_atmos_vars: tuple[str, ...] = (),
         clamp_at_first_step: bool = False,
         simulate_indexing_bug: bool = False,
+        use_chunked_checkpointing: bool = False,
     ) -> None:
         """Construct an instance of the model.
 
@@ -175,6 +176,7 @@ def __init__(
             simulate_indexing_bug (bool, optional): Simulate an indexing bug that's present for the
                 air pollution version of Aurora. This is necessary to obtain numerical equivalence
                 to the original implementation. Defaults to `False`.
+            use_chunked_checkpointing (bool, optional): Enable chunked checkpointing. Defaults to `False`.
         """
         super().__init__()
         self.surf_vars = surf_vars
@@ -215,6 +217,7 @@ def __init__(
             dynamic_vars=dynamic_vars,
             atmos_static_vars=atmos_static_vars,
             simulate_indexing_bug=simulate_indexing_bug,
+            use_chunked_checkpointing=use_chunked_checkpointing,
         )
 
         self.backbone = Swin3DTransformerBackbone(
@@ -249,6 +252,7 @@
             level_condition=level_condition,
             separate_perceiver=separate_perceiver,
             modulation_heads=modulation_heads,
+            use_chunked_checkpointing=use_chunked_checkpointing,
         )
 
         if bf16_mode and not autocast:
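Since the new flag is only surfaced through the top-level constructor and then plumbed through the encoder and decoder, callers need no other changes. A minimal usage sketch, not part of the diff, assuming `Aurora` is re-exported from the top-level `aurora` package as in the released versions:

    from aurora import Aurora

    # Hypothetical usage: enable chunked checkpointing at construction time.
    # The flag is forwarded to the encoder's and decoder's Perceiver modules
    # exactly as in the hunks above; no other call sites change.
    model = Aurora(use_chunked_checkpointing=True)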
4 changes: 4 additions & 0 deletions aurora/model/decoder.py
@@ -42,6 +42,7 @@ def __init__(
         level_condition: Optional[tuple[int | float, ...]] = None,
         separate_perceiver: tuple[str, ...] = (),
         modulation_heads: tuple[str, ...] = (),
+        use_chunked_checkpointing: bool = False,
     ) -> None:
         """Initialise.
 
@@ -72,6 +73,7 @@ def __init__(
             modulation_heads (tuple[str, ...], optional): Names of every variable for which to
                 enable an additional head, the so-called modulation head, that can be used to
                 predict the difference.
+            use_chunked_checkpointing (bool, optional): Enable chunked checkpointing. Defaults to `False`.
         """
         super().__init__()
 
@@ -100,6 +102,7 @@ def __init__(
             drop=drop_rate,
             residual_latent=True,
             ln_eps=perceiver_ln_eps,
+            use_chunked_checkpointing=use_chunked_checkpointing,
         )
         if self.separate_perceiver:
             self.level_decoder_alternate = PerceiverResampler(
@@ -112,6 +115,7 @@
                 drop=drop_rate,
                 residual_latent=True,
                 ln_eps=perceiver_ln_eps,
+                use_chunked_checkpointing=use_chunked_checkpointing,
             )
 
         self.surf_heads = nn.ParameterDict(
3 changes: 3 additions & 0 deletions aurora/model/encoder.py
@@ -51,6 +51,7 @@ def __init__(
         dynamic_vars: bool = False,
         atmos_static_vars: bool = False,
         simulate_indexing_bug: bool = False,
+        use_chunked_checkpointing: bool = False,
     ) -> None:
         """Initialise.
 
@@ -87,6 +88,7 @@ def __init__(
             simulate_indexing_bug (bool, optional): Simulate an indexing bug that's present for the
                 air pollution version of Aurora. This is necessary to obtain numerical equivalence
                 to the original implementation. Defaults to `False`.
+            use_chunked_checkpointing (bool, optional): Enable chunked checkpointing. Defaults to `False`.
         """
         super().__init__()
 
@@ -156,6 +158,7 @@ def __init__(
             mlp_ratio=mlp_ratio,
             ln_eps=perceiver_ln_eps,
             ln_k_q=stabilise_level_agg,
+            use_chunked_checkpointing=use_chunked_checkpointing,
         )
 
         # Drop patches after encoding.
8 changes: 8 additions & 0 deletions aurora/model/perceiver.py
@@ -60,6 +60,7 @@
 import torch.nn as nn
 import torch.nn.functional as F
+from chunkcheck import chunk_and_checkpoint
 from einops import rearrange
 
 __all__ = ["MLP", "PerceiverResampler"]

@@ -167,6 +168,7 @@ def __init__(
         residual_latent: bool = True,
         ln_eps: float = 1e-5,
         ln_k_q: bool = False,
+        use_chunked_checkpointing: bool = False,
     ) -> None:
         """Initialise.
 
@@ -190,6 +192,7 @@
 
         self.residual_latent = residual_latent
         self.layers = nn.ModuleList([])
+        self.use_chunked_checkpointing = use_chunked_checkpointing
         mlp_hidden_dim = int(latent_dim * mlp_ratio)
         for i in range(depth):
             self.layers.append(
@@ -219,6 +222,11 @@ def forward(self, latents: torch.Tensor, x: torch.Tensor) -> torch.Tensor:
         Returns:
             torch.Tensor: Latent features of shape `(B, L1, D1)`.
         """
+        if self.use_chunked_checkpointing:
+            return chunk_and_checkpoint(self._forward, latents, x, chunk_size=2025)
+        return self._forward(latents, x)
+
+    def _forward(self, latents: torch.Tensor, x: torch.Tensor) -> torch.Tensor:
         for attn, ff, ln1, ln2 in self.layers:
             # We use post-res-norm like in Swin v2 and most Transformer architectures these days.
             # This empirically works better than the pre-norm used in the original Perceiver.
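The `chunkcheck` package itself is not shown in this diff, but the call in `forward` suggests its contract: split the inputs into fixed-size chunks along the batch dimension and run each chunk under gradient checkpointing, trading backward-pass recompute for lower peak activation memory. A rough sketch of that presumed behaviour, purely as an assumption about what `chunk_and_checkpoint` does and not its actual implementation:

    import torch
    from torch.utils.checkpoint import checkpoint

    def chunk_and_checkpoint(fn, *tensors: torch.Tensor, chunk_size: int) -> torch.Tensor:
        # Assumed semantics: apply `fn` to `chunk_size`-sized slices of the
        # batch dimension, checkpointing each slice so its activations are
        # recomputed during the backward pass instead of stored.
        chunks = zip(*(t.split(chunk_size, dim=0) for t in tensors))
        outs = [checkpoint(fn, *c, use_reentrant=False) for c in chunks]
        return torch.cat(outs, dim=0)

Under this reading, the hard-coded `chunk_size=2025` bounds how many batch elements pass through the resampler per checkpoint segment.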
1 change: 1 addition & 0 deletions aurora/model/swin3d.py
@@ -14,6 +14,7 @@
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
+from chunkcheck import chunk_and_checkpoint
 from einops import rearrange
 from timm.layers import DropPath, to_3tuple
 
1 change: 1 addition & 0 deletions pyproject.toml
@@ -44,6 +44,7 @@ dependencies = [
     "xarray",
     "netcdf4",
     "azure-storage-blob",
+    "chunkcheck>=0.1.1",
 ]
 
 [project.optional-dependencies]