diff --git a/.gitignore b/.gitignore index b91787c..095161e 100644 --- a/.gitignore +++ b/.gitignore @@ -184,4 +184,6 @@ __pycache__/ src/astropt/_version.py +# checkpoints +*.pt wandb/ diff --git a/README.md b/README.md index 8dcbc13..63281d8 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ [![PyPI Downloads](https://static.pepy.tech/badge/astropt)](https://pepy.tech/projects/astropt) [![docs](https://app.readthedocs.org/projects/astropt/badge/)](https://astropt.readthedocs.io/) [![License: AGPL-v3](https://img.shields.io/badge/License-AGPLv3-green.svg)](https://www.gnu.org/licenses/agpl-3.0.html) -[![All Contributors](https://img.shields.io/badge/all_contributors-7-orange.svg?style=flat-square)](#contributors-) +[![All Contributors](https://img.shields.io/badge/all_contributors-8-orange.svg?style=flat-square)](#contributors-) [![ICML](https://img.shields.io/badge/AI4Science@ICML-2024---?logo=https%3A%2F%2Fneurips.cc%2Fstatic%2Fcore%2Fimg%2FNeurIPS-logo.svg&labelColor=68448B&color=b3b3b3)](https://openreview.net/forum?id=aOLuuLxqav) [![arXiv](https://img.shields.io/badge/arXiv-2405.14930---?logo=arXiv&labelColor=b31b1b&color=grey)](https://arxiv.org/abs/2405.14930) diff --git a/affine_run.job b/affine_run.job new file mode 100644 index 0000000..f3fc98f --- /dev/null +++ b/affine_run.job @@ -0,0 +1,32 @@ +# /bin/bash +# ----------------Parameters---------------------- # +#$ -S /bin/bash +#$ -pe mthread 32 +#$ -q lTgpu.q +#$ -cwd +#$ -j y +#$ -N affine_job +#$ -o affine_job.log +#$ -m bea +#$ -M sogolsanjaripour@gmail.com +#$ -l gpu,ngpus=1 +# +# ----------------Modules------------------------- # +# + +module load tools/conda +start-conda +conda activate astropt + +# ----------------Your Commands------------------- # +# +echo + `date` job $JOB_NAME started in $QUEUE with jobID=$JOB_ID on $HOSTNAME +echo + NSLOTS = $NSLOTS +# + +cd astroPT +uv sync +uv run scripts/train.py --batch_size=32 --compile=False + +# +echo = `date` job $JOB_NAME done \ No newline at end of file diff --git a/aim_run.job b/aim_run.job new file mode 100644 index 0000000..a35cb67 --- /dev/null +++ b/aim_run.job @@ -0,0 +1,32 @@ +# /bin/bash +# ----------------Parameters---------------------- # +#$ -S /bin/bash +#$ -pe mthread 32 +#$ -q lTgpu.q +#$ -cwd +#$ -j y +#$ -N aim_job +#$ -o aim_job.log +#$ -m bea +#$ -M sogolsanjaripour@gmail.com +#$ -l gpu,ngpus=1 +# +# ----------------Modules------------------------- # +# + +module load tools/conda +start-conda +conda activate astropt + +# ----------------Your Commands------------------- # +# +echo + `date` job $JOB_NAME started in $QUEUE with jobID=$JOB_ID on $HOSTNAME +echo + NSLOTS = $NSLOTS +# + +cd astroPT +uv sync +uv run scripts/train.py --batch_size=32 --compile=False + +# +echo = `date` job $JOB_NAME done \ No newline at end of file diff --git a/jetformer_run.job b/jetformer_run.job new file mode 100644 index 0000000..5627102 --- /dev/null +++ b/jetformer_run.job @@ -0,0 +1,33 @@ +# /bin/bash +# ----------------Parameters---------------------- # +#$ -S /bin/bash +#$ -pe mthread 32 +#$ -q lTgpu.q +#$ -cwd +#$ -j y +#$ -N jetformer_job +#$ -o jetformer_job.log +#$ -m bea +#$ -M sogolsanjaripour@gmail.com +#$ -l gpu,ngpus=1 +# +# ----------------Modules------------------------- # +# + +module load tools/conda +start-conda +conda activate astropt + +# ----------------Your Commands------------------- # +# +echo + `date` job $JOB_NAME started in $QUEUE with jobID=$JOB_ID on $HOSTNAME +echo + NSLOTS = $NSLOTS +# + +cd astroPT +uv sync +#uv run 
scripts/train_jetformer.py --batch_size=32 --compile=False +#uv run torchrun --standalone --nproc_per_node=2 scripts/train_jetformer.py --batch_size=32 --compile=False +PYTHONUNBUFFERED=1 uv run scripts/train_jetformer.py --init_from=resume --batch_size=32 --compile=False +# +echo = `date` job $JOB_NAME done \ No newline at end of file diff --git a/scripts/jetformer/download_galaxy_256x256.py b/scripts/jetformer/download_galaxy_256x256.py new file mode 100644 index 0000000..f4cd07a --- /dev/null +++ b/scripts/jetformer/download_galaxy_256x256.py @@ -0,0 +1,63 @@ +# make_galaxy_pt_shards.py +# pip install datasets pillow torch + +import os, math +import torch +from datasets import load_dataset +from PIL import Image + +OUTDIR = "galaxy_pt_256x256_test" # folder for shards +TOTAL = 10_000 # how many samples +SHARD = 1024 # images per shard (last shard may be smaller) +SEED = 42 +BUFFER = 50_000 # shuffle buffer + +IMG_SIZE = 256 # match your model + +def pil_to_chw_uint8(img: Image.Image): + img = img.convert("RGB") + t = torch.tensor(bytearray(img.tobytes()), dtype=torch.uint8) + t = t.view(IMG_SIZE, IMG_SIZE, 3).permute(2,0,1).contiguous() # CHW + return t + +def main(): + os.makedirs(OUTDIR, exist_ok=True) + ds = load_dataset("Smith42/galaxies", split="test", streaming=True) + ds = ds.shuffle(buffer_size=BUFFER, seed=SEED) + + buf = [] + saved = 0 + shard_idx = 0 + + for ex in ds: + # strictly use image_crop + if "image_crop" not in ex or ex["image_crop"] is None: + continue + try: + t = pil_to_chw_uint8(ex["image_crop"]) + except Exception: + continue + + buf.append(t) + saved += 1 + + if len(buf) == SHARD: + x = torch.stack(buf, dim=0) # [N,3,256,256] uint8 + torch.save({"images": x}, os.path.join(OUTDIR, f"shard_{shard_idx:04d}_test.pt")) + print(f"wrote shard {shard_idx:04d} with {x.size(0)} samples") + shard_idx += 1 + buf.clear() + + if saved >= TOTAL: + break + + # flush remainder + if buf: + x = torch.stack(buf, dim=0) + torch.save({"images": x}, os.path.join(OUTDIR, f"shard_{shard_idx:04d}_test.pt")) + print(f"wrote shard {shard_idx:04d} with {x.size(0)} samples") + + print(f"Done. Total saved: {saved} samples into {OUTDIR}/shard_*_test.pt") + +if __name__ == "__main__": + main() diff --git a/scripts/jetformer/train_jetformer.py b/scripts/jetformer/train_jetformer.py new file mode 100644 index 0000000..a7cbdc7 --- /dev/null +++ b/scripts/jetformer/train_jetformer.py @@ -0,0 +1,775 @@ +# train_jetformer.py +import math +import os +import csv +import pandas as pd +from dataclasses import dataclass +from typing import Tuple + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.utils.data import DataLoader, Subset, TensorDataset +import torchvision +import torchvision.transforms as T +from torchvision.utils import save_image +from tqdm import tqdm + + +# This must be done BEFORE importing pyplot +import matplotlib +matplotlib.use('Agg') +import matplotlib.pyplot as plt + +# ====================================================================================== +# Block 1: Configuration +# ====================================================================================== +@dataclass +class CFG: + # --- Heavier Transformer Config --- + # This is the config you provided. It's very large. 
+ d_model: int = 1536 + n_heads: int = 24 + n_layers: int = 16 + + # --- Training Config --- + epochs: int = 200 + + batch_size: int = 4 + + lr: float = 3e-4 # Constant learning rate + wd: float = 0.01 + + # --- Dataset Switch --- + dataset_name: str = "galaxy" + + # --- Other Model/Data Params --- + img_size: int = 256 + in_ch: int = 3 + + + # A 16x16 patch creates (256/16)^2 = 16*16 = 256 tokens. This is standard + # practice (like ViT-Base) and much more manageable. + patch: int = 16 + + + # This will be (256 // 16)**2 = 256 + n_tokens: int = (img_size // patch)**2 + + + # This is the dimension of a single flattened patch. + # It will be 3 * 16 * 16 = 768 + d_token: int = in_ch * patch * patch + + gmm_K: int = 4 + flow_steps: int = 4 + + # --- Stability & Checkpointing --- + device: str = "cuda" if torch.cuda.is_available() else "cpu" + grad_clip_val: float = 1.0 + + # These paths are now set dynamically in the train() function + checkpoint_path: str = "" + samples_dir: str = "" + loss_csv_path: str = "" + loss_plot_path: str = "" + + # --- Noise Curriculum --- + noise_max: float = 0.1 + noise_min: float = 0.0 + +# ====================================================================================== +# Block 2: Loss Logging and Plotting Utilities +# ====================================================================================== + +def append_losses_to_csv(epoch, train_loss, val_loss, filename): + """Appends the epoch, average train loss, and optional val loss to a CSV file.""" + # Check if the file exists to write headers + file_exists = os.path.isfile(filename) + with open(filename, 'a', newline='') as csvfile: + writer = csv.writer(csvfile) + # Write header only if the file is new + if not file_exists: + writer.writerow(['epoch', 'train_loss', 'val_loss']) + + # Write the data. csv.writer handles None as an empty field. + writer.writerow([epoch, train_loss, val_loss]) + +def plot_loss_from_csv(csv_path, output_path): + """Reads a CSV file and saves a plot of the train and validation loss curves.""" + # Prevent error if the file doesn't exist yet + if not os.path.isfile(csv_path): + return + + # Read the data using pandas. + # Empty val_loss fields will be read as NaN. + df = pd.read_csv(csv_path) + + # Create the plot + fig, ax = plt.subplots(figsize=(10, 6)) + + # Plot training loss (all epochs) + ax.plot(df['epoch'], df['train_loss'], label='Train Loss', color='blue') + + + # 1. Create a new DataFrame containing only rows where 'val_loss' is NOT NaN + df_val = df.dropna(subset=['val_loss']) + + # 2. Plot the filtered data. + # This will plot just the valid points (e.g., 5, 10, 15...) + # and connect them with a dashed line. + if not df_val.empty: # Only plot if we have at least one validation point + ax.plot(df_val['epoch'], df_val['val_loss'], + label='Validation Loss', color='orange', + linestyle='--', marker='o') + + ax.set_title('Training and Validation Loss per Epoch') + ax.set_xlabel('Epoch') + ax.set_ylabel('Average Loss') + ax.legend() + ax.grid(True) + + # Save the plot and close the figure to free memory + fig.savefig(output_path) + plt.close(fig) + +# ====================================================================================== +# Block 3: Data Loading +# ====================================================================================== + + +def get_galaxy_dataloader(cfg, folder="galaxy_pt_256x256"): + """ + Loads preprocessed 256x256 CHW uint8 tensors from .pt shards. 
+ + This function assumes the folder "galaxy_pt_256x256" contains .pt shards + (e.g., "shard_0000.pt", "shard_0001.pt", etc.) where each shard + is a dictionary: + {"images": torch.Tensor[N, 3, 256, 256]} + + The tensors should be of type uint8. + This function does NO resizing. It streams the data as-is. + + Output: dict {"img": FloatTensor[B, 3, 256, 256] in [0,1], "label": LongTensor[B]} + """ + import os, glob, torch + from torch.utils.data import IterableDataset, DataLoader, get_worker_info + + files = sorted(glob.glob(os.path.join(folder, "shard_*.pt"))) + if not files: + raise FileNotFoundError(f"No shards found in {folder}. Make sure your 256x256 shards are in this folder.") + + ## NEW: Added print statement for clarity + print(f"Found {len(files)} data shards in '{folder}'.") + + class PTShardStream(IterableDataset): + def __init__(self, files): + self.files = files + + def __iter__(self): + info = get_worker_info() + # shard files across workers + if info is None: + my_files = self.files + else: + my_files = self.files[info.id::info.num_workers] + + for f in my_files: + data = torch.load(f, map_location="cpu") # {"images": uint8 [N, 3, 256, 256]} + imgs_u8 = data["images"] + + # sanity check for image size + if imgs_u8.ndim != 4 or imgs_u8.shape[1] != 3 or imgs_u8.shape[2] != 256 or imgs_u8.shape[3] != 256: + print(f"WARNING: Shard {f} has unexpected shape {imgs_u8.shape}. Expected [N, 3, 256, 256].") + continue # Skip this shard + + # yield per-sample; DataLoader will stack + for i in range(imgs_u8.size(0)): + # convert to float [0,1] here + yield {"img": imgs_u8[i].float().div_(255.0), "label": 0} + + def _collate(batch): + imgs = torch.stack([b["img"] for b in batch], dim=0) # [B, 3, 256, 256] float + labels = torch.zeros(len(batch), dtype=torch.long) + return {"img": imgs, "label": labels} + + # multiple workers safe (we shard files, not streams) + nw = min(8, max(2, (os.cpu_count() or 4) - 1)) + return DataLoader( + PTShardStream(files), + batch_size=cfg.batch_size, + num_workers=nw, + pin_memory=True, + prefetch_factor=4, + persistent_workers=True, + collate_fn=_collate, + ) +# ====================================================================================== +# Block 4: Checkpointing Functions +# ====================================================================================== +def save_checkpoint(epoch, model, optimizer, cfg): + checkpoint = {'epoch': epoch, 'model_state_dict': model.state_dict(), 'optimizer_state_dict': optimizer.state_dict()} + torch.save(checkpoint, cfg.checkpoint_path) + +def load_checkpoint(model, optimizer, cfg): + if not os.path.exists(cfg.checkpoint_path): + print("No checkpoint found. Starting from scratch.") + return 0 + checkpoint = torch.load(cfg.checkpoint_path, map_location=cfg.device) + model.load_state_dict(checkpoint['model_state_dict']) + optimizer.load_state_dict(checkpoint['optimizer_state_dict']) + start_epoch = checkpoint['epoch'] + 1 + print(f"Checkpoint loaded. Resuming from epoch {start_epoch}") + return start_epoch + +# ====================================================================================== +# Block 5.1: Image and Token Utilities +# This block contains helper functions for preprocessing images and converting them +# into a sequence of "tokens" that the Transformer can understand, and back again. 
+# ====================================================================================== +def uniform_dequantize(x: torch.Tensor) -> torch.Tensor: + """ + Takes a tensor of pixel values (scaled 0-1) and makes them continuous. + It adds a tiny amount of uniform noise, breaking the discrete nature of pixel values. + This is a crucial step for training continuous models like normalizing flows. + """ + return (x + torch.rand_like(x) / 256.0).clamp(0.0, 1.0) + +def patchify(x: torch.Tensor, patch_size: int = 16) -> torch.Tensor: + """ + Converts a batch of images into a sequence of flattened patches (tokens). + It slices the image into a grid and then flattens each patch. + Input Shape: (Batch, Channels, Height, Width) -> (B, C, H, W) + Output Shape: (Batch, Num_Patches, Patch_Dimension) -> (B, N, D_token) + """ + B, C, H, W = x.shape + assert H % patch_size == 0 and W % patch_size == 0, "Image dimensions must be divisible by the patch size." + + # Use 'unfold' to create sliding blocks (patches) across height and width + x = x.unfold(2, patch_size, patch_size).unfold(3, patch_size, patch_size) + + # Reshape and flatten to get the final sequence of tokens + x = x.contiguous().permute(0, 2, 3, 1, 4, 5).reshape(B, -1, C * patch_size * patch_size) + return x + +def depatchify(tokens: torch.Tensor, C: int = 3, H: int = 256, W: int = 256, patch_size: int = 16) -> torch.Tensor: + """ + The exact inverse of the 'patchify' function. + Converts a sequence of tokens back into an image format. + """ + B, N, D = tokens.shape + hp, wp = H // patch_size, W // patch_size # Number of patches along height and width + + # Reshape the sequence back into a grid of patches + x = tokens.reshape(B, hp, wp, C, patch_size, patch_size) + + # Permute and reshape to reconstruct the final image + x = x.permute(0, 3, 1, 4, 2, 5).reshape(B, C, H, W) + return x + +# ====================================================================================== +# Block 5.2: Normalizing Flow (TinyFlow) +# This block defines a simple RealNVP-style normalizing flow. A flow is an invertible +# neural network, meaning it can map data to a latent space and back again perfectly. +# Crucially, it provides the log-determinant of the Jacobian ('logdet'), which is +# needed for the change of variables formula to calculate the exact likelihood of data. +# Its purpose here is to "pre-process" the complex image distribution into a simpler one +# that is easier for the Transformer to model. +# ====================================================================================== +class CouplingNet(nn.Module): + """A small convolutional network that predicts the scale (s) and shift (t) parameters for the Affine Coupling Layer.""" + def __init__(self, channels: int): + super().__init__() + self.net = nn.Sequential( + nn.Conv2d(channels, 128, kernel_size=3, padding=1), nn.ReLU(), + nn.Conv2d(128, 128, kernel_size=3, padding=1), nn.ReLU(), + nn.Conv2d(128, channels * 2, kernel_size=3, padding=1) # Output has 2x channels for s and t + ) + def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + st = self.net(x) + C = x.size(1) + s, t = st[:, :C], st[:, C:] # Split the output into scale and shift + s = torch.tanh(s) * 1.5 # Bound the scale for numerical stability + return s, t + +class AffineCoupling(nn.Module): + """ + An affine coupling layer. It splits the input using a mask. One part is left + unchanged (identity), and this part is used to predict the scale/shift that + will transform the *other* part of the input. 
This makes the transformation + powerful yet easily invertible. + """ + def __init__(self, in_ch: int, mask: torch.Tensor): + super().__init__() + self.register_buffer("mask", mask) # A binary mask (e.g., checkerboard) + self.net = CouplingNet(in_ch) + + def forward(self, x: torch.Tensor, reverse: bool = False) -> Tuple[torch.Tensor, torch.Tensor]: + x_id = x * self.mask # The part that remains unchanged + s, t = self.net(x_id) # Predict s and t from the unchanged part + + if not reverse: # Forward pass: x -> z + # Transform the other part of x: y = x * scale + shift + y = x_id + (1 - self.mask) * (x * torch.exp(s) + t) + # The log-determinant is just the sum of the logs of the scale factors + logdet = ((1 - self.mask) * s).flatten(1).sum(dim=1) + return y, logdet + else: # Inverse pass: z -> x + # The inverse is cheap to compute: x = (y - shift) / scale + y = x_id + (1 - self.mask) * ((x - t) * torch.exp(-s)) + logdet = -((1 - self.mask) * s).flatten(1).sum(dim=1) + return y, logdet + +def checker_mask(C: int, H: int, W: int, flip: bool = False, device: str = "cpu") -> torch.Tensor: + """Creates a checkerboard mask where half the channels are masked.""" + m = torch.zeros(1, C, H, W, device=device) + m[:, ::2, :, :] = 1.0 # Mask even-indexed channels + return 1.0 - m if flip else m + +class TinyFlow(nn.Module): + """A stack of Affine Coupling layers. By alternating the mask between layers, we ensure all dimensions get transformed.""" + def __init__(self, in_ch: int, img_size: int, steps: int = 4): + super().__init__() + self.blocks = nn.ModuleList([AffineCoupling(in_ch, checker_mask(in_ch, img_size, img_size, flip=(k % 2 == 1))) for k in range(steps)]) + + def forward(self, x: torch.Tensor, reverse: bool = False) -> Tuple[torch.Tensor, torch.Tensor]: + logdet = x.new_zeros(x.size(0)) + z = x + + # Apply the sequence of transformations + if not reverse: + for b in self.blocks: + z, ld = b(z, reverse=False) + logdet += ld + else: # Apply in reverse for the inverse pass + for b in reversed(self.blocks): + z, ld = b(z, reverse=True) + logdet += ld + return z, logdet + +# ====================================================================================== +# Block 5.3: Autoregressive Transformer (TinyGPT) +# This block defines a standard "decoder-only" Transformer, similar to GPT. Its job is +# to model the sequence of latent tokens produced by the Normalizing Flow. +# It is "autoregressive" because it predicts each token based on all the tokens that came before it. +# ====================================================================================== +class CausalSelfAttention(nn.Module): + """ + The core mechanism of the Transformer. It allows each token to look at all + previous tokens in the sequence to gather context. A "causal" mask is applied + to prevent it from "cheating" by looking at future tokens. 
+ """ + def __init__(self, d_model: int, n_heads: int): + super().__init__() + self.attn = nn.MultiheadAttention(d_model, n_heads, batch_first=True) + self.register_buffer("mask", None, persistent=False) + + def _get_causal_mask(self, T: int, device: torch.device) -> torch.Tensor: + """Generates or retrieves the triangular mask to enforce causality.""" + if self.mask is None or self.mask.size(0) != T: + self.mask = torch.triu(torch.ones(T, T, device=device), diagonal=1).bool() + return self.mask + + def forward(self, x: torch.Tensor) -> torch.Tensor: + T = x.size(1) + m = self._get_causal_mask(T, x.device) + out, _ = self.attn(x, x, x, attn_mask=m, need_weights=False) + return out + +class DecoderBlock(nn.Module): + """A single Transformer block, which combines causal self-attention and a feed-forward network (MLP).""" + def __init__(self, d_model: int, n_heads: int, mlp_ratio: int = 4): + super().__init__() + self.ln1 = nn.LayerNorm(d_model) + self.attn = CausalSelfAttention(d_model, n_heads) + self.ln2 = nn.LayerNorm(d_model) + self.mlp = nn.Sequential( + nn.Linear(d_model, d_model * mlp_ratio), nn.GELU(), + nn.Linear(d_model * mlp_ratio, d_model) + ) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = x + self.attn(self.ln1(x)) # Attention with residual connection + x = x + self.mlp(self.ln2(x)) # MLP with residual connection + return x + +class TinyGPT(nn.Module): + """A stack of decoder blocks to form the full Transformer model.""" + def __init__(self, d_model: int, n_heads: int, n_layers: int): + super().__init__() + self.blocks = nn.ModuleList([DecoderBlock(d_model, n_heads) for _ in range(n_layers)]) + self.ln_f = nn.LayerNorm(d_model) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + for b in self.blocks: + x = b(x) + return self.ln_f(x) + +# ====================================================================================== +# Block 5.4: GMM Output Head and Loss +# Since the latent tokens are continuous, we can't use a standard classification head. +# Instead, this block predicts the parameters of a Gaussian Mixture Model (GMM) for each token. +# A GMM provides a flexible probability distribution over continuous space. The loss +# is then the negative log-likelihood (NLL) of the true token under this predicted distribution. +# ====================================================================================== +class GMMHead(nn.Module): + """ + A linear layer that takes the Transformer's output and maps it to the + parameters of a GMM for each token in the sequence. + For each of K components, we predict: + 1. A mixture weight (pi) + 2. A mean vector (mu) + 3. A log standard deviation vector (log_sigma) + """ + def __init__(self, d_model: int, d_token: int, K: int): + super().__init__() + self.K, self.D = K, d_token + self.proj = nn.Linear(d_model, K * (1 + 2 * d_token)) + + def forward(self, h: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + B, N, _ = h.shape + out = self.proj(h).view(B, N, self.K, 1 + 2 * self.D) + + logits_pi = out[..., 0] # Mixture weights (in logit form) + mu = out[..., 1:1+self.D] # Means + log_sigma = out[..., 1+self.D:] # Log standard deviations + + log_sigma = torch.clamp(log_sigma, -7, 2) # Clamp for stability + return logits_pi, mu, log_sigma + +def gmm_nll(y: torch.Tensor, logits_pi: torch.Tensor, mu: torch.Tensor, log_sigma: torch.Tensor) -> torch.Tensor: + """ + Computes the negative log-likelihood of target tokens `y` under the predicted GMM. 
+ This involves calculating the probability of `y` under each Gaussian component, + weighting it by the mixture probabilities, and using the log-sum-exp trick + to compute the total log-likelihood in a numerically stable way. + """ + B, N, D = y.shape; K = logits_pi.size(-1) + + y = y.unsqueeze(2) # Reshape for broadcasting with GMM parameters + + # Calculate log probability of y for each Gaussian component: log N(y | mu, sigma) + inv_var = torch.exp(-2 * log_sigma) + logp = -0.5 * ((y - mu)**2 * inv_var).sum(-1) - log_sigma.sum(-1) - 0.5 * D * math.log(2 * math.pi) + + # Combine with mixture weights and compute total log-likelihood + logmix = F.log_softmax(logits_pi, dim=-1) + logp + + # log-sum-exp over the K components gives the log-likelihood for each token. + # Sum over the sequence length to get per-sample NLL. + return -torch.logsumexp(logmix, dim=-1).sum(dim=1) + +# ====================================================================================== +# Block 5.5: The Complete JetFormerLite Model +# This final class assembles all the previous components into the complete model. +# It defines the full forward pass: +# Image -> Flow -> Latent -> Patchify -> Transformer -> GMM Head -> Loss +# ====================================================================================== +class JetFormerLite(nn.Module): + def __init__(self, cfg: CFG): + super().__init__() + self.cfg = cfg + # 1. The invertible pre-processor + self.flow = TinyFlow(cfg.in_ch, cfg.img_size, cfg.flow_steps) + # 2. The tokenizer (projection from raw patch to model dimension) + self.in_proj = nn.Linear(cfg.d_token, cfg.d_model) + self.pos = nn.Parameter(torch.randn(1, cfg.n_tokens, cfg.d_model) * 0.01) + # 3. The autoregressive model + self.gpt = TinyGPT(cfg.d_model, cfg.n_heads, cfg.n_layers) + # 4. The output head + self.head = GMMHead(cfg.d_model, cfg.d_token, cfg.gmm_K) + + def forward(self, x: torch.Tensor, epoch_frac: float = 1.0) -> torch.Tensor: + # Pre-process image + if x.dtype == torch.uint8: x = x.float() / 255.0 + x = uniform_dequantize(x) + + # 1. Pass image through the flow to get latent `z` and `logdet` + z, logdet = self.flow(x, reverse=False) + + # 2. Convert latent image `z` into a sequence of tokens + tokens = patchify(z, self.cfg.patch) + + # Add annealed noise for training stability + sigma = self.cfg.noise_max + (self.cfg.noise_min - self.cfg.noise_max) * epoch_frac + if self.training and sigma > 0: + tokens = tokens + torch.randn_like(tokens) * sigma + + # 3. Project tokens and pass through the Transformer + h = self.in_proj(tokens) + self.pos + h = self.gpt(h) + + # 4. Predict GMM parameters for the *next* token + logits_pi, mu, log_sigma = self.head(h[:, :-1]) + target = tokens[:, 1:] # Teacher-forcing + + # 5. 
Calculate the two components of the loss + nll_gmm = gmm_nll(target, logits_pi, mu, log_sigma) # NLL of latent z + + # Final loss: NLL(x) = NLL(z) - log|det J| + loss = (nll_gmm - logdet).mean() + return loss + + @torch.no_grad() + def sample(self, n: int = 16, x_real_batch: torch.Tensor = None): + self.eval() + B = n; N = self.cfg.n_tokens; device = next(self.parameters()).device + + # Check if we are doing reconstruction or unconditional generation + if x_real_batch is None: + # This is the ORIGINAL unconditional generation path + tokens = torch.zeros(B, N, self.cfg.d_token, device=device) + print("Generating tokens autoregressively (unconditional)...") + + for t in tqdm(range(N - 1), leave=False): + h_in = self.in_proj(tokens) + self.pos + h_out = self.gpt(h_in) + logits_pi, mu, log_sigma = self.head(h_out[:, t:t+1]) + pi = F.softmax(logits_pi.squeeze(1), dim=-1) + comp_idx = torch.multinomial(pi, 1) + gather_idx = comp_idx[:, :, None].expand(-1, 1, self.cfg.d_token) + sel_mu = mu.squeeze(1).gather(1, gather_idx).squeeze(1) + sel_sigma = log_sigma.squeeze(1).gather(1, gather_idx).squeeze(1).exp() + y = sel_mu + torch.randn_like(sel_mu) * sel_sigma + tokens[:, t+1] = y + + z = depatchify(tokens, C=self.cfg.in_ch, H=self.cfg.img_size, W=self.cfg.img_size, patch_size=self.cfg.patch) + x, _ = self.flow(z, reverse=True) + x = x.clamp(0, 1) + return x + + else: + # This is the RECONSTRUCTION path + print("Reconstructing real images for side-by-side comparison...") + + # We'll use n//2 real images and n//2 predicted images to make n total + n_pairs = n // 2 + # Get first n/2 images from the provided batch and move to device + x_real = x_real_batch[:n_pairs].to(device) + + # Define C, H, and W from the model's configuration + C = self.cfg.in_ch + H = self.cfg.img_size + W = self.cfg.img_size + + + # 1. Pass real images through the flow to get latent z + z_real, _ = self.flow(x_real, reverse=False) + + # 2. Patchify to get the real tokens + tokens_real = patchify(z_real, self.cfg.patch) + + # 3. Get the model's predictions (teacher-forced) + # We pass the *entire* real token sequence and get predictions for what *should* come next + h_in = self.in_proj(tokens_real) + self.pos + h_out = self.gpt(h_in) + + # 4. Get GMM parameters for tokens 1...N-1 + logits_pi, mu, log_sigma = self.head(h_out[:, :-1]) + + # 5. Create the "predicted" token sequence + tokens_pred = torch.zeros_like(tokens_real) + tokens_pred[:, 0] = tokens_real[:, 0] # Copy first token (it's not predicted) + + # 6. Get deterministic 'mu' from most likely GMM component + # This is the model's "best guess" for each token, given the *real* preceding token + best_comp_idx = torch.argmax(logits_pi, dim=-1, keepdim=True) # Shape: [B, N-1, 1] + gather_idx = best_comp_idx.unsqueeze(-1).expand(-1, -1, -1, self.cfg.d_token) + sel_mu = torch.gather(mu, 2, gather_idx).squeeze(2) # Shape: [B, N-1, D_token] + tokens_pred[:, 1:] = sel_mu # Fill in the predicted tokens + + # 7. Depatchify and invert flow for predicted images + z_pred = depatchify(tokens_pred, C=self.cfg.in_ch, H=self.cfg.img_size, W=self.cfg.img_size, patch_size=self.cfg.patch) + x_pred, _ = self.flow(z_pred, reverse=True) + x_pred = x_pred.clamp(0, 1) # This is the final reconstructed image batch + + # 8. Interleave real and predicted images for side-by-side saving + # We stack [x_real, x_pred] and then reshape + # The saved grid will look like: [x_real[0], x_pred[0], x_real[1], x_pred[1], ...] 
+ combined = torch.stack([x_real, x_pred], dim=1) # Shape [n_pairs, 2, C, H, W] + combined = combined.view(n, C, H, W) # Shape [n, C, H, W] + + return combined + +# ====================================================================================== +# Block 6: Main Training Loop +# ====================================================================================== +def train(): + # --- Basic Setup --- + cfg = CFG() + device = cfg.device + print(f"Using device: {device}") + + # Printing key config changes + print(f"--- CONFIGURATION ---") + print(f" Image Size: {cfg.img_size}x{cfg.img_size}") + print(f" Patch Size: {cfg.patch}x{cfg.patch}") + print(f" Batch Size: {cfg.batch_size}") + print(f" Num Tokens: {cfg.n_tokens}") + print(f" Token Dim: {cfg.d_token}") + print(f" Model Dim: {cfg.d_model}") + print(f"---------------------") + + # Dynamically set paths based on the dataset name + cfg.checkpoint_path = f"checkpoint_{cfg.dataset_name}_256.pt" + cfg.samples_dir = f"samples_{cfg.dataset_name}_256" + cfg.loss_csv_path = f"loss_log_{cfg.dataset_name}_256.csv" + cfg.loss_plot_path = f"loss_plot_{cfg.dataset_name}_256.png" + + os.makedirs(cfg.samples_dir, exist_ok=True) + print(f"Running experiment: {cfg.dataset_name} (256x256)") + print(f"Checkpoints will be saved to: {cfg.checkpoint_path}") + + # --- Model and Optimizer Setup --- + model = JetFormerLite(cfg).to(device) + opt = torch.optim.AdamW(model.parameters(), lr=cfg.lr, weight_decay=cfg.wd) + start_epoch = load_checkpoint(model, opt, cfg) + + # --- Data Loading --- + print("Loading 256x256 galaxy data shards...") + loader = get_galaxy_dataloader(cfg) + + + # We can't know the exact steps for a streaming dataset. + # 1,000,000 (example) / 4 (batch_size) = 250,000 + # Let's use a large fixed number for the noise curriculum. + steps_per_epoch = 250000 + + print(f"Starting training from epoch {start_epoch} up to {cfg.epochs}...") + # --- Main Training Loop --- + for ep in range(start_epoch, cfg.epochs): + model.train() + pbar = tqdm(loader, desc=f"Epoch {ep+1}/{cfg.epochs}") + + epoch_losses = [] + + for i, batch in enumerate(pbar): + + img = batch["img"] + + current_step = ep * steps_per_epoch + i + + # Forward and backward pass + img = img.to(device) + epoch_frac = current_step / (cfg.epochs * steps_per_epoch) + loss = model(img, epoch_frac=epoch_frac) + opt.zero_grad(set_to_none=True) + loss.backward() + torch.nn.utils.clip_grad_norm_(model.parameters(), cfg.grad_clip_val) + opt.step() + + epoch_losses.append(loss.item()) + + # Update progress bar + pbar.set_postfix(loss=f"{loss.item():.3f}", lr=f"{opt.param_groups[0]['lr']:.2e}") + + # --- End-of-Epoch Operations --- + avg_train_loss = sum(epoch_losses) / len(epoch_losses) if epoch_losses else 0 + print(f"Epoch {ep+1} finished. Average Train Loss: {avg_train_loss:.3f}") + + save_checkpoint(ep, model, opt, cfg) + + avg_val_loss = None + + # Generate samples and RUN VALIDATION every 2 epochs + if (ep + 1) % 2 == 0: + print(f"--- Running Validation & Sampling for epoch {ep+1} ---") + + model.eval() # Set model to evaluation mode + + #: Load ALL test shards + import glob + test_shard_pattern = os.path.join("galaxy_pt_256x256_test", "shard_*_test.pt") + test_shard_files = sorted(glob.glob(test_shard_pattern)) + + if not test_shard_files: + print(f"WARNING: No test shards found matching pattern '{test_shard_pattern}'. 
Skipping validation.") + else: + print(f"Found {len(test_shard_files)} test shard(s): {[os.path.basename(f) for f in test_shard_files]}") + + val_losses = [] + test_images_f32 = None # To store loaded test images + try: + # 1. Load and concatenate all test shard data + all_test_images = [] + for test_shard_path in test_shard_files: + print(f"Loading test shard: {test_shard_path}") + test_data = torch.load(test_shard_path, map_location='cpu') + test_images_u8 = test_data["images"] + + # Test shard sanity check + if test_images_u8.ndim != 4 or test_images_u8.shape[1] != 3 or test_images_u8.shape[2] != 256 or test_images_u8.shape[3] != 256: + print(f"WARNING: Test shard {test_shard_path} has unexpected shape {test_images_u8.shape}. Expected [N, 3, 256, 256]. Skipping this shard.") + continue + + all_test_images.append(test_images_u8.float().div_(255.0)) + + if not all_test_images: + raise ValueError("No valid test shards found.") + + # Concatenate all test images + test_images_f32 = torch.cat(all_test_images, dim=0) # [Total_N, C, H, W] + print(f"Loaded {test_images_f32.size(0)} total test images from {len(all_test_images)} shard(s)") + + # 2. Create a DataLoader for the test set + test_dataset = TensorDataset(test_images_f32) + # Use a larger batch size for validation if possible, but we'll stick + # to the training batch size to be safe. + test_loader = DataLoader(test_dataset, batch_size=cfg.batch_size, shuffle=False) + + print(f"Running validation on {len(test_dataset)} test images...") + val_pbar = tqdm(test_loader, desc="Validation", leave=False) + + with torch.no_grad(): # Disable gradient calculation + for val_batch in val_pbar: + val_img = val_batch[0].to(device) + # epoch_frac=1.0 means no noise + val_loss = model(val_img, epoch_frac=1.0) + val_losses.append(val_loss.item()) + + # 3. Calculate average validation loss + if val_losses: + avg_val_loss = sum(val_losses) / len(val_losses) + print(f"Validation finished. Average Validation Loss: {avg_val_loss:.3f}") + + except FileNotFoundError: + print(f"WARNING: Test shard '{test_shard_path}' not found. Skipping validation.") + except Exception as e: + print(f"WARNING: Error during validation: {e}. Skipping validation.") + + model.train() # Set model back to training mode + # --- END OF VALIDATION CODE --- + + # Use the test data we already loaded for sampling + test_batch = None + if test_images_f32 is not None: + try: + num_images_in_shard = test_images_f32.size(0) + # Select 16 random indices + random_indices = torch.randperm(num_images_in_shard)[:16] + test_batch = test_images_f32[random_indices] + except Exception as e: + print(f"WARNING: Error selecting random samples: {e}. Skipping sample generation.") + else: + print("WARNING: test_images_f32 not loaded. Skipping sample generation.") + + # Only proceed if we successfully loaded the test_batch + if test_batch is not None: + # This will sample 16 real images and 16 reconstructions + fake_images = model.sample(n=32, x_real_batch=test_batch) + sample_path = os.path.join(cfg.samples_dir, f"epoch_{ep+1:03d}.png") + + # nrow=2 creates a (16, 2) grid. 
+ save_image(fake_images, sample_path, nrow=2) + print(f"Samples saved to {sample_path}") + + # Log the average losses to the CSV file + append_losses_to_csv(ep + 1, avg_train_loss, avg_val_loss, cfg.loss_csv_path) + + # Update the static plot of the loss curve + plot_loss_from_csv(cfg.loss_csv_path, cfg.loss_plot_path) + + print("Training finished.") + +if __name__ == "__main__": + train() \ No newline at end of file diff --git a/scripts/samplers/sample_embeddings.py b/scripts/samplers/sample_embeddings.py index eb48fbd..cd97ac5 100644 --- a/scripts/samplers/sample_embeddings.py +++ b/scripts/samplers/sample_embeddings.py @@ -1,136 +1,205 @@ """ -Sample from a trained astropt model +Sample embeddings from a trained AstroPT model """ + import os -import pickle +import functools from contextlib import nullcontext + import torch from torch.utils.data import DataLoader -from tqdm import tqdm, trange -import matplotlib.pyplot as plt -import numpy as np -from model import GPTConfig, GPT -from datasets import load_dataset, concatenate_datasets -from astropt.local_datasets import GalaxyImageDataset from torchvision import transforms from torchvision.transforms import ToTensor -import functools -from einops import rearrange + +import numpy as np import pandas as pd +from tqdm import tqdm +from einops import rearrange + +from datasets import load_dataset, concatenate_datasets + +from astropt.model import GPT, GPTConfig +from astropt.local_datasets import GalaxyImageDataset # ----------------------------------------------------------------------------- -init_from = 'resume' -out_dir = 'logs/spiralized_astropt_300M' # ignored if init_from is not 'resume' -refresh_cache = False # resample the embeddings +# Config +# ----------------------------------------------------------------------------- +init_from = "resume" +out_dir = "logs/AIM" batch_size = 256 seed = 1337 -spiral = True # do we want to process the galaxy patches in spiral order? -patch_size = 16 # size of image patches for ViT tokenisation -device = 'cuda' # examples: 'cpu', 'cuda', 'cuda:0', 'cuda:1', etc. -dtype = 'bfloat16' # 'float32' or 'bfloat16' or 'float16' -compile = False # use PyTorch 2.0 to compile the model to be faster -exec(open('src/astropt/configurator.py').read()) # overrides from command line or config file -# ----------------------------------------------------------------------------- +spiral = True +prefix_len = 64 +device = "cuda" +dtype = "bfloat16" +compile = False torch.manual_seed(seed) torch.cuda.manual_seed(seed) -torch.backends.cuda.matmul.allow_tf32 = True # allow tf32 on matmul -torch.backends.cudnn.allow_tf32 = True # allow tf32 on cudnn -device_type = 'cuda' if 'cuda' in device else 'cpu' # for later use in torch.autocast -ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype] -ctx = nullcontext() if device_type == 'cpu' else torch.amp.autocast(device_type=device_type, dtype=ptdtype) - -# model -if init_from == 'resume': - # init from a model saved in a specific directory - ckpt_path = os.path.join(out_dir, '030000_ckpt.pt') - checkpoint = torch.load(ckpt_path, map_location=device) - # TODO remove this for latest models - gptconf = GPTConfig(**checkpoint['model_args']) - model = GPT(gptconf) - state_dict = checkpoint['model'] - unwanted_prefix = '_orig_mod.' 
- for k,v in list(state_dict.items()): - if k.startswith(unwanted_prefix): - state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k) - model.load_state_dict(state_dict) - -model.eval() -model.to(device) + +device_type = "cuda" if "cuda" in device else "cpu" +ptdtype = { + "float32": torch.float32, + "bfloat16": torch.bfloat16, + "float16": torch.float16, +}[dtype] + +ctx = nullcontext() if device_type == "cpu" else torch.amp.autocast( + device_type=device_type, dtype=ptdtype +) + +# ----------------------------------------------------------------------------- +# Load checkpoint +# ----------------------------------------------------------------------------- +ckpt_path = os.path.join(out_dir, "ckpt.pt") +checkpoint = torch.load(ckpt_path, map_location=device, weights_only=False) + +modality_registry = checkpoint["modality_registry"] + +gptconf = GPTConfig(**checkpoint["model_args"]) +model = GPT(gptconf, modality_registry) +state_dict = checkpoint["model"] + +# clean DDP prefix if present +unwanted_prefix = "_orig_mod." +for k, v in list(state_dict.items()): + if k.startswith(unwanted_prefix): + state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k) + +model.load_state_dict(state_dict) +model.eval().to(device) + if compile: - model = torch.compile(model) # requires PyTorch 2.0 (optional) + model = torch.compile(model) -# set up HF galaxies in test set to be processed +# ----------------------------------------------------------------------------- +# Dataset + transforms (MATCH TRAINING) +# ----------------------------------------------------------------------------- def normalise(x): std, mean = torch.std_mean(x, dim=1, keepdim=True) - return (x - mean)/(std + 1e-8) + return (x - mean) / (std + 1e-8) + + def data_transforms(): - transform = transforms.Compose([ + return transforms.Compose([ transforms.Lambda(normalise), ]) - return transform -def _process_galaxy_wrapper(gal, func): - gal = ToTensor()(gal["image"]).to(torch.float16) - patch_galaxy = func(gal) - return {"image": patch_galaxy} -galproc = GalaxyImageDataset(None, spiral=True, transform=data_transforms()) -ds = concatenate_datasets(( - load_dataset("Smith42/galaxies", split="test", streaming=True), - load_dataset("Smith42/galaxies", split="validation", streaming=True), + + +from PIL import Image +import numpy as np + +def process_galaxy_wrapper(gal, func): + img = gal["image"] + + # Case 1: PIL image (JPEG/PNG from HF) + if isinstance(img, Image.Image): + gal_tensor = ToTensor()(img).to(torch.float16) + + # Case 2: numpy array + elif isinstance(img, np.ndarray): + gal_tensor = torch.from_numpy(img).permute(2, 0, 1).to(torch.float16) + + # Case 3: torch tensor (local HF parquet) + elif torch.is_tensor(img): + gal_tensor = img.to(torch.float16) + + else: + raise TypeError(f"Unsupported image type: {type(img)}") + + patches = func(gal_tensor) + + return { + "images": patches, + "images_positions": torch.arange( + patches.shape[0], dtype=torch.long + ), + "dr8_id": gal.get("dr8_id", -1), + } + + +galproc = GalaxyImageDataset( + paths=None, + spiral=spiral, + transform={"images": data_transforms()}, + modality_registry=modality_registry, +) + +# ----------------------------------------------------------------------------- +# HF dataset (test + validation) +# ----------------------------------------------------------------------------- +ds = concatenate_datasets(( + load_dataset("/scratch02/public/sao/msmith/data/galaxies/", revision="v2.0", split="test", streaming=True), + 
load_dataset("/scratch02/public/sao/msmith/data/galaxies/", revision="v2.0", split="validation", streaming=True), )) + ds = ds.map( - functools.partial(_process_galaxy_wrapper, func=galproc.process_galaxy) + functools.partial(process_galaxy_wrapper, func=galproc.process_galaxy) ).with_format("torch") -dl = iter(DataLoader( - ds, batch_size=batch_size, num_workers=2, -)) +ds = ( + ds + .select_columns("image_crop") + .rename_column("image_crop", "image") + .map( + functools.partial(process_galaxy_wrapper, func=galproc.process_galaxy) + ) + ).with_format("torch") +ds = ds.remove_columns("image") -n_tokens = 64 +dl = DataLoader( + ds, + batch_size=batch_size, + num_workers=2, + pin_memory=True, +) + +# ----------------------------------------------------------------------------- +# Embedding extraction +# ----------------------------------------------------------------------------- +n_tokens = prefix_len norm = "mean" -if (not ( - os.path.isfile(os.path.join(out_dir, f"zss_{n_tokens}t_{norm}.npy")) and - os.path.isfile(os.path.join(out_dir, f"idss_{n_tokens}t_{norm}.npy")) and - os.path.isfile(os.path.join(out_dir, "metadata_processed.parquet")) - )) or refresh_cache: - # run generation - xss = [] - zss = [] - idxs = [] - with torch.no_grad(): - with ctx: - tt = tqdm(unit="galz", unit_scale=True) - for B in dl: - prefix_len = 64 - xs = B["image"][:, :prefix_len] - idx = B["dr8_id"] - if model.config.attn_type == "prefix": - # forward and backward attention over whole image if pretrained with prefix attention - zs = model.generate_embeddings(xs.to(device), prefix_len=prefix_len) - else: - zs = model.generate_embeddings(xs.to(device)) - if not os.path.isfile(os.path.join(out_dir, f"xss_{n_tokens}t.npy")): - xss.append(rearrange(xs, "b t c -> b (t c)").detach().to(torch.float16).cpu().numpy()) - zss.append(zs.detach().cpu().numpy()) - idxs.append(idx) - tt.update(batch_size) - tt.close() - - if not os.path.isfile(os.path.join(out_dir, f"xss_{n_tokens}t.npy")): - xss = np.concatenate(xss, axis=0) - np.save(os.path.join(out_dir, f"xss_{n_tokens}t.npy"), xss) - zss = np.concatenate(zss, axis=0) - idxs = np.concatenate(idxs, axis=0) - np.save(os.path.join(out_dir, f"zss_{n_tokens}t_{norm}.npy"), zss) - np.save(os.path.join(out_dir, f"idxs_{n_tokens}t_{norm}.npy"), idxs) - - print("processing metadata file") - metadata = pd.read_parquet("/raid/data/metadata.parquet") - metadata = metadata.set_index(["dr8_id"]) - metadata = metadata.loc[list(idxs)] - metadata.to_parquet(os.path.join(out_dir, "metadata_processed.parquet")) -else: - print("loading from cache") - metadata = pd.read_parquet(os.path.join(out_dir, "metadata_processed.parquet")) - zss = np.load(os.path.join(out_dir, f"zss_{n_tokens}t_{norm}.npy")) - #xss = np.load(os.path.join(out_dir, f"xss_{n_tokens}t.npy")) - idxs = np.load(os.path.join(out_dir, f"idxs_{n_tokens}t_{norm}.npy")) + +zss = [] +idxs = [] + +with torch.no_grad(): + with ctx: + tt = tqdm(unit="galaxies", unit_scale=True) + + for B in dl: + xs = B["images"][:, :prefix_len].to(device) + pos = B["images_positions"][:, :prefix_len].to(device) + + inputs = { + "images": xs, + "images_positions": pos, + } + + zs = model.generate_embeddings(inputs) + + zss.append(zs["images"].detach().cpu().numpy()) + idxs.append(np.array(B["dr8_id"])) + + tt.update(xs.size(0)) + + tt.close() + +# ----------------------------------------------------------------------------- +# Save outputs +# ----------------------------------------------------------------------------- +zss = np.concatenate(zss, 
axis=0) +idxs = np.concatenate(idxs, axis=0) + +np.save(os.path.join(out_dir, f"zss_{n_tokens}t_{norm}.npy"), zss) +np.save(os.path.join(out_dir, f"idxs_{n_tokens}t_{norm}.npy"), idxs) + +print("Saved embeddings:", zss.shape) + +# ----------------------------------------------------------------------------- +# Optional: metadata join +# ----------------------------------------------------------------------------- +# metadata = pd.read_parquet("/scratch02/public/sao/msmith/data/metadata.parquet").set_index("dr8_id") +# metadata = metadata.loc[idxs] +# metadata.to_parquet(os.path.join(out_dir, "metadata_processed.parquet")) + +print("Done.") diff --git a/scripts/train.py b/scripts/train.py index 190dea2..c5a0aad 100644 --- a/scripts/train.py +++ b/scripts/train.py @@ -34,13 +34,11 @@ try: import wandb - log_via_wandb = True except ImportError: log_via_wandb = False try: from codecarbon import EmissionsTracker - log_emissions = False except ImportError: log_emissions = False @@ -59,6 +57,7 @@ def normalise(x, use_hf=False): def data_transforms(use_hf): + norm = partial(normalise, use_hf=use_hf) transform = transforms.Compose( [ @@ -70,6 +69,7 @@ def data_transforms(use_hf): def process_galaxy_wrapper(galdict, func): + """Wrapper for processing galaxy images from HF dataset.""" patch_galaxy = func(np.array(galdict["image"]).swapaxes(0, 2)) return { "images": patch_galaxy.to(torch.float), @@ -81,7 +81,7 @@ def process_galaxy_wrapper(galdict, func): # ----------------------------------------------------------------------------- # default config values designed to test run a 100M parameter model on DESI galaxy imagery # look at `config/astropt*.py` for a prod run example - out_dir = "logs/astropt0100M" + out_dir = "logs/AFFINE" eval_interval = 1000 log_interval = 100 checkpoint_interval = 5000 @@ -95,7 +95,7 @@ def process_galaxy_wrapper(galdict, func): use_hf = True # use the huggingface dataset version of our galz stream_hf_dataset = True # stream the galaxies from huggingface # data - gradient_accumulation_steps = 5 * 8 # used to simulate larger batch sizes + gradient_accumulation_steps = 5 * 8 # used to simulate larger batch sizes batch_size = 16 # if gradient_accumulation_steps > 1, this is the micro-batch size spiral = True # do we want to process the galaxy patches in spiral order? 
block_size = 1024 @@ -123,12 +123,12 @@ def process_galaxy_wrapper(galdict, func): # Create modality registry modality_registry = ModalityRegistry(modalities) # Choose tokenisers from "affine" and "aim" - tokeniser = "aim" + tokeniser = "affine" # adamw optimizer # we follow the same schedule here as Chinchilla learning_rate = 6e-4 # max learning rate max_iters = ( - 30000 # total number of training iterations for one pass over our dataset + 1_000_000 # total number of training iterations for one pass over our dataset ) weight_decay = 1e-1 beta1 = 0.9 @@ -147,7 +147,7 @@ def process_galaxy_wrapper(galdict, func): # system device = "cuda" # examples: 'cpu', 'cuda', 'cuda:0', 'cuda:1' etc., or try 'mps' on macbooks dtype = "bfloat16" # 'float32', 'bfloat16', or 'float16', the latter will auto implement a GradScaler - compile = True # use PyTorch 2.0 to compile the model to be faster + compile = False # use PyTorch 2.0 to compile the model to be faster log_via_wandb = False wandb_project = None # ----------------------------------------------------------------------------- @@ -239,24 +239,34 @@ def process_galaxy_wrapper(galdict, func): from datasets import load_dataset tds_hf = load_dataset( - "Smith42/galaxies", + "/scratch02/public/sao/msmith/data/galaxies/", revision="v2.0", split="train", streaming=(True if stream_hf_dataset else False), ) - tds_hf = tds_hf.select_columns("image").map( - partial(process_galaxy_wrapper, func=tds.process_galaxy) + tds_hf = ( + tds_hf + .select_columns("image_crop") + .rename_column("image_crop", "image") + .map( + partial(process_galaxy_wrapper, func=tds.process_galaxy) + ) ) tds_hf = tds_hf.remove_columns("image") vds_hf = load_dataset( - "Smith42/galaxies", + "/scratch02/public/sao/msmith/data/galaxies/", revision="v2.0", split="test", streaming=(True if stream_hf_dataset else False), ) - vds_hf = vds_hf.select_columns("image").map( - partial(process_galaxy_wrapper, func=tds.process_galaxy) + vds_hf = ( + vds_hf + .select_columns("image_crop") + .rename_column("image_crop", "image") + .map( + partial(process_galaxy_wrapper, func=tds.process_galaxy) + ) ) vds_hf = vds_hf.remove_columns("image") @@ -291,6 +301,7 @@ def process_galaxy_wrapper(galdict, func): dropout=dropout, modalities=modalities, attn_type=attn_type, + tokeniser=tokeniser, ) if init_from == "scratch": diff --git a/scripts/train_jetformer.py b/scripts/train_jetformer.py new file mode 100644 index 0000000..f454ab5 --- /dev/null +++ b/scripts/train_jetformer.py @@ -0,0 +1,927 @@ +""" +Training script for AstroPT with Jetformer tokenization. + +This script extends the standard AstroPT training to support Jetformer's +continuous tokenization approach using normalizing flows and GMM outputs. 
+ +To run on a single GPU: +$ python train_jetformer.py --batch_size=32 --compile=False + +To run with DDP on 4 gpus on 1 node: +$ torchrun --standalone --nproc_per_node=4 scripts/train_jetformer.py +""" + +import math +import os +import time +from contextlib import nullcontext +from functools import partial + +import einops +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +import torch +from torch.distributed import destroy_process_group, init_process_group +from torch.nn.parallel import DistributedDataParallel as DDP +from torch.utils.data import DataLoader +from torchvision import transforms + +try: + import wandb + log_via_wandb = True +except ImportError: + log_via_wandb = False +try: + from codecarbon import EmissionsTracker + log_emissions = False +except ImportError: + log_emissions = False + +from astropt.local_datasets import GalaxyImageDataset +from astropt.model import GPT, GPTConfig, ModalityConfig, ModalityRegistry + + +def prepare_batch_for_jetformer(batch, tokeniser): + """Prepare batch for Jetformer: use raw images instead of patches. + + For Jetformer, replaces 'images' (patches) with 'images_raw' (raw images [B,C,H,W]) + and adjusts positions accordingly. + + Args: + batch: Dictionary with 'images' (patches) and optionally 'images_raw' (raw images) + tokeniser: Tokeniser type ('jetformer' or other) + + Returns: + Modified batch with 'images' replaced by raw images for Jetformer + """ + if tokeniser != "jetformer": + return batch + + if "images_raw" not in batch: + raise ValueError( + "Jetformer requires raw images. For HF datasets, this is handled automatically. " + "For local datasets, you need to modify GalaxyImageDataset to return raw images." + ) + + # Replace patches with raw images + raw_images = batch["images_raw"] # [B, C, H, W] from dataset + B, C, H, W = raw_images.shape + patch_size = 16 # Should match modality config + T = (H // patch_size) * (W // patch_size) + + batch["images"] = raw_images # Replace patches with raw images + batch["images_positions"] = torch.arange(T, dtype=torch.long).unsqueeze(0).expand(B, T) + batch["images_is_raw"] = True + + return batch + + +def normalise(x, use_hf=False): + """Normalize images to zero mean, unit variance.""" + if use_hf: + x = torch.from_numpy(x).to(torch.float32) + std, mean = torch.std_mean(x, dim=1, keepdim=True) + x_norm = (x - mean) / (std + 1e-8) + return x_norm.to(torch.float16) + + +def data_transforms(use_hf): + """Data transformation pipeline.""" + norm = partial(normalise, use_hf=use_hf) + transform = transforms.Compose( + [ + # transforms.Lambda(lambda x: x/255.), + transforms.Lambda(norm), + ] + ) + return transform + + +def process_galaxy_wrapper(galdict, func, return_raw=False): + """Wrapper for processing galaxy images from HF dataset. 
+ + Args: + galdict: Dictionary with "image" key containing image data + func: Function to process galaxy (process_galaxy) + return_raw: If True, also return raw image [C, H, W] for Jetformer + + Returns: + Dictionary with "images" (patches) and optionally "images_raw" (raw image) + """ + raw_image = np.array(galdict["image"]).swapaxes(0, 2) # [C, H, W] + patch_galaxy = func(raw_image) + result = { + "images": patch_galaxy.to(torch.float), + "images_positions": torch.arange(0, len(patch_galaxy), dtype=torch.long), + } + if return_raw: + # Convert to [C, H, W] tensor and normalize to [0,1] if needed + raw_tensor = torch.from_numpy(raw_image).to(torch.float) + if raw_tensor.max() > 1.0: + raw_tensor = raw_tensor / 255.0 + result["images_raw"] = raw_tensor + return result + + +if __name__ == "__main__": + # ----------------------------------------------------------------------------- + # Configuration for Jetformer Training + # ----------------------------------------------------------------------------- + tokeniser = "jetformer" + out_dir = "logs/astropt_jetformer_5epochs_resume" + eval_interval = 1000 + log_interval = 100 + checkpoint_interval = 5000 + assert checkpoint_interval % eval_interval == 0 + eval_iters = 100 + eval_only = False + always_save_checkpoint = False + init_from = "scratch" # 'scratch' or 'resume' + use_hf = True # use the huggingface dataset version of our galz + stream_hf_dataset = True # stream the galaxies from huggingface + # data + gradient_accumulation_steps = 5 * 8 # used to simulate larger batch sizes + batch_size = 32 # if gradient_accumulation_steps > 1, this is the micro-batch size + spiral = False # do we want to process the galaxy patches in spiral order? + block_size = 1024 + image_size = 256 + num_workers = 32 + num_epochs = 5 # number of epochs to train (None = use max_iters instead) + dataset_size = None # dataset size for epoch calculation (None = try to get automatically, required for streaming) + # astroPT model + n_layer = 12 + n_head = 12 + n_embd = 768 + n_chan = 3 # 3 imagery bands: r, i, z for jpeg, 1 imagery band for FITS + dropout = 0.0 # for pretraining 0 is good, for finetuning try 0.1+ + # NB dropout is NOT implemented for flex attention + bias = False # do we use bias inside LayerNorm and Linear layers? 
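+    # Rough scale (approximate): 12*n_embd^2*n_layer ≈ 85M parameters in the decoder blocks,
+    # in line with the ~100M-parameter astroPT test configuration used in scripts/train.py
+    # once embeddings and output heads are included.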
+ # Define modalities configuration + modalities = [ + ModalityConfig( + name="images", + input_size=16 * 16 * n_chan, # Will be overridden by Jetformer + patch_size=16, + loss_weight=1.0, + embed_pos=True, + pos_input_size=1, + ), + ] + # Create modality registry + modality_registry = ModalityRegistry(modalities) + # Jetformer-specific hyperparameters + jetformer_flow_steps = 4 # Number of coupling layers in normalizing flow + jetformer_gmm_K = 4 # Number of Gaussian components in mixture + jetformer_noise_max = 0.1 # Maximum noise for curriculum + jetformer_noise_min = 0.0 # Minimum noise for curriculum + # Optimiser configuration + learning_rate = 6e-4 # max learning rate + max_iters = ( + 1_000_000 # total number of training iterations (overridden if num_epochs is set) + ) + weight_decay = 1e-1 + beta1 = 0.9 + beta2 = 0.95 + grad_clip = 1.0 # clip gradients at this value, or disable if == 0.0 + # learning rate decay settings + decay_lr = True # whether to decay the learning rate + warmup_iters = 2000 # how many steps to warm up for + lr_decay_iters = 27000 * 1.1 # should be ~= max_iters per Chinchilla + min_lr = ( + learning_rate / 10 + ) # minimum learning rate, should be ~= learning_rate/10 per Chinchilla + attn_type = "causal" + # DDP settings + backend = "nccl" # 'nccl', 'gloo', etc. + # system + device = "cuda" # examples: 'cpu', 'cuda', 'cuda:0', 'cuda:1' etc., or try 'mps' on macbooks + dtype = "bfloat16" # 'float32', 'bfloat16', or 'float16', the latter will auto implement a GradScaler + compile = False # use PyTorch 2.0 to compile the model to be faster + log_via_wandb = False + wandb_project = None + # ----------------------------------------------------------------------------- + config_keys = [ + k for k, v in globals().items() + if not k.startswith("_") and isinstance(v, (int, float, bool, str)) + ] + exec( + open("src/astropt/configurator.py").read() + ) # overrides from command line or config file + config = {k: globals()[k] for k in config_keys} # will be useful for logging + # ----------------------------------------------------------------------------- + + # various inits, derived attributes, I/O setup + ddp = int(os.environ.get("RANK", -1)) != -1 # is this a ddp run? + if ddp: + init_process_group(backend=backend) + ddp_rank = int(os.environ["RANK"]) + ddp_local_rank = int(os.environ["LOCAL_RANK"]) + ddp_world_size = int(os.environ["WORLD_SIZE"]) + device = f"cuda:{ddp_local_rank}" + torch.cuda.set_device(ddp_local_rank) + master_process = ( + ddp_rank == 0 + ) # this process will do logging, checkpointing etc. 
+ seed_offset = ddp_rank # each process gets a different seed + assert gradient_accumulation_steps % torch.cuda.device_count() == 0 + gradient_accumulation_steps //= torch.cuda.device_count() + else: + # if not ddp, we are running on a single gpu, and one process + master_process = True + seed_offset = 0 + ddp_world_size = 1 + ddp_rank = 0 + ddp_local_rank = 0 + tokens_per_iter = ( + gradient_accumulation_steps + * ddp_world_size + * batch_size + * block_size + * len(modalities) + ) + if master_process: + if log_via_wandb: + print("Logging to wandb enabled") + if log_emissions: + print("codecarbon detected, will log emissions") + print(f"tokens per iteration will be: {tokens_per_iter:,}") + + if master_process: + os.makedirs(out_dir, exist_ok=True) + torch.manual_seed(1337 + seed_offset) + torch.backends.cuda.matmul.allow_tf32 = True # allow tf32 on matmul + torch.backends.cudnn.allow_tf32 = True # allow tf32 on cudnn + device_type = ( + "cuda" if "cuda" in device else "cpu" + ) # for later use in torch.autocast + # note: float16 data type will automatically use a GradScaler + ptdtype = { + "float32": torch.float32, + "bfloat16": torch.bfloat16, + "float16": torch.float16, + }[dtype] + ctx = ( + nullcontext() + if device_type == "cpu" + else torch.amp.autocast(device_type=device_type, dtype=ptdtype) + ) + + transforms = {"images": data_transforms(use_hf)} + + # Training dataset + tpaths = None if use_hf else "./data/train.txt" + tds = GalaxyImageDataset( + paths={"images": tpaths}, + spiral=spiral, + transform=transforms, + modality_registry=modality_registry, + ) + # validation dataset and dataloader + vpaths = None if use_hf else "./data/tests.txt" + vds = GalaxyImageDataset( + paths={"images": vpaths}, + spiral=spiral, + transform=transforms, + modality_registry=modality_registry, + ) + + if use_hf: + from datasets import load_dataset + + tds_hf = load_dataset( + "/scratch02/public/sao/msmith/data/galaxies/", + revision="v2.0", + split="train", + streaming=(True if stream_hf_dataset else False), + ) + # For Jetformer, we need raw images, not just patches + return_raw = tokeniser == "jetformer" + tds_hf = ( + tds_hf + .select_columns("image_crop") + .rename_column("image_crop", "image") + .map( + partial(process_galaxy_wrapper, func=tds.process_galaxy, return_raw=return_raw) + ) + ) + tds_hf = tds_hf.remove_columns("image") + + vds_hf = load_dataset( + "/scratch02/public/sao/msmith/data/galaxies/", + revision="v2.0", + split="test", + streaming=(True if stream_hf_dataset else False), + ) + # For Jetformer, we need raw images, not just patches + return_raw = tokeniser == "jetformer" + vds_hf = ( + vds_hf + .select_columns("image_crop") + .rename_column("image_crop", "image") + .map( + partial(process_galaxy_wrapper, func=tds.process_galaxy, return_raw=return_raw) + ) + ) + vds_hf = vds_hf.remove_columns("image") + + # Create infinite dataloader wrapper for streaming datasets + def infinite_dataloader(dataloader): + """Wrap a DataLoader to cycle infinitely, enabling multiple epochs with streaming datasets.""" + while True: + yield from dataloader + + # Create base dataloaders + train_loader = DataLoader( + tds_hf if use_hf else tds, + batch_size=batch_size, + num_workers=num_workers, + pin_memory=True, + persistent_workers=True if num_workers > 0 and stream_hf_dataset else False, + ) + val_loader = DataLoader( + vds_hf if use_hf else vds, + batch_size=batch_size, + num_workers=num_workers, + pin_memory=True, + persistent_workers=True if num_workers > 0 and stream_hf_dataset else False, + ) 
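+    # NOTE: once wrapped into infinite iterators below, the streaming loaders never
+    # signal the end of an epoch, so epoch progress is tracked further down via
+    # samples_seen relative to dataset_size rather than by dataloader exhaustion.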
+ + # Wrap with infinite iterator if streaming, otherwise use regular iterator + if stream_hf_dataset and use_hf: + tdl = infinite_dataloader(train_loader) + vdl = infinite_dataloader(val_loader) + else: + tdl = iter(train_loader) + vdl = iter(val_loader) + + # Calculate dataset size and max_iters from num_epochs if specified + if num_epochs is not None: + # Try to get dataset size automatically if not provided + if dataset_size is None: + if use_hf and not stream_hf_dataset: + # Can get size from non-streaming HF dataset + try: + dataset_size = len(tds_hf) + if master_process: + print(f"Dataset size (auto-detected): {dataset_size:,} samples") + except (TypeError, AttributeError): + dataset_size = None + elif use_hf and stream_hf_dataset: + # For streaming datasets, get size from dataset info without loading data + try: + from datasets import load_dataset_builder + if master_process: + print("Getting dataset size from info (not loading data)...") + builder = load_dataset_builder( + "/scratch02/public/sao/msmith/data/galaxies/", + revision="v2.0", + ) + # Access the split info directly without loading data + if hasattr(builder.info, 'splits') and 'train' in builder.info.splits: + dataset_size = builder.info.splits['train'].num_examples + if master_process: + print(f"Dataset size (auto-detected from info): {dataset_size:,} samples") + else: + dataset_size = None + if master_process: + print("Warning: Could not get dataset size from info splits") + except (TypeError, AttributeError, Exception) as e: + if master_process: + print(f"Warning: Could not get dataset size from info: {e}") + print("Falling back to loading dataset (this may take a moment)...") + # Fallback: load the dataset if info API doesn't work + try: + from datasets import load_dataset + tds_info = load_dataset( + "/scratch02/public/sao/msmith/data/galaxies/", + revision="v2.0", + split="train", + streaming=False, + ) + dataset_size = len(tds_info) + if master_process: + print(f"Dataset size (auto-detected): {dataset_size:,} samples") + del tds_info # Free memory + except Exception as e2: + if master_process: + print(f"Error: Could not auto-detect dataset size: {e2}") + dataset_size = None + else: + # Local dataset + try: + dataset_size = len(tds) + if master_process: + print(f"Dataset size (auto-detected): {dataset_size:,} samples") + except (TypeError, AttributeError): + dataset_size = None + + # If still None, require user to specify + if dataset_size is None: + raise ValueError( + "num_epochs is set but dataset_size cannot be determined automatically. " + "Please set dataset_size parameter." + ) + + # Calculate iterations per epoch + effective_batch_size = batch_size * gradient_accumulation_steps * ddp_world_size + iterations_per_epoch = (dataset_size + effective_batch_size - 1) // effective_batch_size # ceiling division + max_iters = num_epochs * iterations_per_epoch + + if master_process: + print(f"Training for {num_epochs} epochs:") + print(f" Dataset size: {dataset_size:,} samples") + print(f" Effective batch size: {effective_batch_size:,}") + print(f" Iterations per epoch: {iterations_per_epoch:,}") + print(f" Total iterations: {max_iters:,}") + + # Set training start point when starting from scratch + if init_from == "scratch": + training_start_samples = 0 + + # init these up here, can override if init_from='resume' (i.e. 
from a checkpoint) + iter_num = 0 + best_val_loss = 1e9 + current_epoch = 0 + samples_seen = 0 + training_start_samples = 0 # Track where current training run started (for relative epoch calculation) + + # model init + model_args = dict( + n_layer=n_layer, + n_head=n_head, + n_embd=n_embd, + n_chan=n_chan, + block_size=block_size, + dropout=dropout, + modalities=modalities, + attn_type=attn_type, + tokeniser=tokeniser, + jetformer_flow_steps=jetformer_flow_steps, + jetformer_gmm_K=jetformer_gmm_K, + jetformer_noise_max=jetformer_noise_max, + jetformer_noise_min=jetformer_noise_min, + img_size=image_size, # Image size for Jetformer image-space flow + ) + + if init_from == "scratch": + # init a new model from scratch + if master_process: + print("initializing a new model from scratch with Jetformer tokenization") + gptconf = GPTConfig(**model_args) + model = GPT(gptconf, modality_registry, master_process=master_process) + if init_from == "resume": + if master_process: + print(f"resuming training from {out_dir}") + # resume training from a checkpoint. + ckpt_path = os.path.join(out_dir, "ckpt.pt") + checkpoint = torch.load(ckpt_path, map_location=device, weights_only=False) + checkpoint_model_args = checkpoint["model_args"] + # force these config attributes to be equal otherwise we can't even resume training + # the rest of the attributes (e.g. dropout) can stay as desired from command line + # NOTE had to remove 'bias' key here -- where does it go?! + for k in ["n_layer", "n_head", "n_embd", "block_size"]: + model_args[k] = checkpoint_model_args[k] + # create the model + gptconf = GPTConfig(**model_args) + model = GPT(gptconf, modality_registry, master_process=master_process) + state_dict = checkpoint["model"] + # fix the keys of the state dictionary :( + # honestly no idea how checkpoints sometimes get this prefix, have to debug more + unwanted_prefix = "_orig_mod." 
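+        # (the prefix most likely comes from torch.compile, which wraps the model
+        # and stores the original module under `_orig_mod`, so checkpoints saved
+        # from a compiled model carry it in their state_dict keys)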
+ for k, v in list(state_dict.items()): + if k.startswith(unwanted_prefix): + state_dict[k[len(unwanted_prefix) :]] = state_dict.pop(k) + model.load_state_dict(state_dict) + iter_num = checkpoint["iter_num"] + best_val_loss = checkpoint["best_val_loss"] + # Restore epoch tracking if available + current_epoch = checkpoint.get("current_epoch", 0) + samples_seen = checkpoint.get("samples_seen", 0) + # Restore training start point, or set to current samples_seen if old checkpoint + training_start_samples = checkpoint.get("training_start_samples", samples_seen) + if master_process: + if "current_epoch" in checkpoint: + # Calculate relative epoch for current training run + if num_epochs is not None and dataset_size is not None: + relative_epoch = int((samples_seen - training_start_samples) // dataset_size) + print(f"Resuming from epoch {relative_epoch}/{num_epochs}, iteration {iter_num}") + else: + print(f"Resuming from epoch {current_epoch}, iteration {iter_num}") + else: + print(f"Resuming from iteration {iter_num} (epoch tracking not available in checkpoint)") + + # logging via wandb if available + # this is here so we can get the number of params from model() + if log_via_wandb and master_process: + if wandb_project is None: + wandb.init( + project=f"AstroPT-Jetformer-{model.get_num_params() / 1e6:06.1f}M", + config=config, + ) + else: + wandb.init( + project=wandb_project, + config=config, + ) + # write config and important information to log file + with open(f"{out_dir}/hparams.txt", "w") as fi: + fi.write(f"AstroPT-Jetformer-{model.get_num_params() / 1e6:06.1f}M\n") + fi.write(f"time: {int(time.time())}\n") + for k, v in config.items(): + fi.write(f"{k}: {v}\n") + + # crop down the model block size if desired, using model surgery + if block_size < model.config.block_size: + model.crop_block_size(block_size) + model_args["block_size"] = ( + block_size # so that the checkpoint will have the right value + ) + model.to(device) + + # initialize a GradScaler. If enabled=False scaler is a no-op + scaler = torch.amp.GradScaler(enabled=(dtype == "float16")) + + # optimizer + optimizer = model.configure_optimizers( + weight_decay, learning_rate, (beta1, beta2), device_type + ) + if init_from == "resume": + optimizer.load_state_dict(checkpoint["optimizer"]) + checkpoint = None # free up memory + + # compile the model + if compile: + if master_process: + print("compiling the model... (takes a ~minute)") + unoptimized_model = model + model = torch.compile(model) # requires PyTorch 2.0 + + # wrap model into DDP container + if ddp: + if master_process: + print("Wrapping in DDP") + # Note to future people: we had to turn off optimize_ddp due to a + # torch compiler error when running DDP. This _may_ be fixed in a + # future torch version so check periodically. 
I tested this on: + # 2.6.0.dev20241126+cu124 + torch._dynamo.config.optimize_ddp = False + # if we have only one modality all params are used in a forward pass: + # BUT: Jetformer decoder isn't used in loss, so need find_unused_parameters=True + if len(modalities) == 1 and tokeniser != "jetformer": + model = DDP(model, device_ids=[ddp_local_rank]) + else: + model = DDP(model, device_ids=[ddp_local_rank], find_unused_parameters=True) + + # helps estimate an arbitrarily accurate loss over either split using many batches + @torch.no_grad() + def estimate_loss(): + out = {} + model.eval() + for dl, split in zip( + [tdl, vdl], + ["train", "val"], + ): + out[split] = {} + losses = torch.zeros(eval_iters) + for k in range(eval_iters): + batch_raw = next(dl) + batch_raw = prepare_batch_for_jetformer(batch_raw, tokeniser) + B = tds.process_modes(batch_raw, modality_registry, device) + with ctx: + logits, loss = model(B["X"], targets=B["Y"]) + losses[k] = loss.item() + out[split]["dummy"] = losses.mean() + model.train() + return out + + @torch.no_grad() + def validate(iter_num, out_dir): + model.eval() + raw_model = model.module if ddp else model + for dl, split in zip([tdl, vdl], ["train", "val"]): + f, axs = plt.subplots(8, 2, figsize=(3, 12), constrained_layout=True) + batch_raw = next(vdl) + batch_raw = prepare_batch_for_jetformer(batch_raw, tokeniser) + B = vds.process_modes(batch_raw, modality_registry, device) + with ctx: + P, loss = model(B["X"], B["Y"]) + if "images" in modality_registry.names(): + # For Jetformer, B["Y"]["images"] is raw images [B,C,H,W] + # For non-Jetformer, B["Y"]["images"] is patches [B,T,D] + im_patch = modality_registry.get_config("images").patch_size + is_jetformer = ( + isinstance(raw_model, GPT) + and raw_model.config.tokeniser == "jetformer" + ) + + if is_jetformer: + # Raw images [B,C,H,W] - convert directly to image format for visualization + Yim_raw = B["Y"]["images"].to(device) # [B,C,H,W] + # Permute to [B,H,W,C] for visualization + Yim = Yim_raw.permute(0, 2, 3, 1) # [B,H,W,C] + + # Reconstruct images + x_recon = raw_model.jetformer_reconstruct_images(B["Y"]["images"]) + # Permute to [B,H,W,C] for visualization + Pim = x_recon.permute(0, 2, 3, 1) # [B,H,W,C] + else: + # Non-Jetformer: patches already [B,T,D] + Yim = B["Y"]["images"].to(device) + b, t, c = Yim.size() + zero_block = torch.zeros((b, 1, c)).to(device) + Yim = torch.cat((zero_block, Yim), dim=1) + if spiral: + Yim = torch.stack([vds.antispiralise(yy) for yy in Yim]) + Yim = einops.rearrange( + Yim, + "b (h w) (p1 p2 c) -> b (h p1) (w p2) c", + p1=im_patch, + p2=im_patch, + h=image_size // im_patch, + w=image_size // im_patch, + ) + + Pim = torch.cat((zero_block, P["images"]), dim=1) + if spiral: + Pim = torch.stack([vds.antispiralise(pp) for pp in Pim]) + Pim = einops.rearrange( + Pim, + "b (h w) (p1 p2 c) -> b (h p1) (w p2) c", + p1=im_patch, + p2=im_patch, + h=image_size // im_patch, + w=image_size // im_patch, + ) + + for ax, p, y in zip( + axs, Pim.to(float).cpu().numpy(), Yim.to(float).cpu().numpy() + ): + ax[0].imshow(np.clip(y, 0, 1)) + ax[1].imshow(np.clip(p, 0, 1)) + ax[0].axis("off") + ax[1].axis("off") + + if log_via_wandb: + wandb.log( + { + "Y": [wandb.Image(np.clip(yy.swapaxes(0, -1).cpu(), 0, 1)) for yy in Yim], + "P": [wandb.Image(np.clip(pp.swapaxes(0, -1).cpu(), 0, 1)) for pp in Pim], + } + ) + + f.savefig( + os.path.join(out_dir, f"{iter_num:06d}_{split}.jpg"), + bbox_inches="tight", + pad_inches=0, + ) + plt.close(f) + model.train() + + # learning rate decay scheduler 
(cosine with warmup) + def get_lr(it): + # 1) linear warmup for warmup_iters steps + if it < warmup_iters: + return learning_rate * it / warmup_iters + # 2) if it > lr_decay_iters, return min learning rate + if it > lr_decay_iters: + return min_lr + # 3) in between, use cosine decay down to min learning rate + decay_ratio = (it - warmup_iters) / (lr_decay_iters - warmup_iters) + assert 0 <= decay_ratio <= 1 + coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio)) # coeff ranges 0..1 + return min_lr + coeff * (learning_rate - min_lr) + + # training loop + if master_process: + print("starting training...") + batch_raw = next(tdl) + batch_raw = prepare_batch_for_jetformer(batch_raw, tokeniser) + B = tds.process_modes(batch_raw, modality_registry, device) # fetch the very first batch + t0 = time.time() + dts = [] + local_iter_num = 0 # number of iterations in the lifetime of this process + raw_model = model.module if ddp else model # unwrap DDP container if needed + running_mfu = -1.0 + if log_emissions and master_process: + tracker = EmissionsTracker( + output_dir=out_dir, + log_level="error", + save_to_file=True, + on_csv_write="update", + ) + tracker.start() + while True: + # determine and set the learning rate for this iteration + lr = get_lr(iter_num) if decay_lr else learning_rate + for param_group in optimizer.param_groups: + param_group["lr"] = lr + + # update Jetformer noise schedule if applicable + raw_model = model.module if ddp else model + if isinstance(raw_model, GPT) and raw_model.config.tokeniser == "jetformer": + raw_model.set_jetformer_schedule(iter_num, max_iters) + + # evaluate the loss on train/val sets and write checkpoints + if iter_num % eval_interval == 0 and master_process: + validate(iter_num, out_dir) + losses = estimate_loss() + val_loss = np.mean(list(losses["val"].values())) + print( + f"iter {iter_num}:\ntrain loss:\n{losses['train']}\nval loss:\n{losses['val']}" + ) + with open(os.path.join(out_dir, "loss.txt"), "a") as fi: + if fi.tell() == 0: # check if a new file and write header if so + train_head_str = ",".join( + map(lambda x: str(x) + "_train", losses["train"].keys()) + ) + valid_head_str = ",".join( + map(lambda x: str(x) + "_valid", losses["val"].keys()) + ) + fi.write(f"iter_num,{train_head_str},{valid_head_str},lr,mfu\n") + train_loss_str = ",".join( + map(lambda x: str(x.item()), losses["train"].values()) + ) + valid_loss_str = ",".join( + map(lambda x: str(x.item()), losses["val"].values()) + ) + fi.write( + f"{iter_num},{train_loss_str},{valid_loss_str},{lr},{running_mfu * 100}\n" + ) + if log_via_wandb: + wandb.log({"valloss": losses["val"]}, step=iter_num) + if iter_num != 0: + loss_df = pd.read_csv(os.path.join(out_dir, "loss.txt")) + f, axs = plt.subplots( + 1, + len(losses["train"]) + 1, + figsize=(12, 4), + constrained_layout=True, + ) + axs.ravel()[0].set_title("mean") + axs.ravel()[0].plot( + loss_df["iter_num"], + loss_df.filter(like="train").mean(axis=1), + label="train", + ) + axs.ravel()[0].plot( + loss_df["iter_num"], + loss_df.filter(like="valid").mean(axis=1), + label="valid", + ) + for ax, train_loss, valid_loss in zip( + axs.ravel()[1:], + loss_df.filter(like="train"), + loss_df.filter(like="valid"), + ): + ax.set_title(train_loss) + ax.plot(loss_df["iter_num"], loss_df[train_loss], label="train") + ax.plot(loss_df["iter_num"], loss_df[valid_loss], label="valid") + # [ax.set_yscale("log") for ax in axs.ravel()] + [ax.legend() for ax in axs.ravel()] + f.savefig(os.path.join(out_dir, "loss.png")) + plt.close(f) + + # Save checkpoint 
if validation improved or always_save_checkpoint is True + save_checkpoint_now = False + if val_loss < best_val_loss: + best_val_loss = val_loss + save_checkpoint_now = True + if master_process: + print(f"Validation loss improved, saving checkpoint...") + elif always_save_checkpoint: + save_checkpoint_now = True + if master_process: + print(f"Saving checkpoint (always_save_checkpoint=True)...") + + # Also save periodic checkpoints regardless of validation loss + # This ensures we can resume even if validation didn't improve + if iter_num > 0 and iter_num % checkpoint_interval == 0: + save_checkpoint_now = True + if master_process: + print(f"Periodic checkpoint at iteration {iter_num}...") + + if save_checkpoint_now and iter_num > 0: + model_state = raw_model.state_dict() + checkpoint = { + "model": model_state, + "optimizer": optimizer.state_dict(), + "model_args": model_args, + "iter_num": iter_num, + "best_val_loss": best_val_loss, + "config": config, + "modality_registry": modality_registry, + "current_epoch": current_epoch, + "samples_seen": samples_seen, + "training_start_samples": training_start_samples, + } + if master_process: + print(f"saving checkpoint to {out_dir}") + # Always save the latest checkpoint (for resume) + torch.save(checkpoint, os.path.join(out_dir, "ckpt.pt")) + # Also save numbered checkpoint if always_save_checkpoint or at checkpoint_interval + if always_save_checkpoint or (iter_num % checkpoint_interval == 0): + torch.save( + checkpoint, os.path.join(out_dir, f"{iter_num:06d}_ckpt.pt") + ) + if iter_num == 0 and eval_only: + break + + # forward backward update, with optional gradient accumulation to simulate larger batch size + # and using the GradScaler if data type is float16 + for micro_step in range(gradient_accumulation_steps): + if ddp: + # in DDP training we only need to sync gradients at the last micro step. 
+ # the official way to do this is with model.no_sync() context manager, but + # I really dislike that this bloats the code and forces us to repeat code + # looking at the source of that context manager, it just toggles this variable + model.require_backward_grad_sync = ( + micro_step == gradient_accumulation_steps - 1 + ) + + with ctx: + logits, loss = model(B["X"], targets=B["Y"]) + # immediately async prefetch next batch while model is doing the forward pass on the GPU + batch_raw = next(tdl) + batch_raw = prepare_batch_for_jetformer(batch_raw, tokeniser) + B = tds.process_modes(batch_raw, modality_registry, device) # fetch the very first batch + + # backward pass, with gradient scaling if training in fp16 + scaler.scale(loss).backward() + + # Track samples seen for epoch calculation (only once per iteration, after all micro steps) + if num_epochs is not None and dataset_size is not None: + batch_size_actual = B["X"]["images"].shape[0] if "images" in B["X"] else batch_size + # Count samples once per iteration: batch_size * gradient_accumulation_steps + samples_seen += batch_size_actual * gradient_accumulation_steps + # Calculate epoch relative to current training run start + new_epoch = int((samples_seen - training_start_samples) // dataset_size) + if new_epoch > current_epoch: + current_epoch = new_epoch + if master_process: + print(f"Completed epoch {current_epoch}/{num_epochs} (samples seen: {samples_seen:,}/{dataset_size * num_epochs:,})") + + # clip the gradient + if grad_clip != 0.0: + scaler.unscale_(optimizer) + torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip) + # step the optimizer and scaler if training in fp16 + scaler.step(optimizer) + scaler.update() + # flush the gradients as soon as we can, no need for this memory anymore + optimizer.zero_grad(set_to_none=True) + + # timing and logging + t1 = time.time() + dt = t1 - t0 + dts.append(dt) + t0 = t1 + if iter_num % log_interval == 0 and master_process: + # get loss as float. 
note: this is a CPU-GPU sync point + # scale up to undo the division above, approximating the true total loss (exact would have been a sum) + lossf = loss.item() * gradient_accumulation_steps + if local_iter_num >= 5: # let the training loop settle a bit + mfu = raw_model.estimate_mfu( + batch_size * gradient_accumulation_steps, dt + ) + running_mfu = ( + mfu if running_mfu == -1.0 else 0.9 * running_mfu + 0.1 * mfu + ) + if log_via_wandb: + log_dict = {"loss": lossf, "time": dt} + if num_epochs is not None: + log_dict["epoch"] = current_epoch + wandb.log(log_dict, step=iter_num) + epoch_str = f", epoch {current_epoch}/{num_epochs}" if num_epochs is not None else "" + if log_emissions: + emissions: float = tracker.flush() + print( + f"iter {iter_num}{epoch_str}: loss {lossf:.6f}, time {np.mean(dts) * 1000:.2f}ms, mfu {running_mfu * 100:.2f}%, tot co2 {emissions:.1f}kg" + ) + else: + print( + f"iter {iter_num}{epoch_str}: loss {lossf:.6f}, time {np.mean(dts) * 1000:.2f}ms, mfu {running_mfu * 100:.2f}%" + ) + dts = [] + + iter_num += 1 + local_iter_num += 1 + + # termination conditions + if iter_num > max_iters: + if log_emissions: + emissions: float = tracker.stop() + if master_process: + print(emissions) + break + + # Cleanup: destroy process group before exiting + if ddp: + try: + destroy_process_group() + except Exception as e: + # HeartbeatMonitor may already be shutting down - this is harmless + if master_process: + print(f"Note: Process group cleanup warning (harmless): {e}") + if log_via_wandb: + wandb.finish() diff --git a/src/astropt/JETFORMER_INTEGRATION.md b/src/astropt/JETFORMER_INTEGRATION.md new file mode 100644 index 0000000..65456a1 --- /dev/null +++ b/src/astropt/JETFORMER_INTEGRATION.md @@ -0,0 +1,1172 @@ +# Jetformer Integration into AstroPT + +## Table of Contents +1. [Overview](#overview) +2. [What is Jetformer?](#what-is-jetformer) +3. [What is AstroPT?](#what-is-astropt) +4. [Integration Goals](#integration-goals) +5. [Architectural Challenges](#architectural-challenges) +6. [Design Decisions](#design-decisions) +7. [Implementation Details](#implementation-details) +8. [Changes to Jetformer](#changes-to-jetformer) +9. [Changes to AstroPT](#changes-to-astropt) +10. [Data Flow Comparison](#data-flow-comparison) +11. [Usage](#usage) +12. [Technical Notes](#technical-notes) + +--- + +## Overview + +This document describes the integration of **Jetformer** (a continuous tokenization approach using normalizing flows) into **AstroPT** (a multimodal GPT framework for astronomical data). The integration allows AstroPT to support three tokenization methods: **AIM**, **Affine**, and **Jetformer**, all sharing the same GPT backbone while maintaining full backward compatibility. + +--- + +## What is Jetformer? + +Jetformer is a continuous tokenization approach for images that uses **normalizing flows** and **Gaussian Mixture Models (GMM)** instead of discrete tokens. The original implementation (`scripts/jetformer/train_jetformer.py`) was a standalone model called `JetFormerLite`. + +### Key Components of Original Jetformer + +1. **Normalizing Flow (TinyFlow)**: + - RealNVP-style affine coupling layers operating on **image space** `[B, C, H, W]` + - Uses 2D convolutions with checkerboard masking + - Transforms raw images → latent `z` with log-determinant `logdet` + +2. **Patchification**: + - After flow, converts latent images `z [B, C, H, W]` → patch tokens `[B, T, D]` + - Each patch is a flattened 16×16×3 region (768-dimensional) + +3. 
**Autoregressive Transformer (TinyGPT)**: + - Standard decoder-only GPT architecture + - Processes sequence of patchified latent tokens + +4. **GMM Output Head**: + - Predicts parameters of a Gaussian Mixture Model for each token + - Outputs: mixture weights `logits_pi`, means `mu`, log-std `log_sigma` + +5. **Loss Function**: + - Negative log-likelihood: `loss = (NLL_GMM(z) - logdet).mean()` + - Can be **negative** when `logdet` is large (expected behavior) + +6. **Noise Curriculum**: + - Anneals noise added to tokens during training for stability + - `sigma = noise_max + (noise_min - noise_max) * epoch_frac` + +### Original Workflow + +``` +Raw Image [B,C,H,W] + ↓ uniform_dequantize + ↓ TinyFlow (image space) → z [B,C,H,W], logdet [B] + ↓ patchify → tokens [B,T,D] + ↓ add noise (curriculum) + ↓ project to embeddings + ↓ GPT + ↓ GMM Head + ↓ Loss: NLL_GMM(tokens[:,1:]) - logdet +``` + +--- + +## What is AstroPT? + +AstroPT is a **multimodal GPT framework** designed for astronomical data (galaxy images, spectra, etc.). It provides: + +1. **Unified GPT Backbone**: Single transformer architecture for all modalities +2. **Modality Registry**: Flexible system for managing different data types +3. **Multiple Tokenizers**: Originally supported **AIM** and **Affine** tokenizers +4. **Training Infrastructure**: Complete training loop with validation, checkpointing, logging + +### Original AstroPT Architecture + +- **Input**: Patch tokens `[B, T, D]` (produced by dataset) +- **Encoder**: Projects patches → embeddings `[B, T, n_embd]` +- **GPT**: Standard transformer with causal attention +- **Decoder**: Projects embeddings → patch predictions +- **Loss**: Huber loss for regression + +### Original Tokenizers + +1. **AIM Tokenizer**: + - Two-layer MLP: `Linear → GELU → Linear` + - Used for continuous patch tokens + +2. **Affine Tokenizer**: + - Single linear layer + - Simpler projection + +Both operate on **already-patchified** tokens `[B, T, D]` from the dataset. + +--- + +## Integration Goals + +The primary goal was to integrate Jetformer into AstroPT **without breaking existing functionality**, allowing users to choose between: +- `tokeniser="aim"` (original AIM) +- `tokeniser="affine"` (original Affine) +- `tokeniser="jetformer"` (new continuous tokenization) + +All three should: +- Share the same GPT backbone +- Use the same training infrastructure +- Support the same modalities (initially images-only for Jetformer) +- Maintain backward compatibility + +--- + +## Architectural Challenges + +### Challenge 1: Input Format Mismatch + +**Problem**: +- AstroPT expects **patch tokens** `[B, T, D]` as input +- Jetformer needs **raw images** `[B, C, H, W]` to apply the flow + +**Solution**: +- Created `JetformerImageEncoder` that accepts raw images +- Modified data loading to pass raw images when `tokeniser=="jetformer"` +- Added `prepare_batch_for_jetformer()` to swap patches for raw images + +### Challenge 2: Flow Operation Space + +**Problem**: +- Original Jetformer flows on **image space** `[B, C, H, W]` +- AstroPT's architecture assumes patch tokens `[B, T, D]` + +**Decision**: +- **Option A**: Flow on patch tokens (simpler integration) +- **Option B**: Flow on image space (matches original, better quality) + +**Chosen**: **Option B** - Flow on image space to match original JetFormerLite exactly. 
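+
+As a shape-level sketch of Option B (a hypothetical standalone snippet; the real pipeline lives in `TinyFlow2D` and `patchify()`, whose flattening convention may differ), the flow keeps the `[B, C, H, W]` layout and patchification only happens on the latent:
+
+```python
+import einops
+import torch
+
+B, C, H, W, p = 8, 3, 256, 256, 16
+x = torch.rand(B, C, H, W)             # dequantized image batch
+z, logdet = x.clone(), torch.zeros(B)  # stand-in for flow(x): z stays image-shaped
+tokens = einops.rearrange(z, "b c (h p1) (w p2) -> b (h w) (p1 p2 c)", p1=p, p2=p)
+print(tokens.shape, logdet.shape)      # torch.Size([8, 256, 768]) torch.Size([8])
+```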
+ +**Rationale**: +- Preserves original architecture and quality +- More principled (flows operate on natural image structure) +- Better matches the original paper's design + +### Challenge 3: Loss Function Difference + +**Problem**: +- AstroPT uses **Huber loss** (regression) +- Jetformer uses **GMM NLL - logdet** (likelihood-based, can be negative) + +**Solution**: +- Added conditional branch in `GPT._forward_native()` +- When `tokeniser=="jetformer"` and `modalities==["images"]`, use Jetformer loss path +- Otherwise, use original Huber loss path + +### Challenge 4: Teacher Forcing / Cheating Protection + +**Problem**: +- AstroPT's `process_modes()` slices inputs/targets for teacher forcing +- For Jetformer, we pass full raw images (no slicing) + +**Solution**: +- Added `images_is_raw` flag to skip slicing for Jetformer +- GPT's causal attention still enforces autoregressive property +- Loss uses `tokens[:, 1:]` as target (next-token prediction) + +### Challenge 5: Memory Usage + +**Problem**: +- Raw images `[B, C, H, W]` use more memory than patches `[B, T, D]` +- Multiple DataLoader workers multiply memory usage + +**Solution**: +- Reduced `num_workers` for Jetformer (2-4 instead of 16) +- Added `prefetch_factor` control +- Flow operations are efficient (invertible, no gradients stored) + +### Challenge 6: Reconstruction / Validation + +**Problem**: +- AstroPT validation expects patch tokens for visualization +- Jetformer needs to reconstruct from raw images + +**Solution**: +- Added `jetformer_reconstruct_images()` method +- Converts raw images → flow → patchify → GPT → GMM → depatchify → inverse flow +- Validation code branches to handle both formats + +--- + +## Design Decisions + +This section details the critical design decisions made during integration, explaining **why** each decision was necessary, **what options** were considered, and **what changes** each option would require. + +--- + +### Decision 1: Flow Location (F1 vs F2) + +**The Problem**: +Where should the normalizing flow live in the architecture? The flow transforms raw images `[B,C,H,W]` → latent `z [B,C,H,W]`, but AstroPT's encoder expects to process tokens. We need to decide where this transformation happens. 
+ +**Options Considered**: + +**F1: Flow inside `JetformerImageEncoder`** +- Flow is part of the encoder's `forward()` method +- Encoder accepts raw images, applies flow internally +- Flow → patchify → project happens sequentially in encoder + +**F2: Flow as separate preprocessing step** +- Flow exists as standalone module before encoder +- Training loop calls flow explicitly: `z = flow(x)`, then `tokens = patchify(z)`, then `embeddings = encoder(tokens)` +- Encoder only sees patchified tokens + +**What Each Option Changes**: + +**F1 Changes**: +- `JetformerImageEncoder.forward()` becomes: `x [B,C,H,W] → flow → z → patchify → tokens → project → embeddings` +- Encoder is responsible for entire transformation pipeline +- Training loop just calls `encoder(x)` with raw images +- **Code location**: All logic in `src/astropt/model.py` `JetformerImageEncoder` class + +**F2 Changes**: +- Training loop becomes: `z, logdet = flow(x)`, `tokens = patchify(z)`, `embeddings = encoder(tokens)` +- Encoder remains simple (just projects tokens) +- Flow and patchify are separate steps +- **Code location**: Flow logic in training script, encoder in model.py + +**Trade-offs**: + +| Aspect | F1 (Inside Encoder) | F2 (Separate Step) | +|--------|---------------------|---------------------| +| **Encapsulation** | ✅ All transformation logic in one place | ❌ Logic scattered across files | +| **Training Loop Complexity** | ✅ Simple: just `encoder(x)` | ❌ Complex: multiple steps | +| **Reusability** | ✅ Encoder is self-contained | ⚠️ Flow can be reused separately | +| **Matches Original** | ✅ Matches JetFormerLite design | ❌ Different from original | +| **Code Organization** | ✅ Follows AstroPT patterns | ❌ Breaks encoder abstraction | + +**Chosen**: **F1** - Flow inside `JetformerImageEncoder` + +**Why**: +- **Encapsulation**: All image→token transformation logic lives in one place, following AstroPT's encoder abstraction +- **Matches Original**: Original JetFormerLite has flow as part of the model, not separate preprocessing +- **Cleaner Training Loop**: Training script doesn't need to know about flow/patchify details +- **Consistency**: Other encoders (AIM/Affine) also handle their own transformations internally + +**Implementation Impact**: +```python +# With F1 (chosen): +class JetformerImageEncoder(Encoder): + def forward(self, x): # x is [B,C,H,W] + z, logdet = self.flow(x) # Flow inside encoder + tokens = patchify(z) + # ... rest of transformation + +# Training loop: +embeddings = encoder(raw_images) # Simple! + +# With F2 (not chosen): +# Training loop would be: +z, logdet = flow(raw_images) # Flow outside encoder +tokens = patchify(z) +embeddings = encoder(tokens) # Encoder only projects +``` + +--- + +### Decision 2: Noise Schedule Management (L1 vs L2 vs L3) + +**The Problem**: +Jetformer needs a noise curriculum that anneals from `noise_max` to `noise_min` over training. The noise fraction `epoch_frac` must be computed from `iter_num` and `max_iters`, then passed to the encoder. How should this be managed? 
+ +**Options Considered**: + +**L1: Pass `epoch_frac` as parameter to `GPT.forward()`** +- `GPT.forward(inputs, targets, epoch_frac=None)` +- Training loop computes `epoch_frac = iter_num / max_iters` +- Passes it through: `model(B["X"], targets=B["Y"], epoch_frac=epoch_frac)` +- GPT forwards it to encoder + +**L2: Compute `epoch_frac` inside GPT using global iteration counter** +- GPT maintains `self.current_iter` and `self.max_iters` +- Training loop calls `model.set_iteration(iter_num, max_iters)` before forward +- GPT computes `epoch_frac` internally and passes to encoder + +**L3: Dedicated setter method `set_jetformer_schedule()`** +- GPT has `set_jetformer_schedule(iter_num, max_iters)` method +- Training loop calls it before forward pass +- GPT computes `epoch_frac` and stores in `self.jet_epoch_frac` +- Encoder reads from `self.jet_epoch_frac` (threaded via `set_jet_epoch_frac()`) + +**What Each Option Changes**: + +**L1 Changes**: +- `GPT.forward()` signature: `forward(self, inputs, targets=None, epoch_frac=None, ...)` +- All forward calls must pass `epoch_frac` (even when None for non-Jetformer) +- Encoder receives `epoch_frac` as parameter +- **Code location**: Changes in `model.py` forward signature, all call sites + +**L2 Changes**: +- GPT has `self.current_iter`, `self.max_iters` attributes +- Training loop: `model.set_iteration(iter_num, max_iters)` before each forward +- GPT computes `epoch_frac` and passes to encoder +- **Code location**: New method in GPT, training loop calls it + +**L3 Changes**: +- GPT has `self.jet_epoch_frac` attribute +- New method: `GPT.set_jetformer_schedule(iter_num, max_iters)` +- Training loop calls setter before forward +- Encoder reads from cached value +- **Code location**: New setter method, training loop calls it, encoder reads cached value + +**Trade-offs**: + +| Aspect | L1 (Parameter) | L2 (Internal Counter) | L3 (Setter Method) | +|--------|----------------|----------------------|-------------------| +| **Forward Signature** | ❌ Changes signature (breaking) | ✅ Clean signature | ✅ Clean signature | +| **Training Loop** | ⚠️ Must pass parameter always | ⚠️ Must call setter always | ⚠️ Must call setter for Jetformer | +| **Flexibility** | ✅ Can vary per batch | ❌ Fixed per forward | ❌ Fixed per forward | +| **Backward Compat** | ❌ Changes all forward calls | ✅ No signature change | ✅ No signature change | +| **Clarity** | ⚠️ Parameter might be None | ✅ Explicit setter call | ✅ Explicit setter call | +| **State Management** | ✅ No state needed | ❌ GPT tracks iteration | ⚠️ GPT caches fraction | + +**Chosen**: **L3** - Dedicated setter method `set_jetformer_schedule()` + +**Why**: +- **Clean Signature**: `GPT.forward()` signature remains unchanged, maintaining backward compatibility +- **Explicit Control**: Training loop explicitly sets schedule, making curriculum visible +- **No Global State**: Doesn't require GPT to track iteration counter (simpler) +- **Flexible**: Can be called conditionally only for Jetformer runs +- **Matches Patterns**: Similar to how other hyperparameters are set (e.g., learning rate) + +**Implementation Impact**: +```python +# With L3 (chosen): +# Training loop: +if tokeniser == "jetformer": + model.set_jetformer_schedule(iter_num, max_iters) +loss = model(B["X"], targets=B["Y"]) # Clean signature + +# Inside GPT: +def set_jetformer_schedule(self, iter_num, max_iters): + self.jet_epoch_frac = iter_num / max_iters + # Propagate to encoder + for enc in self.encoders.values(): + if hasattr(enc, 'set_jet_epoch_frac'): 
+ enc.set_jet_epoch_frac(self.jet_epoch_frac) + +# With L1 (not chosen): +loss = model(B["X"], targets=B["Y"], epoch_frac=iter_num/max_iters) # Signature change! + +# With L2 (not chosen): +model.set_iteration(iter_num, max_iters) # GPT tracks iteration +loss = model(B["X"], targets=B["Y"]) +``` + +--- + +### Decision 3: What Space Does GPT Model? (Z1 vs Z2) + +**The Problem**: +After the flow transforms images `x → z`, we have latent images `z [B,C,H,W]`. But GPT operates on sequences of tokens. What should GPT actually model? + +**Options Considered**: + +**Z1: GPT models patchified `z` tokens** +- Flow: `x [B,C,H,W] → z [B,C,H,W]` +- Patchify: `z [B,C,H,W] → tokens [B,T,D]` (patchified z) +- GPT processes: `tokens [B,T,D]` +- GMM head predicts: parameters for `tokens[:,1:]` +- Loss targets: `tokens[:,1:]` (patchified z) + +**Z2: GPT models raw `z` images (before patchification)** +- Flow: `x [B,C,H,W] → z [B,C,H,W]` +- GPT processes: `z` directly (somehow flattened or reshaped) +- GMM head predicts: parameters for `z` +- Loss targets: `z` directly + +**What Each Option Changes**: + +**Z1 Changes**: +- Flow output `z [B,C,H,W]` is immediately patchified +- GPT sees patch tokens `[B,T,D]` (standard sequence format) +- GMM head outputs parameters for token space +- Loss compares predicted tokens to actual patchified z tokens +- **Code location**: Encoder patchifies after flow, GPT processes tokens normally + +**Z2 Changes**: +- Flow output `z [B,C,H,W]` must be reshaped for GPT (e.g., `z.view(B, -1, C*H*W)`) +- GPT processes flattened z images as sequence +- GMM head outputs parameters for image space +- Loss compares predicted z images to actual z images +- **Code location**: Encoder doesn't patchify, GPT processes reshaped z + +**Trade-offs**: + +| Aspect | Z1 (Patchified z) | Z2 (Raw z) | +|--------|-------------------|------------| +| **Matches Original** | ✅ Exact match to JetFormerLite | ❌ Different from original | +| **GPT Architecture** | ✅ Standard sequence processing | ❌ Non-standard input format | +| **Token Dimensionality** | ✅ Fixed: `D = C*patch*patch` | ❌ Variable: `C*H*W` (very large) | +| **Sequence Length** | ✅ Fixed: `T = (H/patch)*(W/patch)` | ❌ Very short: `T = 1` (or reshaped) | +| **GMM Head** | ✅ Predicts token-space distributions | ⚠️ Predicts image-space distributions | +| **Loss Computation** | ✅ Standard token-wise NLL | ⚠️ Image-wise NLL (different scale) | +| **Implementation Complexity** | ✅ Standard AstroPT patterns | ❌ Requires custom reshaping | + +**Chosen**: **Z1** - GPT models patchified `z` tokens + +**Why**: +- **Matches Original**: Original JetFormerLite patchifies z before GPT, this is the exact design +- **Standard Architecture**: GPT processes sequences of tokens, which is what patchified z provides +- **Consistent Dimensionality**: Token dimension `D = 768` (for 16×16×3 patches) is manageable for GMM +- **Proper Sequence Modeling**: Sequence length `T = 256` (for 256×256 images) allows proper autoregressive modeling +- **Implementation Simplicity**: Uses standard AstroPT token processing patterns + +**Implementation Impact**: +```python +# With Z1 (chosen): +# In JetformerImageEncoder: +z, logdet = self.flow(x) # z is [B,C,H,W] +tokens = patchify(z, patch_size) # tokens is [B,T,D] where T=256, D=768 +# GPT processes tokens normally +# GMM head predicts: (logits_pi, mu, log_sigma) for tokens +# Loss: NLL_GMM(tokens[:,1:], ...) 
- logdet + +# With Z2 (not chosen): +# In JetformerImageEncoder: +z, logdet = self.flow(x) # z is [B,C,H,W] +z_flat = z.view(B, 1, C*H*W) # Reshape to [B,1,196608] - huge! +# GPT processes z_flat (non-standard) +# GMM head predicts for 196608-dim space (impractical!) +# Loss: NLL_GMM(z, ...) - logdet +``` + +--- + +### Decision 4: Reconstruction API Design (S1 vs S2 vs S3) + +**The Problem**: +For validation/visualization, we need to reconstruct images. The reconstruction process involves: flow → patchify → GPT → GMM → depatchify → inverse flow. Where should this logic live and how should it be called? + +**Options Considered**: + +**S1: Reconstruction logic in training script `validate()` function** +- All reconstruction code in `scripts/train_jetformer.py` +- `validate()` function has full reconstruction pipeline +- Model only provides forward pass, script handles reconstruction + +**S2: Dedicated method `jetformer_reconstruct_images()` + branch in `validate()`** +- GPT has method: `jetformer_reconstruct_images(x_real)` +- Method encapsulates full reconstruction pipeline +- `validate()` calls this method when `tokeniser=="jetformer"` +- For other tokenizers, uses existing reconstruction logic + +**S3: Unified reconstruction method that handles all tokenizers** +- GPT has method: `reconstruct_images(x, tokeniser)` +- Method branches internally based on tokenizer +- `validate()` always calls same method + +**What Each Option Changes**: + +**S1 Changes**: +- Reconstruction code in `scripts/train_jetformer.py` `validate()` function +- Script directly calls: `flow()`, `patchify()`, `model()`, `GMM_head()`, `depatchify()`, `flow(reverse=True)` +- Script needs access to model internals (flow, GMM head) +- **Code location**: All in training script + +**S2 Changes**: +- GPT has method: `@torch.no_grad() def jetformer_reconstruct_images(self, x_real)` +- Method encapsulates: flow → patchify → GPT → GMM → depatchify → inverse flow +- `validate()` branches: if Jetformer, call `jetformer_reconstruct_images()`, else use existing logic +- **Code location**: Method in `model.py`, call in training script + +**S3 Changes**: +- GPT has unified method: `reconstruct_images(x, tokeniser)` +- Method has internal branches for each tokenizer +- `validate()` always calls `reconstruct_images()` regardless of tokenizer +- **Code location**: Unified method in `model.py` + +**Trade-offs**: + +| Aspect | S1 (Script Logic) | S2 (Dedicated Method) | S3 (Unified Method) | +|--------|-------------------|----------------------|---------------------| +| **Encapsulation** | ❌ Logic in script | ✅ Logic in model | ✅ Logic in model | +| **Code Reusability** | ❌ Hard to reuse | ✅ Easy to reuse | ✅ Easy to reuse | +| **Access to Internals** | ❌ Script needs model internals | ✅ Method has access | ✅ Method has access | +| **Validation Complexity** | ⚠️ Script handles branching | ⚠️ Script branches on tokenizer | ✅ No branching in script | +| **Backward Compat** | ✅ No model changes | ✅ New method, old code unchanged | ⚠️ Changes existing patterns | +| **Clarity** | ❌ Reconstruction scattered | ✅ Clear Jetformer-specific API | ⚠️ Unified but more complex | + +**Chosen**: **S2** - Dedicated method `jetformer_reconstruct_images()` + branch in `validate()` + +**Why**: +- **Encapsulation**: Reconstruction logic lives with the model, not scattered in training script +- **Access to Internals**: Method can access `self.encoders["images"].flow`, `self.jetformer_images_head`, etc. 
+- **Backward Compatibility**: Doesn't change existing reconstruction patterns for AIM/Affine +- **Clear API**: Method name makes it obvious this is Jetformer-specific +- **Flexibility**: Can easily extend to other reconstruction modes later + +**Implementation Impact**: +```python +# With S2 (chosen): +# In model.py: +@torch.no_grad() +def jetformer_reconstruct_images(self, x_real): + # Full reconstruction pipeline + z_real, _ = self.encoders["images"].flow(x_real, reverse=False) + tokens_real = patchify(z_real, ...) + # ... GPT forward ... + # ... GMM head ... + # ... depatchify ... + x_pred, _ = self.encoders["images"].flow(z_pred, reverse=True) + return x_pred + +# In validate(): +if tokeniser == "jetformer": + x_recon = model.jetformer_reconstruct_images(B["Y"]["images"]) +else: + # Existing AIM/Affine reconstruction + P = model(B["X"], B["Y"]) + +# With S1 (not chosen): +# In validate() - all logic here: +z_real, _ = model.encoders["images"].flow(B["Y"]["images"], reverse=False) +tokens_real = patchify(z_real, ...) +# ... need access to model internals ... +# Script becomes very complex + +# With S3 (not chosen): +# Unified method with internal branching: +def reconstruct_images(self, x, tokeniser): + if tokeniser == "jetformer": + # Jetformer reconstruction + elif tokeniser == "aim": + # AIM reconstruction + # ... but AIM/Affine don't have reconstruction methods currently +``` + +--- + +### Decision 5: Configuration Management (C1 vs C2) + +**The Problem**: +Jetformer has hyperparameters: `flow_steps`, `gmm_K`, `noise_max`, `noise_min`, `img_size`. Where should these be stored and how should they be passed to the model? + +**Options Considered**: + +**C1: All Jetformer fields on `GPTConfig`** +- `GPTConfig` dataclass has: `jetformer_flow_steps`, `jetformer_gmm_K`, `jetformer_noise_max`, `jetformer_noise_min`, `img_size` +- Training script passes via `model_args = dict(..., jetformer_flow_steps=4, ...)` +- Model reads from `config.jetformer_flow_steps`, etc. 
+ +**C2: Separate `JetformerConfig` dataclass** +- New dataclass: `@dataclass class JetformerConfig: flow_steps, gmm_K, ...` +- `GPTConfig` has: `jetformer_config: JetformerConfig | None = None` +- Training script: `jetformer_config = JetformerConfig(...)`, then `GPTConfig(..., jetformer_config=jetformer_config)` + +**C3: Pass as parameters to `JetformerImageEncoder` directly** +- `GPTConfig` doesn't have Jetformer fields +- `_init_native_backbone()` passes values directly: `JetformerImageEncoder(config, in_size, img_size=256, n_chan=3, ...)` +- Values come from training script via closure or global + +**What Each Option Changes**: + +**C1 Changes**: +- `GPTConfig` dataclass gets 5 new fields (all optional with defaults) +- Training script: `model_args = dict(..., jetformer_flow_steps=4, ...)` +- Model initialization: `GPTConfig(**model_args)` automatically includes fields +- Model code: `self.encoders["images"] = JetformerImageEncoder(..., steps=config.jetformer_flow_steps)` +- **Code location**: Fields in `model.py` `GPTConfig`, passed via `model_args` + +**C2 Changes**: +- New file or section: `JetformerConfig` dataclass +- `GPTConfig` has: `jetformer_config: JetformerConfig | None` +- Training script: Creates `JetformerConfig`, passes to `GPTConfig` +- Model code: `if config.jetformer_config: steps = config.jetformer_config.flow_steps` +- **Code location**: New config class, nested in `GPTConfig` + +**C3 Changes**: +- `GPTConfig` unchanged +- Training script: Stores values in variables +- Model initialization: Hardcodes or reads from somewhere: `JetformerImageEncoder(..., steps=4)` +- **Code location**: Values passed directly, not via config + +**Trade-offs**: + +| Aspect | C1 (GPTConfig Fields) | C2 (Separate Config) | C3 (Direct Parameters) | +|--------|----------------------|---------------------|----------------------| +| **Centralization** | ✅ All config in one place | ⚠️ Nested config | ❌ Scattered | +| **Type Safety** | ✅ Dataclass fields | ✅ Separate dataclass | ❌ No type checking | +| **Checkpointing** | ✅ Saved in config | ✅ Saved in config | ❌ Not saved | +| **Default Values** | ✅ Easy with dataclass | ✅ Easy with dataclass | ⚠️ Manual defaults | +| **Code Complexity** | ✅ Simple, flat structure | ⚠️ Nested access | ✅ Simple but not saved | +| **Consistency** | ✅ Matches existing patterns | ⚠️ New pattern | ❌ Different pattern | + +**Chosen**: **C1** - All Jetformer fields on `GPTConfig` + +**Why**: +- **Consistency**: Matches how other AstroPT config is structured (flat dataclass) +- **Checkpointing**: Config is saved/loaded automatically, Jetformer hyperparameters preserved +- **Simplicity**: No nested configs, easy to access `config.jetformer_flow_steps` +- **Type Safety**: Dataclass provides type hints and validation +- **Default Values**: Easy to provide defaults: `jetformer_flow_steps: int = 4` + +**Implementation Impact**: +```python +# With C1 (chosen): +# In model.py: +@dataclass +class GPTConfig: + # ... existing fields ... + jetformer_flow_steps: int = 4 + jetformer_gmm_K: int = 4 + jetformer_noise_max: float = 0.1 + jetformer_noise_min: float = 0.0 + img_size: int = 256 + +# Training script: +model_args = dict( + # ... other args ... + jetformer_flow_steps=4, + jetformer_gmm_K=4, + # ... +) +gptconf = GPTConfig(**model_args) + +# Model code: +self.encoders["images"] = JetformerImageEncoder( + config, in_size, + img_size=config.img_size, # From config! + # ... +) + +# With C2 (not chosen): +@dataclass +class JetformerConfig: + flow_steps: int = 4 + # ... 
+ +@dataclass +class GPTConfig: + # ... + jetformer_config: JetformerConfig | None = None + +# More complex, nested access + +# With C3 (not chosen): +# No config fields, values passed directly: +JetformerImageEncoder(config, in_size, img_size=256, ...) # Hardcoded! +# Not saved in checkpoints +``` + +--- + +### Decision 6: Modality Support (M1 vs M2) + +**The Problem**: +Should Jetformer support multiple modalities (images + spectra) in the same run, or should it be images-only initially? + +**Options Considered**: + +**M1: Multi-modal Jetformer (images + spectra in same run)** +- Jetformer can be used for images while other modalities use AIM/Affine +- Single forward pass processes mixed modalities +- Loss combines Jetformer loss (images) + Huber loss (spectra) + +**M2: Jetformer-only runs (images modality only initially)** +- When `tokeniser=="jetformer"`, only images modality is supported +- Other modalities are not included in the run +- Simpler implementation, can extend later + +**What Each Option Changes**: + +**M1 Changes**: +- `_forward_native()` must handle mixed modalities +- Jetformer loss path only for images, Huber loss for other modalities +- Need to combine losses: `loss = loss_images + loss_spectra` +- Encoder selection: `JetformerImageEncoder` for images, `Encoder` for spectra +- **Code location**: Complex branching in `_forward_native()`, loss combination logic + +**M2 Changes**: +- `_forward_native()` checks: `if tokeniser=="jetformer" and modalities==["images"]` +- Only images modality allowed when using Jetformer +- Single loss path (Jetformer loss only) +- Simpler encoder selection (always `JetformerImageEncoder` for images) +- **Code location**: Simple conditional in `_forward_native()` + +**Trade-offs**: + +| Aspect | M1 (Multi-modal) | M2 (Images-only) | +|--------|------------------|------------------| +| **Flexibility** | ✅ Can mix modalities | ❌ Only images | +| **Implementation Complexity** | ❌ Complex loss combination | ✅ Simple, single path | +| **Testing** | ❌ Need to test all combinations | ✅ Test images only | +| **Extensibility** | ✅ Already supports extension | ⚠️ Need to add later | +| **User Confusion** | ⚠️ Which tokenizer for which modality? | ✅ Clear: Jetformer = images | +| **Code Maintainability** | ❌ More complex branching | ✅ Simpler code | + +**Chosen**: **M2** - Jetformer-only runs (images modality only initially) + +**Why**: +- **Simplicity**: Much simpler initial implementation, easier to debug and test +- **Clear Semantics**: When `tokeniser=="jetformer"`, it's clear this is for images +- **Incremental Development**: Can add multi-modal support later once images work perfectly +- **Reduced Risk**: Fewer code paths = fewer bugs, easier to maintain +- **Matches Original**: Original JetFormerLite was images-only + +**Implementation Impact**: +```python +# With M2 (chosen): +# In _forward_native(): +if (self.config.backbone == "native" + and self.config.tokeniser == "jetformer" + and self.modality_registry.names() == ["images"]): + # Jetformer loss path - simple! + loss = (nll_gmm - logdet).mean() + return outputs, loss + +# Training script: +modalities = [ModalityConfig(name="images", ...)] # Only images + +# With M1 (not chosen): +# In _forward_native(): +if self.config.tokeniser == "jetformer": + loss_images = None + loss_spectra = None + if "images" in modalities: + # Compute Jetformer loss + loss_images = (nll_gmm - logdet).mean() + if "spectra" in modalities: + # Compute Huber loss + loss_spectra = huber_loss(...) 
+ # Combine losses - complex! + loss = loss_images + loss_spectra + return outputs, loss +``` + +--- + +### Decision 7: Training Loop Modifications (T1 vs T2) + +**The Problem**: +The training loop needs to call `set_jetformer_schedule()` to update the noise curriculum. How much should the training loop change? + +**Options Considered**: + +**T1: Minimal changes + `set_jetformer_schedule()` call** +- Training loop adds: `if tokeniser=="jetformer": model.set_jetformer_schedule(...)` +- All other logic unchanged +- Forward pass, loss computation, etc. all handled in model + +**T2: Extensive changes to handle Jetformer-specific logic** +- Training loop branches on tokenizer +- Different forward calls, different loss handling, different validation +- More Jetformer-specific code in training script + +**What Each Option Changes**: + +**T1 Changes**: +- Training loop adds ~3 lines: check tokenizer, call setter +- All forward/backward logic unchanged: `loss = model(B["X"], targets=B["Y"])` +- Model handles all Jetformer-specific logic internally +- **Code location**: Minimal changes in training script, logic in model + +**T2 Changes**: +- Training loop has branches: `if tokeniser=="jetformer": ... else: ...` +- Different forward calls for different tokenizers +- Different loss handling, different validation logic +- **Code location**: Extensive changes in training script + +**Trade-offs**: + +| Aspect | T1 (Minimal Changes) | T2 (Extensive Changes) | +|--------|---------------------|----------------------| +| **Training Script Complexity** | ✅ Simple, minimal changes | ❌ Complex, many branches | +| **Code Maintainability** | ✅ Logic in model, script simple | ❌ Logic scattered | +| **Backward Compatibility** | ✅ Existing code mostly unchanged | ⚠️ More changes to review | +| **Encapsulation** | ✅ Model handles its own logic | ❌ Script knows tokenizer details | +| **Flexibility** | ⚠️ Less control in script | ✅ More control in script | + +**Chosen**: **T1** - Minimal changes + `set_jetformer_schedule()` call + +**Why**: +- **Encapsulation**: All Jetformer logic lives in the model, training script stays simple +- **Maintainability**: Easier to maintain - changes to Jetformer don't require training script changes +- **Consistency**: Matches AstroPT pattern - model handles complexity, script is simple +- **Backward Compatibility**: Existing training scripts mostly unchanged + +**Implementation Impact**: +```python +# With T1 (chosen): +# Training loop - minimal changes: +raw_model = model.module if ddp else model +if raw_model.config.tokeniser == "jetformer": + raw_model.set_jetformer_schedule(iter_num, max_iters) # Only addition! + +loss = model(B["X"], targets=B["Y"]) # Same as before +loss.backward() # Same as before +# ... rest unchanged + +# With T2 (not chosen): +# Training loop - extensive changes: +if tokeniser == "jetformer": + raw_model.set_jetformer_schedule(iter_num, max_iters) + # Different forward? + # Different loss handling? + # Different validation? +else: + # Original logic + # ... +# Many branches, complex code +``` + +--- + +## Summary of Design Decisions + +Each decision was made to balance: +1. **Matching Original**: Preserving JetFormerLite's design and quality +2. **AstroPT Integration**: Fitting cleanly into existing architecture +3. **Backward Compatibility**: Not breaking AIM/Affine tokenizers +4. **Code Maintainability**: Keeping code simple and maintainable +5. 
**Future Extensibility**: Allowing easy extension later + +The chosen options (F1, L3, Z1, S2, C1, M2, T1) create a clean, maintainable integration that preserves both the original JetFormerLite quality and AstroPT's flexibility. + +--- + +## Implementation Details + +### New Files Created + +1. **`src/astropt/jetformer.py`**: + - `TinyFlow2D`: 2D image-space flow (from original) + - `TinyFlow1D`: 1D patch-space flow (kept for potential future use) + - `GMMHead`: GMM output head + - `gmm_nll()`: Negative log-likelihood computation + - `patchify()` / `depatchify()`: Image ↔ token conversion + - `uniform_dequantize()`: Continuous dequantization + +### Modified Files + +1. **`src/astropt/model.py`**: + - Added `JetformerImageEncoder` class + - Added `jetformer_images_head` in `_init_native_backbone()` + - Added Jetformer loss path in `_forward_native()` + - Added `set_jetformer_schedule()` method + - Added `jetformer_reconstruct_images()` method + - Added Jetformer config fields to `GPTConfig` + +2. **`scripts/train_jetformer.py`**: + - Added `prepare_batch_for_jetformer()` helper + - Modified `process_galaxy_wrapper()` to return raw images + - Updated validation to handle raw images + - Added schedule setter call in training loop + - Fixed loss plotting for negative values (symlog scale) + +3. **`src/astropt/local_datasets.py`**: + - Modified `process_modes()` to skip slicing for raw images + - Added `images_is_raw` flag handling + +--- + +## Changes to Jetformer + +### 1. Modularization + +**Original**: Monolithic `JetFormerLite` class with everything inside + +**Changed**: Split into reusable components: +- `TinyFlow2D` in `jetformer.py` +- `GMMHead` in `jetformer.py` +- `JetformerImageEncoder` in `model.py` + +**Reason**: Better integration with AstroPT's modular architecture + +### 2. Encoder Integration + +**Original**: `in_proj` + `pos` embeddings inside `JetFormerLite` + +**Changed**: Flow → patchify → project in `JetformerImageEncoder`, then standard AstroPT embedding + +**Reason**: Reuse AstroPT's embedding infrastructure + +### 3. Loss Computation + +**Original**: Loss computed inside `JetFormerLite.forward()` + +**Changed**: Loss computed in `GPT._forward_native()` with conditional branching + +**Reason**: Unified loss handling across all tokenizers + +### 4. Noise Curriculum + +**Original**: `epoch_frac` passed as parameter to `forward()` + +**Changed**: `set_jetformer_schedule()` method + cached `jet_epoch_frac` + +**Reason**: Cleaner API, matches AstroPT patterns + +### 5. Reconstruction + +**Original**: `sample()` method with reconstruction path + +**Changed**: `jetformer_reconstruct_images()` method + +**Reason**: Better separation, matches AstroPT validation patterns + +### 6. Data Loading + +**Original**: Custom dataloader with `.pt` shards + +**Changed**: Works with AstroPT's `GalaxyImageDataset` and HuggingFace datasets + +**Reason**: Unified data infrastructure + +--- + +## Changes to AstroPT + +### 1. Encoder System + +**Original**: Single `Encoder` class for all tokenizers + +**Changed**: +- `Encoder` remains for AIM/Affine +- `JetformerImageEncoder` inherits from `Encoder` for Jetformer + +**Impact**: Backward compatible, AIM/Affine unchanged + +### 2. Input Handling + +**Original**: Always expects patch tokens `[B, T, D]` + +**Changed**: +- `JetformerImageEncoder` accepts raw images `[B, C, H, W]` +- Data loading branches to provide appropriate format + +**Impact**: Requires `prepare_batch_for_jetformer()` in training script + +### 3. 
Loss Computation + +**Original**: Single Huber loss path + +**Changed**: +- Conditional branch: Jetformer uses GMM NLL - logdet +- Other tokenizers use original Huber loss + +**Impact**: Loss values can be negative for Jetformer (expected) + +### 4. Configuration + +**Original**: `GPTConfig` with AIM/Affine fields + +**Changed**: Added Jetformer fields: +- `jetformer_flow_steps`: Number of coupling layers (default: 4) +- `jetformer_gmm_K`: Number of GMM components (default: 4) +- `jetformer_noise_max`: Maximum noise (default: 0.1) +- `jetformer_noise_min`: Minimum noise (default: 0.0) +- `img_size`: Image size for flow (default: 256) + +**Impact**: New config fields, defaults provided + +### 5. Data Pipeline + +**Original**: `process_modes()` always slices for teacher forcing + +**Changed**: +- Checks `images_is_raw` flag +- Skips slicing for raw images (Jetformer) +- Still slices for patch tokens (AIM/Affine) + +**Impact**: Maintains teacher forcing for all tokenizers + +### 6. Validation / Visualization + +**Original**: Expects patch tokens for plotting + +**Changed**: +- Branches based on tokenizer +- Jetformer: works with raw images, converts for visualization +- AIM/Affine: original patch-based visualization + +**Impact**: Validation code handles both formats + +### 7. Loss Plotting + +**Original**: Log scale (only positive values) + +**Changed**: Symlog scale (handles negative values) + +**Impact**: Can visualize negative losses (expected for Jetformer) + +--- + +## Data Flow Comparison + +### AIM / Affine Tokenizer (Original) + +``` +Dataset → Patches [B,T,D] + ↓ +Encoder (AIM/Affine projection) + ↓ +Embeddings [B,T,n_embd] + ↓ +GPT (causal attention) + ↓ +Hidden States [B,T,n_embd] + ↓ +Decoder (AIM/Affine projection) + ↓ +Predictions [B,T,D] + ↓ +Huber Loss +``` + +### Jetformer Tokenizer (New) + +``` +Dataset → Raw Images [B,C,H,W] + ↓ +JetformerImageEncoder: + - uniform_dequantize + - TinyFlow2D → z [B,C,H,W], logdet [B] + - patchify → tokens [B,T,D] + - add noise (curriculum) + - project to embeddings + ↓ +Embeddings [B,T,n_embd] + ↓ +GPT (causal attention) + ↓ +Hidden States [B,T,n_embd] + ↓ +GMM Head → (logits_pi, mu, log_sigma) + ↓ +Loss: NLL_GMM(tokens[:,1:]) - logdet +``` + +**Key Differences**: +1. Jetformer operates on raw images initially +2. Flow happens before patchification +3. Loss is likelihood-based (can be negative) +4. Noise curriculum for training stability + +--- + +## Usage + +### Training with Jetformer + +```python +# In train_jetformer.py or similar +tokeniser = "jetformer" + +model_args = dict( + # ... standard AstroPT args ... 
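+    # Jetformer-specific hyperparameters follow; they are only read when
+    # tokeniser == "jetformer" and backbone == "native" (defaults are set in GPTConfig).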
+ tokeniser=tokeniser, + jetformer_flow_steps=4, + jetformer_gmm_K=4, + jetformer_noise_max=0.1, + jetformer_noise_min=0.0, + img_size=256, +) + +# Model initialization +gptconf = GPTConfig(**model_args) +model = GPT(gptconf, modality_registry) +``` + +### Training Loop + +```python +# Before forward pass +raw_model = model.module if ddp else model +if raw_model.config.tokeniser == "jetformer": + raw_model.set_jetformer_schedule(iter_num, max_iters) + +# Forward pass (handles Jetformer automatically) +logits, loss = model(B["X"], targets=B["Y"]) +``` + +### Data Loading + +```python +# Prepare batch (swaps patches for raw images if Jetformer) +batch_raw = next(dataloader) +batch_raw = prepare_batch_for_jetformer(batch_raw, tokeniser) +B = dataset.process_modes(batch_raw, modality_registry, device) +``` + +### Validation / Reconstruction + +```python +# Reconstruction for visualization +if tokeniser == "jetformer": + x_recon = model.jetformer_reconstruct_images(B["Y"]["images"]) + # x_recon is [B, C, H, W] +``` + +--- + +## Technical Notes + +### Why Flow on Image Space? + +The original JetFormerLite flows on image space `[B, C, H, W]` before patchification. This is more principled because: +1. Flows operate on natural image structure (spatial relationships) +2. Better matches the original paper's design +3. Preserves image-level statistics before tokenization + +Alternative (flow on patches) would be simpler but loses these benefits. + +### Why Negative Loss? + +Jetformer loss is `NLL_GMM - logdet`. When the flow's log-determinant is large (good flow fit), `logdet` can exceed `NLL_GMM`, making loss negative. This is **expected and correct** - it indicates the model is learning a good latent representation. + +### Memory Considerations + +Raw images use more memory than patches: +- Image: `B × 3 × 256 × 256 × 4 bytes = B × 786KB` +- Patches: `B × 256 × 768 × 4 bytes = B × 786KB` (same size, but different access pattern) + +The main issue is DataLoader workers - reduce `num_workers` for Jetformer (2-4 instead of 16). + +### Teacher Forcing + +Even though Jetformer receives full raw images (no slicing), teacher forcing is maintained via: +1. GPT's causal attention mask (can't see future tokens) +2. Loss targets are `tokens[:, 1:]` (next-token prediction) +3. First token is copied from real data in reconstruction + +### Compatibility + +- **AIM/Affine tokenizers**: Completely unchanged, work exactly as before +- **Other modalities**: Unaffected (Jetformer is images-only initially) +- **Training infrastructure**: Shared, no changes needed +- **Checkpointing**: Compatible (Jetformer fields in config) + +--- + +## Summary + +The integration of Jetformer into AstroPT successfully: + +1. ✅ Preserves original JetFormerLite architecture (flow on image space) +2. ✅ Maintains full backward compatibility (AIM/Affine unchanged) +3. ✅ Shares GPT backbone across all tokenizers +4. ✅ Uses unified training infrastructure +5. ✅ Handles negative losses correctly +6. ✅ Supports proper reconstruction/validation + +The key insight was to **encapsulate Jetformer-specific logic** in `JetformerImageEncoder` and conditional branches, while keeping the core GPT architecture unchanged. This allows all three tokenizers to coexist seamlessly. + +--- + +## Future Work + +Potential extensions: +1. **Multi-modal Jetformer**: Extend to spectra and other modalities +2. **Mixed tokenizers**: Use different tokenizers for different modalities +3. **Flow improvements**: Experiment with different flow architectures +4. 
**GMM head variants**: Try different mixture models +5. **Memory optimization**: Further reduce memory footprint for large batches + +--- + +*This integration was completed with careful attention to preserving both the original JetFormerLite design and AstroPT's flexibility. All changes are backward compatible and well-documented.* + diff --git a/src/astropt/jetformer.py b/src/astropt/jetformer.py new file mode 100644 index 0000000..4de962f --- /dev/null +++ b/src/astropt/jetformer.py @@ -0,0 +1,367 @@ +import math +from dataclasses import dataclass + +import torch +import torch.nn as nn +import torch.nn.functional as F + + +@dataclass +class TinyFlow1DConfig: + """Configuration for TinyFlow1D operating on patch tokens.""" + + dim: int + steps: int = 4 + hidden_dim: int = 128 + + +class CouplingMLP(nn.Module): + """RealNVP-style affine coupling over feature dimension for 1D tokens.""" + + def __init__(self, dim: int, hidden_dim: int) -> None: + super().__init__() + self.dim = dim + self.hidden_dim = hidden_dim + self.split = dim // 2 + self.net = nn.Sequential( + nn.Linear(self.split, hidden_dim), + nn.GELU(), + nn.Linear(hidden_dim, 2 * (dim - self.split)), + ) + + def forward( + self, x: torch.Tensor, reverse: bool = False, flip: bool = False + ) -> tuple[torch.Tensor, torch.Tensor]: + """ + x: [*, D] + flip=False: x1 is identity, x2 is transformed. + flip=True: x2 is identity, x1 is transformed. + """ + if not flip: + x1 = x[..., : self.split] + x2 = x[..., self.split :] + st = self.net(x1) + s, t = st.chunk(2, dim=-1) + s = torch.tanh(s) * 1.5 + if not reverse: + y2 = x2 * torch.exp(s) + t + y = torch.cat([x1, y2], dim=-1) + logdet = s.sum(dim=-1) + else: + y2 = (x2 - t) * torch.exp(-s) + y = torch.cat([x1, y2], dim=-1) + logdet = -s.sum(dim=-1) + else: + x1 = x[..., : self.split] + x2 = x[..., self.split :] + st = self.net(x2) + s, t = st.chunk(2, dim=-1) + s = torch.tanh(s) * 1.5 + if not reverse: + y1 = x1 * torch.exp(s) + t + y = torch.cat([y1, x2], dim=-1) + logdet = s.sum(dim=-1) + else: + y1 = (x1 - t) * torch.exp(-s) + y = torch.cat([y1, x2], dim=-1) + logdet = -s.sum(dim=-1) + return y, logdet + + +class TinyFlow1D(nn.Module): + """ + A small RealNVP-style normalizing flow over patch tokens. + + Operates on [B, T, D] where each token is a D-dimensional vector. + The log-determinant is aggregated over all tokens to produce a + per-sample scalar logdet [B]. 
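+
+    Illustrative usage (a sketch; the shapes below assume dim=768,
+    i.e. flattened 16x16 RGB patches):
+
+        flow = TinyFlow1D(dim=768, steps=4)
+        z, logdet = flow(tokens)          # tokens [B, T, 768] -> z [B, T, 768], logdet [B]
+        x_rec, _ = flow(z, reverse=True)  # inverse pass recovers the original tokens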
+ """ + + def __init__(self, dim: int, steps: int = 4, hidden_dim: int = 128) -> None: + super().__init__() + self.dim = dim + self.steps = steps + self.blocks = nn.ModuleList( + [ + CouplingMLP(dim, hidden_dim) + for _ in range(steps) + ] + ) + + def forward( + self, x: torch.Tensor, reverse: bool = False + ) -> tuple[torch.Tensor, torch.Tensor]: + """ + x: [B, T, D] + Returns: + z: [B, T, D] + logdet: [B] + """ + B, T, D = x.shape + assert D == self.dim + + z = x + logdet = x.new_zeros(B) + + if not reverse: + for i, block in enumerate(self.blocks): + flip = (i % 2) == 1 + z_flat = z.reshape(B * T, D) + z_flat, ld_flat = block(z_flat, reverse=False, flip=flip) + z = z_flat.view(B, T, D) + ld = ld_flat.view(B, T).sum(dim=1) + logdet = logdet + ld + else: + for i, block in reversed(list(enumerate(self.blocks))): + flip = (i % 2) == 1 + z_flat = z.reshape(B * T, D) + z_flat, ld_flat = block(z_flat, reverse=True, flip=flip) + z = z_flat.view(B, T, D) + ld = ld_flat.view(B, T).sum(dim=1) + logdet = logdet + ld + + return z, logdet + + +class CouplingNet2D(nn.Module): + """A small convolutional network that predicts scale (s) and shift (t) for 2D image flows.""" + + def __init__(self, channels: int): + super().__init__() + self.net = nn.Sequential( + nn.Conv2d(channels, 128, kernel_size=3, padding=1), + nn.ReLU(), + nn.Conv2d(128, 128, kernel_size=3, padding=1), + nn.ReLU(), + nn.Conv2d(128, channels * 2, kernel_size=3, padding=1), # Output has 2x channels for s and t + ) + + def forward(self, x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: + st = self.net(x) + C = x.size(1) + s, t = st[:, :C], st[:, C:] # Split the output into scale and shift + s = torch.tanh(s) * 1.5 # Bound the scale for numerical stability + return s, t + + +class AffineCoupling2D(nn.Module): + """ + An affine coupling layer for 2D images. It splits the input using a mask. + One part is left unchanged (identity), and this part is used to predict + the scale/shift that will transform the *other* part of the input. + """ + + def __init__(self, in_ch: int, mask: torch.Tensor): + super().__init__() + self.register_buffer("mask", mask) # A binary mask (e.g., checkerboard) + self.net = CouplingNet2D(in_ch) + + def forward(self, x: torch.Tensor, reverse: bool = False) -> tuple[torch.Tensor, torch.Tensor]: + x_id = x * self.mask # The part that remains unchanged + s, t = self.net(x_id) # Predict s and t from the unchanged part + + if not reverse: # Forward pass: x -> z + # Transform the other part of x: y = x * scale + shift + y = x_id + (1 - self.mask) * (x * torch.exp(s) + t) + # The log-determinant is just the sum of the logs of the scale factors + logdet = ((1 - self.mask) * s).flatten(1).sum(dim=1) + return y, logdet + else: # Inverse pass: z -> x + # The inverse is cheap to compute: x = (y - shift) / scale + y = x_id + (1 - self.mask) * ((x - t) * torch.exp(-s)) + logdet = -((1 - self.mask) * s).flatten(1).sum(dim=1) + return y, logdet + + +def checker_mask(C: int, H: int, W: int, flip: bool = False, device: str = "cpu") -> torch.Tensor: + """Creates a checkerboard mask where half the channels are masked.""" + m = torch.zeros(1, C, H, W, device=device) + m[:, ::2, :, :] = 1.0 # Mask even-indexed channels + return 1.0 - m if flip else m + + +class TinyFlow2D(nn.Module): + """ + A stack of Affine Coupling layers for 2D images. + By alternating the mask between layers, we ensure all dimensions get transformed. + + Operates on [B, C, H, W] image tensors. 
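+
+    Illustrative usage (a sketch; shapes assume 3-channel 256x256 images):
+
+        flow = TinyFlow2D(in_ch=3, img_size=256, steps=4)
+        z, logdet = flow(x)               # x [B, 3, 256, 256] -> z [B, 3, 256, 256], logdet [B]
+        x_rec, _ = flow(z, reverse=True)  # inverse transform recovers x up to float precision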
+ """ + + def __init__(self, in_ch: int, img_size: int, steps: int = 4): + super().__init__() + self.in_ch = in_ch + self.img_size = img_size + self.steps = steps + self.blocks = nn.ModuleList( + [ + AffineCoupling2D( + in_ch, checker_mask(in_ch, img_size, img_size, flip=(k % 2 == 1)) + ) + for k in range(steps) + ] + ) + + def forward(self, x: torch.Tensor, reverse: bool = False) -> tuple[torch.Tensor, torch.Tensor]: + """ + Args: + x: [B, C, H, W] image tensor + reverse: If True, apply inverse transformation + + Returns: + z: [B, C, H, W] transformed image + logdet: [B] per-sample log-determinant + """ + logdet = x.new_zeros(x.size(0)) + z = x + + # Apply the sequence of transformations + if not reverse: + for b in self.blocks: + z, ld = b(z, reverse=False) + logdet += ld + else: # Apply in reverse for the inverse pass + for b in reversed(self.blocks): + z, ld = b(z, reverse=True) + logdet += ld + return z, logdet + + +class GMMHead(nn.Module): + """ + GMM head for continuous tokens. + + Takes Transformer hidden states [B, T, d_model] and predicts parameters + of a Gaussian Mixture Model over token space of dimension D. + """ + + def __init__(self, d_model: int, d_token: int, K: int) -> None: + super().__init__() + self.K = K + self.D = d_token + self.proj = nn.Linear(d_model, K * (1 + 2 * d_token)) + + def forward( + self, h: torch.Tensor + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """ + h: [B, T, d_model] + Returns: + logits_pi: [B, T, K] + mu: [B, T, K, D] + log_sigma: [B, T, K, D] + """ + B, T, _ = h.shape + out = self.proj(h).view(B, T, self.K, 1 + 2 * self.D) + logits_pi = out[..., 0] + mu = out[..., 1 : 1 + self.D] + log_sigma = out[..., 1 + self.D :] + log_sigma = torch.clamp(log_sigma, -7, 2) + return logits_pi, mu, log_sigma + + +def gmm_nll( + y: torch.Tensor, + logits_pi: torch.Tensor, + mu: torch.Tensor, + log_sigma: torch.Tensor, +) -> torch.Tensor: + """ + Negative log-likelihood of targets y under predicted GMM. + + Args: + y: [B, T, D] target tokens (e.g. latent z at positions 1..T-1) + logits_pi: [B, T, K] + mu: [B, T, K, D] + log_sigma: [B, T, K, D] + + Returns: + nll: [B] per-sample negative log-likelihood summed over tokens. + """ + B, T, D = y.shape + K = logits_pi.size(-1) + + y_exp = y.unsqueeze(2) # [B, T, 1, D] + inv_var = torch.exp(-2 * log_sigma) + logp = ( + -0.5 * ((y_exp - mu) ** 2 * inv_var).sum(dim=-1) + - log_sigma.sum(dim=-1) + - 0.5 * D * math.log(2 * math.pi) + ) # [B, T, K] + + logmix = F.log_softmax(logits_pi, dim=-1) + logp # [B, T, K] + loglik = torch.logsumexp(logmix, dim=-1).sum(dim=-1) # [B] + return -loglik + + +def uniform_dequantize(x: torch.Tensor) -> torch.Tensor: + """ + Takes a tensor of pixel values (scaled 0-1) and makes them continuous. + It adds a tiny amount of uniform noise, breaking the discrete nature of pixel values. + This is a crucial step for training continuous models like normalizing flows. + + Args: + x: Tensor in [0, 1] range (can be uint8 or float) + + Returns: + Dequantized tensor in [0, 1] range with uniform noise added. + """ + if x.dtype == torch.uint8: + x = x.float() / 255.0 + return (x + torch.rand_like(x) / 256.0).clamp(0.0, 1.0) + + +def patchify(x: torch.Tensor, patch_size: int = 16) -> torch.Tensor: + """ + Converts a batch of images into a sequence of flattened patches (tokens). + It slices the image into a grid and then flattens each patch. 
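+    For example (illustrative), a [B, 3, 256, 256] image with patch_size=16
+    becomes a [B, 256, 768] token sequence: a 16x16 grid of patches, each
+    flattened to 3 * 16 * 16 = 768 values.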
+ + Args: + x: [B, C, H, W] image tensor + patch_size: Size of each patch (default 16) + + Returns: + tokens: [B, N, D] where N = (H//patch_size) * (W//patch_size), + D = C * patch_size * patch_size + """ + B, C, H, W = x.shape + assert H % patch_size == 0 and W % patch_size == 0, ( + f"Image dimensions must be divisible by the patch size. " + f"Got H={H}, W={W}, patch_size={patch_size}" + ) + + # Use 'unfold' to create sliding blocks (patches) across height and width + x = x.unfold(2, patch_size, patch_size).unfold(3, patch_size, patch_size) + + # Reshape and flatten to get the final sequence of tokens + x = x.contiguous().permute(0, 2, 3, 1, 4, 5).reshape(B, -1, C * patch_size * patch_size) + return x + + +def depatchify(tokens: torch.Tensor, C: int = 3, H: int = 256, W: int = 256, patch_size: int = 16) -> torch.Tensor: + """ + The exact inverse of the 'patchify' function. + Converts a sequence of tokens back into an image format. + + Args: + tokens: [B, N, D] where N = (H//patch_size) * (W//patch_size), + D = C * patch_size * patch_size + C: Number of channels + H: Image height + W: Image width + patch_size: Size of each patch (default 16) + + Returns: + x: [B, C, H, W] image tensor + """ + B, N, D = tokens.shape + hp, wp = H // patch_size, W // patch_size # Number of patches along height and width + + # Reshape the sequence back into a grid of patches + x = tokens.reshape(B, hp, wp, C, patch_size, patch_size) + + # Permute and reshape to reconstruct the final image + x = x.permute(0, 3, 1, 4, 2, 5).reshape(B, C, H, W) + return x + + diff --git a/src/astropt/local_datasets.py b/src/astropt/local_datasets.py index 1db3a1b..15d8a7d 100644 --- a/src/astropt/local_datasets.py +++ b/src/astropt/local_datasets.py @@ -173,14 +173,18 @@ def process_modes(x, modality_registry, device, shuf=False): X = {} Y = {} for ii, mode in enumerate(modes): + raw_flag = x.get(f"{mode}_is_raw", False) + X[mode] = x_on_device[mode] X[f"{mode}_positions"] = x_on_device[f"{mode}_positions"] Y[mode] = x_on_device[mode] - if ii == 0: - Y[mode] = Y[mode][:, 1:] - if len(modes) == 1: - X[mode] = X[mode][:, :-1] - X[f"{mode}_positions"] = X[f"{mode}_positions"][:, :-1] + + if not raw_flag: + if ii == 0: + Y[mode] = Y[mode][:, 1:] + if len(modes) == 1: + X[mode] = X[mode][:, :-1] + X[f"{mode}_positions"] = X[f"{mode}_positions"][:, :-1] return {"X": X, "Y": Y} diff --git a/src/astropt/model.py b/src/astropt/model.py index b2ff2bc..adf1fac 100644 --- a/src/astropt/model.py +++ b/src/astropt/model.py @@ -36,6 +36,15 @@ flex_attention_avail = False from torch.nn import functional as F +from astropt.jetformer import ( + GMMHead as JetformerGMMHead, + TinyFlow2D, + gmm_nll, + patchify, + depatchify, + uniform_dequantize, +) + @dataclass class ModalityConfig: @@ -292,6 +301,67 @@ def forward(self, x): return x +class JetformerImageEncoder(Encoder): + """Encoder for images when using Jetformer tokeniser. + + Matches original JetFormerLite workflow: + 1. Accepts raw images [B, C, H, W] + 2. Applies uniform_dequantize + 3. Flows on image space → z [B, C, H, W] + 4. Patchifies z → tokens [B, T, D] + 5. Adds noise to tokens (not z) + 6. Projects tokens to embeddings + + Caches patchified z tokens and logdet for loss computation. 
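+
+    Illustrative shapes (assuming img_size=256, n_chan=3, patch_size=16):
+        raw images [B, 3, 256, 256] -> z [B, 3, 256, 256] -> tokens [B, 256, 768]
+        -> embeddings [B, 256, n_embd]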
+ """ + + def __init__(self, config, in_size, img_size: int, n_chan: int, patch_size: int): + super().__init__(config, in_size) + self.img_size = img_size + self.n_chan = n_chan + self.patch_size = patch_size + # Flow operates on image space [B, C, H, W] + self.flow = TinyFlow2D(in_ch=n_chan, img_size=img_size, steps=config.jetformer_flow_steps) + self.jetformer_noise_max = config.jetformer_noise_max + self.jetformer_noise_min = config.jetformer_noise_min + self.jet_epoch_frac = 1.0 + # Cache patchified z tokens (not z directly) and logdet + self.last_tokens: torch.Tensor | None = None + self.last_logdet: torch.Tensor | None = None + + def set_jet_epoch_frac(self, frac: float) -> None: + self.jet_epoch_frac = float(frac) + + def forward(self, x): + # x: [B, C, H, W] raw images (for Jetformer) + # 1. Uniform dequantization + x = uniform_dequantize(x) + + # 2. Flow on image space → z [B, C, H, W] + z, logdet = self.flow(x, reverse=False) + + # 3. Patchify z → tokens [B, T, D] + tokens = patchify(z, self.patch_size) + self.last_tokens = tokens + self.last_logdet = logdet.detach() + + # 4. Add noise to tokens (not z) - matching original + sigma = self.jetformer_noise_max + ( + self.jetformer_noise_min - self.jetformer_noise_max + ) * self.jet_epoch_frac + if self.training and sigma > 0.0: + tokens = tokens + torch.randn_like(tokens) * sigma + + # 5. Project tokens to embeddings + if self.tokeniser == "affine": + return self.c_fc(tokens) + # AIM-style projection (same as Encoder) + tokens = self.c_fc(tokens) + tokens = new_gelu(tokens) + tokens = self.c_proj(tokens) + return tokens + + class Decoder(nn.Module): """base module to move from embedding space to data space""" @@ -338,7 +408,7 @@ class GPTConfig: dropout: float = 0.0 bias: bool = False # True: bias in Linears and LayerNorms, like GPT-2. 
False: a bit better and faster attn_type: str = "causal" # causal or prefix - tokeniser: str = "aim" # one of "aim" or "affine" + tokeniser: str = "aim" # one of "aim", "affine", or "jetformer" # LoRA params lora_r: int = 0 # rank, 0 disables LoRA lora_alpha: float = 2.0 @@ -347,6 +417,12 @@ class GPTConfig: # LLM specific parameters backbone: str = "native" # native or llm llm_model_name: str = None + # Jetformer-specific hyperparameters (used only when tokeniser == "jetformer" and backbone == "native") + jetformer_flow_steps: int = 4 + jetformer_gmm_K: int = 4 + jetformer_noise_max: float = 0.1 + jetformer_noise_min: float = 0.0 + img_size: int = 256 # Image size (H=W) for Jetformer image-space flow class GPT(nn.Module): @@ -361,6 +437,8 @@ def __init__( self.config = config self.modality_registry = modality_registry self.backbone = config.backbone + # Jetformer noise curriculum state (only used for native + jetformer) + self.jet_epoch_frac: float = 1.0 if self.backbone == "native": self._init_native_backbone(config) @@ -400,6 +478,20 @@ def __init__( print(f"Total parameters: {total_params / 1e6:.2f}M") print(f"Trainable parameters: {trainable_params / 1e6:.2f}M") + def set_jetformer_schedule(self, iter_num: int, max_iters: int) -> None: + """Set Jetformer noise schedule fraction based on global iteration.""" + if max_iters <= 0: + self.jet_epoch_frac = 1.0 + return + frac = float(iter_num) / float(max_iters) + frac = max(0.0, min(1.0, frac)) + self.jet_epoch_frac = frac + # Propagate to Jetformer encoders if present + if hasattr(self, "encoders"): + for enc in self.encoders.values(): + if hasattr(enc, "set_jet_epoch_frac"): + enc.set_jet_epoch_frac(self.jet_epoch_frac) + def get_num_params(self): """Return the number of parameters in the model.""" return sum(p.numel() for p in self.parameters()) @@ -419,7 +511,15 @@ def _init_native_backbone(self, config): decoders = {} embedders = {} for name, mod_config in self.modality_registry.modalities.items(): - if mod_config.vocab_size > 0: + if config.tokeniser == "jetformer" and name == "images": + # For Jetformer, need image dimensions and patch size + img_size = config.img_size + n_chan = config.n_chan + patch_size = mod_config.patch_size + encoders[name] = JetformerImageEncoder( + config, mod_config.input_size, img_size, n_chan, patch_size + ) + elif mod_config.vocab_size > 0: # for e.g. if you have a list of integers to process a la AION # if we define a vocab size encoders[name] = Embedder(config, vocab_size=mod_config.vocab_size) @@ -435,6 +535,17 @@ def _init_native_backbone(self, config): self.decoders = nn.ModuleDict(decoders) self.embedders = nn.ModuleDict(embedders) + # Jetformer GMM head for images modality (only used for native + jetformer) + if ( + config.tokeniser == "jetformer" + and "images" in self.modality_registry.modalities + ): + images_cfg = self.modality_registry.get_config("images") + patch_dim = images_cfg.input_size + self.jetformer_images_head = JetformerGMMHead( + config.n_embd, patch_dim, config.jetformer_gmm_K + ) + # with weight tying when using torch.compile() some warnings get generated: # "UserWarning: functional_call was passed multiple values for tied weights. 
# This behavior is deprecated and will be an error in future versions" @@ -600,6 +711,34 @@ def _forward_native( x = block(x, block_mask=block_mask) x = self.transformer.ln_f(x) + # Jetformer images-only path (native backbone) + if ( + self.config.backbone == "native" + and self.config.tokeniser == "jetformer" + and self.modality_registry.names() == ["images"] + ): + # Only images modality is present; encoders["images"] is JetformerImageEncoder + # Get patchified z tokens (not z directly) - matching original JetFormerLite + tokens = self.encoders["images"].last_tokens + logdet = self.encoders["images"].last_logdet + if tokens is None or logdet is None: + raise RuntimeError( + "JetformerImageEncoder did not cache tokens/logdet; forward() must be called before loss." + ) + # x currently contains hidden states for all tokens (just images here) + h_images = x + logits_pi, mu, log_sigma = self.jetformer_images_head(h_images[:, :-1]) + # Target is patchified z tokens at positions 1..T (matching original) + target_tokens = tokens[:, 1:, :] + nll_gmm = gmm_nll(target_tokens, logits_pi, mu, log_sigma) + loss_images = nll_gmm - logdet + loss = loss_images.mean() + + # Optionally decode images for logging (not used in loss) + outputs = {} + outputs["images"] = self.decoders["images"](h_images) + return outputs, loss + outputs = {} current_idx = 0 @@ -1040,6 +1179,100 @@ def estimate_mfu(self, fwdbwd_per_iter, dt): mfu = flops_achieved / flops_promised return mfu + @torch.no_grad() + def jetformer_reconstruct_images( + self, x_real: torch.Tensor, positions: torch.Tensor | None = None + ) -> torch.Tensor: + """ + Reconstruct images using Jetformer in a teacher-forced way. + Matches original JetFormerLite.sample(reconstruction) workflow. + + Args: + x_real: [B, C, H, W] ground-truth raw images. + positions: optional [B, T] long tensor of positions. If None, a + simple 0..T-1 range is used per batch element (T = (H//patch_size) * (W//patch_size)). + + Returns: + x_pred: [B, C, H, W] reconstructed images. + """ + if self.config.backbone != "native" or self.config.tokeniser != "jetformer": + raise RuntimeError("jetformer_reconstruct_images is only valid for native Jetformer runs.") + if "images" not in self.encoders: + raise RuntimeError("Images encoder not found; Jetformer expects an 'images' modality.") + + device = next(self.parameters()).device + x_real = x_real.to(device) + + img_encoder = self.encoders["images"] + if not hasattr(img_encoder, "flow"): + raise RuntimeError("Images encoder does not have a flow; expected JetformerImageEncoder.") + + B, C, H, W = x_real.shape + patch_size = img_encoder.patch_size + T = (H // patch_size) * (W // patch_size) + + if positions is None: + positions = torch.arange(T, device=device, dtype=torch.long).unsqueeze(0).expand(B, T) + else: + positions = positions.to(device) + + # 1. Apply uniform dequantization (matching encoder) + x_real = uniform_dequantize(x_real) + + # 2. Pass real images through the flow to get latent z [B, C, H, W] + z_real, _ = img_encoder.flow(x_real, reverse=False) + + # 3. Patchify to get the real tokens [B, T, D] + tokens_real = patchify(z_real, patch_size) + + # 4. 
Get the model's predictions (teacher-forced) + # Temporarily disable noise + old_frac = getattr(img_encoder, "jet_epoch_frac", 1.0) + img_encoder.set_jet_epoch_frac(0.0) + tok_emb = img_encoder.c_fc(tokens_real) + if img_encoder.tokeniser != "affine": + tok_emb = new_gelu(tok_emb) + tok_emb = img_encoder.c_proj(tok_emb) + img_encoder.set_jet_epoch_frac(old_frac) + + pos_emb = self.embedders["images"](positions) + h = self.transformer.drop(tok_emb + pos_emb) + for block in self.transformer.h: + h = block(h) + h = self.transformer.ln_f(h) + + # 5. Get GMM parameters for tokens 1...T-1 + logits_pi, mu, log_sigma = self.jetformer_images_head(h[:, :-1]) + + # 6. Create the "predicted" token sequence + tokens_pred = torch.zeros_like(tokens_real) + tokens_pred[:, 0] = tokens_real[:, 0] # Copy first token (it's not predicted) + + # 7. Get deterministic 'mu' from most likely GMM component + best_comp_idx = torch.argmax(logits_pi, dim=-1, keepdim=True) # [B, T-1, 1] + D = tokens_real.size(-1) + gather_idx = best_comp_idx.unsqueeze(-1).expand(-1, -1, -1, D) + sel_mu = torch.gather(mu, 2, gather_idx).squeeze(2) # [B, T-1, D] + tokens_pred[:, 1:] = sel_mu # Fill in the predicted tokens + + # 8. Depatchify tokens_pred back to image space [B, C, H, W] + z_pred = depatchify(tokens_pred, C=C, H=H, W=W, patch_size=patch_size) + + # 9. Invert flow to get reconstructed images + x_pred, _ = img_encoder.flow(z_pred, reverse=True) + x_pred = x_pred.clamp(0, 1) # Clamp to [0,1] matching original + return x_pred + + @torch.no_grad() + def jetformer_reconstruct_patches( + self, x_patches: torch.Tensor, positions: torch.Tensor | None = None + ) -> torch.Tensor: + """ + Legacy method name for compatibility. Now expects raw images [B,C,H,W]. + Use jetformer_reconstruct_images() for clarity. + """ + return self.jetformer_reconstruct_images(x_patches, positions) + @torch.no_grad() def generate(self, inputs, new_tokens, target_modality, temperature=0.0): """