From 115e41e08c136fd851029150ece9e31b2f5bef4b Mon Sep 17 00:00:00 2001 From: Alec Hammond Date: Sat, 7 Feb 2026 19:40:00 -0800 Subject: [PATCH 1/2] Add per-phase timing instrumentation and cold/warm benchmarking to examples Add detailed timing to prepare_simulation! (per-phase: init_geometry, init_boundaries, add_sources, init_fields, init_monitors) and to run() (total steps, elapsed time, MVoxels/s overall and after 50-step warmup). Augment all 6 example scripts to run twice with fresh simulation objects: once cold (includes JIT compilation) and once warm, to separate algorithmic cost from compilation overhead. Plotting is done only once at the end. --- examples/dipole.jl | 68 ++++++++++++++++++++++++++------------- examples/gaussian_beam.jl | 38 +++++++++++++--------- examples/periodic_slab.jl | 67 ++++++++++++++++++++++---------------- examples/planewave.jl | 27 +++++++++++++--- examples/sphere.jl | 25 +++++++++++--- examples/waveguide.jl | 35 ++++++++++++-------- src/Simulation.jl | 49 +++++++++++++++++++++++++++- 7 files changed, 220 insertions(+), 89 deletions(-) diff --git a/examples/dipole.jl b/examples/dipole.jl index 319491f..20bdbe8 100644 --- a/examples/dipole.jl +++ b/examples/dipole.jl @@ -7,30 +7,52 @@ using CairoMakie Khronos.choose_backend(Khronos.CUDADevice(), Float64) -# Place a z-polarized dipole with a wavelength 1μm at the center -sources = [ - Khronos.UniformSource( - time_profile = Khronos.ContinuousWaveSource(fcen = 1.0), - component = Khronos.Ex(), - center = [0.0, 0.0, 0.0], - size = [0.0, 0.0, 0.0], - # we add a pi phase shift to force a sine wave, rather than a cosine. - amplitude = 1im, - ), -] - -# Build the simulation object, such that it spans a cube 10μm×10μm×10μm. Place PML 1μm thick. 
-sim = Khronos.Simulation( - cell_size = [4.0, 4.0, 4.0], - cell_center = [0.0, 0.0, 0.0], - resolution = 10,#40, - sources = sources, - boundaries = [[1.0, 1.0], [1.0, 1.0], [1.0, 1.0]], -) +function build_dipole_sim() + # Place a z-polarized dipole with a wavelength 1μm at the center + sources = [ + Khronos.UniformSource( + time_profile = Khronos.ContinuousWaveSource(fcen = 1.0), + component = Khronos.Ex(), + center = [0.0, 0.0, 0.0], + size = [0.0, 0.0, 0.0], + # we add a pi phase shift to force a sine wave, rather than a cosine. + amplitude = 1im, + ), + ] + + # Build the simulation object, such that it spans a cube 10μm×10μm×10μm. Place PML 1μm thick. + sim = Khronos.Simulation( + cell_size = [4.0, 4.0, 4.0], + cell_center = [0.0, 0.0, 0.0], + resolution = 10,#40, + sources = sources, + boundaries = [[1.0, 1.0], [1.0, 1.0], [1.0, 1.0]], + ) + return sim +end + +function run_dipole() + sim = build_dipole_sim() + t_end = 1.0#40.0; + Khronos.run(sim, until = t_end) + return sim +end + +# --- Run 1: includes JIT compilation --- +println("=" ^ 60) +println("RUN 1 (cold — includes JIT compilation)") +println("=" ^ 60) +t1 = time() +sim = run_dipole() +println("Run 1 total wall time: $(round(time() - t1, digits=3))s\n") -# Run the simulation for 10 "time units" -t_end = 1.0#40.0; -Khronos.run(sim, until = t_end) +# --- Run 2: fresh sim, JIT already done --- +println("=" ^ 60) +println("RUN 2 (warm — JIT already compiled)") +println("=" ^ 60) +t2 = time() +sim = run_dipole() +println("Run 2 total wall time: $(round(time() - t2, digits=3))s\n") # Plot cross sections of the result scene = Khronos.plot2D( diff --git a/examples/gaussian_beam.jl b/examples/gaussian_beam.jl index 433656a..55e747a 100644 --- a/examples/gaussian_beam.jl +++ b/examples/gaussian_beam.jl @@ -53,7 +53,11 @@ function build_simulation(λ, waist_radius, resolution) return sim end -function run_simulation!(sim::Khronos.SimulationData) +function run_gaussian_beam() + λ = 1.0 + waist_radius = 0.75 + 
resolution = 20.0 + sim = build_simulation(λ, waist_radius, resolution) Khronos.run( sim, until_after_sources = Khronos.stop_when_dft_decayed( @@ -62,27 +66,31 @@ function run_simulation!(sim::Khronos.SimulationData) maximum_runtime = 300.0, ), ) + return sim end -function plot_source_profile!(sim::Khronos.SimulationData) - Khronos.prepare_simulation!(sim) - Ex = sum(sim.source_data[1].amplitude_data, dims = 3)[:, :, 1] - heatmap(Ex) -end - +# --- Run 1: includes JIT compilation --- +println("=" ^ 60) +println("RUN 1 (cold — includes JIT compilation)") +println("=" ^ 60) +t1 = time() +sim = run_gaussian_beam() +println("Run 1 total wall time: $(round(time() - t1, digits=3))s\n") + +# --- Run 2: fresh sim, JIT already done --- +println("=" ^ 60) +println("RUN 2 (warm — JIT already compiled)") +println("=" ^ 60) +t2 = time() +sim = run_gaussian_beam() +println("Run 2 total wall time: $(round(time() - t2, digits=3))s\n") + +# Visualize the source profile λ = 1.0 -waist_radius = 0.75 -resolution = 20.0 - -sim = build_simulation(λ, waist_radius, resolution) - -# Visualize the source profile in the time and frequency domains frequencies = 0.75:0.005:1.25 scene = Khronos.plot_timesource(sim, sim.sources[1], frequencies) save("sources.png", scene) -run_simulation!(sim) - # Visualize the response of the monitor scene = Khronos.plot_monitor(sim.monitors[1], 1) save("gaussian_beam.png", scene) diff --git a/examples/periodic_slab.jl b/examples/periodic_slab.jl index 598fe1b..452e32b 100644 --- a/examples/periodic_slab.jl +++ b/examples/periodic_slab.jl @@ -105,39 +105,50 @@ function compute_power(sim::Khronos.SimulationData) return sum(abs.(sim.monitors[1].monitor_data.fields) .^ 2, dims = [1, 2])[1, 1, 1, :] end -function main() - # Arbitrary simulation parameters that produce at least two maxima - d = 0.5 - n = 3.5 - λ = range(1.0, 2.0, length = 100) - resolution = 50 - - sim = build_simulation(d, n, λ, resolution; plot_geometry = false) +# Arbitrary simulation parameters 
that produce at least two maxima +d = 0.5 +n = 3.5 +λ = range(1.0, 2.0, length = 100) +resolution = 50 +function run_periodic_slab(d, n, λ, resolution) # Run a normalization simulation - run_simulation!(sim) - - # Compute the total power to normalize the actual simulation - baseline = compute_power(sim) + sim_norm = build_simulation(d, n, λ, resolution; plot_geometry = false) + run_simulation!(sim_norm) + baseline = compute_power(sim_norm) # Run the actual simulation - sim = build_simulation(d, n, λ, resolution; plot_geometry = true) - run_simulation!(sim) - slab = compute_power(sim) + sim_slab = build_simulation(d, n, λ, resolution; plot_geometry = true) + run_simulation!(sim_slab) + slab = compute_power(sim_slab) - # Normalize the transmission response T_Khronos = slab ./ baseline - - # Compute the analytic response for comparison - T = slab_analytical(d, n, λ) - - # Plot the comparison - f = Figure() - ax = Axis(f[1, 1], xlabel = "λ (μm)", ylabel = "T (a.u.)") - line1 = scatterlines!(ax, λ, Array(T)) - line2 = scatterlines!(ax, λ, Array(T_Khronos)) - legend = Legend(f[1, 2], [line1, line2], ["Analytic", "Khronos"]) - save("periodic_slab.png", f) + return T_Khronos end -main() +# --- Run 1: includes JIT compilation --- +println("=" ^ 60) +println("RUN 1 (cold — includes JIT compilation)") +println("=" ^ 60) +t1 = time() +T_Khronos = run_periodic_slab(d, n, λ, resolution) +println("Run 1 total wall time: $(round(time() - t1, digits=3))s\n") + +# --- Run 2: fresh sim, JIT already done --- +println("=" ^ 60) +println("RUN 2 (warm — JIT already compiled)") +println("=" ^ 60) +t2 = time() +T_Khronos = run_periodic_slab(d, n, λ, resolution) +println("Run 2 total wall time: $(round(time() - t2, digits=3))s\n") + +# Compute the analytic response for comparison +T = slab_analytical(d, n, λ) + +# Plot the comparison +f = Figure() +ax = Axis(f[1, 1], xlabel = "λ (μm)", ylabel = "T (a.u.)") +line1 = scatterlines!(ax, λ, Array(T)) +line2 = scatterlines!(ax, λ, 
Array(T_Khronos)) +legend = Legend(f[1, 2], [line1, line2], ["Analytic", "Khronos"]) +save("periodic_slab.png", f) diff --git a/examples/planewave.jl b/examples/planewave.jl index c5f96e5..9c5f57f 100644 --- a/examples/planewave.jl +++ b/examples/planewave.jl @@ -48,11 +48,30 @@ function build_simulation(λ, resolution) return sim end -λ = 1.0 -resolution = 40.0 +function run_planewave() + λ = 1.0 + resolution = 40.0 + sim = build_simulation(λ, resolution) + Khronos.run(sim, until = 60) + return sim +end + +# --- Run 1: includes JIT compilation --- +println("=" ^ 60) +println("RUN 1 (cold — includes JIT compilation)") +println("=" ^ 60) +t1 = time() +sim = run_planewave() +println("Run 1 total wall time: $(round(time() - t1, digits=3))s\n") + +# --- Run 2: fresh sim, JIT already done --- +println("=" ^ 60) +println("RUN 2 (warm — JIT already compiled)") +println("=" ^ 60) +t2 = time() +sim = run_planewave() +println("Run 2 total wall time: $(round(time() - t2, digits=3))s\n") -sim = build_simulation(λ, resolution) -Khronos.run(sim, until = 60) scene = Khronos.plot2D( sim, Khronos.Ex(), diff --git a/examples/sphere.jl b/examples/sphere.jl index fd6b19a..65f8610 100644 --- a/examples/sphere.jl +++ b/examples/sphere.jl @@ -48,11 +48,28 @@ function build_sphere_sim(resolution, radius; include_loss = false) return sim, s_xyz end -sim, s_xyz = build_sphere_sim(20.0, 1.0; include_loss = true) +function run_sphere() + sim, s_xyz = build_sphere_sim(20.0, 1.0; include_loss = true) + t_end = 10.0; + Khronos.run(sim, until = t_end) + return sim, s_xyz +end + +# --- Run 1: includes JIT compilation --- +println("=" ^ 60) +println("RUN 1 (cold — includes JIT compilation)") +println("=" ^ 60) +t1 = time() +sim, s_xyz = run_sphere() +println("Run 1 total wall time: $(round(time() - t1, digits=3))s\n") -# Run the simulation until steady state -t_end = 10.0; -Khronos.run(sim, until = t_end) +# --- Run 2: fresh sim, JIT already done --- +println("=" ^ 60) +println("RUN 2 (warm — JIT 
already compiled)") +println("=" ^ 60) +t2 = time() +sim, s_xyz = run_sphere() +println("Run 2 total wall time: $(round(time() - t2, digits=3))s\n") # Visualize the fields scene = Khronos.plot2D( diff --git a/examples/waveguide.jl b/examples/waveguide.jl index 9dd6a13..ddc5d47 100644 --- a/examples/waveguide.jl +++ b/examples/waveguide.jl @@ -46,22 +46,29 @@ function build_waveguide_simulation(λ::Number) return sim end -λ = 1.55 -sim = build_waveguide_simulation(λ) - -# scene = Khronos.plot2D( -# sim, -# nothing, -# Khronos.Volume([0.0, 0.0, 0.0], [6.0, 2.0, 2.0]); -# plot_geometry=true, -# ) - +function run_waveguide() + λ = 1.55 + sim = build_waveguide_simulation(λ) + t_end = 40.0; + Khronos.run(sim, until = t_end) + return sim +end -# scene = Khronos.plot_source(sim, sim.sources[1]) -# save("waveguide_source.png", scene) +# --- Run 1: includes JIT compilation --- +println("=" ^ 60) +println("RUN 1 (cold — includes JIT compilation)") +println("=" ^ 60) +t1 = time() +sim = run_waveguide() +println("Run 1 total wall time: $(round(time() - t1, digits=3))s\n") -t_end = 40.0; -Khronos.run(sim, until = t_end) +# --- Run 2: fresh sim, JIT already done --- +println("=" ^ 60) +println("RUN 2 (warm — JIT already compiled)") +println("=" ^ 60) +t2 = time() +sim = run_waveguide() +println("Run 2 total wall time: $(round(time() - t2, digits=3))s\n") scene = Khronos.plot2D( sim, diff --git a/src/Simulation.jl b/src/Simulation.jl index 713e0a6..c32340f 100644 --- a/src/Simulation.jl +++ b/src/Simulation.jl @@ -65,25 +65,41 @@ function prepare_simulation!(sim::SimulationData) return end - @info("Preparing simulation object...") + num_voxels = sim.Nx * sim.Ny * sim.Nz + @info("Preparing simulation object ($(sim.Nx)×$(sim.Ny)×$(sim.Nz) = $(num_voxels) voxels)...") + + t_start = time() # prepare geometry + t0 = time() init_geometry(sim, sim.geometry) + t1 = time() + @info(" init_geometry: $(round(t1-t0, digits=3))s") # prepare boundaries init_boundaries(sim, sim.boundaries) + t2 = 
time() + @info(" init_boundaries: $(round(t2-t1, digits=3))s") # prepare sources add_sources(sim, sim.sources) + t3 = time() + @info(" add_sources: $(round(t3-t2, digits=3))s") # determine dimensionality get_sim_dims(sim) # prepare fields init_fields(sim, sim.dimensionality) + t4 = time() + @info(" init_fields: $(round(t4-t3, digits=3))s") # prepare time and dft monitors init_monitors(sim, sim.monitors) + t5 = time() + @info(" init_monitors: $(round(t5-t4, digits=3))s") + + @info(" Total prepare: $(round(t5-t_start, digits=3))s") sim.is_prepared = true @@ -119,7 +135,13 @@ function run( prepare_simulation!(sim) end + num_voxels = sim.Nx * sim.Ny * sim.Nz + step_start = sim.timestep @info("Starting simulation...") + t_run_start = time() + warmup_steps = 50 + t_after_warmup = nothing + steps_at_warmup = 0 # prepare the step functions wait_for_sources = false if until isa Function @@ -141,6 +163,10 @@ function run( time_for_sources = last_source_time(sim) while round_time(sim) <= time_for_sources step!(sim) + if isnothing(t_after_warmup) && (sim.timestep - step_start) == warmup_steps + t_after_warmup = time() + steps_at_warmup = sim.timestep - step_start + end end @info("All sources have terminated...") end @@ -149,6 +175,27 @@ function run( # break out of the loop. 
while !(_run_func(sim)) step!(sim) + if isnothing(t_after_warmup) && (sim.timestep - step_start) == warmup_steps + t_after_warmup = time() + steps_at_warmup = sim.timestep - step_start + end + end + + t_run_end = time() + total_steps = sim.timestep - step_start + elapsed = t_run_end - t_run_start + if total_steps > 0 && elapsed > 0 + mcells_per_s = num_voxels * total_steps / elapsed / 1e6 + if !isnothing(t_after_warmup) && total_steps > steps_at_warmup + steady_steps = total_steps - steps_at_warmup + steady_elapsed = t_run_end - t_after_warmup + steady_rate = num_voxels * steady_steps / steady_elapsed / 1e6 + @info("Simulation complete: $(total_steps) steps in $(round(elapsed, digits=3))s " * + "($(round(mcells_per_s, digits=1)) MVoxels/s overall, " * + "$(round(steady_rate, digits=1)) MVoxels/s after warmup)") + else + @info("Simulation complete: $(total_steps) steps in $(round(elapsed, digits=3))s ($(round(mcells_per_s, digits=1)) MVoxels/s)") + end end return From 5787dcf1d4eabc3777b52578ff65aebe2851c6f2 Mon Sep 17 00:00:00 2001 From: Alec Hammond Date: Sat, 7 Feb 2026 21:19:59 -0800 Subject: [PATCH 2/2] Add feature roadmap and example coverage plan Two new documents in docs/: - ROADMAP.md: Three-layer architecture (Frontend, Graph Compiler, Engine Backend) organizing 32+ feature gaps identified by comparing Khronos against Meep, Tidy3D, and fdtdx. Each feature specifies priority tier (P0-P3), implementation guidance, code references, and scope estimates. - EXAMPLES.md: 49 proposed examples mapped against 33 Meep examples, 199 Tidy3D notebooks, and 5 fdtdx examples. Includes 13 scaling and performance benchmarks (9 buildable now), cross-solver coverage matrix, and phased implementation plan aligned with the roadmap. 
--- docs/EXAMPLES.md | 1293 ++++++++++++++++++++++++++++++++++++++++ docs/ROADMAP.md | 1463 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 2756 insertions(+) create mode 100644 docs/EXAMPLES.md create mode 100644 docs/ROADMAP.md diff --git a/docs/EXAMPLES.md b/docs/EXAMPLES.md new file mode 100644 index 0000000..f89ebd1 --- /dev/null +++ b/docs/EXAMPLES.md @@ -0,0 +1,1293 @@ +# Khronos.jl Example Coverage Plan + +This document catalogs every example and tutorial across the four reference +FDTD solvers (Khronos.jl, Meep, Tidy3D, fdtdx), identifies coverage gaps, +and specifies the examples Khronos should implement to maximize testing of +feature parity, correctness, performance, and coverage. + +--- + +## Current Khronos.jl Examples + +Khronos currently has **6 examples**, **3 benchmarks**, and **8 test files**. + +| # | File | Physics | Dim | Sources | Monitors | Materials | Validation | +|---|------|---------|-----|---------|----------|-----------|------------| +| 1 | `examples/dipole.jl` | Point dipole radiation | 3D | UniformSource (CW) | None | Vacuum | Visual only | +| 2 | `examples/planewave.jl` | Planewave propagation | 3D | PlaneWaveSource (CW) | None | Vacuum | Visual only | +| 3 | `examples/sphere.jl` | Mie scattering | 3D | PlaneWaveSource (CW) | None | Lossy dielectric (Ball) | Visual only | +| 4 | `examples/waveguide.jl` | Waveguide mode excitation | 3D | ModeSource | None | Si + SiO₂ (Cuboid) | Visual only | +| 5 | `examples/gaussian_beam.jl` | Oblique Gaussian beam | 3D | GaussianBeamSource (pulse) | DFTMonitor | Vacuum | Visual only | +| 6 | `examples/periodic_slab.jl` | Fabry-Perot transmission | 3D | PlaneWaveSource (pulse) | DFTMonitor (100 freqs) | Dielectric slab | **Analytic comparison** | + +**Key gaps in current examples:** +- All examples are 3D (no 2D examples despite `TwoD_TE`/`TwoD_TM` support) +- Only 2 of 6 use monitors +- Only 1 has quantitative validation (periodic_slab vs. 
Fabry-Perot theory) +- No examples use: periodic BCs, Bloch BCs, PEC/PMC, flux monitors, mode + monitors, near-to-far, dispersive materials, nonlinear materials, + symmetry, multi-GPU, cylindrical coordinates, or adjoint optimization +- No performance benchmarks with cross-solver comparison +- No 2-run normalization workflow (incident + scattered) + +--- + +## Reference Solver Example Inventory + +### Meep — 33 Examples + +| Category | Count | Examples | +|----------|-------|---------| +| Waveguides / Basics | 3 | straight-waveguide, bent-waveguide, bend-flux | +| Resonators / Cavities | 5 | ring, holey-wvg-cavity, holey-wvg-bands, metal-cavity-ldos, solve-cw | +| Gratings / Diffraction | 6 | binary_grating, binary_grating_n2f, binary_grating_oblique, binary_grating_phasemap, polarization_grating, finite_grating | +| Scattering | 3 | mie_scattering, differential_cross_section, cylinder_cross_section | +| Reflectance | 3 | refl-angular, refl-quartz, refl_angular_bfast | +| Sources / Planewaves | 2 | oblique-planewave, oblique-source | +| Mode Decomposition | 1 | mode-decomposition (taper) | +| Near-to-Far / Radiation | 4 | antenna-radiation, cavity-farfield, metasurface_lens, zone_plate | +| Nonlinear Optics | 1 | 3rd-harm-1d | +| Optical Forces | 1 | parallel-wvgs-force | +| Perturbation Theory | 1 | perturbation_theory | +| Absorbed Power | 1 | absorbed_power_density | +| Stochastic / LED | 1 | stochastic_emitter | +| GDSII / Circuits | 1 | coupler | + +### Tidy3D — 199 Examples (key categories) + +| Category | Count | Notable Examples | +|----------|-------|-----------------| +| Core Tutorials | 21 | StartHere, BoundaryConditions, TFSF, Symmetry, AutoGrid | +| Materials | 9 | Dispersion, Fitting, FullyAnisotropic, Gyrotropic, TimeModulation | +| Geometry / Import | 8 | GDSImport, STLImport, TriangleMesh, PolySlab | +| Mode Solving | 8 | ModeSolver, ModeOverlap, BatchModeSolver | +| Waveguide Components | 21 | RingResonator, DirectionalCoupler, YJunction, MMI1x4, 
EdgeCoupler | +| Gratings | 5 | GratingCoupler, GratingEfficiency, BraggGratings | +| Photonic Crystals | 8 | Bandstructure, NanobeamCavity, OptimizedL3 | +| Metasurfaces | 11 | Metalens, VortexMetasurface, DielectricMetasurfaceAbsorber | +| Plasmonics | 7 | PlasmonicNanoparticle, YagiUdaNanoantenna | +| Scattering / Far-Field | 6 | Near2FarSphereRCS, PECSphereRCS, MultipoleExpansion | +| Resonators | 5 | CavityFOM, ResonanceFinder, HighQSi | +| Nonlinear | 2 | KerrSidebands, SiWaveguideTPA | +| RF / Microwave | 21 | PatchAntenna, LumpedElements, CoupledLineBandpassFilter | +| Inverse Design (autograd) | 26 | Autograd series (0–26), covering bends, metalenses, WDM, etc. | +| Classical Optimization | 8 | CMA-ES, Bayesian, Genetic Algorithm, PSO, DBS | +| Multi-Physics | 7 | HeatSolver, ChargeSolver, ThermallyTunedRingResonator | + +### fdtdx — 5 Examples + +| # | File | Physics | Key Feature | +|---|------|---------|-------------| +| 1 | `optimize_ceviche_corner.py` | Waveguide bend inverse design | Full differentiable optimization loop | +| 2 | `simulate_gaussian_source.py` | Gaussian beam + time reversal | Forward + backward FDTD verification | +| 3 | `simulate_gaussian_source_anisotropic.py` | Anisotropic slab interaction | Per-axis permittivity/permeability | +| 4 | `width_sweep_analysis.py` | Waveguide loss parametric sweep | Mode solver vs. FDTD validation | +| 5 | `check_waveguide_modes.py` | SOI waveguide mode check | Mode injection verification | + +--- + +## Proposed Khronos Examples + +Examples are organized by category. Each entry specifies: +- **Priority** (P0–P3) aligned with the roadmap +- **Validates**: what correctness/feature claim the example tests +- **Requires**: roadmap features needed before this example can work +- **Reference**: which solver examples it mirrors +- **Status**: `CAN BUILD NOW` vs. `BLOCKED ON §X.Y` + +### Category 1: Foundational Validation + +These examples validate core FDTD correctness against analytical solutions. 
+They form the regression test suite backbone. + +--- + +#### E1. Dielectric Slab Transmission (Fabry-Perot) ✓ EXISTS + +**File:** `examples/periodic_slab.jl` +**Priority:** P0 +**Validates:** Broadband pulse propagation, DFT monitors, power extraction, +two-run normalization workflow +**Reference:** Meep `bend-flux` (normalization pattern), Tidy3D `StartHere` +**Status:** `EXISTS` — the only quantitatively validated example today. + +**Improvements needed:** +- Add 2D version (`TwoD_TM`) to validate 2D code paths +- Add explicit flux monitor once §1.3 + §3.3 are implemented +- Add error metric (max |T_khronos - T_analytic|) printed to stdout + +--- + +#### E2. Fresnel Reflectance (Normal Incidence) + +**Priority:** P0 +**Validates:** Material boundaries, field continuity at interfaces, DFT +monitor accuracy +**Reference:** Meep `refl-quartz` (normal-incidence reflectance vs. Fresnel) +**Computes:** R(λ) for a planar air → n=3.5 dielectric interface; compare +to Fresnel formula R = |(n-1)/(n+1)|² + +**Setup:** +- 1D-equivalent (thin slab in 3D with tiny transverse extent) +- GaussianPulseSource, broadband (0.4–1.0 μm range) +- Two DFT flux monitors: reflected (behind source) and transmitted +- Two-run normalization (empty → with slab) + +**Status:** `CAN BUILD NOW` +**New features exercised:** Two-run normalization workflow, reflected flux + +--- + +#### E3. 
Angular Fresnel Reflectance (Oblique Incidence) + +**Priority:** P0 +**Validates:** Bloch-periodic boundaries, oblique plane wave injection, +angle-dependent reflectance +**Reference:** Meep `refl-angular` (Brewster's angle validation) +**Computes:** R(λ, θ) for P-polarized light at an air → n=3.5 interface; +identify Brewster's angle at arctan(3.5) ≈ 74° + +**Setup:** +- Thin cell in y-z, Bloch-periodic in y +- PlaneWaveSource at angle θ via k_point +- DFT flux monitors for reflected/transmitted power +- Sweep over θ = 0°, 10°, 20°, ..., 80° + +**Status:** `BLOCKED ON §1.1 + §3.1` (periodic/Bloch BCs) +**Validates roadmap:** §1.1 Boundary Spec, §3.1 Bloch Kernels + +--- + +#### E4. Mie Scattering Cross Section + +**Priority:** P0 +**Validates:** Spherical geometry (Ball), flux monitors on a closed +surface, scattering cross-section extraction, two-run normalization +**Reference:** Meep `mie_scattering`, Tidy3D `Near2FarSphereRCS` +**Computes:** σ_scat(λ) / πr² for a dielectric sphere (n=2, r=1μm); +compare to analytical Mie series (PyMieScatt or Julia equivalent) + +**Setup:** +- 3D, PML on all faces +- PlaneWaveSource (GaussianPulse, broadband) +- 6 DFT flux monitors forming a closed box around sphere +- Normalization run without sphere +- Compare to Mie series solution + +**Status:** `BLOCKED ON §1.3 + §3.3` (flux monitors) +**Validates roadmap:** §1.3 FluxMonitor, §3.3 Flux Kernels + +--- + +#### E5. 2D Waveguide Bend Transmission + +**Priority:** P0 +**Validates:** 2D simulation (`TwoD_TM`), curved geometry, broadband +transmission, flux normalization +**Reference:** Meep `bent-waveguide` / `bend-flux` +**Computes:** T(λ) through a 90° dielectric waveguide bend (ε=12); +compare straight-waveguide normalization vs. 
bend + +**Setup:** +- 2D TM, PML on all faces +- Block geometry: straight waveguide + 90° bend section +- GaussianPulseSource +- DFT flux monitors at input and output ports + +**Status:** `BLOCKED ON §1.3 + §3.3` (flux monitors) +**Validates roadmap:** 2D code paths, §1.3 FluxMonitor + +--- + +### Category 2: Resonators and Spectral Analysis + +--- + +#### E6. Ring Resonator (Q-factor Extraction) + +**Priority:** P1 +**Validates:** Curved geometry, resonance detection, quality factor +computation, convergence-based run termination +**Reference:** Meep `ring`, Tidy3D `RingResonator` +**Computes:** Resonant frequencies ωₙ and quality factors Qₙ of a +dielectric ring resonator + +**Setup:** +- 2D, PML, Cylinder geometry (ring = outer cylinder - inner cylinder) +- GaussianPulseSource (broadband, inside ring) +- Time monitor at a point inside the ring +- Post-process: FFT or Harminv-style analysis to extract ω and Q + +**Status:** `BLOCKED ON §3.12` (time monitor kernel) + resonance extraction +**Validates roadmap:** §3.12 TimeMonitor, §3.19 Harminv (optional) + +--- + +#### E7. Photonic Crystal Waveguide Bands + +**Priority:** P1 +**Validates:** Periodic boundaries, Bloch k-sweep, band structure +extraction +**Reference:** Meep `holey-wvg-bands` +**Computes:** ω(k) band diagram of a 1D photonic crystal waveguide +(periodic holes in a dielectric strip) + +**Setup:** +- 2D, periodic in x (propagation), PML in y +- Sweep k_x over the irreducible Brillouin zone +- At each k, run broadband pulse and extract resonances +- Plot ω vs. k band diagram + +**Status:** `BLOCKED ON §1.1 + §3.1` (periodic/Bloch BCs) +**Validates roadmap:** §1.1 Boundary Spec, §3.1 Periodic Kernels + +--- + +#### E8. 
Photonic Crystal Cavity Q + +**Priority:** P2 +**Validates:** Defect cavity in a periodic lattice, high-Q resonance +extraction +**Reference:** Meep `holey-wvg-cavity`, Tidy3D `NanobeamCavity`/`OptimizedL3` +**Computes:** Resonant frequency and Q-factor of a defect cavity in a 1D +photonic crystal waveguide; Q vs. number of mirror periods + +**Status:** `BLOCKED ON §1.1 + §3.1` (periodic BCs) + §3.12 (time monitor) + +--- + +### Category 3: Waveguide Mode Analysis + +--- + +#### E9. Mode Source Validation (Existing Waveguide Example Enhancement) + +**Priority:** P0 +**Validates:** ModeSource accuracy, mode profile fidelity, guided power +**Reference:** fdtdx `check_waveguide_modes`, `width_sweep_analysis` +**Computes:** Inject fundamental mode into Si waveguide; measure mode +overlap at output to verify >99% coupling efficiency + +**Setup:** +- Enhance existing `waveguide.jl` with: + - DFT flux monitor at input and output planes + - Mode overlap computation (once §1.4 + §3.4 are implemented) + - Print coupling efficiency and insertion loss (dB) + +**Status:** Partial `CAN BUILD NOW` (visual check); full validation +`BLOCKED ON §1.3 + §3.3` (flux) and §1.4 + §3.4 (mode overlap) + +--- + +#### E10. Waveguide Taper S-Parameters + +**Priority:** P1 +**Validates:** Mode decomposition, S-parameter extraction, geometry +(tapered prism or interpolated cuboids) +**Reference:** Meep `mode-decomposition` (linear taper reflectance) +**Computes:** S₁₁ (reflectance) and S₂₁ (transmittance) of fundamental +mode through a linear waveguide taper; verify R scales as 1/L² + +**Setup:** +- 2D or 3D, PML +- Waveguide width taper from w₁ to w₂ over length L +- ModeSource at input, mode monitors at input (reflected) and output +- Sweep taper length L + +**Status:** `BLOCKED ON §1.4 + §3.4` (mode decomposition) + +--- + +#### E11. 
Directional Coupler + +**Priority:** P2 +**Validates:** Evanescent coupling, two-waveguide geometry, power +splitting ratio +**Reference:** Meep `coupler`, Tidy3D `DirectionalCoupler` +**Computes:** Coupling ratio (bar/cross port power) vs. coupling length +for two parallel Si waveguides + +**Status:** `BLOCKED ON §1.3 + §3.3` (flux monitors) + +--- + +### Category 4: Gratings and Diffraction + +--- + +#### E12. Binary Grating Diffraction Orders + +**Priority:** P1 +**Validates:** Periodic boundaries, diffraction order decomposition, +subwavelength optics +**Reference:** Meep `binary_grating`, Tidy3D `GratingEfficiency` +**Computes:** Transmittance into diffraction orders (0th through ±5th) +for a periodic binary phase grating at normal incidence + +**Setup:** +- 2D, periodic in x, PML in y +- Block geometry: periodic grating ridges +- PlaneWaveSource (broadband) +- Flux monitors above and below grating + +**Status:** `BLOCKED ON §1.1 + §3.1` (periodic BCs) + §1.9 + §3.16 +(diffraction monitor) + +--- + +#### E13. Grating Coupler Efficiency + +**Priority:** P2 +**Validates:** 3D grating simulation, fiber coupling, angle-dependent +efficiency +**Reference:** Tidy3D `GratingCoupler`, `FocusedApodGC` +**Computes:** Coupling efficiency from free-space Gaussian beam to +waveguide mode via grating coupler + +**Status:** `BLOCKED ON §1.1` (periodic BCs) + §1.4 (mode monitor) + +--- + +### Category 5: Scattering and Far-Field + +--- + +#### E14. 
Antenna Radiation Pattern (2D Dipole Far-Field) + +**Priority:** P1 +**Validates:** Near-to-far field transformation, radiation pattern +computation +**Reference:** Meep `antenna-radiation`, Tidy3D `AntennaCharacteristics` +**Computes:** Far-field radiation pattern E(θ) of a point dipole in +vacuum; verify isotropic pattern and total radiated power via far-field +flux integration + +**Setup:** +- 2D, PML +- Point source (CW or pulse) +- Near-field DFT surface (box around source) +- Near-to-far transformation to observation circle at r >> λ +- Compare total far-field power to near-field flux (Poynting theorem check) + +**Status:** `BLOCKED ON §1.5 + §3.10` (near-to-far field) + +--- + +#### E15. Differential Scattering Cross Section + +**Priority:** P2 +**Validates:** 3D scattering, near-to-far field, angular distribution +**Reference:** Meep `differential_cross_section`, Tidy3D `Near2FarSphereRCS` +**Computes:** dσ/dΩ(θ) for a dielectric sphere; integrate to get total σ +and compare to Mie theory + +**Status:** `BLOCKED ON §1.5 + §3.10` (near-to-far) + +--- + +### Category 6: Dispersive and Advanced Materials + +--- + +#### E16. Dispersive Slab Reflectance (Lorentzian Material) + +**Priority:** P0 +**Validates:** Dispersive material implementation (ADE), frequency-dependent +reflectance +**Reference:** Meep `refl-quartz` (Sellmeier/Lorentzian fused quartz) +**Computes:** R(λ) for a slab of Lorentzian material; compare to +analytic Fresnel with ε(ω) = ε_∞ + σ/(ω₀² - ω² - iγω) + +**Setup:** +- 1D-equivalent geometry +- Material with one Lorentzian pole +- Broadband GaussianPulseSource +- DFT flux monitors for R and T +- Analytic comparison using Fresnel + Lorentzian ε(ω) + +**Status:** `BLOCKED ON §1.2 + §3.2` (dispersive materials) + §1.3 + §3.3 +(flux monitors) +**Validates roadmap:** §1.2 LorentzianPole, §3.2 ADE Kernels + +--- + +#### E17. 
Drude Metal Reflectance + +**Priority:** P1 +**Validates:** Drude pole implementation, metallic response, plasma +frequency behavior +**Reference:** Meep material library metals, Tidy3D `Dispersion` +**Computes:** R(λ) for a thick Drude metal slab (e.g., simplified gold); +verify R ≈ 1 below plasma frequency, transition to transparent above + +**Status:** `BLOCKED ON §1.2 + §3.2` (dispersive materials) + +--- + +#### E18. Nonlinear Third-Harmonic Generation + +**Priority:** P2 +**Validates:** χ³ nonlinear material, third-harmonic spectral peak, +power scaling +**Reference:** Meep `3rd-harm-1d`, Tidy3D `KerrSidebands` +**Computes:** Transmitted power spectrum showing 3ω peak; verify P(3ω) ∝ +(χ³)² in the weak-nonlinearity regime + +**Status:** `BLOCKED ON §1.6 + §3.13` (nonlinear materials) + +--- + +#### E19. Anisotropic Material Slab + +**Priority:** P2 +**Validates:** Per-axis permittivity, polarization-dependent transmission +**Reference:** fdtdx `simulate_gaussian_source_anisotropic` +**Computes:** Transmission through a slab with ε = (2, 1, 2); verify +polarization-dependent wavelength shortening + +**Setup:** +- 3D, PML +- Cuboid slab with `Material(epsilonx=2, epsilony=1, epsilonz=2)` +- PlaneWaveSource (two runs: x-polarized and y-polarized) +- DFT monitors to compare T for each polarization + +**Status:** `CAN BUILD NOW` (anisotropic ε already supported) — needs +flux monitors for quantitative validation, otherwise visual check works + +--- + +### Category 7: Boundary Conditions + +--- + +#### E20. PEC and PMC Cavity Modes + +**Priority:** P2 +**Validates:** PEC and PMC boundary conditions, cavity resonance formula +**Reference:** Tidy3D `BoundaryConditions` +**Computes:** Resonant frequencies of a rectangular PEC cavity; compare +to analytical f_mnp = c/(2π) √((mπ/a)² + (nπ/b)² + (pπ/d)²) + +**Status:** `BLOCKED ON §1.1 + §3.9` (PEC/PMC boundaries) + +--- + +#### E21. 
Periodic Boundary Verification + +**Priority:** P0 +**Validates:** Periodic BC correctness, field continuity across boundary +**Reference:** Meep `oblique-planewave`, Tidy3D `BoundaryConditions` +**Computes:** Plane wave propagation in periodic cell; verify field values +at opposite boundaries are identical (periodic) or phase-shifted (Bloch) + +**Status:** `BLOCKED ON §1.1 + §3.1` (periodic BCs) + +--- + +### Category 8: Symmetry and Performance + +--- + +#### E22. Mirror Symmetry Acceleration + +**Priority:** P2 +**Validates:** Symmetry exploitation, memory/time reduction, result +equivalence +**Reference:** Meep `bend-flux` (mirror symmetry), Tidy3D `Symmetry` +**Computes:** Run waveguide bend with and without mirror symmetry; verify +identical flux results with 2× memory reduction + +**Status:** `BLOCKED ON §1.7 + §2.4` (symmetry) + +--- + +#### E23. Non-Uniform Grid Convergence + +**Priority:** P1 +**Validates:** Non-uniform grid accuracy, convergence rate, memory savings +**Reference:** Tidy3D `AutoGrid` / `CustomGrid` +**Computes:** Slab transmission with uniform vs. non-uniform grid; verify +identical results with fewer voxels; measure memory reduction + +**Status:** `BLOCKED ON §2.5 + §3.6` (non-uniform grid) + +--- + +### Category 9: Source Types + +--- + +#### E24. Gaussian Beam Waist Verification ✓ PARTIAL + +**File:** `examples/gaussian_beam.jl` (exists but needs quantitative check) +**Priority:** P1 +**Validates:** GaussianBeamSource profile accuracy, beam waist, divergence +**Reference:** Tidy3D `AdvancedGaussianSources` +**Computes:** Measure beam waist w(z) at several planes along propagation; +compare to analytical Gaussian beam formula +w(z) = w₀ √(1 + (z/z_R)²) + +**Improvements to existing example:** +- Add multiple DFT monitors at different z positions +- Fit Gaussian to transverse field profile at each z +- Compare extracted w(z) to theory + +**Status:** Enhancement of existing example. `CAN BUILD NOW` (partial) + +--- + +#### E25. 
CW Source Steady-State Verification + +**Priority:** P1 +**Validates:** ContinuousWaveSource reaches true steady state, field +amplitude matches expected value +**Reference:** Meep `solve-cw` +**Computes:** CW point source in vacuum; verify E-field amplitude at +distance r matches analytical Green's function |E| ∝ 1/r (3D) or +1/√r (2D) + +**Status:** `CAN BUILD NOW` + +--- + +### Category 10: Monitors and Post-Processing + +--- + +#### E26. Time Monitor Validation + +**Priority:** P2 +**Validates:** Time monitor kernel, temporal field recording, FFT +consistency with DFT monitor +**Reference:** General FDTD practice +**Computes:** Record E(t) at a point using TimeMonitor; take FFT and +compare to DFTMonitor at same location — spectra should match + +**Status:** `BLOCKED ON §3.12` (time monitor kernel) + +--- + +#### E27. LDOS (Local Density of States) + +**Priority:** P2 +**Validates:** LDOS computation, Purcell enhancement near structures +**Reference:** Meep `metal-cavity-ldos` +**Computes:** LDOS at the center of a 2D metal cavity; compare to +analytical Purcell factor Q/(4π²V_eff) + +**Status:** `BLOCKED ON §1.10 + §3.17` (LDOS) + §1.2 + §3.2 (dispersive +materials for metal) + +--- + +### Category 11: Scaling, Speed, and Performance + +This category is critical for demonstrating Khronos's primary value +proposition: GPU-accelerated FDTD that scales. The benchmarks here should +be run regularly and results tracked over time. + +--- + +#### E28. Single-GPU Throughput vs. Problem Size + +**Priority:** P0 +**Validates:** GPU kernel efficiency, memory bandwidth utilization, and +the regime where GPU acceleration pays off vs. 
CPU +**Computes:** Cells/second as a function of grid size N³, from N=32 +(tiny, launch-overhead-dominated) to N=2048 (memory-limited) + +**Setup:** +- 3D, PML, point dipole in vacuum (minimal physics — isolates raw + FDTD kernel throughput) +- Sweep grid sizes: 32³, 64³, 128³, 256³, 512³, 768³, 1024³, 1536³, 2048³ +- Run 100 timesteps (warm, after JIT compilation) +- Report per size: cells/second, wall time per step, GPU memory used, + GPU occupancy (if available) +- Plot: cells/s vs. N³ (should plateau at memory bandwidth limit) + +**Expected behavior:** +- Small grids (< 64³): GPU underutilized, CPU may be faster +- Medium grids (128³–512³): GPU ramps up, 10-100× over CPU +- Large grids (≥ 1024³): Memory-bandwidth-limited plateau, peak cells/s + +**Status:** `CAN BUILD NOW` — extends existing `benchmark/dipole.jl` +which already sweeps resolutions but does not report cells/s vs. N³ + +--- + +#### E29. Single-GPU Throughput vs. Physics Complexity + +**Priority:** P0 +**Validates:** Kernel specialization overhead — how much does adding +physics features (PML, materials, monitors, dispersive poles) cost? +**Computes:** Cells/second at fixed grid size (512³) with progressively +more physics enabled + +**Setup — incremental physics layers:** + +| Config | Physics | Expected Overhead | +|--------|---------|-------------------| +| A | Vacuum, no PML, no monitors | Baseline (fastest) | +| B | Vacuum + PML | +10-20% (ψ field updates) | +| C | Dielectric geometry (2 materials) + PML | +5% (ε lookup) | +| D | Multi-material (6 layers) + PML | +5% (same kernel) | +| E | Config C + DFT monitor (10 freqs) | +5-10% | +| F | Config C + dispersive material (1 pole) | +30-50% (ADE fields) | +| G | Config C + dispersive material (3 poles) | +50-80% | +| H | Config C + nonlinear χ³ | +10-20% | +| I | Config C + non-uniform grid (vector Δx) | +5-10% | + +**Report:** Bar chart of cells/s for each config. 
This directly +demonstrates the value of Khronos's `Nothing`-dispatch specialization: +configs without a feature should have zero overhead for that feature. + +**Status:** Partially `CAN BUILD NOW` (configs A–E); configs F–I +`BLOCKED` on respective roadmap features + +--- + +#### E30. Backend Comparison (CUDA vs. ROCm vs. Metal vs. CPU) + +**Priority:** P0 +**Validates:** KernelAbstractions.jl multi-backend portability and +relative performance across hardware +**Computes:** Cells/second for identical problems on each backend + +**Setup:** +- Fixed problem: 3D vacuum dipole, 512³ grid, PML, 100 timesteps +- Run on each available backend: `CUDABackend()`, `ROCBackend()`, + `MetalBackend()`, `CPU()` +- Also test with materials (sphere scattering) and monitors + +**Report per backend:** + +``` +Backend Device cells/s relative +───────────────────────────────────────────────────────────── +CUDA NVIDIA A100 1.5e9 1.00× +CUDA NVIDIA H100 2.1e9 1.40× +ROCm AMD MI250X 1.2e9 0.80× +Metal Apple M1 Pro 0.3e9 0.20× +CPU (threads=8) AMD EPYC 7763 0.05e9 0.03× +CPU (threads=1) AMD EPYC 7763 0.008e9 0.005× +``` + +**Status:** `CAN BUILD NOW` — existing `benchmark/*.yml` files already +have A100/H100/M1 baselines but don't report in a unified comparison + +--- + +#### E31. Precision Comparison (Float32 vs. Float64) + +**Priority:** P1 +**Validates:** Float32 delivers ~2× speedup and ~2× memory savings with +acceptable accuracy loss for most applications +**Computes:** Cells/second and accuracy for Float32 vs. Float64 on the +same problem + +**Setup:** +- Problem: dielectric slab transmission (periodic_slab example) +- Run both `backend_number = Float32` and `backend_number = Float64` +- Compare: (1) cells/s, (2) peak GPU memory, (3) transmission spectrum + accuracy vs. 
analytical Fabry-Perot + +**Expected results:** +- Float32: ~2× cells/s, ~2× less memory, max |T_err| < 0.01 +- Float64: baseline cells/s, baseline memory, max |T_err| < 0.001 + +**Status:** `CAN BUILD NOW` + +--- + +#### E32. JIT Compilation Overhead + +**Priority:** P1 +**Validates:** First-run JIT cost and warm-run performance, so users +know what to expect +**Computes:** Wall time breakdown: JIT compilation vs. actual simulation + +**Setup:** +- Problem: 3D waveguide with ModeSource (exercises the most complex + kernel path) +- Measure: (1) cold start (first `run` call), (2) warm start (second + `run` call with same types), (3) re-warm (different grid size, same + types — should be fast due to cached compilation) +- Report time spent in: Julia compilation, GPU kernel compilation, + mode solving, field initialization, actual timestep loop + +**Status:** `CAN BUILD NOW` — existing examples already print cold/warm +timing but don't break it down systematically + +--- + +#### E33. Memory Capacity Test (Maximum Problem Size) + +**Priority:** P1 +**Validates:** Maximum achievable problem size on a given GPU, memory +efficiency of field storage +**Computes:** Largest N³ grid that fits in GPU memory for each +configuration + +**Setup:** +- Binary search for max N on a given GPU (e.g., A100 80GB, H100 80GB) +- Test three configurations: + 1. Vacuum + PML (6 field arrays + ψ arrays): expect ~1800³ on 80GB + 2. With 2 materials + DFT monitor: expect ~1600³ + 3. With dispersive material (1 pole, adds 6 P arrays): expect ~1200³ +- Report: N³, total voxels, memory used, memory per voxel + +**Theoretical minimum memory per voxel:** +- 6 field components × 4 bytes (Float32) = 24 bytes/voxel +- With PML ψ: +12 auxiliary fields = +48 bytes in PML region +- With 1 dispersive pole: +6 P arrays = +24 bytes/voxel +- With DFT monitor: +2 complex × N_freq per monitor voxel + +**Status:** `CAN BUILD NOW` + +--- + +#### E34. 
Cross-Solver Performance Comparison + +**Priority:** P0 +**Validates:** Khronos's GPU performance advantage is real and quantified +**Computes:** Identical physical problems across Khronos, Meep, and fdtdx + +**Benchmark suite — three problems at three scales:** + +| Problem | Small | Medium | Large | +|---------|-------|--------|-------| +| Vacuum dipole | 128³ | 512³ | 1024³ | +| Sphere scattering | 128³ | 512³ | 1024³ | +| Slab + DFT monitor | 128³ | 512³ | 1024³ | + +**Per solver:** + +| Solver | Run Method | +|--------|------------| +| **Khronos** (CUDA, Float64) | `Khronos.run_benchmark(sim, 100)` | +| **Khronos** (CUDA, Float32) | Same, `Float32` | +| **Meep** (CPU, 1 thread) | `sim.run(until=...)` | +| **Meep** (CPU, 8 threads) | `sim.run(until=...)` with OMP | +| **fdtdx** (CUDA, Float32) | `run_fdtd(...)` | + +**Report:** + +``` +Problem: Sphere Scattering 512³ (100 timesteps) +───────────────────────────────────────────────── +Solver cells/s speedup vs Meep-1T +Khronos CUDA F32 2.1e9 420× +Khronos CUDA F64 1.2e9 240× +fdtdx CUDA F32 1.8e9 360× +Meep CPU 8T 4.0e7 8× +Meep CPU 1T 5.0e6 1× +``` + +**Setup scripts:** Provide matched simulation definitions in Julia +(Khronos), Python (Meep), and Python/JAX (fdtdx) that set up physically +identical problems with the same grid, materials, and timestep count. + +**Status:** `CAN BUILD NOW` for Khronos vs. Meep; fdtdx needs JAX env + +--- + +#### E35. 
Weak Scaling (Multi-GPU) + +**Priority:** P1 +**Validates:** Multi-GPU halo exchange overhead, near-linear scaling +**Reference:** Meep MPI scaling, fdtdx JAX sharding +**Computes:** Fixed problem size per GPU (512³ per device); measure +aggregate throughput for 1, 2, 4, 8 GPUs + +**Setup:** +- 3D, PML, dipole source in vacuum +- MPI-based domain decomposition along longest axis +- 100 timesteps per measurement + +**Report:** + +``` +GPUs Total grid cells/s efficiency +──────────────────────────────────────────────── +1 512³ 1.5e9 100% +2 512×512×1024 2.8e9 93% +4 512×512×2048 5.4e9 90% +8 512×512×4096 10.2e9 85% +``` + +**Key metric:** Parallel efficiency = (N-GPU throughput) / (N × 1-GPU +throughput). Target: >85% at 8 GPUs for single-axis decomposition. + +**Status:** `BLOCKED ON §2.1 + §3.7` (domain decomposition + halo exchange) + +--- + +#### E36. Strong Scaling (Multi-GPU) + +**Priority:** P2 +**Validates:** Multi-GPU speedup for a fixed large problem +**Computes:** Fixed total problem (2048³); divide across 1, 2, 4, 8 GPUs + +**Report:** + +``` +GPUs Partition wall time/step speedup +──────────────────────────────────────────────── +1 2048³ 680 ms 1.0× +2 2048×2048×1024 355 ms 1.9× +4 2048×2048×512 185 ms 3.7× +8 2048×2048×256 100 ms 6.8× +``` + +**Key metric:** Strong scaling efficiency. Communication-to-computation +ratio grows as slabs get thinner, so efficiency will degrade. Target: +>80% at 4 GPUs, >65% at 8 GPUs. + +**Status:** `BLOCKED ON §2.1 + §3.7` + +--- + +#### E37. Halo Exchange Overhead Profiling + +**Priority:** P2 +**Validates:** Communication is not the bottleneck; compute/communicate +overlap works +**Computes:** Breakdown of time per timestep into compute vs. 
+communication at various GPU counts and problem sizes + +**Setup:** +- Instrument halo exchange with timers +- Measure: (1) kernel execution time, (2) halo pack time, (3) MPI + send/recv time, (4) halo unpack time, (5) synchronization overhead +- Test with CUDA-aware MPI (direct GPU-GPU) vs. staged (GPU→CPU→GPU) + +**Report:** + +``` +2 GPUs, 512³ per GPU, CUDA-aware MPI: + Kernel: 8.2 ms (92%) + Halo pack: 0.1 ms (1%) + MPI comm: 0.5 ms (6%) + Halo unpack: 0.1 ms (1%) + Total: 8.9 ms + +2 GPUs, 512³ per GPU, Staged MPI: + Kernel: 8.2 ms (78%) + D2H copy: 0.8 ms (8%) + MPI comm: 0.7 ms (7%) + H2D copy: 0.8 ms (8%) + Total: 10.5 ms +``` + +**Status:** `BLOCKED ON §2.1 + §3.7` + +--- + +#### E38. Large-Scale Realistic Benchmarks + +**Priority:** P1 +**Validates:** Khronos handles real-world-sized problems, not just toy +examples. These are the benchmarks that matter for marketing and papers. + +**Three flagship problems:** + +**E38a. Metalens (large metasurface)** +- 3D: ~500 × 500 × 100 cells at resolution 30 → ~750M voxels +- Periodic unit cell with dielectric pillars on SiO₂ substrate +- PlaneWaveSource (broadband), DFT transmission monitor +- Tests: periodic BCs, multi-material geometry, large grid +- **Status:** `BLOCKED ON §1.1 + §3.1` (periodic BCs) + +**E38b. Photonic integrated circuit (multi-component)** +- 3D: ~2000 × 500 × 100 cells → ~100M voxels +- Si waveguides, bends, directional coupler on SiO₂ +- ModeSource input, DFT flux monitors at each port +- Tests: long propagation, multiple materials, many monitors +- **Status:** `BLOCKED ON §1.3 + §3.3` (flux monitors) + +**E38c. Full 3D Mie scattering (large sphere)** +- 3D: sphere radius = 10λ → ~600³ grid at 30 cells/λ → ~216M voxels +- PlaneWaveSource, 6-face flux box, PML +- Tests: large 3D problem, many timesteps, PML performance +- Compare to Mie theory at 50+ frequencies +- **Status:** `BLOCKED ON §1.3 + §3.3` (flux monitors) + +**E38d. 
Vacuum propagation at memory limit (single-GPU stress test)** +- 3D: largest grid that fits on target GPU (e.g., 1800³ on A100 80GB) +- Point source, PML, no monitors, 100 steps +- Tests: raw throughput at maximum memory utilization, no OOM +- **Status:** `CAN BUILD NOW` + +--- + +#### E39. Kernel Roofline Analysis + +**Priority:** P2 +**Validates:** Khronos kernels are memory-bandwidth-bound (expected for +FDTD) and operating near the hardware roofline +**Computes:** Arithmetic intensity and achieved bandwidth for each +kernel variant + +**Setup:** +- Use NVIDIA Nsight Compute (or `CUDA.@profile`) to measure per-kernel: + - DRAM bytes read/written + - FLOPs executed + - Achieved memory bandwidth (GB/s) + - Arithmetic intensity (FLOPs/byte) +- Compare to device peak bandwidth (e.g., A100 = 2039 GB/s HBM) +- Plot on roofline diagram + +**Expected results:** +- Vacuum curl kernel: ~0.5 FLOPs/byte (memory-bound), achieving + ~70-80% peak bandwidth +- PML curl kernel: ~0.8 FLOPs/byte (still memory-bound, more ops + per byte due to ψ updates) +- ADE kernel: ~1.0 FLOPs/byte (approaching compute-bound) + +**Status:** `CAN BUILD NOW` (needs CUDA profiling tools) + +--- + +#### E40. Non-Uniform Grid Performance and Accuracy + +**Priority:** P1 +**Validates:** Non-uniform grid delivers equivalent accuracy at lower +memory cost, with acceptable throughput overhead +**Computes:** Accuracy and performance comparison: uniform grid vs. +non-uniform grid for a problem with disparate feature scales + +**Setup:** +- Problem: thin metal film (10nm) on glass substrate, illuminated by + broadband planewave. Film requires fine grid (Δz = 1nm), substrate + needs coarse grid (Δz = 50nm). 
+- Uniform grid: Δ = 1nm everywhere → ~2000³ grid (~enormous) +- Non-uniform grid: Δz = 1nm near film, grading to 50nm away → ~200³ + equivalent +- Compare: (1) R(λ) spectra should match, (2) memory: 10-100× reduction, + (3) throughput: slight penalty from vector Δx reads + +**Status:** `BLOCKED ON §2.5 + §3.6` (non-uniform grid) + +--- + +### Category 12: Inverse Design (Future) + +--- + +#### E41. Waveguide Bend Topology Optimization + +**Priority:** P2 +**Validates:** Adjoint solver, Enzyme.jl AD through FDTD kernels, +convergence to known-good design +**Reference:** fdtdx `optimize_ceviche_corner`, Tidy3D `Autograd8WaveguideBend` +**Computes:** Optimize permittivity distribution in a 2D design region to +minimize insertion loss of a 90° waveguide bend + +**Status:** `BLOCKED ON §2.7` (adjoint compilation) + +--- + +#### E42. Metalens Phase Map Optimization + +**Priority:** P3 +**Validates:** Large-scale inverse design, metasurface parameterization +**Reference:** Tidy3D `Autograd7Metalens`, Meep `metasurface_lens` +**Computes:** Optimize pillar radii in a metalens unit-cell library to +maximize focal spot intensity + +**Status:** `BLOCKED ON §2.7` (adjoint) + §1.1 (periodic BCs) + +--- + +### Category 13: Cylindrical Coordinates + +--- + +#### E43. Cylinder Scattering (Cylindrical FDTD) + +**Priority:** P2 +**Validates:** Cylindrical coordinate kernels, r=0 singularity handling, +~90× speedup vs. 3D for rotationally symmetric problems +**Reference:** Meep `cylinder_cross_section` +**Computes:** Scattering cross section of a finite dielectric cylinder +using cylindrical (r,z) FDTD with angular mode index m + +**Status:** `BLOCKED ON §3.14` (cylindrical kernels) + +--- + +#### E44. 
Fresnel Zone Plate Focusing + +**Priority:** P3 +**Validates:** Cylindrical coordinates, near-to-far field, large-scale +diffractive optics +**Reference:** Meep `zone_plate` +**Computes:** Focal spot intensity profile of a binary zone plate using +cylindrical FDTD + +**Status:** `BLOCKED ON §3.14` (cylindrical) + §1.5 + §3.10 (near-to-far) + +--- + +### Category 14: CFS-PML and Boundary Stability + +--- + +#### E45. CFS-PML Evanescent Wave Absorption + +**Priority:** P1 +**Validates:** CFS-PML (κ, α parameters) stability for evanescent fields +**Reference:** Tidy3D `AbsorbingBoundaryReflection` +**Computes:** Compare PML reflectance for standard σ-only PML vs. CFS-PML +with κ and α; show improved absorption at grazing incidence and for +evanescent waves + +**Setup:** +- Dipole source near PML boundary +- Measure spurious reflection amplitude with and without κ/α +- Long-time simulation to test stability + +**Status:** `BLOCKED ON §3.8` (CFS-PML activation) + +--- + +### Category 15: 2D Fundamentals + +--- + +#### E46. 2D TM Dipole Radiation + +**Priority:** P0 +**Validates:** 2D simulation code path (`TwoD_TM`), field patterns match +analytical 2D Green's function +**Reference:** Foundational FDTD validation +**Computes:** Ez field from a 2D point source; compare amplitude decay +to 1/√r (2D cylindrical wave) + +**Status:** `CAN BUILD NOW` + +--- + +#### E47. 2D TE Waveguide Propagation + +**Priority:** P0 +**Validates:** 2D TE mode (`TwoD_TE`), correct field components (Hx, Hy, Ez) +**Computes:** TE mode propagation in a dielectric slab waveguide; verify +confinement and propagation constant + +**Status:** `CAN BUILD NOW` + +--- + +### Category 16: Subpixel Averaging + +--- + +#### E48. Subpixel Averaging Convergence + +**Priority:** P1 +**Validates:** Second-order convergence at curved dielectric interfaces +**Reference:** Meep subpixel smoothing, Tidy3D `SubpixelSpec` +**Computes:** Ring resonator frequency vs. 
resolution with and without +subpixel averaging; verify O(Δx²) convergence with averaging vs. +O(Δx) without + +**Status:** `BLOCKED ON §3.5` (subpixel averaging) + +--- + +### Category 17: Checkpoint and I/O + +--- + +#### E49. Checkpoint / Restart + +**Priority:** P3 +**Validates:** HDF5 save/load of simulation state, exact restart +**Reference:** Meep HDF5, fdtdx JAX checkpointing +**Computes:** Run simulation for N steps, save state, restart, run N more +steps; compare to continuous 2N-step run — results should be identical + +**Status:** `BLOCKED ON §3.20` (HDF5 I/O) + +--- + +## Cross-Solver Example Coverage Matrix + +This matrix shows which examples exist in each solver and whether Khronos +can implement them today or needs roadmap features. + +| Example | Meep | Tidy3D | fdtdx | Khronos | Status | +|---------|------|--------|-------|---------|--------| +| **Basics** | | | | | | +| Dielectric slab transmission | `bend-flux` | `StartHere` | — | `periodic_slab.jl` ✓ | EXISTS | +| 2D dipole radiation | `straight-waveguide` | — | — | E46 | CAN BUILD | +| 2D waveguide propagation | `straight-waveguide` | — | — | E47 | CAN BUILD | +| Planewave in vacuum | `oblique-planewave` | — | `simulate_gaussian_source` | `planewave.jl` ✓ | EXISTS | +| CW steady-state check | `solve-cw` | — | — | E25 | CAN BUILD | +| **Fresnel / Reflectance** | | | | | | +| Normal Fresnel | `refl-quartz` | — | — | E2 | CAN BUILD | +| Angular Fresnel | `refl-angular` | `BroadbandPlaneWave...` | — | E3 | BLOCKED (BCs) | +| Dispersive Fresnel | `refl-quartz` | `Dispersion` | — | E16 | BLOCKED (ADE) | +| **Scattering** | | | | | | +| Mie scattering (sphere) | `mie_scattering` | `Near2FarSphereRCS` | — | E4 | BLOCKED (flux) | +| Differential cross section | `differential_cross_section` | `PECSphereRCS` | — | E15 | BLOCKED (N2F) | +| Cylinder cross section | `cylinder_cross_section` | — | — | E43 | BLOCKED (cyl) | +| **Waveguides** | | | | | | +| Waveguide mode launch | — | `ModalSourcesMonitors` | 
`check_waveguide_modes` | `waveguide.jl` ✓ | EXISTS | +| Waveguide bend | `bent-waveguide` | `EulerWaveguideBend` | — | E5 | BLOCKED (flux) | +| Waveguide taper | `mode-decomposition` | `WaveguideSizeConverter` | — | E10 | BLOCKED (mode) | +| Directional coupler | `coupler` | `DirectionalCoupler` | — | E11 | BLOCKED (flux) | +| Width sweep (loss) | — | — | `width_sweep_analysis` | E9 | PARTIAL | +| **Resonators** | | | | | | +| Ring resonator Q | `ring` | `RingResonator` | — | E6 | BLOCKED (time mon) | +| PhC cavity Q | `holey-wvg-cavity` | `NanobeamCavity` | — | E8 | BLOCKED (BCs) | +| PhC band structure | `holey-wvg-bands` | `Bandstructure` | — | E7 | BLOCKED (BCs) | +| **Gratings** | | | | | | +| Binary grating orders | `binary_grating` | `GratingEfficiency` | — | E12 | BLOCKED (BCs) | +| Grating coupler | — | `GratingCoupler` | — | E13 | BLOCKED (BCs) | +| **Far-Field** | | | | | | +| Dipole radiation pattern | `antenna-radiation` | `AntennaCharacteristics` | — | E14 | BLOCKED (N2F) | +| Zone plate focusing | `zone_plate` | `ZonePlateFieldProjection` | — | E44 | BLOCKED (cyl+N2F) | +| **Materials** | | | | | | +| Lorentzian dispersion | `refl-quartz` | `Dispersion` | — | E16 | BLOCKED (ADE) | +| Drude metal | — | `PlasmonicNanoparticle` | — | E17 | BLOCKED (ADE) | +| χ³ nonlinearity | `3rd-harm-1d` | `KerrSidebands` | — | E18 | BLOCKED (NL) | +| Anisotropic dielectric | `polarization_grating` | `FullyAnisotropic` | `gaussian_anisotropic` | E19 | CAN BUILD | +| **Boundaries** | | | | | | +| PEC/PMC cavity | — | `BoundaryConditions` | — | E20 | BLOCKED (PEC) | +| Periodic verification | `oblique-planewave` | `BoundaryConditions` | — | E21 | BLOCKED (BCs) | +| CFS-PML stability | — | `AbsorbingBoundaryReflection` | — | E45 | BLOCKED (CFS) | +| **Advanced** | | | | | | +| Mirror symmetry | `bend-flux` | `Symmetry` | — | E22 | BLOCKED (sym) | +| Non-uniform grid | — | `AutoGrid` | — | E23 | BLOCKED (grid) | +| LDOS | `metal-cavity-ldos` | — | — | E27 | BLOCKED 
(LDOS) | +| Subpixel convergence | — | — | — | E48 | BLOCKED (subpx) | +| **Performance** | | | | | | +| Cross-solver benchmark | — | — | — | E34 | CAN BUILD | +| Single-GPU throughput | — | — | — | E28/E29 | CAN BUILD | +| Backend comparison | — | — | — | E30 | CAN BUILD | +| Precision (F32 vs F64) | — | — | — | E31 | CAN BUILD | +| JIT overhead | — | — | — | E32 | CAN BUILD | +| Memory capacity | — | — | — | E33 | CAN BUILD | +| Weak scaling (multi-GPU) | — | — | — | E35 | BLOCKED (MPI) | +| Strong scaling (multi-GPU) | — | — | — | E36 | BLOCKED (MPI) | +| Halo profiling | — | — | — | E37 | BLOCKED (MPI) | +| Large-scale realistic | — | — | — | E38a-d | PARTIAL | +| Roofline analysis | — | — | — | E39 | CAN BUILD | +| Non-uniform grid perf | — | — | — | E40 | BLOCKED (grid) | +| **Inverse Design** | | | | | | +| Waveguide bend opt | — | `Autograd8WaveguideBend` | `optimize_ceviche_corner` | E41 | BLOCKED (adj) | +| Metalens opt | — | `Autograd7Metalens` | — | E42 | BLOCKED (adj) | +| **Gaussian Beam** | | | | | | +| Beam waist validation | — | `AdvancedGaussianSources` | `simulate_gaussian_source` | E24 / existing ✓ | PARTIAL | + +--- + +## Implementation Priority + +### Phase 1: Build Now (no new features required) + +These examples can be implemented immediately with the current Khronos +feature set and provide valuable coverage: + +| Example | What it validates | +|---------|-------------------| +| **E2** Fresnel Reflectance (Normal) | Material interfaces, DFT accuracy, two-run workflow | +| **E25** CW Steady-State Verification | CW source, Green's function decay | +| **E46** 2D TM Dipole | 2D code path validation | +| **E47** 2D TE Waveguide | 2D TE code path validation | +| **E19** Anisotropic Material Slab | Per-axis permittivity | +| **E1** (enhance) Periodic Slab | Add 2D version, error metrics | +| **E24** (enhance) Gaussian Beam | Add waist measurement | +| | | +| **Scaling / Performance (CAN BUILD NOW):** | | +| **E28** Single-GPU Throughput vs. 
Size | GPU utilization curve, crossover vs. CPU | +| **E29** Throughput vs. Physics Complexity | Kernel specialization overhead | +| **E30** Backend Comparison | CUDA/ROCm/Metal/CPU relative speed | +| **E31** Precision Comparison (F32 vs F64) | Speed/accuracy tradeoff | +| **E32** JIT Compilation Overhead | Cold vs. warm start timing | +| **E33** Memory Capacity Test | Max problem size per GPU | +| **E34** Cross-Solver Performance | Khronos vs. Meep vs. fdtdx | +| **E38d** Vacuum Stress Test | Max grid at memory limit | +| **E39** Kernel Roofline Analysis | Memory bandwidth utilization | + +### Phase 2: After P0 Features (§1.1–§1.3, §3.1–§3.3) + +Once periodic BCs and flux monitors are implemented: + +| Example | Unlocked by | +|---------|-------------| +| **E3** Angular Fresnel | §1.1 + §3.1 (Bloch BCs) | +| **E4** Mie Scattering | §1.3 + §3.3 (flux monitors) | +| **E5** 2D Waveguide Bend | §1.3 + §3.3 (flux monitors) | +| **E21** Periodic Verification | §1.1 + §3.1 (periodic BCs) | +| **E9** Mode Source (full) | §1.3 + §3.3 (flux monitors) | +| **E38b** PIC Benchmark | §1.3 + §3.3 (flux monitors) | +| **E38c** Large Mie Benchmark | §1.3 + §3.3 (flux monitors) | + +### Phase 3: After P0–P1 Features (§1.2, §3.2, §1.4, §3.4, §3.8, §3.12) + +Once dispersive materials, mode monitors, CFS-PML, and time monitors work: + +| Example | Unlocked by | +|---------|-------------| +| **E16** Dispersive Fresnel | §1.2 + §3.2 (ADE) | +| **E17** Drude Metal | §1.2 + §3.2 (ADE) | +| **E6** Ring Resonator Q | §3.12 (time monitor) | +| **E7** PhC Bands | §1.1 + §3.1 (BCs) | +| **E10** Waveguide Taper | §1.4 + §3.4 (mode overlap) | +| **E45** CFS-PML Stability | §3.8 (CFS-PML) | +| **E12** Binary Grating | §1.1 + §1.9 + §3.16 | +| **E40** Non-Uniform Grid Perf | §2.5 + §3.6 (grid) | + +### Phase 4: After P2 Features + +| Example | Unlocked by | +|---------|-------------| +| **E14** Antenna Pattern | §1.5 + §3.10 (near-to-far) | +| **E18** THG Nonlinear | §1.6 + §3.13 (nonlinear) | +| 
**E22** Symmetry | §1.7 + §2.4 (symmetry) | +| **E23** Non-Uniform Grid Accuracy | §2.5 + §3.6 (grid) | +| **E43** Cylindrical | §3.14 (cylindrical) | +| **E35/E36/E37** Multi-GPU Scaling | §2.1 + §3.7 (MPI) | +| **E41** Inverse Design | §2.7 (adjoint) | +| **E38a** Metalens Benchmark | §1.1 (periodic BCs) | + +--- + +## Quantitative Validation Standards + +Every example with an analytical or cross-solver reference should report: + +1. **Accuracy metric**: Max absolute error, RMS error, or relative error + vs. analytical/reference solution. +2. **Convergence**: Error vs. resolution (cells/wavelength) should show + expected order (first-order without subpixel, second-order with). +3. **Performance**: Cells/second, total wall time, peak GPU memory. +4. **Pass/fail threshold**: Each example should define an acceptable error + bound (e.g., |T_khronos - T_analytic| < 0.01 for all frequencies). + +Example output format: + +``` +======================================== +Khronos.jl Validation: E2 Fresnel Reflectance +======================================== +Grid: 200 x 1 x 1600 (resolution=40) +Backend: CUDA (NVIDIA A100) +Precision: Float64 + +Results: + Max |R_err|: 0.0023 + RMS R_err: 0.0008 + Frequencies: 100 points (0.4 - 1.0 μm) + +Performance: + Wall time: 12.3 s (110 timesteps warm) + Cells/second: 1.2e9 + GPU memory: 0.8 GB + +Status: PASS (max error < 0.01 threshold) +======================================== +``` diff --git a/docs/ROADMAP.md b/docs/ROADMAP.md new file mode 100644 index 0000000..c735dea --- /dev/null +++ b/docs/ROADMAP.md @@ -0,0 +1,1463 @@ +# Khronos.jl Feature Roadmap + +This document catalogs every feature gap between Khronos.jl and the three +reference FDTD solvers (Meep, Tidy3D, fdtdx), organized around a three-layer +architecture that cleanly separates concerns and enables long-term scalability. 
+ +Reference codebases: + +| Solver | Language | GPU | Differentiable | Open-Source | +|--------|----------|-----|----------------|-------------| +| **Meep** | C++ / Python | No | Adjoint (autograd, black-box) | Yes (GPL-2) | +| **Tidy3D** | Python API / proprietary backend | Yes (cloud) | Adjoint (autograd) | API only | +| **fdtdx** | Python / JAX | Yes (JAX) | Yes (time-reversible custom VJP) | Yes (MIT) | +| **Khronos.jl** | Julia | Yes (KernelAbstractions) | Not yet (Enzyme.jl potential) | Yes (MIT) | + +--- + +## Architecture Overview + +Khronos.jl is structured as three distinct layers, each with a clear +responsibility boundary: + +``` +┌─────────────────────────────────────────────────────────────┐ +│ Layer 1: Frontend │ +│ User-facing API, geometry/material/source/monitor types, │ +│ simulation specification DSL, GUI potential │ +│ │ +│ Emits: Simulation IR (declarative, language-agnostic) │ +├─────────────────────────────────────────────────────────────┤ +│ Layer 2: Graph Compiler │ +│ Domain decomposition, kernel selection/specialization, │ +│ memory planning, launch configuration, schedule generation │ +│ │ +│ Emits: Execution plan (kernel DAG + memory map) │ +├─────────────────────────────────────────────────────────────┤ +│ Layer 3: Engine Backend │ +│ Optimized FDTD kernels, field updates, halo exchange, │ +│ multi-backend execution (CUDA/ROCm/Metal/CPU) │ +│ │ +│ Executes: The plan, producing simulation results │ +└─────────────────────────────────────────────────────────────┘ +``` + +**Layer 1 (Frontend)** is concerned with *what* the user wants to simulate. +It provides the API for defining geometries, materials, sources, monitors, +and boundary conditions. Long-term, it emits a declarative IR (leveraging +`GeometryPrimitives.jl`) that is language-agnostic, enabling non-Julia +frontends (Python bindings, GUI, config files) to target the same compiler. + +**Layer 2 (Graph Compiler)** is concerned with *how* to execute the +simulation efficiently. 
It takes the declarative IR and produces an +execution plan: which kernels to launch, in what order, with what +specializations, on what devices, and with what memory layout. Initially +this layer handles domain decomposition and kernel selection; progressively +it evolves toward full type-specialized codegen. + +**Layer 3 (Engine Backend)** is concerned with *executing* the plan. It owns +the actual FDTD kernels (`@kernel` functions), field array storage, time +stepping, PML updates, source injection, and monitor accumulation. It +receives a fully resolved execution plan and runs it. + +--- + +## Priority Definitions + +| Tier | Label | Meaning | +|------|-------|---------| +| **P0** | Critical | Blocks the majority of practical FDTD workflows | +| **P1** | High | Required for competitive feature parity or significant performance gains | +| **P2** | Medium | Important for specific application domains | +| **P3** | Low | Nice-to-have, niche, or complementary | + +--- + +# Layer 1: Frontend + +The Frontend layer defines the user-facing API and the declarative +simulation specification. Features here affect how users describe +simulations, what physical models they can express, and how the simulation +description is represented internally. + +--- + +## 1.1 Boundary Specification API — P0 + +**Present in:** Meep, Tidy3D, fdtdx + +**Why critical:** Users currently have no way to request anything other than +PML. The API must support per-axis, per-side boundary type selection. + +**Current state:** `README.md:27` states "All simulations require PML." No +boundary type selection exists in the user API. 
+ +**Implementation guidance:** + +Add a per-axis, per-side boundary type enum and specification struct: + +```julia +@enum BoundaryType PML_BC Periodic_BC Bloch_BC PEC_BC PMC_BC + +struct BoundarySpec + x::Tuple{BoundaryType, BoundaryType} # (minus, plus) + y::Tuple{BoundaryType, BoundaryType} + z::Tuple{BoundaryType, BoundaryType} +end +``` + +Tidy3D's `BoundarySpec` with per-face `Boundary(minus, plus)` is a good +model. Add a `boundary_spec` field to `Simulation`, defaulting to PML on +all faces for backward compatibility. + +For Bloch-periodic BCs, add a `k_point::Union{SVector{3},Nothing}` field +on `Simulation` that the user sets to specify the Bloch wave vector. + +**Estimated scope:** ~80 lines in `DataStructures.jl`, `Simulation.jl`. + +--- + +## 1.2 Dispersive Material Types (Lorentz / Drude / Debye Poles) — P0 + +**Present in:** Meep, Tidy3D (both extensive) + +**Why critical:** Metals (Au, Ag, Cu, Al), semiconductors at optical +frequencies, and any broadband simulation through dispersive dielectrics +require frequency-dependent permittivity. Users need to be able to specify +these material models. + +**Current state:** `DataStructures.jl:240` has `#TODO: add info relevant for +polarizability`. The `Polarizability.jl` section (line 249-252) is empty. + +**Implementation guidance:** + +Define a `Susceptibility` abstract type with concrete subtypes: + +```julia +abstract type Susceptibility end + +struct LorentzianPole{N} <: Susceptibility + σ::N # oscillator strength (Δε * ω₀²) + ω₀::N # resonance frequency + γ::N # damping rate +end + +struct DrudePole{N} <: Susceptibility + σ::N # plasma-frequency-squared term + γ::N # collision frequency +end +``` + +Add a `poles::Vector{Susceptibility}` field to `Material`. + +For maximum generality, consider implementing the Pole-Residue +representation (as in Tidy3D's `PoleResidue` class). All Lorentz, Drude, +Debye, and Sellmeier models can be converted to pole-residue pairs `(aₙ, cₙ)`. 
+ +**Key references:** +- Taflove & Hagness, *Computational Electrodynamics*, Ch. 9 (ADE method) +- Meep `meep.hpp:88-278` (susceptibility class hierarchy) + +**Estimated scope:** ~150 lines for types in new `Susceptibility.jl` and +modifications to `DataStructures.jl`. + +--- + +## 1.3 Flux Monitor Type — P0 + +**Present in:** Meep (`dft_flux`), Tidy3D (`FluxMonitor`), fdtdx (`PoyntingFluxDetector`) + +**Why critical:** Computing transmission and reflection spectra is the +single most common FDTD post-processing task. Users need to be able to +specify flux monitors in the simulation description. + +**Current state:** Khronos has `DFTMonitor` (single-component +frequency-domain fields) but no flux monitor type. + +**Implementation guidance:** + +Add a `FluxMonitor` type that specifies a planar region and frequency list: + +```julia +struct FluxMonitor + center::SVector{3} + size::SVector{3} # one dimension should be 0 (planar) + freqs::Vector{Float64} + normal::Direction # derived from size, or user-specified +end +``` + +The flux computation itself happens in the Engine (see §3.3). The Frontend +only needs the type definition and validation (ensure exactly one dimension +is zero). + +**Estimated scope:** ~50 lines in `Monitors.jl` / `DataStructures.jl`. + +--- + +## 1.4 Mode Decomposition Monitor — P1 + +**Present in:** Meep (MPB integration), Tidy3D (`ModeMonitor`), fdtdx (`ModeOverlapDetector`) + +**Why it matters:** Determines how much power couples into each waveguide +mode. Essential for photonic circuit design (S-parameters, insertion loss, +crosstalk). + +**Current state:** Khronos has a mode solver (`VectorModesolver.jl`) and +mode source injection, but no overlap monitor type. + +**Implementation guidance:** + +Add a `ModeMonitor` type specifying the cross-section plane, modes of +interest, and frequencies. The mode-solve and overlap computation happens +in the Engine (see §3.4). + +**Estimated scope:** ~50 lines. 
+ +--- + +## 1.5 Near-to-Far Field Monitor — P1 + +**Present in:** Meep (full dyadic Green's function), Tidy3D (angle/Cartesian/k-space) + +**Why it matters:** Computing radiation patterns, scattering cross-sections, +and far-field intensities without simulating the full propagation distance. + +**Implementation guidance:** + +Add monitor types for near-to-far field projection: + +```julia +struct NearFieldSurface + center::SVector{3} + size::SVector{3} + freqs::Vector{Float64} +end + +struct FarFieldProjection + surface::NearFieldSurface + observation::Union{AngleGrid, CartesianGrid, KSpaceGrid} +end +``` + +Support observation in angle space (θ, φ), Cartesian space (x, y at +distance d), and k-space (kx/k₀, ky/k₀), following Tidy3D's model. + +**Estimated scope:** ~100 lines for type definitions. + +--- + +## 1.6 Nonlinear Material Types (χ² and χ³) — P2 + +**Present in:** Meep (χ² and χ³ with Padé approximant), Tidy3D (Kerr, TPA, FCA) + +**Implementation guidance:** + +Add `chi2` and `chi3` fields to `Material`: + +```julia +struct Material{N} + ... + chi2::N # second-order nonlinear susceptibility + chi3::N # third-order nonlinear susceptibility +end +``` + +The nonlinear update equations are handled by the Engine (see §3.6). + +**Key references:** +- Meep `step_generic.cpp:535-547` +- Tidy3D `nonlinear.py` (`KerrNonlinearity`, `TwoPhotonAbsorption`) + +**Estimated scope:** ~30 lines in `DataStructures.jl`. + +--- + +## 1.7 Symmetry Specification — P2 + +**Present in:** Meep (mirror, rotate2, rotate4), Tidy3D (per-axis ±1) + +**Implementation guidance:** + +Add a `symmetry::SVector{3,Int}` field on `Simulation` (values: 0, +1, -1 +per axis). The Graph Compiler uses this to halve grid dimensions and the +Engine applies symmetry-aware boundary conditions. The Frontend only needs +the type and validation. + +**Key references:** +- Tidy3D `Simulation.symmetry` (per-axis 0/+1/-1) + +**Estimated scope:** ~30 lines. 
+ +--- + +## 1.8 TFSF (Total-Field/Scattered-Field) Source — P2 + +**Present in:** Tidy3D (`TFSF`), fdtdx (`tfsf.py`) + +**Implementation guidance:** + +Add a `TFSFSource` type specifying the injection box, plane wave direction, +polarization, and time profile. TFSF injects a plane wave inside a +rectangular volume while maintaining zero incident field outside. + +**Key references:** +- Taflove & Hagness, Ch. 5.6 (TFSF formulation) +- Tidy3D `source/field.py` (`TFSF` class) + +**Estimated scope:** ~60 lines for the type definition. + +--- + +## 1.9 Diffraction Monitor — P2 + +**Present in:** Tidy3D (`DiffractionMonitor`), fdtdx (`DiffractiveDetector`) + +**Implementation guidance:** + +Add a `DiffractionMonitor` type for periodic structures. Specifies the +monitor plane and diffraction orders of interest. The FFT-based computation +happens in the Engine. + +**Estimated scope:** ~40 lines. + +--- + +## 1.10 LDOS Monitor — P2 + +**Present in:** Meep (`dft_ldos`) + +**Implementation guidance:** + +Add an `LDOSMonitor` type that is co-located with a point dipole source. +Records the DFT of the E-field at the source location. + +**Key references:** +- Meep `dft_ldos.cpp:60-80` + +**Estimated scope:** ~30 lines. + +--- + +## 1.11 Force / Stress Tensor Monitor — P2 + +**Present in:** Meep (`dft_force`, stress tensor integration) + +**Implementation guidance:** + +Add a `ForceMonitor` type specifying a closed surface around an object. +The Maxwell stress tensor integration happens in the Engine. + +**Key references:** +- Meep `stress.cpp:90-114` + +**Estimated scope:** ~40 lines. + +--- + +## 1.12 Custom Sources (Current and Time Profile) — P3 + +**Current state:** `CustomCurrentSource` at `SpatialSources.jl:252-277` is +`# TODO`. Custom time source at `TimeSources.jl:138` is `# TODO`. + +**Implementation:** Allow users to pass arbitrary arrays (spatial current +distribution) and functions (time profile) to the source infrastructure. + +**Estimated scope:** ~100 lines. 
+ +--- + +## 1.13 GDS Import — P3 + +**Present in:** Meep (libGDSII), Tidy3D (gdstk) + +**Implementation:** Use `GDSTK.jl` or parse GDSII directly to extract +polygon vertices. Convert polygons to `GeometryPrimitives.jl` shapes (or +a new `PolySlab` type). + +**Estimated scope:** ~200 lines. + +--- + +## 1.14 PolySlab Geometry (Extruded Polygon with Sidewall Angle) — P3 + +**Present in:** Tidy3D (`PolySlab` with `sidewall_angle`), fdtdx (extruded polygon) + +**Implementation:** Define a `PolySlab` shape with 2D polygon vertices, +extrusion axis, slab bounds, and optional sidewall taper angle. Use +point-in-polygon testing for the cross-section. + +**Estimated scope:** ~200 lines. + +--- + +## 1.15 STL / Triangle Mesh Import — P3 + +**Present in:** Tidy3D (`TriangleMesh`) + +**Implementation:** Parse STL files into triangle meshes. Use ray-casting +(Möller-Trumbore intersection) for point-in-mesh testing during geometry +initialization. + +**Estimated scope:** ~300 lines. + +--- + +## 1.16 Astigmatic Gaussian Beam Source — P3 + +**Present in:** Tidy3D (`AstigmaticGaussianBeam`) + +**Implementation:** Extend `GaussianBeamSource` to support two independent +waist radii and waist distances (one per transverse axis). + +**Estimated scope:** ~50 lines. + +--- + +## 1.17 Time Modulation of Materials — P3 + +**Present in:** Tidy3D (`ModulationSpec`, `SpaceTimeModulation`) + +**Implementation:** Allow ε and σ to vary sinusoidally in time. Add a +`ModulationSpec` field to `Material` specifying modulation amplitude, +frequency, and phase. + +**Estimated scope:** ~60 lines for types. + +--- + +## 1.18 Lumped Circuit Elements — P3 + +**Present in:** Tidy3D (`LumpedResistor`, `RLCNetwork`) + +**Implementation:** Define R, L, C element types that embed as modified +update equations in localized grid regions. + +**Estimated scope:** ~80 lines for types. 
+ +--- + +## 1.19 Simulation IR Specification — P1 + +**Not present in:** any reference solver (all are monolithic) + +**Why it matters:** A declarative, language-agnostic intermediate +representation decouples the user API from the compiler and engine. This +enables multiple frontends (Julia API, Python bindings, GUI, config files) +to target the same simulation pipeline. + +**Implementation guidance:** + +The IR should be a serializable representation of a complete simulation +specification, leveraging `GeometryPrimitives.jl` for shape primitives. +It captures: + +1. **Domain**: cell size, resolution (uniform or non-uniform), dimension. +2. **Geometry**: ordered list of geometric objects with material assignments. +3. **Materials**: permittivity, permeability, conductivity, susceptibility + poles, nonlinear coefficients. +4. **Sources**: type, spatial extent, time profile, polarization. +5. **Monitors**: type, location, frequencies, components. +6. **Boundaries**: per-axis, per-side boundary types + Bloch k-vector. +7. **Symmetries**: per-axis symmetry phases. +8. **Run parameters**: total time, Courant factor, convergence criteria. + +Initially this can be a Julia struct hierarchy (essentially a cleaned-up +version of `SimulationData`). The language-agnostic serialization format +(JSON, MessagePack, or a schema-defined binary format) comes later. + +**Estimated scope:** ~300 lines for initial Julia struct IR. + +--- + +## 1.20 Energy Monitor — P3 + +**Present in:** fdtdx (`EnergyDetector`) + +**Implementation:** Add an `EnergyMonitor` type. Computes +`u = ½(ε|E|² + μ|H|²)` at monitor locations, either instantaneous or +DFT-based. + +**Estimated scope:** ~30 lines for the type. + +--- + +# Layer 2: Graph Compiler + +The Graph Compiler takes a complete simulation specification (from Layer 1) +and produces an optimized execution plan for Layer 3. 
It is responsible for +all decisions about *how* to run the simulation efficiently: domain +decomposition, kernel specialization, memory layout, device assignment, and +scheduling. + +This layer evolves progressively: + +- **Phase A (near-term):** Domain decomposition + kernel selection based on + active physics modules. Essentially formalizes the logic currently + embedded in `prepare_simulation!`. +- **Phase B (mid-term):** Full type-specialized kernel DAG generation. + Analyzes which physics modules are active (dispersive? nonlinear? PML?) + and produces a minimal kernel schedule, eliminating branches at runtime. +- **Phase C (long-term):** Cost-model-driven optimization. Profile-guided + kernel fusion, occupancy tuning, memory placement (shared vs global), + and multi-device load balancing. + +--- + +## 2.1 Domain Decomposition Planner — P1 + +**Present in:** Meep (MPI), fdtdx (JAX `NamedSharding`), Tidy3D (cloud) + +**Why it matters:** Single-GPU memory limits simulation size to ~2000³ +voxels (Float32). The compiler must decide how to partition the domain +across devices. + +**Current state:** No multi-device logic exists. Ghost cells exist +(`OffsetArrays` with ±1 padding) and 8 `# TODO update halo` comments mark +insertion points. + +**Implementation guidance:** + +The domain decomposition planner takes the simulation grid dimensions and +available device count, then produces a partition map: + +1. **Single-axis decomposition** — Split along the longest axis. Assign + contiguous slab regions to each device/rank. Emit halo-exchange + instructions in the execution plan between each field-update substep. + +2. **Cost-balanced partitioning** — Weight partitions by computational cost + (PML regions are more expensive than free space). Meep + `structure.cpp:66-94` implements binary-tree cost-based partitioning. + +3. **Multi-axis decomposition** — For large device counts, split along + 2-3 axes using a 3D process grid. 
+ +The planner emits a `PartitionPlan` struct consumed by the Engine: + +```julia +struct PartitionPlan + axis::Int # split axis + slabs::Vector{UnitRange{Int}} # per-device grid ranges + neighbors::Vector{Tuple{Int,Int}} # (left, right) rank per device + halo_width::Int +end +``` + +**Key references:** +- Meep `structure.cpp:66-94` (cost-based partitioning) +- fdtdx `sharding.py` (`get_named_sharding_from_shape`) + +**Estimated scope:** ~300 lines. + +--- + +## 2.2 Kernel Selection and Specialization — P1 + +**Not present in:** any reference solver (unique to Khronos) + +**Why it matters:** Khronos's `Nothing`-vs-`AbstractArray` dispatch pattern +in `Timestep.jl` generates ~32+ specialized GPU kernels from a single +source. The Graph Compiler should formalize this: given the set of active +physics (PML? dispersive? nonlinear? magnetic materials?), select the +minimal set of kernel specializations needed. + +**Current state:** Kernel specialization happens implicitly through Julia's +multiple dispatch at JIT time. This works but is opaque — the user and +compiler cannot inspect or optimize the kernel schedule. + +**Implementation guidance:** + +1. **Physics feature flags** — Analyze the simulation IR to determine which + features are active: + - PML boundaries → PML-aware curl kernels + - Dispersive materials → ADE update kernels + - Nonlinear materials → nonlinear constitutive update + - μ ≠ 1 → non-trivial H-from-B update + - Non-uniform grid → vector Δx kernels + +2. **Kernel DAG** — Produce a directed acyclic graph of kernel invocations + per timestep. Each node specifies the kernel function, its type + parameters (which dispatch variant), and its array arguments. The DAG + encodes dependencies (e.g., ADE update must happen between D-update and + E-update). + +3. **Specialization table** — For each region of the grid (interior, PML, + interface), record which kernel variant applies. Regions with different + physics get different kernel launches. 
+ +**Estimated scope:** ~400 lines. + +--- + +## 2.3 Memory Planning — P2 + +**Why it matters:** GPU memory is the primary bottleneck for 3D FDTD. The +compiler should compute the total memory footprint before launching and +warn (or auto-adjust resolution) if it exceeds device capacity. + +**Implementation guidance:** + +1. **Static analysis** — From the simulation IR, compute: + - Field arrays: `6 × Nx × Ny × Nz × sizeof(T)` (E and H) + - PML arrays: auxiliary ψ fields in PML regions + - Dispersive arrays: P and P_prev per pole per component + - Monitor arrays: DFT accumulation buffers + - Material arrays: ε, μ, σ per component + +2. **Memory budget** — Query device memory via KernelAbstractions and + compare against the computed requirement. + +3. **Auto-adjustment** — Optionally suggest resolution reduction, single- + precision, or multi-GPU decomposition if the problem exceeds single-GPU + memory. + +**Estimated scope:** ~200 lines. + +--- + +## 2.4 Symmetry Exploitation Planner — P2 + +**Present in:** Meep (mirror, rotate2, rotate4), Tidy3D (per-axis ±1) + +**Implementation guidance:** + +When the simulation IR specifies symmetries (see §1.7), the compiler: + +1. Halves the grid dimensions along each symmetric axis. +2. Modifies boundary conditions at symmetry planes: PEC-like (symmetric E, + antisymmetric H) for +1 symmetry, PMC-like for -1 symmetry. +3. Applies phase factors when monitors read fields across the symmetry + plane. +4. Reduces memory and computation by 2× per axis (up to 8× for 3 axes). + +The compiler emits a modified grid spec and symmetry boundary instructions +for the Engine. + +**Key references:** +- Meep `vec.hpp:1181-1231` (symmetry class) + +**Estimated scope:** ~200 lines. 
+ +--- + +## 2.5 Non-Uniform Grid Planning — P1 + +**Present in:** Tidy3D (per-axis `GridSpec`, `AutoGrid`, `CustomGrid`) +**Not in:** Meep (scalar `a`), fdtdx (scalar `resolution`) + +**Why it matters:** Structures with disparate feature scales waste enormous +memory with uniform grids. Per-axis resolution alone can give 4-100× +memory savings. **No other open-source GPU FDTD solver has this.** + +**Current state:** `DataStructures.jl:341-343` defines `Δx, Δy, Δz` as +`Union{Vector{N}, N, Nothing}` — designed from day one for non-uniform +grids. But all code paths treat them as scalars. + +**Implementation guidance:** + +The compiler resolves grid spacing from the simulation IR: + +1. **Per-axis uniform (Δx ≠ Δy ≠ Δz)** — Allow `resolution` to be a + `Tuple{Int,Int,Int}`. Compute `Δx = cell_size[1]/resolution[1]` etc. + Update Courant: `Δt = C / √(1/Δx² + 1/Δy² + 1/Δz²)`. + +2. **Per-cell non-uniform** — Produce `Δx::Vector{N}` arrays. The + compiler selects the correct kernel variants (vector Δx dispatch). + PML σ profiles become per-cell. Courant condition uses + `Δt = C × min(Δx_min, Δy_min, Δz_min) / √3`. + +3. **AutoGrid (future)** — Compute per-cell sizes based on local + wavelength in material and feature proximity (Tidy3D model). + +**Key references:** +- Tidy3D `grid_spec.py` (`AutoGrid`, `CustomGrid`, `MeshOverrideStructure`) + +**Estimated scope:** ~200 lines for per-axis uniform; ~400 additional for +per-cell planning. + +--- + +## 2.6 Launch Configuration Optimizer — P3 + +**Why it matters:** GPU performance is sensitive to thread block size, +shared memory usage, and register pressure. The compiler should auto-tune +launch configurations based on kernel type and grid dimensions. + +**Implementation guidance:** + +1. Query device compute capability and SM count. +2. For each kernel, compute occupancy at various block sizes. +3. 
Select the block size that maximizes occupancy (or use a heuristic + based on kernel characteristics — memory-bound vs compute-bound). +4. KernelAbstractions.jl's `@kernel` already handles some of this, but + explicit workgroup size selection can improve performance by 10-30%. + +**Estimated scope:** ~150 lines. + +--- + +## 2.7 Adjoint Compilation — P2 + +**Present in:** Meep (autograd), Tidy3D (autograd), fdtdx (time-reversible JAX AD) + +**Implementation guidance:** + +The compiler supports inverse design by producing both forward and adjoint +execution plans: + +1. **Enzyme.jl path (recommended)** — The compiler marks kernels for + Enzyme differentiation. Unlike fdtdx (which manually implements + `update_E_reverse()` and `update_H_reverse()` in `backward.py`), + Enzyme.jl differentiates directly through KernelAbstractions kernels. + +2. **Manual adjoint path** — The compiler generates the time-reversed + kernel DAG with adjoint sources from objective function gradients. + +Supporting infrastructure in the compiler: +- `DesignRegion` mapping latent parameters to permittivity arrays. +- Filter + projection pipeline (Gaussian filter → tanh projection). + fdtdx's `SubpixelSmoothedProjection` (`projection.py:215`, based on + arxiv.org/2503.20189) is the state of the art. +- Minimum feature size constraints (fdtdx `BrushConstraint2D`). + +**Key references:** +- fdtdx `fdtd.py:16-230` (reversible FDTD with custom VJP) +- fdtdx `projection.py:215` (subpixel smoothed projection) +- Meep `python/adjoint/` (OptimizationProblem, DesignRegion) +- Tidy3D `plugins/invdes/` (TopologyDesignRegion, FilterProject) + +**Estimated scope:** ~1000+ lines for the full pipeline. + +--- + +## 2.8 KD-Tree Geometry Acceleration — P3 + +**Current state:** `Geometry.jl:8` notes this as a future optimization. +Currently uses linear search through geometry objects. + +**Implementation guidance:** + +During compilation, build a KD-tree (or BVH) over bounding boxes for the +geometry list. 
The Engine uses tree traversal instead of linear `findfirst` +during voxelization. Matters for scenes with hundreds of objects. + +**Estimated scope:** ~150 lines (or use `NearestNeighbors.jl`). + +--- + +# Layer 3: Engine Backend + +The Engine Backend executes the plan produced by the Graph Compiler. It +owns the FDTD kernels, field array storage, time stepping, boundary +condition enforcement, source injection, and monitor accumulation. All +kernels use `KernelAbstractions.jl` `@kernel` macros for multi-backend +portability (CUDA, ROCm, Metal, oneAPI, CPU). + +--- + +## 3.1 Periodic and Bloch-Periodic Boundary Kernels — P0 + +**Present in:** Meep, Tidy3D, fdtdx (periodic only) + +**Why critical:** Required for photonic crystals, gratings, metasurfaces, +waveguide supercells. Without periodic BCs every such simulation must use +PML, wasting compute. + +**Current state:** No periodic boundary code exists. The ghost-cell +infrastructure exists (`Fields.jl:128-139`, `OffsetArrays` with one ghost +pixel per side). The 8 `# TODO update halo` sites in `Timestep.jl` +(lines 74, 108, 144, 178, 235, 243, 253, 264) are the exact insertion +points. + +**Implementation guidance:** + +1. **Periodic BCs** — For each axis flagged as periodic, copy field values + from one side of the domain to the ghost cells on the opposite side + after each field-update substep. Each halo-update site becomes: + + ```julia + if boundary_is_periodic(sim, X()) + copy_periodic_ghost!(sim.fields.fEx, X()) + end + ``` + + On GPU, this is a simple kernel or `copyto!` between slices. + +2. **Bloch-periodic BCs** — Same as periodic, but multiply copied values + by a complex phase factor `exp(i k · L)` where `k` is the Bloch wave + vector and `L` is the lattice vector. This requires: + - Complex-valued fields (switch `backend_number` to complex when + `k_point != nothing`). + - Phase factors `eikna = exp(i * k[d] * cell_size[d])` precomputed + once per axis. 
+ + Reference: Meep `boundaries.cpp:90-105` implements exactly this via + `eikna[d]`, `coskna[d]`, `sinkna[d]`. fdtdx implements periodic BCs + via `jnp.pad(mode="wrap")` in `curl.py`. + +**Estimated scope:** ~220 lines in `Boundaries.jl`, `Timestep.jl`. + +--- + +## 3.2 Dispersive Material ADE Kernels — P0 + +**Why critical:** The material types from §1.2 need corresponding update +kernels. + +**Current state:** Polarization fields `Px/Py/Pz` are wired through +`update_field!` (`Timestep.jl:475-477`) but always passed as `Nothing`. +The kernel dispatch infrastructure means adding real P arrays will +auto-generate the correct specialized kernel with no changes to +`update_field_generic`. + +**Implementation guidance:** + +The standard approach is the Auxiliary Differential Equation (ADE) method. +For each dispersive pole, auxiliary polarization fields `P` and `P_prev` +are stored and updated at each timestep. + +1. **Auxiliary fields** — For each pole, allocate two arrays per component + (`P_current`, `P_prev`) with the same shape as the E-field arrays. + Modify `Fields.jl` to allocate these when + `any(obj -> has_poles(obj.material), geometry)`. + +2. **ADE update kernel** — Insert a new kernel call between + `step_D_from_H!` and `update_E_from_D!` (between lines 30 and 32 of + `step!`). The Lorentzian ADE update is (Meep + `susceptibility.cpp:254`): + + ``` + P_next = γ₁⁻¹ * (P_cur * (2 - ω₀²Δt²) - γ₁ * P_prev + ω₀²Δt² * σ * E) + ``` + + where `γ₁ = 1 + γΔt/2`, `γ₁⁻¹ = 1 / γ₁`. + + The Drude case sets the denominator term differently (no `ω₀²` in the + resonance term). + +3. **Constitutive update** — In `update_E_from_D!`, the P fields are + already wired: `update_field_generic` at `Timestep.jl:437` adds + `P[idx]` to the net field. Once P arrays are non-`Nothing`, this path + activates automatically via dispatch. + +**Key references:** +- Taflove & Hagness, Ch. 
9 (ADE method) +- Meep `susceptibility.cpp:188-261` (Lorentzian ADE update) + +**Estimated scope:** ~350 lines in new `Susceptibility.jl` kernel code, +modifications to `Fields.jl`, `Timestep.jl`. + +--- + +## 3.3 Flux Computation Kernels — P0 + +**Why critical:** The flux monitor type from §1.3 needs corresponding +DFT accumulation and post-processing kernels. + +**Implementation guidance:** + +1. **DFTFluxMonitor kernel** — Store DFT accumulations for 4 tangential + components (e.g., for a z-normal plane: `Ex, Ey, Hx, Hy`). Reuse the + existing `update_dft_monitor!` kernel (`Monitors.jl:120-137`) for each + component independently. + +2. **Flux computation** — After the simulation, compute: + + ```julia + function flux(m::DFTFluxMonitorData, freq_idx) + # For z-normal: S_z = Ex*Hy - Ey*Hx + return real(sum(m.Ex[:,:,freq_idx] .* conj(m.Hy[:,:,freq_idx]) + - m.Ey[:,:,freq_idx] .* conj(m.Hx[:,:,freq_idx]))) + end + ``` + + Scale by `Δx * Δy / (N_timesteps)²` for proper normalization. + +3. **Normal direction handling** — Determine the 4 tangential components + from the monitor plane normal using the existing `Direction` type. + +**Key references:** +- Meep `dft.cpp` (DFT flux implementation) +- fdtdx `poynting_flux.py:11` (`PoyntingFluxDetector`) + +**Estimated scope:** ~150 lines in `Monitors.jl`. + +--- + +## 3.4 Mode Overlap Computation — P1 + +**Why it matters:** The mode monitor type from §1.4 needs the actual +overlap integral computation. + +**Implementation guidance:** + +1. Record DFT of all 4 tangential field components on a cross-section + plane (reuse `DFTMonitor` infrastructure). + +2. Compute the mode profile at each DFT frequency using + `VectorModesolver.jl`. + +3. Compute the overlap integral: + ``` + α = (1/4) ∫ (E_mode × H*_sim + E*_sim × H_mode) · dA + ``` + + This is exactly fdtdx's `ModeOverlapDetector` formula (`mode.py:15`). + +4. The complex coefficient `α` gives both amplitude and phase. Power in + mode `n` is `|αₙ|²`. 
+ +**Key references:** +- Meep `meep.hpp:2145-2151` (`get_overlap`, `get_mode_flux_overlap`) +- fdtdx `detectors/mode.py` + +**Estimated scope:** ~200 lines. + +--- + +## 3.5 Subpixel Averaging — P1 + +**Present in:** Meep (anisotropic averaging), Tidy3D (multiple methods) + +**Why it matters:** Without subpixel averaging, FDTD convergence at curved +dielectric interfaces is first-order instead of second-order. In practice +this means ~4× finer grids (and ~64× more voxels in 3D) for equivalent +accuracy. On GPU where memory is the bottleneck, this directly limits +problem size. + +**Current state:** `Geometry.jl` uses point sampling — each voxel material +is determined by a single `findfirst(point, geometry)` call (lines 187-203). + +**Implementation guidance:** + +The standard method (Farjadpour et al. 2006) computes an effective +inverse-permittivity tensor at each interface voxel: + +``` +ε⁻¹_eff = P‖ * <ε⁻¹> + P⊥ * <ε>⁻¹ +``` + +where `<ε>` and `<ε⁻¹>` are volume-averaged permittivity and its inverse, +`P‖` projects parallel to the interface, and `P⊥` projects perpendicular. + +1. **Normal vector computation** — Sample ε at N quadrature points within + each voxel. The gradient of ε gives the interface normal. Meep uses + sphere quadrature (`anisotropic_averaging.cpp:33-56`). A simpler start + is central differences on the ε grid. + +2. **Volume averaging** — For each voxel straddling a material interface, + compute the fill fraction `f` and averaged `<ε>`, `<ε⁻¹>`. + +3. **Tensor storage** — The effective `ε⁻¹` is a 3x3 symmetric tensor. + For diagonal anisotropy (the common case), store 3 components per + voxel. Khronos already supports per-component `εx, εy, εz` arrays. + +4. **Integration** — Modify `_write_geometry_3d!` in `Geometry.jl` to + detect boundary voxels and apply the averaging procedure. + +**Key references:** +- A. Farjadpour et al., "Improving accuracy by subpixel smoothing in the + finite-difference time domain," *Optics Letters* 31(20), 2006. 
+- Meep `anisotropic_averaging.cpp:90-150` +- Tidy3D `SubpixelSpec` + +**Estimated scope:** ~400 lines in `Geometry.jl`. + +--- + +## 3.6 Non-Uniform Grid Stencils — P1 + +**Why it matters:** The grid plans from §2.5 need corresponding stencil +implementations in the Engine. + +**Current state:** The finite difference stencils (`Timestep.jl:395-400`) +already parameterize on `Δx, Δy, Δz` independently: + +```julia +d_dx!(A, Δx, idx_curl, ix, iy, iz) = inv(Δx) * (A[ix+idx_curl,...] - A[ix,...]) +``` + +**Implementation guidance:** + +1. **Per-axis uniform** — No stencil changes needed. The existing functions + work with independent scalar `Δx, Δy, Δz`. + +2. **Per-cell non-uniform** — Add method overloads: + ```julia + d_dx!(A, Δx::AbstractVector, idx_curl, ix, iy, iz) = + inv(Δx[ix]) * (A[ix+idx_curl,...] - A[ix,...]) + ``` + + Julia's dispatch will generate a specialized GPU kernel that reads + `Δx[ix]` per cell, while the uniform case continues to use a + compile-time constant. + + PML sigma profiles become per-cell (already arrays, so natural). + +**Estimated scope:** ~200 lines in `Timestep.jl`. + +--- + +## 3.7 Halo Exchange (Multi-GPU/MPI) — P1 + +**Why it matters:** The partition plan from §2.1 needs corresponding data +movement kernels. + +**Current state:** 8 `# TODO update halo` sites in `Timestep.jl`. + +**Implementation guidance:** + +Two approaches: + +1. **MPI-based (Meep model)** — Use `MPI.jl` for inter-process + communication. After each field-update substep, exchange ghost-cell + slices between neighboring ranks. Use CUDA-aware MPI to avoid + device-to-host copies. + +2. **KernelAbstractions multi-device (fdtdx model)** — Use Julia's + `Distributed` or `CUDA.jl`'s multi-device API with explicit `copyto!` + for halo exchange. + +Each halo-update site needs: + +```julia +if num_processes > 1 + exchange_halo!(field_array, axis, rank, comm) +end +``` + +where `exchange_halo!` does non-blocking `MPI.Isend` / `MPI.Irecv` of +the boundary slices. 
+ +**Key references:** +- Meep `boundaries.cpp:35-77` (communication sequences) +- fdtdx `sharding.py` + +**Estimated scope:** ~600 lines. + +--- + +## 3.8 CFS-PML Activation (κ and α Parameters) — P1 + +**Present in:** Meep, Tidy3D, fdtdx (all use full CPML) + +**Why it matters:** Standard PML (σ only) can be unstable for evanescent +waves, long-time simulations, and dispersive media. CFS-PML adds κ +(coordinate stretching) and α (complex shift) for improved absorption. + +**Current state:** `BoundaryData` already defines `κBx/κBy/κBz`, +`κDx/κDy/κDz`, `αBx/αBy/αBz`, `αDx/αDy/αDz` +(`DataStructures.jl:145-158`) but they are all commented +`# currently not used`. + +**Implementation guidance:** + +The CFS-PML stretching factor is: + +``` +s(ω) = κ + σ / (α + iω) +``` + +In the time domain, the CPML update becomes (fdtdx `curl.py`): + +``` +b = exp(-(σ/κ + α) * Δt) +a = σ * (b - 1) / (σ + κα) +ψ_new = b * ψ_old + a * (∂F/∂x) +``` + +1. Populate κ and α arrays in `init_boundaries` (`Boundaries.jl:57-110`) + alongside existing σ arrays. + +2. Modify the `generic_curl!` overloads in `Timestep.jl` to use CPML + coefficients `a` and `b` (precomputed from σ, κ, α) instead of raw σ. + +3. Since κ and α fields already exist in the struct, this is mostly + computing them and threading them into existing PML update paths. + +**Key references:** +- J.A. Roden and S.D. Gedney, "Convolutional PML (CPML)," *Microwave and + Optical Technology Letters*, 27(5), 2000. +- fdtdx `perfectly_matched_layer.py:117` + +**Estimated scope:** ~200 lines in `Boundaries.jl` and `Timestep.jl`. + +--- + +## 3.9 PEC and PMC Boundary Walls — P2 + +**Present in:** Meep, Tidy3D (`PECBoundary`, `PMCBoundary`) + +**Implementation guidance:** + +- **PEC:** Zero the tangential E-field components at the boundary. For a + z-boundary: set `Ex = Ey = 0` on the wall. +- **PMC:** Zero the tangential H-field components. For a z-boundary: + set `Hx = Hy = 0` on the wall. 
+ +These are trivial Dirichlet conditions applied after each field update. +`Boundaries.jl:117` has a comment referencing Dirichlet zeroing but no code. + +**Estimated scope:** ~50 lines. + +--- + +## 3.10 Near-to-Far Field Transformation Kernels — P1 + +**Why it matters:** The monitor types from §1.5 need the actual Green's +function integration. + +**Implementation guidance:** + +1. Record DFT of tangential E and H on a closed/open surface. Compute + equivalent currents: `J = n × H`, `M = -n × E`. + +2. Integrate against the free-space Green's function: + - **3D:** Dyadic Green's function with 1/r, 1/r², 1/r³ terms + (Meep `near2far.cpp:133-187`). + - **2D:** Hankel functions (`H₀⁽¹⁾(kr)`, `H₁⁽¹⁾(kr)`). + +3. For far-field only, use the asymptotic form (plane-wave approximation), + reducing to a Fourier transform of the surface currents. + +**Key references:** +- Meep `near2far.cpp` +- Tidy3D `FieldProjectionAngleMonitor`, `FieldProjectionCartesianMonitor`, + `FieldProjectionKSpaceMonitor` +- Taflove & Hagness, Ch. 8 + +**Estimated scope:** ~500 lines. + +--- + +## 3.11 Custom Permeability (μ) Wiring — P2 + +**Present in:** Meep, Tidy3D, fdtdx + +**Current state:** The `Material` struct has `μx, μy, μz` fields. +`Geometry.jl` allocates `μ_inv_x/y/z` arrays but hardcodes `μ_inv = 1.0` +at line 312 with `# todo add μ support`. + +**Implementation guidance:** + +Wire the allocated μ arrays into `GeometryData` and pass them to +`update_H_from_B!` as the `m_inv` parameter (currently receives scalar +`1.0`). The kernel dispatch already handles `m_inv::AbstractArray` vs +`m_inv::Real` via `get_m_inv()` (`Timestep.jl:459-463`), so the kernel +side needs no changes. + +**Estimated scope:** ~30 lines in `Geometry.jl`. + +--- + +## 3.12 Time Monitor Kernel — P2 + +**Current state:** `TimeMonitor` struct and initialization exist +(`Monitors.jl:28-53`) but `#TODO add update and kernel functions for time +monitor` at line 55. 
+ +**Implementation guidance:** + +Write an `update_monitor` dispatch for `TimeMonitorData` that copies the +current field value at the monitor location into the preallocated storage +at the current time index: + +```julia +@kernel function update_time_monitor!(monitor_fields, sim_fields, + offset_x, offset_y, offset_z, time_idx) + ix, iy, iz = @index(Global, NTuple) + monitor_fields[time_idx, ix, iy, iz] = sim_fields[ix+offset_x, iy+offset_y, iz+offset_z] +end +``` + +**Estimated scope:** ~50 lines. + +--- + +## 3.13 Nonlinear Material Update Kernels — P2 + +**Why it matters:** The material types from §1.6 need corresponding +kernels. + +**Implementation guidance:** + +The nonlinear update modifies the E = ε⁻¹D constitutive relation to +include intensity-dependent terms. Meep uses a Padé approximant for +stability (`step_generic.cpp:535-547`): + +``` +u = (1 + c₂ + 2c₃) / (1 + 2c₂ + 3c₃) +E = u * ε⁻¹ * D +``` + +where `c₂ = D * χ² * (ε⁻¹)²` and `c₃ = |D|² * χ³ * (ε⁻¹)³`. + +Allocate per-voxel arrays in `Geometry.jl` for the nonlinear coefficients, +and add a multiplicative factor in `update_field_generic`. + +**Estimated scope:** ~200 lines. + +--- + +## 3.14 Cylindrical Coordinate Kernels — P2 + +**Present in:** Meep (`Dcyl`, angular mode index `m`) + +**Current state:** `abstract type Cylindrical <: Dimension` declared at +`DataStructures.jl:55`, `#TODO add support for cylindrical coordinates` at +`Simulation.jl:52`. + +**Implementation guidance:** + +Cylindrical FDTD replaces Cartesian curl equations with their cylindrical +equivalents (r, φ, z). For structures with rotational symmetry, the +φ-dependence is `exp(imφ)`, reducing 3D to 2D (r, z) with mode index `m`. + +1. New field arrays: `Er, Eφ, Ez, Hr, Hφ, Hz` on a (r, z) grid. +2. Modified curl equations with `1/r` metric factors and `im/r` terms. +3. Special handling at `r = 0` (axis singularity). + +**Key references:** +- Meep cylindrical coordinates implementation +- Taflove & Hagness, Ch. 
12 (body-of-revolution FDTD) + +**Estimated scope:** ~800 lines. + +--- + +## 3.15 TFSF Injection Kernels — P2 + +**Why it matters:** The source type from §1.8 needs corresponding +injection logic. + +**Implementation guidance:** + +TFSF injects a plane wave inside a rectangular volume while maintaining +zero incident field outside. On each face of the TFSF box: + +- Inner side: add incident E and H to the update equations. +- Outer side: subtract them. + +The incident field comes from a 1D auxiliary simulation (or analytical +formula for plane waves) propagated along the k-vector. + +**Key references:** +- Taflove & Hagness, Ch. 5.6 +- Tidy3D `source/field.py` + +**Estimated scope:** ~250 lines. + +--- + +## 3.16 Diffraction Order Computation — P2 + +**Why it matters:** The monitor type from §1.9 needs FFT-based +computation. + +**Implementation guidance:** + +For periodic structures, compute power in each diffraction order by taking +the 2D FFT of the tangential DFT fields on a plane: + +```julia +E_kx_ky = fft2(E_xy) +``` + +Extract power at discrete diffraction order wave vectors +`kx = 2πn/Lx`, `ky = 2πm/Ly`. + +**Estimated scope:** ~150 lines. + +--- + +## 3.17 LDOS Computation — P2 + +**Why it matters:** The monitor type from §1.10 needs the actual +computation. + +**Implementation guidance:** + +``` +LDOS(ω) = -(2/π) * Re[E(ω) · J*(ω)] / |J(ω)|² +``` + +Record the DFT of the E-field at the source location and the DFT of the +source current. + +**Key references:** +- Meep `dft_ldos.cpp:60-80` + +**Estimated scope:** ~100 lines. + +--- + +## 3.18 Stress Tensor Integration — P2 + +**Why it matters:** The monitor type from §1.11 needs the actual tensor +computation. + +**Implementation guidance:** + +Compute the Maxwell stress tensor from DFT fields on a closed surface: + +``` +T_ij = ε₀(E_iE_j - ½δ_ij|E|²) + μ₀⁻¹(H_iH_j - ½δ_ij|H|²) +``` + +Integrate over the surface to get the net force. + +**Estimated scope:** ~200 lines. 
+ +--- + +## 3.19 Harminv (Resonance Extraction) — P2 + +**Present in:** Meep (via external harminv library) + +**Implementation guidance:** + +Use the filter diagonalization method (FDM) or harmonic inversion to +extract resonant frequencies `ωₙ`, decay rates `γₙ`, amplitudes, and +phases from time-domain field data. Implement directly in Julia or wrap +the C harminv library via `ccall`. + +**Estimated scope:** ~200 lines. + +--- + +## 3.20 HDF5 / Checkpoint I/O — P3 + +**Present in:** Meep (HDF5), fdtdx (JAX checkpointing) + +**Implementation:** Use `HDF5.jl` to save/load field arrays, simulation +parameters, and monitor data. Enable checkpoint/restart for long +simulations. + +**Estimated scope:** ~200 lines. + +--- + +## 3.21 1D Simulations — P3 + +**Implementation:** When two cell dimensions are zero, set `ndims = 1` +and allocate only 1D field arrays. Add a `OneD` dimension type. Simplify +the curl to a single finite difference per component. + +**Estimated scope:** ~100 lines. + +--- + +## 3.22 Energy Computation Kernel — P3 + +**Implementation:** Compute `u = ½(ε|E|² + μ|H|²)` at monitor locations. +Either time-domain (instantaneous) or DFT-based. + +**Estimated scope:** ~80 lines. + +--- + +## 3.23 Lumped Element Update Equations — P3 + +**Why it matters:** The types from §1.18 need corresponding modified +update equations. + +**Implementation:** A resistor is equivalent to a conductivity in a single +voxel; inductors and capacitors require auxiliary state variables with +their own update equations embedded in the E-field update step. + +**Estimated scope:** ~150 lines. + +--- + +## 3.24 Time Modulation Kernels — P3 + +**Why it matters:** The types from §1.17 need runtime modulation of ε and +σ. + +**Implementation:** Add a time-dependent multiplicative factor to the +constitutive update. At each timestep, evaluate +`ε_eff(t) = ε₀ * (1 + Δε * sin(ωt + φ))` and update the permittivity +array or pass the modulation factor into the kernel. 
+ +**Estimated scope:** ~100 lines. + +--- + +# Cross-Layer Features + +Some features span multiple layers. They are listed here with references +to their per-layer components. + +| Feature | Frontend | Compiler | Engine | +|---------|----------|----------|--------| +| **Periodic/Bloch BCs** | §1.1 (boundary spec) | — | §3.1 (kernels) | +| **Dispersive Materials** | §1.2 (material types) | §2.2 (kernel selection) | §3.2 (ADE kernels) | +| **Flux Monitors** | §1.3 (monitor type) | — | §3.3 (computation) | +| **Mode Decomposition** | §1.4 (monitor type) | — | §3.4 (overlap) | +| **Near-to-Far Field** | §1.5 (monitor types) | — | §3.10 (Green's fn) | +| **Non-Uniform Grid** | — | §2.5 (planning) | §3.6 (stencils) | +| **Multi-GPU** | — | §2.1 (decomposition) | §3.7 (halo exchange) | +| **Symmetry** | §1.7 (spec) | §2.4 (planning) | — | +| **Adjoint/Inverse Design** | — | §2.7 (compilation) | — | +| **Nonlinear Materials** | §1.6 (types) | §2.2 (kernel selection) | §3.13 (kernels) | +| **TFSF Source** | §1.8 (source type) | — | §3.15 (injection) | +| **Diffraction Monitor** | §1.9 (monitor type) | — | §3.16 (FFT) | +| **LDOS** | §1.10 (monitor type) | — | §3.17 (computation) | +| **Force/Stress** | §1.11 (monitor type) | — | §3.18 (tensor) | +| **Subpixel Averaging** | — | — | §3.5 (geometry init) | +| **CFS-PML** | — | — | §3.8 (coefficients) | +| **Time Modulation** | §1.17 (types) | — | §3.24 (kernels) | +| **Lumped Elements** | §1.18 (types) | — | §3.23 (equations) | + +--- + +# Khronos-Unique Advantages + +These are capabilities that Khronos already has or is uniquely positioned +to implement, which the reference solvers lack: + +### A. Non-Uniform GPU FDTD (Open-Source First) + +No open-source GPU FDTD solver supports non-uniform grids. Khronos's type +system (`Union{Vector{N}, N, Nothing}` for Δx/Δy/Δz) and dispatch-based +kernel specialization make this achievable with minimal code changes. +**Spans:** §2.5 (compiler planning) + §3.6 (engine stencils). 
+ +### B. Composable Kernel Specialization + +The `Nothing`-vs-`AbstractArray` dispatch pattern in `Timestep.jl` +generates ~32+ specialized GPU kernels from a single source. This is +unique and should be preserved as new features are added — each new +physics module should follow the same pattern. The Graph Compiler (§2.2) +formalizes this into an explicit kernel selection step. + +### C. Multi-Backend from Single Source + +KernelAbstractions.jl targets CUDA, Metal, ROCm, oneAPI, and CPU from one +codebase. Ensure all new kernels use `@kernel` and `backend_array()`. + +### D. Source-Level AD via Enzyme.jl + +Unlike fdtdx's manual backward pass, Enzyme.jl can differentiate directly +through GPU kernels. This should be explored once the core feature set +stabilizes. The Graph Compiler (§2.7) manages the forward/adjoint plan +generation. + +### E. User-Extensible Physics + +Julia's open type system means users can define new material models, +source types, and monitors without modifying Khronos source code. The +three-layer architecture reinforces this: users can extend the Frontend +types and the Engine will dispatch to user-defined kernels. + +### F. Runtime Precision Selection + +`Float32` / `Float64` selectable per-simulation. Extend to mixed precision +(e.g., Float32 fields + Float64 monitor accumulation) for further GPU +memory savings without sacrificing output accuracy. + +### G. Language-Agnostic IR (Future) + +The declarative IR (§1.19) enables non-Julia frontends to target the same +compiler and engine. This is unique among FDTD solvers and opens the door +to Python bindings, GUI tools, and config-file-driven workflows.