From 115e41e08c136fd851029150ece9e31b2f5bef4b Mon Sep 17 00:00:00 2001 From: Alec Hammond Date: Sat, 7 Feb 2026 19:40:00 -0800 Subject: [PATCH 1/2] Add per-phase timing instrumentation and cold/warm benchmarking to examples Add detailed timing to prepare_simulation! (per-phase: init_geometry, init_boundaries, add_sources, init_fields, init_monitors) and to run() (total steps, elapsed time, MVoxels/s overall and after 50-step warmup). Augment all 6 example scripts to run twice with fresh simulation objects: once cold (includes JIT compilation) and once warm, to separate algorithmic cost from compilation overhead. Plotting is done only once at the end. --- examples/dipole.jl | 68 ++++++++++++++++++++++++++------------- examples/gaussian_beam.jl | 38 +++++++++++++--------- examples/periodic_slab.jl | 67 ++++++++++++++++++++++---------------- examples/planewave.jl | 27 +++++++++++++--- examples/sphere.jl | 25 +++++++++++--- examples/waveguide.jl | 35 ++++++++++++-------- src/Simulation.jl | 49 +++++++++++++++++++++++++++- 7 files changed, 220 insertions(+), 89 deletions(-) diff --git a/examples/dipole.jl b/examples/dipole.jl index 319491f..20bdbe8 100644 --- a/examples/dipole.jl +++ b/examples/dipole.jl @@ -7,30 +7,52 @@ using CairoMakie Khronos.choose_backend(Khronos.CUDADevice(), Float64) -# Place a z-polarized dipole with a wavelength 1μm at the center -sources = [ - Khronos.UniformSource( - time_profile = Khronos.ContinuousWaveSource(fcen = 1.0), - component = Khronos.Ex(), - center = [0.0, 0.0, 0.0], - size = [0.0, 0.0, 0.0], - # we add a pi phase shift to force a sine wave, rather than a cosine. - amplitude = 1im, - ), -] - -# Build the simulation object, such that it spans a cube 10μm×10μm×10μm. Place PML 1μm thick. 
-sim = Khronos.Simulation( - cell_size = [4.0, 4.0, 4.0], - cell_center = [0.0, 0.0, 0.0], - resolution = 10,#40, - sources = sources, - boundaries = [[1.0, 1.0], [1.0, 1.0], [1.0, 1.0]], -) +function build_dipole_sim() + # Place a z-polarized dipole with a wavelength 1μm at the center + sources = [ + Khronos.UniformSource( + time_profile = Khronos.ContinuousWaveSource(fcen = 1.0), + component = Khronos.Ex(), + center = [0.0, 0.0, 0.0], + size = [0.0, 0.0, 0.0], + # we add a pi phase shift to force a sine wave, rather than a cosine. + amplitude = 1im, + ), + ] + + # Build the simulation object, such that it spans a cube 10μm×10μm×10μm. Place PML 1μm thick. + sim = Khronos.Simulation( + cell_size = [4.0, 4.0, 4.0], + cell_center = [0.0, 0.0, 0.0], + resolution = 10,#40, + sources = sources, + boundaries = [[1.0, 1.0], [1.0, 1.0], [1.0, 1.0]], + ) + return sim +end + +function run_dipole() + sim = build_dipole_sim() + t_end = 1.0#40.0; + Khronos.run(sim, until = t_end) + return sim +end + +# --- Run 1: includes JIT compilation --- +println("=" ^ 60) +println("RUN 1 (cold — includes JIT compilation)") +println("=" ^ 60) +t1 = time() +sim = run_dipole() +println("Run 1 total wall time: $(round(time() - t1, digits=3))s\n") -# Run the simulation for 10 "time units" -t_end = 1.0#40.0; -Khronos.run(sim, until = t_end) +# --- Run 2: fresh sim, JIT already done --- +println("=" ^ 60) +println("RUN 2 (warm — JIT already compiled)") +println("=" ^ 60) +t2 = time() +sim = run_dipole() +println("Run 2 total wall time: $(round(time() - t2, digits=3))s\n") # Plot cross sections of the result scene = Khronos.plot2D( diff --git a/examples/gaussian_beam.jl b/examples/gaussian_beam.jl index 433656a..55e747a 100644 --- a/examples/gaussian_beam.jl +++ b/examples/gaussian_beam.jl @@ -53,7 +53,11 @@ function build_simulation(λ, waist_radius, resolution) return sim end -function run_simulation!(sim::Khronos.SimulationData) +function run_gaussian_beam() + λ = 1.0 + waist_radius = 0.75 + 
resolution = 20.0 + sim = build_simulation(λ, waist_radius, resolution) Khronos.run( sim, until_after_sources = Khronos.stop_when_dft_decayed( @@ -62,27 +66,31 @@ function run_simulation!(sim::Khronos.SimulationData) maximum_runtime = 300.0, ), ) + return sim end -function plot_source_profile!(sim::Khronos.SimulationData) - Khronos.prepare_simulation!(sim) - Ex = sum(sim.source_data[1].amplitude_data, dims = 3)[:, :, 1] - heatmap(Ex) -end - +# --- Run 1: includes JIT compilation --- +println("=" ^ 60) +println("RUN 1 (cold — includes JIT compilation)") +println("=" ^ 60) +t1 = time() +sim = run_gaussian_beam() +println("Run 1 total wall time: $(round(time() - t1, digits=3))s\n") + +# --- Run 2: fresh sim, JIT already done --- +println("=" ^ 60) +println("RUN 2 (warm — JIT already compiled)") +println("=" ^ 60) +t2 = time() +sim = run_gaussian_beam() +println("Run 2 total wall time: $(round(time() - t2, digits=3))s\n") + +# Visualize the source profile λ = 1.0 -waist_radius = 0.75 -resolution = 20.0 - -sim = build_simulation(λ, waist_radius, resolution) - -# Visualize the source profile in the time and frequency domains frequencies = 0.75:0.005:1.25 scene = Khronos.plot_timesource(sim, sim.sources[1], frequencies) save("sources.png", scene) -run_simulation!(sim) - # Visualize the response of the monitor scene = Khronos.plot_monitor(sim.monitors[1], 1) save("gaussian_beam.png", scene) diff --git a/examples/periodic_slab.jl b/examples/periodic_slab.jl index 598fe1b..452e32b 100644 --- a/examples/periodic_slab.jl +++ b/examples/periodic_slab.jl @@ -105,39 +105,50 @@ function compute_power(sim::Khronos.SimulationData) return sum(abs.(sim.monitors[1].monitor_data.fields) .^ 2, dims = [1, 2])[1, 1, 1, :] end -function main() - # Arbitrary simulation parameters that produce at least two maxima - d = 0.5 - n = 3.5 - λ = range(1.0, 2.0, length = 100) - resolution = 50 - - sim = build_simulation(d, n, λ, resolution; plot_geometry = false) +# Arbitrary simulation parameters 
that produce at least two maxima +d = 0.5 +n = 3.5 +λ = range(1.0, 2.0, length = 100) +resolution = 50 +function run_periodic_slab(d, n, λ, resolution) # Run a normalization simulation - run_simulation!(sim) - - # Compute the total power to normalize the actual simulation - baseline = compute_power(sim) + sim_norm = build_simulation(d, n, λ, resolution; plot_geometry = false) + run_simulation!(sim_norm) + baseline = compute_power(sim_norm) # Run the actual simulation - sim = build_simulation(d, n, λ, resolution; plot_geometry = true) - run_simulation!(sim) - slab = compute_power(sim) + sim_slab = build_simulation(d, n, λ, resolution; plot_geometry = true) + run_simulation!(sim_slab) + slab = compute_power(sim_slab) - # Normalize the transmission response T_Khronos = slab ./ baseline - - # Compute the analytic response for comparison - T = slab_analytical(d, n, λ) - - # Plot the comparison - f = Figure() - ax = Axis(f[1, 1], xlabel = "λ (μm)", ylabel = "T (a.u.)") - line1 = scatterlines!(ax, λ, Array(T)) - line2 = scatterlines!(ax, λ, Array(T_Khronos)) - legend = Legend(f[1, 2], [line1, line2], ["Analytic", "Khronos"]) - save("periodic_slab.png", f) + return T_Khronos end -main() +# --- Run 1: includes JIT compilation --- +println("=" ^ 60) +println("RUN 1 (cold — includes JIT compilation)") +println("=" ^ 60) +t1 = time() +T_Khronos = run_periodic_slab(d, n, λ, resolution) +println("Run 1 total wall time: $(round(time() - t1, digits=3))s\n") + +# --- Run 2: fresh sim, JIT already done --- +println("=" ^ 60) +println("RUN 2 (warm — JIT already compiled)") +println("=" ^ 60) +t2 = time() +T_Khronos = run_periodic_slab(d, n, λ, resolution) +println("Run 2 total wall time: $(round(time() - t2, digits=3))s\n") + +# Compute the analytic response for comparison +T = slab_analytical(d, n, λ) + +# Plot the comparison +f = Figure() +ax = Axis(f[1, 1], xlabel = "λ (μm)", ylabel = "T (a.u.)") +line1 = scatterlines!(ax, λ, Array(T)) +line2 = scatterlines!(ax, λ, 
Array(T_Khronos)) +legend = Legend(f[1, 2], [line1, line2], ["Analytic", "Khronos"]) +save("periodic_slab.png", f) diff --git a/examples/planewave.jl b/examples/planewave.jl index c5f96e5..9c5f57f 100644 --- a/examples/planewave.jl +++ b/examples/planewave.jl @@ -48,11 +48,30 @@ function build_simulation(λ, resolution) return sim end -λ = 1.0 -resolution = 40.0 +function run_planewave() + λ = 1.0 + resolution = 40.0 + sim = build_simulation(λ, resolution) + Khronos.run(sim, until = 60) + return sim +end + +# --- Run 1: includes JIT compilation --- +println("=" ^ 60) +println("RUN 1 (cold — includes JIT compilation)") +println("=" ^ 60) +t1 = time() +sim = run_planewave() +println("Run 1 total wall time: $(round(time() - t1, digits=3))s\n") + +# --- Run 2: fresh sim, JIT already done --- +println("=" ^ 60) +println("RUN 2 (warm — JIT already compiled)") +println("=" ^ 60) +t2 = time() +sim = run_planewave() +println("Run 2 total wall time: $(round(time() - t2, digits=3))s\n") -sim = build_simulation(λ, resolution) -Khronos.run(sim, until = 60) scene = Khronos.plot2D( sim, Khronos.Ex(), diff --git a/examples/sphere.jl b/examples/sphere.jl index fd6b19a..65f8610 100644 --- a/examples/sphere.jl +++ b/examples/sphere.jl @@ -48,11 +48,28 @@ function build_sphere_sim(resolution, radius; include_loss = false) return sim, s_xyz end -sim, s_xyz = build_sphere_sim(20.0, 1.0; include_loss = true) +function run_sphere() + sim, s_xyz = build_sphere_sim(20.0, 1.0; include_loss = true) + t_end = 10.0; + Khronos.run(sim, until = t_end) + return sim, s_xyz +end + +# --- Run 1: includes JIT compilation --- +println("=" ^ 60) +println("RUN 1 (cold — includes JIT compilation)") +println("=" ^ 60) +t1 = time() +sim, s_xyz = run_sphere() +println("Run 1 total wall time: $(round(time() - t1, digits=3))s\n") -# Run the simulation until steady state -t_end = 10.0; -Khronos.run(sim, until = t_end) +# --- Run 2: fresh sim, JIT already done --- +println("=" ^ 60) +println("RUN 2 (warm — JIT 
already compiled)") +println("=" ^ 60) +t2 = time() +sim, s_xyz = run_sphere() +println("Run 2 total wall time: $(round(time() - t2, digits=3))s\n") # Visualize the fields scene = Khronos.plot2D( diff --git a/examples/waveguide.jl b/examples/waveguide.jl index 9dd6a13..ddc5d47 100644 --- a/examples/waveguide.jl +++ b/examples/waveguide.jl @@ -46,22 +46,29 @@ function build_waveguide_simulation(λ::Number) return sim end -λ = 1.55 -sim = build_waveguide_simulation(λ) - -# scene = Khronos.plot2D( -# sim, -# nothing, -# Khronos.Volume([0.0, 0.0, 0.0], [6.0, 2.0, 2.0]); -# plot_geometry=true, -# ) - +function run_waveguide() + λ = 1.55 + sim = build_waveguide_simulation(λ) + t_end = 40.0; + Khronos.run(sim, until = t_end) + return sim +end -# scene = Khronos.plot_source(sim, sim.sources[1]) -# save("waveguide_source.png", scene) +# --- Run 1: includes JIT compilation --- +println("=" ^ 60) +println("RUN 1 (cold — includes JIT compilation)") +println("=" ^ 60) +t1 = time() +sim = run_waveguide() +println("Run 1 total wall time: $(round(time() - t1, digits=3))s\n") -t_end = 40.0; -Khronos.run(sim, until = t_end) +# --- Run 2: fresh sim, JIT already done --- +println("=" ^ 60) +println("RUN 2 (warm — JIT already compiled)") +println("=" ^ 60) +t2 = time() +sim = run_waveguide() +println("Run 2 total wall time: $(round(time() - t2, digits=3))s\n") scene = Khronos.plot2D( sim, diff --git a/src/Simulation.jl b/src/Simulation.jl index 713e0a6..c32340f 100644 --- a/src/Simulation.jl +++ b/src/Simulation.jl @@ -65,25 +65,41 @@ function prepare_simulation!(sim::SimulationData) return end - @info("Preparing simulation object...") + num_voxels = sim.Nx * sim.Ny * sim.Nz + @info("Preparing simulation object ($(sim.Nx)×$(sim.Ny)×$(sim.Nz) = $(num_voxels) voxels)...") + + t_start = time() # prepare geometry + t0 = time() init_geometry(sim, sim.geometry) + t1 = time() + @info(" init_geometry: $(round(t1-t0, digits=3))s") # prepare boundaries init_boundaries(sim, sim.boundaries) + t2 = 
time() + @info(" init_boundaries: $(round(t2-t1, digits=3))s") # prepare sources add_sources(sim, sim.sources) + t3 = time() + @info(" add_sources: $(round(t3-t2, digits=3))s") # determine dimensionality get_sim_dims(sim) # prepare fields init_fields(sim, sim.dimensionality) + t4 = time() + @info(" init_fields: $(round(t4-t3, digits=3))s") # prepare time and dft monitors init_monitors(sim, sim.monitors) + t5 = time() + @info(" init_monitors: $(round(t5-t4, digits=3))s") + + @info(" Total prepare: $(round(t5-t_start, digits=3))s") sim.is_prepared = true @@ -119,7 +135,13 @@ function run( prepare_simulation!(sim) end + num_voxels = sim.Nx * sim.Ny * sim.Nz + step_start = sim.timestep @info("Starting simulation...") + t_run_start = time() + warmup_steps = 50 + t_after_warmup = nothing + steps_at_warmup = 0 # prepare the step functions wait_for_sources = false if until isa Function @@ -141,6 +163,10 @@ function run( time_for_sources = last_source_time(sim) while round_time(sim) <= time_for_sources step!(sim) + if isnothing(t_after_warmup) && (sim.timestep - step_start) == warmup_steps + t_after_warmup = time() + steps_at_warmup = sim.timestep - step_start + end end @info("All sources have terminated...") end @@ -149,6 +175,27 @@ function run( # break out of the loop. 
while !(_run_func(sim)) step!(sim) + if isnothing(t_after_warmup) && (sim.timestep - step_start) == warmup_steps + t_after_warmup = time() + steps_at_warmup = sim.timestep - step_start + end + end + + t_run_end = time() + total_steps = sim.timestep - step_start + elapsed = t_run_end - t_run_start + if total_steps > 0 && elapsed > 0 + mcells_per_s = num_voxels * total_steps / elapsed / 1e6 + if !isnothing(t_after_warmup) && total_steps > steps_at_warmup + steady_steps = total_steps - steps_at_warmup + steady_elapsed = t_run_end - t_after_warmup + steady_rate = num_voxels * steady_steps / steady_elapsed / 1e6 + @info("Simulation complete: $(total_steps) steps in $(round(elapsed, digits=3))s " * + "($(round(mcells_per_s, digits=1)) MVoxels/s overall, " * + "$(round(steady_rate, digits=1)) MVoxels/s after warmup)") + else + @info("Simulation complete: $(total_steps) steps in $(round(elapsed, digits=3))s ($(round(mcells_per_s, digits=1)) MVoxels/s)") + end end return From 5787dcf1d4eabc3777b52578ff65aebe2851c6f2 Mon Sep 17 00:00:00 2001 From: Alec Hammond Date: Sat, 7 Feb 2026 21:19:59 -0800 Subject: [PATCH 2/2] Add feature roadmap and example coverage plan Two new documents in docs/: - ROADMAP.md: Three-layer architecture (Frontend, Graph Compiler, Engine Backend) organizing 32+ feature gaps identified by comparing Khronos against Meep, Tidy3D, and fdtdx. Each feature specifies priority tier (P0-P3), implementation guidance, code references, and scope estimates. - EXAMPLES.md: 49 proposed examples mapped against 33 Meep examples, 199 Tidy3D notebooks, and 5 fdtdx examples. Includes 13 scaling and performance benchmarks (9 buildable now), cross-solver coverage matrix, and phased implementation plan aligned with the roadmap. 
--- docs/EXAMPLES.md | 1293 ++++++++++++++++++++++++++++++++++++++++ docs/ROADMAP.md | 1463 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 2756 insertions(+) create mode 100644 docs/EXAMPLES.md create mode 100644 docs/ROADMAP.md diff --git a/docs/EXAMPLES.md b/docs/EXAMPLES.md new file mode 100644 index 0000000..f89ebd1 --- /dev/null +++ b/docs/EXAMPLES.md @@ -0,0 +1,1293 @@ +# Khronos.jl Example Coverage Plan + +This document catalogs every example and tutorial across the four reference +FDTD solvers (Khronos.jl, Meep, Tidy3D, fdtdx), identifies coverage gaps, +and specifies the examples Khronos should implement to maximize testing of +feature parity, correctness, performance, and coverage. + +--- + +## Current Khronos.jl Examples + +Khronos currently has **6 examples**, **3 benchmarks**, and **8 test files**. + +| # | File | Physics | Dim | Sources | Monitors | Materials | Validation | +|---|------|---------|-----|---------|----------|-----------|------------| +| 1 | `examples/dipole.jl` | Point dipole radiation | 3D | UniformSource (CW) | None | Vacuum | Visual only | +| 2 | `examples/planewave.jl` | Planewave propagation | 3D | PlaneWaveSource (CW) | None | Vacuum | Visual only | +| 3 | `examples/sphere.jl` | Mie scattering | 3D | PlaneWaveSource (CW) | None | Lossy dielectric (Ball) | Visual only | +| 4 | `examples/waveguide.jl` | Waveguide mode excitation | 3D | ModeSource | None | Si + SiO₂ (Cuboid) | Visual only | +| 5 | `examples/gaussian_beam.jl` | Oblique Gaussian beam | 3D | GaussianBeamSource (pulse) | DFTMonitor | Vacuum | Visual only | +| 6 | `examples/periodic_slab.jl` | Fabry-Perot transmission | 3D | PlaneWaveSource (pulse) | DFTMonitor (100 freqs) | Dielectric slab | **Analytic comparison** | + +**Key gaps in current examples:** +- All examples are 3D (no 2D examples despite `TwoD_TE`/`TwoD_TM` support) +- Only 2 of 6 use monitors +- Only 1 has quantitative validation (periodic_slab vs. 
Fabry-Perot theory) +- No examples use: periodic BCs, Bloch BCs, PEC/PMC, flux monitors, mode + monitors, near-to-far, dispersive materials, nonlinear materials, + symmetry, multi-GPU, cylindrical coordinates, or adjoint optimization +- No performance benchmarks with cross-solver comparison +- No 2-run normalization workflow (incident + scattered) + +--- + +## Reference Solver Example Inventory + +### Meep — 33 Examples + +| Category | Count | Examples | +|----------|-------|---------| +| Waveguides / Basics | 3 | straight-waveguide, bent-waveguide, bend-flux | +| Resonators / Cavities | 5 | ring, holey-wvg-cavity, holey-wvg-bands, metal-cavity-ldos, solve-cw | +| Gratings / Diffraction | 6 | binary_grating, binary_grating_n2f, binary_grating_oblique, binary_grating_phasemap, polarization_grating, finite_grating | +| Scattering | 3 | mie_scattering, differential_cross_section, cylinder_cross_section | +| Reflectance | 3 | refl-angular, refl-quartz, refl_angular_bfast | +| Sources / Planewaves | 2 | oblique-planewave, oblique-source | +| Mode Decomposition | 1 | mode-decomposition (taper) | +| Near-to-Far / Radiation | 4 | antenna-radiation, cavity-farfield, metasurface_lens, zone_plate | +| Nonlinear Optics | 1 | 3rd-harm-1d | +| Optical Forces | 1 | parallel-wvgs-force | +| Perturbation Theory | 1 | perturbation_theory | +| Absorbed Power | 1 | absorbed_power_density | +| Stochastic / LED | 1 | stochastic_emitter | +| GDSII / Circuits | 1 | coupler | + +### Tidy3D — 199 Examples (key categories) + +| Category | Count | Notable Examples | +|----------|-------|-----------------| +| Core Tutorials | 21 | StartHere, BoundaryConditions, TFSF, Symmetry, AutoGrid | +| Materials | 9 | Dispersion, Fitting, FullyAnisotropic, Gyrotropic, TimeModulation | +| Geometry / Import | 8 | GDSImport, STLImport, TriangleMesh, PolySlab | +| Mode Solving | 8 | ModeSolver, ModeOverlap, BatchModeSolver | +| Waveguide Components | 21 | RingResonator, DirectionalCoupler, YJunction, MMI1x4, 
EdgeCoupler | +| Gratings | 5 | GratingCoupler, GratingEfficiency, BraggGratings | +| Photonic Crystals | 8 | Bandstructure, NanobeamCavity, OptimizedL3 | +| Metasurfaces | 11 | Metalens, VortexMetasurface, DielectricMetasurfaceAbsorber | +| Plasmonics | 7 | PlasmonicNanoparticle, YagiUdaNanoantenna | +| Scattering / Far-Field | 6 | Near2FarSphereRCS, PECSphereRCS, MultipoleExpansion | +| Resonators | 5 | CavityFOM, ResonanceFinder, HighQSi | +| Nonlinear | 2 | KerrSidebands, SiWaveguideTPA | +| RF / Microwave | 21 | PatchAntenna, LumpedElements, CoupledLineBandpassFilter | +| Inverse Design (autograd) | 26 | Autograd series (0–26), covering bends, metalenses, WDM, etc. | +| Classical Optimization | 8 | CMA-ES, Bayesian, Genetic Algorithm, PSO, DBS | +| Multi-Physics | 7 | HeatSolver, ChargeSolver, ThermallyTunedRingResonator | + +### fdtdx — 5 Examples + +| # | File | Physics | Key Feature | +|---|------|---------|-------------| +| 1 | `optimize_ceviche_corner.py` | Waveguide bend inverse design | Full differentiable optimization loop | +| 2 | `simulate_gaussian_source.py` | Gaussian beam + time reversal | Forward + backward FDTD verification | +| 3 | `simulate_gaussian_source_anisotropic.py` | Anisotropic slab interaction | Per-axis permittivity/permeability | +| 4 | `width_sweep_analysis.py` | Waveguide loss parametric sweep | Mode solver vs. FDTD validation | +| 5 | `check_waveguide_modes.py` | SOI waveguide mode check | Mode injection verification | + +--- + +## Proposed Khronos Examples + +Examples are organized by category. Each entry specifies: +- **Priority** (P0–P3) aligned with the roadmap +- **Validates**: what correctness/feature claim the example tests +- **Requires**: roadmap features needed before this example can work +- **Reference**: which solver examples it mirrors +- **Status**: `CAN BUILD NOW` vs. `BLOCKED ON §X.Y` + +### Category 1: Foundational Validation + +These examples validate core FDTD correctness against analytical solutions. 
+They form the regression test suite backbone. + +--- + +#### E1. Dielectric Slab Transmission (Fabry-Perot) ✓ EXISTS + +**File:** `examples/periodic_slab.jl` +**Priority:** P0 +**Validates:** Broadband pulse propagation, DFT monitors, power extraction, +two-run normalization workflow +**Reference:** Meep `bend-flux` (normalization pattern), Tidy3D `StartHere` +**Status:** `EXISTS` — the only quantitatively validated example today. + +**Improvements needed:** +- Add 2D version (`TwoD_TM`) to validate 2D code paths +- Add explicit flux monitor once §1.3 + §3.3 are implemented +- Add error metric (max |T_khronos - T_analytic|) printed to stdout + +--- + +#### E2. Fresnel Reflectance (Normal Incidence) + +**Priority:** P0 +**Validates:** Material boundaries, field continuity at interfaces, DFT +monitor accuracy +**Reference:** Meep `refl-quartz` (normal-incidence reflectance vs. Fresnel) +**Computes:** R(λ) for a planar air → n=3.5 dielectric interface; compare +to Fresnel formula R = |(n-1)/(n+1)|² + +**Setup:** +- 1D-equivalent (thin slab in 3D with tiny transverse extent) +- GaussianPulseSource, broadband (0.4–1.0 μm range) +- Two DFT flux monitors: reflected (behind source) and transmitted +- Two-run normalization (empty → with slab) + +**Status:** `CAN BUILD NOW` +**New features exercised:** Two-run normalization workflow, reflected flux + +--- + +#### E3. 
Angular Fresnel Reflectance (Oblique Incidence) + +**Priority:** P0 +**Validates:** Bloch-periodic boundaries, oblique plane wave injection, +angle-dependent reflectance +**Reference:** Meep `refl-angular` (Brewster's angle validation) +**Computes:** R(λ, θ) for P-polarized light at an air → n=3.5 interface; +identify Brewster's angle at arctan(3.5) ≈ 74° + +**Setup:** +- Thin cell in y-z, Bloch-periodic in y +- PlaneWaveSource at angle θ via k_point +- DFT flux monitors for reflected/transmitted power +- Sweep over θ = 0°, 10°, 20°, ..., 80° + +**Status:** `BLOCKED ON §1.1 + §3.1` (periodic/Bloch BCs) +**Validates roadmap:** §1.1 Boundary Spec, §3.1 Bloch Kernels + +--- + +#### E4. Mie Scattering Cross Section + +**Priority:** P0 +**Validates:** Spherical geometry (Ball), flux monitors on a closed +surface, scattering cross-section extraction, two-run normalization +**Reference:** Meep `mie_scattering`, Tidy3D `Near2FarSphereRCS` +**Computes:** σ_scat(λ) / πr² for a dielectric sphere (n=2, r=1μm); +compare to analytical Mie series (PyMieScatt or Julia equivalent) + +**Setup:** +- 3D, PML on all faces +- PlaneWaveSource (GaussianPulse, broadband) +- 6 DFT flux monitors forming a closed box around sphere +- Normalization run without sphere +- Compare to Mie series solution + +**Status:** `BLOCKED ON §1.3 + §3.3` (flux monitors) +**Validates roadmap:** §1.3 FluxMonitor, §3.3 Flux Kernels + +--- + +#### E5. 2D Waveguide Bend Transmission + +**Priority:** P0 +**Validates:** 2D simulation (`TwoD_TM`), curved geometry, broadband +transmission, flux normalization +**Reference:** Meep `bent-waveguide` / `bend-flux` +**Computes:** T(λ) through a 90° dielectric waveguide bend (ε=12); +compare straight-waveguide normalization vs. 
bend + +**Setup:** +- 2D TM, PML on all faces +- Block geometry: straight waveguide + 90° bend section +- GaussianPulseSource +- DFT flux monitors at input and output ports + +**Status:** `BLOCKED ON §1.3 + §3.3` (flux monitors) +**Validates roadmap:** 2D code paths, §1.3 FluxMonitor + +--- + +### Category 2: Resonators and Spectral Analysis + +--- + +#### E6. Ring Resonator (Q-factor Extraction) + +**Priority:** P1 +**Validates:** Curved geometry, resonance detection, quality factor +computation, convergence-based run termination +**Reference:** Meep `ring`, Tidy3D `RingResonator` +**Computes:** Resonant frequencies ωₙ and quality factors Qₙ of a +dielectric ring resonator + +**Setup:** +- 2D, PML, Cylinder geometry (ring = outer cylinder - inner cylinder) +- GaussianPulseSource (broadband, inside ring) +- Time monitor at a point inside the ring +- Post-process: FFT or Harminv-style analysis to extract ω and Q + +**Status:** `BLOCKED ON §3.12` (time monitor kernel) + resonance extraction +**Validates roadmap:** §3.12 TimeMonitor, §3.19 Harminv (optional) + +--- + +#### E7. Photonic Crystal Waveguide Bands + +**Priority:** P1 +**Validates:** Periodic boundaries, Bloch k-sweep, band structure +extraction +**Reference:** Meep `holey-wvg-bands` +**Computes:** ω(k) band diagram of a 1D photonic crystal waveguide +(periodic holes in a dielectric strip) + +**Setup:** +- 2D, periodic in x (propagation), PML in y +- Sweep k_x over the irreducible Brillouin zone +- At each k, run broadband pulse and extract resonances +- Plot ω vs. k band diagram + +**Status:** `BLOCKED ON §1.1 + §3.1` (periodic/Bloch BCs) +**Validates roadmap:** §1.1 Boundary Spec, §3.1 Periodic Kernels + +--- + +#### E8. 
Photonic Crystal Cavity Q + +**Priority:** P2 +**Validates:** Defect cavity in a periodic lattice, high-Q resonance +extraction +**Reference:** Meep `holey-wvg-cavity`, Tidy3D `NanobeamCavity`/`OptimizedL3` +**Computes:** Resonant frequency and Q-factor of a defect cavity in a 1D +photonic crystal waveguide; Q vs. number of mirror periods + +**Status:** `BLOCKED ON §1.1 + §3.1` (periodic BCs) + §3.12 (time monitor) + +--- + +### Category 3: Waveguide Mode Analysis + +--- + +#### E9. Mode Source Validation (Existing Waveguide Example Enhancement) + +**Priority:** P0 +**Validates:** ModeSource accuracy, mode profile fidelity, guided power +**Reference:** fdtdx `check_waveguide_modes`, `width_sweep_analysis` +**Computes:** Inject fundamental mode into Si waveguide; measure mode +overlap at output to verify >99% coupling efficiency + +**Setup:** +- Enhance existing `waveguide.jl` with: + - DFT flux monitor at input and output planes + - Mode overlap computation (once §1.4 + §3.4 are implemented) + - Print coupling efficiency and insertion loss (dB) + +**Status:** Partial `CAN BUILD NOW` (visual check); full validation +`BLOCKED ON §1.3 + §3.3` (flux) and §1.4 + §3.4 (mode overlap) + +--- + +#### E10. Waveguide Taper S-Parameters + +**Priority:** P1 +**Validates:** Mode decomposition, S-parameter extraction, geometry +(tapered prism or interpolated cuboids) +**Reference:** Meep `mode-decomposition` (linear taper reflectance) +**Computes:** S₁₁ (reflectance) and S₂₁ (transmittance) of fundamental +mode through a linear waveguide taper; verify R scales as 1/L² + +**Setup:** +- 2D or 3D, PML +- Waveguide width taper from w₁ to w₂ over length L +- ModeSource at input, mode monitors at input (reflected) and output +- Sweep taper length L + +**Status:** `BLOCKED ON §1.4 + §3.4` (mode decomposition) + +--- + +#### E11. 
Directional Coupler + +**Priority:** P2 +**Validates:** Evanescent coupling, two-waveguide geometry, power +splitting ratio +**Reference:** Meep `coupler`, Tidy3D `DirectionalCoupler` +**Computes:** Coupling ratio (bar/cross port power) vs. coupling length +for two parallel Si waveguides + +**Status:** `BLOCKED ON §1.3 + §3.3` (flux monitors) + +--- + +### Category 4: Gratings and Diffraction + +--- + +#### E12. Binary Grating Diffraction Orders + +**Priority:** P1 +**Validates:** Periodic boundaries, diffraction order decomposition, +subwavelength optics +**Reference:** Meep `binary_grating`, Tidy3D `GratingEfficiency` +**Computes:** Transmittance into diffraction orders (0th through ±5th) +for a periodic binary phase grating at normal incidence + +**Setup:** +- 2D, periodic in x, PML in y +- Block geometry: periodic grating ridges +- PlaneWaveSource (broadband) +- Flux monitors above and below grating + +**Status:** `BLOCKED ON §1.1 + §3.1` (periodic BCs) + §1.9 + §3.16 +(diffraction monitor) + +--- + +#### E13. Grating Coupler Efficiency + +**Priority:** P2 +**Validates:** 3D grating simulation, fiber coupling, angle-dependent +efficiency +**Reference:** Tidy3D `GratingCoupler`, `FocusedApodGC` +**Computes:** Coupling efficiency from free-space Gaussian beam to +waveguide mode via grating coupler + +**Status:** `BLOCKED ON §1.1` (periodic BCs) + §1.4 (mode monitor) + +--- + +### Category 5: Scattering and Far-Field + +--- + +#### E14. 
Antenna Radiation Pattern (2D Dipole Far-Field) + +**Priority:** P1 +**Validates:** Near-to-far field transformation, radiation pattern +computation +**Reference:** Meep `antenna-radiation`, Tidy3D `AntennaCharacteristics` +**Computes:** Far-field radiation pattern E(θ) of a point dipole in +vacuum; verify isotropic pattern and total radiated power via far-field +flux integration + +**Setup:** +- 2D, PML +- Point source (CW or pulse) +- Near-field DFT surface (box around source) +- Near-to-far transformation to observation circle at r >> λ +- Compare total far-field power to near-field flux (Poynting theorem check) + +**Status:** `BLOCKED ON §1.5 + §3.10` (near-to-far field) + +--- + +#### E15. Differential Scattering Cross Section + +**Priority:** P2 +**Validates:** 3D scattering, near-to-far field, angular distribution +**Reference:** Meep `differential_cross_section`, Tidy3D `Near2FarSphereRCS` +**Computes:** dσ/dΩ(θ) for a dielectric sphere; integrate to get total σ +and compare to Mie theory + +**Status:** `BLOCKED ON §1.5 + §3.10` (near-to-far) + +--- + +### Category 6: Dispersive and Advanced Materials + +--- + +#### E16. Dispersive Slab Reflectance (Lorentzian Material) + +**Priority:** P0 +**Validates:** Dispersive material implementation (ADE), frequency-dependent +reflectance +**Reference:** Meep `refl-quartz` (Sellmeier/Lorentzian fused quartz) +**Computes:** R(λ) for a slab of Lorentzian material; compare to +analytic Fresnel with ε(ω) = ε_∞ + σ/(ω₀² - ω² - iγω) + +**Setup:** +- 1D-equivalent geometry +- Material with one Lorentzian pole +- Broadband GaussianPulseSource +- DFT flux monitors for R and T +- Analytic comparison using Fresnel + Lorentzian ε(ω) + +**Status:** `BLOCKED ON §1.2 + §3.2` (dispersive materials) + §1.3 + §3.3 +(flux monitors) +**Validates roadmap:** §1.2 LorentzianPole, §3.2 ADE Kernels + +--- + +#### E17. 
Drude Metal Reflectance + +**Priority:** P1 +**Validates:** Drude pole implementation, metallic response, plasma +frequency behavior +**Reference:** Meep material library metals, Tidy3D `Dispersion` +**Computes:** R(λ) for a thick Drude metal slab (e.g., simplified gold); +verify R ≈ 1 below plasma frequency, transition to transparent above + +**Status:** `BLOCKED ON §1.2 + §3.2` (dispersive materials) + +--- + +#### E18. Nonlinear Third-Harmonic Generation + +**Priority:** P2 +**Validates:** χ³ nonlinear material, third-harmonic spectral peak, +power scaling +**Reference:** Meep `3rd-harm-1d`, Tidy3D `KerrSidebands` +**Computes:** Transmitted power spectrum showing 3ω peak; verify P(3ω) ∝ +(χ³)² in the weak-nonlinearity regime + +**Status:** `BLOCKED ON §1.6 + §3.13` (nonlinear materials) + +--- + +#### E19. Anisotropic Material Slab + +**Priority:** P2 +**Validates:** Per-axis permittivity, polarization-dependent transmission +**Reference:** fdtdx `simulate_gaussian_source_anisotropic` +**Computes:** Transmission through a slab with ε = (2, 1, 2); verify +polarization-dependent wavelength shortening + +**Setup:** +- 3D, PML +- Cuboid slab with `Material(epsilonx=2, epsilony=1, epsilonz=2)` +- PlaneWaveSource (two runs: x-polarized and y-polarized) +- DFT monitors to compare T for each polarization + +**Status:** `CAN BUILD NOW` (anisotropic ε already supported) — needs +flux monitors for quantitative validation, otherwise visual check works + +--- + +### Category 7: Boundary Conditions + +--- + +#### E20. PEC and PMC Cavity Modes + +**Priority:** P2 +**Validates:** PEC and PMC boundary conditions, cavity resonance formula +**Reference:** Tidy3D `BoundaryConditions` +**Computes:** Resonant frequencies of a rectangular PEC cavity; compare +to analytical f_mnp = c/(2π) √((mπ/a)² + (nπ/b)² + (pπ/d)²) + +**Status:** `BLOCKED ON §1.1 + §3.9` (PEC/PMC boundaries) + +--- + +#### E21. 
Periodic Boundary Verification + +**Priority:** P0 +**Validates:** Periodic BC correctness, field continuity across boundary +**Reference:** Meep `oblique-planewave`, Tidy3D `BoundaryConditions` +**Computes:** Plane wave propagation in periodic cell; verify field values +at opposite boundaries are identical (periodic) or phase-shifted (Bloch) + +**Status:** `BLOCKED ON §1.1 + §3.1` (periodic BCs) + +--- + +### Category 8: Symmetry and Performance + +--- + +#### E22. Mirror Symmetry Acceleration + +**Priority:** P2 +**Validates:** Symmetry exploitation, memory/time reduction, result +equivalence +**Reference:** Meep `bend-flux` (mirror symmetry), Tidy3D `Symmetry` +**Computes:** Run waveguide bend with and without mirror symmetry; verify +identical flux results with 2× memory reduction + +**Status:** `BLOCKED ON §1.7 + §2.4` (symmetry) + +--- + +#### E23. Non-Uniform Grid Convergence + +**Priority:** P1 +**Validates:** Non-uniform grid accuracy, convergence rate, memory savings +**Reference:** Tidy3D `AutoGrid` / `CustomGrid` +**Computes:** Slab transmission with uniform vs. non-uniform grid; verify +identical results with fewer voxels; measure memory reduction + +**Status:** `BLOCKED ON §2.5 + §3.6` (non-uniform grid) + +--- + +### Category 9: Source Types + +--- + +#### E24. Gaussian Beam Waist Verification ✓ PARTIAL + +**File:** `examples/gaussian_beam.jl` (exists but needs quantitative check) +**Priority:** P1 +**Validates:** GaussianBeamSource profile accuracy, beam waist, divergence +**Reference:** Tidy3D `AdvancedGaussianSources` +**Computes:** Measure beam waist w(z) at several planes along propagation; +compare to analytical Gaussian beam formula +w(z) = w₀ √(1 + (z/z_R)²) + +**Improvements to existing example:** +- Add multiple DFT monitors at different z positions +- Fit Gaussian to transverse field profile at each z +- Compare extracted w(z) to theory + +**Status:** Enhancement of existing example. `CAN BUILD NOW` (partial) + +--- + +#### E25. 
CW Source Steady-State Verification + +**Priority:** P1 +**Validates:** ContinuousWaveSource reaches true steady state, field +amplitude matches expected value +**Reference:** Meep `solve-cw` +**Computes:** CW point source in vacuum; verify E-field amplitude at +distance r matches analytical Green's function |E| ∝ 1/r (3D) or +1/√r (2D) + +**Status:** `CAN BUILD NOW` + +--- + +### Category 10: Monitors and Post-Processing + +--- + +#### E26. Time Monitor Validation + +**Priority:** P2 +**Validates:** Time monitor kernel, temporal field recording, FFT +consistency with DFT monitor +**Reference:** General FDTD practice +**Computes:** Record E(t) at a point using TimeMonitor; take FFT and +compare to DFTMonitor at same location — spectra should match + +**Status:** `BLOCKED ON §3.12` (time monitor kernel) + +--- + +#### E27. LDOS (Local Density of States) + +**Priority:** P2 +**Validates:** LDOS computation, Purcell enhancement near structures +**Reference:** Meep `metal-cavity-ldos` +**Computes:** LDOS at the center of a 2D metal cavity; compare to +analytical Purcell factor Q/(4π²V_eff) + +**Status:** `BLOCKED ON §1.10 + §3.17` (LDOS) + §1.2 + §3.2 (dispersive +materials for metal) + +--- + +### Category 11: Scaling, Speed, and Performance + +This category is critical for demonstrating Khronos's primary value +proposition: GPU-accelerated FDTD that scales. The benchmarks here should +be run regularly and results tracked over time. + +--- + +#### E28. Single-GPU Throughput vs. Problem Size + +**Priority:** P0 +**Validates:** GPU kernel efficiency, memory bandwidth utilization, and +the regime where GPU acceleration pays off vs. 
CPU +**Computes:** Cells/second as a function of grid size N³, from N=32 +(tiny, launch-overhead-dominated) to N=2048 (memory-limited) + +**Setup:** +- 3D, PML, point dipole in vacuum (minimal physics — isolates raw + FDTD kernel throughput) +- Sweep grid sizes: 32³, 64³, 128³, 256³, 512³, 768³, 1024³, 1536³, 2048³ +- Run 100 timesteps (warm, after JIT compilation) +- Report per size: cells/second, wall time per step, GPU memory used, + GPU occupancy (if available) +- Plot: cells/s vs. N³ (should plateau at memory bandwidth limit) + +**Expected behavior:** +- Small grids (< 64³): GPU underutilized, CPU may be faster +- Medium grids (128³–512³): GPU ramps up, 10-100× over CPU +- Large grids (≥ 1024³): Memory-bandwidth-limited plateau, peak cells/s + +**Status:** `CAN BUILD NOW` — extends existing `benchmark/dipole.jl` +which already sweeps resolutions but does not report cells/s vs. N³ + +--- + +#### E29. Single-GPU Throughput vs. Physics Complexity + +**Priority:** P0 +**Validates:** Kernel specialization overhead — how much does adding +physics features (PML, materials, monitors, dispersive poles) cost? +**Computes:** Cells/second at fixed grid size (512³) with progressively +more physics enabled + +**Setup — incremental physics layers:** + +| Config | Physics | Expected Overhead | +|--------|---------|-------------------| +| A | Vacuum, no PML, no monitors | Baseline (fastest) | +| B | Vacuum + PML | +10-20% (ψ field updates) | +| C | Dielectric geometry (2 materials) + PML | +5% (ε lookup) | +| D | Multi-material (6 layers) + PML | +5% (same kernel) | +| E | Config C + DFT monitor (10 freqs) | +5-10% | +| F | Config C + dispersive material (1 pole) | +30-50% (ADE fields) | +| G | Config C + dispersive material (3 poles) | +50-80% | +| H | Config C + nonlinear χ³ | +10-20% | +| I | Config C + non-uniform grid (vector Δx) | +5-10% | + +**Report:** Bar chart of cells/s for each config. 
This directly +demonstrates the value of Khronos's `Nothing`-dispatch specialization: +configs without a feature should have zero overhead for that feature. + +**Status:** Partially `CAN BUILD NOW` (configs A–E); configs F–I +`BLOCKED` on respective roadmap features + +--- + +#### E30. Backend Comparison (CUDA vs. ROCm vs. Metal vs. CPU) + +**Priority:** P0 +**Validates:** KernelAbstractions.jl multi-backend portability and +relative performance across hardware +**Computes:** Cells/second for identical problems on each backend + +**Setup:** +- Fixed problem: 3D vacuum dipole, 512³ grid, PML, 100 timesteps +- Run on each available backend: `CUDABackend()`, `ROCBackend()`, + `MetalBackend()`, `CPU()` +- Also test with materials (sphere scattering) and monitors + +**Report per backend:** + +``` +Backend Device cells/s relative +───────────────────────────────────────────────────────────── +CUDA NVIDIA A100 1.5e9 1.00× +CUDA NVIDIA H100 2.1e9 1.40× +ROCm AMD MI250X 1.2e9 0.80× +Metal Apple M1 Pro 0.3e9 0.20× +CPU (threads=8) AMD EPYC 7763 0.05e9 0.03× +CPU (threads=1) AMD EPYC 7763 0.008e9 0.005× +``` + +**Status:** `CAN BUILD NOW` — existing `benchmark/*.yml` files already +have A100/H100/M1 baselines but don't report in a unified comparison + +--- + +#### E31. Precision Comparison (Float32 vs. Float64) + +**Priority:** P1 +**Validates:** Float32 delivers ~2× speedup and ~2× memory savings with +acceptable accuracy loss for most applications +**Computes:** Cells/second and accuracy for Float32 vs. Float64 on the +same problem + +**Setup:** +- Problem: dielectric slab transmission (periodic_slab example) +- Run both `backend_number = Float32` and `backend_number = Float64` +- Compare: (1) cells/s, (2) peak GPU memory, (3) transmission spectrum + accuracy vs. 
analytical Fabry-Perot + +**Expected results:** +- Float32: ~2× cells/s, ~2× less memory, max |T_err| < 0.01 +- Float64: baseline cells/s, baseline memory, max |T_err| < 0.001 + +**Status:** `CAN BUILD NOW` + +--- + +#### E32. JIT Compilation Overhead + +**Priority:** P1 +**Validates:** First-run JIT cost and warm-run performance, so users +know what to expect +**Computes:** Wall time breakdown: JIT compilation vs. actual simulation + +**Setup:** +- Problem: 3D waveguide with ModeSource (exercises the most complex + kernel path) +- Measure: (1) cold start (first `run` call), (2) warm start (second + `run` call with same types), (3) re-warm (different grid size, same + types — should be fast due to cached compilation) +- Report time spent in: Julia compilation, GPU kernel compilation, + mode solving, field initialization, actual timestep loop + +**Status:** `CAN BUILD NOW` — existing examples already print cold/warm +timing but don't break it down systematically + +--- + +#### E33. Memory Capacity Test (Maximum Problem Size) + +**Priority:** P1 +**Validates:** Maximum achievable problem size on a given GPU, memory +efficiency of field storage +**Computes:** Largest N³ grid that fits in GPU memory for each +configuration + +**Setup:** +- Binary search for max N on a given GPU (e.g., A100 80GB, H100 80GB) +- Test three configurations: + 1. Vacuum + PML (6 field arrays + ψ arrays): expect ~1800³ on 80GB + 2. With 2 materials + DFT monitor: expect ~1600³ + 3. With dispersive material (1 pole, adds 6 P arrays): expect ~1200³ +- Report: N³, total voxels, memory used, memory per voxel + +**Theoretical minimum memory per voxel:** +- 6 field components × 4 bytes (Float32) = 24 bytes/voxel +- With PML ψ: +12 auxiliary fields = +48 bytes in PML region +- With 1 dispersive pole: +6 P arrays = +24 bytes/voxel +- With DFT monitor: +2 complex × N_freq per monitor voxel + +**Status:** `CAN BUILD NOW` + +--- + +#### E34. 
Cross-Solver Performance Comparison + +**Priority:** P0 +**Validates:** Khronos's GPU performance advantage is real and quantified +**Computes:** Identical physical problems across Khronos, Meep, and fdtdx + +**Benchmark suite — three problems at three scales:** + +| Problem | Small | Medium | Large | +|---------|-------|--------|-------| +| Vacuum dipole | 128³ | 512³ | 1024³ | +| Sphere scattering | 128³ | 512³ | 1024³ | +| Slab + DFT monitor | 128³ | 512³ | 1024³ | + +**Per solver:** + +| Solver | Run Method | +|--------|------------| +| **Khronos** (CUDA, Float64) | `Khronos.run_benchmark(sim, 100)` | +| **Khronos** (CUDA, Float32) | Same, `Float32` | +| **Meep** (CPU, 1 thread) | `sim.run(until=...)` | +| **Meep** (CPU, 8 threads) | `sim.run(until=...)` with OMP | +| **fdtdx** (CUDA, Float32) | `run_fdtd(...)` | + +**Report:** + +``` +Problem: Sphere Scattering 512³ (100 timesteps) +───────────────────────────────────────────────── +Solver cells/s speedup vs Meep-1T +Khronos CUDA F32 2.1e9 420× +Khronos CUDA F64 1.2e9 240× +fdtdx CUDA F32 1.8e9 360× +Meep CPU 8T 4.0e7 8× +Meep CPU 1T 5.0e6 1× +``` + +**Setup scripts:** Provide matched simulation definitions in Julia +(Khronos), Python (Meep), and Python/JAX (fdtdx) that set up physically +identical problems with the same grid, materials, and timestep count. + +**Status:** `CAN BUILD NOW` for Khronos vs. Meep; fdtdx needs JAX env + +--- + +#### E35. 
Weak Scaling (Multi-GPU) + +**Priority:** P1 +**Validates:** Multi-GPU halo exchange overhead, near-linear scaling +**Reference:** Meep MPI scaling, fdtdx JAX sharding +**Computes:** Fixed problem size per GPU (512³ per device); measure +aggregate throughput for 1, 2, 4, 8 GPUs + +**Setup:** +- 3D, PML, dipole source in vacuum +- MPI-based domain decomposition along longest axis +- 100 timesteps per measurement + +**Report:** + +``` +GPUs Total grid cells/s efficiency +──────────────────────────────────────────────── +1 512³ 1.5e9 100% +2 512×512×1024 2.8e9 93% +4 512×512×2048 5.4e9 90% +8 512×512×4096 10.2e9 85% +``` + +**Key metric:** Parallel efficiency = (N-GPU throughput) / (N × 1-GPU +throughput). Target: >85% at 8 GPUs for single-axis decomposition. + +**Status:** `BLOCKED ON §2.1 + §3.7` (domain decomposition + halo exchange) + +--- + +#### E36. Strong Scaling (Multi-GPU) + +**Priority:** P2 +**Validates:** Multi-GPU speedup for a fixed large problem +**Computes:** Fixed total problem (2048³); divide across 1, 2, 4, 8 GPUs + +**Report:** + +``` +GPUs Partition wall time/step speedup +──────────────────────────────────────────────── +1 2048³ 680 ms 1.0× +2 2048×2048×1024 355 ms 1.9× +4 2048×2048×512 185 ms 3.7× +8 2048×2048×256 100 ms 6.8× +``` + +**Key metric:** Strong scaling efficiency. Communication-to-computation +ratio grows as slabs get thinner, so efficiency will degrade. Target: +>80% at 4 GPUs, >65% at 8 GPUs. + +**Status:** `BLOCKED ON §2.1 + §3.7` + +--- + +#### E37. Halo Exchange Overhead Profiling + +**Priority:** P2 +**Validates:** Communication is not the bottleneck; compute/communicate +overlap works +**Computes:** Breakdown of time per timestep into compute vs. 
+communication at various GPU counts and problem sizes + +**Setup:** +- Instrument halo exchange with timers +- Measure: (1) kernel execution time, (2) halo pack time, (3) MPI + send/recv time, (4) halo unpack time, (5) synchronization overhead +- Test with CUDA-aware MPI (direct GPU-GPU) vs. staged (GPU→CPU→GPU) + +**Report:** + +``` +2 GPUs, 512³ per GPU, CUDA-aware MPI: + Kernel: 8.2 ms (92%) + Halo pack: 0.1 ms (1%) + MPI comm: 0.5 ms (6%) + Halo unpack: 0.1 ms (1%) + Total: 8.9 ms + +2 GPUs, 512³ per GPU, Staged MPI: + Kernel: 8.2 ms (78%) + D2H copy: 0.8 ms (8%) + MPI comm: 0.7 ms (7%) + H2D copy: 0.8 ms (8%) + Total: 10.5 ms +``` + +**Status:** `BLOCKED ON §2.1 + §3.7` + +--- + +#### E38. Large-Scale Realistic Benchmarks + +**Priority:** P1 +**Validates:** Khronos handles real-world-sized problems, not just toy +examples. These are the benchmarks that matter for marketing and papers. + +**Three flagship problems:** + +**E38a. Metalens (large metasurface)** +- 3D: ~500 × 500 × 100 cells at resolution 30 → ~750M voxels +- Periodic unit cell with dielectric pillars on SiO₂ substrate +- PlaneWaveSource (broadband), DFT transmission monitor +- Tests: periodic BCs, multi-material geometry, large grid +- **Status:** `BLOCKED ON §1.1 + §3.1` (periodic BCs) + +**E38b. Photonic integrated circuit (multi-component)** +- 3D: ~2000 × 500 × 100 cells → ~100M voxels +- Si waveguides, bends, directional coupler on SiO₂ +- ModeSource input, DFT flux monitors at each port +- Tests: long propagation, multiple materials, many monitors +- **Status:** `BLOCKED ON §1.3 + §3.3` (flux monitors) + +**E38c. Full 3D Mie scattering (large sphere)** +- 3D: sphere radius = 10λ → ~600³ grid at 30 cells/λ → ~216M voxels +- PlaneWaveSource, 6-face flux box, PML +- Tests: large 3D problem, many timesteps, PML performance +- Compare to Mie theory at 50+ frequencies +- **Status:** `BLOCKED ON §1.3 + §3.3` (flux monitors) + +**E38d. 
Vacuum propagation at memory limit (single-GPU stress test)** +- 3D: largest grid that fits on target GPU (e.g., 1800³ on A100 80GB) +- Point source, PML, no monitors, 100 steps +- Tests: raw throughput at maximum memory utilization, no OOM +- **Status:** `CAN BUILD NOW` + +--- + +#### E39. Kernel Roofline Analysis + +**Priority:** P2 +**Validates:** Khronos kernels are memory-bandwidth-bound (expected for +FDTD) and operating near the hardware roofline +**Computes:** Arithmetic intensity and achieved bandwidth for each +kernel variant + +**Setup:** +- Use NVIDIA Nsight Compute (or `CUDA.@profile`) to measure per-kernel: + - DRAM bytes read/written + - FLOPs executed + - Achieved memory bandwidth (GB/s) + - Arithmetic intensity (FLOPs/byte) +- Compare to device peak bandwidth (e.g., A100 = 2039 GB/s HBM) +- Plot on roofline diagram + +**Expected results:** +- Vacuum curl kernel: ~0.5 FLOPs/byte (memory-bound), achieving + ~70-80% peak bandwidth +- PML curl kernel: ~0.8 FLOPs/byte (still memory-bound, more ops + per byte due to ψ updates) +- ADE kernel: ~1.0 FLOPs/byte (approaching compute-bound) + +**Status:** `CAN BUILD NOW` (needs CUDA profiling tools) + +--- + +#### E40. Non-Uniform Grid Performance and Accuracy + +**Priority:** P1 +**Validates:** Non-uniform grid delivers equivalent accuracy at lower +memory cost, with acceptable throughput overhead +**Computes:** Accuracy and performance comparison: uniform grid vs. +non-uniform grid for a problem with disparate feature scales + +**Setup:** +- Problem: thin metal film (10nm) on glass substrate, illuminated by + broadband planewave. Film requires fine grid (Δz = 1nm), substrate + needs coarse grid (Δz = 50nm). 
+- Uniform grid: Δ = 1nm everywhere → ~2000³ grid (~enormous) +- Non-uniform grid: Δz = 1nm near film, grading to 50nm away → ~200³ + equivalent +- Compare: (1) R(λ) spectra should match, (2) memory: 10-100× reduction, + (3) throughput: slight penalty from vector Δx reads + +**Status:** `BLOCKED ON §2.5 + §3.6` (non-uniform grid) + +--- + +### Category 12: Inverse Design (Future) + +--- + +#### E41. Waveguide Bend Topology Optimization + +**Priority:** P2 +**Validates:** Adjoint solver, Enzyme.jl AD through FDTD kernels, +convergence to known-good design +**Reference:** fdtdx `optimize_ceviche_corner`, Tidy3D `Autograd8WaveguideBend` +**Computes:** Optimize permittivity distribution in a 2D design region to +minimize insertion loss of a 90° waveguide bend + +**Status:** `BLOCKED ON §2.7` (adjoint compilation) + +--- + +#### E42. Metalens Phase Map Optimization + +**Priority:** P3 +**Validates:** Large-scale inverse design, metasurface parameterization +**Reference:** Tidy3D `Autograd7Metalens`, Meep `metasurface_lens` +**Computes:** Optimize pillar radii in a metalens unit-cell library to +maximize focal spot intensity + +**Status:** `BLOCKED ON §2.7` (adjoint) + §1.1 (periodic BCs) + +--- + +### Category 13: Cylindrical Coordinates + +--- + +#### E43. Cylinder Scattering (Cylindrical FDTD) + +**Priority:** P2 +**Validates:** Cylindrical coordinate kernels, r=0 singularity handling, +~90× speedup vs. 3D for rotationally symmetric problems +**Reference:** Meep `cylinder_cross_section` +**Computes:** Scattering cross section of a finite dielectric cylinder +using cylindrical (r,z) FDTD with angular mode index m + +**Status:** `BLOCKED ON §3.14` (cylindrical kernels) + +--- + +#### E44. 
Fresnel Zone Plate Focusing + +**Priority:** P3 +**Validates:** Cylindrical coordinates, near-to-far field, large-scale +diffractive optics +**Reference:** Meep `zone_plate` +**Computes:** Focal spot intensity profile of a binary zone plate using +cylindrical FDTD + +**Status:** `BLOCKED ON §3.14` (cylindrical) + §1.5 + §3.10 (near-to-far) + +--- + +### Category 14: CFS-PML and Boundary Stability + +--- + +#### E45. CFS-PML Evanescent Wave Absorption + +**Priority:** P1 +**Validates:** CFS-PML (κ, α parameters) stability for evanescent fields +**Reference:** Tidy3D `AbsorbingBoundaryReflection` +**Computes:** Compare PML reflectance for standard σ-only PML vs. CFS-PML +with κ and α; show improved absorption at grazing incidence and for +evanescent waves + +**Setup:** +- Dipole source near PML boundary +- Measure spurious reflection amplitude with and without κ/α +- Long-time simulation to test stability + +**Status:** `BLOCKED ON §3.8` (CFS-PML activation) + +--- + +### Category 15: 2D Fundamentals + +--- + +#### E46. 2D TM Dipole Radiation + +**Priority:** P0 +**Validates:** 2D simulation code path (`TwoD_TM`), field patterns match +analytical 2D Green's function +**Reference:** Foundational FDTD validation +**Computes:** Ez field from a 2D point source; compare amplitude decay +to 1/√r (2D cylindrical wave) + +**Status:** `CAN BUILD NOW` + +--- + +#### E47. 2D TE Waveguide Propagation + +**Priority:** P0 +**Validates:** 2D TE mode (`TwoD_TE`), correct field components (Hx, Hy, Ez) +**Computes:** TE mode propagation in a dielectric slab waveguide; verify +confinement and propagation constant + +**Status:** `CAN BUILD NOW` + +--- + +### Category 16: Subpixel Averaging + +--- + +#### E48. Subpixel Averaging Convergence + +**Priority:** P1 +**Validates:** Second-order convergence at curved dielectric interfaces +**Reference:** Meep subpixel smoothing, Tidy3D `SubpixelSpec` +**Computes:** Ring resonator frequency vs. 
resolution with and without +subpixel averaging; verify O(Δx²) convergence with averaging vs. +O(Δx) without + +**Status:** `BLOCKED ON §3.5` (subpixel averaging) + +--- + +### Category 17: Checkpoint and I/O + +--- + +#### E49. Checkpoint / Restart + +**Priority:** P3 +**Validates:** HDF5 save/load of simulation state, exact restart +**Reference:** Meep HDF5, fdtdx JAX checkpointing +**Computes:** Run simulation for N steps, save state, restart, run N more +steps; compare to continuous 2N-step run — results should be identical + +**Status:** `BLOCKED ON §3.20` (HDF5 I/O) + +--- + +## Cross-Solver Example Coverage Matrix + +This matrix shows which examples exist in each solver and whether Khronos +can implement them today or needs roadmap features. + +| Example | Meep | Tidy3D | fdtdx | Khronos | Status | +|---------|------|--------|-------|---------|--------| +| **Basics** | | | | | | +| Dielectric slab transmission | `bend-flux` | `StartHere` | — | `periodic_slab.jl` ✓ | EXISTS | +| 2D dipole radiation | `straight-waveguide` | — | — | E46 | CAN BUILD | +| 2D waveguide propagation | `straight-waveguide` | — | — | E47 | CAN BUILD | +| Planewave in vacuum | `oblique-planewave` | — | `simulate_gaussian_source` | `planewave.jl` ✓ | EXISTS | +| CW steady-state check | `solve-cw` | — | — | E25 | CAN BUILD | +| **Fresnel / Reflectance** | | | | | | +| Normal Fresnel | `refl-quartz` | — | — | E2 | CAN BUILD | +| Angular Fresnel | `refl-angular` | `BroadbandPlaneWave...` | — | E3 | BLOCKED (BCs) | +| Dispersive Fresnel | `refl-quartz` | `Dispersion` | — | E16 | BLOCKED (ADE) | +| **Scattering** | | | | | | +| Mie scattering (sphere) | `mie_scattering` | `Near2FarSphereRCS` | — | E4 | BLOCKED (flux) | +| Differential cross section | `differential_cross_section` | `PECSphereRCS` | — | E15 | BLOCKED (N2F) | +| Cylinder cross section | `cylinder_cross_section` | — | — | E43 | BLOCKED (cyl) | +| **Waveguides** | | | | | | +| Waveguide mode launch | — | `ModalSourcesMonitors` | 
`check_waveguide_modes` | `waveguide.jl` ✓ | EXISTS | +| Waveguide bend | `bent-waveguide` | `EulerWaveguideBend` | — | E5 | BLOCKED (flux) | +| Waveguide taper | `mode-decomposition` | `WaveguideSizeConverter` | — | E10 | BLOCKED (mode) | +| Directional coupler | `coupler` | `DirectionalCoupler` | — | E11 | BLOCKED (flux) | +| Width sweep (loss) | — | — | `width_sweep_analysis` | E9 | PARTIAL | +| **Resonators** | | | | | | +| Ring resonator Q | `ring` | `RingResonator` | — | E6 | BLOCKED (time mon) | +| PhC cavity Q | `holey-wvg-cavity` | `NanobeamCavity` | — | E8 | BLOCKED (BCs) | +| PhC band structure | `holey-wvg-bands` | `Bandstructure` | — | E7 | BLOCKED (BCs) | +| **Gratings** | | | | | | +| Binary grating orders | `binary_grating` | `GratingEfficiency` | — | E12 | BLOCKED (BCs) | +| Grating coupler | — | `GratingCoupler` | — | E13 | BLOCKED (BCs) | +| **Far-Field** | | | | | | +| Dipole radiation pattern | `antenna-radiation` | `AntennaCharacteristics` | — | E14 | BLOCKED (N2F) | +| Zone plate focusing | `zone_plate` | `ZonePlateFieldProjection` | — | E44 | BLOCKED (cyl+N2F) | +| **Materials** | | | | | | +| Lorentzian dispersion | `refl-quartz` | `Dispersion` | — | E16 | BLOCKED (ADE) | +| Drude metal | — | `PlasmonicNanoparticle` | — | E17 | BLOCKED (ADE) | +| χ³ nonlinearity | `3rd-harm-1d` | `KerrSidebands` | — | E18 | BLOCKED (NL) | +| Anisotropic dielectric | `polarization_grating` | `FullyAnisotropic` | `gaussian_anisotropic` | E19 | CAN BUILD | +| **Boundaries** | | | | | | +| PEC/PMC cavity | — | `BoundaryConditions` | — | E20 | BLOCKED (PEC) | +| Periodic verification | `oblique-planewave` | `BoundaryConditions` | — | E21 | BLOCKED (BCs) | +| CFS-PML stability | — | `AbsorbingBoundaryReflection` | — | E45 | BLOCKED (CFS) | +| **Advanced** | | | | | | +| Mirror symmetry | `bend-flux` | `Symmetry` | — | E22 | BLOCKED (sym) | +| Non-uniform grid | — | `AutoGrid` | — | E23 | BLOCKED (grid) | +| LDOS | `metal-cavity-ldos` | — | — | E27 | BLOCKED 
(LDOS) | +| Subpixel convergence | — | — | — | E48 | BLOCKED (subpx) | +| **Performance** | | | | | | +| Cross-solver benchmark | — | — | — | E34 | CAN BUILD | +| Single-GPU throughput | — | — | — | E28/E29 | CAN BUILD | +| Backend comparison | — | — | — | E30 | CAN BUILD | +| Precision (F32 vs F64) | — | — | — | E31 | CAN BUILD | +| JIT overhead | — | — | — | E32 | CAN BUILD | +| Memory capacity | — | — | — | E33 | CAN BUILD | +| Weak scaling (multi-GPU) | — | — | — | E35 | BLOCKED (MPI) | +| Strong scaling (multi-GPU) | — | — | — | E36 | BLOCKED (MPI) | +| Halo profiling | — | — | — | E37 | BLOCKED (MPI) | +| Large-scale realistic | — | — | — | E38a-d | PARTIAL | +| Roofline analysis | — | — | — | E39 | CAN BUILD | +| Non-uniform grid perf | — | — | — | E40 | BLOCKED (grid) | +| **Inverse Design** | | | | | | +| Waveguide bend opt | — | `Autograd8WaveguideBend` | `optimize_ceviche_corner` | E41 | BLOCKED (adj) | +| Metalens opt | — | `Autograd7Metalens` | — | E42 | BLOCKED (adj) | +| **Gaussian Beam** | | | | | | +| Beam waist validation | — | `AdvancedGaussianSources` | `simulate_gaussian_source` | E24 / existing ✓ | PARTIAL | + +--- + +## Implementation Priority + +### Phase 1: Build Now (no new features required) + +These examples can be implemented immediately with the current Khronos +feature set and provide valuable coverage: + +| Example | What it validates | +|---------|-------------------| +| **E2** Fresnel Reflectance (Normal) | Material interfaces, DFT accuracy, two-run workflow | +| **E25** CW Steady-State Verification | CW source, Green's function decay | +| **E46** 2D TM Dipole | 2D code path validation | +| **E47** 2D TE Waveguide | 2D TE code path validation | +| **E19** Anisotropic Material Slab | Per-axis permittivity | +| **E1** (enhance) Periodic Slab | Add 2D version, error metrics | +| **E24** (enhance) Gaussian Beam | Add waist measurement | +| | | +| **Scaling / Performance (CAN BUILD NOW):** | | +| **E28** Single-GPU Throughput vs. 
Size | GPU utilization curve, crossover vs. CPU | +| **E29** Throughput vs. Physics Complexity | Kernel specialization overhead | +| **E30** Backend Comparison | CUDA/ROCm/Metal/CPU relative speed | +| **E31** Precision Comparison (F32 vs F64) | Speed/accuracy tradeoff | +| **E32** JIT Compilation Overhead | Cold vs. warm start timing | +| **E33** Memory Capacity Test | Max problem size per GPU | +| **E34** Cross-Solver Performance | Khronos vs. Meep vs. fdtdx | +| **E38d** Vacuum Stress Test | Max grid at memory limit | +| **E39** Kernel Roofline Analysis | Memory bandwidth utilization | + +### Phase 2: After P0 Features (§1.1–§1.3, §3.1–§3.3) + +Once periodic BCs and flux monitors are implemented: + +| Example | Unlocked by | +|---------|-------------| +| **E3** Angular Fresnel | §1.1 + §3.1 (Bloch BCs) | +| **E4** Mie Scattering | §1.3 + §3.3 (flux monitors) | +| **E5** 2D Waveguide Bend | §1.3 + §3.3 (flux monitors) | +| **E21** Periodic Verification | §1.1 + §3.1 (periodic BCs) | +| **E9** Mode Source (full) | §1.3 + §3.3 (flux monitors) | +| **E38b** PIC Benchmark | §1.3 + §3.3 (flux monitors) | +| **E38c** Large Mie Benchmark | §1.3 + §3.3 (flux monitors) | + +### Phase 3: After P0–P1 Features (§1.2, §3.2, §1.4, §3.4, §3.8, §3.12) + +Once dispersive materials, mode monitors, CFS-PML, and time monitors work: + +| Example | Unlocked by | +|---------|-------------| +| **E16** Dispersive Fresnel | §1.2 + §3.2 (ADE) | +| **E17** Drude Metal | §1.2 + §3.2 (ADE) | +| **E6** Ring Resonator Q | §3.12 (time monitor) | +| **E7** PhC Bands | §1.1 + §3.1 (BCs) | +| **E10** Waveguide Taper | §1.4 + §3.4 (mode overlap) | +| **E45** CFS-PML Stability | §3.8 (CFS-PML) | +| **E12** Binary Grating | §1.1 + §1.9 + §3.16 | +| **E40** Non-Uniform Grid Perf | §2.5 + §3.6 (grid) | + +### Phase 4: After P2 Features + +| Example | Unlocked by | +|---------|-------------| +| **E14** Antenna Pattern | §1.5 + §3.10 (near-to-far) | +| **E18** THG Nonlinear | §1.6 + §3.13 (nonlinear) | +| 
**E22** Symmetry | §1.7 + §2.4 (symmetry) | +| **E23** Non-Uniform Grid Accuracy | §2.5 + §3.6 (grid) | +| **E43** Cylindrical | §3.14 (cylindrical) | +| **E35/E36/E37** Multi-GPU Scaling | §2.1 + §3.7 (MPI) | +| **E41** Inverse Design | §2.7 (adjoint) | +| **E38a** Metalens Benchmark | §1.1 (periodic BCs) | + +--- + +## Quantitative Validation Standards + +Every example with an analytical or cross-solver reference should report: + +1. **Accuracy metric**: Max absolute error, RMS error, or relative error + vs. analytical/reference solution. +2. **Convergence**: Error vs. resolution (cells/wavelength) should show + expected order (first-order without subpixel, second-order with). +3. **Performance**: Cells/second, total wall time, peak GPU memory. +4. **Pass/fail threshold**: Each example should define an acceptable error + bound (e.g., |T_khronos - T_analytic| < 0.01 for all frequencies). + +Example output format: + +``` +======================================== +Khronos.jl Validation: E2 Fresnel Reflectance +======================================== +Grid: 200 x 1 x 1600 (resolution=40) +Backend: CUDA (NVIDIA A100) +Precision: Float64 + +Results: + Max |R_err|: 0.0023 + RMS R_err: 0.0008 + Frequencies: 100 points (0.4 - 1.0 μm) + +Performance: + Wall time: 12.3 s (110 timesteps warm) + Cells/second: 1.2e9 + GPU memory: 0.8 GB + +Status: PASS (max error < 0.01 threshold) +======================================== +``` diff --git a/docs/ROADMAP.md b/docs/ROADMAP.md new file mode 100644 index 0000000..c735dea --- /dev/null +++ b/docs/ROADMAP.md @@ -0,0 +1,1463 @@ +# Khronos.jl Feature Roadmap + +This document catalogs every feature gap between Khronos.jl and the three +reference FDTD solvers (Meep, Tidy3D, fdtdx), organized around a three-layer +architecture that cleanly separates concerns and enables long-term scalability. 
+ +Reference codebases: + +| Solver | Language | GPU | Differentiable | Open-Source | +|--------|----------|-----|----------------|-------------| +| **Meep** | C++ / Python | No | Adjoint (autograd, black-box) | Yes (GPL-2) | +| **Tidy3D** | Python API / proprietary backend | Yes (cloud) | Adjoint (autograd) | API only | +| **fdtdx** | Python / JAX | Yes (JAX) | Yes (time-reversible custom VJP) | Yes (MIT) | +| **Khronos.jl** | Julia | Yes (KernelAbstractions) | Not yet (Enzyme.jl potential) | Yes (MIT) | + +--- + +## Architecture Overview + +Khronos.jl is structured as three distinct layers, each with a clear +responsibility boundary: + +``` +┌─────────────────────────────────────────────────────────────┐ +│ Layer 1: Frontend │ +│ User-facing API, geometry/material/source/monitor types, │ +│ simulation specification DSL, GUI potential │ +│ │ +│ Emits: Simulation IR (declarative, language-agnostic) │ +├─────────────────────────────────────────────────────────────┤ +│ Layer 2: Graph Compiler │ +│ Domain decomposition, kernel selection/specialization, │ +│ memory planning, launch configuration, schedule generation │ +│ │ +│ Emits: Execution plan (kernel DAG + memory map) │ +├─────────────────────────────────────────────────────────────┤ +│ Layer 3: Engine Backend │ +│ Optimized FDTD kernels, field updates, halo exchange, │ +│ multi-backend execution (CUDA/ROCm/Metal/CPU) │ +│ │ +│ Executes: The plan, producing simulation results │ +└─────────────────────────────────────────────────────────────┘ +``` + +**Layer 1 (Frontend)** is concerned with *what* the user wants to simulate. +It provides the API for defining geometries, materials, sources, monitors, +and boundary conditions. Long-term, it emits a declarative IR (leveraging +`GeometryPrimitives.jl`) that is language-agnostic, enabling non-Julia +frontends (Python bindings, GUI, config files) to target the same compiler. + +**Layer 2 (Graph Compiler)** is concerned with *how* to execute the +simulation efficiently. 
It takes the declarative IR and produces an +execution plan: which kernels to launch, in what order, with what +specializations, on what devices, and with what memory layout. Initially +this layer handles domain decomposition and kernel selection; progressively +it evolves toward full type-specialized codegen. + +**Layer 3 (Engine Backend)** is concerned with *executing* the plan. It owns +the actual FDTD kernels (`@kernel` functions), field array storage, time +stepping, PML updates, source injection, and monitor accumulation. It +receives a fully resolved execution plan and runs it. + +--- + +## Priority Definitions + +| Tier | Label | Meaning | +|------|-------|---------| +| **P0** | Critical | Blocks the majority of practical FDTD workflows | +| **P1** | High | Required for competitive feature parity or significant performance gains | +| **P2** | Medium | Important for specific application domains | +| **P3** | Low | Nice-to-have, niche, or complementary | + +--- + +# Layer 1: Frontend + +The Frontend layer defines the user-facing API and the declarative +simulation specification. Features here affect how users describe +simulations, what physical models they can express, and how the simulation +description is represented internally. + +--- + +## 1.1 Boundary Specification API — P0 + +**Present in:** Meep, Tidy3D, fdtdx + +**Why critical:** Users currently have no way to request anything other than +PML. The API must support per-axis, per-side boundary type selection. + +**Current state:** `README.md:27` states "All simulations require PML." No +boundary type selection exists in the user API. 
+ +**Implementation guidance:** + +Add a per-axis, per-side boundary type enum and specification struct: + +```julia +@enum BoundaryType PML_BC Periodic_BC Bloch_BC PEC_BC PMC_BC + +struct BoundarySpec + x::Tuple{BoundaryType, BoundaryType} # (minus, plus) + y::Tuple{BoundaryType, BoundaryType} + z::Tuple{BoundaryType, BoundaryType} +end +``` + +Tidy3D's `BoundarySpec` with per-face `Boundary(minus, plus)` is a good +model. Add a `boundary_spec` field to `Simulation`, defaulting to PML on +all faces for backward compatibility. + +For Bloch-periodic BCs, add a `k_point::Union{SVector{3},Nothing}` field +on `Simulation` that the user sets to specify the Bloch wave vector. + +**Estimated scope:** ~80 lines in `DataStructures.jl`, `Simulation.jl`. + +--- + +## 1.2 Dispersive Material Types (Lorentz / Drude / Debye Poles) — P0 + +**Present in:** Meep, Tidy3D (both extensive) + +**Why critical:** Metals (Au, Ag, Cu, Al), semiconductors at optical +frequencies, and any broadband simulation through dispersive dielectrics +require frequency-dependent permittivity. Users need to be able to specify +these material models. + +**Current state:** `DataStructures.jl:240` has `#TODO: add info relevant for +polarizability`. The `Polarizability.jl` section (line 249-252) is empty. + +**Implementation guidance:** + +Define a `Susceptibility` abstract type with concrete subtypes: + +```julia +abstract type Susceptibility end + +struct LorentzianPole{N} <: Susceptibility + σ::N # oscillator strength (Δε * ω₀²) + ω₀::N # resonance frequency + γ::N # damping rate +end + +struct DrudePole{N} <: Susceptibility + σ::N # plasma-frequency-squared term + γ::N # collision frequency +end +``` + +Add a `poles::Vector{Susceptibility}` field to `Material`. + +For maximum generality, consider implementing the Pole-Residue +representation (as in Tidy3D's `PoleResidue` class). All Lorentz, Drude, +Debye, and Sellmeier models can be converted to pole-residue pairs `(aₙ, cₙ)`. 
+ +**Key references:** +- Taflove & Hagness, *Computational Electrodynamics*, Ch. 9 (ADE method) +- Meep `meep.hpp:88-278` (susceptibility class hierarchy) + +**Estimated scope:** ~150 lines for types in new `Susceptibility.jl` and +modifications to `DataStructures.jl`. + +--- + +## 1.3 Flux Monitor Type — P0 + +**Present in:** Meep (`dft_flux`), Tidy3D (`FluxMonitor`), fdtdx (`PoyntingFluxDetector`) + +**Why critical:** Computing transmission and reflection spectra is the +single most common FDTD post-processing task. Users need to be able to +specify flux monitors in the simulation description. + +**Current state:** Khronos has `DFTMonitor` (single-component +frequency-domain fields) but no flux monitor type. + +**Implementation guidance:** + +Add a `FluxMonitor` type that specifies a planar region and frequency list: + +```julia +struct FluxMonitor + center::SVector{3} + size::SVector{3} # one dimension should be 0 (planar) + freqs::Vector{Float64} + normal::Direction # derived from size, or user-specified +end +``` + +The flux computation itself happens in the Engine (see §3.3). The Frontend +only needs the type definition and validation (ensure exactly one dimension +is zero). + +**Estimated scope:** ~50 lines in `Monitors.jl` / `DataStructures.jl`. + +--- + +## 1.4 Mode Decomposition Monitor — P1 + +**Present in:** Meep (MPB integration), Tidy3D (`ModeMonitor`), fdtdx (`ModeOverlapDetector`) + +**Why it matters:** Determines how much power couples into each waveguide +mode. Essential for photonic circuit design (S-parameters, insertion loss, +crosstalk). + +**Current state:** Khronos has a mode solver (`VectorModesolver.jl`) and +mode source injection, but no overlap monitor type. + +**Implementation guidance:** + +Add a `ModeMonitor` type specifying the cross-section plane, modes of +interest, and frequencies. The mode-solve and overlap computation happens +in the Engine (see §3.4). + +**Estimated scope:** ~50 lines. 
+ +--- + +## 1.5 Near-to-Far Field Monitor — P1 + +**Present in:** Meep (full dyadic Green's function), Tidy3D (angle/Cartesian/k-space) + +**Why it matters:** Computing radiation patterns, scattering cross-sections, +and far-field intensities without simulating the full propagation distance. + +**Implementation guidance:** + +Add monitor types for near-to-far field projection: + +```julia +struct NearFieldSurface + center::SVector{3} + size::SVector{3} + freqs::Vector{Float64} +end + +struct FarFieldProjection + surface::NearFieldSurface + observation::Union{AngleGrid, CartesianGrid, KSpaceGrid} +end +``` + +Support observation in angle space (θ, φ), Cartesian space (x, y at +distance d), and k-space (kx/k₀, ky/k₀), following Tidy3D's model. + +**Estimated scope:** ~100 lines for type definitions. + +--- + +## 1.6 Nonlinear Material Types (χ² and χ³) — P2 + +**Present in:** Meep (χ² and χ³ with Padé approximant), Tidy3D (Kerr, TPA, FCA) + +**Implementation guidance:** + +Add `chi2` and `chi3` fields to `Material`: + +```julia +struct Material{N} + ... + chi2::N # second-order nonlinear susceptibility + chi3::N # third-order nonlinear susceptibility +end +``` + +The nonlinear update equations are handled by the Engine (see §3.6). + +**Key references:** +- Meep `step_generic.cpp:535-547` +- Tidy3D `nonlinear.py` (`KerrNonlinearity`, `TwoPhotonAbsorption`) + +**Estimated scope:** ~30 lines in `DataStructures.jl`. + +--- + +## 1.7 Symmetry Specification — P2 + +**Present in:** Meep (mirror, rotate2, rotate4), Tidy3D (per-axis ±1) + +**Implementation guidance:** + +Add a `symmetry::SVector{3,Int}` field on `Simulation` (values: 0, +1, -1 +per axis). The Graph Compiler uses this to halve grid dimensions and the +Engine applies symmetry-aware boundary conditions. The Frontend only needs +the type and validation. + +**Key references:** +- Tidy3D `Simulation.symmetry` (per-axis 0/+1/-1) + +**Estimated scope:** ~30 lines. 
+ +--- + +## 1.8 TFSF (Total-Field/Scattered-Field) Source — P2 + +**Present in:** Tidy3D (`TFSF`), fdtdx (`tfsf.py`) + +**Implementation guidance:** + +Add a `TFSFSource` type specifying the injection box, plane wave direction, +polarization, and time profile. TFSF injects a plane wave inside a +rectangular volume while maintaining zero incident field outside. + +**Key references:** +- Taflove & Hagness, Ch. 5.6 (TFSF formulation) +- Tidy3D `source/field.py` (`TFSF` class) + +**Estimated scope:** ~60 lines for the type definition. + +--- + +## 1.9 Diffraction Monitor — P2 + +**Present in:** Tidy3D (`DiffractionMonitor`), fdtdx (`DiffractiveDetector`) + +**Implementation guidance:** + +Add a `DiffractionMonitor` type for periodic structures. Specifies the +monitor plane and diffraction orders of interest. The FFT-based computation +happens in the Engine. + +**Estimated scope:** ~40 lines. + +--- + +## 1.10 LDOS Monitor — P2 + +**Present in:** Meep (`dft_ldos`) + +**Implementation guidance:** + +Add an `LDOSMonitor` type that is co-located with a point dipole source. +Records the DFT of the E-field at the source location. + +**Key references:** +- Meep `dft_ldos.cpp:60-80` + +**Estimated scope:** ~30 lines. + +--- + +## 1.11 Force / Stress Tensor Monitor — P2 + +**Present in:** Meep (`dft_force`, stress tensor integration) + +**Implementation guidance:** + +Add a `ForceMonitor` type specifying a closed surface around an object. +The Maxwell stress tensor integration happens in the Engine. + +**Key references:** +- Meep `stress.cpp:90-114` + +**Estimated scope:** ~40 lines. + +--- + +## 1.12 Custom Sources (Current and Time Profile) — P3 + +**Current state:** `CustomCurrentSource` at `SpatialSources.jl:252-277` is +`# TODO`. Custom time source at `TimeSources.jl:138` is `# TODO`. + +**Implementation:** Allow users to pass arbitrary arrays (spatial current +distribution) and functions (time profile) to the source infrastructure. + +**Estimated scope:** ~100 lines. 
+ +--- + +## 1.13 GDS Import — P3 + +**Present in:** Meep (libGDSII), Tidy3D (gdstk) + +**Implementation:** Use `GDSTK.jl` or parse GDSII directly to extract +polygon vertices. Convert polygons to `GeometryPrimitives.jl` shapes (or +a new `PolySlab` type). + +**Estimated scope:** ~200 lines. + +--- + +## 1.14 PolySlab Geometry (Extruded Polygon with Sidewall Angle) — P3 + +**Present in:** Tidy3D (`PolySlab` with `sidewall_angle`), fdtdx (extruded polygon) + +**Implementation:** Define a `PolySlab` shape with 2D polygon vertices, +extrusion axis, slab bounds, and optional sidewall taper angle. Use +point-in-polygon testing for the cross-section. + +**Estimated scope:** ~200 lines. + +--- + +## 1.15 STL / Triangle Mesh Import — P3 + +**Present in:** Tidy3D (`TriangleMesh`) + +**Implementation:** Parse STL files into triangle meshes. Use ray-casting +(Möller-Trumbore intersection) for point-in-mesh testing during geometry +initialization. + +**Estimated scope:** ~300 lines. + +--- + +## 1.16 Astigmatic Gaussian Beam Source — P3 + +**Present in:** Tidy3D (`AstigmaticGaussianBeam`) + +**Implementation:** Extend `GaussianBeamSource` to support two independent +waist radii and waist distances (one per transverse axis). + +**Estimated scope:** ~50 lines. + +--- + +## 1.17 Time Modulation of Materials — P3 + +**Present in:** Tidy3D (`ModulationSpec`, `SpaceTimeModulation`) + +**Implementation:** Allow ε and σ to vary sinusoidally in time. Add a +`ModulationSpec` field to `Material` specifying modulation amplitude, +frequency, and phase. + +**Estimated scope:** ~60 lines for types. + +--- + +## 1.18 Lumped Circuit Elements — P3 + +**Present in:** Tidy3D (`LumpedResistor`, `RLCNetwork`) + +**Implementation:** Define R, L, C element types that embed as modified +update equations in localized grid regions. + +**Estimated scope:** ~80 lines for types. 
+ +--- + +## 1.19 Simulation IR Specification — P1 + +**Not present in:** any reference solver (all are monolithic) + +**Why it matters:** A declarative, language-agnostic intermediate +representation decouples the user API from the compiler and engine. This +enables multiple frontends (Julia API, Python bindings, GUI, config files) +to target the same simulation pipeline. + +**Implementation guidance:** + +The IR should be a serializable representation of a complete simulation +specification, leveraging `GeometryPrimitives.jl` for shape primitives. +It captures: + +1. **Domain**: cell size, resolution (uniform or non-uniform), dimension. +2. **Geometry**: ordered list of geometric objects with material assignments. +3. **Materials**: permittivity, permeability, conductivity, susceptibility + poles, nonlinear coefficients. +4. **Sources**: type, spatial extent, time profile, polarization. +5. **Monitors**: type, location, frequencies, components. +6. **Boundaries**: per-axis, per-side boundary types + Bloch k-vector. +7. **Symmetries**: per-axis symmetry phases. +8. **Run parameters**: total time, Courant factor, convergence criteria. + +Initially this can be a Julia struct hierarchy (essentially a cleaned-up +version of `SimulationData`). The language-agnostic serialization format +(JSON, MessagePack, or a schema-defined binary format) comes later. + +**Estimated scope:** ~300 lines for initial Julia struct IR. + +--- + +## 1.20 Energy Monitor — P3 + +**Present in:** fdtdx (`EnergyDetector`) + +**Implementation:** Add an `EnergyMonitor` type. Computes +`u = ½(ε|E|² + μ|H|²)` at monitor locations, either instantaneous or +DFT-based. + +**Estimated scope:** ~30 lines for the type. + +--- + +# Layer 2: Graph Compiler + +The Graph Compiler takes a complete simulation specification (from Layer 1) +and produces an optimized execution plan for Layer 3. 
It is responsible for +all decisions about *how* to run the simulation efficiently: domain +decomposition, kernel specialization, memory layout, device assignment, and +scheduling. + +This layer evolves progressively: + +- **Phase A (near-term):** Domain decomposition + kernel selection based on + active physics modules. Essentially formalizes the logic currently + embedded in `prepare_simulation!`. +- **Phase B (mid-term):** Full type-specialized kernel DAG generation. + Analyzes which physics modules are active (dispersive? nonlinear? PML?) + and produces a minimal kernel schedule, eliminating branches at runtime. +- **Phase C (long-term):** Cost-model-driven optimization. Profile-guided + kernel fusion, occupancy tuning, memory placement (shared vs global), + and multi-device load balancing. + +--- + +## 2.1 Domain Decomposition Planner — P1 + +**Present in:** Meep (MPI), fdtdx (JAX `NamedSharding`), Tidy3D (cloud) + +**Why it matters:** Single-GPU memory limits simulation size to ~2000³ +voxels (Float32). The compiler must decide how to partition the domain +across devices. + +**Current state:** No multi-device logic exists. Ghost cells exist +(`OffsetArrays` with ±1 padding) and 8 `# TODO update halo` comments mark +insertion points. + +**Implementation guidance:** + +The domain decomposition planner takes the simulation grid dimensions and +available device count, then produces a partition map: + +1. **Single-axis decomposition** — Split along the longest axis. Assign + contiguous slab regions to each device/rank. Emit halo-exchange + instructions in the execution plan between each field-update substep. + +2. **Cost-balanced partitioning** — Weight partitions by computational cost + (PML regions are more expensive than free space). Meep + `structure.cpp:66-94` implements binary-tree cost-based partitioning. + +3. **Multi-axis decomposition** — For large device counts, split along + 2-3 axes using a 3D process grid. 
+ +The planner emits a `PartitionPlan` struct consumed by the Engine: + +```julia +struct PartitionPlan + axis::Int # split axis + slabs::Vector{UnitRange{Int}} # per-device grid ranges + neighbors::Vector{Tuple{Int,Int}} # (left, right) rank per device + halo_width::Int +end +``` + +**Key references:** +- Meep `structure.cpp:66-94` (cost-based partitioning) +- fdtdx `sharding.py` (`get_named_sharding_from_shape`) + +**Estimated scope:** ~300 lines. + +--- + +## 2.2 Kernel Selection and Specialization — P1 + +**Not present in:** any reference solver (unique to Khronos) + +**Why it matters:** Khronos's `Nothing`-vs-`AbstractArray` dispatch pattern +in `Timestep.jl` generates ~32+ specialized GPU kernels from a single +source. The Graph Compiler should formalize this: given the set of active +physics (PML? dispersive? nonlinear? magnetic materials?), select the +minimal set of kernel specializations needed. + +**Current state:** Kernel specialization happens implicitly through Julia's +multiple dispatch at JIT time. This works but is opaque — the user and +compiler cannot inspect or optimize the kernel schedule. + +**Implementation guidance:** + +1. **Physics feature flags** — Analyze the simulation IR to determine which + features are active: + - PML boundaries → PML-aware curl kernels + - Dispersive materials → ADE update kernels + - Nonlinear materials → nonlinear constitutive update + - μ ≠ 1 → non-trivial H-from-B update + - Non-uniform grid → vector Δx kernels + +2. **Kernel DAG** — Produce a directed acyclic graph of kernel invocations + per timestep. Each node specifies the kernel function, its type + parameters (which dispatch variant), and its array arguments. The DAG + encodes dependencies (e.g., ADE update must happen between D-update and + E-update). + +3. **Specialization table** — For each region of the grid (interior, PML, + interface), record which kernel variant applies. Regions with different + physics get different kernel launches. 
+ +**Estimated scope:** ~400 lines. + +--- + +## 2.3 Memory Planning — P2 + +**Why it matters:** GPU memory is the primary bottleneck for 3D FDTD. The +compiler should compute the total memory footprint before launching and +warn (or auto-adjust resolution) if it exceeds device capacity. + +**Implementation guidance:** + +1. **Static analysis** — From the simulation IR, compute: + - Field arrays: `6 × Nx × Ny × Nz × sizeof(T)` (E and H) + - PML arrays: auxiliary ψ fields in PML regions + - Dispersive arrays: P and P_prev per pole per component + - Monitor arrays: DFT accumulation buffers + - Material arrays: ε, μ, σ per component + +2. **Memory budget** — Query device memory via KernelAbstractions and + compare against the computed requirement. + +3. **Auto-adjustment** — Optionally suggest resolution reduction, single- + precision, or multi-GPU decomposition if the problem exceeds single-GPU + memory. + +**Estimated scope:** ~200 lines. + +--- + +## 2.4 Symmetry Exploitation Planner — P2 + +**Present in:** Meep (mirror, rotate2, rotate4), Tidy3D (per-axis ±1) + +**Implementation guidance:** + +When the simulation IR specifies symmetries (see §1.7), the compiler: + +1. Halves the grid dimensions along each symmetric axis. +2. Modifies boundary conditions at symmetry planes: PEC-like (symmetric E, + antisymmetric H) for +1 symmetry, PMC-like for -1 symmetry. +3. Applies phase factors when monitors read fields across the symmetry + plane. +4. Reduces memory and computation by 2× per axis (up to 8× for 3 axes). + +The compiler emits a modified grid spec and symmetry boundary instructions +for the Engine. + +**Key references:** +- Meep `vec.hpp:1181-1231` (symmetry class) + +**Estimated scope:** ~200 lines. 
+ +--- + +## 2.5 Non-Uniform Grid Planning — P1 + +**Present in:** Tidy3D (per-axis `GridSpec`, `AutoGrid`, `CustomGrid`) +**Not in:** Meep (scalar `a`), fdtdx (scalar `resolution`) + +**Why it matters:** Structures with disparate feature scales waste enormous +memory with uniform grids. Per-axis resolution alone can give 4-100× +memory savings. **No other open-source GPU FDTD solver has this.** + +**Current state:** `DataStructures.jl:341-343` defines `Δx, Δy, Δz` as +`Union{Vector{N}, N, Nothing}` — designed from day one for non-uniform +grids. But all code paths treat them as scalars. + +**Implementation guidance:** + +The compiler resolves grid spacing from the simulation IR: + +1. **Per-axis uniform (Δx ≠ Δy ≠ Δz)** — Allow `resolution` to be a + `Tuple{Int,Int,Int}`. Compute `Δx = cell_size[1]/resolution[1]` etc. + Update Courant: `Δt = C / √(1/Δx² + 1/Δy² + 1/Δz²)`. + +2. **Per-cell non-uniform** — Produce `Δx::Vector{N}` arrays. The + compiler selects the correct kernel variants (vector Δx dispatch). + PML σ profiles become per-cell. Courant condition uses + `Δt = C × min(Δx_min, Δy_min, Δz_min) / √3`. + +3. **AutoGrid (future)** — Compute per-cell sizes based on local + wavelength in material and feature proximity (Tidy3D model). + +**Key references:** +- Tidy3D `grid_spec.py` (`AutoGrid`, `CustomGrid`, `MeshOverrideStructure`) + +**Estimated scope:** ~200 lines for per-axis uniform; ~400 additional for +per-cell planning. + +--- + +## 2.6 Launch Configuration Optimizer — P3 + +**Why it matters:** GPU performance is sensitive to thread block size, +shared memory usage, and register pressure. The compiler should auto-tune +launch configurations based on kernel type and grid dimensions. + +**Implementation guidance:** + +1. Query device compute capability and SM count. +2. For each kernel, compute occupancy at various block sizes. +3. 
Select the block size that maximizes occupancy (or use a heuristic + based on kernel characteristics — memory-bound vs compute-bound). +4. KernelAbstractions.jl's `@kernel` already handles some of this, but + explicit workgroup size selection can improve performance by 10-30%. + +**Estimated scope:** ~150 lines. + +--- + +## 2.7 Adjoint Compilation — P2 + +**Present in:** Meep (autograd), Tidy3D (autograd), fdtdx (time-reversible JAX AD) + +**Implementation guidance:** + +The compiler supports inverse design by producing both forward and adjoint +execution plans: + +1. **Enzyme.jl path (recommended)** — The compiler marks kernels for + Enzyme differentiation. Unlike fdtdx (which manually implements + `update_E_reverse()` and `update_H_reverse()` in `backward.py`), + Enzyme.jl differentiates directly through KernelAbstractions kernels. + +2. **Manual adjoint path** — The compiler generates the time-reversed + kernel DAG with adjoint sources from objective function gradients. + +Supporting infrastructure in the compiler: +- `DesignRegion` mapping latent parameters to permittivity arrays. +- Filter + projection pipeline (Gaussian filter → tanh projection). + fdtdx's `SubpixelSmoothedProjection` (`projection.py:215`, based on + arxiv.org/2503.20189) is the state of the art. +- Minimum feature size constraints (fdtdx `BrushConstraint2D`). + +**Key references:** +- fdtdx `fdtd.py:16-230` (reversible FDTD with custom VJP) +- fdtdx `projection.py:215` (subpixel smoothed projection) +- Meep `python/adjoint/` (OptimizationProblem, DesignRegion) +- Tidy3D `plugins/invdes/` (TopologyDesignRegion, FilterProject) + +**Estimated scope:** ~1000+ lines for the full pipeline. + +--- + +## 2.8 KD-Tree Geometry Acceleration — P3 + +**Current state:** `Geometry.jl:8` notes this as a future optimization. +Currently uses linear search through geometry objects. + +**Implementation guidance:** + +During compilation, build a KD-tree (or BVH) over bounding boxes for the +geometry list. 
The Engine uses tree traversal instead of linear `findfirst` +during voxelization. Matters for scenes with hundreds of objects. + +**Estimated scope:** ~150 lines (or use `NearestNeighbors.jl`). + +--- + +# Layer 3: Engine Backend + +The Engine Backend executes the plan produced by the Graph Compiler. It +owns the FDTD kernels, field array storage, time stepping, boundary +condition enforcement, source injection, and monitor accumulation. All +kernels use `KernelAbstractions.jl` `@kernel` macros for multi-backend +portability (CUDA, ROCm, Metal, oneAPI, CPU). + +--- + +## 3.1 Periodic and Bloch-Periodic Boundary Kernels — P0 + +**Present in:** Meep, Tidy3D, fdtdx (periodic only) + +**Why critical:** Required for photonic crystals, gratings, metasurfaces, +waveguide supercells. Without periodic BCs every such simulation must use +PML, wasting compute. + +**Current state:** No periodic boundary code exists. The ghost-cell +infrastructure exists (`Fields.jl:128-139`, `OffsetArrays` with one ghost +pixel per side). The 8 `# TODO update halo` sites in `Timestep.jl` +(lines 74, 108, 144, 178, 235, 243, 253, 264) are the exact insertion +points. + +**Implementation guidance:** + +1. **Periodic BCs** — For each axis flagged as periodic, copy field values + from one side of the domain to the ghost cells on the opposite side + after each field-update substep. Each halo-update site becomes: + + ```julia + if boundary_is_periodic(sim, X()) + copy_periodic_ghost!(sim.fields.fEx, X()) + end + ``` + + On GPU, this is a simple kernel or `copyto!` between slices. + +2. **Bloch-periodic BCs** — Same as periodic, but multiply copied values + by a complex phase factor `exp(i k · L)` where `k` is the Bloch wave + vector and `L` is the lattice vector. This requires: + - Complex-valued fields (switch `backend_number` to complex when + `k_point != nothing`). + - Phase factors `eikna = exp(i * k[d] * cell_size[d])` precomputed + once per axis. 
+ + Reference: Meep `boundaries.cpp:90-105` implements exactly this via + `eikna[d]`, `coskna[d]`, `sinkna[d]`. fdtdx implements periodic BCs + via `jnp.pad(mode="wrap")` in `curl.py`. + +**Estimated scope:** ~220 lines in `Boundaries.jl`, `Timestep.jl`. + +--- + +## 3.2 Dispersive Material ADE Kernels — P0 + +**Why critical:** The material types from §1.2 need corresponding update +kernels. + +**Current state:** Polarization fields `Px/Py/Pz` are wired through +`update_field!` (`Timestep.jl:475-477`) but always passed as `Nothing`. +The kernel dispatch infrastructure means adding real P arrays will +auto-generate the correct specialized kernel with no changes to +`update_field_generic`. + +**Implementation guidance:** + +The standard approach is the Auxiliary Differential Equation (ADE) method. +For each dispersive pole, auxiliary polarization fields `P` and `P_prev` +are stored and updated at each timestep. + +1. **Auxiliary fields** — For each pole, allocate two arrays per component + (`P_current`, `P_prev`) with the same shape as the E-field arrays. + Modify `Fields.jl` to allocate these when + `any(obj -> has_poles(obj.material), geometry)`. + +2. **ADE update kernel** — Insert a new kernel call between + `step_D_from_H!` and `update_E_from_D!` (between lines 30 and 32 of + `step!`). The Lorentzian ADE update is (Meep + `susceptibility.cpp:254`): + + ``` + P_next = γ₁⁻¹ * (P_cur * (2 - ω₀²Δt²) - γ₁ * P_prev + ω₀²Δt² * σ * E) + ``` + + where `γ₁ = 1 + γΔt/2`, `γ₁⁻¹ = 1 / γ₁`. + + The Drude case sets the denominator term differently (no `ω₀²` in the + resonance term). + +3. **Constitutive update** — In `update_E_from_D!`, the P fields are + already wired: `update_field_generic` at `Timestep.jl:437` adds + `P[idx]` to the net field. Once P arrays are non-`Nothing`, this path + activates automatically via dispatch. + +**Key references:** +- Taflove & Hagness, Ch. 
9 (ADE method) +- Meep `susceptibility.cpp:188-261` (Lorentzian ADE update) + +**Estimated scope:** ~350 lines in new `Susceptibility.jl` kernel code, +modifications to `Fields.jl`, `Timestep.jl`. + +--- + +## 3.3 Flux Computation Kernels — P0 + +**Why critical:** The flux monitor type from §1.3 needs corresponding +DFT accumulation and post-processing kernels. + +**Implementation guidance:** + +1. **DFTFluxMonitor kernel** — Store DFT accumulations for 4 tangential + components (e.g., for a z-normal plane: `Ex, Ey, Hx, Hy`). Reuse the + existing `update_dft_monitor!` kernel (`Monitors.jl:120-137`) for each + component independently. + +2. **Flux computation** — After the simulation, compute: + + ```julia + function flux(m::DFTFluxMonitorData, freq_idx) + # For z-normal: S_z = Ex*Hy - Ey*Hx + return real(sum(m.Ex[:,:,freq_idx] .* conj(m.Hy[:,:,freq_idx]) + - m.Ey[:,:,freq_idx] .* conj(m.Hx[:,:,freq_idx]))) + end + ``` + + Scale by `Δx * Δy / (N_timesteps)²` for proper normalization. + +3. **Normal direction handling** — Determine the 4 tangential components + from the monitor plane normal using the existing `Direction` type. + +**Key references:** +- Meep `dft.cpp` (DFT flux implementation) +- fdtdx `poynting_flux.py:11` (`PoyntingFluxDetector`) + +**Estimated scope:** ~150 lines in `Monitors.jl`. + +--- + +## 3.4 Mode Overlap Computation — P1 + +**Why it matters:** The mode monitor type from §1.4 needs the actual +overlap integral computation. + +**Implementation guidance:** + +1. Record DFT of all 4 tangential field components on a cross-section + plane (reuse `DFTMonitor` infrastructure). + +2. Compute the mode profile at each DFT frequency using + `VectorModesolver.jl`. + +3. Compute the overlap integral: + ``` + α = (1/4) ∫ (E_mode × H*_sim + E*_sim × H_mode) · dA + ``` + + This is exactly fdtdx's `ModeOverlapDetector` formula (`mode.py:15`). + +4. The complex coefficient `α` gives both amplitude and phase. Power in + mode `n` is `|αₙ|²`. 
+ +**Key references:** +- Meep `meep.hpp:2145-2151` (`get_overlap`, `get_mode_flux_overlap`) +- fdtdx `detectors/mode.py` + +**Estimated scope:** ~200 lines. + +--- + +## 3.5 Subpixel Averaging — P1 + +**Present in:** Meep (anisotropic averaging), Tidy3D (multiple methods) + +**Why it matters:** Without subpixel averaging, FDTD convergence at curved +dielectric interfaces is first-order instead of second-order. In practice +this means ~4× finer grids (and ~64× more voxels in 3D) for equivalent +accuracy. On GPU where memory is the bottleneck, this directly limits +problem size. + +**Current state:** `Geometry.jl` uses point sampling — each voxel material +is determined by a single `findfirst(point, geometry)` call (lines 187-203). + +**Implementation guidance:** + +The standard method (Farjadpour et al. 2006) computes an effective +inverse-permittivity tensor at each interface voxel: + +``` +ε⁻¹_eff = P‖ * <ε⁻¹> + P⊥ * <ε>⁻¹ +``` + +where `<ε>` and `<ε⁻¹>` are volume-averaged permittivity and its inverse, +`P‖` projects parallel to the interface, and `P⊥` projects perpendicular. + +1. **Normal vector computation** — Sample ε at N quadrature points within + each voxel. The gradient of ε gives the interface normal. Meep uses + sphere quadrature (`anisotropic_averaging.cpp:33-56`). A simpler start + is central differences on the ε grid. + +2. **Volume averaging** — For each voxel straddling a material interface, + compute the fill fraction `f` and averaged `<ε>`, `<ε⁻¹>`. + +3. **Tensor storage** — The effective `ε⁻¹` is a 3x3 symmetric tensor. + For diagonal anisotropy (the common case), store 3 components per + voxel. Khronos already supports per-component `εx, εy, εz` arrays. + +4. **Integration** — Modify `_write_geometry_3d!` in `Geometry.jl` to + detect boundary voxels and apply the averaging procedure. + +**Key references:** +- A. Farjadpour et al., "Improving accuracy by subpixel smoothing in the + finite-difference time domain," *Optics Letters* 31(20), 2006. 
+- Meep `anisotropic_averaging.cpp:90-150` +- Tidy3D `SubpixelSpec` + +**Estimated scope:** ~400 lines in `Geometry.jl`. + +--- + +## 3.6 Non-Uniform Grid Stencils — P1 + +**Why it matters:** The grid plans from §2.5 need corresponding stencil +implementations in the Engine. + +**Current state:** The finite difference stencils (`Timestep.jl:395-400`) +already parameterize on `Δx, Δy, Δz` independently: + +```julia +d_dx!(A, Δx, idx_curl, ix, iy, iz) = inv(Δx) * (A[ix+idx_curl,...] - A[ix,...]) +``` + +**Implementation guidance:** + +1. **Per-axis uniform** — No stencil changes needed. The existing functions + work with independent scalar `Δx, Δy, Δz`. + +2. **Per-cell non-uniform** — Add method overloads: + ```julia + d_dx!(A, Δx::AbstractVector, idx_curl, ix, iy, iz) = + inv(Δx[ix]) * (A[ix+idx_curl,...] - A[ix,...]) + ``` + + Julia's dispatch will generate a specialized GPU kernel that reads + `Δx[ix]` per cell, while the uniform case continues to use a + compile-time constant. + + PML sigma profiles become per-cell (already arrays, so natural). + +**Estimated scope:** ~200 lines in `Timestep.jl`. + +--- + +## 3.7 Halo Exchange (Multi-GPU/MPI) — P1 + +**Why it matters:** The partition plan from §2.1 needs corresponding data +movement kernels. + +**Current state:** 8 `# TODO update halo` sites in `Timestep.jl`. + +**Implementation guidance:** + +Two approaches: + +1. **MPI-based (Meep model)** — Use `MPI.jl` for inter-process + communication. After each field-update substep, exchange ghost-cell + slices between neighboring ranks. Use CUDA-aware MPI to avoid + device-to-host copies. + +2. **KernelAbstractions multi-device (fdtdx model)** — Use Julia's + `Distributed` or `CUDA.jl`'s multi-device API with explicit `copyto!` + for halo exchange. + +Each halo-update site needs: + +```julia +if num_processes > 1 + exchange_halo!(field_array, axis, rank, comm) +end +``` + +where `exchange_halo!` does non-blocking `MPI.Isend` / `MPI.Irecv` of +the boundary slices. 
+ +**Key references:** +- Meep `boundaries.cpp:35-77` (communication sequences) +- fdtdx `sharding.py` + +**Estimated scope:** ~600 lines. + +--- + +## 3.8 CFS-PML Activation (κ and α Parameters) — P1 + +**Present in:** Meep, Tidy3D, fdtdx (all use full CPML) + +**Why it matters:** Standard PML (σ only) can be unstable for evanescent +waves, long-time simulations, and dispersive media. CFS-PML adds κ +(coordinate stretching) and α (complex shift) for improved absorption. + +**Current state:** `BoundaryData` already defines `κBx/κBy/κBz`, +`κDx/κDy/κDz`, `αBx/αBy/αBz`, `αDx/αDy/αDz` +(`DataStructures.jl:145-158`) but they are all commented +`# currently not used`. + +**Implementation guidance:** + +The CFS-PML stretching factor is: + +``` +s(ω) = κ + σ / (α + iω) +``` + +In the time domain, the CPML update becomes (fdtdx `curl.py`): + +``` +b = exp(-(σ/κ + α) * Δt) +a = σ * (b - 1) / (σ + κα) +ψ_new = b * ψ_old + a * (∂F/∂x) +``` + +1. Populate κ and α arrays in `init_boundaries` (`Boundaries.jl:57-110`) + alongside existing σ arrays. + +2. Modify the `generic_curl!` overloads in `Timestep.jl` to use CPML + coefficients `a` and `b` (precomputed from σ, κ, α) instead of raw σ. + +3. Since κ and α fields already exist in the struct, this is mostly + computing them and threading them into existing PML update paths. + +**Key references:** +- J.A. Roden and S.D. Gedney, "Convolutional PML (CPML)," *Microwave and + Optical Technology Letters*, 27(5), 2000. +- fdtdx `perfectly_matched_layer.py:117` + +**Estimated scope:** ~200 lines in `Boundaries.jl` and `Timestep.jl`. + +--- + +## 3.9 PEC and PMC Boundary Walls — P2 + +**Present in:** Meep, Tidy3D (`PECBoundary`, `PMCBoundary`) + +**Implementation guidance:** + +- **PEC:** Zero the tangential E-field components at the boundary. For a + z-boundary: set `Ex = Ey = 0` on the wall. +- **PMC:** Zero the tangential H-field components. For a z-boundary: + set `Hx = Hy = 0` on the wall. 
+ +These are trivial Dirichlet conditions applied after each field update. +`Boundaries.jl:117` has a comment referencing Dirichlet zeroing but no code. + +**Estimated scope:** ~50 lines. + +--- + +## 3.10 Near-to-Far Field Transformation Kernels — P1 + +**Why it matters:** The monitor types from §1.5 need the actual Green's +function integration. + +**Implementation guidance:** + +1. Record DFT of tangential E and H on a closed/open surface. Compute + equivalent currents: `J = n × H`, `M = -n × E`. + +2. Integrate against the free-space Green's function: + - **3D:** Dyadic Green's function with 1/r, 1/r², 1/r³ terms + (Meep `near2far.cpp:133-187`). + - **2D:** Hankel functions (`H₀⁽¹⁾(kr)`, `H₁⁽¹⁾(kr)`). + +3. For far-field only, use the asymptotic form (plane-wave approximation), + reducing to a Fourier transform of the surface currents. + +**Key references:** +- Meep `near2far.cpp` +- Tidy3D `FieldProjectionAngleMonitor`, `FieldProjectionCartesianMonitor`, + `FieldProjectionKSpaceMonitor` +- Taflove & Hagness, Ch. 8 + +**Estimated scope:** ~500 lines. + +--- + +## 3.11 Custom Permeability (μ) Wiring — P2 + +**Present in:** Meep, Tidy3D, fdtdx + +**Current state:** The `Material` struct has `μx, μy, μz` fields. +`Geometry.jl` allocates `μ_inv_x/y/z` arrays but hardcodes `μ_inv = 1.0` +at line 312 with `# todo add μ support`. + +**Implementation guidance:** + +Wire the allocated μ arrays into `GeometryData` and pass them to +`update_H_from_B!` as the `m_inv` parameter (currently receives scalar +`1.0`). The kernel dispatch already handles `m_inv::AbstractArray` vs +`m_inv::Real` via `get_m_inv()` (`Timestep.jl:459-463`), so the kernel +side needs no changes. + +**Estimated scope:** ~30 lines in `Geometry.jl`. + +--- + +## 3.12 Time Monitor Kernel — P2 + +**Current state:** `TimeMonitor` struct and initialization exist +(`Monitors.jl:28-53`) but `#TODO add update and kernel functions for time +monitor` at line 55. 
+ +**Implementation guidance:** + +Write an `update_monitor` dispatch for `TimeMonitorData` that copies the +current field value at the monitor location into the preallocated storage +at the current time index: + +```julia +@kernel function update_time_monitor!(monitor_fields, sim_fields, + offset_x, offset_y, offset_z, time_idx) + ix, iy, iz = @index(Global, NTuple) + monitor_fields[time_idx, ix, iy, iz] = sim_fields[ix+offset_x, iy+offset_y, iz+offset_z] +end +``` + +**Estimated scope:** ~50 lines. + +--- + +## 3.13 Nonlinear Material Update Kernels — P2 + +**Why it matters:** The material types from §1.6 need corresponding +kernels. + +**Implementation guidance:** + +The nonlinear update modifies the E = ε⁻¹D constitutive relation to +include intensity-dependent terms. Meep uses a Padé approximant for +stability (`step_generic.cpp:535-547`): + +``` +u = (1 + c₂ + 2c₃) / (1 + 2c₂ + 3c₃) +E = u * ε⁻¹ * D +``` + +where `c₂ = D * χ² * (ε⁻¹)²` and `c₃ = |D|² * χ³ * (ε⁻¹)³`. + +Allocate per-voxel arrays in `Geometry.jl` for the nonlinear coefficients, +and add a multiplicative factor in `update_field_generic`. + +**Estimated scope:** ~200 lines. + +--- + +## 3.14 Cylindrical Coordinate Kernels — P2 + +**Present in:** Meep (`Dcyl`, angular mode index `m`) + +**Current state:** `abstract type Cylindrical <: Dimension` declared at +`DataStructures.jl:55`, `#TODO add support for cylindrical coordinates` at +`Simulation.jl:52`. + +**Implementation guidance:** + +Cylindrical FDTD replaces Cartesian curl equations with their cylindrical +equivalents (r, φ, z). For structures with rotational symmetry, the +φ-dependence is `exp(imφ)`, reducing 3D to 2D (r, z) with mode index `m`. + +1. New field arrays: `Er, Eφ, Ez, Hr, Hφ, Hz` on a (r, z) grid. +2. Modified curl equations with `1/r` metric factors and `im/r` terms. +3. Special handling at `r = 0` (axis singularity). + +**Key references:** +- Meep cylindrical coordinates implementation +- Taflove & Hagness, Ch. 
12 (body-of-revolution FDTD) + +**Estimated scope:** ~800 lines. + +--- + +## 3.15 TFSF Injection Kernels — P2 + +**Why it matters:** The source type from §1.8 needs corresponding +injection logic. + +**Implementation guidance:** + +TFSF injects a plane wave inside a rectangular volume while maintaining +zero incident field outside. On each face of the TFSF box: + +- Inner side: add incident E and H to the update equations. +- Outer side: subtract them. + +The incident field comes from a 1D auxiliary simulation (or analytical +formula for plane waves) propagated along the k-vector. + +**Key references:** +- Taflove & Hagness, Ch. 5.6 +- Tidy3D `source/field.py` + +**Estimated scope:** ~250 lines. + +--- + +## 3.16 Diffraction Order Computation — P2 + +**Why it matters:** The monitor type from §1.9 needs FFT-based +computation. + +**Implementation guidance:** + +For periodic structures, compute power in each diffraction order by taking +the 2D FFT of the tangential DFT fields on a plane: + +```julia +E_kx_ky = fft2(E_xy) +``` + +Extract power at discrete diffraction order wave vectors +`kx = 2πn/Lx`, `ky = 2πm/Ly`. + +**Estimated scope:** ~150 lines. + +--- + +## 3.17 LDOS Computation — P2 + +**Why it matters:** The monitor type from §1.10 needs the actual +computation. + +**Implementation guidance:** + +``` +LDOS(ω) = -(2/π) * Re[E(ω) · J*(ω)] / |J(ω)|² +``` + +Record the DFT of the E-field at the source location and the DFT of the +source current. + +**Key references:** +- Meep `dft_ldos.cpp:60-80` + +**Estimated scope:** ~100 lines. + +--- + +## 3.18 Stress Tensor Integration — P2 + +**Why it matters:** The monitor type from §1.11 needs the actual tensor +computation. + +**Implementation guidance:** + +Compute the Maxwell stress tensor from DFT fields on a closed surface: + +``` +T_ij = ε₀(E_iE_j - ½δ_ij|E|²) + μ₀⁻¹(H_iH_j - ½δ_ij|H|²) +``` + +Integrate over the surface to get the net force. + +**Estimated scope:** ~200 lines. 
+ +--- + +## 3.19 Harminv (Resonance Extraction) — P2 + +**Present in:** Meep (via external harminv library) + +**Implementation guidance:** + +Use the filter diagonalization method (FDM) or harmonic inversion to +extract resonant frequencies `ωₙ`, decay rates `γₙ`, amplitudes, and +phases from time-domain field data. Implement directly in Julia or wrap +the C harminv library via `ccall`. + +**Estimated scope:** ~200 lines. + +--- + +## 3.20 HDF5 / Checkpoint I/O — P3 + +**Present in:** Meep (HDF5), fdtdx (JAX checkpointing) + +**Implementation:** Use `HDF5.jl` to save/load field arrays, simulation +parameters, and monitor data. Enable checkpoint/restart for long +simulations. + +**Estimated scope:** ~200 lines. + +--- + +## 3.21 1D Simulations — P3 + +**Implementation:** When two cell dimensions are zero, set `ndims = 1` +and allocate only 1D field arrays. Add a `OneD` dimension type. Simplify +the curl to a single finite difference per component. + +**Estimated scope:** ~100 lines. + +--- + +## 3.22 Energy Computation Kernel — P3 + +**Implementation:** Compute `u = ½(ε|E|² + μ|H|²)` at monitor locations. +Either time-domain (instantaneous) or DFT-based. + +**Estimated scope:** ~80 lines. + +--- + +## 3.23 Lumped Element Update Equations — P3 + +**Why it matters:** The types from §1.18 need corresponding modified +update equations. + +**Implementation:** A resistor is equivalent to a conductivity in a single +voxel; inductors and capacitors require auxiliary state variables with +their own update equations embedded in the E-field update step. + +**Estimated scope:** ~150 lines. + +--- + +## 3.24 Time Modulation Kernels — P3 + +**Why it matters:** The types from §1.17 need runtime modulation of ε and +σ. + +**Implementation:** Add a time-dependent multiplicative factor to the +constitutive update. At each timestep, evaluate +`ε_eff(t) = ε₀ * (1 + Δε * sin(ωt + φ))` and update the permittivity +array or pass the modulation factor into the kernel. 
+ +**Estimated scope:** ~100 lines. + +--- + +# Cross-Layer Features + +Some features span multiple layers. They are listed here with references +to their per-layer components. + +| Feature | Frontend | Compiler | Engine | +|---------|----------|----------|--------| +| **Periodic/Bloch BCs** | §1.1 (boundary spec) | — | §3.1 (kernels) | +| **Dispersive Materials** | §1.2 (material types) | §2.2 (kernel selection) | §3.2 (ADE kernels) | +| **Flux Monitors** | §1.3 (monitor type) | — | §3.3 (computation) | +| **Mode Decomposition** | §1.4 (monitor type) | — | §3.4 (overlap) | +| **Near-to-Far Field** | §1.5 (monitor types) | — | §3.10 (Green's fn) | +| **Non-Uniform Grid** | — | §2.5 (planning) | §3.6 (stencils) | +| **Multi-GPU** | — | §2.1 (decomposition) | §3.7 (halo exchange) | +| **Symmetry** | §1.7 (spec) | §2.4 (planning) | — | +| **Adjoint/Inverse Design** | — | §2.7 (compilation) | — | +| **Nonlinear Materials** | §1.6 (types) | §2.2 (kernel selection) | §3.13 (kernels) | +| **TFSF Source** | §1.8 (source type) | — | §3.15 (injection) | +| **Diffraction Monitor** | §1.9 (monitor type) | — | §3.16 (FFT) | +| **LDOS** | §1.10 (monitor type) | — | §3.17 (computation) | +| **Force/Stress** | §1.11 (monitor type) | — | §3.18 (tensor) | +| **Subpixel Averaging** | — | — | §3.5 (geometry init) | +| **CFS-PML** | — | — | §3.8 (coefficients) | +| **Time Modulation** | §1.17 (types) | — | §3.24 (kernels) | +| **Lumped Elements** | §1.18 (types) | — | §3.23 (equations) | + +--- + +# Khronos-Unique Advantages + +These are capabilities that Khronos already has or is uniquely positioned +to implement, which the reference solvers lack: + +### A. Non-Uniform GPU FDTD (Open-Source First) + +No open-source GPU FDTD solver supports non-uniform grids. Khronos's type +system (`Union{Vector{N}, N, Nothing}` for Δx/Δy/Δz) and dispatch-based +kernel specialization make this achievable with minimal code changes. +**Spans:** §2.5 (compiler planning) + §3.6 (engine stencils). 
+ +### B. Composable Kernel Specialization + +The `Nothing`-vs-`AbstractArray` dispatch pattern in `Timestep.jl` +generates ~32+ specialized GPU kernels from a single source. This is +unique and should be preserved as new features are added — each new +physics module should follow the same pattern. The Graph Compiler (§2.2) +formalizes this into an explicit kernel selection step. + +### C. Multi-Backend from Single Source + +KernelAbstractions.jl targets CUDA, Metal, ROCm, oneAPI, and CPU from one +codebase. Ensure all new kernels use `@kernel` and `backend_array()`. + +### D. Source-Level AD via Enzyme.jl + +Unlike fdtdx's manual backward pass, Enzyme.jl can differentiate directly +through GPU kernels. This should be explored once the core feature set +stabilizes. The Graph Compiler (§2.7) manages the forward/adjoint plan +generation. + +### E. User-Extensible Physics + +Julia's open type system means users can define new material models, +source types, and monitors without modifying Khronos source code. The +three-layer architecture reinforces this: users can extend the Frontend +types and the Engine will dispatch to user-defined kernels. + +### F. Runtime Precision Selection + +`Float32` / `Float64` selectable per-simulation. Extend to mixed precision +(e.g., Float32 fields + Float64 monitor accumulation) for further GPU +memory savings without sacrificing output accuracy. + +### G. Language-Agnostic IR (Future) + +The declarative IR (§1.19) enables non-Julia frontends to target the same +compiler and engine. This is unique among FDTD solvers and opens the door +to Python bindings, GUI tools, and config-file-driven workflows.