CORE PRINCIPLE: Python serves as a performance-first API prototype that demonstrates clean design patterns for the subsequent Fortran API implementation, with eventual convergence of the two APIs.
CORE ARCHITECTURE: Expose existing SoA performance through batch-oriented APIs
simple/
├── __init__.py # Public API: ParticleBatch, BatchResults, trace_orbits
├── core/
│ ├── __init__.py
│ ├── particles.py # ParticleBatch class - wraps zstart(5, n_particles)
│ ├── results.py # BatchResults class - wraps result arrays
│ ├── config.py # Config class - replaces namelist globals
│ └── simulation.py # High-level Simulation wrapper
├── backends/
│ ├── __init__.py
│ ├── fortran.py # f90wrap interface to existing pysimple
│ └── native.py # Future: Direct memory interface
├── samplers/
│ ├── __init__.py
│ ├── surface.py # Surface sampling - wraps existing samplers.f90
│ ├── volume.py # Volume sampling
│ └── file.py # File-based initialization
└── utils/
├── __init__.py
├── memory.py # Memory-efficient streaming utilities
└── visualization.py # Batch visualization toolsParticleBatch Class: Direct wrapper around existing Fortran SoA arrays
class ParticleBatch:
    """Batch container exposing the existing Fortran SoA particle arrays.

    Wraps the native ``zstart(5, n_particles)`` allocation so Python callers
    get zero-copy, batch-oriented access to particle state.
    """

    def __init__(self, n_particles: int):
        """Allocate backend storage in the existing zstart(5, n) layout."""
        self.n_particles = n_particles
        # Allocation is delegated to the Fortran backend so the memory
        # layout matches the existing SoA pattern exactly.
        self._backend = simple._backend.allocate_particle_arrays(n_particles)

    @property
    def positions(self) -> np.ndarray:
        """Zero-copy view of the zstart array - shape (5, n_particles)."""
        # Direct memory view - no copying, existing SoA layout.
        return self._backend.get_zstart_view()

    @property
    def coordinates(self) -> Coordinates:
        """Named, structured view of the five coordinate rows."""
        view = self.positions
        return Coordinates(
            s=view[0, :],      # Flux surface coordinate
            theta=view[1, :],  # Poloidal angle
            phi=view[2, :],    # Toroidal angle
            v_par=view[3, :],  # Parallel velocity
            mu=view[4, :]      # Magnetic moment
        )

    def initialize_surface(self, vmec_file: str, s: float, **kwargs):
        """Place particles on a flux surface via the existing samplers.f90."""
        simple._backend.sample_flux_surface(
            self._backend.arrays, vmec_file, s, self.n_particles, **kwargs
        )

    def initialize_volume(self, vmec_file: str, s_min: float, s_max: float):
        """Sample particles in a volume via the existing implementation."""
        simple._backend.sample_volume(
            self._backend.arrays, vmec_file, s_min, s_max, self.n_particles
        )

    def to_dict(self) -> Dict[str, np.ndarray]:
        """Snapshot the batch for serialization/analysis (positions copied)."""
        return {
            'coordinates': self.positions.copy(),
            'n_particles': self.n_particles,
            'metadata': self._backend.get_metadata()
        }

    @classmethod
    def from_dict(cls, data: Dict) -> 'ParticleBatch':
        """Rebuild a batch from a ``to_dict`` snapshot."""
        restored = cls(data['n_particles'])
        restored.positions[:] = data['coordinates']
        return restored

# BatchResults Class: Wrapper around existing result arrays
class BatchResults:
"""Results container wrapping existing Fortran output arrays"""
def __init__(self, backend_results):
self._backend = backend_results
self.n_particles = backend_results.n_particles
@property
def loss_times(self) -> np.ndarray:
"""Access to existing times_lost array - Shape: (n_particles,)"""
return self._backend.get_times_lost_view() # Zero-copy view
@property
def final_positions(self) -> np.ndarray:
"""Access to existing zend array - Shape: (5, n_particles)"""
return self._backend.get_zend_view() # Zero-copy view
@property
def trap_parameter(self) -> np.ndarray:
"""Access to existing trap_par array"""
return self._backend.get_trap_par_view()
@property
def perpendicular_invariant(self) -> np.ndarray:
"""Access to existing perp_inv array"""
return self._backend.get_perp_inv_view()
def confinement_statistics(self) -> ConfinementStats:
"""Vectorized analysis of confinement using existing arrays"""
lost_mask = self.loss_times < np.inf
confined_mask = ~lost_mask
return ConfinementStats(
n_total=self.n_particles,
n_confined=confined_mask.sum(),
n_lost=lost_mask.sum(),
confined_fraction=confined_mask.sum() / self.n_particles,
mean_loss_time=self.loss_times[lost_mask].mean() if lost_mask.any() else np.inf,
loss_time_distribution=np.histogram(self.loss_times[lost_mask], bins=50)
)
def save_hdf5(self, filename: str, append: bool = False):
"""Efficient HDF5 export of all result arrays"""
import h5py
mode = 'a' if append else 'w'
with h5py.File(filename, mode) as f:
grp = f.create_group(f'batch_{len(f.keys())}' if append else 'results')
grp.create_dataset('loss_times', data=self.loss_times, compression='gzip')
grp.create_dataset('final_positions', data=self.final_positions, compression='gzip')
grp.create_dataset('trap_parameter', data=self.trap_parameter, compression='gzip')
grp.create_dataset('perpendicular_invariant', data=self.perpendicular_invariant, compression='gzip')def trace_orbits(
particles: ParticleBatch,
tmax: float,
integrator: str = 'symplectic',
openmp_threads: Optional[int] = None,
memory_efficient: bool = False
) -> BatchResults:
"""
Batch orbit tracing using existing OpenMP parallelized implementation
Performance: <5% overhead vs direct Fortran execution
Memory: Zero-copy access to existing SoA arrays
Scaling: Leverages existing OpenMP thread-per-particle pattern
"""
# Configure OpenMP threads (uses existing OpenMP infrastructure)
if openmp_threads is not None:
os.environ['OMP_NUM_THREADS'] = str(openmp_threads)
# Set up configuration using existing parameter system
config = {
'tmax': tmax,
'integmode': _integrator_map[integrator],
'ntestpart': particles.n_particles,
'memory_efficient': memory_efficient
}
# Call existing optimized Fortran implementation
# Maps directly to existing simple_main.f90 structure
result_arrays = simple._backend.run_simulation(
particles._backend.arrays, # Uses existing zstart allocation
config
)
return BatchResults(result_arrays)class ParticleBatchStream:
"""Memory-efficient streaming for processing millions of particles"""
def __init__(self, total_particles: int, batch_size: int = 100_000):
self.total_particles = total_particles
self.batch_size = batch_size
self.current_batch = 0
def __iter__(self):
return self
def __next__(self) -> ParticleBatch:
start_idx = self.current_batch * self.batch_size
if start_idx >= self.total_particles:
raise StopIteration
end_idx = min(start_idx + self.batch_size, self.total_particles)
actual_size = end_idx - start_idx
# Create batch using existing allocation patterns
batch = ParticleBatch(actual_size)
self.current_batch += 1
return batch
def process_large_simulation(
vmec_file: str,
n_total: int,
tmax: float,
batch_size: int = 100_000,
output_file: str = 'results.h5'
) -> StreamResults:
"""
Process millions of particles in memory-efficient batches
Memory Usage: Constant - independent of total particle count
Performance: Linear scaling with existing OpenMP optimization
Output: Streaming HDF5 for large dataset handling
"""
stream = ParticleBatchStream(n_total, batch_size)
for i, batch in enumerate(stream):
# Initialize using existing sampling
batch.initialize_surface(vmec_file, s=0.9)
# Process using existing optimized algorithms
results = trace_orbits(batch, tmax=tmax)
# Stream results to disk
results.save_hdf5(output_file, append=(i > 0))
# Clean memory between batches
del batch, results
return StreamResults(output_file, n_total)class GPUParticleBatch(ParticleBatch):
"""GPU-optimized batch using existing SoA layout for future CUDA/OpenCL"""
def __init__(self, n_particles: int, device: str = 'cuda:0'):
super().__init__(n_particles)
self.device = device
# Future: GPU memory allocation preserving SoA layout
def to_gpu(self) -> 'GPUParticleBatch':
"""Transfer to GPU maintaining SoA memory layout"""
# Existing SoA layout is optimal for coalesced GPU memory access
try:
import cupy as cp
gpu_positions = cp.asarray(self.positions) # Efficient transfer
return GPUParticleBatch._from_gpu_array(gpu_positions)
except ImportError:
raise RuntimeError("CuPy required for GPU operations")
def from_gpu(self) -> ParticleBatch:
"""Efficient GPU->CPU transfer maintaining SoA layout"""
cpu_positions = cupy.asnumpy(self._gpu_positions)
batch = ParticleBatch(self.n_particles)
batch.positions[:] = cpu_positions
return batch
# Future GPU kernel interface design
def trace_orbits_gpu(particles: GPUParticleBatch, **kwargs) -> 'GPUBatchResults':
"""
Future GPU implementation leveraging existing SoA layout
Memory Pattern: positions[5, n_particles] - optimal for coalesced access
Thread Mapping: One thread per particle - matches existing OpenMP pattern
Algorithm: Identical to existing symplectic integrators
"""
# GPU kernels will operate on existing SoA memory layout
# positions[coordinate, particle] allows coalesced memory access
# Each thread processes one particle - natural mapping from OpenMP
pass# backends/fortran.py - Wrapper around existing pysimple module
class FortranBackend:
    """Integration layer over the existing f90wrap pysimple module."""

    def __init__(self):
        try:
            import pysimple  # existing f90wrap-generated module
        except ImportError:
            raise RuntimeError("pysimple module not available - build with f90wrap")
        self.pysimple = pysimple

    def allocate_particle_arrays(self, n_particles: int):
        """Allocate zstart storage through the existing Fortran routines."""
        # Existing params_mod allocation path: set the count, then allocate.
        self.pysimple.params_mod.ntestpart = n_particles
        self.pysimple.alloc_arrays()  # existing allocation routine
        return FortranArrayWrapper(self.pysimple, n_particles)

    def run_simulation(self, arrays, config):
        """Run the existing simple_main loop with the given parameters."""
        # Push parameters through the existing namelist-backed module vars.
        for name, value in config.items():
            setattr(self.pysimple.params_mod, name, value)
        self.pysimple.run_simple()  # existing OpenMP-parallelized execution
        return FortranResultWrapper(self.pysimple)


class FortranArrayWrapper:
    """Zero-copy wrapper around the existing Fortran arrays."""

    def __init__(self, pysimple_module, n_particles):
        self.pysimple = pysimple_module
        self.n_particles = n_particles

    def get_zstart_view(self) -> np.ndarray:
        """Direct f90wrap view of the existing zstart array - shape (5, ntestpart)."""
        return self.pysimple.params_mod.zstart

    def get_zend_view(self) -> np.ndarray:
        """Direct f90wrap view of the existing zend array."""
        return self.pysimple.params_mod.zend

    def get_times_lost_view(self) -> np.ndarray:
        """Direct f90wrap view of the existing times_lost array."""
        return self.pysimple.params_mod.times_lost


class PerformanceValidator:
    """Validate that the Python API preserves the existing HPC performance."""

    def validate_soa_layout(self, batch: ParticleBatch):
        """Verify SoA memory-layout efficiency of a batch."""
        positions = batch.positions
        # NOTE(review): f90wrap exposes Fortran (column-major) arrays, which
        # are typically F-contiguous -- confirm this C-contiguity expectation
        # against the actual view returned by the backend.
        assert positions.flags.c_contiguous, "SoA layout must be C-contiguous"
        # Cache-friendly column access (existing optimization).
        self._benchmark_column_access(positions)
        # Memory strides must match the existing Fortran layout.
        self._validate_memory_strides(positions)

    def validate_openmp_scaling(self, particles: ParticleBatch):
        """Verify the existing OpenMP performance is preserved."""
        for threads in [1, 2, 4, 8]:
            execution_time = self._benchmark_threads(particles, threads)
            # Scaling efficiency should match the existing implementation.

    def validate_memory_overhead(self, n_particles: int):
        """Ensure minimal memory overhead vs the existing implementation."""
        baseline = self._measure_fortran_memory(n_particles)
        python_api = self._measure_python_memory(n_particles)
        overhead = (python_api - baseline) / baseline
        assert overhead < 0.05, f"Memory overhead too high: {overhead:.1%}"

    def golden_record_comparison(self):
        """Require bit-identical results vs the existing implementation."""
        fortran_results = self._run_fortran_reference()
        python_results = self._run_python_api()
        np.testing.assert_array_equal(
            fortran_results['times_lost'],
            python_results.loss_times,
            "Results must match existing implementation exactly"
        )

# IMPLEMENTATION TIMELINE:
- Week 1: Core ParticleBatch and BatchResults classes with f90wrap integration
- Week 2: Memory-efficient streaming and performance validation framework
- Week 3: GPU-ready architecture and comprehensive testing
- Week 4: Documentation, optimization, and golden record validation
- AUTOMATIC TRANSLATION: `typename_t` ↔ PEP8 convention mapping
- TRANSPARENT: Users see only their preferred convention
- SEAMLESS INTEGRATION: Clean interface between Python and Fortran
- PATTERN IMPLEMENTATION: Apply proven Python patterns to Fortran
- NAMING CONVENTION: Fortran uses the `typename_t` convention (`config_t`, `simulation_t`, `orbit_t`)
- ATOMIC STEPS: Safe incremental refactoring preserving `simple.x` functionality
- GOLDEN RECORD: Validation ensures no regressions
- END STATE: Python becomes lightweight wrapper over clean Fortran API
- PERFORMANCE: Remains in Fortran, Python provides convenience
- API CONVERGENCE: Both interfaces expose same clean patterns
- Performance: Highly optimized symplectic integrators with OpenMP parallelization
- Scientific Accuracy: Proven algorithms for stellarator particle tracing
- Robustness: Stable codebase with extensive real-world usage
! Current: Global module variables
module params
real(dp) :: tmax, dtau
integer :: nparticles, field_type
end module
! Target: Encapsulated configuration
type :: config_t
real(dp) :: tmax, dtau
integer :: nparticles, field_type
contains
procedure :: from_namelist
procedure :: validate
end type! Current: Rigid sequence with side effects
call read_config()
call init_field()
call params_init()
call run()
! Target: Clean encapsulated interface
type(simulation_t) :: sim
sim = create_simulation(config)
results = sim%run()# Current: Complex multi-step setup
import pysimple
pysimple.read_config('simple.in')
pysimple.init_field()
pysimple.params_init()
pysimple.run()
# Target: Clean high-level interface
import pysimple
config = {'vmec_file': 'wout.nc', 'nparticles': 10000}
sim = simple.Simulation(config)
results = sim.run()class Config:
def from_dict(self, config_dict): pass
def to_namelist_string(self): pass
class Simulation:
def __init__(self, config): pass
def run(self): pass
class Orbit:
def trace(self, tmax): passtype :: config_t
contains
procedure :: from_dict => config_from_dict
procedure :: to_namelist_string => config_to_namelist
end type
type :: simulation_t
contains
procedure :: init => simulation_init
procedure :: run => simulation_run
end type
type :: orbit_t
contains
procedure :: trace => orbit_trace
end type# Name translator handles convention mapping
class NameTranslator:
    """Maps PEP8 Python names onto the Fortran ``typename_t`` convention."""

    def python_to_fortran(self, name):
        """Translate a class name: Config -> config_t, Orbit -> orbit_t."""
        return f"{name.lower()}_t"

    def method_python_to_fortran(self, class_name, method_name):
        """Translate a method: (Simulation, run) -> simulation_run."""
        return f"{class_name.lower()}_{method_name}"

# simple/
├── __init__.py # Public API (Config, Simulation, Orbit)
├── config.py # Configuration management prototype
├── simulation.py # Batch simulation interface
├── orbit.py # Single orbit analysis
├── translation.py # Name convention translator
├── _backend.py # f90wrap interface to Fortran
└── visualization.py # Built-in plotting
src/api/
├── simple_config.f90 # config_t derived type
├── simple_simulation.f90 # simulation_t derived type
├── simple_orbit.f90 # orbit_t derived type
├── simple_api.f90 # High-level API functions
└── simple_translation.f90 # Name translation utilities
CRITICAL INSIGHT: Current Fortran implementation is ALREADY HPC/SoA optimized:
- `zstart(1:5, ntestpart)` - Structure of Arrays pattern ✓
- OpenMP particle parallelization ✓
- Cache-friendly column access ✓
- Million+ particle capability ✓
DESIGN GOAL: Create batch-oriented APIs around existing high-performance core.
# LEVERAGES EXISTING: SoA data structures, OpenMP parallelization
import pysimple
# Batch processing (primary use case)
particles = simple.ParticleBatch.from_vmec_surface(
vmec_file='wout.nc',
n_particles=100_000, # Leverages existing SoA: zstart(5, n_particles)
surface_s=0.9
)
# GPU-ready batch operations
results = simple.trace_orbits(
particles,
tmax=1000.0,
integrator='symplectic_midpoint' # Uses existing optimized integrators
)
# Direct access to underlying SoA arrays
positions = results.final_positions # Shape: (5, n_particles) - existing zstart format
loss_times = results.loss_times # Shape: (n_particles,) - existing times_lost format# LEVERAGE EXISTING: SoA layout perfect for GPU acceleration
class ParticleBatch:
def to_gpu(self) -> 'GPUParticleBatch':
# Existing SoA layout transfers efficiently to GPU
gpu_positions = cupy.asarray(self.positions) # Shape: (5, n_particles)
return GPUParticleBatch(gpu_positions)
def from_gpu(self) -> 'ParticleBatch':
# Efficient GPU->CPU transfer maintaining SoA layout
cpu_positions = cupy.asnumpy(self._gpu_positions)
return ParticleBatch(cpu_positions)
# Future GPU kernel interface (leverages existing SoA)
def trace_orbits_gpu(particles: GPUParticleBatch, **kwargs):
# GPU kernels operate on existing SoA memory layout
# positions[5, n_particles] - coalesced memory access
# One thread per particle - maps to existing OpenMP pattern
pass# STREAM PROCESSING: Handle memory constraints efficiently
def process_large_simulation(n_total: int, batch_size: int = 100_000):
"""Process millions of particles in memory-efficient batches"""
results = []
for start_idx in range(0, n_total, batch_size):
end_idx = min(start_idx + batch_size, n_total)
batch_size_actual = end_idx - start_idx
# Create batch using existing SoA allocation patterns
batch = simple.ParticleBatch(n_particles=batch_size_actual)
batch.initialize_surface(vmec_file='wout.nc', s=0.9)
# Use existing OpenMP parallelization
batch_results = simple.trace_orbits(batch, tmax=1000.0)
results.append(batch_results)
# Clean memory between batches
del batch, batch_results
return combine_batch_results(results)- Python API must add <5% overhead to Fortran execution
- Zero-copy array access to existing SoA structures
- Configuration conversion cached for repeated use
- Memory management aligned with existing Fortran patterns
# Clean integration with scientific Python ecosystem
import pysimple
import numpy as np
import matplotlib.pyplot as plt
config = simple.Config.from_file('simple.in')
config.nparticles = 5000
sim = simple.Simulation(config)
results = sim.run()
# Direct NumPy array access to results
confined_fraction = results.confined_fraction # NumPy array
loss_times = results.loss_times # NumPy array
# Built-in visualization
results.plot_confinement()
results.plot_poincare_section()- Impact: HIGH - Core dependency for zero-copy array access
- Probability: MEDIUM - f90wrap can be brittle with complex Fortran types
- Mitigation:
- Early prototype with minimal f90wrap interface to validate feasibility
- Fallback: Native ctypes interface if f90wrap proves problematic
- Modular backend design allows swapping f90wrap for alternative approaches
- Test with current pysimple module first to understand existing limitations
- Impact: HIGH - Performance depends on zero-copy access to Fortran arrays
- Probability: LOW - f90wrap generally preserves memory layout correctly
- Mitigation:
- Comprehensive memory layout validation tests in Performance Validation Framework
- Memory stride verification to ensure SoA patterns are preserved
- Fallback: Memory copying with performance warning if zero-copy fails
- Early validation with small test arrays before scaling to full implementation
- Impact: MEDIUM - <5% overhead requirement is strict
- Probability: MEDIUM - Python function call overhead may accumulate
- Mitigation:
- Minimize Python-to-Fortran call frequency (batch operations, not per-particle)
- Profile critical paths early and optimize hotspots
- Use NumPy vectorized operations where possible
- Consider Cython optimization for performance-critical wrapper code
- Impact: MEDIUM - Existing OpenMP may not be thread-safe with Python
- Probability: LOW - Existing pysimple likely already handles this correctly
- Mitigation:
- Validate OpenMP behavior through existing pysimple interface first
- Implement thread-local storage for any Python state if needed
- Document OpenMP thread count configuration clearly
- Test with various thread counts to ensure deterministic results
- Impact: MEDIUM - May delay initial implementation
- Probability: MEDIUM - Complex f90wrap interfaces require expertise
- Mitigation:
- Start with existing pysimple module analysis to understand current patterns
- Allocate extra time in Week 1 for f90wrap investigation
- Consult f90wrap documentation and examples early
- Consider bringing in f90wrap expertise if needed
- Impact: MEDIUM - Golden record validation may be time-consuming
- Probability: MEDIUM - Ensuring bit-identical results is non-trivial
- Mitigation:
- Develop automated golden record test framework early
- Start with simple test cases and build complexity gradually
- Use existing examples/test cases as validation baseline
- Plan extra time in Week 4 for comprehensive validation
- Impact: MEDIUM - Could limit practical usability for large simulations
- Probability: LOW - Proper resource management should prevent this
- Mitigation:
- Implement explicit resource cleanup in ParticleBatch.del()
- Memory profiling tests for long-running batch processing
- Clear documentation of memory management best practices
- Stress testing with multiple batch creation/destruction cycles
- Impact: MEDIUM - Poor API design could limit adoption
- Probability: LOW - Design is based on proven batch processing patterns
- Mitigation:
- Early user feedback through prototype demonstrations
- Comprehensive documentation with examples
- Follow established scientific Python API patterns (NumPy, SciPy style)
- Iterative refinement based on initial usage
- Benefit: HIGH - SoA layout is optimal for future GPU implementation
- Effort: LOW - Architecture already designed for GPU compatibility
- Implementation:
- Current SoA design provides perfect foundation for CUDA/OpenCL
- Memory layout enables coalesced GPU memory access patterns
- Thread-per-particle mapping natural for GPU kernels
- Zero-copy GPU memory transfers with CuPy integration
- Benefit: MEDIUM - NumPy vectorized operations on result arrays
- Effort: LOW - Natural extension of batch processing design
- Implementation:
- Batch statistics calculations using vectorized NumPy operations
- Efficient post-processing of large result arrays
- Integration with SciPy for advanced statistical analysis
- Parallel analysis across particle batches
- Benefit: MEDIUM - More efficient memory usage for large simulations
- Effort: MEDIUM - Requires careful design of streaming interfaces
- Implementation:
- Memory-mapped result files for out-of-core processing
- Lazy loading of particle data from HDF5 files
- Memory pool allocation for repeated batch processing
- Automatic garbage collection tuning for large datasets
- Benefit: HIGH - Seamless integration with broader scientific Python ecosystem
- Effort: LOW - Natural result of clean API design
- Implementation:
- Direct NumPy array interfaces enable matplotlib, seaborn visualization
- Pandas DataFrame export for data analysis workflows
- Jupyter notebook integration with rich display methods
- Integration with scientific workflow tools (Dask, Zarr)
- Benefit: MEDIUM - More flexible and powerful parameter management
- Effort: LOW - Natural extension of batch-oriented design
- Implementation:
- YAML/JSON configuration files replace rigid namelist format
- Parameter sweeps and optimization workflows
- Configuration validation and documentation
- Integration with hyperparameter optimization libraries
- Benefit: MEDIUM - Better data interchange and analysis capabilities
- Effort: MEDIUM - Requires implementing multiple format handlers
- Implementation:
- HDF5 with rich metadata for large dataset handling
- NetCDF integration for climate/fusion community standards
- Parquet format for big data analytics workflows
- Direct integration with data visualization tools
- Benefit: HIGH - Enable AI/ML workflows on particle data
- Effort: MEDIUM - Requires ML-friendly data formats and interfaces
- Implementation:
- PyTorch/TensorFlow tensor interfaces for neural network training
- Feature extraction pipelines for orbit classification
- Transfer learning from simulation to experimental data
- Reinforcement learning for control optimization
- Benefit: MEDIUM - Interactive exploration of large simulation datasets
- Effort: MEDIUM - Requires efficient streaming visualization
- Implementation:
- Bokeh/Plotly integration for interactive web visualizations
- Real-time Poincaré section plotting during simulation
- 3D trajectory visualization with ParaView integration
- Progressive rendering for million-particle datasets
- Benefit: MEDIUM - Enable large-scale distributed simulations
- Effort: HIGH - Requires distributed computing framework design
- Implementation:
- Dask integration for cluster computing
- AWS/GCP batch processing workflows
- Container-based deployment with optimal dependencies
- Automatic scaling based on computational demand
Each implementation step must:
- Preserve existing `simple.x` functionality exactly
- Pass all golden record tests unchanged
- Allow rollback if issues discovered
- Be reviewable in isolation
- Fortran: Strict `typename_t` convention throughout
- Python: Strict PEP8 convention throughout
- Translation: Automatic and transparent
- Documentation: Consistent with target convention
# Verify existing SoA patterns are preserved and accessible
def validate_soa_performance():
    """Verify existing SoA patterns are preserved and accessible."""
    import pysimple
    import numpy as np
    import time

    # Test 1: SoA memory layout efficiency.
    n_particles = 1_000_000
    batch = simple.ParticleBatch(n_particles)

    # Verify contiguous memory layout (existing pattern).
    positions = batch.positions  # Shape: (5, n_particles)
    assert positions.flags.c_contiguous, "SoA layout must be contiguous"

    # Test 2: Column access performance (existing cache pattern).
    start_time = time.time()
    for i in range(min(1000, n_particles)):
        particle_coords = positions[:, i]  # per-particle column access
    column_access_time = time.time() - start_time

    # Test 3: Row access performance (should be slower).
    start_time = time.time()
    for j in range(5):
        coord_array = positions[j, :1000]  # per-coordinate row access
    row_access_time = time.time() - start_time

    # NOTE(review): for a C-contiguous (5, n) array the rows are the
    # contiguous axis, so this expectation depends on the layout actually
    # returned by the backend -- confirm before relying on this check.
    assert column_access_time < row_access_time * 0.5, "Column access should be faster (SoA benefit)"


# Test 4: OpenMP scaling validation
def validate_openmp_scaling():
    """Verify existing OpenMP performance is preserved."""
    import pysimple
    import os
    import time            # BUG FIX: time.time() was used but never imported
    import numpy as np     # BUG FIX: np.testing was used but never imported

    results_1_thread = {}
    results_8_threads = {}
    # NOTE(review): ``particles`` is not defined in this function --
    # presumably a module-level fixture batch; confirm and pass explicitly.
    # Test with 1 thread.
    os.environ['OMP_NUM_THREADS'] = '1'
    start_time = time.time()
    result_1 = simple.trace_orbits(particles, tmax=100.0)
    results_1_thread['time'] = time.time() - start_time
    # Test with 8 threads.
    os.environ['OMP_NUM_THREADS'] = '8'
    start_time = time.time()
    result_8 = simple.trace_orbits(particles, tmax=100.0)
    results_8_threads['time'] = time.time() - start_time
    # Scaling efficiency relative to an ideal 8x speedup.
    scaling_efficiency = results_1_thread['time'] / (8 * results_8_threads['time'])
    assert scaling_efficiency > 0.6, f"OpenMP scaling too low: {scaling_efficiency}"
    # Results must be identical (deterministic with existing algorithms).
    np.testing.assert_array_equal(result_1.loss_times, result_8.loss_times)


# Test 5: Memory efficiency validation
def validate_memory_efficiency():
    """Ensure no memory overhead vs existing implementation."""
    import psutil
    import pysimple

    # Baseline memory usage before any batch allocation.
    process = psutil.Process()
    baseline_memory = process.memory_info().rss

    # Create a batch using the new API.
    n_particles = 100_000
    batch = simple.ParticleBatch(n_particles)
    api_memory = process.memory_info().rss

    # Expected memory for the SoA arrays: 5 * n * 8 bytes for positions
    # plus one float64 array each for the result quantities.
    expected_particle_memory = 5 * n_particles * 8   # positions
    expected_particle_memory += n_particles * 8      # loss_times
    expected_particle_memory += n_particles * 8      # trap_par
    expected_particle_memory += n_particles * 8      # perp_inv

    actual_memory_increase = api_memory - baseline_memory
    overhead_ratio = actual_memory_increase / expected_particle_memory
    assert overhead_ratio < 1.1, f"Memory overhead too high: {overhead_ratio:.2f}"


def validate_golden_record_performance():
    """Compare against existing Fortran implementation performance."""
    # Performance benchmarks must match existing simple.x within 5%.
    fortran_time = benchmark_fortran_simple()  # existing implementation
    python_time = benchmark_python_api()       # new API
    performance_ratio = python_time / fortran_time
    assert performance_ratio < 1.05, f"Python API too slow: {performance_ratio:.3f}x"
    # Results must be bit-identical.
    fortran_results = load_fortran_results()
    python_results = run_python_equivalent()
    np.testing.assert_array_equal(
        fortran_results['times_lost'],
        python_results.loss_times,
        "Results must be identical to existing implementation"
    )


def benchmark_memory_scaling():
    """Validate memory usage scales linearly with particle count."""
    particle_counts = [10_000, 100_000, 1_000_000]
    memory_usage = []
    for n in particle_counts:
        process = psutil.Process()
        before = process.memory_info().rss
        batch = simple.ParticleBatch(n)
        batch.initialize_surface('wout.nc', s=0.9)
        after = process.memory_info().rss
        memory_usage.append((after - before) / n)  # per-particle memory
        del batch
    # Per-particle memory should be constant (linear scaling).
    assert all(abs(m - memory_usage[0]) / memory_usage[0] < 0.1 for m in memory_usage), \
        "Memory scaling must be linear"


def validate_gpu_readiness():
    """Ensure SoA layout is optimal for future GPU acceleration."""
    import pysimple
    batch = simple.ParticleBatch(100_000)
    positions = batch.positions  # Shape: (5, n_particles)
    # Test 1: Contiguous memory for efficient GPU transfer.
    assert positions.flags.c_contiguous, "Must be C-contiguous for GPU transfer"
    # Test 2: Coalesced memory access pattern -- GPU threads would read
    # positions[:, thread_id].
    for thread_block in range(0, 1000, 256):  # simulate GPU thread blocks
        thread_ids = range(thread_block, min(thread_block + 256, 1000))
        particle_data = positions[:, thread_ids]  # coalesced access pattern
        assert particle_data.shape[0] == 5, "Coordinate access must be contiguous"
    # Test 3: Memory layout matches GPU optimization requirements.
    stride_coords = positions.strides[0]     # stride between coordinates
    stride_particles = positions.strides[1]  # stride between particles
    # NOTE(review): these values assume C-contiguous float64 storage --
    # confirm against the layout actually exposed by f90wrap.
    assert stride_particles == 8, "Particle stride optimal for GPU coalescing"
    assert stride_coords == 8 * positions.shape[1], "Coordinate stride correct for SoA"

# - Performance: <5% overhead vs direct Fortran
- Usability: Single orbit in <5 lines of code
- Isolation: Multiple simulations without interference
- Golden Record: Exact equivalence with existing implementation
- Transparent: Users unaware of translation layer
- Complete: All API elements correctly mapped
- Consistent: Predictable naming patterns
- Performance: No measurable translation overhead
- API Parity: Fortran matches Python interface patterns
- Convention: Consistent `typename_t` throughout
- Compatibility: `simple.x` unchanged externally
- Quality: Modern Fortran patterns applied
- Thin Wrapper: Python primarily delegates to Fortran
- Performance: Near-native Fortran performance through Python
- Maintenance: Single source of truth for API patterns
- User Experience: Consistent interface across languages
- Python prototype validates patterns before Fortran changes
- Atomic refactoring allows safe rollback
- Golden record testing prevents regressions
- User feedback guides API design
- Python enables rapid API iteration
- Patterns proven before costly Fortran implementation
- Incremental value delivery throughout process
- Clear blueprint for Fortran modernization
- Modern API patterns in both languages
- Consistent naming conventions
- Clean separation of concerns
- Foundation for future enhancements
This design creates a pathway from the current Fortran-centric codebase to a modern, dual-language API while preserving performance and maintaining backward compatibility throughout the transition.