From e9b75d6373789d93186447d064af84143b73b74f Mon Sep 17 00:00:00 2001
From: arduano
Date: Mon, 23 Mar 2026 15:42:50 +1100
Subject: [PATCH] math: close remaining scalar holdout waves

---
 src/math/f64/core.rs                 | 9 ++++++---
 src/math/families/binary_misc/mod.rs | 5 ++++-
 src/math/families/core.rs            | 6 ++++--
 3 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/src/math/f64/core.rs b/src/math/f64/core.rs
index 28f743f..91db3f5 100644
--- a/src/math/f64/core.rs
+++ b/src/math/f64/core.rs
@@ -9,7 +9,6 @@ const F64_LOG_NORM_MANTISSA: i64 = 0x3FE0_0000_0000_0000u64 as i64;
 const F64_EXPONENT_BIAS_ADJUST: i64 = 1022;
 const F64_EXP_LN2_HI: f64 = 6.931_471_803_691_238e-1;
 const F64_EXP_LN2_LO: f64 = 1.908_214_929_270_587_7e-10;
-
 // DECISION(2026-03-23): KEEP_SIMD_PORTABLE
 // Function(s): f64 log2_u35 / exp2_u35 / ln_u35 / exp_u35
 // Why kept:
@@ -21,8 +20,12 @@ const F64_EXP_LN2_LO: f64 = 1.908_214_929_270_587_7e-10;
 // DECISION(2026-03-23): KEEP_SCALAR_REFERENCE
 // Function(s): f64 sin_u35 / cos_u35 / tan_u35
 // Why scalar:
-// - the previous portable trig fast path still lagged native scalar on this host
-// - the implementation was reverted to scalar-reference while preserving family ownership
+// - the final retry of the old portable trig kernel failed the u35 contract around pi boundaries,
+//   tan-pole neighborhoods, and moderate finite lanes before it could justify a speed keep
+// - the refreshed scalar-reference recheck still leaves runtime-selected throughput behind native
+//   scalar on this host (`sin`: about 17.03 ms vs 15.97 ms, `cos`: about 16.58 ms vs 15.75 ms,
+//   `tan`: about 20.85 ms vs 20.19 ms)
+// - native scalar still remains the honest default while family ownership stays localized here
 // Revisit when:
 // - a stronger range-reduction strategy or cheaper trig kernel appears
 
diff --git a/src/math/families/binary_misc/mod.rs b/src/math/families/binary_misc/mod.rs
index aa2e21d..efab55c 100644
--- a/src/math/families/binary_misc/mod.rs
+++ b/src/math/families/binary_misc/mod.rs
@@ -6,7 +6,10 @@ use crate::{Simd, SimdFloat32, SimdFloat64};
 // DECISION(2026-03-23): KEEP_SCALAR_REFERENCE
 // Function(s): f32 fmod
 // Why scalar:
-// - local benches still favor native scalar and there is no convincing portable SIMD default yet
+// - the final combined-wave recheck still leaves runtime-selected fmod behind native scalar on this
+//   host (about 8.10 ms vs 7.62 ms)
+// - there is still no convincing portable SIMD default that beats scalar without adding quotient-range
+//   complexity
 // - the public trait entry point stays stable while the honest implementation remains scalar-reference
 // Revisit when:
 // - quotient-range handling becomes cheap enough for a worthwhile portable kernel
diff --git a/src/math/families/core.rs b/src/math/families/core.rs
index fc15e2c..e9c8300 100644
--- a/src/math/families/core.rs
+++ b/src/math/families/core.rs
@@ -4,8 +4,10 @@ use crate::{Simd, SimdFloat32, SimdFloat64};
 // DECISION(2026-03-23): KEEP_SCALAR_REFERENCE
 // Function(s): f32 ln_u35 / exp_u35
 // Why scalar:
-// - local benches keep the current runtime-selected path below native scalar
-// - retaining the family entry points still preserves structure for later retries
+// - the final combined-wave recheck still keeps runtime-selected ln_u35 and exp_u35 below native scalar
+//   on this host (`ln`: about 2.72 ms vs 2.46 ms, `exp`: about 2.34 ms vs 2.11 ms)
+// - these contracts are stricter than the relaxed portable f32 log2_u35 / exp2_u35 pieces they would
+//   naturally compose from, so there is no cheap honest rescue today
 // Revisit when:
 // - a better shared f32 log/exp kernel exists
 