From e9b75d6373789d93186447d064af84143b73b74f Mon Sep 17 00:00:00 2001
From: arduano <leonid.shchurov@gmail.com>
Date: Mon, 23 Mar 2026 15:42:50 +1100
Subject: [PATCH] math: close remaining scalar holdout waves

---
 src/math/f64/core.rs                 | 9 ++++++---
 src/math/families/binary_misc/mod.rs | 5 ++++-
 src/math/families/core.rs            | 6 ++++--
 3 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/src/math/f64/core.rs b/src/math/f64/core.rs
index 28f743f..91db3f5 100644
--- a/src/math/f64/core.rs
+++ b/src/math/f64/core.rs
@@ -9,7 +9,6 @@ const F64_LOG_NORM_MANTISSA: i64 = 0x3FE0_0000_0000_0000u64 as i64;
 const F64_EXPONENT_BIAS_ADJUST: i64 = 1022;
 const F64_EXP_LN2_HI: f64 = 6.931_471_803_691_238e-1;
 const F64_EXP_LN2_LO: f64 = 1.908_214_929_270_587_7e-10;
-
 // DECISION(2026-03-23): KEEP_SIMD_PORTABLE
 // Function(s): f64 log2_u35 / exp2_u35 / ln_u35 / exp_u35
 // Why kept:
@@ -21,8 +20,12 @@ const F64_EXP_LN2_LO: f64 = 1.908_214_929_270_587_7e-10;
 // DECISION(2026-03-23): KEEP_SCALAR_REFERENCE
 // Function(s): f64 sin_u35 / cos_u35 / tan_u35
 // Why scalar:
-// - the previous portable trig fast path still lagged native scalar on this host
-// - the implementation was reverted to scalar-reference while preserving family ownership
+// - the final retry of the old portable trig kernel failed the u35 contract around pi boundaries,
+//   tan-pole neighborhoods, and moderate finite lanes before its speed could justify keeping it
+// - the refreshed scalar-reference recheck still leaves runtime-selected throughput behind native
+//   scalar on this host (`sin`: about 17.03 ms vs 15.97 ms, `cos`: about 16.58 ms vs 15.75 ms,
+//   `tan`: about 20.85 ms vs 20.19 ms)
+// - native scalar remains the honest default while family ownership stays localized here
 // Revisit when:
 // - a stronger range-reduction strategy or cheaper trig kernel appears
 
diff --git a/src/math/families/binary_misc/mod.rs b/src/math/families/binary_misc/mod.rs
index aa2e21d..efab55c 100644
--- a/src/math/families/binary_misc/mod.rs
+++ b/src/math/families/binary_misc/mod.rs
@@ -6,7 +6,10 @@ use crate::{Simd, SimdFloat32, SimdFloat64};
 // DECISION(2026-03-23): KEEP_SCALAR_REFERENCE
 // Function(s): f32 fmod
 // Why scalar:
-// - local benches still favor native scalar and there is no convincing portable SIMD default yet
+// - the final combined-wave recheck still leaves runtime-selected fmod behind native scalar on this
+//   host (about 8.10 ms vs 7.62 ms)
+// - there is still no convincing portable SIMD default that beats scalar without adding quotient-range
+//   complexity
 // - the public trait entry point stays stable while the honest implementation remains scalar-reference
 // Revisit when:
 // - quotient-range handling becomes cheap enough for a worthwhile portable kernel
diff --git a/src/math/families/core.rs b/src/math/families/core.rs
index fc15e2c..e9c8300 100644
--- a/src/math/families/core.rs
+++ b/src/math/families/core.rs
@@ -4,8 +4,10 @@ use crate::{Simd, SimdFloat32, SimdFloat64};
 // DECISION(2026-03-23): KEEP_SCALAR_REFERENCE
 // Function(s): f32 ln_u35 / exp_u35
 // Why scalar:
-// - local benches keep the current runtime-selected path below native scalar
-// - retaining the family entry points still preserves structure for later retries
+// - the final combined-wave recheck still leaves runtime-selected ln_u35 and exp_u35 slower than native
+//   scalar on this host (`ln`: about 2.72 ms vs 2.46 ms, `exp`: about 2.34 ms vs 2.11 ms)
+// - these contracts are stricter than the relaxed portable f32 log2_u35 / exp2_u35 pieces they would
+//   naturally compose from, so there is no cheap honest rescue today
 // Revisit when:
 // - a better shared f32 log/exp kernel exists