From d7259dbc71a9134d9c274aec92949c4d877b0568 Mon Sep 17 00:00:00 2001
From: arduano <leonid.shchurov@gmail.com>
Date: Mon, 23 Mar 2026 14:06:02 +1100
Subject: [PATCH] math/f64: revive hyperbolic and acosh mix

---
 src/math/f64/hyperbolic.rs         | 141 +++++++++++++++++++++++++++--
 src/math/f64/inverse_hyperbolic.rs |  22 +++--
 src/math/f64/mod.rs                |   4 +-
 src/math/families/hyperbolic.rs    |  15 ++-
 src/math/mod.rs                    |   3 +-
 5 files changed, 164 insertions(+), 21 deletions(-)
diff --git a/src/math/f64/hyperbolic.rs b/src/math/f64/hyperbolic.rs
index ae02352..14f8792 100644
--- a/src/math/f64/hyperbolic.rs
+++ b/src/math/f64/hyperbolic.rs
@@ -1,26 +1,135 @@
-use crate::math::{map, scalar};
-use crate::SimdFloat64;
+use crate::math::{f64, map, scalar};
+use crate::{Simd, SimdBaseIo, SimdBaseOps, SimdConsts, SimdFloat64, SimdInt64};
 
-// DECISION(2026-03-23): KEEP_SCALAR_REFERENCE
+type SimdI64<V> = <<V as SimdConsts>::Engine as Simd>::Vi64;
+
+const SIGN_MASK: i64 = i64::MIN;
+const SINH_COSH_SCALAR_PATCH_ABS: f64 = 1.0;
+const SINH_COSH_FAST_ABS_MAX: f64 = 20.0;
+const TANH_SCALAR_PATCH_ABS: f64 = 1.0;
+const TANH_FAST_ABS_MAX: f64 = 20.0;
+
+// DECISION(2026-03-23): KEEP_MIXED
 // Function(s): f64 sinh_u35 / cosh_u35 / tanh_u35
-// Why scalar:
-// - local benches do not justify a portable SIMD default for this family on the current host
-// - keeping the family split still preserves test and ownership structure for a later retry
+// Why kept:
+// - local runtime-selected benches show clear wins for sinh_u35 and tanh_u35 after restoring
+//   scalar-lane patching for the strict 1-ULP near-zero region
+// - cosh_u35 still loses to native scalar on this host, so it stays scalar-reference
 // Revisit when:
-// - a cheaper f64 exp/log backbone or a dedicated hyperbolic kernel lands
+// - cosh_u35 gets a better kernel or non-x86 evidence shifts the keep/revert balance
+
+#[inline(always)]
+fn any_lane_nonzero<V>(mask: SimdI64<V>) -> bool
+where
+    V: SimdFloat64,
+    V::Engine: Simd<Vf64 = V>,
+{
+    unsafe {
+        let lanes = mask.as_array();
+        for lane in 0..V::WIDTH {
+            if lanes[lane] != 0 {
+                return true;
+            }
+        }
+    }
+
+    false
+}
+
+#[inline(always)]
+fn patch_exceptional_lanes<V>(
+    input: V,
+    output: V,
+    exceptional_mask: SimdI64<V>,
+    scalar_fallback: fn(f64) -> f64,
+) -> V
+where
+    V: SimdFloat64,
+    V::Engine: Simd<Vf64 = V>,
+{
+    if !any_lane_nonzero::<V>(exceptional_mask) {
+        return output;
+    }
+
+    unsafe {
+        let input_lanes = input.as_array();
+        let mask_lanes = exceptional_mask.as_array();
+        let mut output_lanes = output.as_array();
+
+        for lane in 0..V::WIDTH {
+            if mask_lanes[lane] != 0 {
+                output_lanes[lane] = scalar_fallback(input_lanes[lane]);
+            }
+        }
+
+        V::load_from_ptr_unaligned(&output_lanes as *const V::ArrayRepresentation as *const f64)
+    }
+}
+
+#[inline(always)]
+fn apply_input_sign<V>(magnitude: V, input: V) -> V
+where
+    V: SimdFloat64,
+    V::Engine: Simd<Vf64 = V>,
+{
+    let sign_bits = input.bitcast_i64() & SimdI64::<V>::set1(SIGN_MASK);
+    (magnitude.bitcast_i64() | sign_bits).bitcast_f64()
+}
+
+#[inline(always)]
+fn sinh_cosh_medium<V>(abs_input: V) -> (V, V)
+where
+    V: SimdFloat64,
+    V::Engine: Simd<Vf64 = V>,
+{
+    let exp_abs = f64::exp_u35(abs_input);
+    let exp_neg_abs = V::set1(1.0) / exp_abs;
+    let half = V::set1(0.5);
+
+    (
+        (exp_abs - exp_neg_abs) * half,
+        (exp_abs + exp_neg_abs) * half,
+    )
+}
+
+#[inline(always)]
+fn sinh_cosh_masks<V>(input: V) -> (SimdI64<V>, V, V)
+where
+    V: SimdFloat64,
+    V::Engine: Simd<Vf64 = V>,
+{
+    let abs_input = input.abs();
+    let finite_mask = input.cmp_eq(input).bitcast_i64();
+    let within_fast_range = abs_input
+        .cmp_lte(V::set1(SINH_COSH_FAST_ABS_MAX))
+        .bitcast_i64();
+
+    (finite_mask & within_fast_range, abs_input, input * input)
+}
 
 #[inline(always)]
 pub(crate) fn sinh_u35<V>(input: V) -> V
 where
     V: SimdFloat64,
+    V::Engine: Simd<Vf64 = V>,
 {
-    map::unary_f64(input, scalar::sinh_u35_f64)
+    let (fast_mask, abs_input, _) = sinh_cosh_masks(input);
+    let small_scalar_mask = abs_input
+        .cmp_lt(V::set1(SINH_COSH_SCALAR_PATCH_ABS))
+        .bitcast_i64();
+    let exceptional_mask = fast_mask.cmp_eq(SimdI64::<V>::zeroes()) | small_scalar_mask;
+
+    let (sinh_medium, _) = sinh_cosh_medium(abs_input);
+    let fast = apply_input_sign(sinh_medium, input);
+
+    patch_exceptional_lanes(input, fast, exceptional_mask, scalar::sinh_u35_f64)
 }
 
 #[inline(always)]
 pub(crate) fn cosh_u35<V>(input: V) -> V
 where
     V: SimdFloat64,
+    V::Engine: Simd<Vf64 = V>,
 {
     map::unary_f64(input, scalar::cosh_u35_f64)
 }
@@ -29,6 +138,20 @@ where
 pub(crate) fn tanh_u35<V>(input: V) -> V
 where
     V: SimdFloat64,
+    V::Engine: Simd<Vf64 = V>,
 {
-    map::unary_f64(input, scalar::tanh_u35_f64)
+    let abs_input = input.abs();
+    let finite_mask = input.cmp_eq(input).bitcast_i64();
+    let within_fast_range = abs_input.cmp_lte(V::set1(TANH_FAST_ABS_MAX)).bitcast_i64();
+    let small_scalar_mask = abs_input
+        .cmp_lt(V::set1(TANH_SCALAR_PATCH_ABS))
+        .bitcast_i64();
+    let exceptional_mask =
+        (finite_mask & within_fast_range).cmp_eq(SimdI64::<V>::zeroes()) | small_scalar_mask;
+
+    let exp_neg_2x = f64::exp_u35(abs_input * V::set1(-2.0));
+    let tanh_medium = (V::set1(1.0) - exp_neg_2x) / (V::set1(1.0) + exp_neg_2x);
+    let fast = apply_input_sign(tanh_medium, input);
+
+    patch_exceptional_lanes(input, fast, exceptional_mask, scalar::tanh_u35_f64)
 }
diff --git a/src/math/f64/inverse_hyperbolic.rs b/src/math/f64/inverse_hyperbolic.rs
index bec3a35..5cc4eef 100644
--- a/src/math/f64/inverse_hyperbolic.rs
+++ b/src/math/f64/inverse_hyperbolic.rs
@@ -9,13 +9,14 @@ use crate::{Simd, SimdBaseIo, SimdBaseOps, SimdConsts, SimdFloat64};
 // Revisit when:
 // - asinh gets its own cheaper core or can safely absorb the relaxed portable ln_u35 error budget
 
-// DECISION(2026-03-23): KEEP_SCALAR_REFERENCE
+// DECISION(2026-03-23): KEEP_MIXED
 // Function(s): f64 acosh_u35 / atanh_u35
-// Why scalar:
-// - local runtime-selected results do not beat native scalar on this host
-// - scalar-reference keeps semantics honest without adding more f64 complexity today
+// Why kept:
+// - acosh_u35 now passes the strict contract and beats native scalar on local runtime-selected benches
+// - atanh_u35's retry never held the strict 1-ULP contract without collapsing the fast band too far,
+//   so it remains scalar-reference on this host
 // Revisit when:
-// - a stronger f64 inverse-hyperbolic kernel family exists
+// - atanh_u35 gets a tighter portable kernel or cleaner cross-host evidence appears
 
 type SimdI64<V> = <<V as SimdConsts>::Engine as Simd>::Vi64;
 
@@ -95,7 +96,16 @@ where
     V: SimdFloat64,
     V::Engine: Simd<Vf64 = V>,
 {
-    map::unary_f64(input, scalar::acosh_u35_f64)
+    let finite_mask = input.cmp_eq(input).bitcast_i64();
+    let in_domain_mask = input.cmp_gte(V::set1(1.0)).bitcast_i64();
+    let away_from_one_mask = input.cmp_gte(V::set1(1.5)).bitcast_i64();
+    let fast_mask = finite_mask & in_domain_mask & away_from_one_mask;
+    let exceptional_mask = fast_mask.cmp_eq(SimdI64::<V>::zeroes());
+
+    let root_term = ((input - V::set1(1.0)).sqrt()) * ((input + V::set1(1.0)).sqrt());
+    let fast = f64::ln_u35(input + root_term);
+
+    patch_exceptional_lanes(input, fast, exceptional_mask, scalar::acosh_u35_f64)
 }
 
 #[inline(always)]
diff --git a/src/math/f64/mod.rs b/src/math/f64/mod.rs
index 7378791..161699d 100644
--- a/src/math/f64/mod.rs
+++ b/src/math/f64/mod.rs
@@ -2,8 +2,8 @@
 //! - family-local modules own the public internal routing points for each math family.
 //! - current decisions are intentionally mixed:
 //!   portable SIMD for the revived core log/exp family, inverse trig, and binary misc,
-//!   scalar-reference for trig and the losing hyperbolic defaults,
-//!   and hybrid paths where a stricter scalar sub-op still underpins the fast path.
+//!   scalar-reference for trig and the losing `cosh_u35` / `atanh_u35` defaults,
+//!   and hybrid paths where a stricter scalar sub-op or scalar-lane patch still underpins the fast path.
 //! - follow-up optimization work can still replace one family module at a time.
 
 mod binary_misc;
diff --git a/src/math/families/hyperbolic.rs b/src/math/families/hyperbolic.rs
index 676bca4..ad4f08e 100644
--- a/src/math/families/hyperbolic.rs
+++ b/src/math/families/hyperbolic.rs
@@ -31,17 +31,26 @@ impl<T: SimdFloat32> SimdMathF32Hyperbolic for T {}
 
 pub trait SimdMathF64Hyperbolic: SimdFloat64 {
     #[inline(always)]
-    fn sinh_u35(self) -> Self {
+    fn sinh_u35(self) -> Self
+    where
+        Self::Engine: Simd<Vf64 = Self>,
+    {
         f64::sinh_u35(self)
     }
 
     #[inline(always)]
-    fn cosh_u35(self) -> Self {
+    fn cosh_u35(self) -> Self
+    where
+        Self::Engine: Simd<Vf64 = Self>,
+    {
         f64::cosh_u35(self)
     }
 
     #[inline(always)]
-    fn tanh_u35(self) -> Self {
+    fn tanh_u35(self) -> Self
+    where
+        Self::Engine: Simd<Vf64 = Self>,
+    {
         f64::tanh_u35(self)
     }
 }
diff --git a/src/math/mod.rs b/src/math/mod.rs
index df421a5..82300ad 100644
--- a/src/math/mod.rs
+++ b/src/math/mod.rs
@@ -13,7 +13,8 @@
 //! kernels with centralized scalar patching for exceptional lanes.
 //! The stabilized `f64` map is intentionally mixed:
 //! portable SIMD for the revived core log/exp family, inverse trig, and several binary-misc kernels,
-//! scalar-reference for the current losing trig and hyperbolic families,
+//! scalar-reference for the current losing trig family plus selected `f64` holdouts such as `cosh_u35`
+//! and `atanh_u35`,
 //! and hybrid keep decisions where SIMD structure still relies on scalar sub-ops.
 //!
 //! Structure notes: