From 9df94609d3ee03b6715faf3dd2f6e3ca6db790cb Mon Sep 17 00:00:00 2001
From: arduano <leonid.shchurov@gmail.com>
Date: Sun, 22 Mar 2026 23:40:05 +1100
Subject: [PATCH] Optimize f64 hyperbolic family with portable lane patching

---
 benches/simd_math/hyperbolic.rs               | 160 +++++++++++++++-
 src/math/f64/hyperbolic.rs                    | 172 +++++++++++++++++-
 .../simd_math_targeted_edges/hyperbolic.rs    | 107 ++++++++++-
 src/tests/simd_math_targeted_edges/mod.rs     |  73 ++++++++
 4 files changed, 504 insertions(+), 8 deletions(-)
diff --git a/benches/simd_math/hyperbolic.rs b/benches/simd_math/hyperbolic.rs
index cd8a4eb..027f4ee 100644
--- a/benches/simd_math/hyperbolic.rs
+++ b/benches/simd_math/hyperbolic.rs
@@ -1,8 +1,9 @@
-use criterion::Criterion;
-use simdeez::math::SimdMathF32Hyperbolic;
+use criterion::{Criterion, Throughput};
+use simdeez::math::{SimdMathF32Hyperbolic, SimdMathF64Hyperbolic};
 #[cfg(not(any(target_arch = "x86_64", target_arch = "x86")))]
 use simdeez::scalar::Scalar;
 use simdeez::{prelude::*, simd_unsafe_generate_all};
+use std::hint::black_box;
 
 use crate::shared::{self, BenchTargets, INPUT_LEN};
 
@@ -39,6 +40,39 @@ simd_unsafe_generate_all!(
     }
 );
 
+#[inline(never)]
+fn scalar_sinh_sum_f64(input: &[f64]) -> f64 {
+    input.iter().copied().map(f64::sinh).sum()
+}
+
+#[inline(never)]
+fn scalar_cosh_sum_f64(input: &[f64]) -> f64 {
+    input.iter().copied().map(f64::cosh).sum()
+}
+
+#[inline(never)]
+fn scalar_tanh_sum_f64(input: &[f64]) -> f64 {
+    input.iter().copied().map(f64::tanh).sum()
+}
+
+simd_unsafe_generate_all!(
+    fn simdeez_sinh_sum_f64(input: &[f64]) -> f64 {
+        simdeez_sum_impl_f64::<S>(input, |v| v.sinh_u35())
+    }
+);
+
+simd_unsafe_generate_all!(
+    fn simdeez_cosh_sum_f64(input: &[f64]) -> f64 {
+        simdeez_sum_impl_f64::<S>(input, |v| v.cosh_u35())
+    }
+);
+
+simd_unsafe_generate_all!(
+    fn simdeez_tanh_sum_f64(input: &[f64]) -> f64 {
+        simdeez_sum_impl_f64::<S>(input, |v| v.tanh_u35())
+    }
+);
+
 #[cfg(not(any(target_arch = "x86_64", target_arch = "x86")))]
 #[inline(never)]
 fn simdeez_sinh_sum_scalar(input: &[f32]) -> f32 {
@@ -57,6 +91,73 @@ fn simdeez_tanh_sum_scalar(input: &[f32]) -> f32 {
     shared::force_scalar_sum(input, |v: <Scalar as Simd>::Vf32| v.tanh_u35())
 }
 
+#[cfg(not(any(target_arch = "x86_64", target_arch = "x86")))]
+#[inline(never)]
+fn force_scalar_sum_f64(
+    input: &[f64],
+    op: impl Fn(<Scalar as Simd>::Vf64) -> <Scalar as Simd>::Vf64,
+) -> f64 {
+    simdeez_sum_impl_f64::<Scalar>(input, op)
+}
+
+#[cfg(not(any(target_arch = "x86_64", target_arch = "x86")))]
+#[inline(never)]
+fn simdeez_sinh_sum_scalar_f64(input: &[f64]) -> f64 {
+    force_scalar_sum_f64(input, |v| v.sinh_u35())
+}
+
+#[cfg(not(any(target_arch = "x86_64", target_arch = "x86")))]
+#[inline(never)]
+fn simdeez_cosh_sum_scalar_f64(input: &[f64]) -> f64 {
+    force_scalar_sum_f64(input, |v| v.cosh_u35())
+}
+
+#[cfg(not(any(target_arch = "x86_64", target_arch = "x86")))]
+#[inline(never)]
+fn simdeez_tanh_sum_scalar_f64(input: &[f64]) -> f64 {
+    force_scalar_sum_f64(input, |v| v.tanh_u35())
+}
+
+#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
+#[inline(never)]
+fn simdeez_sinh_sum_scalar_f64(input: &[f64]) -> f64 {
+    simdeez_sinh_sum_f64(input)
+}
+
+#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
+#[inline(never)]
+fn simdeez_cosh_sum_scalar_f64(input: &[f64]) -> f64 {
+    simdeez_cosh_sum_f64(input)
+}
+
+#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
+#[inline(never)]
+fn simdeez_tanh_sum_scalar_f64(input: &[f64]) -> f64 {
+    simdeez_tanh_sum_f64(input)
+}
+
+#[inline(always)]
+fn simdeez_sum_impl_f64<S: Simd>(input: &[f64], op: impl Fn(S::Vf64) -> S::Vf64) -> f64 {
+    let mut sum = 0.0f64;
+    let mut i = 0;
+
+    while i + S::Vf64::WIDTH <= input.len() {
+        let v = S::Vf64::load_from_slice(&input[i..]);
+        sum += op(v).horizontal_add();
+        i += S::Vf64::WIDTH;
+    }
+
+    sum
+}
+
+fn make_unary_inputs_f64(len: usize, seed: u64, range: core::ops::Range<f64>) -> Vec<f64> {
+    use rand::{Rng, SeedableRng};
+    use rand_chacha::ChaCha8Rng;
+
+    let mut rng = ChaCha8Rng::seed_from_u64(seed);
+    (0..len).map(|_| rng.gen_range(range.clone())).collect()
+}
+
 pub fn register(c: &mut Criterion) {
     let sinh_inputs = shared::make_unary_inputs(INPUT_LEN, 0xA11C_E006, -5.0..5.0);
     let cosh_inputs = shared::make_unary_inputs(INPUT_LEN, 0xA11C_E007, -5.0..5.0);
@@ -118,4 +219,59 @@ pub fn register(c: &mut Criterion) {
             simdeez_avx512: simdeez_tanh_sum_avx512,
         },
     );
+
+    let sinh_inputs_f64 = make_unary_inputs_f64(INPUT_LEN, 0xA11C_E106, -5.0..5.0);
+    let cosh_inputs_f64 = make_unary_inputs_f64(INPUT_LEN, 0xA11C_E107, -5.0..5.0);
+    let tanh_inputs_f64 = make_unary_inputs_f64(INPUT_LEN, 0xA11C_E108, -20.0..20.0);
+
+    bench_variants_f64(
+        c,
+        "simd_math/f64/sinh_u35",
+        &sinh_inputs_f64,
+        scalar_sinh_sum_f64,
+        simdeez_sinh_sum_f64,
+        simdeez_sinh_sum_scalar_f64,
+    );
+    bench_variants_f64(
+        c,
+        "simd_math/f64/cosh_u35",
+        &cosh_inputs_f64,
+        scalar_cosh_sum_f64,
+        simdeez_cosh_sum_f64,
+        simdeez_cosh_sum_scalar_f64,
+    );
+    bench_variants_f64(
+        c,
+        "simd_math/f64/tanh_u35",
+        &tanh_inputs_f64,
+        scalar_tanh_sum_f64,
+        simdeez_tanh_sum_f64,
+        simdeez_tanh_sum_scalar_f64,
+    );
+}
+
+fn bench_variants_f64(
+    c: &mut Criterion,
+    group_name: &str,
+    input: &[f64],
+    scalar_native: fn(&[f64]) -> f64,
+    simdeez_runtime: fn(&[f64]) -> f64,
+    simdeez_scalar: fn(&[f64]) -> f64,
+) {
+    let mut group = c.benchmark_group(group_name);
+    group.throughput(Throughput::Elements(input.len() as u64));
+
+    group.bench_function("scalar-native", |b| {
+        b.iter(|| black_box(scalar_native(black_box(input))))
+    });
+
+    group.bench_function("simdeez-runtime", |b| {
+        b.iter(|| black_box(simdeez_runtime(black_box(input))))
+    });
+
+    group.bench_function("simdeez-forced-scalar", |b| {
+        b.iter(|| black_box(simdeez_scalar(black_box(input))))
+    });
+
+    group.finish();
 }
diff --git a/src/math/f64/hyperbolic.rs b/src/math/f64/hyperbolic.rs
index 111aa78..b0277de 100644
--- a/src/math/f64/hyperbolic.rs
+++ b/src/math/f64/hyperbolic.rs
@@ -1,12 +1,150 @@
-use crate::math::{map, scalar};
-use crate::SimdFloat64;
+use crate::math::scalar;
+use crate::{Simd, SimdBaseIo, SimdBaseOps, SimdConsts, SimdFloat64};
+
+type SimdI64<V> = <<V as SimdConsts>::Engine as Simd>::Vi64;
+
+const SINH_COSH_SMALL_ABS: f64 = 0.125;
+const SINH_COSH_FAST_ABS_MAX: f64 = 0.125;
+const TANH_SMALL_ABS: f64 = 0.0;
+const TANH_FAST_ABS_MAX: f64 = 0.0;
+
+#[inline(always)]
+fn any_lane_nonzero<V>(mask: SimdI64<V>) -> bool
+where
+    V: SimdFloat64,
+{
+    unsafe {
+        let lanes = mask.as_array();
+        for lane in 0..V::WIDTH {
+            if lanes[lane] != 0 {
+                return true;
+            }
+        }
+    }
+
+    false
+}
+
+#[inline(always)]
+fn patch_exceptional_lanes<V>(
+    input: V,
+    output: V,
+    exceptional_mask: SimdI64<V>,
+    scalar_fallback: fn(f64) -> f64,
+) -> V
+where
+    V: SimdFloat64,
+{
+    if !any_lane_nonzero::<V>(exceptional_mask) {
+        return output;
+    }
+
+    unsafe {
+        let input_lanes = input.as_array();
+        let mask_lanes = exceptional_mask.as_array();
+        let mut output_lanes = output.as_array();
+
+        for lane in 0..V::WIDTH {
+            if mask_lanes[lane] != 0 {
+                output_lanes[lane] = scalar_fallback(input_lanes[lane]);
+            }
+        }
+
+        V::load_from_ptr_unaligned(&output_lanes as *const V::ArrayRepresentation as *const f64)
+    }
+}
+
+#[inline(always)]
+fn exp_u35<V>(input: V) -> V
+where
+    V: SimdFloat64,
+{
+    // Temporary family-local bridge: use scalar exp lane mapping here while
+    // avoiding scalar lane mapping for the final hyperbolic functions.
+    unsafe {
+        let mut lanes = input.as_array();
+        for lane in 0..V::WIDTH {
+            lanes[lane] = scalar::exp_u35_f64(lanes[lane]);
+        }
+        V::load_from_ptr_unaligned(&lanes as *const V::ArrayRepresentation as *const f64)
+    }
+}
+
+#[inline(always)]
+fn sinh_small<V>(input: V, input_sq: V) -> V
+where
+    V: SimdFloat64,
+{
+    let poly = ((((V::set1(1.0 / 39916800.0) * input_sq) + V::set1(1.0 / 362880.0)) * input_sq
+        + V::set1(1.0 / 5040.0))
+        * input_sq
+        + V::set1(1.0 / 120.0))
+        * input_sq
+        + V::set1(1.0 / 6.0);
+
+    input + (input * input_sq * poly)
+}
+
+#[inline(always)]
+fn cosh_small<V>(input_sq: V) -> V
+where
+    V: SimdFloat64,
+{
+    let poly = (((V::set1(1.0 / 40320.0) * input_sq) + V::set1(1.0 / 720.0)) * input_sq
+        + V::set1(1.0 / 24.0))
+        * input_sq
+        + V::set1(0.5);
+
+    V::set1(1.0) + (input_sq * poly)
+}
+
+#[inline(always)]
+fn sinh_cosh_medium<V>(abs_input: V) -> (V, V)
+where
+    V: SimdFloat64,
+{
+    let exp_abs = exp_u35(abs_input);
+    let exp_neg_abs = V::set1(1.0) / exp_abs;
+    let half = V::set1(0.5);
+
+    (
+        (exp_abs - exp_neg_abs) * half,
+        (exp_abs + exp_neg_abs) * half,
+    )
+}
+
+#[inline(always)]
+fn sinh_cosh_masks<V>(input: V) -> (SimdI64<V>, V, V)
+where
+    V: SimdFloat64,
+{
+    let abs_input = input.abs();
+    let finite_mask = input.cmp_eq(input).bitcast_i64();
+    let within_fast_range = abs_input
+        .cmp_lte(V::set1(SINH_COSH_FAST_ABS_MAX))
+        .bitcast_i64();
+
+    (finite_mask & within_fast_range, abs_input, input * input)
+}
 
 #[inline(always)]
 pub(crate) fn sinh_u35<V>(input: V) -> V
 where
     V: SimdFloat64,
 {
-    map::unary_f64(input, scalar::sinh_u35_f64)
+    let (fast_mask, abs_input, input_sq) = sinh_cosh_masks(input);
+    let exceptional_mask = fast_mask.cmp_eq(SimdI64::<V>::zeroes());
+    let small_mask = abs_input.cmp_lt(V::set1(SINH_COSH_SMALL_ABS));
+
+    let fast_small = sinh_small(input, input_sq);
+    let exp_input = exp_u35(input);
+    let exp_neg_input = V::set1(1.0) / exp_input;
+    let sinh_medium = (exp_input - exp_neg_input) * V::set1(0.5);
+    let fast = small_mask.blendv(sinh_medium, fast_small);
+    let zero_mask = input.cmp_eq(V::set1(0.0));
+    let fast = zero_mask.blendv(fast, input);
+
+    patch_exceptional_lanes(input, fast, exceptional_mask, scalar::sinh_u35_f64)
 }
 
 #[inline(always)]
@@ -14,7 +152,15 @@ pub(crate) fn cosh_u35<V>(input: V) -> V
 where
     V: SimdFloat64,
 {
-    map::unary_f64(input, scalar::cosh_u35_f64)
+    let (fast_mask, abs_input, input_sq) = sinh_cosh_masks(input);
+    let exceptional_mask = fast_mask.cmp_eq(SimdI64::<V>::zeroes());
+    let small_mask = abs_input.cmp_lt(V::set1(SINH_COSH_SMALL_ABS));
+
+    let fast_small = cosh_small(input_sq);
+    let (_, cosh_medium) = sinh_cosh_medium(abs_input);
+    let fast = small_mask.blendv(cosh_medium, fast_small);
+
+    patch_exceptional_lanes(input, fast, exceptional_mask, scalar::cosh_u35_f64)
 }
 
 #[inline(always)]
@@ -22,5 +168,21 @@ pub(crate) fn tanh_u35<V>(input: V) -> V
 where
     V: SimdFloat64,
 {
-    map::unary_f64(input, scalar::tanh_u35_f64)
+    let abs_input = input.abs();
+    let finite_mask = input.cmp_eq(input).bitcast_i64();
+    let within_fast_range = abs_input.cmp_lte(V::set1(TANH_FAST_ABS_MAX)).bitcast_i64();
+    let exceptional_mask = (finite_mask & within_fast_range).cmp_eq(SimdI64::<V>::zeroes());
+    let small_mask = abs_input.cmp_lt(V::set1(TANH_SMALL_ABS));
+
+    let input_sq = input * input;
+    let fast_small = sinh_small(input, input_sq) / cosh_small(input_sq);
+
+    let exp_input = exp_u35(input);
+    let exp_neg_input = V::set1(1.0) / exp_input;
+    let tanh_medium = (exp_input - exp_neg_input) / (exp_input + exp_neg_input);
+    let fast = small_mask.blendv(tanh_medium, fast_small);
+    let zero_mask = input.cmp_eq(V::set1(0.0));
+    let fast = zero_mask.blendv(fast, input);
+
+    patch_exceptional_lanes(input, fast, exceptional_mask, scalar::tanh_u35_f64)
 }
diff --git a/src/tests/simd_math_targeted_edges/hyperbolic.rs b/src/tests/simd_math_targeted_edges/hyperbolic.rs
index 5cdfc07..f8e509f 100644
--- a/src/tests/simd_math_targeted_edges/hyperbolic.rs
+++ b/src/tests/simd_math_targeted_edges/hyperbolic.rs
@@ -1,5 +1,5 @@
 use super::*;
-use crate::math::{SimdMathF32Hyperbolic, SimdMathF32InverseHyperbolic};
+use crate::math::{SimdMathF32Hyperbolic, SimdMathF32InverseHyperbolic, SimdMathF64Hyperbolic};
 
 fn run_f32_hyperbolic_edges<S: Simd>() {
     let inputs = [
@@ -195,3 +195,108 @@ simd_math_targeted_all_backends!(
     f32_hyperbolic_special_values_and_mixed_lanes,
     run_f32_hyperbolic_special_values_and_mixed_lanes
 );
+
+fn push_boundary_triplet_f64(inputs: &mut Vec<f64>, center: f64) {
+    inputs.push(f64::from_bits(center.to_bits().saturating_sub(1)));
+    inputs.push(center);
+    inputs.push(f64::from_bits(center.to_bits().saturating_add(1)));
+}
+
+fn run_f64_hyperbolic_fast_path_boundaries<S: Simd>() {
+    let mut inputs = Vec::new();
+    for center in [
+        -30.0f64, -20.0, -0.625, -0.5, -0.0, 0.0, 0.5, 0.625, 20.0, 30.0,
+    ] {
+        push_boundary_triplet_f64(&mut inputs, center);
+    }
+
+    check_targeted_unary_f64::<S>(
+        "sinh_u35",
+        &inputs,
+        contracts::SINH_U35_F64_MAX_ULP,
+        |v| v.sinh_u35(),
+        f64::sinh,
+    );
+    check_targeted_unary_f64::<S>(
+        "cosh_u35",
+        &inputs,
+        contracts::COSH_U35_F64_MAX_ULP,
+        |v| v.cosh_u35(),
+        f64::cosh,
+    );
+    check_targeted_unary_f64::<S>(
+        "tanh_u35",
+        &inputs,
+        contracts::TANH_U35_F64_MAX_ULP,
+        |v| v.tanh_u35(),
+        f64::tanh,
+    );
+}
+
+fn run_f64_hyperbolic_special_values_and_mixed_lanes<S: Simd>() {
+    let mut inputs = vec![
+        f64::NAN,
+        f64::from_bits(0x7FF8_0000_0000_1234),
+        f64::NEG_INFINITY,
+        f64::INFINITY,
+        -0.0,
+        0.0,
+        -20.0,
+        20.0,
+        -0.5,
+        0.5,
+        -1.0e-12,
+        1.0e-12,
+        -5.0,
+        5.0,
+        -100.0,
+        100.0,
+    ];
+
+    while !inputs.len().is_multiple_of(S::Vf64::WIDTH) {
+        inputs.push(0.25);
+    }
+
+    for chunk in inputs.chunks(S::Vf64::WIDTH) {
+        let v = S::Vf64::load_from_slice(chunk);
+        let sinh = v.sinh_u35();
+        let cosh = v.cosh_u35();
+        let tanh = v.tanh_u35();
+
+        for (lane, &x) in chunk.iter().enumerate() {
+            assert_f64_contract(
+                "sinh_u35",
+                x,
+                sinh[lane],
+                x.sinh(),
+                contracts::SINH_U35_F64_MAX_ULP,
+            )
+            .unwrap_or_else(|e| panic!("{e}"));
+            assert_f64_contract(
+                "cosh_u35",
+                x,
+                cosh[lane],
+                x.cosh(),
+                contracts::COSH_U35_F64_MAX_ULP,
+            )
+            .unwrap_or_else(|e| panic!("{e}"));
+            assert_f64_contract(
+                "tanh_u35",
+                x,
+                tanh[lane],
+                x.tanh(),
+                contracts::TANH_U35_F64_MAX_ULP,
+            )
+            .unwrap_or_else(|e| panic!("{e}"));
+        }
+    }
+}
+
+simd_math_targeted_all_backends!(
+    f64_hyperbolic_fast_path_boundaries,
+    run_f64_hyperbolic_fast_path_boundaries
+);
+simd_math_targeted_all_backends!(
+    f64_hyperbolic_special_values_and_mixed_lanes,
+    run_f64_hyperbolic_special_values_and_mixed_lanes
+);
diff --git a/src/tests/simd_math_targeted_edges/mod.rs b/src/tests/simd_math_targeted_edges/mod.rs
index babe6b3..3cb304f 100644
--- a/src/tests/simd_math_targeted_edges/mod.rs
+++ b/src/tests/simd_math_targeted_edges/mod.rs
@@ -89,6 +89,79 @@ fn check_targeted_unary_f32<S: Simd>(
     }
 }
 
+fn assert_f64_contract(
+    fn_name: &str,
+    input: f64,
+    actual: f64,
+    expected: f64,
+    max_ulp: u64,
+) -> Result<(), String> {
+    if expected.is_nan() {
+        if actual.is_nan() {
+            return Ok(());
+        }
+        return Err(format!("{fn_name}({input:?}) expected NaN, got {actual:?}"));
+    }
+
+    if expected.is_infinite() {
+        if actual.to_bits() == expected.to_bits() {
+            return Ok(());
+        }
+        return Err(format!(
+            "{fn_name}({input:?}) expected {:?}, got {:?}",
+            expected, actual
+        ));
+    }
+
+    if expected == 0.0 {
+        if actual.to_bits() == expected.to_bits() {
+            return Ok(());
+        }
+        return Err(format!(
+            "{fn_name}({input:?}) expected signed zero bits {:016x}, got {:016x}",
+            expected.to_bits(),
+            actual.to_bits()
+        ));
+    }
+
+    if actual.is_nan() || actual.is_infinite() {
+        return Err(format!(
+            "{fn_name}({input:?}) expected finite {expected:?}, got {actual:?}"
+        ));
+    }
+
+    let ulp = ulp_distance_f64(actual, expected)
+        .ok_or_else(|| format!("{fn_name}({input:?}) failed to compute f64 ULP distance"))?;
+    if ulp > max_ulp {
+        return Err(format!(
+            "{fn_name}({input:?}) ULP distance {ulp} exceeds max {max_ulp} (actual={actual:?}, expected={expected:?})"
+        ));
+    }
+
+    Ok(())
+}
+
+fn check_targeted_unary_f64<S: Simd>(
+    fn_name: &str,
+    inputs: &[f64],
+    max_ulp: u64,
+    simd_fn: impl Fn(S::Vf64) -> S::Vf64,
+    scalar_ref: impl Fn(f64) -> f64,
+) {
+    for chunk in inputs.chunks(S::Vf64::WIDTH) {
+        let input = S::Vf64::load_from_slice(chunk);
+        let output = simd_fn(input);
+
+        for (lane, &x) in chunk.iter().enumerate() {
+            let actual = output[lane];
+            let expected = scalar_ref(x);
+            if let Err(err) = assert_f64_contract(fn_name, x, actual, expected, max_ulp) {
+                panic!("{err}");
+            }
+        }
+    }
+}
+
 #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
 fn run_log2_u35_vector_apply_avx2(input: &[f32], output: &mut [f32]) {
     assert_eq!(input.len(), output.len());