From d7259dbc71a9134d9c274aec92949c4d877b0568 Mon Sep 17 00:00:00 2001 From: arduano Date: Mon, 23 Mar 2026 14:06:02 +1100 Subject: [PATCH] math/f64: revive hyperbolic and acosh mix --- src/math/f64/hyperbolic.rs | 141 +++++++++++++++++++++++++++-- src/math/f64/inverse_hyperbolic.rs | 22 +++-- src/math/f64/mod.rs | 4 +- src/math/families/hyperbolic.rs | 15 ++- src/math/mod.rs | 3 +- 5 files changed, 164 insertions(+), 21 deletions(-) diff --git a/src/math/f64/hyperbolic.rs b/src/math/f64/hyperbolic.rs index ae02352..14f8792 100644 --- a/src/math/f64/hyperbolic.rs +++ b/src/math/f64/hyperbolic.rs @@ -1,26 +1,135 @@ -use crate::math::{map, scalar}; -use crate::SimdFloat64; +use crate::math::{f64, map, scalar}; +use crate::{Simd, SimdBaseIo, SimdBaseOps, SimdConsts, SimdFloat64, SimdInt64}; -// DECISION(2026-03-23): KEEP_SCALAR_REFERENCE +type SimdI64 = <::Engine as Simd>::Vi64; + +const SIGN_MASK: i64 = i64::MIN; +const SINH_COSH_SCALAR_PATCH_ABS: f64 = 1.0; +const SINH_COSH_FAST_ABS_MAX: f64 = 20.0; +const TANH_SCALAR_PATCH_ABS: f64 = 1.0; +const TANH_FAST_ABS_MAX: f64 = 20.0; + +// DECISION(2026-03-23): KEEP_MIXED // Function(s): f64 sinh_u35 / cosh_u35 / tanh_u35 -// Why scalar: -// - local benches do not justify a portable SIMD default for this family on the current host -// - keeping the family split still preserves test and ownership structure for a later retry +// Why kept: +// - local runtime-selected benches show clear wins for sinh_u35 and tanh_u35 after restoring +// scalar-lane patching for the strict 1-ULP near-zero region +// - cosh_u35 still loses to native scalar on this host, so it stays scalar-reference // Revisit when: -// - a cheaper f64 exp/log backbone or a dedicated hyperbolic kernel lands +// - cosh_u35 gets a better kernel or non-x86 evidence shifts the keep/revert balance + +#[inline(always)] +fn any_lane_nonzero(mask: SimdI64) -> bool +where + V: SimdFloat64, + V::Engine: Simd, +{ + unsafe { + let lanes = mask.as_array(); + for lane in 0..V::WIDTH { + if lanes[lane] != 0 { + return true; + } + } + } + + false +} + +#[inline(always)] +fn patch_exceptional_lanes( + input: V, + output: V, + exceptional_mask: SimdI64, + scalar_fallback: fn(f64) -> f64, +) -> V +where + V: SimdFloat64, + V::Engine: Simd, +{ + if !any_lane_nonzero::(exceptional_mask) { + return output; + } + + unsafe { + let input_lanes = input.as_array(); + let mask_lanes = exceptional_mask.as_array(); + let mut output_lanes = output.as_array(); + + for lane in 0..V::WIDTH { + if mask_lanes[lane] != 0 { + output_lanes[lane] = scalar_fallback(input_lanes[lane]); + } + } + + V::load_from_ptr_unaligned(&output_lanes as *const V::ArrayRepresentation as *const f64) + } +} + +#[inline(always)] +fn apply_input_sign(magnitude: V, input: V) -> V +where + V: SimdFloat64, + V::Engine: Simd, +{ + let sign_bits = input.bitcast_i64() & SimdI64::::set1(SIGN_MASK); + (magnitude.bitcast_i64() | sign_bits).bitcast_f64() +} + +#[inline(always)] +fn sinh_cosh_medium(abs_input: V) -> (V, V) +where + V: SimdFloat64, + V::Engine: Simd, +{ + let exp_abs = f64::exp_u35(abs_input); + let exp_neg_abs = V::set1(1.0) / exp_abs; + let half = V::set1(0.5); + + ( + (exp_abs - exp_neg_abs) * half, + (exp_abs + exp_neg_abs) * half, + ) +} + +#[inline(always)] +fn sinh_cosh_masks(input: V) -> (SimdI64, V, V) +where + V: SimdFloat64, + V::Engine: Simd, +{ + let abs_input = input.abs(); + let finite_mask = input.cmp_eq(input).bitcast_i64(); + let within_fast_range = abs_input + .cmp_lte(V::set1(SINH_COSH_FAST_ABS_MAX)) + .bitcast_i64(); + + (finite_mask & within_fast_range, abs_input, input * input) +} #[inline(always)] pub(crate) fn sinh_u35(input: V) -> V where V: SimdFloat64, + V::Engine: Simd, { - map::unary_f64(input, scalar::sinh_u35_f64) + let (fast_mask, abs_input, _) = sinh_cosh_masks(input); + let small_scalar_mask = abs_input + .cmp_lt(V::set1(SINH_COSH_SCALAR_PATCH_ABS)) + .bitcast_i64(); + let exceptional_mask = fast_mask.cmp_eq(SimdI64::::zeroes()) | small_scalar_mask; + + let (sinh_medium, _) = sinh_cosh_medium(abs_input); + let fast = apply_input_sign(sinh_medium, input); + + patch_exceptional_lanes(input, fast, exceptional_mask, scalar::sinh_u35_f64) } #[inline(always)] pub(crate) fn cosh_u35(input: V) -> V where V: SimdFloat64, + V::Engine: Simd, { map::unary_f64(input, scalar::cosh_u35_f64) } @@ -29,6 +138,20 @@ where pub(crate) fn tanh_u35(input: V) -> V where V: SimdFloat64, + V::Engine: Simd, { - map::unary_f64(input, scalar::tanh_u35_f64) + let abs_input = input.abs(); + let finite_mask = input.cmp_eq(input).bitcast_i64(); + let within_fast_range = abs_input.cmp_lte(V::set1(TANH_FAST_ABS_MAX)).bitcast_i64(); + let small_scalar_mask = abs_input + .cmp_lt(V::set1(TANH_SCALAR_PATCH_ABS)) + .bitcast_i64(); + let exceptional_mask = + (finite_mask & within_fast_range).cmp_eq(SimdI64::::zeroes()) | small_scalar_mask; + + let exp_neg_2x = f64::exp_u35(abs_input * V::set1(-2.0)); + let tanh_medium = (V::set1(1.0) - exp_neg_2x) / (V::set1(1.0) + exp_neg_2x); + let fast = apply_input_sign(tanh_medium, input); + + patch_exceptional_lanes(input, fast, exceptional_mask, scalar::tanh_u35_f64) } diff --git a/src/math/f64/inverse_hyperbolic.rs b/src/math/f64/inverse_hyperbolic.rs index bec3a35..5cc4eef 100644 --- a/src/math/f64/inverse_hyperbolic.rs +++ b/src/math/f64/inverse_hyperbolic.rs @@ -9,13 +9,14 @@ use crate::{Simd, SimdBaseIo, SimdBaseOps, SimdConsts, SimdFloat64}; // Revisit when: // - asinh gets its own cheaper core or can safely absorb the relaxed portable ln_u35 error budget -// DECISION(2026-03-23): KEEP_SCALAR_REFERENCE +// DECISION(2026-03-23): KEEP_MIXED // Function(s): f64 acosh_u35 / atanh_u35 -// Why scalar: -// - local runtime-selected results do not beat native scalar on this host -// - scalar-reference keeps semantics honest without adding more f64 complexity today +// Why kept: +// - acosh_u35 now passes the strict contract and beats native scalar on local runtime-selected benches +// - atanh_u35's retry never held the strict 1-ULP contract without collapsing the fast band too far, +// so it remains scalar-reference on this host // Revisit when: -// - a stronger f64 inverse-hyperbolic kernel family exists +// - atanh_u35 gets a tighter portable kernel or cleaner cross-host evidence appears type SimdI64 = <::Engine as Simd>::Vi64; @@ -95,7 +96,16 @@ where V: SimdFloat64, V::Engine: Simd, { - map::unary_f64(input, scalar::acosh_u35_f64) + let finite_mask = input.cmp_eq(input).bitcast_i64(); + let in_domain_mask = input.cmp_gte(V::set1(1.0)).bitcast_i64(); + let away_from_one_mask = input.cmp_gte(V::set1(1.5)).bitcast_i64(); + let fast_mask = finite_mask & in_domain_mask & away_from_one_mask; + let exceptional_mask = fast_mask.cmp_eq(SimdI64::::zeroes()); + + let root_term = ((input - V::set1(1.0)).sqrt()) * ((input + V::set1(1.0)).sqrt()); + let fast = f64::ln_u35(input + root_term); + + patch_exceptional_lanes(input, fast, exceptional_mask, scalar::acosh_u35_f64) } #[inline(always)] diff --git a/src/math/f64/mod.rs b/src/math/f64/mod.rs index 7378791..161699d 100644 --- a/src/math/f64/mod.rs +++ b/src/math/f64/mod.rs @@ -2,8 +2,8 @@ //! - family-local modules own the public internal routing points for each math family. //! - current decisions are intentionally mixed: //! portable SIMD for the revived core log/exp family, inverse trig, and binary misc, -//! scalar-reference for trig and the losing hyperbolic defaults, -//! and hybrid paths where a stricter scalar sub-op still underpins the fast path. +//! scalar-reference for trig and the losing `cosh_u35` / `atanh_u35` defaults, +//! and hybrid paths where a stricter scalar sub-op or scalar-lane patch still underpins the fast path. //! - follow-up optimization work can still replace one family module at a time. mod binary_misc; diff --git a/src/math/families/hyperbolic.rs b/src/math/families/hyperbolic.rs index 676bca4..ad4f08e 100644 --- a/src/math/families/hyperbolic.rs +++ b/src/math/families/hyperbolic.rs @@ -31,17 +31,26 @@ impl SimdMathF32Hyperbolic for T {} pub trait SimdMathF64Hyperbolic: SimdFloat64 { #[inline(always)] - fn sinh_u35(self) -> Self { + fn sinh_u35(self) -> Self + where + Self::Engine: Simd, + { f64::sinh_u35(self) } #[inline(always)] - fn cosh_u35(self) -> Self { + fn cosh_u35(self) -> Self + where + Self::Engine: Simd, + { f64::cosh_u35(self) } #[inline(always)] - fn tanh_u35(self) -> Self { + fn tanh_u35(self) -> Self + where + Self::Engine: Simd, + { f64::tanh_u35(self) } } diff --git a/src/math/mod.rs b/src/math/mod.rs index df421a5..82300ad 100644 --- a/src/math/mod.rs +++ b/src/math/mod.rs @@ -13,7 +13,8 @@ //! kernels with centralized scalar patching for exceptional lanes. //! The stabilized `f64` map is intentionally mixed: //! portable SIMD for the revived core log/exp family, inverse trig, and several binary-misc kernels, -//! scalar-reference for the current losing trig and hyperbolic families, +//! scalar-reference for the current losing trig family plus selected `f64` holdouts such as `cosh_u35` +//! and `atanh_u35`, //! and hybrid keep decisions where SIMD structure still relies on scalar sub-ops. //! //! Structure notes: