From 5b3cad1164bd616930dbc4d350b0aaf80260d1cf Mon Sep 17 00:00:00 2001 From: arduano Date: Mon, 23 Mar 2026 00:28:55 +1100 Subject: [PATCH] f64 hyperbolic: revert regressing paths, keep asinh fast path --- src/math/f64/hyperbolic.rs | 172 +----------------- src/math/f64/inverse_hyperbolic.rs | 28 +-- .../simd_math_targeted_edges/hyperbolic.rs | 25 +++ .../inverse_hyperbolic.rs | 26 +++ 4 files changed, 59 insertions(+), 192 deletions(-) diff --git a/src/math/f64/hyperbolic.rs b/src/math/f64/hyperbolic.rs index b0277de..111aa78 100644 --- a/src/math/f64/hyperbolic.rs +++ b/src/math/f64/hyperbolic.rs @@ -1,150 +1,12 @@ -use crate::math::scalar; -use crate::{Simd, SimdBaseIo, SimdBaseOps, SimdConsts, SimdFloat64}; - -type SimdI64 = <::Engine as Simd>::Vi64; - -const SINH_COSH_SMALL_ABS: f64 = 0.125; -const SINH_COSH_FAST_ABS_MAX: f64 = 0.125; -const TANH_SMALL_ABS: f64 = 0.0; -const TANH_FAST_ABS_MAX: f64 = 0.0; - -#[inline(always)] -fn any_lane_nonzero(mask: SimdI64) -> bool -where - V: SimdFloat64, -{ - unsafe { - let lanes = mask.as_array(); - for lane in 0..V::WIDTH { - if lanes[lane] != 0 { - return true; - } - } - } - - false -} - -#[inline(always)] -fn patch_exceptional_lanes( - input: V, - output: V, - exceptional_mask: SimdI64, - scalar_fallback: fn(f64) -> f64, -) -> V -where - V: SimdFloat64, -{ - if !any_lane_nonzero::(exceptional_mask) { - return output; - } - - unsafe { - let input_lanes = input.as_array(); - let mask_lanes = exceptional_mask.as_array(); - let mut output_lanes = output.as_array(); - - for lane in 0..V::WIDTH { - if mask_lanes[lane] != 0 { - output_lanes[lane] = scalar_fallback(input_lanes[lane]); - } - } - - V::load_from_ptr_unaligned(&output_lanes as *const V::ArrayRepresentation as *const f64) - } -} - -#[inline(always)] -fn exp_u35(input: V) -> V -where - V: SimdFloat64, -{ - // Temporary family-local bridge: use scalar exp lane mapping here while - // avoiding scalar lane mapping for the final hyperbolic functions. - unsafe { - let mut lanes = input.as_array(); - for lane in 0..V::WIDTH { - lanes[lane] = scalar::exp_u35_f64(lanes[lane]); - } - V::load_from_ptr_unaligned(&lanes as *const V::ArrayRepresentation as *const f64) - } -} - -#[inline(always)] -fn sinh_small(input: V, input_sq: V) -> V -where - V: SimdFloat64, -{ - let poly = ((((V::set1(1.0 / 39916800.0) * input_sq) + V::set1(1.0 / 362880.0)) * input_sq - + V::set1(1.0 / 5040.0)) - * input_sq - + V::set1(1.0 / 120.0)) - * input_sq - + V::set1(1.0 / 6.0); - - input + (input * input_sq * poly) -} - -#[inline(always)] -fn cosh_small(input_sq: V) -> V -where - V: SimdFloat64, -{ - let poly = (((V::set1(1.0 / 40320.0) * input_sq) + V::set1(1.0 / 720.0)) * input_sq - + V::set1(1.0 / 24.0)) - * input_sq - + V::set1(0.5); - - V::set1(1.0) + (input_sq * poly) -} - -#[inline(always)] -fn sinh_cosh_medium(abs_input: V) -> (V, V) -where - V: SimdFloat64, -{ - let exp_abs = exp_u35(abs_input); - let exp_neg_abs = V::set1(1.0) / exp_abs; - let half = V::set1(0.5); - - ( - (exp_abs - exp_neg_abs) * half, - (exp_abs + exp_neg_abs) * half, - ) -} - -#[inline(always)] -fn sinh_cosh_masks(input: V) -> (SimdI64, V, V) -where - V: SimdFloat64, -{ - let abs_input = input.abs(); - let finite_mask = input.cmp_eq(input).bitcast_i64(); - let within_fast_range = abs_input - .cmp_lte(V::set1(SINH_COSH_FAST_ABS_MAX)) - .bitcast_i64(); - - (finite_mask & within_fast_range, abs_input, input * input) -} +use crate::math::{map, scalar}; +use crate::SimdFloat64; #[inline(always)] pub(crate) fn sinh_u35(input: V) -> V where V: SimdFloat64, { - let (fast_mask, abs_input, input_sq) = sinh_cosh_masks(input); - let exceptional_mask = fast_mask.cmp_eq(SimdI64::::zeroes()); - let small_mask = abs_input.cmp_lt(V::set1(SINH_COSH_SMALL_ABS)); - - let fast_small = sinh_small(input, input_sq); - let exp_input = exp_u35(input); - let exp_neg_input = V::set1(1.0) / exp_input; - let sinh_medium = (exp_input - exp_neg_input) * V::set1(0.5); - let fast = small_mask.blendv(sinh_medium, fast_small); - let zero_mask = input.cmp_eq(V::set1(0.0)); - let fast = zero_mask.blendv(fast, input); - - patch_exceptional_lanes(input, fast, exceptional_mask, scalar::sinh_u35_f64) + map::unary_f64(input, scalar::sinh_u35_f64) } #[inline(always)] @@ -152,15 +14,7 @@ pub(crate) fn cosh_u35(input: V) -> V where V: SimdFloat64, { - let (fast_mask, abs_input, input_sq) = sinh_cosh_masks(input); - let exceptional_mask = fast_mask.cmp_eq(SimdI64::::zeroes()); - let small_mask = abs_input.cmp_lt(V::set1(SINH_COSH_SMALL_ABS)); - - let fast_small = cosh_small(input_sq); - let (_, cosh_medium) = sinh_cosh_medium(abs_input); - let fast = small_mask.blendv(cosh_medium, fast_small); - - patch_exceptional_lanes(input, fast, exceptional_mask, scalar::cosh_u35_f64) + map::unary_f64(input, scalar::cosh_u35_f64) } #[inline(always)] @@ -168,21 +22,5 @@ pub(crate) fn tanh_u35(input: V) -> V where V: SimdFloat64, { - let abs_input = input.abs(); - let finite_mask = input.cmp_eq(input).bitcast_i64(); - let within_fast_range = abs_input.cmp_lte(V::set1(TANH_FAST_ABS_MAX)).bitcast_i64(); - let exceptional_mask = (finite_mask & within_fast_range).cmp_eq(SimdI64::::zeroes()); - let small_mask = abs_input.cmp_lt(V::set1(TANH_SMALL_ABS)); - - let input_sq = input * input; - let fast_small = sinh_small(input, input_sq) / cosh_small(input_sq); - - let exp_input = exp_u35(input); - let exp_neg_input = V::set1(1.0) / exp_input; - let tanh_medium = (exp_input - exp_neg_input) / (exp_input + exp_neg_input); - let fast = small_mask.blendv(tanh_medium, fast_small); - let zero_mask = input.cmp_eq(V::set1(0.0)); - let fast = zero_mask.blendv(fast, input); - - patch_exceptional_lanes(input, fast, exceptional_mask, scalar::tanh_u35_f64) + map::unary_f64(input, scalar::tanh_u35_f64) } diff --git a/src/math/f64/inverse_hyperbolic.rs b/src/math/f64/inverse_hyperbolic.rs index f7ce366..50914bb 100644 --- a/src/math/f64/inverse_hyperbolic.rs +++ b/src/math/f64/inverse_hyperbolic.rs @@ -1,4 +1,4 @@ -use crate::math::{f64, scalar}; +use crate::math::{f64, map, scalar}; use crate::{Simd, SimdBaseIo, SimdBaseOps, SimdConsts, SimdFloat64}; type SimdI64 = <::Engine as Simd>::Vi64; @@ -79,15 +79,7 @@ where V: SimdFloat64, V::Engine: Simd, { - let finite_mask = input.cmp_eq(input).bitcast_i64(); - let in_domain_mask = input.cmp_gte(V::set1(1.0)).bitcast_i64(); - let fast_mask = finite_mask & in_domain_mask; - let exceptional_mask = fast_mask.cmp_eq(SimdI64::::zeroes()); - - let root_term = ((input - V::set1(1.0)).sqrt()) * ((input + V::set1(1.0)).sqrt()); - let fast = f64::ln_u35(input + root_term); - - patch_exceptional_lanes(input, fast, exceptional_mask, scalar::acosh_u35_f64) + map::unary_f64(input, scalar::acosh_u35_f64) } #[inline(always)] @@ -96,19 +88,5 @@ where V: SimdFloat64, V::Engine: Simd, { - let finite_mask = input.cmp_eq(input).bitcast_i64(); - let abs_x = input.abs(); - let strict_domain_mask = abs_x.cmp_lt(V::set1(1.0)).bitcast_i64(); - let non_zero_mask = input.cmp_neq(V::zeroes()).bitcast_i64(); - let stable_range_mask = abs_x.cmp_lte(V::set1(0.99)).bitcast_i64(); - let away_from_zero_mask = abs_x.cmp_gte(V::set1(0.9)).bitcast_i64(); - let fast_mask = - finite_mask & strict_domain_mask & non_zero_mask & stable_range_mask & away_from_zero_mask; - let exceptional_mask = fast_mask.cmp_eq(SimdI64::::zeroes()); - - let one = V::set1(1.0); - let ratio = (one + input) / (one - input); - let fast = f64::ln_u35(ratio) * V::set1(0.5); - - patch_exceptional_lanes(input, fast, exceptional_mask, scalar::atanh_u35_f64) + map::unary_f64(input, scalar::atanh_u35_f64) } diff --git a/src/tests/simd_math_targeted_edges/hyperbolic.rs b/src/tests/simd_math_targeted_edges/hyperbolic.rs index f8e509f..c8ad2d5 100644 --- a/src/tests/simd_math_targeted_edges/hyperbolic.rs +++ b/src/tests/simd_math_targeted_edges/hyperbolic.rs @@ -300,3 +300,28 @@ simd_math_targeted_all_backends!( f64_hyperbolic_special_values_and_mixed_lanes, run_f64_hyperbolic_special_values_and_mixed_lanes ); + +fn run_f64_hyperbolic_signed_zero_semantics() { + let mut lanes = vec![0.0f64; S::Vf64::WIDTH]; + lanes[0] = -0.0; + + let input = S::Vf64::load_from_slice(&lanes); + let sinh = input.sinh_u35(); + let tanh = input.tanh_u35(); + + assert_eq!(sinh[0].to_bits(), (-0.0f64).sinh().to_bits()); + assert_eq!(tanh[0].to_bits(), (-0.0f64).tanh().to_bits()); + + if S::Vf64::WIDTH > 1 { + assert_eq!(sinh[1].to_bits(), 0.0f64.sinh().to_bits()); + assert_eq!(tanh[1].to_bits(), 0.0f64.tanh().to_bits()); + } + + let cosh = input.cosh_u35(); + assert_eq!(cosh[0].to_bits(), (-0.0f64).cosh().to_bits()); +} + +simd_math_targeted_all_backends!( + f64_hyperbolic_signed_zero_semantics, + run_f64_hyperbolic_signed_zero_semantics +); diff --git a/src/tests/simd_math_targeted_edges/inverse_hyperbolic.rs b/src/tests/simd_math_targeted_edges/inverse_hyperbolic.rs index c07c793..cc382e7 100644 --- a/src/tests/simd_math_targeted_edges/inverse_hyperbolic.rs +++ b/src/tests/simd_math_targeted_edges/inverse_hyperbolic.rs @@ -270,3 +270,29 @@ simd_math_targeted_all_backends!( f64_inverse_hyperbolic_mixed_lanes, run_f64_inverse_hyperbolic_mixed_lanes ); + +fn run_f64_inverse_hyperbolic_signed_zero_semantics() { + let mut lanes = vec![0.0f64; S::Vf64::WIDTH]; + lanes[0] = -0.0; + + let input = S::Vf64::load_from_slice(&lanes); + let asinh = input.asinh_u35(); + let atanh = input.atanh_u35(); + + assert_eq!(asinh[0].to_bits(), (-0.0f64).asinh().to_bits()); + assert_eq!(atanh[0].to_bits(), (-0.0f64).atanh().to_bits()); + + if S::Vf64::WIDTH > 1 { + assert_eq!(asinh[1].to_bits(), 0.0f64.asinh().to_bits()); + assert_eq!(atanh[1].to_bits(), 0.0f64.atanh().to_bits()); + } + + let ones = S::Vf64::set1(1.0); + let acosh = ones.acosh_u35(); + assert_eq!(acosh[0].to_bits(), 1.0f64.acosh().to_bits()); +} + +simd_math_targeted_all_backends!( + f64_inverse_hyperbolic_signed_zero_semantics, + run_f64_inverse_hyperbolic_signed_zero_semantics +);