diff --git a/benches/simd_math_remaining_baseline/inverse_hyperbolic.rs b/benches/simd_math_remaining_baseline/inverse_hyperbolic.rs index 2b95ac3..c8d3734 100644 --- a/benches/simd_math_remaining_baseline/inverse_hyperbolic.rs +++ b/benches/simd_math_remaining_baseline/inverse_hyperbolic.rs @@ -4,11 +4,33 @@ use simdeez::{prelude::*, simd_unsafe_generate_all}; use crate::shared::{self, INPUT_LEN}; +#[inline(never)] +fn scalar_asinh_sum(input: &[f32]) -> f32 { + input.iter().copied().map(f32::asinh).sum() +} + +#[inline(never)] +fn scalar_acosh_sum(input: &[f32]) -> f32 { + input.iter().copied().map(f32::acosh).sum() +} + #[inline(never)] fn scalar_atanh_sum(input: &[f32]) -> f32 { input.iter().copied().map(f32::atanh).sum() } +simd_unsafe_generate_all!( + fn simdeez_asinh_sum(input: &[f32]) -> f32 { + shared::simdeez_unary_sum_impl::(input, |v| v.asinh_u35()) + } +); + +simd_unsafe_generate_all!( + fn simdeez_acosh_sum(input: &[f32]) -> f32 { + shared::simdeez_unary_sum_impl::(input, |v| v.acosh_u35()) + } +); + simd_unsafe_generate_all!( fn simdeez_atanh_sum(input: &[f32]) -> f32 { shared::simdeez_unary_sum_impl::(input, |v| v.atanh_u35()) @@ -16,8 +38,26 @@ simd_unsafe_generate_all!( ); pub fn register(c: &mut Criterion) { + let asinh_inputs = shared::make_unary_inputs(INPUT_LEN, 0xDEADB001, -16_384.0..16_384.0); + let acosh_inputs = shared::make_positive_inputs(INPUT_LEN, 0xDEADB002, 1.0, 16_384.0); let atanh_inputs = shared::make_unary_inputs(INPUT_LEN, 0xDEADB003, -0.999_999..0.999_999); + shared::bench_unary( + c, + "simd_math_baseline/f32/asinh_u35", + &asinh_inputs, + scalar_asinh_sum, + simdeez_asinh_sum, + ); + + shared::bench_unary( + c, + "simd_math_baseline/f32/acosh_u35", + &acosh_inputs, + scalar_acosh_sum, + simdeez_acosh_sum, + ); + shared::bench_unary( c, "simd_math_baseline/f32/atanh_u35", diff --git a/src/math/contracts.rs b/src/math/contracts.rs index 66b6132..a32c9d3 100644 --- a/src/math/contracts.rs +++ b/src/math/contracts.rs @@ -16,9 +16,10 @@ pub const ATAN2_U35_F32_MAX_ULP: u32 = 1; pub const SINH_U35_F32_MAX_ULP: u32 = 35; pub const COSH_U35_F32_MAX_ULP: u32 = 35; pub const TANH_U35_F32_MAX_ULP: u32 = 35; -pub const ASINH_U35_F32_MAX_ULP: u32 = 1; -pub const ACOSH_U35_F32_MAX_ULP: u32 = 1; -pub const ATANH_U35_F32_MAX_ULP: u32 = 1; +// Portable f32 inverse-hyperbolic kernels target u35 accuracy. +pub const ASINH_U35_F32_MAX_ULP: u32 = 35; +pub const ACOSH_U35_F32_MAX_ULP: u32 = 35; +pub const ATANH_U35_F32_MAX_ULP: u32 = 35; pub const HYPOT_U35_F32_MAX_ULP: u32 = 1; // f32 log10 now rides the portable log2_u35 kernel and inherits its relaxed envelope. pub const LOG10_U35_F32_MAX_ULP: u32 = 35; diff --git a/src/math/f32/mod.rs b/src/math/f32/mod.rs index ec3f35c..02ca809 100644 --- a/src/math/f32/mod.rs +++ b/src/math/f32/mod.rs @@ -90,6 +90,33 @@ where hyperbolic::tanh_u35(input) } +#[inline(always)] +pub(crate) fn asinh_u35(input: V) -> V +where + V: SimdFloat32, + V::Engine: Simd, +{ + portable::asinh_u35(input) +} + +#[inline(always)] +pub(crate) fn acosh_u35(input: V) -> V +where + V: SimdFloat32, + V::Engine: Simd, +{ + portable::acosh_u35(input) +} + +#[inline(always)] +pub(crate) fn atanh_u35(input: V) -> V +where + V: SimdFloat32, + V::Engine: Simd, +{ + portable::atanh_u35(input) +} + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] #[inline(always)] fn is_avx2_engine() -> bool { diff --git a/src/math/f32/portable.rs b/src/math/f32/portable.rs index 7a8b12d..b39f1d2 100644 --- a/src/math/f32/portable.rs +++ b/src/math/f32/portable.rs @@ -253,3 +253,65 @@ where let fast = sin_fast / cos_fast; patch_exceptional_lanes(input, fast, exceptional_mask, scalar::tan_u35_f32) } + +#[inline(always)] +pub(super) fn asinh_u35(input: V) -> V +where + V: SimdFloat32, + V::Engine: Simd, +{ + let finite_mask = input.cmp_eq(input).bitcast_i32(); + let abs_x = input.abs(); + let tiny_mask = abs_x.cmp_lt(V::set1(0.05)).bitcast_i32(); + let large_mask = abs_x.cmp_gt(V::set1(1.0e19)).bitcast_i32(); + let zero_mask = input.cmp_eq(V::zeroes()).bitcast_i32(); + let exceptional_mask = + finite_mask.cmp_eq(SimdI32::::zeroes()) | tiny_mask | large_mask | zero_mask; + + let radicand = (abs_x * abs_x) + V::set1(1.0); + let magnitude = log2_u35(abs_x + radicand.sqrt()) * V::set1(core::f32::consts::LN_2); + let negative_mask = input.cmp_lt(V::zeroes()); + let fast = negative_mask.blendv(magnitude, -magnitude); + + patch_exceptional_lanes(input, fast, exceptional_mask, scalar::asinh_u35_f32) +} + +#[inline(always)] +pub(super) fn acosh_u35(input: V) -> V +where + V: SimdFloat32, + V::Engine: Simd, +{ + let finite_mask = input.cmp_eq(input).bitcast_i32(); + let in_domain_mask = input.cmp_gte(V::set1(1.0)).bitcast_i32(); + let fast_mask = finite_mask & in_domain_mask; + let exceptional_mask = fast_mask.cmp_eq(SimdI32::::zeroes()); + + let root_term = ((input - V::set1(1.0)).sqrt()) * ((input + V::set1(1.0)).sqrt()); + let fast = log2_u35(input + root_term) * V::set1(core::f32::consts::LN_2); + + patch_exceptional_lanes(input, fast, exceptional_mask, scalar::acosh_u35_f32) +} + +#[inline(always)] +pub(super) fn atanh_u35(input: V) -> V +where + V: SimdFloat32, + V::Engine: Simd, +{ + let finite_mask = input.cmp_eq(input).bitcast_i32(); + let abs_x = input.abs(); + let strict_domain_mask = abs_x.cmp_lt(V::set1(1.0)).bitcast_i32(); + let non_zero_mask = input.cmp_neq(V::zeroes()).bitcast_i32(); + let stable_range_mask = abs_x.cmp_lte(V::set1(0.75)).bitcast_i32(); + let away_from_zero_mask = abs_x.cmp_gte(V::set1(0.05)).bitcast_i32(); + let fast_mask = + finite_mask & strict_domain_mask & non_zero_mask & stable_range_mask & away_from_zero_mask; + let exceptional_mask = fast_mask.cmp_eq(SimdI32::::zeroes()); + + let one = V::set1(1.0); + let ratio = (one + input) / (one - input); + let fast = log2_u35(ratio) * V::set1(0.5 * core::f32::consts::LN_2); + + patch_exceptional_lanes(input, fast, exceptional_mask, scalar::atanh_u35_f32) +} diff --git a/src/math/families/inverse_hyperbolic.rs b/src/math/families/inverse_hyperbolic.rs index 4c69d01..c0a9cfe 100644 --- a/src/math/families/inverse_hyperbolic.rs +++ b/src/math/families/inverse_hyperbolic.rs @@ -1,20 +1,29 @@ -use crate::math::{map, scalar}; -use crate::{SimdFloat32, SimdFloat64}; +use crate::math::{f32, map, scalar}; +use crate::{Simd, SimdFloat32, SimdFloat64}; pub trait SimdMathF32InverseHyperbolic: SimdFloat32 { #[inline(always)] - fn asinh_u35(self) -> Self { - map::unary_f32(self, scalar::asinh_u35_f32) + fn asinh_u35(self) -> Self + where + Self::Engine: Simd, + { + f32::asinh_u35(self) } #[inline(always)] - fn acosh_u35(self) -> Self { - map::unary_f32(self, scalar::acosh_u35_f32) + fn acosh_u35(self) -> Self + where + Self::Engine: Simd, + { + f32::acosh_u35(self) } #[inline(always)] - fn atanh_u35(self) -> Self { - map::unary_f32(self, scalar::atanh_u35_f32) + fn atanh_u35(self) -> Self + where + Self::Engine: Simd, + { + f32::atanh_u35(self) } } diff --git a/src/tests/simd_math_targeted_edges/inverse_hyperbolic.rs b/src/tests/simd_math_targeted_edges/inverse_hyperbolic.rs new file mode 100644 index 0000000..b5390c4 --- /dev/null +++ b/src/tests/simd_math_targeted_edges/inverse_hyperbolic.rs @@ -0,0 +1,137 @@ +use super::*; +use crate::math::SimdMathF32InverseHyperbolic; + +fn run_f32_inverse_hyperbolic_domain_edges() { + let acosh_inputs = [ + 0.0f32, + 0.5, + 1.0 - f32::EPSILON, + 1.0, + 1.0 + f32::EPSILON, + 1.000_001, + 2.0, + 10.0, + f32::INFINITY, + f32::NAN, + ]; + check_targeted_unary_f32::( + "acosh_u35", + &acosh_inputs, + contracts::ACOSH_U35_F32_MAX_ULP, + |v| v.acosh_u35(), + f32::acosh, + ); + + let atanh_inputs = [ + -1.0f32, + -1.0 + f32::EPSILON, + -0.999_999, + -0.5, + -0.0, + 0.0, + 0.5, + 0.999_999, + 1.0 - f32::EPSILON, + 1.0, + 1.0 + f32::EPSILON, + f32::NAN, + ]; + check_targeted_unary_f32::( + "atanh_u35", + &atanh_inputs, + contracts::ATANH_U35_F32_MAX_ULP, + |v| v.atanh_u35(), + f32::atanh, + ); + + let asinh_inputs = [ + f32::NEG_INFINITY, + -1.0e20, + -1000.0, + -1.0, + -f32::MIN_POSITIVE, + -0.0, + 0.0, + f32::MIN_POSITIVE, + 1.0, + 1000.0, + 1.0e20, + f32::INFINITY, + f32::NAN, + ]; + check_targeted_unary_f32::( + "asinh_u35", + &asinh_inputs, + contracts::ASINH_U35_F32_MAX_ULP, + |v| v.asinh_u35(), + f32::asinh, + ); +} + +fn run_f32_inverse_hyperbolic_mixed_lanes() { + let mut lanes = vec![0.0f32; S::Vf32::WIDTH]; + let pattern = [ + -0.75f32, + -0.0, + 0.0, + 0.75, + 0.999_999, + -0.999_999, + 1.0, + -1.0, + 1.0 + f32::EPSILON, + f32::NAN, + f32::INFINITY, + f32::NEG_INFINITY, + 1.0, + 2.0, + 10.0, + 0.5, + ]; + for (i, lane) in lanes.iter_mut().enumerate() { + *lane = pattern[i % pattern.len()]; + } + + let input = S::Vf32::load_from_slice(&lanes); + let asinh = input.asinh_u35(); + let acosh = input.acosh_u35(); + let atanh = input.atanh_u35(); + + for (lane, &x) in lanes.iter().enumerate() { + assert_f32_contract( + "asinh_u35", + x, + asinh[lane], + x.asinh(), + contracts::ASINH_U35_F32_MAX_ULP, + ) + .unwrap_or_else(|e| panic!("lane {lane}: {e}")); + + assert_f32_contract( + "acosh_u35", + x, + acosh[lane], + x.acosh(), + contracts::ACOSH_U35_F32_MAX_ULP, + ) + .unwrap_or_else(|e| panic!("lane {lane}: {e}")); + + assert_f32_contract( + "atanh_u35", + x, + atanh[lane], + x.atanh(), + contracts::ATANH_U35_F32_MAX_ULP, + ) + .unwrap_or_else(|e| panic!("lane {lane}: {e}")); + } +} + +simd_math_targeted_all_backends!( + f32_inverse_hyperbolic_domain_edges, + run_f32_inverse_hyperbolic_domain_edges +); +simd_math_targeted_all_backends!( + f32_inverse_hyperbolic_mixed_lanes, + run_f32_inverse_hyperbolic_mixed_lanes +); diff --git a/src/tests/simd_math_targeted_edges/mod.rs b/src/tests/simd_math_targeted_edges/mod.rs index 756f2ac..babe6b3 100644 --- a/src/tests/simd_math_targeted_edges/mod.rs +++ b/src/tests/simd_math_targeted_edges/mod.rs @@ -140,4 +140,5 @@ macro_rules! simd_math_targeted_all_backends { mod binary_misc; mod core; mod hyperbolic; +mod inverse_hyperbolic; mod inverse_trig;