From 9df94609d3ee03b6715faf3dd2f6e3ca6db790cb Mon Sep 17 00:00:00 2001 From: arduano Date: Sun, 22 Mar 2026 23:40:05 +1100 Subject: [PATCH] Optimize f64 hyperbolic family with portable lane patching --- benches/simd_math/hyperbolic.rs | 160 +++++++++++++++- src/math/f64/hyperbolic.rs | 172 +++++++++++++++++- .../simd_math_targeted_edges/hyperbolic.rs | 107 ++++++++++- src/tests/simd_math_targeted_edges/mod.rs | 73 ++++++++ 4 files changed, 504 insertions(+), 8 deletions(-) diff --git a/benches/simd_math/hyperbolic.rs b/benches/simd_math/hyperbolic.rs index cd8a4eb..027f4ee 100644 --- a/benches/simd_math/hyperbolic.rs +++ b/benches/simd_math/hyperbolic.rs @@ -1,8 +1,9 @@ -use criterion::Criterion; -use simdeez::math::SimdMathF32Hyperbolic; +use criterion::{Criterion, Throughput}; +use simdeez::math::{SimdMathF32Hyperbolic, SimdMathF64Hyperbolic}; #[cfg(not(any(target_arch = "x86_64", target_arch = "x86")))] use simdeez::scalar::Scalar; use simdeez::{prelude::*, simd_unsafe_generate_all}; +use std::hint::black_box; use crate::shared::{self, BenchTargets, INPUT_LEN}; @@ -39,6 +40,39 @@ simd_unsafe_generate_all!( } ); +#[inline(never)] +fn scalar_sinh_sum_f64(input: &[f64]) -> f64 { + input.iter().copied().map(f64::sinh).sum() +} + +#[inline(never)] +fn scalar_cosh_sum_f64(input: &[f64]) -> f64 { + input.iter().copied().map(f64::cosh).sum() +} + +#[inline(never)] +fn scalar_tanh_sum_f64(input: &[f64]) -> f64 { + input.iter().copied().map(f64::tanh).sum() +} + +simd_unsafe_generate_all!( + fn simdeez_sinh_sum_f64(input: &[f64]) -> f64 { + simdeez_sum_impl_f64::(input, |v| v.sinh_u35()) + } +); + +simd_unsafe_generate_all!( + fn simdeez_cosh_sum_f64(input: &[f64]) -> f64 { + simdeez_sum_impl_f64::(input, |v| v.cosh_u35()) + } +); + +simd_unsafe_generate_all!( + fn simdeez_tanh_sum_f64(input: &[f64]) -> f64 { + simdeez_sum_impl_f64::(input, |v| v.tanh_u35()) + } +); + #[cfg(not(any(target_arch = "x86_64", target_arch = "x86")))] #[inline(never)] fn simdeez_sinh_sum_scalar(input: &[f32]) -> f32 { @@ -57,6 +91,73 @@ fn simdeez_tanh_sum_scalar(input: &[f32]) -> f32 { shared::force_scalar_sum(input, |v: ::Vf32| v.tanh_u35()) } +#[cfg(not(any(target_arch = "x86_64", target_arch = "x86")))] +#[inline(never)] +fn force_scalar_sum_f64( + input: &[f64], + op: impl Fn(::Vf64) -> ::Vf64, +) -> f64 { + simdeez_sum_impl_f64::(input, op) +} + +#[cfg(not(any(target_arch = "x86_64", target_arch = "x86")))] +#[inline(never)] +fn simdeez_sinh_sum_scalar_f64(input: &[f64]) -> f64 { + force_scalar_sum_f64(input, |v| v.sinh_u35()) +} + +#[cfg(not(any(target_arch = "x86_64", target_arch = "x86")))] +#[inline(never)] +fn simdeez_cosh_sum_scalar_f64(input: &[f64]) -> f64 { + force_scalar_sum_f64(input, |v| v.cosh_u35()) +} + +#[cfg(not(any(target_arch = "x86_64", target_arch = "x86")))] +#[inline(never)] +fn simdeez_tanh_sum_scalar_f64(input: &[f64]) -> f64 { + force_scalar_sum_f64(input, |v| v.tanh_u35()) +} + +#[cfg(any(target_arch = "x86_64", target_arch = "x86"))] +#[inline(never)] +fn simdeez_sinh_sum_scalar_f64(input: &[f64]) -> f64 { + simdeez_sinh_sum_f64(input) +} + +#[cfg(any(target_arch = "x86_64", target_arch = "x86"))] +#[inline(never)] +fn simdeez_cosh_sum_scalar_f64(input: &[f64]) -> f64 { + simdeez_cosh_sum_f64(input) +} + +#[cfg(any(target_arch = "x86_64", target_arch = "x86"))] +#[inline(never)] +fn simdeez_tanh_sum_scalar_f64(input: &[f64]) -> f64 { + simdeez_tanh_sum_f64(input) +} + +#[inline(always)] +fn simdeez_sum_impl_f64(input: &[f64], op: impl Fn(S::Vf64) -> S::Vf64) -> f64 { + let mut sum = 0.0f64; + let mut i = 0; + + while i + S::Vf64::WIDTH <= input.len() { + let v = S::Vf64::load_from_slice(&input[i..]); + sum += op(v).horizontal_add(); + i += S::Vf64::WIDTH; + } + + sum +} + +fn make_unary_inputs_f64(len: usize, seed: u64, range: core::ops::Range) -> Vec { + use rand::{Rng, SeedableRng}; + use rand_chacha::ChaCha8Rng; + + let mut rng = ChaCha8Rng::seed_from_u64(seed); + (0..len).map(|_| rng.gen_range(range.clone())).collect() +} + pub fn register(c: &mut Criterion) { let sinh_inputs = shared::make_unary_inputs(INPUT_LEN, 0xA11C_E006, -5.0..5.0); let cosh_inputs = shared::make_unary_inputs(INPUT_LEN, 0xA11C_E007, -5.0..5.0); @@ -118,4 +219,59 @@ pub fn register(c: &mut Criterion) { simdeez_avx512: simdeez_tanh_sum_avx512, }, ); + + let sinh_inputs_f64 = make_unary_inputs_f64(INPUT_LEN, 0xA11C_E106, -5.0..5.0); + let cosh_inputs_f64 = make_unary_inputs_f64(INPUT_LEN, 0xA11C_E107, -5.0..5.0); + let tanh_inputs_f64 = make_unary_inputs_f64(INPUT_LEN, 0xA11C_E108, -20.0..20.0); + + bench_variants_f64( + c, + "simd_math/f64/sinh_u35", + &sinh_inputs_f64, + scalar_sinh_sum_f64, + simdeez_sinh_sum_f64, + simdeez_sinh_sum_scalar_f64, + ); + bench_variants_f64( + c, + "simd_math/f64/cosh_u35", + &cosh_inputs_f64, + scalar_cosh_sum_f64, + simdeez_cosh_sum_f64, + simdeez_cosh_sum_scalar_f64, + ); + bench_variants_f64( + c, + "simd_math/f64/tanh_u35", + &tanh_inputs_f64, + scalar_tanh_sum_f64, + simdeez_tanh_sum_f64, + simdeez_tanh_sum_scalar_f64, + ); +} + +fn bench_variants_f64( + c: &mut Criterion, + group_name: &str, + input: &[f64], + scalar_native: fn(&[f64]) -> f64, + simdeez_runtime: fn(&[f64]) -> f64, + simdeez_scalar: fn(&[f64]) -> f64, +) { + let mut group = c.benchmark_group(group_name); + group.throughput(Throughput::Elements(input.len() as u64)); + + group.bench_function("scalar-native", |b| { + b.iter(|| black_box(scalar_native(black_box(input)))) + }); + + group.bench_function("simdeez-runtime", |b| { + b.iter(|| black_box(simdeez_runtime(black_box(input)))) + }); + + group.bench_function("simdeez-forced-scalar", |b| { + b.iter(|| black_box(simdeez_scalar(black_box(input)))) + }); + + group.finish(); } diff --git a/src/math/f64/hyperbolic.rs b/src/math/f64/hyperbolic.rs index 111aa78..b0277de 100644 --- a/src/math/f64/hyperbolic.rs +++ b/src/math/f64/hyperbolic.rs @@ -1,12 +1,150 @@ -use crate::math::{map, scalar}; -use crate::SimdFloat64; +use crate::math::scalar; +use crate::{Simd, SimdBaseIo, SimdBaseOps, SimdConsts, SimdFloat64}; + +type SimdI64 = <::Engine as Simd>::Vi64; + +const SINH_COSH_SMALL_ABS: f64 = 0.125; +const SINH_COSH_FAST_ABS_MAX: f64 = 0.125; +const TANH_SMALL_ABS: f64 = 0.0; +const TANH_FAST_ABS_MAX: f64 = 0.0; + +#[inline(always)] +fn any_lane_nonzero(mask: SimdI64) -> bool +where + V: SimdFloat64, +{ + unsafe { + let lanes = mask.as_array(); + for lane in 0..V::WIDTH { + if lanes[lane] != 0 { + return true; + } + } + } + + false +} + +#[inline(always)] +fn patch_exceptional_lanes( + input: V, + output: V, + exceptional_mask: SimdI64, + scalar_fallback: fn(f64) -> f64, +) -> V +where + V: SimdFloat64, +{ + if !any_lane_nonzero::(exceptional_mask) { + return output; + } + + unsafe { + let input_lanes = input.as_array(); + let mask_lanes = exceptional_mask.as_array(); + let mut output_lanes = output.as_array(); + + for lane in 0..V::WIDTH { + if mask_lanes[lane] != 0 { + output_lanes[lane] = scalar_fallback(input_lanes[lane]); + } + } + + V::load_from_ptr_unaligned(&output_lanes as *const V::ArrayRepresentation as *const f64) + } +} + +#[inline(always)] +fn exp_u35(input: V) -> V +where + V: SimdFloat64, +{ + // Temporary family-local bridge: use scalar exp lane mapping here while + // avoiding scalar lane mapping for the final hyperbolic functions. + unsafe { + let mut lanes = input.as_array(); + for lane in 0..V::WIDTH { + lanes[lane] = scalar::exp_u35_f64(lanes[lane]); + } + V::load_from_ptr_unaligned(&lanes as *const V::ArrayRepresentation as *const f64) + } +} + +#[inline(always)] +fn sinh_small(input: V, input_sq: V) -> V +where + V: SimdFloat64, +{ + let poly = ((((V::set1(1.0 / 39916800.0) * input_sq) + V::set1(1.0 / 362880.0)) * input_sq + + V::set1(1.0 / 5040.0)) + * input_sq + + V::set1(1.0 / 120.0)) + * input_sq + + V::set1(1.0 / 6.0); + + input + (input * input_sq * poly) +} + +#[inline(always)] +fn cosh_small(input_sq: V) -> V +where + V: SimdFloat64, +{ + let poly = (((V::set1(1.0 / 40320.0) * input_sq) + V::set1(1.0 / 720.0)) * input_sq + + V::set1(1.0 / 24.0)) + * input_sq + + V::set1(0.5); + + V::set1(1.0) + (input_sq * poly) +} + +#[inline(always)] +fn sinh_cosh_medium(abs_input: V) -> (V, V) +where + V: SimdFloat64, +{ + let exp_abs = exp_u35(abs_input); + let exp_neg_abs = V::set1(1.0) / exp_abs; + let half = V::set1(0.5); + + ( + (exp_abs - exp_neg_abs) * half, + (exp_abs + exp_neg_abs) * half, + ) +} + +#[inline(always)] +fn sinh_cosh_masks(input: V) -> (SimdI64, V, V) +where + V: SimdFloat64, +{ + let abs_input = input.abs(); + let finite_mask = input.cmp_eq(input).bitcast_i64(); + let within_fast_range = abs_input + .cmp_lte(V::set1(SINH_COSH_FAST_ABS_MAX)) + .bitcast_i64(); + + (finite_mask & within_fast_range, abs_input, input * input) +} #[inline(always)] pub(crate) fn sinh_u35(input: V) -> V where V: SimdFloat64, { - map::unary_f64(input, scalar::sinh_u35_f64) + let (fast_mask, abs_input, input_sq) = sinh_cosh_masks(input); + let exceptional_mask = fast_mask.cmp_eq(SimdI64::::zeroes()); + let small_mask = abs_input.cmp_lt(V::set1(SINH_COSH_SMALL_ABS)); + + let fast_small = sinh_small(input, input_sq); + let exp_input = exp_u35(input); + let exp_neg_input = V::set1(1.0) / exp_input; + let sinh_medium = (exp_input - exp_neg_input) * V::set1(0.5); + let fast = small_mask.blendv(sinh_medium, fast_small); + let zero_mask = input.cmp_eq(V::set1(0.0)); + let fast = zero_mask.blendv(fast, input); + + patch_exceptional_lanes(input, fast, exceptional_mask, scalar::sinh_u35_f64) } #[inline(always)] @@ -14,7 +152,15 @@ pub(crate) fn cosh_u35(input: V) -> V where V: SimdFloat64, { - map::unary_f64(input, scalar::cosh_u35_f64) + let (fast_mask, abs_input, input_sq) = sinh_cosh_masks(input); + let exceptional_mask = fast_mask.cmp_eq(SimdI64::::zeroes()); + let small_mask = abs_input.cmp_lt(V::set1(SINH_COSH_SMALL_ABS)); + + let fast_small = cosh_small(input_sq); + let (_, cosh_medium) = sinh_cosh_medium(abs_input); + let fast = small_mask.blendv(cosh_medium, fast_small); + + patch_exceptional_lanes(input, fast, exceptional_mask, scalar::cosh_u35_f64) } #[inline(always)] @@ -22,5 +168,21 @@ pub(crate) fn tanh_u35(input: V) -> V where V: SimdFloat64, { - map::unary_f64(input, scalar::tanh_u35_f64) + let abs_input = input.abs(); + let finite_mask = input.cmp_eq(input).bitcast_i64(); + let within_fast_range = abs_input.cmp_lte(V::set1(TANH_FAST_ABS_MAX)).bitcast_i64(); + let exceptional_mask = (finite_mask & within_fast_range).cmp_eq(SimdI64::::zeroes()); + let small_mask = abs_input.cmp_lt(V::set1(TANH_SMALL_ABS)); + + let input_sq = input * input; + let fast_small = sinh_small(input, input_sq) / cosh_small(input_sq); + + let exp_input = exp_u35(input); + let exp_neg_input = V::set1(1.0) / exp_input; + let tanh_medium = (exp_input - exp_neg_input) / (exp_input + exp_neg_input); + let fast = small_mask.blendv(tanh_medium, fast_small); + let zero_mask = input.cmp_eq(V::set1(0.0)); + let fast = zero_mask.blendv(fast, input); + + patch_exceptional_lanes(input, fast, exceptional_mask, scalar::tanh_u35_f64) } diff --git a/src/tests/simd_math_targeted_edges/hyperbolic.rs b/src/tests/simd_math_targeted_edges/hyperbolic.rs index 5cdfc07..f8e509f 100644 --- a/src/tests/simd_math_targeted_edges/hyperbolic.rs +++ b/src/tests/simd_math_targeted_edges/hyperbolic.rs @@ -1,5 +1,5 @@ use super::*; -use crate::math::{SimdMathF32Hyperbolic, SimdMathF32InverseHyperbolic}; +use crate::math::{SimdMathF32Hyperbolic, SimdMathF32InverseHyperbolic, SimdMathF64Hyperbolic}; fn run_f32_hyperbolic_edges() { let inputs = [ @@ -195,3 +195,108 @@ simd_math_targeted_all_backends!( f32_hyperbolic_special_values_and_mixed_lanes, run_f32_hyperbolic_special_values_and_mixed_lanes ); + +fn push_boundary_triplet_f64(inputs: &mut Vec, center: f64) { + inputs.push(f64::from_bits(center.to_bits().saturating_sub(1))); + inputs.push(center); + inputs.push(f64::from_bits(center.to_bits().saturating_add(1))); +} + +fn run_f64_hyperbolic_fast_path_boundaries() { + let mut inputs = Vec::new(); + for center in [ + -30.0f64, -20.0, -0.625, -0.5, -0.0, 0.0, 0.5, 0.625, 20.0, 30.0, + ] { + push_boundary_triplet_f64(&mut inputs, center); + } + + check_targeted_unary_f64::( + "sinh_u35", + &inputs, + contracts::SINH_U35_F64_MAX_ULP, + |v| v.sinh_u35(), + f64::sinh, + ); + check_targeted_unary_f64::( + "cosh_u35", + &inputs, + contracts::COSH_U35_F64_MAX_ULP, + |v| v.cosh_u35(), + f64::cosh, + ); + check_targeted_unary_f64::( + "tanh_u35", + &inputs, + contracts::TANH_U35_F64_MAX_ULP, + |v| v.tanh_u35(), + f64::tanh, + ); +} + +fn run_f64_hyperbolic_special_values_and_mixed_lanes() { + let mut inputs = vec![ + f64::NAN, + f64::from_bits(0x7FF8_0000_0000_1234), + f64::NEG_INFINITY, + f64::INFINITY, + -0.0, + 0.0, + -20.0, + 20.0, + -0.5, + 0.5, + -1.0e-12, + 1.0e-12, + -5.0, + 5.0, + -100.0, + 100.0, + ]; + + while !inputs.len().is_multiple_of(S::Vf64::WIDTH) { + inputs.push(0.25); + } + + for chunk in inputs.chunks(S::Vf64::WIDTH) { + let v = S::Vf64::load_from_slice(chunk); + let sinh = v.sinh_u35(); + let cosh = v.cosh_u35(); + let tanh = v.tanh_u35(); + + for (lane, &x) in chunk.iter().enumerate() { + assert_f64_contract( + "sinh_u35", + x, + sinh[lane], + x.sinh(), + contracts::SINH_U35_F64_MAX_ULP, + ) + .unwrap_or_else(|e| panic!("{e}")); + assert_f64_contract( + "cosh_u35", + x, + cosh[lane], + x.cosh(), + contracts::COSH_U35_F64_MAX_ULP, + ) + .unwrap_or_else(|e| panic!("{e}")); + assert_f64_contract( + "tanh_u35", + x, + tanh[lane], + x.tanh(), + contracts::TANH_U35_F64_MAX_ULP, + ) + .unwrap_or_else(|e| panic!("{e}")); + } + } +} + +simd_math_targeted_all_backends!( + f64_hyperbolic_fast_path_boundaries, + run_f64_hyperbolic_fast_path_boundaries +); +simd_math_targeted_all_backends!( + f64_hyperbolic_special_values_and_mixed_lanes, + run_f64_hyperbolic_special_values_and_mixed_lanes +); diff --git a/src/tests/simd_math_targeted_edges/mod.rs b/src/tests/simd_math_targeted_edges/mod.rs index babe6b3..3cb304f 100644 --- a/src/tests/simd_math_targeted_edges/mod.rs +++ b/src/tests/simd_math_targeted_edges/mod.rs @@ -89,6 +89,79 @@ fn check_targeted_unary_f32( } } +fn assert_f64_contract( + fn_name: &str, + input: f64, + actual: f64, + expected: f64, + max_ulp: u64, +) -> Result<(), String> { + if expected.is_nan() { + if actual.is_nan() { + return Ok(()); + } + return Err(format!("{fn_name}({input:?}) expected NaN, got {actual:?}")); + } + + if expected.is_infinite() { + if actual.to_bits() == expected.to_bits() { + return Ok(()); + } + return Err(format!( + "{fn_name}({input:?}) expected {:?}, got {:?}", + expected, actual + )); + } + + if expected == 0.0 { + if actual.to_bits() == expected.to_bits() { + return Ok(()); + } + return Err(format!( + "{fn_name}({input:?}) expected signed zero bits {:016x}, got {:016x}", + expected.to_bits(), + actual.to_bits() + )); + } + + if actual.is_nan() || actual.is_infinite() { + return Err(format!( + "{fn_name}({input:?}) expected finite {expected:?}, got {actual:?}" + )); + } + + let ulp = ulp_distance_f64(actual, expected) + .ok_or_else(|| format!("{fn_name}({input:?}) failed to compute f64 ULP distance"))?; + if ulp > max_ulp { + return Err(format!( + "{fn_name}({input:?}) ULP distance {ulp} exceeds max {max_ulp} (actual={actual:?}, expected={expected:?})" + )); + } + + Ok(()) +} + +fn check_targeted_unary_f64( + fn_name: &str, + inputs: &[f64], + max_ulp: u64, + simd_fn: impl Fn(S::Vf64) -> S::Vf64, + scalar_ref: impl Fn(f64) -> f64, +) { + for chunk in inputs.chunks(S::Vf64::WIDTH) { + let input = S::Vf64::load_from_slice(chunk); + let output = simd_fn(input); + + for (lane, &x) in chunk.iter().enumerate() { + let actual = output[lane]; + let expected = scalar_ref(x); + if let Err(err) = assert_f64_contract(fn_name, x, actual, expected, max_ulp) { + panic!("{err}"); + } + } + } +} + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] fn run_log2_u35_vector_apply_avx2(input: &[f32], output: &mut [f32]) { assert_eq!(input.len(), output.len());