diff --git a/benches/simd_math/inverse_trig.rs b/benches/simd_math/inverse_trig.rs index 6fddcfd..e4d32f7 100644 --- a/benches/simd_math/inverse_trig.rs +++ b/benches/simd_math/inverse_trig.rs @@ -1,10 +1,9 @@ use criterion::Criterion; -use simdeez::math::SimdMathF32InverseTrig; -#[cfg(not(any(target_arch = "x86_64", target_arch = "x86")))] +use simdeez::math::{SimdMathF32InverseTrig, SimdMathF64InverseTrig}; use simdeez::scalar::Scalar; use simdeez::{prelude::*, simd_unsafe_generate_all}; -use crate::shared::{self, BenchTargets, INPUT_LEN}; +use crate::shared::{self, BenchTargets, BenchTargetsF64, INPUT_LEN}; #[inline(never)] fn scalar_asin_sum(input: &[f32]) -> f32 { @@ -21,6 +20,21 @@ fn scalar_atan_sum(input: &[f32]) -> f32 { input.iter().copied().map(f32::atan).sum() } +#[inline(never)] +fn scalar_asin_sum_f64(input: &[f64]) -> f64 { + input.iter().copied().map(f64::asin).sum() +} + +#[inline(never)] +fn scalar_acos_sum_f64(input: &[f64]) -> f64 { + input.iter().copied().map(f64::acos).sum() +} + +#[inline(never)] +fn scalar_atan_sum_f64(input: &[f64]) -> f64 { + input.iter().copied().map(f64::atan).sum() +} + simd_unsafe_generate_all!( fn simdeez_asin_sum(input: &[f32]) -> f32 { shared::simdeez_sum_impl::(input, |v| v.asin_u35()) @@ -39,27 +53,59 @@ simd_unsafe_generate_all!( } ); -#[cfg(not(any(target_arch = "x86_64", target_arch = "x86")))] +simd_unsafe_generate_all!( + fn simdeez_asin_sum_f64(input: &[f64]) -> f64 { + shared::simdeez_sum_impl_f64::(input, |v| v.asin_u35()) + } +); + +simd_unsafe_generate_all!( + fn simdeez_acos_sum_f64(input: &[f64]) -> f64 { + shared::simdeez_sum_impl_f64::(input, |v| v.acos_u35()) + } +); + +simd_unsafe_generate_all!( + fn simdeez_atan_sum_f64(input: &[f64]) -> f64 { + shared::simdeez_sum_impl_f64::(input, |v| v.atan_u35()) + } +); + #[inline(never)] -fn simdeez_asin_sum_scalar(input: &[f32]) -> f32 { +fn forced_scalar_asin_sum(input: &[f32]) -> f32 { shared::force_scalar_sum(input, |v: ::Vf32| v.asin_u35()) } -#[cfg(not(any(target_arch = "x86_64", target_arch = "x86")))] #[inline(never)] -fn simdeez_acos_sum_scalar(input: &[f32]) -> f32 { +fn forced_scalar_acos_sum(input: &[f32]) -> f32 { shared::force_scalar_sum(input, |v: ::Vf32| v.acos_u35()) } -#[cfg(not(any(target_arch = "x86_64", target_arch = "x86")))] #[inline(never)] -fn simdeez_atan_sum_scalar(input: &[f32]) -> f32 { +fn forced_scalar_atan_sum(input: &[f32]) -> f32 { shared::force_scalar_sum(input, |v: ::Vf32| v.atan_u35()) } +#[inline(never)] +fn forced_scalar_asin_sum_f64(input: &[f64]) -> f64 { + shared::force_scalar_sum_f64(input, |v: ::Vf64| v.asin_u35()) +} + +#[inline(never)] +fn forced_scalar_acos_sum_f64(input: &[f64]) -> f64 { + shared::force_scalar_sum_f64(input, |v: ::Vf64| v.acos_u35()) +} + +#[inline(never)] +fn forced_scalar_atan_sum_f64(input: &[f64]) -> f64 { + shared::force_scalar_sum_f64(input, |v: ::Vf64| v.atan_u35()) +} + pub fn register(c: &mut Criterion) { let inverse_inputs = shared::make_inverse_trig_inputs(INPUT_LEN, 0xA11C_E101); let atan_inputs = shared::make_atan_inputs(INPUT_LEN, 0xA11C_E102); + let inverse_inputs_f64 = shared::make_inverse_trig_inputs_f64(INPUT_LEN, 0xA11C_E201); + let atan_inputs_f64 = shared::make_atan_inputs_f64(INPUT_LEN, 0xA11C_E202); shared::bench_variants( c, @@ -68,7 +114,7 @@ pub fn register(c: &mut Criterion) { BenchTargets { scalar_native: scalar_asin_sum, simdeez_runtime: simdeez_asin_sum, - simdeez_scalar: simdeez_asin_sum_scalar, + simdeez_scalar: forced_scalar_asin_sum, #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] simdeez_sse2: simdeez_asin_sum_sse2, #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] @@ -87,7 +133,7 @@ pub fn register(c: &mut Criterion) { BenchTargets { scalar_native: scalar_acos_sum, simdeez_runtime: simdeez_acos_sum, - simdeez_scalar: simdeez_acos_sum_scalar, + simdeez_scalar: forced_scalar_acos_sum, #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] simdeez_sse2: simdeez_acos_sum_sse2, #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] @@ -106,7 +152,7 @@ pub fn register(c: &mut Criterion) { BenchTargets { scalar_native: scalar_atan_sum, simdeez_runtime: simdeez_atan_sum, - simdeez_scalar: simdeez_atan_sum_scalar, + simdeez_scalar: forced_scalar_atan_sum, #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] simdeez_sse2: simdeez_atan_sum_sse2, #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] @@ -117,4 +163,61 @@ pub fn register(c: &mut Criterion) { simdeez_avx512: simdeez_atan_sum_avx512, }, ); + + shared::bench_variants_f64( + c, + "simd_math/f64/asin_u35", + &inverse_inputs_f64, + BenchTargetsF64 { + scalar_native: scalar_asin_sum_f64, + simdeez_runtime: simdeez_asin_sum_f64, + simdeez_scalar: forced_scalar_asin_sum_f64, + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + simdeez_sse2: simdeez_asin_sum_f64_sse2, + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + simdeez_sse41: simdeez_asin_sum_f64_sse41, + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + simdeez_avx2: simdeez_asin_sum_f64_avx2, + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + simdeez_avx512: simdeez_asin_sum_f64_avx512, + }, + ); + + shared::bench_variants_f64( + c, + "simd_math/f64/acos_u35", + &inverse_inputs_f64, + BenchTargetsF64 { + scalar_native: scalar_acos_sum_f64, + simdeez_runtime: simdeez_acos_sum_f64, + simdeez_scalar: forced_scalar_acos_sum_f64, + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + simdeez_sse2: simdeez_acos_sum_f64_sse2, + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + simdeez_sse41: simdeez_acos_sum_f64_sse41, + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + simdeez_avx2: simdeez_acos_sum_f64_avx2, + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + simdeez_avx512: simdeez_acos_sum_f64_avx512, + }, + ); + + shared::bench_variants_f64( + c, + "simd_math/f64/atan_u35", + &atan_inputs_f64, + BenchTargetsF64 { + scalar_native: scalar_atan_sum_f64, + simdeez_runtime: simdeez_atan_sum_f64, + simdeez_scalar: forced_scalar_atan_sum_f64, + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + simdeez_sse2: simdeez_atan_sum_f64_sse2, + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + simdeez_sse41: simdeez_atan_sum_f64_sse41, + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + simdeez_avx2: simdeez_atan_sum_f64_avx2, + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + simdeez_avx512: simdeez_atan_sum_f64_avx512, + }, + ); } diff --git a/benches/simd_math/shared.rs b/benches/simd_math/shared.rs index e98b0e5..b122c8f 100644 --- a/benches/simd_math/shared.rs +++ b/benches/simd_math/shared.rs @@ -2,7 +2,6 @@ use criterion::{Criterion, Throughput}; use rand::{Rng, SeedableRng}; use rand_chacha::ChaCha8Rng; use simdeez::prelude::*; -#[cfg(not(any(target_arch = "x86_64", target_arch = "x86")))] use simdeez::scalar::Scalar; use std::hint::black_box; @@ -48,11 +47,21 @@ pub fn make_inverse_trig_inputs(len: usize, seed: u64) -> Vec { (0..len).map(|_| rng.gen_range(-1.0f32..1.0f32)).collect() } +pub fn make_inverse_trig_inputs_f64(len: usize, seed: u64) -> Vec { + let mut rng = ChaCha8Rng::seed_from_u64(seed); + (0..len).map(|_| rng.gen_range(-1.0f64..1.0f64)).collect() +} + pub fn make_atan_inputs(len: usize, seed: u64) -> Vec { let mut rng = ChaCha8Rng::seed_from_u64(seed); (0..len).map(|_| rng.gen_range(-64.0f32..64.0f32)).collect() } +pub fn make_atan_inputs_f64(len: usize, seed: u64) -> Vec { + let mut rng = ChaCha8Rng::seed_from_u64(seed); + (0..len).map(|_| rng.gen_range(-64.0f64..64.0f64)).collect() +} + pub fn make_tan_inputs(len: usize, seed: u64) -> Vec { let mut rng = ChaCha8Rng::seed_from_u64(seed); let half_pi = core::f32::consts::FRAC_PI_2; @@ -87,6 +96,20 @@ pub struct BenchTargets { pub simdeez_avx512: unsafe fn(&[f32]) -> f32, } +pub struct BenchTargetsF64 { + pub scalar_native: fn(&[f64]) -> f64, + pub simdeez_runtime: fn(&[f64]) -> f64, + pub simdeez_scalar: fn(&[f64]) -> f64, + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + pub simdeez_sse2: unsafe fn(&[f64]) -> f64, + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + pub simdeez_sse41: unsafe fn(&[f64]) -> f64, + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + pub simdeez_avx2: unsafe fn(&[f64]) -> f64, + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + pub simdeez_avx512: unsafe fn(&[f64]) -> f64, +} + pub fn bench_variants(c: &mut Criterion, group_name: &str, input: &[f32], targets: BenchTargets) { let mut group = c.benchmark_group(group_name); group.throughput(Throughput::Elements(input.len() as u64)); @@ -146,15 +169,83 @@ pub fn bench_variants(c: &mut Criterion, group_name: &str, input: &[f32], target group.finish(); } -#[cfg(not(any(target_arch = "x86_64", target_arch = "x86")))] +pub fn bench_variants_f64( + c: &mut Criterion, + group_name: &str, + input: &[f64], + targets: BenchTargetsF64, +) { + let mut group = c.benchmark_group(group_name); + group.throughput(Throughput::Elements(input.len() as u64)); + + group.bench_function("scalar-native", |b| { + b.iter(|| black_box((targets.scalar_native)(black_box(input)))) + }); + + group.bench_function("simdeez-runtime", |b| { + b.iter(|| black_box((targets.simdeez_runtime)(black_box(input)))) + }); + + group.bench_function("simdeez-forced-scalar", |b| { + b.iter(|| black_box((targets.simdeez_scalar)(black_box(input)))) + }); + + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + { + if std::is_x86_feature_detected!("sse2") { + group.bench_function("simdeez-forced-sse2", |b| { + b.iter(|| unsafe { black_box((targets.simdeez_sse2)(black_box(input))) }) + }); + } else { + eprintln!("[bench] skipped simdeez-forced-sse2 for {group_name}: CPU lacks sse2"); + } + + if std::is_x86_feature_detected!("sse4.1") { + group.bench_function("simdeez-forced-sse41", |b| { + b.iter(|| unsafe { black_box((targets.simdeez_sse41)(black_box(input))) }) + }); + } else { + eprintln!("[bench] skipped simdeez-forced-sse41 for {group_name}: CPU lacks sse4.1"); + } + + if std::is_x86_feature_detected!("avx2") && std::is_x86_feature_detected!("fma") { + group.bench_function("simdeez-forced-avx2", |b| { + b.iter(|| unsafe { black_box((targets.simdeez_avx2)(black_box(input))) }) + }); + } else { + eprintln!("[bench] skipped simdeez-forced-avx2 for {group_name}: CPU lacks avx2/fma"); + } + + if std::is_x86_feature_detected!("avx512f") + && std::is_x86_feature_detected!("avx512bw") + && std::is_x86_feature_detected!("avx512dq") + { + group.bench_function("simdeez-forced-avx512", |b| { + b.iter(|| unsafe { black_box((targets.simdeez_avx512)(black_box(input))) }) + }); + } else { + eprintln!( + "[bench] skipped simdeez-forced-avx512 for {group_name}: CPU lacks avx512f+bw+dq" + ); + } + } + + group.finish(); +} + type ScalarVf32 = ::Vf32; +type ScalarVf64 = ::Vf64; -#[cfg(not(any(target_arch = "x86_64", target_arch = "x86")))] #[inline(never)] pub fn force_scalar_sum(input: &[f32], op: impl Fn(ScalarVf32) -> ScalarVf32) -> f32 { simdeez_sum_impl::(input, op) } +#[inline(never)] +pub fn force_scalar_sum_f64(input: &[f64], op: impl Fn(ScalarVf64) -> ScalarVf64) -> f64 { + simdeez_sum_impl_f64::(input, op) +} + #[inline(always)] pub fn simdeez_sum_impl(input: &[f32], op: impl Fn(S::Vf32) -> S::Vf32) -> f32 { let mut sum = 0.0f32; @@ -168,3 +259,17 @@ pub fn simdeez_sum_impl(input: &[f32], op: impl Fn(S::Vf32) -> S::Vf32) sum } + +#[inline(always)] +pub fn simdeez_sum_impl_f64(input: &[f64], op: impl Fn(S::Vf64) -> S::Vf64) -> f64 { + let mut sum = 0.0f64; + let mut i = 0; + + while i + S::Vf64::WIDTH <= input.len() { + let v = S::Vf64::load_from_slice(&input[i..]); + sum += op(v).horizontal_add(); + i += S::Vf64::WIDTH; + } + + sum +} diff --git a/src/math/contracts.rs b/src/math/contracts.rs index a32c9d3..311f6d2 100644 --- a/src/math/contracts.rs +++ b/src/math/contracts.rs @@ -32,9 +32,11 @@ pub const SIN_U35_F64_MAX_ULP: u64 = 1; pub const COS_U35_F64_MAX_ULP: u64 = 1; pub const TAN_U35_F64_MAX_ULP: u64 = 1; -pub const ASIN_U35_F64_MAX_ULP: u64 = 1; -pub const ACOS_U35_F64_MAX_ULP: u64 = 1; -pub const ATAN_U35_F64_MAX_ULP: u64 = 1; +// Portable f64 inverse-trig kernels now use the family-local SIMD implementation +// rather than scalar lane mapping, so they carry the honest u35 contract. +pub const ASIN_U35_F64_MAX_ULP: u64 = 35; +pub const ACOS_U35_F64_MAX_ULP: u64 = 35; +pub const ATAN_U35_F64_MAX_ULP: u64 = 35; pub const ATAN2_U35_F64_MAX_ULP: u64 = 1; pub const SINH_U35_F64_MAX_ULP: u64 = 1; pub const COSH_U35_F64_MAX_ULP: u64 = 1; diff --git a/src/math/f64/inverse_trig.rs b/src/math/f64/inverse_trig.rs index 288cd49..43a8962 100644 --- a/src/math/f64/inverse_trig.rs +++ b/src/math/f64/inverse_trig.rs @@ -1,26 +1,29 @@ -use crate::math::{map, scalar}; -use crate::SimdFloat64; +use crate::math::families::inverse_trig::portable_f64; +use crate::{Simd, SimdFloat64}; #[inline(always)] pub(crate) fn asin_u35(input: V) -> V where V: SimdFloat64, + V::Engine: Simd, { - map::unary_f64(input, scalar::asin_u35_f64) + portable_f64::asin_u35(input) } #[inline(always)] pub(crate) fn acos_u35(input: V) -> V where V: SimdFloat64, + V::Engine: Simd, { - map::unary_f64(input, scalar::acos_u35_f64) + portable_f64::acos_u35(input) } #[inline(always)] pub(crate) fn atan_u35(input: V) -> V where V: SimdFloat64, + V::Engine: Simd, { - map::unary_f64(input, scalar::atan_u35_f64) + portable_f64::atan_u35(input) } diff --git a/src/math/families/inverse_trig/mod.rs b/src/math/families/inverse_trig/mod.rs index a5de564..e1a4d5b 100644 --- a/src/math/families/inverse_trig/mod.rs +++ b/src/math/families/inverse_trig/mod.rs @@ -1,4 +1,5 @@ mod portable_f32; +pub(crate) mod portable_f64; use crate::math::f64; use crate::{Simd, SimdFloat32, SimdFloat64}; @@ -33,17 +34,26 @@ impl SimdMathF32InverseTrig for T {} pub trait SimdMathF64InverseTrig: SimdFloat64 { #[inline(always)] - fn asin_u35(self) -> Self { + fn asin_u35(self) -> Self + where + Self::Engine: Simd, + { f64::asin_u35(self) } #[inline(always)] - fn acos_u35(self) -> Self { + fn acos_u35(self) -> Self + where + Self::Engine: Simd, + { f64::acos_u35(self) } #[inline(always)] - fn atan_u35(self) -> Self { + fn atan_u35(self) -> Self + where + Self::Engine: Simd, + { f64::atan_u35(self) } } diff --git a/src/math/families/inverse_trig/portable_f64.rs b/src/math/families/inverse_trig/portable_f64.rs new file mode 100644 index 0000000..817b39b --- /dev/null +++ b/src/math/families/inverse_trig/portable_f64.rs @@ -0,0 +1,237 @@ +use crate::math::scalar; +use crate::{Simd, SimdBaseIo, SimdBaseOps, SimdConsts, SimdFloat64, SimdInt64}; + +type SimdI64 = <::Engine as Simd>::Vi64; + +const F64_EXPONENT_MASK: i64 = 0x7FF0_0000_0000_0000u64 as i64; +const F64_SIGN_MASK: i64 = i64::MIN; + +const ASIN_SQRT_FALLBACK_BOUND_BITS: u64 = 0x3FEF_F000_0000_0000; + +const FRAC_PI_2: f64 = core::f64::consts::FRAC_PI_2; +const FRAC_PI_4: f64 = core::f64::consts::FRAC_PI_4; +const PI: f64 = core::f64::consts::PI; + +const TAN_PI_8: f64 = 0.414_213_562_373_095_03; +const TAN_3PI_8: f64 = 2.414_213_562_373_095; + +#[inline(always)] +fn any_lane_nonzero(mask: SimdI64) -> bool +where + V: SimdFloat64, + V::Engine: Simd, +{ + unsafe { + let lanes = mask.as_array(); + for lane in 0..V::WIDTH { + if lanes[lane] != 0 { + return true; + } + } + } + + false +} + +#[inline(always)] +fn patch_exceptional_lanes( + input: V, + output: V, + exceptional_mask: SimdI64, + scalar_fallback: fn(f64) -> f64, +) -> V +where + V: SimdFloat64, + V::Engine: Simd, +{ + if !any_lane_nonzero::(exceptional_mask) { + return output; + } + + unsafe { + let input_lanes = input.as_array(); + let mask_lanes = exceptional_mask.as_array(); + let mut output_lanes = output.as_array(); + + for lane in 0..V::WIDTH { + if mask_lanes[lane] != 0 { + output_lanes[lane] = scalar_fallback(input_lanes[lane]); + } + } + + V::load_from_ptr_unaligned(&output_lanes as *const V::ArrayRepresentation as *const f64) + } +} + +#[inline(always)] +fn restore_sign(magnitude: V, sign_bits: SimdI64) -> V +where + V: SimdFloat64, + V::Engine: Simd, +{ + (magnitude.bitcast_i64() ^ sign_bits).bitcast_f64() +} + +#[inline(always)] +fn finite_mask(input: V) -> SimdI64 +where + V: SimdFloat64, + V::Engine: Simd, +{ + let exponent_bits = input.bitcast_i64() & SimdI64::::set1(F64_EXPONENT_MASK); + exponent_bits.cmp_neq(SimdI64::::set1(F64_EXPONENT_MASK)) +} + +#[inline(always)] +fn atan_poly(z: V) -> V +where + V: SimdFloat64, + V::Engine: Simd, +{ + let p0 = V::set1(-8.750_608_600_031_904e-1); + let p1 = V::set1(-1.615_753_718_733_365_2e1); + let p2 = V::set1(-7.500_855_792_314_705e1); + let p3 = V::set1(-1.228_866_684_490_136_2e2); + let p4 = V::set1(-6.485_021_904_942_025e1); + + let q0 = V::set1(2.485_846_490_142_306_4e1); + let q1 = V::set1(1.650_270_098_316_988_6e2); + let q2 = V::set1(4.328_810_604_912_903e2); + let q3 = V::set1(4.853_903_996_359_137e2); + let q4 = V::set1(1.945_506_571_482_614e2); + + let numerator = (((((p0 * z) + p1) * z) + p2) * z + p3) * z + p4; + let denominator = (((((z + q0) * z) + q1) * z + q2) * z + q3) * z + q4; + numerator / denominator +} + +#[inline(always)] +fn atan_reduced(reduced: V) -> V +where + V: SimdFloat64, + V::Engine: Simd, +{ + let z = reduced * reduced; + reduced + reduced * z * atan_poly(z) +} + +#[inline(always)] +fn atan_fast(input: V) -> V +where + V: SimdFloat64, + V::Engine: Simd, +{ + let sign_bits = input.bitcast_i64() & SimdI64::::set1(F64_SIGN_MASK); + let abs_input = input.abs(); + + let large_mask = abs_input.cmp_gt(V::set1(TAN_3PI_8)).bitcast_i64(); + let medium_base = abs_input.cmp_gt(V::set1(TAN_PI_8)).bitcast_i64(); + let medium_mask = medium_base & large_mask.cmp_eq(SimdI64::::zeroes()); + let large_mask_f = large_mask.bitcast_f64(); + let medium_mask_f = medium_mask.bitcast_f64(); + + let mut reduced = abs_input; + reduced = medium_mask_f.blendv( + reduced, + (abs_input - V::set1(1.0)) / (abs_input + V::set1(1.0)), + ); + reduced = large_mask_f.blendv(reduced, -V::set1(1.0) / abs_input); + + let mut offset = V::zeroes(); + offset = medium_mask_f.blendv(offset, V::set1(FRAC_PI_4)); + offset = large_mask_f.blendv(offset, V::set1(FRAC_PI_2)); + + let magnitude = offset + atan_reduced(reduced); + restore_sign(magnitude, sign_bits) +} + +#[inline(always)] +fn asin_exceptional_mask(input: V) -> SimdI64 +where + V: SimdFloat64, + V::Engine: Simd, +{ + let finite = finite_mask(input); + let abs_input = input.abs(); + let out_of_domain = abs_input.cmp_gt(V::set1(1.0)).bitcast_i64(); + let near_edge = abs_input + .bitcast_i64() + .cmp_gt(SimdI64::::set1(ASIN_SQRT_FALLBACK_BOUND_BITS as i64)); + finite.cmp_eq(SimdI64::::zeroes()) | out_of_domain | near_edge +} + +#[inline(always)] +fn acos_exceptional_mask(input: V) -> SimdI64 +where + V: SimdFloat64, + V::Engine: Simd, +{ + asin_exceptional_mask(input) +} + +#[inline(always)] +fn asin_acos_core(input: V) -> (V, V) +where + V: SimdFloat64, + V::Engine: Simd, +{ + let one = V::set1(1.0); + let half = V::set1(0.5); + let abs_input = input.abs(); + let sign_bits = input.bitcast_i64() & SimdI64::::set1(F64_SIGN_MASK); + + let small_mask = abs_input.cmp_lt(half).bitcast_i64(); + let small_mask_f = small_mask.bitcast_f64(); + + let small_t = abs_input / (one - abs_input * abs_input).sqrt(); + let small_asin_mag = atan_fast(small_t); + + let large_t = ((one - abs_input) / (one + abs_input)).sqrt(); + let large_angle = V::set1(2.0) * atan_fast(large_t); + let large_asin_mag = V::set1(FRAC_PI_2) - large_angle; + + let asin_mag = small_mask_f.blendv(large_asin_mag, small_asin_mag); + let asin_out = restore_sign(asin_mag, sign_bits); + + let small_acos = V::set1(FRAC_PI_2) - asin_out; + let large_acos_pos = large_angle; + let large_acos_neg = V::set1(PI) - large_angle; + let negative_mask = input.cmp_lt(V::zeroes()).bitcast_i64().bitcast_f64(); + let large_acos = negative_mask.blendv(large_acos_pos, large_acos_neg); + let acos_out = small_mask_f.blendv(large_acos, small_acos); + + (asin_out, acos_out) +} + +#[inline(always)] +pub(crate) fn asin_u35(input: V) -> V +where + V: SimdFloat64, + V::Engine: Simd, +{ + let exceptional_mask = asin_exceptional_mask(input); + let (fast, _) = asin_acos_core(input); + patch_exceptional_lanes(input, fast, exceptional_mask, scalar::asin_u35_f64) +} + +#[inline(always)] +pub(crate) fn acos_u35(input: V) -> V +where + V: SimdFloat64, + V::Engine: Simd, +{ + let exceptional_mask = acos_exceptional_mask(input); + let (_, fast) = asin_acos_core(input); + patch_exceptional_lanes(input, fast, exceptional_mask, scalar::acos_u35_f64) +} + +#[inline(always)] +pub(crate) fn atan_u35(input: V) -> V +where + V: SimdFloat64, + V::Engine: Simd, +{ + let fast = atan_fast(input); + let exceptional_mask = finite_mask(input).cmp_eq(SimdI64::::zeroes()); + patch_exceptional_lanes(input, fast, exceptional_mask, scalar::atan_u35_f64) +} diff --git a/src/math/families/mod.rs b/src/math/families/mod.rs index e6aaab9..f566447 100644 --- a/src/math/families/mod.rs +++ b/src/math/families/mod.rs @@ -2,7 +2,7 @@ mod binary_misc; mod core; mod hyperbolic; mod inverse_hyperbolic; -mod inverse_trig; +pub(crate) mod inverse_trig; use crate::{SimdFloat32, SimdFloat64}; diff --git a/src/tests/simd_math_targeted_edges/inverse_trig.rs b/src/tests/simd_math_targeted_edges/inverse_trig.rs index ad25fb5..36e6a91 100644 --- a/src/tests/simd_math_targeted_edges/inverse_trig.rs +++ b/src/tests/simd_math_targeted_edges/inverse_trig.rs @@ -1,5 +1,5 @@ use super::*; -use crate::math::SimdMathF32InverseTrig; +use crate::math::{SimdMathF32InverseTrig, SimdMathF64InverseTrig}; fn run_f32_inverse_trig_near_one() { let inputs = [ @@ -141,3 +141,149 @@ simd_math_targeted_all_backends!( ); simd_math_targeted_all_backends!(f32_inverse_trig_symmetry, run_f32_inverse_trig_symmetry); simd_math_targeted_all_backends!(f32_inverse_trig_identity, run_f32_inverse_trig_identity); + +fn run_f64_inverse_trig_near_one() { + let inputs = [ + f64::from_bits(0x3FEF_FFFF_FFFF_FFFE), + f64::from_bits(0x3FEF_FFFF_FFFF_FFFF), + 1.0, + f64::from_bits(0x3FF0_0000_0000_0001), + -f64::from_bits(0x3FEF_FFFF_FFFF_FFFE), + -f64::from_bits(0x3FEF_FFFF_FFFF_FFFF), + -1.0, + -f64::from_bits(0x3FF0_0000_0000_0001), + ]; + for chunk in inputs.chunks(S::Vf64::WIDTH) { + let v = S::Vf64::load_from_slice(chunk); + let asin = v.asin_u35(); + let acos = v.acos_u35(); + for (lane, &x) in chunk.iter().enumerate() { + assert_f64_contract( + "asin_u35", + x, + asin[lane], + x.asin(), + contracts::ASIN_U35_F64_MAX_ULP, + ) + .unwrap_or_else(|e| panic!("{e}")); + assert_f64_contract( + "acos_u35", + x, + acos[lane], + x.acos(), + contracts::ACOS_U35_F64_MAX_ULP, + ) + .unwrap_or_else(|e| panic!("{e}")); + } + } +} + +fn run_f64_inverse_trig_special_lanes() { + let inputs = [ + f64::NAN, + f64::INFINITY, + f64::NEG_INFINITY, + 0.0, + -0.0, + 1.0, + -1.0, + f64::from_bits(0x3FF0_0000_0000_0001), + -f64::from_bits(0x3FF0_0000_0000_0001), + 0.5, + -0.5, + 4.0, + -4.0, + TAN_PI_8_F64, + f64::from_bits(TAN_PI_8_F64.to_bits() + 1), + TAN_3PI_8_F64, + f64::from_bits(TAN_3PI_8_F64.to_bits() + 1), + 1.0e-300, + -1.0e-300, + ]; + + for chunk in inputs.chunks(S::Vf64::WIDTH) { + let v = S::Vf64::load_from_slice(chunk); + let asin = v.asin_u35(); + let acos = v.acos_u35(); + let atan = v.atan_u35(); + + for (lane, &x) in chunk.iter().enumerate() { + assert_f64_contract( + "asin_u35", + x, + asin[lane], + x.asin(), + contracts::ASIN_U35_F64_MAX_ULP, + ) + .unwrap_or_else(|e| panic!("{e}")); + assert_f64_contract( + "acos_u35", + x, + acos[lane], + x.acos(), + contracts::ACOS_U35_F64_MAX_ULP, + ) + .unwrap_or_else(|e| panic!("{e}")); + assert_f64_contract( + "atan_u35", + x, + atan[lane], + x.atan(), + contracts::ATAN_U35_F64_MAX_ULP, + ) + .unwrap_or_else(|e| panic!("{e}")); + } + } +} + +fn run_f64_inverse_trig_symmetry() { + let inputs = [-0.875, -0.75, -0.5, -0.125, 0.125, 0.5, 0.75, 0.875]; + + for chunk in inputs.chunks(S::Vf64::WIDTH) { + let v = S::Vf64::load_from_slice(chunk); + let neg_v = -v; + + let asin = v.asin_u35(); + let asin_neg = neg_v.asin_u35(); + let atan = v.atan_u35(); + let atan_neg = neg_v.atan_u35(); + + for lane in 0..chunk.len() { + assert_f64_contract("asin symmetry", chunk[lane], asin_neg[lane], -asin[lane], 2) + .unwrap_or_else(|e| panic!("{e}")); + assert_f64_contract("atan symmetry", chunk[lane], atan_neg[lane], -atan[lane], 2) + .unwrap_or_else(|e| panic!("{e}")); + } + } +} + +fn run_f64_inverse_trig_identity() { + let inputs = [-1.0, -0.875, -0.5, -0.125, 0.0, 0.125, 0.5, 0.875, 1.0]; + + for chunk in inputs.chunks(S::Vf64::WIDTH) { + let v = S::Vf64::load_from_slice(chunk); + let sum = v.asin_u35() + v.acos_u35(); + + for (lane, &x) in chunk.iter().enumerate() { + assert_f64_contract( + "asin+acos identity", + x, + sum[lane], + std::f64::consts::FRAC_PI_2, + 12, + ) + .unwrap_or_else(|e| panic!("{e}")); + } + } +} + +const TAN_PI_8_F64: f64 = 0.414_213_562_373_095_03; +const TAN_3PI_8_F64: f64 = 2.414_213_562_373_095; + +simd_math_targeted_all_backends!(f64_inverse_trig_near_one, run_f64_inverse_trig_near_one); +simd_math_targeted_all_backends!( + f64_inverse_trig_special_lanes, + run_f64_inverse_trig_special_lanes +); +simd_math_targeted_all_backends!(f64_inverse_trig_symmetry, run_f64_inverse_trig_symmetry); +simd_math_targeted_all_backends!(f64_inverse_trig_identity, run_f64_inverse_trig_identity); diff --git a/src/tests/simd_math_targeted_edges/mod.rs b/src/tests/simd_math_targeted_edges/mod.rs index babe6b3..4c31173 100644 --- a/src/tests/simd_math_targeted_edges/mod.rs +++ b/src/tests/simd_math_targeted_edges/mod.rs @@ -68,6 +68,58 @@ fn assert_f32_contract( Ok(()) } +fn assert_f64_contract( + fn_name: &str, + input: f64, + actual: f64, + expected: f64, + max_ulp: u64, +) -> Result<(), String> { + if expected.is_nan() { + if actual.is_nan() { + return Ok(()); + } + return Err(format!("{fn_name}({input:?}) expected NaN, got {actual:?}")); + } + + if expected.is_infinite() { + if actual.to_bits() == expected.to_bits() { + return Ok(()); + } + return Err(format!( + "{fn_name}({input:?}) expected {:?}, got {:?}", + expected, actual + )); + } + + if expected == 0.0 { + if actual.to_bits() == expected.to_bits() { + return Ok(()); + } + return Err(format!( + "{fn_name}({input:?}) expected signed zero bits {:016x}, got {:016x}", + expected.to_bits(), + actual.to_bits() + )); + } + + if actual.is_nan() || actual.is_infinite() { + return Err(format!( + "{fn_name}({input:?}) expected finite {expected:?}, got {actual:?}" + )); + } + + let ulp = ulp_distance_f64(actual, expected) + .ok_or_else(|| format!("{fn_name}({input:?}) failed to compute f64 ULP distance"))?; + if ulp > max_ulp { + return Err(format!( + "{fn_name}({input:?}) ULP distance {ulp} exceeds max {max_ulp} (actual={actual:?}, expected={expected:?})" + )); + } + + Ok(()) +} + fn check_targeted_unary_f32( fn_name: &str, inputs: &[f32],