diff --git a/benches/simd_math/f64_core.rs b/benches/simd_math/f64_core.rs
index 33d6716..c86cb8e 100644
--- a/benches/simd_math/f64_core.rs
+++ b/benches/simd_math/f64_core.rs
@@ -1,73 +1,8 @@
-use criterion::{Criterion, Throughput};
-use rand::{Rng, SeedableRng};
-use rand_chacha::ChaCha8Rng;
+use crate::shared::{self, BenchTargetsF64, INPUT_LEN};
+use criterion::Criterion;
 use simdeez::math::SimdMathF64Core;
+use simdeez::scalar::Scalar;
 use simdeez::{prelude::*, simd_unsafe_generate_all};
-use std::hint::black_box;
-
-const INPUT_LEN: usize = 1 << 20;
-
-fn make_positive_log_inputs(seed: u64) -> Vec<f64> {
-    let mut rng = ChaCha8Rng::seed_from_u64(seed);
-    (0..INPUT_LEN)
-        .map(|_| {
-            let log2x = rng.gen_range(-40.0f64..40.0f64);
-            let mantissa = rng.gen_range(1.0f64..2.0f64);
-            mantissa * log2x.exp2()
-        })
-        .collect()
-}
-
-fn make_exp2_inputs(seed: u64) -> Vec<f64> {
-    let mut rng = ChaCha8Rng::seed_from_u64(seed);
-    (0..INPUT_LEN)
-        .map(|_| rng.gen_range(-1000.0f64..1000.0f64))
-        .collect()
-}
-
-fn make_exp_inputs(seed: u64) -> Vec<f64> {
-    let mut rng = ChaCha8Rng::seed_from_u64(seed);
-    (0..INPUT_LEN)
-        .map(|_| rng.gen_range(-700.0f64..700.0f64))
-        .collect()
-}
-
-fn make_trig_inputs(seed: u64) -> Vec<f64> {
-    let mut rng = ChaCha8Rng::seed_from_u64(seed);
-    (0..INPUT_LEN)
-        .map(|_| rng.gen_range(-100.0f64 * core::f64::consts::PI..100.0f64 * core::f64::consts::PI))
-        .collect()
-}
-
-fn make_tan_inputs(seed: u64) -> Vec<f64> {
-    let mut rng = ChaCha8Rng::seed_from_u64(seed);
-    (0..INPUT_LEN)
-        .map(|_| {
-            let mut x =
-                rng.gen_range(-100.0f64 * core::f64::consts::PI..100.0f64 * core::f64::consts::PI);
-            let k = (x / core::f64::consts::PI).round();
-            let nearest_pole = (k + 0.5) * core::f64::consts::PI;
-            if (x - nearest_pole).abs() < 1.0e-8 {
-                x += if x >= 0.0 { 2.5e-8 } else { -2.5e-8 };
-            }
-            x
-        })
-        .collect()
-}
-
-#[inline(always)]
-fn simdeez_sum_impl<S: Simd>(input: &[f64], op: impl Fn(S::Vf64) -> S::Vf64) -> f64 {
-    let mut sum = 0.0f64;
-    let mut i = 0;
-
-    while i + S::Vf64::WIDTH <= input.len() {
-        let v = S::Vf64::load_from_slice(&input[i..]);
-        sum += op(v).horizontal_add();
-        i += S::Vf64::WIDTH;
-    }
-
-    sum
-}
 
 #[inline(never)]
 fn scalar_log2_sum(input: &[f64]) -> f64 {
@@ -100,112 +35,206 @@ fn scalar_tan_sum(input: &[f64]) -> f64 {
 simd_unsafe_generate_all!(
     fn simdeez_log2_sum(input: &[f64]) -> f64 {
-        simdeez_sum_impl::<S>(input, |v| v.log2_u35())
+        shared::simdeez_sum_impl_f64::<S>(input, |v| v.log2_u35())
     }
 );
 
 simd_unsafe_generate_all!(
     fn simdeez_exp2_sum(input: &[f64]) -> f64 {
-        simdeez_sum_impl::<S>(input, |v| v.exp2_u35())
+        shared::simdeez_sum_impl_f64::<S>(input, |v| v.exp2_u35())
     }
 );
 
 simd_unsafe_generate_all!(
     fn simdeez_ln_sum(input: &[f64]) -> f64 {
-        simdeez_sum_impl::<S>(input, |v| v.ln_u35())
+        shared::simdeez_sum_impl_f64::<S>(input, |v| v.ln_u35())
     }
 );
 
 simd_unsafe_generate_all!(
     fn simdeez_exp_sum(input: &[f64]) -> f64 {
-        simdeez_sum_impl::<S>(input, |v| v.exp_u35())
+        shared::simdeez_sum_impl_f64::<S>(input, |v| v.exp_u35())
     }
 );
 
 simd_unsafe_generate_all!(
     fn simdeez_sin_sum(input: &[f64]) -> f64 {
-        simdeez_sum_impl::<S>(input, |v| v.sin_u35())
+        shared::simdeez_sum_impl_f64::<S>(input, |v| v.sin_u35())
     }
 );
 
 simd_unsafe_generate_all!(
     fn simdeez_cos_sum(input: &[f64]) -> f64 {
-        simdeez_sum_impl::<S>(input, |v| v.cos_u35())
+        shared::simdeez_sum_impl_f64::<S>(input, |v| v.cos_u35())
     }
 );
 
 simd_unsafe_generate_all!(
     fn simdeez_tan_sum(input: &[f64]) -> f64 {
-        simdeez_sum_impl::<S>(input, |v| v.tan_u35())
+        shared::simdeez_sum_impl_f64::<S>(input, |v| v.tan_u35())
     }
 );
 
-fn bench_pair(
-    c: &mut Criterion,
-
name: &str, - input: &[f64], - scalar: fn(&[f64]) -> f64, - simd: fn(&[f64]) -> f64, -) { - let mut group = c.benchmark_group(name); - group.throughput(Throughput::Elements(input.len() as u64)); - group.bench_function("scalar-native", |b| { - b.iter(|| black_box(scalar(black_box(input)))) - }); - group.bench_function("simdeez-runtime", |b| { - b.iter(|| black_box(simd(black_box(input)))) - }); - group.finish(); +#[inline(never)] +fn forced_scalar_log2_sum(input: &[f64]) -> f64 { + shared::force_scalar_sum_f64(input, |v: ::Vf64| v.log2_u35()) +} + +#[inline(never)] +fn forced_scalar_exp2_sum(input: &[f64]) -> f64 { + shared::force_scalar_sum_f64(input, |v: ::Vf64| v.exp2_u35()) +} + +#[inline(never)] +fn forced_scalar_ln_sum(input: &[f64]) -> f64 { + shared::force_scalar_sum_f64(input, |v: ::Vf64| v.ln_u35()) +} + +#[inline(never)] +fn forced_scalar_exp_sum(input: &[f64]) -> f64 { + shared::force_scalar_sum_f64(input, |v: ::Vf64| v.exp_u35()) +} + +#[inline(never)] +fn forced_scalar_sin_sum(input: &[f64]) -> f64 { + shared::force_scalar_sum_f64(input, |v: ::Vf64| v.sin_u35()) +} + +#[inline(never)] +fn forced_scalar_cos_sum(input: &[f64]) -> f64 { + shared::force_scalar_sum_f64(input, |v: ::Vf64| v.cos_u35()) +} + +#[inline(never)] +fn forced_scalar_tan_sum(input: &[f64]) -> f64 { + shared::force_scalar_sum_f64(input, |v: ::Vf64| v.tan_u35()) } pub fn register(c: &mut Criterion) { - let log_inputs = make_positive_log_inputs(0xF640_1001); - let exp2_inputs = make_exp2_inputs(0xF640_1002); - let exp_inputs = make_exp_inputs(0xF640_1003); - let trig_inputs = make_trig_inputs(0xF640_1004); - let tan_inputs = make_tan_inputs(0xF640_1005); + let log_inputs = shared::make_positive_log_inputs_f64(INPUT_LEN, 0xF640_1001); + let exp2_inputs = shared::make_exp2_inputs_f64(INPUT_LEN, 0xF640_1002); + let exp_inputs = shared::make_exp_inputs_f64(INPUT_LEN, 0xF640_1003); + let trig_inputs = shared::make_trig_inputs_f64(INPUT_LEN, 0xF640_1004); + let tan_inputs = shared::make_tan_inputs_f64(INPUT_LEN, 0xF640_1005); - bench_pair( + shared::bench_variants_f64( c, "simd_math/f64/log2_u35", &log_inputs, - scalar_log2_sum, - simdeez_log2_sum, + BenchTargetsF64 { + scalar_native: scalar_log2_sum, + simdeez_runtime: simdeez_log2_sum, + simdeez_scalar: forced_scalar_log2_sum, + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + simdeez_sse2: simdeez_log2_sum_sse2, + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + simdeez_sse41: simdeez_log2_sum_sse41, + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + simdeez_avx2: simdeez_log2_sum_avx2, + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + simdeez_avx512: simdeez_log2_sum_avx512, + }, ); - bench_pair( + shared::bench_variants_f64( c, "simd_math/f64/exp2_u35", &exp2_inputs, - scalar_exp2_sum, - simdeez_exp2_sum, + BenchTargetsF64 { + scalar_native: scalar_exp2_sum, + simdeez_runtime: simdeez_exp2_sum, + simdeez_scalar: forced_scalar_exp2_sum, + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + simdeez_sse2: simdeez_exp2_sum_sse2, + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + simdeez_sse41: simdeez_exp2_sum_sse41, + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + simdeez_avx2: simdeez_exp2_sum_avx2, + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + simdeez_avx512: simdeez_exp2_sum_avx512, + }, ); - bench_pair( + shared::bench_variants_f64( c, "simd_math/f64/ln_u35", &log_inputs, - scalar_ln_sum, - simdeez_ln_sum, + BenchTargetsF64 { + scalar_native: scalar_ln_sum, + simdeez_runtime: 
simdeez_ln_sum, + simdeez_scalar: forced_scalar_ln_sum, + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + simdeez_sse2: simdeez_ln_sum_sse2, + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + simdeez_sse41: simdeez_ln_sum_sse41, + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + simdeez_avx2: simdeez_ln_sum_avx2, + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + simdeez_avx512: simdeez_ln_sum_avx512, + }, ); - bench_pair( + shared::bench_variants_f64( c, "simd_math/f64/exp_u35", &exp_inputs, - scalar_exp_sum, - simdeez_exp_sum, + BenchTargetsF64 { + scalar_native: scalar_exp_sum, + simdeez_runtime: simdeez_exp_sum, + simdeez_scalar: forced_scalar_exp_sum, + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + simdeez_sse2: simdeez_exp_sum_sse2, + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + simdeez_sse41: simdeez_exp_sum_sse41, + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + simdeez_avx2: simdeez_exp_sum_avx2, + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + simdeez_avx512: simdeez_exp_sum_avx512, + }, ); - bench_pair( + shared::bench_variants_f64( c, "simd_math/f64/sin_u35", &trig_inputs, - scalar_sin_sum, - simdeez_sin_sum, + BenchTargetsF64 { + scalar_native: scalar_sin_sum, + simdeez_runtime: simdeez_sin_sum, + simdeez_scalar: forced_scalar_sin_sum, + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + simdeez_sse2: simdeez_sin_sum_sse2, + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + simdeez_sse41: simdeez_sin_sum_sse41, + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + simdeez_avx2: simdeez_sin_sum_avx2, + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + simdeez_avx512: simdeez_sin_sum_avx512, + }, ); - bench_pair( + shared::bench_variants_f64( c, "simd_math/f64/cos_u35", &trig_inputs, - scalar_cos_sum, - simdeez_cos_sum, + BenchTargetsF64 { + scalar_native: scalar_cos_sum, + simdeez_runtime: simdeez_cos_sum, + simdeez_scalar: forced_scalar_cos_sum, + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + simdeez_sse2: simdeez_cos_sum_sse2, + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + simdeez_sse41: simdeez_cos_sum_sse41, + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + simdeez_avx2: simdeez_cos_sum_avx2, + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + simdeez_avx512: simdeez_cos_sum_avx512, + }, ); - bench_pair( + shared::bench_variants_f64( c, "simd_math/f64/tan_u35", &tan_inputs, - scalar_tan_sum, - simdeez_tan_sum, + BenchTargetsF64 { + scalar_native: scalar_tan_sum, + simdeez_runtime: simdeez_tan_sum, + simdeez_scalar: forced_scalar_tan_sum, + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + simdeez_sse2: simdeez_tan_sum_sse2, + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + simdeez_sse41: simdeez_tan_sum_sse41, + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + simdeez_avx2: simdeez_tan_sum_avx2, + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + simdeez_avx512: simdeez_tan_sum_avx512, + }, ); } diff --git a/benches/simd_math/hyperbolic.rs b/benches/simd_math/hyperbolic.rs index 027f4ee..d7cdda4 100644 --- a/benches/simd_math/hyperbolic.rs +++ b/benches/simd_math/hyperbolic.rs @@ -1,11 +1,8 @@ -use criterion::{Criterion, Throughput}; +use crate::shared::{self, BenchTargets, BenchTargetsF64, INPUT_LEN}; +use criterion::Criterion; use simdeez::math::{SimdMathF32Hyperbolic, SimdMathF64Hyperbolic}; -#[cfg(not(any(target_arch = "x86_64", target_arch = "x86")))] use 
simdeez::scalar::Scalar; use simdeez::{prelude::*, simd_unsafe_generate_all}; -use std::hint::black_box; - -use crate::shared::{self, BenchTargets, INPUT_LEN}; #[inline(never)] fn scalar_sinh_sum(input: &[f32]) -> f32 { @@ -73,67 +70,34 @@ simd_unsafe_generate_all!( } ); -#[cfg(not(any(target_arch = "x86_64", target_arch = "x86")))] #[inline(never)] -fn simdeez_sinh_sum_scalar(input: &[f32]) -> f32 { +fn forced_scalar_sinh_sum(input: &[f32]) -> f32 { shared::force_scalar_sum(input, |v: ::Vf32| v.sinh_u35()) } -#[cfg(not(any(target_arch = "x86_64", target_arch = "x86")))] #[inline(never)] -fn simdeez_cosh_sum_scalar(input: &[f32]) -> f32 { +fn forced_scalar_cosh_sum(input: &[f32]) -> f32 { shared::force_scalar_sum(input, |v: ::Vf32| v.cosh_u35()) } -#[cfg(not(any(target_arch = "x86_64", target_arch = "x86")))] #[inline(never)] -fn simdeez_tanh_sum_scalar(input: &[f32]) -> f32 { +fn forced_scalar_tanh_sum(input: &[f32]) -> f32 { shared::force_scalar_sum(input, |v: ::Vf32| v.tanh_u35()) } -#[cfg(not(any(target_arch = "x86_64", target_arch = "x86")))] #[inline(never)] -fn force_scalar_sum_f64( - input: &[f64], - op: impl Fn(::Vf64) -> ::Vf64, -) -> f64 { - simdeez_sum_impl_f64::(input, op) +fn forced_scalar_sinh_sum_f64(input: &[f64]) -> f64 { + shared::force_scalar_sum_f64(input, |v: ::Vf64| v.sinh_u35()) } -#[cfg(not(any(target_arch = "x86_64", target_arch = "x86")))] #[inline(never)] -fn simdeez_sinh_sum_scalar_f64(input: &[f64]) -> f64 { - force_scalar_sum_f64(input, |v| v.sinh_u35()) +fn forced_scalar_cosh_sum_f64(input: &[f64]) -> f64 { + shared::force_scalar_sum_f64(input, |v: ::Vf64| v.cosh_u35()) } -#[cfg(not(any(target_arch = "x86_64", target_arch = "x86")))] #[inline(never)] -fn simdeez_cosh_sum_scalar_f64(input: &[f64]) -> f64 { - force_scalar_sum_f64(input, |v| v.cosh_u35()) -} - -#[cfg(not(any(target_arch = "x86_64", target_arch = "x86")))] -#[inline(never)] -fn simdeez_tanh_sum_scalar_f64(input: &[f64]) -> f64 { - force_scalar_sum_f64(input, |v| v.tanh_u35()) -} - -#[cfg(any(target_arch = "x86_64", target_arch = "x86"))] -#[inline(never)] -fn simdeez_sinh_sum_scalar_f64(input: &[f64]) -> f64 { - simdeez_sinh_sum_f64(input) -} - -#[cfg(any(target_arch = "x86_64", target_arch = "x86"))] -#[inline(never)] -fn simdeez_cosh_sum_scalar_f64(input: &[f64]) -> f64 { - simdeez_cosh_sum_f64(input) -} - -#[cfg(any(target_arch = "x86_64", target_arch = "x86"))] -#[inline(never)] -fn simdeez_tanh_sum_scalar_f64(input: &[f64]) -> f64 { - simdeez_tanh_sum_f64(input) +fn forced_scalar_tanh_sum_f64(input: &[f64]) -> f64 { + shared::force_scalar_sum_f64(input, |v: ::Vf64| v.tanh_u35()) } #[inline(always)] @@ -170,7 +134,7 @@ pub fn register(c: &mut Criterion) { BenchTargets { scalar_native: scalar_sinh_sum, simdeez_runtime: simdeez_sinh_sum, - simdeez_scalar: simdeez_sinh_sum_scalar, + simdeez_scalar: forced_scalar_sinh_sum, #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] simdeez_sse2: simdeez_sinh_sum_sse2, #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] @@ -189,7 +153,7 @@ pub fn register(c: &mut Criterion) { BenchTargets { scalar_native: scalar_cosh_sum, simdeez_runtime: simdeez_cosh_sum, - simdeez_scalar: simdeez_cosh_sum_scalar, + simdeez_scalar: forced_scalar_cosh_sum, #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] simdeez_sse2: simdeez_cosh_sum_sse2, #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] @@ -208,7 +172,7 @@ pub fn register(c: &mut Criterion) { BenchTargets { scalar_native: scalar_tanh_sum, simdeez_runtime: simdeez_tanh_sum, - simdeez_scalar: 
simdeez_tanh_sum_scalar, + simdeez_scalar: forced_scalar_tanh_sum, #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] simdeez_sse2: simdeez_tanh_sum_sse2, #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] @@ -224,54 +188,58 @@ pub fn register(c: &mut Criterion) { let cosh_inputs_f64 = make_unary_inputs_f64(INPUT_LEN, 0xA11C_E107, -5.0..5.0); let tanh_inputs_f64 = make_unary_inputs_f64(INPUT_LEN, 0xA11C_E108, -20.0..20.0); - bench_variants_f64( + shared::bench_variants_f64( c, "simd_math/f64/sinh_u35", &sinh_inputs_f64, - scalar_sinh_sum_f64, - simdeez_sinh_sum_f64, - simdeez_sinh_sum_scalar_f64, + BenchTargetsF64 { + scalar_native: scalar_sinh_sum_f64, + simdeez_runtime: simdeez_sinh_sum_f64, + simdeez_scalar: forced_scalar_sinh_sum_f64, + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + simdeez_sse2: simdeez_sinh_sum_f64_sse2, + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + simdeez_sse41: simdeez_sinh_sum_f64_sse41, + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + simdeez_avx2: simdeez_sinh_sum_f64_avx2, + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + simdeez_avx512: simdeez_sinh_sum_f64_avx512, + }, ); - bench_variants_f64( + shared::bench_variants_f64( c, "simd_math/f64/cosh_u35", &cosh_inputs_f64, - scalar_cosh_sum_f64, - simdeez_cosh_sum_f64, - simdeez_cosh_sum_scalar_f64, + BenchTargetsF64 { + scalar_native: scalar_cosh_sum_f64, + simdeez_runtime: simdeez_cosh_sum_f64, + simdeez_scalar: forced_scalar_cosh_sum_f64, + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + simdeez_sse2: simdeez_cosh_sum_f64_sse2, + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + simdeez_sse41: simdeez_cosh_sum_f64_sse41, + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + simdeez_avx2: simdeez_cosh_sum_f64_avx2, + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + simdeez_avx512: simdeez_cosh_sum_f64_avx512, + }, ); - bench_variants_f64( + shared::bench_variants_f64( c, "simd_math/f64/tanh_u35", &tanh_inputs_f64, - scalar_tanh_sum_f64, - simdeez_tanh_sum_f64, - simdeez_tanh_sum_scalar_f64, + BenchTargetsF64 { + scalar_native: scalar_tanh_sum_f64, + simdeez_runtime: simdeez_tanh_sum_f64, + simdeez_scalar: forced_scalar_tanh_sum_f64, + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + simdeez_sse2: simdeez_tanh_sum_f64_sse2, + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + simdeez_sse41: simdeez_tanh_sum_f64_sse41, + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + simdeez_avx2: simdeez_tanh_sum_f64_avx2, + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + simdeez_avx512: simdeez_tanh_sum_f64_avx512, + }, ); } - -fn bench_variants_f64( - c: &mut Criterion, - group_name: &str, - input: &[f64], - scalar_native: fn(&[f64]) -> f64, - simdeez_runtime: fn(&[f64]) -> f64, - simdeez_scalar: fn(&[f64]) -> f64, -) { - let mut group = c.benchmark_group(group_name); - group.throughput(Throughput::Elements(input.len() as u64)); - - group.bench_function("scalar-native", |b| { - b.iter(|| black_box(scalar_native(black_box(input)))) - }); - - group.bench_function("simdeez-runtime", |b| { - b.iter(|| black_box(simdeez_runtime(black_box(input)))) - }); - - group.bench_function("simdeez-forced-scalar", |b| { - b.iter(|| black_box(simdeez_scalar(black_box(input)))) - }); - - group.finish(); -} diff --git a/benches/simd_math/shared.rs b/benches/simd_math/shared.rs index b122c8f..7fd0366 100644 --- a/benches/simd_math/shared.rs +++ b/benches/simd_math/shared.rs @@ -18,6 +18,17 @@ pub fn 
make_positive_log_inputs(len: usize, seed: u64) -> Vec<f32> {
         .collect()
 }
 
+pub fn make_positive_log_inputs_f64(len: usize, seed: u64) -> Vec<f64> {
+    let mut rng = ChaCha8Rng::seed_from_u64(seed);
+    (0..len)
+        .map(|_| {
+            let log2x = rng.gen_range(-40.0f64..40.0f64);
+            let mantissa = rng.gen_range(1.0f64..2.0f64);
+            mantissa * log2x.exp2()
+        })
+        .collect()
+}
+
 pub fn make_exp2_inputs(len: usize, seed: u64) -> Vec<f32> {
     let mut rng = ChaCha8Rng::seed_from_u64(seed);
     (0..len)
@@ -25,11 +36,25 @@ pub fn make_exp2_inputs(len: usize, seed: u64) -> Vec<f32> {
         .collect()
 }
 
+pub fn make_exp2_inputs_f64(len: usize, seed: u64) -> Vec<f64> {
+    let mut rng = ChaCha8Rng::seed_from_u64(seed);
+    (0..len)
+        .map(|_| rng.gen_range(-1000.0f64..1000.0f64))
+        .collect()
+}
+
 pub fn make_exp_inputs(len: usize, seed: u64) -> Vec<f32> {
     let mut rng = ChaCha8Rng::seed_from_u64(seed);
     (0..len).map(|_| rng.gen_range(-80.0f32..80.0f32)).collect()
 }
 
+pub fn make_exp_inputs_f64(len: usize, seed: u64) -> Vec<f64> {
+    let mut rng = ChaCha8Rng::seed_from_u64(seed);
+    (0..len)
+        .map(|_| rng.gen_range(-700.0f64..700.0f64))
+        .collect()
+}
+
 pub fn make_unary_inputs(len: usize, seed: u64, range: core::ops::Range<f32>) -> Vec<f32> {
     let mut rng = ChaCha8Rng::seed_from_u64(seed);
     (0..len).map(|_| rng.gen_range(range.clone())).collect()
@@ -42,6 +67,13 @@ pub fn make_trig_inputs(len: usize, seed: u64) -> Vec<f32> {
         .collect()
 }
 
+pub fn make_trig_inputs_f64(len: usize, seed: u64) -> Vec<f64> {
+    let mut rng = ChaCha8Rng::seed_from_u64(seed);
+    (0..len)
+        .map(|_| rng.gen_range(-100.0f64 * core::f64::consts::PI..100.0f64 * core::f64::consts::PI))
+        .collect()
+}
+
 pub fn make_inverse_trig_inputs(len: usize, seed: u64) -> Vec<f32> {
     let mut rng = ChaCha8Rng::seed_from_u64(seed);
     (0..len).map(|_| rng.gen_range(-1.0f32..1.0f32)).collect()
@@ -82,6 +114,22 @@ pub fn make_tan_inputs(len: usize, seed: u64) -> Vec<f32> {
         .collect()
 }
 
+pub fn make_tan_inputs_f64(len: usize, seed: u64) -> Vec<f64> {
+    let mut rng = ChaCha8Rng::seed_from_u64(seed);
+    (0..len)
+        .map(|_| {
+            let mut x =
+                rng.gen_range(-100.0f64 * core::f64::consts::PI..100.0f64 * core::f64::consts::PI);
+            let k = (x / core::f64::consts::PI).round();
+            let nearest_pole = (k + 0.5) * core::f64::consts::PI;
+            if (x - nearest_pole).abs() < 1.0e-8 {
+                x += if x >= 0.0 { 2.5e-8 } else { -2.5e-8 };
+            }
+            x
+        })
+        .collect()
+}
+
 pub struct BenchTargets {
     pub scalar_native: fn(&[f32]) -> f32,
     pub simdeez_runtime: fn(&[f32]) -> f32,
diff --git a/benches/simd_math_remaining_baseline/binary_misc.rs b/benches/simd_math_remaining_baseline/binary_misc.rs
index 6056fb9..1596f53 100644
--- a/benches/simd_math_remaining_baseline/binary_misc.rs
+++ b/benches/simd_math_remaining_baseline/binary_misc.rs
@@ -1,8 +1,11 @@
 use criterion::Criterion;
 use simdeez::math::{SimdMathF32BinaryMisc, SimdMathF64BinaryMisc};
+use simdeez::scalar::Scalar;
 use simdeez::{prelude::*, simd_unsafe_generate_all};
 
-use crate::shared::{self, INPUT_LEN};
+use crate::shared::{
+    self, BenchTargets, BenchTargetsF64, BinaryBenchTargets, BinaryBenchTargetsF64, INPUT_LEN,
+};
 
 #[inline(never)]
 fn scalar_log10_sum(input: &[f32]) -> f32 {
@@ -92,6 +95,46 @@ simd_unsafe_generate_all!(
     }
 );
 
+#[inline(never)]
+fn forced_scalar_log10_sum(input: &[f32]) -> f32 {
+    shared::force_scalar_sum(input, |v: <Scalar as Simd>::Vf32| v.log10_u35())
+}
+
+#[inline(never)]
+fn forced_scalar_atan2_sum(a: &[f32], b: &[f32]) -> f32 {
+    shared::force_scalar_binary_sum(a, b, |x: <Scalar as Simd>::Vf32, y| x.atan2_u35(y))
+}
+
+#[inline(never)]
+fn forced_scalar_hypot_sum(a: &[f32], b: &[f32]) -> f32 {
+    shared::force_scalar_binary_sum(a,
b, |x: ::Vf32, y| x.hypot_u35(y)) +} + +#[inline(never)] +fn forced_scalar_fmod_sum(a: &[f32], b: &[f32]) -> f32 { + shared::force_scalar_binary_sum(a, b, |x: ::Vf32, y| x.fmod(y)) +} + +#[inline(never)] +fn forced_scalar_log10_sum_f64(input: &[f64]) -> f64 { + shared::force_scalar_sum_f64(input, |v: ::Vf64| v.log10_u35()) +} + +#[inline(never)] +fn forced_scalar_atan2_sum_f64(a: &[f64], b: &[f64]) -> f64 { + shared::force_scalar_binary_sum_f64(a, b, |x: ::Vf64, y| x.atan2_u35(y)) +} + +#[inline(never)] +fn forced_scalar_hypot_sum_f64(a: &[f64], b: &[f64]) -> f64 { + shared::force_scalar_binary_sum_f64(a, b, |x: ::Vf64, y| x.hypot_u35(y)) +} + +#[inline(never)] +fn forced_scalar_fmod_sum_f64(a: &[f64], b: &[f64]) -> f64 { + shared::force_scalar_binary_sum_f64(a, b, |x: ::Vf64, y| x.fmod(y)) +} + pub fn register(c: &mut Criterion) { let log10_inputs = shared::make_positive_inputs(INPUT_LEN, 0xDEADB004, 1.0e-20, 1.0e20); let (atan2_y, atan2_x) = @@ -119,68 +162,156 @@ pub fn register(c: &mut Criterion) { } } - shared::bench_unary( + shared::bench_unary_variants( c, "simd_math_baseline/f32/log10_u35", &log10_inputs, - scalar_log10_sum, - simdeez_log10_sum, + BenchTargets { + scalar_native: scalar_log10_sum, + simdeez_runtime: simdeez_log10_sum, + simdeez_scalar: forced_scalar_log10_sum, + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + simdeez_sse2: simdeez_log10_sum_sse2, + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + simdeez_sse41: simdeez_log10_sum_sse41, + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + simdeez_avx2: simdeez_log10_sum_avx2, + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + simdeez_avx512: simdeez_log10_sum_avx512, + }, ); - shared::bench_binary( + shared::bench_binary_variants( c, "simd_math_baseline/f32/atan2_u35", &atan2_y, &atan2_x, - scalar_atan2_sum, - simdeez_atan2_sum, + BinaryBenchTargets { + scalar_native: scalar_atan2_sum, + simdeez_runtime: simdeez_atan2_sum, + simdeez_scalar: forced_scalar_atan2_sum, + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + simdeez_sse2: simdeez_atan2_sum_sse2, + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + simdeez_sse41: simdeez_atan2_sum_sse41, + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + simdeez_avx2: simdeez_atan2_sum_avx2, + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + simdeez_avx512: simdeez_atan2_sum_avx512, + }, ); - shared::bench_binary( + shared::bench_binary_variants( c, "simd_math_baseline/f32/hypot_u35", &hypot_x, &hypot_y, - scalar_hypot_sum, - simdeez_hypot_sum, + BinaryBenchTargets { + scalar_native: scalar_hypot_sum, + simdeez_runtime: simdeez_hypot_sum, + simdeez_scalar: forced_scalar_hypot_sum, + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + simdeez_sse2: simdeez_hypot_sum_sse2, + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + simdeez_sse41: simdeez_hypot_sum_sse41, + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + simdeez_avx2: simdeez_hypot_sum_avx2, + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + simdeez_avx512: simdeez_hypot_sum_avx512, + }, ); - shared::bench_binary( + shared::bench_binary_variants( c, "simd_math_baseline/f32/fmod", &fmod_x, &fmod_y, - scalar_fmod_sum, - simdeez_fmod_sum, + BinaryBenchTargets { + scalar_native: scalar_fmod_sum, + simdeez_runtime: simdeez_fmod_sum, + simdeez_scalar: forced_scalar_fmod_sum, + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + simdeez_sse2: simdeez_fmod_sum_sse2, + #[cfg(any(target_arch = "x86_64", 
target_arch = "x86"))] + simdeez_sse41: simdeez_fmod_sum_sse41, + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + simdeez_avx2: simdeez_fmod_sum_avx2, + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + simdeez_avx512: simdeez_fmod_sum_avx512, + }, ); - shared::bench_unary_f64( + shared::bench_unary_variants_f64( c, "simd_math_baseline/f64/log10_u35", &log10_inputs_f64, - scalar_log10_sum_f64, - simdeez_log10_sum_f64, + BenchTargetsF64 { + scalar_native: scalar_log10_sum_f64, + simdeez_runtime: simdeez_log10_sum_f64, + simdeez_scalar: forced_scalar_log10_sum_f64, + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + simdeez_sse2: simdeez_log10_sum_f64_sse2, + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + simdeez_sse41: simdeez_log10_sum_f64_sse41, + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + simdeez_avx2: simdeez_log10_sum_f64_avx2, + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + simdeez_avx512: simdeez_log10_sum_f64_avx512, + }, ); - shared::bench_binary_f64( + shared::bench_binary_variants_f64( c, "simd_math_baseline/f64/atan2_u35", &atan2_y_f64, &atan2_x_f64, - scalar_atan2_sum_f64, - simdeez_atan2_sum_f64, + BinaryBenchTargetsF64 { + scalar_native: scalar_atan2_sum_f64, + simdeez_runtime: simdeez_atan2_sum_f64, + simdeez_scalar: forced_scalar_atan2_sum_f64, + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + simdeez_sse2: simdeez_atan2_sum_f64_sse2, + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + simdeez_sse41: simdeez_atan2_sum_f64_sse41, + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + simdeez_avx2: simdeez_atan2_sum_f64_avx2, + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + simdeez_avx512: simdeez_atan2_sum_f64_avx512, + }, ); - shared::bench_binary_f64( + shared::bench_binary_variants_f64( c, "simd_math_baseline/f64/hypot_u35", &hypot_x_f64, &hypot_y_f64, - scalar_hypot_sum_f64, - simdeez_hypot_sum_f64, + BinaryBenchTargetsF64 { + scalar_native: scalar_hypot_sum_f64, + simdeez_runtime: simdeez_hypot_sum_f64, + simdeez_scalar: forced_scalar_hypot_sum_f64, + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + simdeez_sse2: simdeez_hypot_sum_f64_sse2, + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + simdeez_sse41: simdeez_hypot_sum_f64_sse41, + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + simdeez_avx2: simdeez_hypot_sum_f64_avx2, + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + simdeez_avx512: simdeez_hypot_sum_f64_avx512, + }, ); - shared::bench_binary_f64( + shared::bench_binary_variants_f64( c, "simd_math_baseline/f64/fmod", &fmod_x_f64, &fmod_y_f64, - scalar_fmod_sum_f64, - simdeez_fmod_sum_f64, + BinaryBenchTargetsF64 { + scalar_native: scalar_fmod_sum_f64, + simdeez_runtime: simdeez_fmod_sum_f64, + simdeez_scalar: forced_scalar_fmod_sum_f64, + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + simdeez_sse2: simdeez_fmod_sum_f64_sse2, + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + simdeez_sse41: simdeez_fmod_sum_f64_sse41, + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + simdeez_avx2: simdeez_fmod_sum_f64_avx2, + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + simdeez_avx512: simdeez_fmod_sum_f64_avx512, + }, ); } diff --git a/benches/simd_math_remaining_baseline/inverse_hyperbolic.rs b/benches/simd_math_remaining_baseline/inverse_hyperbolic.rs index 2ea9d50..a810d70 100644 --- a/benches/simd_math_remaining_baseline/inverse_hyperbolic.rs +++ 
b/benches/simd_math_remaining_baseline/inverse_hyperbolic.rs @@ -1,8 +1,9 @@ use criterion::Criterion; use simdeez::math::{SimdMathF32InverseHyperbolic, SimdMathF64InverseHyperbolic}; +use simdeez::scalar::Scalar; use simdeez::{prelude::*, simd_unsafe_generate_all}; -use crate::shared::{self, INPUT_LEN}; +use crate::shared::{self, BenchTargets, BenchTargetsF64, INPUT_LEN}; #[inline(never)] fn scalar_asinh_sum(input: &[f32]) -> f32 { @@ -70,6 +71,36 @@ simd_unsafe_generate_all!( } ); +#[inline(never)] +fn forced_scalar_asinh_sum(input: &[f32]) -> f32 { + shared::force_scalar_sum(input, |v: ::Vf32| v.asinh_u35()) +} + +#[inline(never)] +fn forced_scalar_acosh_sum(input: &[f32]) -> f32 { + shared::force_scalar_sum(input, |v: ::Vf32| v.acosh_u35()) +} + +#[inline(never)] +fn forced_scalar_atanh_sum(input: &[f32]) -> f32 { + shared::force_scalar_sum(input, |v: ::Vf32| v.atanh_u35()) +} + +#[inline(never)] +fn forced_scalar_asinh_sum_f64(input: &[f64]) -> f64 { + shared::force_scalar_sum_f64(input, |v: ::Vf64| v.asinh_u35()) +} + +#[inline(never)] +fn forced_scalar_acosh_sum_f64(input: &[f64]) -> f64 { + shared::force_scalar_sum_f64(input, |v: ::Vf64| v.acosh_u35()) +} + +#[inline(never)] +fn forced_scalar_atanh_sum_f64(input: &[f64]) -> f64 { + shared::force_scalar_sum_f64(input, |v: ::Vf64| v.atanh_u35()) +} + pub fn register(c: &mut Criterion) { let asinh_inputs = shared::make_unary_inputs(INPUT_LEN, 0xDEADB001, -16_384.0..16_384.0); let acosh_inputs = shared::make_positive_inputs(INPUT_LEN, 0xDEADB002, 1.0, 16_384.0); @@ -81,51 +112,117 @@ pub fn register(c: &mut Criterion) { let atanh_inputs_f64 = shared::make_unary_inputs_f64(INPUT_LEN, 0xDEADB103, -0.999_999_999_999..0.999_999_999_999); - shared::bench_unary( + shared::bench_unary_variants( c, "simd_math_baseline/f32/asinh_u35", &asinh_inputs, - scalar_asinh_sum, - simdeez_asinh_sum, + BenchTargets { + scalar_native: scalar_asinh_sum, + simdeez_runtime: simdeez_asinh_sum, + simdeez_scalar: forced_scalar_asinh_sum, + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + simdeez_sse2: simdeez_asinh_sum_sse2, + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + simdeez_sse41: simdeez_asinh_sum_sse41, + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + simdeez_avx2: simdeez_asinh_sum_avx2, + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + simdeez_avx512: simdeez_asinh_sum_avx512, + }, ); - shared::bench_unary( + shared::bench_unary_variants( c, "simd_math_baseline/f32/acosh_u35", &acosh_inputs, - scalar_acosh_sum, - simdeez_acosh_sum, + BenchTargets { + scalar_native: scalar_acosh_sum, + simdeez_runtime: simdeez_acosh_sum, + simdeez_scalar: forced_scalar_acosh_sum, + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + simdeez_sse2: simdeez_acosh_sum_sse2, + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + simdeez_sse41: simdeez_acosh_sum_sse41, + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + simdeez_avx2: simdeez_acosh_sum_avx2, + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + simdeez_avx512: simdeez_acosh_sum_avx512, + }, ); - shared::bench_unary( + shared::bench_unary_variants( c, "simd_math_baseline/f32/atanh_u35", &atanh_inputs, - scalar_atanh_sum, - simdeez_atanh_sum, + BenchTargets { + scalar_native: scalar_atanh_sum, + simdeez_runtime: simdeez_atanh_sum, + simdeez_scalar: forced_scalar_atanh_sum, + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + simdeez_sse2: simdeez_atanh_sum_sse2, + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + 
simdeez_sse41: simdeez_atanh_sum_sse41, + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + simdeez_avx2: simdeez_atanh_sum_avx2, + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + simdeez_avx512: simdeez_atanh_sum_avx512, + }, ); - shared::bench_unary_f64( + shared::bench_unary_variants_f64( c, "simd_math_baseline/f64/asinh_u35", &asinh_inputs_f64, - scalar_asinh_sum_f64, - simdeez_asinh_sum_f64, + BenchTargetsF64 { + scalar_native: scalar_asinh_sum_f64, + simdeez_runtime: simdeez_asinh_sum_f64, + simdeez_scalar: forced_scalar_asinh_sum_f64, + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + simdeez_sse2: simdeez_asinh_sum_f64_sse2, + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + simdeez_sse41: simdeez_asinh_sum_f64_sse41, + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + simdeez_avx2: simdeez_asinh_sum_f64_avx2, + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + simdeez_avx512: simdeez_asinh_sum_f64_avx512, + }, ); - shared::bench_unary_f64( + shared::bench_unary_variants_f64( c, "simd_math_baseline/f64/acosh_u35", &acosh_inputs_f64, - scalar_acosh_sum_f64, - simdeez_acosh_sum_f64, + BenchTargetsF64 { + scalar_native: scalar_acosh_sum_f64, + simdeez_runtime: simdeez_acosh_sum_f64, + simdeez_scalar: forced_scalar_acosh_sum_f64, + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + simdeez_sse2: simdeez_acosh_sum_f64_sse2, + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + simdeez_sse41: simdeez_acosh_sum_f64_sse41, + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + simdeez_avx2: simdeez_acosh_sum_f64_avx2, + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + simdeez_avx512: simdeez_acosh_sum_f64_avx512, + }, ); - shared::bench_unary_f64( + shared::bench_unary_variants_f64( c, "simd_math_baseline/f64/atanh_u35", &atanh_inputs_f64, - scalar_atanh_sum_f64, - simdeez_atanh_sum_f64, + BenchTargetsF64 { + scalar_native: scalar_atanh_sum_f64, + simdeez_runtime: simdeez_atanh_sum_f64, + simdeez_scalar: forced_scalar_atanh_sum_f64, + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + simdeez_sse2: simdeez_atanh_sum_f64_sse2, + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + simdeez_sse41: simdeez_atanh_sum_f64_sse41, + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + simdeez_avx2: simdeez_atanh_sum_f64_avx2, + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + simdeez_avx512: simdeez_atanh_sum_f64_avx512, + }, ); } diff --git a/benches/simd_math_remaining_baseline/inverse_trig.rs b/benches/simd_math_remaining_baseline/inverse_trig.rs index e37d66b..50a401e 100644 --- a/benches/simd_math_remaining_baseline/inverse_trig.rs +++ b/benches/simd_math_remaining_baseline/inverse_trig.rs @@ -1,8 +1,9 @@ use criterion::Criterion; use simdeez::math::SimdMathF32InverseTrig; +use simdeez::scalar::Scalar; use simdeez::{prelude::*, simd_unsafe_generate_all}; -use crate::shared::{self, INPUT_LEN}; +use crate::shared::{self, BenchTargets, INPUT_LEN}; #[inline(never)] fn scalar_asin_sum(input: &[f32]) -> f32 { @@ -15,14 +16,30 @@ simd_unsafe_generate_all!( } ); +#[inline(never)] +fn forced_scalar_asin_sum(input: &[f32]) -> f32 { + shared::force_scalar_sum(input, |v: ::Vf32| v.asin_u35()) +} + pub fn register(c: &mut Criterion) { let asin_inputs = shared::make_unary_inputs(INPUT_LEN, 0xDEADB001, -1.0..1.0); - shared::bench_unary( + shared::bench_unary_variants( c, "simd_math_baseline/f32/asin_u35", &asin_inputs, - scalar_asin_sum, - simdeez_asin_sum, + BenchTargets { + scalar_native: 
scalar_asin_sum,
+            simdeez_runtime: simdeez_asin_sum,
+            simdeez_scalar: forced_scalar_asin_sum,
+            #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
+            simdeez_sse2: simdeez_asin_sum_sse2,
+            #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
+            simdeez_sse41: simdeez_asin_sum_sse41,
+            #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
+            simdeez_avx2: simdeez_asin_sum_avx2,
+            #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
+            simdeez_avx512: simdeez_asin_sum_avx512,
+        },
     );
 }
diff --git a/benches/simd_math_remaining_baseline/shared.rs b/benches/simd_math_remaining_baseline/shared.rs
index a4b9ae8..f4804e1 100644
--- a/benches/simd_math_remaining_baseline/shared.rs
+++ b/benches/simd_math_remaining_baseline/shared.rs
@@ -2,6 +2,7 @@ use criterion::{Criterion, Throughput};
 use rand::{Rng, SeedableRng};
 use rand_chacha::ChaCha8Rng;
 use simdeez::prelude::*;
+use simdeez::scalar::Scalar;
 use std::hint::black_box;
 
 pub const INPUT_LEN: usize = 1 << 20;
@@ -108,76 +109,294 @@ pub fn simdeez_binary_sum_impl_f64(
     sum
 }
 
-pub fn bench_unary(
-    c: &mut Criterion,
-    name: &str,
-    input: &[f32],
-    scalar: fn(&[f32]) -> f32,
-    simd: fn(&[f32]) -> f32,
-) {
+type ScalarVf32 = <Scalar as Simd>::Vf32;
+type ScalarVf64 = <Scalar as Simd>::Vf64;
+
+pub struct BenchTargets {
+    pub scalar_native: fn(&[f32]) -> f32,
+    pub simdeez_runtime: fn(&[f32]) -> f32,
+    pub simdeez_scalar: fn(&[f32]) -> f32,
+    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
+    pub simdeez_sse2: unsafe fn(&[f32]) -> f32,
+    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
+    pub simdeez_sse41: unsafe fn(&[f32]) -> f32,
+    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
+    pub simdeez_avx2: unsafe fn(&[f32]) -> f32,
+    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
+    pub simdeez_avx512: unsafe fn(&[f32]) -> f32,
+}
+
+pub struct BenchTargetsF64 {
+    pub scalar_native: fn(&[f64]) -> f64,
+    pub simdeez_runtime: fn(&[f64]) -> f64,
+    pub simdeez_scalar: fn(&[f64]) -> f64,
+    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
+    pub simdeez_sse2: unsafe fn(&[f64]) -> f64,
+    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
+    pub simdeez_sse41: unsafe fn(&[f64]) -> f64,
+    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
+    pub simdeez_avx2: unsafe fn(&[f64]) -> f64,
+    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
+    pub simdeez_avx512: unsafe fn(&[f64]) -> f64,
+}
+
+pub struct BinaryBenchTargets {
+    pub scalar_native: fn(&[f32], &[f32]) -> f32,
+    pub simdeez_runtime: fn(&[f32], &[f32]) -> f32,
+    pub simdeez_scalar: fn(&[f32], &[f32]) -> f32,
+    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
+    pub simdeez_sse2: unsafe fn(&[f32], &[f32]) -> f32,
+    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
+    pub simdeez_sse41: unsafe fn(&[f32], &[f32]) -> f32,
+    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
+    pub simdeez_avx2: unsafe fn(&[f32], &[f32]) -> f32,
+    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
+    pub simdeez_avx512: unsafe fn(&[f32], &[f32]) -> f32,
+}
+
+pub struct BinaryBenchTargetsF64 {
+    pub scalar_native: fn(&[f64], &[f64]) -> f64,
+    pub simdeez_runtime: fn(&[f64], &[f64]) -> f64,
+    pub simdeez_scalar: fn(&[f64], &[f64]) -> f64,
+    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
+    pub simdeez_sse2: unsafe fn(&[f64], &[f64]) -> f64,
+    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
+    pub simdeez_sse41: unsafe fn(&[f64], &[f64]) -> f64,
+    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
+    pub simdeez_avx2: unsafe fn(&[f64], &[f64]) -> f64,
+    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
+    pub simdeez_avx512: unsafe fn(&[f64], &[f64]) -> f64,
+}
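The forced-backend fields are `unsafe fn` pointers because the `_sse2`/`_sse41`/`_avx2`/`_avx512` variants emitted by `simd_unsafe_generate_all!` assume their target features are present; the harness functions below only call them behind runtime feature checks. A minimal caller-side sketch of that contract, reusing one of the generated names from the f64 core bench purely for illustration:

    // Sketch: how a forced-backend pointer is meant to be exercised.
    // `simdeez_log2_sum_avx2` is one of the variants generated elsewhere
    // in this patch by simd_unsafe_generate_all!.
    fn run_forced_avx2(inputs: &[f64]) -> Option<f64> {
        if std::is_x86_feature_detected!("avx2") && std::is_x86_feature_detected!("fma") {
            // SAFETY: AVX2 and FMA were confirmed at runtime just above.
            Some(unsafe { simdeez_log2_sum_avx2(inputs) })
        } else {
            None
        }
    }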
+
+#[inline(never)]
+pub fn force_scalar_sum(input: &[f32], op: impl Fn(ScalarVf32) -> ScalarVf32) -> f32 {
+    simdeez_unary_sum_impl::<Scalar>(input, op)
+}
+
+#[inline(never)]
+pub fn force_scalar_sum_f64(input: &[f64], op: impl Fn(ScalarVf64) -> ScalarVf64) -> f64 {
+    simdeez_unary_sum_impl_f64::<Scalar>(input, op)
+}
+
+#[inline(never)]
+pub fn force_scalar_binary_sum(
+    a: &[f32],
+    b: &[f32],
+    op: impl Fn(ScalarVf32, ScalarVf32) -> ScalarVf32,
+) -> f32 {
+    simdeez_binary_sum_impl::<Scalar>(a, b, op)
+}
+
+#[inline(never)]
+pub fn force_scalar_binary_sum_f64(
+    a: &[f64],
+    b: &[f64],
+    op: impl Fn(ScalarVf64, ScalarVf64) -> ScalarVf64,
+) -> f64 {
+    simdeez_binary_sum_impl_f64::<Scalar>(a, b, op)
+}
+
+pub fn bench_unary_variants(c: &mut Criterion, name: &str, input: &[f32], targets: BenchTargets) {
     let mut group = c.benchmark_group(name);
     group.throughput(Throughput::Elements(input.len() as u64));
     group.bench_function("scalar-native", |b| {
-        b.iter(|| black_box(scalar(black_box(input))))
+        b.iter(|| black_box((targets.scalar_native)(black_box(input))))
     });
     group.bench_function("simdeez-runtime", |b| {
-        b.iter(|| black_box(simd(black_box(input))))
+        b.iter(|| black_box((targets.simdeez_runtime)(black_box(input))))
+    });
+    group.bench_function("simdeez-forced-scalar", |b| {
+        b.iter(|| black_box((targets.simdeez_scalar)(black_box(input))))
     });
+
+    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
+    {
+        if std::is_x86_feature_detected!("sse2") {
+            group.bench_function("simdeez-forced-sse2", |b| {
+                b.iter(|| unsafe { black_box((targets.simdeez_sse2)(black_box(input))) })
+            });
+        }
+        if std::is_x86_feature_detected!("sse4.1") {
+            group.bench_function("simdeez-forced-sse41", |b| {
+                b.iter(|| unsafe { black_box((targets.simdeez_sse41)(black_box(input))) })
+            });
+        }
+        if std::is_x86_feature_detected!("avx2") && std::is_x86_feature_detected!("fma") {
+            group.bench_function("simdeez-forced-avx2", |b| {
+                b.iter(|| unsafe { black_box((targets.simdeez_avx2)(black_box(input))) })
+            });
+        }
+        if std::is_x86_feature_detected!("avx512f")
+            && std::is_x86_feature_detected!("avx512bw")
+            && std::is_x86_feature_detected!("avx512dq")
+        {
+            group.bench_function("simdeez-forced-avx512", |b| {
+                b.iter(|| unsafe { black_box((targets.simdeez_avx512)(black_box(input))) })
+            });
+        }
+    }
+
     group.finish();
 }
 
-pub fn bench_unary_f64(
+pub fn bench_unary_variants_f64(
     c: &mut Criterion,
     name: &str,
     input: &[f64],
-    scalar: fn(&[f64]) -> f64,
-    simd: fn(&[f64]) -> f64,
+    targets: BenchTargetsF64,
 ) {
     let mut group = c.benchmark_group(name);
     group.throughput(Throughput::Elements(input.len() as u64));
     group.bench_function("scalar-native", |b| {
-        b.iter(|| black_box(scalar(black_box(input))))
+        b.iter(|| black_box((targets.scalar_native)(black_box(input))))
     });
     group.bench_function("simdeez-runtime", |b| {
-        b.iter(|| black_box(simd(black_box(input))))
+        b.iter(|| black_box((targets.simdeez_runtime)(black_box(input))))
+    });
+    group.bench_function("simdeez-forced-scalar", |b| {
+        b.iter(|| black_box((targets.simdeez_scalar)(black_box(input))))
     });
+
+    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
+    {
+        if std::is_x86_feature_detected!("sse2") {
+            group.bench_function("simdeez-forced-sse2", |b| {
+                b.iter(|| unsafe { black_box((targets.simdeez_sse2)(black_box(input))) })
+            });
+        }
+        if std::is_x86_feature_detected!("sse4.1") {
+            group.bench_function("simdeez-forced-sse41", |b| {
+                b.iter(|| unsafe {
black_box((targets.simdeez_sse41)(black_box(input))) }) + }); + } + if std::is_x86_feature_detected!("avx2") && std::is_x86_feature_detected!("fma") { + group.bench_function("simdeez-forced-avx2", |b| { + b.iter(|| unsafe { black_box((targets.simdeez_avx2)(black_box(input))) }) + }); + } + if std::is_x86_feature_detected!("avx512f") + && std::is_x86_feature_detected!("avx512bw") + && std::is_x86_feature_detected!("avx512dq") + { + group.bench_function("simdeez-forced-avx512", |b| { + b.iter(|| unsafe { black_box((targets.simdeez_avx512)(black_box(input))) }) + }); + } + } + group.finish(); } -pub fn bench_binary( +pub fn bench_binary_variants( c: &mut Criterion, name: &str, a: &[f32], b: &[f32], - scalar: fn(&[f32], &[f32]) -> f32, - simd: fn(&[f32], &[f32]) -> f32, + targets: BinaryBenchTargets, ) { let mut group = c.benchmark_group(name); group.throughput(Throughput::Elements(a.len() as u64)); group.bench_function("scalar-native", |ben| { - ben.iter(|| black_box(scalar(black_box(a), black_box(b)))) + ben.iter(|| black_box((targets.scalar_native)(black_box(a), black_box(b)))) }); group.bench_function("simdeez-runtime", |ben| { - ben.iter(|| black_box(simd(black_box(a), black_box(b)))) + ben.iter(|| black_box((targets.simdeez_runtime)(black_box(a), black_box(b)))) + }); + group.bench_function("simdeez-forced-scalar", |ben| { + ben.iter(|| black_box((targets.simdeez_scalar)(black_box(a), black_box(b)))) }); + + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + { + if std::is_x86_feature_detected!("sse2") { + group.bench_function("simdeez-forced-sse2", |ben| { + ben.iter(|| unsafe { + black_box((targets.simdeez_sse2)(black_box(a), black_box(b))) + }) + }); + } + if std::is_x86_feature_detected!("sse4.1") { + group.bench_function("simdeez-forced-sse41", |ben| { + ben.iter(|| unsafe { + black_box((targets.simdeez_sse41)(black_box(a), black_box(b))) + }) + }); + } + if std::is_x86_feature_detected!("avx2") && std::is_x86_feature_detected!("fma") { + group.bench_function("simdeez-forced-avx2", |ben| { + ben.iter(|| unsafe { + black_box((targets.simdeez_avx2)(black_box(a), black_box(b))) + }) + }); + } + if std::is_x86_feature_detected!("avx512f") + && std::is_x86_feature_detected!("avx512bw") + && std::is_x86_feature_detected!("avx512dq") + { + group.bench_function("simdeez-forced-avx512", |ben| { + ben.iter(|| unsafe { + black_box((targets.simdeez_avx512)(black_box(a), black_box(b))) + }) + }); + } + } + group.finish(); } -pub fn bench_binary_f64( +pub fn bench_binary_variants_f64( c: &mut Criterion, name: &str, a: &[f64], b: &[f64], - scalar: fn(&[f64], &[f64]) -> f64, - simd: fn(&[f64], &[f64]) -> f64, + targets: BinaryBenchTargetsF64, ) { let mut group = c.benchmark_group(name); group.throughput(Throughput::Elements(a.len() as u64)); group.bench_function("scalar-native", |ben| { - ben.iter(|| black_box(scalar(black_box(a), black_box(b)))) + ben.iter(|| black_box((targets.scalar_native)(black_box(a), black_box(b)))) }); group.bench_function("simdeez-runtime", |ben| { - ben.iter(|| black_box(simd(black_box(a), black_box(b)))) + ben.iter(|| black_box((targets.simdeez_runtime)(black_box(a), black_box(b)))) + }); + group.bench_function("simdeez-forced-scalar", |ben| { + ben.iter(|| black_box((targets.simdeez_scalar)(black_box(a), black_box(b)))) }); + + #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + { + if std::is_x86_feature_detected!("sse2") { + group.bench_function("simdeez-forced-sse2", |ben| { + ben.iter(|| unsafe { + black_box((targets.simdeez_sse2)(black_box(a), 
black_box(b))) + }) + }); + } + if std::is_x86_feature_detected!("sse4.1") { + group.bench_function("simdeez-forced-sse41", |ben| { + ben.iter(|| unsafe { + black_box((targets.simdeez_sse41)(black_box(a), black_box(b))) + }) + }); + } + if std::is_x86_feature_detected!("avx2") && std::is_x86_feature_detected!("fma") { + group.bench_function("simdeez-forced-avx2", |ben| { + ben.iter(|| unsafe { + black_box((targets.simdeez_avx2)(black_box(a), black_box(b))) + }) + }); + } + if std::is_x86_feature_detected!("avx512f") + && std::is_x86_feature_detected!("avx512bw") + && std::is_x86_feature_detected!("avx512dq") + { + group.bench_function("simdeez-forced-avx512", |ben| { + ben.iter(|| unsafe { + black_box((targets.simdeez_avx512)(black_box(a), black_box(b))) + }) + }); + } + } + group.finish(); } diff --git a/src/math/f32/hyperbolic.rs b/src/math/f32/hyperbolic.rs index 69c31e1..5b22f53 100644 --- a/src/math/f32/hyperbolic.rs +++ b/src/math/f32/hyperbolic.rs @@ -9,6 +9,14 @@ const SINH_COSH_FAST_ABS_MAX: f32 = 40.0; const TANH_SMALL_ABS: f32 = 0.625; const TANH_FAST_ABS_MAX: f32 = 40.0; +// DECISION(2026-03-23): KEEP_SIMD_PORTABLE +// Function(s): f32 sinh_u35 / cosh_u35 / tanh_u35 +// Why kept: +// - local benches show large wins over native scalar across runtime-selected and AVX2 paths +// - the family already centralizes exceptional-lane scalar patching cleanly +// Revisit when: +// - the exp/log backbone or fast-range cutovers change materially + #[inline(always)] fn any_lane_nonzero(mask: SimdI32) -> bool where diff --git a/src/math/f32/mod.rs b/src/math/f32/mod.rs index 02ca809..56fff9e 100644 --- a/src/math/f32/mod.rs +++ b/src/math/f32/mod.rs @@ -1,6 +1,6 @@ //! f32 SIMD math kernel layering: //! - `portable`: backend-agnostic reduction/polynomial kernels + scalar lane patching. -//! - `x86_avx2`: optional hand-optimized override(s) for specific functions. +//! - `x86_avx2`: the only currently justified backend override (`log2_u35`). //! - this module: dispatch glue selecting overrides without changing the public API. 
mod hyperbolic; @@ -11,6 +11,14 @@ mod x86_avx2; use crate::{Simd, SimdFloat32}; +// DECISION(2026-03-23): KEEP_SIMD_OVERRIDE +// Function(s): f32 log2_u35 dispatch +// Why kept: +// - local benches show the AVX2 path materially ahead of native scalar and forced scalar +// - the portable fallback also stays worthwhile on non-AVX2 backends +// Revisit when: +// - the portable log2 kernel or the AVX2 override changes materially + #[inline(always)] pub(crate) fn log2_u35(input: V) -> V where diff --git a/src/math/f32/portable.rs b/src/math/f32/portable.rs index b39f1d2..657b645 100644 --- a/src/math/f32/portable.rs +++ b/src/math/f32/portable.rs @@ -8,6 +8,30 @@ pub(super) const F32_MANTISSA_MASK: i32 = 0x007F_FFFF; pub(super) const F32_LOG_NORM_MANTISSA: i32 = 0x3F00_0000; pub(super) const F32_EXPONENT_BIAS_ADJUST: i32 = 126; +// DECISION(2026-03-23): KEEP_SIMD_PORTABLE +// Function(s): f32 log2_u35 portable fallback / exp2_u35 +// Why kept: +// - local benches show both kernels materially ahead of native scalar on this host +// - scalar patching already contains non-finite, zero, and subnormal edge lanes +// Revisit when: +// - a new approximation family lands or non-x86 evidence disagrees sharply + +// DECISION(2026-03-23): KEEP_SIMD_PORTABLE +// Function(s): f32 sin_u35 / cos_u35 / tan_u35 +// Why kept: +// - runtime-selected throughput is far above native scalar on the local machine +// - targeted boundary and mixed-lane tests cover current reduction and tan-pole handling +// Revisit when: +// - large-argument reduction strategy changes materially + +// DECISION(2026-03-23): KEEP_SIMD_PORTABLE +// Function(s): f32 asinh_u35 / acosh_u35 / atanh_u35 +// Why kept: +// - the restored inverse-hyperbolic paths beat native scalar in local benchmarks +// - exceptional-domain lanes already fall back to scalar references +// Revisit when: +// - the shared log/exp kernels change enough to affect the current balance + #[inline(always)] fn any_lane_nonzero(mask: SimdI32) -> bool where diff --git a/src/math/f32/x86_avx2.rs b/src/math/f32/x86_avx2.rs index fba9db6..df23eeb 100644 --- a/src/math/f32/x86_avx2.rs +++ b/src/math/f32/x86_avx2.rs @@ -7,6 +7,14 @@ use crate::math::f32::portable; use crate::math::scalar; use crate::{Simd, SimdFloat32}; +// DECISION(2026-03-23): KEEP_SIMD_OVERRIDE +// Function(s): f32 log2_u35 AVX2 override +// Why kept: +// - the AVX2 path is the fastest local benchmark variant for the restored log2 kernel +// - exceptional semantics still route through the shared portable scalar patching +// Revisit when: +// - the portable fallback catches up or semantic divergence appears + #[cfg(target_arch = "x86")] use core::arch::x86::*; #[cfg(target_arch = "x86_64")] diff --git a/src/math/f64/binary_misc.rs b/src/math/f64/binary_misc.rs index b99f792..a5d1b45 100644 --- a/src/math/f64/binary_misc.rs +++ b/src/math/f64/binary_misc.rs @@ -1,6 +1,22 @@ use crate::math::{f64, scalar}; use crate::{Simd, SimdBaseIo, SimdBaseOps, SimdConsts, SimdFloat64, SimdInt64}; +// DECISION(2026-03-23): KEEP_MIXED +// Function(s): f64 log10_u35 +// Why kept: +// - local runtime-selected performance is clearly better than native scalar +// - the current implementation still rides scalar-reference log2_u35 underneath +// Revisit when: +// - f64 log2_u35 gets a new keep/revert outcome + +// DECISION(2026-03-23): KEEP_SIMD_PORTABLE +// Function(s): f64 atan2_u35 / hypot_u35 / fmod +// Why kept: +// - all three are strong local wins over native scalar, especially hypot and fmod +// - targeted adversarial 
tests already cover signed-zero, scale, and exceptional-lane behavior
+// Revisit when:
+// - backend-specific behavior diverges or the scalar contracts change materially
+
 type SimdI64<V> = <<V as SimdFloat64>::Engine as Simd>::Vi64;
 
 const F64_SIGN_MASK: i64 = i64::MIN;
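The f64 core revert below drops the vector fast path together with its lane-patching helpers and routes every entry point back through `map::unary_f64` with the matching scalar reference. `map::unary_f64` itself is not part of this patch; a plausible shape for it, assuming the same lane accessors the removed helpers rely on, is:

    // Sketch only: a lane-wise scalar mapper in the spirit of map::unary_f64.
    // The real helper lives in the crate's map module and may differ in detail.
    fn unary_f64_sketch<V: SimdFloat64>(input: V, op: fn(f64) -> f64) -> V {
        unsafe {
            let mut lanes = input.as_array();
            for lane in 0..V::WIDTH {
                lanes[lane] = op(lanes[lane]);
            }
            V::load_from_ptr_unaligned(&lanes as *const V::ArrayRepresentation as *const f64)
        }
    }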
diff --git a/src/math/f64/core.rs b/src/math/f64/core.rs
index b6a51bb..819d2be 100644
--- a/src/math/f64/core.rs
+++ b/src/math/f64/core.rs
@@ -1,55 +1,21 @@
 use crate::math::{map, scalar};
-use crate::{Simd, SimdBaseIo, SimdBaseOps, SimdConsts, SimdFloat64, SimdInt64};
-
-type SimdI64<V> = <<V as SimdFloat64>::Engine as Simd>::Vi64;
-
-#[inline(always)]
-fn any_lane_nonzero<V>(mask: SimdI64<V>) -> bool
-where
-    V: SimdFloat64,
-    V::Engine: Simd,
-{
-    unsafe {
-        let lanes = mask.as_array();
-        for lane in 0..V::WIDTH {
-            if lanes[lane] != 0 {
-                return true;
-            }
-        }
-    }
-
-    false
-}
-
-#[inline(always)]
-fn patch_exceptional_lanes<V>(
-    input: V,
-    output: V,
-    exceptional_mask: SimdI64<V>,
-    scalar_fallback: fn(f64) -> f64,
-) -> V
-where
-    V: SimdFloat64,
-    V::Engine: Simd,
-{
-    if !any_lane_nonzero::<V>(exceptional_mask) {
-        return output;
-    }
-
-    unsafe {
-        let input_lanes = input.as_array();
-        let mask_lanes = exceptional_mask.as_array();
-        let mut output_lanes = output.as_array();
-
-        for lane in 0..V::WIDTH {
-            if mask_lanes[lane] != 0 {
-                output_lanes[lane] = scalar_fallback(input_lanes[lane]);
-            }
-        }
-
-        V::load_from_ptr_unaligned(&output_lanes as *const V::ArrayRepresentation as *const f64)
-    }
-}
+use crate::{Simd, SimdFloat64};
+
+// DECISION(2026-03-23): KEEP_SCALAR_REFERENCE
+// Function(s): f64 log2_u35 / exp2_u35 / ln_u35 / exp_u35
+// Why scalar:
+// - local benches keep putting runtime-selected behavior at or below native scalar
+// - family structure stays useful, but the current default is still scalar-reference
+// Revisit when:
+// - a genuinely worthwhile f64 log/exp SIMD kernel exists
+
+// DECISION(2026-03-23): KEEP_SCALAR_REFERENCE
+// Function(s): f64 sin_u35 / cos_u35 / tan_u35
+// Why scalar:
+// - the previous portable trig fast path still lagged native scalar on this host
+// - the implementation was reverted to scalar-reference while preserving family ownership
+// Revisit when:
+// - a stronger range-reduction strategy or cheaper trig kernel appears
 
 #[inline(always)]
 pub(crate) fn log2_u35<V>(input: V) -> V
 where
@@ -82,107 +48,30 @@ pub(crate) fn exp_u35<V>(input: V) -> V
 where
     V: SimdFloat64,
-    V::Engine: Simd,
 {
     map::unary_f64(input, scalar::exp_u35_f64)
 }
 
-#[inline(always)]
-fn trig_exceptional_mask<V>(input: V) -> SimdI64<V>
-where
-    V: SimdFloat64,
-    V::Engine: Simd,
-{
-    let finite_mask = input.cmp_eq(input).bitcast_i64();
-    let within_fast_range = input
-        .abs()
-        .cmp_lte(V::set1(core::f64::consts::FRAC_PI_4))
-        .bitcast_i64();
-    let non_zero = input.cmp_neq(V::zeroes()).bitcast_i64();
-    (finite_mask & within_fast_range & non_zero).cmp_eq(SimdI64::<V>::zeroes())
-}
-
-#[inline(always)]
-fn sin_cos_fast<V>(input: V) -> (V, V)
-where
-    V: SimdFloat64,
-    V::Engine: Simd,
-{
-    let two_over_pi = V::set1(core::f64::consts::FRAC_2_PI);
-    let n = (input * two_over_pi).round().cast_i64();
-
-    let n_f = n.cast_f64();
-    let r = ((input - n_f * V::set1(core::f64::consts::FRAC_PI_2))
-        - n_f * V::set1(6.123_233_995_736_766e-17))
-        - n_f * V::set1(-2.022_266_248_795_951e-21);
-    let r2 = r * r;
-
-    let mut sin_poly = V::set1(1.589_690_995_211_55e-10);
-    sin_poly = (sin_poly * r2) + V::set1(-2.505_076_025_340_686_3e-8);
-    sin_poly = (sin_poly * r2) + V::set1(2.755_731_370_707_006_8e-6);
-    sin_poly = (sin_poly * r2) + V::set1(-1.984_126_982_985_795e-4);
-    sin_poly = (sin_poly * r2) + V::set1(8.333_333_333_322_49e-3);
-    sin_poly = (sin_poly * r2) + V::set1(-1.666_666_666_666_632_4e-1);
-    let sin_r = ((sin_poly * r2) * r) + r;
-
-    let mut cos_poly = V::set1(-1.135_964_755_778_819_5e-11);
-    cos_poly = (cos_poly * r2) + V::set1(2.087_572_321_298_175e-9);
-    cos_poly = (cos_poly * r2) + V::set1(-2.755_731_435_139_066_3e-7);
-    cos_poly = (cos_poly * r2) + V::set1(2.480_158_728_947_673e-5);
-    cos_poly = (cos_poly * r2) + V::set1(-1.388_888_888_887_305_6e-3);
-    cos_poly = (cos_poly * r2) + V::set1(4.166_666_666_666_659e-2);
-    let cos_r = (cos_poly * r2 * r2) - (V::set1(0.5) * r2) + V::set1(1.0);
-
-    let q = n & SimdI64::<V>::set1(3);
-    let q0 = q.cmp_eq(SimdI64::<V>::zeroes()).bitcast_f64();
-    let q1 = q.cmp_eq(SimdI64::<V>::set1(1)).bitcast_f64();
-    let q2 = q.cmp_eq(SimdI64::<V>::set1(2)).bitcast_f64();
-
-    let mut sin_out = q0.blendv(V::zeroes(), sin_r);
-    sin_out = q1.blendv(sin_out, cos_r);
-    sin_out = q2.blendv(sin_out, -sin_r);
-    sin_out = (q0 | q1 | q2).cmp_eq(V::zeroes()).blendv(sin_out, -cos_r);
-
-    let mut cos_out = q0.blendv(V::zeroes(), cos_r);
-    cos_out = q1.blendv(cos_out, -sin_r);
-    cos_out = q2.blendv(cos_out, -cos_r);
-    cos_out = (q0 | q1 | q2).cmp_eq(V::zeroes()).blendv(cos_out, sin_r);
-
-    (sin_out, cos_out)
-}
-
 #[inline(always)]
 pub(crate) fn sin_u35<V>(input: V) -> V
 where
     V: SimdFloat64,
-    V::Engine: Simd,
 {
-    let exceptional_mask = trig_exceptional_mask(input);
-    let (sin_fast, _) = sin_cos_fast(input);
-    patch_exceptional_lanes(input, sin_fast, exceptional_mask, scalar::sin_u35_f64)
+    map::unary_f64(input, scalar::sin_u35_f64)
 }
 
 #[inline(always)]
 pub(crate) fn cos_u35<V>(input: V) -> V
 where
     V: SimdFloat64,
-    V::Engine: Simd,
 {
-    let exceptional_mask = trig_exceptional_mask(input);
-    let (_, cos_fast) = sin_cos_fast(input);
-    patch_exceptional_lanes(input, cos_fast, exceptional_mask, scalar::cos_u35_f64)
+    map::unary_f64(input, scalar::cos_u35_f64)
 }
 
 #[inline(always)]
 pub(crate) fn tan_u35<V>(input: V) -> V
 where
     V: SimdFloat64,
-    V::Engine: Simd,
 {
-    let base_exceptional = trig_exceptional_mask(input);
-    let (sin_fast, cos_fast) = sin_cos_fast(input);
-    let dangerous = cos_fast.abs().cmp_lt(V::set1(1.0e-12)).bitcast_i64();
-    let exceptional_mask = base_exceptional | dangerous;
-    let fast = sin_fast / cos_fast;
-    patch_exceptional_lanes(input, fast, exceptional_mask, scalar::tan_u35_f64)
+    map::unary_f64(input, scalar::tan_u35_f64)
 }
diff --git a/src/math/f64/hyperbolic.rs b/src/math/f64/hyperbolic.rs
index 111aa78..ae02352 100644
--- a/src/math/f64/hyperbolic.rs
+++ b/src/math/f64/hyperbolic.rs
@@ -1,6 +1,14 @@
 use crate::math::{map, scalar};
 use crate::SimdFloat64;
 
+// DECISION(2026-03-23): KEEP_SCALAR_REFERENCE
+// Function(s): f64 sinh_u35 / cosh_u35 / tanh_u35
+// Why scalar:
+// - local benches do not justify a portable SIMD default for this family on the current host
+// - keeping the family split still preserves test and ownership structure for a later retry
+// Revisit when:
+// - a cheaper f64 exp/log backbone or a dedicated hyperbolic kernel lands
+
 #[inline(always)]
 pub(crate) fn sinh_u35<V>(input: V) -> V
 where
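One cheap way to keep these scalar-reference verdicts honest is to cross-check the runtime-selected and forced-scalar harness entry points over a shared input set, since they should differ only by summation order. A sketch using helpers this patch already defines in the hyperbolic bench (the tolerance is illustrative, not a contract):

    // Sketch: runtime-selected vs forced-scalar agreement for f64 sinh sums,
    // written against the bench-side helpers added in this patch.
    fn sinh_paths_roughly_agree() -> bool {
        let inputs = make_unary_inputs_f64(INPUT_LEN, 0xA11C_E106, -5.0..5.0);
        let runtime = simdeez_sinh_sum_f64(&inputs);
        let forced = forced_scalar_sinh_sum_f64(&inputs);
        (runtime - forced).abs() <= 1.0e-9 * forced.abs().max(1.0)
    }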
diff --git a/src/math/f64/inverse_hyperbolic.rs b/src/math/f64/inverse_hyperbolic.rs
index 50914bb..d399edf 100644
--- a/src/math/f64/inverse_hyperbolic.rs
+++ b/src/math/f64/inverse_hyperbolic.rs
@@ -1,6 +1,22 @@
 use crate::math::{f64, map, scalar};
 use crate::{Simd, SimdBaseIo, SimdBaseOps, SimdConsts, SimdFloat64};

+// DECISION(2026-03-23): KEEP_MIXED
+// Function(s): f64 asinh_u35
+// Why kept:
+// - local benches show the current hybrid path materially ahead of native scalar
+// - the fast path still depends on scalar-reference ln_u35, so this is not a full SIMD keep
+// Revisit when:
+// - f64 ln_u35 stops being scalar-reference or asinh gets its own cheaper core
+
+// DECISION(2026-03-23): KEEP_SCALAR_REFERENCE
+// Function(s): f64 acosh_u35 / atanh_u35
+// Why scalar:
+// - local runtime-selected results do not beat native scalar on this host
+// - scalar-reference keeps semantics honest without adding more f64 complexity today
+// Revisit when:
+// - a stronger f64 inverse-hyperbolic kernel family exists
+
 type SimdI64<V> = <<V as SimdFloat64>::Engine as Simd>::Vi64;

 #[inline(always)]
diff --git a/src/math/f64/mod.rs b/src/math/f64/mod.rs
index 18e5923..8b9302d 100644
--- a/src/math/f64/mod.rs
+++ b/src/math/f64/mod.rs
@@ -1,7 +1,10 @@
 //! f64 SIMD math dispatch layering:
 //! - family-local modules own the public internal routing points for each math family.
-//! - current implementations remain scalar-mapped through `map` + `scalar`.
-//! - follow-up optimization PRs can replace one family module at a time.
+//! - current decisions are intentionally mixed:
+//!   scalar-reference for core/trig and hyperbolic defaults,
+//!   portable SIMD for inverse trig and several binary-misc kernels,
+//!   and hybrid paths where scalar sub-ops still underpin the fast path.
+//! - follow-up optimization work can still replace one family module at a time.

 mod binary_misc;
 mod core;
diff --git a/src/math/families/binary_misc/mod.rs b/src/math/families/binary_misc/mod.rs
index 7ce0e5a..aa2e21d 100644
--- a/src/math/families/binary_misc/mod.rs
+++ b/src/math/families/binary_misc/mod.rs
@@ -3,6 +3,14 @@ mod portable_f32;
 use crate::math::{f64, map, scalar};
 use crate::{Simd, SimdFloat32, SimdFloat64};

+// DECISION(2026-03-23): KEEP_SCALAR_REFERENCE
+// Function(s): f32 fmod
+// Why scalar:
+// - local benches still favor native scalar and there is no convincing portable SIMD default yet
+// - the public trait entry point stays stable while the honest implementation remains scalar-reference
+// Revisit when:
+// - quotient-range handling becomes cheap enough for a worthwhile portable kernel
+
 pub trait SimdMathF32BinaryMisc: SimdFloat32 {
     #[inline(always)]
     fn log10_u35(self) -> Self
diff --git a/src/math/families/binary_misc/portable_f32.rs b/src/math/families/binary_misc/portable_f32.rs
index fe86128..53ad795 100644
--- a/src/math/families/binary_misc/portable_f32.rs
+++ b/src/math/families/binary_misc/portable_f32.rs
@@ -1,6 +1,14 @@
 use crate::math::{f32, scalar};
 use crate::{Simd, SimdBaseIo, SimdBaseOps, SimdFloat32, SimdInt32};

+// DECISION(2026-03-23): KEEP_SIMD_PORTABLE
+// Function(s): f32 log10_u35 / atan2_u35 / hypot_u35
+// Why kept:
+// - local benches show each of these kernels materially ahead of native scalar
+// - targeted edge tests already cover domain, signed-zero, and scale-stress behavior
+// Revisit when:
+// - the shared log2/atan primitives or exceptional-lane rules change materially
+
 type SimdI32<V> = <<V as SimdFloat32>::Engine as Simd>::Vi32;

 #[inline(always)]
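For the scale-stress behavior the hypot keep above refers to, a scalar sketch of the standard overflow-safe structure: factor out the larger magnitude so the squared ratio stays representable. The helper is illustrative, not this crate's kernel.

fn hypot_scale_safe(x: f32, y: f32) -> f32 {
    // Order the magnitudes so the ratio below is at most 1.
    let (hi, lo) = if x.abs() >= y.abs() {
        (x.abs(), y.abs())
    } else {
        (y.abs(), x.abs())
    };
    if hi == 0.0 {
        return 0.0;
    }
    // r <= 1, so r * r cannot overflow or underflow even for huge inputs.
    let r = lo / hi;
    hi * (1.0 + r * r).sqrt()
}

fn main() {
    assert_eq!(hypot_scale_safe(3.0, 4.0), 5.0);
    // Naive sqrt(x*x + y*y) would overflow f32 here; the factored form survives.
    let big = 2.0e38f32;
    assert!(hypot_scale_safe(big, big).is_finite());
}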
diff --git a/src/math/families/core.rs b/src/math/families/core.rs
index 01da52e..fc15e2c 100644
--- a/src/math/families/core.rs
+++ b/src/math/families/core.rs
@@ -1,6 +1,14 @@
 use crate::math::{f32, f64, map, scalar};
 use crate::{Simd, SimdFloat32, SimdFloat64};

+// DECISION(2026-03-23): KEEP_SCALAR_REFERENCE
+// Function(s): f32 ln_u35 / exp_u35
+// Why scalar:
+// - local benches keep the current runtime-selected path below native scalar
+// - retaining the family entry points still preserves structure for later retries
+// Revisit when:
+// - a better shared f32 log/exp kernel exists
+
 pub trait SimdMathF32Core: SimdFloat32 {
     #[inline(always)]
     fn log2_u35(self) -> Self
diff --git a/src/math/families/inverse_trig/portable_f32.rs b/src/math/families/inverse_trig/portable_f32.rs
index ee0c47a..5472e8c 100644
--- a/src/math/families/inverse_trig/portable_f32.rs
+++ b/src/math/families/inverse_trig/portable_f32.rs
@@ -1,6 +1,14 @@
 use crate::math::scalar;
 use crate::{Simd, SimdBaseIo, SimdBaseOps, SimdConsts, SimdFloat32, SimdInt32};

+// DECISION(2026-03-23): KEEP_SIMD_PORTABLE
+// Function(s): f32 asin_u35 / acos_u35 / atan_u35
+// Why kept:
+// - these remain some of the strongest portable SIMD wins in local benchmarks
+// - targeted near-edge and symmetry tests match the current reduction thresholds
+// Revisit when:
+// - the approximation family or fallback boundaries move materially
+
 type SimdI32<V> = <<V as SimdFloat32>::Engine as Simd>::Vi32;

 const F32_EXPONENT_MASK: i32 = 0x7F80_0000u32 as i32;
diff --git a/src/math/families/inverse_trig/portable_f64.rs b/src/math/families/inverse_trig/portable_f64.rs
index 817b39b..5d362b2 100644
--- a/src/math/families/inverse_trig/portable_f64.rs
+++ b/src/math/families/inverse_trig/portable_f64.rs
@@ -1,6 +1,14 @@
 use crate::math::scalar;
 use crate::{Simd, SimdBaseIo, SimdBaseOps, SimdConsts, SimdFloat64, SimdInt64};

+// DECISION(2026-03-23): KEEP_SIMD_PORTABLE
+// Function(s): f64 asin_u35 / acos_u35 / atan_u35
+// Why kept:
+// - local runtime-selected throughput stays well above native scalar for all three functions
+// - the current targeted tests cover near-one fallback boundaries and atan reduction thresholds
+// Revisit when:
+// - the fallback boundaries or rational approximations change materially
+
 type SimdI64<V> = <<V as SimdFloat64>::Engine as Simd>::Vi64;

 const F64_EXPONENT_MASK: i64 = 0x7FF0_0000_0000_0000u64 as i64;
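A scalar sketch of the exponent-field bit trick that constants like `F64_EXPONENT_MASK` above enable, as commonly used by log-style reductions. For a finite, positive, normal f64 the biased exponent occupies bits 52..63, and subtracting the bias 1023 yields floor(log2(x)). The helper name is illustrative, not an API from this crate.

const F64_EXPONENT_MASK: i64 = 0x7FF0_0000_0000_0000u64 as i64;

fn floor_log2_normal(x: f64) -> i64 {
    let bits = x.to_bits() as i64;
    // Isolate the exponent field, shift it down, and remove the bias.
    ((bits & F64_EXPONENT_MASK) >> 52) - 1023
}

fn main() {
    assert_eq!(floor_log2_normal(1.0), 0);
    assert_eq!(floor_log2_normal(7.9), 2); // 7.9 lies in [4, 8)
    assert_eq!(floor_log2_normal(0.3), -2); // 0.3 lies in [0.25, 0.5)
}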
diff --git a/src/math/mod.rs b/src/math/mod.rs
index 21b643f..d9169af 100644
--- a/src/math/mod.rs
+++ b/src/math/mod.rs
@@ -11,16 +11,24 @@
 //! centralized fallback for non-finite, very-large, and tan-pole-adjacent lanes.
 //! `sinh_u35` / `cosh_u35` / `tanh_u35` now use family-local portable SIMD
 //! kernels with centralized scalar patching for exceptional lanes.
-//! Remaining historical SLEEF surface in this baseline pass is otherwise still
-//! lane-wise scalar mapped for correctness-first portability.
+//! The stabilized `f64` map is intentionally mixed:
+//! scalar-reference for the core/trig and hyperbolic families that currently lose to native scalar,
+//! portable SIMD for inverse trig and several binary-misc kernels,
+//! and hybrid keep decisions where SIMD structure still relies on scalar sub-ops.
 //!
 //! Structure notes:
 //! - `families/` owns public extension traits grouped by math family.
 //! - `scalar/` owns scalar fallback helpers using the same family boundaries.
-//! - `f64/` mirrors the family split; core u35 kernels now have portable SIMD
-//!   fast paths with scalar-lane patching for exceptional inputs.
+//! - `f64/` mirrors the family split so future rescue-or-revert work can stay localized.
 //! - `contracts.rs` and `map.rs` stay stable so follow-up optimization PRs can
 //!   target a single family file with minimal overlap.
+//!
+//! Decision vocabulary used by the math audit ledger:
+//! - `KEEP_SIMD_PORTABLE`: portable SIMD stays enabled by default.
+//! - `KEEP_SIMD_OVERRIDE`: portable SIMD stays enabled and a backend override stays justified.
+//! - `KEEP_SCALAR_REFERENCE`: the honest default remains lane-wise scalar reference.
+//! - `KEEP_MIXED`: keep a hybrid path that combines vector structure with scalar sub-ops or patching.
+//! - `RESEARCH_NEEDED`: current evidence is not strong enough for a cleaner keep/revert call.

 pub mod contracts;
 mod f32;