diff --git a/src/math/f64/binary_misc.rs b/src/math/f64/binary_misc.rs index a5d1b45..b6beee5 100644 --- a/src/math/f64/binary_misc.rs +++ b/src/math/f64/binary_misc.rs @@ -1,13 +1,13 @@ use crate::math::{f64, scalar}; use crate::{Simd, SimdBaseIo, SimdBaseOps, SimdConsts, SimdFloat64, SimdInt64}; -// DECISION(2026-03-23): KEEP_MIXED +// DECISION(2026-03-23): KEEP_SIMD_PORTABLE // Function(s): f64 log10_u35 // Why kept: // - local runtime-selected performance is clearly better than native scalar -// - the current implementation still rides scalar-reference log2_u35 underneath +// - the current implementation now rides the kept portable f64 log2_u35 core // Revisit when: -// - f64 log2_u35 gets a new keep/revert outcome +// - f64 core log2_u35 changes materially or non-x86 evidence disagrees sharply // DECISION(2026-03-23): KEEP_SIMD_PORTABLE // Function(s): f64 atan2_u35 / hypot_u35 / fmod diff --git a/src/math/f64/core.rs b/src/math/f64/core.rs index 819d2be..28f743f 100644 --- a/src/math/f64/core.rs +++ b/src/math/f64/core.rs @@ -1,13 +1,22 @@ use crate::math::{map, scalar}; -use crate::{Simd, SimdFloat64}; +use crate::{Simd, SimdBaseIo, SimdBaseOps, SimdConsts, SimdFloat64, SimdInt, SimdInt64}; -// DECISION(2026-03-23): KEEP_SCALAR_REFERENCE +type SimdI64 = <::Engine as Simd>::Vi64; + +const F64_EXPONENT_MASK: i64 = 0x7FF0_0000_0000_0000u64 as i64; +const F64_MANTISSA_MASK: i64 = 0x000F_FFFF_FFFF_FFFFu64 as i64; +const F64_LOG_NORM_MANTISSA: i64 = 0x3FE0_0000_0000_0000u64 as i64; +const F64_EXPONENT_BIAS_ADJUST: i64 = 1022; +const F64_EXP_LN2_HI: f64 = 6.931_471_803_691_238e-1; +const F64_EXP_LN2_LO: f64 = 1.908_214_929_270_587_7e-10; + +// DECISION(2026-03-23): KEEP_SIMD_PORTABLE // Function(s): f64 log2_u35 / exp2_u35 / ln_u35 / exp_u35 -// Why scalar: -// - local benches keep putting runtime-selected behavior at or below native scalar -// - family structure stays useful, but the current default is still scalar-reference +// Why kept: +// - the revived 
portable log/exp kernels now beat native scalar in local f64 core benches +// - exceptional lanes still patch back to scalar references without giving up the fast AVX2 path // Revisit when: -// - a genuinely worthwhile f64 log/exp SIMD kernel exists +// - non-x86 evidence disagrees sharply or a cheaper approximation family appears // DECISION(2026-03-23): KEEP_SCALAR_REFERENCE // Function(s): f64 sin_u35 / cos_u35 / tan_u35 @@ -17,13 +26,116 @@ use crate::{Simd, SimdFloat64}; // Revisit when: // - a stronger range-reduction strategy or cheaper trig kernel appears +#[inline(always)] +fn any_lane_nonzero(mask: SimdI64) -> bool +where + V: SimdFloat64, + V::Engine: Simd, +{ + unsafe { + let lanes = mask.as_array(); + for lane in 0..V::WIDTH { + if lanes[lane] != 0 { + return true; + } + } + } + + false +} + +#[inline(always)] +fn patch_exceptional_lanes( + input: V, + output: V, + exceptional_mask: SimdI64, + scalar_fallback: fn(f64) -> f64, +) -> V +where + V: SimdFloat64, + V::Engine: Simd, +{ + if !any_lane_nonzero::(exceptional_mask) { + return output; + } + + unsafe { + let input_lanes = input.as_array(); + let mask_lanes = exceptional_mask.as_array(); + let mut output_lanes = output.as_array(); + + for lane in 0..V::WIDTH { + if mask_lanes[lane] != 0 { + output_lanes[lane] = scalar_fallback(input_lanes[lane]); + } + } + + V::load_from_ptr_unaligned(&output_lanes as *const V::ArrayRepresentation as *const f64) + } +} + +#[inline(always)] +fn log2_exceptional_mask(input: V) -> SimdI64 +where + V: SimdFloat64, + V::Engine: Simd, +{ + let bits = input.bitcast_i64(); + let exponent_bits = bits & F64_EXPONENT_MASK; + + let non_positive = input + .cmp_gt(V::zeroes()) + .bitcast_i64() + .cmp_eq(SimdI64::::zeroes()); + let subnormal_or_zero = exponent_bits.cmp_eq(SimdI64::::zeroes()); + let inf_or_nan = exponent_bits.cmp_eq(SimdI64::::set1(F64_EXPONENT_MASK)); + + non_positive | subnormal_or_zero | inf_or_nan +} + #[inline(always)] pub(crate) fn log2_u35(input: V) -> V 
where V: SimdFloat64, V::Engine: Simd, { - map::unary_f64(input, scalar::log2_u35_f64) + let bits = input.bitcast_i64(); + let exponent_bits = bits & F64_EXPONENT_MASK; + let mantissa_bits = bits & F64_MANTISSA_MASK; + + let exceptional_mask = log2_exceptional_mask(input); + + let exponent = + (exponent_bits.shr(52) - SimdI64::::set1(F64_EXPONENT_BIAS_ADJUST)).cast_f64(); + let normalized_mantissa = (mantissa_bits | F64_LOG_NORM_MANTISSA).bitcast_f64(); + + let one = V::set1(1.0); + let sqrt_half = V::set1(core::f64::consts::FRAC_1_SQRT_2); + + let adjust_mask = normalized_mantissa.cmp_lt(sqrt_half); + let exponent = exponent - adjust_mask.blendv(V::zeroes(), one); + let mantissa = adjust_mask.blendv( + normalized_mantissa, + normalized_mantissa + normalized_mantissa, + ); + + let t = (mantissa - one) / (mantissa + one); + let t2 = t * t; + + let mut poly = V::set1(1.0 / 19.0); + poly = (poly * t2) + V::set1(1.0 / 17.0); + poly = (poly * t2) + V::set1(1.0 / 15.0); + poly = (poly * t2) + V::set1(1.0 / 13.0); + poly = (poly * t2) + V::set1(1.0 / 11.0); + poly = (poly * t2) + V::set1(1.0 / 9.0); + poly = (poly * t2) + V::set1(1.0 / 7.0); + poly = (poly * t2) + V::set1(1.0 / 5.0); + poly = (poly * t2) + V::set1(1.0 / 3.0); + + let log2_mantissa = V::set1(2.0 * core::f64::consts::LOG2_E) * t * ((poly * t2) + one); + let fast = exponent + log2_mantissa; + + patch_exceptional_lanes(input, fast, exceptional_mask, scalar::log2_u35_f64) } #[inline(always)] @@ -32,7 +144,34 @@ where V: SimdFloat64, V::Engine: Simd, { - map::unary_f64(input, scalar::exp2_u35_f64) + let finite_mask = input.cmp_eq(input).bitcast_i64(); + let in_lower_bound = input.cmp_gte(V::set1(-1022.0)).bitcast_i64(); + let in_upper_bound = input.cmp_lte(V::set1(1023.0)).bitcast_i64(); + let fast_mask = finite_mask & in_lower_bound & in_upper_bound; + let exceptional_mask = fast_mask.cmp_eq(SimdI64::::zeroes()); + + let integral = input.round().cast_i64(); + let fractional = input - integral.cast_f64(); + let 
reduced = fractional * V::set1(core::f64::consts::LN_2); + + let mut poly = V::set1(1.0 / 479_001_600.0); + poly = (poly * reduced) + V::set1(1.0 / 39_916_800.0); + poly = (poly * reduced) + V::set1(1.0 / 3_628_800.0); + poly = (poly * reduced) + V::set1(1.0 / 362_880.0); + poly = (poly * reduced) + V::set1(1.0 / 40_320.0); + poly = (poly * reduced) + V::set1(1.0 / 5_040.0); + poly = (poly * reduced) + V::set1(1.0 / 720.0); + poly = (poly * reduced) + V::set1(1.0 / 120.0); + poly = (poly * reduced) + V::set1(1.0 / 24.0); + poly = (poly * reduced) + V::set1(1.0 / 6.0); + poly = (poly * reduced) + V::set1(0.5); + + let exp_reduced = (poly * reduced * reduced) + reduced + V::set1(1.0); + let scale_bits = (integral + SimdI64::::set1(1023)).shl(52); + let scale = scale_bits.bitcast_f64(); + let fast = exp_reduced * scale; + + patch_exceptional_lanes(input, fast, exceptional_mask, scalar::exp2_u35_f64) } #[inline(always)] @@ -41,15 +180,81 @@ where V: SimdFloat64, V::Engine: Simd, { - map::unary_f64(input, scalar::ln_u35_f64) + let bits = input.bitcast_i64(); + let exponent_bits = bits & F64_EXPONENT_MASK; + let mantissa_bits = bits & F64_MANTISSA_MASK; + + let exceptional_mask = log2_exceptional_mask(input); + + let exponent = + (exponent_bits.shr(52) - SimdI64::::set1(F64_EXPONENT_BIAS_ADJUST)).cast_f64(); + let normalized_mantissa = (mantissa_bits | F64_LOG_NORM_MANTISSA).bitcast_f64(); + + let one = V::set1(1.0); + let sqrt_half = V::set1(core::f64::consts::FRAC_1_SQRT_2); + + let adjust_mask = normalized_mantissa.cmp_lt(sqrt_half); + let exponent = exponent - adjust_mask.blendv(V::zeroes(), one); + let mantissa = adjust_mask.blendv( + normalized_mantissa, + normalized_mantissa + normalized_mantissa, + ); + + let t = (mantissa - one) / (mantissa + one); + let t2 = t * t; + + let mut poly = V::set1(1.0 / 19.0); + poly = (poly * t2) + V::set1(1.0 / 17.0); + poly = (poly * t2) + V::set1(1.0 / 15.0); + poly = (poly * t2) + V::set1(1.0 / 13.0); + poly = (poly * t2) + 
V::set1(1.0 / 11.0); + poly = (poly * t2) + V::set1(1.0 / 9.0); + poly = (poly * t2) + V::set1(1.0 / 7.0); + poly = (poly * t2) + V::set1(1.0 / 5.0); + poly = (poly * t2) + V::set1(1.0 / 3.0); + + let ln_mantissa = V::set1(2.0) * t * ((poly * t2) + one); + let fast = exponent * V::set1(core::f64::consts::LN_2) + ln_mantissa; + + patch_exceptional_lanes(input, fast, exceptional_mask, scalar::ln_u35_f64) } #[inline(always)] pub(crate) fn exp_u35(input: V) -> V where V: SimdFloat64, + V::Engine: Simd, { - map::unary_f64(input, scalar::exp_u35_f64) + let finite_mask = input.cmp_eq(input).bitcast_i64(); + let in_lower_bound = input.cmp_gte(V::set1(-708.0)).bitcast_i64(); + let in_upper_bound = input.cmp_lte(V::set1(709.0)).bitcast_i64(); + let fast_mask = finite_mask & in_lower_bound & in_upper_bound; + let exceptional_mask = fast_mask.cmp_eq(SimdI64::::zeroes()); + + let scaled = input * V::set1(core::f64::consts::LOG2_E); + let integral = scaled.round().cast_i64(); + let integral_f = integral.cast_f64(); + let reduced = + (input - integral_f * V::set1(F64_EXP_LN2_HI)) - integral_f * V::set1(F64_EXP_LN2_LO); + + let mut poly = V::set1(1.0 / 479_001_600.0); + poly = (poly * reduced) + V::set1(1.0 / 39_916_800.0); + poly = (poly * reduced) + V::set1(1.0 / 3_628_800.0); + poly = (poly * reduced) + V::set1(1.0 / 362_880.0); + poly = (poly * reduced) + V::set1(1.0 / 40_320.0); + poly = (poly * reduced) + V::set1(1.0 / 5_040.0); + poly = (poly * reduced) + V::set1(1.0 / 720.0); + poly = (poly * reduced) + V::set1(1.0 / 120.0); + poly = (poly * reduced) + V::set1(1.0 / 24.0); + poly = (poly * reduced) + V::set1(1.0 / 6.0); + poly = (poly * reduced) + V::set1(0.5); + + let exp_reduced = (poly * reduced * reduced) + reduced + V::set1(1.0); + let scale_bits = (integral + SimdI64::::set1(1023)).shl(52); + let scale = scale_bits.bitcast_f64(); + let fast = exp_reduced * scale; + + patch_exceptional_lanes(input, fast, exceptional_mask, scalar::exp_u35_f64) } #[inline(always)] diff 
--git a/src/math/f64/inverse_hyperbolic.rs b/src/math/f64/inverse_hyperbolic.rs index d399edf..bec3a35 100644 --- a/src/math/f64/inverse_hyperbolic.rs +++ b/src/math/f64/inverse_hyperbolic.rs @@ -5,9 +5,9 @@ use crate::{Simd, SimdBaseIo, SimdBaseOps, SimdConsts, SimdFloat64}; // Function(s): f64 asinh_u35 // Why kept: // - local benches show the current hybrid path materially ahead of native scalar -// - the fast path still depends on scalar-reference ln_u35, so this is not a full SIMD keep +// - the fast path still uses an explicit scalar-lane ln step to preserve the stricter 1-ULP contract // Revisit when: -// - f64 ln_u35 stops being scalar-reference or asinh gets its own cheaper core +// - asinh gets its own cheaper core or can safely absorb the relaxed portable ln_u35 error budget // DECISION(2026-03-23): KEEP_SCALAR_REFERENCE // Function(s): f64 acosh_u35 / atanh_u35 @@ -82,7 +82,7 @@ where finite_mask.cmp_eq(SimdI64::::zeroes()) | tiny_mask | large_mask | zero_mask; let radicand = (abs_x * abs_x) + V::set1(1.0); - let magnitude = f64::ln_u35(abs_x + radicand.sqrt()); + let magnitude = map::unary_f64(abs_x + radicand.sqrt(), scalar::ln_u35_f64); let negative_mask = input.cmp_lt(V::zeroes()); let fast = negative_mask.blendv(magnitude, -magnitude); diff --git a/src/math/f64/mod.rs b/src/math/f64/mod.rs index 8b9302d..7378791 100644 --- a/src/math/f64/mod.rs +++ b/src/math/f64/mod.rs @@ -1,9 +1,9 @@ //! f64 SIMD math dispatch layering: //! - family-local modules own the public internal routing points for each math family. //! - current decisions are intentionally mixed: -//! scalar-reference for core/trig and hyperbolic defaults, -//! portable SIMD for inverse trig and several binary-misc kernels, -//! and hybrid paths where scalar sub-ops still underpin the fast path. +//! portable SIMD for the revived core log/exp family, inverse trig, and binary misc, +//! scalar-reference for trig and the losing hyperbolic defaults, +//! 
and hybrid paths where a stricter scalar sub-op still underpins the fast path. //! - follow-up optimization work can still replace one family module at a time. mod binary_misc; diff --git a/src/math/mod.rs b/src/math/mod.rs index d9169af..df421a5 100644 --- a/src/math/mod.rs +++ b/src/math/mod.rs @@ -12,8 +12,8 @@ //! `sinh_u35` / `cosh_u35` / `tanh_u35` now use family-local portable SIMD //! kernels with centralized scalar patching for exceptional lanes. //! The stabilized `f64` map is intentionally mixed: -//! scalar-reference for the current losing core/trig and hyperbolic families, -//! portable SIMD for inverse trig and several binary-misc kernels, +//! portable SIMD for the revived core log/exp family, inverse trig, and several binary-misc kernels, +//! scalar-reference for the current losing trig and hyperbolic families, //! and hybrid keep decisions where SIMD structure still relies on scalar sub-ops. //! //! Structure notes: diff --git a/src/tests/simd_math_targeted_edges/core.rs b/src/tests/simd_math_targeted_edges/core.rs index 1e7a22c..a1cb1e0 100644 --- a/src/tests/simd_math_targeted_edges/core.rs +++ b/src/tests/simd_math_targeted_edges/core.rs @@ -258,7 +258,7 @@ simd_math_targeted_all_backends!( ); fn run_f64_log_exp_boundary_lanes() { - let inputs_log = vec![ + let mut inputs_log = vec![ f64::from_bits(1), f64::MIN_POSITIVE, 0.5, @@ -273,6 +273,13 @@ fn run_f64_log_exp_boundary_lanes() { -0.0, ]; + for &scale in &[0.5f64, 1.0, 2.0, 8.0, 1024.0] { + let pivot = std::f64::consts::FRAC_1_SQRT_2 * scale; + inputs_log.push(f64::from_bits(pivot.to_bits() - 1)); + inputs_log.push(pivot); + inputs_log.push(f64::from_bits(pivot.to_bits() + 1)); + } + check_targeted_unary_f64::( "log2_u35", &inputs_log, @@ -288,7 +295,7 @@ fn run_f64_log_exp_boundary_lanes() { f64::ln, ); - let inputs_exp = vec![ + let mut inputs_exp = vec![ -1022.0, -1021.75, -10.0, @@ -305,6 +312,21 @@ fn run_f64_log_exp_boundary_lanes() { f64::NAN, ]; + for k in -4..=4 { + let center = k 
as f64; + inputs_exp.push(center - 1.0 / 4096.0); + inputs_exp.push(center); + inputs_exp.push(center + 1.0 / 4096.0); + } + + for &center in &[ + -1022.0f64, -1021.5, -1.0, -0.5, 0.0, 0.5, 1.0, 1022.0, 1023.0, + ] { + inputs_exp.push(f64::from_bits(center.to_bits().saturating_sub(1))); + inputs_exp.push(center); + inputs_exp.push(f64::from_bits(center.to_bits().saturating_add(1))); + } + check_targeted_unary_f64::( "exp2_u35", &inputs_exp,