Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
141 changes: 132 additions & 9 deletions src/math/f64/hyperbolic.rs
Original file line number Diff line number Diff line change
@@ -1,26 +1,135 @@
use crate::math::{f64, map, scalar};
use crate::{Simd, SimdBaseIo, SimdBaseOps, SimdConsts, SimdFloat64, SimdInt64};

// DECISION(2026-03-23): superseded KEEP_SCALAR_REFERENCE entry — see the KEEP_MIXED record below.
type SimdI64<V> = <<V as SimdConsts>::Engine as Simd>::Vi64;

const SIGN_MASK: i64 = i64::MIN;
const SINH_COSH_SCALAR_PATCH_ABS: f64 = 1.0;
const SINH_COSH_FAST_ABS_MAX: f64 = 20.0;
const TANH_SCALAR_PATCH_ABS: f64 = 1.0;
const TANH_FAST_ABS_MAX: f64 = 20.0;

// DECISION(2026-03-23): KEEP_MIXED
// Function(s): f64 sinh_u35 / cosh_u35 / tanh_u35
// Why kept:
// - local runtime-selected benches show clear wins for sinh_u35 and tanh_u35 after restoring
//   scalar-lane patching for the strict 1-ULP near-zero region
// - cosh_u35 still loses to native scalar on this host, so it stays scalar-reference
// Revisit when:
// - a cheaper f64 exp/log backbone or a dedicated hyperbolic kernel lands
// - cosh_u35 gets a better kernel or non-x86 evidence shifts the keep/revert balance

#[inline(always)]
fn any_lane_nonzero<V>(mask: SimdI64<V>) -> bool
where
V: SimdFloat64,
V::Engine: Simd<Vf64 = V>,
{
unsafe {
let lanes = mask.as_array();
for lane in 0..V::WIDTH {
if lanes[lane] != 0 {
return true;
}
}
}

false
}

#[inline(always)]
fn patch_exceptional_lanes<V>(
input: V,
output: V,
exceptional_mask: SimdI64<V>,
scalar_fallback: fn(f64) -> f64,
) -> V
where
V: SimdFloat64,
V::Engine: Simd<Vf64 = V>,
{
if !any_lane_nonzero::<V>(exceptional_mask) {
return output;
}

unsafe {
let input_lanes = input.as_array();
let mask_lanes = exceptional_mask.as_array();
let mut output_lanes = output.as_array();

for lane in 0..V::WIDTH {
if mask_lanes[lane] != 0 {
output_lanes[lane] = scalar_fallback(input_lanes[lane]);
}
}

V::load_from_ptr_unaligned(&output_lanes as *const V::ArrayRepresentation as *const f64)
}
}

#[inline(always)]
fn apply_input_sign<V>(magnitude: V, input: V) -> V
where
V: SimdFloat64,
V::Engine: Simd<Vf64 = V>,
{
let sign_bits = input.bitcast_i64() & SimdI64::<V>::set1(SIGN_MASK);
(magnitude.bitcast_i64() | sign_bits).bitcast_f64()
}

#[inline(always)]
fn sinh_cosh_medium<V>(abs_input: V) -> (V, V)
where
V: SimdFloat64,
V::Engine: Simd<Vf64 = V>,
{
let exp_abs = f64::exp_u35(abs_input);
let exp_neg_abs = V::set1(1.0) / exp_abs;
let half = V::set1(0.5);

(
(exp_abs - exp_neg_abs) * half,
(exp_abs + exp_neg_abs) * half,
)
}

#[inline(always)]
fn sinh_cosh_masks<V>(input: V) -> (SimdI64<V>, V, V)
where
V: SimdFloat64,
V::Engine: Simd<Vf64 = V>,
{
let abs_input = input.abs();
let finite_mask = input.cmp_eq(input).bitcast_i64();
let within_fast_range = abs_input
.cmp_lte(V::set1(SINH_COSH_FAST_ABS_MAX))
.bitcast_i64();

(finite_mask & within_fast_range, abs_input, input * input)
}

#[inline(always)]
pub(crate) fn sinh_u35<V>(input: V) -> V
where
V: SimdFloat64,
V::Engine: Simd<Vf64 = V>,
{
map::unary_f64(input, scalar::sinh_u35_f64)
let (fast_mask, abs_input, _) = sinh_cosh_masks(input);
let small_scalar_mask = abs_input
.cmp_lt(V::set1(SINH_COSH_SCALAR_PATCH_ABS))
.bitcast_i64();
let exceptional_mask = fast_mask.cmp_eq(SimdI64::<V>::zeroes()) | small_scalar_mask;

let (sinh_medium, _) = sinh_cosh_medium(abs_input);
let fast = apply_input_sign(sinh_medium, input);

patch_exceptional_lanes(input, fast, exceptional_mask, scalar::sinh_u35_f64)
}

#[inline(always)]
pub(crate) fn cosh_u35<V>(input: V) -> V
where
V: SimdFloat64,
V::Engine: Simd<Vf64 = V>,
{
map::unary_f64(input, scalar::cosh_u35_f64)
}
Expand All @@ -29,6 +138,20 @@ where
pub(crate) fn tanh_u35<V>(input: V) -> V
where
V: SimdFloat64,
V::Engine: Simd<Vf64 = V>,
{
map::unary_f64(input, scalar::tanh_u35_f64)
let abs_input = input.abs();
let finite_mask = input.cmp_eq(input).bitcast_i64();
let within_fast_range = abs_input.cmp_lte(V::set1(TANH_FAST_ABS_MAX)).bitcast_i64();
let small_scalar_mask = abs_input
.cmp_lt(V::set1(TANH_SCALAR_PATCH_ABS))
.bitcast_i64();
let exceptional_mask =
(finite_mask & within_fast_range).cmp_eq(SimdI64::<V>::zeroes()) | small_scalar_mask;

let exp_neg_2x = f64::exp_u35(abs_input * V::set1(-2.0));
let tanh_medium = (V::set1(1.0) - exp_neg_2x) / (V::set1(1.0) + exp_neg_2x);
let fast = apply_input_sign(tanh_medium, input);

patch_exceptional_lanes(input, fast, exceptional_mask, scalar::tanh_u35_f64)
}
22 changes: 16 additions & 6 deletions src/math/f64/inverse_hyperbolic.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,14 @@ use crate::{Simd, SimdBaseIo, SimdBaseOps, SimdConsts, SimdFloat64};
// Revisit when:
// - asinh gets its own cheaper core or can safely absorb the relaxed portable ln_u35 error budget

// DECISION(2026-03-23): KEEP_MIXED
// Function(s): f64 acosh_u35 / atanh_u35
// Why kept:
// - acosh_u35 now passes the strict contract and beats native scalar on local runtime-selected benches
// - atanh_u35's retry never held the strict 1-ULP contract without collapsing the fast band too far,
//   so it remains scalar-reference on this host
// Revisit when:
// - a stronger f64 inverse-hyperbolic kernel family exists
// - atanh_u35 gets a tighter portable kernel or cleaner cross-host evidence appears

type SimdI64<V> = <<V as SimdConsts>::Engine as Simd>::Vi64;

Expand Down Expand Up @@ -95,7 +96,16 @@ where
V: SimdFloat64,
V::Engine: Simd<Vf64 = V>,
{
map::unary_f64(input, scalar::acosh_u35_f64)
let finite_mask = input.cmp_eq(input).bitcast_i64();
let in_domain_mask = input.cmp_gte(V::set1(1.0)).bitcast_i64();
let away_from_one_mask = input.cmp_gte(V::set1(1.5)).bitcast_i64();
let fast_mask = finite_mask & in_domain_mask & away_from_one_mask;
let exceptional_mask = fast_mask.cmp_eq(SimdI64::<V>::zeroes());

let root_term = ((input - V::set1(1.0)).sqrt()) * ((input + V::set1(1.0)).sqrt());
let fast = f64::ln_u35(input + root_term);

patch_exceptional_lanes(input, fast, exceptional_mask, scalar::acosh_u35_f64)
}

#[inline(always)]
Expand Down
4 changes: 2 additions & 2 deletions src/math/f64/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@
//! - family-local modules own the public internal routing points for each math family.
//! - current decisions are intentionally mixed:
//! portable SIMD for the revived core log/exp family, inverse trig, and binary misc,
//! scalar-reference for trig and the losing `cosh_u35` / `atanh_u35` defaults,
//! and hybrid paths where a stricter scalar sub-op or scalar-lane patch still underpins the fast path.
//! - follow-up optimization work can still replace one family module at a time.

mod binary_misc;
Expand Down
15 changes: 12 additions & 3 deletions src/math/families/hyperbolic.rs
Original file line number Diff line number Diff line change
Expand Up @@ -31,17 +31,26 @@ impl<T: SimdFloat32> SimdMathF32Hyperbolic for T {}

pub trait SimdMathF64Hyperbolic: SimdFloat64 {
#[inline(always)]
fn sinh_u35(self) -> Self {
fn sinh_u35(self) -> Self
where
Self::Engine: Simd<Vf64 = Self>,
{
f64::sinh_u35(self)
}

#[inline(always)]
fn cosh_u35(self) -> Self {
fn cosh_u35(self) -> Self
where
Self::Engine: Simd<Vf64 = Self>,
{
f64::cosh_u35(self)
}

#[inline(always)]
fn tanh_u35(self) -> Self {
fn tanh_u35(self) -> Self
where
Self::Engine: Simd<Vf64 = Self>,
{
f64::tanh_u35(self)
}
}
Expand Down
3 changes: 2 additions & 1 deletion src/math/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,8 @@
//! kernels with centralized scalar patching for exceptional lanes.
//! The stabilized `f64` map is intentionally mixed:
//! portable SIMD for the revived core log/exp family, inverse trig, and several binary-misc kernels,
//! scalar-reference for the current losing trig family plus selected `f64` holdouts such as `cosh_u35`
//! and `atanh_u35`,
//! and hybrid keep decisions where SIMD structure still relies on scalar sub-ops.
//!
//! Structure notes:
Expand Down
Loading