diff --git a/.github/workflows/check.yml b/.github/workflows/check.yml index 2686f6f..28fe418 100644 --- a/.github/workflows/check.yml +++ b/.github/workflows/check.yml @@ -7,73 +7,43 @@ jobs: name: Code Checks (formatting, clippy) runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 - - uses: actions-rs/toolchain@v1 - name: Initialize Cargo x86 - with: - profile: minimal - toolchain: stable - override: true - components: rustfmt, clippy + - uses: actions/checkout@v4 - - uses: actions-rs/toolchain@v1 - name: Initialize Cargo aarch64 + - name: Initialize Cargo + uses: dtolnay/rust-toolchain@stable with: - profile: minimal - toolchain: stable - target: aarch64-unknown-linux-gnu - override: true components: rustfmt, clippy + targets: aarch64-unknown-linux-gnu - - uses: Swatinem/rust-cache@v2 - name: Cargo Cache + - name: Cargo Cache + uses: Swatinem/rust-cache@v2 - - uses: actions-rs/cargo@v1 - name: Check code formatting - with: - command: fmt - args: --all -- --check + - name: Check code formatting + run: cargo fmt --all -- --check - - uses: actions-rs/cargo@v1 - name: Clippy check (x86) - with: - command: clippy - args: --all --all-targets -- --deny "warnings" + - name: Clippy check (x86) + run: cargo clippy --all --all-targets -- --deny warnings - - uses: actions-rs/cargo@v1 - name: Clippy check (arm64) - with: - command: clippy - args: --all --all-targets --target aarch64-unknown-linux-gnu -- --deny "warnings" + - name: Clippy check (arm64) + run: cargo clippy --all --all-targets --target aarch64-unknown-linux-gnu -- --deny warnings x86_tests: name: x86 Tests runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 - - uses: actions-rs/toolchain@v1 - name: Initialize Cargo - with: - profile: minimal - toolchain: stable - override: true - components: rustfmt, clippy + - uses: actions/checkout@v4 - - uses: Swatinem/rust-cache@v2 - name: Cargo Cache + - name: Initialize Cargo + uses: dtolnay/rust-toolchain@stable - # Ensure that everything happens safely 
(with runtime checks) - - uses: actions-rs/cargo@v1 - name: Run tests (debug) - with: - command: test + - name: Cargo Cache + uses: Swatinem/rust-cache@v2 - # Ensure that no behavior gets optimized out - - uses: actions-rs/cargo@v1 - name: Run tests (release) - with: - command: test - args: --release + - name: Run tests (debug) + run: cargo test + + - name: Run tests (release) + run: cargo test --release # AVX-512 is not available on GitHub's hosted runners, so compile the library and # test harness with the relevant target features enabled to keep that path covered. @@ -89,33 +59,26 @@ jobs: env: QEMU_LD_PREFIX: /usr/aarch64-linux-gnu steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Install dependencies run: sudo apt-get update && sudo apt-get install -y gcc-aarch64-linux-gnu qemu-user - - uses: actions-rs/toolchain@v1 - name: Initialize Cargo + - name: Initialize Cargo + uses: dtolnay/rust-toolchain@stable with: - profile: minimal - toolchain: stable - target: aarch64-unknown-linux-gnu - override: true - components: rustfmt, clippy + targets: aarch64-unknown-linux-gnu - - uses: Swatinem/rust-cache@v2 - name: Cargo Cache + - name: Cargo Cache + uses: Swatinem/rust-cache@v2 - name: Set up QEMU - uses: docker/setup-qemu-action@v2 + uses: docker/setup-qemu-action@v3 with: platforms: aarch64 - - uses: actions-rs/cargo@v1 - name: Run tests (release) - with: - command: test - args: --release --target aarch64-unknown-linux-gnu + - name: Run tests (release) + run: cargo test --release --target aarch64-unknown-linux-gnu wasm_tests: name: WebAssembly Tests @@ -123,25 +86,18 @@ jobs: env: WASMTIME_BACKTRACE_DETAILS: 1 steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Setup `wasmtime` uses: bytecodealliance/actions/wasmtime/setup@v1 - - uses: actions-rs/toolchain@v1 - name: Initialize Cargo + - name: Initialize Cargo + uses: dtolnay/rust-toolchain@stable with: - profile: minimal - toolchain: stable - target: wasm32-wasip1 - 
override: true - components: rustfmt, clippy + targets: wasm32-wasip1 - - uses: Swatinem/rust-cache@v2 - name: Cargo Cache + - name: Cargo Cache + uses: Swatinem/rust-cache@v2 - - uses: actions-rs/cargo@v1 - name: Run tests (release) - with: - command: test - args: --release --target wasm32-wasip1 + - name: Run tests (release) + run: cargo test --release --target wasm32-wasip1 diff --git a/Cargo.toml b/Cargo.toml index 7252a08..d0f7f00 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -7,7 +7,7 @@ license = "Apache-2.0/MIT" documentation = "https://docs.rs/simdeez/" readme = "README.md" keywords = ["SIMD", "avx2", "sse", "performance", "no_std"] -repository = "https://github.com/jackmott/simdeez" +repository = "https://github.com/arduano/simdeez" categories = ["hardware-support", "science", "game-engines"] edition = "2021" diff --git a/README.md b/README.md index 14f1f47..351201e 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ A library that abstracts over SIMD instruction sets, including ones with differi SIMDeez is designed to allow you to write a function one time and produce SSE2, SSE41, AVX2, AVX-512, Neon and WebAssembly SIMD versions of the function. You can either have the version you want chosen at compile time or automatically at runtime. -Originally developed by @jackmott, however I volunteered to take over ownership. +Originally developed by @jackmott. Active maintenance and releases now happen from this repository. If there are intrinsics you need that are not currently implemented, create an issue and I'll add them. PRs to add more intrinsics are welcome. Currently things are well fleshed out for i32, i64, f32, and f64 types. 
@@ -26,17 +26,19 @@ Refer to the excellent [Intel Intrinsics Guide](https://software.intel.com/sites # SIMD math revival status -SIMDeez now includes a native, pure-Rust math surface for the restored historical SLEEF-backed families: +SIMDeez includes a native, pure-Rust math surface for the restored historical SLEEF-style families exposed through `simdeez::math` and re-exported in `simdeez::prelude`. -- `log2_u35`, `exp2_u35`, `ln_u35`, `exp_u35` -- `sin_u35`, `cos_u35`, `tan_u35` -- `asin_u35`, `acos_u35`, `atan_u35`, `atan2_u35` -- `sinh_u35`, `cosh_u35`, `tanh_u35` -- `asinh_u35`, `acosh_u35`, `atanh_u35` -- `log10_u35`, `hypot_u35` -- `fmod` (named without `u35` to reflect remainder semantics rather than an explicit ULP contract tier) +Covered families include: +- core log/exp: `log2_u35`, `exp2_u35`, `ln_u35`, `exp_u35` +- trig and inverse trig: `sin_u35`, `cos_u35`, `tan_u35`, `asin_u35`, `acos_u35`, `atan_u35`, `atan2_u35` +- hyperbolic and inverse hyperbolic: `sinh_u35`, `cosh_u35`, `tanh_u35`, `asinh_u35`, `acosh_u35`, `atanh_u35` +- binary misc: `log10_u35`, `hypot_u35`, `fmod` -These are exposed via extension traits in `simdeez::math` and re-exported in `simdeez::prelude`: +The old `sleef-sys` feature remains historical/deprecated and is not the primary implementation path. + +For implementation notes, the current split between SIMD-default and scalar-reference families, and benchmark guidance, see [SLEEF.md](SLEEF.md). + +Example: ```rust use simdeez::prelude::*; @@ -47,36 +49,6 @@ fn apply_math(x: S::Vf32) -> S::Vf32 { } ``` -The old `sleef-sys` feature remains historical/deprecated and is **not** the primary implementation path for this revived surface. - -### Kernel layering blueprint (v0.1) - -The restored `f32` path now demonstrates the intended extension architecture: - -1. **Portable SIMD kernels** (`src/math/f32/portable.rs`) implement reduction + polynomial logic with backend-agnostic simdeez primitives. -2. 
**Backend override dispatch** (`src/math/f32/mod.rs`) selects architecture-tuned kernels without changing the public `SimdMathF32` API. -3. **Hand-optimized backend implementation** (`src/math/f32/x86_avx2.rs`) provides a real AVX2/FMA override for `log2_u35`. -4. **Scalar fallback patching** remains centralized in the portable layer for exceptional lanes, preserving special-value semantics. - -To add the next SLEEF-style function, follow the same pattern: start portable, wire dispatch, then add optional backend overrides only where profiling justifies complexity. - -### Benchmarking restored math - -An in-repo Criterion benchmark target is available for this revived surface: - -```bash -cargo bench --bench simd_math -cargo bench --bench simd_math_remaining_baseline -``` - -This benchmark reports per-function throughput for: - -- native scalar loop baseline (`f32::{log2, exp2, ln, exp, sin, cos, tan}`) -- simdeez runtime-selected path -- forced backend variants (`scalar`, `sse2`, `sse41`, `avx2`, and `avx512` when available on host) - -Current expectation: `log2_u35` and `exp2_u35` should show clear speedups on SIMD-capable backends (notably AVX2 on x86 hosts), `sin_u35`/`cos_u35`/`tan_u35` should now also show meaningful SIMD wins on realistic finite ranges, while `ln_u35`/`exp_u35` remain scalar-reference quality-first baselines. Use these benches to validate both performance and dispatch behavior as new kernels/overrides are added. - # Compared to packed_simd * SIMDeez can abstract over differing simd widths. packed_simd does not @@ -87,7 +59,7 @@ Current expectation: `log2_u35` and `exp2_u35` should show clear speedups on SIM * SIMDeez can be used with runtime selection, Faster cannot. * SIMDeez has faster fallbacks for some functions * SIMDeez does not currently work with iterators, Faster does. 
-* SIMDeez uses more idiomatic intrinsic syntax while Faster uses more idomatic Rust syntax +* SIMDeez uses more idiomatic intrinsic syntax while Faster uses more idiomatic Rust syntax * SIMDeez builds on stable rust now, Faster does not. All of the above could change! Faster seems to generally have the same diff --git a/SLEEF.md b/SLEEF.md new file mode 100644 index 0000000..b86c824 --- /dev/null +++ b/SLEEF.md @@ -0,0 +1,48 @@ +# SLEEF Math Status + +SIMDeez includes a native, pure-Rust SIMD math surface modeled after the historical SLEEF family layout. + +The public surface currently covers: +- core log/exp: `log2_u35`, `exp2_u35`, `ln_u35`, `exp_u35` +- trig and inverse trig: `sin_u35`, `cos_u35`, `tan_u35`, `asin_u35`, `acos_u35`, `atan_u35`, `atan2_u35` +- hyperbolic and inverse hyperbolic: `sinh_u35`, `cosh_u35`, `tanh_u35`, `asinh_u35`, `acosh_u35`, `atanh_u35` +- binary misc: `log10_u35`, `hypot_u35`, `fmod` + +These APIs are exposed through `simdeez::math` and re-exported by `simdeez::prelude`. + +## Current Shape + +- Most `f32` families use portable SIMD by default. +- `f32 log2_u35` keeps an AVX2 override where local benchmarks justify it. +- Revived `f64` log/exp, inverse trig, and binary-misc families keep SIMD defaults. +- Some `f64` families intentionally remain mixed or scalar-reference where the local rescue passes did not justify a SIMD default. +- The old `sleef-sys` feature remains deprecated and is not the primary implementation path. + +## Implementation Pattern + +The maintained pattern is: +1. start with portable SIMD kernels +2. add dispatch glue without changing the public API +3. add backend-specific overrides only where profiling justifies them +4. keep scalar-lane patching centralized for exceptional semantics + +The restored `f32` path in `src/math/f32/` is the clearest reference implementation of that layering. 
+ +## Benchmarks + +Criterion targets for the restored math surface: + +```bash +cargo bench --bench simd_math +cargo bench --bench simd_math_remaining_baseline +``` + +These benches report: +- native scalar loop baselines +- the simdeez runtime-selected path +- forced backend variants such as `scalar`, `sse2`, `sse41`, `avx2`, and `avx512` when available + +Current expectation: +- `log2_u35` and `exp2_u35` should show clear speedups on SIMD-capable backends +- revived `f64` log/exp and much of the inverse-trig and hyperbolic surface should remain worthwhile on the runtime-selected path +- documented scalar-reference holdouts should stay correctness-first until a later rescue pass produces stronger evidence diff --git a/src/lib.rs b/src/lib.rs index 94ff2f8..685c5bc 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,5 +1,5 @@ //! A library that abstracts over SIMD instruction sets, including ones with differing widths. -//! SIMDeez is designed to allow you to write a function one time and produce scalar, SSE2, SSE41, AVX2, AVX-512 and Neon versions of the function. +//! SIMDeez is designed to allow you to write a function one time and produce scalar, SSE2, SSE41, AVX2, AVX-512, Neon, and WebAssembly SIMD versions of the function. //! You can either have the version you want selected automatically at runtime, at compiletime, or //! select yourself by hand. //! @@ -13,7 +13,7 @@ //! //! # Features //! -//! * SSE2, SSE41, AVX2, AVX-512, Neon and scalar fallback +//! * SSE2, SSE41, AVX2, AVX-512, Neon, WebAssembly SIMD, and scalar fallback //! * Can be used with compile time or run time selection //! * No runtime overhead //! * Uses familiar intel intrinsic naming conventions, easy to port. @@ -34,8 +34,9 @@ //! These methods are available through `simdeez::math` and re-exported by `simdeez::prelude`. //! The implementation follows a layered blueprint: portable kernels first, //! backend-specific overrides where justified (currently a hand-tuned AVX2 `log2_u35`), -//! 
and scalar fallback patching for exceptional lanes. Several newly-restored families -//! are intentionally correctness-first scalar-reference mappings in this baseline pass. +//! and scalar fallback patching for exceptional lanes. The stabilized map is intentionally mixed: +//! most `f32` families and the revived `f64` log/exp, inverse-trig, and binary-misc families +//! keep SIMD defaults, while the families where a SIMD default was not justified remain explicit scalar-reference mappings. //! The historical `sleef` feature remains deprecated and is not the primary implementation path. //! //! # Compared to stdsimd @@ -48,7 +49,7 @@ //! * SIMDeez can be used with runtime selection, Faster cannot. //! * SIMDeez has faster fallbacks for some functions //! * SIMDeez does not currently work with iterators, Faster does. -//! * SIMDeez uses more idiomatic intrinsic syntax while Faster uses more idomatic Rust syntax +//! * SIMDeez uses more idiomatic intrinsic syntax while Faster uses more idiomatic Rust syntax //! * SIMDeez can be used by `#[no_std]` projects //! * SIMDeez builds on stable rust now, Faster does not. //! @@ -139,7 +140,7 @@ //!} //! ``` //! -//! This will generate 5 functions for you: +//! This will generate the following functions for you: //! * `distance` the generic version of your function //! * `distance_scalar` a scalar fallback //! * `distance_sse2` SSE2 version @@ -157,7 +158,7 @@ //! produce 2 active functions via the `cfg` attribute feature: //! //! * `distance` the generic version of your function -//! * `distance_compiletime` the fastest instruction set availble for the given compile time +//! * `distance_compiletime` the fastest instruction set available for the given compile time //! feature set //! //! You may also forgo the macros if you know what you are doing, just keep in mind there are lots