From be54e4c7e1e39cd22f33611d35538e24ea708192 Mon Sep 17 00:00:00 2001
From: uqio <276879906+uqio@users.noreply.github.com>
Date: Mon, 27 Apr 2026 22:23:33 +1200
Subject: [PATCH] row: wire per-arch SIMD routes for the YUVA 4:4:4
 alpha-source u8 RGBA path (Ship 8b-1b)

---
 src/row/arch/neon.rs               | 109 ++++++++++++++++++--
 src/row/arch/neon/tests.rs         | 130 +++++++++++++++++++++++
 src/row/arch/wasm_simd128.rs       | 110 ++++++++++++++++++--
 src/row/arch/wasm_simd128/tests.rs | 146 ++++++++++++++++++++++++++
 src/row/arch/x86_avx2.rs           | 107 +++++++++++++++++--
 src/row/arch/x86_avx2/tests.rs     | 140 +++++++++++++++++++++++++
 src/row/arch/x86_avx512.rs         | 110 ++++++++++++++++++--
 src/row/arch/x86_avx512/tests.rs   | 160 +++++++++++++++++++++++++++++
 src/row/arch/x86_sse41.rs          | 108 +++++++++++++++++--
 src/row/arch/x86_sse41/tests.rs    | 149 +++++++++++++++++++++++++++
 src/row/mod.rs                     |  88 +++++++++++-----
 src/sinker/mixed/yuva_4_4_4.rs     |   6 --
 12 files changed, 1280 insertions(+), 83 deletions(-)

diff --git a/src/row/arch/neon.rs b/src/row/arch/neon.rs
index f7972c0..3bb4011 100644
--- a/src/row/arch/neon.rs
+++ b/src/row/arch/neon.rs
@@ -703,7 +703,8 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_or_rgba_u16_row(
 ) {
     // SAFETY: caller obligations forwarded to the shared impl.
     unsafe {
-        yuv_444p_n_to_rgb_or_rgba_row::<BITS, false>(y, u, v, rgb_out, width, matrix, full_range);
+        yuv_444p_n_to_rgb_or_rgba_row::<BITS, false, false>(
+            y, u, v, rgb_out, width, matrix, full_range, None,
+        );
     }
 }
 
@@ -730,7 +733,8 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_row<const BITS: u32>(
 /// [`yuv_444p_n_to_rgb_row`]; the only differences are the per-pixel
 /// stride (4 vs 3) and the constant alpha byte (`0xFF`, opaque).
 ///
-/// Thin wrapper over [`yuv_444p_n_to_rgb_or_rgba_row`] with `ALPHA = true`.
+/// Thin wrapper over [`yuv_444p_n_to_rgb_or_rgba_row`] with
+/// `ALPHA = true, ALPHA_SRC = false`.
 ///
 /// # Safety
 ///
@@ -748,24 +752,75 @@ pub(crate) unsafe fn yuv_444p_n_to_rgba_row<const BITS: u32>(
 ) {
     // SAFETY: caller obligations forwarded to the shared impl.
     unsafe {
-        yuv_444p_n_to_rgb_or_rgba_row::<BITS, true>(y, u, v, rgba_out, width, matrix, full_range);
+        yuv_444p_n_to_rgb_or_rgba_row::<BITS, true, false>(
+            y, u, v, rgba_out, width, matrix, full_range, None,
+        );
+    }
+}
+
+/// NEON YUVA 4:4:4 planar high-bit-depth → packed **8-bit RGBA** with
+/// the per-pixel alpha byte **sourced from `a_src`** (depth-converted
+/// via `>> (BITS - 8)` to fit `u8`) instead of being constant `0xFF`.
+/// Same numerical contract as [`yuv_444p_n_to_rgba_row`] for R/G/B.
+///
+/// Thin wrapper over [`yuv_444p_n_to_rgb_or_rgba_row`] with
+/// `ALPHA = true, ALPHA_SRC = true`.
+///
+/// # Safety
+///
+/// Same as [`yuv_444p_n_to_rgba_row`] plus `a_src.len() >= width`.
+#[inline]
+#[target_feature(enable = "neon")]
+#[allow(clippy::too_many_arguments)]
+pub(crate) unsafe fn yuv_444p_n_to_rgba_with_alpha_src_row<const BITS: u32>(
+    y: &[u16],
+    u: &[u16],
+    v: &[u16],
+    a_src: &[u16],
+    rgba_out: &mut [u8],
+    width: usize,
+    matrix: ColorMatrix,
+    full_range: bool,
+) {
+    // SAFETY: caller obligations forwarded to the shared impl.
+    unsafe {
+        yuv_444p_n_to_rgb_or_rgba_row::<BITS, true, true>(
+            y,
+            u,
+            v,
+            rgba_out,
+            width,
+            matrix,
+            full_range,
+            Some(a_src),
+        );
     }
 }
 
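The wrapper above leans on one numeric contract: alpha is masked to `BITS` bits and then shifted down by `BITS - 8`. A minimal scalar sketch of that contract, outside the patch (the helper name here is illustrative, not the crate's internals):

```rust
/// Illustrative only: maps one BITS-wide alpha sample to u8 the way
/// the row kernels describe it (mask to BITS bits, drop the low
/// BITS - 8 bits). For BITS = 10, 1023 >> 2 == 255, so full opacity
/// stays full opacity.
fn alpha_n_to_u8<const BITS: u32>(a: u16) -> u8 {
    let mask: u16 = ((1u32 << BITS) - 1) as u16;
    ((a & mask) >> (BITS - 8)) as u8
}
```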
 /// Shared NEON high-bit-depth YUV 4:4:4 kernel for
-/// [`yuv_444p_n_to_rgb_row`] (`ALPHA = false`, `vst3q_u8`) and
-/// [`yuv_444p_n_to_rgba_row`] (`ALPHA = true`, `vst4q_u8` with
-/// constant `0xFF` alpha vector).
+/// [`yuv_444p_n_to_rgb_row`] (`ALPHA = false, ALPHA_SRC = false`,
+/// `vst3q_u8`), [`yuv_444p_n_to_rgba_row`] (`ALPHA = true,
+/// ALPHA_SRC = false`, `vst4q_u8` with constant `0xFF` alpha vector)
+/// and [`yuv_444p_n_to_rgba_with_alpha_src_row`] (`ALPHA = true,
+/// ALPHA_SRC = true`, `vst4q_u8` with the alpha lane loaded and
+/// depth-converted from `a_src`).
 ///
 /// # Safety
 ///
 /// 1. **NEON must be available.**
 /// 2. `y.len() >= width`, `u.len() >= width`, `v.len() >= width`,
 ///    `out.len() >= width * if ALPHA { 4 } else { 3 }`.
-/// 3. `BITS` must be one of `{9, 10, 12, 14}`.
+/// 3. When `ALPHA_SRC = true`: `a_src` must be `Some(_)` and
+///    `a_src.unwrap().len() >= width`.
+/// 4. `BITS` must be one of `{9, 10, 12, 14}`.
 #[inline]
 #[target_feature(enable = "neon")]
-pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_row<const BITS: u32, const ALPHA: bool>(
+#[allow(clippy::too_many_arguments)]
+pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_row<
+    const BITS: u32,
+    const ALPHA: bool,
+    const ALPHA_SRC: bool,
+>(
     y: &[u16],
     u: &[u16],
     v: &[u16],
@@ -773,13 +828,20 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_row<
     width: usize,
     matrix: ColorMatrix,
     full_range: bool,
+    a_src: Option<&[u16]>,
 ) {
     const { assert!(BITS == 9 || BITS == 10 || BITS == 12 || BITS == 14) };
+    // Source alpha requires RGBA output — there is no 3 bpp store with
+    // alpha to put it in.
+    const { assert!(!ALPHA_SRC || ALPHA) };
     let bpp: usize = if ALPHA { 4 } else { 3 };
     debug_assert!(y.len() >= width);
     debug_assert!(u.len() >= width);
     debug_assert!(v.len() >= width);
     debug_assert!(out.len() >= width * bpp);
+    if ALPHA_SRC {
+        debug_assert!(a_src.as_ref().is_some_and(|s| s.len() >= width));
+    }
 
     let coeffs = scalar::Coefficients::for_matrix(matrix);
     let (y_off, y_scale, c_scale) = scalar::range_params_n::<BITS>(full_range);
@@ -863,9 +925,28 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_row<
         let tail_y = &y[x..];
         let tail_u = &u[x..];
         let tail_v = &v[x..];
         let tail_out = &mut out[x * bpp..];
         let tail_w = width - x;
-        if ALPHA {
+        if ALPHA_SRC {
+            let tail_a = &a_src.unwrap()[x..];
+            scalar::yuv_444p_n_to_rgba_with_alpha_src_row::<BITS>(
+                tail_y, tail_u, tail_v, tail_a, tail_out, tail_w, matrix, full_range,
+            );
+        } else if ALPHA {
             scalar::yuv_444p_n_to_rgba_row::<BITS>(
                 tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range,
             );
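The `const { assert!(!ALPHA_SRC || ALPHA) }` guard above turns an impossible instantiation into a build failure instead of a silently wrong store path. A standalone sketch of the same pattern, not taken from the crate:

```rust
/// Illustrative only: a post-monomorphization guard. ALPHA_SRC
/// implies ALPHA, written as the implication !ALPHA_SRC || ALPHA.
fn store<const ALPHA: bool, const ALPHA_SRC: bool>() {
    // Evaluated per instantiation; invalid combinations fail to compile.
    const { assert!(!ALPHA_SRC || ALPHA) };
}

fn main() {
    store::<true, true>();   // ok: RGBA with sourced alpha
    store::<false, false>(); // ok: RGB
    // store::<false, true>(); // compile error: assertion failed
}
```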
diff --git a/src/row/arch/neon/tests.rs b/src/row/arch/neon/tests.rs
index d312142..2355c27 100644
--- a/src/row/arch/neon/tests.rs
+++ b/src/row/arch/neon/tests.rs
@@ -2547,6 +2547,136 @@ fn neon_p416_rgba_matches_scalar_all_matrices() {
     }
 }
 
+// ---- YUVA 4:4:4 u8 RGBA equivalence (Ship 8b‑1b) --------------------
+//
+// Mirrors the no-alpha 4:4:4 RGBA pattern above for the alpha-source
+// path: per-pixel alpha byte is loaded from the source plane, masked
+// with `bits_mask::<10>()`, and depth-converted via `>> 2`. Pseudo-
+// random alpha is used to flush out lane-order corruption that a
+// solid-alpha buffer would mask.
+
+fn check_yuv444p_n_u8_neon_rgba_with_alpha_src_equivalence<const BITS: u32>(
+    width: usize,
+    matrix: ColorMatrix,
+    full_range: bool,
+    alpha_seed: usize,
+) {
+    let y = planar_n_plane::<BITS>(width, 37);
+    let u = planar_n_plane::<BITS>(width, 53);
+    let v = planar_n_plane::<BITS>(width, 71);
+    let a_src = planar_n_plane::<BITS>(width, alpha_seed);
+    let mut rgba_scalar = std::vec![0u8; width * 4];
+    let mut rgba_neon = std::vec![0u8; width * 4];
+    scalar::yuv_444p_n_to_rgba_with_alpha_src_row::<BITS>(
+        &y,
+        &u,
+        &v,
+        &a_src,
+        &mut rgba_scalar,
+        width,
+        matrix,
+        full_range,
+    );
+    unsafe {
+        yuv_444p_n_to_rgba_with_alpha_src_row::<BITS>(
+            &y,
+            &u,
+            &v,
+            &a_src,
+            &mut rgba_neon,
+            width,
+            matrix,
+            full_range,
+        );
+    }
+    assert_eq!(
+        rgba_scalar, rgba_neon,
+        "NEON Yuva444p<{BITS}> → RGBA u8 diverges (width={width}, matrix={matrix:?}, full_range={full_range}, alpha_seed={alpha_seed})"
+    );
+}
+
+#[test]
+#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")]
+fn neon_yuva444p10_rgba_matches_scalar_all_matrices_16() {
+    for m in [
+        ColorMatrix::Bt601,
+        ColorMatrix::Bt709,
+        ColorMatrix::Bt2020Ncl,
+        ColorMatrix::Smpte240m,
+        ColorMatrix::Fcc,
+        ColorMatrix::YCgCo,
+    ] {
+        for full in [true, false] {
+            check_yuv444p_n_u8_neon_rgba_with_alpha_src_equivalence::<10>(16, m, full, 89);
+        }
+    }
+}
+
+#[test]
+#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")]
+fn neon_yuva444p10_rgba_matches_scalar_widths() {
+    // Natural width + tail widths forcing scalar-tail dispatch.
+    for w in [16usize, 17, 31, 47, 63, 1920, 1922] {
+        check_yuv444p_n_u8_neon_rgba_with_alpha_src_equivalence::<10>(w, ColorMatrix::Bt709, true, 89);
+    }
+}
+
+#[test]
+#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")]
+fn neon_yuva444p10_rgba_matches_scalar_random_alpha() {
+    // Different alpha seeds — ensures the alpha lane order through
+    // `vst4q_u8` is not confused with R/G/B.
+    for seed in [13usize, 41, 89, 127, 211] {
+        check_yuv444p_n_u8_neon_rgba_with_alpha_src_equivalence::<10>(
+            16,
+            ColorMatrix::Bt601,
+            false,
+            seed,
+        );
+        check_yuv444p_n_u8_neon_rgba_with_alpha_src_equivalence::<10>(
+            31,
+            ColorMatrix::Bt2020Ncl,
+            true,
+            seed,
+        );
+    }
+}
+
+#[test]
+#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")]
+fn neon_yuva444p_n_rgba_matches_scalar_all_bits() {
+    // BITS = 9, 12, 14 (BITS = 10 is covered above with full matrix
+    // sweep). Confirms the variable shift count `BITS - 8` resolves
+    // correctly across the supported bit depths.
+    for full in [true, false] {
+        check_yuv444p_n_u8_neon_rgba_with_alpha_src_equivalence::<9>(16, ColorMatrix::Bt601, full, 53);
+        check_yuv444p_n_u8_neon_rgba_with_alpha_src_equivalence::<12>(16, ColorMatrix::Bt709, full, 53);
+        check_yuv444p_n_u8_neon_rgba_with_alpha_src_equivalence::<14>(
+            16,
+            ColorMatrix::Bt2020Ncl,
+            full,
+            53,
+        );
+    }
+}
+
+#[test]
+#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")]
+fn neon_yuva444p_n_rgba_matches_scalar_all_bits_widths() {
+    // BITS = 9, 12, 14 across tail widths — the variable-shift alpha
+    // path applies across both SIMD body and scalar tail.
+    for w in [17usize, 47, 1922] {
+        check_yuv444p_n_u8_neon_rgba_with_alpha_src_equivalence::<9>(
+            w,
+            ColorMatrix::Smpte240m,
+            false,
+            89,
+        );
+        check_yuv444p_n_u8_neon_rgba_with_alpha_src_equivalence::<12>(w, ColorMatrix::Fcc, true, 89);
+        check_yuv444p_n_u8_neon_rgba_with_alpha_src_equivalence::<14>(w, ColorMatrix::YCgCo, false, 89);
+    }
+}
+
 // ---- High-bit 4:4:4 native-depth `u16` RGBA equivalence (Ship 8 Tranche 7c) ----
 //
 // u16 RGBA wrappers share the math of their u16 RGB siblings — only
diff --git a/src/row/arch/wasm_simd128.rs b/src/row/arch/wasm_simd128.rs
index b5eab6a..34dd85f 100644
--- a/src/row/arch/wasm_simd128.rs
+++ b/src/row/arch/wasm_simd128.rs
@@ -639,7 +639,8 @@ fn clamp_u16_max_wasm(v: v128, zero_v: v128, max_v: v128) -> v128 {
 /// WASM simd128 YUV 4:4:4 planar 9/10/12/14-bit → packed **u8** RGB.
 /// Const-generic over `BITS ∈ {9, 10, 12, 14}`. Block size 16 pixels.
 ///
-/// Thin wrapper over [`yuv_444p_n_to_rgb_or_rgba_row`] with `ALPHA = false`.
+/// Thin wrapper over [`yuv_444p_n_to_rgb_or_rgba_row`] with
+/// `ALPHA = false, ALPHA_SRC = false`.
 ///
 /// # Safety
 ///
@@ -659,7 +660,9 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_row<const BITS: u32>(
 ) {
     // SAFETY: caller obligations forwarded to the shared impl.
     unsafe {
-        yuv_444p_n_to_rgb_or_rgba_row::<BITS, false>(y, u, v, rgb_out, width, matrix, full_range);
+        yuv_444p_n_to_rgb_or_rgba_row::<BITS, false, false>(
+            y, u, v, rgb_out, width, matrix, full_range, None,
+        );
     }
 }
 
@@ -667,7 +670,8 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_row<const BITS: u32>(
 /// (`R, G, B, 0xFF`). Same numerical contract as
 /// [`yuv_444p_n_to_rgb_row`].
 ///
-/// Thin wrapper over [`yuv_444p_n_to_rgb_or_rgba_row`] with `ALPHA = true`.
+/// Thin wrapper over [`yuv_444p_n_to_rgb_or_rgba_row`] with
+/// `ALPHA = true, ALPHA_SRC = false`.
 ///
 /// # Safety
 ///
@@ -685,24 +689,76 @@ pub(crate) unsafe fn yuv_444p_n_to_rgba_row<const BITS: u32>(
 ) {
     // SAFETY: caller obligations forwarded to the shared impl.
     unsafe {
-        yuv_444p_n_to_rgb_or_rgba_row::<BITS, true>(y, u, v, rgba_out, width, matrix, full_range);
+        yuv_444p_n_to_rgb_or_rgba_row::<BITS, true, false>(
+            y, u, v, rgba_out, width, matrix, full_range, None,
+        );
+    }
+}
+
+/// WASM simd128 YUVA 4:4:4 planar 9/10/12/14-bit → packed **8-bit
+/// RGBA** with the per-pixel alpha byte **sourced from `a_src`**
+/// (depth-converted via `>> (BITS - 8)`) instead of being constant
+/// `0xFF`. Same numerical contract as [`yuv_444p_n_to_rgba_row`] for
+/// R/G/B.
+///
+/// Thin wrapper over [`yuv_444p_n_to_rgb_or_rgba_row`] with
+/// `ALPHA = true, ALPHA_SRC = true`.
+///
+/// # Safety
+///
+/// Same as [`yuv_444p_n_to_rgba_row`] plus `a_src.len() >= width`.
+#[inline]
+#[target_feature(enable = "simd128")]
+#[allow(clippy::too_many_arguments)]
+pub(crate) unsafe fn yuv_444p_n_to_rgba_with_alpha_src_row<const BITS: u32>(
+    y: &[u16],
+    u: &[u16],
+    v: &[u16],
+    a_src: &[u16],
+    rgba_out: &mut [u8],
+    width: usize,
+    matrix: ColorMatrix,
+    full_range: bool,
+) {
+    // SAFETY: caller obligations forwarded to the shared impl.
+    unsafe {
+        yuv_444p_n_to_rgb_or_rgba_row::<BITS, true, true>(
+            y,
+            u,
+            v,
+            rgba_out,
+            width,
+            matrix,
+            full_range,
+            Some(a_src),
+        );
     }
 }
 
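simd128 has no per-`BITS` immediate u16 shift to specialize, but `u16x8_shr` takes a runtime count, so `BITS - 8` feeds it directly. A hedged, self-contained sketch of that narrowing step (names and structure assumed, not the crate's kernel, which is not fully shown in this patch):

```rust
#[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
use core::arch::wasm32::{u16x8_shr, u8x16_narrow_i16x8, v128};

/// Illustrative only: shift two u16x8 alpha vectors down into 8-bit
/// range, then saturating-narrow the pair into one u8x16 vector.
/// Assumes both inputs are already masked to BITS bits, so the
/// shifted values fit in 0..=255 and the narrow cannot saturate.
#[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
fn narrow_alpha<const BITS: u32>(a_lo: v128, a_hi: v128) -> v128 {
    let lo = u16x8_shr(a_lo, BITS - 8);
    let hi = u16x8_shr(a_hi, BITS - 8);
    u8x16_narrow_i16x8(lo, hi)
}
```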
 /// Shared WASM simd128 high-bit-depth YUV 4:4:4 kernel for
-/// [`yuv_444p_n_to_rgb_row`] (`ALPHA = false`, `write_rgb_16`) and
-/// [`yuv_444p_n_to_rgba_row`] (`ALPHA = true`, `write_rgba_16` with
-/// constant `0xFF` alpha vector).
+/// [`yuv_444p_n_to_rgb_row`] (`ALPHA = false, ALPHA_SRC = false`,
+/// `write_rgb_16`), [`yuv_444p_n_to_rgba_row`] (`ALPHA = true,
+/// ALPHA_SRC = false`, `write_rgba_16` with constant `0xFF` alpha) and
+/// [`yuv_444p_n_to_rgba_with_alpha_src_row`] (`ALPHA = true,
+/// ALPHA_SRC = true`, `write_rgba_16` with the alpha lane loaded and
+/// depth-converted from `a_src`).
 ///
 /// # Safety
 ///
 /// 1. **simd128 must be enabled at compile time.**
 /// 2. `y.len() >= width`, `u.len() >= width`, `v.len() >= width`,
 ///    `out.len() >= width * if ALPHA { 4 } else { 3 }`.
-/// 3. `BITS` must be one of `{9, 10, 12, 14}`.
+/// 3. When `ALPHA_SRC = true`: `a_src` must be `Some(_)` and
+///    `a_src.unwrap().len() >= width`.
+/// 4. `BITS` must be one of `{9, 10, 12, 14}`.
 #[inline]
 #[target_feature(enable = "simd128")]
-pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_row<const BITS: u32, const ALPHA: bool>(
+#[allow(clippy::too_many_arguments)]
+pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_row<
+    const BITS: u32,
+    const ALPHA: bool,
+    const ALPHA_SRC: bool,
+>(
     y: &[u16],
     u: &[u16],
     v: &[u16],
@@ -710,13 +766,20 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_row<
     width: usize,
     matrix: ColorMatrix,
     full_range: bool,
+    a_src: Option<&[u16]>,
 ) {
     const { assert!(BITS == 9 || BITS == 10 || BITS == 12 || BITS == 14) };
+    // Source alpha requires RGBA output — there is no 3 bpp store with
+    // alpha to put it in.
+    const { assert!(!ALPHA_SRC || ALPHA) };
     let bpp: usize = if ALPHA { 4 } else { 3 };
     debug_assert!(y.len() >= width);
     debug_assert!(u.len() >= width);
     debug_assert!(v.len() >= width);
     debug_assert!(out.len() >= width * bpp);
+    if ALPHA_SRC {
+        debug_assert!(a_src.as_ref().is_some_and(|s| s.len() >= width));
+    }
 
     let coeffs = scalar::Coefficients::for_matrix(matrix);
     let (y_off, y_scale, c_scale) = scalar::range_params_n::<BITS>(full_range);
@@ -794,7 +857,26 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_row<
         let tail_y = &y[x..];
         let tail_u = &u[x..];
         let tail_v = &v[x..];
         let tail_out = &mut out[x * bpp..];
         let tail_w = width - x;
-        if ALPHA {
+        if ALPHA_SRC {
+            let tail_a = &a_src.unwrap()[x..];
+            scalar::yuv_444p_n_to_rgba_with_alpha_src_row::<BITS>(
+                tail_y, tail_u, tail_v, tail_a, tail_out, tail_w, matrix, full_range,
+            );
+        } else if ALPHA {
             scalar::yuv_444p_n_to_rgba_row::<BITS>(
                 tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range,
             );
diff --git a/src/row/arch/wasm_simd128/tests.rs b/src/row/arch/wasm_simd128/tests.rs
index 56603cd..21e4377 100644
--- a/src/row/arch/wasm_simd128/tests.rs
+++ b/src/row/arch/wasm_simd128/tests.rs
@@ -2084,6 +2084,152 @@ fn simd128_p416_rgba_matches_scalar_all_matrices() {
     }
 }
 
+// ---- YUVA 4:4:4 u8 RGBA equivalence (Ship 8b‑1b) --------------------
+//
+// Mirrors the no-alpha 4:4:4 RGBA pattern above for the alpha-source
+// path: per-pixel alpha byte is loaded from the source plane, masked
+// with `bits_mask::<10>()`, and depth-converted via `>> 2`. Pseudo-
+// random alpha is used to flush out lane-order corruption that a
+// solid-alpha buffer would mask. (Module-level cfg gates these on
+// `target_feature = "simd128"`, so no per-test feature guard is
+// needed.)
+
+fn check_yuv444p_n_u8_simd128_rgba_with_alpha_src_equivalence<const BITS: u32>(
+    width: usize,
+    matrix: ColorMatrix,
+    full_range: bool,
+    alpha_seed: usize,
+) {
+    let y = planar_n_plane::<BITS>(width, 37);
+    let u = planar_n_plane::<BITS>(width, 53);
+    let v = planar_n_plane::<BITS>(width, 71);
+    let a_src = planar_n_plane::<BITS>(width, alpha_seed);
+    let mut rgba_scalar = std::vec![0u8; width * 4];
+    let mut rgba_wasm = std::vec![0u8; width * 4];
+    scalar::yuv_444p_n_to_rgba_with_alpha_src_row::<BITS>(
+        &y,
+        &u,
+        &v,
+        &a_src,
+        &mut rgba_scalar,
+        width,
+        matrix,
+        full_range,
+    );
+    unsafe {
+        yuv_444p_n_to_rgba_with_alpha_src_row::<BITS>(
+            &y,
+            &u,
+            &v,
+            &a_src,
+            &mut rgba_wasm,
+            width,
+            matrix,
+            full_range,
+        );
+    }
+    assert_eq!(
+        rgba_scalar, rgba_wasm,
+        "wasm simd128 Yuva444p<{BITS}> → RGBA u8 diverges (width={width}, matrix={matrix:?}, full_range={full_range}, alpha_seed={alpha_seed})"
+    );
+}
+
+#[test]
+fn simd128_yuva444p10_rgba_matches_scalar_all_matrices_16() {
+    for m in [
+        ColorMatrix::Bt601,
+        ColorMatrix::Bt709,
+        ColorMatrix::Bt2020Ncl,
+        ColorMatrix::Smpte240m,
+        ColorMatrix::Fcc,
+        ColorMatrix::YCgCo,
+    ] {
+        for full in [true, false] {
+            check_yuv444p_n_u8_simd128_rgba_with_alpha_src_equivalence::<10>(16, m, full, 89);
+        }
+    }
+}
+
+#[test]
+fn simd128_yuva444p10_rgba_matches_scalar_widths() {
+    // Natural width + tail widths forcing scalar-tail dispatch.
+    for w in [16usize, 17, 31, 47, 63, 1920, 1922] {
+        check_yuv444p_n_u8_simd128_rgba_with_alpha_src_equivalence::<10>(
+            w,
+            ColorMatrix::Bt709,
+            true,
+            89,
+        );
+    }
+}
+
+#[test]
+fn simd128_yuva444p10_rgba_matches_scalar_random_alpha() {
+    // Different alpha seeds — `u8x16_narrow_i16x8` followed by
+    // `write_rgba_16` must place alpha in the 4th channel without
+    // lane-order corruption.
+    for seed in [13usize, 41, 89, 127, 211] {
+        check_yuv444p_n_u8_simd128_rgba_with_alpha_src_equivalence::<10>(
+            16,
+            ColorMatrix::Bt601,
+            false,
+            seed,
+        );
+        check_yuv444p_n_u8_simd128_rgba_with_alpha_src_equivalence::<10>(
+            31,
+            ColorMatrix::Bt2020Ncl,
+            true,
+            seed,
+        );
+    }
+}
+
+#[test]
+fn simd128_yuva444p_n_rgba_matches_scalar_all_bits() {
+    // BITS = 9, 12, 14 (BITS = 10 covered above). Confirms `u16x8_shr`
+    // with count `(BITS - 8)` resolves correctly across the supported
+    // bit depths.
+    for full in [true, false] {
+        check_yuv444p_n_u8_simd128_rgba_with_alpha_src_equivalence::<9>(
+            16,
+            ColorMatrix::Bt601,
+            full,
+            53,
+        );
+        check_yuv444p_n_u8_simd128_rgba_with_alpha_src_equivalence::<12>(
+            16,
+            ColorMatrix::Bt709,
+            full,
+            53,
+        );
+        check_yuv444p_n_u8_simd128_rgba_with_alpha_src_equivalence::<14>(
+            16,
+            ColorMatrix::Bt2020Ncl,
+            full,
+            53,
+        );
+    }
+}
+
+#[test]
+fn simd128_yuva444p_n_rgba_matches_scalar_all_bits_widths() {
+    for w in [17usize, 47, 1922] {
+        check_yuv444p_n_u8_simd128_rgba_with_alpha_src_equivalence::<9>(
+            w,
+            ColorMatrix::Smpte240m,
+            false,
+            89,
+        );
+        check_yuv444p_n_u8_simd128_rgba_with_alpha_src_equivalence::<12>(w, ColorMatrix::Fcc, true, 89);
+        check_yuv444p_n_u8_simd128_rgba_with_alpha_src_equivalence::<14>(
+            w,
+            ColorMatrix::YCgCo,
+            false,
+            89,
+        );
+    }
+}
+
 // ---- High-bit 4:4:4 native-depth `u16` RGBA equivalence (Ship 8 Tranche 7c) ----
 
 fn check_yuv444p_n_u16_simd128_rgba_equivalence(
diff --git a/src/row/arch/x86_avx2.rs b/src/row/arch/x86_avx2.rs
index 070f9e7..8ba3785 100644
--- a/src/row/arch/x86_avx2.rs
+++ b/src/row/arch/x86_avx2.rs
@@ -772,7 +772,9 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_row<const BITS: u32>(
 ) {
     // SAFETY: caller obligations forwarded to the shared impl.
     unsafe {
-        yuv_444p_n_to_rgb_or_rgba_row::<BITS, false>(y, u, v, rgb_out, width, matrix, full_range);
+        yuv_444p_n_to_rgb_or_rgba_row::<BITS, false, false>(
+            y, u, v, rgb_out, width, matrix, full_range, None,
+        );
     }
 }
 
@@ -780,7 +782,8 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_row<const BITS: u32>(
 /// (`R, G, B, 0xFF`). Same numerical contract as
 /// [`yuv_444p_n_to_rgb_row`].
 ///
-/// Thin wrapper over [`yuv_444p_n_to_rgb_or_rgba_row`] with `ALPHA = true`.
+/// Thin wrapper over [`yuv_444p_n_to_rgb_or_rgba_row`] with
+/// `ALPHA = true, ALPHA_SRC = false`.
 ///
 /// # Safety
 ///
@@ -798,24 +801,75 @@ pub(crate) unsafe fn yuv_444p_n_to_rgba_row<const BITS: u32>(
 ) {
     // SAFETY: caller obligations forwarded to the shared impl.
     unsafe {
-        yuv_444p_n_to_rgb_or_rgba_row::<BITS, true>(y, u, v, rgba_out, width, matrix, full_range);
+        yuv_444p_n_to_rgb_or_rgba_row::<BITS, true, false>(
+            y, u, v, rgba_out, width, matrix, full_range, None,
+        );
+    }
+}
+
+/// AVX2 YUVA 4:4:4 planar 9/10/12/14-bit → packed **8-bit RGBA** with
+/// the per-pixel alpha byte **sourced from `a_src`** (depth-converted
+/// via `>> (BITS - 8)`) instead of being constant `0xFF`. Same
+/// numerical contract as [`yuv_444p_n_to_rgba_row`] for R/G/B.
+///
+/// Thin wrapper over [`yuv_444p_n_to_rgb_or_rgba_row`] with
+/// `ALPHA = true, ALPHA_SRC = true`.
+///
+/// # Safety
+///
+/// Same as [`yuv_444p_n_to_rgba_row`] plus `a_src.len() >= width`.
+#[inline]
+#[target_feature(enable = "avx2")]
+#[allow(clippy::too_many_arguments)]
+pub(crate) unsafe fn yuv_444p_n_to_rgba_with_alpha_src_row<const BITS: u32>(
+    y: &[u16],
+    u: &[u16],
+    v: &[u16],
+    a_src: &[u16],
+    rgba_out: &mut [u8],
+    width: usize,
+    matrix: ColorMatrix,
+    full_range: bool,
+) {
+    // SAFETY: caller obligations forwarded to the shared impl.
+    unsafe {
+        yuv_444p_n_to_rgb_or_rgba_row::<BITS, true, true>(
+            y,
+            u,
+            v,
+            rgba_out,
+            width,
+            matrix,
+            full_range,
+            Some(a_src),
+        );
     }
 }
 
 /// Shared AVX2 high-bit-depth YUV 4:4:4 kernel for
-/// [`yuv_444p_n_to_rgb_row`] (`ALPHA = false`, `write_rgb_32`) and
-/// [`yuv_444p_n_to_rgba_row`] (`ALPHA = true`, `write_rgba_32` with
-/// constant `0xFF` alpha vector).
+/// [`yuv_444p_n_to_rgb_row`] (`ALPHA = false, ALPHA_SRC = false`,
+/// `write_rgb_32`), [`yuv_444p_n_to_rgba_row`] (`ALPHA = true,
+/// ALPHA_SRC = false`, `write_rgba_32` with constant `0xFF` alpha) and
+/// [`yuv_444p_n_to_rgba_with_alpha_src_row`] (`ALPHA = true,
+/// ALPHA_SRC = true`, `write_rgba_32` with the alpha lane loaded and
+/// depth-converted from `a_src`).
 ///
 /// # Safety
 ///
 /// 1. **AVX2 must be available on the current CPU.**
 /// 2. `y.len() >= width`, `u.len() >= width`, `v.len() >= width`,
 ///    `out.len() >= width * if ALPHA { 4 } else { 3 }`.
-/// 3. `BITS` must be one of `{9, 10, 12, 14}`.
+/// 3. When `ALPHA_SRC = true`: `a_src` must be `Some(_)` and
+///    `a_src.unwrap().len() >= width`.
+/// 4. `BITS` must be one of `{9, 10, 12, 14}`.
 #[inline]
 #[target_feature(enable = "avx2")]
-pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_row<const BITS: u32, const ALPHA: bool>(
+#[allow(clippy::too_many_arguments)]
+pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_row<
+    const BITS: u32,
+    const ALPHA: bool,
+    const ALPHA_SRC: bool,
+>(
     y: &[u16],
     u: &[u16],
     v: &[u16],
@@ -823,13 +877,20 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_row<
     width: usize,
     matrix: ColorMatrix,
     full_range: bool,
+    a_src: Option<&[u16]>,
 ) {
     const { assert!(BITS == 9 || BITS == 10 || BITS == 12 || BITS == 14) };
+    // Source alpha requires RGBA output — there is no 3 bpp store with
+    // alpha to put it in.
+    const { assert!(!ALPHA_SRC || ALPHA) };
     let bpp: usize = if ALPHA { 4 } else { 3 };
     debug_assert!(y.len() >= width);
     debug_assert!(u.len() >= width);
     debug_assert!(v.len() >= width);
     debug_assert!(out.len() >= width * bpp);
+    if ALPHA_SRC {
+        debug_assert!(a_src.as_ref().is_some_and(|s| s.len() >= width));
+    }
 
     let coeffs = scalar::Coefficients::for_matrix(matrix);
     let (y_off, y_scale, c_scale) = scalar::range_params_n::<BITS>(full_range);
@@ -932,7 +993,27 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_row<
             if ALPHA {
-                write_rgba_32(r_u8, g_u8, b_u8, alpha_u8, out.as_mut_ptr().add(x * 4));
+                let a_u8 = if ALPHA_SRC {
+                    let a_ptr = a_src.unwrap().as_ptr().add(x);
+                    let a_lo = _mm256_loadu_si256(a_ptr.cast());
+                    let a_hi = _mm256_loadu_si256(a_ptr.add(16).cast());
+                    // `_mm256_srli_epi16::<IMM8>` requires a literal shift, so
+                    // use `_mm256_srl_epi16` with a count vector built from
+                    // `BITS - 8`. `_mm256_packus_epi16` interleaves the two
+                    // 128-bit lanes — `narrow_u8x32` already pays this cost for
+                    // R/G/B; we use the same helper for the alpha lane.
+                    let a_shr = _mm_cvtsi32_si128((BITS - 8) as i32);
+                    let a_lo_shifted = _mm256_srl_epi16(a_lo, a_shr);
+                    let a_hi_shifted = _mm256_srl_epi16(a_hi, a_shr);
+                    narrow_u8x32(a_lo_shifted, a_hi_shifted)
+                } else {
+                    alpha_u8
+                };
+                write_rgba_32(r_u8, g_u8, b_u8, a_u8, out.as_mut_ptr().add(x * 4));
             } else {
                 write_rgb_32(r_u8, g_u8, b_u8, out.as_mut_ptr().add(x * 3));
             }
@@ -946,7 +1027,13 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_row<
         let tail_v = &v[x..];
         let tail_out = &mut out[x * bpp..];
         let tail_w = width - x;
-        if ALPHA {
+        if ALPHA_SRC {
+            let tail_a = &a_src.unwrap()[x..];
+            scalar::yuv_444p_n_to_rgba_with_alpha_src_row::<BITS>(
+                tail_y, tail_u, tail_v, tail_a, tail_out, tail_w, matrix, full_range,
+            );
+        } else if ALPHA {
             scalar::yuv_444p_n_to_rgba_row::<BITS>(
                 tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range,
            );
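The comment in the kernel body is the crux of the x86 alpha path: `_mm256_srli_epi16` takes its count as a const generic in Rust, which a `BITS`-dependent expression cannot supply, while `_mm256_srl_epi16` reads the count from the low bits of an XMM register. A minimal sketch of the two forms (standalone, not the crate's code):

```rust
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::{
    __m128i, __m256i, _mm256_srl_epi16, _mm256_srli_epi16, _mm_cvtsi32_si128,
};

/// Illustrative only: immediate-count shift. IMM8 must be a literal or
/// a free const parameter; `BITS - 8` cannot be passed through here.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn shift_imm<const IMM8: i32>(v: __m256i) -> __m256i {
    _mm256_srli_epi16::<IMM8>(v)
}

/// Illustrative only: variable-count shift. The count rides in an XMM
/// register, so any const-generic-derived value works.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn shift_var(v: __m256i, bits: u32) -> __m256i {
    let count: __m128i = _mm_cvtsi32_si128((bits - 8) as i32);
    _mm256_srl_epi16(v, count)
}
```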
diff --git a/src/row/arch/x86_avx2/tests.rs b/src/row/arch/x86_avx2/tests.rs
index e6a6009..d464ee4 100644
--- a/src/row/arch/x86_avx2/tests.rs
+++ b/src/row/arch/x86_avx2/tests.rs
@@ -2335,6 +2335,146 @@ fn avx2_p416_rgba_matches_scalar_all_matrices() {
     }
 }
 
+// ---- YUVA 4:4:4 u8 RGBA equivalence (Ship 8b‑1b) --------------------
+//
+// Mirrors the no-alpha 4:4:4 RGBA pattern above for the alpha-source
+// path: per-pixel alpha byte is loaded from the source plane, masked
+// with `bits_mask::<10>()`, and depth-converted via `>> 2`. Pseudo-
+// random alpha is used to flush out lane-order corruption that a
+// solid-alpha buffer would mask. AVX2's `narrow_u8x32` per-lane
+// permute fixup is exercised on the alpha lane just like R/G/B.
+
+fn check_yuv444p_n_u8_avx2_rgba_with_alpha_src_equivalence<const BITS: u32>(
+    width: usize,
+    matrix: ColorMatrix,
+    full_range: bool,
+    alpha_seed: usize,
+) {
+    let y = planar_n_plane::<BITS>(width, 37);
+    let u = planar_n_plane::<BITS>(width, 53);
+    let v = planar_n_plane::<BITS>(width, 71);
+    let a_src = planar_n_plane::<BITS>(width, alpha_seed);
+    let mut rgba_scalar = std::vec![0u8; width * 4];
+    let mut rgba_simd = std::vec![0u8; width * 4];
+    scalar::yuv_444p_n_to_rgba_with_alpha_src_row::<BITS>(
+        &y,
+        &u,
+        &v,
+        &a_src,
+        &mut rgba_scalar,
+        width,
+        matrix,
+        full_range,
+    );
+    unsafe {
+        yuv_444p_n_to_rgba_with_alpha_src_row::<BITS>(
+            &y,
+            &u,
+            &v,
+            &a_src,
+            &mut rgba_simd,
+            width,
+            matrix,
+            full_range,
+        );
+    }
+    assert_eq!(
+        rgba_scalar, rgba_simd,
+        "AVX2 Yuva444p<{BITS}> → RGBA u8 diverges (width={width}, matrix={matrix:?}, full_range={full_range}, alpha_seed={alpha_seed})"
+    );
+}
+
+#[test]
+fn avx2_yuva444p10_rgba_matches_scalar_all_matrices_32() {
+    if !std::arch::is_x86_feature_detected!("avx2") {
+        return;
+    }
+    for m in [
+        ColorMatrix::Bt601,
+        ColorMatrix::Bt709,
+        ColorMatrix::Bt2020Ncl,
+        ColorMatrix::Smpte240m,
+        ColorMatrix::Fcc,
+        ColorMatrix::YCgCo,
+    ] {
+        for full in [true, false] {
+            check_yuv444p_n_u8_avx2_rgba_with_alpha_src_equivalence::<10>(32, m, full, 89);
+        }
+    }
+}
+
+#[test]
+fn avx2_yuva444p10_rgba_matches_scalar_widths() {
+    if !std::arch::is_x86_feature_detected!("avx2") {
+        return;
+    }
+    // Natural width + tail widths forcing scalar-tail dispatch.
+    for w in [32usize, 17, 31, 47, 63, 1920, 1922] {
+        check_yuv444p_n_u8_avx2_rgba_with_alpha_src_equivalence::<10>(w, ColorMatrix::Bt709, true, 89);
+    }
+}
+
+#[test]
+fn avx2_yuva444p10_rgba_matches_scalar_random_alpha() {
+    if !std::arch::is_x86_feature_detected!("avx2") {
+        return;
+    }
+    // Different alpha seeds — `_mm256_packus_epi16` followed by the
+    // `narrow_u8x32` per-lane permute fixup must place alpha in the 4th
+    // channel without lane-order corruption.
+    for seed in [13usize, 41, 89, 127, 211] {
+        check_yuv444p_n_u8_avx2_rgba_with_alpha_src_equivalence::<10>(
+            32,
+            ColorMatrix::Bt601,
+            false,
+            seed,
+        );
+        check_yuv444p_n_u8_avx2_rgba_with_alpha_src_equivalence::<10>(
+            47,
+            ColorMatrix::Bt2020Ncl,
+            true,
+            seed,
+        );
+    }
+}
+
+#[test]
+fn avx2_yuva444p_n_rgba_matches_scalar_all_bits() {
+    if !std::arch::is_x86_feature_detected!("avx2") {
+        return;
+    }
+    // BITS = 9, 12, 14 (BITS = 10 covered above). Confirms
+    // `_mm256_srl_epi16` with count `(BITS - 8)` resolves correctly
+    // across the supported bit depths.
+    for full in [true, false] {
+        check_yuv444p_n_u8_avx2_rgba_with_alpha_src_equivalence::<9>(32, ColorMatrix::Bt601, full, 53);
+        check_yuv444p_n_u8_avx2_rgba_with_alpha_src_equivalence::<12>(32, ColorMatrix::Bt709, full, 53);
+        check_yuv444p_n_u8_avx2_rgba_with_alpha_src_equivalence::<14>(
+            32,
+            ColorMatrix::Bt2020Ncl,
+            full,
+            53,
+        );
+    }
+}
+
+#[test]
+fn avx2_yuva444p_n_rgba_matches_scalar_all_bits_widths() {
+    if !std::arch::is_x86_feature_detected!("avx2") {
+        return;
+    }
+    for w in [17usize, 47, 1922] {
+        check_yuv444p_n_u8_avx2_rgba_with_alpha_src_equivalence::<9>(
+            w,
+            ColorMatrix::Smpte240m,
+            false,
+            89,
+        );
+        check_yuv444p_n_u8_avx2_rgba_with_alpha_src_equivalence::<12>(w, ColorMatrix::Fcc, true, 89);
+        check_yuv444p_n_u8_avx2_rgba_with_alpha_src_equivalence::<14>(w, ColorMatrix::YCgCo, false, 89);
+    }
+}
+
 // ---- High-bit 4:4:4 native-depth `u16` RGBA equivalence (Ship 8 Tranche 7c) ----
 
 fn check_yuv444p_n_u16_avx2_rgba_equivalence(
diff --git a/src/row/arch/x86_avx512.rs b/src/row/arch/x86_avx512.rs
index ba587d3..147bc7a 100644
--- a/src/row/arch/x86_avx512.rs
+++ b/src/row/arch/x86_avx512.rs
@@ -822,7 +822,8 @@ unsafe fn write_quarter_rgba(
 /// AVX-512 YUV 4:4:4 planar 9/10/12/14-bit → packed **u8** RGB.
 /// Const-generic over `BITS ∈ {9, 10, 12, 14}`. Block size 64 pixels.
 ///
-/// Thin wrapper over [`yuv_444p_n_to_rgb_or_rgba_row`] with `ALPHA = false`.
+/// Thin wrapper over [`yuv_444p_n_to_rgb_or_rgba_row`] with
+/// `ALPHA = false, ALPHA_SRC = false`.
 ///
 /// # Safety
 ///
@@ -842,7 +843,9 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_row<const BITS: u32>(
 ) {
     // SAFETY: caller obligations forwarded to the shared impl.
     unsafe {
-        yuv_444p_n_to_rgb_or_rgba_row::<BITS, false>(y, u, v, rgb_out, width, matrix, full_range);
+        yuv_444p_n_to_rgb_or_rgba_row::<BITS, false, false>(
+            y, u, v, rgb_out, width, matrix, full_range, None,
+        );
     }
 }
 
@@ -850,7 +853,8 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_row<const BITS: u32>(
 /// (`R, G, B, 0xFF`). Same numerical contract as
 /// [`yuv_444p_n_to_rgb_row`].
 ///
-/// Thin wrapper over [`yuv_444p_n_to_rgb_or_rgba_row`] with `ALPHA = true`.
+/// Thin wrapper over [`yuv_444p_n_to_rgb_or_rgba_row`] with
+/// `ALPHA = true, ALPHA_SRC = false`.
 ///
 /// # Safety
 ///
@@ -868,24 +872,76 @@ pub(crate) unsafe fn yuv_444p_n_to_rgba_row<const BITS: u32>(
 ) {
     // SAFETY: caller obligations forwarded to the shared impl.
     unsafe {
-        yuv_444p_n_to_rgb_or_rgba_row::<BITS, true>(y, u, v, rgba_out, width, matrix, full_range);
+        yuv_444p_n_to_rgb_or_rgba_row::<BITS, true, false>(
+            y, u, v, rgba_out, width, matrix, full_range, None,
+        );
+    }
+}
+
+/// AVX-512 YUVA 4:4:4 planar 9/10/12/14-bit → packed **8-bit RGBA**
+/// with the per-pixel alpha byte **sourced from `a_src`**
+/// (depth-converted via `>> (BITS - 8)`) instead of being constant
+/// `0xFF`. Same numerical contract as [`yuv_444p_n_to_rgba_row`] for
+/// R/G/B.
+///
+/// Thin wrapper over [`yuv_444p_n_to_rgb_or_rgba_row`] with
+/// `ALPHA = true, ALPHA_SRC = true`.
+///
+/// # Safety
+///
+/// Same as [`yuv_444p_n_to_rgba_row`] plus `a_src.len() >= width`.
+#[inline]
+#[target_feature(enable = "avx512f,avx512bw")]
+#[allow(clippy::too_many_arguments)]
+pub(crate) unsafe fn yuv_444p_n_to_rgba_with_alpha_src_row<const BITS: u32>(
+    y: &[u16],
+    u: &[u16],
+    v: &[u16],
+    a_src: &[u16],
+    rgba_out: &mut [u8],
+    width: usize,
+    matrix: ColorMatrix,
+    full_range: bool,
+) {
+    // SAFETY: caller obligations forwarded to the shared impl.
+    unsafe {
+        yuv_444p_n_to_rgb_or_rgba_row::<BITS, true, true>(
+            y,
+            u,
+            v,
+            rgba_out,
+            width,
+            matrix,
+            full_range,
+            Some(a_src),
+        );
     }
 }
 
 /// Shared AVX-512 high-bit-depth YUV 4:4:4 kernel for
-/// [`yuv_444p_n_to_rgb_row`] (`ALPHA = false`, `write_rgb_64`) and
-/// [`yuv_444p_n_to_rgba_row`] (`ALPHA = true`, `write_rgba_64` with
-/// constant `0xFF` alpha vector).
+/// [`yuv_444p_n_to_rgb_row`] (`ALPHA = false, ALPHA_SRC = false`,
+/// `write_rgb_64`), [`yuv_444p_n_to_rgba_row`] (`ALPHA = true,
+/// ALPHA_SRC = false`, `write_rgba_64` with constant `0xFF` alpha) and
+/// [`yuv_444p_n_to_rgba_with_alpha_src_row`] (`ALPHA = true,
+/// ALPHA_SRC = true`, `write_rgba_64` with the alpha lane loaded and
+/// depth-converted from `a_src`).
 ///
 /// # Safety
 ///
 /// 1. **AVX-512F + AVX-512BW must be available.**
 /// 2. `y.len() >= width`, `u.len() >= width`, `v.len() >= width`,
 ///    `out.len() >= width * if ALPHA { 4 } else { 3 }`.
-/// 3. `BITS` must be one of `{9, 10, 12, 14}`.
+/// 3. When `ALPHA_SRC = true`: `a_src` must be `Some(_)` and
+///    `a_src.unwrap().len() >= width`.
+/// 4. `BITS` must be one of `{9, 10, 12, 14}`.
 #[inline]
 #[target_feature(enable = "avx512f,avx512bw")]
-pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_row<const BITS: u32, const ALPHA: bool>(
+#[allow(clippy::too_many_arguments)]
+pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_row<
+    const BITS: u32,
+    const ALPHA: bool,
+    const ALPHA_SRC: bool,
+>(
     y: &[u16],
     u: &[u16],
     v: &[u16],
@@ -893,13 +949,20 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_row<
     width: usize,
     matrix: ColorMatrix,
     full_range: bool,
+    a_src: Option<&[u16]>,
 ) {
     const { assert!(BITS == 9 || BITS == 10 || BITS == 12 || BITS == 14) };
+    // Source alpha requires RGBA output — there is no 3 bpp store with
+    // alpha to put it in.
+    const { assert!(!ALPHA_SRC || ALPHA) };
     let bpp: usize = if ALPHA { 4 } else { 3 };
     debug_assert!(y.len() >= width);
     debug_assert!(u.len() >= width);
     debug_assert!(v.len() >= width);
     debug_assert!(out.len() >= width * bpp);
+    if ALPHA_SRC {
+        debug_assert!(a_src.as_ref().is_some_and(|s| s.len() >= width));
+    }
 
     let coeffs = scalar::Coefficients::for_matrix(matrix);
     let (y_off, y_scale, c_scale) = scalar::range_params_n::<BITS>(full_range);
@@ -1016,7 +1079,26 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_row<
             if ALPHA {
-                write_rgba_64(r_u8, g_u8, b_u8, alpha_u8, out.as_mut_ptr().add(x * 4));
+                let a_u8 = if ALPHA_SRC {
+                    let a_ptr = a_src.unwrap().as_ptr().add(x);
+                    let a_lo = _mm512_loadu_si512(a_ptr.cast());
+                    let a_hi = _mm512_loadu_si512(a_ptr.add(32).cast());
+                    // `_mm512_srli_epi16::<IMM8>` requires a literal shift, so
+                    // use `_mm512_srl_epi16` with a count vector built from
+                    // `BITS - 8`. `narrow_u8x64` applies the per-lane permute
+                    // fixup `pack_fixup` that R/G/B already pay for.
+                    let a_shr = _mm_cvtsi32_si128((BITS - 8) as i32);
+                    let a_lo_shifted = _mm512_srl_epi16(a_lo, a_shr);
+                    let a_hi_shifted = _mm512_srl_epi16(a_hi, a_shr);
+                    narrow_u8x64(a_lo_shifted, a_hi_shifted, pack_fixup)
+                } else {
+                    alpha_u8
+                };
+                write_rgba_64(r_u8, g_u8, b_u8, a_u8, out.as_mut_ptr().add(x * 4));
             } else {
                 write_rgb_64(r_u8, g_u8, b_u8, out.as_mut_ptr().add(x * 3));
             }
@@ -1030,7 +1112,13 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_row<
         let tail_v = &v[x..];
         let tail_out = &mut out[x * bpp..];
        let tail_w = width - x;
-        if ALPHA {
+        if ALPHA_SRC {
+            let tail_a = &a_src.unwrap()[x..];
+            scalar::yuv_444p_n_to_rgba_with_alpha_src_row::<BITS>(
+                tail_y, tail_u, tail_v, tail_a, tail_out, tail_w, matrix, full_range,
+            );
+        } else if ALPHA {
             scalar::yuv_444p_n_to_rgba_row::<BITS>(
                 tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range,
             );
diff --git a/src/row/arch/x86_avx512/tests.rs b/src/row/arch/x86_avx512/tests.rs
index 31152c3..bba9c0f 100644
--- a/src/row/arch/x86_avx512/tests.rs
+++ b/src/row/arch/x86_avx512/tests.rs
@@ -2359,6 +2359,166 @@ fn avx512_p416_rgba_matches_scalar_all_matrices() {
     }
 }
 
+// ---- YUVA 4:4:4 u8 RGBA equivalence (Ship 8b‑1b) --------------------
+//
+// Mirrors the no-alpha 4:4:4 RGBA pattern above for the alpha-source
+// path: per-pixel alpha byte is loaded from the source plane, masked
+// with `bits_mask::<10>()`, and depth-converted via `>> 2`. Pseudo-
+// random alpha is used to flush out lane-order corruption that a
+// solid-alpha buffer would mask. AVX-512's `narrow_u8x64` per-lane
+// permute fixup is exercised on the alpha lane just like R/G/B.
+
+fn check_yuv444p_n_u8_avx512_rgba_with_alpha_src_equivalence<const BITS: u32>(
+    width: usize,
+    matrix: ColorMatrix,
+    full_range: bool,
+    alpha_seed: usize,
+) {
+    let y = planar_n_plane::<BITS>(width, 37);
+    let u = planar_n_plane::<BITS>(width, 53);
+    let v = planar_n_plane::<BITS>(width, 71);
+    let a_src = planar_n_plane::<BITS>(width, alpha_seed);
+    let mut rgba_scalar = std::vec![0u8; width * 4];
+    let mut rgba_simd = std::vec![0u8; width * 4];
+    scalar::yuv_444p_n_to_rgba_with_alpha_src_row::<BITS>(
+        &y,
+        &u,
+        &v,
+        &a_src,
+        &mut rgba_scalar,
+        width,
+        matrix,
+        full_range,
+    );
+    unsafe {
+        yuv_444p_n_to_rgba_with_alpha_src_row::<BITS>(
+            &y,
+            &u,
+            &v,
+            &a_src,
+            &mut rgba_simd,
+            width,
+            matrix,
+            full_range,
+        );
+    }
+    assert_eq!(
+        rgba_scalar, rgba_simd,
+        "AVX-512 Yuva444p<{BITS}> → RGBA u8 diverges (width={width}, matrix={matrix:?}, full_range={full_range}, alpha_seed={alpha_seed})"
+    );
+}
+
+#[test]
+fn avx512_yuva444p10_rgba_matches_scalar_all_matrices_64() {
+    if !std::arch::is_x86_feature_detected!("avx512bw") {
+        return;
+    }
+    for m in [
+        ColorMatrix::Bt601,
+        ColorMatrix::Bt709,
+        ColorMatrix::Bt2020Ncl,
+        ColorMatrix::Smpte240m,
+        ColorMatrix::Fcc,
+        ColorMatrix::YCgCo,
+    ] {
+        for full in [true, false] {
+            check_yuv444p_n_u8_avx512_rgba_with_alpha_src_equivalence::<10>(64, m, full, 89);
+        }
+    }
+}
+
+#[test]
+fn avx512_yuva444p10_rgba_matches_scalar_widths() {
+    if !std::arch::is_x86_feature_detected!("avx512bw") {
+        return;
+    }
+    // Natural width + tail widths forcing scalar-tail dispatch.
+    for w in [64usize, 17, 31, 47, 63, 1920, 1922] {
+        check_yuv444p_n_u8_avx512_rgba_with_alpha_src_equivalence::<10>(
+            w,
+            ColorMatrix::Bt709,
+            true,
+            89,
+        );
+    }
+}
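`_mm512_packus_epi16` narrows within each 128-bit lane, so the packed bytes come out lane-interleaved and a cross-lane permute must restore pixel order. The crate's `narrow_u8x64`/`pack_fixup` are its own helpers and are not shown in this patch; a hedged sketch of the underlying idea:

```rust
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::{
    __m512i, _mm512_packus_epi16, _mm512_permutexvar_epi64, _mm512_setr_epi64,
};

/// Illustrative only: pack two u16x32 vectors into one u8x64. packus
/// emits, per 128-bit lane pair, 8 bytes from `lo` then 8 from `hi`,
/// so a qword permute is needed to put the 64 bytes in linear order.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512f,avx512bw")]
unsafe fn narrow_u8x64_sketch(lo: __m512i, hi: __m512i) -> __m512i {
    let packed = _mm512_packus_epi16(lo, hi);
    // Gather qwords [lo0, lo1, lo2, lo3, hi0, hi1, hi2, hi3].
    let idx = _mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7);
    _mm512_permutexvar_epi64(idx, packed)
}
```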
+
+#[test]
+fn avx512_yuva444p10_rgba_matches_scalar_random_alpha() {
+    if !std::arch::is_x86_feature_detected!("avx512bw") {
+        return;
+    }
+    // Different alpha seeds — `_mm512_packus_epi16` followed by the
+    // `narrow_u8x64` per-lane permute fixup must place alpha in the 4th
+    // channel without lane-order corruption.
+    for seed in [13usize, 41, 89, 127, 211] {
+        check_yuv444p_n_u8_avx512_rgba_with_alpha_src_equivalence::<10>(
+            64,
+            ColorMatrix::Bt601,
+            false,
+            seed,
+        );
+        check_yuv444p_n_u8_avx512_rgba_with_alpha_src_equivalence::<10>(
+            127,
+            ColorMatrix::Bt2020Ncl,
+            true,
+            seed,
+        );
+    }
+}
+
+#[test]
+fn avx512_yuva444p_n_rgba_matches_scalar_all_bits() {
+    if !std::arch::is_x86_feature_detected!("avx512bw") {
+        return;
+    }
+    // BITS = 9, 12, 14 (BITS = 10 covered above). Confirms
+    // `_mm512_srl_epi16` with count `(BITS - 8)` resolves correctly
+    // across the supported bit depths.
+    for full in [true, false] {
+        check_yuv444p_n_u8_avx512_rgba_with_alpha_src_equivalence::<9>(
+            64,
+            ColorMatrix::Bt601,
+            full,
+            53,
+        );
+        check_yuv444p_n_u8_avx512_rgba_with_alpha_src_equivalence::<12>(
+            64,
+            ColorMatrix::Bt709,
+            full,
+            53,
+        );
+        check_yuv444p_n_u8_avx512_rgba_with_alpha_src_equivalence::<14>(
+            64,
+            ColorMatrix::Bt2020Ncl,
+            full,
+            53,
+        );
+    }
+}
+
+#[test]
+fn avx512_yuva444p_n_rgba_matches_scalar_all_bits_widths() {
+    if !std::arch::is_x86_feature_detected!("avx512bw") {
+        return;
+    }
+    for w in [17usize, 47, 1922] {
+        check_yuv444p_n_u8_avx512_rgba_with_alpha_src_equivalence::<9>(
+            w,
+            ColorMatrix::Smpte240m,
+            false,
+            89,
+        );
+        check_yuv444p_n_u8_avx512_rgba_with_alpha_src_equivalence::<12>(w, ColorMatrix::Fcc, true, 89);
+        check_yuv444p_n_u8_avx512_rgba_with_alpha_src_equivalence::<14>(
+            w,
+            ColorMatrix::YCgCo,
+            false,
+            89,
+        );
+    }
+}
+
 // ---- High-bit 4:4:4 native-depth `u16` RGBA equivalence (Ship 8 Tranche 7c) ----
 
 fn check_yuv444p_n_u16_avx512_rgba_equivalence(
diff --git a/src/row/arch/x86_sse41.rs b/src/row/arch/x86_sse41.rs
index 390b8cc..fda5824 100644
--- a/src/row/arch/x86_sse41.rs
+++ b/src/row/arch/x86_sse41.rs
@@ -1087,7 +1087,8 @@ fn clamp_u16_max(v: __m128i, zero_v: __m128i, max_v: __m128i) -> __m128i {
 /// skipping the horizontal chroma-duplication step (4:4:4 chroma is
 /// 1:1 with Y, not paired).
 ///
-/// Thin wrapper over [`yuv_444p_n_to_rgb_or_rgba_row`] with `ALPHA = false`.
+/// Thin wrapper over [`yuv_444p_n_to_rgb_or_rgba_row`] with
+/// `ALPHA = false, ALPHA_SRC = false`.
 ///
 /// # Numerical contract
 ///
@@ -1111,7 +1112,9 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_row<const BITS: u32>(
 ) {
     // SAFETY: caller obligations forwarded to the shared impl.
     unsafe {
-        yuv_444p_n_to_rgb_or_rgba_row::<BITS, false>(y, u, v, rgb_out, width, matrix, full_range);
+        yuv_444p_n_to_rgb_or_rgba_row::<BITS, false, false>(
+            y, u, v, rgb_out, width, matrix, full_range, None,
+        );
     }
 }
 
@@ -1119,7 +1122,8 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_row<const BITS: u32>(
 /// (`R, G, B, 0xFF`). Same numerical contract as
 /// [`yuv_444p_n_to_rgb_row`].
 ///
-/// Thin wrapper over [`yuv_444p_n_to_rgb_or_rgba_row`] with `ALPHA = true`.
+/// Thin wrapper over [`yuv_444p_n_to_rgb_or_rgba_row`] with
+/// `ALPHA = true, ALPHA_SRC = false`.
 ///
 /// # Safety
 ///
@@ -1137,24 +1141,75 @@ pub(crate) unsafe fn yuv_444p_n_to_rgba_row<const BITS: u32>(
 ) {
     // SAFETY: caller obligations forwarded to the shared impl.
     unsafe {
-        yuv_444p_n_to_rgb_or_rgba_row::<BITS, true>(y, u, v, rgba_out, width, matrix, full_range);
+        yuv_444p_n_to_rgb_or_rgba_row::<BITS, true, false>(
+            y, u, v, rgba_out, width, matrix, full_range, None,
+        );
+    }
+}
+
+/// SSE4.1 YUVA 4:4:4 planar 9/10/12/14-bit → packed **8-bit RGBA** with
+/// the per-pixel alpha byte **sourced from `a_src`** (depth-converted
+/// via `>> (BITS - 8)`) instead of being constant `0xFF`. Same
+/// numerical contract as [`yuv_444p_n_to_rgba_row`] for R/G/B.
+///
+/// Thin wrapper over [`yuv_444p_n_to_rgb_or_rgba_row`] with
+/// `ALPHA = true, ALPHA_SRC = true`.
+///
+/// # Safety
+///
+/// Same as [`yuv_444p_n_to_rgba_row`] plus `a_src.len() >= width`.
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[allow(clippy::too_many_arguments)]
+pub(crate) unsafe fn yuv_444p_n_to_rgba_with_alpha_src_row<const BITS: u32>(
+    y: &[u16],
+    u: &[u16],
+    v: &[u16],
+    a_src: &[u16],
+    rgba_out: &mut [u8],
+    width: usize,
+    matrix: ColorMatrix,
+    full_range: bool,
+) {
+    // SAFETY: caller obligations forwarded to the shared impl.
+    unsafe {
+        yuv_444p_n_to_rgb_or_rgba_row::<BITS, true, true>(
+            y,
+            u,
+            v,
+            rgba_out,
+            width,
+            matrix,
+            full_range,
+            Some(a_src),
+        );
     }
 }
 
 /// Shared SSE4.1 high-bit-depth YUV 4:4:4 kernel for
-/// [`yuv_444p_n_to_rgb_row`] (`ALPHA = false`, `write_rgb_16`) and
-/// [`yuv_444p_n_to_rgba_row`] (`ALPHA = true`, `write_rgba_16` with
-/// constant `0xFF` alpha vector).
+/// [`yuv_444p_n_to_rgb_row`] (`ALPHA = false, ALPHA_SRC = false`,
+/// `write_rgb_16`), [`yuv_444p_n_to_rgba_row`] (`ALPHA = true,
+/// ALPHA_SRC = false`, `write_rgba_16` with constant `0xFF` alpha) and
+/// [`yuv_444p_n_to_rgba_with_alpha_src_row`] (`ALPHA = true,
+/// ALPHA_SRC = true`, `write_rgba_16` with the alpha lane loaded and
+/// depth-converted from `a_src`).
 ///
 /// # Safety
 ///
 /// 1. **SSE4.1 must be available on the current CPU.**
 /// 2. `y.len() >= width`, `u.len() >= width`, `v.len() >= width`,
 ///    `out.len() >= width * if ALPHA { 4 } else { 3 }`.
-/// 3. `BITS` must be one of `{9, 10, 12, 14}`.
+/// 3. When `ALPHA_SRC = true`: `a_src` must be `Some(_)` and
+///    `a_src.unwrap().len() >= width`.
+/// 4. `BITS` must be one of `{9, 10, 12, 14}`.
 #[inline]
 #[target_feature(enable = "sse4.1")]
-pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_row<const BITS: u32, const ALPHA: bool>(
+#[allow(clippy::too_many_arguments)]
+pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_row<
+    const BITS: u32,
+    const ALPHA: bool,
+    const ALPHA_SRC: bool,
+>(
     y: &[u16],
     u: &[u16],
     v: &[u16],
@@ -1162,13 +1217,20 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_row<
     width: usize,
     matrix: ColorMatrix,
     full_range: bool,
+    a_src: Option<&[u16]>,
 ) {
     const { assert!(BITS == 9 || BITS == 10 || BITS == 12 || BITS == 14) };
+    // Source alpha requires RGBA output — there is no 3 bpp store with
+    // alpha to put it in.
+    const { assert!(!ALPHA_SRC || ALPHA) };
     let bpp: usize = if ALPHA { 4 } else { 3 };
     debug_assert!(y.len() >= width);
     debug_assert!(u.len() >= width);
     debug_assert!(v.len() >= width);
     debug_assert!(out.len() >= width * bpp);
+    if ALPHA_SRC {
+        debug_assert!(a_src.as_ref().is_some_and(|s| s.len() >= width));
+    }
 
     let coeffs = scalar::Coefficients::for_matrix(matrix);
     let (y_off, y_scale, c_scale) = scalar::range_params_n::<BITS>(full_range);
@@ -1248,7 +1310,25 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_row<
             if ALPHA {
-                write_rgba_16(r_u8, g_u8, b_u8, alpha_u8, out.as_mut_ptr().add(x * 4));
+                let a_u8 = if ALPHA_SRC {
+                    let a_ptr = a_src.unwrap().as_ptr().add(x);
+                    let a_lo = _mm_loadu_si128(a_ptr.cast());
+                    let a_hi = _mm_loadu_si128(a_ptr.add(8).cast());
+                    // `_mm_srli_epi16::<IMM8>` requires a literal const generic
+                    // shift, so use `_mm_srl_epi16` with a count vector built
+                    // from `BITS - 8`.
+                    let a_shr = _mm_cvtsi32_si128((BITS - 8) as i32);
+                    let a_lo_shifted = _mm_srl_epi16(a_lo, a_shr);
+                    let a_hi_shifted = _mm_srl_epi16(a_hi, a_shr);
+                    _mm_packus_epi16(a_lo_shifted, a_hi_shifted)
+                } else {
+                    alpha_u8
+                };
+                write_rgba_16(r_u8, g_u8, b_u8, a_u8, out.as_mut_ptr().add(x * 4));
             } else {
                 write_rgb_16(r_u8, g_u8, b_u8, out.as_mut_ptr().add(x * 3));
             }
@@ -1262,7 +1342,13 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_row<
         let tail_v = &v[x..];
         let tail_out = &mut out[x * bpp..];
         let tail_w = width - x;
-        if ALPHA {
+        if ALPHA_SRC {
+            let tail_a = &a_src.unwrap()[x..];
+            scalar::yuv_444p_n_to_rgba_with_alpha_src_row::<BITS>(
+                tail_y, tail_u, tail_v, tail_a, tail_out, tail_w, matrix, full_range,
+            );
+        } else if ALPHA {
             scalar::yuv_444p_n_to_rgba_row::<BITS>(
                 tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range,
             );
diff --git a/src/row/arch/x86_sse41/tests.rs b/src/row/arch/x86_sse41/tests.rs
index 253925b..d02ecb1 100644
--- a/src/row/arch/x86_sse41/tests.rs
+++ b/src/row/arch/x86_sse41/tests.rs
@@ -2375,6 +2375,155 @@ fn sse41_p416_rgba_matches_scalar_all_matrices() {
     }
 }
 
+// ---- YUVA 4:4:4 u8 RGBA equivalence (Ship 8b‑1b) --------------------
+//
+// Mirrors the no-alpha 4:4:4 RGBA pattern above for the alpha-source
+// path: per-pixel alpha byte is loaded from the source plane, masked
+// with `bits_mask::<10>()`, and depth-converted via `>> 2`. Pseudo-
+// random alpha is used to flush out lane-order corruption that a
+// solid-alpha buffer would mask.
+
+fn check_yuv444p_n_u8_sse41_rgba_with_alpha_src_equivalence<const BITS: u32>(
+    width: usize,
+    matrix: ColorMatrix,
+    full_range: bool,
+    alpha_seed: usize,
+) {
+    let y = planar_n_plane::<BITS>(width, 37);
+    let u = planar_n_plane::<BITS>(width, 53);
+    let v = planar_n_plane::<BITS>(width, 71);
+    let a_src = planar_n_plane::<BITS>(width, alpha_seed);
+    let mut rgba_scalar = std::vec![0u8; width * 4];
+    let mut rgba_simd = std::vec![0u8; width * 4];
+    scalar::yuv_444p_n_to_rgba_with_alpha_src_row::<BITS>(
+        &y,
+        &u,
+        &v,
+        &a_src,
+        &mut rgba_scalar,
+        width,
+        matrix,
+        full_range,
+    );
+    unsafe {
+        yuv_444p_n_to_rgba_with_alpha_src_row::<BITS>(
+            &y,
+            &u,
+            &v,
+            &a_src,
+            &mut rgba_simd,
+            width,
+            matrix,
+            full_range,
+        );
+    }
+    assert_eq!(
+        rgba_scalar, rgba_simd,
+        "SSE4.1 Yuva444p<{BITS}> → RGBA u8 diverges (width={width}, matrix={matrix:?}, full_range={full_range}, alpha_seed={alpha_seed})"
+    );
+}
+
+#[test]
+fn sse41_yuva444p10_rgba_matches_scalar_all_matrices_16() {
+    if !std::arch::is_x86_feature_detected!("sse4.1") {
+        return;
+    }
+    for m in [
+        ColorMatrix::Bt601,
+        ColorMatrix::Bt709,
+        ColorMatrix::Bt2020Ncl,
+        ColorMatrix::Smpte240m,
+        ColorMatrix::Fcc,
+        ColorMatrix::YCgCo,
+    ] {
+        for full in [true, false] {
+            check_yuv444p_n_u8_sse41_rgba_with_alpha_src_equivalence::<10>(16, m, full, 89);
+        }
+    }
+}
+
+#[test]
+fn sse41_yuva444p10_rgba_matches_scalar_widths() {
+    if !std::arch::is_x86_feature_detected!("sse4.1") {
+        return;
+    }
+    // Natural width + tail widths forcing scalar-tail dispatch.
+    for w in [16usize, 17, 31, 47, 63, 1920, 1922] {
+        check_yuv444p_n_u8_sse41_rgba_with_alpha_src_equivalence::<10>(w, ColorMatrix::Bt709, true, 89);
+    }
+}
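Every equivalence check above feeds the kernels deterministic pseudo-random planes through the test helper `planar_n_plane`, which this patch uses but does not define. A hypothetical reconstruction of what would satisfy the call sites (not the crate's helper; only determinism and in-range values matter):

```rust
/// Hypothetical sketch of the test-plane generator: a deterministic,
/// seed-dependent sequence of BITS-wide samples. Any reproducible
/// generator works for the scalar/SIMD equivalence checks.
fn planar_n_plane<const BITS: u32>(width: usize, seed: usize) -> Vec<u16> {
    let mask = ((1u32 << BITS) - 1) as u16;
    (0..width)
        .map(|i| {
            // Cheap multiplicative mix of index and seed.
            let x = i.wrapping_mul(2654435761).wrapping_add(seed.wrapping_mul(97));
            (x as u16) & mask
        })
        .collect()
}
```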
+
+#[test]
+fn sse41_yuva444p10_rgba_matches_scalar_random_alpha() {
+    if !std::arch::is_x86_feature_detected!("sse4.1") {
+        return;
+    }
+    // Different alpha seeds — `_mm_packus_epi16` lane order through
+    // `write_rgba_16` must put alpha in the 4th channel, not collide
+    // with R/G/B.
+    for seed in [13usize, 41, 89, 127, 211] {
+        check_yuv444p_n_u8_sse41_rgba_with_alpha_src_equivalence::<10>(
+            16,
+            ColorMatrix::Bt601,
+            false,
+            seed,
+        );
+        check_yuv444p_n_u8_sse41_rgba_with_alpha_src_equivalence::<10>(
+            31,
+            ColorMatrix::Bt2020Ncl,
+            true,
+            seed,
+        );
+    }
+}
+
+#[test]
+fn sse41_yuva444p_n_rgba_matches_scalar_all_bits() {
+    if !std::arch::is_x86_feature_detected!("sse4.1") {
+        return;
+    }
+    // BITS = 9, 12, 14 (BITS = 10 covered above with full matrix sweep).
+    // Confirms `_mm_srl_epi16` with count `(BITS - 8)` resolves
+    // correctly across the supported bit depths.
+    for full in [true, false] {
+        check_yuv444p_n_u8_sse41_rgba_with_alpha_src_equivalence::<9>(16, ColorMatrix::Bt601, full, 53);
+        check_yuv444p_n_u8_sse41_rgba_with_alpha_src_equivalence::<12>(
+            16,
+            ColorMatrix::Bt709,
+            full,
+            53,
+        );
+        check_yuv444p_n_u8_sse41_rgba_with_alpha_src_equivalence::<14>(
+            16,
+            ColorMatrix::Bt2020Ncl,
+            full,
+            53,
+        );
+    }
+}
+
+#[test]
+fn sse41_yuva444p_n_rgba_matches_scalar_all_bits_widths() {
+    if !std::arch::is_x86_feature_detected!("sse4.1") {
+        return;
+    }
+    for w in [17usize, 47, 1922] {
+        check_yuv444p_n_u8_sse41_rgba_with_alpha_src_equivalence::<9>(
+            w,
+            ColorMatrix::Smpte240m,
+            false,
+            89,
+        );
+        check_yuv444p_n_u8_sse41_rgba_with_alpha_src_equivalence::<12>(w, ColorMatrix::Fcc, true, 89);
+        check_yuv444p_n_u8_sse41_rgba_with_alpha_src_equivalence::<14>(
+            w,
+            ColorMatrix::YCgCo,
+            false,
+            89,
+        );
+    }
+}
+
 // ---- High-bit 4:4:4 native-depth `u16` RGBA equivalence (Ship 8 Tranche 7c) ----
 
 fn check_yuv444p_n_u16_sse41_rgba_equivalence(
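For callers, the public dispatcher that fronts all of these per-arch routes keeps the shape of the existing 4:4:4 family plus an alpha plane. A usage sketch against the API shown in this patch (the parameter order is inferred from the internal dispatch call and the crate/module path is not shown here, so treat both as assumptions and check the real signature):

```rust
/// Illustrative only: convert one 10-bit YUVA 4:4:4 row to RGBA8.
/// Assumes `yuva444p10_to_rgba_row` and `ColorMatrix` are in scope.
fn convert_row(y: &[u16], u: &[u16], v: &[u16], a: &[u16], width: usize) -> Vec<u8> {
    let mut rgba = vec![0u8; width * 4];
    yuva444p10_to_rgba_row(
        y, u, v, a,
        &mut rgba,
        width,
        ColorMatrix::Bt709,
        false, // limited range
        true,  // use_simd: take the per-arch route when available
    );
    rgba
}
```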
diff --git a/src/row/mod.rs b/src/row/mod.rs
index d74a7ca..bd104f7 100644
--- a/src/row/mod.rs
+++ b/src/row/mod.rs
@@ -5004,21 +5004,14 @@ pub fn yuv444p16_to_rgba_u16_row(
     scalar::yuv_444p16_to_rgba_u16_row(y, u, v, rgba_out, width, matrix, full_range);
 }
 
-// ---- YUVA 4:4:4 RGBA dispatchers (Ship 8b‑1a prep) --------------------
+// ---- YUVA 4:4:4 RGBA dispatchers --------------------------------------
 //
 // Per-row dispatchers for the YUVA source family (currently Yuva444p10
-// only). The `use_simd` parameter is accepted for API parity with the
-// rest of the dispatcher family but is ignored in this PR — SIMD per-arch
-// routes for the alpha-source path land in:
-//
-// - Ship 8b‑1b: u8 RGBA SIMD on neon / avx512 / avx2 / sse41 /
-//   wasm_simd128.
-// - Ship 8b‑1c: u16 RGBA SIMD on the same backends.
-//
-// Until then the dispatcher routes unconditionally to the scalar path
-// (`scalar::yuv_444p_n_to_rgba_with_alpha_src_row::<10>` / its u16
-// counterpart), which already monomorphizes optimally — the
-// `ALPHA_SRC = true` branch is the only live arm in the per-pixel store.
+// only). The u8 RGBA dispatcher routes through the per-arch
+// `yuv_444p_n_to_rgba_with_alpha_src_row` SIMD wrappers, mirroring the
+// `yuv444p10_to_rgba_row` dispatcher's pattern. The u16 RGBA
+// dispatcher (`yuva444p10_to_rgba_u16_row`) stays scalar until SIMD
+// wiring lands in **Ship 8b‑1c**.
 
 /// Converts one row of **10-bit** YUVA 4:4:4 to packed **8-bit**
 /// **RGBA**. R / G / B are produced by the same Q15 i32 kernel family
@@ -5026,15 +5019,8 @@ pub fn yuv444p16_to_rgba_u16_row(
 /// **sourced from `a`** (depth-converted via `a >> 2` to fit `u8`)
 /// instead of being constant `0xFF`.
 ///
-/// # ⚠ Scalar-only as of Ship 8b‑1a
-///
-/// `use_simd` is accepted for forward-compatible API parity with the
-/// rest of the dispatcher family **but is ignored in this PR**. Every
-/// invocation runs the scalar reference regardless of the flag — SIMD
-/// wiring lands in **Ship 8b‑1b**. Throughput on 4:4:4 + alpha is
-/// substantially below the 4:4:4-no-alpha SIMD path until then;
-/// callers benchmarking the alpha-source path should re-measure once
-/// 8b‑1b lands. See the section comment above for staging context.
+/// `use_simd = false` forces the scalar reference path; otherwise
+/// per-arch dispatch matches [`yuv444p10_to_rgba_row`]'s pattern.
 #[cfg_attr(not(tarpaulin), inline(always))]
 #[allow(clippy::too_many_arguments)]
 pub fn yuva444p10_to_rgba_row(
@@ -5055,7 +5041,63 @@ pub fn yuva444p10_to_rgba_row(
     assert!(a.len() >= width, "a row too short");
     assert!(rgba_out.len() >= rgba_min, "rgba_out row too short");
 
-    let _ = use_simd; // SIMD per-arch routes land in Ship 8b‑1b PR.
+    if use_simd {
+        cfg_select! {
+            target_arch = "aarch64" => {
+                if neon_available() {
+                    // SAFETY: NEON verified.
+                    unsafe {
+                        arch::neon::yuv_444p_n_to_rgba_with_alpha_src_row::<10>(
+                            y, u, v, a, rgba_out, width, matrix, full_range,
+                        );
+                    }
+                    return;
+                }
+            },
+            target_arch = "x86_64" => {
+                if avx512_available() {
+                    // SAFETY: AVX‑512BW verified.
+                    unsafe {
+                        arch::x86_avx512::yuv_444p_n_to_rgba_with_alpha_src_row::<10>(
+                            y, u, v, a, rgba_out, width, matrix, full_range,
+                        );
+                    }
+                    return;
+                }
+                if avx2_available() {
+                    // SAFETY: AVX2 verified.
+                    unsafe {
+                        arch::x86_avx2::yuv_444p_n_to_rgba_with_alpha_src_row::<10>(
+                            y, u, v, a, rgba_out, width, matrix, full_range,
+                        );
+                    }
+                    return;
+                }
+                if sse41_available() {
+                    // SAFETY: SSE4.1 verified.
+                    unsafe {
+                        arch::x86_sse41::yuv_444p_n_to_rgba_with_alpha_src_row::<10>(
+                            y, u, v, a, rgba_out, width, matrix, full_range,
+                        );
+                    }
+                    return;
+                }
+            },
+            target_arch = "wasm32" => {
+                if simd128_available() {
+                    // SAFETY: simd128 compile‑time verified.
+                    unsafe {
+                        arch::wasm_simd128::yuv_444p_n_to_rgba_with_alpha_src_row::<10>(
+                            y, u, v, a, rgba_out, width, matrix, full_range,
+                        );
+                    }
+                    return;
+                }
+            },
+            _ => {}
+        }
+    }
     scalar::yuv_444p_n_to_rgba_with_alpha_src_row::<10>(
         y, u, v, a, rgba_out, width, matrix, full_range,
     );
diff --git a/src/sinker/mixed/yuva_4_4_4.rs b/src/sinker/mixed/yuva_4_4_4.rs
index 620e139..23af88d 100644
--- a/src/sinker/mixed/yuva_4_4_4.rs
+++ b/src/sinker/mixed/yuva_4_4_4.rs
@@ -28,12 +28,6 @@ impl<'a> MixedSinker<'a, Yuva444p10> {
     /// kernel family used by [`MixedSinker::with_rgba`]; the
     /// per-pixel alpha byte is **sourced from the alpha plane**
     /// (depth-converted via `a >> 2` to fit `u8`) — not constant `0xFF`.
-    ///
-    /// **Performance note (Ship 8b‑1a):** the alpha-source path runs
-    /// scalar regardless of `with_simd(true)` until SIMD wiring lands in
-    /// **Ship 8b‑1b**. The non-alpha 4:4:4 paths
-    /// (`MixedSinker`) already have native SIMD — only the
-    /// alpha-aware fork is staged.
     #[cfg_attr(not(tarpaulin), inline(always))]
     pub fn with_rgba(mut self, buf: &'a mut [u8]) -> Result {
         self.set_rgba(buf)?;
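As a quick numeric check of the `a >> 2` contract referenced in the dispatcher and sinker docs: 10-bit alpha spans 0..=1023, and dropping the two low bits maps it onto 0..=255 with both endpoints preserved.

```rust
fn main() {
    assert_eq!(1023u16 >> 2, 255); // full opacity stays full
    assert_eq!(0u16 >> 2, 0);      // full transparency stays zero
    assert_eq!(512u16 >> 2, 128);  // midpoint lands near mid-alpha
}
```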