From ea749e5b42a71415ffa3b19902ff6749b100b0c0 Mon Sep 17 00:00:00 2001 From: uqio <276879906+uqio@users.noreply.github.com> Date: Mon, 27 Apr 2026 23:03:24 +1200 Subject: [PATCH 1/2] update --- src/row/arch/neon.rs | 110 ++++++++++++++++--- src/row/arch/neon/tests.rs | 140 +++++++++++++++++++++++++ src/row/arch/wasm_simd128.rs | 112 +++++++++++++++++--- src/row/arch/wasm_simd128/tests.rs | 149 ++++++++++++++++++++++++++ src/row/arch/x86_avx2.rs | 122 ++++++++++++++++++--- src/row/arch/x86_avx2/tests.rs | 152 +++++++++++++++++++++++++++ src/row/arch/x86_avx512.rs | 140 +++++++++++++++++++++---- src/row/arch/x86_avx512/tests.rs | 163 +++++++++++++++++++++++++++++ src/row/arch/x86_sse41.rs | 117 +++++++++++++++++---- src/row/arch/x86_sse41/tests.rs | 159 ++++++++++++++++++++++++++++ src/row/mod.rs | 79 +++++++++++--- src/sinker/mixed/yuva_4_4_4.rs | 5 - 12 files changed, 1350 insertions(+), 98 deletions(-) diff --git a/src/row/arch/neon.rs b/src/row/arch/neon.rs index 3bb4011..f62c91c 100644 --- a/src/row/arch/neon.rs +++ b/src/row/arch/neon.rs @@ -983,7 +983,8 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_row< /// NEON YUV 4:4:4 planar high-bit-depth → **native-depth u16** RGB. /// Const-generic over `BITS ∈ {9, 10, 12, 14}`. /// -/// Thin wrapper over [`yuv_444p_n_to_rgb_or_rgba_u16_row`] with `ALPHA = false`. +/// Thin wrapper over [`yuv_444p_n_to_rgb_or_rgba_u16_row`] with +/// `ALPHA = false, ALPHA_SRC = false`. /// /// # Safety /// @@ -1001,7 +1002,9 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_u16_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - yuv_444p_n_to_rgb_or_rgba_u16_row::(y, u, v, rgb_out, width, matrix, full_range); + yuv_444p_n_to_rgb_or_rgba_u16_row::( + y, u, v, rgb_out, width, matrix, full_range, None, + ); } } @@ -1009,7 +1012,8 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_u16_row( /// output. Alpha samples are `(1 << BITS) - 1` (opaque maximum at the /// input bit depth) — matches `scalar::yuv_444p_n_to_rgba_u16_row`. /// -/// Thin wrapper over [`yuv_444p_n_to_rgb_or_rgba_u16_row`] with `ALPHA = true`. +/// Thin wrapper over [`yuv_444p_n_to_rgb_or_rgba_u16_row`] with +/// `ALPHA = true, ALPHA_SRC = false`. /// /// # Safety /// @@ -1028,24 +1032,78 @@ pub(crate) unsafe fn yuv_444p_n_to_rgba_u16_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - yuv_444p_n_to_rgb_or_rgba_u16_row::(y, u, v, rgba_out, width, matrix, full_range); + yuv_444p_n_to_rgb_or_rgba_u16_row::( + y, u, v, rgba_out, width, matrix, full_range, None, + ); } } -/// Shared NEON high-bit YUV 4:4:4 → native-depth `u16` kernel. -/// `ALPHA = false` writes RGB triples via `vst3q_u16`; `ALPHA = true` -/// writes RGBA quads via `vst4q_u16` with constant alpha -/// `(1 << BITS) - 1`. +/// NEON YUVA 4:4:4 planar high-bit-depth → **native-depth `u16`** +/// packed RGBA with the per-pixel alpha element **sourced from +/// `a_src`** (already at the source's native bit depth — no depth +/// conversion) instead of being the opaque maximum `(1 << BITS) - 1`. +/// Same numerical contract as [`yuv_444p_n_to_rgba_u16_row`] for R/G/B. +/// +/// Thin wrapper over [`yuv_444p_n_to_rgb_or_rgba_u16_row`] with +/// `ALPHA = true, ALPHA_SRC = true`. +/// +/// # Safety +/// +/// Same as [`yuv_444p_n_to_rgba_u16_row`] plus `a_src.len() >= width`. 
+#[inline] +#[target_feature(enable = "neon")] +#[allow(clippy::too_many_arguments)] +pub(crate) unsafe fn yuv_444p_n_to_rgba_u16_with_alpha_src_row( + y: &[u16], + u: &[u16], + v: &[u16], + a_src: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + yuv_444p_n_to_rgb_or_rgba_u16_row::( + y, + u, + v, + rgba_out, + width, + matrix, + full_range, + Some(a_src), + ); + } +} + +/// Shared NEON high-bit YUV 4:4:4 → native-depth `u16` kernel for +/// [`yuv_444p_n_to_rgb_u16_row`] (`ALPHA = false, ALPHA_SRC = false`, +/// `vst3q_u16`), [`yuv_444p_n_to_rgba_u16_row`] (`ALPHA = true, +/// ALPHA_SRC = false`, `vst4q_u16` with constant alpha +/// `(1 << BITS) - 1`) and [`yuv_444p_n_to_rgba_u16_with_alpha_src_row`] +/// (`ALPHA = true, ALPHA_SRC = true`, `vst4q_u16` with the alpha lane +/// loaded from `a_src` and masked to native bit depth — no shift since +/// both the source alpha and the u16 output element are at the same +/// native bit depth). /// /// # Safety /// /// 1. **NEON must be available.** /// 2. `y.len() >= width`, `u.len() >= width`, `v.len() >= width`, /// `out.len() >= width * if ALPHA { 4 } else { 3 }`. -/// 3. `BITS` ∈ `{9, 10, 12, 14}`. +/// 3. When `ALPHA_SRC = true`: `a_src` must be `Some(_)` and +/// `a_src.unwrap().len() >= width`. +/// 4. `BITS` ∈ `{9, 10, 12, 14}`. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_u16_row( +#[allow(clippy::too_many_arguments)] +pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_u16_row< + const BITS: u32, + const ALPHA: bool, + const ALPHA_SRC: bool, +>( y: &[u16], u: &[u16], v: &[u16], @@ -1053,16 +1111,23 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_u16_row, ) { // Compile-time guard — `out_max = ((1 << BITS) - 1) as i16` below // silently wraps to -1 at BITS=16, corrupting the u16 clamp. The // dedicated 16-bit u16-output path is `yuv_444p16_to_rgb_u16_row`. const { assert!(BITS == 9 || BITS == 10 || BITS == 12 || BITS == 14) }; + // Source alpha requires RGBA output — there is no 3 bpp store with + // alpha to put it in. 
+ const { assert!(!ALPHA_SRC || ALPHA) }; let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert!(y.len() >= width); debug_assert!(u.len() >= width); debug_assert!(v.len() >= width); debug_assert!(out.len() >= width * bpp); + if ALPHA_SRC { + debug_assert!(a_src.as_ref().is_some_and(|s| s.len() >= width)); + } let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); @@ -1140,8 +1205,21 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_u16_row( + tail_y, tail_u, tail_v, tail_a, tail_out, tail_w, matrix, full_range, + ); + } else if ALPHA { scalar::yuv_444p_n_to_rgba_u16_row::( tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, ); diff --git a/src/row/arch/neon/tests.rs b/src/row/arch/neon/tests.rs index 2355c27..9ba7b46 100644 --- a/src/row/arch/neon/tests.rs +++ b/src/row/arch/neon/tests.rs @@ -2864,3 +2864,143 @@ fn neon_p416_rgba_u16_matches_scalar_all_matrices() { check_p_n_444_16_u16_neon_rgba_equivalence(w, ColorMatrix::Bt709, false); } } + +// ---- YUVA 4:4:4 native-depth `u16` RGBA equivalence (Ship 8b‑1c) ---- +// +// Mirrors the u8 RGBA alpha-source tests above for the u16 output +// path: per-pixel alpha element is loaded from the source plane, +// AND-masked with `bits_mask::<10>()`, and stored at native depth (no +// `>> (BITS - 8)` since both source alpha and output element are at +// the same bit depth). Pseudo-random alpha flushes lane-order +// corruption that a solid-alpha buffer would mask. + +fn check_yuv444p_n_u16_neon_rgba_with_alpha_src_equivalence( + width: usize, + matrix: ColorMatrix, + full_range: bool, + alpha_seed: usize, +) { + let y = planar_n_plane::(width, 37); + let u = planar_n_plane::(width, 53); + let v = planar_n_plane::(width, 71); + let a_src = planar_n_plane::(width, alpha_seed); + let mut rgba_scalar = std::vec![0u16; width * 4]; + let mut rgba_neon = std::vec![0u16; width * 4]; + scalar::yuv_444p_n_to_rgba_u16_with_alpha_src_row::( + &y, + &u, + &v, + &a_src, + &mut rgba_scalar, + width, + matrix, + full_range, + ); + unsafe { + yuv_444p_n_to_rgba_u16_with_alpha_src_row::( + &y, + &u, + &v, + &a_src, + &mut rgba_neon, + width, + matrix, + full_range, + ); + } + assert_eq!( + rgba_scalar, rgba_neon, + "NEON Yuva444p<{BITS}> → RGBA u16 diverges (width={width}, matrix={matrix:?}, full_range={full_range}, alpha_seed={alpha_seed})" + ); +} + +#[test] +#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] +fn neon_yuva444p10_rgba_u16_matches_scalar_all_matrices_16() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_yuv444p_n_u16_neon_rgba_with_alpha_src_equivalence::<10>(16, m, full, 89); + } + } +} + +#[test] +#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] +fn neon_yuva444p10_rgba_u16_matches_scalar_widths() { + // Natural width + tail widths forcing scalar-tail dispatch. + for w in [16usize, 17, 31, 47, 63, 1920, 1922] { + check_yuv444p_n_u16_neon_rgba_with_alpha_src_equivalence::<10>(w, ColorMatrix::Bt709, true, 89); + } +} + +#[test] +#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] +fn neon_yuva444p10_rgba_u16_matches_scalar_random_alpha() { + // Different alpha seeds — ensures the alpha lane order through + // `vst4q_u16` is not confused with R/G/B. 
+ for seed in [13usize, 41, 89, 127, 211] { + check_yuv444p_n_u16_neon_rgba_with_alpha_src_equivalence::<10>( + 16, + ColorMatrix::Bt601, + false, + seed, + ); + check_yuv444p_n_u16_neon_rgba_with_alpha_src_equivalence::<10>( + 31, + ColorMatrix::Bt2020Ncl, + true, + seed, + ); + } +} + +#[test] +#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] +fn neon_yuva444p_n_rgba_u16_matches_scalar_all_bits() { + // BITS = 9, 12, 14 (BITS = 10 covered above). Confirms the + // AND-mask `mask_v` resolves correctly across the supported bit + // depths (no shift count to vary in the u16 path). + for full in [true, false] { + check_yuv444p_n_u16_neon_rgba_with_alpha_src_equivalence::<9>(16, ColorMatrix::Bt601, full, 53); + check_yuv444p_n_u16_neon_rgba_with_alpha_src_equivalence::<12>( + 16, + ColorMatrix::Bt709, + full, + 53, + ); + check_yuv444p_n_u16_neon_rgba_with_alpha_src_equivalence::<14>( + 16, + ColorMatrix::Bt2020Ncl, + full, + 53, + ); + } +} + +#[test] +#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] +fn neon_yuva444p_n_rgba_u16_matches_scalar_all_bits_widths() { + // BITS = 9, 12, 14 across tail widths. + for w in [17usize, 47, 1922] { + check_yuv444p_n_u16_neon_rgba_with_alpha_src_equivalence::<9>( + w, + ColorMatrix::Smpte240m, + false, + 89, + ); + check_yuv444p_n_u16_neon_rgba_with_alpha_src_equivalence::<12>(w, ColorMatrix::Fcc, true, 89); + check_yuv444p_n_u16_neon_rgba_with_alpha_src_equivalence::<14>( + w, + ColorMatrix::YCgCo, + false, + 89, + ); + } +} diff --git a/src/row/arch/wasm_simd128.rs b/src/row/arch/wasm_simd128.rs index 34dd85f..6f55315 100644 --- a/src/row/arch/wasm_simd128.rs +++ b/src/row/arch/wasm_simd128.rs @@ -912,7 +912,8 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_row< /// WASM simd128 YUV 4:4:4 planar 9/10/12/14-bit → **native-depth u16** RGB. /// Const-generic over `BITS ∈ {9, 10, 12, 14}`. 16 pixels per iter. /// -/// Thin wrapper over [`yuv_444p_n_to_rgb_or_rgba_u16_row`] with `ALPHA = false`. +/// Thin wrapper over [`yuv_444p_n_to_rgb_or_rgba_u16_row`] with +/// `ALPHA = false, ALPHA_SRC = false`. /// /// # Safety /// @@ -930,7 +931,9 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_u16_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - yuv_444p_n_to_rgb_or_rgba_u16_row::(y, u, v, rgb_out, width, matrix, full_range); + yuv_444p_n_to_rgb_or_rgba_u16_row::( + y, u, v, rgb_out, width, matrix, full_range, None, + ); } } @@ -938,6 +941,9 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_u16_row( /// `u16` output. Alpha samples are `(1 << BITS) - 1` (opaque maximum /// at the input bit depth). /// +/// Thin wrapper over [`yuv_444p_n_to_rgb_or_rgba_u16_row`] with +/// `ALPHA = true, ALPHA_SRC = false`. +/// /// # Safety /// /// Same as [`yuv_444p_n_to_rgb_u16_row`] but `rgba_out.len() >= 4 * width`. @@ -954,24 +960,80 @@ pub(crate) unsafe fn yuv_444p_n_to_rgba_u16_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - yuv_444p_n_to_rgb_or_rgba_u16_row::(y, u, v, rgba_out, width, matrix, full_range); + yuv_444p_n_to_rgb_or_rgba_u16_row::( + y, u, v, rgba_out, width, matrix, full_range, None, + ); } } -/// Shared WASM simd128 high-bit YUV 4:4:4 → native-depth `u16` kernel. -/// `ALPHA = false` writes RGB triples via `write_rgb_u16_8`; -/// `ALPHA = true` writes RGBA quads via `write_rgba_u16_8` with -/// constant alpha `(1 << BITS) - 1`. 
+/// WASM simd128 YUVA 4:4:4 planar 9/10/12/14-bit → **native-depth +/// `u16`** packed RGBA with the per-pixel alpha element **sourced +/// from `a_src`** (already at the source's native bit depth — no +/// depth conversion) instead of being the opaque maximum +/// `(1 << BITS) - 1`. Same numerical contract as +/// [`yuv_444p_n_to_rgba_u16_row`] for R/G/B. +/// +/// Thin wrapper over [`yuv_444p_n_to_rgb_or_rgba_u16_row`] with +/// `ALPHA = true, ALPHA_SRC = true`. +/// +/// # Safety +/// +/// Same as [`yuv_444p_n_to_rgba_u16_row`] plus `a_src.len() >= width`. +#[inline] +#[target_feature(enable = "simd128")] +#[allow(clippy::too_many_arguments)] +pub(crate) unsafe fn yuv_444p_n_to_rgba_u16_with_alpha_src_row( + y: &[u16], + u: &[u16], + v: &[u16], + a_src: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + yuv_444p_n_to_rgb_or_rgba_u16_row::( + y, + u, + v, + rgba_out, + width, + matrix, + full_range, + Some(a_src), + ); + } +} + +/// Shared WASM simd128 high-bit YUV 4:4:4 → native-depth `u16` kernel +/// for [`yuv_444p_n_to_rgb_u16_row`] (`ALPHA = false, ALPHA_SRC = false`, +/// `write_rgb_u16_8`), [`yuv_444p_n_to_rgba_u16_row`] (`ALPHA = true, +/// ALPHA_SRC = false`, `write_rgba_u16_8` with constant alpha +/// `(1 << BITS) - 1`) and +/// [`yuv_444p_n_to_rgba_u16_with_alpha_src_row`] (`ALPHA = true, +/// ALPHA_SRC = true`, `write_rgba_u16_8` with the alpha lane loaded +/// from `a_src` and masked to native bit depth — no shift since both +/// the source alpha and the u16 output element are at the same native +/// bit depth). /// /// # Safety /// /// 1. **simd128 must be enabled at compile time.** /// 2. `y.len() >= width`, `u.len() >= width`, `v.len() >= width`, /// `out.len() >= width * if ALPHA { 4 } else { 3 }`. -/// 3. `BITS` ∈ `{9, 10, 12, 14}`. +/// 3. When `ALPHA_SRC = true`: `a_src` must be `Some(_)` and +/// `a_src.unwrap().len() >= width`. +/// 4. `BITS` ∈ `{9, 10, 12, 14}`. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_u16_row( +#[allow(clippy::too_many_arguments)] +pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_u16_row< + const BITS: u32, + const ALPHA: bool, + const ALPHA_SRC: bool, +>( y: &[u16], u: &[u16], v: &[u16], @@ -979,13 +1041,20 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_u16_row, ) { const { assert!(BITS == 9 || BITS == 10 || BITS == 12 || BITS == 14) }; + // Source alpha requires RGBA output — there is no 3 bpp store with + // alpha to put it in. 
+ const { assert!(!ALPHA_SRC || ALPHA) }; let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert!(y.len() >= width); debug_assert!(u.len() >= width); debug_assert!(v.len() >= width); debug_assert!(out.len() >= width * bpp); + if ALPHA_SRC { + debug_assert!(a_src.as_ref().is_some_and(|s| s.len() >= width)); + } let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); @@ -1060,9 +1129,22 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_u16_row( + tail_y, tail_u, tail_v, tail_a, tail_out, tail_w, matrix, full_range, + ); + } else if ALPHA { scalar::yuv_444p_n_to_rgba_u16_row::( tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, ); diff --git a/src/row/arch/wasm_simd128/tests.rs b/src/row/arch/wasm_simd128/tests.rs index 21e4377..8c53f5b 100644 --- a/src/row/arch/wasm_simd128/tests.rs +++ b/src/row/arch/wasm_simd128/tests.rs @@ -2413,3 +2413,152 @@ fn simd128_p416_rgba_u16_matches_scalar_all_matrices() { check_p_n_444_16_u16_simd128_rgba_equivalence(w, ColorMatrix::Bt709, false); } } + +// ---- YUVA 4:4:4 native-depth `u16` RGBA equivalence (Ship 8b‑1c) ---- +// +// Mirrors the u8 RGBA alpha-source tests above for the u16 output +// path: per-pixel alpha element is loaded from the source plane, +// AND-masked with `bits_mask::<10>()`, and stored at native depth (no +// `>> (BITS - 8)` since both source alpha and output element are at +// the same bit depth). 16 px per iter → two `v128_load`s of 8 alpha +// u16 each, fed straight into `write_rgba_u16_8`. + +fn check_yuv444p_n_u16_simd128_rgba_with_alpha_src_equivalence( + width: usize, + matrix: ColorMatrix, + full_range: bool, + alpha_seed: usize, +) { + let y = planar_n_plane::(width, 37); + let u = planar_n_plane::(width, 53); + let v = planar_n_plane::(width, 71); + let a_src = planar_n_plane::(width, alpha_seed); + let mut rgba_scalar = std::vec![0u16; width * 4]; + let mut rgba_simd = std::vec![0u16; width * 4]; + scalar::yuv_444p_n_to_rgba_u16_with_alpha_src_row::( + &y, + &u, + &v, + &a_src, + &mut rgba_scalar, + width, + matrix, + full_range, + ); + unsafe { + yuv_444p_n_to_rgba_u16_with_alpha_src_row::( + &y, + &u, + &v, + &a_src, + &mut rgba_simd, + width, + matrix, + full_range, + ); + } + assert_eq!( + rgba_scalar, rgba_simd, + "WASM simd128 Yuva444p<{BITS}> → RGBA u16 diverges (width={width}, matrix={matrix:?}, full_range={full_range}, alpha_seed={alpha_seed})" + ); +} + +#[test] +fn simd128_yuva444p10_rgba_u16_matches_scalar_all_matrices_16() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_yuv444p_n_u16_simd128_rgba_with_alpha_src_equivalence::<10>(16, m, full, 89); + } + } +} + +#[test] +fn simd128_yuva444p10_rgba_u16_matches_scalar_widths() { + // Natural width + tail widths forcing scalar-tail dispatch. + for w in [16usize, 17, 31, 47, 63, 1920, 1922] { + check_yuv444p_n_u16_simd128_rgba_with_alpha_src_equivalence::<10>( + w, + ColorMatrix::Bt709, + true, + 89, + ); + } +} + +#[test] +fn simd128_yuva444p10_rgba_u16_matches_scalar_random_alpha() { + // Different alpha seeds — `write_rgba_u16_8` lane order must put + // alpha in the 4th channel, not collide with R/G/B. 
+ for seed in [13usize, 41, 89, 127, 211] { + check_yuv444p_n_u16_simd128_rgba_with_alpha_src_equivalence::<10>( + 16, + ColorMatrix::Bt601, + false, + seed, + ); + check_yuv444p_n_u16_simd128_rgba_with_alpha_src_equivalence::<10>( + 31, + ColorMatrix::Bt2020Ncl, + true, + seed, + ); + } +} + +#[test] +fn simd128_yuva444p_n_rgba_u16_matches_scalar_all_bits() { + // BITS = 9, 12, 14 (BITS = 10 covered above). Confirms the + // AND-mask `mask_v` resolves correctly across the supported bit + // depths. + for full in [true, false] { + check_yuv444p_n_u16_simd128_rgba_with_alpha_src_equivalence::<9>( + 16, + ColorMatrix::Bt601, + full, + 53, + ); + check_yuv444p_n_u16_simd128_rgba_with_alpha_src_equivalence::<12>( + 16, + ColorMatrix::Bt709, + full, + 53, + ); + check_yuv444p_n_u16_simd128_rgba_with_alpha_src_equivalence::<14>( + 16, + ColorMatrix::Bt2020Ncl, + full, + 53, + ); + } +} + +#[test] +fn simd128_yuva444p_n_rgba_u16_matches_scalar_all_bits_widths() { + for w in [17usize, 47, 1922] { + check_yuv444p_n_u16_simd128_rgba_with_alpha_src_equivalence::<9>( + w, + ColorMatrix::Smpte240m, + false, + 89, + ); + check_yuv444p_n_u16_simd128_rgba_with_alpha_src_equivalence::<12>( + w, + ColorMatrix::Fcc, + true, + 89, + ); + check_yuv444p_n_u16_simd128_rgba_with_alpha_src_equivalence::<14>( + w, + ColorMatrix::YCgCo, + false, + 89, + ); + } +} diff --git a/src/row/arch/x86_avx2.rs b/src/row/arch/x86_avx2.rs index 8ba3785..61de0ef 100644 --- a/src/row/arch/x86_avx2.rs +++ b/src/row/arch/x86_avx2.rs @@ -1049,7 +1049,8 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_row< /// AVX2 YUV 4:4:4 planar 9/10/12/14-bit → **native-depth u16** RGB. /// Const-generic over `BITS ∈ {9, 10, 12, 14}`. 32 pixels per iter. /// -/// Thin wrapper over [`yuv_444p_n_to_rgb_or_rgba_u16_row`] with `ALPHA = false`. +/// Thin wrapper over [`yuv_444p_n_to_rgb_or_rgba_u16_row`] with +/// `ALPHA = false, ALPHA_SRC = false`. /// /// # Safety /// @@ -1067,7 +1068,9 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_u16_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - yuv_444p_n_to_rgb_or_rgba_u16_row::(y, u, v, rgb_out, width, matrix, full_range); + yuv_444p_n_to_rgb_or_rgba_u16_row::( + y, u, v, rgb_out, width, matrix, full_range, None, + ); } } @@ -1075,7 +1078,8 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_u16_row( /// output. Alpha samples are `(1 << BITS) - 1` (opaque maximum at the /// input bit depth). /// -/// Thin wrapper over [`yuv_444p_n_to_rgb_or_rgba_u16_row`] with `ALPHA = true`. +/// Thin wrapper over [`yuv_444p_n_to_rgb_or_rgba_u16_row`] with +/// `ALPHA = true, ALPHA_SRC = false`. /// /// # Safety /// @@ -1093,24 +1097,79 @@ pub(crate) unsafe fn yuv_444p_n_to_rgba_u16_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - yuv_444p_n_to_rgb_or_rgba_u16_row::(y, u, v, rgba_out, width, matrix, full_range); + yuv_444p_n_to_rgb_or_rgba_u16_row::( + y, u, v, rgba_out, width, matrix, full_range, None, + ); } } -/// Shared AVX2 high-bit YUV 4:4:4 → native-depth `u16` kernel. -/// `ALPHA = false` writes RGB triples via 4× `write_rgb_u16_8`; -/// `ALPHA = true` writes RGBA quads via 4× `write_rgba_u16_8` with -/// constant alpha `(1 << BITS) - 1`. +/// AVX2 YUVA 4:4:4 planar 9/10/12/14-bit → **native-depth `u16`** +/// packed RGBA with the per-pixel alpha element **sourced from +/// `a_src`** (already at the source's native bit depth — no depth +/// conversion) instead of being the opaque maximum `(1 << BITS) - 1`. 
+/// Same numerical contract as [`yuv_444p_n_to_rgba_u16_row`] for R/G/B. +/// +/// Thin wrapper over [`yuv_444p_n_to_rgb_or_rgba_u16_row`] with +/// `ALPHA = true, ALPHA_SRC = true`. +/// +/// # Safety +/// +/// Same as [`yuv_444p_n_to_rgba_u16_row`] plus `a_src.len() >= width`. +#[inline] +#[target_feature(enable = "avx2")] +#[allow(clippy::too_many_arguments)] +pub(crate) unsafe fn yuv_444p_n_to_rgba_u16_with_alpha_src_row( + y: &[u16], + u: &[u16], + v: &[u16], + a_src: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + yuv_444p_n_to_rgb_or_rgba_u16_row::( + y, + u, + v, + rgba_out, + width, + matrix, + full_range, + Some(a_src), + ); + } +} + +/// Shared AVX2 high-bit YUV 4:4:4 → native-depth `u16` kernel for +/// [`yuv_444p_n_to_rgb_u16_row`] (`ALPHA = false, ALPHA_SRC = false`, +/// 4× `write_rgb_u16_8`), [`yuv_444p_n_to_rgba_u16_row`] (`ALPHA = true, +/// ALPHA_SRC = false`, 4× `write_rgba_u16_8` with constant alpha +/// `(1 << BITS) - 1`) and +/// [`yuv_444p_n_to_rgba_u16_with_alpha_src_row`] (`ALPHA = true, +/// ALPHA_SRC = true`, 4× `write_rgba_u16_8` with the alpha lane loaded +/// from `a_src` and masked to native bit depth — no shift since both +/// the source alpha and the u16 output element are at the same native +/// bit depth). /// /// # Safety /// /// 1. **AVX2 must be available on the current CPU.** /// 2. `y.len() >= width`, `u.len() >= width`, `v.len() >= width`, /// `out.len() >= width * if ALPHA { 4 } else { 3 }`. -/// 3. `BITS` ∈ `{9, 10, 12, 14}`. +/// 3. When `ALPHA_SRC = true`: `a_src` must be `Some(_)` and +/// `a_src.unwrap().len() >= width`. +/// 4. `BITS` ∈ `{9, 10, 12, 14}`. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_u16_row( +#[allow(clippy::too_many_arguments)] +pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_u16_row< + const BITS: u32, + const ALPHA: bool, + const ALPHA_SRC: bool, +>( y: &[u16], u: &[u16], v: &[u16], @@ -1118,13 +1177,20 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_u16_row, ) { const { assert!(BITS == 9 || BITS == 10 || BITS == 12 || BITS == 14) }; + // Source alpha requires RGBA output — there is no 3 bpp store with + // alpha to put it in. 
+ const { assert!(!ALPHA_SRC || ALPHA) }; let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert!(y.len() >= width); debug_assert!(u.len() >= width); debug_assert!(v.len() >= width); debug_assert!(out.len() >= width * bpp); + if ALPHA_SRC { + debug_assert!(a_src.as_ref().is_some_and(|s| s.len() >= width)); + } let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); @@ -1223,33 +1289,53 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_u16_row(a_lo_v), + _mm256_castsi256_si128(a_hi_v), + _mm256_extracti128_si256::<1>(a_hi_v), + ) + } else { + (alpha_u16, alpha_u16, alpha_u16, alpha_u16) + }; let dst = out.as_mut_ptr().add(x * 4); write_rgba_u16_8( _mm256_castsi256_si128(r_lo), _mm256_castsi256_si128(g_lo), _mm256_castsi256_si128(b_lo), - alpha_u16, + a_lo_q0, dst, ); write_rgba_u16_8( _mm256_extracti128_si256::<1>(r_lo), _mm256_extracti128_si256::<1>(g_lo), _mm256_extracti128_si256::<1>(b_lo), - alpha_u16, + a_lo_q1, dst.add(32), ); write_rgba_u16_8( _mm256_castsi256_si128(r_hi), _mm256_castsi256_si128(g_hi), _mm256_castsi256_si128(b_hi), - alpha_u16, + a_hi_q0, dst.add(64), ); write_rgba_u16_8( _mm256_extracti128_si256::<1>(r_hi), _mm256_extracti128_si256::<1>(g_hi), _mm256_extracti128_si256::<1>(b_hi), - alpha_u16, + a_hi_q1, dst.add(96), ); } else { @@ -1289,7 +1375,13 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_u16_row( + tail_y, tail_u, tail_v, tail_a, tail_out, tail_w, matrix, full_range, + ); + } else if ALPHA { scalar::yuv_444p_n_to_rgba_u16_row::( tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, ); diff --git a/src/row/arch/x86_avx2/tests.rs b/src/row/arch/x86_avx2/tests.rs index d464ee4..6a029c5 100644 --- a/src/row/arch/x86_avx2/tests.rs +++ b/src/row/arch/x86_avx2/tests.rs @@ -2668,3 +2668,155 @@ fn avx2_p416_rgba_u16_matches_scalar_all_matrices() { check_p_n_444_16_u16_avx2_rgba_equivalence(w, ColorMatrix::Bt709, false); } } + +// ---- YUVA 4:4:4 native-depth `u16` RGBA equivalence (Ship 8b‑1c) ---- +// +// Mirrors the u8 RGBA alpha-source tests above for the u16 output +// path: per-pixel alpha element is loaded from the source plane, +// AND-masked with `bits_mask::<10>()`, and stored at native depth (no +// `>> (BITS - 8)` since both source alpha and output element are at +// the same bit depth). 32 px per iter → 16 alpha u16 per `__m256i` +// load × 2 halves; per-half splits into two `__m128i` quarters fed to +// the four `write_rgba_u16_8` calls per iter. 
+ +fn check_yuv444p_n_u16_avx2_rgba_with_alpha_src_equivalence( + width: usize, + matrix: ColorMatrix, + full_range: bool, + alpha_seed: usize, +) { + let y = planar_n_plane::(width, 37); + let u = planar_n_plane::(width, 53); + let v = planar_n_plane::(width, 71); + let a_src = planar_n_plane::(width, alpha_seed); + let mut rgba_scalar = std::vec![0u16; width * 4]; + let mut rgba_simd = std::vec![0u16; width * 4]; + scalar::yuv_444p_n_to_rgba_u16_with_alpha_src_row::( + &y, + &u, + &v, + &a_src, + &mut rgba_scalar, + width, + matrix, + full_range, + ); + unsafe { + yuv_444p_n_to_rgba_u16_with_alpha_src_row::( + &y, + &u, + &v, + &a_src, + &mut rgba_simd, + width, + matrix, + full_range, + ); + } + assert_eq!( + rgba_scalar, rgba_simd, + "AVX2 Yuva444p<{BITS}> → RGBA u16 diverges (width={width}, matrix={matrix:?}, full_range={full_range}, alpha_seed={alpha_seed})" + ); +} + +#[test] +fn avx2_yuva444p10_rgba_u16_matches_scalar_all_matrices_32() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_yuv444p_n_u16_avx2_rgba_with_alpha_src_equivalence::<10>(32, m, full, 89); + } + } +} + +#[test] +fn avx2_yuva444p10_rgba_u16_matches_scalar_widths() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + // Natural width + tail widths forcing scalar-tail dispatch. + for w in [32usize, 17, 31, 47, 63, 1920, 1922] { + check_yuv444p_n_u16_avx2_rgba_with_alpha_src_equivalence::<10>(w, ColorMatrix::Bt709, true, 89); + } +} + +#[test] +fn avx2_yuva444p10_rgba_u16_matches_scalar_random_alpha() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + // Different alpha seeds — the 256-bit alpha load splits into two + // 128-bit quarters via `_mm256_castsi256_si128` / + // `_mm256_extracti128_si256::<1>`; the lane order through + // `write_rgba_u16_8` must put alpha in the 4th channel. + for seed in [13usize, 41, 89, 127, 211] { + check_yuv444p_n_u16_avx2_rgba_with_alpha_src_equivalence::<10>( + 32, + ColorMatrix::Bt601, + false, + seed, + ); + check_yuv444p_n_u16_avx2_rgba_with_alpha_src_equivalence::<10>( + 63, + ColorMatrix::Bt2020Ncl, + true, + seed, + ); + } +} + +#[test] +fn avx2_yuva444p_n_rgba_u16_matches_scalar_all_bits() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + // BITS = 9, 12, 14 (BITS = 10 covered above). Confirms the + // AND-mask `mask_v` resolves correctly across the supported bit + // depths. 
+ for full in [true, false] { + check_yuv444p_n_u16_avx2_rgba_with_alpha_src_equivalence::<9>(32, ColorMatrix::Bt601, full, 53); + check_yuv444p_n_u16_avx2_rgba_with_alpha_src_equivalence::<12>( + 32, + ColorMatrix::Bt709, + full, + 53, + ); + check_yuv444p_n_u16_avx2_rgba_with_alpha_src_equivalence::<14>( + 32, + ColorMatrix::Bt2020Ncl, + full, + 53, + ); + } +} + +#[test] +fn avx2_yuva444p_n_rgba_u16_matches_scalar_all_bits_widths() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + for w in [33usize, 47, 1922] { + check_yuv444p_n_u16_avx2_rgba_with_alpha_src_equivalence::<9>( + w, + ColorMatrix::Smpte240m, + false, + 89, + ); + check_yuv444p_n_u16_avx2_rgba_with_alpha_src_equivalence::<12>(w, ColorMatrix::Fcc, true, 89); + check_yuv444p_n_u16_avx2_rgba_with_alpha_src_equivalence::<14>( + w, + ColorMatrix::YCgCo, + false, + 89, + ); + } +} diff --git a/src/row/arch/x86_avx512.rs b/src/row/arch/x86_avx512.rs index 147bc7a..effb52d 100644 --- a/src/row/arch/x86_avx512.rs +++ b/src/row/arch/x86_avx512.rs @@ -1134,7 +1134,8 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_row< /// AVX-512 YUV 4:4:4 planar 9/10/12/14-bit → **native-depth u16** RGB. /// Const-generic over `BITS ∈ {9, 10, 12, 14}`. 64 pixels per iter. /// -/// Thin wrapper over [`yuv_444p_n_to_rgb_or_rgba_u16_row`] with `ALPHA = false`. +/// Thin wrapper over [`yuv_444p_n_to_rgb_or_rgba_u16_row`] with +/// `ALPHA = false, ALPHA_SRC = false`. /// /// # Safety /// @@ -1152,14 +1153,17 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_u16_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - yuv_444p_n_to_rgb_or_rgba_u16_row::(y, u, v, rgb_out, width, matrix, full_range); + yuv_444p_n_to_rgb_or_rgba_u16_row::( + y, u, v, rgb_out, width, matrix, full_range, None, + ); } } /// AVX-512 sibling of [`yuv_444p_n_to_rgba_row`] for native-depth `u16` /// output. Alpha samples are `(1 << BITS) - 1`. /// -/// Thin wrapper over [`yuv_444p_n_to_rgb_or_rgba_u16_row`] with `ALPHA = true`. +/// Thin wrapper over [`yuv_444p_n_to_rgb_or_rgba_u16_row`] with +/// `ALPHA = true, ALPHA_SRC = false`. /// /// # Safety /// @@ -1177,24 +1181,79 @@ pub(crate) unsafe fn yuv_444p_n_to_rgba_u16_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - yuv_444p_n_to_rgb_or_rgba_u16_row::(y, u, v, rgba_out, width, matrix, full_range); + yuv_444p_n_to_rgb_or_rgba_u16_row::( + y, u, v, rgba_out, width, matrix, full_range, None, + ); } } -/// Shared AVX-512 high-bit YUV 4:4:4 → native-depth `u16` kernel. -/// `ALPHA = false` writes RGB triples via 8× `write_quarter`; -/// `ALPHA = true` writes RGBA quads via 8× `write_quarter_rgba` with -/// constant alpha `(1 << BITS) - 1`. +/// AVX-512 YUVA 4:4:4 planar 9/10/12/14-bit → **native-depth `u16`** +/// packed RGBA with the per-pixel alpha element **sourced from +/// `a_src`** (already at the source's native bit depth — no depth +/// conversion) instead of being the opaque maximum `(1 << BITS) - 1`. +/// Same numerical contract as [`yuv_444p_n_to_rgba_u16_row`] for R/G/B. +/// +/// Thin wrapper over [`yuv_444p_n_to_rgb_or_rgba_u16_row`] with +/// `ALPHA = true, ALPHA_SRC = true`. +/// +/// # Safety +/// +/// Same as [`yuv_444p_n_to_rgba_u16_row`] plus `a_src.len() >= width`. 
+#[inline] +#[target_feature(enable = "avx512f,avx512bw")] +#[allow(clippy::too_many_arguments)] +pub(crate) unsafe fn yuv_444p_n_to_rgba_u16_with_alpha_src_row( + y: &[u16], + u: &[u16], + v: &[u16], + a_src: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + yuv_444p_n_to_rgb_or_rgba_u16_row::( + y, + u, + v, + rgba_out, + width, + matrix, + full_range, + Some(a_src), + ); + } +} + +/// Shared AVX-512 high-bit YUV 4:4:4 → native-depth `u16` kernel for +/// [`yuv_444p_n_to_rgb_u16_row`] (`ALPHA = false, ALPHA_SRC = false`, +/// 8× `write_quarter`), [`yuv_444p_n_to_rgba_u16_row`] (`ALPHA = true, +/// ALPHA_SRC = false`, 8× `write_quarter_rgba` with constant alpha +/// `(1 << BITS) - 1`) and +/// [`yuv_444p_n_to_rgba_u16_with_alpha_src_row`] (`ALPHA = true, +/// ALPHA_SRC = true`, 8× `write_quarter_rgba` with the alpha quarters +/// loaded from `a_src` and masked to native bit depth — no shift since +/// both the source alpha and the u16 output element are at the same +/// native bit depth). /// /// # Safety /// /// 1. **AVX-512F + AVX-512BW must be available.** /// 2. `y.len() >= width`, `u.len() >= width`, `v.len() >= width`, /// `out.len() >= width * if ALPHA { 4 } else { 3 }`. -/// 3. `BITS` ∈ `{9, 10, 12, 14}`. +/// 3. When `ALPHA_SRC = true`: `a_src` must be `Some(_)` and +/// `a_src.unwrap().len() >= width`. +/// 4. `BITS` ∈ `{9, 10, 12, 14}`. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_u16_row( +#[allow(clippy::too_many_arguments)] +pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_u16_row< + const BITS: u32, + const ALPHA: bool, + const ALPHA_SRC: bool, +>( y: &[u16], u: &[u16], v: &[u16], @@ -1202,13 +1261,20 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_u16_row, ) { const { assert!(BITS == 9 || BITS == 10 || BITS == 12 || BITS == 14) }; + // Source alpha requires RGBA output — there is no 3 bpp store with + // alpha to put it in. 
+ const { assert!(!ALPHA_SRC || ALPHA) }; let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert!(y.len() >= width); debug_assert!(u.len() >= width); debug_assert!(v.len() >= width); debug_assert!(out.len() >= width * bpp); + if ALPHA_SRC { + debug_assert!(a_src.as_ref().is_some_and(|s| s.len() >= width)); + } let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); @@ -1321,15 +1387,45 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_u16_row(a_lo_v), + _mm512_extracti32x4_epi32::<1>(a_lo_v), + _mm512_extracti32x4_epi32::<2>(a_lo_v), + _mm512_extracti32x4_epi32::<3>(a_lo_v), + _mm512_extracti32x4_epi32::<0>(a_hi_v), + _mm512_extracti32x4_epi32::<1>(a_hi_v), + _mm512_extracti32x4_epi32::<2>(a_hi_v), + _mm512_extracti32x4_epi32::<3>(a_hi_v), + ) + } else { + ( + alpha_u16, alpha_u16, alpha_u16, alpha_u16, alpha_u16, alpha_u16, alpha_u16, alpha_u16, + ) + }; let dst = out.as_mut_ptr().add(x * 4); - write_quarter_rgba(r_lo, g_lo, b_lo, alpha_u16, 0, dst); - write_quarter_rgba(r_lo, g_lo, b_lo, alpha_u16, 1, dst.add(32)); - write_quarter_rgba(r_lo, g_lo, b_lo, alpha_u16, 2, dst.add(64)); - write_quarter_rgba(r_lo, g_lo, b_lo, alpha_u16, 3, dst.add(96)); - write_quarter_rgba(r_hi, g_hi, b_hi, alpha_u16, 0, dst.add(128)); - write_quarter_rgba(r_hi, g_hi, b_hi, alpha_u16, 1, dst.add(160)); - write_quarter_rgba(r_hi, g_hi, b_hi, alpha_u16, 2, dst.add(192)); - write_quarter_rgba(r_hi, g_hi, b_hi, alpha_u16, 3, dst.add(224)); + write_quarter_rgba(r_lo, g_lo, b_lo, a_lo_q0, 0, dst); + write_quarter_rgba(r_lo, g_lo, b_lo, a_lo_q1, 1, dst.add(32)); + write_quarter_rgba(r_lo, g_lo, b_lo, a_lo_q2, 2, dst.add(64)); + write_quarter_rgba(r_lo, g_lo, b_lo, a_lo_q3, 3, dst.add(96)); + write_quarter_rgba(r_hi, g_hi, b_hi, a_hi_q0, 0, dst.add(128)); + write_quarter_rgba(r_hi, g_hi, b_hi, a_hi_q1, 1, dst.add(160)); + write_quarter_rgba(r_hi, g_hi, b_hi, a_hi_q2, 2, dst.add(192)); + write_quarter_rgba(r_hi, g_hi, b_hi, a_hi_q3, 3, dst.add(224)); } else { let dst = out.as_mut_ptr().add(x * 3); write_quarter(r_lo, g_lo, b_lo, 0, dst); @@ -1351,7 +1447,13 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_u16_row( + tail_y, tail_u, tail_v, tail_a, tail_out, tail_w, matrix, full_range, + ); + } else if ALPHA { scalar::yuv_444p_n_to_rgba_u16_row::( tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, ); diff --git a/src/row/arch/x86_avx512/tests.rs b/src/row/arch/x86_avx512/tests.rs index bba9c0f..d04388a 100644 --- a/src/row/arch/x86_avx512/tests.rs +++ b/src/row/arch/x86_avx512/tests.rs @@ -2720,3 +2720,166 @@ fn avx512_p416_rgba_u16_matches_scalar_all_matrices() { check_p_n_444_16_u16_avx512_rgba_equivalence(w, ColorMatrix::Bt709, false); } } + +// ---- YUVA 4:4:4 native-depth `u16` RGBA equivalence (Ship 8b‑1c) ---- +// +// Mirrors the u8 RGBA alpha-source tests above for the u16 output +// path: per-pixel alpha element is loaded from the source plane, +// AND-masked with `bits_mask::<10>()`, and stored at native depth (no +// `>> (BITS - 8)` since both source alpha and output element are at +// the same bit depth). 64 px per iter → 32 alpha u16 per `__m512i` +// load × 2 halves; per-half splits into four `__m128i` quarters via +// `_mm512_extracti32x4_epi32::<0..3>` fed to the eight +// `write_quarter_rgba` calls per iter. 
+ +fn check_yuv444p_n_u16_avx512_rgba_with_alpha_src_equivalence( + width: usize, + matrix: ColorMatrix, + full_range: bool, + alpha_seed: usize, +) { + let y = planar_n_plane::(width, 37); + let u = planar_n_plane::(width, 53); + let v = planar_n_plane::(width, 71); + let a_src = planar_n_plane::(width, alpha_seed); + let mut rgba_scalar = std::vec![0u16; width * 4]; + let mut rgba_simd = std::vec![0u16; width * 4]; + scalar::yuv_444p_n_to_rgba_u16_with_alpha_src_row::( + &y, + &u, + &v, + &a_src, + &mut rgba_scalar, + width, + matrix, + full_range, + ); + unsafe { + yuv_444p_n_to_rgba_u16_with_alpha_src_row::( + &y, + &u, + &v, + &a_src, + &mut rgba_simd, + width, + matrix, + full_range, + ); + } + assert_eq!( + rgba_scalar, rgba_simd, + "AVX-512 Yuva444p<{BITS}> → RGBA u16 diverges (width={width}, matrix={matrix:?}, full_range={full_range}, alpha_seed={alpha_seed})" + ); +} + +#[test] +fn avx512_yuva444p10_rgba_u16_matches_scalar_all_matrices_64() { + if !std::arch::is_x86_feature_detected!("avx512bw") { + return; + } + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_yuv444p_n_u16_avx512_rgba_with_alpha_src_equivalence::<10>(64, m, full, 89); + } + } +} + +#[test] +fn avx512_yuva444p10_rgba_u16_matches_scalar_widths() { + if !std::arch::is_x86_feature_detected!("avx512bw") { + return; + } + // Natural width + tail widths forcing scalar-tail dispatch. + for w in [64usize, 17, 31, 47, 63, 1920, 1922] { + check_yuv444p_n_u16_avx512_rgba_with_alpha_src_equivalence::<10>( + w, + ColorMatrix::Bt709, + true, + 89, + ); + } +} + +#[test] +fn avx512_yuva444p10_rgba_u16_matches_scalar_random_alpha() { + if !std::arch::is_x86_feature_detected!("avx512bw") { + return; + } + // Different alpha seeds — the 512-bit alpha load splits into four + // 128-bit quarters via `_mm512_extracti32x4_epi32::<0..3>`; each + // quarter feeds `write_quarter_rgba`, which routes the alpha lane + // into the 4th channel of the RGBA output. + for seed in [13usize, 41, 89, 127, 211] { + check_yuv444p_n_u16_avx512_rgba_with_alpha_src_equivalence::<10>( + 64, + ColorMatrix::Bt601, + false, + seed, + ); + check_yuv444p_n_u16_avx512_rgba_with_alpha_src_equivalence::<10>( + 127, + ColorMatrix::Bt2020Ncl, + true, + seed, + ); + } +} + +#[test] +fn avx512_yuva444p_n_rgba_u16_matches_scalar_all_bits() { + if !std::arch::is_x86_feature_detected!("avx512bw") { + return; + } + // BITS = 9, 12, 14 (BITS = 10 covered above). Confirms the + // AND-mask `mask_v` resolves correctly across the supported bit + // depths. 
+ for full in [true, false] { + check_yuv444p_n_u16_avx512_rgba_with_alpha_src_equivalence::<9>( + 64, + ColorMatrix::Bt601, + full, + 53, + ); + check_yuv444p_n_u16_avx512_rgba_with_alpha_src_equivalence::<12>( + 64, + ColorMatrix::Bt709, + full, + 53, + ); + check_yuv444p_n_u16_avx512_rgba_with_alpha_src_equivalence::<14>( + 64, + ColorMatrix::Bt2020Ncl, + full, + 53, + ); + } +} + +#[test] +fn avx512_yuva444p_n_rgba_u16_matches_scalar_all_bits_widths() { + if !std::arch::is_x86_feature_detected!("avx512bw") { + return; + } + for w in [65usize, 95, 1922] { + check_yuv444p_n_u16_avx512_rgba_with_alpha_src_equivalence::<9>( + w, + ColorMatrix::Smpte240m, + false, + 89, + ); + check_yuv444p_n_u16_avx512_rgba_with_alpha_src_equivalence::<12>(w, ColorMatrix::Fcc, true, 89); + check_yuv444p_n_u16_avx512_rgba_with_alpha_src_equivalence::<14>( + w, + ColorMatrix::YCgCo, + false, + 89, + ); + } +} diff --git a/src/row/arch/x86_sse41.rs b/src/row/arch/x86_sse41.rs index fda5824..2397a66 100644 --- a/src/row/arch/x86_sse41.rs +++ b/src/row/arch/x86_sse41.rs @@ -1364,7 +1364,8 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_row< /// SSE4.1 YUV 4:4:4 planar 9/10/12/14-bit → **native-depth u16** RGB. /// Const-generic over `BITS ∈ {9, 10, 12, 14}`. /// -/// Thin wrapper over [`yuv_444p_n_to_rgb_or_rgba_u16_row`] with `ALPHA = false`. +/// Thin wrapper over [`yuv_444p_n_to_rgb_or_rgba_u16_row`] with +/// `ALPHA = false, ALPHA_SRC = false`. /// /// # Safety /// @@ -1382,7 +1383,9 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_u16_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - yuv_444p_n_to_rgb_or_rgba_u16_row::(y, u, v, rgb_out, width, matrix, full_range); + yuv_444p_n_to_rgb_or_rgba_u16_row::( + y, u, v, rgb_out, width, matrix, full_range, None, + ); } } @@ -1390,7 +1393,8 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_u16_row( /// output. Alpha samples are `(1 << BITS) - 1` (opaque maximum at the /// input bit depth). /// -/// Thin wrapper over [`yuv_444p_n_to_rgb_or_rgba_u16_row`] with `ALPHA = true`. +/// Thin wrapper over [`yuv_444p_n_to_rgb_or_rgba_u16_row`] with +/// `ALPHA = true, ALPHA_SRC = false`. /// /// # Safety /// @@ -1408,24 +1412,79 @@ pub(crate) unsafe fn yuv_444p_n_to_rgba_u16_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - yuv_444p_n_to_rgb_or_rgba_u16_row::(y, u, v, rgba_out, width, matrix, full_range); + yuv_444p_n_to_rgb_or_rgba_u16_row::( + y, u, v, rgba_out, width, matrix, full_range, None, + ); } } -/// Shared SSE4.1 high-bit YUV 4:4:4 → native-depth `u16` kernel. -/// `ALPHA = false` writes RGB triples via `write_rgb_u16_8`; -/// `ALPHA = true` writes RGBA quads via `write_rgba_u16_8` with -/// constant alpha `(1 << BITS) - 1`. +/// SSE4.1 YUVA 4:4:4 planar 9/10/12/14-bit → **native-depth `u16`** +/// packed RGBA with the per-pixel alpha element **sourced from +/// `a_src`** (already at the source's native bit depth — no depth +/// conversion) instead of being the opaque maximum `(1 << BITS) - 1`. +/// Same numerical contract as [`yuv_444p_n_to_rgba_u16_row`] for R/G/B. +/// +/// Thin wrapper over [`yuv_444p_n_to_rgb_or_rgba_u16_row`] with +/// `ALPHA = true, ALPHA_SRC = true`. +/// +/// # Safety +/// +/// Same as [`yuv_444p_n_to_rgba_u16_row`] plus `a_src.len() >= width`. 
+#[inline] +#[target_feature(enable = "sse4.1")] +#[allow(clippy::too_many_arguments)] +pub(crate) unsafe fn yuv_444p_n_to_rgba_u16_with_alpha_src_row( + y: &[u16], + u: &[u16], + v: &[u16], + a_src: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + yuv_444p_n_to_rgb_or_rgba_u16_row::( + y, + u, + v, + rgba_out, + width, + matrix, + full_range, + Some(a_src), + ); + } +} + +/// Shared SSE4.1 high-bit YUV 4:4:4 → native-depth `u16` kernel for +/// [`yuv_444p_n_to_rgb_u16_row`] (`ALPHA = false, ALPHA_SRC = false`, +/// `write_rgb_u16_8`), [`yuv_444p_n_to_rgba_u16_row`] (`ALPHA = true, +/// ALPHA_SRC = false`, `write_rgba_u16_8` with constant alpha +/// `(1 << BITS) - 1`) and +/// [`yuv_444p_n_to_rgba_u16_with_alpha_src_row`] (`ALPHA = true, +/// ALPHA_SRC = true`, `write_rgba_u16_8` with the alpha lane loaded +/// from `a_src` and masked to native bit depth — no shift since both +/// the source alpha and the u16 output element are at the same native +/// bit depth). /// /// # Safety /// /// 1. **SSE4.1 must be available on the current CPU.** /// 2. `y.len() >= width`, `u.len() >= width`, `v.len() >= width`, /// `out.len() >= width * if ALPHA { 4 } else { 3 }`. -/// 3. `BITS` ∈ `{9, 10, 12, 14}`. +/// 3. When `ALPHA_SRC = true`: `a_src` must be `Some(_)` and +/// `a_src.unwrap().len() >= width`. +/// 4. `BITS` ∈ `{9, 10, 12, 14}`. #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_u16_row( +#[allow(clippy::too_many_arguments)] +pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_u16_row< + const BITS: u32, + const ALPHA: bool, + const ALPHA_SRC: bool, +>( y: &[u16], u: &[u16], v: &[u16], @@ -1433,16 +1492,23 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_u16_row, ) { // Compile-time guard — `out_max = ((1 << BITS) - 1) as i16` below // silently wraps to -1 at BITS=16, corrupting the u16 clamp. The // dedicated 16-bit u16-output path is `yuv_444p16_to_rgb_u16_row`. const { assert!(BITS == 9 || BITS == 10 || BITS == 12 || BITS == 14) }; + // Source alpha requires RGBA output — there is no 3 bpp store with + // alpha to put it in. 
+ const { assert!(!ALPHA_SRC || ALPHA) }; let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert!(y.len() >= width); debug_assert!(u.len() >= width); debug_assert!(v.len() >= width); debug_assert!(out.len() >= width * bpp); + if ALPHA_SRC { + debug_assert!(a_src.as_ref().is_some_and(|s| s.len() >= width)); + } let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); @@ -1517,14 +1583,21 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_u16_row( + tail_y, tail_u, tail_v, tail_a, tail_out, tail_w, matrix, full_range, + ); + } else if ALPHA { scalar::yuv_444p_n_to_rgba_u16_row::( tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, ); diff --git a/src/row/arch/x86_sse41/tests.rs b/src/row/arch/x86_sse41/tests.rs index d02ecb1..7c47891 100644 --- a/src/row/arch/x86_sse41/tests.rs +++ b/src/row/arch/x86_sse41/tests.rs @@ -2721,3 +2721,162 @@ fn sse41_p416_rgba_u16_matches_scalar_all_matrices() { check_p_n_444_16_u16_sse41_rgba_equivalence(w, ColorMatrix::Bt709, false); } } + +// ---- YUVA 4:4:4 native-depth `u16` RGBA equivalence (Ship 8b‑1c) ---- +// +// Mirrors the u8 RGBA alpha-source tests above for the u16 output +// path: per-pixel alpha element is loaded from the source plane, +// AND-masked with `bits_mask::<10>()`, and stored at native depth (no +// `>> (BITS - 8)` since both source alpha and output element are at +// the same bit depth). Pseudo-random alpha flushes lane-order +// corruption that a solid-alpha buffer would mask. + +fn check_yuv444p_n_u16_sse41_rgba_with_alpha_src_equivalence( + width: usize, + matrix: ColorMatrix, + full_range: bool, + alpha_seed: usize, +) { + let y = planar_n_plane::(width, 37); + let u = planar_n_plane::(width, 53); + let v = planar_n_plane::(width, 71); + let a_src = planar_n_plane::(width, alpha_seed); + let mut rgba_scalar = std::vec![0u16; width * 4]; + let mut rgba_simd = std::vec![0u16; width * 4]; + scalar::yuv_444p_n_to_rgba_u16_with_alpha_src_row::( + &y, + &u, + &v, + &a_src, + &mut rgba_scalar, + width, + matrix, + full_range, + ); + unsafe { + yuv_444p_n_to_rgba_u16_with_alpha_src_row::( + &y, + &u, + &v, + &a_src, + &mut rgba_simd, + width, + matrix, + full_range, + ); + } + assert_eq!( + rgba_scalar, rgba_simd, + "SSE4.1 Yuva444p<{BITS}> → RGBA u16 diverges (width={width}, matrix={matrix:?}, full_range={full_range}, alpha_seed={alpha_seed})" + ); +} + +#[test] +fn sse41_yuva444p10_rgba_u16_matches_scalar_all_matrices_16() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_yuv444p_n_u16_sse41_rgba_with_alpha_src_equivalence::<10>(16, m, full, 89); + } + } +} + +#[test] +fn sse41_yuva444p10_rgba_u16_matches_scalar_widths() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + // Natural width + tail widths forcing scalar-tail dispatch. + for w in [16usize, 17, 31, 47, 63, 1920, 1922] { + check_yuv444p_n_u16_sse41_rgba_with_alpha_src_equivalence::<10>( + w, + ColorMatrix::Bt709, + true, + 89, + ); + } +} + +#[test] +fn sse41_yuva444p10_rgba_u16_matches_scalar_random_alpha() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + // Different alpha seeds — `write_rgba_u16_8` lane order must put + // alpha in the 4th channel, not collide with R/G/B. 
+ for seed in [13usize, 41, 89, 127, 211] { + check_yuv444p_n_u16_sse41_rgba_with_alpha_src_equivalence::<10>( + 16, + ColorMatrix::Bt601, + false, + seed, + ); + check_yuv444p_n_u16_sse41_rgba_with_alpha_src_equivalence::<10>( + 31, + ColorMatrix::Bt2020Ncl, + true, + seed, + ); + } +} + +#[test] +fn sse41_yuva444p_n_rgba_u16_matches_scalar_all_bits() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + // BITS = 9, 12, 14 (BITS = 10 covered above). Confirms the + // AND-mask `mask_v` resolves correctly across the supported bit + // depths. + for full in [true, false] { + check_yuv444p_n_u16_sse41_rgba_with_alpha_src_equivalence::<9>( + 16, + ColorMatrix::Bt601, + full, + 53, + ); + check_yuv444p_n_u16_sse41_rgba_with_alpha_src_equivalence::<12>( + 16, + ColorMatrix::Bt709, + full, + 53, + ); + check_yuv444p_n_u16_sse41_rgba_with_alpha_src_equivalence::<14>( + 16, + ColorMatrix::Bt2020Ncl, + full, + 53, + ); + } +} + +#[test] +fn sse41_yuva444p_n_rgba_u16_matches_scalar_all_bits_widths() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + for w in [17usize, 47, 1922] { + check_yuv444p_n_u16_sse41_rgba_with_alpha_src_equivalence::<9>( + w, + ColorMatrix::Smpte240m, + false, + 89, + ); + check_yuv444p_n_u16_sse41_rgba_with_alpha_src_equivalence::<12>(w, ColorMatrix::Fcc, true, 89); + check_yuv444p_n_u16_sse41_rgba_with_alpha_src_equivalence::<14>( + w, + ColorMatrix::YCgCo, + false, + 89, + ); + } +} diff --git a/src/row/mod.rs b/src/row/mod.rs index bd104f7..50552a0 100644 --- a/src/row/mod.rs +++ b/src/row/mod.rs @@ -5007,11 +5007,11 @@ pub fn yuv444p16_to_rgba_u16_row( // ---- YUVA 4:4:4 RGBA dispatchers -------------------------------------- // // Per-row dispatchers for the YUVA source family (currently Yuva444p10 -// only). The u8 RGBA dispatcher routes through the per-arch -// `yuv_444p_n_to_rgba_with_alpha_src_row` SIMD wrappers, mirroring the -// `yuv444p10_to_rgba_row` dispatcher's pattern. The u16 RGBA -// dispatcher (`yuva444p10_to_rgba_u16_row`) stays scalar until SIMD -// wiring lands in **Ship 8b‑1c**. +// only). Both the u8 RGBA dispatcher (`yuva444p10_to_rgba_row`) and +// the u16 RGBA dispatcher (`yuva444p10_to_rgba_u16_row`) route through +// per-arch `yuv_444p_n_to_rgba*_with_alpha_src_row` SIMD wrappers, +// mirroring the `yuv444p10_to_rgba_row` / `yuv444p10_to_rgba_u16_row` +// dispatchers' patterns. /// Converts one row of **10-bit** YUVA 4:4:4 to packed **8-bit** /// **RGBA**. R / G / B are produced by the same Q15 i32 kernel family @@ -5109,15 +5109,8 @@ pub fn yuva444p10_to_rgba_row( /// source's native bit depth) instead of being the opaque maximum /// `1023`. /// -/// # ⚠ Scalar-only as of Ship 8b‑1a -/// -/// `use_simd` is accepted for forward-compatible API parity with the -/// rest of the dispatcher family **but is ignored in this PR**. Every -/// invocation runs the scalar reference regardless of the flag — SIMD -/// wiring lands in **Ship 8b‑1c**. Throughput on 4:4:4 + alpha is -/// substantially below the 4:4:4-no-alpha SIMD path until then; -/// callers benchmarking the alpha-source path should re-measure once -/// 8b‑1c lands. See the section comment above for staging context. +/// `use_simd = false` forces the scalar reference path; otherwise +/// per-arch dispatch matches [`yuv444p10_to_rgba_u16_row`]'s pattern. 
#[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] pub fn yuva444p10_to_rgba_u16_row( @@ -5138,7 +5131,63 @@ pub fn yuva444p10_to_rgba_u16_row( assert!(a.len() >= width, "a row too short"); assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - let _ = use_simd; // SIMD per-arch routes land in Ship 8b‑1c PR. + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<10>( + y, u, v, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<10>( + y, u, v, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<10>( + y, u, v, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<10>( + y, u, v, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<10>( + y, u, v, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + scalar::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<10>( y, u, v, a, rgba_out, width, matrix, full_range, ); diff --git a/src/sinker/mixed/yuva_4_4_4.rs b/src/sinker/mixed/yuva_4_4_4.rs index 23af88d..4f0e3f9 100644 --- a/src/sinker/mixed/yuva_4_4_4.rs +++ b/src/sinker/mixed/yuva_4_4_4.rs @@ -50,11 +50,6 @@ impl<'a> MixedSinker<'a, Yuva444p10> { /// Attaches a packed **`u16`** RGBA output buffer. 10-bit /// low-packed (`[0, 1023]`); the per-pixel alpha element is /// **sourced from the alpha plane** at native depth. - /// - /// **Performance note (Ship 8b‑1a):** the alpha-source u16 path runs - /// scalar regardless of `with_simd(true)` until SIMD wiring lands in - /// **Ship 8b‑1c**. See [`Self::with_rgba`] for the same warning on - /// the u8 path. #[cfg_attr(not(tarpaulin), inline(always))] pub fn with_rgba_u16(mut self, buf: &'a mut [u16]) -> Result { self.set_rgba_u16(buf)?; From 2e6dc8c13fdca4ffee67280fa2cca8579b61e3ab Mon Sep 17 00:00:00 2001 From: uqio <276879906+uqio@users.noreply.github.com> Date: Mon, 27 Apr 2026 23:30:07 +1200 Subject: [PATCH 2/2] update --- CHANGELOG.md | 126 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 126 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1ae086b..5b708a8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -154,6 +154,132 @@ scheduled as a dedicated follow-up PR (`feat/bayer-simd`). end-to-end "all three channels at MAX_COEFFICIENT, all pixels 255" stays inside the `u32` accumulator and clamps to 255. +## Ship 8b — source-side YUVA (alpha-preserving RGBA output) + +The follow-up to Ship 8: source-side alpha. Where Ship 8 padded the +output alpha lane to `0xFF` / `(1 << BITS) - 1` regardless of source, +Ship 8b adds **YUVA source types** that carry an alpha plane through +to the RGBA output. The first vertical slice ships `Yuva444p10` +(ProRes 4444 + α territory — the highest-value VFX format from the +Format Share table § 2a-1 row 10). 
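+
+As a minimal usage sketch of the new u16 dispatcher (illustrative
+only — the `ColorMatrix` import path is assumed, and the trailing
+parameter order mirrors the internal scalar kernel with `use_simd`
+last, as in the sibling dispatchers):
+
+```rust
+use colconv::row::yuva444p10_to_rgba_u16_row;
+use colconv::ColorMatrix; // assumed re-export path
+
+// One 4-pixel row of 10-bit YUVA 4:4:4, one slice per plane.
+let y = [512u16, 300, 700, 64];
+let u = [512u16; 4];
+let v = [512u16; 4];
+let a = [1023u16, 512, 0, 1023]; // per-pixel source alpha, [0, 1023]
+let mut rgba = [0u16; 4 * 4]; // packed RGBA, 4 elements per pixel
+
+yuva444p10_to_rgba_u16_row(
+    &y, &u, &v, &a,
+    &mut rgba,
+    4,                  // width
+    ColorMatrix::Bt709,
+    false,              // full_range
+    true,               // use_simd: per-arch dispatch when available
+);
+// rgba[3], rgba[7], … now carry the source alpha at native depth.
+```
+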
### Strategy B (forked kernels) over Strategy A (separate splice)
+
+Two implementation strategies were considered:
+
+- **Strategy A** (deferred) — run the existing RGBA kernel (alpha =
+  opaque), then a second-pass helper reads source alpha + overwrites
+  the alpha byte. Memory traffic 6W per pixel; ~50 LOC + 1 helper.
+- **Strategy B** (adopted) — extend each kernel's const-`ALPHA`
+  template with a third `ALPHA_SRC: bool` generic. Source alpha is
+  loaded inside the kernel, masked, and stored straight into the
+  alpha lane in the same pass. Memory traffic 5W per pixel (single
+  pass); ~3,000 LOC across 30+ kernels for a ~10% perf win (at
+  L1-noise level) in the alpha-present case.
+
+Strategy B was picked for the best alpha-present throughput on the
+high-bandwidth 4:4:4 + α format that motivated the work. Existing
+`*_to_rgb_*` and `*_to_rgba_*` public wrappers are backward-compat
+shims passing `ALPHA_SRC = false` and `None` to the templates — zero
+overhead when alpha-source is off; existing call sites compile
+unchanged.
+
+### Vertical slice 1: `Yuva444p10` (3 PRs)
+
+The first format follows the same staging pattern as the Ship 8
+high-bit tranches (5/6/7): scalar prep first (call-site stable), then
+u8 SIMD, then u16 SIMD.
+
+| # | Tranche | Status |
+|---|---|---|
+| 1 | scalar prep + Frame + walker + dispatchers + sinker integration | ✅ shipped (PR #32) — `Yuva444pFrame16`, `Yuva444p10Frame` alias, `yuva444p10_to` walker, `MixedSinker`, scalar tests |
+| 1b | u8 RGBA SIMD across all 5 backends | ✅ shipped (PR #33) |
+| 1c | u16 RGBA SIMD across all 5 backends | ✅ shipped (PR #34) |
+
+### Surface added
+
+- **`Yuva444pFrame16<'a, const BITS: u32>`** — mirrors `Yuv444pFrame16`
+  with an extra `a` slice + `a_stride`. Const-asserted `BITS == 10`
+  in this slice; other bit depths land in subsequent vertical slices.
+  `try_new` validates dimensions + plane lengths; `try_new_checked`
+  additionally validates every active sample range.
+- **`Yuva444p10Frame<'a>`** type alias.
+- **`Yuva444p10`** marker + `Yuva444p10Row<'a>` (carries `a` slice)
+  + `Yuva444p10Sink` trait + `yuva444p10_to` walker.
+- **`MixedSinker`** with `with_rgba` / `set_rgba` (u8)
+  + `with_rgba_u16` / `set_rgba_u16` (u16) per-format builders, plus
+  `with_rgb` / `with_rgb_u16` / `with_luma` / `with_hsv` alpha-drop
+  paths that reuse the `Yuv444p10` row dispatchers verbatim.
+- **Public dispatchers** in `colconv::row`: `yuva444p10_to_rgba_row`
+  and `yuva444p10_to_rgba_u16_row` — same SIMD-via-`use_simd` shape
+  as `yuv444p10_to_rgba_*`.
+
+### Strategy B template extension
+
+The four 4:4:4 const-`ALPHA` templates gained a third `ALPHA_SRC`
+generic in this slice (only the BITS-generic planar variant is in
+scope for this vertical slice; other 4:4:4 variants land later):
+
+- `scalar::yuv_444p_n_to_rgb_or_rgba_row` (u8)
+- `scalar::yuv_444p_n_to_rgb_or_rgba_u16_row` (u16)
+- Same SIMD templates × 5 backends (NEON / SSE4.1 / AVX2 / AVX-512 /
+  wasm simd128) — refactor in PRs #33 (u8) and #34 (u16).
+
+The per-pixel store branches on three combinations, sketched after
+the table:
+
+| `ALPHA` | `ALPHA_SRC` | Per-pixel alpha |
+|---|---|---|
+| false | false | RGB-only (no alpha lane) |
+| true | false | RGBA, alpha = `0xFF` u8 / `(1 << BITS) - 1` u16 (existing path) |
+| true | true | RGBA, alpha = `a_src[x] & bits_mask::<BITS>()` from source plane; depth-converted via `>> (BITS - 8)` for u8 output, native depth for u16 output |
+
+`!ALPHA_SRC || ALPHA` is const-asserted at the top of every template.
+
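+
+A minimal scalar sketch of the three store combinations on the u16
+output path (illustrative — the real kernels are SIMD and write
+through raw pointers; `store_pixel` is a hypothetical simplification
+of the template logic above):
+
+```rust
+fn store_pixel<const BITS: u32, const ALPHA: bool, const ALPHA_SRC: bool>(
+    r: u16,
+    g: u16,
+    b: u16,
+    a_src: Option<&[u16]>,
+    x: usize,
+    out: &mut Vec<u16>,
+) {
+    // Low-BITS mask, e.g. 0x3FF at BITS = 10 — stands in for the
+    // crate's `bits_mask::<BITS>()` helper.
+    let mask = ((1u32 << BITS) - 1) as u16;
+    out.extend_from_slice(&[r, g, b]);
+    if ALPHA {
+        let alpha = if ALPHA_SRC {
+            // Source alpha, masked to native depth; no shift on the
+            // u16 path because input and output share the bit depth.
+            a_src.expect("ALPHA_SRC requires an alpha plane")[x] & mask
+        } else {
+            mask // opaque maximum, (1 << BITS) - 1
+        };
+        out.push(alpha);
+    }
+    // ALPHA = false stores only the RGB triple — no alpha lane,
+    // which is why `!ALPHA_SRC || ALPHA` must hold.
+}
+```
+
+On the u8 output path the masked value is additionally shifted right
+by `BITS - 8` before the cast to `u8`, as the table's last row notes.
+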
### Hardenings (Codex review fixes)
+
+- **Source alpha is masked with `bits_mask::<BITS>()` before depth
+  conversion** — `Yuva444p10Frame::try_new` accepts unchecked u16
+  samples; without masking, an over-range `1024` at BITS=10 would
+  shift to `256` and cast to u8 zero, silently turning over-range
+  alpha into transparent output. Same masking pattern that Y/U/V
+  already use. Pinned by 2 regression tests at the sinker layer.
+- **`MixedSinker` wires alpha-drop paths** for `with_rgb`
+  / `with_rgb_u16` / `with_luma` / `with_hsv` (declared on the
+  generic `MixedSinker` impl) — the initial implementation only wrote
+  RGBA buffers, leaving the others as silent stale-buffer bugs.
+  Pinned by 4 cross-format byte-equivalence tests against
+  `MixedSinker`.
+
+### Tests
+
+- **Per-backend SIMD equivalence tests**: 30 per backend × 5 backends
+  for `Yuva444p10` (5 u8 added in PR #33 + 5 u16 added in PR #34).
+  Solid-alpha + random-alpha + tail-width coverage. All x86 tests
+  carry `is_x86_feature_detected!` early-return guards.
+- **Sinker integration tests**: 17 (PR #32 added 7 covering alpha
+  pass-through / opacity contracts / buffer-too-short error paths;
+  PR #32 review-fix added 7 covering alpha-drop paths + Strategy A
+  combine; PR #32 review-fix added 2 covering over-range-alpha
+  masking).
+- **Test count growth**: 578 → 588 on the aarch64-darwin host (583
+  after PR #33, 588 after PR #34); +5 NEON tests run at each tranche;
+  the +20 x86/wasm tests fire on their respective CI runners.
+
+### Notes
+
+- **Source-side YUVA + Ship 8 sinks are now end-to-end for the
+  format**: with a `Yuva444p10Frame` source and a `MixedSinker` sink,
+  the alpha plane flows through to `with_rgba` / `with_rgba_u16`
+  output. `with_rgb` / `with_rgb_u16` / `with_luma` / `with_hsv`
+  are alpha-drop (reuse `Yuv444p10` row kernels).
+- **Subsequent vertical slices (Ship 8b‑2 onward)** will mass-apply
+  the established Strategy B template to other Yuva format families:
+  `Yuva420p*` (4:2:0 with α — `yuva420p`, `yuva420p9/10/16`),
+  `Yuva422p*` (4:2:2 with α — `yuva422p`, `yuva422p9/10/16`), and
+  the remaining `Yuva444p*` variants (8-bit, 9-bit, 16-bit). The
+  template's third generic + per-backend wrapper pattern is now
+  proven; subsequent slices reuse it mechanically.
+
 ## Ship 8 — alpha + RGBA output (`with_rgba` / `with_rgba_u16`)
 
 Adds packed RGBA output across the YUV format inventory. Every YUV