From f639f4274d01f1e499a872ae09dd1fbd1c99fcbd Mon Sep 17 00:00:00 2001 From: uqio <276879906+uqio@users.noreply.github.com> Date: Sun, 26 Apr 2026 18:32:12 +1200 Subject: [PATCH] update --- src/row/arch/neon.rs | 253 ++++++++++-- src/row/arch/wasm_simd128.rs | 211 ++++++++-- src/row/arch/x86_avx2.rs | 225 ++++++++-- src/row/arch/x86_avx512.rs | 225 ++++++++-- src/row/arch/x86_sse41.rs | 225 ++++++++-- src/row/mod.rs | 150 +++++++ src/row/scalar.rs | 144 ++++++- src/sinker/mixed.rs | 770 ++++++++++++++++++++++++++++++++--- 8 files changed, 1966 insertions(+), 237 deletions(-) diff --git a/src/row/arch/neon.rs b/src/row/arch/neon.rs index c5c72c7..ce5e882 100644 --- a/src/row/arch/neon.rs +++ b/src/row/arch/neon.rs @@ -1349,11 +1349,19 @@ pub(crate) unsafe fn nv12_or_nv21_to_rgb_or_rgba_row_impl< } /// NEON NV24 → packed RGB (UV-ordered, 4:4:4). Thin wrapper over -/// [`nv24_or_nv42_to_rgb_row_impl`] with `SWAP_UV = false`. +/// [`nv24_or_nv42_to_rgb_or_rgba_row_impl`] with +/// `SWAP_UV = false, ALPHA = false`. /// /// # Safety /// -/// Same as [`nv24_or_nv42_to_rgb_row_impl`]. +/// Same contract as [`nv24_or_nv42_to_rgb_or_rgba_row_impl`] with +/// `ALPHA = false` (so `out.len() >= width * 3` specializes to +/// `rgb_out.len() >= 3 * width`): +/// +/// 1. **NEON must be available on the current CPU.** +/// 2. `y.len() >= width`. +/// 3. `uv.len() >= 2 * width`. +/// 4. `rgb_out.len() >= 3 * width`. #[inline] #[target_feature(enable = "neon")] pub(crate) unsafe fn nv24_to_rgb_row( @@ -1366,16 +1374,18 @@ pub(crate) unsafe fn nv24_to_rgb_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - nv24_or_nv42_to_rgb_row_impl::(y, uv, rgb_out, width, matrix, full_range); + nv24_or_nv42_to_rgb_or_rgba_row_impl::(y, uv, rgb_out, width, matrix, full_range); } } /// NEON NV42 → packed RGB (VU-ordered, 4:4:4). Thin wrapper over -/// [`nv24_or_nv42_to_rgb_row_impl`] with `SWAP_UV = true`. +/// [`nv24_or_nv42_to_rgb_or_rgba_row_impl`] with +/// `SWAP_UV = true, ALPHA = false`. /// /// # Safety /// -/// Same as [`nv24_or_nv42_to_rgb_row_impl`]. +/// Same contract as [`nv24_to_rgb_row`]; `vu` carries the same +/// `2 * width` bytes but in V-then-U order per chroma pair. #[inline] #[target_feature(enable = "neon")] pub(crate) unsafe fn nv42_to_rgb_row( @@ -1388,40 +1398,96 @@ pub(crate) unsafe fn nv42_to_rgb_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - nv24_or_nv42_to_rgb_row_impl::(y, vu, rgb_out, width, matrix, full_range); + nv24_or_nv42_to_rgb_or_rgba_row_impl::(y, vu, rgb_out, width, matrix, full_range); } } -/// Shared NEON NV24/NV42 kernel (4:4:4 semi-planar). Unlike -/// [`nv12_or_nv21_to_rgb_row_impl`], chroma is not subsampled — one -/// UV pair per Y pixel, so the chroma-duplication step (`vzip*`) -/// disappears: compute 16 chroma values per 16 Y pixels directly. +/// NEON NV24 → packed RGBA (R, G, B, `0xFF` per pixel). Same +/// contract as [`nv24_to_rgb_row`] but writes 4 bytes per pixel via +/// `vst4q_u8`. `rgba_out.len() >= 4 * width`. +/// +/// # Safety +/// +/// Same as [`nv24_to_rgb_row`] except the output slice must be +/// `>= 4 * width` bytes. +#[inline] +#[target_feature(enable = "neon")] +pub(crate) unsafe fn nv24_to_rgba_row( + y: &[u8], + uv: &[u8], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. 
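+    // `SWAP_UV = false` keeps NV24's U-then-V interleave and
+    // `ALPHA = true` selects the `vst4q_u8` store path; both are
+    // const generics, so the call monomorphizes with no runtime
+    // format checks.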
+ unsafe { + nv24_or_nv42_to_rgb_or_rgba_row_impl::(y, uv, rgba_out, width, matrix, full_range); + } +} + +/// NEON NV42 → packed RGBA (R, G, B, `0xFF` per pixel). Same +/// contract as [`nv42_to_rgb_row`] but writes 4 bytes per pixel via +/// `vst4q_u8`. `rgba_out.len() >= 4 * width`. +/// +/// # Safety +/// +/// Same as [`nv42_to_rgb_row`] except the output slice must be +/// `>= 4 * width` bytes. +#[inline] +#[target_feature(enable = "neon")] +pub(crate) unsafe fn nv42_to_rgba_row( + y: &[u8], + vu: &[u8], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + nv24_or_nv42_to_rgb_or_rgba_row_impl::(y, vu, rgba_out, width, matrix, full_range); + } +} + +/// Shared NEON NV24/NV42 kernel (4:4:4 semi-planar) at 3 bpp (RGB) +/// or 4 bpp + opaque alpha (RGBA). Unlike +/// [`nv12_or_nv21_to_rgb_or_rgba_row_impl`], chroma is not +/// subsampled — one UV pair per Y pixel, so the chroma-duplication +/// step (`vzip*`) disappears: compute 16 chroma values per 16 Y +/// pixels directly. /// /// `SWAP_UV = false` selects NV24 (even byte = U, odd = V); -/// `SWAP_UV = true` selects NV42 (even = V, odd = U). +/// `SWAP_UV = true` selects NV42 (even = V, odd = U). `ALPHA = true` +/// writes via `vst4q_u8` with constant `0xFF` alpha; `ALPHA = false` +/// writes via `vst3q_u8`. Both const generics drive compile-time +/// monomorphization. /// /// # Safety /// /// 1. **NEON must be available on the current CPU.** /// 2. `y.len() >= width`. -/// 3. `uv_or_vu.len() >= 2 * width` (one UV pair per Y pixel = -/// `2 * width` bytes). -/// 4. `rgb_out.len() >= 3 * width`. +/// 3. `uv_or_vu.len() >= 2 * width` (one UV pair per Y pixel). +/// 4. `out.len() >= width * (if ALPHA { 4 } else { 3 })`. /// /// No width parity constraint (4:4:4). #[inline] #[target_feature(enable = "neon")] -unsafe fn nv24_or_nv42_to_rgb_row_impl( +pub(crate) unsafe fn nv24_or_nv42_to_rgb_or_rgba_row_impl< + const SWAP_UV: bool, + const ALPHA: bool, +>( y: &[u8], uv_or_vu: &[u8], - rgb_out: &mut [u8], + out: &mut [u8], width: usize, matrix: ColorMatrix, full_range: bool, ) { debug_assert!(y.len() >= width); debug_assert!(uv_or_vu.len() >= 2 * width); - debug_assert!(rgb_out.len() >= width * 3); + let bpp: usize = if ALPHA { 4 } else { 3 }; + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params(full_range); @@ -1442,6 +1508,7 @@ unsafe fn nv24_or_nv42_to_rgb_row_impl( let cgv = vdupq_n_s32(coeffs.g_v()); let cbu = vdupq_n_s32(coeffs.b_u()); let cbv = vdupq_n_s32(coeffs.b_v()); + let alpha_u8 = vdupq_n_u8(0xFF); let mut x = 0usize; while x + 16 <= width { @@ -1510,32 +1577,36 @@ unsafe fn nv24_or_nv42_to_rgb_row_impl( vqmovun_s16(vqaddq_s16(y_scaled_hi, r_chroma_hi)), ); - let rgb = uint8x16x3_t(r_u8, g_u8, b_u8); - vst3q_u8(rgb_out.as_mut_ptr().add(x * 3), rgb); + if ALPHA { + let rgba = uint8x16x4_t(r_u8, g_u8, b_u8, alpha_u8); + vst4q_u8(out.as_mut_ptr().add(x * 4), rgba); + } else { + let rgb = uint8x16x3_t(r_u8, g_u8, b_u8); + vst3q_u8(out.as_mut_ptr().add(x * 3), rgb); + } x += 16; } // Scalar tail for 0..15 leftover pixels. 
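    // The vector loop above consumed every full 16-pixel group, so at
    // most 15 pixels remain; delegating them to the scalar kernels
    // keeps the tail byte-identical to the scalar reference.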
if x < width { - if SWAP_UV { - scalar::nv42_to_rgb_row( - &y[x..width], - &uv_or_vu[x * 2..width * 2], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); - } else { - scalar::nv24_to_rgb_row( - &y[x..width], - &uv_or_vu[x * 2..width * 2], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_uv = &uv_or_vu[x * 2..width * 2]; + let tail_w = width - x; + let tail_out = &mut out[x * bpp..width * bpp]; + match (SWAP_UV, ALPHA) { + (false, false) => { + scalar::nv24_to_rgb_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range) + } + (true, false) => { + scalar::nv42_to_rgb_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range) + } + (false, true) => { + scalar::nv24_to_rgba_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range) + } + (true, true) => { + scalar::nv42_to_rgba_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range) + } } } } @@ -4150,6 +4221,114 @@ mod tests { } } + // ---- nv24_to_rgba_row / nv42_to_rgba_row equivalence ---------------- + + fn check_nv24_rgba_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { + let y: std::vec::Vec = (0..width).map(|i| ((i * 37 + 11) & 0xFF) as u8).collect(); + let uv: std::vec::Vec = (0..width) + .flat_map(|i| [((i * 53 + 23) & 0xFF) as u8, ((i * 71 + 91) & 0xFF) as u8]) + .collect(); + let mut rgba_scalar = std::vec![0u8; width * 4]; + let mut rgba_neon = std::vec![0u8; width * 4]; + + scalar::nv24_to_rgba_row(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + unsafe { + nv24_to_rgba_row(&y, &uv, &mut rgba_neon, width, matrix, full_range); + } + + if rgba_scalar != rgba_neon { + let first_diff = rgba_scalar + .iter() + .zip(rgba_neon.iter()) + .position(|(a, b)| a != b) + .unwrap(); + let pixel = first_diff / 4; + let channel = ["R", "G", "B", "A"][first_diff % 4]; + panic!( + "NEON NV24 RGBA diverges from scalar at byte {first_diff} (px {pixel} {channel}, width={width}, matrix={matrix:?}, full_range={full_range}): scalar={} neon={}", + rgba_scalar[first_diff], rgba_neon[first_diff] + ); + } + } + + fn check_nv42_rgba_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { + let y: std::vec::Vec = (0..width).map(|i| ((i * 37 + 11) & 0xFF) as u8).collect(); + let vu: std::vec::Vec = (0..width) + .flat_map(|i| [((i * 53 + 23) & 0xFF) as u8, ((i * 71 + 91) & 0xFF) as u8]) + .collect(); + let mut rgba_scalar = std::vec![0u8; width * 4]; + let mut rgba_neon = std::vec![0u8; width * 4]; + + scalar::nv42_to_rgba_row(&y, &vu, &mut rgba_scalar, width, matrix, full_range); + unsafe { + nv42_to_rgba_row(&y, &vu, &mut rgba_neon, width, matrix, full_range); + } + + if rgba_scalar != rgba_neon { + let first_diff = rgba_scalar + .iter() + .zip(rgba_neon.iter()) + .position(|(a, b)| a != b) + .unwrap(); + let pixel = first_diff / 4; + let channel = ["R", "G", "B", "A"][first_diff % 4]; + panic!( + "NEON NV42 RGBA diverges from scalar at byte {first_diff} (px {pixel} {channel}, width={width}, matrix={matrix:?}, full_range={full_range}): scalar={} neon={}", + rgba_scalar[first_diff], rgba_neon[first_diff] + ); + } + } + + #[test] + #[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] + fn nv24_neon_rgba_matches_scalar_all_matrices_16() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_nv24_rgba_equivalence(16, m, full); + } + } + } + + #[test] + #[cfg_attr(miri, 
ignore = "NEON SIMD intrinsics unsupported by Miri")] + fn nv24_neon_rgba_matches_scalar_widths() { + for w in [1usize, 3, 15, 17, 32, 33, 1920, 1921] { + check_nv24_rgba_equivalence(w, ColorMatrix::Bt709, false); + } + } + + #[test] + #[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] + fn nv42_neon_rgba_matches_scalar_all_matrices_16() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_nv42_rgba_equivalence(16, m, full); + } + } + } + + #[test] + #[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] + fn nv42_neon_rgba_matches_scalar_widths() { + for w in [1usize, 3, 15, 17, 32, 33, 1920, 1921] { + check_nv42_rgba_equivalence(w, ColorMatrix::Bt709, false); + } + } + // ---- yuv_444_to_rgb_row equivalence --------------------------------- fn check_yuv_444_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { diff --git a/src/row/arch/wasm_simd128.rs b/src/row/arch/wasm_simd128.rs index 7286e0b..92ca897 100644 --- a/src/row/arch/wasm_simd128.rs +++ b/src/row/arch/wasm_simd128.rs @@ -1606,12 +1606,11 @@ pub(crate) unsafe fn nv12_or_nv21_to_rgb_or_rgba_row_impl< } } -/// wasm simd128 NV24 → packed RGB (UV-ordered, 4:4:4). Thin wrapper -/// over [`nv24_or_nv42_to_rgb_row_impl`] with `SWAP_UV = false`. +/// wasm simd128 NV24 → packed RGB (UV-ordered, 4:4:4). /// /// # Safety /// -/// Same as [`nv24_or_nv42_to_rgb_row_impl`]. +/// Same as [`nv24_or_nv42_to_rgb_or_rgba_row_impl`]. #[inline] #[target_feature(enable = "simd128")] pub(crate) unsafe fn nv24_to_rgb_row( @@ -1624,7 +1623,7 @@ pub(crate) unsafe fn nv24_to_rgb_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - nv24_or_nv42_to_rgb_row_impl::(y, uv, rgb_out, width, matrix, full_range); + nv24_or_nv42_to_rgb_or_rgba_row_impl::(y, uv, rgb_out, width, matrix, full_range); } } @@ -1632,7 +1631,7 @@ pub(crate) unsafe fn nv24_to_rgb_row( /// /// # Safety /// -/// Same as [`nv24_or_nv42_to_rgb_row_impl`]. +/// Same as [`nv24_or_nv42_to_rgb_or_rgba_row_impl`]. #[inline] #[target_feature(enable = "simd128")] pub(crate) unsafe fn nv42_to_rgb_row( @@ -1645,7 +1644,49 @@ pub(crate) unsafe fn nv42_to_rgb_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - nv24_or_nv42_to_rgb_row_impl::(y, vu, rgb_out, width, matrix, full_range); + nv24_or_nv42_to_rgb_or_rgba_row_impl::(y, vu, rgb_out, width, matrix, full_range); + } +} + +/// wasm simd128 NV24 → packed RGBA (UV-ordered, 4:4:4, opaque alpha). +/// +/// # Safety +/// +/// Same as [`nv24_or_nv42_to_rgb_or_rgba_row_impl`] but `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "simd128")] +pub(crate) unsafe fn nv24_to_rgba_row( + y: &[u8], + uv: &[u8], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + nv24_or_nv42_to_rgb_or_rgba_row_impl::(y, uv, rgba_out, width, matrix, full_range); + } +} + +/// wasm simd128 NV42 → packed RGBA (VU-ordered, 4:4:4, opaque alpha). +/// +/// # Safety +/// +/// Same as [`nv24_or_nv42_to_rgb_or_rgba_row_impl`] but `rgba_out.len() >= 4 * width`. 
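+/// (`SWAP_UV = true` only changes which interleaved byte feeds U and
+/// V; `vu` still carries `2 * width` bytes, V first in each pair.)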
+#[inline] +#[target_feature(enable = "simd128")] +pub(crate) unsafe fn nv42_to_rgba_row( + y: &[u8], + vu: &[u8], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + nv24_or_nv42_to_rgb_or_rgba_row_impl::(y, vu, rgba_out, width, matrix, full_range); } } @@ -1658,20 +1699,24 @@ pub(crate) unsafe fn nv42_to_rgb_row( /// /// 1. **simd128 must be available** (compile-time `target_feature`). /// 2. `y.len() >= width`, `uv_or_vu.len() >= 2 * width`, -/// `rgb_out.len() >= 3 * width`. +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. #[inline] #[target_feature(enable = "simd128")] -unsafe fn nv24_or_nv42_to_rgb_row_impl( +pub(crate) unsafe fn nv24_or_nv42_to_rgb_or_rgba_row_impl< + const SWAP_UV: bool, + const ALPHA: bool, +>( y: &[u8], uv_or_vu: &[u8], - rgb_out: &mut [u8], + out: &mut [u8], width: usize, matrix: ColorMatrix, full_range: bool, ) { + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert!(y.len() >= width); debug_assert!(uv_or_vu.len() >= 2 * width); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params(full_range); @@ -1691,6 +1736,7 @@ unsafe fn nv24_or_nv42_to_rgb_row_impl( let cgv = i32x4_splat(coeffs.g_v()); let cbu = i32x4_splat(coeffs.b_u()); let cbv = i32x4_splat(coeffs.b_v()); + let alpha_u8 = u8x16_splat(0xFF); let mut x = 0usize; while x + 16 <= width { @@ -1828,30 +1874,33 @@ unsafe fn nv24_or_nv42_to_rgb_row_impl( let g_u8 = u8x16_narrow_i16x8(g_lo, g_hi); let r_u8 = u8x16_narrow_i16x8(r_lo, r_hi); - write_rgb_16(r_u8, g_u8, b_u8, rgb_out.as_mut_ptr().add(x * 3)); + if ALPHA { + write_rgba_16(r_u8, g_u8, b_u8, alpha_u8, out.as_mut_ptr().add(x * 4)); + } else { + write_rgb_16(r_u8, g_u8, b_u8, out.as_mut_ptr().add(x * 3)); + } x += 16; } if x < width { - if SWAP_UV { - scalar::nv42_to_rgb_row( - &y[x..width], - &uv_or_vu[x * 2..width * 2], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); - } else { - scalar::nv24_to_rgb_row( - &y[x..width], - &uv_or_vu[x * 2..width * 2], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_uv = &uv_or_vu[x * 2..width * 2]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + match (SWAP_UV, ALPHA) { + (false, false) => { + scalar::nv24_to_rgb_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range) + } + (true, false) => { + scalar::nv42_to_rgb_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range) + } + (false, true) => { + scalar::nv24_to_rgba_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range) + } + (true, true) => { + scalar::nv42_to_rgba_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range) + } } } } @@ -3737,6 +3786,110 @@ mod tests { } } + // ---- nv24_to_rgba_row / nv42_to_rgba_row equivalence ---------------- + + fn check_nv24_rgba_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { + let y: std::vec::Vec = (0..width).map(|i| ((i * 37 + 11) & 0xFF) as u8).collect(); + let uv: std::vec::Vec = (0..width) + .flat_map(|i| [((i * 53 + 23) & 0xFF) as u8, ((i * 71 + 91) & 0xFF) as u8]) + .collect(); + let mut rgba_scalar = std::vec![0u8; width * 4]; + let mut rgba_simd = std::vec![0u8; width * 4]; + + scalar::nv24_to_rgba_row(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + unsafe { + nv24_to_rgba_row(&y, &uv, &mut rgba_simd, 
width, matrix, full_range); + } + + if rgba_scalar != rgba_simd { + let first_diff = rgba_scalar + .iter() + .zip(rgba_simd.iter()) + .position(|(a, b)| a != b) + .unwrap(); + let pixel = first_diff / 4; + let channel = ["R", "G", "B", "A"][first_diff % 4]; + panic!( + "simd128 NV24 RGBA diverges from scalar at byte {first_diff} (px {pixel} {channel}, width={width}, matrix={matrix:?}, full_range={full_range}): scalar={} simd={}", + rgba_scalar[first_diff], rgba_simd[first_diff] + ); + } + } + + fn check_nv42_rgba_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { + let y: std::vec::Vec = (0..width).map(|i| ((i * 37 + 11) & 0xFF) as u8).collect(); + let vu: std::vec::Vec = (0..width) + .flat_map(|i| [((i * 53 + 23) & 0xFF) as u8, ((i * 71 + 91) & 0xFF) as u8]) + .collect(); + let mut rgba_scalar = std::vec![0u8; width * 4]; + let mut rgba_simd = std::vec![0u8; width * 4]; + + scalar::nv42_to_rgba_row(&y, &vu, &mut rgba_scalar, width, matrix, full_range); + unsafe { + nv42_to_rgba_row(&y, &vu, &mut rgba_simd, width, matrix, full_range); + } + + if rgba_scalar != rgba_simd { + let first_diff = rgba_scalar + .iter() + .zip(rgba_simd.iter()) + .position(|(a, b)| a != b) + .unwrap(); + let pixel = first_diff / 4; + let channel = ["R", "G", "B", "A"][first_diff % 4]; + panic!( + "simd128 NV42 RGBA diverges from scalar at byte {first_diff} (px {pixel} {channel}, width={width}, matrix={matrix:?}, full_range={full_range}): scalar={} simd={}", + rgba_scalar[first_diff], rgba_simd[first_diff] + ); + } + } + + #[test] + fn simd128_nv24_rgba_matches_scalar_all_matrices_16() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_nv24_rgba_equivalence(16, m, full); + } + } + } + + #[test] + fn simd128_nv24_rgba_matches_scalar_widths() { + for w in [1usize, 3, 15, 17, 32, 33, 1920, 1921] { + check_nv24_rgba_equivalence(w, ColorMatrix::Bt709, false); + } + } + + #[test] + fn simd128_nv42_rgba_matches_scalar_all_matrices_16() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_nv42_rgba_equivalence(16, m, full); + } + } + } + + #[test] + fn simd128_nv42_rgba_matches_scalar_widths() { + for w in [1usize, 3, 15, 17, 32, 33, 1920, 1921] { + check_nv42_rgba_equivalence(w, ColorMatrix::Bt709, false); + } + } + // ---- yuv_444_to_rgb_row equivalence --------------------------------- fn check_yuv_444_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { diff --git a/src/row/arch/x86_avx2.rs b/src/row/arch/x86_avx2.rs index 24ceda0..53d55b6 100644 --- a/src/row/arch/x86_avx2.rs +++ b/src/row/arch/x86_avx2.rs @@ -1799,12 +1799,11 @@ pub(crate) unsafe fn nv12_or_nv21_to_rgb_or_rgba_row_impl< } } -/// AVX2 NV24 → packed RGB (UV-ordered, 4:4:4). Thin wrapper over -/// [`nv24_or_nv42_to_rgb_row_impl`] with `SWAP_UV = false`. +/// AVX2 NV24 → packed RGB (UV-ordered, 4:4:4). /// /// # Safety /// -/// Same as [`nv24_or_nv42_to_rgb_row_impl`]. +/// Same as [`nv24_or_nv42_to_rgb_or_rgba_row_impl`]. #[inline] #[target_feature(enable = "avx2")] pub(crate) unsafe fn nv24_to_rgb_row( @@ -1817,7 +1816,7 @@ pub(crate) unsafe fn nv24_to_rgb_row( ) { // SAFETY: caller obligations forwarded to the shared impl. 
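    // The shared AVX2 kernel consumes 32 Y pixels (64 UV bytes) per
    // iteration, twice the SSE4.1/NEON group size, ahead of the same
    // scalar tail dispatch.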
unsafe { - nv24_or_nv42_to_rgb_row_impl::(y, uv, rgb_out, width, matrix, full_range); + nv24_or_nv42_to_rgb_or_rgba_row_impl::(y, uv, rgb_out, width, matrix, full_range); } } @@ -1825,7 +1824,7 @@ pub(crate) unsafe fn nv24_to_rgb_row( /// /// # Safety /// -/// Same as [`nv24_or_nv42_to_rgb_row_impl`]. +/// Same as [`nv24_or_nv42_to_rgb_or_rgba_row_impl`]. #[inline] #[target_feature(enable = "avx2")] pub(crate) unsafe fn nv42_to_rgb_row( @@ -1838,13 +1837,55 @@ pub(crate) unsafe fn nv42_to_rgb_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - nv24_or_nv42_to_rgb_row_impl::(y, vu, rgb_out, width, matrix, full_range); + nv24_or_nv42_to_rgb_or_rgba_row_impl::(y, vu, rgb_out, width, matrix, full_range); + } +} + +/// AVX2 NV24 → packed RGBA (UV-ordered, 4:4:4, opaque alpha). +/// +/// # Safety +/// +/// Same as [`nv24_or_nv42_to_rgb_or_rgba_row_impl`] but `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "avx2")] +pub(crate) unsafe fn nv24_to_rgba_row( + y: &[u8], + uv: &[u8], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + nv24_or_nv42_to_rgb_or_rgba_row_impl::(y, uv, rgba_out, width, matrix, full_range); + } +} + +/// AVX2 NV42 → packed RGBA (VU-ordered, 4:4:4, opaque alpha). +/// +/// # Safety +/// +/// Same as [`nv24_or_nv42_to_rgb_or_rgba_row_impl`] but `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "avx2")] +pub(crate) unsafe fn nv42_to_rgba_row( + y: &[u8], + vu: &[u8], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + nv24_or_nv42_to_rgb_or_rgba_row_impl::(y, vu, rgba_out, width, matrix, full_range); } } /// Shared AVX2 NV24/NV42 kernel (4:4:4 semi-planar). 32 Y pixels / 32 /// chroma pairs / 64 UV bytes per iteration. Unlike -/// [`nv12_or_nv21_to_rgb_row_impl`], chroma is not subsampled — one +/// [`nv12_or_nv21_to_rgb_or_rgba_row_impl`], chroma is not subsampled — one /// UV pair per Y pixel — so the `chroma_dup` step disappears; two /// `chroma_i16x16` calls per channel produce 32 chroma values /// directly. @@ -1854,20 +1895,24 @@ pub(crate) unsafe fn nv42_to_rgb_row( /// 1. **AVX2 must be available on the current CPU.** /// 2. `y.len() >= width`. /// 3. `uv_or_vu.len() >= 2 * width`. -/// 4. `rgb_out.len() >= 3 * width`. +/// 4. `out.len() >= width * if ALPHA { 4 } else { 3 }`. #[inline] #[target_feature(enable = "avx2")] -unsafe fn nv24_or_nv42_to_rgb_row_impl( +pub(crate) unsafe fn nv24_or_nv42_to_rgb_or_rgba_row_impl< + const SWAP_UV: bool, + const ALPHA: bool, +>( y: &[u8], uv_or_vu: &[u8], - rgb_out: &mut [u8], + out: &mut [u8], width: usize, matrix: ColorMatrix, full_range: bool, ) { + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert!(y.len() >= width); debug_assert!(uv_or_vu.len() >= 2 * width); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params(full_range); @@ -1888,6 +1933,7 @@ unsafe fn nv24_or_nv42_to_rgb_row_impl( let cgv = _mm256_set1_epi32(coeffs.g_v()); let cbu = _mm256_set1_epi32(coeffs.b_u()); let cbv = _mm256_set1_epi32(coeffs.b_v()); + let alpha_u8 = _mm256_set1_epi8(-1); // Same per-lane deinterleave mask as the NV12 kernel: within each // 128-bit lane, pack even bytes into low 8, odd bytes into high 8. 
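The even/odd per-lane split that this mask implements can be shown in isolation. A minimal sketch, assuming x86_64 with AVX2; the function name and mask constants here are illustrative, not the kernel's actual values:

```rust
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::{__m256i, _mm256_setr_epi8, _mm256_shuffle_epi8};

/// Sketch only: place each 128-bit lane's eight U bytes (even source
/// bytes) in its low half and eight V bytes (odd source bytes) in its
/// high half, the same per-lane trick the kernel's mask encodes.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn split_uv_per_lane(uv: __m256i) -> __m256i {
    // `_mm256_shuffle_epi8` indexes within each 128-bit lane, so the
    // same 16-byte pattern is repeated for both lanes.
    let mask = _mm256_setr_epi8(
        0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
        0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
    );
    _mm256_shuffle_epi8(uv, mask)
}
```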
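The `if ALPHA` store selection in the next hunk is resolved per instantiation. A minimal model of the pattern, with hypothetical names (`store_px` is not part of the crate):

```rust
/// Hypothetical stand-in for the kernels' store step: `ALPHA` is a
/// const generic, so each instantiation compiles to straight-line
/// code for exactly one pixel stride.
fn store_px<const ALPHA: bool>(out: &mut [u8], x: usize, r: u8, g: u8, b: u8) {
    let bpp = if ALPHA { 4 } else { 3 };
    out[x * bpp] = r;
    out[x * bpp + 1] = g;
    out[x * bpp + 2] = b;
    if ALPHA {
        // Dead code in the 3 bpp instantiation; unconditional in 4 bpp.
        out[x * bpp + 3] = 0xFF;
    }
}
```

Calling `store_px::<false>` writes 3-byte pixels and `store_px::<true>` writes 4-byte pixels with opaque alpha, mirroring the `vst3q_u8`/`vst4q_u8` (NEON) and `write_rgb_*`/`write_rgba_*` (x86/wasm) pairs in this patch.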
@@ -2003,30 +2049,33 @@ unsafe fn nv24_or_nv42_to_rgb_row_impl( let g_u8 = narrow_u8x32(g_lo, g_hi); let r_u8 = narrow_u8x32(r_lo, r_hi); - write_rgb_32(r_u8, g_u8, b_u8, rgb_out.as_mut_ptr().add(x * 3)); + if ALPHA { + write_rgba_32(r_u8, g_u8, b_u8, alpha_u8, out.as_mut_ptr().add(x * 4)); + } else { + write_rgb_32(r_u8, g_u8, b_u8, out.as_mut_ptr().add(x * 3)); + } x += 32; } if x < width { - if SWAP_UV { - scalar::nv42_to_rgb_row( - &y[x..width], - &uv_or_vu[x * 2..width * 2], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); - } else { - scalar::nv24_to_rgb_row( - &y[x..width], - &uv_or_vu[x * 2..width * 2], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_uv = &uv_or_vu[x * 2..width * 2]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + match (SWAP_UV, ALPHA) { + (false, false) => { + scalar::nv24_to_rgb_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range) + } + (true, false) => { + scalar::nv42_to_rgb_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range) + } + (false, true) => { + scalar::nv24_to_rgba_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range) + } + (true, true) => { + scalar::nv42_to_rgba_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range) + } } } } @@ -4062,6 +4111,122 @@ mod tests { } } + // ---- nv24_to_rgba_row / nv42_to_rgba_row equivalence ---------------- + + fn check_nv24_rgba_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { + let y: std::vec::Vec = (0..width).map(|i| ((i * 37 + 11) & 0xFF) as u8).collect(); + let uv: std::vec::Vec = (0..width) + .flat_map(|i| [((i * 53 + 23) & 0xFF) as u8, ((i * 71 + 91) & 0xFF) as u8]) + .collect(); + let mut rgba_scalar = std::vec![0u8; width * 4]; + let mut rgba_avx2 = std::vec![0u8; width * 4]; + + scalar::nv24_to_rgba_row(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + unsafe { + nv24_to_rgba_row(&y, &uv, &mut rgba_avx2, width, matrix, full_range); + } + + if rgba_scalar != rgba_avx2 { + let first_diff = rgba_scalar + .iter() + .zip(rgba_avx2.iter()) + .position(|(a, b)| a != b) + .unwrap(); + let pixel = first_diff / 4; + let channel = ["R", "G", "B", "A"][first_diff % 4]; + panic!( + "AVX2 NV24 RGBA diverges from scalar at byte {first_diff} (px {pixel} {channel}, width={width}, matrix={matrix:?}, full_range={full_range}): scalar={} avx2={}", + rgba_scalar[first_diff], rgba_avx2[first_diff] + ); + } + } + + fn check_nv42_rgba_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { + let y: std::vec::Vec = (0..width).map(|i| ((i * 37 + 11) & 0xFF) as u8).collect(); + let vu: std::vec::Vec = (0..width) + .flat_map(|i| [((i * 53 + 23) & 0xFF) as u8, ((i * 71 + 91) & 0xFF) as u8]) + .collect(); + let mut rgba_scalar = std::vec![0u8; width * 4]; + let mut rgba_avx2 = std::vec![0u8; width * 4]; + + scalar::nv42_to_rgba_row(&y, &vu, &mut rgba_scalar, width, matrix, full_range); + unsafe { + nv42_to_rgba_row(&y, &vu, &mut rgba_avx2, width, matrix, full_range); + } + + if rgba_scalar != rgba_avx2 { + let first_diff = rgba_scalar + .iter() + .zip(rgba_avx2.iter()) + .position(|(a, b)| a != b) + .unwrap(); + let pixel = first_diff / 4; + let channel = ["R", "G", "B", "A"][first_diff % 4]; + panic!( + "AVX2 NV42 RGBA diverges from scalar at byte {first_diff} (px {pixel} {channel}, width={width}, matrix={matrix:?}, full_range={full_range}): scalar={} avx2={}", + rgba_scalar[first_diff], rgba_avx2[first_diff] + ); + } + } + + #[test] + fn 
avx2_nv24_rgba_matches_scalar_all_matrices_32() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_nv24_rgba_equivalence(32, m, full); + } + } + } + + #[test] + fn avx2_nv24_rgba_matches_scalar_widths() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + for w in [31usize, 32, 33, 63, 64, 65, 1920, 1921] { + check_nv24_rgba_equivalence(w, ColorMatrix::Bt709, false); + } + } + + #[test] + fn avx2_nv42_rgba_matches_scalar_all_matrices_32() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_nv42_rgba_equivalence(32, m, full); + } + } + } + + #[test] + fn avx2_nv42_rgba_matches_scalar_widths() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + for w in [31usize, 32, 33, 63, 64, 65, 1920, 1921] { + check_nv42_rgba_equivalence(w, ColorMatrix::Bt709, false); + } + } + // ---- yuv_444_to_rgb_row equivalence --------------------------------- fn check_yuv_444_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { diff --git a/src/row/arch/x86_avx512.rs b/src/row/arch/x86_avx512.rs index 04de26d..3e56ed4 100644 --- a/src/row/arch/x86_avx512.rs +++ b/src/row/arch/x86_avx512.rs @@ -1851,12 +1851,11 @@ pub(crate) unsafe fn nv12_or_nv21_to_rgb_or_rgba_row_impl< } } -/// AVX-512 NV24 → packed RGB (UV-ordered, 4:4:4). Thin wrapper over -/// [`nv24_or_nv42_to_rgb_row_impl`] with `SWAP_UV = false`. +/// AVX-512 NV24 → packed RGB (UV-ordered, 4:4:4). /// /// # Safety /// -/// Same as [`nv24_or_nv42_to_rgb_row_impl`]. +/// Same as [`nv24_or_nv42_to_rgb_or_rgba_row_impl`]. #[inline] #[target_feature(enable = "avx512f,avx512bw")] pub(crate) unsafe fn nv24_to_rgb_row( @@ -1869,7 +1868,7 @@ pub(crate) unsafe fn nv24_to_rgb_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - nv24_or_nv42_to_rgb_row_impl::(y, uv, rgb_out, width, matrix, full_range); + nv24_or_nv42_to_rgb_or_rgba_row_impl::(y, uv, rgb_out, width, matrix, full_range); } } @@ -1877,7 +1876,7 @@ pub(crate) unsafe fn nv24_to_rgb_row( /// /// # Safety /// -/// Same as [`nv24_or_nv42_to_rgb_row_impl`]. +/// Same as [`nv24_or_nv42_to_rgb_or_rgba_row_impl`]. #[inline] #[target_feature(enable = "avx512f,avx512bw")] pub(crate) unsafe fn nv42_to_rgb_row( @@ -1890,13 +1889,55 @@ pub(crate) unsafe fn nv42_to_rgb_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - nv24_or_nv42_to_rgb_row_impl::(y, vu, rgb_out, width, matrix, full_range); + nv24_or_nv42_to_rgb_or_rgba_row_impl::(y, vu, rgb_out, width, matrix, full_range); + } +} + +/// AVX-512 NV24 → packed RGBA (UV-ordered, 4:4:4, opaque alpha). +/// +/// # Safety +/// +/// Same as [`nv24_or_nv42_to_rgb_or_rgba_row_impl`] but `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "avx512f,avx512bw")] +pub(crate) unsafe fn nv24_to_rgba_row( + y: &[u8], + uv: &[u8], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + nv24_or_nv42_to_rgb_or_rgba_row_impl::(y, uv, rgba_out, width, matrix, full_range); + } +} + +/// AVX-512 NV42 → packed RGBA (VU-ordered, 4:4:4, opaque alpha). 
+/// +/// # Safety +/// +/// Same as [`nv24_or_nv42_to_rgb_or_rgba_row_impl`] but `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "avx512f,avx512bw")] +pub(crate) unsafe fn nv42_to_rgba_row( + y: &[u8], + vu: &[u8], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + nv24_or_nv42_to_rgb_or_rgba_row_impl::(y, vu, rgba_out, width, matrix, full_range); } } /// Shared AVX-512 NV24/NV42 kernel (4:4:4 semi-planar). 64 Y pixels / /// 64 chroma pairs / 128 UV bytes per iteration. Unlike -/// [`nv12_or_nv21_to_rgb_row_impl`], chroma is not subsampled — one +/// [`nv12_or_nv21_to_rgb_or_rgba_row_impl`], chroma is not subsampled — one /// UV pair per Y pixel — so the `chroma_dup` step disappears; two /// `chroma_i16x32` calls per channel produce 64 chroma values /// directly. @@ -1906,20 +1947,24 @@ pub(crate) unsafe fn nv42_to_rgb_row( /// 1. **AVX-512F + AVX-512BW must be available.** /// 2. `y.len() >= width`. /// 3. `uv_or_vu.len() >= 2 * width`. -/// 4. `rgb_out.len() >= 3 * width`. +/// 4. `out.len() >= width * if ALPHA { 4 } else { 3 }`. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -unsafe fn nv24_or_nv42_to_rgb_row_impl( +pub(crate) unsafe fn nv24_or_nv42_to_rgb_or_rgba_row_impl< + const SWAP_UV: bool, + const ALPHA: bool, +>( y: &[u8], uv_or_vu: &[u8], - rgb_out: &mut [u8], + out: &mut [u8], width: usize, matrix: ColorMatrix, full_range: bool, ) { + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert!(y.len() >= width); debug_assert!(uv_or_vu.len() >= 2 * width); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params(full_range); @@ -1940,6 +1985,7 @@ unsafe fn nv24_or_nv42_to_rgb_row_impl( let cgv = _mm512_set1_epi32(coeffs.g_v()); let cbu = _mm512_set1_epi32(coeffs.b_u()); let cbv = _mm512_set1_epi32(coeffs.b_v()); + let alpha_u8 = _mm512_set1_epi8(-1); // Same lane fixups as NV12 kernel — inherited verbatim. 
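    // (`_mm512_packus_epi16` narrows within each 128-bit lane, leaving
    // qwords interleaved as a0,b0,a1,b1,...; permuting qwords by
    // 0,2,4,6,1,3,5,7 restores the linear a0..a3,b0..b3 order.)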
let pack_fixup = _mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7); @@ -2068,30 +2114,33 @@ unsafe fn nv24_or_nv42_to_rgb_row_impl( let g_u8 = narrow_u8x64(g_lo, g_hi, pack_fixup); let r_u8 = narrow_u8x64(r_lo, r_hi, pack_fixup); - write_rgb_64(r_u8, g_u8, b_u8, rgb_out.as_mut_ptr().add(x * 3)); + if ALPHA { + write_rgba_64(r_u8, g_u8, b_u8, alpha_u8, out.as_mut_ptr().add(x * 4)); + } else { + write_rgb_64(r_u8, g_u8, b_u8, out.as_mut_ptr().add(x * 3)); + } x += 64; } if x < width { - if SWAP_UV { - scalar::nv42_to_rgb_row( - &y[x..width], - &uv_or_vu[x * 2..width * 2], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); - } else { - scalar::nv24_to_rgb_row( - &y[x..width], - &uv_or_vu[x * 2..width * 2], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_uv = &uv_or_vu[x * 2..width * 2]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + match (SWAP_UV, ALPHA) { + (false, false) => { + scalar::nv24_to_rgb_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range) + } + (true, false) => { + scalar::nv42_to_rgb_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range) + } + (false, true) => { + scalar::nv24_to_rgba_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range) + } + (true, true) => { + scalar::nv42_to_rgba_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range) + } } } } @@ -4232,6 +4281,122 @@ mod tests { } } + // ---- nv24_to_rgba_row / nv42_to_rgba_row equivalence ---------------- + + fn check_nv24_rgba_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { + let y: std::vec::Vec = (0..width).map(|i| ((i * 37 + 11) & 0xFF) as u8).collect(); + let uv: std::vec::Vec = (0..width) + .flat_map(|i| [((i * 53 + 23) & 0xFF) as u8, ((i * 71 + 91) & 0xFF) as u8]) + .collect(); + let mut rgba_scalar = std::vec![0u8; width * 4]; + let mut rgba_avx512 = std::vec![0u8; width * 4]; + + scalar::nv24_to_rgba_row(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + unsafe { + nv24_to_rgba_row(&y, &uv, &mut rgba_avx512, width, matrix, full_range); + } + + if rgba_scalar != rgba_avx512 { + let first_diff = rgba_scalar + .iter() + .zip(rgba_avx512.iter()) + .position(|(a, b)| a != b) + .unwrap(); + let pixel = first_diff / 4; + let channel = ["R", "G", "B", "A"][first_diff % 4]; + panic!( + "AVX-512 NV24 RGBA diverges from scalar at byte {first_diff} (px {pixel} {channel}, width={width}, matrix={matrix:?}, full_range={full_range}): scalar={} avx512={}", + rgba_scalar[first_diff], rgba_avx512[first_diff] + ); + } + } + + fn check_nv42_rgba_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { + let y: std::vec::Vec = (0..width).map(|i| ((i * 37 + 11) & 0xFF) as u8).collect(); + let vu: std::vec::Vec = (0..width) + .flat_map(|i| [((i * 53 + 23) & 0xFF) as u8, ((i * 71 + 91) & 0xFF) as u8]) + .collect(); + let mut rgba_scalar = std::vec![0u8; width * 4]; + let mut rgba_avx512 = std::vec![0u8; width * 4]; + + scalar::nv42_to_rgba_row(&y, &vu, &mut rgba_scalar, width, matrix, full_range); + unsafe { + nv42_to_rgba_row(&y, &vu, &mut rgba_avx512, width, matrix, full_range); + } + + if rgba_scalar != rgba_avx512 { + let first_diff = rgba_scalar + .iter() + .zip(rgba_avx512.iter()) + .position(|(a, b)| a != b) + .unwrap(); + let pixel = first_diff / 4; + let channel = ["R", "G", "B", "A"][first_diff % 4]; + panic!( + "AVX-512 NV42 RGBA diverges from scalar at byte {first_diff} (px {pixel} {channel}, width={width}, matrix={matrix:?}, full_range={full_range}): scalar={} 
avx512={}", + rgba_scalar[first_diff], rgba_avx512[first_diff] + ); + } + } + + #[test] + fn avx512_nv24_rgba_matches_scalar_all_matrices_64() { + if !std::arch::is_x86_feature_detected!("avx512bw") { + return; + } + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_nv24_rgba_equivalence(64, m, full); + } + } + } + + #[test] + fn avx512_nv24_rgba_matches_scalar_widths() { + if !std::arch::is_x86_feature_detected!("avx512bw") { + return; + } + for w in [63usize, 64, 65, 127, 128, 129, 1920, 1921] { + check_nv24_rgba_equivalence(w, ColorMatrix::Bt709, false); + } + } + + #[test] + fn avx512_nv42_rgba_matches_scalar_all_matrices_64() { + if !std::arch::is_x86_feature_detected!("avx512bw") { + return; + } + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_nv42_rgba_equivalence(64, m, full); + } + } + } + + #[test] + fn avx512_nv42_rgba_matches_scalar_widths() { + if !std::arch::is_x86_feature_detected!("avx512bw") { + return; + } + for w in [63usize, 64, 65, 127, 128, 129, 1920, 1921] { + check_nv42_rgba_equivalence(w, ColorMatrix::Bt709, false); + } + } + // ---- yuv_444_to_rgb_row equivalence --------------------------------- fn check_yuv_444_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { diff --git a/src/row/arch/x86_sse41.rs b/src/row/arch/x86_sse41.rs index 35a9453..e3077d7 100644 --- a/src/row/arch/x86_sse41.rs +++ b/src/row/arch/x86_sse41.rs @@ -1594,12 +1594,11 @@ pub(crate) unsafe fn nv12_or_nv21_to_rgb_or_rgba_row_impl< } } -/// SSE4.1 NV24 → packed RGB (UV-ordered, 4:4:4). Thin wrapper over -/// [`nv24_or_nv42_to_rgb_row_impl`] with `SWAP_UV = false`. +/// SSE4.1 NV24 → packed RGB (UV-ordered, 4:4:4). /// /// # Safety /// -/// Same as [`nv24_or_nv42_to_rgb_row_impl`]. +/// Same as [`nv24_or_nv42_to_rgb_or_rgba_row_impl`]. #[inline] #[target_feature(enable = "sse4.1")] pub(crate) unsafe fn nv24_to_rgb_row( @@ -1612,7 +1611,7 @@ pub(crate) unsafe fn nv24_to_rgb_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - nv24_or_nv42_to_rgb_row_impl::(y, uv, rgb_out, width, matrix, full_range); + nv24_or_nv42_to_rgb_or_rgba_row_impl::(y, uv, rgb_out, width, matrix, full_range); } } @@ -1620,7 +1619,7 @@ pub(crate) unsafe fn nv24_to_rgb_row( /// /// # Safety /// -/// Same as [`nv24_or_nv42_to_rgb_row_impl`]. +/// Same as [`nv24_or_nv42_to_rgb_or_rgba_row_impl`]. #[inline] #[target_feature(enable = "sse4.1")] pub(crate) unsafe fn nv42_to_rgb_row( @@ -1633,12 +1632,54 @@ pub(crate) unsafe fn nv42_to_rgb_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - nv24_or_nv42_to_rgb_row_impl::(y, vu, rgb_out, width, matrix, full_range); + nv24_or_nv42_to_rgb_or_rgba_row_impl::(y, vu, rgb_out, width, matrix, full_range); + } +} + +/// SSE4.1 NV24 → packed RGBA (UV-ordered, 4:4:4, opaque alpha). +/// +/// # Safety +/// +/// Same as [`nv24_or_nv42_to_rgb_or_rgba_row_impl`] but `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "sse4.1")] +pub(crate) unsafe fn nv24_to_rgba_row( + y: &[u8], + uv: &[u8], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. 
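+    // (`_mm_set1_epi8(-1)` in the shared impl supplies the alpha
+    // vector: all-ones bytes, i.e. 0xFF in every lane.)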
+ unsafe { + nv24_or_nv42_to_rgb_or_rgba_row_impl::(y, uv, rgba_out, width, matrix, full_range); + } +} + +/// SSE4.1 NV42 → packed RGBA (VU-ordered, 4:4:4, opaque alpha). +/// +/// # Safety +/// +/// Same as [`nv24_or_nv42_to_rgb_or_rgba_row_impl`] but `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "sse4.1")] +pub(crate) unsafe fn nv42_to_rgba_row( + y: &[u8], + vu: &[u8], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + nv24_or_nv42_to_rgb_or_rgba_row_impl::(y, vu, rgba_out, width, matrix, full_range); } } /// Shared SSE4.1 NV24/NV42 kernel (4:4:4 semi-planar). Unlike -/// [`nv12_or_nv21_to_rgb_row_impl`], chroma is not subsampled — one +/// [`nv12_or_nv21_to_rgb_or_rgba_row_impl`], chroma is not subsampled — one /// UV pair per Y pixel. Per 16 Y pixels, load 32 UV bytes (two /// `_mm_loadu_si128`), deinterleave, compute 16 chroma values per /// channel directly, and skip the `_mm_unpacklo/hi_epi16` chroma @@ -1649,20 +1690,24 @@ pub(crate) unsafe fn nv42_to_rgb_row( /// 1. **SSE4.1 must be available on the current CPU.** /// 2. `y.len() >= width`. /// 3. `uv_or_vu.len() >= 2 * width`. -/// 4. `rgb_out.len() >= 3 * width`. +/// 4. `out.len() >= width * if ALPHA { 4 } else { 3 }`. #[inline] #[target_feature(enable = "sse4.1")] -unsafe fn nv24_or_nv42_to_rgb_row_impl( +pub(crate) unsafe fn nv24_or_nv42_to_rgb_or_rgba_row_impl< + const SWAP_UV: bool, + const ALPHA: bool, +>( y: &[u8], uv_or_vu: &[u8], - rgb_out: &mut [u8], + out: &mut [u8], width: usize, matrix: ColorMatrix, full_range: bool, ) { + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert!(y.len() >= width); debug_assert!(uv_or_vu.len() >= 2 * width); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params(full_range); @@ -1683,6 +1728,7 @@ unsafe fn nv24_or_nv42_to_rgb_row_impl( let cgv = _mm_set1_epi32(coeffs.g_v()); let cbu = _mm_set1_epi32(coeffs.b_u()); let cbv = _mm_set1_epi32(coeffs.b_v()); + let alpha_u8 = _mm_set1_epi8(-1); // Shuffle masks to deinterleave 16 UV bytes into 8 U + 8 V (low // lanes). 
The upper 8 lanes are zeroed by `_mm_shuffle_epi8` @@ -1766,30 +1812,33 @@ unsafe fn nv24_or_nv42_to_rgb_row_impl( let g_u8 = _mm_packus_epi16(g_lo, g_hi); let r_u8 = _mm_packus_epi16(r_lo, r_hi); - write_rgb_16(r_u8, g_u8, b_u8, rgb_out.as_mut_ptr().add(x * 3)); + if ALPHA { + write_rgba_16(r_u8, g_u8, b_u8, alpha_u8, out.as_mut_ptr().add(x * 4)); + } else { + write_rgb_16(r_u8, g_u8, b_u8, out.as_mut_ptr().add(x * 3)); + } x += 16; } if x < width { - if SWAP_UV { - scalar::nv42_to_rgb_row( - &y[x..width], - &uv_or_vu[x * 2..width * 2], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); - } else { - scalar::nv24_to_rgb_row( - &y[x..width], - &uv_or_vu[x * 2..width * 2], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_uv = &uv_or_vu[x * 2..width * 2]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + match (SWAP_UV, ALPHA) { + (false, false) => { + scalar::nv24_to_rgb_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range) + } + (true, false) => { + scalar::nv42_to_rgb_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range) + } + (false, true) => { + scalar::nv24_to_rgba_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range) + } + (true, true) => { + scalar::nv42_to_rgba_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range) + } } } } @@ -3561,6 +3610,122 @@ mod tests { } } + // ---- nv24_to_rgba_row / nv42_to_rgba_row equivalence ---------------- + + fn check_nv24_rgba_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { + let y: std::vec::Vec = (0..width).map(|i| ((i * 37 + 11) & 0xFF) as u8).collect(); + let uv: std::vec::Vec = (0..width) + .flat_map(|i| [((i * 53 + 23) & 0xFF) as u8, ((i * 71 + 91) & 0xFF) as u8]) + .collect(); + let mut rgba_scalar = std::vec![0u8; width * 4]; + let mut rgba_sse41 = std::vec![0u8; width * 4]; + + scalar::nv24_to_rgba_row(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + unsafe { + nv24_to_rgba_row(&y, &uv, &mut rgba_sse41, width, matrix, full_range); + } + + if rgba_scalar != rgba_sse41 { + let first_diff = rgba_scalar + .iter() + .zip(rgba_sse41.iter()) + .position(|(a, b)| a != b) + .unwrap(); + let pixel = first_diff / 4; + let channel = ["R", "G", "B", "A"][first_diff % 4]; + panic!( + "SSE4.1 NV24 RGBA diverges from scalar at byte {first_diff} (px {pixel} {channel}, width={width}, matrix={matrix:?}, full_range={full_range}): scalar={} sse41={}", + rgba_scalar[first_diff], rgba_sse41[first_diff] + ); + } + } + + fn check_nv42_rgba_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { + let y: std::vec::Vec = (0..width).map(|i| ((i * 37 + 11) & 0xFF) as u8).collect(); + let vu: std::vec::Vec = (0..width) + .flat_map(|i| [((i * 53 + 23) & 0xFF) as u8, ((i * 71 + 91) & 0xFF) as u8]) + .collect(); + let mut rgba_scalar = std::vec![0u8; width * 4]; + let mut rgba_sse41 = std::vec![0u8; width * 4]; + + scalar::nv42_to_rgba_row(&y, &vu, &mut rgba_scalar, width, matrix, full_range); + unsafe { + nv42_to_rgba_row(&y, &vu, &mut rgba_sse41, width, matrix, full_range); + } + + if rgba_scalar != rgba_sse41 { + let first_diff = rgba_scalar + .iter() + .zip(rgba_sse41.iter()) + .position(|(a, b)| a != b) + .unwrap(); + let pixel = first_diff / 4; + let channel = ["R", "G", "B", "A"][first_diff % 4]; + panic!( + "SSE4.1 NV42 RGBA diverges from scalar at byte {first_diff} (px {pixel} {channel}, width={width}, matrix={matrix:?}, full_range={full_range}): scalar={} sse41={}", + 
rgba_scalar[first_diff], rgba_sse41[first_diff] + ); + } + } + + #[test] + fn sse41_nv24_rgba_matches_scalar_all_matrices_16() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_nv24_rgba_equivalence(16, m, full); + } + } + } + + #[test] + fn sse41_nv24_rgba_matches_scalar_widths() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + for w in [1usize, 3, 15, 17, 32, 33, 1920, 1921] { + check_nv24_rgba_equivalence(w, ColorMatrix::Bt709, false); + } + } + + #[test] + fn sse41_nv42_rgba_matches_scalar_all_matrices_16() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_nv42_rgba_equivalence(16, m, full); + } + } + } + + #[test] + fn sse41_nv42_rgba_matches_scalar_widths() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + for w in [1usize, 3, 15, 17, 32, 33, 1920, 1921] { + check_nv42_rgba_equivalence(w, ColorMatrix::Bt709, false); + } + } + // ---- yuv_444_to_rgb_row equivalence --------------------------------- fn check_yuv_444_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { diff --git a/src/row/mod.rs b/src/row/mod.rs index 6499cec..c64403c 100644 --- a/src/row/mod.rs +++ b/src/row/mod.rs @@ -34,6 +34,8 @@ pub(crate) mod arch; pub(crate) mod scalar; +pub(crate) use scalar::expand_rgb_to_rgba_row; + use crate::ColorMatrix; /// Converts one row of 4:2:0 YUV to packed RGB. @@ -713,6 +715,154 @@ pub fn nv42_to_rgb_row( scalar::nv42_to_rgb_row(y, vu, rgb_out, width, matrix, full_range); } +/// Converts one row of NV24 (semi‑planar 4:4:4, UV-ordered) to packed +/// **RGBA** (8-bit). Same numerical contract as [`nv24_to_rgb_row`]; +/// alpha defaults to `0xFF` (opaque). +/// +/// `rgba_out.len() >= 4 * width`. `use_simd = false` forces scalar. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn nv24_to_rgba_row( + y: &[u8], + uv: &[u8], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + let rgba_min = rgba_row_bytes(width); + let uv_min = match width.checked_mul(2) { + Some(n) => n, + None => panic!("width ({width}) × 2 overflows usize"), + }; + assert!(y.len() >= width, "y row too short"); + assert!(uv.len() >= uv_min, "uv row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: `neon_available()` verified NEON is present. + unsafe { + arch::neon::nv24_to_rgba_row(y, uv, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::nv24_to_rgba_row(y, uv, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::nv24_to_rgba_row(y, uv, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. 
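+                // Tiers are tried widest-first (AVX-512, AVX2, then
+                // SSE4.1); if none is available, or `use_simd` is
+                // false, control falls through to the scalar row.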
+ unsafe { + arch::x86_sse41::nv24_to_rgba_row(y, uv, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 verified at compile time. + unsafe { + arch::wasm_simd128::nv24_to_rgba_row(y, uv, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::nv24_to_rgba_row(y, uv, rgba_out, width, matrix, full_range); +} + +/// Converts one row of NV42 (semi‑planar 4:4:4, VU-ordered) to packed +/// **RGBA** (8-bit). Same as [`nv24_to_rgba_row`] but with swapped +/// chroma byte order. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn nv42_to_rgba_row( + y: &[u8], + vu: &[u8], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + let rgba_min = rgba_row_bytes(width); + let vu_min = match width.checked_mul(2) { + Some(n) => n, + None => panic!("width ({width}) × 2 overflows usize"), + }; + assert!(y.len() >= width, "y row too short"); + assert!(vu.len() >= vu_min, "vu row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::nv42_to_rgba_row(y, vu, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::nv42_to_rgba_row(y, vu, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::nv42_to_rgba_row(y, vu, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::nv42_to_rgba_row(y, vu, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 verified at compile time. + unsafe { + arch::wasm_simd128::nv42_to_rgba_row(y, vu, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::nv42_to_rgba_row(y, vu, rgba_out, width, matrix, full_range); +} + /// Converts one row of YUV 4:4:4 planar to packed RGB. Dispatches /// to the best available SIMD backend for the current target. /// diff --git a/src/row/scalar.rs b/src/row/scalar.rs index 1a76503..efd7f59 100644 --- a/src/row/scalar.rs +++ b/src/row/scalar.rs @@ -282,7 +282,8 @@ pub(crate) fn nv12_or_nv21_to_rgb_or_rgba_row_impl(y, uv, rgb_out, width, matrix, full_range); + nv24_or_nv42_to_rgb_or_rgba_row_impl::(y, uv, rgb_out, width, matrix, full_range); } /// NV42 (semi-planar 4:4:4, VU-ordered) → packed RGB. Thin wrapper -/// over [`nv24_or_nv42_to_rgb_row_impl`] with `SWAP_UV = true`. +/// over [`nv24_or_nv42_to_rgb_or_rgba_row_impl`] with +/// `SWAP_UV = true, ALPHA = false`. #[cfg_attr(not(tarpaulin), inline(always))] pub(crate) fn nv42_to_rgb_row( y: &[u8], @@ -306,31 +308,63 @@ pub(crate) fn nv42_to_rgb_row( matrix: ColorMatrix, full_range: bool, ) { - nv24_or_nv42_to_rgb_row_impl::(y, vu, rgb_out, width, matrix, full_range); + nv24_or_nv42_to_rgb_or_rgba_row_impl::(y, vu, rgb_out, width, matrix, full_range); } -/// Shared scalar kernel for NV24 (SWAP_UV=false) and NV42 -/// (SWAP_UV=true). Identical math and numerical contract to -/// [`yuv_420_to_rgb_row`]; the difference from NV12/NV21 is -/// 4:4:4 — one UV pair per Y pixel, no chroma upsampling. 
-/// No width parity constraint. +/// NV24 → packed `R, G, B, A` quadruplets with constant `A = 0xFF`. +/// First three bytes per pixel are byte-identical to +/// [`nv24_to_rgb_row`]. `rgba_out.len() >= 4 * width`. +#[cfg_attr(not(tarpaulin), inline(always))] +pub(crate) fn nv24_to_rgba_row( + y: &[u8], + uv: &[u8], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + nv24_or_nv42_to_rgb_or_rgba_row_impl::(y, uv, rgba_out, width, matrix, full_range); +} + +/// NV42 → packed `R, G, B, A` quadruplets with constant `A = 0xFF`. +/// First three bytes per pixel are byte-identical to +/// [`nv42_to_rgb_row`]. `rgba_out.len() >= 4 * width`. +#[cfg_attr(not(tarpaulin), inline(always))] +pub(crate) fn nv42_to_rgba_row( + y: &[u8], + vu: &[u8], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + nv24_or_nv42_to_rgb_or_rgba_row_impl::(y, vu, rgba_out, width, matrix, full_range); +} + +/// Shared scalar kernel for NV24 (`SWAP_UV = false`) / NV42 +/// (`SWAP_UV = true`) at 3 bpp (`ALPHA = false`) or 4 bpp + opaque +/// alpha (`ALPHA = true`). Identical math to [`yuv_444_to_rgb_row`] +/// (4:4:4 — one UV pair per Y pixel, no chroma upsampling); only +/// the per-pixel store stride differs. Both `const` generics drive +/// compile-time monomorphization. /// /// # Panics (debug builds) /// /// - `y.len() >= width`, `uv_or_vu.len() >= 2 * width`, -/// `rgb_out.len() >= 3 * width`. +/// `out.len() >= width * (if ALPHA { 4 } else { 3 })`. #[cfg_attr(not(tarpaulin), inline(always))] -fn nv24_or_nv42_to_rgb_row_impl( +pub(crate) fn nv24_or_nv42_to_rgb_or_rgba_row_impl( y: &[u8], uv_or_vu: &[u8], - rgb_out: &mut [u8], + out: &mut [u8], width: usize, matrix: ColorMatrix, full_range: bool, ) { debug_assert!(y.len() >= width, "y row too short"); debug_assert!(uv_or_vu.len() >= 2 * width, "chroma row too short"); - debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); + let bpp: usize = if ALPHA { 4 } else { 3 }; + debug_assert!(out.len() >= width * bpp, "out row too short for {bpp}bpp"); let coeffs = Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = range_params(full_range); @@ -351,9 +385,12 @@ fn nv24_or_nv42_to_rgb_row_impl( let b_chroma = (coeffs.b_u() * u_d + coeffs.b_v() * v_d + RND) >> 15; let y0 = ((y[x] as i32 - y_off) * y_scale + RND) >> 15; - rgb_out[x * 3] = clamp_u8(y0 + r_chroma); - rgb_out[x * 3 + 1] = clamp_u8(y0 + g_chroma); - rgb_out[x * 3 + 2] = clamp_u8(y0 + b_chroma); + out[x * bpp] = clamp_u8(y0 + r_chroma); + out[x * bpp + 1] = clamp_u8(y0 + g_chroma); + out[x * bpp + 2] = clamp_u8(y0 + b_chroma); + if ALPHA { + out[x * bpp + 3] = 0xFF; + } } } @@ -451,6 +488,39 @@ fn clamp_u8(v: i32) -> u8 { v.clamp(0, 255) as u8 } +// ---- RGB → RGBA expand (Strategy A combined-buffer optimization) ------ + +/// Reads packed `R, G, B` triples and writes packed `R, G, B, A` +/// quadruplets with `A = 0xFF` (opaque). Used by `MixedSinker` impls +/// when callers attach **both** `with_rgb` and `with_rgba`: instead +/// of running the YUV→RGB math twice (once per output format), we +/// run the RGB kernel into the user's RGB buffer and then expand +/// here to derive the RGBA buffer with a single per-byte pass. +/// +/// The 3W read is L1-hot from the just-completed RGB write, so the +/// effective memory traffic is roughly 3W RGB write + 4W RGBA write +/// = 7W per row — same as the existing native-RGBA path, but with +/// only one pass through the YUV→RGB math instead of two. 
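+/// (Concretely, for a 1920-pixel row that is 5760 bytes of RGB store
+/// plus 7680 bytes of RGBA store, about 13.1 KiB per row either way;
+/// Strategy A only removes the second run of the per-pixel math.)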
See +/// `docs/color-conversion-functions.md` § Ship 8 for the full +/// design discussion (Strategy A vs the alternative B "combined +/// kernel writes both per pixel" deferred to a future PR). +/// +/// # Panics (debug builds) +/// +/// - `rgb.len() >= 3 * width` +/// - `rgba_out.len() >= 4 * width` +#[cfg_attr(not(tarpaulin), inline(always))] +pub(crate) fn expand_rgb_to_rgba_row(rgb: &[u8], rgba_out: &mut [u8], width: usize) { + debug_assert!(rgb.len() >= width * 3, "rgb row too short"); + debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); + for x in 0..width { + rgba_out[x * 4] = rgb[x * 3]; + rgba_out[x * 4 + 1] = rgb[x * 3 + 1]; + rgba_out[x * 4 + 2] = rgb[x * 3 + 2]; + rgba_out[x * 4 + 3] = 0xFF; + } +} + // ---- High-bit-depth YUV 4:2:0 → RGB (BITS ∈ {10, 12, 14}) ------------- /// Converts one row of high-bit-depth 4:2:0 YUV (`u16` samples in the @@ -2005,6 +2075,48 @@ fn clamp_u16_round(v: f32, max: f32) -> u16 { mod tests { use super::*; + // ---- expand_rgb_to_rgba_row ----------------------------------------- + + #[test] + fn expand_rgb_to_rgba_row_pads_alpha_and_preserves_rgb() { + // Each source pixel's R/G/B must land in the matching slot, with + // alpha forced to 0xFF — Strategy A's correctness depends on this. + let rgb: std::vec::Vec = (0..16 * 3).map(|i| i as u8).collect(); + let mut rgba = std::vec![0u8; 16 * 4]; + expand_rgb_to_rgba_row(&rgb, &mut rgba, 16); + for x in 0..16 { + assert_eq!(rgba[x * 4], rgb[x * 3], "R at px {x}"); + assert_eq!(rgba[x * 4 + 1], rgb[x * 3 + 1], "G at px {x}"); + assert_eq!(rgba[x * 4 + 2], rgb[x * 3 + 2], "B at px {x}"); + assert_eq!(rgba[x * 4 + 3], 0xFF, "A at px {x}"); + } + } + + #[test] + fn expand_rgb_to_rgba_row_only_writes_first_width_pixels() { + // Caller may pass over-sized RGBA buffers; we must not stomp on + // the trailing region. Pre-fill 0xAA, expand into the head, and + // verify the tail still reads 0xAA. + let rgb: std::vec::Vec = (0..8 * 3).map(|i| (i + 1) as u8).collect(); + let mut rgba = std::vec![0xAAu8; 16 * 4]; + expand_rgb_to_rgba_row(&rgb, &mut rgba, 8); + for x in 0..8 { + assert_eq!(rgba[x * 4], rgb[x * 3]); + assert_eq!(rgba[x * 4 + 3], 0xFF); + } + for &b in &rgba[8 * 4..] 
+            assert_eq!(b, 0xAA, "tail must be untouched");
+        }
+    }
+
+    #[test]
+    fn expand_rgb_to_rgba_row_zero_width_is_noop() {
+        let rgb: std::vec::Vec<u8> = std::vec::Vec::new();
+        let mut rgba = std::vec![0u8; 0];
+        expand_rgb_to_rgba_row(&rgb, &mut rgba, 0);
+        assert!(rgba.is_empty());
+    }
+
     // ---- yuv_420_to_rgb_row ----------------------------------------------
 
     #[test]
diff --git a/src/sinker/mixed.rs b/src/sinker/mixed.rs
index 94ca4e4..524c54f 100644
--- a/src/sinker/mixed.rs
+++ b/src/sinker/mixed.rs
@@ -62,17 +62,18 @@ use crate::{
     HsvBuffers, PixelSink, SourceFormat,
     raw::{Bayer, Bayer16, BayerRow, BayerRow16, BayerSink, BayerSink16},
     row::{
-        bayer_to_rgb_row, bayer16_to_rgb_row, bayer16_to_rgb_u16_row, nv12_to_rgb_row,
-        nv12_to_rgba_row, nv21_to_rgb_row, nv21_to_rgba_row, nv24_to_rgb_row, nv42_to_rgb_row,
-        p010_to_rgb_row, p010_to_rgb_u16_row, p012_to_rgb_row, p012_to_rgb_u16_row, p016_to_rgb_row,
-        p016_to_rgb_u16_row, p410_to_rgb_row, p410_to_rgb_u16_row, p412_to_rgb_row,
-        p412_to_rgb_u16_row, p416_to_rgb_row, p416_to_rgb_u16_row, rgb_to_hsv_row, yuv_420_to_rgb_row,
-        yuv_420_to_rgba_row, yuv_444_to_rgb_row, yuv_444_to_rgba_row, yuv420p9_to_rgb_row,
-        yuv420p9_to_rgb_u16_row, yuv420p10_to_rgb_row, yuv420p10_to_rgb_u16_row, yuv420p12_to_rgb_row,
-        yuv420p12_to_rgb_u16_row, yuv420p14_to_rgb_row, yuv420p14_to_rgb_u16_row, yuv420p16_to_rgb_row,
-        yuv420p16_to_rgb_u16_row, yuv444p9_to_rgb_row, yuv444p9_to_rgb_u16_row, yuv444p10_to_rgb_row,
-        yuv444p10_to_rgb_u16_row, yuv444p12_to_rgb_row, yuv444p12_to_rgb_u16_row, yuv444p14_to_rgb_row,
-        yuv444p14_to_rgb_u16_row, yuv444p16_to_rgb_row, yuv444p16_to_rgb_u16_row,
+        bayer_to_rgb_row, bayer16_to_rgb_row, bayer16_to_rgb_u16_row, expand_rgb_to_rgba_row,
+        nv12_to_rgb_row, nv12_to_rgba_row, nv21_to_rgb_row, nv21_to_rgba_row, nv24_to_rgb_row,
+        nv24_to_rgba_row, nv42_to_rgb_row, nv42_to_rgba_row, p010_to_rgb_row, p010_to_rgb_u16_row,
+        p012_to_rgb_row, p012_to_rgb_u16_row, p016_to_rgb_row, p016_to_rgb_u16_row, p410_to_rgb_row,
+        p410_to_rgb_u16_row, p412_to_rgb_row, p412_to_rgb_u16_row, p416_to_rgb_row,
+        p416_to_rgb_u16_row, rgb_to_hsv_row, yuv_420_to_rgb_row, yuv_420_to_rgba_row,
+        yuv_444_to_rgb_row, yuv_444_to_rgba_row, yuv420p9_to_rgb_row, yuv420p9_to_rgb_u16_row,
+        yuv420p10_to_rgb_row, yuv420p10_to_rgb_u16_row, yuv420p12_to_rgb_row, yuv420p12_to_rgb_u16_row,
+        yuv420p14_to_rgb_row, yuv420p14_to_rgb_u16_row, yuv420p16_to_rgb_row, yuv420p16_to_rgb_u16_row,
+        yuv444p9_to_rgb_row, yuv444p9_to_rgb_u16_row, yuv444p10_to_rgb_row, yuv444p10_to_rgb_u16_row,
+        yuv444p12_to_rgb_row, yuv444p12_to_rgb_u16_row, yuv444p14_to_rgb_row, yuv444p14_to_rgb_u16_row,
+        yuv444p16_to_rgb_row, yuv444p16_to_rgb_u16_row,
     },
     yuv::{
         Nv12, Nv12Row, Nv12Sink, Nv16, Nv16Row, Nv16Sink, Nv21, Nv21Row, Nv21Sink, Nv24, Nv24Row,
@@ -1110,12 +1111,12 @@ impl<'a> MixedSinker<'a, Yuv420p> {
     ///
     /// ```compile_fail
     /// // Attaching RGBA to a sink that doesn't write it is rejected
-    /// // at compile time. Nv24 (4:4:4 semi‑planar) has not yet been
+    /// // at compile time. Yuv440p (4:4:0 planar) has not yet been
     /// // wired for RGBA; once that lands the negative example here
     /// // moves to the next not‑yet‑wired format.
-    /// use colconv::{sinker::MixedSinker, yuv::Nv24};
+    /// use colconv::{sinker::MixedSinker, yuv::Yuv440p};
     /// let mut buf = vec![0u8; 16 * 8 * 4];
-    /// let _ = MixedSinker::<Nv24>::new(16, 8).with_rgba(&mut buf);
+    /// let _ = MixedSinker::<Yuv440p>::new(16, 8).with_rgba(&mut buf);
     /// ```
     #[cfg_attr(not(tarpaulin), inline(always))]
     pub fn with_rgba(mut self, buf: &'a mut [u8]) -> Result<Self, MixedSinkerError> {
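Caller-side, the combined-buffer path is exercised exactly as in the tests further down; a condensed usage sketch follows (the only assumptions beyond the API shown in this patch are the buffer sizes and the hypothetical wrapper name `attach_both`):

use colconv::{sinker::MixedSinker, yuv::Yuv420p};

fn attach_both(w: usize, h: usize, rgb: &mut [u8], rgba: &mut [u8]) {
    // rgb must hold w*h*3 bytes and rgba w*h*4; shorter buffers error.
    // With both attached, each row runs the YUV→RGB kernel once into
    // `rgb`, then derives `rgba` via expand_rgb_to_rgba_row.
    let _sink = MixedSinker::<Yuv420p>::new(w, h)
        .with_rgb(rgb)
        .unwrap()
        .with_rgba(rgba)
        .unwrap();
    // ...then feed frames through the usual driver (e.g. yuv420p_to).
}
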
@@ -1229,12 +1230,20 @@ impl PixelSink for MixedSinker<'_, Yuv420p> {
             luma[one_plane_start..one_plane_end].copy_from_slice(&row.y()[..w]);
         }
 
-        // Native RGBA: independent kernel run, separate from RGB. Avoids
-        // the compose-and-expand cost — the const-generic
-        // `yuv_420_to_rgba_row` writes 4 bytes per pixel directly.
-        // Default alpha = 0xFF (opaque); future YUVA source impls will
-        // copy alpha through from the source plane.
-        if let Some(buf) = rgba.as_deref_mut() {
+        // Output mode resolution (Strategy A):
+        // - RGBA-only: run dedicated `yuv_420_to_rgba_row` (4 bpp store).
+        // - RGB / HSV (with or without RGBA): run RGB kernel once, then if
+        //   RGBA is also requested, fan it out via `expand_rgb_to_rgba_row`
+        //   (memory-bound copy + 0xFF alpha pad). Saves the second YUV→RGB
+        //   per-pixel math when both buffers are attached.
+        // - None of the above: nothing to do beyond luma above.
+        let want_rgb = rgb.is_some();
+        let want_rgba = rgba.is_some();
+        let want_hsv = hsv.is_some();
+        let need_rgb_kernel = want_rgb || want_hsv;
+
+        if want_rgba && !need_rgb_kernel {
+            let rgba_buf = rgba.as_deref_mut().unwrap();
             let rgba_plane_end =
                 one_plane_end
                     .checked_mul(4)
                     .ok_or(MixedSinkerError::GeometryOverflow {
                         width: w,
                         height: h,
                         channels: 4,
                     })?;
-            let rgba_plane_start = one_plane_start * 4; // ≤ rgba_plane_end.
+            let rgba_plane_start = one_plane_start * 4;
             yuv_420_to_rgba_row(
                 row.y(),
                 row.u_half(),
                 row.v_half(),
-                &mut buf[rgba_plane_start..rgba_plane_end],
+                &mut rgba_buf[rgba_plane_start..rgba_plane_end],
                 w,
                 row.matrix(),
                 row.full_range(),
                 use_simd,
             );
+            return Ok(());
         }
 
-        let want_rgb = rgb.is_some();
-        let want_hsv = hsv.is_some();
-        if !want_rgb && !want_hsv {
+        if !need_rgb_kernel {
             return Ok(());
         }
@@ -1324,6 +1332,20 @@ impl PixelSink for MixedSinker<'_, Yuv420p> {
                 use_simd,
             );
         }
+
+        if let Some(buf) = rgba.as_deref_mut() {
+            let rgba_plane_end =
+                one_plane_end
+                    .checked_mul(4)
+                    .ok_or(MixedSinkerError::GeometryOverflow {
+                        width: w,
+                        height: h,
+                        channels: 4,
+                    })?;
+            let rgba_plane_start = one_plane_start * 4;
+            expand_rgb_to_rgba_row(rgb_row, &mut buf[rgba_plane_start..rgba_plane_end], w);
+        }
+
         Ok(())
     }
 }
@@ -1437,11 +1459,16 @@ impl PixelSink for MixedSinker<'_, Yuv422p> {
             luma[one_plane_start..one_plane_end].copy_from_slice(&row.y()[..w]);
         }
 
-        // Native RGBA: independent kernel run, separate from RGB. Default
-        // alpha = 0xFF since Yuv422p has no alpha plane. Reuses the
-        // Yuv420p RGBA dispatcher — 4:2:2's per-row contract is
-        // identical (half-width chroma, one pair per Y pair).
+        // Strategy A output mode resolution — see Yuv420p impl above.
+        // Reuses Yuv420p dispatchers (RGB and RGBA) since 4:2:2's per-row
+        // contract is identical (half-width chroma, one pair per Y pair).
+        let want_rgb = rgb.is_some();
+        let want_rgba = rgba.is_some();
+        let want_hsv = hsv.is_some();
+        let need_rgb_kernel = want_rgb || want_hsv;
+
+        if want_rgba && !need_rgb_kernel {
+            let rgba_buf = rgba.as_deref_mut().unwrap();
             let rgba_plane_end =
                 one_plane_end
                     .checked_mul(4)
@@ -1455,17 +1482,16 @@ impl PixelSink for MixedSinker<'_, Yuv422p> {
             yuv_420_to_rgba_row(
                 row.y(),
                 row.u_half(),
                 row.v_half(),
-                &mut buf[rgba_plane_start..rgba_plane_end],
+                &mut rgba_buf[rgba_plane_start..rgba_plane_end],
                 w,
                 row.matrix(),
                 row.full_range(),
                 use_simd,
             );
+            return Ok(());
         }
 
-        let want_rgb = rgb.is_some();
-        let want_hsv = hsv.is_some();
-        if !want_rgb && !want_hsv {
+        if !need_rgb_kernel {
             return Ok(());
         }
@@ -1518,6 +1544,20 @@ impl PixelSink for MixedSinker<'_, Yuv422p> {
                 use_simd,
             );
         }
+
+        if let Some(buf) = rgba.as_deref_mut() {
+            let rgba_plane_end =
+                one_plane_end
+                    .checked_mul(4)
+                    .ok_or(MixedSinkerError::GeometryOverflow {
+                        width: w,
+                        height: h,
+                        channels: 4,
+                    })?;
+            let rgba_plane_start = one_plane_start * 4;
+            expand_rgb_to_rgba_row(rgb_row, &mut buf[rgba_plane_start..rgba_plane_end], w);
+        }
+
         Ok(())
     }
 }
@@ -1623,9 +1663,14 @@ impl PixelSink for MixedSinker<'_, Yuv444p> {
             luma[one_plane_start..one_plane_end].copy_from_slice(&row.y()[..w]);
         }
 
-        // Native RGBA: independent kernel run, separate from RGB. Default
-        // alpha = 0xFF since Yuv444p has no alpha plane.
-        if let Some(buf) = rgba.as_deref_mut() {
+        // Strategy A output mode resolution — see Yuv420p impl above.
+        let want_rgb = rgb.is_some();
+        let want_rgba = rgba.is_some();
+        let want_hsv = hsv.is_some();
+        let need_rgb_kernel = want_rgb || want_hsv;
+
+        if want_rgba && !need_rgb_kernel {
+            let rgba_buf = rgba.as_deref_mut().unwrap();
             let rgba_plane_end =
                 one_plane_end
                     .checked_mul(4)
@@ -1639,17 +1684,16 @@ impl PixelSink for MixedSinker<'_, Yuv444p> {
                 row.y(),
                 row.u(),
                 row.v(),
-                &mut buf[rgba_plane_start..rgba_plane_end],
+                &mut rgba_buf[rgba_plane_start..rgba_plane_end],
                 w,
                 row.matrix(),
                 row.full_range(),
                 use_simd,
             );
+            return Ok(());
         }
 
-        let want_rgb = rgb.is_some();
-        let want_hsv = hsv.is_some();
-        if !want_rgb && !want_hsv {
+        if !need_rgb_kernel {
             return Ok(());
         }
@@ -1700,6 +1744,20 @@ impl PixelSink for MixedSinker<'_, Yuv444p> {
                 use_simd,
             );
         }
+
+        if let Some(buf) = rgba.as_deref_mut() {
+            let rgba_plane_end =
+                one_plane_end
+                    .checked_mul(4)
+                    .ok_or(MixedSinkerError::GeometryOverflow {
+                        width: w,
+                        height: h,
+                        channels: 4,
+                    })?;
+            let rgba_plane_start = one_plane_start * 4;
+            expand_rgb_to_rgba_row(rgb_row, &mut buf[rgba_plane_start..rgba_plane_end], w);
+        }
+
         Ok(())
     }
 }
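A note for reviewers on the repeated `checked_mul(4)` guard: the RGBA plane length is computed in `usize`, which on 32‑bit targets can overflow for large frames. A minimal sketch of the guard's shape (plain Rust; the dimensions are hypothetical):

// Mirrors the GeometryOverflow guard: byte length of an RGBA plane,
// refusing to wrap instead of silently truncating on 32-bit targets.
fn rgba_bytes(w: usize, h: usize) -> Option<usize> {
    w.checked_mul(h)?.checked_mul(4)
}

fn main() {
    assert_eq!(rgba_bytes(16, 8), Some(512));
    // On a 32-bit usize, 40_000 × 30_000 × 4 would exceed u32::MAX:
    assert!(40_000u64 * 30_000 * 4 > u32::MAX as u64);
}
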
@@ -1817,9 +1875,14 @@ impl PixelSink for MixedSinker<'_, Nv12> {
             luma[one_plane_start..one_plane_end].copy_from_slice(&row.y()[..w]);
         }
 
-        // Native RGBA: independent kernel run, separate from RGB. Default
-        // alpha = 0xFF since NV12 has no alpha plane.
-        if let Some(buf) = rgba.as_deref_mut() {
+        // Strategy A output mode resolution — see Yuv420p impl above.
+        let want_rgb = rgb.is_some();
+        let want_rgba = rgba.is_some();
+        let want_hsv = hsv.is_some();
+        let need_rgb_kernel = want_rgb || want_hsv;
+
+        if want_rgba && !need_rgb_kernel {
+            let rgba_buf = rgba.as_deref_mut().unwrap();
             let rgba_plane_end =
                 one_plane_end
                     .checked_mul(4)
@@ -1832,17 +1895,16 @@ impl PixelSink for MixedSinker<'_, Nv12> {
             nv12_to_rgba_row(
                 row.y(),
                 row.uv_half(),
-                &mut buf[rgba_plane_start..rgba_plane_end],
+                &mut rgba_buf[rgba_plane_start..rgba_plane_end],
                 w,
                 row.matrix(),
                 row.full_range(),
                 use_simd,
             );
+            return Ok(());
         }
 
-        let want_rgb = rgb.is_some();
-        let want_hsv = hsv.is_some();
-        if !want_rgb && !want_hsv {
+        if !need_rgb_kernel {
             return Ok(());
         }
@@ -1894,6 +1956,20 @@ impl PixelSink for MixedSinker<'_, Nv12> {
                 use_simd,
             );
         }
+
+        if let Some(buf) = rgba.as_deref_mut() {
+            let rgba_plane_end =
+                one_plane_end
+                    .checked_mul(4)
+                    .ok_or(MixedSinkerError::GeometryOverflow {
+                        width: w,
+                        height: h,
+                        channels: 4,
+                    })?;
+            let rgba_plane_start = one_plane_start * 4;
+            expand_rgb_to_rgba_row(rgb_row, &mut buf[rgba_plane_start..rgba_plane_end], w);
+        }
+
         Ok(())
     }
 }
@@ -2002,10 +2078,16 @@ impl PixelSink for MixedSinker<'_, Nv16> {
             luma[one_plane_start..one_plane_end].copy_from_slice(&row.y()[..w]);
         }
 
-        // Native RGBA: independent kernel run, separate from RGB. Default
-        // alpha = 0xFF since NV16 has no alpha plane. Reuses the NV12
-        // RGBA dispatcher — 4:2:2's row contract is identical.
-        if let Some(buf) = rgba.as_deref_mut() {
+        // Strategy A output mode resolution — see Yuv420p impl above.
+        // Reuses NV12 dispatchers (RGB and RGBA) since 4:2:2's row
+        // contract is identical.
+        let want_rgb = rgb.is_some();
+        let want_rgba = rgba.is_some();
+        let want_hsv = hsv.is_some();
+        let need_rgb_kernel = want_rgb || want_hsv;
+
+        if want_rgba && !need_rgb_kernel {
+            let rgba_buf = rgba.as_deref_mut().unwrap();
             let rgba_plane_end =
                 one_plane_end
                     .checked_mul(4)
@@ -2018,17 +2100,16 @@ impl PixelSink for MixedSinker<'_, Nv16> {
             nv12_to_rgba_row(
                 row.y(),
                 row.uv(),
-                &mut buf[rgba_plane_start..rgba_plane_end],
+                &mut rgba_buf[rgba_plane_start..rgba_plane_end],
                 w,
                 row.matrix(),
                 row.full_range(),
                 use_simd,
             );
+            return Ok(());
         }
 
-        let want_rgb = rgb.is_some();
-        let want_hsv = hsv.is_some();
-        if !want_rgb && !want_hsv {
+        if !need_rgb_kernel {
             return Ok(());
         }
@@ -2079,6 +2160,20 @@ impl PixelSink for MixedSinker<'_, Nv16> {
                 use_simd,
             );
         }
+
+        if let Some(buf) = rgba.as_deref_mut() {
+            let rgba_plane_end =
+                one_plane_end
+                    .checked_mul(4)
+                    .ok_or(MixedSinkerError::GeometryOverflow {
+                        width: w,
+                        height: h,
+                        channels: 4,
+                    })?;
+            let rgba_plane_start = one_plane_start * 4;
+            expand_rgb_to_rgba_row(rgb_row, &mut buf[rgba_plane_start..rgba_plane_end], w);
+        }
+
         Ok(())
     }
 }
@@ -2185,9 +2280,14 @@ impl PixelSink for MixedSinker<'_, Nv21> {
             luma[one_plane_start..one_plane_end].copy_from_slice(&row.y()[..w]);
         }
 
-        // Native RGBA: independent kernel run, separate from RGB. Default
-        // alpha = 0xFF since NV21 has no alpha plane.
-        if let Some(buf) = rgba.as_deref_mut() {
+        // Strategy A output mode resolution — see Yuv420p impl above.
+        let want_rgb = rgb.is_some();
+        let want_rgba = rgba.is_some();
+        let want_hsv = hsv.is_some();
+        let need_rgb_kernel = want_rgb || want_hsv;
+
+        if want_rgba && !need_rgb_kernel {
+            let rgba_buf = rgba.as_deref_mut().unwrap();
             let rgba_plane_end =
                 one_plane_end
                     .checked_mul(4)
@@ -2200,17 +2300,16 @@ impl PixelSink for MixedSinker<'_, Nv21> {
             nv21_to_rgba_row(
                 row.y(),
                 row.vu_half(),
-                &mut buf[rgba_plane_start..rgba_plane_end],
+                &mut rgba_buf[rgba_plane_start..rgba_plane_end],
                 w,
                 row.matrix(),
                 row.full_range(),
                 use_simd,
             );
+            return Ok(());
         }
 
-        let want_rgb = rgb.is_some();
-        let want_hsv = hsv.is_some();
-        if !want_rgb && !want_hsv {
+        if !need_rgb_kernel {
             return Ok(());
         }
@@ -2262,6 +2361,20 @@ impl PixelSink for MixedSinker<'_, Nv21> {
                 use_simd,
             );
         }
+
+        if let Some(buf) = rgba.as_deref_mut() {
+            let rgba_plane_end =
+                one_plane_end
+                    .checked_mul(4)
+                    .ok_or(MixedSinkerError::GeometryOverflow {
+                        width: w,
+                        height: h,
+                        channels: 4,
+                    })?;
+            let rgba_plane_start = one_plane_start * 4;
+            expand_rgb_to_rgba_row(rgb_row, &mut buf[rgba_plane_start..rgba_plane_end], w);
+        }
+
         Ok(())
     }
 }
@@ -2273,6 +2386,38 @@ impl PixelSink for MixedSinker<'_, Nv21> {
 // is its own family (`nv24_to_rgb_row`) since chroma is no longer
 // duplicated across columns.
 
+impl<'a> MixedSinker<'a, Nv24> {
+    /// Attaches a packed 32‑bit RGBA output buffer.
+    ///
+    /// Only available on sinker types whose `PixelSink` impl writes
+    /// RGBA — see [`MixedSinker::<Yuv420p>::with_rgba`] for the same
+    /// rationale and constraints. Nv24 has no alpha plane, so every
+    /// alpha byte is filled with `0xFF` (opaque).
+    ///
+    /// Returns `Err(RgbaBufferTooShort)` if
+    /// `buf.len() < width × height × 4`, or `Err(GeometryOverflow)` on
+    /// 32‑bit targets when the product overflows.
+    #[cfg_attr(not(tarpaulin), inline(always))]
+    pub fn with_rgba(mut self, buf: &'a mut [u8]) -> Result<Self, MixedSinkerError> {
+        self.set_rgba(buf)?;
+        Ok(self)
+    }
+
+    /// In-place variant of [`with_rgba`](Self::with_rgba).
+    #[cfg_attr(not(tarpaulin), inline(always))]
+    pub fn set_rgba(&mut self, buf: &'a mut [u8]) -> Result<&mut Self, MixedSinkerError> {
+        let expected = self.frame_bytes(4)?;
+        if buf.len() < expected {
+            return Err(MixedSinkerError::RgbaBufferTooShort {
+                expected,
+                actual: buf.len(),
+            });
+        }
+        self.rgba = Some(buf);
+        Ok(self)
+    }
+}
+
 impl Nv24Sink for MixedSinker<'_, Nv24> {}
 
 impl PixelSink for MixedSinker<'_, Nv24> {
@@ -2322,6 +2467,7 @@ impl PixelSink for MixedSinker<'_, Nv24> {
         let Self {
             rgb,
+            rgba,
             luma,
             hsv,
             rgb_scratch,
@@ -2336,8 +2482,37 @@ impl PixelSink for MixedSinker<'_, Nv24> {
         }
 
        let want_rgb = rgb.is_some();
+        let want_rgba = rgba.is_some();
         let want_hsv = hsv.is_some();
-        if !want_rgb && !want_hsv {
+        let need_rgb_kernel = want_rgb || want_hsv;
+
+        // Standalone RGBA path: the caller wants only RGBA (no RGB / HSV),
+        // so run the dedicated RGBA kernel directly into the output buffer.
+        // Avoids both the scratch allocation and the expand-pad pass.
+        if want_rgba && !need_rgb_kernel {
+            let rgba_buf = rgba.as_deref_mut().unwrap();
+            let rgba_plane_end =
+                one_plane_end
+                    .checked_mul(4)
+                    .ok_or(MixedSinkerError::GeometryOverflow {
+                        width: w,
+                        height: h,
+                        channels: 4,
+                    })?;
+            let rgba_plane_start = one_plane_start * 4;
+            nv24_to_rgba_row(
+                row.y(),
+                row.uv(),
+                &mut rgba_buf[rgba_plane_start..rgba_plane_end],
+                w,
+                row.matrix(),
+                row.full_range(),
+                use_simd,
+            );
+            return Ok(());
+        }
+
+        if !need_rgb_kernel {
             return Ok(());
         }
@@ -2387,6 +2562,23 @@ impl PixelSink for MixedSinker<'_, Nv24> {
                 use_simd,
             );
         }
+
+        // Strategy A: when both RGB-side and RGBA outputs are requested,
+        // derive RGBA from the just-computed RGB row (memory-bound copy +
+        // 0xFF alpha pad) instead of running a second YUV→RGB kernel.
+        if let Some(buf) = rgba.as_deref_mut() {
+            let rgba_plane_end =
+                one_plane_end
+                    .checked_mul(4)
+                    .ok_or(MixedSinkerError::GeometryOverflow {
+                        width: w,
+                        height: h,
+                        channels: 4,
+                    })?;
+            let rgba_plane_start = one_plane_start * 4;
+            expand_rgb_to_rgba_row(rgb_row, &mut buf[rgba_plane_start..rgba_plane_end], w);
+        }
+
         Ok(())
     }
 }
@@ -2396,6 +2588,33 @@ impl PixelSink for MixedSinker<'_, Nv24> {
 // Structurally identical to the Nv24 impl — the row primitive hides
 // the V/U byte-order difference.
 
+impl<'a> MixedSinker<'a, Nv42> {
+    /// Attaches a packed 32‑bit RGBA output buffer.
+    ///
+    /// See [`MixedSinker::<Nv24>::with_rgba`] for the same rationale and
+    /// constraints; Nv42 differs only in chroma byte order (V before U).
+    /// Returns `Err(RgbaBufferTooShort)` if `buf.len() < width × height × 4`.
+    #[cfg_attr(not(tarpaulin), inline(always))]
+    pub fn with_rgba(mut self, buf: &'a mut [u8]) -> Result<Self, MixedSinkerError> {
+        self.set_rgba(buf)?;
+        Ok(self)
+    }
+
+    /// In-place variant of [`with_rgba`](Self::with_rgba).
+    #[cfg_attr(not(tarpaulin), inline(always))]
+    pub fn set_rgba(&mut self, buf: &'a mut [u8]) -> Result<&mut Self, MixedSinkerError> {
+        let expected = self.frame_bytes(4)?;
+        if buf.len() < expected {
+            return Err(MixedSinkerError::RgbaBufferTooShort {
+                expected,
+                actual: buf.len(),
+            });
+        }
+        self.rgba = Some(buf);
+        Ok(self)
+    }
+}
+
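Since this same three-flag resolution now appears in every wired impl, a compact pure-logic sketch of the decision table may help reviewers (plain Rust; the names mirror the locals above, and the strings are descriptive only):

// Per-row output-mode resolution shared by the wired PixelSink impls.
fn mode(want_rgb: bool, want_rgba: bool, want_hsv: bool) -> &'static str {
    let need_rgb_kernel = want_rgb || want_hsv;
    if want_rgba && !need_rgb_kernel {
        "dedicated RGBA kernel (4 bpp store), early return"
    } else if need_rgb_kernel && want_rgba {
        "RGB kernel once, then expand_rgb_to_rgba_row fan-out"
    } else if need_rgb_kernel {
        "RGB kernel only"
    } else {
        "luma only, nothing else to do"
    }
}

fn main() {
    assert_eq!(mode(false, true, false), "dedicated RGBA kernel (4 bpp store), early return");
    assert_eq!(mode(true, true, false), "RGB kernel once, then expand_rgb_to_rgba_row fan-out");
}
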
 impl Nv42Sink for MixedSinker<'_, Nv42> {}
 
 impl PixelSink for MixedSinker<'_, Nv42> {
@@ -2442,6 +2661,7 @@ impl PixelSink for MixedSinker<'_, Nv42> {
         let Self {
             rgb,
+            rgba,
             luma,
             hsv,
             rgb_scratch,
@@ -2456,8 +2676,34 @@ impl PixelSink for MixedSinker<'_, Nv42> {
         }
 
         let want_rgb = rgb.is_some();
+        let want_rgba = rgba.is_some();
         let want_hsv = hsv.is_some();
-        if !want_rgb && !want_hsv {
+        let need_rgb_kernel = want_rgb || want_hsv;
+
+        if want_rgba && !need_rgb_kernel {
+            let rgba_buf = rgba.as_deref_mut().unwrap();
+            let rgba_plane_end =
+                one_plane_end
+                    .checked_mul(4)
+                    .ok_or(MixedSinkerError::GeometryOverflow {
+                        width: w,
+                        height: h,
+                        channels: 4,
+                    })?;
+            let rgba_plane_start = one_plane_start * 4;
+            nv42_to_rgba_row(
+                row.y(),
+                row.vu(),
+                &mut rgba_buf[rgba_plane_start..rgba_plane_end],
+                w,
+                row.matrix(),
+                row.full_range(),
+                use_simd,
+            );
+            return Ok(());
+        }
+
+        if !need_rgb_kernel {
             return Ok(());
         }
@@ -2507,6 +2753,20 @@ impl PixelSink for MixedSinker<'_, Nv42> {
                 use_simd,
             );
         }
+
+        if let Some(buf) = rgba.as_deref_mut() {
+            let rgba_plane_end =
+                one_plane_end
+                    .checked_mul(4)
+                    .ok_or(MixedSinkerError::GeometryOverflow {
+                        width: w,
+                        height: h,
+                        channels: 4,
+                    })?;
+            let rgba_plane_start = one_plane_start * 4;
+            expand_rgb_to_rgba_row(rgb_row, &mut buf[rgba_plane_start..rgba_plane_end], w);
+        }
+
         Ok(())
     }
 }
@@ -9876,6 +10136,386 @@ mod tests {
             );
     }
 
+    // ---- Nv24/Nv42 RGBA (Ship 8 PR 4b) tests --------------------------------
+
+    #[test]
+    #[cfg_attr(
+        miri,
+        ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri"
+    )]
+    fn nv24_rgba_only_converts_gray_to_gray_with_opaque_alpha() {
+        let (yp, uvp) = solid_nv24_frame(16, 8, 128, 128, 128);
+        let src = Nv24Frame::new(&yp, &uvp, 16, 8, 16, 32);
+
+        let mut rgba = std::vec![0u8; 16 * 8 * 4];
+        let mut sink = MixedSinker::<Nv24>::new(16, 8)
+            .with_rgba(&mut rgba)
+            .unwrap();
+        nv24_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap();
+
+        for px in rgba.chunks(4) {
+            assert!(px[0].abs_diff(128) <= 1, "R");
+            assert_eq!(px[0], px[1], "RGB monochromatic");
+            assert_eq!(px[1], px[2], "RGB monochromatic");
+            assert_eq!(px[3], 0xFF, "alpha must default to opaque");
+        }
+    }
+
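Why gray input is the right smoke test: with U = V = 128 every chroma delta is zero, so regardless of matrix the row kernel reduces to R = G = B = f(Y). A one-line arithmetic sketch (plain Rust, mirroring the `u_d`/`v_d` midpoint deltas in the scalar kernel):

fn main() {
    let (u, v) = (128i32, 128i32);
    // The kernels work on deltas from the 128 chroma midpoint; zero
    // deltas zero out every chroma term, leaving only the Y path.
    assert_eq!((u - 128, v - 128), (0, 0));
}
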
+    #[test]
+    #[cfg_attr(
+        miri,
+        ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri"
+    )]
+    fn nv24_with_rgb_and_with_rgba_produce_byte_identical_rgb_bytes() {
+        // Strategy A invariant: when both RGB and RGBA are attached, the
+        // RGBA bytes must be byte-for-byte identical to the RGB row +
+        // 0xFF alpha. This is the cross-format guarantee that holds even
+        // after we replaced the dual-kernel path with the
+        // expand_rgb_to_rgba_row fan-out.
+        let w = 32u32;
+        let h = 16u32;
+        let ws = w as usize;
+        let hs = h as usize;
+        let (yp, uvp) = solid_nv24_frame(w, h, 180, 60, 200);
+        let src = Nv24Frame::new(&yp, &uvp, w, h, w, 2 * w);
+
+        let mut rgb = std::vec![0u8; ws * hs * 3];
+        let mut rgba = std::vec![0u8; ws * hs * 4];
+        let mut sink = MixedSinker::<Nv24>::new(ws, hs)
+            .with_rgb(&mut rgb)
+            .unwrap()
+            .with_rgba(&mut rgba)
+            .unwrap();
+        nv24_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap();
+
+        for i in 0..(ws * hs) {
+            assert_eq!(rgba[i * 4], rgb[i * 3], "R differs at pixel {i}");
+            assert_eq!(rgba[i * 4 + 1], rgb[i * 3 + 1], "G differs at pixel {i}");
+            assert_eq!(rgba[i * 4 + 2], rgb[i * 3 + 2], "B differs at pixel {i}");
+            assert_eq!(rgba[i * 4 + 3], 0xFF, "A not opaque at pixel {i}");
+        }
+    }
+
+    #[test]
+    fn nv24_rgba_buffer_too_short_returns_err() {
+        let mut rgba_short = std::vec![0u8; 16 * 8 * 4 - 1];
+        let result = MixedSinker::<Nv24>::new(16, 8).with_rgba(&mut rgba_short);
+        let Err(err) = result else {
+            panic!("expected RgbaBufferTooShort error");
+        };
+        assert!(matches!(
+            err,
+            MixedSinkerError::RgbaBufferTooShort {
+                expected: 512,
+                actual: 511,
+            }
+        ));
+    }
+
+    #[test]
+    #[cfg_attr(
+        miri,
+        ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri"
+    )]
+    fn nv24_rgba_simd_matches_scalar_with_random_yuv() {
+        // Width 1922 forces both the SIMD main loop AND scalar tail across
+        // every backend block size (16/32/64).
+        let w = 1922usize;
+        let h = 4usize;
+        let mut yp = std::vec![0u8; w * h];
+        let mut uvp = std::vec![0u8; 2 * w * h];
+        pseudo_random_u8(&mut yp, 0xC001_C0DE);
+        pseudo_random_u8(&mut uvp, 0xCAFE_F00D);
+        let src = Nv24Frame::new(&yp, &uvp, w as u32, h as u32, w as u32, (2 * w) as u32);
+
+        for &matrix in &[
+            ColorMatrix::Bt601,
+            ColorMatrix::Bt709,
+            ColorMatrix::Bt2020Ncl,
+            ColorMatrix::YCgCo,
+        ] {
+            for &full_range in &[true, false] {
+                let mut rgba_simd = std::vec![0u8; w * h * 4];
+                let mut rgba_scalar = std::vec![0u8; w * h * 4];
+
+                let mut s_simd = MixedSinker::<Nv24>::new(w, h)
+                    .with_rgba(&mut rgba_simd)
+                    .unwrap();
+                nv24_to(&src, full_range, matrix, &mut s_simd).unwrap();
+
+                let mut s_scalar = MixedSinker::<Nv24>::new(w, h)
+                    .with_rgba(&mut rgba_scalar)
+                    .unwrap();
+                s_scalar.set_simd(false);
+                nv24_to(&src, full_range, matrix, &mut s_scalar).unwrap();
+
+                assert_eq!(
+                    rgba_simd, rgba_scalar,
+                    "Nv24 RGBA SIMD ≠ scalar (matrix={matrix:?}, full_range={full_range})"
+                );
+            }
+        }
+    }
+
+    #[test]
+    #[cfg_attr(
+        miri,
+        ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri"
+    )]
+    fn nv42_rgba_only_converts_gray_to_gray_with_opaque_alpha() {
+        let (yp, vup) = solid_nv42_frame(16, 8, 128, 128, 128);
+        let src = Nv42Frame::new(&yp, &vup, 16, 8, 16, 32);
+
+        let mut rgba = std::vec![0u8; 16 * 8 * 4];
+        let mut sink = MixedSinker::<Nv42>::new(16, 8)
+            .with_rgba(&mut rgba)
+            .unwrap();
+        nv42_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap();
+
+        for px in rgba.chunks(4) {
+            assert!(px[0].abs_diff(128) <= 1);
+            assert_eq!(px[0], px[1]);
+            assert_eq!(px[1], px[2]);
+            assert_eq!(px[3], 0xFF);
+        }
+    }
+
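The 1922-pixel width in the SIMD-vs-scalar tests is easy to sanity-check: it leaves the same 2-pixel scalar tail for every vector block size named in the test comment. A tiny verification of the arithmetic (plain Rust):

fn main() {
    // 1922 = 120×16 + 2 = 60×32 + 2 = 30×64 + 2, so each backend's
    // main loop runs and a 2-pixel tail exercises the scalar path.
    for block in [16usize, 32, 64] {
        assert_eq!(1922 % block, 2, "tail for block size {block}");
    }
}
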
+    #[test]
+    #[cfg_attr(
+        miri,
+        ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri"
+    )]
+    fn nv42_with_rgb_and_with_rgba_produce_byte_identical_rgb_bytes() {
+        let w = 32u32;
+        let h = 16u32;
+        let ws = w as usize;
+        let hs = h as usize;
+        let (yp, vup) = solid_nv42_frame(w, h, 180, 60, 200);
+        let src = Nv42Frame::new(&yp, &vup, w, h, w, 2 * w);
+
+        let mut rgb = std::vec![0u8; ws * hs * 3];
+        let mut rgba = std::vec![0u8; ws * hs * 4];
+        let mut sink = MixedSinker::<Nv42>::new(ws, hs)
+            .with_rgb(&mut rgb)
+            .unwrap()
+            .with_rgba(&mut rgba)
+            .unwrap();
+        nv42_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap();
+
+        for i in 0..(ws * hs) {
+            assert_eq!(rgba[i * 4], rgb[i * 3]);
+            assert_eq!(rgba[i * 4 + 1], rgb[i * 3 + 1]);
+            assert_eq!(rgba[i * 4 + 2], rgb[i * 3 + 2]);
+            assert_eq!(rgba[i * 4 + 3], 0xFF);
+        }
+    }
+
+    #[test]
+    fn nv42_rgba_buffer_too_short_returns_err() {
+        let mut rgba_short = std::vec![0u8; 16 * 8 * 4 - 1];
+        let result = MixedSinker::<Nv42>::new(16, 8).with_rgba(&mut rgba_short);
+        let Err(err) = result else {
+            panic!("expected RgbaBufferTooShort error");
+        };
+        assert!(matches!(
+            err,
+            MixedSinkerError::RgbaBufferTooShort {
+                expected: 512,
+                actual: 511,
+            }
+        ));
+    }
+
+    #[test]
+    #[cfg_attr(
+        miri,
+        ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri"
+    )]
+    fn nv42_rgba_simd_matches_scalar_with_random_yuv() {
+        let w = 1922usize;
+        let h = 4usize;
+        let mut yp = std::vec![0u8; w * h];
+        let mut vup = std::vec![0u8; 2 * w * h];
+        pseudo_random_u8(&mut yp, 0xC001_C0DE);
+        pseudo_random_u8(&mut vup, 0xCAFE_F00D);
+        let src = Nv42Frame::new(&yp, &vup, w as u32, h as u32, w as u32, (2 * w) as u32);
+
+        for &matrix in &[
+            ColorMatrix::Bt601,
+            ColorMatrix::Bt709,
+            ColorMatrix::Bt2020Ncl,
+            ColorMatrix::YCgCo,
+        ] {
+            for &full_range in &[true, false] {
+                let mut rgba_simd = std::vec![0u8; w * h * 4];
+                let mut rgba_scalar = std::vec![0u8; w * h * 4];
+
+                let mut s_simd = MixedSinker::<Nv42>::new(w, h)
+                    .with_rgba(&mut rgba_simd)
+                    .unwrap();
+                nv42_to(&src, full_range, matrix, &mut s_simd).unwrap();
+
+                let mut s_scalar = MixedSinker::<Nv42>::new(w, h)
+                    .with_rgba(&mut rgba_scalar)
+                    .unwrap();
+                s_scalar.set_simd(false);
+                nv42_to(&src, full_range, matrix, &mut s_scalar).unwrap();
+
+                assert_eq!(
+                    rgba_simd, rgba_scalar,
+                    "Nv42 RGBA SIMD ≠ scalar (matrix={matrix:?}, full_range={full_range})"
+                );
+            }
+        }
+    }
+
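The cross-format test that follows asserts one invariant per pixel; stated standalone (plain Rust, independent of the crate, helper name illustrative):

// RGBA must equal the RGB triple plus a forced 0xFF alpha byte.
fn rgba_matches_rgb(rgb: &[u8], rgba: &[u8]) -> bool {
    rgb.chunks_exact(3)
        .zip(rgba.chunks_exact(4))
        .all(|(p3, p4)| p4[..3] == *p3 && p4[3] == 0xFF)
}

fn main() {
    assert!(rgba_matches_rgb(&[1, 2, 3], &[1, 2, 3, 0xFF]));
    assert!(!rgba_matches_rgb(&[1, 2, 3], &[1, 2, 3, 0x00]));
}
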
+    // Cross-format Strategy A invariant: when both RGB+RGBA are
+    // attached, all 8 wired families derive RGBA from the RGB row via
+    // expand_rgb_to_rgba_row. This test runs all 8 process methods with
+    // the same gray input and asserts every RGBA sample equals the RGB
+    // sample with alpha = 0xFF — proving the fan-out shape never
+    // diverges from the kernel output.
+    #[test]
+    #[cfg_attr(
+        miri,
+        ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri"
+    )]
+    fn strategy_a_rgb_and_rgba_byte_identical_for_all_wired_families() {
+        let w: u32 = 32;
+        let h: u32 = 8;
+        let ws = w as usize;
+        let hs = h as usize;
+
+        let assert_match = |rgb: &[u8], rgba: &[u8], who: &str| {
+            for i in 0..(ws * hs) {
+                assert_eq!(rgba[i * 4], rgb[i * 3], "{who}: R differs at px {i}");
+                assert_eq!(
+                    rgba[i * 4 + 1],
+                    rgb[i * 3 + 1],
+                    "{who}: G differs at px {i}"
+                );
+                assert_eq!(
+                    rgba[i * 4 + 2],
+                    rgb[i * 3 + 2],
+                    "{who}: B differs at px {i}"
+                );
+                assert_eq!(rgba[i * 4 + 3], 0xFF, "{who}: alpha not opaque at px {i}");
+            }
+        };
+
+        {
+            let (yp, up, vp) = solid_yuv420p_frame(w, h, 200, 128, 128);
+            let src = Yuv420pFrame::new(&yp, &up, &vp, w, h, w, w / 2, w / 2);
+            let mut rgb = std::vec![0u8; ws * hs * 3];
+            let mut rgba = std::vec![0u8; ws * hs * 4];
+            let mut sink = MixedSinker::<Yuv420p>::new(ws, hs)
+                .with_rgb(&mut rgb)
+                .unwrap()
+                .with_rgba(&mut rgba)
+                .unwrap();
+            yuv420p_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap();
+            assert_match(&rgb, &rgba, "Yuv420p");
+        }
+
+        {
+            let (yp, up, vp) = solid_yuv422p_frame(w, h, 200, 128, 128);
+            let src = Yuv422pFrame::new(&yp, &up, &vp, w, h, w, w / 2, w / 2);
+            let mut rgb = std::vec![0u8; ws * hs * 3];
+            let mut rgba = std::vec![0u8; ws * hs * 4];
+            let mut sink = MixedSinker::<Yuv422p>::new(ws, hs)
+                .with_rgb(&mut rgb)
+                .unwrap()
+                .with_rgba(&mut rgba)
+                .unwrap();
+            yuv422p_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap();
+            assert_match(&rgb, &rgba, "Yuv422p");
+        }
+
+        {
+            let (yp, up, vp) = solid_yuv444p_frame(w, h, 200, 128, 128);
+            let src = Yuv444pFrame::new(&yp, &up, &vp, w, h, w, w, w);
+            let mut rgb = std::vec![0u8; ws * hs * 3];
+            let mut rgba = std::vec![0u8; ws * hs * 4];
+            let mut sink = MixedSinker::<Yuv444p>::new(ws, hs)
+                .with_rgb(&mut rgb)
+                .unwrap()
+                .with_rgba(&mut rgba)
+                .unwrap();
+            yuv444p_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap();
+            assert_match(&rgb, &rgba, "Yuv444p");
+        }
+
+        {
+            let (yp, uvp) = solid_nv12_frame(w, h, 200, 128, 128);
+            let src = Nv12Frame::new(&yp, &uvp, w, h, w, w);
+            let mut rgb = std::vec![0u8; ws * hs * 3];
+            let mut rgba = std::vec![0u8; ws * hs * 4];
+            let mut sink = MixedSinker::<Nv12>::new(ws, hs)
+                .with_rgb(&mut rgb)
+                .unwrap()
+                .with_rgba(&mut rgba)
+                .unwrap();
+            nv12_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap();
+            assert_match(&rgb, &rgba, "Nv12");
+        }
+
+        {
+            let (yp, vup) = solid_nv21_frame(w, h, 200, 128, 128);
+            let src = Nv21Frame::new(&yp, &vup, w, h, w, w);
+            let mut rgb = std::vec![0u8; ws * hs * 3];
+            let mut rgba = std::vec![0u8; ws * hs * 4];
+            let mut sink = MixedSinker::<Nv21>::new(ws, hs)
+                .with_rgb(&mut rgb)
+                .unwrap()
+                .with_rgba(&mut rgba)
+                .unwrap();
+            nv21_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap();
+            assert_match(&rgb, &rgba, "Nv21");
+        }
+
+        {
+            let (yp, uvp) = solid_nv16_frame(w, h, 200, 128, 128);
+            let src = Nv16Frame::new(&yp, &uvp, w, h, w, w);
+            let mut rgb = std::vec![0u8; ws * hs * 3];
+            let mut rgba = std::vec![0u8; ws * hs * 4];
+            let mut sink = MixedSinker::<Nv16>::new(ws, hs)
+                .with_rgb(&mut rgb)
+                .unwrap()
+                .with_rgba(&mut rgba)
+                .unwrap();
+            nv16_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap();
+            assert_match(&rgb, &rgba, "Nv16");
+        }
+
+        {
+            let (yp, uvp) = solid_nv24_frame(w, h, 200, 128, 128);
+            let src = Nv24Frame::new(&yp, &uvp, w, h, w, 2 * w);
+            let mut rgb = std::vec![0u8; ws * hs * 3];
+            let mut rgba = std::vec![0u8; ws * hs * 4];
+            let mut sink = MixedSinker::<Nv24>::new(ws, hs)
+                .with_rgb(&mut rgb)
+                .unwrap()
+                .with_rgba(&mut rgba)
+                .unwrap();
+            nv24_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap();
+            assert_match(&rgb, &rgba, "Nv24");
+        }
+
+        {
+            let (yp, vup) = solid_nv42_frame(w, h, 200, 128, 128);
+            let src = Nv42Frame::new(&yp, &vup, w, h, w, 2 * w);
+            let mut rgb = std::vec![0u8; ws * hs * 3];
+            let mut rgba = std::vec![0u8; ws * hs * 4];
+            let mut sink = MixedSinker::<Nv42>::new(ws, hs)
+                .with_rgb(&mut rgb)
+                .unwrap()
+                .with_rgba(&mut rgba)
+                .unwrap();
+            nv42_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap();
+            assert_match(&rgb, &rgba, "Nv42");
+        }
+    }
+
     // ---- Yuv420p10 --------------------------------------------------------
 
     fn solid_yuv420p10_frame(