From 44a23b9037bd5bef0f7358c68e3626b9512582bc Mon Sep 17 00:00:00 2001 From: uqio <276879906+uqio@users.noreply.github.com> Date: Sun, 26 Apr 2026 15:53:47 +1200 Subject: [PATCH 1/2] update --- src/row/arch/neon.rs | 288 ++++++++++++++++---- src/row/arch/wasm_simd128.rs | 237 ++++++++++++++--- src/row/arch/x86_avx2.rs | 281 +++++++++++++++++--- src/row/arch/x86_avx512.rs | 281 +++++++++++++++++--- src/row/arch/x86_sse41.rs | 278 +++++++++++++++++--- src/row/mod.rs | 144 ++++++++++ src/row/scalar.rs | 94 +++++-- src/sinker/mixed.rs | 490 +++++++++++++++++++++++++++++++++-- 8 files changed, 1844 insertions(+), 249 deletions(-) diff --git a/src/row/arch/neon.rs b/src/row/arch/neon.rs index 24f4ae1..ae95884 100644 --- a/src/row/arch/neon.rs +++ b/src/row/arch/neon.rs @@ -1078,12 +1078,9 @@ pub(crate) unsafe fn p_n_to_rgb_u16_row( } } -/// NEON NV12 → packed RGB (UV-ordered chroma). Thin wrapper over the -/// shared [`nv12_or_nv21_to_rgb_row_impl`] with `SWAP_UV = false`. -/// -/// # Safety -/// -/// Same as [`nv12_or_nv21_to_rgb_row_impl`]. +/// NEON NV12 → packed RGB (UV-ordered chroma). Thin wrapper over +/// [`nv12_or_nv21_to_rgb_or_rgba_row_impl`] with +/// `SWAP_UV = false, ALPHA = false`. #[inline] #[target_feature(enable = "neon")] pub(crate) unsafe fn nv12_to_rgb_row( @@ -1096,16 +1093,15 @@ pub(crate) unsafe fn nv12_to_rgb_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - nv12_or_nv21_to_rgb_row_impl::(y, uv_half, rgb_out, width, matrix, full_range); + nv12_or_nv21_to_rgb_or_rgba_row_impl::( + y, uv_half, rgb_out, width, matrix, full_range, + ); } } /// NEON NV21 → packed RGB (VU-ordered chroma). Thin wrapper over -/// [`nv12_or_nv21_to_rgb_row_impl`] with `SWAP_UV = true`. -/// -/// # Safety -/// -/// Same as [`nv12_or_nv21_to_rgb_row_impl`]. +/// [`nv12_or_nv21_to_rgb_or_rgba_row_impl`] with +/// `SWAP_UV = true, ALPHA = false`. #[inline] #[target_feature(enable = "neon")] pub(crate) unsafe fn nv21_to_rgb_row( @@ -1118,15 +1114,62 @@ pub(crate) unsafe fn nv21_to_rgb_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - nv12_or_nv21_to_rgb_row_impl::(y, vu_half, rgb_out, width, matrix, full_range); + nv12_or_nv21_to_rgb_or_rgba_row_impl::( + y, vu_half, rgb_out, width, matrix, full_range, + ); + } +} + +/// NEON NV12 → packed RGBA (R, G, B, `0xFF` per pixel). Same +/// contract as [`nv12_to_rgb_row`] but writes 4 bytes per pixel via +/// `vst4q_u8`. `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "neon")] +pub(crate) unsafe fn nv12_to_rgba_row( + y: &[u8], + uv_half: &[u8], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + nv12_or_nv21_to_rgb_or_rgba_row_impl::( + y, uv_half, rgba_out, width, matrix, full_range, + ); + } +} + +/// NEON NV21 → packed RGBA (R, G, B, `0xFF` per pixel). Same +/// contract as [`nv21_to_rgb_row`] but writes 4 bytes per pixel via +/// `vst4q_u8`. `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "neon")] +pub(crate) unsafe fn nv21_to_rgba_row( + y: &[u8], + vu_half: &[u8], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + nv12_or_nv21_to_rgb_or_rgba_row_impl::( + y, vu_half, rgba_out, width, matrix, full_range, + ); } } -/// Shared NEON NV12/NV21 kernel. 
`SWAP_UV = false` selects NV12 -/// (even byte = U, odd = V); `SWAP_UV = true` selects NV21 (even = -/// V, odd = U). The const generic drives monomorphization — the -/// branch is eliminated in each instantiation and both wrappers -/// produce byte‑identical output to the scalar reference. +/// Shared NEON NV12/NV21 kernel at 3 bpp (RGB) or 4 bpp + opaque +/// alpha (RGBA). `SWAP_UV = false` selects NV12 (even byte = U, odd = +/// V); `SWAP_UV = true` selects NV21 (even = V, odd = U). `ALPHA = +/// true` writes via `vst4q_u8` with constant `0xFF` alpha; `ALPHA = +/// false` writes via `vst3q_u8`. Both const generics drive +/// compile-time monomorphization — branches are eliminated and each +/// of the four wrappers produces byte‑identical output to the scalar +/// reference. /// /// # Safety /// @@ -1135,17 +1178,20 @@ pub(crate) unsafe fn nv21_to_rgb_row( /// 2. `width & 1 == 0`. /// 3. `y.len() >= width`. /// 4. `uv_or_vu_half.len() >= width` (2 × (width / 2) interleaved bytes). -/// 5. `rgb_out.len() >= 3 * width`. +/// 5. `out.len() >= width * (if ALPHA { 4 } else { 3 })`. /// /// Bounds are `debug_assert`-checked; release builds trust the caller /// because the kernel uses unchecked pointer arithmetic (`vld1q_u8`, -/// `vld2_u8`, `vst3q_u8`). +/// `vld2_u8`, `vst3q_u8` / `vst4q_u8`). #[inline] #[target_feature(enable = "neon")] -unsafe fn nv12_or_nv21_to_rgb_row_impl( +pub(crate) unsafe fn nv12_or_nv21_to_rgb_or_rgba_row_impl< + const SWAP_UV: bool, + const ALPHA: bool, +>( y: &[u8], uv_or_vu_half: &[u8], - rgb_out: &mut [u8], + out: &mut [u8], width: usize, matrix: ColorMatrix, full_range: bool, @@ -1153,7 +1199,8 @@ unsafe fn nv12_or_nv21_to_rgb_row_impl( debug_assert_eq!(width & 1, 0, "NV12/NV21 require even width"); debug_assert!(y.len() >= width); debug_assert!(uv_or_vu_half.len() >= width); - debug_assert!(rgb_out.len() >= width * 3); + let bpp: usize = if ALPHA { 4 } else { 3 }; + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params(full_range); @@ -1174,6 +1221,9 @@ unsafe fn nv12_or_nv21_to_rgb_row_impl( let cgv = vdupq_n_s32(coeffs.g_v()); let cbu = vdupq_n_s32(coeffs.b_u()); let cbv = vdupq_n_s32(coeffs.b_v()); + // Constant opaque-alpha vector for the RGBA path; DCE'd when + // ALPHA = false. + let alpha_u8 = vdupq_n_u8(0xFF); let mut x = 0usize; while x + 16 <= width { @@ -1233,33 +1283,37 @@ unsafe fn nv12_or_nv21_to_rgb_row_impl( vqmovun_s16(vqaddq_s16(y_scaled_hi, r_dup_hi)), ); - let rgb = uint8x16x3_t(r_u8, g_u8, b_u8); - vst3q_u8(rgb_out.as_mut_ptr().add(x * 3), rgb); + if ALPHA { + let rgba = uint8x16x4_t(r_u8, g_u8, b_u8, alpha_u8); + vst4q_u8(out.as_mut_ptr().add(x * 4), rgba); + } else { + let rgb = uint8x16x3_t(r_u8, g_u8, b_u8); + vst3q_u8(out.as_mut_ptr().add(x * 3), rgb); + } x += 16; } // Scalar tail for the 0..14 leftover pixels. Dispatch to the - // matching scalar kernel based on SWAP_UV. + // matching scalar kernel based on SWAP_UV × ALPHA. 
if x < width { - if SWAP_UV { - scalar::nv21_to_rgb_row( - &y[x..width], - &uv_or_vu_half[x..width], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); - } else { - scalar::nv12_to_rgb_row( - &y[x..width], - &uv_or_vu_half[x..width], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_uv = &uv_or_vu_half[x..width]; + let tail_w = width - x; + let tail_out = &mut out[x * bpp..width * bpp]; + match (SWAP_UV, ALPHA) { + (false, false) => { + scalar::nv12_to_rgb_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range) + } + (true, false) => { + scalar::nv21_to_rgb_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range) + } + (false, true) => { + scalar::nv12_to_rgba_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range) + } + (true, true) => { + scalar::nv21_to_rgba_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range) + } } } } @@ -3699,6 +3753,152 @@ mod tests { } } + // ---- nv12_to_rgba_row / nv21_to_rgba_row equivalence ---------------- + // + // Direct backend tests for the new RGBA path, mirroring the RGB + // pattern above. Bypasses the dispatcher so the NEON `vst4q_u8` + // store is exercised regardless of what tier the dispatcher picks. + + fn check_nv12_rgba_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { + let y: std::vec::Vec = (0..width).map(|i| ((i * 37 + 11) & 0xFF) as u8).collect(); + let uv: std::vec::Vec = (0..width / 2) + .flat_map(|i| [((i * 53 + 23) & 0xFF) as u8, ((i * 71 + 91) & 0xFF) as u8]) + .collect(); + let mut rgba_scalar = std::vec![0u8; width * 4]; + let mut rgba_neon = std::vec![0u8; width * 4]; + + scalar::nv12_to_rgba_row(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + unsafe { + nv12_to_rgba_row(&y, &uv, &mut rgba_neon, width, matrix, full_range); + } + + if rgba_scalar != rgba_neon { + let first_diff = rgba_scalar + .iter() + .zip(rgba_neon.iter()) + .position(|(a, b)| a != b) + .unwrap(); + let pixel = first_diff / 4; + let channel = ["R", "G", "B", "A"][first_diff % 4]; + panic!( + "NEON NV12 RGBA diverges from scalar at byte {first_diff} (px {pixel} {channel}, width={width}, matrix={matrix:?}, full_range={full_range}): scalar={} neon={}", + rgba_scalar[first_diff], rgba_neon[first_diff] + ); + } + } + + fn check_nv21_rgba_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { + let y: std::vec::Vec = (0..width).map(|i| ((i * 37 + 11) & 0xFF) as u8).collect(); + let vu: std::vec::Vec = (0..width / 2) + .flat_map(|i| [((i * 53 + 23) & 0xFF) as u8, ((i * 71 + 91) & 0xFF) as u8]) + .collect(); + let mut rgba_scalar = std::vec![0u8; width * 4]; + let mut rgba_neon = std::vec![0u8; width * 4]; + + scalar::nv21_to_rgba_row(&y, &vu, &mut rgba_scalar, width, matrix, full_range); + unsafe { + nv21_to_rgba_row(&y, &vu, &mut rgba_neon, width, matrix, full_range); + } + + if rgba_scalar != rgba_neon { + let first_diff = rgba_scalar + .iter() + .zip(rgba_neon.iter()) + .position(|(a, b)| a != b) + .unwrap(); + let pixel = first_diff / 4; + let channel = ["R", "G", "B", "A"][first_diff % 4]; + panic!( + "NEON NV21 RGBA diverges from scalar at byte {first_diff} (px {pixel} {channel}, width={width}, matrix={matrix:?}, full_range={full_range}): scalar={} neon={}", + rgba_scalar[first_diff], rgba_neon[first_diff] + ); + } + } + + /// Cross-format invariant: NV12 RGBA must match Yuv420p RGBA on + /// equivalent UV bytes. Catches U/V swap regressions specific to + /// the new RGBA store path. 
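For reference, a minimal sketch (hypothetical sample values) of the layout relationship the invariant above relies on: NV12 interleaves the planar U and V rows UV-first, NV21 VU-first.

    // Planar 4:2:0 chroma for a 4-pixel row: 2 U samples, 2 V samples.
    let u = [90u8, 110];
    let v = [240u8, 16];
    // NV12 interleaves them UV-first; NV21 stores the same pairs VU-first.
    let uv: Vec<u8> = u.iter().zip(v.iter()).flat_map(|(&a, &b)| [a, b]).collect();
    assert_eq!(uv, [90, 240, 110, 16]);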
+ fn check_nv12_rgba_matches_yuv420p_rgba(width: usize, matrix: ColorMatrix, full_range: bool) { + let y: std::vec::Vec = (0..width).map(|i| ((i * 37 + 11) & 0xFF) as u8).collect(); + let u: std::vec::Vec = (0..width / 2) + .map(|i| ((i * 53 + 23) & 0xFF) as u8) + .collect(); + let v: std::vec::Vec = (0..width / 2) + .map(|i| ((i * 71 + 91) & 0xFF) as u8) + .collect(); + let uv: std::vec::Vec = u.iter().zip(v.iter()).flat_map(|(a, b)| [*a, *b]).collect(); + + let mut rgba_yuv420p = std::vec![0u8; width * 4]; + let mut rgba_nv12 = std::vec![0u8; width * 4]; + unsafe { + yuv_420_to_rgba_row(&y, &u, &v, &mut rgba_yuv420p, width, matrix, full_range); + nv12_to_rgba_row(&y, &uv, &mut rgba_nv12, width, matrix, full_range); + } + assert_eq!( + rgba_yuv420p, rgba_nv12, + "NEON NV12 RGBA must match Yuv420p RGBA for equivalent UV (width={width}, matrix={matrix:?}, full_range={full_range})" + ); + } + + #[test] + #[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] + fn nv12_neon_rgba_matches_scalar_all_matrices_16() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_nv12_rgba_equivalence(16, m, full); + } + } + } + + #[test] + #[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] + fn nv12_neon_rgba_matches_scalar_widths() { + for w in [18usize, 30, 34, 1920, 1922] { + check_nv12_rgba_equivalence(w, ColorMatrix::Bt601, false); + } + } + + #[test] + #[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] + fn nv12_neon_rgba_matches_yuv420p_rgba_neon() { + for w in [16usize, 30, 64, 1920] { + check_nv12_rgba_matches_yuv420p_rgba(w, ColorMatrix::Bt709, false); + check_nv12_rgba_matches_yuv420p_rgba(w, ColorMatrix::YCgCo, true); + } + } + + #[test] + #[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] + fn nv21_neon_rgba_matches_scalar_all_matrices_16() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_nv21_rgba_equivalence(16, m, full); + } + } + } + + #[test] + #[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] + fn nv21_neon_rgba_matches_scalar_widths() { + for w in [18usize, 30, 34, 1920, 1922] { + check_nv21_rgba_equivalence(w, ColorMatrix::Bt601, false); + } + } + // ---- nv24_to_rgb_row / nv42_to_rgb_row equivalence ------------------ fn check_nv24_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { diff --git a/src/row/arch/wasm_simd128.rs b/src/row/arch/wasm_simd128.rs index 2ff0619..f8a794d 100644 --- a/src/row/arch/wasm_simd128.rs +++ b/src/row/arch/wasm_simd128.rs @@ -1317,12 +1317,9 @@ unsafe fn deinterleave_uv_u16_wasm(ptr: *const u16) -> (v128, v128) { } } -/// WASM simd128 NV12 → packed RGB (UV-ordered chroma). Thin wrapper -/// over [`nv12_or_nv21_to_rgb_row_impl`] with `SWAP_UV = false`. -/// -/// # Safety -/// -/// Same as [`nv12_or_nv21_to_rgb_row_impl`]. +/// WASM simd128 NV12 → packed RGB. Thin wrapper over +/// [`nv12_or_nv21_to_rgb_or_rgba_row_impl`] with +/// `SWAP_UV = false, ALPHA = false`. #[inline] #[target_feature(enable = "simd128")] pub(crate) unsafe fn nv12_to_rgb_row( @@ -1333,18 +1330,16 @@ pub(crate) unsafe fn nv12_to_rgb_row( matrix: ColorMatrix, full_range: bool, ) { - // SAFETY: caller obligations forwarded to the shared impl. 
unsafe { - nv12_or_nv21_to_rgb_row_impl::(y, uv_half, rgb_out, width, matrix, full_range); + nv12_or_nv21_to_rgb_or_rgba_row_impl::( + y, uv_half, rgb_out, width, matrix, full_range, + ); } } -/// WASM simd128 NV21 → packed RGB (VU-ordered chroma). Thin wrapper -/// over [`nv12_or_nv21_to_rgb_row_impl`] with `SWAP_UV = true`. -/// -/// # Safety -/// -/// Same as [`nv12_or_nv21_to_rgb_row_impl`]. +/// WASM simd128 NV21 → packed RGB. Thin wrapper over +/// [`nv12_or_nv21_to_rgb_or_rgba_row_impl`] with +/// `SWAP_UV = true, ALPHA = false`. #[inline] #[target_feature(enable = "simd128")] pub(crate) unsafe fn nv21_to_rgb_row( @@ -1355,29 +1350,75 @@ pub(crate) unsafe fn nv21_to_rgb_row( matrix: ColorMatrix, full_range: bool, ) { - // SAFETY: caller obligations forwarded to the shared impl. unsafe { - nv12_or_nv21_to_rgb_row_impl::(y, vu_half, rgb_out, width, matrix, full_range); + nv12_or_nv21_to_rgb_or_rgba_row_impl::( + y, vu_half, rgb_out, width, matrix, full_range, + ); + } +} + +/// WASM simd128 NV12 → packed RGBA. Same contract as +/// [`nv12_to_rgb_row`] but writes 4 bytes per pixel via +/// [`write_rgba_16`]. `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "simd128")] +pub(crate) unsafe fn nv12_to_rgba_row( + y: &[u8], + uv_half: &[u8], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + unsafe { + nv12_or_nv21_to_rgb_or_rgba_row_impl::( + y, uv_half, rgba_out, width, matrix, full_range, + ); + } +} + +/// WASM simd128 NV21 → packed RGBA. Same contract as +/// [`nv21_to_rgb_row`] but writes 4 bytes per pixel via +/// [`write_rgba_16`]. `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "simd128")] +pub(crate) unsafe fn nv21_to_rgba_row( + y: &[u8], + vu_half: &[u8], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + unsafe { + nv12_or_nv21_to_rgb_or_rgba_row_impl::( + y, vu_half, rgba_out, width, matrix, full_range, + ); } } -/// Shared wasm simd128 NV12/NV21 kernel. `SWAP_UV` selects chroma -/// byte order at compile time. +/// Shared wasm simd128 NV12/NV21 kernel at 3 bpp (RGB) or 4 bpp + +/// opaque alpha (RGBA). `SWAP_UV` selects chroma byte order; +/// `ALPHA = true` writes via [`write_rgba_16`], `ALPHA = false` via +/// [`write_rgb_16`]. Both const generics drive compile-time +/// monomorphization. /// /// # Safety /// -/// 1. **simd128 must be enabled at compile time** (same obligation as -/// [`yuv_420_to_rgb_row`]). +/// 1. **simd128 must be enabled at compile time.** /// 2. `width & 1 == 0`. /// 3. `y.len() >= width`. /// 4. `uv_or_vu_half.len() >= width` (16 interleaved bytes per 16 Y pixels). -/// 5. `rgb_out.len() >= 3 * width`. +/// 5. `out.len() >= width * (if ALPHA { 4 } else { 3 })`. 
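A minimal standalone sketch of the `ALPHA` const-generic store pattern shared by these kernels (hypothetical one-pixel helper; the vector kernels do the equivalent 16 pixels at a time via `write_rgb_16` / `write_rgba_16`):

    // `ALPHA` is resolved at compile time, so each monomorphized
    // instantiation keeps a single store path with a fixed stride.
    fn store_px<const ALPHA: bool>(out: &mut [u8], i: usize, r: u8, g: u8, b: u8) {
        let bpp = if ALPHA { 4 } else { 3 };
        out[i * bpp] = r;
        out[i * bpp + 1] = g;
        out[i * bpp + 2] = b;
        if ALPHA {
            out[i * bpp + 3] = 0xFF; // constant opaque alpha
        }
    }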
#[inline] #[target_feature(enable = "simd128")] -unsafe fn nv12_or_nv21_to_rgb_row_impl( +pub(crate) unsafe fn nv12_or_nv21_to_rgb_or_rgba_row_impl< + const SWAP_UV: bool, + const ALPHA: bool, +>( y: &[u8], uv_or_vu_half: &[u8], - rgb_out: &mut [u8], + out: &mut [u8], width: usize, matrix: ColorMatrix, full_range: bool, @@ -1385,7 +1426,8 @@ unsafe fn nv12_or_nv21_to_rgb_row_impl( debug_assert_eq!(width & 1, 0, "NV12/NV21 require even width"); debug_assert!(y.len() >= width); debug_assert!(uv_or_vu_half.len() >= width); - debug_assert!(rgb_out.len() >= width * 3); + let bpp: usize = if ALPHA { 4 } else { 3 }; + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params(full_range); @@ -1407,6 +1449,7 @@ unsafe fn nv12_or_nv21_to_rgb_row_impl( let cgv = i32x4_splat(coeffs.g_v()); let cbu = i32x4_splat(coeffs.b_u()); let cbv = i32x4_splat(coeffs.b_v()); + let alpha_u8 = u8x16_splat(0xFF); let mut x = 0usize; while x + 16 <= width { @@ -1502,30 +1545,33 @@ unsafe fn nv12_or_nv21_to_rgb_row_impl( let g_u8 = u8x16_narrow_i16x8(g_lo, g_hi); let r_u8 = u8x16_narrow_i16x8(r_lo, r_hi); - write_rgb_16(r_u8, g_u8, b_u8, rgb_out.as_mut_ptr().add(x * 3)); + if ALPHA { + write_rgba_16(r_u8, g_u8, b_u8, alpha_u8, out.as_mut_ptr().add(x * 4)); + } else { + write_rgb_16(r_u8, g_u8, b_u8, out.as_mut_ptr().add(x * 3)); + } x += 16; } if x < width { - if SWAP_UV { - scalar::nv21_to_rgb_row( - &y[x..width], - &uv_or_vu_half[x..width], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); - } else { - scalar::nv12_to_rgb_row( - &y[x..width], - &uv_or_vu_half[x..width], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_uv = &uv_or_vu_half[x..width]; + let tail_w = width - x; + let tail_out = &mut out[x * bpp..width * bpp]; + match (SWAP_UV, ALPHA) { + (false, false) => { + scalar::nv12_to_rgb_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range) + } + (true, false) => { + scalar::nv21_to_rgb_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range) + } + (false, true) => { + scalar::nv12_to_rgba_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range) + } + (true, true) => { + scalar::nv21_to_rgba_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range) + } } } } @@ -3877,6 +3923,111 @@ mod tests { check_nv21_matches_nv12_swapped(w, ColorMatrix::YCgCo, true); } } + + // ---- nv12_to_rgba_row / nv21_to_rgba_row equivalence ---------------- + + fn check_nv12_rgba_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { + let y: std::vec::Vec = (0..width).map(|i| ((i * 37 + 11) & 0xFF) as u8).collect(); + let uv: std::vec::Vec = (0..width / 2) + .flat_map(|i| [((i * 53 + 23) & 0xFF) as u8, ((i * 71 + 91) & 0xFF) as u8]) + .collect(); + let mut rgba_scalar = std::vec![0u8; width * 4]; + let mut rgba_wasm = std::vec![0u8; width * 4]; + + scalar::nv12_to_rgba_row(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + unsafe { + nv12_to_rgba_row(&y, &uv, &mut rgba_wasm, width, matrix, full_range); + } + + if rgba_scalar != rgba_wasm { + let first_diff = rgba_scalar + .iter() + .zip(rgba_wasm.iter()) + .position(|(a, b)| a != b) + .unwrap(); + let pixel = first_diff / 4; + let channel = ["R", "G", "B", "A"][first_diff % 4]; + panic!( + "wasm simd128 NV12 RGBA diverges from scalar at byte {first_diff} (px {pixel} {channel}, width={width}, matrix={matrix:?}, full_range={full_range}): scalar={} wasm={}", + 
rgba_scalar[first_diff], rgba_wasm[first_diff] + ); + } + } + + fn check_nv21_rgba_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { + let y: std::vec::Vec = (0..width).map(|i| ((i * 37 + 11) & 0xFF) as u8).collect(); + let vu: std::vec::Vec = (0..width / 2) + .flat_map(|i| [((i * 53 + 23) & 0xFF) as u8, ((i * 71 + 91) & 0xFF) as u8]) + .collect(); + let mut rgba_scalar = std::vec![0u8; width * 4]; + let mut rgba_wasm = std::vec![0u8; width * 4]; + + scalar::nv21_to_rgba_row(&y, &vu, &mut rgba_scalar, width, matrix, full_range); + unsafe { + nv21_to_rgba_row(&y, &vu, &mut rgba_wasm, width, matrix, full_range); + } + + if rgba_scalar != rgba_wasm { + let first_diff = rgba_scalar + .iter() + .zip(rgba_wasm.iter()) + .position(|(a, b)| a != b) + .unwrap(); + let pixel = first_diff / 4; + let channel = ["R", "G", "B", "A"][first_diff % 4]; + panic!( + "wasm simd128 NV21 RGBA diverges from scalar at byte {first_diff} (px {pixel} {channel}, width={width}, matrix={matrix:?}, full_range={full_range}): scalar={} wasm={}", + rgba_scalar[first_diff], rgba_wasm[first_diff] + ); + } + } + + #[test] + fn nv12_wasm_rgba_matches_scalar_all_matrices_16() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_nv12_rgba_equivalence(16, m, full); + } + } + } + + #[test] + fn nv12_wasm_rgba_matches_scalar_widths() { + for w in [18usize, 30, 34, 1920, 1922] { + check_nv12_rgba_equivalence(w, ColorMatrix::Bt601, false); + } + } + + #[test] + fn nv21_wasm_rgba_matches_scalar_all_matrices_16() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_nv21_rgba_equivalence(16, m, full); + } + } + } + + #[test] + fn nv21_wasm_rgba_matches_scalar_widths() { + for w in [18usize, 30, 34, 1920, 1922] { + check_nv21_rgba_equivalence(w, ColorMatrix::Bt601, false); + } + } + // ---- rgb_to_hsv_row equivalence -------------------------------------- fn check_hsv_equivalence(rgb: &[u8], width: usize) { diff --git a/src/row/arch/x86_avx2.rs b/src/row/arch/x86_avx2.rs index 54742dc..3312a79 100644 --- a/src/row/arch/x86_avx2.rs +++ b/src/row/arch/x86_avx2.rs @@ -1523,12 +1523,9 @@ unsafe fn deinterleave_uv_u16_avx2(ptr: *const u16) -> (__m256i, __m256i) { } } -/// AVX2 NV12 → packed RGB (UV-ordered chroma). Thin wrapper over -/// [`nv12_or_nv21_to_rgb_row_impl`] with `SWAP_UV = false`. -/// -/// # Safety -/// -/// Same as [`nv12_or_nv21_to_rgb_row_impl`]. +/// AVX2 NV12 → packed RGB. Thin wrapper over +/// [`nv12_or_nv21_to_rgb_or_rgba_row_impl`] with +/// `SWAP_UV = false, ALPHA = false`. #[inline] #[target_feature(enable = "avx2")] pub(crate) unsafe fn nv12_to_rgb_row( @@ -1539,18 +1536,16 @@ pub(crate) unsafe fn nv12_to_rgb_row( matrix: ColorMatrix, full_range: bool, ) { - // SAFETY: caller obligations forwarded to the shared impl. unsafe { - nv12_or_nv21_to_rgb_row_impl::(y, uv_half, rgb_out, width, matrix, full_range); + nv12_or_nv21_to_rgb_or_rgba_row_impl::( + y, uv_half, rgb_out, width, matrix, full_range, + ); } } -/// AVX2 NV21 → packed RGB (VU-ordered chroma). Thin wrapper over -/// [`nv12_or_nv21_to_rgb_row_impl`] with `SWAP_UV = true`. -/// -/// # Safety -/// -/// Same as [`nv12_or_nv21_to_rgb_row_impl`]. +/// AVX2 NV21 → packed RGB. 
Thin wrapper over +/// [`nv12_or_nv21_to_rgb_or_rgba_row_impl`] with +/// `SWAP_UV = true, ALPHA = false`. #[inline] #[target_feature(enable = "avx2")] pub(crate) unsafe fn nv21_to_rgb_row( @@ -1561,29 +1556,74 @@ pub(crate) unsafe fn nv21_to_rgb_row( matrix: ColorMatrix, full_range: bool, ) { - // SAFETY: caller obligations forwarded to the shared impl. unsafe { - nv12_or_nv21_to_rgb_row_impl::(y, vu_half, rgb_out, width, matrix, full_range); + nv12_or_nv21_to_rgb_or_rgba_row_impl::( + y, vu_half, rgb_out, width, matrix, full_range, + ); + } +} + +/// AVX2 NV12 → packed RGBA. Same contract as [`nv12_to_rgb_row`] +/// but writes 4 bytes per pixel via [`write_rgba_32`]. +/// `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "avx2")] +pub(crate) unsafe fn nv12_to_rgba_row( + y: &[u8], + uv_half: &[u8], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + unsafe { + nv12_or_nv21_to_rgb_or_rgba_row_impl::( + y, uv_half, rgba_out, width, matrix, full_range, + ); } } -/// Shared AVX2 NV12/NV21 kernel. `SWAP_UV` selects chroma byte order -/// at compile time. +/// AVX2 NV21 → packed RGBA. Same contract as [`nv21_to_rgb_row`] +/// but writes 4 bytes per pixel via [`write_rgba_32`]. +/// `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "avx2")] +pub(crate) unsafe fn nv21_to_rgba_row( + y: &[u8], + vu_half: &[u8], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + unsafe { + nv12_or_nv21_to_rgb_or_rgba_row_impl::( + y, vu_half, rgba_out, width, matrix, full_range, + ); + } +} + +/// Shared AVX2 NV12/NV21 kernel at 3 bpp (RGB) or 4 bpp + opaque +/// alpha (RGBA). `SWAP_UV` selects chroma byte order; `ALPHA = true` +/// writes via [`write_rgba_32`], `ALPHA = false` via [`write_rgb_32`]. +/// Both const generics drive compile-time monomorphization. /// /// # Safety /// -/// 1. **AVX2 must be available on the current CPU** (same obligation -/// as [`yuv_420_to_rgb_row`]). +/// 1. **AVX2 must be available on the current CPU.** /// 2. `width & 1 == 0`. /// 3. `y.len() >= width`. /// 4. `uv_or_vu_half.len() >= width` (32 interleaved bytes per 32 Y pixels). -/// 5. `rgb_out.len() >= 3 * width`. +/// 5. `out.len() >= width * (if ALPHA { 4 } else { 3 })`. #[inline] #[target_feature(enable = "avx2")] -unsafe fn nv12_or_nv21_to_rgb_row_impl( +pub(crate) unsafe fn nv12_or_nv21_to_rgb_or_rgba_row_impl< + const SWAP_UV: bool, + const ALPHA: bool, +>( y: &[u8], uv_or_vu_half: &[u8], - rgb_out: &mut [u8], + out: &mut [u8], width: usize, matrix: ColorMatrix, full_range: bool, @@ -1591,7 +1631,8 @@ unsafe fn nv12_or_nv21_to_rgb_row_impl( debug_assert_eq!(width & 1, 0, "NV12/NV21 require even width"); debug_assert!(y.len() >= width); debug_assert!(uv_or_vu_half.len() >= width); - debug_assert!(rgb_out.len() >= width * 3); + let bpp: usize = if ALPHA { 4 } else { 3 }; + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params(full_range); @@ -1612,6 +1653,7 @@ unsafe fn nv12_or_nv21_to_rgb_row_impl( let cgv = _mm256_set1_epi32(coeffs.g_v()); let cbu = _mm256_set1_epi32(coeffs.b_u()); let cbv = _mm256_set1_epi32(coeffs.b_v()); + let alpha_u8 = _mm256_set1_epi8(-1); // 0xFF as i8 // Per‑lane shuffle: pack U bytes (even offsets) into low 8 of each // 128‑bit lane, V bytes (odd offsets) into the high 8. 
Applied to a @@ -1696,30 +1738,33 @@ unsafe fn nv12_or_nv21_to_rgb_row_impl( let g_u8 = narrow_u8x32(g_lo, g_hi); let r_u8 = narrow_u8x32(r_lo, r_hi); - write_rgb_32(r_u8, g_u8, b_u8, rgb_out.as_mut_ptr().add(x * 3)); + if ALPHA { + write_rgba_32(r_u8, g_u8, b_u8, alpha_u8, out.as_mut_ptr().add(x * 4)); + } else { + write_rgb_32(r_u8, g_u8, b_u8, out.as_mut_ptr().add(x * 3)); + } x += 32; } if x < width { - if SWAP_UV { - scalar::nv21_to_rgb_row( - &y[x..width], - &uv_or_vu_half[x..width], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); - } else { - scalar::nv12_to_rgb_row( - &y[x..width], - &uv_or_vu_half[x..width], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_uv = &uv_or_vu_half[x..width]; + let tail_w = width - x; + let tail_out = &mut out[x * bpp..width * bpp]; + match (SWAP_UV, ALPHA) { + (false, false) => { + scalar::nv12_to_rgb_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range) + } + (true, false) => { + scalar::nv21_to_rgb_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range) + } + (false, true) => { + scalar::nv12_to_rgba_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range) + } + (true, true) => { + scalar::nv21_to_rgba_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range) + } } } } @@ -4240,6 +4285,156 @@ mod tests { check_nv21_matches_nv12_swapped(w, ColorMatrix::YCgCo, true); } } + + // ---- nv12_to_rgba_row / nv21_to_rgba_row equivalence ---------------- + + fn check_nv12_rgba_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { + let y: std::vec::Vec = (0..width).map(|i| ((i * 37 + 11) & 0xFF) as u8).collect(); + let uv: std::vec::Vec = (0..width / 2) + .flat_map(|i| [((i * 53 + 23) & 0xFF) as u8, ((i * 71 + 91) & 0xFF) as u8]) + .collect(); + let mut rgba_scalar = std::vec![0u8; width * 4]; + let mut rgba_avx2 = std::vec![0u8; width * 4]; + + scalar::nv12_to_rgba_row(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + unsafe { + nv12_to_rgba_row(&y, &uv, &mut rgba_avx2, width, matrix, full_range); + } + + if rgba_scalar != rgba_avx2 { + let first_diff = rgba_scalar + .iter() + .zip(rgba_avx2.iter()) + .position(|(a, b)| a != b) + .unwrap(); + let pixel = first_diff / 4; + let channel = ["R", "G", "B", "A"][first_diff % 4]; + panic!( + "AVX2 NV12 RGBA diverges from scalar at byte {first_diff} (px {pixel} {channel}, width={width}, matrix={matrix:?}, full_range={full_range}): scalar={} avx2={}", + rgba_scalar[first_diff], rgba_avx2[first_diff] + ); + } + } + + fn check_nv21_rgba_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { + let y: std::vec::Vec = (0..width).map(|i| ((i * 37 + 11) & 0xFF) as u8).collect(); + let vu: std::vec::Vec = (0..width / 2) + .flat_map(|i| [((i * 53 + 23) & 0xFF) as u8, ((i * 71 + 91) & 0xFF) as u8]) + .collect(); + let mut rgba_scalar = std::vec![0u8; width * 4]; + let mut rgba_avx2 = std::vec![0u8; width * 4]; + + scalar::nv21_to_rgba_row(&y, &vu, &mut rgba_scalar, width, matrix, full_range); + unsafe { + nv21_to_rgba_row(&y, &vu, &mut rgba_avx2, width, matrix, full_range); + } + + if rgba_scalar != rgba_avx2 { + let first_diff = rgba_scalar + .iter() + .zip(rgba_avx2.iter()) + .position(|(a, b)| a != b) + .unwrap(); + let pixel = first_diff / 4; + let channel = ["R", "G", "B", "A"][first_diff % 4]; + panic!( + "AVX2 NV21 RGBA diverges from scalar at byte {first_diff} (px {pixel} {channel}, width={width}, matrix={matrix:?}, full_range={full_range}): scalar={} avx2={}", + 
rgba_scalar[first_diff], rgba_avx2[first_diff] + ); + } + } + + fn check_nv12_rgba_matches_yuv420p_rgba(width: usize, matrix: ColorMatrix, full_range: bool) { + let y: std::vec::Vec = (0..width).map(|i| ((i * 37 + 11) & 0xFF) as u8).collect(); + let u: std::vec::Vec = (0..width / 2) + .map(|i| ((i * 53 + 23) & 0xFF) as u8) + .collect(); + let v: std::vec::Vec = (0..width / 2) + .map(|i| ((i * 71 + 91) & 0xFF) as u8) + .collect(); + let uv: std::vec::Vec = u.iter().zip(v.iter()).flat_map(|(a, b)| [*a, *b]).collect(); + + let mut rgba_yuv420p = std::vec![0u8; width * 4]; + let mut rgba_nv12 = std::vec![0u8; width * 4]; + unsafe { + yuv_420_to_rgba_row(&y, &u, &v, &mut rgba_yuv420p, width, matrix, full_range); + nv12_to_rgba_row(&y, &uv, &mut rgba_nv12, width, matrix, full_range); + } + assert_eq!( + rgba_yuv420p, rgba_nv12, + "AVX2 NV12 RGBA must match Yuv420p RGBA for equivalent UV (width={width}, matrix={matrix:?}, full_range={full_range})" + ); + } + + #[test] + fn nv12_avx2_rgba_matches_scalar_all_matrices_32() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_nv12_rgba_equivalence(32, m, full); + } + } + } + + #[test] + fn nv12_avx2_rgba_matches_scalar_widths() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + for w in [34usize, 46, 62, 1920, 1922] { + check_nv12_rgba_equivalence(w, ColorMatrix::Bt601, false); + } + } + + #[test] + fn nv12_avx2_rgba_matches_yuv420p_rgba_avx2() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + for w in [32usize, 64, 1920] { + check_nv12_rgba_matches_yuv420p_rgba(w, ColorMatrix::Bt709, false); + check_nv12_rgba_matches_yuv420p_rgba(w, ColorMatrix::YCgCo, true); + } + } + + #[test] + fn nv21_avx2_rgba_matches_scalar_all_matrices_32() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_nv21_rgba_equivalence(32, m, full); + } + } + } + + #[test] + fn nv21_avx2_rgba_matches_scalar_widths() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + for w in [34usize, 46, 62, 1920, 1922] { + check_nv21_rgba_equivalence(w, ColorMatrix::Bt601, false); + } + } + // ---- rgb_to_hsv_row equivalence -------------------------------------- fn check_hsv_equivalence(rgb: &[u8], width: usize) { diff --git a/src/row/arch/x86_avx512.rs b/src/row/arch/x86_avx512.rs index 9e17ee9..77ce177 100644 --- a/src/row/arch/x86_avx512.rs +++ b/src/row/arch/x86_avx512.rs @@ -1570,12 +1570,9 @@ unsafe fn deinterleave_uv_u16_avx512(ptr: *const u16) -> (__m512i, __m512i) { } } -/// AVX‑512 NV12 → packed RGB (UV-ordered chroma). Thin wrapper over -/// [`nv12_or_nv21_to_rgb_row_impl`] with `SWAP_UV = false`. -/// -/// # Safety -/// -/// Same as [`nv12_or_nv21_to_rgb_row_impl`]. +/// AVX‑512 NV12 → packed RGB. Thin wrapper over +/// [`nv12_or_nv21_to_rgb_or_rgba_row_impl`] with +/// `SWAP_UV = false, ALPHA = false`. #[inline] #[target_feature(enable = "avx512f,avx512bw")] pub(crate) unsafe fn nv12_to_rgb_row( @@ -1586,18 +1583,16 @@ pub(crate) unsafe fn nv12_to_rgb_row( matrix: ColorMatrix, full_range: bool, ) { - // SAFETY: caller obligations forwarded to the shared impl. 
unsafe { - nv12_or_nv21_to_rgb_row_impl::(y, uv_half, rgb_out, width, matrix, full_range); + nv12_or_nv21_to_rgb_or_rgba_row_impl::( + y, uv_half, rgb_out, width, matrix, full_range, + ); } } -/// AVX‑512 NV21 → packed RGB (VU-ordered chroma). Thin wrapper over -/// [`nv12_or_nv21_to_rgb_row_impl`] with `SWAP_UV = true`. -/// -/// # Safety -/// -/// Same as [`nv12_or_nv21_to_rgb_row_impl`]. +/// AVX‑512 NV21 → packed RGB. Thin wrapper over +/// [`nv12_or_nv21_to_rgb_or_rgba_row_impl`] with +/// `SWAP_UV = true, ALPHA = false`. #[inline] #[target_feature(enable = "avx512f,avx512bw")] pub(crate) unsafe fn nv21_to_rgb_row( @@ -1608,29 +1603,74 @@ pub(crate) unsafe fn nv21_to_rgb_row( matrix: ColorMatrix, full_range: bool, ) { - // SAFETY: caller obligations forwarded to the shared impl. unsafe { - nv12_or_nv21_to_rgb_row_impl::(y, vu_half, rgb_out, width, matrix, full_range); + nv12_or_nv21_to_rgb_or_rgba_row_impl::( + y, vu_half, rgb_out, width, matrix, full_range, + ); + } +} + +/// AVX‑512 NV12 → packed RGBA. Same contract as [`nv12_to_rgb_row`] +/// but writes 4 bytes per pixel via [`write_rgba_64`]. +/// `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "avx512f,avx512bw")] +pub(crate) unsafe fn nv12_to_rgba_row( + y: &[u8], + uv_half: &[u8], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + unsafe { + nv12_or_nv21_to_rgb_or_rgba_row_impl::( + y, uv_half, rgba_out, width, matrix, full_range, + ); } } -/// Shared AVX‑512 NV12/NV21 kernel. `SWAP_UV` selects chroma byte -/// order at compile time. +/// AVX‑512 NV21 → packed RGBA. Same contract as [`nv21_to_rgb_row`] +/// but writes 4 bytes per pixel via [`write_rgba_64`]. +/// `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "avx512f,avx512bw")] +pub(crate) unsafe fn nv21_to_rgba_row( + y: &[u8], + vu_half: &[u8], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + unsafe { + nv12_or_nv21_to_rgb_or_rgba_row_impl::( + y, vu_half, rgba_out, width, matrix, full_range, + ); + } +} + +/// Shared AVX‑512 NV12/NV21 kernel at 3 bpp (RGB) or 4 bpp + opaque +/// alpha (RGBA). `SWAP_UV` selects chroma byte order; `ALPHA = true` +/// writes via [`write_rgba_64`], `ALPHA = false` via [`write_rgb_64`]. +/// Both const generics drive compile-time monomorphization. /// /// # Safety /// -/// 1. **AVX‑512F + AVX‑512BW must be available on the current CPU** -/// (same obligation as [`yuv_420_to_rgb_row`]). +/// 1. **AVX‑512F + AVX‑512BW must be available on the current CPU.** /// 2. `width & 1 == 0`. /// 3. `y.len() >= width`. /// 4. `uv_or_vu_half.len() >= width` (64 interleaved bytes per 64 Y pixels). -/// 5. `rgb_out.len() >= 3 * width`. +/// 5. `out.len() >= width * (if ALPHA { 4 } else { 3 })`. 
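Stripped of the vector math, every backend's kernel follows the same shape; a hedged sketch with hypothetical closures (the AVX-512 kernel below steps 64 pixels per iteration):

    fn convert_row(width: usize, mut simd_block: impl FnMut(usize), scalar_tail: impl FnOnce(usize)) {
        let mut x = 0;
        while x + 64 <= width {
            simd_block(x); // one full-width vector iteration
            x += 64;
        }
        if x < width {
            // Hand the 0..63 leftover pixels to the scalar reference so
            // tails stay byte-identical for any even width.
            scalar_tail(x);
        }
    }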
#[inline] #[target_feature(enable = "avx512f,avx512bw")] -unsafe fn nv12_or_nv21_to_rgb_row_impl( +pub(crate) unsafe fn nv12_or_nv21_to_rgb_or_rgba_row_impl< + const SWAP_UV: bool, + const ALPHA: bool, +>( y: &[u8], uv_or_vu_half: &[u8], - rgb_out: &mut [u8], + out: &mut [u8], width: usize, matrix: ColorMatrix, full_range: bool, @@ -1638,7 +1678,8 @@ unsafe fn nv12_or_nv21_to_rgb_row_impl( debug_assert_eq!(width & 1, 0, "NV12/NV21 require even width"); debug_assert!(y.len() >= width); debug_assert!(uv_or_vu_half.len() >= width); - debug_assert!(rgb_out.len() >= width * 3); + let bpp: usize = if ALPHA { 4 } else { 3 }; + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params(full_range); @@ -1659,6 +1700,7 @@ unsafe fn nv12_or_nv21_to_rgb_row_impl( let cgv = _mm512_set1_epi32(coeffs.g_v()); let cbu = _mm512_set1_epi32(coeffs.b_u()); let cbv = _mm512_set1_epi32(coeffs.b_v()); + let alpha_u8 = _mm512_set1_epi8(-1); // 0xFF as i8 let pack_fixup = _mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7); let dup_lo_idx = _mm512_setr_epi64(0, 1, 8, 9, 2, 3, 10, 11); @@ -1748,30 +1790,33 @@ unsafe fn nv12_or_nv21_to_rgb_row_impl( let g_u8 = narrow_u8x64(g_lo, g_hi, pack_fixup); let r_u8 = narrow_u8x64(r_lo, r_hi, pack_fixup); - write_rgb_64(r_u8, g_u8, b_u8, rgb_out.as_mut_ptr().add(x * 3)); + if ALPHA { + write_rgba_64(r_u8, g_u8, b_u8, alpha_u8, out.as_mut_ptr().add(x * 4)); + } else { + write_rgb_64(r_u8, g_u8, b_u8, out.as_mut_ptr().add(x * 3)); + } x += 64; } if x < width { - if SWAP_UV { - scalar::nv21_to_rgb_row( - &y[x..width], - &uv_or_vu_half[x..width], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); - } else { - scalar::nv12_to_rgb_row( - &y[x..width], - &uv_or_vu_half[x..width], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_uv = &uv_or_vu_half[x..width]; + let tail_w = width - x; + let tail_out = &mut out[x * bpp..width * bpp]; + match (SWAP_UV, ALPHA) { + (false, false) => { + scalar::nv12_to_rgb_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range) + } + (true, false) => { + scalar::nv21_to_rgb_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range) + } + (false, true) => { + scalar::nv12_to_rgba_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range) + } + (true, true) => { + scalar::nv21_to_rgba_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range) + } } } } @@ -4420,6 +4465,156 @@ mod tests { check_nv21_matches_nv12_swapped(w, ColorMatrix::YCgCo, true); } } + + // ---- nv12_to_rgba_row / nv21_to_rgba_row equivalence ---------------- + + fn check_nv12_rgba_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { + let y: std::vec::Vec = (0..width).map(|i| ((i * 37 + 11) & 0xFF) as u8).collect(); + let uv: std::vec::Vec = (0..width / 2) + .flat_map(|i| [((i * 53 + 23) & 0xFF) as u8, ((i * 71 + 91) & 0xFF) as u8]) + .collect(); + let mut rgba_scalar = std::vec![0u8; width * 4]; + let mut rgba_avx512 = std::vec![0u8; width * 4]; + + scalar::nv12_to_rgba_row(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + unsafe { + nv12_to_rgba_row(&y, &uv, &mut rgba_avx512, width, matrix, full_range); + } + + if rgba_scalar != rgba_avx512 { + let first_diff = rgba_scalar + .iter() + .zip(rgba_avx512.iter()) + .position(|(a, b)| a != b) + .unwrap(); + let pixel = first_diff / 4; + let channel = ["R", "G", "B", "A"][first_diff % 4]; + panic!( + "AVX-512 NV12 RGBA diverges from scalar at byte 
{first_diff} (px {pixel} {channel}, width={width}, matrix={matrix:?}, full_range={full_range}): scalar={} avx512={}", + rgba_scalar[first_diff], rgba_avx512[first_diff] + ); + } + } + + fn check_nv21_rgba_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { + let y: std::vec::Vec = (0..width).map(|i| ((i * 37 + 11) & 0xFF) as u8).collect(); + let vu: std::vec::Vec = (0..width / 2) + .flat_map(|i| [((i * 53 + 23) & 0xFF) as u8, ((i * 71 + 91) & 0xFF) as u8]) + .collect(); + let mut rgba_scalar = std::vec![0u8; width * 4]; + let mut rgba_avx512 = std::vec![0u8; width * 4]; + + scalar::nv21_to_rgba_row(&y, &vu, &mut rgba_scalar, width, matrix, full_range); + unsafe { + nv21_to_rgba_row(&y, &vu, &mut rgba_avx512, width, matrix, full_range); + } + + if rgba_scalar != rgba_avx512 { + let first_diff = rgba_scalar + .iter() + .zip(rgba_avx512.iter()) + .position(|(a, b)| a != b) + .unwrap(); + let pixel = first_diff / 4; + let channel = ["R", "G", "B", "A"][first_diff % 4]; + panic!( + "AVX-512 NV21 RGBA diverges from scalar at byte {first_diff} (px {pixel} {channel}, width={width}, matrix={matrix:?}, full_range={full_range}): scalar={} avx512={}", + rgba_scalar[first_diff], rgba_avx512[first_diff] + ); + } + } + + fn check_nv12_rgba_matches_yuv420p_rgba(width: usize, matrix: ColorMatrix, full_range: bool) { + let y: std::vec::Vec = (0..width).map(|i| ((i * 37 + 11) & 0xFF) as u8).collect(); + let u: std::vec::Vec = (0..width / 2) + .map(|i| ((i * 53 + 23) & 0xFF) as u8) + .collect(); + let v: std::vec::Vec = (0..width / 2) + .map(|i| ((i * 71 + 91) & 0xFF) as u8) + .collect(); + let uv: std::vec::Vec = u.iter().zip(v.iter()).flat_map(|(a, b)| [*a, *b]).collect(); + + let mut rgba_yuv420p = std::vec![0u8; width * 4]; + let mut rgba_nv12 = std::vec![0u8; width * 4]; + unsafe { + yuv_420_to_rgba_row(&y, &u, &v, &mut rgba_yuv420p, width, matrix, full_range); + nv12_to_rgba_row(&y, &uv, &mut rgba_nv12, width, matrix, full_range); + } + assert_eq!( + rgba_yuv420p, rgba_nv12, + "AVX-512 NV12 RGBA must match Yuv420p RGBA for equivalent UV (width={width}, matrix={matrix:?}, full_range={full_range})" + ); + } + + #[test] + fn nv12_avx512_rgba_matches_scalar_all_matrices_64() { + if !std::arch::is_x86_feature_detected!("avx512bw") { + return; + } + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_nv12_rgba_equivalence(64, m, full); + } + } + } + + #[test] + fn nv12_avx512_rgba_matches_scalar_widths() { + if !std::arch::is_x86_feature_detected!("avx512bw") { + return; + } + for w in [66usize, 94, 126, 1920, 1922] { + check_nv12_rgba_equivalence(w, ColorMatrix::Bt601, false); + } + } + + #[test] + fn nv12_avx512_rgba_matches_yuv420p_rgba_avx512() { + if !std::arch::is_x86_feature_detected!("avx512bw") { + return; + } + for w in [64usize, 128, 1920] { + check_nv12_rgba_matches_yuv420p_rgba(w, ColorMatrix::Bt709, false); + check_nv12_rgba_matches_yuv420p_rgba(w, ColorMatrix::YCgCo, true); + } + } + + #[test] + fn nv21_avx512_rgba_matches_scalar_all_matrices_64() { + if !std::arch::is_x86_feature_detected!("avx512bw") { + return; + } + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_nv21_rgba_equivalence(64, m, full); + } + } + } + + #[test] + fn nv21_avx512_rgba_matches_scalar_widths() { + if 
!std::arch::is_x86_feature_detected!("avx512bw") { + return; + } + for w in [66usize, 94, 126, 1920, 1922] { + check_nv21_rgba_equivalence(w, ColorMatrix::Bt601, false); + } + } + // ---- rgb_to_hsv_row equivalence -------------------------------------- fn check_hsv_equivalence(rgb: &[u8], width: usize) { diff --git a/src/row/arch/x86_sse41.rs b/src/row/arch/x86_sse41.rs index c431c72..ad8925b 100644 --- a/src/row/arch/x86_sse41.rs +++ b/src/row/arch/x86_sse41.rs @@ -1336,7 +1336,9 @@ pub(crate) unsafe fn yuv_444p16_to_rgb_u16_row( } } -/// Same as [`nv12_or_nv21_to_rgb_row_impl`]. +/// SSE4.1 NV12 → packed RGB. Thin wrapper over +/// [`nv12_or_nv21_to_rgb_or_rgba_row_impl`] with +/// `SWAP_UV = false, ALPHA = false`. #[inline] #[target_feature(enable = "sse4.1")] pub(crate) unsafe fn nv12_to_rgb_row( @@ -1347,18 +1349,16 @@ pub(crate) unsafe fn nv12_to_rgb_row( matrix: ColorMatrix, full_range: bool, ) { - // SAFETY: caller obligations forwarded to the shared impl. unsafe { - nv12_or_nv21_to_rgb_row_impl::(y, uv_half, rgb_out, width, matrix, full_range); + nv12_or_nv21_to_rgb_or_rgba_row_impl::( + y, uv_half, rgb_out, width, matrix, full_range, + ); } } -/// SSE4.1 NV21 → packed RGB (VU-ordered chroma). Thin wrapper over -/// [`nv12_or_nv21_to_rgb_row_impl`] with `SWAP_UV = true`. -/// -/// # Safety -/// -/// Same as [`nv12_or_nv21_to_rgb_row_impl`]. +/// SSE4.1 NV21 → packed RGB. Thin wrapper over +/// [`nv12_or_nv21_to_rgb_or_rgba_row_impl`] with +/// `SWAP_UV = true, ALPHA = false`. #[inline] #[target_feature(enable = "sse4.1")] pub(crate) unsafe fn nv21_to_rgb_row( @@ -1369,30 +1369,75 @@ pub(crate) unsafe fn nv21_to_rgb_row( matrix: ColorMatrix, full_range: bool, ) { - // SAFETY: caller obligations forwarded to the shared impl. unsafe { - nv12_or_nv21_to_rgb_row_impl::(y, vu_half, rgb_out, width, matrix, full_range); + nv12_or_nv21_to_rgb_or_rgba_row_impl::( + y, vu_half, rgb_out, width, matrix, full_range, + ); + } +} + +/// SSE4.1 NV12 → packed RGBA. Same contract as [`nv12_to_rgb_row`] +/// but writes 4 bytes per pixel via [`write_rgba_16`]. +/// `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "sse4.1")] +pub(crate) unsafe fn nv12_to_rgba_row( + y: &[u8], + uv_half: &[u8], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + unsafe { + nv12_or_nv21_to_rgb_or_rgba_row_impl::( + y, uv_half, rgba_out, width, matrix, full_range, + ); + } +} + +/// SSE4.1 NV21 → packed RGBA. Same contract as [`nv21_to_rgb_row`] +/// but writes 4 bytes per pixel via [`write_rgba_16`]. +/// `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "sse4.1")] +pub(crate) unsafe fn nv21_to_rgba_row( + y: &[u8], + vu_half: &[u8], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + unsafe { + nv12_or_nv21_to_rgb_or_rgba_row_impl::( + y, vu_half, rgba_out, width, matrix, full_range, + ); } } -/// Shared SSE4.1 NV12/NV21 kernel. `SWAP_UV = false` → NV12, -/// `SWAP_UV = true` → NV21. Const generic drives monomorphization — -/// the swap is resolved at compile time. +/// Shared SSE4.1 NV12/NV21 kernel at 3 bpp (RGB) or 4 bpp + opaque +/// alpha (RGBA). `SWAP_UV = false` → NV12; `SWAP_UV = true` → NV21. +/// `ALPHA = true` writes via [`write_rgba_16`]; `ALPHA = false` via +/// [`write_rgb_16`]. Both const generics drive compile-time +/// monomorphization. /// /// # Safety /// -/// 1. **SSE4.1 must be available on the current CPU** (same obligation -/// as [`yuv_420_to_rgb_row`]). +/// 1. 
**SSE4.1 must be available on the current CPU.** /// 2. `width & 1 == 0`. /// 3. `y.len() >= width`. /// 4. `uv_or_vu_half.len() >= width` (2 × (width / 2) interleaved bytes). -/// 5. `rgb_out.len() >= 3 * width`. +/// 5. `out.len() >= width * (if ALPHA { 4 } else { 3 })`. #[inline] #[target_feature(enable = "sse4.1")] -unsafe fn nv12_or_nv21_to_rgb_row_impl( +pub(crate) unsafe fn nv12_or_nv21_to_rgb_or_rgba_row_impl< + const SWAP_UV: bool, + const ALPHA: bool, +>( y: &[u8], uv_or_vu_half: &[u8], - rgb_out: &mut [u8], + out: &mut [u8], width: usize, matrix: ColorMatrix, full_range: bool, @@ -1400,7 +1445,8 @@ unsafe fn nv12_or_nv21_to_rgb_row_impl( debug_assert_eq!(width & 1, 0, "NV12/NV21 require even width"); debug_assert!(y.len() >= width); debug_assert!(uv_or_vu_half.len() >= width); - debug_assert!(rgb_out.len() >= width * 3); + let bpp: usize = if ALPHA { 4 } else { 3 }; + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params(full_range); @@ -1421,6 +1467,7 @@ unsafe fn nv12_or_nv21_to_rgb_row_impl( let cgv = _mm_set1_epi32(coeffs.g_v()); let cbu = _mm_set1_epi32(coeffs.b_u()); let cbv = _mm_set1_epi32(coeffs.b_v()); + let alpha_u8 = _mm_set1_epi8(-1); // 0xFF as i8 // Deinterleave masks: `even_mask` pulls even-offset bytes into // lanes 0..7, `odd_mask` pulls odd-offset bytes. For NV12 that's @@ -1486,30 +1533,33 @@ unsafe fn nv12_or_nv21_to_rgb_row_impl( let g_u8 = _mm_packus_epi16(g_lo, g_hi); let r_u8 = _mm_packus_epi16(r_lo, r_hi); - write_rgb_16(r_u8, g_u8, b_u8, rgb_out.as_mut_ptr().add(x * 3)); + if ALPHA { + write_rgba_16(r_u8, g_u8, b_u8, alpha_u8, out.as_mut_ptr().add(x * 4)); + } else { + write_rgb_16(r_u8, g_u8, b_u8, out.as_mut_ptr().add(x * 3)); + } x += 16; } if x < width { - if SWAP_UV { - scalar::nv21_to_rgb_row( - &y[x..width], - &uv_or_vu_half[x..width], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); - } else { - scalar::nv12_to_rgb_row( - &y[x..width], - &uv_or_vu_half[x..width], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_uv = &uv_or_vu_half[x..width]; + let tail_w = width - x; + let tail_out = &mut out[x * bpp..width * bpp]; + match (SWAP_UV, ALPHA) { + (false, false) => { + scalar::nv12_to_rgb_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range) + } + (true, false) => { + scalar::nv21_to_rgb_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range) + } + (false, true) => { + scalar::nv12_to_rgba_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range) + } + (true, true) => { + scalar::nv21_to_rgba_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range) + } } } } @@ -3744,6 +3794,156 @@ mod tests { check_nv21_matches_nv12_swapped(w, ColorMatrix::YCgCo, true); } } + + // ---- nv12_to_rgba_row / nv21_to_rgba_row equivalence ---------------- + + fn check_nv12_rgba_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { + let y: std::vec::Vec = (0..width).map(|i| ((i * 37 + 11) & 0xFF) as u8).collect(); + let uv: std::vec::Vec = (0..width / 2) + .flat_map(|i| [((i * 53 + 23) & 0xFF) as u8, ((i * 71 + 91) & 0xFF) as u8]) + .collect(); + let mut rgba_scalar = std::vec![0u8; width * 4]; + let mut rgba_sse41 = std::vec![0u8; width * 4]; + + scalar::nv12_to_rgba_row(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + unsafe { + nv12_to_rgba_row(&y, &uv, &mut rgba_sse41, width, matrix, full_range); + } + + if rgba_scalar != rgba_sse41 { + let 
first_diff = rgba_scalar + .iter() + .zip(rgba_sse41.iter()) + .position(|(a, b)| a != b) + .unwrap(); + let pixel = first_diff / 4; + let channel = ["R", "G", "B", "A"][first_diff % 4]; + panic!( + "SSE4.1 NV12 RGBA diverges from scalar at byte {first_diff} (px {pixel} {channel}, width={width}, matrix={matrix:?}, full_range={full_range}): scalar={} sse41={}", + rgba_scalar[first_diff], rgba_sse41[first_diff] + ); + } + } + + fn check_nv21_rgba_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { + let y: std::vec::Vec = (0..width).map(|i| ((i * 37 + 11) & 0xFF) as u8).collect(); + let vu: std::vec::Vec = (0..width / 2) + .flat_map(|i| [((i * 53 + 23) & 0xFF) as u8, ((i * 71 + 91) & 0xFF) as u8]) + .collect(); + let mut rgba_scalar = std::vec![0u8; width * 4]; + let mut rgba_sse41 = std::vec![0u8; width * 4]; + + scalar::nv21_to_rgba_row(&y, &vu, &mut rgba_scalar, width, matrix, full_range); + unsafe { + nv21_to_rgba_row(&y, &vu, &mut rgba_sse41, width, matrix, full_range); + } + + if rgba_scalar != rgba_sse41 { + let first_diff = rgba_scalar + .iter() + .zip(rgba_sse41.iter()) + .position(|(a, b)| a != b) + .unwrap(); + let pixel = first_diff / 4; + let channel = ["R", "G", "B", "A"][first_diff % 4]; + panic!( + "SSE4.1 NV21 RGBA diverges from scalar at byte {first_diff} (px {pixel} {channel}, width={width}, matrix={matrix:?}, full_range={full_range}): scalar={} sse41={}", + rgba_scalar[first_diff], rgba_sse41[first_diff] + ); + } + } + + fn check_nv12_rgba_matches_yuv420p_rgba(width: usize, matrix: ColorMatrix, full_range: bool) { + let y: std::vec::Vec = (0..width).map(|i| ((i * 37 + 11) & 0xFF) as u8).collect(); + let u: std::vec::Vec = (0..width / 2) + .map(|i| ((i * 53 + 23) & 0xFF) as u8) + .collect(); + let v: std::vec::Vec = (0..width / 2) + .map(|i| ((i * 71 + 91) & 0xFF) as u8) + .collect(); + let uv: std::vec::Vec = u.iter().zip(v.iter()).flat_map(|(a, b)| [*a, *b]).collect(); + + let mut rgba_yuv420p = std::vec![0u8; width * 4]; + let mut rgba_nv12 = std::vec![0u8; width * 4]; + unsafe { + yuv_420_to_rgba_row(&y, &u, &v, &mut rgba_yuv420p, width, matrix, full_range); + nv12_to_rgba_row(&y, &uv, &mut rgba_nv12, width, matrix, full_range); + } + assert_eq!( + rgba_yuv420p, rgba_nv12, + "SSE4.1 NV12 RGBA must match Yuv420p RGBA for equivalent UV (width={width}, matrix={matrix:?}, full_range={full_range})" + ); + } + + #[test] + fn nv12_sse41_rgba_matches_scalar_all_matrices_16() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_nv12_rgba_equivalence(16, m, full); + } + } + } + + #[test] + fn nv12_sse41_rgba_matches_scalar_widths() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + for w in [18usize, 30, 34, 1920, 1922] { + check_nv12_rgba_equivalence(w, ColorMatrix::Bt601, false); + } + } + + #[test] + fn nv12_sse41_rgba_matches_yuv420p_rgba_sse41() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + for w in [16usize, 30, 64, 1920] { + check_nv12_rgba_matches_yuv420p_rgba(w, ColorMatrix::Bt709, false); + check_nv12_rgba_matches_yuv420p_rgba(w, ColorMatrix::YCgCo, true); + } + } + + #[test] + fn nv21_sse41_rgba_matches_scalar_all_matrices_16() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + 
ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_nv21_rgba_equivalence(16, m, full); + } + } + } + + #[test] + fn nv21_sse41_rgba_matches_scalar_widths() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + for w in [18usize, 30, 34, 1920, 1922] { + check_nv21_rgba_equivalence(w, ColorMatrix::Bt601, false); + } + } + // ---- rgb_to_hsv_row equivalence -------------------------------------- fn check_hsv_equivalence(rgb: &[u8], width: usize) { diff --git a/src/row/mod.rs b/src/row/mod.rs index 2523eda..f30667d 100644 --- a/src/row/mod.rs +++ b/src/row/mod.rs @@ -406,6 +406,150 @@ pub fn nv21_to_rgb_row( scalar::nv21_to_rgb_row(y, vu_half, rgb_out, width, matrix, full_range); } +/// Converts one row of NV12 (semi‑planar 4:2:0) to packed **RGBA** +/// (8-bit). Same numerical contract as [`nv12_to_rgb_row`]; the only +/// differences are the per-pixel stride (4 vs 3) and the alpha byte +/// (`0xFF`, opaque, for every pixel — sources without an alpha plane +/// produce opaque output). +/// +/// `rgba_out.len() >= 4 * width`. `use_simd = false` forces scalar. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn nv12_to_rgba_row( + y: &[u8], + uv_half: &[u8], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + // Runtime asserts at the dispatcher boundary — see + // [`yuv_420_to_rgba_row`] for rationale, including the checked + // `width × 4` multiplication via [`rgba_row_bytes`]. + assert_eq!(width & 1, 0, "NV12 requires even width"); + let rgba_min = rgba_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(uv_half.len() >= width, "uv_half row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: `neon_available()` verified NEON is present. + unsafe { + arch::neon::nv12_to_rgba_row(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::nv12_to_rgba_row(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::nv12_to_rgba_row(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::nv12_to_rgba_row(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 verified at compile time. + unsafe { + arch::wasm_simd128::nv12_to_rgba_row(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::nv12_to_rgba_row(y, uv_half, rgba_out, width, matrix, full_range); +} + +/// Converts one row of NV21 (semi‑planar 4:2:0, VU-ordered) to +/// packed **RGBA** (8-bit). Same numerical contract as +/// [`nv21_to_rgb_row`]; alpha defaults to `0xFF` (opaque). +/// +/// `rgba_out.len() >= 4 * width`. `use_simd = false` forces scalar. 
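A usage sketch for the two public dispatchers (hypothetical sample values; neutral chroma, BT.601 limited range):

    let y = [16u8, 81, 145, 235];
    let vu = [128u8, 128, 128, 128]; // VU-interleaved, neutral chroma
    let mut rgba = [0u8; 16];
    nv21_to_rgba_row(&y, &vu, &mut rgba, 4, ColorMatrix::Bt601, false, true);
    // Alpha is the constant 0xFF regardless of which backend ran.
    assert!(rgba.chunks_exact(4).all(|px| px[3] == 0xFF));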
+#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn nv21_to_rgba_row( + y: &[u8], + vu_half: &[u8], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "NV21 requires even width"); + let rgba_min = rgba_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(vu_half.len() >= width, "vu_half row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + unsafe { + arch::neon::nv21_to_rgba_row(y, vu_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + unsafe { + arch::x86_avx512::nv21_to_rgba_row(y, vu_half, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + unsafe { + arch::x86_avx2::nv21_to_rgba_row(y, vu_half, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + unsafe { + arch::x86_sse41::nv21_to_rgba_row(y, vu_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + unsafe { + arch::wasm_simd128::nv21_to_rgba_row(y, vu_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::nv21_to_rgba_row(y, vu_half, rgba_out, width, matrix, full_range); +} + /// Converts one row of NV24 (semi‑planar 4:4:4, UV‑ordered) to packed /// RGB. Dispatches to the best available SIMD backend for the current /// target (NEON / SSE4.1 / AVX2 / AVX-512 / wasm simd128), falling diff --git a/src/row/scalar.rs b/src/row/scalar.rs index c41a2c8..e6d07ac 100644 --- a/src/row/scalar.rs +++ b/src/row/scalar.rs @@ -139,7 +139,8 @@ pub(crate) fn yuv_420_to_rgb_or_rgba_row( } /// NV12 (semi‑planar 4:2:0, UV-ordered) → packed RGB. Thin wrapper -/// over [`nv12_or_nv21_to_rgb_row_impl`] with `SWAP_UV = false`. +/// over [`nv12_or_nv21_to_rgb_or_rgba_row_impl`] with +/// `SWAP_UV = false, ALPHA = false`. #[cfg_attr(not(tarpaulin), inline(always))] pub(crate) fn nv12_to_rgb_row( y: &[u8], @@ -149,11 +150,14 @@ pub(crate) fn nv12_to_rgb_row( matrix: ColorMatrix, full_range: bool, ) { - nv12_or_nv21_to_rgb_row_impl::(y, uv_half, rgb_out, width, matrix, full_range); + nv12_or_nv21_to_rgb_or_rgba_row_impl::( + y, uv_half, rgb_out, width, matrix, full_range, + ); } /// NV21 (semi‑planar 4:2:0, VU-ordered) → packed RGB. Thin wrapper -/// over [`nv12_or_nv21_to_rgb_row_impl`] with `SWAP_UV = true`. +/// over [`nv12_or_nv21_to_rgb_or_rgba_row_impl`] with +/// `SWAP_UV = true, ALPHA = false`. #[cfg_attr(not(tarpaulin), inline(always))] pub(crate) fn nv21_to_rgb_row( y: &[u8], @@ -163,26 +167,63 @@ pub(crate) fn nv21_to_rgb_row( matrix: ColorMatrix, full_range: bool, ) { - nv12_or_nv21_to_rgb_row_impl::(y, vu_half, rgb_out, width, matrix, full_range); + nv12_or_nv21_to_rgb_or_rgba_row_impl::( + y, vu_half, rgb_out, width, matrix, full_range, + ); } -/// Shared scalar kernel for NV12 (SWAP_UV=false) and NV21 -/// (SWAP_UV=true). Identical math and numerical contract to -/// [`yuv_420_to_rgb_row`]; the only difference is chroma byte order -/// in the interleaved plane. `const` generic drives compile-time -/// monomorphization — each wrapper is inlined with the branch -/// eliminated. +/// NV12 → packed `R, G, B, A` quadruplets with constant `A = 0xFF`. +/// First three bytes per pixel are byte-identical to +/// [`nv12_to_rgb_row`]. 
`rgba_out.len() >= 4 * width`. +#[cfg_attr(not(tarpaulin), inline(always))] +pub(crate) fn nv12_to_rgba_row( + y: &[u8], + uv_half: &[u8], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + nv12_or_nv21_to_rgb_or_rgba_row_impl::( + y, uv_half, rgba_out, width, matrix, full_range, + ); +} + +/// NV21 → packed `R, G, B, A` quadruplets with constant `A = 0xFF`. +/// First three bytes per pixel are byte-identical to +/// [`nv21_to_rgb_row`]. `rgba_out.len() >= 4 * width`. +#[cfg_attr(not(tarpaulin), inline(always))] +pub(crate) fn nv21_to_rgba_row( + y: &[u8], + vu_half: &[u8], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + nv12_or_nv21_to_rgb_or_rgba_row_impl::( + y, vu_half, rgba_out, width, matrix, full_range, + ); +} + +/// Shared scalar kernel for NV12 (`SWAP_UV = false`) / NV21 +/// (`SWAP_UV = true`) at 3 bpp (`ALPHA = false`) or 4 bpp + opaque +/// alpha (`ALPHA = true`). Identical math to [`yuv_420_to_rgb_row`]; +/// the only differences are chroma byte order in the interleaved +/// plane and the per-pixel store stride. Both `const` generics drive +/// compile-time monomorphization — each wrapper is inlined with both +/// branches eliminated. /// /// # Panics (debug builds) /// /// - `width` must be even (4:2:0 pairs pixel columns). /// - `y.len() >= width`, `uv_or_vu_half.len() >= width`, -/// `rgb_out.len() >= 3 * width`. +/// `out.len() >= width * (if ALPHA { 4 } else { 3 })`. #[cfg_attr(not(tarpaulin), inline(always))] -fn nv12_or_nv21_to_rgb_row_impl( +pub(crate) fn nv12_or_nv21_to_rgb_or_rgba_row_impl( y: &[u8], uv_or_vu_half: &[u8], - rgb_out: &mut [u8], + out: &mut [u8], width: usize, matrix: ColorMatrix, full_range: bool, @@ -190,7 +231,8 @@ fn nv12_or_nv21_to_rgb_row_impl( debug_assert_eq!(width & 1, 0, "NV12/NV21 require even width"); debug_assert!(y.len() >= width, "y row too short"); debug_assert!(uv_or_vu_half.len() >= width, "chroma row too short"); - debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); + let bpp: usize = if ALPHA { 4 } else { 3 }; + debug_assert!(out.len() >= width * bpp, "out row too short for {bpp}bpp"); let coeffs = Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = range_params(full_range); @@ -214,14 +256,26 @@ fn nv12_or_nv21_to_rgb_row_impl( let b_chroma = (coeffs.b_u() * u_d + coeffs.b_v() * v_d + RND) >> 15; let y0 = ((y[x] as i32 - y_off) * y_scale + RND) >> 15; - rgb_out[x * 3] = clamp_u8(y0 + r_chroma); - rgb_out[x * 3 + 1] = clamp_u8(y0 + g_chroma); - rgb_out[x * 3 + 2] = clamp_u8(y0 + b_chroma); + let r0 = clamp_u8(y0 + r_chroma); + let g0 = clamp_u8(y0 + g_chroma); + let b0 = clamp_u8(y0 + b_chroma); + out[x * bpp] = r0; + out[x * bpp + 1] = g0; + out[x * bpp + 2] = b0; + if ALPHA { + out[x * bpp + 3] = 0xFF; + } let y1 = ((y[x + 1] as i32 - y_off) * y_scale + RND) >> 15; - rgb_out[(x + 1) * 3] = clamp_u8(y1 + r_chroma); - rgb_out[(x + 1) * 3 + 1] = clamp_u8(y1 + g_chroma); - rgb_out[(x + 1) * 3 + 2] = clamp_u8(y1 + b_chroma); + let r1 = clamp_u8(y1 + r_chroma); + let g1 = clamp_u8(y1 + g_chroma); + let b1 = clamp_u8(y1 + b_chroma); + out[(x + 1) * bpp] = r1; + out[(x + 1) * bpp + 1] = g1; + out[(x + 1) * bpp + 2] = b1; + if ALPHA { + out[(x + 1) * bpp + 3] = 0xFF; + } x += 2; } diff --git a/src/sinker/mixed.rs b/src/sinker/mixed.rs index 5e852fb..53c6e2c 100644 --- a/src/sinker/mixed.rs +++ b/src/sinker/mixed.rs @@ -62,17 +62,17 @@ use crate::{ HsvBuffers, PixelSink, SourceFormat, raw::{Bayer, Bayer16, BayerRow, BayerRow16, 
BayerSink, BayerSink16}, row::{ - bayer_to_rgb_row, bayer16_to_rgb_row, bayer16_to_rgb_u16_row, nv12_to_rgb_row, nv21_to_rgb_row, - nv24_to_rgb_row, nv42_to_rgb_row, p010_to_rgb_row, p010_to_rgb_u16_row, p012_to_rgb_row, - p012_to_rgb_u16_row, p016_to_rgb_row, p016_to_rgb_u16_row, p410_to_rgb_row, - p410_to_rgb_u16_row, p412_to_rgb_row, p412_to_rgb_u16_row, p416_to_rgb_row, - p416_to_rgb_u16_row, rgb_to_hsv_row, yuv_420_to_rgb_row, yuv_420_to_rgba_row, - yuv_444_to_rgb_row, yuv420p9_to_rgb_row, yuv420p9_to_rgb_u16_row, yuv420p10_to_rgb_row, - yuv420p10_to_rgb_u16_row, yuv420p12_to_rgb_row, yuv420p12_to_rgb_u16_row, yuv420p14_to_rgb_row, - yuv420p14_to_rgb_u16_row, yuv420p16_to_rgb_row, yuv420p16_to_rgb_u16_row, yuv444p9_to_rgb_row, - yuv444p9_to_rgb_u16_row, yuv444p10_to_rgb_row, yuv444p10_to_rgb_u16_row, yuv444p12_to_rgb_row, - yuv444p12_to_rgb_u16_row, yuv444p14_to_rgb_row, yuv444p14_to_rgb_u16_row, yuv444p16_to_rgb_row, - yuv444p16_to_rgb_u16_row, + bayer_to_rgb_row, bayer16_to_rgb_row, bayer16_to_rgb_u16_row, nv12_to_rgb_row, + nv12_to_rgba_row, nv21_to_rgb_row, nv21_to_rgba_row, nv24_to_rgb_row, nv42_to_rgb_row, + p010_to_rgb_row, p010_to_rgb_u16_row, p012_to_rgb_row, p012_to_rgb_u16_row, p016_to_rgb_row, + p016_to_rgb_u16_row, p410_to_rgb_row, p410_to_rgb_u16_row, p412_to_rgb_row, + p412_to_rgb_u16_row, p416_to_rgb_row, p416_to_rgb_u16_row, rgb_to_hsv_row, yuv_420_to_rgb_row, + yuv_420_to_rgba_row, yuv_444_to_rgb_row, yuv420p9_to_rgb_row, yuv420p9_to_rgb_u16_row, + yuv420p10_to_rgb_row, yuv420p10_to_rgb_u16_row, yuv420p12_to_rgb_row, yuv420p12_to_rgb_u16_row, + yuv420p14_to_rgb_row, yuv420p14_to_rgb_u16_row, yuv420p16_to_rgb_row, yuv420p16_to_rgb_u16_row, + yuv444p9_to_rgb_row, yuv444p9_to_rgb_u16_row, yuv444p10_to_rgb_row, yuv444p10_to_rgb_u16_row, + yuv444p12_to_rgb_row, yuv444p12_to_rgb_u16_row, yuv444p14_to_rgb_row, yuv444p14_to_rgb_u16_row, + yuv444p16_to_rgb_row, yuv444p16_to_rgb_u16_row, }, yuv::{ Nv12, Nv12Row, Nv12Sink, Nv16, Nv16Row, Nv16Sink, Nv21, Nv21Row, Nv21Sink, Nv24, Nv24Row, @@ -1110,10 +1110,12 @@ impl<'a> MixedSinker<'a, Yuv420p> { /// /// ```compile_fail /// // Attaching RGBA to a sink that doesn't write it is rejected - /// // at compile time: - /// use colconv::{sinker::MixedSinker, yuv::Nv12}; + /// // at compile time. Nv16 (4:2:2 semi‑planar) has not yet been + /// // wired for RGBA; once that lands the negative example here + /// // moves to the next not‑yet‑wired format. + /// use colconv::{sinker::MixedSinker, yuv::Nv16}; /// let mut buf = vec![0u8; 16 * 8 * 4]; - /// let _ = MixedSinker::::new(16, 8).with_rgba(&mut buf); + /// let _ = MixedSinker::::new(16, 8).with_rgba(&mut buf); /// ``` #[cfg_attr(not(tarpaulin), inline(always))] pub fn with_rgba(mut self, buf: &'a mut [u8]) -> Result { @@ -1586,6 +1588,44 @@ impl PixelSink for MixedSinker<'_, Yuv444p> { // ---- Nv12 impl ---------------------------------------------------------- +impl<'a> MixedSinker<'a, Nv12> { + /// Attaches a packed 32‑bit RGBA output buffer. + /// + /// Only available on sinker types whose `PixelSink` impl writes + /// RGBA — calling `with_rgba` on a sink that doesn't (e.g. a + /// not‑yet‑wired `MixedSinker` today) is a compile error + /// rather than a silent no‑op. Each format that adds RGBA support + /// adds its own impl block here. + /// + /// The fourth byte per pixel is alpha. NV12 has no alpha plane, + /// so every alpha byte is filled with `0xFF` (opaque). Future + /// YUVA source impls will copy alpha through from the source + /// plane. 
+ /// + /// Returns `Err(RgbaBufferTooShort)` if + /// `buf.len() < width × height × 4`, or `Err(GeometryOverflow)` on + /// 32‑bit targets when the product overflows. + #[cfg_attr(not(tarpaulin), inline(always))] + pub fn with_rgba(mut self, buf: &'a mut [u8]) -> Result { + self.set_rgba(buf)?; + Ok(self) + } + + /// In-place variant of [`with_rgba`](Self::with_rgba). + #[cfg_attr(not(tarpaulin), inline(always))] + pub fn set_rgba(&mut self, buf: &'a mut [u8]) -> Result<&mut Self, MixedSinkerError> { + let expected = self.frame_bytes(4)?; + if buf.len() < expected { + return Err(MixedSinkerError::RgbaBufferTooShort { + expected, + actual: buf.len(), + }); + } + self.rgba = Some(buf); + Ok(self) + } +} + impl Nv12Sink for MixedSinker<'_, Nv12> {} impl PixelSink for MixedSinker<'_, Nv12> { @@ -1641,15 +1681,16 @@ impl PixelSink for MixedSinker<'_, Nv12> { let Self { rgb, + rgba, luma, hsv, rgb_scratch, .. } = self; - // Single-plane row ranges are guaranteed to fit; RGB ranges use - // checked arithmetic (see the Yuv420p impl above for the full - // rationale — hsv-only attachment never validated `× 3`). + // Single-plane row ranges are guaranteed to fit; RGB / RGBA + // ranges use checked arithmetic (see the Yuv420p impl above for + // the full rationale — hsv-only attachment never validated × 3). let one_plane_start = idx * w; let one_plane_end = one_plane_start + w; @@ -1658,6 +1699,29 @@ impl PixelSink for MixedSinker<'_, Nv12> { luma[one_plane_start..one_plane_end].copy_from_slice(&row.y()[..w]); } + // Native RGBA: independent kernel run, separate from RGB. Default + // alpha = 0xFF since NV12 has no alpha plane. + if let Some(buf) = rgba.as_deref_mut() { + let rgba_plane_end = + one_plane_end + .checked_mul(4) + .ok_or(MixedSinkerError::GeometryOverflow { + width: w, + height: h, + channels: 4, + })?; + let rgba_plane_start = one_plane_start * 4; + nv12_to_rgba_row( + row.y(), + row.uv_half(), + &mut buf[rgba_plane_start..rgba_plane_end], + w, + row.matrix(), + row.full_range(), + use_simd, + ); + } + let want_rgb = rgb.is_some(); let want_hsv = hsv.is_some(); if !want_rgb && !want_hsv { @@ -1849,6 +1913,38 @@ impl PixelSink for MixedSinker<'_, Nv16> { // the U/V byte-order difference. Only the trait `Input<'r>` and the // primitive name change. +impl<'a> MixedSinker<'a, Nv21> { + /// Attaches a packed 32‑bit RGBA output buffer. + /// + /// Only available on sinker types whose `PixelSink` impl writes + /// RGBA — see [`MixedSinker::::with_rgba`] for the same + /// rationale and constraints. NV21 has no alpha plane, so every + /// alpha byte is filled with `0xFF` (opaque). + /// + /// Returns `Err(RgbaBufferTooShort)` if + /// `buf.len() < width × height × 4`, or `Err(GeometryOverflow)` on + /// 32‑bit targets when the product overflows. + #[cfg_attr(not(tarpaulin), inline(always))] + pub fn with_rgba(mut self, buf: &'a mut [u8]) -> Result { + self.set_rgba(buf)?; + Ok(self) + } + + /// In-place variant of [`with_rgba`](Self::with_rgba). 
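Both impl blocks expose the same two-step surface. A hedged usage sketch (the `sinker` module path and a `Debug` derive on `MixedSinkerError` are assumptions; the variant semantics come from the docs above):

    use colconv::{sinker::MixedSinker, yuv::Nv21};

    fn attach_rgba<'a>(w: usize, h: usize, rgba: &'a mut [u8]) -> Option<MixedSinker<'a, Nv21>> {
        // Err(RgbaBufferTooShort) when rgba.len() < w * h * 4;
        // Err(GeometryOverflow) if w * h * 4 overflows usize (32-bit targets).
        match MixedSinker::<Nv21>::new(w, h).with_rgba(rgba) {
            Ok(sink) => Some(sink),
            Err(err) => {
                eprintln!("RGBA attach failed: {err:?}");
                None
            }
        }
    }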
+ #[cfg_attr(not(tarpaulin), inline(always))] + pub fn set_rgba(&mut self, buf: &'a mut [u8]) -> Result<&mut Self, MixedSinkerError> { + let expected = self.frame_bytes(4)?; + if buf.len() < expected { + return Err(MixedSinkerError::RgbaBufferTooShort { + expected, + actual: buf.len(), + }); + } + self.rgba = Some(buf); + Ok(self) + } +} + impl Nv21Sink for MixedSinker<'_, Nv21> {} impl PixelSink for MixedSinker<'_, Nv21> { @@ -1899,6 +1995,7 @@ impl PixelSink for MixedSinker<'_, Nv21> { let Self { rgb, + rgba, luma, hsv, rgb_scratch, @@ -1912,6 +2009,29 @@ impl PixelSink for MixedSinker<'_, Nv21> { luma[one_plane_start..one_plane_end].copy_from_slice(&row.y()[..w]); } + // Native RGBA: independent kernel run, separate from RGB. Default + // alpha = 0xFF since NV21 has no alpha plane. + if let Some(buf) = rgba.as_deref_mut() { + let rgba_plane_end = + one_plane_end + .checked_mul(4) + .ok_or(MixedSinkerError::GeometryOverflow { + width: w, + height: h, + channels: 4, + })?; + let rgba_plane_start = one_plane_start * 4; + nv21_to_rgba_row( + row.y(), + row.vu_half(), + &mut buf[rgba_plane_start..rgba_plane_end], + w, + row.matrix(), + row.full_range(), + use_simd, + ); + } + let want_rgb = rgb.is_some(); let want_hsv = hsv.is_some(); if !want_rgb && !want_hsv { @@ -8421,6 +8541,183 @@ mod tests { assert_eq!(rgb_yuv420p, rgb_nv12); } + // ---- NV12 RGBA (Ship 8 PR 2) tests -------------------------------------- + // + // Mirrors the Yuv420p RGBA test set. Adds a cross-format invariant + // proving NV12 RGBA is byte-identical to Yuv420p RGBA when fed the + // same pixels — catches U/V swap bugs in the new RGBA path that + // a pure RGB-path test would miss. + + #[test] + #[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" + )] + fn nv12_rgba_only_converts_gray_to_gray_with_opaque_alpha() { + let (yp, uvp) = solid_nv12_frame(16, 8, 128, 128, 128); + let src = Nv12Frame::new(&yp, &uvp, 16, 8, 16, 16); + + let mut rgba = std::vec![0u8; 16 * 8 * 4]; + let mut sink = MixedSinker::::new(16, 8) + .with_rgba(&mut rgba) + .unwrap(); + nv12_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); + + for px in rgba.chunks(4) { + assert!(px[0].abs_diff(128) <= 1, "R"); + assert_eq!(px[0], px[1], "RGB monochromatic"); + assert_eq!(px[1], px[2], "RGB monochromatic"); + assert_eq!(px[3], 0xFF, "alpha must default to opaque"); + } + } + + #[test] + #[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" + )] + fn nv12_with_rgb_and_with_rgba_produce_byte_identical_rgb_bytes() { + let w = 32usize; + let h = 16usize; + let (yp, uvp) = solid_nv12_frame(w as u32, h as u32, 180, 60, 200); + let src = Nv12Frame::new(&yp, &uvp, w as u32, h as u32, w as u32, w as u32); + + let mut rgb = std::vec![0u8; w * h * 3]; + let mut rgba = std::vec![0u8; w * h * 4]; + let mut sink = MixedSinker::::new(w, h) + .with_rgb(&mut rgb) + .unwrap() + .with_rgba(&mut rgba) + .unwrap(); + nv12_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); + + for i in 0..(w * h) { + assert_eq!(rgba[i * 4], rgb[i * 3], "R differs at pixel {i}"); + assert_eq!(rgba[i * 4 + 1], rgb[i * 3 + 1], "G differs at pixel {i}"); + assert_eq!(rgba[i * 4 + 2], rgb[i * 3 + 2], "B differs at pixel {i}"); + assert_eq!(rgba[i * 4 + 3], 0xFF, "A not opaque at pixel {i}"); + } + } + + #[test] + fn nv12_rgba_buffer_too_short_returns_err() { + let mut rgba_short = std::vec![0u8; 16 * 8 * 4 - 1]; + let result = MixedSinker::::new(16, 8).with_rgba(&mut rgba_short); + let Err(err) = 
result else { + panic!("expected RgbaBufferTooShort error"); + }; + assert!(matches!( + err, + MixedSinkerError::RgbaBufferTooShort { + expected: 512, + actual: 511, + } + )); + } + + #[test] + #[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" + )] + fn nv12_rgba_simd_matches_scalar_with_random_yuv() { + // Pseudo-random per-pixel YUV across all 4 matrices × both + // ranges. Width 1922 forces both the SIMD main loop AND a scalar + // tail across every backend block size (16 / 32 / 64). + let w = 1922usize; + let h = 4usize; + let mut yp = std::vec![0u8; w * h]; + let mut uvp = std::vec![0u8; w * (h / 2)]; + pseudo_random_u8(&mut yp, 0xC001_C0DE); + pseudo_random_u8(&mut uvp, 0xCAFE_F00D); + let src = Nv12Frame::new(&yp, &uvp, w as u32, h as u32, w as u32, w as u32); + + for &matrix in &[ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::YCgCo, + ] { + for &full_range in &[true, false] { + let mut rgba_simd = std::vec![0u8; w * h * 4]; + let mut rgba_scalar = std::vec![0u8; w * h * 4]; + + let mut s_simd = MixedSinker::::new(w, h) + .with_rgba(&mut rgba_simd) + .unwrap(); + nv12_to(&src, full_range, matrix, &mut s_simd).unwrap(); + + let mut s_scalar = MixedSinker::::new(w, h) + .with_rgba(&mut rgba_scalar) + .unwrap(); + s_scalar.set_simd(false); + nv12_to(&src, full_range, matrix, &mut s_scalar).unwrap(); + + if rgba_simd != rgba_scalar { + let mismatch = rgba_simd + .iter() + .zip(rgba_scalar.iter()) + .position(|(a, b)| a != b) + .unwrap(); + let pixel = mismatch / 4; + let channel = ["R", "G", "B", "A"][mismatch % 4]; + panic!( + "NV12 RGBA SIMD ≠ scalar at byte {mismatch} (px {pixel} {channel}) for matrix={matrix:?} full_range={full_range}: simd={} scalar={}", + rgba_simd[mismatch], rgba_scalar[mismatch] + ); + } + } + } + } + + #[test] + #[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" + )] + fn nv12_rgba_matches_yuv420p_rgba_with_same_pixels() { + // Cross-format invariant: NV12 RGBA byte-identical to Yuv420p + // RGBA when the chroma is the same. Mirrors the existing + // `nv12_matches_yuv420p_mixed_sinker` RGB-path test for the new + // RGBA path. Catches U/V swap bugs in the NV12 RGBA kernel that + // would silently differ from the planar reference. 
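        // The pseudo-random tests above rely on a pseudo_random_u8(buf,
        // seed) helper defined elsewhere in this test module. A minimal
        // stand-in with the same shape, deterministic, seedable, and
        // dependency-free, is a SplitMix64 filler. A sketch only; the
        // real helper may use a different generator.
        fn pseudo_random_u8_sketch(buf: &mut [u8], mut state: u64) {
            for b in buf.iter_mut() {
                // SplitMix64 step: strong avalanche, trivially reproducible.
                state = state.wrapping_add(0x9E37_79B9_7F4A_7C15);
                let mut z = state;
                z = (z ^ (z >> 30)).wrapping_mul(0xBF58_476D_1CE4_E5B9);
                z = (z ^ (z >> 27)).wrapping_mul(0x94D0_49BB_1331_11EB);
                *b = (z ^ (z >> 31)) as u8;
            }
        }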
+ let w = 32u32; + let h = 16u32; + let ws = w as usize; + let hs = h as usize; + let yp: Vec = (0..ws * hs).map(|i| ((i * 37 + 11) & 0xFF) as u8).collect(); + let up: Vec = (0..(ws / 2) * (hs / 2)) + .map(|i| ((i * 53 + 23) & 0xFF) as u8) + .collect(); + let vp: Vec = (0..(ws / 2) * (hs / 2)) + .map(|i| ((i * 71 + 91) & 0xFF) as u8) + .collect(); + let mut uvp: Vec = std::vec![0u8; ws * (hs / 2)]; + for r in 0..hs / 2 { + for c in 0..ws / 2 { + uvp[r * ws + 2 * c] = up[r * (ws / 2) + c]; + uvp[r * ws + 2 * c + 1] = vp[r * (ws / 2) + c]; + } + } + + let yuv420p_src = Yuv420pFrame::new(&yp, &up, &vp, w, h, w, w / 2, w / 2); + let nv12_src = Nv12Frame::new(&yp, &uvp, w, h, w, w); + + let mut rgba_yuv420p = std::vec![0u8; ws * hs * 4]; + let mut sink_yuv420p = MixedSinker::::new(ws, hs) + .with_rgba(&mut rgba_yuv420p) + .unwrap(); + yuv420p_to(&yuv420p_src, true, ColorMatrix::Bt709, &mut sink_yuv420p).unwrap(); + + let mut rgba_nv12 = std::vec![0u8; ws * hs * 4]; + let mut sink_nv12 = MixedSinker::::new(ws, hs) + .with_rgba(&mut rgba_nv12) + .unwrap(); + nv12_to(&nv12_src, true, ColorMatrix::Bt709, &mut sink_nv12).unwrap(); + + assert_eq!(rgba_yuv420p, rgba_nv12); + } + // ---- NV16 MixedSinker --------------------------------------------------- // // 4:2:2: chroma is half-width, full-height. Per-row math is @@ -8742,6 +9039,165 @@ mod tests { assert_eq!(rgb_nv12, rgb_nv21); } + // ---- NV21 RGBA (Ship 8 PR 2) tests -------------------------------------- + // + // Mirrors the NV12 RGBA tests. The cross-format invariant against + // NV12 RGBA (with byte-swapped chroma) catches the case where + // SWAP_UV is wired through correctly for the RGB path but not the + // RGBA path. + + #[test] + #[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" + )] + fn nv21_rgba_only_converts_gray_to_gray_with_opaque_alpha() { + let (yp, vup) = solid_nv21_frame(16, 8, 128, 128, 128); + let src = Nv21Frame::new(&yp, &vup, 16, 8, 16, 16); + + let mut rgba = std::vec![0u8; 16 * 8 * 4]; + let mut sink = MixedSinker::::new(16, 8) + .with_rgba(&mut rgba) + .unwrap(); + nv21_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); + + for px in rgba.chunks(4) { + assert!(px[0].abs_diff(128) <= 1, "R"); + assert_eq!(px[0], px[1], "RGB monochromatic"); + assert_eq!(px[1], px[2], "RGB monochromatic"); + assert_eq!(px[3], 0xFF, "alpha must default to opaque"); + } + } + + #[test] + #[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" + )] + fn nv21_with_rgb_and_with_rgba_produce_byte_identical_rgb_bytes() { + let w = 32usize; + let h = 16usize; + let (yp, vup) = solid_nv21_frame(w as u32, h as u32, 180, 60, 200); + let src = Nv21Frame::new(&yp, &vup, w as u32, h as u32, w as u32, w as u32); + + let mut rgb = std::vec![0u8; w * h * 3]; + let mut rgba = std::vec![0u8; w * h * 4]; + let mut sink = MixedSinker::::new(w, h) + .with_rgb(&mut rgb) + .unwrap() + .with_rgba(&mut rgba) + .unwrap(); + nv21_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); + + for i in 0..(w * h) { + assert_eq!(rgba[i * 4], rgb[i * 3], "R differs at pixel {i}"); + assert_eq!(rgba[i * 4 + 1], rgb[i * 3 + 1], "G differs at pixel {i}"); + assert_eq!(rgba[i * 4 + 2], rgb[i * 3 + 2], "B differs at pixel {i}"); + assert_eq!(rgba[i * 4 + 3], 0xFF, "A not opaque at pixel {i}"); + } + } + + #[test] + #[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" + )] + fn nv21_rgba_simd_matches_scalar_with_random_yuv() { + let w = 
1922usize; + let h = 4usize; + let mut yp = std::vec![0u8; w * h]; + let mut vup = std::vec![0u8; w * (h / 2)]; + pseudo_random_u8(&mut yp, 0xC001_C0DE); + pseudo_random_u8(&mut vup, 0xCAFE_F00D); + let src = Nv21Frame::new(&yp, &vup, w as u32, h as u32, w as u32, w as u32); + + for &matrix in &[ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::YCgCo, + ] { + for &full_range in &[true, false] { + let mut rgba_simd = std::vec![0u8; w * h * 4]; + let mut rgba_scalar = std::vec![0u8; w * h * 4]; + + let mut s_simd = MixedSinker::::new(w, h) + .with_rgba(&mut rgba_simd) + .unwrap(); + nv21_to(&src, full_range, matrix, &mut s_simd).unwrap(); + + let mut s_scalar = MixedSinker::::new(w, h) + .with_rgba(&mut rgba_scalar) + .unwrap(); + s_scalar.set_simd(false); + nv21_to(&src, full_range, matrix, &mut s_scalar).unwrap(); + + if rgba_simd != rgba_scalar { + let mismatch = rgba_simd + .iter() + .zip(rgba_scalar.iter()) + .position(|(a, b)| a != b) + .unwrap(); + let pixel = mismatch / 4; + let channel = ["R", "G", "B", "A"][mismatch % 4]; + panic!( + "NV21 RGBA SIMD ≠ scalar at byte {mismatch} (px {pixel} {channel}) for matrix={matrix:?} full_range={full_range}: simd={} scalar={}", + rgba_simd[mismatch], rgba_scalar[mismatch] + ); + } + } + } + } + + #[test] + #[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" + )] + fn nv21_rgba_matches_nv12_rgba_with_swapped_chroma() { + // Cross-format invariant on the RGBA path. Same shape as + // `nv21_matches_nv12_mixed_sinker_with_swapped_chroma` for RGB: + // building NV21 from NV12's bytes with the chroma pairs swapped + // must produce byte-identical RGBA. Catches cases where SWAP_UV + // is honored for RGB but not RGBA. + let w = 32u32; + let h = 16u32; + let ws = w as usize; + let hs = h as usize; + + let yp: Vec = (0..ws * hs).map(|i| ((i * 37 + 11) & 0xFF) as u8).collect(); + let mut uvp: Vec = std::vec![0u8; ws * (hs / 2)]; + for r in 0..hs / 2 { + for c in 0..ws / 2 { + uvp[r * ws + 2 * c] = ((c + r * 53) & 0xFF) as u8; + uvp[r * ws + 2 * c + 1] = ((c + r * 71) & 0xFF) as u8; + } + } + let mut vup: Vec = uvp.clone(); + for r in 0..hs / 2 { + for c in 0..ws / 2 { + vup[r * ws + 2 * c] = uvp[r * ws + 2 * c + 1]; + vup[r * ws + 2 * c + 1] = uvp[r * ws + 2 * c]; + } + } + + let nv12_src = Nv12Frame::new(&yp, &uvp, w, h, w, w); + let nv21_src = Nv21Frame::new(&yp, &vup, w, h, w, w); + + let mut rgba_nv12 = std::vec![0u8; ws * hs * 4]; + let mut rgba_nv21 = std::vec![0u8; ws * hs * 4]; + let mut s_nv12 = MixedSinker::::new(ws, hs) + .with_rgba(&mut rgba_nv12) + .unwrap(); + let mut s_nv21 = MixedSinker::::new(ws, hs) + .with_rgba(&mut rgba_nv21) + .unwrap(); + nv12_to(&nv12_src, false, ColorMatrix::Bt709, &mut s_nv12).unwrap(); + nv21_to(&nv21_src, false, ColorMatrix::Bt709, &mut s_nv21).unwrap(); + + assert_eq!(rgba_nv12, rgba_nv21); + } + // ---- NV24 MixedSinker --------------------------------------------------- // // 4:4:4 semi-planar: UV row is `2 * width` bytes (one UV pair per From d8c0f6421e94c93bb2923238ea029635c6dfe091 Mon Sep 17 00:00:00 2001 From: uqio <276879906+uqio@users.noreply.github.com> Date: Sun, 26 Apr 2026 16:40:32 +1200 Subject: [PATCH 2/2] update --- .github/workflows/benchmark.yml | 2 +- .github/workflows/ci.yml | 2 +- .github/workflows/coverage.yml | 2 +- .github/workflows/loc.yml | 2 +- src/row/arch/neon.rs | 54 ++++++++++++++++++++++++++++++--- src/row/arch/wasm_simd128.rs | 29 ++++++++++++++++++ src/row/arch/x86_avx2.rs | 29 ++++++++++++++++++ 
src/row/arch/x86_avx512.rs | 29 ++++++++++++++++++ src/row/arch/x86_sse41.rs | 29 ++++++++++++++++++ 9 files changed, 169 insertions(+), 9 deletions(-) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 2b1cf96..81e5159 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -247,7 +247,7 @@ jobs: if: always() steps: - name: Download all benchmark results - uses: actions/download-artifact@v6 + uses: actions/download-artifact@v8 with: path: all-results diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index d0fc00b..fdf5548 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -186,7 +186,7 @@ jobs: - name: Install Rust run: rustup update stable --no-self-update && rustup default stable - name: Install Intel SDE - uses: petarpetrovt/setup-sde@v2.4 + uses: petarpetrovt/setup-sde@v3.0 with: sdeVersion: 9.33.0 environmentVariableName: SDE_PATH diff --git a/.github/workflows/coverage.yml b/.github/workflows/coverage.yml index 3e65542..fdac3f0 100644 --- a/.github/workflows/coverage.yml +++ b/.github/workflows/coverage.yml @@ -156,7 +156,7 @@ jobs: - uses: actions/checkout@v6 - name: Download ${{ matrix.label }} report - uses: actions/download-artifact@v6 + uses: actions/download-artifact@v8 with: name: coverage-${{ matrix.label }} path: coverage/ diff --git a/.github/workflows/loc.yml b/.github/workflows/loc.yml index 669041e..c53d960 100644 --- a/.github/workflows/loc.yml +++ b/.github/workflows/loc.yml @@ -41,7 +41,7 @@ jobs: run: | tokeit --lang rust - name: Upload total loc to GitHub Gist - uses: actions/github-script@v8 + uses: actions/github-script@v9 with: github-token: ${{ secrets.GIST_PAT }} script: | diff --git a/src/row/arch/neon.rs b/src/row/arch/neon.rs index ae95884..d9563c9 100644 --- a/src/row/arch/neon.rs +++ b/src/row/arch/neon.rs @@ -1081,6 +1081,18 @@ pub(crate) unsafe fn p_n_to_rgb_u16_row( /// NEON NV12 → packed RGB (UV-ordered chroma). Thin wrapper over /// [`nv12_or_nv21_to_rgb_or_rgba_row_impl`] with /// `SWAP_UV = false, ALPHA = false`. +/// +/// # Safety +/// +/// Same contract as [`nv12_or_nv21_to_rgb_or_rgba_row_impl`]: +/// +/// 1. **NEON must be available on the current CPU.** Direct callers +/// are responsible for verifying this; the dispatcher in +/// [`crate::row::nv12_to_rgb_row`] checks it. +/// 2. `width & 1 == 0` (4:2:0 requires even width). +/// 3. `y.len() >= width`. +/// 4. `uv_half.len() >= width` (interleaved UV bytes, 2 per chroma pair). +/// 5. `rgb_out.len() >= 3 * width`. #[inline] #[target_feature(enable = "neon")] pub(crate) unsafe fn nv12_to_rgb_row( @@ -1102,6 +1114,12 @@ pub(crate) unsafe fn nv12_to_rgb_row( /// NEON NV21 → packed RGB (VU-ordered chroma). Thin wrapper over /// [`nv12_or_nv21_to_rgb_or_rgba_row_impl`] with /// `SWAP_UV = true, ALPHA = false`. +/// +/// # Safety +/// +/// Same contract as [`nv12_to_rgb_row`]; `vu_half` carries the same +/// number of bytes (`>= width`) but in V-then-U order per chroma +/// pair. #[inline] #[target_feature(enable = "neon")] pub(crate) unsafe fn nv21_to_rgb_row( @@ -1123,6 +1141,12 @@ pub(crate) unsafe fn nv21_to_rgb_row( /// NEON NV12 → packed RGBA (R, G, B, `0xFF` per pixel). Same /// contract as [`nv12_to_rgb_row`] but writes 4 bytes per pixel via /// `vst4q_u8`. `rgba_out.len() >= 4 * width`. +/// +/// # Safety +/// +/// Same as [`nv12_to_rgb_row`] except the output slice must be +/// `>= 4 * width` bytes (one extra byte per pixel for the opaque +/// alpha). 
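These wrappers are `pub(crate)` and elide bounds checks in release builds, so a direct crate-internal caller must discharge the contract itself before entering the kernel. A sketch of that check-then-call shape (hypothetical helper, not part of this patch; `ColorMatrix` and the wrapper are assumed to be in scope inside this module):

    #[cfg(target_arch = "aarch64")]
    fn nv12_rgba_direct(
        y: &[u8],
        uv: &[u8],
        out: &mut [u8],
        width: usize,
        matrix: ColorMatrix,
        full_range: bool,
    ) {
        assert_eq!(width & 1, 0, "4:2:0 requires even width");
        assert!(y.len() >= width && uv.len() >= width && out.len() >= 4 * width);
        if std::arch::is_aarch64_feature_detected!("neon") {
            // SAFETY: NEON verified; slice bounds asserted just above.
            unsafe { nv12_to_rgba_row(y, uv, out, width, matrix, full_range) };
        }
    }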
#[inline] #[target_feature(enable = "neon")] pub(crate) unsafe fn nv12_to_rgba_row( @@ -1144,6 +1168,11 @@ pub(crate) unsafe fn nv12_to_rgba_row( /// NEON NV21 → packed RGBA (R, G, B, `0xFF` per pixel). Same /// contract as [`nv21_to_rgb_row`] but writes 4 bytes per pixel via /// `vst4q_u8`. `rgba_out.len() >= 4 * width`. +/// +/// # Safety +/// +/// Same as [`nv21_to_rgb_row`] except the output slice must be +/// `>= 4 * width` bytes. #[inline] #[target_feature(enable = "neon")] pub(crate) unsafe fn nv21_to_rgba_row( @@ -4108,7 +4137,22 @@ mod tests { } // ---- rgb_to_hsv_row equivalence ------------------------------------ - + // + // The NEON HSV kernel uses `vmaxq_f32` / `vminq_f32` / `vdivq_f32` + // (true f32 ops). Miri's interpreter does not currently implement + // these aarch64 NEON f32 intrinsics — under + // `cargo miri test --target aarch64-unknown-linux-gnu` the calls + // raise `unsupported operation: can't call foreign function + // "llvm.aarch64.neon.fmax.v4f32"`. The previous + // `#[cfg_attr(miri, ignore = ...)]` annotations didn't suffice in + // CI (Miri tried to evaluate them anyway). Compiling the helper + // and the tests *out* under `cfg(miri)` removes the f32 + // intrinsics from the binary entirely so Miri can't trip on them. + // The other backends (wasm / x86) are tested by their respective + // arch modules; correctness of the NEON HSV path is still covered + // by host-arch CI runs that don't go through Miri. + + #[cfg(not(miri))] fn check_hsv_equivalence(rgb: &[u8], width: usize) { let mut h_scalar = std::vec![0u8; width]; let mut s_scalar = std::vec![0u8; width]; @@ -4160,21 +4204,21 @@ mod tests { } #[test] - #[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] + #[cfg(not(miri))] fn hsv_neon_matches_scalar_pseudo_random_16() { let rgb = pseudo_random_bgr(16); check_hsv_equivalence(&rgb, 16); } #[test] - #[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] + #[cfg(not(miri))] fn hsv_neon_matches_scalar_pseudo_random_1920() { let rgb = pseudo_random_bgr(1920); check_hsv_equivalence(&rgb, 1920); } #[test] - #[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] + #[cfg(not(miri))] fn hsv_neon_matches_scalar_tail_widths() { // Widths that force a non‑trivial scalar tail (non‑multiple of 16). for w in [1usize, 7, 15, 17, 31, 1921] { @@ -4184,7 +4228,7 @@ mod tests { } #[test] - #[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] + #[cfg(not(miri))] fn hsv_neon_matches_scalar_primaries_and_edges() { // Primary colors, grays, near‑saturation — exercise each hue branch // and the v==0, delta==0, h<0 wrap paths. diff --git a/src/row/arch/wasm_simd128.rs b/src/row/arch/wasm_simd128.rs index f8a794d..ffdd522 100644 --- a/src/row/arch/wasm_simd128.rs +++ b/src/row/arch/wasm_simd128.rs @@ -1320,6 +1320,18 @@ unsafe fn deinterleave_uv_u16_wasm(ptr: *const u16) -> (v128, v128) { /// WASM simd128 NV12 → packed RGB. Thin wrapper over /// [`nv12_or_nv21_to_rgb_or_rgba_row_impl`] with /// `SWAP_UV = false, ALPHA = false`. +/// +/// # Safety +/// +/// Same contract as [`nv12_or_nv21_to_rgb_or_rgba_row_impl`]: +/// +/// 1. **simd128 must be enabled at compile time.** WASM has no +/// runtime CPU detection — the module's SIMD support is fixed at +/// produce time. +/// 2. `width & 1 == 0` (4:2:0 requires even width). +/// 3. `y.len() >= width`. +/// 4. `uv_half.len() >= width` (interleaved UV bytes, 2 per chroma pair). +/// 5. `rgb_out.len() >= 3 * width`. 
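Point 1 has no runtime escape hatch on wasm32: the feature set is fixed when the module is built. One typical way to discharge it for the whole crate graph (an assumed invocation; a `.cargo/config.toml` `rustflags` entry works too):

    RUSTFLAGS="-C target-feature=+simd128" \
        cargo build --target wasm32-unknown-unknown --release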
#[inline] #[target_feature(enable = "simd128")] pub(crate) unsafe fn nv12_to_rgb_row( @@ -1340,6 +1352,12 @@ pub(crate) unsafe fn nv12_to_rgb_row( /// WASM simd128 NV21 → packed RGB. Thin wrapper over /// [`nv12_or_nv21_to_rgb_or_rgba_row_impl`] with /// `SWAP_UV = true, ALPHA = false`. +/// +/// # Safety +/// +/// Same contract as [`nv12_to_rgb_row`]; `vu_half` carries the same +/// number of bytes (`>= width`) but in V-then-U order per chroma +/// pair. #[inline] #[target_feature(enable = "simd128")] pub(crate) unsafe fn nv21_to_rgb_row( @@ -1360,6 +1378,12 @@ pub(crate) unsafe fn nv21_to_rgb_row( /// WASM simd128 NV12 → packed RGBA. Same contract as /// [`nv12_to_rgb_row`] but writes 4 bytes per pixel via /// [`write_rgba_16`]. `rgba_out.len() >= 4 * width`. +/// +/// # Safety +/// +/// Same as [`nv12_to_rgb_row`] except the output slice must be +/// `>= 4 * width` bytes (one extra byte per pixel for the opaque +/// alpha). #[inline] #[target_feature(enable = "simd128")] pub(crate) unsafe fn nv12_to_rgba_row( @@ -1380,6 +1404,11 @@ pub(crate) unsafe fn nv12_to_rgba_row( /// WASM simd128 NV21 → packed RGBA. Same contract as /// [`nv21_to_rgb_row`] but writes 4 bytes per pixel via /// [`write_rgba_16`]. `rgba_out.len() >= 4 * width`. +/// +/// # Safety +/// +/// Same as [`nv21_to_rgb_row`] except the output slice must be +/// `>= 4 * width` bytes. #[inline] #[target_feature(enable = "simd128")] pub(crate) unsafe fn nv21_to_rgba_row( diff --git a/src/row/arch/x86_avx2.rs b/src/row/arch/x86_avx2.rs index 3312a79..50c7aaf 100644 --- a/src/row/arch/x86_avx2.rs +++ b/src/row/arch/x86_avx2.rs @@ -1526,6 +1526,18 @@ unsafe fn deinterleave_uv_u16_avx2(ptr: *const u16) -> (__m256i, __m256i) { /// AVX2 NV12 → packed RGB. Thin wrapper over /// [`nv12_or_nv21_to_rgb_or_rgba_row_impl`] with /// `SWAP_UV = false, ALPHA = false`. +/// +/// # Safety +/// +/// Same contract as [`nv12_or_nv21_to_rgb_or_rgba_row_impl`]: +/// +/// 1. **AVX2 must be available on the current CPU.** Direct callers +/// are responsible for verifying this; the dispatcher in +/// [`crate::row::nv12_to_rgb_row`] checks it. +/// 2. `width & 1 == 0` (4:2:0 requires even width). +/// 3. `y.len() >= width`. +/// 4. `uv_half.len() >= width` (interleaved UV bytes, 2 per chroma pair). +/// 5. `rgb_out.len() >= 3 * width`. #[inline] #[target_feature(enable = "avx2")] pub(crate) unsafe fn nv12_to_rgb_row( @@ -1546,6 +1558,12 @@ pub(crate) unsafe fn nv12_to_rgb_row( /// AVX2 NV21 → packed RGB. Thin wrapper over /// [`nv12_or_nv21_to_rgb_or_rgba_row_impl`] with /// `SWAP_UV = true, ALPHA = false`. +/// +/// # Safety +/// +/// Same contract as [`nv12_to_rgb_row`]; `vu_half` carries the same +/// number of bytes (`>= width`) but in V-then-U order per chroma +/// pair. #[inline] #[target_feature(enable = "avx2")] pub(crate) unsafe fn nv21_to_rgb_row( @@ -1566,6 +1584,12 @@ pub(crate) unsafe fn nv21_to_rgb_row( /// AVX2 NV12 → packed RGBA. Same contract as [`nv12_to_rgb_row`] /// but writes 4 bytes per pixel via [`write_rgba_32`]. /// `rgba_out.len() >= 4 * width`. +/// +/// # Safety +/// +/// Same as [`nv12_to_rgb_row`] except the output slice must be +/// `>= 4 * width` bytes (one extra byte per pixel for the opaque +/// alpha). #[inline] #[target_feature(enable = "avx2")] pub(crate) unsafe fn nv12_to_rgba_row( @@ -1586,6 +1610,11 @@ pub(crate) unsafe fn nv12_to_rgba_row( /// AVX2 NV21 → packed RGBA. Same contract as [`nv21_to_rgb_row`] /// but writes 4 bytes per pixel via [`write_rgba_32`]. /// `rgba_out.len() >= 4 * width`. 
+/// +/// # Safety +/// +/// Same as [`nv21_to_rgb_row`] except the output slice must be +/// `>= 4 * width` bytes. #[inline] #[target_feature(enable = "avx2")] pub(crate) unsafe fn nv21_to_rgba_row( diff --git a/src/row/arch/x86_avx512.rs b/src/row/arch/x86_avx512.rs index 77ce177..e1bd915 100644 --- a/src/row/arch/x86_avx512.rs +++ b/src/row/arch/x86_avx512.rs @@ -1573,6 +1573,18 @@ unsafe fn deinterleave_uv_u16_avx512(ptr: *const u16) -> (__m512i, __m512i) { /// AVX‑512 NV12 → packed RGB. Thin wrapper over /// [`nv12_or_nv21_to_rgb_or_rgba_row_impl`] with /// `SWAP_UV = false, ALPHA = false`. +/// +/// # Safety +/// +/// Same contract as [`nv12_or_nv21_to_rgb_or_rgba_row_impl`]: +/// +/// 1. **AVX‑512F + AVX‑512BW must be available on the current CPU.** +/// Direct callers are responsible for verifying this; the +/// dispatcher in [`crate::row::nv12_to_rgb_row`] checks it. +/// 2. `width & 1 == 0` (4:2:0 requires even width). +/// 3. `y.len() >= width`. +/// 4. `uv_half.len() >= width` (interleaved UV bytes, 2 per chroma pair). +/// 5. `rgb_out.len() >= 3 * width`. #[inline] #[target_feature(enable = "avx512f,avx512bw")] pub(crate) unsafe fn nv12_to_rgb_row( @@ -1593,6 +1605,12 @@ pub(crate) unsafe fn nv12_to_rgb_row( /// AVX‑512 NV21 → packed RGB. Thin wrapper over /// [`nv12_or_nv21_to_rgb_or_rgba_row_impl`] with /// `SWAP_UV = true, ALPHA = false`. +/// +/// # Safety +/// +/// Same contract as [`nv12_to_rgb_row`]; `vu_half` carries the same +/// number of bytes (`>= width`) but in V-then-U order per chroma +/// pair. #[inline] #[target_feature(enable = "avx512f,avx512bw")] pub(crate) unsafe fn nv21_to_rgb_row( @@ -1613,6 +1631,12 @@ pub(crate) unsafe fn nv21_to_rgb_row( /// AVX‑512 NV12 → packed RGBA. Same contract as [`nv12_to_rgb_row`] /// but writes 4 bytes per pixel via [`write_rgba_64`]. /// `rgba_out.len() >= 4 * width`. +/// +/// # Safety +/// +/// Same as [`nv12_to_rgb_row`] except the output slice must be +/// `>= 4 * width` bytes (one extra byte per pixel for the opaque +/// alpha). #[inline] #[target_feature(enable = "avx512f,avx512bw")] pub(crate) unsafe fn nv12_to_rgba_row( @@ -1633,6 +1657,11 @@ pub(crate) unsafe fn nv12_to_rgba_row( /// AVX‑512 NV21 → packed RGBA. Same contract as [`nv21_to_rgb_row`] /// but writes 4 bytes per pixel via [`write_rgba_64`]. /// `rgba_out.len() >= 4 * width`. +/// +/// # Safety +/// +/// Same as [`nv21_to_rgb_row`] except the output slice must be +/// `>= 4 * width` bytes. #[inline] #[target_feature(enable = "avx512f,avx512bw")] pub(crate) unsafe fn nv21_to_rgba_row( diff --git a/src/row/arch/x86_sse41.rs b/src/row/arch/x86_sse41.rs index ad8925b..d4a342e 100644 --- a/src/row/arch/x86_sse41.rs +++ b/src/row/arch/x86_sse41.rs @@ -1339,6 +1339,18 @@ pub(crate) unsafe fn yuv_444p16_to_rgb_u16_row( /// SSE4.1 NV12 → packed RGB. Thin wrapper over /// [`nv12_or_nv21_to_rgb_or_rgba_row_impl`] with /// `SWAP_UV = false, ALPHA = false`. +/// +/// # Safety +/// +/// Same contract as [`nv12_or_nv21_to_rgb_or_rgba_row_impl`]: +/// +/// 1. **SSE4.1 must be available on the current CPU.** Direct +/// callers are responsible for verifying this; the dispatcher in +/// [`crate::row::nv12_to_rgb_row`] checks it. +/// 2. `width & 1 == 0` (4:2:0 requires even width). +/// 3. `y.len() >= width`. +/// 4. `uv_half.len() >= width` (interleaved UV bytes, 2 per chroma pair). +/// 5. `rgb_out.len() >= 3 * width`. 
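The x86 analogue of the same check-then-call shape, using the `std::arch::is_x86_feature_detected!` probe the tests above already use (hypothetical helper, not part of this patch; `ColorMatrix` and the wrapper are assumed to be in scope inside this module):

    #[cfg(target_arch = "x86_64")]
    fn nv12_rgb_direct(
        y: &[u8],
        uv: &[u8],
        out: &mut [u8],
        width: usize,
        matrix: ColorMatrix,
        full_range: bool,
    ) {
        assert_eq!(width & 1, 0, "4:2:0 requires even width");
        assert!(y.len() >= width && uv.len() >= width && out.len() >= 3 * width);
        if std::arch::is_x86_feature_detected!("sse4.1") {
            // SAFETY: SSE4.1 verified; slice bounds asserted just above.
            unsafe { nv12_to_rgb_row(y, uv, out, width, matrix, full_range) };
        }
    }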
#[inline] #[target_feature(enable = "sse4.1")] pub(crate) unsafe fn nv12_to_rgb_row( @@ -1359,6 +1371,12 @@ pub(crate) unsafe fn nv12_to_rgb_row( /// SSE4.1 NV21 → packed RGB. Thin wrapper over /// [`nv12_or_nv21_to_rgb_or_rgba_row_impl`] with /// `SWAP_UV = true, ALPHA = false`. +/// +/// # Safety +/// +/// Same contract as [`nv12_to_rgb_row`]; `vu_half` carries the same +/// number of bytes (`>= width`) but in V-then-U order per chroma +/// pair. #[inline] #[target_feature(enable = "sse4.1")] pub(crate) unsafe fn nv21_to_rgb_row( @@ -1379,6 +1397,12 @@ pub(crate) unsafe fn nv21_to_rgb_row( /// SSE4.1 NV12 → packed RGBA. Same contract as [`nv12_to_rgb_row`] /// but writes 4 bytes per pixel via [`write_rgba_16`]. /// `rgba_out.len() >= 4 * width`. +/// +/// # Safety +/// +/// Same as [`nv12_to_rgb_row`] except the output slice must be +/// `>= 4 * width` bytes (one extra byte per pixel for the opaque +/// alpha). #[inline] #[target_feature(enable = "sse4.1")] pub(crate) unsafe fn nv12_to_rgba_row( @@ -1399,6 +1423,11 @@ pub(crate) unsafe fn nv12_to_rgba_row( /// SSE4.1 NV21 → packed RGBA. Same contract as [`nv21_to_rgb_row`] /// but writes 4 bytes per pixel via [`write_rgba_16`]. /// `rgba_out.len() >= 4 * width`. +/// +/// # Safety +/// +/// Same as [`nv21_to_rgb_row`] except the output slice must be +/// `>= 4 * width` bytes. #[inline] #[target_feature(enable = "sse4.1")] pub(crate) unsafe fn nv21_to_rgba_row(