From 443efa44ebfa95a21ebaaca35a5699acf5f46a45 Mon Sep 17 00:00:00 2001
From: uqio <276879906+uqio@users.noreply.github.com>
Date: Sun, 26 Apr 2026 17:31:31 +1200
Subject: [PATCH 1/2] update

---
 src/row/arch/neon.rs         | 145 ++++++++++++++++++++++--
 src/row/arch/wasm_simd128.rs | 139 ++++++++++++++++++++---
 src/row/arch/x86_avx2.rs     | 145 +++++++++++++++++++++---
 src/row/arch/x86_avx512.rs   | 145 +++++++++++++++++++++---
 src/row/arch/x86_sse41.rs    | 147 ++++++++++++++++++++++---
 src/row/mod.rs               |  68 ++++++++++++
 src/row/scalar.rs            |  61 ++++++++--
 src/sinker/mixed.rs          | 208 +++++++++++++++++++++++++++++++++--
 8 files changed, 972 insertions(+), 86 deletions(-)

diff --git a/src/row/arch/neon.rs b/src/row/arch/neon.rs
index d9563c9..a8ea1e9 100644
--- a/src/row/arch/neon.rs
+++ b/src/row/arch/neon.rs
@@ -1548,6 +1548,8 @@ unsafe fn nv24_or_nv42_to_rgb_row_impl(
 ///
 /// # Safety
 ///
+/// Same contract as [`yuv_444_to_rgb_or_rgba_row`]:
+///
 /// 1. **NEON must be available on the current CPU.**
 /// 2. `y.len() >= width`, `u.len() >= width`, `v.len() >= width`.
 /// 3. `rgb_out.len() >= 3 * width`.
@@ -1563,11 +1565,69 @@ pub(crate) unsafe fn yuv_444_to_rgb_row(
     width: usize,
     matrix: ColorMatrix,
     full_range: bool,
+) {
+    // SAFETY: caller-checked NEON availability + slice bounds — see
+    // [`yuv_444_to_rgb_or_rgba_row`] safety contract.
+    unsafe {
+        yuv_444_to_rgb_or_rgba_row::<false>(y, u, v, rgb_out, width, matrix, full_range);
+    }
+}
+
+/// NEON YUV 4:4:4 planar → packed **RGBA** (8-bit). Same contract
+/// as [`yuv_444_to_rgb_row`] but writes 4 bytes per pixel via
+/// `vst4q_u8` (R, G, B, `0xFF`).
+///
+/// # Safety
+///
+/// Same as [`yuv_444_to_rgb_row`] except the output slice must be
+/// `>= 4 * width` bytes.
+#[inline]
+#[target_feature(enable = "neon")]
+pub(crate) unsafe fn yuv_444_to_rgba_row(
+    y: &[u8],
+    u: &[u8],
+    v: &[u8],
+    rgba_out: &mut [u8],
+    width: usize,
+    matrix: ColorMatrix,
+    full_range: bool,
+) {
+    // SAFETY: caller-checked NEON availability + slice bounds — see
+    // [`yuv_444_to_rgb_or_rgba_row`] safety contract.
+    unsafe {
+        yuv_444_to_rgb_or_rgba_row::<true>(y, u, v, rgba_out, width, matrix, full_range);
+    }
+}
+
+/// Shared NEON YUV 4:4:4 kernel for [`yuv_444_to_rgb_row`]
+/// (`ALPHA = false`, `vst3q_u8`) and [`yuv_444_to_rgba_row`]
+/// (`ALPHA = true`, `vst4q_u8` with constant `0xFF` alpha). Math is
+/// byte-identical to `scalar::yuv_444_to_rgb_or_rgba_row::<ALPHA>`;
+/// only the per-block store intrinsic differs.
+///
+/// # Safety
+///
+/// 1. **NEON must be available on the current CPU.**
+/// 2. `y.len() >= width`, `u.len() >= width`, `v.len() >= width`.
+/// 3. `out.len() >= width * (if ALPHA { 4 } else { 3 })`.
+///
+/// No width parity constraint (4:4:4).
+#[inline]
+#[target_feature(enable = "neon")]
+pub(crate) unsafe fn yuv_444_to_rgb_or_rgba_row<const ALPHA: bool>(
+    y: &[u8],
+    u: &[u8],
+    v: &[u8],
+    out: &mut [u8],
+    width: usize,
+    matrix: ColorMatrix,
+    full_range: bool,
 ) {
     debug_assert!(y.len() >= width);
     debug_assert!(u.len() >= width);
     debug_assert!(v.len() >= width);
-    debug_assert!(rgb_out.len() >= width * 3);
+    let bpp: usize = if ALPHA { 4 } else { 3 };
+    debug_assert!(out.len() >= width * bpp);
 
     let coeffs = scalar::Coefficients::for_matrix(matrix);
     let (y_off, y_scale, c_scale) = scalar::range_params(full_range);
@@ -1588,6 +1648,7 @@ pub(crate) unsafe fn yuv_444_to_rgb_row(
         let cgv = vdupq_n_s32(coeffs.g_v());
         let cbu = vdupq_n_s32(coeffs.b_u());
         let cbv = vdupq_n_s32(coeffs.b_v());
+        let alpha_u8 = vdupq_n_u8(0xFF);
 
         let mut x = 0usize;
         while x + 16 <= width {
@@ -1646,22 +1707,28 @@ pub(crate) unsafe fn yuv_444_to_rgb_row(
                 vqmovun_s16(vqaddq_s16(y_scaled_hi, r_chroma_hi)),
             );
 
-            let rgb = uint8x16x3_t(r_u8, g_u8, b_u8);
-            vst3q_u8(rgb_out.as_mut_ptr().add(x * 3), rgb);
+            if ALPHA {
+                let rgba = uint8x16x4_t(r_u8, g_u8, b_u8, alpha_u8);
+                vst4q_u8(out.as_mut_ptr().add(x * 4), rgba);
+            } else {
+                let rgb = uint8x16x3_t(r_u8, g_u8, b_u8);
+                vst3q_u8(out.as_mut_ptr().add(x * 3), rgb);
+            }
 
             x += 16;
         }
 
         if x < width {
-            scalar::yuv_444_to_rgb_row(
-                &y[x..width],
-                &u[x..width],
-                &v[x..width],
-                &mut rgb_out[x * 3..width * 3],
-                width - x,
-                matrix,
-                full_range,
-            );
+            let tail_y = &y[x..width];
+            let tail_u = &u[x..width];
+            let tail_v = &v[x..width];
+            let tail_w = width - x;
+            let tail_out = &mut out[x * bpp..width * bpp];
+            if ALPHA {
+                scalar::yuv_444_to_rgba_row(tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range);
+            } else {
+                scalar::yuv_444_to_rgb_row(tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range);
+            }
         }
     }
 }
@@ -4136,6 +4203,60 @@ mod tests {
         }
     }
 
+    // ---- yuv_444_to_rgba_row equivalence --------------------------------
+
+    fn check_yuv_444_rgba_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) {
+        let y: std::vec::Vec<u8> = (0..width).map(|i| ((i * 37 + 11) & 0xFF) as u8).collect();
+        let u: std::vec::Vec<u8> = (0..width).map(|i| ((i * 53 + 23) & 0xFF) as u8).collect();
+        let v: std::vec::Vec<u8> = (0..width).map(|i| ((i * 71 + 91) & 0xFF) as u8).collect();
+        let mut rgba_scalar = std::vec![0u8; width * 4];
+        let mut rgba_neon = std::vec![0u8; width * 4];
+
+        scalar::yuv_444_to_rgba_row(&y, &u, &v, &mut rgba_scalar, width, matrix, full_range);
+        unsafe {
+            yuv_444_to_rgba_row(&y, &u, &v, &mut rgba_neon, width, matrix, full_range);
+        }
+
+        if rgba_scalar != rgba_neon {
+            let first_diff = rgba_scalar
+                .iter()
+                .zip(rgba_neon.iter())
+                .position(|(a, b)| a != b)
+                .unwrap();
+            let pixel = first_diff / 4;
+            let channel = ["R", "G", "B", "A"][first_diff % 4];
+            panic!(
+                "NEON yuv_444 RGBA diverges from scalar at byte {first_diff} (px {pixel} {channel}, width={width}, matrix={matrix:?}, full_range={full_range}): scalar={} neon={}",
+                rgba_scalar[first_diff], rgba_neon[first_diff]
+            );
+        }
+    }
+
+    #[test]
+    #[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")]
+    fn yuv_444_neon_rgba_matches_scalar_all_matrices_16() {
+        for m in [
+            ColorMatrix::Bt601,
+            ColorMatrix::Bt709,
+            ColorMatrix::Bt2020Ncl,
+            ColorMatrix::Smpte240m,
+            ColorMatrix::Fcc,
+            ColorMatrix::YCgCo,
+        ] {
+            for full in [true, false] {
+                check_yuv_444_rgba_equivalence(16, m, full);
+            }
+        }
+    }
+
+    #[test]
+    #[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")]
+    fn yuv_444_neon_rgba_matches_scalar_widths() {
+        for w 
in [1usize, 3, 15, 17, 32, 33, 1920, 1921] { + check_yuv_444_rgba_equivalence(w, ColorMatrix::Bt709, false); + } + } + // ---- rgb_to_hsv_row equivalence ------------------------------------ // // The NEON HSV kernel uses `vmaxq_f32` / `vminq_f32` / `vdivq_f32` diff --git a/src/row/arch/wasm_simd128.rs b/src/row/arch/wasm_simd128.rs index ffdd522..7286e0b 100644 --- a/src/row/arch/wasm_simd128.rs +++ b/src/row/arch/wasm_simd128.rs @@ -1857,9 +1857,8 @@ unsafe fn nv24_or_nv42_to_rgb_row_impl( } } -/// wasm simd128 YUV 4:4:4 planar → packed RGB. 16 Y + 16 U + 16 V -/// per iteration. Same arithmetic as [`nv24_to_rgb_row`] but U and V -/// come from separate planes (no deinterleave). +/// wasm simd128 YUV 4:4:4 planar → packed RGB. Thin wrapper over +/// [`yuv_444_to_rgb_or_rgba_row`] with `ALPHA = false`. /// /// # Safety /// @@ -1876,11 +1875,65 @@ pub(crate) unsafe fn yuv_444_to_rgb_row( width: usize, matrix: ColorMatrix, full_range: bool, +) { + // SAFETY: caller-checked simd128 availability + slice bounds — see + // [`yuv_444_to_rgb_or_rgba_row`] safety contract. + unsafe { + yuv_444_to_rgb_or_rgba_row::(y, u, v, rgb_out, width, matrix, full_range); + } +} + +/// wasm simd128 YUV 4:4:4 planar → packed **RGBA** (8-bit). Same +/// contract as [`yuv_444_to_rgb_row`] but writes 4 bytes per pixel +/// via [`write_rgba_16`] (R, G, B, `0xFF`). +/// +/// # Safety +/// +/// Same as [`yuv_444_to_rgb_row`] except the output slice must be +/// `>= 4 * width` bytes. +#[inline] +#[target_feature(enable = "simd128")] +pub(crate) unsafe fn yuv_444_to_rgba_row( + y: &[u8], + u: &[u8], + v: &[u8], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller-checked simd128 availability + slice bounds — see + // [`yuv_444_to_rgb_or_rgba_row`] safety contract. + unsafe { + yuv_444_to_rgb_or_rgba_row::(y, u, v, rgba_out, width, matrix, full_range); + } +} + +/// Shared wasm simd128 YUV 4:4:4 kernel for [`yuv_444_to_rgb_row`] +/// (`ALPHA = false`, [`write_rgb_16`]) and [`yuv_444_to_rgba_row`] +/// (`ALPHA = true`, [`write_rgba_16`] with constant `0xFF` alpha). +/// +/// # Safety +/// +/// 1. **simd128 must be enabled at compile time.** +/// 2. `y.len() >= width`, `u.len() >= width`, `v.len() >= width`. +/// 3. `out.len() >= width * (if ALPHA { 4 } else { 3 })`. 
+#[inline] +#[target_feature(enable = "simd128")] +pub(crate) unsafe fn yuv_444_to_rgb_or_rgba_row( + y: &[u8], + u: &[u8], + v: &[u8], + out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, ) { debug_assert!(y.len() >= width); debug_assert!(u.len() >= width); debug_assert!(v.len() >= width); - debug_assert!(rgb_out.len() >= width * 3); + let bpp: usize = if ALPHA { 4 } else { 3 }; + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params(full_range); @@ -1898,6 +1951,7 @@ pub(crate) unsafe fn yuv_444_to_rgb_row( let cgv = i32x4_splat(coeffs.g_v()); let cbu = i32x4_splat(coeffs.b_u()); let cbv = i32x4_splat(coeffs.b_v()); + let alpha_u8 = u8x16_splat(0xFF); let mut x = 0usize; while x + 16 <= width { @@ -1953,21 +2007,26 @@ pub(crate) unsafe fn yuv_444_to_rgb_row( let g_u8 = u8x16_narrow_i16x8(g_lo, g_hi); let r_u8 = u8x16_narrow_i16x8(r_lo, r_hi); - write_rgb_16(r_u8, g_u8, b_u8, rgb_out.as_mut_ptr().add(x * 3)); + if ALPHA { + write_rgba_16(r_u8, g_u8, b_u8, alpha_u8, out.as_mut_ptr().add(x * 4)); + } else { + write_rgb_16(r_u8, g_u8, b_u8, out.as_mut_ptr().add(x * 3)); + } x += 16; } if x < width { - scalar::yuv_444_to_rgb_row( - &y[x..width], - &u[x..width], - &v[x..width], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_u = &u[x..width]; + let tail_v = &v[x..width]; + let tail_w = width - x; + let tail_out = &mut out[x * bpp..width * bpp]; + if ALPHA { + scalar::yuv_444_to_rgba_row(tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range); + } else { + scalar::yuv_444_to_rgb_row(tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range); + } } } } @@ -3721,6 +3780,58 @@ mod tests { } } + // ---- yuv_444_to_rgba_row equivalence -------------------------------- + + fn check_yuv_444_rgba_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { + let y: std::vec::Vec = (0..width).map(|i| ((i * 37 + 11) & 0xFF) as u8).collect(); + let u: std::vec::Vec = (0..width).map(|i| ((i * 53 + 23) & 0xFF) as u8).collect(); + let v: std::vec::Vec = (0..width).map(|i| ((i * 71 + 91) & 0xFF) as u8).collect(); + let mut rgba_scalar = std::vec![0u8; width * 4]; + let mut rgba_wasm = std::vec![0u8; width * 4]; + + scalar::yuv_444_to_rgba_row(&y, &u, &v, &mut rgba_scalar, width, matrix, full_range); + unsafe { + yuv_444_to_rgba_row(&y, &u, &v, &mut rgba_wasm, width, matrix, full_range); + } + + if rgba_scalar != rgba_wasm { + let first_diff = rgba_scalar + .iter() + .zip(rgba_wasm.iter()) + .position(|(a, b)| a != b) + .unwrap(); + let pixel = first_diff / 4; + let channel = ["R", "G", "B", "A"][first_diff % 4]; + panic!( + "wasm simd128 yuv_444 RGBA diverges from scalar at byte {first_diff} (px {pixel} {channel}, width={width}, matrix={matrix:?}, full_range={full_range}): scalar={} wasm={}", + rgba_scalar[first_diff], rgba_wasm[first_diff] + ); + } + } + + #[test] + fn simd128_yuv_444_rgba_matches_scalar_all_matrices_16() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_yuv_444_rgba_equivalence(16, m, full); + } + } + } + + #[test] + fn simd128_yuv_444_rgba_matches_scalar_widths() { + for w in [1usize, 3, 15, 17, 32, 33, 1920, 1921] { + check_yuv_444_rgba_equivalence(w, ColorMatrix::Bt709, false); + } + } + // ---- yuv_444p_n + yuv_444p16 equivalence 
---------------------- fn check_yuv_444p_n_equivalence( diff --git a/src/row/arch/x86_avx2.rs b/src/row/arch/x86_avx2.rs index 50c7aaf..24ceda0 100644 --- a/src/row/arch/x86_avx2.rs +++ b/src/row/arch/x86_avx2.rs @@ -2032,9 +2032,8 @@ unsafe fn nv24_or_nv42_to_rgb_row_impl( } } -/// AVX2 YUV 4:4:4 planar → packed RGB. 32 Y pixels + 32 U + 32 V -/// per iteration. Same arithmetic as [`nv24_to_rgb_row`] with U / V -/// loaded directly from separate planes (no deinterleave step). +/// AVX2 YUV 4:4:4 planar → packed RGB. Thin wrapper over +/// [`yuv_444_to_rgb_or_rgba_row`] with `ALPHA = false`. /// /// # Safety /// @@ -2051,11 +2050,65 @@ pub(crate) unsafe fn yuv_444_to_rgb_row( width: usize, matrix: ColorMatrix, full_range: bool, +) { + // SAFETY: caller-checked AVX2 availability + slice bounds — see + // [`yuv_444_to_rgb_or_rgba_row`] safety contract. + unsafe { + yuv_444_to_rgb_or_rgba_row::(y, u, v, rgb_out, width, matrix, full_range); + } +} + +/// AVX2 YUV 4:4:4 planar → packed **RGBA** (8-bit). Same contract +/// as [`yuv_444_to_rgb_row`] but writes 4 bytes per pixel via +/// [`write_rgba_32`] (R, G, B, `0xFF`). +/// +/// # Safety +/// +/// Same as [`yuv_444_to_rgb_row`] except the output slice must be +/// `>= 4 * width` bytes. +#[inline] +#[target_feature(enable = "avx2")] +pub(crate) unsafe fn yuv_444_to_rgba_row( + y: &[u8], + u: &[u8], + v: &[u8], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller-checked AVX2 availability + slice bounds — see + // [`yuv_444_to_rgb_or_rgba_row`] safety contract. + unsafe { + yuv_444_to_rgb_or_rgba_row::(y, u, v, rgba_out, width, matrix, full_range); + } +} + +/// Shared AVX2 YUV 4:4:4 kernel for [`yuv_444_to_rgb_row`] +/// (`ALPHA = false`, [`write_rgb_32`]) and [`yuv_444_to_rgba_row`] +/// (`ALPHA = true`, [`write_rgba_32`] with constant `0xFF` alpha). +/// +/// # Safety +/// +/// 1. **AVX2 must be available on the current CPU.** +/// 2. `y.len() >= width`, `u.len() >= width`, `v.len() >= width`. +/// 3. `out.len() >= width * (if ALPHA { 4 } else { 3 })`. 
+#[inline] +#[target_feature(enable = "avx2")] +pub(crate) unsafe fn yuv_444_to_rgb_or_rgba_row( + y: &[u8], + u: &[u8], + v: &[u8], + out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, ) { debug_assert!(y.len() >= width); debug_assert!(u.len() >= width); debug_assert!(v.len() >= width); - debug_assert!(rgb_out.len() >= width * 3); + let bpp: usize = if ALPHA { 4 } else { 3 }; + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params(full_range); @@ -2073,6 +2126,7 @@ pub(crate) unsafe fn yuv_444_to_rgb_row( let cgv = _mm256_set1_epi32(coeffs.g_v()); let cbu = _mm256_set1_epi32(coeffs.b_u()); let cbv = _mm256_set1_epi32(coeffs.b_v()); + let alpha_u8 = _mm256_set1_epi8(-1); // 0xFF as i8 let mut x = 0usize; while x + 32 <= width { @@ -2158,21 +2212,26 @@ pub(crate) unsafe fn yuv_444_to_rgb_row( let g_u8 = narrow_u8x32(g_lo, g_hi); let r_u8 = narrow_u8x32(r_lo, r_hi); - write_rgb_32(r_u8, g_u8, b_u8, rgb_out.as_mut_ptr().add(x * 3)); + if ALPHA { + write_rgba_32(r_u8, g_u8, b_u8, alpha_u8, out.as_mut_ptr().add(x * 4)); + } else { + write_rgb_32(r_u8, g_u8, b_u8, out.as_mut_ptr().add(x * 3)); + } x += 32; } if x < width { - scalar::yuv_444_to_rgb_row( - &y[x..width], - &u[x..width], - &v[x..width], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_u = &u[x..width]; + let tail_v = &v[x..width]; + let tail_w = width - x; + let tail_out = &mut out[x * bpp..width * bpp]; + if ALPHA { + scalar::yuv_444_to_rgba_row(tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range); + } else { + scalar::yuv_444_to_rgb_row(tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range); + } } } } @@ -4053,6 +4112,64 @@ mod tests { } } + // ---- yuv_444_to_rgba_row equivalence -------------------------------- + + fn check_yuv_444_rgba_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { + let y: std::vec::Vec = (0..width).map(|i| ((i * 37 + 11) & 0xFF) as u8).collect(); + let u: std::vec::Vec = (0..width).map(|i| ((i * 53 + 23) & 0xFF) as u8).collect(); + let v: std::vec::Vec = (0..width).map(|i| ((i * 71 + 91) & 0xFF) as u8).collect(); + let mut rgba_scalar = std::vec![0u8; width * 4]; + let mut rgba_avx2 = std::vec![0u8; width * 4]; + + scalar::yuv_444_to_rgba_row(&y, &u, &v, &mut rgba_scalar, width, matrix, full_range); + unsafe { + yuv_444_to_rgba_row(&y, &u, &v, &mut rgba_avx2, width, matrix, full_range); + } + + if rgba_scalar != rgba_avx2 { + let first_diff = rgba_scalar + .iter() + .zip(rgba_avx2.iter()) + .position(|(a, b)| a != b) + .unwrap(); + let pixel = first_diff / 4; + let channel = ["R", "G", "B", "A"][first_diff % 4]; + panic!( + "AVX2 yuv_444 RGBA diverges from scalar at byte {first_diff} (px {pixel} {channel}, width={width}, matrix={matrix:?}, full_range={full_range}): scalar={} avx2={}", + rgba_scalar[first_diff], rgba_avx2[first_diff] + ); + } + } + + #[test] + fn avx2_yuv_444_rgba_matches_scalar_all_matrices_32() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_yuv_444_rgba_equivalence(32, m, full); + } + } + } + + #[test] + fn avx2_yuv_444_rgba_matches_scalar_widths() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + for w in [31usize, 32, 33, 63, 64, 65, 1920, 
1921] { + check_yuv_444_rgba_equivalence(w, ColorMatrix::Bt709, false); + } + } + // ---- yuv_444p_n + yuv_444p16 equivalence ---------------------- fn check_yuv_444p_n_equivalence( diff --git a/src/row/arch/x86_avx512.rs b/src/row/arch/x86_avx512.rs index e1bd915..04de26d 100644 --- a/src/row/arch/x86_avx512.rs +++ b/src/row/arch/x86_avx512.rs @@ -2097,9 +2097,8 @@ unsafe fn nv24_or_nv42_to_rgb_row_impl( } } -/// AVX-512 YUV 4:4:4 planar → packed RGB. 64 Y pixels + 64 U + 64 V -/// per iteration. Same arithmetic as [`nv24_to_rgb_row`] with U / V -/// loaded directly from separate planes (no deinterleave step). +/// AVX-512 YUV 4:4:4 planar → packed RGB. Thin wrapper over +/// [`yuv_444_to_rgb_or_rgba_row`] with `ALPHA = false`. /// /// # Safety /// @@ -2116,11 +2115,65 @@ pub(crate) unsafe fn yuv_444_to_rgb_row( width: usize, matrix: ColorMatrix, full_range: bool, +) { + // SAFETY: caller-checked AVX-512BW availability + slice bounds — + // see [`yuv_444_to_rgb_or_rgba_row`] safety contract. + unsafe { + yuv_444_to_rgb_or_rgba_row::(y, u, v, rgb_out, width, matrix, full_range); + } +} + +/// AVX-512 YUV 4:4:4 planar → packed **RGBA** (8-bit). Same contract +/// as [`yuv_444_to_rgb_row`] but writes 4 bytes per pixel via +/// [`write_rgba_64`] (R, G, B, `0xFF`). +/// +/// # Safety +/// +/// Same as [`yuv_444_to_rgb_row`] except the output slice must be +/// `>= 4 * width` bytes. +#[inline] +#[target_feature(enable = "avx512f,avx512bw")] +pub(crate) unsafe fn yuv_444_to_rgba_row( + y: &[u8], + u: &[u8], + v: &[u8], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller-checked AVX-512BW availability + slice bounds — + // see [`yuv_444_to_rgb_or_rgba_row`] safety contract. + unsafe { + yuv_444_to_rgb_or_rgba_row::(y, u, v, rgba_out, width, matrix, full_range); + } +} + +/// Shared AVX-512 YUV 4:4:4 kernel for [`yuv_444_to_rgb_row`] +/// (`ALPHA = false`, [`write_rgb_64`]) and [`yuv_444_to_rgba_row`] +/// (`ALPHA = true`, [`write_rgba_64`] with constant `0xFF` alpha). +/// +/// # Safety +/// +/// 1. **AVX-512F + AVX-512BW must be available.** +/// 2. `y.len() >= width`, `u.len() >= width`, `v.len() >= width`. +/// 3. `out.len() >= width * (if ALPHA { 4 } else { 3 })`. 
+#[inline] +#[target_feature(enable = "avx512f,avx512bw")] +pub(crate) unsafe fn yuv_444_to_rgb_or_rgba_row( + y: &[u8], + u: &[u8], + v: &[u8], + out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, ) { debug_assert!(y.len() >= width); debug_assert!(u.len() >= width); debug_assert!(v.len() >= width); - debug_assert!(rgb_out.len() >= width * 3); + let bpp: usize = if ALPHA { 4 } else { 3 }; + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params(full_range); @@ -2138,6 +2191,7 @@ pub(crate) unsafe fn yuv_444_to_rgb_row( let cgv = _mm512_set1_epi32(coeffs.g_v()); let cbu = _mm512_set1_epi32(coeffs.b_u()); let cbv = _mm512_set1_epi32(coeffs.b_v()); + let alpha_u8 = _mm512_set1_epi8(-1); // 0xFF as i8 let pack_fixup = _mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7); @@ -2237,21 +2291,26 @@ pub(crate) unsafe fn yuv_444_to_rgb_row( let g_u8 = narrow_u8x64(g_lo, g_hi, pack_fixup); let r_u8 = narrow_u8x64(r_lo, r_hi, pack_fixup); - write_rgb_64(r_u8, g_u8, b_u8, rgb_out.as_mut_ptr().add(x * 3)); + if ALPHA { + write_rgba_64(r_u8, g_u8, b_u8, alpha_u8, out.as_mut_ptr().add(x * 4)); + } else { + write_rgb_64(r_u8, g_u8, b_u8, out.as_mut_ptr().add(x * 3)); + } x += 64; } if x < width { - scalar::yuv_444_to_rgb_row( - &y[x..width], - &u[x..width], - &v[x..width], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_u = &u[x..width]; + let tail_v = &v[x..width]; + let tail_w = width - x; + let tail_out = &mut out[x * bpp..width * bpp]; + if ALPHA { + scalar::yuv_444_to_rgba_row(tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range); + } else { + scalar::yuv_444_to_rgb_row(tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range); + } } } } @@ -4224,6 +4283,64 @@ mod tests { } } + // ---- yuv_444_to_rgba_row equivalence -------------------------------- + + fn check_yuv_444_rgba_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { + let y: std::vec::Vec = (0..width).map(|i| ((i * 37 + 11) & 0xFF) as u8).collect(); + let u: std::vec::Vec = (0..width).map(|i| ((i * 53 + 23) & 0xFF) as u8).collect(); + let v: std::vec::Vec = (0..width).map(|i| ((i * 71 + 91) & 0xFF) as u8).collect(); + let mut rgba_scalar = std::vec![0u8; width * 4]; + let mut rgba_avx512 = std::vec![0u8; width * 4]; + + scalar::yuv_444_to_rgba_row(&y, &u, &v, &mut rgba_scalar, width, matrix, full_range); + unsafe { + yuv_444_to_rgba_row(&y, &u, &v, &mut rgba_avx512, width, matrix, full_range); + } + + if rgba_scalar != rgba_avx512 { + let first_diff = rgba_scalar + .iter() + .zip(rgba_avx512.iter()) + .position(|(a, b)| a != b) + .unwrap(); + let pixel = first_diff / 4; + let channel = ["R", "G", "B", "A"][first_diff % 4]; + panic!( + "AVX-512 yuv_444 RGBA diverges from scalar at byte {first_diff} (px {pixel} {channel}, width={width}, matrix={matrix:?}, full_range={full_range}): scalar={} avx512={}", + rgba_scalar[first_diff], rgba_avx512[first_diff] + ); + } + } + + #[test] + fn avx512_yuv_444_rgba_matches_scalar_all_matrices_64() { + if !std::arch::is_x86_feature_detected!("avx512bw") { + return; + } + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_yuv_444_rgba_equivalence(64, m, full); + } + } + } + + #[test] + fn avx512_yuv_444_rgba_matches_scalar_widths() { + if 
!std::arch::is_x86_feature_detected!("avx512bw") { + return; + } + for w in [63usize, 64, 65, 127, 128, 129, 1920, 1921] { + check_yuv_444_rgba_equivalence(w, ColorMatrix::Bt709, false); + } + } + // ---- yuv_444p_n + yuv_444p16 equivalence ---------------------- fn check_yuv_444p_n_equivalence( diff --git a/src/row/arch/x86_sse41.rs b/src/row/arch/x86_sse41.rs index d4a342e..35a9453 100644 --- a/src/row/arch/x86_sse41.rs +++ b/src/row/arch/x86_sse41.rs @@ -1795,9 +1795,8 @@ unsafe fn nv24_or_nv42_to_rgb_row_impl( } } -/// SSE4.1 YUV 4:4:4 planar → packed RGB. 16 Y pixels + 16 U + 16 V -/// per iteration. Same arithmetic as [`nv24_to_rgb_row`] but U and V -/// come from separate planes (no deinterleave). +/// SSE4.1 YUV 4:4:4 planar → packed RGB. Thin wrapper over +/// [`yuv_444_to_rgb_or_rgba_row`] with `ALPHA = false`. /// /// # Safety /// @@ -1814,11 +1813,67 @@ pub(crate) unsafe fn yuv_444_to_rgb_row( width: usize, matrix: ColorMatrix, full_range: bool, +) { + // SAFETY: caller-checked SSE4.1 availability + slice bounds — see + // [`yuv_444_to_rgb_or_rgba_row`] safety contract. + unsafe { + yuv_444_to_rgb_or_rgba_row::(y, u, v, rgb_out, width, matrix, full_range); + } +} + +/// SSE4.1 YUV 4:4:4 planar → packed **RGBA** (8-bit). Same contract +/// as [`yuv_444_to_rgb_row`] but writes 4 bytes per pixel via +/// [`write_rgba_16`] (R, G, B, `0xFF`). +/// +/// # Safety +/// +/// Same as [`yuv_444_to_rgb_row`] except the output slice must be +/// `>= 4 * width` bytes. +#[inline] +#[target_feature(enable = "sse4.1")] +pub(crate) unsafe fn yuv_444_to_rgba_row( + y: &[u8], + u: &[u8], + v: &[u8], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller-checked SSE4.1 availability + slice bounds — see + // [`yuv_444_to_rgb_or_rgba_row`] safety contract. + unsafe { + yuv_444_to_rgb_or_rgba_row::(y, u, v, rgba_out, width, matrix, full_range); + } +} + +/// Shared SSE4.1 YUV 4:4:4 kernel for [`yuv_444_to_rgb_row`] +/// (`ALPHA = false`, [`write_rgb_16`]) and [`yuv_444_to_rgba_row`] +/// (`ALPHA = true`, [`write_rgba_16`] with constant `0xFF` alpha). +/// Math is byte-identical to +/// `scalar::yuv_444_to_rgb_or_rgba_row::`. +/// +/// # Safety +/// +/// 1. **SSE4.1 must be available on the current CPU.** +/// 2. `y.len() >= width`, `u.len() >= width`, `v.len() >= width`. +/// 3. `out.len() >= width * (if ALPHA { 4 } else { 3 })`. 
+#[inline] +#[target_feature(enable = "sse4.1")] +pub(crate) unsafe fn yuv_444_to_rgb_or_rgba_row( + y: &[u8], + u: &[u8], + v: &[u8], + out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, ) { debug_assert!(y.len() >= width); debug_assert!(u.len() >= width); debug_assert!(v.len() >= width); - debug_assert!(rgb_out.len() >= width * 3); + let bpp: usize = if ALPHA { 4 } else { 3 }; + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params(full_range); @@ -1836,6 +1891,7 @@ pub(crate) unsafe fn yuv_444_to_rgb_row( let cgv = _mm_set1_epi32(coeffs.g_v()); let cbu = _mm_set1_epi32(coeffs.b_u()); let cbv = _mm_set1_epi32(coeffs.b_v()); + let alpha_u8 = _mm_set1_epi8(-1); // 0xFF as i8 let mut x = 0usize; while x + 16 <= width { @@ -1892,21 +1948,26 @@ pub(crate) unsafe fn yuv_444_to_rgb_row( let g_u8 = _mm_packus_epi16(g_lo, g_hi); let r_u8 = _mm_packus_epi16(r_lo, r_hi); - write_rgb_16(r_u8, g_u8, b_u8, rgb_out.as_mut_ptr().add(x * 3)); + if ALPHA { + write_rgba_16(r_u8, g_u8, b_u8, alpha_u8, out.as_mut_ptr().add(x * 4)); + } else { + write_rgb_16(r_u8, g_u8, b_u8, out.as_mut_ptr().add(x * 3)); + } x += 16; } if x < width { - scalar::yuv_444_to_rgb_row( - &y[x..width], - &u[x..width], - &v[x..width], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_u = &u[x..width]; + let tail_v = &v[x..width]; + let tail_w = width - x; + let tail_out = &mut out[x * bpp..width * bpp]; + if ALPHA { + scalar::yuv_444_to_rgba_row(tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range); + } else { + scalar::yuv_444_to_rgb_row(tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range); + } } } } @@ -3549,6 +3610,64 @@ mod tests { } } + // ---- yuv_444_to_rgba_row equivalence -------------------------------- + + fn check_yuv_444_rgba_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { + let y: std::vec::Vec = (0..width).map(|i| ((i * 37 + 11) & 0xFF) as u8).collect(); + let u: std::vec::Vec = (0..width).map(|i| ((i * 53 + 23) & 0xFF) as u8).collect(); + let v: std::vec::Vec = (0..width).map(|i| ((i * 71 + 91) & 0xFF) as u8).collect(); + let mut rgba_scalar = std::vec![0u8; width * 4]; + let mut rgba_sse41 = std::vec![0u8; width * 4]; + + scalar::yuv_444_to_rgba_row(&y, &u, &v, &mut rgba_scalar, width, matrix, full_range); + unsafe { + yuv_444_to_rgba_row(&y, &u, &v, &mut rgba_sse41, width, matrix, full_range); + } + + if rgba_scalar != rgba_sse41 { + let first_diff = rgba_scalar + .iter() + .zip(rgba_sse41.iter()) + .position(|(a, b)| a != b) + .unwrap(); + let pixel = first_diff / 4; + let channel = ["R", "G", "B", "A"][first_diff % 4]; + panic!( + "SSE4.1 yuv_444 RGBA diverges from scalar at byte {first_diff} (px {pixel} {channel}, width={width}, matrix={matrix:?}, full_range={full_range}): scalar={} sse41={}", + rgba_scalar[first_diff], rgba_sse41[first_diff] + ); + } + } + + #[test] + fn sse41_yuv_444_rgba_matches_scalar_all_matrices_16() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_yuv_444_rgba_equivalence(16, m, full); + } + } + } + + #[test] + fn sse41_yuv_444_rgba_matches_scalar_widths() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + for w in [1usize, 3, 15, 17, 32, 
33, 1920, 1921] { + check_yuv_444_rgba_equivalence(w, ColorMatrix::Bt709, false); + } + } + // ---- yuv_444p_n + yuv_444p16 equivalence ---------------------- fn check_yuv_444p_n_equivalence( diff --git a/src/row/mod.rs b/src/row/mod.rs index f30667d..6499cec 100644 --- a/src/row/mod.rs +++ b/src/row/mod.rs @@ -790,6 +790,74 @@ pub fn yuv_444_to_rgb_row( scalar::yuv_444_to_rgb_row(y, u, v, rgb_out, width, matrix, full_range); } +/// Converts one row of YUV 4:4:4 planar to packed **RGBA** (8-bit). +/// Same numerical contract as [`yuv_444_to_rgb_row`]; the only +/// differences are the per-pixel stride (4 vs 3) and the alpha byte +/// (`0xFF`, opaque, for every pixel). `rgba_out.len() >= 4 * width`. +/// `use_simd = false` forces scalar. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv_444_to_rgba_row( + y: &[u8], + u: &[u8], + v: &[u8], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + let rgba_min = rgba_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(u.len() >= width, "u row too short"); + assert!(v.len() >= width, "v row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + unsafe { + arch::neon::yuv_444_to_rgba_row(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + unsafe { + arch::x86_avx512::yuv_444_to_rgba_row(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + unsafe { + arch::x86_avx2::yuv_444_to_rgba_row(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + unsafe { + arch::x86_sse41::yuv_444_to_rgba_row(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + unsafe { + arch::wasm_simd128::yuv_444_to_rgba_row(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_444_to_rgba_row(y, u, v, rgba_out, width, matrix, full_range); +} + /// YUV 4:4:4 planar 10/12/14-bit → **u8** RGB dispatcher. Const /// generic over `BITS ∈ {10, 12, 14}`. Dispatches to the best /// available backend for the current target (NEON / SSE4.1 / AVX2 / diff --git a/src/row/scalar.rs b/src/row/scalar.rs index e6d07ac..1a76503 100644 --- a/src/row/scalar.rs +++ b/src/row/scalar.rs @@ -357,10 +357,12 @@ fn nv24_or_nv42_to_rgb_row_impl( } } -/// YUV 4:4:4 planar → packed RGB. One UV pair per Y pixel, U/V from -/// separate planes. Same arithmetic as -/// [`nv24_to_rgb_row`] (4:4:4 semi-planar) but without the -/// deinterleave step — U and V come pre-separated. +/// YUV 4:4:4 planar → packed RGB. Thin wrapper over +/// [`yuv_444_to_rgb_or_rgba_row`] with `ALPHA = false`. +/// +/// One UV pair per Y pixel, U/V from separate planes. Same +/// arithmetic as [`nv24_to_rgb_row`] (4:4:4 semi-planar) but +/// without the deinterleave step — U and V come pre-separated. /// /// # Panics (debug builds) /// @@ -375,11 +377,51 @@ pub(crate) fn yuv_444_to_rgb_row( width: usize, matrix: ColorMatrix, full_range: bool, +) { + yuv_444_to_rgb_or_rgba_row::(y, u, v, rgb_out, width, matrix, full_range); +} + +/// YUV 4:4:4 planar → packed `R, G, B, A` quadruplets with constant +/// `A = 0xFF`. First three bytes per pixel are byte-identical to +/// [`yuv_444_to_rgb_row`]. `rgba_out.len() >= 4 * width`. 
+#[cfg_attr(not(tarpaulin), inline(always))]
+pub(crate) fn yuv_444_to_rgba_row(
+    y: &[u8],
+    u: &[u8],
+    v: &[u8],
+    rgba_out: &mut [u8],
+    width: usize,
+    matrix: ColorMatrix,
+    full_range: bool,
+) {
+    yuv_444_to_rgb_or_rgba_row::<true>(y, u, v, rgba_out, width, matrix, full_range);
+}
+
+/// Shared scalar kernel for [`yuv_444_to_rgb_row`] (`ALPHA = false`,
+/// 3 bpp) and [`yuv_444_to_rgba_row`] (`ALPHA = true`, 4 bpp + opaque
+/// alpha). Math is identical; only the per-pixel store stride
+/// differs. `const` generic monomorphizes per call site, so the
+/// `if ALPHA` branches are eliminated.
+///
+/// # Panics (debug builds)
+///
+/// - `y.len() >= width`, `u.len() >= width`, `v.len() >= width`,
+///   `out.len() >= width * (if ALPHA { 4 } else { 3 })`.
+#[cfg_attr(not(tarpaulin), inline(always))]
+pub(crate) fn yuv_444_to_rgb_or_rgba_row<const ALPHA: bool>(
+    y: &[u8],
+    u: &[u8],
+    v: &[u8],
+    out: &mut [u8],
+    width: usize,
+    matrix: ColorMatrix,
+    full_range: bool,
 ) {
     debug_assert!(y.len() >= width, "y row too short");
     debug_assert!(u.len() >= width, "u row too short");
     debug_assert!(v.len() >= width, "v row too short");
-    debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
+    let bpp: usize = if ALPHA { 4 } else { 3 };
+    debug_assert!(out.len() >= width * bpp, "out row too short for {bpp}bpp");
 
     let coeffs = Coefficients::for_matrix(matrix);
     let (y_off, y_scale, c_scale) = range_params(full_range);
@@ -395,9 +437,12 @@ pub(crate) fn yuv_444_to_rgb_row(
         let b_chroma = (coeffs.b_u() * u_d + coeffs.b_v() * v_d + RND) >> 15;
 
         let y0 = ((y[x] as i32 - y_off) * y_scale + RND) >> 15;
-        rgb_out[x * 3] = clamp_u8(y0 + r_chroma);
-        rgb_out[x * 3 + 1] = clamp_u8(y0 + g_chroma);
-        rgb_out[x * 3 + 2] = clamp_u8(y0 + b_chroma);
+        out[x * bpp] = clamp_u8(y0 + r_chroma);
+        out[x * bpp + 1] = clamp_u8(y0 + g_chroma);
+        out[x * bpp + 2] = clamp_u8(y0 + b_chroma);
+        if ALPHA {
+            out[x * bpp + 3] = 0xFF;
+        }
     }
 }
 
diff --git a/src/sinker/mixed.rs b/src/sinker/mixed.rs
index 146db35..94ca4e4 100644
--- a/src/sinker/mixed.rs
+++ b/src/sinker/mixed.rs
@@ -67,12 +67,12 @@ use crate::{
        p010_to_rgb_row, p010_to_rgb_u16_row, p012_to_rgb_row, p012_to_rgb_u16_row, p016_to_rgb_row,
        p016_to_rgb_u16_row, p410_to_rgb_row, p410_to_rgb_u16_row, p412_to_rgb_row, p412_to_rgb_u16_row,
        p416_to_rgb_row, p416_to_rgb_u16_row, rgb_to_hsv_row, yuv_420_to_rgb_row,
-        yuv_420_to_rgba_row, yuv_444_to_rgb_row, yuv420p9_to_rgb_row, yuv420p9_to_rgb_u16_row,
-        yuv420p10_to_rgb_row, yuv420p10_to_rgb_u16_row, yuv420p12_to_rgb_row, yuv420p12_to_rgb_u16_row,
-        yuv420p14_to_rgb_row, yuv420p14_to_rgb_u16_row, yuv420p16_to_rgb_row, yuv420p16_to_rgb_u16_row,
-        yuv444p9_to_rgb_row, yuv444p9_to_rgb_u16_row, yuv444p10_to_rgb_row, yuv444p10_to_rgb_u16_row,
-        yuv444p12_to_rgb_row, yuv444p12_to_rgb_u16_row, yuv444p14_to_rgb_row, yuv444p14_to_rgb_u16_row,
-        yuv444p16_to_rgb_row, yuv444p16_to_rgb_u16_row,
+        yuv_420_to_rgba_row, yuv_444_to_rgb_row, yuv_444_to_rgba_row, yuv420p9_to_rgb_row,
+        yuv420p9_to_rgb_u16_row, yuv420p10_to_rgb_row, yuv420p10_to_rgb_u16_row, yuv420p12_to_rgb_row,
+        yuv420p12_to_rgb_u16_row, yuv420p14_to_rgb_row, yuv420p14_to_rgb_u16_row, yuv420p16_to_rgb_row,
+        yuv420p16_to_rgb_u16_row, yuv444p9_to_rgb_row, yuv444p9_to_rgb_u16_row, yuv444p10_to_rgb_row,
+        yuv444p10_to_rgb_u16_row, yuv444p12_to_rgb_row, yuv444p12_to_rgb_u16_row, yuv444p14_to_rgb_row,
+        yuv444p14_to_rgb_u16_row, yuv444p16_to_rgb_row, yuv444p16_to_rgb_u16_row,
     },
     yuv::{
         Nv12, Nv12Row, Nv12Sink, Nv16, Nv16Row, Nv16Sink, Nv21, Nv21Row, Nv21Sink, Nv24, Nv24Row,
@@ -1110,12 
+1110,12 @@ impl<'a> MixedSinker<'a, Yuv420p> { /// /// ```compile_fail /// // Attaching RGBA to a sink that doesn't write it is rejected - /// // at compile time. Yuv444p (4:4:4 planar) has not yet been + /// // at compile time. Nv24 (4:4:4 semi‑planar) has not yet been /// // wired for RGBA; once that lands the negative example here /// // moves to the next not‑yet‑wired format. - /// use colconv::{sinker::MixedSinker, yuv::Yuv444p}; + /// use colconv::{sinker::MixedSinker, yuv::Nv24}; /// let mut buf = vec![0u8; 16 * 8 * 4]; - /// let _ = MixedSinker::::new(16, 8).with_rgba(&mut buf); + /// let _ = MixedSinker::::new(16, 8).with_rgba(&mut buf); /// ``` #[cfg_attr(not(tarpaulin), inline(always))] pub fn with_rgba(mut self, buf: &'a mut [u8]) -> Result { @@ -1525,7 +1525,40 @@ impl PixelSink for MixedSinker<'_, Yuv422p> { // ---- Yuv444p impl ------------------------------------------------------- // // 4:4:4 planar: U and V are full-width, full-height. No width parity -// constraint. Uses the new `yuv_444_to_rgb_row` kernel family. +// constraint. Uses the `yuv_444_to_rgb_row` / `yuv_444_to_rgba_row` +// kernel family. + +impl<'a> MixedSinker<'a, Yuv444p> { + /// Attaches a packed 32‑bit RGBA output buffer. + /// + /// Only available on sinker types whose `PixelSink` impl writes + /// RGBA — see [`MixedSinker::::with_rgba`] for the same + /// rationale and constraints. Yuv444p has no alpha plane, so every + /// alpha byte is filled with `0xFF` (opaque). + /// + /// Returns `Err(RgbaBufferTooShort)` if + /// `buf.len() < width × height × 4`, or `Err(GeometryOverflow)` on + /// 32‑bit targets when the product overflows. + #[cfg_attr(not(tarpaulin), inline(always))] + pub fn with_rgba(mut self, buf: &'a mut [u8]) -> Result { + self.set_rgba(buf)?; + Ok(self) + } + + /// In-place variant of [`with_rgba`](Self::with_rgba). + #[cfg_attr(not(tarpaulin), inline(always))] + pub fn set_rgba(&mut self, buf: &'a mut [u8]) -> Result<&mut Self, MixedSinkerError> { + let expected = self.frame_bytes(4)?; + if buf.len() < expected { + return Err(MixedSinkerError::RgbaBufferTooShort { + expected, + actual: buf.len(), + }); + } + self.rgba = Some(buf); + Ok(self) + } +} impl Yuv444pSink for MixedSinker<'_, Yuv444p> {} @@ -1576,6 +1609,7 @@ impl PixelSink for MixedSinker<'_, Yuv444p> { let Self { rgb, + rgba, luma, hsv, rgb_scratch, @@ -1589,6 +1623,30 @@ impl PixelSink for MixedSinker<'_, Yuv444p> { luma[one_plane_start..one_plane_end].copy_from_slice(&row.y()[..w]); } + // Native RGBA: independent kernel run, separate from RGB. Default + // alpha = 0xFF since Yuv444p has no alpha plane. 
+ if let Some(buf) = rgba.as_deref_mut() { + let rgba_plane_end = + one_plane_end + .checked_mul(4) + .ok_or(MixedSinkerError::GeometryOverflow { + width: w, + height: h, + channels: 4, + })?; + let rgba_plane_start = one_plane_start * 4; + yuv_444_to_rgba_row( + row.y(), + row.u(), + row.v(), + &mut buf[rgba_plane_start..rgba_plane_end], + w, + row.matrix(), + row.full_range(), + use_simd, + ); + } + let want_rgb = rgb.is_some(); let want_hsv = hsv.is_some(); if !want_rgb && !want_hsv { @@ -11181,6 +11239,136 @@ mod tests { } } + // ---- Yuv444p RGBA (Ship 8 PR 4a) tests ---------------------------------- + + #[test] + #[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" + )] + fn yuv444p_rgba_only_converts_gray_to_gray_with_opaque_alpha() { + let (yp, up, vp) = solid_yuv444p_frame(16, 8, 128, 128, 128); + let src = Yuv444pFrame::new(&yp, &up, &vp, 16, 8, 16, 16, 16); + + let mut rgba = std::vec![0u8; 16 * 8 * 4]; + let mut sink = MixedSinker::::new(16, 8) + .with_rgba(&mut rgba) + .unwrap(); + yuv444p_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); + + for px in rgba.chunks(4) { + assert!(px[0].abs_diff(128) <= 1, "R"); + assert_eq!(px[0], px[1], "RGB monochromatic"); + assert_eq!(px[1], px[2], "RGB monochromatic"); + assert_eq!(px[3], 0xFF, "alpha must default to opaque"); + } + } + + #[test] + #[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" + )] + fn yuv444p_with_rgb_and_with_rgba_produce_byte_identical_rgb_bytes() { + let w = 32u32; + let h = 16u32; + let ws = w as usize; + let hs = h as usize; + let (yp, up, vp) = solid_yuv444p_frame(w, h, 180, 60, 200); + let src = Yuv444pFrame::new(&yp, &up, &vp, w, h, w, w, w); + + let mut rgb = std::vec![0u8; ws * hs * 3]; + let mut rgba = std::vec![0u8; ws * hs * 4]; + let mut sink = MixedSinker::::new(ws, hs) + .with_rgb(&mut rgb) + .unwrap() + .with_rgba(&mut rgba) + .unwrap(); + yuv444p_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); + + for i in 0..(ws * hs) { + assert_eq!(rgba[i * 4], rgb[i * 3], "R differs at pixel {i}"); + assert_eq!(rgba[i * 4 + 1], rgb[i * 3 + 1], "G differs at pixel {i}"); + assert_eq!(rgba[i * 4 + 2], rgb[i * 3 + 2], "B differs at pixel {i}"); + assert_eq!(rgba[i * 4 + 3], 0xFF, "A not opaque at pixel {i}"); + } + } + + #[test] + fn yuv444p_rgba_buffer_too_short_returns_err() { + let mut rgba_short = std::vec![0u8; 16 * 8 * 4 - 1]; + let result = MixedSinker::::new(16, 8).with_rgba(&mut rgba_short); + let Err(err) = result else { + panic!("expected RgbaBufferTooShort error"); + }; + assert!(matches!( + err, + MixedSinkerError::RgbaBufferTooShort { + expected: 512, + actual: 511, + } + )); + } + + #[test] + #[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" + )] + fn yuv444p_rgba_simd_matches_scalar_with_random_yuv() { + // 4:4:4 has full-width chroma — U / V are width-sized per row. + // Width 1922 forces both the SIMD main loop AND scalar tail + // across every backend block size (16/32/64). 
+        let w = 1922usize;
+        let h = 4usize;
+        let mut yp = std::vec![0u8; w * h];
+        let mut up = std::vec![0u8; w * h];
+        let mut vp = std::vec![0u8; w * h];
+        pseudo_random_u8(&mut yp, 0xC001_C0DE);
+        pseudo_random_u8(&mut up, 0xCAFE_F00D);
+        pseudo_random_u8(&mut vp, 0xDEAD_BEEF);
+        let src = Yuv444pFrame::new(
+            &yp, &up, &vp, w as u32, h as u32, w as u32, w as u32, w as u32,
+        );
+
+        for &matrix in &[
+            ColorMatrix::Bt601,
+            ColorMatrix::Bt709,
+            ColorMatrix::Bt2020Ncl,
+            ColorMatrix::YCgCo,
+        ] {
+            for &full_range in &[true, false] {
+                let mut rgba_simd = std::vec![0u8; w * h * 4];
+                let mut rgba_scalar = std::vec![0u8; w * h * 4];
+
+                let mut s_simd = MixedSinker::<Yuv444p>::new(w, h)
+                    .with_rgba(&mut rgba_simd)
+                    .unwrap();
+                yuv444p_to(&src, full_range, matrix, &mut s_simd).unwrap();
+
+                let mut s_scalar = MixedSinker::<Yuv444p>::new(w, h)
+                    .with_rgba(&mut rgba_scalar)
+                    .unwrap();
+                s_scalar.set_simd(false);
+                yuv444p_to(&src, full_range, matrix, &mut s_scalar).unwrap();
+
+                if rgba_simd != rgba_scalar {
+                    let mismatch = rgba_simd
+                        .iter()
+                        .zip(rgba_scalar.iter())
+                        .position(|(a, b)| a != b)
+                        .unwrap();
+                    let pixel = mismatch / 4;
+                    let channel = ["R", "G", "B", "A"][mismatch % 4];
+                    panic!(
+                        "Yuv444p RGBA SIMD ≠ scalar at byte {mismatch} (px {pixel} {channel}) for matrix={matrix:?} full_range={full_range}: simd={} scalar={}",
+                        rgba_simd[mismatch], rgba_scalar[mismatch]
+                    );
+                }
+            }
+        }
+    }
+
     #[test]
     #[cfg_attr(
         miri,

From fa48ba17f5871e3e20b9822359fa8217af0d9815 Mon Sep 17 00:00:00 2001
From: uqio <276879906+uqio@users.noreply.github.com>
Date: Sun, 26 Apr 2026 17:49:01 +1200
Subject: [PATCH 2/2] update

---
 src/row/arch/neon.rs | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/row/arch/neon.rs b/src/row/arch/neon.rs
index a8ea1e9..c5c72c7 100644
--- a/src/row/arch/neon.rs
+++ b/src/row/arch/neon.rs
@@ -1548,7 +1548,9 @@ unsafe fn nv24_or_nv42_to_rgb_row_impl(
 ///
 /// # Safety
 ///
-/// Same contract as [`yuv_444_to_rgb_or_rgba_row`]:
+/// Same contract as [`yuv_444_to_rgb_or_rgba_row`] with
+/// `ALPHA = false` (so `out.len() >= width * 3` specializes to
+/// `rgb_out.len() >= 3 * width`):
 ///
 /// 1. **NEON must be available on the current CPU.**
 /// 2. `y.len() >= width`, `u.len() >= width`, `v.len() >= width`.
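
Usage sketch (editor's note, not part of either patch): the row-level entry point added to src/row/mod.rs picks a NEON/SSE4.1/AVX2/AVX-512/simd128 backend at runtime and otherwise falls back to the scalar kernel. The snippet below takes the yuv_444_to_rgba_row signature (y, u, v, rgba_out, width, matrix, full_range, use_simd) directly from the patch, but assumes the `row` module and `ColorMatrix` are publicly reachable from the `colconv` crate named in the existing doctest; those paths are not confirmed by the diff.

use colconv::row::{yuv_444_to_rgba_row, ColorMatrix}; // assumed public paths

fn rgba_row_demo() {
    // Width 33 is deliberately not a multiple of any SIMD block size
    // (16/32/64), so the dispatcher exercises both the vector main
    // loop and the scalar tail.
    let width = 33usize;
    let y = vec![120u8; width];
    let u = vec![90u8; width];
    let v = vec![200u8; width];

    let mut rgba_simd = vec![0u8; width * 4];
    let mut rgba_scalar = vec![0u8; width * 4];

    // use_simd = true picks the best backend detected at runtime.
    yuv_444_to_rgba_row(&y, &u, &v, &mut rgba_simd, width, ColorMatrix::Bt709, false, true);
    // use_simd = false forces the scalar reference kernel.
    yuv_444_to_rgba_row(&y, &u, &v, &mut rgba_scalar, width, ColorMatrix::Bt709, false, false);

    assert_eq!(rgba_simd, rgba_scalar);
    assert!(rgba_simd.chunks(4).all(|px| px[3] == 0xFF)); // alpha is always opaque
}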
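A second sketch for the sinker-level surface wired up in src/sinker/mixed.rs, mirroring the new in-tree tests. The crate name `colconv` and the `sinker`/`yuv` module paths come from the existing doctest; treating `Yuv444pFrame`, `yuv444p_to`, and `ColorMatrix` as public exports is an assumption, since the tests reach them through crate-internal paths.

use colconv::{sinker::MixedSinker, yuv::Yuv444p};
use colconv::{yuv::Yuv444pFrame, yuv444p_to, ColorMatrix}; // assumed exports

fn rgba_sinker_demo() {
    let (w, h) = (16usize, 8usize);
    // Solid mid-grey 4:4:4 planes; chroma is full width, stride == width.
    let yp = vec![128u8; w * h];
    let up = vec![128u8; w * h];
    let vp = vec![128u8; w * h];
    let src = Yuv444pFrame::new(
        &yp, &up, &vp, w as u32, h as u32, w as u32, w as u32, w as u32,
    );

    // with_rgba rejects buffers shorter than width * height * 4 bytes
    // with Err(RgbaBufferTooShort).
    let mut rgba = vec![0u8; w * h * 4];
    let mut sink = MixedSinker::<Yuv444p>::new(w, h)
        .with_rgba(&mut rgba)
        .expect("buffer is exactly w * h * 4 bytes");
    yuv444p_to(&src, true, ColorMatrix::Bt601, &mut sink).expect("conversion succeeds");

    // Yuv444p has no alpha plane, so every alpha byte is written as 0xFF.
    assert!(rgba.chunks(4).all(|px| px[3] == 0xFF));
}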