From 9bd491b96fffd8052c06ef074af35550b6fb99af Mon Sep 17 00:00:00 2001 From: uqio <276879906+uqio@users.noreply.github.com> Date: Sun, 26 Apr 2026 13:46:56 +1200 Subject: [PATCH 1/5] update --- src/row/arch/neon.rs | 82 +++++++++- src/row/arch/wasm_simd128.rs | 145 +++++++++++++++++- src/row/arch/x86_avx2.rs | 104 ++++++++++++- src/row/arch/x86_avx512.rs | 113 +++++++++++++- src/row/arch/x86_common.rs | 86 +++++++++++ src/row/arch/x86_sse41.rs | 82 +++++++++- src/row/mod.rs | 103 +++++++++++++ src/row/scalar.rs | 71 ++++++++- src/sinker/mixed.rs | 279 ++++++++++++++++++++++++++++++++++- 9 files changed, 1023 insertions(+), 42 deletions(-) diff --git a/src/row/arch/neon.rs b/src/row/arch/neon.rs index 8d40e61..bfc52f6 100644 --- a/src/row/arch/neon.rs +++ b/src/row/arch/neon.rs @@ -69,12 +69,71 @@ pub(crate) unsafe fn yuv_420_to_rgb_row( width: usize, matrix: ColorMatrix, full_range: bool, +) { + // SAFETY: caller-checked NEON availability + slice bounds — see + // [`yuv_420_to_rgb_or_rgba_row`] safety contract. + unsafe { + yuv_420_to_rgb_or_rgba_row::(y, u_half, v_half, rgb_out, width, matrix, full_range); + } +} + +/// NEON YUV 4:2:0 → packed **RGBA** (8-bit). Same contract as +/// [`yuv_420_to_rgb_row`] but writes 4 bytes per pixel (R, G, B, +/// `0xFF`). +/// +/// # Safety +/// +/// 1. NEON must be available on the current CPU. +/// 2. `width & 1 == 0`. +/// 3. `y.len() >= width`, `u_half.len() >= width / 2`, +/// `v_half.len() >= width / 2`, `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "neon")] +pub(crate) unsafe fn yuv_420_to_rgba_row( + y: &[u8], + u_half: &[u8], + v_half: &[u8], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller-checked NEON availability + slice bounds — see + // [`yuv_420_to_rgb_or_rgba_row`] safety contract. + unsafe { + yuv_420_to_rgb_or_rgba_row::(y, u_half, v_half, rgba_out, width, matrix, full_range); + } +} + +/// Shared NEON kernel for [`yuv_420_to_rgb_row`] (`ALPHA = false`, +/// `vst3q_u8`) and [`yuv_420_to_rgba_row`] (`ALPHA = true`, +/// `vst4q_u8` with constant `0xFF` alpha). Math is byte-identical to +/// `scalar::yuv_420_to_rgb_or_rgba_row::`; only the per-block +/// store intrinsic differs. `const` generic monomorphizes per call +/// site, so the `if ALPHA` branches are eliminated. +/// +/// # Safety +/// +/// Same as [`yuv_420_to_rgb_row`] / [`yuv_420_to_rgba_row`]; the +/// `out` slice must be `>= width * (if ALPHA { 4 } else { 3 })` +/// bytes long. +#[inline] +#[target_feature(enable = "neon")] +pub(crate) unsafe fn yuv_420_to_rgb_or_rgba_row( + y: &[u8], + u_half: &[u8], + v_half: &[u8], + out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, ) { debug_assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); debug_assert!(y.len() >= width); debug_assert!(u_half.len() >= width / 2); debug_assert!(v_half.len() >= width / 2); - debug_assert!(rgb_out.len() >= width * 3); + let bpp: usize = if ALPHA { 4 } else { 3 }; + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params(full_range); @@ -97,6 +156,10 @@ pub(crate) unsafe fn yuv_420_to_rgb_row( let cgv = vdupq_n_s32(coeffs.g_v()); let cbu = vdupq_n_s32(coeffs.b_u()); let cbv = vdupq_n_s32(coeffs.b_v()); + // Constant opaque-alpha vector for the RGBA path. Materializing + // it outside the loop costs one `vdupq_n_u8` regardless of + // ALPHA; the compiler DCE's it when ALPHA = false. 
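+    // (vdupq_n_u8 splats one byte across all 16 lanes; when ALPHA is
+    // true, the store below passes this register through unchanged as
+    // the fourth channel of the uint8x16x4_t handed to vst4q_u8.)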
+ let alpha_u8 = vdupq_n_u8(0xFF); let mut x = 0usize; while x + 16 <= width { @@ -158,9 +221,16 @@ pub(crate) unsafe fn yuv_420_to_rgb_row( vqmovun_s16(vqaddq_s16(y_scaled_hi, r_dup_hi)), ); - // vst3q_u8 writes 48 bytes as interleaved R, G, B triples. - let rgb = uint8x16x3_t(r_u8, g_u8, b_u8); - vst3q_u8(rgb_out.as_mut_ptr().add(x * 3), rgb); + if ALPHA { + // vst4q_u8 writes 64 bytes as interleaved R, G, B, A + // quadruplets — native AArch64 4-channel store. + let rgba = uint8x16x4_t(r_u8, g_u8, b_u8, alpha_u8); + vst4q_u8(out.as_mut_ptr().add(x * 4), rgba); + } else { + // vst3q_u8 writes 48 bytes as interleaved R, G, B triples. + let rgb = uint8x16x3_t(r_u8, g_u8, b_u8); + vst3q_u8(out.as_mut_ptr().add(x * 3), rgb); + } x += 16; } @@ -168,11 +238,11 @@ pub(crate) unsafe fn yuv_420_to_rgb_row( // Scalar tail for the 0..14 leftover pixels (always even, 4:2:0 // requires even width so x/2 and width/2 are well‑defined). if x < width { - scalar::yuv_420_to_rgb_row( + scalar::yuv_420_to_rgb_or_rgba_row::( &y[x..width], &u_half[x / 2..width / 2], &v_half[x / 2..width / 2], - &mut rgb_out[x * 3..width * 3], + &mut out[x * bpp..width * bpp], width - x, matrix, full_range, diff --git a/src/row/arch/wasm_simd128.rs b/src/row/arch/wasm_simd128.rs index b13a9a2..26b8363 100644 --- a/src/row/arch/wasm_simd128.rs +++ b/src/row/arch/wasm_simd128.rs @@ -73,12 +73,70 @@ pub(crate) unsafe fn yuv_420_to_rgb_row( width: usize, matrix: ColorMatrix, full_range: bool, +) { + // SAFETY: caller-checked simd128 availability + slice bounds — + // see [`yuv_420_to_rgb_or_rgba_row`] safety contract. + unsafe { + yuv_420_to_rgb_or_rgba_row::(y, u_half, v_half, rgb_out, width, matrix, full_range); + } +} + +/// WASM simd128 YUV 4:2:0 → packed **RGBA** (8-bit). Same contract +/// as [`yuv_420_to_rgb_row`] but writes 4 bytes per pixel (R, G, B, +/// `0xFF`). +/// +/// # Safety +/// +/// 1. simd128 must be enabled at compile time. +/// 2. `width & 1 == 0`. +/// 3. `y.len() >= width`, `u_half.len() >= width / 2`, +/// `v_half.len() >= width / 2`, `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "simd128")] +pub(crate) unsafe fn yuv_420_to_rgba_row( + y: &[u8], + u_half: &[u8], + v_half: &[u8], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller-checked simd128 availability + slice bounds — + // see [`yuv_420_to_rgb_or_rgba_row`] safety contract. + unsafe { + yuv_420_to_rgb_or_rgba_row::(y, u_half, v_half, rgba_out, width, matrix, full_range); + } +} + +/// Shared WASM simd128 kernel for [`yuv_420_to_rgb_row`] +/// (`ALPHA = false`, [`write_rgb_16`]) and [`yuv_420_to_rgba_row`] +/// (`ALPHA = true`, [`write_rgba_16`] with constant `0xFF` alpha). +/// Math is byte-identical to +/// `scalar::yuv_420_to_rgb_or_rgba_row::`. +/// +/// # Safety +/// +/// Same as [`yuv_420_to_rgb_row`] / [`yuv_420_to_rgba_row`]; the +/// `out` slice must be `>= width * (if ALPHA { 4 } else { 3 })` +/// bytes long. 
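Since every backend repeats this shape, the const-generic pattern is worth seeing in isolation. A minimal, dependency-free sketch with illustrative names, not code from this patch:

```rust
// One kernel, two zero-cost wrappers: `ALPHA` is a compile-time
// constant, so each turbofish instantiation is monomorphized
// separately and the `if ALPHA` test is resolved before codegen.
fn store_pixel<const ALPHA: bool>(out: &mut [u8], i: usize, r: u8, g: u8, b: u8) {
    let bpp = if ALPHA { 4 } else { 3 };
    out[i * bpp] = r;
    out[i * bpp + 1] = g;
    out[i * bpp + 2] = b;
    if ALPHA {
        out[i * bpp + 3] = 0xFF; // constant opaque alpha
    }
}

fn main() {
    let (mut rgb, mut rgba) = ([0u8; 3], [0u8; 4]);
    store_pixel::<false>(&mut rgb, 0, 1, 2, 3);
    store_pixel::<true>(&mut rgba, 0, 1, 2, 3);
    assert_eq!(rgb, [1, 2, 3]);
    assert_eq!(rgba, [1, 2, 3, 0xFF]);
}
```

This is why the NEON and wasm kernels can keep one shared loop yet emit either the 3-channel or the 4-channel store with no per-pixel branch.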
+#[inline] +#[target_feature(enable = "simd128")] +pub(crate) unsafe fn yuv_420_to_rgb_or_rgba_row( + y: &[u8], + u_half: &[u8], + v_half: &[u8], + out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, ) { debug_assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); debug_assert!(y.len() >= width); debug_assert!(u_half.len() >= width / 2); debug_assert!(v_half.len() >= width / 2); - debug_assert!(rgb_out.len() >= width * 3); + let bpp: usize = if ALPHA { 4 } else { 3 }; + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params(full_range); @@ -100,6 +158,9 @@ pub(crate) unsafe fn yuv_420_to_rgb_row( let cgv = i32x4_splat(coeffs.g_v()); let cbu = i32x4_splat(coeffs.b_u()); let cbv = i32x4_splat(coeffs.b_v()); + // Constant opaque-alpha vector for the RGBA path; DCE'd when + // ALPHA = false. + let alpha_u8 = u8x16_splat(0xFF); let mut x = 0usize; while x + 16 <= width { @@ -160,19 +221,24 @@ pub(crate) unsafe fn yuv_420_to_rgb_row( let g_u8 = u8x16_narrow_i16x8(g_lo, g_hi); let r_u8 = u8x16_narrow_i16x8(r_lo, r_hi); - // 3‑way interleave → packed RGB (48 bytes). - write_rgb_16(r_u8, g_u8, b_u8, rgb_out.as_mut_ptr().add(x * 3)); + if ALPHA { + // 4‑way interleave → packed RGBA (64 bytes). + write_rgba_16(r_u8, g_u8, b_u8, alpha_u8, out.as_mut_ptr().add(x * 4)); + } else { + // 3‑way interleave → packed RGB (48 bytes). + write_rgb_16(r_u8, g_u8, b_u8, out.as_mut_ptr().add(x * 3)); + } x += 16; } // Scalar tail for the 0..14 leftover pixels. if x < width { - scalar::yuv_420_to_rgb_row( + scalar::yuv_420_to_rgb_or_rgba_row::( &y[x..width], &u_half[x / 2..width / 2], &v_half[x / 2..width / 2], - &mut rgb_out[x * 3..width * 3], + &mut out[x * bpp..width * bpp], width - x, matrix, full_range, @@ -1958,6 +2024,75 @@ unsafe fn write_rgb_16(r: v128, g: v128, b: v128, ptr: *mut u8) { } } +/// Writes 16 pixels of packed RGBA (64 bytes) from four u8x16 channel +/// vectors. Mirror of [`write_rgb_16`] for the 4-channel output path. +/// +/// The 4-byte stride aligns cleanly with the 16-byte register width: +/// each output block holds exactly 4 RGBA quads (16 bytes), with R, +/// G, B, A interleaved at positions `(0, 1, 2, 3)`, `(4, 5, 6, 7)`, +/// etc. `u8x16_swizzle` indices ≥ 16 zero the lane. +/// +/// # Safety +/// +/// `ptr` must point to at least 64 writable bytes. +#[inline(always)] +unsafe fn write_rgba_16(r: v128, g: v128, b: v128, a: v128, ptr: *mut u8) { + unsafe { + // Block 0 (bytes 0..16): pixels 0..3, source bytes 0..3. + let r0 = i8x16(0, -1, -1, -1, 1, -1, -1, -1, 2, -1, -1, -1, 3, -1, -1, -1); + let g0 = i8x16(-1, 0, -1, -1, -1, 1, -1, -1, -1, 2, -1, -1, -1, 3, -1, -1); + let b0 = i8x16(-1, -1, 0, -1, -1, -1, 1, -1, -1, -1, 2, -1, -1, -1, 3, -1); + let a0 = i8x16(-1, -1, -1, 0, -1, -1, -1, 1, -1, -1, -1, 2, -1, -1, -1, 3); + let out0 = v128_or( + v128_or(u8x16_swizzle(r, r0), u8x16_swizzle(g, g0)), + v128_or(u8x16_swizzle(b, b0), u8x16_swizzle(a, a0)), + ); + + // Block 1 (bytes 16..32): pixels 4..7, source bytes 4..7. 
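+        // (Same rule for every block: block k holds pixels 4k..4k+3;
+        // channel c of pixel p gathers source byte p and lands at
+        // output byte 16*k + 4*(p - 4k) + c. After the u8 cast the -1
+        // entries become indices >= 16, so those lanes stay zero.)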
+ let r1 = i8x16(4, -1, -1, -1, 5, -1, -1, -1, 6, -1, -1, -1, 7, -1, -1, -1); + let g1 = i8x16(-1, 4, -1, -1, -1, 5, -1, -1, -1, 6, -1, -1, -1, 7, -1, -1); + let b1 = i8x16(-1, -1, 4, -1, -1, -1, 5, -1, -1, -1, 6, -1, -1, -1, 7, -1); + let a1 = i8x16(-1, -1, -1, 4, -1, -1, -1, 5, -1, -1, -1, 6, -1, -1, -1, 7); + let out1 = v128_or( + v128_or(u8x16_swizzle(r, r1), u8x16_swizzle(g, g1)), + v128_or(u8x16_swizzle(b, b1), u8x16_swizzle(a, a1)), + ); + + // Block 2 (bytes 32..48): pixels 8..11, source bytes 8..11. + let r2 = i8x16(8, -1, -1, -1, 9, -1, -1, -1, 10, -1, -1, -1, 11, -1, -1, -1); + let g2 = i8x16(-1, 8, -1, -1, -1, 9, -1, -1, -1, 10, -1, -1, -1, 11, -1, -1); + let b2 = i8x16(-1, -1, 8, -1, -1, -1, 9, -1, -1, -1, 10, -1, -1, -1, 11, -1); + let a2 = i8x16(-1, -1, -1, 8, -1, -1, -1, 9, -1, -1, -1, 10, -1, -1, -1, 11); + let out2 = v128_or( + v128_or(u8x16_swizzle(r, r2), u8x16_swizzle(g, g2)), + v128_or(u8x16_swizzle(b, b2), u8x16_swizzle(a, a2)), + ); + + // Block 3 (bytes 48..64): pixels 12..15, source bytes 12..15. + let r3 = i8x16( + 12, -1, -1, -1, 13, -1, -1, -1, 14, -1, -1, -1, 15, -1, -1, -1, + ); + let g3 = i8x16( + -1, 12, -1, -1, -1, 13, -1, -1, -1, 14, -1, -1, -1, 15, -1, -1, + ); + let b3 = i8x16( + -1, -1, 12, -1, -1, -1, 13, -1, -1, -1, 14, -1, -1, -1, 15, -1, + ); + let a3 = i8x16( + -1, -1, -1, 12, -1, -1, -1, 13, -1, -1, -1, 14, -1, -1, -1, 15, + ); + let out3 = v128_or( + v128_or(u8x16_swizzle(r, r3), u8x16_swizzle(g, g3)), + v128_or(u8x16_swizzle(b, b3), u8x16_swizzle(a, a3)), + ); + + v128_store(ptr.cast(), out0); + v128_store(ptr.add(16).cast(), out1); + v128_store(ptr.add(32).cast(), out2); + v128_store(ptr.add(48).cast(), out3); + } +} + // ===== 16-bit YUV → RGB ================================================== /// `(Y_u16x8 - y_off) * y_scale + RND >> 15` for full u16 Y samples. diff --git a/src/row/arch/x86_avx2.rs b/src/row/arch/x86_avx2.rs index 87c65a3..dd384cb 100644 --- a/src/row/arch/x86_avx2.rs +++ b/src/row/arch/x86_avx2.rs @@ -43,7 +43,9 @@ use core::arch::x86_64::*; use crate::{ ColorMatrix, row::{ - arch::x86_common::{rgb_to_hsv_16_pixels, swap_rb_16_pixels, write_rgb_16, write_rgb_u16_8}, + arch::x86_common::{ + rgb_to_hsv_16_pixels, swap_rb_16_pixels, write_rgb_16, write_rgb_u16_8, write_rgba_16, + }, scalar, }, }; @@ -82,12 +84,69 @@ pub(crate) unsafe fn yuv_420_to_rgb_row( width: usize, matrix: ColorMatrix, full_range: bool, +) { + // SAFETY: caller-checked AVX2 availability + slice bounds — see + // [`yuv_420_to_rgb_or_rgba_row`] safety contract. + unsafe { + yuv_420_to_rgb_or_rgba_row::(y, u_half, v_half, rgb_out, width, matrix, full_range); + } +} + +/// AVX2 YUV 4:2:0 → packed **RGBA** (8-bit). Same contract as +/// [`yuv_420_to_rgb_row`] but writes 4 bytes per pixel (R, G, B, +/// `0xFF`). +/// +/// # Safety +/// +/// 1. AVX2 must be available on the current CPU. +/// 2. `width & 1 == 0`. +/// 3. `y.len() >= width`, `u_half.len() >= width / 2`, +/// `v_half.len() >= width / 2`, `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "avx2")] +pub(crate) unsafe fn yuv_420_to_rgba_row( + y: &[u8], + u_half: &[u8], + v_half: &[u8], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller-checked AVX2 availability + slice bounds — see + // [`yuv_420_to_rgb_or_rgba_row`] safety contract. 
+    unsafe {
+        yuv_420_to_rgb_or_rgba_row::<true>(y, u_half, v_half, rgba_out, width, matrix, full_range);
+    }
+}
+
+/// Shared AVX2 kernel for [`yuv_420_to_rgb_row`] (`ALPHA = false`,
+/// [`write_rgb_32`]) and [`yuv_420_to_rgba_row`] (`ALPHA = true`,
+/// [`write_rgba_32`] with constant `0xFF` alpha). Math is
+/// byte-identical to `scalar::yuv_420_to_rgb_or_rgba_row::<ALPHA>`.
+///
+/// # Safety
+///
+/// Same as [`yuv_420_to_rgb_row`] / [`yuv_420_to_rgba_row`]; the
+/// `out` slice must be `>= width * (if ALPHA { 4 } else { 3 })`
+/// bytes long.
+#[inline]
+#[target_feature(enable = "avx2")]
+pub(crate) unsafe fn yuv_420_to_rgb_or_rgba_row<const ALPHA: bool>(
+    y: &[u8],
+    u_half: &[u8],
+    v_half: &[u8],
+    out: &mut [u8],
+    width: usize,
+    matrix: ColorMatrix,
+    full_range: bool,
 ) {
     debug_assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width");
     debug_assert!(y.len() >= width);
     debug_assert!(u_half.len() >= width / 2);
     debug_assert!(v_half.len() >= width / 2);
-    debug_assert!(rgb_out.len() >= width * 3);
+    let bpp: usize = if ALPHA { 4 } else { 3 };
+    debug_assert!(out.len() >= width * bpp);
 
     let coeffs = scalar::Coefficients::for_matrix(matrix);
     let (y_off, y_scale, c_scale) = scalar::range_params(full_range);
@@ -109,6 +168,9 @@ pub(crate) unsafe fn yuv_420_to_rgb_row(
     let cgv = _mm256_set1_epi32(coeffs.g_v());
     let cbu = _mm256_set1_epi32(coeffs.b_u());
     let cbv = _mm256_set1_epi32(coeffs.b_v());
+    // Constant opaque-alpha vector for the RGBA path; DCE'd when
+    // ALPHA = false.
+    let alpha_u8 = _mm256_set1_epi8(-1); // 0xFF as i8
 
     let mut x = 0usize;
     while x + 32 <= width {
@@ -178,8 +240,13 @@ pub(crate) unsafe fn yuv_420_to_rgb_row(
         let g_u8 = narrow_u8x32(g_lo, g_hi);
         let r_u8 = narrow_u8x32(r_lo, r_hi);
 
-        // 3‑way interleave → packed RGB (96 bytes = 3 × 32).
-        write_rgb_32(r_u8, g_u8, b_u8, rgb_out.as_mut_ptr().add(x * 3));
+        if ALPHA {
+            // 4‑way interleave → packed RGBA (128 bytes = 4 × 32).
+            write_rgba_32(r_u8, g_u8, b_u8, alpha_u8, out.as_mut_ptr().add(x * 4));
+        } else {
+            // 3‑way interleave → packed RGB (96 bytes = 3 × 32).
+            write_rgb_32(r_u8, g_u8, b_u8, out.as_mut_ptr().add(x * 3));
+        }
 
         x += 32;
     }
@@ -187,11 +254,11 @@ pub(crate) unsafe fn yuv_420_to_rgb_row(
     // Scalar tail for the 0..30 leftover pixels (always even; 4:2:0
     // requires even width so x/2 and width/2 are well‑defined).
     if x < width {
-        scalar::yuv_420_to_rgb_row(
+        scalar::yuv_420_to_rgb_or_rgba_row::<ALPHA>(
             &y[x..width],
             &u_half[x / 2..width / 2],
             &v_half[x / 2..width / 2],
-            &mut rgb_out[x * 3..width * 3],
+            &mut out[x * bpp..width * bpp],
             width - x,
             matrix,
             full_range,
@@ -2162,6 +2229,31 @@ unsafe fn write_rgb_32(r: __m256i, g: __m256i, b: __m256i, ptr: *mut u8) {
     }
 }
 
+/// Writes 32 pixels of packed RGBA (128 bytes) by interleaving four
+/// u8x32 R/G/B/A channel vectors. Processed as two 16‑pixel halves
+/// via the shared
+/// [`write_rgba_16`](super::x86_common::write_rgba_16) helper.
+///
+/// # Safety
+///
+/// `ptr` must point to at least 128 writable bytes.
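Before the body, a portable model of the zeroing-shuffle-and-OR composition this helper inherits from `write_rgba_16`; a sketch for illustration only, where `swizzle` stands in for `_mm_shuffle_epi8` / `u8x16_swizzle` and only block 0 is checked:

```rust
/// Stand-in for `_mm_shuffle_epi8`: a negative index (top bit set)
/// zeroes the output lane instead of gathering a source byte.
fn swizzle(src: &[u8; 16], idx: &[i8; 16]) -> [u8; 16] {
    let mut out = [0u8; 16];
    for (o, &i) in out.iter_mut().zip(idx) {
        if i >= 0 {
            *o = src[i as usize];
        }
    }
    out
}

fn main() {
    let r: [u8; 16] = core::array::from_fn(|i| i as u8); // R0..R15
    let g: [u8; 16] = core::array::from_fn(|i| 0x40 + i as u8);
    let b: [u8; 16] = core::array::from_fn(|i| 0x80 + i as u8);
    let a = [0xFFu8; 16];
    // Block-0 masks, same values as the write_rgba_16 tables.
    let r0: [i8; 16] = [0, -1, -1, -1, 1, -1, -1, -1, 2, -1, -1, -1, 3, -1, -1, -1];
    let g0: [i8; 16] = [-1, 0, -1, -1, -1, 1, -1, -1, -1, 2, -1, -1, -1, 3, -1, -1];
    let b0: [i8; 16] = [-1, -1, 0, -1, -1, -1, 1, -1, -1, -1, 2, -1, -1, -1, 3, -1];
    let a0: [i8; 16] = [-1, -1, -1, 0, -1, -1, -1, 1, -1, -1, -1, 2, -1, -1, -1, 3];
    // Each output lane is claimed by exactly one mask, so the three
    // ORs compose the four gathers losslessly.
    let mut block0 = [0u8; 16];
    for i in 0..16 {
        block0[i] = swizzle(&r, &r0)[i]
            | swizzle(&g, &g0)[i]
            | swizzle(&b, &b0)[i]
            | swizzle(&a, &a0)[i];
    }
    assert_eq!(&block0[..8], &[0x00, 0x40, 0x80, 0xFF, 0x01, 0x41, 0x81, 0xFF]);
}
```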
+#[inline(always)] +unsafe fn write_rgba_32(r: __m256i, g: __m256i, b: __m256i, a: __m256i, ptr: *mut u8) { + unsafe { + let r_lo = _mm256_castsi256_si128(r); + let r_hi = _mm256_extracti128_si256::<1>(r); + let g_lo = _mm256_castsi256_si128(g); + let g_hi = _mm256_extracti128_si256::<1>(g); + let b_lo = _mm256_castsi256_si128(b); + let b_hi = _mm256_extracti128_si256::<1>(b); + let a_lo = _mm256_castsi256_si128(a); + let a_hi = _mm256_extracti128_si256::<1>(a); + + write_rgba_16(r_lo, g_lo, b_lo, a_lo, ptr); + write_rgba_16(r_hi, g_hi, b_hi, a_hi, ptr.add(64)); + } +} + // ===== 16-bit YUV → RGB ================================================== /// `(Y_u16x16 - y_off) * y_scale + RND >> 15` for full u16 Y samples. diff --git a/src/row/arch/x86_avx512.rs b/src/row/arch/x86_avx512.rs index 1b8fae1..3e01120 100644 --- a/src/row/arch/x86_avx512.rs +++ b/src/row/arch/x86_avx512.rs @@ -57,7 +57,9 @@ use core::arch::x86_64::*; use crate::{ ColorMatrix, row::{ - arch::x86_common::{rgb_to_hsv_16_pixels, swap_rb_16_pixels, write_rgb_16, write_rgb_u16_8}, + arch::x86_common::{ + rgb_to_hsv_16_pixels, swap_rb_16_pixels, write_rgb_16, write_rgb_u16_8, write_rgba_16, + }, scalar, }, }; @@ -96,12 +98,69 @@ pub(crate) unsafe fn yuv_420_to_rgb_row( width: usize, matrix: ColorMatrix, full_range: bool, +) { + // SAFETY: caller-checked AVX-512BW availability + slice bounds — + // see [`yuv_420_to_rgb_or_rgba_row`] safety contract. + unsafe { + yuv_420_to_rgb_or_rgba_row::(y, u_half, v_half, rgb_out, width, matrix, full_range); + } +} + +/// AVX‑512 YUV 4:2:0 → packed **RGBA** (8-bit). Same contract as +/// [`yuv_420_to_rgb_row`] but writes 4 bytes per pixel (R, G, B, +/// `0xFF`). +/// +/// # Safety +/// +/// 1. AVX‑512F + AVX‑512BW must be available on the current CPU. +/// 2. `width & 1 == 0`. +/// 3. `y.len() >= width`, `u_half.len() >= width / 2`, +/// `v_half.len() >= width / 2`, `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "avx512f,avx512bw")] +pub(crate) unsafe fn yuv_420_to_rgba_row( + y: &[u8], + u_half: &[u8], + v_half: &[u8], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller-checked AVX-512BW availability + slice bounds — + // see [`yuv_420_to_rgb_or_rgba_row`] safety contract. + unsafe { + yuv_420_to_rgb_or_rgba_row::(y, u_half, v_half, rgba_out, width, matrix, full_range); + } +} + +/// Shared AVX‑512 kernel for [`yuv_420_to_rgb_row`] (`ALPHA = false`, +/// [`write_rgb_64`]) and [`yuv_420_to_rgba_row`] (`ALPHA = true`, +/// [`write_rgba_64`] with constant `0xFF` alpha). Math is +/// byte-identical to `scalar::yuv_420_to_rgb_or_rgba_row::`. +/// +/// # Safety +/// +/// Same as [`yuv_420_to_rgb_row`] / [`yuv_420_to_rgba_row`]; the +/// `out` slice must be `>= width * (if ALPHA { 4 } else { 3 })` +/// bytes long. 
+#[inline] +#[target_feature(enable = "avx512f,avx512bw")] +pub(crate) unsafe fn yuv_420_to_rgb_or_rgba_row( + y: &[u8], + u_half: &[u8], + v_half: &[u8], + out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, ) { debug_assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); debug_assert!(y.len() >= width); debug_assert!(u_half.len() >= width / 2); debug_assert!(v_half.len() >= width / 2); - debug_assert!(rgb_out.len() >= width * 3); + let bpp: usize = if ALPHA { 4 } else { 3 }; + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params(full_range); @@ -123,6 +182,9 @@ pub(crate) unsafe fn yuv_420_to_rgb_row( let cgv = _mm512_set1_epi32(coeffs.g_v()); let cbu = _mm512_set1_epi32(coeffs.b_u()); let cbv = _mm512_set1_epi32(coeffs.b_v()); + // Constant opaque-alpha vector for the RGBA path; DCE'd when + // ALPHA = false. + let alpha_u8 = _mm512_set1_epi8(-1); // 0xFF as i8 // Lane‑fixup permute indices, computed once per call. let pack_fixup = _mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7); @@ -193,8 +255,13 @@ pub(crate) unsafe fn yuv_420_to_rgb_row( let g_u8 = narrow_u8x64(g_lo, g_hi, pack_fixup); let r_u8 = narrow_u8x64(r_lo, r_hi, pack_fixup); - // 3‑way interleave → packed RGB (192 bytes = 4 × 48). - write_rgb_64(r_u8, g_u8, b_u8, rgb_out.as_mut_ptr().add(x * 3)); + if ALPHA { + // 4‑way interleave → packed RGBA (256 bytes = 4 × 64). + write_rgba_64(r_u8, g_u8, b_u8, alpha_u8, out.as_mut_ptr().add(x * 4)); + } else { + // 3‑way interleave → packed RGB (192 bytes = 4 × 48). + write_rgb_64(r_u8, g_u8, b_u8, out.as_mut_ptr().add(x * 3)); + } x += 64; } @@ -202,11 +269,11 @@ pub(crate) unsafe fn yuv_420_to_rgb_row( // Scalar tail for the 0..62 leftover pixels (always even; 4:2:0 // requires even width so x/2 and width/2 are well‑defined). if x < width { - scalar::yuv_420_to_rgb_row( + scalar::yuv_420_to_rgb_or_rgba_row::( &y[x..width], &u_half[x / 2..width / 2], &v_half[x / 2..width / 2], - &mut rgb_out[x * 3..width * 3], + &mut out[x * bpp..width * bpp], width - x, matrix, full_range, @@ -2230,6 +2297,40 @@ unsafe fn write_rgb_64(r: __m512i, g: __m512i, b: __m512i, ptr: *mut u8) { } } +/// Writes 64 pixels of packed RGBA (256 bytes) by splitting the +/// u8x64 channel vectors into four 128‑bit quarters and calling the +/// shared [`write_rgba_16`] helper four times. +/// +/// # Safety +/// +/// `ptr` must point to at least 256 writable bytes. 
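The quarter-split addressing is easy to cross-check in scalar Rust; a reference model (not the crate's code):

```rust
/// Reference model of the store pattern documented above: quarter `q`
/// of each 64-lane channel vector feeds the 64-byte output block at
/// offset 64 * q, with 16 pixels interleaved 4 bytes apart inside it.
fn write_rgba_64_model(r: &[u8; 64], g: &[u8; 64], b: &[u8; 64], a: &[u8; 64], out: &mut [u8; 256]) {
    for q in 0..4 {
        for p in 0..16 {
            let src = q * 16 + p; // lane in the 512-bit register
            let dst = q * 64 + p * 4; // byte offset in the output
            out[dst] = r[src];
            out[dst + 1] = g[src];
            out[dst + 2] = b[src];
            out[dst + 3] = a[src];
        }
    }
}

fn main() {
    let r: [u8; 64] = core::array::from_fn(|i| i as u8);
    let (g, b, a) = ([0u8; 64], [0u8; 64], [0xFFu8; 64]);
    let mut out = [0u8; 256];
    write_rgba_64_model(&r, &g, &b, &a, &mut out);
    // Pixel 17 lives in quarter 1, lane 1 → output bytes 68..72.
    assert_eq!(out[68..72], [17, 0, 0, 0xFF]);
}
```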
+#[inline(always)] +unsafe fn write_rgba_64(r: __m512i, g: __m512i, b: __m512i, a: __m512i, ptr: *mut u8) { + unsafe { + let r0: __m128i = _mm512_castsi512_si128(r); + let r1: __m128i = _mm512_extracti32x4_epi32::<1>(r); + let r2: __m128i = _mm512_extracti32x4_epi32::<2>(r); + let r3: __m128i = _mm512_extracti32x4_epi32::<3>(r); + let g0: __m128i = _mm512_castsi512_si128(g); + let g1: __m128i = _mm512_extracti32x4_epi32::<1>(g); + let g2: __m128i = _mm512_extracti32x4_epi32::<2>(g); + let g3: __m128i = _mm512_extracti32x4_epi32::<3>(g); + let b0: __m128i = _mm512_castsi512_si128(b); + let b1: __m128i = _mm512_extracti32x4_epi32::<1>(b); + let b2: __m128i = _mm512_extracti32x4_epi32::<2>(b); + let b3: __m128i = _mm512_extracti32x4_epi32::<3>(b); + let a0: __m128i = _mm512_castsi512_si128(a); + let a1: __m128i = _mm512_extracti32x4_epi32::<1>(a); + let a2: __m128i = _mm512_extracti32x4_epi32::<2>(a); + let a3: __m128i = _mm512_extracti32x4_epi32::<3>(a); + + write_rgba_16(r0, g0, b0, a0, ptr); + write_rgba_16(r1, g1, b1, a1, ptr.add(64)); + write_rgba_16(r2, g2, b2, a2, ptr.add(128)); + write_rgba_16(r3, g3, b3, a3, ptr.add(192)); + } +} + // ===== 16-bit u16-output helpers ======================================== /// `(c_u * u_d + c_v * v_d + RND) >> 15` in i64 via two diff --git a/src/row/arch/x86_common.rs b/src/row/arch/x86_common.rs index 2f8807b..7cf7aca 100644 --- a/src/row/arch/x86_common.rs +++ b/src/row/arch/x86_common.rs @@ -79,6 +79,92 @@ pub(super) unsafe fn write_rgb_16(r: __m128i, g: __m128i, b: __m128i, ptr: *mut } } +/// Writes 16 pixels of packed RGBA (64 bytes) from four u8x16 channel +/// vectors. Mirrors [`write_rgb_16`] for the 4-channel output path. +/// +/// The 4-byte stride aligns cleanly with the 16-byte register width: +/// each output block holds exactly 4 RGBA quads (16 bytes), with R, +/// G, B, A interleaved at positions `(0, 1, 2, 3)`, `(4, 5, 6, 7)`, +/// etc. The shuffle masks are simpler than the 3-channel pattern +/// because a single source byte goes to a single output byte (no +/// channel "split across blocks" boundary). +/// +/// Conceptually: +/// - Block 0 (bytes 0..16): R0,G0,B0,A0, R1,G1,B1,A1, R2,G2,B2,A2, +/// R3,G3,B3,A3 +/// - Block 1 (bytes 16..32): pixels 4..7 +/// - Block 2 (bytes 32..48): pixels 8..11 +/// - Block 3 (bytes 48..64): pixels 12..15 +/// +/// Each block is the OR of four `_mm_shuffle_epi8` gathers — one per +/// channel — with `0x80` (`-1` as i8) zeroing lanes that another +/// channel's shuffle will fill. +/// +/// # Safety +/// +/// - `ptr` must point to at least 64 writable bytes. +/// - The calling function must have SSSE3 available (via +/// `#[target_feature(enable = "ssse3")]` or a superset). +#[inline(always)] +pub(super) unsafe fn write_rgba_16(r: __m128i, g: __m128i, b: __m128i, a: __m128i, ptr: *mut u8) { + unsafe { + // Block 0 (bytes 0..16): pixels 0..3, source bytes 0..3 from + // each channel placed at output positions + // (0, 1, 2, 3) for pixel 0, (4, 5, 6, 7) for pixel 1, etc. 
+ let r0 = _mm_setr_epi8(0, -1, -1, -1, 1, -1, -1, -1, 2, -1, -1, -1, 3, -1, -1, -1); + let g0 = _mm_setr_epi8(-1, 0, -1, -1, -1, 1, -1, -1, -1, 2, -1, -1, -1, 3, -1, -1); + let b0 = _mm_setr_epi8(-1, -1, 0, -1, -1, -1, 1, -1, -1, -1, 2, -1, -1, -1, 3, -1); + let a0 = _mm_setr_epi8(-1, -1, -1, 0, -1, -1, -1, 1, -1, -1, -1, 2, -1, -1, -1, 3); + let out0 = _mm_or_si128( + _mm_or_si128(_mm_shuffle_epi8(r, r0), _mm_shuffle_epi8(g, g0)), + _mm_or_si128(_mm_shuffle_epi8(b, b0), _mm_shuffle_epi8(a, a0)), + ); + + // Block 1 (bytes 16..32): pixels 4..7, source bytes 4..7. + let r1 = _mm_setr_epi8(4, -1, -1, -1, 5, -1, -1, -1, 6, -1, -1, -1, 7, -1, -1, -1); + let g1 = _mm_setr_epi8(-1, 4, -1, -1, -1, 5, -1, -1, -1, 6, -1, -1, -1, 7, -1, -1); + let b1 = _mm_setr_epi8(-1, -1, 4, -1, -1, -1, 5, -1, -1, -1, 6, -1, -1, -1, 7, -1); + let a1 = _mm_setr_epi8(-1, -1, -1, 4, -1, -1, -1, 5, -1, -1, -1, 6, -1, -1, -1, 7); + let out1 = _mm_or_si128( + _mm_or_si128(_mm_shuffle_epi8(r, r1), _mm_shuffle_epi8(g, g1)), + _mm_or_si128(_mm_shuffle_epi8(b, b1), _mm_shuffle_epi8(a, a1)), + ); + + // Block 2 (bytes 32..48): pixels 8..11, source bytes 8..11. + let r2 = _mm_setr_epi8(8, -1, -1, -1, 9, -1, -1, -1, 10, -1, -1, -1, 11, -1, -1, -1); + let g2 = _mm_setr_epi8(-1, 8, -1, -1, -1, 9, -1, -1, -1, 10, -1, -1, -1, 11, -1, -1); + let b2 = _mm_setr_epi8(-1, -1, 8, -1, -1, -1, 9, -1, -1, -1, 10, -1, -1, -1, 11, -1); + let a2 = _mm_setr_epi8(-1, -1, -1, 8, -1, -1, -1, 9, -1, -1, -1, 10, -1, -1, -1, 11); + let out2 = _mm_or_si128( + _mm_or_si128(_mm_shuffle_epi8(r, r2), _mm_shuffle_epi8(g, g2)), + _mm_or_si128(_mm_shuffle_epi8(b, b2), _mm_shuffle_epi8(a, a2)), + ); + + // Block 3 (bytes 48..64): pixels 12..15, source bytes 12..15. + let r3 = _mm_setr_epi8( + 12, -1, -1, -1, 13, -1, -1, -1, 14, -1, -1, -1, 15, -1, -1, -1, + ); + let g3 = _mm_setr_epi8( + -1, 12, -1, -1, -1, 13, -1, -1, -1, 14, -1, -1, -1, 15, -1, -1, + ); + let b3 = _mm_setr_epi8( + -1, -1, 12, -1, -1, -1, 13, -1, -1, -1, 14, -1, -1, -1, 15, -1, + ); + let a3 = _mm_setr_epi8( + -1, -1, -1, 12, -1, -1, -1, 13, -1, -1, -1, 14, -1, -1, -1, 15, + ); + let out3 = _mm_or_si128( + _mm_or_si128(_mm_shuffle_epi8(r, r3), _mm_shuffle_epi8(g, g3)), + _mm_or_si128(_mm_shuffle_epi8(b, b3), _mm_shuffle_epi8(a, a3)), + ); + + _mm_storeu_si128(ptr.cast(), out0); + _mm_storeu_si128(ptr.add(16).cast(), out1); + _mm_storeu_si128(ptr.add(32).cast(), out2); + _mm_storeu_si128(ptr.add(48).cast(), out3); + } +} + /// Writes 8 pixels of packed **`u16`** RGB (48 bytes = 24 `u16`) /// from three `u16x8` channel vectors. Drives the SSE4.1 / AVX2 / /// AVX‑512 high‑bit‑depth kernels' u16 output path. diff --git a/src/row/arch/x86_sse41.rs b/src/row/arch/x86_sse41.rs index 73efdb5..6d79ad8 100644 --- a/src/row/arch/x86_sse41.rs +++ b/src/row/arch/x86_sse41.rs @@ -40,7 +40,9 @@ use core::arch::x86_64::*; use crate::{ ColorMatrix, row::{ - arch::x86_common::{rgb_to_hsv_16_pixels, swap_rb_16_pixels, write_rgb_16, write_rgb_u16_8}, + arch::x86_common::{ + rgb_to_hsv_16_pixels, swap_rb_16_pixels, write_rgb_16, write_rgb_u16_8, write_rgba_16, + }, scalar, }, }; @@ -79,12 +81,72 @@ pub(crate) unsafe fn yuv_420_to_rgb_row( width: usize, matrix: ColorMatrix, full_range: bool, +) { + // SAFETY: caller-checked SSE4.1 availability + slice bounds — see + // [`yuv_420_to_rgb_or_rgba_row`] safety contract. + unsafe { + yuv_420_to_rgb_or_rgba_row::(y, u_half, v_half, rgb_out, width, matrix, full_range); + } +} + +/// SSE4.1 YUV 4:2:0 → packed **RGBA** (8-bit). 
Same contract as +/// [`yuv_420_to_rgb_row`] but writes 4 bytes per pixel (R, G, B, +/// `0xFF`). +/// +/// # Safety +/// +/// 1. SSE4.1 must be available on the current CPU. +/// 2. `width & 1 == 0`. +/// 3. `y.len() >= width`, `u_half.len() >= width / 2`, +/// `v_half.len() >= width / 2`, `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "sse4.1")] +pub(crate) unsafe fn yuv_420_to_rgba_row( + y: &[u8], + u_half: &[u8], + v_half: &[u8], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller-checked SSE4.1 availability + slice bounds — see + // [`yuv_420_to_rgb_or_rgba_row`] safety contract. + unsafe { + yuv_420_to_rgb_or_rgba_row::(y, u_half, v_half, rgba_out, width, matrix, full_range); + } +} + +/// Shared SSE4.1 kernel for [`yuv_420_to_rgb_row`] (`ALPHA = false`, +/// [`write_rgb_16`]) and [`yuv_420_to_rgba_row`] (`ALPHA = true`, +/// [`write_rgba_16`] with constant `0xFF` alpha). Math is +/// byte-identical to `scalar::yuv_420_to_rgb_or_rgba_row::`; +/// only the per-block store helper differs. `const` generic +/// monomorphizes per call site, so the `if ALPHA` branches are +/// eliminated. +/// +/// # Safety +/// +/// Same as [`yuv_420_to_rgb_row`] / [`yuv_420_to_rgba_row`]; the +/// `out` slice must be `>= width * (if ALPHA { 4 } else { 3 })` +/// bytes long. +#[inline] +#[target_feature(enable = "sse4.1")] +pub(crate) unsafe fn yuv_420_to_rgb_or_rgba_row( + y: &[u8], + u_half: &[u8], + v_half: &[u8], + out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, ) { debug_assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); debug_assert!(y.len() >= width); debug_assert!(u_half.len() >= width / 2); debug_assert!(v_half.len() >= width / 2); - debug_assert!(rgb_out.len() >= width * 3); + let bpp: usize = if ALPHA { 4 } else { 3 }; + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params(full_range); @@ -106,6 +168,9 @@ pub(crate) unsafe fn yuv_420_to_rgb_row( let cgv = _mm_set1_epi32(coeffs.g_v()); let cbu = _mm_set1_epi32(coeffs.b_u()); let cbv = _mm_set1_epi32(coeffs.b_v()); + // Constant opaque-alpha vector for the RGBA path; DCE'd when + // ALPHA = false. + let alpha_u8 = _mm_set1_epi8(-1); // 0xFF as i8 let mut x = 0usize; while x + 16 <= width { @@ -166,19 +231,24 @@ pub(crate) unsafe fn yuv_420_to_rgb_row( let g_u8 = _mm_packus_epi16(g_lo, g_hi); let r_u8 = _mm_packus_epi16(r_lo, r_hi); - // 3‑way interleave → packed RGB (48 bytes). - write_rgb_16(r_u8, g_u8, b_u8, rgb_out.as_mut_ptr().add(x * 3)); + if ALPHA { + // 4‑way interleave → packed RGBA (64 bytes). + write_rgba_16(r_u8, g_u8, b_u8, alpha_u8, out.as_mut_ptr().add(x * 4)); + } else { + // 3‑way interleave → packed RGB (48 bytes). + write_rgb_16(r_u8, g_u8, b_u8, out.as_mut_ptr().add(x * 3)); + } x += 16; } // Scalar tail for the 0..14 leftover pixels. if x < width { - scalar::yuv_420_to_rgb_row( + scalar::yuv_420_to_rgb_or_rgba_row::( &y[x..width], &u_half[x / 2..width / 2], &v_half[x / 2..width / 2], - &mut rgb_out[x * 3..width * 3], + &mut out[x * bpp..width * bpp], width - x, matrix, full_range, diff --git a/src/row/mod.rs b/src/row/mod.rs index 35b15d5..2523eda 100644 --- a/src/row/mod.rs +++ b/src/row/mod.rs @@ -152,6 +152,98 @@ pub fn yuv_420_to_rgb_row( scalar::yuv_420_to_rgb_row(y, u_half, v_half, rgb_out, width, matrix, full_range); } +/// Converts one row of 4:2:0 YUV to packed **RGBA** (8-bit). 
+/// +/// Same numerical contract as [`yuv_420_to_rgb_row`]; the only +/// differences are the per-pixel stride (4 vs 3) and the alpha byte +/// (`0xFF`, opaque, for every pixel — sources without an alpha plane +/// produce opaque output). The first three bytes per pixel are +/// byte-identical to what [`yuv_420_to_rgb_row`] would write. +/// +/// `rgba_out.len() >= 4 * width`. `use_simd = false` forces the +/// scalar reference path. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv_420_to_rgba_row( + y: &[u8], + u_half: &[u8], + v_half: &[u8], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + // Runtime asserts at the dispatcher boundary — see + // [`yuv_420_to_rgb_row`] for rationale, including the checked + // `width × 4` multiplication via [`rgba_row_bytes`]. + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgba_min = rgba_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: `neon_available()` verified NEON is present. + unsafe { + arch::neon::yuv_420_to_rgba_row(y, u_half, v_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: `avx512_available()` verified AVX‑512BW is present. + unsafe { + arch::x86_avx512::yuv_420_to_rgba_row( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + // SAFETY: `avx2_available()` verified AVX2 is present. + unsafe { + arch::x86_avx2::yuv_420_to_rgba_row( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + // SAFETY: `sse41_available()` verified SSE4.1 is present. + unsafe { + arch::x86_sse41::yuv_420_to_rgba_row( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time availability verified. + unsafe { + arch::wasm_simd128::yuv_420_to_rgba_row( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => { + // Targets without a SIMD backend fall through to scalar. + } + } + } + + scalar::yuv_420_to_rgba_row(y, u_half, v_half, rgba_out, width, matrix, full_range); +} + /// Converts one row of NV12 (semi‑planar 4:2:0) to packed RGB. /// /// Same numerical contract as [`yuv_420_to_rgb_row`]; the only @@ -2719,6 +2811,17 @@ fn rgb_row_bytes(width: usize) -> usize { } } +/// Byte length of one packed‑RGBA row (`width × 4`) with overflow +/// checking. Same purpose as [`rgb_row_bytes`] for the 4-channel +/// path used by the RGBA dispatchers. +#[cfg_attr(not(tarpaulin), inline(always))] +fn rgba_row_bytes(width: usize) -> usize { + match width.checked_mul(4) { + Some(n) => n, + None => panic!("width ({width}) × 4 overflows usize"), + } +} + /// Element count of one packed `u16`‑RGB row (`width × 3`). Identical /// math to [`rgb_row_bytes`] — the returned value is in `u16` /// elements, not bytes. 
Callers use it to size `&mut [u16]` buffers
diff --git a/src/row/scalar.rs b/src/row/scalar.rs
index 82e9582..c41a2c8 100644
--- a/src/row/scalar.rs
+++ b/src/row/scalar.rs
@@ -35,12 +35,57 @@ pub(crate) fn yuv_420_to_rgb_row(
     width: usize,
     matrix: ColorMatrix,
     full_range: bool,
+) {
+    yuv_420_to_rgb_or_rgba_row::<false>(y, u_half, v_half, rgb_out, width, matrix, full_range);
+}
+
+/// Same as [`yuv_420_to_rgb_row`] but writes packed `R, G, B, A`
+/// quadruplets, with `A = 0xFF` (opaque) for every pixel. The first
+/// three bytes per pixel are byte-identical to what
+/// [`yuv_420_to_rgb_row`] would write — only the per-pixel stride
+/// (4 vs 3) and the alpha byte differ. `rgba_out.len() >= 4 * width`.
+#[cfg_attr(not(tarpaulin), inline(always))]
+pub(crate) fn yuv_420_to_rgba_row(
+    y: &[u8],
+    u_half: &[u8],
+    v_half: &[u8],
+    rgba_out: &mut [u8],
+    width: usize,
+    matrix: ColorMatrix,
+    full_range: bool,
+) {
+    yuv_420_to_rgb_or_rgba_row::<true>(y, u_half, v_half, rgba_out, width, matrix, full_range);
+}
+
+/// Shared scalar kernel for [`yuv_420_to_rgb_row`] (`ALPHA = false`,
+/// 3 bytes / pixel) and [`yuv_420_to_rgba_row`] (`ALPHA = true`,
+/// 4 bytes / pixel — 4th is opaque `0xFF`). The math is identical;
+/// only the per-pixel store differs. `const` generic drives
+/// compile-time monomorphization — each public wrapper is inlined
+/// with the branch eliminated.
+///
+/// # Panics (debug builds)
+///
+/// - `width` must be even.
+/// - `y.len() >= width`, `u_half.len() >= width / 2`,
+///   `v_half.len() >= width / 2`,
+///   `out.len() >= width * (if ALPHA { 4 } else { 3 })`.
+#[cfg_attr(not(tarpaulin), inline(always))]
+pub(crate) fn yuv_420_to_rgb_or_rgba_row<const ALPHA: bool>(
+    y: &[u8],
+    u_half: &[u8],
+    v_half: &[u8],
+    out: &mut [u8],
+    width: usize,
+    matrix: ColorMatrix,
+    full_range: bool,
 ) {
     debug_assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width");
     debug_assert!(y.len() >= width, "y row too short");
     debug_assert!(u_half.len() >= width / 2, "u_half row too short");
     debug_assert!(v_half.len() >= width / 2, "v_half row too short");
-    debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
+    let bpp: usize = if ALPHA { 4 } else { 3 };
+    debug_assert!(out.len() >= width * bpp, "out row too short for {bpp}bpp");
 
     let coeffs = Coefficients::for_matrix(matrix);
     let (y_off, y_scale, c_scale) = range_params(full_range);
@@ -67,15 +112,27 @@ pub(crate) fn yuv_420_to_rgb_row(
         // Pixel x.
         let y0 = ((y[x] as i32 - y_off) * y_scale + RND) >> 15;
-        rgb_out[x * 3] = clamp_u8(y0 + r_chroma);
-        rgb_out[x * 3 + 1] = clamp_u8(y0 + g_chroma);
-        rgb_out[x * 3 + 2] = clamp_u8(y0 + b_chroma);
+        let r0 = clamp_u8(y0 + r_chroma);
+        let g0 = clamp_u8(y0 + g_chroma);
+        let b0 = clamp_u8(y0 + b_chroma);
+        out[x * bpp] = r0;
+        out[x * bpp + 1] = g0;
+        out[x * bpp + 2] = b0;
+        if ALPHA {
+            out[x * bpp + 3] = 0xFF;
+        }
 
         // Pixel x+1 shares chroma.
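+        // (Worked Q15 example with illustrative limited-range constants:
+        // y_off = 16, y_scale = 38154 ≈ 1.164 * 2^15, RND = 1 << 14.
+        // Then Y = 235 gives (219 * 38154 + 16384) >> 15 = 255 and
+        // Y = 16 gives 0, so the studio range lands exactly on 0..=255.)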
let y1 = ((y[x + 1] as i32 - y_off) * y_scale + RND) >> 15; - rgb_out[(x + 1) * 3] = clamp_u8(y1 + r_chroma); - rgb_out[(x + 1) * 3 + 1] = clamp_u8(y1 + g_chroma); - rgb_out[(x + 1) * 3 + 2] = clamp_u8(y1 + b_chroma); + let r1 = clamp_u8(y1 + r_chroma); + let g1 = clamp_u8(y1 + g_chroma); + let b1 = clamp_u8(y1 + b_chroma); + out[(x + 1) * bpp] = r1; + out[(x + 1) * bpp + 1] = g1; + out[(x + 1) * bpp + 2] = b1; + if ALPHA { + out[(x + 1) * bpp + 3] = 0xFF; + } x += 2; } diff --git a/src/sinker/mixed.rs b/src/sinker/mixed.rs index d37146d..e2907a2 100644 --- a/src/sinker/mixed.rs +++ b/src/sinker/mixed.rs @@ -66,12 +66,13 @@ use crate::{ nv24_to_rgb_row, nv42_to_rgb_row, p010_to_rgb_row, p010_to_rgb_u16_row, p012_to_rgb_row, p012_to_rgb_u16_row, p016_to_rgb_row, p016_to_rgb_u16_row, p410_to_rgb_row, p410_to_rgb_u16_row, p412_to_rgb_row, p412_to_rgb_u16_row, p416_to_rgb_row, - p416_to_rgb_u16_row, rgb_to_hsv_row, yuv_420_to_rgb_row, yuv_444_to_rgb_row, - yuv420p9_to_rgb_row, yuv420p9_to_rgb_u16_row, yuv420p10_to_rgb_row, yuv420p10_to_rgb_u16_row, - yuv420p12_to_rgb_row, yuv420p12_to_rgb_u16_row, yuv420p14_to_rgb_row, yuv420p14_to_rgb_u16_row, - yuv420p16_to_rgb_row, yuv420p16_to_rgb_u16_row, yuv444p9_to_rgb_row, yuv444p9_to_rgb_u16_row, - yuv444p10_to_rgb_row, yuv444p10_to_rgb_u16_row, yuv444p12_to_rgb_row, yuv444p12_to_rgb_u16_row, - yuv444p14_to_rgb_row, yuv444p14_to_rgb_u16_row, yuv444p16_to_rgb_row, yuv444p16_to_rgb_u16_row, + p416_to_rgb_u16_row, rgb_to_hsv_row, yuv_420_to_rgb_row, yuv_420_to_rgba_row, + yuv_444_to_rgb_row, yuv420p9_to_rgb_row, yuv420p9_to_rgb_u16_row, yuv420p10_to_rgb_row, + yuv420p10_to_rgb_u16_row, yuv420p12_to_rgb_row, yuv420p12_to_rgb_u16_row, yuv420p14_to_rgb_row, + yuv420p14_to_rgb_u16_row, yuv420p16_to_rgb_row, yuv420p16_to_rgb_u16_row, yuv444p9_to_rgb_row, + yuv444p9_to_rgb_u16_row, yuv444p10_to_rgb_row, yuv444p10_to_rgb_u16_row, yuv444p12_to_rgb_row, + yuv444p12_to_rgb_u16_row, yuv444p14_to_rgb_row, yuv444p14_to_rgb_u16_row, yuv444p16_to_rgb_row, + yuv444p16_to_rgb_u16_row, }, yuv::{ Nv12, Nv12Row, Nv12Sink, Nv16, Nv16Row, Nv16Sink, Nv21, Nv21Row, Nv21Sink, Nv24, Nv24Row, @@ -139,6 +140,32 @@ pub enum MixedSinkerError { actual: usize, }, + /// RGBA buffer attached via [`MixedSinker::with_rgba`] / + /// [`MixedSinker::set_rgba`] is shorter than `width × height × 4`. + /// The fourth byte per pixel is alpha — opaque (`0xFF`) by default + /// when the source has no alpha plane. + #[error("MixedSinker rgba buffer too short: expected >= {expected} bytes, got {actual}")] + RgbaBufferTooShort { + /// Minimum bytes required (`width × height × 4`). + expected: usize, + /// Bytes supplied. + actual: usize, + }, + + /// `u16` RGBA buffer attached via [`MixedSinker::with_rgba_u16`] / + /// [`MixedSinker::set_rgba_u16`] is shorter than `width × height × 4` + /// `u16` elements. Only high‑bit‑depth source impls write into this + /// buffer; the fourth `u16` per pixel is alpha — opaque + /// (`(1 << BITS) - 1`) by default when the source has no alpha + /// plane. + #[error("MixedSinker rgba_u16 buffer too short: expected >= {expected} elements, got {actual}")] + RgbaU16BufferTooShort { + /// Minimum `u16` elements required (`width × height × 4`). + expected: usize, + /// `u16` elements supplied. + actual: usize, + }, + /// Luma buffer is shorter than `width × height`. 
#[error("MixedSinker luma buffer too short: expected >= {expected} bytes, got {actual}")] LumaBufferTooShort { @@ -473,6 +500,8 @@ pub enum RowSlice { pub struct MixedSinker<'a, F: SourceFormat> { rgb: Option<&'a mut [u8]>, rgb_u16: Option<&'a mut [u16]>, + rgba: Option<&'a mut [u8]>, + rgba_u16: Option<&'a mut [u16]>, luma: Option<&'a mut [u8]>, hsv: Option>, width: usize, @@ -818,6 +847,8 @@ impl MixedSinker<'_, F> { Self { rgb: None, rgb_u16: None, + rgba: None, + rgba_u16: None, luma: None, hsv: None, width, @@ -850,6 +881,24 @@ impl MixedSinker<'_, F> { self.rgb_u16.is_some() } + /// Returns `true` iff the sinker will write 8‑bit RGBA. The + /// fourth byte per pixel is alpha — opaque (`0xFF`) by default + /// when the source has no alpha plane. + #[cfg_attr(not(tarpaulin), inline(always))] + pub const fn produces_rgba(&self) -> bool { + self.rgba.is_some() + } + + /// Returns `true` iff the sinker will write `u16` RGBA at the + /// source's native bit depth. The fourth `u16` per pixel is alpha + /// — opaque (`(1 << BITS) - 1`) by default when the source has no + /// alpha plane. Only high‑bit‑depth source impls honor this + /// buffer. + #[cfg_attr(not(tarpaulin), inline(always))] + pub const fn produces_rgba_u16(&self) -> bool { + self.rgba_u16.is_some() + } + /// Returns `true` iff the sinker will write luma. #[cfg_attr(not(tarpaulin), inline(always))] pub const fn produces_luma(&self) -> bool { @@ -949,6 +998,41 @@ impl<'a, F: SourceFormat> MixedSinker<'a, F> { // compile error, not a silent stale‑state bug. Future high‑bit‑depth // markers (12‑bit, 14‑bit, P010) will add their own impl blocks. + /// Attaches a packed 32-bit RGBA output buffer. + /// + /// The fourth byte per pixel is alpha. For sources that **don't** + /// carry an alpha plane (every YUV format shipped today), every + /// alpha byte is filled with `0xFF` (opaque). Future YUVA source + /// impls will copy alpha through from the source plane. + /// + /// Returns `Err(RgbaBufferTooShort)` if + /// `buf.len() < width × height × 4`, or `Err(GeometryOverflow)` on + /// 32‑bit targets when the product overflows. + #[cfg_attr(not(tarpaulin), inline(always))] + pub fn with_rgba(mut self, buf: &'a mut [u8]) -> Result { + self.set_rgba(buf)?; + Ok(self) + } + + /// In-place variant of [`with_rgba`](Self::with_rgba). + #[cfg_attr(not(tarpaulin), inline(always))] + pub fn set_rgba(&mut self, buf: &'a mut [u8]) -> Result<&mut Self, MixedSinkerError> { + let expected = self.frame_bytes(4)?; + if buf.len() < expected { + return Err(MixedSinkerError::RgbaBufferTooShort { + expected, + actual: buf.len(), + }); + } + self.rgba = Some(buf); + Ok(self) + } + + // NOTE: `with_rgba_u16` / `set_rgba_u16` are **not** declared here + // for the same reason as `with_rgb_u16` — they live on the + // format‑specific impl blocks for high‑bit‑depth sources so the + // buffer can only be attached to sinks that actually write it. + /// Attaches a single-plane luma output buffer. /// Returns `Err(LumaBufferTooShort)` if `buf.len() < width × height`, /// or `Err(GeometryOverflow)` on 32‑bit overflow. @@ -1094,6 +1178,7 @@ impl PixelSink for MixedSinker<'_, Yuv420p> { // collide with the `rgb` read-after-write chain below. let Self { rgb, + rgba, luma, hsv, rgb_scratch, @@ -1113,6 +1198,33 @@ impl PixelSink for MixedSinker<'_, Yuv420p> { luma[one_plane_start..one_plane_end].copy_from_slice(&row.y()[..w]); } + // Native RGBA: independent kernel run, separate from RGB. 
Avoids + // the compose-and-expand cost — the const-generic + // `yuv_420_to_rgba_row` writes 4 bytes per pixel directly. + // Default alpha = 0xFF (opaque); future YUVA source impls will + // copy alpha through from the source plane. + if let Some(buf) = rgba.as_deref_mut() { + let rgba_plane_end = + one_plane_end + .checked_mul(4) + .ok_or(MixedSinkerError::GeometryOverflow { + width: w, + height: h, + channels: 4, + })?; + let rgba_plane_start = one_plane_start * 4; // ≤ rgba_plane_end. + yuv_420_to_rgba_row( + row.y(), + row.u_half(), + row.v_half(), + &mut buf[rgba_plane_start..rgba_plane_end], + w, + row.matrix(), + row.full_range(), + use_simd, + ); + } + let want_rgb = rgb.is_some(); let want_hsv = hsv.is_some(); if !want_rgb && !want_hsv { @@ -7298,6 +7410,161 @@ mod tests { assert!(v.iter().all(|&b| b.abs_diff(200) <= 1)); } + // ---- RGBA (Ship 8) tests ------------------------------------------------ + // + // Yuv420p is the template format for the const-generic-ALPHA + // refactor — proves the kernel writes 4 bytes per pixel correctly, + // alpha defaults to 0xFF (sources with no alpha plane), the RGB + // bytes match what `with_rgb` would have written, and SIMD ≡ + // scalar bit-for-bit. Future formats inherit the pattern. + + #[test] + #[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" + )] + fn rgba_only_converts_gray_to_gray_with_opaque_alpha() { + let (yp, up, vp) = solid_yuv420p_frame(16, 8, 128, 128, 128); + let src = Yuv420pFrame::new(&yp, &up, &vp, 16, 8, 16, 8, 8); + + let mut rgba = std::vec![0u8; 16 * 8 * 4]; + let mut sink = MixedSinker::::new(16, 8) + .with_rgba(&mut rgba) + .unwrap(); + yuv420p_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); + + for px in rgba.chunks(4) { + assert!(px[0].abs_diff(128) <= 1, "R"); + assert_eq!(px[0], px[1], "RGB monochromatic"); + assert_eq!(px[1], px[2], "RGB monochromatic"); + assert_eq!(px[3], 0xFF, "alpha must default to opaque"); + } + } + + #[test] + #[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" + )] + fn rgba_alpha_is_opaque_for_arbitrary_color() { + // Non-gray content. The RGB three bytes will vary by pixel; alpha + // must stay 0xFF because Yuv420p has no alpha plane. + let (yp, up, vp) = solid_yuv420p_frame(16, 8, 180, 60, 200); + let src = Yuv420pFrame::new(&yp, &up, &vp, 16, 8, 16, 8, 8); + + let mut rgba = std::vec![0u8; 16 * 8 * 4]; + let mut sink = MixedSinker::::new(16, 8) + .with_rgba(&mut rgba) + .unwrap(); + yuv420p_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); + + for (i, px) in rgba.chunks(4).enumerate() { + assert_eq!(px[3], 0xFF, "alpha must be opaque (px {i})"); + } + } + + #[test] + #[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" + )] + fn with_rgb_and_with_rgba_produce_byte_identical_rgb_bytes() { + // Cross-format invariant: alpha is the *only* difference between + // the two output buffers. RGBA bytes 0..3 of each pixel must + // equal the corresponding RGB pixel exactly. 
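+        // (This invariant is also what would make a compose-then-expand
+        // fallback (3-byte convert, then widen each pixel with a 0xFF
+        // alpha) behaviorally equivalent; the dedicated RGBA kernel
+        // exists only to skip that second pass.)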
+ let w = 32usize; + let h = 16usize; + let (yp, up, vp) = solid_yuv420p_frame(w as u32, h as u32, 180, 60, 200); + let src = Yuv420pFrame::new( + &yp, + &up, + &vp, + w as u32, + h as u32, + w as u32, + (w / 2) as u32, + (w / 2) as u32, + ); + + let mut rgb = std::vec![0u8; w * h * 3]; + let mut rgba = std::vec![0u8; w * h * 4]; + let mut sink = MixedSinker::::new(w, h) + .with_rgb(&mut rgb) + .unwrap() + .with_rgba(&mut rgba) + .unwrap(); + yuv420p_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); + + for i in 0..(w * h) { + assert_eq!(rgba[i * 4], rgb[i * 3], "R differs at pixel {i}"); + assert_eq!(rgba[i * 4 + 1], rgb[i * 3 + 1], "G differs at pixel {i}"); + assert_eq!(rgba[i * 4 + 2], rgb[i * 3 + 2], "B differs at pixel {i}"); + assert_eq!(rgba[i * 4 + 3], 0xFF, "A not opaque at pixel {i}"); + } + } + + #[test] + #[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" + )] + fn rgba_with_simd_false_matches_with_simd_true() { + // SIMD ≡ scalar parity for the RGBA path. Widths chosen to force + // both the SIMD main loop and the scalar tail across every + // backend block size (16 / 32 / 64). 4:2:0 requires even width, + // so the tail is exercised via `block + 2/4/6` rather than odd + // widths. + for &w in &[16usize, 18, 32, 34, 64, 66, 128, 130] { + let h = 8usize; + let (yp, up, vp) = solid_yuv420p_frame(w as u32, h as u32, 180, 60, 200); + let src = Yuv420pFrame::new( + &yp, + &up, + &vp, + w as u32, + h as u32, + w as u32, + (w / 2) as u32, + (w / 2) as u32, + ); + + let mut rgba_simd = std::vec![0u8; w * h * 4]; + let mut rgba_scalar = std::vec![0u8; w * h * 4]; + + let mut sink_simd = MixedSinker::::new(w, h) + .with_rgba(&mut rgba_simd) + .unwrap(); + yuv420p_to(&src, true, ColorMatrix::Bt601, &mut sink_simd).unwrap(); + + let mut sink_scalar = MixedSinker::::new(w, h) + .with_rgba(&mut rgba_scalar) + .unwrap(); + sink_scalar.set_simd(false); + yuv420p_to(&src, true, ColorMatrix::Bt601, &mut sink_scalar).unwrap(); + + assert_eq!( + rgba_simd, rgba_scalar, + "SIMD vs scalar diverged at width {w}" + ); + } + } + + #[test] + fn rgba_buffer_too_short_returns_err() { + let mut rgba_short = std::vec![0u8; 16 * 8 * 4 - 1]; + let result = MixedSinker::::new(16, 8).with_rgba(&mut rgba_short); + let Err(err) = result else { + panic!("expected RgbaBufferTooShort error"); + }; + assert!(matches!( + err, + MixedSinkerError::RgbaBufferTooShort { + expected: 512, + actual: 511, + } + )); + } + #[test] #[cfg_attr( miri, From aa62bff0520c9d14c49f5ee25ce7ea469f8a53b0 Mon Sep 17 00:00:00 2001 From: uqio <276879906+uqio@users.noreply.github.com> Date: Sun, 26 Apr 2026 14:01:21 +1200 Subject: [PATCH 2/5] update --- src/sinker/mixed.rs | 93 +++++++++++++++++++++++++++++---------------- 1 file changed, 61 insertions(+), 32 deletions(-) diff --git a/src/sinker/mixed.rs b/src/sinker/mixed.rs index e2907a2..cd680c5 100644 --- a/src/sinker/mixed.rs +++ b/src/sinker/mixed.rs @@ -998,40 +998,20 @@ impl<'a, F: SourceFormat> MixedSinker<'a, F> { // compile error, not a silent stale‑state bug. Future high‑bit‑depth // markers (12‑bit, 14‑bit, P010) will add their own impl blocks. - /// Attaches a packed 32-bit RGBA output buffer. - /// - /// The fourth byte per pixel is alpha. For sources that **don't** - /// carry an alpha plane (every YUV format shipped today), every - /// alpha byte is filled with `0xFF` (opaque). Future YUVA source - /// impls will copy alpha through from the source plane. 
-    ///
-    /// Returns `Err(RgbaBufferTooShort)` if
-    /// `buf.len() < width × height × 4`, or `Err(GeometryOverflow)` on
-    /// 32‑bit targets when the product overflows.
-    #[cfg_attr(not(tarpaulin), inline(always))]
-    pub fn with_rgba(mut self, buf: &'a mut [u8]) -> Result<Self, MixedSinkerError> {
-        self.set_rgba(buf)?;
-        Ok(self)
-    }
-
-    /// In-place variant of [`with_rgba`](Self::with_rgba).
-    #[cfg_attr(not(tarpaulin), inline(always))]
-    pub fn set_rgba(&mut self, buf: &'a mut [u8]) -> Result<&mut Self, MixedSinkerError> {
-        let expected = self.frame_bytes(4)?;
-        if buf.len() < expected {
-            return Err(MixedSinkerError::RgbaBufferTooShort {
-                expected,
-                actual: buf.len(),
-            });
-        }
-        self.rgba = Some(buf);
-        Ok(self)
-    }
+    // NOTE: `with_rgba` / `set_rgba` are **not** declared here either —
+    // same rationale as `with_rgb_u16` above. The Ship 8 RGBA path is
+    // currently wired only on [`MixedSinker<Yuv420p>`]; attaching an
+    // RGBA buffer to a sink whose `PixelSink::process` doesn't write
+    // it would silently leave the caller buffer untouched while
+    // `produces_rgba()` returned `true`. Each format that writes RGBA
+    // gets its own format‑specific impl block exposing the accessors.
+    // Future formats (NV12 / NV21 / Yuv422p / Yuv444p / P010 / etc.)
+    // add their own impl blocks as RGBA support lands.
+
+    // NOTE: `with_rgba_u16` / `set_rgba_u16` are **not** declared here
+    // for the same reason — they live on the format‑specific impl
+    // blocks for high‑bit‑depth sources that actually write
+    // native‑depth RGBA.

     /// Attaches a single-plane luma output buffer.
     /// Returns `Err(LumaBufferTooShort)` if `buf.len() < width × height`,
     /// or `Err(GeometryOverflow)` on 32‑bit overflow.
@@ -1107,6 +1087,55 @@ impl<'a, F: SourceFormat> MixedSinker<'a, F> {
 
 // ---- Yuv420p impl --------------------------------------------------------
 
+impl<'a> MixedSinker<'a, Yuv420p> {
+    /// Attaches a packed 32‑bit RGBA output buffer.
+    ///
+    /// Only available on sinker types whose `PixelSink` impl writes
+    /// RGBA — calling `with_rgba` on a sink that doesn't (e.g.
+    /// [`MixedSinker<Nv12>`] today) is a compile error rather than a
+    /// silent no‑op that would leave the caller's buffer stale while
+    /// [`Self::produces_rgba`] returned `true`. The compile-time
+    /// scoping is load-bearing: if a future format adds RGBA, it must
+    /// add its own impl block here, which both wires the new path and
+    /// prevents accidental cross-format leakage.
+    ///
+    /// The fourth byte per pixel is alpha. [`Yuv420p`] has no alpha
+    /// plane, so every alpha byte is filled with `0xFF` (opaque).
+    /// Future YUVA source impls will copy alpha through from the
+    /// source plane.
+    ///
+    /// Returns `Err(RgbaBufferTooShort)` if
+    /// `buf.len() < width × height × 4`, or `Err(GeometryOverflow)` on
+    /// 32‑bit targets when the product overflows.
+    ///
+    /// ```compile_fail
+    /// // Attaching RGBA to a sink that doesn't write it is rejected
+    /// // at compile time:
+    /// use colconv::{sinker::MixedSinker, yuv::Nv12};
+    /// let mut buf = vec![0u8; 16 * 8 * 4];
+    /// let _ = MixedSinker::<Nv12>::new(16, 8).with_rgba(&mut buf);
+    /// ```
+    #[cfg_attr(not(tarpaulin), inline(always))]
+    pub fn with_rgba(mut self, buf: &'a mut [u8]) -> Result<Self, MixedSinkerError> {
+        self.set_rgba(buf)?;
+        Ok(self)
+    }
+
+    /// In-place variant of [`with_rgba`](Self::with_rgba).
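The scoping trick generalizes; a self-contained sketch with hypothetical names, separate from the crate's types:

```rust
use std::marker::PhantomData;

struct Yuv420p; // marker for a format whose sink writes RGBA
struct Nv12; // marker for a format that does not (yet)

struct Sink<F>(PhantomData<F>);

impl<F> Sink<F> {
    fn new() -> Self {
        Sink(PhantomData)
    }
}

// The accessor exists only for the marker whose `process` actually
// writes RGBA, so misuse is a type error instead of a silent no-op.
impl Sink<Yuv420p> {
    fn with_rgba(self) -> Self {
        self
    }
}

fn main() {
    let _ok = Sink::<Yuv420p>::new().with_rgba();
    let _no_rgba = Sink::<Nv12>::new();
    // `Sink::<Nv12>::new().with_rgba()` would not compile.
}
```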
+ #[cfg_attr(not(tarpaulin), inline(always))] + pub fn set_rgba(&mut self, buf: &'a mut [u8]) -> Result<&mut Self, MixedSinkerError> { + let expected = self.frame_bytes(4)?; + if buf.len() < expected { + return Err(MixedSinkerError::RgbaBufferTooShort { + expected, + actual: buf.len(), + }); + } + self.rgba = Some(buf); + Ok(self) + } +} + impl PixelSink for MixedSinker<'_, Yuv420p> { type Input<'r> = Yuv420pRow<'r>; type Error = MixedSinkerError; From 36fee2633545a25956adb15d05a5741c0b1fe6bf Mon Sep 17 00:00:00 2001 From: uqio <276879906+uqio@users.noreply.github.com> Date: Sun, 26 Apr 2026 14:12:42 +1200 Subject: [PATCH 3/5] update --- src/sinker/mixed.rs | 88 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 88 insertions(+) diff --git a/src/sinker/mixed.rs b/src/sinker/mixed.rs index cd680c5..bf52196 100644 --- a/src/sinker/mixed.rs +++ b/src/sinker/mixed.rs @@ -7594,6 +7594,86 @@ mod tests { )); } + #[test] + #[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" + )] + fn yuv_420_to_rgba_simd_matches_scalar_with_random_yuv() { + // The earlier `rgba_with_simd_false_matches_with_simd_true` test + // uses solid Y/U/V, so every pixel collapses to the same RGBA + // quad and the new RGBA shuffle masks could permute / duplicate + // lanes within a SIMD block undetected. This test uses + // **pseudo-random per-pixel Y/U/V** so a bad shuffle in any of + // `write_rgba_16` (SSE4.1 / AVX2 / AVX-512 / wasm), `vst4q_u8` + // (NEON), or the scalar-tail handoff produces a measurable + // diff against the scalar reference. Width 1922 forces both + // the SIMD main loop AND a scalar tail across every backend + // block size (16 / 32 / 64). All four `ColorMatrix` variants + // exercise different `(r_u, r_v, g_u, g_v, b_u, b_v)` + // coefficient sets, and both ranges exercise the `y_off` / + // `y_scale` / `c_scale` parameter shape. + let w = 1922usize; + let h = 4usize; + let mut yp = std::vec![0u8; w * h]; + let mut up = std::vec![0u8; (w / 2) * (h / 2)]; + let mut vp = std::vec![0u8; (w / 2) * (h / 2)]; + pseudo_random_u8(&mut yp, 0xC001_C0DE); + pseudo_random_u8(&mut up, 0xCAFE_F00D); + pseudo_random_u8(&mut vp, 0xDEAD_BEEF); + let src = Yuv420pFrame::new( + &yp, + &up, + &vp, + w as u32, + h as u32, + w as u32, + (w / 2) as u32, + (w / 2) as u32, + ); + + for &matrix in &[ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::YCgCo, + ] { + for &full_range in &[true, false] { + let mut rgba_simd = std::vec![0u8; w * h * 4]; + let mut rgba_scalar = std::vec![0u8; w * h * 4]; + + let mut s_simd = MixedSinker::::new(w, h) + .with_rgba(&mut rgba_simd) + .unwrap(); + yuv420p_to(&src, full_range, matrix, &mut s_simd).unwrap(); + + let mut s_scalar = MixedSinker::::new(w, h) + .with_rgba(&mut rgba_scalar) + .unwrap(); + s_scalar.set_simd(false); + yuv420p_to(&src, full_range, matrix, &mut s_scalar).unwrap(); + + // Locate the first divergence to make backend-bug + // diagnosis tractable instead of dumping ~30 KB of bytes. 
+ if rgba_simd != rgba_scalar {
+ let mismatch = rgba_simd
+ .iter()
+ .zip(rgba_scalar.iter())
+ .position(|(a, b)| a != b)
+ .unwrap();
+ let pixel = mismatch / 4;
+ let channel = ["R", "G", "B", "A"][mismatch % 4];
+ panic!(
+ "RGBA SIMD ≠ scalar at byte {mismatch} (px {pixel} {channel}) \
+ for matrix={matrix:?} full_range={full_range}: \
+ simd={} scalar={}",
+ rgba_simd[mismatch], rgba_scalar[mismatch]
+ );
+ }
+ }
+ }
+ }
+
 #[test]
 #[cfg_attr(
 miri,
@@ -10798,6 +10878,14 @@ mod tests {
 }
 }

+ fn pseudo_random_u8(buf: &mut [u8], seed: u32) {
+ let mut state = seed;
+ for b in buf {
+ state = state.wrapping_mul(1_664_525).wrapping_add(1_013_904_223);
+ *b = (state >> 16) as u8;
+ }
+ }
+
 #[test]
 #[cfg_attr(
 miri,

From c6b0526ad589509f0b61761a234c73939391b174 Mon Sep 17 00:00:00 2001
From: uqio <276879906+uqio@users.noreply.github.com>
Date: Sun, 26 Apr 2026 14:28:05 +1200
Subject: [PATCH 4/5] update

---
 src/row/arch/neon.rs | 78 ++++++++++++++++++++++++++++++++
 src/row/arch/wasm_simd128.rs | 60 +++++++++++++++++++++++++
 src/row/arch/x86_avx2.rs | 86 +++++++++++++++++++++++++++++++++++
 src/row/arch/x86_avx512.rs | 86 +++++++++++++++++++++++++++++++++++
 src/row/arch/x86_sse41.rs | 87 ++++++++++++++++++++++++++++++++++++
 5 files changed, 397 insertions(+)

diff --git a/src/row/arch/neon.rs b/src/row/arch/neon.rs
index bfc52f6..24f4ae1 100644
--- a/src/row/arch/neon.rs
+++ b/src/row/arch/neon.rs
@@ -3416,6 +3416,84 @@ mod tests {
 }
 }

+ // ---- yuv_420_to_rgba_row equivalence --------------------------------
+ //
+ // Direct backend test for the new RGBA path: bypasses the public
+ // dispatcher so the NEON `vst4q_u8` write is exercised regardless
+ // of what tier the dispatcher would pick on the current runner.
+ // Catches lane-order or alpha-splat corruption in `vst4q_u8` that
+ // a dispatcher-routed test on a different host would miss.
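+ //
+ // The fixture below synthesizes deterministic per-channel ramps
+ // (`(i * 37 + 11) & 0xFF` and friends) rather than RNG output, so
+ // any failure reproduces bit-for-bit across runs and hosts.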
+
+ fn check_rgba_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) {
+ let y: std::vec::Vec<u8> = (0..width).map(|i| ((i * 37 + 11) & 0xFF) as u8).collect();
+ let u: std::vec::Vec<u8> = (0..width / 2)
+ .map(|i| ((i * 53 + 23) & 0xFF) as u8)
+ .collect();
+ let v: std::vec::Vec<u8> = (0..width / 2)
+ .map(|i| ((i * 71 + 91) & 0xFF) as u8)
+ .collect();
+ let mut rgba_scalar = std::vec![0u8; width * 4];
+ let mut rgba_neon = std::vec![0u8; width * 4];
+
+ scalar::yuv_420_to_rgba_row(&y, &u, &v, &mut rgba_scalar, width, matrix, full_range);
+ unsafe {
+ yuv_420_to_rgba_row(&y, &u, &v, &mut rgba_neon, width, matrix, full_range);
+ }
+
+ if rgba_scalar != rgba_neon {
+ let first_diff = rgba_scalar
+ .iter()
+ .zip(rgba_neon.iter())
+ .position(|(a, b)| a != b)
+ .unwrap();
+ let pixel = first_diff / 4;
+ let channel = ["R", "G", "B", "A"][first_diff % 4];
+ panic!(
+ "NEON RGBA diverges from scalar at byte {first_diff} (px {pixel} {channel}, width={width}, matrix={matrix:?}, full_range={full_range}): scalar={} neon={}",
+ rgba_scalar[first_diff], rgba_neon[first_diff]
+ );
+ }
+ }
+
+ #[test]
+ #[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")]
+ fn neon_rgba_matches_scalar_all_matrices_16() {
+ for m in [
+ ColorMatrix::Bt601,
+ ColorMatrix::Bt709,
+ ColorMatrix::Bt2020Ncl,
+ ColorMatrix::Smpte240m,
+ ColorMatrix::Fcc,
+ ColorMatrix::YCgCo,
+ ] {
+ for full in [true, false] {
+ check_rgba_equivalence(16, m, full);
+ }
+ }
+ }
+
+ #[test]
+ #[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")]
+ fn neon_rgba_matches_scalar_width_32() {
+ check_rgba_equivalence(32, ColorMatrix::Bt601, true);
+ check_rgba_equivalence(32, ColorMatrix::Bt709, false);
+ check_rgba_equivalence(32, ColorMatrix::YCgCo, true);
+ }
+
+ #[test]
+ #[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")]
+ fn neon_rgba_matches_scalar_width_1920() {
+ check_rgba_equivalence(1920, ColorMatrix::Bt709, false);
+ }
+
+ #[test]
+ #[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")]
+ fn neon_rgba_matches_scalar_odd_tail_widths() {
+ for w in [18usize, 30, 34, 1922] {
+ check_rgba_equivalence(w, ColorMatrix::Bt601, false);
+ }
+ }
+
 // ---- nv12_to_rgb_row equivalence ------------------------------------

 /// Scalar‑equivalence fixture for NV12. Builds an interleaved UV row
diff --git a/src/row/arch/wasm_simd128.rs b/src/row/arch/wasm_simd128.rs
index 26b8363..2ff0619 100644
--- a/src/row/arch/wasm_simd128.rs
+++ b/src/row/arch/wasm_simd128.rs
@@ -3419,6 +3419,66 @@ mod tests {
 }
 }

+ // ---- yuv_420_to_rgba_row equivalence --------------------------------
+ //
+ // Direct backend test for the new RGBA path: bypasses the public
+ // dispatcher so the wasm `write_rgba_16` swizzle (4-mask + 4
+ // store) is exercised on every wasm32+simd128 target.
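+ // Unlike the x86 backends there is no runtime feature probe:
+ // simd128 is a compile-time target feature, so these tests need
+ // no `is_x86_feature_detected!`-style guard.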
+
+ fn check_rgba_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) {
+ let y: std::vec::Vec<u8> = (0..width).map(|i| ((i * 37 + 11) & 0xFF) as u8).collect();
+ let u: std::vec::Vec<u8> = (0..width / 2)
+ .map(|i| ((i * 53 + 23) & 0xFF) as u8)
+ .collect();
+ let v: std::vec::Vec<u8> = (0..width / 2)
+ .map(|i| ((i * 71 + 91) & 0xFF) as u8)
+ .collect();
+ let mut rgba_scalar = std::vec![0u8; width * 4];
+ let mut rgba_wasm = std::vec![0u8; width * 4];
+
+ scalar::yuv_420_to_rgba_row(&y, &u, &v, &mut rgba_scalar, width, matrix, full_range);
+ unsafe {
+ yuv_420_to_rgba_row(&y, &u, &v, &mut rgba_wasm, width, matrix, full_range);
+ }
+
+ if rgba_scalar != rgba_wasm {
+ let first_diff = rgba_scalar
+ .iter()
+ .zip(rgba_wasm.iter())
+ .position(|(a, b)| a != b)
+ .unwrap();
+ let pixel = first_diff / 4;
+ let channel = ["R", "G", "B", "A"][first_diff % 4];
+ panic!(
+ "wasm simd128 RGBA diverges from scalar at byte {first_diff} (px {pixel} {channel}, width={width}, matrix={matrix:?}, full_range={full_range}): scalar={} wasm={}",
+ rgba_scalar[first_diff], rgba_wasm[first_diff]
+ );
+ }
+ }
+
+ #[test]
+ fn simd128_rgba_matches_scalar_all_matrices_16() {
+ for m in [
+ ColorMatrix::Bt601,
+ ColorMatrix::Bt709,
+ ColorMatrix::Bt2020Ncl,
+ ColorMatrix::Smpte240m,
+ ColorMatrix::Fcc,
+ ColorMatrix::YCgCo,
+ ] {
+ for full in [true, false] {
+ check_rgba_equivalence(16, m, full);
+ }
+ }
+ }
+
+ #[test]
+ fn simd128_rgba_matches_scalar_tail_widths() {
+ for w in [18usize, 30, 34, 1922] {
+ check_rgba_equivalence(w, ColorMatrix::Bt601, false);
+ }
+ }
+
 // ---- nv12_to_rgb_row equivalence ------------------------------------

 fn check_nv12_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) {
diff --git a/src/row/arch/x86_avx2.rs b/src/row/arch/x86_avx2.rs
index dd384cb..54742dc 100644
--- a/src/row/arch/x86_avx2.rs
+++ b/src/row/arch/x86_avx2.rs
@@ -3663,6 +3663,92 @@ mod tests {
 }
 }

+ // ---- yuv_420_to_rgba_row equivalence --------------------------------
+ //
+ // Direct backend test for the new RGBA path: bypasses the public
+ // dispatcher so the AVX2 `write_rgba_32` path (two halves through
+ // `write_rgba_16`) is exercised regardless of what tier the
+ // dispatcher would pick on the current runner.
+
+ fn check_rgba_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) {
+ let y: std::vec::Vec<u8> = (0..width).map(|i| ((i * 37 + 11) & 0xFF) as u8).collect();
+ let u: std::vec::Vec<u8> = (0..width / 2)
+ .map(|i| ((i * 53 + 23) & 0xFF) as u8)
+ .collect();
+ let v: std::vec::Vec<u8> = (0..width / 2)
+ .map(|i| ((i * 71 + 91) & 0xFF) as u8)
+ .collect();
+ let mut rgba_scalar = std::vec![0u8; width * 4];
+ let mut rgba_avx2 = std::vec![0u8; width * 4];
+
+ scalar::yuv_420_to_rgba_row(&y, &u, &v, &mut rgba_scalar, width, matrix, full_range);
+ unsafe {
+ yuv_420_to_rgba_row(&y, &u, &v, &mut rgba_avx2, width, matrix, full_range);
+ }
+
+ if rgba_scalar != rgba_avx2 {
+ let first_diff = rgba_scalar
+ .iter()
+ .zip(rgba_avx2.iter())
+ .position(|(a, b)| a != b)
+ .unwrap();
+ let pixel = first_diff / 4;
+ let channel = ["R", "G", "B", "A"][first_diff % 4];
+ panic!(
+ "AVX2 RGBA diverges from scalar at byte {first_diff} (px {pixel} {channel}, width={width}, matrix={matrix:?}, full_range={full_range}): scalar={} avx2={}",
+ rgba_scalar[first_diff], rgba_avx2[first_diff]
+ );
+ }
+ }
+
+ #[test]
+ fn avx2_rgba_matches_scalar_all_matrices_32() {
+ if !std::arch::is_x86_feature_detected!("avx2") {
+ return;
+ }
+ for m in [
+ ColorMatrix::Bt601,
+ ColorMatrix::Bt709,
+ ColorMatrix::Bt2020Ncl,
+ ColorMatrix::Smpte240m,
+ ColorMatrix::Fcc,
+ ColorMatrix::YCgCo,
+ ] {
+ for full in [true, false] {
+ check_rgba_equivalence(32, m, full);
+ }
+ }
+ }
+
+ #[test]
+ fn avx2_rgba_matches_scalar_width_64() {
+ if !std::arch::is_x86_feature_detected!("avx2") {
+ return;
+ }
+ check_rgba_equivalence(64, ColorMatrix::Bt601, true);
+ check_rgba_equivalence(64, ColorMatrix::Bt709, false);
+ check_rgba_equivalence(64, ColorMatrix::YCgCo, true);
+ }
+
+ #[test]
+ fn avx2_rgba_matches_scalar_width_1920() {
+ if !std::arch::is_x86_feature_detected!("avx2") {
+ return;
+ }
+ check_rgba_equivalence(1920, ColorMatrix::Bt709, false);
+ }
+
+ #[test]
+ fn avx2_rgba_matches_scalar_odd_tail_widths() {
+ if !std::arch::is_x86_feature_detected!("avx2") {
+ return;
+ }
+ // Widths that leave a non‑trivial scalar tail (non‑multiple of 32).
+ for w in [34usize, 46, 62, 1922] {
+ check_rgba_equivalence(w, ColorMatrix::Bt601, false);
+ }
+ }
+
 // ---- nv12_to_rgb_row equivalence ------------------------------------

 fn check_nv12_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) {
diff --git a/src/row/arch/x86_avx512.rs b/src/row/arch/x86_avx512.rs
index 3e01120..9e17ee9 100644
--- a/src/row/arch/x86_avx512.rs
+++ b/src/row/arch/x86_avx512.rs
@@ -3832,6 +3832,92 @@ mod tests {
 }
 }

+ // ---- yuv_420_to_rgba_row equivalence --------------------------------
+ //
+ // Direct backend test for the new RGBA path: bypasses the public
+ // dispatcher so the AVX‑512 `write_rgba_64` path (four quarters
+ // through `write_rgba_16`) is exercised regardless of what tier
+ // the dispatcher would pick on the current runner.
+
+ fn check_rgba_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) {
+ let y: std::vec::Vec<u8> = (0..width).map(|i| ((i * 37 + 11) & 0xFF) as u8).collect();
+ let u: std::vec::Vec<u8> = (0..width / 2)
+ .map(|i| ((i * 53 + 23) & 0xFF) as u8)
+ .collect();
+ let v: std::vec::Vec<u8> = (0..width / 2)
+ .map(|i| ((i * 71 + 91) & 0xFF) as u8)
+ .collect();
+ let mut rgba_scalar = std::vec![0u8; width * 4];
+ let mut rgba_avx512 = std::vec![0u8; width * 4];
+
+ scalar::yuv_420_to_rgba_row(&y, &u, &v, &mut rgba_scalar, width, matrix, full_range);
+ unsafe {
+ yuv_420_to_rgba_row(&y, &u, &v, &mut rgba_avx512, width, matrix, full_range);
+ }
+
+ if rgba_scalar != rgba_avx512 {
+ let first_diff = rgba_scalar
+ .iter()
+ .zip(rgba_avx512.iter())
+ .position(|(a, b)| a != b)
+ .unwrap();
+ let pixel = first_diff / 4;
+ let channel = ["R", "G", "B", "A"][first_diff % 4];
+ panic!(
+ "AVX‑512 RGBA diverges from scalar at byte {first_diff} (px {pixel} {channel}, width={width}, matrix={matrix:?}, full_range={full_range}): scalar={} avx512={}",
+ rgba_scalar[first_diff], rgba_avx512[first_diff]
+ );
+ }
+ }
+
+ #[test]
+ fn avx512_rgba_matches_scalar_all_matrices_64() {
+ if !std::arch::is_x86_feature_detected!("avx512bw") {
+ return;
+ }
+ for m in [
+ ColorMatrix::Bt601,
+ ColorMatrix::Bt709,
+ ColorMatrix::Bt2020Ncl,
+ ColorMatrix::Smpte240m,
+ ColorMatrix::Fcc,
+ ColorMatrix::YCgCo,
+ ] {
+ for full in [true, false] {
+ check_rgba_equivalence(64, m, full);
+ }
+ }
+ }
+
+ #[test]
+ fn avx512_rgba_matches_scalar_width_128() {
+ if !std::arch::is_x86_feature_detected!("avx512bw") {
+ return;
+ }
+ check_rgba_equivalence(128, ColorMatrix::Bt601, true);
+ check_rgba_equivalence(128, ColorMatrix::Bt709, false);
+ check_rgba_equivalence(128, ColorMatrix::YCgCo, true);
+ }
+
+ #[test]
+ fn avx512_rgba_matches_scalar_width_1920() {
+ if !std::arch::is_x86_feature_detected!("avx512bw") {
+ return;
+ }
+ check_rgba_equivalence(1920, ColorMatrix::Bt709, false);
+ }
+
+ #[test]
+ fn avx512_rgba_matches_scalar_odd_tail_widths() {
+ if !std::arch::is_x86_feature_detected!("avx512bw") {
+ return;
+ }
+ // Widths that leave a non‑trivial scalar tail (non‑multiple of 64).
+ for w in [66usize, 94, 126, 1922] {
+ check_rgba_equivalence(w, ColorMatrix::Bt601, false);
+ }
+ }
+
 // ---- nv12_to_rgb_row equivalence ------------------------------------

 fn check_nv12_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) {
diff --git a/src/row/arch/x86_sse41.rs b/src/row/arch/x86_sse41.rs
index 6d79ad8..c431c72 100644
--- a/src/row/arch/x86_sse41.rs
+++ b/src/row/arch/x86_sse41.rs
@@ -3142,6 +3142,93 @@ mod tests {
 }
 }

+ // ---- yuv_420_to_rgba_row equivalence --------------------------------
+ //
+ // Direct backend test for the new RGBA path: bypasses the public
+ // dispatcher so the SSE4.1 `write_rgba_16` shuffle masks are
+ // exercised regardless of what tier the dispatcher would pick on
+ // the current runner. Catches lane-order, shuffle-mask, or alpha
+ // splat corruption that an AVX2- or AVX-512-routed test would
+ // miss.
+
+ fn check_rgba_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) {
+ let y: std::vec::Vec<u8> = (0..width).map(|i| ((i * 37 + 11) & 0xFF) as u8).collect();
+ let u: std::vec::Vec<u8> = (0..width / 2)
+ .map(|i| ((i * 53 + 23) & 0xFF) as u8)
+ .collect();
+ let v: std::vec::Vec<u8> = (0..width / 2)
+ .map(|i| ((i * 71 + 91) & 0xFF) as u8)
+ .collect();
+ let mut rgba_scalar = std::vec![0u8; width * 4];
+ let mut rgba_sse41 = std::vec![0u8; width * 4];
+
+ scalar::yuv_420_to_rgba_row(&y, &u, &v, &mut rgba_scalar, width, matrix, full_range);
+ unsafe {
+ yuv_420_to_rgba_row(&y, &u, &v, &mut rgba_sse41, width, matrix, full_range);
+ }
+
+ if rgba_scalar != rgba_sse41 {
+ let first_diff = rgba_scalar
+ .iter()
+ .zip(rgba_sse41.iter())
+ .position(|(a, b)| a != b)
+ .unwrap();
+ let pixel = first_diff / 4;
+ let channel = ["R", "G", "B", "A"][first_diff % 4];
+ panic!(
+ "SSE4.1 RGBA diverges from scalar at byte {first_diff} (px {pixel} {channel}, width={width}, matrix={matrix:?}, full_range={full_range}): scalar={} sse41={}",
+ rgba_scalar[first_diff], rgba_sse41[first_diff]
+ );
+ }
+ }
+
+ #[test]
+ fn sse41_rgba_matches_scalar_all_matrices_16() {
+ if !std::arch::is_x86_feature_detected!("sse4.1") {
+ return;
+ }
+ for m in [
+ ColorMatrix::Bt601,
+ ColorMatrix::Bt709,
+ ColorMatrix::Bt2020Ncl,
+ ColorMatrix::Smpte240m,
+ ColorMatrix::Fcc,
+ ColorMatrix::YCgCo,
+ ] {
+ for full in [true, false] {
+ check_rgba_equivalence(16, m, full);
+ }
+ }
+ }
+
+ #[test]
+ fn sse41_rgba_matches_scalar_width_32() {
+ if !std::arch::is_x86_feature_detected!("sse4.1") {
+ return;
+ }
+ check_rgba_equivalence(32, ColorMatrix::Bt601, true);
+ check_rgba_equivalence(32, ColorMatrix::Bt709, false);
+ check_rgba_equivalence(32, ColorMatrix::YCgCo, true);
+ }
+
+ #[test]
+ fn sse41_rgba_matches_scalar_width_1920() {
+ if !std::arch::is_x86_feature_detected!("sse4.1") {
+ return;
+ }
+ check_rgba_equivalence(1920, ColorMatrix::Bt709, false);
+ }
+
+ #[test]
+ fn sse41_rgba_matches_scalar_odd_tail_widths() {
+ if !std::arch::is_x86_feature_detected!("sse4.1") {
+ return;
+ }
+ for w in [18usize, 30, 34, 1922] {
+ check_rgba_equivalence(w, ColorMatrix::Bt601, false);
+ }
+ }
+
 // ---- nv12_to_rgb_row equivalence ------------------------------------

 fn check_nv12_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) {

From be024521109b771a32cc8ad6b26b9d378980a3a0 Mon Sep 17 00:00:00 2001
From: uqio <276879906+uqio@users.noreply.github.com>
Date: Sun, 26 Apr 2026 15:04:19 +1200
Subject: [PATCH 5/5] update

---
 src/sinker/mixed.rs | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/sinker/mixed.rs b/src/sinker/mixed.rs
index bf52196..5e852fb 100644
--- a/src/sinker/mixed.rs
+++ b/src/sinker/mixed.rs
@@ -152,12 +152,12 @@ pub enum MixedSinkerError {
 actual: usize,
 },

- /// `u16` RGBA buffer attached via [`MixedSinker::with_rgba_u16`] /
- /// [`MixedSinker::set_rgba_u16`] is shorter than `width × height × 4`
- /// `u16` elements. Only high‑bit‑depth source impls write into this
- /// buffer; the fourth `u16` per pixel is alpha — opaque
- /// (`(1 << BITS) - 1`) by default when the source has no alpha
- /// plane.
+ /// `u16` RGBA buffer attached via `with_rgba_u16` / `set_rgba_u16`
+ /// (per-format impl, not yet shipped on any sink) is shorter than
+ /// `width × height × 4` `u16` elements.
Only high‑bit‑depth source + /// impls write into this buffer; the fourth `u16` per pixel is + /// alpha — opaque (`(1 << BITS) - 1`) by default when the source + /// has no alpha plane. #[error("MixedSinker rgba_u16 buffer too short: expected >= {expected} elements, got {actual}")] RgbaU16BufferTooShort { /// Minimum `u16` elements required (`width × height × 4`).