From 4a79d1e541a883bf86bbb9cf569ce269bf6ec4d3 Mon Sep 17 00:00:00 2001 From: uqio <276879906+uqio@users.noreply.github.com> Date: Mon, 27 Apr 2026 14:46:08 +1200 Subject: [PATCH 1/2] update --- src/row/arch/neon.rs | 355 +++++++++++++++++++++---- src/row/arch/neon/tests.rs | 179 +++++++++++++ src/row/arch/wasm_simd128.rs | 332 +++++++++++++++++++++--- src/row/arch/wasm_simd128/tests.rs | 183 +++++++++++++ src/row/arch/x86_avx2.rs | 332 +++++++++++++++++++++--- src/row/arch/x86_avx2/tests.rs | 193 ++++++++++++++ src/row/arch/x86_avx512.rs | 331 +++++++++++++++++++++--- src/row/arch/x86_avx512/tests.rs | 197 ++++++++++++++ src/row/arch/x86_sse41.rs | 333 +++++++++++++++++++++--- src/row/arch/x86_sse41/tests.rs | 193 ++++++++++++++ src/row/mod.rs | 398 ++++++++++++++++++++++++++++- 11 files changed, 2794 insertions(+), 232 deletions(-) diff --git a/src/row/arch/neon.rs b/src/row/arch/neon.rs index 9114045..e3079b6 100644 --- a/src/row/arch/neon.rs +++ b/src/row/arch/neon.rs @@ -703,6 +703,8 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_or_rgba_u16_row= width`, `u.len() >= width`, `v.len() >= width`, `rgb_out.len() >= 3 * width`. @@ -716,12 +718,68 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_row( width: usize, matrix: ColorMatrix, full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + yuv_444p_n_to_rgb_or_rgba_row::(y, u, v, rgb_out, width, matrix, full_range); + } +} + +/// NEON YUV 4:4:4 planar high-bit-depth → packed **8-bit RGBA** +/// (`R, G, B, 0xFF`). Same numerical contract as +/// [`yuv_444p_n_to_rgb_row`]; the only differences are the per-pixel +/// stride (4 vs 3) and the constant alpha byte (`0xFF`, opaque). +/// +/// Thin wrapper over [`yuv_444p_n_to_rgb_or_rgba_row`] with `ALPHA = true`. +/// +/// # Safety +/// +/// Same as [`yuv_444p_n_to_rgb_row`] but `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "neon")] +pub(crate) unsafe fn yuv_444p_n_to_rgba_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + yuv_444p_n_to_rgb_or_rgba_row::(y, u, v, rgba_out, width, matrix, full_range); + } +} + +/// Shared NEON high-bit-depth YUV 4:4:4 kernel for +/// [`yuv_444p_n_to_rgb_row`] (`ALPHA = false`, `vst3q_u8`) and +/// [`yuv_444p_n_to_rgba_row`] (`ALPHA = true`, `vst4q_u8` with +/// constant `0xFF` alpha vector). +/// +/// # Safety +/// +/// 1. **NEON must be available.** +/// 2. `y.len() >= width`, `u.len() >= width`, `v.len() >= width`, +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. +/// 3. `BITS` must be one of `{9, 10, 12, 14}`. +#[inline] +#[target_feature(enable = "neon")] +pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_row( + y: &[u16], + u: &[u16], + v: &[u16], + out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, ) { const { assert!(BITS == 9 || BITS == 10 || BITS == 12 || BITS == 14) }; + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert!(y.len() >= width); debug_assert!(u.len() >= width); debug_assert!(v.len() >= width); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); @@ -741,6 +799,7 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_row( let cgv = vdupq_n_s32(coeffs.g_v()); let cbu = vdupq_n_s32(coeffs.b_u()); let cbv = vdupq_n_s32(coeffs.b_v()); + let alpha_u8 = vdupq_n_u8(0xFF); let mut x = 0usize; while x + 16 <= width { @@ -803,22 +862,33 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_row( vqmovun_s16(vqaddq_s16(y_scaled_hi, r_chroma_hi)), ); - let rgb = uint8x16x3_t(r_u8, g_u8, b_u8); - vst3q_u8(rgb_out.as_mut_ptr().add(x * 3), rgb); + if ALPHA { + vst4q_u8( + out.as_mut_ptr().add(x * 4), + uint8x16x4_t(r_u8, g_u8, b_u8, alpha_u8), + ); + } else { + vst3q_u8(out.as_mut_ptr().add(x * 3), uint8x16x3_t(r_u8, g_u8, b_u8)); + } x += 16; } if x < width { - scalar::yuv_444p_n_to_rgb_row::( - &y[x..width], - &u[x..width], - &v[x..width], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_u = &u[x..width]; + let tail_v = &v[x..width]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::yuv_444p_n_to_rgba_row::( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); + } else { + scalar::yuv_444p_n_to_rgb_row::( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); + } } } } @@ -2576,6 +2646,8 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_or_rgba_u16_row( /// chroma pipeline as 10/12/14 (u8 output clamps `c_scale` down); /// 1:1 chroma per Y pixel, no width parity. /// +/// Thin wrapper over [`yuv_444p16_to_rgb_or_rgba_row`] with `ALPHA = false`. +/// /// # Safety /// /// Same as [`yuv_444p_n_to_rgb_row`] but with full `u16` samples. @@ -2590,10 +2662,63 @@ pub(crate) unsafe fn yuv_444p16_to_rgb_row( matrix: ColorMatrix, full_range: bool, ) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + yuv_444p16_to_rgb_or_rgba_row::(y, u, v, rgb_out, width, matrix, full_range); + } +} + +/// NEON YUV 4:4:4 planar **16-bit** → packed **8-bit RGBA** +/// (`R, G, B, 0xFF`). Same numerical contract as +/// [`yuv_444p16_to_rgb_row`]. +/// +/// Thin wrapper over [`yuv_444p16_to_rgb_or_rgba_row`] with `ALPHA = true`. +/// +/// # Safety +/// +/// Same as [`yuv_444p16_to_rgb_row`] but `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "neon")] +pub(crate) unsafe fn yuv_444p16_to_rgba_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + yuv_444p16_to_rgb_or_rgba_row::(y, u, v, rgba_out, width, matrix, full_range); + } +} + +/// Shared NEON 16-bit YUV 4:4:4 kernel for [`yuv_444p16_to_rgb_row`] +/// (`ALPHA = false`, `vst3q_u8`) and [`yuv_444p16_to_rgba_row`] +/// (`ALPHA = true`, `vst4q_u8` with constant `0xFF` alpha). +/// +/// # Safety +/// +/// 1. NEON must be available on the current CPU. +/// 2. `y.len() >= width`, `u.len() >= width`, `v.len() >= width`, +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. +#[inline] +#[target_feature(enable = "neon")] +pub(crate) unsafe fn yuv_444p16_to_rgb_or_rgba_row( + y: &[u16], + u: &[u16], + v: &[u16], + out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert!(y.len() >= width); debug_assert!(u.len() >= width); debug_assert!(v.len() >= width); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::<16, 8>(full_range); @@ -2612,6 +2737,7 @@ pub(crate) unsafe fn yuv_444p16_to_rgb_row( let cgv = vdupq_n_s32(coeffs.g_v()); let cbu = vdupq_n_s32(coeffs.b_u()); let cbv = vdupq_n_s32(coeffs.b_v()); + let alpha_u8 = vdupq_n_u8(0xFF); let mut x = 0usize; while x + 16 <= width { @@ -2688,23 +2814,30 @@ pub(crate) unsafe fn yuv_444p16_to_rgb_row( vqmovun_s16(vqaddq_s16(y_scaled_hi, b_chroma_hi)), ); - vst3q_u8( - rgb_out.as_mut_ptr().add(x * 3), - uint8x16x3_t(r_u8, g_u8, b_u8), - ); + if ALPHA { + vst4q_u8( + out.as_mut_ptr().add(x * 4), + uint8x16x4_t(r_u8, g_u8, b_u8, alpha_u8), + ); + } else { + vst3q_u8(out.as_mut_ptr().add(x * 3), uint8x16x3_t(r_u8, g_u8, b_u8)); + } x += 16; } if x < width { - scalar::yuv_444p16_to_rgb_row( - &y[x..width], - &u[x..width], - &v[x..width], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_u = &u[x..width]; + let tail_v = &v[x..width]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::yuv_444p16_to_rgba_row( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); + } else { + scalar::yuv_444p16_to_rgb_row(tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range); + } } } } @@ -3233,6 +3366,8 @@ pub(crate) unsafe fn p16_to_rgb_or_rgba_u16_row( /// NEON Pn 4:4:4 high-bit-packed (BITS ∈ {10, 12}) → packed **u8** RGB. /// +/// Thin wrapper over [`p_n_444_to_rgb_or_rgba_row`] with `ALPHA = false`. +/// /// # Safety /// /// 1. NEON must be available on the current CPU. @@ -3247,11 +3382,63 @@ pub(crate) unsafe fn p_n_444_to_rgb_row( width: usize, matrix: ColorMatrix, full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + p_n_444_to_rgb_or_rgba_row::(y, uv_full, rgb_out, width, matrix, full_range); + } +} + +/// NEON Pn 4:4:4 high-bit-packed (BITS ∈ {10, 12}) → packed **8-bit +/// RGBA** (`R, G, B, 0xFF`). +/// +/// Thin wrapper over [`p_n_444_to_rgb_or_rgba_row`] with `ALPHA = true`. +/// +/// # Safety +/// +/// Same as [`p_n_444_to_rgb_row`] but `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "neon")] +pub(crate) unsafe fn p_n_444_to_rgba_row( + y: &[u16], + uv_full: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + p_n_444_to_rgb_or_rgba_row::(y, uv_full, rgba_out, width, matrix, full_range); + } +} + +/// Shared NEON Pn 4:4:4 high-bit-packed kernel for +/// [`p_n_444_to_rgb_row`] (`ALPHA = false`, `vst3q_u8`) and +/// [`p_n_444_to_rgba_row`] (`ALPHA = true`, `vst4q_u8` with constant +/// `0xFF` alpha). +/// +/// # Safety +/// +/// 1. NEON must be available on the current CPU. +/// 2. `y.len() >= width`, `uv_full.len() >= 2 * width`, +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. +/// 3. `BITS` must be one of `{10, 12}`. +#[inline] +#[target_feature(enable = "neon")] +pub(crate) unsafe fn p_n_444_to_rgb_or_rgba_row( + y: &[u16], + uv_full: &[u16], + out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, ) { const { assert!(BITS == 10 || BITS == 12) }; + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert!(y.len() >= width); debug_assert!(uv_full.len() >= 2 * width); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); @@ -3275,6 +3462,7 @@ pub(crate) unsafe fn p_n_444_to_rgb_row( let cgv = vdupq_n_s32(coeffs.g_v()); let cbu = vdupq_n_s32(coeffs.b_u()); let cbv = vdupq_n_s32(coeffs.b_v()); + let alpha_u8 = vdupq_n_u8(0xFF); let mut x = 0usize; while x + 16 <= width { @@ -3343,21 +3531,28 @@ pub(crate) unsafe fn p_n_444_to_rgb_row( vqmovun_s16(vqaddq_s16(y_scaled_hi, r_chroma_hi)), ); - let rgb = uint8x16x3_t(r_u8, g_u8, b_u8); - vst3q_u8(rgb_out.as_mut_ptr().add(x * 3), rgb); + if ALPHA { + vst4q_u8( + out.as_mut_ptr().add(x * 4), + uint8x16x4_t(r_u8, g_u8, b_u8, alpha_u8), + ); + } else { + vst3q_u8(out.as_mut_ptr().add(x * 3), uint8x16x3_t(r_u8, g_u8, b_u8)); + } x += 16; } if x < width { - scalar::p_n_444_to_rgb_row::( - &y[x..width], - &uv_full[x * 2..width * 2], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_uv = &uv_full[x * 2..width * 2]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::p_n_444_to_rgba_row::(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + } else { + scalar::p_n_444_to_rgb_row::(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + } } } } @@ -3490,6 +3685,8 @@ pub(crate) unsafe fn p_n_444_to_rgb_u16_row( /// × u_d` within i32 for u8 output). Mirror `yuv_444p16_to_rgb_row` /// with full-width interleaved UV via `vld2q_u16`. /// +/// Thin wrapper over [`p_n_444_16_to_rgb_or_rgba_row`] with `ALPHA = false`. +/// /// # Safety /// /// 1. NEON must be available. @@ -3505,9 +3702,61 @@ pub(crate) unsafe fn p_n_444_16_to_rgb_row( matrix: ColorMatrix, full_range: bool, ) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + p_n_444_16_to_rgb_or_rgba_row::(y, uv_full, rgb_out, width, matrix, full_range); + } +} + +/// NEON P416 (semi-planar 4:4:4, 16-bit) → packed **8-bit RGBA** +/// (`R, G, B, 0xFF`). Same numerical contract as +/// [`p_n_444_16_to_rgb_row`]. +/// +/// Thin wrapper over [`p_n_444_16_to_rgb_or_rgba_row`] with `ALPHA = true`. +/// +/// # Safety +/// +/// Same as [`p_n_444_16_to_rgb_row`] but `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "neon")] +pub(crate) unsafe fn p_n_444_16_to_rgba_row( + y: &[u16], + uv_full: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + p_n_444_16_to_rgb_or_rgba_row::(y, uv_full, rgba_out, width, matrix, full_range); + } +} + +/// Shared NEON P416 (semi-planar 4:4:4, 16-bit) kernel for +/// [`p_n_444_16_to_rgb_row`] (`ALPHA = false`, `vst3q_u8`) and +/// [`p_n_444_16_to_rgba_row`] (`ALPHA = true`, `vst4q_u8` with constant +/// `0xFF` alpha). +/// +/// # Safety +/// +/// 1. NEON must be available. +/// 2. `y.len() >= width`, `uv_full.len() >= 2 * width`, +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. +#[inline] +#[target_feature(enable = "neon")] +pub(crate) unsafe fn p_n_444_16_to_rgb_or_rgba_row( + y: &[u16], + uv_full: &[u16], + out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert!(y.len() >= width); debug_assert!(uv_full.len() >= 2 * width); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::<16, 8>(full_range); @@ -3526,6 +3775,7 @@ pub(crate) unsafe fn p_n_444_16_to_rgb_row( let cgv = vdupq_n_s32(coeffs.g_v()); let cbu = vdupq_n_s32(coeffs.b_u()); let cbv = vdupq_n_s32(coeffs.b_v()); + let alpha_u8 = vdupq_n_u8(0xFF); let mut x = 0usize; while x + 16 <= width { @@ -3608,22 +3858,27 @@ pub(crate) unsafe fn p_n_444_16_to_rgb_row( vqmovun_s16(vqaddq_s16(y_scaled_hi, b_chroma_hi)), ); - vst3q_u8( - rgb_out.as_mut_ptr().add(x * 3), - uint8x16x3_t(r_u8, g_u8, b_u8), - ); + if ALPHA { + vst4q_u8( + out.as_mut_ptr().add(x * 4), + uint8x16x4_t(r_u8, g_u8, b_u8, alpha_u8), + ); + } else { + vst3q_u8(out.as_mut_ptr().add(x * 3), uint8x16x3_t(r_u8, g_u8, b_u8)); + } x += 16; } if x < width { - scalar::p_n_444_16_to_rgb_row( - &y[x..width], - &uv_full[x * 2..width * 2], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_uv = &uv_full[x * 2..width * 2]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::p_n_444_16_to_rgba_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + } else { + scalar::p_n_444_16_to_rgb_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + } } } } diff --git a/src/row/arch/neon/tests.rs b/src/row/arch/neon/tests.rs index c84627c..7c3b684 100644 --- a/src/row/arch/neon/tests.rs +++ b/src/row/arch/neon/tests.rs @@ -2367,3 +2367,182 @@ fn neon_p416_matches_scalar_tail_widths() { check_p_n_444_16_u16_neon_equivalence(w, ColorMatrix::Bt2020Ncl, true); } } + +// ---- High-bit 4:4:4 u8 RGBA equivalence (Ship 8 Tranche 7b) --------- +// +// Mirrors the 4:2:0 RGBA pattern in PR #25 (Tranche 5a). Each kernel +// family — Yuv444p_n (BITS-generic), Yuv444p16, Pn_444 (BITS-generic), +// Pn_444_16 — has its NEON RGBA kernel byte-pinned against the scalar +// reference at the natural width and a sweep of tail widths. + +fn check_yuv444p_n_u8_neon_rgba_equivalence( + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let y = planar_n_plane::(width, 37); + let u = planar_n_plane::(width, 53); + let v = planar_n_plane::(width, 71); + let mut rgba_scalar = std::vec![0u8; width * 4]; + let mut rgba_neon = std::vec![0u8; width * 4]; + scalar::yuv_444p_n_to_rgba_row::(&y, &u, &v, &mut rgba_scalar, width, matrix, full_range); + unsafe { + yuv_444p_n_to_rgba_row::(&y, &u, &v, &mut rgba_neon, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_neon, + "NEON Yuv444p<{BITS}> → RGBA u8 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +fn check_pn_444_u8_neon_rgba_equivalence( + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let y = high_bit_plane::(width, 37); + let u = high_bit_plane::(width, 53); + let v = high_bit_plane::(width, 71); + let uv = interleave_uv(&u, &v); + let mut rgba_scalar = std::vec![0u8; width * 4]; + let mut rgba_neon = std::vec![0u8; width * 4]; + scalar::p_n_444_to_rgba_row::(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + unsafe { + p_n_444_to_rgba_row::(&y, &uv, &mut rgba_neon, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_neon, + "NEON Pn4:4:4<{BITS}> → RGBA u8 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +fn check_yuv444p16_u8_neon_rgba_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { + let y = p16_plane_neon(width, 37); + let u = p16_plane_neon(width, 53); + let v = p16_plane_neon(width, 71); + let mut rgba_scalar = std::vec![0u8; width * 4]; + let mut rgba_neon = std::vec![0u8; width * 4]; + scalar::yuv_444p16_to_rgba_row(&y, &u, &v, &mut rgba_scalar, width, matrix, full_range); + unsafe { + yuv_444p16_to_rgba_row(&y, &u, &v, &mut rgba_neon, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_neon, + "NEON Yuv444p16 → RGBA u8 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +fn check_p_n_444_16_u8_neon_rgba_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { + let y = p16_plane_neon(width, 37); + let u = p16_plane_neon(width, 53); + let v = p16_plane_neon(width, 71); + let uv = interleave_uv(&u, &v); + let mut rgba_scalar = std::vec![0u8; width * 4]; + let mut rgba_neon = std::vec![0u8; width * 4]; + scalar::p_n_444_16_to_rgba_row(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + unsafe { + p_n_444_16_to_rgba_row(&y, &uv, &mut rgba_neon, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_neon, + "NEON P416 → RGBA u8 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +#[test] +#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] +fn neon_yuv444p_n_rgba_matches_scalar_all_bits() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_yuv444p_n_u8_neon_rgba_equivalence::<9>(16, m, full); + check_yuv444p_n_u8_neon_rgba_equivalence::<10>(16, m, full); + check_yuv444p_n_u8_neon_rgba_equivalence::<12>(16, m, full); + check_yuv444p_n_u8_neon_rgba_equivalence::<14>(16, m, full); + } + } +} + +#[test] +#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] +fn neon_yuv444p_n_rgba_matches_scalar_tail_and_widths() { + for w in [17usize, 31, 47, 63, 1920, 1922] { + check_yuv444p_n_u8_neon_rgba_equivalence::<9>(w, ColorMatrix::Bt601, false); + check_yuv444p_n_u8_neon_rgba_equivalence::<10>(w, ColorMatrix::Bt709, true); + check_yuv444p_n_u8_neon_rgba_equivalence::<12>(w, ColorMatrix::Bt2020Ncl, false); + check_yuv444p_n_u8_neon_rgba_equivalence::<14>(w, ColorMatrix::YCgCo, true); + } +} + +#[test] +#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] +fn neon_pn_444_rgba_matches_scalar_all_bits() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_pn_444_u8_neon_rgba_equivalence::<10>(16, m, full); + check_pn_444_u8_neon_rgba_equivalence::<12>(16, m, full); + } + } +} + +#[test] +#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] +fn neon_pn_444_rgba_matches_scalar_tail_and_widths() { + for w in [17usize, 31, 47, 63, 1920, 1922] { + check_pn_444_u8_neon_rgba_equivalence::<10>(w, ColorMatrix::Bt601, false); + check_pn_444_u8_neon_rgba_equivalence::<12>(w, ColorMatrix::Bt709, true); + } +} + +#[test] +#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] +fn neon_yuv444p16_rgba_matches_scalar_all_matrices() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_yuv444p16_u8_neon_rgba_equivalence(16, m, full); + } + } + for w in [17usize, 31, 47, 63, 1920, 1922] { + check_yuv444p16_u8_neon_rgba_equivalence(w, ColorMatrix::Bt709, false); + } +} + +#[test] +#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] +fn neon_p416_rgba_matches_scalar_all_matrices() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_p_n_444_16_u8_neon_rgba_equivalence(16, m, full); + } + } + for w in [17usize, 31, 47, 63, 1920, 1922] { + check_p_n_444_16_u8_neon_rgba_equivalence(w, ColorMatrix::Bt709, false); + } +} diff --git a/src/row/arch/wasm_simd128.rs b/src/row/arch/wasm_simd128.rs index 19aed79..a4a94cd 100644 --- a/src/row/arch/wasm_simd128.rs +++ b/src/row/arch/wasm_simd128.rs @@ -639,6 +639,8 @@ fn clamp_u16_max_wasm(v: v128, zero_v: v128, max_v: v128) -> v128 { /// WASM simd128 YUV 4:4:4 planar 10/12/14-bit → packed **u8** RGB. /// Const-generic over `BITS ∈ {10, 12, 14}`. Block size 16 pixels. /// +/// Thin wrapper over [`yuv_444p_n_to_rgb_or_rgba_row`] with `ALPHA = false`. +/// /// # Safety /// /// 1. **simd128 must be enabled at compile time.** @@ -654,12 +656,67 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_row( width: usize, matrix: ColorMatrix, full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + yuv_444p_n_to_rgb_or_rgba_row::(y, u, v, rgb_out, width, matrix, full_range); + } +} + +/// WASM simd128 YUV 4:4:4 planar 10/12/14-bit → packed **8-bit RGBA** +/// (`R, G, B, 0xFF`). Same numerical contract as +/// [`yuv_444p_n_to_rgb_row`]. +/// +/// Thin wrapper over [`yuv_444p_n_to_rgb_or_rgba_row`] with `ALPHA = true`. +/// +/// # Safety +/// +/// Same as [`yuv_444p_n_to_rgb_row`] but `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "simd128")] +pub(crate) unsafe fn yuv_444p_n_to_rgba_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + yuv_444p_n_to_rgb_or_rgba_row::(y, u, v, rgba_out, width, matrix, full_range); + } +} + +/// Shared WASM simd128 high-bit-depth YUV 4:4:4 kernel for +/// [`yuv_444p_n_to_rgb_row`] (`ALPHA = false`, `write_rgb_16`) and +/// [`yuv_444p_n_to_rgba_row`] (`ALPHA = true`, `write_rgba_16` with +/// constant `0xFF` alpha vector). +/// +/// # Safety +/// +/// 1. **simd128 must be enabled at compile time.** +/// 2. `y.len() >= width`, `u.len() >= width`, `v.len() >= width`, +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. +/// 3. `BITS` must be one of `{9, 10, 12, 14}`. +#[inline] +#[target_feature(enable = "simd128")] +pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_row( + y: &[u16], + u: &[u16], + v: &[u16], + out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, ) { const { assert!(BITS == 9 || BITS == 10 || BITS == 12 || BITS == 14) }; + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert!(y.len() >= width); debug_assert!(u.len() >= width); debug_assert!(v.len() >= width); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); @@ -679,6 +736,7 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_row( let cgv = i32x4_splat(coeffs.g_v()); let cbu = i32x4_splat(coeffs.b_u()); let cbv = i32x4_splat(coeffs.b_v()); + let alpha_u8 = u8x16_splat(0xFF); let mut x = 0usize; while x + 16 <= width { @@ -735,21 +793,30 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_row( let g_u8 = u8x16_narrow_i16x8(g_lo, g_hi); let r_u8 = u8x16_narrow_i16x8(r_lo, r_hi); - write_rgb_16(r_u8, g_u8, b_u8, rgb_out.as_mut_ptr().add(x * 3)); + if ALPHA { + write_rgba_16(r_u8, g_u8, b_u8, alpha_u8, out.as_mut_ptr().add(x * 4)); + } else { + write_rgb_16(r_u8, g_u8, b_u8, out.as_mut_ptr().add(x * 3)); + } x += 16; } if x < width { - scalar::yuv_444p_n_to_rgb_row::( - &y[x..width], - &u[x..width], - &v[x..width], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_u = &u[x..width]; + let tail_v = &v[x..width]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::yuv_444p_n_to_rgba_row::( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); + } else { + scalar::yuv_444p_n_to_rgb_row::( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); + } } } } @@ -872,6 +939,8 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_u16_row( /// WASM simd128 YUV 4:4:4 planar **16-bit** → packed **u8** RGB. /// Stays on the i32 Q15 pipeline. 16 pixels per iter. /// +/// Thin wrapper over [`yuv_444p16_to_rgb_or_rgba_row`] with `ALPHA = false`. +/// /// # Safety /// /// 1. **simd128 must be enabled at compile time.** @@ -888,10 +957,63 @@ pub(crate) unsafe fn yuv_444p16_to_rgb_row( matrix: ColorMatrix, full_range: bool, ) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + yuv_444p16_to_rgb_or_rgba_row::(y, u, v, rgb_out, width, matrix, full_range); + } +} + +/// WASM simd128 YUV 4:4:4 planar **16-bit** → packed **8-bit RGBA** +/// (`R, G, B, 0xFF`). +/// +/// Thin wrapper over [`yuv_444p16_to_rgb_or_rgba_row`] with `ALPHA = true`. +/// +/// # Safety +/// +/// Same as [`yuv_444p16_to_rgb_row`] but `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "simd128")] +pub(crate) unsafe fn yuv_444p16_to_rgba_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + yuv_444p16_to_rgb_or_rgba_row::(y, u, v, rgba_out, width, matrix, full_range); + } +} + +/// Shared WASM simd128 16-bit YUV 4:4:4 kernel for +/// [`yuv_444p16_to_rgb_row`] (`ALPHA = false`, `write_rgb_16`) and +/// [`yuv_444p16_to_rgba_row`] (`ALPHA = true`, `write_rgba_16` with +/// constant `0xFF` alpha). +/// +/// # Safety +/// +/// 1. **simd128 must be enabled at compile time.** +/// 2. `y.len() >= width`, `u.len() >= width`, `v.len() >= width`, +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. +#[inline] +#[target_feature(enable = "simd128")] +pub(crate) unsafe fn yuv_444p16_to_rgb_or_rgba_row( + y: &[u16], + u: &[u16], + v: &[u16], + out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert!(y.len() >= width); debug_assert!(u.len() >= width); debug_assert!(v.len() >= width); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::<16, 8>(full_range); @@ -909,6 +1031,7 @@ pub(crate) unsafe fn yuv_444p16_to_rgb_row( let cgv = i32x4_splat(coeffs.g_v()); let cbu = i32x4_splat(coeffs.b_u()); let cbv = i32x4_splat(coeffs.b_v()); + let alpha_u8 = u8x16_splat(0xFF); let mut x = 0usize; while x + 16 <= width { @@ -963,20 +1086,27 @@ pub(crate) unsafe fn yuv_444p16_to_rgb_row( let g_u8 = u8x16_narrow_i16x8(g_lo, g_hi); let b_u8 = u8x16_narrow_i16x8(b_lo, b_hi); - write_rgb_16(r_u8, g_u8, b_u8, rgb_out.as_mut_ptr().add(x * 3)); + if ALPHA { + write_rgba_16(r_u8, g_u8, b_u8, alpha_u8, out.as_mut_ptr().add(x * 4)); + } else { + write_rgb_16(r_u8, g_u8, b_u8, out.as_mut_ptr().add(x * 3)); + } x += 16; } if x < width { - scalar::yuv_444p16_to_rgb_row( - &y[x..width], - &u[x..width], - &v[x..width], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_u = &u[x..width]; + let tail_v = &v[x..width]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::yuv_444p16_to_rgba_row( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); + } else { + scalar::yuv_444p16_to_rgb_row(tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range); + } } } } @@ -3327,6 +3457,8 @@ pub(crate) unsafe fn p16_to_rgb_or_rgba_u16_row( /// wasm simd128 Pn 4:4:4 high-bit-packed (BITS ∈ {10, 12}) → packed /// **u8** RGB. /// +/// Thin wrapper over [`p_n_444_to_rgb_or_rgba_row`] with `ALPHA = false`. +/// /// # Safety /// /// 1. simd128 must be enabled at compile time. @@ -3341,11 +3473,63 @@ pub(crate) unsafe fn p_n_444_to_rgb_row( width: usize, matrix: ColorMatrix, full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + p_n_444_to_rgb_or_rgba_row::(y, uv_full, rgb_out, width, matrix, full_range); + } +} + +/// wasm simd128 Pn 4:4:4 high-bit-packed (BITS ∈ {10, 12}) → packed +/// **8-bit RGBA** (`R, G, B, 0xFF`). +/// +/// Thin wrapper over [`p_n_444_to_rgb_or_rgba_row`] with `ALPHA = true`. +/// +/// # Safety +/// +/// Same as [`p_n_444_to_rgb_row`] but `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "simd128")] +pub(crate) unsafe fn p_n_444_to_rgba_row( + y: &[u16], + uv_full: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + p_n_444_to_rgb_or_rgba_row::(y, uv_full, rgba_out, width, matrix, full_range); + } +} + +/// Shared wasm simd128 Pn 4:4:4 high-bit-packed kernel for +/// [`p_n_444_to_rgb_row`] (`ALPHA = false`, `write_rgb_16`) and +/// [`p_n_444_to_rgba_row`] (`ALPHA = true`, `write_rgba_16` with +/// constant `0xFF` alpha). +/// +/// # Safety +/// +/// 1. simd128 must be enabled at compile time. +/// 2. `y.len() >= width`, `uv_full.len() >= 2 * width`, +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. +/// 3. `BITS` must be one of `{10, 12}`. +#[inline] +#[target_feature(enable = "simd128")] +pub(crate) unsafe fn p_n_444_to_rgb_or_rgba_row( + y: &[u16], + uv_full: &[u16], + out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, ) { const { assert!(BITS == 10 || BITS == 12) }; + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert!(y.len() >= width); debug_assert!(uv_full.len() >= 2 * width); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); @@ -3364,6 +3548,7 @@ pub(crate) unsafe fn p_n_444_to_rgb_row( let cgv = i32x4_splat(coeffs.g_v()); let cbu = i32x4_splat(coeffs.b_u()); let cbv = i32x4_splat(coeffs.b_v()); + let alpha_u8 = u8x16_splat(0xFF); let shr = (16 - BITS) as u32; @@ -3424,20 +3609,25 @@ pub(crate) unsafe fn p_n_444_to_rgb_row( let g_u8 = u8x16_narrow_i16x8(g_lo, g_hi); let r_u8 = u8x16_narrow_i16x8(r_lo, r_hi); - write_rgb_16(r_u8, g_u8, b_u8, rgb_out.as_mut_ptr().add(x * 3)); + if ALPHA { + write_rgba_16(r_u8, g_u8, b_u8, alpha_u8, out.as_mut_ptr().add(x * 4)); + } else { + write_rgb_16(r_u8, g_u8, b_u8, out.as_mut_ptr().add(x * 3)); + } x += 16; } if x < width { - scalar::p_n_444_to_rgb_row::( - &y[x..width], - &uv_full[x * 2..width * 2], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_uv = &uv_full[x * 2..width * 2]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::p_n_444_to_rgba_row::(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + } else { + scalar::p_n_444_to_rgb_row::(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + } } } } @@ -3566,6 +3756,8 @@ pub(crate) unsafe fn p_n_444_to_rgb_u16_row( /// wasm simd128 P416 (semi-planar 4:4:4, 16-bit) → packed **u8** RGB. /// 16 pixels per iter. /// +/// Thin wrapper over [`p_n_444_16_to_rgb_or_rgba_row`] with `ALPHA = false`. +/// /// # Safety /// /// 1. simd128 must be enabled at compile time. @@ -3581,9 +3773,59 @@ pub(crate) unsafe fn p_n_444_16_to_rgb_row( matrix: ColorMatrix, full_range: bool, ) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + p_n_444_16_to_rgb_or_rgba_row::(y, uv_full, rgb_out, width, matrix, full_range); + } +} + +/// wasm simd128 P416 (semi-planar 4:4:4, 16-bit) → packed **8-bit +/// RGBA** (`R, G, B, 0xFF`). +/// +/// Thin wrapper over [`p_n_444_16_to_rgb_or_rgba_row`] with `ALPHA = true`. +/// +/// # Safety +/// +/// Same as [`p_n_444_16_to_rgb_row`] but `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "simd128")] +pub(crate) unsafe fn p_n_444_16_to_rgba_row( + y: &[u16], + uv_full: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + p_n_444_16_to_rgb_or_rgba_row::(y, uv_full, rgba_out, width, matrix, full_range); + } +} + +/// Shared wasm simd128 P416 kernel for [`p_n_444_16_to_rgb_row`] +/// (`ALPHA = false`, `write_rgb_16`) and [`p_n_444_16_to_rgba_row`] +/// (`ALPHA = true`, `write_rgba_16` with constant `0xFF` alpha). +/// +/// # Safety +/// +/// 1. simd128 must be enabled at compile time. +/// 2. `y.len() >= width`, `uv_full.len() >= 2 * width`, +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. +#[inline] +#[target_feature(enable = "simd128")] +pub(crate) unsafe fn p_n_444_16_to_rgb_or_rgba_row( + y: &[u16], + uv_full: &[u16], + out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert!(y.len() >= width); debug_assert!(uv_full.len() >= 2 * width); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::<16, 8>(full_range); @@ -3601,6 +3843,7 @@ pub(crate) unsafe fn p_n_444_16_to_rgb_row( let cgv = i32x4_splat(coeffs.g_v()); let cbu = i32x4_splat(coeffs.b_u()); let cbv = i32x4_splat(coeffs.b_v()); + let alpha_u8 = u8x16_splat(0xFF); let mut x = 0usize; while x + 16 <= width { @@ -3654,19 +3897,24 @@ pub(crate) unsafe fn p_n_444_16_to_rgb_row( let g_u8 = u8x16_narrow_i16x8(g_lo, g_hi); let b_u8 = u8x16_narrow_i16x8(b_lo, b_hi); - write_rgb_16(r_u8, g_u8, b_u8, rgb_out.as_mut_ptr().add(x * 3)); + if ALPHA { + write_rgba_16(r_u8, g_u8, b_u8, alpha_u8, out.as_mut_ptr().add(x * 4)); + } else { + write_rgb_16(r_u8, g_u8, b_u8, out.as_mut_ptr().add(x * 3)); + } x += 16; } if x < width { - scalar::p_n_444_16_to_rgb_row( - &y[x..width], - &uv_full[x * 2..width * 2], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_uv = &uv_full[x * 2..width * 2]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::p_n_444_16_to_rgba_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + } else { + scalar::p_n_444_16_to_rgb_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + } } } } diff --git a/src/row/arch/wasm_simd128/tests.rs b/src/row/arch/wasm_simd128/tests.rs index 21c9489..f7529f3 100644 --- a/src/row/arch/wasm_simd128/tests.rs +++ b/src/row/arch/wasm_simd128/tests.rs @@ -1900,3 +1900,186 @@ fn simd128_p416_matches_scalar_tail_widths() { check_p_n_444_16_u16_simd128_equivalence(w, ColorMatrix::Bt2020Ncl, true); } } + +// ---- High-bit 4:4:4 u8 RGBA equivalence (Ship 8 Tranche 7b) --------- +// +// Mirrors the 4:2:0 RGBA pattern in PR #25 (Tranche 5a). Each kernel +// family — Yuv444p_n (BITS-generic), Yuv444p16, Pn_444 (BITS-generic), +// Pn_444_16 — has its wasm simd128 RGBA kernel byte-pinned against the +// scalar reference at the natural width and a sweep of tail widths. +// (Module-level cfg gates these on `target_feature = "simd128"`, so no +// per-test feature guard is needed.) + +fn check_yuv444p_n_u8_simd128_rgba_equivalence( + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let y = planar_n_plane::(width, 37); + let u = planar_n_plane::(width, 53); + let v = planar_n_plane::(width, 71); + let mut rgba_scalar = std::vec![0u8; width * 4]; + let mut rgba_wasm = std::vec![0u8; width * 4]; + scalar::yuv_444p_n_to_rgba_row::(&y, &u, &v, &mut rgba_scalar, width, matrix, full_range); + unsafe { + yuv_444p_n_to_rgba_row::(&y, &u, &v, &mut rgba_wasm, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_wasm, + "wasm simd128 Yuv444p<{BITS}> → RGBA u8 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +fn check_pn_444_u8_simd128_rgba_equivalence( + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let y = high_bit_plane_wasm::(width, 37); + let u = high_bit_plane_wasm::(width, 53); + let v = high_bit_plane_wasm::(width, 71); + let uv = interleave_uv_wasm(&u, &v); + let mut rgba_scalar = std::vec![0u8; width * 4]; + let mut rgba_wasm = std::vec![0u8; width * 4]; + scalar::p_n_444_to_rgba_row::(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + unsafe { + p_n_444_to_rgba_row::(&y, &uv, &mut rgba_wasm, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_wasm, + "wasm simd128 Pn4:4:4<{BITS}> → RGBA u8 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +fn check_yuv444p16_u8_simd128_rgba_equivalence( + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let y = p16_plane_wasm(width, 37); + let u = p16_plane_wasm(width, 53); + let v = p16_plane_wasm(width, 71); + let mut rgba_scalar = std::vec![0u8; width * 4]; + let mut rgba_wasm = std::vec![0u8; width * 4]; + scalar::yuv_444p16_to_rgba_row(&y, &u, &v, &mut rgba_scalar, width, matrix, full_range); + unsafe { + yuv_444p16_to_rgba_row(&y, &u, &v, &mut rgba_wasm, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_wasm, + "wasm simd128 Yuv444p16 → RGBA u8 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +fn check_p_n_444_16_u8_simd128_rgba_equivalence( + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let y = p16_plane_wasm(width, 37); + let u = p16_plane_wasm(width, 53); + let v = p16_plane_wasm(width, 71); + let uv = interleave_uv_wasm(&u, &v); + let mut rgba_scalar = std::vec![0u8; width * 4]; + let mut rgba_wasm = std::vec![0u8; width * 4]; + scalar::p_n_444_16_to_rgba_row(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + unsafe { + p_n_444_16_to_rgba_row(&y, &uv, &mut rgba_wasm, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_wasm, + "wasm simd128 P416 → RGBA u8 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +#[test] +fn simd128_yuv444p_n_rgba_matches_scalar_all_bits() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_yuv444p_n_u8_simd128_rgba_equivalence::<9>(16, m, full); + check_yuv444p_n_u8_simd128_rgba_equivalence::<10>(16, m, full); + check_yuv444p_n_u8_simd128_rgba_equivalence::<12>(16, m, full); + check_yuv444p_n_u8_simd128_rgba_equivalence::<14>(16, m, full); + } + } +} + +#[test] +fn simd128_yuv444p_n_rgba_matches_scalar_tail_and_widths() { + for w in [17usize, 31, 47, 63, 1920, 1922] { + check_yuv444p_n_u8_simd128_rgba_equivalence::<9>(w, ColorMatrix::Bt601, false); + check_yuv444p_n_u8_simd128_rgba_equivalence::<10>(w, ColorMatrix::Bt709, true); + check_yuv444p_n_u8_simd128_rgba_equivalence::<12>(w, ColorMatrix::Bt2020Ncl, false); + check_yuv444p_n_u8_simd128_rgba_equivalence::<14>(w, ColorMatrix::YCgCo, true); + } +} + +#[test] +fn simd128_pn_444_rgba_matches_scalar_all_bits() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_pn_444_u8_simd128_rgba_equivalence::<10>(16, m, full); + check_pn_444_u8_simd128_rgba_equivalence::<12>(16, m, full); + } + } +} + +#[test] +fn simd128_pn_444_rgba_matches_scalar_tail_and_widths() { + for w in [17usize, 31, 47, 63, 1920, 1922] { + check_pn_444_u8_simd128_rgba_equivalence::<10>(w, ColorMatrix::Bt601, false); + check_pn_444_u8_simd128_rgba_equivalence::<12>(w, ColorMatrix::Bt709, true); + } +} + +#[test] +fn simd128_yuv444p16_rgba_matches_scalar_all_matrices() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_yuv444p16_u8_simd128_rgba_equivalence(16, m, full); + } + } + for w in [17usize, 31, 47, 63, 1920, 1922] { + check_yuv444p16_u8_simd128_rgba_equivalence(w, ColorMatrix::Bt709, false); + } +} + +#[test] +fn simd128_p416_rgba_matches_scalar_all_matrices() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_p_n_444_16_u8_simd128_rgba_equivalence(16, m, full); + } + } + for w in [17usize, 31, 47, 63, 1920, 1922] { + check_p_n_444_16_u8_simd128_rgba_equivalence(w, ColorMatrix::Bt709, false); + } +} diff --git a/src/row/arch/x86_avx2.rs b/src/row/arch/x86_avx2.rs index ad52597..10861d5 100644 --- a/src/row/arch/x86_avx2.rs +++ b/src/row/arch/x86_avx2.rs @@ -752,6 +752,8 @@ fn clamp_u16_max_x16(v: __m256i, zero_v: __m256i, max_v: __m256i) -> __m256i { /// AVX2 YUV 4:4:4 planar 10/12/14-bit → packed **u8** RGB. /// Const-generic over `BITS ∈ {10, 12, 14}`. Block size 32 pixels. /// +/// Thin wrapper over [`yuv_444p_n_to_rgb_or_rgba_row`] with `ALPHA = false`. +/// /// # Safety /// /// 1. **AVX2 must be available on the current CPU.** @@ -767,12 +769,67 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_row( width: usize, matrix: ColorMatrix, full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + yuv_444p_n_to_rgb_or_rgba_row::(y, u, v, rgb_out, width, matrix, full_range); + } +} + +/// AVX2 YUV 4:4:4 planar 10/12/14-bit → packed **8-bit RGBA** +/// (`R, G, B, 0xFF`). Same numerical contract as +/// [`yuv_444p_n_to_rgb_row`]. +/// +/// Thin wrapper over [`yuv_444p_n_to_rgb_or_rgba_row`] with `ALPHA = true`. +/// +/// # Safety +/// +/// Same as [`yuv_444p_n_to_rgb_row`] but `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "avx2")] +pub(crate) unsafe fn yuv_444p_n_to_rgba_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + yuv_444p_n_to_rgb_or_rgba_row::(y, u, v, rgba_out, width, matrix, full_range); + } +} + +/// Shared AVX2 high-bit-depth YUV 4:4:4 kernel for +/// [`yuv_444p_n_to_rgb_row`] (`ALPHA = false`, `write_rgb_32`) and +/// [`yuv_444p_n_to_rgba_row`] (`ALPHA = true`, `write_rgba_32` with +/// constant `0xFF` alpha vector). +/// +/// # Safety +/// +/// 1. **AVX2 must be available on the current CPU.** +/// 2. `y.len() >= width`, `u.len() >= width`, `v.len() >= width`, +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. +/// 3. `BITS` must be one of `{9, 10, 12, 14}`. +#[inline] +#[target_feature(enable = "avx2")] +pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_row( + y: &[u16], + u: &[u16], + v: &[u16], + out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, ) { const { assert!(BITS == 9 || BITS == 10 || BITS == 12 || BITS == 14) }; + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert!(y.len() >= width); debug_assert!(u.len() >= width); debug_assert!(v.len() >= width); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); @@ -792,6 +849,7 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_row( let cgv = _mm256_set1_epi32(coeffs.g_v()); let cbu = _mm256_set1_epi32(coeffs.b_u()); let cbv = _mm256_set1_epi32(coeffs.b_v()); + let alpha_u8 = _mm256_set1_epi8(-1); let mut x = 0usize; while x + 32 <= width { @@ -873,21 +931,30 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_row( let g_u8 = narrow_u8x32(g_lo, g_hi); let r_u8 = narrow_u8x32(r_lo, r_hi); - write_rgb_32(r_u8, g_u8, b_u8, rgb_out.as_mut_ptr().add(x * 3)); + if ALPHA { + write_rgba_32(r_u8, g_u8, b_u8, alpha_u8, out.as_mut_ptr().add(x * 4)); + } else { + write_rgb_32(r_u8, g_u8, b_u8, out.as_mut_ptr().add(x * 3)); + } x += 32; } if x < width { - scalar::yuv_444p_n_to_rgb_row::( - &y[x..width], - &u[x..width], - &v[x..width], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_u = &u[x..width]; + let tail_v = &v[x..width]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::yuv_444p_n_to_rgba_row::( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); + } else { + scalar::yuv_444p_n_to_rgb_row::( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); + } } } } @@ -1057,6 +1124,8 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_u16_row( /// the i32 Q15 pipeline — output-range scaling keeps `coeff × u_d` /// within i32 for u8 output. 32 pixels per iter. /// +/// Thin wrapper over [`yuv_444p16_to_rgb_or_rgba_row`] with `ALPHA = false`. +/// /// # Safety /// /// 1. **AVX2 must be available.** @@ -1073,10 +1142,63 @@ pub(crate) unsafe fn yuv_444p16_to_rgb_row( matrix: ColorMatrix, full_range: bool, ) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + yuv_444p16_to_rgb_or_rgba_row::(y, u, v, rgb_out, width, matrix, full_range); + } +} + +/// AVX2 YUV 4:4:4 planar **16-bit** → packed **8-bit RGBA** +/// (`R, G, B, 0xFF`). Same numerical contract as +/// [`yuv_444p16_to_rgb_row`]. +/// +/// Thin wrapper over [`yuv_444p16_to_rgb_or_rgba_row`] with `ALPHA = true`. +/// +/// # Safety +/// +/// Same as [`yuv_444p16_to_rgb_row`] but `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "avx2")] +pub(crate) unsafe fn yuv_444p16_to_rgba_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + yuv_444p16_to_rgb_or_rgba_row::(y, u, v, rgba_out, width, matrix, full_range); + } +} + +/// Shared AVX2 16-bit YUV 4:4:4 kernel for [`yuv_444p16_to_rgb_row`] +/// (`ALPHA = false`, `write_rgb_32`) and [`yuv_444p16_to_rgba_row`] +/// (`ALPHA = true`, `write_rgba_32` with constant `0xFF` alpha). +/// +/// # Safety +/// +/// 1. **AVX2 must be available.** +/// 2. `y.len() >= width`, `u.len() >= width`, `v.len() >= width`, +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. +#[inline] +#[target_feature(enable = "avx2")] +pub(crate) unsafe fn yuv_444p16_to_rgb_or_rgba_row( + y: &[u16], + u: &[u16], + v: &[u16], + out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert!(y.len() >= width); debug_assert!(u.len() >= width); debug_assert!(v.len() >= width); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::<16, 8>(full_range); @@ -1094,6 +1216,7 @@ pub(crate) unsafe fn yuv_444p16_to_rgb_row( let cgv = _mm256_set1_epi32(coeffs.g_v()); let cbu = _mm256_set1_epi32(coeffs.b_u()); let cbv = _mm256_set1_epi32(coeffs.b_v()); + let alpha_u8 = _mm256_set1_epi8(-1); let mut x = 0usize; while x + 32 <= width { @@ -1172,20 +1295,27 @@ pub(crate) unsafe fn yuv_444p16_to_rgb_row( let g_u8 = narrow_u8x32(g_lo, g_hi); let b_u8 = narrow_u8x32(b_lo, b_hi); - write_rgb_32(r_u8, g_u8, b_u8, rgb_out.as_mut_ptr().add(x * 3)); + if ALPHA { + write_rgba_32(r_u8, g_u8, b_u8, alpha_u8, out.as_mut_ptr().add(x * 4)); + } else { + write_rgb_32(r_u8, g_u8, b_u8, out.as_mut_ptr().add(x * 3)); + } x += 32; } if x < width { - scalar::yuv_444p16_to_rgb_row( - &y[x..width], - &u[x..width], - &v[x..width], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_u = &u[x..width]; + let tail_v = &v[x..width]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::yuv_444p16_to_rgba_row( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); + } else { + scalar::yuv_444p16_to_rgb_row(tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range); + } } } } @@ -3624,6 +3754,8 @@ pub(crate) unsafe fn p16_to_rgb_or_rgba_u16_row( /// AVX2 Pn 4:4:4 high-bit-packed (BITS ∈ {10, 12}) → packed **u8** RGB. /// +/// Thin wrapper over [`p_n_444_to_rgb_or_rgba_row`] with `ALPHA = false`. +/// /// # Safety /// /// 1. AVX2 must be available on the current CPU. @@ -3638,11 +3770,63 @@ pub(crate) unsafe fn p_n_444_to_rgb_row( width: usize, matrix: ColorMatrix, full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + p_n_444_to_rgb_or_rgba_row::(y, uv_full, rgb_out, width, matrix, full_range); + } +} + +/// AVX2 Pn 4:4:4 high-bit-packed (BITS ∈ {10, 12}) → packed **8-bit +/// RGBA** (`R, G, B, 0xFF`). +/// +/// Thin wrapper over [`p_n_444_to_rgb_or_rgba_row`] with `ALPHA = true`. +/// +/// # Safety +/// +/// Same as [`p_n_444_to_rgb_row`] but `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "avx2")] +pub(crate) unsafe fn p_n_444_to_rgba_row( + y: &[u16], + uv_full: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + p_n_444_to_rgb_or_rgba_row::(y, uv_full, rgba_out, width, matrix, full_range); + } +} + +/// Shared AVX2 Pn 4:4:4 high-bit-packed kernel for +/// [`p_n_444_to_rgb_row`] (`ALPHA = false`, `write_rgb_32`) and +/// [`p_n_444_to_rgba_row`] (`ALPHA = true`, `write_rgba_32` with +/// constant `0xFF` alpha). +/// +/// # Safety +/// +/// 1. AVX2 must be available on the current CPU. +/// 2. `y.len() >= width`, `uv_full.len() >= 2 * width`, +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. +/// 3. `BITS` must be one of `{10, 12}`. +#[inline] +#[target_feature(enable = "avx2")] +pub(crate) unsafe fn p_n_444_to_rgb_or_rgba_row( + y: &[u16], + uv_full: &[u16], + out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, ) { const { assert!(BITS == 10 || BITS == 12) }; + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert!(y.len() >= width); debug_assert!(uv_full.len() >= 2 * width); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); @@ -3662,6 +3846,7 @@ pub(crate) unsafe fn p_n_444_to_rgb_row( let cgv = _mm256_set1_epi32(coeffs.g_v()); let cbu = _mm256_set1_epi32(coeffs.b_u()); let cbv = _mm256_set1_epi32(coeffs.b_v()); + let alpha_u8 = _mm256_set1_epi8(-1); let mut x = 0usize; while x + 32 <= width { @@ -3745,20 +3930,25 @@ pub(crate) unsafe fn p_n_444_to_rgb_row( let g_u8 = narrow_u8x32(g_lo, g_hi); let r_u8 = narrow_u8x32(r_lo, r_hi); - write_rgb_32(r_u8, g_u8, b_u8, rgb_out.as_mut_ptr().add(x * 3)); + if ALPHA { + write_rgba_32(r_u8, g_u8, b_u8, alpha_u8, out.as_mut_ptr().add(x * 4)); + } else { + write_rgb_32(r_u8, g_u8, b_u8, out.as_mut_ptr().add(x * 3)); + } x += 32; } if x < width { - scalar::p_n_444_to_rgb_row::( - &y[x..width], - &uv_full[x * 2..width * 2], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_uv = &uv_full[x * 2..width * 2]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::p_n_444_to_rgba_row::(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + } else { + scalar::p_n_444_to_rgb_row::(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + } } } } @@ -3930,6 +4120,8 @@ pub(crate) unsafe fn p_n_444_to_rgb_u16_row( /// AVX2 P416 (semi-planar 4:4:4, 16-bit) → packed **u8** RGB. /// 32 pixels per iter. /// +/// Thin wrapper over [`p_n_444_16_to_rgb_or_rgba_row`] with `ALPHA = false`. +/// /// # Safety /// /// 1. AVX2 must be available. @@ -3945,9 +4137,59 @@ pub(crate) unsafe fn p_n_444_16_to_rgb_row( matrix: ColorMatrix, full_range: bool, ) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + p_n_444_16_to_rgb_or_rgba_row::(y, uv_full, rgb_out, width, matrix, full_range); + } +} + +/// AVX2 P416 (semi-planar 4:4:4, 16-bit) → packed **8-bit RGBA** +/// (`R, G, B, 0xFF`). +/// +/// Thin wrapper over [`p_n_444_16_to_rgb_or_rgba_row`] with `ALPHA = true`. +/// +/// # Safety +/// +/// Same as [`p_n_444_16_to_rgb_row`] but `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "avx2")] +pub(crate) unsafe fn p_n_444_16_to_rgba_row( + y: &[u16], + uv_full: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + p_n_444_16_to_rgb_or_rgba_row::(y, uv_full, rgba_out, width, matrix, full_range); + } +} + +/// Shared AVX2 P416 kernel for [`p_n_444_16_to_rgb_row`] +/// (`ALPHA = false`, `write_rgb_32`) and [`p_n_444_16_to_rgba_row`] +/// (`ALPHA = true`, `write_rgba_32` with constant `0xFF` alpha). +/// +/// # Safety +/// +/// 1. AVX2 must be available. +/// 2. `y.len() >= width`, `uv_full.len() >= 2 * width`, +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. +#[inline] +#[target_feature(enable = "avx2")] +pub(crate) unsafe fn p_n_444_16_to_rgb_or_rgba_row( + y: &[u16], + uv_full: &[u16], + out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert!(y.len() >= width); debug_assert!(uv_full.len() >= 2 * width); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::<16, 8>(full_range); @@ -3965,6 +4207,7 @@ pub(crate) unsafe fn p_n_444_16_to_rgb_row( let cgv = _mm256_set1_epi32(coeffs.g_v()); let cbu = _mm256_set1_epi32(coeffs.b_u()); let cbv = _mm256_set1_epi32(coeffs.b_v()); + let alpha_u8 = _mm256_set1_epi8(-1); let mut x = 0usize; while x + 32 <= width { @@ -4042,19 +4285,24 @@ pub(crate) unsafe fn p_n_444_16_to_rgb_row( let g_u8 = narrow_u8x32(g_lo, g_hi); let b_u8 = narrow_u8x32(b_lo, b_hi); - write_rgb_32(r_u8, g_u8, b_u8, rgb_out.as_mut_ptr().add(x * 3)); + if ALPHA { + write_rgba_32(r_u8, g_u8, b_u8, alpha_u8, out.as_mut_ptr().add(x * 4)); + } else { + write_rgb_32(r_u8, g_u8, b_u8, out.as_mut_ptr().add(x * 3)); + } x += 32; } if x < width { - scalar::p_n_444_16_to_rgb_row( - &y[x..width], - &uv_full[x * 2..width * 2], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_uv = &uv_full[x * 2..width * 2]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::p_n_444_16_to_rgba_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + } else { + scalar::p_n_444_16_to_rgb_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + } } } } diff --git a/src/row/arch/x86_avx2/tests.rs b/src/row/arch/x86_avx2/tests.rs index 37bb6b8..d7cf589 100644 --- a/src/row/arch/x86_avx2/tests.rs +++ b/src/row/arch/x86_avx2/tests.rs @@ -2141,3 +2141,196 @@ fn avx2_p416_matches_scalar_tail_widths() { check_p_n_444_16_u16_avx2_equivalence(w, ColorMatrix::Bt2020Ncl, true); } } + +// ---- High-bit 4:4:4 u8 RGBA equivalence (Ship 8 Tranche 7b) --------- +// +// Mirrors the 4:2:0 RGBA pattern in PR #25 (Tranche 5a). Each kernel +// family — Yuv444p_n (BITS-generic), Yuv444p16, Pn_444 (BITS-generic), +// Pn_444_16 — has its AVX2 RGBA kernel byte-pinned against the scalar +// reference at the natural width and a sweep of tail widths. Each test +// gates on `is_x86_feature_detected!("avx2")` to stay clean under +// sanitizer/Miri/non-feature-flagged CI runners. + +fn check_yuv444p_n_u8_avx2_rgba_equivalence( + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let y = planar_n_plane::(width, 37); + let u = planar_n_plane::(width, 53); + let v = planar_n_plane::(width, 71); + let mut rgba_scalar = std::vec![0u8; width * 4]; + let mut rgba_simd = std::vec![0u8; width * 4]; + scalar::yuv_444p_n_to_rgba_row::(&y, &u, &v, &mut rgba_scalar, width, matrix, full_range); + unsafe { + yuv_444p_n_to_rgba_row::(&y, &u, &v, &mut rgba_simd, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_simd, + "AVX2 Yuv444p<{BITS}> → RGBA u8 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +fn check_pn_444_u8_avx2_rgba_equivalence( + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let y = high_bit_plane_avx2::(width, 37); + let u = high_bit_plane_avx2::(width, 53); + let v = high_bit_plane_avx2::(width, 71); + let uv = interleave_uv_avx2(&u, &v); + let mut rgba_scalar = std::vec![0u8; width * 4]; + let mut rgba_simd = std::vec![0u8; width * 4]; + scalar::p_n_444_to_rgba_row::(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + unsafe { + p_n_444_to_rgba_row::(&y, &uv, &mut rgba_simd, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_simd, + "AVX2 Pn4:4:4<{BITS}> → RGBA u8 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +fn check_yuv444p16_u8_avx2_rgba_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { + let y = p16_plane_avx2(width, 37); + let u = p16_plane_avx2(width, 53); + let v = p16_plane_avx2(width, 71); + let mut rgba_scalar = std::vec![0u8; width * 4]; + let mut rgba_simd = std::vec![0u8; width * 4]; + scalar::yuv_444p16_to_rgba_row(&y, &u, &v, &mut rgba_scalar, width, matrix, full_range); + unsafe { + yuv_444p16_to_rgba_row(&y, &u, &v, &mut rgba_simd, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_simd, + "AVX2 Yuv444p16 → RGBA u8 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +fn check_p_n_444_16_u8_avx2_rgba_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { + let y = p16_plane_avx2(width, 37); + let u = p16_plane_avx2(width, 53); + let v = p16_plane_avx2(width, 71); + let uv = interleave_uv_avx2(&u, &v); + let mut rgba_scalar = std::vec![0u8; width * 4]; + let mut rgba_simd = std::vec![0u8; width * 4]; + scalar::p_n_444_16_to_rgba_row(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + unsafe { + p_n_444_16_to_rgba_row(&y, &uv, &mut rgba_simd, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_simd, + "AVX2 P416 → RGBA u8 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +#[test] +fn avx2_yuv444p_n_rgba_matches_scalar_all_bits() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_yuv444p_n_u8_avx2_rgba_equivalence::<9>(32, m, full); + check_yuv444p_n_u8_avx2_rgba_equivalence::<10>(32, m, full); + check_yuv444p_n_u8_avx2_rgba_equivalence::<12>(32, m, full); + check_yuv444p_n_u8_avx2_rgba_equivalence::<14>(32, m, full); + } + } +} + +#[test] +fn avx2_yuv444p_n_rgba_matches_scalar_tail_and_widths() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + for w in [17usize, 31, 47, 63, 1920, 1922] { + check_yuv444p_n_u8_avx2_rgba_equivalence::<9>(w, ColorMatrix::Bt601, false); + check_yuv444p_n_u8_avx2_rgba_equivalence::<10>(w, ColorMatrix::Bt709, true); + check_yuv444p_n_u8_avx2_rgba_equivalence::<12>(w, ColorMatrix::Bt2020Ncl, false); + check_yuv444p_n_u8_avx2_rgba_equivalence::<14>(w, ColorMatrix::YCgCo, true); + } +} + +#[test] +fn avx2_pn_444_rgba_matches_scalar_all_bits() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_pn_444_u8_avx2_rgba_equivalence::<10>(32, m, full); + check_pn_444_u8_avx2_rgba_equivalence::<12>(32, m, full); + } + } +} + +#[test] +fn avx2_pn_444_rgba_matches_scalar_tail_and_widths() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + for w in [17usize, 31, 47, 63, 1920, 1922] { + check_pn_444_u8_avx2_rgba_equivalence::<10>(w, ColorMatrix::Bt601, false); + check_pn_444_u8_avx2_rgba_equivalence::<12>(w, ColorMatrix::Bt709, true); + } +} + +#[test] +fn avx2_yuv444p16_rgba_matches_scalar_all_matrices() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_yuv444p16_u8_avx2_rgba_equivalence(32, m, full); + } + } + for w in [17usize, 31, 47, 63, 1920, 1922] { + check_yuv444p16_u8_avx2_rgba_equivalence(w, ColorMatrix::Bt709, false); + } +} + +#[test] +fn avx2_p416_rgba_matches_scalar_all_matrices() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_p_n_444_16_u8_avx2_rgba_equivalence(32, m, full); + } + } + for w in [17usize, 31, 47, 63, 1920, 1922] { + check_p_n_444_16_u8_avx2_rgba_equivalence(w, ColorMatrix::Bt709, false); + } +} diff --git a/src/row/arch/x86_avx512.rs b/src/row/arch/x86_avx512.rs index 8138c70..1c91620 100644 --- a/src/row/arch/x86_avx512.rs +++ b/src/row/arch/x86_avx512.rs @@ -822,6 +822,8 @@ unsafe fn write_quarter_rgba( /// AVX-512 YUV 4:4:4 planar 10/12/14-bit → packed **u8** RGB. /// Const-generic over `BITS ∈ {10, 12, 14}`. Block size 64 pixels. /// +/// Thin wrapper over [`yuv_444p_n_to_rgb_or_rgba_row`] with `ALPHA = false`. +/// /// # Safety /// /// 1. **AVX-512F + AVX-512BW must be available.** @@ -837,12 +839,67 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_row( width: usize, matrix: ColorMatrix, full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + yuv_444p_n_to_rgb_or_rgba_row::(y, u, v, rgb_out, width, matrix, full_range); + } +} + +/// AVX-512 YUV 4:4:4 planar 10/12/14-bit → packed **8-bit RGBA** +/// (`R, G, B, 0xFF`). Same numerical contract as +/// [`yuv_444p_n_to_rgb_row`]. +/// +/// Thin wrapper over [`yuv_444p_n_to_rgb_or_rgba_row`] with `ALPHA = true`. +/// +/// # Safety +/// +/// Same as [`yuv_444p_n_to_rgb_row`] but `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "avx512f,avx512bw")] +pub(crate) unsafe fn yuv_444p_n_to_rgba_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + yuv_444p_n_to_rgb_or_rgba_row::(y, u, v, rgba_out, width, matrix, full_range); + } +} + +/// Shared AVX-512 high-bit-depth YUV 4:4:4 kernel for +/// [`yuv_444p_n_to_rgb_row`] (`ALPHA = false`, `write_rgb_64`) and +/// [`yuv_444p_n_to_rgba_row`] (`ALPHA = true`, `write_rgba_64` with +/// constant `0xFF` alpha vector). +/// +/// # Safety +/// +/// 1. **AVX-512F + AVX-512BW must be available.** +/// 2. `y.len() >= width`, `u.len() >= width`, `v.len() >= width`, +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. +/// 3. `BITS` must be one of `{9, 10, 12, 14}`. +#[inline] +#[target_feature(enable = "avx512f,avx512bw")] +pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_row( + y: &[u16], + u: &[u16], + v: &[u16], + out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, ) { const { assert!(BITS == 9 || BITS == 10 || BITS == 12 || BITS == 14) }; + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert!(y.len() >= width); debug_assert!(u.len() >= width); debug_assert!(v.len() >= width); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); @@ -862,6 +919,7 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_row( let cgv = _mm512_set1_epi32(coeffs.g_v()); let cbu = _mm512_set1_epi32(coeffs.b_u()); let cbv = _mm512_set1_epi32(coeffs.b_v()); + let alpha_u8 = _mm512_set1_epi8(-1); let pack_fixup = _mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7); @@ -957,21 +1015,30 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_row( let g_u8 = narrow_u8x64(g_lo, g_hi, pack_fixup); let r_u8 = narrow_u8x64(r_lo, r_hi, pack_fixup); - write_rgb_64(r_u8, g_u8, b_u8, rgb_out.as_mut_ptr().add(x * 3)); + if ALPHA { + write_rgba_64(r_u8, g_u8, b_u8, alpha_u8, out.as_mut_ptr().add(x * 4)); + } else { + write_rgb_64(r_u8, g_u8, b_u8, out.as_mut_ptr().add(x * 3)); + } x += 64; } if x < width { - scalar::yuv_444p_n_to_rgb_row::( - &y[x..width], - &u[x..width], - &v[x..width], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_u = &u[x..width]; + let tail_v = &v[x..width]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::yuv_444p_n_to_rgba_row::( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); + } else { + scalar::yuv_444p_n_to_rgb_row::( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); + } } } } @@ -1139,6 +1206,8 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_u16_row( /// the i32 Q15 pipeline — output-range scaling keeps `coeff × u_d` /// within i32 for u8 output. 64 pixels per iter. /// +/// Thin wrapper over [`yuv_444p16_to_rgb_or_rgba_row`] with `ALPHA = false`. +/// /// # Safety /// /// 1. **AVX-512F + AVX-512BW must be available.** @@ -1155,10 +1224,62 @@ pub(crate) unsafe fn yuv_444p16_to_rgb_row( matrix: ColorMatrix, full_range: bool, ) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + yuv_444p16_to_rgb_or_rgba_row::(y, u, v, rgb_out, width, matrix, full_range); + } +} + +/// AVX-512 YUV 4:4:4 planar **16-bit** → packed **8-bit RGBA** +/// (`R, G, B, 0xFF`). +/// +/// Thin wrapper over [`yuv_444p16_to_rgb_or_rgba_row`] with `ALPHA = true`. +/// +/// # Safety +/// +/// Same as [`yuv_444p16_to_rgb_row`] but `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "avx512f,avx512bw")] +pub(crate) unsafe fn yuv_444p16_to_rgba_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + yuv_444p16_to_rgb_or_rgba_row::(y, u, v, rgba_out, width, matrix, full_range); + } +} + +/// Shared AVX-512 16-bit YUV 4:4:4 kernel for [`yuv_444p16_to_rgb_row`] +/// (`ALPHA = false`, `write_rgb_64`) and [`yuv_444p16_to_rgba_row`] +/// (`ALPHA = true`, `write_rgba_64` with constant `0xFF` alpha). +/// +/// # Safety +/// +/// 1. **AVX-512F + AVX-512BW must be available.** +/// 2. `y.len() >= width`, `u.len() >= width`, `v.len() >= width`, +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. +#[inline] +#[target_feature(enable = "avx512f,avx512bw")] +pub(crate) unsafe fn yuv_444p16_to_rgb_or_rgba_row( + y: &[u16], + u: &[u16], + v: &[u16], + out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert!(y.len() >= width); debug_assert!(u.len() >= width); debug_assert!(v.len() >= width); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::<16, 8>(full_range); @@ -1176,6 +1297,7 @@ pub(crate) unsafe fn yuv_444p16_to_rgb_row( let cgv = _mm512_set1_epi32(coeffs.g_v()); let cbu = _mm512_set1_epi32(coeffs.b_u()); let cbv = _mm512_set1_epi32(coeffs.b_v()); + let alpha_u8 = _mm512_set1_epi8(-1); let pack_fixup = _mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7); let mut x = 0usize; @@ -1267,20 +1389,27 @@ pub(crate) unsafe fn yuv_444p16_to_rgb_row( let g_u8 = narrow_u8x64(g_lo, g_hi, pack_fixup); let b_u8 = narrow_u8x64(b_lo, b_hi, pack_fixup); - write_rgb_64(r_u8, g_u8, b_u8, rgb_out.as_mut_ptr().add(x * 3)); + if ALPHA { + write_rgba_64(r_u8, g_u8, b_u8, alpha_u8, out.as_mut_ptr().add(x * 4)); + } else { + write_rgb_64(r_u8, g_u8, b_u8, out.as_mut_ptr().add(x * 3)); + } x += 64; } if x < width { - scalar::yuv_444p16_to_rgb_row( - &y[x..width], - &u[x..width], - &v[x..width], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_u = &u[x..width]; + let tail_v = &v[x..width]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::yuv_444p16_to_rgba_row( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); + } else { + scalar::yuv_444p16_to_rgb_row(tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range); + } } } } @@ -3759,6 +3888,8 @@ pub(crate) unsafe fn p16_to_rgb_or_rgba_u16_row( /// 64 pixels per iter via 512-bit vectors; 128 UV elements (= 64 pairs) /// deinterleaved per iter via two `deinterleave_uv_u16_avx512` calls. /// +/// Thin wrapper over [`p_n_444_to_rgb_or_rgba_row`] with `ALPHA = false`. +/// /// # Safety /// /// 1. AVX-512F + AVX-512BW must be available on the current CPU. @@ -3773,11 +3904,63 @@ pub(crate) unsafe fn p_n_444_to_rgb_row( width: usize, matrix: ColorMatrix, full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + p_n_444_to_rgb_or_rgba_row::(y, uv_full, rgb_out, width, matrix, full_range); + } +} + +/// AVX-512 Pn 4:4:4 high-bit-packed (BITS ∈ {10, 12}) → packed **8-bit +/// RGBA** (`R, G, B, 0xFF`). +/// +/// Thin wrapper over [`p_n_444_to_rgb_or_rgba_row`] with `ALPHA = true`. +/// +/// # Safety +/// +/// Same as [`p_n_444_to_rgb_row`] but `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "avx512bw,avx512f")] +pub(crate) unsafe fn p_n_444_to_rgba_row( + y: &[u16], + uv_full: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + p_n_444_to_rgb_or_rgba_row::(y, uv_full, rgba_out, width, matrix, full_range); + } +} + +/// Shared AVX-512 Pn 4:4:4 high-bit-packed kernel for +/// [`p_n_444_to_rgb_row`] (`ALPHA = false`, `write_rgb_64`) and +/// [`p_n_444_to_rgba_row`] (`ALPHA = true`, `write_rgba_64` with +/// constant `0xFF` alpha). +/// +/// # Safety +/// +/// 1. AVX-512F + AVX-512BW must be available on the current CPU. +/// 2. `y.len() >= width`, `uv_full.len() >= 2 * width`, +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. +/// 3. `BITS` must be one of `{10, 12}`. +#[inline] +#[target_feature(enable = "avx512bw,avx512f")] +pub(crate) unsafe fn p_n_444_to_rgb_or_rgba_row( + y: &[u16], + uv_full: &[u16], + out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, ) { const { assert!(BITS == 10 || BITS == 12) }; + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert!(y.len() >= width); debug_assert!(uv_full.len() >= 2 * width); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); @@ -3797,6 +3980,7 @@ pub(crate) unsafe fn p_n_444_to_rgb_row( let cgv = _mm512_set1_epi32(coeffs.g_v()); let cbu = _mm512_set1_epi32(coeffs.b_u()); let cbv = _mm512_set1_epi32(coeffs.b_v()); + let alpha_u8 = _mm512_set1_epi8(-1); let pack_fixup = _mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7); @@ -3894,20 +4078,25 @@ pub(crate) unsafe fn p_n_444_to_rgb_row( let g_u8 = narrow_u8x64(g_lo, g_hi, pack_fixup); let r_u8 = narrow_u8x64(r_lo, r_hi, pack_fixup); - write_rgb_64(r_u8, g_u8, b_u8, rgb_out.as_mut_ptr().add(x * 3)); + if ALPHA { + write_rgba_64(r_u8, g_u8, b_u8, alpha_u8, out.as_mut_ptr().add(x * 4)); + } else { + write_rgb_64(r_u8, g_u8, b_u8, out.as_mut_ptr().add(x * 3)); + } x += 64; } if x < width { - scalar::p_n_444_to_rgb_row::( - &y[x..width], - &uv_full[x * 2..width * 2], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_uv = &uv_full[x * 2..width * 2]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::p_n_444_to_rgba_row::(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + } else { + scalar::p_n_444_to_rgb_row::(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + } } } } @@ -4075,6 +4264,8 @@ pub(crate) unsafe fn p_n_444_to_rgb_u16_row( /// AVX-512 P416 (semi-planar 4:4:4, 16-bit) → packed **u8** RGB. /// 64 pixels per iter; Y stays on i32. /// +/// Thin wrapper over [`p_n_444_16_to_rgb_or_rgba_row`] with `ALPHA = false`. +/// /// # Safety /// /// 1. AVX-512F + AVX-512BW must be available. @@ -4090,9 +4281,59 @@ pub(crate) unsafe fn p_n_444_16_to_rgb_row( matrix: ColorMatrix, full_range: bool, ) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + p_n_444_16_to_rgb_or_rgba_row::(y, uv_full, rgb_out, width, matrix, full_range); + } +} + +/// AVX-512 P416 (semi-planar 4:4:4, 16-bit) → packed **8-bit RGBA** +/// (`R, G, B, 0xFF`). +/// +/// Thin wrapper over [`p_n_444_16_to_rgb_or_rgba_row`] with `ALPHA = true`. +/// +/// # Safety +/// +/// Same as [`p_n_444_16_to_rgb_row`] but `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "avx512bw,avx512f")] +pub(crate) unsafe fn p_n_444_16_to_rgba_row( + y: &[u16], + uv_full: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + p_n_444_16_to_rgb_or_rgba_row::(y, uv_full, rgba_out, width, matrix, full_range); + } +} + +/// Shared AVX-512 P416 kernel for [`p_n_444_16_to_rgb_row`] +/// (`ALPHA = false`, `write_rgb_64`) and [`p_n_444_16_to_rgba_row`] +/// (`ALPHA = true`, `write_rgba_64` with constant `0xFF` alpha). +/// +/// # Safety +/// +/// 1. AVX-512F + AVX-512BW must be available. +/// 2. `y.len() >= width`, `uv_full.len() >= 2 * width`, +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. +#[inline] +#[target_feature(enable = "avx512bw,avx512f")] +pub(crate) unsafe fn p_n_444_16_to_rgb_or_rgba_row( + y: &[u16], + uv_full: &[u16], + out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert!(y.len() >= width); debug_assert!(uv_full.len() >= 2 * width); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::<16, 8>(full_range); @@ -4110,6 +4351,7 @@ pub(crate) unsafe fn p_n_444_16_to_rgb_row( let cgv = _mm512_set1_epi32(coeffs.g_v()); let cbu = _mm512_set1_epi32(coeffs.b_u()); let cbv = _mm512_set1_epi32(coeffs.b_v()); + let alpha_u8 = _mm512_set1_epi8(-1); let pack_fixup = _mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7); @@ -4201,19 +4443,24 @@ pub(crate) unsafe fn p_n_444_16_to_rgb_row( let g_u8 = narrow_u8x64(g_lo, g_hi, pack_fixup); let b_u8 = narrow_u8x64(b_lo, b_hi, pack_fixup); - write_rgb_64(r_u8, g_u8, b_u8, rgb_out.as_mut_ptr().add(x * 3)); + if ALPHA { + write_rgba_64(r_u8, g_u8, b_u8, alpha_u8, out.as_mut_ptr().add(x * 4)); + } else { + write_rgb_64(r_u8, g_u8, b_u8, out.as_mut_ptr().add(x * 3)); + } x += 64; } if x < width { - scalar::p_n_444_16_to_rgb_row( - &y[x..width], - &uv_full[x * 2..width * 2], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_uv = &uv_full[x * 2..width * 2]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::p_n_444_16_to_rgba_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + } else { + scalar::p_n_444_16_to_rgb_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + } } } } diff --git a/src/row/arch/x86_avx512/tests.rs b/src/row/arch/x86_avx512/tests.rs index 44ab725..b66d3a1 100644 --- a/src/row/arch/x86_avx512/tests.rs +++ b/src/row/arch/x86_avx512/tests.rs @@ -2161,3 +2161,200 @@ fn avx512_p416_matches_scalar_tail_widths() { check_p_n_444_16_u16_avx512_equivalence(w, ColorMatrix::Bt2020Ncl, true); } } + +// ---- High-bit 4:4:4 u8 RGBA equivalence (Ship 8 Tranche 7b) --------- +// +// Mirrors the 4:2:0 RGBA pattern in PR #25 (Tranche 5a). Each kernel +// family — Yuv444p_n (BITS-generic), Yuv444p16, Pn_444 (BITS-generic), +// Pn_444_16 — has its AVX-512 RGBA kernel byte-pinned against the +// scalar reference at the natural width and a sweep of tail widths. +// Each test gates on `is_x86_feature_detected!("avx512bw")` to stay +// clean under sanitizer/Miri/non-feature-flagged CI runners. + +fn check_yuv444p_n_u8_avx512_rgba_equivalence( + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let y = planar_n_plane::(width, 37); + let u = planar_n_plane::(width, 53); + let v = planar_n_plane::(width, 71); + let mut rgba_scalar = std::vec![0u8; width * 4]; + let mut rgba_simd = std::vec![0u8; width * 4]; + scalar::yuv_444p_n_to_rgba_row::(&y, &u, &v, &mut rgba_scalar, width, matrix, full_range); + unsafe { + yuv_444p_n_to_rgba_row::(&y, &u, &v, &mut rgba_simd, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_simd, + "AVX-512 Yuv444p<{BITS}> → RGBA u8 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +fn check_pn_444_u8_avx512_rgba_equivalence( + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let y = high_bit_plane_avx512::(width, 37); + let u = high_bit_plane_avx512::(width, 53); + let v = high_bit_plane_avx512::(width, 71); + let uv = interleave_uv_avx512(&u, &v); + let mut rgba_scalar = std::vec![0u8; width * 4]; + let mut rgba_simd = std::vec![0u8; width * 4]; + scalar::p_n_444_to_rgba_row::(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + unsafe { + p_n_444_to_rgba_row::(&y, &uv, &mut rgba_simd, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_simd, + "AVX-512 Pn4:4:4<{BITS}> → RGBA u8 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +fn check_yuv444p16_u8_avx512_rgba_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { + let y = p16_plane_avx512(width, 37); + let u = p16_plane_avx512(width, 53); + let v = p16_plane_avx512(width, 71); + let mut rgba_scalar = std::vec![0u8; width * 4]; + let mut rgba_simd = std::vec![0u8; width * 4]; + scalar::yuv_444p16_to_rgba_row(&y, &u, &v, &mut rgba_scalar, width, matrix, full_range); + unsafe { + yuv_444p16_to_rgba_row(&y, &u, &v, &mut rgba_simd, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_simd, + "AVX-512 Yuv444p16 → RGBA u8 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +fn check_p_n_444_16_u8_avx512_rgba_equivalence( + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let y = p16_plane_avx512(width, 37); + let u = p16_plane_avx512(width, 53); + let v = p16_plane_avx512(width, 71); + let uv = interleave_uv_avx512(&u, &v); + let mut rgba_scalar = std::vec![0u8; width * 4]; + let mut rgba_simd = std::vec![0u8; width * 4]; + scalar::p_n_444_16_to_rgba_row(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + unsafe { + p_n_444_16_to_rgba_row(&y, &uv, &mut rgba_simd, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_simd, + "AVX-512 P416 → RGBA u8 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +#[test] +fn avx512_yuv444p_n_rgba_matches_scalar_all_bits() { + if !std::arch::is_x86_feature_detected!("avx512bw") { + return; + } + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_yuv444p_n_u8_avx512_rgba_equivalence::<9>(64, m, full); + check_yuv444p_n_u8_avx512_rgba_equivalence::<10>(64, m, full); + check_yuv444p_n_u8_avx512_rgba_equivalence::<12>(64, m, full); + check_yuv444p_n_u8_avx512_rgba_equivalence::<14>(64, m, full); + } + } +} + +#[test] +fn avx512_yuv444p_n_rgba_matches_scalar_tail_and_widths() { + if !std::arch::is_x86_feature_detected!("avx512bw") { + return; + } + for w in [17usize, 31, 47, 63, 1920, 1922] { + check_yuv444p_n_u8_avx512_rgba_equivalence::<9>(w, ColorMatrix::Bt601, false); + check_yuv444p_n_u8_avx512_rgba_equivalence::<10>(w, ColorMatrix::Bt709, true); + check_yuv444p_n_u8_avx512_rgba_equivalence::<12>(w, ColorMatrix::Bt2020Ncl, false); + check_yuv444p_n_u8_avx512_rgba_equivalence::<14>(w, ColorMatrix::YCgCo, true); + } +} + +#[test] +fn avx512_pn_444_rgba_matches_scalar_all_bits() { + if !std::arch::is_x86_feature_detected!("avx512bw") { + return; + } + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_pn_444_u8_avx512_rgba_equivalence::<10>(64, m, full); + check_pn_444_u8_avx512_rgba_equivalence::<12>(64, m, full); + } + } +} + +#[test] +fn avx512_pn_444_rgba_matches_scalar_tail_and_widths() { + if !std::arch::is_x86_feature_detected!("avx512bw") { + return; + } + for w in [17usize, 31, 47, 63, 1920, 1922] { + check_pn_444_u8_avx512_rgba_equivalence::<10>(w, ColorMatrix::Bt601, false); + check_pn_444_u8_avx512_rgba_equivalence::<12>(w, ColorMatrix::Bt709, true); + } +} + +#[test] +fn avx512_yuv444p16_rgba_matches_scalar_all_matrices() { + if !std::arch::is_x86_feature_detected!("avx512bw") { + return; + } + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_yuv444p16_u8_avx512_rgba_equivalence(64, m, full); + } + } + for w in [17usize, 31, 47, 63, 1920, 1922] { + check_yuv444p16_u8_avx512_rgba_equivalence(w, ColorMatrix::Bt709, false); + } +} + +#[test] +fn avx512_p416_rgba_matches_scalar_all_matrices() { + if !std::arch::is_x86_feature_detected!("avx512bw") { + return; + } + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_p_n_444_16_u8_avx512_rgba_equivalence(64, m, full); + } + } + for w in [17usize, 31, 47, 63, 1920, 1922] { + check_p_n_444_16_u8_avx512_rgba_equivalence(w, ColorMatrix::Bt709, false); + } +} diff --git a/src/row/arch/x86_sse41.rs b/src/row/arch/x86_sse41.rs index a05ce4b..90c5cc8 100644 --- a/src/row/arch/x86_sse41.rs +++ b/src/row/arch/x86_sse41.rs @@ -1087,6 +1087,8 @@ fn clamp_u16_max(v: __m128i, zero_v: __m128i, max_v: __m128i) -> __m128i { /// skipping the horizontal chroma-duplication step (4:4:4 chroma is /// 1:1 with Y, not paired). /// +/// Thin wrapper over [`yuv_444p_n_to_rgb_or_rgba_row`] with `ALPHA = false`. +/// /// # Numerical contract /// /// Byte-identical to [`scalar::yuv_444p_n_to_rgb_row::`]. @@ -1106,12 +1108,67 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_row( width: usize, matrix: ColorMatrix, full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + yuv_444p_n_to_rgb_or_rgba_row::(y, u, v, rgb_out, width, matrix, full_range); + } +} + +/// SSE4.1 YUV 4:4:4 planar 10/12/14-bit → packed **8-bit RGBA** +/// (`R, G, B, 0xFF`). Same numerical contract as +/// [`yuv_444p_n_to_rgb_row`]. +/// +/// Thin wrapper over [`yuv_444p_n_to_rgb_or_rgba_row`] with `ALPHA = true`. +/// +/// # Safety +/// +/// Same as [`yuv_444p_n_to_rgb_row`] but `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "sse4.1")] +pub(crate) unsafe fn yuv_444p_n_to_rgba_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + yuv_444p_n_to_rgb_or_rgba_row::(y, u, v, rgba_out, width, matrix, full_range); + } +} + +/// Shared SSE4.1 high-bit-depth YUV 4:4:4 kernel for +/// [`yuv_444p_n_to_rgb_row`] (`ALPHA = false`, `write_rgb_16`) and +/// [`yuv_444p_n_to_rgba_row`] (`ALPHA = true`, `write_rgba_16` with +/// constant `0xFF` alpha vector). +/// +/// # Safety +/// +/// 1. **SSE4.1 must be available on the current CPU.** +/// 2. `y.len() >= width`, `u.len() >= width`, `v.len() >= width`, +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. +/// 3. `BITS` must be one of `{9, 10, 12, 14}`. +#[inline] +#[target_feature(enable = "sse4.1")] +pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_row( + y: &[u16], + u: &[u16], + v: &[u16], + out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, ) { const { assert!(BITS == 9 || BITS == 10 || BITS == 12 || BITS == 14) }; + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert!(y.len() >= width); debug_assert!(u.len() >= width); debug_assert!(v.len() >= width); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); @@ -1131,6 +1188,7 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_row( let cgv = _mm_set1_epi32(coeffs.g_v()); let cbu = _mm_set1_epi32(coeffs.b_u()); let cbv = _mm_set1_epi32(coeffs.b_v()); + let alpha_u8 = _mm_set1_epi8(-1); let mut x = 0usize; while x + 16 <= width { @@ -1189,21 +1247,30 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_row( let g_u8 = _mm_packus_epi16(g_lo, g_hi); let r_u8 = _mm_packus_epi16(r_lo, r_hi); - write_rgb_16(r_u8, g_u8, b_u8, rgb_out.as_mut_ptr().add(x * 3)); + if ALPHA { + write_rgba_16(r_u8, g_u8, b_u8, alpha_u8, out.as_mut_ptr().add(x * 4)); + } else { + write_rgb_16(r_u8, g_u8, b_u8, out.as_mut_ptr().add(x * 3)); + } x += 16; } if x < width { - scalar::yuv_444p_n_to_rgb_row::( - &y[x..width], - &u[x..width], - &v[x..width], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_u = &u[x..width]; + let tail_v = &v[x..width]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::yuv_444p_n_to_rgba_row::( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); + } else { + scalar::yuv_444p_n_to_rgb_row::( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); + } } } } @@ -1329,6 +1396,8 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_u16_row( /// the i32 Q15 pipeline — output-range scaling keeps `coeff × u_d` /// within i32 for u8 output. /// +/// Thin wrapper over [`yuv_444p16_to_rgb_or_rgba_row`] with `ALPHA = false`. +/// /// # Safety /// /// 1. **SSE4.1 must be available.** @@ -1345,10 +1414,63 @@ pub(crate) unsafe fn yuv_444p16_to_rgb_row( matrix: ColorMatrix, full_range: bool, ) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + yuv_444p16_to_rgb_or_rgba_row::(y, u, v, rgb_out, width, matrix, full_range); + } +} + +/// SSE4.1 YUV 4:4:4 planar **16-bit** → packed **8-bit RGBA** +/// (`R, G, B, 0xFF`). Same numerical contract as +/// [`yuv_444p16_to_rgb_row`]. +/// +/// Thin wrapper over [`yuv_444p16_to_rgb_or_rgba_row`] with `ALPHA = true`. +/// +/// # Safety +/// +/// Same as [`yuv_444p16_to_rgb_row`] but `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "sse4.1")] +pub(crate) unsafe fn yuv_444p16_to_rgba_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + yuv_444p16_to_rgb_or_rgba_row::(y, u, v, rgba_out, width, matrix, full_range); + } +} + +/// Shared SSE4.1 16-bit YUV 4:4:4 kernel for [`yuv_444p16_to_rgb_row`] +/// (`ALPHA = false`, `write_rgb_16`) and [`yuv_444p16_to_rgba_row`] +/// (`ALPHA = true`, `write_rgba_16` with constant `0xFF` alpha). +/// +/// # Safety +/// +/// 1. **SSE4.1 must be available.** +/// 2. `y.len() >= width`, `u.len() >= width`, `v.len() >= width`, +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. +#[inline] +#[target_feature(enable = "sse4.1")] +pub(crate) unsafe fn yuv_444p16_to_rgb_or_rgba_row( + y: &[u16], + u: &[u16], + v: &[u16], + out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert!(y.len() >= width); debug_assert!(u.len() >= width); debug_assert!(v.len() >= width); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::<16, 8>(full_range); @@ -1366,6 +1488,7 @@ pub(crate) unsafe fn yuv_444p16_to_rgb_row( let cgv = _mm_set1_epi32(coeffs.g_v()); let cbu = _mm_set1_epi32(coeffs.b_u()); let cbv = _mm_set1_epi32(coeffs.b_v()); + let alpha_u8 = _mm_set1_epi8(-1); let mut x = 0usize; while x + 16 <= width { @@ -1420,20 +1543,27 @@ pub(crate) unsafe fn yuv_444p16_to_rgb_row( let g_u8 = _mm_packus_epi16(g_lo, g_hi); let b_u8 = _mm_packus_epi16(b_lo, b_hi); - write_rgb_16(r_u8, g_u8, b_u8, rgb_out.as_mut_ptr().add(x * 3)); + if ALPHA { + write_rgba_16(r_u8, g_u8, b_u8, alpha_u8, out.as_mut_ptr().add(x * 4)); + } else { + write_rgb_16(r_u8, g_u8, b_u8, out.as_mut_ptr().add(x * 3)); + } x += 16; } if x < width { - scalar::yuv_444p16_to_rgb_row( - &y[x..width], - &u[x..width], - &v[x..width], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_u = &u[x..width]; + let tail_v = &v[x..width]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::yuv_444p16_to_rgba_row( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); + } else { + scalar::yuv_444p16_to_rgb_row(tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range); + } } } } @@ -3172,6 +3302,8 @@ pub(crate) unsafe fn p16_to_rgb_or_rgba_u16_row( /// SSE4.1 Pn 4:4:4 high-bit-packed (BITS ∈ {10, 12}) → packed **u8** RGB. /// +/// Thin wrapper over [`p_n_444_to_rgb_or_rgba_row`] with `ALPHA = false`. +/// /// # Safety /// /// 1. SSE4.1 must be available on the current CPU. @@ -3186,11 +3318,63 @@ pub(crate) unsafe fn p_n_444_to_rgb_row( width: usize, matrix: ColorMatrix, full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + p_n_444_to_rgb_or_rgba_row::(y, uv_full, rgb_out, width, matrix, full_range); + } +} + +/// SSE4.1 Pn 4:4:4 high-bit-packed (BITS ∈ {10, 12}) → packed **8-bit +/// RGBA** (`R, G, B, 0xFF`). +/// +/// Thin wrapper over [`p_n_444_to_rgb_or_rgba_row`] with `ALPHA = true`. +/// +/// # Safety +/// +/// Same as [`p_n_444_to_rgb_row`] but `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "sse4.1")] +pub(crate) unsafe fn p_n_444_to_rgba_row( + y: &[u16], + uv_full: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + p_n_444_to_rgb_or_rgba_row::(y, uv_full, rgba_out, width, matrix, full_range); + } +} + +/// Shared SSE4.1 Pn 4:4:4 high-bit-packed kernel for +/// [`p_n_444_to_rgb_row`] (`ALPHA = false`, `write_rgb_16`) and +/// [`p_n_444_to_rgba_row`] (`ALPHA = true`, `write_rgba_16` with +/// constant `0xFF` alpha). +/// +/// # Safety +/// +/// 1. SSE4.1 must be available on the current CPU. +/// 2. `y.len() >= width`, `uv_full.len() >= 2 * width`, +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. +/// 3. `BITS` must be one of `{10, 12}`. +#[inline] +#[target_feature(enable = "sse4.1")] +pub(crate) unsafe fn p_n_444_to_rgb_or_rgba_row( + y: &[u16], + uv_full: &[u16], + out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, ) { const { assert!(BITS == 10 || BITS == 12) }; + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert!(y.len() >= width); debug_assert!(uv_full.len() >= 2 * width); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); @@ -3210,6 +3394,7 @@ pub(crate) unsafe fn p_n_444_to_rgb_row( let cgv = _mm_set1_epi32(coeffs.g_v()); let cbu = _mm_set1_epi32(coeffs.b_u()); let cbv = _mm_set1_epi32(coeffs.b_v()); + let alpha_u8 = _mm_set1_epi8(-1); let mut x = 0usize; while x + 16 <= width { @@ -3268,20 +3453,25 @@ pub(crate) unsafe fn p_n_444_to_rgb_row( let g_u8 = _mm_packus_epi16(g_lo, g_hi); let r_u8 = _mm_packus_epi16(r_lo, r_hi); - write_rgb_16(r_u8, g_u8, b_u8, rgb_out.as_mut_ptr().add(x * 3)); + if ALPHA { + write_rgba_16(r_u8, g_u8, b_u8, alpha_u8, out.as_mut_ptr().add(x * 4)); + } else { + write_rgb_16(r_u8, g_u8, b_u8, out.as_mut_ptr().add(x * 3)); + } x += 16; } if x < width { - scalar::p_n_444_to_rgb_row::( - &y[x..width], - &uv_full[x * 2..width * 2], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_uv = &uv_full[x * 2..width * 2]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::p_n_444_to_rgba_row::(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + } else { + scalar::p_n_444_to_rgb_row::(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + } } } } @@ -3407,6 +3597,8 @@ pub(crate) unsafe fn p_n_444_to_rgb_u16_row( /// within i32 for u8 output). Mirrors `yuv_444p16_to_rgb_row` with /// full-width interleaved UV via `deinterleave_uv_u16`. /// +/// Thin wrapper over [`p_n_444_16_to_rgb_or_rgba_row`] with `ALPHA = false`. +/// /// # Safety /// /// 1. SSE4.1 must be available. @@ -3422,9 +3614,60 @@ pub(crate) unsafe fn p_n_444_16_to_rgb_row( matrix: ColorMatrix, full_range: bool, ) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + p_n_444_16_to_rgb_or_rgba_row::(y, uv_full, rgb_out, width, matrix, full_range); + } +} + +/// SSE4.1 P416 (semi-planar 4:4:4, 16-bit) → packed **8-bit RGBA** +/// (`R, G, B, 0xFF`). Same numerical contract as +/// [`p_n_444_16_to_rgb_row`]. +/// +/// Thin wrapper over [`p_n_444_16_to_rgb_or_rgba_row`] with `ALPHA = true`. +/// +/// # Safety +/// +/// Same as [`p_n_444_16_to_rgb_row`] but `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "sse4.1")] +pub(crate) unsafe fn p_n_444_16_to_rgba_row( + y: &[u16], + uv_full: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + p_n_444_16_to_rgb_or_rgba_row::(y, uv_full, rgba_out, width, matrix, full_range); + } +} + +/// Shared SSE4.1 P416 kernel for [`p_n_444_16_to_rgb_row`] +/// (`ALPHA = false`, `write_rgb_16`) and [`p_n_444_16_to_rgba_row`] +/// (`ALPHA = true`, `write_rgba_16` with constant `0xFF` alpha). +/// +/// # Safety +/// +/// 1. SSE4.1 must be available. +/// 2. `y.len() >= width`, `uv_full.len() >= 2 * width`, +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. +#[inline] +#[target_feature(enable = "sse4.1")] +pub(crate) unsafe fn p_n_444_16_to_rgb_or_rgba_row( + y: &[u16], + uv_full: &[u16], + out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert!(y.len() >= width); debug_assert!(uv_full.len() >= 2 * width); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::<16, 8>(full_range); @@ -3442,6 +3685,7 @@ pub(crate) unsafe fn p_n_444_16_to_rgb_row( let cgv = _mm_set1_epi32(coeffs.g_v()); let cbu = _mm_set1_epi32(coeffs.b_u()); let cbv = _mm_set1_epi32(coeffs.b_v()); + let alpha_u8 = _mm_set1_epi8(-1); let mut x = 0usize; while x + 16 <= width { @@ -3496,19 +3740,24 @@ pub(crate) unsafe fn p_n_444_16_to_rgb_row( let g_u8 = _mm_packus_epi16(g_lo, g_hi); let b_u8 = _mm_packus_epi16(b_lo, b_hi); - write_rgb_16(r_u8, g_u8, b_u8, rgb_out.as_mut_ptr().add(x * 3)); + if ALPHA { + write_rgba_16(r_u8, g_u8, b_u8, alpha_u8, out.as_mut_ptr().add(x * 4)); + } else { + write_rgb_16(r_u8, g_u8, b_u8, out.as_mut_ptr().add(x * 3)); + } x += 16; } if x < width { - scalar::p_n_444_16_to_rgb_row( - &y[x..width], - &uv_full[x * 2..width * 2], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_uv = &uv_full[x * 2..width * 2]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::p_n_444_16_to_rgba_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + } else { + scalar::p_n_444_16_to_rgb_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + } } } } diff --git a/src/row/arch/x86_sse41/tests.rs b/src/row/arch/x86_sse41/tests.rs index f60f081..330a59a 100644 --- a/src/row/arch/x86_sse41/tests.rs +++ b/src/row/arch/x86_sse41/tests.rs @@ -2181,3 +2181,196 @@ fn sse41_p416_matches_scalar_tail_widths() { check_p_n_444_16_u16_sse41_equivalence(w, ColorMatrix::Bt2020Ncl, true); } } + +// ---- High-bit 4:4:4 u8 RGBA equivalence (Ship 8 Tranche 7b) --------- +// +// Mirrors the 4:2:0 RGBA pattern in PR #25 (Tranche 5a). Each kernel +// family — Yuv444p_n (BITS-generic), Yuv444p16, Pn_444 (BITS-generic), +// Pn_444_16 — has its SSE4.1 RGBA kernel byte-pinned against the scalar +// reference at the natural width and a sweep of tail widths. Each test +// gates on `is_x86_feature_detected!("sse4.1")` to stay clean under +// sanitizer/Miri/non-feature-flagged CI runners. + +fn check_yuv444p_n_u8_sse41_rgba_equivalence( + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let y = planar_n_plane::(width, 37); + let u = planar_n_plane::(width, 53); + let v = planar_n_plane::(width, 71); + let mut rgba_scalar = std::vec![0u8; width * 4]; + let mut rgba_simd = std::vec![0u8; width * 4]; + scalar::yuv_444p_n_to_rgba_row::(&y, &u, &v, &mut rgba_scalar, width, matrix, full_range); + unsafe { + yuv_444p_n_to_rgba_row::(&y, &u, &v, &mut rgba_simd, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_simd, + "SSE4.1 Yuv444p<{BITS}> → RGBA u8 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +fn check_pn_444_u8_sse41_rgba_equivalence( + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let y = high_bit_plane_sse41::(width, 37); + let u = high_bit_plane_sse41::(width, 53); + let v = high_bit_plane_sse41::(width, 71); + let uv = interleave_uv_sse41(&u, &v); + let mut rgba_scalar = std::vec![0u8; width * 4]; + let mut rgba_simd = std::vec![0u8; width * 4]; + scalar::p_n_444_to_rgba_row::(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + unsafe { + p_n_444_to_rgba_row::(&y, &uv, &mut rgba_simd, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_simd, + "SSE4.1 Pn4:4:4<{BITS}> → RGBA u8 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +fn check_yuv444p16_u8_sse41_rgba_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { + let y = p16_plane(width, 37); + let u = p16_plane(width, 53); + let v = p16_plane(width, 71); + let mut rgba_scalar = std::vec![0u8; width * 4]; + let mut rgba_simd = std::vec![0u8; width * 4]; + scalar::yuv_444p16_to_rgba_row(&y, &u, &v, &mut rgba_scalar, width, matrix, full_range); + unsafe { + yuv_444p16_to_rgba_row(&y, &u, &v, &mut rgba_simd, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_simd, + "SSE4.1 Yuv444p16 → RGBA u8 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +fn check_p_n_444_16_u8_sse41_rgba_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { + let y = p16_plane(width, 37); + let u = p16_plane(width, 53); + let v = p16_plane(width, 71); + let uv = interleave_uv_sse41(&u, &v); + let mut rgba_scalar = std::vec![0u8; width * 4]; + let mut rgba_simd = std::vec![0u8; width * 4]; + scalar::p_n_444_16_to_rgba_row(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + unsafe { + p_n_444_16_to_rgba_row(&y, &uv, &mut rgba_simd, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_simd, + "SSE4.1 P416 → RGBA u8 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +#[test] +fn sse41_yuv444p_n_rgba_matches_scalar_all_bits() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_yuv444p_n_u8_sse41_rgba_equivalence::<9>(16, m, full); + check_yuv444p_n_u8_sse41_rgba_equivalence::<10>(16, m, full); + check_yuv444p_n_u8_sse41_rgba_equivalence::<12>(16, m, full); + check_yuv444p_n_u8_sse41_rgba_equivalence::<14>(16, m, full); + } + } +} + +#[test] +fn sse41_yuv444p_n_rgba_matches_scalar_tail_and_widths() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + for w in [17usize, 31, 47, 63, 1920, 1922] { + check_yuv444p_n_u8_sse41_rgba_equivalence::<9>(w, ColorMatrix::Bt601, false); + check_yuv444p_n_u8_sse41_rgba_equivalence::<10>(w, ColorMatrix::Bt709, true); + check_yuv444p_n_u8_sse41_rgba_equivalence::<12>(w, ColorMatrix::Bt2020Ncl, false); + check_yuv444p_n_u8_sse41_rgba_equivalence::<14>(w, ColorMatrix::YCgCo, true); + } +} + +#[test] +fn sse41_pn_444_rgba_matches_scalar_all_bits() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_pn_444_u8_sse41_rgba_equivalence::<10>(16, m, full); + check_pn_444_u8_sse41_rgba_equivalence::<12>(16, m, full); + } + } +} + +#[test] +fn sse41_pn_444_rgba_matches_scalar_tail_and_widths() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + for w in [17usize, 31, 47, 63, 1920, 1922] { + check_pn_444_u8_sse41_rgba_equivalence::<10>(w, ColorMatrix::Bt601, false); + check_pn_444_u8_sse41_rgba_equivalence::<12>(w, ColorMatrix::Bt709, true); + } +} + +#[test] +fn sse41_yuv444p16_rgba_matches_scalar_all_matrices() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_yuv444p16_u8_sse41_rgba_equivalence(16, m, full); + } + } + for w in [17usize, 31, 47, 63, 1920, 1922] { + check_yuv444p16_u8_sse41_rgba_equivalence(w, ColorMatrix::Bt709, false); + } +} + +#[test] +fn sse41_p416_rgba_matches_scalar_all_matrices() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_p_n_444_16_u8_sse41_rgba_equivalence(16, m, full); + } + } + for w in [17usize, 31, 47, 63, 1920, 1922] { + check_p_n_444_16_u8_sse41_rgba_equivalence(w, ColorMatrix::Bt709, false); + } +} diff --git a/src/row/mod.rs b/src/row/mod.rs index 1b400e9..cec5c09 100644 --- a/src/row/mod.rs +++ b/src/row/mod.rs @@ -4264,13 +4264,15 @@ pub fn p412_to_rgb_u16_row( p_n_444_to_rgb_u16_row::<12>(y, uv_full, rgb_out, width, matrix, full_range, use_simd); } -// ---- High-bit 4:4:4 RGBA dispatchers (Ship 8 Tranche 7 prep) ---------- +// ---- High-bit 4:4:4 RGBA dispatchers (Ship 8 Tranche 7) --------------- // -// Both u8 and native-depth `u16` RGBA dispatchers route to the scalar -// reference path. SIMD per-arch routes land in the follow-up Ship 8 -// Tranche 7b (u8) and Tranche 7c (u16) PRs; the `use_simd` parameter -// is held in the signature for API stability, but every body is -// `let _ = use_simd;` plus a scalar call until the SIMD wiring lands. +// u8 RGBA dispatchers route to per-arch SIMD kernels (Ship 8 Tranche +// 7b). The native-depth `u16` RGBA dispatchers stay on the scalar +// reference path until the follow-up Ship 8 Tranche 7c PR; the +// `use_simd` parameter is held in their signatures for API stability, +// but their bodies remain `let _ = use_simd;` plus a scalar call +// until the SIMD wiring lands. `use_simd = false` forces the scalar +// reference path on every dispatcher. /// Converts one row of **9-bit** YUV 4:4:4 to packed **8-bit** /// **RGBA** (`R, G, B, 0xFF`; alpha defaults to opaque since the @@ -4299,7 +4301,53 @@ pub fn yuv444p9_to_rgba_row( assert!(v.len() >= width, "v row too short"); assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - let _ = use_simd; // SIMD per-arch routes land in Ship 8 Tranche 7b. + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_444p_n_to_rgba_row::<9>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_444p_n_to_rgba_row::<9>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_444p_n_to_rgba_row::<9>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_444p_n_to_rgba_row::<9>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_444p_n_to_rgba_row::<9>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + scalar::yuv_444p_n_to_rgba_row::<9>(y, u, v, rgba_out, width, matrix, full_range); } @@ -4354,7 +4402,53 @@ pub fn yuv444p10_to_rgba_row( assert!(v.len() >= width, "v row too short"); assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - let _ = use_simd; // SIMD per-arch routes land in Ship 8 Tranche 7b. + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_444p_n_to_rgba_row::<10>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_444p_n_to_rgba_row::<10>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_444p_n_to_rgba_row::<10>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_444p_n_to_rgba_row::<10>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_444p_n_to_rgba_row::<10>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + scalar::yuv_444p_n_to_rgba_row::<10>(y, u, v, rgba_out, width, matrix, full_range); } @@ -4407,7 +4501,53 @@ pub fn yuv444p12_to_rgba_row( assert!(v.len() >= width, "v row too short"); assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - let _ = use_simd; // SIMD per-arch routes land in Ship 8 Tranche 7b. + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_444p_n_to_rgba_row::<12>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_444p_n_to_rgba_row::<12>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_444p_n_to_rgba_row::<12>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_444p_n_to_rgba_row::<12>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_444p_n_to_rgba_row::<12>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + scalar::yuv_444p_n_to_rgba_row::<12>(y, u, v, rgba_out, width, matrix, full_range); } @@ -4460,7 +4600,53 @@ pub fn yuv444p14_to_rgba_row( assert!(v.len() >= width, "v row too short"); assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - let _ = use_simd; // SIMD per-arch routes land in Ship 8 Tranche 7b. + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_444p_n_to_rgba_row::<14>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_444p_n_to_rgba_row::<14>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_444p_n_to_rgba_row::<14>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_444p_n_to_rgba_row::<14>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_444p_n_to_rgba_row::<14>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + scalar::yuv_444p_n_to_rgba_row::<14>(y, u, v, rgba_out, width, matrix, full_range); } @@ -4514,7 +4700,53 @@ pub fn yuv444p16_to_rgba_row( assert!(v.len() >= width, "v row too short"); assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - let _ = use_simd; // SIMD per-arch routes land in Ship 8 Tranche 7b. + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_444p16_to_rgba_row(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_444p16_to_rgba_row(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_444p16_to_rgba_row(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_444p16_to_rgba_row(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_444p16_to_rgba_row(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + scalar::yuv_444p16_to_rgba_row(y, u, v, rgba_out, width, matrix, full_range); } @@ -4567,7 +4799,53 @@ pub fn p410_to_rgba_row( assert!(uv_full.len() >= uv_min, "uv_full row too short"); assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - let _ = use_simd; // SIMD per-arch routes land in Ship 8 Tranche 7b. + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::p_n_444_to_rgba_row::<10>(y, uv_full, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::p_n_444_to_rgba_row::<10>(y, uv_full, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::p_n_444_to_rgba_row::<10>(y, uv_full, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::p_n_444_to_rgba_row::<10>(y, uv_full, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::p_n_444_to_rgba_row::<10>(y, uv_full, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + scalar::p_n_444_to_rgba_row::<10>(y, uv_full, rgba_out, width, matrix, full_range); } @@ -4617,7 +4895,53 @@ pub fn p412_to_rgba_row( assert!(uv_full.len() >= uv_min, "uv_full row too short"); assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - let _ = use_simd; // SIMD per-arch routes land in Ship 8 Tranche 7b. + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::p_n_444_to_rgba_row::<12>(y, uv_full, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::p_n_444_to_rgba_row::<12>(y, uv_full, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::p_n_444_to_rgba_row::<12>(y, uv_full, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::p_n_444_to_rgba_row::<12>(y, uv_full, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::p_n_444_to_rgba_row::<12>(y, uv_full, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + scalar::p_n_444_to_rgba_row::<12>(y, uv_full, rgba_out, width, matrix, full_range); } @@ -4668,7 +4992,53 @@ pub fn p416_to_rgba_row( assert!(uv_full.len() >= uv_min, "uv_full row too short"); assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - let _ = use_simd; // SIMD per-arch routes land in Ship 8 Tranche 7b. + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::p_n_444_16_to_rgba_row(y, uv_full, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::p_n_444_16_to_rgba_row(y, uv_full, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::p_n_444_16_to_rgba_row(y, uv_full, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::p_n_444_16_to_rgba_row(y, uv_full, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::p_n_444_16_to_rgba_row(y, uv_full, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + scalar::p_n_444_16_to_rgba_row(y, uv_full, rgba_out, width, matrix, full_range); } From 22b79073ce1cb3bba1fb9e6814763ce98198b1ca Mon Sep 17 00:00:00 2001 From: uqio <276879906+uqio@users.noreply.github.com> Date: Mon, 27 Apr 2026 16:09:19 +1200 Subject: [PATCH 2/2] docs(simd): fix yuv_444p_n doc to include BITS=9 in supported set MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The 4:4:4 high-bit YUV planar SIMD docs claimed `BITS ∈ {10, 12, 14}` across all 5 backends, but the const-assert in every implementation accepts `BITS == 9 || 10 || 12 || 14` and the `yuv444p9_to_rgba_row` public dispatcher (added in PR #29) instantiates the kernel with `<9>`. The doc string was stale from before BITS=9 was added in Ship 6b. Updates both the const-generic bound (`{10, 12, 14}` → `{9, 10, 12, 14}`) and the prose bit-list (`10/12/14-bit` → `9/10/12/14-bit`) on every 4:4:4 planar SIMD doc — covers the u8 RGB, u8 RGBA (added in this PR), and u16 RGB siblings across NEON, SSE4.1, AVX2, AVX-512, and wasm simd128. 23 lines updated total. Addresses Copilot review comments on PR #30. Also retroactively fixes the matching drift on the u16 RGB and pre-existing u8 RGB docs that Copilot didn't explicitly flag but had identical wording. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/row/arch/neon.rs | 6 +++--- src/row/arch/wasm_simd128.rs | 10 +++++----- src/row/arch/x86_avx2.rs | 10 +++++----- src/row/arch/x86_avx512.rs | 10 +++++----- src/row/arch/x86_sse41.rs | 10 +++++----- 5 files changed, 23 insertions(+), 23 deletions(-) diff --git a/src/row/arch/neon.rs b/src/row/arch/neon.rs index e3079b6..3a6acc7 100644 --- a/src/row/arch/neon.rs +++ b/src/row/arch/neon.rs @@ -699,7 +699,7 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_or_rgba_u16_row v128 { i16x8_min(i16x8_max(v, zero_v), max_v) } -/// WASM simd128 YUV 4:4:4 planar 10/12/14-bit → packed **u8** RGB. -/// Const-generic over `BITS ∈ {10, 12, 14}`. Block size 16 pixels. +/// WASM simd128 YUV 4:4:4 planar 9/10/12/14-bit → packed **u8** RGB. +/// Const-generic over `BITS ∈ {9, 10, 12, 14}`. Block size 16 pixels. /// /// Thin wrapper over [`yuv_444p_n_to_rgb_or_rgba_row`] with `ALPHA = false`. /// @@ -663,7 +663,7 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_row( } } -/// WASM simd128 YUV 4:4:4 planar 10/12/14-bit → packed **8-bit RGBA** +/// WASM simd128 YUV 4:4:4 planar 9/10/12/14-bit → packed **8-bit RGBA** /// (`R, G, B, 0xFF`). Same numerical contract as /// [`yuv_444p_n_to_rgb_row`]. /// @@ -821,8 +821,8 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_row __m256i { unsafe { _mm256_min_epi16(_mm256_max_epi16(v, zero_v), max_v) } } -/// AVX2 YUV 4:4:4 planar 10/12/14-bit → packed **u8** RGB. -/// Const-generic over `BITS ∈ {10, 12, 14}`. Block size 32 pixels. +/// AVX2 YUV 4:4:4 planar 9/10/12/14-bit → packed **u8** RGB. +/// Const-generic over `BITS ∈ {9, 10, 12, 14}`. Block size 32 pixels. /// /// Thin wrapper over [`yuv_444p_n_to_rgb_or_rgba_row`] with `ALPHA = false`. /// @@ -776,7 +776,7 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_row( } } -/// AVX2 YUV 4:4:4 planar 10/12/14-bit → packed **8-bit RGBA** +/// AVX2 YUV 4:4:4 planar 9/10/12/14-bit → packed **8-bit RGBA** /// (`R, G, B, 0xFF`). Same numerical contract as /// [`yuv_444p_n_to_rgb_row`]. /// @@ -959,8 +959,8 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_row( } } -/// AVX-512 YUV 4:4:4 planar 10/12/14-bit → packed **8-bit RGBA** +/// AVX-512 YUV 4:4:4 planar 9/10/12/14-bit → packed **8-bit RGBA** /// (`R, G, B, 0xFF`). Same numerical contract as /// [`yuv_444p_n_to_rgb_row`]. /// @@ -1043,8 +1043,8 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_row __m128i { unsafe { _mm_min_epi16(_mm_max_epi16(v, zero_v), max_v) } } -/// SSE4.1 YUV 4:4:4 planar 10/12/14-bit → packed **u8** RGB. -/// Const-generic over `BITS ∈ {10, 12, 14}`. +/// SSE4.1 YUV 4:4:4 planar 9/10/12/14-bit → packed **u8** RGB. +/// Const-generic over `BITS ∈ {9, 10, 12, 14}`. /// /// Block size: 16 pixels per iteration (same as the 4:2:0 sibling). /// Differs from [`yuv_420p_n_to_rgb_row`] by loading full-width U/V @@ -1115,7 +1115,7 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_row( } } -/// SSE4.1 YUV 4:4:4 planar 10/12/14-bit → packed **8-bit RGBA** +/// SSE4.1 YUV 4:4:4 planar 9/10/12/14-bit → packed **8-bit RGBA** /// (`R, G, B, 0xFF`). Same numerical contract as /// [`yuv_444p_n_to_rgb_row`]. /// @@ -1275,8 +1275,8 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_row