From 0798cf2b1cbdb30b6574e36da6f028550b9e8c4c Mon Sep 17 00:00:00 2001 From: uqio <276879906+uqio@users.noreply.github.com> Date: Mon, 27 Apr 2026 01:00:37 +1200 Subject: [PATCH 1/3] update --- src/row/arch/neon.rs | 388 ++++++++++++++++---- src/row/arch/neon/tests.rs | 188 ++++++++++ src/row/arch/wasm_simd128.rs | 378 +++++++++++++++++--- src/row/arch/wasm_simd128/tests.rs | 180 ++++++++++ src/row/arch/x86_avx2.rs | 557 +++++++++++++++++++++++------ src/row/arch/x86_avx2/tests.rs | 194 ++++++++++ src/row/arch/x86_avx512.rs | 458 ++++++++++++++++++++---- src/row/arch/x86_avx512/tests.rs | 198 ++++++++++ src/row/arch/x86_common.rs | 36 ++ src/row/arch/x86_sse41.rs | 356 +++++++++++++++--- src/row/arch/x86_sse41/tests.rs | 198 ++++++++++ src/row/mod.rs | 457 ++++++++++++++++++++--- 12 files changed, 3205 insertions(+), 383 deletions(-) diff --git a/src/row/arch/neon.rs b/src/row/arch/neon.rs index 917c64b..9114045 100644 --- a/src/row/arch/neon.rs +++ b/src/row/arch/neon.rs @@ -519,13 +519,70 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_u16_row( width: usize, matrix: ColorMatrix, full_range: bool, +) { + unsafe { + yuv_420p_n_to_rgb_or_rgba_u16_row::( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } +} + +/// NEON sibling of [`yuv_420p_n_to_rgba_row`] for native-depth `u16` +/// output. Alpha samples are `(1 << BITS) - 1` (opaque maximum at the +/// input bit depth) — matches `scalar::yuv_420p_n_to_rgba_u16_row`. +/// +/// # Safety +/// +/// Same as [`yuv_420p_n_to_rgb_u16_row`], plus +/// `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "neon")] +pub(crate) unsafe fn yuv_420p_n_to_rgba_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + unsafe { + yuv_420p_n_to_rgb_or_rgba_u16_row::( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } +} + +/// Shared NEON high-bit YUV 4:2:0 → native-depth `u16` kernel. +/// `ALPHA = false` writes RGB triples via `vst3q_u16`; `ALPHA = true` +/// writes RGBA quads via `vst4q_u16` with constant alpha +/// `(1 << BITS) - 1`. +/// +/// # Safety +/// +/// 1. **NEON must be available.** +/// 2. `width & 1 == 0`. +/// 3. `y.len() >= width`, `u_half.len() >= width / 2`, +/// `v_half.len() >= width / 2`, `out.len() >= width * if ALPHA { 4 } else { 3 }`. +/// 4. `BITS` ∈ `{9, 10, 12, 14}`. 
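+///
+/// A minimal call sketch (hypothetical buffers; NEON availability is
+/// assumed to have been verified by the caller, e.g. via
+/// `std::arch::is_aarch64_feature_detected!("neon")`):
+///
+/// ```ignore
+/// // 10-bit planar 4:2:0 → RGBA: alpha = (1 << 10) - 1 = 1023.
+/// // SAFETY: NEON verified, width is even, and every slice-length
+/// // precondition above holds.
+/// unsafe {
+///     yuv_420p_n_to_rgb_or_rgba_u16_row::<10, true>(
+///         &y, &u, &v, &mut rgba, width, ColorMatrix::Bt709, false,
+///     );
+/// }
+/// ```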
+#[inline] +#[target_feature(enable = "neon")] +pub(crate) unsafe fn yuv_420p_n_to_rgb_or_rgba_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, ) { const { assert!(BITS == 9 || BITS == 10 || BITS == 12 || BITS == 14) }; + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert_eq!(width & 1, 0); debug_assert!(y.len() >= width); debug_assert!(u_half.len() >= width / 2); debug_assert!(v_half.len() >= width / 2); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); @@ -552,6 +609,7 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_u16_row( let cgv = vdupq_n_s32(coeffs.g_v()); let cbu = vdupq_n_s32(coeffs.b_u()); let cbv = vdupq_n_s32(coeffs.b_v()); + let alpha_u16 = vdupq_n_u16(out_max as u16); let mut x = 0usize; while x + 16 <= width { @@ -605,25 +663,37 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_u16_row( let b_lo = clamp_u16_max(vqaddq_s16(y_scaled_lo, b_dup_lo), zero_v, max_v); let b_hi = clamp_u16_max(vqaddq_s16(y_scaled_hi, b_dup_hi), zero_v, max_v); - // Two interleaved u16 writes — each `vst3q_u16` covers 8 pixels. - let rgb_lo = uint16x8x3_t(r_lo, g_lo, b_lo); - let rgb_hi = uint16x8x3_t(r_hi, g_hi, b_hi); - vst3q_u16(rgb_out.as_mut_ptr().add(x * 3), rgb_lo); - vst3q_u16(rgb_out.as_mut_ptr().add(x * 3 + 24), rgb_hi); + if ALPHA { + let rgba_lo = uint16x8x4_t(r_lo, g_lo, b_lo, alpha_u16); + let rgba_hi = uint16x8x4_t(r_hi, g_hi, b_hi, alpha_u16); + vst4q_u16(out.as_mut_ptr().add(x * 4), rgba_lo); + vst4q_u16(out.as_mut_ptr().add(x * 4 + 32), rgba_hi); + } else { + // Two interleaved u16 writes — each `vst3q_u16` covers 8 pixels. + let rgb_lo = uint16x8x3_t(r_lo, g_lo, b_lo); + let rgb_hi = uint16x8x3_t(r_hi, g_hi, b_hi); + vst3q_u16(out.as_mut_ptr().add(x * 3), rgb_lo); + vst3q_u16(out.as_mut_ptr().add(x * 3 + 24), rgb_hi); + } x += 16; } if x < width { - scalar::yuv_420p_n_to_rgb_u16_row::( - &y[x..width], - &u_half[x / 2..width / 2], - &v_half[x / 2..width / 2], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_u = &u_half[x / 2..width / 2]; + let tail_v = &v_half[x / 2..width / 2]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::yuv_420p_n_to_rgba_u16_row::( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); + } else { + scalar::yuv_420p_n_to_rgb_u16_row::( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); + } } } } @@ -1125,10 +1195,62 @@ pub(crate) unsafe fn p_n_to_rgb_u16_row( matrix: ColorMatrix, full_range: bool, ) { + unsafe { + p_n_to_rgb_or_rgba_u16_row::(y, uv_half, rgb_out, width, matrix, full_range); + } +} + +/// NEON sibling of [`p_n_to_rgba_row`] for native-depth `u16` output. +/// Alpha samples are `(1 << BITS) - 1` (opaque maximum at the input +/// bit depth) — matches `scalar::p_n_to_rgba_u16_row`. P016 has its +/// own kernel family — never routed here. +/// +/// # Safety +/// +/// Same as [`p_n_to_rgb_u16_row`], plus `rgba_out.len() >= 4 * width`. 
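+///
+/// Call sketch (hypothetical P010 row, chroma interleaved and packed
+/// in the high bits of each `u16`; NEON availability checked by the
+/// caller):
+///
+/// ```ignore
+/// // SAFETY: width is even; `y.len() >= width`, `uv.len() >= width`,
+/// // `rgba.len() >= 4 * width`.
+/// unsafe {
+///     p_n_to_rgba_u16_row::<10>(&y, &uv, &mut rgba, width,
+///                               ColorMatrix::Bt2020Ncl, false);
+/// }
+/// ```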
+#[inline] +#[target_feature(enable = "neon")] +pub(crate) unsafe fn p_n_to_rgba_u16_row( + y: &[u16], + uv_half: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + unsafe { + p_n_to_rgb_or_rgba_u16_row::(y, uv_half, rgba_out, width, matrix, full_range); + } +} + +/// Shared NEON Pn → native-depth `u16` kernel. `ALPHA = false` writes +/// RGB triples via `vst3q_u16`; `ALPHA = true` writes RGBA quads via +/// `vst4q_u16` with constant alpha `(1 << BITS) - 1`. P016 has its +/// own kernel family — never routed here. +/// +/// # Safety +/// +/// 1. **NEON must be available on the current CPU.** +/// 2. `width & 1 == 0`. +/// 3. `y.len() >= width`, `uv_half.len() >= width`, +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. +/// 4. `BITS` ∈ `{10, 12}`. +#[inline] +#[target_feature(enable = "neon")] +pub(crate) unsafe fn p_n_to_rgb_or_rgba_u16_row( + y: &[u16], + uv_half: &[u16], + out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + const { assert!(BITS == 10 || BITS == 12) }; + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert_eq!(width & 1, 0); debug_assert!(y.len() >= width); debug_assert!(uv_half.len() >= width); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); @@ -1152,6 +1274,7 @@ pub(crate) unsafe fn p_n_to_rgb_u16_row( let cgv = vdupq_n_s32(coeffs.g_v()); let cbu = vdupq_n_s32(coeffs.b_u()); let cbv = vdupq_n_s32(coeffs.b_v()); + let alpha_u16 = vdupq_n_u16(out_max as u16); let mut x = 0usize; while x + 16 <= width { @@ -1198,23 +1321,31 @@ pub(crate) unsafe fn p_n_to_rgb_u16_row( let b_lo = clamp_u16_max(vqaddq_s16(y_scaled_lo, b_dup_lo), zero_v, max_v); let b_hi = clamp_u16_max(vqaddq_s16(y_scaled_hi, b_dup_hi), zero_v, max_v); - let rgb_lo = uint16x8x3_t(r_lo, g_lo, b_lo); - let rgb_hi = uint16x8x3_t(r_hi, g_hi, b_hi); - vst3q_u16(rgb_out.as_mut_ptr().add(x * 3), rgb_lo); - vst3q_u16(rgb_out.as_mut_ptr().add(x * 3 + 24), rgb_hi); + if ALPHA { + let rgba_lo = uint16x8x4_t(r_lo, g_lo, b_lo, alpha_u16); + let rgba_hi = uint16x8x4_t(r_hi, g_hi, b_hi, alpha_u16); + vst4q_u16(out.as_mut_ptr().add(x * 4), rgba_lo); + vst4q_u16(out.as_mut_ptr().add(x * 4 + 32), rgba_hi); + } else { + let rgb_lo = uint16x8x3_t(r_lo, g_lo, b_lo); + let rgb_hi = uint16x8x3_t(r_hi, g_hi, b_hi); + vst3q_u16(out.as_mut_ptr().add(x * 3), rgb_lo); + vst3q_u16(out.as_mut_ptr().add(x * 3 + 24), rgb_hi); + } x += 16; } if x < width { - scalar::p_n_to_rgb_u16_row::( - &y[x..width], - &uv_half[x..width], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_uv = &uv_half[x..width]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::p_n_to_rgba_u16_row::(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + } else { + scalar::p_n_to_rgb_u16_row::(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + } } } } @@ -2233,11 +2364,65 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_u16_row( matrix: ColorMatrix, full_range: bool, ) { + unsafe { + yuv_420p16_to_rgb_or_rgba_u16_row::( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } +} + +/// NEON sibling of [`yuv_420p16_to_rgba_row`] for native-depth `u16` +/// output. Alpha is `0xFFFF` — matches `scalar::yuv_420p16_to_rgba_u16_row`. 
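+/// Per 16-pixel block the kernel stores either 48 `u16` (two
+/// `vst3q_u16`) or 64 `u16` (two `vst4q_u16`).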
+/// +/// # Safety +/// +/// Same as [`yuv_420p16_to_rgb_u16_row`] plus `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "neon")] +pub(crate) unsafe fn yuv_420p16_to_rgba_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + unsafe { + yuv_420p16_to_rgb_or_rgba_u16_row::( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } +} + +/// Shared NEON 16-bit YUV 4:2:0 → native-depth `u16` kernel. +/// `ALPHA = false` writes RGB triples via `vst3q_u16`; `ALPHA = true` +/// writes RGBA quads via `vst4q_u16` with constant alpha `0xFFFF`. +/// +/// # Safety +/// +/// 1. NEON must be available. +/// 2. `width & 1 == 0`. +/// 3. `y.len() >= width`, `u_half.len() >= width / 2`, +/// `v_half.len() >= width / 2`, +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. +#[inline] +#[target_feature(enable = "neon")] +pub(crate) unsafe fn yuv_420p16_to_rgb_or_rgba_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert_eq!(width & 1, 0); debug_assert!(y.len() >= width); debug_assert!(u_half.len() >= width / 2); debug_assert!(v_half.len() >= width / 2); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::<16, 16>(full_range); @@ -2245,6 +2430,7 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_u16_row( const RND: i32 = 1 << 14; unsafe { + let alpha_u16 = vdupq_n_u16(0xFFFF); let rnd_v = vdupq_n_s32(RND); let rnd64 = vdupq_n_s64(RND as i64); let y_off_v = vdupq_n_s32(y_off); @@ -2345,27 +2531,43 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_u16_row( vqmovun_s32(vaddq_s32(ys_hi_1, b_cd_hi1)), ); - vst3q_u16( - rgb_out.as_mut_ptr().add(x * 3), - uint16x8x3_t(r_lo_u16, g_lo_u16, b_lo_u16), - ); - vst3q_u16( - rgb_out.as_mut_ptr().add(x * 3 + 24), - uint16x8x3_t(r_hi_u16, g_hi_u16, b_hi_u16), - ); + if ALPHA { + vst4q_u16( + out.as_mut_ptr().add(x * 4), + uint16x8x4_t(r_lo_u16, g_lo_u16, b_lo_u16, alpha_u16), + ); + vst4q_u16( + out.as_mut_ptr().add(x * 4 + 32), + uint16x8x4_t(r_hi_u16, g_hi_u16, b_hi_u16, alpha_u16), + ); + } else { + vst3q_u16( + out.as_mut_ptr().add(x * 3), + uint16x8x3_t(r_lo_u16, g_lo_u16, b_lo_u16), + ); + vst3q_u16( + out.as_mut_ptr().add(x * 3 + 24), + uint16x8x3_t(r_hi_u16, g_hi_u16, b_hi_u16), + ); + } x += 16; } if x < width { - scalar::yuv_420p16_to_rgb_u16_row( - &y[x..width], - &u_half[x / 2..width / 2], - &v_half[x / 2..width / 2], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_u = &u_half[x / 2..width / 2]; + let tail_v = &v_half[x / 2..width / 2]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::yuv_420p16_to_rgba_u16_row( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); + } else { + scalar::yuv_420p16_to_rgb_u16_row( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); + } } } } @@ -2827,10 +3029,57 @@ pub(crate) unsafe fn p16_to_rgb_u16_row( matrix: ColorMatrix, full_range: bool, ) { + unsafe { + p16_to_rgb_or_rgba_u16_row::(y, uv_half, rgb_out, width, matrix, full_range); + } +} + +/// NEON sibling of [`p16_to_rgba_row`] for native-depth `u16` output. +/// Alpha is `0xFFFF` — matches `scalar::p16_to_rgba_u16_row`. 
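+/// `uv_half` is the interleaved P016 chroma row (`[U0, V0, U1, V1,
+/// ...]`, one pair per two output pixels), hence the
+/// `uv_half.len() >= width` length requirement.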
+/// +/// # Safety +/// +/// Same as [`p16_to_rgb_u16_row`] plus `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "neon")] +pub(crate) unsafe fn p16_to_rgba_u16_row( + y: &[u16], + uv_half: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + unsafe { + p16_to_rgb_or_rgba_u16_row::(y, uv_half, rgba_out, width, matrix, full_range); + } +} + +/// Shared NEON 16-bit P016 → native-depth `u16` kernel. +/// `ALPHA = false` writes RGB triples via `vst3q_u16`; `ALPHA = true` +/// writes RGBA quads via `vst4q_u16` with constant alpha `0xFFFF`. +/// +/// # Safety +/// +/// 1. NEON must be available. +/// 2. `width & 1 == 0`. +/// 3. `y.len() >= width`, `uv_half.len() >= width`, +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. +#[inline] +#[target_feature(enable = "neon")] +pub(crate) unsafe fn p16_to_rgb_or_rgba_u16_row( + y: &[u16], + uv_half: &[u16], + out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert_eq!(width & 1, 0); debug_assert!(y.len() >= width); debug_assert!(uv_half.len() >= width); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::<16, 16>(full_range); @@ -2838,6 +3087,7 @@ pub(crate) unsafe fn p16_to_rgb_u16_row( const RND: i32 = 1 << 14; unsafe { + let alpha_u16 = vdupq_n_u16(0xFFFF); let rnd_v = vdupq_n_s32(RND); let rnd64 = vdupq_n_s64(RND as i64); let y_off_v = vdupq_n_s32(y_off); @@ -2935,26 +3185,38 @@ pub(crate) unsafe fn p16_to_rgb_u16_row( vqmovun_s32(vaddq_s32(ys_hi_1, b_cd_hi1)), ); - vst3q_u16( - rgb_out.as_mut_ptr().add(x * 3), - uint16x8x3_t(r_lo_u16, g_lo_u16, b_lo_u16), - ); - vst3q_u16( - rgb_out.as_mut_ptr().add(x * 3 + 24), - uint16x8x3_t(r_hi_u16, g_hi_u16, b_hi_u16), - ); + if ALPHA { + vst4q_u16( + out.as_mut_ptr().add(x * 4), + uint16x8x4_t(r_lo_u16, g_lo_u16, b_lo_u16, alpha_u16), + ); + vst4q_u16( + out.as_mut_ptr().add(x * 4 + 32), + uint16x8x4_t(r_hi_u16, g_hi_u16, b_hi_u16, alpha_u16), + ); + } else { + vst3q_u16( + out.as_mut_ptr().add(x * 3), + uint16x8x3_t(r_lo_u16, g_lo_u16, b_lo_u16), + ); + vst3q_u16( + out.as_mut_ptr().add(x * 3 + 24), + uint16x8x3_t(r_hi_u16, g_hi_u16, b_hi_u16), + ); + } x += 16; } if x < width { - scalar::p16_to_rgb_u16_row( - &y[x..width], - &uv_half[x..width], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_uv = &uv_half[x..width]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::p16_to_rgba_u16_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + } else { + scalar::p16_to_rgb_u16_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + } } } } diff --git a/src/row/arch/neon/tests.rs b/src/row/arch/neon/tests.rs index 41dd909..c84627c 100644 --- a/src/row/arch/neon/tests.rs +++ b/src/row/arch/neon/tests.rs @@ -1828,6 +1828,194 @@ fn neon_p016_rgba_matches_scalar_all_matrices() { } } +// ---- High-bit 4:2:0 native-depth `u16` RGBA equivalence (Ship 8 Tranche 5b) ---- +// +// u16 RGBA wrappers share the math of their u16 RGB siblings — only +// the store (and tail dispatch) branches on `ALPHA`, with alpha set to +// `(1 << BITS) - 1` for BITS-generic kernels and `0xFFFF` for 16-bit +// kernels. Tests pin byte-identical output against the scalar RGBA +// reference. 
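+//
+// Each check builds one set of seeded Y/U/V planes (seeds 37/53/71)
+// and feeds the same buffers to both the scalar reference and the
+// SIMD kernel under test.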
+ +fn check_planar_u16_neon_rgba_equivalence_n( + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let y = planar_n_plane::(width, 37); + let u = planar_n_plane::(width / 2, 53); + let v = planar_n_plane::(width / 2, 71); + let mut rgba_scalar = std::vec![0u16; width * 4]; + let mut rgba_neon = std::vec![0u16; width * 4]; + scalar::yuv_420p_n_to_rgba_u16_row::( + &y, + &u, + &v, + &mut rgba_scalar, + width, + matrix, + full_range, + ); + unsafe { + yuv_420p_n_to_rgba_u16_row::(&y, &u, &v, &mut rgba_neon, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_neon, + "NEON yuv_420p_n<{BITS}>→RGBA u16 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +fn check_pn_u16_neon_rgba_equivalence_n( + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let y = p_n_packed_plane::(width, 37); + let u = p_n_packed_plane::(width / 2, 53); + let v = p_n_packed_plane::(width / 2, 71); + let uv = p010_uv_interleave(&u, &v); + let mut rgba_scalar = std::vec![0u16; width * 4]; + let mut rgba_neon = std::vec![0u16; width * 4]; + scalar::p_n_to_rgba_u16_row::(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + unsafe { + p_n_to_rgba_u16_row::(&y, &uv, &mut rgba_neon, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_neon, + "NEON Pn<{BITS}>→RGBA u16 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +#[test] +#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] +fn neon_yuv420p_n_rgba_u16_matches_scalar_all_bits() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_planar_u16_neon_rgba_equivalence_n::<9>(16, m, full); + check_planar_u16_neon_rgba_equivalence_n::<10>(16, m, full); + check_planar_u16_neon_rgba_equivalence_n::<12>(16, m, full); + check_planar_u16_neon_rgba_equivalence_n::<14>(16, m, full); + } + } +} + +#[test] +#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] +fn neon_yuv420p_n_rgba_u16_matches_scalar_tail_and_1920() { + for w in [18usize, 30, 34, 1920, 1922] { + check_planar_u16_neon_rgba_equivalence_n::<9>(w, ColorMatrix::Bt601, false); + check_planar_u16_neon_rgba_equivalence_n::<10>(w, ColorMatrix::Bt709, true); + check_planar_u16_neon_rgba_equivalence_n::<12>(w, ColorMatrix::Bt2020Ncl, false); + check_planar_u16_neon_rgba_equivalence_n::<14>(w, ColorMatrix::YCgCo, true); + } +} + +#[test] +#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] +fn neon_pn_rgba_u16_matches_scalar_all_bits() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_pn_u16_neon_rgba_equivalence_n::<10>(16, m, full); + check_pn_u16_neon_rgba_equivalence_n::<12>(16, m, full); + } + } +} + +#[test] +#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] +fn neon_pn_rgba_u16_matches_scalar_tail_and_1920() { + for w in [18usize, 30, 34, 1920, 1922] { + check_pn_u16_neon_rgba_equivalence_n::<10>(w, ColorMatrix::Bt601, false); + check_pn_u16_neon_rgba_equivalence_n::<12>(w, ColorMatrix::Bt709, true); + } +} + +fn check_yuv420p16_u16_neon_rgba_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { + let y = p16_plane_neon(width, 37); + let u = p16_plane_neon(width / 2, 53); + let v = p16_plane_neon(width / 2, 71); + let mut rgba_scalar = 
std::vec![0u16; width * 4]; + let mut rgba_neon = std::vec![0u16; width * 4]; + scalar::yuv_420p16_to_rgba_u16_row(&y, &u, &v, &mut rgba_scalar, width, matrix, full_range); + unsafe { + yuv_420p16_to_rgba_u16_row(&y, &u, &v, &mut rgba_neon, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_neon, + "NEON yuv_420p16→RGBA u16 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +fn check_p016_u16_neon_rgba_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { + let y = p16_plane_neon(width, 37); + let u = p16_plane_neon(width / 2, 53); + let v = p16_plane_neon(width / 2, 71); + let uv = p010_uv_interleave(&u, &v); + let mut rgba_scalar = std::vec![0u16; width * 4]; + let mut rgba_neon = std::vec![0u16; width * 4]; + scalar::p16_to_rgba_u16_row(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + unsafe { + p16_to_rgba_u16_row(&y, &uv, &mut rgba_neon, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_neon, + "NEON P016→RGBA u16 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +#[test] +#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] +fn neon_yuv420p16_rgba_u16_matches_scalar_all_matrices() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_yuv420p16_u16_neon_rgba_equivalence(16, m, full); + } + } + for w in [18usize, 30, 34, 1920, 1922] { + check_yuv420p16_u16_neon_rgba_equivalence(w, ColorMatrix::Bt709, false); + } +} + +#[test] +#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] +fn neon_p016_rgba_u16_matches_scalar_all_matrices() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_p016_u16_neon_rgba_equivalence(16, m, full); + } + } + for w in [18usize, 30, 34, 1920, 1922] { + check_p016_u16_neon_rgba_equivalence(w, ColorMatrix::Bt709, false); + } +} + // ---- Yuv444p_n NEON equivalence (10/12/14) -------------------------- fn check_yuv444p_n_u8_neon_equivalence( diff --git a/src/row/arch/wasm_simd128.rs b/src/row/arch/wasm_simd128.rs index e9d02c4..19aed79 100644 --- a/src/row/arch/wasm_simd128.rs +++ b/src/row/arch/wasm_simd128.rs @@ -464,13 +464,70 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_u16_row( width: usize, matrix: ColorMatrix, full_range: bool, +) { + unsafe { + yuv_420p_n_to_rgb_or_rgba_u16_row::( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } +} + +/// wasm simd128 sibling of [`yuv_420p_n_to_rgba_row`] for native-depth +/// `u16` output. Alpha samples are `(1 << BITS) - 1` (opaque maximum +/// at the input bit depth). +/// +/// # Safety +/// +/// Same as [`yuv_420p_n_to_rgb_u16_row`] plus `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "simd128")] +pub(crate) unsafe fn yuv_420p_n_to_rgba_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + unsafe { + yuv_420p_n_to_rgb_or_rgba_u16_row::( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } +} + +/// Shared wasm simd128 high-bit YUV 4:2:0 → native-depth `u16` kernel. +/// `ALPHA = false` writes RGB triples via `write_rgb_u16_8`; +/// `ALPHA = true` writes RGBA quads via `write_rgba_u16_8` with +/// constant alpha `(1 << BITS) - 1`. 
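+///
+/// For 12-bit input, for example, the alpha constant is
+/// `(1 << 12) - 1 = 4095`.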
+/// +/// # Safety +/// +/// 1. **simd128 must be enabled at compile time.** +/// 2. `width & 1 == 0`. +/// 3. `y.len() >= width`, `u_half.len() >= width / 2`, +/// `v_half.len() >= width / 2`, +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. +/// 4. `BITS` ∈ `{9, 10, 12, 14}`. +#[inline] +#[target_feature(enable = "simd128")] +pub(crate) unsafe fn yuv_420p_n_to_rgb_or_rgba_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, ) { const { assert!(BITS == 9 || BITS == 10 || BITS == 12 || BITS == 14) }; + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert_eq!(width & 1, 0); debug_assert!(y.len() >= width); debug_assert!(u_half.len() >= width / 2); debug_assert!(v_half.len() >= width / 2); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); @@ -495,6 +552,7 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_u16_row( let cgv = i32x4_splat(coeffs.g_v()); let cbu = i32x4_splat(coeffs.b_u()); let cbv = i32x4_splat(coeffs.b_v()); + let alpha_u16 = u16x8_splat(out_max as u16); let mut x = 0usize; while x + 16 <= width { @@ -539,23 +597,34 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_u16_row( let b_lo = clamp_u16_max_wasm(i16x8_add_sat(y_scaled_lo, b_dup_lo), zero_v, max_v); let b_hi = clamp_u16_max_wasm(i16x8_add_sat(y_scaled_hi, b_dup_hi), zero_v, max_v); - let dst = rgb_out.as_mut_ptr().add(x * 3); - write_rgb_u16_8(r_lo, g_lo, b_lo, dst); - write_rgb_u16_8(r_hi, g_hi, b_hi, dst.add(24)); + if ALPHA { + let dst = out.as_mut_ptr().add(x * 4); + write_rgba_u16_8(r_lo, g_lo, b_lo, alpha_u16, dst); + write_rgba_u16_8(r_hi, g_hi, b_hi, alpha_u16, dst.add(32)); + } else { + let dst = out.as_mut_ptr().add(x * 3); + write_rgb_u16_8(r_lo, g_lo, b_lo, dst); + write_rgb_u16_8(r_hi, g_hi, b_hi, dst.add(24)); + } x += 16; } if x < width { - scalar::yuv_420p_n_to_rgb_u16_row::( - &y[x..width], - &u_half[x / 2..width / 2], - &v_half[x / 2..width / 2], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_u = &u_half[x / 2..width / 2]; + let tail_v = &v_half[x / 2..width / 2]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::yuv_420p_n_to_rgba_u16_row::( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); + } else { + scalar::yuv_420p_n_to_rgb_u16_row::( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); + } } } } @@ -1105,6 +1174,44 @@ unsafe fn write_rgb_u16_8(r: v128, g: v128, b: v128, ptr: *mut u16) { } } +/// Interleaves 8 R/G/B/A `u16` samples into packed RGBA quads (32 +/// `u16` = 64 bytes). Two `i16x8_shuffle` stages: first interleave +/// R+G and B+A into pairs, then combine pair-vectors into RGBA quads. +/// +/// # Safety +/// +/// `ptr` must point to at least 64 writable bytes. Caller must have +/// `simd128` enabled at compile time. +#[inline(always)] +unsafe fn write_rgba_u16_8(r: v128, g: v128, b: v128, a: v128, ptr: *mut u16) { + unsafe { + // Stage 1: interleave R+G and B+A pairwise. 
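+        // (`i16x8_shuffle` lane indices 0..=7 select from the first
+        // operand and 8..=15 from the second.)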
+ // rg_lo = [R0, G0, R1, G1, R2, G2, R3, G3] + // rg_hi = [R4, G4, R5, G5, R6, G6, R7, G7] + // ba_lo = [B0, A0, B1, A1, B2, A2, B3, A3] + // ba_hi = [B4, A4, B5, A5, B6, A6, B7, A7] + let rg_lo = i16x8_shuffle::<0, 8, 1, 9, 2, 10, 3, 11>(r, g); + let rg_hi = i16x8_shuffle::<4, 12, 5, 13, 6, 14, 7, 15>(r, g); + let ba_lo = i16x8_shuffle::<0, 8, 1, 9, 2, 10, 3, 11>(b, a); + let ba_hi = i16x8_shuffle::<4, 12, 5, 13, 6, 14, 7, 15>(b, a); + + // Stage 2: combine RG pairs with BA pairs to produce RGBA quads. + // q0 = [R0, G0, B0, A0, R1, G1, B1, A1] + // q1 = [R2, G2, B2, A2, R3, G3, B3, A3] + // q2 = [R4, G4, B4, A4, R5, G5, B5, A5] + // q3 = [R6, G6, B6, A6, R7, G7, B7, A7] + let q0 = i16x8_shuffle::<0, 1, 8, 9, 2, 3, 10, 11>(rg_lo, ba_lo); + let q1 = i16x8_shuffle::<4, 5, 12, 13, 6, 7, 14, 15>(rg_lo, ba_lo); + let q2 = i16x8_shuffle::<0, 1, 8, 9, 2, 3, 10, 11>(rg_hi, ba_hi); + let q3 = i16x8_shuffle::<4, 5, 12, 13, 6, 7, 14, 15>(rg_hi, ba_hi); + + v128_store(ptr.cast(), q0); + v128_store(ptr.add(8).cast(), q1); + v128_store(ptr.add(16).cast(), q2); + v128_store(ptr.add(24).cast(), q3); + } +} + /// WASM simd128 high‑bit‑packed semi‑planar (`BITS` ∈ {10, 12}) → /// packed **8‑bit** RGB. /// @@ -1310,10 +1417,62 @@ pub(crate) unsafe fn p_n_to_rgb_u16_row( matrix: ColorMatrix, full_range: bool, ) { + unsafe { + p_n_to_rgb_or_rgba_u16_row::(y, uv_half, rgb_out, width, matrix, full_range); + } +} + +/// wasm simd128 sibling of [`p_n_to_rgba_row`] for native-depth `u16` +/// output. Alpha samples are `(1 << BITS) - 1` (opaque maximum at the +/// input bit depth). P016 has its own kernel family — never routed here. +/// +/// # Safety +/// +/// Same as [`p_n_to_rgb_u16_row`] plus `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "simd128")] +pub(crate) unsafe fn p_n_to_rgba_u16_row( + y: &[u16], + uv_half: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + unsafe { + p_n_to_rgb_or_rgba_u16_row::(y, uv_half, rgba_out, width, matrix, full_range); + } +} + +/// Shared wasm simd128 Pn → native-depth `u16` kernel. `ALPHA = false` +/// writes RGB triples via `write_rgb_u16_8`; `ALPHA = true` writes +/// RGBA quads via `write_rgba_u16_8` with constant alpha +/// `(1 << BITS) - 1`. P016 has its own kernel family — never routed +/// here. +/// +/// # Safety +/// +/// 1. **simd128 must be enabled at compile time.** +/// 2. `width & 1 == 0`. +/// 3. `y.len() >= width`, `uv_half.len() >= width`, +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. +/// 4. `BITS` ∈ `{10, 12}`. +#[inline] +#[target_feature(enable = "simd128")] +pub(crate) unsafe fn p_n_to_rgb_or_rgba_u16_row( + y: &[u16], + uv_half: &[u16], + out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + const { assert!(BITS == 10 || BITS == 12) }; + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert_eq!(width & 1, 0); debug_assert!(y.len() >= width); debug_assert!(uv_half.len() >= width); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); @@ -1337,6 +1496,7 @@ pub(crate) unsafe fn p_n_to_rgb_u16_row( let cgv = i32x4_splat(coeffs.g_v()); let cbu = i32x4_splat(coeffs.b_u()); let cbv = i32x4_splat(coeffs.b_v()); + let alpha_u16 = u16x8_splat(out_max as u16); // High-bit-packed samples: shift right by `16 - BITS`. 
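+        // (e.g. P010: each 10-bit sample occupies bits 15..=6 of its
+        // `u16`, so the shift amount is 6.)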
let shr = (16 - BITS) as u32; @@ -1383,22 +1543,29 @@ pub(crate) unsafe fn p_n_to_rgb_u16_row( let b_lo = clamp_u16_max_wasm(i16x8_add_sat(y_scaled_lo, b_dup_lo), zero_v, max_v); let b_hi = clamp_u16_max_wasm(i16x8_add_sat(y_scaled_hi, b_dup_hi), zero_v, max_v); - let dst = rgb_out.as_mut_ptr().add(x * 3); - write_rgb_u16_8(r_lo, g_lo, b_lo, dst); - write_rgb_u16_8(r_hi, g_hi, b_hi, dst.add(24)); + if ALPHA { + let dst = out.as_mut_ptr().add(x * 4); + write_rgba_u16_8(r_lo, g_lo, b_lo, alpha_u16, dst); + write_rgba_u16_8(r_hi, g_hi, b_hi, alpha_u16, dst.add(32)); + } else { + let dst = out.as_mut_ptr().add(x * 3); + write_rgb_u16_8(r_lo, g_lo, b_lo, dst); + write_rgb_u16_8(r_hi, g_hi, b_hi, dst.add(24)); + } x += 16; } if x < width { - scalar::p_n_to_rgb_u16_row::( - &y[x..width], - &uv_half[x..width], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_uv = &uv_half[x..width]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::p_n_to_rgba_u16_row::(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + } else { + scalar::p_n_to_rgb_u16_row::(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + } } } } @@ -2645,11 +2812,66 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_u16_row( matrix: ColorMatrix, full_range: bool, ) { + unsafe { + yuv_420p16_to_rgb_or_rgba_u16_row::( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } +} + +/// wasm simd128 sibling of [`yuv_420p16_to_rgba_row`] for native-depth +/// `u16` output. Alpha is `0xFFFF`. +/// +/// # Safety +/// +/// Same as [`yuv_420p16_to_rgb_u16_row`] plus `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "simd128")] +pub(crate) unsafe fn yuv_420p16_to_rgba_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + unsafe { + yuv_420p16_to_rgb_or_rgba_u16_row::( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } +} + +/// Shared wasm simd128 16-bit YUV 4:2:0 → native-depth `u16` kernel. +/// `ALPHA = false` writes RGB triples via `write_rgb_u16_8`; +/// `ALPHA = true` writes RGBA quads via `write_rgba_u16_8` with +/// constant alpha `0xFFFF`. +/// +/// # Safety +/// +/// 1. **simd128 must be enabled at compile time.** +/// 2. `width & 1 == 0`. +/// 3. `y.len() >= width`, `u_half.len() >= width / 2`, +/// `v_half.len() >= width / 2`, +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. 
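+///
+/// Call sketch (hypothetical 16-bit planar row; `simd128` is a
+/// compile-time target feature on wasm, so no runtime check applies):
+///
+/// ```ignore
+/// // SAFETY: width is even and the slice lengths satisfy the
+/// // contract above.
+/// unsafe {
+///     yuv_420p16_to_rgb_or_rgba_u16_row::<true>(
+///         &y, &u, &v, &mut rgba, width, ColorMatrix::Bt601, false,
+///     );
+/// }
+/// ```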
+#[inline] +#[target_feature(enable = "simd128")] +pub(crate) unsafe fn yuv_420p16_to_rgb_or_rgba_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert_eq!(width & 1, 0); debug_assert!(y.len() >= width); debug_assert!(u_half.len() >= width / 2); debug_assert!(v_half.len() >= width / 2); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::<16, 16>(full_range); @@ -2657,6 +2879,7 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_u16_row( const RND_I32: i32 = 1 << 14; unsafe { + let alpha_u16 = u16x8_splat(0xFFFF); let rnd_i64 = i64x2_splat(RND_I64); let rnd_i32 = i32x4_splat(RND_I32); let y_off32 = i32x4_splat(y_off); @@ -2739,20 +2962,29 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_u16_row( i32x4_add(y_hi_scaled, b_dup_hi), ); - write_rgb_u16_8(r_u16, g_u16, b_u16, rgb_out.as_mut_ptr().add(x * 3)); + if ALPHA { + write_rgba_u16_8(r_u16, g_u16, b_u16, alpha_u16, out.as_mut_ptr().add(x * 4)); + } else { + write_rgb_u16_8(r_u16, g_u16, b_u16, out.as_mut_ptr().add(x * 3)); + } x += 8; } if x < width { - scalar::yuv_420p16_to_rgb_u16_row( - &y[x..width], - &u_half[x / 2..width / 2], - &v_half[x / 2..width / 2], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_u = &u_half[x / 2..width / 2]; + let tail_v = &v_half[x / 2..width / 2]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::yuv_420p16_to_rgba_u16_row( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); + } else { + scalar::yuv_420p16_to_rgb_u16_row( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); + } } } } @@ -2921,10 +3153,58 @@ pub(crate) unsafe fn p16_to_rgb_u16_row( matrix: ColorMatrix, full_range: bool, ) { + unsafe { + p16_to_rgb_or_rgba_u16_row::(y, uv_half, rgb_out, width, matrix, full_range); + } +} + +/// wasm simd128 sibling of [`p16_to_rgba_row`] for native-depth `u16` +/// output. Alpha is `0xFFFF`. +/// +/// # Safety +/// +/// Same as [`p16_to_rgb_u16_row`] plus `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "simd128")] +pub(crate) unsafe fn p16_to_rgba_u16_row( + y: &[u16], + uv_half: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + unsafe { + p16_to_rgb_or_rgba_u16_row::(y, uv_half, rgba_out, width, matrix, full_range); + } +} + +/// Shared wasm simd128 16-bit P016 → native-depth `u16` kernel. +/// `ALPHA = false` writes RGB triples via `write_rgb_u16_8`; +/// `ALPHA = true` writes RGBA quads via `write_rgba_u16_8` with +/// constant alpha `0xFFFF`. +/// +/// # Safety +/// +/// 1. **simd128 must be enabled at compile time.** +/// 2. `width & 1 == 0`. +/// 3. `y.len() >= width`, `uv_half.len() >= width`, +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. 
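+///
+/// The main loop emits 8 pixels per iteration (one `write_rgb_u16_8`
+/// or `write_rgba_u16_8` call); any remainder is handled by the
+/// scalar tail.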
+#[inline] +#[target_feature(enable = "simd128")] +pub(crate) unsafe fn p16_to_rgb_or_rgba_u16_row( + y: &[u16], + uv_half: &[u16], + out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert_eq!(width & 1, 0); debug_assert!(y.len() >= width); debug_assert!(uv_half.len() >= width); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::<16, 16>(full_range); @@ -2932,6 +3212,7 @@ pub(crate) unsafe fn p16_to_rgb_u16_row( const RND_I32: i32 = 1 << 14; unsafe { + let alpha_u16 = u16x8_splat(0xFFFF); let rnd_i64 = i64x2_splat(RND_I64); let rnd_i32 = i32x4_splat(RND_I32); let y_off32 = i32x4_splat(y_off); @@ -3014,19 +3295,24 @@ pub(crate) unsafe fn p16_to_rgb_u16_row( i32x4_add(y_hi_scaled, b_dup_hi), ); - write_rgb_u16_8(r_u16, g_u16, b_u16, rgb_out.as_mut_ptr().add(x * 3)); + if ALPHA { + write_rgba_u16_8(r_u16, g_u16, b_u16, alpha_u16, out.as_mut_ptr().add(x * 4)); + } else { + write_rgb_u16_8(r_u16, g_u16, b_u16, out.as_mut_ptr().add(x * 3)); + } x += 8; } if x < width { - scalar::p16_to_rgb_u16_row( - &y[x..width], - &uv_half[x..width], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_uv = &uv_half[x..width]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::p16_to_rgba_u16_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + } else { + scalar::p16_to_rgb_u16_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + } } } } diff --git a/src/row/arch/wasm_simd128/tests.rs b/src/row/arch/wasm_simd128/tests.rs index d394f33..21c9489 100644 --- a/src/row/arch/wasm_simd128/tests.rs +++ b/src/row/arch/wasm_simd128/tests.rs @@ -1564,6 +1564,186 @@ fn simd128_p016_rgba_matches_scalar_all_matrices() { } } +// ---- High-bit 4:2:0 native-depth `u16` RGBA equivalence (Ship 8 Tranche 5b) ---- + +fn check_planar_u16_simd128_rgba_equivalence_n( + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let y = planar_n_plane::(width, 37); + let u = planar_n_plane::(width / 2, 53); + let v = planar_n_plane::(width / 2, 71); + let mut rgba_scalar = std::vec![0u16; width * 4]; + let mut rgba_simd = std::vec![0u16; width * 4]; + scalar::yuv_420p_n_to_rgba_u16_row::( + &y, + &u, + &v, + &mut rgba_scalar, + width, + matrix, + full_range, + ); + unsafe { + yuv_420p_n_to_rgba_u16_row::(&y, &u, &v, &mut rgba_simd, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_simd, + "simd128 yuv_420p_n<{BITS}>→RGBA u16 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +fn check_pn_u16_simd128_rgba_equivalence_n( + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let y = p_n_packed_plane::(width, 37); + let u = p_n_packed_plane::(width / 2, 53); + let v = p_n_packed_plane::(width / 2, 71); + let uv = p010_uv_interleave(&u, &v); + let mut rgba_scalar = std::vec![0u16; width * 4]; + let mut rgba_simd = std::vec![0u16; width * 4]; + scalar::p_n_to_rgba_u16_row::(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + unsafe { + p_n_to_rgba_u16_row::(&y, &uv, &mut rgba_simd, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_simd, + "simd128 Pn<{BITS}>→RGBA u16 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +fn check_yuv420p16_u16_simd128_rgba_equivalence( + width: 
usize, + matrix: ColorMatrix, + full_range: bool, +) { + let y = p16_plane_wasm(width, 37); + let u = p16_plane_wasm(width / 2, 53); + let v = p16_plane_wasm(width / 2, 71); + let mut rgba_scalar = std::vec![0u16; width * 4]; + let mut rgba_simd = std::vec![0u16; width * 4]; + scalar::yuv_420p16_to_rgba_u16_row(&y, &u, &v, &mut rgba_scalar, width, matrix, full_range); + unsafe { + yuv_420p16_to_rgba_u16_row(&y, &u, &v, &mut rgba_simd, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_simd, + "simd128 yuv_420p16→RGBA u16 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +fn check_p16_u16_simd128_rgba_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { + let y = p16_plane_wasm(width, 37); + let u = p16_plane_wasm(width / 2, 53); + let v = p16_plane_wasm(width / 2, 71); + let uv = p010_uv_interleave(&u, &v); + let mut rgba_scalar = std::vec![0u16; width * 4]; + let mut rgba_simd = std::vec![0u16; width * 4]; + scalar::p16_to_rgba_u16_row(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + unsafe { + p16_to_rgba_u16_row(&y, &uv, &mut rgba_simd, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_simd, + "simd128 P016→RGBA u16 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +#[test] +fn simd128_yuv420p_n_rgba_u16_matches_scalar_all_bits() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_planar_u16_simd128_rgba_equivalence_n::<9>(16, m, full); + check_planar_u16_simd128_rgba_equivalence_n::<10>(16, m, full); + check_planar_u16_simd128_rgba_equivalence_n::<12>(16, m, full); + check_planar_u16_simd128_rgba_equivalence_n::<14>(16, m, full); + } + } +} + +#[test] +fn simd128_yuv420p_n_rgba_u16_matches_scalar_tail_and_1920() { + for w in [18usize, 30, 34, 1920, 1922] { + check_planar_u16_simd128_rgba_equivalence_n::<9>(w, ColorMatrix::Bt601, false); + check_planar_u16_simd128_rgba_equivalence_n::<10>(w, ColorMatrix::Bt709, true); + check_planar_u16_simd128_rgba_equivalence_n::<12>(w, ColorMatrix::Bt2020Ncl, false); + check_planar_u16_simd128_rgba_equivalence_n::<14>(w, ColorMatrix::YCgCo, true); + } +} + +#[test] +fn simd128_pn_rgba_u16_matches_scalar_all_bits() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_pn_u16_simd128_rgba_equivalence_n::<10>(16, m, full); + check_pn_u16_simd128_rgba_equivalence_n::<12>(16, m, full); + } + } +} + +#[test] +fn simd128_pn_rgba_u16_matches_scalar_tail_and_1920() { + for w in [18usize, 30, 34, 1920, 1922] { + check_pn_u16_simd128_rgba_equivalence_n::<10>(w, ColorMatrix::Bt601, false); + check_pn_u16_simd128_rgba_equivalence_n::<12>(w, ColorMatrix::Bt709, true); + } +} + +#[test] +fn simd128_yuv420p16_rgba_u16_matches_scalar_all_matrices() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_yuv420p16_u16_simd128_rgba_equivalence(16, m, full); + } + } + for w in [18usize, 30, 34, 1920, 1922] { + check_yuv420p16_u16_simd128_rgba_equivalence(w, ColorMatrix::Bt709, false); + } +} + +#[test] +fn simd128_p016_rgba_u16_matches_scalar_all_matrices() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + 
ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_p16_u16_simd128_rgba_equivalence(16, m, full); + } + } + for w in [18usize, 30, 34, 1920, 1922] { + check_p16_u16_simd128_rgba_equivalence(w, ColorMatrix::Bt709, false); + } +} + // ---- Pn 4:4:4 (P410 / P412 / P416) wasm simd128 equivalence --------- fn high_bit_plane_wasm(n: usize, seed: usize) -> std::vec::Vec { diff --git a/src/row/arch/x86_avx2.rs b/src/row/arch/x86_avx2.rs index d0f72fd..ad52597 100644 --- a/src/row/arch/x86_avx2.rs +++ b/src/row/arch/x86_avx2.rs @@ -45,6 +45,7 @@ use crate::{ row::{ arch::x86_common::{ rgb_to_hsv_16_pixels, swap_rb_16_pixels, write_rgb_16, write_rgb_u16_8, write_rgba_16, + write_rgba_u16_8, }, scalar, }, @@ -508,13 +509,70 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_u16_row( width: usize, matrix: ColorMatrix, full_range: bool, +) { + unsafe { + yuv_420p_n_to_rgb_or_rgba_u16_row::( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } +} + +/// AVX2 sibling of [`yuv_420p_n_to_rgba_row`] for native-depth `u16` +/// output. Alpha samples are `(1 << BITS) - 1` (opaque maximum at the +/// input bit depth). +/// +/// # Safety +/// +/// Same as [`yuv_420p_n_to_rgb_u16_row`] plus `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "avx2")] +pub(crate) unsafe fn yuv_420p_n_to_rgba_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + unsafe { + yuv_420p_n_to_rgb_or_rgba_u16_row::( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } +} + +/// Shared AVX2 high-bit YUV 4:2:0 → native-depth `u16` kernel. +/// `ALPHA = false` writes RGB triples via 4× `write_rgb_u16_8` per +/// 32-pixel block; `ALPHA = true` writes RGBA quads via 4× +/// `write_rgba_u16_8` with constant alpha `(1 << BITS) - 1`. +/// +/// # Safety +/// +/// 1. **AVX2 must be available on the current CPU.** +/// 2. `width & 1 == 0`. +/// 3. `y.len() >= width`, `u_half.len() >= width / 2`, +/// `v_half.len() >= width / 2`, +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. +/// 4. `BITS` ∈ `{9, 10, 12, 14}`. +#[inline] +#[target_feature(enable = "avx2")] +pub(crate) unsafe fn yuv_420p_n_to_rgb_or_rgba_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, ) { const { assert!(BITS == 9 || BITS == 10 || BITS == 12 || BITS == 14) }; + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert_eq!(width & 1, 0); debug_assert!(y.len() >= width); debug_assert!(u_half.len() >= width / 2); debug_assert!(v_half.len() >= width / 2); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); @@ -538,6 +596,7 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_u16_row( let cgv = _mm256_set1_epi32(coeffs.g_v()); let cbu = _mm256_set1_epi32(coeffs.b_u()); let cbv = _mm256_set1_epi32(coeffs.b_v()); + let alpha_u16 = _mm_set1_epi16(out_max); let mut x = 0usize; while x + 32 <= width { @@ -601,45 +660,82 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_u16_row( // Four 8‑pixel u16 writes per 32‑pixel block. Each extracts a // 128‑bit half of an i16x16 channel and hands it to the shared // SSE4.1 u16 interleave helper. 
- let dst = rgb_out.as_mut_ptr().add(x * 3); - write_rgb_u16_8( - _mm256_castsi256_si128(r_lo), - _mm256_castsi256_si128(g_lo), - _mm256_castsi256_si128(b_lo), - dst, - ); - write_rgb_u16_8( - _mm256_extracti128_si256::<1>(r_lo), - _mm256_extracti128_si256::<1>(g_lo), - _mm256_extracti128_si256::<1>(b_lo), - dst.add(24), - ); - write_rgb_u16_8( - _mm256_castsi256_si128(r_hi), - _mm256_castsi256_si128(g_hi), - _mm256_castsi256_si128(b_hi), - dst.add(48), - ); - write_rgb_u16_8( - _mm256_extracti128_si256::<1>(r_hi), - _mm256_extracti128_si256::<1>(g_hi), - _mm256_extracti128_si256::<1>(b_hi), - dst.add(72), - ); + if ALPHA { + let dst = out.as_mut_ptr().add(x * 4); + write_rgba_u16_8( + _mm256_castsi256_si128(r_lo), + _mm256_castsi256_si128(g_lo), + _mm256_castsi256_si128(b_lo), + alpha_u16, + dst, + ); + write_rgba_u16_8( + _mm256_extracti128_si256::<1>(r_lo), + _mm256_extracti128_si256::<1>(g_lo), + _mm256_extracti128_si256::<1>(b_lo), + alpha_u16, + dst.add(32), + ); + write_rgba_u16_8( + _mm256_castsi256_si128(r_hi), + _mm256_castsi256_si128(g_hi), + _mm256_castsi256_si128(b_hi), + alpha_u16, + dst.add(64), + ); + write_rgba_u16_8( + _mm256_extracti128_si256::<1>(r_hi), + _mm256_extracti128_si256::<1>(g_hi), + _mm256_extracti128_si256::<1>(b_hi), + alpha_u16, + dst.add(96), + ); + } else { + let dst = out.as_mut_ptr().add(x * 3); + write_rgb_u16_8( + _mm256_castsi256_si128(r_lo), + _mm256_castsi256_si128(g_lo), + _mm256_castsi256_si128(b_lo), + dst, + ); + write_rgb_u16_8( + _mm256_extracti128_si256::<1>(r_lo), + _mm256_extracti128_si256::<1>(g_lo), + _mm256_extracti128_si256::<1>(b_lo), + dst.add(24), + ); + write_rgb_u16_8( + _mm256_castsi256_si128(r_hi), + _mm256_castsi256_si128(g_hi), + _mm256_castsi256_si128(b_hi), + dst.add(48), + ); + write_rgb_u16_8( + _mm256_extracti128_si256::<1>(r_hi), + _mm256_extracti128_si256::<1>(g_hi), + _mm256_extracti128_si256::<1>(b_hi), + dst.add(72), + ); + } x += 32; } if x < width { - scalar::yuv_420p_n_to_rgb_u16_row::( - &y[x..width], - &u_half[x / 2..width / 2], - &v_half[x / 2..width / 2], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_u = &u_half[x / 2..width / 2]; + let tail_v = &v_half[x / 2..width / 2]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::yuv_420p_n_to_rgba_u16_row::( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); + } else { + scalar::yuv_420p_n_to_rgb_u16_row::( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); + } } } } @@ -1469,10 +1565,62 @@ pub(crate) unsafe fn p_n_to_rgb_u16_row( matrix: ColorMatrix, full_range: bool, ) { + unsafe { + p_n_to_rgb_or_rgba_u16_row::(y, uv_half, rgb_out, width, matrix, full_range); + } +} + +/// AVX2 sibling of [`p_n_to_rgba_row`] for native-depth `u16` output. +/// Alpha samples are `(1 << BITS) - 1` (opaque maximum at the input +/// bit depth). P016 has its own kernel family — never routed here. +/// +/// # Safety +/// +/// Same as [`p_n_to_rgb_u16_row`] plus `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "avx2")] +pub(crate) unsafe fn p_n_to_rgba_u16_row( + y: &[u16], + uv_half: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + unsafe { + p_n_to_rgb_or_rgba_u16_row::(y, uv_half, rgba_out, width, matrix, full_range); + } +} + +/// Shared AVX2 Pn → native-depth `u16` kernel. 
`ALPHA = false` writes +/// RGB triples via 4× `write_rgb_u16_8` per 32-pixel block; +/// `ALPHA = true` writes RGBA quads via 4× `write_rgba_u16_8` with +/// constant alpha `(1 << BITS) - 1`. P016 has its own kernel family — +/// never routed here. +/// +/// # Safety +/// +/// 1. **AVX2 must be available on the current CPU.** +/// 2. `width & 1 == 0`. +/// 3. `y.len() >= width`, `uv_half.len() >= width`, +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. +/// 4. `BITS` ∈ `{10, 12}`. +#[inline] +#[target_feature(enable = "avx2")] +pub(crate) unsafe fn p_n_to_rgb_or_rgba_u16_row( + y: &[u16], + uv_half: &[u16], + out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + const { assert!(BITS == 10 || BITS == 12) }; + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert_eq!(width & 1, 0); debug_assert!(y.len() >= width); debug_assert!(uv_half.len() >= width); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); @@ -1497,6 +1645,7 @@ pub(crate) unsafe fn p_n_to_rgb_u16_row( let cgv = _mm256_set1_epi32(coeffs.g_v()); let cbu = _mm256_set1_epi32(coeffs.b_u()); let cbv = _mm256_set1_epi32(coeffs.b_v()); + let alpha_u16 = _mm_set1_epi16(out_max); let mut x = 0usize; while x + 32 <= width { @@ -1550,44 +1699,77 @@ pub(crate) unsafe fn p_n_to_rgb_u16_row( let b_lo = clamp_u16_max_x16(_mm256_adds_epi16(y_scaled_lo, b_dup_lo), zero_v, max_v); let b_hi = clamp_u16_max_x16(_mm256_adds_epi16(y_scaled_hi, b_dup_hi), zero_v, max_v); - let dst = rgb_out.as_mut_ptr().add(x * 3); - write_rgb_u16_8( - _mm256_castsi256_si128(r_lo), - _mm256_castsi256_si128(g_lo), - _mm256_castsi256_si128(b_lo), - dst, - ); - write_rgb_u16_8( - _mm256_extracti128_si256::<1>(r_lo), - _mm256_extracti128_si256::<1>(g_lo), - _mm256_extracti128_si256::<1>(b_lo), - dst.add(24), - ); - write_rgb_u16_8( - _mm256_castsi256_si128(r_hi), - _mm256_castsi256_si128(g_hi), - _mm256_castsi256_si128(b_hi), - dst.add(48), - ); - write_rgb_u16_8( - _mm256_extracti128_si256::<1>(r_hi), - _mm256_extracti128_si256::<1>(g_hi), - _mm256_extracti128_si256::<1>(b_hi), - dst.add(72), - ); + if ALPHA { + let dst = out.as_mut_ptr().add(x * 4); + write_rgba_u16_8( + _mm256_castsi256_si128(r_lo), + _mm256_castsi256_si128(g_lo), + _mm256_castsi256_si128(b_lo), + alpha_u16, + dst, + ); + write_rgba_u16_8( + _mm256_extracti128_si256::<1>(r_lo), + _mm256_extracti128_si256::<1>(g_lo), + _mm256_extracti128_si256::<1>(b_lo), + alpha_u16, + dst.add(32), + ); + write_rgba_u16_8( + _mm256_castsi256_si128(r_hi), + _mm256_castsi256_si128(g_hi), + _mm256_castsi256_si128(b_hi), + alpha_u16, + dst.add(64), + ); + write_rgba_u16_8( + _mm256_extracti128_si256::<1>(r_hi), + _mm256_extracti128_si256::<1>(g_hi), + _mm256_extracti128_si256::<1>(b_hi), + alpha_u16, + dst.add(96), + ); + } else { + let dst = out.as_mut_ptr().add(x * 3); + write_rgb_u16_8( + _mm256_castsi256_si128(r_lo), + _mm256_castsi256_si128(g_lo), + _mm256_castsi256_si128(b_lo), + dst, + ); + write_rgb_u16_8( + _mm256_extracti128_si256::<1>(r_lo), + _mm256_extracti128_si256::<1>(g_lo), + _mm256_extracti128_si256::<1>(b_lo), + dst.add(24), + ); + write_rgb_u16_8( + _mm256_castsi256_si128(r_hi), + _mm256_castsi256_si128(g_hi), + _mm256_castsi256_si128(b_hi), + dst.add(48), + ); + write_rgb_u16_8( + _mm256_extracti128_si256::<1>(r_hi), + _mm256_extracti128_si256::<1>(g_hi), + _mm256_extracti128_si256::<1>(b_hi), + 
dst.add(72), + ); + } x += 32; } if x < width { - scalar::p_n_to_rgb_u16_row::( - &y[x..width], - &uv_half[x..width], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_uv = &uv_half[x..width]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::p_n_to_rgba_u16_row::(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + } else { + scalar::p_n_to_rgb_u16_row::(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + } } } } @@ -2855,17 +3037,72 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_u16_row( matrix: ColorMatrix, full_range: bool, ) { + unsafe { + yuv_420p16_to_rgb_or_rgba_u16_row::( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } +} + +/// AVX2 sibling of [`yuv_420p16_to_rgba_row`] for native-depth `u16` +/// output. Alpha is `0xFFFF`. +/// +/// # Safety +/// +/// Same as [`yuv_420p16_to_rgb_u16_row`] plus `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "avx2")] +pub(crate) unsafe fn yuv_420p16_to_rgba_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + unsafe { + yuv_420p16_to_rgb_or_rgba_u16_row::( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } +} + +/// Shared AVX2 16-bit YUV 4:2:0 → native-depth `u16` kernel. +/// `ALPHA = false` writes RGB triples; `ALPHA = true` writes RGBA +/// quads with constant alpha `0xFFFF`. +/// +/// # Safety +/// +/// 1. **AVX2 must be available.** +/// 2. `width & 1 == 0`. +/// 3. `y.len() >= width`, `u_half.len() >= width / 2`, +/// `v_half.len() >= width / 2`, +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. +#[inline] +#[target_feature(enable = "avx2")] +pub(crate) unsafe fn yuv_420p16_to_rgb_or_rgba_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert_eq!(width & 1, 0); debug_assert!(y.len() >= width); debug_assert!(u_half.len() >= width / 2); debug_assert!(v_half.len() >= width / 2); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::<16, 16>(full_range); const RND: i64 = 1 << 14; unsafe { + let alpha_u16 = _mm_set1_epi16(-1i16); let rnd_v = _mm256_set1_epi64x(RND); let y_off_v = _mm256_set1_epi32(y_off); let y_scale_v = _mm256_set1_epi32(y_scale); @@ -2952,34 +3189,57 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_u16_row( _mm256_add_epi32(y_hi_scaled, b_dup_hi), )); - // Write 16 pixels = 48 u16 via two 8-pixel helper calls. - let dst = rgb_out.as_mut_ptr().add(x * 3); - write_rgb_u16_8( - _mm256_castsi256_si128(r_u16), - _mm256_castsi256_si128(g_u16), - _mm256_castsi256_si128(b_u16), - dst, - ); - write_rgb_u16_8( - _mm256_extracti128_si256::<1>(r_u16), - _mm256_extracti128_si256::<1>(g_u16), - _mm256_extracti128_si256::<1>(b_u16), - dst.add(24), - ); + // Write 16 pixels via two 8-pixel helper calls. 
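+            // RGB: two 24-`u16` stores (48 per 16-pixel block);
+            // RGBA: two 32-`u16` stores (64 per block).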
+ if ALPHA { + let dst = out.as_mut_ptr().add(x * 4); + write_rgba_u16_8( + _mm256_castsi256_si128(r_u16), + _mm256_castsi256_si128(g_u16), + _mm256_castsi256_si128(b_u16), + alpha_u16, + dst, + ); + write_rgba_u16_8( + _mm256_extracti128_si256::<1>(r_u16), + _mm256_extracti128_si256::<1>(g_u16), + _mm256_extracti128_si256::<1>(b_u16), + alpha_u16, + dst.add(32), + ); + } else { + let dst = out.as_mut_ptr().add(x * 3); + write_rgb_u16_8( + _mm256_castsi256_si128(r_u16), + _mm256_castsi256_si128(g_u16), + _mm256_castsi256_si128(b_u16), + dst, + ); + write_rgb_u16_8( + _mm256_extracti128_si256::<1>(r_u16), + _mm256_extracti128_si256::<1>(g_u16), + _mm256_extracti128_si256::<1>(b_u16), + dst.add(24), + ); + } x += 16; } if x < width { - scalar::yuv_420p16_to_rgb_u16_row( - &y[x..width], - &u_half[x / 2..width / 2], - &v_half[x / 2..width / 2], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_u = &u_half[x / 2..width / 2]; + let tail_v = &v_half[x / 2..width / 2]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::yuv_420p16_to_rgba_u16_row( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); + } else { + scalar::yuv_420p16_to_rgb_u16_row( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); + } } } } @@ -3157,16 +3417,64 @@ pub(crate) unsafe fn p16_to_rgb_u16_row( matrix: ColorMatrix, full_range: bool, ) { + unsafe { + p16_to_rgb_or_rgba_u16_row::(y, uv_half, rgb_out, width, matrix, full_range); + } +} + +/// AVX2 sibling of [`p16_to_rgba_row`] for native-depth `u16` output. +/// Alpha is `0xFFFF`. +/// +/// # Safety +/// +/// Same as [`p16_to_rgb_u16_row`] plus `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "avx2")] +pub(crate) unsafe fn p16_to_rgba_u16_row( + y: &[u16], + uv_half: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + unsafe { + p16_to_rgb_or_rgba_u16_row::(y, uv_half, rgba_out, width, matrix, full_range); + } +} + +/// Shared AVX2 16-bit P016 → native-depth `u16` kernel. +/// `ALPHA = false` writes RGB triples; `ALPHA = true` writes RGBA +/// quads with constant alpha `0xFFFF`. +/// +/// # Safety +/// +/// 1. **AVX2 must be available.** +/// 2. `width & 1 == 0`. +/// 3. `y.len() >= width`, `uv_half.len() >= width`, +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. 
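+///
+/// Dispatch sketch (hypothetical caller; feature detection is the
+/// caller's responsibility):
+///
+/// ```ignore
+/// if is_x86_feature_detected!("avx2") {
+///     // SAFETY: AVX2 verified; width even; slice lengths hold.
+///     unsafe {
+///         p16_to_rgb_or_rgba_u16_row::<true>(&y, &uv, &mut rgba,
+///                                            w, m, fr);
+///     }
+/// } else {
+///     scalar::p16_to_rgba_u16_row(&y, &uv, &mut rgba, w, m, fr);
+/// }
+/// ```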
+#[inline] +#[target_feature(enable = "avx2")] +pub(crate) unsafe fn p16_to_rgb_or_rgba_u16_row( + y: &[u16], + uv_half: &[u16], + out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert_eq!(width & 1, 0); debug_assert!(y.len() >= width); debug_assert!(uv_half.len() >= width); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::<16, 16>(full_range); const RND: i64 = 1 << 14; unsafe { + let alpha_u16 = _mm_set1_epi16(-1i16); let rnd_v = _mm256_set1_epi64x(RND); let y_off_v = _mm256_set1_epi32(y_off); let y_scale_v = _mm256_set1_epi32(y_scale); @@ -3257,32 +3565,51 @@ pub(crate) unsafe fn p16_to_rgb_u16_row( _mm256_add_epi32(y_hi_scaled, b_dup_hi), )); - let dst = rgb_out.as_mut_ptr().add(x * 3); - write_rgb_u16_8( - _mm256_castsi256_si128(r_u16), - _mm256_castsi256_si128(g_u16), - _mm256_castsi256_si128(b_u16), - dst, - ); - write_rgb_u16_8( - _mm256_extracti128_si256::<1>(r_u16), - _mm256_extracti128_si256::<1>(g_u16), - _mm256_extracti128_si256::<1>(b_u16), - dst.add(24), - ); + if ALPHA { + let dst = out.as_mut_ptr().add(x * 4); + write_rgba_u16_8( + _mm256_castsi256_si128(r_u16), + _mm256_castsi256_si128(g_u16), + _mm256_castsi256_si128(b_u16), + alpha_u16, + dst, + ); + write_rgba_u16_8( + _mm256_extracti128_si256::<1>(r_u16), + _mm256_extracti128_si256::<1>(g_u16), + _mm256_extracti128_si256::<1>(b_u16), + alpha_u16, + dst.add(32), + ); + } else { + let dst = out.as_mut_ptr().add(x * 3); + write_rgb_u16_8( + _mm256_castsi256_si128(r_u16), + _mm256_castsi256_si128(g_u16), + _mm256_castsi256_si128(b_u16), + dst, + ); + write_rgb_u16_8( + _mm256_extracti128_si256::<1>(r_u16), + _mm256_extracti128_si256::<1>(g_u16), + _mm256_extracti128_si256::<1>(b_u16), + dst.add(24), + ); + } x += 16; } if x < width { - scalar::p16_to_rgb_u16_row( - &y[x..width], - &uv_half[x..width], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_uv = &uv_half[x..width]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::p16_to_rgba_u16_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + } else { + scalar::p16_to_rgb_u16_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + } } } } diff --git a/src/row/arch/x86_avx2/tests.rs b/src/row/arch/x86_avx2/tests.rs index f8f28eb..37bb6b8 100644 --- a/src/row/arch/x86_avx2/tests.rs +++ b/src/row/arch/x86_avx2/tests.rs @@ -1779,6 +1779,200 @@ fn avx2_p016_rgba_matches_scalar_all_matrices() { } } +// ---- High-bit 4:2:0 native-depth `u16` RGBA equivalence (Ship 8 Tranche 5b) ---- + +fn check_planar_u16_avx2_rgba_equivalence_n( + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let y = planar_n_plane::(width, 37); + let u = planar_n_plane::(width / 2, 53); + let v = planar_n_plane::(width / 2, 71); + let mut rgba_scalar = std::vec![0u16; width * 4]; + let mut rgba_simd = std::vec![0u16; width * 4]; + scalar::yuv_420p_n_to_rgba_u16_row::( + &y, + &u, + &v, + &mut rgba_scalar, + width, + matrix, + full_range, + ); + unsafe { + yuv_420p_n_to_rgba_u16_row::(&y, &u, &v, &mut rgba_simd, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_simd, + "AVX2 yuv_420p_n<{BITS}>→RGBA u16 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +fn check_pn_u16_avx2_rgba_equivalence_n( 
+ width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let y = p_n_packed_plane::(width, 37); + let u = p_n_packed_plane::(width / 2, 53); + let v = p_n_packed_plane::(width / 2, 71); + let uv = p010_uv_interleave(&u, &v); + let mut rgba_scalar = std::vec![0u16; width * 4]; + let mut rgba_simd = std::vec![0u16; width * 4]; + scalar::p_n_to_rgba_u16_row::(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + unsafe { + p_n_to_rgba_u16_row::(&y, &uv, &mut rgba_simd, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_simd, + "AVX2 Pn<{BITS}>→RGBA u16 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +fn check_yuv420p16_u16_avx2_rgba_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { + let y = p16_plane_avx2(width, 37); + let u = p16_plane_avx2(width / 2, 53); + let v = p16_plane_avx2(width / 2, 71); + let mut rgba_scalar = std::vec![0u16; width * 4]; + let mut rgba_simd = std::vec![0u16; width * 4]; + scalar::yuv_420p16_to_rgba_u16_row(&y, &u, &v, &mut rgba_scalar, width, matrix, full_range); + unsafe { + yuv_420p16_to_rgba_u16_row(&y, &u, &v, &mut rgba_simd, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_simd, + "AVX2 yuv_420p16→RGBA u16 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +fn check_p16_u16_avx2_rgba_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { + let y = p16_plane_avx2(width, 37); + let u = p16_plane_avx2(width / 2, 53); + let v = p16_plane_avx2(width / 2, 71); + let uv = p010_uv_interleave(&u, &v); + let mut rgba_scalar = std::vec![0u16; width * 4]; + let mut rgba_simd = std::vec![0u16; width * 4]; + scalar::p16_to_rgba_u16_row(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + unsafe { + p16_to_rgba_u16_row(&y, &uv, &mut rgba_simd, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_simd, + "AVX2 P016→RGBA u16 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +#[test] +fn avx2_yuv420p_n_rgba_u16_matches_scalar_all_bits() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_planar_u16_avx2_rgba_equivalence_n::<9>(32, m, full); + check_planar_u16_avx2_rgba_equivalence_n::<10>(32, m, full); + check_planar_u16_avx2_rgba_equivalence_n::<12>(32, m, full); + check_planar_u16_avx2_rgba_equivalence_n::<14>(32, m, full); + } + } +} + +#[test] +fn avx2_yuv420p_n_rgba_u16_matches_scalar_tail_and_1920() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + for w in [34usize, 48, 62, 1920, 1922] { + check_planar_u16_avx2_rgba_equivalence_n::<9>(w, ColorMatrix::Bt601, false); + check_planar_u16_avx2_rgba_equivalence_n::<10>(w, ColorMatrix::Bt709, true); + check_planar_u16_avx2_rgba_equivalence_n::<12>(w, ColorMatrix::Bt2020Ncl, false); + check_planar_u16_avx2_rgba_equivalence_n::<14>(w, ColorMatrix::YCgCo, true); + } +} + +#[test] +fn avx2_pn_rgba_u16_matches_scalar_all_bits() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_pn_u16_avx2_rgba_equivalence_n::<10>(32, m, full); + check_pn_u16_avx2_rgba_equivalence_n::<12>(32, m, full); + } + } +} + +#[test] +fn 
avx2_pn_rgba_u16_matches_scalar_tail_and_1920() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + for w in [34usize, 48, 62, 1920, 1922] { + check_pn_u16_avx2_rgba_equivalence_n::<10>(w, ColorMatrix::Bt601, false); + check_pn_u16_avx2_rgba_equivalence_n::<12>(w, ColorMatrix::Bt709, true); + } +} + +#[test] +fn avx2_yuv420p16_rgba_u16_matches_scalar_all_matrices() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_yuv420p16_u16_avx2_rgba_equivalence(32, m, full); + } + } + for w in [34usize, 48, 62, 1920, 1922] { + check_yuv420p16_u16_avx2_rgba_equivalence(w, ColorMatrix::Bt709, false); + } +} + +#[test] +fn avx2_p016_rgba_u16_matches_scalar_all_matrices() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_p16_u16_avx2_rgba_equivalence(32, m, full); + } + } + for w in [34usize, 48, 62, 1920, 1922] { + check_p16_u16_avx2_rgba_equivalence(w, ColorMatrix::Bt709, false); + } +} + // ---- Pn 4:4:4 (P410 / P412 / P416) AVX2 equivalence ----------------- fn high_bit_plane_avx2(n: usize, seed: usize) -> std::vec::Vec { diff --git a/src/row/arch/x86_avx512.rs b/src/row/arch/x86_avx512.rs index 48e7cfa..8138c70 100644 --- a/src/row/arch/x86_avx512.rs +++ b/src/row/arch/x86_avx512.rs @@ -59,6 +59,7 @@ use crate::{ row::{ arch::x86_common::{ rgb_to_hsv_16_pixels, swap_rb_16_pixels, write_rgb_16, write_rgb_u16_8, write_rgba_16, + write_rgba_u16_8, }, scalar, }, @@ -522,13 +523,70 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_u16_row( width: usize, matrix: ColorMatrix, full_range: bool, +) { + unsafe { + yuv_420p_n_to_rgb_or_rgba_u16_row::( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } +} + +/// AVX-512 sibling of [`yuv_420p_n_to_rgba_row`] for native-depth +/// `u16` output. Alpha samples are `(1 << BITS) - 1` (opaque maximum +/// at the input bit depth). +/// +/// # Safety +/// +/// Same as [`yuv_420p_n_to_rgb_u16_row`] plus `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "avx512f,avx512bw")] +pub(crate) unsafe fn yuv_420p_n_to_rgba_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + unsafe { + yuv_420p_n_to_rgb_or_rgba_u16_row::( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } +} + +/// Shared AVX-512 high-bit YUV 4:2:0 → native-depth `u16` kernel. +/// `ALPHA = false` writes RGB triples via 8× `write_quarter` per +/// 64-pixel block; `ALPHA = true` writes RGBA quads via 8× +/// `write_quarter_rgba` with constant alpha `(1 << BITS) - 1`. +/// +/// # Safety +/// +/// 1. **AVX-512F + AVX-512BW must be available on the current CPU.** +/// 2. `width & 1 == 0`. +/// 3. `y.len() >= width`, `u_half.len() >= width / 2`, +/// `v_half.len() >= width / 2`, +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. +/// 4. `BITS` ∈ `{9, 10, 12, 14}`. 
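Safety item 4 is not just documentation: the kernel body opens with a `const { assert!(…) }` block, so instantiating an unsupported `BITS` fails at compile (monomorphization) time rather than at runtime. A reduced model of that guard — `demo_kernel` is illustrative, not part of the crate:

```rust
// Unsupported bit depths are rejected when the generic is
// instantiated: the const block is evaluated per monomorphization.
fn demo_kernel<const BITS: usize>() -> u16 {
    const { assert!(BITS == 9 || BITS == 10 || BITS == 12 || BITS == 14) };
    ((1u32 << BITS) - 1) as u16 // the opaque-alpha maximum at this depth
}

fn main() {
    assert_eq!(demo_kernel::<10>(), 1023);
    assert_eq!(demo_kernel::<14>(), 16383);
    // demo_kernel::<11>() would not compile: the const block panics
    // during constant evaluation, failing the build.
}
```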
+#[inline] +#[target_feature(enable = "avx512f,avx512bw")] +pub(crate) unsafe fn yuv_420p_n_to_rgb_or_rgba_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, ) { const { assert!(BITS == 9 || BITS == 10 || BITS == 12 || BITS == 14) }; + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert_eq!(width & 1, 0); debug_assert!(y.len() >= width); debug_assert!(u_half.len() >= width / 2); debug_assert!(v_half.len() >= width / 2); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); @@ -552,6 +610,7 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_u16_row( let cgv = _mm512_set1_epi32(coeffs.g_v()); let cbu = _mm512_set1_epi32(coeffs.b_u()); let cbv = _mm512_set1_epi32(coeffs.b_v()); + let alpha_u16 = _mm_set1_epi16(out_max); let pack_fixup = _mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7); let dup_lo_idx = _mm512_setr_epi64(0, 1, 8, 9, 2, 3, 10, 11); @@ -618,29 +677,46 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_u16_row( // Eight 8‑pixel u16 writes per 64‑pixel block. For each i16x32 // channel vector we extract four 128‑bit quarters and hand each // to the shared SSE4.1 u16 interleave helper. - let dst = rgb_out.as_mut_ptr().add(x * 3); - write_quarter(r_lo, g_lo, b_lo, 0, dst); - write_quarter(r_lo, g_lo, b_lo, 1, dst.add(24)); - write_quarter(r_lo, g_lo, b_lo, 2, dst.add(48)); - write_quarter(r_lo, g_lo, b_lo, 3, dst.add(72)); - write_quarter(r_hi, g_hi, b_hi, 0, dst.add(96)); - write_quarter(r_hi, g_hi, b_hi, 1, dst.add(120)); - write_quarter(r_hi, g_hi, b_hi, 2, dst.add(144)); - write_quarter(r_hi, g_hi, b_hi, 3, dst.add(168)); + if ALPHA { + let dst = out.as_mut_ptr().add(x * 4); + write_quarter_rgba(r_lo, g_lo, b_lo, alpha_u16, 0, dst); + write_quarter_rgba(r_lo, g_lo, b_lo, alpha_u16, 1, dst.add(32)); + write_quarter_rgba(r_lo, g_lo, b_lo, alpha_u16, 2, dst.add(64)); + write_quarter_rgba(r_lo, g_lo, b_lo, alpha_u16, 3, dst.add(96)); + write_quarter_rgba(r_hi, g_hi, b_hi, alpha_u16, 0, dst.add(128)); + write_quarter_rgba(r_hi, g_hi, b_hi, alpha_u16, 1, dst.add(160)); + write_quarter_rgba(r_hi, g_hi, b_hi, alpha_u16, 2, dst.add(192)); + write_quarter_rgba(r_hi, g_hi, b_hi, alpha_u16, 3, dst.add(224)); + } else { + let dst = out.as_mut_ptr().add(x * 3); + write_quarter(r_lo, g_lo, b_lo, 0, dst); + write_quarter(r_lo, g_lo, b_lo, 1, dst.add(24)); + write_quarter(r_lo, g_lo, b_lo, 2, dst.add(48)); + write_quarter(r_lo, g_lo, b_lo, 3, dst.add(72)); + write_quarter(r_hi, g_hi, b_hi, 0, dst.add(96)); + write_quarter(r_hi, g_hi, b_hi, 1, dst.add(120)); + write_quarter(r_hi, g_hi, b_hi, 2, dst.add(144)); + write_quarter(r_hi, g_hi, b_hi, 3, dst.add(168)); + } x += 64; } if x < width { - scalar::yuv_420p_n_to_rgb_u16_row::( - &y[x..width], - &u_half[x / 2..width / 2], - &v_half[x / 2..width / 2], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_u = &u_half[x / 2..width / 2]; + let tail_v = &v_half[x / 2..width / 2]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::yuv_420p_n_to_rgba_u16_row::( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); + } else { + scalar::yuv_420p_n_to_rgb_u16_row::( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); + } } } } @@ -696,6 +772,53 @@ unsafe fn write_quarter(r: __m512i, g: __m512i, b: 
__m512i, idx: u8, ptr: *mut u } } +/// RGBA sibling of [`write_quarter`]. Extracts one 128‑bit quarter of +/// each `i16x32` channel vector and hands it (plus a splatted alpha) +/// to [`write_rgba_u16_8`]. +/// +/// # Safety +/// +/// Same as [`write_rgba_u16_8`] — `ptr` must point to at least 64 +/// writable bytes (32 `u16`). Caller's `target_feature` must include +/// AVX‑512F + AVX‑512BW (so `_mm512_extracti32x4_epi32` is available) +/// and SSE2 (for the underlying unpack/store inside +/// `write_rgba_u16_8`). +#[inline(always)] +unsafe fn write_quarter_rgba( + r: __m512i, + g: __m512i, + b: __m512i, + a: __m128i, + idx: u8, + ptr: *mut u16, +) { + unsafe { + let (rq, gq, bq) = match idx { + 0 => ( + _mm512_extracti32x4_epi32::<0>(r), + _mm512_extracti32x4_epi32::<0>(g), + _mm512_extracti32x4_epi32::<0>(b), + ), + 1 => ( + _mm512_extracti32x4_epi32::<1>(r), + _mm512_extracti32x4_epi32::<1>(g), + _mm512_extracti32x4_epi32::<1>(b), + ), + 2 => ( + _mm512_extracti32x4_epi32::<2>(r), + _mm512_extracti32x4_epi32::<2>(g), + _mm512_extracti32x4_epi32::<2>(b), + ), + _ => ( + _mm512_extracti32x4_epi32::<3>(r), + _mm512_extracti32x4_epi32::<3>(g), + _mm512_extracti32x4_epi32::<3>(b), + ), + }; + write_rgba_u16_8(rq, gq, bq, a, ptr); + } +} + /// AVX-512 YUV 4:4:4 planar 10/12/14-bit → packed **u8** RGB. /// Const-generic over `BITS ∈ {10, 12, 14}`. Block size 64 pixels. /// @@ -1530,10 +1653,62 @@ pub(crate) unsafe fn p_n_to_rgb_u16_row( matrix: ColorMatrix, full_range: bool, ) { + unsafe { + p_n_to_rgb_or_rgba_u16_row::(y, uv_half, rgb_out, width, matrix, full_range); + } +} + +/// AVX-512 sibling of [`p_n_to_rgba_row`] for native-depth `u16` +/// output. Alpha samples are `(1 << BITS) - 1` (opaque maximum at the +/// input bit depth). P016 has its own kernel family — never routed here. +/// +/// # Safety +/// +/// Same as [`p_n_to_rgb_u16_row`] plus `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "avx512f,avx512bw")] +pub(crate) unsafe fn p_n_to_rgba_u16_row( + y: &[u16], + uv_half: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + unsafe { + p_n_to_rgb_or_rgba_u16_row::(y, uv_half, rgba_out, width, matrix, full_range); + } +} + +/// Shared AVX-512 Pn → native-depth `u16` kernel. `ALPHA = false` +/// writes RGB triples via 8× `write_quarter` per 64-pixel block; +/// `ALPHA = true` writes RGBA quads via 8× `write_quarter_rgba` with +/// constant alpha `(1 << BITS) - 1`. P016 has its own kernel family — +/// never routed here. +/// +/// # Safety +/// +/// 1. **AVX-512F + AVX-512BW must be available on the current CPU.** +/// 2. `width & 1 == 0`. +/// 3. `y.len() >= width`, `uv_half.len() >= width`, +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. +/// 4. `BITS` ∈ `{10, 12}`. 
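The `match idx` inside `write_quarter_rgba` above exists because `_mm512_extracti32x4_epi32` takes its lane index as a const generic, so a runtime index must be folded onto four monomorphized arms. A portable sketch of the same dispatch shape, with a plain array standing in for the 512-bit register:

```rust
// Const-indexed extraction of one 8-lane quarter of a 32-lane vector.
fn quarter<const IDX: usize>(v: &[u16; 32]) -> [u16; 8] {
    let mut q = [0u16; 8];
    q.copy_from_slice(&v[IDX * 8..IDX * 8 + 8]);
    q
}

// Runtime index folded onto the four const instantiations, exactly
// like the kernel's `match idx`.
fn quarter_dyn(v: &[u16; 32], idx: u8) -> [u16; 8] {
    match idx {
        0 => quarter::<0>(v),
        1 => quarter::<1>(v),
        2 => quarter::<2>(v),
        _ => quarter::<3>(v),
    }
}

fn main() {
    let v: [u16; 32] = core::array::from_fn(|i| i as u16);
    assert_eq!(quarter_dyn(&v, 2), [16, 17, 18, 19, 20, 21, 22, 23]);
}
```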
+#[inline]
+#[target_feature(enable = "avx512f,avx512bw")]
+pub(crate) unsafe fn p_n_to_rgb_or_rgba_u16_row<const BITS: usize, const ALPHA: bool>(
+    y: &[u16],
+    uv_half: &[u16],
+    out: &mut [u16],
+    width: usize,
+    matrix: ColorMatrix,
+    full_range: bool,
+) {
+    const { assert!(BITS == 10 || BITS == 12) };
+    let bpp: usize = if ALPHA { 4 } else { 3 };
     debug_assert_eq!(width & 1, 0);
     debug_assert!(y.len() >= width);
     debug_assert!(uv_half.len() >= width);
-    debug_assert!(rgb_out.len() >= width * 3);
+    debug_assert!(out.len() >= width * bpp);
 
     let coeffs = scalar::Coefficients::for_matrix(matrix);
     let (y_off, y_scale, c_scale) = scalar::range_params_n::<BITS, BITS>(full_range);
@@ -1558,6 +1733,7 @@ pub(crate) unsafe fn p_n_to_rgb_u16_row(
         let cgv = _mm512_set1_epi32(coeffs.g_v());
         let cbu = _mm512_set1_epi32(coeffs.b_u());
         let cbv = _mm512_set1_epi32(coeffs.b_v());
+        let alpha_u16 = _mm_set1_epi16(out_max);
 
         let pack_fixup = _mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7);
         let dup_lo_idx = _mm512_setr_epi64(0, 1, 8, 9, 2, 3, 10, 11);
@@ -1615,28 +1791,41 @@ pub(crate) unsafe fn p_n_to_rgb_u16_row(
         let b_lo = clamp_u16_max_x32(_mm512_adds_epi16(y_scaled_lo, b_dup_lo), zero_v, max_v);
         let b_hi = clamp_u16_max_x32(_mm512_adds_epi16(y_scaled_hi, b_dup_hi), zero_v, max_v);
 
-        let dst = rgb_out.as_mut_ptr().add(x * 3);
-        write_quarter(r_lo, g_lo, b_lo, 0, dst);
-        write_quarter(r_lo, g_lo, b_lo, 1, dst.add(24));
-        write_quarter(r_lo, g_lo, b_lo, 2, dst.add(48));
-        write_quarter(r_lo, g_lo, b_lo, 3, dst.add(72));
-        write_quarter(r_hi, g_hi, b_hi, 0, dst.add(96));
-        write_quarter(r_hi, g_hi, b_hi, 1, dst.add(120));
-        write_quarter(r_hi, g_hi, b_hi, 2, dst.add(144));
-        write_quarter(r_hi, g_hi, b_hi, 3, dst.add(168));
+        if ALPHA {
+            let dst = out.as_mut_ptr().add(x * 4);
+            write_quarter_rgba(r_lo, g_lo, b_lo, alpha_u16, 0, dst);
+            write_quarter_rgba(r_lo, g_lo, b_lo, alpha_u16, 1, dst.add(32));
+            write_quarter_rgba(r_lo, g_lo, b_lo, alpha_u16, 2, dst.add(64));
+            write_quarter_rgba(r_lo, g_lo, b_lo, alpha_u16, 3, dst.add(96));
+            write_quarter_rgba(r_hi, g_hi, b_hi, alpha_u16, 0, dst.add(128));
+            write_quarter_rgba(r_hi, g_hi, b_hi, alpha_u16, 1, dst.add(160));
+            write_quarter_rgba(r_hi, g_hi, b_hi, alpha_u16, 2, dst.add(192));
+            write_quarter_rgba(r_hi, g_hi, b_hi, alpha_u16, 3, dst.add(224));
+        } else {
+            let dst = out.as_mut_ptr().add(x * 3);
+            write_quarter(r_lo, g_lo, b_lo, 0, dst);
+            write_quarter(r_lo, g_lo, b_lo, 1, dst.add(24));
+            write_quarter(r_lo, g_lo, b_lo, 2, dst.add(48));
+            write_quarter(r_lo, g_lo, b_lo, 3, dst.add(72));
+            write_quarter(r_hi, g_hi, b_hi, 0, dst.add(96));
+            write_quarter(r_hi, g_hi, b_hi, 1, dst.add(120));
+            write_quarter(r_hi, g_hi, b_hi, 2, dst.add(144));
+            write_quarter(r_hi, g_hi, b_hi, 3, dst.add(168));
+        }
 
         x += 64;
     }
 
     if x < width {
-        scalar::p_n_to_rgb_u16_row::<BITS>(
-            &y[x..width],
-            &uv_half[x..width],
-            &mut rgb_out[x * 3..width * 3],
-            width - x,
-            matrix,
-            full_range,
-        );
+        let tail_y = &y[x..width];
+        let tail_uv = &uv_half[x..width];
+        let tail_out = &mut out[x * bpp..width * bpp];
+        let tail_w = width - x;
+        if ALPHA {
+            scalar::p_n_to_rgba_u16_row::<BITS>(tail_y, tail_uv, tail_out, tail_w, matrix, full_range);
+        } else {
+            scalar::p_n_to_rgb_u16_row::<BITS>(tail_y, tail_uv, tail_out, tail_w, matrix, full_range);
+        }
    }
    }
 }
@@ -2730,6 +2919,40 @@ unsafe fn write_rgb_u16_32(r: __m512i, g: __m512i, b: __m512i, ptr: *mut u16) {
     }
 }
 
+/// Writes 32 pixels of packed RGBA-u16 (128 u16 = 256 bytes) by
+/// splitting each u16x32 channel vector into four 128-bit quarters and
+/// calling the shared [`write_rgba_u16_8`] helper four times.
Alpha +/// is supplied as a single i16x8 vector splatted into all 32 alpha +/// lanes. +/// +/// # Safety +/// +/// `ptr` must point to at least 256 writable bytes. +#[inline(always)] +unsafe fn write_rgba_u16_32(r: __m512i, g: __m512i, b: __m512i, a: __m128i, ptr: *mut u16) { + unsafe { + let r0: __m128i = _mm512_castsi512_si128(r); + let r1: __m128i = _mm512_extracti32x4_epi32::<1>(r); + let r2: __m128i = _mm512_extracti32x4_epi32::<2>(r); + let r3: __m128i = _mm512_extracti32x4_epi32::<3>(r); + let g0: __m128i = _mm512_castsi512_si128(g); + let g1: __m128i = _mm512_extracti32x4_epi32::<1>(g); + let g2: __m128i = _mm512_extracti32x4_epi32::<2>(g); + let g3: __m128i = _mm512_extracti32x4_epi32::<3>(g); + let b0: __m128i = _mm512_castsi512_si128(b); + let b1: __m128i = _mm512_extracti32x4_epi32::<1>(b); + let b2: __m128i = _mm512_extracti32x4_epi32::<2>(b); + let b3: __m128i = _mm512_extracti32x4_epi32::<3>(b); + + // Each `write_rgba_u16_8` writes 8 pixels × 4 × u16 = 64 bytes = + // 32 u16 elements. Four calls → 128 u16 = 32 pixels. + write_rgba_u16_8(r0, g0, b0, a, ptr); + write_rgba_u16_8(r1, g1, b1, a, ptr.add(32)); + write_rgba_u16_8(r2, g2, b2, a, ptr.add(64)); + write_rgba_u16_8(r3, g3, b3, a, ptr.add(96)); + } +} + // ===== 16-bit YUV → RGB ================================================== /// `(Y_u16x32 - y_off) * y_scale + RND >> 15` for full u16 Y samples. @@ -2957,11 +3180,66 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_u16_row( matrix: ColorMatrix, full_range: bool, ) { + unsafe { + yuv_420p16_to_rgb_or_rgba_u16_row::( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } +} + +/// AVX-512 sibling of [`yuv_420p16_to_rgba_row`] for native-depth +/// `u16` output. Alpha is `0xFFFF`. +/// +/// # Safety +/// +/// Same as [`yuv_420p16_to_rgb_u16_row`] plus `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "avx512f,avx512bw")] +pub(crate) unsafe fn yuv_420p16_to_rgba_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + unsafe { + yuv_420p16_to_rgb_or_rgba_u16_row::( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } +} + +/// Shared AVX-512 16-bit YUV 4:2:0 → native-depth `u16` kernel. +/// `ALPHA = false` writes RGB triples via `write_rgb_u16_32`; +/// `ALPHA = true` writes RGBA quads via `write_rgba_u16_32` with +/// constant alpha `0xFFFF`. +/// +/// # Safety +/// +/// 1. **AVX-512F + AVX-512BW must be available.** +/// 2. `width & 1 == 0`. +/// 3. `y.len() >= width`, `u_half.len() >= width / 2`, +/// `v_half.len() >= width / 2`, +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. +#[inline] +#[target_feature(enable = "avx512f,avx512bw")] +pub(crate) unsafe fn yuv_420p16_to_rgb_or_rgba_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert_eq!(width & 1, 0); debug_assert!(y.len() >= width); debug_assert!(u_half.len() >= width / 2); debug_assert!(v_half.len() >= width / 2); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::<16, 16>(full_range); @@ -2972,6 +3250,7 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_u16_row( // adds below are bounded by `while x + 32 <= width` and the caller- // promised slice lengths. 
unsafe { + let alpha_u16 = _mm_set1_epi16(-1i16); let rnd_i64_v = _mm512_set1_epi64(RND_I64); let rnd_i32_v = _mm512_set1_epi32(RND_I32); let y_off_v = _mm512_set1_epi32(y_off); @@ -3081,22 +3360,31 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_u16_row( let g_u16 = _mm512_permutexvar_epi64(pack_fixup, _mm512_packus_epi32(g_lo_i32, g_hi_i32)); let b_u16 = _mm512_permutexvar_epi64(pack_fixup, _mm512_packus_epi32(b_lo_i32, b_hi_i32)); - // Write 32 pixels (96 u16) via 4× 8-pixel helper. - write_rgb_u16_32(r_u16, g_u16, b_u16, rgb_out.as_mut_ptr().add(x * 3)); + // Write 32 pixels via the appropriate 4× 8-pixel helper. + if ALPHA { + write_rgba_u16_32(r_u16, g_u16, b_u16, alpha_u16, out.as_mut_ptr().add(x * 4)); + } else { + write_rgb_u16_32(r_u16, g_u16, b_u16, out.as_mut_ptr().add(x * 3)); + } x += 32; } if x < width { - scalar::yuv_420p16_to_rgb_u16_row( - &y[x..width], - &u_half[x / 2..width / 2], - &v_half[x / 2..width / 2], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_u = &u_half[x / 2..width / 2]; + let tail_v = &v_half[x / 2..width / 2]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::yuv_420p16_to_rgba_u16_row( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); + } else { + scalar::yuv_420p16_to_rgb_u16_row( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); + } } } } @@ -3260,7 +3548,6 @@ pub(crate) unsafe fn p16_to_rgb_or_rgba_row( } /// AVX-512 P016 → packed **16-bit** RGB. -/// Delegates to SSE4.1 (i64 arithmetic). /// /// # Safety /// @@ -3276,34 +3563,59 @@ pub(crate) unsafe fn p16_to_rgb_u16_row( full_range: bool, ) { unsafe { - p16_to_rgb_u16_row_impl(y, uv_half, rgb_out, width, matrix, full_range); + p16_to_rgb_or_rgba_u16_row::(y, uv_half, rgb_out, width, matrix, full_range); } } -/// Native AVX-512 P016 → u16 kernel, 32 pixels per iter. Shares the -/// 16-bit u16 arithmetic structure with -/// [`yuv_420p16_to_rgb_u16_row`]; the only difference is an inline -/// U/V deinterleave at the UV load. +/// AVX-512 sibling of [`p16_to_rgba_row`] for native-depth `u16` +/// output. Alpha is `0xFFFF`. /// /// # Safety /// -/// Must be called from an AVX-512BW-enabled context. Caller upholds -/// `width & 1 == 0`, `y.len() >= width`, `uv_half.len() >= width`, -/// `rgb_out.len() >= 3 * width`. +/// Same as [`p16_to_rgb_u16_row`] plus `rgba_out.len() >= 4 * width`. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -unsafe fn p16_to_rgb_u16_row_impl( +pub(crate) unsafe fn p16_to_rgba_u16_row( y: &[u16], uv_half: &[u16], - rgb_out: &mut [u16], + rgba_out: &mut [u16], width: usize, matrix: ColorMatrix, full_range: bool, ) { + unsafe { + p16_to_rgb_or_rgba_u16_row::(y, uv_half, rgba_out, width, matrix, full_range); + } +} + +/// Shared AVX-512 16-bit P016 → native-depth `u16` kernel. +/// `ALPHA = false` writes RGB triples via `write_rgb_u16_32`; +/// `ALPHA = true` writes RGBA quads via `write_rgba_u16_32` with +/// constant alpha `0xFFFF`. 32 pixels per iter. Shares the 16-bit +/// arithmetic structure with [`yuv_420p16_to_rgb_or_rgba_u16_row`]; +/// only difference is an inline U/V deinterleave at the UV load. +/// +/// # Safety +/// +/// 1. **AVX-512F + AVX-512BW must be available on the current CPU.** +/// 2. `width & 1 == 0`. +/// 3. `y.len() >= width`, `uv_half.len() >= width`, +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. 
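That inline U/V deinterleave is the only structural delta from the planar kernel. In scalar terms the semi-planar chroma row is plain even/odd indexing — a small reference model (safe Rust, illustrative only):

```rust
// Semi-planar chroma layout consumed by the P016/Pn kernels: one
// interleaved row `U0 V0 U1 V1 ...`, shared by each 2x2 pixel block.
// The SIMD kernels split it in-register at the load.
fn deinterleave_uv(uv_half: &[u16]) -> (Vec<u16>, Vec<u16>) {
    let u: Vec<u16> = uv_half.iter().copied().step_by(2).collect();
    let v: Vec<u16> = uv_half.iter().skip(1).copied().step_by(2).collect();
    (u, v)
}

fn main() {
    let uv = [10u16, 20, 11, 21, 12, 22]; // U0 V0 U1 V1 U2 V2
    let (u, v) = deinterleave_uv(&uv);
    assert_eq!(u, [10, 11, 12]);
    assert_eq!(v, [20, 21, 22]);
}
```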
+#[inline] +#[target_feature(enable = "avx512f,avx512bw")] +pub(crate) unsafe fn p16_to_rgb_or_rgba_u16_row( + y: &[u16], + uv_half: &[u16], + out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert_eq!(width & 1, 0); debug_assert!(y.len() >= width); debug_assert!(uv_half.len() >= width); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::<16, 16>(full_range); @@ -3314,6 +3626,7 @@ unsafe fn p16_to_rgb_u16_row_impl( // adds are bounded by `while x + 32 <= width` and caller-promised // slice lengths. unsafe { + let alpha_u16 = _mm_set1_epi16(-1i16); let rnd_i64_v = _mm512_set1_epi64(RND_I64); let rnd_i32_v = _mm512_set1_epi32(RND_I32); let y_off_v = _mm512_set1_epi32(y_off); @@ -3409,20 +3722,25 @@ unsafe fn p16_to_rgb_u16_row_impl( let g_u16 = _mm512_permutexvar_epi64(pack_fixup, _mm512_packus_epi32(g_lo_i32, g_hi_i32)); let b_u16 = _mm512_permutexvar_epi64(pack_fixup, _mm512_packus_epi32(b_lo_i32, b_hi_i32)); - write_rgb_u16_32(r_u16, g_u16, b_u16, rgb_out.as_mut_ptr().add(x * 3)); + if ALPHA { + write_rgba_u16_32(r_u16, g_u16, b_u16, alpha_u16, out.as_mut_ptr().add(x * 4)); + } else { + write_rgb_u16_32(r_u16, g_u16, b_u16, out.as_mut_ptr().add(x * 3)); + } x += 32; } if x < width { - scalar::p16_to_rgb_u16_row( - &y[x..width], - &uv_half[x..width], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_uv = &uv_half[x..width]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::p16_to_rgba_u16_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + } else { + scalar::p16_to_rgb_u16_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + } } } } diff --git a/src/row/arch/x86_avx512/tests.rs b/src/row/arch/x86_avx512/tests.rs index 2591060..44ab725 100644 --- a/src/row/arch/x86_avx512/tests.rs +++ b/src/row/arch/x86_avx512/tests.rs @@ -1793,6 +1793,204 @@ fn avx512_p016_rgba_matches_scalar_all_matrices() { } } +// ---- High-bit 4:2:0 native-depth `u16` RGBA equivalence (Ship 8 Tranche 5b) ---- + +fn check_planar_u16_avx512_rgba_equivalence_n( + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let y = planar_n_plane::(width, 37); + let u = planar_n_plane::(width / 2, 53); + let v = planar_n_plane::(width / 2, 71); + let mut rgba_scalar = std::vec![0u16; width * 4]; + let mut rgba_simd = std::vec![0u16; width * 4]; + scalar::yuv_420p_n_to_rgba_u16_row::( + &y, + &u, + &v, + &mut rgba_scalar, + width, + matrix, + full_range, + ); + unsafe { + yuv_420p_n_to_rgba_u16_row::(&y, &u, &v, &mut rgba_simd, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_simd, + "AVX-512 yuv_420p_n<{BITS}>→RGBA u16 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +fn check_pn_u16_avx512_rgba_equivalence_n( + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let y = p_n_packed_plane::(width, 37); + let u = p_n_packed_plane::(width / 2, 53); + let v = p_n_packed_plane::(width / 2, 71); + let uv = p010_uv_interleave(&u, &v); + let mut rgba_scalar = std::vec![0u16; width * 4]; + let mut rgba_simd = std::vec![0u16; width * 4]; + scalar::p_n_to_rgba_u16_row::(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + unsafe { + p_n_to_rgba_u16_row::(&y, &uv, &mut rgba_simd, width, matrix, full_range); + 
} + assert_eq!( + rgba_scalar, rgba_simd, + "AVX-512 Pn<{BITS}>→RGBA u16 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +fn check_yuv420p16_u16_avx512_rgba_equivalence( + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let y = p16_plane_avx512(width, 37); + let u = p16_plane_avx512(width / 2, 53); + let v = p16_plane_avx512(width / 2, 71); + let mut rgba_scalar = std::vec![0u16; width * 4]; + let mut rgba_simd = std::vec![0u16; width * 4]; + scalar::yuv_420p16_to_rgba_u16_row(&y, &u, &v, &mut rgba_scalar, width, matrix, full_range); + unsafe { + yuv_420p16_to_rgba_u16_row(&y, &u, &v, &mut rgba_simd, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_simd, + "AVX-512 yuv_420p16→RGBA u16 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +fn check_p16_u16_avx512_rgba_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { + let y = p16_plane_avx512(width, 37); + let u = p16_plane_avx512(width / 2, 53); + let v = p16_plane_avx512(width / 2, 71); + let uv = p010_uv_interleave(&u, &v); + let mut rgba_scalar = std::vec![0u16; width * 4]; + let mut rgba_simd = std::vec![0u16; width * 4]; + scalar::p16_to_rgba_u16_row(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + unsafe { + p16_to_rgba_u16_row(&y, &uv, &mut rgba_simd, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_simd, + "AVX-512 P016→RGBA u16 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +#[test] +fn avx512_yuv420p_n_rgba_u16_matches_scalar_all_bits() { + if !std::arch::is_x86_feature_detected!("avx512bw") { + return; + } + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_planar_u16_avx512_rgba_equivalence_n::<9>(64, m, full); + check_planar_u16_avx512_rgba_equivalence_n::<10>(64, m, full); + check_planar_u16_avx512_rgba_equivalence_n::<12>(64, m, full); + check_planar_u16_avx512_rgba_equivalence_n::<14>(64, m, full); + } + } +} + +#[test] +fn avx512_yuv420p_n_rgba_u16_matches_scalar_tail_and_1920() { + if !std::arch::is_x86_feature_detected!("avx512bw") { + return; + } + for w in [66usize, 96, 126, 1920, 1922] { + check_planar_u16_avx512_rgba_equivalence_n::<9>(w, ColorMatrix::Bt601, false); + check_planar_u16_avx512_rgba_equivalence_n::<10>(w, ColorMatrix::Bt709, true); + check_planar_u16_avx512_rgba_equivalence_n::<12>(w, ColorMatrix::Bt2020Ncl, false); + check_planar_u16_avx512_rgba_equivalence_n::<14>(w, ColorMatrix::YCgCo, true); + } +} + +#[test] +fn avx512_pn_rgba_u16_matches_scalar_all_bits() { + if !std::arch::is_x86_feature_detected!("avx512bw") { + return; + } + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_pn_u16_avx512_rgba_equivalence_n::<10>(64, m, full); + check_pn_u16_avx512_rgba_equivalence_n::<12>(64, m, full); + } + } +} + +#[test] +fn avx512_pn_rgba_u16_matches_scalar_tail_and_1920() { + if !std::arch::is_x86_feature_detected!("avx512bw") { + return; + } + for w in [66usize, 96, 126, 1920, 1922] { + check_pn_u16_avx512_rgba_equivalence_n::<10>(w, ColorMatrix::Bt601, false); + check_pn_u16_avx512_rgba_equivalence_n::<12>(w, ColorMatrix::Bt709, true); + } +} + +#[test] +fn avx512_yuv420p16_rgba_u16_matches_scalar_all_matrices() { + if 
!std::arch::is_x86_feature_detected!("avx512bw") { + return; + } + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_yuv420p16_u16_avx512_rgba_equivalence(64, m, full); + } + } + for w in [66usize, 96, 126, 1920, 1922] { + check_yuv420p16_u16_avx512_rgba_equivalence(w, ColorMatrix::Bt709, false); + } +} + +#[test] +fn avx512_p016_rgba_u16_matches_scalar_all_matrices() { + if !std::arch::is_x86_feature_detected!("avx512bw") { + return; + } + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_p16_u16_avx512_rgba_equivalence(64, m, full); + } + } + for w in [66usize, 96, 126, 1920, 1922] { + check_p16_u16_avx512_rgba_equivalence(w, ColorMatrix::Bt709, false); + } +} + // ---- Pn 4:4:4 (P410 / P412 / P416) AVX-512 equivalence ------------- fn high_bit_plane_avx512(n: usize, seed: usize) -> std::vec::Vec { diff --git a/src/row/arch/x86_common.rs b/src/row/arch/x86_common.rs index 7cf7aca..d7ff79b 100644 --- a/src/row/arch/x86_common.rs +++ b/src/row/arch/x86_common.rs @@ -233,6 +233,42 @@ pub(super) unsafe fn write_rgb_u16_8(r: __m128i, g: __m128i, b: __m128i, ptr: *m } } +/// Interleaves 8 R/G/B/A `u16` samples into packed RGBA quads (32 +/// `u16` = 64 bytes). Two 16-bit unpack stages followed by two 32-bit +/// unpack stages produce four 16-byte chunks of `[R, G, B, A]` quads, +/// stored back-to-back via `_mm_storeu_si128`. +/// +/// # Safety +/// +/// - `ptr` must point to at least 64 writable bytes (aligned or +/// unaligned — we use `storeu`). +/// - The calling function must have SSE2 available (the unpack + +/// `storeu_si128` intrinsics; SSE4.1 / AVX2 / AVX-512 supersets all +/// satisfy this). +#[inline(always)] +pub(super) unsafe fn write_rgba_u16_8( + r: __m128i, + g: __m128i, + b: __m128i, + a: __m128i, + ptr: *mut u16, +) { + unsafe { + let rg_lo = _mm_unpacklo_epi16(r, g); + let rg_hi = _mm_unpackhi_epi16(r, g); + let ba_lo = _mm_unpacklo_epi16(b, a); + let ba_hi = _mm_unpackhi_epi16(b, a); + let q0 = _mm_unpacklo_epi32(rg_lo, ba_lo); + let q1 = _mm_unpackhi_epi32(rg_lo, ba_lo); + let q2 = _mm_unpacklo_epi32(rg_hi, ba_hi); + let q3 = _mm_unpackhi_epi32(rg_hi, ba_hi); + _mm_storeu_si128(ptr.cast(), q0); + _mm_storeu_si128(ptr.add(8).cast(), q1); + _mm_storeu_si128(ptr.add(16).cast(), q2); + _mm_storeu_si128(ptr.add(24).cast(), q3); + } +} + /// Swaps the outer two channels of 16 packed 3‑byte pixels (48 bytes /// in, 48 bytes out). Drives both BGR→RGB and RGB→BGR conversions /// since the transformation is self‑inverse. diff --git a/src/row/arch/x86_sse41.rs b/src/row/arch/x86_sse41.rs index 43400f9..a05ce4b 100644 --- a/src/row/arch/x86_sse41.rs +++ b/src/row/arch/x86_sse41.rs @@ -42,6 +42,7 @@ use crate::{ row::{ arch::x86_common::{ rgb_to_hsv_16_pixels, swap_rb_16_pixels, write_rgb_16, write_rgb_u16_8, write_rgba_16, + write_rgba_u16_8, }, scalar, }, @@ -473,10 +474,62 @@ pub(crate) unsafe fn p_n_to_rgb_u16_row( matrix: ColorMatrix, full_range: bool, ) { + unsafe { + p_n_to_rgb_or_rgba_u16_row::(y, uv_half, rgb_out, width, matrix, full_range); + } +} + +/// SSE4.1 sibling of [`p_n_to_rgba_row`] for native-depth `u16` output. +/// Alpha samples are `(1 << BITS) - 1` (opaque maximum at the input +/// bit depth). P016 has its own kernel family — never routed here. 
+/// +/// # Safety +/// +/// Same as [`p_n_to_rgb_u16_row`] plus `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "sse4.1")] +pub(crate) unsafe fn p_n_to_rgba_u16_row( + y: &[u16], + uv_half: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + unsafe { + p_n_to_rgb_or_rgba_u16_row::(y, uv_half, rgba_out, width, matrix, full_range); + } +} + +/// Shared SSE4.1 Pn → native-depth `u16` kernel. `ALPHA = false` +/// writes RGB triples via `write_rgb_u16_8`; `ALPHA = true` writes +/// RGBA quads via `write_rgba_u16_8` with constant alpha +/// `(1 << BITS) - 1`. P016 has its own kernel family — never routed +/// here. +/// +/// # Safety +/// +/// 1. **SSE4.1 must be available on the current CPU.** +/// 2. `width & 1 == 0`. +/// 3. `y.len() >= width`, `uv_half.len() >= width`, +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. +/// 4. `BITS` ∈ `{10, 12}`. +#[inline] +#[target_feature(enable = "sse4.1")] +pub(crate) unsafe fn p_n_to_rgb_or_rgba_u16_row( + y: &[u16], + uv_half: &[u16], + out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + const { assert!(BITS == 10 || BITS == 12) }; + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert_eq!(width & 1, 0); debug_assert!(y.len() >= width); debug_assert!(uv_half.len() >= width); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); @@ -502,6 +555,7 @@ pub(crate) unsafe fn p_n_to_rgb_u16_row( let cgv = _mm_set1_epi32(coeffs.g_v()); let cbu = _mm_set1_epi32(coeffs.b_u()); let cbv = _mm_set1_epi32(coeffs.b_v()); + let alpha_u16 = _mm_set1_epi16(out_max); let mut x = 0usize; while x + 16 <= width { @@ -545,21 +599,33 @@ pub(crate) unsafe fn p_n_to_rgb_u16_row( let b_lo = clamp_u16_max(_mm_adds_epi16(y_scaled_lo, b_dup_lo), zero_v, max_v); let b_hi = clamp_u16_max(_mm_adds_epi16(y_scaled_hi, b_dup_hi), zero_v, max_v); - write_rgb_u16_8(r_lo, g_lo, b_lo, rgb_out.as_mut_ptr().add(x * 3)); - write_rgb_u16_8(r_hi, g_hi, b_hi, rgb_out.as_mut_ptr().add(x * 3 + 24)); + if ALPHA { + write_rgba_u16_8(r_lo, g_lo, b_lo, alpha_u16, out.as_mut_ptr().add(x * 4)); + write_rgba_u16_8( + r_hi, + g_hi, + b_hi, + alpha_u16, + out.as_mut_ptr().add(x * 4 + 32), + ); + } else { + write_rgb_u16_8(r_lo, g_lo, b_lo, out.as_mut_ptr().add(x * 3)); + write_rgb_u16_8(r_hi, g_hi, b_hi, out.as_mut_ptr().add(x * 3 + 24)); + } x += 16; } if x < width { - scalar::p_n_to_rgb_u16_row::( - &y[x..width], - &uv_half[x..width], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_uv = &uv_half[x..width]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::p_n_to_rgba_u16_row::(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + } else { + scalar::p_n_to_rgb_u16_row::(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + } } } } @@ -830,13 +896,70 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_u16_row( width: usize, matrix: ColorMatrix, full_range: bool, +) { + unsafe { + yuv_420p_n_to_rgb_or_rgba_u16_row::( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } +} + +/// SSE4.1 sibling of [`yuv_420p_n_to_rgba_row`] for native-depth `u16` +/// output. Alpha samples are `(1 << BITS) - 1` (opaque maximum at the +/// input bit depth). 
+/// +/// # Safety +/// +/// Same as [`yuv_420p_n_to_rgb_u16_row`] plus `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "sse4.1")] +pub(crate) unsafe fn yuv_420p_n_to_rgba_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + unsafe { + yuv_420p_n_to_rgb_or_rgba_u16_row::( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } +} + +/// Shared SSE4.1 high-bit YUV 4:2:0 → native-depth `u16` kernel. +/// `ALPHA = false` writes RGB triples via `write_rgb_u16_8`; +/// `ALPHA = true` writes RGBA quads via `write_rgba_u16_8` with +/// constant alpha `(1 << BITS) - 1`. +/// +/// # Safety +/// +/// 1. **SSE4.1 must be available on the current CPU.** +/// 2. `width & 1 == 0`. +/// 3. `y.len() >= width`, `u_half.len() >= width / 2`, +/// `v_half.len() >= width / 2`, +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. +/// 4. `BITS` ∈ `{9, 10, 12, 14}`. +#[inline] +#[target_feature(enable = "sse4.1")] +pub(crate) unsafe fn yuv_420p_n_to_rgb_or_rgba_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, ) { const { assert!(BITS == 9 || BITS == 10 || BITS == 12 || BITS == 14) }; + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert_eq!(width & 1, 0); debug_assert!(y.len() >= width); debug_assert!(u_half.len() >= width / 2); debug_assert!(v_half.len() >= width / 2); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); @@ -860,6 +983,7 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_u16_row( let cgv = _mm_set1_epi32(coeffs.g_v()); let cbu = _mm_set1_epi32(coeffs.b_u()); let cbv = _mm_set1_epi32(coeffs.b_v()); + let alpha_u16 = _mm_set1_epi16(out_max); let mut x = 0usize; while x + 16 <= width { @@ -910,22 +1034,38 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_u16_row( let b_hi = clamp_u16_max(_mm_adds_epi16(y_scaled_hi, b_dup_hi), zero_v, max_v); // Two 8‑pixel u16 writes cover the 16‑pixel block. 
- write_rgb_u16_8(r_lo, g_lo, b_lo, rgb_out.as_mut_ptr().add(x * 3)); - write_rgb_u16_8(r_hi, g_hi, b_hi, rgb_out.as_mut_ptr().add(x * 3 + 24)); + if ALPHA { + write_rgba_u16_8(r_lo, g_lo, b_lo, alpha_u16, out.as_mut_ptr().add(x * 4)); + write_rgba_u16_8( + r_hi, + g_hi, + b_hi, + alpha_u16, + out.as_mut_ptr().add(x * 4 + 32), + ); + } else { + write_rgb_u16_8(r_lo, g_lo, b_lo, out.as_mut_ptr().add(x * 3)); + write_rgb_u16_8(r_hi, g_hi, b_hi, out.as_mut_ptr().add(x * 3 + 24)); + } x += 16; } if x < width { - scalar::yuv_420p_n_to_rgb_u16_row::( - &y[x..width], - &u_half[x / 2..width / 2], - &v_half[x / 2..width / 2], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_u = &u_half[x / 2..width / 2]; + let tail_v = &v_half[x / 2..width / 2]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::yuv_420p_n_to_rgba_u16_row::( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); + } else { + scalar::yuv_420p_n_to_rgb_u16_row::( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); + } } } } @@ -2460,17 +2600,72 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_u16_row( matrix: ColorMatrix, full_range: bool, ) { + unsafe { + yuv_420p16_to_rgb_or_rgba_u16_row::( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } +} + +/// SSE4.1 sibling of [`yuv_420p16_to_rgba_row`] for native-depth `u16` +/// output. Alpha is `0xFFFF`. +/// +/// # Safety +/// +/// Same as [`yuv_420p16_to_rgb_u16_row`] plus `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "sse4.1")] +pub(crate) unsafe fn yuv_420p16_to_rgba_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + unsafe { + yuv_420p16_to_rgb_or_rgba_u16_row::( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } +} + +/// Shared SSE4.1 16-bit YUV 4:2:0 → native-depth `u16` kernel. +/// `ALPHA = false` writes RGB triples; `ALPHA = true` writes RGBA +/// quads with constant alpha `0xFFFF`. +/// +/// # Safety +/// +/// 1. **SSE4.1 must be available on the current CPU.** +/// 2. `width & 1 == 0`. +/// 3. `y.len() >= width`, `u_half.len() >= width / 2`, +/// `v_half.len() >= width / 2`, +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. 
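A note on the constant alpha: the 16-bit kernel below splats `_mm_set1_epi16(-1i16)` because an all-ones `i16` reinterprets to `0xFFFF` in a `u16` lane, while the BITS-generic kernels splat `(1 << BITS) - 1`, which fits an `i16` lane at every supported depth. A plain-Rust sanity check of both facts:

```rust
fn main() {
    // All-ones i16 reinterprets to the 16-bit opaque maximum.
    assert_eq!(-1i16 as u16, 0xFFFF);
    // The native-depth opaque maxima fit an i16 lane for every
    // supported BITS, so splatting `out_max` as i16 is lossless.
    for bits in [9u32, 10, 12, 14] {
        let out_max = (1i32 << bits) - 1;
        assert!(out_max <= i16::MAX as i32);
        assert_eq!(out_max as i16 as u16, out_max as u16);
    }
}
```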
+#[inline] +#[target_feature(enable = "sse4.1")] +pub(crate) unsafe fn yuv_420p16_to_rgb_or_rgba_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert_eq!(width & 1, 0); debug_assert!(y.len() >= width); debug_assert!(u_half.len() >= width / 2); debug_assert!(v_half.len() >= width / 2); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::<16, 16>(full_range); const RND: i64 = 1 << 14; unsafe { + let alpha_u16 = _mm_set1_epi16(-1i16); let rnd_v = _mm_set1_epi64x(RND); let y_off_v = _mm_set1_epi32(y_off); let y_scale_v = _mm_set1_epi32(y_scale); @@ -2580,25 +2775,35 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_u16_row( _mm_add_epi32(y_hi_i32, b_dup_hi), ); - write_rgb_u16_8( - r_lo_u16, - g_lo_u16, - b_lo_u16, - rgb_out.as_mut_ptr().add(x * 3), - ); + if ALPHA { + write_rgba_u16_8( + r_lo_u16, + g_lo_u16, + b_lo_u16, + alpha_u16, + out.as_mut_ptr().add(x * 4), + ); + } else { + write_rgb_u16_8(r_lo_u16, g_lo_u16, b_lo_u16, out.as_mut_ptr().add(x * 3)); + } x += 8; } if x < width { - scalar::yuv_420p16_to_rgb_u16_row( - &y[x..width], - &u_half[x / 2..width / 2], - &v_half[x / 2..width / 2], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_u = &u_half[x / 2..width / 2]; + let tail_v = &v_half[x / 2..width / 2]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::yuv_420p16_to_rgba_u16_row( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); + } else { + scalar::yuv_420p16_to_rgb_u16_row( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); + } } } } @@ -2779,16 +2984,64 @@ pub(crate) unsafe fn p16_to_rgb_u16_row( matrix: ColorMatrix, full_range: bool, ) { + unsafe { + p16_to_rgb_or_rgba_u16_row::(y, uv_half, rgb_out, width, matrix, full_range); + } +} + +/// SSE4.1 sibling of [`p16_to_rgba_row`] for native-depth `u16` output. +/// Alpha is `0xFFFF`. +/// +/// # Safety +/// +/// Same as [`p16_to_rgb_u16_row`] plus `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "sse4.1")] +pub(crate) unsafe fn p16_to_rgba_u16_row( + y: &[u16], + uv_half: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + unsafe { + p16_to_rgb_or_rgba_u16_row::(y, uv_half, rgba_out, width, matrix, full_range); + } +} + +/// Shared SSE4.1 16-bit P016 → native-depth `u16` kernel. +/// `ALPHA = false` writes RGB triples; `ALPHA = true` writes RGBA +/// quads with constant alpha `0xFFFF`. +/// +/// # Safety +/// +/// 1. **SSE4.1 must be available on the current CPU.** +/// 2. `width & 1 == 0`. +/// 3. `y.len() >= width`, `uv_half.len() >= width`, +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. 
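All of the RGBA stores above and below funnel through `write_rgba_u16_8`; stripped of the unpack choreography, the layout it must produce is the obvious scalar interleave. A reference model the SIMD helper has to match element-for-element:

```rust
// Scalar picture of the `write_rgba_u16_8` output: 8 R/G/B samples
// plus one constant alpha fan out into 32 consecutive u16 as
// [R, G, B, A] quads.
fn interleave_rgba_u16_8(r: &[u16; 8], g: &[u16; 8], b: &[u16; 8], a: u16, out: &mut [u16; 32]) {
    for i in 0..8 {
        out[4 * i] = r[i];
        out[4 * i + 1] = g[i];
        out[4 * i + 2] = b[i];
        out[4 * i + 3] = a; // constant, fully opaque alpha
    }
}

fn main() {
    let (r, g, b) = ([1u16; 8], [2u16; 8], [3u16; 8]);
    let mut out = [0u16; 32];
    interleave_rgba_u16_8(&r, &g, &b, 0xFFFF, &mut out);
    assert_eq!(&out[..8], &[1, 2, 3, 0xFFFF, 1, 2, 3, 0xFFFF]);
}
```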
+#[inline] +#[target_feature(enable = "sse4.1")] +pub(crate) unsafe fn p16_to_rgb_or_rgba_u16_row( + y: &[u16], + uv_half: &[u16], + out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert_eq!(width & 1, 0); debug_assert!(y.len() >= width); debug_assert!(uv_half.len() >= width); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::<16, 16>(full_range); const RND: i64 = 1 << 14; unsafe { + let alpha_u16 = _mm_set1_epi16(-1i16); let rnd_v = _mm_set1_epi64x(RND); let y_off_v = _mm_set1_epi32(y_off); let y_scale_v = _mm_set1_epi32(y_scale); @@ -2886,19 +3139,24 @@ pub(crate) unsafe fn p16_to_rgb_u16_row( _mm_add_epi32(y_hi_i32, b_dup_hi), ); - write_rgb_u16_8(r_u16, g_u16, b_u16, rgb_out.as_mut_ptr().add(x * 3)); + if ALPHA { + write_rgba_u16_8(r_u16, g_u16, b_u16, alpha_u16, out.as_mut_ptr().add(x * 4)); + } else { + write_rgb_u16_8(r_u16, g_u16, b_u16, out.as_mut_ptr().add(x * 3)); + } x += 8; } if x < width { - scalar::p16_to_rgb_u16_row( - &y[x..width], - &uv_half[x..width], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_uv = &uv_half[x..width]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::p16_to_rgba_u16_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + } else { + scalar::p16_to_rgb_u16_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + } } } } diff --git a/src/row/arch/x86_sse41/tests.rs b/src/row/arch/x86_sse41/tests.rs index 284f2b3..f60f081 100644 --- a/src/row/arch/x86_sse41/tests.rs +++ b/src/row/arch/x86_sse41/tests.rs @@ -1815,6 +1815,204 @@ fn sse41_p016_rgba_matches_scalar_all_matrices() { } } +// ---- High-bit 4:2:0 native-depth `u16` RGBA equivalence (Ship 8 Tranche 5b) ---- +// +// u16 RGBA wrappers share the math of their u16 RGB siblings — only +// the store (and tail dispatch) branches on `ALPHA`, with alpha set to +// `(1 << BITS) - 1` for BITS-generic kernels and `0xFFFF` for 16-bit. 
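The sharing pattern that comment describes is easy to state in miniature: two thin wrappers, one const-generic body, and a store width as the only monomorphization difference. A safe scalar sketch (the per-pixel math is a stand-in, not this crate's conversion):

```rust
// One body, monomorphized twice; only the store branches on ALPHA.
fn row_impl<const ALPHA: bool>(y: &[u16], out: &mut [u16], alpha: u16) {
    let bpp = if ALPHA { 4 } else { 3 };
    for (i, &s) in y.iter().enumerate() {
        let px = &mut out[i * bpp..(i + 1) * bpp];
        px[0] = s; // stand-in math: real kernels write converted R/G/B
        px[1] = s;
        px[2] = s;
        if ALPHA {
            px[3] = alpha;
        }
    }
}

fn rgb_row(y: &[u16], out: &mut [u16]) {
    row_impl::<false>(y, out, 0);
}

fn rgba_row(y: &[u16], out: &mut [u16], alpha: u16) {
    row_impl::<true>(y, out, alpha);
}

fn main() {
    let y = [7u16, 8];
    let mut rgb = [0u16; 6];
    let mut rgba = [0u16; 8];
    rgb_row(&y, &mut rgb);
    rgba_row(&y, &mut rgba, 0x03FF); // (1 << 10) - 1 for 10-bit input
    assert_eq!(rgba[3], 0x03FF);
}
```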
+ +fn check_planar_u16_sse41_rgba_equivalence_n( + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let y = planar_n_plane::(width, 37); + let u = planar_n_plane::(width / 2, 53); + let v = planar_n_plane::(width / 2, 71); + let mut rgba_scalar = std::vec![0u16; width * 4]; + let mut rgba_simd = std::vec![0u16; width * 4]; + scalar::yuv_420p_n_to_rgba_u16_row::( + &y, + &u, + &v, + &mut rgba_scalar, + width, + matrix, + full_range, + ); + unsafe { + yuv_420p_n_to_rgba_u16_row::(&y, &u, &v, &mut rgba_simd, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_simd, + "SSE4.1 yuv_420p_n<{BITS}>→RGBA u16 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +fn check_pn_u16_sse41_rgba_equivalence_n( + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let y = p_n_packed_plane::(width, 37); + let u = p_n_packed_plane::(width / 2, 53); + let v = p_n_packed_plane::(width / 2, 71); + let uv = p010_uv_interleave(&u, &v); + let mut rgba_scalar = std::vec![0u16; width * 4]; + let mut rgba_simd = std::vec![0u16; width * 4]; + scalar::p_n_to_rgba_u16_row::(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + unsafe { + p_n_to_rgba_u16_row::(&y, &uv, &mut rgba_simd, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_simd, + "SSE4.1 Pn<{BITS}>→RGBA u16 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +fn check_yuv420p16_u16_sse41_rgba_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { + let y = p16_plane(width, 37); + let u = p16_plane(width / 2, 53); + let v = p16_plane(width / 2, 71); + let mut rgba_scalar = std::vec![0u16; width * 4]; + let mut rgba_simd = std::vec![0u16; width * 4]; + scalar::yuv_420p16_to_rgba_u16_row(&y, &u, &v, &mut rgba_scalar, width, matrix, full_range); + unsafe { + yuv_420p16_to_rgba_u16_row(&y, &u, &v, &mut rgba_simd, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_simd, + "SSE4.1 yuv_420p16→RGBA u16 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +fn check_p16_u16_sse41_rgba_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { + let y = p16_plane(width, 37); + let u = p16_plane(width / 2, 53); + let v = p16_plane(width / 2, 71); + let uv = p010_uv_interleave(&u, &v); + let mut rgba_scalar = std::vec![0u16; width * 4]; + let mut rgba_simd = std::vec![0u16; width * 4]; + scalar::p16_to_rgba_u16_row(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + unsafe { + p16_to_rgba_u16_row(&y, &uv, &mut rgba_simd, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_simd, + "SSE4.1 P016→RGBA u16 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +#[test] +fn sse41_yuv420p_n_rgba_u16_matches_scalar_all_bits() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_planar_u16_sse41_rgba_equivalence_n::<9>(16, m, full); + check_planar_u16_sse41_rgba_equivalence_n::<10>(16, m, full); + check_planar_u16_sse41_rgba_equivalence_n::<12>(16, m, full); + check_planar_u16_sse41_rgba_equivalence_n::<14>(16, m, full); + } + } +} + +#[test] +fn sse41_yuv420p_n_rgba_u16_matches_scalar_tail_and_1920() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + for w in [18usize, 30, 34, 1920, 1922] { + 
check_planar_u16_sse41_rgba_equivalence_n::<9>(w, ColorMatrix::Bt601, false); + check_planar_u16_sse41_rgba_equivalence_n::<10>(w, ColorMatrix::Bt709, true); + check_planar_u16_sse41_rgba_equivalence_n::<12>(w, ColorMatrix::Bt2020Ncl, false); + check_planar_u16_sse41_rgba_equivalence_n::<14>(w, ColorMatrix::YCgCo, true); + } +} + +#[test] +fn sse41_pn_rgba_u16_matches_scalar_all_bits() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_pn_u16_sse41_rgba_equivalence_n::<10>(16, m, full); + check_pn_u16_sse41_rgba_equivalence_n::<12>(16, m, full); + } + } +} + +#[test] +fn sse41_pn_rgba_u16_matches_scalar_tail_and_1920() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + for w in [18usize, 30, 34, 1920, 1922] { + check_pn_u16_sse41_rgba_equivalence_n::<10>(w, ColorMatrix::Bt601, false); + check_pn_u16_sse41_rgba_equivalence_n::<12>(w, ColorMatrix::Bt709, true); + } +} + +#[test] +fn sse41_yuv420p16_rgba_u16_matches_scalar_all_matrices() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_yuv420p16_u16_sse41_rgba_equivalence(16, m, full); + } + } + for w in [18usize, 30, 34, 1920, 1922] { + check_yuv420p16_u16_sse41_rgba_equivalence(w, ColorMatrix::Bt709, false); + } +} + +#[test] +fn sse41_p016_rgba_u16_matches_scalar_all_matrices() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_p16_u16_sse41_rgba_equivalence(16, m, full); + } + } + for w in [18usize, 30, 34, 1920, 1922] { + check_p16_u16_sse41_rgba_equivalence(w, ColorMatrix::Bt709, false); + } +} + // ---- Pn 4:4:4 (P410 / P412 / P416) SSE4.1 equivalence --------------- fn high_bit_plane_sse41(n: usize, seed: usize) -> std::vec::Vec { diff --git a/src/row/mod.rs b/src/row/mod.rs index d67ce1b..1ee3025 100644 --- a/src/row/mod.rs +++ b/src/row/mod.rs @@ -2661,10 +2661,9 @@ pub fn p016_to_rgb_u16_row( // ---- High-bit 4:2:0 RGBA dispatchers (Ship 8 Tranche 5) --------------- // -// u8 RGBA dispatchers route to per-arch SIMD kernels (Ship 8 Tranche -// 5a). u16 RGBA dispatchers stay scalar-only until the follow-up Ship -// 8 Tranche 5b PR adds u16 SIMD; their `use_simd` parameter is held in -// the signature so wiring 5b is a non-breaking change. +// Both u8 and native-depth `u16` RGBA dispatchers route to per-arch +// SIMD kernels (Ship 8 Tranches 5a + 5b). `use_simd = false` forces +// the scalar reference path on every dispatcher. /// Converts one row of **9-bit** YUV 4:2:0 to packed **8-bit** /// **RGBA** (`R, G, B, 0xFF`; alpha defaults to opaque since the @@ -2758,10 +2757,8 @@ pub fn yuv420p9_to_rgba_row( /// in the low bits of each `u16`); alpha element is `(1 << 9) - 1` /// (opaque maximum at the input bit depth). /// -/// See `scalar::yuv_420p_n_to_rgba_u16_row` for the reference. SIMD -/// per-arch routes land in the follow-up Ship 8 Tranche 5b PR — for -/// now this dispatcher always runs the scalar reference regardless of -/// `use_simd`. 
+/// See `scalar::yuv_420p_n_to_rgba_u16_row` for the reference. +/// `use_simd = false` forces the scalar reference path. #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] pub fn yuv420p9_to_rgba_u16_row( @@ -2781,7 +2778,61 @@ pub fn yuv420p9_to_rgba_u16_row( assert!(v_half.len() >= width / 2, "v_half row too short"); assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - let _ = use_simd; // SIMD per-arch routes land in Ship 8 Tranche 5b. + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_420p_n_to_rgba_u16_row::<9>(y, u_half, v_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_420p_n_to_rgba_u16_row::<9>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_420p_n_to_rgba_u16_row::<9>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_420p_n_to_rgba_u16_row::<9>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_420p_n_to_rgba_u16_row::<9>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + scalar::yuv_420p_n_to_rgba_u16_row::<9>(y, u_half, v_half, rgba_out, width, matrix, full_range); } @@ -2876,10 +2927,8 @@ pub fn yuv420p10_to_rgba_row( /// in the low bits of each `u16`); alpha element is `(1 << 10) - 1` /// (opaque maximum at the input bit depth). /// -/// See `scalar::yuv_420p_n_to_rgba_u16_row` for the reference. SIMD -/// per-arch routes land in the follow-up Ship 8 Tranche 5b PR — for -/// now this dispatcher always runs the scalar reference regardless of -/// `use_simd`. +/// See `scalar::yuv_420p_n_to_rgba_u16_row` for the reference. +/// `use_simd = false` forces the scalar reference path. #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] pub fn yuv420p10_to_rgba_u16_row( @@ -2899,7 +2948,61 @@ pub fn yuv420p10_to_rgba_u16_row( assert!(v_half.len() >= width / 2, "v_half row too short"); assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - let _ = use_simd; // SIMD per-arch routes land in Ship 8 Tranche 5b. + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_420p_n_to_rgba_u16_row::<10>(y, u_half, v_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_420p_n_to_rgba_u16_row::<10>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_420p_n_to_rgba_u16_row::<10>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. 
+ unsafe { + arch::x86_sse41::yuv_420p_n_to_rgba_u16_row::<10>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_420p_n_to_rgba_u16_row::<10>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + scalar::yuv_420p_n_to_rgba_u16_row::<10>(y, u_half, v_half, rgba_out, width, matrix, full_range); } @@ -2980,10 +3083,8 @@ pub fn p010_to_rgba_row( /// high-bit-packed) to **native-depth `u16`** packed **RGBA** — output /// is low-bit-packed; alpha element is `(1 << 10) - 1`. /// -/// See `scalar::p_n_to_rgba_u16_row::<10>` for the reference. SIMD -/// per-arch routes land in the follow-up Ship 8 Tranche 5b PR — for -/// now this dispatcher always runs the scalar reference regardless of -/// `use_simd`. +/// See `scalar::p_n_to_rgba_u16_row::<10>` for the reference. +/// `use_simd = false` forces the scalar reference path. #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] pub fn p010_to_rgba_u16_row( @@ -3001,7 +3102,53 @@ pub fn p010_to_rgba_u16_row( assert!(uv_half.len() >= width, "uv_half row too short"); assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - let _ = use_simd; // SIMD per-arch routes land in Ship 8 Tranche 5b. + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::p_n_to_rgba_u16_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::p_n_to_rgba_u16_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::p_n_to_rgba_u16_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::p_n_to_rgba_u16_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::p_n_to_rgba_u16_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + scalar::p_n_to_rgba_u16_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); } @@ -3096,10 +3243,8 @@ pub fn yuv420p12_to_rgba_row( /// in the low bits of each `u16`); alpha element is `(1 << 12) - 1` /// (opaque maximum at the input bit depth). /// -/// See `scalar::yuv_420p_n_to_rgba_u16_row` for the reference. SIMD -/// per-arch routes land in the follow-up Ship 8 Tranche 5b PR — for -/// now this dispatcher always runs the scalar reference regardless of -/// `use_simd`. +/// See `scalar::yuv_420p_n_to_rgba_u16_row` for the reference. +/// `use_simd = false` forces the scalar reference path. #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] pub fn yuv420p12_to_rgba_u16_row( @@ -3119,7 +3264,61 @@ pub fn yuv420p12_to_rgba_u16_row( assert!(v_half.len() >= width / 2, "v_half row too short"); assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - let _ = use_simd; // SIMD per-arch routes land in Ship 8 Tranche 5b. + if use_simd { + cfg_select! 
{ + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_420p_n_to_rgba_u16_row::<12>(y, u_half, v_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_420p_n_to_rgba_u16_row::<12>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_420p_n_to_rgba_u16_row::<12>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_420p_n_to_rgba_u16_row::<12>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_420p_n_to_rgba_u16_row::<12>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + scalar::yuv_420p_n_to_rgba_u16_row::<12>(y, u_half, v_half, rgba_out, width, matrix, full_range); } @@ -3214,10 +3413,8 @@ pub fn yuv420p14_to_rgba_row( /// in the low bits of each `u16`); alpha element is `(1 << 14) - 1` /// (opaque maximum at the input bit depth). /// -/// See `scalar::yuv_420p_n_to_rgba_u16_row` for the reference. SIMD -/// per-arch routes land in the follow-up Ship 8 Tranche 5b PR — for -/// now this dispatcher always runs the scalar reference regardless of -/// `use_simd`. +/// See `scalar::yuv_420p_n_to_rgba_u16_row` for the reference. +/// `use_simd = false` forces the scalar reference path. #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] pub fn yuv420p14_to_rgba_u16_row( @@ -3237,7 +3434,61 @@ pub fn yuv420p14_to_rgba_u16_row( assert!(v_half.len() >= width / 2, "v_half row too short"); assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - let _ = use_simd; // SIMD per-arch routes land in Ship 8 Tranche 5b. + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_420p_n_to_rgba_u16_row::<14>(y, u_half, v_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_420p_n_to_rgba_u16_row::<14>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_420p_n_to_rgba_u16_row::<14>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_420p_n_to_rgba_u16_row::<14>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. 
+ unsafe { + arch::wasm_simd128::yuv_420p_n_to_rgba_u16_row::<14>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + scalar::yuv_420p_n_to_rgba_u16_row::<14>(y, u_half, v_half, rgba_out, width, matrix, full_range); } @@ -3318,10 +3569,8 @@ pub fn p012_to_rgba_row( /// high-bit-packed) to **native-depth `u16`** packed **RGBA** — output /// is low-bit-packed; alpha element is `(1 << 12) - 1`. /// -/// See `scalar::p_n_to_rgba_u16_row::<12>` for the reference. SIMD -/// per-arch routes land in the follow-up Ship 8 Tranche 5b PR — for -/// now this dispatcher always runs the scalar reference regardless of -/// `use_simd`. +/// See `scalar::p_n_to_rgba_u16_row::<12>` for the reference. +/// `use_simd = false` forces the scalar reference path. #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] pub fn p012_to_rgba_u16_row( @@ -3339,7 +3588,53 @@ pub fn p012_to_rgba_u16_row( assert!(uv_half.len() >= width, "uv_half row too short"); assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - let _ = use_simd; // SIMD per-arch routes land in Ship 8 Tranche 5b. + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::p_n_to_rgba_u16_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::p_n_to_rgba_u16_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::p_n_to_rgba_u16_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::p_n_to_rgba_u16_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::p_n_to_rgba_u16_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + scalar::p_n_to_rgba_u16_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); } @@ -3420,8 +3715,8 @@ pub fn yuv420p16_to_rgba_row( /// /// Routes through the dedicated 16-bit u16-output scalar kernel /// (`scalar::yuv_420p16_to_rgba_u16_row`) — uses i64 chroma multiply -/// for the wider `coeff × u_d` product at 16 → 16-bit scaling. SIMD -/// per-arch routes land in the follow-up Ship 8 Tranche 5b PR. +/// for the wider `coeff × u_d` product at 16 → 16-bit scaling. +/// `use_simd = false` forces the scalar reference path. #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] pub fn yuv420p16_to_rgba_u16_row( @@ -3441,7 +3736,48 @@ pub fn yuv420p16_to_rgba_u16_row( assert!(v_half.len() >= width / 2, "v_half row too short"); assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - let _ = use_simd; // SIMD per-arch routes land in Ship 8 Tranche 5b. + if use_simd { + cfg_select! 
{ + target_arch = "aarch64" => { + if neon_available() { + unsafe { + arch::neon::yuv_420p16_to_rgba_u16_row(y, u_half, v_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + unsafe { + arch::x86_avx512::yuv_420p16_to_rgba_u16_row(y, u_half, v_half, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + unsafe { + arch::x86_avx2::yuv_420p16_to_rgba_u16_row(y, u_half, v_half, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + unsafe { + arch::x86_sse41::yuv_420p16_to_rgba_u16_row(y, u_half, v_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + unsafe { + arch::wasm_simd128::yuv_420p16_to_rgba_u16_row(y, u_half, v_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + scalar::yuv_420p16_to_rgba_u16_row(y, u_half, v_half, rgba_out, width, matrix, full_range); } @@ -3518,8 +3854,8 @@ pub fn p016_to_rgba_row( /// `0xFFFF`. /// /// Routes through the dedicated 16-bit u16-output P016 scalar kernel -/// (`scalar::p16_to_rgba_u16_row`) — i64 chroma multiply. SIMD -/// per-arch routes land in the follow-up Ship 8 Tranche 5b PR. +/// (`scalar::p16_to_rgba_u16_row`) — i64 chroma multiply. +/// `use_simd = false` forces the scalar reference path. #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] pub fn p016_to_rgba_u16_row( @@ -3537,7 +3873,48 @@ pub fn p016_to_rgba_u16_row( assert!(uv_half.len() >= width, "uv_half row too short"); assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - let _ = use_simd; // SIMD per-arch routes land in Ship 8 Tranche 5b. + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + unsafe { + arch::neon::p16_to_rgba_u16_row(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + unsafe { + arch::x86_avx512::p16_to_rgba_u16_row(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + unsafe { + arch::x86_avx2::p16_to_rgba_u16_row(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + unsafe { + arch::x86_sse41::p16_to_rgba_u16_row(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + unsafe { + arch::wasm_simd128::p16_to_rgba_u16_row(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + scalar::p16_to_rgba_u16_row(y, uv_half, rgba_out, width, matrix, full_range); } From ad52ef6782d1c9e4254ad8d7f00e1d6b26812feb Mon Sep 17 00:00:00 2001 From: uqio <276879906+uqio@users.noreply.github.com> Date: Mon, 27 Apr 2026 01:21:06 +1200 Subject: [PATCH 2/3] update --- src/row/mod.rs | 2 + src/sinker/mixed/mod.rs | 24 + src/sinker/mixed/planar_8bit.rs | 8 +- src/sinker/mixed/subsampled_4_2_0_high_bit.rs | 874 +++++++++++++++++- src/sinker/mixed/tests.rs | 202 ++++ 5 files changed, 1076 insertions(+), 34 deletions(-) diff --git a/src/row/mod.rs b/src/row/mod.rs index 1ee3025..b33cfa4 100644 --- a/src/row/mod.rs +++ b/src/row/mod.rs @@ -41,6 +41,8 @@ pub(crate) mod scalar; // be unused, which is a hard error under `cargo clippy -- -D warnings`. 
 #[cfg(any(feature = "std", feature = "alloc"))]
 pub(crate) use scalar::expand_rgb_to_rgba_row;
+#[cfg(any(feature = "std", feature = "alloc"))]
+pub(crate) use scalar::expand_rgb_u16_to_rgba_u16_row;
 
 use crate::ColorMatrix;
diff --git a/src/sinker/mixed/mod.rs b/src/sinker/mixed/mod.rs
index 0369929..69208e0 100644
--- a/src/sinker/mixed/mod.rs
+++ b/src/sinker/mixed/mod.rs
@@ -1119,6 +1119,30 @@ pub(super) fn rgba_plane_row_slice(
     Ok(&mut buf[start..end])
 }
 
+/// `u16` analogue of [`rgba_plane_row_slice`] — slices the RGBA row out
+/// of an attached `u16` RGBA plane buffer. Element count and byte
+/// offsets are identical (both `× 4`); the only difference is the
+/// element type, so the overflow check is the same. Used by the
+/// high-bit-depth 4:2:0 sinkers that fan `u16` RGB out to `u16` RGBA.
+#[cfg_attr(not(tarpaulin), inline(always))]
+pub(super) fn rgba_u16_plane_row_slice(
+    buf: &mut [u16],
+    one_plane_start: usize,
+    one_plane_end: usize,
+    width: usize,
+    height: usize,
+) -> Result<&mut [u16], MixedSinkerError> {
+    let end = one_plane_end
+        .checked_mul(4)
+        .ok_or(MixedSinkerError::GeometryOverflow {
+            width,
+            height,
+            channels: 4,
+        })?;
+    let start = one_plane_start * 4; // ≤ end, fits.
+    Ok(&mut buf[start..end])
+}
+
 /// Pick an RGB row buffer for the kernel to write into: caller's RGB
 /// plane slice when attached, or the growing scratch buffer otherwise
 /// (HSV-only callers don't allocate an RGB plane). Returns
diff --git a/src/sinker/mixed/planar_8bit.rs b/src/sinker/mixed/planar_8bit.rs
index a0d5c7e..10224dd 100644
--- a/src/sinker/mixed/planar_8bit.rs
+++ b/src/sinker/mixed/planar_8bit.rs
@@ -31,12 +31,12 @@ impl<'a> MixedSinker<'a, Yuv420p> {
     ///
     /// ```compile_fail
     /// // Attaching RGBA to a sink that doesn't write it is rejected
-    /// // at compile time. Yuv420p10 (10‑bit 4:2:0 planar) has not yet
-    /// // been wired for RGBA — Tranche 5 covers it; once that lands the
+    /// // at compile time. Yuv422p10 (10‑bit 4:2:2 planar) has not yet
+    /// // been wired for RGBA — once a future tranche lands it the
     /// // negative example here moves to the next not‑yet‑wired format.
-    /// use colconv::{sinker::MixedSinker, yuv::Yuv420p10};
+    /// use colconv::{sinker::MixedSinker, yuv::Yuv422p10};
     /// let mut buf = vec![0u8; 16 * 8 * 4];
-    /// let _ = MixedSinker::<Yuv420p10>::new(16, 8).with_rgba(&mut buf);
+    /// let _ = MixedSinker::<Yuv422p10>::new(16, 8).with_rgba(&mut buf);
     /// ```
     #[cfg_attr(not(tarpaulin), inline(always))]
     pub fn with_rgba(mut self, buf: &'a mut [u8]) -> Result<Self, MixedSinkerError> {
diff --git a/src/sinker/mixed/subsampled_4_2_0_high_bit.rs b/src/sinker/mixed/subsampled_4_2_0_high_bit.rs
index 9531a17..b203693 100644
--- a/src/sinker/mixed/subsampled_4_2_0_high_bit.rs
+++ b/src/sinker/mixed/subsampled_4_2_0_high_bit.rs
@@ -2,6 +2,7 @@
 
 use super::{
     MixedSinker, MixedSinkerError, RowSlice, check_dimensions_match, rgb_row_buf_or_scratch,
+    rgba_plane_row_slice, rgba_u16_plane_row_slice,
 };
 use crate::{PixelSink, row::*, yuv::*};
@@ -33,6 +34,56 @@ impl<'a> MixedSinker<'a, Yuv420p9> {
         self.rgb_u16 = Some(buf);
         Ok(self)
     }
+
+    /// Attaches a packed **8‑bit** RGBA output buffer. The 9‑bit YUV
+    /// source is converted to 8‑bit RGBA via the same `BITS = 9` Q15
+    /// kernel family used by [`Self::with_rgb`]; the fourth byte per
+    /// pixel is alpha = `0xFF` (Yuv420p9 has no alpha plane).
+    ///
+    /// Returns `Err(RgbaBufferTooShort)` if
+    /// `buf.len() < width × height × 4`, or `Err(GeometryOverflow)` on
+    /// 32‑bit targets when the product overflows.
+    #[cfg_attr(not(tarpaulin), inline(always))]
+    pub fn with_rgba(mut self, buf: &'a mut [u8]) -> Result<Self, MixedSinkerError> {
+        self.set_rgba(buf)?;
+        Ok(self)
+    }
+    /// In-place variant of [`with_rgba`](Self::with_rgba).
+    #[cfg_attr(not(tarpaulin), inline(always))]
+    pub fn set_rgba(&mut self, buf: &'a mut [u8]) -> Result<&mut Self, MixedSinkerError> {
+        let expected = self.frame_bytes(4)?;
+        if buf.len() < expected {
+            return Err(MixedSinkerError::RgbaBufferTooShort {
+                expected,
+                actual: buf.len(),
+            });
+        }
+        self.rgba = Some(buf);
+        Ok(self)
+    }
+
+    /// Attaches a packed **`u16`** RGBA output buffer. 9‑bit low‑packed
+    /// (`(1 << 9) - 1 = 511` max). Length is measured in `u16`
+    /// **elements** (`width × height × 4`). Alpha element is
+    /// `(1 << 9) - 1`.
+    #[cfg_attr(not(tarpaulin), inline(always))]
+    pub fn with_rgba_u16(mut self, buf: &'a mut [u16]) -> Result<Self, MixedSinkerError> {
+        self.set_rgba_u16(buf)?;
+        Ok(self)
+    }
+    /// In-place variant of [`with_rgba_u16`](Self::with_rgba_u16).
+    #[cfg_attr(not(tarpaulin), inline(always))]
+    pub fn set_rgba_u16(&mut self, buf: &'a mut [u16]) -> Result<&mut Self, MixedSinkerError> {
+        let expected = self.frame_bytes(4)?;
+        if buf.len() < expected {
+            return Err(MixedSinkerError::RgbaU16BufferTooShort {
+                expected,
+                actual: buf.len(),
+            });
+        }
+        self.rgba_u16 = Some(buf);
+        Ok(self)
+    }
 }
 
 impl Yuv420p9Sink for MixedSinker<'_, Yuv420p9> {}
@@ -92,6 +143,8 @@
         let Self {
             rgb,
             rgb_u16,
+            rgba,
+            rgba_u16,
             luma,
             hsv,
             rgb_scratch,
@@ -107,7 +160,29 @@
             }
         }
 
-        if let Some(buf) = rgb_u16.as_deref_mut() {
+        // ===== u16 RGB / RGBA path (Strategy A) =====
+        // Compute u16 RGB once (to caller's buffer when attached) and fan
+        // out to u16 RGBA via the cheap per-pixel pad. RGBA-only avoids the
+        // RGB kernel entirely and writes RGBA directly.
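+        // (For Yuv420p9 the pad writes alpha = (1 << 9) - 1 = 511 after
+        // each copied R, G, B triple: a pure copy-plus-constant pass, so
+        // the YUV kernel never runs twice for the u16 outputs.)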
+        let want_rgb_u16 = rgb_u16.is_some();
+        let want_rgba_u16 = rgba_u16.is_some();
+
+        if want_rgba_u16 && !want_rgb_u16 {
+            let rgba_u16_buf = rgba_u16.as_deref_mut().unwrap();
+            let rgba_u16_row =
+                rgba_u16_plane_row_slice(rgba_u16_buf, one_plane_start, one_plane_end, w, h)?;
+            yuv420p9_to_rgba_u16_row(
+                row.y(),
+                row.u_half(),
+                row.v_half(),
+                rgba_u16_row,
+                w,
+                row.matrix(),
+                row.full_range(),
+                use_simd,
+            );
+        } else if want_rgb_u16 {
+            let rgb_u16_buf = rgb_u16.as_deref_mut().unwrap();
             let rgb_plane_end = one_plane_end
                 .checked_mul(3)
@@ -117,19 +192,48 @@
                     channels: 3,
                 })?;
             let rgb_plane_start = one_plane_start * 3;
+            let rgb_u16_row = &mut rgb_u16_buf[rgb_plane_start..rgb_plane_end];
             yuv420p9_to_rgb_u16_row(
                 row.y(),
                 row.u_half(),
                 row.v_half(),
-                &mut buf[rgb_plane_start..rgb_plane_end],
+                rgb_u16_row,
                 w,
                 row.matrix(),
                 row.full_range(),
                 use_simd,
             );
+            if want_rgba_u16 {
+                let rgba_u16_buf = rgba_u16.as_deref_mut().unwrap();
+                let rgba_u16_row =
+                    rgba_u16_plane_row_slice(rgba_u16_buf, one_plane_start, one_plane_end, w, h)?;
+                expand_rgb_u16_to_rgba_u16_row::<9>(rgb_u16_row, rgba_u16_row, w);
+            }
         }
 
-        if rgb.is_none() && hsv.is_none() {
+        // ===== u8 RGB / RGBA / HSV path (Strategy A) =====
+        let want_rgb = rgb.is_some();
+        let want_rgba = rgba.is_some();
+        let want_hsv = hsv.is_some();
+        let need_rgb_kernel = want_rgb || want_hsv;
+
+        if want_rgba && !need_rgb_kernel {
+            let rgba_buf = rgba.as_deref_mut().unwrap();
+            let rgba_row = rgba_plane_row_slice(rgba_buf, one_plane_start, one_plane_end, w, h)?;
+            yuv420p9_to_rgba_row(
+                row.y(),
+                row.u_half(),
+                row.v_half(),
+                rgba_row,
+                w,
+                row.matrix(),
+                row.full_range(),
+                use_simd,
+            );
+            return Ok(());
+        }
+
+        if !need_rgb_kernel {
             return Ok(());
         }
@@ -163,6 +267,12 @@
                 use_simd,
             );
         }
+
+        if let Some(buf) = rgba.as_deref_mut() {
+            let rgba_row = rgba_plane_row_slice(buf, one_plane_start, one_plane_end, w, h)?;
+            expand_rgb_to_rgba_row(rgb_row, rgba_row, w);
+        }
+
         Ok(())
     }
 }
@@ -211,6 +321,52 @@ impl<'a> MixedSinker<'a, Yuv420p10> {
         self.rgb_u16 = Some(buf);
         Ok(self)
     }
+
+    /// Attaches a packed **8‑bit** RGBA output buffer. The 10‑bit YUV
+    /// source is converted to 8‑bit RGBA via the `BITS = 10` Q15 kernel
+    /// family; the fourth byte per pixel is alpha = `0xFF` (Yuv420p10
+    /// has no alpha plane).
+    #[cfg_attr(not(tarpaulin), inline(always))]
+    pub fn with_rgba(mut self, buf: &'a mut [u8]) -> Result<Self, MixedSinkerError> {
+        self.set_rgba(buf)?;
+        Ok(self)
+    }
+    /// In-place variant of [`with_rgba`](Self::with_rgba).
+    #[cfg_attr(not(tarpaulin), inline(always))]
+    pub fn set_rgba(&mut self, buf: &'a mut [u8]) -> Result<&mut Self, MixedSinkerError> {
+        let expected = self.frame_bytes(4)?;
+        if buf.len() < expected {
+            return Err(MixedSinkerError::RgbaBufferTooShort {
+                expected,
+                actual: buf.len(),
+            });
+        }
+        self.rgba = Some(buf);
+        Ok(self)
+    }
+
+    /// Attaches a packed **`u16`** RGBA output buffer. 10‑bit
+    /// low‑packed (`(1 << 10) - 1 = 1023` max). Length is measured in
+    /// `u16` **elements** (`width × height × 4`). Alpha element is
+    /// `(1 << 10) - 1`.
+    #[cfg_attr(not(tarpaulin), inline(always))]
+    pub fn with_rgba_u16(mut self, buf: &'a mut [u16]) -> Result<Self, MixedSinkerError> {
+        self.set_rgba_u16(buf)?;
+        Ok(self)
+    }
+    /// In-place variant of [`with_rgba_u16`](Self::with_rgba_u16).
+    #[cfg_attr(not(tarpaulin), inline(always))]
+    pub fn set_rgba_u16(&mut self, buf: &'a mut [u16]) -> Result<&mut Self, MixedSinkerError> {
+        let expected = self.frame_bytes(4)?;
+        if buf.len() < expected {
+            return Err(MixedSinkerError::RgbaU16BufferTooShort {
+                expected,
+                actual: buf.len(),
+            });
+        }
+        self.rgba_u16 = Some(buf);
+        Ok(self)
+    }
 }
 
 impl Yuv420p10Sink for MixedSinker<'_, Yuv420p10> {}
@@ -276,6 +432,8 @@
         let Self {
             rgb,
             rgb_u16,
+            rgba,
+            rgba_u16,
             luma,
             hsv,
             rgb_scratch,
@@ -296,11 +454,32 @@
             }
         }
 
-        // `u16` RGB output — written directly via the native‑depth row
-        // primitive. Computed independently of the u8 path: the two
-        // outputs have different scale params inside `range_params_n`,
-        // so they can't share an intermediate without losing precision.
-        if let Some(buf) = rgb_u16.as_deref_mut() {
+        // ===== u16 RGB / RGBA path (Strategy A) =====
+        // u16 outputs are written via the native-depth row primitive, kept
+        // independent of the u8 path: the two have different scale params
+        // inside `range_params_n` and can't share an intermediate without
+        // losing precision. Within the u16 family, however, the RGB row
+        // and RGBA row are bit-identical for R/G/B, so we run the RGB
+        // kernel once and fan out to RGBA via the cheap pad.
+        let want_rgb_u16 = rgb_u16.is_some();
+        let want_rgba_u16 = rgba_u16.is_some();
+
+        if want_rgba_u16 && !want_rgb_u16 {
+            let rgba_u16_buf = rgba_u16.as_deref_mut().unwrap();
+            let rgba_u16_row =
+                rgba_u16_plane_row_slice(rgba_u16_buf, one_plane_start, one_plane_end, w, h)?;
+            yuv420p10_to_rgba_u16_row(
+                row.y(),
+                row.u_half(),
+                row.v_half(),
+                rgba_u16_row,
+                w,
+                row.matrix(),
+                row.full_range(),
+                use_simd,
+            );
+        } else if want_rgb_u16 {
+            let rgb_u16_buf = rgb_u16.as_deref_mut().unwrap();
             let rgb_plane_end = one_plane_end
                 .checked_mul(3)
@@ -310,21 +489,48 @@
                     channels: 3,
                 })?;
             let rgb_plane_start = one_plane_start * 3;
+            let rgb_u16_row = &mut rgb_u16_buf[rgb_plane_start..rgb_plane_end];
             yuv420p10_to_rgb_u16_row(
                 row.y(),
                 row.u_half(),
                 row.v_half(),
-                &mut buf[rgb_plane_start..rgb_plane_end],
+                rgb_u16_row,
                 w,
                 row.matrix(),
                 row.full_range(),
                 use_simd,
             );
+            if want_rgba_u16 {
+                let rgba_u16_buf = rgba_u16.as_deref_mut().unwrap();
+                let rgba_u16_row =
+                    rgba_u16_plane_row_slice(rgba_u16_buf, one_plane_start, one_plane_end, w, h)?;
+                expand_rgb_u16_to_rgba_u16_row::<10>(rgb_u16_row, rgba_u16_row, w);
+            }
         }
 
+        // ===== u8 RGB / RGBA / HSV path (Strategy A) =====
         let want_rgb = rgb.is_some();
+        let want_rgba = rgba.is_some();
         let want_hsv = hsv.is_some();
-        if !want_rgb && !want_hsv {
+        let need_rgb_kernel = want_rgb || want_hsv;
+
+        if want_rgba && !need_rgb_kernel {
+            let rgba_buf = rgba.as_deref_mut().unwrap();
+            let rgba_row = rgba_plane_row_slice(rgba_buf, one_plane_start, one_plane_end, w, h)?;
+            yuv420p10_to_rgba_row(
+                row.y(),
+                row.u_half(),
+                row.v_half(),
+                rgba_row,
+                w,
+                row.matrix(),
+                row.full_range(),
+                use_simd,
+            );
+            return Ok(());
+        }
+
+        if !need_rgb_kernel {
             return Ok(());
         }
@@ -361,6 +567,12 @@
                 use_simd,
             );
         }
+
+        if let Some(buf) = rgba.as_deref_mut() {
+            let rgba_row = rgba_plane_row_slice(buf, one_plane_start, one_plane_end, w, h)?;
+            expand_rgb_to_rgba_row(rgb_row, rgba_row, w);
+        }
+
         Ok(())
     }
 }
@@ -392,6 +604,51 @@ impl<'a> MixedSinker<'a, Yuv420p12> {
         self.rgb_u16 = Some(buf);
         Ok(self)
     }
+
+    /// Attaches a packed **8‑bit** RGBA output buffer. The 12‑bit YUV
+    /// source is converted to 8‑bit RGBA via the `BITS = 12` Q15 kernel
+    /// family; alpha = `0xFF` (Yuv420p12 has no alpha plane).
+    #[cfg_attr(not(tarpaulin), inline(always))]
+    pub fn with_rgba(mut self, buf: &'a mut [u8]) -> Result<Self, MixedSinkerError> {
+        self.set_rgba(buf)?;
+        Ok(self)
+    }
+    /// In-place variant of [`with_rgba`](Self::with_rgba).
+    #[cfg_attr(not(tarpaulin), inline(always))]
+    pub fn set_rgba(&mut self, buf: &'a mut [u8]) -> Result<&mut Self, MixedSinkerError> {
+        let expected = self.frame_bytes(4)?;
+        if buf.len() < expected {
+            return Err(MixedSinkerError::RgbaBufferTooShort {
+                expected,
+                actual: buf.len(),
+            });
+        }
+        self.rgba = Some(buf);
+        Ok(self)
+    }
+
+    /// Attaches a packed **`u16`** RGBA output buffer. 12‑bit
+    /// low‑packed (`(1 << 12) - 1 = 4095` max). Length is measured in
+    /// `u16` **elements** (`width × height × 4`). Alpha element is
+    /// `(1 << 12) - 1`.
+    #[cfg_attr(not(tarpaulin), inline(always))]
+    pub fn with_rgba_u16(mut self, buf: &'a mut [u16]) -> Result<Self, MixedSinkerError> {
+        self.set_rgba_u16(buf)?;
+        Ok(self)
+    }
+    /// In-place variant of [`with_rgba_u16`](Self::with_rgba_u16).
+    #[cfg_attr(not(tarpaulin), inline(always))]
+    pub fn set_rgba_u16(&mut self, buf: &'a mut [u16]) -> Result<&mut Self, MixedSinkerError> {
+        let expected = self.frame_bytes(4)?;
+        if buf.len() < expected {
+            return Err(MixedSinkerError::RgbaU16BufferTooShort {
+                expected,
+                actual: buf.len(),
+            });
+        }
+        self.rgba_u16 = Some(buf);
+        Ok(self)
+    }
 }
 
 impl Yuv420p12Sink for MixedSinker<'_, Yuv420p12> {}
@@ -454,6 +711,8 @@
         let Self {
             rgb,
             rgb_u16,
+            rgba,
+            rgba_u16,
             luma,
             hsv,
             rgb_scratch,
@@ -470,7 +729,26 @@
             }
         }
 
-        if let Some(buf) = rgb_u16.as_deref_mut() {
+        // ===== u16 RGB / RGBA path (Strategy A) — see Yuv420p10 for rationale.
+        let want_rgb_u16 = rgb_u16.is_some();
+        let want_rgba_u16 = rgba_u16.is_some();
+
+        if want_rgba_u16 && !want_rgb_u16 {
+            let rgba_u16_buf = rgba_u16.as_deref_mut().unwrap();
+            let rgba_u16_row =
+                rgba_u16_plane_row_slice(rgba_u16_buf, one_plane_start, one_plane_end, w, h)?;
+            yuv420p12_to_rgba_u16_row(
+                row.y(),
+                row.u_half(),
+                row.v_half(),
+                rgba_u16_row,
+                w,
+                row.matrix(),
+                row.full_range(),
+                use_simd,
+            );
+        } else if want_rgb_u16 {
+            let rgb_u16_buf = rgb_u16.as_deref_mut().unwrap();
             let rgb_plane_end = one_plane_end
                 .checked_mul(3)
@@ -480,21 +758,48 @@
                     channels: 3,
                 })?;
             let rgb_plane_start = one_plane_start * 3;
+            let rgb_u16_row = &mut rgb_u16_buf[rgb_plane_start..rgb_plane_end];
             yuv420p12_to_rgb_u16_row(
                 row.y(),
                 row.u_half(),
                 row.v_half(),
-                &mut buf[rgb_plane_start..rgb_plane_end],
+                rgb_u16_row,
                 w,
                 row.matrix(),
                 row.full_range(),
                 use_simd,
             );
+            if want_rgba_u16 {
+                let rgba_u16_buf = rgba_u16.as_deref_mut().unwrap();
+                let rgba_u16_row =
+                    rgba_u16_plane_row_slice(rgba_u16_buf, one_plane_start, one_plane_end, w, h)?;
+                expand_rgb_u16_to_rgba_u16_row::<12>(rgb_u16_row, rgba_u16_row, w);
+            }
         }
 
+        // ===== u8 RGB / RGBA / HSV path (Strategy A) =====
         let want_rgb = rgb.is_some();
+        let want_rgba = rgba.is_some();
         let want_hsv = hsv.is_some();
-        if !want_rgb && !want_hsv {
+        let need_rgb_kernel = want_rgb || want_hsv;
+
+        if want_rgba && !need_rgb_kernel {
+            let rgba_buf = rgba.as_deref_mut().unwrap();
+            let rgba_row = rgba_plane_row_slice(rgba_buf, one_plane_start, one_plane_end, w, h)?;
+            yuv420p12_to_rgba_row(
+                row.y(),
+                row.u_half(),
+                row.v_half(),
+                rgba_row,
+                w,
+                row.matrix(),
+                row.full_range(),
+                use_simd,
+            );
+            return Ok(());
+        }
+
+        if !need_rgb_kernel {
             return Ok(());
         }
@@ -528,6 +833,12 @@
                 use_simd,
             );
         }
+
+        if let Some(buf) = rgba.as_deref_mut() {
+            let rgba_row = rgba_plane_row_slice(buf, one_plane_start, one_plane_end, w, h)?;
+            expand_rgb_to_rgba_row(rgb_row, rgba_row, w);
+        }
+
         Ok(())
     }
 }
@@ -558,6 +869,51 @@ impl<'a> MixedSinker<'a, Yuv420p14> {
         self.rgb_u16 = Some(buf);
         Ok(self)
     }
+
+    /// Attaches a packed **8‑bit** RGBA output buffer. The 14‑bit YUV
+    /// source is converted to 8‑bit RGBA via the `BITS = 14` Q15 kernel
+    /// family; alpha = `0xFF` (Yuv420p14 has no alpha plane).
+    #[cfg_attr(not(tarpaulin), inline(always))]
+    pub fn with_rgba(mut self, buf: &'a mut [u8]) -> Result<Self, MixedSinkerError> {
+        self.set_rgba(buf)?;
+        Ok(self)
+    }
+    /// In-place variant of [`with_rgba`](Self::with_rgba).
+    #[cfg_attr(not(tarpaulin), inline(always))]
+    pub fn set_rgba(&mut self, buf: &'a mut [u8]) -> Result<&mut Self, MixedSinkerError> {
+        let expected = self.frame_bytes(4)?;
+        if buf.len() < expected {
+            return Err(MixedSinkerError::RgbaBufferTooShort {
+                expected,
+                actual: buf.len(),
+            });
+        }
+        self.rgba = Some(buf);
+        Ok(self)
+    }
+
+    /// Attaches a packed **`u16`** RGBA output buffer. 14‑bit
+    /// low‑packed (`(1 << 14) - 1 = 16383` max). Length is measured in
+    /// `u16` **elements** (`width × height × 4`). Alpha element is
+    /// `(1 << 14) - 1`.
+    #[cfg_attr(not(tarpaulin), inline(always))]
+    pub fn with_rgba_u16(mut self, buf: &'a mut [u16]) -> Result<Self, MixedSinkerError> {
+        self.set_rgba_u16(buf)?;
+        Ok(self)
+    }
+    /// In-place variant of [`with_rgba_u16`](Self::with_rgba_u16).
+    #[cfg_attr(not(tarpaulin), inline(always))]
+    pub fn set_rgba_u16(&mut self, buf: &'a mut [u16]) -> Result<&mut Self, MixedSinkerError> {
+        let expected = self.frame_bytes(4)?;
+        if buf.len() < expected {
+            return Err(MixedSinkerError::RgbaU16BufferTooShort {
+                expected,
+                actual: buf.len(),
+            });
+        }
+        self.rgba_u16 = Some(buf);
+        Ok(self)
+    }
 }
 
 impl Yuv420p14Sink for MixedSinker<'_, Yuv420p14> {}
@@ -618,6 +974,8 @@
         let Self {
             rgb,
             rgb_u16,
+            rgba,
+            rgba_u16,
             luma,
             hsv,
             rgb_scratch,
@@ -634,7 +992,26 @@
             }
         }
 
-        if let Some(buf) = rgb_u16.as_deref_mut() {
+        // ===== u16 RGB / RGBA path (Strategy A) — see Yuv420p10 for rationale.
+        let want_rgb_u16 = rgb_u16.is_some();
+        let want_rgba_u16 = rgba_u16.is_some();
+
+        if want_rgba_u16 && !want_rgb_u16 {
+            let rgba_u16_buf = rgba_u16.as_deref_mut().unwrap();
+            let rgba_u16_row =
+                rgba_u16_plane_row_slice(rgba_u16_buf, one_plane_start, one_plane_end, w, h)?;
+            yuv420p14_to_rgba_u16_row(
+                row.y(),
+                row.u_half(),
+                row.v_half(),
+                rgba_u16_row,
+                w,
+                row.matrix(),
+                row.full_range(),
+                use_simd,
+            );
+        } else if want_rgb_u16 {
+            let rgb_u16_buf = rgb_u16.as_deref_mut().unwrap();
             let rgb_plane_end = one_plane_end
                 .checked_mul(3)
@@ -644,21 +1021,48 @@
                     channels: 3,
                 })?;
             let rgb_plane_start = one_plane_start * 3;
+            let rgb_u16_row = &mut rgb_u16_buf[rgb_plane_start..rgb_plane_end];
             yuv420p14_to_rgb_u16_row(
                 row.y(),
                 row.u_half(),
                 row.v_half(),
-                &mut buf[rgb_plane_start..rgb_plane_end],
+                rgb_u16_row,
                 w,
                 row.matrix(),
                 row.full_range(),
                 use_simd,
             );
+            if want_rgba_u16 {
+                let rgba_u16_buf = rgba_u16.as_deref_mut().unwrap();
+                let rgba_u16_row =
+                    rgba_u16_plane_row_slice(rgba_u16_buf, one_plane_start, one_plane_end, w, h)?;
+                expand_rgb_u16_to_rgba_u16_row::<14>(rgb_u16_row, rgba_u16_row, w);
+            }
         }
 
+        // ===== u8 RGB / RGBA / HSV path (Strategy A) =====
         let want_rgb = rgb.is_some();
+        let want_rgba = rgba.is_some();
         let want_hsv = hsv.is_some();
-        if !want_rgb && !want_hsv {
+        let need_rgb_kernel = want_rgb || want_hsv;
+
+        if want_rgba && !need_rgb_kernel {
+            let rgba_buf = rgba.as_deref_mut().unwrap();
+            let rgba_row = rgba_plane_row_slice(rgba_buf, one_plane_start, one_plane_end, w, h)?;
+            yuv420p14_to_rgba_row(
+                row.y(),
+                row.u_half(),
+                row.v_half(),
+                rgba_row,
+                w,
+                row.matrix(),
+                row.full_range(),
+                use_simd,
+            );
+            return Ok(());
+        }
+
+        if !need_rgb_kernel {
             return Ok(());
         }
@@ -692,6 +1096,12 @@
                 use_simd,
             );
         }
+
+        if let Some(buf) = rgba.as_deref_mut() {
+            let rgba_row = rgba_plane_row_slice(buf, one_plane_start, one_plane_end, w, h)?;
+            expand_rgb_to_rgba_row(rgb_row, rgba_row, w);
+        }
+
         Ok(())
     }
 }
@@ -721,6 +1131,50 @@ impl<'a> MixedSinker<'a, Yuv420p16> {
         self.rgb_u16 = Some(buf);
         Ok(self)
     }
+
+    /// Attaches a packed **8‑bit** RGBA output buffer. The 16‑bit YUV
+    /// source is converted to 8‑bit RGBA via the dedicated `BITS = 16`
+    /// kernel family; alpha = `0xFF` (Yuv420p16 has no alpha plane).
+    #[cfg_attr(not(tarpaulin), inline(always))]
+    pub fn with_rgba(mut self, buf: &'a mut [u8]) -> Result<Self, MixedSinkerError> {
+        self.set_rgba(buf)?;
+        Ok(self)
+    }
+    /// In-place variant of [`with_rgba`](Self::with_rgba).
+    #[cfg_attr(not(tarpaulin), inline(always))]
+    pub fn set_rgba(&mut self, buf: &'a mut [u8]) -> Result<&mut Self, MixedSinkerError> {
+        let expected = self.frame_bytes(4)?;
+        if buf.len() < expected {
+            return Err(MixedSinkerError::RgbaBufferTooShort {
+                expected,
+                actual: buf.len(),
+            });
+        }
+        self.rgba = Some(buf);
+        Ok(self)
+    }
+
+    /// Attaches a packed **`u16`** RGBA output buffer. 16‑bit output
+    /// (full `u16` range). Length is measured in `u16` **elements**
+    /// (`width × height × 4`). Alpha element is `u16::MAX`.
+    #[cfg_attr(not(tarpaulin), inline(always))]
+    pub fn with_rgba_u16(mut self, buf: &'a mut [u16]) -> Result<Self, MixedSinkerError> {
+        self.set_rgba_u16(buf)?;
+        Ok(self)
+    }
+    /// In-place variant of [`with_rgba_u16`](Self::with_rgba_u16).
+    #[cfg_attr(not(tarpaulin), inline(always))]
+    pub fn set_rgba_u16(&mut self, buf: &'a mut [u16]) -> Result<&mut Self, MixedSinkerError> {
+        let expected = self.frame_bytes(4)?;
+        if buf.len() < expected {
+            return Err(MixedSinkerError::RgbaU16BufferTooShort {
+                expected,
+                actual: buf.len(),
+            });
+        }
+        self.rgba_u16 = Some(buf);
+        Ok(self)
+    }
 }
 
 impl Yuv420p16Sink for MixedSinker<'_, Yuv420p16> {}
@@ -782,6 +1236,8 @@
         let Self {
             rgb,
             rgb_u16,
+            rgba,
+            rgba_u16,
             luma,
             hsv,
             rgb_scratch,
@@ -798,7 +1254,26 @@
             }
         }
 
-        if let Some(buf) = rgb_u16.as_deref_mut() {
+        // ===== u16 RGB / RGBA path (Strategy A) — see Yuv420p10 for rationale.
+        let want_rgb_u16 = rgb_u16.is_some();
+        let want_rgba_u16 = rgba_u16.is_some();
+
+        if want_rgba_u16 && !want_rgb_u16 {
+            let rgba_u16_buf = rgba_u16.as_deref_mut().unwrap();
+            let rgba_u16_row =
+                rgba_u16_plane_row_slice(rgba_u16_buf, one_plane_start, one_plane_end, w, h)?;
+            yuv420p16_to_rgba_u16_row(
+                row.y(),
+                row.u_half(),
+                row.v_half(),
+                rgba_u16_row,
+                w,
+                row.matrix(),
+                row.full_range(),
+                use_simd,
+            );
+        } else if want_rgb_u16 {
+            let rgb_u16_buf = rgb_u16.as_deref_mut().unwrap();
             let rgb_plane_end = one_plane_end
                 .checked_mul(3)
@@ -808,21 +1283,48 @@
                     channels: 3,
                 })?;
             let rgb_plane_start = one_plane_start * 3;
+            let rgb_u16_row = &mut rgb_u16_buf[rgb_plane_start..rgb_plane_end];
             yuv420p16_to_rgb_u16_row(
                 row.y(),
                 row.u_half(),
                 row.v_half(),
-                &mut buf[rgb_plane_start..rgb_plane_end],
+                rgb_u16_row,
                 w,
                 row.matrix(),
                 row.full_range(),
                 use_simd,
            );
+            if want_rgba_u16 {
+                let rgba_u16_buf = rgba_u16.as_deref_mut().unwrap();
+                let rgba_u16_row =
+                    rgba_u16_plane_row_slice(rgba_u16_buf, one_plane_start, one_plane_end, w, h)?;
+                expand_rgb_u16_to_rgba_u16_row::<16>(rgb_u16_row, rgba_u16_row, w);
+            }
         }
 
+        // ===== u8 RGB / RGBA / HSV path (Strategy A) =====
         let want_rgb = rgb.is_some();
+        let want_rgba = rgba.is_some();
         let want_hsv = hsv.is_some();
-        if !want_rgb && !want_hsv {
+        let need_rgb_kernel = want_rgb || want_hsv;
+
+        if want_rgba && !need_rgb_kernel {
+            let rgba_buf = rgba.as_deref_mut().unwrap();
+            let rgba_row = rgba_plane_row_slice(rgba_buf, one_plane_start, one_plane_end, w, h)?;
+            yuv420p16_to_rgba_row(
+                row.y(),
+                row.u_half(),
+                row.v_half(),
+                rgba_row,
+                w,
+                row.matrix(),
+                row.full_range(),
+                use_simd,
+            );
+            return Ok(());
+        }
+
+        if !need_rgb_kernel {
             return Ok(());
         }
@@ -856,6 +1358,12 @@
                 use_simd,
             );
         }
+
+        if let Some(buf) = rgba.as_deref_mut() {
+            let rgba_row = rgba_plane_row_slice(buf, one_plane_start, one_plane_end, w, h)?;
+            expand_rgb_to_rgba_row(rgb_row, rgba_row, w);
+        }
+
         Ok(())
     }
 }
@@ -893,6 +1401,52 @@ impl<'a> MixedSinker<'a, P010> {
         self.rgb_u16 = Some(buf);
         Ok(self)
     }
+
+    /// Attaches a packed **8‑bit** RGBA output buffer. The 10‑bit P010
+    /// source (semi‑planar, high‑bit‑packed) is converted to 8‑bit RGBA
+    /// via the `BITS = 10` Q15 kernel family; alpha = `0xFF` (P010 has
+    /// no alpha plane).
+    #[cfg_attr(not(tarpaulin), inline(always))]
+    pub fn with_rgba(mut self, buf: &'a mut [u8]) -> Result<Self, MixedSinkerError> {
+        self.set_rgba(buf)?;
+        Ok(self)
+    }
+    /// In-place variant of [`with_rgba`](Self::with_rgba).
+    #[cfg_attr(not(tarpaulin), inline(always))]
+    pub fn set_rgba(&mut self, buf: &'a mut [u8]) -> Result<&mut Self, MixedSinkerError> {
+        let expected = self.frame_bytes(4)?;
+        if buf.len() < expected {
+            return Err(MixedSinkerError::RgbaBufferTooShort {
+                expected,
+                actual: buf.len(),
+            });
+        }
+        self.rgba = Some(buf);
+        Ok(self)
+    }
+
+    /// Attaches a packed **`u16`** RGBA output buffer. Output is
+    /// **low‑bit‑packed** 10‑bit values (`yuv420p10le` convention) — not
+    /// P010 high‑bit packing. Length is measured in `u16` **elements**
+    /// (`width × height × 4`). Alpha element is `(1 << 10) - 1`.
+    #[cfg_attr(not(tarpaulin), inline(always))]
+    pub fn with_rgba_u16(mut self, buf: &'a mut [u16]) -> Result<Self, MixedSinkerError> {
+        self.set_rgba_u16(buf)?;
+        Ok(self)
+    }
+    /// In-place variant of [`with_rgba_u16`](Self::with_rgba_u16).
+    #[cfg_attr(not(tarpaulin), inline(always))]
+    pub fn set_rgba_u16(&mut self, buf: &'a mut [u16]) -> Result<&mut Self, MixedSinkerError> {
+        let expected = self.frame_bytes(4)?;
+        if buf.len() < expected {
+            return Err(MixedSinkerError::RgbaU16BufferTooShort {
+                expected,
+                actual: buf.len(),
+            });
+        }
+        self.rgba_u16 = Some(buf);
+        Ok(self)
+    }
 }
 
 impl P010Sink for MixedSinker<'_, P010> {}
@@ -909,6 +1463,10 @@
     }
 
     fn process(&mut self, row: P010Row<'_>) -> Result<(), Self::Error> {
+        // P010 stores 10‑bit samples high‑bit‑packed; bit depth is fixed
+        // by the format. Used for the u16 RGBA expand path's alpha pad.
+        const BITS: u32 = 10;
+
         let w = self.width;
         let h = self.height;
         let idx = row.row();
@@ -944,6 +1502,8 @@
         let Self {
             rgb,
             rgb_u16,
+            rgba,
+            rgba_u16,
             luma,
             hsv,
             rgb_scratch,
@@ -964,9 +1524,27 @@
             }
         }
 
-        // `u16` RGB output — low-bit-packed 10-bit values (yuv420p10le
-        // convention), not P010's high-bit packing.
-        if let Some(buf) = rgb_u16.as_deref_mut() {
+        // ===== u16 RGB / RGBA path (Strategy A) — see Yuv420p10 for rationale.
+        // u16 outputs are low-bit-packed (yuv420p10le convention), not
+        // P010's high-bit packing.
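+        // For example, a mid-gray sample stored as 512 << 6 = 32768 in the
+        // P010 planes comes out as roughly 512 in the low-bit-packed u16
+        // RGB and RGBA outputs.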
+        let want_rgb_u16 = rgb_u16.is_some();
+        let want_rgba_u16 = rgba_u16.is_some();
+
+        if want_rgba_u16 && !want_rgb_u16 {
+            let rgba_u16_buf = rgba_u16.as_deref_mut().unwrap();
+            let rgba_u16_row =
+                rgba_u16_plane_row_slice(rgba_u16_buf, one_plane_start, one_plane_end, w, h)?;
+            p010_to_rgba_u16_row(
+                row.y(),
+                row.uv_half(),
+                rgba_u16_row,
+                w,
+                row.matrix(),
+                row.full_range(),
+                use_simd,
+            );
+        } else if want_rgb_u16 {
+            let rgb_u16_buf = rgb_u16.as_deref_mut().unwrap();
             let rgb_plane_end = one_plane_end
                 .checked_mul(3)
@@ -976,20 +1554,46 @@
                     channels: 3,
                 })?;
             let rgb_plane_start = one_plane_start * 3;
+            let rgb_u16_row = &mut rgb_u16_buf[rgb_plane_start..rgb_plane_end];
             p010_to_rgb_u16_row(
                 row.y(),
                 row.uv_half(),
-                &mut buf[rgb_plane_start..rgb_plane_end],
+                rgb_u16_row,
                 w,
                 row.matrix(),
                 row.full_range(),
                 use_simd,
             );
+            if want_rgba_u16 {
+                let rgba_u16_buf = rgba_u16.as_deref_mut().unwrap();
+                let rgba_u16_row =
+                    rgba_u16_plane_row_slice(rgba_u16_buf, one_plane_start, one_plane_end, w, h)?;
+                expand_rgb_u16_to_rgba_u16_row::<BITS>(rgb_u16_row, rgba_u16_row, w);
+            }
         }
 
+        // ===== u8 RGB / RGBA / HSV path (Strategy A) =====
         let want_rgb = rgb.is_some();
+        let want_rgba = rgba.is_some();
         let want_hsv = hsv.is_some();
-        if !want_rgb && !want_hsv {
+        let need_rgb_kernel = want_rgb || want_hsv;
+
+        if want_rgba && !need_rgb_kernel {
+            let rgba_buf = rgba.as_deref_mut().unwrap();
+            let rgba_row = rgba_plane_row_slice(rgba_buf, one_plane_start, one_plane_end, w, h)?;
+            p010_to_rgba_row(
+                row.y(),
+                row.uv_half(),
+                rgba_row,
+                w,
+                row.matrix(),
+                row.full_range(),
+                use_simd,
+            );
+            return Ok(());
+        }
+
+        if !need_rgb_kernel {
             return Ok(());
         }
@@ -1022,6 +1626,12 @@
                 use_simd,
             );
         }
+
+        if let Some(buf) = rgba.as_deref_mut() {
+            let rgba_row = rgba_plane_row_slice(buf, one_plane_start, one_plane_end, w, h)?;
+            expand_rgb_to_rgba_row(rgb_row, rgba_row, w);
+        }
+
         Ok(())
     }
 }
@@ -1053,6 +1663,52 @@ impl<'a> MixedSinker<'a, P012> {
         self.rgb_u16 = Some(buf);
         Ok(self)
     }
+
+    /// Attaches a packed **8‑bit** RGBA output buffer. The 12‑bit P012
+    /// source (semi‑planar, high‑bit‑packed) is converted to 8‑bit RGBA
+    /// via the `BITS = 12` Q15 kernel family; alpha = `0xFF`.
+    #[cfg_attr(not(tarpaulin), inline(always))]
+    pub fn with_rgba(mut self, buf: &'a mut [u8]) -> Result<Self, MixedSinkerError> {
+        self.set_rgba(buf)?;
+        Ok(self)
+    }
+    /// In-place variant of [`with_rgba`](Self::with_rgba).
+    #[cfg_attr(not(tarpaulin), inline(always))]
+    pub fn set_rgba(&mut self, buf: &'a mut [u8]) -> Result<&mut Self, MixedSinkerError> {
+        let expected = self.frame_bytes(4)?;
+        if buf.len() < expected {
+            return Err(MixedSinkerError::RgbaBufferTooShort {
+                expected,
+                actual: buf.len(),
+            });
+        }
+        self.rgba = Some(buf);
+        Ok(self)
+    }
+
+    /// Attaches a packed **`u16`** RGBA output buffer. Output is
+    /// **low‑bit‑packed** 12‑bit values (`yuv420p12le` convention) —
+    /// not P012 high‑bit packing. Length is measured in `u16`
+    /// **elements** (`width × height × 4`). Alpha element is
+    /// `(1 << 12) - 1`.
+    #[cfg_attr(not(tarpaulin), inline(always))]
+    pub fn with_rgba_u16(mut self, buf: &'a mut [u16]) -> Result<Self, MixedSinkerError> {
+        self.set_rgba_u16(buf)?;
+        Ok(self)
+    }
+    /// In-place variant of [`with_rgba_u16`](Self::with_rgba_u16).
+    #[cfg_attr(not(tarpaulin), inline(always))]
+    pub fn set_rgba_u16(&mut self, buf: &'a mut [u16]) -> Result<&mut Self, MixedSinkerError> {
+        let expected = self.frame_bytes(4)?;
+        if buf.len() < expected {
+            return Err(MixedSinkerError::RgbaU16BufferTooShort {
+                expected,
+                actual: buf.len(),
+            });
+        }
+        self.rgba_u16 = Some(buf);
+        Ok(self)
+    }
 }
 
 impl P012Sink for MixedSinker<'_, P012> {}
@@ -1069,6 +1725,10 @@
     }
 
     fn process(&mut self, row: P012Row<'_>) -> Result<(), Self::Error> {
+        // P012 stores 12‑bit samples high‑bit‑packed; bit depth is fixed
+        // by the format. Used for the u16 RGBA expand path's alpha pad.
+        const BITS: u32 = 12;
+
         let w = self.width;
         let h = self.height;
         let idx = row.row();
@@ -1103,6 +1763,8 @@
         let Self {
             rgb,
            rgb_u16,
+            rgba,
+            rgba_u16,
             luma,
             hsv,
             rgb_scratch,
@@ -1123,7 +1785,27 @@
             }
         }
 
-        if let Some(buf) = rgb_u16.as_deref_mut() {
+        // ===== u16 RGB / RGBA path (Strategy A) — see Yuv420p10 for rationale.
+        // u16 outputs are low-bit-packed (yuv420p12le convention), not
+        // P012's high-bit packing.
+        let want_rgb_u16 = rgb_u16.is_some();
+        let want_rgba_u16 = rgba_u16.is_some();
+
+        if want_rgba_u16 && !want_rgb_u16 {
+            let rgba_u16_buf = rgba_u16.as_deref_mut().unwrap();
+            let rgba_u16_row =
+                rgba_u16_plane_row_slice(rgba_u16_buf, one_plane_start, one_plane_end, w, h)?;
+            p012_to_rgba_u16_row(
+                row.y(),
+                row.uv_half(),
+                rgba_u16_row,
+                w,
+                row.matrix(),
+                row.full_range(),
+                use_simd,
+            );
+        } else if want_rgb_u16 {
+            let rgb_u16_buf = rgb_u16.as_deref_mut().unwrap();
             let rgb_plane_end = one_plane_end
                 .checked_mul(3)
@@ -1133,20 +1815,46 @@
                     channels: 3,
                 })?;
             let rgb_plane_start = one_plane_start * 3;
+            let rgb_u16_row = &mut rgb_u16_buf[rgb_plane_start..rgb_plane_end];
             p012_to_rgb_u16_row(
                 row.y(),
                 row.uv_half(),
-                &mut buf[rgb_plane_start..rgb_plane_end],
+                rgb_u16_row,
                 w,
                 row.matrix(),
                 row.full_range(),
                 use_simd,
             );
+            if want_rgba_u16 {
+                let rgba_u16_buf = rgba_u16.as_deref_mut().unwrap();
+                let rgba_u16_row =
+                    rgba_u16_plane_row_slice(rgba_u16_buf, one_plane_start, one_plane_end, w, h)?;
+                expand_rgb_u16_to_rgba_u16_row::<BITS>(rgb_u16_row, rgba_u16_row, w);
+            }
         }
 
+        // ===== u8 RGB / RGBA / HSV path (Strategy A) =====
         let want_rgb = rgb.is_some();
+        let want_rgba = rgba.is_some();
         let want_hsv = hsv.is_some();
-        if !want_rgb && !want_hsv {
+        let need_rgb_kernel = want_rgb || want_hsv;
+
+        if want_rgba && !need_rgb_kernel {
+            let rgba_buf = rgba.as_deref_mut().unwrap();
+            let rgba_row = rgba_plane_row_slice(rgba_buf, one_plane_start, one_plane_end, w, h)?;
+            p012_to_rgba_row(
+                row.y(),
+                row.uv_half(),
+                rgba_row,
+                w,
+                row.matrix(),
+                row.full_range(),
+                use_simd,
+            );
+            return Ok(());
+        }
+
+        if !need_rgb_kernel {
             return Ok(());
         }
@@ -1179,6 +1887,12 @@
                 use_simd,
             );
         }
+
+        if let Some(buf) = rgba.as_deref_mut() {
+            let rgba_row = rgba_plane_row_slice(buf, one_plane_start, one_plane_end, w, h)?;
+            expand_rgb_to_rgba_row(rgb_row, rgba_row, w);
+        }
+
         Ok(())
     }
 }
@@ -1209,6 +1923,50 @@ impl<'a> MixedSinker<'a, P016> {
         self.rgb_u16 = Some(buf);
         Ok(self)
     }
+
+    /// Attaches a packed **8‑bit** RGBA output buffer. The 16‑bit P016
+    /// source (semi‑planar) is converted to 8‑bit RGBA via the dedicated
+    /// `BITS = 16` kernel family; alpha = `0xFF`.
+    #[cfg_attr(not(tarpaulin), inline(always))]
+    pub fn with_rgba(mut self, buf: &'a mut [u8]) -> Result<Self, MixedSinkerError> {
+        self.set_rgba(buf)?;
+        Ok(self)
+    }
+    /// In-place variant of [`with_rgba`](Self::with_rgba).
+    #[cfg_attr(not(tarpaulin), inline(always))]
+    pub fn set_rgba(&mut self, buf: &'a mut [u8]) -> Result<&mut Self, MixedSinkerError> {
+        let expected = self.frame_bytes(4)?;
+        if buf.len() < expected {
+            return Err(MixedSinkerError::RgbaBufferTooShort {
+                expected,
+                actual: buf.len(),
+            });
+        }
+        self.rgba = Some(buf);
+        Ok(self)
+    }
+
+    /// Attaches a packed **`u16`** RGBA output buffer. 16‑bit output
+    /// (full `u16` range). Length is measured in `u16` **elements**
+    /// (`width × height × 4`). Alpha element is `u16::MAX`.
+    #[cfg_attr(not(tarpaulin), inline(always))]
+    pub fn with_rgba_u16(mut self, buf: &'a mut [u16]) -> Result<Self, MixedSinkerError> {
+        self.set_rgba_u16(buf)?;
+        Ok(self)
+    }
+    /// In-place variant of [`with_rgba_u16`](Self::with_rgba_u16).
+    #[cfg_attr(not(tarpaulin), inline(always))]
+    pub fn set_rgba_u16(&mut self, buf: &'a mut [u16]) -> Result<&mut Self, MixedSinkerError> {
+        let expected = self.frame_bytes(4)?;
+        if buf.len() < expected {
+            return Err(MixedSinkerError::RgbaU16BufferTooShort {
+                expected,
+                actual: buf.len(),
+            });
+        }
+        self.rgba_u16 = Some(buf);
+        Ok(self)
+    }
 }
 
 impl P016Sink for MixedSinker<'_, P016> {}
@@ -1225,6 +1983,10 @@
     }
 
     fn process(&mut self, row: P016Row<'_>) -> Result<(), Self::Error> {
+        // Bit depth is fixed by the format (16). Used for the u16 RGBA
+        // expand path's alpha pad (`alpha = u16::MAX` at this depth).
+        const BITS: u32 = 16;
+
         let w = self.width;
         let h = self.height;
         let idx = row.row();
@@ -1259,6 +2021,8 @@
         let Self {
             rgb,
             rgb_u16,
+            rgba,
+            rgba_u16,
             luma,
             hsv,
             rgb_scratch,
@@ -1276,7 +2040,25 @@
            }
         }
 
-        if let Some(buf) = rgb_u16.as_deref_mut() {
+        // ===== u16 RGB / RGBA path (Strategy A) — see Yuv420p10 for rationale.
+        let want_rgb_u16 = rgb_u16.is_some();
+        let want_rgba_u16 = rgba_u16.is_some();
+
+        if want_rgba_u16 && !want_rgb_u16 {
+            let rgba_u16_buf = rgba_u16.as_deref_mut().unwrap();
+            let rgba_u16_row =
+                rgba_u16_plane_row_slice(rgba_u16_buf, one_plane_start, one_plane_end, w, h)?;
+            p016_to_rgba_u16_row(
+                row.y(),
+                row.uv_half(),
+                rgba_u16_row,
+                w,
+                row.matrix(),
+                row.full_range(),
+                use_simd,
+            );
+        } else if want_rgb_u16 {
+            let rgb_u16_buf = rgb_u16.as_deref_mut().unwrap();
             let rgb_plane_end = one_plane_end
                 .checked_mul(3)
@@ -1286,20 +2068,46 @@
                     channels: 3,
                 })?;
             let rgb_plane_start = one_plane_start * 3;
+            let rgb_u16_row = &mut rgb_u16_buf[rgb_plane_start..rgb_plane_end];
             p016_to_rgb_u16_row(
                 row.y(),
                 row.uv_half(),
-                &mut buf[rgb_plane_start..rgb_plane_end],
+                rgb_u16_row,
                 w,
                 row.matrix(),
                 row.full_range(),
                 use_simd,
             );
+            if want_rgba_u16 {
+                let rgba_u16_buf = rgba_u16.as_deref_mut().unwrap();
+                let rgba_u16_row =
+                    rgba_u16_plane_row_slice(rgba_u16_buf, one_plane_start, one_plane_end, w, h)?;
+                expand_rgb_u16_to_rgba_u16_row::<BITS>(rgb_u16_row, rgba_u16_row, w);
+            }
         }
 
+        // ===== u8 RGB / RGBA / HSV path (Strategy A) =====
         let want_rgb = rgb.is_some();
+        let want_rgba = rgba.is_some();
         let want_hsv = hsv.is_some();
-        if !want_rgb && !want_hsv {
+        let need_rgb_kernel = want_rgb || want_hsv;
+
+        if want_rgba && !need_rgb_kernel {
+            let rgba_buf = rgba.as_deref_mut().unwrap();
+            let rgba_row = rgba_plane_row_slice(rgba_buf, one_plane_start, one_plane_end, w, h)?;
+            p016_to_rgba_row(
+                row.y(),
+                row.uv_half(),
+                rgba_row,
+                w,
+                row.matrix(),
+                row.full_range(),
+                use_simd,
+            );
+            return Ok(());
+        }
+
+        if !need_rgb_kernel {
             return Ok(());
        }
@@ -1332,6 +2140,12 @@
                 use_simd,
             );
         }
+
+        if let Some(buf) = rgba.as_deref_mut() {
+            let rgba_row = rgba_plane_row_slice(buf, one_plane_start, one_plane_end, w, h)?;
+            expand_rgb_to_rgba_row(rgb_row, rgba_row, w);
+        }
+
         Ok(())
     }
 }
diff --git a/src/sinker/mixed/tests.rs b/src/sinker/mixed/tests.rs
index d17849d..b65f7de 100644
--- a/src/sinker/mixed/tests.rs
+++ b/src/sinker/mixed/tests.rs
@@ -2839,6 +2839,154 @@ fn yuv420p10_with_simd_false_matches_with_simd_true() {
     assert_eq!(rgb_u16_scalar, rgb_u16_simd);
 }
 
+// ---- Yuv420p10 RGBA (Ship 8 Tranche 5b) -------------------------------
+
+#[test]
+#[cfg_attr(
+    miri,
+    ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri"
+)]
+fn yuv420p10_rgba_u8_only_gray_with_opaque_alpha() {
+    // 10-bit mid-gray → 8-bit RGBA ≈ (128, 128, 128, 255) per pixel.
+    let (yp, up, vp) = solid_yuv420p10_frame(16, 8, 512, 512, 512);
+    let src = Yuv420p10Frame::new(&yp, &up, &vp, 16, 8, 16, 8, 8);
+
+    let mut rgba = std::vec![0u8; 16 * 8 * 4];
+    let mut sink = MixedSinker::<Yuv420p10>::new(16, 8)
+        .with_rgba(&mut rgba)
+        .unwrap();
+    yuv420p10_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap();
+
+    for px in rgba.chunks(4) {
+        assert!(px[0].abs_diff(128) <= 1);
+        assert_eq!(px[0], px[1]);
+        assert_eq!(px[1], px[2]);
+        assert_eq!(px[3], 0xFF, "alpha must be opaque");
+    }
+}
+
+#[test]
+#[cfg_attr(
+    miri,
+    ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri"
+)]
+fn yuv420p10_rgba_u16_only_native_depth_gray_with_opaque_alpha() {
+    // 10-bit mid-gray → u16 RGBA: each color element ≈ 512, alpha = 1023.
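+    // (512 is mid-scale for full-range 10-bit input, so gray lands at
+    // R = G = B of about 512 out of 1023; 1023 is the 10-bit opaque
+    // alpha maximum.)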
+ let (yp, up, vp) = solid_yuv420p10_frame(16, 8, 512, 512, 512); + let src = Yuv420p10Frame::new(&yp, &up, &vp, 16, 8, 16, 8, 8); + + let mut rgba = std::vec![0u16; 16 * 8 * 4]; + let mut sink = MixedSinker::::new(16, 8) + .with_rgba_u16(&mut rgba) + .unwrap(); + yuv420p10_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); + + for px in rgba.chunks(4) { + assert!(px[0].abs_diff(512) <= 1, "got {px:?}"); + assert_eq!(px[0], px[1]); + assert_eq!(px[1], px[2]); + assert_eq!(px[3], 1023, "alpha must equal (1 << 10) - 1"); + } +} + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn yuv420p10_with_rgb_and_with_rgba_produce_byte_identical_rgb_bytes() { + // Strategy A: when both rgb and rgba are attached, the rgb buffer is + // populated by the RGB kernel and the rgba buffer is populated via a + // cheap expand pass. RGB triples must be byte-identical to the + // standalone RGB-only run. + let (yp, up, vp) = solid_yuv420p10_frame(64, 16, 600, 400, 700); + let src = Yuv420p10Frame::new(&yp, &up, &vp, 64, 16, 64, 32, 32); + + let mut rgb_solo = std::vec![0u8; 64 * 16 * 3]; + let mut s_solo = MixedSinker::::new(64, 16) + .with_rgb(&mut rgb_solo) + .unwrap(); + yuv420p10_to(&src, true, ColorMatrix::Bt709, &mut s_solo).unwrap(); + + let mut rgb_combined = std::vec![0u8; 64 * 16 * 3]; + let mut rgba = std::vec![0u8; 64 * 16 * 4]; + let mut s_combined = MixedSinker::::new(64, 16) + .with_rgb(&mut rgb_combined) + .unwrap() + .with_rgba(&mut rgba) + .unwrap(); + yuv420p10_to(&src, true, ColorMatrix::Bt709, &mut s_combined).unwrap(); + + assert_eq!(rgb_solo, rgb_combined, "RGB bytes must match across runs"); + for (rgb_px, rgba_px) in rgb_combined.chunks(3).zip(rgba.chunks(4)) { + assert_eq!(rgb_px[0], rgba_px[0]); + assert_eq!(rgb_px[1], rgba_px[1]); + assert_eq!(rgb_px[2], rgba_px[2]); + assert_eq!(rgba_px[3], 0xFF); + } +} + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn yuv420p10_with_rgb_u16_and_with_rgba_u16_produce_byte_identical_rgb_elems() { + // Strategy A on the u16 path: rgb_u16 buffer populated by the u16 RGB + // kernel, rgba_u16 fanned out via expand_rgb_u16_to_rgba_u16_row<10>. + let (yp, up, vp) = solid_yuv420p10_frame(64, 16, 600, 400, 700); + let src = Yuv420p10Frame::new(&yp, &up, &vp, 64, 16, 64, 32, 32); + + let mut rgb_solo = std::vec![0u16; 64 * 16 * 3]; + let mut s_solo = MixedSinker::::new(64, 16) + .with_rgb_u16(&mut rgb_solo) + .unwrap(); + yuv420p10_to(&src, true, ColorMatrix::Bt709, &mut s_solo).unwrap(); + + let mut rgb_combined = std::vec![0u16; 64 * 16 * 3]; + let mut rgba = std::vec![0u16; 64 * 16 * 4]; + let mut s_combined = MixedSinker::::new(64, 16) + .with_rgb_u16(&mut rgb_combined) + .unwrap() + .with_rgba_u16(&mut rgba) + .unwrap(); + yuv420p10_to(&src, true, ColorMatrix::Bt709, &mut s_combined).unwrap(); + + assert_eq!( + rgb_solo, rgb_combined, + "RGB u16 elements must match across runs" + ); + for (rgb_px, rgba_px) in rgb_combined.chunks(3).zip(rgba.chunks(4)) { + assert_eq!(rgb_px[0], rgba_px[0]); + assert_eq!(rgb_px[1], rgba_px[1]); + assert_eq!(rgb_px[2], rgba_px[2]); + assert_eq!(rgba_px[3], 1023, "alpha = (1 << 10) - 1"); + } +} + +#[test] +fn yuv420p10_rgba_too_short_returns_err() { + let mut rgba = std::vec![0u8; 10]; + let err = MixedSinker::::new(16, 8) + .with_rgba(&mut rgba) + .err() + .expect("expected RgbaBufferTooShort"); + assert!(matches!(err, MixedSinkerError::RgbaBufferTooShort { .. 
+
+#[test]
+fn yuv420p10_rgba_too_short_returns_err() {
+    let mut rgba = std::vec![0u8; 10];
+    let err = MixedSinker::<Yuv420p10>::new(16, 8)
+        .with_rgba(&mut rgba)
+        .err()
+        .expect("expected RgbaBufferTooShort");
+    assert!(matches!(err, MixedSinkerError::RgbaBufferTooShort { .. }));
+}
+
+#[test]
+fn yuv420p10_rgba_u16_too_short_returns_err() {
+    let mut rgba = std::vec![0u16; 10];
+    let err = MixedSinker::<Yuv420p10>::new(16, 8)
+        .with_rgba_u16(&mut rgba)
+        .err()
+        .expect("expected RgbaU16BufferTooShort");
+    assert!(matches!(
+        err,
+        MixedSinkerError::RgbaU16BufferTooShort { .. }
+    ));
+}
+
 // ---- P010 --------------------------------------------------------------
 //
 // Semi-planar 10-bit, high-bit-packed (samples in high 10 of each
@@ -3039,6 +3187,33 @@ fn p010_with_simd_false_matches_with_simd_true() {
     assert_eq!(rgb_u16_scalar, rgb_u16_simd);
 }
 
+// ---- P010 RGBA (Ship 8 Tranche 5b) ------------------------------------
+
+#[test]
+#[cfg_attr(
+    miri,
+    ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri"
+)]
+fn p010_rgba_u16_only_native_depth_gray_with_opaque_alpha() {
+    // P010 mid-gray: the helper packs the 10-bit values into the high 10
+    // bits of each u16, i.e. Y/U/V samples are stored as 512 << 6.
+    // Output u16 RGBA: each color element ≈ 512, alpha = 1023.
+    let (yp, uvp) = solid_p010_frame(16, 8, 512, 512, 512);
+    let src = P010Frame::new(&yp, &uvp, 16, 8, 16, 16);
+
+    let mut rgba = std::vec![0u16; 16 * 8 * 4];
+    let mut sink = MixedSinker::<P010>::new(16, 8)
+        .with_rgba_u16(&mut rgba)
+        .unwrap();
+    p010_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap();
+
+    for px in rgba.chunks(4) {
+        assert!(px[0].abs_diff(512) <= 1, "got {px:?}");
+        assert_eq!(px[0], px[1]);
+        assert_eq!(px[1], px[2]);
+        assert_eq!(px[3], 1023, "alpha = (1 << 10) - 1");
+    }
+}
+
 // ---- Yuv420p12 ---------------------------------------------------------
 //
 // Planar 12-bit, low-bit-packed. Mirrors the Yuv420p10 shape — same
@@ -3728,6 +3903,33 @@ fn yuv420p16_with_simd_false_matches_with_simd_true() {
     assert_eq!(rgb_u16_scalar, rgb_u16_simd);
 }
 
+// ---- Yuv420p16 RGBA (Ship 8 Tranche 5b) -------------------------------
+
+#[test]
+#[cfg_attr(
+    miri,
+    ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri"
+)]
+fn yuv420p16_rgba_u16_only_native_depth_gray_with_opaque_alpha() {
+    // 16-bit mid-gray: Y=UV=32768. Output u16 RGBA: each color element ≈
+    // 32768, alpha = 0xFFFF.
+    let (yp, up, vp) = solid_yuv420p16_frame(16, 8, 32768, 32768, 32768);
+    let src = Yuv420p16Frame::new(&yp, &up, &vp, 16, 8, 16, 8, 8);
+
+    let mut rgba = std::vec![0u16; 16 * 8 * 4];
+    let mut sink = MixedSinker::<Yuv420p16>::new(16, 8)
+        .with_rgba_u16(&mut rgba)
+        .unwrap();
+    yuv420p16_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap();
+
+    for px in rgba.chunks(4) {
+        assert!(px[0].abs_diff(32768) <= 8, "got {px:?}");
+        assert_eq!(px[0], px[1]);
+        assert_eq!(px[1], px[2]);
+        assert_eq!(px[3], 0xFFFF, "alpha must equal 0xFFFF");
+    }
+}
+
 // ---- P016 --------------------------------------------------------------
 
 fn solid_p016_frame(width: u32, height: u32, y: u16, u: u16, v: u16) -> (Vec<u16>, Vec<u16>) {
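Editor's note: PATCH 3/3 below only retouches the doc comment on
`rgba_u16_plane_row_slice`. For orientation, the element-indexed slicing that
comment describes amounts to the sketch here — the argument list is trimmed
(the real helper also takes `w`/`h`) and the error type is a placeholder for
the crate's `MixedSinkerError`:

fn rgba_u16_plane_row_slice(
    buf: &mut [u16],
    one_plane_start: usize,
    one_plane_end: usize,
) -> Result<&mut [u16], &'static str> {
    // 4 u16 elements per RGBA pixel — the same ×4 bound as the u8
    // variant, but measured in elements (2 bytes each), not bytes.
    let end = one_plane_end
        .checked_mul(4)
        .ok_or("RGBA plane offset overflows usize")?;
    let start = one_plane_start * 4;
    buf.get_mut(start..end)
        .ok_or("rgba_u16 buffer too short for this row")
}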
From 2da756a27eabaa0acba2bf9baa6ca9fe96188979 Mon Sep 17 00:00:00 2001
From: Al Liu
Date: Sun, 26 Apr 2026 21:29:22 +0800
Subject: [PATCH 3/3] Update src/sinker/mixed/mod.rs

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 src/sinker/mixed/mod.rs | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/sinker/mixed/mod.rs b/src/sinker/mixed/mod.rs
index 69208e0..f278805 100644
--- a/src/sinker/mixed/mod.rs
+++ b/src/sinker/mixed/mod.rs
@@ -1120,9 +1120,10 @@ pub(super) fn rgba_plane_row_slice(
 }
 
 /// `u16` analogue of [`rgba_plane_row_slice`] — slices the RGBA row out
-/// of an attached `u16` RGBA plane buffer. Element count and byte
-/// offsets are identical (both `× 4`); the only difference is the
-/// element type, so the overflow check is the same. Used by the
+/// of an attached `u16` RGBA plane buffer. This helper indexes in `u16`
+/// elements, not bytes: like the `u8` variant, RGBA rows use `× 4`
+/// elements per pixel, so the overflow check is the same, but the byte
+/// offsets differ because each element is 2 bytes. Used by the
 /// high-bit-depth 4:2:0 sinkers that fan `u16` RGB out to `u16` RGBA.
 #[cfg_attr(not(tarpaulin), inline(always))]
 pub(super) fn rgba_u16_plane_row_slice(