From 0cc022cb6c6e6c512e05593f5cac653528894176 Mon Sep 17 00:00:00 2001
From: uqio <276879906+uqio@users.noreply.github.com>
Date: Sun, 26 Apr 2026 22:46:16 +1200
Subject: [PATCH 1/5] update

---
 src/row/arch/neon.rs | 102 +++++++++++++++++++++++++++++++++------
 1 file changed, 89 insertions(+), 13 deletions(-)

diff --git a/src/row/arch/neon.rs b/src/row/arch/neon.rs
index 443b469..ca4cfaf 100644
--- a/src/row/arch/neon.rs
+++ b/src/row/arch/neon.rs
@@ -282,8 +282,10 @@ unsafe fn yuv_420_to_rgb_or_rgba_row(
 /// 2. `width & 1 == 0`.
 /// 3. `y.len() >= width`, `u_half.len() >= width / 2`,
 ///    `v_half.len() >= width / 2`, `rgb_out.len() >= 3 * width`.
-/// 4. `BITS` must be one of `{10, 12, 14}` — the Q15 pipeline
+/// 4. `BITS` must be one of `{9, 10, 12, 14}` — the Q15 pipeline
 ///    overflows i32 at 16 bits; see [`scalar::range_params_n`].
+///
+/// Thin wrapper over [`yuv_420p_n_to_rgb_or_rgba_row`] with `ALPHA = false`.
 #[inline]
 #[target_feature(enable = "neon")]
 pub(crate) unsafe fn yuv_420p_n_to_rgb_row<const BITS: usize>(
@@ -294,13 +296,76 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_row<const BITS: usize>(
     y: &[u16],
     u_half: &[u16],
     v_half: &[u16],
     rgb_out: &mut [u8],
     width: usize,
     matrix: ColorMatrix,
     full_range: bool,
+) {
+    // SAFETY: caller obligations forwarded to the shared impl.
+    unsafe {
+        yuv_420p_n_to_rgb_or_rgba_row::<BITS, false>(
+            y, u_half, v_half, rgb_out, width, matrix, full_range,
+        );
+    }
+}
+
+/// NEON high-bit-depth YUV 4:2:0 → packed **8-bit RGBA** (`R, G, B, 0xFF`).
+///
+/// Same numerical contract as [`yuv_420p_n_to_rgb_row`]; the only
+/// differences are the per-pixel stride (4 vs 3) and the constant
+/// alpha byte (`0xFF`, opaque).
+///
+/// Thin wrapper over [`yuv_420p_n_to_rgb_or_rgba_row`] with `ALPHA = true`.
+///
+/// # Safety
+///
+/// Same as [`yuv_420p_n_to_rgb_row`] but `rgba_out.len() >= 4 * width`.
+#[inline]
+#[target_feature(enable = "neon")]
+pub(crate) unsafe fn yuv_420p_n_to_rgba_row<const BITS: usize>(
+    y: &[u16],
+    u_half: &[u16],
+    v_half: &[u16],
+    rgba_out: &mut [u8],
+    width: usize,
+    matrix: ColorMatrix,
+    full_range: bool,
+) {
+    // SAFETY: caller obligations forwarded to the shared impl.
+    unsafe {
+        yuv_420p_n_to_rgb_or_rgba_row::<BITS, true>(
+            y, u_half, v_half, rgba_out, width, matrix, full_range,
+        );
+    }
+}
+
+/// Shared NEON high-bit-depth YUV 4:2:0 kernel for
+/// [`yuv_420p_n_to_rgb_row`] (`ALPHA = false`, `vst3q_u8`) and
+/// [`yuv_420p_n_to_rgba_row`] (`ALPHA = true`, `vst4q_u8` with
+/// constant `0xFF` alpha vector).
+///
+/// # Safety
+///
+/// 1. **NEON must be available on the current CPU.**
+/// 2. `width & 1 == 0`.
+/// 3. `y.len() >= width`, `u_half.len() >= width / 2`,
+///    `v_half.len() >= width / 2`,
+///    `out.len() >= width * if ALPHA { 4 } else { 3 }`.
+/// 4. `BITS` must be one of `{9, 10, 12, 14}`.
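+///
+/// Call shape, as an illustrative sketch (not a doctest: the fn is
+/// crate-private and `unsafe`, and the `*_row` slice names are made up
+/// here). One 10-bit limited-range BT.709 row, straight to RGBA:
+///
+/// ```ignore
+/// // SAFETY: NEON present, `width` even, slices sized per items 1-4.
+/// unsafe {
+///     yuv_420p_n_to_rgb_or_rgba_row::<10, true>(
+///         y_row, u_row, v_row, rgba_row, width, ColorMatrix::Bt709, false,
+///     );
+/// }
+/// ```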
+#[inline] +#[target_feature(enable = "neon")] +pub(crate) unsafe fn yuv_420p_n_to_rgb_or_rgba_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, ) { const { assert!(BITS == 9 || BITS == 10 || BITS == 12 || BITS == 14) }; + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert_eq!(width & 1, 0); debug_assert!(y.len() >= width); debug_assert!(u_half.len() >= width / 2); debug_assert!(v_half.len() >= width / 2); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); @@ -324,6 +389,7 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_row( let cgv = vdupq_n_s32(coeffs.g_v()); let cbu = vdupq_n_s32(coeffs.b_u()); let cbv = vdupq_n_s32(coeffs.b_v()); + let alpha_u8 = vdupq_n_u8(0xFF); let mut x = 0usize; while x + 16 <= width { @@ -385,23 +451,33 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_row( vqmovun_s16(vqaddq_s16(y_scaled_hi, r_dup_hi)), ); - let rgb = uint8x16x3_t(r_u8, g_u8, b_u8); - vst3q_u8(rgb_out.as_mut_ptr().add(x * 3), rgb); + if ALPHA { + let rgba = uint8x16x4_t(r_u8, g_u8, b_u8, alpha_u8); + vst4q_u8(out.as_mut_ptr().add(x * 4), rgba); + } else { + let rgb = uint8x16x3_t(r_u8, g_u8, b_u8); + vst3q_u8(out.as_mut_ptr().add(x * 3), rgb); + } x += 16; } // Scalar tail — remaining < 16 pixels (always even per 4:2:0). if x < width { - scalar::yuv_420p_n_to_rgb_row::( - &y[x..width], - &u_half[x / 2..width / 2], - &v_half[x / 2..width / 2], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_u = &u_half[x / 2..width / 2]; + let tail_v = &v_half[x / 2..width / 2]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::yuv_420p_n_to_rgba_row::( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); + } else { + scalar::yuv_420p_n_to_rgb_row::( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); + } } } } From cfff0dcd336b0e9e8840891f18ba0a1a6a6bc18a Mon Sep 17 00:00:00 2001 From: uqio <276879906+uqio@users.noreply.github.com> Date: Sun, 26 Apr 2026 22:51:02 +1200 Subject: [PATCH 2/5] update --- src/row/arch/neon.rs | 263 ++++++++++++++++++++++++++++++++++++------- 1 file changed, 225 insertions(+), 38 deletions(-) diff --git a/src/row/arch/neon.rs b/src/row/arch/neon.rs index ca4cfaf..917c64b 100644 --- a/src/row/arch/neon.rs +++ b/src/row/arch/neon.rs @@ -913,6 +913,8 @@ fn clamp_u16_max(v: int16x8_t, zero_v: int16x8_t, max_v: int16x8_t) -> uint16x8_ /// 3. `y.len() >= width`, `uv_half.len() >= width`, /// `rgb_out.len() >= 3 * width`. /// 4. `BITS` must be one of `{10, 12}`. +/// +/// Thin wrapper over [`p_n_to_rgb_or_rgba_row`] with `ALPHA = false`. #[inline] #[target_feature(enable = "neon")] pub(crate) unsafe fn p_n_to_rgb_row( @@ -923,10 +925,66 @@ pub(crate) unsafe fn p_n_to_rgb_row( matrix: ColorMatrix, full_range: bool, ) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + p_n_to_rgb_or_rgba_row::(y, uv_half, rgb_out, width, matrix, full_range); + } +} + +/// NEON high-bit-packed semi-planar 4:2:0 → packed **8-bit RGBA** +/// (`R, G, B, 0xFF`). Same numerical contract as [`p_n_to_rgb_row`]; +/// 4 bpp store with constant alpha. +/// +/// Thin wrapper over [`p_n_to_rgb_or_rgba_row`] with `ALPHA = true`. +/// +/// # Safety +/// +/// Same as [`p_n_to_rgb_row`] but `rgba_out.len() >= 4 * width`. 
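+///
+/// Input layout, sketched (P010-style, MSB-aligned samples; the slice
+/// names are illustrative): `y` holds one `u16` per pixel, `uv_half`
+/// holds one interleaved `U, V` pair per 2x2 block:
+///
+/// ```ignore
+/// // y:       Y0 Y1 Y2 Y3 ...   (width samples)
+/// // uv_half: U0 V0 U1 V1 ...   (width samples = width / 2 pairs)
+/// unsafe { p_n_to_rgba_row::<10>(y, uv_half, rgba, width, matrix, false) };
+/// ```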
+#[inline] +#[target_feature(enable = "neon")] +pub(crate) unsafe fn p_n_to_rgba_row( + y: &[u16], + uv_half: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + p_n_to_rgb_or_rgba_row::(y, uv_half, rgba_out, width, matrix, full_range); + } +} + +/// Shared NEON kernel for [`p_n_to_rgb_row`] (`ALPHA = false`, +/// `vst3q_u8`) and [`p_n_to_rgba_row`] (`ALPHA = true`, `vst4q_u8` +/// with constant `0xFF` alpha vector). +/// +/// # Safety +/// +/// 1. **NEON must be available on the current CPU.** +/// 2. `width & 1 == 0`. +/// 3. `y.len() >= width`, `uv_half.len() >= width`, +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. +/// 4. `BITS` must be one of `{10, 12}`. +#[inline] +#[target_feature(enable = "neon")] +pub(crate) unsafe fn p_n_to_rgb_or_rgba_row( + y: &[u16], + uv_half: &[u16], + out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // P016 (BITS=16) routes through `p16_to_rgb_or_rgba_row` (i64 chroma); + // attempting `::<16, _>` here would silently overflow on high chroma. + const { assert!(BITS == 10 || BITS == 12) }; + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert_eq!(width & 1, 0); debug_assert!(y.len() >= width); debug_assert!(uv_half.len() >= width); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); @@ -949,6 +1007,7 @@ pub(crate) unsafe fn p_n_to_rgb_row( let cgv = vdupq_n_s32(coeffs.g_v()); let cbu = vdupq_n_s32(coeffs.b_u()); let cbv = vdupq_n_s32(coeffs.b_v()); + let alpha_u8 = vdupq_n_u8(0xFF); let mut x = 0usize; while x + 16 <= width { @@ -1006,21 +1065,27 @@ pub(crate) unsafe fn p_n_to_rgb_row( vqmovun_s16(vqaddq_s16(y_scaled_hi, r_dup_hi)), ); - let rgb = uint8x16x3_t(r_u8, g_u8, b_u8); - vst3q_u8(rgb_out.as_mut_ptr().add(x * 3), rgb); + if ALPHA { + let rgba = uint8x16x4_t(r_u8, g_u8, b_u8, alpha_u8); + vst4q_u8(out.as_mut_ptr().add(x * 4), rgba); + } else { + let rgb = uint8x16x3_t(r_u8, g_u8, b_u8); + vst3q_u8(out.as_mut_ptr().add(x * 3), rgb); + } x += 16; } if x < width { - scalar::p_n_to_rgb_row::( - &y[x..width], - &uv_half[x..width], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_uv = &uv_half[x..width]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::p_n_to_rgba_row::(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + } else { + scalar::p_n_to_rgb_row::(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + } } } } @@ -1968,6 +2033,8 @@ fn scale_y( /// 2. `width & 1 == 0`. /// 3. `y.len() >= width`, `u_half.len() >= width / 2`, /// `v_half.len() >= width / 2`, `rgb_out.len() >= 3 * width`. +/// +/// Thin wrapper over [`yuv_420p16_to_rgb_or_rgba_row`] with `ALPHA = false`. #[inline] #[target_feature(enable = "neon")] pub(crate) unsafe fn yuv_420p16_to_rgb_row( @@ -1979,11 +2046,64 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_row( matrix: ColorMatrix, full_range: bool, ) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + yuv_420p16_to_rgb_or_rgba_row::(y, u_half, v_half, rgb_out, width, matrix, full_range); + } +} + +/// NEON 16-bit YUV 4:2:0 → packed **8-bit RGBA** (`R, G, B, 0xFF`). +/// +/// Thin wrapper over [`yuv_420p16_to_rgb_or_rgba_row`] with `ALPHA = true`. 
+/// +/// # Safety +/// +/// Same as [`yuv_420p16_to_rgb_row`] but `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "neon")] +pub(crate) unsafe fn yuv_420p16_to_rgba_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + yuv_420p16_to_rgb_or_rgba_row::(y, u_half, v_half, rgba_out, width, matrix, full_range); + } +} + +/// Shared NEON 16-bit YUV 4:2:0 kernel for [`yuv_420p16_to_rgb_row`] +/// (`ALPHA = false`, `vst3q_u8`) and [`yuv_420p16_to_rgba_row`] +/// (`ALPHA = true`, `vst4q_u8` with constant `0xFF` alpha). +/// +/// # Safety +/// +/// 1. NEON must be available on the current CPU. +/// 2. `width & 1 == 0`. +/// 3. `y.len() >= width`, `u_half.len() >= width / 2`, +/// `v_half.len() >= width / 2`, +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. +#[inline] +#[target_feature(enable = "neon")] +pub(crate) unsafe fn yuv_420p16_to_rgb_or_rgba_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert_eq!(width & 1, 0); debug_assert!(y.len() >= width); debug_assert!(u_half.len() >= width / 2); debug_assert!(v_half.len() >= width / 2); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::<16, 8>(full_range); @@ -2002,6 +2122,7 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_row( let cgv = vdupq_n_s32(coeffs.g_v()); let cbu = vdupq_n_s32(coeffs.b_u()); let cbv = vdupq_n_s32(coeffs.b_v()); + let alpha_u8 = vdupq_n_u8(0xFF); let mut x = 0usize; while x + 16 <= width { @@ -2061,23 +2182,30 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_row( vqmovun_s16(vqaddq_s16(y_scaled_hi, b_dup_hi)), ); - vst3q_u8( - rgb_out.as_mut_ptr().add(x * 3), - uint8x16x3_t(r_u8, g_u8, b_u8), - ); + if ALPHA { + vst4q_u8( + out.as_mut_ptr().add(x * 4), + uint8x16x4_t(r_u8, g_u8, b_u8, alpha_u8), + ); + } else { + vst3q_u8(out.as_mut_ptr().add(x * 3), uint8x16x3_t(r_u8, g_u8, b_u8)); + } x += 16; } if x < width { - scalar::yuv_420p16_to_rgb_row( - &y[x..width], - &u_half[x / 2..width / 2], - &v_half[x / 2..width / 2], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_u = &u_half[x / 2..width / 2]; + let tail_v = &v_half[x / 2..width / 2]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::yuv_420p16_to_rgba_row( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); + } else { + scalar::yuv_420p16_to_rgb_row(tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range); + } } } } @@ -2511,6 +2639,8 @@ pub(crate) unsafe fn yuv_444p16_to_rgb_u16_row( /// 1. NEON must be available. /// 2. `width & 1 == 0`. /// 3. `y.len() >= width`, `uv_half.len() >= width`, `rgb_out.len() >= 3 * width`. +/// +/// Thin wrapper over [`p16_to_rgb_or_rgba_row`] with `ALPHA = false`. #[inline] #[target_feature(enable = "neon")] pub(crate) unsafe fn p16_to_rgb_row( @@ -2521,10 +2651,61 @@ pub(crate) unsafe fn p16_to_rgb_row( matrix: ColorMatrix, full_range: bool, ) { + // SAFETY: caller obligations forwarded to the shared impl. 
+ unsafe { + p16_to_rgb_or_rgba_row::(y, uv_half, rgb_out, width, matrix, full_range); + } +} + +/// NEON P016 (semi-planar 4:2:0, full 16-bit) → packed **8-bit RGBA** +/// (`R, G, B, 0xFF`). +/// +/// Thin wrapper over [`p16_to_rgb_or_rgba_row`] with `ALPHA = true`. +/// +/// # Safety +/// +/// Same as [`p16_to_rgb_row`] but `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "neon")] +pub(crate) unsafe fn p16_to_rgba_row( + y: &[u16], + uv_half: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + p16_to_rgb_or_rgba_row::(y, uv_half, rgba_out, width, matrix, full_range); + } +} + +/// Shared NEON P016 kernel for [`p16_to_rgb_row`] (`ALPHA = false`, +/// `vst3q_u8`) and [`p16_to_rgba_row`] (`ALPHA = true`, `vst4q_u8` +/// with constant `0xFF` alpha). +/// +/// # Safety +/// +/// 1. NEON must be available. +/// 2. `width & 1 == 0`. +/// 3. `y.len() >= width`, `uv_half.len() >= width`, +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. +#[inline] +#[target_feature(enable = "neon")] +pub(crate) unsafe fn p16_to_rgb_or_rgba_row( + y: &[u16], + uv_half: &[u16], + out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert_eq!(width & 1, 0); debug_assert!(y.len() >= width); debug_assert!(uv_half.len() >= width); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::<16, 8>(full_range); @@ -2543,6 +2724,7 @@ pub(crate) unsafe fn p16_to_rgb_row( let cgv = vdupq_n_s32(coeffs.g_v()); let cbu = vdupq_n_s32(coeffs.b_u()); let cbv = vdupq_n_s32(coeffs.b_v()); + let alpha_u8 = vdupq_n_u8(0xFF); let mut x = 0usize; while x + 16 <= width { @@ -2601,22 +2783,27 @@ pub(crate) unsafe fn p16_to_rgb_row( vqmovun_s16(vqaddq_s16(y_scaled_hi, b_dup_hi)), ); - vst3q_u8( - rgb_out.as_mut_ptr().add(x * 3), - uint8x16x3_t(r_u8, g_u8, b_u8), - ); + if ALPHA { + vst4q_u8( + out.as_mut_ptr().add(x * 4), + uint8x16x4_t(r_u8, g_u8, b_u8, alpha_u8), + ); + } else { + vst3q_u8(out.as_mut_ptr().add(x * 3), uint8x16x3_t(r_u8, g_u8, b_u8)); + } x += 16; } if x < width { - scalar::p16_to_rgb_row( - &y[x..width], - &uv_half[x..width], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_uv = &uv_half[x..width]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::p16_to_rgba_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + } else { + scalar::p16_to_rgb_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + } } } } From 67062755c7c08eb6988db79ea6bdf5878485bda9 Mon Sep 17 00:00:00 2001 From: uqio <276879906+uqio@users.noreply.github.com> Date: Sun, 26 Apr 2026 23:16:24 +1200 Subject: [PATCH 3/5] update --- src/row/arch/x86_sse41.rs | 336 +++++++++++++++++++++++++++++++++----- 1 file changed, 294 insertions(+), 42 deletions(-) diff --git a/src/row/arch/x86_sse41.rs b/src/row/arch/x86_sse41.rs index 4aaecaf..43400f9 100644 --- a/src/row/arch/x86_sse41.rs +++ b/src/row/arch/x86_sse41.rs @@ -281,6 +281,8 @@ unsafe fn yuv_420_to_rgb_or_rgba_row( /// 2. `width & 1 == 0`. /// 3. `y.len() >= width`, `uv_half.len() >= width`, /// `rgb_out.len() >= 3 * width`. +/// +/// Thin wrapper over [`p_n_to_rgb_or_rgba_row`] with `ALPHA = false`. 
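+///
+/// Only the final store differs between the RGB and RGBA flavors; the
+/// shared kernel's per-16-pixel store dispatch is:
+///
+/// ```ignore
+/// if ALPHA {
+///     write_rgba_16(r_u8, g_u8, b_u8, alpha_u8, out.as_mut_ptr().add(x * 4)); // 64 bytes
+/// } else {
+///     write_rgb_16(r_u8, g_u8, b_u8, out.as_mut_ptr().add(x * 3)); // 48 bytes
+/// }
+/// ```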
#[inline] #[target_feature(enable = "sse4.1")] pub(crate) unsafe fn p_n_to_rgb_row( @@ -291,10 +293,63 @@ pub(crate) unsafe fn p_n_to_rgb_row( matrix: ColorMatrix, full_range: bool, ) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + p_n_to_rgb_or_rgba_row::(y, uv_half, rgb_out, width, matrix, full_range); + } +} + +/// SSE4.1 high-bit-packed semi-planar 4:2:0 → packed **8-bit RGBA** +/// (`R, G, B, 0xFF`). +/// +/// Thin wrapper over [`p_n_to_rgb_or_rgba_row`] with `ALPHA = true`. +/// +/// # Safety +/// +/// Same as [`p_n_to_rgb_row`] but `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "sse4.1")] +pub(crate) unsafe fn p_n_to_rgba_row( + y: &[u16], + uv_half: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + p_n_to_rgb_or_rgba_row::(y, uv_half, rgba_out, width, matrix, full_range); + } +} + +/// Shared SSE4.1 P010/P012 kernel for [`p_n_to_rgb_row`] (`ALPHA = false`, +/// `write_rgb_16`) and [`p_n_to_rgba_row`] (`ALPHA = true`, `write_rgba_16` +/// with constant `0xFF` alpha). +/// +/// # Safety +/// +/// 1. **SSE4.1 must be available on the current CPU.** +/// 2. `width & 1 == 0`. +/// 3. `y.len() >= width`, `uv_half.len() >= width`, +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. +/// 4. `BITS` ∈ `{10, 12}` — P016 has its own kernel family. +#[inline] +#[target_feature(enable = "sse4.1")] +pub(crate) unsafe fn p_n_to_rgb_or_rgba_row( + y: &[u16], + uv_half: &[u16], + out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + const { assert!(BITS == 10 || BITS == 12) }; + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert_eq!(width & 1, 0); debug_assert!(y.len() >= width); debug_assert!(uv_half.len() >= width); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); @@ -318,6 +373,7 @@ pub(crate) unsafe fn p_n_to_rgb_row( let cgv = _mm_set1_epi32(coeffs.g_v()); let cbu = _mm_set1_epi32(coeffs.b_u()); let cbv = _mm_set1_epi32(coeffs.b_v()); + let alpha_u8 = _mm_set1_epi8(-1); let mut x = 0usize; while x + 16 <= width { @@ -369,20 +425,25 @@ pub(crate) unsafe fn p_n_to_rgb_row( let g_u8 = _mm_packus_epi16(g_lo, g_hi); let r_u8 = _mm_packus_epi16(r_lo, r_hi); - write_rgb_16(r_u8, g_u8, b_u8, rgb_out.as_mut_ptr().add(x * 3)); + if ALPHA { + write_rgba_16(r_u8, g_u8, b_u8, alpha_u8, out.as_mut_ptr().add(x * 4)); + } else { + write_rgb_16(r_u8, g_u8, b_u8, out.as_mut_ptr().add(x * 3)); + } x += 16; } if x < width { - scalar::p_n_to_rgb_row::( - &y[x..width], - &uv_half[x..width], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_uv = &uv_half[x..width]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::p_n_to_rgba_row::(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + } else { + scalar::p_n_to_rgb_row::(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + } } } } @@ -554,6 +615,8 @@ unsafe fn deinterleave_uv_u16(ptr: *const u16) -> (__m128i, __m128i) { /// /// Byte‑identical to [`scalar::yuv_420p_n_to_rgb_row::<10>`]. /// +/// Thin wrapper over [`yuv_420p_n_to_rgb_or_rgba_row`] with `ALPHA = false`. +/// /// # Safety /// /// 1. 
**SSE4.1 must be available on the current CPU.** @@ -570,13 +633,71 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_row( width: usize, matrix: ColorMatrix, full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + yuv_420p_n_to_rgb_or_rgba_row::( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } +} + +/// SSE4.1 high-bit-depth YUV 4:2:0 → packed **8-bit RGBA** (`R, G, B, 0xFF`). +/// +/// Thin wrapper over [`yuv_420p_n_to_rgb_or_rgba_row`] with `ALPHA = true`. +/// +/// # Safety +/// +/// Same as [`yuv_420p_n_to_rgb_row`] but `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "sse4.1")] +pub(crate) unsafe fn yuv_420p_n_to_rgba_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + yuv_420p_n_to_rgb_or_rgba_row::( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } +} + +/// Shared SSE4.1 high-bit YUV 4:2:0 kernel for `yuv_420p_n_to_rgb_row` +/// (`ALPHA = false`, `write_rgb_16`) and `yuv_420p_n_to_rgba_row` +/// (`ALPHA = true`, `write_rgba_16` with constant `0xFF` alpha). +/// +/// # Safety +/// +/// 1. **SSE4.1 must be available on the current CPU.** +/// 2. `width & 1 == 0`. +/// 3. `y.len() >= width`, `u_half.len() >= width / 2`, +/// `v_half.len() >= width / 2`, +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. +/// 4. `BITS` ∈ `{9, 10, 12, 14}`. +#[inline] +#[target_feature(enable = "sse4.1")] +pub(crate) unsafe fn yuv_420p_n_to_rgb_or_rgba_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, ) { const { assert!(BITS == 9 || BITS == 10 || BITS == 12 || BITS == 14) }; + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert_eq!(width & 1, 0); debug_assert!(y.len() >= width); debug_assert!(u_half.len() >= width / 2); debug_assert!(v_half.len() >= width / 2); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); @@ -600,6 +721,7 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_row( let cgv = _mm_set1_epi32(coeffs.g_v()); let cbu = _mm_set1_epi32(coeffs.b_u()); let cbv = _mm_set1_epi32(coeffs.b_v()); + let alpha_u8 = _mm_set1_epi8(-1); let mut x = 0usize; while x + 16 <= width { @@ -650,21 +772,30 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_row( let g_u8 = _mm_packus_epi16(g_lo, g_hi); let r_u8 = _mm_packus_epi16(r_lo, r_hi); - write_rgb_16(r_u8, g_u8, b_u8, rgb_out.as_mut_ptr().add(x * 3)); + if ALPHA { + write_rgba_16(r_u8, g_u8, b_u8, alpha_u8, out.as_mut_ptr().add(x * 4)); + } else { + write_rgb_16(r_u8, g_u8, b_u8, out.as_mut_ptr().add(x * 3)); + } x += 16; } if x < width { - scalar::yuv_420p_n_to_rgb_row::( - &y[x..width], - &u_half[x / 2..width / 2], - &v_half[x / 2..width / 2], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_u = &u_half[x / 2..width / 2]; + let tail_v = &v_half[x / 2..width / 2]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::yuv_420p_n_to_rgba_row::( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); + } else { + scalar::yuv_420p_n_to_rgb_row::( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); + } } } } @@ -2146,6 +2277,8 @@ fn 
scale_y16_i64(y_minus_off: __m128i, y_scale_v: __m128i, rnd_v: __m128i) -> __ /// 2. `width & 1 == 0`. /// 3. `y.len() >= width`, `u_half.len() >= width / 2`, /// `v_half.len() >= width / 2`, `rgb_out.len() >= 3 * width`. +/// +/// Thin wrapper over [`yuv_420p16_to_rgb_or_rgba_row`] with `ALPHA = false`. #[inline] #[target_feature(enable = "sse4.1")] pub(crate) unsafe fn yuv_420p16_to_rgb_row( @@ -2157,11 +2290,64 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_row( matrix: ColorMatrix, full_range: bool, ) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + yuv_420p16_to_rgb_or_rgba_row::(y, u_half, v_half, rgb_out, width, matrix, full_range); + } +} + +/// SSE4.1 16-bit YUV 4:2:0 → packed **8-bit RGBA** (`R, G, B, 0xFF`). +/// +/// Thin wrapper over [`yuv_420p16_to_rgb_or_rgba_row`] with `ALPHA = true`. +/// +/// # Safety +/// +/// Same as [`yuv_420p16_to_rgb_row`] but `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "sse4.1")] +pub(crate) unsafe fn yuv_420p16_to_rgba_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + yuv_420p16_to_rgb_or_rgba_row::(y, u_half, v_half, rgba_out, width, matrix, full_range); + } +} + +/// Shared SSE4.1 16-bit YUV 4:2:0 kernel for [`yuv_420p16_to_rgb_row`] +/// (`ALPHA = false`, `write_rgb_16`) and [`yuv_420p16_to_rgba_row`] +/// (`ALPHA = true`, `write_rgba_16` with constant `0xFF` alpha). +/// +/// # Safety +/// +/// 1. **SSE4.1 must be available on the current CPU.** +/// 2. `width & 1 == 0`. +/// 3. `y.len() >= width`, `u_half.len() >= width / 2`, +/// `v_half.len() >= width / 2`, +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. 
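+///
+/// Both flavors are pinned byte-for-byte against the scalar reference;
+/// in sketch form (`want`/`got` are illustrative, preallocated to
+/// `4 * width`):
+///
+/// ```ignore
+/// scalar::yuv_420p16_to_rgba_row(y, u, v, &mut want, width, matrix, full);
+/// unsafe { yuv_420p16_to_rgba_row(y, u, v, &mut got, width, matrix, full) };
+/// assert_eq!(want, got);
+/// ```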
+#[inline] +#[target_feature(enable = "sse4.1")] +pub(crate) unsafe fn yuv_420p16_to_rgb_or_rgba_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert_eq!(width & 1, 0); debug_assert!(y.len() >= width); debug_assert!(u_half.len() >= width / 2); debug_assert!(v_half.len() >= width / 2); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::<16, 8>(full_range); @@ -2180,6 +2366,7 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_row( let cgv = _mm_set1_epi32(coeffs.g_v()); let cbu = _mm_set1_epi32(coeffs.b_u()); let cbv = _mm_set1_epi32(coeffs.b_v()); + let alpha_u8 = _mm_set1_epi8(-1); let mut x = 0usize; while x + 16 <= width { @@ -2228,20 +2415,27 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_row( let g_u8 = _mm_packus_epi16(g_lo, g_hi); let b_u8 = _mm_packus_epi16(b_lo, b_hi); - write_rgb_16(r_u8, g_u8, b_u8, rgb_out.as_mut_ptr().add(x * 3)); + if ALPHA { + write_rgba_16(r_u8, g_u8, b_u8, alpha_u8, out.as_mut_ptr().add(x * 4)); + } else { + write_rgb_16(r_u8, g_u8, b_u8, out.as_mut_ptr().add(x * 3)); + } x += 16; } if x < width { - scalar::yuv_420p16_to_rgb_row( - &y[x..width], - &u_half[x / 2..width / 2], - &v_half[x / 2..width / 2], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_u = &u_half[x / 2..width / 2]; + let tail_v = &v_half[x / 2..width / 2]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::yuv_420p16_to_rgba_row( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); + } else { + scalar::yuv_420p16_to_rgb_row(tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range); + } } } } @@ -2419,6 +2613,8 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_u16_row( /// 1. **SSE4.1 must be available on the current CPU.** /// 2. `width & 1 == 0`. /// 3. `y.len() >= width`, `uv_half.len() >= width`, `rgb_out.len() >= 3 * width`. +/// +/// Thin wrapper over [`p16_to_rgb_or_rgba_row`] with `ALPHA = false`. #[inline] #[target_feature(enable = "sse4.1")] pub(crate) unsafe fn p16_to_rgb_row( @@ -2429,10 +2625,60 @@ pub(crate) unsafe fn p16_to_rgb_row( matrix: ColorMatrix, full_range: bool, ) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + p16_to_rgb_or_rgba_row::(y, uv_half, rgb_out, width, matrix, full_range); + } +} + +/// SSE4.1 P016 → packed **8-bit RGBA** (`R, G, B, 0xFF`). +/// +/// Thin wrapper over [`p16_to_rgb_or_rgba_row`] with `ALPHA = true`. +/// +/// # Safety +/// +/// Same as [`p16_to_rgb_row`] but `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "sse4.1")] +pub(crate) unsafe fn p16_to_rgba_row( + y: &[u16], + uv_half: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + p16_to_rgb_or_rgba_row::(y, uv_half, rgba_out, width, matrix, full_range); + } +} + +/// Shared SSE4.1 P016 kernel for [`p16_to_rgb_row`] (`ALPHA = false`, +/// `write_rgb_16`) and [`p16_to_rgba_row`] (`ALPHA = true`, +/// `write_rgba_16` with constant `0xFF` alpha). +/// +/// # Safety +/// +/// 1. **SSE4.1 must be available on the current CPU.** +/// 2. `width & 1 == 0`. +/// 3. 
`y.len() >= width`, `uv_half.len() >= width`, +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. +#[inline] +#[target_feature(enable = "sse4.1")] +pub(crate) unsafe fn p16_to_rgb_or_rgba_row( + y: &[u16], + uv_half: &[u16], + out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert_eq!(width & 1, 0); debug_assert!(y.len() >= width); debug_assert!(uv_half.len() >= width); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::<16, 8>(full_range); @@ -2450,6 +2696,7 @@ pub(crate) unsafe fn p16_to_rgb_row( let cgv = _mm_set1_epi32(coeffs.g_v()); let cbu = _mm_set1_epi32(coeffs.b_u()); let cbv = _mm_set1_epi32(coeffs.b_v()); + let alpha_u8 = _mm_set1_epi8(-1); let mut x = 0usize; while x + 16 <= width { @@ -2495,19 +2742,24 @@ pub(crate) unsafe fn p16_to_rgb_row( let g_u8 = _mm_packus_epi16(g_lo, g_hi); let b_u8 = _mm_packus_epi16(b_lo, b_hi); - write_rgb_16(r_u8, g_u8, b_u8, rgb_out.as_mut_ptr().add(x * 3)); + if ALPHA { + write_rgba_16(r_u8, g_u8, b_u8, alpha_u8, out.as_mut_ptr().add(x * 4)); + } else { + write_rgb_16(r_u8, g_u8, b_u8, out.as_mut_ptr().add(x * 3)); + } x += 16; } if x < width { - scalar::p16_to_rgb_row( - &y[x..width], - &uv_half[x..width], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_uv = &uv_half[x..width]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::p16_to_rgba_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + } else { + scalar::p16_to_rgb_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + } } } } From 88a0de80a8d0a94bb8af2c9ece6803c81034028a Mon Sep 17 00:00:00 2001 From: uqio <276879906+uqio@users.noreply.github.com> Date: Sun, 26 Apr 2026 23:56:28 +1200 Subject: [PATCH 4/5] update --- src/row/arch/neon/tests.rs | 179 ++++++++++++ src/row/arch/wasm_simd128.rs | 300 ++++++++++++++++--- src/row/arch/wasm_simd128/tests.rs | 177 +++++++++++ src/row/arch/x86_avx2.rs | 301 ++++++++++++++++--- src/row/arch/x86_avx2/tests.rs | 173 +++++++++++ src/row/arch/x86_avx512.rs | 300 ++++++++++++++++--- src/row/arch/x86_avx512/tests.rs | 173 +++++++++++ src/row/arch/x86_sse41/tests.rs | 173 +++++++++++ src/row/mod.rs | 453 ++++++++++++++++++++++++++--- 9 files changed, 2066 insertions(+), 163 deletions(-) diff --git a/src/row/arch/neon/tests.rs b/src/row/arch/neon/tests.rs index a792c0d..41dd909 100644 --- a/src/row/arch/neon/tests.rs +++ b/src/row/arch/neon/tests.rs @@ -1649,6 +1649,185 @@ fn neon_p14_matches_scalar_tail_widths() { } } +// ---- High-bit 4:2:0 RGBA equivalence (Ship 8 Tranche 5a) ---------- +// +// RGBA wrappers share the math of their RGB siblings — only the store +// (and tail dispatch) branches on `ALPHA`. These tests pin that the +// SIMD RGBA path produces byte-identical output to the scalar RGBA +// reference, which already encodes the alpha = 0xFF contract. 
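+//
+// Restated as a sketch: for any converted row,
+//
+//     rgba.chunks_exact(4).all(|px| px[3] == 0xFF)
+//
+// holds, and px[..3] matches the RGB row byte-for-byte.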
+
+fn check_planar_u8_neon_rgba_equivalence_n<const BITS: usize>(
+    width: usize,
+    matrix: ColorMatrix,
+    full_range: bool,
+) {
+    let y = planar_n_plane::<BITS>(width, 37);
+    let u = planar_n_plane::<BITS>(width / 2, 53);
+    let v = planar_n_plane::<BITS>(width / 2, 71);
+    let mut rgba_scalar = std::vec![0u8; width * 4];
+    let mut rgba_neon = std::vec![0u8; width * 4];
+    scalar::yuv_420p_n_to_rgba_row::<BITS>(&y, &u, &v, &mut rgba_scalar, width, matrix, full_range);
+    unsafe {
+        yuv_420p_n_to_rgba_row::<BITS>(&y, &u, &v, &mut rgba_neon, width, matrix, full_range);
+    }
+    assert_eq!(
+        rgba_scalar, rgba_neon,
+        "NEON yuv_420p_n<{BITS}>→RGBA u8 diverges (width={width}, matrix={matrix:?}, full_range={full_range})"
+    );
+}
+
+fn check_pn_u8_neon_rgba_equivalence_n<const BITS: usize>(
+    width: usize,
+    matrix: ColorMatrix,
+    full_range: bool,
+) {
+    let y = p_n_packed_plane::<BITS>(width, 37);
+    let u = p_n_packed_plane::<BITS>(width / 2, 53);
+    let v = p_n_packed_plane::<BITS>(width / 2, 71);
+    let uv = p010_uv_interleave(&u, &v);
+    let mut rgba_scalar = std::vec![0u8; width * 4];
+    let mut rgba_neon = std::vec![0u8; width * 4];
+    scalar::p_n_to_rgba_row::<BITS>(&y, &uv, &mut rgba_scalar, width, matrix, full_range);
+    unsafe {
+        p_n_to_rgba_row::<BITS>(&y, &uv, &mut rgba_neon, width, matrix, full_range);
+    }
+    assert_eq!(
+        rgba_scalar, rgba_neon,
+        "NEON Pn<{BITS}>→RGBA u8 diverges (width={width}, matrix={matrix:?}, full_range={full_range})"
+    );
+}
+
+#[test]
+#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")]
+fn neon_yuv420p_n_rgba_matches_scalar_all_bits() {
+    for m in [
+        ColorMatrix::Bt601,
+        ColorMatrix::Bt709,
+        ColorMatrix::Bt2020Ncl,
+        ColorMatrix::Smpte240m,
+        ColorMatrix::Fcc,
+        ColorMatrix::YCgCo,
+    ] {
+        for full in [true, false] {
+            check_planar_u8_neon_rgba_equivalence_n::<9>(16, m, full);
+            check_planar_u8_neon_rgba_equivalence_n::<10>(16, m, full);
+            check_planar_u8_neon_rgba_equivalence_n::<12>(16, m, full);
+            check_planar_u8_neon_rgba_equivalence_n::<14>(16, m, full);
+        }
+    }
+}
+
+#[test]
+#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")]
+fn neon_yuv420p_n_rgba_matches_scalar_tail_and_1920() {
+    for w in [18usize, 30, 34, 1920, 1922] {
+        check_planar_u8_neon_rgba_equivalence_n::<9>(w, ColorMatrix::Bt601, false);
+        check_planar_u8_neon_rgba_equivalence_n::<10>(w, ColorMatrix::Bt709, true);
+        check_planar_u8_neon_rgba_equivalence_n::<12>(w, ColorMatrix::Bt2020Ncl, false);
+        check_planar_u8_neon_rgba_equivalence_n::<14>(w, ColorMatrix::YCgCo, true);
+    }
+}
+
+#[test]
+#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")]
+fn neon_pn_rgba_matches_scalar_all_bits() {
+    for m in [
+        ColorMatrix::Bt601,
+        ColorMatrix::Bt709,
+        ColorMatrix::Bt2020Ncl,
+        ColorMatrix::Smpte240m,
+        ColorMatrix::Fcc,
+        ColorMatrix::YCgCo,
+    ] {
+        for full in [true, false] {
+            check_pn_u8_neon_rgba_equivalence_n::<10>(16, m, full);
+            check_pn_u8_neon_rgba_equivalence_n::<12>(16, m, full);
+        }
+    }
+}
+
+#[test]
+#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")]
+fn neon_pn_rgba_matches_scalar_tail_and_1920() {
+    for w in [18usize, 30, 34, 1920, 1922] {
+        check_pn_u8_neon_rgba_equivalence_n::<10>(w, ColorMatrix::Bt601, false);
+        check_pn_u8_neon_rgba_equivalence_n::<12>(w, ColorMatrix::Bt709, true);
+    }
+}
+
+fn check_yuv420p16_u8_neon_rgba_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) {
+    let y = p16_plane_neon(width, 37);
+    let u = p16_plane_neon(width / 2, 53);
+    let v = p16_plane_neon(width / 2, 71);
+    let mut rgba_scalar = std::vec![0u8; width * 4];
+    let mut rgba_neon = std::vec![0u8; width * 4];
+ scalar::yuv_420p16_to_rgba_row(&y, &u, &v, &mut rgba_scalar, width, matrix, full_range); + unsafe { + yuv_420p16_to_rgba_row(&y, &u, &v, &mut rgba_neon, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_neon, + "NEON yuv_420p16→RGBA u8 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +fn check_p016_u8_neon_rgba_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { + let y = p16_plane_neon(width, 37); + let u = p16_plane_neon(width / 2, 53); + let v = p16_plane_neon(width / 2, 71); + let uv = p010_uv_interleave(&u, &v); + let mut rgba_scalar = std::vec![0u8; width * 4]; + let mut rgba_neon = std::vec![0u8; width * 4]; + scalar::p16_to_rgba_row(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + unsafe { + p16_to_rgba_row(&y, &uv, &mut rgba_neon, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_neon, + "NEON P016→RGBA u8 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +#[test] +#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] +fn neon_yuv420p16_rgba_matches_scalar_all_matrices() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_yuv420p16_u8_neon_rgba_equivalence(16, m, full); + } + } + for w in [18usize, 30, 34, 1920, 1922] { + check_yuv420p16_u8_neon_rgba_equivalence(w, ColorMatrix::Bt709, false); + } +} + +#[test] +#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] +fn neon_p016_rgba_matches_scalar_all_matrices() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_p016_u8_neon_rgba_equivalence(16, m, full); + } + } + for w in [18usize, 30, 34, 1920, 1922] { + check_p016_u8_neon_rgba_equivalence(w, ColorMatrix::Bt709, false); + } +} + // ---- Yuv444p_n NEON equivalence (10/12/14) -------------------------- fn check_yuv444p_n_u8_neon_equivalence( diff --git a/src/row/arch/wasm_simd128.rs b/src/row/arch/wasm_simd128.rs index 29e3f5a..e9d02c4 100644 --- a/src/row/arch/wasm_simd128.rs +++ b/src/row/arch/wasm_simd128.rs @@ -270,6 +270,8 @@ unsafe fn yuv_420_to_rgb_or_rgba_row( /// 2. `width & 1 == 0`. /// 3. `y.len() >= width`, `u_half.len() >= width / 2`, /// `v_half.len() >= width / 2`, `rgb_out.len() >= 3 * width`. +/// +/// Thin wrapper over [`yuv_420p_n_to_rgb_or_rgba_row`] with `ALPHA = false`. #[inline] #[target_feature(enable = "simd128")] pub(crate) unsafe fn yuv_420p_n_to_rgb_row( @@ -280,13 +282,64 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_row( width: usize, matrix: ColorMatrix, full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + yuv_420p_n_to_rgb_or_rgba_row::( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } +} + +/// wasm simd128 high-bit-depth YUV 4:2:0 → packed **8-bit RGBA** (`R, G, B, 0xFF`). +/// +/// Thin wrapper over [`yuv_420p_n_to_rgb_or_rgba_row`] with `ALPHA = true`. +#[inline] +#[target_feature(enable = "simd128")] +pub(crate) unsafe fn yuv_420p_n_to_rgba_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. 
+ unsafe { + yuv_420p_n_to_rgb_or_rgba_row::( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } +} + +/// Shared wasm simd128 high-bit YUV 4:2:0 kernel. `ALPHA = false` uses +/// `write_rgb_16`; `ALPHA = true` uses `write_rgba_16` with constant +/// `0xFF` alpha. +/// +/// # Safety +/// +/// 1. **simd128 enabled at compile time.** +/// 2. `width & 1 == 0`. 3. slices long enough + +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. 4. `BITS` ∈ `{9, 10, 12, 14}`. +#[inline] +#[target_feature(enable = "simd128")] +pub(crate) unsafe fn yuv_420p_n_to_rgb_or_rgba_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, ) { const { assert!(BITS == 9 || BITS == 10 || BITS == 12 || BITS == 14) }; + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert_eq!(width & 1, 0); debug_assert!(y.len() >= width); debug_assert!(u_half.len() >= width / 2); debug_assert!(v_half.len() >= width / 2); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); @@ -308,6 +361,7 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_row( let cgv = i32x4_splat(coeffs.g_v()); let cbu = i32x4_splat(coeffs.b_u()); let cbv = i32x4_splat(coeffs.b_v()); + let alpha_u8 = u8x16_splat(0xFF); let mut x = 0usize; while x + 16 <= width { @@ -356,21 +410,30 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_row( let g_u8 = u8x16_narrow_i16x8(g_lo, g_hi); let r_u8 = u8x16_narrow_i16x8(r_lo, r_hi); - write_rgb_16(r_u8, g_u8, b_u8, rgb_out.as_mut_ptr().add(x * 3)); + if ALPHA { + write_rgba_16(r_u8, g_u8, b_u8, alpha_u8, out.as_mut_ptr().add(x * 4)); + } else { + write_rgb_16(r_u8, g_u8, b_u8, out.as_mut_ptr().add(x * 3)); + } x += 16; } if x < width { - scalar::yuv_420p_n_to_rgb_row::( - &y[x..width], - &u_half[x / 2..width / 2], - &v_half[x / 2..width / 2], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_u = &u_half[x / 2..width / 2]; + let tail_v = &v_half[x / 2..width / 2]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::yuv_420p_n_to_rgba_row::( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); + } else { + scalar::yuv_420p_n_to_rgb_row::( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); + } } } } @@ -1065,6 +1128,8 @@ unsafe fn write_rgb_u16_8(r: v128, g: v128, b: v128, ptr: *mut u16) { /// 2. `width & 1 == 0`. /// 3. `y.len() >= width`, `uv_half.len() >= width`, /// `rgb_out.len() >= 3 * width`. +/// +/// Thin wrapper over [`p_n_to_rgb_or_rgba_row`] with `ALPHA = false`. #[inline] #[target_feature(enable = "simd128")] pub(crate) unsafe fn p_n_to_rgb_row( @@ -1075,10 +1140,57 @@ pub(crate) unsafe fn p_n_to_rgb_row( matrix: ColorMatrix, full_range: bool, ) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + p_n_to_rgb_or_rgba_row::(y, uv_half, rgb_out, width, matrix, full_range); + } +} + +/// wasm simd128 high-bit-packed semi-planar 4:2:0 → packed **8-bit +/// RGBA** (`R, G, B, 0xFF`). +/// +/// Thin wrapper over [`p_n_to_rgb_or_rgba_row`] with `ALPHA = true`. +#[inline] +#[target_feature(enable = "simd128")] +pub(crate) unsafe fn p_n_to_rgba_row( + y: &[u16], + uv_half: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. 
+ unsafe { + p_n_to_rgb_or_rgba_row::(y, uv_half, rgba_out, width, matrix, full_range); + } +} + +/// Shared wasm simd128 P010/P012 kernel. `ALPHA = false` uses +/// `write_rgb_16`; `ALPHA = true` uses `write_rgba_16` with constant +/// `0xFF` alpha. +/// +/// # Safety +/// +/// 1. **simd128 enabled at compile time.** +/// 2. `width & 1 == 0`. 3. slices long enough + +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. 4. `BITS` ∈ `{10, 12}`. +#[inline] +#[target_feature(enable = "simd128")] +pub(crate) unsafe fn p_n_to_rgb_or_rgba_row( + y: &[u16], + uv_half: &[u16], + out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + const { assert!(BITS == 10 || BITS == 12) }; + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert_eq!(width & 1, 0); debug_assert!(y.len() >= width); debug_assert!(uv_half.len() >= width); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); @@ -1099,6 +1211,7 @@ pub(crate) unsafe fn p_n_to_rgb_row( let cgv = i32x4_splat(coeffs.g_v()); let cbu = i32x4_splat(coeffs.b_u()); let cbv = i32x4_splat(coeffs.b_v()); + let alpha_u8 = u8x16_splat(0xFF); // High-bit-packed samples: shift right by `16 - BITS`. let shr = (16 - BITS) as u32; @@ -1149,20 +1262,25 @@ pub(crate) unsafe fn p_n_to_rgb_row( let g_u8 = u8x16_narrow_i16x8(g_lo, g_hi); let r_u8 = u8x16_narrow_i16x8(r_lo, r_hi); - write_rgb_16(r_u8, g_u8, b_u8, rgb_out.as_mut_ptr().add(x * 3)); + if ALPHA { + write_rgba_16(r_u8, g_u8, b_u8, alpha_u8, out.as_mut_ptr().add(x * 4)); + } else { + write_rgb_16(r_u8, g_u8, b_u8, out.as_mut_ptr().add(x * 3)); + } x += 16; } if x < width { - scalar::p_n_to_rgb_row::( - &y[x..width], - &uv_half[x..width], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_uv = &uv_half[x..width]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::p_n_to_rgba_row::(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + } else { + scalar::p_n_to_rgb_row::(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + } } } } @@ -2359,6 +2477,8 @@ fn scale_y_i32x4_i64_wasm(y_minus_off: v128, y_scale_i64: v128, rnd_i64: v128) - /// 2. `width & 1 == 0`. /// 3. `y.len() >= width`, `u_half.len() >= width / 2`, /// `v_half.len() >= width / 2`, `rgb_out.len() >= 3 * width`. +/// +/// Thin wrapper over [`yuv_420p16_to_rgb_or_rgba_row`] with `ALPHA = false`. #[inline] #[target_feature(enable = "simd128")] pub(crate) unsafe fn yuv_420p16_to_rgb_row( @@ -2370,11 +2490,52 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_row( matrix: ColorMatrix, full_range: bool, ) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + yuv_420p16_to_rgb_or_rgba_row::(y, u_half, v_half, rgb_out, width, matrix, full_range); + } +} + +/// wasm simd128 16-bit YUV 4:2:0 → packed **8-bit RGBA** (`R, G, B, 0xFF`). +/// +/// Thin wrapper over [`yuv_420p16_to_rgb_or_rgba_row`] with `ALPHA = true`. +#[inline] +#[target_feature(enable = "simd128")] +pub(crate) unsafe fn yuv_420p16_to_rgba_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. 
+ unsafe { + yuv_420p16_to_rgb_or_rgba_row::(y, u_half, v_half, rgba_out, width, matrix, full_range); + } +} + +/// Shared wasm simd128 16-bit YUV 4:2:0 kernel. `ALPHA = false` uses +/// `write_rgb_16`; `ALPHA = true` uses `write_rgba_16` with constant +/// `0xFF` alpha. +#[inline] +#[target_feature(enable = "simd128")] +pub(crate) unsafe fn yuv_420p16_to_rgb_or_rgba_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert_eq!(width & 1, 0); debug_assert!(y.len() >= width); debug_assert!(u_half.len() >= width / 2); debug_assert!(v_half.len() >= width / 2); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::<16, 8>(full_range); @@ -2392,6 +2553,7 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_row( let cgv = i32x4_splat(coeffs.g_v()); let cbu = i32x4_splat(coeffs.b_u()); let cbv = i32x4_splat(coeffs.b_v()); + let alpha_u8 = u8x16_splat(0xFF); let mut x = 0usize; while x + 16 <= width { @@ -2438,20 +2600,27 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_row( let g_u8 = u8x16_narrow_i16x8(g_lo, g_hi); let b_u8 = u8x16_narrow_i16x8(b_lo, b_hi); - write_rgb_16(r_u8, g_u8, b_u8, rgb_out.as_mut_ptr().add(x * 3)); + if ALPHA { + write_rgba_16(r_u8, g_u8, b_u8, alpha_u8, out.as_mut_ptr().add(x * 4)); + } else { + write_rgb_16(r_u8, g_u8, b_u8, out.as_mut_ptr().add(x * 3)); + } x += 16; } if x < width { - scalar::yuv_420p16_to_rgb_row( - &y[x..width], - &u_half[x / 2..width / 2], - &v_half[x / 2..width / 2], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_u = &u_half[x / 2..width / 2]; + let tail_v = &v_half[x / 2..width / 2]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::yuv_420p16_to_rgba_row( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); + } else { + scalar::yuv_420p16_to_rgb_row(tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range); + } } } } @@ -2595,6 +2764,8 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_u16_row( /// 1. **simd128 must be enabled at compile time.** /// 2. `width & 1 == 0`. /// 3. `y.len() >= width`, `uv_half.len() >= width`, `rgb_out.len() >= 3 * width`. +/// +/// Thin wrapper over [`p16_to_rgb_or_rgba_row`] with `ALPHA = false`. #[inline] #[target_feature(enable = "simd128")] pub(crate) unsafe fn p16_to_rgb_row( @@ -2605,10 +2776,49 @@ pub(crate) unsafe fn p16_to_rgb_row( matrix: ColorMatrix, full_range: bool, ) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + p16_to_rgb_or_rgba_row::(y, uv_half, rgb_out, width, matrix, full_range); + } +} + +/// wasm simd128 P016 → packed **8-bit RGBA** (`R, G, B, 0xFF`). +/// +/// Thin wrapper over [`p16_to_rgb_or_rgba_row`] with `ALPHA = true`. +#[inline] +#[target_feature(enable = "simd128")] +pub(crate) unsafe fn p16_to_rgba_row( + y: &[u16], + uv_half: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + p16_to_rgb_or_rgba_row::(y, uv_half, rgba_out, width, matrix, full_range); + } +} + +/// Shared wasm simd128 P016 kernel. `ALPHA = false` uses +/// `write_rgb_16`; `ALPHA = true` uses `write_rgba_16` with constant +/// `0xFF` alpha. 
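+///
+/// # Safety
+///
+/// Same contract as [`p16_to_rgb_row`] (simd128 enabled at compile
+/// time, `width & 1 == 0`, `y.len() >= width`, `uv_half.len() >= width`),
+/// with the output bound generalized to
+/// `out.len() >= width * if ALPHA { 4 } else { 3 }`.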
+#[inline] +#[target_feature(enable = "simd128")] +pub(crate) unsafe fn p16_to_rgb_or_rgba_row( + y: &[u16], + uv_half: &[u16], + out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert_eq!(width & 1, 0); debug_assert!(y.len() >= width); debug_assert!(uv_half.len() >= width); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::<16, 8>(full_range); @@ -2626,6 +2836,7 @@ pub(crate) unsafe fn p16_to_rgb_row( let cgv = i32x4_splat(coeffs.g_v()); let cbu = i32x4_splat(coeffs.b_u()); let cbv = i32x4_splat(coeffs.b_v()); + let alpha_u8 = u8x16_splat(0xFF); let mut x = 0usize; while x + 16 <= width { @@ -2671,19 +2882,24 @@ pub(crate) unsafe fn p16_to_rgb_row( let g_u8 = u8x16_narrow_i16x8(g_lo, g_hi); let b_u8 = u8x16_narrow_i16x8(b_lo, b_hi); - write_rgb_16(r_u8, g_u8, b_u8, rgb_out.as_mut_ptr().add(x * 3)); + if ALPHA { + write_rgba_16(r_u8, g_u8, b_u8, alpha_u8, out.as_mut_ptr().add(x * 4)); + } else { + write_rgb_16(r_u8, g_u8, b_u8, out.as_mut_ptr().add(x * 3)); + } x += 16; } if x < width { - scalar::p16_to_rgb_row( - &y[x..width], - &uv_half[x..width], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_uv = &uv_half[x..width]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::p16_to_rgba_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + } else { + scalar::p16_to_rgb_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + } } } } diff --git a/src/row/arch/wasm_simd128/tests.rs b/src/row/arch/wasm_simd128/tests.rs index 0e05462..d394f33 100644 --- a/src/row/arch/wasm_simd128/tests.rs +++ b/src/row/arch/wasm_simd128/tests.rs @@ -1387,6 +1387,183 @@ fn simd128_yuv420p16_u16_matches_scalar_tight_widths() { } } +// ---- High-bit 4:2:0 RGBA equivalence (Ship 8 Tranche 5a) ---------- +// +// RGBA wrappers share the math of their RGB siblings — only the store +// (and tail dispatch) branches on `ALPHA`. These tests pin that the +// SIMD RGBA path produces byte-identical output to the scalar RGBA +// reference. 
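+//
+// Width coverage, sketched for these 16-pixels-per-iteration kernels:
+//
+//     16          one full SIMD block, no scalar tail
+//     18, 30, 34  full block(s) plus scalar tails of 2, 14, and 2 px
+//     1920, 1922  a long row without / with a 2-px tail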
+ +fn check_planar_u8_simd128_rgba_equivalence_n( + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let y = planar_n_plane::(width, 37); + let u = planar_n_plane::(width / 2, 53); + let v = planar_n_plane::(width / 2, 71); + let mut rgba_scalar = std::vec![0u8; width * 4]; + let mut rgba_simd = std::vec![0u8; width * 4]; + scalar::yuv_420p_n_to_rgba_row::(&y, &u, &v, &mut rgba_scalar, width, matrix, full_range); + unsafe { + yuv_420p_n_to_rgba_row::(&y, &u, &v, &mut rgba_simd, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_simd, + "simd128 yuv_420p_n<{BITS}>→RGBA u8 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +fn check_pn_u8_simd128_rgba_equivalence_n( + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let y = p_n_packed_plane::(width, 37); + let u = p_n_packed_plane::(width / 2, 53); + let v = p_n_packed_plane::(width / 2, 71); + let uv = p010_uv_interleave(&u, &v); + let mut rgba_scalar = std::vec![0u8; width * 4]; + let mut rgba_simd = std::vec![0u8; width * 4]; + scalar::p_n_to_rgba_row::(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + unsafe { + p_n_to_rgba_row::(&y, &uv, &mut rgba_simd, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_simd, + "simd128 Pn<{BITS}>→RGBA u8 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +fn check_yuv420p16_u8_simd128_rgba_equivalence( + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let y = p16_plane_wasm(width, 37); + let u = p16_plane_wasm(width / 2, 53); + let v = p16_plane_wasm(width / 2, 71); + let mut rgba_scalar = std::vec![0u8; width * 4]; + let mut rgba_simd = std::vec![0u8; width * 4]; + scalar::yuv_420p16_to_rgba_row(&y, &u, &v, &mut rgba_scalar, width, matrix, full_range); + unsafe { + yuv_420p16_to_rgba_row(&y, &u, &v, &mut rgba_simd, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_simd, + "simd128 yuv_420p16→RGBA u8 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +fn check_p16_u8_simd128_rgba_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { + let y = p16_plane_wasm(width, 37); + let u = p16_plane_wasm(width / 2, 53); + let v = p16_plane_wasm(width / 2, 71); + let uv = p010_uv_interleave(&u, &v); + let mut rgba_scalar = std::vec![0u8; width * 4]; + let mut rgba_simd = std::vec![0u8; width * 4]; + scalar::p16_to_rgba_row(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + unsafe { + p16_to_rgba_row(&y, &uv, &mut rgba_simd, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_simd, + "simd128 P016→RGBA u8 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +#[test] +fn simd128_yuv420p_n_rgba_matches_scalar_all_bits() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_planar_u8_simd128_rgba_equivalence_n::<9>(16, m, full); + check_planar_u8_simd128_rgba_equivalence_n::<10>(16, m, full); + check_planar_u8_simd128_rgba_equivalence_n::<12>(16, m, full); + check_planar_u8_simd128_rgba_equivalence_n::<14>(16, m, full); + } + } +} + +#[test] +fn simd128_yuv420p_n_rgba_matches_scalar_tail_and_1920() { + for w in [18usize, 30, 34, 1920, 1922] { + check_planar_u8_simd128_rgba_equivalence_n::<9>(w, ColorMatrix::Bt601, false); + check_planar_u8_simd128_rgba_equivalence_n::<10>(w, ColorMatrix::Bt709, true); + 
check_planar_u8_simd128_rgba_equivalence_n::<12>(w, ColorMatrix::Bt2020Ncl, false); + check_planar_u8_simd128_rgba_equivalence_n::<14>(w, ColorMatrix::YCgCo, true); + } +} + +#[test] +fn simd128_pn_rgba_matches_scalar_all_bits() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_pn_u8_simd128_rgba_equivalence_n::<10>(16, m, full); + check_pn_u8_simd128_rgba_equivalence_n::<12>(16, m, full); + } + } +} + +#[test] +fn simd128_pn_rgba_matches_scalar_tail_and_1920() { + for w in [18usize, 30, 34, 1920, 1922] { + check_pn_u8_simd128_rgba_equivalence_n::<10>(w, ColorMatrix::Bt601, false); + check_pn_u8_simd128_rgba_equivalence_n::<12>(w, ColorMatrix::Bt709, true); + } +} + +#[test] +fn simd128_yuv420p16_rgba_matches_scalar_all_matrices() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_yuv420p16_u8_simd128_rgba_equivalence(16, m, full); + } + } + for w in [18usize, 30, 34, 1920, 1922] { + check_yuv420p16_u8_simd128_rgba_equivalence(w, ColorMatrix::Bt709, false); + } +} + +#[test] +fn simd128_p016_rgba_matches_scalar_all_matrices() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_p16_u8_simd128_rgba_equivalence(16, m, full); + } + } + for w in [18usize, 30, 34, 1920, 1922] { + check_p16_u8_simd128_rgba_equivalence(w, ColorMatrix::Bt709, false); + } +} + // ---- Pn 4:4:4 (P410 / P412 / P416) wasm simd128 equivalence --------- fn high_bit_plane_wasm(n: usize, seed: usize) -> std::vec::Vec { diff --git a/src/row/arch/x86_avx2.rs b/src/row/arch/x86_avx2.rs index 2882928..d0f72fd 100644 --- a/src/row/arch/x86_avx2.rs +++ b/src/row/arch/x86_avx2.rs @@ -292,6 +292,8 @@ unsafe fn yuv_420_to_rgb_or_rgba_row( /// 2. `width & 1 == 0`. /// 3. `y.len() >= width`, `u_half.len() >= width / 2`, /// `v_half.len() >= width / 2`, `rgb_out.len() >= 3 * width`. +/// +/// Thin wrapper over [`yuv_420p_n_to_rgb_or_rgba_row`] with `ALPHA = false`. #[inline] #[target_feature(enable = "avx2")] pub(crate) unsafe fn yuv_420p_n_to_rgb_row( @@ -302,13 +304,65 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_row( width: usize, matrix: ColorMatrix, full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + yuv_420p_n_to_rgb_or_rgba_row::( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } +} + +/// AVX2 high-bit-depth YUV 4:2:0 → packed **8-bit RGBA** (`R, G, B, 0xFF`). +/// +/// Thin wrapper over [`yuv_420p_n_to_rgb_or_rgba_row`] with `ALPHA = true`. +#[inline] +#[target_feature(enable = "avx2")] +pub(crate) unsafe fn yuv_420p_n_to_rgba_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + yuv_420p_n_to_rgb_or_rgba_row::( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } +} + +/// Shared AVX2 high-bit YUV 4:2:0 kernel. `ALPHA = false` uses +/// `write_rgb_32`; `ALPHA = true` uses `write_rgba_32` with constant +/// `0xFF` alpha. +/// +/// # Safety +/// +/// 1. **AVX2 must be available.** +/// 2. `width & 1 == 0`. 3. 
slices long enough for `BITS` semantics + +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. 4. `BITS` ∈ +/// `{9, 10, 12, 14}`. +#[inline] +#[target_feature(enable = "avx2")] +pub(crate) unsafe fn yuv_420p_n_to_rgb_or_rgba_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, ) { const { assert!(BITS == 9 || BITS == 10 || BITS == 12 || BITS == 14) }; + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert_eq!(width & 1, 0); debug_assert!(y.len() >= width); debug_assert!(u_half.len() >= width / 2); debug_assert!(v_half.len() >= width / 2); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); @@ -329,6 +383,7 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_row( let cgv = _mm256_set1_epi32(coeffs.g_v()); let cbu = _mm256_set1_epi32(coeffs.b_u()); let cbv = _mm256_set1_epi32(coeffs.b_v()); + let alpha_u8 = _mm256_set1_epi8(-1); let mut x = 0usize; while x + 32 <= width { @@ -393,21 +448,30 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_row( let g_u8 = narrow_u8x32(g_lo, g_hi); let r_u8 = narrow_u8x32(r_lo, r_hi); - write_rgb_32(r_u8, g_u8, b_u8, rgb_out.as_mut_ptr().add(x * 3)); + if ALPHA { + write_rgba_32(r_u8, g_u8, b_u8, alpha_u8, out.as_mut_ptr().add(x * 4)); + } else { + write_rgb_32(r_u8, g_u8, b_u8, out.as_mut_ptr().add(x * 3)); + } x += 32; } if x < width { - scalar::yuv_420p_n_to_rgb_row::( - &y[x..width], - &u_half[x / 2..width / 2], - &v_half[x / 2..width / 2], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_u = &u_half[x / 2..width / 2]; + let tail_v = &v_half[x / 2..width / 2]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::yuv_420p_n_to_rgba_row::( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); + } else { + scalar::yuv_420p_n_to_rgb_row::( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); + } } } } @@ -1211,6 +1275,8 @@ pub(crate) unsafe fn yuv_444p16_to_rgb_u16_row( /// 2. `width & 1 == 0`. /// 3. `y.len() >= width`, `uv_half.len() >= width`, /// `rgb_out.len() >= 3 * width`. +/// +/// Thin wrapper over [`p_n_to_rgb_or_rgba_row`] with `ALPHA = false`. #[inline] #[target_feature(enable = "avx2")] pub(crate) unsafe fn p_n_to_rgb_row( @@ -1221,10 +1287,58 @@ pub(crate) unsafe fn p_n_to_rgb_row( matrix: ColorMatrix, full_range: bool, ) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + p_n_to_rgb_or_rgba_row::(y, uv_half, rgb_out, width, matrix, full_range); + } +} + +/// AVX2 high-bit-packed semi-planar 4:2:0 → packed **8-bit RGBA** +/// (`R, G, B, 0xFF`). +/// +/// Thin wrapper over [`p_n_to_rgb_or_rgba_row`] with `ALPHA = true`. +#[inline] +#[target_feature(enable = "avx2")] +pub(crate) unsafe fn p_n_to_rgba_row( + y: &[u16], + uv_half: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + p_n_to_rgb_or_rgba_row::(y, uv_half, rgba_out, width, matrix, full_range); + } +} + +/// Shared AVX2 P010/P012 kernel. `ALPHA = false` uses `write_rgb_32`; +/// `ALPHA = true` uses `write_rgba_32` with constant `0xFF` alpha. +/// +/// # Safety +/// +/// 1. **AVX2 must be available.** +/// 2. `width & 1 == 0`. +/// 3. 
`y.len() >= width`, `uv_half.len() >= width`, +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. +/// 4. `BITS` ∈ `{10, 12}`. +#[inline] +#[target_feature(enable = "avx2")] +pub(crate) unsafe fn p_n_to_rgb_or_rgba_row( + y: &[u16], + uv_half: &[u16], + out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + const { assert!(BITS == 10 || BITS == 12) }; + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert_eq!(width & 1, 0); debug_assert!(y.len() >= width); debug_assert!(uv_half.len() >= width); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); @@ -1246,6 +1360,7 @@ pub(crate) unsafe fn p_n_to_rgb_row( let cgv = _mm256_set1_epi32(coeffs.g_v()); let cbu = _mm256_set1_epi32(coeffs.b_u()); let cbv = _mm256_set1_epi32(coeffs.b_v()); + let alpha_u8 = _mm256_set1_epi8(-1); let mut x = 0usize; while x + 32 <= width { @@ -1306,20 +1421,25 @@ pub(crate) unsafe fn p_n_to_rgb_row( let g_u8 = narrow_u8x32(g_lo, g_hi); let r_u8 = narrow_u8x32(r_lo, r_hi); - write_rgb_32(r_u8, g_u8, b_u8, rgb_out.as_mut_ptr().add(x * 3)); + if ALPHA { + write_rgba_32(r_u8, g_u8, b_u8, alpha_u8, out.as_mut_ptr().add(x * 4)); + } else { + write_rgb_32(r_u8, g_u8, b_u8, out.as_mut_ptr().add(x * 3)); + } x += 32; } if x < width { - scalar::p_n_to_rgb_row::( - &y[x..width], - &uv_half[x..width], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_uv = &uv_half[x..width]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::p_n_to_rgba_row::(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + } else { + scalar::p_n_to_rgb_row::(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + } } } } @@ -2558,6 +2678,8 @@ fn chroma_dup_i32(chroma: __m256i) -> (__m256i, __m256i) { /// 2. `width & 1 == 0`. /// 3. `y.len() >= width`, `u_half.len() >= width / 2`, /// `v_half.len() >= width / 2`, `rgb_out.len() >= 3 * width`. +/// +/// Thin wrapper over [`yuv_420p16_to_rgb_or_rgba_row`] with `ALPHA = false`. #[inline] #[target_feature(enable = "avx2")] pub(crate) unsafe fn yuv_420p16_to_rgb_row( @@ -2569,11 +2691,52 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_row( matrix: ColorMatrix, full_range: bool, ) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + yuv_420p16_to_rgb_or_rgba_row::(y, u_half, v_half, rgb_out, width, matrix, full_range); + } +} + +/// AVX2 16-bit YUV 4:2:0 → packed **8-bit RGBA** (`R, G, B, 0xFF`). +/// +/// Thin wrapper over [`yuv_420p16_to_rgb_or_rgba_row`] with `ALPHA = true`. +#[inline] +#[target_feature(enable = "avx2")] +pub(crate) unsafe fn yuv_420p16_to_rgba_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + yuv_420p16_to_rgb_or_rgba_row::(y, u_half, v_half, rgba_out, width, matrix, full_range); + } +} + +/// Shared AVX2 16-bit YUV 4:2:0 kernel. `ALPHA = false` uses +/// `write_rgb_32`; `ALPHA = true` uses `write_rgba_32` with constant +/// `0xFF` alpha. 
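+///
+/// # Safety
+///
+/// 1. **AVX2 must be available.**
+/// 2. `width & 1 == 0`.
+/// 3. `y.len() >= width`, `u_half.len() >= width / 2`,
+///    `v_half.len() >= width / 2`,
+///    `out.len() >= width * if ALPHA { 4 } else { 3 }`.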
+#[inline] +#[target_feature(enable = "avx2")] +pub(crate) unsafe fn yuv_420p16_to_rgb_or_rgba_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert_eq!(width & 1, 0); debug_assert!(y.len() >= width); debug_assert!(u_half.len() >= width / 2); debug_assert!(v_half.len() >= width / 2); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::<16, 8>(full_range); @@ -2591,6 +2754,7 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_row( let cgv = _mm256_set1_epi32(coeffs.g_v()); let cbu = _mm256_set1_epi32(coeffs.b_u()); let cbv = _mm256_set1_epi32(coeffs.b_v()); + let alpha_u8 = _mm256_set1_epi8(-1); let mut x = 0usize; while x + 32 <= width { @@ -2646,20 +2810,27 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_row( let g_u8 = narrow_u8x32(g_lo, g_hi); let b_u8 = narrow_u8x32(b_lo, b_hi); - write_rgb_32(r_u8, g_u8, b_u8, rgb_out.as_mut_ptr().add(x * 3)); + if ALPHA { + write_rgba_32(r_u8, g_u8, b_u8, alpha_u8, out.as_mut_ptr().add(x * 4)); + } else { + write_rgb_32(r_u8, g_u8, b_u8, out.as_mut_ptr().add(x * 3)); + } x += 32; } if x < width { - scalar::yuv_420p16_to_rgb_row( - &y[x..width], - &u_half[x / 2..width / 2], - &v_half[x / 2..width / 2], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_u = &u_half[x / 2..width / 2]; + let tail_v = &v_half[x / 2..width / 2]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::yuv_420p16_to_rgba_row( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); + } else { + scalar::yuv_420p16_to_rgb_row(tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range); + } } } } @@ -2820,6 +2991,8 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_u16_row( /// 1. **AVX2 must be available.** /// 2. `width & 1 == 0`. /// 3. `y.len() >= width`, `uv_half.len() >= width`, `rgb_out.len() >= 3 * width`. +/// +/// Thin wrapper over [`p16_to_rgb_or_rgba_row`] with `ALPHA = false`. #[inline] #[target_feature(enable = "avx2")] pub(crate) unsafe fn p16_to_rgb_row( @@ -2830,10 +3003,48 @@ pub(crate) unsafe fn p16_to_rgb_row( matrix: ColorMatrix, full_range: bool, ) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + p16_to_rgb_or_rgba_row::(y, uv_half, rgb_out, width, matrix, full_range); + } +} + +/// AVX2 P016 → packed **8-bit RGBA** (`R, G, B, 0xFF`). +/// +/// Thin wrapper over [`p16_to_rgb_or_rgba_row`] with `ALPHA = true`. +#[inline] +#[target_feature(enable = "avx2")] +pub(crate) unsafe fn p16_to_rgba_row( + y: &[u16], + uv_half: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + p16_to_rgb_or_rgba_row::(y, uv_half, rgba_out, width, matrix, full_range); + } +} + +/// Shared AVX2 P016 kernel. `ALPHA = false` uses `write_rgb_32`; +/// `ALPHA = true` uses `write_rgba_32` with constant `0xFF` alpha. 
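+///
+/// # Safety
+///
+/// 1. **AVX2 must be available.**
+/// 2. `width & 1 == 0`.
+/// 3. `y.len() >= width`, `uv_half.len() >= width`,
+///    `out.len() >= width * if ALPHA { 4 } else { 3 }`.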
+#[inline] +#[target_feature(enable = "avx2")] +pub(crate) unsafe fn p16_to_rgb_or_rgba_row( + y: &[u16], + uv_half: &[u16], + out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert_eq!(width & 1, 0); debug_assert!(y.len() >= width); debug_assert!(uv_half.len() >= width); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::<16, 8>(full_range); @@ -2851,6 +3062,7 @@ pub(crate) unsafe fn p16_to_rgb_row( let cgv = _mm256_set1_epi32(coeffs.g_v()); let cbu = _mm256_set1_epi32(coeffs.b_u()); let cbv = _mm256_set1_epi32(coeffs.b_v()); + let alpha_u8 = _mm256_set1_epi8(-1); let mut x = 0usize; while x + 32 <= width { @@ -2907,19 +3119,24 @@ pub(crate) unsafe fn p16_to_rgb_row( let g_u8 = narrow_u8x32(g_lo, g_hi); let b_u8 = narrow_u8x32(b_lo, b_hi); - write_rgb_32(r_u8, g_u8, b_u8, rgb_out.as_mut_ptr().add(x * 3)); + if ALPHA { + write_rgba_32(r_u8, g_u8, b_u8, alpha_u8, out.as_mut_ptr().add(x * 4)); + } else { + write_rgb_32(r_u8, g_u8, b_u8, out.as_mut_ptr().add(x * 3)); + } x += 32; } if x < width { - scalar::p16_to_rgb_row( - &y[x..width], - &uv_half[x..width], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_uv = &uv_half[x..width]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::p16_to_rgba_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + } else { + scalar::p16_to_rgb_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + } } } } diff --git a/src/row/arch/x86_avx2/tests.rs b/src/row/arch/x86_avx2/tests.rs index fd3927a..b0cd2ef 100644 --- a/src/row/arch/x86_avx2/tests.rs +++ b/src/row/arch/x86_avx2/tests.rs @@ -1588,6 +1588,179 @@ fn avx2_p16_matches_scalar_1920() { check_p16_u16_avx2_equivalence(1920, ColorMatrix::Bt2020Ncl, false); } +// ---- High-bit 4:2:0 RGBA equivalence (Ship 8 Tranche 5a) ---------- +// +// RGBA wrappers share the math of their RGB siblings — only the store +// (and tail dispatch) branches on `ALPHA`. These tests pin that the +// SIMD RGBA path produces byte-identical output to the scalar RGBA +// reference. 
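+
+// For orientation: `planar_n_plane::<BITS>` (shared test helper) is a
+// seeded, deterministic plane generator whose samples stay inside the
+// `BITS`-bit range. The stand-in below is an illustrative sketch only;
+// its name and LCG constants are assumptions, not the crate's helper.
+#[allow(dead_code)]
+fn planar_n_plane_sketch<const BITS: u32>(n: usize, seed: usize) -> std::vec::Vec<u16> {
+    // Seeded LCG: reproducible across runs, no RNG dependency.
+    let mut state = seed as u64;
+    (0..n)
+        .map(|_| {
+            state = state
+                .wrapping_mul(6364136223846793005)
+                .wrapping_add(1442695040888963407);
+            // Mask to `BITS` so every sample is a legal code value
+            // (valid for BITS < 16).
+            ((state >> 33) as u16) & ((1u16 << BITS) - 1)
+        })
+        .collect()
+}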
+ +fn check_planar_u8_avx2_rgba_equivalence_n( + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let y = planar_n_plane::(width, 37); + let u = planar_n_plane::(width / 2, 53); + let v = planar_n_plane::(width / 2, 71); + let mut rgba_scalar = std::vec![0u8; width * 4]; + let mut rgba_simd = std::vec![0u8; width * 4]; + scalar::yuv_420p_n_to_rgba_row::(&y, &u, &v, &mut rgba_scalar, width, matrix, full_range); + unsafe { + yuv_420p_n_to_rgba_row::(&y, &u, &v, &mut rgba_simd, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_simd, + "AVX2 yuv_420p_n<{BITS}>→RGBA u8 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +fn check_pn_u8_avx2_rgba_equivalence_n( + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let y = p_n_packed_plane::(width, 37); + let u = p_n_packed_plane::(width / 2, 53); + let v = p_n_packed_plane::(width / 2, 71); + let uv = p010_uv_interleave(&u, &v); + let mut rgba_scalar = std::vec![0u8; width * 4]; + let mut rgba_simd = std::vec![0u8; width * 4]; + scalar::p_n_to_rgba_row::(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + unsafe { + p_n_to_rgba_row::(&y, &uv, &mut rgba_simd, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_simd, + "AVX2 Pn<{BITS}>→RGBA u8 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +fn check_yuv420p16_u8_avx2_rgba_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { + let y = p16_plane_avx2(width, 37); + let u = p16_plane_avx2(width / 2, 53); + let v = p16_plane_avx2(width / 2, 71); + let mut rgba_scalar = std::vec![0u8; width * 4]; + let mut rgba_simd = std::vec![0u8; width * 4]; + scalar::yuv_420p16_to_rgba_row(&y, &u, &v, &mut rgba_scalar, width, matrix, full_range); + unsafe { + yuv_420p16_to_rgba_row(&y, &u, &v, &mut rgba_simd, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_simd, + "AVX2 yuv_420p16→RGBA u8 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +fn check_p16_u8_avx2_rgba_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { + let y = p16_plane_avx2(width, 37); + let u = p16_plane_avx2(width / 2, 53); + let v = p16_plane_avx2(width / 2, 71); + let uv = p010_uv_interleave(&u, &v); + let mut rgba_scalar = std::vec![0u8; width * 4]; + let mut rgba_simd = std::vec![0u8; width * 4]; + scalar::p16_to_rgba_row(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + unsafe { + p16_to_rgba_row(&y, &uv, &mut rgba_simd, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_simd, + "AVX2 P016→RGBA u8 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +#[test] +fn avx2_yuv420p_n_rgba_matches_scalar_all_bits() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_planar_u8_avx2_rgba_equivalence_n::<9>(32, m, full); + check_planar_u8_avx2_rgba_equivalence_n::<10>(32, m, full); + check_planar_u8_avx2_rgba_equivalence_n::<12>(32, m, full); + check_planar_u8_avx2_rgba_equivalence_n::<14>(32, m, full); + } + } +} + +#[test] +fn avx2_yuv420p_n_rgba_matches_scalar_tail_and_1920() { + for w in [34usize, 48, 62, 1920, 1922] { + check_planar_u8_avx2_rgba_equivalence_n::<9>(w, ColorMatrix::Bt601, false); + check_planar_u8_avx2_rgba_equivalence_n::<10>(w, ColorMatrix::Bt709, true); + check_planar_u8_avx2_rgba_equivalence_n::<12>(w, ColorMatrix::Bt2020Ncl, false); + 
check_planar_u8_avx2_rgba_equivalence_n::<14>(w, ColorMatrix::YCgCo, true); + } +} + +#[test] +fn avx2_pn_rgba_matches_scalar_all_bits() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_pn_u8_avx2_rgba_equivalence_n::<10>(32, m, full); + check_pn_u8_avx2_rgba_equivalence_n::<12>(32, m, full); + } + } +} + +#[test] +fn avx2_pn_rgba_matches_scalar_tail_and_1920() { + for w in [34usize, 48, 62, 1920, 1922] { + check_pn_u8_avx2_rgba_equivalence_n::<10>(w, ColorMatrix::Bt601, false); + check_pn_u8_avx2_rgba_equivalence_n::<12>(w, ColorMatrix::Bt709, true); + } +} + +#[test] +fn avx2_yuv420p16_rgba_matches_scalar_all_matrices() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_yuv420p16_u8_avx2_rgba_equivalence(32, m, full); + } + } + for w in [34usize, 48, 62, 1920, 1922] { + check_yuv420p16_u8_avx2_rgba_equivalence(w, ColorMatrix::Bt709, false); + } +} + +#[test] +fn avx2_p016_rgba_matches_scalar_all_matrices() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_p16_u8_avx2_rgba_equivalence(32, m, full); + } + } + for w in [34usize, 48, 62, 1920, 1922] { + check_p16_u8_avx2_rgba_equivalence(w, ColorMatrix::Bt709, false); + } +} + // ---- Pn 4:4:4 (P410 / P412 / P416) AVX2 equivalence ----------------- fn high_bit_plane_avx2(n: usize, seed: usize) -> std::vec::Vec { diff --git a/src/row/arch/x86_avx512.rs b/src/row/arch/x86_avx512.rs index e0385c4..48e7cfa 100644 --- a/src/row/arch/x86_avx512.rs +++ b/src/row/arch/x86_avx512.rs @@ -307,6 +307,8 @@ unsafe fn yuv_420_to_rgb_or_rgba_row( /// 2. `width & 1 == 0`. /// 3. `y.len() >= width`, `u_half.len() >= width / 2`, /// `v_half.len() >= width / 2`, `rgb_out.len() >= 3 * width`. +/// +/// Thin wrapper over [`yuv_420p_n_to_rgb_or_rgba_row`] with `ALPHA = false`. #[inline] #[target_feature(enable = "avx512f,avx512bw")] pub(crate) unsafe fn yuv_420p_n_to_rgb_row( @@ -317,13 +319,65 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_row( width: usize, matrix: ColorMatrix, full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + yuv_420p_n_to_rgb_or_rgba_row::( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } +} + +/// AVX-512 high-bit-depth YUV 4:2:0 → packed **8-bit RGBA** (`R, G, B, 0xFF`). +/// +/// Thin wrapper over [`yuv_420p_n_to_rgb_or_rgba_row`] with `ALPHA = true`. +#[inline] +#[target_feature(enable = "avx512f,avx512bw")] +pub(crate) unsafe fn yuv_420p_n_to_rgba_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + yuv_420p_n_to_rgb_or_rgba_row::( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } +} + +/// Shared AVX-512 high-bit YUV 4:2:0 kernel. `ALPHA = false` uses +/// `write_rgb_64`; `ALPHA = true` uses `write_rgba_64` with constant +/// `0xFF` alpha. +/// +/// # Safety +/// +/// 1. **AVX-512F + AVX-512BW must be available.** +/// 2. `width & 1 == 0`. +/// 3. slices long enough + `out.len() >= width * if ALPHA { 4 } else { 3 }`. +/// 4. `BITS` ∈ `{9, 10, 12, 14}`. 
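+///
+/// ("Slices long enough" in item 3 means `y.len() >= width`,
+/// `u_half.len() >= width / 2`, `v_half.len() >= width / 2`,
+/// mirroring the kernel's `debug_assert!`s.)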
+#[inline] +#[target_feature(enable = "avx512f,avx512bw")] +pub(crate) unsafe fn yuv_420p_n_to_rgb_or_rgba_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, ) { const { assert!(BITS == 9 || BITS == 10 || BITS == 12 || BITS == 14) }; + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert_eq!(width & 1, 0); debug_assert!(y.len() >= width); debug_assert!(u_half.len() >= width / 2); debug_assert!(v_half.len() >= width / 2); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); @@ -344,6 +398,7 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_row( let cgv = _mm512_set1_epi32(coeffs.g_v()); let cbu = _mm512_set1_epi32(coeffs.b_u()); let cbv = _mm512_set1_epi32(coeffs.b_v()); + let alpha_u8 = _mm512_set1_epi8(-1); let pack_fixup = _mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7); let dup_lo_idx = _mm512_setr_epi64(0, 1, 8, 9, 2, 3, 10, 11); @@ -411,21 +466,30 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_row( let g_u8 = narrow_u8x64(g_lo, g_hi, pack_fixup); let r_u8 = narrow_u8x64(r_lo, r_hi, pack_fixup); - write_rgb_64(r_u8, g_u8, b_u8, rgb_out.as_mut_ptr().add(x * 3)); + if ALPHA { + write_rgba_64(r_u8, g_u8, b_u8, alpha_u8, out.as_mut_ptr().add(x * 4)); + } else { + write_rgb_64(r_u8, g_u8, b_u8, out.as_mut_ptr().add(x * 3)); + } x += 64; } if x < width { - scalar::yuv_420p_n_to_rgb_row::( - &y[x..width], - &u_half[x / 2..width / 2], - &v_half[x / 2..width / 2], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_u = &u_half[x / 2..width / 2]; + let tail_v = &v_half[x / 2..width / 2]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::yuv_420p_n_to_rgba_row::( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); + } else { + scalar::yuv_420p_n_to_rgb_row::( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); + } } } } @@ -1272,6 +1336,8 @@ pub(crate) unsafe fn yuv_444p16_to_rgb_u16_row( /// 2. `width & 1 == 0`. /// 3. `y.len() >= width`, `uv_half.len() >= width`, /// `rgb_out.len() >= 3 * width`. +/// +/// Thin wrapper over [`p_n_to_rgb_or_rgba_row`] with `ALPHA = false`. #[inline] #[target_feature(enable = "avx512f,avx512bw")] pub(crate) unsafe fn p_n_to_rgb_row( @@ -1282,10 +1348,57 @@ pub(crate) unsafe fn p_n_to_rgb_row( matrix: ColorMatrix, full_range: bool, ) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + p_n_to_rgb_or_rgba_row::(y, uv_half, rgb_out, width, matrix, full_range); + } +} + +/// AVX-512 high-bit-packed semi-planar 4:2:0 → packed **8-bit RGBA** +/// (`R, G, B, 0xFF`). +/// +/// Thin wrapper over [`p_n_to_rgb_or_rgba_row`] with `ALPHA = true`. +#[inline] +#[target_feature(enable = "avx512f,avx512bw")] +pub(crate) unsafe fn p_n_to_rgba_row( + y: &[u16], + uv_half: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + p_n_to_rgb_or_rgba_row::(y, uv_half, rgba_out, width, matrix, full_range); + } +} + +/// Shared AVX-512 P010/P012 kernel. `ALPHA = false` uses +/// `write_rgb_64`; `ALPHA = true` uses `write_rgba_64` with constant +/// `0xFF` alpha. +/// +/// # Safety +/// +/// 1. **AVX-512F + AVX-512BW must be available.** +/// 2. `width & 1 == 0`. 3. 
slices long enough + +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. 4. `BITS` ∈ `{10, 12}`. +#[inline] +#[target_feature(enable = "avx512f,avx512bw")] +pub(crate) unsafe fn p_n_to_rgb_or_rgba_row( + y: &[u16], + uv_half: &[u16], + out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + const { assert!(BITS == 10 || BITS == 12) }; + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert_eq!(width & 1, 0); debug_assert!(y.len() >= width); debug_assert!(uv_half.len() >= width); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); @@ -1307,6 +1420,7 @@ pub(crate) unsafe fn p_n_to_rgb_row( let cgv = _mm512_set1_epi32(coeffs.g_v()); let cbu = _mm512_set1_epi32(coeffs.b_u()); let cbv = _mm512_set1_epi32(coeffs.b_v()); + let alpha_u8 = _mm512_set1_epi8(-1); let pack_fixup = _mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7); let dup_lo_idx = _mm512_setr_epi64(0, 1, 8, 9, 2, 3, 10, 11); @@ -1368,20 +1482,25 @@ pub(crate) unsafe fn p_n_to_rgb_row( let g_u8 = narrow_u8x64(g_lo, g_hi, pack_fixup); let r_u8 = narrow_u8x64(r_lo, r_hi, pack_fixup); - write_rgb_64(r_u8, g_u8, b_u8, rgb_out.as_mut_ptr().add(x * 3)); + if ALPHA { + write_rgba_64(r_u8, g_u8, b_u8, alpha_u8, out.as_mut_ptr().add(x * 4)); + } else { + write_rgb_64(r_u8, g_u8, b_u8, out.as_mut_ptr().add(x * 3)); + } x += 64; } if x < width { - scalar::p_n_to_rgb_row::( - &y[x..width], - &uv_half[x..width], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_uv = &uv_half[x..width]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::p_n_to_rgba_row::(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + } else { + scalar::p_n_to_rgb_row::(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + } } } } @@ -2652,6 +2771,8 @@ fn scale_y_u16_avx512( /// 2. `width & 1 == 0`. /// 3. `y.len() >= width`, `u_half.len() >= width / 2`, /// `v_half.len() >= width / 2`, `rgb_out.len() >= 3 * width`. +/// +/// Thin wrapper over [`yuv_420p16_to_rgb_or_rgba_row`] with `ALPHA = false`. #[inline] #[target_feature(enable = "avx512f,avx512bw")] pub(crate) unsafe fn yuv_420p16_to_rgb_row( @@ -2663,11 +2784,52 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_row( matrix: ColorMatrix, full_range: bool, ) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + yuv_420p16_to_rgb_or_rgba_row::(y, u_half, v_half, rgb_out, width, matrix, full_range); + } +} + +/// AVX-512 16-bit YUV 4:2:0 → packed **8-bit RGBA** (`R, G, B, 0xFF`). +/// +/// Thin wrapper over [`yuv_420p16_to_rgb_or_rgba_row`] with `ALPHA = true`. +#[inline] +#[target_feature(enable = "avx512f,avx512bw")] +pub(crate) unsafe fn yuv_420p16_to_rgba_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + yuv_420p16_to_rgb_or_rgba_row::(y, u_half, v_half, rgba_out, width, matrix, full_range); + } +} + +/// Shared AVX-512 16-bit YUV 4:2:0 kernel. `ALPHA = false` uses +/// `write_rgb_64`; `ALPHA = true` uses `write_rgba_64` with constant +/// `0xFF` alpha. 
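+///
+/// # Safety
+///
+/// 1. **AVX-512F + AVX-512BW must be available.**
+/// 2. `width & 1 == 0`.
+/// 3. `y.len() >= width`, `u_half.len() >= width / 2`,
+///    `v_half.len() >= width / 2`,
+///    `out.len() >= width * if ALPHA { 4 } else { 3 }`.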
+#[inline] +#[target_feature(enable = "avx512f,avx512bw")] +pub(crate) unsafe fn yuv_420p16_to_rgb_or_rgba_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert_eq!(width & 1, 0); debug_assert!(y.len() >= width); debug_assert!(u_half.len() >= width / 2); debug_assert!(v_half.len() >= width / 2); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::<16, 8>(full_range); @@ -2685,6 +2847,7 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_row( let cgv = _mm512_set1_epi32(coeffs.g_v()); let cbu = _mm512_set1_epi32(coeffs.b_u()); let cbv = _mm512_set1_epi32(coeffs.b_v()); + let alpha_u8 = _mm512_set1_epi8(-1); let pack_fixup = _mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7); let dup_lo_idx = _mm512_setr_epi64(0, 1, 8, 9, 2, 3, 10, 11); let dup_hi_idx = _mm512_setr_epi64(4, 5, 12, 13, 6, 7, 14, 15); @@ -2743,20 +2906,27 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_row( let g_u8 = narrow_u8x64(g_lo, g_hi, pack_fixup); let b_u8 = narrow_u8x64(b_lo, b_hi, pack_fixup); - write_rgb_64(r_u8, g_u8, b_u8, rgb_out.as_mut_ptr().add(x * 3)); + if ALPHA { + write_rgba_64(r_u8, g_u8, b_u8, alpha_u8, out.as_mut_ptr().add(x * 4)); + } else { + write_rgb_64(r_u8, g_u8, b_u8, out.as_mut_ptr().add(x * 3)); + } x += 64; } if x < width { - scalar::yuv_420p16_to_rgb_row( - &y[x..width], - &u_half[x / 2..width / 2], - &v_half[x / 2..width / 2], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_u = &u_half[x / 2..width / 2]; + let tail_v = &v_half[x / 2..width / 2]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::yuv_420p16_to_rgba_row( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); + } else { + scalar::yuv_420p16_to_rgb_row(tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range); + } } } } @@ -2938,6 +3108,8 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_u16_row( /// 1. **AVX-512F + AVX-512BW must be available.** /// 2. `width & 1 == 0`. /// 3. `y.len() >= width`, `uv_half.len() >= width`, `rgb_out.len() >= 3 * width`. +/// +/// Thin wrapper over [`p16_to_rgb_or_rgba_row`] with `ALPHA = false`. #[inline] #[target_feature(enable = "avx512f,avx512bw")] pub(crate) unsafe fn p16_to_rgb_row( @@ -2948,10 +3120,48 @@ pub(crate) unsafe fn p16_to_rgb_row( matrix: ColorMatrix, full_range: bool, ) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + p16_to_rgb_or_rgba_row::(y, uv_half, rgb_out, width, matrix, full_range); + } +} + +/// AVX-512 P016 → packed **8-bit RGBA** (`R, G, B, 0xFF`). +/// +/// Thin wrapper over [`p16_to_rgb_or_rgba_row`] with `ALPHA = true`. +#[inline] +#[target_feature(enable = "avx512f,avx512bw")] +pub(crate) unsafe fn p16_to_rgba_row( + y: &[u16], + uv_half: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + p16_to_rgb_or_rgba_row::(y, uv_half, rgba_out, width, matrix, full_range); + } +} + +/// Shared AVX-512 P016 kernel. `ALPHA = false` uses `write_rgb_64`; +/// `ALPHA = true` uses `write_rgba_64` with constant `0xFF` alpha. 
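+///
+/// # Safety
+///
+/// 1. **AVX-512F + AVX-512BW must be available.**
+/// 2. `width & 1 == 0`.
+/// 3. `y.len() >= width`, `uv_half.len() >= width`,
+///    `out.len() >= width * if ALPHA { 4 } else { 3 }`.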
+#[inline] +#[target_feature(enable = "avx512f,avx512bw")] +pub(crate) unsafe fn p16_to_rgb_or_rgba_row( + y: &[u16], + uv_half: &[u16], + out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert_eq!(width & 1, 0); debug_assert!(y.len() >= width); debug_assert!(uv_half.len() >= width); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::<16, 8>(full_range); @@ -2969,6 +3179,7 @@ pub(crate) unsafe fn p16_to_rgb_row( let cgv = _mm512_set1_epi32(coeffs.g_v()); let cbu = _mm512_set1_epi32(coeffs.b_u()); let cbv = _mm512_set1_epi32(coeffs.b_v()); + let alpha_u8 = _mm512_set1_epi8(-1); let pack_fixup = _mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7); let dup_lo_idx = _mm512_setr_epi64(0, 1, 8, 9, 2, 3, 10, 11); let dup_hi_idx = _mm512_setr_epi64(4, 5, 12, 13, 6, 7, 14, 15); @@ -3026,19 +3237,24 @@ pub(crate) unsafe fn p16_to_rgb_row( let g_u8 = narrow_u8x64(g_lo, g_hi, pack_fixup); let b_u8 = narrow_u8x64(b_lo, b_hi, pack_fixup); - write_rgb_64(r_u8, g_u8, b_u8, rgb_out.as_mut_ptr().add(x * 3)); + if ALPHA { + write_rgba_64(r_u8, g_u8, b_u8, alpha_u8, out.as_mut_ptr().add(x * 4)); + } else { + write_rgb_64(r_u8, g_u8, b_u8, out.as_mut_ptr().add(x * 3)); + } x += 64; } if x < width { - scalar::p16_to_rgb_row( - &y[x..width], - &uv_half[x..width], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_uv = &uv_half[x..width]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::p16_to_rgba_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + } else { + scalar::p16_to_rgb_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + } } } } diff --git a/src/row/arch/x86_avx512/tests.rs b/src/row/arch/x86_avx512/tests.rs index 5ee3453..9afb142 100644 --- a/src/row/arch/x86_avx512/tests.rs +++ b/src/row/arch/x86_avx512/tests.rs @@ -1602,6 +1602,179 @@ fn avx512_p16_matches_scalar_1920() { check_p16_u16_avx512_equivalence(1920, ColorMatrix::Bt2020Ncl, false); } +// ---- High-bit 4:2:0 RGBA equivalence (Ship 8 Tranche 5a) ---------- +// +// RGBA wrappers share the math of their RGB siblings — only the store +// (and tail dispatch) branches on `ALPHA`. These tests pin that the +// SIMD RGBA path produces byte-identical output to the scalar RGBA +// reference. 
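+
+// Note that "branches on `ALPHA`" above is a compile-time branch: `ALPHA`
+// is a const generic, so each monomorphized kernel keeps exactly one
+// store arm. Simplified illustration of the pattern (a sketch only;
+// `store_pixel_sketch` is a made-up name, not a kernel API):
+#[allow(dead_code)]
+fn store_pixel_sketch<const ALPHA: bool>(out: &mut std::vec::Vec<u8>, r: u8, g: u8, b: u8) {
+    if ALPHA {
+        // RGBA: 4 bytes per pixel, constant opaque alpha.
+        out.extend_from_slice(&[r, g, b, 0xFF]);
+    } else {
+        // RGB: 3 bytes per pixel.
+        out.extend_from_slice(&[r, g, b]);
+    }
+}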
+ +fn check_planar_u8_avx512_rgba_equivalence_n( + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let y = planar_n_plane::(width, 37); + let u = planar_n_plane::(width / 2, 53); + let v = planar_n_plane::(width / 2, 71); + let mut rgba_scalar = std::vec![0u8; width * 4]; + let mut rgba_simd = std::vec![0u8; width * 4]; + scalar::yuv_420p_n_to_rgba_row::(&y, &u, &v, &mut rgba_scalar, width, matrix, full_range); + unsafe { + yuv_420p_n_to_rgba_row::(&y, &u, &v, &mut rgba_simd, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_simd, + "AVX-512 yuv_420p_n<{BITS}>→RGBA u8 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +fn check_pn_u8_avx512_rgba_equivalence_n( + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let y = p_n_packed_plane::(width, 37); + let u = p_n_packed_plane::(width / 2, 53); + let v = p_n_packed_plane::(width / 2, 71); + let uv = p010_uv_interleave(&u, &v); + let mut rgba_scalar = std::vec![0u8; width * 4]; + let mut rgba_simd = std::vec![0u8; width * 4]; + scalar::p_n_to_rgba_row::(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + unsafe { + p_n_to_rgba_row::(&y, &uv, &mut rgba_simd, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_simd, + "AVX-512 Pn<{BITS}>→RGBA u8 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +fn check_yuv420p16_u8_avx512_rgba_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { + let y = p16_plane_avx512(width, 37); + let u = p16_plane_avx512(width / 2, 53); + let v = p16_plane_avx512(width / 2, 71); + let mut rgba_scalar = std::vec![0u8; width * 4]; + let mut rgba_simd = std::vec![0u8; width * 4]; + scalar::yuv_420p16_to_rgba_row(&y, &u, &v, &mut rgba_scalar, width, matrix, full_range); + unsafe { + yuv_420p16_to_rgba_row(&y, &u, &v, &mut rgba_simd, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_simd, + "AVX-512 yuv_420p16→RGBA u8 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +fn check_p16_u8_avx512_rgba_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { + let y = p16_plane_avx512(width, 37); + let u = p16_plane_avx512(width / 2, 53); + let v = p16_plane_avx512(width / 2, 71); + let uv = p010_uv_interleave(&u, &v); + let mut rgba_scalar = std::vec![0u8; width * 4]; + let mut rgba_simd = std::vec![0u8; width * 4]; + scalar::p16_to_rgba_row(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + unsafe { + p16_to_rgba_row(&y, &uv, &mut rgba_simd, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_simd, + "AVX-512 P016→RGBA u8 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +#[test] +fn avx512_yuv420p_n_rgba_matches_scalar_all_bits() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_planar_u8_avx512_rgba_equivalence_n::<9>(64, m, full); + check_planar_u8_avx512_rgba_equivalence_n::<10>(64, m, full); + check_planar_u8_avx512_rgba_equivalence_n::<12>(64, m, full); + check_planar_u8_avx512_rgba_equivalence_n::<14>(64, m, full); + } + } +} + +#[test] +fn avx512_yuv420p_n_rgba_matches_scalar_tail_and_1920() { + for w in [66usize, 96, 126, 1920, 1922] { + check_planar_u8_avx512_rgba_equivalence_n::<9>(w, ColorMatrix::Bt601, false); + check_planar_u8_avx512_rgba_equivalence_n::<10>(w, ColorMatrix::Bt709, true); + 
check_planar_u8_avx512_rgba_equivalence_n::<12>(w, ColorMatrix::Bt2020Ncl, false); + check_planar_u8_avx512_rgba_equivalence_n::<14>(w, ColorMatrix::YCgCo, true); + } +} + +#[test] +fn avx512_pn_rgba_matches_scalar_all_bits() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_pn_u8_avx512_rgba_equivalence_n::<10>(64, m, full); + check_pn_u8_avx512_rgba_equivalence_n::<12>(64, m, full); + } + } +} + +#[test] +fn avx512_pn_rgba_matches_scalar_tail_and_1920() { + for w in [66usize, 96, 126, 1920, 1922] { + check_pn_u8_avx512_rgba_equivalence_n::<10>(w, ColorMatrix::Bt601, false); + check_pn_u8_avx512_rgba_equivalence_n::<12>(w, ColorMatrix::Bt709, true); + } +} + +#[test] +fn avx512_yuv420p16_rgba_matches_scalar_all_matrices() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_yuv420p16_u8_avx512_rgba_equivalence(64, m, full); + } + } + for w in [66usize, 96, 126, 1920, 1922] { + check_yuv420p16_u8_avx512_rgba_equivalence(w, ColorMatrix::Bt709, false); + } +} + +#[test] +fn avx512_p016_rgba_matches_scalar_all_matrices() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_p16_u8_avx512_rgba_equivalence(64, m, full); + } + } + for w in [66usize, 96, 126, 1920, 1922] { + check_p16_u8_avx512_rgba_equivalence(w, ColorMatrix::Bt709, false); + } +} + // ---- Pn 4:4:4 (P410 / P412 / P416) AVX-512 equivalence ------------- fn high_bit_plane_avx512(n: usize, seed: usize) -> std::vec::Vec { diff --git a/src/row/arch/x86_sse41/tests.rs b/src/row/arch/x86_sse41/tests.rs index 513341c..239b943 100644 --- a/src/row/arch/x86_sse41/tests.rs +++ b/src/row/arch/x86_sse41/tests.rs @@ -1624,6 +1624,179 @@ fn sse41_p16_matches_scalar_1920() { check_p16_u16_sse41_equivalence(1920, ColorMatrix::Bt2020Ncl, false); } +// ---- High-bit 4:2:0 RGBA equivalence (Ship 8 Tranche 5a) ---------- +// +// RGBA wrappers share the math of their RGB siblings — only the store +// (and tail dispatch) branches on `ALPHA`. These tests pin that the +// SIMD RGBA path produces byte-identical output to the scalar RGBA +// reference. 
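+
+// `p010_uv_interleave` (shared test helper) zips the two half-width
+// chroma planes into the semi-planar `U0, V0, U1, V1, ...` order the
+// P010/P012/P016 kernels consume. Illustrative sketch under that
+// assumption; the crate's actual helper may differ:
+#[allow(dead_code)]
+fn p010_uv_interleave_sketch(u: &[u16], v: &[u16]) -> std::vec::Vec<u16> {
+    assert_eq!(u.len(), v.len(), "U and V half-planes must match");
+    // One U sample followed by its paired V sample.
+    u.iter().zip(v.iter()).flat_map(|(&cu, &cv)| [cu, cv]).collect()
+}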
+ +fn check_planar_u8_sse41_rgba_equivalence_n( + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let y = planar_n_plane::(width, 37); + let u = planar_n_plane::(width / 2, 53); + let v = planar_n_plane::(width / 2, 71); + let mut rgba_scalar = std::vec![0u8; width * 4]; + let mut rgba_simd = std::vec![0u8; width * 4]; + scalar::yuv_420p_n_to_rgba_row::(&y, &u, &v, &mut rgba_scalar, width, matrix, full_range); + unsafe { + yuv_420p_n_to_rgba_row::(&y, &u, &v, &mut rgba_simd, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_simd, + "SSE4.1 yuv_420p_n<{BITS}>→RGBA u8 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +fn check_pn_u8_sse41_rgba_equivalence_n( + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let y = p_n_packed_plane::(width, 37); + let u = p_n_packed_plane::(width / 2, 53); + let v = p_n_packed_plane::(width / 2, 71); + let uv = p010_uv_interleave(&u, &v); + let mut rgba_scalar = std::vec![0u8; width * 4]; + let mut rgba_simd = std::vec![0u8; width * 4]; + scalar::p_n_to_rgba_row::(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + unsafe { + p_n_to_rgba_row::(&y, &uv, &mut rgba_simd, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_simd, + "SSE4.1 Pn<{BITS}>→RGBA u8 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +fn check_yuv420p16_u8_sse41_rgba_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { + let y = p16_plane(width, 37); + let u = p16_plane(width / 2, 53); + let v = p16_plane(width / 2, 71); + let mut rgba_scalar = std::vec![0u8; width * 4]; + let mut rgba_simd = std::vec![0u8; width * 4]; + scalar::yuv_420p16_to_rgba_row(&y, &u, &v, &mut rgba_scalar, width, matrix, full_range); + unsafe { + yuv_420p16_to_rgba_row(&y, &u, &v, &mut rgba_simd, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_simd, + "SSE4.1 yuv_420p16→RGBA u8 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +fn check_p16_u8_sse41_rgba_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { + let y = p16_plane(width, 37); + let u = p16_plane(width / 2, 53); + let v = p16_plane(width / 2, 71); + let uv = p010_uv_interleave(&u, &v); + let mut rgba_scalar = std::vec![0u8; width * 4]; + let mut rgba_simd = std::vec![0u8; width * 4]; + scalar::p16_to_rgba_row(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + unsafe { + p16_to_rgba_row(&y, &uv, &mut rgba_simd, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_simd, + "SSE4.1 P016→RGBA u8 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +#[test] +fn sse41_yuv420p_n_rgba_matches_scalar_all_bits() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_planar_u8_sse41_rgba_equivalence_n::<9>(16, m, full); + check_planar_u8_sse41_rgba_equivalence_n::<10>(16, m, full); + check_planar_u8_sse41_rgba_equivalence_n::<12>(16, m, full); + check_planar_u8_sse41_rgba_equivalence_n::<14>(16, m, full); + } + } +} + +#[test] +fn sse41_yuv420p_n_rgba_matches_scalar_tail_and_1920() { + for w in [18usize, 30, 34, 1920, 1922] { + check_planar_u8_sse41_rgba_equivalence_n::<9>(w, ColorMatrix::Bt601, false); + check_planar_u8_sse41_rgba_equivalence_n::<10>(w, ColorMatrix::Bt709, true); + check_planar_u8_sse41_rgba_equivalence_n::<12>(w, ColorMatrix::Bt2020Ncl, false); + 
check_planar_u8_sse41_rgba_equivalence_n::<14>(w, ColorMatrix::YCgCo, true); + } +} + +#[test] +fn sse41_pn_rgba_matches_scalar_all_bits() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_pn_u8_sse41_rgba_equivalence_n::<10>(16, m, full); + check_pn_u8_sse41_rgba_equivalence_n::<12>(16, m, full); + } + } +} + +#[test] +fn sse41_pn_rgba_matches_scalar_tail_and_1920() { + for w in [18usize, 30, 34, 1920, 1922] { + check_pn_u8_sse41_rgba_equivalence_n::<10>(w, ColorMatrix::Bt601, false); + check_pn_u8_sse41_rgba_equivalence_n::<12>(w, ColorMatrix::Bt709, true); + } +} + +#[test] +fn sse41_yuv420p16_rgba_matches_scalar_all_matrices() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_yuv420p16_u8_sse41_rgba_equivalence(16, m, full); + } + } + for w in [18usize, 30, 34, 1920, 1922] { + check_yuv420p16_u8_sse41_rgba_equivalence(w, ColorMatrix::Bt709, false); + } +} + +#[test] +fn sse41_p016_rgba_matches_scalar_all_matrices() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_p16_u8_sse41_rgba_equivalence(16, m, full); + } + } + for w in [18usize, 30, 34, 1920, 1922] { + check_p16_u8_sse41_rgba_equivalence(w, ColorMatrix::Bt709, false); + } +} + // ---- Pn 4:4:4 (P410 / P412 / P416) SSE4.1 equivalence --------------- fn high_bit_plane_sse41(n: usize, seed: usize) -> std::vec::Vec { diff --git a/src/row/mod.rs b/src/row/mod.rs index 15f8c10..d67ce1b 100644 --- a/src/row/mod.rs +++ b/src/row/mod.rs @@ -2659,12 +2659,12 @@ pub fn p016_to_rgb_u16_row( scalar::p16_to_rgb_u16_row(y, uv_half, rgb_out, width, matrix, full_range); } -// ---- High-bit 4:2:0 RGBA dispatchers (Ship 8 Tranche 5 prep) ---------- +// ---- High-bit 4:2:0 RGBA dispatchers (Ship 8 Tranche 5) --------------- // -// Scalar prep: dispatchers route through the new RGBA scalar kernels -// (`scalar::*_to_rgba*_row`). The `use_simd` parameter is held in the -// signature so the follow-up SIMD/backend PRs (Ship 8 Tranche 5a/5b) -// can fill in per-arch branches without breaking callers. +// u8 RGBA dispatchers route to per-arch SIMD kernels (Ship 8 Tranche +// 5a). u16 RGBA dispatchers stay scalar-only until the follow-up Ship +// 8 Tranche 5b PR adds u16 SIMD; their `use_simd` parameter is held in +// the signature so wiring 5b is a non-breaking change. /// Converts one row of **9-bit** YUV 4:2:0 to packed **8-bit** /// **RGBA** (`R, G, B, 0xFF`; alpha defaults to opaque since the @@ -2674,9 +2674,7 @@ pub fn p016_to_rgb_u16_row( /// for the per-pixel stride (4 vs 3) and the constant alpha byte. See /// `scalar::yuv_420p_n_to_rgba_row` for the reference. /// -/// `use_simd = false` forces scalar. SIMD per-arch routes land in the -/// follow-up Ship 8 Tranche 5a PR — for now this dispatcher always -/// runs the scalar reference regardless of `use_simd`. +/// `use_simd = false` forces the scalar reference path. 
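+///
+/// Illustrative caller sketch (argument order assumed to mirror the
+/// scalar kernel with the `use_simd` flag last; marked `ignore` because
+/// it is a sketch, not a doctest of the exact signature):
+///
+/// ```ignore
+/// let width = 16usize;
+/// let y = vec![256u16; width]; // 9-bit mid-gray luma
+/// let (u, v) = (vec![256u16; width / 2], vec![256u16; width / 2]); // neutral chroma
+/// let mut rgba = vec![0u8; width * 4];
+/// yuv420p9_to_rgba_row(&y, &u, &v, &mut rgba, width, ColorMatrix::Bt709, false, true);
+/// ```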
#[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] pub fn yuv420p9_to_rgba_row( @@ -2696,7 +2694,62 @@ pub fn yuv420p9_to_rgba_row( assert!(v_half.len() >= width / 2, "v_half row too short"); assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - let _ = use_simd; // SIMD per-arch routes land in Ship 8 Tranche 5a. + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified on this CPU; bounds / parity are + // the caller's obligation (asserted above). + unsafe { + arch::neon::yuv_420p_n_to_rgba_row::<9>(y, u_half, v_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_420p_n_to_rgba_row::<9>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_420p_n_to_rgba_row::<9>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_420p_n_to_rgba_row::<9>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_420p_n_to_rgba_row::<9>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + scalar::yuv_420p_n_to_rgba_row::<9>(y, u_half, v_half, rgba_out, width, matrix, full_range); } @@ -2740,9 +2793,7 @@ pub fn yuv420p9_to_rgba_u16_row( /// for the per-pixel stride (4 vs 3) and the constant alpha byte. See /// `scalar::yuv_420p_n_to_rgba_row` for the reference. /// -/// `use_simd = false` forces scalar. SIMD per-arch routes land in the -/// follow-up Ship 8 Tranche 5a PR — for now this dispatcher always -/// runs the scalar reference regardless of `use_simd`. +/// `use_simd = false` forces the scalar reference path. #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] pub fn yuv420p10_to_rgba_row( @@ -2762,7 +2813,61 @@ pub fn yuv420p10_to_rgba_row( assert!(v_half.len() >= width / 2, "v_half row too short"); assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - let _ = use_simd; // SIMD per-arch routes land in Ship 8 Tranche 5a. + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_420p_n_to_rgba_row::<10>(y, u_half, v_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_420p_n_to_rgba_row::<10>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_420p_n_to_rgba_row::<10>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_420p_n_to_rgba_row::<10>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. 
+ unsafe { + arch::wasm_simd128::yuv_420p_n_to_rgba_row::<10>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + scalar::yuv_420p_n_to_rgba_row::<10>(y, u_half, v_half, rgba_out, width, matrix, full_range); } @@ -2802,10 +2907,8 @@ pub fn yuv420p10_to_rgba_u16_row( /// high-bit-packed) to packed **8-bit** **RGBA**. Alpha defaults to /// `0xFF` (opaque). /// -/// See `scalar::p_n_to_rgba_row::<10>` for the reference. SIMD -/// per-arch routes land in the follow-up Ship 8 Tranche 5a PR — for -/// now this dispatcher always runs the scalar reference regardless of -/// `use_simd`. +/// See `scalar::p_n_to_rgba_row::<10>` for the reference. +/// `use_simd = false` forces the scalar reference path. #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] pub fn p010_to_rgba_row( @@ -2823,7 +2926,53 @@ pub fn p010_to_rgba_row( assert!(uv_half.len() >= width, "uv_half row too short"); assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - let _ = use_simd; // SIMD per-arch routes land in Ship 8 Tranche 5a. + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::p_n_to_rgba_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::p_n_to_rgba_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::p_n_to_rgba_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::p_n_to_rgba_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::p_n_to_rgba_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + scalar::p_n_to_rgba_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); } @@ -2864,9 +3013,7 @@ pub fn p010_to_rgba_u16_row( /// for the per-pixel stride (4 vs 3) and the constant alpha byte. See /// `scalar::yuv_420p_n_to_rgba_row` for the reference. /// -/// `use_simd = false` forces scalar. SIMD per-arch routes land in the -/// follow-up Ship 8 Tranche 5a PR — for now this dispatcher always -/// runs the scalar reference regardless of `use_simd`. +/// `use_simd = false` forces the scalar reference path. #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] pub fn yuv420p12_to_rgba_row( @@ -2886,7 +3033,61 @@ pub fn yuv420p12_to_rgba_row( assert!(v_half.len() >= width / 2, "v_half row too short"); assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - let _ = use_simd; // SIMD per-arch routes land in Ship 8 Tranche 5a. + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_420p_n_to_rgba_row::<12>(y, u_half, v_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. 
+ unsafe { + arch::x86_avx512::yuv_420p_n_to_rgba_row::<12>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_420p_n_to_rgba_row::<12>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_420p_n_to_rgba_row::<12>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_420p_n_to_rgba_row::<12>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + scalar::yuv_420p_n_to_rgba_row::<12>(y, u_half, v_half, rgba_out, width, matrix, full_range); } @@ -2930,9 +3131,7 @@ pub fn yuv420p12_to_rgba_u16_row( /// for the per-pixel stride (4 vs 3) and the constant alpha byte. See /// `scalar::yuv_420p_n_to_rgba_row` for the reference. /// -/// `use_simd = false` forces scalar. SIMD per-arch routes land in the -/// follow-up Ship 8 Tranche 5a PR — for now this dispatcher always -/// runs the scalar reference regardless of `use_simd`. +/// `use_simd = false` forces the scalar reference path. #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] pub fn yuv420p14_to_rgba_row( @@ -2952,7 +3151,61 @@ pub fn yuv420p14_to_rgba_row( assert!(v_half.len() >= width / 2, "v_half row too short"); assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - let _ = use_simd; // SIMD per-arch routes land in Ship 8 Tranche 5a. + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_420p_n_to_rgba_row::<14>(y, u_half, v_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_420p_n_to_rgba_row::<14>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_420p_n_to_rgba_row::<14>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_420p_n_to_rgba_row::<14>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_420p_n_to_rgba_row::<14>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + scalar::yuv_420p_n_to_rgba_row::<14>(y, u_half, v_half, rgba_out, width, matrix, full_range); } @@ -2992,10 +3245,8 @@ pub fn yuv420p14_to_rgba_u16_row( /// high-bit-packed) to packed **8-bit** **RGBA**. Alpha defaults to /// `0xFF` (opaque). /// -/// See `scalar::p_n_to_rgba_row::<12>` for the reference. SIMD -/// per-arch routes land in the follow-up Ship 8 Tranche 5a PR — for -/// now this dispatcher always runs the scalar reference regardless of -/// `use_simd`. +/// See `scalar::p_n_to_rgba_row::<12>` for the reference. +/// `use_simd = false` forces the scalar reference path. 
#[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] pub fn p012_to_rgba_row( @@ -3013,7 +3264,53 @@ pub fn p012_to_rgba_row( assert!(uv_half.len() >= width, "uv_half row too short"); assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - let _ = use_simd; // SIMD per-arch routes land in Ship 8 Tranche 5a. + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::p_n_to_rgba_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::p_n_to_rgba_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::p_n_to_rgba_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::p_n_to_rgba_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::p_n_to_rgba_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + scalar::p_n_to_rgba_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); } @@ -3051,8 +3348,8 @@ pub fn p012_to_rgba_u16_row( /// /// Routes through the dedicated 16-bit scalar kernel /// (`scalar::yuv_420p16_to_rgba_row`) — i32 chroma family is sufficient -/// for u8 output even at 16-bit input. SIMD per-arch routes land in -/// the follow-up Ship 8 Tranche 5a PR. +/// for u8 output even at 16-bit input. `use_simd = false` forces the +/// scalar reference path. #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] pub fn yuv420p16_to_rgba_row( @@ -3072,7 +3369,48 @@ pub fn yuv420p16_to_rgba_row( assert!(v_half.len() >= width / 2, "v_half row too short"); assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - let _ = use_simd; // SIMD per-arch routes land in Ship 8 Tranche 5a. + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + unsafe { + arch::neon::yuv_420p16_to_rgba_row(y, u_half, v_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + unsafe { + arch::x86_avx512::yuv_420p16_to_rgba_row(y, u_half, v_half, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + unsafe { + arch::x86_avx2::yuv_420p16_to_rgba_row(y, u_half, v_half, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + unsafe { + arch::x86_sse41::yuv_420p16_to_rgba_row(y, u_half, v_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + unsafe { + arch::wasm_simd128::yuv_420p16_to_rgba_row(y, u_half, v_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + scalar::yuv_420p16_to_rgba_row(y, u_half, v_half, rgba_out, width, matrix, full_range); } @@ -3111,8 +3449,8 @@ pub fn yuv420p16_to_rgba_u16_row( /// samples) to packed **8-bit** **RGBA**. Alpha defaults to `0xFF`. /// /// Routes through the dedicated 16-bit P016 scalar kernel -/// (`scalar::p16_to_rgba_row`). SIMD per-arch routes land in the -/// follow-up Ship 8 Tranche 5a PR. 
 #[cfg_attr(not(tarpaulin), inline(always))]
 #[allow(clippy::too_many_arguments)]
 pub fn yuv420p16_to_rgba_row(
@@ -3072,7 +3369,48 @@ pub fn yuv420p16_to_rgba_row(
     assert!(v_half.len() >= width / 2, "v_half row too short");
     assert!(rgba_out.len() >= rgba_min, "rgba_out row too short");
 
-    let _ = use_simd; // SIMD per-arch routes land in Ship 8 Tranche 5a.
+    if use_simd {
+        cfg_select! {
+            target_arch = "aarch64" => {
+                if neon_available() {
+                    // SAFETY: NEON verified.
+                    unsafe {
+                        arch::neon::yuv_420p16_to_rgba_row(y, u_half, v_half, rgba_out, width, matrix, full_range);
+                    }
+                    return;
+                }
+            },
+            target_arch = "x86_64" => {
+                if avx512_available() {
+                    // SAFETY: AVX-512BW verified.
+                    unsafe {
+                        arch::x86_avx512::yuv_420p16_to_rgba_row(y, u_half, v_half, rgba_out, width, matrix, full_range);
+                    }
+                    return;
+                }
+                if avx2_available() {
+                    // SAFETY: AVX2 verified.
+                    unsafe {
+                        arch::x86_avx2::yuv_420p16_to_rgba_row(y, u_half, v_half, rgba_out, width, matrix, full_range);
+                    }
+                    return;
+                }
+                if sse41_available() {
+                    // SAFETY: SSE4.1 verified.
+                    unsafe {
+                        arch::x86_sse41::yuv_420p16_to_rgba_row(y, u_half, v_half, rgba_out, width, matrix, full_range);
+                    }
+                    return;
+                }
+            },
+            target_arch = "wasm32" => {
+                if simd128_available() {
+                    // SAFETY: simd128 compile-time verified.
+                    unsafe {
+                        arch::wasm_simd128::yuv_420p16_to_rgba_row(y, u_half, v_half, rgba_out, width, matrix, full_range);
+                    }
+                    return;
+                }
+            },
+            _ => {}
+        }
+    }
     scalar::yuv_420p16_to_rgba_row(y, u_half, v_half, rgba_out, width, matrix, full_range);
 }
@@ -3111,8 +3449,8 @@ pub fn yuv420p16_to_rgba_u16_row(
 /// samples) to packed **8-bit** **RGBA**. Alpha defaults to `0xFF`.
 ///
 /// Routes through the dedicated 16-bit P016 scalar kernel
-/// (`scalar::p16_to_rgba_row`). SIMD per-arch routes land in the
-/// follow-up Ship 8 Tranche 5a PR.
+/// (`scalar::p16_to_rgba_row`). `use_simd = false` forces the scalar
+/// reference path.
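+///
+/// # Example
+///
+/// A minimal sketch (illustrative mid-range P016 samples; the
+/// assertion is limited to the constant alpha byte, and parameter
+/// order follows the scalar call, with `use_simd` last):
+///
+/// ```ignore
+/// let y = [32768u16, 32768]; // width = 2
+/// let uv = [32768u16, 32768]; // interleaved U0, V0
+/// let mut rgba = [0u8; 2 * 4];
+/// p016_to_rgba_row(&y, &uv, &mut rgba, 2, ColorMatrix::Bt601, false, true);
+/// assert!(rgba.chunks_exact(4).all(|px| px[3] == 0xFF));
+/// ```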
 #[cfg_attr(not(tarpaulin), inline(always))]
 #[allow(clippy::too_many_arguments)]
 pub fn p016_to_rgba_row(
@@ -3130,7 +3468,48 @@ pub fn p016_to_rgba_row(
     assert!(uv_half.len() >= width, "uv_half row too short");
     assert!(rgba_out.len() >= rgba_min, "rgba_out row too short");
 
-    let _ = use_simd; // SIMD per-arch routes land in Ship 8 Tranche 5a.
+    if use_simd {
+        cfg_select! {
+            target_arch = "aarch64" => {
+                if neon_available() {
+                    // SAFETY: NEON verified.
+                    unsafe {
+                        arch::neon::p16_to_rgba_row(y, uv_half, rgba_out, width, matrix, full_range);
+                    }
+                    return;
+                }
+            },
+            target_arch = "x86_64" => {
+                if avx512_available() {
+                    // SAFETY: AVX-512BW verified.
+                    unsafe {
+                        arch::x86_avx512::p16_to_rgba_row(y, uv_half, rgba_out, width, matrix, full_range);
+                    }
+                    return;
+                }
+                if avx2_available() {
+                    // SAFETY: AVX2 verified.
+                    unsafe {
+                        arch::x86_avx2::p16_to_rgba_row(y, uv_half, rgba_out, width, matrix, full_range);
+                    }
+                    return;
+                }
+                if sse41_available() {
+                    // SAFETY: SSE4.1 verified.
+                    unsafe {
+                        arch::x86_sse41::p16_to_rgba_row(y, uv_half, rgba_out, width, matrix, full_range);
+                    }
+                    return;
+                }
+            },
+            target_arch = "wasm32" => {
+                if simd128_available() {
+                    // SAFETY: simd128 compile-time verified.
+                    unsafe {
+                        arch::wasm_simd128::p16_to_rgba_row(y, uv_half, rgba_out, width, matrix, full_range);
+                    }
+                    return;
+                }
+            },
+            _ => {}
+        }
+    }
     scalar::p16_to_rgba_row(y, uv_half, rgba_out, width, matrix, full_range);
 }

From 00c73d62a6c7514a3ed46394706e5d0dd54deaf4 Mon Sep 17 00:00:00 2001
From: uqio <276879906+uqio@users.noreply.github.com>
Date: Mon, 27 Apr 2026 00:06:34 +1200
Subject: [PATCH 5/5] update

---
 src/row/arch/x86_avx2/tests.rs   | 18 ++++++++++++++++++
 src/row/arch/x86_avx512/tests.rs | 18 ++++++++++++++++++
 src/row/arch/x86_sse41/tests.rs  | 18 ++++++++++++++++++
 3 files changed, 54 insertions(+)

diff --git a/src/row/arch/x86_avx2/tests.rs b/src/row/arch/x86_avx2/tests.rs
index b0cd2ef..f8f28eb 100644
--- a/src/row/arch/x86_avx2/tests.rs
+++ b/src/row/arch/x86_avx2/tests.rs
@@ -1671,6 +1671,9 @@ fn check_p16_u8_avx2_rgba_equivalence(width: usize, matrix: ColorMatrix, full_ra
 
 #[test]
 fn avx2_yuv420p_n_rgba_matches_scalar_all_bits() {
+    if !std::arch::is_x86_feature_detected!("avx2") {
+        return;
+    }
     for m in [
         ColorMatrix::Bt601,
         ColorMatrix::Bt709,
@@ -1690,6 +1693,9 @@ fn avx2_yuv420p_n_rgba_matches_scalar_all_bits() {
 
 #[test]
 fn avx2_yuv420p_n_rgba_matches_scalar_tail_and_1920() {
+    if !std::arch::is_x86_feature_detected!("avx2") {
+        return;
+    }
     for w in [34usize, 48, 62, 1920, 1922] {
         check_planar_u8_avx2_rgba_equivalence_n::<9>(w, ColorMatrix::Bt601, false);
         check_planar_u8_avx2_rgba_equivalence_n::<10>(w, ColorMatrix::Bt709, true);
@@ -1700,6 +1706,9 @@ fn avx2_yuv420p_n_rgba_matches_scalar_tail_and_1920() {
 
 #[test]
 fn avx2_pn_rgba_matches_scalar_all_bits() {
+    if !std::arch::is_x86_feature_detected!("avx2") {
+        return;
+    }
     for m in [
         ColorMatrix::Bt601,
         ColorMatrix::Bt709,
@@ -1717,6 +1726,9 @@ fn avx2_pn_rgba_matches_scalar_all_bits() {
 
 #[test]
 fn avx2_pn_rgba_matches_scalar_tail_and_1920() {
+    if !std::arch::is_x86_feature_detected!("avx2") {
+        return;
+    }
     for w in [34usize, 48, 62, 1920, 1922] {
         check_pn_u8_avx2_rgba_equivalence_n::<10>(w, ColorMatrix::Bt601, false);
         check_pn_u8_avx2_rgba_equivalence_n::<12>(w, ColorMatrix::Bt709, true);
@@ -1725,6 +1737,9 @@ fn avx2_pn_rgba_matches_scalar_tail_and_1920() {
 
 #[test]
 fn avx2_yuv420p16_rgba_matches_scalar_all_matrices() {
+    if !std::arch::is_x86_feature_detected!("avx2") {
+        return;
+    }
     for m in [
         ColorMatrix::Bt601,
         ColorMatrix::Bt709,
@@ -1744,6 +1759,9 @@ fn avx2_yuv420p16_rgba_matches_scalar_all_matrices() {
 
 #[test]
 fn avx2_p016_rgba_matches_scalar_all_matrices() {
+    if !std::arch::is_x86_feature_detected!("avx2") {
+        return;
+    }
     for m in [
         ColorMatrix::Bt601,
         ColorMatrix::Bt709,
diff --git a/src/row/arch/x86_avx512/tests.rs b/src/row/arch/x86_avx512/tests.rs
index 9afb142..2591060 100644
--- a/src/row/arch/x86_avx512/tests.rs
+++ b/src/row/arch/x86_avx512/tests.rs
@@ -1685,6 +1685,9 @@ fn check_p16_u8_avx512_rgba_equivalence(width: usize, matrix: ColorMatrix, full_
 
 #[test]
 fn avx512_yuv420p_n_rgba_matches_scalar_all_bits() {
+    if !std::arch::is_x86_feature_detected!("avx512bw") {
+        return;
+    }
     for m in [
         ColorMatrix::Bt601,
         ColorMatrix::Bt709,
@@ -1704,6 +1707,9 @@ fn avx512_yuv420p_n_rgba_matches_scalar_all_bits() {
 
 #[test]
 fn avx512_yuv420p_n_rgba_matches_scalar_tail_and_1920() {
+    if !std::arch::is_x86_feature_detected!("avx512bw") {
+        return;
+    }
     for w in [66usize, 96, 126, 1920, 1922] {
         check_planar_u8_avx512_rgba_equivalence_n::<9>(w, ColorMatrix::Bt601, false);
         check_planar_u8_avx512_rgba_equivalence_n::<10>(w, ColorMatrix::Bt709, true);
@@ -1714,6 +1720,9 @@ fn avx512_yuv420p_n_rgba_matches_scalar_tail_and_1920() {
 
 #[test]
 fn avx512_pn_rgba_matches_scalar_all_bits() {
+    if !std::arch::is_x86_feature_detected!("avx512bw") {
+        return;
+    }
     for m in [
         ColorMatrix::Bt601,
         ColorMatrix::Bt709,
@@ -1731,6 +1740,9 @@ fn avx512_pn_rgba_matches_scalar_all_bits() {
 
 #[test]
 fn avx512_pn_rgba_matches_scalar_tail_and_1920() {
+    if !std::arch::is_x86_feature_detected!("avx512bw") {
+        return;
+    }
     for w in [66usize, 96, 126, 1920, 1922] {
         check_pn_u8_avx512_rgba_equivalence_n::<10>(w, ColorMatrix::Bt601, false);
         check_pn_u8_avx512_rgba_equivalence_n::<12>(w, ColorMatrix::Bt709, true);
@@ -1739,6 +1751,9 @@ fn avx512_pn_rgba_matches_scalar_tail_and_1920() {
 
 #[test]
 fn avx512_yuv420p16_rgba_matches_scalar_all_matrices() {
+    if !std::arch::is_x86_feature_detected!("avx512bw") {
+        return;
+    }
     for m in [
         ColorMatrix::Bt601,
         ColorMatrix::Bt709,
@@ -1758,6 +1773,9 @@ fn avx512_yuv420p16_rgba_matches_scalar_all_matrices() {
 
 #[test]
 fn avx512_p016_rgba_matches_scalar_all_matrices() {
+    if !std::arch::is_x86_feature_detected!("avx512bw") {
+        return;
+    }
     for m in [
         ColorMatrix::Bt601,
         ColorMatrix::Bt709,
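The same three-line guard now opens every gated test in all three files. A possible follow-up, sketched here only (`require_x86_feature!` is a hypothetical name, not part of this patch), would factor the repetition into a small macro:

macro_rules! require_x86_feature {
    // `tt` so the literal can be re-matched by `is_x86_feature_detected!`.
    ($feat:tt) => {
        if !std::arch::is_x86_feature_detected!($feat) {
            return; // silently skip on hosts without the feature
        }
    };
}

#[test]
fn avx2_example_gated() {
    require_x86_feature!("avx2");
    // equivalence checks run only on AVX2-capable hosts
}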
diff --git a/src/row/arch/x86_sse41/tests.rs b/src/row/arch/x86_sse41/tests.rs
index 239b943..284f2b3 100644
--- a/src/row/arch/x86_sse41/tests.rs
+++ b/src/row/arch/x86_sse41/tests.rs
@@ -1707,6 +1707,9 @@ fn check_p16_u8_sse41_rgba_equivalence(width: usize, matrix: ColorMatrix, full_r
 
 #[test]
 fn sse41_yuv420p_n_rgba_matches_scalar_all_bits() {
+    if !std::arch::is_x86_feature_detected!("sse4.1") {
+        return;
+    }
     for m in [
         ColorMatrix::Bt601,
         ColorMatrix::Bt709,
@@ -1726,6 +1729,9 @@ fn sse41_yuv420p_n_rgba_matches_scalar_all_bits() {
 
 #[test]
 fn sse41_yuv420p_n_rgba_matches_scalar_tail_and_1920() {
+    if !std::arch::is_x86_feature_detected!("sse4.1") {
+        return;
+    }
     for w in [18usize, 30, 34, 1920, 1922] {
         check_planar_u8_sse41_rgba_equivalence_n::<9>(w, ColorMatrix::Bt601, false);
         check_planar_u8_sse41_rgba_equivalence_n::<10>(w, ColorMatrix::Bt709, true);
@@ -1736,6 +1742,9 @@ fn sse41_yuv420p_n_rgba_matches_scalar_tail_and_1920() {
 
 #[test]
 fn sse41_pn_rgba_matches_scalar_all_bits() {
+    if !std::arch::is_x86_feature_detected!("sse4.1") {
+        return;
+    }
     for m in [
         ColorMatrix::Bt601,
         ColorMatrix::Bt709,
@@ -1753,6 +1762,9 @@ fn sse41_pn_rgba_matches_scalar_all_bits() {
 
 #[test]
 fn sse41_pn_rgba_matches_scalar_tail_and_1920() {
+    if !std::arch::is_x86_feature_detected!("sse4.1") {
+        return;
+    }
     for w in [18usize, 30, 34, 1920, 1922] {
         check_pn_u8_sse41_rgba_equivalence_n::<10>(w, ColorMatrix::Bt601, false);
         check_pn_u8_sse41_rgba_equivalence_n::<12>(w, ColorMatrix::Bt709, true);
@@ -1761,6 +1773,9 @@ fn sse41_pn_rgba_matches_scalar_tail_and_1920() {
 
 #[test]
 fn sse41_yuv420p16_rgba_matches_scalar_all_matrices() {
+    if !std::arch::is_x86_feature_detected!("sse4.1") {
+        return;
+    }
     for m in [
         ColorMatrix::Bt601,
         ColorMatrix::Bt709,
@@ -1780,6 +1795,9 @@ fn sse41_yuv420p16_rgba_matches_scalar_all_matrices() {
 
 #[test]
 fn sse41_p016_rgba_matches_scalar_all_matrices() {
+    if !std::arch::is_x86_feature_detected!("sse4.1") {
+        return;
+    }
     for m in [
         ColorMatrix::Bt601,
         ColorMatrix::Bt709,
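For reference, the `check_*` helpers these guards protect all follow the same scalar-vs-SIMD shape. A condensed sketch under stated assumptions (the helper name and sample values are illustrative, kernel signatures are as in the dispatchers above, and `width` is even per 4:2:0):

fn check_yuv420p16_rgba_equivalence_sketch(width: usize) {
    assert_eq!(width & 1, 0, "4:2:0 rows must be even-width");
    // Deterministic, non-trivial 16-bit samples.
    let y: Vec<u16> = (0..width).map(|i| (i as u16).wrapping_mul(2557)).collect();
    let u: Vec<u16> = (0..width / 2)
        .map(|i| 32768u16.wrapping_add((i as u16).wrapping_mul(97)))
        .collect();
    let v: Vec<u16> = (0..width / 2)
        .map(|i| 32768u16.wrapping_sub((i as u16).wrapping_mul(53)))
        .collect();

    let mut scalar_out = vec![0u8; width * 4];
    let mut simd_out = vec![0u8; width * 4];

    scalar::yuv_420p16_to_rgba_row(&y, &u, &v, &mut scalar_out, width, ColorMatrix::Bt709, false);
    // SAFETY: callers gate on is_x86_feature_detected!("sse4.1") first.
    unsafe {
        arch::x86_sse41::yuv_420p16_to_rgba_row(&y, &u, &v, &mut simd_out, width, ColorMatrix::Bt709, false);
    }

    assert_eq!(simd_out, scalar_out, "SIMD row must match the scalar reference byte-for-byte");
}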