Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
109 changes: 98 additions & 11 deletions src/row/arch/neon.rs
Original file line number Diff line number Diff line change
Expand Up @@ -703,7 +703,8 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_or_rgba_u16_row<const BITS: u32, const AL
/// [`yuv_420p_n_to_rgb_row`] but with full-width U/V (no chroma
/// duplication) and no width parity constraint.
///
/// Thin wrapper over [`yuv_444p_n_to_rgb_or_rgba_row`] with `ALPHA = false`.
/// Thin wrapper over [`yuv_444p_n_to_rgb_or_rgba_row`] with
/// `ALPHA = false, ALPHA_SRC = false`.
///
/// # Safety
///
Expand All @@ -721,7 +722,9 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_row<const BITS: u32>(
) {
// SAFETY: caller obligations forwarded to the shared impl.
unsafe {
yuv_444p_n_to_rgb_or_rgba_row::<BITS, false>(y, u, v, rgb_out, width, matrix, full_range);
yuv_444p_n_to_rgb_or_rgba_row::<BITS, false, false>(
y, u, v, rgb_out, width, matrix, full_range, None,
);
}
}

Expand All @@ -730,7 +733,8 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_row<const BITS: u32>(
/// [`yuv_444p_n_to_rgb_row`]; the only differences are the per-pixel
/// stride (4 vs 3) and the constant alpha byte (`0xFF`, opaque).
///
/// Thin wrapper over [`yuv_444p_n_to_rgb_or_rgba_row`] with `ALPHA = true`.
/// Thin wrapper over [`yuv_444p_n_to_rgb_or_rgba_row`] with
/// `ALPHA = true, ALPHA_SRC = false`.
///
/// # Safety
///
Expand All @@ -748,38 +752,96 @@ pub(crate) unsafe fn yuv_444p_n_to_rgba_row<const BITS: u32>(
) {
// SAFETY: caller obligations forwarded to the shared impl.
unsafe {
yuv_444p_n_to_rgb_or_rgba_row::<BITS, true>(y, u, v, rgba_out, width, matrix, full_range);
yuv_444p_n_to_rgb_or_rgba_row::<BITS, true, false>(
y, u, v, rgba_out, width, matrix, full_range, None,
);
}
}

/// NEON YUVA 4:4:4 planar high-bit-depth → packed **8-bit RGBA** where
/// each pixel's alpha byte is **loaded from the `a_src` plane** (narrowed
/// to `u8` via `>> (BITS - 8)`) rather than being the constant `0xFF`.
/// R/G/B follow the same numerical contract as [`yuv_444p_n_to_rgba_row`].
///
/// Thin wrapper over [`yuv_444p_n_to_rgb_or_rgba_row`] with
/// `ALPHA = true, ALPHA_SRC = true`.
///
/// # Safety
///
/// Same as [`yuv_444p_n_to_rgba_row`] plus `a_src.len() >= width`.
#[inline]
#[target_feature(enable = "neon")]
#[allow(clippy::too_many_arguments)]
pub(crate) unsafe fn yuv_444p_n_to_rgba_with_alpha_src_row<const BITS: u32>(
    y: &[u16],
    u: &[u16],
    v: &[u16],
    a_src: &[u16],
    rgba_out: &mut [u8],
    width: usize,
    matrix: ColorMatrix,
    full_range: bool,
) {
    // SAFETY: caller obligations forwarded to the shared impl.
    unsafe {
        yuv_444p_n_to_rgb_or_rgba_row::<BITS, true, true>(
            y, u, v, rgba_out, width, matrix, full_range, Some(a_src),
        );
    }
}

/// Shared NEON high-bit-depth YUV 4:4:4 kernel for
/// [`yuv_444p_n_to_rgb_row`] (`ALPHA = false`, `vst3q_u8`) and
/// [`yuv_444p_n_to_rgba_row`] (`ALPHA = true`, `vst4q_u8` with
/// constant `0xFF` alpha vector).
/// [`yuv_444p_n_to_rgb_row`] (`ALPHA = false, ALPHA_SRC = false`,
/// `vst3q_u8`), [`yuv_444p_n_to_rgba_row`] (`ALPHA = true,
/// ALPHA_SRC = false`, `vst4q_u8` with constant `0xFF` alpha vector)
/// and [`yuv_444p_n_to_rgba_with_alpha_src_row`] (`ALPHA = true,
/// ALPHA_SRC = true`, `vst4q_u8` with the alpha lane loaded and
/// depth-converted from `a_src`).
///
/// # Safety
///
/// 1. **NEON must be available.**
/// 2. `y.len() >= width`, `u.len() >= width`, `v.len() >= width`,
/// `out.len() >= width * if ALPHA { 4 } else { 3 }`.
/// 3. `BITS` must be one of `{9, 10, 12, 14}`.
/// 3. When `ALPHA_SRC = true`: `a_src` must be `Some(_)` and
/// `a_src.unwrap().len() >= width`.
/// 4. `BITS` must be one of `{9, 10, 12, 14}`.
#[inline]
#[target_feature(enable = "neon")]
pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_row<const BITS: u32, const ALPHA: bool>(
#[allow(clippy::too_many_arguments)]
pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_row<
const BITS: u32,
const ALPHA: bool,
const ALPHA_SRC: bool,
>(
y: &[u16],
u: &[u16],
v: &[u16],
out: &mut [u8],
width: usize,
matrix: ColorMatrix,
full_range: bool,
a_src: Option<&[u16]>,
) {
const { assert!(BITS == 9 || BITS == 10 || BITS == 12 || BITS == 14) };
// Source alpha requires RGBA output — there is no 3 bpp store with
// alpha to put it in.
const { assert!(!ALPHA_SRC || ALPHA) };
let bpp: usize = if ALPHA { 4 } else { 3 };
debug_assert!(y.len() >= width);
debug_assert!(u.len() >= width);
debug_assert!(v.len() >= width);
debug_assert!(out.len() >= width * bpp);
if ALPHA_SRC {
debug_assert!(a_src.as_ref().is_some_and(|s| s.len() >= width));
}

let coeffs = scalar::Coefficients::for_matrix(matrix);
let (y_off, y_scale, c_scale) = scalar::range_params_n::<BITS, 8>(full_range);
Expand Down Expand Up @@ -863,9 +925,28 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_row<const BITS: u32, const ALPHA:
);

if ALPHA {
let a_u8 = if ALPHA_SRC {
// SAFETY (const-checked): ALPHA_SRC = true implies the
// wrapper passed Some(_), validated by debug_assert above.
let a_ptr = a_src.as_ref().unwrap_unchecked().as_ptr();
let a_lo_u16 = vandq_u16(vld1q_u16(a_ptr.add(x)), mask_v);
let a_hi_u16 = vandq_u16(vld1q_u16(a_ptr.add(x + 8)), mask_v);
// Mask before shifting to harden against over-range source
// alpha (e.g. 1024 at BITS=10), matching scalar. NEON's
// `vshrq_n_u16` requires a literal const generic shift, but
// `BITS - 8` is not a stable const expression on a const
// generic — `vshlq_u16` with a negative count vector
// performs the same logical right shift dynamically.
let a_shr = vdupq_n_s16(-((BITS - 8) as i16));
let a_lo_shifted = vshlq_u16(a_lo_u16, a_shr);
let a_hi_shifted = vshlq_u16(a_hi_u16, a_shr);
vcombine_u8(vqmovn_u16(a_lo_shifted), vqmovn_u16(a_hi_shifted))
} else {
alpha_u8
};
vst4q_u8(
out.as_mut_ptr().add(x * 4),
uint8x16x4_t(r_u8, g_u8, b_u8, alpha_u8),
uint8x16x4_t(r_u8, g_u8, b_u8, a_u8),
);
} else {
vst3q_u8(out.as_mut_ptr().add(x * 3), uint8x16x3_t(r_u8, g_u8, b_u8));
Expand All @@ -880,7 +961,13 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_row<const BITS: u32, const ALPHA:
let tail_v = &v[x..width];
let tail_out = &mut out[x * bpp..width * bpp];
let tail_w = width - x;
if ALPHA {
if ALPHA_SRC {
// SAFETY (const-checked): ALPHA_SRC = true implies Some(_).
let tail_a = &a_src.as_ref().unwrap_unchecked()[x..width];
scalar::yuv_444p_n_to_rgba_with_alpha_src_row::<BITS>(
tail_y, tail_u, tail_v, tail_a, tail_out, tail_w, matrix, full_range,
);
} else if ALPHA {
scalar::yuv_444p_n_to_rgba_row::<BITS>(
tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range,
);
Expand Down
130 changes: 130 additions & 0 deletions src/row/arch/neon/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2547,6 +2547,136 @@ fn neon_p416_rgba_matches_scalar_all_matrices() {
}
}

// ---- YUVA 4:4:4 u8 RGBA equivalence (Ship 8b-1b) --------------------
//
// Mirrors the no-alpha 4:4:4 RGBA pattern above for the alpha-source
// path: per-pixel alpha byte is loaded from the source plane, masked
// with `bits_mask::<10>()`, and depth-converted via `>> 2`. Pseudo-
// random alpha is used to flush out lane-order corruption that a
// solid-alpha buffer would mask.

// Drives one scalar-vs-NEON comparison: render the same pseudo-random
// YUVA 4:4:4 input through both row kernels and require byte-exact
// agreement on the packed RGBA output.
fn check_yuv444p_n_u8_neon_rgba_with_alpha_src_equivalence<const BITS: u32>(
    width: usize,
    matrix: ColorMatrix,
    full_range: bool,
    alpha_seed: usize,
) {
    // Decorrelated pseudo-random planes; the alpha plane's seed is the
    // caller's knob for exercising different alpha patterns.
    let (y, u, v) = (
        planar_n_plane::<BITS>(width, 37),
        planar_n_plane::<BITS>(width, 53),
        planar_n_plane::<BITS>(width, 71),
    );
    let a_src = planar_n_plane::<BITS>(width, alpha_seed);

    // Scalar reference output.
    let mut expected = std::vec![0u8; width * 4];
    scalar::yuv_444p_n_to_rgba_with_alpha_src_row::<BITS>(
        &y, &u, &v, &a_src, &mut expected, width, matrix, full_range,
    );

    // NEON output under test.
    let mut actual = std::vec![0u8; width * 4];
    // SAFETY: plane lengths equal `width`, output is `width * 4` bytes.
    unsafe {
        yuv_444p_n_to_rgba_with_alpha_src_row::<BITS>(
            &y, &u, &v, &a_src, &mut actual, width, matrix, full_range,
        );
    }

    assert_eq!(
        expected, actual,
        "NEON Yuva444p<{BITS}> → RGBA u8 diverges (width={width}, matrix={matrix:?}, full_range={full_range}, alpha_seed={alpha_seed})"
    );
}

#[test]
#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")]
fn neon_yuva444p10_rgba_matches_scalar_all_matrices_16() {
    // Full matrix sweep at a single SIMD-exact width.
    let matrices = [
        ColorMatrix::Bt601,
        ColorMatrix::Bt709,
        ColorMatrix::Bt2020Ncl,
        ColorMatrix::Smpte240m,
        ColorMatrix::Fcc,
        ColorMatrix::YCgCo,
    ];
    for m in matrices {
        check_yuv444p_n_u8_neon_rgba_with_alpha_src_equivalence::<10>(16, m, true, 89);
        check_yuv444p_n_u8_neon_rgba_with_alpha_src_equivalence::<10>(16, m, false, 89);
    }
}

#[test]
#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")]
fn neon_yuva444p10_rgba_matches_scalar_widths() {
    // One SIMD-exact width plus widths that leave a scalar tail.
    [16usize, 17, 31, 47, 63, 1920, 1922].into_iter().for_each(|w| {
        check_yuv444p_n_u8_neon_rgba_with_alpha_src_equivalence::<10>(w, ColorMatrix::Bt709, true, 89);
    });
}

#[test]
#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")]
fn neon_yuva444p10_rgba_matches_scalar_random_alpha() {
    // Vary the alpha seed so a lane-order mixup through `vst4q_u8`
    // (alpha swapped with R/G/B) cannot slip past the comparison.
    for seed in [13usize, 41, 89, 127, 211] {
        check_yuv444p_n_u8_neon_rgba_with_alpha_src_equivalence::<10>(16, ColorMatrix::Bt601, false, seed);
        check_yuv444p_n_u8_neon_rgba_with_alpha_src_equivalence::<10>(31, ColorMatrix::Bt2020Ncl, true, seed);
    }
}

#[test]
#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")]
fn neon_yuva444p_n_rgba_matches_scalar_all_bits() {
    // Remaining bit depths (BITS = 10 already gets the full matrix
    // sweep above); exercises the `BITS - 8` alpha shift at each depth.
    for full in [true, false] {
        check_yuv444p_n_u8_neon_rgba_with_alpha_src_equivalence::<9>(16, ColorMatrix::Bt601, full, 53);
        check_yuv444p_n_u8_neon_rgba_with_alpha_src_equivalence::<12>(16, ColorMatrix::Bt709, full, 53);
        check_yuv444p_n_u8_neon_rgba_with_alpha_src_equivalence::<14>(16, ColorMatrix::Bt2020Ncl, full, 53);
    }
}

#[test]
#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")]
fn neon_yuva444p_n_rgba_matches_scalar_all_bits_widths() {
    // Tail widths at BITS = 9/12/14 — the variable-shift alpha path
    // must agree across both the SIMD body and the scalar tail.
    for w in [17usize, 47, 1922] {
        check_yuv444p_n_u8_neon_rgba_with_alpha_src_equivalence::<9>(w, ColorMatrix::Smpte240m, false, 89);
        check_yuv444p_n_u8_neon_rgba_with_alpha_src_equivalence::<12>(w, ColorMatrix::Fcc, true, 89);
        check_yuv444p_n_u8_neon_rgba_with_alpha_src_equivalence::<14>(w, ColorMatrix::YCgCo, false, 89);
    }
}

// ---- High-bit 4:4:4 native-depth `u16` RGBA equivalence (Ship 8 Tranche 7c) ----
//
// u16 RGBA wrappers share the math of their u16 RGB siblings — only
Expand Down
Loading
Loading