Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
147 changes: 135 additions & 12 deletions src/row/arch/neon.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1548,6 +1548,10 @@ unsafe fn nv24_or_nv42_to_rgb_row_impl<const SWAP_UV: bool>(
///
/// # Safety
///
/// Same contract as [`yuv_444_to_rgb_or_rgba_row`] with
/// `ALPHA = false` (so `out.len() >= width * 3` specializes to
/// `rgb_out.len() >= 3 * width`):
///
/// 1. **NEON must be available on the current CPU.**
/// 2. `y.len() >= width`, `u.len() >= width`, `v.len() >= width`.
/// 3. `rgb_out.len() >= 3 * width`.
Expand All @@ -1563,11 +1567,69 @@ pub(crate) unsafe fn yuv_444_to_rgb_row(
width: usize,
matrix: ColorMatrix,
full_range: bool,
) {
// SAFETY: caller-checked NEON availability + slice bounds — see
// [`yuv_444_to_rgb_or_rgba_row`] safety contract.
unsafe {
yuv_444_to_rgb_or_rgba_row::<false>(y, u, v, rgb_out, width, matrix, full_range);
}
}

/// NEON YUV 4:4:4 planar → packed **RGBA** (8-bit).
///
/// Thin forwarder into the shared kernel with `ALPHA = true`, so
/// every pixel is stored as 4 bytes (R, G, B, constant `0xFF`
/// alpha) via `vst4q_u8`.
///
/// # Safety
///
/// Same as [`yuv_444_to_rgb_row`] except the output slice must be
/// `>= 4 * width` bytes.
#[inline]
#[target_feature(enable = "neon")]
pub(crate) unsafe fn yuv_444_to_rgba_row(
    y: &[u8],
    u: &[u8],
    v: &[u8],
    rgba_out: &mut [u8],
    width: usize,
    matrix: ColorMatrix,
    full_range: bool,
) {
    // SAFETY: the caller upholds NEON availability and the slice
    // bounds required by the shared kernel — see the safety contract
    // on [`yuv_444_to_rgb_or_rgba_row`].
    unsafe {
        yuv_444_to_rgb_or_rgba_row::<true>(y, u, v, rgba_out, width, matrix, full_range);
    }
}

/// Shared NEON YUV 4:4:4 kernel for [`yuv_444_to_rgb_row`]
/// (`ALPHA = false`, `vst3q_u8`) and [`yuv_444_to_rgba_row`]
/// (`ALPHA = true`, `vst4q_u8` with constant `0xFF` alpha). Math is
/// byte-identical to `scalar::yuv_444_to_rgb_or_rgba_row::<ALPHA>`;
/// only the per-block store intrinsic differs.
///
/// # Safety
///
/// 1. **NEON must be available on the current CPU.**
/// 2. `y.len() >= width`, `u.len() >= width`, `v.len() >= width`.
/// 3. `out.len() >= width * (if ALPHA { 4 } else { 3 })`.
///
/// No width parity constraint (4:4:4).
#[inline]
#[target_feature(enable = "neon")]
pub(crate) unsafe fn yuv_444_to_rgb_or_rgba_row<const ALPHA: bool>(
y: &[u8],
u: &[u8],
v: &[u8],
out: &mut [u8],
width: usize,
matrix: ColorMatrix,
full_range: bool,
) {
debug_assert!(y.len() >= width);
debug_assert!(u.len() >= width);
debug_assert!(v.len() >= width);
debug_assert!(rgb_out.len() >= width * 3);
let bpp: usize = if ALPHA { 4 } else { 3 };
debug_assert!(out.len() >= width * bpp);

let coeffs = scalar::Coefficients::for_matrix(matrix);
let (y_off, y_scale, c_scale) = scalar::range_params(full_range);
Expand All @@ -1588,6 +1650,7 @@ pub(crate) unsafe fn yuv_444_to_rgb_row(
let cgv = vdupq_n_s32(coeffs.g_v());
let cbu = vdupq_n_s32(coeffs.b_u());
let cbv = vdupq_n_s32(coeffs.b_v());
let alpha_u8 = vdupq_n_u8(0xFF);

let mut x = 0usize;
while x + 16 <= width {
Expand Down Expand Up @@ -1646,22 +1709,28 @@ pub(crate) unsafe fn yuv_444_to_rgb_row(
vqmovun_s16(vqaddq_s16(y_scaled_hi, r_chroma_hi)),
);

let rgb = uint8x16x3_t(r_u8, g_u8, b_u8);
vst3q_u8(rgb_out.as_mut_ptr().add(x * 3), rgb);
if ALPHA {
let rgba = uint8x16x4_t(r_u8, g_u8, b_u8, alpha_u8);
vst4q_u8(out.as_mut_ptr().add(x * 4), rgba);
} else {
let rgb = uint8x16x3_t(r_u8, g_u8, b_u8);
vst3q_u8(out.as_mut_ptr().add(x * 3), rgb);
}

x += 16;
}

if x < width {
scalar::yuv_444_to_rgb_row(
&y[x..width],
&u[x..width],
&v[x..width],
&mut rgb_out[x * 3..width * 3],
width - x,
matrix,
full_range,
);
let tail_y = &y[x..width];
let tail_u = &u[x..width];
let tail_v = &v[x..width];
let tail_w = width - x;
let tail_out = &mut out[x * bpp..width * bpp];
if ALPHA {
scalar::yuv_444_to_rgba_row(tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range);
} else {
scalar::yuv_444_to_rgb_row(tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range);
}
}
}
}
Expand Down Expand Up @@ -4136,6 +4205,60 @@ mod tests {
}
}

// ---- yuv_444_to_rgba_row equivalence --------------------------------

/// Runs `width` synthetic 4:4:4 pixels through both the scalar and
/// the NEON RGBA row kernels and panics with a byte-level diagnostic
/// on the first divergence.
fn check_yuv_444_rgba_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) {
    // Deterministic pseudo-random plane fill spanning the byte range.
    let fill = |mul: usize, add: usize| -> std::vec::Vec<u8> {
        (0..width).map(|i| ((i * mul + add) & 0xFF) as u8).collect()
    };
    let y = fill(37, 11);
    let u = fill(53, 23);
    let v = fill(71, 91);

    let mut rgba_scalar = std::vec![0u8; width * 4];
    let mut rgba_neon = std::vec![0u8; width * 4];

    scalar::yuv_444_to_rgba_row(&y, &u, &v, &mut rgba_scalar, width, matrix, full_range);
    // SAFETY: NEON is available when these tests are compiled in;
    // all planes are `width` bytes and the output is `width * 4`.
    unsafe {
        yuv_444_to_rgba_row(&y, &u, &v, &mut rgba_neon, width, matrix, full_range);
    }

    if rgba_scalar == rgba_neon {
        return;
    }
    let first_diff = rgba_scalar
        .iter()
        .zip(rgba_neon.iter())
        .position(|(a, b)| a != b)
        .unwrap();
    let pixel = first_diff / 4;
    let channel = ["R", "G", "B", "A"][first_diff % 4];
    panic!(
        "NEON yuv_444 RGBA diverges from scalar at byte {first_diff} (px {pixel} {channel}, width={width}, matrix={matrix:?}, full_range={full_range}): scalar={} neon={}",
        rgba_scalar[first_diff], rgba_neon[first_diff]
    );
}

#[test]
#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")]
fn yuv_444_neon_rgba_matches_scalar_all_matrices_16() {
    // One exact 16-pixel SIMD block per supported matrix, in both
    // full and limited range.
    let matrices = [
        ColorMatrix::Bt601,
        ColorMatrix::Bt709,
        ColorMatrix::Bt2020Ncl,
        ColorMatrix::Smpte240m,
        ColorMatrix::Fcc,
        ColorMatrix::YCgCo,
    ];
    for matrix in matrices {
        for full_range in [true, false] {
            check_yuv_444_rgba_equivalence(16, matrix, full_range);
        }
    }
}

#[test]
#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")]
fn yuv_444_neon_rgba_matches_scalar_widths() {
    // Widths below / at / above the 16-pixel block, plus HD-sized
    // rows, to exercise both the vector loop and the scalar tail.
    let widths: [usize; 8] = [1, 3, 15, 17, 32, 33, 1920, 1921];
    for width in widths {
        check_yuv_444_rgba_equivalence(width, ColorMatrix::Bt709, false);
    }
}

// ---- rgb_to_hsv_row equivalence ------------------------------------
//
// The NEON HSV kernel uses `vmaxq_f32` / `vminq_f32` / `vdivq_f32`
Expand Down
139 changes: 125 additions & 14 deletions src/row/arch/wasm_simd128.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1857,9 +1857,8 @@ unsafe fn nv24_or_nv42_to_rgb_row_impl<const SWAP_UV: bool>(
}
}

/// wasm simd128 YUV 4:4:4 planar → packed RGB. 16 Y + 16 U + 16 V
/// per iteration. Same arithmetic as [`nv24_to_rgb_row`] but U and V
/// come from separate planes (no deinterleave).
/// wasm simd128 YUV 4:4:4 planar → packed RGB. Thin wrapper over
/// [`yuv_444_to_rgb_or_rgba_row`] with `ALPHA = false`.
///
/// # Safety
///
Expand All @@ -1876,11 +1875,65 @@ pub(crate) unsafe fn yuv_444_to_rgb_row(
width: usize,
matrix: ColorMatrix,
full_range: bool,
) {
// SAFETY: caller-checked simd128 availability + slice bounds — see
// [`yuv_444_to_rgb_or_rgba_row`] safety contract.
unsafe {
yuv_444_to_rgb_or_rgba_row::<false>(y, u, v, rgb_out, width, matrix, full_range);
}
}

/// wasm simd128 YUV 4:4:4 planar → packed **RGBA** (8-bit).
///
/// Thin forwarder into the shared kernel with `ALPHA = true`, so
/// every pixel is stored as 4 bytes (R, G, B, constant `0xFF`
/// alpha) via [`write_rgba_16`].
///
/// # Safety
///
/// Same as [`yuv_444_to_rgb_row`] except the output slice must be
/// `>= 4 * width` bytes.
#[inline]
#[target_feature(enable = "simd128")]
pub(crate) unsafe fn yuv_444_to_rgba_row(
    y: &[u8],
    u: &[u8],
    v: &[u8],
    rgba_out: &mut [u8],
    width: usize,
    matrix: ColorMatrix,
    full_range: bool,
) {
    // SAFETY: the caller upholds simd128 availability and the slice
    // bounds required by the shared kernel — see the safety contract
    // on [`yuv_444_to_rgb_or_rgba_row`].
    unsafe {
        yuv_444_to_rgb_or_rgba_row::<true>(y, u, v, rgba_out, width, matrix, full_range);
    }
}

/// Shared wasm simd128 YUV 4:4:4 kernel for [`yuv_444_to_rgb_row`]
/// (`ALPHA = false`, [`write_rgb_16`]) and [`yuv_444_to_rgba_row`]
/// (`ALPHA = true`, [`write_rgba_16`] with constant `0xFF` alpha).
///
/// # Safety
///
/// 1. **simd128 must be enabled at compile time.**
/// 2. `y.len() >= width`, `u.len() >= width`, `v.len() >= width`.
/// 3. `out.len() >= width * (if ALPHA { 4 } else { 3 })`.
#[inline]
#[target_feature(enable = "simd128")]
pub(crate) unsafe fn yuv_444_to_rgb_or_rgba_row<const ALPHA: bool>(
y: &[u8],
u: &[u8],
v: &[u8],
out: &mut [u8],
width: usize,
matrix: ColorMatrix,
full_range: bool,
) {
debug_assert!(y.len() >= width);
debug_assert!(u.len() >= width);
debug_assert!(v.len() >= width);
debug_assert!(rgb_out.len() >= width * 3);
let bpp: usize = if ALPHA { 4 } else { 3 };
debug_assert!(out.len() >= width * bpp);

let coeffs = scalar::Coefficients::for_matrix(matrix);
let (y_off, y_scale, c_scale) = scalar::range_params(full_range);
Expand All @@ -1898,6 +1951,7 @@ pub(crate) unsafe fn yuv_444_to_rgb_row(
let cgv = i32x4_splat(coeffs.g_v());
let cbu = i32x4_splat(coeffs.b_u());
let cbv = i32x4_splat(coeffs.b_v());
let alpha_u8 = u8x16_splat(0xFF);

let mut x = 0usize;
while x + 16 <= width {
Expand Down Expand Up @@ -1953,21 +2007,26 @@ pub(crate) unsafe fn yuv_444_to_rgb_row(
let g_u8 = u8x16_narrow_i16x8(g_lo, g_hi);
let r_u8 = u8x16_narrow_i16x8(r_lo, r_hi);

write_rgb_16(r_u8, g_u8, b_u8, rgb_out.as_mut_ptr().add(x * 3));
if ALPHA {
write_rgba_16(r_u8, g_u8, b_u8, alpha_u8, out.as_mut_ptr().add(x * 4));
} else {
write_rgb_16(r_u8, g_u8, b_u8, out.as_mut_ptr().add(x * 3));
}

x += 16;
}

if x < width {
scalar::yuv_444_to_rgb_row(
&y[x..width],
&u[x..width],
&v[x..width],
&mut rgb_out[x * 3..width * 3],
width - x,
matrix,
full_range,
);
let tail_y = &y[x..width];
let tail_u = &u[x..width];
let tail_v = &v[x..width];
let tail_w = width - x;
let tail_out = &mut out[x * bpp..width * bpp];
if ALPHA {
scalar::yuv_444_to_rgba_row(tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range);
} else {
scalar::yuv_444_to_rgb_row(tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range);
}
}
}
}
Expand Down Expand Up @@ -3721,6 +3780,58 @@ mod tests {
}
}

// ---- yuv_444_to_rgba_row equivalence --------------------------------

/// Runs `width` synthetic 4:4:4 pixels through both the scalar and
/// the wasm simd128 RGBA row kernels and panics with a byte-level
/// diagnostic on the first divergence.
fn check_yuv_444_rgba_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) {
    // Deterministic pseudo-random plane fill spanning the byte range.
    let fill = |mul: usize, add: usize| -> std::vec::Vec<u8> {
        (0..width).map(|i| ((i * mul + add) & 0xFF) as u8).collect()
    };
    let y = fill(37, 11);
    let u = fill(53, 23);
    let v = fill(71, 91);

    let mut rgba_scalar = std::vec![0u8; width * 4];
    let mut rgba_wasm = std::vec![0u8; width * 4];

    scalar::yuv_444_to_rgba_row(&y, &u, &v, &mut rgba_scalar, width, matrix, full_range);
    // SAFETY: simd128 is enabled when these tests are compiled in;
    // all planes are `width` bytes and the output is `width * 4`.
    unsafe {
        yuv_444_to_rgba_row(&y, &u, &v, &mut rgba_wasm, width, matrix, full_range);
    }

    if rgba_scalar == rgba_wasm {
        return;
    }
    let first_diff = rgba_scalar
        .iter()
        .zip(rgba_wasm.iter())
        .position(|(a, b)| a != b)
        .unwrap();
    let pixel = first_diff / 4;
    let channel = ["R", "G", "B", "A"][first_diff % 4];
    panic!(
        "wasm simd128 yuv_444 RGBA diverges from scalar at byte {first_diff} (px {pixel} {channel}, width={width}, matrix={matrix:?}, full_range={full_range}): scalar={} wasm={}",
        rgba_scalar[first_diff], rgba_wasm[first_diff]
    );
}

#[test]
fn simd128_yuv_444_rgba_matches_scalar_all_matrices_16() {
    // One exact 16-pixel SIMD block per supported matrix, in both
    // full and limited range.
    let matrices = [
        ColorMatrix::Bt601,
        ColorMatrix::Bt709,
        ColorMatrix::Bt2020Ncl,
        ColorMatrix::Smpte240m,
        ColorMatrix::Fcc,
        ColorMatrix::YCgCo,
    ];
    for matrix in matrices {
        for full_range in [true, false] {
            check_yuv_444_rgba_equivalence(16, matrix, full_range);
        }
    }
}

#[test]
fn simd128_yuv_444_rgba_matches_scalar_widths() {
    // Widths below / at / above the 16-pixel block, plus HD-sized
    // rows, to exercise both the vector loop and the scalar tail.
    let widths: [usize; 8] = [1, 3, 15, 17, 32, 33, 1920, 1921];
    for width in widths {
        check_yuv_444_rgba_equivalence(width, ColorMatrix::Bt709, false);
    }
}

// ---- yuv_444p_n<BITS> + yuv_444p16 equivalence ----------------------

fn check_yuv_444p_n_equivalence<const BITS: u32>(
Expand Down
Loading
Loading