diff --git a/src/row/mod.rs b/src/row/mod.rs index b33cfa4..b4287df 100644 --- a/src/row/mod.rs +++ b/src/row/mod.rs @@ -4264,6 +4264,441 @@ pub fn p412_to_rgb_u16_row( p_n_444_to_rgb_u16_row::<12>(y, uv_full, rgb_out, width, matrix, full_range, use_simd); } +// ---- High-bit 4:4:4 RGBA dispatchers (Ship 8 Tranche 6 prep) ---------- +// +// Both u8 and native-depth `u16` RGBA dispatchers route to the scalar +// reference path. SIMD per-arch routes land in the follow-up Ship 8 +// Tranche 6b (u8) and Tranche 6c (u16) PRs; the `use_simd` parameter +// is held in the signature for API stability, but every body is +// `let _ = use_simd;` plus a scalar call until the SIMD wiring lands. + +/// Converts one row of **9-bit** YUV 4:4:4 to packed **8-bit** +/// **RGBA** (`R, G, B, 0xFF`; alpha defaults to opaque since the +/// source has no alpha plane). +/// +/// Same numerical contract as [`yuv444p9_to_rgb_row`] except for the +/// per-pixel stride (4 vs 3) and the constant alpha byte. See +/// `scalar::yuv_444p_n_to_rgba_row` for the reference. +/// +/// `use_simd = false` forces the scalar reference path. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv444p9_to_rgba_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + let rgba_min = rgba_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(u.len() >= width, "u row too short"); + assert!(v.len() >= width, "v row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + let _ = use_simd; // SIMD per-arch routes land in Ship 8 Tranche 6b. 
+ scalar::yuv_444p_n_to_rgba_row::<9>(y, u, v, rgba_out, width, matrix, full_range); +} + +/// Converts one row of **9-bit** YUV 4:4:4 to **native-depth `u16`** +/// packed **RGBA** — output is low-bit-packed (`[0, (1 << 9) - 1]` +/// in the low bits of each `u16`); alpha element is `(1 << 9) - 1` +/// (opaque maximum at the input bit depth). +/// +/// See `scalar::yuv_444p_n_to_rgba_u16_row` for the reference. +/// `use_simd = false` forces the scalar reference path. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv444p9_to_rgba_u16_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + let rgba_min = rgba_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(u.len() >= width, "u row too short"); + assert!(v.len() >= width, "v row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + let _ = use_simd; // SIMD per-arch routes land in Ship 8 Tranche 6c. + scalar::yuv_444p_n_to_rgba_u16_row::<9>(y, u, v, rgba_out, width, matrix, full_range); +} + +/// Converts one row of **10-bit** YUV 4:4:4 to packed **8-bit** +/// **RGBA** (`R, G, B, 0xFF`). +/// +/// `use_simd = false` forces the scalar reference path. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv444p10_to_rgba_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + let rgba_min = rgba_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(u.len() >= width, "u row too short"); + assert!(v.len() >= width, "v row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + let _ = use_simd; // SIMD per-arch routes land in Ship 8 Tranche 6b. 
+ scalar::yuv_444p_n_to_rgba_row::<10>(y, u, v, rgba_out, width, matrix, full_range); +} + +/// Converts one row of **10-bit** YUV 4:4:4 to **native-depth `u16`** +/// packed **RGBA** — output is low-bit-packed (`[0, 1023]`); alpha +/// element is `1023`. +/// +/// `use_simd = false` forces the scalar reference path. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv444p10_to_rgba_u16_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + let rgba_min = rgba_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(u.len() >= width, "u row too short"); + assert!(v.len() >= width, "v row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + let _ = use_simd; // SIMD per-arch routes land in Ship 8 Tranche 6c. + scalar::yuv_444p_n_to_rgba_u16_row::<10>(y, u, v, rgba_out, width, matrix, full_range); +} + +/// Converts one row of **12-bit** YUV 4:4:4 to packed **8-bit** +/// **RGBA** (`R, G, B, 0xFF`). +/// +/// `use_simd = false` forces the scalar reference path. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv444p12_to_rgba_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + let rgba_min = rgba_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(u.len() >= width, "u row too short"); + assert!(v.len() >= width, "v row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + let _ = use_simd; // SIMD per-arch routes land in Ship 8 Tranche 6b. + scalar::yuv_444p_n_to_rgba_row::<12>(y, u, v, rgba_out, width, matrix, full_range); +} + +/// Converts one row of **12-bit** YUV 4:4:4 to **native-depth `u16`** +/// packed **RGBA** — output is low-bit-packed (`[0, 4095]`); alpha +/// element is `4095`. 
+/// +/// `use_simd = false` forces the scalar reference path. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv444p12_to_rgba_u16_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + let rgba_min = rgba_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(u.len() >= width, "u row too short"); + assert!(v.len() >= width, "v row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + let _ = use_simd; // SIMD per-arch routes land in Ship 8 Tranche 6c. + scalar::yuv_444p_n_to_rgba_u16_row::<12>(y, u, v, rgba_out, width, matrix, full_range); +} + +/// Converts one row of **14-bit** YUV 4:4:4 to packed **8-bit** +/// **RGBA** (`R, G, B, 0xFF`). +/// +/// `use_simd = false` forces the scalar reference path. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv444p14_to_rgba_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + let rgba_min = rgba_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(u.len() >= width, "u row too short"); + assert!(v.len() >= width, "v row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + let _ = use_simd; // SIMD per-arch routes land in Ship 8 Tranche 6b. + scalar::yuv_444p_n_to_rgba_row::<14>(y, u, v, rgba_out, width, matrix, full_range); +} + +/// Converts one row of **14-bit** YUV 4:4:4 to **native-depth `u16`** +/// packed **RGBA** — output is low-bit-packed (`[0, 16383]`); alpha +/// element is `16383`. +/// +/// `use_simd = false` forces the scalar reference path. 
+#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv444p14_to_rgba_u16_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + let rgba_min = rgba_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(u.len() >= width, "u row too short"); + assert!(v.len() >= width, "v row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + let _ = use_simd; // SIMD per-arch routes land in Ship 8 Tranche 6c. + scalar::yuv_444p_n_to_rgba_u16_row::<14>(y, u, v, rgba_out, width, matrix, full_range); +} + +/// Converts one row of **16-bit** YUV 4:4:4 to packed **8-bit** +/// **RGBA** (`R, G, B, 0xFF`). Routes through the dedicated 16-bit +/// scalar kernel (`scalar::yuv_444p16_to_rgba_row`). +/// +/// `use_simd = false` forces the scalar reference path. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv444p16_to_rgba_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + let rgba_min = rgba_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(u.len() >= width, "u row too short"); + assert!(v.len() >= width, "v row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + let _ = use_simd; // SIMD per-arch routes land in Ship 8 Tranche 6b. + scalar::yuv_444p16_to_rgba_row(y, u, v, rgba_out, width, matrix, full_range); +} + +/// Converts one row of **16-bit** YUV 4:4:4 to **native-depth `u16`** +/// packed **RGBA** — full-range output `[0, 65535]`; alpha element is +/// `0xFFFF`. Routes through the dedicated 16-bit u16-output scalar +/// kernel (`scalar::yuv_444p16_to_rgba_u16_row`) — i64 chroma multiply. +/// +/// `use_simd = false` forces the scalar reference path. 
+#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv444p16_to_rgba_u16_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + let rgba_min = rgba_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(u.len() >= width, "u row too short"); + assert!(v.len() >= width, "v row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + let _ = use_simd; // SIMD per-arch routes land in Ship 8 Tranche 6c. + scalar::yuv_444p16_to_rgba_u16_row(y, u, v, rgba_out, width, matrix, full_range); +} + +/// P410 (semi-planar 4:4:4, 10-bit high-packed) → packed **8-bit** +/// **RGBA** (`R, G, B, 0xFF`). +/// +/// `use_simd = false` forces the scalar reference path. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn p410_to_rgba_row( + y: &[u16], + uv_full: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + let rgba_min = rgba_row_bytes(width); + let uv_min = uv_full_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(uv_full.len() >= uv_min, "uv_full row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + let _ = use_simd; // SIMD per-arch routes land in Ship 8 Tranche 6b. + scalar::p_n_444_to_rgba_row::<10>(y, uv_full, rgba_out, width, matrix, full_range); +} + +/// P410 → **native-depth `u16`** packed **RGBA** — output is +/// low-bit-packed (`[0, 1023]`); alpha element is `1023`. +/// +/// `use_simd = false` forces the scalar reference path. 
+#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn p410_to_rgba_u16_row( + y: &[u16], + uv_full: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + let rgba_min = rgba_row_elems(width); + let uv_min = uv_full_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(uv_full.len() >= uv_min, "uv_full row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + let _ = use_simd; // SIMD per-arch routes land in Ship 8 Tranche 6c. + scalar::p_n_444_to_rgba_u16_row::<10>(y, uv_full, rgba_out, width, matrix, full_range); +} + +/// P412 (semi-planar 4:4:4, 12-bit high-packed) → packed **8-bit** +/// **RGBA** (`R, G, B, 0xFF`). +/// +/// `use_simd = false` forces the scalar reference path. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn p412_to_rgba_row( + y: &[u16], + uv_full: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + let rgba_min = rgba_row_bytes(width); + let uv_min = uv_full_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(uv_full.len() >= uv_min, "uv_full row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + let _ = use_simd; // SIMD per-arch routes land in Ship 8 Tranche 6b. + scalar::p_n_444_to_rgba_row::<12>(y, uv_full, rgba_out, width, matrix, full_range); +} + +/// P412 → **native-depth `u16`** packed **RGBA** — output is +/// low-bit-packed (`[0, 4095]`); alpha element is `4095`. +/// +/// `use_simd = false` forces the scalar reference path. 
+#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn p412_to_rgba_u16_row( + y: &[u16], + uv_full: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + let rgba_min = rgba_row_elems(width); + let uv_min = uv_full_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(uv_full.len() >= uv_min, "uv_full row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + let _ = use_simd; // SIMD per-arch routes land in Ship 8 Tranche 6c. + scalar::p_n_444_to_rgba_u16_row::<12>(y, uv_full, rgba_out, width, matrix, full_range); +} + +/// P416 (semi-planar 4:4:4, 16-bit) → packed **8-bit** **RGBA** +/// (`R, G, B, 0xFF`). Routes through the dedicated 16-bit scalar +/// kernel (`scalar::p_n_444_16_to_rgba_row`). +/// +/// `use_simd = false` forces the scalar reference path. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn p416_to_rgba_row( + y: &[u16], + uv_full: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + let rgba_min = rgba_row_bytes(width); + let uv_min = uv_full_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(uv_full.len() >= uv_min, "uv_full row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + let _ = use_simd; // SIMD per-arch routes land in Ship 8 Tranche 6b. + scalar::p_n_444_16_to_rgba_row(y, uv_full, rgba_out, width, matrix, full_range); +} + +/// P416 → **native-depth `u16`** packed **RGBA** — full-range output +/// `[0, 65535]`; alpha element is `0xFFFF`. Routes through the +/// dedicated 16-bit u16-output scalar kernel +/// (`scalar::p_n_444_16_to_rgba_u16_row`) — i64 chroma multiply. +/// +/// `use_simd = false` forces the scalar reference path. 
+#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn p416_to_rgba_u16_row( + y: &[u16], + uv_full: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + let rgba_min = rgba_row_elems(width); + let uv_min = uv_full_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(uv_full.len() >= uv_min, "uv_full row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + let _ = use_simd; // SIMD per-arch routes land in Ship 8 Tranche 6c. + scalar::p_n_444_16_to_rgba_u16_row(y, uv_full, rgba_out, width, matrix, full_range); +} + /// Converts one row of packed RGB to planar HSV (OpenCV 8‑bit /// encoding). See `scalar::rgb_to_hsv_row` for semantics. /// diff --git a/src/row/scalar.rs b/src/row/scalar.rs index 5390a49..2824ed2 100644 --- a/src/row/scalar.rs +++ b/src/row/scalar.rs @@ -900,10 +900,12 @@ pub(crate) fn yuv_420p_n_to_rgb_or_rgba_u16_row= width`, `u.len() >= width`, `v.len() >= width`, @@ -917,17 +919,66 @@ pub(crate) fn yuv_444p_n_to_rgb_row( width: usize, matrix: ColorMatrix, full_range: bool, +) { + yuv_444p_n_to_rgb_or_rgba_row::(y, u, v, rgb_out, width, matrix, full_range); +} + +/// YUV 4:4:4 planar high‑bit‑depth → **u8** packed **RGBA**. Same +/// numerical contract as [`yuv_444p_n_to_rgb_row`]; the only +/// differences are the per-pixel stride (4 vs 3) and the alpha byte +/// (`0xFF`, opaque). +/// +/// Thin wrapper over [`yuv_444p_n_to_rgb_or_rgba_row`] with `ALPHA = true`. +/// +/// # Panics (debug builds) +/// +/// - `y.len() >= width`, `u.len() >= width`, `v.len() >= width`, +/// `rgba_out.len() >= 4 * width`. 
+#[cfg_attr(not(tarpaulin), inline(always))] +pub(crate) fn yuv_444p_n_to_rgba_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + yuv_444p_n_to_rgb_or_rgba_row::(y, u, v, rgba_out, width, matrix, full_range); +} + +/// Shared kernel for [`yuv_444p_n_to_rgb_row`] (`ALPHA = false`, +/// 3 bpp store) and [`yuv_444p_n_to_rgba_row`] (`ALPHA = true`, +/// 4 bpp store with constant `0xFF` alpha). +/// +/// The compiler monomorphizes into two separate functions; the +/// `if ALPHA` branch is DCE'd at each call site. +/// +/// # Panics (debug builds) +/// +/// - `y.len() >= width`, `u.len() >= width`, `v.len() >= width`, +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. +#[cfg_attr(not(tarpaulin), inline(always))] +pub(crate) fn yuv_444p_n_to_rgb_or_rgba_row( + y: &[u16], + u: &[u16], + v: &[u16], + out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, ) { // Compile-time guard — fails monomorphization for any BITS outside - // {10, 12, 14}. The 16-bit path lives in `yuv_444p16_to_rgb_row` + // {9, 10, 12, 14}. The 16-bit path lives in `yuv_444p16_to_rgb_row` // (i32 u8-output kernel family). Without this guard a caller - // invoking ::<16> would reach the NEON clamp where + // invoking ::<16, _> would reach the NEON clamp where // `(1 << BITS) - 1 as i16` silently wraps to -1. 
const { assert!(BITS == 9 || BITS == 10 || BITS == 12 || BITS == 14) }; + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert!(y.len() >= width, "y row too short"); debug_assert!(u.len() >= width, "u row too short"); debug_assert!(v.len() >= width, "v row too short"); - debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); + debug_assert!(out.len() >= width * bpp, "out row too short"); let coeffs = Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = range_params_n::(full_range); @@ -944,14 +995,19 @@ pub(crate) fn yuv_444p_n_to_rgb_row( let b_chroma = q15_chroma(coeffs.b_u(), u_d, coeffs.b_v(), v_d); let y0 = q15_scale((y[x] & mask) as i32 - y_off, y_scale); - rgb_out[x * 3] = clamp_u8(y0 + r_chroma); - rgb_out[x * 3 + 1] = clamp_u8(y0 + g_chroma); - rgb_out[x * 3 + 2] = clamp_u8(y0 + b_chroma); + out[x * bpp] = clamp_u8(y0 + r_chroma); + out[x * bpp + 1] = clamp_u8(y0 + g_chroma); + out[x * bpp + 2] = clamp_u8(y0 + b_chroma); + if ALPHA { + out[x * bpp + 3] = 0xFF; + } } } /// YUV 4:4:4 planar high‑bit‑depth → **native‑depth `u16`** packed RGB. -/// Const‑generic over `BITS ∈ {10, 12, 14}`. Low‑bit‑packed output. +/// Const‑generic over `BITS ∈ {9, 10, 12, 14}`. Low‑bit‑packed output. +/// +/// Thin wrapper over [`yuv_444p_n_to_rgb_or_rgba_u16_row`] with `ALPHA = false`. /// /// # Panics (debug builds) /// @@ -967,20 +1023,68 @@ pub(crate) fn yuv_444p_n_to_rgb_u16_row( matrix: ColorMatrix, full_range: bool, ) { - // Compile-time guard — see note on `yuv_444p_n_to_rgb_row`. The - // 16-bit u16-output path is `yuv_444p16_to_rgb_u16_row` (i64 + yuv_444p_n_to_rgb_or_rgba_u16_row::(y, u, v, rgb_out, width, matrix, full_range); +} + +/// YUV 4:4:4 planar high‑bit‑depth → **native‑depth `u16`** packed +/// **RGBA**. Same numerical contract as [`yuv_444p_n_to_rgb_u16_row`]; +/// the only differences are the per-pixel stride (4 vs 3 `u16` +/// elements) and the alpha element, `(1 << BITS) - 1` (opaque maximum +/// at the input bit depth). 
+/// +/// Thin wrapper over [`yuv_444p_n_to_rgb_or_rgba_u16_row`] with `ALPHA = true`. +/// +/// # Panics (debug builds) +/// +/// - `y.len() >= width`, `u.len() >= width`, `v.len() >= width`, +/// `rgba_out.len() >= 4 * width`. +#[cfg_attr(not(tarpaulin), inline(always))] +pub(crate) fn yuv_444p_n_to_rgba_u16_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + yuv_444p_n_to_rgb_or_rgba_u16_row::(y, u, v, rgba_out, width, matrix, full_range); +} + +/// Shared kernel for [`yuv_444p_n_to_rgb_u16_row`] (`ALPHA = false`, +/// 3 bpp store) and [`yuv_444p_n_to_rgba_u16_row`] (`ALPHA = true`, +/// 4 bpp store with opaque alpha = `(1 << BITS) - 1`). +/// +/// # Panics (debug builds) +/// +/// - `y.len() >= width`, `u.len() >= width`, `v.len() >= width`, +/// `out.len() >= width * if ALPHA { 4 } else { 3 }` (`u16` elements). +#[cfg_attr(not(tarpaulin), inline(always))] +pub(crate) fn yuv_444p_n_to_rgb_or_rgba_u16_row( + y: &[u16], + u: &[u16], + v: &[u16], + out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // Compile-time guard — see note on `yuv_444p_n_to_rgb_or_rgba_row`. + // The 16-bit u16-output path is `yuv_444p16_to_rgb_u16_row` (i64 // chroma family). 
const { assert!(BITS == 9 || BITS == 10 || BITS == 12 || BITS == 14) }; + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert!(y.len() >= width, "y row too short"); debug_assert!(u.len() >= width, "u row too short"); debug_assert!(v.len() >= width, "v row too short"); - debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); + debug_assert!(out.len() >= width * bpp, "out row too short"); let coeffs = Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = range_params_n::(full_range); let bias = chroma_bias::(); let out_max: i32 = (1i32 << BITS) - 1; let mask = bits_mask::(); + let alpha_max: u16 = out_max as u16; for x in 0..width { let u_d = q15_scale((u[x] & mask) as i32 - bias, c_scale); @@ -991,9 +1095,12 @@ pub(crate) fn yuv_444p_n_to_rgb_u16_row( let b_chroma = q15_chroma(coeffs.b_u(), u_d, coeffs.b_v(), v_d); let y0 = q15_scale((y[x] & mask) as i32 - y_off, y_scale); - rgb_out[x * 3] = (y0 + r_chroma).clamp(0, out_max) as u16; - rgb_out[x * 3 + 1] = (y0 + g_chroma).clamp(0, out_max) as u16; - rgb_out[x * 3 + 2] = (y0 + b_chroma).clamp(0, out_max) as u16; + out[x * bpp] = (y0 + r_chroma).clamp(0, out_max) as u16; + out[x * bpp + 1] = (y0 + g_chroma).clamp(0, out_max) as u16; + out[x * bpp + 2] = (y0 + b_chroma).clamp(0, out_max) as u16; + if ALPHA { + out[x * bpp + 3] = alpha_max; + } } } @@ -1257,6 +1364,8 @@ pub(crate) fn yuv_420p16_to_rgb_or_rgba_u16_row( /// YUV 4:4:4 planar **16‑bit** → packed **8‑bit** RGB. Same i32 /// chroma pipeline as 10/12/14 (output‑range scaling keeps `coeff × u_d` /// inside i32 for u8 target). 1:1 chroma per Y pixel, no width parity. +/// +/// Thin wrapper over [`yuv_444p16_to_rgb_or_rgba_row`] with `ALPHA = false`. 
#[cfg_attr(not(tarpaulin), inline(always))] pub(crate) fn yuv_444p16_to_rgb_row( y: &[u16], @@ -1267,10 +1376,50 @@ pub(crate) fn yuv_444p16_to_rgb_row( matrix: ColorMatrix, full_range: bool, ) { + yuv_444p16_to_rgb_or_rgba_row::(y, u, v, rgb_out, width, matrix, full_range); +} + +/// YUV 4:4:4 planar **16‑bit** → packed **8‑bit** **RGBA**. Same +/// numerical contract as [`yuv_444p16_to_rgb_row`]; the only +/// differences are the per-pixel stride (4 vs 3) and the alpha byte +/// (`0xFF`, opaque). +/// +/// Thin wrapper over [`yuv_444p16_to_rgb_or_rgba_row`] with `ALPHA = true`. +#[cfg_attr(not(tarpaulin), inline(always))] +pub(crate) fn yuv_444p16_to_rgba_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + yuv_444p16_to_rgb_or_rgba_row::(y, u, v, rgba_out, width, matrix, full_range); +} + +/// Shared 16-bit YUV 4:4:4 → 8-bit RGB / RGBA kernel. `ALPHA = false` +/// emits 3 bpp; `ALPHA = true` emits 4 bpp with constant `0xFF` alpha. +/// +/// 16-bit input has no AND-mask (every `u16` is a valid sample) and +/// uses i32 chroma — output-target scaling keeps `u_d * coeff` inside +/// i32 for u8 output (the i64 chroma family lives in +/// [`yuv_444p16_to_rgb_or_rgba_u16_row`]). 
+#[cfg_attr(not(tarpaulin), inline(always))] +pub(crate) fn yuv_444p16_to_rgb_or_rgba_row( + y: &[u16], + u: &[u16], + v: &[u16], + out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert!(y.len() >= width, "y row too short"); debug_assert!(u.len() >= width, "u row too short"); debug_assert!(v.len() >= width, "v row too short"); - debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); + debug_assert!(out.len() >= width * bpp, "out row too short"); let coeffs = Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = range_params_n::<16, 8>(full_range); @@ -1285,9 +1434,12 @@ pub(crate) fn yuv_444p16_to_rgb_row( let b_chroma = q15_chroma(coeffs.b_u(), u_d, coeffs.b_v(), v_d); let y0 = q15_scale(y[x] as i32 - y_off, y_scale); - rgb_out[x * 3] = clamp_u8(y0 + r_chroma); - rgb_out[x * 3 + 1] = clamp_u8(y0 + g_chroma); - rgb_out[x * 3 + 2] = clamp_u8(y0 + b_chroma); + out[x * bpp] = clamp_u8(y0 + r_chroma); + out[x * bpp + 1] = clamp_u8(y0 + g_chroma); + out[x * bpp + 2] = clamp_u8(y0 + b_chroma); + if ALPHA { + out[x * bpp + 3] = 0xFF; + } } } @@ -1296,6 +1448,8 @@ pub(crate) fn yuv_444p16_to_rgb_row( /// ~2.31·10⁹ at limited‑range 16→u16 — overflows i32). Y path widens /// via [`q15_scale64`] to handle unclamped Y samples above the /// limited‑range nominal max. +/// +/// Thin wrapper over [`yuv_444p16_to_rgb_or_rgba_u16_row`] with `ALPHA = false`. #[cfg_attr(not(tarpaulin), inline(always))] pub(crate) fn yuv_444p16_to_rgb_u16_row( y: &[u16], @@ -1306,10 +1460,47 @@ pub(crate) fn yuv_444p16_to_rgb_u16_row( matrix: ColorMatrix, full_range: bool, ) { + yuv_444p16_to_rgb_or_rgba_u16_row::(y, u, v, rgb_out, width, matrix, full_range); +} + +/// YUV 4:4:4 planar **16‑bit** → packed **native‑depth `u16`** **RGBA** +/// — alpha element is `0xFFFF` (opaque maximum at 16‑bit). +/// +/// Thin wrapper over [`yuv_444p16_to_rgb_or_rgba_u16_row`] with `ALPHA = true`. 
+#[cfg_attr(not(tarpaulin), inline(always))] +pub(crate) fn yuv_444p16_to_rgba_u16_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + yuv_444p16_to_rgb_or_rgba_u16_row::(y, u, v, rgba_out, width, matrix, full_range); +} + +/// Shared 16-bit YUV 4:4:4 → native-depth `u16` RGB / RGBA kernel. +/// `ALPHA = false` emits 3 bpp; `ALPHA = true` emits 4 bpp with +/// constant `0xFFFF` alpha. +/// +/// Uses i64 chroma multiply (same rationale as +/// [`yuv_444p16_to_rgb_u16_row`]). +#[cfg_attr(not(tarpaulin), inline(always))] +pub(crate) fn yuv_444p16_to_rgb_or_rgba_u16_row( + y: &[u16], + u: &[u16], + v: &[u16], + out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert!(y.len() >= width, "y row too short"); debug_assert!(u.len() >= width, "u row too short"); debug_assert!(v.len() >= width, "v row too short"); - debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); + debug_assert!(out.len() >= width * bpp, "out row too short"); let coeffs = Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = range_params_n::<16, 16>(full_range); @@ -1325,9 +1516,12 @@ pub(crate) fn yuv_444p16_to_rgb_u16_row( let b_chroma = q15_chroma64(coeffs.b_u(), u_d, coeffs.b_v(), v_d); let y0 = q15_scale64(y[x] as i32 - y_off, y_scale); - rgb_out[x * 3] = (y0 + r_chroma).clamp(0, out_max) as u16; - rgb_out[x * 3 + 1] = (y0 + g_chroma).clamp(0, out_max) as u16; - rgb_out[x * 3 + 2] = (y0 + b_chroma).clamp(0, out_max) as u16; + out[x * bpp] = (y0 + r_chroma).clamp(0, out_max) as u16; + out[x * bpp + 1] = (y0 + g_chroma).clamp(0, out_max) as u16; + out[x * bpp + 2] = (y0 + b_chroma).clamp(0, out_max) as u16; + if ALPHA { + out[x * bpp + 3] = 0xFFFF; + } } } @@ -1800,6 +1994,8 @@ pub(crate) fn p_n_to_rgb_or_rgba_u16_row( /// shifted right by `16 - BITS` to extract the active value before /// running the standard 
Q15 i32 pipeline. /// +/// Thin wrapper over [`p_n_444_to_rgb_or_rgba_row`] with `ALPHA = false`. +/// /// # Panics (debug builds) /// /// - `y.len() >= width`, `uv_full.len() >= 2 * width`, @@ -1812,11 +2008,55 @@ pub(crate) fn p_n_444_to_rgb_row( width: usize, matrix: ColorMatrix, full_range: bool, +) { + p_n_444_to_rgb_or_rgba_row::(y, uv_full, rgb_out, width, matrix, full_range); +} + +/// Converts one row of high-bit-packed semi-planar 4:4:4 (P410, P412) +/// to **8-bit** packed **RGBA**. Same numerical contract as +/// [`p_n_444_to_rgb_row`]; the only differences are the per-pixel +/// stride (4 vs 3) and the alpha byte (`0xFF`, opaque). +/// +/// Thin wrapper over [`p_n_444_to_rgb_or_rgba_row`] with `ALPHA = true`. +/// +/// # Panics (debug builds) +/// +/// - `y.len() >= width`, `uv_full.len() >= 2 * width`, +/// `rgba_out.len() >= 4 * width`. +#[cfg_attr(not(tarpaulin), inline(always))] +pub(crate) fn p_n_444_to_rgba_row( + y: &[u16], + uv_full: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + p_n_444_to_rgb_or_rgba_row::(y, uv_full, rgba_out, width, matrix, full_range); +} + +/// Shared kernel for [`p_n_444_to_rgb_row`] (`ALPHA = false`, 3 bpp +/// store) and [`p_n_444_to_rgba_row`] (`ALPHA = true`, 4 bpp store +/// with constant `0xFF` alpha). +/// +/// # Panics (debug builds) +/// +/// - `y.len() >= width`, `uv_full.len() >= 2 * width`, +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. 
+#[cfg_attr(not(tarpaulin), inline(always))] +pub(crate) fn p_n_444_to_rgb_or_rgba_row( + y: &[u16], + uv_full: &[u16], + out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, ) { const { assert!(BITS == 10 || BITS == 12) }; + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert!(y.len() >= width, "y row too short"); debug_assert!(uv_full.len() >= 2 * width, "uv_full row too short"); - debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); + debug_assert!(out.len() >= width * bpp, "out row too short"); let coeffs = Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = range_params_n::(full_range); @@ -1835,9 +2075,12 @@ pub(crate) fn p_n_444_to_rgb_row( let b_chroma = q15_chroma(coeffs.b_u(), u_d, coeffs.b_v(), v_d); let y0 = q15_scale((y[x] >> shift) as i32 - y_off, y_scale); - rgb_out[x * 3] = clamp_u8(y0 + r_chroma); - rgb_out[x * 3 + 1] = clamp_u8(y0 + g_chroma); - rgb_out[x * 3 + 2] = clamp_u8(y0 + b_chroma); + out[x * bpp] = clamp_u8(y0 + r_chroma); + out[x * bpp + 1] = clamp_u8(y0 + g_chroma); + out[x * bpp + 2] = clamp_u8(y0 + b_chroma); + if ALPHA { + out[x * bpp + 3] = 0xFF; + } } } @@ -1847,6 +2090,8 @@ pub(crate) fn p_n_444_to_rgb_row( /// zero), matching the [`yuv_444p_n_to_rgb_u16_row`] convention. /// `BITS ∈ {10, 12}`. /// +/// Thin wrapper over [`p_n_444_to_rgb_or_rgba_u16_row`] with `ALPHA = false`. +/// /// # Panics (debug builds) /// /// - `y.len() >= width`, `uv_full.len() >= 2 * width`, @@ -1859,17 +2104,62 @@ pub(crate) fn p_n_444_to_rgb_u16_row( width: usize, matrix: ColorMatrix, full_range: bool, +) { + p_n_444_to_rgb_or_rgba_u16_row::(y, uv_full, rgb_out, width, matrix, full_range); +} + +/// Converts one row of high-bit-packed semi-planar 4:4:4 (P410, P412) +/// to **native-depth `u16`** packed **RGBA** — low-bit-packed output; +/// alpha element is `(1 << BITS) - 1` (opaque maximum at the input +/// bit depth). 
+/// +/// Thin wrapper over [`p_n_444_to_rgb_or_rgba_u16_row`] with `ALPHA = true`. +/// +/// # Panics (debug builds) +/// +/// - `y.len() >= width`, `uv_full.len() >= 2 * width`, +/// `rgba_out.len() >= 4 * width`. +#[cfg_attr(not(tarpaulin), inline(always))] +pub(crate) fn p_n_444_to_rgba_u16_row( + y: &[u16], + uv_full: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + p_n_444_to_rgb_or_rgba_u16_row::(y, uv_full, rgba_out, width, matrix, full_range); +} + +/// Shared kernel for [`p_n_444_to_rgb_u16_row`] (`ALPHA = false`, +/// 3 bpp store) and [`p_n_444_to_rgba_u16_row`] (`ALPHA = true`, +/// 4 bpp store with opaque alpha = `(1 << BITS) - 1`). +/// +/// # Panics (debug builds) +/// +/// - `y.len() >= width`, `uv_full.len() >= 2 * width`, +/// `out.len() >= width * if ALPHA { 4 } else { 3 }` (`u16` elements). +#[cfg_attr(not(tarpaulin), inline(always))] +pub(crate) fn p_n_444_to_rgb_or_rgba_u16_row( + y: &[u16], + uv_full: &[u16], + out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, ) { const { assert!(BITS == 10 || BITS == 12) }; + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert!(y.len() >= width, "y row too short"); debug_assert!(uv_full.len() >= 2 * width, "uv_full row too short"); - debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); + debug_assert!(out.len() >= width * bpp, "out row too short"); let coeffs = Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = range_params_n::(full_range); let bias = chroma_bias::(); let out_max: i32 = (1i32 << BITS) - 1; let shift = 16 - BITS; + let alpha_max: u16 = out_max as u16; for x in 0..width { let u_sample = uv_full[x * 2] >> shift; @@ -1882,9 +2172,12 @@ pub(crate) fn p_n_444_to_rgb_u16_row( let b_chroma = q15_chroma(coeffs.b_u(), u_d, coeffs.b_v(), v_d); let y0 = q15_scale((y[x] >> shift) as i32 - y_off, y_scale); - rgb_out[x * 3] = (y0 + r_chroma).clamp(0, out_max) as u16; - rgb_out[x * 3 + 1] = (y0 + 
g_chroma).clamp(0, out_max) as u16; - rgb_out[x * 3 + 2] = (y0 + b_chroma).clamp(0, out_max) as u16; + out[x * bpp] = (y0 + r_chroma).clamp(0, out_max) as u16; + out[x * bpp + 1] = (y0 + g_chroma).clamp(0, out_max) as u16; + out[x * bpp + 2] = (y0 + b_chroma).clamp(0, out_max) as u16; + if ALPHA { + out[x * bpp + 3] = alpha_max; + } } } @@ -1892,6 +2185,8 @@ pub(crate) fn p_n_444_to_rgb_u16_row<const BITS: usize>( /// **8-bit** packed RGB. Y and chroma both stay on i32 — same logic /// as `p16_to_rgb_row` plus the full-width UV layout. /// +/// Thin wrapper over [`p_n_444_16_to_rgb_or_rgba_row`] with `ALPHA = false`. +/// /// # Panics (debug builds) /// /// - `y.len() >= width`, `uv_full.len() >= 2 * width`, @@ -1905,9 +2200,42 @@ pub(crate) fn p_n_444_16_to_rgb_row( matrix: ColorMatrix, full_range: bool, ) { + p_n_444_16_to_rgb_or_rgba_row::<false>(y, uv_full, rgb_out, width, matrix, full_range); +} + +/// Converts one row of P416 to **8-bit** packed **RGBA**. Same +/// numerical contract as [`p_n_444_16_to_rgb_row`]; the only +/// differences are the per-pixel stride (4 vs 3) and the alpha byte +/// (`0xFF`, opaque). +/// +/// Thin wrapper over [`p_n_444_16_to_rgb_or_rgba_row`] with `ALPHA = true`. +#[cfg_attr(not(tarpaulin), inline(always))] +pub(crate) fn p_n_444_16_to_rgba_row( + y: &[u16], + uv_full: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + p_n_444_16_to_rgb_or_rgba_row::<true>(y, uv_full, rgba_out, width, matrix, full_range); +} + +/// Shared P416 → 8-bit RGB / RGBA kernel. `ALPHA = false` emits 3 bpp; +/// `ALPHA = true` emits 4 bpp with constant `0xFF` alpha. 
+#[cfg_attr(not(tarpaulin), inline(always))] +pub(crate) fn p_n_444_16_to_rgb_or_rgba_row<const ALPHA: bool>( + y: &[u16], + uv_full: &[u16], + out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert!(y.len() >= width, "y row too short"); debug_assert!(uv_full.len() >= 2 * width, "uv_full row too short"); - debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); + debug_assert!(out.len() >= width * bpp, "out row too short"); let coeffs = Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = range_params_n::<16, 8>(full_range); @@ -1924,9 +2252,12 @@ pub(crate) fn p_n_444_16_to_rgb_row( let b_chroma = q15_chroma(coeffs.b_u(), u_d, coeffs.b_v(), v_d); let y0 = q15_scale(y[x] as i32 - y_off, y_scale); - rgb_out[x * 3] = clamp_u8(y0 + r_chroma); - rgb_out[x * 3 + 1] = clamp_u8(y0 + g_chroma); - rgb_out[x * 3 + 2] = clamp_u8(y0 + b_chroma); + out[x * bpp] = clamp_u8(y0 + r_chroma); + out[x * bpp + 1] = clamp_u8(y0 + g_chroma); + out[x * bpp + 2] = clamp_u8(y0 + b_chroma); + if ALPHA { + out[x * bpp + 3] = 0xFF; + } } } @@ -1936,6 +2267,8 @@ pub(crate) fn p_n_444_16_to_rgb_row( /// `yuv_444p16_to_rgb_u16_row`: `coeff × u_d` overflows i32 at 16 /// bits for the BT.2020 blue coefficient). /// +/// Thin wrapper over [`p_n_444_16_to_rgb_or_rgba_u16_row`] with `ALPHA = false`. +/// /// # Panics (debug builds) /// /// - `y.len() >= width`, `uv_full.len() >= 2 * width`, @@ -1949,9 +2282,43 @@ pub(crate) fn p_n_444_16_to_rgb_u16_row( matrix: ColorMatrix, full_range: bool, ) { + p_n_444_16_to_rgb_or_rgba_u16_row::<false>(y, uv_full, rgb_out, width, matrix, full_range); +} + +/// Converts one row of P416 to **native-depth `u16`** packed +/// **RGBA** — full-range output `[0, 65535]`; alpha element is +/// `0xFFFF`. +/// +/// Thin wrapper over [`p_n_444_16_to_rgb_or_rgba_u16_row`] with `ALPHA = true`. 
+#[cfg_attr(not(tarpaulin), inline(always))] +pub(crate) fn p_n_444_16_to_rgba_u16_row( + y: &[u16], + uv_full: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + p_n_444_16_to_rgb_or_rgba_u16_row::<true>(y, uv_full, rgba_out, width, matrix, full_range); +} + +/// Shared P416 → native-depth `u16` RGB / RGBA kernel. `ALPHA = false` +/// emits 3 bpp; `ALPHA = true` emits 4 bpp with constant `0xFFFF` +/// alpha. Uses i64 chroma multiply (same rationale as +/// [`p_n_444_16_to_rgb_u16_row`]). +#[cfg_attr(not(tarpaulin), inline(always))] +pub(crate) fn p_n_444_16_to_rgb_or_rgba_u16_row<const ALPHA: bool>( + y: &[u16], + uv_full: &[u16], + out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert!(y.len() >= width, "y row too short"); debug_assert!(uv_full.len() >= 2 * width, "uv_full row too short"); - debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); + debug_assert!(out.len() >= width * bpp, "out row too short"); let coeffs = Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = range_params_n::<16, 16>(full_range); @@ -1969,9 +2336,12 @@ pub(crate) fn p_n_444_16_to_rgb_u16_row( let b_chroma = q15_chroma64(coeffs.b_u(), u_d, coeffs.b_v(), v_d); let y0 = q15_scale64(y[x] as i32 - y_off, y_scale); - rgb_out[x * 3] = (y0 + r_chroma).clamp(0, out_max) as u16; - rgb_out[x * 3 + 1] = (y0 + g_chroma).clamp(0, out_max) as u16; - rgb_out[x * 3 + 2] = (y0 + b_chroma).clamp(0, out_max) as u16; + out[x * bpp] = (y0 + r_chroma).clamp(0, out_max) as u16; + out[x * bpp + 1] = (y0 + g_chroma).clamp(0, out_max) as u16; + out[x * bpp + 2] = (y0 + b_chroma).clamp(0, out_max) as u16; + if ALPHA { + out[x * bpp + 3] = 0xFFFF; + } } } diff --git a/src/row/scalar/tests.rs b/src/row/scalar/tests.rs index 46352e2..b3eca8c 100644 --- a/src/row/scalar/tests.rs +++ b/src/row/scalar/tests.rs @@ -487,3 +487,150 @@ fn p010_rgb_u16_limited_range_endpoints() { 
assert_eq!((rgb[0], rgb[1], rgb[2]), (0, 0, 0)); assert_eq!((rgb[3], rgb[4], rgb[5]), (1023, 1023, 1023)); } + +// ---- yuv_444p_n_to_rgba_row (10-bit → u8 RGBA) ---------------------- + +#[test] +fn yuv444p10_rgba_gray_alpha_is_ff() { + // Mid-gray 10-bit Y=512 ↔ 8-bit ≈128. RGBA stride is 4 bytes/px; + // alpha must be 0xFF on every pixel. + let y = [512u16; 4]; + let u = [512u16; 4]; + let v = [512u16; 4]; + let mut rgba = [0u8; 16]; + yuv_444p_n_to_rgba_row::<10>(&y, &u, &v, &mut rgba, 4, ColorMatrix::Bt601, true); + for x in 0..4 { + let (r, g, b, a) = ( + rgba[x * 4], + rgba[x * 4 + 1], + rgba[x * 4 + 2], + rgba[x * 4 + 3], + ); + assert_eq!(r, g, "RGB should be gray"); + assert_eq!(g, b, "RGB should be gray"); + assert!(r.abs_diff(128) <= 1, "got R={r}"); + assert_eq!(a, 0xFF, "alpha must be 0xFF at px {x}"); + } +} + +// ---- yuv_444p_n_to_rgba_u16_row (10-bit → 10-bit u16 RGBA) --------- + +#[test] +fn yuv444p10_rgba_u16_gray_alpha_is_1023() { + // 10-bit u16 RGBA: alpha element is `(1 << BITS) - 1 = 1023`. + let y = [512u16; 4]; + let u = [512u16; 4]; + let v = [512u16; 4]; + let mut rgba = [0u16; 16]; + yuv_444p_n_to_rgba_u16_row::<10>(&y, &u, &v, &mut rgba, 4, ColorMatrix::Bt601, true); + for x in 0..4 { + let (r, g, b, a) = ( + rgba[x * 4], + rgba[x * 4 + 1], + rgba[x * 4 + 2], + rgba[x * 4 + 3], + ); + assert_eq!(r, g); + assert_eq!(g, b); + assert!(r.abs_diff(512) <= 1, "got R={r}"); + assert_eq!(a, 1023, "alpha must be (1 << 10) - 1 at px {x}"); + } +} + +// ---- yuv_444p16_to_rgba_row (16-bit → u8 RGBA) ---------------------- + +#[test] +fn yuv444p16_rgba_gray_alpha_is_ff() { + // 16-bit mid-gray Y = 0x8000 → 8-bit ≈128. Alpha = 0xFF. 
+ let y = [0x8000u16; 4]; + let u = [0x8000u16; 4]; + let v = [0x8000u16; 4]; + let mut rgba = [0u8; 16]; + yuv_444p16_to_rgba_row(&y, &u, &v, &mut rgba, 4, ColorMatrix::Bt601, true); + for x in 0..4 { + let (r, g, b, a) = ( + rgba[x * 4], + rgba[x * 4 + 1], + rgba[x * 4 + 2], + rgba[x * 4 + 3], + ); + assert_eq!(r, g); + assert_eq!(g, b); + assert!(r.abs_diff(128) <= 1, "got R={r}"); + assert_eq!(a, 0xFF, "alpha must be 0xFF at px {x}"); + } +} + +// ---- yuv_444p16_to_rgba_u16_row (16-bit → 16-bit u16 RGBA) --------- + +#[test] +fn yuv444p16_rgba_u16_gray_alpha_is_ffff() { + // 16-bit u16 RGBA: alpha element is `0xFFFF`. + let y = [0x8000u16; 4]; + let u = [0x8000u16; 4]; + let v = [0x8000u16; 4]; + let mut rgba = [0u16; 16]; + yuv_444p16_to_rgba_u16_row(&y, &u, &v, &mut rgba, 4, ColorMatrix::Bt601, true); + for x in 0..4 { + let (r, g, b, a) = ( + rgba[x * 4], + rgba[x * 4 + 1], + rgba[x * 4 + 2], + rgba[x * 4 + 3], + ); + assert_eq!(r, g); + assert_eq!(g, b); + // Y=0x8000 in full-range 16→16 maps near 32768; allow rounding. + assert!(r.abs_diff(0x8000) <= 1, "got R={r}"); + assert_eq!(a, 0xFFFF, "alpha must be 0xFFFF at px {x}"); + } +} + +// ---- p_n_444_to_rgba_row (P410 → u8 RGBA) --------------------------- + +#[test] +fn p410_rgba_gray_alpha_is_ff() { + // P410: 10 active bits in HIGH 10 of each u16. Mid-gray 10-bit + // Y=512 → P410 Y = 0x8000. UV interleaved: U V U V ... full width. + let y = [0x8000u16; 4]; + // 4 pixels × (U,V) per pixel = 8 elements. 
+ let uv = [0x8000u16; 8]; + let mut rgba = [0u8; 16]; + p_n_444_to_rgba_row::<10>(&y, &uv, &mut rgba, 4, ColorMatrix::Bt601, true); + for x in 0..4 { + let (r, g, b, a) = ( + rgba[x * 4], + rgba[x * 4 + 1], + rgba[x * 4 + 2], + rgba[x * 4 + 3], + ); + assert_eq!(r, g); + assert_eq!(g, b); + assert!(r.abs_diff(128) <= 1, "got R={r}"); + assert_eq!(a, 0xFF, "alpha must be 0xFF at px {x}"); + } +} + +// ---- p_n_444_16_to_rgba_u16_row (P416 → 16-bit u16 RGBA) ----------- + +#[test] +fn p416_rgba_u16_gray_alpha_is_ffff() { + // P416: full 16-bit samples. Mid-gray Y=0x8000, neutral UV=0x8000. + // 16-bit u16 RGBA: alpha element is `0xFFFF`. + let y = [0x8000u16; 4]; + let uv = [0x8000u16; 8]; + let mut rgba = [0u16; 16]; + p_n_444_16_to_rgba_u16_row(&y, &uv, &mut rgba, 4, ColorMatrix::Bt601, true); + for x in 0..4 { + let (r, g, b, a) = ( + rgba[x * 4], + rgba[x * 4 + 1], + rgba[x * 4 + 2], + rgba[x * 4 + 3], + ); + assert_eq!(r, g); + assert_eq!(g, b); + assert!(r.abs_diff(0x8000) <= 1, "got R={r}"); + assert_eq!(a, 0xFFFF, "alpha must be 0xFFFF at px {x}"); + } +}