diff --git a/CHANGELOG.md b/CHANGELOG.md index 4be82d2..1ae086b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -199,20 +199,22 @@ backends are wired in follow-up sub-PRs without breaking call sites.** | 4c | 4:4:0 planar | `Yuv440p` | ✅ shipped (PR #22) — wiring-only (reuses `yuv_444_to_rgba_row`) | | 5 | High-bit 4:2:0 | `Yuv420p9/10/12/14/16`, `P010/P012/P016` | ✅ shipped — **5** scalar prep + dispatchers (PR #24); **5a** u8 SIMD across all 5 backends (PR #25); **5b** u16 SIMD + sinker integration (PR #26) | | 6 | High-bit 4:2:2 | `Yuv422p9/10/12/14/16`, `P210/P212/P216` | ✅ shipped (PR #28) — sinker-only; reuses tranche-5 row kernels via the established 4:2:2 → 4:2:0 dispatcher pattern. (`Yuv440p10/12` deferred to tranche 7 alongside the 4:4:4 work it depends on.) | -| 7 | High-bit 4:4:4 + 4:4:0 | `Yuv444p9/10/12/14/16`, `P410/P412/P416`, `Yuv440p10/12` | ⏳ **in progress** — **7** scalar prep + dispatchers shipped (PR #29; `use_simd` parameter held in the signature but routes to scalar until 7b/7c wire SIMD). 7b u8 SIMD pending; 7c u16 SIMD + sinker integration pending. | +| 7 | High-bit 4:4:4 + 4:4:0 | `Yuv444p9/10/12/14/16`, `P410/P412/P416`, `Yuv440p10/12` | ✅ shipped — **7** scalar prep + dispatchers (PR #29); **7b** u8 SIMD across all 5 backends (PR #30); **7c** u16 SIMD + sinker integration incl. `Yuv440p10/12` reusing 4:4:4 dispatchers (PR #31) | | 8 | RAW | `Bayer`, `Bayer16` | (deferred — RAW already has `with_luma_coefficients`) | ### SIMD coverage -For tranches 1–6 (everything shipped): all 5 backends (NEON, SSE4.1, -AVX2, AVX-512, wasm simd128) have the const-ALPHA `<…, ALPHA>` template -wired for both u8 and u16 RGBA paths. Per-arch RGBA store helpers added -where needed: `vst4q_u8` / `vst4q_u16` (NEON), `write_rgba_16` / -`write_rgba_u16_8` (SSE4.1, AVX2 via re-export), `write_rgba_64` / -`write_rgba_u16_32` + `write_quarter_rgba` (AVX-512), `u8x16_splat` / -`i16x8_shuffle`-based `write_rgba_u16_8` (wasm). 
- -For tranche 7: scalar-only as of PR #29. SIMD backends land in 7b/7c. +**All 7 tranches (Ship 8 complete)**: 5 backends (NEON, SSE4.1, AVX2, +AVX-512, wasm simd128) have the const-ALPHA `<…, ALPHA>` template +wired for both u8 and u16 RGBA paths across every high-bit kernel +family (4:2:0 in tranche 5; 4:4:4 + Pn-444 in tranche 7). 4:2:2 and +4:4:0 sinkers reuse 4:2:0 / 4:4:4 dispatchers respectively — no new +SIMD code needed for those subsampling families. Per-arch RGBA store +helpers added in tranche 5: `vst4q_u8` / `vst4q_u16` (NEON), +`write_rgba_16` / `write_rgba_u16_8` (SSE4.1, AVX2 via re-export), +`write_rgba_64` / `write_rgba_u16_32` + `write_quarter_rgba` +(AVX-512), `u8x16_splat` / `i16x8_shuffle`-based `write_rgba_u16_8` +(wasm). Reused verbatim across tranches 5–7. ### Cleanup PRs @@ -226,17 +228,20 @@ For tranche 7: scalar-only as of PR #29. SIMD backends land in 7b/7c. (`src/frame.rs`, `src/raw/types.rs`, `src/raw/bayer.rs`, `src/raw/bayer16.rs`) into sibling files. Same shape as PR #21. -### Tests (cumulative through PR #29) - -- **513 tests pass on aarch64-darwin** (host) at the end of tranche 7 - scalar prep; +6 since tranche 6 (PR #28: 507) for the new 4:4:4 - scalar reference paths. -- Per-arch RGBA equivalence tests: 30 tests × 5 backends per high-bit - family (Tranche 5 added BITS=9/10/12/14 + 16 + Pn for both u8 and - u16 paths, all matrices × ranges × tail widths). -- Sinker integration tests: 8 new in PR #26 (4:2:0), 8 in PR #28 - (4:2:2), 6 in PR #29 (4:4:4 scalar). Cover both standalone-RGBA - and Strategy A combine paths plus buffer-too-short error variants. +### Tests (cumulative through PR #31, Ship 8 complete) + +- **534 tests pass on aarch64-darwin** (host) at Ship 8 close; + trajectory: 507 (PR #28, 4:2:2 sinker) → 513 (PR #29, 4:4:4 scalar + prep) → 519 (PR #30, 4:4:4 u8 SIMD) → 534 (PR #31, 4:4:4 u16 SIMD + + sinker). 
+- Per-arch RGBA equivalence tests: ~30 per high-bit family across all + 5 backends — tranche 5 added 4:2:0 (u8 + u16, BITS=9/10/12/14 + 16 + + Pn); tranche 7b/7c added 4:4:4 (u8 + u16, BITS=9/10/12/14 + 16 + + Pn-444). All matrices × ranges × natural-block + tail widths. +- Sinker integration tests: 8 in PR #26 (4:2:0), 8 in PR #28 (4:2:2), + 6 in PR #29 (4:4:4 scalar), 9 in PR #31 (4:4:4 + Yuv440p10 cross- + family kernel-reuse proof). Cover standalone-RGBA, Strategy A + combine, and buffer-too-short error variants. - All x86 `#[test]` functions exercising new SIMD kernels include `is_x86_feature_detected!` early-return guards (per the PR #25 CI fallout — without them, ASAN sanitizer saw `SIGILL` and Miri diff --git a/src/row/arch/neon.rs b/src/row/arch/neon.rs index 3a6acc7..f7972c0 100644 --- a/src/row/arch/neon.rs +++ b/src/row/arch/neon.rs @@ -896,6 +896,8 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_row( width: usize, matrix: ColorMatrix, full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + yuv_444p_n_to_rgb_or_rgba_u16_row::(y, u, v, rgb_out, width, matrix, full_range); + } +} + +/// NEON sibling of [`yuv_444p_n_to_rgba_row`] for native-depth `u16` +/// output. Alpha samples are `(1 << BITS) - 1` (opaque maximum at the +/// input bit depth) — matches `scalar::yuv_444p_n_to_rgba_u16_row`. +/// +/// Thin wrapper over [`yuv_444p_n_to_rgb_or_rgba_u16_row`] with `ALPHA = true`. +/// +/// # Safety +/// +/// Same as [`yuv_444p_n_to_rgb_u16_row`], plus +/// `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "neon")] +pub(crate) unsafe fn yuv_444p_n_to_rgba_u16_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. 
+ unsafe { + yuv_444p_n_to_rgb_or_rgba_u16_row::(y, u, v, rgba_out, width, matrix, full_range); + } +} + +/// Shared NEON high-bit YUV 4:4:4 → native-depth `u16` kernel. +/// `ALPHA = false` writes RGB triples via `vst3q_u16`; `ALPHA = true` +/// writes RGBA quads via `vst4q_u16` with constant alpha +/// `(1 << BITS) - 1`. +/// +/// # Safety +/// +/// 1. **NEON must be available.** +/// 2. `y.len() >= width`, `u.len() >= width`, `v.len() >= width`, +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. +/// 3. `BITS` ∈ `{9, 10, 12, 14}`. +#[inline] +#[target_feature(enable = "neon")] +pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_u16_row( + y: &[u16], + u: &[u16], + v: &[u16], + out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, ) { // Compile-time guard — `out_max = ((1 << BITS) - 1) as i16` below // silently wraps to -1 at BITS=16, corrupting the u16 clamp. The // dedicated 16-bit u16-output path is `yuv_444p16_to_rgb_u16_row`. const { assert!(BITS == 9 || BITS == 10 || BITS == 12 || BITS == 14) }; + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert!(y.len() >= width); debug_assert!(u.len() >= width); debug_assert!(v.len() >= width); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); @@ -940,6 +998,7 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_u16_row( let cgv = vdupq_n_s32(coeffs.g_v()); let cbu = vdupq_n_s32(coeffs.b_u()); let cbv = vdupq_n_s32(coeffs.b_v()); + let alpha_u16 = vdupq_n_u16(out_max as u16); let mut x = 0usize; while x + 16 <= width { @@ -993,24 +1052,36 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_u16_row( let b_lo = clamp_u16_max(vqaddq_s16(y_scaled_lo, b_chroma_lo), zero_v, max_v); let b_hi = clamp_u16_max(vqaddq_s16(y_scaled_hi, b_chroma_hi), zero_v, max_v); - let rgb_lo = uint16x8x3_t(r_lo, g_lo, b_lo); - let rgb_hi = uint16x8x3_t(r_hi, g_hi, b_hi); - 
vst3q_u16(rgb_out.as_mut_ptr().add(x * 3), rgb_lo); - vst3q_u16(rgb_out.as_mut_ptr().add(x * 3 + 24), rgb_hi); + if ALPHA { + let rgba_lo = uint16x8x4_t(r_lo, g_lo, b_lo, alpha_u16); + let rgba_hi = uint16x8x4_t(r_hi, g_hi, b_hi, alpha_u16); + vst4q_u16(out.as_mut_ptr().add(x * 4), rgba_lo); + vst4q_u16(out.as_mut_ptr().add(x * 4 + 32), rgba_hi); + } else { + let rgb_lo = uint16x8x3_t(r_lo, g_lo, b_lo); + let rgb_hi = uint16x8x3_t(r_hi, g_hi, b_hi); + vst3q_u16(out.as_mut_ptr().add(x * 3), rgb_lo); + vst3q_u16(out.as_mut_ptr().add(x * 3 + 24), rgb_hi); + } x += 16; } if x < width { - scalar::yuv_444p_n_to_rgb_u16_row::( - &y[x..width], - &u[x..width], - &v[x..width], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_u = &u[x..width]; + let tail_v = &v[x..width]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::yuv_444p_n_to_rgba_u16_row::( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); + } else { + scalar::yuv_444p_n_to_rgb_u16_row::( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); + } } } } @@ -2846,6 +2917,8 @@ pub(crate) unsafe fn yuv_444p16_to_rgb_or_rgba_row( /// i64 chroma + i64 Y (same widening as `yuv_420p16_to_rgb_u16_row`); /// full-width U/V (no chroma duplication step). /// +/// Thin wrapper over [`yuv_444p16_to_rgb_or_rgba_u16_row`] with `ALPHA = false`. +/// /// # Safety /// /// Same as [`yuv_444p16_to_rgb_row`] but `rgb_out: &mut [u16]`. @@ -2860,10 +2933,63 @@ pub(crate) unsafe fn yuv_444p16_to_rgb_u16_row( matrix: ColorMatrix, full_range: bool, ) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + yuv_444p16_to_rgb_or_rgba_u16_row::(y, u, v, rgb_out, width, matrix, full_range); + } +} + +/// NEON sibling of [`yuv_444p16_to_rgba_row`] for native-depth `u16` +/// output. 
Alpha samples are `0xFFFF` (opaque maximum at u16 range) — +/// matches `scalar::yuv_444p16_to_rgba_u16_row`. +/// +/// Thin wrapper over [`yuv_444p16_to_rgb_or_rgba_u16_row`] with `ALPHA = true`. +/// +/// # Safety +/// +/// Same as [`yuv_444p16_to_rgb_u16_row`] plus `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "neon")] +pub(crate) unsafe fn yuv_444p16_to_rgba_u16_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + yuv_444p16_to_rgb_or_rgba_u16_row::(y, u, v, rgba_out, width, matrix, full_range); + } +} + +/// Shared NEON 16-bit YUV 4:4:4 → native-depth `u16` kernel. +/// `ALPHA = false` writes RGB triples via `vst3q_u16`; `ALPHA = true` +/// writes RGBA quads via `vst4q_u16` with constant alpha `0xFFFF`. +/// +/// # Safety +/// +/// 1. **NEON must be available.** +/// 2. `y.len() >= width`, `u.len() >= width`, `v.len() >= width`, +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. 
+#[inline] +#[target_feature(enable = "neon")] +pub(crate) unsafe fn yuv_444p16_to_rgb_or_rgba_u16_row( + y: &[u16], + u: &[u16], + v: &[u16], + out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert!(y.len() >= width); debug_assert!(u.len() >= width); debug_assert!(v.len() >= width); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::<16, 16>(full_range); @@ -2883,6 +3009,7 @@ pub(crate) unsafe fn yuv_444p16_to_rgb_u16_row( let cgv = vdupq_n_s32(coeffs.g_v()); let cbu = vdupq_n_s32(coeffs.b_u()); let cbv = vdupq_n_s32(coeffs.b_v()); + let alpha_u16 = vdupq_n_u16(0xFFFF); let mut x = 0usize; while x + 8 <= width { @@ -2943,23 +3070,35 @@ pub(crate) unsafe fn yuv_444p16_to_rgb_u16_row( vqmovun_s32(vaddq_s32(ys_hi, b_ch_hi)), ); - vst3q_u16( - rgb_out.as_mut_ptr().add(x * 3), - uint16x8x3_t(r_u16, g_u16, b_u16), - ); + if ALPHA { + vst4q_u16( + out.as_mut_ptr().add(x * 4), + uint16x8x4_t(r_u16, g_u16, b_u16, alpha_u16), + ); + } else { + vst3q_u16( + out.as_mut_ptr().add(x * 3), + uint16x8x3_t(r_u16, g_u16, b_u16), + ); + } x += 8; } if x < width { - scalar::yuv_444p16_to_rgb_u16_row( - &y[x..width], - &u[x..width], - &v[x..width], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_u = &u[x..width]; + let tail_v = &v[x..width]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::yuv_444p16_to_rgba_u16_row( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); + } else { + scalar::yuv_444p16_to_rgb_u16_row( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); + } } } } @@ -3560,6 +3699,8 @@ pub(crate) unsafe fn p_n_444_to_rgb_or_rgba_row( width: usize, matrix: ColorMatrix, full_range: bool, +) { + // 
SAFETY: caller obligations forwarded to the shared impl. + unsafe { + p_n_444_to_rgb_or_rgba_u16_row::(y, uv_full, rgb_out, width, matrix, full_range); + } +} + +/// NEON sibling of [`p_n_444_to_rgba_row`] for native-depth `u16` +/// output. Alpha samples are `(1 << BITS) - 1` (opaque maximum at the +/// input bit depth) — matches `scalar::p_n_444_to_rgba_u16_row`. +/// +/// Thin wrapper over [`p_n_444_to_rgb_or_rgba_u16_row`] with `ALPHA = true`. +/// +/// # Safety +/// +/// Same as [`p_n_444_to_rgb_u16_row`] plus `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "neon")] +pub(crate) unsafe fn p_n_444_to_rgba_u16_row( + y: &[u16], + uv_full: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + p_n_444_to_rgb_or_rgba_u16_row::(y, uv_full, rgba_out, width, matrix, full_range); + } +} + +/// Shared NEON Pn 4:4:4 high-bit-packed → native-depth `u16` kernel. +/// `ALPHA = false` writes RGB triples via `vst3q_u16`; `ALPHA = true` +/// writes RGBA quads via `vst4q_u16` with constant alpha +/// `(1 << BITS) - 1`. +/// +/// # Safety +/// +/// 1. NEON must be available on the current CPU. +/// 2. `y.len() >= width`, `uv_full.len() >= 2 * width`, +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. +/// 3. `BITS` ∈ `{10, 12}`. 
+#[inline] +#[target_feature(enable = "neon")] +pub(crate) unsafe fn p_n_444_to_rgb_or_rgba_u16_row( + y: &[u16], + uv_full: &[u16], + out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, ) { const { assert!(BITS == 10 || BITS == 12) }; + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert!(y.len() >= width); debug_assert!(uv_full.len() >= 2 * width); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); @@ -3601,6 +3795,7 @@ pub(crate) unsafe fn p_n_444_to_rgb_u16_row( let cgv = vdupq_n_s32(coeffs.g_v()); let cbu = vdupq_n_s32(coeffs.b_u()); let cbv = vdupq_n_s32(coeffs.b_v()); + let alpha_u16 = vdupq_n_u16(out_max as u16); let mut x = 0usize; while x + 16 <= width { @@ -3659,23 +3854,35 @@ pub(crate) unsafe fn p_n_444_to_rgb_u16_row( let b_lo = clamp_u16_max(vaddq_s16(y_scaled_lo, b_chroma_lo), zero_v, max_v); let b_hi = clamp_u16_max(vaddq_s16(y_scaled_hi, b_chroma_hi), zero_v, max_v); - let rgb_lo = uint16x8x3_t(r_lo, g_lo, b_lo); - let rgb_hi = uint16x8x3_t(r_hi, g_hi, b_hi); - vst3q_u16(rgb_out.as_mut_ptr().add(x * 3), rgb_lo); - vst3q_u16(rgb_out.as_mut_ptr().add(x * 3 + 24), rgb_hi); + if ALPHA { + let rgba_lo = uint16x8x4_t(r_lo, g_lo, b_lo, alpha_u16); + let rgba_hi = uint16x8x4_t(r_hi, g_hi, b_hi, alpha_u16); + vst4q_u16(out.as_mut_ptr().add(x * 4), rgba_lo); + vst4q_u16(out.as_mut_ptr().add(x * 4 + 32), rgba_hi); + } else { + let rgb_lo = uint16x8x3_t(r_lo, g_lo, b_lo); + let rgb_hi = uint16x8x3_t(r_hi, g_hi, b_hi); + vst3q_u16(out.as_mut_ptr().add(x * 3), rgb_lo); + vst3q_u16(out.as_mut_ptr().add(x * 3 + 24), rgb_hi); + } x += 16; } if x < width { - scalar::p_n_444_to_rgb_u16_row::( - &y[x..width], - &uv_full[x * 2..width * 2], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_uv = &uv_full[x * 2..width 
* 2]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::p_n_444_to_rgba_u16_row::( + tail_y, tail_uv, tail_out, tail_w, matrix, full_range, + ); + } else { + scalar::p_n_444_to_rgb_u16_row::( + tail_y, tail_uv, tail_out, tail_w, matrix, full_range, + ); + } } } } @@ -3887,6 +4094,8 @@ pub(crate) unsafe fn p_n_444_16_to_rgb_or_rgba_row( /// RGB. i64 chroma + i64 Y (chroma matrix multiply-add overflows i32 /// at u16 output for the BT.2020 blue coefficient). 8 pixels per iter. /// +/// Thin wrapper over [`p_n_444_16_to_rgb_or_rgba_u16_row`] with `ALPHA = false`. +/// /// # Safety /// /// Same as [`p_n_444_16_to_rgb_row`] but `rgb_out: &mut [u16]`. @@ -3900,9 +4109,61 @@ pub(crate) unsafe fn p_n_444_16_to_rgb_u16_row( matrix: ColorMatrix, full_range: bool, ) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + p_n_444_16_to_rgb_or_rgba_u16_row::(y, uv_full, rgb_out, width, matrix, full_range); + } +} + +/// NEON sibling of [`p_n_444_16_to_rgba_row`] for native-depth `u16` +/// output. Alpha samples are `0xFFFF` (opaque maximum at u16 range) — +/// matches `scalar::p_n_444_16_to_rgba_u16_row`. +/// +/// Thin wrapper over [`p_n_444_16_to_rgb_or_rgba_u16_row`] with `ALPHA = true`. +/// +/// # Safety +/// +/// Same as [`p_n_444_16_to_rgb_u16_row`] plus `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "neon")] +pub(crate) unsafe fn p_n_444_16_to_rgba_u16_row( + y: &[u16], + uv_full: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + p_n_444_16_to_rgb_or_rgba_u16_row::(y, uv_full, rgba_out, width, matrix, full_range); + } +} + +/// Shared NEON P416 (semi-planar 4:4:4, 16-bit) → native-depth `u16` +/// kernel. `ALPHA = false` writes RGB triples via `vst3q_u16`; +/// `ALPHA = true` writes RGBA quads via `vst4q_u16` with constant alpha +/// `0xFFFF`. 
+/// +/// # Safety +/// +/// 1. NEON must be available. +/// 2. `y.len() >= width`, `uv_full.len() >= 2 * width`, +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. +#[inline] +#[target_feature(enable = "neon")] +pub(crate) unsafe fn p_n_444_16_to_rgb_or_rgba_u16_row( + y: &[u16], + uv_full: &[u16], + out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert!(y.len() >= width); debug_assert!(uv_full.len() >= 2 * width); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::<16, 16>(full_range); @@ -3922,6 +4183,7 @@ pub(crate) unsafe fn p_n_444_16_to_rgb_u16_row( let cgv = vdupq_n_s32(coeffs.g_v()); let cbu = vdupq_n_s32(coeffs.b_u()); let cbv = vdupq_n_s32(coeffs.b_v()); + let alpha_u16 = vdupq_n_u16(0xFFFF); let mut x = 0usize; while x + 8 <= width { @@ -3980,22 +4242,30 @@ pub(crate) unsafe fn p_n_444_16_to_rgb_u16_row( vqmovun_s32(vaddq_s32(ys_hi, b_ch_hi)), ); - vst3q_u16( - rgb_out.as_mut_ptr().add(x * 3), - uint16x8x3_t(r_u16, g_u16, b_u16), - ); + if ALPHA { + vst4q_u16( + out.as_mut_ptr().add(x * 4), + uint16x8x4_t(r_u16, g_u16, b_u16, alpha_u16), + ); + } else { + vst3q_u16( + out.as_mut_ptr().add(x * 3), + uint16x8x3_t(r_u16, g_u16, b_u16), + ); + } x += 8; } if x < width { - scalar::p_n_444_16_to_rgb_u16_row( - &y[x..width], - &uv_full[x * 2..width * 2], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_uv = &uv_full[x * 2..width * 2]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::p_n_444_16_to_rgba_u16_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + } else { + scalar::p_n_444_16_to_rgb_u16_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + } } } } diff --git 
a/src/row/arch/neon/tests.rs b/src/row/arch/neon/tests.rs index 7c3b684..d312142 100644 --- a/src/row/arch/neon/tests.rs +++ b/src/row/arch/neon/tests.rs @@ -2546,3 +2546,191 @@ fn neon_p416_rgba_matches_scalar_all_matrices() { check_p_n_444_16_u8_neon_rgba_equivalence(w, ColorMatrix::Bt709, false); } } + +// ---- High-bit 4:4:4 native-depth `u16` RGBA equivalence (Ship 8 Tranche 7c) ---- +// +// u16 RGBA wrappers share the math of their u16 RGB siblings — only +// the store (and tail dispatch) branches on `ALPHA`, with alpha set to +// `(1 << BITS) - 1` for BITS-generic kernels and `0xFFFF` for 16-bit +// kernels. Tests pin byte-identical output against the scalar RGBA +// reference. + +fn check_yuv444p_n_u16_neon_rgba_equivalence( + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let y = planar_n_plane::(width, 37); + let u = planar_n_plane::(width, 53); + let v = planar_n_plane::(width, 71); + let mut rgba_scalar = std::vec![0u16; width * 4]; + let mut rgba_neon = std::vec![0u16; width * 4]; + scalar::yuv_444p_n_to_rgba_u16_row::( + &y, + &u, + &v, + &mut rgba_scalar, + width, + matrix, + full_range, + ); + unsafe { + yuv_444p_n_to_rgba_u16_row::(&y, &u, &v, &mut rgba_neon, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_neon, + "NEON Yuv444p<{BITS}> → RGBA u16 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +fn check_pn_444_u16_neon_rgba_equivalence( + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let y = high_bit_plane::(width, 37); + let u = high_bit_plane::(width, 53); + let v = high_bit_plane::(width, 71); + let uv = interleave_uv(&u, &v); + let mut rgba_scalar = std::vec![0u16; width * 4]; + let mut rgba_neon = std::vec![0u16; width * 4]; + scalar::p_n_444_to_rgba_u16_row::(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + unsafe { + p_n_444_to_rgba_u16_row::(&y, &uv, &mut rgba_neon, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_neon, + "NEON 
Pn4:4:4<{BITS}> → RGBA u16 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +fn check_yuv444p16_u16_neon_rgba_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { + let y = p16_plane_neon(width, 37); + let u = p16_plane_neon(width, 53); + let v = p16_plane_neon(width, 71); + let mut rgba_scalar = std::vec![0u16; width * 4]; + let mut rgba_neon = std::vec![0u16; width * 4]; + scalar::yuv_444p16_to_rgba_u16_row(&y, &u, &v, &mut rgba_scalar, width, matrix, full_range); + unsafe { + yuv_444p16_to_rgba_u16_row(&y, &u, &v, &mut rgba_neon, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_neon, + "NEON Yuv444p16 → RGBA u16 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +fn check_p_n_444_16_u16_neon_rgba_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { + let y = p16_plane_neon(width, 37); + let u = p16_plane_neon(width, 53); + let v = p16_plane_neon(width, 71); + let uv = interleave_uv(&u, &v); + let mut rgba_scalar = std::vec![0u16; width * 4]; + let mut rgba_neon = std::vec![0u16; width * 4]; + scalar::p_n_444_16_to_rgba_u16_row(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + unsafe { + p_n_444_16_to_rgba_u16_row(&y, &uv, &mut rgba_neon, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_neon, + "NEON P416 → RGBA u16 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +#[test] +#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] +fn neon_yuv444p_n_rgba_u16_matches_scalar_all_bits() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_yuv444p_n_u16_neon_rgba_equivalence::<9>(16, m, full); + check_yuv444p_n_u16_neon_rgba_equivalence::<10>(16, m, full); + check_yuv444p_n_u16_neon_rgba_equivalence::<12>(16, m, full); + 
check_yuv444p_n_u16_neon_rgba_equivalence::<14>(16, m, full); + } + } +} + +#[test] +#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] +fn neon_yuv444p_n_rgba_u16_matches_scalar_tail_and_widths() { + for w in [17usize, 31, 47, 63, 1920, 1922] { + check_yuv444p_n_u16_neon_rgba_equivalence::<9>(w, ColorMatrix::Bt601, false); + check_yuv444p_n_u16_neon_rgba_equivalence::<10>(w, ColorMatrix::Bt709, true); + check_yuv444p_n_u16_neon_rgba_equivalence::<12>(w, ColorMatrix::Bt2020Ncl, false); + check_yuv444p_n_u16_neon_rgba_equivalence::<14>(w, ColorMatrix::YCgCo, true); + } +} + +#[test] +#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] +fn neon_pn_444_rgba_u16_matches_scalar_all_bits() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_pn_444_u16_neon_rgba_equivalence::<10>(16, m, full); + check_pn_444_u16_neon_rgba_equivalence::<12>(16, m, full); + } + } +} + +#[test] +#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] +fn neon_pn_444_rgba_u16_matches_scalar_tail_and_widths() { + for w in [17usize, 31, 47, 63, 1920, 1922] { + check_pn_444_u16_neon_rgba_equivalence::<10>(w, ColorMatrix::Bt601, false); + check_pn_444_u16_neon_rgba_equivalence::<12>(w, ColorMatrix::Bt709, true); + } +} + +#[test] +#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] +fn neon_yuv444p16_rgba_u16_matches_scalar_all_matrices() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_yuv444p16_u16_neon_rgba_equivalence(16, m, full); + } + } + for w in [17usize, 31, 47, 63, 1920, 1922] { + check_yuv444p16_u16_neon_rgba_equivalence(w, ColorMatrix::Bt709, false); + } +} + +#[test] +#[cfg_attr(miri, ignore = "NEON SIMD intrinsics 
unsupported by Miri")] +fn neon_p416_rgba_u16_matches_scalar_all_matrices() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_p_n_444_16_u16_neon_rgba_equivalence(16, m, full); + } + } + for w in [17usize, 31, 47, 63, 1920, 1922] { + check_p_n_444_16_u16_neon_rgba_equivalence(w, ColorMatrix::Bt709, false); + } +} diff --git a/src/row/arch/wasm_simd128.rs b/src/row/arch/wasm_simd128.rs index a147d92..b5eab6a 100644 --- a/src/row/arch/wasm_simd128.rs +++ b/src/row/arch/wasm_simd128.rs @@ -824,6 +824,8 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_row( width: usize, matrix: ColorMatrix, full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + yuv_444p_n_to_rgb_or_rgba_u16_row::(y, u, v, rgb_out, width, matrix, full_range); + } +} + +/// WASM simd128 sibling of [`yuv_444p_n_to_rgba_row`] for native-depth +/// `u16` output. Alpha samples are `(1 << BITS) - 1` (opaque maximum +/// at the input bit depth). +/// +/// # Safety +/// +/// Same as [`yuv_444p_n_to_rgb_u16_row`] but `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "simd128")] +pub(crate) unsafe fn yuv_444p_n_to_rgba_u16_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + yuv_444p_n_to_rgb_or_rgba_u16_row::(y, u, v, rgba_out, width, matrix, full_range); + } +} + +/// Shared WASM simd128 high-bit YUV 4:4:4 → native-depth `u16` kernel. +/// `ALPHA = false` writes RGB triples via `write_rgb_u16_8`; +/// `ALPHA = true` writes RGBA quads via `write_rgba_u16_8` with +/// constant alpha `(1 << BITS) - 1`. +/// +/// # Safety +/// +/// 1. **simd128 must be enabled at compile time.** +/// 2. 
`y.len() >= width`, `u.len() >= width`, `v.len() >= width`, +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. +/// 3. `BITS` ∈ `{9, 10, 12, 14}`. +#[inline] +#[target_feature(enable = "simd128")] +pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_u16_row( + y: &[u16], + u: &[u16], + v: &[u16], + out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, ) { const { assert!(BITS == 9 || BITS == 10 || BITS == 12 || BITS == 14) }; + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert!(y.len() >= width); debug_assert!(u.len() >= width); debug_assert!(v.len() >= width); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); @@ -865,6 +920,7 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_u16_row( let cgv = i32x4_splat(coeffs.g_v()); let cbu = i32x4_splat(coeffs.b_u()); let cbv = i32x4_splat(coeffs.b_v()); + let alpha_u16 = u16x8_splat(out_max as u16); let mut x = 0usize; while x + 16 <= width { @@ -915,23 +971,34 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_u16_row( let b_lo = clamp_u16_max_wasm(i16x8_add_sat(y_scaled_lo, b_chroma_lo), zero_v, max_v); let b_hi = clamp_u16_max_wasm(i16x8_add_sat(y_scaled_hi, b_chroma_hi), zero_v, max_v); - let dst = rgb_out.as_mut_ptr().add(x * 3); - write_rgb_u16_8(r_lo, g_lo, b_lo, dst); - write_rgb_u16_8(r_hi, g_hi, b_hi, dst.add(24)); + if ALPHA { + let dst = out.as_mut_ptr().add(x * 4); + write_rgba_u16_8(r_lo, g_lo, b_lo, alpha_u16, dst); + write_rgba_u16_8(r_hi, g_hi, b_hi, alpha_u16, dst.add(32)); + } else { + let dst = out.as_mut_ptr().add(x * 3); + write_rgb_u16_8(r_lo, g_lo, b_lo, dst); + write_rgb_u16_8(r_hi, g_hi, b_hi, dst.add(24)); + } x += 16; } if x < width { - scalar::yuv_444p_n_to_rgb_u16_row::( - &y[x..width], - &u[x..width], - &v[x..width], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + 
let tail_u = &u[x..width]; + let tail_v = &v[x..width]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::yuv_444p_n_to_rgba_u16_row::( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); + } else { + scalar::yuv_444p_n_to_rgb_u16_row::( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); + } } } } @@ -1112,10 +1179,9 @@ pub(crate) unsafe fn yuv_444p16_to_rgb_or_rgba_row( } /// WASM simd128 YUV 4:4:4 planar **16-bit** → packed **u16** RGB. -/// Falls through to scalar — simd128 has no native `i64x2` arithmetic -/// shift, and the `srai64_15` bias trick at 128 bits (single i64 pair -/// per lane) is not cheaper than scalar. Same rationale as -/// [`yuv_420p16_to_rgb_u16_row`]. +/// 8 pixels per iter on the i64 chroma pipeline. +/// +/// Thin wrapper over [`yuv_444p16_to_rgb_or_rgba_u16_row`] with `ALPHA = false`. /// /// # Safety /// @@ -1131,10 +1197,61 @@ pub(crate) unsafe fn yuv_444p16_to_rgb_u16_row( matrix: ColorMatrix, full_range: bool, ) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + yuv_444p16_to_rgb_or_rgba_u16_row::(y, u, v, rgb_out, width, matrix, full_range); + } +} + +/// wasm simd128 sibling of [`yuv_444p16_to_rgba_row`] for native-depth +/// `u16` output. Alpha is `0xFFFF`. +/// +/// # Safety +/// +/// Same as [`yuv_444p16_to_rgb_u16_row`] but `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "simd128")] +pub(crate) unsafe fn yuv_444p16_to_rgba_u16_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + yuv_444p16_to_rgb_or_rgba_u16_row::(y, u, v, rgba_out, width, matrix, full_range); + } +} + +/// Shared wasm simd128 16-bit YUV 4:4:4 → native-depth `u16` kernel. 
+/// `ALPHA = false` writes RGB triples via `write_rgb_u16_8`; +/// `ALPHA = true` writes RGBA quads via `write_rgba_u16_8` with +/// constant alpha `0xFFFF`. +/// +/// # Safety +/// +/// 1. **simd128 must be enabled at compile time.** +/// 2. `y.len() >= width`, `u.len() >= width`, `v.len() >= width`, +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. +#[inline] +#[target_feature(enable = "simd128")] +pub(crate) unsafe fn yuv_444p16_to_rgb_or_rgba_u16_row( + y: &[u16], + u: &[u16], + v: &[u16], + out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert!(y.len() >= width); debug_assert!(u.len() >= width); debug_assert!(v.len() >= width); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::<16, 16>(full_range); @@ -1142,6 +1259,7 @@ pub(crate) unsafe fn yuv_444p16_to_rgb_u16_row( const RND_I32: i32 = 1 << 14; unsafe { + let alpha_u16 = u16x8_splat(0xFFFF); let rnd_i64 = i64x2_splat(RND_I64); let rnd_i32 = i32x4_splat(RND_I32); let y_off32 = i32x4_splat(y_off); @@ -1230,20 +1348,29 @@ pub(crate) unsafe fn yuv_444p16_to_rgb_u16_row( i32x4_add(y_hi_scaled, b_ch_hi), ); - write_rgb_u16_8(r_u16, g_u16, b_u16, rgb_out.as_mut_ptr().add(x * 3)); + if ALPHA { + write_rgba_u16_8(r_u16, g_u16, b_u16, alpha_u16, out.as_mut_ptr().add(x * 4)); + } else { + write_rgb_u16_8(r_u16, g_u16, b_u16, out.as_mut_ptr().add(x * 3)); + } x += 8; } if x < width { - scalar::yuv_444p16_to_rgb_u16_row( - &y[x..width], - &u[x..width], - &v[x..width], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_u = &u[x..width]; + let tail_v = &v[x..width]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::yuv_444p16_to_rgba_u16_row( + tail_y, tail_u, tail_v, 
tail_out, tail_w, matrix, full_range, + ); + } else { + scalar::yuv_444p16_to_rgb_u16_row( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); + } } } } @@ -3635,6 +3762,8 @@ pub(crate) unsafe fn p_n_444_to_rgb_or_rgba_row( width: usize, matrix: ColorMatrix, full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + p_n_444_to_rgb_or_rgba_u16_row::(y, uv_full, rgb_out, width, matrix, full_range); + } +} + +/// wasm simd128 sibling of [`p_n_444_to_rgba_row`] for native-depth +/// `u16` output. Alpha samples are `(1 << BITS) - 1` (opaque maximum +/// at the input bit depth). +/// +/// # Safety +/// +/// Same as [`p_n_444_to_rgb_u16_row`] but `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "simd128")] +pub(crate) unsafe fn p_n_444_to_rgba_u16_row( + y: &[u16], + uv_full: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + p_n_444_to_rgb_or_rgba_u16_row::(y, uv_full, rgba_out, width, matrix, full_range); + } +} + +/// Shared wasm simd128 Pn 4:4:4 high-bit-packed → native-depth `u16` +/// kernel. `ALPHA = false` writes RGB triples via `write_rgb_u16_8`; +/// `ALPHA = true` writes RGBA quads via `write_rgba_u16_8` with +/// constant alpha `(1 << BITS) - 1`. +/// +/// # Safety +/// +/// 1. simd128 must be enabled at compile time. +/// 2. `y.len() >= width`, `uv_full.len() >= 2 * width`, +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. +/// 3. `BITS` must be one of `{10, 12}`. 
+#[inline] +#[target_feature(enable = "simd128")] +pub(crate) unsafe fn p_n_444_to_rgb_or_rgba_u16_row( + y: &[u16], + uv_full: &[u16], + out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, ) { const { assert!(BITS == 10 || BITS == 12) }; + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert!(y.len() >= width); debug_assert!(uv_full.len() >= 2 * width); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); @@ -3673,6 +3853,7 @@ pub(crate) unsafe fn p_n_444_to_rgb_u16_row( let cgv = i32x4_splat(coeffs.g_v()); let cbu = i32x4_splat(coeffs.b_u()); let cbv = i32x4_splat(coeffs.b_v()); + let alpha_u16 = u16x8_splat(out_max as u16); let shr = (16 - BITS) as u32; @@ -3733,22 +3914,33 @@ pub(crate) unsafe fn p_n_444_to_rgb_u16_row( let b_lo = clamp_u16_max_wasm(i16x8_add_sat(y_scaled_lo, b_chroma_lo), zero_v, max_v); let b_hi = clamp_u16_max_wasm(i16x8_add_sat(y_scaled_hi, b_chroma_hi), zero_v, max_v); - let dst = rgb_out.as_mut_ptr().add(x * 3); - write_rgb_u16_8(r_lo, g_lo, b_lo, dst); - write_rgb_u16_8(r_hi, g_hi, b_hi, dst.add(24)); + if ALPHA { + let dst = out.as_mut_ptr().add(x * 4); + write_rgba_u16_8(r_lo, g_lo, b_lo, alpha_u16, dst); + write_rgba_u16_8(r_hi, g_hi, b_hi, alpha_u16, dst.add(32)); + } else { + let dst = out.as_mut_ptr().add(x * 3); + write_rgb_u16_8(r_lo, g_lo, b_lo, dst); + write_rgb_u16_8(r_hi, g_hi, b_hi, dst.add(24)); + } x += 16; } if x < width { - scalar::p_n_444_to_rgb_u16_row::( - &y[x..width], - &uv_full[x * 2..width * 2], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_uv = &uv_full[x * 2..width * 2]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::p_n_444_to_rgba_u16_row::( + tail_y, tail_uv, tail_out, tail_w, matrix, full_range, + ); + } 
else { + scalar::p_n_444_to_rgb_u16_row::( + tail_y, tail_uv, tail_out, tail_w, matrix, full_range, + ); + } } } } @@ -3922,6 +4114,8 @@ pub(crate) unsafe fn p_n_444_16_to_rgb_or_rgba_row( /// wasm simd128 P416 → packed **native-depth `u16`** RGB. i64 chroma /// via native `i64x2_shr` (no bias trick needed). 8 pixels per iter. /// +/// Thin wrapper over [`p_n_444_16_to_rgb_or_rgba_u16_row`] with `ALPHA = false`. +/// /// # Safety /// /// Same as [`p_n_444_16_to_rgb_row`] but `rgb_out: &mut [u16]`. @@ -3935,9 +4129,58 @@ pub(crate) unsafe fn p_n_444_16_to_rgb_u16_row( matrix: ColorMatrix, full_range: bool, ) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + p_n_444_16_to_rgb_or_rgba_u16_row::(y, uv_full, rgb_out, width, matrix, full_range); + } +} + +/// wasm simd128 sibling of [`p_n_444_16_to_rgba_row`] for native-depth +/// `u16` output. Alpha is `0xFFFF`. +/// +/// # Safety +/// +/// Same as [`p_n_444_16_to_rgb_u16_row`] but `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "simd128")] +pub(crate) unsafe fn p_n_444_16_to_rgba_u16_row( + y: &[u16], + uv_full: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + p_n_444_16_to_rgb_or_rgba_u16_row::(y, uv_full, rgba_out, width, matrix, full_range); + } +} + +/// Shared wasm simd128 P416 → native-depth `u16` kernel. +/// `ALPHA = false` writes RGB triples via `write_rgb_u16_8`; +/// `ALPHA = true` writes RGBA quads via `write_rgba_u16_8` with +/// constant alpha `0xFFFF`. +/// +/// # Safety +/// +/// 1. simd128 must be enabled at compile time. +/// 2. `y.len() >= width`, `uv_full.len() >= 2 * width`, +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. 
+#[inline] +#[target_feature(enable = "simd128")] +pub(crate) unsafe fn p_n_444_16_to_rgb_or_rgba_u16_row( + y: &[u16], + uv_full: &[u16], + out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert!(y.len() >= width); debug_assert!(uv_full.len() >= 2 * width); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::<16, 16>(full_range); @@ -3945,6 +4188,7 @@ pub(crate) unsafe fn p_n_444_16_to_rgb_u16_row( const RND_I32: i32 = 1 << 14; unsafe { + let alpha_u16 = u16x8_splat(0xFFFF); let rnd_i64 = i64x2_splat(RND_I64); let rnd_i32 = i32x4_splat(RND_I32); let y_off32 = i32x4_splat(y_off); @@ -4028,19 +4272,24 @@ pub(crate) unsafe fn p_n_444_16_to_rgb_u16_row( i32x4_add(y_hi_scaled, b_ch_hi), ); - write_rgb_u16_8(r_u16, g_u16, b_u16, rgb_out.as_mut_ptr().add(x * 3)); + if ALPHA { + write_rgba_u16_8(r_u16, g_u16, b_u16, alpha_u16, out.as_mut_ptr().add(x * 4)); + } else { + write_rgb_u16_8(r_u16, g_u16, b_u16, out.as_mut_ptr().add(x * 3)); + } x += 8; } if x < width { - scalar::p_n_444_16_to_rgb_u16_row( - &y[x..width], - &uv_full[x * 2..width * 2], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_uv = &uv_full[x * 2..width * 2]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::p_n_444_16_to_rgba_u16_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + } else { + scalar::p_n_444_16_to_rgb_u16_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + } } } } diff --git a/src/row/arch/wasm_simd128/tests.rs b/src/row/arch/wasm_simd128/tests.rs index f7529f3..56603cd 100644 --- a/src/row/arch/wasm_simd128/tests.rs +++ b/src/row/arch/wasm_simd128/tests.rs @@ -2083,3 +2083,187 @@ fn 
simd128_p416_rgba_matches_scalar_all_matrices() { check_p_n_444_16_u8_simd128_rgba_equivalence(w, ColorMatrix::Bt709, false); } } + +// ---- High-bit 4:4:4 native-depth `u16` RGBA equivalence (Ship 8 Tranche 7c) ---- + +fn check_yuv444p_n_u16_simd128_rgba_equivalence( + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let y = planar_n_plane::(width, 37); + let u = planar_n_plane::(width, 53); + let v = planar_n_plane::(width, 71); + let mut rgba_scalar = std::vec![0u16; width * 4]; + let mut rgba_simd = std::vec![0u16; width * 4]; + scalar::yuv_444p_n_to_rgba_u16_row::( + &y, + &u, + &v, + &mut rgba_scalar, + width, + matrix, + full_range, + ); + unsafe { + yuv_444p_n_to_rgba_u16_row::(&y, &u, &v, &mut rgba_simd, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_simd, + "wasm simd128 Yuv444p<{BITS}> → RGBA u16 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +fn check_pn_444_u16_simd128_rgba_equivalence( + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let y = high_bit_plane_wasm::(width, 37); + let u = high_bit_plane_wasm::(width, 53); + let v = high_bit_plane_wasm::(width, 71); + let uv = interleave_uv_wasm(&u, &v); + let mut rgba_scalar = std::vec![0u16; width * 4]; + let mut rgba_simd = std::vec![0u16; width * 4]; + scalar::p_n_444_to_rgba_u16_row::(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + unsafe { + p_n_444_to_rgba_u16_row::(&y, &uv, &mut rgba_simd, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_simd, + "wasm simd128 Pn4:4:4<{BITS}> → RGBA u16 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +fn check_yuv444p16_u16_simd128_rgba_equivalence( + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let y = p16_plane_wasm(width, 37); + let u = p16_plane_wasm(width, 53); + let v = p16_plane_wasm(width, 71); + let mut rgba_scalar = std::vec![0u16; width * 4]; + let mut rgba_simd = std::vec![0u16; width * 4]; + 
scalar::yuv_444p16_to_rgba_u16_row(&y, &u, &v, &mut rgba_scalar, width, matrix, full_range); + unsafe { + yuv_444p16_to_rgba_u16_row(&y, &u, &v, &mut rgba_simd, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_simd, + "wasm simd128 Yuv444p16 → RGBA u16 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +fn check_p_n_444_16_u16_simd128_rgba_equivalence( + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let y = p16_plane_wasm(width, 37); + let u = p16_plane_wasm(width, 53); + let v = p16_plane_wasm(width, 71); + let uv = interleave_uv_wasm(&u, &v); + let mut rgba_scalar = std::vec![0u16; width * 4]; + let mut rgba_simd = std::vec![0u16; width * 4]; + scalar::p_n_444_16_to_rgba_u16_row(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + unsafe { + p_n_444_16_to_rgba_u16_row(&y, &uv, &mut rgba_simd, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_simd, + "wasm simd128 P416 → RGBA u16 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +#[test] +fn simd128_yuv444p_n_rgba_u16_matches_scalar_all_bits() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_yuv444p_n_u16_simd128_rgba_equivalence::<9>(16, m, full); + check_yuv444p_n_u16_simd128_rgba_equivalence::<10>(16, m, full); + check_yuv444p_n_u16_simd128_rgba_equivalence::<12>(16, m, full); + check_yuv444p_n_u16_simd128_rgba_equivalence::<14>(16, m, full); + } + } +} + +#[test] +fn simd128_yuv444p_n_rgba_u16_matches_scalar_tail_and_widths() { + for w in [17usize, 31, 47, 63, 1920, 1922] { + check_yuv444p_n_u16_simd128_rgba_equivalence::<9>(w, ColorMatrix::Bt601, false); + check_yuv444p_n_u16_simd128_rgba_equivalence::<10>(w, ColorMatrix::Bt709, true); + check_yuv444p_n_u16_simd128_rgba_equivalence::<12>(w, ColorMatrix::Bt2020Ncl, false); + 
check_yuv444p_n_u16_simd128_rgba_equivalence::<14>(w, ColorMatrix::YCgCo, true); + } +} + +#[test] +fn simd128_pn_444_rgba_u16_matches_scalar_all_bits() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_pn_444_u16_simd128_rgba_equivalence::<10>(16, m, full); + check_pn_444_u16_simd128_rgba_equivalence::<12>(16, m, full); + } + } +} + +#[test] +fn simd128_pn_444_rgba_u16_matches_scalar_tail_and_widths() { + for w in [17usize, 31, 47, 63, 1920, 1922] { + check_pn_444_u16_simd128_rgba_equivalence::<10>(w, ColorMatrix::Bt601, false); + check_pn_444_u16_simd128_rgba_equivalence::<12>(w, ColorMatrix::Bt709, true); + } +} + +#[test] +fn simd128_yuv444p16_rgba_u16_matches_scalar_all_matrices() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_yuv444p16_u16_simd128_rgba_equivalence(16, m, full); + } + } + for w in [17usize, 31, 47, 63, 1920, 1922] { + check_yuv444p16_u16_simd128_rgba_equivalence(w, ColorMatrix::Bt709, false); + } +} + +#[test] +fn simd128_p416_rgba_u16_matches_scalar_all_matrices() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_p_n_444_16_u16_simd128_rgba_equivalence(16, m, full); + } + } + for w in [17usize, 31, 47, 63, 1920, 1922] { + check_p_n_444_16_u16_simd128_rgba_equivalence(w, ColorMatrix::Bt709, false); + } +} diff --git a/src/row/arch/x86_avx2.rs b/src/row/arch/x86_avx2.rs index 4ad6d35..070f9e7 100644 --- a/src/row/arch/x86_avx2.rs +++ b/src/row/arch/x86_avx2.rs @@ -962,6 +962,8 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_row( width: usize, matrix: ColorMatrix, full_range: bool, +) { + // SAFETY: caller 
obligations forwarded to the shared impl. + unsafe { + yuv_444p_n_to_rgb_or_rgba_u16_row::(y, u, v, rgb_out, width, matrix, full_range); + } +} + +/// AVX2 sibling of [`yuv_444p_n_to_rgba_row`] for native-depth `u16` +/// output. Alpha samples are `(1 << BITS) - 1` (opaque maximum at the +/// input bit depth). +/// +/// Thin wrapper over [`yuv_444p_n_to_rgb_or_rgba_u16_row`] with `ALPHA = true`. +/// +/// # Safety +/// +/// Same as [`yuv_444p_n_to_rgb_u16_row`] plus `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "avx2")] +pub(crate) unsafe fn yuv_444p_n_to_rgba_u16_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + yuv_444p_n_to_rgb_or_rgba_u16_row::(y, u, v, rgba_out, width, matrix, full_range); + } +} + +/// Shared AVX2 high-bit YUV 4:4:4 → native-depth `u16` kernel. +/// `ALPHA = false` writes RGB triples via 4× `write_rgb_u16_8`; +/// `ALPHA = true` writes RGBA quads via 4× `write_rgba_u16_8` with +/// constant alpha `(1 << BITS) - 1`. +/// +/// # Safety +/// +/// 1. **AVX2 must be available on the current CPU.** +/// 2. `y.len() >= width`, `u.len() >= width`, `v.len() >= width`, +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. +/// 3. `BITS` ∈ `{9, 10, 12, 14}`. 
+#[inline] +#[target_feature(enable = "avx2")] +pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_u16_row( + y: &[u16], + u: &[u16], + v: &[u16], + out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, ) { const { assert!(BITS == 9 || BITS == 10 || BITS == 12 || BITS == 14) }; + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert!(y.len() >= width); debug_assert!(u.len() >= width); debug_assert!(v.len() >= width); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); @@ -1003,6 +1060,7 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_u16_row( let cgv = _mm256_set1_epi32(coeffs.g_v()); let cbu = _mm256_set1_epi32(coeffs.b_u()); let cbv = _mm256_set1_epi32(coeffs.b_v()); + let alpha_u16 = _mm_set1_epi16(out_max); let mut x = 0usize; while x + 32 <= width { @@ -1077,45 +1135,82 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_u16_row( let b_lo = clamp_u16_max_x16(_mm256_adds_epi16(y_scaled_lo, b_chroma_lo), zero_v, max_v); let b_hi = clamp_u16_max_x16(_mm256_adds_epi16(y_scaled_hi, b_chroma_hi), zero_v, max_v); - let dst = rgb_out.as_mut_ptr().add(x * 3); - write_rgb_u16_8( - _mm256_castsi256_si128(r_lo), - _mm256_castsi256_si128(g_lo), - _mm256_castsi256_si128(b_lo), - dst, - ); - write_rgb_u16_8( - _mm256_extracti128_si256::<1>(r_lo), - _mm256_extracti128_si256::<1>(g_lo), - _mm256_extracti128_si256::<1>(b_lo), - dst.add(24), - ); - write_rgb_u16_8( - _mm256_castsi256_si128(r_hi), - _mm256_castsi256_si128(g_hi), - _mm256_castsi256_si128(b_hi), - dst.add(48), - ); - write_rgb_u16_8( - _mm256_extracti128_si256::<1>(r_hi), - _mm256_extracti128_si256::<1>(g_hi), - _mm256_extracti128_si256::<1>(b_hi), - dst.add(72), - ); + if ALPHA { + let dst = out.as_mut_ptr().add(x * 4); + write_rgba_u16_8( + _mm256_castsi256_si128(r_lo), + _mm256_castsi256_si128(g_lo), + _mm256_castsi256_si128(b_lo), + alpha_u16, + 
dst, + ); + write_rgba_u16_8( + _mm256_extracti128_si256::<1>(r_lo), + _mm256_extracti128_si256::<1>(g_lo), + _mm256_extracti128_si256::<1>(b_lo), + alpha_u16, + dst.add(32), + ); + write_rgba_u16_8( + _mm256_castsi256_si128(r_hi), + _mm256_castsi256_si128(g_hi), + _mm256_castsi256_si128(b_hi), + alpha_u16, + dst.add(64), + ); + write_rgba_u16_8( + _mm256_extracti128_si256::<1>(r_hi), + _mm256_extracti128_si256::<1>(g_hi), + _mm256_extracti128_si256::<1>(b_hi), + alpha_u16, + dst.add(96), + ); + } else { + let dst = out.as_mut_ptr().add(x * 3); + write_rgb_u16_8( + _mm256_castsi256_si128(r_lo), + _mm256_castsi256_si128(g_lo), + _mm256_castsi256_si128(b_lo), + dst, + ); + write_rgb_u16_8( + _mm256_extracti128_si256::<1>(r_lo), + _mm256_extracti128_si256::<1>(g_lo), + _mm256_extracti128_si256::<1>(b_lo), + dst.add(24), + ); + write_rgb_u16_8( + _mm256_castsi256_si128(r_hi), + _mm256_castsi256_si128(g_hi), + _mm256_castsi256_si128(b_hi), + dst.add(48), + ); + write_rgb_u16_8( + _mm256_extracti128_si256::<1>(r_hi), + _mm256_extracti128_si256::<1>(g_hi), + _mm256_extracti128_si256::<1>(b_hi), + dst.add(72), + ); + } x += 32; } if x < width { - scalar::yuv_444p_n_to_rgb_u16_row::( - &y[x..width], - &u[x..width], - &v[x..width], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_u = &u[x..width]; + let tail_v = &v[x..width]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::yuv_444p_n_to_rgba_u16_row::( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); + } else { + scalar::yuv_444p_n_to_rgb_u16_row::( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); + } } } } @@ -1326,6 +1421,8 @@ pub(crate) unsafe fn yuv_444p16_to_rgb_or_rgba_row( /// SSE4.1 rate, and no chroma-duplication step since 4:4:4 chroma /// is 1:1 with Y. /// +/// Thin wrapper over [`yuv_444p16_to_rgb_or_rgba_u16_row`] with `ALPHA = false`. 
+/// /// # Safety /// /// Same as [`yuv_444p16_to_rgb_row`] but `rgb_out: &mut [u16]`. @@ -1340,16 +1437,69 @@ pub(crate) unsafe fn yuv_444p16_to_rgb_u16_row( matrix: ColorMatrix, full_range: bool, ) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + yuv_444p16_to_rgb_or_rgba_u16_row::(y, u, v, rgb_out, width, matrix, full_range); + } +} + +/// AVX2 sibling of [`yuv_444p16_to_rgba_row`] for native-depth `u16` +/// output. Alpha samples are `0xFFFF`. +/// +/// Thin wrapper over [`yuv_444p16_to_rgb_or_rgba_u16_row`] with `ALPHA = true`. +/// +/// # Safety +/// +/// Same as [`yuv_444p16_to_rgb_u16_row`] plus `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "avx2")] +pub(crate) unsafe fn yuv_444p16_to_rgba_u16_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + yuv_444p16_to_rgb_or_rgba_u16_row::(y, u, v, rgba_out, width, matrix, full_range); + } +} + +/// Shared AVX2 16-bit YUV 4:4:4 → native-depth `u16` kernel. +/// `ALPHA = false` writes RGB triples; `ALPHA = true` writes RGBA +/// quads with constant alpha `0xFFFF`. +/// +/// # Safety +/// +/// 1. **AVX2 must be available on the current CPU.** +/// 2. `y.len() >= width`, `u.len() >= width`, `v.len() >= width`, +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. 
+#[inline] +#[target_feature(enable = "avx2")] +pub(crate) unsafe fn yuv_444p16_to_rgb_or_rgba_u16_row( + y: &[u16], + u: &[u16], + v: &[u16], + out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert!(y.len() >= width); debug_assert!(u.len() >= width); debug_assert!(v.len() >= width); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::<16, 16>(full_range); const RND: i64 = 1 << 14; unsafe { + let alpha_u16 = _mm_set1_epi16(-1i16); let rnd_v = _mm256_set1_epi64x(RND); let y_off_v = _mm256_set1_epi32(y_off); let y_scale_v = _mm256_set1_epi32(y_scale); @@ -1446,33 +1596,56 @@ pub(crate) unsafe fn yuv_444p16_to_rgb_u16_row( _mm256_add_epi32(y_hi_scaled, b_ch_hi), )); - let dst = rgb_out.as_mut_ptr().add(x * 3); - write_rgb_u16_8( - _mm256_castsi256_si128(r_u16), - _mm256_castsi256_si128(g_u16), - _mm256_castsi256_si128(b_u16), - dst, - ); - write_rgb_u16_8( - _mm256_extracti128_si256::<1>(r_u16), - _mm256_extracti128_si256::<1>(g_u16), - _mm256_extracti128_si256::<1>(b_u16), - dst.add(24), - ); + if ALPHA { + let dst = out.as_mut_ptr().add(x * 4); + write_rgba_u16_8( + _mm256_castsi256_si128(r_u16), + _mm256_castsi256_si128(g_u16), + _mm256_castsi256_si128(b_u16), + alpha_u16, + dst, + ); + write_rgba_u16_8( + _mm256_extracti128_si256::<1>(r_u16), + _mm256_extracti128_si256::<1>(g_u16), + _mm256_extracti128_si256::<1>(b_u16), + alpha_u16, + dst.add(32), + ); + } else { + let dst = out.as_mut_ptr().add(x * 3); + write_rgb_u16_8( + _mm256_castsi256_si128(r_u16), + _mm256_castsi256_si128(g_u16), + _mm256_castsi256_si128(b_u16), + dst, + ); + write_rgb_u16_8( + _mm256_extracti128_si256::<1>(r_u16), + _mm256_extracti128_si256::<1>(g_u16), + _mm256_extracti128_si256::<1>(b_u16), + dst.add(24), + ); + } x += 16; } if x < width { - 
scalar::yuv_444p16_to_rgb_u16_row( - &y[x..width], - &u[x..width], - &v[x..width], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_u = &u[x..width]; + let tail_v = &v[x..width]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::yuv_444p16_to_rgba_u16_row( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); + } else { + scalar::yuv_444p16_to_rgb_u16_row( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); + } } } } @@ -3956,6 +4129,8 @@ pub(crate) unsafe fn p_n_444_to_rgb_or_rgba_row( width: usize, matrix: ColorMatrix, full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + p_n_444_to_rgb_or_rgba_u16_row::(y, uv_full, rgb_out, width, matrix, full_range); + } +} + +/// AVX2 sibling of [`p_n_444_to_rgba_row`] for native-depth `u16` +/// output. Alpha samples are `(1 << BITS) - 1`. +/// +/// Thin wrapper over [`p_n_444_to_rgb_or_rgba_u16_row`] with `ALPHA = true`. +/// +/// # Safety +/// +/// Same as [`p_n_444_to_rgb_u16_row`] plus `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "avx2")] +pub(crate) unsafe fn p_n_444_to_rgba_u16_row( + y: &[u16], + uv_full: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + p_n_444_to_rgb_or_rgba_u16_row::(y, uv_full, rgba_out, width, matrix, full_range); + } +} + +/// Shared AVX2 Pn 4:4:4 high-bit-packed → native-depth `u16` kernel. +/// `ALPHA = false` writes RGB triples via 4× `write_rgb_u16_8`; +/// `ALPHA = true` writes RGBA quads via 4× `write_rgba_u16_8` with +/// constant alpha `(1 << BITS) - 1`. +/// +/// # Safety +/// +/// 1. AVX2 must be available on the current CPU. +/// 2. `y.len() >= width`, `uv_full.len() >= 2 * width`, +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. +/// 3. 
`BITS` ∈ `{10, 12}`. +#[inline] +#[target_feature(enable = "avx2")] +pub(crate) unsafe fn p_n_444_to_rgb_or_rgba_u16_row( + y: &[u16], + uv_full: &[u16], + out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, ) { const { assert!(BITS == 10 || BITS == 12) }; + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert!(y.len() >= width); debug_assert!(uv_full.len() >= 2 * width); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); @@ -3997,6 +4224,7 @@ pub(crate) unsafe fn p_n_444_to_rgb_u16_row( let cgv = _mm256_set1_epi32(coeffs.g_v()); let cbu = _mm256_set1_epi32(coeffs.b_u()); let cbv = _mm256_set1_epi32(coeffs.b_v()); + let alpha_u16 = _mm_set1_epi16(out_max); let mut x = 0usize; while x + 32 <= width { @@ -4075,44 +4303,81 @@ pub(crate) unsafe fn p_n_444_to_rgb_u16_row( let b_lo = clamp_u16_max_x16(_mm256_adds_epi16(y_scaled_lo, b_chroma_lo), zero_v, max_v); let b_hi = clamp_u16_max_x16(_mm256_adds_epi16(y_scaled_hi, b_chroma_hi), zero_v, max_v); - let dst = rgb_out.as_mut_ptr().add(x * 3); - write_rgb_u16_8( - _mm256_castsi256_si128(r_lo), - _mm256_castsi256_si128(g_lo), - _mm256_castsi256_si128(b_lo), - dst, - ); - write_rgb_u16_8( - _mm256_extracti128_si256::<1>(r_lo), - _mm256_extracti128_si256::<1>(g_lo), - _mm256_extracti128_si256::<1>(b_lo), - dst.add(24), - ); - write_rgb_u16_8( - _mm256_castsi256_si128(r_hi), - _mm256_castsi256_si128(g_hi), - _mm256_castsi256_si128(b_hi), - dst.add(48), - ); - write_rgb_u16_8( - _mm256_extracti128_si256::<1>(r_hi), - _mm256_extracti128_si256::<1>(g_hi), - _mm256_extracti128_si256::<1>(b_hi), - dst.add(72), - ); + if ALPHA { + let dst = out.as_mut_ptr().add(x * 4); + write_rgba_u16_8( + _mm256_castsi256_si128(r_lo), + _mm256_castsi256_si128(g_lo), + _mm256_castsi256_si128(b_lo), + alpha_u16, + dst, + ); + write_rgba_u16_8( + 
_mm256_extracti128_si256::<1>(r_lo), + _mm256_extracti128_si256::<1>(g_lo), + _mm256_extracti128_si256::<1>(b_lo), + alpha_u16, + dst.add(32), + ); + write_rgba_u16_8( + _mm256_castsi256_si128(r_hi), + _mm256_castsi256_si128(g_hi), + _mm256_castsi256_si128(b_hi), + alpha_u16, + dst.add(64), + ); + write_rgba_u16_8( + _mm256_extracti128_si256::<1>(r_hi), + _mm256_extracti128_si256::<1>(g_hi), + _mm256_extracti128_si256::<1>(b_hi), + alpha_u16, + dst.add(96), + ); + } else { + let dst = out.as_mut_ptr().add(x * 3); + write_rgb_u16_8( + _mm256_castsi256_si128(r_lo), + _mm256_castsi256_si128(g_lo), + _mm256_castsi256_si128(b_lo), + dst, + ); + write_rgb_u16_8( + _mm256_extracti128_si256::<1>(r_lo), + _mm256_extracti128_si256::<1>(g_lo), + _mm256_extracti128_si256::<1>(b_lo), + dst.add(24), + ); + write_rgb_u16_8( + _mm256_castsi256_si128(r_hi), + _mm256_castsi256_si128(g_hi), + _mm256_castsi256_si128(b_hi), + dst.add(48), + ); + write_rgb_u16_8( + _mm256_extracti128_si256::<1>(r_hi), + _mm256_extracti128_si256::<1>(g_hi), + _mm256_extracti128_si256::<1>(b_hi), + dst.add(72), + ); + } x += 32; } if x < width { - scalar::p_n_444_to_rgb_u16_row::( - &y[x..width], - &uv_full[x * 2..width * 2], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_uv = &uv_full[x * 2..width * 2]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::p_n_444_to_rgba_u16_row::( + tail_y, tail_uv, tail_out, tail_w, matrix, full_range, + ); + } else { + scalar::p_n_444_to_rgb_u16_row::( + tail_y, tail_uv, tail_out, tail_w, matrix, full_range, + ); + } } } } @@ -4311,6 +4576,8 @@ pub(crate) unsafe fn p_n_444_16_to_rgb_or_rgba_row( /// `srai64_15_x4` bias trick (AVX2 lacks `_mm256_srai_epi64`). /// 16 pixels per iter (i64 narrows throughput). /// +/// Thin wrapper over [`p_n_444_16_to_rgb_or_rgba_u16_row`] with `ALPHA = false`. 
+/// /// # Safety /// /// Same as [`p_n_444_16_to_rgb_row`] but `rgb_out: &mut [u16]`. @@ -4324,15 +4591,66 @@ pub(crate) unsafe fn p_n_444_16_to_rgb_u16_row( matrix: ColorMatrix, full_range: bool, ) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + p_n_444_16_to_rgb_or_rgba_u16_row::(y, uv_full, rgb_out, width, matrix, full_range); + } +} + +/// AVX2 sibling of [`p_n_444_16_to_rgba_row`] for native-depth `u16` +/// output. Alpha samples are `0xFFFF`. +/// +/// Thin wrapper over [`p_n_444_16_to_rgb_or_rgba_u16_row`] with `ALPHA = true`. +/// +/// # Safety +/// +/// Same as [`p_n_444_16_to_rgb_u16_row`] plus `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "avx2")] +pub(crate) unsafe fn p_n_444_16_to_rgba_u16_row( + y: &[u16], + uv_full: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + p_n_444_16_to_rgb_or_rgba_u16_row::(y, uv_full, rgba_out, width, matrix, full_range); + } +} + +/// Shared AVX2 P416 (semi-planar 4:4:4, 16-bit) → native-depth `u16` +/// kernel. `ALPHA = false` writes RGB triples; `ALPHA = true` writes +/// RGBA quads with constant alpha `0xFFFF`. +/// +/// # Safety +/// +/// 1. AVX2 must be available. +/// 2. `y.len() >= width`, `uv_full.len() >= 2 * width`, +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. 
+#[inline] +#[target_feature(enable = "avx2")] +pub(crate) unsafe fn p_n_444_16_to_rgb_or_rgba_u16_row( + y: &[u16], + uv_full: &[u16], + out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert!(y.len() >= width); debug_assert!(uv_full.len() >= 2 * width); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::<16, 16>(full_range); const RND: i64 = 1 << 14; unsafe { + let alpha_u16 = _mm_set1_epi16(-1i16); let rnd_v = _mm256_set1_epi64x(RND); let y_off_v = _mm256_set1_epi32(y_off); let y_scale_v = _mm256_set1_epi32(y_scale); @@ -4425,32 +4743,51 @@ pub(crate) unsafe fn p_n_444_16_to_rgb_u16_row( _mm256_add_epi32(y_hi_scaled, b_ch_hi), )); - let dst = rgb_out.as_mut_ptr().add(x * 3); - write_rgb_u16_8( - _mm256_castsi256_si128(r_u16), - _mm256_castsi256_si128(g_u16), - _mm256_castsi256_si128(b_u16), - dst, - ); - write_rgb_u16_8( - _mm256_extracti128_si256::<1>(r_u16), - _mm256_extracti128_si256::<1>(g_u16), - _mm256_extracti128_si256::<1>(b_u16), - dst.add(24), - ); + if ALPHA { + let dst = out.as_mut_ptr().add(x * 4); + write_rgba_u16_8( + _mm256_castsi256_si128(r_u16), + _mm256_castsi256_si128(g_u16), + _mm256_castsi256_si128(b_u16), + alpha_u16, + dst, + ); + write_rgba_u16_8( + _mm256_extracti128_si256::<1>(r_u16), + _mm256_extracti128_si256::<1>(g_u16), + _mm256_extracti128_si256::<1>(b_u16), + alpha_u16, + dst.add(32), + ); + } else { + let dst = out.as_mut_ptr().add(x * 3); + write_rgb_u16_8( + _mm256_castsi256_si128(r_u16), + _mm256_castsi256_si128(g_u16), + _mm256_castsi256_si128(b_u16), + dst, + ); + write_rgb_u16_8( + _mm256_extracti128_si256::<1>(r_u16), + _mm256_extracti128_si256::<1>(g_u16), + _mm256_extracti128_si256::<1>(b_u16), + dst.add(24), + ); + } x += 16; } if x < width { - scalar::p_n_444_16_to_rgb_u16_row( - 
&y[x..width], - &uv_full[x * 2..width * 2], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_uv = &uv_full[x * 2..width * 2]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::p_n_444_16_to_rgba_u16_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + } else { + scalar::p_n_444_16_to_rgb_u16_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + } } } } diff --git a/src/row/arch/x86_avx2/tests.rs b/src/row/arch/x86_avx2/tests.rs index d7cf589..e6a6009 100644 --- a/src/row/arch/x86_avx2/tests.rs +++ b/src/row/arch/x86_avx2/tests.rs @@ -2334,3 +2334,197 @@ fn avx2_p416_rgba_matches_scalar_all_matrices() { check_p_n_444_16_u8_avx2_rgba_equivalence(w, ColorMatrix::Bt709, false); } } + +// ---- High-bit 4:4:4 native-depth `u16` RGBA equivalence (Ship 8 Tranche 7c) ---- + +fn check_yuv444p_n_u16_avx2_rgba_equivalence( + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let y = planar_n_plane::(width, 37); + let u = planar_n_plane::(width, 53); + let v = planar_n_plane::(width, 71); + let mut rgba_scalar = std::vec![0u16; width * 4]; + let mut rgba_simd = std::vec![0u16; width * 4]; + scalar::yuv_444p_n_to_rgba_u16_row::( + &y, + &u, + &v, + &mut rgba_scalar, + width, + matrix, + full_range, + ); + unsafe { + yuv_444p_n_to_rgba_u16_row::(&y, &u, &v, &mut rgba_simd, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_simd, + "AVX2 Yuv444p<{BITS}> → RGBA u16 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +fn check_pn_444_u16_avx2_rgba_equivalence( + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let y = high_bit_plane_avx2::(width, 37); + let u = high_bit_plane_avx2::(width, 53); + let v = high_bit_plane_avx2::(width, 71); + let uv = interleave_uv_avx2(&u, &v); + let mut rgba_scalar = std::vec![0u16; width * 4]; + let mut rgba_simd = std::vec![0u16; width * 4]; + 
scalar::p_n_444_to_rgba_u16_row::<BITS>(&y, &uv, &mut rgba_scalar, width, matrix, full_range);
+    unsafe {
+        p_n_444_to_rgba_u16_row::<BITS>(&y, &uv, &mut rgba_simd, width, matrix, full_range);
+    }
+    assert_eq!(
+        rgba_scalar, rgba_simd,
+        "AVX2 Pn4:4:4<{BITS}> → RGBA u16 diverges (width={width}, matrix={matrix:?}, full_range={full_range})"
+    );
+}
+
+fn check_yuv444p16_u16_avx2_rgba_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) {
+    let y = p16_plane_avx2(width, 37);
+    let u = p16_plane_avx2(width, 53);
+    let v = p16_plane_avx2(width, 71);
+    let mut rgba_scalar = std::vec![0u16; width * 4];
+    let mut rgba_simd = std::vec![0u16; width * 4];
+    scalar::yuv_444p16_to_rgba_u16_row(&y, &u, &v, &mut rgba_scalar, width, matrix, full_range);
+    unsafe {
+        yuv_444p16_to_rgba_u16_row(&y, &u, &v, &mut rgba_simd, width, matrix, full_range);
+    }
+    assert_eq!(
+        rgba_scalar, rgba_simd,
+        "AVX2 Yuv444p16 → RGBA u16 diverges (width={width}, matrix={matrix:?}, full_range={full_range})"
+    );
+}
+
+fn check_p_n_444_16_u16_avx2_rgba_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) {
+    let y = p16_plane_avx2(width, 37);
+    let u = p16_plane_avx2(width, 53);
+    let v = p16_plane_avx2(width, 71);
+    let uv = interleave_uv_avx2(&u, &v);
+    let mut rgba_scalar = std::vec![0u16; width * 4];
+    let mut rgba_simd = std::vec![0u16; width * 4];
+    scalar::p_n_444_16_to_rgba_u16_row(&y, &uv, &mut rgba_scalar, width, matrix, full_range);
+    unsafe {
+        p_n_444_16_to_rgba_u16_row(&y, &uv, &mut rgba_simd, width, matrix,
false] { + check_yuv444p_n_u16_avx2_rgba_equivalence::<9>(32, m, full); + check_yuv444p_n_u16_avx2_rgba_equivalence::<10>(32, m, full); + check_yuv444p_n_u16_avx2_rgba_equivalence::<12>(32, m, full); + check_yuv444p_n_u16_avx2_rgba_equivalence::<14>(32, m, full); + } + } +} + +#[test] +fn avx2_yuv444p_n_rgba_u16_matches_scalar_tail_and_widths() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + for w in [17usize, 31, 47, 63, 1920, 1922] { + check_yuv444p_n_u16_avx2_rgba_equivalence::<9>(w, ColorMatrix::Bt601, false); + check_yuv444p_n_u16_avx2_rgba_equivalence::<10>(w, ColorMatrix::Bt709, true); + check_yuv444p_n_u16_avx2_rgba_equivalence::<12>(w, ColorMatrix::Bt2020Ncl, false); + check_yuv444p_n_u16_avx2_rgba_equivalence::<14>(w, ColorMatrix::YCgCo, true); + } +} + +#[test] +fn avx2_pn_444_rgba_u16_matches_scalar_all_bits() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_pn_444_u16_avx2_rgba_equivalence::<10>(32, m, full); + check_pn_444_u16_avx2_rgba_equivalence::<12>(32, m, full); + } + } +} + +#[test] +fn avx2_pn_444_rgba_u16_matches_scalar_tail_and_widths() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + for w in [17usize, 31, 47, 63, 1920, 1922] { + check_pn_444_u16_avx2_rgba_equivalence::<10>(w, ColorMatrix::Bt601, false); + check_pn_444_u16_avx2_rgba_equivalence::<12>(w, ColorMatrix::Bt709, true); + } +} + +#[test] +fn avx2_yuv444p16_rgba_u16_matches_scalar_all_matrices() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_yuv444p16_u16_avx2_rgba_equivalence(16, m, full); + } + } + for w in 
[17usize, 31, 47, 63, 1920, 1922] { + check_yuv444p16_u16_avx2_rgba_equivalence(w, ColorMatrix::Bt709, false); + } +} + +#[test] +fn avx2_p416_rgba_u16_matches_scalar_all_matrices() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_p_n_444_16_u16_avx2_rgba_equivalence(16, m, full); + } + } + for w in [17usize, 31, 47, 63, 1920, 1922] { + check_p_n_444_16_u16_avx2_rgba_equivalence(w, ColorMatrix::Bt709, false); + } +} diff --git a/src/row/arch/x86_avx512.rs b/src/row/arch/x86_avx512.rs index 4d28528..ba587d3 100644 --- a/src/row/arch/x86_avx512.rs +++ b/src/row/arch/x86_avx512.rs @@ -1046,6 +1046,8 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_row( width: usize, matrix: ColorMatrix, full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + yuv_444p_n_to_rgb_or_rgba_u16_row::(y, u, v, rgb_out, width, matrix, full_range); + } +} + +/// AVX-512 sibling of [`yuv_444p_n_to_rgba_row`] for native-depth `u16` +/// output. Alpha samples are `(1 << BITS) - 1`. +/// +/// Thin wrapper over [`yuv_444p_n_to_rgb_or_rgba_u16_row`] with `ALPHA = true`. +/// +/// # Safety +/// +/// Same as [`yuv_444p_n_to_rgb_u16_row`] plus `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "avx512f,avx512bw")] +pub(crate) unsafe fn yuv_444p_n_to_rgba_u16_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + yuv_444p_n_to_rgb_or_rgba_u16_row::(y, u, v, rgba_out, width, matrix, full_range); + } +} + +/// Shared AVX-512 high-bit YUV 4:4:4 → native-depth `u16` kernel. 
+/// `ALPHA = false` writes RGB triples via 8× `write_quarter`; +/// `ALPHA = true` writes RGBA quads via 8× `write_quarter_rgba` with +/// constant alpha `(1 << BITS) - 1`. +/// +/// # Safety +/// +/// 1. **AVX-512F + AVX-512BW must be available.** +/// 2. `y.len() >= width`, `u.len() >= width`, `v.len() >= width`, +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. +/// 3. `BITS` ∈ `{9, 10, 12, 14}`. +#[inline] +#[target_feature(enable = "avx512f,avx512bw")] +pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_u16_row( + y: &[u16], + u: &[u16], + v: &[u16], + out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, ) { const { assert!(BITS == 9 || BITS == 10 || BITS == 12 || BITS == 14) }; + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert!(y.len() >= width); debug_assert!(u.len() >= width); debug_assert!(v.len() >= width); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); @@ -1087,6 +1143,7 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_u16_row( let cgv = _mm512_set1_epi32(coeffs.g_v()); let cbu = _mm512_set1_epi32(coeffs.b_u()); let cbv = _mm512_set1_epi32(coeffs.b_v()); + let alpha_u16 = _mm_set1_epi16(out_max); let pack_fixup = _mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7); @@ -1175,29 +1232,46 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_u16_row( let b_lo = clamp_u16_max_x32(_mm512_adds_epi16(y_scaled_lo, b_chroma_lo), zero_v, max_v); let b_hi = clamp_u16_max_x32(_mm512_adds_epi16(y_scaled_hi, b_chroma_hi), zero_v, max_v); - let dst = rgb_out.as_mut_ptr().add(x * 3); - write_quarter(r_lo, g_lo, b_lo, 0, dst); - write_quarter(r_lo, g_lo, b_lo, 1, dst.add(24)); - write_quarter(r_lo, g_lo, b_lo, 2, dst.add(48)); - write_quarter(r_lo, g_lo, b_lo, 3, dst.add(72)); - write_quarter(r_hi, g_hi, b_hi, 0, dst.add(96)); - write_quarter(r_hi, g_hi, b_hi, 1, dst.add(120)); - write_quarter(r_hi, 
g_hi, b_hi, 2, dst.add(144)); - write_quarter(r_hi, g_hi, b_hi, 3, dst.add(168)); + if ALPHA { + let dst = out.as_mut_ptr().add(x * 4); + write_quarter_rgba(r_lo, g_lo, b_lo, alpha_u16, 0, dst); + write_quarter_rgba(r_lo, g_lo, b_lo, alpha_u16, 1, dst.add(32)); + write_quarter_rgba(r_lo, g_lo, b_lo, alpha_u16, 2, dst.add(64)); + write_quarter_rgba(r_lo, g_lo, b_lo, alpha_u16, 3, dst.add(96)); + write_quarter_rgba(r_hi, g_hi, b_hi, alpha_u16, 0, dst.add(128)); + write_quarter_rgba(r_hi, g_hi, b_hi, alpha_u16, 1, dst.add(160)); + write_quarter_rgba(r_hi, g_hi, b_hi, alpha_u16, 2, dst.add(192)); + write_quarter_rgba(r_hi, g_hi, b_hi, alpha_u16, 3, dst.add(224)); + } else { + let dst = out.as_mut_ptr().add(x * 3); + write_quarter(r_lo, g_lo, b_lo, 0, dst); + write_quarter(r_lo, g_lo, b_lo, 1, dst.add(24)); + write_quarter(r_lo, g_lo, b_lo, 2, dst.add(48)); + write_quarter(r_lo, g_lo, b_lo, 3, dst.add(72)); + write_quarter(r_hi, g_hi, b_hi, 0, dst.add(96)); + write_quarter(r_hi, g_hi, b_hi, 1, dst.add(120)); + write_quarter(r_hi, g_hi, b_hi, 2, dst.add(144)); + write_quarter(r_hi, g_hi, b_hi, 3, dst.add(168)); + } x += 64; } if x < width { - scalar::yuv_444p_n_to_rgb_u16_row::( - &y[x..width], - &u[x..width], - &v[x..width], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_u = &u[x..width]; + let tail_v = &v[x..width]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::yuv_444p_n_to_rgba_u16_row::( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); + } else { + scalar::yuv_444p_n_to_rgb_u16_row::( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); + } } } } @@ -1421,6 +1495,8 @@ pub(crate) unsafe fn yuv_444p16_to_rgb_or_rgba_row( /// [`yuv_420p16_to_rgb_u16_row`] but with full-width chroma loads /// and no duplication step (4:4:4 is 1:1 with Y). 
/// +/// Thin wrapper over [`yuv_444p16_to_rgb_or_rgba_u16_row`] with `ALPHA = false`. +/// /// # Safety /// /// Same as [`yuv_444p16_to_rgb_row`] but `rgb_out: &mut [u16]`. @@ -1435,10 +1511,63 @@ pub(crate) unsafe fn yuv_444p16_to_rgb_u16_row( matrix: ColorMatrix, full_range: bool, ) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + yuv_444p16_to_rgb_or_rgba_u16_row::(y, u, v, rgb_out, width, matrix, full_range); + } +} + +/// AVX-512 sibling of [`yuv_444p16_to_rgba_row`] for native-depth `u16` +/// output. Alpha samples are `0xFFFF`. +/// +/// Thin wrapper over [`yuv_444p16_to_rgb_or_rgba_u16_row`] with `ALPHA = true`. +/// +/// # Safety +/// +/// Same as [`yuv_444p16_to_rgb_u16_row`] plus `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "avx512f,avx512bw")] +pub(crate) unsafe fn yuv_444p16_to_rgba_u16_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + yuv_444p16_to_rgb_or_rgba_u16_row::(y, u, v, rgba_out, width, matrix, full_range); + } +} + +/// Shared AVX-512 16-bit YUV 4:4:4 → native-depth `u16` kernel. +/// `ALPHA = false` writes RGB triples via `write_rgb_u16_32`; +/// `ALPHA = true` writes RGBA quads via `write_rgba_u16_32` with +/// constant alpha `0xFFFF`. +/// +/// # Safety +/// +/// 1. **AVX-512F + AVX-512BW must be available.** +/// 2. `y.len() >= width`, `u.len() >= width`, `v.len() >= width`, +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. 
+#[inline] +#[target_feature(enable = "avx512f,avx512bw")] +pub(crate) unsafe fn yuv_444p16_to_rgb_or_rgba_u16_row( + y: &[u16], + u: &[u16], + v: &[u16], + out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert!(y.len() >= width); debug_assert!(u.len() >= width); debug_assert!(v.len() >= width); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::<16, 16>(full_range); @@ -1446,6 +1575,7 @@ pub(crate) unsafe fn yuv_444p16_to_rgb_u16_row( const RND_I32: i32 = 1 << 14; unsafe { + let alpha_u16 = _mm_set1_epi16(-1i16); let rnd_i64_v = _mm512_set1_epi64(RND_I64); let rnd_i32_v = _mm512_set1_epi32(RND_I32); let y_off_v = _mm512_set1_epi32(y_off); @@ -1544,21 +1674,30 @@ pub(crate) unsafe fn yuv_444p16_to_rgb_u16_row( let g_u16 = _mm512_permutexvar_epi64(pack_fixup, _mm512_packus_epi32(g_lo_i32, g_hi_i32)); let b_u16 = _mm512_permutexvar_epi64(pack_fixup, _mm512_packus_epi32(b_lo_i32, b_hi_i32)); - write_rgb_u16_32(r_u16, g_u16, b_u16, rgb_out.as_mut_ptr().add(x * 3)); + if ALPHA { + write_rgba_u16_32(r_u16, g_u16, b_u16, alpha_u16, out.as_mut_ptr().add(x * 4)); + } else { + write_rgb_u16_32(r_u16, g_u16, b_u16, out.as_mut_ptr().add(x * 3)); + } x += 32; } if x < width { - scalar::yuv_444p16_to_rgb_u16_row( - &y[x..width], - &u[x..width], - &v[x..width], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_u = &u[x..width]; + let tail_v = &v[x..width]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::yuv_444p16_to_rgba_u16_row( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); + } else { + scalar::yuv_444p16_to_rgb_u16_row( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); + } } } } @@ -4104,6 
+4243,8 @@ pub(crate) unsafe fn p_n_444_to_rgb_or_rgba_row( width: usize, matrix: ColorMatrix, full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + p_n_444_to_rgb_or_rgba_u16_row::(y, uv_full, rgb_out, width, matrix, full_range); + } +} + +/// AVX-512 sibling of [`p_n_444_to_rgba_row`] for native-depth `u16` +/// output. Alpha samples are `(1 << BITS) - 1`. +/// +/// Thin wrapper over [`p_n_444_to_rgb_or_rgba_u16_row`] with `ALPHA = true`. +/// +/// # Safety +/// +/// Same as [`p_n_444_to_rgb_u16_row`] plus `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "avx512bw,avx512f")] +pub(crate) unsafe fn p_n_444_to_rgba_u16_row( + y: &[u16], + uv_full: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + p_n_444_to_rgb_or_rgba_u16_row::(y, uv_full, rgba_out, width, matrix, full_range); + } +} + +/// Shared AVX-512 Pn 4:4:4 high-bit-packed → native-depth `u16` kernel. +/// `ALPHA = false` writes RGB triples via 8× `write_quarter`; +/// `ALPHA = true` writes RGBA quads via 8× `write_quarter_rgba` with +/// constant alpha `(1 << BITS) - 1`. +/// +/// # Safety +/// +/// 1. **AVX-512F + AVX-512BW must be available.** +/// 2. `y.len() >= width`, `uv_full.len() >= 2 * width`, +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. +/// 3. `BITS` ∈ `{10, 12}`. 
+#[inline] +#[target_feature(enable = "avx512bw,avx512f")] +pub(crate) unsafe fn p_n_444_to_rgb_or_rgba_u16_row( + y: &[u16], + uv_full: &[u16], + out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, ) { const { assert!(BITS == 10 || BITS == 12) }; + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert!(y.len() >= width); debug_assert!(uv_full.len() >= 2 * width); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); @@ -4143,6 +4336,7 @@ pub(crate) unsafe fn p_n_444_to_rgb_u16_row( let cgv = _mm512_set1_epi32(coeffs.g_v()); let cbu = _mm512_set1_epi32(coeffs.b_u()); let cbv = _mm512_set1_epi32(coeffs.b_v()); + let alpha_u16 = _mm_set1_epi16(out_max); let pack_fixup = _mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7); @@ -4235,28 +4429,45 @@ pub(crate) unsafe fn p_n_444_to_rgb_u16_row( let b_lo = clamp_u16_max_x32(_mm512_adds_epi16(y_scaled_lo, b_chroma_lo), zero_v, max_v); let b_hi = clamp_u16_max_x32(_mm512_adds_epi16(y_scaled_hi, b_chroma_hi), zero_v, max_v); - let dst = rgb_out.as_mut_ptr().add(x * 3); - write_quarter(r_lo, g_lo, b_lo, 0, dst); - write_quarter(r_lo, g_lo, b_lo, 1, dst.add(24)); - write_quarter(r_lo, g_lo, b_lo, 2, dst.add(48)); - write_quarter(r_lo, g_lo, b_lo, 3, dst.add(72)); - write_quarter(r_hi, g_hi, b_hi, 0, dst.add(96)); - write_quarter(r_hi, g_hi, b_hi, 1, dst.add(120)); - write_quarter(r_hi, g_hi, b_hi, 2, dst.add(144)); - write_quarter(r_hi, g_hi, b_hi, 3, dst.add(168)); + if ALPHA { + let dst = out.as_mut_ptr().add(x * 4); + write_quarter_rgba(r_lo, g_lo, b_lo, alpha_u16, 0, dst); + write_quarter_rgba(r_lo, g_lo, b_lo, alpha_u16, 1, dst.add(32)); + write_quarter_rgba(r_lo, g_lo, b_lo, alpha_u16, 2, dst.add(64)); + write_quarter_rgba(r_lo, g_lo, b_lo, alpha_u16, 3, dst.add(96)); + write_quarter_rgba(r_hi, g_hi, b_hi, alpha_u16, 0, dst.add(128)); + 
write_quarter_rgba(r_hi, g_hi, b_hi, alpha_u16, 1, dst.add(160)); + write_quarter_rgba(r_hi, g_hi, b_hi, alpha_u16, 2, dst.add(192)); + write_quarter_rgba(r_hi, g_hi, b_hi, alpha_u16, 3, dst.add(224)); + } else { + let dst = out.as_mut_ptr().add(x * 3); + write_quarter(r_lo, g_lo, b_lo, 0, dst); + write_quarter(r_lo, g_lo, b_lo, 1, dst.add(24)); + write_quarter(r_lo, g_lo, b_lo, 2, dst.add(48)); + write_quarter(r_lo, g_lo, b_lo, 3, dst.add(72)); + write_quarter(r_hi, g_hi, b_hi, 0, dst.add(96)); + write_quarter(r_hi, g_hi, b_hi, 1, dst.add(120)); + write_quarter(r_hi, g_hi, b_hi, 2, dst.add(144)); + write_quarter(r_hi, g_hi, b_hi, 3, dst.add(168)); + } x += 64; } if x < width { - scalar::p_n_444_to_rgb_u16_row::( - &y[x..width], - &uv_full[x * 2..width * 2], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_uv = &uv_full[x * 2..width * 2]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::p_n_444_to_rgba_u16_row::( + tail_y, tail_uv, tail_out, tail_w, matrix, full_range, + ); + } else { + scalar::p_n_444_to_rgb_u16_row::( + tail_y, tail_uv, tail_out, tail_w, matrix, full_range, + ); + } } } } @@ -4469,6 +4680,8 @@ pub(crate) unsafe fn p_n_444_16_to_rgb_or_rgba_row( /// iter (i64 narrows). Native `_mm512_srai_epi64` via /// `chroma_i64x8_avx512` + `scale_y_i32x16_i64` — no bias trick. /// +/// Thin wrapper over [`p_n_444_16_to_rgb_or_rgba_u16_row`] with `ALPHA = false`. +/// /// # Safety /// /// Same as [`p_n_444_16_to_rgb_row`] but `rgb_out: &mut [u16]`. @@ -4482,9 +4695,59 @@ pub(crate) unsafe fn p_n_444_16_to_rgb_u16_row( matrix: ColorMatrix, full_range: bool, ) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + p_n_444_16_to_rgb_or_rgba_u16_row::(y, uv_full, rgb_out, width, matrix, full_range); + } +} + +/// AVX-512 sibling of [`p_n_444_16_to_rgba_row`] for native-depth `u16` +/// output. Alpha samples are `0xFFFF`. 
+/// +/// Thin wrapper over [`p_n_444_16_to_rgb_or_rgba_u16_row`] with `ALPHA = true`. +/// +/// # Safety +/// +/// Same as [`p_n_444_16_to_rgb_u16_row`] plus `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "avx512bw,avx512f")] +pub(crate) unsafe fn p_n_444_16_to_rgba_u16_row( + y: &[u16], + uv_full: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + p_n_444_16_to_rgb_or_rgba_u16_row::(y, uv_full, rgba_out, width, matrix, full_range); + } +} + +/// Shared AVX-512 P416 (semi-planar 4:4:4, 16-bit) → native-depth `u16` +/// kernel. `ALPHA = false` writes RGB triples; `ALPHA = true` writes +/// RGBA quads with constant alpha `0xFFFF`. +/// +/// # Safety +/// +/// 1. **AVX-512F + AVX-512BW must be available.** +/// 2. `y.len() >= width`, `uv_full.len() >= 2 * width`, +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. +#[inline] +#[target_feature(enable = "avx512bw,avx512f")] +pub(crate) unsafe fn p_n_444_16_to_rgb_or_rgba_u16_row( + y: &[u16], + uv_full: &[u16], + out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert!(y.len() >= width); debug_assert!(uv_full.len() >= 2 * width); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::<16, 16>(full_range); @@ -4492,6 +4755,7 @@ pub(crate) unsafe fn p_n_444_16_to_rgb_u16_row( const RND_I32: i32 = 1 << 14; unsafe { + let alpha_u16 = _mm_set1_epi16(-1i16); let rnd_i64_v = _mm512_set1_epi64(RND_I64); let rnd_i32_v = _mm512_set1_epi32(RND_I32); let y_off_v = _mm512_set1_epi32(y_off); @@ -4586,19 +4850,24 @@ pub(crate) unsafe fn p_n_444_16_to_rgb_u16_row( let g_u16 = _mm512_permutexvar_epi64(pack_fixup, _mm512_packus_epi32(g_lo_i32, g_hi_i32)); let b_u16 = 
_mm512_permutexvar_epi64(pack_fixup, _mm512_packus_epi32(b_lo_i32, b_hi_i32)); - write_rgb_u16_32(r_u16, g_u16, b_u16, rgb_out.as_mut_ptr().add(x * 3)); + if ALPHA { + write_rgba_u16_32(r_u16, g_u16, b_u16, alpha_u16, out.as_mut_ptr().add(x * 4)); + } else { + write_rgb_u16_32(r_u16, g_u16, b_u16, out.as_mut_ptr().add(x * 3)); + } x += 32; } if x < width { - scalar::p_n_444_16_to_rgb_u16_row( - &y[x..width], - &uv_full[x * 2..width * 2], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_uv = &uv_full[x * 2..width * 2]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::p_n_444_16_to_rgba_u16_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + } else { + scalar::p_n_444_16_to_rgb_u16_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + } } } } diff --git a/src/row/arch/x86_avx512/tests.rs b/src/row/arch/x86_avx512/tests.rs index b66d3a1..31152c3 100644 --- a/src/row/arch/x86_avx512/tests.rs +++ b/src/row/arch/x86_avx512/tests.rs @@ -2358,3 +2358,205 @@ fn avx512_p416_rgba_matches_scalar_all_matrices() { check_p_n_444_16_u8_avx512_rgba_equivalence(w, ColorMatrix::Bt709, false); } } + +// ---- High-bit 4:4:4 native-depth `u16` RGBA equivalence (Ship 8 Tranche 7c) ---- + +fn check_yuv444p_n_u16_avx512_rgba_equivalence( + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let y = planar_n_plane::(width, 37); + let u = planar_n_plane::(width, 53); + let v = planar_n_plane::(width, 71); + let mut rgba_scalar = std::vec![0u16; width * 4]; + let mut rgba_simd = std::vec![0u16; width * 4]; + scalar::yuv_444p_n_to_rgba_u16_row::( + &y, + &u, + &v, + &mut rgba_scalar, + width, + matrix, + full_range, + ); + unsafe { + yuv_444p_n_to_rgba_u16_row::(&y, &u, &v, &mut rgba_simd, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_simd, + "AVX-512 Yuv444p<{BITS}> → RGBA u16 diverges (width={width}, matrix={matrix:?}, 
full_range={full_range})" + ); +} + +fn check_pn_444_u16_avx512_rgba_equivalence( + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let y = high_bit_plane_avx512::(width, 37); + let u = high_bit_plane_avx512::(width, 53); + let v = high_bit_plane_avx512::(width, 71); + let uv = interleave_uv_avx512(&u, &v); + let mut rgba_scalar = std::vec![0u16; width * 4]; + let mut rgba_simd = std::vec![0u16; width * 4]; + scalar::p_n_444_to_rgba_u16_row::(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + unsafe { + p_n_444_to_rgba_u16_row::(&y, &uv, &mut rgba_simd, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_simd, + "AVX-512 Pn4:4:4<{BITS}> → RGBA u16 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +fn check_yuv444p16_u16_avx512_rgba_equivalence( + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let y = p16_plane_avx512(width, 37); + let u = p16_plane_avx512(width, 53); + let v = p16_plane_avx512(width, 71); + let mut rgba_scalar = std::vec![0u16; width * 4]; + let mut rgba_simd = std::vec![0u16; width * 4]; + scalar::yuv_444p16_to_rgba_u16_row(&y, &u, &v, &mut rgba_scalar, width, matrix, full_range); + unsafe { + yuv_444p16_to_rgba_u16_row(&y, &u, &v, &mut rgba_simd, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_simd, + "AVX-512 Yuv444p16 → RGBA u16 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +fn check_p_n_444_16_u16_avx512_rgba_equivalence( + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let y = p16_plane_avx512(width, 37); + let u = p16_plane_avx512(width, 53); + let v = p16_plane_avx512(width, 71); + let uv = interleave_uv_avx512(&u, &v); + let mut rgba_scalar = std::vec![0u16; width * 4]; + let mut rgba_simd = std::vec![0u16; width * 4]; + scalar::p_n_444_16_to_rgba_u16_row(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + unsafe { + p_n_444_16_to_rgba_u16_row(&y, &uv, &mut rgba_simd, width, matrix, 
full_range); + } + assert_eq!( + rgba_scalar, rgba_simd, + "AVX-512 P416 → RGBA u16 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +#[test] +fn avx512_yuv444p_n_rgba_u16_matches_scalar_all_bits() { + if !std::arch::is_x86_feature_detected!("avx512bw") { + return; + } + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_yuv444p_n_u16_avx512_rgba_equivalence::<9>(64, m, full); + check_yuv444p_n_u16_avx512_rgba_equivalence::<10>(64, m, full); + check_yuv444p_n_u16_avx512_rgba_equivalence::<12>(64, m, full); + check_yuv444p_n_u16_avx512_rgba_equivalence::<14>(64, m, full); + } + } +} + +#[test] +fn avx512_yuv444p_n_rgba_u16_matches_scalar_tail_and_widths() { + if !std::arch::is_x86_feature_detected!("avx512bw") { + return; + } + for w in [17usize, 31, 47, 63, 1920, 1922] { + check_yuv444p_n_u16_avx512_rgba_equivalence::<9>(w, ColorMatrix::Bt601, false); + check_yuv444p_n_u16_avx512_rgba_equivalence::<10>(w, ColorMatrix::Bt709, true); + check_yuv444p_n_u16_avx512_rgba_equivalence::<12>(w, ColorMatrix::Bt2020Ncl, false); + check_yuv444p_n_u16_avx512_rgba_equivalence::<14>(w, ColorMatrix::YCgCo, true); + } +} + +#[test] +fn avx512_pn_444_rgba_u16_matches_scalar_all_bits() { + if !std::arch::is_x86_feature_detected!("avx512bw") { + return; + } + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_pn_444_u16_avx512_rgba_equivalence::<10>(64, m, full); + check_pn_444_u16_avx512_rgba_equivalence::<12>(64, m, full); + } + } +} + +#[test] +fn avx512_pn_444_rgba_u16_matches_scalar_tail_and_widths() { + if !std::arch::is_x86_feature_detected!("avx512bw") { + return; + } + for w in [17usize, 31, 47, 63, 1920, 1922] { + check_pn_444_u16_avx512_rgba_equivalence::<10>(w, 
ColorMatrix::Bt601, false); + check_pn_444_u16_avx512_rgba_equivalence::<12>(w, ColorMatrix::Bt709, true); + } +} + +#[test] +fn avx512_yuv444p16_rgba_u16_matches_scalar_all_matrices() { + if !std::arch::is_x86_feature_detected!("avx512bw") { + return; + } + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_yuv444p16_u16_avx512_rgba_equivalence(32, m, full); + } + } + for w in [17usize, 31, 47, 63, 1920, 1922] { + check_yuv444p16_u16_avx512_rgba_equivalence(w, ColorMatrix::Bt709, false); + } +} + +#[test] +fn avx512_p416_rgba_u16_matches_scalar_all_matrices() { + if !std::arch::is_x86_feature_detected!("avx512bw") { + return; + } + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_p_n_444_16_u16_avx512_rgba_equivalence(32, m, full); + } + } + for w in [17usize, 31, 47, 63, 1920, 1922] { + check_p_n_444_16_u16_avx512_rgba_equivalence(w, ColorMatrix::Bt709, false); + } +} diff --git a/src/row/arch/x86_sse41.rs b/src/row/arch/x86_sse41.rs index ac8ccaa..390b8cc 100644 --- a/src/row/arch/x86_sse41.rs +++ b/src/row/arch/x86_sse41.rs @@ -1278,6 +1278,8 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_row( width: usize, matrix: ColorMatrix, full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + yuv_444p_n_to_rgb_or_rgba_u16_row::(y, u, v, rgb_out, width, matrix, full_range); + } +} + +/// SSE4.1 sibling of [`yuv_444p_n_to_rgba_row`] for native-depth `u16` +/// output. Alpha samples are `(1 << BITS) - 1` (opaque maximum at the +/// input bit depth). +/// +/// Thin wrapper over [`yuv_444p_n_to_rgb_or_rgba_u16_row`] with `ALPHA = true`. +/// +/// # Safety +/// +/// Same as [`yuv_444p_n_to_rgb_u16_row`] plus `rgba_out.len() >= 4 * width`. 
+#[inline]
+#[target_feature(enable = "sse4.1")]
+pub(crate) unsafe fn yuv_444p_n_to_rgba_u16_row<const BITS: usize>(
+    y: &[u16],
+    u: &[u16],
+    v: &[u16],
+    rgba_out: &mut [u16],
+    width: usize,
+    matrix: ColorMatrix,
+    full_range: bool,
+) {
+    // SAFETY: caller obligations forwarded to the shared impl.
+    unsafe {
+        yuv_444p_n_to_rgb_or_rgba_u16_row::<BITS, true>(y, u, v, rgba_out, width, matrix, full_range);
+    }
+}
+
+/// Shared SSE4.1 high-bit YUV 4:4:4 → native-depth `u16` kernel.
+/// `ALPHA = false` writes RGB triples via `write_rgb_u16_8`;
+/// `ALPHA = true` writes RGBA quads via `write_rgba_u16_8` with
+/// constant alpha `(1 << BITS) - 1`.
+///
+/// # Safety
+///
+/// 1. **SSE4.1 must be available on the current CPU.**
+/// 2. `y.len() >= width`, `u.len() >= width`, `v.len() >= width`,
+///    `out.len() >= width * if ALPHA { 4 } else { 3 }`.
+/// 3. `BITS` ∈ `{9, 10, 12, 14}`.
+#[inline]
+#[target_feature(enable = "sse4.1")]
+pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_u16_row<const BITS: usize, const ALPHA: bool>(
+    y: &[u16],
+    u: &[u16],
+    v: &[u16],
+    out: &mut [u16],
+    width: usize,
+    matrix: ColorMatrix,
+    full_range: bool,
 ) {
     // Compile-time guard — `out_max = ((1 << BITS) - 1) as i16` below
     // silently wraps to -1 at BITS=16, corrupting the u16 clamp. The
     // dedicated 16-bit u16-output path is `yuv_444p16_to_rgb_u16_row`.
const { assert!(BITS == 9 || BITS == 10 || BITS == 12 || BITS == 14) }; + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert!(y.len() >= width); debug_assert!(u.len() >= width); debug_assert!(v.len() >= width); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); @@ -1322,6 +1379,7 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_u16_row( let cgv = _mm_set1_epi32(coeffs.g_v()); let cbu = _mm_set1_epi32(coeffs.b_u()); let cbv = _mm_set1_epi32(coeffs.b_v()); + let alpha_u16 = _mm_set1_epi16(out_max); let mut x = 0usize; while x + 16 <= width { @@ -1372,22 +1430,38 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_u16_row( let b_lo = clamp_u16_max(_mm_adds_epi16(y_scaled_lo, b_chroma_lo), zero_v, max_v); let b_hi = clamp_u16_max(_mm_adds_epi16(y_scaled_hi, b_chroma_hi), zero_v, max_v); - write_rgb_u16_8(r_lo, g_lo, b_lo, rgb_out.as_mut_ptr().add(x * 3)); - write_rgb_u16_8(r_hi, g_hi, b_hi, rgb_out.as_mut_ptr().add(x * 3 + 24)); + if ALPHA { + write_rgba_u16_8(r_lo, g_lo, b_lo, alpha_u16, out.as_mut_ptr().add(x * 4)); + write_rgba_u16_8( + r_hi, + g_hi, + b_hi, + alpha_u16, + out.as_mut_ptr().add(x * 4 + 32), + ); + } else { + write_rgb_u16_8(r_lo, g_lo, b_lo, out.as_mut_ptr().add(x * 3)); + write_rgb_u16_8(r_hi, g_hi, b_hi, out.as_mut_ptr().add(x * 3 + 24)); + } x += 16; } if x < width { - scalar::yuv_444p_n_to_rgb_u16_row::( - &y[x..width], - &u[x..width], - &v[x..width], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_u = &u[x..width]; + let tail_v = &v[x..width]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::yuv_444p_n_to_rgba_u16_row::( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); + } else { + scalar::yuv_444p_n_to_rgb_u16_row::( + tail_y, tail_u, tail_v, tail_out, tail_w, 
matrix, full_range, + ); + } } } } @@ -1578,6 +1652,8 @@ pub(crate) unsafe fn yuv_444p16_to_rgb_or_rgba_row( /// U/V (vs 4 half-width), computing 8 chroma values (vs 4 + dup), and /// skipping the chroma-duplication step. /// +/// Thin wrapper over [`yuv_444p16_to_rgb_or_rgba_u16_row`] with `ALPHA = false`. +/// /// # Safety /// /// Same as [`yuv_444p16_to_rgb_row`] but `rgb_out: &mut [u16]`. @@ -1592,16 +1668,69 @@ pub(crate) unsafe fn yuv_444p16_to_rgb_u16_row( matrix: ColorMatrix, full_range: bool, ) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + yuv_444p16_to_rgb_or_rgba_u16_row::(y, u, v, rgb_out, width, matrix, full_range); + } +} + +/// SSE4.1 sibling of [`yuv_444p16_to_rgba_row`] for native-depth `u16` +/// output. Alpha samples are `0xFFFF`. +/// +/// Thin wrapper over [`yuv_444p16_to_rgb_or_rgba_u16_row`] with `ALPHA = true`. +/// +/// # Safety +/// +/// Same as [`yuv_444p16_to_rgb_u16_row`] plus `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "sse4.1")] +pub(crate) unsafe fn yuv_444p16_to_rgba_u16_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + yuv_444p16_to_rgb_or_rgba_u16_row::(y, u, v, rgba_out, width, matrix, full_range); + } +} + +/// Shared SSE4.1 16-bit YUV 4:4:4 → native-depth `u16` kernel. +/// `ALPHA = false` writes RGB triples; `ALPHA = true` writes RGBA +/// quads with constant alpha `0xFFFF`. +/// +/// # Safety +/// +/// 1. **SSE4.1 must be available on the current CPU.** +/// 2. `y.len() >= width`, `u.len() >= width`, `v.len() >= width`, +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. 
+#[inline] +#[target_feature(enable = "sse4.1")] +pub(crate) unsafe fn yuv_444p16_to_rgb_or_rgba_u16_row( + y: &[u16], + u: &[u16], + v: &[u16], + out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert!(y.len() >= width); debug_assert!(u.len() >= width); debug_assert!(v.len() >= width); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::<16, 16>(full_range); const RND: i64 = 1 << 14; unsafe { + let alpha_u16 = _mm_set1_epi16(-1i16); let rnd_v = _mm_set1_epi64x(RND); let y_off_v = _mm_set1_epi32(y_off); let y_scale_v = _mm_set1_epi32(y_scale); @@ -1719,20 +1848,29 @@ pub(crate) unsafe fn yuv_444p16_to_rgb_u16_row( _mm_add_epi32(y_hi_i32, b_ch_hi_i32), ); - write_rgb_u16_8(r_u16, g_u16, b_u16, rgb_out.as_mut_ptr().add(x * 3)); + if ALPHA { + write_rgba_u16_8(r_u16, g_u16, b_u16, alpha_u16, out.as_mut_ptr().add(x * 4)); + } else { + write_rgb_u16_8(r_u16, g_u16, b_u16, out.as_mut_ptr().add(x * 3)); + } x += 8; } if x < width { - scalar::yuv_444p16_to_rgb_u16_row( - &y[x..width], - &u[x..width], - &v[x..width], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_u = &u[x..width]; + let tail_v = &v[x..width]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::yuv_444p16_to_rgba_u16_row( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); + } else { + scalar::yuv_444p16_to_rgb_u16_row( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); + } } } } @@ -3479,6 +3617,8 @@ pub(crate) unsafe fn p_n_444_to_rgb_or_rgba_row( width: usize, matrix: ColorMatrix, full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. 
+ unsafe { + p_n_444_to_rgb_or_rgba_u16_row::<BITS, false>(y, uv_full, rgb_out, width, matrix, full_range); + } +} + +/// SSE4.1 sibling of [`p_n_444_to_rgba_row`] for native-depth `u16` +/// output. Alpha samples are `(1 << BITS) - 1` (opaque maximum at the +/// input bit depth). +/// +/// Thin wrapper over [`p_n_444_to_rgb_or_rgba_u16_row`] with `ALPHA = true`. +/// +/// # Safety +/// +/// Same as [`p_n_444_to_rgb_u16_row`] plus `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "sse4.1")] +pub(crate) unsafe fn p_n_444_to_rgba_u16_row<const BITS: usize>( + y: &[u16], + uv_full: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + p_n_444_to_rgb_or_rgba_u16_row::<BITS, true>(y, uv_full, rgba_out, width, matrix, full_range); + } +} + +/// Shared SSE4.1 Pn 4:4:4 high-bit-packed → native-depth `u16` kernel. +/// `ALPHA = false` writes RGB triples via `write_rgb_u16_8`; +/// `ALPHA = true` writes RGBA quads via `write_rgba_u16_8` with +/// constant alpha `(1 << BITS) - 1`. +/// +/// # Safety +/// +/// 1. SSE4.1 must be available on the current CPU. +/// 2. `y.len() >= width`, `uv_full.len() >= 2 * width`, +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. +/// 3. `BITS` ∈ `{10, 12}`.
+#[inline] +#[target_feature(enable = "sse4.1")] +pub(crate) unsafe fn p_n_444_to_rgb_or_rgba_u16_row( + y: &[u16], + uv_full: &[u16], + out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, ) { const { assert!(BITS == 10 || BITS == 12) }; + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert!(y.len() >= width); debug_assert!(uv_full.len() >= 2 * width); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); @@ -3520,6 +3713,7 @@ pub(crate) unsafe fn p_n_444_to_rgb_u16_row( let cgv = _mm_set1_epi32(coeffs.g_v()); let cbu = _mm_set1_epi32(coeffs.b_u()); let cbv = _mm_set1_epi32(coeffs.b_v()); + let alpha_u16 = _mm_set1_epi16(out_max); let mut x = 0usize; while x + 16 <= width { @@ -3573,21 +3767,37 @@ pub(crate) unsafe fn p_n_444_to_rgb_u16_row( let b_lo = clamp_u16_max(_mm_adds_epi16(y_scaled_lo, b_chroma_lo), zero_v, max_v); let b_hi = clamp_u16_max(_mm_adds_epi16(y_scaled_hi, b_chroma_hi), zero_v, max_v); - write_rgb_u16_8(r_lo, g_lo, b_lo, rgb_out.as_mut_ptr().add(x * 3)); - write_rgb_u16_8(r_hi, g_hi, b_hi, rgb_out.as_mut_ptr().add(x * 3 + 24)); + if ALPHA { + write_rgba_u16_8(r_lo, g_lo, b_lo, alpha_u16, out.as_mut_ptr().add(x * 4)); + write_rgba_u16_8( + r_hi, + g_hi, + b_hi, + alpha_u16, + out.as_mut_ptr().add(x * 4 + 32), + ); + } else { + write_rgb_u16_8(r_lo, g_lo, b_lo, out.as_mut_ptr().add(x * 3)); + write_rgb_u16_8(r_hi, g_hi, b_hi, out.as_mut_ptr().add(x * 3 + 24)); + } x += 16; } if x < width { - scalar::p_n_444_to_rgb_u16_row::( - &y[x..width], - &uv_full[x * 2..width * 2], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_uv = &uv_full[x * 2..width * 2]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::p_n_444_to_rgba_u16_row::( + tail_y, tail_uv, 
tail_out, tail_w, matrix, full_range, + ); + } else { + scalar::p_n_444_to_rgb_u16_row::( + tail_y, tail_uv, tail_out, tail_w, matrix, full_range, + ); + } } } } @@ -3766,6 +3976,8 @@ pub(crate) unsafe fn p_n_444_16_to_rgb_or_rgba_row( /// `_mm_mul_epi32` + `srai64_15` bias trick (mirroring /// `yuv_444p16_to_rgb_u16_row`). 8 pixels per iter. /// +/// Thin wrapper over [`p_n_444_16_to_rgb_or_rgba_u16_row`] with `ALPHA = false`. +/// /// # Safety /// /// Same as [`p_n_444_16_to_rgb_row`] but `rgb_out: &mut [u16]`. @@ -3779,15 +3991,66 @@ pub(crate) unsafe fn p_n_444_16_to_rgb_u16_row( matrix: ColorMatrix, full_range: bool, ) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + p_n_444_16_to_rgb_or_rgba_u16_row::(y, uv_full, rgb_out, width, matrix, full_range); + } +} + +/// SSE4.1 sibling of [`p_n_444_16_to_rgba_row`] for native-depth `u16` +/// output. Alpha samples are `0xFFFF`. +/// +/// Thin wrapper over [`p_n_444_16_to_rgb_or_rgba_u16_row`] with `ALPHA = true`. +/// +/// # Safety +/// +/// Same as [`p_n_444_16_to_rgb_u16_row`] plus `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "sse4.1")] +pub(crate) unsafe fn p_n_444_16_to_rgba_u16_row( + y: &[u16], + uv_full: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + p_n_444_16_to_rgb_or_rgba_u16_row::(y, uv_full, rgba_out, width, matrix, full_range); + } +} + +/// Shared SSE4.1 P416 (semi-planar 4:4:4, 16-bit) → native-depth `u16` +/// kernel. `ALPHA = false` writes RGB triples; `ALPHA = true` writes +/// RGBA quads with constant alpha `0xFFFF`. +/// +/// # Safety +/// +/// 1. SSE4.1 must be available. +/// 2. `y.len() >= width`, `uv_full.len() >= 2 * width`, +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. 
+#[inline] +#[target_feature(enable = "sse4.1")] +pub(crate) unsafe fn p_n_444_16_to_rgb_or_rgba_u16_row( + y: &[u16], + uv_full: &[u16], + out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert!(y.len() >= width); debug_assert!(uv_full.len() >= 2 * width); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::<16, 16>(full_range); const RND: i64 = 1 << 14; unsafe { + let alpha_u16 = _mm_set1_epi16(-1i16); let rnd_v = _mm_set1_epi64x(RND); let y_off_v = _mm_set1_epi32(y_off); let y_scale_v = _mm_set1_epi32(y_scale); @@ -3900,19 +4163,24 @@ pub(crate) unsafe fn p_n_444_16_to_rgb_u16_row( _mm_add_epi32(y_hi_i32, b_ch_hi_i32), ); - write_rgb_u16_8(r_u16, g_u16, b_u16, rgb_out.as_mut_ptr().add(x * 3)); + if ALPHA { + write_rgba_u16_8(r_u16, g_u16, b_u16, alpha_u16, out.as_mut_ptr().add(x * 4)); + } else { + write_rgb_u16_8(r_u16, g_u16, b_u16, out.as_mut_ptr().add(x * 3)); + } x += 8; } if x < width { - scalar::p_n_444_16_to_rgb_u16_row( - &y[x..width], - &uv_full[x * 2..width * 2], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_uv = &uv_full[x * 2..width * 2]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::p_n_444_16_to_rgba_u16_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + } else { + scalar::p_n_444_16_to_rgb_u16_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + } } } } diff --git a/src/row/arch/x86_sse41/tests.rs b/src/row/arch/x86_sse41/tests.rs index 330a59a..253925b 100644 --- a/src/row/arch/x86_sse41/tests.rs +++ b/src/row/arch/x86_sse41/tests.rs @@ -2374,3 +2374,201 @@ fn sse41_p416_rgba_matches_scalar_all_matrices() { check_p_n_444_16_u8_sse41_rgba_equivalence(w, ColorMatrix::Bt709, 
false); } } + +// ---- High-bit 4:4:4 native-depth `u16` RGBA equivalence (Ship 8 Tranche 7c) ---- + +fn check_yuv444p_n_u16_sse41_rgba_equivalence( + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let y = planar_n_plane::(width, 37); + let u = planar_n_plane::(width, 53); + let v = planar_n_plane::(width, 71); + let mut rgba_scalar = std::vec![0u16; width * 4]; + let mut rgba_simd = std::vec![0u16; width * 4]; + scalar::yuv_444p_n_to_rgba_u16_row::( + &y, + &u, + &v, + &mut rgba_scalar, + width, + matrix, + full_range, + ); + unsafe { + yuv_444p_n_to_rgba_u16_row::(&y, &u, &v, &mut rgba_simd, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_simd, + "SSE4.1 Yuv444p<{BITS}> → RGBA u16 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +fn check_pn_444_u16_sse41_rgba_equivalence( + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let y = high_bit_plane_sse41::(width, 37); + let u = high_bit_plane_sse41::(width, 53); + let v = high_bit_plane_sse41::(width, 71); + let uv = interleave_uv_sse41(&u, &v); + let mut rgba_scalar = std::vec![0u16; width * 4]; + let mut rgba_simd = std::vec![0u16; width * 4]; + scalar::p_n_444_to_rgba_u16_row::(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + unsafe { + p_n_444_to_rgba_u16_row::(&y, &uv, &mut rgba_simd, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_simd, + "SSE4.1 Pn4:4:4<{BITS}> → RGBA u16 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +fn check_yuv444p16_u16_sse41_rgba_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { + let y = p16_plane(width, 37); + let u = p16_plane(width, 53); + let v = p16_plane(width, 71); + let mut rgba_scalar = std::vec![0u16; width * 4]; + let mut rgba_simd = std::vec![0u16; width * 4]; + scalar::yuv_444p16_to_rgba_u16_row(&y, &u, &v, &mut rgba_scalar, width, matrix, full_range); + unsafe { + yuv_444p16_to_rgba_u16_row(&y, &u, &v, &mut rgba_simd, 
width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_simd, + "SSE4.1 Yuv444p16 → RGBA u16 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +fn check_p_n_444_16_u16_sse41_rgba_equivalence( + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let y = p16_plane(width, 37); + let u = p16_plane(width, 53); + let v = p16_plane(width, 71); + let uv = interleave_uv_sse41(&u, &v); + let mut rgba_scalar = std::vec![0u16; width * 4]; + let mut rgba_simd = std::vec![0u16; width * 4]; + scalar::p_n_444_16_to_rgba_u16_row(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + unsafe { + p_n_444_16_to_rgba_u16_row(&y, &uv, &mut rgba_simd, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_simd, + "SSE4.1 P416 → RGBA u16 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +#[test] +fn sse41_yuv444p_n_rgba_u16_matches_scalar_all_bits() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_yuv444p_n_u16_sse41_rgba_equivalence::<9>(16, m, full); + check_yuv444p_n_u16_sse41_rgba_equivalence::<10>(16, m, full); + check_yuv444p_n_u16_sse41_rgba_equivalence::<12>(16, m, full); + check_yuv444p_n_u16_sse41_rgba_equivalence::<14>(16, m, full); + } + } +} + +#[test] +fn sse41_yuv444p_n_rgba_u16_matches_scalar_tail_and_widths() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + for w in [17usize, 31, 47, 63, 1920, 1922] { + check_yuv444p_n_u16_sse41_rgba_equivalence::<9>(w, ColorMatrix::Bt601, false); + check_yuv444p_n_u16_sse41_rgba_equivalence::<10>(w, ColorMatrix::Bt709, true); + check_yuv444p_n_u16_sse41_rgba_equivalence::<12>(w, ColorMatrix::Bt2020Ncl, false); + check_yuv444p_n_u16_sse41_rgba_equivalence::<14>(w, ColorMatrix::YCgCo, true); + } +} + +#[test] 
+fn sse41_pn_444_rgba_u16_matches_scalar_all_bits() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_pn_444_u16_sse41_rgba_equivalence::<10>(16, m, full); + check_pn_444_u16_sse41_rgba_equivalence::<12>(16, m, full); + } + } +} + +#[test] +fn sse41_pn_444_rgba_u16_matches_scalar_tail_and_widths() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + for w in [17usize, 31, 47, 63, 1920, 1922] { + check_pn_444_u16_sse41_rgba_equivalence::<10>(w, ColorMatrix::Bt601, false); + check_pn_444_u16_sse41_rgba_equivalence::<12>(w, ColorMatrix::Bt709, true); + } +} + +#[test] +fn sse41_yuv444p16_rgba_u16_matches_scalar_all_matrices() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_yuv444p16_u16_sse41_rgba_equivalence(16, m, full); + } + } + for w in [17usize, 31, 47, 63, 1920, 1922] { + check_yuv444p16_u16_sse41_rgba_equivalence(w, ColorMatrix::Bt709, false); + } +} + +#[test] +fn sse41_p416_rgba_u16_matches_scalar_all_matrices() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_p_n_444_16_u16_sse41_rgba_equivalence(16, m, full); + } + } + for w in [17usize, 31, 47, 63, 1920, 1922] { + check_p_n_444_16_u16_sse41_rgba_equivalence(w, ColorMatrix::Bt709, false); + } +} diff --git a/src/row/mod.rs b/src/row/mod.rs index cec5c09..dfbbd1d 100644 --- a/src/row/mod.rs +++ b/src/row/mod.rs @@ -4266,13 +4266,9 @@ pub fn 
p412_to_rgb_u16_row( // ---- High-bit 4:4:4 RGBA dispatchers (Ship 8 Tranche 7) --------------- // -// u8 RGBA dispatchers route to per-arch SIMD kernels (Ship 8 Tranche -// 7b). The native-depth `u16` RGBA dispatchers stay on the scalar -// reference path until the follow-up Ship 8 Tranche 7c PR; the -// `use_simd` parameter is held in their signatures for API stability, -// but their bodies remain `let _ = use_simd;` plus a scalar call -// until the SIMD wiring lands. `use_simd = false` forces the scalar -// reference path on every dispatcher. +// Both u8 and native-depth `u16` RGBA dispatchers route to per-arch +// SIMD kernels (Ship 8 Tranches 7b + 7c). `use_simd = false` forces +// the scalar reference path on every dispatcher. /// Converts one row of **9-bit** YUV 4:4:4 to packed **8-bit** /// **RGBA** (`R, G, B, 0xFF`; alpha defaults to opaque since the @@ -4376,7 +4372,53 @@ pub fn yuv444p9_to_rgba_u16_row( assert!(v.len() >= width, "v row too short"); assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - let _ = use_simd; // SIMD per-arch routes land in Ship 8 Tranche 7c. + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_444p_n_to_rgba_u16_row::<9>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_444p_n_to_rgba_u16_row::<9>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_444p_n_to_rgba_u16_row::<9>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. 
+ unsafe { + arch::x86_sse41::yuv_444p_n_to_rgba_u16_row::<9>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_444p_n_to_rgba_u16_row::<9>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + scalar::yuv_444p_n_to_rgba_u16_row::<9>(y, u, v, rgba_out, width, matrix, full_range); } @@ -4475,7 +4517,53 @@ pub fn yuv444p10_to_rgba_u16_row( assert!(v.len() >= width, "v row too short"); assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - let _ = use_simd; // SIMD per-arch routes land in Ship 8 Tranche 7c. + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_444p_n_to_rgba_u16_row::<10>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_444p_n_to_rgba_u16_row::<10>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_444p_n_to_rgba_u16_row::<10>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_444p_n_to_rgba_u16_row::<10>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. 
+ unsafe { + arch::wasm_simd128::yuv_444p_n_to_rgba_u16_row::<10>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + scalar::yuv_444p_n_to_rgba_u16_row::<10>(y, u, v, rgba_out, width, matrix, full_range); } @@ -4574,7 +4662,53 @@ pub fn yuv444p12_to_rgba_u16_row( assert!(v.len() >= width, "v row too short"); assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - let _ = use_simd; // SIMD per-arch routes land in Ship 8 Tranche 7c. + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_444p_n_to_rgba_u16_row::<12>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_444p_n_to_rgba_u16_row::<12>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_444p_n_to_rgba_u16_row::<12>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_444p_n_to_rgba_u16_row::<12>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_444p_n_to_rgba_u16_row::<12>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + scalar::yuv_444p_n_to_rgba_u16_row::<12>(y, u, v, rgba_out, width, matrix, full_range); } @@ -4673,7 +4807,53 @@ pub fn yuv444p14_to_rgba_u16_row( assert!(v.len() >= width, "v row too short"); assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - let _ = use_simd; // SIMD per-arch routes land in Ship 8 Tranche 7c. + if use_simd { + cfg_select! 
{ + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_444p_n_to_rgba_u16_row::<14>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_444p_n_to_rgba_u16_row::<14>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_444p_n_to_rgba_u16_row::<14>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_444p_n_to_rgba_u16_row::<14>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_444p_n_to_rgba_u16_row::<14>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + scalar::yuv_444p_n_to_rgba_u16_row::<14>(y, u, v, rgba_out, width, matrix, full_range); } @@ -4774,7 +4954,53 @@ pub fn yuv444p16_to_rgba_u16_row( assert!(v.len() >= width, "v row too short"); assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - let _ = use_simd; // SIMD per-arch routes land in Ship 8 Tranche 7c. + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_444p16_to_rgba_u16_row(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_444p16_to_rgba_u16_row(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. 
+ unsafe { + arch::x86_avx2::yuv_444p16_to_rgba_u16_row(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_444p16_to_rgba_u16_row(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_444p16_to_rgba_u16_row(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + scalar::yuv_444p16_to_rgba_u16_row(y, u, v, rgba_out, width, matrix, full_range); } @@ -4870,7 +5096,53 @@ pub fn p410_to_rgba_u16_row( assert!(uv_full.len() >= uv_min, "uv_full row too short"); assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - let _ = use_simd; // SIMD per-arch routes land in Ship 8 Tranche 7c. + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::p_n_444_to_rgba_u16_row::<10>(y, uv_full, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::p_n_444_to_rgba_u16_row::<10>(y, uv_full, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::p_n_444_to_rgba_u16_row::<10>(y, uv_full, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::p_n_444_to_rgba_u16_row::<10>(y, uv_full, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. 
+ unsafe { + arch::wasm_simd128::p_n_444_to_rgba_u16_row::<10>(y, uv_full, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + scalar::p_n_444_to_rgba_u16_row::<10>(y, uv_full, rgba_out, width, matrix, full_range); } @@ -4966,7 +5238,53 @@ pub fn p412_to_rgba_u16_row( assert!(uv_full.len() >= uv_min, "uv_full row too short"); assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - let _ = use_simd; // SIMD per-arch routes land in Ship 8 Tranche 7c. + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::p_n_444_to_rgba_u16_row::<12>(y, uv_full, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::p_n_444_to_rgba_u16_row::<12>(y, uv_full, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::p_n_444_to_rgba_u16_row::<12>(y, uv_full, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::p_n_444_to_rgba_u16_row::<12>(y, uv_full, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::p_n_444_to_rgba_u16_row::<12>(y, uv_full, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + scalar::p_n_444_to_rgba_u16_row::<12>(y, uv_full, rgba_out, width, matrix, full_range); } @@ -5065,7 +5383,53 @@ pub fn p416_to_rgba_u16_row( assert!(uv_full.len() >= uv_min, "uv_full row too short"); assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - let _ = use_simd; // SIMD per-arch routes land in Ship 8 Tranche 7c. + if use_simd { + cfg_select! 
{ + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::p_n_444_16_to_rgba_u16_row(y, uv_full, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::p_n_444_16_to_rgba_u16_row(y, uv_full, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::p_n_444_16_to_rgba_u16_row(y, uv_full, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::p_n_444_16_to_rgba_u16_row(y, uv_full, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::p_n_444_16_to_rgba_u16_row(y, uv_full, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + scalar::p_n_444_16_to_rgba_u16_row(y, uv_full, rgba_out, width, matrix, full_range); } diff --git a/src/sinker/mixed/planar_8bit.rs b/src/sinker/mixed/planar_8bit.rs index 41d4707..375b3dc 100644 --- a/src/sinker/mixed/planar_8bit.rs +++ b/src/sinker/mixed/planar_8bit.rs @@ -31,12 +31,13 @@ impl<'a> MixedSinker<'a, Yuv420p> { /// /// ```compile_fail /// // Attaching RGBA to a sink that doesn't write it is rejected - /// // at compile time. Yuv444p10 (10‑bit 4:4:4 planar) has not yet - /// // been wired for RGBA — once a future tranche lands it the - /// // negative example here moves to the next not‑yet‑wired format. - /// use colconv::{sinker::MixedSinker, yuv::Yuv444p10}; + /// // at compile time. `Bayer` (RAW Bayer-mosaic) has no RGBA path — + /// // there's no inherent alpha channel and the format demosaics to + /// // RGB only. Once / if a future PR adds RGBA, the negative example + /// // here moves to the next not‑yet‑wired format. 
+ /// use colconv::{sinker::MixedSinker, raw::Bayer}; /// let mut buf = vec![0u8; 16 * 8 * 4]; - /// let _ = MixedSinker::<Yuv444p10>::new(16, 8).with_rgba(&mut buf); + /// let _ = MixedSinker::<Bayer>::new(16, 8).with_rgba(&mut buf); /// ``` #[cfg_attr(not(tarpaulin), inline(always))] pub fn with_rgba(mut self, buf: &'a mut [u8]) -> Result<Self, MixedSinkerError> { diff --git a/src/sinker/mixed/subsampled_4_2_2_high_bit.rs b/src/sinker/mixed/subsampled_4_2_2_high_bit.rs index 0f1a062..8c5c4d9 100644 --- a/src/sinker/mixed/subsampled_4_2_2_high_bit.rs +++ b/src/sinker/mixed/subsampled_4_2_2_high_bit.rs @@ -1330,6 +1330,48 @@ impl<'a> MixedSinker<'a, Yuv440p10> { self.rgb_u16 = Some(buf); Ok(self) } + + /// Attaches a packed **8-bit** RGBA output buffer. Yuv440p10 reuses + /// the `BITS = 10` 4:4:4 RGBA kernel; alpha = `0xFF`. + #[cfg_attr(not(tarpaulin), inline(always))] + pub fn with_rgba(mut self, buf: &'a mut [u8]) -> Result<Self, MixedSinkerError> { + self.set_rgba(buf)?; + Ok(self) + } + /// In-place variant of [`with_rgba`](Self::with_rgba). + #[cfg_attr(not(tarpaulin), inline(always))] + pub fn set_rgba(&mut self, buf: &'a mut [u8]) -> Result<&mut Self, MixedSinkerError> { + let expected = self.frame_bytes(4)?; + if buf.len() < expected { + return Err(MixedSinkerError::RgbaBufferTooShort { + expected, + actual: buf.len(), + }); + } + self.rgba = Some(buf); + Ok(self) + } + + /// Attaches a packed **`u16`** RGBA output buffer. 10-bit low-packed + /// (`[0, 1023]`); alpha element is `1023`. + #[cfg_attr(not(tarpaulin), inline(always))] + pub fn with_rgba_u16(mut self, buf: &'a mut [u16]) -> Result<Self, MixedSinkerError> { + self.set_rgba_u16(buf)?; + Ok(self) + } + /// In-place variant of [`with_rgba_u16`](Self::with_rgba_u16).
+ #[cfg_attr(not(tarpaulin), inline(always))] + pub fn set_rgba_u16(&mut self, buf: &'a mut [u16]) -> Result<&mut Self, MixedSinkerError> { + let expected = self.frame_bytes(4)?; + if buf.len() < expected { + return Err(MixedSinkerError::RgbaU16BufferTooShort { + expected, + actual: buf.len(), + }); + } + self.rgba_u16 = Some(buf); + Ok(self) + } } impl Yuv440p10Sink for MixedSinker<'_, Yuv440p10> {} @@ -1383,6 +1425,8 @@ impl PixelSink for MixedSinker<'_, Yuv440p10> { let Self { rgb, rgb_u16, + rgba, + rgba_u16, luma, hsv, rgb_scratch, @@ -1398,7 +1442,26 @@ impl PixelSink for MixedSinker<'_, Yuv440p10> { } } - if let Some(buf) = rgb_u16.as_deref_mut() { + // ===== u16 RGB / RGBA path (Strategy A) ===== + let want_rgb_u16 = rgb_u16.is_some(); + let want_rgba_u16 = rgba_u16.is_some(); + + if want_rgba_u16 && !want_rgb_u16 { + let rgba_u16_buf = rgba_u16.as_deref_mut().unwrap(); + let rgba_u16_row = + rgba_u16_plane_row_slice(rgba_u16_buf, one_plane_start, one_plane_end, w, h)?; + yuv444p10_to_rgba_u16_row( + row.y(), + row.u(), + row.v(), + rgba_u16_row, + w, + row.matrix(), + row.full_range(), + use_simd, + ); + } else if want_rgb_u16 { + let rgb_u16_buf = rgb_u16.as_deref_mut().unwrap(); let rgb_plane_end = one_plane_end .checked_mul(3) @@ -1408,19 +1471,47 @@ impl PixelSink for MixedSinker<'_, Yuv440p10> { channels: 3, })?; let rgb_plane_start = one_plane_start * 3; + let rgb_u16_row = &mut rgb_u16_buf[rgb_plane_start..rgb_plane_end]; yuv444p10_to_rgb_u16_row( row.y(), row.u(), row.v(), - &mut buf[rgb_plane_start..rgb_plane_end], + rgb_u16_row, w, row.matrix(), row.full_range(), use_simd, ); + if want_rgba_u16 { + let rgba_u16_buf = rgba_u16.as_deref_mut().unwrap(); + let rgba_u16_row = + rgba_u16_plane_row_slice(rgba_u16_buf, one_plane_start, one_plane_end, w, h)?; + expand_rgb_u16_to_rgba_u16_row::(rgb_u16_row, rgba_u16_row, w); + } } - if rgb.is_none() && hsv.is_none() { + // ===== u8 RGB / RGBA / HSV path (Strategy A) ===== + let want_rgba = rgba.is_some(); 
+ let want_hsv = hsv.is_some(); + let need_rgb_kernel = rgb.is_some() || want_hsv; + + if want_rgba && !need_rgb_kernel { + let rgba_buf = rgba.as_deref_mut().unwrap(); + let rgba_row = rgba_plane_row_slice(rgba_buf, one_plane_start, one_plane_end, w, h)?; + yuv444p10_to_rgba_row( + row.y(), + row.u(), + row.v(), + rgba_row, + w, + row.matrix(), + row.full_range(), + use_simd, + ); + return Ok(()); + } + + if !need_rgb_kernel { return Ok(()); } @@ -1454,6 +1545,12 @@ impl PixelSink for MixedSinker<'_, Yuv440p10> { use_simd, ); } + + if let Some(buf) = rgba.as_deref_mut() { + let rgba_row = rgba_plane_row_slice(buf, one_plane_start, one_plane_end, w, h)?; + expand_rgb_to_rgba_row(rgb_row, rgba_row, w); + } + Ok(()) } } @@ -1480,6 +1577,48 @@ impl<'a> MixedSinker<'a, Yuv440p12> { self.rgb_u16 = Some(buf); Ok(self) } + + /// Attaches a packed **8-bit** RGBA output buffer. Yuv440p12 reuses + /// the `BITS = 12` 4:4:4 RGBA kernel; alpha = `0xFF`. + #[cfg_attr(not(tarpaulin), inline(always))] + pub fn with_rgba(mut self, buf: &'a mut [u8]) -> Result { + self.set_rgba(buf)?; + Ok(self) + } + /// In-place variant of [`with_rgba`](Self::with_rgba). + #[cfg_attr(not(tarpaulin), inline(always))] + pub fn set_rgba(&mut self, buf: &'a mut [u8]) -> Result<&mut Self, MixedSinkerError> { + let expected = self.frame_bytes(4)?; + if buf.len() < expected { + return Err(MixedSinkerError::RgbaBufferTooShort { + expected, + actual: buf.len(), + }); + } + self.rgba = Some(buf); + Ok(self) + } + + /// Attaches a packed **`u16`** RGBA output buffer. 12-bit low-packed + /// (`[0, 4095]`); alpha element is `4095`. + #[cfg_attr(not(tarpaulin), inline(always))] + pub fn with_rgba_u16(mut self, buf: &'a mut [u16]) -> Result { + self.set_rgba_u16(buf)?; + Ok(self) + } + /// In-place variant of [`with_rgba_u16`](Self::with_rgba_u16). 
+ #[cfg_attr(not(tarpaulin), inline(always))] + pub fn set_rgba_u16(&mut self, buf: &'a mut [u16]) -> Result<&mut Self, MixedSinkerError> { + let expected = self.frame_bytes(4)?; + if buf.len() < expected { + return Err(MixedSinkerError::RgbaU16BufferTooShort { + expected, + actual: buf.len(), + }); + } + self.rgba_u16 = Some(buf); + Ok(self) + } } impl Yuv440p12Sink for MixedSinker<'_, Yuv440p12> {} @@ -1533,6 +1672,8 @@ impl PixelSink for MixedSinker<'_, Yuv440p12> { let Self { rgb, rgb_u16, + rgba, + rgba_u16, luma, hsv, rgb_scratch, @@ -1548,7 +1689,26 @@ impl PixelSink for MixedSinker<'_, Yuv440p12> { } } - if let Some(buf) = rgb_u16.as_deref_mut() { + // ===== u16 RGB / RGBA path (Strategy A) ===== + let want_rgb_u16 = rgb_u16.is_some(); + let want_rgba_u16 = rgba_u16.is_some(); + + if want_rgba_u16 && !want_rgb_u16 { + let rgba_u16_buf = rgba_u16.as_deref_mut().unwrap(); + let rgba_u16_row = + rgba_u16_plane_row_slice(rgba_u16_buf, one_plane_start, one_plane_end, w, h)?; + yuv444p12_to_rgba_u16_row( + row.y(), + row.u(), + row.v(), + rgba_u16_row, + w, + row.matrix(), + row.full_range(), + use_simd, + ); + } else if want_rgb_u16 { + let rgb_u16_buf = rgb_u16.as_deref_mut().unwrap(); let rgb_plane_end = one_plane_end .checked_mul(3) @@ -1558,19 +1718,47 @@ impl PixelSink for MixedSinker<'_, Yuv440p12> { channels: 3, })?; let rgb_plane_start = one_plane_start * 3; + let rgb_u16_row = &mut rgb_u16_buf[rgb_plane_start..rgb_plane_end]; yuv444p12_to_rgb_u16_row( row.y(), row.u(), row.v(), - &mut buf[rgb_plane_start..rgb_plane_end], + rgb_u16_row, w, row.matrix(), row.full_range(), use_simd, ); + if want_rgba_u16 { + let rgba_u16_buf = rgba_u16.as_deref_mut().unwrap(); + let rgba_u16_row = + rgba_u16_plane_row_slice(rgba_u16_buf, one_plane_start, one_plane_end, w, h)?; + expand_rgb_u16_to_rgba_u16_row::(rgb_u16_row, rgba_u16_row, w); + } } - if rgb.is_none() && hsv.is_none() { + // ===== u8 RGB / RGBA / HSV path (Strategy A) ===== + let want_rgba = rgba.is_some(); 
+ let want_hsv = hsv.is_some(); + let need_rgb_kernel = rgb.is_some() || want_hsv; + + if want_rgba && !need_rgb_kernel { + let rgba_buf = rgba.as_deref_mut().unwrap(); + let rgba_row = rgba_plane_row_slice(rgba_buf, one_plane_start, one_plane_end, w, h)?; + yuv444p12_to_rgba_row( + row.y(), + row.u(), + row.v(), + rgba_row, + w, + row.matrix(), + row.full_range(), + use_simd, + ); + return Ok(()); + } + + if !need_rgb_kernel { return Ok(()); } @@ -1604,6 +1792,12 @@ impl PixelSink for MixedSinker<'_, Yuv440p12> { use_simd, ); } + + if let Some(buf) = rgba.as_deref_mut() { + let rgba_row = rgba_plane_row_slice(buf, one_plane_start, one_plane_end, w, h)?; + expand_rgb_to_rgba_row(rgb_row, rgba_row, w); + } + Ok(()) } } diff --git a/src/sinker/mixed/subsampled_4_4_4_high_bit.rs b/src/sinker/mixed/subsampled_4_4_4_high_bit.rs index f1dde93..3d36e10 100644 --- a/src/sinker/mixed/subsampled_4_4_4_high_bit.rs +++ b/src/sinker/mixed/subsampled_4_4_4_high_bit.rs @@ -2,6 +2,7 @@ use super::{ MixedSinker, MixedSinkerError, RowSlice, check_dimensions_match, rgb_row_buf_or_scratch, + rgba_plane_row_slice, rgba_u16_plane_row_slice, }; use crate::{PixelSink, row::*, yuv::*}; @@ -27,6 +28,51 @@ impl<'a> MixedSinker<'a, Yuv444p9> { self.rgb_u16 = Some(buf); Ok(self) } + + /// Attaches a packed **8-bit** RGBA output buffer. The 9-bit YUV + /// source is converted to 8-bit RGBA via the same `BITS = 9` Q15 + /// kernel family used by [`Self::with_rgb`]; the fourth byte per + /// pixel is alpha = `0xFF` (Yuv444p9 has no alpha plane). + #[cfg_attr(not(tarpaulin), inline(always))] + pub fn with_rgba(mut self, buf: &'a mut [u8]) -> Result { + self.set_rgba(buf)?; + Ok(self) + } + /// In-place variant of [`with_rgba`](Self::with_rgba). 
+ #[cfg_attr(not(tarpaulin), inline(always))] + pub fn set_rgba(&mut self, buf: &'a mut [u8]) -> Result<&mut Self, MixedSinkerError> { + let expected = self.frame_bytes(4)?; + if buf.len() < expected { + return Err(MixedSinkerError::RgbaBufferTooShort { + expected, + actual: buf.len(), + }); + } + self.rgba = Some(buf); + Ok(self) + } + + /// Attaches a packed **`u16`** RGBA output buffer. 9-bit low-packed + /// (`(1 << 9) - 1 = 511` max). Length is measured in `u16` + /// **elements** (`width × height × 4`). Alpha element is `(1 << 9) - 1`. + #[cfg_attr(not(tarpaulin), inline(always))] + pub fn with_rgba_u16(mut self, buf: &'a mut [u16]) -> Result { + self.set_rgba_u16(buf)?; + Ok(self) + } + /// In-place variant of [`with_rgba_u16`](Self::with_rgba_u16). + #[cfg_attr(not(tarpaulin), inline(always))] + pub fn set_rgba_u16(&mut self, buf: &'a mut [u16]) -> Result<&mut Self, MixedSinkerError> { + let expected = self.frame_bytes(4)?; + if buf.len() < expected { + return Err(MixedSinkerError::RgbaU16BufferTooShort { + expected, + actual: buf.len(), + }); + } + self.rgba_u16 = Some(buf); + Ok(self) + } } impl Yuv444p9Sink for MixedSinker<'_, Yuv444p9> {} @@ -80,6 +126,8 @@ impl PixelSink for MixedSinker<'_, Yuv444p9> { let Self { rgb, rgb_u16, + rgba, + rgba_u16, luma, hsv, rgb_scratch, @@ -95,7 +143,26 @@ impl PixelSink for MixedSinker<'_, Yuv444p9> { } } - if let Some(buf) = rgb_u16.as_deref_mut() { + // ===== u16 RGB / RGBA path (Strategy A) ===== + let want_rgb_u16 = rgb_u16.is_some(); + let want_rgba_u16 = rgba_u16.is_some(); + + if want_rgba_u16 && !want_rgb_u16 { + let rgba_u16_buf = rgba_u16.as_deref_mut().unwrap(); + let rgba_u16_row = + rgba_u16_plane_row_slice(rgba_u16_buf, one_plane_start, one_plane_end, w, h)?; + yuv444p9_to_rgba_u16_row( + row.y(), + row.u(), + row.v(), + rgba_u16_row, + w, + row.matrix(), + row.full_range(), + use_simd, + ); + } else if want_rgb_u16 { + let rgb_u16_buf = rgb_u16.as_deref_mut().unwrap(); let rgb_plane_end = one_plane_end 
.checked_mul(3) @@ -105,19 +172,47 @@ impl PixelSink for MixedSinker<'_, Yuv444p9> { channels: 3, })?; let rgb_plane_start = one_plane_start * 3; + let rgb_u16_row = &mut rgb_u16_buf[rgb_plane_start..rgb_plane_end]; yuv444p9_to_rgb_u16_row( row.y(), row.u(), row.v(), - &mut buf[rgb_plane_start..rgb_plane_end], + rgb_u16_row, + w, + row.matrix(), + row.full_range(), + use_simd, + ); + if want_rgba_u16 { + let rgba_u16_buf = rgba_u16.as_deref_mut().unwrap(); + let rgba_u16_row = + rgba_u16_plane_row_slice(rgba_u16_buf, one_plane_start, one_plane_end, w, h)?; + expand_rgb_u16_to_rgba_u16_row::(rgb_u16_row, rgba_u16_row, w); + } + } + + // ===== u8 RGB / RGBA / HSV path (Strategy A) ===== + let want_rgba = rgba.is_some(); + let want_hsv = hsv.is_some(); + let need_rgb_kernel = rgb.is_some() || want_hsv; + + if want_rgba && !need_rgb_kernel { + let rgba_buf = rgba.as_deref_mut().unwrap(); + let rgba_row = rgba_plane_row_slice(rgba_buf, one_plane_start, one_plane_end, w, h)?; + yuv444p9_to_rgba_row( + row.y(), + row.u(), + row.v(), + rgba_row, w, row.matrix(), row.full_range(), use_simd, ); + return Ok(()); } - if rgb.is_none() && hsv.is_none() { + if !need_rgb_kernel { return Ok(()); } @@ -151,6 +246,12 @@ impl PixelSink for MixedSinker<'_, Yuv444p9> { use_simd, ); } + + if let Some(buf) = rgba.as_deref_mut() { + let rgba_row = rgba_plane_row_slice(buf, one_plane_start, one_plane_end, w, h)?; + expand_rgb_to_rgba_row(rgb_row, rgba_row, w); + } + Ok(()) } } @@ -177,6 +278,49 @@ impl<'a> MixedSinker<'a, Yuv444p10> { self.rgb_u16 = Some(buf); Ok(self) } + + /// Attaches a packed **8-bit** RGBA output buffer. The 10-bit YUV + /// source is converted to 8-bit RGBA via the same `BITS = 10` Q15 + /// kernel family used by [`Self::with_rgb`]; alpha = `0xFF`. + #[cfg_attr(not(tarpaulin), inline(always))] + pub fn with_rgba(mut self, buf: &'a mut [u8]) -> Result { + self.set_rgba(buf)?; + Ok(self) + } + /// In-place variant of [`with_rgba`](Self::with_rgba). 
+ #[cfg_attr(not(tarpaulin), inline(always))] + pub fn set_rgba(&mut self, buf: &'a mut [u8]) -> Result<&mut Self, MixedSinkerError> { + let expected = self.frame_bytes(4)?; + if buf.len() < expected { + return Err(MixedSinkerError::RgbaBufferTooShort { + expected, + actual: buf.len(), + }); + } + self.rgba = Some(buf); + Ok(self) + } + + /// Attaches a packed **`u16`** RGBA output buffer. 10-bit low-packed + /// (`[0, 1023]`); alpha element is `1023`. + #[cfg_attr(not(tarpaulin), inline(always))] + pub fn with_rgba_u16(mut self, buf: &'a mut [u16]) -> Result { + self.set_rgba_u16(buf)?; + Ok(self) + } + /// In-place variant of [`with_rgba_u16`](Self::with_rgba_u16). + #[cfg_attr(not(tarpaulin), inline(always))] + pub fn set_rgba_u16(&mut self, buf: &'a mut [u16]) -> Result<&mut Self, MixedSinkerError> { + let expected = self.frame_bytes(4)?; + if buf.len() < expected { + return Err(MixedSinkerError::RgbaU16BufferTooShort { + expected, + actual: buf.len(), + }); + } + self.rgba_u16 = Some(buf); + Ok(self) + } } impl Yuv444p10Sink for MixedSinker<'_, Yuv444p10> {} @@ -230,6 +374,8 @@ impl PixelSink for MixedSinker<'_, Yuv444p10> { let Self { rgb, rgb_u16, + rgba, + rgba_u16, luma, hsv, rgb_scratch, @@ -245,7 +391,26 @@ impl PixelSink for MixedSinker<'_, Yuv444p10> { } } - if let Some(buf) = rgb_u16.as_deref_mut() { + // ===== u16 RGB / RGBA path (Strategy A) ===== + let want_rgb_u16 = rgb_u16.is_some(); + let want_rgba_u16 = rgba_u16.is_some(); + + if want_rgba_u16 && !want_rgb_u16 { + let rgba_u16_buf = rgba_u16.as_deref_mut().unwrap(); + let rgba_u16_row = + rgba_u16_plane_row_slice(rgba_u16_buf, one_plane_start, one_plane_end, w, h)?; + yuv444p10_to_rgba_u16_row( + row.y(), + row.u(), + row.v(), + rgba_u16_row, + w, + row.matrix(), + row.full_range(), + use_simd, + ); + } else if want_rgb_u16 { + let rgb_u16_buf = rgb_u16.as_deref_mut().unwrap(); let rgb_plane_end = one_plane_end .checked_mul(3) @@ -255,19 +420,47 @@ impl PixelSink for MixedSinker<'_, Yuv444p10> 
{ channels: 3, })?; let rgb_plane_start = one_plane_start * 3; + let rgb_u16_row = &mut rgb_u16_buf[rgb_plane_start..rgb_plane_end]; yuv444p10_to_rgb_u16_row( row.y(), row.u(), row.v(), - &mut buf[rgb_plane_start..rgb_plane_end], + rgb_u16_row, + w, + row.matrix(), + row.full_range(), + use_simd, + ); + if want_rgba_u16 { + let rgba_u16_buf = rgba_u16.as_deref_mut().unwrap(); + let rgba_u16_row = + rgba_u16_plane_row_slice(rgba_u16_buf, one_plane_start, one_plane_end, w, h)?; + expand_rgb_u16_to_rgba_u16_row::(rgb_u16_row, rgba_u16_row, w); + } + } + + // ===== u8 RGB / RGBA / HSV path (Strategy A) ===== + let want_rgba = rgba.is_some(); + let want_hsv = hsv.is_some(); + let need_rgb_kernel = rgb.is_some() || want_hsv; + + if want_rgba && !need_rgb_kernel { + let rgba_buf = rgba.as_deref_mut().unwrap(); + let rgba_row = rgba_plane_row_slice(rgba_buf, one_plane_start, one_plane_end, w, h)?; + yuv444p10_to_rgba_row( + row.y(), + row.u(), + row.v(), + rgba_row, w, row.matrix(), row.full_range(), use_simd, ); + return Ok(()); } - if rgb.is_none() && hsv.is_none() { + if !need_rgb_kernel { return Ok(()); } @@ -301,6 +494,12 @@ impl PixelSink for MixedSinker<'_, Yuv444p10> { use_simd, ); } + + if let Some(buf) = rgba.as_deref_mut() { + let rgba_row = rgba_plane_row_slice(buf, one_plane_start, one_plane_end, w, h)?; + expand_rgb_to_rgba_row(rgb_row, rgba_row, w); + } + Ok(()) } } @@ -325,6 +524,49 @@ impl<'a> MixedSinker<'a, Yuv444p12> { self.rgb_u16 = Some(buf); Ok(self) } + + /// Attaches a packed **8-bit** RGBA output buffer. The 12-bit YUV + /// source is converted to 8-bit RGBA via the same `BITS = 12` Q15 + /// kernel family used by [`Self::with_rgb`]; alpha = `0xFF`. + #[cfg_attr(not(tarpaulin), inline(always))] + pub fn with_rgba(mut self, buf: &'a mut [u8]) -> Result { + self.set_rgba(buf)?; + Ok(self) + } + /// In-place variant of [`with_rgba`](Self::with_rgba). 
+ #[cfg_attr(not(tarpaulin), inline(always))] + pub fn set_rgba(&mut self, buf: &'a mut [u8]) -> Result<&mut Self, MixedSinkerError> { + let expected = self.frame_bytes(4)?; + if buf.len() < expected { + return Err(MixedSinkerError::RgbaBufferTooShort { + expected, + actual: buf.len(), + }); + } + self.rgba = Some(buf); + Ok(self) + } + + /// Attaches a packed **`u16`** RGBA output buffer. 12-bit low-packed + /// (`[0, 4095]`); alpha element is `4095`. + #[cfg_attr(not(tarpaulin), inline(always))] + pub fn with_rgba_u16(mut self, buf: &'a mut [u16]) -> Result { + self.set_rgba_u16(buf)?; + Ok(self) + } + /// In-place variant of [`with_rgba_u16`](Self::with_rgba_u16). + #[cfg_attr(not(tarpaulin), inline(always))] + pub fn set_rgba_u16(&mut self, buf: &'a mut [u16]) -> Result<&mut Self, MixedSinkerError> { + let expected = self.frame_bytes(4)?; + if buf.len() < expected { + return Err(MixedSinkerError::RgbaU16BufferTooShort { + expected, + actual: buf.len(), + }); + } + self.rgba_u16 = Some(buf); + Ok(self) + } } impl Yuv444p12Sink for MixedSinker<'_, Yuv444p12> {} @@ -378,6 +620,8 @@ impl PixelSink for MixedSinker<'_, Yuv444p12> { let Self { rgb, rgb_u16, + rgba, + rgba_u16, luma, hsv, rgb_scratch, @@ -393,7 +637,26 @@ impl PixelSink for MixedSinker<'_, Yuv444p12> { } } - if let Some(buf) = rgb_u16.as_deref_mut() { + // ===== u16 RGB / RGBA path (Strategy A) ===== + let want_rgb_u16 = rgb_u16.is_some(); + let want_rgba_u16 = rgba_u16.is_some(); + + if want_rgba_u16 && !want_rgb_u16 { + let rgba_u16_buf = rgba_u16.as_deref_mut().unwrap(); + let rgba_u16_row = + rgba_u16_plane_row_slice(rgba_u16_buf, one_plane_start, one_plane_end, w, h)?; + yuv444p12_to_rgba_u16_row( + row.y(), + row.u(), + row.v(), + rgba_u16_row, + w, + row.matrix(), + row.full_range(), + use_simd, + ); + } else if want_rgb_u16 { + let rgb_u16_buf = rgb_u16.as_deref_mut().unwrap(); let rgb_plane_end = one_plane_end .checked_mul(3) @@ -403,19 +666,47 @@ impl PixelSink for MixedSinker<'_, Yuv444p12> 
{ channels: 3, })?; let rgb_plane_start = one_plane_start * 3; + let rgb_u16_row = &mut rgb_u16_buf[rgb_plane_start..rgb_plane_end]; yuv444p12_to_rgb_u16_row( row.y(), row.u(), row.v(), - &mut buf[rgb_plane_start..rgb_plane_end], + rgb_u16_row, + w, + row.matrix(), + row.full_range(), + use_simd, + ); + if want_rgba_u16 { + let rgba_u16_buf = rgba_u16.as_deref_mut().unwrap(); + let rgba_u16_row = + rgba_u16_plane_row_slice(rgba_u16_buf, one_plane_start, one_plane_end, w, h)?; + expand_rgb_u16_to_rgba_u16_row::(rgb_u16_row, rgba_u16_row, w); + } + } + + // ===== u8 RGB / RGBA / HSV path (Strategy A) ===== + let want_rgba = rgba.is_some(); + let want_hsv = hsv.is_some(); + let need_rgb_kernel = rgb.is_some() || want_hsv; + + if want_rgba && !need_rgb_kernel { + let rgba_buf = rgba.as_deref_mut().unwrap(); + let rgba_row = rgba_plane_row_slice(rgba_buf, one_plane_start, one_plane_end, w, h)?; + yuv444p12_to_rgba_row( + row.y(), + row.u(), + row.v(), + rgba_row, w, row.matrix(), row.full_range(), use_simd, ); + return Ok(()); } - if rgb.is_none() && hsv.is_none() { + if !need_rgb_kernel { return Ok(()); } @@ -449,6 +740,12 @@ impl PixelSink for MixedSinker<'_, Yuv444p12> { use_simd, ); } + + if let Some(buf) = rgba.as_deref_mut() { + let rgba_row = rgba_plane_row_slice(buf, one_plane_start, one_plane_end, w, h)?; + expand_rgb_to_rgba_row(rgb_row, rgba_row, w); + } + Ok(()) } } @@ -473,6 +770,49 @@ impl<'a> MixedSinker<'a, Yuv444p14> { self.rgb_u16 = Some(buf); Ok(self) } + + /// Attaches a packed **8-bit** RGBA output buffer. The 14-bit YUV + /// source is converted to 8-bit RGBA via the same `BITS = 14` Q15 + /// kernel family used by [`Self::with_rgb`]; alpha = `0xFF`. + #[cfg_attr(not(tarpaulin), inline(always))] + pub fn with_rgba(mut self, buf: &'a mut [u8]) -> Result { + self.set_rgba(buf)?; + Ok(self) + } + /// In-place variant of [`with_rgba`](Self::with_rgba). 
+ #[cfg_attr(not(tarpaulin), inline(always))] + pub fn set_rgba(&mut self, buf: &'a mut [u8]) -> Result<&mut Self, MixedSinkerError> { + let expected = self.frame_bytes(4)?; + if buf.len() < expected { + return Err(MixedSinkerError::RgbaBufferTooShort { + expected, + actual: buf.len(), + }); + } + self.rgba = Some(buf); + Ok(self) + } + + /// Attaches a packed **`u16`** RGBA output buffer. 14-bit low-packed + /// (`[0, 16383]`); alpha element is `16383`. + #[cfg_attr(not(tarpaulin), inline(always))] + pub fn with_rgba_u16(mut self, buf: &'a mut [u16]) -> Result { + self.set_rgba_u16(buf)?; + Ok(self) + } + /// In-place variant of [`with_rgba_u16`](Self::with_rgba_u16). + #[cfg_attr(not(tarpaulin), inline(always))] + pub fn set_rgba_u16(&mut self, buf: &'a mut [u16]) -> Result<&mut Self, MixedSinkerError> { + let expected = self.frame_bytes(4)?; + if buf.len() < expected { + return Err(MixedSinkerError::RgbaU16BufferTooShort { + expected, + actual: buf.len(), + }); + } + self.rgba_u16 = Some(buf); + Ok(self) + } } impl Yuv444p14Sink for MixedSinker<'_, Yuv444p14> {} @@ -526,6 +866,8 @@ impl PixelSink for MixedSinker<'_, Yuv444p14> { let Self { rgb, rgb_u16, + rgba, + rgba_u16, luma, hsv, rgb_scratch, @@ -541,7 +883,26 @@ impl PixelSink for MixedSinker<'_, Yuv444p14> { } } - if let Some(buf) = rgb_u16.as_deref_mut() { + // ===== u16 RGB / RGBA path (Strategy A) ===== + let want_rgb_u16 = rgb_u16.is_some(); + let want_rgba_u16 = rgba_u16.is_some(); + + if want_rgba_u16 && !want_rgb_u16 { + let rgba_u16_buf = rgba_u16.as_deref_mut().unwrap(); + let rgba_u16_row = + rgba_u16_plane_row_slice(rgba_u16_buf, one_plane_start, one_plane_end, w, h)?; + yuv444p14_to_rgba_u16_row( + row.y(), + row.u(), + row.v(), + rgba_u16_row, + w, + row.matrix(), + row.full_range(), + use_simd, + ); + } else if want_rgb_u16 { + let rgb_u16_buf = rgb_u16.as_deref_mut().unwrap(); let rgb_plane_end = one_plane_end .checked_mul(3) @@ -551,19 +912,47 @@ impl PixelSink for MixedSinker<'_, 
Yuv444p14> { channels: 3, })?; let rgb_plane_start = one_plane_start * 3; + let rgb_u16_row = &mut rgb_u16_buf[rgb_plane_start..rgb_plane_end]; yuv444p14_to_rgb_u16_row( row.y(), row.u(), row.v(), - &mut buf[rgb_plane_start..rgb_plane_end], + rgb_u16_row, + w, + row.matrix(), + row.full_range(), + use_simd, + ); + if want_rgba_u16 { + let rgba_u16_buf = rgba_u16.as_deref_mut().unwrap(); + let rgba_u16_row = + rgba_u16_plane_row_slice(rgba_u16_buf, one_plane_start, one_plane_end, w, h)?; + expand_rgb_u16_to_rgba_u16_row::(rgb_u16_row, rgba_u16_row, w); + } + } + + // ===== u8 RGB / RGBA / HSV path (Strategy A) ===== + let want_rgba = rgba.is_some(); + let want_hsv = hsv.is_some(); + let need_rgb_kernel = rgb.is_some() || want_hsv; + + if want_rgba && !need_rgb_kernel { + let rgba_buf = rgba.as_deref_mut().unwrap(); + let rgba_row = rgba_plane_row_slice(rgba_buf, one_plane_start, one_plane_end, w, h)?; + yuv444p14_to_rgba_row( + row.y(), + row.u(), + row.v(), + rgba_row, w, row.matrix(), row.full_range(), use_simd, ); + return Ok(()); } - if rgb.is_none() && hsv.is_none() { + if !need_rgb_kernel { return Ok(()); } @@ -597,6 +986,12 @@ impl PixelSink for MixedSinker<'_, Yuv444p14> { use_simd, ); } + + if let Some(buf) = rgba.as_deref_mut() { + let rgba_row = rgba_plane_row_slice(buf, one_plane_start, one_plane_end, w, h)?; + expand_rgb_to_rgba_row(rgb_row, rgba_row, w); + } + Ok(()) } } @@ -622,6 +1017,49 @@ impl<'a> MixedSinker<'a, Yuv444p16> { self.rgb_u16 = Some(buf); Ok(self) } + + /// Attaches a packed **8-bit** RGBA output buffer. The 16-bit YUV + /// source is converted to 8-bit RGBA via the dedicated 16-bit + /// kernel; alpha = `0xFF`. + #[cfg_attr(not(tarpaulin), inline(always))] + pub fn with_rgba(mut self, buf: &'a mut [u8]) -> Result { + self.set_rgba(buf)?; + Ok(self) + } + /// In-place variant of [`with_rgba`](Self::with_rgba). 
+ #[cfg_attr(not(tarpaulin), inline(always))] + pub fn set_rgba(&mut self, buf: &'a mut [u8]) -> Result<&mut Self, MixedSinkerError> { + let expected = self.frame_bytes(4)?; + if buf.len() < expected { + return Err(MixedSinkerError::RgbaBufferTooShort { + expected, + actual: buf.len(), + }); + } + self.rgba = Some(buf); + Ok(self) + } + + /// Attaches a packed **`u16`** RGBA output buffer. Full-range output + /// `[0, 65535]`; alpha element is `0xFFFF`. + #[cfg_attr(not(tarpaulin), inline(always))] + pub fn with_rgba_u16(mut self, buf: &'a mut [u16]) -> Result { + self.set_rgba_u16(buf)?; + Ok(self) + } + /// In-place variant of [`with_rgba_u16`](Self::with_rgba_u16). + #[cfg_attr(not(tarpaulin), inline(always))] + pub fn set_rgba_u16(&mut self, buf: &'a mut [u16]) -> Result<&mut Self, MixedSinkerError> { + let expected = self.frame_bytes(4)?; + if buf.len() < expected { + return Err(MixedSinkerError::RgbaU16BufferTooShort { + expected, + actual: buf.len(), + }); + } + self.rgba_u16 = Some(buf); + Ok(self) + } } impl Yuv444p16Sink for MixedSinker<'_, Yuv444p16> {} @@ -675,6 +1113,8 @@ impl PixelSink for MixedSinker<'_, Yuv444p16> { let Self { rgb, rgb_u16, + rgba, + rgba_u16, luma, hsv, rgb_scratch, @@ -690,7 +1130,26 @@ impl PixelSink for MixedSinker<'_, Yuv444p16> { } } - if let Some(buf) = rgb_u16.as_deref_mut() { + // ===== u16 RGB / RGBA path (Strategy A) ===== + let want_rgb_u16 = rgb_u16.is_some(); + let want_rgba_u16 = rgba_u16.is_some(); + + if want_rgba_u16 && !want_rgb_u16 { + let rgba_u16_buf = rgba_u16.as_deref_mut().unwrap(); + let rgba_u16_row = + rgba_u16_plane_row_slice(rgba_u16_buf, one_plane_start, one_plane_end, w, h)?; + yuv444p16_to_rgba_u16_row( + row.y(), + row.u(), + row.v(), + rgba_u16_row, + w, + row.matrix(), + row.full_range(), + use_simd, + ); + } else if want_rgb_u16 { + let rgb_u16_buf = rgb_u16.as_deref_mut().unwrap(); let rgb_plane_end = one_plane_end .checked_mul(3) @@ -700,19 +1159,47 @@ impl PixelSink for MixedSinker<'_, 
Yuv444p16> { channels: 3, })?; let rgb_plane_start = one_plane_start * 3; + let rgb_u16_row = &mut rgb_u16_buf[rgb_plane_start..rgb_plane_end]; yuv444p16_to_rgb_u16_row( row.y(), row.u(), row.v(), - &mut buf[rgb_plane_start..rgb_plane_end], + rgb_u16_row, + w, + row.matrix(), + row.full_range(), + use_simd, + ); + if want_rgba_u16 { + let rgba_u16_buf = rgba_u16.as_deref_mut().unwrap(); + let rgba_u16_row = + rgba_u16_plane_row_slice(rgba_u16_buf, one_plane_start, one_plane_end, w, h)?; + expand_rgb_u16_to_rgba_u16_row::(rgb_u16_row, rgba_u16_row, w); + } + } + + // ===== u8 RGB / RGBA / HSV path (Strategy A) ===== + let want_rgba = rgba.is_some(); + let want_hsv = hsv.is_some(); + let need_rgb_kernel = rgb.is_some() || want_hsv; + + if want_rgba && !need_rgb_kernel { + let rgba_buf = rgba.as_deref_mut().unwrap(); + let rgba_row = rgba_plane_row_slice(rgba_buf, one_plane_start, one_plane_end, w, h)?; + yuv444p16_to_rgba_row( + row.y(), + row.u(), + row.v(), + rgba_row, w, row.matrix(), row.full_range(), use_simd, ); + return Ok(()); } - if rgb.is_none() && hsv.is_none() { + if !need_rgb_kernel { return Ok(()); } @@ -746,6 +1233,12 @@ impl PixelSink for MixedSinker<'_, Yuv444p16> { use_simd, ); } + + if let Some(buf) = rgba.as_deref_mut() { + let rgba_row = rgba_plane_row_slice(buf, one_plane_start, one_plane_end, w, h)?; + expand_rgb_to_rgba_row(rgb_row, rgba_row, w); + } + Ok(()) } } @@ -777,6 +1270,48 @@ impl<'a> MixedSinker<'a, P410> { self.rgb_u16 = Some(buf); Ok(self) } + + /// Attaches a packed **8-bit** RGBA output buffer. P410 has no alpha + /// plane, so alpha = `0xFF` (opaque). + #[cfg_attr(not(tarpaulin), inline(always))] + pub fn with_rgba(mut self, buf: &'a mut [u8]) -> Result { + self.set_rgba(buf)?; + Ok(self) + } + /// In-place variant of [`with_rgba`](Self::with_rgba). 
+ #[cfg_attr(not(tarpaulin), inline(always))] + pub fn set_rgba(&mut self, buf: &'a mut [u8]) -> Result<&mut Self, MixedSinkerError> { + let expected = self.frame_bytes(4)?; + if buf.len() < expected { + return Err(MixedSinkerError::RgbaBufferTooShort { + expected, + actual: buf.len(), + }); + } + self.rgba = Some(buf); + Ok(self) + } + + /// Attaches a packed **`u16`** RGBA output buffer. 10-bit low-packed + /// (`[0, 1023]`); alpha element is `1023`. + #[cfg_attr(not(tarpaulin), inline(always))] + pub fn with_rgba_u16(mut self, buf: &'a mut [u16]) -> Result { + self.set_rgba_u16(buf)?; + Ok(self) + } + /// In-place variant of [`with_rgba_u16`](Self::with_rgba_u16). + #[cfg_attr(not(tarpaulin), inline(always))] + pub fn set_rgba_u16(&mut self, buf: &'a mut [u16]) -> Result<&mut Self, MixedSinkerError> { + let expected = self.frame_bytes(4)?; + if buf.len() < expected { + return Err(MixedSinkerError::RgbaU16BufferTooShort { + expected, + actual: buf.len(), + }); + } + self.rgba_u16 = Some(buf); + Ok(self) + } } impl P410Sink for MixedSinker<'_, P410> {} @@ -790,6 +1325,7 @@ impl PixelSink for MixedSinker<'_, P410> { } fn process(&mut self, row: P410Row<'_>) -> Result<(), Self::Error> { + const BITS: u32 = 10; let w = self.width; let h = self.height; let idx = row.row(); @@ -822,6 +1358,8 @@ impl PixelSink for MixedSinker<'_, P410> { let Self { rgb, rgb_u16, + rgba, + rgba_u16, luma, hsv, rgb_scratch, @@ -837,7 +1375,25 @@ impl PixelSink for MixedSinker<'_, P410> { } } - if let Some(buf) = rgb_u16.as_deref_mut() { + // ===== u16 RGB / RGBA path (Strategy A) ===== + let want_rgb_u16 = rgb_u16.is_some(); + let want_rgba_u16 = rgba_u16.is_some(); + + if want_rgba_u16 && !want_rgb_u16 { + let rgba_u16_buf = rgba_u16.as_deref_mut().unwrap(); + let rgba_u16_row = + rgba_u16_plane_row_slice(rgba_u16_buf, one_plane_start, one_plane_end, w, h)?; + p410_to_rgba_u16_row( + row.y(), + row.uv_full(), + rgba_u16_row, + w, + row.matrix(), + row.full_range(), + use_simd, + ); + } 
else if want_rgb_u16 { + let rgb_u16_buf = rgb_u16.as_deref_mut().unwrap(); let rgb_plane_end = one_plane_end .checked_mul(3) @@ -847,18 +1403,45 @@ impl PixelSink for MixedSinker<'_, P410> { channels: 3, })?; let rgb_plane_start = one_plane_start * 3; + let rgb_u16_row = &mut rgb_u16_buf[rgb_plane_start..rgb_plane_end]; p410_to_rgb_u16_row( row.y(), row.uv_full(), - &mut buf[rgb_plane_start..rgb_plane_end], + rgb_u16_row, + w, + row.matrix(), + row.full_range(), + use_simd, + ); + if want_rgba_u16 { + let rgba_u16_buf = rgba_u16.as_deref_mut().unwrap(); + let rgba_u16_row = + rgba_u16_plane_row_slice(rgba_u16_buf, one_plane_start, one_plane_end, w, h)?; + expand_rgb_u16_to_rgba_u16_row::(rgb_u16_row, rgba_u16_row, w); + } + } + + // ===== u8 RGB / RGBA / HSV path (Strategy A) ===== + let want_rgba = rgba.is_some(); + let want_hsv = hsv.is_some(); + let need_rgb_kernel = rgb.is_some() || want_hsv; + + if want_rgba && !need_rgb_kernel { + let rgba_buf = rgba.as_deref_mut().unwrap(); + let rgba_row = rgba_plane_row_slice(rgba_buf, one_plane_start, one_plane_end, w, h)?; + p410_to_rgba_row( + row.y(), + row.uv_full(), + rgba_row, w, row.matrix(), row.full_range(), use_simd, ); + return Ok(()); } - if rgb.is_none() && hsv.is_none() { + if !need_rgb_kernel { return Ok(()); } @@ -891,6 +1474,12 @@ impl PixelSink for MixedSinker<'_, P410> { use_simd, ); } + + if let Some(buf) = rgba.as_deref_mut() { + let rgba_row = rgba_plane_row_slice(buf, one_plane_start, one_plane_end, w, h)?; + expand_rgb_to_rgba_row(rgb_row, rgba_row, w); + } + Ok(()) } } @@ -917,6 +1506,48 @@ impl<'a> MixedSinker<'a, P412> { self.rgb_u16 = Some(buf); Ok(self) } + + /// Attaches a packed **8-bit** RGBA output buffer. P412 has no alpha + /// plane, so alpha = `0xFF` (opaque). + #[cfg_attr(not(tarpaulin), inline(always))] + pub fn with_rgba(mut self, buf: &'a mut [u8]) -> Result { + self.set_rgba(buf)?; + Ok(self) + } + /// In-place variant of [`with_rgba`](Self::with_rgba). 
+ #[cfg_attr(not(tarpaulin), inline(always))] + pub fn set_rgba(&mut self, buf: &'a mut [u8]) -> Result<&mut Self, MixedSinkerError> { + let expected = self.frame_bytes(4)?; + if buf.len() < expected { + return Err(MixedSinkerError::RgbaBufferTooShort { + expected, + actual: buf.len(), + }); + } + self.rgba = Some(buf); + Ok(self) + } + + /// Attaches a packed **`u16`** RGBA output buffer. 12-bit low-packed + /// (`[0, 4095]`); alpha element is `4095`. + #[cfg_attr(not(tarpaulin), inline(always))] + pub fn with_rgba_u16(mut self, buf: &'a mut [u16]) -> Result { + self.set_rgba_u16(buf)?; + Ok(self) + } + /// In-place variant of [`with_rgba_u16`](Self::with_rgba_u16). + #[cfg_attr(not(tarpaulin), inline(always))] + pub fn set_rgba_u16(&mut self, buf: &'a mut [u16]) -> Result<&mut Self, MixedSinkerError> { + let expected = self.frame_bytes(4)?; + if buf.len() < expected { + return Err(MixedSinkerError::RgbaU16BufferTooShort { + expected, + actual: buf.len(), + }); + } + self.rgba_u16 = Some(buf); + Ok(self) + } } impl P412Sink for MixedSinker<'_, P412> {} @@ -930,6 +1561,7 @@ impl PixelSink for MixedSinker<'_, P412> { } fn process(&mut self, row: P412Row<'_>) -> Result<(), Self::Error> { + const BITS: u32 = 12; let w = self.width; let h = self.height; let idx = row.row(); @@ -961,6 +1593,8 @@ impl PixelSink for MixedSinker<'_, P412> { let Self { rgb, rgb_u16, + rgba, + rgba_u16, luma, hsv, rgb_scratch, @@ -976,7 +1610,25 @@ impl PixelSink for MixedSinker<'_, P412> { } } - if let Some(buf) = rgb_u16.as_deref_mut() { + // ===== u16 RGB / RGBA path (Strategy A) ===== + let want_rgb_u16 = rgb_u16.is_some(); + let want_rgba_u16 = rgba_u16.is_some(); + + if want_rgba_u16 && !want_rgb_u16 { + let rgba_u16_buf = rgba_u16.as_deref_mut().unwrap(); + let rgba_u16_row = + rgba_u16_plane_row_slice(rgba_u16_buf, one_plane_start, one_plane_end, w, h)?; + p412_to_rgba_u16_row( + row.y(), + row.uv_full(), + rgba_u16_row, + w, + row.matrix(), + row.full_range(), + use_simd, + ); + } 
else if want_rgb_u16 { + let rgb_u16_buf = rgb_u16.as_deref_mut().unwrap(); let rgb_plane_end = one_plane_end .checked_mul(3) @@ -986,18 +1638,45 @@ impl PixelSink for MixedSinker<'_, P412> { channels: 3, })?; let rgb_plane_start = one_plane_start * 3; + let rgb_u16_row = &mut rgb_u16_buf[rgb_plane_start..rgb_plane_end]; p412_to_rgb_u16_row( row.y(), row.uv_full(), - &mut buf[rgb_plane_start..rgb_plane_end], + rgb_u16_row, + w, + row.matrix(), + row.full_range(), + use_simd, + ); + if want_rgba_u16 { + let rgba_u16_buf = rgba_u16.as_deref_mut().unwrap(); + let rgba_u16_row = + rgba_u16_plane_row_slice(rgba_u16_buf, one_plane_start, one_plane_end, w, h)?; + expand_rgb_u16_to_rgba_u16_row::(rgb_u16_row, rgba_u16_row, w); + } + } + + // ===== u8 RGB / RGBA / HSV path (Strategy A) ===== + let want_rgba = rgba.is_some(); + let want_hsv = hsv.is_some(); + let need_rgb_kernel = rgb.is_some() || want_hsv; + + if want_rgba && !need_rgb_kernel { + let rgba_buf = rgba.as_deref_mut().unwrap(); + let rgba_row = rgba_plane_row_slice(rgba_buf, one_plane_start, one_plane_end, w, h)?; + p412_to_rgba_row( + row.y(), + row.uv_full(), + rgba_row, w, row.matrix(), row.full_range(), use_simd, ); + return Ok(()); } - if rgb.is_none() && hsv.is_none() { + if !need_rgb_kernel { return Ok(()); } @@ -1030,6 +1709,12 @@ impl PixelSink for MixedSinker<'_, P412> { use_simd, ); } + + if let Some(buf) = rgba.as_deref_mut() { + let rgba_row = rgba_plane_row_slice(buf, one_plane_start, one_plane_end, w, h)?; + expand_rgb_to_rgba_row(rgb_row, rgba_row, w); + } + Ok(()) } } @@ -1060,6 +1745,48 @@ impl<'a> MixedSinker<'a, P416> { self.rgb_u16 = Some(buf); Ok(self) } + + /// Attaches a packed **8-bit** RGBA output buffer. P416 has no alpha + /// plane, so alpha = `0xFF` (opaque). + #[cfg_attr(not(tarpaulin), inline(always))] + pub fn with_rgba(mut self, buf: &'a mut [u8]) -> Result { + self.set_rgba(buf)?; + Ok(self) + } + /// In-place variant of [`with_rgba`](Self::with_rgba). 
+ #[cfg_attr(not(tarpaulin), inline(always))] + pub fn set_rgba(&mut self, buf: &'a mut [u8]) -> Result<&mut Self, MixedSinkerError> { + let expected = self.frame_bytes(4)?; + if buf.len() < expected { + return Err(MixedSinkerError::RgbaBufferTooShort { + expected, + actual: buf.len(), + }); + } + self.rgba = Some(buf); + Ok(self) + } + + /// Attaches a packed **`u16`** RGBA output buffer. Full-range output + /// `[0, 65535]`; alpha element is `0xFFFF`. + #[cfg_attr(not(tarpaulin), inline(always))] + pub fn with_rgba_u16(mut self, buf: &'a mut [u16]) -> Result { + self.set_rgba_u16(buf)?; + Ok(self) + } + /// In-place variant of [`with_rgba_u16`](Self::with_rgba_u16). + #[cfg_attr(not(tarpaulin), inline(always))] + pub fn set_rgba_u16(&mut self, buf: &'a mut [u16]) -> Result<&mut Self, MixedSinkerError> { + let expected = self.frame_bytes(4)?; + if buf.len() < expected { + return Err(MixedSinkerError::RgbaU16BufferTooShort { + expected, + actual: buf.len(), + }); + } + self.rgba_u16 = Some(buf); + Ok(self) + } } impl P416Sink for MixedSinker<'_, P416> {} @@ -1073,6 +1800,7 @@ impl PixelSink for MixedSinker<'_, P416> { } fn process(&mut self, row: P416Row<'_>) -> Result<(), Self::Error> { + const BITS: u32 = 16; let w = self.width; let h = self.height; let idx = row.row(); @@ -1104,6 +1832,8 @@ impl PixelSink for MixedSinker<'_, P416> { let Self { rgb, rgb_u16, + rgba, + rgba_u16, luma, hsv, rgb_scratch, @@ -1119,7 +1849,25 @@ impl PixelSink for MixedSinker<'_, P416> { } } - if let Some(buf) = rgb_u16.as_deref_mut() { + // ===== u16 RGB / RGBA path (Strategy A) ===== + let want_rgb_u16 = rgb_u16.is_some(); + let want_rgba_u16 = rgba_u16.is_some(); + + if want_rgba_u16 && !want_rgb_u16 { + let rgba_u16_buf = rgba_u16.as_deref_mut().unwrap(); + let rgba_u16_row = + rgba_u16_plane_row_slice(rgba_u16_buf, one_plane_start, one_plane_end, w, h)?; + p416_to_rgba_u16_row( + row.y(), + row.uv_full(), + rgba_u16_row, + w, + row.matrix(), + row.full_range(), + use_simd, + ); + 
} else if want_rgb_u16 { + let rgb_u16_buf = rgb_u16.as_deref_mut().unwrap(); let rgb_plane_end = one_plane_end .checked_mul(3) @@ -1129,18 +1877,45 @@ impl PixelSink for MixedSinker<'_, P416> { channels: 3, })?; let rgb_plane_start = one_plane_start * 3; + let rgb_u16_row = &mut rgb_u16_buf[rgb_plane_start..rgb_plane_end]; p416_to_rgb_u16_row( row.y(), row.uv_full(), - &mut buf[rgb_plane_start..rgb_plane_end], + rgb_u16_row, + w, + row.matrix(), + row.full_range(), + use_simd, + ); + if want_rgba_u16 { + let rgba_u16_buf = rgba_u16.as_deref_mut().unwrap(); + let rgba_u16_row = + rgba_u16_plane_row_slice(rgba_u16_buf, one_plane_start, one_plane_end, w, h)?; + expand_rgb_u16_to_rgba_u16_row::(rgb_u16_row, rgba_u16_row, w); + } + } + + // ===== u8 RGB / RGBA / HSV path (Strategy A) ===== + let want_rgba = rgba.is_some(); + let want_hsv = hsv.is_some(); + let need_rgb_kernel = rgb.is_some() || want_hsv; + + if want_rgba && !need_rgb_kernel { + let rgba_buf = rgba.as_deref_mut().unwrap(); + let rgba_row = rgba_plane_row_slice(rgba_buf, one_plane_start, one_plane_end, w, h)?; + p416_to_rgba_row( + row.y(), + row.uv_full(), + rgba_row, w, row.matrix(), row.full_range(), use_simd, ); + return Ok(()); } - if rgb.is_none() && hsv.is_none() { + if !need_rgb_kernel { return Ok(()); } @@ -1173,6 +1948,12 @@ impl PixelSink for MixedSinker<'_, P416> { use_simd, ); } + + if let Some(buf) = rgba.as_deref_mut() { + let rgba_row = rgba_plane_row_slice(buf, one_plane_start, one_plane_end, w, h)?; + expand_rgb_to_rgba_row(rgb_row, rgba_row, w); + } + Ok(()) } } diff --git a/src/sinker/mixed/tests.rs b/src/sinker/mixed/tests.rs index c5e87dd..440fce4 100644 --- a/src/sinker/mixed/tests.rs +++ b/src/sinker/mixed/tests.rs @@ -3394,6 +3394,54 @@ fn yuv420p12_with_simd_false_matches_with_simd_true() { assert_eq!(rgb_u16_scalar, rgb_u16_simd); } +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn 
yuv420p12_rgba_u8_only_gray_with_opaque_alpha() { + // 12-bit mid-gray (Y=U=V=2048) → 8-bit RGBA ≈ (128, 128, 128, 255). + let (yp, up, vp) = solid_yuv420p12_frame(16, 8, 2048, 2048, 2048); + let src = Yuv420p12Frame::new(&yp, &up, &vp, 16, 8, 16, 8, 8); + + let mut rgba = std::vec![0u8; 16 * 8 * 4]; + let mut sink = MixedSinker::::new(16, 8) + .with_rgba(&mut rgba) + .unwrap(); + yuv420p12_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); + + for px in rgba.chunks(4) { + assert!(px[0].abs_diff(128) <= 1); + assert_eq!(px[0], px[1]); + assert_eq!(px[1], px[2]); + assert_eq!(px[3], 0xFF, "alpha must be opaque"); + } +} + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn yuv420p12_rgba_u16_only_native_depth_gray_with_opaque_alpha() { + // 12-bit mid-gray → u16 RGBA: each color element ≈ 2048, alpha = 4095. + let (yp, up, vp) = solid_yuv420p12_frame(16, 8, 2048, 2048, 2048); + let src = Yuv420p12Frame::new(&yp, &up, &vp, 16, 8, 16, 8, 8); + + let mut rgba = std::vec![0u16; 16 * 8 * 4]; + let mut sink = MixedSinker::::new(16, 8) + .with_rgba_u16(&mut rgba) + .unwrap(); + yuv420p12_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); + + for px in rgba.chunks(4) { + assert!(px[0].abs_diff(2048) <= 1, "got {px:?}"); + assert_eq!(px[0], px[1]); + assert_eq!(px[1], px[2]); + assert_eq!(px[3], 4095, "alpha must equal (1 << 12) - 1"); + } +} + // ---- Yuv420p14 --------------------------------------------------------- fn solid_yuv420p14_frame( @@ -3533,6 +3581,54 @@ fn yuv420p14_with_simd_false_matches_with_simd_true() { assert_eq!(rgb_u16_scalar, rgb_u16_simd); } +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn yuv420p14_rgba_u8_only_gray_with_opaque_alpha() { + // 14-bit mid-gray (Y=U=V=8192) → 8-bit RGBA ≈ (128, 128, 128, 255). 
+ let (yp, up, vp) = solid_yuv420p14_frame(16, 8, 8192, 8192, 8192); + let src = Yuv420p14Frame::new(&yp, &up, &vp, 16, 8, 16, 8, 8); + + let mut rgba = std::vec![0u8; 16 * 8 * 4]; + let mut sink = MixedSinker::::new(16, 8) + .with_rgba(&mut rgba) + .unwrap(); + yuv420p14_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); + + for px in rgba.chunks(4) { + assert!(px[0].abs_diff(128) <= 1); + assert_eq!(px[0], px[1]); + assert_eq!(px[1], px[2]); + assert_eq!(px[3], 0xFF, "alpha must be opaque"); + } +} + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn yuv420p14_rgba_u16_only_native_depth_gray_with_opaque_alpha() { + // 14-bit mid-gray → u16 RGBA: each color element ≈ 8192, alpha = 16383. + let (yp, up, vp) = solid_yuv420p14_frame(16, 8, 8192, 8192, 8192); + let src = Yuv420p14Frame::new(&yp, &up, &vp, 16, 8, 16, 8, 8); + + let mut rgba = std::vec![0u16; 16 * 8 * 4]; + let mut sink = MixedSinker::::new(16, 8) + .with_rgba_u16(&mut rgba) + .unwrap(); + yuv420p14_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); + + for px in rgba.chunks(4) { + assert!(px[0].abs_diff(8192) <= 1, "got {px:?}"); + assert_eq!(px[0], px[1]); + assert_eq!(px[1], px[2]); + assert_eq!(px[3], 16383, "alpha must equal (1 << 14) - 1"); + } +} + // ---- P012 -------------------------------------------------------------- // // Semi-planar 12-bit, high-bit-packed (samples in high 12 of each @@ -3726,6 +3822,56 @@ fn p012_with_simd_false_matches_with_simd_true() { assert_eq!(rgb_u16_scalar, rgb_u16_simd); } +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn p012_rgba_u8_only_gray_with_opaque_alpha() { + // P012 mid-gray (12-bit values shifted into the high 12): Y/U/V = 2048 << 4. + // Output 8-bit RGBA ≈ (128, 128, 128, 255). 
+ let (yp, uvp) = solid_p012_frame(16, 8, 2048, 2048, 2048); + let src = P012Frame::new(&yp, &uvp, 16, 8, 16, 16); + + let mut rgba = std::vec![0u8; 16 * 8 * 4]; + let mut sink = MixedSinker::::new(16, 8) + .with_rgba(&mut rgba) + .unwrap(); + p012_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); + + for px in rgba.chunks(4) { + assert!(px[0].abs_diff(128) <= 1); + assert_eq!(px[0], px[1]); + assert_eq!(px[1], px[2]); + assert_eq!(px[3], 0xFF, "alpha must be opaque"); + } +} + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn p012_rgba_u16_only_native_depth_gray_with_opaque_alpha() { + // P012 mid-gray → u16 RGBA: each color element ≈ 2048 (low-bit-packed), + // alpha = (1 << 12) - 1 = 4095. + let (yp, uvp) = solid_p012_frame(16, 8, 2048, 2048, 2048); + let src = P012Frame::new(&yp, &uvp, 16, 8, 16, 16); + + let mut rgba = std::vec![0u16; 16 * 8 * 4]; + let mut sink = MixedSinker::::new(16, 8) + .with_rgba_u16(&mut rgba) + .unwrap(); + p012_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); + + for px in rgba.chunks(4) { + assert!(px[0].abs_diff(2048) <= 1, "got {px:?}"); + assert_eq!(px[0], px[1]); + assert_eq!(px[1], px[2]); + assert_eq!(px[3], 4095, "alpha must equal (1 << 12) - 1"); + } +} + // ---- Yuv420p16 --------------------------------------------------------- // // Planar 16-bit, full u16 range. Mid-gray is Y=UV=32768; full-range @@ -4392,6 +4538,54 @@ fn yuv422p12_gray_to_gray() { } } +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn yuv422p12_rgba_u8_only_gray_with_opaque_alpha() { + // 12-bit mid-gray → 8-bit RGBA ≈ (128, 128, 128, 255). 
+ let (yp, up, vp) = solid_yuv422p_n_frame(16, 8, 2048, 2048, 2048); + let src = Yuv422p12Frame::new(&yp, &up, &vp, 16, 8, 16, 8, 8); + + let mut rgba = std::vec![0u8; 16 * 8 * 4]; + let mut sink = MixedSinker::::new(16, 8) + .with_rgba(&mut rgba) + .unwrap(); + yuv422p12_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); + + for px in rgba.chunks(4) { + assert!(px[0].abs_diff(128) <= 1); + assert_eq!(px[0], px[1]); + assert_eq!(px[1], px[2]); + assert_eq!(px[3], 0xFF, "alpha must be opaque"); + } +} + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn yuv422p12_rgba_u16_only_native_depth_gray_with_opaque_alpha() { + // 12-bit mid-gray → u16 RGBA: each color element ≈ 2048, alpha = 4095. + let (yp, up, vp) = solid_yuv422p_n_frame(16, 8, 2048, 2048, 2048); + let src = Yuv422p12Frame::new(&yp, &up, &vp, 16, 8, 16, 8, 8); + + let mut rgba = std::vec![0u16; 16 * 8 * 4]; + let mut sink = MixedSinker::::new(16, 8) + .with_rgba_u16(&mut rgba) + .unwrap(); + yuv422p12_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); + + for px in rgba.chunks(4) { + assert!(px[0].abs_diff(2048) <= 1, "got {px:?}"); + assert_eq!(px[0], px[1]); + assert_eq!(px[1], px[2]); + assert_eq!(px[3], 4095, "alpha must equal (1 << 12) - 1"); + } +} + #[test] #[cfg_attr( miri, @@ -4414,6 +4608,54 @@ fn yuv422p14_gray_to_gray() { } } +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn yuv422p14_rgba_u8_only_gray_with_opaque_alpha() { + // 14-bit mid-gray → 8-bit RGBA ≈ (128, 128, 128, 255). 
+ let (yp, up, vp) = solid_yuv422p_n_frame(16, 8, 8192, 8192, 8192); + let src = Yuv422p14Frame::new(&yp, &up, &vp, 16, 8, 16, 8, 8); + + let mut rgba = std::vec![0u8; 16 * 8 * 4]; + let mut sink = MixedSinker::::new(16, 8) + .with_rgba(&mut rgba) + .unwrap(); + yuv422p14_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); + + for px in rgba.chunks(4) { + assert!(px[0].abs_diff(128) <= 1); + assert_eq!(px[0], px[1]); + assert_eq!(px[1], px[2]); + assert_eq!(px[3], 0xFF, "alpha must be opaque"); + } +} + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn yuv422p14_rgba_u16_only_native_depth_gray_with_opaque_alpha() { + // 14-bit mid-gray → u16 RGBA: each color element ≈ 8192, alpha = 16383. + let (yp, up, vp) = solid_yuv422p_n_frame(16, 8, 8192, 8192, 8192); + let src = Yuv422p14Frame::new(&yp, &up, &vp, 16, 8, 16, 8, 8); + + let mut rgba = std::vec![0u16; 16 * 8 * 4]; + let mut sink = MixedSinker::::new(16, 8) + .with_rgba_u16(&mut rgba) + .unwrap(); + yuv422p14_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); + + for px in rgba.chunks(4) { + assert!(px[0].abs_diff(8192) <= 1, "got {px:?}"); + assert_eq!(px[0], px[1]); + assert_eq!(px[1], px[2]); + assert_eq!(px[3], 16383, "alpha must equal (1 << 14) - 1"); + } +} + #[test] #[cfg_attr( miri, @@ -4493,20 +4735,22 @@ fn yuv444p12_gray_to_gray() { miri, ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" )] -fn yuv444p14_gray_to_gray() { - let (yp, up, vp) = solid_yuv444p_n_frame(16, 8, 8192, 8192, 8192); - let src = Yuv444p14Frame::new(&yp, &up, &vp, 16, 8, 16, 16, 16); +fn yuv444p12_rgba_u8_only_gray_with_opaque_alpha() { + // 12-bit mid-gray → 8-bit RGBA ≈ (128, 128, 128, 255). 
+ let (yp, up, vp) = solid_yuv444p_n_frame(16, 8, 2048, 2048, 2048); + let src = Yuv444p12Frame::new(&yp, &up, &vp, 16, 8, 16, 16, 16); - let mut rgb = std::vec![0u8; 16 * 8 * 3]; - let mut sink = MixedSinker::::new(16, 8) - .with_rgb(&mut rgb) + let mut rgba = std::vec![0u8; 16 * 8 * 4]; + let mut sink = MixedSinker::::new(16, 8) + .with_rgba(&mut rgba) .unwrap(); - yuv444p14_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); + yuv444p12_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); - for px in rgb.chunks(3) { + for px in rgba.chunks(4) { assert!(px[0].abs_diff(128) <= 1); assert_eq!(px[0], px[1]); assert_eq!(px[1], px[2]); + assert_eq!(px[3], 0xFF, "alpha must be opaque"); } } @@ -4515,26 +4759,120 @@ fn yuv444p14_gray_to_gray() { miri, ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" )] -fn yuv444p16_gray_to_gray_u16() { - let (yp, up, vp) = solid_yuv444p_n_frame(16, 8, 32768, 32768, 32768); - let src = Yuv444p16Frame::new(&yp, &up, &vp, 16, 8, 16, 16, 16); +fn yuv444p12_rgba_u16_only_native_depth_gray_with_opaque_alpha() { + // 12-bit mid-gray → u16 RGBA: each color element ≈ 2048, alpha = 4095. 
+ let (yp, up, vp) = solid_yuv444p_n_frame(16, 8, 2048, 2048, 2048); + let src = Yuv444p12Frame::new(&yp, &up, &vp, 16, 8, 16, 16, 16); - let mut rgb_u8 = std::vec![0u8; 16 * 8 * 3]; - let mut rgb_u16 = std::vec![0u16; 16 * 8 * 3]; - let mut sink = MixedSinker::::new(16, 8) - .with_rgb(&mut rgb_u8) - .unwrap() - .with_rgb_u16(&mut rgb_u16) + let mut rgba = std::vec![0u16; 16 * 8 * 4]; + let mut sink = MixedSinker::::new(16, 8) + .with_rgba_u16(&mut rgba) .unwrap(); - yuv444p16_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); + yuv444p12_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); - for px in rgb_u8.chunks(3) { - assert!(px[0].abs_diff(128) <= 1); - assert_eq!(px[0], px[1]); - assert_eq!(px[1], px[2]); - } - for px in rgb_u16.chunks(3) { - assert!(px[0].abs_diff(32768) <= 256); + for px in rgba.chunks(4) { + assert!(px[0].abs_diff(2048) <= 1, "got {px:?}"); + assert_eq!(px[0], px[1]); + assert_eq!(px[1], px[2]); + assert_eq!(px[3], 4095, "alpha must equal (1 << 12) - 1"); + } +} + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn yuv444p14_gray_to_gray() { + let (yp, up, vp) = solid_yuv444p_n_frame(16, 8, 8192, 8192, 8192); + let src = Yuv444p14Frame::new(&yp, &up, &vp, 16, 8, 16, 16, 16); + + let mut rgb = std::vec![0u8; 16 * 8 * 3]; + let mut sink = MixedSinker::::new(16, 8) + .with_rgb(&mut rgb) + .unwrap(); + yuv444p14_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); + + for px in rgb.chunks(3) { + assert!(px[0].abs_diff(128) <= 1); + assert_eq!(px[0], px[1]); + assert_eq!(px[1], px[2]); + } +} + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn yuv444p14_rgba_u8_only_gray_with_opaque_alpha() { + // 14-bit mid-gray → 8-bit RGBA ≈ (128, 128, 128, 255). 
+ let (yp, up, vp) = solid_yuv444p_n_frame(16, 8, 8192, 8192, 8192); + let src = Yuv444p14Frame::new(&yp, &up, &vp, 16, 8, 16, 16, 16); + + let mut rgba = std::vec![0u8; 16 * 8 * 4]; + let mut sink = MixedSinker::::new(16, 8) + .with_rgba(&mut rgba) + .unwrap(); + yuv444p14_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); + + for px in rgba.chunks(4) { + assert!(px[0].abs_diff(128) <= 1); + assert_eq!(px[0], px[1]); + assert_eq!(px[1], px[2]); + assert_eq!(px[3], 0xFF, "alpha must be opaque"); + } +} + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn yuv444p14_rgba_u16_only_native_depth_gray_with_opaque_alpha() { + // 14-bit mid-gray → u16 RGBA: each color element ≈ 8192, alpha = 16383. + let (yp, up, vp) = solid_yuv444p_n_frame(16, 8, 8192, 8192, 8192); + let src = Yuv444p14Frame::new(&yp, &up, &vp, 16, 8, 16, 16, 16); + + let mut rgba = std::vec![0u16; 16 * 8 * 4]; + let mut sink = MixedSinker::::new(16, 8) + .with_rgba_u16(&mut rgba) + .unwrap(); + yuv444p14_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); + + for px in rgba.chunks(4) { + assert!(px[0].abs_diff(8192) <= 1, "got {px:?}"); + assert_eq!(px[0], px[1]); + assert_eq!(px[1], px[2]); + assert_eq!(px[3], 16383, "alpha must equal (1 << 14) - 1"); + } +} + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn yuv444p16_gray_to_gray_u16() { + let (yp, up, vp) = solid_yuv444p_n_frame(16, 8, 32768, 32768, 32768); + let src = Yuv444p16Frame::new(&yp, &up, &vp, 16, 8, 16, 16, 16); + + let mut rgb_u8 = std::vec![0u8; 16 * 8 * 3]; + let mut rgb_u16 = std::vec![0u16; 16 * 8 * 3]; + let mut sink = MixedSinker::::new(16, 8) + .with_rgb(&mut rgb_u8) + .unwrap() + .with_rgb_u16(&mut rgb_u16) + .unwrap(); + yuv444p16_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); + + for px in rgb_u8.chunks(3) { + assert!(px[0].abs_diff(128) <= 1); + assert_eq!(px[0], px[1]); + 
assert_eq!(px[1], px[2]); + } + for px in rgb_u16.chunks(3) { + assert!(px[0].abs_diff(32768) <= 256); assert_eq!(px[0], px[1]); assert_eq!(px[1], px[2]); } @@ -4807,6 +5145,58 @@ fn yuv420p9_gray_to_gray() { } } +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn yuv420p9_rgba_u8_only_gray_with_opaque_alpha() { + // 9-bit mid-gray (Y=U=V=256) → 8-bit RGBA ≈ (128, 128, 128, 255). + let (yp, up, vp) = solid_yuv422p_n_frame(16, 8, 256, 256, 256); + let up = up[..8 * 4].to_vec(); + let vp = vp[..8 * 4].to_vec(); + let src = Yuv420p9Frame::new(&yp, &up, &vp, 16, 8, 16, 8, 8); + + let mut rgba = std::vec![0u8; 16 * 8 * 4]; + let mut sink = MixedSinker::::new(16, 8) + .with_rgba(&mut rgba) + .unwrap(); + yuv420p9_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); + + for px in rgba.chunks(4) { + assert!(px[0].abs_diff(128) <= 1); + assert_eq!(px[0], px[1]); + assert_eq!(px[1], px[2]); + assert_eq!(px[3], 0xFF, "alpha must be opaque"); + } +} + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn yuv420p9_rgba_u16_only_native_depth_gray_with_opaque_alpha() { + // 9-bit mid-gray → u16 RGBA: each color element ≈ 256, alpha = 511. 
+ let (yp, up, vp) = solid_yuv422p_n_frame(16, 8, 256, 256, 256); + let up = up[..8 * 4].to_vec(); + let vp = vp[..8 * 4].to_vec(); + let src = Yuv420p9Frame::new(&yp, &up, &vp, 16, 8, 16, 8, 8); + + let mut rgba = std::vec![0u16; 16 * 8 * 4]; + let mut sink = MixedSinker::::new(16, 8) + .with_rgba_u16(&mut rgba) + .unwrap(); + yuv420p9_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); + + for px in rgba.chunks(4) { + assert!(px[0].abs_diff(256) <= 1, "got {px:?}"); + assert_eq!(px[0], px[1]); + assert_eq!(px[1], px[2]); + assert_eq!(px[3], 511, "alpha must equal (1 << 9) - 1"); + } +} + #[test] #[cfg_attr( miri, @@ -4829,6 +5219,54 @@ fn yuv422p9_gray_to_gray() { } } +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn yuv422p9_rgba_u8_only_gray_with_opaque_alpha() { + // 9-bit mid-gray → 8-bit RGBA ≈ (128, 128, 128, 255). + let (yp, up, vp) = solid_yuv422p_n_frame(16, 8, 256, 256, 256); + let src = Yuv422p9Frame::new(&yp, &up, &vp, 16, 8, 16, 8, 8); + + let mut rgba = std::vec![0u8; 16 * 8 * 4]; + let mut sink = MixedSinker::::new(16, 8) + .with_rgba(&mut rgba) + .unwrap(); + yuv422p9_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); + + for px in rgba.chunks(4) { + assert!(px[0].abs_diff(128) <= 1); + assert_eq!(px[0], px[1]); + assert_eq!(px[1], px[2]); + assert_eq!(px[3], 0xFF, "alpha must be opaque"); + } +} + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn yuv422p9_rgba_u16_only_native_depth_gray_with_opaque_alpha() { + // 9-bit mid-gray → u16 RGBA: each color element ≈ 256, alpha = 511. 
+ let (yp, up, vp) = solid_yuv422p_n_frame(16, 8, 256, 256, 256); + let src = Yuv422p9Frame::new(&yp, &up, &vp, 16, 8, 16, 8, 8); + + let mut rgba = std::vec![0u16; 16 * 8 * 4]; + let mut sink = MixedSinker::::new(16, 8) + .with_rgba_u16(&mut rgba) + .unwrap(); + yuv422p9_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); + + for px in rgba.chunks(4) { + assert!(px[0].abs_diff(256) <= 1, "got {px:?}"); + assert_eq!(px[0], px[1]); + assert_eq!(px[1], px[2]); + assert_eq!(px[3], 511, "alpha must equal (1 << 9) - 1"); + } +} + #[test] #[cfg_attr( miri, @@ -4851,6 +5289,54 @@ fn yuv444p9_gray_to_gray() { } } +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn yuv444p9_rgba_u8_only_gray_with_opaque_alpha() { + // 9-bit mid-gray → 8-bit RGBA ≈ (128, 128, 128, 255). + let (yp, up, vp) = solid_yuv444p_n_frame(16, 8, 256, 256, 256); + let src = Yuv444p9Frame::new(&yp, &up, &vp, 16, 8, 16, 16, 16); + + let mut rgba = std::vec![0u8; 16 * 8 * 4]; + let mut sink = MixedSinker::::new(16, 8) + .with_rgba(&mut rgba) + .unwrap(); + yuv444p9_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); + + for px in rgba.chunks(4) { + assert!(px[0].abs_diff(128) <= 1); + assert_eq!(px[0], px[1]); + assert_eq!(px[1], px[2]); + assert_eq!(px[3], 0xFF, "alpha must be opaque"); + } +} + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn yuv444p9_rgba_u16_only_native_depth_gray_with_opaque_alpha() { + // 9-bit mid-gray → u16 RGBA: each color element ≈ 256, alpha = 511. 
+ let (yp, up, vp) = solid_yuv444p_n_frame(16, 8, 256, 256, 256); + let src = Yuv444p9Frame::new(&yp, &up, &vp, 16, 8, 16, 16, 16); + + let mut rgba = std::vec![0u16; 16 * 8 * 4]; + let mut sink = MixedSinker::::new(16, 8) + .with_rgba_u16(&mut rgba) + .unwrap(); + yuv444p9_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); + + for px in rgba.chunks(4) { + assert!(px[0].abs_diff(256) <= 1, "got {px:?}"); + assert_eq!(px[0], px[1]); + assert_eq!(px[1], px[2]); + assert_eq!(px[3], 511, "alpha must equal (1 << 9) - 1"); + } +} + #[test] #[cfg_attr( miri, @@ -5039,6 +5525,55 @@ fn yuv440p12_gray_to_gray() { } } +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn yuv440p12_rgba_u8_only_gray_with_opaque_alpha() { + // 4:4:0 reuses the 4:4:4 dispatcher. 12-bit mid-gray → 8-bit RGBA + // ≈ (128, 128, 128, 255). + let (yp, up, vp) = solid_yuv440p_n_frame(16, 8, 2048, 2048, 2048); + let src = Yuv440p12Frame::new(&yp, &up, &vp, 16, 8, 16, 16, 16); + + let mut rgba = std::vec![0u8; 16 * 8 * 4]; + let mut sink = MixedSinker::::new(16, 8) + .with_rgba(&mut rgba) + .unwrap(); + yuv440p12_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); + + for px in rgba.chunks(4) { + assert!(px[0].abs_diff(128) <= 1); + assert_eq!(px[0], px[1]); + assert_eq!(px[1], px[2]); + assert_eq!(px[3], 0xFF, "alpha must be opaque"); + } +} + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn yuv440p12_rgba_u16_only_native_depth_gray_with_opaque_alpha() { + // 12-bit mid-gray → u16 RGBA: each color element ≈ 2048, alpha = 4095. 
+ let (yp, up, vp) = solid_yuv440p_n_frame(16, 8, 2048, 2048, 2048); + let src = Yuv440p12Frame::new(&yp, &up, &vp, 16, 8, 16, 16, 16); + + let mut rgba = std::vec![0u16; 16 * 8 * 4]; + let mut sink = MixedSinker::::new(16, 8) + .with_rgba_u16(&mut rgba) + .unwrap(); + yuv440p12_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); + + for px in rgba.chunks(4) { + assert!(px[0].abs_diff(2048) <= 1, "got {px:?}"); + assert_eq!(px[0], px[1]); + assert_eq!(px[1], px[2]); + assert_eq!(px[3], 4095, "alpha must equal (1 << 12) - 1"); + } +} + #[test] #[cfg_attr( miri, @@ -5288,18 +5823,239 @@ fn p212_gray_to_gray() { miri, ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" )] -fn p216_gray_to_gray_u16() { - let (yp, uvp) = solid_p2x0_frame(16, 8, 16, 32768, 32768, 32768); - let src = P216Frame::new(&yp, &uvp, 16, 8, 16, 16); +fn p212_rgba_u8_only_gray_with_opaque_alpha() { + // P212 mid-gray (12-bit values shifted into the high 12): Y/U/V = 2048 << 4. + // Output 8-bit RGBA ≈ (128, 128, 128, 255). + let (yp, uvp) = solid_p2x0_frame(16, 8, 12, 2048, 2048, 2048); + let src = P212Frame::new(&yp, &uvp, 16, 8, 16, 16); + + let mut rgba = std::vec![0u8; 16 * 8 * 4]; + let mut sink = MixedSinker::::new(16, 8) + .with_rgba(&mut rgba) + .unwrap(); + p212_to(&src, true, ColorMatrix::Bt709, &mut sink).unwrap(); + + for px in rgba.chunks(4) { + assert!(px[0].abs_diff(128) <= 1); + assert_eq!(px[0], px[1]); + assert_eq!(px[1], px[2]); + assert_eq!(px[3], 0xFF, "alpha must be opaque"); + } +} + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn p212_rgba_u16_only_native_depth_gray_with_opaque_alpha() { + // P212 mid-gray → u16 RGBA: each color element ≈ 2048 (low-bit-packed), + // alpha = (1 << 12) - 1 = 4095. 
+ let (yp, uvp) = solid_p2x0_frame(16, 8, 12, 2048, 2048, 2048); + let src = P212Frame::new(&yp, &uvp, 16, 8, 16, 16); + + let mut rgba = std::vec![0u16; 16 * 8 * 4]; + let mut sink = MixedSinker::::new(16, 8) + .with_rgba_u16(&mut rgba) + .unwrap(); + p212_to(&src, true, ColorMatrix::Bt709, &mut sink).unwrap(); + + for px in rgba.chunks(4) { + assert!(px[0].abs_diff(2048) <= 1, "got {px:?}"); + assert_eq!(px[0], px[1]); + assert_eq!(px[1], px[2]); + assert_eq!(px[3], 4095, "alpha must equal (1 << 12) - 1"); + } +} + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn p216_gray_to_gray_u16() { + let (yp, uvp) = solid_p2x0_frame(16, 8, 16, 32768, 32768, 32768); + let src = P216Frame::new(&yp, &uvp, 16, 8, 16, 16); + + let mut rgb_u8 = std::vec![0u8; 16 * 8 * 3]; + let mut rgb_u16 = std::vec![0u16; 16 * 8 * 3]; + let mut sink = MixedSinker::::new(16, 8) + .with_rgb(&mut rgb_u8) + .unwrap() + .with_rgb_u16(&mut rgb_u16) + .unwrap(); + p216_to(&src, true, ColorMatrix::Bt709, &mut sink).unwrap(); + + for px in rgb_u8.chunks(3) { + assert!(px[0].abs_diff(128) <= 1); + assert_eq!(px[0], px[1]); + assert_eq!(px[1], px[2]); + } + for px in rgb_u16.chunks(3) { + assert!(px[0].abs_diff(32768) <= 256); + assert_eq!(px[0], px[1]); + assert_eq!(px[1], px[2]); + } +} + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn p216_rgba_u8_only_gray_with_opaque_alpha() { + // P216 mid-gray (16-bit, no shift): Y/U/V = 32768. Output 8-bit RGBA + // ≈ (128, 128, 128, 255). 
+ let (yp, uvp) = solid_p2x0_frame(16, 8, 16, 32768, 32768, 32768); + let src = P216Frame::new(&yp, &uvp, 16, 8, 16, 16); + + let mut rgba = std::vec![0u8; 16 * 8 * 4]; + let mut sink = MixedSinker::::new(16, 8) + .with_rgba(&mut rgba) + .unwrap(); + p216_to(&src, true, ColorMatrix::Bt709, &mut sink).unwrap(); + + for px in rgba.chunks(4) { + assert!(px[0].abs_diff(128) <= 1); + assert_eq!(px[0], px[1]); + assert_eq!(px[1], px[2]); + assert_eq!(px[3], 0xFF, "alpha must be opaque"); + } +} + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn p216_rgba_u16_only_native_depth_gray_with_opaque_alpha() { + // 16-bit mid-gray → u16 RGBA: each color element ≈ 32768, alpha = 0xFFFF. + // Covers the 16-bit dedicated kernel family (no Q15 downshift). + let (yp, uvp) = solid_p2x0_frame(16, 8, 16, 32768, 32768, 32768); + let src = P216Frame::new(&yp, &uvp, 16, 8, 16, 16); + + let mut rgba = std::vec![0u16; 16 * 8 * 4]; + let mut sink = MixedSinker::::new(16, 8) + .with_rgba_u16(&mut rgba) + .unwrap(); + p216_to(&src, true, ColorMatrix::Bt709, &mut sink).unwrap(); + + for px in rgba.chunks(4) { + assert!(px[0].abs_diff(32768) <= 256, "got {px:?}"); + assert_eq!(px[0], px[1]); + assert_eq!(px[1], px[2]); + assert_eq!(px[3], 0xFFFF, "alpha must equal 0xFFFF"); + } +} + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn p410_gray_to_gray() { + // 4:4:4: uv_stride = 2 * width = 32 (16 pairs × 2 elements). 
+ let (yp, uvp) = solid_p4x0_frame(16, 8, 10, 512, 512, 512); + let src = P410Frame::new(&yp, &uvp, 16, 8, 16, 32); + + let mut rgb = std::vec![0u8; 16 * 8 * 3]; + let mut sink = MixedSinker::::new(16, 8).with_rgb(&mut rgb).unwrap(); + p410_to(&src, true, ColorMatrix::Bt709, &mut sink).unwrap(); + + for px in rgb.chunks(3) { + assert!(px[0].abs_diff(128) <= 1); + assert_eq!(px[0], px[1]); + assert_eq!(px[1], px[2]); + } +} + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn p412_gray_to_gray() { + let (yp, uvp) = solid_p4x0_frame(16, 8, 12, 2048, 2048, 2048); + let src = P412Frame::new(&yp, &uvp, 16, 8, 16, 32); + + let mut rgb = std::vec![0u8; 16 * 8 * 3]; + let mut sink = MixedSinker::::new(16, 8).with_rgb(&mut rgb).unwrap(); + p412_to(&src, true, ColorMatrix::Bt709, &mut sink).unwrap(); + + for px in rgb.chunks(3) { + assert!(px[0].abs_diff(128) <= 1); + assert_eq!(px[0], px[1]); + assert_eq!(px[1], px[2]); + } +} + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn p412_rgba_u8_only_gray_with_opaque_alpha() { + // P412 mid-gray (12-bit values shifted into the high 12): Y/U/V = 2048 << 4. + // Output 8-bit RGBA ≈ (128, 128, 128, 255). 
+    let (yp, uvp) = solid_p4x0_frame(16, 8, 12, 2048, 2048, 2048);
+    let src = P412Frame::new(&yp, &uvp, 16, 8, 16, 32);
+
+    let mut rgba = std::vec![0u8; 16 * 8 * 4];
+    let mut sink = MixedSinker::new(16, 8)
+        .with_rgba(&mut rgba)
+        .unwrap();
+    p412_to(&src, true, ColorMatrix::Bt709, &mut sink).unwrap();
+
+    for px in rgba.chunks(4) {
+        assert!(px[0].abs_diff(128) <= 1);
+        assert_eq!(px[0], px[1]);
+        assert_eq!(px[1], px[2]);
+        assert_eq!(px[3], 0xFF, "alpha must be opaque");
+    }
+}
+
+#[test]
+#[cfg_attr(
+    miri,
+    ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri"
+)]
+fn p412_rgba_u16_only_native_depth_gray_with_opaque_alpha() {
+    // P412 mid-gray → u16 RGBA: each color element ≈ 2048 (low-bit-packed),
+    // alpha = (1 << 12) - 1 = 4095.
+    let (yp, uvp) = solid_p4x0_frame(16, 8, 12, 2048, 2048, 2048);
+    let src = P412Frame::new(&yp, &uvp, 16, 8, 16, 32);
+
+    let mut rgba = std::vec![0u16; 16 * 8 * 4];
+    let mut sink = MixedSinker::new(16, 8)
+        .with_rgba_u16(&mut rgba)
+        .unwrap();
+    p412_to(&src, true, ColorMatrix::Bt709, &mut sink).unwrap();
+
+    for px in rgba.chunks(4) {
+        assert!(px[0].abs_diff(2048) <= 1, "got {px:?}");
+        assert_eq!(px[0], px[1]);
+        assert_eq!(px[1], px[2]);
+        assert_eq!(px[3], 4095, "alpha must equal (1 << 12) - 1");
+    }
+}
+
+#[test]
+#[cfg_attr(
+    miri,
+    ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri"
+)]
+fn p416_gray_to_gray_u16() {
+    let (yp, uvp) = solid_p4x0_frame(16, 8, 16, 32768, 32768, 32768);
+    let src = P416Frame::new(&yp, &uvp, 16, 8, 16, 32);
     let mut rgb_u8 = std::vec![0u8; 16 * 8 * 3];
     let mut rgb_u16 = std::vec![0u16; 16 * 8 * 3];
-    let mut sink = MixedSinker::new(16, 8)
+    let mut sink = MixedSinker::new(16, 8)
         .with_rgb(&mut rgb_u8)
         .unwrap()
         .with_rgb_u16(&mut rgb_u16)
         .unwrap();
-    p216_to(&src, true, ColorMatrix::Bt709, &mut sink).unwrap();
+    p416_to(&src, true, ColorMatrix::Bt709, &mut sink).unwrap();
     for px in rgb_u8.chunks(3) {
         assert!(px[0].abs_diff(128) <= 1);
@@ -5318,39 +6074,23 @@ fn p216_gray_to_gray_u16() {
     miri,
     ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri"
 )]
-fn p410_gray_to_gray() {
-    // 4:4:4: uv_stride = 2 * width = 32 (16 pairs × 2 elements).
-    let (yp, uvp) = solid_p4x0_frame(16, 8, 10, 512, 512, 512);
-    let src = P410Frame::new(&yp, &uvp, 16, 8, 16, 32);
-
-    let mut rgb = std::vec![0u8; 16 * 8 * 3];
-    let mut sink = MixedSinker::new(16, 8).with_rgb(&mut rgb).unwrap();
-    p410_to(&src, true, ColorMatrix::Bt709, &mut sink).unwrap();
-
-    for px in rgb.chunks(3) {
-        assert!(px[0].abs_diff(128) <= 1);
-        assert_eq!(px[0], px[1]);
-        assert_eq!(px[1], px[2]);
-    }
-}
-
-#[test]
-#[cfg_attr(
-    miri,
-    ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri"
-)]
-fn p412_gray_to_gray() {
-    let (yp, uvp) = solid_p4x0_frame(16, 8, 12, 2048, 2048, 2048);
-    let src = P412Frame::new(&yp, &uvp, 16, 8, 16, 32);
+fn p416_rgba_u8_only_gray_with_opaque_alpha() {
+    // P416 mid-gray (16-bit, no shift): Y/U/V = 32768. Output 8-bit RGBA
+    // ≈ (128, 128, 128, 255).
+    let (yp, uvp) = solid_p4x0_frame(16, 8, 16, 32768, 32768, 32768);
+    let src = P416Frame::new(&yp, &uvp, 16, 8, 16, 32);
-    let mut rgb = std::vec![0u8; 16 * 8 * 3];
-    let mut sink = MixedSinker::new(16, 8).with_rgb(&mut rgb).unwrap();
-    p412_to(&src, true, ColorMatrix::Bt709, &mut sink).unwrap();
+    let mut rgba = std::vec![0u8; 16 * 8 * 4];
+    let mut sink = MixedSinker::new(16, 8)
+        .with_rgba(&mut rgba)
+        .unwrap();
+    p416_to(&src, true, ColorMatrix::Bt709, &mut sink).unwrap();
-    for px in rgb.chunks(3) {
+    for px in rgba.chunks(4) {
         assert!(px[0].abs_diff(128) <= 1);
         assert_eq!(px[0], px[1]);
         assert_eq!(px[1], px[2]);
+        assert_eq!(px[3], 0xFF, "alpha must be opaque");
     }
 }
@@ -5359,28 +6099,23 @@ fn p412_gray_to_gray() {
     miri,
     ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri"
 )]
-fn p416_gray_to_gray_u16() {
+fn p416_rgba_u16_only_native_depth_gray_with_opaque_alpha() {
+    // 16-bit mid-gray → u16 RGBA: each color element ≈ 32768, alpha = 0xFFFF.
+    // Covers the 16-bit dedicated kernel family (no Q15 downshift).
     let (yp, uvp) = solid_p4x0_frame(16, 8, 16, 32768, 32768, 32768);
     let src = P416Frame::new(&yp, &uvp, 16, 8, 16, 32);
-    let mut rgb_u8 = std::vec![0u8; 16 * 8 * 3];
-    let mut rgb_u16 = std::vec![0u16; 16 * 8 * 3];
+    let mut rgba = std::vec![0u16; 16 * 8 * 4];
     let mut sink = MixedSinker::new(16, 8)
-        .with_rgb(&mut rgb_u8)
-        .unwrap()
-        .with_rgb_u16(&mut rgb_u16)
+        .with_rgba_u16(&mut rgba)
         .unwrap();
     p416_to(&src, true, ColorMatrix::Bt709, &mut sink).unwrap();
-    for px in rgb_u8.chunks(3) {
-        assert!(px[0].abs_diff(128) <= 1);
-        assert_eq!(px[0], px[1]);
-        assert_eq!(px[1], px[2]);
-    }
-    for px in rgb_u16.chunks(3) {
-        assert!(px[0].abs_diff(32768) <= 256);
+    for px in rgba.chunks(4) {
+        assert!(px[0].abs_diff(32768) <= 256, "got {px:?}");
         assert_eq!(px[0], px[1]);
         assert_eq!(px[1], px[2]);
+        assert_eq!(px[3], 0xFFFF, "alpha must equal 0xFFFF");
     }
 }
@@ -6421,3 +7156,233 @@ fn yuv422p16_rgba_u16_only_native_depth_gray_with_opaque_alpha() {
         assert_eq!(px[3], 0xFFFF, "alpha must equal 0xFFFF");
     }
 }
+
+// ===== Ship 8 Tranche 7c — high-bit 4:4:4 RGBA sinker tests ==========
+//
+// Mirrors PR #26's 4:2:0 coverage scope: representative formats only,
+// not exhaustive per-format. Yuv444p10 covers the BITS-generic planar
+// path; P410 covers the Pn semi-planar path; Yuv444p16 covers the
+// 16-bit dedicated kernel; Yuv440p10 covers the 4:4:0 kernel-reuse
+// path. The remaining 4:4:4 high-bit formats (9/12/14, P412/P416,
+// Yuv440p12) are exercised by row-layer tests + the compile-time
+// guarantee that the new sinker builders typecheck.
+
+#[test]
+#[cfg_attr(
+    miri,
+    ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri"
+)]
+fn yuv444p10_rgba_u8_only_gray_with_opaque_alpha() {
+    // 10-bit mid-gray (Y=512, U=512, V=512) → 8-bit RGBA ≈ (128, 128, 128, 255).
+    let (yp, up, vp) = solid_yuv444p_n_frame(16, 8, 512, 512, 512);
+    let src = Yuv444p10Frame::new(&yp, &up, &vp, 16, 8, 16, 16, 16);
+
+    let mut rgba = std::vec![0u8; 16 * 8 * 4];
+    let mut sink = MixedSinker::new(16, 8)
+        .with_rgba(&mut rgba)
+        .unwrap();
+    yuv444p10_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap();
+
+    for px in rgba.chunks(4) {
+        assert!(px[0].abs_diff(128) <= 1);
+        assert_eq!(px[0], px[1]);
+        assert_eq!(px[1], px[2]);
+        assert_eq!(px[3], 0xFF, "alpha must be opaque");
+    }
+}
+
+#[test]
+#[cfg_attr(
+    miri,
+    ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri"
+)]
+fn yuv444p10_rgba_u16_only_native_depth_gray_with_opaque_alpha() {
+    // 10-bit mid-gray → u16 RGBA: each color element ≈ 512, alpha = 1023.
+    let (yp, up, vp) = solid_yuv444p_n_frame(16, 8, 512, 512, 512);
+    let src = Yuv444p10Frame::new(&yp, &up, &vp, 16, 8, 16, 16, 16);
+
+    let mut rgba = std::vec![0u16; 16 * 8 * 4];
+    let mut sink = MixedSinker::new(16, 8)
+        .with_rgba_u16(&mut rgba)
+        .unwrap();
+    yuv444p10_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap();
+
+    for px in rgba.chunks(4) {
+        assert!(px[0].abs_diff(512) <= 1, "got {px:?}");
+        assert_eq!(px[0], px[1]);
+        assert_eq!(px[1], px[2]);
+        assert_eq!(px[3], 1023, "alpha must equal (1 << 10) - 1");
+    }
+}
+
+#[test]
+#[cfg_attr(
+    miri,
+    ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri"
+)]
+fn yuv444p10_with_rgb_and_with_rgba_produce_byte_identical_rgb_bytes() {
+    // Strategy A on the u8 path: rgb buffer populated by the RGB kernel,
+    // rgba buffer populated via the cheap expand_rgb_to_rgba_row pass.
+    // RGB triples must be byte-identical to the standalone RGB-only run.
+    let (yp, up, vp) = solid_yuv444p_n_frame(64, 16, 600, 400, 700);
+    let src = Yuv444p10Frame::new(&yp, &up, &vp, 64, 16, 64, 64, 64);
+
+    let mut rgb_solo = std::vec![0u8; 64 * 16 * 3];
+    let mut s_solo = MixedSinker::new(64, 16)
+        .with_rgb(&mut rgb_solo)
+        .unwrap();
+    yuv444p10_to(&src, true, ColorMatrix::Bt709, &mut s_solo).unwrap();
+
+    let mut rgb_combined = std::vec![0u8; 64 * 16 * 3];
+    let mut rgba = std::vec![0u8; 64 * 16 * 4];
+    let mut s_combined = MixedSinker::new(64, 16)
+        .with_rgb(&mut rgb_combined)
+        .unwrap()
+        .with_rgba(&mut rgba)
+        .unwrap();
+    yuv444p10_to(&src, true, ColorMatrix::Bt709, &mut s_combined).unwrap();
+
+    assert_eq!(rgb_solo, rgb_combined, "RGB bytes must match across runs");
+    for (rgb_px, rgba_px) in rgb_combined.chunks(3).zip(rgba.chunks(4)) {
+        assert_eq!(rgb_px[0], rgba_px[0]);
+        assert_eq!(rgb_px[1], rgba_px[1]);
+        assert_eq!(rgb_px[2], rgba_px[2]);
+        assert_eq!(rgba_px[3], 0xFF);
+    }
+}
+
+#[test]
+#[cfg_attr(
+    miri,
+    ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri"
+)]
+fn yuv444p10_with_rgb_u16_and_with_rgba_u16_produce_byte_identical_rgb_elems() {
+    // Strategy A on the u16 path: rgb_u16 buffer populated by the u16 RGB
+    // kernel, rgba_u16 fanned out via expand_rgb_u16_to_rgba_u16_row<10>.
+    let (yp, up, vp) = solid_yuv444p_n_frame(64, 16, 600, 400, 700);
+    let src = Yuv444p10Frame::new(&yp, &up, &vp, 64, 16, 64, 64, 64);
+
+    let mut rgb_solo = std::vec![0u16; 64 * 16 * 3];
+    let mut s_solo = MixedSinker::new(64, 16)
+        .with_rgb_u16(&mut rgb_solo)
+        .unwrap();
+    yuv444p10_to(&src, true, ColorMatrix::Bt709, &mut s_solo).unwrap();
+
+    let mut rgb_combined = std::vec![0u16; 64 * 16 * 3];
+    let mut rgba = std::vec![0u16; 64 * 16 * 4];
+    let mut s_combined = MixedSinker::new(64, 16)
+        .with_rgb_u16(&mut rgb_combined)
+        .unwrap()
+        .with_rgba_u16(&mut rgba)
+        .unwrap();
+    yuv444p10_to(&src, true, ColorMatrix::Bt709, &mut s_combined).unwrap();
+
+    assert_eq!(
+        rgb_solo, rgb_combined,
+        "RGB u16 elements must match across runs"
+    );
+    for (rgb_px, rgba_px) in rgb_combined.chunks(3).zip(rgba.chunks(4)) {
+        assert_eq!(rgb_px[0], rgba_px[0]);
+        assert_eq!(rgb_px[1], rgba_px[1]);
+        assert_eq!(rgb_px[2], rgba_px[2]);
+        assert_eq!(rgba_px[3], 1023, "alpha = (1 << 10) - 1");
+    }
+}
+
+#[test]
+fn yuv444p10_rgba_too_short_returns_err() {
+    let mut rgba = std::vec![0u8; 10];
+    let err = MixedSinker::new(16, 8)
+        .with_rgba(&mut rgba)
+        .err()
+        .expect("expected RgbaBufferTooShort");
+    assert!(matches!(err, MixedSinkerError::RgbaBufferTooShort { .. }));
+}
+
+#[test]
+fn yuv444p10_rgba_u16_too_short_returns_err() {
+    let mut rgba = std::vec![0u16; 10];
+    let err = MixedSinker::new(16, 8)
+        .with_rgba_u16(&mut rgba)
+        .err()
+        .expect("expected RgbaU16BufferTooShort");
+    assert!(matches!(
+        err,
+        MixedSinkerError::RgbaU16BufferTooShort { .. }
+    ));
+}
+
+#[test]
+#[cfg_attr(
+    miri,
+    ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri"
+)]
+fn p410_rgba_u16_only_native_depth_gray_with_opaque_alpha() {
+    // P410 (semi-planar 10-bit): mid-gray (high-bit-packed = 512 << 6).
+    // u16 RGBA output ≈ 512, alpha = 1023.
+    let (yp, uvp) = solid_p4x0_frame(16, 8, 10, 512, 512, 512);
+    let src = P410Frame::new(&yp, &uvp, 16, 8, 16, 32);
+
+    let mut rgba = std::vec![0u16; 16 * 8 * 4];
+    let mut sink = MixedSinker::new(16, 8)
+        .with_rgba_u16(&mut rgba)
+        .unwrap();
+    p410_to(&src, true, ColorMatrix::Bt709, &mut sink).unwrap();
+
+    for px in rgba.chunks(4) {
+        assert!(px[0].abs_diff(512) <= 1, "got {px:?}");
+        assert_eq!(px[0], px[1]);
+        assert_eq!(px[1], px[2]);
+        assert_eq!(px[3], 1023, "alpha must equal (1 << 10) - 1");
+    }
+}
+
+#[test]
+#[cfg_attr(
+    miri,
+    ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri"
+)]
+fn yuv444p16_rgba_u16_only_native_depth_gray_with_opaque_alpha() {
+    // 16-bit mid-gray → u16 RGBA: each color element ≈ 32768, alpha = 0xFFFF.
+    // Covers the 16-bit dedicated kernel family (no Q15 downshift).
+    let (yp, up, vp) = solid_yuv444p_n_frame(16, 8, 32768, 32768, 32768);
+    let src = Yuv444p16Frame::new(&yp, &up, &vp, 16, 8, 16, 16, 16);
+
+    let mut rgba = std::vec![0u16; 16 * 8 * 4];
+    let mut sink = MixedSinker::new(16, 8)
+        .with_rgba_u16(&mut rgba)
+        .unwrap();
+    yuv444p16_to(&src, true, ColorMatrix::Bt709, &mut sink).unwrap();
+
+    for px in rgba.chunks(4) {
+        assert!(px[0].abs_diff(32768) <= 256, "got {px:?}");
+        assert_eq!(px[0], px[1]);
+        assert_eq!(px[1], px[2]);
+        assert_eq!(px[3], 0xFFFF, "alpha must equal 0xFFFF");
+    }
+}
+
+#[test]
+#[cfg_attr(
+    miri,
+    ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri"
+)]
+fn yuv440p10_rgba_u16_only_native_depth_gray_with_opaque_alpha() {
+    // 4:4:0 reuses the 4:4:4 dispatcher. Confirms the kernel-reuse path
+    // wires through correctly at the sinker boundary.
+    let (yp, up, vp) = solid_yuv440p_n_frame(16, 8, 512, 512, 512);
+    let src = Yuv440p10Frame::new(&yp, &up, &vp, 16, 8, 16, 16, 16);
+
+    let mut rgba = std::vec![0u16; 16 * 8 * 4];
+    let mut sink = MixedSinker::new(16, 8)
+        .with_rgba_u16(&mut rgba)
+        .unwrap();
+    yuv440p10_to(&src, true, ColorMatrix::Bt709, &mut sink).unwrap();
+
+    for px in rgba.chunks(4) {
+        assert!(px[0].abs_diff(512) <= 1, "got {px:?}");
+        assert_eq!(px[0], px[1]);
+        assert_eq!(px[1], px[2]);
+        assert_eq!(px[3], 1023, "alpha must equal (1 << 10) - 1");
+    }
+}