diff --git a/CHANGELOG.md b/CHANGELOG.md index 4be82d2..1ae086b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -199,20 +199,22 @@ backends are wired in follow-up sub-PRs without breaking call sites.** | 4c | 4:4:0 planar | `Yuv440p` | ✅ shipped (PR #22) — wiring-only (reuses `yuv_444_to_rgba_row`) | | 5 | High-bit 4:2:0 | `Yuv420p9/10/12/14/16`, `P010/P012/P016` | ✅ shipped — **5** scalar prep + dispatchers (PR #24); **5a** u8 SIMD across all 5 backends (PR #25); **5b** u16 SIMD + sinker integration (PR #26) | | 6 | High-bit 4:2:2 | `Yuv422p9/10/12/14/16`, `P210/P212/P216` | ✅ shipped (PR #28) — sinker-only; reuses tranche-5 row kernels via the established 4:2:2 → 4:2:0 dispatcher pattern. (`Yuv440p10/12` deferred to tranche 7 alongside the 4:4:4 work it depends on.) | -| 7 | High-bit 4:4:4 + 4:4:0 | `Yuv444p9/10/12/14/16`, `P410/P412/P416`, `Yuv440p10/12` | ⏳ **in progress** — **7** scalar prep + dispatchers shipped (PR #29; `use_simd` parameter held in the signature but routes to scalar until 7b/7c wire SIMD). 7b u8 SIMD pending; 7c u16 SIMD + sinker integration pending. | +| 7 | High-bit 4:4:4 + 4:4:0 | `Yuv444p9/10/12/14/16`, `P410/P412/P416`, `Yuv440p10/12` | ✅ shipped — **7** scalar prep + dispatchers (PR #29); **7b** u8 SIMD across all 5 backends (PR #30); **7c** u16 SIMD + sinker integration incl. `Yuv440p10/12` reusing 4:4:4 dispatchers (PR #31) | | 8 | RAW | `Bayer`, `Bayer16` | (deferred — RAW already has `with_luma_coefficients`) | ### SIMD coverage -For tranches 1–6 (everything shipped): all 5 backends (NEON, SSE4.1, -AVX2, AVX-512, wasm simd128) have the const-ALPHA `<…, ALPHA>` template -wired for both u8 and u16 RGBA paths. Per-arch RGBA store helpers added -where needed: `vst4q_u8` / `vst4q_u16` (NEON), `write_rgba_16` / -`write_rgba_u16_8` (SSE4.1, AVX2 via re-export), `write_rgba_64` / -`write_rgba_u16_32` + `write_quarter_rgba` (AVX-512), `u8x16_splat` / -`i16x8_shuffle`-based `write_rgba_u16_8` (wasm). 
- -For tranche 7: scalar-only as of PR #29. SIMD backends land in 7b/7c. +**All 7 tranches (Ship 8 complete)**: 5 backends (NEON, SSE4.1, AVX2, +AVX-512, wasm simd128) have the const-ALPHA `<…, ALPHA>` template +wired for both u8 and u16 RGBA paths across every high-bit kernel +family (4:2:0 in tranche 5; 4:4:4 + Pn-444 in tranche 7). 4:2:2 and +4:4:0 sinkers reuse 4:2:0 / 4:4:4 dispatchers respectively — no new +SIMD code needed for those subsampling families. Per-arch RGBA store +helpers added in tranche 5: `vst4q_u8` / `vst4q_u16` (NEON), +`write_rgba_16` / `write_rgba_u16_8` (SSE4.1, AVX2 via re-export), +`write_rgba_64` / `write_rgba_u16_32` + `write_quarter_rgba` +(AVX-512), `u8x16_splat` / `i16x8_shuffle`-based `write_rgba_u16_8` +(wasm). Reused verbatim across tranches 5–7. ### Cleanup PRs @@ -226,17 +228,20 @@ For tranche 7: scalar-only as of PR #29. SIMD backends land in 7b/7c. (`src/frame.rs`, `src/raw/types.rs`, `src/raw/bayer.rs`, `src/raw/bayer16.rs`) into sibling files. Same shape as PR #21. -### Tests (cumulative through PR #29) - -- **513 tests pass on aarch64-darwin** (host) at the end of tranche 7 - scalar prep; +6 since tranche 6 (PR #28: 507) for the new 4:4:4 - scalar reference paths. -- Per-arch RGBA equivalence tests: 30 tests × 5 backends per high-bit - family (Tranche 5 added BITS=9/10/12/14 + 16 + Pn for both u8 and - u16 paths, all matrices × ranges × tail widths). -- Sinker integration tests: 8 new in PR #26 (4:2:0), 8 in PR #28 - (4:2:2), 6 in PR #29 (4:4:4 scalar). Cover both standalone-RGBA - and Strategy A combine paths plus buffer-too-short error variants. +### Tests (cumulative through PR #31, Ship 8 complete) + +- **534 tests pass on aarch64-darwin** (host) at Ship 8 close; + trajectory: 507 (PR #28, 4:2:2 sinker) → 513 (PR #29, 4:4:4 scalar + prep) → 519 (PR #30, 4:4:4 u8 SIMD) → 534 (PR #31, 4:4:4 u16 SIMD + + sinker). 
+- Per-arch RGBA equivalence tests: ~30 per high-bit family across all + 5 backends — tranche 5 added 4:2:0 (u8 + u16, BITS=9/10/12/14 + 16 + + Pn); tranche 7b/7c added 4:4:4 (u8 + u16, BITS=9/10/12/14 + 16 + + Pn-444). All matrices × ranges × natural-block + tail widths. +- Sinker integration tests: 8 in PR #26 (4:2:0), 8 in PR #28 (4:2:2), + 6 in PR #29 (4:4:4 scalar), 9 in PR #31 (4:4:4 + Yuv440p10 cross- + family kernel-reuse proof). Cover standalone-RGBA, Strategy A + combine, and buffer-too-short error variants. - All x86 `#[test]` functions exercising new SIMD kernels include `is_x86_feature_detected!` early-return guards (per the PR #25 CI fallout — without them, ASAN sanitizer saw `SIGILL` and Miri diff --git a/src/row/arch/neon.rs b/src/row/arch/neon.rs index 3a6acc7..f7972c0 100644 --- a/src/row/arch/neon.rs +++ b/src/row/arch/neon.rs @@ -896,6 +896,8 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_row( width: usize, matrix: ColorMatrix, full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + yuv_444p_n_to_rgb_or_rgba_u16_row::(y, u, v, rgb_out, width, matrix, full_range); + } +} + +/// NEON sibling of [`yuv_444p_n_to_rgba_row`] for native-depth `u16` +/// output. Alpha samples are `(1 << BITS) - 1` (opaque maximum at the +/// input bit depth) — matches `scalar::yuv_444p_n_to_rgba_u16_row`. +/// +/// Thin wrapper over [`yuv_444p_n_to_rgb_or_rgba_u16_row`] with `ALPHA = true`. +/// +/// # Safety +/// +/// Same as [`yuv_444p_n_to_rgb_u16_row`], plus +/// `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "neon")] +pub(crate) unsafe fn yuv_444p_n_to_rgba_u16_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. 
+ unsafe { + yuv_444p_n_to_rgb_or_rgba_u16_row::(y, u, v, rgba_out, width, matrix, full_range); + } +} + +/// Shared NEON high-bit YUV 4:4:4 → native-depth `u16` kernel. +/// `ALPHA = false` writes RGB triples via `vst3q_u16`; `ALPHA = true` +/// writes RGBA quads via `vst4q_u16` with constant alpha +/// `(1 << BITS) - 1`. +/// +/// # Safety +/// +/// 1. **NEON must be available.** +/// 2. `y.len() >= width`, `u.len() >= width`, `v.len() >= width`, +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. +/// 3. `BITS` ∈ `{9, 10, 12, 14}`. +#[inline] +#[target_feature(enable = "neon")] +pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_u16_row( + y: &[u16], + u: &[u16], + v: &[u16], + out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, ) { // Compile-time guard — `out_max = ((1 << BITS) - 1) as i16` below // silently wraps to -1 at BITS=16, corrupting the u16 clamp. The // dedicated 16-bit u16-output path is `yuv_444p16_to_rgb_u16_row`. const { assert!(BITS == 9 || BITS == 10 || BITS == 12 || BITS == 14) }; + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert!(y.len() >= width); debug_assert!(u.len() >= width); debug_assert!(v.len() >= width); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); @@ -940,6 +998,7 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_u16_row( let cgv = vdupq_n_s32(coeffs.g_v()); let cbu = vdupq_n_s32(coeffs.b_u()); let cbv = vdupq_n_s32(coeffs.b_v()); + let alpha_u16 = vdupq_n_u16(out_max as u16); let mut x = 0usize; while x + 16 <= width { @@ -993,24 +1052,36 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_u16_row( let b_lo = clamp_u16_max(vqaddq_s16(y_scaled_lo, b_chroma_lo), zero_v, max_v); let b_hi = clamp_u16_max(vqaddq_s16(y_scaled_hi, b_chroma_hi), zero_v, max_v); - let rgb_lo = uint16x8x3_t(r_lo, g_lo, b_lo); - let rgb_hi = uint16x8x3_t(r_hi, g_hi, b_hi); - 
vst3q_u16(rgb_out.as_mut_ptr().add(x * 3), rgb_lo); - vst3q_u16(rgb_out.as_mut_ptr().add(x * 3 + 24), rgb_hi); + if ALPHA { + let rgba_lo = uint16x8x4_t(r_lo, g_lo, b_lo, alpha_u16); + let rgba_hi = uint16x8x4_t(r_hi, g_hi, b_hi, alpha_u16); + vst4q_u16(out.as_mut_ptr().add(x * 4), rgba_lo); + vst4q_u16(out.as_mut_ptr().add(x * 4 + 32), rgba_hi); + } else { + let rgb_lo = uint16x8x3_t(r_lo, g_lo, b_lo); + let rgb_hi = uint16x8x3_t(r_hi, g_hi, b_hi); + vst3q_u16(out.as_mut_ptr().add(x * 3), rgb_lo); + vst3q_u16(out.as_mut_ptr().add(x * 3 + 24), rgb_hi); + } x += 16; } if x < width { - scalar::yuv_444p_n_to_rgb_u16_row::( - &y[x..width], - &u[x..width], - &v[x..width], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_u = &u[x..width]; + let tail_v = &v[x..width]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::yuv_444p_n_to_rgba_u16_row::( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); + } else { + scalar::yuv_444p_n_to_rgb_u16_row::( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); + } } } } @@ -2846,6 +2917,8 @@ pub(crate) unsafe fn yuv_444p16_to_rgb_or_rgba_row( /// i64 chroma + i64 Y (same widening as `yuv_420p16_to_rgb_u16_row`); /// full-width U/V (no chroma duplication step). /// +/// Thin wrapper over [`yuv_444p16_to_rgb_or_rgba_u16_row`] with `ALPHA = false`. +/// /// # Safety /// /// Same as [`yuv_444p16_to_rgb_row`] but `rgb_out: &mut [u16]`. @@ -2860,10 +2933,63 @@ pub(crate) unsafe fn yuv_444p16_to_rgb_u16_row( matrix: ColorMatrix, full_range: bool, ) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + yuv_444p16_to_rgb_or_rgba_u16_row::(y, u, v, rgb_out, width, matrix, full_range); + } +} + +/// NEON sibling of [`yuv_444p16_to_rgba_row`] for native-depth `u16` +/// output. 
Alpha samples are `0xFFFF` (opaque maximum at u16 range) — +/// matches `scalar::yuv_444p16_to_rgba_u16_row`. +/// +/// Thin wrapper over [`yuv_444p16_to_rgb_or_rgba_u16_row`] with `ALPHA = true`. +/// +/// # Safety +/// +/// Same as [`yuv_444p16_to_rgb_u16_row`] plus `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "neon")] +pub(crate) unsafe fn yuv_444p16_to_rgba_u16_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + yuv_444p16_to_rgb_or_rgba_u16_row::(y, u, v, rgba_out, width, matrix, full_range); + } +} + +/// Shared NEON 16-bit YUV 4:4:4 → native-depth `u16` kernel. +/// `ALPHA = false` writes RGB triples via `vst3q_u16`; `ALPHA = true` +/// writes RGBA quads via `vst4q_u16` with constant alpha `0xFFFF`. +/// +/// # Safety +/// +/// 1. **NEON must be available.** +/// 2. `y.len() >= width`, `u.len() >= width`, `v.len() >= width`, +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. 
+#[inline] +#[target_feature(enable = "neon")] +pub(crate) unsafe fn yuv_444p16_to_rgb_or_rgba_u16_row( + y: &[u16], + u: &[u16], + v: &[u16], + out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert!(y.len() >= width); debug_assert!(u.len() >= width); debug_assert!(v.len() >= width); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::<16, 16>(full_range); @@ -2883,6 +3009,7 @@ pub(crate) unsafe fn yuv_444p16_to_rgb_u16_row( let cgv = vdupq_n_s32(coeffs.g_v()); let cbu = vdupq_n_s32(coeffs.b_u()); let cbv = vdupq_n_s32(coeffs.b_v()); + let alpha_u16 = vdupq_n_u16(0xFFFF); let mut x = 0usize; while x + 8 <= width { @@ -2943,23 +3070,35 @@ pub(crate) unsafe fn yuv_444p16_to_rgb_u16_row( vqmovun_s32(vaddq_s32(ys_hi, b_ch_hi)), ); - vst3q_u16( - rgb_out.as_mut_ptr().add(x * 3), - uint16x8x3_t(r_u16, g_u16, b_u16), - ); + if ALPHA { + vst4q_u16( + out.as_mut_ptr().add(x * 4), + uint16x8x4_t(r_u16, g_u16, b_u16, alpha_u16), + ); + } else { + vst3q_u16( + out.as_mut_ptr().add(x * 3), + uint16x8x3_t(r_u16, g_u16, b_u16), + ); + } x += 8; } if x < width { - scalar::yuv_444p16_to_rgb_u16_row( - &y[x..width], - &u[x..width], - &v[x..width], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_u = &u[x..width]; + let tail_v = &v[x..width]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::yuv_444p16_to_rgba_u16_row( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); + } else { + scalar::yuv_444p16_to_rgb_u16_row( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); + } } } } @@ -3560,6 +3699,8 @@ pub(crate) unsafe fn p_n_444_to_rgb_or_rgba_row( width: usize, matrix: ColorMatrix, full_range: bool, +) { + // 
SAFETY: caller obligations forwarded to the shared impl. + unsafe { + p_n_444_to_rgb_or_rgba_u16_row::(y, uv_full, rgb_out, width, matrix, full_range); + } +} + +/// NEON sibling of [`p_n_444_to_rgba_row`] for native-depth `u16` +/// output. Alpha samples are `(1 << BITS) - 1` (opaque maximum at the +/// input bit depth) — matches `scalar::p_n_444_to_rgba_u16_row`. +/// +/// Thin wrapper over [`p_n_444_to_rgb_or_rgba_u16_row`] with `ALPHA = true`. +/// +/// # Safety +/// +/// Same as [`p_n_444_to_rgb_u16_row`] plus `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "neon")] +pub(crate) unsafe fn p_n_444_to_rgba_u16_row( + y: &[u16], + uv_full: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + p_n_444_to_rgb_or_rgba_u16_row::(y, uv_full, rgba_out, width, matrix, full_range); + } +} + +/// Shared NEON Pn 4:4:4 high-bit-packed → native-depth `u16` kernel. +/// `ALPHA = false` writes RGB triples via `vst3q_u16`; `ALPHA = true` +/// writes RGBA quads via `vst4q_u16` with constant alpha +/// `(1 << BITS) - 1`. +/// +/// # Safety +/// +/// 1. NEON must be available on the current CPU. +/// 2. `y.len() >= width`, `uv_full.len() >= 2 * width`, +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. +/// 3. `BITS` ∈ `{10, 12}`. 
+#[inline] +#[target_feature(enable = "neon")] +pub(crate) unsafe fn p_n_444_to_rgb_or_rgba_u16_row( + y: &[u16], + uv_full: &[u16], + out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, ) { const { assert!(BITS == 10 || BITS == 12) }; + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert!(y.len() >= width); debug_assert!(uv_full.len() >= 2 * width); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); @@ -3601,6 +3795,7 @@ pub(crate) unsafe fn p_n_444_to_rgb_u16_row( let cgv = vdupq_n_s32(coeffs.g_v()); let cbu = vdupq_n_s32(coeffs.b_u()); let cbv = vdupq_n_s32(coeffs.b_v()); + let alpha_u16 = vdupq_n_u16(out_max as u16); let mut x = 0usize; while x + 16 <= width { @@ -3659,23 +3854,35 @@ pub(crate) unsafe fn p_n_444_to_rgb_u16_row( let b_lo = clamp_u16_max(vaddq_s16(y_scaled_lo, b_chroma_lo), zero_v, max_v); let b_hi = clamp_u16_max(vaddq_s16(y_scaled_hi, b_chroma_hi), zero_v, max_v); - let rgb_lo = uint16x8x3_t(r_lo, g_lo, b_lo); - let rgb_hi = uint16x8x3_t(r_hi, g_hi, b_hi); - vst3q_u16(rgb_out.as_mut_ptr().add(x * 3), rgb_lo); - vst3q_u16(rgb_out.as_mut_ptr().add(x * 3 + 24), rgb_hi); + if ALPHA { + let rgba_lo = uint16x8x4_t(r_lo, g_lo, b_lo, alpha_u16); + let rgba_hi = uint16x8x4_t(r_hi, g_hi, b_hi, alpha_u16); + vst4q_u16(out.as_mut_ptr().add(x * 4), rgba_lo); + vst4q_u16(out.as_mut_ptr().add(x * 4 + 32), rgba_hi); + } else { + let rgb_lo = uint16x8x3_t(r_lo, g_lo, b_lo); + let rgb_hi = uint16x8x3_t(r_hi, g_hi, b_hi); + vst3q_u16(out.as_mut_ptr().add(x * 3), rgb_lo); + vst3q_u16(out.as_mut_ptr().add(x * 3 + 24), rgb_hi); + } x += 16; } if x < width { - scalar::p_n_444_to_rgb_u16_row::( - &y[x..width], - &uv_full[x * 2..width * 2], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_uv = &uv_full[x * 2..width 
* 2]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::p_n_444_to_rgba_u16_row::( + tail_y, tail_uv, tail_out, tail_w, matrix, full_range, + ); + } else { + scalar::p_n_444_to_rgb_u16_row::( + tail_y, tail_uv, tail_out, tail_w, matrix, full_range, + ); + } } } } @@ -3887,6 +4094,8 @@ pub(crate) unsafe fn p_n_444_16_to_rgb_or_rgba_row( /// RGB. i64 chroma + i64 Y (chroma matrix multiply-add overflows i32 /// at u16 output for the BT.2020 blue coefficient). 8 pixels per iter. /// +/// Thin wrapper over [`p_n_444_16_to_rgb_or_rgba_u16_row`] with `ALPHA = false`. +/// /// # Safety /// /// Same as [`p_n_444_16_to_rgb_row`] but `rgb_out: &mut [u16]`. @@ -3900,9 +4109,61 @@ pub(crate) unsafe fn p_n_444_16_to_rgb_u16_row( matrix: ColorMatrix, full_range: bool, ) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + p_n_444_16_to_rgb_or_rgba_u16_row::(y, uv_full, rgb_out, width, matrix, full_range); + } +} + +/// NEON sibling of [`p_n_444_16_to_rgba_row`] for native-depth `u16` +/// output. Alpha samples are `0xFFFF` (opaque maximum at u16 range) — +/// matches `scalar::p_n_444_16_to_rgba_u16_row`. +/// +/// Thin wrapper over [`p_n_444_16_to_rgb_or_rgba_u16_row`] with `ALPHA = true`. +/// +/// # Safety +/// +/// Same as [`p_n_444_16_to_rgb_u16_row`] plus `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "neon")] +pub(crate) unsafe fn p_n_444_16_to_rgba_u16_row( + y: &[u16], + uv_full: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + p_n_444_16_to_rgb_or_rgba_u16_row::(y, uv_full, rgba_out, width, matrix, full_range); + } +} + +/// Shared NEON P416 (semi-planar 4:4:4, 16-bit) → native-depth `u16` +/// kernel. `ALPHA = false` writes RGB triples via `vst3q_u16`; +/// `ALPHA = true` writes RGBA quads via `vst4q_u16` with constant alpha +/// `0xFFFF`. 
+/// +/// # Safety +/// +/// 1. NEON must be available. +/// 2. `y.len() >= width`, `uv_full.len() >= 2 * width`, +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. +#[inline] +#[target_feature(enable = "neon")] +pub(crate) unsafe fn p_n_444_16_to_rgb_or_rgba_u16_row( + y: &[u16], + uv_full: &[u16], + out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert!(y.len() >= width); debug_assert!(uv_full.len() >= 2 * width); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::<16, 16>(full_range); @@ -3922,6 +4183,7 @@ pub(crate) unsafe fn p_n_444_16_to_rgb_u16_row( let cgv = vdupq_n_s32(coeffs.g_v()); let cbu = vdupq_n_s32(coeffs.b_u()); let cbv = vdupq_n_s32(coeffs.b_v()); + let alpha_u16 = vdupq_n_u16(0xFFFF); let mut x = 0usize; while x + 8 <= width { @@ -3980,22 +4242,30 @@ pub(crate) unsafe fn p_n_444_16_to_rgb_u16_row( vqmovun_s32(vaddq_s32(ys_hi, b_ch_hi)), ); - vst3q_u16( - rgb_out.as_mut_ptr().add(x * 3), - uint16x8x3_t(r_u16, g_u16, b_u16), - ); + if ALPHA { + vst4q_u16( + out.as_mut_ptr().add(x * 4), + uint16x8x4_t(r_u16, g_u16, b_u16, alpha_u16), + ); + } else { + vst3q_u16( + out.as_mut_ptr().add(x * 3), + uint16x8x3_t(r_u16, g_u16, b_u16), + ); + } x += 8; } if x < width { - scalar::p_n_444_16_to_rgb_u16_row( - &y[x..width], - &uv_full[x * 2..width * 2], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_uv = &uv_full[x * 2..width * 2]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::p_n_444_16_to_rgba_u16_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + } else { + scalar::p_n_444_16_to_rgb_u16_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + } } } } diff --git 
a/src/row/arch/neon/tests.rs b/src/row/arch/neon/tests.rs index 7c3b684..d312142 100644 --- a/src/row/arch/neon/tests.rs +++ b/src/row/arch/neon/tests.rs @@ -2546,3 +2546,191 @@ fn neon_p416_rgba_matches_scalar_all_matrices() { check_p_n_444_16_u8_neon_rgba_equivalence(w, ColorMatrix::Bt709, false); } } + +// ---- High-bit 4:4:4 native-depth `u16` RGBA equivalence (Ship 8 Tranche 7c) ---- +// +// u16 RGBA wrappers share the math of their u16 RGB siblings — only +// the store (and tail dispatch) branches on `ALPHA`, with alpha set to +// `(1 << BITS) - 1` for BITS-generic kernels and `0xFFFF` for 16-bit +// kernels. Tests pin byte-identical output against the scalar RGBA +// reference. + +fn check_yuv444p_n_u16_neon_rgba_equivalence( + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let y = planar_n_plane::(width, 37); + let u = planar_n_plane::(width, 53); + let v = planar_n_plane::(width, 71); + let mut rgba_scalar = std::vec![0u16; width * 4]; + let mut rgba_neon = std::vec![0u16; width * 4]; + scalar::yuv_444p_n_to_rgba_u16_row::( + &y, + &u, + &v, + &mut rgba_scalar, + width, + matrix, + full_range, + ); + unsafe { + yuv_444p_n_to_rgba_u16_row::(&y, &u, &v, &mut rgba_neon, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_neon, + "NEON Yuv444p<{BITS}> → RGBA u16 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +fn check_pn_444_u16_neon_rgba_equivalence( + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let y = high_bit_plane::(width, 37); + let u = high_bit_plane::(width, 53); + let v = high_bit_plane::(width, 71); + let uv = interleave_uv(&u, &v); + let mut rgba_scalar = std::vec![0u16; width * 4]; + let mut rgba_neon = std::vec![0u16; width * 4]; + scalar::p_n_444_to_rgba_u16_row::(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + unsafe { + p_n_444_to_rgba_u16_row::(&y, &uv, &mut rgba_neon, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_neon, + "NEON 
Pn4:4:4<{BITS}> → RGBA u16 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +fn check_yuv444p16_u16_neon_rgba_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { + let y = p16_plane_neon(width, 37); + let u = p16_plane_neon(width, 53); + let v = p16_plane_neon(width, 71); + let mut rgba_scalar = std::vec![0u16; width * 4]; + let mut rgba_neon = std::vec![0u16; width * 4]; + scalar::yuv_444p16_to_rgba_u16_row(&y, &u, &v, &mut rgba_scalar, width, matrix, full_range); + unsafe { + yuv_444p16_to_rgba_u16_row(&y, &u, &v, &mut rgba_neon, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_neon, + "NEON Yuv444p16 → RGBA u16 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +fn check_p_n_444_16_u16_neon_rgba_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { + let y = p16_plane_neon(width, 37); + let u = p16_plane_neon(width, 53); + let v = p16_plane_neon(width, 71); + let uv = interleave_uv(&u, &v); + let mut rgba_scalar = std::vec![0u16; width * 4]; + let mut rgba_neon = std::vec![0u16; width * 4]; + scalar::p_n_444_16_to_rgba_u16_row(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + unsafe { + p_n_444_16_to_rgba_u16_row(&y, &uv, &mut rgba_neon, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_neon, + "NEON P416 → RGBA u16 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +#[test] +#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] +fn neon_yuv444p_n_rgba_u16_matches_scalar_all_bits() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_yuv444p_n_u16_neon_rgba_equivalence::<9>(16, m, full); + check_yuv444p_n_u16_neon_rgba_equivalence::<10>(16, m, full); + check_yuv444p_n_u16_neon_rgba_equivalence::<12>(16, m, full); + 
check_yuv444p_n_u16_neon_rgba_equivalence::<14>(16, m, full); + } + } +} + +#[test] +#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] +fn neon_yuv444p_n_rgba_u16_matches_scalar_tail_and_widths() { + for w in [17usize, 31, 47, 63, 1920, 1922] { + check_yuv444p_n_u16_neon_rgba_equivalence::<9>(w, ColorMatrix::Bt601, false); + check_yuv444p_n_u16_neon_rgba_equivalence::<10>(w, ColorMatrix::Bt709, true); + check_yuv444p_n_u16_neon_rgba_equivalence::<12>(w, ColorMatrix::Bt2020Ncl, false); + check_yuv444p_n_u16_neon_rgba_equivalence::<14>(w, ColorMatrix::YCgCo, true); + } +} + +#[test] +#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] +fn neon_pn_444_rgba_u16_matches_scalar_all_bits() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_pn_444_u16_neon_rgba_equivalence::<10>(16, m, full); + check_pn_444_u16_neon_rgba_equivalence::<12>(16, m, full); + } + } +} + +#[test] +#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] +fn neon_pn_444_rgba_u16_matches_scalar_tail_and_widths() { + for w in [17usize, 31, 47, 63, 1920, 1922] { + check_pn_444_u16_neon_rgba_equivalence::<10>(w, ColorMatrix::Bt601, false); + check_pn_444_u16_neon_rgba_equivalence::<12>(w, ColorMatrix::Bt709, true); + } +} + +#[test] +#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] +fn neon_yuv444p16_rgba_u16_matches_scalar_all_matrices() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_yuv444p16_u16_neon_rgba_equivalence(16, m, full); + } + } + for w in [17usize, 31, 47, 63, 1920, 1922] { + check_yuv444p16_u16_neon_rgba_equivalence(w, ColorMatrix::Bt709, false); + } +} + +#[test] +#[cfg_attr(miri, ignore = "NEON SIMD intrinsics 
unsupported by Miri")] +fn neon_p416_rgba_u16_matches_scalar_all_matrices() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_p_n_444_16_u16_neon_rgba_equivalence(16, m, full); + } + } + for w in [17usize, 31, 47, 63, 1920, 1922] { + check_p_n_444_16_u16_neon_rgba_equivalence(w, ColorMatrix::Bt709, false); + } +} diff --git a/src/row/arch/wasm_simd128.rs b/src/row/arch/wasm_simd128.rs index a147d92..b5eab6a 100644 --- a/src/row/arch/wasm_simd128.rs +++ b/src/row/arch/wasm_simd128.rs @@ -824,6 +824,8 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_row( width: usize, matrix: ColorMatrix, full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + yuv_444p_n_to_rgb_or_rgba_u16_row::(y, u, v, rgb_out, width, matrix, full_range); + } +} + +/// WASM simd128 sibling of [`yuv_444p_n_to_rgba_row`] for native-depth +/// `u16` output. Alpha samples are `(1 << BITS) - 1` (opaque maximum +/// at the input bit depth). +/// +/// # Safety +/// +/// Same as [`yuv_444p_n_to_rgb_u16_row`] but `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "simd128")] +pub(crate) unsafe fn yuv_444p_n_to_rgba_u16_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + yuv_444p_n_to_rgb_or_rgba_u16_row::(y, u, v, rgba_out, width, matrix, full_range); + } +} + +/// Shared WASM simd128 high-bit YUV 4:4:4 → native-depth `u16` kernel. +/// `ALPHA = false` writes RGB triples via `write_rgb_u16_8`; +/// `ALPHA = true` writes RGBA quads via `write_rgba_u16_8` with +/// constant alpha `(1 << BITS) - 1`. +/// +/// # Safety +/// +/// 1. **simd128 must be enabled at compile time.** +/// 2. 
`y.len() >= width`, `u.len() >= width`, `v.len() >= width`, +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. +/// 3. `BITS` ∈ `{9, 10, 12, 14}`. +#[inline] +#[target_feature(enable = "simd128")] +pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_u16_row( + y: &[u16], + u: &[u16], + v: &[u16], + out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, ) { const { assert!(BITS == 9 || BITS == 10 || BITS == 12 || BITS == 14) }; + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert!(y.len() >= width); debug_assert!(u.len() >= width); debug_assert!(v.len() >= width); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); @@ -865,6 +920,7 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_u16_row( let cgv = i32x4_splat(coeffs.g_v()); let cbu = i32x4_splat(coeffs.b_u()); let cbv = i32x4_splat(coeffs.b_v()); + let alpha_u16 = u16x8_splat(out_max as u16); let mut x = 0usize; while x + 16 <= width { @@ -915,23 +971,34 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_u16_row( let b_lo = clamp_u16_max_wasm(i16x8_add_sat(y_scaled_lo, b_chroma_lo), zero_v, max_v); let b_hi = clamp_u16_max_wasm(i16x8_add_sat(y_scaled_hi, b_chroma_hi), zero_v, max_v); - let dst = rgb_out.as_mut_ptr().add(x * 3); - write_rgb_u16_8(r_lo, g_lo, b_lo, dst); - write_rgb_u16_8(r_hi, g_hi, b_hi, dst.add(24)); + if ALPHA { + let dst = out.as_mut_ptr().add(x * 4); + write_rgba_u16_8(r_lo, g_lo, b_lo, alpha_u16, dst); + write_rgba_u16_8(r_hi, g_hi, b_hi, alpha_u16, dst.add(32)); + } else { + let dst = out.as_mut_ptr().add(x * 3); + write_rgb_u16_8(r_lo, g_lo, b_lo, dst); + write_rgb_u16_8(r_hi, g_hi, b_hi, dst.add(24)); + } x += 16; } if x < width { - scalar::yuv_444p_n_to_rgb_u16_row::( - &y[x..width], - &u[x..width], - &v[x..width], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + 
let tail_u = &u[x..width]; + let tail_v = &v[x..width]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::yuv_444p_n_to_rgba_u16_row::( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); + } else { + scalar::yuv_444p_n_to_rgb_u16_row::( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); + } } } } @@ -1112,10 +1179,9 @@ pub(crate) unsafe fn yuv_444p16_to_rgb_or_rgba_row( } /// WASM simd128 YUV 4:4:4 planar **16-bit** → packed **u16** RGB. -/// Falls through to scalar — simd128 has no native `i64x2` arithmetic -/// shift, and the `srai64_15` bias trick at 128 bits (single i64 pair -/// per lane) is not cheaper than scalar. Same rationale as -/// [`yuv_420p16_to_rgb_u16_row`]. +/// 8 pixels per iter on the i64 chroma pipeline. +/// +/// Thin wrapper over [`yuv_444p16_to_rgb_or_rgba_u16_row`] with `ALPHA = false`. /// /// # Safety /// @@ -1131,10 +1197,61 @@ pub(crate) unsafe fn yuv_444p16_to_rgb_u16_row( matrix: ColorMatrix, full_range: bool, ) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + yuv_444p16_to_rgb_or_rgba_u16_row::(y, u, v, rgb_out, width, matrix, full_range); + } +} + +/// wasm simd128 sibling of [`yuv_444p16_to_rgba_row`] for native-depth +/// `u16` output. Alpha is `0xFFFF`. +/// +/// # Safety +/// +/// Same as [`yuv_444p16_to_rgb_u16_row`] but `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "simd128")] +pub(crate) unsafe fn yuv_444p16_to_rgba_u16_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + yuv_444p16_to_rgb_or_rgba_u16_row::(y, u, v, rgba_out, width, matrix, full_range); + } +} + +/// Shared wasm simd128 16-bit YUV 4:4:4 → native-depth `u16` kernel. 
+/// `ALPHA = false` writes RGB triples via `write_rgb_u16_8`; +/// `ALPHA = true` writes RGBA quads via `write_rgba_u16_8` with +/// constant alpha `0xFFFF`. +/// +/// # Safety +/// +/// 1. **simd128 must be enabled at compile time.** +/// 2. `y.len() >= width`, `u.len() >= width`, `v.len() >= width`, +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. +#[inline] +#[target_feature(enable = "simd128")] +pub(crate) unsafe fn yuv_444p16_to_rgb_or_rgba_u16_row( + y: &[u16], + u: &[u16], + v: &[u16], + out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert!(y.len() >= width); debug_assert!(u.len() >= width); debug_assert!(v.len() >= width); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::<16, 16>(full_range); @@ -1142,6 +1259,7 @@ pub(crate) unsafe fn yuv_444p16_to_rgb_u16_row( const RND_I32: i32 = 1 << 14; unsafe { + let alpha_u16 = u16x8_splat(0xFFFF); let rnd_i64 = i64x2_splat(RND_I64); let rnd_i32 = i32x4_splat(RND_I32); let y_off32 = i32x4_splat(y_off); @@ -1230,20 +1348,29 @@ pub(crate) unsafe fn yuv_444p16_to_rgb_u16_row( i32x4_add(y_hi_scaled, b_ch_hi), ); - write_rgb_u16_8(r_u16, g_u16, b_u16, rgb_out.as_mut_ptr().add(x * 3)); + if ALPHA { + write_rgba_u16_8(r_u16, g_u16, b_u16, alpha_u16, out.as_mut_ptr().add(x * 4)); + } else { + write_rgb_u16_8(r_u16, g_u16, b_u16, out.as_mut_ptr().add(x * 3)); + } x += 8; } if x < width { - scalar::yuv_444p16_to_rgb_u16_row( - &y[x..width], - &u[x..width], - &v[x..width], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_u = &u[x..width]; + let tail_v = &v[x..width]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::yuv_444p16_to_rgba_u16_row( + tail_y, tail_u, tail_v, 
tail_out, tail_w, matrix, full_range, + ); + } else { + scalar::yuv_444p16_to_rgb_u16_row( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); + } } } } @@ -3635,6 +3762,8 @@ pub(crate) unsafe fn p_n_444_to_rgb_or_rgba_row( width: usize, matrix: ColorMatrix, full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + p_n_444_to_rgb_or_rgba_u16_row::(y, uv_full, rgb_out, width, matrix, full_range); + } +} + +/// wasm simd128 sibling of [`p_n_444_to_rgba_row`] for native-depth +/// `u16` output. Alpha samples are `(1 << BITS) - 1` (opaque maximum +/// at the input bit depth). +/// +/// # Safety +/// +/// Same as [`p_n_444_to_rgb_u16_row`] but `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "simd128")] +pub(crate) unsafe fn p_n_444_to_rgba_u16_row( + y: &[u16], + uv_full: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + p_n_444_to_rgb_or_rgba_u16_row::(y, uv_full, rgba_out, width, matrix, full_range); + } +} + +/// Shared wasm simd128 Pn 4:4:4 high-bit-packed → native-depth `u16` +/// kernel. `ALPHA = false` writes RGB triples via `write_rgb_u16_8`; +/// `ALPHA = true` writes RGBA quads via `write_rgba_u16_8` with +/// constant alpha `(1 << BITS) - 1`. +/// +/// # Safety +/// +/// 1. simd128 must be enabled at compile time. +/// 2. `y.len() >= width`, `uv_full.len() >= 2 * width`, +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. +/// 3. `BITS` must be one of `{10, 12}`. 
+#[inline] +#[target_feature(enable = "simd128")] +pub(crate) unsafe fn p_n_444_to_rgb_or_rgba_u16_row( + y: &[u16], + uv_full: &[u16], + out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, ) { const { assert!(BITS == 10 || BITS == 12) }; + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert!(y.len() >= width); debug_assert!(uv_full.len() >= 2 * width); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); @@ -3673,6 +3853,7 @@ pub(crate) unsafe fn p_n_444_to_rgb_u16_row( let cgv = i32x4_splat(coeffs.g_v()); let cbu = i32x4_splat(coeffs.b_u()); let cbv = i32x4_splat(coeffs.b_v()); + let alpha_u16 = u16x8_splat(out_max as u16); let shr = (16 - BITS) as u32; @@ -3733,22 +3914,33 @@ pub(crate) unsafe fn p_n_444_to_rgb_u16_row( let b_lo = clamp_u16_max_wasm(i16x8_add_sat(y_scaled_lo, b_chroma_lo), zero_v, max_v); let b_hi = clamp_u16_max_wasm(i16x8_add_sat(y_scaled_hi, b_chroma_hi), zero_v, max_v); - let dst = rgb_out.as_mut_ptr().add(x * 3); - write_rgb_u16_8(r_lo, g_lo, b_lo, dst); - write_rgb_u16_8(r_hi, g_hi, b_hi, dst.add(24)); + if ALPHA { + let dst = out.as_mut_ptr().add(x * 4); + write_rgba_u16_8(r_lo, g_lo, b_lo, alpha_u16, dst); + write_rgba_u16_8(r_hi, g_hi, b_hi, alpha_u16, dst.add(32)); + } else { + let dst = out.as_mut_ptr().add(x * 3); + write_rgb_u16_8(r_lo, g_lo, b_lo, dst); + write_rgb_u16_8(r_hi, g_hi, b_hi, dst.add(24)); + } x += 16; } if x < width { - scalar::p_n_444_to_rgb_u16_row::( - &y[x..width], - &uv_full[x * 2..width * 2], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_uv = &uv_full[x * 2..width * 2]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::p_n_444_to_rgba_u16_row::( + tail_y, tail_uv, tail_out, tail_w, matrix, full_range, + ); + } 
else { + scalar::p_n_444_to_rgb_u16_row::( + tail_y, tail_uv, tail_out, tail_w, matrix, full_range, + ); + } } } } @@ -3922,6 +4114,8 @@ pub(crate) unsafe fn p_n_444_16_to_rgb_or_rgba_row( /// wasm simd128 P416 → packed **native-depth `u16`** RGB. i64 chroma /// via native `i64x2_shr` (no bias trick needed). 8 pixels per iter. /// +/// Thin wrapper over [`p_n_444_16_to_rgb_or_rgba_u16_row`] with `ALPHA = false`. +/// /// # Safety /// /// Same as [`p_n_444_16_to_rgb_row`] but `rgb_out: &mut [u16]`. @@ -3935,9 +4129,58 @@ pub(crate) unsafe fn p_n_444_16_to_rgb_u16_row( matrix: ColorMatrix, full_range: bool, ) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + p_n_444_16_to_rgb_or_rgba_u16_row::(y, uv_full, rgb_out, width, matrix, full_range); + } +} + +/// wasm simd128 sibling of [`p_n_444_16_to_rgba_row`] for native-depth +/// `u16` output. Alpha is `0xFFFF`. +/// +/// # Safety +/// +/// Same as [`p_n_444_16_to_rgb_u16_row`] but `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "simd128")] +pub(crate) unsafe fn p_n_444_16_to_rgba_u16_row( + y: &[u16], + uv_full: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + p_n_444_16_to_rgb_or_rgba_u16_row::(y, uv_full, rgba_out, width, matrix, full_range); + } +} + +/// Shared wasm simd128 P416 → native-depth `u16` kernel. +/// `ALPHA = false` writes RGB triples via `write_rgb_u16_8`; +/// `ALPHA = true` writes RGBA quads via `write_rgba_u16_8` with +/// constant alpha `0xFFFF`. +/// +/// # Safety +/// +/// 1. simd128 must be enabled at compile time. +/// 2. `y.len() >= width`, `uv_full.len() >= 2 * width`, +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. 
+#[inline] +#[target_feature(enable = "simd128")] +pub(crate) unsafe fn p_n_444_16_to_rgb_or_rgba_u16_row( + y: &[u16], + uv_full: &[u16], + out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert!(y.len() >= width); debug_assert!(uv_full.len() >= 2 * width); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::<16, 16>(full_range); @@ -3945,6 +4188,7 @@ pub(crate) unsafe fn p_n_444_16_to_rgb_u16_row( const RND_I32: i32 = 1 << 14; unsafe { + let alpha_u16 = u16x8_splat(0xFFFF); let rnd_i64 = i64x2_splat(RND_I64); let rnd_i32 = i32x4_splat(RND_I32); let y_off32 = i32x4_splat(y_off); @@ -4028,19 +4272,24 @@ pub(crate) unsafe fn p_n_444_16_to_rgb_u16_row( i32x4_add(y_hi_scaled, b_ch_hi), ); - write_rgb_u16_8(r_u16, g_u16, b_u16, rgb_out.as_mut_ptr().add(x * 3)); + if ALPHA { + write_rgba_u16_8(r_u16, g_u16, b_u16, alpha_u16, out.as_mut_ptr().add(x * 4)); + } else { + write_rgb_u16_8(r_u16, g_u16, b_u16, out.as_mut_ptr().add(x * 3)); + } x += 8; } if x < width { - scalar::p_n_444_16_to_rgb_u16_row( - &y[x..width], - &uv_full[x * 2..width * 2], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_uv = &uv_full[x * 2..width * 2]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::p_n_444_16_to_rgba_u16_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + } else { + scalar::p_n_444_16_to_rgb_u16_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + } } } } diff --git a/src/row/arch/wasm_simd128/tests.rs b/src/row/arch/wasm_simd128/tests.rs index f7529f3..56603cd 100644 --- a/src/row/arch/wasm_simd128/tests.rs +++ b/src/row/arch/wasm_simd128/tests.rs @@ -2083,3 +2083,187 @@ fn 
simd128_p416_rgba_matches_scalar_all_matrices() { check_p_n_444_16_u8_simd128_rgba_equivalence(w, ColorMatrix::Bt709, false); } } + +// ---- High-bit 4:4:4 native-depth `u16` RGBA equivalence (Ship 8 Tranche 7c) ---- + +fn check_yuv444p_n_u16_simd128_rgba_equivalence( + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let y = planar_n_plane::(width, 37); + let u = planar_n_plane::(width, 53); + let v = planar_n_plane::(width, 71); + let mut rgba_scalar = std::vec![0u16; width * 4]; + let mut rgba_simd = std::vec![0u16; width * 4]; + scalar::yuv_444p_n_to_rgba_u16_row::( + &y, + &u, + &v, + &mut rgba_scalar, + width, + matrix, + full_range, + ); + unsafe { + yuv_444p_n_to_rgba_u16_row::(&y, &u, &v, &mut rgba_simd, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_simd, + "wasm simd128 Yuv444p<{BITS}> → RGBA u16 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +fn check_pn_444_u16_simd128_rgba_equivalence( + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let y = high_bit_plane_wasm::(width, 37); + let u = high_bit_plane_wasm::(width, 53); + let v = high_bit_plane_wasm::(width, 71); + let uv = interleave_uv_wasm(&u, &v); + let mut rgba_scalar = std::vec![0u16; width * 4]; + let mut rgba_simd = std::vec![0u16; width * 4]; + scalar::p_n_444_to_rgba_u16_row::(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + unsafe { + p_n_444_to_rgba_u16_row::(&y, &uv, &mut rgba_simd, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_simd, + "wasm simd128 Pn4:4:4<{BITS}> → RGBA u16 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +fn check_yuv444p16_u16_simd128_rgba_equivalence( + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let y = p16_plane_wasm(width, 37); + let u = p16_plane_wasm(width, 53); + let v = p16_plane_wasm(width, 71); + let mut rgba_scalar = std::vec![0u16; width * 4]; + let mut rgba_simd = std::vec![0u16; width * 4]; + 
scalar::yuv_444p16_to_rgba_u16_row(&y, &u, &v, &mut rgba_scalar, width, matrix, full_range); + unsafe { + yuv_444p16_to_rgba_u16_row(&y, &u, &v, &mut rgba_simd, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_simd, + "wasm simd128 Yuv444p16 → RGBA u16 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +fn check_p_n_444_16_u16_simd128_rgba_equivalence( + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let y = p16_plane_wasm(width, 37); + let u = p16_plane_wasm(width, 53); + let v = p16_plane_wasm(width, 71); + let uv = interleave_uv_wasm(&u, &v); + let mut rgba_scalar = std::vec![0u16; width * 4]; + let mut rgba_simd = std::vec![0u16; width * 4]; + scalar::p_n_444_16_to_rgba_u16_row(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + unsafe { + p_n_444_16_to_rgba_u16_row(&y, &uv, &mut rgba_simd, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_simd, + "wasm simd128 P416 → RGBA u16 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +#[test] +fn simd128_yuv444p_n_rgba_u16_matches_scalar_all_bits() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_yuv444p_n_u16_simd128_rgba_equivalence::<9>(16, m, full); + check_yuv444p_n_u16_simd128_rgba_equivalence::<10>(16, m, full); + check_yuv444p_n_u16_simd128_rgba_equivalence::<12>(16, m, full); + check_yuv444p_n_u16_simd128_rgba_equivalence::<14>(16, m, full); + } + } +} + +#[test] +fn simd128_yuv444p_n_rgba_u16_matches_scalar_tail_and_widths() { + for w in [17usize, 31, 47, 63, 1920, 1922] { + check_yuv444p_n_u16_simd128_rgba_equivalence::<9>(w, ColorMatrix::Bt601, false); + check_yuv444p_n_u16_simd128_rgba_equivalence::<10>(w, ColorMatrix::Bt709, true); + check_yuv444p_n_u16_simd128_rgba_equivalence::<12>(w, ColorMatrix::Bt2020Ncl, false); + 
check_yuv444p_n_u16_simd128_rgba_equivalence::<14>(w, ColorMatrix::YCgCo, true); + } +} + +#[test] +fn simd128_pn_444_rgba_u16_matches_scalar_all_bits() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_pn_444_u16_simd128_rgba_equivalence::<10>(16, m, full); + check_pn_444_u16_simd128_rgba_equivalence::<12>(16, m, full); + } + } +} + +#[test] +fn simd128_pn_444_rgba_u16_matches_scalar_tail_and_widths() { + for w in [17usize, 31, 47, 63, 1920, 1922] { + check_pn_444_u16_simd128_rgba_equivalence::<10>(w, ColorMatrix::Bt601, false); + check_pn_444_u16_simd128_rgba_equivalence::<12>(w, ColorMatrix::Bt709, true); + } +} + +#[test] +fn simd128_yuv444p16_rgba_u16_matches_scalar_all_matrices() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_yuv444p16_u16_simd128_rgba_equivalence(16, m, full); + } + } + for w in [17usize, 31, 47, 63, 1920, 1922] { + check_yuv444p16_u16_simd128_rgba_equivalence(w, ColorMatrix::Bt709, false); + } +} + +#[test] +fn simd128_p416_rgba_u16_matches_scalar_all_matrices() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_p_n_444_16_u16_simd128_rgba_equivalence(16, m, full); + } + } + for w in [17usize, 31, 47, 63, 1920, 1922] { + check_p_n_444_16_u16_simd128_rgba_equivalence(w, ColorMatrix::Bt709, false); + } +} diff --git a/src/row/arch/x86_avx2.rs b/src/row/arch/x86_avx2.rs index 4ad6d35..070f9e7 100644 --- a/src/row/arch/x86_avx2.rs +++ b/src/row/arch/x86_avx2.rs @@ -962,6 +962,8 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_row( width: usize, matrix: ColorMatrix, full_range: bool, +) { + // SAFETY: caller 
obligations forwarded to the shared impl. + unsafe { + yuv_444p_n_to_rgb_or_rgba_u16_row::(y, u, v, rgb_out, width, matrix, full_range); + } +} + +/// AVX2 sibling of [`yuv_444p_n_to_rgba_row`] for native-depth `u16` +/// output. Alpha samples are `(1 << BITS) - 1` (opaque maximum at the +/// input bit depth). +/// +/// Thin wrapper over [`yuv_444p_n_to_rgb_or_rgba_u16_row`] with `ALPHA = true`. +/// +/// # Safety +/// +/// Same as [`yuv_444p_n_to_rgb_u16_row`] plus `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "avx2")] +pub(crate) unsafe fn yuv_444p_n_to_rgba_u16_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + yuv_444p_n_to_rgb_or_rgba_u16_row::(y, u, v, rgba_out, width, matrix, full_range); + } +} + +/// Shared AVX2 high-bit YUV 4:4:4 → native-depth `u16` kernel. +/// `ALPHA = false` writes RGB triples via 4× `write_rgb_u16_8`; +/// `ALPHA = true` writes RGBA quads via 4× `write_rgba_u16_8` with +/// constant alpha `(1 << BITS) - 1`. +/// +/// # Safety +/// +/// 1. **AVX2 must be available on the current CPU.** +/// 2. `y.len() >= width`, `u.len() >= width`, `v.len() >= width`, +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. +/// 3. `BITS` ∈ `{9, 10, 12, 14}`. 
+#[inline] +#[target_feature(enable = "avx2")] +pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_u16_row( + y: &[u16], + u: &[u16], + v: &[u16], + out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, ) { const { assert!(BITS == 9 || BITS == 10 || BITS == 12 || BITS == 14) }; + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert!(y.len() >= width); debug_assert!(u.len() >= width); debug_assert!(v.len() >= width); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); @@ -1003,6 +1060,7 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_u16_row( let cgv = _mm256_set1_epi32(coeffs.g_v()); let cbu = _mm256_set1_epi32(coeffs.b_u()); let cbv = _mm256_set1_epi32(coeffs.b_v()); + let alpha_u16 = _mm_set1_epi16(out_max); let mut x = 0usize; while x + 32 <= width { @@ -1077,45 +1135,82 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_u16_row( let b_lo = clamp_u16_max_x16(_mm256_adds_epi16(y_scaled_lo, b_chroma_lo), zero_v, max_v); let b_hi = clamp_u16_max_x16(_mm256_adds_epi16(y_scaled_hi, b_chroma_hi), zero_v, max_v); - let dst = rgb_out.as_mut_ptr().add(x * 3); - write_rgb_u16_8( - _mm256_castsi256_si128(r_lo), - _mm256_castsi256_si128(g_lo), - _mm256_castsi256_si128(b_lo), - dst, - ); - write_rgb_u16_8( - _mm256_extracti128_si256::<1>(r_lo), - _mm256_extracti128_si256::<1>(g_lo), - _mm256_extracti128_si256::<1>(b_lo), - dst.add(24), - ); - write_rgb_u16_8( - _mm256_castsi256_si128(r_hi), - _mm256_castsi256_si128(g_hi), - _mm256_castsi256_si128(b_hi), - dst.add(48), - ); - write_rgb_u16_8( - _mm256_extracti128_si256::<1>(r_hi), - _mm256_extracti128_si256::<1>(g_hi), - _mm256_extracti128_si256::<1>(b_hi), - dst.add(72), - ); + if ALPHA { + let dst = out.as_mut_ptr().add(x * 4); + write_rgba_u16_8( + _mm256_castsi256_si128(r_lo), + _mm256_castsi256_si128(g_lo), + _mm256_castsi256_si128(b_lo), + alpha_u16, + 
dst, + ); + write_rgba_u16_8( + _mm256_extracti128_si256::<1>(r_lo), + _mm256_extracti128_si256::<1>(g_lo), + _mm256_extracti128_si256::<1>(b_lo), + alpha_u16, + dst.add(32), + ); + write_rgba_u16_8( + _mm256_castsi256_si128(r_hi), + _mm256_castsi256_si128(g_hi), + _mm256_castsi256_si128(b_hi), + alpha_u16, + dst.add(64), + ); + write_rgba_u16_8( + _mm256_extracti128_si256::<1>(r_hi), + _mm256_extracti128_si256::<1>(g_hi), + _mm256_extracti128_si256::<1>(b_hi), + alpha_u16, + dst.add(96), + ); + } else { + let dst = out.as_mut_ptr().add(x * 3); + write_rgb_u16_8( + _mm256_castsi256_si128(r_lo), + _mm256_castsi256_si128(g_lo), + _mm256_castsi256_si128(b_lo), + dst, + ); + write_rgb_u16_8( + _mm256_extracti128_si256::<1>(r_lo), + _mm256_extracti128_si256::<1>(g_lo), + _mm256_extracti128_si256::<1>(b_lo), + dst.add(24), + ); + write_rgb_u16_8( + _mm256_castsi256_si128(r_hi), + _mm256_castsi256_si128(g_hi), + _mm256_castsi256_si128(b_hi), + dst.add(48), + ); + write_rgb_u16_8( + _mm256_extracti128_si256::<1>(r_hi), + _mm256_extracti128_si256::<1>(g_hi), + _mm256_extracti128_si256::<1>(b_hi), + dst.add(72), + ); + } x += 32; } if x < width { - scalar::yuv_444p_n_to_rgb_u16_row::( - &y[x..width], - &u[x..width], - &v[x..width], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_u = &u[x..width]; + let tail_v = &v[x..width]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::yuv_444p_n_to_rgba_u16_row::( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); + } else { + scalar::yuv_444p_n_to_rgb_u16_row::( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); + } } } } @@ -1326,6 +1421,8 @@ pub(crate) unsafe fn yuv_444p16_to_rgb_or_rgba_row( /// SSE4.1 rate, and no chroma-duplication step since 4:4:4 chroma /// is 1:1 with Y. /// +/// Thin wrapper over [`yuv_444p16_to_rgb_or_rgba_u16_row`] with `ALPHA = false`. 
+/// /// # Safety /// /// Same as [`yuv_444p16_to_rgb_row`] but `rgb_out: &mut [u16]`. @@ -1340,16 +1437,69 @@ pub(crate) unsafe fn yuv_444p16_to_rgb_u16_row( matrix: ColorMatrix, full_range: bool, ) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + yuv_444p16_to_rgb_or_rgba_u16_row::(y, u, v, rgb_out, width, matrix, full_range); + } +} + +/// AVX2 sibling of [`yuv_444p16_to_rgba_row`] for native-depth `u16` +/// output. Alpha samples are `0xFFFF`. +/// +/// Thin wrapper over [`yuv_444p16_to_rgb_or_rgba_u16_row`] with `ALPHA = true`. +/// +/// # Safety +/// +/// Same as [`yuv_444p16_to_rgb_u16_row`] plus `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "avx2")] +pub(crate) unsafe fn yuv_444p16_to_rgba_u16_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + yuv_444p16_to_rgb_or_rgba_u16_row::(y, u, v, rgba_out, width, matrix, full_range); + } +} + +/// Shared AVX2 16-bit YUV 4:4:4 → native-depth `u16` kernel. +/// `ALPHA = false` writes RGB triples; `ALPHA = true` writes RGBA +/// quads with constant alpha `0xFFFF`. +/// +/// # Safety +/// +/// 1. **AVX2 must be available on the current CPU.** +/// 2. `y.len() >= width`, `u.len() >= width`, `v.len() >= width`, +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. 
+#[inline] +#[target_feature(enable = "avx2")] +pub(crate) unsafe fn yuv_444p16_to_rgb_or_rgba_u16_row( + y: &[u16], + u: &[u16], + v: &[u16], + out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert!(y.len() >= width); debug_assert!(u.len() >= width); debug_assert!(v.len() >= width); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::<16, 16>(full_range); const RND: i64 = 1 << 14; unsafe { + let alpha_u16 = _mm_set1_epi16(-1i16); let rnd_v = _mm256_set1_epi64x(RND); let y_off_v = _mm256_set1_epi32(y_off); let y_scale_v = _mm256_set1_epi32(y_scale); @@ -1446,33 +1596,56 @@ pub(crate) unsafe fn yuv_444p16_to_rgb_u16_row( _mm256_add_epi32(y_hi_scaled, b_ch_hi), )); - let dst = rgb_out.as_mut_ptr().add(x * 3); - write_rgb_u16_8( - _mm256_castsi256_si128(r_u16), - _mm256_castsi256_si128(g_u16), - _mm256_castsi256_si128(b_u16), - dst, - ); - write_rgb_u16_8( - _mm256_extracti128_si256::<1>(r_u16), - _mm256_extracti128_si256::<1>(g_u16), - _mm256_extracti128_si256::<1>(b_u16), - dst.add(24), - ); + if ALPHA { + let dst = out.as_mut_ptr().add(x * 4); + write_rgba_u16_8( + _mm256_castsi256_si128(r_u16), + _mm256_castsi256_si128(g_u16), + _mm256_castsi256_si128(b_u16), + alpha_u16, + dst, + ); + write_rgba_u16_8( + _mm256_extracti128_si256::<1>(r_u16), + _mm256_extracti128_si256::<1>(g_u16), + _mm256_extracti128_si256::<1>(b_u16), + alpha_u16, + dst.add(32), + ); + } else { + let dst = out.as_mut_ptr().add(x * 3); + write_rgb_u16_8( + _mm256_castsi256_si128(r_u16), + _mm256_castsi256_si128(g_u16), + _mm256_castsi256_si128(b_u16), + dst, + ); + write_rgb_u16_8( + _mm256_extracti128_si256::<1>(r_u16), + _mm256_extracti128_si256::<1>(g_u16), + _mm256_extracti128_si256::<1>(b_u16), + dst.add(24), + ); + } x += 16; } if x < width { - 
scalar::yuv_444p16_to_rgb_u16_row( - &y[x..width], - &u[x..width], - &v[x..width], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_u = &u[x..width]; + let tail_v = &v[x..width]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::yuv_444p16_to_rgba_u16_row( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); + } else { + scalar::yuv_444p16_to_rgb_u16_row( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); + } } } } @@ -3956,6 +4129,8 @@ pub(crate) unsafe fn p_n_444_to_rgb_or_rgba_row( width: usize, matrix: ColorMatrix, full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + p_n_444_to_rgb_or_rgba_u16_row::(y, uv_full, rgb_out, width, matrix, full_range); + } +} + +/// AVX2 sibling of [`p_n_444_to_rgba_row`] for native-depth `u16` +/// output. Alpha samples are `(1 << BITS) - 1`. +/// +/// Thin wrapper over [`p_n_444_to_rgb_or_rgba_u16_row`] with `ALPHA = true`. +/// +/// # Safety +/// +/// Same as [`p_n_444_to_rgb_u16_row`] plus `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "avx2")] +pub(crate) unsafe fn p_n_444_to_rgba_u16_row( + y: &[u16], + uv_full: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + p_n_444_to_rgb_or_rgba_u16_row::(y, uv_full, rgba_out, width, matrix, full_range); + } +} + +/// Shared AVX2 Pn 4:4:4 high-bit-packed → native-depth `u16` kernel. +/// `ALPHA = false` writes RGB triples via 4× `write_rgb_u16_8`; +/// `ALPHA = true` writes RGBA quads via 4× `write_rgba_u16_8` with +/// constant alpha `(1 << BITS) - 1`. +/// +/// # Safety +/// +/// 1. AVX2 must be available on the current CPU. +/// 2. `y.len() >= width`, `uv_full.len() >= 2 * width`, +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. +/// 3. 
`BITS` ∈ `{10, 12}`. +#[inline] +#[target_feature(enable = "avx2")] +pub(crate) unsafe fn p_n_444_to_rgb_or_rgba_u16_row( + y: &[u16], + uv_full: &[u16], + out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, ) { const { assert!(BITS == 10 || BITS == 12) }; + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert!(y.len() >= width); debug_assert!(uv_full.len() >= 2 * width); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); @@ -3997,6 +4224,7 @@ pub(crate) unsafe fn p_n_444_to_rgb_u16_row( let cgv = _mm256_set1_epi32(coeffs.g_v()); let cbu = _mm256_set1_epi32(coeffs.b_u()); let cbv = _mm256_set1_epi32(coeffs.b_v()); + let alpha_u16 = _mm_set1_epi16(out_max); let mut x = 0usize; while x + 32 <= width { @@ -4075,44 +4303,81 @@ pub(crate) unsafe fn p_n_444_to_rgb_u16_row( let b_lo = clamp_u16_max_x16(_mm256_adds_epi16(y_scaled_lo, b_chroma_lo), zero_v, max_v); let b_hi = clamp_u16_max_x16(_mm256_adds_epi16(y_scaled_hi, b_chroma_hi), zero_v, max_v); - let dst = rgb_out.as_mut_ptr().add(x * 3); - write_rgb_u16_8( - _mm256_castsi256_si128(r_lo), - _mm256_castsi256_si128(g_lo), - _mm256_castsi256_si128(b_lo), - dst, - ); - write_rgb_u16_8( - _mm256_extracti128_si256::<1>(r_lo), - _mm256_extracti128_si256::<1>(g_lo), - _mm256_extracti128_si256::<1>(b_lo), - dst.add(24), - ); - write_rgb_u16_8( - _mm256_castsi256_si128(r_hi), - _mm256_castsi256_si128(g_hi), - _mm256_castsi256_si128(b_hi), - dst.add(48), - ); - write_rgb_u16_8( - _mm256_extracti128_si256::<1>(r_hi), - _mm256_extracti128_si256::<1>(g_hi), - _mm256_extracti128_si256::<1>(b_hi), - dst.add(72), - ); + if ALPHA { + let dst = out.as_mut_ptr().add(x * 4); + write_rgba_u16_8( + _mm256_castsi256_si128(r_lo), + _mm256_castsi256_si128(g_lo), + _mm256_castsi256_si128(b_lo), + alpha_u16, + dst, + ); + write_rgba_u16_8( + 
_mm256_extracti128_si256::<1>(r_lo), + _mm256_extracti128_si256::<1>(g_lo), + _mm256_extracti128_si256::<1>(b_lo), + alpha_u16, + dst.add(32), + ); + write_rgba_u16_8( + _mm256_castsi256_si128(r_hi), + _mm256_castsi256_si128(g_hi), + _mm256_castsi256_si128(b_hi), + alpha_u16, + dst.add(64), + ); + write_rgba_u16_8( + _mm256_extracti128_si256::<1>(r_hi), + _mm256_extracti128_si256::<1>(g_hi), + _mm256_extracti128_si256::<1>(b_hi), + alpha_u16, + dst.add(96), + ); + } else { + let dst = out.as_mut_ptr().add(x * 3); + write_rgb_u16_8( + _mm256_castsi256_si128(r_lo), + _mm256_castsi256_si128(g_lo), + _mm256_castsi256_si128(b_lo), + dst, + ); + write_rgb_u16_8( + _mm256_extracti128_si256::<1>(r_lo), + _mm256_extracti128_si256::<1>(g_lo), + _mm256_extracti128_si256::<1>(b_lo), + dst.add(24), + ); + write_rgb_u16_8( + _mm256_castsi256_si128(r_hi), + _mm256_castsi256_si128(g_hi), + _mm256_castsi256_si128(b_hi), + dst.add(48), + ); + write_rgb_u16_8( + _mm256_extracti128_si256::<1>(r_hi), + _mm256_extracti128_si256::<1>(g_hi), + _mm256_extracti128_si256::<1>(b_hi), + dst.add(72), + ); + } x += 32; } if x < width { - scalar::p_n_444_to_rgb_u16_row::( - &y[x..width], - &uv_full[x * 2..width * 2], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_uv = &uv_full[x * 2..width * 2]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::p_n_444_to_rgba_u16_row::( + tail_y, tail_uv, tail_out, tail_w, matrix, full_range, + ); + } else { + scalar::p_n_444_to_rgb_u16_row::( + tail_y, tail_uv, tail_out, tail_w, matrix, full_range, + ); + } } } } @@ -4311,6 +4576,8 @@ pub(crate) unsafe fn p_n_444_16_to_rgb_or_rgba_row( /// `srai64_15_x4` bias trick (AVX2 lacks `_mm256_srai_epi64`). /// 16 pixels per iter (i64 narrows throughput). /// +/// Thin wrapper over [`p_n_444_16_to_rgb_or_rgba_u16_row`] with `ALPHA = false`. 
+/// /// # Safety /// /// Same as [`p_n_444_16_to_rgb_row`] but `rgb_out: &mut [u16]`. @@ -4324,15 +4591,66 @@ pub(crate) unsafe fn p_n_444_16_to_rgb_u16_row( matrix: ColorMatrix, full_range: bool, ) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + p_n_444_16_to_rgb_or_rgba_u16_row::(y, uv_full, rgb_out, width, matrix, full_range); + } +} + +/// AVX2 sibling of [`p_n_444_16_to_rgba_row`] for native-depth `u16` +/// output. Alpha samples are `0xFFFF`. +/// +/// Thin wrapper over [`p_n_444_16_to_rgb_or_rgba_u16_row`] with `ALPHA = true`. +/// +/// # Safety +/// +/// Same as [`p_n_444_16_to_rgb_u16_row`] plus `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "avx2")] +pub(crate) unsafe fn p_n_444_16_to_rgba_u16_row( + y: &[u16], + uv_full: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + p_n_444_16_to_rgb_or_rgba_u16_row::(y, uv_full, rgba_out, width, matrix, full_range); + } +} + +/// Shared AVX2 P416 (semi-planar 4:4:4, 16-bit) → native-depth `u16` +/// kernel. `ALPHA = false` writes RGB triples; `ALPHA = true` writes +/// RGBA quads with constant alpha `0xFFFF`. +/// +/// # Safety +/// +/// 1. AVX2 must be available. +/// 2. `y.len() >= width`, `uv_full.len() >= 2 * width`, +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. 
+#[inline] +#[target_feature(enable = "avx2")] +pub(crate) unsafe fn p_n_444_16_to_rgb_or_rgba_u16_row( + y: &[u16], + uv_full: &[u16], + out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert!(y.len() >= width); debug_assert!(uv_full.len() >= 2 * width); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::<16, 16>(full_range); const RND: i64 = 1 << 14; unsafe { + let alpha_u16 = _mm_set1_epi16(-1i16); let rnd_v = _mm256_set1_epi64x(RND); let y_off_v = _mm256_set1_epi32(y_off); let y_scale_v = _mm256_set1_epi32(y_scale); @@ -4425,32 +4743,51 @@ pub(crate) unsafe fn p_n_444_16_to_rgb_u16_row( _mm256_add_epi32(y_hi_scaled, b_ch_hi), )); - let dst = rgb_out.as_mut_ptr().add(x * 3); - write_rgb_u16_8( - _mm256_castsi256_si128(r_u16), - _mm256_castsi256_si128(g_u16), - _mm256_castsi256_si128(b_u16), - dst, - ); - write_rgb_u16_8( - _mm256_extracti128_si256::<1>(r_u16), - _mm256_extracti128_si256::<1>(g_u16), - _mm256_extracti128_si256::<1>(b_u16), - dst.add(24), - ); + if ALPHA { + let dst = out.as_mut_ptr().add(x * 4); + write_rgba_u16_8( + _mm256_castsi256_si128(r_u16), + _mm256_castsi256_si128(g_u16), + _mm256_castsi256_si128(b_u16), + alpha_u16, + dst, + ); + write_rgba_u16_8( + _mm256_extracti128_si256::<1>(r_u16), + _mm256_extracti128_si256::<1>(g_u16), + _mm256_extracti128_si256::<1>(b_u16), + alpha_u16, + dst.add(32), + ); + } else { + let dst = out.as_mut_ptr().add(x * 3); + write_rgb_u16_8( + _mm256_castsi256_si128(r_u16), + _mm256_castsi256_si128(g_u16), + _mm256_castsi256_si128(b_u16), + dst, + ); + write_rgb_u16_8( + _mm256_extracti128_si256::<1>(r_u16), + _mm256_extracti128_si256::<1>(g_u16), + _mm256_extracti128_si256::<1>(b_u16), + dst.add(24), + ); + } x += 16; } if x < width { - scalar::p_n_444_16_to_rgb_u16_row( - 
&y[x..width], - &uv_full[x * 2..width * 2], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_uv = &uv_full[x * 2..width * 2]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::p_n_444_16_to_rgba_u16_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + } else { + scalar::p_n_444_16_to_rgb_u16_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + } } } } diff --git a/src/row/arch/x86_avx2/tests.rs b/src/row/arch/x86_avx2/tests.rs index d7cf589..e6a6009 100644 --- a/src/row/arch/x86_avx2/tests.rs +++ b/src/row/arch/x86_avx2/tests.rs @@ -2334,3 +2334,197 @@ fn avx2_p416_rgba_matches_scalar_all_matrices() { check_p_n_444_16_u8_avx2_rgba_equivalence(w, ColorMatrix::Bt709, false); } } + +// ---- High-bit 4:4:4 native-depth `u16` RGBA equivalence (Ship 8 Tranche 7c) ---- + +fn check_yuv444p_n_u16_avx2_rgba_equivalence( + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let y = planar_n_plane::(width, 37); + let u = planar_n_plane::(width, 53); + let v = planar_n_plane::(width, 71); + let mut rgba_scalar = std::vec![0u16; width * 4]; + let mut rgba_simd = std::vec![0u16; width * 4]; + scalar::yuv_444p_n_to_rgba_u16_row::( + &y, + &u, + &v, + &mut rgba_scalar, + width, + matrix, + full_range, + ); + unsafe { + yuv_444p_n_to_rgba_u16_row::(&y, &u, &v, &mut rgba_simd, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_simd, + "AVX2 Yuv444p<{BITS}> → RGBA u16 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +fn check_pn_444_u16_avx2_rgba_equivalence( + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let y = high_bit_plane_avx2::(width, 37); + let u = high_bit_plane_avx2::(width, 53); + let v = high_bit_plane_avx2::(width, 71); + let uv = interleave_uv_avx2(&u, &v); + let mut rgba_scalar = std::vec![0u16; width * 4]; + let mut rgba_simd = std::vec![0u16; width * 4]; + 
scalar::p_n_444_to_rgba_u16_row::<BITS>(&y, &uv, &mut rgba_scalar, width, matrix, full_range);
+    unsafe {
+        p_n_444_to_rgba_u16_row::<BITS>(&y, &uv, &mut rgba_simd, width, matrix, full_range);
+    }
+    assert_eq!(
+        rgba_scalar, rgba_simd,
+        "AVX2 Pn4:4:4<{BITS}> → RGBA u16 diverges (width={width}, matrix={matrix:?}, full_range={full_range})"
+    );
+}
+
+fn check_yuv444p16_u16_avx2_rgba_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) {
+    let y = p16_plane_avx2(width, 37);
+    let u = p16_plane_avx2(width, 53);
+    let v = p16_plane_avx2(width, 71);
+    let mut rgba_scalar = std::vec![0u16; width * 4];
+    let mut rgba_simd = std::vec![0u16; width * 4];
+    scalar::yuv_444p16_to_rgba_u16_row(&y, &u, &v, &mut rgba_scalar, width, matrix, full_range);
+    unsafe {
+        yuv_444p16_to_rgba_u16_row(&y, &u, &v, &mut rgba_simd, width, matrix, full_range);
+    }
+    assert_eq!(
+        rgba_scalar, rgba_simd,
+        "AVX2 Yuv444p16 → RGBA u16 diverges (width={width}, matrix={matrix:?}, full_range={full_range})"
+    );
+}
+
+fn check_p_n_444_16_u16_avx2_rgba_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) {
+    let y = p16_plane_avx2(width, 37);
+    let u = p16_plane_avx2(width, 53);
+    let v = p16_plane_avx2(width, 71);
+    let uv = interleave_uv_avx2(&u, &v);
+    let mut rgba_scalar = std::vec![0u16; width * 4];
+    let mut rgba_simd = std::vec![0u16; width * 4];
+    scalar::p_n_444_16_to_rgba_u16_row(&y, &uv, &mut rgba_scalar, width, matrix, full_range);
+    unsafe {
+        p_n_444_16_to_rgba_u16_row(&y, &uv, &mut rgba_simd, width, matrix,
false] { + check_yuv444p_n_u16_avx2_rgba_equivalence::<9>(32, m, full); + check_yuv444p_n_u16_avx2_rgba_equivalence::<10>(32, m, full); + check_yuv444p_n_u16_avx2_rgba_equivalence::<12>(32, m, full); + check_yuv444p_n_u16_avx2_rgba_equivalence::<14>(32, m, full); + } + } +} + +#[test] +fn avx2_yuv444p_n_rgba_u16_matches_scalar_tail_and_widths() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + for w in [17usize, 31, 47, 63, 1920, 1922] { + check_yuv444p_n_u16_avx2_rgba_equivalence::<9>(w, ColorMatrix::Bt601, false); + check_yuv444p_n_u16_avx2_rgba_equivalence::<10>(w, ColorMatrix::Bt709, true); + check_yuv444p_n_u16_avx2_rgba_equivalence::<12>(w, ColorMatrix::Bt2020Ncl, false); + check_yuv444p_n_u16_avx2_rgba_equivalence::<14>(w, ColorMatrix::YCgCo, true); + } +} + +#[test] +fn avx2_pn_444_rgba_u16_matches_scalar_all_bits() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_pn_444_u16_avx2_rgba_equivalence::<10>(32, m, full); + check_pn_444_u16_avx2_rgba_equivalence::<12>(32, m, full); + } + } +} + +#[test] +fn avx2_pn_444_rgba_u16_matches_scalar_tail_and_widths() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + for w in [17usize, 31, 47, 63, 1920, 1922] { + check_pn_444_u16_avx2_rgba_equivalence::<10>(w, ColorMatrix::Bt601, false); + check_pn_444_u16_avx2_rgba_equivalence::<12>(w, ColorMatrix::Bt709, true); + } +} + +#[test] +fn avx2_yuv444p16_rgba_u16_matches_scalar_all_matrices() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_yuv444p16_u16_avx2_rgba_equivalence(16, m, full); + } + } + for w in 
[17usize, 31, 47, 63, 1920, 1922] { + check_yuv444p16_u16_avx2_rgba_equivalence(w, ColorMatrix::Bt709, false); + } +} + +#[test] +fn avx2_p416_rgba_u16_matches_scalar_all_matrices() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_p_n_444_16_u16_avx2_rgba_equivalence(16, m, full); + } + } + for w in [17usize, 31, 47, 63, 1920, 1922] { + check_p_n_444_16_u16_avx2_rgba_equivalence(w, ColorMatrix::Bt709, false); + } +} diff --git a/src/row/arch/x86_avx512.rs b/src/row/arch/x86_avx512.rs index 4d28528..ba587d3 100644 --- a/src/row/arch/x86_avx512.rs +++ b/src/row/arch/x86_avx512.rs @@ -1046,6 +1046,8 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_row( width: usize, matrix: ColorMatrix, full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + yuv_444p_n_to_rgb_or_rgba_u16_row::(y, u, v, rgb_out, width, matrix, full_range); + } +} + +/// AVX-512 sibling of [`yuv_444p_n_to_rgba_row`] for native-depth `u16` +/// output. Alpha samples are `(1 << BITS) - 1`. +/// +/// Thin wrapper over [`yuv_444p_n_to_rgb_or_rgba_u16_row`] with `ALPHA = true`. +/// +/// # Safety +/// +/// Same as [`yuv_444p_n_to_rgb_u16_row`] plus `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "avx512f,avx512bw")] +pub(crate) unsafe fn yuv_444p_n_to_rgba_u16_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + yuv_444p_n_to_rgb_or_rgba_u16_row::(y, u, v, rgba_out, width, matrix, full_range); + } +} + +/// Shared AVX-512 high-bit YUV 4:4:4 → native-depth `u16` kernel. 
+/// `ALPHA = false` writes RGB triples via 8× `write_quarter`; +/// `ALPHA = true` writes RGBA quads via 8× `write_quarter_rgba` with +/// constant alpha `(1 << BITS) - 1`. +/// +/// # Safety +/// +/// 1. **AVX-512F + AVX-512BW must be available.** +/// 2. `y.len() >= width`, `u.len() >= width`, `v.len() >= width`, +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. +/// 3. `BITS` ∈ `{9, 10, 12, 14}`. +#[inline] +#[target_feature(enable = "avx512f,avx512bw")] +pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_u16_row( + y: &[u16], + u: &[u16], + v: &[u16], + out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, ) { const { assert!(BITS == 9 || BITS == 10 || BITS == 12 || BITS == 14) }; + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert!(y.len() >= width); debug_assert!(u.len() >= width); debug_assert!(v.len() >= width); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); @@ -1087,6 +1143,7 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_u16_row( let cgv = _mm512_set1_epi32(coeffs.g_v()); let cbu = _mm512_set1_epi32(coeffs.b_u()); let cbv = _mm512_set1_epi32(coeffs.b_v()); + let alpha_u16 = _mm_set1_epi16(out_max); let pack_fixup = _mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7); @@ -1175,29 +1232,46 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_u16_row( let b_lo = clamp_u16_max_x32(_mm512_adds_epi16(y_scaled_lo, b_chroma_lo), zero_v, max_v); let b_hi = clamp_u16_max_x32(_mm512_adds_epi16(y_scaled_hi, b_chroma_hi), zero_v, max_v); - let dst = rgb_out.as_mut_ptr().add(x * 3); - write_quarter(r_lo, g_lo, b_lo, 0, dst); - write_quarter(r_lo, g_lo, b_lo, 1, dst.add(24)); - write_quarter(r_lo, g_lo, b_lo, 2, dst.add(48)); - write_quarter(r_lo, g_lo, b_lo, 3, dst.add(72)); - write_quarter(r_hi, g_hi, b_hi, 0, dst.add(96)); - write_quarter(r_hi, g_hi, b_hi, 1, dst.add(120)); - write_quarter(r_hi, 
g_hi, b_hi, 2, dst.add(144)); - write_quarter(r_hi, g_hi, b_hi, 3, dst.add(168)); + if ALPHA { + let dst = out.as_mut_ptr().add(x * 4); + write_quarter_rgba(r_lo, g_lo, b_lo, alpha_u16, 0, dst); + write_quarter_rgba(r_lo, g_lo, b_lo, alpha_u16, 1, dst.add(32)); + write_quarter_rgba(r_lo, g_lo, b_lo, alpha_u16, 2, dst.add(64)); + write_quarter_rgba(r_lo, g_lo, b_lo, alpha_u16, 3, dst.add(96)); + write_quarter_rgba(r_hi, g_hi, b_hi, alpha_u16, 0, dst.add(128)); + write_quarter_rgba(r_hi, g_hi, b_hi, alpha_u16, 1, dst.add(160)); + write_quarter_rgba(r_hi, g_hi, b_hi, alpha_u16, 2, dst.add(192)); + write_quarter_rgba(r_hi, g_hi, b_hi, alpha_u16, 3, dst.add(224)); + } else { + let dst = out.as_mut_ptr().add(x * 3); + write_quarter(r_lo, g_lo, b_lo, 0, dst); + write_quarter(r_lo, g_lo, b_lo, 1, dst.add(24)); + write_quarter(r_lo, g_lo, b_lo, 2, dst.add(48)); + write_quarter(r_lo, g_lo, b_lo, 3, dst.add(72)); + write_quarter(r_hi, g_hi, b_hi, 0, dst.add(96)); + write_quarter(r_hi, g_hi, b_hi, 1, dst.add(120)); + write_quarter(r_hi, g_hi, b_hi, 2, dst.add(144)); + write_quarter(r_hi, g_hi, b_hi, 3, dst.add(168)); + } x += 64; } if x < width { - scalar::yuv_444p_n_to_rgb_u16_row::( - &y[x..width], - &u[x..width], - &v[x..width], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_u = &u[x..width]; + let tail_v = &v[x..width]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::yuv_444p_n_to_rgba_u16_row::( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); + } else { + scalar::yuv_444p_n_to_rgb_u16_row::( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); + } } } } @@ -1421,6 +1495,8 @@ pub(crate) unsafe fn yuv_444p16_to_rgb_or_rgba_row( /// [`yuv_420p16_to_rgb_u16_row`] but with full-width chroma loads /// and no duplication step (4:4:4 is 1:1 with Y). 
/// +/// Thin wrapper over [`yuv_444p16_to_rgb_or_rgba_u16_row`] with `ALPHA = false`. +/// /// # Safety /// /// Same as [`yuv_444p16_to_rgb_row`] but `rgb_out: &mut [u16]`. @@ -1435,10 +1511,63 @@ pub(crate) unsafe fn yuv_444p16_to_rgb_u16_row( matrix: ColorMatrix, full_range: bool, ) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + yuv_444p16_to_rgb_or_rgba_u16_row::(y, u, v, rgb_out, width, matrix, full_range); + } +} + +/// AVX-512 sibling of [`yuv_444p16_to_rgba_row`] for native-depth `u16` +/// output. Alpha samples are `0xFFFF`. +/// +/// Thin wrapper over [`yuv_444p16_to_rgb_or_rgba_u16_row`] with `ALPHA = true`. +/// +/// # Safety +/// +/// Same as [`yuv_444p16_to_rgb_u16_row`] plus `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "avx512f,avx512bw")] +pub(crate) unsafe fn yuv_444p16_to_rgba_u16_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + yuv_444p16_to_rgb_or_rgba_u16_row::(y, u, v, rgba_out, width, matrix, full_range); + } +} + +/// Shared AVX-512 16-bit YUV 4:4:4 → native-depth `u16` kernel. +/// `ALPHA = false` writes RGB triples via `write_rgb_u16_32`; +/// `ALPHA = true` writes RGBA quads via `write_rgba_u16_32` with +/// constant alpha `0xFFFF`. +/// +/// # Safety +/// +/// 1. **AVX-512F + AVX-512BW must be available.** +/// 2. `y.len() >= width`, `u.len() >= width`, `v.len() >= width`, +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. 
+#[inline] +#[target_feature(enable = "avx512f,avx512bw")] +pub(crate) unsafe fn yuv_444p16_to_rgb_or_rgba_u16_row( + y: &[u16], + u: &[u16], + v: &[u16], + out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert!(y.len() >= width); debug_assert!(u.len() >= width); debug_assert!(v.len() >= width); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::<16, 16>(full_range); @@ -1446,6 +1575,7 @@ pub(crate) unsafe fn yuv_444p16_to_rgb_u16_row( const RND_I32: i32 = 1 << 14; unsafe { + let alpha_u16 = _mm_set1_epi16(-1i16); let rnd_i64_v = _mm512_set1_epi64(RND_I64); let rnd_i32_v = _mm512_set1_epi32(RND_I32); let y_off_v = _mm512_set1_epi32(y_off); @@ -1544,21 +1674,30 @@ pub(crate) unsafe fn yuv_444p16_to_rgb_u16_row( let g_u16 = _mm512_permutexvar_epi64(pack_fixup, _mm512_packus_epi32(g_lo_i32, g_hi_i32)); let b_u16 = _mm512_permutexvar_epi64(pack_fixup, _mm512_packus_epi32(b_lo_i32, b_hi_i32)); - write_rgb_u16_32(r_u16, g_u16, b_u16, rgb_out.as_mut_ptr().add(x * 3)); + if ALPHA { + write_rgba_u16_32(r_u16, g_u16, b_u16, alpha_u16, out.as_mut_ptr().add(x * 4)); + } else { + write_rgb_u16_32(r_u16, g_u16, b_u16, out.as_mut_ptr().add(x * 3)); + } x += 32; } if x < width { - scalar::yuv_444p16_to_rgb_u16_row( - &y[x..width], - &u[x..width], - &v[x..width], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_u = &u[x..width]; + let tail_v = &v[x..width]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::yuv_444p16_to_rgba_u16_row( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); + } else { + scalar::yuv_444p16_to_rgb_u16_row( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); + } } } } @@ -4104,6 
+4243,8 @@ pub(crate) unsafe fn p_n_444_to_rgb_or_rgba_row( width: usize, matrix: ColorMatrix, full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + p_n_444_to_rgb_or_rgba_u16_row::(y, uv_full, rgb_out, width, matrix, full_range); + } +} + +/// AVX-512 sibling of [`p_n_444_to_rgba_row`] for native-depth `u16` +/// output. Alpha samples are `(1 << BITS) - 1`. +/// +/// Thin wrapper over [`p_n_444_to_rgb_or_rgba_u16_row`] with `ALPHA = true`. +/// +/// # Safety +/// +/// Same as [`p_n_444_to_rgb_u16_row`] plus `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "avx512bw,avx512f")] +pub(crate) unsafe fn p_n_444_to_rgba_u16_row( + y: &[u16], + uv_full: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + p_n_444_to_rgb_or_rgba_u16_row::(y, uv_full, rgba_out, width, matrix, full_range); + } +} + +/// Shared AVX-512 Pn 4:4:4 high-bit-packed → native-depth `u16` kernel. +/// `ALPHA = false` writes RGB triples via 8× `write_quarter`; +/// `ALPHA = true` writes RGBA quads via 8× `write_quarter_rgba` with +/// constant alpha `(1 << BITS) - 1`. +/// +/// # Safety +/// +/// 1. **AVX-512F + AVX-512BW must be available.** +/// 2. `y.len() >= width`, `uv_full.len() >= 2 * width`, +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. +/// 3. `BITS` ∈ `{10, 12}`. 
+#[inline] +#[target_feature(enable = "avx512bw,avx512f")] +pub(crate) unsafe fn p_n_444_to_rgb_or_rgba_u16_row( + y: &[u16], + uv_full: &[u16], + out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, ) { const { assert!(BITS == 10 || BITS == 12) }; + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert!(y.len() >= width); debug_assert!(uv_full.len() >= 2 * width); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); @@ -4143,6 +4336,7 @@ pub(crate) unsafe fn p_n_444_to_rgb_u16_row( let cgv = _mm512_set1_epi32(coeffs.g_v()); let cbu = _mm512_set1_epi32(coeffs.b_u()); let cbv = _mm512_set1_epi32(coeffs.b_v()); + let alpha_u16 = _mm_set1_epi16(out_max); let pack_fixup = _mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7); @@ -4235,28 +4429,45 @@ pub(crate) unsafe fn p_n_444_to_rgb_u16_row( let b_lo = clamp_u16_max_x32(_mm512_adds_epi16(y_scaled_lo, b_chroma_lo), zero_v, max_v); let b_hi = clamp_u16_max_x32(_mm512_adds_epi16(y_scaled_hi, b_chroma_hi), zero_v, max_v); - let dst = rgb_out.as_mut_ptr().add(x * 3); - write_quarter(r_lo, g_lo, b_lo, 0, dst); - write_quarter(r_lo, g_lo, b_lo, 1, dst.add(24)); - write_quarter(r_lo, g_lo, b_lo, 2, dst.add(48)); - write_quarter(r_lo, g_lo, b_lo, 3, dst.add(72)); - write_quarter(r_hi, g_hi, b_hi, 0, dst.add(96)); - write_quarter(r_hi, g_hi, b_hi, 1, dst.add(120)); - write_quarter(r_hi, g_hi, b_hi, 2, dst.add(144)); - write_quarter(r_hi, g_hi, b_hi, 3, dst.add(168)); + if ALPHA { + let dst = out.as_mut_ptr().add(x * 4); + write_quarter_rgba(r_lo, g_lo, b_lo, alpha_u16, 0, dst); + write_quarter_rgba(r_lo, g_lo, b_lo, alpha_u16, 1, dst.add(32)); + write_quarter_rgba(r_lo, g_lo, b_lo, alpha_u16, 2, dst.add(64)); + write_quarter_rgba(r_lo, g_lo, b_lo, alpha_u16, 3, dst.add(96)); + write_quarter_rgba(r_hi, g_hi, b_hi, alpha_u16, 0, dst.add(128)); + 
write_quarter_rgba(r_hi, g_hi, b_hi, alpha_u16, 1, dst.add(160)); + write_quarter_rgba(r_hi, g_hi, b_hi, alpha_u16, 2, dst.add(192)); + write_quarter_rgba(r_hi, g_hi, b_hi, alpha_u16, 3, dst.add(224)); + } else { + let dst = out.as_mut_ptr().add(x * 3); + write_quarter(r_lo, g_lo, b_lo, 0, dst); + write_quarter(r_lo, g_lo, b_lo, 1, dst.add(24)); + write_quarter(r_lo, g_lo, b_lo, 2, dst.add(48)); + write_quarter(r_lo, g_lo, b_lo, 3, dst.add(72)); + write_quarter(r_hi, g_hi, b_hi, 0, dst.add(96)); + write_quarter(r_hi, g_hi, b_hi, 1, dst.add(120)); + write_quarter(r_hi, g_hi, b_hi, 2, dst.add(144)); + write_quarter(r_hi, g_hi, b_hi, 3, dst.add(168)); + } x += 64; } if x < width { - scalar::p_n_444_to_rgb_u16_row::( - &y[x..width], - &uv_full[x * 2..width * 2], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_uv = &uv_full[x * 2..width * 2]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::p_n_444_to_rgba_u16_row::( + tail_y, tail_uv, tail_out, tail_w, matrix, full_range, + ); + } else { + scalar::p_n_444_to_rgb_u16_row::( + tail_y, tail_uv, tail_out, tail_w, matrix, full_range, + ); + } } } } @@ -4469,6 +4680,8 @@ pub(crate) unsafe fn p_n_444_16_to_rgb_or_rgba_row( /// iter (i64 narrows). Native `_mm512_srai_epi64` via /// `chroma_i64x8_avx512` + `scale_y_i32x16_i64` — no bias trick. /// +/// Thin wrapper over [`p_n_444_16_to_rgb_or_rgba_u16_row`] with `ALPHA = false`. +/// /// # Safety /// /// Same as [`p_n_444_16_to_rgb_row`] but `rgb_out: &mut [u16]`. @@ -4482,9 +4695,59 @@ pub(crate) unsafe fn p_n_444_16_to_rgb_u16_row( matrix: ColorMatrix, full_range: bool, ) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + p_n_444_16_to_rgb_or_rgba_u16_row::(y, uv_full, rgb_out, width, matrix, full_range); + } +} + +/// AVX-512 sibling of [`p_n_444_16_to_rgba_row`] for native-depth `u16` +/// output. Alpha samples are `0xFFFF`. 
+/// +/// Thin wrapper over [`p_n_444_16_to_rgb_or_rgba_u16_row`] with `ALPHA = true`. +/// +/// # Safety +/// +/// Same as [`p_n_444_16_to_rgb_u16_row`] plus `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "avx512bw,avx512f")] +pub(crate) unsafe fn p_n_444_16_to_rgba_u16_row( + y: &[u16], + uv_full: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + p_n_444_16_to_rgb_or_rgba_u16_row::(y, uv_full, rgba_out, width, matrix, full_range); + } +} + +/// Shared AVX-512 P416 (semi-planar 4:4:4, 16-bit) → native-depth `u16` +/// kernel. `ALPHA = false` writes RGB triples; `ALPHA = true` writes +/// RGBA quads with constant alpha `0xFFFF`. +/// +/// # Safety +/// +/// 1. **AVX-512F + AVX-512BW must be available.** +/// 2. `y.len() >= width`, `uv_full.len() >= 2 * width`, +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. +#[inline] +#[target_feature(enable = "avx512bw,avx512f")] +pub(crate) unsafe fn p_n_444_16_to_rgb_or_rgba_u16_row( + y: &[u16], + uv_full: &[u16], + out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert!(y.len() >= width); debug_assert!(uv_full.len() >= 2 * width); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::<16, 16>(full_range); @@ -4492,6 +4755,7 @@ pub(crate) unsafe fn p_n_444_16_to_rgb_u16_row( const RND_I32: i32 = 1 << 14; unsafe { + let alpha_u16 = _mm_set1_epi16(-1i16); let rnd_i64_v = _mm512_set1_epi64(RND_I64); let rnd_i32_v = _mm512_set1_epi32(RND_I32); let y_off_v = _mm512_set1_epi32(y_off); @@ -4586,19 +4850,24 @@ pub(crate) unsafe fn p_n_444_16_to_rgb_u16_row( let g_u16 = _mm512_permutexvar_epi64(pack_fixup, _mm512_packus_epi32(g_lo_i32, g_hi_i32)); let b_u16 = 
_mm512_permutexvar_epi64(pack_fixup, _mm512_packus_epi32(b_lo_i32, b_hi_i32)); - write_rgb_u16_32(r_u16, g_u16, b_u16, rgb_out.as_mut_ptr().add(x * 3)); + if ALPHA { + write_rgba_u16_32(r_u16, g_u16, b_u16, alpha_u16, out.as_mut_ptr().add(x * 4)); + } else { + write_rgb_u16_32(r_u16, g_u16, b_u16, out.as_mut_ptr().add(x * 3)); + } x += 32; } if x < width { - scalar::p_n_444_16_to_rgb_u16_row( - &y[x..width], - &uv_full[x * 2..width * 2], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_uv = &uv_full[x * 2..width * 2]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::p_n_444_16_to_rgba_u16_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + } else { + scalar::p_n_444_16_to_rgb_u16_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + } } } } diff --git a/src/row/arch/x86_avx512/tests.rs b/src/row/arch/x86_avx512/tests.rs index b66d3a1..31152c3 100644 --- a/src/row/arch/x86_avx512/tests.rs +++ b/src/row/arch/x86_avx512/tests.rs @@ -2358,3 +2358,205 @@ fn avx512_p416_rgba_matches_scalar_all_matrices() { check_p_n_444_16_u8_avx512_rgba_equivalence(w, ColorMatrix::Bt709, false); } } + +// ---- High-bit 4:4:4 native-depth `u16` RGBA equivalence (Ship 8 Tranche 7c) ---- + +fn check_yuv444p_n_u16_avx512_rgba_equivalence( + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let y = planar_n_plane::(width, 37); + let u = planar_n_plane::(width, 53); + let v = planar_n_plane::(width, 71); + let mut rgba_scalar = std::vec![0u16; width * 4]; + let mut rgba_simd = std::vec![0u16; width * 4]; + scalar::yuv_444p_n_to_rgba_u16_row::( + &y, + &u, + &v, + &mut rgba_scalar, + width, + matrix, + full_range, + ); + unsafe { + yuv_444p_n_to_rgba_u16_row::(&y, &u, &v, &mut rgba_simd, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_simd, + "AVX-512 Yuv444p<{BITS}> → RGBA u16 diverges (width={width}, matrix={matrix:?}, 
full_range={full_range})" + ); +} + +fn check_pn_444_u16_avx512_rgba_equivalence( + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let y = high_bit_plane_avx512::(width, 37); + let u = high_bit_plane_avx512::(width, 53); + let v = high_bit_plane_avx512::(width, 71); + let uv = interleave_uv_avx512(&u, &v); + let mut rgba_scalar = std::vec![0u16; width * 4]; + let mut rgba_simd = std::vec![0u16; width * 4]; + scalar::p_n_444_to_rgba_u16_row::(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + unsafe { + p_n_444_to_rgba_u16_row::(&y, &uv, &mut rgba_simd, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_simd, + "AVX-512 Pn4:4:4<{BITS}> → RGBA u16 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +fn check_yuv444p16_u16_avx512_rgba_equivalence( + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let y = p16_plane_avx512(width, 37); + let u = p16_plane_avx512(width, 53); + let v = p16_plane_avx512(width, 71); + let mut rgba_scalar = std::vec![0u16; width * 4]; + let mut rgba_simd = std::vec![0u16; width * 4]; + scalar::yuv_444p16_to_rgba_u16_row(&y, &u, &v, &mut rgba_scalar, width, matrix, full_range); + unsafe { + yuv_444p16_to_rgba_u16_row(&y, &u, &v, &mut rgba_simd, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_simd, + "AVX-512 Yuv444p16 → RGBA u16 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +fn check_p_n_444_16_u16_avx512_rgba_equivalence( + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let y = p16_plane_avx512(width, 37); + let u = p16_plane_avx512(width, 53); + let v = p16_plane_avx512(width, 71); + let uv = interleave_uv_avx512(&u, &v); + let mut rgba_scalar = std::vec![0u16; width * 4]; + let mut rgba_simd = std::vec![0u16; width * 4]; + scalar::p_n_444_16_to_rgba_u16_row(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + unsafe { + p_n_444_16_to_rgba_u16_row(&y, &uv, &mut rgba_simd, width, matrix, 
full_range); + } + assert_eq!( + rgba_scalar, rgba_simd, + "AVX-512 P416 → RGBA u16 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +#[test] +fn avx512_yuv444p_n_rgba_u16_matches_scalar_all_bits() { + if !std::arch::is_x86_feature_detected!("avx512bw") { + return; + } + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_yuv444p_n_u16_avx512_rgba_equivalence::<9>(64, m, full); + check_yuv444p_n_u16_avx512_rgba_equivalence::<10>(64, m, full); + check_yuv444p_n_u16_avx512_rgba_equivalence::<12>(64, m, full); + check_yuv444p_n_u16_avx512_rgba_equivalence::<14>(64, m, full); + } + } +} + +#[test] +fn avx512_yuv444p_n_rgba_u16_matches_scalar_tail_and_widths() { + if !std::arch::is_x86_feature_detected!("avx512bw") { + return; + } + for w in [17usize, 31, 47, 63, 1920, 1922] { + check_yuv444p_n_u16_avx512_rgba_equivalence::<9>(w, ColorMatrix::Bt601, false); + check_yuv444p_n_u16_avx512_rgba_equivalence::<10>(w, ColorMatrix::Bt709, true); + check_yuv444p_n_u16_avx512_rgba_equivalence::<12>(w, ColorMatrix::Bt2020Ncl, false); + check_yuv444p_n_u16_avx512_rgba_equivalence::<14>(w, ColorMatrix::YCgCo, true); + } +} + +#[test] +fn avx512_pn_444_rgba_u16_matches_scalar_all_bits() { + if !std::arch::is_x86_feature_detected!("avx512bw") { + return; + } + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_pn_444_u16_avx512_rgba_equivalence::<10>(64, m, full); + check_pn_444_u16_avx512_rgba_equivalence::<12>(64, m, full); + } + } +} + +#[test] +fn avx512_pn_444_rgba_u16_matches_scalar_tail_and_widths() { + if !std::arch::is_x86_feature_detected!("avx512bw") { + return; + } + for w in [17usize, 31, 47, 63, 1920, 1922] { + check_pn_444_u16_avx512_rgba_equivalence::<10>(w, 
ColorMatrix::Bt601, false); + check_pn_444_u16_avx512_rgba_equivalence::<12>(w, ColorMatrix::Bt709, true); + } +} + +#[test] +fn avx512_yuv444p16_rgba_u16_matches_scalar_all_matrices() { + if !std::arch::is_x86_feature_detected!("avx512bw") { + return; + } + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_yuv444p16_u16_avx512_rgba_equivalence(32, m, full); + } + } + for w in [17usize, 31, 47, 63, 1920, 1922] { + check_yuv444p16_u16_avx512_rgba_equivalence(w, ColorMatrix::Bt709, false); + } +} + +#[test] +fn avx512_p416_rgba_u16_matches_scalar_all_matrices() { + if !std::arch::is_x86_feature_detected!("avx512bw") { + return; + } + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_p_n_444_16_u16_avx512_rgba_equivalence(32, m, full); + } + } + for w in [17usize, 31, 47, 63, 1920, 1922] { + check_p_n_444_16_u16_avx512_rgba_equivalence(w, ColorMatrix::Bt709, false); + } +} diff --git a/src/row/arch/x86_sse41.rs b/src/row/arch/x86_sse41.rs index ac8ccaa..390b8cc 100644 --- a/src/row/arch/x86_sse41.rs +++ b/src/row/arch/x86_sse41.rs @@ -1278,6 +1278,8 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_row( width: usize, matrix: ColorMatrix, full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + yuv_444p_n_to_rgb_or_rgba_u16_row::(y, u, v, rgb_out, width, matrix, full_range); + } +} + +/// SSE4.1 sibling of [`yuv_444p_n_to_rgba_row`] for native-depth `u16` +/// output. Alpha samples are `(1 << BITS) - 1` (opaque maximum at the +/// input bit depth). +/// +/// Thin wrapper over [`yuv_444p_n_to_rgb_or_rgba_u16_row`] with `ALPHA = true`. +/// +/// # Safety +/// +/// Same as [`yuv_444p_n_to_rgb_u16_row`] plus `rgba_out.len() >= 4 * width`. 
+#[inline]
+#[target_feature(enable = "sse4.1")]
+pub(crate) unsafe fn yuv_444p_n_to_rgba_u16_row<const BITS: usize>(
+    y: &[u16],
+    u: &[u16],
+    v: &[u16],
+    rgba_out: &mut [u16],
+    width: usize,
+    matrix: ColorMatrix,
+    full_range: bool,
+) {
+    // SAFETY: caller obligations forwarded to the shared impl.
+    unsafe {
+        yuv_444p_n_to_rgb_or_rgba_u16_row::<BITS, true>(y, u, v, rgba_out, width, matrix, full_range);
+    }
+}
+
+/// Shared SSE4.1 high-bit YUV 4:4:4 → native-depth `u16` kernel.
+/// `ALPHA = false` writes RGB triples via `write_rgb_u16_8`;
+/// `ALPHA = true` writes RGBA quads via `write_rgba_u16_8` with
+/// constant alpha `(1 << BITS) - 1`.
+///
+/// # Safety
+///
+/// 1. **SSE4.1 must be available on the current CPU.**
+/// 2. `y.len() >= width`, `u.len() >= width`, `v.len() >= width`,
+///    `out.len() >= width * if ALPHA { 4 } else { 3 }`.
+/// 3. `BITS` ∈ `{9, 10, 12, 14}`.
+#[inline]
+#[target_feature(enable = "sse4.1")]
+pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_u16_row<const BITS: usize, const ALPHA: bool>(
+    y: &[u16],
+    u: &[u16],
+    v: &[u16],
+    out: &mut [u16],
+    width: usize,
+    matrix: ColorMatrix,
+    full_range: bool,
 ) {
     // Compile-time guard — `out_max = ((1 << BITS) - 1) as i16` below
     // silently wraps to -1 at BITS=16, corrupting the u16 clamp. The
     // dedicated 16-bit u16-output path is `yuv_444p16_to_rgb_u16_row`.
const { assert!(BITS == 9 || BITS == 10 || BITS == 12 || BITS == 14) }; + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert!(y.len() >= width); debug_assert!(u.len() >= width); debug_assert!(v.len() >= width); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); @@ -1322,6 +1379,7 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_u16_row( let cgv = _mm_set1_epi32(coeffs.g_v()); let cbu = _mm_set1_epi32(coeffs.b_u()); let cbv = _mm_set1_epi32(coeffs.b_v()); + let alpha_u16 = _mm_set1_epi16(out_max); let mut x = 0usize; while x + 16 <= width { @@ -1372,22 +1430,38 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_u16_row( let b_lo = clamp_u16_max(_mm_adds_epi16(y_scaled_lo, b_chroma_lo), zero_v, max_v); let b_hi = clamp_u16_max(_mm_adds_epi16(y_scaled_hi, b_chroma_hi), zero_v, max_v); - write_rgb_u16_8(r_lo, g_lo, b_lo, rgb_out.as_mut_ptr().add(x * 3)); - write_rgb_u16_8(r_hi, g_hi, b_hi, rgb_out.as_mut_ptr().add(x * 3 + 24)); + if ALPHA { + write_rgba_u16_8(r_lo, g_lo, b_lo, alpha_u16, out.as_mut_ptr().add(x * 4)); + write_rgba_u16_8( + r_hi, + g_hi, + b_hi, + alpha_u16, + out.as_mut_ptr().add(x * 4 + 32), + ); + } else { + write_rgb_u16_8(r_lo, g_lo, b_lo, out.as_mut_ptr().add(x * 3)); + write_rgb_u16_8(r_hi, g_hi, b_hi, out.as_mut_ptr().add(x * 3 + 24)); + } x += 16; } if x < width { - scalar::yuv_444p_n_to_rgb_u16_row::( - &y[x..width], - &u[x..width], - &v[x..width], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_u = &u[x..width]; + let tail_v = &v[x..width]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::yuv_444p_n_to_rgba_u16_row::( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); + } else { + scalar::yuv_444p_n_to_rgb_u16_row::( + tail_y, tail_u, tail_v, tail_out, tail_w, 
matrix, full_range, + ); + } } } } @@ -1578,6 +1652,8 @@ pub(crate) unsafe fn yuv_444p16_to_rgb_or_rgba_row( /// U/V (vs 4 half-width), computing 8 chroma values (vs 4 + dup), and /// skipping the chroma-duplication step. /// +/// Thin wrapper over [`yuv_444p16_to_rgb_or_rgba_u16_row`] with `ALPHA = false`. +/// /// # Safety /// /// Same as [`yuv_444p16_to_rgb_row`] but `rgb_out: &mut [u16]`. @@ -1592,16 +1668,69 @@ pub(crate) unsafe fn yuv_444p16_to_rgb_u16_row( matrix: ColorMatrix, full_range: bool, ) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + yuv_444p16_to_rgb_or_rgba_u16_row::(y, u, v, rgb_out, width, matrix, full_range); + } +} + +/// SSE4.1 sibling of [`yuv_444p16_to_rgba_row`] for native-depth `u16` +/// output. Alpha samples are `0xFFFF`. +/// +/// Thin wrapper over [`yuv_444p16_to_rgb_or_rgba_u16_row`] with `ALPHA = true`. +/// +/// # Safety +/// +/// Same as [`yuv_444p16_to_rgb_u16_row`] plus `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "sse4.1")] +pub(crate) unsafe fn yuv_444p16_to_rgba_u16_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + yuv_444p16_to_rgb_or_rgba_u16_row::(y, u, v, rgba_out, width, matrix, full_range); + } +} + +/// Shared SSE4.1 16-bit YUV 4:4:4 → native-depth `u16` kernel. +/// `ALPHA = false` writes RGB triples; `ALPHA = true` writes RGBA +/// quads with constant alpha `0xFFFF`. +/// +/// # Safety +/// +/// 1. **SSE4.1 must be available on the current CPU.** +/// 2. `y.len() >= width`, `u.len() >= width`, `v.len() >= width`, +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. 
+#[inline] +#[target_feature(enable = "sse4.1")] +pub(crate) unsafe fn yuv_444p16_to_rgb_or_rgba_u16_row( + y: &[u16], + u: &[u16], + v: &[u16], + out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert!(y.len() >= width); debug_assert!(u.len() >= width); debug_assert!(v.len() >= width); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::<16, 16>(full_range); const RND: i64 = 1 << 14; unsafe { + let alpha_u16 = _mm_set1_epi16(-1i16); let rnd_v = _mm_set1_epi64x(RND); let y_off_v = _mm_set1_epi32(y_off); let y_scale_v = _mm_set1_epi32(y_scale); @@ -1719,20 +1848,29 @@ pub(crate) unsafe fn yuv_444p16_to_rgb_u16_row( _mm_add_epi32(y_hi_i32, b_ch_hi_i32), ); - write_rgb_u16_8(r_u16, g_u16, b_u16, rgb_out.as_mut_ptr().add(x * 3)); + if ALPHA { + write_rgba_u16_8(r_u16, g_u16, b_u16, alpha_u16, out.as_mut_ptr().add(x * 4)); + } else { + write_rgb_u16_8(r_u16, g_u16, b_u16, out.as_mut_ptr().add(x * 3)); + } x += 8; } if x < width { - scalar::yuv_444p16_to_rgb_u16_row( - &y[x..width], - &u[x..width], - &v[x..width], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_u = &u[x..width]; + let tail_v = &v[x..width]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::yuv_444p16_to_rgba_u16_row( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); + } else { + scalar::yuv_444p16_to_rgb_u16_row( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); + } } } } @@ -3479,6 +3617,8 @@ pub(crate) unsafe fn p_n_444_to_rgb_or_rgba_row( width: usize, matrix: ColorMatrix, full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. 
+ unsafe { + p_n_444_to_rgb_or_rgba_u16_row::<BITS, false>(y, uv_full, rgb_out, width, matrix, full_range); + } +} + +/// SSE4.1 sibling of [`p_n_444_to_rgba_row`] for native-depth `u16` +/// output. Alpha samples are `(1 << BITS) - 1` (opaque maximum at the +/// input bit depth). +/// +/// Thin wrapper over [`p_n_444_to_rgb_or_rgba_u16_row`] with `ALPHA = true`. +/// +/// # Safety +/// +/// Same as [`p_n_444_to_rgb_u16_row`] plus `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "sse4.1")] +pub(crate) unsafe fn p_n_444_to_rgba_u16_row<const BITS: usize>( + y: &[u16], + uv_full: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + p_n_444_to_rgb_or_rgba_u16_row::<BITS, true>(y, uv_full, rgba_out, width, matrix, full_range); + } +} + +/// Shared SSE4.1 Pn 4:4:4 high-bit-packed → native-depth `u16` kernel. +/// `ALPHA = false` writes RGB triples via `write_rgb_u16_8`; +/// `ALPHA = true` writes RGBA quads via `write_rgba_u16_8` with +/// constant alpha `(1 << BITS) - 1`. +/// +/// # Safety +/// +/// 1. SSE4.1 must be available on the current CPU. +/// 2. `y.len() >= width`, `uv_full.len() >= 2 * width`, +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. +/// 3. `BITS` ∈ `{10, 12}`.
+#[inline] +#[target_feature(enable = "sse4.1")] +pub(crate) unsafe fn p_n_444_to_rgb_or_rgba_u16_row( + y: &[u16], + uv_full: &[u16], + out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, ) { const { assert!(BITS == 10 || BITS == 12) }; + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert!(y.len() >= width); debug_assert!(uv_full.len() >= 2 * width); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); @@ -3520,6 +3713,7 @@ pub(crate) unsafe fn p_n_444_to_rgb_u16_row( let cgv = _mm_set1_epi32(coeffs.g_v()); let cbu = _mm_set1_epi32(coeffs.b_u()); let cbv = _mm_set1_epi32(coeffs.b_v()); + let alpha_u16 = _mm_set1_epi16(out_max); let mut x = 0usize; while x + 16 <= width { @@ -3573,21 +3767,37 @@ pub(crate) unsafe fn p_n_444_to_rgb_u16_row( let b_lo = clamp_u16_max(_mm_adds_epi16(y_scaled_lo, b_chroma_lo), zero_v, max_v); let b_hi = clamp_u16_max(_mm_adds_epi16(y_scaled_hi, b_chroma_hi), zero_v, max_v); - write_rgb_u16_8(r_lo, g_lo, b_lo, rgb_out.as_mut_ptr().add(x * 3)); - write_rgb_u16_8(r_hi, g_hi, b_hi, rgb_out.as_mut_ptr().add(x * 3 + 24)); + if ALPHA { + write_rgba_u16_8(r_lo, g_lo, b_lo, alpha_u16, out.as_mut_ptr().add(x * 4)); + write_rgba_u16_8( + r_hi, + g_hi, + b_hi, + alpha_u16, + out.as_mut_ptr().add(x * 4 + 32), + ); + } else { + write_rgb_u16_8(r_lo, g_lo, b_lo, out.as_mut_ptr().add(x * 3)); + write_rgb_u16_8(r_hi, g_hi, b_hi, out.as_mut_ptr().add(x * 3 + 24)); + } x += 16; } if x < width { - scalar::p_n_444_to_rgb_u16_row::( - &y[x..width], - &uv_full[x * 2..width * 2], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_uv = &uv_full[x * 2..width * 2]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::p_n_444_to_rgba_u16_row::( + tail_y, tail_uv, 
tail_out, tail_w, matrix, full_range, + ); + } else { + scalar::p_n_444_to_rgb_u16_row::( + tail_y, tail_uv, tail_out, tail_w, matrix, full_range, + ); + } } } } @@ -3766,6 +3976,8 @@ pub(crate) unsafe fn p_n_444_16_to_rgb_or_rgba_row( /// `_mm_mul_epi32` + `srai64_15` bias trick (mirroring /// `yuv_444p16_to_rgb_u16_row`). 8 pixels per iter. /// +/// Thin wrapper over [`p_n_444_16_to_rgb_or_rgba_u16_row`] with `ALPHA = false`. +/// /// # Safety /// /// Same as [`p_n_444_16_to_rgb_row`] but `rgb_out: &mut [u16]`. @@ -3779,15 +3991,66 @@ pub(crate) unsafe fn p_n_444_16_to_rgb_u16_row( matrix: ColorMatrix, full_range: bool, ) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + p_n_444_16_to_rgb_or_rgba_u16_row::(y, uv_full, rgb_out, width, matrix, full_range); + } +} + +/// SSE4.1 sibling of [`p_n_444_16_to_rgba_row`] for native-depth `u16` +/// output. Alpha samples are `0xFFFF`. +/// +/// Thin wrapper over [`p_n_444_16_to_rgb_or_rgba_u16_row`] with `ALPHA = true`. +/// +/// # Safety +/// +/// Same as [`p_n_444_16_to_rgb_u16_row`] plus `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "sse4.1")] +pub(crate) unsafe fn p_n_444_16_to_rgba_u16_row( + y: &[u16], + uv_full: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + p_n_444_16_to_rgb_or_rgba_u16_row::(y, uv_full, rgba_out, width, matrix, full_range); + } +} + +/// Shared SSE4.1 P416 (semi-planar 4:4:4, 16-bit) → native-depth `u16` +/// kernel. `ALPHA = false` writes RGB triples; `ALPHA = true` writes +/// RGBA quads with constant alpha `0xFFFF`. +/// +/// # Safety +/// +/// 1. SSE4.1 must be available. +/// 2. `y.len() >= width`, `uv_full.len() >= 2 * width`, +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. 
+#[inline] +#[target_feature(enable = "sse4.1")] +pub(crate) unsafe fn p_n_444_16_to_rgb_or_rgba_u16_row( + y: &[u16], + uv_full: &[u16], + out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert!(y.len() >= width); debug_assert!(uv_full.len() >= 2 * width); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::<16, 16>(full_range); const RND: i64 = 1 << 14; unsafe { + let alpha_u16 = _mm_set1_epi16(-1i16); let rnd_v = _mm_set1_epi64x(RND); let y_off_v = _mm_set1_epi32(y_off); let y_scale_v = _mm_set1_epi32(y_scale); @@ -3900,19 +4163,24 @@ pub(crate) unsafe fn p_n_444_16_to_rgb_u16_row( _mm_add_epi32(y_hi_i32, b_ch_hi_i32), ); - write_rgb_u16_8(r_u16, g_u16, b_u16, rgb_out.as_mut_ptr().add(x * 3)); + if ALPHA { + write_rgba_u16_8(r_u16, g_u16, b_u16, alpha_u16, out.as_mut_ptr().add(x * 4)); + } else { + write_rgb_u16_8(r_u16, g_u16, b_u16, out.as_mut_ptr().add(x * 3)); + } x += 8; } if x < width { - scalar::p_n_444_16_to_rgb_u16_row( - &y[x..width], - &uv_full[x * 2..width * 2], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_uv = &uv_full[x * 2..width * 2]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::p_n_444_16_to_rgba_u16_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + } else { + scalar::p_n_444_16_to_rgb_u16_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + } } } } diff --git a/src/row/arch/x86_sse41/tests.rs b/src/row/arch/x86_sse41/tests.rs index 330a59a..253925b 100644 --- a/src/row/arch/x86_sse41/tests.rs +++ b/src/row/arch/x86_sse41/tests.rs @@ -2374,3 +2374,201 @@ fn sse41_p416_rgba_matches_scalar_all_matrices() { check_p_n_444_16_u8_sse41_rgba_equivalence(w, ColorMatrix::Bt709, 
false); } } + +// ---- High-bit 4:4:4 native-depth `u16` RGBA equivalence (Ship 8 Tranche 7c) ---- + +fn check_yuv444p_n_u16_sse41_rgba_equivalence( + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let y = planar_n_plane::(width, 37); + let u = planar_n_plane::(width, 53); + let v = planar_n_plane::(width, 71); + let mut rgba_scalar = std::vec![0u16; width * 4]; + let mut rgba_simd = std::vec![0u16; width * 4]; + scalar::yuv_444p_n_to_rgba_u16_row::( + &y, + &u, + &v, + &mut rgba_scalar, + width, + matrix, + full_range, + ); + unsafe { + yuv_444p_n_to_rgba_u16_row::(&y, &u, &v, &mut rgba_simd, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_simd, + "SSE4.1 Yuv444p<{BITS}> → RGBA u16 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +fn check_pn_444_u16_sse41_rgba_equivalence( + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let y = high_bit_plane_sse41::(width, 37); + let u = high_bit_plane_sse41::(width, 53); + let v = high_bit_plane_sse41::(width, 71); + let uv = interleave_uv_sse41(&u, &v); + let mut rgba_scalar = std::vec![0u16; width * 4]; + let mut rgba_simd = std::vec![0u16; width * 4]; + scalar::p_n_444_to_rgba_u16_row::(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + unsafe { + p_n_444_to_rgba_u16_row::(&y, &uv, &mut rgba_simd, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_simd, + "SSE4.1 Pn4:4:4<{BITS}> → RGBA u16 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +fn check_yuv444p16_u16_sse41_rgba_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { + let y = p16_plane(width, 37); + let u = p16_plane(width, 53); + let v = p16_plane(width, 71); + let mut rgba_scalar = std::vec![0u16; width * 4]; + let mut rgba_simd = std::vec![0u16; width * 4]; + scalar::yuv_444p16_to_rgba_u16_row(&y, &u, &v, &mut rgba_scalar, width, matrix, full_range); + unsafe { + yuv_444p16_to_rgba_u16_row(&y, &u, &v, &mut rgba_simd, 
width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_simd, + "SSE4.1 Yuv444p16 → RGBA u16 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +fn check_p_n_444_16_u16_sse41_rgba_equivalence( + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let y = p16_plane(width, 37); + let u = p16_plane(width, 53); + let v = p16_plane(width, 71); + let uv = interleave_uv_sse41(&u, &v); + let mut rgba_scalar = std::vec![0u16; width * 4]; + let mut rgba_simd = std::vec![0u16; width * 4]; + scalar::p_n_444_16_to_rgba_u16_row(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + unsafe { + p_n_444_16_to_rgba_u16_row(&y, &uv, &mut rgba_simd, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_simd, + "SSE4.1 P416 → RGBA u16 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +#[test] +fn sse41_yuv444p_n_rgba_u16_matches_scalar_all_bits() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_yuv444p_n_u16_sse41_rgba_equivalence::<9>(16, m, full); + check_yuv444p_n_u16_sse41_rgba_equivalence::<10>(16, m, full); + check_yuv444p_n_u16_sse41_rgba_equivalence::<12>(16, m, full); + check_yuv444p_n_u16_sse41_rgba_equivalence::<14>(16, m, full); + } + } +} + +#[test] +fn sse41_yuv444p_n_rgba_u16_matches_scalar_tail_and_widths() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + for w in [17usize, 31, 47, 63, 1920, 1922] { + check_yuv444p_n_u16_sse41_rgba_equivalence::<9>(w, ColorMatrix::Bt601, false); + check_yuv444p_n_u16_sse41_rgba_equivalence::<10>(w, ColorMatrix::Bt709, true); + check_yuv444p_n_u16_sse41_rgba_equivalence::<12>(w, ColorMatrix::Bt2020Ncl, false); + check_yuv444p_n_u16_sse41_rgba_equivalence::<14>(w, ColorMatrix::YCgCo, true); + } +} + +#[test] 
+fn sse41_pn_444_rgba_u16_matches_scalar_all_bits() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_pn_444_u16_sse41_rgba_equivalence::<10>(16, m, full); + check_pn_444_u16_sse41_rgba_equivalence::<12>(16, m, full); + } + } +} + +#[test] +fn sse41_pn_444_rgba_u16_matches_scalar_tail_and_widths() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + for w in [17usize, 31, 47, 63, 1920, 1922] { + check_pn_444_u16_sse41_rgba_equivalence::<10>(w, ColorMatrix::Bt601, false); + check_pn_444_u16_sse41_rgba_equivalence::<12>(w, ColorMatrix::Bt709, true); + } +} + +#[test] +fn sse41_yuv444p16_rgba_u16_matches_scalar_all_matrices() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_yuv444p16_u16_sse41_rgba_equivalence(16, m, full); + } + } + for w in [17usize, 31, 47, 63, 1920, 1922] { + check_yuv444p16_u16_sse41_rgba_equivalence(w, ColorMatrix::Bt709, false); + } +} + +#[test] +fn sse41_p416_rgba_u16_matches_scalar_all_matrices() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_p_n_444_16_u16_sse41_rgba_equivalence(16, m, full); + } + } + for w in [17usize, 31, 47, 63, 1920, 1922] { + check_p_n_444_16_u16_sse41_rgba_equivalence(w, ColorMatrix::Bt709, false); + } +} diff --git a/src/row/mod.rs b/src/row/mod.rs index cec5c09..dfbbd1d 100644 --- a/src/row/mod.rs +++ b/src/row/mod.rs @@ -4266,13 +4266,9 @@ pub fn 
p412_to_rgb_u16_row( // ---- High-bit 4:4:4 RGBA dispatchers (Ship 8 Tranche 7) --------------- // -// u8 RGBA dispatchers route to per-arch SIMD kernels (Ship 8 Tranche -// 7b). The native-depth `u16` RGBA dispatchers stay on the scalar -// reference path until the follow-up Ship 8 Tranche 7c PR; the -// `use_simd` parameter is held in their signatures for API stability, -// but their bodies remain `let _ = use_simd;` plus a scalar call -// until the SIMD wiring lands. `use_simd = false` forces the scalar -// reference path on every dispatcher. +// Both u8 and native-depth `u16` RGBA dispatchers route to per-arch +// SIMD kernels (Ship 8 Tranches 7b + 7c). `use_simd = false` forces +// the scalar reference path on every dispatcher. /// Converts one row of **9-bit** YUV 4:4:4 to packed **8-bit** /// **RGBA** (`R, G, B, 0xFF`; alpha defaults to opaque since the @@ -4376,7 +4372,53 @@ pub fn yuv444p9_to_rgba_u16_row( assert!(v.len() >= width, "v row too short"); assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - let _ = use_simd; // SIMD per-arch routes land in Ship 8 Tranche 7c. + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_444p_n_to_rgba_u16_row::<9>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_444p_n_to_rgba_u16_row::<9>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_444p_n_to_rgba_u16_row::<9>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. 
+ unsafe { + arch::x86_sse41::yuv_444p_n_to_rgba_u16_row::<9>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_444p_n_to_rgba_u16_row::<9>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + scalar::yuv_444p_n_to_rgba_u16_row::<9>(y, u, v, rgba_out, width, matrix, full_range); } @@ -4475,7 +4517,53 @@ pub fn yuv444p10_to_rgba_u16_row( assert!(v.len() >= width, "v row too short"); assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - let _ = use_simd; // SIMD per-arch routes land in Ship 8 Tranche 7c. + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_444p_n_to_rgba_u16_row::<10>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_444p_n_to_rgba_u16_row::<10>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_444p_n_to_rgba_u16_row::<10>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_444p_n_to_rgba_u16_row::<10>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. 
+ unsafe { + arch::wasm_simd128::yuv_444p_n_to_rgba_u16_row::<10>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + scalar::yuv_444p_n_to_rgba_u16_row::<10>(y, u, v, rgba_out, width, matrix, full_range); } @@ -4574,7 +4662,53 @@ pub fn yuv444p12_to_rgba_u16_row( assert!(v.len() >= width, "v row too short"); assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - let _ = use_simd; // SIMD per-arch routes land in Ship 8 Tranche 7c. + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_444p_n_to_rgba_u16_row::<12>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_444p_n_to_rgba_u16_row::<12>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_444p_n_to_rgba_u16_row::<12>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_444p_n_to_rgba_u16_row::<12>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_444p_n_to_rgba_u16_row::<12>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + scalar::yuv_444p_n_to_rgba_u16_row::<12>(y, u, v, rgba_out, width, matrix, full_range); } @@ -4673,7 +4807,53 @@ pub fn yuv444p14_to_rgba_u16_row( assert!(v.len() >= width, "v row too short"); assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - let _ = use_simd; // SIMD per-arch routes land in Ship 8 Tranche 7c. + if use_simd { + cfg_select! 
{ + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_444p_n_to_rgba_u16_row::<14>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_444p_n_to_rgba_u16_row::<14>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_444p_n_to_rgba_u16_row::<14>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_444p_n_to_rgba_u16_row::<14>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_444p_n_to_rgba_u16_row::<14>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + scalar::yuv_444p_n_to_rgba_u16_row::<14>(y, u, v, rgba_out, width, matrix, full_range); } @@ -4774,7 +4954,53 @@ pub fn yuv444p16_to_rgba_u16_row( assert!(v.len() >= width, "v row too short"); assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - let _ = use_simd; // SIMD per-arch routes land in Ship 8 Tranche 7c. + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_444p16_to_rgba_u16_row(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_444p16_to_rgba_u16_row(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. 
+ unsafe { + arch::x86_avx2::yuv_444p16_to_rgba_u16_row(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_444p16_to_rgba_u16_row(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_444p16_to_rgba_u16_row(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + scalar::yuv_444p16_to_rgba_u16_row(y, u, v, rgba_out, width, matrix, full_range); } @@ -4870,7 +5096,53 @@ pub fn p410_to_rgba_u16_row( assert!(uv_full.len() >= uv_min, "uv_full row too short"); assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - let _ = use_simd; // SIMD per-arch routes land in Ship 8 Tranche 7c. + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::p_n_444_to_rgba_u16_row::<10>(y, uv_full, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::p_n_444_to_rgba_u16_row::<10>(y, uv_full, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::p_n_444_to_rgba_u16_row::<10>(y, uv_full, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::p_n_444_to_rgba_u16_row::<10>(y, uv_full, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. 
+ unsafe { + arch::wasm_simd128::p_n_444_to_rgba_u16_row::<10>(y, uv_full, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + scalar::p_n_444_to_rgba_u16_row::<10>(y, uv_full, rgba_out, width, matrix, full_range); } @@ -4966,7 +5238,53 @@ pub fn p412_to_rgba_u16_row( assert!(uv_full.len() >= uv_min, "uv_full row too short"); assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - let _ = use_simd; // SIMD per-arch routes land in Ship 8 Tranche 7c. + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::p_n_444_to_rgba_u16_row::<12>(y, uv_full, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::p_n_444_to_rgba_u16_row::<12>(y, uv_full, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::p_n_444_to_rgba_u16_row::<12>(y, uv_full, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::p_n_444_to_rgba_u16_row::<12>(y, uv_full, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::p_n_444_to_rgba_u16_row::<12>(y, uv_full, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + scalar::p_n_444_to_rgba_u16_row::<12>(y, uv_full, rgba_out, width, matrix, full_range); } @@ -5065,7 +5383,53 @@ pub fn p416_to_rgba_u16_row( assert!(uv_full.len() >= uv_min, "uv_full row too short"); assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - let _ = use_simd; // SIMD per-arch routes land in Ship 8 Tranche 7c. + if use_simd { + cfg_select! 
{ + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::p_n_444_16_to_rgba_u16_row(y, uv_full, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::p_n_444_16_to_rgba_u16_row(y, uv_full, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::p_n_444_16_to_rgba_u16_row(y, uv_full, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::p_n_444_16_to_rgba_u16_row(y, uv_full, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::p_n_444_16_to_rgba_u16_row(y, uv_full, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + scalar::p_n_444_16_to_rgba_u16_row(y, uv_full, rgba_out, width, matrix, full_range); } diff --git a/src/sinker/mixed/planar_8bit.rs b/src/sinker/mixed/planar_8bit.rs index 41d4707..375b3dc 100644 --- a/src/sinker/mixed/planar_8bit.rs +++ b/src/sinker/mixed/planar_8bit.rs @@ -31,12 +31,13 @@ impl<'a> MixedSinker<'a, Yuv420p> { /// /// ```compile_fail /// // Attaching RGBA to a sink that doesn't write it is rejected - /// // at compile time. Yuv444p10 (10‑bit 4:4:4 planar) has not yet - /// // been wired for RGBA — once a future tranche lands it the - /// // negative example here moves to the next not‑yet‑wired format. - /// use colconv::{sinker::MixedSinker, yuv::Yuv444p10}; + /// // at compile time. `Bayer` (RAW Bayer-mosaic) has no RGBA path — + /// // there's no inherent alpha channel and the format demosaics to + /// // RGB only. Once / if a future PR adds RGBA, the negative example + /// // here moves to the next not‑yet‑wired format. 
+ /// use colconv::{sinker::MixedSinker, raw::Bayer}; /// let mut buf = vec![0u8; 16 * 8 * 4]; - /// let _ = MixedSinker::<Yuv444p10>::new(16, 8).with_rgba(&mut buf); + /// let _ = MixedSinker::<Bayer>::new(16, 8).with_rgba(&mut buf); /// ``` #[cfg_attr(not(tarpaulin), inline(always))] pub fn with_rgba(mut self, buf: &'a mut [u8]) -> Result<Self, MixedSinkerError> { diff --git a/src/sinker/mixed/subsampled_4_2_2_high_bit.rs b/src/sinker/mixed/subsampled_4_2_2_high_bit.rs index 0f1a062..8c5c4d9 100644 --- a/src/sinker/mixed/subsampled_4_2_2_high_bit.rs +++ b/src/sinker/mixed/subsampled_4_2_2_high_bit.rs @@ -1330,6 +1330,48 @@ impl<'a> MixedSinker<'a, Yuv440p10> { self.rgb_u16 = Some(buf); Ok(self) } + + /// Attaches a packed **8-bit** RGBA output buffer. Yuv440p10 reuses + /// the `BITS = 10` 4:4:4 RGBA kernel; alpha = `0xFF`. + #[cfg_attr(not(tarpaulin), inline(always))] + pub fn with_rgba(mut self, buf: &'a mut [u8]) -> Result<Self, MixedSinkerError> { + self.set_rgba(buf)?; + Ok(self) + } + /// In-place variant of [`with_rgba`](Self::with_rgba). + #[cfg_attr(not(tarpaulin), inline(always))] + pub fn set_rgba(&mut self, buf: &'a mut [u8]) -> Result<&mut Self, MixedSinkerError> { + let expected = self.frame_bytes(4)?; + if buf.len() < expected { + return Err(MixedSinkerError::RgbaBufferTooShort { + expected, + actual: buf.len(), + }); + } + self.rgba = Some(buf); + Ok(self) + } + + /// Attaches a packed **`u16`** RGBA output buffer. 10-bit low-packed + /// (`[0, 1023]`); alpha element is `1023`. + #[cfg_attr(not(tarpaulin), inline(always))] + pub fn with_rgba_u16(mut self, buf: &'a mut [u16]) -> Result<Self, MixedSinkerError> { + self.set_rgba_u16(buf)?; + Ok(self) + } + /// In-place variant of [`with_rgba_u16`](Self::with_rgba_u16).
+ #[cfg_attr(not(tarpaulin), inline(always))] + pub fn set_rgba_u16(&mut self, buf: &'a mut [u16]) -> Result<&mut Self, MixedSinkerError> { + let expected = self.frame_bytes(4)?; + if buf.len() < expected { + return Err(MixedSinkerError::RgbaU16BufferTooShort { + expected, + actual: buf.len(), + }); + } + self.rgba_u16 = Some(buf); + Ok(self) + } } impl Yuv440p10Sink for MixedSinker<'_, Yuv440p10> {} @@ -1383,6 +1425,8 @@ impl PixelSink for MixedSinker<'_, Yuv440p10> { let Self { rgb, rgb_u16, + rgba, + rgba_u16, luma, hsv, rgb_scratch, @@ -1398,7 +1442,26 @@ impl PixelSink for MixedSinker<'_, Yuv440p10> { } } - if let Some(buf) = rgb_u16.as_deref_mut() { + // ===== u16 RGB / RGBA path (Strategy A) ===== + let want_rgb_u16 = rgb_u16.is_some(); + let want_rgba_u16 = rgba_u16.is_some(); + + if want_rgba_u16 && !want_rgb_u16 { + let rgba_u16_buf = rgba_u16.as_deref_mut().unwrap(); + let rgba_u16_row = + rgba_u16_plane_row_slice(rgba_u16_buf, one_plane_start, one_plane_end, w, h)?; + yuv444p10_to_rgba_u16_row( + row.y(), + row.u(), + row.v(), + rgba_u16_row, + w, + row.matrix(), + row.full_range(), + use_simd, + ); + } else if want_rgb_u16 { + let rgb_u16_buf = rgb_u16.as_deref_mut().unwrap(); let rgb_plane_end = one_plane_end .checked_mul(3) @@ -1408,19 +1471,47 @@ impl PixelSink for MixedSinker<'_, Yuv440p10> { channels: 3, })?; let rgb_plane_start = one_plane_start * 3; + let rgb_u16_row = &mut rgb_u16_buf[rgb_plane_start..rgb_plane_end]; yuv444p10_to_rgb_u16_row( row.y(), row.u(), row.v(), - &mut buf[rgb_plane_start..rgb_plane_end], + rgb_u16_row, w, row.matrix(), row.full_range(), use_simd, ); + if want_rgba_u16 { + let rgba_u16_buf = rgba_u16.as_deref_mut().unwrap(); + let rgba_u16_row = + rgba_u16_plane_row_slice(rgba_u16_buf, one_plane_start, one_plane_end, w, h)?; + expand_rgb_u16_to_rgba_u16_row::(rgb_u16_row, rgba_u16_row, w); + } } - if rgb.is_none() && hsv.is_none() { + // ===== u8 RGB / RGBA / HSV path (Strategy A) ===== + let want_rgba = rgba.is_some(); 
+ let want_hsv = hsv.is_some(); + let need_rgb_kernel = rgb.is_some() || want_hsv; + + if want_rgba && !need_rgb_kernel { + let rgba_buf = rgba.as_deref_mut().unwrap(); + let rgba_row = rgba_plane_row_slice(rgba_buf, one_plane_start, one_plane_end, w, h)?; + yuv444p10_to_rgba_row( + row.y(), + row.u(), + row.v(), + rgba_row, + w, + row.matrix(), + row.full_range(), + use_simd, + ); + return Ok(()); + } + + if !need_rgb_kernel { return Ok(()); } @@ -1454,6 +1545,12 @@ impl PixelSink for MixedSinker<'_, Yuv440p10> { use_simd, ); } + + if let Some(buf) = rgba.as_deref_mut() { + let rgba_row = rgba_plane_row_slice(buf, one_plane_start, one_plane_end, w, h)?; + expand_rgb_to_rgba_row(rgb_row, rgba_row, w); + } + Ok(()) } } @@ -1480,6 +1577,48 @@ impl<'a> MixedSinker<'a, Yuv440p12> { self.rgb_u16 = Some(buf); Ok(self) } + + /// Attaches a packed **8-bit** RGBA output buffer. Yuv440p12 reuses + /// the `BITS = 12` 4:4:4 RGBA kernel; alpha = `0xFF`. + #[cfg_attr(not(tarpaulin), inline(always))] + pub fn with_rgba(mut self, buf: &'a mut [u8]) -> Result { + self.set_rgba(buf)?; + Ok(self) + } + /// In-place variant of [`with_rgba`](Self::with_rgba). + #[cfg_attr(not(tarpaulin), inline(always))] + pub fn set_rgba(&mut self, buf: &'a mut [u8]) -> Result<&mut Self, MixedSinkerError> { + let expected = self.frame_bytes(4)?; + if buf.len() < expected { + return Err(MixedSinkerError::RgbaBufferTooShort { + expected, + actual: buf.len(), + }); + } + self.rgba = Some(buf); + Ok(self) + } + + /// Attaches a packed **`u16`** RGBA output buffer. 12-bit low-packed + /// (`[0, 4095]`); alpha element is `4095`. + #[cfg_attr(not(tarpaulin), inline(always))] + pub fn with_rgba_u16(mut self, buf: &'a mut [u16]) -> Result { + self.set_rgba_u16(buf)?; + Ok(self) + } + /// In-place variant of [`with_rgba_u16`](Self::with_rgba_u16). 
+ #[cfg_attr(not(tarpaulin), inline(always))] + pub fn set_rgba_u16(&mut self, buf: &'a mut [u16]) -> Result<&mut Self, MixedSinkerError> { + let expected = self.frame_bytes(4)?; + if buf.len() < expected { + return Err(MixedSinkerError::RgbaU16BufferTooShort { + expected, + actual: buf.len(), + }); + } + self.rgba_u16 = Some(buf); + Ok(self) + } } impl Yuv440p12Sink for MixedSinker<'_, Yuv440p12> {} @@ -1533,6 +1672,8 @@ impl PixelSink for MixedSinker<'_, Yuv440p12> { let Self { rgb, rgb_u16, + rgba, + rgba_u16, luma, hsv, rgb_scratch, @@ -1548,7 +1689,26 @@ impl PixelSink for MixedSinker<'_, Yuv440p12> { } } - if let Some(buf) = rgb_u16.as_deref_mut() { + // ===== u16 RGB / RGBA path (Strategy A) ===== + let want_rgb_u16 = rgb_u16.is_some(); + let want_rgba_u16 = rgba_u16.is_some(); + + if want_rgba_u16 && !want_rgb_u16 { + let rgba_u16_buf = rgba_u16.as_deref_mut().unwrap(); + let rgba_u16_row = + rgba_u16_plane_row_slice(rgba_u16_buf, one_plane_start, one_plane_end, w, h)?; + yuv444p12_to_rgba_u16_row( + row.y(), + row.u(), + row.v(), + rgba_u16_row, + w, + row.matrix(), + row.full_range(), + use_simd, + ); + } else if want_rgb_u16 { + let rgb_u16_buf = rgb_u16.as_deref_mut().unwrap(); let rgb_plane_end = one_plane_end .checked_mul(3) @@ -1558,19 +1718,47 @@ impl PixelSink for MixedSinker<'_, Yuv440p12> { channels: 3, })?; let rgb_plane_start = one_plane_start * 3; + let rgb_u16_row = &mut rgb_u16_buf[rgb_plane_start..rgb_plane_end]; yuv444p12_to_rgb_u16_row( row.y(), row.u(), row.v(), - &mut buf[rgb_plane_start..rgb_plane_end], + rgb_u16_row, w, row.matrix(), row.full_range(), use_simd, ); + if want_rgba_u16 { + let rgba_u16_buf = rgba_u16.as_deref_mut().unwrap(); + let rgba_u16_row = + rgba_u16_plane_row_slice(rgba_u16_buf, one_plane_start, one_plane_end, w, h)?; + expand_rgb_u16_to_rgba_u16_row::(rgb_u16_row, rgba_u16_row, w); + } } - if rgb.is_none() && hsv.is_none() { + // ===== u8 RGB / RGBA / HSV path (Strategy A) ===== + let want_rgba = rgba.is_some(); 
+ let want_hsv = hsv.is_some(); + let need_rgb_kernel = rgb.is_some() || want_hsv; + + if want_rgba && !need_rgb_kernel { + let rgba_buf = rgba.as_deref_mut().unwrap(); + let rgba_row = rgba_plane_row_slice(rgba_buf, one_plane_start, one_plane_end, w, h)?; + yuv444p12_to_rgba_row( + row.y(), + row.u(), + row.v(), + rgba_row, + w, + row.matrix(), + row.full_range(), + use_simd, + ); + return Ok(()); + } + + if !need_rgb_kernel { return Ok(()); } @@ -1604,6 +1792,12 @@ impl PixelSink for MixedSinker<'_, Yuv440p12> { use_simd, ); } + + if let Some(buf) = rgba.as_deref_mut() { + let rgba_row = rgba_plane_row_slice(buf, one_plane_start, one_plane_end, w, h)?; + expand_rgb_to_rgba_row(rgb_row, rgba_row, w); + } + Ok(()) } } diff --git a/src/sinker/mixed/subsampled_4_4_4_high_bit.rs b/src/sinker/mixed/subsampled_4_4_4_high_bit.rs index f1dde93..3d36e10 100644 --- a/src/sinker/mixed/subsampled_4_4_4_high_bit.rs +++ b/src/sinker/mixed/subsampled_4_4_4_high_bit.rs @@ -2,6 +2,7 @@ use super::{ MixedSinker, MixedSinkerError, RowSlice, check_dimensions_match, rgb_row_buf_or_scratch, + rgba_plane_row_slice, rgba_u16_plane_row_slice, }; use crate::{PixelSink, row::*, yuv::*}; @@ -27,6 +28,51 @@ impl<'a> MixedSinker<'a, Yuv444p9> { self.rgb_u16 = Some(buf); Ok(self) } + + /// Attaches a packed **8-bit** RGBA output buffer. The 9-bit YUV + /// source is converted to 8-bit RGBA via the same `BITS = 9` Q15 + /// kernel family used by [`Self::with_rgb`]; the fourth byte per + /// pixel is alpha = `0xFF` (Yuv444p9 has no alpha plane). + #[cfg_attr(not(tarpaulin), inline(always))] + pub fn with_rgba(mut self, buf: &'a mut [u8]) -> Result { + self.set_rgba(buf)?; + Ok(self) + } + /// In-place variant of [`with_rgba`](Self::with_rgba). 
+ #[cfg_attr(not(tarpaulin), inline(always))] + pub fn set_rgba(&mut self, buf: &'a mut [u8]) -> Result<&mut Self, MixedSinkerError> { + let expected = self.frame_bytes(4)?; + if buf.len() < expected { + return Err(MixedSinkerError::RgbaBufferTooShort { + expected, + actual: buf.len(), + }); + } + self.rgba = Some(buf); + Ok(self) + } + + /// Attaches a packed **`u16`** RGBA output buffer. 9-bit low-packed + /// (`(1 << 9) - 1 = 511` max). Length is measured in `u16` + /// **elements** (`width × height × 4`). Alpha element is `(1 << 9) - 1`. + #[cfg_attr(not(tarpaulin), inline(always))] + pub fn with_rgba_u16(mut self, buf: &'a mut [u16]) -> Result { + self.set_rgba_u16(buf)?; + Ok(self) + } + /// In-place variant of [`with_rgba_u16`](Self::with_rgba_u16). + #[cfg_attr(not(tarpaulin), inline(always))] + pub fn set_rgba_u16(&mut self, buf: &'a mut [u16]) -> Result<&mut Self, MixedSinkerError> { + let expected = self.frame_bytes(4)?; + if buf.len() < expected { + return Err(MixedSinkerError::RgbaU16BufferTooShort { + expected, + actual: buf.len(), + }); + } + self.rgba_u16 = Some(buf); + Ok(self) + } } impl Yuv444p9Sink for MixedSinker<'_, Yuv444p9> {} @@ -80,6 +126,8 @@ impl PixelSink for MixedSinker<'_, Yuv444p9> { let Self { rgb, rgb_u16, + rgba, + rgba_u16, luma, hsv, rgb_scratch, @@ -95,7 +143,26 @@ impl PixelSink for MixedSinker<'_, Yuv444p9> { } } - if let Some(buf) = rgb_u16.as_deref_mut() { + // ===== u16 RGB / RGBA path (Strategy A) ===== + let want_rgb_u16 = rgb_u16.is_some(); + let want_rgba_u16 = rgba_u16.is_some(); + + if want_rgba_u16 && !want_rgb_u16 { + let rgba_u16_buf = rgba_u16.as_deref_mut().unwrap(); + let rgba_u16_row = + rgba_u16_plane_row_slice(rgba_u16_buf, one_plane_start, one_plane_end, w, h)?; + yuv444p9_to_rgba_u16_row( + row.y(), + row.u(), + row.v(), + rgba_u16_row, + w, + row.matrix(), + row.full_range(), + use_simd, + ); + } else if want_rgb_u16 { + let rgb_u16_buf = rgb_u16.as_deref_mut().unwrap(); let rgb_plane_end = one_plane_end 
.checked_mul(3) @@ -105,19 +172,47 @@ impl PixelSink for MixedSinker<'_, Yuv444p9> { channels: 3, })?; let rgb_plane_start = one_plane_start * 3; + let rgb_u16_row = &mut rgb_u16_buf[rgb_plane_start..rgb_plane_end]; yuv444p9_to_rgb_u16_row( row.y(), row.u(), row.v(), - &mut buf[rgb_plane_start..rgb_plane_end], + rgb_u16_row, + w, + row.matrix(), + row.full_range(), + use_simd, + ); + if want_rgba_u16 { + let rgba_u16_buf = rgba_u16.as_deref_mut().unwrap(); + let rgba_u16_row = + rgba_u16_plane_row_slice(rgba_u16_buf, one_plane_start, one_plane_end, w, h)?; + expand_rgb_u16_to_rgba_u16_row::(rgb_u16_row, rgba_u16_row, w); + } + } + + // ===== u8 RGB / RGBA / HSV path (Strategy A) ===== + let want_rgba = rgba.is_some(); + let want_hsv = hsv.is_some(); + let need_rgb_kernel = rgb.is_some() || want_hsv; + + if want_rgba && !need_rgb_kernel { + let rgba_buf = rgba.as_deref_mut().unwrap(); + let rgba_row = rgba_plane_row_slice(rgba_buf, one_plane_start, one_plane_end, w, h)?; + yuv444p9_to_rgba_row( + row.y(), + row.u(), + row.v(), + rgba_row, w, row.matrix(), row.full_range(), use_simd, ); + return Ok(()); } - if rgb.is_none() && hsv.is_none() { + if !need_rgb_kernel { return Ok(()); } @@ -151,6 +246,12 @@ impl PixelSink for MixedSinker<'_, Yuv444p9> { use_simd, ); } + + if let Some(buf) = rgba.as_deref_mut() { + let rgba_row = rgba_plane_row_slice(buf, one_plane_start, one_plane_end, w, h)?; + expand_rgb_to_rgba_row(rgb_row, rgba_row, w); + } + Ok(()) } } @@ -177,6 +278,49 @@ impl<'a> MixedSinker<'a, Yuv444p10> { self.rgb_u16 = Some(buf); Ok(self) } + + /// Attaches a packed **8-bit** RGBA output buffer. The 10-bit YUV + /// source is converted to 8-bit RGBA via the same `BITS = 10` Q15 + /// kernel family used by [`Self::with_rgb`]; alpha = `0xFF`. + #[cfg_attr(not(tarpaulin), inline(always))] + pub fn with_rgba(mut self, buf: &'a mut [u8]) -> Result { + self.set_rgba(buf)?; + Ok(self) + } + /// In-place variant of [`with_rgba`](Self::with_rgba). 
+ #[cfg_attr(not(tarpaulin), inline(always))] + pub fn set_rgba(&mut self, buf: &'a mut [u8]) -> Result<&mut Self, MixedSinkerError> { + let expected = self.frame_bytes(4)?; + if buf.len() < expected { + return Err(MixedSinkerError::RgbaBufferTooShort { + expected, + actual: buf.len(), + }); + } + self.rgba = Some(buf); + Ok(self) + } + + /// Attaches a packed **`u16`** RGBA output buffer. 10-bit low-packed + /// (`[0, 1023]`); alpha element is `1023`. + #[cfg_attr(not(tarpaulin), inline(always))] + pub fn with_rgba_u16(mut self, buf: &'a mut [u16]) -> Result { + self.set_rgba_u16(buf)?; + Ok(self) + } + /// In-place variant of [`with_rgba_u16`](Self::with_rgba_u16). + #[cfg_attr(not(tarpaulin), inline(always))] + pub fn set_rgba_u16(&mut self, buf: &'a mut [u16]) -> Result<&mut Self, MixedSinkerError> { + let expected = self.frame_bytes(4)?; + if buf.len() < expected { + return Err(MixedSinkerError::RgbaU16BufferTooShort { + expected, + actual: buf.len(), + }); + } + self.rgba_u16 = Some(buf); + Ok(self) + } } impl Yuv444p10Sink for MixedSinker<'_, Yuv444p10> {} @@ -230,6 +374,8 @@ impl PixelSink for MixedSinker<'_, Yuv444p10> { let Self { rgb, rgb_u16, + rgba, + rgba_u16, luma, hsv, rgb_scratch, @@ -245,7 +391,26 @@ impl PixelSink for MixedSinker<'_, Yuv444p10> { } } - if let Some(buf) = rgb_u16.as_deref_mut() { + // ===== u16 RGB / RGBA path (Strategy A) ===== + let want_rgb_u16 = rgb_u16.is_some(); + let want_rgba_u16 = rgba_u16.is_some(); + + if want_rgba_u16 && !want_rgb_u16 { + let rgba_u16_buf = rgba_u16.as_deref_mut().unwrap(); + let rgba_u16_row = + rgba_u16_plane_row_slice(rgba_u16_buf, one_plane_start, one_plane_end, w, h)?; + yuv444p10_to_rgba_u16_row( + row.y(), + row.u(), + row.v(), + rgba_u16_row, + w, + row.matrix(), + row.full_range(), + use_simd, + ); + } else if want_rgb_u16 { + let rgb_u16_buf = rgb_u16.as_deref_mut().unwrap(); let rgb_plane_end = one_plane_end .checked_mul(3) @@ -255,19 +420,47 @@ impl PixelSink for MixedSinker<'_, Yuv444p10> 
{ channels: 3, })?; let rgb_plane_start = one_plane_start * 3; + let rgb_u16_row = &mut rgb_u16_buf[rgb_plane_start..rgb_plane_end]; yuv444p10_to_rgb_u16_row( row.y(), row.u(), row.v(), - &mut buf[rgb_plane_start..rgb_plane_end], + rgb_u16_row, + w, + row.matrix(), + row.full_range(), + use_simd, + ); + if want_rgba_u16 { + let rgba_u16_buf = rgba_u16.as_deref_mut().unwrap(); + let rgba_u16_row = + rgba_u16_plane_row_slice(rgba_u16_buf, one_plane_start, one_plane_end, w, h)?; + expand_rgb_u16_to_rgba_u16_row::(rgb_u16_row, rgba_u16_row, w); + } + } + + // ===== u8 RGB / RGBA / HSV path (Strategy A) ===== + let want_rgba = rgba.is_some(); + let want_hsv = hsv.is_some(); + let need_rgb_kernel = rgb.is_some() || want_hsv; + + if want_rgba && !need_rgb_kernel { + let rgba_buf = rgba.as_deref_mut().unwrap(); + let rgba_row = rgba_plane_row_slice(rgba_buf, one_plane_start, one_plane_end, w, h)?; + yuv444p10_to_rgba_row( + row.y(), + row.u(), + row.v(), + rgba_row, w, row.matrix(), row.full_range(), use_simd, ); + return Ok(()); } - if rgb.is_none() && hsv.is_none() { + if !need_rgb_kernel { return Ok(()); } @@ -301,6 +494,12 @@ impl PixelSink for MixedSinker<'_, Yuv444p10> { use_simd, ); } + + if let Some(buf) = rgba.as_deref_mut() { + let rgba_row = rgba_plane_row_slice(buf, one_plane_start, one_plane_end, w, h)?; + expand_rgb_to_rgba_row(rgb_row, rgba_row, w); + } + Ok(()) } } @@ -325,6 +524,49 @@ impl<'a> MixedSinker<'a, Yuv444p12> { self.rgb_u16 = Some(buf); Ok(self) } + + /// Attaches a packed **8-bit** RGBA output buffer. The 12-bit YUV + /// source is converted to 8-bit RGBA via the same `BITS = 12` Q15 + /// kernel family used by [`Self::with_rgb`]; alpha = `0xFF`. + #[cfg_attr(not(tarpaulin), inline(always))] + pub fn with_rgba(mut self, buf: &'a mut [u8]) -> Result { + self.set_rgba(buf)?; + Ok(self) + } + /// In-place variant of [`with_rgba`](Self::with_rgba). 
+ #[cfg_attr(not(tarpaulin), inline(always))] + pub fn set_rgba(&mut self, buf: &'a mut [u8]) -> Result<&mut Self, MixedSinkerError> { + let expected = self.frame_bytes(4)?; + if buf.len() < expected { + return Err(MixedSinkerError::RgbaBufferTooShort { + expected, + actual: buf.len(), + }); + } + self.rgba = Some(buf); + Ok(self) + } + + /// Attaches a packed **`u16`** RGBA output buffer. 12-bit low-packed + /// (`[0, 4095]`); alpha element is `4095`. + #[cfg_attr(not(tarpaulin), inline(always))] + pub fn with_rgba_u16(mut self, buf: &'a mut [u16]) -> Result { + self.set_rgba_u16(buf)?; + Ok(self) + } + /// In-place variant of [`with_rgba_u16`](Self::with_rgba_u16). + #[cfg_attr(not(tarpaulin), inline(always))] + pub fn set_rgba_u16(&mut self, buf: &'a mut [u16]) -> Result<&mut Self, MixedSinkerError> { + let expected = self.frame_bytes(4)?; + if buf.len() < expected { + return Err(MixedSinkerError::RgbaU16BufferTooShort { + expected, + actual: buf.len(), + }); + } + self.rgba_u16 = Some(buf); + Ok(self) + } } impl Yuv444p12Sink for MixedSinker<'_, Yuv444p12> {} @@ -378,6 +620,8 @@ impl PixelSink for MixedSinker<'_, Yuv444p12> { let Self { rgb, rgb_u16, + rgba, + rgba_u16, luma, hsv, rgb_scratch, @@ -393,7 +637,26 @@ impl PixelSink for MixedSinker<'_, Yuv444p12> { } } - if let Some(buf) = rgb_u16.as_deref_mut() { + // ===== u16 RGB / RGBA path (Strategy A) ===== + let want_rgb_u16 = rgb_u16.is_some(); + let want_rgba_u16 = rgba_u16.is_some(); + + if want_rgba_u16 && !want_rgb_u16 { + let rgba_u16_buf = rgba_u16.as_deref_mut().unwrap(); + let rgba_u16_row = + rgba_u16_plane_row_slice(rgba_u16_buf, one_plane_start, one_plane_end, w, h)?; + yuv444p12_to_rgba_u16_row( + row.y(), + row.u(), + row.v(), + rgba_u16_row, + w, + row.matrix(), + row.full_range(), + use_simd, + ); + } else if want_rgb_u16 { + let rgb_u16_buf = rgb_u16.as_deref_mut().unwrap(); let rgb_plane_end = one_plane_end .checked_mul(3) @@ -403,19 +666,47 @@ impl PixelSink for MixedSinker<'_, Yuv444p12> 
{ channels: 3, })?; let rgb_plane_start = one_plane_start * 3; + let rgb_u16_row = &mut rgb_u16_buf[rgb_plane_start..rgb_plane_end]; yuv444p12_to_rgb_u16_row( row.y(), row.u(), row.v(), - &mut buf[rgb_plane_start..rgb_plane_end], + rgb_u16_row, + w, + row.matrix(), + row.full_range(), + use_simd, + ); + if want_rgba_u16 { + let rgba_u16_buf = rgba_u16.as_deref_mut().unwrap(); + let rgba_u16_row = + rgba_u16_plane_row_slice(rgba_u16_buf, one_plane_start, one_plane_end, w, h)?; + expand_rgb_u16_to_rgba_u16_row::(rgb_u16_row, rgba_u16_row, w); + } + } + + // ===== u8 RGB / RGBA / HSV path (Strategy A) ===== + let want_rgba = rgba.is_some(); + let want_hsv = hsv.is_some(); + let need_rgb_kernel = rgb.is_some() || want_hsv; + + if want_rgba && !need_rgb_kernel { + let rgba_buf = rgba.as_deref_mut().unwrap(); + let rgba_row = rgba_plane_row_slice(rgba_buf, one_plane_start, one_plane_end, w, h)?; + yuv444p12_to_rgba_row( + row.y(), + row.u(), + row.v(), + rgba_row, w, row.matrix(), row.full_range(), use_simd, ); + return Ok(()); } - if rgb.is_none() && hsv.is_none() { + if !need_rgb_kernel { return Ok(()); } @@ -449,6 +740,12 @@ impl PixelSink for MixedSinker<'_, Yuv444p12> { use_simd, ); } + + if let Some(buf) = rgba.as_deref_mut() { + let rgba_row = rgba_plane_row_slice(buf, one_plane_start, one_plane_end, w, h)?; + expand_rgb_to_rgba_row(rgb_row, rgba_row, w); + } + Ok(()) } } @@ -473,6 +770,49 @@ impl<'a> MixedSinker<'a, Yuv444p14> { self.rgb_u16 = Some(buf); Ok(self) } + + /// Attaches a packed **8-bit** RGBA output buffer. The 14-bit YUV + /// source is converted to 8-bit RGBA via the same `BITS = 14` Q15 + /// kernel family used by [`Self::with_rgb`]; alpha = `0xFF`. + #[cfg_attr(not(tarpaulin), inline(always))] + pub fn with_rgba(mut self, buf: &'a mut [u8]) -> Result { + self.set_rgba(buf)?; + Ok(self) + } + /// In-place variant of [`with_rgba`](Self::with_rgba). 
+ #[cfg_attr(not(tarpaulin), inline(always))] + pub fn set_rgba(&mut self, buf: &'a mut [u8]) -> Result<&mut Self, MixedSinkerError> { + let expected = self.frame_bytes(4)?; + if buf.len() < expected { + return Err(MixedSinkerError::RgbaBufferTooShort { + expected, + actual: buf.len(), + }); + } + self.rgba = Some(buf); + Ok(self) + } + + /// Attaches a packed **`u16`** RGBA output buffer. 14-bit low-packed + /// (`[0, 16383]`); alpha element is `16383`. + #[cfg_attr(not(tarpaulin), inline(always))] + pub fn with_rgba_u16(mut self, buf: &'a mut [u16]) -> Result { + self.set_rgba_u16(buf)?; + Ok(self) + } + /// In-place variant of [`with_rgba_u16`](Self::with_rgba_u16). + #[cfg_attr(not(tarpaulin), inline(always))] + pub fn set_rgba_u16(&mut self, buf: &'a mut [u16]) -> Result<&mut Self, MixedSinkerError> { + let expected = self.frame_bytes(4)?; + if buf.len() < expected { + return Err(MixedSinkerError::RgbaU16BufferTooShort { + expected, + actual: buf.len(), + }); + } + self.rgba_u16 = Some(buf); + Ok(self) + } } impl Yuv444p14Sink for MixedSinker<'_, Yuv444p14> {} @@ -526,6 +866,8 @@ impl PixelSink for MixedSinker<'_, Yuv444p14> { let Self { rgb, rgb_u16, + rgba, + rgba_u16, luma, hsv, rgb_scratch, @@ -541,7 +883,26 @@ impl PixelSink for MixedSinker<'_, Yuv444p14> { } } - if let Some(buf) = rgb_u16.as_deref_mut() { + // ===== u16 RGB / RGBA path (Strategy A) ===== + let want_rgb_u16 = rgb_u16.is_some(); + let want_rgba_u16 = rgba_u16.is_some(); + + if want_rgba_u16 && !want_rgb_u16 { + let rgba_u16_buf = rgba_u16.as_deref_mut().unwrap(); + let rgba_u16_row = + rgba_u16_plane_row_slice(rgba_u16_buf, one_plane_start, one_plane_end, w, h)?; + yuv444p14_to_rgba_u16_row( + row.y(), + row.u(), + row.v(), + rgba_u16_row, + w, + row.matrix(), + row.full_range(), + use_simd, + ); + } else if want_rgb_u16 { + let rgb_u16_buf = rgb_u16.as_deref_mut().unwrap(); let rgb_plane_end = one_plane_end .checked_mul(3) @@ -551,19 +912,47 @@ impl PixelSink for MixedSinker<'_, 
Yuv444p14> { channels: 3, })?; let rgb_plane_start = one_plane_start * 3; + let rgb_u16_row = &mut rgb_u16_buf[rgb_plane_start..rgb_plane_end]; yuv444p14_to_rgb_u16_row( row.y(), row.u(), row.v(), - &mut buf[rgb_plane_start..rgb_plane_end], + rgb_u16_row, + w, + row.matrix(), + row.full_range(), + use_simd, + ); + if want_rgba_u16 { + let rgba_u16_buf = rgba_u16.as_deref_mut().unwrap(); + let rgba_u16_row = + rgba_u16_plane_row_slice(rgba_u16_buf, one_plane_start, one_plane_end, w, h)?; + expand_rgb_u16_to_rgba_u16_row::(rgb_u16_row, rgba_u16_row, w); + } + } + + // ===== u8 RGB / RGBA / HSV path (Strategy A) ===== + let want_rgba = rgba.is_some(); + let want_hsv = hsv.is_some(); + let need_rgb_kernel = rgb.is_some() || want_hsv; + + if want_rgba && !need_rgb_kernel { + let rgba_buf = rgba.as_deref_mut().unwrap(); + let rgba_row = rgba_plane_row_slice(rgba_buf, one_plane_start, one_plane_end, w, h)?; + yuv444p14_to_rgba_row( + row.y(), + row.u(), + row.v(), + rgba_row, w, row.matrix(), row.full_range(), use_simd, ); + return Ok(()); } - if rgb.is_none() && hsv.is_none() { + if !need_rgb_kernel { return Ok(()); } @@ -597,6 +986,12 @@ impl PixelSink for MixedSinker<'_, Yuv444p14> { use_simd, ); } + + if let Some(buf) = rgba.as_deref_mut() { + let rgba_row = rgba_plane_row_slice(buf, one_plane_start, one_plane_end, w, h)?; + expand_rgb_to_rgba_row(rgb_row, rgba_row, w); + } + Ok(()) } } @@ -622,6 +1017,49 @@ impl<'a> MixedSinker<'a, Yuv444p16> { self.rgb_u16 = Some(buf); Ok(self) } + + /// Attaches a packed **8-bit** RGBA output buffer. The 16-bit YUV + /// source is converted to 8-bit RGBA via the dedicated 16-bit + /// kernel; alpha = `0xFF`. + #[cfg_attr(not(tarpaulin), inline(always))] + pub fn with_rgba(mut self, buf: &'a mut [u8]) -> Result { + self.set_rgba(buf)?; + Ok(self) + } + /// In-place variant of [`with_rgba`](Self::with_rgba). 
+ #[cfg_attr(not(tarpaulin), inline(always))] + pub fn set_rgba(&mut self, buf: &'a mut [u8]) -> Result<&mut Self, MixedSinkerError> { + let expected = self.frame_bytes(4)?; + if buf.len() < expected { + return Err(MixedSinkerError::RgbaBufferTooShort { + expected, + actual: buf.len(), + }); + } + self.rgba = Some(buf); + Ok(self) + } + + /// Attaches a packed **`u16`** RGBA output buffer. Full-range output + /// `[0, 65535]`; alpha element is `0xFFFF`. + #[cfg_attr(not(tarpaulin), inline(always))] + pub fn with_rgba_u16(mut self, buf: &'a mut [u16]) -> Result { + self.set_rgba_u16(buf)?; + Ok(self) + } + /// In-place variant of [`with_rgba_u16`](Self::with_rgba_u16). + #[cfg_attr(not(tarpaulin), inline(always))] + pub fn set_rgba_u16(&mut self, buf: &'a mut [u16]) -> Result<&mut Self, MixedSinkerError> { + let expected = self.frame_bytes(4)?; + if buf.len() < expected { + return Err(MixedSinkerError::RgbaU16BufferTooShort { + expected, + actual: buf.len(), + }); + } + self.rgba_u16 = Some(buf); + Ok(self) + } } impl Yuv444p16Sink for MixedSinker<'_, Yuv444p16> {} @@ -675,6 +1113,8 @@ impl PixelSink for MixedSinker<'_, Yuv444p16> { let Self { rgb, rgb_u16, + rgba, + rgba_u16, luma, hsv, rgb_scratch, @@ -690,7 +1130,26 @@ impl PixelSink for MixedSinker<'_, Yuv444p16> { } } - if let Some(buf) = rgb_u16.as_deref_mut() { + // ===== u16 RGB / RGBA path (Strategy A) ===== + let want_rgb_u16 = rgb_u16.is_some(); + let want_rgba_u16 = rgba_u16.is_some(); + + if want_rgba_u16 && !want_rgb_u16 { + let rgba_u16_buf = rgba_u16.as_deref_mut().unwrap(); + let rgba_u16_row = + rgba_u16_plane_row_slice(rgba_u16_buf, one_plane_start, one_plane_end, w, h)?; + yuv444p16_to_rgba_u16_row( + row.y(), + row.u(), + row.v(), + rgba_u16_row, + w, + row.matrix(), + row.full_range(), + use_simd, + ); + } else if want_rgb_u16 { + let rgb_u16_buf = rgb_u16.as_deref_mut().unwrap(); let rgb_plane_end = one_plane_end .checked_mul(3) @@ -700,19 +1159,47 @@ impl PixelSink for MixedSinker<'_, 
Yuv444p16> { channels: 3, })?; let rgb_plane_start = one_plane_start * 3; + let rgb_u16_row = &mut rgb_u16_buf[rgb_plane_start..rgb_plane_end]; yuv444p16_to_rgb_u16_row( row.y(), row.u(), row.v(), - &mut buf[rgb_plane_start..rgb_plane_end], + rgb_u16_row, + w, + row.matrix(), + row.full_range(), + use_simd, + ); + if want_rgba_u16 { + let rgba_u16_buf = rgba_u16.as_deref_mut().unwrap(); + let rgba_u16_row = + rgba_u16_plane_row_slice(rgba_u16_buf, one_plane_start, one_plane_end, w, h)?; + expand_rgb_u16_to_rgba_u16_row::(rgb_u16_row, rgba_u16_row, w); + } + } + + // ===== u8 RGB / RGBA / HSV path (Strategy A) ===== + let want_rgba = rgba.is_some(); + let want_hsv = hsv.is_some(); + let need_rgb_kernel = rgb.is_some() || want_hsv; + + if want_rgba && !need_rgb_kernel { + let rgba_buf = rgba.as_deref_mut().unwrap(); + let rgba_row = rgba_plane_row_slice(rgba_buf, one_plane_start, one_plane_end, w, h)?; + yuv444p16_to_rgba_row( + row.y(), + row.u(), + row.v(), + rgba_row, w, row.matrix(), row.full_range(), use_simd, ); + return Ok(()); } - if rgb.is_none() && hsv.is_none() { + if !need_rgb_kernel { return Ok(()); } @@ -746,6 +1233,12 @@ impl PixelSink for MixedSinker<'_, Yuv444p16> { use_simd, ); } + + if let Some(buf) = rgba.as_deref_mut() { + let rgba_row = rgba_plane_row_slice(buf, one_plane_start, one_plane_end, w, h)?; + expand_rgb_to_rgba_row(rgb_row, rgba_row, w); + } + Ok(()) } } @@ -777,6 +1270,48 @@ impl<'a> MixedSinker<'a, P410> { self.rgb_u16 = Some(buf); Ok(self) } + + /// Attaches a packed **8-bit** RGBA output buffer. P410 has no alpha + /// plane, so alpha = `0xFF` (opaque). + #[cfg_attr(not(tarpaulin), inline(always))] + pub fn with_rgba(mut self, buf: &'a mut [u8]) -> Result { + self.set_rgba(buf)?; + Ok(self) + } + /// In-place variant of [`with_rgba`](Self::with_rgba). 
+ #[cfg_attr(not(tarpaulin), inline(always))] + pub fn set_rgba(&mut self, buf: &'a mut [u8]) -> Result<&mut Self, MixedSinkerError> { + let expected = self.frame_bytes(4)?; + if buf.len() < expected { + return Err(MixedSinkerError::RgbaBufferTooShort { + expected, + actual: buf.len(), + }); + } + self.rgba = Some(buf); + Ok(self) + } + + /// Attaches a packed **`u16`** RGBA output buffer. 10-bit low-packed + /// (`[0, 1023]`); alpha element is `1023`. + #[cfg_attr(not(tarpaulin), inline(always))] + pub fn with_rgba_u16(mut self, buf: &'a mut [u16]) -> Result { + self.set_rgba_u16(buf)?; + Ok(self) + } + /// In-place variant of [`with_rgba_u16`](Self::with_rgba_u16). + #[cfg_attr(not(tarpaulin), inline(always))] + pub fn set_rgba_u16(&mut self, buf: &'a mut [u16]) -> Result<&mut Self, MixedSinkerError> { + let expected = self.frame_bytes(4)?; + if buf.len() < expected { + return Err(MixedSinkerError::RgbaU16BufferTooShort { + expected, + actual: buf.len(), + }); + } + self.rgba_u16 = Some(buf); + Ok(self) + } } impl P410Sink for MixedSinker<'_, P410> {} @@ -790,6 +1325,7 @@ impl PixelSink for MixedSinker<'_, P410> { } fn process(&mut self, row: P410Row<'_>) -> Result<(), Self::Error> { + const BITS: u32 = 10; let w = self.width; let h = self.height; let idx = row.row(); @@ -822,6 +1358,8 @@ impl PixelSink for MixedSinker<'_, P410> { let Self { rgb, rgb_u16, + rgba, + rgba_u16, luma, hsv, rgb_scratch, @@ -837,7 +1375,25 @@ impl PixelSink for MixedSinker<'_, P410> { } } - if let Some(buf) = rgb_u16.as_deref_mut() { + // ===== u16 RGB / RGBA path (Strategy A) ===== + let want_rgb_u16 = rgb_u16.is_some(); + let want_rgba_u16 = rgba_u16.is_some(); + + if want_rgba_u16 && !want_rgb_u16 { + let rgba_u16_buf = rgba_u16.as_deref_mut().unwrap(); + let rgba_u16_row = + rgba_u16_plane_row_slice(rgba_u16_buf, one_plane_start, one_plane_end, w, h)?; + p410_to_rgba_u16_row( + row.y(), + row.uv_full(), + rgba_u16_row, + w, + row.matrix(), + row.full_range(), + use_simd, + ); + } 
else if want_rgb_u16 { + let rgb_u16_buf = rgb_u16.as_deref_mut().unwrap(); let rgb_plane_end = one_plane_end .checked_mul(3) @@ -847,18 +1403,45 @@ impl PixelSink for MixedSinker<'_, P410> { channels: 3, })?; let rgb_plane_start = one_plane_start * 3; + let rgb_u16_row = &mut rgb_u16_buf[rgb_plane_start..rgb_plane_end]; p410_to_rgb_u16_row( row.y(), row.uv_full(), - &mut buf[rgb_plane_start..rgb_plane_end], + rgb_u16_row, + w, + row.matrix(), + row.full_range(), + use_simd, + ); + if want_rgba_u16 { + let rgba_u16_buf = rgba_u16.as_deref_mut().unwrap(); + let rgba_u16_row = + rgba_u16_plane_row_slice(rgba_u16_buf, one_plane_start, one_plane_end, w, h)?; + expand_rgb_u16_to_rgba_u16_row::(rgb_u16_row, rgba_u16_row, w); + } + } + + // ===== u8 RGB / RGBA / HSV path (Strategy A) ===== + let want_rgba = rgba.is_some(); + let want_hsv = hsv.is_some(); + let need_rgb_kernel = rgb.is_some() || want_hsv; + + if want_rgba && !need_rgb_kernel { + let rgba_buf = rgba.as_deref_mut().unwrap(); + let rgba_row = rgba_plane_row_slice(rgba_buf, one_plane_start, one_plane_end, w, h)?; + p410_to_rgba_row( + row.y(), + row.uv_full(), + rgba_row, w, row.matrix(), row.full_range(), use_simd, ); + return Ok(()); } - if rgb.is_none() && hsv.is_none() { + if !need_rgb_kernel { return Ok(()); } @@ -891,6 +1474,12 @@ impl PixelSink for MixedSinker<'_, P410> { use_simd, ); } + + if let Some(buf) = rgba.as_deref_mut() { + let rgba_row = rgba_plane_row_slice(buf, one_plane_start, one_plane_end, w, h)?; + expand_rgb_to_rgba_row(rgb_row, rgba_row, w); + } + Ok(()) } } @@ -917,6 +1506,48 @@ impl<'a> MixedSinker<'a, P412> { self.rgb_u16 = Some(buf); Ok(self) } + + /// Attaches a packed **8-bit** RGBA output buffer. P412 has no alpha + /// plane, so alpha = `0xFF` (opaque). + #[cfg_attr(not(tarpaulin), inline(always))] + pub fn with_rgba(mut self, buf: &'a mut [u8]) -> Result { + self.set_rgba(buf)?; + Ok(self) + } + /// In-place variant of [`with_rgba`](Self::with_rgba). 
+ #[cfg_attr(not(tarpaulin), inline(always))] + pub fn set_rgba(&mut self, buf: &'a mut [u8]) -> Result<&mut Self, MixedSinkerError> { + let expected = self.frame_bytes(4)?; + if buf.len() < expected { + return Err(MixedSinkerError::RgbaBufferTooShort { + expected, + actual: buf.len(), + }); + } + self.rgba = Some(buf); + Ok(self) + } + + /// Attaches a packed **`u16`** RGBA output buffer. 12-bit low-packed + /// (`[0, 4095]`); alpha element is `4095`. + #[cfg_attr(not(tarpaulin), inline(always))] + pub fn with_rgba_u16(mut self, buf: &'a mut [u16]) -> Result { + self.set_rgba_u16(buf)?; + Ok(self) + } + /// In-place variant of [`with_rgba_u16`](Self::with_rgba_u16). + #[cfg_attr(not(tarpaulin), inline(always))] + pub fn set_rgba_u16(&mut self, buf: &'a mut [u16]) -> Result<&mut Self, MixedSinkerError> { + let expected = self.frame_bytes(4)?; + if buf.len() < expected { + return Err(MixedSinkerError::RgbaU16BufferTooShort { + expected, + actual: buf.len(), + }); + } + self.rgba_u16 = Some(buf); + Ok(self) + } } impl P412Sink for MixedSinker<'_, P412> {} @@ -930,6 +1561,7 @@ impl PixelSink for MixedSinker<'_, P412> { } fn process(&mut self, row: P412Row<'_>) -> Result<(), Self::Error> { + const BITS: u32 = 12; let w = self.width; let h = self.height; let idx = row.row(); @@ -961,6 +1593,8 @@ impl PixelSink for MixedSinker<'_, P412> { let Self { rgb, rgb_u16, + rgba, + rgba_u16, luma, hsv, rgb_scratch, @@ -976,7 +1610,25 @@ impl PixelSink for MixedSinker<'_, P412> { } } - if let Some(buf) = rgb_u16.as_deref_mut() { + // ===== u16 RGB / RGBA path (Strategy A) ===== + let want_rgb_u16 = rgb_u16.is_some(); + let want_rgba_u16 = rgba_u16.is_some(); + + if want_rgba_u16 && !want_rgb_u16 { + let rgba_u16_buf = rgba_u16.as_deref_mut().unwrap(); + let rgba_u16_row = + rgba_u16_plane_row_slice(rgba_u16_buf, one_plane_start, one_plane_end, w, h)?; + p412_to_rgba_u16_row( + row.y(), + row.uv_full(), + rgba_u16_row, + w, + row.matrix(), + row.full_range(), + use_simd, + ); + } 
else if want_rgb_u16 { + let rgb_u16_buf = rgb_u16.as_deref_mut().unwrap(); let rgb_plane_end = one_plane_end .checked_mul(3) @@ -986,18 +1638,45 @@ impl PixelSink for MixedSinker<'_, P412> { channels: 3, })?; let rgb_plane_start = one_plane_start * 3; + let rgb_u16_row = &mut rgb_u16_buf[rgb_plane_start..rgb_plane_end]; p412_to_rgb_u16_row( row.y(), row.uv_full(), - &mut buf[rgb_plane_start..rgb_plane_end], + rgb_u16_row, + w, + row.matrix(), + row.full_range(), + use_simd, + ); + if want_rgba_u16 { + let rgba_u16_buf = rgba_u16.as_deref_mut().unwrap(); + let rgba_u16_row = + rgba_u16_plane_row_slice(rgba_u16_buf, one_plane_start, one_plane_end, w, h)?; + expand_rgb_u16_to_rgba_u16_row::(rgb_u16_row, rgba_u16_row, w); + } + } + + // ===== u8 RGB / RGBA / HSV path (Strategy A) ===== + let want_rgba = rgba.is_some(); + let want_hsv = hsv.is_some(); + let need_rgb_kernel = rgb.is_some() || want_hsv; + + if want_rgba && !need_rgb_kernel { + let rgba_buf = rgba.as_deref_mut().unwrap(); + let rgba_row = rgba_plane_row_slice(rgba_buf, one_plane_start, one_plane_end, w, h)?; + p412_to_rgba_row( + row.y(), + row.uv_full(), + rgba_row, w, row.matrix(), row.full_range(), use_simd, ); + return Ok(()); } - if rgb.is_none() && hsv.is_none() { + if !need_rgb_kernel { return Ok(()); } @@ -1030,6 +1709,12 @@ impl PixelSink for MixedSinker<'_, P412> { use_simd, ); } + + if let Some(buf) = rgba.as_deref_mut() { + let rgba_row = rgba_plane_row_slice(buf, one_plane_start, one_plane_end, w, h)?; + expand_rgb_to_rgba_row(rgb_row, rgba_row, w); + } + Ok(()) } } @@ -1060,6 +1745,48 @@ impl<'a> MixedSinker<'a, P416> { self.rgb_u16 = Some(buf); Ok(self) } + + /// Attaches a packed **8-bit** RGBA output buffer. P416 has no alpha + /// plane, so alpha = `0xFF` (opaque). + #[cfg_attr(not(tarpaulin), inline(always))] + pub fn with_rgba(mut self, buf: &'a mut [u8]) -> Result { + self.set_rgba(buf)?; + Ok(self) + } + /// In-place variant of [`with_rgba`](Self::with_rgba). 
+ #[cfg_attr(not(tarpaulin), inline(always))] + pub fn set_rgba(&mut self, buf: &'a mut [u8]) -> Result<&mut Self, MixedSinkerError> { + let expected = self.frame_bytes(4)?; + if buf.len() < expected { + return Err(MixedSinkerError::RgbaBufferTooShort { + expected, + actual: buf.len(), + }); + } + self.rgba = Some(buf); + Ok(self) + } + + /// Attaches a packed **`u16`** RGBA output buffer. Full-range output + /// `[0, 65535]`; alpha element is `0xFFFF`. + #[cfg_attr(not(tarpaulin), inline(always))] + pub fn with_rgba_u16(mut self, buf: &'a mut [u16]) -> Result { + self.set_rgba_u16(buf)?; + Ok(self) + } + /// In-place variant of [`with_rgba_u16`](Self::with_rgba_u16). + #[cfg_attr(not(tarpaulin), inline(always))] + pub fn set_rgba_u16(&mut self, buf: &'a mut [u16]) -> Result<&mut Self, MixedSinkerError> { + let expected = self.frame_bytes(4)?; + if buf.len() < expected { + return Err(MixedSinkerError::RgbaU16BufferTooShort { + expected, + actual: buf.len(), + }); + } + self.rgba_u16 = Some(buf); + Ok(self) + } } impl P416Sink for MixedSinker<'_, P416> {} @@ -1073,6 +1800,7 @@ impl PixelSink for MixedSinker<'_, P416> { } fn process(&mut self, row: P416Row<'_>) -> Result<(), Self::Error> { + const BITS: u32 = 16; let w = self.width; let h = self.height; let idx = row.row(); @@ -1104,6 +1832,8 @@ impl PixelSink for MixedSinker<'_, P416> { let Self { rgb, rgb_u16, + rgba, + rgba_u16, luma, hsv, rgb_scratch, @@ -1119,7 +1849,25 @@ impl PixelSink for MixedSinker<'_, P416> { } } - if let Some(buf) = rgb_u16.as_deref_mut() { + // ===== u16 RGB / RGBA path (Strategy A) ===== + let want_rgb_u16 = rgb_u16.is_some(); + let want_rgba_u16 = rgba_u16.is_some(); + + if want_rgba_u16 && !want_rgb_u16 { + let rgba_u16_buf = rgba_u16.as_deref_mut().unwrap(); + let rgba_u16_row = + rgba_u16_plane_row_slice(rgba_u16_buf, one_plane_start, one_plane_end, w, h)?; + p416_to_rgba_u16_row( + row.y(), + row.uv_full(), + rgba_u16_row, + w, + row.matrix(), + row.full_range(), + use_simd, + ); + 
} else if want_rgb_u16 { + let rgb_u16_buf = rgb_u16.as_deref_mut().unwrap(); let rgb_plane_end = one_plane_end .checked_mul(3) @@ -1129,18 +1877,45 @@ impl PixelSink for MixedSinker<'_, P416> { channels: 3, })?; let rgb_plane_start = one_plane_start * 3; + let rgb_u16_row = &mut rgb_u16_buf[rgb_plane_start..rgb_plane_end]; p416_to_rgb_u16_row( row.y(), row.uv_full(), - &mut buf[rgb_plane_start..rgb_plane_end], + rgb_u16_row, + w, + row.matrix(), + row.full_range(), + use_simd, + ); + if want_rgba_u16 { + let rgba_u16_buf = rgba_u16.as_deref_mut().unwrap(); + let rgba_u16_row = + rgba_u16_plane_row_slice(rgba_u16_buf, one_plane_start, one_plane_end, w, h)?; + expand_rgb_u16_to_rgba_u16_row::(rgb_u16_row, rgba_u16_row, w); + } + } + + // ===== u8 RGB / RGBA / HSV path (Strategy A) ===== + let want_rgba = rgba.is_some(); + let want_hsv = hsv.is_some(); + let need_rgb_kernel = rgb.is_some() || want_hsv; + + if want_rgba && !need_rgb_kernel { + let rgba_buf = rgba.as_deref_mut().unwrap(); + let rgba_row = rgba_plane_row_slice(rgba_buf, one_plane_start, one_plane_end, w, h)?; + p416_to_rgba_row( + row.y(), + row.uv_full(), + rgba_row, w, row.matrix(), row.full_range(), use_simd, ); + return Ok(()); } - if rgb.is_none() && hsv.is_none() { + if !need_rgb_kernel { return Ok(()); } @@ -1173,6 +1948,12 @@ impl PixelSink for MixedSinker<'_, P416> { use_simd, ); } + + if let Some(buf) = rgba.as_deref_mut() { + let rgba_row = rgba_plane_row_slice(buf, one_plane_start, one_plane_end, w, h)?; + expand_rgb_to_rgba_row(rgb_row, rgba_row, w); + } + Ok(()) } } diff --git a/src/sinker/mixed/tests.rs b/src/sinker/mixed/tests.rs index c5e87dd..440fce4 100644 --- a/src/sinker/mixed/tests.rs +++ b/src/sinker/mixed/tests.rs @@ -3394,6 +3394,54 @@ fn yuv420p12_with_simd_false_matches_with_simd_true() { assert_eq!(rgb_u16_scalar, rgb_u16_simd); } +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn 
yuv420p12_rgba_u8_only_gray_with_opaque_alpha() { + // 12-bit mid-gray (Y=U=V=2048) → 8-bit RGBA ≈ (128, 128, 128, 255). + let (yp, up, vp) = solid_yuv420p12_frame(16, 8, 2048, 2048, 2048); + let src = Yuv420p12Frame::new(&yp, &up, &vp, 16, 8, 16, 8, 8); + + let mut rgba = std::vec![0u8; 16 * 8 * 4]; + let mut sink = MixedSinker::::new(16, 8) + .with_rgba(&mut rgba) + .unwrap(); + yuv420p12_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); + + for px in rgba.chunks(4) { + assert!(px[0].abs_diff(128) <= 1); + assert_eq!(px[0], px[1]); + assert_eq!(px[1], px[2]); + assert_eq!(px[3], 0xFF, "alpha must be opaque"); + } +} + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn yuv420p12_rgba_u16_only_native_depth_gray_with_opaque_alpha() { + // 12-bit mid-gray → u16 RGBA: each color element ≈ 2048, alpha = 4095. + let (yp, up, vp) = solid_yuv420p12_frame(16, 8, 2048, 2048, 2048); + let src = Yuv420p12Frame::new(&yp, &up, &vp, 16, 8, 16, 8, 8); + + let mut rgba = std::vec![0u16; 16 * 8 * 4]; + let mut sink = MixedSinker::::new(16, 8) + .with_rgba_u16(&mut rgba) + .unwrap(); + yuv420p12_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); + + for px in rgba.chunks(4) { + assert!(px[0].abs_diff(2048) <= 1, "got {px:?}"); + assert_eq!(px[0], px[1]); + assert_eq!(px[1], px[2]); + assert_eq!(px[3], 4095, "alpha must equal (1 << 12) - 1"); + } +} + // ---- Yuv420p14 --------------------------------------------------------- fn solid_yuv420p14_frame( @@ -3533,6 +3581,54 @@ fn yuv420p14_with_simd_false_matches_with_simd_true() { assert_eq!(rgb_u16_scalar, rgb_u16_simd); } +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn yuv420p14_rgba_u8_only_gray_with_opaque_alpha() { + // 14-bit mid-gray (Y=U=V=8192) → 8-bit RGBA ≈ (128, 128, 128, 255). 
+ let (yp, up, vp) = solid_yuv420p14_frame(16, 8, 8192, 8192, 8192); + let src = Yuv420p14Frame::new(&yp, &up, &vp, 16, 8, 16, 8, 8); + + let mut rgba = std::vec![0u8; 16 * 8 * 4]; + let mut sink = MixedSinker::::new(16, 8) + .with_rgba(&mut rgba) + .unwrap(); + yuv420p14_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); + + for px in rgba.chunks(4) { + assert!(px[0].abs_diff(128) <= 1); + assert_eq!(px[0], px[1]); + assert_eq!(px[1], px[2]); + assert_eq!(px[3], 0xFF, "alpha must be opaque"); + } +} + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn yuv420p14_rgba_u16_only_native_depth_gray_with_opaque_alpha() { + // 14-bit mid-gray → u16 RGBA: each color element ≈ 8192, alpha = 16383. + let (yp, up, vp) = solid_yuv420p14_frame(16, 8, 8192, 8192, 8192); + let src = Yuv420p14Frame::new(&yp, &up, &vp, 16, 8, 16, 8, 8); + + let mut rgba = std::vec![0u16; 16 * 8 * 4]; + let mut sink = MixedSinker::::new(16, 8) + .with_rgba_u16(&mut rgba) + .unwrap(); + yuv420p14_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); + + for px in rgba.chunks(4) { + assert!(px[0].abs_diff(8192) <= 1, "got {px:?}"); + assert_eq!(px[0], px[1]); + assert_eq!(px[1], px[2]); + assert_eq!(px[3], 16383, "alpha must equal (1 << 14) - 1"); + } +} + // ---- P012 -------------------------------------------------------------- // // Semi-planar 12-bit, high-bit-packed (samples in high 12 of each @@ -3726,6 +3822,56 @@ fn p012_with_simd_false_matches_with_simd_true() { assert_eq!(rgb_u16_scalar, rgb_u16_simd); } +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn p012_rgba_u8_only_gray_with_opaque_alpha() { + // P012 mid-gray (12-bit values shifted into the high 12): Y/U/V = 2048 << 4. + // Output 8-bit RGBA ≈ (128, 128, 128, 255). 
+ let (yp, uvp) = solid_p012_frame(16, 8, 2048, 2048, 2048); + let src = P012Frame::new(&yp, &uvp, 16, 8, 16, 16); + + let mut rgba = std::vec![0u8; 16 * 8 * 4]; + let mut sink = MixedSinker::::new(16, 8) + .with_rgba(&mut rgba) + .unwrap(); + p012_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); + + for px in rgba.chunks(4) { + assert!(px[0].abs_diff(128) <= 1); + assert_eq!(px[0], px[1]); + assert_eq!(px[1], px[2]); + assert_eq!(px[3], 0xFF, "alpha must be opaque"); + } +} + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn p012_rgba_u16_only_native_depth_gray_with_opaque_alpha() { + // P012 mid-gray → u16 RGBA: each color element ≈ 2048 (low-bit-packed), + // alpha = (1 << 12) - 1 = 4095. + let (yp, uvp) = solid_p012_frame(16, 8, 2048, 2048, 2048); + let src = P012Frame::new(&yp, &uvp, 16, 8, 16, 16); + + let mut rgba = std::vec![0u16; 16 * 8 * 4]; + let mut sink = MixedSinker::::new(16, 8) + .with_rgba_u16(&mut rgba) + .unwrap(); + p012_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); + + for px in rgba.chunks(4) { + assert!(px[0].abs_diff(2048) <= 1, "got {px:?}"); + assert_eq!(px[0], px[1]); + assert_eq!(px[1], px[2]); + assert_eq!(px[3], 4095, "alpha must equal (1 << 12) - 1"); + } +} + // ---- Yuv420p16 --------------------------------------------------------- // // Planar 16-bit, full u16 range. Mid-gray is Y=UV=32768; full-range @@ -4392,6 +4538,54 @@ fn yuv422p12_gray_to_gray() { } } +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn yuv422p12_rgba_u8_only_gray_with_opaque_alpha() { + // 12-bit mid-gray → 8-bit RGBA ≈ (128, 128, 128, 255). 
+ let (yp, up, vp) = solid_yuv422p_n_frame(16, 8, 2048, 2048, 2048); + let src = Yuv422p12Frame::new(&yp, &up, &vp, 16, 8, 16, 8, 8); + + let mut rgba = std::vec![0u8; 16 * 8 * 4]; + let mut sink = MixedSinker::::new(16, 8) + .with_rgba(&mut rgba) + .unwrap(); + yuv422p12_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); + + for px in rgba.chunks(4) { + assert!(px[0].abs_diff(128) <= 1); + assert_eq!(px[0], px[1]); + assert_eq!(px[1], px[2]); + assert_eq!(px[3], 0xFF, "alpha must be opaque"); + } +} + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn yuv422p12_rgba_u16_only_native_depth_gray_with_opaque_alpha() { + // 12-bit mid-gray → u16 RGBA: each color element ≈ 2048, alpha = 4095. + let (yp, up, vp) = solid_yuv422p_n_frame(16, 8, 2048, 2048, 2048); + let src = Yuv422p12Frame::new(&yp, &up, &vp, 16, 8, 16, 8, 8); + + let mut rgba = std::vec![0u16; 16 * 8 * 4]; + let mut sink = MixedSinker::::new(16, 8) + .with_rgba_u16(&mut rgba) + .unwrap(); + yuv422p12_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); + + for px in rgba.chunks(4) { + assert!(px[0].abs_diff(2048) <= 1, "got {px:?}"); + assert_eq!(px[0], px[1]); + assert_eq!(px[1], px[2]); + assert_eq!(px[3], 4095, "alpha must equal (1 << 12) - 1"); + } +} + #[test] #[cfg_attr( miri, @@ -4414,6 +4608,54 @@ fn yuv422p14_gray_to_gray() { } } +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn yuv422p14_rgba_u8_only_gray_with_opaque_alpha() { + // 14-bit mid-gray → 8-bit RGBA ≈ (128, 128, 128, 255). 
+ let (yp, up, vp) = solid_yuv422p_n_frame(16, 8, 8192, 8192, 8192); + let src = Yuv422p14Frame::new(&yp, &up, &vp, 16, 8, 16, 8, 8); + + let mut rgba = std::vec![0u8; 16 * 8 * 4]; + let mut sink = MixedSinker::::new(16, 8) + .with_rgba(&mut rgba) + .unwrap(); + yuv422p14_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); + + for px in rgba.chunks(4) { + assert!(px[0].abs_diff(128) <= 1); + assert_eq!(px[0], px[1]); + assert_eq!(px[1], px[2]); + assert_eq!(px[3], 0xFF, "alpha must be opaque"); + } +} + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn yuv422p14_rgba_u16_only_native_depth_gray_with_opaque_alpha() { + // 14-bit mid-gray → u16 RGBA: each color element ≈ 8192, alpha = 16383. + let (yp, up, vp) = solid_yuv422p_n_frame(16, 8, 8192, 8192, 8192); + let src = Yuv422p14Frame::new(&yp, &up, &vp, 16, 8, 16, 8, 8); + + let mut rgba = std::vec![0u16; 16 * 8 * 4]; + let mut sink = MixedSinker::::new(16, 8) + .with_rgba_u16(&mut rgba) + .unwrap(); + yuv422p14_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); + + for px in rgba.chunks(4) { + assert!(px[0].abs_diff(8192) <= 1, "got {px:?}"); + assert_eq!(px[0], px[1]); + assert_eq!(px[1], px[2]); + assert_eq!(px[3], 16383, "alpha must equal (1 << 14) - 1"); + } +} + #[test] #[cfg_attr( miri, @@ -4493,20 +4735,22 @@ fn yuv444p12_gray_to_gray() { miri, ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" )] -fn yuv444p14_gray_to_gray() { - let (yp, up, vp) = solid_yuv444p_n_frame(16, 8, 8192, 8192, 8192); - let src = Yuv444p14Frame::new(&yp, &up, &vp, 16, 8, 16, 16, 16); +fn yuv444p12_rgba_u8_only_gray_with_opaque_alpha() { + // 12-bit mid-gray → 8-bit RGBA ≈ (128, 128, 128, 255). 
+ let (yp, up, vp) = solid_yuv444p_n_frame(16, 8, 2048, 2048, 2048); + let src = Yuv444p12Frame::new(&yp, &up, &vp, 16, 8, 16, 16, 16); - let mut rgb = std::vec![0u8; 16 * 8 * 3]; - let mut sink = MixedSinker::::new(16, 8) - .with_rgb(&mut rgb) + let mut rgba = std::vec![0u8; 16 * 8 * 4]; + let mut sink = MixedSinker::::new(16, 8) + .with_rgba(&mut rgba) .unwrap(); - yuv444p14_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); + yuv444p12_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); - for px in rgb.chunks(3) { + for px in rgba.chunks(4) { assert!(px[0].abs_diff(128) <= 1); assert_eq!(px[0], px[1]); assert_eq!(px[1], px[2]); + assert_eq!(px[3], 0xFF, "alpha must be opaque"); } } @@ -4515,26 +4759,120 @@ fn yuv444p14_gray_to_gray() { miri, ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" )] -fn yuv444p16_gray_to_gray_u16() { - let (yp, up, vp) = solid_yuv444p_n_frame(16, 8, 32768, 32768, 32768); - let src = Yuv444p16Frame::new(&yp, &up, &vp, 16, 8, 16, 16, 16); +fn yuv444p12_rgba_u16_only_native_depth_gray_with_opaque_alpha() { + // 12-bit mid-gray → u16 RGBA: each color element ≈ 2048, alpha = 4095. 
+ let (yp, up, vp) = solid_yuv444p_n_frame(16, 8, 2048, 2048, 2048); + let src = Yuv444p12Frame::new(&yp, &up, &vp, 16, 8, 16, 16, 16); - let mut rgb_u8 = std::vec![0u8; 16 * 8 * 3]; - let mut rgb_u16 = std::vec![0u16; 16 * 8 * 3]; - let mut sink = MixedSinker::::new(16, 8) - .with_rgb(&mut rgb_u8) - .unwrap() - .with_rgb_u16(&mut rgb_u16) + let mut rgba = std::vec![0u16; 16 * 8 * 4]; + let mut sink = MixedSinker::::new(16, 8) + .with_rgba_u16(&mut rgba) .unwrap(); - yuv444p16_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); + yuv444p12_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); - for px in rgb_u8.chunks(3) { - assert!(px[0].abs_diff(128) <= 1); - assert_eq!(px[0], px[1]); - assert_eq!(px[1], px[2]); - } - for px in rgb_u16.chunks(3) { - assert!(px[0].abs_diff(32768) <= 256); + for px in rgba.chunks(4) { + assert!(px[0].abs_diff(2048) <= 1, "got {px:?}"); + assert_eq!(px[0], px[1]); + assert_eq!(px[1], px[2]); + assert_eq!(px[3], 4095, "alpha must equal (1 << 12) - 1"); + } +} + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn yuv444p14_gray_to_gray() { + let (yp, up, vp) = solid_yuv444p_n_frame(16, 8, 8192, 8192, 8192); + let src = Yuv444p14Frame::new(&yp, &up, &vp, 16, 8, 16, 16, 16); + + let mut rgb = std::vec![0u8; 16 * 8 * 3]; + let mut sink = MixedSinker::::new(16, 8) + .with_rgb(&mut rgb) + .unwrap(); + yuv444p14_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); + + for px in rgb.chunks(3) { + assert!(px[0].abs_diff(128) <= 1); + assert_eq!(px[0], px[1]); + assert_eq!(px[1], px[2]); + } +} + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn yuv444p14_rgba_u8_only_gray_with_opaque_alpha() { + // 14-bit mid-gray → 8-bit RGBA ≈ (128, 128, 128, 255). 
+ let (yp, up, vp) = solid_yuv444p_n_frame(16, 8, 8192, 8192, 8192); + let src = Yuv444p14Frame::new(&yp, &up, &vp, 16, 8, 16, 16, 16); + + let mut rgba = std::vec![0u8; 16 * 8 * 4]; + let mut sink = MixedSinker::::new(16, 8) + .with_rgba(&mut rgba) + .unwrap(); + yuv444p14_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); + + for px in rgba.chunks(4) { + assert!(px[0].abs_diff(128) <= 1); + assert_eq!(px[0], px[1]); + assert_eq!(px[1], px[2]); + assert_eq!(px[3], 0xFF, "alpha must be opaque"); + } +} + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn yuv444p14_rgba_u16_only_native_depth_gray_with_opaque_alpha() { + // 14-bit mid-gray → u16 RGBA: each color element ≈ 8192, alpha = 16383. + let (yp, up, vp) = solid_yuv444p_n_frame(16, 8, 8192, 8192, 8192); + let src = Yuv444p14Frame::new(&yp, &up, &vp, 16, 8, 16, 16, 16); + + let mut rgba = std::vec![0u16; 16 * 8 * 4]; + let mut sink = MixedSinker::::new(16, 8) + .with_rgba_u16(&mut rgba) + .unwrap(); + yuv444p14_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); + + for px in rgba.chunks(4) { + assert!(px[0].abs_diff(8192) <= 1, "got {px:?}"); + assert_eq!(px[0], px[1]); + assert_eq!(px[1], px[2]); + assert_eq!(px[3], 16383, "alpha must equal (1 << 14) - 1"); + } +} + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn yuv444p16_gray_to_gray_u16() { + let (yp, up, vp) = solid_yuv444p_n_frame(16, 8, 32768, 32768, 32768); + let src = Yuv444p16Frame::new(&yp, &up, &vp, 16, 8, 16, 16, 16); + + let mut rgb_u8 = std::vec![0u8; 16 * 8 * 3]; + let mut rgb_u16 = std::vec![0u16; 16 * 8 * 3]; + let mut sink = MixedSinker::::new(16, 8) + .with_rgb(&mut rgb_u8) + .unwrap() + .with_rgb_u16(&mut rgb_u16) + .unwrap(); + yuv444p16_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); + + for px in rgb_u8.chunks(3) { + assert!(px[0].abs_diff(128) <= 1); + assert_eq!(px[0], px[1]); + 
assert_eq!(px[1], px[2]); + } + for px in rgb_u16.chunks(3) { + assert!(px[0].abs_diff(32768) <= 256); assert_eq!(px[0], px[1]); assert_eq!(px[1], px[2]); } @@ -4807,6 +5145,58 @@ fn yuv420p9_gray_to_gray() { } } +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn yuv420p9_rgba_u8_only_gray_with_opaque_alpha() { + // 9-bit mid-gray (Y=U=V=256) → 8-bit RGBA ≈ (128, 128, 128, 255). + let (yp, up, vp) = solid_yuv422p_n_frame(16, 8, 256, 256, 256); + let up = up[..8 * 4].to_vec(); + let vp = vp[..8 * 4].to_vec(); + let src = Yuv420p9Frame::new(&yp, &up, &vp, 16, 8, 16, 8, 8); + + let mut rgba = std::vec![0u8; 16 * 8 * 4]; + let mut sink = MixedSinker::::new(16, 8) + .with_rgba(&mut rgba) + .unwrap(); + yuv420p9_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); + + for px in rgba.chunks(4) { + assert!(px[0].abs_diff(128) <= 1); + assert_eq!(px[0], px[1]); + assert_eq!(px[1], px[2]); + assert_eq!(px[3], 0xFF, "alpha must be opaque"); + } +} + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn yuv420p9_rgba_u16_only_native_depth_gray_with_opaque_alpha() { + // 9-bit mid-gray → u16 RGBA: each color element ≈ 256, alpha = 511. 
+ let (yp, up, vp) = solid_yuv422p_n_frame(16, 8, 256, 256, 256); + let up = up[..8 * 4].to_vec(); + let vp = vp[..8 * 4].to_vec(); + let src = Yuv420p9Frame::new(&yp, &up, &vp, 16, 8, 16, 8, 8); + + let mut rgba = std::vec![0u16; 16 * 8 * 4]; + let mut sink = MixedSinker::::new(16, 8) + .with_rgba_u16(&mut rgba) + .unwrap(); + yuv420p9_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); + + for px in rgba.chunks(4) { + assert!(px[0].abs_diff(256) <= 1, "got {px:?}"); + assert_eq!(px[0], px[1]); + assert_eq!(px[1], px[2]); + assert_eq!(px[3], 511, "alpha must equal (1 << 9) - 1"); + } +} + #[test] #[cfg_attr( miri, @@ -4829,6 +5219,54 @@ fn yuv422p9_gray_to_gray() { } } +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn yuv422p9_rgba_u8_only_gray_with_opaque_alpha() { + // 9-bit mid-gray → 8-bit RGBA ≈ (128, 128, 128, 255). + let (yp, up, vp) = solid_yuv422p_n_frame(16, 8, 256, 256, 256); + let src = Yuv422p9Frame::new(&yp, &up, &vp, 16, 8, 16, 8, 8); + + let mut rgba = std::vec![0u8; 16 * 8 * 4]; + let mut sink = MixedSinker::::new(16, 8) + .with_rgba(&mut rgba) + .unwrap(); + yuv422p9_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); + + for px in rgba.chunks(4) { + assert!(px[0].abs_diff(128) <= 1); + assert_eq!(px[0], px[1]); + assert_eq!(px[1], px[2]); + assert_eq!(px[3], 0xFF, "alpha must be opaque"); + } +} + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn yuv422p9_rgba_u16_only_native_depth_gray_with_opaque_alpha() { + // 9-bit mid-gray → u16 RGBA: each color element ≈ 256, alpha = 511. 
+ let (yp, up, vp) = solid_yuv422p_n_frame(16, 8, 256, 256, 256); + let src = Yuv422p9Frame::new(&yp, &up, &vp, 16, 8, 16, 8, 8); + + let mut rgba = std::vec![0u16; 16 * 8 * 4]; + let mut sink = MixedSinker::::new(16, 8) + .with_rgba_u16(&mut rgba) + .unwrap(); + yuv422p9_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); + + for px in rgba.chunks(4) { + assert!(px[0].abs_diff(256) <= 1, "got {px:?}"); + assert_eq!(px[0], px[1]); + assert_eq!(px[1], px[2]); + assert_eq!(px[3], 511, "alpha must equal (1 << 9) - 1"); + } +} + #[test] #[cfg_attr( miri, @@ -4851,6 +5289,54 @@ fn yuv444p9_gray_to_gray() { } } +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn yuv444p9_rgba_u8_only_gray_with_opaque_alpha() { + // 9-bit mid-gray → 8-bit RGBA ≈ (128, 128, 128, 255). + let (yp, up, vp) = solid_yuv444p_n_frame(16, 8, 256, 256, 256); + let src = Yuv444p9Frame::new(&yp, &up, &vp, 16, 8, 16, 16, 16); + + let mut rgba = std::vec![0u8; 16 * 8 * 4]; + let mut sink = MixedSinker::::new(16, 8) + .with_rgba(&mut rgba) + .unwrap(); + yuv444p9_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); + + for px in rgba.chunks(4) { + assert!(px[0].abs_diff(128) <= 1); + assert_eq!(px[0], px[1]); + assert_eq!(px[1], px[2]); + assert_eq!(px[3], 0xFF, "alpha must be opaque"); + } +} + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn yuv444p9_rgba_u16_only_native_depth_gray_with_opaque_alpha() { + // 9-bit mid-gray → u16 RGBA: each color element ≈ 256, alpha = 511. 
+ let (yp, up, vp) = solid_yuv444p_n_frame(16, 8, 256, 256, 256); + let src = Yuv444p9Frame::new(&yp, &up, &vp, 16, 8, 16, 16, 16); + + let mut rgba = std::vec![0u16; 16 * 8 * 4]; + let mut sink = MixedSinker::::new(16, 8) + .with_rgba_u16(&mut rgba) + .unwrap(); + yuv444p9_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); + + for px in rgba.chunks(4) { + assert!(px[0].abs_diff(256) <= 1, "got {px:?}"); + assert_eq!(px[0], px[1]); + assert_eq!(px[1], px[2]); + assert_eq!(px[3], 511, "alpha must equal (1 << 9) - 1"); + } +} + #[test] #[cfg_attr( miri, @@ -5039,6 +5525,55 @@ fn yuv440p12_gray_to_gray() { } } +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn yuv440p12_rgba_u8_only_gray_with_opaque_alpha() { + // 4:4:0 reuses the 4:4:4 dispatcher. 12-bit mid-gray → 8-bit RGBA + // ≈ (128, 128, 128, 255). + let (yp, up, vp) = solid_yuv440p_n_frame(16, 8, 2048, 2048, 2048); + let src = Yuv440p12Frame::new(&yp, &up, &vp, 16, 8, 16, 16, 16); + + let mut rgba = std::vec![0u8; 16 * 8 * 4]; + let mut sink = MixedSinker::::new(16, 8) + .with_rgba(&mut rgba) + .unwrap(); + yuv440p12_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); + + for px in rgba.chunks(4) { + assert!(px[0].abs_diff(128) <= 1); + assert_eq!(px[0], px[1]); + assert_eq!(px[1], px[2]); + assert_eq!(px[3], 0xFF, "alpha must be opaque"); + } +} + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn yuv440p12_rgba_u16_only_native_depth_gray_with_opaque_alpha() { + // 12-bit mid-gray → u16 RGBA: each color element ≈ 2048, alpha = 4095. 
+ let (yp, up, vp) = solid_yuv440p_n_frame(16, 8, 2048, 2048, 2048); + let src = Yuv440p12Frame::new(&yp, &up, &vp, 16, 8, 16, 16, 16); + + let mut rgba = std::vec![0u16; 16 * 8 * 4]; + let mut sink = MixedSinker::::new(16, 8) + .with_rgba_u16(&mut rgba) + .unwrap(); + yuv440p12_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); + + for px in rgba.chunks(4) { + assert!(px[0].abs_diff(2048) <= 1, "got {px:?}"); + assert_eq!(px[0], px[1]); + assert_eq!(px[1], px[2]); + assert_eq!(px[3], 4095, "alpha must equal (1 << 12) - 1"); + } +} + #[test] #[cfg_attr( miri, @@ -5288,18 +5823,239 @@ fn p212_gray_to_gray() { miri, ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" )] -fn p216_gray_to_gray_u16() { - let (yp, uvp) = solid_p2x0_frame(16, 8, 16, 32768, 32768, 32768); - let src = P216Frame::new(&yp, &uvp, 16, 8, 16, 16); +fn p212_rgba_u8_only_gray_with_opaque_alpha() { + // P212 mid-gray (12-bit values shifted into the high 12): Y/U/V = 2048 << 4. + // Output 8-bit RGBA ≈ (128, 128, 128, 255). + let (yp, uvp) = solid_p2x0_frame(16, 8, 12, 2048, 2048, 2048); + let src = P212Frame::new(&yp, &uvp, 16, 8, 16, 16); + + let mut rgba = std::vec![0u8; 16 * 8 * 4]; + let mut sink = MixedSinker::::new(16, 8) + .with_rgba(&mut rgba) + .unwrap(); + p212_to(&src, true, ColorMatrix::Bt709, &mut sink).unwrap(); + + for px in rgba.chunks(4) { + assert!(px[0].abs_diff(128) <= 1); + assert_eq!(px[0], px[1]); + assert_eq!(px[1], px[2]); + assert_eq!(px[3], 0xFF, "alpha must be opaque"); + } +} + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn p212_rgba_u16_only_native_depth_gray_with_opaque_alpha() { + // P212 mid-gray → u16 RGBA: each color element ≈ 2048 (low-bit-packed), + // alpha = (1 << 12) - 1 = 4095. 
+ let (yp, uvp) = solid_p2x0_frame(16, 8, 12, 2048, 2048, 2048); + let src = P212Frame::new(&yp, &uvp, 16, 8, 16, 16); + + let mut rgba = std::vec![0u16; 16 * 8 * 4]; + let mut sink = MixedSinker::::new(16, 8) + .with_rgba_u16(&mut rgba) + .unwrap(); + p212_to(&src, true, ColorMatrix::Bt709, &mut sink).unwrap(); + + for px in rgba.chunks(4) { + assert!(px[0].abs_diff(2048) <= 1, "got {px:?}"); + assert_eq!(px[0], px[1]); + assert_eq!(px[1], px[2]); + assert_eq!(px[3], 4095, "alpha must equal (1 << 12) - 1"); + } +} + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn p216_gray_to_gray_u16() { + let (yp, uvp) = solid_p2x0_frame(16, 8, 16, 32768, 32768, 32768); + let src = P216Frame::new(&yp, &uvp, 16, 8, 16, 16); + + let mut rgb_u8 = std::vec![0u8; 16 * 8 * 3]; + let mut rgb_u16 = std::vec![0u16; 16 * 8 * 3]; + let mut sink = MixedSinker::::new(16, 8) + .with_rgb(&mut rgb_u8) + .unwrap() + .with_rgb_u16(&mut rgb_u16) + .unwrap(); + p216_to(&src, true, ColorMatrix::Bt709, &mut sink).unwrap(); + + for px in rgb_u8.chunks(3) { + assert!(px[0].abs_diff(128) <= 1); + assert_eq!(px[0], px[1]); + assert_eq!(px[1], px[2]); + } + for px in rgb_u16.chunks(3) { + assert!(px[0].abs_diff(32768) <= 256); + assert_eq!(px[0], px[1]); + assert_eq!(px[1], px[2]); + } +} + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn p216_rgba_u8_only_gray_with_opaque_alpha() { + // P216 mid-gray (16-bit, no shift): Y/U/V = 32768. Output 8-bit RGBA + // ≈ (128, 128, 128, 255). 
+ let (yp, uvp) = solid_p2x0_frame(16, 8, 16, 32768, 32768, 32768); + let src = P216Frame::new(&yp, &uvp, 16, 8, 16, 16); + + let mut rgba = std::vec![0u8; 16 * 8 * 4]; + let mut sink = MixedSinker::::new(16, 8) + .with_rgba(&mut rgba) + .unwrap(); + p216_to(&src, true, ColorMatrix::Bt709, &mut sink).unwrap(); + + for px in rgba.chunks(4) { + assert!(px[0].abs_diff(128) <= 1); + assert_eq!(px[0], px[1]); + assert_eq!(px[1], px[2]); + assert_eq!(px[3], 0xFF, "alpha must be opaque"); + } +} + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn p216_rgba_u16_only_native_depth_gray_with_opaque_alpha() { + // 16-bit mid-gray → u16 RGBA: each color element ≈ 32768, alpha = 0xFFFF. + // Covers the 16-bit dedicated kernel family (no Q15 downshift). + let (yp, uvp) = solid_p2x0_frame(16, 8, 16, 32768, 32768, 32768); + let src = P216Frame::new(&yp, &uvp, 16, 8, 16, 16); + + let mut rgba = std::vec![0u16; 16 * 8 * 4]; + let mut sink = MixedSinker::::new(16, 8) + .with_rgba_u16(&mut rgba) + .unwrap(); + p216_to(&src, true, ColorMatrix::Bt709, &mut sink).unwrap(); + + for px in rgba.chunks(4) { + assert!(px[0].abs_diff(32768) <= 256, "got {px:?}"); + assert_eq!(px[0], px[1]); + assert_eq!(px[1], px[2]); + assert_eq!(px[3], 0xFFFF, "alpha must equal 0xFFFF"); + } +} + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn p410_gray_to_gray() { + // 4:4:4: uv_stride = 2 * width = 32 (16 pairs × 2 elements). 
+ let (yp, uvp) = solid_p4x0_frame(16, 8, 10, 512, 512, 512); + let src = P410Frame::new(&yp, &uvp, 16, 8, 16, 32); + + let mut rgb = std::vec![0u8; 16 * 8 * 3]; + let mut sink = MixedSinker::::new(16, 8).with_rgb(&mut rgb).unwrap(); + p410_to(&src, true, ColorMatrix::Bt709, &mut sink).unwrap(); + + for px in rgb.chunks(3) { + assert!(px[0].abs_diff(128) <= 1); + assert_eq!(px[0], px[1]); + assert_eq!(px[1], px[2]); + } +} + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn p412_gray_to_gray() { + let (yp, uvp) = solid_p4x0_frame(16, 8, 12, 2048, 2048, 2048); + let src = P412Frame::new(&yp, &uvp, 16, 8, 16, 32); + + let mut rgb = std::vec![0u8; 16 * 8 * 3]; + let mut sink = MixedSinker::::new(16, 8).with_rgb(&mut rgb).unwrap(); + p412_to(&src, true, ColorMatrix::Bt709, &mut sink).unwrap(); + + for px in rgb.chunks(3) { + assert!(px[0].abs_diff(128) <= 1); + assert_eq!(px[0], px[1]); + assert_eq!(px[1], px[2]); + } +} + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn p412_rgba_u8_only_gray_with_opaque_alpha() { + // P412 mid-gray (12-bit values shifted into the high 12): Y/U/V = 2048 << 4. + // Output 8-bit RGBA ≈ (128, 128, 128, 255). 
+    let (yp, uvp) = solid_p4x0_frame(16, 8, 12, 2048, 2048, 2048);
+    let src = P412Frame::new(&yp, &uvp, 16, 8, 16, 32);
+
+    let mut rgba = std::vec![0u8; 16 * 8 * 4];
+    let mut sink = MixedSinker::new(16, 8)
+        .with_rgba(&mut rgba)
+        .unwrap();
+    p412_to(&src, true, ColorMatrix::Bt709, &mut sink).unwrap();
+
+    for px in rgba.chunks(4) {
+        assert!(px[0].abs_diff(128) <= 1);
+        assert_eq!(px[0], px[1]);
+        assert_eq!(px[1], px[2]);
+        assert_eq!(px[3], 0xFF, "alpha must be opaque");
+    }
+}
+
+#[test]
+#[cfg_attr(
+    miri,
+    ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri"
+)]
+fn p412_rgba_u16_only_native_depth_gray_with_opaque_alpha() {
+    // P412 mid-gray → u16 RGBA: each color element ≈ 2048 (low-bit-packed),
+    // alpha = (1 << 12) - 1 = 4095.
+    let (yp, uvp) = solid_p4x0_frame(16, 8, 12, 2048, 2048, 2048);
+    let src = P412Frame::new(&yp, &uvp, 16, 8, 16, 32);
+
+    let mut rgba = std::vec![0u16; 16 * 8 * 4];
+    let mut sink = MixedSinker::new(16, 8)
+        .with_rgba_u16(&mut rgba)
+        .unwrap();
+    p412_to(&src, true, ColorMatrix::Bt709, &mut sink).unwrap();
+
+    for px in rgba.chunks(4) {
+        assert!(px[0].abs_diff(2048) <= 1, "got {px:?}");
+        assert_eq!(px[0], px[1]);
+        assert_eq!(px[1], px[2]);
+        assert_eq!(px[3], 4095, "alpha must equal (1 << 12) - 1");
+    }
+}
+
+#[test]
+#[cfg_attr(
+    miri,
+    ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri"
+)]
+fn p416_gray_to_gray_u16() {
+    let (yp, uvp) = solid_p4x0_frame(16, 8, 16, 32768, 32768, 32768);
+    let src = P416Frame::new(&yp, &uvp, 16, 8, 16, 32);
     let mut rgb_u8 = std::vec![0u8; 16 * 8 * 3];
     let mut rgb_u16 = std::vec![0u16; 16 * 8 * 3];
-    let mut sink = MixedSinker::new(16, 8)
+    let mut sink = MixedSinker::new(16, 8)
         .with_rgb(&mut rgb_u8)
         .unwrap()
         .with_rgb_u16(&mut rgb_u16)
         .unwrap();
-    p216_to(&src, true, ColorMatrix::Bt709, &mut sink).unwrap();
+    p416_to(&src, true, ColorMatrix::Bt709, &mut sink).unwrap();
     for px in rgb_u8.chunks(3) {
         assert!(px[0].abs_diff(128) <= 1);
@@ -5318,39 +6074,23 @@ fn p216_gray_to_gray_u16() {
     miri,
     ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri"
 )]
-fn p410_gray_to_gray() {
-    // 4:4:4: uv_stride = 2 * width = 32 (16 pairs × 2 elements).
-    let (yp, uvp) = solid_p4x0_frame(16, 8, 10, 512, 512, 512);
-    let src = P410Frame::new(&yp, &uvp, 16, 8, 16, 32);
-
-    let mut rgb = std::vec![0u8; 16 * 8 * 3];
-    let mut sink = MixedSinker::new(16, 8).with_rgb(&mut rgb).unwrap();
-    p410_to(&src, true, ColorMatrix::Bt709, &mut sink).unwrap();
-
-    for px in rgb.chunks(3) {
-        assert!(px[0].abs_diff(128) <= 1);
-        assert_eq!(px[0], px[1]);
-        assert_eq!(px[1], px[2]);
-    }
-}
-
-#[test]
-#[cfg_attr(
-    miri,
-    ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri"
-)]
-fn p412_gray_to_gray() {
-    let (yp, uvp) = solid_p4x0_frame(16, 8, 12, 2048, 2048, 2048);
-    let src = P412Frame::new(&yp, &uvp, 16, 8, 16, 32);
+fn p416_rgba_u8_only_gray_with_opaque_alpha() {
+    // P416 mid-gray (16-bit, no shift): Y/U/V = 32768. Output 8-bit RGBA
+    // ≈ (128, 128, 128, 255).
+    let (yp, uvp) = solid_p4x0_frame(16, 8, 16, 32768, 32768, 32768);
+    let src = P416Frame::new(&yp, &uvp, 16, 8, 16, 32);
-    let mut rgb = std::vec![0u8; 16 * 8 * 3];
-    let mut sink = MixedSinker::new(16, 8).with_rgb(&mut rgb).unwrap();
-    p412_to(&src, true, ColorMatrix::Bt709, &mut sink).unwrap();
+    let mut rgba = std::vec![0u8; 16 * 8 * 4];
+    let mut sink = MixedSinker::new(16, 8)
+        .with_rgba(&mut rgba)
+        .unwrap();
+    p416_to(&src, true, ColorMatrix::Bt709, &mut sink).unwrap();
-    for px in rgb.chunks(3) {
+    for px in rgba.chunks(4) {
         assert!(px[0].abs_diff(128) <= 1);
         assert_eq!(px[0], px[1]);
         assert_eq!(px[1], px[2]);
+        assert_eq!(px[3], 0xFF, "alpha must be opaque");
     }
 }
@@ -5359,28 +6099,23 @@ fn p412_gray_to_gray() {
     miri,
     ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri"
 )]
-fn p416_gray_to_gray_u16() {
+fn p416_rgba_u16_only_native_depth_gray_with_opaque_alpha() {
+    // 16-bit mid-gray → u16 RGBA: each color element ≈ 32768, alpha = 0xFFFF.
+    // Covers the 16-bit dedicated kernel family (no Q15 downshift).
     let (yp, uvp) = solid_p4x0_frame(16, 8, 16, 32768, 32768, 32768);
     let src = P416Frame::new(&yp, &uvp, 16, 8, 16, 32);
-    let mut rgb_u8 = std::vec![0u8; 16 * 8 * 3];
-    let mut rgb_u16 = std::vec![0u16; 16 * 8 * 3];
+    let mut rgba = std::vec![0u16; 16 * 8 * 4];
     let mut sink = MixedSinker::new(16, 8)
-        .with_rgb(&mut rgb_u8)
-        .unwrap()
-        .with_rgb_u16(&mut rgb_u16)
+        .with_rgba_u16(&mut rgba)
         .unwrap();
     p416_to(&src, true, ColorMatrix::Bt709, &mut sink).unwrap();
-    for px in rgb_u8.chunks(3) {
-        assert!(px[0].abs_diff(128) <= 1);
-        assert_eq!(px[0], px[1]);
-        assert_eq!(px[1], px[2]);
-    }
-    for px in rgb_u16.chunks(3) {
-        assert!(px[0].abs_diff(32768) <= 256);
+    for px in rgba.chunks(4) {
+        assert!(px[0].abs_diff(32768) <= 256, "got {px:?}");
         assert_eq!(px[0], px[1]);
         assert_eq!(px[1], px[2]);
+        assert_eq!(px[3], 0xFFFF, "alpha must equal 0xFFFF");
     }
 }
@@ -6421,3 +7156,233 @@ fn yuv422p16_rgba_u16_only_native_depth_gray_with_opaque_alpha() {
         assert_eq!(px[3], 0xFFFF, "alpha must equal 0xFFFF");
     }
 }
+
+// ===== Ship 8 Tranche 7c — high-bit 4:4:4 RGBA sinker tests ==========
+//
+// Mirrors PR #26's 4:2:0 coverage scope: representative formats only,
+// not exhaustive per-format. Yuv444p10 covers the BITS-generic planar
+// path; P410 covers the Pn semi-planar path; Yuv444p16 covers the
+// 16-bit dedicated kernel; Yuv440p10 covers the 4:4:0 kernel-reuse
+// path. The remaining 4:4:4 high-bit formats (9/12/14, P412/P416,
+// Yuv440p12) are exercised by row-layer tests + the compile-time
+// guarantee that the new sinker builders typecheck.
+
+#[test]
+#[cfg_attr(
+    miri,
+    ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri"
+)]
+fn yuv444p10_rgba_u8_only_gray_with_opaque_alpha() {
+    // 10-bit mid-gray (Y=512, U=512, V=512) → 8-bit RGBA ≈ (128, 128, 128, 255).
+    let (yp, up, vp) = solid_yuv444p_n_frame(16, 8, 512, 512, 512);
+    let src = Yuv444p10Frame::new(&yp, &up, &vp, 16, 8, 16, 16, 16);
+
+    let mut rgba = std::vec![0u8; 16 * 8 * 4];
+    let mut sink = MixedSinker::new(16, 8)
+        .with_rgba(&mut rgba)
+        .unwrap();
+    yuv444p10_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap();
+
+    for px in rgba.chunks(4) {
+        assert!(px[0].abs_diff(128) <= 1);
+        assert_eq!(px[0], px[1]);
+        assert_eq!(px[1], px[2]);
+        assert_eq!(px[3], 0xFF, "alpha must be opaque");
+    }
+}
+
+#[test]
+#[cfg_attr(
+    miri,
+    ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri"
+)]
+fn yuv444p10_rgba_u16_only_native_depth_gray_with_opaque_alpha() {
+    // 10-bit mid-gray → u16 RGBA: each color element ≈ 512, alpha = 1023.
+    let (yp, up, vp) = solid_yuv444p_n_frame(16, 8, 512, 512, 512);
+    let src = Yuv444p10Frame::new(&yp, &up, &vp, 16, 8, 16, 16, 16);
+
+    let mut rgba = std::vec![0u16; 16 * 8 * 4];
+    let mut sink = MixedSinker::new(16, 8)
+        .with_rgba_u16(&mut rgba)
+        .unwrap();
+    yuv444p10_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap();
+
+    for px in rgba.chunks(4) {
+        assert!(px[0].abs_diff(512) <= 1, "got {px:?}");
+        assert_eq!(px[0], px[1]);
+        assert_eq!(px[1], px[2]);
+        assert_eq!(px[3], 1023, "alpha must equal (1 << 10) - 1");
+    }
+}
+
+#[test]
+#[cfg_attr(
+    miri,
+    ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri"
+)]
+fn yuv444p10_with_rgb_and_with_rgba_produce_byte_identical_rgb_bytes() {
+    // Strategy A on the u8 path: rgb buffer populated by the RGB kernel,
+    // rgba buffer populated via the cheap expand_rgb_to_rgba_row pass.
+    // RGB triples must be byte-identical to the standalone RGB-only run.
+    let (yp, up, vp) = solid_yuv444p_n_frame(64, 16, 600, 400, 700);
+    let src = Yuv444p10Frame::new(&yp, &up, &vp, 64, 16, 64, 64, 64);
+
+    let mut rgb_solo = std::vec![0u8; 64 * 16 * 3];
+    let mut s_solo = MixedSinker::new(64, 16)
+        .with_rgb(&mut rgb_solo)
+        .unwrap();
+    yuv444p10_to(&src, true, ColorMatrix::Bt709, &mut s_solo).unwrap();
+
+    let mut rgb_combined = std::vec![0u8; 64 * 16 * 3];
+    let mut rgba = std::vec![0u8; 64 * 16 * 4];
+    let mut s_combined = MixedSinker::new(64, 16)
+        .with_rgb(&mut rgb_combined)
+        .unwrap()
+        .with_rgba(&mut rgba)
+        .unwrap();
+    yuv444p10_to(&src, true, ColorMatrix::Bt709, &mut s_combined).unwrap();
+
+    assert_eq!(rgb_solo, rgb_combined, "RGB bytes must match across runs");
+    for (rgb_px, rgba_px) in rgb_combined.chunks(3).zip(rgba.chunks(4)) {
+        assert_eq!(rgb_px[0], rgba_px[0]);
+        assert_eq!(rgb_px[1], rgba_px[1]);
+        assert_eq!(rgb_px[2], rgba_px[2]);
+        assert_eq!(rgba_px[3], 0xFF);
+    }
+}
+
+#[test]
+#[cfg_attr(
+    miri,
+    ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri"
+)]
+fn yuv444p10_with_rgb_u16_and_with_rgba_u16_produce_byte_identical_rgb_elems() {
+    // Strategy A on the u16 path: rgb_u16 buffer populated by the u16 RGB
+    // kernel, rgba_u16 fanned out via expand_rgb_u16_to_rgba_u16_row<10>.
+    let (yp, up, vp) = solid_yuv444p_n_frame(64, 16, 600, 400, 700);
+    let src = Yuv444p10Frame::new(&yp, &up, &vp, 64, 16, 64, 64, 64);
+
+    let mut rgb_solo = std::vec![0u16; 64 * 16 * 3];
+    let mut s_solo = MixedSinker::new(64, 16)
+        .with_rgb_u16(&mut rgb_solo)
+        .unwrap();
+    yuv444p10_to(&src, true, ColorMatrix::Bt709, &mut s_solo).unwrap();
+
+    let mut rgb_combined = std::vec![0u16; 64 * 16 * 3];
+    let mut rgba = std::vec![0u16; 64 * 16 * 4];
+    let mut s_combined = MixedSinker::new(64, 16)
+        .with_rgb_u16(&mut rgb_combined)
+        .unwrap()
+        .with_rgba_u16(&mut rgba)
+        .unwrap();
+    yuv444p10_to(&src, true, ColorMatrix::Bt709, &mut s_combined).unwrap();
+
+    assert_eq!(
+        rgb_solo, rgb_combined,
+        "RGB u16 elements must match across runs"
+    );
+    for (rgb_px, rgba_px) in rgb_combined.chunks(3).zip(rgba.chunks(4)) {
+        assert_eq!(rgb_px[0], rgba_px[0]);
+        assert_eq!(rgb_px[1], rgba_px[1]);
+        assert_eq!(rgb_px[2], rgba_px[2]);
+        assert_eq!(rgba_px[3], 1023, "alpha = (1 << 10) - 1");
+    }
+}
+
+#[test]
+fn yuv444p10_rgba_too_short_returns_err() {
+    let mut rgba = std::vec![0u8; 10];
+    let err = MixedSinker::new(16, 8)
+        .with_rgba(&mut rgba)
+        .err()
+        .expect("expected RgbaBufferTooShort");
+    assert!(matches!(err, MixedSinkerError::RgbaBufferTooShort { .. }));
+}
+
+#[test]
+fn yuv444p10_rgba_u16_too_short_returns_err() {
+    let mut rgba = std::vec![0u16; 10];
+    let err = MixedSinker::new(16, 8)
+        .with_rgba_u16(&mut rgba)
+        .err()
+        .expect("expected RgbaU16BufferTooShort");
+    assert!(matches!(
+        err,
+        MixedSinkerError::RgbaU16BufferTooShort { .. }
+    ));
+}
+
+#[test]
+#[cfg_attr(
+    miri,
+    ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri"
+)]
+fn p410_rgba_u16_only_native_depth_gray_with_opaque_alpha() {
+    // P410 (semi-planar 10-bit): mid-gray (high-bit-packed = 512 << 6).
+    // u16 RGBA output ≈ 512, alpha = 1023.
+    let (yp, uvp) = solid_p4x0_frame(16, 8, 10, 512, 512, 512);
+    let src = P410Frame::new(&yp, &uvp, 16, 8, 16, 32);
+
+    let mut rgba = std::vec![0u16; 16 * 8 * 4];
+    let mut sink = MixedSinker::new(16, 8)
+        .with_rgba_u16(&mut rgba)
+        .unwrap();
+    p410_to(&src, true, ColorMatrix::Bt709, &mut sink).unwrap();
+
+    for px in rgba.chunks(4) {
+        assert!(px[0].abs_diff(512) <= 1, "got {px:?}");
+        assert_eq!(px[0], px[1]);
+        assert_eq!(px[1], px[2]);
+        assert_eq!(px[3], 1023, "alpha must equal (1 << 10) - 1");
+    }
+}
+
+#[test]
+#[cfg_attr(
+    miri,
+    ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri"
+)]
+fn yuv444p16_rgba_u16_only_native_depth_gray_with_opaque_alpha() {
+    // 16-bit mid-gray → u16 RGBA: each color element ≈ 32768, alpha = 0xFFFF.
+    // Covers the 16-bit dedicated kernel family (no Q15 downshift).
+    let (yp, up, vp) = solid_yuv444p_n_frame(16, 8, 32768, 32768, 32768);
+    let src = Yuv444p16Frame::new(&yp, &up, &vp, 16, 8, 16, 16, 16);
+
+    let mut rgba = std::vec![0u16; 16 * 8 * 4];
+    let mut sink = MixedSinker::new(16, 8)
+        .with_rgba_u16(&mut rgba)
+        .unwrap();
+    yuv444p16_to(&src, true, ColorMatrix::Bt709, &mut sink).unwrap();
+
+    for px in rgba.chunks(4) {
+        assert!(px[0].abs_diff(32768) <= 256, "got {px:?}");
+        assert_eq!(px[0], px[1]);
+        assert_eq!(px[1], px[2]);
+        assert_eq!(px[3], 0xFFFF, "alpha must equal 0xFFFF");
+    }
+}
+
+#[test]
+#[cfg_attr(
+    miri,
+    ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri"
+)]
+fn yuv440p10_rgba_u16_only_native_depth_gray_with_opaque_alpha() {
+    // 4:4:0 reuses the 4:4:4 dispatcher. Confirms the kernel-reuse path
+    // wires through correctly at the sinker boundary.
+    let (yp, up, vp) = solid_yuv440p_n_frame(16, 8, 512, 512, 512);
+    let src = Yuv440p10Frame::new(&yp, &up, &vp, 16, 8, 16, 16, 16);
+
+    let mut rgba = std::vec![0u16; 16 * 8 * 4];
+    let mut sink = MixedSinker::new(16, 8)
+        .with_rgba_u16(&mut rgba)
+        .unwrap();
+    yuv440p10_to(&src, true, ColorMatrix::Bt709, &mut sink).unwrap();
+
+    for px in rgba.chunks(4) {
+        assert!(px[0].abs_diff(512) <= 1, "got {px:?}");
+        assert_eq!(px[0], px[1]);
+        assert_eq!(px[1], px[2]);
+        assert_eq!(px[3], 1023, "alpha must equal (1 << 10) - 1");
+    }
+}