From 0798cf2b1cbdb30b6574e36da6f028550b9e8c4c Mon Sep 17 00:00:00 2001 From: uqio <276879906+uqio@users.noreply.github.com> Date: Mon, 27 Apr 2026 01:00:37 +1200 Subject: [PATCH 1/3] update --- src/row/arch/neon.rs | 388 ++++++++++++++++---- src/row/arch/neon/tests.rs | 188 ++++++++++ src/row/arch/wasm_simd128.rs | 378 +++++++++++++++++--- src/row/arch/wasm_simd128/tests.rs | 180 ++++++++++ src/row/arch/x86_avx2.rs | 557 +++++++++++++++++++++++------ src/row/arch/x86_avx2/tests.rs | 194 ++++++++++ src/row/arch/x86_avx512.rs | 458 ++++++++++++++++++++---- src/row/arch/x86_avx512/tests.rs | 198 ++++++++++ src/row/arch/x86_common.rs | 36 ++ src/row/arch/x86_sse41.rs | 356 +++++++++++++++--- src/row/arch/x86_sse41/tests.rs | 198 ++++++++++ src/row/mod.rs | 457 ++++++++++++++++++++--- 12 files changed, 3205 insertions(+), 383 deletions(-) diff --git a/src/row/arch/neon.rs b/src/row/arch/neon.rs index 917c64b..9114045 100644 --- a/src/row/arch/neon.rs +++ b/src/row/arch/neon.rs @@ -519,13 +519,70 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_u16_row( width: usize, matrix: ColorMatrix, full_range: bool, +) { + unsafe { + yuv_420p_n_to_rgb_or_rgba_u16_row::( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } +} + +/// NEON sibling of [`yuv_420p_n_to_rgba_row`] for native-depth `u16` +/// output. Alpha samples are `(1 << BITS) - 1` (opaque maximum at the +/// input bit depth) — matches `scalar::yuv_420p_n_to_rgba_u16_row`. +/// +/// # Safety +/// +/// Same as [`yuv_420p_n_to_rgb_u16_row`], plus +/// `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "neon")] +pub(crate) unsafe fn yuv_420p_n_to_rgba_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + unsafe { + yuv_420p_n_to_rgb_or_rgba_u16_row::( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } +} + +/// Shared NEON high-bit YUV 4:2:0 → native-depth `u16` kernel. +/// `ALPHA = false` writes RGB triples via `vst3q_u16`; `ALPHA = true` +/// writes RGBA quads via `vst4q_u16` with constant alpha +/// `(1 << BITS) - 1`. +/// +/// # Safety +/// +/// 1. **NEON must be available.** +/// 2. `width & 1 == 0`. +/// 3. `y.len() >= width`, `u_half.len() >= width / 2`, +/// `v_half.len() >= width / 2`, `out.len() >= width * if ALPHA { 4 } else { 3 }`. +/// 4. `BITS` ∈ `{9, 10, 12, 14}`. 
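+///
+/// A minimal call sketch (hypothetical buffers; NEON availability is
+/// assumed to have been verified by the caller, e.g. via
+/// `std::arch::is_aarch64_feature_detected!("neon")`):
+///
+/// ```ignore
+/// // 10-bit planar 4:2:0 → RGBA: alpha = (1 << 10) - 1 = 1023.
+/// // SAFETY: NEON verified, width is even, and every slice-length
+/// // precondition above holds.
+/// unsafe {
+///     yuv_420p_n_to_rgb_or_rgba_u16_row::<10, true>(
+///         &y, &u, &v, &mut rgba, width, ColorMatrix::Bt709, false,
+///     );
+/// }
+/// ```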
+#[inline] +#[target_feature(enable = "neon")] +pub(crate) unsafe fn yuv_420p_n_to_rgb_or_rgba_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, ) { const { assert!(BITS == 9 || BITS == 10 || BITS == 12 || BITS == 14) }; + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert_eq!(width & 1, 0); debug_assert!(y.len() >= width); debug_assert!(u_half.len() >= width / 2); debug_assert!(v_half.len() >= width / 2); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); @@ -552,6 +609,7 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_u16_row( let cgv = vdupq_n_s32(coeffs.g_v()); let cbu = vdupq_n_s32(coeffs.b_u()); let cbv = vdupq_n_s32(coeffs.b_v()); + let alpha_u16 = vdupq_n_u16(out_max as u16); let mut x = 0usize; while x + 16 <= width { @@ -605,25 +663,37 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_u16_row( let b_lo = clamp_u16_max(vqaddq_s16(y_scaled_lo, b_dup_lo), zero_v, max_v); let b_hi = clamp_u16_max(vqaddq_s16(y_scaled_hi, b_dup_hi), zero_v, max_v); - // Two interleaved u16 writes — each `vst3q_u16` covers 8 pixels. - let rgb_lo = uint16x8x3_t(r_lo, g_lo, b_lo); - let rgb_hi = uint16x8x3_t(r_hi, g_hi, b_hi); - vst3q_u16(rgb_out.as_mut_ptr().add(x * 3), rgb_lo); - vst3q_u16(rgb_out.as_mut_ptr().add(x * 3 + 24), rgb_hi); + if ALPHA { + let rgba_lo = uint16x8x4_t(r_lo, g_lo, b_lo, alpha_u16); + let rgba_hi = uint16x8x4_t(r_hi, g_hi, b_hi, alpha_u16); + vst4q_u16(out.as_mut_ptr().add(x * 4), rgba_lo); + vst4q_u16(out.as_mut_ptr().add(x * 4 + 32), rgba_hi); + } else { + // Two interleaved u16 writes — each `vst3q_u16` covers 8 pixels. + let rgb_lo = uint16x8x3_t(r_lo, g_lo, b_lo); + let rgb_hi = uint16x8x3_t(r_hi, g_hi, b_hi); + vst3q_u16(out.as_mut_ptr().add(x * 3), rgb_lo); + vst3q_u16(out.as_mut_ptr().add(x * 3 + 24), rgb_hi); + } x += 16; } if x < width { - scalar::yuv_420p_n_to_rgb_u16_row::( - &y[x..width], - &u_half[x / 2..width / 2], - &v_half[x / 2..width / 2], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_u = &u_half[x / 2..width / 2]; + let tail_v = &v_half[x / 2..width / 2]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::yuv_420p_n_to_rgba_u16_row::( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); + } else { + scalar::yuv_420p_n_to_rgb_u16_row::( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); + } } } } @@ -1125,10 +1195,62 @@ pub(crate) unsafe fn p_n_to_rgb_u16_row( matrix: ColorMatrix, full_range: bool, ) { + unsafe { + p_n_to_rgb_or_rgba_u16_row::(y, uv_half, rgb_out, width, matrix, full_range); + } +} + +/// NEON sibling of [`p_n_to_rgba_row`] for native-depth `u16` output. +/// Alpha samples are `(1 << BITS) - 1` (opaque maximum at the input +/// bit depth) — matches `scalar::p_n_to_rgba_u16_row`. P016 has its +/// own kernel family — never routed here. +/// +/// # Safety +/// +/// Same as [`p_n_to_rgb_u16_row`], plus `rgba_out.len() >= 4 * width`. 
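+///
+/// Call sketch (hypothetical P010 row, chroma interleaved and packed
+/// in the high bits of each `u16`; NEON availability checked by the
+/// caller):
+///
+/// ```ignore
+/// // SAFETY: width is even; `y.len() >= width`, `uv.len() >= width`,
+/// // `rgba.len() >= 4 * width`.
+/// unsafe {
+///     p_n_to_rgba_u16_row::<10>(&y, &uv, &mut rgba, width,
+///                               ColorMatrix::Bt2020Ncl, false);
+/// }
+/// ```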
+#[inline] +#[target_feature(enable = "neon")] +pub(crate) unsafe fn p_n_to_rgba_u16_row( + y: &[u16], + uv_half: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + unsafe { + p_n_to_rgb_or_rgba_u16_row::(y, uv_half, rgba_out, width, matrix, full_range); + } +} + +/// Shared NEON Pn → native-depth `u16` kernel. `ALPHA = false` writes +/// RGB triples via `vst3q_u16`; `ALPHA = true` writes RGBA quads via +/// `vst4q_u16` with constant alpha `(1 << BITS) - 1`. P016 has its +/// own kernel family — never routed here. +/// +/// # Safety +/// +/// 1. **NEON must be available on the current CPU.** +/// 2. `width & 1 == 0`. +/// 3. `y.len() >= width`, `uv_half.len() >= width`, +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. +/// 4. `BITS` ∈ `{10, 12}`. +#[inline] +#[target_feature(enable = "neon")] +pub(crate) unsafe fn p_n_to_rgb_or_rgba_u16_row( + y: &[u16], + uv_half: &[u16], + out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + const { assert!(BITS == 10 || BITS == 12) }; + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert_eq!(width & 1, 0); debug_assert!(y.len() >= width); debug_assert!(uv_half.len() >= width); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); @@ -1152,6 +1274,7 @@ pub(crate) unsafe fn p_n_to_rgb_u16_row( let cgv = vdupq_n_s32(coeffs.g_v()); let cbu = vdupq_n_s32(coeffs.b_u()); let cbv = vdupq_n_s32(coeffs.b_v()); + let alpha_u16 = vdupq_n_u16(out_max as u16); let mut x = 0usize; while x + 16 <= width { @@ -1198,23 +1321,31 @@ pub(crate) unsafe fn p_n_to_rgb_u16_row( let b_lo = clamp_u16_max(vqaddq_s16(y_scaled_lo, b_dup_lo), zero_v, max_v); let b_hi = clamp_u16_max(vqaddq_s16(y_scaled_hi, b_dup_hi), zero_v, max_v); - let rgb_lo = uint16x8x3_t(r_lo, g_lo, b_lo); - let rgb_hi = uint16x8x3_t(r_hi, g_hi, b_hi); - vst3q_u16(rgb_out.as_mut_ptr().add(x * 3), rgb_lo); - vst3q_u16(rgb_out.as_mut_ptr().add(x * 3 + 24), rgb_hi); + if ALPHA { + let rgba_lo = uint16x8x4_t(r_lo, g_lo, b_lo, alpha_u16); + let rgba_hi = uint16x8x4_t(r_hi, g_hi, b_hi, alpha_u16); + vst4q_u16(out.as_mut_ptr().add(x * 4), rgba_lo); + vst4q_u16(out.as_mut_ptr().add(x * 4 + 32), rgba_hi); + } else { + let rgb_lo = uint16x8x3_t(r_lo, g_lo, b_lo); + let rgb_hi = uint16x8x3_t(r_hi, g_hi, b_hi); + vst3q_u16(out.as_mut_ptr().add(x * 3), rgb_lo); + vst3q_u16(out.as_mut_ptr().add(x * 3 + 24), rgb_hi); + } x += 16; } if x < width { - scalar::p_n_to_rgb_u16_row::( - &y[x..width], - &uv_half[x..width], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_uv = &uv_half[x..width]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::p_n_to_rgba_u16_row::(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + } else { + scalar::p_n_to_rgb_u16_row::(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + } } } } @@ -2233,11 +2364,65 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_u16_row( matrix: ColorMatrix, full_range: bool, ) { + unsafe { + yuv_420p16_to_rgb_or_rgba_u16_row::( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } +} + +/// NEON sibling of [`yuv_420p16_to_rgba_row`] for native-depth `u16` +/// output. Alpha is `0xFFFF` — matches `scalar::yuv_420p16_to_rgba_u16_row`. 
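+/// Per 16-pixel block the kernel stores either 48 `u16` (two
+/// `vst3q_u16`) or 64 `u16` (two `vst4q_u16`).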
+/// +/// # Safety +/// +/// Same as [`yuv_420p16_to_rgb_u16_row`] plus `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "neon")] +pub(crate) unsafe fn yuv_420p16_to_rgba_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + unsafe { + yuv_420p16_to_rgb_or_rgba_u16_row::( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } +} + +/// Shared NEON 16-bit YUV 4:2:0 → native-depth `u16` kernel. +/// `ALPHA = false` writes RGB triples via `vst3q_u16`; `ALPHA = true` +/// writes RGBA quads via `vst4q_u16` with constant alpha `0xFFFF`. +/// +/// # Safety +/// +/// 1. NEON must be available. +/// 2. `width & 1 == 0`. +/// 3. `y.len() >= width`, `u_half.len() >= width / 2`, +/// `v_half.len() >= width / 2`, +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. +#[inline] +#[target_feature(enable = "neon")] +pub(crate) unsafe fn yuv_420p16_to_rgb_or_rgba_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert_eq!(width & 1, 0); debug_assert!(y.len() >= width); debug_assert!(u_half.len() >= width / 2); debug_assert!(v_half.len() >= width / 2); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::<16, 16>(full_range); @@ -2245,6 +2430,7 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_u16_row( const RND: i32 = 1 << 14; unsafe { + let alpha_u16 = vdupq_n_u16(0xFFFF); let rnd_v = vdupq_n_s32(RND); let rnd64 = vdupq_n_s64(RND as i64); let y_off_v = vdupq_n_s32(y_off); @@ -2345,27 +2531,43 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_u16_row( vqmovun_s32(vaddq_s32(ys_hi_1, b_cd_hi1)), ); - vst3q_u16( - rgb_out.as_mut_ptr().add(x * 3), - uint16x8x3_t(r_lo_u16, g_lo_u16, b_lo_u16), - ); - vst3q_u16( - rgb_out.as_mut_ptr().add(x * 3 + 24), - uint16x8x3_t(r_hi_u16, g_hi_u16, b_hi_u16), - ); + if ALPHA { + vst4q_u16( + out.as_mut_ptr().add(x * 4), + uint16x8x4_t(r_lo_u16, g_lo_u16, b_lo_u16, alpha_u16), + ); + vst4q_u16( + out.as_mut_ptr().add(x * 4 + 32), + uint16x8x4_t(r_hi_u16, g_hi_u16, b_hi_u16, alpha_u16), + ); + } else { + vst3q_u16( + out.as_mut_ptr().add(x * 3), + uint16x8x3_t(r_lo_u16, g_lo_u16, b_lo_u16), + ); + vst3q_u16( + out.as_mut_ptr().add(x * 3 + 24), + uint16x8x3_t(r_hi_u16, g_hi_u16, b_hi_u16), + ); + } x += 16; } if x < width { - scalar::yuv_420p16_to_rgb_u16_row( - &y[x..width], - &u_half[x / 2..width / 2], - &v_half[x / 2..width / 2], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_u = &u_half[x / 2..width / 2]; + let tail_v = &v_half[x / 2..width / 2]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::yuv_420p16_to_rgba_u16_row( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); + } else { + scalar::yuv_420p16_to_rgb_u16_row( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); + } } } } @@ -2827,10 +3029,57 @@ pub(crate) unsafe fn p16_to_rgb_u16_row( matrix: ColorMatrix, full_range: bool, ) { + unsafe { + p16_to_rgb_or_rgba_u16_row::(y, uv_half, rgb_out, width, matrix, full_range); + } +} + +/// NEON sibling of [`p16_to_rgba_row`] for native-depth `u16` output. +/// Alpha is `0xFFFF` — matches `scalar::p16_to_rgba_u16_row`. 
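+/// `uv_half` is the interleaved P016 chroma row (`[U0, V0, U1, V1,
+/// ...]`, one pair per two output pixels), hence the
+/// `uv_half.len() >= width` length requirement.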
+/// +/// # Safety +/// +/// Same as [`p16_to_rgb_u16_row`] plus `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "neon")] +pub(crate) unsafe fn p16_to_rgba_u16_row( + y: &[u16], + uv_half: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + unsafe { + p16_to_rgb_or_rgba_u16_row::(y, uv_half, rgba_out, width, matrix, full_range); + } +} + +/// Shared NEON 16-bit P016 → native-depth `u16` kernel. +/// `ALPHA = false` writes RGB triples via `vst3q_u16`; `ALPHA = true` +/// writes RGBA quads via `vst4q_u16` with constant alpha `0xFFFF`. +/// +/// # Safety +/// +/// 1. NEON must be available. +/// 2. `width & 1 == 0`. +/// 3. `y.len() >= width`, `uv_half.len() >= width`, +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. +#[inline] +#[target_feature(enable = "neon")] +pub(crate) unsafe fn p16_to_rgb_or_rgba_u16_row( + y: &[u16], + uv_half: &[u16], + out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert_eq!(width & 1, 0); debug_assert!(y.len() >= width); debug_assert!(uv_half.len() >= width); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::<16, 16>(full_range); @@ -2838,6 +3087,7 @@ pub(crate) unsafe fn p16_to_rgb_u16_row( const RND: i32 = 1 << 14; unsafe { + let alpha_u16 = vdupq_n_u16(0xFFFF); let rnd_v = vdupq_n_s32(RND); let rnd64 = vdupq_n_s64(RND as i64); let y_off_v = vdupq_n_s32(y_off); @@ -2935,26 +3185,38 @@ pub(crate) unsafe fn p16_to_rgb_u16_row( vqmovun_s32(vaddq_s32(ys_hi_1, b_cd_hi1)), ); - vst3q_u16( - rgb_out.as_mut_ptr().add(x * 3), - uint16x8x3_t(r_lo_u16, g_lo_u16, b_lo_u16), - ); - vst3q_u16( - rgb_out.as_mut_ptr().add(x * 3 + 24), - uint16x8x3_t(r_hi_u16, g_hi_u16, b_hi_u16), - ); + if ALPHA { + vst4q_u16( + out.as_mut_ptr().add(x * 4), + uint16x8x4_t(r_lo_u16, g_lo_u16, b_lo_u16, alpha_u16), + ); + vst4q_u16( + out.as_mut_ptr().add(x * 4 + 32), + uint16x8x4_t(r_hi_u16, g_hi_u16, b_hi_u16, alpha_u16), + ); + } else { + vst3q_u16( + out.as_mut_ptr().add(x * 3), + uint16x8x3_t(r_lo_u16, g_lo_u16, b_lo_u16), + ); + vst3q_u16( + out.as_mut_ptr().add(x * 3 + 24), + uint16x8x3_t(r_hi_u16, g_hi_u16, b_hi_u16), + ); + } x += 16; } if x < width { - scalar::p16_to_rgb_u16_row( - &y[x..width], - &uv_half[x..width], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_uv = &uv_half[x..width]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::p16_to_rgba_u16_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + } else { + scalar::p16_to_rgb_u16_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + } } } } diff --git a/src/row/arch/neon/tests.rs b/src/row/arch/neon/tests.rs index 41dd909..c84627c 100644 --- a/src/row/arch/neon/tests.rs +++ b/src/row/arch/neon/tests.rs @@ -1828,6 +1828,194 @@ fn neon_p016_rgba_matches_scalar_all_matrices() { } } +// ---- High-bit 4:2:0 native-depth `u16` RGBA equivalence (Ship 8 Tranche 5b) ---- +// +// u16 RGBA wrappers share the math of their u16 RGB siblings — only +// the store (and tail dispatch) branches on `ALPHA`, with alpha set to +// `(1 << BITS) - 1` for BITS-generic kernels and `0xFFFF` for 16-bit +// kernels. Tests pin byte-identical output against the scalar RGBA +// reference. 
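+//
+// Each check builds one set of seeded Y/U/V planes (seeds 37/53/71)
+// and feeds the same buffers to both the scalar reference and the
+// SIMD kernel under test.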
+ +fn check_planar_u16_neon_rgba_equivalence_n( + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let y = planar_n_plane::(width, 37); + let u = planar_n_plane::(width / 2, 53); + let v = planar_n_plane::(width / 2, 71); + let mut rgba_scalar = std::vec![0u16; width * 4]; + let mut rgba_neon = std::vec![0u16; width * 4]; + scalar::yuv_420p_n_to_rgba_u16_row::( + &y, + &u, + &v, + &mut rgba_scalar, + width, + matrix, + full_range, + ); + unsafe { + yuv_420p_n_to_rgba_u16_row::(&y, &u, &v, &mut rgba_neon, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_neon, + "NEON yuv_420p_n<{BITS}>→RGBA u16 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +fn check_pn_u16_neon_rgba_equivalence_n( + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let y = p_n_packed_plane::(width, 37); + let u = p_n_packed_plane::(width / 2, 53); + let v = p_n_packed_plane::(width / 2, 71); + let uv = p010_uv_interleave(&u, &v); + let mut rgba_scalar = std::vec![0u16; width * 4]; + let mut rgba_neon = std::vec![0u16; width * 4]; + scalar::p_n_to_rgba_u16_row::(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + unsafe { + p_n_to_rgba_u16_row::(&y, &uv, &mut rgba_neon, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_neon, + "NEON Pn<{BITS}>→RGBA u16 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +#[test] +#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] +fn neon_yuv420p_n_rgba_u16_matches_scalar_all_bits() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_planar_u16_neon_rgba_equivalence_n::<9>(16, m, full); + check_planar_u16_neon_rgba_equivalence_n::<10>(16, m, full); + check_planar_u16_neon_rgba_equivalence_n::<12>(16, m, full); + check_planar_u16_neon_rgba_equivalence_n::<14>(16, m, full); + } + } +} + +#[test] +#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] +fn neon_yuv420p_n_rgba_u16_matches_scalar_tail_and_1920() { + for w in [18usize, 30, 34, 1920, 1922] { + check_planar_u16_neon_rgba_equivalence_n::<9>(w, ColorMatrix::Bt601, false); + check_planar_u16_neon_rgba_equivalence_n::<10>(w, ColorMatrix::Bt709, true); + check_planar_u16_neon_rgba_equivalence_n::<12>(w, ColorMatrix::Bt2020Ncl, false); + check_planar_u16_neon_rgba_equivalence_n::<14>(w, ColorMatrix::YCgCo, true); + } +} + +#[test] +#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] +fn neon_pn_rgba_u16_matches_scalar_all_bits() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_pn_u16_neon_rgba_equivalence_n::<10>(16, m, full); + check_pn_u16_neon_rgba_equivalence_n::<12>(16, m, full); + } + } +} + +#[test] +#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] +fn neon_pn_rgba_u16_matches_scalar_tail_and_1920() { + for w in [18usize, 30, 34, 1920, 1922] { + check_pn_u16_neon_rgba_equivalence_n::<10>(w, ColorMatrix::Bt601, false); + check_pn_u16_neon_rgba_equivalence_n::<12>(w, ColorMatrix::Bt709, true); + } +} + +fn check_yuv420p16_u16_neon_rgba_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { + let y = p16_plane_neon(width, 37); + let u = p16_plane_neon(width / 2, 53); + let v = p16_plane_neon(width / 2, 71); + let mut rgba_scalar = 
std::vec![0u16; width * 4]; + let mut rgba_neon = std::vec![0u16; width * 4]; + scalar::yuv_420p16_to_rgba_u16_row(&y, &u, &v, &mut rgba_scalar, width, matrix, full_range); + unsafe { + yuv_420p16_to_rgba_u16_row(&y, &u, &v, &mut rgba_neon, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_neon, + "NEON yuv_420p16→RGBA u16 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +fn check_p016_u16_neon_rgba_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { + let y = p16_plane_neon(width, 37); + let u = p16_plane_neon(width / 2, 53); + let v = p16_plane_neon(width / 2, 71); + let uv = p010_uv_interleave(&u, &v); + let mut rgba_scalar = std::vec![0u16; width * 4]; + let mut rgba_neon = std::vec![0u16; width * 4]; + scalar::p16_to_rgba_u16_row(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + unsafe { + p16_to_rgba_u16_row(&y, &uv, &mut rgba_neon, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_neon, + "NEON P016→RGBA u16 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +#[test] +#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] +fn neon_yuv420p16_rgba_u16_matches_scalar_all_matrices() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_yuv420p16_u16_neon_rgba_equivalence(16, m, full); + } + } + for w in [18usize, 30, 34, 1920, 1922] { + check_yuv420p16_u16_neon_rgba_equivalence(w, ColorMatrix::Bt709, false); + } +} + +#[test] +#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] +fn neon_p016_rgba_u16_matches_scalar_all_matrices() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_p016_u16_neon_rgba_equivalence(16, m, full); + } + } + for w in [18usize, 30, 34, 1920, 1922] { + check_p016_u16_neon_rgba_equivalence(w, ColorMatrix::Bt709, false); + } +} + // ---- Yuv444p_n NEON equivalence (10/12/14) -------------------------- fn check_yuv444p_n_u8_neon_equivalence( diff --git a/src/row/arch/wasm_simd128.rs b/src/row/arch/wasm_simd128.rs index e9d02c4..19aed79 100644 --- a/src/row/arch/wasm_simd128.rs +++ b/src/row/arch/wasm_simd128.rs @@ -464,13 +464,70 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_u16_row( width: usize, matrix: ColorMatrix, full_range: bool, +) { + unsafe { + yuv_420p_n_to_rgb_or_rgba_u16_row::( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } +} + +/// wasm simd128 sibling of [`yuv_420p_n_to_rgba_row`] for native-depth +/// `u16` output. Alpha samples are `(1 << BITS) - 1` (opaque maximum +/// at the input bit depth). +/// +/// # Safety +/// +/// Same as [`yuv_420p_n_to_rgb_u16_row`] plus `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "simd128")] +pub(crate) unsafe fn yuv_420p_n_to_rgba_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + unsafe { + yuv_420p_n_to_rgb_or_rgba_u16_row::( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } +} + +/// Shared wasm simd128 high-bit YUV 4:2:0 → native-depth `u16` kernel. +/// `ALPHA = false` writes RGB triples via `write_rgb_u16_8`; +/// `ALPHA = true` writes RGBA quads via `write_rgba_u16_8` with +/// constant alpha `(1 << BITS) - 1`. 
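+///
+/// For 12-bit input, for example, the alpha constant is
+/// `(1 << 12) - 1 = 4095`.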
+/// +/// # Safety +/// +/// 1. **simd128 must be enabled at compile time.** +/// 2. `width & 1 == 0`. +/// 3. `y.len() >= width`, `u_half.len() >= width / 2`, +/// `v_half.len() >= width / 2`, +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. +/// 4. `BITS` ∈ `{9, 10, 12, 14}`. +#[inline] +#[target_feature(enable = "simd128")] +pub(crate) unsafe fn yuv_420p_n_to_rgb_or_rgba_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, ) { const { assert!(BITS == 9 || BITS == 10 || BITS == 12 || BITS == 14) }; + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert_eq!(width & 1, 0); debug_assert!(y.len() >= width); debug_assert!(u_half.len() >= width / 2); debug_assert!(v_half.len() >= width / 2); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); @@ -495,6 +552,7 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_u16_row( let cgv = i32x4_splat(coeffs.g_v()); let cbu = i32x4_splat(coeffs.b_u()); let cbv = i32x4_splat(coeffs.b_v()); + let alpha_u16 = u16x8_splat(out_max as u16); let mut x = 0usize; while x + 16 <= width { @@ -539,23 +597,34 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_u16_row( let b_lo = clamp_u16_max_wasm(i16x8_add_sat(y_scaled_lo, b_dup_lo), zero_v, max_v); let b_hi = clamp_u16_max_wasm(i16x8_add_sat(y_scaled_hi, b_dup_hi), zero_v, max_v); - let dst = rgb_out.as_mut_ptr().add(x * 3); - write_rgb_u16_8(r_lo, g_lo, b_lo, dst); - write_rgb_u16_8(r_hi, g_hi, b_hi, dst.add(24)); + if ALPHA { + let dst = out.as_mut_ptr().add(x * 4); + write_rgba_u16_8(r_lo, g_lo, b_lo, alpha_u16, dst); + write_rgba_u16_8(r_hi, g_hi, b_hi, alpha_u16, dst.add(32)); + } else { + let dst = out.as_mut_ptr().add(x * 3); + write_rgb_u16_8(r_lo, g_lo, b_lo, dst); + write_rgb_u16_8(r_hi, g_hi, b_hi, dst.add(24)); + } x += 16; } if x < width { - scalar::yuv_420p_n_to_rgb_u16_row::( - &y[x..width], - &u_half[x / 2..width / 2], - &v_half[x / 2..width / 2], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_u = &u_half[x / 2..width / 2]; + let tail_v = &v_half[x / 2..width / 2]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::yuv_420p_n_to_rgba_u16_row::( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); + } else { + scalar::yuv_420p_n_to_rgb_u16_row::( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); + } } } } @@ -1105,6 +1174,44 @@ unsafe fn write_rgb_u16_8(r: v128, g: v128, b: v128, ptr: *mut u16) { } } +/// Interleaves 8 R/G/B/A `u16` samples into packed RGBA quads (32 +/// `u16` = 64 bytes). Two `i16x8_shuffle` stages: first interleave +/// R+G and B+A into pairs, then combine pair-vectors into RGBA quads. +/// +/// # Safety +/// +/// `ptr` must point to at least 64 writable bytes. Caller must have +/// `simd128` enabled at compile time. +#[inline(always)] +unsafe fn write_rgba_u16_8(r: v128, g: v128, b: v128, a: v128, ptr: *mut u16) { + unsafe { + // Stage 1: interleave R+G and B+A pairwise. 
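+        // (`i16x8_shuffle` lane indices 0..=7 select from the first
+        // operand and 8..=15 from the second.)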
+ // rg_lo = [R0, G0, R1, G1, R2, G2, R3, G3] + // rg_hi = [R4, G4, R5, G5, R6, G6, R7, G7] + // ba_lo = [B0, A0, B1, A1, B2, A2, B3, A3] + // ba_hi = [B4, A4, B5, A5, B6, A6, B7, A7] + let rg_lo = i16x8_shuffle::<0, 8, 1, 9, 2, 10, 3, 11>(r, g); + let rg_hi = i16x8_shuffle::<4, 12, 5, 13, 6, 14, 7, 15>(r, g); + let ba_lo = i16x8_shuffle::<0, 8, 1, 9, 2, 10, 3, 11>(b, a); + let ba_hi = i16x8_shuffle::<4, 12, 5, 13, 6, 14, 7, 15>(b, a); + + // Stage 2: combine RG pairs with BA pairs to produce RGBA quads. + // q0 = [R0, G0, B0, A0, R1, G1, B1, A1] + // q1 = [R2, G2, B2, A2, R3, G3, B3, A3] + // q2 = [R4, G4, B4, A4, R5, G5, B5, A5] + // q3 = [R6, G6, B6, A6, R7, G7, B7, A7] + let q0 = i16x8_shuffle::<0, 1, 8, 9, 2, 3, 10, 11>(rg_lo, ba_lo); + let q1 = i16x8_shuffle::<4, 5, 12, 13, 6, 7, 14, 15>(rg_lo, ba_lo); + let q2 = i16x8_shuffle::<0, 1, 8, 9, 2, 3, 10, 11>(rg_hi, ba_hi); + let q3 = i16x8_shuffle::<4, 5, 12, 13, 6, 7, 14, 15>(rg_hi, ba_hi); + + v128_store(ptr.cast(), q0); + v128_store(ptr.add(8).cast(), q1); + v128_store(ptr.add(16).cast(), q2); + v128_store(ptr.add(24).cast(), q3); + } +} + /// WASM simd128 high‑bit‑packed semi‑planar (`BITS` ∈ {10, 12}) → /// packed **8‑bit** RGB. /// @@ -1310,10 +1417,62 @@ pub(crate) unsafe fn p_n_to_rgb_u16_row( matrix: ColorMatrix, full_range: bool, ) { + unsafe { + p_n_to_rgb_or_rgba_u16_row::(y, uv_half, rgb_out, width, matrix, full_range); + } +} + +/// wasm simd128 sibling of [`p_n_to_rgba_row`] for native-depth `u16` +/// output. Alpha samples are `(1 << BITS) - 1` (opaque maximum at the +/// input bit depth). P016 has its own kernel family — never routed here. +/// +/// # Safety +/// +/// Same as [`p_n_to_rgb_u16_row`] plus `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "simd128")] +pub(crate) unsafe fn p_n_to_rgba_u16_row( + y: &[u16], + uv_half: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + unsafe { + p_n_to_rgb_or_rgba_u16_row::(y, uv_half, rgba_out, width, matrix, full_range); + } +} + +/// Shared wasm simd128 Pn → native-depth `u16` kernel. `ALPHA = false` +/// writes RGB triples via `write_rgb_u16_8`; `ALPHA = true` writes +/// RGBA quads via `write_rgba_u16_8` with constant alpha +/// `(1 << BITS) - 1`. P016 has its own kernel family — never routed +/// here. +/// +/// # Safety +/// +/// 1. **simd128 must be enabled at compile time.** +/// 2. `width & 1 == 0`. +/// 3. `y.len() >= width`, `uv_half.len() >= width`, +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. +/// 4. `BITS` ∈ `{10, 12}`. +#[inline] +#[target_feature(enable = "simd128")] +pub(crate) unsafe fn p_n_to_rgb_or_rgba_u16_row( + y: &[u16], + uv_half: &[u16], + out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + const { assert!(BITS == 10 || BITS == 12) }; + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert_eq!(width & 1, 0); debug_assert!(y.len() >= width); debug_assert!(uv_half.len() >= width); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); @@ -1337,6 +1496,7 @@ pub(crate) unsafe fn p_n_to_rgb_u16_row( let cgv = i32x4_splat(coeffs.g_v()); let cbu = i32x4_splat(coeffs.b_u()); let cbv = i32x4_splat(coeffs.b_v()); + let alpha_u16 = u16x8_splat(out_max as u16); // High-bit-packed samples: shift right by `16 - BITS`. 
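+        // (e.g. P010: each 10-bit sample occupies bits 15..=6 of its
+        // `u16`, so the shift amount is 6.)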
let shr = (16 - BITS) as u32; @@ -1383,22 +1543,29 @@ pub(crate) unsafe fn p_n_to_rgb_u16_row( let b_lo = clamp_u16_max_wasm(i16x8_add_sat(y_scaled_lo, b_dup_lo), zero_v, max_v); let b_hi = clamp_u16_max_wasm(i16x8_add_sat(y_scaled_hi, b_dup_hi), zero_v, max_v); - let dst = rgb_out.as_mut_ptr().add(x * 3); - write_rgb_u16_8(r_lo, g_lo, b_lo, dst); - write_rgb_u16_8(r_hi, g_hi, b_hi, dst.add(24)); + if ALPHA { + let dst = out.as_mut_ptr().add(x * 4); + write_rgba_u16_8(r_lo, g_lo, b_lo, alpha_u16, dst); + write_rgba_u16_8(r_hi, g_hi, b_hi, alpha_u16, dst.add(32)); + } else { + let dst = out.as_mut_ptr().add(x * 3); + write_rgb_u16_8(r_lo, g_lo, b_lo, dst); + write_rgb_u16_8(r_hi, g_hi, b_hi, dst.add(24)); + } x += 16; } if x < width { - scalar::p_n_to_rgb_u16_row::( - &y[x..width], - &uv_half[x..width], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_uv = &uv_half[x..width]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::p_n_to_rgba_u16_row::(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + } else { + scalar::p_n_to_rgb_u16_row::(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + } } } } @@ -2645,11 +2812,66 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_u16_row( matrix: ColorMatrix, full_range: bool, ) { + unsafe { + yuv_420p16_to_rgb_or_rgba_u16_row::( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } +} + +/// wasm simd128 sibling of [`yuv_420p16_to_rgba_row`] for native-depth +/// `u16` output. Alpha is `0xFFFF`. +/// +/// # Safety +/// +/// Same as [`yuv_420p16_to_rgb_u16_row`] plus `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "simd128")] +pub(crate) unsafe fn yuv_420p16_to_rgba_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + unsafe { + yuv_420p16_to_rgb_or_rgba_u16_row::( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } +} + +/// Shared wasm simd128 16-bit YUV 4:2:0 → native-depth `u16` kernel. +/// `ALPHA = false` writes RGB triples via `write_rgb_u16_8`; +/// `ALPHA = true` writes RGBA quads via `write_rgba_u16_8` with +/// constant alpha `0xFFFF`. +/// +/// # Safety +/// +/// 1. **simd128 must be enabled at compile time.** +/// 2. `width & 1 == 0`. +/// 3. `y.len() >= width`, `u_half.len() >= width / 2`, +/// `v_half.len() >= width / 2`, +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. 
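+///
+/// Call sketch (hypothetical 16-bit planar row; `simd128` is a
+/// compile-time target feature on wasm, so no runtime check applies):
+///
+/// ```ignore
+/// // SAFETY: width is even and the slice lengths satisfy the
+/// // contract above.
+/// unsafe {
+///     yuv_420p16_to_rgb_or_rgba_u16_row::<true>(
+///         &y, &u, &v, &mut rgba, width, ColorMatrix::Bt601, false,
+///     );
+/// }
+/// ```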
+#[inline] +#[target_feature(enable = "simd128")] +pub(crate) unsafe fn yuv_420p16_to_rgb_or_rgba_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert_eq!(width & 1, 0); debug_assert!(y.len() >= width); debug_assert!(u_half.len() >= width / 2); debug_assert!(v_half.len() >= width / 2); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::<16, 16>(full_range); @@ -2657,6 +2879,7 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_u16_row( const RND_I32: i32 = 1 << 14; unsafe { + let alpha_u16 = u16x8_splat(0xFFFF); let rnd_i64 = i64x2_splat(RND_I64); let rnd_i32 = i32x4_splat(RND_I32); let y_off32 = i32x4_splat(y_off); @@ -2739,20 +2962,29 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_u16_row( i32x4_add(y_hi_scaled, b_dup_hi), ); - write_rgb_u16_8(r_u16, g_u16, b_u16, rgb_out.as_mut_ptr().add(x * 3)); + if ALPHA { + write_rgba_u16_8(r_u16, g_u16, b_u16, alpha_u16, out.as_mut_ptr().add(x * 4)); + } else { + write_rgb_u16_8(r_u16, g_u16, b_u16, out.as_mut_ptr().add(x * 3)); + } x += 8; } if x < width { - scalar::yuv_420p16_to_rgb_u16_row( - &y[x..width], - &u_half[x / 2..width / 2], - &v_half[x / 2..width / 2], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_u = &u_half[x / 2..width / 2]; + let tail_v = &v_half[x / 2..width / 2]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::yuv_420p16_to_rgba_u16_row( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); + } else { + scalar::yuv_420p16_to_rgb_u16_row( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); + } } } } @@ -2921,10 +3153,58 @@ pub(crate) unsafe fn p16_to_rgb_u16_row( matrix: ColorMatrix, full_range: bool, ) { + unsafe { + p16_to_rgb_or_rgba_u16_row::(y, uv_half, rgb_out, width, matrix, full_range); + } +} + +/// wasm simd128 sibling of [`p16_to_rgba_row`] for native-depth `u16` +/// output. Alpha is `0xFFFF`. +/// +/// # Safety +/// +/// Same as [`p16_to_rgb_u16_row`] plus `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "simd128")] +pub(crate) unsafe fn p16_to_rgba_u16_row( + y: &[u16], + uv_half: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + unsafe { + p16_to_rgb_or_rgba_u16_row::(y, uv_half, rgba_out, width, matrix, full_range); + } +} + +/// Shared wasm simd128 16-bit P016 → native-depth `u16` kernel. +/// `ALPHA = false` writes RGB triples via `write_rgb_u16_8`; +/// `ALPHA = true` writes RGBA quads via `write_rgba_u16_8` with +/// constant alpha `0xFFFF`. +/// +/// # Safety +/// +/// 1. **simd128 must be enabled at compile time.** +/// 2. `width & 1 == 0`. +/// 3. `y.len() >= width`, `uv_half.len() >= width`, +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. 
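+///
+/// The main loop emits 8 pixels per iteration (one `write_rgb_u16_8`
+/// or `write_rgba_u16_8` call); any remainder is handled by the
+/// scalar tail.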
+#[inline] +#[target_feature(enable = "simd128")] +pub(crate) unsafe fn p16_to_rgb_or_rgba_u16_row( + y: &[u16], + uv_half: &[u16], + out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert_eq!(width & 1, 0); debug_assert!(y.len() >= width); debug_assert!(uv_half.len() >= width); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::<16, 16>(full_range); @@ -2932,6 +3212,7 @@ pub(crate) unsafe fn p16_to_rgb_u16_row( const RND_I32: i32 = 1 << 14; unsafe { + let alpha_u16 = u16x8_splat(0xFFFF); let rnd_i64 = i64x2_splat(RND_I64); let rnd_i32 = i32x4_splat(RND_I32); let y_off32 = i32x4_splat(y_off); @@ -3014,19 +3295,24 @@ pub(crate) unsafe fn p16_to_rgb_u16_row( i32x4_add(y_hi_scaled, b_dup_hi), ); - write_rgb_u16_8(r_u16, g_u16, b_u16, rgb_out.as_mut_ptr().add(x * 3)); + if ALPHA { + write_rgba_u16_8(r_u16, g_u16, b_u16, alpha_u16, out.as_mut_ptr().add(x * 4)); + } else { + write_rgb_u16_8(r_u16, g_u16, b_u16, out.as_mut_ptr().add(x * 3)); + } x += 8; } if x < width { - scalar::p16_to_rgb_u16_row( - &y[x..width], - &uv_half[x..width], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_uv = &uv_half[x..width]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::p16_to_rgba_u16_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + } else { + scalar::p16_to_rgb_u16_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + } } } } diff --git a/src/row/arch/wasm_simd128/tests.rs b/src/row/arch/wasm_simd128/tests.rs index d394f33..21c9489 100644 --- a/src/row/arch/wasm_simd128/tests.rs +++ b/src/row/arch/wasm_simd128/tests.rs @@ -1564,6 +1564,186 @@ fn simd128_p016_rgba_matches_scalar_all_matrices() { } } +// ---- High-bit 4:2:0 native-depth `u16` RGBA equivalence (Ship 8 Tranche 5b) ---- + +fn check_planar_u16_simd128_rgba_equivalence_n( + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let y = planar_n_plane::(width, 37); + let u = planar_n_plane::(width / 2, 53); + let v = planar_n_plane::(width / 2, 71); + let mut rgba_scalar = std::vec![0u16; width * 4]; + let mut rgba_simd = std::vec![0u16; width * 4]; + scalar::yuv_420p_n_to_rgba_u16_row::( + &y, + &u, + &v, + &mut rgba_scalar, + width, + matrix, + full_range, + ); + unsafe { + yuv_420p_n_to_rgba_u16_row::(&y, &u, &v, &mut rgba_simd, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_simd, + "simd128 yuv_420p_n<{BITS}>→RGBA u16 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +fn check_pn_u16_simd128_rgba_equivalence_n( + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let y = p_n_packed_plane::(width, 37); + let u = p_n_packed_plane::(width / 2, 53); + let v = p_n_packed_plane::(width / 2, 71); + let uv = p010_uv_interleave(&u, &v); + let mut rgba_scalar = std::vec![0u16; width * 4]; + let mut rgba_simd = std::vec![0u16; width * 4]; + scalar::p_n_to_rgba_u16_row::(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + unsafe { + p_n_to_rgba_u16_row::(&y, &uv, &mut rgba_simd, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_simd, + "simd128 Pn<{BITS}>→RGBA u16 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +fn check_yuv420p16_u16_simd128_rgba_equivalence( + width: 
usize, + matrix: ColorMatrix, + full_range: bool, +) { + let y = p16_plane_wasm(width, 37); + let u = p16_plane_wasm(width / 2, 53); + let v = p16_plane_wasm(width / 2, 71); + let mut rgba_scalar = std::vec![0u16; width * 4]; + let mut rgba_simd = std::vec![0u16; width * 4]; + scalar::yuv_420p16_to_rgba_u16_row(&y, &u, &v, &mut rgba_scalar, width, matrix, full_range); + unsafe { + yuv_420p16_to_rgba_u16_row(&y, &u, &v, &mut rgba_simd, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_simd, + "simd128 yuv_420p16→RGBA u16 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +fn check_p16_u16_simd128_rgba_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { + let y = p16_plane_wasm(width, 37); + let u = p16_plane_wasm(width / 2, 53); + let v = p16_plane_wasm(width / 2, 71); + let uv = p010_uv_interleave(&u, &v); + let mut rgba_scalar = std::vec![0u16; width * 4]; + let mut rgba_simd = std::vec![0u16; width * 4]; + scalar::p16_to_rgba_u16_row(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + unsafe { + p16_to_rgba_u16_row(&y, &uv, &mut rgba_simd, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_simd, + "simd128 P016→RGBA u16 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +#[test] +fn simd128_yuv420p_n_rgba_u16_matches_scalar_all_bits() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_planar_u16_simd128_rgba_equivalence_n::<9>(16, m, full); + check_planar_u16_simd128_rgba_equivalence_n::<10>(16, m, full); + check_planar_u16_simd128_rgba_equivalence_n::<12>(16, m, full); + check_planar_u16_simd128_rgba_equivalence_n::<14>(16, m, full); + } + } +} + +#[test] +fn simd128_yuv420p_n_rgba_u16_matches_scalar_tail_and_1920() { + for w in [18usize, 30, 34, 1920, 1922] { + check_planar_u16_simd128_rgba_equivalence_n::<9>(w, ColorMatrix::Bt601, false); + check_planar_u16_simd128_rgba_equivalence_n::<10>(w, ColorMatrix::Bt709, true); + check_planar_u16_simd128_rgba_equivalence_n::<12>(w, ColorMatrix::Bt2020Ncl, false); + check_planar_u16_simd128_rgba_equivalence_n::<14>(w, ColorMatrix::YCgCo, true); + } +} + +#[test] +fn simd128_pn_rgba_u16_matches_scalar_all_bits() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_pn_u16_simd128_rgba_equivalence_n::<10>(16, m, full); + check_pn_u16_simd128_rgba_equivalence_n::<12>(16, m, full); + } + } +} + +#[test] +fn simd128_pn_rgba_u16_matches_scalar_tail_and_1920() { + for w in [18usize, 30, 34, 1920, 1922] { + check_pn_u16_simd128_rgba_equivalence_n::<10>(w, ColorMatrix::Bt601, false); + check_pn_u16_simd128_rgba_equivalence_n::<12>(w, ColorMatrix::Bt709, true); + } +} + +#[test] +fn simd128_yuv420p16_rgba_u16_matches_scalar_all_matrices() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_yuv420p16_u16_simd128_rgba_equivalence(16, m, full); + } + } + for w in [18usize, 30, 34, 1920, 1922] { + check_yuv420p16_u16_simd128_rgba_equivalence(w, ColorMatrix::Bt709, false); + } +} + +#[test] +fn simd128_p016_rgba_u16_matches_scalar_all_matrices() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + 
ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_p16_u16_simd128_rgba_equivalence(16, m, full); + } + } + for w in [18usize, 30, 34, 1920, 1922] { + check_p16_u16_simd128_rgba_equivalence(w, ColorMatrix::Bt709, false); + } +} + // ---- Pn 4:4:4 (P410 / P412 / P416) wasm simd128 equivalence --------- fn high_bit_plane_wasm(n: usize, seed: usize) -> std::vec::Vec { diff --git a/src/row/arch/x86_avx2.rs b/src/row/arch/x86_avx2.rs index d0f72fd..ad52597 100644 --- a/src/row/arch/x86_avx2.rs +++ b/src/row/arch/x86_avx2.rs @@ -45,6 +45,7 @@ use crate::{ row::{ arch::x86_common::{ rgb_to_hsv_16_pixels, swap_rb_16_pixels, write_rgb_16, write_rgb_u16_8, write_rgba_16, + write_rgba_u16_8, }, scalar, }, @@ -508,13 +509,70 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_u16_row( width: usize, matrix: ColorMatrix, full_range: bool, +) { + unsafe { + yuv_420p_n_to_rgb_or_rgba_u16_row::( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } +} + +/// AVX2 sibling of [`yuv_420p_n_to_rgba_row`] for native-depth `u16` +/// output. Alpha samples are `(1 << BITS) - 1` (opaque maximum at the +/// input bit depth). +/// +/// # Safety +/// +/// Same as [`yuv_420p_n_to_rgb_u16_row`] plus `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "avx2")] +pub(crate) unsafe fn yuv_420p_n_to_rgba_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + unsafe { + yuv_420p_n_to_rgb_or_rgba_u16_row::( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } +} + +/// Shared AVX2 high-bit YUV 4:2:0 → native-depth `u16` kernel. +/// `ALPHA = false` writes RGB triples via 4× `write_rgb_u16_8` per +/// 32-pixel block; `ALPHA = true` writes RGBA quads via 4× +/// `write_rgba_u16_8` with constant alpha `(1 << BITS) - 1`. +/// +/// # Safety +/// +/// 1. **AVX2 must be available on the current CPU.** +/// 2. `width & 1 == 0`. +/// 3. `y.len() >= width`, `u_half.len() >= width / 2`, +/// `v_half.len() >= width / 2`, +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. +/// 4. `BITS` ∈ `{9, 10, 12, 14}`. +#[inline] +#[target_feature(enable = "avx2")] +pub(crate) unsafe fn yuv_420p_n_to_rgb_or_rgba_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, ) { const { assert!(BITS == 9 || BITS == 10 || BITS == 12 || BITS == 14) }; + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert_eq!(width & 1, 0); debug_assert!(y.len() >= width); debug_assert!(u_half.len() >= width / 2); debug_assert!(v_half.len() >= width / 2); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); @@ -538,6 +596,7 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_u16_row( let cgv = _mm256_set1_epi32(coeffs.g_v()); let cbu = _mm256_set1_epi32(coeffs.b_u()); let cbv = _mm256_set1_epi32(coeffs.b_v()); + let alpha_u16 = _mm_set1_epi16(out_max); let mut x = 0usize; while x + 32 <= width { @@ -601,45 +660,82 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_u16_row( // Four 8‑pixel u16 writes per 32‑pixel block. Each extracts a // 128‑bit half of an i16x16 channel and hands it to the shared // SSE4.1 u16 interleave helper. 
- let dst = rgb_out.as_mut_ptr().add(x * 3); - write_rgb_u16_8( - _mm256_castsi256_si128(r_lo), - _mm256_castsi256_si128(g_lo), - _mm256_castsi256_si128(b_lo), - dst, - ); - write_rgb_u16_8( - _mm256_extracti128_si256::<1>(r_lo), - _mm256_extracti128_si256::<1>(g_lo), - _mm256_extracti128_si256::<1>(b_lo), - dst.add(24), - ); - write_rgb_u16_8( - _mm256_castsi256_si128(r_hi), - _mm256_castsi256_si128(g_hi), - _mm256_castsi256_si128(b_hi), - dst.add(48), - ); - write_rgb_u16_8( - _mm256_extracti128_si256::<1>(r_hi), - _mm256_extracti128_si256::<1>(g_hi), - _mm256_extracti128_si256::<1>(b_hi), - dst.add(72), - ); + if ALPHA { + let dst = out.as_mut_ptr().add(x * 4); + write_rgba_u16_8( + _mm256_castsi256_si128(r_lo), + _mm256_castsi256_si128(g_lo), + _mm256_castsi256_si128(b_lo), + alpha_u16, + dst, + ); + write_rgba_u16_8( + _mm256_extracti128_si256::<1>(r_lo), + _mm256_extracti128_si256::<1>(g_lo), + _mm256_extracti128_si256::<1>(b_lo), + alpha_u16, + dst.add(32), + ); + write_rgba_u16_8( + _mm256_castsi256_si128(r_hi), + _mm256_castsi256_si128(g_hi), + _mm256_castsi256_si128(b_hi), + alpha_u16, + dst.add(64), + ); + write_rgba_u16_8( + _mm256_extracti128_si256::<1>(r_hi), + _mm256_extracti128_si256::<1>(g_hi), + _mm256_extracti128_si256::<1>(b_hi), + alpha_u16, + dst.add(96), + ); + } else { + let dst = out.as_mut_ptr().add(x * 3); + write_rgb_u16_8( + _mm256_castsi256_si128(r_lo), + _mm256_castsi256_si128(g_lo), + _mm256_castsi256_si128(b_lo), + dst, + ); + write_rgb_u16_8( + _mm256_extracti128_si256::<1>(r_lo), + _mm256_extracti128_si256::<1>(g_lo), + _mm256_extracti128_si256::<1>(b_lo), + dst.add(24), + ); + write_rgb_u16_8( + _mm256_castsi256_si128(r_hi), + _mm256_castsi256_si128(g_hi), + _mm256_castsi256_si128(b_hi), + dst.add(48), + ); + write_rgb_u16_8( + _mm256_extracti128_si256::<1>(r_hi), + _mm256_extracti128_si256::<1>(g_hi), + _mm256_extracti128_si256::<1>(b_hi), + dst.add(72), + ); + } x += 32; } if x < width { - scalar::yuv_420p_n_to_rgb_u16_row::( - &y[x..width], - &u_half[x / 2..width / 2], - &v_half[x / 2..width / 2], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_u = &u_half[x / 2..width / 2]; + let tail_v = &v_half[x / 2..width / 2]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::yuv_420p_n_to_rgba_u16_row::( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); + } else { + scalar::yuv_420p_n_to_rgb_u16_row::( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); + } } } } @@ -1469,10 +1565,62 @@ pub(crate) unsafe fn p_n_to_rgb_u16_row( matrix: ColorMatrix, full_range: bool, ) { + unsafe { + p_n_to_rgb_or_rgba_u16_row::(y, uv_half, rgb_out, width, matrix, full_range); + } +} + +/// AVX2 sibling of [`p_n_to_rgba_row`] for native-depth `u16` output. +/// Alpha samples are `(1 << BITS) - 1` (opaque maximum at the input +/// bit depth). P016 has its own kernel family — never routed here. +/// +/// # Safety +/// +/// Same as [`p_n_to_rgb_u16_row`] plus `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "avx2")] +pub(crate) unsafe fn p_n_to_rgba_u16_row( + y: &[u16], + uv_half: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + unsafe { + p_n_to_rgb_or_rgba_u16_row::(y, uv_half, rgba_out, width, matrix, full_range); + } +} + +/// Shared AVX2 Pn → native-depth `u16` kernel. 
`ALPHA = false` writes +/// RGB triples via 4× `write_rgb_u16_8` per 32-pixel block; +/// `ALPHA = true` writes RGBA quads via 4× `write_rgba_u16_8` with +/// constant alpha `(1 << BITS) - 1`. P016 has its own kernel family — +/// never routed here. +/// +/// # Safety +/// +/// 1. **AVX2 must be available on the current CPU.** +/// 2. `width & 1 == 0`. +/// 3. `y.len() >= width`, `uv_half.len() >= width`, +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. +/// 4. `BITS` ∈ `{10, 12}`. +#[inline] +#[target_feature(enable = "avx2")] +pub(crate) unsafe fn p_n_to_rgb_or_rgba_u16_row( + y: &[u16], + uv_half: &[u16], + out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + const { assert!(BITS == 10 || BITS == 12) }; + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert_eq!(width & 1, 0); debug_assert!(y.len() >= width); debug_assert!(uv_half.len() >= width); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); @@ -1497,6 +1645,7 @@ pub(crate) unsafe fn p_n_to_rgb_u16_row( let cgv = _mm256_set1_epi32(coeffs.g_v()); let cbu = _mm256_set1_epi32(coeffs.b_u()); let cbv = _mm256_set1_epi32(coeffs.b_v()); + let alpha_u16 = _mm_set1_epi16(out_max); let mut x = 0usize; while x + 32 <= width { @@ -1550,44 +1699,77 @@ pub(crate) unsafe fn p_n_to_rgb_u16_row( let b_lo = clamp_u16_max_x16(_mm256_adds_epi16(y_scaled_lo, b_dup_lo), zero_v, max_v); let b_hi = clamp_u16_max_x16(_mm256_adds_epi16(y_scaled_hi, b_dup_hi), zero_v, max_v); - let dst = rgb_out.as_mut_ptr().add(x * 3); - write_rgb_u16_8( - _mm256_castsi256_si128(r_lo), - _mm256_castsi256_si128(g_lo), - _mm256_castsi256_si128(b_lo), - dst, - ); - write_rgb_u16_8( - _mm256_extracti128_si256::<1>(r_lo), - _mm256_extracti128_si256::<1>(g_lo), - _mm256_extracti128_si256::<1>(b_lo), - dst.add(24), - ); - write_rgb_u16_8( - _mm256_castsi256_si128(r_hi), - _mm256_castsi256_si128(g_hi), - _mm256_castsi256_si128(b_hi), - dst.add(48), - ); - write_rgb_u16_8( - _mm256_extracti128_si256::<1>(r_hi), - _mm256_extracti128_si256::<1>(g_hi), - _mm256_extracti128_si256::<1>(b_hi), - dst.add(72), - ); + if ALPHA { + let dst = out.as_mut_ptr().add(x * 4); + write_rgba_u16_8( + _mm256_castsi256_si128(r_lo), + _mm256_castsi256_si128(g_lo), + _mm256_castsi256_si128(b_lo), + alpha_u16, + dst, + ); + write_rgba_u16_8( + _mm256_extracti128_si256::<1>(r_lo), + _mm256_extracti128_si256::<1>(g_lo), + _mm256_extracti128_si256::<1>(b_lo), + alpha_u16, + dst.add(32), + ); + write_rgba_u16_8( + _mm256_castsi256_si128(r_hi), + _mm256_castsi256_si128(g_hi), + _mm256_castsi256_si128(b_hi), + alpha_u16, + dst.add(64), + ); + write_rgba_u16_8( + _mm256_extracti128_si256::<1>(r_hi), + _mm256_extracti128_si256::<1>(g_hi), + _mm256_extracti128_si256::<1>(b_hi), + alpha_u16, + dst.add(96), + ); + } else { + let dst = out.as_mut_ptr().add(x * 3); + write_rgb_u16_8( + _mm256_castsi256_si128(r_lo), + _mm256_castsi256_si128(g_lo), + _mm256_castsi256_si128(b_lo), + dst, + ); + write_rgb_u16_8( + _mm256_extracti128_si256::<1>(r_lo), + _mm256_extracti128_si256::<1>(g_lo), + _mm256_extracti128_si256::<1>(b_lo), + dst.add(24), + ); + write_rgb_u16_8( + _mm256_castsi256_si128(r_hi), + _mm256_castsi256_si128(g_hi), + _mm256_castsi256_si128(b_hi), + dst.add(48), + ); + write_rgb_u16_8( + _mm256_extracti128_si256::<1>(r_hi), + _mm256_extracti128_si256::<1>(g_hi), + _mm256_extracti128_si256::<1>(b_hi), + 
dst.add(72), + ); + } x += 32; } if x < width { - scalar::p_n_to_rgb_u16_row::( - &y[x..width], - &uv_half[x..width], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_uv = &uv_half[x..width]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::p_n_to_rgba_u16_row::(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + } else { + scalar::p_n_to_rgb_u16_row::(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + } } } } @@ -2855,17 +3037,72 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_u16_row( matrix: ColorMatrix, full_range: bool, ) { + unsafe { + yuv_420p16_to_rgb_or_rgba_u16_row::( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } +} + +/// AVX2 sibling of [`yuv_420p16_to_rgba_row`] for native-depth `u16` +/// output. Alpha is `0xFFFF`. +/// +/// # Safety +/// +/// Same as [`yuv_420p16_to_rgb_u16_row`] plus `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "avx2")] +pub(crate) unsafe fn yuv_420p16_to_rgba_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + unsafe { + yuv_420p16_to_rgb_or_rgba_u16_row::( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } +} + +/// Shared AVX2 16-bit YUV 4:2:0 → native-depth `u16` kernel. +/// `ALPHA = false` writes RGB triples; `ALPHA = true` writes RGBA +/// quads with constant alpha `0xFFFF`. +/// +/// # Safety +/// +/// 1. **AVX2 must be available.** +/// 2. `width & 1 == 0`. +/// 3. `y.len() >= width`, `u_half.len() >= width / 2`, +/// `v_half.len() >= width / 2`, +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. +#[inline] +#[target_feature(enable = "avx2")] +pub(crate) unsafe fn yuv_420p16_to_rgb_or_rgba_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert_eq!(width & 1, 0); debug_assert!(y.len() >= width); debug_assert!(u_half.len() >= width / 2); debug_assert!(v_half.len() >= width / 2); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::<16, 16>(full_range); const RND: i64 = 1 << 14; unsafe { + let alpha_u16 = _mm_set1_epi16(-1i16); let rnd_v = _mm256_set1_epi64x(RND); let y_off_v = _mm256_set1_epi32(y_off); let y_scale_v = _mm256_set1_epi32(y_scale); @@ -2952,34 +3189,57 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_u16_row( _mm256_add_epi32(y_hi_scaled, b_dup_hi), )); - // Write 16 pixels = 48 u16 via two 8-pixel helper calls. - let dst = rgb_out.as_mut_ptr().add(x * 3); - write_rgb_u16_8( - _mm256_castsi256_si128(r_u16), - _mm256_castsi256_si128(g_u16), - _mm256_castsi256_si128(b_u16), - dst, - ); - write_rgb_u16_8( - _mm256_extracti128_si256::<1>(r_u16), - _mm256_extracti128_si256::<1>(g_u16), - _mm256_extracti128_si256::<1>(b_u16), - dst.add(24), - ); + // Write 16 pixels via two 8-pixel helper calls. 
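+            // RGB: two 24-`u16` stores (48 per 16-pixel block);
+            // RGBA: two 32-`u16` stores (64 per block).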
+ if ALPHA { + let dst = out.as_mut_ptr().add(x * 4); + write_rgba_u16_8( + _mm256_castsi256_si128(r_u16), + _mm256_castsi256_si128(g_u16), + _mm256_castsi256_si128(b_u16), + alpha_u16, + dst, + ); + write_rgba_u16_8( + _mm256_extracti128_si256::<1>(r_u16), + _mm256_extracti128_si256::<1>(g_u16), + _mm256_extracti128_si256::<1>(b_u16), + alpha_u16, + dst.add(32), + ); + } else { + let dst = out.as_mut_ptr().add(x * 3); + write_rgb_u16_8( + _mm256_castsi256_si128(r_u16), + _mm256_castsi256_si128(g_u16), + _mm256_castsi256_si128(b_u16), + dst, + ); + write_rgb_u16_8( + _mm256_extracti128_si256::<1>(r_u16), + _mm256_extracti128_si256::<1>(g_u16), + _mm256_extracti128_si256::<1>(b_u16), + dst.add(24), + ); + } x += 16; } if x < width { - scalar::yuv_420p16_to_rgb_u16_row( - &y[x..width], - &u_half[x / 2..width / 2], - &v_half[x / 2..width / 2], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_u = &u_half[x / 2..width / 2]; + let tail_v = &v_half[x / 2..width / 2]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::yuv_420p16_to_rgba_u16_row( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); + } else { + scalar::yuv_420p16_to_rgb_u16_row( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); + } } } } @@ -3157,16 +3417,64 @@ pub(crate) unsafe fn p16_to_rgb_u16_row( matrix: ColorMatrix, full_range: bool, ) { + unsafe { + p16_to_rgb_or_rgba_u16_row::(y, uv_half, rgb_out, width, matrix, full_range); + } +} + +/// AVX2 sibling of [`p16_to_rgba_row`] for native-depth `u16` output. +/// Alpha is `0xFFFF`. +/// +/// # Safety +/// +/// Same as [`p16_to_rgb_u16_row`] plus `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "avx2")] +pub(crate) unsafe fn p16_to_rgba_u16_row( + y: &[u16], + uv_half: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + unsafe { + p16_to_rgb_or_rgba_u16_row::(y, uv_half, rgba_out, width, matrix, full_range); + } +} + +/// Shared AVX2 16-bit P016 → native-depth `u16` kernel. +/// `ALPHA = false` writes RGB triples; `ALPHA = true` writes RGBA +/// quads with constant alpha `0xFFFF`. +/// +/// # Safety +/// +/// 1. **AVX2 must be available.** +/// 2. `width & 1 == 0`. +/// 3. `y.len() >= width`, `uv_half.len() >= width`, +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. 
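+///
+/// Dispatch sketch (hypothetical caller; feature detection is the
+/// caller's responsibility):
+///
+/// ```ignore
+/// if is_x86_feature_detected!("avx2") {
+///     // SAFETY: AVX2 verified; width even; slice lengths hold.
+///     unsafe {
+///         p16_to_rgb_or_rgba_u16_row::<true>(&y, &uv, &mut rgba,
+///                                            w, m, fr);
+///     }
+/// } else {
+///     scalar::p16_to_rgba_u16_row(&y, &uv, &mut rgba, w, m, fr);
+/// }
+/// ```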
+#[inline] +#[target_feature(enable = "avx2")] +pub(crate) unsafe fn p16_to_rgb_or_rgba_u16_row( + y: &[u16], + uv_half: &[u16], + out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert_eq!(width & 1, 0); debug_assert!(y.len() >= width); debug_assert!(uv_half.len() >= width); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::<16, 16>(full_range); const RND: i64 = 1 << 14; unsafe { + let alpha_u16 = _mm_set1_epi16(-1i16); let rnd_v = _mm256_set1_epi64x(RND); let y_off_v = _mm256_set1_epi32(y_off); let y_scale_v = _mm256_set1_epi32(y_scale); @@ -3257,32 +3565,51 @@ pub(crate) unsafe fn p16_to_rgb_u16_row( _mm256_add_epi32(y_hi_scaled, b_dup_hi), )); - let dst = rgb_out.as_mut_ptr().add(x * 3); - write_rgb_u16_8( - _mm256_castsi256_si128(r_u16), - _mm256_castsi256_si128(g_u16), - _mm256_castsi256_si128(b_u16), - dst, - ); - write_rgb_u16_8( - _mm256_extracti128_si256::<1>(r_u16), - _mm256_extracti128_si256::<1>(g_u16), - _mm256_extracti128_si256::<1>(b_u16), - dst.add(24), - ); + if ALPHA { + let dst = out.as_mut_ptr().add(x * 4); + write_rgba_u16_8( + _mm256_castsi256_si128(r_u16), + _mm256_castsi256_si128(g_u16), + _mm256_castsi256_si128(b_u16), + alpha_u16, + dst, + ); + write_rgba_u16_8( + _mm256_extracti128_si256::<1>(r_u16), + _mm256_extracti128_si256::<1>(g_u16), + _mm256_extracti128_si256::<1>(b_u16), + alpha_u16, + dst.add(32), + ); + } else { + let dst = out.as_mut_ptr().add(x * 3); + write_rgb_u16_8( + _mm256_castsi256_si128(r_u16), + _mm256_castsi256_si128(g_u16), + _mm256_castsi256_si128(b_u16), + dst, + ); + write_rgb_u16_8( + _mm256_extracti128_si256::<1>(r_u16), + _mm256_extracti128_si256::<1>(g_u16), + _mm256_extracti128_si256::<1>(b_u16), + dst.add(24), + ); + } x += 16; } if x < width { - scalar::p16_to_rgb_u16_row( - &y[x..width], - &uv_half[x..width], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_uv = &uv_half[x..width]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::p16_to_rgba_u16_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + } else { + scalar::p16_to_rgb_u16_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + } } } } diff --git a/src/row/arch/x86_avx2/tests.rs b/src/row/arch/x86_avx2/tests.rs index f8f28eb..37bb6b8 100644 --- a/src/row/arch/x86_avx2/tests.rs +++ b/src/row/arch/x86_avx2/tests.rs @@ -1779,6 +1779,200 @@ fn avx2_p016_rgba_matches_scalar_all_matrices() { } } +// ---- High-bit 4:2:0 native-depth `u16` RGBA equivalence (Ship 8 Tranche 5b) ---- + +fn check_planar_u16_avx2_rgba_equivalence_n( + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let y = planar_n_plane::(width, 37); + let u = planar_n_plane::(width / 2, 53); + let v = planar_n_plane::(width / 2, 71); + let mut rgba_scalar = std::vec![0u16; width * 4]; + let mut rgba_simd = std::vec![0u16; width * 4]; + scalar::yuv_420p_n_to_rgba_u16_row::( + &y, + &u, + &v, + &mut rgba_scalar, + width, + matrix, + full_range, + ); + unsafe { + yuv_420p_n_to_rgba_u16_row::(&y, &u, &v, &mut rgba_simd, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_simd, + "AVX2 yuv_420p_n<{BITS}>→RGBA u16 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +fn check_pn_u16_avx2_rgba_equivalence_n( 
+ width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let y = p_n_packed_plane::(width, 37); + let u = p_n_packed_plane::(width / 2, 53); + let v = p_n_packed_plane::(width / 2, 71); + let uv = p010_uv_interleave(&u, &v); + let mut rgba_scalar = std::vec![0u16; width * 4]; + let mut rgba_simd = std::vec![0u16; width * 4]; + scalar::p_n_to_rgba_u16_row::(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + unsafe { + p_n_to_rgba_u16_row::(&y, &uv, &mut rgba_simd, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_simd, + "AVX2 Pn<{BITS}>→RGBA u16 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +fn check_yuv420p16_u16_avx2_rgba_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { + let y = p16_plane_avx2(width, 37); + let u = p16_plane_avx2(width / 2, 53); + let v = p16_plane_avx2(width / 2, 71); + let mut rgba_scalar = std::vec![0u16; width * 4]; + let mut rgba_simd = std::vec![0u16; width * 4]; + scalar::yuv_420p16_to_rgba_u16_row(&y, &u, &v, &mut rgba_scalar, width, matrix, full_range); + unsafe { + yuv_420p16_to_rgba_u16_row(&y, &u, &v, &mut rgba_simd, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_simd, + "AVX2 yuv_420p16→RGBA u16 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +fn check_p16_u16_avx2_rgba_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { + let y = p16_plane_avx2(width, 37); + let u = p16_plane_avx2(width / 2, 53); + let v = p16_plane_avx2(width / 2, 71); + let uv = p010_uv_interleave(&u, &v); + let mut rgba_scalar = std::vec![0u16; width * 4]; + let mut rgba_simd = std::vec![0u16; width * 4]; + scalar::p16_to_rgba_u16_row(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + unsafe { + p16_to_rgba_u16_row(&y, &uv, &mut rgba_simd, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_simd, + "AVX2 P016→RGBA u16 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +#[test] +fn avx2_yuv420p_n_rgba_u16_matches_scalar_all_bits() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_planar_u16_avx2_rgba_equivalence_n::<9>(32, m, full); + check_planar_u16_avx2_rgba_equivalence_n::<10>(32, m, full); + check_planar_u16_avx2_rgba_equivalence_n::<12>(32, m, full); + check_planar_u16_avx2_rgba_equivalence_n::<14>(32, m, full); + } + } +} + +#[test] +fn avx2_yuv420p_n_rgba_u16_matches_scalar_tail_and_1920() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + for w in [34usize, 48, 62, 1920, 1922] { + check_planar_u16_avx2_rgba_equivalence_n::<9>(w, ColorMatrix::Bt601, false); + check_planar_u16_avx2_rgba_equivalence_n::<10>(w, ColorMatrix::Bt709, true); + check_planar_u16_avx2_rgba_equivalence_n::<12>(w, ColorMatrix::Bt2020Ncl, false); + check_planar_u16_avx2_rgba_equivalence_n::<14>(w, ColorMatrix::YCgCo, true); + } +} + +#[test] +fn avx2_pn_rgba_u16_matches_scalar_all_bits() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_pn_u16_avx2_rgba_equivalence_n::<10>(32, m, full); + check_pn_u16_avx2_rgba_equivalence_n::<12>(32, m, full); + } + } +} + +#[test] +fn 
avx2_pn_rgba_u16_matches_scalar_tail_and_1920() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + for w in [34usize, 48, 62, 1920, 1922] { + check_pn_u16_avx2_rgba_equivalence_n::<10>(w, ColorMatrix::Bt601, false); + check_pn_u16_avx2_rgba_equivalence_n::<12>(w, ColorMatrix::Bt709, true); + } +} + +#[test] +fn avx2_yuv420p16_rgba_u16_matches_scalar_all_matrices() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_yuv420p16_u16_avx2_rgba_equivalence(32, m, full); + } + } + for w in [34usize, 48, 62, 1920, 1922] { + check_yuv420p16_u16_avx2_rgba_equivalence(w, ColorMatrix::Bt709, false); + } +} + +#[test] +fn avx2_p016_rgba_u16_matches_scalar_all_matrices() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_p16_u16_avx2_rgba_equivalence(32, m, full); + } + } + for w in [34usize, 48, 62, 1920, 1922] { + check_p16_u16_avx2_rgba_equivalence(w, ColorMatrix::Bt709, false); + } +} + // ---- Pn 4:4:4 (P410 / P412 / P416) AVX2 equivalence ----------------- fn high_bit_plane_avx2(n: usize, seed: usize) -> std::vec::Vec { diff --git a/src/row/arch/x86_avx512.rs b/src/row/arch/x86_avx512.rs index 48e7cfa..8138c70 100644 --- a/src/row/arch/x86_avx512.rs +++ b/src/row/arch/x86_avx512.rs @@ -59,6 +59,7 @@ use crate::{ row::{ arch::x86_common::{ rgb_to_hsv_16_pixels, swap_rb_16_pixels, write_rgb_16, write_rgb_u16_8, write_rgba_16, + write_rgba_u16_8, }, scalar, }, @@ -522,13 +523,70 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_u16_row( width: usize, matrix: ColorMatrix, full_range: bool, +) { + unsafe { + yuv_420p_n_to_rgb_or_rgba_u16_row::( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } +} + +/// AVX-512 sibling of [`yuv_420p_n_to_rgba_row`] for native-depth +/// `u16` output. Alpha samples are `(1 << BITS) - 1` (opaque maximum +/// at the input bit depth). +/// +/// # Safety +/// +/// Same as [`yuv_420p_n_to_rgb_u16_row`] plus `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "avx512f,avx512bw")] +pub(crate) unsafe fn yuv_420p_n_to_rgba_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + unsafe { + yuv_420p_n_to_rgb_or_rgba_u16_row::( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } +} + +/// Shared AVX-512 high-bit YUV 4:2:0 → native-depth `u16` kernel. +/// `ALPHA = false` writes RGB triples via 8× `write_quarter` per +/// 64-pixel block; `ALPHA = true` writes RGBA quads via 8× +/// `write_quarter_rgba` with constant alpha `(1 << BITS) - 1`. +/// +/// # Safety +/// +/// 1. **AVX-512F + AVX-512BW must be available on the current CPU.** +/// 2. `width & 1 == 0`. +/// 3. `y.len() >= width`, `u_half.len() >= width / 2`, +/// `v_half.len() >= width / 2`, +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. +/// 4. `BITS` ∈ `{9, 10, 12, 14}`. 
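Safety item 4 is not just documentation: the kernel body opens with a `const { assert!(…) }` block, so instantiating an unsupported `BITS` fails at compile (monomorphization) time rather than at runtime. A reduced model of that guard — `demo_kernel` is illustrative, not part of the crate:

```rust
// Unsupported bit depths are rejected when the generic is
// instantiated: the const block is evaluated per monomorphization.
fn demo_kernel<const BITS: usize>() -> u16 {
    const { assert!(BITS == 9 || BITS == 10 || BITS == 12 || BITS == 14) };
    ((1u32 << BITS) - 1) as u16 // the opaque-alpha maximum at this depth
}

fn main() {
    assert_eq!(demo_kernel::<10>(), 1023);
    assert_eq!(demo_kernel::<14>(), 16383);
    // demo_kernel::<11>() would not compile: the const block panics
    // during constant evaluation, failing the build.
}
```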
+#[inline] +#[target_feature(enable = "avx512f,avx512bw")] +pub(crate) unsafe fn yuv_420p_n_to_rgb_or_rgba_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, ) { const { assert!(BITS == 9 || BITS == 10 || BITS == 12 || BITS == 14) }; + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert_eq!(width & 1, 0); debug_assert!(y.len() >= width); debug_assert!(u_half.len() >= width / 2); debug_assert!(v_half.len() >= width / 2); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); @@ -552,6 +610,7 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_u16_row( let cgv = _mm512_set1_epi32(coeffs.g_v()); let cbu = _mm512_set1_epi32(coeffs.b_u()); let cbv = _mm512_set1_epi32(coeffs.b_v()); + let alpha_u16 = _mm_set1_epi16(out_max); let pack_fixup = _mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7); let dup_lo_idx = _mm512_setr_epi64(0, 1, 8, 9, 2, 3, 10, 11); @@ -618,29 +677,46 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_u16_row( // Eight 8‑pixel u16 writes per 64‑pixel block. For each i16x32 // channel vector we extract four 128‑bit quarters and hand each // to the shared SSE4.1 u16 interleave helper. - let dst = rgb_out.as_mut_ptr().add(x * 3); - write_quarter(r_lo, g_lo, b_lo, 0, dst); - write_quarter(r_lo, g_lo, b_lo, 1, dst.add(24)); - write_quarter(r_lo, g_lo, b_lo, 2, dst.add(48)); - write_quarter(r_lo, g_lo, b_lo, 3, dst.add(72)); - write_quarter(r_hi, g_hi, b_hi, 0, dst.add(96)); - write_quarter(r_hi, g_hi, b_hi, 1, dst.add(120)); - write_quarter(r_hi, g_hi, b_hi, 2, dst.add(144)); - write_quarter(r_hi, g_hi, b_hi, 3, dst.add(168)); + if ALPHA { + let dst = out.as_mut_ptr().add(x * 4); + write_quarter_rgba(r_lo, g_lo, b_lo, alpha_u16, 0, dst); + write_quarter_rgba(r_lo, g_lo, b_lo, alpha_u16, 1, dst.add(32)); + write_quarter_rgba(r_lo, g_lo, b_lo, alpha_u16, 2, dst.add(64)); + write_quarter_rgba(r_lo, g_lo, b_lo, alpha_u16, 3, dst.add(96)); + write_quarter_rgba(r_hi, g_hi, b_hi, alpha_u16, 0, dst.add(128)); + write_quarter_rgba(r_hi, g_hi, b_hi, alpha_u16, 1, dst.add(160)); + write_quarter_rgba(r_hi, g_hi, b_hi, alpha_u16, 2, dst.add(192)); + write_quarter_rgba(r_hi, g_hi, b_hi, alpha_u16, 3, dst.add(224)); + } else { + let dst = out.as_mut_ptr().add(x * 3); + write_quarter(r_lo, g_lo, b_lo, 0, dst); + write_quarter(r_lo, g_lo, b_lo, 1, dst.add(24)); + write_quarter(r_lo, g_lo, b_lo, 2, dst.add(48)); + write_quarter(r_lo, g_lo, b_lo, 3, dst.add(72)); + write_quarter(r_hi, g_hi, b_hi, 0, dst.add(96)); + write_quarter(r_hi, g_hi, b_hi, 1, dst.add(120)); + write_quarter(r_hi, g_hi, b_hi, 2, dst.add(144)); + write_quarter(r_hi, g_hi, b_hi, 3, dst.add(168)); + } x += 64; } if x < width { - scalar::yuv_420p_n_to_rgb_u16_row::( - &y[x..width], - &u_half[x / 2..width / 2], - &v_half[x / 2..width / 2], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_u = &u_half[x / 2..width / 2]; + let tail_v = &v_half[x / 2..width / 2]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::yuv_420p_n_to_rgba_u16_row::( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); + } else { + scalar::yuv_420p_n_to_rgb_u16_row::( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); + } } } } @@ -696,6 +772,53 @@ unsafe fn write_quarter(r: __m512i, g: __m512i, b: 
__m512i, idx: u8, ptr: *mut u } } +/// RGBA sibling of [`write_quarter`]. Extracts one 128‑bit quarter of +/// each `i16x32` channel vector and hands it (plus a splatted alpha) +/// to [`write_rgba_u16_8`]. +/// +/// # Safety +/// +/// Same as [`write_rgba_u16_8`] — `ptr` must point to at least 64 +/// writable bytes (32 `u16`). Caller's `target_feature` must include +/// AVX‑512F + AVX‑512BW (so `_mm512_extracti32x4_epi32` is available) +/// and SSE2 (for the underlying unpack/store inside +/// `write_rgba_u16_8`). +#[inline(always)] +unsafe fn write_quarter_rgba( + r: __m512i, + g: __m512i, + b: __m512i, + a: __m128i, + idx: u8, + ptr: *mut u16, +) { + unsafe { + let (rq, gq, bq) = match idx { + 0 => ( + _mm512_extracti32x4_epi32::<0>(r), + _mm512_extracti32x4_epi32::<0>(g), + _mm512_extracti32x4_epi32::<0>(b), + ), + 1 => ( + _mm512_extracti32x4_epi32::<1>(r), + _mm512_extracti32x4_epi32::<1>(g), + _mm512_extracti32x4_epi32::<1>(b), + ), + 2 => ( + _mm512_extracti32x4_epi32::<2>(r), + _mm512_extracti32x4_epi32::<2>(g), + _mm512_extracti32x4_epi32::<2>(b), + ), + _ => ( + _mm512_extracti32x4_epi32::<3>(r), + _mm512_extracti32x4_epi32::<3>(g), + _mm512_extracti32x4_epi32::<3>(b), + ), + }; + write_rgba_u16_8(rq, gq, bq, a, ptr); + } +} + /// AVX-512 YUV 4:4:4 planar 10/12/14-bit → packed **u8** RGB. /// Const-generic over `BITS ∈ {10, 12, 14}`. Block size 64 pixels. /// @@ -1530,10 +1653,62 @@ pub(crate) unsafe fn p_n_to_rgb_u16_row( matrix: ColorMatrix, full_range: bool, ) { + unsafe { + p_n_to_rgb_or_rgba_u16_row::(y, uv_half, rgb_out, width, matrix, full_range); + } +} + +/// AVX-512 sibling of [`p_n_to_rgba_row`] for native-depth `u16` +/// output. Alpha samples are `(1 << BITS) - 1` (opaque maximum at the +/// input bit depth). P016 has its own kernel family — never routed here. +/// +/// # Safety +/// +/// Same as [`p_n_to_rgb_u16_row`] plus `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "avx512f,avx512bw")] +pub(crate) unsafe fn p_n_to_rgba_u16_row( + y: &[u16], + uv_half: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + unsafe { + p_n_to_rgb_or_rgba_u16_row::(y, uv_half, rgba_out, width, matrix, full_range); + } +} + +/// Shared AVX-512 Pn → native-depth `u16` kernel. `ALPHA = false` +/// writes RGB triples via 8× `write_quarter` per 64-pixel block; +/// `ALPHA = true` writes RGBA quads via 8× `write_quarter_rgba` with +/// constant alpha `(1 << BITS) - 1`. P016 has its own kernel family — +/// never routed here. +/// +/// # Safety +/// +/// 1. **AVX-512F + AVX-512BW must be available on the current CPU.** +/// 2. `width & 1 == 0`. +/// 3. `y.len() >= width`, `uv_half.len() >= width`, +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. +/// 4. `BITS` ∈ `{10, 12}`. 
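The `match idx` inside `write_quarter_rgba` above exists because `_mm512_extracti32x4_epi32` takes its lane index as a const generic, so a runtime index must be folded onto four monomorphized arms. A portable sketch of the same dispatch shape, with a plain array standing in for the 512-bit register:

```rust
// Const-indexed extraction of one 8-lane quarter of a 32-lane vector.
fn quarter<const IDX: usize>(v: &[u16; 32]) -> [u16; 8] {
    let mut q = [0u16; 8];
    q.copy_from_slice(&v[IDX * 8..IDX * 8 + 8]);
    q
}

// Runtime index folded onto the four const instantiations, exactly
// like the kernel's `match idx`.
fn quarter_dyn(v: &[u16; 32], idx: u8) -> [u16; 8] {
    match idx {
        0 => quarter::<0>(v),
        1 => quarter::<1>(v),
        2 => quarter::<2>(v),
        _ => quarter::<3>(v),
    }
}

fn main() {
    let v: [u16; 32] = core::array::from_fn(|i| i as u16);
    assert_eq!(quarter_dyn(&v, 2), [16, 17, 18, 19, 20, 21, 22, 23]);
}
```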
+#[inline]
+#[target_feature(enable = "avx512f,avx512bw")]
+pub(crate) unsafe fn p_n_to_rgb_or_rgba_u16_row<const BITS: usize, const ALPHA: bool>(
+    y: &[u16],
+    uv_half: &[u16],
+    out: &mut [u16],
+    width: usize,
+    matrix: ColorMatrix,
+    full_range: bool,
+) {
+    const { assert!(BITS == 10 || BITS == 12) };
+    let bpp: usize = if ALPHA { 4 } else { 3 };
     debug_assert_eq!(width & 1, 0);
     debug_assert!(y.len() >= width);
     debug_assert!(uv_half.len() >= width);
-    debug_assert!(rgb_out.len() >= width * 3);
+    debug_assert!(out.len() >= width * bpp);
 
     let coeffs = scalar::Coefficients::for_matrix(matrix);
     let (y_off, y_scale, c_scale) = scalar::range_params_n::<BITS, BITS>(full_range);
@@ -1558,6 +1733,7 @@ pub(crate) unsafe fn p_n_to_rgb_u16_row(
         let cgv = _mm512_set1_epi32(coeffs.g_v());
         let cbu = _mm512_set1_epi32(coeffs.b_u());
         let cbv = _mm512_set1_epi32(coeffs.b_v());
+        let alpha_u16 = _mm_set1_epi16(out_max);
 
         let pack_fixup = _mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7);
         let dup_lo_idx = _mm512_setr_epi64(0, 1, 8, 9, 2, 3, 10, 11);
@@ -1615,28 +1791,41 @@ pub(crate) unsafe fn p_n_to_rgb_u16_row(
         let b_lo = clamp_u16_max_x32(_mm512_adds_epi16(y_scaled_lo, b_dup_lo), zero_v, max_v);
         let b_hi = clamp_u16_max_x32(_mm512_adds_epi16(y_scaled_hi, b_dup_hi), zero_v, max_v);
 
-        let dst = rgb_out.as_mut_ptr().add(x * 3);
-        write_quarter(r_lo, g_lo, b_lo, 0, dst);
-        write_quarter(r_lo, g_lo, b_lo, 1, dst.add(24));
-        write_quarter(r_lo, g_lo, b_lo, 2, dst.add(48));
-        write_quarter(r_lo, g_lo, b_lo, 3, dst.add(72));
-        write_quarter(r_hi, g_hi, b_hi, 0, dst.add(96));
-        write_quarter(r_hi, g_hi, b_hi, 1, dst.add(120));
-        write_quarter(r_hi, g_hi, b_hi, 2, dst.add(144));
-        write_quarter(r_hi, g_hi, b_hi, 3, dst.add(168));
+        if ALPHA {
+            let dst = out.as_mut_ptr().add(x * 4);
+            write_quarter_rgba(r_lo, g_lo, b_lo, alpha_u16, 0, dst);
+            write_quarter_rgba(r_lo, g_lo, b_lo, alpha_u16, 1, dst.add(32));
+            write_quarter_rgba(r_lo, g_lo, b_lo, alpha_u16, 2, dst.add(64));
+            write_quarter_rgba(r_lo, g_lo, b_lo, alpha_u16, 3, dst.add(96));
+            write_quarter_rgba(r_hi, g_hi, b_hi, alpha_u16, 0, dst.add(128));
+            write_quarter_rgba(r_hi, g_hi, b_hi, alpha_u16, 1, dst.add(160));
+            write_quarter_rgba(r_hi, g_hi, b_hi, alpha_u16, 2, dst.add(192));
+            write_quarter_rgba(r_hi, g_hi, b_hi, alpha_u16, 3, dst.add(224));
+        } else {
+            let dst = out.as_mut_ptr().add(x * 3);
+            write_quarter(r_lo, g_lo, b_lo, 0, dst);
+            write_quarter(r_lo, g_lo, b_lo, 1, dst.add(24));
+            write_quarter(r_lo, g_lo, b_lo, 2, dst.add(48));
+            write_quarter(r_lo, g_lo, b_lo, 3, dst.add(72));
+            write_quarter(r_hi, g_hi, b_hi, 0, dst.add(96));
+            write_quarter(r_hi, g_hi, b_hi, 1, dst.add(120));
+            write_quarter(r_hi, g_hi, b_hi, 2, dst.add(144));
+            write_quarter(r_hi, g_hi, b_hi, 3, dst.add(168));
+        }
 
         x += 64;
     }
 
     if x < width {
-        scalar::p_n_to_rgb_u16_row::<BITS>(
-            &y[x..width],
-            &uv_half[x..width],
-            &mut rgb_out[x * 3..width * 3],
-            width - x,
-            matrix,
-            full_range,
-        );
+        let tail_y = &y[x..width];
+        let tail_uv = &uv_half[x..width];
+        let tail_out = &mut out[x * bpp..width * bpp];
+        let tail_w = width - x;
+        if ALPHA {
+            scalar::p_n_to_rgba_u16_row::<BITS>(tail_y, tail_uv, tail_out, tail_w, matrix, full_range);
+        } else {
+            scalar::p_n_to_rgb_u16_row::<BITS>(tail_y, tail_uv, tail_out, tail_w, matrix, full_range);
+        }
    }
    }
 }
@@ -2730,6 +2919,40 @@ unsafe fn write_rgb_u16_32(r: __m512i, g: __m512i, b: __m512i, ptr: *mut u16) {
     }
 }
 
+/// Writes 32 pixels of packed RGBA-u16 (128 u16 = 256 bytes) by
+/// splitting each u16x32 channel vector into four 128-bit quarters and
+/// calling the shared [`write_rgba_u16_8`] helper four times.
Alpha +/// is supplied as a single i16x8 vector splatted into all 32 alpha +/// lanes. +/// +/// # Safety +/// +/// `ptr` must point to at least 256 writable bytes. +#[inline(always)] +unsafe fn write_rgba_u16_32(r: __m512i, g: __m512i, b: __m512i, a: __m128i, ptr: *mut u16) { + unsafe { + let r0: __m128i = _mm512_castsi512_si128(r); + let r1: __m128i = _mm512_extracti32x4_epi32::<1>(r); + let r2: __m128i = _mm512_extracti32x4_epi32::<2>(r); + let r3: __m128i = _mm512_extracti32x4_epi32::<3>(r); + let g0: __m128i = _mm512_castsi512_si128(g); + let g1: __m128i = _mm512_extracti32x4_epi32::<1>(g); + let g2: __m128i = _mm512_extracti32x4_epi32::<2>(g); + let g3: __m128i = _mm512_extracti32x4_epi32::<3>(g); + let b0: __m128i = _mm512_castsi512_si128(b); + let b1: __m128i = _mm512_extracti32x4_epi32::<1>(b); + let b2: __m128i = _mm512_extracti32x4_epi32::<2>(b); + let b3: __m128i = _mm512_extracti32x4_epi32::<3>(b); + + // Each `write_rgba_u16_8` writes 8 pixels × 4 × u16 = 64 bytes = + // 32 u16 elements. Four calls → 128 u16 = 32 pixels. + write_rgba_u16_8(r0, g0, b0, a, ptr); + write_rgba_u16_8(r1, g1, b1, a, ptr.add(32)); + write_rgba_u16_8(r2, g2, b2, a, ptr.add(64)); + write_rgba_u16_8(r3, g3, b3, a, ptr.add(96)); + } +} + // ===== 16-bit YUV → RGB ================================================== /// `(Y_u16x32 - y_off) * y_scale + RND >> 15` for full u16 Y samples. @@ -2957,11 +3180,66 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_u16_row( matrix: ColorMatrix, full_range: bool, ) { + unsafe { + yuv_420p16_to_rgb_or_rgba_u16_row::( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } +} + +/// AVX-512 sibling of [`yuv_420p16_to_rgba_row`] for native-depth +/// `u16` output. Alpha is `0xFFFF`. +/// +/// # Safety +/// +/// Same as [`yuv_420p16_to_rgb_u16_row`] plus `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "avx512f,avx512bw")] +pub(crate) unsafe fn yuv_420p16_to_rgba_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + unsafe { + yuv_420p16_to_rgb_or_rgba_u16_row::( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } +} + +/// Shared AVX-512 16-bit YUV 4:2:0 → native-depth `u16` kernel. +/// `ALPHA = false` writes RGB triples via `write_rgb_u16_32`; +/// `ALPHA = true` writes RGBA quads via `write_rgba_u16_32` with +/// constant alpha `0xFFFF`. +/// +/// # Safety +/// +/// 1. **AVX-512F + AVX-512BW must be available.** +/// 2. `width & 1 == 0`. +/// 3. `y.len() >= width`, `u_half.len() >= width / 2`, +/// `v_half.len() >= width / 2`, +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. +#[inline] +#[target_feature(enable = "avx512f,avx512bw")] +pub(crate) unsafe fn yuv_420p16_to_rgb_or_rgba_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert_eq!(width & 1, 0); debug_assert!(y.len() >= width); debug_assert!(u_half.len() >= width / 2); debug_assert!(v_half.len() >= width / 2); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::<16, 16>(full_range); @@ -2972,6 +3250,7 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_u16_row( // adds below are bounded by `while x + 32 <= width` and the caller- // promised slice lengths. 
unsafe { + let alpha_u16 = _mm_set1_epi16(-1i16); let rnd_i64_v = _mm512_set1_epi64(RND_I64); let rnd_i32_v = _mm512_set1_epi32(RND_I32); let y_off_v = _mm512_set1_epi32(y_off); @@ -3081,22 +3360,31 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_u16_row( let g_u16 = _mm512_permutexvar_epi64(pack_fixup, _mm512_packus_epi32(g_lo_i32, g_hi_i32)); let b_u16 = _mm512_permutexvar_epi64(pack_fixup, _mm512_packus_epi32(b_lo_i32, b_hi_i32)); - // Write 32 pixels (96 u16) via 4× 8-pixel helper. - write_rgb_u16_32(r_u16, g_u16, b_u16, rgb_out.as_mut_ptr().add(x * 3)); + // Write 32 pixels via the appropriate 4× 8-pixel helper. + if ALPHA { + write_rgba_u16_32(r_u16, g_u16, b_u16, alpha_u16, out.as_mut_ptr().add(x * 4)); + } else { + write_rgb_u16_32(r_u16, g_u16, b_u16, out.as_mut_ptr().add(x * 3)); + } x += 32; } if x < width { - scalar::yuv_420p16_to_rgb_u16_row( - &y[x..width], - &u_half[x / 2..width / 2], - &v_half[x / 2..width / 2], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_u = &u_half[x / 2..width / 2]; + let tail_v = &v_half[x / 2..width / 2]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::yuv_420p16_to_rgba_u16_row( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); + } else { + scalar::yuv_420p16_to_rgb_u16_row( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); + } } } } @@ -3260,7 +3548,6 @@ pub(crate) unsafe fn p16_to_rgb_or_rgba_row( } /// AVX-512 P016 → packed **16-bit** RGB. -/// Delegates to SSE4.1 (i64 arithmetic). /// /// # Safety /// @@ -3276,34 +3563,59 @@ pub(crate) unsafe fn p16_to_rgb_u16_row( full_range: bool, ) { unsafe { - p16_to_rgb_u16_row_impl(y, uv_half, rgb_out, width, matrix, full_range); + p16_to_rgb_or_rgba_u16_row::(y, uv_half, rgb_out, width, matrix, full_range); } } -/// Native AVX-512 P016 → u16 kernel, 32 pixels per iter. Shares the -/// 16-bit u16 arithmetic structure with -/// [`yuv_420p16_to_rgb_u16_row`]; the only difference is an inline -/// U/V deinterleave at the UV load. +/// AVX-512 sibling of [`p16_to_rgba_row`] for native-depth `u16` +/// output. Alpha is `0xFFFF`. /// /// # Safety /// -/// Must be called from an AVX-512BW-enabled context. Caller upholds -/// `width & 1 == 0`, `y.len() >= width`, `uv_half.len() >= width`, -/// `rgb_out.len() >= 3 * width`. +/// Same as [`p16_to_rgb_u16_row`] plus `rgba_out.len() >= 4 * width`. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -unsafe fn p16_to_rgb_u16_row_impl( +pub(crate) unsafe fn p16_to_rgba_u16_row( y: &[u16], uv_half: &[u16], - rgb_out: &mut [u16], + rgba_out: &mut [u16], width: usize, matrix: ColorMatrix, full_range: bool, ) { + unsafe { + p16_to_rgb_or_rgba_u16_row::(y, uv_half, rgba_out, width, matrix, full_range); + } +} + +/// Shared AVX-512 16-bit P016 → native-depth `u16` kernel. +/// `ALPHA = false` writes RGB triples via `write_rgb_u16_32`; +/// `ALPHA = true` writes RGBA quads via `write_rgba_u16_32` with +/// constant alpha `0xFFFF`. 32 pixels per iter. Shares the 16-bit +/// arithmetic structure with [`yuv_420p16_to_rgb_or_rgba_u16_row`]; +/// only difference is an inline U/V deinterleave at the UV load. +/// +/// # Safety +/// +/// 1. **AVX-512F + AVX-512BW must be available on the current CPU.** +/// 2. `width & 1 == 0`. +/// 3. `y.len() >= width`, `uv_half.len() >= width`, +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. 
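That inline U/V deinterleave is the only structural delta from the planar kernel. In scalar terms the semi-planar chroma row is plain even/odd indexing — a small reference model (safe Rust, illustrative only):

```rust
// Semi-planar chroma layout consumed by the P016/Pn kernels: one
// interleaved row `U0 V0 U1 V1 ...`, shared by each 2x2 pixel block.
// The SIMD kernels split it in-register at the load.
fn deinterleave_uv(uv_half: &[u16]) -> (Vec<u16>, Vec<u16>) {
    let u: Vec<u16> = uv_half.iter().copied().step_by(2).collect();
    let v: Vec<u16> = uv_half.iter().skip(1).copied().step_by(2).collect();
    (u, v)
}

fn main() {
    let uv = [10u16, 20, 11, 21, 12, 22]; // U0 V0 U1 V1 U2 V2
    let (u, v) = deinterleave_uv(&uv);
    assert_eq!(u, [10, 11, 12]);
    assert_eq!(v, [20, 21, 22]);
}
```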
+#[inline] +#[target_feature(enable = "avx512f,avx512bw")] +pub(crate) unsafe fn p16_to_rgb_or_rgba_u16_row( + y: &[u16], + uv_half: &[u16], + out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert_eq!(width & 1, 0); debug_assert!(y.len() >= width); debug_assert!(uv_half.len() >= width); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::<16, 16>(full_range); @@ -3314,6 +3626,7 @@ unsafe fn p16_to_rgb_u16_row_impl( // adds are bounded by `while x + 32 <= width` and caller-promised // slice lengths. unsafe { + let alpha_u16 = _mm_set1_epi16(-1i16); let rnd_i64_v = _mm512_set1_epi64(RND_I64); let rnd_i32_v = _mm512_set1_epi32(RND_I32); let y_off_v = _mm512_set1_epi32(y_off); @@ -3409,20 +3722,25 @@ unsafe fn p16_to_rgb_u16_row_impl( let g_u16 = _mm512_permutexvar_epi64(pack_fixup, _mm512_packus_epi32(g_lo_i32, g_hi_i32)); let b_u16 = _mm512_permutexvar_epi64(pack_fixup, _mm512_packus_epi32(b_lo_i32, b_hi_i32)); - write_rgb_u16_32(r_u16, g_u16, b_u16, rgb_out.as_mut_ptr().add(x * 3)); + if ALPHA { + write_rgba_u16_32(r_u16, g_u16, b_u16, alpha_u16, out.as_mut_ptr().add(x * 4)); + } else { + write_rgb_u16_32(r_u16, g_u16, b_u16, out.as_mut_ptr().add(x * 3)); + } x += 32; } if x < width { - scalar::p16_to_rgb_u16_row( - &y[x..width], - &uv_half[x..width], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_uv = &uv_half[x..width]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::p16_to_rgba_u16_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + } else { + scalar::p16_to_rgb_u16_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + } } } } diff --git a/src/row/arch/x86_avx512/tests.rs b/src/row/arch/x86_avx512/tests.rs index 2591060..44ab725 100644 --- a/src/row/arch/x86_avx512/tests.rs +++ b/src/row/arch/x86_avx512/tests.rs @@ -1793,6 +1793,204 @@ fn avx512_p016_rgba_matches_scalar_all_matrices() { } } +// ---- High-bit 4:2:0 native-depth `u16` RGBA equivalence (Ship 8 Tranche 5b) ---- + +fn check_planar_u16_avx512_rgba_equivalence_n( + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let y = planar_n_plane::(width, 37); + let u = planar_n_plane::(width / 2, 53); + let v = planar_n_plane::(width / 2, 71); + let mut rgba_scalar = std::vec![0u16; width * 4]; + let mut rgba_simd = std::vec![0u16; width * 4]; + scalar::yuv_420p_n_to_rgba_u16_row::( + &y, + &u, + &v, + &mut rgba_scalar, + width, + matrix, + full_range, + ); + unsafe { + yuv_420p_n_to_rgba_u16_row::(&y, &u, &v, &mut rgba_simd, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_simd, + "AVX-512 yuv_420p_n<{BITS}>→RGBA u16 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +fn check_pn_u16_avx512_rgba_equivalence_n( + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let y = p_n_packed_plane::(width, 37); + let u = p_n_packed_plane::(width / 2, 53); + let v = p_n_packed_plane::(width / 2, 71); + let uv = p010_uv_interleave(&u, &v); + let mut rgba_scalar = std::vec![0u16; width * 4]; + let mut rgba_simd = std::vec![0u16; width * 4]; + scalar::p_n_to_rgba_u16_row::(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + unsafe { + p_n_to_rgba_u16_row::(&y, &uv, &mut rgba_simd, width, matrix, full_range); + 
} + assert_eq!( + rgba_scalar, rgba_simd, + "AVX-512 Pn<{BITS}>→RGBA u16 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +fn check_yuv420p16_u16_avx512_rgba_equivalence( + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let y = p16_plane_avx512(width, 37); + let u = p16_plane_avx512(width / 2, 53); + let v = p16_plane_avx512(width / 2, 71); + let mut rgba_scalar = std::vec![0u16; width * 4]; + let mut rgba_simd = std::vec![0u16; width * 4]; + scalar::yuv_420p16_to_rgba_u16_row(&y, &u, &v, &mut rgba_scalar, width, matrix, full_range); + unsafe { + yuv_420p16_to_rgba_u16_row(&y, &u, &v, &mut rgba_simd, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_simd, + "AVX-512 yuv_420p16→RGBA u16 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +fn check_p16_u16_avx512_rgba_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { + let y = p16_plane_avx512(width, 37); + let u = p16_plane_avx512(width / 2, 53); + let v = p16_plane_avx512(width / 2, 71); + let uv = p010_uv_interleave(&u, &v); + let mut rgba_scalar = std::vec![0u16; width * 4]; + let mut rgba_simd = std::vec![0u16; width * 4]; + scalar::p16_to_rgba_u16_row(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + unsafe { + p16_to_rgba_u16_row(&y, &uv, &mut rgba_simd, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_simd, + "AVX-512 P016→RGBA u16 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +#[test] +fn avx512_yuv420p_n_rgba_u16_matches_scalar_all_bits() { + if !std::arch::is_x86_feature_detected!("avx512bw") { + return; + } + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_planar_u16_avx512_rgba_equivalence_n::<9>(64, m, full); + check_planar_u16_avx512_rgba_equivalence_n::<10>(64, m, full); + check_planar_u16_avx512_rgba_equivalence_n::<12>(64, m, full); + check_planar_u16_avx512_rgba_equivalence_n::<14>(64, m, full); + } + } +} + +#[test] +fn avx512_yuv420p_n_rgba_u16_matches_scalar_tail_and_1920() { + if !std::arch::is_x86_feature_detected!("avx512bw") { + return; + } + for w in [66usize, 96, 126, 1920, 1922] { + check_planar_u16_avx512_rgba_equivalence_n::<9>(w, ColorMatrix::Bt601, false); + check_planar_u16_avx512_rgba_equivalence_n::<10>(w, ColorMatrix::Bt709, true); + check_planar_u16_avx512_rgba_equivalence_n::<12>(w, ColorMatrix::Bt2020Ncl, false); + check_planar_u16_avx512_rgba_equivalence_n::<14>(w, ColorMatrix::YCgCo, true); + } +} + +#[test] +fn avx512_pn_rgba_u16_matches_scalar_all_bits() { + if !std::arch::is_x86_feature_detected!("avx512bw") { + return; + } + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_pn_u16_avx512_rgba_equivalence_n::<10>(64, m, full); + check_pn_u16_avx512_rgba_equivalence_n::<12>(64, m, full); + } + } +} + +#[test] +fn avx512_pn_rgba_u16_matches_scalar_tail_and_1920() { + if !std::arch::is_x86_feature_detected!("avx512bw") { + return; + } + for w in [66usize, 96, 126, 1920, 1922] { + check_pn_u16_avx512_rgba_equivalence_n::<10>(w, ColorMatrix::Bt601, false); + check_pn_u16_avx512_rgba_equivalence_n::<12>(w, ColorMatrix::Bt709, true); + } +} + +#[test] +fn avx512_yuv420p16_rgba_u16_matches_scalar_all_matrices() { + if 
!std::arch::is_x86_feature_detected!("avx512bw") { + return; + } + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_yuv420p16_u16_avx512_rgba_equivalence(64, m, full); + } + } + for w in [66usize, 96, 126, 1920, 1922] { + check_yuv420p16_u16_avx512_rgba_equivalence(w, ColorMatrix::Bt709, false); + } +} + +#[test] +fn avx512_p016_rgba_u16_matches_scalar_all_matrices() { + if !std::arch::is_x86_feature_detected!("avx512bw") { + return; + } + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_p16_u16_avx512_rgba_equivalence(64, m, full); + } + } + for w in [66usize, 96, 126, 1920, 1922] { + check_p16_u16_avx512_rgba_equivalence(w, ColorMatrix::Bt709, false); + } +} + // ---- Pn 4:4:4 (P410 / P412 / P416) AVX-512 equivalence ------------- fn high_bit_plane_avx512(n: usize, seed: usize) -> std::vec::Vec { diff --git a/src/row/arch/x86_common.rs b/src/row/arch/x86_common.rs index 7cf7aca..d7ff79b 100644 --- a/src/row/arch/x86_common.rs +++ b/src/row/arch/x86_common.rs @@ -233,6 +233,42 @@ pub(super) unsafe fn write_rgb_u16_8(r: __m128i, g: __m128i, b: __m128i, ptr: *m } } +/// Interleaves 8 R/G/B/A `u16` samples into packed RGBA quads (32 +/// `u16` = 64 bytes). Two 16-bit unpack stages followed by two 32-bit +/// unpack stages produce four 16-byte chunks of `[R, G, B, A]` quads, +/// stored back-to-back via `_mm_storeu_si128`. +/// +/// # Safety +/// +/// - `ptr` must point to at least 64 writable bytes (aligned or +/// unaligned — we use `storeu`). +/// - The calling function must have SSE2 available (the unpack + +/// `storeu_si128` intrinsics; SSE4.1 / AVX2 / AVX-512 supersets all +/// satisfy this). +#[inline(always)] +pub(super) unsafe fn write_rgba_u16_8( + r: __m128i, + g: __m128i, + b: __m128i, + a: __m128i, + ptr: *mut u16, +) { + unsafe { + let rg_lo = _mm_unpacklo_epi16(r, g); + let rg_hi = _mm_unpackhi_epi16(r, g); + let ba_lo = _mm_unpacklo_epi16(b, a); + let ba_hi = _mm_unpackhi_epi16(b, a); + let q0 = _mm_unpacklo_epi32(rg_lo, ba_lo); + let q1 = _mm_unpackhi_epi32(rg_lo, ba_lo); + let q2 = _mm_unpacklo_epi32(rg_hi, ba_hi); + let q3 = _mm_unpackhi_epi32(rg_hi, ba_hi); + _mm_storeu_si128(ptr.cast(), q0); + _mm_storeu_si128(ptr.add(8).cast(), q1); + _mm_storeu_si128(ptr.add(16).cast(), q2); + _mm_storeu_si128(ptr.add(24).cast(), q3); + } +} + /// Swaps the outer two channels of 16 packed 3‑byte pixels (48 bytes /// in, 48 bytes out). Drives both BGR→RGB and RGB→BGR conversions /// since the transformation is self‑inverse. diff --git a/src/row/arch/x86_sse41.rs b/src/row/arch/x86_sse41.rs index 43400f9..a05ce4b 100644 --- a/src/row/arch/x86_sse41.rs +++ b/src/row/arch/x86_sse41.rs @@ -42,6 +42,7 @@ use crate::{ row::{ arch::x86_common::{ rgb_to_hsv_16_pixels, swap_rb_16_pixels, write_rgb_16, write_rgb_u16_8, write_rgba_16, + write_rgba_u16_8, }, scalar, }, @@ -473,10 +474,62 @@ pub(crate) unsafe fn p_n_to_rgb_u16_row( matrix: ColorMatrix, full_range: bool, ) { + unsafe { + p_n_to_rgb_or_rgba_u16_row::(y, uv_half, rgb_out, width, matrix, full_range); + } +} + +/// SSE4.1 sibling of [`p_n_to_rgba_row`] for native-depth `u16` output. +/// Alpha samples are `(1 << BITS) - 1` (opaque maximum at the input +/// bit depth). P016 has its own kernel family — never routed here. 
+/// +/// # Safety +/// +/// Same as [`p_n_to_rgb_u16_row`] plus `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "sse4.1")] +pub(crate) unsafe fn p_n_to_rgba_u16_row( + y: &[u16], + uv_half: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + unsafe { + p_n_to_rgb_or_rgba_u16_row::(y, uv_half, rgba_out, width, matrix, full_range); + } +} + +/// Shared SSE4.1 Pn → native-depth `u16` kernel. `ALPHA = false` +/// writes RGB triples via `write_rgb_u16_8`; `ALPHA = true` writes +/// RGBA quads via `write_rgba_u16_8` with constant alpha +/// `(1 << BITS) - 1`. P016 has its own kernel family — never routed +/// here. +/// +/// # Safety +/// +/// 1. **SSE4.1 must be available on the current CPU.** +/// 2. `width & 1 == 0`. +/// 3. `y.len() >= width`, `uv_half.len() >= width`, +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. +/// 4. `BITS` ∈ `{10, 12}`. +#[inline] +#[target_feature(enable = "sse4.1")] +pub(crate) unsafe fn p_n_to_rgb_or_rgba_u16_row( + y: &[u16], + uv_half: &[u16], + out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + const { assert!(BITS == 10 || BITS == 12) }; + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert_eq!(width & 1, 0); debug_assert!(y.len() >= width); debug_assert!(uv_half.len() >= width); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); @@ -502,6 +555,7 @@ pub(crate) unsafe fn p_n_to_rgb_u16_row( let cgv = _mm_set1_epi32(coeffs.g_v()); let cbu = _mm_set1_epi32(coeffs.b_u()); let cbv = _mm_set1_epi32(coeffs.b_v()); + let alpha_u16 = _mm_set1_epi16(out_max); let mut x = 0usize; while x + 16 <= width { @@ -545,21 +599,33 @@ pub(crate) unsafe fn p_n_to_rgb_u16_row( let b_lo = clamp_u16_max(_mm_adds_epi16(y_scaled_lo, b_dup_lo), zero_v, max_v); let b_hi = clamp_u16_max(_mm_adds_epi16(y_scaled_hi, b_dup_hi), zero_v, max_v); - write_rgb_u16_8(r_lo, g_lo, b_lo, rgb_out.as_mut_ptr().add(x * 3)); - write_rgb_u16_8(r_hi, g_hi, b_hi, rgb_out.as_mut_ptr().add(x * 3 + 24)); + if ALPHA { + write_rgba_u16_8(r_lo, g_lo, b_lo, alpha_u16, out.as_mut_ptr().add(x * 4)); + write_rgba_u16_8( + r_hi, + g_hi, + b_hi, + alpha_u16, + out.as_mut_ptr().add(x * 4 + 32), + ); + } else { + write_rgb_u16_8(r_lo, g_lo, b_lo, out.as_mut_ptr().add(x * 3)); + write_rgb_u16_8(r_hi, g_hi, b_hi, out.as_mut_ptr().add(x * 3 + 24)); + } x += 16; } if x < width { - scalar::p_n_to_rgb_u16_row::( - &y[x..width], - &uv_half[x..width], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_uv = &uv_half[x..width]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::p_n_to_rgba_u16_row::(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + } else { + scalar::p_n_to_rgb_u16_row::(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + } } } } @@ -830,13 +896,70 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_u16_row( width: usize, matrix: ColorMatrix, full_range: bool, +) { + unsafe { + yuv_420p_n_to_rgb_or_rgba_u16_row::( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } +} + +/// SSE4.1 sibling of [`yuv_420p_n_to_rgba_row`] for native-depth `u16` +/// output. Alpha samples are `(1 << BITS) - 1` (opaque maximum at the +/// input bit depth). 
+/// +/// # Safety +/// +/// Same as [`yuv_420p_n_to_rgb_u16_row`] plus `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "sse4.1")] +pub(crate) unsafe fn yuv_420p_n_to_rgba_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + unsafe { + yuv_420p_n_to_rgb_or_rgba_u16_row::( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } +} + +/// Shared SSE4.1 high-bit YUV 4:2:0 → native-depth `u16` kernel. +/// `ALPHA = false` writes RGB triples via `write_rgb_u16_8`; +/// `ALPHA = true` writes RGBA quads via `write_rgba_u16_8` with +/// constant alpha `(1 << BITS) - 1`. +/// +/// # Safety +/// +/// 1. **SSE4.1 must be available on the current CPU.** +/// 2. `width & 1 == 0`. +/// 3. `y.len() >= width`, `u_half.len() >= width / 2`, +/// `v_half.len() >= width / 2`, +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. +/// 4. `BITS` ∈ `{9, 10, 12, 14}`. +#[inline] +#[target_feature(enable = "sse4.1")] +pub(crate) unsafe fn yuv_420p_n_to_rgb_or_rgba_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, ) { const { assert!(BITS == 9 || BITS == 10 || BITS == 12 || BITS == 14) }; + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert_eq!(width & 1, 0); debug_assert!(y.len() >= width); debug_assert!(u_half.len() >= width / 2); debug_assert!(v_half.len() >= width / 2); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); @@ -860,6 +983,7 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_u16_row( let cgv = _mm_set1_epi32(coeffs.g_v()); let cbu = _mm_set1_epi32(coeffs.b_u()); let cbv = _mm_set1_epi32(coeffs.b_v()); + let alpha_u16 = _mm_set1_epi16(out_max); let mut x = 0usize; while x + 16 <= width { @@ -910,22 +1034,38 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_u16_row( let b_hi = clamp_u16_max(_mm_adds_epi16(y_scaled_hi, b_dup_hi), zero_v, max_v); // Two 8‑pixel u16 writes cover the 16‑pixel block. 
- write_rgb_u16_8(r_lo, g_lo, b_lo, rgb_out.as_mut_ptr().add(x * 3)); - write_rgb_u16_8(r_hi, g_hi, b_hi, rgb_out.as_mut_ptr().add(x * 3 + 24)); + if ALPHA { + write_rgba_u16_8(r_lo, g_lo, b_lo, alpha_u16, out.as_mut_ptr().add(x * 4)); + write_rgba_u16_8( + r_hi, + g_hi, + b_hi, + alpha_u16, + out.as_mut_ptr().add(x * 4 + 32), + ); + } else { + write_rgb_u16_8(r_lo, g_lo, b_lo, out.as_mut_ptr().add(x * 3)); + write_rgb_u16_8(r_hi, g_hi, b_hi, out.as_mut_ptr().add(x * 3 + 24)); + } x += 16; } if x < width { - scalar::yuv_420p_n_to_rgb_u16_row::( - &y[x..width], - &u_half[x / 2..width / 2], - &v_half[x / 2..width / 2], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_u = &u_half[x / 2..width / 2]; + let tail_v = &v_half[x / 2..width / 2]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::yuv_420p_n_to_rgba_u16_row::( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); + } else { + scalar::yuv_420p_n_to_rgb_u16_row::( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); + } } } } @@ -2460,17 +2600,72 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_u16_row( matrix: ColorMatrix, full_range: bool, ) { + unsafe { + yuv_420p16_to_rgb_or_rgba_u16_row::( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } +} + +/// SSE4.1 sibling of [`yuv_420p16_to_rgba_row`] for native-depth `u16` +/// output. Alpha is `0xFFFF`. +/// +/// # Safety +/// +/// Same as [`yuv_420p16_to_rgb_u16_row`] plus `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "sse4.1")] +pub(crate) unsafe fn yuv_420p16_to_rgba_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + unsafe { + yuv_420p16_to_rgb_or_rgba_u16_row::( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } +} + +/// Shared SSE4.1 16-bit YUV 4:2:0 → native-depth `u16` kernel. +/// `ALPHA = false` writes RGB triples; `ALPHA = true` writes RGBA +/// quads with constant alpha `0xFFFF`. +/// +/// # Safety +/// +/// 1. **SSE4.1 must be available on the current CPU.** +/// 2. `width & 1 == 0`. +/// 3. `y.len() >= width`, `u_half.len() >= width / 2`, +/// `v_half.len() >= width / 2`, +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. 
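A note on the constant alpha: the 16-bit kernel below splats `_mm_set1_epi16(-1i16)` because an all-ones `i16` reinterprets to `0xFFFF` in a `u16` lane, while the BITS-generic kernels splat `(1 << BITS) - 1`, which fits an `i16` lane at every supported depth. A plain-Rust sanity check of both facts:

```rust
fn main() {
    // All-ones i16 reinterprets to the 16-bit opaque maximum.
    assert_eq!(-1i16 as u16, 0xFFFF);
    // The native-depth opaque maxima fit an i16 lane for every
    // supported BITS, so splatting `out_max` as i16 is lossless.
    for bits in [9u32, 10, 12, 14] {
        let out_max = (1i32 << bits) - 1;
        assert!(out_max <= i16::MAX as i32);
        assert_eq!(out_max as i16 as u16, out_max as u16);
    }
}
```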
+#[inline] +#[target_feature(enable = "sse4.1")] +pub(crate) unsafe fn yuv_420p16_to_rgb_or_rgba_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert_eq!(width & 1, 0); debug_assert!(y.len() >= width); debug_assert!(u_half.len() >= width / 2); debug_assert!(v_half.len() >= width / 2); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::<16, 16>(full_range); const RND: i64 = 1 << 14; unsafe { + let alpha_u16 = _mm_set1_epi16(-1i16); let rnd_v = _mm_set1_epi64x(RND); let y_off_v = _mm_set1_epi32(y_off); let y_scale_v = _mm_set1_epi32(y_scale); @@ -2580,25 +2775,35 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_u16_row( _mm_add_epi32(y_hi_i32, b_dup_hi), ); - write_rgb_u16_8( - r_lo_u16, - g_lo_u16, - b_lo_u16, - rgb_out.as_mut_ptr().add(x * 3), - ); + if ALPHA { + write_rgba_u16_8( + r_lo_u16, + g_lo_u16, + b_lo_u16, + alpha_u16, + out.as_mut_ptr().add(x * 4), + ); + } else { + write_rgb_u16_8(r_lo_u16, g_lo_u16, b_lo_u16, out.as_mut_ptr().add(x * 3)); + } x += 8; } if x < width { - scalar::yuv_420p16_to_rgb_u16_row( - &y[x..width], - &u_half[x / 2..width / 2], - &v_half[x / 2..width / 2], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_u = &u_half[x / 2..width / 2]; + let tail_v = &v_half[x / 2..width / 2]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::yuv_420p16_to_rgba_u16_row( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); + } else { + scalar::yuv_420p16_to_rgb_u16_row( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); + } } } } @@ -2779,16 +2984,64 @@ pub(crate) unsafe fn p16_to_rgb_u16_row( matrix: ColorMatrix, full_range: bool, ) { + unsafe { + p16_to_rgb_or_rgba_u16_row::(y, uv_half, rgb_out, width, matrix, full_range); + } +} + +/// SSE4.1 sibling of [`p16_to_rgba_row`] for native-depth `u16` output. +/// Alpha is `0xFFFF`. +/// +/// # Safety +/// +/// Same as [`p16_to_rgb_u16_row`] plus `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "sse4.1")] +pub(crate) unsafe fn p16_to_rgba_u16_row( + y: &[u16], + uv_half: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + unsafe { + p16_to_rgb_or_rgba_u16_row::(y, uv_half, rgba_out, width, matrix, full_range); + } +} + +/// Shared SSE4.1 16-bit P016 → native-depth `u16` kernel. +/// `ALPHA = false` writes RGB triples; `ALPHA = true` writes RGBA +/// quads with constant alpha `0xFFFF`. +/// +/// # Safety +/// +/// 1. **SSE4.1 must be available on the current CPU.** +/// 2. `width & 1 == 0`. +/// 3. `y.len() >= width`, `uv_half.len() >= width`, +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. 
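All of the RGBA stores above and below funnel through `write_rgba_u16_8`; stripped of the unpack choreography, the layout it must produce is the obvious scalar interleave. A reference model the SIMD helper has to match element-for-element:

```rust
// Scalar picture of the `write_rgba_u16_8` output: 8 R/G/B samples
// plus one constant alpha fan out into 32 consecutive u16 as
// [R, G, B, A] quads.
fn interleave_rgba_u16_8(r: &[u16; 8], g: &[u16; 8], b: &[u16; 8], a: u16, out: &mut [u16; 32]) {
    for i in 0..8 {
        out[4 * i] = r[i];
        out[4 * i + 1] = g[i];
        out[4 * i + 2] = b[i];
        out[4 * i + 3] = a; // constant, fully opaque alpha
    }
}

fn main() {
    let (r, g, b) = ([1u16; 8], [2u16; 8], [3u16; 8]);
    let mut out = [0u16; 32];
    interleave_rgba_u16_8(&r, &g, &b, 0xFFFF, &mut out);
    assert_eq!(&out[..8], &[1, 2, 3, 0xFFFF, 1, 2, 3, 0xFFFF]);
}
```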
+#[inline] +#[target_feature(enable = "sse4.1")] +pub(crate) unsafe fn p16_to_rgb_or_rgba_u16_row( + y: &[u16], + uv_half: &[u16], + out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert_eq!(width & 1, 0); debug_assert!(y.len() >= width); debug_assert!(uv_half.len() >= width); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::<16, 16>(full_range); const RND: i64 = 1 << 14; unsafe { + let alpha_u16 = _mm_set1_epi16(-1i16); let rnd_v = _mm_set1_epi64x(RND); let y_off_v = _mm_set1_epi32(y_off); let y_scale_v = _mm_set1_epi32(y_scale); @@ -2886,19 +3139,24 @@ pub(crate) unsafe fn p16_to_rgb_u16_row( _mm_add_epi32(y_hi_i32, b_dup_hi), ); - write_rgb_u16_8(r_u16, g_u16, b_u16, rgb_out.as_mut_ptr().add(x * 3)); + if ALPHA { + write_rgba_u16_8(r_u16, g_u16, b_u16, alpha_u16, out.as_mut_ptr().add(x * 4)); + } else { + write_rgb_u16_8(r_u16, g_u16, b_u16, out.as_mut_ptr().add(x * 3)); + } x += 8; } if x < width { - scalar::p16_to_rgb_u16_row( - &y[x..width], - &uv_half[x..width], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_uv = &uv_half[x..width]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::p16_to_rgba_u16_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + } else { + scalar::p16_to_rgb_u16_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + } } } } diff --git a/src/row/arch/x86_sse41/tests.rs b/src/row/arch/x86_sse41/tests.rs index 284f2b3..f60f081 100644 --- a/src/row/arch/x86_sse41/tests.rs +++ b/src/row/arch/x86_sse41/tests.rs @@ -1815,6 +1815,204 @@ fn sse41_p016_rgba_matches_scalar_all_matrices() { } } +// ---- High-bit 4:2:0 native-depth `u16` RGBA equivalence (Ship 8 Tranche 5b) ---- +// +// u16 RGBA wrappers share the math of their u16 RGB siblings — only +// the store (and tail dispatch) branches on `ALPHA`, with alpha set to +// `(1 << BITS) - 1` for BITS-generic kernels and `0xFFFF` for 16-bit. 
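The sharing pattern that comment describes is easy to state in miniature: two thin wrappers, one const-generic body, and a store width as the only monomorphization difference. A safe scalar sketch (the per-pixel math is a stand-in, not this crate's conversion):

```rust
// One body, monomorphized twice; only the store branches on ALPHA.
fn row_impl<const ALPHA: bool>(y: &[u16], out: &mut [u16], alpha: u16) {
    let bpp = if ALPHA { 4 } else { 3 };
    for (i, &s) in y.iter().enumerate() {
        let px = &mut out[i * bpp..(i + 1) * bpp];
        px[0] = s; // stand-in math: real kernels write converted R/G/B
        px[1] = s;
        px[2] = s;
        if ALPHA {
            px[3] = alpha;
        }
    }
}

fn rgb_row(y: &[u16], out: &mut [u16]) {
    row_impl::<false>(y, out, 0);
}

fn rgba_row(y: &[u16], out: &mut [u16], alpha: u16) {
    row_impl::<true>(y, out, alpha);
}

fn main() {
    let y = [7u16, 8];
    let mut rgb = [0u16; 6];
    let mut rgba = [0u16; 8];
    rgb_row(&y, &mut rgb);
    rgba_row(&y, &mut rgba, 0x03FF); // (1 << 10) - 1 for 10-bit input
    assert_eq!(rgba[3], 0x03FF);
}
```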
+ +fn check_planar_u16_sse41_rgba_equivalence_n( + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let y = planar_n_plane::(width, 37); + let u = planar_n_plane::(width / 2, 53); + let v = planar_n_plane::(width / 2, 71); + let mut rgba_scalar = std::vec![0u16; width * 4]; + let mut rgba_simd = std::vec![0u16; width * 4]; + scalar::yuv_420p_n_to_rgba_u16_row::( + &y, + &u, + &v, + &mut rgba_scalar, + width, + matrix, + full_range, + ); + unsafe { + yuv_420p_n_to_rgba_u16_row::(&y, &u, &v, &mut rgba_simd, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_simd, + "SSE4.1 yuv_420p_n<{BITS}>→RGBA u16 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +fn check_pn_u16_sse41_rgba_equivalence_n( + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let y = p_n_packed_plane::(width, 37); + let u = p_n_packed_plane::(width / 2, 53); + let v = p_n_packed_plane::(width / 2, 71); + let uv = p010_uv_interleave(&u, &v); + let mut rgba_scalar = std::vec![0u16; width * 4]; + let mut rgba_simd = std::vec![0u16; width * 4]; + scalar::p_n_to_rgba_u16_row::(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + unsafe { + p_n_to_rgba_u16_row::(&y, &uv, &mut rgba_simd, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_simd, + "SSE4.1 Pn<{BITS}>→RGBA u16 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +fn check_yuv420p16_u16_sse41_rgba_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { + let y = p16_plane(width, 37); + let u = p16_plane(width / 2, 53); + let v = p16_plane(width / 2, 71); + let mut rgba_scalar = std::vec![0u16; width * 4]; + let mut rgba_simd = std::vec![0u16; width * 4]; + scalar::yuv_420p16_to_rgba_u16_row(&y, &u, &v, &mut rgba_scalar, width, matrix, full_range); + unsafe { + yuv_420p16_to_rgba_u16_row(&y, &u, &v, &mut rgba_simd, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_simd, + "SSE4.1 yuv_420p16→RGBA u16 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +fn check_p16_u16_sse41_rgba_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { + let y = p16_plane(width, 37); + let u = p16_plane(width / 2, 53); + let v = p16_plane(width / 2, 71); + let uv = p010_uv_interleave(&u, &v); + let mut rgba_scalar = std::vec![0u16; width * 4]; + let mut rgba_simd = std::vec![0u16; width * 4]; + scalar::p16_to_rgba_u16_row(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + unsafe { + p16_to_rgba_u16_row(&y, &uv, &mut rgba_simd, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_simd, + "SSE4.1 P016→RGBA u16 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +#[test] +fn sse41_yuv420p_n_rgba_u16_matches_scalar_all_bits() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_planar_u16_sse41_rgba_equivalence_n::<9>(16, m, full); + check_planar_u16_sse41_rgba_equivalence_n::<10>(16, m, full); + check_planar_u16_sse41_rgba_equivalence_n::<12>(16, m, full); + check_planar_u16_sse41_rgba_equivalence_n::<14>(16, m, full); + } + } +} + +#[test] +fn sse41_yuv420p_n_rgba_u16_matches_scalar_tail_and_1920() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + for w in [18usize, 30, 34, 1920, 1922] { + 
check_planar_u16_sse41_rgba_equivalence_n::<9>(w, ColorMatrix::Bt601, false); + check_planar_u16_sse41_rgba_equivalence_n::<10>(w, ColorMatrix::Bt709, true); + check_planar_u16_sse41_rgba_equivalence_n::<12>(w, ColorMatrix::Bt2020Ncl, false); + check_planar_u16_sse41_rgba_equivalence_n::<14>(w, ColorMatrix::YCgCo, true); + } +} + +#[test] +fn sse41_pn_rgba_u16_matches_scalar_all_bits() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_pn_u16_sse41_rgba_equivalence_n::<10>(16, m, full); + check_pn_u16_sse41_rgba_equivalence_n::<12>(16, m, full); + } + } +} + +#[test] +fn sse41_pn_rgba_u16_matches_scalar_tail_and_1920() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + for w in [18usize, 30, 34, 1920, 1922] { + check_pn_u16_sse41_rgba_equivalence_n::<10>(w, ColorMatrix::Bt601, false); + check_pn_u16_sse41_rgba_equivalence_n::<12>(w, ColorMatrix::Bt709, true); + } +} + +#[test] +fn sse41_yuv420p16_rgba_u16_matches_scalar_all_matrices() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_yuv420p16_u16_sse41_rgba_equivalence(16, m, full); + } + } + for w in [18usize, 30, 34, 1920, 1922] { + check_yuv420p16_u16_sse41_rgba_equivalence(w, ColorMatrix::Bt709, false); + } +} + +#[test] +fn sse41_p016_rgba_u16_matches_scalar_all_matrices() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_p16_u16_sse41_rgba_equivalence(16, m, full); + } + } + for w in [18usize, 30, 34, 1920, 1922] { + check_p16_u16_sse41_rgba_equivalence(w, ColorMatrix::Bt709, false); + } +} + // ---- Pn 4:4:4 (P410 / P412 / P416) SSE4.1 equivalence --------------- fn high_bit_plane_sse41(n: usize, seed: usize) -> std::vec::Vec { diff --git a/src/row/mod.rs b/src/row/mod.rs index d67ce1b..1ee3025 100644 --- a/src/row/mod.rs +++ b/src/row/mod.rs @@ -2661,10 +2661,9 @@ pub fn p016_to_rgb_u16_row( // ---- High-bit 4:2:0 RGBA dispatchers (Ship 8 Tranche 5) --------------- // -// u8 RGBA dispatchers route to per-arch SIMD kernels (Ship 8 Tranche -// 5a). u16 RGBA dispatchers stay scalar-only until the follow-up Ship -// 8 Tranche 5b PR adds u16 SIMD; their `use_simd` parameter is held in -// the signature so wiring 5b is a non-breaking change. +// Both u8 and native-depth `u16` RGBA dispatchers route to per-arch +// SIMD kernels (Ship 8 Tranches 5a + 5b). `use_simd = false` forces +// the scalar reference path on every dispatcher. /// Converts one row of **9-bit** YUV 4:2:0 to packed **8-bit** /// **RGBA** (`R, G, B, 0xFF`; alpha defaults to opaque since the @@ -2758,10 +2757,8 @@ pub fn yuv420p9_to_rgba_row( /// in the low bits of each `u16`); alpha element is `(1 << 9) - 1` /// (opaque maximum at the input bit depth). /// -/// See `scalar::yuv_420p_n_to_rgba_u16_row` for the reference. SIMD -/// per-arch routes land in the follow-up Ship 8 Tranche 5b PR — for -/// now this dispatcher always runs the scalar reference regardless of -/// `use_simd`. 
+/// See `scalar::yuv_420p_n_to_rgba_u16_row` for the reference. +/// `use_simd = false` forces the scalar reference path. #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] pub fn yuv420p9_to_rgba_u16_row( @@ -2781,7 +2778,61 @@ pub fn yuv420p9_to_rgba_u16_row( assert!(v_half.len() >= width / 2, "v_half row too short"); assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - let _ = use_simd; // SIMD per-arch routes land in Ship 8 Tranche 5b. + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_420p_n_to_rgba_u16_row::<9>(y, u_half, v_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_420p_n_to_rgba_u16_row::<9>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_420p_n_to_rgba_u16_row::<9>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_420p_n_to_rgba_u16_row::<9>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_420p_n_to_rgba_u16_row::<9>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + scalar::yuv_420p_n_to_rgba_u16_row::<9>(y, u_half, v_half, rgba_out, width, matrix, full_range); } @@ -2876,10 +2927,8 @@ pub fn yuv420p10_to_rgba_row( /// in the low bits of each `u16`); alpha element is `(1 << 10) - 1` /// (opaque maximum at the input bit depth). /// -/// See `scalar::yuv_420p_n_to_rgba_u16_row` for the reference. SIMD -/// per-arch routes land in the follow-up Ship 8 Tranche 5b PR — for -/// now this dispatcher always runs the scalar reference regardless of -/// `use_simd`. +/// See `scalar::yuv_420p_n_to_rgba_u16_row` for the reference. +/// `use_simd = false` forces the scalar reference path. #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] pub fn yuv420p10_to_rgba_u16_row( @@ -2899,7 +2948,61 @@ pub fn yuv420p10_to_rgba_u16_row( assert!(v_half.len() >= width / 2, "v_half row too short"); assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - let _ = use_simd; // SIMD per-arch routes land in Ship 8 Tranche 5b. + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_420p_n_to_rgba_u16_row::<10>(y, u_half, v_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_420p_n_to_rgba_u16_row::<10>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_420p_n_to_rgba_u16_row::<10>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. 
+ unsafe { + arch::x86_sse41::yuv_420p_n_to_rgba_u16_row::<10>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_420p_n_to_rgba_u16_row::<10>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + scalar::yuv_420p_n_to_rgba_u16_row::<10>(y, u_half, v_half, rgba_out, width, matrix, full_range); } @@ -2980,10 +3083,8 @@ pub fn p010_to_rgba_row( /// high-bit-packed) to **native-depth `u16`** packed **RGBA** — output /// is low-bit-packed; alpha element is `(1 << 10) - 1`. /// -/// See `scalar::p_n_to_rgba_u16_row::<10>` for the reference. SIMD -/// per-arch routes land in the follow-up Ship 8 Tranche 5b PR — for -/// now this dispatcher always runs the scalar reference regardless of -/// `use_simd`. +/// See `scalar::p_n_to_rgba_u16_row::<10>` for the reference. +/// `use_simd = false` forces the scalar reference path. #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] pub fn p010_to_rgba_u16_row( @@ -3001,7 +3102,53 @@ pub fn p010_to_rgba_u16_row( assert!(uv_half.len() >= width, "uv_half row too short"); assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - let _ = use_simd; // SIMD per-arch routes land in Ship 8 Tranche 5b. + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::p_n_to_rgba_u16_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::p_n_to_rgba_u16_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::p_n_to_rgba_u16_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::p_n_to_rgba_u16_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::p_n_to_rgba_u16_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + scalar::p_n_to_rgba_u16_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); } @@ -3096,10 +3243,8 @@ pub fn yuv420p12_to_rgba_row( /// in the low bits of each `u16`); alpha element is `(1 << 12) - 1` /// (opaque maximum at the input bit depth). /// -/// See `scalar::yuv_420p_n_to_rgba_u16_row` for the reference. SIMD -/// per-arch routes land in the follow-up Ship 8 Tranche 5b PR — for -/// now this dispatcher always runs the scalar reference regardless of -/// `use_simd`. +/// See `scalar::yuv_420p_n_to_rgba_u16_row` for the reference. +/// `use_simd = false` forces the scalar reference path. #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] pub fn yuv420p12_to_rgba_u16_row( @@ -3119,7 +3264,61 @@ pub fn yuv420p12_to_rgba_u16_row( assert!(v_half.len() >= width / 2, "v_half row too short"); assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - let _ = use_simd; // SIMD per-arch routes land in Ship 8 Tranche 5b. + if use_simd { + cfg_select! 
{ + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_420p_n_to_rgba_u16_row::<12>(y, u_half, v_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_420p_n_to_rgba_u16_row::<12>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_420p_n_to_rgba_u16_row::<12>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_420p_n_to_rgba_u16_row::<12>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_420p_n_to_rgba_u16_row::<12>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + scalar::yuv_420p_n_to_rgba_u16_row::<12>(y, u_half, v_half, rgba_out, width, matrix, full_range); } @@ -3214,10 +3413,8 @@ pub fn yuv420p14_to_rgba_row( /// in the low bits of each `u16`); alpha element is `(1 << 14) - 1` /// (opaque maximum at the input bit depth). /// -/// See `scalar::yuv_420p_n_to_rgba_u16_row` for the reference. SIMD -/// per-arch routes land in the follow-up Ship 8 Tranche 5b PR — for -/// now this dispatcher always runs the scalar reference regardless of -/// `use_simd`. +/// See `scalar::yuv_420p_n_to_rgba_u16_row` for the reference. +/// `use_simd = false` forces the scalar reference path. #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] pub fn yuv420p14_to_rgba_u16_row( @@ -3237,7 +3434,61 @@ pub fn yuv420p14_to_rgba_u16_row( assert!(v_half.len() >= width / 2, "v_half row too short"); assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - let _ = use_simd; // SIMD per-arch routes land in Ship 8 Tranche 5b. + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_420p_n_to_rgba_u16_row::<14>(y, u_half, v_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_420p_n_to_rgba_u16_row::<14>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_420p_n_to_rgba_u16_row::<14>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_420p_n_to_rgba_u16_row::<14>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. 
+ unsafe { + arch::wasm_simd128::yuv_420p_n_to_rgba_u16_row::<14>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + scalar::yuv_420p_n_to_rgba_u16_row::<14>(y, u_half, v_half, rgba_out, width, matrix, full_range); } @@ -3318,10 +3569,8 @@ pub fn p012_to_rgba_row( /// high-bit-packed) to **native-depth `u16`** packed **RGBA** — output /// is low-bit-packed; alpha element is `(1 << 12) - 1`. /// -/// See `scalar::p_n_to_rgba_u16_row::<12>` for the reference. SIMD -/// per-arch routes land in the follow-up Ship 8 Tranche 5b PR — for -/// now this dispatcher always runs the scalar reference regardless of -/// `use_simd`. +/// See `scalar::p_n_to_rgba_u16_row::<12>` for the reference. +/// `use_simd = false` forces the scalar reference path. #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] pub fn p012_to_rgba_u16_row( @@ -3339,7 +3588,53 @@ pub fn p012_to_rgba_u16_row( assert!(uv_half.len() >= width, "uv_half row too short"); assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - let _ = use_simd; // SIMD per-arch routes land in Ship 8 Tranche 5b. + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::p_n_to_rgba_u16_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::p_n_to_rgba_u16_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::p_n_to_rgba_u16_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::p_n_to_rgba_u16_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::p_n_to_rgba_u16_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + scalar::p_n_to_rgba_u16_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); } @@ -3420,8 +3715,8 @@ pub fn yuv420p16_to_rgba_row( /// /// Routes through the dedicated 16-bit u16-output scalar kernel /// (`scalar::yuv_420p16_to_rgba_u16_row`) — uses i64 chroma multiply -/// for the wider `coeff × u_d` product at 16 → 16-bit scaling. SIMD -/// per-arch routes land in the follow-up Ship 8 Tranche 5b PR. +/// for the wider `coeff × u_d` product at 16 → 16-bit scaling. +/// `use_simd = false` forces the scalar reference path. #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] pub fn yuv420p16_to_rgba_u16_row( @@ -3441,7 +3736,48 @@ pub fn yuv420p16_to_rgba_u16_row( assert!(v_half.len() >= width / 2, "v_half row too short"); assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - let _ = use_simd; // SIMD per-arch routes land in Ship 8 Tranche 5b. + if use_simd { + cfg_select! 
{ + target_arch = "aarch64" => { + if neon_available() { + unsafe { + arch::neon::yuv_420p16_to_rgba_u16_row(y, u_half, v_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + unsafe { + arch::x86_avx512::yuv_420p16_to_rgba_u16_row(y, u_half, v_half, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + unsafe { + arch::x86_avx2::yuv_420p16_to_rgba_u16_row(y, u_half, v_half, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + unsafe { + arch::x86_sse41::yuv_420p16_to_rgba_u16_row(y, u_half, v_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + unsafe { + arch::wasm_simd128::yuv_420p16_to_rgba_u16_row(y, u_half, v_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + scalar::yuv_420p16_to_rgba_u16_row(y, u_half, v_half, rgba_out, width, matrix, full_range); } @@ -3518,8 +3854,8 @@ pub fn p016_to_rgba_row( /// `0xFFFF`. /// /// Routes through the dedicated 16-bit u16-output P016 scalar kernel -/// (`scalar::p16_to_rgba_u16_row`) — i64 chroma multiply. SIMD -/// per-arch routes land in the follow-up Ship 8 Tranche 5b PR. +/// (`scalar::p16_to_rgba_u16_row`) — i64 chroma multiply. +/// `use_simd = false` forces the scalar reference path. #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] pub fn p016_to_rgba_u16_row( @@ -3537,7 +3873,48 @@ pub fn p016_to_rgba_u16_row( assert!(uv_half.len() >= width, "uv_half row too short"); assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - let _ = use_simd; // SIMD per-arch routes land in Ship 8 Tranche 5b. + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + unsafe { + arch::neon::p16_to_rgba_u16_row(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + unsafe { + arch::x86_avx512::p16_to_rgba_u16_row(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + unsafe { + arch::x86_avx2::p16_to_rgba_u16_row(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + unsafe { + arch::x86_sse41::p16_to_rgba_u16_row(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + unsafe { + arch::wasm_simd128::p16_to_rgba_u16_row(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + scalar::p16_to_rgba_u16_row(y, uv_half, rgba_out, width, matrix, full_range); } From ad52ef6782d1c9e4254ad8d7f00e1d6b26812feb Mon Sep 17 00:00:00 2001 From: uqio <276879906+uqio@users.noreply.github.com> Date: Mon, 27 Apr 2026 01:21:06 +1200 Subject: [PATCH 2/3] update --- src/row/mod.rs | 2 + src/sinker/mixed/mod.rs | 24 + src/sinker/mixed/planar_8bit.rs | 8 +- src/sinker/mixed/subsampled_4_2_0_high_bit.rs | 874 +++++++++++++++++- src/sinker/mixed/tests.rs | 202 ++++ 5 files changed, 1076 insertions(+), 34 deletions(-) diff --git a/src/row/mod.rs b/src/row/mod.rs index 1ee3025..b33cfa4 100644 --- a/src/row/mod.rs +++ b/src/row/mod.rs @@ -41,6 +41,8 @@ pub(crate) mod scalar; // be unused, which is a hard error under `cargo clippy -- -D warnings`. 
 #[cfg(any(feature = "std", feature = "alloc"))]
 pub(crate) use scalar::expand_rgb_to_rgba_row;
+#[cfg(any(feature = "std", feature = "alloc"))]
+pub(crate) use scalar::expand_rgb_u16_to_rgba_u16_row;
 
 use crate::ColorMatrix;
diff --git a/src/sinker/mixed/mod.rs b/src/sinker/mixed/mod.rs
index 0369929..69208e0 100644
--- a/src/sinker/mixed/mod.rs
+++ b/src/sinker/mixed/mod.rs
@@ -1119,6 +1119,30 @@ pub(super) fn rgba_plane_row_slice(
     Ok(&mut buf[start..end])
 }
 
+/// `u16` analogue of [`rgba_plane_row_slice`] — slices the RGBA row out
+/// of an attached `u16` RGBA plane buffer. Element count and byte
+/// offsets are identical (both `× 4`); the only difference is the
+/// element type, so the overflow check is the same. Used by the
+/// high-bit-depth 4:2:0 sinkers that fan `u16` RGB out to `u16` RGBA.
+#[cfg_attr(not(tarpaulin), inline(always))]
+pub(super) fn rgba_u16_plane_row_slice(
+    buf: &mut [u16],
+    one_plane_start: usize,
+    one_plane_end: usize,
+    width: usize,
+    height: usize,
+) -> Result<&mut [u16], MixedSinkerError> {
+    let end = one_plane_end
+        .checked_mul(4)
+        .ok_or(MixedSinkerError::GeometryOverflow {
+            width,
+            height,
+            channels: 4,
+        })?;
+    let start = one_plane_start * 4; // ≤ end, fits.
+    Ok(&mut buf[start..end])
+}
+
 /// Pick an RGB row buffer for the kernel to write into: caller's RGB
 /// plane slice when attached, or the growing scratch buffer otherwise
 /// (HSV-only callers don't allocate an RGB plane). Returns
diff --git a/src/sinker/mixed/planar_8bit.rs b/src/sinker/mixed/planar_8bit.rs
index a0d5c7e..10224dd 100644
--- a/src/sinker/mixed/planar_8bit.rs
+++ b/src/sinker/mixed/planar_8bit.rs
@@ -31,12 +31,12 @@ impl<'a> MixedSinker<'a, Yuv420p> {
     ///
     /// ```compile_fail
     /// // Attaching RGBA to a sink that doesn't write it is rejected
-    /// // at compile time. Yuv420p10 (10‑bit 4:2:0 planar) has not yet
-    /// // been wired for RGBA — Tranche 5 covers it; once that lands the
+    /// // at compile time. Yuv422p10 (10‑bit 4:2:2 planar) has not yet
+    /// // been wired for RGBA — once a future tranche lands it the
     /// // negative example here moves to the next not‑yet‑wired format.
-    /// use colconv::{sinker::MixedSinker, yuv::Yuv420p10};
+    /// use colconv::{sinker::MixedSinker, yuv::Yuv422p10};
     /// let mut buf = vec![0u8; 16 * 8 * 4];
-    /// let _ = MixedSinker::<Yuv420p10>::new(16, 8).with_rgba(&mut buf);
+    /// let _ = MixedSinker::<Yuv422p10>::new(16, 8).with_rgba(&mut buf);
     /// ```
     #[cfg_attr(not(tarpaulin), inline(always))]
     pub fn with_rgba(mut self, buf: &'a mut [u8]) -> Result<Self, MixedSinkerError> {
diff --git a/src/sinker/mixed/subsampled_4_2_0_high_bit.rs b/src/sinker/mixed/subsampled_4_2_0_high_bit.rs
index 9531a17..b203693 100644
--- a/src/sinker/mixed/subsampled_4_2_0_high_bit.rs
+++ b/src/sinker/mixed/subsampled_4_2_0_high_bit.rs
@@ -2,6 +2,7 @@
 
 use super::{
     MixedSinker, MixedSinkerError, RowSlice, check_dimensions_match, rgb_row_buf_or_scratch,
+    rgba_plane_row_slice, rgba_u16_plane_row_slice,
 };
 use crate::{PixelSink, row::*, yuv::*};
@@ -33,6 +34,56 @@ impl<'a> MixedSinker<'a, Yuv420p9> {
         self.rgb_u16 = Some(buf);
         Ok(self)
     }
+
+    /// Attaches a packed **8‑bit** RGBA output buffer. The 9‑bit YUV
+    /// source is converted to 8‑bit RGBA via the same `BITS = 9` Q15
+    /// kernel family used by [`Self::with_rgb`]; the fourth byte per
+    /// pixel is alpha = `0xFF` (Yuv420p9 has no alpha plane).
+    ///
+    /// Returns `Err(RgbaBufferTooShort)` if
+    /// `buf.len() < width × height × 4`, or `Err(GeometryOverflow)` on
+    /// 32‑bit targets when the product overflows.
+    #[cfg_attr(not(tarpaulin), inline(always))]
+    pub fn with_rgba(mut self, buf: &'a mut [u8]) -> Result<Self, MixedSinkerError> {
+        self.set_rgba(buf)?;
+        Ok(self)
+    }
+    /// In-place variant of [`with_rgba`](Self::with_rgba).
+    #[cfg_attr(not(tarpaulin), inline(always))]
+    pub fn set_rgba(&mut self, buf: &'a mut [u8]) -> Result<&mut Self, MixedSinkerError> {
+        let expected = self.frame_bytes(4)?;
+        if buf.len() < expected {
+            return Err(MixedSinkerError::RgbaBufferTooShort {
+                expected,
+                actual: buf.len(),
+            });
+        }
+        self.rgba = Some(buf);
+        Ok(self)
+    }
+
+    /// Attaches a packed **`u16`** RGBA output buffer. 9‑bit low‑packed
+    /// (`(1 << 9) - 1 = 511` max). Length is measured in `u16`
+    /// **elements** (`width × height × 4`). Alpha element is
+    /// `(1 << 9) - 1`.
+    #[cfg_attr(not(tarpaulin), inline(always))]
+    pub fn with_rgba_u16(mut self, buf: &'a mut [u16]) -> Result<Self, MixedSinkerError> {
+        self.set_rgba_u16(buf)?;
+        Ok(self)
+    }
+    /// In-place variant of [`with_rgba_u16`](Self::with_rgba_u16).
+    #[cfg_attr(not(tarpaulin), inline(always))]
+    pub fn set_rgba_u16(&mut self, buf: &'a mut [u16]) -> Result<&mut Self, MixedSinkerError> {
+        let expected = self.frame_bytes(4)?;
+        if buf.len() < expected {
+            return Err(MixedSinkerError::RgbaU16BufferTooShort {
+                expected,
+                actual: buf.len(),
+            });
+        }
+        self.rgba_u16 = Some(buf);
+        Ok(self)
+    }
 }
 
 impl Yuv420p9Sink for MixedSinker<'_, Yuv420p9> {}
@@ -92,6 +143,8 @@
         let Self {
             rgb,
             rgb_u16,
+            rgba,
+            rgba_u16,
             luma,
             hsv,
             rgb_scratch,
@@ -107,7 +160,29 @@
             }
         }
 
-        if let Some(buf) = rgb_u16.as_deref_mut() {
+        // ===== u16 RGB / RGBA path (Strategy A) =====
+        // Compute u16 RGB once (to caller's buffer when attached) and fan
+        // out to u16 RGBA via the cheap per-pixel pad. RGBA-only avoids the
+        // RGB kernel entirely and writes RGBA directly.
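+        // (For Yuv420p9 the pad writes alpha = (1 << 9) - 1 = 511 after
+        // each copied R, G, B triple: a pure copy-plus-constant pass, so
+        // the YUV kernel never runs twice for the u16 outputs.)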
+        let want_rgb_u16 = rgb_u16.is_some();
+        let want_rgba_u16 = rgba_u16.is_some();
+
+        if want_rgba_u16 && !want_rgb_u16 {
+            let rgba_u16_buf = rgba_u16.as_deref_mut().unwrap();
+            let rgba_u16_row =
+                rgba_u16_plane_row_slice(rgba_u16_buf, one_plane_start, one_plane_end, w, h)?;
+            yuv420p9_to_rgba_u16_row(
+                row.y(),
+                row.u_half(),
+                row.v_half(),
+                rgba_u16_row,
+                w,
+                row.matrix(),
+                row.full_range(),
+                use_simd,
+            );
+        } else if want_rgb_u16 {
+            let rgb_u16_buf = rgb_u16.as_deref_mut().unwrap();
             let rgb_plane_end = one_plane_end
                 .checked_mul(3)
@@ -117,19 +192,48 @@
                     channels: 3,
                 })?;
             let rgb_plane_start = one_plane_start * 3;
+            let rgb_u16_row = &mut rgb_u16_buf[rgb_plane_start..rgb_plane_end];
             yuv420p9_to_rgb_u16_row(
                 row.y(),
                 row.u_half(),
                 row.v_half(),
-                &mut buf[rgb_plane_start..rgb_plane_end],
+                rgb_u16_row,
                 w,
                 row.matrix(),
                 row.full_range(),
                 use_simd,
             );
+            if want_rgba_u16 {
+                let rgba_u16_buf = rgba_u16.as_deref_mut().unwrap();
+                let rgba_u16_row =
+                    rgba_u16_plane_row_slice(rgba_u16_buf, one_plane_start, one_plane_end, w, h)?;
+                expand_rgb_u16_to_rgba_u16_row::<9>(rgb_u16_row, rgba_u16_row, w);
+            }
         }
 
-        if rgb.is_none() && hsv.is_none() {
+        // ===== u8 RGB / RGBA / HSV path (Strategy A) =====
+        let want_rgb = rgb.is_some();
+        let want_rgba = rgba.is_some();
+        let want_hsv = hsv.is_some();
+        let need_rgb_kernel = want_rgb || want_hsv;
+
+        if want_rgba && !need_rgb_kernel {
+            let rgba_buf = rgba.as_deref_mut().unwrap();
+            let rgba_row = rgba_plane_row_slice(rgba_buf, one_plane_start, one_plane_end, w, h)?;
+            yuv420p9_to_rgba_row(
+                row.y(),
+                row.u_half(),
+                row.v_half(),
+                rgba_row,
+                w,
+                row.matrix(),
+                row.full_range(),
+                use_simd,
+            );
+            return Ok(());
+        }
+
+        if !need_rgb_kernel {
             return Ok(());
         }
@@ -163,6 +267,12 @@
                 use_simd,
             );
         }
+
+        if let Some(buf) = rgba.as_deref_mut() {
+            let rgba_row = rgba_plane_row_slice(buf, one_plane_start, one_plane_end, w, h)?;
+            expand_rgb_to_rgba_row(rgb_row, rgba_row, w);
+        }
+
         Ok(())
     }
 }
@@ -211,6 +321,52 @@ impl<'a> MixedSinker<'a, Yuv420p10> {
         self.rgb_u16 = Some(buf);
         Ok(self)
     }
+
+    /// Attaches a packed **8‑bit** RGBA output buffer. The 10‑bit YUV
+    /// source is converted to 8‑bit RGBA via the `BITS = 10` Q15 kernel
+    /// family; the fourth byte per pixel is alpha = `0xFF` (Yuv420p10
+    /// has no alpha plane).
+    #[cfg_attr(not(tarpaulin), inline(always))]
+    pub fn with_rgba(mut self, buf: &'a mut [u8]) -> Result<Self, MixedSinkerError> {
+        self.set_rgba(buf)?;
+        Ok(self)
+    }
+    /// In-place variant of [`with_rgba`](Self::with_rgba).
+    #[cfg_attr(not(tarpaulin), inline(always))]
+    pub fn set_rgba(&mut self, buf: &'a mut [u8]) -> Result<&mut Self, MixedSinkerError> {
+        let expected = self.frame_bytes(4)?;
+        if buf.len() < expected {
+            return Err(MixedSinkerError::RgbaBufferTooShort {
+                expected,
+                actual: buf.len(),
+            });
+        }
+        self.rgba = Some(buf);
+        Ok(self)
+    }
+
+    /// Attaches a packed **`u16`** RGBA output buffer. 10‑bit
+    /// low‑packed (`(1 << 10) - 1 = 1023` max). Length is measured in
+    /// `u16` **elements** (`width × height × 4`). Alpha element is
+    /// `(1 << 10) - 1`.
+    #[cfg_attr(not(tarpaulin), inline(always))]
+    pub fn with_rgba_u16(mut self, buf: &'a mut [u16]) -> Result<Self, MixedSinkerError> {
+        self.set_rgba_u16(buf)?;
+        Ok(self)
+    }
+    /// In-place variant of [`with_rgba_u16`](Self::with_rgba_u16).
+    #[cfg_attr(not(tarpaulin), inline(always))]
+    pub fn set_rgba_u16(&mut self, buf: &'a mut [u16]) -> Result<&mut Self, MixedSinkerError> {
+        let expected = self.frame_bytes(4)?;
+        if buf.len() < expected {
+            return Err(MixedSinkerError::RgbaU16BufferTooShort {
+                expected,
+                actual: buf.len(),
+            });
+        }
+        self.rgba_u16 = Some(buf);
+        Ok(self)
+    }
 }
 
 impl Yuv420p10Sink for MixedSinker<'_, Yuv420p10> {}
@@ -276,6 +432,8 @@
         let Self {
             rgb,
             rgb_u16,
+            rgba,
+            rgba_u16,
             luma,
             hsv,
             rgb_scratch,
@@ -296,11 +454,32 @@
             }
         }
 
-        // `u16` RGB output — written directly via the native‑depth row
-        // primitive. Computed independently of the u8 path: the two
-        // outputs have different scale params inside `range_params_n`,
-        // so they can't share an intermediate without losing precision.
-        if let Some(buf) = rgb_u16.as_deref_mut() {
+        // ===== u16 RGB / RGBA path (Strategy A) =====
+        // u16 outputs are written via the native-depth row primitive, kept
+        // independent of the u8 path: the two have different scale params
+        // inside `range_params_n` and can't share an intermediate without
+        // losing precision. Within the u16 family, however, the RGB row
+        // and RGBA row are bit-identical for R/G/B, so we run the RGB
+        // kernel once and fan out to RGBA via the cheap pad.
+        let want_rgb_u16 = rgb_u16.is_some();
+        let want_rgba_u16 = rgba_u16.is_some();
+
+        if want_rgba_u16 && !want_rgb_u16 {
+            let rgba_u16_buf = rgba_u16.as_deref_mut().unwrap();
+            let rgba_u16_row =
+                rgba_u16_plane_row_slice(rgba_u16_buf, one_plane_start, one_plane_end, w, h)?;
+            yuv420p10_to_rgba_u16_row(
+                row.y(),
+                row.u_half(),
+                row.v_half(),
+                rgba_u16_row,
+                w,
+                row.matrix(),
+                row.full_range(),
+                use_simd,
+            );
+        } else if want_rgb_u16 {
+            let rgb_u16_buf = rgb_u16.as_deref_mut().unwrap();
             let rgb_plane_end = one_plane_end
                 .checked_mul(3)
@@ -310,21 +489,48 @@
                     channels: 3,
                 })?;
             let rgb_plane_start = one_plane_start * 3;
+            let rgb_u16_row = &mut rgb_u16_buf[rgb_plane_start..rgb_plane_end];
             yuv420p10_to_rgb_u16_row(
                 row.y(),
                 row.u_half(),
                 row.v_half(),
-                &mut buf[rgb_plane_start..rgb_plane_end],
+                rgb_u16_row,
                 w,
                 row.matrix(),
                 row.full_range(),
                 use_simd,
             );
+            if want_rgba_u16 {
+                let rgba_u16_buf = rgba_u16.as_deref_mut().unwrap();
+                let rgba_u16_row =
+                    rgba_u16_plane_row_slice(rgba_u16_buf, one_plane_start, one_plane_end, w, h)?;
+                expand_rgb_u16_to_rgba_u16_row::<10>(rgb_u16_row, rgba_u16_row, w);
+            }
         }
 
+        // ===== u8 RGB / RGBA / HSV path (Strategy A) =====
         let want_rgb = rgb.is_some();
+        let want_rgba = rgba.is_some();
         let want_hsv = hsv.is_some();
-        if !want_rgb && !want_hsv {
+        let need_rgb_kernel = want_rgb || want_hsv;
+
+        if want_rgba && !need_rgb_kernel {
+            let rgba_buf = rgba.as_deref_mut().unwrap();
+            let rgba_row = rgba_plane_row_slice(rgba_buf, one_plane_start, one_plane_end, w, h)?;
+            yuv420p10_to_rgba_row(
+                row.y(),
+                row.u_half(),
+                row.v_half(),
+                rgba_row,
+                w,
+                row.matrix(),
+                row.full_range(),
+                use_simd,
+            );
+            return Ok(());
+        }
+
+        if !need_rgb_kernel {
             return Ok(());
         }
@@ -361,6 +567,12 @@
                 use_simd,
             );
         }
+
+        if let Some(buf) = rgba.as_deref_mut() {
+            let rgba_row = rgba_plane_row_slice(buf, one_plane_start, one_plane_end, w, h)?;
+            expand_rgb_to_rgba_row(rgb_row, rgba_row, w);
+        }
+
         Ok(())
     }
 }
@@ -392,6 +604,51 @@ impl<'a> MixedSinker<'a, Yuv420p12> {
         self.rgb_u16 = Some(buf);
         Ok(self)
     }
+
+    /// Attaches a packed **8‑bit** RGBA output buffer. The 12‑bit YUV
+    /// source is converted to 8‑bit RGBA via the `BITS = 12` Q15 kernel
+    /// family; alpha = `0xFF` (Yuv420p12 has no alpha plane).
+    #[cfg_attr(not(tarpaulin), inline(always))]
+    pub fn with_rgba(mut self, buf: &'a mut [u8]) -> Result<Self, MixedSinkerError> {
+        self.set_rgba(buf)?;
+        Ok(self)
+    }
+    /// In-place variant of [`with_rgba`](Self::with_rgba).
+    #[cfg_attr(not(tarpaulin), inline(always))]
+    pub fn set_rgba(&mut self, buf: &'a mut [u8]) -> Result<&mut Self, MixedSinkerError> {
+        let expected = self.frame_bytes(4)?;
+        if buf.len() < expected {
+            return Err(MixedSinkerError::RgbaBufferTooShort {
+                expected,
+                actual: buf.len(),
+            });
+        }
+        self.rgba = Some(buf);
+        Ok(self)
+    }
+
+    /// Attaches a packed **`u16`** RGBA output buffer. 12‑bit
+    /// low‑packed (`(1 << 12) - 1 = 4095` max). Length is measured in
+    /// `u16` **elements** (`width × height × 4`). Alpha element is
+    /// `(1 << 12) - 1`.
+    #[cfg_attr(not(tarpaulin), inline(always))]
+    pub fn with_rgba_u16(mut self, buf: &'a mut [u16]) -> Result<Self, MixedSinkerError> {
+        self.set_rgba_u16(buf)?;
+        Ok(self)
+    }
+    /// In-place variant of [`with_rgba_u16`](Self::with_rgba_u16).
+    #[cfg_attr(not(tarpaulin), inline(always))]
+    pub fn set_rgba_u16(&mut self, buf: &'a mut [u16]) -> Result<&mut Self, MixedSinkerError> {
+        let expected = self.frame_bytes(4)?;
+        if buf.len() < expected {
+            return Err(MixedSinkerError::RgbaU16BufferTooShort {
+                expected,
+                actual: buf.len(),
+            });
+        }
+        self.rgba_u16 = Some(buf);
+        Ok(self)
+    }
 }
 
 impl Yuv420p12Sink for MixedSinker<'_, Yuv420p12> {}
@@ -454,6 +711,8 @@
         let Self {
             rgb,
             rgb_u16,
+            rgba,
+            rgba_u16,
             luma,
             hsv,
             rgb_scratch,
@@ -470,7 +729,26 @@
             }
         }
 
-        if let Some(buf) = rgb_u16.as_deref_mut() {
+        // ===== u16 RGB / RGBA path (Strategy A) — see Yuv420p10 for rationale.
+        let want_rgb_u16 = rgb_u16.is_some();
+        let want_rgba_u16 = rgba_u16.is_some();
+
+        if want_rgba_u16 && !want_rgb_u16 {
+            let rgba_u16_buf = rgba_u16.as_deref_mut().unwrap();
+            let rgba_u16_row =
+                rgba_u16_plane_row_slice(rgba_u16_buf, one_plane_start, one_plane_end, w, h)?;
+            yuv420p12_to_rgba_u16_row(
+                row.y(),
+                row.u_half(),
+                row.v_half(),
+                rgba_u16_row,
+                w,
+                row.matrix(),
+                row.full_range(),
+                use_simd,
+            );
+        } else if want_rgb_u16 {
+            let rgb_u16_buf = rgb_u16.as_deref_mut().unwrap();
             let rgb_plane_end = one_plane_end
                 .checked_mul(3)
@@ -480,21 +758,48 @@
                     channels: 3,
                 })?;
             let rgb_plane_start = one_plane_start * 3;
+            let rgb_u16_row = &mut rgb_u16_buf[rgb_plane_start..rgb_plane_end];
             yuv420p12_to_rgb_u16_row(
                 row.y(),
                 row.u_half(),
                 row.v_half(),
-                &mut buf[rgb_plane_start..rgb_plane_end],
+                rgb_u16_row,
                 w,
                 row.matrix(),
                 row.full_range(),
                 use_simd,
             );
+            if want_rgba_u16 {
+                let rgba_u16_buf = rgba_u16.as_deref_mut().unwrap();
+                let rgba_u16_row =
+                    rgba_u16_plane_row_slice(rgba_u16_buf, one_plane_start, one_plane_end, w, h)?;
+                expand_rgb_u16_to_rgba_u16_row::<12>(rgb_u16_row, rgba_u16_row, w);
+            }
         }
 
+        // ===== u8 RGB / RGBA / HSV path (Strategy A) =====
         let want_rgb = rgb.is_some();
+        let want_rgba = rgba.is_some();
         let want_hsv = hsv.is_some();
-        if !want_rgb && !want_hsv {
+        let need_rgb_kernel = want_rgb || want_hsv;
+
+        if want_rgba && !need_rgb_kernel {
+            let rgba_buf = rgba.as_deref_mut().unwrap();
+            let rgba_row = rgba_plane_row_slice(rgba_buf, one_plane_start, one_plane_end, w, h)?;
+            yuv420p12_to_rgba_row(
+                row.y(),
+                row.u_half(),
+                row.v_half(),
+                rgba_row,
+                w,
+                row.matrix(),
+                row.full_range(),
+                use_simd,
+            );
+            return Ok(());
+        }
+
+        if !need_rgb_kernel {
             return Ok(());
         }
@@ -528,6 +833,12 @@
                 use_simd,
             );
         }
+
+        if let Some(buf) = rgba.as_deref_mut() {
+            let rgba_row = rgba_plane_row_slice(buf, one_plane_start, one_plane_end, w, h)?;
+            expand_rgb_to_rgba_row(rgb_row, rgba_row, w);
+        }
+
         Ok(())
     }
 }
@@ -558,6 +869,51 @@ impl<'a> MixedSinker<'a, Yuv420p14> {
         self.rgb_u16 = Some(buf);
         Ok(self)
     }
+
+    /// Attaches a packed **8‑bit** RGBA output buffer. The 14‑bit YUV
+    /// source is converted to 8‑bit RGBA via the `BITS = 14` Q15 kernel
+    /// family; alpha = `0xFF` (Yuv420p14 has no alpha plane).
+    #[cfg_attr(not(tarpaulin), inline(always))]
+    pub fn with_rgba(mut self, buf: &'a mut [u8]) -> Result<Self, MixedSinkerError> {
+        self.set_rgba(buf)?;
+        Ok(self)
+    }
+    /// In-place variant of [`with_rgba`](Self::with_rgba).
+    #[cfg_attr(not(tarpaulin), inline(always))]
+    pub fn set_rgba(&mut self, buf: &'a mut [u8]) -> Result<&mut Self, MixedSinkerError> {
+        let expected = self.frame_bytes(4)?;
+        if buf.len() < expected {
+            return Err(MixedSinkerError::RgbaBufferTooShort {
+                expected,
+                actual: buf.len(),
+            });
+        }
+        self.rgba = Some(buf);
+        Ok(self)
+    }
+
+    /// Attaches a packed **`u16`** RGBA output buffer. 14‑bit
+    /// low‑packed (`(1 << 14) - 1 = 16383` max). Length is measured in
+    /// `u16` **elements** (`width × height × 4`). Alpha element is
+    /// `(1 << 14) - 1`.
+    #[cfg_attr(not(tarpaulin), inline(always))]
+    pub fn with_rgba_u16(mut self, buf: &'a mut [u16]) -> Result<Self, MixedSinkerError> {
+        self.set_rgba_u16(buf)?;
+        Ok(self)
+    }
+    /// In-place variant of [`with_rgba_u16`](Self::with_rgba_u16).
+    #[cfg_attr(not(tarpaulin), inline(always))]
+    pub fn set_rgba_u16(&mut self, buf: &'a mut [u16]) -> Result<&mut Self, MixedSinkerError> {
+        let expected = self.frame_bytes(4)?;
+        if buf.len() < expected {
+            return Err(MixedSinkerError::RgbaU16BufferTooShort {
+                expected,
+                actual: buf.len(),
+            });
+        }
+        self.rgba_u16 = Some(buf);
+        Ok(self)
+    }
 }
 
 impl Yuv420p14Sink for MixedSinker<'_, Yuv420p14> {}
@@ -618,6 +974,8 @@
         let Self {
             rgb,
             rgb_u16,
+            rgba,
+            rgba_u16,
             luma,
             hsv,
             rgb_scratch,
@@ -634,7 +992,26 @@
             }
         }
 
-        if let Some(buf) = rgb_u16.as_deref_mut() {
+        // ===== u16 RGB / RGBA path (Strategy A) — see Yuv420p10 for rationale.
+        let want_rgb_u16 = rgb_u16.is_some();
+        let want_rgba_u16 = rgba_u16.is_some();
+
+        if want_rgba_u16 && !want_rgb_u16 {
+            let rgba_u16_buf = rgba_u16.as_deref_mut().unwrap();
+            let rgba_u16_row =
+                rgba_u16_plane_row_slice(rgba_u16_buf, one_plane_start, one_plane_end, w, h)?;
+            yuv420p14_to_rgba_u16_row(
+                row.y(),
+                row.u_half(),
+                row.v_half(),
+                rgba_u16_row,
+                w,
+                row.matrix(),
+                row.full_range(),
+                use_simd,
+            );
+        } else if want_rgb_u16 {
+            let rgb_u16_buf = rgb_u16.as_deref_mut().unwrap();
             let rgb_plane_end = one_plane_end
                 .checked_mul(3)
@@ -644,21 +1021,48 @@
                     channels: 3,
                 })?;
             let rgb_plane_start = one_plane_start * 3;
+            let rgb_u16_row = &mut rgb_u16_buf[rgb_plane_start..rgb_plane_end];
             yuv420p14_to_rgb_u16_row(
                 row.y(),
                 row.u_half(),
                 row.v_half(),
-                &mut buf[rgb_plane_start..rgb_plane_end],
+                rgb_u16_row,
                 w,
                 row.matrix(),
                 row.full_range(),
                 use_simd,
             );
+            if want_rgba_u16 {
+                let rgba_u16_buf = rgba_u16.as_deref_mut().unwrap();
+                let rgba_u16_row =
+                    rgba_u16_plane_row_slice(rgba_u16_buf, one_plane_start, one_plane_end, w, h)?;
+                expand_rgb_u16_to_rgba_u16_row::<14>(rgb_u16_row, rgba_u16_row, w);
+            }
         }
 
+        // ===== u8 RGB / RGBA / HSV path (Strategy A) =====
         let want_rgb = rgb.is_some();
+        let want_rgba = rgba.is_some();
         let want_hsv = hsv.is_some();
-        if !want_rgb && !want_hsv {
+        let need_rgb_kernel = want_rgb || want_hsv;
+
+        if want_rgba && !need_rgb_kernel {
+            let rgba_buf = rgba.as_deref_mut().unwrap();
+            let rgba_row = rgba_plane_row_slice(rgba_buf, one_plane_start, one_plane_end, w, h)?;
+            yuv420p14_to_rgba_row(
+                row.y(),
+                row.u_half(),
+                row.v_half(),
+                rgba_row,
+                w,
+                row.matrix(),
+                row.full_range(),
+                use_simd,
+            );
+            return Ok(());
+        }
+
+        if !need_rgb_kernel {
             return Ok(());
         }
@@ -692,6 +1096,12 @@
                 use_simd,
             );
         }
+
+        if let Some(buf) = rgba.as_deref_mut() {
+            let rgba_row = rgba_plane_row_slice(buf, one_plane_start, one_plane_end, w, h)?;
+            expand_rgb_to_rgba_row(rgb_row, rgba_row, w);
+        }
+
         Ok(())
     }
 }
@@ -721,6 +1131,50 @@ impl<'a> MixedSinker<'a, Yuv420p16> {
         self.rgb_u16 = Some(buf);
         Ok(self)
     }
+
+    /// Attaches a packed **8‑bit** RGBA output buffer. The 16‑bit YUV
+    /// source is converted to 8‑bit RGBA via the dedicated `BITS = 16`
+    /// kernel family; alpha = `0xFF` (Yuv420p16 has no alpha plane).
+    #[cfg_attr(not(tarpaulin), inline(always))]
+    pub fn with_rgba(mut self, buf: &'a mut [u8]) -> Result<Self, MixedSinkerError> {
+        self.set_rgba(buf)?;
+        Ok(self)
+    }
+    /// In-place variant of [`with_rgba`](Self::with_rgba).
+    #[cfg_attr(not(tarpaulin), inline(always))]
+    pub fn set_rgba(&mut self, buf: &'a mut [u8]) -> Result<&mut Self, MixedSinkerError> {
+        let expected = self.frame_bytes(4)?;
+        if buf.len() < expected {
+            return Err(MixedSinkerError::RgbaBufferTooShort {
+                expected,
+                actual: buf.len(),
+            });
+        }
+        self.rgba = Some(buf);
+        Ok(self)
+    }
+
+    /// Attaches a packed **`u16`** RGBA output buffer. 16‑bit output
+    /// (full `u16` range). Length is measured in `u16` **elements**
+    /// (`width × height × 4`). Alpha element is `u16::MAX`.
+    #[cfg_attr(not(tarpaulin), inline(always))]
+    pub fn with_rgba_u16(mut self, buf: &'a mut [u16]) -> Result<Self, MixedSinkerError> {
+        self.set_rgba_u16(buf)?;
+        Ok(self)
+    }
+    /// In-place variant of [`with_rgba_u16`](Self::with_rgba_u16).
+    #[cfg_attr(not(tarpaulin), inline(always))]
+    pub fn set_rgba_u16(&mut self, buf: &'a mut [u16]) -> Result<&mut Self, MixedSinkerError> {
+        let expected = self.frame_bytes(4)?;
+        if buf.len() < expected {
+            return Err(MixedSinkerError::RgbaU16BufferTooShort {
+                expected,
+                actual: buf.len(),
+            });
+        }
+        self.rgba_u16 = Some(buf);
+        Ok(self)
+    }
 }
 
 impl Yuv420p16Sink for MixedSinker<'_, Yuv420p16> {}
@@ -782,6 +1236,8 @@
         let Self {
             rgb,
             rgb_u16,
+            rgba,
+            rgba_u16,
             luma,
             hsv,
             rgb_scratch,
@@ -798,7 +1254,26 @@
             }
         }
 
-        if let Some(buf) = rgb_u16.as_deref_mut() {
+        // ===== u16 RGB / RGBA path (Strategy A) — see Yuv420p10 for rationale.
+        let want_rgb_u16 = rgb_u16.is_some();
+        let want_rgba_u16 = rgba_u16.is_some();
+
+        if want_rgba_u16 && !want_rgb_u16 {
+            let rgba_u16_buf = rgba_u16.as_deref_mut().unwrap();
+            let rgba_u16_row =
+                rgba_u16_plane_row_slice(rgba_u16_buf, one_plane_start, one_plane_end, w, h)?;
+            yuv420p16_to_rgba_u16_row(
+                row.y(),
+                row.u_half(),
+                row.v_half(),
+                rgba_u16_row,
+                w,
+                row.matrix(),
+                row.full_range(),
+                use_simd,
+            );
+        } else if want_rgb_u16 {
+            let rgb_u16_buf = rgb_u16.as_deref_mut().unwrap();
             let rgb_plane_end = one_plane_end
                 .checked_mul(3)
@@ -808,21 +1283,48 @@
                     channels: 3,
                 })?;
             let rgb_plane_start = one_plane_start * 3;
+            let rgb_u16_row = &mut rgb_u16_buf[rgb_plane_start..rgb_plane_end];
             yuv420p16_to_rgb_u16_row(
                 row.y(),
                 row.u_half(),
                 row.v_half(),
-                &mut buf[rgb_plane_start..rgb_plane_end],
+                rgb_u16_row,
                 w,
                 row.matrix(),
                 row.full_range(),
                 use_simd,
            );
+            if want_rgba_u16 {
+                let rgba_u16_buf = rgba_u16.as_deref_mut().unwrap();
+                let rgba_u16_row =
+                    rgba_u16_plane_row_slice(rgba_u16_buf, one_plane_start, one_plane_end, w, h)?;
+                expand_rgb_u16_to_rgba_u16_row::<16>(rgb_u16_row, rgba_u16_row, w);
+            }
         }
 
+        // ===== u8 RGB / RGBA / HSV path (Strategy A) =====
         let want_rgb = rgb.is_some();
+        let want_rgba = rgba.is_some();
         let want_hsv = hsv.is_some();
-        if !want_rgb && !want_hsv {
+        let need_rgb_kernel = want_rgb || want_hsv;
+
+        if want_rgba && !need_rgb_kernel {
+            let rgba_buf = rgba.as_deref_mut().unwrap();
+            let rgba_row = rgba_plane_row_slice(rgba_buf, one_plane_start, one_plane_end, w, h)?;
+            yuv420p16_to_rgba_row(
+                row.y(),
+                row.u_half(),
+                row.v_half(),
+                rgba_row,
+                w,
+                row.matrix(),
+                row.full_range(),
+                use_simd,
+            );
+            return Ok(());
+        }
+
+        if !need_rgb_kernel {
             return Ok(());
         }
@@ -856,6 +1358,12 @@
                 use_simd,
             );
         }
+
+        if let Some(buf) = rgba.as_deref_mut() {
+            let rgba_row = rgba_plane_row_slice(buf, one_plane_start, one_plane_end, w, h)?;
+            expand_rgb_to_rgba_row(rgb_row, rgba_row, w);
+        }
+
         Ok(())
     }
 }
@@ -893,6 +1401,52 @@ impl<'a> MixedSinker<'a, P010> {
         self.rgb_u16 = Some(buf);
         Ok(self)
     }
+
+    /// Attaches a packed **8‑bit** RGBA output buffer. The 10‑bit P010
+    /// source (semi‑planar, high‑bit‑packed) is converted to 8‑bit RGBA
+    /// via the `BITS = 10` Q15 kernel family; alpha = `0xFF` (P010 has
+    /// no alpha plane).
+    #[cfg_attr(not(tarpaulin), inline(always))]
+    pub fn with_rgba(mut self, buf: &'a mut [u8]) -> Result<Self, MixedSinkerError> {
+        self.set_rgba(buf)?;
+        Ok(self)
+    }
+    /// In-place variant of [`with_rgba`](Self::with_rgba).
+    #[cfg_attr(not(tarpaulin), inline(always))]
+    pub fn set_rgba(&mut self, buf: &'a mut [u8]) -> Result<&mut Self, MixedSinkerError> {
+        let expected = self.frame_bytes(4)?;
+        if buf.len() < expected {
+            return Err(MixedSinkerError::RgbaBufferTooShort {
+                expected,
+                actual: buf.len(),
+            });
+        }
+        self.rgba = Some(buf);
+        Ok(self)
+    }
+
+    /// Attaches a packed **`u16`** RGBA output buffer. Output is
+    /// **low‑bit‑packed** 10‑bit values (`yuv420p10le` convention) — not
+    /// P010 high‑bit packing. Length is measured in `u16` **elements**
+    /// (`width × height × 4`). Alpha element is `(1 << 10) - 1`.
+    #[cfg_attr(not(tarpaulin), inline(always))]
+    pub fn with_rgba_u16(mut self, buf: &'a mut [u16]) -> Result<Self, MixedSinkerError> {
+        self.set_rgba_u16(buf)?;
+        Ok(self)
+    }
+    /// In-place variant of [`with_rgba_u16`](Self::with_rgba_u16).
+    #[cfg_attr(not(tarpaulin), inline(always))]
+    pub fn set_rgba_u16(&mut self, buf: &'a mut [u16]) -> Result<&mut Self, MixedSinkerError> {
+        let expected = self.frame_bytes(4)?;
+        if buf.len() < expected {
+            return Err(MixedSinkerError::RgbaU16BufferTooShort {
+                expected,
+                actual: buf.len(),
+            });
+        }
+        self.rgba_u16 = Some(buf);
+        Ok(self)
+    }
 }
 
 impl P010Sink for MixedSinker<'_, P010> {}
@@ -909,6 +1463,10 @@
     }
 
     fn process(&mut self, row: P010Row<'_>) -> Result<(), Self::Error> {
+        // P010 stores 10‑bit samples high‑bit‑packed; bit depth is fixed
+        // by the format. Used for the u16 RGBA expand path's alpha pad.
+        const BITS: u32 = 10;
+
         let w = self.width;
         let h = self.height;
         let idx = row.row();
@@ -944,6 +1502,8 @@
         let Self {
             rgb,
             rgb_u16,
+            rgba,
+            rgba_u16,
             luma,
             hsv,
             rgb_scratch,
@@ -964,9 +1524,27 @@
             }
         }
 
-        // `u16` RGB output — low-bit-packed 10-bit values (yuv420p10le
-        // convention), not P010's high-bit packing.
-        if let Some(buf) = rgb_u16.as_deref_mut() {
+        // ===== u16 RGB / RGBA path (Strategy A) — see Yuv420p10 for rationale.
+        // u16 outputs are low-bit-packed (yuv420p10le convention), not
+        // P010's high-bit packing.
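+        // For example, a mid-gray sample stored as 512 << 6 = 32768 in the
+        // P010 planes comes out as roughly 512 in the low-bit-packed u16
+        // RGB and RGBA outputs.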
+        let want_rgb_u16 = rgb_u16.is_some();
+        let want_rgba_u16 = rgba_u16.is_some();
+
+        if want_rgba_u16 && !want_rgb_u16 {
+            let rgba_u16_buf = rgba_u16.as_deref_mut().unwrap();
+            let rgba_u16_row =
+                rgba_u16_plane_row_slice(rgba_u16_buf, one_plane_start, one_plane_end, w, h)?;
+            p010_to_rgba_u16_row(
+                row.y(),
+                row.uv_half(),
+                rgba_u16_row,
+                w,
+                row.matrix(),
+                row.full_range(),
+                use_simd,
+            );
+        } else if want_rgb_u16 {
+            let rgb_u16_buf = rgb_u16.as_deref_mut().unwrap();
             let rgb_plane_end = one_plane_end
                 .checked_mul(3)
@@ -976,20 +1554,46 @@
                     channels: 3,
                 })?;
             let rgb_plane_start = one_plane_start * 3;
+            let rgb_u16_row = &mut rgb_u16_buf[rgb_plane_start..rgb_plane_end];
             p010_to_rgb_u16_row(
                 row.y(),
                 row.uv_half(),
-                &mut buf[rgb_plane_start..rgb_plane_end],
+                rgb_u16_row,
                 w,
                 row.matrix(),
                 row.full_range(),
                 use_simd,
             );
+            if want_rgba_u16 {
+                let rgba_u16_buf = rgba_u16.as_deref_mut().unwrap();
+                let rgba_u16_row =
+                    rgba_u16_plane_row_slice(rgba_u16_buf, one_plane_start, one_plane_end, w, h)?;
+                expand_rgb_u16_to_rgba_u16_row::<BITS>(rgb_u16_row, rgba_u16_row, w);
+            }
         }
 
+        // ===== u8 RGB / RGBA / HSV path (Strategy A) =====
         let want_rgb = rgb.is_some();
+        let want_rgba = rgba.is_some();
         let want_hsv = hsv.is_some();
-        if !want_rgb && !want_hsv {
+        let need_rgb_kernel = want_rgb || want_hsv;
+
+        if want_rgba && !need_rgb_kernel {
+            let rgba_buf = rgba.as_deref_mut().unwrap();
+            let rgba_row = rgba_plane_row_slice(rgba_buf, one_plane_start, one_plane_end, w, h)?;
+            p010_to_rgba_row(
+                row.y(),
+                row.uv_half(),
+                rgba_row,
+                w,
+                row.matrix(),
+                row.full_range(),
+                use_simd,
+            );
+            return Ok(());
+        }
+
+        if !need_rgb_kernel {
             return Ok(());
         }
@@ -1022,6 +1626,12 @@
                 use_simd,
             );
         }
+
+        if let Some(buf) = rgba.as_deref_mut() {
+            let rgba_row = rgba_plane_row_slice(buf, one_plane_start, one_plane_end, w, h)?;
+            expand_rgb_to_rgba_row(rgb_row, rgba_row, w);
+        }
+
         Ok(())
     }
 }
@@ -1053,6 +1663,52 @@ impl<'a> MixedSinker<'a, P012> {
         self.rgb_u16 = Some(buf);
         Ok(self)
     }
+
+    /// Attaches a packed **8‑bit** RGBA output buffer. The 12‑bit P012
+    /// source (semi‑planar, high‑bit‑packed) is converted to 8‑bit RGBA
+    /// via the `BITS = 12` Q15 kernel family; alpha = `0xFF`.
+    #[cfg_attr(not(tarpaulin), inline(always))]
+    pub fn with_rgba(mut self, buf: &'a mut [u8]) -> Result<Self, MixedSinkerError> {
+        self.set_rgba(buf)?;
+        Ok(self)
+    }
+    /// In-place variant of [`with_rgba`](Self::with_rgba).
+    #[cfg_attr(not(tarpaulin), inline(always))]
+    pub fn set_rgba(&mut self, buf: &'a mut [u8]) -> Result<&mut Self, MixedSinkerError> {
+        let expected = self.frame_bytes(4)?;
+        if buf.len() < expected {
+            return Err(MixedSinkerError::RgbaBufferTooShort {
+                expected,
+                actual: buf.len(),
+            });
+        }
+        self.rgba = Some(buf);
+        Ok(self)
+    }
+
+    /// Attaches a packed **`u16`** RGBA output buffer. Output is
+    /// **low‑bit‑packed** 12‑bit values (`yuv420p12le` convention) —
+    /// not P012 high‑bit packing. Length is measured in `u16`
+    /// **elements** (`width × height × 4`). Alpha element is
+    /// `(1 << 12) - 1`.
+    #[cfg_attr(not(tarpaulin), inline(always))]
+    pub fn with_rgba_u16(mut self, buf: &'a mut [u16]) -> Result<Self, MixedSinkerError> {
+        self.set_rgba_u16(buf)?;
+        Ok(self)
+    }
+    /// In-place variant of [`with_rgba_u16`](Self::with_rgba_u16).
+    #[cfg_attr(not(tarpaulin), inline(always))]
+    pub fn set_rgba_u16(&mut self, buf: &'a mut [u16]) -> Result<&mut Self, MixedSinkerError> {
+        let expected = self.frame_bytes(4)?;
+        if buf.len() < expected {
+            return Err(MixedSinkerError::RgbaU16BufferTooShort {
+                expected,
+                actual: buf.len(),
+            });
+        }
+        self.rgba_u16 = Some(buf);
+        Ok(self)
+    }
 }
 
 impl P012Sink for MixedSinker<'_, P012> {}
@@ -1069,6 +1725,10 @@
     }
 
     fn process(&mut self, row: P012Row<'_>) -> Result<(), Self::Error> {
+        // P012 stores 12‑bit samples high‑bit‑packed; bit depth is fixed
+        // by the format. Used for the u16 RGBA expand path's alpha pad.
+        const BITS: u32 = 12;
+
         let w = self.width;
         let h = self.height;
         let idx = row.row();
@@ -1103,6 +1763,8 @@
         let Self {
             rgb,
            rgb_u16,
+            rgba,
+            rgba_u16,
             luma,
             hsv,
             rgb_scratch,
@@ -1123,7 +1785,27 @@
             }
         }
 
-        if let Some(buf) = rgb_u16.as_deref_mut() {
+        // ===== u16 RGB / RGBA path (Strategy A) — see Yuv420p10 for rationale.
+        // u16 outputs are low-bit-packed (yuv420p12le convention), not
+        // P012's high-bit packing.
+        let want_rgb_u16 = rgb_u16.is_some();
+        let want_rgba_u16 = rgba_u16.is_some();
+
+        if want_rgba_u16 && !want_rgb_u16 {
+            let rgba_u16_buf = rgba_u16.as_deref_mut().unwrap();
+            let rgba_u16_row =
+                rgba_u16_plane_row_slice(rgba_u16_buf, one_plane_start, one_plane_end, w, h)?;
+            p012_to_rgba_u16_row(
+                row.y(),
+                row.uv_half(),
+                rgba_u16_row,
+                w,
+                row.matrix(),
+                row.full_range(),
+                use_simd,
+            );
+        } else if want_rgb_u16 {
+            let rgb_u16_buf = rgb_u16.as_deref_mut().unwrap();
             let rgb_plane_end = one_plane_end
                 .checked_mul(3)
@@ -1133,20 +1815,46 @@
                     channels: 3,
                 })?;
             let rgb_plane_start = one_plane_start * 3;
+            let rgb_u16_row = &mut rgb_u16_buf[rgb_plane_start..rgb_plane_end];
             p012_to_rgb_u16_row(
                 row.y(),
                 row.uv_half(),
-                &mut buf[rgb_plane_start..rgb_plane_end],
+                rgb_u16_row,
                 w,
                 row.matrix(),
                 row.full_range(),
                 use_simd,
             );
+            if want_rgba_u16 {
+                let rgba_u16_buf = rgba_u16.as_deref_mut().unwrap();
+                let rgba_u16_row =
+                    rgba_u16_plane_row_slice(rgba_u16_buf, one_plane_start, one_plane_end, w, h)?;
+                expand_rgb_u16_to_rgba_u16_row::<BITS>(rgb_u16_row, rgba_u16_row, w);
+            }
         }
 
+        // ===== u8 RGB / RGBA / HSV path (Strategy A) =====
         let want_rgb = rgb.is_some();
+        let want_rgba = rgba.is_some();
         let want_hsv = hsv.is_some();
-        if !want_rgb && !want_hsv {
+        let need_rgb_kernel = want_rgb || want_hsv;
+
+        if want_rgba && !need_rgb_kernel {
+            let rgba_buf = rgba.as_deref_mut().unwrap();
+            let rgba_row = rgba_plane_row_slice(rgba_buf, one_plane_start, one_plane_end, w, h)?;
+            p012_to_rgba_row(
+                row.y(),
+                row.uv_half(),
+                rgba_row,
+                w,
+                row.matrix(),
+                row.full_range(),
+                use_simd,
+            );
+            return Ok(());
+        }
+
+        if !need_rgb_kernel {
             return Ok(());
         }
@@ -1179,6 +1887,12 @@
                 use_simd,
             );
         }
+
+        if let Some(buf) = rgba.as_deref_mut() {
+            let rgba_row = rgba_plane_row_slice(buf, one_plane_start, one_plane_end, w, h)?;
+            expand_rgb_to_rgba_row(rgb_row, rgba_row, w);
+        }
+
         Ok(())
     }
 }
@@ -1209,6 +1923,50 @@ impl<'a> MixedSinker<'a, P016> {
         self.rgb_u16 = Some(buf);
         Ok(self)
     }
+
+    /// Attaches a packed **8‑bit** RGBA output buffer. The 16‑bit P016
+    /// source (semi‑planar) is converted to 8‑bit RGBA via the dedicated
+    /// `BITS = 16` kernel family; alpha = `0xFF`.
+    #[cfg_attr(not(tarpaulin), inline(always))]
+    pub fn with_rgba(mut self, buf: &'a mut [u8]) -> Result<Self, MixedSinkerError> {
+        self.set_rgba(buf)?;
+        Ok(self)
+    }
+    /// In-place variant of [`with_rgba`](Self::with_rgba).
+    #[cfg_attr(not(tarpaulin), inline(always))]
+    pub fn set_rgba(&mut self, buf: &'a mut [u8]) -> Result<&mut Self, MixedSinkerError> {
+        let expected = self.frame_bytes(4)?;
+        if buf.len() < expected {
+            return Err(MixedSinkerError::RgbaBufferTooShort {
+                expected,
+                actual: buf.len(),
+            });
+        }
+        self.rgba = Some(buf);
+        Ok(self)
+    }
+
+    /// Attaches a packed **`u16`** RGBA output buffer. 16‑bit output
+    /// (full `u16` range). Length is measured in `u16` **elements**
+    /// (`width × height × 4`). Alpha element is `u16::MAX`.
+    #[cfg_attr(not(tarpaulin), inline(always))]
+    pub fn with_rgba_u16(mut self, buf: &'a mut [u16]) -> Result<Self, MixedSinkerError> {
+        self.set_rgba_u16(buf)?;
+        Ok(self)
+    }
+    /// In-place variant of [`with_rgba_u16`](Self::with_rgba_u16).
+    #[cfg_attr(not(tarpaulin), inline(always))]
+    pub fn set_rgba_u16(&mut self, buf: &'a mut [u16]) -> Result<&mut Self, MixedSinkerError> {
+        let expected = self.frame_bytes(4)?;
+        if buf.len() < expected {
+            return Err(MixedSinkerError::RgbaU16BufferTooShort {
+                expected,
+                actual: buf.len(),
+            });
+        }
+        self.rgba_u16 = Some(buf);
+        Ok(self)
+    }
 }
 
 impl P016Sink for MixedSinker<'_, P016> {}
@@ -1225,6 +1983,10 @@
     }
 
     fn process(&mut self, row: P016Row<'_>) -> Result<(), Self::Error> {
+        // Bit depth is fixed by the format (16). Used for the u16 RGBA
+        // expand path's alpha pad (`alpha = u16::MAX` at this depth).
+        const BITS: u32 = 16;
+
         let w = self.width;
         let h = self.height;
         let idx = row.row();
@@ -1259,6 +2021,8 @@
         let Self {
             rgb,
             rgb_u16,
+            rgba,
+            rgba_u16,
             luma,
             hsv,
             rgb_scratch,
@@ -1276,7 +2040,25 @@
            }
         }
 
-        if let Some(buf) = rgb_u16.as_deref_mut() {
+        // ===== u16 RGB / RGBA path (Strategy A) — see Yuv420p10 for rationale.
+        let want_rgb_u16 = rgb_u16.is_some();
+        let want_rgba_u16 = rgba_u16.is_some();
+
+        if want_rgba_u16 && !want_rgb_u16 {
+            let rgba_u16_buf = rgba_u16.as_deref_mut().unwrap();
+            let rgba_u16_row =
+                rgba_u16_plane_row_slice(rgba_u16_buf, one_plane_start, one_plane_end, w, h)?;
+            p016_to_rgba_u16_row(
+                row.y(),
+                row.uv_half(),
+                rgba_u16_row,
+                w,
+                row.matrix(),
+                row.full_range(),
+                use_simd,
+            );
+        } else if want_rgb_u16 {
+            let rgb_u16_buf = rgb_u16.as_deref_mut().unwrap();
             let rgb_plane_end = one_plane_end
                 .checked_mul(3)
@@ -1286,20 +2068,46 @@
                     channels: 3,
                 })?;
             let rgb_plane_start = one_plane_start * 3;
+            let rgb_u16_row = &mut rgb_u16_buf[rgb_plane_start..rgb_plane_end];
             p016_to_rgb_u16_row(
                 row.y(),
                 row.uv_half(),
-                &mut buf[rgb_plane_start..rgb_plane_end],
+                rgb_u16_row,
                 w,
                 row.matrix(),
                 row.full_range(),
                 use_simd,
             );
+            if want_rgba_u16 {
+                let rgba_u16_buf = rgba_u16.as_deref_mut().unwrap();
+                let rgba_u16_row =
+                    rgba_u16_plane_row_slice(rgba_u16_buf, one_plane_start, one_plane_end, w, h)?;
+                expand_rgb_u16_to_rgba_u16_row::<BITS>(rgb_u16_row, rgba_u16_row, w);
+            }
         }
 
+        // ===== u8 RGB / RGBA / HSV path (Strategy A) =====
         let want_rgb = rgb.is_some();
+        let want_rgba = rgba.is_some();
         let want_hsv = hsv.is_some();
-        if !want_rgb && !want_hsv {
+        let need_rgb_kernel = want_rgb || want_hsv;
+
+        if want_rgba && !need_rgb_kernel {
+            let rgba_buf = rgba.as_deref_mut().unwrap();
+            let rgba_row = rgba_plane_row_slice(rgba_buf, one_plane_start, one_plane_end, w, h)?;
+            p016_to_rgba_row(
+                row.y(),
+                row.uv_half(),
+                rgba_row,
+                w,
+                row.matrix(),
+                row.full_range(),
+                use_simd,
+            );
+            return Ok(());
+        }
+
+        if !need_rgb_kernel {
             return Ok(());
        }
@@ -1332,6 +2140,12 @@
                 use_simd,
             );
         }
+
+        if let Some(buf) = rgba.as_deref_mut() {
+            let rgba_row = rgba_plane_row_slice(buf, one_plane_start, one_plane_end, w, h)?;
+            expand_rgb_to_rgba_row(rgb_row, rgba_row, w);
+        }
+
         Ok(())
     }
 }
diff --git a/src/sinker/mixed/tests.rs b/src/sinker/mixed/tests.rs
index d17849d..b65f7de 100644
--- a/src/sinker/mixed/tests.rs
+++ b/src/sinker/mixed/tests.rs
@@ -2839,6 +2839,154 @@ fn yuv420p10_with_simd_false_matches_with_simd_true() {
     assert_eq!(rgb_u16_scalar, rgb_u16_simd);
 }
 
+// ---- Yuv420p10 RGBA (Ship 8 Tranche 5b) -------------------------------
+
+#[test]
+#[cfg_attr(
+    miri,
+    ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri"
+)]
+fn yuv420p10_rgba_u8_only_gray_with_opaque_alpha() {
+    // 10-bit mid-gray → 8-bit RGBA ≈ (128, 128, 128, 255) per pixel.
+    let (yp, up, vp) = solid_yuv420p10_frame(16, 8, 512, 512, 512);
+    let src = Yuv420p10Frame::new(&yp, &up, &vp, 16, 8, 16, 8, 8);
+
+    let mut rgba = std::vec![0u8; 16 * 8 * 4];
+    let mut sink = MixedSinker::<Yuv420p10>::new(16, 8)
+        .with_rgba(&mut rgba)
+        .unwrap();
+    yuv420p10_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap();
+
+    for px in rgba.chunks(4) {
+        assert!(px[0].abs_diff(128) <= 1);
+        assert_eq!(px[0], px[1]);
+        assert_eq!(px[1], px[2]);
+        assert_eq!(px[3], 0xFF, "alpha must be opaque");
+    }
+}
+
+#[test]
+#[cfg_attr(
+    miri,
+    ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri"
+)]
+fn yuv420p10_rgba_u16_only_native_depth_gray_with_opaque_alpha() {
+    // 10-bit mid-gray → u16 RGBA: each color element ≈ 512, alpha = 1023.
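+    // (512 is mid-scale for full-range 10-bit input, so gray lands at
+    // R = G = B of about 512 out of 1023; 1023 is the 10-bit opaque
+    // alpha maximum.)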
+ let (yp, up, vp) = solid_yuv420p10_frame(16, 8, 512, 512, 512); + let src = Yuv420p10Frame::new(&yp, &up, &vp, 16, 8, 16, 8, 8); + + let mut rgba = std::vec![0u16; 16 * 8 * 4]; + let mut sink = MixedSinker::::new(16, 8) + .with_rgba_u16(&mut rgba) + .unwrap(); + yuv420p10_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); + + for px in rgba.chunks(4) { + assert!(px[0].abs_diff(512) <= 1, "got {px:?}"); + assert_eq!(px[0], px[1]); + assert_eq!(px[1], px[2]); + assert_eq!(px[3], 1023, "alpha must equal (1 << 10) - 1"); + } +} + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn yuv420p10_with_rgb_and_with_rgba_produce_byte_identical_rgb_bytes() { + // Strategy A: when both rgb and rgba are attached, the rgb buffer is + // populated by the RGB kernel and the rgba buffer is populated via a + // cheap expand pass. RGB triples must be byte-identical to the + // standalone RGB-only run. + let (yp, up, vp) = solid_yuv420p10_frame(64, 16, 600, 400, 700); + let src = Yuv420p10Frame::new(&yp, &up, &vp, 64, 16, 64, 32, 32); + + let mut rgb_solo = std::vec![0u8; 64 * 16 * 3]; + let mut s_solo = MixedSinker::::new(64, 16) + .with_rgb(&mut rgb_solo) + .unwrap(); + yuv420p10_to(&src, true, ColorMatrix::Bt709, &mut s_solo).unwrap(); + + let mut rgb_combined = std::vec![0u8; 64 * 16 * 3]; + let mut rgba = std::vec![0u8; 64 * 16 * 4]; + let mut s_combined = MixedSinker::::new(64, 16) + .with_rgb(&mut rgb_combined) + .unwrap() + .with_rgba(&mut rgba) + .unwrap(); + yuv420p10_to(&src, true, ColorMatrix::Bt709, &mut s_combined).unwrap(); + + assert_eq!(rgb_solo, rgb_combined, "RGB bytes must match across runs"); + for (rgb_px, rgba_px) in rgb_combined.chunks(3).zip(rgba.chunks(4)) { + assert_eq!(rgb_px[0], rgba_px[0]); + assert_eq!(rgb_px[1], rgba_px[1]); + assert_eq!(rgb_px[2], rgba_px[2]); + assert_eq!(rgba_px[3], 0xFF); + } +} + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn yuv420p10_with_rgb_u16_and_with_rgba_u16_produce_byte_identical_rgb_elems() { + // Strategy A on the u16 path: rgb_u16 buffer populated by the u16 RGB + // kernel, rgba_u16 fanned out via expand_rgb_u16_to_rgba_u16_row<10>. + let (yp, up, vp) = solid_yuv420p10_frame(64, 16, 600, 400, 700); + let src = Yuv420p10Frame::new(&yp, &up, &vp, 64, 16, 64, 32, 32); + + let mut rgb_solo = std::vec![0u16; 64 * 16 * 3]; + let mut s_solo = MixedSinker::::new(64, 16) + .with_rgb_u16(&mut rgb_solo) + .unwrap(); + yuv420p10_to(&src, true, ColorMatrix::Bt709, &mut s_solo).unwrap(); + + let mut rgb_combined = std::vec![0u16; 64 * 16 * 3]; + let mut rgba = std::vec![0u16; 64 * 16 * 4]; + let mut s_combined = MixedSinker::::new(64, 16) + .with_rgb_u16(&mut rgb_combined) + .unwrap() + .with_rgba_u16(&mut rgba) + .unwrap(); + yuv420p10_to(&src, true, ColorMatrix::Bt709, &mut s_combined).unwrap(); + + assert_eq!( + rgb_solo, rgb_combined, + "RGB u16 elements must match across runs" + ); + for (rgb_px, rgba_px) in rgb_combined.chunks(3).zip(rgba.chunks(4)) { + assert_eq!(rgb_px[0], rgba_px[0]); + assert_eq!(rgb_px[1], rgba_px[1]); + assert_eq!(rgb_px[2], rgba_px[2]); + assert_eq!(rgba_px[3], 1023, "alpha = (1 << 10) - 1"); + } +} + +#[test] +fn yuv420p10_rgba_too_short_returns_err() { + let mut rgba = std::vec![0u8; 10]; + let err = MixedSinker::::new(16, 8) + .with_rgba(&mut rgba) + .err() + .expect("expected RgbaBufferTooShort"); + assert!(matches!(err, MixedSinkerError::RgbaBufferTooShort { .. 
+
+#[test]
+fn yuv420p10_rgba_too_short_returns_err() {
+    let mut rgba = std::vec![0u8; 10];
+    let err = MixedSinker::<Yuv420p10>::new(16, 8)
+        .with_rgba(&mut rgba)
+        .err()
+        .expect("expected RgbaBufferTooShort");
+    assert!(matches!(err, MixedSinkerError::RgbaBufferTooShort { .. }));
+}
+
+#[test]
+fn yuv420p10_rgba_u16_too_short_returns_err() {
+    let mut rgba = std::vec![0u16; 10];
+    let err = MixedSinker::<Yuv420p10>::new(16, 8)
+        .with_rgba_u16(&mut rgba)
+        .err()
+        .expect("expected RgbaU16BufferTooShort");
+    assert!(matches!(
+        err,
+        MixedSinkerError::RgbaU16BufferTooShort { .. }
+    ));
+}
+
 // ---- P010 --------------------------------------------------------------
 //
 // Semi-planar 10-bit, high-bit-packed (samples in high 10 of each
@@ -3039,6 +3187,33 @@ fn p010_with_simd_false_matches_with_simd_true() {
     assert_eq!(rgb_u16_scalar, rgb_u16_simd);
 }
 
+// ---- P010 RGBA (Ship 8 Tranche 5b) ------------------------------------
+
+#[test]
+#[cfg_attr(
+    miri,
+    ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri"
+)]
+fn p010_rgba_u16_only_native_depth_gray_with_opaque_alpha() {
+    // P010 mid-gray: the helper packs the 10-bit values into the high 10
+    // bits of each u16, i.e. Y/U/V samples are stored as 512 << 6.
+    // Output u16 RGBA: each color element ≈ 512, alpha = 1023.
+    let (yp, uvp) = solid_p010_frame(16, 8, 512, 512, 512);
+    let src = P010Frame::new(&yp, &uvp, 16, 8, 16, 16);
+
+    let mut rgba = std::vec![0u16; 16 * 8 * 4];
+    let mut sink = MixedSinker::<P010>::new(16, 8)
+        .with_rgba_u16(&mut rgba)
+        .unwrap();
+    p010_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap();
+
+    for px in rgba.chunks(4) {
+        assert!(px[0].abs_diff(512) <= 1, "got {px:?}");
+        assert_eq!(px[0], px[1]);
+        assert_eq!(px[1], px[2]);
+        assert_eq!(px[3], 1023, "alpha = (1 << 10) - 1");
+    }
+}
+
 // ---- Yuv420p12 ---------------------------------------------------------
 //
 // Planar 12-bit, low-bit-packed. Mirrors the Yuv420p10 shape — same
@@ -3728,6 +3903,33 @@ fn yuv420p16_with_simd_false_matches_with_simd_true() {
     assert_eq!(rgb_u16_scalar, rgb_u16_simd);
 }
 
+// ---- Yuv420p16 RGBA (Ship 8 Tranche 5b) -------------------------------
+
+#[test]
+#[cfg_attr(
+    miri,
+    ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri"
+)]
+fn yuv420p16_rgba_u16_only_native_depth_gray_with_opaque_alpha() {
+    // 16-bit mid-gray: Y=UV=32768. Output u16 RGBA: each color element ≈
+    // 32768, alpha = 0xFFFF.
+    let (yp, up, vp) = solid_yuv420p16_frame(16, 8, 32768, 32768, 32768);
+    let src = Yuv420p16Frame::new(&yp, &up, &vp, 16, 8, 16, 8, 8);
+
+    let mut rgba = std::vec![0u16; 16 * 8 * 4];
+    let mut sink = MixedSinker::<Yuv420p16>::new(16, 8)
+        .with_rgba_u16(&mut rgba)
+        .unwrap();
+    yuv420p16_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap();
+
+    for px in rgba.chunks(4) {
+        assert!(px[0].abs_diff(32768) <= 8, "got {px:?}");
+        assert_eq!(px[0], px[1]);
+        assert_eq!(px[1], px[2]);
+        assert_eq!(px[3], 0xFFFF, "alpha must equal 0xFFFF");
+    }
+}
+
 // ---- P016 --------------------------------------------------------------
 
 fn solid_p016_frame(width: u32, height: u32, y: u16, u: u16, v: u16) -> (Vec<u16>, Vec<u16>) {
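Editor's note: PATCH 3/3 below only retouches the doc comment on
`rgba_u16_plane_row_slice`. For orientation, the element-indexed slicing that
comment describes amounts to the sketch here — the argument list is trimmed
(the real helper also takes `w`/`h`) and the error type is a placeholder for
the crate's `MixedSinkerError`:

fn rgba_u16_plane_row_slice(
    buf: &mut [u16],
    one_plane_start: usize,
    one_plane_end: usize,
) -> Result<&mut [u16], &'static str> {
    // 4 u16 elements per RGBA pixel — the same ×4 bound as the u8
    // variant, but measured in elements (2 bytes each), not bytes.
    let end = one_plane_end
        .checked_mul(4)
        .ok_or("RGBA plane offset overflows usize")?;
    let start = one_plane_start * 4;
    buf.get_mut(start..end)
        .ok_or("rgba_u16 buffer too short for this row")
}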
From 2da756a27eabaa0acba2bf9baa6ca9fe96188979 Mon Sep 17 00:00:00 2001
From: Al Liu
Date: Sun, 26 Apr 2026 21:29:22 +0800
Subject: [PATCH 3/3] Update src/sinker/mixed/mod.rs

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 src/sinker/mixed/mod.rs | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/sinker/mixed/mod.rs b/src/sinker/mixed/mod.rs
index 69208e0..f278805 100644
--- a/src/sinker/mixed/mod.rs
+++ b/src/sinker/mixed/mod.rs
@@ -1120,9 +1120,10 @@ pub(super) fn rgba_plane_row_slice(
 }
 
 /// `u16` analogue of [`rgba_plane_row_slice`] — slices the RGBA row out
-/// of an attached `u16` RGBA plane buffer. Element count and byte
-/// offsets are identical (both `× 4`); the only difference is the
-/// element type, so the overflow check is the same. Used by the
+/// of an attached `u16` RGBA plane buffer. This helper indexes in `u16`
+/// elements, not bytes: like the `u8` variant, RGBA rows use `× 4`
+/// elements per pixel, so the overflow check is the same, but the byte
+/// offsets differ because each element is 2 bytes. Used by the
 /// high-bit-depth 4:2:0 sinkers that fan `u16` RGB out to `u16` RGBA.
 #[cfg_attr(not(tarpaulin), inline(always))]
 pub(super) fn rgba_u16_plane_row_slice(