From ea749e5b42a71415ffa3b19902ff6749b100b0c0 Mon Sep 17 00:00:00 2001 From: uqio <276879906+uqio@users.noreply.github.com> Date: Mon, 27 Apr 2026 23:03:24 +1200 Subject: [PATCH 1/2] update --- src/row/arch/neon.rs | 110 ++++++++++++++++--- src/row/arch/neon/tests.rs | 140 +++++++++++++++++++++++++ src/row/arch/wasm_simd128.rs | 112 +++++++++++++++++--- src/row/arch/wasm_simd128/tests.rs | 149 ++++++++++++++++++++++++++ src/row/arch/x86_avx2.rs | 122 ++++++++++++++++++--- src/row/arch/x86_avx2/tests.rs | 152 +++++++++++++++++++++++++++ src/row/arch/x86_avx512.rs | 140 +++++++++++++++++++++---- src/row/arch/x86_avx512/tests.rs | 163 +++++++++++++++++++++++++++++ src/row/arch/x86_sse41.rs | 117 +++++++++++++++++---- src/row/arch/x86_sse41/tests.rs | 159 ++++++++++++++++++++++++++++ src/row/mod.rs | 79 +++++++++++--- src/sinker/mixed/yuva_4_4_4.rs | 5 - 12 files changed, 1350 insertions(+), 98 deletions(-) diff --git a/src/row/arch/neon.rs b/src/row/arch/neon.rs index 3bb4011..f62c91c 100644 --- a/src/row/arch/neon.rs +++ b/src/row/arch/neon.rs @@ -983,7 +983,8 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_row< /// NEON YUV 4:4:4 planar high-bit-depth → **native-depth u16** RGB. /// Const-generic over `BITS ∈ {9, 10, 12, 14}`. /// -/// Thin wrapper over [`yuv_444p_n_to_rgb_or_rgba_u16_row`] with `ALPHA = false`. +/// Thin wrapper over [`yuv_444p_n_to_rgb_or_rgba_u16_row`] with +/// `ALPHA = false, ALPHA_SRC = false`. /// /// # Safety /// @@ -1001,7 +1002,9 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_u16_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - yuv_444p_n_to_rgb_or_rgba_u16_row::(y, u, v, rgb_out, width, matrix, full_range); + yuv_444p_n_to_rgb_or_rgba_u16_row::( + y, u, v, rgb_out, width, matrix, full_range, None, + ); } } @@ -1009,7 +1012,8 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_u16_row( /// output. Alpha samples are `(1 << BITS) - 1` (opaque maximum at the /// input bit depth) — matches `scalar::yuv_444p_n_to_rgba_u16_row`. /// -/// Thin wrapper over [`yuv_444p_n_to_rgb_or_rgba_u16_row`] with `ALPHA = true`. +/// Thin wrapper over [`yuv_444p_n_to_rgb_or_rgba_u16_row`] with +/// `ALPHA = true, ALPHA_SRC = false`. /// /// # Safety /// @@ -1028,24 +1032,78 @@ pub(crate) unsafe fn yuv_444p_n_to_rgba_u16_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - yuv_444p_n_to_rgb_or_rgba_u16_row::(y, u, v, rgba_out, width, matrix, full_range); + yuv_444p_n_to_rgb_or_rgba_u16_row::( + y, u, v, rgba_out, width, matrix, full_range, None, + ); } } -/// Shared NEON high-bit YUV 4:4:4 → native-depth `u16` kernel. -/// `ALPHA = false` writes RGB triples via `vst3q_u16`; `ALPHA = true` -/// writes RGBA quads via `vst4q_u16` with constant alpha -/// `(1 << BITS) - 1`. +/// NEON YUVA 4:4:4 planar high-bit-depth → **native-depth `u16`** +/// packed RGBA with the per-pixel alpha element **sourced from +/// `a_src`** (already at the source's native bit depth — no depth +/// conversion) instead of being the opaque maximum `(1 << BITS) - 1`. +/// Same numerical contract as [`yuv_444p_n_to_rgba_u16_row`] for R/G/B. +/// +/// Thin wrapper over [`yuv_444p_n_to_rgb_or_rgba_u16_row`] with +/// `ALPHA = true, ALPHA_SRC = true`. +/// +/// # Safety +/// +/// Same as [`yuv_444p_n_to_rgba_u16_row`] plus `a_src.len() >= width`. 
+#[inline] +#[target_feature(enable = "neon")] +#[allow(clippy::too_many_arguments)] +pub(crate) unsafe fn yuv_444p_n_to_rgba_u16_with_alpha_src_row( + y: &[u16], + u: &[u16], + v: &[u16], + a_src: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + yuv_444p_n_to_rgb_or_rgba_u16_row::( + y, + u, + v, + rgba_out, + width, + matrix, + full_range, + Some(a_src), + ); + } +} + +/// Shared NEON high-bit YUV 4:4:4 → native-depth `u16` kernel for +/// [`yuv_444p_n_to_rgb_u16_row`] (`ALPHA = false, ALPHA_SRC = false`, +/// `vst3q_u16`), [`yuv_444p_n_to_rgba_u16_row`] (`ALPHA = true, +/// ALPHA_SRC = false`, `vst4q_u16` with constant alpha +/// `(1 << BITS) - 1`) and [`yuv_444p_n_to_rgba_u16_with_alpha_src_row`] +/// (`ALPHA = true, ALPHA_SRC = true`, `vst4q_u16` with the alpha lane +/// loaded from `a_src` and masked to native bit depth — no shift since +/// both the source alpha and the u16 output element are at the same +/// native bit depth). /// /// # Safety /// /// 1. **NEON must be available.** /// 2. `y.len() >= width`, `u.len() >= width`, `v.len() >= width`, /// `out.len() >= width * if ALPHA { 4 } else { 3 }`. -/// 3. `BITS` ∈ `{9, 10, 12, 14}`. +/// 3. When `ALPHA_SRC = true`: `a_src` must be `Some(_)` and +/// `a_src.unwrap().len() >= width`. +/// 4. `BITS` ∈ `{9, 10, 12, 14}`. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_u16_row( +#[allow(clippy::too_many_arguments)] +pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_u16_row< + const BITS: u32, + const ALPHA: bool, + const ALPHA_SRC: bool, +>( y: &[u16], u: &[u16], v: &[u16], @@ -1053,16 +1111,23 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_u16_row, ) { // Compile-time guard — `out_max = ((1 << BITS) - 1) as i16` below // silently wraps to -1 at BITS=16, corrupting the u16 clamp. The // dedicated 16-bit u16-output path is `yuv_444p16_to_rgb_u16_row`. const { assert!(BITS == 9 || BITS == 10 || BITS == 12 || BITS == 14) }; + // Source alpha requires RGBA output — there is no 3 bpp store with + // alpha to put it in. 
+ const { assert!(!ALPHA_SRC || ALPHA) }; let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert!(y.len() >= width); debug_assert!(u.len() >= width); debug_assert!(v.len() >= width); debug_assert!(out.len() >= width * bpp); + if ALPHA_SRC { + debug_assert!(a_src.as_ref().is_some_and(|s| s.len() >= width)); + } let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); @@ -1140,8 +1205,21 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_u16_row( + tail_y, tail_u, tail_v, tail_a, tail_out, tail_w, matrix, full_range, + ); + } else if ALPHA { scalar::yuv_444p_n_to_rgba_u16_row::( tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, ); diff --git a/src/row/arch/neon/tests.rs b/src/row/arch/neon/tests.rs index 2355c27..9ba7b46 100644 --- a/src/row/arch/neon/tests.rs +++ b/src/row/arch/neon/tests.rs @@ -2864,3 +2864,143 @@ fn neon_p416_rgba_u16_matches_scalar_all_matrices() { check_p_n_444_16_u16_neon_rgba_equivalence(w, ColorMatrix::Bt709, false); } } + +// ---- YUVA 4:4:4 native-depth `u16` RGBA equivalence (Ship 8b‑1c) ---- +// +// Mirrors the u8 RGBA alpha-source tests above for the u16 output +// path: per-pixel alpha element is loaded from the source plane, +// AND-masked with `bits_mask::<10>()`, and stored at native depth (no +// `>> (BITS - 8)` since both source alpha and output element are at +// the same bit depth). Pseudo-random alpha flushes lane-order +// corruption that a solid-alpha buffer would mask. + +fn check_yuv444p_n_u16_neon_rgba_with_alpha_src_equivalence( + width: usize, + matrix: ColorMatrix, + full_range: bool, + alpha_seed: usize, +) { + let y = planar_n_plane::(width, 37); + let u = planar_n_plane::(width, 53); + let v = planar_n_plane::(width, 71); + let a_src = planar_n_plane::(width, alpha_seed); + let mut rgba_scalar = std::vec![0u16; width * 4]; + let mut rgba_neon = std::vec![0u16; width * 4]; + scalar::yuv_444p_n_to_rgba_u16_with_alpha_src_row::( + &y, + &u, + &v, + &a_src, + &mut rgba_scalar, + width, + matrix, + full_range, + ); + unsafe { + yuv_444p_n_to_rgba_u16_with_alpha_src_row::( + &y, + &u, + &v, + &a_src, + &mut rgba_neon, + width, + matrix, + full_range, + ); + } + assert_eq!( + rgba_scalar, rgba_neon, + "NEON Yuva444p<{BITS}> → RGBA u16 diverges (width={width}, matrix={matrix:?}, full_range={full_range}, alpha_seed={alpha_seed})" + ); +} + +#[test] +#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] +fn neon_yuva444p10_rgba_u16_matches_scalar_all_matrices_16() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_yuv444p_n_u16_neon_rgba_with_alpha_src_equivalence::<10>(16, m, full, 89); + } + } +} + +#[test] +#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] +fn neon_yuva444p10_rgba_u16_matches_scalar_widths() { + // Natural width + tail widths forcing scalar-tail dispatch. + for w in [16usize, 17, 31, 47, 63, 1920, 1922] { + check_yuv444p_n_u16_neon_rgba_with_alpha_src_equivalence::<10>(w, ColorMatrix::Bt709, true, 89); + } +} + +#[test] +#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] +fn neon_yuva444p10_rgba_u16_matches_scalar_random_alpha() { + // Different alpha seeds — ensures the alpha lane order through + // `vst4q_u16` is not confused with R/G/B. 
+ for seed in [13usize, 41, 89, 127, 211] { + check_yuv444p_n_u16_neon_rgba_with_alpha_src_equivalence::<10>( + 16, + ColorMatrix::Bt601, + false, + seed, + ); + check_yuv444p_n_u16_neon_rgba_with_alpha_src_equivalence::<10>( + 31, + ColorMatrix::Bt2020Ncl, + true, + seed, + ); + } +} + +#[test] +#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] +fn neon_yuva444p_n_rgba_u16_matches_scalar_all_bits() { + // BITS = 9, 12, 14 (BITS = 10 covered above). Confirms the + // AND-mask `mask_v` resolves correctly across the supported bit + // depths (no shift count to vary in the u16 path). + for full in [true, false] { + check_yuv444p_n_u16_neon_rgba_with_alpha_src_equivalence::<9>(16, ColorMatrix::Bt601, full, 53); + check_yuv444p_n_u16_neon_rgba_with_alpha_src_equivalence::<12>( + 16, + ColorMatrix::Bt709, + full, + 53, + ); + check_yuv444p_n_u16_neon_rgba_with_alpha_src_equivalence::<14>( + 16, + ColorMatrix::Bt2020Ncl, + full, + 53, + ); + } +} + +#[test] +#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] +fn neon_yuva444p_n_rgba_u16_matches_scalar_all_bits_widths() { + // BITS = 9, 12, 14 across tail widths. + for w in [17usize, 47, 1922] { + check_yuv444p_n_u16_neon_rgba_with_alpha_src_equivalence::<9>( + w, + ColorMatrix::Smpte240m, + false, + 89, + ); + check_yuv444p_n_u16_neon_rgba_with_alpha_src_equivalence::<12>(w, ColorMatrix::Fcc, true, 89); + check_yuv444p_n_u16_neon_rgba_with_alpha_src_equivalence::<14>( + w, + ColorMatrix::YCgCo, + false, + 89, + ); + } +} diff --git a/src/row/arch/wasm_simd128.rs b/src/row/arch/wasm_simd128.rs index 34dd85f..6f55315 100644 --- a/src/row/arch/wasm_simd128.rs +++ b/src/row/arch/wasm_simd128.rs @@ -912,7 +912,8 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_row< /// WASM simd128 YUV 4:4:4 planar 9/10/12/14-bit → **native-depth u16** RGB. /// Const-generic over `BITS ∈ {9, 10, 12, 14}`. 16 pixels per iter. /// -/// Thin wrapper over [`yuv_444p_n_to_rgb_or_rgba_u16_row`] with `ALPHA = false`. +/// Thin wrapper over [`yuv_444p_n_to_rgb_or_rgba_u16_row`] with +/// `ALPHA = false, ALPHA_SRC = false`. /// /// # Safety /// @@ -930,7 +931,9 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_u16_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - yuv_444p_n_to_rgb_or_rgba_u16_row::(y, u, v, rgb_out, width, matrix, full_range); + yuv_444p_n_to_rgb_or_rgba_u16_row::( + y, u, v, rgb_out, width, matrix, full_range, None, + ); } } @@ -938,6 +941,9 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_u16_row( /// `u16` output. Alpha samples are `(1 << BITS) - 1` (opaque maximum /// at the input bit depth). /// +/// Thin wrapper over [`yuv_444p_n_to_rgb_or_rgba_u16_row`] with +/// `ALPHA = true, ALPHA_SRC = false`. +/// /// # Safety /// /// Same as [`yuv_444p_n_to_rgb_u16_row`] but `rgba_out.len() >= 4 * width`. @@ -954,24 +960,80 @@ pub(crate) unsafe fn yuv_444p_n_to_rgba_u16_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - yuv_444p_n_to_rgb_or_rgba_u16_row::(y, u, v, rgba_out, width, matrix, full_range); + yuv_444p_n_to_rgb_or_rgba_u16_row::( + y, u, v, rgba_out, width, matrix, full_range, None, + ); } } -/// Shared WASM simd128 high-bit YUV 4:4:4 → native-depth `u16` kernel. -/// `ALPHA = false` writes RGB triples via `write_rgb_u16_8`; -/// `ALPHA = true` writes RGBA quads via `write_rgba_u16_8` with -/// constant alpha `(1 << BITS) - 1`. 
+/// WASM simd128 YUVA 4:4:4 planar 9/10/12/14-bit → **native-depth +/// `u16`** packed RGBA with the per-pixel alpha element **sourced +/// from `a_src`** (already at the source's native bit depth — no +/// depth conversion) instead of being the opaque maximum +/// `(1 << BITS) - 1`. Same numerical contract as +/// [`yuv_444p_n_to_rgba_u16_row`] for R/G/B. +/// +/// Thin wrapper over [`yuv_444p_n_to_rgb_or_rgba_u16_row`] with +/// `ALPHA = true, ALPHA_SRC = true`. +/// +/// # Safety +/// +/// Same as [`yuv_444p_n_to_rgba_u16_row`] plus `a_src.len() >= width`. +#[inline] +#[target_feature(enable = "simd128")] +#[allow(clippy::too_many_arguments)] +pub(crate) unsafe fn yuv_444p_n_to_rgba_u16_with_alpha_src_row( + y: &[u16], + u: &[u16], + v: &[u16], + a_src: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + yuv_444p_n_to_rgb_or_rgba_u16_row::( + y, + u, + v, + rgba_out, + width, + matrix, + full_range, + Some(a_src), + ); + } +} + +/// Shared WASM simd128 high-bit YUV 4:4:4 → native-depth `u16` kernel +/// for [`yuv_444p_n_to_rgb_u16_row`] (`ALPHA = false, ALPHA_SRC = false`, +/// `write_rgb_u16_8`), [`yuv_444p_n_to_rgba_u16_row`] (`ALPHA = true, +/// ALPHA_SRC = false`, `write_rgba_u16_8` with constant alpha +/// `(1 << BITS) - 1`) and +/// [`yuv_444p_n_to_rgba_u16_with_alpha_src_row`] (`ALPHA = true, +/// ALPHA_SRC = true`, `write_rgba_u16_8` with the alpha lane loaded +/// from `a_src` and masked to native bit depth — no shift since both +/// the source alpha and the u16 output element are at the same native +/// bit depth). /// /// # Safety /// /// 1. **simd128 must be enabled at compile time.** /// 2. `y.len() >= width`, `u.len() >= width`, `v.len() >= width`, /// `out.len() >= width * if ALPHA { 4 } else { 3 }`. -/// 3. `BITS` ∈ `{9, 10, 12, 14}`. +/// 3. When `ALPHA_SRC = true`: `a_src` must be `Some(_)` and +/// `a_src.unwrap().len() >= width`. +/// 4. `BITS` ∈ `{9, 10, 12, 14}`. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_u16_row( +#[allow(clippy::too_many_arguments)] +pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_u16_row< + const BITS: u32, + const ALPHA: bool, + const ALPHA_SRC: bool, +>( y: &[u16], u: &[u16], v: &[u16], @@ -979,13 +1041,20 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_u16_row, ) { const { assert!(BITS == 9 || BITS == 10 || BITS == 12 || BITS == 14) }; + // Source alpha requires RGBA output — there is no 3 bpp store with + // alpha to put it in. 
+ const { assert!(!ALPHA_SRC || ALPHA) }; let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert!(y.len() >= width); debug_assert!(u.len() >= width); debug_assert!(v.len() >= width); debug_assert!(out.len() >= width * bpp); + if ALPHA_SRC { + debug_assert!(a_src.as_ref().is_some_and(|s| s.len() >= width)); + } let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); @@ -1060,9 +1129,22 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_u16_row( + tail_y, tail_u, tail_v, tail_a, tail_out, tail_w, matrix, full_range, + ); + } else if ALPHA { scalar::yuv_444p_n_to_rgba_u16_row::( tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, ); diff --git a/src/row/arch/wasm_simd128/tests.rs b/src/row/arch/wasm_simd128/tests.rs index 21e4377..8c53f5b 100644 --- a/src/row/arch/wasm_simd128/tests.rs +++ b/src/row/arch/wasm_simd128/tests.rs @@ -2413,3 +2413,152 @@ fn simd128_p416_rgba_u16_matches_scalar_all_matrices() { check_p_n_444_16_u16_simd128_rgba_equivalence(w, ColorMatrix::Bt709, false); } } + +// ---- YUVA 4:4:4 native-depth `u16` RGBA equivalence (Ship 8b‑1c) ---- +// +// Mirrors the u8 RGBA alpha-source tests above for the u16 output +// path: per-pixel alpha element is loaded from the source plane, +// AND-masked with `bits_mask::<10>()`, and stored at native depth (no +// `>> (BITS - 8)` since both source alpha and output element are at +// the same bit depth). 16 px per iter → two `v128_load`s of 8 alpha +// u16 each, fed straight into `write_rgba_u16_8`. + +fn check_yuv444p_n_u16_simd128_rgba_with_alpha_src_equivalence( + width: usize, + matrix: ColorMatrix, + full_range: bool, + alpha_seed: usize, +) { + let y = planar_n_plane::(width, 37); + let u = planar_n_plane::(width, 53); + let v = planar_n_plane::(width, 71); + let a_src = planar_n_plane::(width, alpha_seed); + let mut rgba_scalar = std::vec![0u16; width * 4]; + let mut rgba_simd = std::vec![0u16; width * 4]; + scalar::yuv_444p_n_to_rgba_u16_with_alpha_src_row::( + &y, + &u, + &v, + &a_src, + &mut rgba_scalar, + width, + matrix, + full_range, + ); + unsafe { + yuv_444p_n_to_rgba_u16_with_alpha_src_row::( + &y, + &u, + &v, + &a_src, + &mut rgba_simd, + width, + matrix, + full_range, + ); + } + assert_eq!( + rgba_scalar, rgba_simd, + "WASM simd128 Yuva444p<{BITS}> → RGBA u16 diverges (width={width}, matrix={matrix:?}, full_range={full_range}, alpha_seed={alpha_seed})" + ); +} + +#[test] +fn simd128_yuva444p10_rgba_u16_matches_scalar_all_matrices_16() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_yuv444p_n_u16_simd128_rgba_with_alpha_src_equivalence::<10>(16, m, full, 89); + } + } +} + +#[test] +fn simd128_yuva444p10_rgba_u16_matches_scalar_widths() { + // Natural width + tail widths forcing scalar-tail dispatch. + for w in [16usize, 17, 31, 47, 63, 1920, 1922] { + check_yuv444p_n_u16_simd128_rgba_with_alpha_src_equivalence::<10>( + w, + ColorMatrix::Bt709, + true, + 89, + ); + } +} + +#[test] +fn simd128_yuva444p10_rgba_u16_matches_scalar_random_alpha() { + // Different alpha seeds — `write_rgba_u16_8` lane order must put + // alpha in the 4th channel, not collide with R/G/B. 
+ for seed in [13usize, 41, 89, 127, 211] { + check_yuv444p_n_u16_simd128_rgba_with_alpha_src_equivalence::<10>( + 16, + ColorMatrix::Bt601, + false, + seed, + ); + check_yuv444p_n_u16_simd128_rgba_with_alpha_src_equivalence::<10>( + 31, + ColorMatrix::Bt2020Ncl, + true, + seed, + ); + } +} + +#[test] +fn simd128_yuva444p_n_rgba_u16_matches_scalar_all_bits() { + // BITS = 9, 12, 14 (BITS = 10 covered above). Confirms the + // AND-mask `mask_v` resolves correctly across the supported bit + // depths. + for full in [true, false] { + check_yuv444p_n_u16_simd128_rgba_with_alpha_src_equivalence::<9>( + 16, + ColorMatrix::Bt601, + full, + 53, + ); + check_yuv444p_n_u16_simd128_rgba_with_alpha_src_equivalence::<12>( + 16, + ColorMatrix::Bt709, + full, + 53, + ); + check_yuv444p_n_u16_simd128_rgba_with_alpha_src_equivalence::<14>( + 16, + ColorMatrix::Bt2020Ncl, + full, + 53, + ); + } +} + +#[test] +fn simd128_yuva444p_n_rgba_u16_matches_scalar_all_bits_widths() { + for w in [17usize, 47, 1922] { + check_yuv444p_n_u16_simd128_rgba_with_alpha_src_equivalence::<9>( + w, + ColorMatrix::Smpte240m, + false, + 89, + ); + check_yuv444p_n_u16_simd128_rgba_with_alpha_src_equivalence::<12>( + w, + ColorMatrix::Fcc, + true, + 89, + ); + check_yuv444p_n_u16_simd128_rgba_with_alpha_src_equivalence::<14>( + w, + ColorMatrix::YCgCo, + false, + 89, + ); + } +} diff --git a/src/row/arch/x86_avx2.rs b/src/row/arch/x86_avx2.rs index 8ba3785..61de0ef 100644 --- a/src/row/arch/x86_avx2.rs +++ b/src/row/arch/x86_avx2.rs @@ -1049,7 +1049,8 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_row< /// AVX2 YUV 4:4:4 planar 9/10/12/14-bit → **native-depth u16** RGB. /// Const-generic over `BITS ∈ {9, 10, 12, 14}`. 32 pixels per iter. /// -/// Thin wrapper over [`yuv_444p_n_to_rgb_or_rgba_u16_row`] with `ALPHA = false`. +/// Thin wrapper over [`yuv_444p_n_to_rgb_or_rgba_u16_row`] with +/// `ALPHA = false, ALPHA_SRC = false`. /// /// # Safety /// @@ -1067,7 +1068,9 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_u16_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - yuv_444p_n_to_rgb_or_rgba_u16_row::(y, u, v, rgb_out, width, matrix, full_range); + yuv_444p_n_to_rgb_or_rgba_u16_row::( + y, u, v, rgb_out, width, matrix, full_range, None, + ); } } @@ -1075,7 +1078,8 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_u16_row( /// output. Alpha samples are `(1 << BITS) - 1` (opaque maximum at the /// input bit depth). /// -/// Thin wrapper over [`yuv_444p_n_to_rgb_or_rgba_u16_row`] with `ALPHA = true`. +/// Thin wrapper over [`yuv_444p_n_to_rgb_or_rgba_u16_row`] with +/// `ALPHA = true, ALPHA_SRC = false`. /// /// # Safety /// @@ -1093,24 +1097,79 @@ pub(crate) unsafe fn yuv_444p_n_to_rgba_u16_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - yuv_444p_n_to_rgb_or_rgba_u16_row::(y, u, v, rgba_out, width, matrix, full_range); + yuv_444p_n_to_rgb_or_rgba_u16_row::( + y, u, v, rgba_out, width, matrix, full_range, None, + ); } } -/// Shared AVX2 high-bit YUV 4:4:4 → native-depth `u16` kernel. -/// `ALPHA = false` writes RGB triples via 4× `write_rgb_u16_8`; -/// `ALPHA = true` writes RGBA quads via 4× `write_rgba_u16_8` with -/// constant alpha `(1 << BITS) - 1`. +/// AVX2 YUVA 4:4:4 planar 9/10/12/14-bit → **native-depth `u16`** +/// packed RGBA with the per-pixel alpha element **sourced from +/// `a_src`** (already at the source's native bit depth — no depth +/// conversion) instead of being the opaque maximum `(1 << BITS) - 1`. 
+/// Same numerical contract as [`yuv_444p_n_to_rgba_u16_row`] for R/G/B. +/// +/// Thin wrapper over [`yuv_444p_n_to_rgb_or_rgba_u16_row`] with +/// `ALPHA = true, ALPHA_SRC = true`. +/// +/// # Safety +/// +/// Same as [`yuv_444p_n_to_rgba_u16_row`] plus `a_src.len() >= width`. +#[inline] +#[target_feature(enable = "avx2")] +#[allow(clippy::too_many_arguments)] +pub(crate) unsafe fn yuv_444p_n_to_rgba_u16_with_alpha_src_row( + y: &[u16], + u: &[u16], + v: &[u16], + a_src: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + yuv_444p_n_to_rgb_or_rgba_u16_row::( + y, + u, + v, + rgba_out, + width, + matrix, + full_range, + Some(a_src), + ); + } +} + +/// Shared AVX2 high-bit YUV 4:4:4 → native-depth `u16` kernel for +/// [`yuv_444p_n_to_rgb_u16_row`] (`ALPHA = false, ALPHA_SRC = false`, +/// 4× `write_rgb_u16_8`), [`yuv_444p_n_to_rgba_u16_row`] (`ALPHA = true, +/// ALPHA_SRC = false`, 4× `write_rgba_u16_8` with constant alpha +/// `(1 << BITS) - 1`) and +/// [`yuv_444p_n_to_rgba_u16_with_alpha_src_row`] (`ALPHA = true, +/// ALPHA_SRC = true`, 4× `write_rgba_u16_8` with the alpha lane loaded +/// from `a_src` and masked to native bit depth — no shift since both +/// the source alpha and the u16 output element are at the same native +/// bit depth). /// /// # Safety /// /// 1. **AVX2 must be available on the current CPU.** /// 2. `y.len() >= width`, `u.len() >= width`, `v.len() >= width`, /// `out.len() >= width * if ALPHA { 4 } else { 3 }`. -/// 3. `BITS` ∈ `{9, 10, 12, 14}`. +/// 3. When `ALPHA_SRC = true`: `a_src` must be `Some(_)` and +/// `a_src.unwrap().len() >= width`. +/// 4. `BITS` ∈ `{9, 10, 12, 14}`. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_u16_row( +#[allow(clippy::too_many_arguments)] +pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_u16_row< + const BITS: u32, + const ALPHA: bool, + const ALPHA_SRC: bool, +>( y: &[u16], u: &[u16], v: &[u16], @@ -1118,13 +1177,20 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_u16_row, ) { const { assert!(BITS == 9 || BITS == 10 || BITS == 12 || BITS == 14) }; + // Source alpha requires RGBA output — there is no 3 bpp store with + // alpha to put it in. 
+ const { assert!(!ALPHA_SRC || ALPHA) }; let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert!(y.len() >= width); debug_assert!(u.len() >= width); debug_assert!(v.len() >= width); debug_assert!(out.len() >= width * bpp); + if ALPHA_SRC { + debug_assert!(a_src.as_ref().is_some_and(|s| s.len() >= width)); + } let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); @@ -1223,33 +1289,53 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_u16_row(a_lo_v), + _mm256_castsi256_si128(a_hi_v), + _mm256_extracti128_si256::<1>(a_hi_v), + ) + } else { + (alpha_u16, alpha_u16, alpha_u16, alpha_u16) + }; let dst = out.as_mut_ptr().add(x * 4); write_rgba_u16_8( _mm256_castsi256_si128(r_lo), _mm256_castsi256_si128(g_lo), _mm256_castsi256_si128(b_lo), - alpha_u16, + a_lo_q0, dst, ); write_rgba_u16_8( _mm256_extracti128_si256::<1>(r_lo), _mm256_extracti128_si256::<1>(g_lo), _mm256_extracti128_si256::<1>(b_lo), - alpha_u16, + a_lo_q1, dst.add(32), ); write_rgba_u16_8( _mm256_castsi256_si128(r_hi), _mm256_castsi256_si128(g_hi), _mm256_castsi256_si128(b_hi), - alpha_u16, + a_hi_q0, dst.add(64), ); write_rgba_u16_8( _mm256_extracti128_si256::<1>(r_hi), _mm256_extracti128_si256::<1>(g_hi), _mm256_extracti128_si256::<1>(b_hi), - alpha_u16, + a_hi_q1, dst.add(96), ); } else { @@ -1289,7 +1375,13 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_u16_row( + tail_y, tail_u, tail_v, tail_a, tail_out, tail_w, matrix, full_range, + ); + } else if ALPHA { scalar::yuv_444p_n_to_rgba_u16_row::( tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, ); diff --git a/src/row/arch/x86_avx2/tests.rs b/src/row/arch/x86_avx2/tests.rs index d464ee4..6a029c5 100644 --- a/src/row/arch/x86_avx2/tests.rs +++ b/src/row/arch/x86_avx2/tests.rs @@ -2668,3 +2668,155 @@ fn avx2_p416_rgba_u16_matches_scalar_all_matrices() { check_p_n_444_16_u16_avx2_rgba_equivalence(w, ColorMatrix::Bt709, false); } } + +// ---- YUVA 4:4:4 native-depth `u16` RGBA equivalence (Ship 8b‑1c) ---- +// +// Mirrors the u8 RGBA alpha-source tests above for the u16 output +// path: per-pixel alpha element is loaded from the source plane, +// AND-masked with `bits_mask::<10>()`, and stored at native depth (no +// `>> (BITS - 8)` since both source alpha and output element are at +// the same bit depth). 32 px per iter → 16 alpha u16 per `__m256i` +// load × 2 halves; per-half splits into two `__m128i` quarters fed to +// the four `write_rgba_u16_8` calls per iter. 
+ +fn check_yuv444p_n_u16_avx2_rgba_with_alpha_src_equivalence( + width: usize, + matrix: ColorMatrix, + full_range: bool, + alpha_seed: usize, +) { + let y = planar_n_plane::(width, 37); + let u = planar_n_plane::(width, 53); + let v = planar_n_plane::(width, 71); + let a_src = planar_n_plane::(width, alpha_seed); + let mut rgba_scalar = std::vec![0u16; width * 4]; + let mut rgba_simd = std::vec![0u16; width * 4]; + scalar::yuv_444p_n_to_rgba_u16_with_alpha_src_row::( + &y, + &u, + &v, + &a_src, + &mut rgba_scalar, + width, + matrix, + full_range, + ); + unsafe { + yuv_444p_n_to_rgba_u16_with_alpha_src_row::( + &y, + &u, + &v, + &a_src, + &mut rgba_simd, + width, + matrix, + full_range, + ); + } + assert_eq!( + rgba_scalar, rgba_simd, + "AVX2 Yuva444p<{BITS}> → RGBA u16 diverges (width={width}, matrix={matrix:?}, full_range={full_range}, alpha_seed={alpha_seed})" + ); +} + +#[test] +fn avx2_yuva444p10_rgba_u16_matches_scalar_all_matrices_32() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_yuv444p_n_u16_avx2_rgba_with_alpha_src_equivalence::<10>(32, m, full, 89); + } + } +} + +#[test] +fn avx2_yuva444p10_rgba_u16_matches_scalar_widths() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + // Natural width + tail widths forcing scalar-tail dispatch. + for w in [32usize, 17, 31, 47, 63, 1920, 1922] { + check_yuv444p_n_u16_avx2_rgba_with_alpha_src_equivalence::<10>(w, ColorMatrix::Bt709, true, 89); + } +} + +#[test] +fn avx2_yuva444p10_rgba_u16_matches_scalar_random_alpha() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + // Different alpha seeds — the 256-bit alpha load splits into two + // 128-bit quarters via `_mm256_castsi256_si128` / + // `_mm256_extracti128_si256::<1>`; the lane order through + // `write_rgba_u16_8` must put alpha in the 4th channel. + for seed in [13usize, 41, 89, 127, 211] { + check_yuv444p_n_u16_avx2_rgba_with_alpha_src_equivalence::<10>( + 32, + ColorMatrix::Bt601, + false, + seed, + ); + check_yuv444p_n_u16_avx2_rgba_with_alpha_src_equivalence::<10>( + 63, + ColorMatrix::Bt2020Ncl, + true, + seed, + ); + } +} + +#[test] +fn avx2_yuva444p_n_rgba_u16_matches_scalar_all_bits() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + // BITS = 9, 12, 14 (BITS = 10 covered above). Confirms the + // AND-mask `mask_v` resolves correctly across the supported bit + // depths. 
+ for full in [true, false] { + check_yuv444p_n_u16_avx2_rgba_with_alpha_src_equivalence::<9>(32, ColorMatrix::Bt601, full, 53); + check_yuv444p_n_u16_avx2_rgba_with_alpha_src_equivalence::<12>( + 32, + ColorMatrix::Bt709, + full, + 53, + ); + check_yuv444p_n_u16_avx2_rgba_with_alpha_src_equivalence::<14>( + 32, + ColorMatrix::Bt2020Ncl, + full, + 53, + ); + } +} + +#[test] +fn avx2_yuva444p_n_rgba_u16_matches_scalar_all_bits_widths() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + for w in [33usize, 47, 1922] { + check_yuv444p_n_u16_avx2_rgba_with_alpha_src_equivalence::<9>( + w, + ColorMatrix::Smpte240m, + false, + 89, + ); + check_yuv444p_n_u16_avx2_rgba_with_alpha_src_equivalence::<12>(w, ColorMatrix::Fcc, true, 89); + check_yuv444p_n_u16_avx2_rgba_with_alpha_src_equivalence::<14>( + w, + ColorMatrix::YCgCo, + false, + 89, + ); + } +} diff --git a/src/row/arch/x86_avx512.rs b/src/row/arch/x86_avx512.rs index 147bc7a..effb52d 100644 --- a/src/row/arch/x86_avx512.rs +++ b/src/row/arch/x86_avx512.rs @@ -1134,7 +1134,8 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_row< /// AVX-512 YUV 4:4:4 planar 9/10/12/14-bit → **native-depth u16** RGB. /// Const-generic over `BITS ∈ {9, 10, 12, 14}`. 64 pixels per iter. /// -/// Thin wrapper over [`yuv_444p_n_to_rgb_or_rgba_u16_row`] with `ALPHA = false`. +/// Thin wrapper over [`yuv_444p_n_to_rgb_or_rgba_u16_row`] with +/// `ALPHA = false, ALPHA_SRC = false`. /// /// # Safety /// @@ -1152,14 +1153,17 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_u16_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - yuv_444p_n_to_rgb_or_rgba_u16_row::(y, u, v, rgb_out, width, matrix, full_range); + yuv_444p_n_to_rgb_or_rgba_u16_row::( + y, u, v, rgb_out, width, matrix, full_range, None, + ); } } /// AVX-512 sibling of [`yuv_444p_n_to_rgba_row`] for native-depth `u16` /// output. Alpha samples are `(1 << BITS) - 1`. /// -/// Thin wrapper over [`yuv_444p_n_to_rgb_or_rgba_u16_row`] with `ALPHA = true`. +/// Thin wrapper over [`yuv_444p_n_to_rgb_or_rgba_u16_row`] with +/// `ALPHA = true, ALPHA_SRC = false`. /// /// # Safety /// @@ -1177,24 +1181,79 @@ pub(crate) unsafe fn yuv_444p_n_to_rgba_u16_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - yuv_444p_n_to_rgb_or_rgba_u16_row::(y, u, v, rgba_out, width, matrix, full_range); + yuv_444p_n_to_rgb_or_rgba_u16_row::( + y, u, v, rgba_out, width, matrix, full_range, None, + ); } } -/// Shared AVX-512 high-bit YUV 4:4:4 → native-depth `u16` kernel. -/// `ALPHA = false` writes RGB triples via 8× `write_quarter`; -/// `ALPHA = true` writes RGBA quads via 8× `write_quarter_rgba` with -/// constant alpha `(1 << BITS) - 1`. +/// AVX-512 YUVA 4:4:4 planar 9/10/12/14-bit → **native-depth `u16`** +/// packed RGBA with the per-pixel alpha element **sourced from +/// `a_src`** (already at the source's native bit depth — no depth +/// conversion) instead of being the opaque maximum `(1 << BITS) - 1`. +/// Same numerical contract as [`yuv_444p_n_to_rgba_u16_row`] for R/G/B. +/// +/// Thin wrapper over [`yuv_444p_n_to_rgb_or_rgba_u16_row`] with +/// `ALPHA = true, ALPHA_SRC = true`. +/// +/// # Safety +/// +/// Same as [`yuv_444p_n_to_rgba_u16_row`] plus `a_src.len() >= width`. 
+#[inline] +#[target_feature(enable = "avx512f,avx512bw")] +#[allow(clippy::too_many_arguments)] +pub(crate) unsafe fn yuv_444p_n_to_rgba_u16_with_alpha_src_row( + y: &[u16], + u: &[u16], + v: &[u16], + a_src: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + yuv_444p_n_to_rgb_or_rgba_u16_row::( + y, + u, + v, + rgba_out, + width, + matrix, + full_range, + Some(a_src), + ); + } +} + +/// Shared AVX-512 high-bit YUV 4:4:4 → native-depth `u16` kernel for +/// [`yuv_444p_n_to_rgb_u16_row`] (`ALPHA = false, ALPHA_SRC = false`, +/// 8× `write_quarter`), [`yuv_444p_n_to_rgba_u16_row`] (`ALPHA = true, +/// ALPHA_SRC = false`, 8× `write_quarter_rgba` with constant alpha +/// `(1 << BITS) - 1`) and +/// [`yuv_444p_n_to_rgba_u16_with_alpha_src_row`] (`ALPHA = true, +/// ALPHA_SRC = true`, 8× `write_quarter_rgba` with the alpha quarters +/// loaded from `a_src` and masked to native bit depth — no shift since +/// both the source alpha and the u16 output element are at the same +/// native bit depth). /// /// # Safety /// /// 1. **AVX-512F + AVX-512BW must be available.** /// 2. `y.len() >= width`, `u.len() >= width`, `v.len() >= width`, /// `out.len() >= width * if ALPHA { 4 } else { 3 }`. -/// 3. `BITS` ∈ `{9, 10, 12, 14}`. +/// 3. When `ALPHA_SRC = true`: `a_src` must be `Some(_)` and +/// `a_src.unwrap().len() >= width`. +/// 4. `BITS` ∈ `{9, 10, 12, 14}`. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_u16_row( +#[allow(clippy::too_many_arguments)] +pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_u16_row< + const BITS: u32, + const ALPHA: bool, + const ALPHA_SRC: bool, +>( y: &[u16], u: &[u16], v: &[u16], @@ -1202,13 +1261,20 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_u16_row, ) { const { assert!(BITS == 9 || BITS == 10 || BITS == 12 || BITS == 14) }; + // Source alpha requires RGBA output — there is no 3 bpp store with + // alpha to put it in. 
+ const { assert!(!ALPHA_SRC || ALPHA) }; let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert!(y.len() >= width); debug_assert!(u.len() >= width); debug_assert!(v.len() >= width); debug_assert!(out.len() >= width * bpp); + if ALPHA_SRC { + debug_assert!(a_src.as_ref().is_some_and(|s| s.len() >= width)); + } let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); @@ -1321,15 +1387,45 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_u16_row(a_lo_v), + _mm512_extracti32x4_epi32::<1>(a_lo_v), + _mm512_extracti32x4_epi32::<2>(a_lo_v), + _mm512_extracti32x4_epi32::<3>(a_lo_v), + _mm512_extracti32x4_epi32::<0>(a_hi_v), + _mm512_extracti32x4_epi32::<1>(a_hi_v), + _mm512_extracti32x4_epi32::<2>(a_hi_v), + _mm512_extracti32x4_epi32::<3>(a_hi_v), + ) + } else { + ( + alpha_u16, alpha_u16, alpha_u16, alpha_u16, alpha_u16, alpha_u16, alpha_u16, alpha_u16, + ) + }; let dst = out.as_mut_ptr().add(x * 4); - write_quarter_rgba(r_lo, g_lo, b_lo, alpha_u16, 0, dst); - write_quarter_rgba(r_lo, g_lo, b_lo, alpha_u16, 1, dst.add(32)); - write_quarter_rgba(r_lo, g_lo, b_lo, alpha_u16, 2, dst.add(64)); - write_quarter_rgba(r_lo, g_lo, b_lo, alpha_u16, 3, dst.add(96)); - write_quarter_rgba(r_hi, g_hi, b_hi, alpha_u16, 0, dst.add(128)); - write_quarter_rgba(r_hi, g_hi, b_hi, alpha_u16, 1, dst.add(160)); - write_quarter_rgba(r_hi, g_hi, b_hi, alpha_u16, 2, dst.add(192)); - write_quarter_rgba(r_hi, g_hi, b_hi, alpha_u16, 3, dst.add(224)); + write_quarter_rgba(r_lo, g_lo, b_lo, a_lo_q0, 0, dst); + write_quarter_rgba(r_lo, g_lo, b_lo, a_lo_q1, 1, dst.add(32)); + write_quarter_rgba(r_lo, g_lo, b_lo, a_lo_q2, 2, dst.add(64)); + write_quarter_rgba(r_lo, g_lo, b_lo, a_lo_q3, 3, dst.add(96)); + write_quarter_rgba(r_hi, g_hi, b_hi, a_hi_q0, 0, dst.add(128)); + write_quarter_rgba(r_hi, g_hi, b_hi, a_hi_q1, 1, dst.add(160)); + write_quarter_rgba(r_hi, g_hi, b_hi, a_hi_q2, 2, dst.add(192)); + write_quarter_rgba(r_hi, g_hi, b_hi, a_hi_q3, 3, dst.add(224)); } else { let dst = out.as_mut_ptr().add(x * 3); write_quarter(r_lo, g_lo, b_lo, 0, dst); @@ -1351,7 +1447,13 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_u16_row( + tail_y, tail_u, tail_v, tail_a, tail_out, tail_w, matrix, full_range, + ); + } else if ALPHA { scalar::yuv_444p_n_to_rgba_u16_row::( tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, ); diff --git a/src/row/arch/x86_avx512/tests.rs b/src/row/arch/x86_avx512/tests.rs index bba9c0f..d04388a 100644 --- a/src/row/arch/x86_avx512/tests.rs +++ b/src/row/arch/x86_avx512/tests.rs @@ -2720,3 +2720,166 @@ fn avx512_p416_rgba_u16_matches_scalar_all_matrices() { check_p_n_444_16_u16_avx512_rgba_equivalence(w, ColorMatrix::Bt709, false); } } + +// ---- YUVA 4:4:4 native-depth `u16` RGBA equivalence (Ship 8b‑1c) ---- +// +// Mirrors the u8 RGBA alpha-source tests above for the u16 output +// path: per-pixel alpha element is loaded from the source plane, +// AND-masked with `bits_mask::<10>()`, and stored at native depth (no +// `>> (BITS - 8)` since both source alpha and output element are at +// the same bit depth). 64 px per iter → 32 alpha u16 per `__m512i` +// load × 2 halves; per-half splits into four `__m128i` quarters via +// `_mm512_extracti32x4_epi32::<0..3>` fed to the eight +// `write_quarter_rgba` calls per iter. 
+ +fn check_yuv444p_n_u16_avx512_rgba_with_alpha_src_equivalence( + width: usize, + matrix: ColorMatrix, + full_range: bool, + alpha_seed: usize, +) { + let y = planar_n_plane::(width, 37); + let u = planar_n_plane::(width, 53); + let v = planar_n_plane::(width, 71); + let a_src = planar_n_plane::(width, alpha_seed); + let mut rgba_scalar = std::vec![0u16; width * 4]; + let mut rgba_simd = std::vec![0u16; width * 4]; + scalar::yuv_444p_n_to_rgba_u16_with_alpha_src_row::( + &y, + &u, + &v, + &a_src, + &mut rgba_scalar, + width, + matrix, + full_range, + ); + unsafe { + yuv_444p_n_to_rgba_u16_with_alpha_src_row::( + &y, + &u, + &v, + &a_src, + &mut rgba_simd, + width, + matrix, + full_range, + ); + } + assert_eq!( + rgba_scalar, rgba_simd, + "AVX-512 Yuva444p<{BITS}> → RGBA u16 diverges (width={width}, matrix={matrix:?}, full_range={full_range}, alpha_seed={alpha_seed})" + ); +} + +#[test] +fn avx512_yuva444p10_rgba_u16_matches_scalar_all_matrices_64() { + if !std::arch::is_x86_feature_detected!("avx512bw") { + return; + } + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_yuv444p_n_u16_avx512_rgba_with_alpha_src_equivalence::<10>(64, m, full, 89); + } + } +} + +#[test] +fn avx512_yuva444p10_rgba_u16_matches_scalar_widths() { + if !std::arch::is_x86_feature_detected!("avx512bw") { + return; + } + // Natural width + tail widths forcing scalar-tail dispatch. + for w in [64usize, 17, 31, 47, 63, 1920, 1922] { + check_yuv444p_n_u16_avx512_rgba_with_alpha_src_equivalence::<10>( + w, + ColorMatrix::Bt709, + true, + 89, + ); + } +} + +#[test] +fn avx512_yuva444p10_rgba_u16_matches_scalar_random_alpha() { + if !std::arch::is_x86_feature_detected!("avx512bw") { + return; + } + // Different alpha seeds — the 512-bit alpha load splits into four + // 128-bit quarters via `_mm512_extracti32x4_epi32::<0..3>`; each + // quarter feeds `write_quarter_rgba`, which routes the alpha lane + // into the 4th channel of the RGBA output. + for seed in [13usize, 41, 89, 127, 211] { + check_yuv444p_n_u16_avx512_rgba_with_alpha_src_equivalence::<10>( + 64, + ColorMatrix::Bt601, + false, + seed, + ); + check_yuv444p_n_u16_avx512_rgba_with_alpha_src_equivalence::<10>( + 127, + ColorMatrix::Bt2020Ncl, + true, + seed, + ); + } +} + +#[test] +fn avx512_yuva444p_n_rgba_u16_matches_scalar_all_bits() { + if !std::arch::is_x86_feature_detected!("avx512bw") { + return; + } + // BITS = 9, 12, 14 (BITS = 10 covered above). Confirms the + // AND-mask `mask_v` resolves correctly across the supported bit + // depths. 
+ for full in [true, false] { + check_yuv444p_n_u16_avx512_rgba_with_alpha_src_equivalence::<9>( + 64, + ColorMatrix::Bt601, + full, + 53, + ); + check_yuv444p_n_u16_avx512_rgba_with_alpha_src_equivalence::<12>( + 64, + ColorMatrix::Bt709, + full, + 53, + ); + check_yuv444p_n_u16_avx512_rgba_with_alpha_src_equivalence::<14>( + 64, + ColorMatrix::Bt2020Ncl, + full, + 53, + ); + } +} + +#[test] +fn avx512_yuva444p_n_rgba_u16_matches_scalar_all_bits_widths() { + if !std::arch::is_x86_feature_detected!("avx512bw") { + return; + } + for w in [65usize, 95, 1922] { + check_yuv444p_n_u16_avx512_rgba_with_alpha_src_equivalence::<9>( + w, + ColorMatrix::Smpte240m, + false, + 89, + ); + check_yuv444p_n_u16_avx512_rgba_with_alpha_src_equivalence::<12>(w, ColorMatrix::Fcc, true, 89); + check_yuv444p_n_u16_avx512_rgba_with_alpha_src_equivalence::<14>( + w, + ColorMatrix::YCgCo, + false, + 89, + ); + } +} diff --git a/src/row/arch/x86_sse41.rs b/src/row/arch/x86_sse41.rs index fda5824..2397a66 100644 --- a/src/row/arch/x86_sse41.rs +++ b/src/row/arch/x86_sse41.rs @@ -1364,7 +1364,8 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_row< /// SSE4.1 YUV 4:4:4 planar 9/10/12/14-bit → **native-depth u16** RGB. /// Const-generic over `BITS ∈ {9, 10, 12, 14}`. /// -/// Thin wrapper over [`yuv_444p_n_to_rgb_or_rgba_u16_row`] with `ALPHA = false`. +/// Thin wrapper over [`yuv_444p_n_to_rgb_or_rgba_u16_row`] with +/// `ALPHA = false, ALPHA_SRC = false`. /// /// # Safety /// @@ -1382,7 +1383,9 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_u16_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - yuv_444p_n_to_rgb_or_rgba_u16_row::(y, u, v, rgb_out, width, matrix, full_range); + yuv_444p_n_to_rgb_or_rgba_u16_row::( + y, u, v, rgb_out, width, matrix, full_range, None, + ); } } @@ -1390,7 +1393,8 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_u16_row( /// output. Alpha samples are `(1 << BITS) - 1` (opaque maximum at the /// input bit depth). /// -/// Thin wrapper over [`yuv_444p_n_to_rgb_or_rgba_u16_row`] with `ALPHA = true`. +/// Thin wrapper over [`yuv_444p_n_to_rgb_or_rgba_u16_row`] with +/// `ALPHA = true, ALPHA_SRC = false`. /// /// # Safety /// @@ -1408,24 +1412,79 @@ pub(crate) unsafe fn yuv_444p_n_to_rgba_u16_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - yuv_444p_n_to_rgb_or_rgba_u16_row::(y, u, v, rgba_out, width, matrix, full_range); + yuv_444p_n_to_rgb_or_rgba_u16_row::( + y, u, v, rgba_out, width, matrix, full_range, None, + ); } } -/// Shared SSE4.1 high-bit YUV 4:4:4 → native-depth `u16` kernel. -/// `ALPHA = false` writes RGB triples via `write_rgb_u16_8`; -/// `ALPHA = true` writes RGBA quads via `write_rgba_u16_8` with -/// constant alpha `(1 << BITS) - 1`. +/// SSE4.1 YUVA 4:4:4 planar 9/10/12/14-bit → **native-depth `u16`** +/// packed RGBA with the per-pixel alpha element **sourced from +/// `a_src`** (already at the source's native bit depth — no depth +/// conversion) instead of being the opaque maximum `(1 << BITS) - 1`. +/// Same numerical contract as [`yuv_444p_n_to_rgba_u16_row`] for R/G/B. +/// +/// Thin wrapper over [`yuv_444p_n_to_rgb_or_rgba_u16_row`] with +/// `ALPHA = true, ALPHA_SRC = true`. +/// +/// # Safety +/// +/// Same as [`yuv_444p_n_to_rgba_u16_row`] plus `a_src.len() >= width`. 
+#[inline] +#[target_feature(enable = "sse4.1")] +#[allow(clippy::too_many_arguments)] +pub(crate) unsafe fn yuv_444p_n_to_rgba_u16_with_alpha_src_row( + y: &[u16], + u: &[u16], + v: &[u16], + a_src: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + yuv_444p_n_to_rgb_or_rgba_u16_row::( + y, + u, + v, + rgba_out, + width, + matrix, + full_range, + Some(a_src), + ); + } +} + +/// Shared SSE4.1 high-bit YUV 4:4:4 → native-depth `u16` kernel for +/// [`yuv_444p_n_to_rgb_u16_row`] (`ALPHA = false, ALPHA_SRC = false`, +/// `write_rgb_u16_8`), [`yuv_444p_n_to_rgba_u16_row`] (`ALPHA = true, +/// ALPHA_SRC = false`, `write_rgba_u16_8` with constant alpha +/// `(1 << BITS) - 1`) and +/// [`yuv_444p_n_to_rgba_u16_with_alpha_src_row`] (`ALPHA = true, +/// ALPHA_SRC = true`, `write_rgba_u16_8` with the alpha lane loaded +/// from `a_src` and masked to native bit depth — no shift since both +/// the source alpha and the u16 output element are at the same native +/// bit depth). /// /// # Safety /// /// 1. **SSE4.1 must be available on the current CPU.** /// 2. `y.len() >= width`, `u.len() >= width`, `v.len() >= width`, /// `out.len() >= width * if ALPHA { 4 } else { 3 }`. -/// 3. `BITS` ∈ `{9, 10, 12, 14}`. +/// 3. When `ALPHA_SRC = true`: `a_src` must be `Some(_)` and +/// `a_src.unwrap().len() >= width`. +/// 4. `BITS` ∈ `{9, 10, 12, 14}`. #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_u16_row( +#[allow(clippy::too_many_arguments)] +pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_u16_row< + const BITS: u32, + const ALPHA: bool, + const ALPHA_SRC: bool, +>( y: &[u16], u: &[u16], v: &[u16], @@ -1433,16 +1492,23 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_u16_row, ) { // Compile-time guard — `out_max = ((1 << BITS) - 1) as i16` below // silently wraps to -1 at BITS=16, corrupting the u16 clamp. The // dedicated 16-bit u16-output path is `yuv_444p16_to_rgb_u16_row`. const { assert!(BITS == 9 || BITS == 10 || BITS == 12 || BITS == 14) }; + // Source alpha requires RGBA output — there is no 3 bpp store with + // alpha to put it in. 
+ const { assert!(!ALPHA_SRC || ALPHA) }; let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert!(y.len() >= width); debug_assert!(u.len() >= width); debug_assert!(v.len() >= width); debug_assert!(out.len() >= width * bpp); + if ALPHA_SRC { + debug_assert!(a_src.as_ref().is_some_and(|s| s.len() >= width)); + } let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); @@ -1517,14 +1583,21 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_u16_row( + tail_y, tail_u, tail_v, tail_a, tail_out, tail_w, matrix, full_range, + ); + } else if ALPHA { scalar::yuv_444p_n_to_rgba_u16_row::( tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, ); diff --git a/src/row/arch/x86_sse41/tests.rs b/src/row/arch/x86_sse41/tests.rs index d02ecb1..7c47891 100644 --- a/src/row/arch/x86_sse41/tests.rs +++ b/src/row/arch/x86_sse41/tests.rs @@ -2721,3 +2721,162 @@ fn sse41_p416_rgba_u16_matches_scalar_all_matrices() { check_p_n_444_16_u16_sse41_rgba_equivalence(w, ColorMatrix::Bt709, false); } } + +// ---- YUVA 4:4:4 native-depth `u16` RGBA equivalence (Ship 8b‑1c) ---- +// +// Mirrors the u8 RGBA alpha-source tests above for the u16 output +// path: per-pixel alpha element is loaded from the source plane, +// AND-masked with `bits_mask::<10>()`, and stored at native depth (no +// `>> (BITS - 8)` since both source alpha and output element are at +// the same bit depth). Pseudo-random alpha flushes lane-order +// corruption that a solid-alpha buffer would mask. + +fn check_yuv444p_n_u16_sse41_rgba_with_alpha_src_equivalence( + width: usize, + matrix: ColorMatrix, + full_range: bool, + alpha_seed: usize, +) { + let y = planar_n_plane::(width, 37); + let u = planar_n_plane::(width, 53); + let v = planar_n_plane::(width, 71); + let a_src = planar_n_plane::(width, alpha_seed); + let mut rgba_scalar = std::vec![0u16; width * 4]; + let mut rgba_simd = std::vec![0u16; width * 4]; + scalar::yuv_444p_n_to_rgba_u16_with_alpha_src_row::( + &y, + &u, + &v, + &a_src, + &mut rgba_scalar, + width, + matrix, + full_range, + ); + unsafe { + yuv_444p_n_to_rgba_u16_with_alpha_src_row::( + &y, + &u, + &v, + &a_src, + &mut rgba_simd, + width, + matrix, + full_range, + ); + } + assert_eq!( + rgba_scalar, rgba_simd, + "SSE4.1 Yuva444p<{BITS}> → RGBA u16 diverges (width={width}, matrix={matrix:?}, full_range={full_range}, alpha_seed={alpha_seed})" + ); +} + +#[test] +fn sse41_yuva444p10_rgba_u16_matches_scalar_all_matrices_16() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_yuv444p_n_u16_sse41_rgba_with_alpha_src_equivalence::<10>(16, m, full, 89); + } + } +} + +#[test] +fn sse41_yuva444p10_rgba_u16_matches_scalar_widths() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + // Natural width + tail widths forcing scalar-tail dispatch. + for w in [16usize, 17, 31, 47, 63, 1920, 1922] { + check_yuv444p_n_u16_sse41_rgba_with_alpha_src_equivalence::<10>( + w, + ColorMatrix::Bt709, + true, + 89, + ); + } +} + +#[test] +fn sse41_yuva444p10_rgba_u16_matches_scalar_random_alpha() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + // Different alpha seeds — `write_rgba_u16_8` lane order must put + // alpha in the 4th channel, not collide with R/G/B. 
+ for seed in [13usize, 41, 89, 127, 211] { + check_yuv444p_n_u16_sse41_rgba_with_alpha_src_equivalence::<10>( + 16, + ColorMatrix::Bt601, + false, + seed, + ); + check_yuv444p_n_u16_sse41_rgba_with_alpha_src_equivalence::<10>( + 31, + ColorMatrix::Bt2020Ncl, + true, + seed, + ); + } +} + +#[test] +fn sse41_yuva444p_n_rgba_u16_matches_scalar_all_bits() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + // BITS = 9, 12, 14 (BITS = 10 covered above). Confirms the + // AND-mask `mask_v` resolves correctly across the supported bit + // depths. + for full in [true, false] { + check_yuv444p_n_u16_sse41_rgba_with_alpha_src_equivalence::<9>( + 16, + ColorMatrix::Bt601, + full, + 53, + ); + check_yuv444p_n_u16_sse41_rgba_with_alpha_src_equivalence::<12>( + 16, + ColorMatrix::Bt709, + full, + 53, + ); + check_yuv444p_n_u16_sse41_rgba_with_alpha_src_equivalence::<14>( + 16, + ColorMatrix::Bt2020Ncl, + full, + 53, + ); + } +} + +#[test] +fn sse41_yuva444p_n_rgba_u16_matches_scalar_all_bits_widths() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + for w in [17usize, 47, 1922] { + check_yuv444p_n_u16_sse41_rgba_with_alpha_src_equivalence::<9>( + w, + ColorMatrix::Smpte240m, + false, + 89, + ); + check_yuv444p_n_u16_sse41_rgba_with_alpha_src_equivalence::<12>(w, ColorMatrix::Fcc, true, 89); + check_yuv444p_n_u16_sse41_rgba_with_alpha_src_equivalence::<14>( + w, + ColorMatrix::YCgCo, + false, + 89, + ); + } +} diff --git a/src/row/mod.rs b/src/row/mod.rs index bd104f7..50552a0 100644 --- a/src/row/mod.rs +++ b/src/row/mod.rs @@ -5007,11 +5007,11 @@ pub fn yuv444p16_to_rgba_u16_row( // ---- YUVA 4:4:4 RGBA dispatchers -------------------------------------- // // Per-row dispatchers for the YUVA source family (currently Yuva444p10 -// only). The u8 RGBA dispatcher routes through the per-arch -// `yuv_444p_n_to_rgba_with_alpha_src_row` SIMD wrappers, mirroring the -// `yuv444p10_to_rgba_row` dispatcher's pattern. The u16 RGBA -// dispatcher (`yuva444p10_to_rgba_u16_row`) stays scalar until SIMD -// wiring lands in **Ship 8b‑1c**. +// only). Both the u8 RGBA dispatcher (`yuva444p10_to_rgba_row`) and +// the u16 RGBA dispatcher (`yuva444p10_to_rgba_u16_row`) route through +// per-arch `yuv_444p_n_to_rgba*_with_alpha_src_row` SIMD wrappers, +// mirroring the `yuv444p10_to_rgba_row` / `yuv444p10_to_rgba_u16_row` +// dispatchers' patterns. /// Converts one row of **10-bit** YUVA 4:4:4 to packed **8-bit** /// **RGBA**. R / G / B are produced by the same Q15 i32 kernel family @@ -5109,15 +5109,8 @@ pub fn yuva444p10_to_rgba_row( /// source's native bit depth) instead of being the opaque maximum /// `1023`. /// -/// # ⚠ Scalar-only as of Ship 8b‑1a -/// -/// `use_simd` is accepted for forward-compatible API parity with the -/// rest of the dispatcher family **but is ignored in this PR**. Every -/// invocation runs the scalar reference regardless of the flag — SIMD -/// wiring lands in **Ship 8b‑1c**. Throughput on 4:4:4 + alpha is -/// substantially below the 4:4:4-no-alpha SIMD path until then; -/// callers benchmarking the alpha-source path should re-measure once -/// 8b‑1c lands. See the section comment above for staging context. +/// `use_simd = false` forces the scalar reference path; otherwise +/// per-arch dispatch matches [`yuv444p10_to_rgba_u16_row`]'s pattern. 
#[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] pub fn yuva444p10_to_rgba_u16_row( @@ -5138,7 +5131,63 @@ pub fn yuva444p10_to_rgba_u16_row( assert!(a.len() >= width, "a row too short"); assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - let _ = use_simd; // SIMD per-arch routes land in Ship 8b‑1c PR. + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<10>( + y, u, v, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<10>( + y, u, v, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<10>( + y, u, v, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<10>( + y, u, v, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<10>( + y, u, v, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + scalar::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<10>( y, u, v, a, rgba_out, width, matrix, full_range, ); diff --git a/src/sinker/mixed/yuva_4_4_4.rs b/src/sinker/mixed/yuva_4_4_4.rs index 23af88d..4f0e3f9 100644 --- a/src/sinker/mixed/yuva_4_4_4.rs +++ b/src/sinker/mixed/yuva_4_4_4.rs @@ -50,11 +50,6 @@ impl<'a> MixedSinker<'a, Yuva444p10> { /// Attaches a packed **`u16`** RGBA output buffer. 10-bit /// low-packed (`[0, 1023]`); the per-pixel alpha element is /// **sourced from the alpha plane** at native depth. - /// - /// **Performance note (Ship 8b‑1a):** the alpha-source u16 path runs - /// scalar regardless of `with_simd(true)` until SIMD wiring lands in - /// **Ship 8b‑1c**. See [`Self::with_rgba`] for the same warning on - /// the u8 path. #[cfg_attr(not(tarpaulin), inline(always))] pub fn with_rgba_u16(mut self, buf: &'a mut [u16]) -> Result { self.set_rgba_u16(buf)?; From 2e6dc8c13fdca4ffee67280fa2cca8579b61e3ab Mon Sep 17 00:00:00 2001 From: uqio <276879906+uqio@users.noreply.github.com> Date: Mon, 27 Apr 2026 23:30:07 +1200 Subject: [PATCH 2/2] update --- CHANGELOG.md | 126 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 126 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1ae086b..5b708a8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -154,6 +154,132 @@ scheduled as a dedicated follow-up PR (`feat/bayer-simd`). end-to-end "all three channels at MAX_COEFFICIENT, all pixels 255" stays inside the `u32` accumulator and clamps to 255. +## Ship 8b — source-side YUVA (alpha-preserving RGBA output) + +The follow-up to Ship 8: source-side alpha. Where Ship 8 padded the +output alpha lane to `0xFF` / `(1 << BITS) - 1` regardless of source, +Ship 8b adds **YUVA source types** that carry an alpha plane through +to the RGBA output. The first vertical slice ships `Yuva444p10` +(ProRes 4444 + α territory — the highest-value VFX format from the +Format Share table § 2a-1 row 10). 
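+
+As a minimal usage sketch of the new u16 dispatcher (illustrative
+only — the `ColorMatrix` import path is assumed, and the trailing
+parameter order mirrors the internal scalar kernel with `use_simd`
+last, as in the sibling dispatchers):
+
+```rust
+use colconv::row::yuva444p10_to_rgba_u16_row;
+use colconv::ColorMatrix; // assumed re-export path
+
+// One 4-pixel row of 10-bit YUVA 4:4:4, one slice per plane.
+let y = [512u16, 300, 700, 64];
+let u = [512u16; 4];
+let v = [512u16; 4];
+let a = [1023u16, 512, 0, 1023]; // per-pixel source alpha, [0, 1023]
+let mut rgba = [0u16; 4 * 4]; // packed RGBA, 4 elements per pixel
+
+yuva444p10_to_rgba_u16_row(
+    &y, &u, &v, &a,
+    &mut rgba,
+    4,                  // width
+    ColorMatrix::Bt709,
+    false,              // full_range
+    true,               // use_simd: per-arch dispatch when available
+);
+// rgba[3], rgba[7], … now carry the source alpha at native depth.
+```
+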
### Strategy B (forked kernels) over Strategy A (separate splice)
+
+Two implementation strategies were considered:
+
+- **Strategy A** (deferred) — run the existing RGBA kernel (alpha =
+  opaque), then a second-pass helper reads source alpha + overwrites
+  the alpha byte. Memory traffic 6W per pixel; ~50 LOC + 1 helper.
+- **Strategy B** (adopted) — extend each kernel's const-`ALPHA`
+  template with a third `ALPHA_SRC: bool` generic. Source alpha is
+  loaded inside the kernel, masked, and stored straight into the
+  alpha lane in the same pass. Memory traffic 5W per pixel (single
+  pass); ~3,000 LOC across 30+ kernels for a ~10% perf win (at
+  L1-noise level) in the alpha-present case.
+
+Strategy B was picked for the best alpha-present throughput on the
+high-bandwidth 4:4:4 + α format that motivated the work. Existing
+`*_to_rgb_*` and `*_to_rgba_*` public wrappers are backward-compat
+shims passing `ALPHA_SRC = false` and `None` to the templates — zero
+overhead when alpha-source is off; existing call sites compile
+unchanged.
+
+### Vertical slice 1: `Yuva444p10` (3 PRs)
+
+The first format follows the same staging pattern as the Ship 8
+high-bit tranches (5/6/7): scalar prep first (call-site stable), then
+u8 SIMD, then u16 SIMD.
+
+| # | Tranche | Status |
+|---|---|---|
+| 1 | scalar prep + Frame + walker + dispatchers + sinker integration | ✅ shipped (PR #32) — `Yuva444pFrame16`, `Yuva444p10Frame` alias, `yuva444p10_to` walker, `MixedSinker`, scalar tests |
+| 1b | u8 RGBA SIMD across all 5 backends | ✅ shipped (PR #33) |
+| 1c | u16 RGBA SIMD across all 5 backends | ✅ shipped (PR #34) |
+
+### Surface added
+
+- **`Yuva444pFrame16<'a, const BITS: u32>`** — mirrors `Yuv444pFrame16`
+  with an extra `a` slice + `a_stride`. Const-asserted `BITS == 10`
+  in this slice; other bit depths land in subsequent vertical slices.
+  `try_new` validates dimensions + plane lengths; `try_new_checked`
+  additionally validates every active sample range.
+- **`Yuva444p10Frame<'a>`** type alias.
+- **`Yuva444p10`** marker + `Yuva444p10Row<'a>` (carries `a` slice)
+  + `Yuva444p10Sink` trait + `yuva444p10_to` walker.
+- **`MixedSinker`** with `with_rgba` / `set_rgba` (u8)
+  + `with_rgba_u16` / `set_rgba_u16` (u16) per-format builders, plus
+  `with_rgb` / `with_rgb_u16` / `with_luma` / `with_hsv` alpha-drop
+  paths that reuse the `Yuv444p10` row dispatchers verbatim.
+- **Public dispatchers** in `colconv::row`: `yuva444p10_to_rgba_row`
+  and `yuva444p10_to_rgba_u16_row` — same SIMD-via-`use_simd` shape
+  as `yuv444p10_to_rgba_*`.
+
+### Strategy B template extension
+
+The four 4:4:4 const-`ALPHA` templates gained a third `ALPHA_SRC`
+generic in this slice (only the BITS-generic planar variant is in
+scope for this vertical slice; other 4:4:4 variants land later):
+
+- `scalar::yuv_444p_n_to_rgb_or_rgba_row` (u8)
+- `scalar::yuv_444p_n_to_rgb_or_rgba_u16_row` (u16)
+- Same SIMD templates × 5 backends (NEON / SSE4.1 / AVX2 / AVX-512 /
+  wasm simd128) — refactor in PRs #33 (u8) and #34 (u16).
+
+The per-pixel store branches on three combinations, sketched after
+the table:
+
+| `ALPHA` | `ALPHA_SRC` | Per-pixel alpha |
+|---|---|---|
+| false | false | RGB-only (no alpha lane) |
+| true | false | RGBA, alpha = `0xFF` u8 / `(1 << BITS) - 1` u16 (existing path) |
+| true | true | RGBA, alpha = `a_src[x] & bits_mask::<BITS>()` from source plane; depth-converted via `>> (BITS - 8)` for u8 output, native depth for u16 output |
+
+`!ALPHA_SRC || ALPHA` is const-asserted at the top of every template.
+
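+
+A minimal scalar sketch of the three store combinations on the u16
+output path (illustrative — the real kernels are SIMD and write
+through raw pointers; `store_pixel` is a hypothetical simplification
+of the template logic above):
+
+```rust
+fn store_pixel<const BITS: u32, const ALPHA: bool, const ALPHA_SRC: bool>(
+    r: u16,
+    g: u16,
+    b: u16,
+    a_src: Option<&[u16]>,
+    x: usize,
+    out: &mut Vec<u16>,
+) {
+    // Low-BITS mask, e.g. 0x3FF at BITS = 10 — stands in for the
+    // crate's `bits_mask::<BITS>()` helper.
+    let mask = ((1u32 << BITS) - 1) as u16;
+    out.extend_from_slice(&[r, g, b]);
+    if ALPHA {
+        let alpha = if ALPHA_SRC {
+            // Source alpha, masked to native depth; no shift on the
+            // u16 path because input and output share the bit depth.
+            a_src.expect("ALPHA_SRC requires an alpha plane")[x] & mask
+        } else {
+            mask // opaque maximum, (1 << BITS) - 1
+        };
+        out.push(alpha);
+    }
+    // ALPHA = false stores only the RGB triple — no alpha lane,
+    // which is why `!ALPHA_SRC || ALPHA` must hold.
+}
+```
+
+On the u8 output path the masked value is additionally shifted right
+by `BITS - 8` before the cast to `u8`, as the table's last row notes.
+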
### Hardenings (Codex review fixes)
+
+- **Source alpha is masked with `bits_mask::<BITS>()` before depth
+  conversion** — `Yuva444p10Frame::try_new` accepts unchecked u16
+  samples; without masking, an over-range `1024` at BITS=10 would
+  shift to `256` and cast to u8 zero, silently turning over-range
+  alpha into transparent output. Same masking pattern that Y/U/V
+  already use. Pinned by 2 regression tests at the sinker layer.
+- **`MixedSinker` wires alpha-drop paths** for `with_rgb`
+  / `with_rgb_u16` / `with_luma` / `with_hsv` (declared on the
+  generic `MixedSinker` impl) — the initial implementation only wrote
+  RGBA buffers, leaving the others as silent stale-buffer bugs.
+  Pinned by 4 cross-format byte-equivalence tests against
+  `MixedSinker`.
+
+### Tests
+
+- **Per-backend SIMD equivalence tests**: 30 per backend × 5 backends
+  for `Yuva444p10` (5 u8 added in PR #33 + 5 u16 added in PR #34).
+  Solid-alpha + random-alpha + tail-width coverage. All x86 tests
+  carry `is_x86_feature_detected!` early-return guards.
+- **Sinker integration tests**: 17 (PR #32 added 7 covering alpha
+  pass-through / opacity contracts / buffer-too-short error paths;
+  PR #32 review-fix added 7 covering alpha-drop paths + Strategy A
+  combine; PR #32 review-fix added 2 covering over-range-alpha
+  masking).
+- **Test count growth**: 578 → 588 on the aarch64-darwin host (583
+  after PR #33, 588 after PR #34); +5 NEON tests run at each tranche;
+  the +20 x86/wasm tests fire on their respective CI runners.
+
+### Notes
+
+- **Source-side YUVA + Ship 8 sinks are now end-to-end for the
+  format**: with a `Yuva444p10Frame` source and a `MixedSinker` sink,
+  the alpha plane flows through to `with_rgba` / `with_rgba_u16`
+  output. `with_rgb` / `with_rgb_u16` / `with_luma` / `with_hsv`
+  are alpha-drop (reuse `Yuv444p10` row kernels).
+- **Subsequent vertical slices (Ship 8b‑2 onward)** will mass-apply
+  the established Strategy B template to other Yuva format families:
+  `Yuva420p*` (4:2:0 with α — `yuva420p`, `yuva420p9/10/16`),
+  `Yuva422p*` (4:2:2 with α — `yuva422p`, `yuva422p9/10/16`), and
+  the remaining `Yuva444p*` variants (8-bit, 9-bit, 16-bit). The
+  template's third generic + per-backend wrapper pattern is now
+  proven; subsequent slices reuse it mechanically.
+
 ## Ship 8 — alpha + RGBA output (`with_rgba` / `with_rgba_u16`)
 
 Adds packed RGBA output across the YUV format inventory. Every YUV