From be54e4c7e1e39cd22f33611d35538e24ea708192 Mon Sep 17 00:00:00 2001
From: uqio <276879906+uqio@users.noreply.github.com>
Date: Mon, 27 Apr 2026 22:23:33 +1200
Subject: [PATCH] row: wire per-arch SIMD routes for the YUVA 4:4:4
 alpha-source u8 RGBA path (Ship 8b-1b)

---
 src/row/arch/neon.rs               | 109 ++++++++++++++++++--
 src/row/arch/neon/tests.rs         | 130 +++++++++++++++++++++++
 src/row/arch/wasm_simd128.rs       | 110 ++++++++++++++++++--
 src/row/arch/wasm_simd128/tests.rs | 146 ++++++++++++++++++++++++++
 src/row/arch/x86_avx2.rs           | 107 +++++++++++++++++--
 src/row/arch/x86_avx2/tests.rs     | 140 +++++++++++++++++++++++++
 src/row/arch/x86_avx512.rs         | 110 ++++++++++++++++++--
 src/row/arch/x86_avx512/tests.rs   | 160 +++++++++++++++++++++++++++++
 src/row/arch/x86_sse41.rs          | 108 +++++++++++++++++--
 src/row/arch/x86_sse41/tests.rs    | 149 +++++++++++++++++++++++++++
 src/row/mod.rs                     |  88 +++++++++++-----
 src/sinker/mixed/yuva_4_4_4.rs     |   6 --
 12 files changed, 1280 insertions(+), 83 deletions(-)

diff --git a/src/row/arch/neon.rs b/src/row/arch/neon.rs
index f7972c0..3bb4011 100644
--- a/src/row/arch/neon.rs
+++ b/src/row/arch/neon.rs
@@ -703,7 +703,8 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_or_rgba_u16_row(
 ) {
     // SAFETY: caller obligations forwarded to the shared impl.
     unsafe {
-        yuv_444p_n_to_rgb_or_rgba_row::<BITS, false>(y, u, v, rgb_out, width, matrix, full_range);
+        yuv_444p_n_to_rgb_or_rgba_row::<BITS, false, false>(
+            y, u, v, rgb_out, width, matrix, full_range, None,
+        );
     }
 }
 
@@ -730,7 +733,8 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_row<const BITS: u32>(
 /// [`yuv_444p_n_to_rgb_row`]; the only differences are the per-pixel
 /// stride (4 vs 3) and the constant alpha byte (`0xFF`, opaque).
 ///
-/// Thin wrapper over [`yuv_444p_n_to_rgb_or_rgba_row`] with `ALPHA = true`.
+/// Thin wrapper over [`yuv_444p_n_to_rgb_or_rgba_row`] with
+/// `ALPHA = true, ALPHA_SRC = false`.
 ///
 /// # Safety
 ///
@@ -748,24 +752,75 @@ pub(crate) unsafe fn yuv_444p_n_to_rgba_row<const BITS: u32>(
 ) {
     // SAFETY: caller obligations forwarded to the shared impl.
     unsafe {
-        yuv_444p_n_to_rgb_or_rgba_row::<BITS, true>(y, u, v, rgba_out, width, matrix, full_range);
+        yuv_444p_n_to_rgb_or_rgba_row::<BITS, true, false>(
+            y, u, v, rgba_out, width, matrix, full_range, None,
+        );
+    }
+}
+
+/// NEON YUVA 4:4:4 planar high-bit-depth → packed **8-bit RGBA** with
+/// the per-pixel alpha byte **sourced from `a_src`** (depth-converted
+/// via `>> (BITS - 8)` to fit `u8`) instead of being constant `0xFF`.
+/// Same numerical contract as [`yuv_444p_n_to_rgba_row`] for R/G/B.
+///
+/// Thin wrapper over [`yuv_444p_n_to_rgb_or_rgba_row`] with
+/// `ALPHA = true, ALPHA_SRC = true`.
+///
+/// # Safety
+///
+/// Same as [`yuv_444p_n_to_rgba_row`] plus `a_src.len() >= width`.
+#[inline]
+#[target_feature(enable = "neon")]
+#[allow(clippy::too_many_arguments)]
+pub(crate) unsafe fn yuv_444p_n_to_rgba_with_alpha_src_row<const BITS: u32>(
+    y: &[u16],
+    u: &[u16],
+    v: &[u16],
+    a_src: &[u16],
+    rgba_out: &mut [u8],
+    width: usize,
+    matrix: ColorMatrix,
+    full_range: bool,
+) {
+    // SAFETY: caller obligations forwarded to the shared impl.
+    unsafe {
+        yuv_444p_n_to_rgb_or_rgba_row::<BITS, true, true>(
+            y,
+            u,
+            v,
+            rgba_out,
+            width,
+            matrix,
+            full_range,
+            Some(a_src),
+        );
     }
 }
 
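The wrapper above leans on one numeric contract: alpha is masked to `BITS` bits and then shifted down by `BITS - 8`. A minimal scalar sketch of that contract, outside the patch (the helper name here is illustrative, not the crate's internals):

```rust
/// Illustrative only: maps one BITS-wide alpha sample to u8 the way
/// the row kernels describe it (mask to BITS bits, drop the low
/// BITS - 8 bits). For BITS = 10, 1023 >> 2 == 255, so full opacity
/// stays full opacity.
fn alpha_n_to_u8<const BITS: u32>(a: u16) -> u8 {
    let mask: u16 = ((1u32 << BITS) - 1) as u16;
    ((a & mask) >> (BITS - 8)) as u8
}
```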
 /// Shared NEON high-bit-depth YUV 4:4:4 kernel for
-/// [`yuv_444p_n_to_rgb_row`] (`ALPHA = false`, `vst3q_u8`) and
-/// [`yuv_444p_n_to_rgba_row`] (`ALPHA = true`, `vst4q_u8` with
-/// constant `0xFF` alpha vector).
+/// [`yuv_444p_n_to_rgb_row`] (`ALPHA = false, ALPHA_SRC = false`,
+/// `vst3q_u8`), [`yuv_444p_n_to_rgba_row`] (`ALPHA = true,
+/// ALPHA_SRC = false`, `vst4q_u8` with constant `0xFF` alpha vector)
+/// and [`yuv_444p_n_to_rgba_with_alpha_src_row`] (`ALPHA = true,
+/// ALPHA_SRC = true`, `vst4q_u8` with the alpha lane loaded and
+/// depth-converted from `a_src`).
 ///
 /// # Safety
 ///
 /// 1. **NEON must be available.**
 /// 2. `y.len() >= width`, `u.len() >= width`, `v.len() >= width`,
 ///    `out.len() >= width * if ALPHA { 4 } else { 3 }`.
-/// 3. `BITS` must be one of `{9, 10, 12, 14}`.
+/// 3. When `ALPHA_SRC = true`: `a_src` must be `Some(_)` and
+///    `a_src.unwrap().len() >= width`.
+/// 4. `BITS` must be one of `{9, 10, 12, 14}`.
 #[inline]
 #[target_feature(enable = "neon")]
-pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_row<const BITS: u32, const ALPHA: bool>(
+#[allow(clippy::too_many_arguments)]
+pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_row<
+    const BITS: u32,
+    const ALPHA: bool,
+    const ALPHA_SRC: bool,
+>(
     y: &[u16],
     u: &[u16],
     v: &[u16],
@@ -773,13 +828,20 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_row<
     width: usize,
     matrix: ColorMatrix,
     full_range: bool,
+    a_src: Option<&[u16]>,
 ) {
     const { assert!(BITS == 9 || BITS == 10 || BITS == 12 || BITS == 14) };
+    // Source alpha requires RGBA output — there is no 3 bpp store with
+    // alpha to put it in.
+    const { assert!(!ALPHA_SRC || ALPHA) };
     let bpp: usize = if ALPHA { 4 } else { 3 };
     debug_assert!(y.len() >= width);
     debug_assert!(u.len() >= width);
     debug_assert!(v.len() >= width);
     debug_assert!(out.len() >= width * bpp);
+    if ALPHA_SRC {
+        debug_assert!(a_src.as_ref().is_some_and(|s| s.len() >= width));
+    }
 
     let coeffs = scalar::Coefficients::for_matrix(matrix);
     let (y_off, y_scale, c_scale) = scalar::range_params_n::<BITS>(full_range);
@@ -863,9 +925,28 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_row<
         let tail_y = &y[x..];
         let tail_u = &u[x..];
         let tail_v = &v[x..];
         let tail_out = &mut out[x * bpp..];
         let tail_w = width - x;
-        if ALPHA {
+        if ALPHA_SRC {
+            let tail_a = &a_src.unwrap()[x..];
+            scalar::yuv_444p_n_to_rgba_with_alpha_src_row::<BITS>(
+                tail_y, tail_u, tail_v, tail_a, tail_out, tail_w, matrix, full_range,
+            );
+        } else if ALPHA {
             scalar::yuv_444p_n_to_rgba_row::<BITS>(
                 tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range,
             );
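The `const { assert!(!ALPHA_SRC || ALPHA) }` guard above turns an impossible instantiation into a build failure instead of a silently wrong store path. A standalone sketch of the same pattern, not taken from the crate:

```rust
/// Illustrative only: a post-monomorphization guard. ALPHA_SRC
/// implies ALPHA, written as the implication !ALPHA_SRC || ALPHA.
fn store<const ALPHA: bool, const ALPHA_SRC: bool>() {
    // Evaluated per instantiation; invalid combinations fail to compile.
    const { assert!(!ALPHA_SRC || ALPHA) };
}

fn main() {
    store::<true, true>();   // ok: RGBA with sourced alpha
    store::<false, false>(); // ok: RGB
    // store::<false, true>(); // compile error: assertion failed
}
```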
diff --git a/src/row/arch/neon/tests.rs b/src/row/arch/neon/tests.rs
index d312142..2355c27 100644
--- a/src/row/arch/neon/tests.rs
+++ b/src/row/arch/neon/tests.rs
@@ -2547,6 +2547,136 @@ fn neon_p416_rgba_matches_scalar_all_matrices() {
     }
 }
 
+// ---- YUVA 4:4:4 u8 RGBA equivalence (Ship 8b‑1b) --------------------
+//
+// Mirrors the no-alpha 4:4:4 RGBA pattern above for the alpha-source
+// path: per-pixel alpha byte is loaded from the source plane, masked
+// with `bits_mask::<10>()`, and depth-converted via `>> 2`. Pseudo-
+// random alpha is used to flush out lane-order corruption that a
+// solid-alpha buffer would mask.
+
+fn check_yuv444p_n_u8_neon_rgba_with_alpha_src_equivalence<const BITS: u32>(
+    width: usize,
+    matrix: ColorMatrix,
+    full_range: bool,
+    alpha_seed: usize,
+) {
+    let y = planar_n_plane::<BITS>(width, 37);
+    let u = planar_n_plane::<BITS>(width, 53);
+    let v = planar_n_plane::<BITS>(width, 71);
+    let a_src = planar_n_plane::<BITS>(width, alpha_seed);
+    let mut rgba_scalar = std::vec![0u8; width * 4];
+    let mut rgba_neon = std::vec![0u8; width * 4];
+    scalar::yuv_444p_n_to_rgba_with_alpha_src_row::<BITS>(
+        &y,
+        &u,
+        &v,
+        &a_src,
+        &mut rgba_scalar,
+        width,
+        matrix,
+        full_range,
+    );
+    unsafe {
+        yuv_444p_n_to_rgba_with_alpha_src_row::<BITS>(
+            &y,
+            &u,
+            &v,
+            &a_src,
+            &mut rgba_neon,
+            width,
+            matrix,
+            full_range,
+        );
+    }
+    assert_eq!(
+        rgba_scalar, rgba_neon,
+        "NEON Yuva444p<{BITS}> → RGBA u8 diverges (width={width}, matrix={matrix:?}, full_range={full_range}, alpha_seed={alpha_seed})"
+    );
+}
+
+#[test]
+#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")]
+fn neon_yuva444p10_rgba_matches_scalar_all_matrices_16() {
+    for m in [
+        ColorMatrix::Bt601,
+        ColorMatrix::Bt709,
+        ColorMatrix::Bt2020Ncl,
+        ColorMatrix::Smpte240m,
+        ColorMatrix::Fcc,
+        ColorMatrix::YCgCo,
+    ] {
+        for full in [true, false] {
+            check_yuv444p_n_u8_neon_rgba_with_alpha_src_equivalence::<10>(16, m, full, 89);
+        }
+    }
+}
+
+#[test]
+#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")]
+fn neon_yuva444p10_rgba_matches_scalar_widths() {
+    // Natural width + tail widths forcing scalar-tail dispatch.
+    for w in [16usize, 17, 31, 47, 63, 1920, 1922] {
+        check_yuv444p_n_u8_neon_rgba_with_alpha_src_equivalence::<10>(w, ColorMatrix::Bt709, true, 89);
+    }
+}
+
+#[test]
+#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")]
+fn neon_yuva444p10_rgba_matches_scalar_random_alpha() {
+    // Different alpha seeds — ensures the alpha lane order through
+    // `vst4q_u8` is not confused with R/G/B.
+    for seed in [13usize, 41, 89, 127, 211] {
+        check_yuv444p_n_u8_neon_rgba_with_alpha_src_equivalence::<10>(
+            16,
+            ColorMatrix::Bt601,
+            false,
+            seed,
+        );
+        check_yuv444p_n_u8_neon_rgba_with_alpha_src_equivalence::<10>(
+            31,
+            ColorMatrix::Bt2020Ncl,
+            true,
+            seed,
+        );
+    }
+}
+
+#[test]
+#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")]
+fn neon_yuva444p_n_rgba_matches_scalar_all_bits() {
+    // BITS = 9, 12, 14 (BITS = 10 is covered above with full matrix
+    // sweep). Confirms the variable shift count `BITS - 8` resolves
+    // correctly across the supported bit depths.
+    for full in [true, false] {
+        check_yuv444p_n_u8_neon_rgba_with_alpha_src_equivalence::<9>(16, ColorMatrix::Bt601, full, 53);
+        check_yuv444p_n_u8_neon_rgba_with_alpha_src_equivalence::<12>(16, ColorMatrix::Bt709, full, 53);
+        check_yuv444p_n_u8_neon_rgba_with_alpha_src_equivalence::<14>(
+            16,
+            ColorMatrix::Bt2020Ncl,
+            full,
+            53,
+        );
+    }
+}
+
+#[test]
+#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")]
+fn neon_yuva444p_n_rgba_matches_scalar_all_bits_widths() {
+    // BITS = 9, 12, 14 across tail widths — the variable-shift alpha
+    // path applies across both SIMD body and scalar tail.
+    for w in [17usize, 47, 1922] {
+        check_yuv444p_n_u8_neon_rgba_with_alpha_src_equivalence::<9>(
+            w,
+            ColorMatrix::Smpte240m,
+            false,
+            89,
+        );
+        check_yuv444p_n_u8_neon_rgba_with_alpha_src_equivalence::<12>(w, ColorMatrix::Fcc, true, 89);
+        check_yuv444p_n_u8_neon_rgba_with_alpha_src_equivalence::<14>(w, ColorMatrix::YCgCo, false, 89);
+    }
+}
+
 // ---- High-bit 4:4:4 native-depth `u16` RGBA equivalence (Ship 8 Tranche 7c) ----
 //
 // u16 RGBA wrappers share the math of their u16 RGB siblings — only
diff --git a/src/row/arch/wasm_simd128.rs b/src/row/arch/wasm_simd128.rs
index b5eab6a..34dd85f 100644
--- a/src/row/arch/wasm_simd128.rs
+++ b/src/row/arch/wasm_simd128.rs
@@ -639,7 +639,8 @@ fn clamp_u16_max_wasm(v: v128, zero_v: v128, max_v: v128) -> v128 {
 /// WASM simd128 YUV 4:4:4 planar 9/10/12/14-bit → packed **u8** RGB.
 /// Const-generic over `BITS ∈ {9, 10, 12, 14}`. Block size 16 pixels.
 ///
-/// Thin wrapper over [`yuv_444p_n_to_rgb_or_rgba_row`] with `ALPHA = false`.
+/// Thin wrapper over [`yuv_444p_n_to_rgb_or_rgba_row`] with
+/// `ALPHA = false, ALPHA_SRC = false`.
 ///
 /// # Safety
 ///
@@ -659,7 +660,9 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_row<const BITS: u32>(
 ) {
     // SAFETY: caller obligations forwarded to the shared impl.
     unsafe {
-        yuv_444p_n_to_rgb_or_rgba_row::<BITS, false>(y, u, v, rgb_out, width, matrix, full_range);
+        yuv_444p_n_to_rgb_or_rgba_row::<BITS, false, false>(
+            y, u, v, rgb_out, width, matrix, full_range, None,
+        );
     }
 }
 
@@ -667,7 +670,8 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_row<const BITS: u32>(
 /// (`R, G, B, 0xFF`). Same numerical contract as
 /// [`yuv_444p_n_to_rgb_row`].
 ///
-/// Thin wrapper over [`yuv_444p_n_to_rgb_or_rgba_row`] with `ALPHA = true`.
+/// Thin wrapper over [`yuv_444p_n_to_rgb_or_rgba_row`] with
+/// `ALPHA = true, ALPHA_SRC = false`.
 ///
 /// # Safety
 ///
@@ -685,24 +689,76 @@ pub(crate) unsafe fn yuv_444p_n_to_rgba_row<const BITS: u32>(
 ) {
     // SAFETY: caller obligations forwarded to the shared impl.
     unsafe {
-        yuv_444p_n_to_rgb_or_rgba_row::<BITS, true>(y, u, v, rgba_out, width, matrix, full_range);
+        yuv_444p_n_to_rgb_or_rgba_row::<BITS, true, false>(
+            y, u, v, rgba_out, width, matrix, full_range, None,
+        );
+    }
+}
+
+/// WASM simd128 YUVA 4:4:4 planar 9/10/12/14-bit → packed **8-bit
+/// RGBA** with the per-pixel alpha byte **sourced from `a_src`**
+/// (depth-converted via `>> (BITS - 8)`) instead of being constant
+/// `0xFF`. Same numerical contract as [`yuv_444p_n_to_rgba_row`] for
+/// R/G/B.
+///
+/// Thin wrapper over [`yuv_444p_n_to_rgb_or_rgba_row`] with
+/// `ALPHA = true, ALPHA_SRC = true`.
+///
+/// # Safety
+///
+/// Same as [`yuv_444p_n_to_rgba_row`] plus `a_src.len() >= width`.
+#[inline]
+#[target_feature(enable = "simd128")]
+#[allow(clippy::too_many_arguments)]
+pub(crate) unsafe fn yuv_444p_n_to_rgba_with_alpha_src_row<const BITS: u32>(
+    y: &[u16],
+    u: &[u16],
+    v: &[u16],
+    a_src: &[u16],
+    rgba_out: &mut [u8],
+    width: usize,
+    matrix: ColorMatrix,
+    full_range: bool,
+) {
+    // SAFETY: caller obligations forwarded to the shared impl.
+    unsafe {
+        yuv_444p_n_to_rgb_or_rgba_row::<BITS, true, true>(
+            y,
+            u,
+            v,
+            rgba_out,
+            width,
+            matrix,
+            full_range,
+            Some(a_src),
+        );
     }
 }
 
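simd128 has no per-`BITS` immediate u16 shift to specialize, but `u16x8_shr` takes a runtime count, so `BITS - 8` feeds it directly. A hedged, self-contained sketch of that narrowing step (names and structure assumed, not the crate's kernel, which is not fully shown in this patch):

```rust
#[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
use core::arch::wasm32::{u16x8_shr, u8x16_narrow_i16x8, v128};

/// Illustrative only: shift two u16x8 alpha vectors down into 8-bit
/// range, then saturating-narrow the pair into one u8x16 vector.
/// Assumes both inputs are already masked to BITS bits, so the
/// shifted values fit in 0..=255 and the narrow cannot saturate.
#[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
fn narrow_alpha<const BITS: u32>(a_lo: v128, a_hi: v128) -> v128 {
    let lo = u16x8_shr(a_lo, BITS - 8);
    let hi = u16x8_shr(a_hi, BITS - 8);
    u8x16_narrow_i16x8(lo, hi)
}
```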
 /// Shared WASM simd128 high-bit-depth YUV 4:4:4 kernel for
-/// [`yuv_444p_n_to_rgb_row`] (`ALPHA = false`, `write_rgb_16`) and
-/// [`yuv_444p_n_to_rgba_row`] (`ALPHA = true`, `write_rgba_16` with
-/// constant `0xFF` alpha vector).
+/// [`yuv_444p_n_to_rgb_row`] (`ALPHA = false, ALPHA_SRC = false`,
+/// `write_rgb_16`), [`yuv_444p_n_to_rgba_row`] (`ALPHA = true,
+/// ALPHA_SRC = false`, `write_rgba_16` with constant `0xFF` alpha) and
+/// [`yuv_444p_n_to_rgba_with_alpha_src_row`] (`ALPHA = true,
+/// ALPHA_SRC = true`, `write_rgba_16` with the alpha lane loaded and
+/// depth-converted from `a_src`).
 ///
 /// # Safety
 ///
 /// 1. **simd128 must be enabled at compile time.**
 /// 2. `y.len() >= width`, `u.len() >= width`, `v.len() >= width`,
 ///    `out.len() >= width * if ALPHA { 4 } else { 3 }`.
-/// 3. `BITS` must be one of `{9, 10, 12, 14}`.
+/// 3. When `ALPHA_SRC = true`: `a_src` must be `Some(_)` and
+///    `a_src.unwrap().len() >= width`.
+/// 4. `BITS` must be one of `{9, 10, 12, 14}`.
 #[inline]
 #[target_feature(enable = "simd128")]
-pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_row<const BITS: u32, const ALPHA: bool>(
+#[allow(clippy::too_many_arguments)]
+pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_row<
+    const BITS: u32,
+    const ALPHA: bool,
+    const ALPHA_SRC: bool,
+>(
     y: &[u16],
     u: &[u16],
     v: &[u16],
@@ -710,13 +766,20 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_row<
     width: usize,
     matrix: ColorMatrix,
     full_range: bool,
+    a_src: Option<&[u16]>,
 ) {
     const { assert!(BITS == 9 || BITS == 10 || BITS == 12 || BITS == 14) };
+    // Source alpha requires RGBA output — there is no 3 bpp store with
+    // alpha to put it in.
+    const { assert!(!ALPHA_SRC || ALPHA) };
     let bpp: usize = if ALPHA { 4 } else { 3 };
     debug_assert!(y.len() >= width);
     debug_assert!(u.len() >= width);
     debug_assert!(v.len() >= width);
     debug_assert!(out.len() >= width * bpp);
+    if ALPHA_SRC {
+        debug_assert!(a_src.as_ref().is_some_and(|s| s.len() >= width));
+    }
 
     let coeffs = scalar::Coefficients::for_matrix(matrix);
     let (y_off, y_scale, c_scale) = scalar::range_params_n::<BITS>(full_range);
@@ -794,7 +857,26 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_row<
         let tail_y = &y[x..];
         let tail_u = &u[x..];
         let tail_v = &v[x..];
         let tail_out = &mut out[x * bpp..];
         let tail_w = width - x;
-        if ALPHA {
+        if ALPHA_SRC {
+            let tail_a = &a_src.unwrap()[x..];
+            scalar::yuv_444p_n_to_rgba_with_alpha_src_row::<BITS>(
+                tail_y, tail_u, tail_v, tail_a, tail_out, tail_w, matrix, full_range,
+            );
+        } else if ALPHA {
             scalar::yuv_444p_n_to_rgba_row::<BITS>(
                 tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range,
             );
diff --git a/src/row/arch/wasm_simd128/tests.rs b/src/row/arch/wasm_simd128/tests.rs
index 56603cd..21e4377 100644
--- a/src/row/arch/wasm_simd128/tests.rs
+++ b/src/row/arch/wasm_simd128/tests.rs
@@ -2084,6 +2084,152 @@ fn simd128_p416_rgba_matches_scalar_all_matrices() {
     }
 }
 
+// ---- YUVA 4:4:4 u8 RGBA equivalence (Ship 8b‑1b) --------------------
+//
+// Mirrors the no-alpha 4:4:4 RGBA pattern above for the alpha-source
+// path: per-pixel alpha byte is loaded from the source plane, masked
+// with `bits_mask::<10>()`, and depth-converted via `>> 2`. Pseudo-
+// random alpha is used to flush out lane-order corruption that a
+// solid-alpha buffer would mask. (Module-level cfg gates these on
+// `target_feature = "simd128"`, so no per-test feature guard is
+// needed.)
+
+fn check_yuv444p_n_u8_simd128_rgba_with_alpha_src_equivalence<const BITS: u32>(
+    width: usize,
+    matrix: ColorMatrix,
+    full_range: bool,
+    alpha_seed: usize,
+) {
+    let y = planar_n_plane::<BITS>(width, 37);
+    let u = planar_n_plane::<BITS>(width, 53);
+    let v = planar_n_plane::<BITS>(width, 71);
+    let a_src = planar_n_plane::<BITS>(width, alpha_seed);
+    let mut rgba_scalar = std::vec![0u8; width * 4];
+    let mut rgba_wasm = std::vec![0u8; width * 4];
+    scalar::yuv_444p_n_to_rgba_with_alpha_src_row::<BITS>(
+        &y,
+        &u,
+        &v,
+        &a_src,
+        &mut rgba_scalar,
+        width,
+        matrix,
+        full_range,
+    );
+    unsafe {
+        yuv_444p_n_to_rgba_with_alpha_src_row::<BITS>(
+            &y,
+            &u,
+            &v,
+            &a_src,
+            &mut rgba_wasm,
+            width,
+            matrix,
+            full_range,
+        );
+    }
+    assert_eq!(
+        rgba_scalar, rgba_wasm,
+        "wasm simd128 Yuva444p<{BITS}> → RGBA u8 diverges (width={width}, matrix={matrix:?}, full_range={full_range}, alpha_seed={alpha_seed})"
+    );
+}
+
+#[test]
+fn simd128_yuva444p10_rgba_matches_scalar_all_matrices_16() {
+    for m in [
+        ColorMatrix::Bt601,
+        ColorMatrix::Bt709,
+        ColorMatrix::Bt2020Ncl,
+        ColorMatrix::Smpte240m,
+        ColorMatrix::Fcc,
+        ColorMatrix::YCgCo,
+    ] {
+        for full in [true, false] {
+            check_yuv444p_n_u8_simd128_rgba_with_alpha_src_equivalence::<10>(16, m, full, 89);
+        }
+    }
+}
+
+#[test]
+fn simd128_yuva444p10_rgba_matches_scalar_widths() {
+    // Natural width + tail widths forcing scalar-tail dispatch.
+    for w in [16usize, 17, 31, 47, 63, 1920, 1922] {
+        check_yuv444p_n_u8_simd128_rgba_with_alpha_src_equivalence::<10>(
+            w,
+            ColorMatrix::Bt709,
+            true,
+            89,
+        );
+    }
+}
+
+#[test]
+fn simd128_yuva444p10_rgba_matches_scalar_random_alpha() {
+    // Different alpha seeds — `u8x16_narrow_i16x8` followed by
+    // `write_rgba_16` must place alpha in the 4th channel without
+    // lane-order corruption.
+    for seed in [13usize, 41, 89, 127, 211] {
+        check_yuv444p_n_u8_simd128_rgba_with_alpha_src_equivalence::<10>(
+            16,
+            ColorMatrix::Bt601,
+            false,
+            seed,
+        );
+        check_yuv444p_n_u8_simd128_rgba_with_alpha_src_equivalence::<10>(
+            31,
+            ColorMatrix::Bt2020Ncl,
+            true,
+            seed,
+        );
+    }
+}
+
+#[test]
+fn simd128_yuva444p_n_rgba_matches_scalar_all_bits() {
+    // BITS = 9, 12, 14 (BITS = 10 covered above). Confirms `u16x8_shr`
+    // with count `(BITS - 8)` resolves correctly across the supported
+    // bit depths.
+    for full in [true, false] {
+        check_yuv444p_n_u8_simd128_rgba_with_alpha_src_equivalence::<9>(
+            16,
+            ColorMatrix::Bt601,
+            full,
+            53,
+        );
+        check_yuv444p_n_u8_simd128_rgba_with_alpha_src_equivalence::<12>(
+            16,
+            ColorMatrix::Bt709,
+            full,
+            53,
+        );
+        check_yuv444p_n_u8_simd128_rgba_with_alpha_src_equivalence::<14>(
+            16,
+            ColorMatrix::Bt2020Ncl,
+            full,
+            53,
+        );
+    }
+}
+
+#[test]
+fn simd128_yuva444p_n_rgba_matches_scalar_all_bits_widths() {
+    for w in [17usize, 47, 1922] {
+        check_yuv444p_n_u8_simd128_rgba_with_alpha_src_equivalence::<9>(
+            w,
+            ColorMatrix::Smpte240m,
+            false,
+            89,
+        );
+        check_yuv444p_n_u8_simd128_rgba_with_alpha_src_equivalence::<12>(w, ColorMatrix::Fcc, true, 89);
+        check_yuv444p_n_u8_simd128_rgba_with_alpha_src_equivalence::<14>(
+            w,
+            ColorMatrix::YCgCo,
+            false,
+            89,
+        );
+    }
+}
+
 // ---- High-bit 4:4:4 native-depth `u16` RGBA equivalence (Ship 8 Tranche 7c) ----
 
 fn check_yuv444p_n_u16_simd128_rgba_equivalence(
diff --git a/src/row/arch/x86_avx2.rs b/src/row/arch/x86_avx2.rs
index 070f9e7..8ba3785 100644
--- a/src/row/arch/x86_avx2.rs
+++ b/src/row/arch/x86_avx2.rs
@@ -772,7 +772,9 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_row<const BITS: u32>(
 ) {
     // SAFETY: caller obligations forwarded to the shared impl.
     unsafe {
-        yuv_444p_n_to_rgb_or_rgba_row::<BITS, false>(y, u, v, rgb_out, width, matrix, full_range);
+        yuv_444p_n_to_rgb_or_rgba_row::<BITS, false, false>(
+            y, u, v, rgb_out, width, matrix, full_range, None,
+        );
     }
 }
 
@@ -780,7 +782,8 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_row<const BITS: u32>(
 /// (`R, G, B, 0xFF`). Same numerical contract as
 /// [`yuv_444p_n_to_rgb_row`].
 ///
-/// Thin wrapper over [`yuv_444p_n_to_rgb_or_rgba_row`] with `ALPHA = true`.
+/// Thin wrapper over [`yuv_444p_n_to_rgb_or_rgba_row`] with
+/// `ALPHA = true, ALPHA_SRC = false`.
 ///
 /// # Safety
 ///
@@ -798,24 +801,75 @@ pub(crate) unsafe fn yuv_444p_n_to_rgba_row<const BITS: u32>(
 ) {
     // SAFETY: caller obligations forwarded to the shared impl.
     unsafe {
-        yuv_444p_n_to_rgb_or_rgba_row::<BITS, true>(y, u, v, rgba_out, width, matrix, full_range);
+        yuv_444p_n_to_rgb_or_rgba_row::<BITS, true, false>(
+            y, u, v, rgba_out, width, matrix, full_range, None,
+        );
+    }
+}
+
+/// AVX2 YUVA 4:4:4 planar 9/10/12/14-bit → packed **8-bit RGBA** with
+/// the per-pixel alpha byte **sourced from `a_src`** (depth-converted
+/// via `>> (BITS - 8)`) instead of being constant `0xFF`. Same
+/// numerical contract as [`yuv_444p_n_to_rgba_row`] for R/G/B.
+///
+/// Thin wrapper over [`yuv_444p_n_to_rgb_or_rgba_row`] with
+/// `ALPHA = true, ALPHA_SRC = true`.
+///
+/// # Safety
+///
+/// Same as [`yuv_444p_n_to_rgba_row`] plus `a_src.len() >= width`.
+#[inline]
+#[target_feature(enable = "avx2")]
+#[allow(clippy::too_many_arguments)]
+pub(crate) unsafe fn yuv_444p_n_to_rgba_with_alpha_src_row<const BITS: u32>(
+    y: &[u16],
+    u: &[u16],
+    v: &[u16],
+    a_src: &[u16],
+    rgba_out: &mut [u8],
+    width: usize,
+    matrix: ColorMatrix,
+    full_range: bool,
+) {
+    // SAFETY: caller obligations forwarded to the shared impl.
+    unsafe {
+        yuv_444p_n_to_rgb_or_rgba_row::<BITS, true, true>(
+            y,
+            u,
+            v,
+            rgba_out,
+            width,
+            matrix,
+            full_range,
+            Some(a_src),
+        );
     }
 }
 
 /// Shared AVX2 high-bit-depth YUV 4:4:4 kernel for
-/// [`yuv_444p_n_to_rgb_row`] (`ALPHA = false`, `write_rgb_32`) and
-/// [`yuv_444p_n_to_rgba_row`] (`ALPHA = true`, `write_rgba_32` with
-/// constant `0xFF` alpha vector).
+/// [`yuv_444p_n_to_rgb_row`] (`ALPHA = false, ALPHA_SRC = false`,
+/// `write_rgb_32`), [`yuv_444p_n_to_rgba_row`] (`ALPHA = true,
+/// ALPHA_SRC = false`, `write_rgba_32` with constant `0xFF` alpha) and
+/// [`yuv_444p_n_to_rgba_with_alpha_src_row`] (`ALPHA = true,
+/// ALPHA_SRC = true`, `write_rgba_32` with the alpha lane loaded and
+/// depth-converted from `a_src`).
 ///
 /// # Safety
 ///
 /// 1. **AVX2 must be available on the current CPU.**
 /// 2. `y.len() >= width`, `u.len() >= width`, `v.len() >= width`,
 ///    `out.len() >= width * if ALPHA { 4 } else { 3 }`.
-/// 3. `BITS` must be one of `{9, 10, 12, 14}`.
+/// 3. When `ALPHA_SRC = true`: `a_src` must be `Some(_)` and
+///    `a_src.unwrap().len() >= width`.
+/// 4. `BITS` must be one of `{9, 10, 12, 14}`.
 #[inline]
 #[target_feature(enable = "avx2")]
-pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_row<const BITS: u32, const ALPHA: bool>(
+#[allow(clippy::too_many_arguments)]
+pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_row<
+    const BITS: u32,
+    const ALPHA: bool,
+    const ALPHA_SRC: bool,
+>(
     y: &[u16],
     u: &[u16],
     v: &[u16],
@@ -823,13 +877,20 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_row<
     width: usize,
     matrix: ColorMatrix,
     full_range: bool,
+    a_src: Option<&[u16]>,
 ) {
     const { assert!(BITS == 9 || BITS == 10 || BITS == 12 || BITS == 14) };
+    // Source alpha requires RGBA output — there is no 3 bpp store with
+    // alpha to put it in.
+    const { assert!(!ALPHA_SRC || ALPHA) };
     let bpp: usize = if ALPHA { 4 } else { 3 };
     debug_assert!(y.len() >= width);
     debug_assert!(u.len() >= width);
     debug_assert!(v.len() >= width);
     debug_assert!(out.len() >= width * bpp);
+    if ALPHA_SRC {
+        debug_assert!(a_src.as_ref().is_some_and(|s| s.len() >= width));
+    }
 
     let coeffs = scalar::Coefficients::for_matrix(matrix);
     let (y_off, y_scale, c_scale) = scalar::range_params_n::<BITS>(full_range);
@@ -932,7 +993,27 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_row<
             if ALPHA {
-                write_rgba_32(r_u8, g_u8, b_u8, alpha_u8, out.as_mut_ptr().add(x * 4));
+                let a_u8 = if ALPHA_SRC {
+                    let a_ptr = a_src.unwrap().as_ptr().add(x);
+                    let a_lo = _mm256_loadu_si256(a_ptr.cast());
+                    let a_hi = _mm256_loadu_si256(a_ptr.add(16).cast());
+                    // `_mm256_srli_epi16::<IMM8>` requires a literal shift, so
+                    // use `_mm256_srl_epi16` with a count vector built from
+                    // `BITS - 8`. `_mm256_packus_epi16` interleaves the two
+                    // 128-bit lanes — `narrow_u8x32` already pays this cost for
+                    // R/G/B; we use the same helper for the alpha lane.
+                    let a_shr = _mm_cvtsi32_si128((BITS - 8) as i32);
+                    let a_lo_shifted = _mm256_srl_epi16(a_lo, a_shr);
+                    let a_hi_shifted = _mm256_srl_epi16(a_hi, a_shr);
+                    narrow_u8x32(a_lo_shifted, a_hi_shifted)
+                } else {
+                    alpha_u8
+                };
+                write_rgba_32(r_u8, g_u8, b_u8, a_u8, out.as_mut_ptr().add(x * 4));
             } else {
                 write_rgb_32(r_u8, g_u8, b_u8, out.as_mut_ptr().add(x * 3));
             }
@@ -946,7 +1027,13 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_row<
         let tail_v = &v[x..];
         let tail_out = &mut out[x * bpp..];
         let tail_w = width - x;
-        if ALPHA {
+        if ALPHA_SRC {
+            let tail_a = &a_src.unwrap()[x..];
+            scalar::yuv_444p_n_to_rgba_with_alpha_src_row::<BITS>(
+                tail_y, tail_u, tail_v, tail_a, tail_out, tail_w, matrix, full_range,
+            );
+        } else if ALPHA {
             scalar::yuv_444p_n_to_rgba_row::<BITS>(
                 tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range,
            );
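The comment in the kernel body is the crux of the x86 alpha path: `_mm256_srli_epi16` takes its count as a const generic in Rust, which a `BITS`-dependent expression cannot supply, while `_mm256_srl_epi16` reads the count from the low bits of an XMM register. A minimal sketch of the two forms (standalone, not the crate's code):

```rust
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::{
    __m128i, __m256i, _mm256_srl_epi16, _mm256_srli_epi16, _mm_cvtsi32_si128,
};

/// Illustrative only: immediate-count shift. IMM8 must be a literal or
/// a free const parameter; `BITS - 8` cannot be passed through here.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn shift_imm<const IMM8: i32>(v: __m256i) -> __m256i {
    _mm256_srli_epi16::<IMM8>(v)
}

/// Illustrative only: variable-count shift. The count rides in an XMM
/// register, so any const-generic-derived value works.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn shift_var(v: __m256i, bits: u32) -> __m256i {
    let count: __m128i = _mm_cvtsi32_si128((bits - 8) as i32);
    _mm256_srl_epi16(v, count)
}
```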
diff --git a/src/row/arch/x86_avx2/tests.rs b/src/row/arch/x86_avx2/tests.rs
index e6a6009..d464ee4 100644
--- a/src/row/arch/x86_avx2/tests.rs
+++ b/src/row/arch/x86_avx2/tests.rs
@@ -2335,6 +2335,146 @@ fn avx2_p416_rgba_matches_scalar_all_matrices() {
     }
 }
 
+// ---- YUVA 4:4:4 u8 RGBA equivalence (Ship 8b‑1b) --------------------
+//
+// Mirrors the no-alpha 4:4:4 RGBA pattern above for the alpha-source
+// path: per-pixel alpha byte is loaded from the source plane, masked
+// with `bits_mask::<10>()`, and depth-converted via `>> 2`. Pseudo-
+// random alpha is used to flush out lane-order corruption that a
+// solid-alpha buffer would mask. AVX2's `narrow_u8x32` per-lane
+// permute fixup is exercised on the alpha lane just like R/G/B.
+
+fn check_yuv444p_n_u8_avx2_rgba_with_alpha_src_equivalence<const BITS: u32>(
+    width: usize,
+    matrix: ColorMatrix,
+    full_range: bool,
+    alpha_seed: usize,
+) {
+    let y = planar_n_plane::<BITS>(width, 37);
+    let u = planar_n_plane::<BITS>(width, 53);
+    let v = planar_n_plane::<BITS>(width, 71);
+    let a_src = planar_n_plane::<BITS>(width, alpha_seed);
+    let mut rgba_scalar = std::vec![0u8; width * 4];
+    let mut rgba_simd = std::vec![0u8; width * 4];
+    scalar::yuv_444p_n_to_rgba_with_alpha_src_row::<BITS>(
+        &y,
+        &u,
+        &v,
+        &a_src,
+        &mut rgba_scalar,
+        width,
+        matrix,
+        full_range,
+    );
+    unsafe {
+        yuv_444p_n_to_rgba_with_alpha_src_row::<BITS>(
+            &y,
+            &u,
+            &v,
+            &a_src,
+            &mut rgba_simd,
+            width,
+            matrix,
+            full_range,
+        );
+    }
+    assert_eq!(
+        rgba_scalar, rgba_simd,
+        "AVX2 Yuva444p<{BITS}> → RGBA u8 diverges (width={width}, matrix={matrix:?}, full_range={full_range}, alpha_seed={alpha_seed})"
+    );
+}
+
+#[test]
+fn avx2_yuva444p10_rgba_matches_scalar_all_matrices_32() {
+    if !std::arch::is_x86_feature_detected!("avx2") {
+        return;
+    }
+    for m in [
+        ColorMatrix::Bt601,
+        ColorMatrix::Bt709,
+        ColorMatrix::Bt2020Ncl,
+        ColorMatrix::Smpte240m,
+        ColorMatrix::Fcc,
+        ColorMatrix::YCgCo,
+    ] {
+        for full in [true, false] {
+            check_yuv444p_n_u8_avx2_rgba_with_alpha_src_equivalence::<10>(32, m, full, 89);
+        }
+    }
+}
+
+#[test]
+fn avx2_yuva444p10_rgba_matches_scalar_widths() {
+    if !std::arch::is_x86_feature_detected!("avx2") {
+        return;
+    }
+    // Natural width + tail widths forcing scalar-tail dispatch.
+    for w in [32usize, 17, 31, 47, 63, 1920, 1922] {
+        check_yuv444p_n_u8_avx2_rgba_with_alpha_src_equivalence::<10>(w, ColorMatrix::Bt709, true, 89);
+    }
+}
+
+#[test]
+fn avx2_yuva444p10_rgba_matches_scalar_random_alpha() {
+    if !std::arch::is_x86_feature_detected!("avx2") {
+        return;
+    }
+    // Different alpha seeds — `_mm256_packus_epi16` followed by the
+    // `narrow_u8x32` per-lane permute fixup must place alpha in the 4th
+    // channel without lane-order corruption.
+    for seed in [13usize, 41, 89, 127, 211] {
+        check_yuv444p_n_u8_avx2_rgba_with_alpha_src_equivalence::<10>(
+            32,
+            ColorMatrix::Bt601,
+            false,
+            seed,
+        );
+        check_yuv444p_n_u8_avx2_rgba_with_alpha_src_equivalence::<10>(
+            47,
+            ColorMatrix::Bt2020Ncl,
+            true,
+            seed,
+        );
+    }
+}
+
+#[test]
+fn avx2_yuva444p_n_rgba_matches_scalar_all_bits() {
+    if !std::arch::is_x86_feature_detected!("avx2") {
+        return;
+    }
+    // BITS = 9, 12, 14 (BITS = 10 covered above). Confirms
+    // `_mm256_srl_epi16` with count `(BITS - 8)` resolves correctly
+    // across the supported bit depths.
+    for full in [true, false] {
+        check_yuv444p_n_u8_avx2_rgba_with_alpha_src_equivalence::<9>(32, ColorMatrix::Bt601, full, 53);
+        check_yuv444p_n_u8_avx2_rgba_with_alpha_src_equivalence::<12>(32, ColorMatrix::Bt709, full, 53);
+        check_yuv444p_n_u8_avx2_rgba_with_alpha_src_equivalence::<14>(
+            32,
+            ColorMatrix::Bt2020Ncl,
+            full,
+            53,
+        );
+    }
+}
+
+#[test]
+fn avx2_yuva444p_n_rgba_matches_scalar_all_bits_widths() {
+    if !std::arch::is_x86_feature_detected!("avx2") {
+        return;
+    }
+    for w in [17usize, 47, 1922] {
+        check_yuv444p_n_u8_avx2_rgba_with_alpha_src_equivalence::<9>(
+            w,
+            ColorMatrix::Smpte240m,
+            false,
+            89,
+        );
+        check_yuv444p_n_u8_avx2_rgba_with_alpha_src_equivalence::<12>(w, ColorMatrix::Fcc, true, 89);
+        check_yuv444p_n_u8_avx2_rgba_with_alpha_src_equivalence::<14>(w, ColorMatrix::YCgCo, false, 89);
+    }
+}
+
 // ---- High-bit 4:4:4 native-depth `u16` RGBA equivalence (Ship 8 Tranche 7c) ----
 
 fn check_yuv444p_n_u16_avx2_rgba_equivalence(
diff --git a/src/row/arch/x86_avx512.rs b/src/row/arch/x86_avx512.rs
index ba587d3..147bc7a 100644
--- a/src/row/arch/x86_avx512.rs
+++ b/src/row/arch/x86_avx512.rs
@@ -822,7 +822,8 @@ unsafe fn write_quarter_rgba(
 /// AVX-512 YUV 4:4:4 planar 9/10/12/14-bit → packed **u8** RGB.
 /// Const-generic over `BITS ∈ {9, 10, 12, 14}`. Block size 64 pixels.
 ///
-/// Thin wrapper over [`yuv_444p_n_to_rgb_or_rgba_row`] with `ALPHA = false`.
+/// Thin wrapper over [`yuv_444p_n_to_rgb_or_rgba_row`] with
+/// `ALPHA = false, ALPHA_SRC = false`.
 ///
 /// # Safety
 ///
@@ -842,7 +843,9 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_row<const BITS: u32>(
 ) {
     // SAFETY: caller obligations forwarded to the shared impl.
     unsafe {
-        yuv_444p_n_to_rgb_or_rgba_row::<BITS, false>(y, u, v, rgb_out, width, matrix, full_range);
+        yuv_444p_n_to_rgb_or_rgba_row::<BITS, false, false>(
+            y, u, v, rgb_out, width, matrix, full_range, None,
+        );
     }
 }
 
@@ -850,7 +853,8 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_row<const BITS: u32>(
 /// (`R, G, B, 0xFF`). Same numerical contract as
 /// [`yuv_444p_n_to_rgb_row`].
 ///
-/// Thin wrapper over [`yuv_444p_n_to_rgb_or_rgba_row`] with `ALPHA = true`.
+/// Thin wrapper over [`yuv_444p_n_to_rgb_or_rgba_row`] with
+/// `ALPHA = true, ALPHA_SRC = false`.
 ///
 /// # Safety
 ///
@@ -868,24 +872,76 @@ pub(crate) unsafe fn yuv_444p_n_to_rgba_row<const BITS: u32>(
 ) {
     // SAFETY: caller obligations forwarded to the shared impl.
     unsafe {
-        yuv_444p_n_to_rgb_or_rgba_row::<BITS, true>(y, u, v, rgba_out, width, matrix, full_range);
+        yuv_444p_n_to_rgb_or_rgba_row::<BITS, true, false>(
+            y, u, v, rgba_out, width, matrix, full_range, None,
+        );
+    }
+}
+
+/// AVX-512 YUVA 4:4:4 planar 9/10/12/14-bit → packed **8-bit RGBA**
+/// with the per-pixel alpha byte **sourced from `a_src`**
+/// (depth-converted via `>> (BITS - 8)`) instead of being constant
+/// `0xFF`. Same numerical contract as [`yuv_444p_n_to_rgba_row`] for
+/// R/G/B.
+///
+/// Thin wrapper over [`yuv_444p_n_to_rgb_or_rgba_row`] with
+/// `ALPHA = true, ALPHA_SRC = true`.
+///
+/// # Safety
+///
+/// Same as [`yuv_444p_n_to_rgba_row`] plus `a_src.len() >= width`.
+#[inline]
+#[target_feature(enable = "avx512f,avx512bw")]
+#[allow(clippy::too_many_arguments)]
+pub(crate) unsafe fn yuv_444p_n_to_rgba_with_alpha_src_row<const BITS: u32>(
+    y: &[u16],
+    u: &[u16],
+    v: &[u16],
+    a_src: &[u16],
+    rgba_out: &mut [u8],
+    width: usize,
+    matrix: ColorMatrix,
+    full_range: bool,
+) {
+    // SAFETY: caller obligations forwarded to the shared impl.
+    unsafe {
+        yuv_444p_n_to_rgb_or_rgba_row::<BITS, true, true>(
+            y,
+            u,
+            v,
+            rgba_out,
+            width,
+            matrix,
+            full_range,
+            Some(a_src),
+        );
     }
 }
 
 /// Shared AVX-512 high-bit-depth YUV 4:4:4 kernel for
-/// [`yuv_444p_n_to_rgb_row`] (`ALPHA = false`, `write_rgb_64`) and
-/// [`yuv_444p_n_to_rgba_row`] (`ALPHA = true`, `write_rgba_64` with
-/// constant `0xFF` alpha vector).
+/// [`yuv_444p_n_to_rgb_row`] (`ALPHA = false, ALPHA_SRC = false`,
+/// `write_rgb_64`), [`yuv_444p_n_to_rgba_row`] (`ALPHA = true,
+/// ALPHA_SRC = false`, `write_rgba_64` with constant `0xFF` alpha) and
+/// [`yuv_444p_n_to_rgba_with_alpha_src_row`] (`ALPHA = true,
+/// ALPHA_SRC = true`, `write_rgba_64` with the alpha lane loaded and
+/// depth-converted from `a_src`).
 ///
 /// # Safety
 ///
 /// 1. **AVX-512F + AVX-512BW must be available.**
 /// 2. `y.len() >= width`, `u.len() >= width`, `v.len() >= width`,
 ///    `out.len() >= width * if ALPHA { 4 } else { 3 }`.
-/// 3. `BITS` must be one of `{9, 10, 12, 14}`.
+/// 3. When `ALPHA_SRC = true`: `a_src` must be `Some(_)` and
+///    `a_src.unwrap().len() >= width`.
+/// 4. `BITS` must be one of `{9, 10, 12, 14}`.
 #[inline]
 #[target_feature(enable = "avx512f,avx512bw")]
-pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_row<const BITS: u32, const ALPHA: bool>(
+#[allow(clippy::too_many_arguments)]
+pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_row<
+    const BITS: u32,
+    const ALPHA: bool,
+    const ALPHA_SRC: bool,
+>(
     y: &[u16],
     u: &[u16],
     v: &[u16],
@@ -893,13 +949,20 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_row<
     width: usize,
     matrix: ColorMatrix,
     full_range: bool,
+    a_src: Option<&[u16]>,
 ) {
     const { assert!(BITS == 9 || BITS == 10 || BITS == 12 || BITS == 14) };
+    // Source alpha requires RGBA output — there is no 3 bpp store with
+    // alpha to put it in.
+    const { assert!(!ALPHA_SRC || ALPHA) };
     let bpp: usize = if ALPHA { 4 } else { 3 };
     debug_assert!(y.len() >= width);
     debug_assert!(u.len() >= width);
     debug_assert!(v.len() >= width);
     debug_assert!(out.len() >= width * bpp);
+    if ALPHA_SRC {
+        debug_assert!(a_src.as_ref().is_some_and(|s| s.len() >= width));
+    }
 
     let coeffs = scalar::Coefficients::for_matrix(matrix);
     let (y_off, y_scale, c_scale) = scalar::range_params_n::<BITS>(full_range);
@@ -1016,7 +1079,26 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_row<
             if ALPHA {
-                write_rgba_64(r_u8, g_u8, b_u8, alpha_u8, out.as_mut_ptr().add(x * 4));
+                let a_u8 = if ALPHA_SRC {
+                    let a_ptr = a_src.unwrap().as_ptr().add(x);
+                    let a_lo = _mm512_loadu_si512(a_ptr.cast());
+                    let a_hi = _mm512_loadu_si512(a_ptr.add(32).cast());
+                    // `_mm512_srli_epi16::<IMM8>` requires a literal shift, so
+                    // use `_mm512_srl_epi16` with a count vector built from
+                    // `BITS - 8`. `narrow_u8x64` applies the per-lane permute
+                    // fixup `pack_fixup` that R/G/B already pay for.
+                    let a_shr = _mm_cvtsi32_si128((BITS - 8) as i32);
+                    let a_lo_shifted = _mm512_srl_epi16(a_lo, a_shr);
+                    let a_hi_shifted = _mm512_srl_epi16(a_hi, a_shr);
+                    narrow_u8x64(a_lo_shifted, a_hi_shifted, pack_fixup)
+                } else {
+                    alpha_u8
+                };
+                write_rgba_64(r_u8, g_u8, b_u8, a_u8, out.as_mut_ptr().add(x * 4));
             } else {
                 write_rgb_64(r_u8, g_u8, b_u8, out.as_mut_ptr().add(x * 3));
             }
@@ -1030,7 +1112,13 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_row<
         let tail_v = &v[x..];
         let tail_out = &mut out[x * bpp..];
        let tail_w = width - x;
-        if ALPHA {
+        if ALPHA_SRC {
+            let tail_a = &a_src.unwrap()[x..];
+            scalar::yuv_444p_n_to_rgba_with_alpha_src_row::<BITS>(
+                tail_y, tail_u, tail_v, tail_a, tail_out, tail_w, matrix, full_range,
+            );
+        } else if ALPHA {
             scalar::yuv_444p_n_to_rgba_row::<BITS>(
                 tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range,
             );
diff --git a/src/row/arch/x86_avx512/tests.rs b/src/row/arch/x86_avx512/tests.rs
index 31152c3..bba9c0f 100644
--- a/src/row/arch/x86_avx512/tests.rs
+++ b/src/row/arch/x86_avx512/tests.rs
@@ -2359,6 +2359,166 @@ fn avx512_p416_rgba_matches_scalar_all_matrices() {
     }
 }
 
+// ---- YUVA 4:4:4 u8 RGBA equivalence (Ship 8b‑1b) --------------------
+//
+// Mirrors the no-alpha 4:4:4 RGBA pattern above for the alpha-source
+// path: per-pixel alpha byte is loaded from the source plane, masked
+// with `bits_mask::<10>()`, and depth-converted via `>> 2`. Pseudo-
+// random alpha is used to flush out lane-order corruption that a
+// solid-alpha buffer would mask. AVX-512's `narrow_u8x64` per-lane
+// permute fixup is exercised on the alpha lane just like R/G/B.
+
+fn check_yuv444p_n_u8_avx512_rgba_with_alpha_src_equivalence<const BITS: u32>(
+    width: usize,
+    matrix: ColorMatrix,
+    full_range: bool,
+    alpha_seed: usize,
+) {
+    let y = planar_n_plane::<BITS>(width, 37);
+    let u = planar_n_plane::<BITS>(width, 53);
+    let v = planar_n_plane::<BITS>(width, 71);
+    let a_src = planar_n_plane::<BITS>(width, alpha_seed);
+    let mut rgba_scalar = std::vec![0u8; width * 4];
+    let mut rgba_simd = std::vec![0u8; width * 4];
+    scalar::yuv_444p_n_to_rgba_with_alpha_src_row::<BITS>(
+        &y,
+        &u,
+        &v,
+        &a_src,
+        &mut rgba_scalar,
+        width,
+        matrix,
+        full_range,
+    );
+    unsafe {
+        yuv_444p_n_to_rgba_with_alpha_src_row::<BITS>(
+            &y,
+            &u,
+            &v,
+            &a_src,
+            &mut rgba_simd,
+            width,
+            matrix,
+            full_range,
+        );
+    }
+    assert_eq!(
+        rgba_scalar, rgba_simd,
+        "AVX-512 Yuva444p<{BITS}> → RGBA u8 diverges (width={width}, matrix={matrix:?}, full_range={full_range}, alpha_seed={alpha_seed})"
+    );
+}
+
+#[test]
+fn avx512_yuva444p10_rgba_matches_scalar_all_matrices_64() {
+    if !std::arch::is_x86_feature_detected!("avx512bw") {
+        return;
+    }
+    for m in [
+        ColorMatrix::Bt601,
+        ColorMatrix::Bt709,
+        ColorMatrix::Bt2020Ncl,
+        ColorMatrix::Smpte240m,
+        ColorMatrix::Fcc,
+        ColorMatrix::YCgCo,
+    ] {
+        for full in [true, false] {
+            check_yuv444p_n_u8_avx512_rgba_with_alpha_src_equivalence::<10>(64, m, full, 89);
+        }
+    }
+}
+
+#[test]
+fn avx512_yuva444p10_rgba_matches_scalar_widths() {
+    if !std::arch::is_x86_feature_detected!("avx512bw") {
+        return;
+    }
+    // Natural width + tail widths forcing scalar-tail dispatch.
+    for w in [64usize, 17, 31, 47, 63, 1920, 1922] {
+        check_yuv444p_n_u8_avx512_rgba_with_alpha_src_equivalence::<10>(
+            w,
+            ColorMatrix::Bt709,
+            true,
+            89,
+        );
+    }
+}
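`_mm512_packus_epi16` narrows within each 128-bit lane, so the packed bytes come out lane-interleaved and a cross-lane permute must restore pixel order. The crate's `narrow_u8x64`/`pack_fixup` are its own helpers and are not shown in this patch; a hedged sketch of the underlying idea:

```rust
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::{
    __m512i, _mm512_packus_epi16, _mm512_permutexvar_epi64, _mm512_setr_epi64,
};

/// Illustrative only: pack two u16x32 vectors into one u8x64. packus
/// emits, per 128-bit lane pair, 8 bytes from `lo` then 8 from `hi`,
/// so a qword permute is needed to put the 64 bytes in linear order.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512f,avx512bw")]
unsafe fn narrow_u8x64_sketch(lo: __m512i, hi: __m512i) -> __m512i {
    let packed = _mm512_packus_epi16(lo, hi);
    // Gather qwords [lo0, lo1, lo2, lo3, hi0, hi1, hi2, hi3].
    let idx = _mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7);
    _mm512_permutexvar_epi64(idx, packed)
}
```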
+
+#[test]
+fn avx512_yuva444p10_rgba_matches_scalar_random_alpha() {
+    if !std::arch::is_x86_feature_detected!("avx512bw") {
+        return;
+    }
+    // Different alpha seeds — `_mm512_packus_epi16` followed by the
+    // `narrow_u8x64` per-lane permute fixup must place alpha in the 4th
+    // channel without lane-order corruption.
+    for seed in [13usize, 41, 89, 127, 211] {
+        check_yuv444p_n_u8_avx512_rgba_with_alpha_src_equivalence::<10>(
+            64,
+            ColorMatrix::Bt601,
+            false,
+            seed,
+        );
+        check_yuv444p_n_u8_avx512_rgba_with_alpha_src_equivalence::<10>(
+            127,
+            ColorMatrix::Bt2020Ncl,
+            true,
+            seed,
+        );
+    }
+}
+
+#[test]
+fn avx512_yuva444p_n_rgba_matches_scalar_all_bits() {
+    if !std::arch::is_x86_feature_detected!("avx512bw") {
+        return;
+    }
+    // BITS = 9, 12, 14 (BITS = 10 covered above). Confirms
+    // `_mm512_srl_epi16` with count `(BITS - 8)` resolves correctly
+    // across the supported bit depths.
+    for full in [true, false] {
+        check_yuv444p_n_u8_avx512_rgba_with_alpha_src_equivalence::<9>(
+            64,
+            ColorMatrix::Bt601,
+            full,
+            53,
+        );
+        check_yuv444p_n_u8_avx512_rgba_with_alpha_src_equivalence::<12>(
+            64,
+            ColorMatrix::Bt709,
+            full,
+            53,
+        );
+        check_yuv444p_n_u8_avx512_rgba_with_alpha_src_equivalence::<14>(
+            64,
+            ColorMatrix::Bt2020Ncl,
+            full,
+            53,
+        );
+    }
+}
+
+#[test]
+fn avx512_yuva444p_n_rgba_matches_scalar_all_bits_widths() {
+    if !std::arch::is_x86_feature_detected!("avx512bw") {
+        return;
+    }
+    for w in [17usize, 47, 1922] {
+        check_yuv444p_n_u8_avx512_rgba_with_alpha_src_equivalence::<9>(
+            w,
+            ColorMatrix::Smpte240m,
+            false,
+            89,
+        );
+        check_yuv444p_n_u8_avx512_rgba_with_alpha_src_equivalence::<12>(w, ColorMatrix::Fcc, true, 89);
+        check_yuv444p_n_u8_avx512_rgba_with_alpha_src_equivalence::<14>(
+            w,
+            ColorMatrix::YCgCo,
+            false,
+            89,
+        );
+    }
+}
+
 // ---- High-bit 4:4:4 native-depth `u16` RGBA equivalence (Ship 8 Tranche 7c) ----
 
 fn check_yuv444p_n_u16_avx512_rgba_equivalence(
diff --git a/src/row/arch/x86_sse41.rs b/src/row/arch/x86_sse41.rs
index 390b8cc..fda5824 100644
--- a/src/row/arch/x86_sse41.rs
+++ b/src/row/arch/x86_sse41.rs
@@ -1087,7 +1087,8 @@ fn clamp_u16_max(v: __m128i, zero_v: __m128i, max_v: __m128i) -> __m128i {
 /// skipping the horizontal chroma-duplication step (4:4:4 chroma is
 /// 1:1 with Y, not paired).
 ///
-/// Thin wrapper over [`yuv_444p_n_to_rgb_or_rgba_row`] with `ALPHA = false`.
+/// Thin wrapper over [`yuv_444p_n_to_rgb_or_rgba_row`] with
+/// `ALPHA = false, ALPHA_SRC = false`.
 ///
 /// # Numerical contract
 ///
@@ -1111,7 +1112,9 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_row<const BITS: u32>(
 ) {
     // SAFETY: caller obligations forwarded to the shared impl.
     unsafe {
-        yuv_444p_n_to_rgb_or_rgba_row::<BITS, false>(y, u, v, rgb_out, width, matrix, full_range);
+        yuv_444p_n_to_rgb_or_rgba_row::<BITS, false, false>(
+            y, u, v, rgb_out, width, matrix, full_range, None,
+        );
     }
 }
 
@@ -1119,7 +1122,8 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_row<const BITS: u32>(
 /// (`R, G, B, 0xFF`). Same numerical contract as
 /// [`yuv_444p_n_to_rgb_row`].
 ///
-/// Thin wrapper over [`yuv_444p_n_to_rgb_or_rgba_row`] with `ALPHA = true`.
+/// Thin wrapper over [`yuv_444p_n_to_rgb_or_rgba_row`] with
+/// `ALPHA = true, ALPHA_SRC = false`.
 ///
 /// # Safety
 ///
@@ -1137,24 +1141,75 @@ pub(crate) unsafe fn yuv_444p_n_to_rgba_row<const BITS: u32>(
 ) {
     // SAFETY: caller obligations forwarded to the shared impl.
     unsafe {
-        yuv_444p_n_to_rgb_or_rgba_row::<BITS, true>(y, u, v, rgba_out, width, matrix, full_range);
+        yuv_444p_n_to_rgb_or_rgba_row::<BITS, true, false>(
+            y, u, v, rgba_out, width, matrix, full_range, None,
+        );
+    }
+}
+
+/// SSE4.1 YUVA 4:4:4 planar 9/10/12/14-bit → packed **8-bit RGBA** with
+/// the per-pixel alpha byte **sourced from `a_src`** (depth-converted
+/// via `>> (BITS - 8)`) instead of being constant `0xFF`. Same
+/// numerical contract as [`yuv_444p_n_to_rgba_row`] for R/G/B.
+///
+/// Thin wrapper over [`yuv_444p_n_to_rgb_or_rgba_row`] with
+/// `ALPHA = true, ALPHA_SRC = true`.
+///
+/// # Safety
+///
+/// Same as [`yuv_444p_n_to_rgba_row`] plus `a_src.len() >= width`.
+#[inline]
+#[target_feature(enable = "sse4.1")]
+#[allow(clippy::too_many_arguments)]
+pub(crate) unsafe fn yuv_444p_n_to_rgba_with_alpha_src_row<const BITS: u32>(
+    y: &[u16],
+    u: &[u16],
+    v: &[u16],
+    a_src: &[u16],
+    rgba_out: &mut [u8],
+    width: usize,
+    matrix: ColorMatrix,
+    full_range: bool,
+) {
+    // SAFETY: caller obligations forwarded to the shared impl.
+    unsafe {
+        yuv_444p_n_to_rgb_or_rgba_row::<BITS, true, true>(
+            y,
+            u,
+            v,
+            rgba_out,
+            width,
+            matrix,
+            full_range,
+            Some(a_src),
+        );
     }
 }
 
 /// Shared SSE4.1 high-bit-depth YUV 4:4:4 kernel for
-/// [`yuv_444p_n_to_rgb_row`] (`ALPHA = false`, `write_rgb_16`) and
-/// [`yuv_444p_n_to_rgba_row`] (`ALPHA = true`, `write_rgba_16` with
-/// constant `0xFF` alpha vector).
+/// [`yuv_444p_n_to_rgb_row`] (`ALPHA = false, ALPHA_SRC = false`,
+/// `write_rgb_16`), [`yuv_444p_n_to_rgba_row`] (`ALPHA = true,
+/// ALPHA_SRC = false`, `write_rgba_16` with constant `0xFF` alpha) and
+/// [`yuv_444p_n_to_rgba_with_alpha_src_row`] (`ALPHA = true,
+/// ALPHA_SRC = true`, `write_rgba_16` with the alpha lane loaded and
+/// depth-converted from `a_src`).
 ///
 /// # Safety
 ///
 /// 1. **SSE4.1 must be available on the current CPU.**
 /// 2. `y.len() >= width`, `u.len() >= width`, `v.len() >= width`,
 ///    `out.len() >= width * if ALPHA { 4 } else { 3 }`.
-/// 3. `BITS` must be one of `{9, 10, 12, 14}`.
+/// 3. When `ALPHA_SRC = true`: `a_src` must be `Some(_)` and
+///    `a_src.unwrap().len() >= width`.
+/// 4. `BITS` must be one of `{9, 10, 12, 14}`.
 #[inline]
 #[target_feature(enable = "sse4.1")]
-pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_row<const BITS: u32, const ALPHA: bool>(
+#[allow(clippy::too_many_arguments)]
+pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_row<
+    const BITS: u32,
+    const ALPHA: bool,
+    const ALPHA_SRC: bool,
+>(
     y: &[u16],
     u: &[u16],
     v: &[u16],
@@ -1162,13 +1217,20 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_row<
     width: usize,
     matrix: ColorMatrix,
     full_range: bool,
+    a_src: Option<&[u16]>,
 ) {
     const { assert!(BITS == 9 || BITS == 10 || BITS == 12 || BITS == 14) };
+    // Source alpha requires RGBA output — there is no 3 bpp store with
+    // alpha to put it in.
+    const { assert!(!ALPHA_SRC || ALPHA) };
     let bpp: usize = if ALPHA { 4 } else { 3 };
     debug_assert!(y.len() >= width);
     debug_assert!(u.len() >= width);
     debug_assert!(v.len() >= width);
     debug_assert!(out.len() >= width * bpp);
+    if ALPHA_SRC {
+        debug_assert!(a_src.as_ref().is_some_and(|s| s.len() >= width));
+    }
 
     let coeffs = scalar::Coefficients::for_matrix(matrix);
     let (y_off, y_scale, c_scale) = scalar::range_params_n::<BITS>(full_range);
@@ -1248,7 +1310,25 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_row<
             if ALPHA {
-                write_rgba_16(r_u8, g_u8, b_u8, alpha_u8, out.as_mut_ptr().add(x * 4));
+                let a_u8 = if ALPHA_SRC {
+                    let a_ptr = a_src.unwrap().as_ptr().add(x);
+                    let a_lo = _mm_loadu_si128(a_ptr.cast());
+                    let a_hi = _mm_loadu_si128(a_ptr.add(8).cast());
+                    // `_mm_srli_epi16::<IMM8>` requires a literal const generic
+                    // shift, so use `_mm_srl_epi16` with a count vector built
+                    // from `BITS - 8`.
+                    let a_shr = _mm_cvtsi32_si128((BITS - 8) as i32);
+                    let a_lo_shifted = _mm_srl_epi16(a_lo, a_shr);
+                    let a_hi_shifted = _mm_srl_epi16(a_hi, a_shr);
+                    _mm_packus_epi16(a_lo_shifted, a_hi_shifted)
+                } else {
+                    alpha_u8
+                };
+                write_rgba_16(r_u8, g_u8, b_u8, a_u8, out.as_mut_ptr().add(x * 4));
             } else {
                 write_rgb_16(r_u8, g_u8, b_u8, out.as_mut_ptr().add(x * 3));
             }
@@ -1262,7 +1342,13 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_row<
         let tail_v = &v[x..];
         let tail_out = &mut out[x * bpp..];
         let tail_w = width - x;
-        if ALPHA {
+        if ALPHA_SRC {
+            let tail_a = &a_src.unwrap()[x..];
+            scalar::yuv_444p_n_to_rgba_with_alpha_src_row::<BITS>(
+                tail_y, tail_u, tail_v, tail_a, tail_out, tail_w, matrix, full_range,
+            );
+        } else if ALPHA {
             scalar::yuv_444p_n_to_rgba_row::<BITS>(
                 tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range,
             );
diff --git a/src/row/arch/x86_sse41/tests.rs b/src/row/arch/x86_sse41/tests.rs
index 253925b..d02ecb1 100644
--- a/src/row/arch/x86_sse41/tests.rs
+++ b/src/row/arch/x86_sse41/tests.rs
@@ -2375,6 +2375,155 @@ fn sse41_p416_rgba_matches_scalar_all_matrices() {
     }
 }
 
+// ---- YUVA 4:4:4 u8 RGBA equivalence (Ship 8b‑1b) --------------------
+//
+// Mirrors the no-alpha 4:4:4 RGBA pattern above for the alpha-source
+// path: per-pixel alpha byte is loaded from the source plane, masked
+// with `bits_mask::<10>()`, and depth-converted via `>> 2`. Pseudo-
+// random alpha is used to flush out lane-order corruption that a
+// solid-alpha buffer would mask.
+
+fn check_yuv444p_n_u8_sse41_rgba_with_alpha_src_equivalence<const BITS: u32>(
+    width: usize,
+    matrix: ColorMatrix,
+    full_range: bool,
+    alpha_seed: usize,
+) {
+    let y = planar_n_plane::<BITS>(width, 37);
+    let u = planar_n_plane::<BITS>(width, 53);
+    let v = planar_n_plane::<BITS>(width, 71);
+    let a_src = planar_n_plane::<BITS>(width, alpha_seed);
+    let mut rgba_scalar = std::vec![0u8; width * 4];
+    let mut rgba_simd = std::vec![0u8; width * 4];
+    scalar::yuv_444p_n_to_rgba_with_alpha_src_row::<BITS>(
+        &y,
+        &u,
+        &v,
+        &a_src,
+        &mut rgba_scalar,
+        width,
+        matrix,
+        full_range,
+    );
+    unsafe {
+        yuv_444p_n_to_rgba_with_alpha_src_row::<BITS>(
+            &y,
+            &u,
+            &v,
+            &a_src,
+            &mut rgba_simd,
+            width,
+            matrix,
+            full_range,
+        );
+    }
+    assert_eq!(
+        rgba_scalar, rgba_simd,
+        "SSE4.1 Yuva444p<{BITS}> → RGBA u8 diverges (width={width}, matrix={matrix:?}, full_range={full_range}, alpha_seed={alpha_seed})"
+    );
+}
+
+#[test]
+fn sse41_yuva444p10_rgba_matches_scalar_all_matrices_16() {
+    if !std::arch::is_x86_feature_detected!("sse4.1") {
+        return;
+    }
+    for m in [
+        ColorMatrix::Bt601,
+        ColorMatrix::Bt709,
+        ColorMatrix::Bt2020Ncl,
+        ColorMatrix::Smpte240m,
+        ColorMatrix::Fcc,
+        ColorMatrix::YCgCo,
+    ] {
+        for full in [true, false] {
+            check_yuv444p_n_u8_sse41_rgba_with_alpha_src_equivalence::<10>(16, m, full, 89);
+        }
+    }
+}
+
+#[test]
+fn sse41_yuva444p10_rgba_matches_scalar_widths() {
+    if !std::arch::is_x86_feature_detected!("sse4.1") {
+        return;
+    }
+    // Natural width + tail widths forcing scalar-tail dispatch.
+    for w in [16usize, 17, 31, 47, 63, 1920, 1922] {
+        check_yuv444p_n_u8_sse41_rgba_with_alpha_src_equivalence::<10>(w, ColorMatrix::Bt709, true, 89);
+    }
+}
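Every equivalence check above feeds the kernels deterministic pseudo-random planes through the test helper `planar_n_plane`, which this patch uses but does not define. A hypothetical reconstruction of what would satisfy the call sites (not the crate's helper; only determinism and in-range values matter):

```rust
/// Hypothetical sketch of the test-plane generator: a deterministic,
/// seed-dependent sequence of BITS-wide samples. Any reproducible
/// generator works for the scalar/SIMD equivalence checks.
fn planar_n_plane<const BITS: u32>(width: usize, seed: usize) -> Vec<u16> {
    let mask = ((1u32 << BITS) - 1) as u16;
    (0..width)
        .map(|i| {
            // Cheap multiplicative mix of index and seed.
            let x = i.wrapping_mul(2654435761).wrapping_add(seed.wrapping_mul(97));
            (x as u16) & mask
        })
        .collect()
}
```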
+
+#[test]
+fn sse41_yuva444p10_rgba_matches_scalar_random_alpha() {
+    if !std::arch::is_x86_feature_detected!("sse4.1") {
+        return;
+    }
+    // Different alpha seeds — `_mm_packus_epi16` lane order through
+    // `write_rgba_16` must put alpha in the 4th channel, not collide
+    // with R/G/B.
+    for seed in [13usize, 41, 89, 127, 211] {
+        check_yuv444p_n_u8_sse41_rgba_with_alpha_src_equivalence::<10>(
+            16,
+            ColorMatrix::Bt601,
+            false,
+            seed,
+        );
+        check_yuv444p_n_u8_sse41_rgba_with_alpha_src_equivalence::<10>(
+            31,
+            ColorMatrix::Bt2020Ncl,
+            true,
+            seed,
+        );
+    }
+}
+
+#[test]
+fn sse41_yuva444p_n_rgba_matches_scalar_all_bits() {
+    if !std::arch::is_x86_feature_detected!("sse4.1") {
+        return;
+    }
+    // BITS = 9, 12, 14 (BITS = 10 covered above with full matrix sweep).
+    // Confirms `_mm_srl_epi16` with count `(BITS - 8)` resolves
+    // correctly across the supported bit depths.
+    for full in [true, false] {
+        check_yuv444p_n_u8_sse41_rgba_with_alpha_src_equivalence::<9>(16, ColorMatrix::Bt601, full, 53);
+        check_yuv444p_n_u8_sse41_rgba_with_alpha_src_equivalence::<12>(
+            16,
+            ColorMatrix::Bt709,
+            full,
+            53,
+        );
+        check_yuv444p_n_u8_sse41_rgba_with_alpha_src_equivalence::<14>(
+            16,
+            ColorMatrix::Bt2020Ncl,
+            full,
+            53,
+        );
+    }
+}
+
+#[test]
+fn sse41_yuva444p_n_rgba_matches_scalar_all_bits_widths() {
+    if !std::arch::is_x86_feature_detected!("sse4.1") {
+        return;
+    }
+    for w in [17usize, 47, 1922] {
+        check_yuv444p_n_u8_sse41_rgba_with_alpha_src_equivalence::<9>(
+            w,
+            ColorMatrix::Smpte240m,
+            false,
+            89,
+        );
+        check_yuv444p_n_u8_sse41_rgba_with_alpha_src_equivalence::<12>(w, ColorMatrix::Fcc, true, 89);
+        check_yuv444p_n_u8_sse41_rgba_with_alpha_src_equivalence::<14>(
+            w,
+            ColorMatrix::YCgCo,
+            false,
+            89,
+        );
+    }
+}
+
 // ---- High-bit 4:4:4 native-depth `u16` RGBA equivalence (Ship 8 Tranche 7c) ----
 
 fn check_yuv444p_n_u16_sse41_rgba_equivalence(
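For callers, the public dispatcher that fronts all of these per-arch routes keeps the shape of the existing 4:4:4 family plus an alpha plane. A usage sketch against the API shown in this patch (the parameter order is inferred from the internal dispatch call and the crate/module path is not shown here, so treat both as assumptions and check the real signature):

```rust
/// Illustrative only: convert one 10-bit YUVA 4:4:4 row to RGBA8.
/// Assumes `yuva444p10_to_rgba_row` and `ColorMatrix` are in scope.
fn convert_row(y: &[u16], u: &[u16], v: &[u16], a: &[u16], width: usize) -> Vec<u8> {
    let mut rgba = vec![0u8; width * 4];
    yuva444p10_to_rgba_row(
        y, u, v, a,
        &mut rgba,
        width,
        ColorMatrix::Bt709,
        false, // limited range
        true,  // use_simd: take the per-arch route when available
    );
    rgba
}
```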
diff --git a/src/row/mod.rs b/src/row/mod.rs
index d74a7ca..bd104f7 100644
--- a/src/row/mod.rs
+++ b/src/row/mod.rs
@@ -5004,21 +5004,14 @@ pub fn yuv444p16_to_rgba_u16_row(
     scalar::yuv_444p16_to_rgba_u16_row(y, u, v, rgba_out, width, matrix, full_range);
 }
 
-// ---- YUVA 4:4:4 RGBA dispatchers (Ship 8b‑1a prep) --------------------
+// ---- YUVA 4:4:4 RGBA dispatchers --------------------------------------
 //
 // Per-row dispatchers for the YUVA source family (currently Yuva444p10
-// only). The `use_simd` parameter is accepted for API parity with the
-// rest of the dispatcher family but is ignored in this PR — SIMD per-arch
-// routes for the alpha-source path land in:
-//
-// - Ship 8b‑1b: u8 RGBA SIMD on neon / avx512 / avx2 / sse41 /
-//   wasm_simd128.
-// - Ship 8b‑1c: u16 RGBA SIMD on the same backends.
-//
-// Until then the dispatcher routes unconditionally to the scalar path
-// (`scalar::yuv_444p_n_to_rgba_with_alpha_src_row::<10>` / its u16
-// counterpart), which already monomorphizes optimally — the
-// `ALPHA_SRC = true` branch is the only live arm in the per-pixel store.
+// only). The u8 RGBA dispatcher routes through the per-arch
+// `yuv_444p_n_to_rgba_with_alpha_src_row` SIMD wrappers, mirroring the
+// `yuv444p10_to_rgba_row` dispatcher's pattern. The u16 RGBA
+// dispatcher (`yuva444p10_to_rgba_u16_row`) stays scalar until SIMD
+// wiring lands in **Ship 8b‑1c**.
 
 /// Converts one row of **10-bit** YUVA 4:4:4 to packed **8-bit**
 /// **RGBA**. R / G / B are produced by the same Q15 i32 kernel family
@@ -5026,15 +5019,8 @@ pub fn yuv444p16_to_rgba_u16_row(
 /// **sourced from `a`** (depth-converted via `a >> 2` to fit `u8`)
 /// instead of being constant `0xFF`.
 ///
-/// # ⚠ Scalar-only as of Ship 8b‑1a
-///
-/// `use_simd` is accepted for forward-compatible API parity with the
-/// rest of the dispatcher family **but is ignored in this PR**. Every
-/// invocation runs the scalar reference regardless of the flag — SIMD
-/// wiring lands in **Ship 8b‑1b**. Throughput on 4:4:4 + alpha is
-/// substantially below the 4:4:4-no-alpha SIMD path until then;
-/// callers benchmarking the alpha-source path should re-measure once
-/// 8b‑1b lands. See the section comment above for staging context.
+/// `use_simd = false` forces the scalar reference path; otherwise
+/// per-arch dispatch matches [`yuv444p10_to_rgba_row`]'s pattern.
 #[cfg_attr(not(tarpaulin), inline(always))]
 #[allow(clippy::too_many_arguments)]
 pub fn yuva444p10_to_rgba_row(
@@ -5055,7 +5041,63 @@ pub fn yuva444p10_to_rgba_row(
     assert!(a.len() >= width, "a row too short");
     assert!(rgba_out.len() >= rgba_min, "rgba_out row too short");
 
-    let _ = use_simd; // SIMD per-arch routes land in Ship 8b‑1b PR.
+    if use_simd {
+        cfg_select! {
+            target_arch = "aarch64" => {
+                if neon_available() {
+                    // SAFETY: NEON verified.
+                    unsafe {
+                        arch::neon::yuv_444p_n_to_rgba_with_alpha_src_row::<10>(
+                            y, u, v, a, rgba_out, width, matrix, full_range,
+                        );
+                    }
+                    return;
+                }
+            },
+            target_arch = "x86_64" => {
+                if avx512_available() {
+                    // SAFETY: AVX‑512BW verified.
+                    unsafe {
+                        arch::x86_avx512::yuv_444p_n_to_rgba_with_alpha_src_row::<10>(
+                            y, u, v, a, rgba_out, width, matrix, full_range,
+                        );
+                    }
+                    return;
+                }
+                if avx2_available() {
+                    // SAFETY: AVX2 verified.
+                    unsafe {
+                        arch::x86_avx2::yuv_444p_n_to_rgba_with_alpha_src_row::<10>(
+                            y, u, v, a, rgba_out, width, matrix, full_range,
+                        );
+                    }
+                    return;
+                }
+                if sse41_available() {
+                    // SAFETY: SSE4.1 verified.
+                    unsafe {
+                        arch::x86_sse41::yuv_444p_n_to_rgba_with_alpha_src_row::<10>(
+                            y, u, v, a, rgba_out, width, matrix, full_range,
+                        );
+                    }
+                    return;
+                }
+            },
+            target_arch = "wasm32" => {
+                if simd128_available() {
+                    // SAFETY: simd128 compile‑time verified.
+                    unsafe {
+                        arch::wasm_simd128::yuv_444p_n_to_rgba_with_alpha_src_row::<10>(
+                            y, u, v, a, rgba_out, width, matrix, full_range,
+                        );
+                    }
+                    return;
+                }
+            },
+            _ => {}
+        }
+    }
     scalar::yuv_444p_n_to_rgba_with_alpha_src_row::<10>(
         y, u, v, a, rgba_out, width, matrix, full_range,
     );
diff --git a/src/sinker/mixed/yuva_4_4_4.rs b/src/sinker/mixed/yuva_4_4_4.rs
index 620e139..23af88d 100644
--- a/src/sinker/mixed/yuva_4_4_4.rs
+++ b/src/sinker/mixed/yuva_4_4_4.rs
@@ -28,12 +28,6 @@ impl<'a> MixedSinker<'a, Yuva444p10> {
     /// kernel family used by [`MixedSinker::with_rgba`]; the
     /// per-pixel alpha byte is **sourced from the alpha plane**
     /// (depth-converted via `a >> 2` to fit `u8`) — not constant `0xFF`.
-    ///
-    /// **Performance note (Ship 8b‑1a):** the alpha-source path runs
-    /// scalar regardless of `with_simd(true)` until SIMD wiring lands in
-    /// **Ship 8b‑1b**. The non-alpha 4:4:4 paths
-    /// (`MixedSinker`) already have native SIMD — only the
-    /// alpha-aware fork is staged.
     #[cfg_attr(not(tarpaulin), inline(always))]
     pub fn with_rgba(mut self, buf: &'a mut [u8]) -> Result {
         self.set_rgba(buf)?;
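As a quick numeric check of the `a >> 2` contract referenced in the dispatcher and sinker docs: 10-bit alpha spans 0..=1023, and dropping the two low bits maps it onto 0..=255 with both endpoints preserved.

```rust
fn main() {
    assert_eq!(1023u16 >> 2, 255); // full opacity stays full
    assert_eq!(0u16 >> 2, 0);      // full transparency stays zero
    assert_eq!(512u16 >> 2, 128);  // midpoint lands near mid-alpha
}
```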