From 0cc022cb6c6e6c512e05593f5cac653528894176 Mon Sep 17 00:00:00 2001
From: uqio <276879906+uqio@users.noreply.github.com>
Date: Sun, 26 Apr 2026 22:46:16 +1200
Subject: [PATCH 1/5] update

---
 src/row/arch/neon.rs | 102 +++++++++++++++++++++++++++++++++------
 1 file changed, 89 insertions(+), 13 deletions(-)

diff --git a/src/row/arch/neon.rs b/src/row/arch/neon.rs
index 443b469..ca4cfaf 100644
--- a/src/row/arch/neon.rs
+++ b/src/row/arch/neon.rs
@@ -282,8 +282,10 @@ unsafe fn yuv_420_to_rgb_or_rgba_row(
 /// 2. `width & 1 == 0`.
 /// 3. `y.len() >= width`, `u_half.len() >= width / 2`,
 ///    `v_half.len() >= width / 2`, `rgb_out.len() >= 3 * width`.
-/// 4. `BITS` must be one of `{10, 12, 14}` — the Q15 pipeline
+/// 4. `BITS` must be one of `{9, 10, 12, 14}` — the Q15 pipeline
 ///    overflows i32 at 16 bits; see [`scalar::range_params_n`].
+///
+/// Thin wrapper over [`yuv_420p_n_to_rgb_or_rgba_row`] with `ALPHA = false`.
 #[inline]
 #[target_feature(enable = "neon")]
 pub(crate) unsafe fn yuv_420p_n_to_rgb_row<const BITS: usize>(
@@ -294,13 +296,76 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_row<const BITS: usize>(
     y: &[u16],
     u_half: &[u16],
     v_half: &[u16],
     rgb_out: &mut [u8],
     width: usize,
     matrix: ColorMatrix,
     full_range: bool,
+) {
+    // SAFETY: caller obligations forwarded to the shared impl.
+    unsafe {
+        yuv_420p_n_to_rgb_or_rgba_row::<BITS, false>(
+            y, u_half, v_half, rgb_out, width, matrix, full_range,
+        );
+    }
+}
+
+/// NEON high-bit-depth YUV 4:2:0 → packed **8-bit RGBA** (`R, G, B, 0xFF`).
+///
+/// Same numerical contract as [`yuv_420p_n_to_rgb_row`]; the only
+/// differences are the per-pixel stride (4 vs 3) and the constant
+/// alpha byte (`0xFF`, opaque).
+///
+/// Thin wrapper over [`yuv_420p_n_to_rgb_or_rgba_row`] with `ALPHA = true`.
+///
+/// # Safety
+///
+/// Same as [`yuv_420p_n_to_rgb_row`] but `rgba_out.len() >= 4 * width`.
+#[inline]
+#[target_feature(enable = "neon")]
+pub(crate) unsafe fn yuv_420p_n_to_rgba_row<const BITS: usize>(
+    y: &[u16],
+    u_half: &[u16],
+    v_half: &[u16],
+    rgba_out: &mut [u8],
+    width: usize,
+    matrix: ColorMatrix,
+    full_range: bool,
+) {
+    // SAFETY: caller obligations forwarded to the shared impl.
+    unsafe {
+        yuv_420p_n_to_rgb_or_rgba_row::<BITS, true>(
+            y, u_half, v_half, rgba_out, width, matrix, full_range,
+        );
+    }
+}
+
+/// Shared NEON high-bit-depth YUV 4:2:0 kernel for
+/// [`yuv_420p_n_to_rgb_row`] (`ALPHA = false`, `vst3q_u8`) and
+/// [`yuv_420p_n_to_rgba_row`] (`ALPHA = true`, `vst4q_u8` with
+/// constant `0xFF` alpha vector).
+///
+/// # Safety
+///
+/// 1. **NEON must be available on the current CPU.**
+/// 2. `width & 1 == 0`.
+/// 3. `y.len() >= width`, `u_half.len() >= width / 2`,
+///    `v_half.len() >= width / 2`,
+///    `out.len() >= width * if ALPHA { 4 } else { 3 }`.
+/// 4. `BITS` must be one of `{9, 10, 12, 14}`.
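+///
+/// Call shape, as an illustrative sketch (not a doctest: the fn is
+/// crate-private and `unsafe`, and the `*_row` slice names are made up
+/// here). One 10-bit limited-range BT.709 row, straight to RGBA:
+///
+/// ```ignore
+/// // SAFETY: NEON present, `width` even, slices sized per items 1-4.
+/// unsafe {
+///     yuv_420p_n_to_rgb_or_rgba_row::<10, true>(
+///         y_row, u_row, v_row, rgba_row, width, ColorMatrix::Bt709, false,
+///     );
+/// }
+/// ```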
+#[inline] +#[target_feature(enable = "neon")] +pub(crate) unsafe fn yuv_420p_n_to_rgb_or_rgba_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, ) { const { assert!(BITS == 9 || BITS == 10 || BITS == 12 || BITS == 14) }; + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert_eq!(width & 1, 0); debug_assert!(y.len() >= width); debug_assert!(u_half.len() >= width / 2); debug_assert!(v_half.len() >= width / 2); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); @@ -324,6 +389,7 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_row( let cgv = vdupq_n_s32(coeffs.g_v()); let cbu = vdupq_n_s32(coeffs.b_u()); let cbv = vdupq_n_s32(coeffs.b_v()); + let alpha_u8 = vdupq_n_u8(0xFF); let mut x = 0usize; while x + 16 <= width { @@ -385,23 +451,33 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_row( vqmovun_s16(vqaddq_s16(y_scaled_hi, r_dup_hi)), ); - let rgb = uint8x16x3_t(r_u8, g_u8, b_u8); - vst3q_u8(rgb_out.as_mut_ptr().add(x * 3), rgb); + if ALPHA { + let rgba = uint8x16x4_t(r_u8, g_u8, b_u8, alpha_u8); + vst4q_u8(out.as_mut_ptr().add(x * 4), rgba); + } else { + let rgb = uint8x16x3_t(r_u8, g_u8, b_u8); + vst3q_u8(out.as_mut_ptr().add(x * 3), rgb); + } x += 16; } // Scalar tail — remaining < 16 pixels (always even per 4:2:0). if x < width { - scalar::yuv_420p_n_to_rgb_row::( - &y[x..width], - &u_half[x / 2..width / 2], - &v_half[x / 2..width / 2], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_u = &u_half[x / 2..width / 2]; + let tail_v = &v_half[x / 2..width / 2]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::yuv_420p_n_to_rgba_row::( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); + } else { + scalar::yuv_420p_n_to_rgb_row::( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); + } } } } From cfff0dcd336b0e9e8840891f18ba0a1a6a6bc18a Mon Sep 17 00:00:00 2001 From: uqio <276879906+uqio@users.noreply.github.com> Date: Sun, 26 Apr 2026 22:51:02 +1200 Subject: [PATCH 2/5] update --- src/row/arch/neon.rs | 263 ++++++++++++++++++++++++++++++++++++------- 1 file changed, 225 insertions(+), 38 deletions(-) diff --git a/src/row/arch/neon.rs b/src/row/arch/neon.rs index ca4cfaf..917c64b 100644 --- a/src/row/arch/neon.rs +++ b/src/row/arch/neon.rs @@ -913,6 +913,8 @@ fn clamp_u16_max(v: int16x8_t, zero_v: int16x8_t, max_v: int16x8_t) -> uint16x8_ /// 3. `y.len() >= width`, `uv_half.len() >= width`, /// `rgb_out.len() >= 3 * width`. /// 4. `BITS` must be one of `{10, 12}`. +/// +/// Thin wrapper over [`p_n_to_rgb_or_rgba_row`] with `ALPHA = false`. #[inline] #[target_feature(enable = "neon")] pub(crate) unsafe fn p_n_to_rgb_row( @@ -923,10 +925,66 @@ pub(crate) unsafe fn p_n_to_rgb_row( matrix: ColorMatrix, full_range: bool, ) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + p_n_to_rgb_or_rgba_row::(y, uv_half, rgb_out, width, matrix, full_range); + } +} + +/// NEON high-bit-packed semi-planar 4:2:0 → packed **8-bit RGBA** +/// (`R, G, B, 0xFF`). Same numerical contract as [`p_n_to_rgb_row`]; +/// 4 bpp store with constant alpha. +/// +/// Thin wrapper over [`p_n_to_rgb_or_rgba_row`] with `ALPHA = true`. +/// +/// # Safety +/// +/// Same as [`p_n_to_rgb_row`] but `rgba_out.len() >= 4 * width`. 
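+///
+/// Input layout, sketched (P010-style, MSB-aligned samples; the slice
+/// names are illustrative): `y` holds one `u16` per pixel, `uv_half`
+/// holds one interleaved `U, V` pair per 2x2 block:
+///
+/// ```ignore
+/// // y:       Y0 Y1 Y2 Y3 ...   (width samples)
+/// // uv_half: U0 V0 U1 V1 ...   (width samples = width / 2 pairs)
+/// unsafe { p_n_to_rgba_row::<10>(y, uv_half, rgba, width, matrix, false) };
+/// ```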
+#[inline] +#[target_feature(enable = "neon")] +pub(crate) unsafe fn p_n_to_rgba_row( + y: &[u16], + uv_half: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + p_n_to_rgb_or_rgba_row::(y, uv_half, rgba_out, width, matrix, full_range); + } +} + +/// Shared NEON kernel for [`p_n_to_rgb_row`] (`ALPHA = false`, +/// `vst3q_u8`) and [`p_n_to_rgba_row`] (`ALPHA = true`, `vst4q_u8` +/// with constant `0xFF` alpha vector). +/// +/// # Safety +/// +/// 1. **NEON must be available on the current CPU.** +/// 2. `width & 1 == 0`. +/// 3. `y.len() >= width`, `uv_half.len() >= width`, +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. +/// 4. `BITS` must be one of `{10, 12}`. +#[inline] +#[target_feature(enable = "neon")] +pub(crate) unsafe fn p_n_to_rgb_or_rgba_row( + y: &[u16], + uv_half: &[u16], + out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // P016 (BITS=16) routes through `p16_to_rgb_or_rgba_row` (i64 chroma); + // attempting `::<16, _>` here would silently overflow on high chroma. + const { assert!(BITS == 10 || BITS == 12) }; + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert_eq!(width & 1, 0); debug_assert!(y.len() >= width); debug_assert!(uv_half.len() >= width); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); @@ -949,6 +1007,7 @@ pub(crate) unsafe fn p_n_to_rgb_row( let cgv = vdupq_n_s32(coeffs.g_v()); let cbu = vdupq_n_s32(coeffs.b_u()); let cbv = vdupq_n_s32(coeffs.b_v()); + let alpha_u8 = vdupq_n_u8(0xFF); let mut x = 0usize; while x + 16 <= width { @@ -1006,21 +1065,27 @@ pub(crate) unsafe fn p_n_to_rgb_row( vqmovun_s16(vqaddq_s16(y_scaled_hi, r_dup_hi)), ); - let rgb = uint8x16x3_t(r_u8, g_u8, b_u8); - vst3q_u8(rgb_out.as_mut_ptr().add(x * 3), rgb); + if ALPHA { + let rgba = uint8x16x4_t(r_u8, g_u8, b_u8, alpha_u8); + vst4q_u8(out.as_mut_ptr().add(x * 4), rgba); + } else { + let rgb = uint8x16x3_t(r_u8, g_u8, b_u8); + vst3q_u8(out.as_mut_ptr().add(x * 3), rgb); + } x += 16; } if x < width { - scalar::p_n_to_rgb_row::( - &y[x..width], - &uv_half[x..width], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_uv = &uv_half[x..width]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::p_n_to_rgba_row::(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + } else { + scalar::p_n_to_rgb_row::(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + } } } } @@ -1968,6 +2033,8 @@ fn scale_y( /// 2. `width & 1 == 0`. /// 3. `y.len() >= width`, `u_half.len() >= width / 2`, /// `v_half.len() >= width / 2`, `rgb_out.len() >= 3 * width`. +/// +/// Thin wrapper over [`yuv_420p16_to_rgb_or_rgba_row`] with `ALPHA = false`. #[inline] #[target_feature(enable = "neon")] pub(crate) unsafe fn yuv_420p16_to_rgb_row( @@ -1979,11 +2046,64 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_row( matrix: ColorMatrix, full_range: bool, ) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + yuv_420p16_to_rgb_or_rgba_row::(y, u_half, v_half, rgb_out, width, matrix, full_range); + } +} + +/// NEON 16-bit YUV 4:2:0 → packed **8-bit RGBA** (`R, G, B, 0xFF`). +/// +/// Thin wrapper over [`yuv_420p16_to_rgb_or_rgba_row`] with `ALPHA = true`. 
+/// +/// # Safety +/// +/// Same as [`yuv_420p16_to_rgb_row`] but `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "neon")] +pub(crate) unsafe fn yuv_420p16_to_rgba_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + yuv_420p16_to_rgb_or_rgba_row::(y, u_half, v_half, rgba_out, width, matrix, full_range); + } +} + +/// Shared NEON 16-bit YUV 4:2:0 kernel for [`yuv_420p16_to_rgb_row`] +/// (`ALPHA = false`, `vst3q_u8`) and [`yuv_420p16_to_rgba_row`] +/// (`ALPHA = true`, `vst4q_u8` with constant `0xFF` alpha). +/// +/// # Safety +/// +/// 1. NEON must be available on the current CPU. +/// 2. `width & 1 == 0`. +/// 3. `y.len() >= width`, `u_half.len() >= width / 2`, +/// `v_half.len() >= width / 2`, +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. +#[inline] +#[target_feature(enable = "neon")] +pub(crate) unsafe fn yuv_420p16_to_rgb_or_rgba_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert_eq!(width & 1, 0); debug_assert!(y.len() >= width); debug_assert!(u_half.len() >= width / 2); debug_assert!(v_half.len() >= width / 2); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::<16, 8>(full_range); @@ -2002,6 +2122,7 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_row( let cgv = vdupq_n_s32(coeffs.g_v()); let cbu = vdupq_n_s32(coeffs.b_u()); let cbv = vdupq_n_s32(coeffs.b_v()); + let alpha_u8 = vdupq_n_u8(0xFF); let mut x = 0usize; while x + 16 <= width { @@ -2061,23 +2182,30 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_row( vqmovun_s16(vqaddq_s16(y_scaled_hi, b_dup_hi)), ); - vst3q_u8( - rgb_out.as_mut_ptr().add(x * 3), - uint8x16x3_t(r_u8, g_u8, b_u8), - ); + if ALPHA { + vst4q_u8( + out.as_mut_ptr().add(x * 4), + uint8x16x4_t(r_u8, g_u8, b_u8, alpha_u8), + ); + } else { + vst3q_u8(out.as_mut_ptr().add(x * 3), uint8x16x3_t(r_u8, g_u8, b_u8)); + } x += 16; } if x < width { - scalar::yuv_420p16_to_rgb_row( - &y[x..width], - &u_half[x / 2..width / 2], - &v_half[x / 2..width / 2], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_u = &u_half[x / 2..width / 2]; + let tail_v = &v_half[x / 2..width / 2]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::yuv_420p16_to_rgba_row( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); + } else { + scalar::yuv_420p16_to_rgb_row(tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range); + } } } } @@ -2511,6 +2639,8 @@ pub(crate) unsafe fn yuv_444p16_to_rgb_u16_row( /// 1. NEON must be available. /// 2. `width & 1 == 0`. /// 3. `y.len() >= width`, `uv_half.len() >= width`, `rgb_out.len() >= 3 * width`. +/// +/// Thin wrapper over [`p16_to_rgb_or_rgba_row`] with `ALPHA = false`. #[inline] #[target_feature(enable = "neon")] pub(crate) unsafe fn p16_to_rgb_row( @@ -2521,10 +2651,61 @@ pub(crate) unsafe fn p16_to_rgb_row( matrix: ColorMatrix, full_range: bool, ) { + // SAFETY: caller obligations forwarded to the shared impl. 
+ unsafe { + p16_to_rgb_or_rgba_row::(y, uv_half, rgb_out, width, matrix, full_range); + } +} + +/// NEON P016 (semi-planar 4:2:0, full 16-bit) → packed **8-bit RGBA** +/// (`R, G, B, 0xFF`). +/// +/// Thin wrapper over [`p16_to_rgb_or_rgba_row`] with `ALPHA = true`. +/// +/// # Safety +/// +/// Same as [`p16_to_rgb_row`] but `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "neon")] +pub(crate) unsafe fn p16_to_rgba_row( + y: &[u16], + uv_half: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + p16_to_rgb_or_rgba_row::(y, uv_half, rgba_out, width, matrix, full_range); + } +} + +/// Shared NEON P016 kernel for [`p16_to_rgb_row`] (`ALPHA = false`, +/// `vst3q_u8`) and [`p16_to_rgba_row`] (`ALPHA = true`, `vst4q_u8` +/// with constant `0xFF` alpha). +/// +/// # Safety +/// +/// 1. NEON must be available. +/// 2. `width & 1 == 0`. +/// 3. `y.len() >= width`, `uv_half.len() >= width`, +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. +#[inline] +#[target_feature(enable = "neon")] +pub(crate) unsafe fn p16_to_rgb_or_rgba_row( + y: &[u16], + uv_half: &[u16], + out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert_eq!(width & 1, 0); debug_assert!(y.len() >= width); debug_assert!(uv_half.len() >= width); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::<16, 8>(full_range); @@ -2543,6 +2724,7 @@ pub(crate) unsafe fn p16_to_rgb_row( let cgv = vdupq_n_s32(coeffs.g_v()); let cbu = vdupq_n_s32(coeffs.b_u()); let cbv = vdupq_n_s32(coeffs.b_v()); + let alpha_u8 = vdupq_n_u8(0xFF); let mut x = 0usize; while x + 16 <= width { @@ -2601,22 +2783,27 @@ pub(crate) unsafe fn p16_to_rgb_row( vqmovun_s16(vqaddq_s16(y_scaled_hi, b_dup_hi)), ); - vst3q_u8( - rgb_out.as_mut_ptr().add(x * 3), - uint8x16x3_t(r_u8, g_u8, b_u8), - ); + if ALPHA { + vst4q_u8( + out.as_mut_ptr().add(x * 4), + uint8x16x4_t(r_u8, g_u8, b_u8, alpha_u8), + ); + } else { + vst3q_u8(out.as_mut_ptr().add(x * 3), uint8x16x3_t(r_u8, g_u8, b_u8)); + } x += 16; } if x < width { - scalar::p16_to_rgb_row( - &y[x..width], - &uv_half[x..width], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_uv = &uv_half[x..width]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::p16_to_rgba_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + } else { + scalar::p16_to_rgb_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + } } } } From 67062755c7c08eb6988db79ea6bdf5878485bda9 Mon Sep 17 00:00:00 2001 From: uqio <276879906+uqio@users.noreply.github.com> Date: Sun, 26 Apr 2026 23:16:24 +1200 Subject: [PATCH 3/5] update --- src/row/arch/x86_sse41.rs | 336 +++++++++++++++++++++++++++++++++----- 1 file changed, 294 insertions(+), 42 deletions(-) diff --git a/src/row/arch/x86_sse41.rs b/src/row/arch/x86_sse41.rs index 4aaecaf..43400f9 100644 --- a/src/row/arch/x86_sse41.rs +++ b/src/row/arch/x86_sse41.rs @@ -281,6 +281,8 @@ unsafe fn yuv_420_to_rgb_or_rgba_row( /// 2. `width & 1 == 0`. /// 3. `y.len() >= width`, `uv_half.len() >= width`, /// `rgb_out.len() >= 3 * width`. +/// +/// Thin wrapper over [`p_n_to_rgb_or_rgba_row`] with `ALPHA = false`. 
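+///
+/// Only the final store differs between the RGB and RGBA flavors; the
+/// shared kernel's per-16-pixel store dispatch is:
+///
+/// ```ignore
+/// if ALPHA {
+///     write_rgba_16(r_u8, g_u8, b_u8, alpha_u8, out.as_mut_ptr().add(x * 4)); // 64 bytes
+/// } else {
+///     write_rgb_16(r_u8, g_u8, b_u8, out.as_mut_ptr().add(x * 3)); // 48 bytes
+/// }
+/// ```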
#[inline] #[target_feature(enable = "sse4.1")] pub(crate) unsafe fn p_n_to_rgb_row( @@ -291,10 +293,63 @@ pub(crate) unsafe fn p_n_to_rgb_row( matrix: ColorMatrix, full_range: bool, ) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + p_n_to_rgb_or_rgba_row::(y, uv_half, rgb_out, width, matrix, full_range); + } +} + +/// SSE4.1 high-bit-packed semi-planar 4:2:0 → packed **8-bit RGBA** +/// (`R, G, B, 0xFF`). +/// +/// Thin wrapper over [`p_n_to_rgb_or_rgba_row`] with `ALPHA = true`. +/// +/// # Safety +/// +/// Same as [`p_n_to_rgb_row`] but `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "sse4.1")] +pub(crate) unsafe fn p_n_to_rgba_row( + y: &[u16], + uv_half: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + p_n_to_rgb_or_rgba_row::(y, uv_half, rgba_out, width, matrix, full_range); + } +} + +/// Shared SSE4.1 P010/P012 kernel for [`p_n_to_rgb_row`] (`ALPHA = false`, +/// `write_rgb_16`) and [`p_n_to_rgba_row`] (`ALPHA = true`, `write_rgba_16` +/// with constant `0xFF` alpha). +/// +/// # Safety +/// +/// 1. **SSE4.1 must be available on the current CPU.** +/// 2. `width & 1 == 0`. +/// 3. `y.len() >= width`, `uv_half.len() >= width`, +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. +/// 4. `BITS` ∈ `{10, 12}` — P016 has its own kernel family. +#[inline] +#[target_feature(enable = "sse4.1")] +pub(crate) unsafe fn p_n_to_rgb_or_rgba_row( + y: &[u16], + uv_half: &[u16], + out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + const { assert!(BITS == 10 || BITS == 12) }; + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert_eq!(width & 1, 0); debug_assert!(y.len() >= width); debug_assert!(uv_half.len() >= width); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); @@ -318,6 +373,7 @@ pub(crate) unsafe fn p_n_to_rgb_row( let cgv = _mm_set1_epi32(coeffs.g_v()); let cbu = _mm_set1_epi32(coeffs.b_u()); let cbv = _mm_set1_epi32(coeffs.b_v()); + let alpha_u8 = _mm_set1_epi8(-1); let mut x = 0usize; while x + 16 <= width { @@ -369,20 +425,25 @@ pub(crate) unsafe fn p_n_to_rgb_row( let g_u8 = _mm_packus_epi16(g_lo, g_hi); let r_u8 = _mm_packus_epi16(r_lo, r_hi); - write_rgb_16(r_u8, g_u8, b_u8, rgb_out.as_mut_ptr().add(x * 3)); + if ALPHA { + write_rgba_16(r_u8, g_u8, b_u8, alpha_u8, out.as_mut_ptr().add(x * 4)); + } else { + write_rgb_16(r_u8, g_u8, b_u8, out.as_mut_ptr().add(x * 3)); + } x += 16; } if x < width { - scalar::p_n_to_rgb_row::( - &y[x..width], - &uv_half[x..width], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_uv = &uv_half[x..width]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::p_n_to_rgba_row::(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + } else { + scalar::p_n_to_rgb_row::(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + } } } } @@ -554,6 +615,8 @@ unsafe fn deinterleave_uv_u16(ptr: *const u16) -> (__m128i, __m128i) { /// /// Byte‑identical to [`scalar::yuv_420p_n_to_rgb_row::<10>`]. /// +/// Thin wrapper over [`yuv_420p_n_to_rgb_or_rgba_row`] with `ALPHA = false`. +/// /// # Safety /// /// 1. 
**SSE4.1 must be available on the current CPU.** @@ -570,13 +633,71 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_row( width: usize, matrix: ColorMatrix, full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + yuv_420p_n_to_rgb_or_rgba_row::( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } +} + +/// SSE4.1 high-bit-depth YUV 4:2:0 → packed **8-bit RGBA** (`R, G, B, 0xFF`). +/// +/// Thin wrapper over [`yuv_420p_n_to_rgb_or_rgba_row`] with `ALPHA = true`. +/// +/// # Safety +/// +/// Same as [`yuv_420p_n_to_rgb_row`] but `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "sse4.1")] +pub(crate) unsafe fn yuv_420p_n_to_rgba_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + yuv_420p_n_to_rgb_or_rgba_row::( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } +} + +/// Shared SSE4.1 high-bit YUV 4:2:0 kernel for `yuv_420p_n_to_rgb_row` +/// (`ALPHA = false`, `write_rgb_16`) and `yuv_420p_n_to_rgba_row` +/// (`ALPHA = true`, `write_rgba_16` with constant `0xFF` alpha). +/// +/// # Safety +/// +/// 1. **SSE4.1 must be available on the current CPU.** +/// 2. `width & 1 == 0`. +/// 3. `y.len() >= width`, `u_half.len() >= width / 2`, +/// `v_half.len() >= width / 2`, +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. +/// 4. `BITS` ∈ `{9, 10, 12, 14}`. +#[inline] +#[target_feature(enable = "sse4.1")] +pub(crate) unsafe fn yuv_420p_n_to_rgb_or_rgba_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, ) { const { assert!(BITS == 9 || BITS == 10 || BITS == 12 || BITS == 14) }; + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert_eq!(width & 1, 0); debug_assert!(y.len() >= width); debug_assert!(u_half.len() >= width / 2); debug_assert!(v_half.len() >= width / 2); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); @@ -600,6 +721,7 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_row( let cgv = _mm_set1_epi32(coeffs.g_v()); let cbu = _mm_set1_epi32(coeffs.b_u()); let cbv = _mm_set1_epi32(coeffs.b_v()); + let alpha_u8 = _mm_set1_epi8(-1); let mut x = 0usize; while x + 16 <= width { @@ -650,21 +772,30 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_row( let g_u8 = _mm_packus_epi16(g_lo, g_hi); let r_u8 = _mm_packus_epi16(r_lo, r_hi); - write_rgb_16(r_u8, g_u8, b_u8, rgb_out.as_mut_ptr().add(x * 3)); + if ALPHA { + write_rgba_16(r_u8, g_u8, b_u8, alpha_u8, out.as_mut_ptr().add(x * 4)); + } else { + write_rgb_16(r_u8, g_u8, b_u8, out.as_mut_ptr().add(x * 3)); + } x += 16; } if x < width { - scalar::yuv_420p_n_to_rgb_row::( - &y[x..width], - &u_half[x / 2..width / 2], - &v_half[x / 2..width / 2], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_u = &u_half[x / 2..width / 2]; + let tail_v = &v_half[x / 2..width / 2]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::yuv_420p_n_to_rgba_row::( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); + } else { + scalar::yuv_420p_n_to_rgb_row::( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); + } } } } @@ -2146,6 +2277,8 @@ fn 
scale_y16_i64(y_minus_off: __m128i, y_scale_v: __m128i, rnd_v: __m128i) -> __ /// 2. `width & 1 == 0`. /// 3. `y.len() >= width`, `u_half.len() >= width / 2`, /// `v_half.len() >= width / 2`, `rgb_out.len() >= 3 * width`. +/// +/// Thin wrapper over [`yuv_420p16_to_rgb_or_rgba_row`] with `ALPHA = false`. #[inline] #[target_feature(enable = "sse4.1")] pub(crate) unsafe fn yuv_420p16_to_rgb_row( @@ -2157,11 +2290,64 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_row( matrix: ColorMatrix, full_range: bool, ) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + yuv_420p16_to_rgb_or_rgba_row::(y, u_half, v_half, rgb_out, width, matrix, full_range); + } +} + +/// SSE4.1 16-bit YUV 4:2:0 → packed **8-bit RGBA** (`R, G, B, 0xFF`). +/// +/// Thin wrapper over [`yuv_420p16_to_rgb_or_rgba_row`] with `ALPHA = true`. +/// +/// # Safety +/// +/// Same as [`yuv_420p16_to_rgb_row`] but `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "sse4.1")] +pub(crate) unsafe fn yuv_420p16_to_rgba_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + yuv_420p16_to_rgb_or_rgba_row::(y, u_half, v_half, rgba_out, width, matrix, full_range); + } +} + +/// Shared SSE4.1 16-bit YUV 4:2:0 kernel for [`yuv_420p16_to_rgb_row`] +/// (`ALPHA = false`, `write_rgb_16`) and [`yuv_420p16_to_rgba_row`] +/// (`ALPHA = true`, `write_rgba_16` with constant `0xFF` alpha). +/// +/// # Safety +/// +/// 1. **SSE4.1 must be available on the current CPU.** +/// 2. `width & 1 == 0`. +/// 3. `y.len() >= width`, `u_half.len() >= width / 2`, +/// `v_half.len() >= width / 2`, +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. 
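+///
+/// Both flavors are pinned byte-for-byte against the scalar reference;
+/// in sketch form (`want`/`got` are illustrative, preallocated to
+/// `4 * width`):
+///
+/// ```ignore
+/// scalar::yuv_420p16_to_rgba_row(y, u, v, &mut want, width, matrix, full);
+/// unsafe { yuv_420p16_to_rgba_row(y, u, v, &mut got, width, matrix, full) };
+/// assert_eq!(want, got);
+/// ```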
+#[inline] +#[target_feature(enable = "sse4.1")] +pub(crate) unsafe fn yuv_420p16_to_rgb_or_rgba_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert_eq!(width & 1, 0); debug_assert!(y.len() >= width); debug_assert!(u_half.len() >= width / 2); debug_assert!(v_half.len() >= width / 2); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::<16, 8>(full_range); @@ -2180,6 +2366,7 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_row( let cgv = _mm_set1_epi32(coeffs.g_v()); let cbu = _mm_set1_epi32(coeffs.b_u()); let cbv = _mm_set1_epi32(coeffs.b_v()); + let alpha_u8 = _mm_set1_epi8(-1); let mut x = 0usize; while x + 16 <= width { @@ -2228,20 +2415,27 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_row( let g_u8 = _mm_packus_epi16(g_lo, g_hi); let b_u8 = _mm_packus_epi16(b_lo, b_hi); - write_rgb_16(r_u8, g_u8, b_u8, rgb_out.as_mut_ptr().add(x * 3)); + if ALPHA { + write_rgba_16(r_u8, g_u8, b_u8, alpha_u8, out.as_mut_ptr().add(x * 4)); + } else { + write_rgb_16(r_u8, g_u8, b_u8, out.as_mut_ptr().add(x * 3)); + } x += 16; } if x < width { - scalar::yuv_420p16_to_rgb_row( - &y[x..width], - &u_half[x / 2..width / 2], - &v_half[x / 2..width / 2], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_u = &u_half[x / 2..width / 2]; + let tail_v = &v_half[x / 2..width / 2]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::yuv_420p16_to_rgba_row( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); + } else { + scalar::yuv_420p16_to_rgb_row(tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range); + } } } } @@ -2419,6 +2613,8 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_u16_row( /// 1. **SSE4.1 must be available on the current CPU.** /// 2. `width & 1 == 0`. /// 3. `y.len() >= width`, `uv_half.len() >= width`, `rgb_out.len() >= 3 * width`. +/// +/// Thin wrapper over [`p16_to_rgb_or_rgba_row`] with `ALPHA = false`. #[inline] #[target_feature(enable = "sse4.1")] pub(crate) unsafe fn p16_to_rgb_row( @@ -2429,10 +2625,60 @@ pub(crate) unsafe fn p16_to_rgb_row( matrix: ColorMatrix, full_range: bool, ) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + p16_to_rgb_or_rgba_row::(y, uv_half, rgb_out, width, matrix, full_range); + } +} + +/// SSE4.1 P016 → packed **8-bit RGBA** (`R, G, B, 0xFF`). +/// +/// Thin wrapper over [`p16_to_rgb_or_rgba_row`] with `ALPHA = true`. +/// +/// # Safety +/// +/// Same as [`p16_to_rgb_row`] but `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "sse4.1")] +pub(crate) unsafe fn p16_to_rgba_row( + y: &[u16], + uv_half: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + p16_to_rgb_or_rgba_row::(y, uv_half, rgba_out, width, matrix, full_range); + } +} + +/// Shared SSE4.1 P016 kernel for [`p16_to_rgb_row`] (`ALPHA = false`, +/// `write_rgb_16`) and [`p16_to_rgba_row`] (`ALPHA = true`, +/// `write_rgba_16` with constant `0xFF` alpha). +/// +/// # Safety +/// +/// 1. **SSE4.1 must be available on the current CPU.** +/// 2. `width & 1 == 0`. +/// 3. 
`y.len() >= width`, `uv_half.len() >= width`, +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. +#[inline] +#[target_feature(enable = "sse4.1")] +pub(crate) unsafe fn p16_to_rgb_or_rgba_row( + y: &[u16], + uv_half: &[u16], + out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert_eq!(width & 1, 0); debug_assert!(y.len() >= width); debug_assert!(uv_half.len() >= width); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::<16, 8>(full_range); @@ -2450,6 +2696,7 @@ pub(crate) unsafe fn p16_to_rgb_row( let cgv = _mm_set1_epi32(coeffs.g_v()); let cbu = _mm_set1_epi32(coeffs.b_u()); let cbv = _mm_set1_epi32(coeffs.b_v()); + let alpha_u8 = _mm_set1_epi8(-1); let mut x = 0usize; while x + 16 <= width { @@ -2495,19 +2742,24 @@ pub(crate) unsafe fn p16_to_rgb_row( let g_u8 = _mm_packus_epi16(g_lo, g_hi); let b_u8 = _mm_packus_epi16(b_lo, b_hi); - write_rgb_16(r_u8, g_u8, b_u8, rgb_out.as_mut_ptr().add(x * 3)); + if ALPHA { + write_rgba_16(r_u8, g_u8, b_u8, alpha_u8, out.as_mut_ptr().add(x * 4)); + } else { + write_rgb_16(r_u8, g_u8, b_u8, out.as_mut_ptr().add(x * 3)); + } x += 16; } if x < width { - scalar::p16_to_rgb_row( - &y[x..width], - &uv_half[x..width], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_uv = &uv_half[x..width]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::p16_to_rgba_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + } else { + scalar::p16_to_rgb_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + } } } } From 88a0de80a8d0a94bb8af2c9ece6803c81034028a Mon Sep 17 00:00:00 2001 From: uqio <276879906+uqio@users.noreply.github.com> Date: Sun, 26 Apr 2026 23:56:28 +1200 Subject: [PATCH 4/5] update --- src/row/arch/neon/tests.rs | 179 ++++++++++++ src/row/arch/wasm_simd128.rs | 300 ++++++++++++++++--- src/row/arch/wasm_simd128/tests.rs | 177 +++++++++++ src/row/arch/x86_avx2.rs | 301 ++++++++++++++++--- src/row/arch/x86_avx2/tests.rs | 173 +++++++++++ src/row/arch/x86_avx512.rs | 300 ++++++++++++++++--- src/row/arch/x86_avx512/tests.rs | 173 +++++++++++ src/row/arch/x86_sse41/tests.rs | 173 +++++++++++ src/row/mod.rs | 453 ++++++++++++++++++++++++++--- 9 files changed, 2066 insertions(+), 163 deletions(-) diff --git a/src/row/arch/neon/tests.rs b/src/row/arch/neon/tests.rs index a792c0d..41dd909 100644 --- a/src/row/arch/neon/tests.rs +++ b/src/row/arch/neon/tests.rs @@ -1649,6 +1649,185 @@ fn neon_p14_matches_scalar_tail_widths() { } } +// ---- High-bit 4:2:0 RGBA equivalence (Ship 8 Tranche 5a) ---------- +// +// RGBA wrappers share the math of their RGB siblings — only the store +// (and tail dispatch) branches on `ALPHA`. These tests pin that the +// SIMD RGBA path produces byte-identical output to the scalar RGBA +// reference, which already encodes the alpha = 0xFF contract. 
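+//
+// Restated as a sketch: for any converted row,
+//
+//     rgba.chunks_exact(4).all(|px| px[3] == 0xFF)
+//
+// holds, and px[..3] matches the RGB row byte-for-byte.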
+
+fn check_planar_u8_neon_rgba_equivalence_n<const BITS: usize>(
+    width: usize,
+    matrix: ColorMatrix,
+    full_range: bool,
+) {
+    let y = planar_n_plane::<BITS>(width, 37);
+    let u = planar_n_plane::<BITS>(width / 2, 53);
+    let v = planar_n_plane::<BITS>(width / 2, 71);
+    let mut rgba_scalar = std::vec![0u8; width * 4];
+    let mut rgba_neon = std::vec![0u8; width * 4];
+    scalar::yuv_420p_n_to_rgba_row::<BITS>(&y, &u, &v, &mut rgba_scalar, width, matrix, full_range);
+    unsafe {
+        yuv_420p_n_to_rgba_row::<BITS>(&y, &u, &v, &mut rgba_neon, width, matrix, full_range);
+    }
+    assert_eq!(
+        rgba_scalar, rgba_neon,
+        "NEON yuv_420p_n<{BITS}>→RGBA u8 diverges (width={width}, matrix={matrix:?}, full_range={full_range})"
+    );
+}
+
+fn check_pn_u8_neon_rgba_equivalence_n<const BITS: usize>(
+    width: usize,
+    matrix: ColorMatrix,
+    full_range: bool,
+) {
+    let y = p_n_packed_plane::<BITS>(width, 37);
+    let u = p_n_packed_plane::<BITS>(width / 2, 53);
+    let v = p_n_packed_plane::<BITS>(width / 2, 71);
+    let uv = p010_uv_interleave(&u, &v);
+    let mut rgba_scalar = std::vec![0u8; width * 4];
+    let mut rgba_neon = std::vec![0u8; width * 4];
+    scalar::p_n_to_rgba_row::<BITS>(&y, &uv, &mut rgba_scalar, width, matrix, full_range);
+    unsafe {
+        p_n_to_rgba_row::<BITS>(&y, &uv, &mut rgba_neon, width, matrix, full_range);
+    }
+    assert_eq!(
+        rgba_scalar, rgba_neon,
+        "NEON Pn<{BITS}>→RGBA u8 diverges (width={width}, matrix={matrix:?}, full_range={full_range})"
+    );
+}
+
+#[test]
+#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")]
+fn neon_yuv420p_n_rgba_matches_scalar_all_bits() {
+    for m in [
+        ColorMatrix::Bt601,
+        ColorMatrix::Bt709,
+        ColorMatrix::Bt2020Ncl,
+        ColorMatrix::Smpte240m,
+        ColorMatrix::Fcc,
+        ColorMatrix::YCgCo,
+    ] {
+        for full in [true, false] {
+            check_planar_u8_neon_rgba_equivalence_n::<9>(16, m, full);
+            check_planar_u8_neon_rgba_equivalence_n::<10>(16, m, full);
+            check_planar_u8_neon_rgba_equivalence_n::<12>(16, m, full);
+            check_planar_u8_neon_rgba_equivalence_n::<14>(16, m, full);
+        }
+    }
+}
+
+#[test]
+#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")]
+fn neon_yuv420p_n_rgba_matches_scalar_tail_and_1920() {
+    for w in [18usize, 30, 34, 1920, 1922] {
+        check_planar_u8_neon_rgba_equivalence_n::<9>(w, ColorMatrix::Bt601, false);
+        check_planar_u8_neon_rgba_equivalence_n::<10>(w, ColorMatrix::Bt709, true);
+        check_planar_u8_neon_rgba_equivalence_n::<12>(w, ColorMatrix::Bt2020Ncl, false);
+        check_planar_u8_neon_rgba_equivalence_n::<14>(w, ColorMatrix::YCgCo, true);
+    }
+}
+
+#[test]
+#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")]
+fn neon_pn_rgba_matches_scalar_all_bits() {
+    for m in [
+        ColorMatrix::Bt601,
+        ColorMatrix::Bt709,
+        ColorMatrix::Bt2020Ncl,
+        ColorMatrix::Smpte240m,
+        ColorMatrix::Fcc,
+        ColorMatrix::YCgCo,
+    ] {
+        for full in [true, false] {
+            check_pn_u8_neon_rgba_equivalence_n::<10>(16, m, full);
+            check_pn_u8_neon_rgba_equivalence_n::<12>(16, m, full);
+        }
+    }
+}
+
+#[test]
+#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")]
+fn neon_pn_rgba_matches_scalar_tail_and_1920() {
+    for w in [18usize, 30, 34, 1920, 1922] {
+        check_pn_u8_neon_rgba_equivalence_n::<10>(w, ColorMatrix::Bt601, false);
+        check_pn_u8_neon_rgba_equivalence_n::<12>(w, ColorMatrix::Bt709, true);
+    }
+}
+
+fn check_yuv420p16_u8_neon_rgba_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) {
+    let y = p16_plane_neon(width, 37);
+    let u = p16_plane_neon(width / 2, 53);
+    let v = p16_plane_neon(width / 2, 71);
+    let mut rgba_scalar = std::vec![0u8; width * 4];
+    let mut rgba_neon = std::vec![0u8; width * 4];
+ scalar::yuv_420p16_to_rgba_row(&y, &u, &v, &mut rgba_scalar, width, matrix, full_range); + unsafe { + yuv_420p16_to_rgba_row(&y, &u, &v, &mut rgba_neon, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_neon, + "NEON yuv_420p16→RGBA u8 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +fn check_p016_u8_neon_rgba_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { + let y = p16_plane_neon(width, 37); + let u = p16_plane_neon(width / 2, 53); + let v = p16_plane_neon(width / 2, 71); + let uv = p010_uv_interleave(&u, &v); + let mut rgba_scalar = std::vec![0u8; width * 4]; + let mut rgba_neon = std::vec![0u8; width * 4]; + scalar::p16_to_rgba_row(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + unsafe { + p16_to_rgba_row(&y, &uv, &mut rgba_neon, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_neon, + "NEON P016→RGBA u8 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +#[test] +#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] +fn neon_yuv420p16_rgba_matches_scalar_all_matrices() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_yuv420p16_u8_neon_rgba_equivalence(16, m, full); + } + } + for w in [18usize, 30, 34, 1920, 1922] { + check_yuv420p16_u8_neon_rgba_equivalence(w, ColorMatrix::Bt709, false); + } +} + +#[test] +#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] +fn neon_p016_rgba_matches_scalar_all_matrices() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_p016_u8_neon_rgba_equivalence(16, m, full); + } + } + for w in [18usize, 30, 34, 1920, 1922] { + check_p016_u8_neon_rgba_equivalence(w, ColorMatrix::Bt709, false); + } +} + // ---- Yuv444p_n NEON equivalence (10/12/14) -------------------------- fn check_yuv444p_n_u8_neon_equivalence( diff --git a/src/row/arch/wasm_simd128.rs b/src/row/arch/wasm_simd128.rs index 29e3f5a..e9d02c4 100644 --- a/src/row/arch/wasm_simd128.rs +++ b/src/row/arch/wasm_simd128.rs @@ -270,6 +270,8 @@ unsafe fn yuv_420_to_rgb_or_rgba_row( /// 2. `width & 1 == 0`. /// 3. `y.len() >= width`, `u_half.len() >= width / 2`, /// `v_half.len() >= width / 2`, `rgb_out.len() >= 3 * width`. +/// +/// Thin wrapper over [`yuv_420p_n_to_rgb_or_rgba_row`] with `ALPHA = false`. #[inline] #[target_feature(enable = "simd128")] pub(crate) unsafe fn yuv_420p_n_to_rgb_row( @@ -280,13 +282,64 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_row( width: usize, matrix: ColorMatrix, full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + yuv_420p_n_to_rgb_or_rgba_row::( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } +} + +/// wasm simd128 high-bit-depth YUV 4:2:0 → packed **8-bit RGBA** (`R, G, B, 0xFF`). +/// +/// Thin wrapper over [`yuv_420p_n_to_rgb_or_rgba_row`] with `ALPHA = true`. +#[inline] +#[target_feature(enable = "simd128")] +pub(crate) unsafe fn yuv_420p_n_to_rgba_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. 
+ unsafe { + yuv_420p_n_to_rgb_or_rgba_row::( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } +} + +/// Shared wasm simd128 high-bit YUV 4:2:0 kernel. `ALPHA = false` uses +/// `write_rgb_16`; `ALPHA = true` uses `write_rgba_16` with constant +/// `0xFF` alpha. +/// +/// # Safety +/// +/// 1. **simd128 enabled at compile time.** +/// 2. `width & 1 == 0`. 3. slices long enough + +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. 4. `BITS` ∈ `{9, 10, 12, 14}`. +#[inline] +#[target_feature(enable = "simd128")] +pub(crate) unsafe fn yuv_420p_n_to_rgb_or_rgba_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, ) { const { assert!(BITS == 9 || BITS == 10 || BITS == 12 || BITS == 14) }; + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert_eq!(width & 1, 0); debug_assert!(y.len() >= width); debug_assert!(u_half.len() >= width / 2); debug_assert!(v_half.len() >= width / 2); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); @@ -308,6 +361,7 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_row( let cgv = i32x4_splat(coeffs.g_v()); let cbu = i32x4_splat(coeffs.b_u()); let cbv = i32x4_splat(coeffs.b_v()); + let alpha_u8 = u8x16_splat(0xFF); let mut x = 0usize; while x + 16 <= width { @@ -356,21 +410,30 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_row( let g_u8 = u8x16_narrow_i16x8(g_lo, g_hi); let r_u8 = u8x16_narrow_i16x8(r_lo, r_hi); - write_rgb_16(r_u8, g_u8, b_u8, rgb_out.as_mut_ptr().add(x * 3)); + if ALPHA { + write_rgba_16(r_u8, g_u8, b_u8, alpha_u8, out.as_mut_ptr().add(x * 4)); + } else { + write_rgb_16(r_u8, g_u8, b_u8, out.as_mut_ptr().add(x * 3)); + } x += 16; } if x < width { - scalar::yuv_420p_n_to_rgb_row::( - &y[x..width], - &u_half[x / 2..width / 2], - &v_half[x / 2..width / 2], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_u = &u_half[x / 2..width / 2]; + let tail_v = &v_half[x / 2..width / 2]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::yuv_420p_n_to_rgba_row::( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); + } else { + scalar::yuv_420p_n_to_rgb_row::( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); + } } } } @@ -1065,6 +1128,8 @@ unsafe fn write_rgb_u16_8(r: v128, g: v128, b: v128, ptr: *mut u16) { /// 2. `width & 1 == 0`. /// 3. `y.len() >= width`, `uv_half.len() >= width`, /// `rgb_out.len() >= 3 * width`. +/// +/// Thin wrapper over [`p_n_to_rgb_or_rgba_row`] with `ALPHA = false`. #[inline] #[target_feature(enable = "simd128")] pub(crate) unsafe fn p_n_to_rgb_row( @@ -1075,10 +1140,57 @@ pub(crate) unsafe fn p_n_to_rgb_row( matrix: ColorMatrix, full_range: bool, ) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + p_n_to_rgb_or_rgba_row::(y, uv_half, rgb_out, width, matrix, full_range); + } +} + +/// wasm simd128 high-bit-packed semi-planar 4:2:0 → packed **8-bit +/// RGBA** (`R, G, B, 0xFF`). +/// +/// Thin wrapper over [`p_n_to_rgb_or_rgba_row`] with `ALPHA = true`. +#[inline] +#[target_feature(enable = "simd128")] +pub(crate) unsafe fn p_n_to_rgba_row( + y: &[u16], + uv_half: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. 
+ unsafe { + p_n_to_rgb_or_rgba_row::(y, uv_half, rgba_out, width, matrix, full_range); + } +} + +/// Shared wasm simd128 P010/P012 kernel. `ALPHA = false` uses +/// `write_rgb_16`; `ALPHA = true` uses `write_rgba_16` with constant +/// `0xFF` alpha. +/// +/// # Safety +/// +/// 1. **simd128 enabled at compile time.** +/// 2. `width & 1 == 0`. 3. slices long enough + +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. 4. `BITS` ∈ `{10, 12}`. +#[inline] +#[target_feature(enable = "simd128")] +pub(crate) unsafe fn p_n_to_rgb_or_rgba_row( + y: &[u16], + uv_half: &[u16], + out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + const { assert!(BITS == 10 || BITS == 12) }; + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert_eq!(width & 1, 0); debug_assert!(y.len() >= width); debug_assert!(uv_half.len() >= width); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); @@ -1099,6 +1211,7 @@ pub(crate) unsafe fn p_n_to_rgb_row( let cgv = i32x4_splat(coeffs.g_v()); let cbu = i32x4_splat(coeffs.b_u()); let cbv = i32x4_splat(coeffs.b_v()); + let alpha_u8 = u8x16_splat(0xFF); // High-bit-packed samples: shift right by `16 - BITS`. let shr = (16 - BITS) as u32; @@ -1149,20 +1262,25 @@ pub(crate) unsafe fn p_n_to_rgb_row( let g_u8 = u8x16_narrow_i16x8(g_lo, g_hi); let r_u8 = u8x16_narrow_i16x8(r_lo, r_hi); - write_rgb_16(r_u8, g_u8, b_u8, rgb_out.as_mut_ptr().add(x * 3)); + if ALPHA { + write_rgba_16(r_u8, g_u8, b_u8, alpha_u8, out.as_mut_ptr().add(x * 4)); + } else { + write_rgb_16(r_u8, g_u8, b_u8, out.as_mut_ptr().add(x * 3)); + } x += 16; } if x < width { - scalar::p_n_to_rgb_row::( - &y[x..width], - &uv_half[x..width], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_uv = &uv_half[x..width]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::p_n_to_rgba_row::(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + } else { + scalar::p_n_to_rgb_row::(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + } } } } @@ -2359,6 +2477,8 @@ fn scale_y_i32x4_i64_wasm(y_minus_off: v128, y_scale_i64: v128, rnd_i64: v128) - /// 2. `width & 1 == 0`. /// 3. `y.len() >= width`, `u_half.len() >= width / 2`, /// `v_half.len() >= width / 2`, `rgb_out.len() >= 3 * width`. +/// +/// Thin wrapper over [`yuv_420p16_to_rgb_or_rgba_row`] with `ALPHA = false`. #[inline] #[target_feature(enable = "simd128")] pub(crate) unsafe fn yuv_420p16_to_rgb_row( @@ -2370,11 +2490,52 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_row( matrix: ColorMatrix, full_range: bool, ) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + yuv_420p16_to_rgb_or_rgba_row::(y, u_half, v_half, rgb_out, width, matrix, full_range); + } +} + +/// wasm simd128 16-bit YUV 4:2:0 → packed **8-bit RGBA** (`R, G, B, 0xFF`). +/// +/// Thin wrapper over [`yuv_420p16_to_rgb_or_rgba_row`] with `ALPHA = true`. +#[inline] +#[target_feature(enable = "simd128")] +pub(crate) unsafe fn yuv_420p16_to_rgba_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. 
+ unsafe { + yuv_420p16_to_rgb_or_rgba_row::(y, u_half, v_half, rgba_out, width, matrix, full_range); + } +} + +/// Shared wasm simd128 16-bit YUV 4:2:0 kernel. `ALPHA = false` uses +/// `write_rgb_16`; `ALPHA = true` uses `write_rgba_16` with constant +/// `0xFF` alpha. +#[inline] +#[target_feature(enable = "simd128")] +pub(crate) unsafe fn yuv_420p16_to_rgb_or_rgba_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert_eq!(width & 1, 0); debug_assert!(y.len() >= width); debug_assert!(u_half.len() >= width / 2); debug_assert!(v_half.len() >= width / 2); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::<16, 8>(full_range); @@ -2392,6 +2553,7 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_row( let cgv = i32x4_splat(coeffs.g_v()); let cbu = i32x4_splat(coeffs.b_u()); let cbv = i32x4_splat(coeffs.b_v()); + let alpha_u8 = u8x16_splat(0xFF); let mut x = 0usize; while x + 16 <= width { @@ -2438,20 +2600,27 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_row( let g_u8 = u8x16_narrow_i16x8(g_lo, g_hi); let b_u8 = u8x16_narrow_i16x8(b_lo, b_hi); - write_rgb_16(r_u8, g_u8, b_u8, rgb_out.as_mut_ptr().add(x * 3)); + if ALPHA { + write_rgba_16(r_u8, g_u8, b_u8, alpha_u8, out.as_mut_ptr().add(x * 4)); + } else { + write_rgb_16(r_u8, g_u8, b_u8, out.as_mut_ptr().add(x * 3)); + } x += 16; } if x < width { - scalar::yuv_420p16_to_rgb_row( - &y[x..width], - &u_half[x / 2..width / 2], - &v_half[x / 2..width / 2], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_u = &u_half[x / 2..width / 2]; + let tail_v = &v_half[x / 2..width / 2]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::yuv_420p16_to_rgba_row( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); + } else { + scalar::yuv_420p16_to_rgb_row(tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range); + } } } } @@ -2595,6 +2764,8 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_u16_row( /// 1. **simd128 must be enabled at compile time.** /// 2. `width & 1 == 0`. /// 3. `y.len() >= width`, `uv_half.len() >= width`, `rgb_out.len() >= 3 * width`. +/// +/// Thin wrapper over [`p16_to_rgb_or_rgba_row`] with `ALPHA = false`. #[inline] #[target_feature(enable = "simd128")] pub(crate) unsafe fn p16_to_rgb_row( @@ -2605,10 +2776,49 @@ pub(crate) unsafe fn p16_to_rgb_row( matrix: ColorMatrix, full_range: bool, ) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + p16_to_rgb_or_rgba_row::(y, uv_half, rgb_out, width, matrix, full_range); + } +} + +/// wasm simd128 P016 → packed **8-bit RGBA** (`R, G, B, 0xFF`). +/// +/// Thin wrapper over [`p16_to_rgb_or_rgba_row`] with `ALPHA = true`. +#[inline] +#[target_feature(enable = "simd128")] +pub(crate) unsafe fn p16_to_rgba_row( + y: &[u16], + uv_half: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + p16_to_rgb_or_rgba_row::(y, uv_half, rgba_out, width, matrix, full_range); + } +} + +/// Shared wasm simd128 P016 kernel. `ALPHA = false` uses +/// `write_rgb_16`; `ALPHA = true` uses `write_rgba_16` with constant +/// `0xFF` alpha. 
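+///
+/// # Safety
+///
+/// Same contract as [`p16_to_rgb_row`] (simd128 enabled at compile
+/// time, `width & 1 == 0`, `y.len() >= width`, `uv_half.len() >= width`),
+/// with the output bound generalized to
+/// `out.len() >= width * if ALPHA { 4 } else { 3 }`.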
+#[inline] +#[target_feature(enable = "simd128")] +pub(crate) unsafe fn p16_to_rgb_or_rgba_row( + y: &[u16], + uv_half: &[u16], + out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert_eq!(width & 1, 0); debug_assert!(y.len() >= width); debug_assert!(uv_half.len() >= width); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::<16, 8>(full_range); @@ -2626,6 +2836,7 @@ pub(crate) unsafe fn p16_to_rgb_row( let cgv = i32x4_splat(coeffs.g_v()); let cbu = i32x4_splat(coeffs.b_u()); let cbv = i32x4_splat(coeffs.b_v()); + let alpha_u8 = u8x16_splat(0xFF); let mut x = 0usize; while x + 16 <= width { @@ -2671,19 +2882,24 @@ pub(crate) unsafe fn p16_to_rgb_row( let g_u8 = u8x16_narrow_i16x8(g_lo, g_hi); let b_u8 = u8x16_narrow_i16x8(b_lo, b_hi); - write_rgb_16(r_u8, g_u8, b_u8, rgb_out.as_mut_ptr().add(x * 3)); + if ALPHA { + write_rgba_16(r_u8, g_u8, b_u8, alpha_u8, out.as_mut_ptr().add(x * 4)); + } else { + write_rgb_16(r_u8, g_u8, b_u8, out.as_mut_ptr().add(x * 3)); + } x += 16; } if x < width { - scalar::p16_to_rgb_row( - &y[x..width], - &uv_half[x..width], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_uv = &uv_half[x..width]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::p16_to_rgba_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + } else { + scalar::p16_to_rgb_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + } } } } diff --git a/src/row/arch/wasm_simd128/tests.rs b/src/row/arch/wasm_simd128/tests.rs index 0e05462..d394f33 100644 --- a/src/row/arch/wasm_simd128/tests.rs +++ b/src/row/arch/wasm_simd128/tests.rs @@ -1387,6 +1387,183 @@ fn simd128_yuv420p16_u16_matches_scalar_tight_widths() { } } +// ---- High-bit 4:2:0 RGBA equivalence (Ship 8 Tranche 5a) ---------- +// +// RGBA wrappers share the math of their RGB siblings — only the store +// (and tail dispatch) branches on `ALPHA`. These tests pin that the +// SIMD RGBA path produces byte-identical output to the scalar RGBA +// reference. 
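+//
+// Width coverage, sketched for these 16-pixels-per-iteration kernels:
+//
+//     16          one full SIMD block, no scalar tail
+//     18, 30, 34  full block(s) plus scalar tails of 2, 14, and 2 px
+//     1920, 1922  a long row without / with a 2-px tail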
+ +fn check_planar_u8_simd128_rgba_equivalence_n( + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let y = planar_n_plane::(width, 37); + let u = planar_n_plane::(width / 2, 53); + let v = planar_n_plane::(width / 2, 71); + let mut rgba_scalar = std::vec![0u8; width * 4]; + let mut rgba_simd = std::vec![0u8; width * 4]; + scalar::yuv_420p_n_to_rgba_row::(&y, &u, &v, &mut rgba_scalar, width, matrix, full_range); + unsafe { + yuv_420p_n_to_rgba_row::(&y, &u, &v, &mut rgba_simd, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_simd, + "simd128 yuv_420p_n<{BITS}>→RGBA u8 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +fn check_pn_u8_simd128_rgba_equivalence_n( + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let y = p_n_packed_plane::(width, 37); + let u = p_n_packed_plane::(width / 2, 53); + let v = p_n_packed_plane::(width / 2, 71); + let uv = p010_uv_interleave(&u, &v); + let mut rgba_scalar = std::vec![0u8; width * 4]; + let mut rgba_simd = std::vec![0u8; width * 4]; + scalar::p_n_to_rgba_row::(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + unsafe { + p_n_to_rgba_row::(&y, &uv, &mut rgba_simd, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_simd, + "simd128 Pn<{BITS}>→RGBA u8 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +fn check_yuv420p16_u8_simd128_rgba_equivalence( + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let y = p16_plane_wasm(width, 37); + let u = p16_plane_wasm(width / 2, 53); + let v = p16_plane_wasm(width / 2, 71); + let mut rgba_scalar = std::vec![0u8; width * 4]; + let mut rgba_simd = std::vec![0u8; width * 4]; + scalar::yuv_420p16_to_rgba_row(&y, &u, &v, &mut rgba_scalar, width, matrix, full_range); + unsafe { + yuv_420p16_to_rgba_row(&y, &u, &v, &mut rgba_simd, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_simd, + "simd128 yuv_420p16→RGBA u8 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +fn check_p16_u8_simd128_rgba_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { + let y = p16_plane_wasm(width, 37); + let u = p16_plane_wasm(width / 2, 53); + let v = p16_plane_wasm(width / 2, 71); + let uv = p010_uv_interleave(&u, &v); + let mut rgba_scalar = std::vec![0u8; width * 4]; + let mut rgba_simd = std::vec![0u8; width * 4]; + scalar::p16_to_rgba_row(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + unsafe { + p16_to_rgba_row(&y, &uv, &mut rgba_simd, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_simd, + "simd128 P016→RGBA u8 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +#[test] +fn simd128_yuv420p_n_rgba_matches_scalar_all_bits() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_planar_u8_simd128_rgba_equivalence_n::<9>(16, m, full); + check_planar_u8_simd128_rgba_equivalence_n::<10>(16, m, full); + check_planar_u8_simd128_rgba_equivalence_n::<12>(16, m, full); + check_planar_u8_simd128_rgba_equivalence_n::<14>(16, m, full); + } + } +} + +#[test] +fn simd128_yuv420p_n_rgba_matches_scalar_tail_and_1920() { + for w in [18usize, 30, 34, 1920, 1922] { + check_planar_u8_simd128_rgba_equivalence_n::<9>(w, ColorMatrix::Bt601, false); + check_planar_u8_simd128_rgba_equivalence_n::<10>(w, ColorMatrix::Bt709, true); + 
check_planar_u8_simd128_rgba_equivalence_n::<12>(w, ColorMatrix::Bt2020Ncl, false); + check_planar_u8_simd128_rgba_equivalence_n::<14>(w, ColorMatrix::YCgCo, true); + } +} + +#[test] +fn simd128_pn_rgba_matches_scalar_all_bits() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_pn_u8_simd128_rgba_equivalence_n::<10>(16, m, full); + check_pn_u8_simd128_rgba_equivalence_n::<12>(16, m, full); + } + } +} + +#[test] +fn simd128_pn_rgba_matches_scalar_tail_and_1920() { + for w in [18usize, 30, 34, 1920, 1922] { + check_pn_u8_simd128_rgba_equivalence_n::<10>(w, ColorMatrix::Bt601, false); + check_pn_u8_simd128_rgba_equivalence_n::<12>(w, ColorMatrix::Bt709, true); + } +} + +#[test] +fn simd128_yuv420p16_rgba_matches_scalar_all_matrices() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_yuv420p16_u8_simd128_rgba_equivalence(16, m, full); + } + } + for w in [18usize, 30, 34, 1920, 1922] { + check_yuv420p16_u8_simd128_rgba_equivalence(w, ColorMatrix::Bt709, false); + } +} + +#[test] +fn simd128_p016_rgba_matches_scalar_all_matrices() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_p16_u8_simd128_rgba_equivalence(16, m, full); + } + } + for w in [18usize, 30, 34, 1920, 1922] { + check_p16_u8_simd128_rgba_equivalence(w, ColorMatrix::Bt709, false); + } +} + // ---- Pn 4:4:4 (P410 / P412 / P416) wasm simd128 equivalence --------- fn high_bit_plane_wasm(n: usize, seed: usize) -> std::vec::Vec { diff --git a/src/row/arch/x86_avx2.rs b/src/row/arch/x86_avx2.rs index 2882928..d0f72fd 100644 --- a/src/row/arch/x86_avx2.rs +++ b/src/row/arch/x86_avx2.rs @@ -292,6 +292,8 @@ unsafe fn yuv_420_to_rgb_or_rgba_row( /// 2. `width & 1 == 0`. /// 3. `y.len() >= width`, `u_half.len() >= width / 2`, /// `v_half.len() >= width / 2`, `rgb_out.len() >= 3 * width`. +/// +/// Thin wrapper over [`yuv_420p_n_to_rgb_or_rgba_row`] with `ALPHA = false`. #[inline] #[target_feature(enable = "avx2")] pub(crate) unsafe fn yuv_420p_n_to_rgb_row( @@ -302,13 +304,65 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_row( width: usize, matrix: ColorMatrix, full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + yuv_420p_n_to_rgb_or_rgba_row::( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } +} + +/// AVX2 high-bit-depth YUV 4:2:0 → packed **8-bit RGBA** (`R, G, B, 0xFF`). +/// +/// Thin wrapper over [`yuv_420p_n_to_rgb_or_rgba_row`] with `ALPHA = true`. +#[inline] +#[target_feature(enable = "avx2")] +pub(crate) unsafe fn yuv_420p_n_to_rgba_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + yuv_420p_n_to_rgb_or_rgba_row::( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } +} + +/// Shared AVX2 high-bit YUV 4:2:0 kernel. `ALPHA = false` uses +/// `write_rgb_32`; `ALPHA = true` uses `write_rgba_32` with constant +/// `0xFF` alpha. +/// +/// # Safety +/// +/// 1. **AVX2 must be available.** +/// 2. `width & 1 == 0`. 3. 
slices long enough for `BITS` semantics + +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. 4. `BITS` ∈ +/// `{9, 10, 12, 14}`. +#[inline] +#[target_feature(enable = "avx2")] +pub(crate) unsafe fn yuv_420p_n_to_rgb_or_rgba_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, ) { const { assert!(BITS == 9 || BITS == 10 || BITS == 12 || BITS == 14) }; + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert_eq!(width & 1, 0); debug_assert!(y.len() >= width); debug_assert!(u_half.len() >= width / 2); debug_assert!(v_half.len() >= width / 2); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); @@ -329,6 +383,7 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_row( let cgv = _mm256_set1_epi32(coeffs.g_v()); let cbu = _mm256_set1_epi32(coeffs.b_u()); let cbv = _mm256_set1_epi32(coeffs.b_v()); + let alpha_u8 = _mm256_set1_epi8(-1); let mut x = 0usize; while x + 32 <= width { @@ -393,21 +448,30 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_row( let g_u8 = narrow_u8x32(g_lo, g_hi); let r_u8 = narrow_u8x32(r_lo, r_hi); - write_rgb_32(r_u8, g_u8, b_u8, rgb_out.as_mut_ptr().add(x * 3)); + if ALPHA { + write_rgba_32(r_u8, g_u8, b_u8, alpha_u8, out.as_mut_ptr().add(x * 4)); + } else { + write_rgb_32(r_u8, g_u8, b_u8, out.as_mut_ptr().add(x * 3)); + } x += 32; } if x < width { - scalar::yuv_420p_n_to_rgb_row::( - &y[x..width], - &u_half[x / 2..width / 2], - &v_half[x / 2..width / 2], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_u = &u_half[x / 2..width / 2]; + let tail_v = &v_half[x / 2..width / 2]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::yuv_420p_n_to_rgba_row::( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); + } else { + scalar::yuv_420p_n_to_rgb_row::( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); + } } } } @@ -1211,6 +1275,8 @@ pub(crate) unsafe fn yuv_444p16_to_rgb_u16_row( /// 2. `width & 1 == 0`. /// 3. `y.len() >= width`, `uv_half.len() >= width`, /// `rgb_out.len() >= 3 * width`. +/// +/// Thin wrapper over [`p_n_to_rgb_or_rgba_row`] with `ALPHA = false`. #[inline] #[target_feature(enable = "avx2")] pub(crate) unsafe fn p_n_to_rgb_row( @@ -1221,10 +1287,58 @@ pub(crate) unsafe fn p_n_to_rgb_row( matrix: ColorMatrix, full_range: bool, ) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + p_n_to_rgb_or_rgba_row::(y, uv_half, rgb_out, width, matrix, full_range); + } +} + +/// AVX2 high-bit-packed semi-planar 4:2:0 → packed **8-bit RGBA** +/// (`R, G, B, 0xFF`). +/// +/// Thin wrapper over [`p_n_to_rgb_or_rgba_row`] with `ALPHA = true`. +#[inline] +#[target_feature(enable = "avx2")] +pub(crate) unsafe fn p_n_to_rgba_row( + y: &[u16], + uv_half: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + p_n_to_rgb_or_rgba_row::(y, uv_half, rgba_out, width, matrix, full_range); + } +} + +/// Shared AVX2 P010/P012 kernel. `ALPHA = false` uses `write_rgb_32`; +/// `ALPHA = true` uses `write_rgba_32` with constant `0xFF` alpha. +/// +/// # Safety +/// +/// 1. **AVX2 must be available.** +/// 2. `width & 1 == 0`. +/// 3. 
`y.len() >= width`, `uv_half.len() >= width`, +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. +/// 4. `BITS` ∈ `{10, 12}`. +#[inline] +#[target_feature(enable = "avx2")] +pub(crate) unsafe fn p_n_to_rgb_or_rgba_row( + y: &[u16], + uv_half: &[u16], + out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + const { assert!(BITS == 10 || BITS == 12) }; + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert_eq!(width & 1, 0); debug_assert!(y.len() >= width); debug_assert!(uv_half.len() >= width); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); @@ -1246,6 +1360,7 @@ pub(crate) unsafe fn p_n_to_rgb_row( let cgv = _mm256_set1_epi32(coeffs.g_v()); let cbu = _mm256_set1_epi32(coeffs.b_u()); let cbv = _mm256_set1_epi32(coeffs.b_v()); + let alpha_u8 = _mm256_set1_epi8(-1); let mut x = 0usize; while x + 32 <= width { @@ -1306,20 +1421,25 @@ pub(crate) unsafe fn p_n_to_rgb_row( let g_u8 = narrow_u8x32(g_lo, g_hi); let r_u8 = narrow_u8x32(r_lo, r_hi); - write_rgb_32(r_u8, g_u8, b_u8, rgb_out.as_mut_ptr().add(x * 3)); + if ALPHA { + write_rgba_32(r_u8, g_u8, b_u8, alpha_u8, out.as_mut_ptr().add(x * 4)); + } else { + write_rgb_32(r_u8, g_u8, b_u8, out.as_mut_ptr().add(x * 3)); + } x += 32; } if x < width { - scalar::p_n_to_rgb_row::( - &y[x..width], - &uv_half[x..width], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_uv = &uv_half[x..width]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::p_n_to_rgba_row::(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + } else { + scalar::p_n_to_rgb_row::(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + } } } } @@ -2558,6 +2678,8 @@ fn chroma_dup_i32(chroma: __m256i) -> (__m256i, __m256i) { /// 2. `width & 1 == 0`. /// 3. `y.len() >= width`, `u_half.len() >= width / 2`, /// `v_half.len() >= width / 2`, `rgb_out.len() >= 3 * width`. +/// +/// Thin wrapper over [`yuv_420p16_to_rgb_or_rgba_row`] with `ALPHA = false`. #[inline] #[target_feature(enable = "avx2")] pub(crate) unsafe fn yuv_420p16_to_rgb_row( @@ -2569,11 +2691,52 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_row( matrix: ColorMatrix, full_range: bool, ) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + yuv_420p16_to_rgb_or_rgba_row::(y, u_half, v_half, rgb_out, width, matrix, full_range); + } +} + +/// AVX2 16-bit YUV 4:2:0 → packed **8-bit RGBA** (`R, G, B, 0xFF`). +/// +/// Thin wrapper over [`yuv_420p16_to_rgb_or_rgba_row`] with `ALPHA = true`. +#[inline] +#[target_feature(enable = "avx2")] +pub(crate) unsafe fn yuv_420p16_to_rgba_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + yuv_420p16_to_rgb_or_rgba_row::(y, u_half, v_half, rgba_out, width, matrix, full_range); + } +} + +/// Shared AVX2 16-bit YUV 4:2:0 kernel. `ALPHA = false` uses +/// `write_rgb_32`; `ALPHA = true` uses `write_rgba_32` with constant +/// `0xFF` alpha. 
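+///
+/// # Safety
+///
+/// 1. **AVX2 must be available.**
+/// 2. `width & 1 == 0`.
+/// 3. `y.len() >= width`, `u_half.len() >= width / 2`,
+///    `v_half.len() >= width / 2`,
+///    `out.len() >= width * if ALPHA { 4 } else { 3 }`.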
+#[inline] +#[target_feature(enable = "avx2")] +pub(crate) unsafe fn yuv_420p16_to_rgb_or_rgba_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert_eq!(width & 1, 0); debug_assert!(y.len() >= width); debug_assert!(u_half.len() >= width / 2); debug_assert!(v_half.len() >= width / 2); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::<16, 8>(full_range); @@ -2591,6 +2754,7 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_row( let cgv = _mm256_set1_epi32(coeffs.g_v()); let cbu = _mm256_set1_epi32(coeffs.b_u()); let cbv = _mm256_set1_epi32(coeffs.b_v()); + let alpha_u8 = _mm256_set1_epi8(-1); let mut x = 0usize; while x + 32 <= width { @@ -2646,20 +2810,27 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_row( let g_u8 = narrow_u8x32(g_lo, g_hi); let b_u8 = narrow_u8x32(b_lo, b_hi); - write_rgb_32(r_u8, g_u8, b_u8, rgb_out.as_mut_ptr().add(x * 3)); + if ALPHA { + write_rgba_32(r_u8, g_u8, b_u8, alpha_u8, out.as_mut_ptr().add(x * 4)); + } else { + write_rgb_32(r_u8, g_u8, b_u8, out.as_mut_ptr().add(x * 3)); + } x += 32; } if x < width { - scalar::yuv_420p16_to_rgb_row( - &y[x..width], - &u_half[x / 2..width / 2], - &v_half[x / 2..width / 2], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_u = &u_half[x / 2..width / 2]; + let tail_v = &v_half[x / 2..width / 2]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::yuv_420p16_to_rgba_row( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); + } else { + scalar::yuv_420p16_to_rgb_row(tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range); + } } } } @@ -2820,6 +2991,8 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_u16_row( /// 1. **AVX2 must be available.** /// 2. `width & 1 == 0`. /// 3. `y.len() >= width`, `uv_half.len() >= width`, `rgb_out.len() >= 3 * width`. +/// +/// Thin wrapper over [`p16_to_rgb_or_rgba_row`] with `ALPHA = false`. #[inline] #[target_feature(enable = "avx2")] pub(crate) unsafe fn p16_to_rgb_row( @@ -2830,10 +3003,48 @@ pub(crate) unsafe fn p16_to_rgb_row( matrix: ColorMatrix, full_range: bool, ) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + p16_to_rgb_or_rgba_row::(y, uv_half, rgb_out, width, matrix, full_range); + } +} + +/// AVX2 P016 → packed **8-bit RGBA** (`R, G, B, 0xFF`). +/// +/// Thin wrapper over [`p16_to_rgb_or_rgba_row`] with `ALPHA = true`. +#[inline] +#[target_feature(enable = "avx2")] +pub(crate) unsafe fn p16_to_rgba_row( + y: &[u16], + uv_half: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + p16_to_rgb_or_rgba_row::(y, uv_half, rgba_out, width, matrix, full_range); + } +} + +/// Shared AVX2 P016 kernel. `ALPHA = false` uses `write_rgb_32`; +/// `ALPHA = true` uses `write_rgba_32` with constant `0xFF` alpha. 
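+///
+/// # Safety
+///
+/// 1. **AVX2 must be available.**
+/// 2. `width & 1 == 0`.
+/// 3. `y.len() >= width`, `uv_half.len() >= width`,
+///    `out.len() >= width * if ALPHA { 4 } else { 3 }`.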
+#[inline] +#[target_feature(enable = "avx2")] +pub(crate) unsafe fn p16_to_rgb_or_rgba_row( + y: &[u16], + uv_half: &[u16], + out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert_eq!(width & 1, 0); debug_assert!(y.len() >= width); debug_assert!(uv_half.len() >= width); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::<16, 8>(full_range); @@ -2851,6 +3062,7 @@ pub(crate) unsafe fn p16_to_rgb_row( let cgv = _mm256_set1_epi32(coeffs.g_v()); let cbu = _mm256_set1_epi32(coeffs.b_u()); let cbv = _mm256_set1_epi32(coeffs.b_v()); + let alpha_u8 = _mm256_set1_epi8(-1); let mut x = 0usize; while x + 32 <= width { @@ -2907,19 +3119,24 @@ pub(crate) unsafe fn p16_to_rgb_row( let g_u8 = narrow_u8x32(g_lo, g_hi); let b_u8 = narrow_u8x32(b_lo, b_hi); - write_rgb_32(r_u8, g_u8, b_u8, rgb_out.as_mut_ptr().add(x * 3)); + if ALPHA { + write_rgba_32(r_u8, g_u8, b_u8, alpha_u8, out.as_mut_ptr().add(x * 4)); + } else { + write_rgb_32(r_u8, g_u8, b_u8, out.as_mut_ptr().add(x * 3)); + } x += 32; } if x < width { - scalar::p16_to_rgb_row( - &y[x..width], - &uv_half[x..width], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_uv = &uv_half[x..width]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::p16_to_rgba_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + } else { + scalar::p16_to_rgb_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + } } } } diff --git a/src/row/arch/x86_avx2/tests.rs b/src/row/arch/x86_avx2/tests.rs index fd3927a..b0cd2ef 100644 --- a/src/row/arch/x86_avx2/tests.rs +++ b/src/row/arch/x86_avx2/tests.rs @@ -1588,6 +1588,179 @@ fn avx2_p16_matches_scalar_1920() { check_p16_u16_avx2_equivalence(1920, ColorMatrix::Bt2020Ncl, false); } +// ---- High-bit 4:2:0 RGBA equivalence (Ship 8 Tranche 5a) ---------- +// +// RGBA wrappers share the math of their RGB siblings — only the store +// (and tail dispatch) branches on `ALPHA`. These tests pin that the +// SIMD RGBA path produces byte-identical output to the scalar RGBA +// reference. 
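+
+// For orientation: `planar_n_plane::<BITS>` (shared test helper) is a
+// seeded, deterministic plane generator whose samples stay inside the
+// `BITS`-bit range. The stand-in below is an illustrative sketch only;
+// its name and LCG constants are assumptions, not the crate's helper.
+#[allow(dead_code)]
+fn planar_n_plane_sketch<const BITS: u32>(n: usize, seed: usize) -> std::vec::Vec<u16> {
+    // Seeded LCG: reproducible across runs, no RNG dependency.
+    let mut state = seed as u64;
+    (0..n)
+        .map(|_| {
+            state = state
+                .wrapping_mul(6364136223846793005)
+                .wrapping_add(1442695040888963407);
+            // Mask to `BITS` so every sample is a legal code value
+            // (valid for BITS < 16).
+            ((state >> 33) as u16) & ((1u16 << BITS) - 1)
+        })
+        .collect()
+}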
+ +fn check_planar_u8_avx2_rgba_equivalence_n( + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let y = planar_n_plane::(width, 37); + let u = planar_n_plane::(width / 2, 53); + let v = planar_n_plane::(width / 2, 71); + let mut rgba_scalar = std::vec![0u8; width * 4]; + let mut rgba_simd = std::vec![0u8; width * 4]; + scalar::yuv_420p_n_to_rgba_row::(&y, &u, &v, &mut rgba_scalar, width, matrix, full_range); + unsafe { + yuv_420p_n_to_rgba_row::(&y, &u, &v, &mut rgba_simd, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_simd, + "AVX2 yuv_420p_n<{BITS}>→RGBA u8 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +fn check_pn_u8_avx2_rgba_equivalence_n( + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let y = p_n_packed_plane::(width, 37); + let u = p_n_packed_plane::(width / 2, 53); + let v = p_n_packed_plane::(width / 2, 71); + let uv = p010_uv_interleave(&u, &v); + let mut rgba_scalar = std::vec![0u8; width * 4]; + let mut rgba_simd = std::vec![0u8; width * 4]; + scalar::p_n_to_rgba_row::(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + unsafe { + p_n_to_rgba_row::(&y, &uv, &mut rgba_simd, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_simd, + "AVX2 Pn<{BITS}>→RGBA u8 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +fn check_yuv420p16_u8_avx2_rgba_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { + let y = p16_plane_avx2(width, 37); + let u = p16_plane_avx2(width / 2, 53); + let v = p16_plane_avx2(width / 2, 71); + let mut rgba_scalar = std::vec![0u8; width * 4]; + let mut rgba_simd = std::vec![0u8; width * 4]; + scalar::yuv_420p16_to_rgba_row(&y, &u, &v, &mut rgba_scalar, width, matrix, full_range); + unsafe { + yuv_420p16_to_rgba_row(&y, &u, &v, &mut rgba_simd, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_simd, + "AVX2 yuv_420p16→RGBA u8 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +fn check_p16_u8_avx2_rgba_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { + let y = p16_plane_avx2(width, 37); + let u = p16_plane_avx2(width / 2, 53); + let v = p16_plane_avx2(width / 2, 71); + let uv = p010_uv_interleave(&u, &v); + let mut rgba_scalar = std::vec![0u8; width * 4]; + let mut rgba_simd = std::vec![0u8; width * 4]; + scalar::p16_to_rgba_row(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + unsafe { + p16_to_rgba_row(&y, &uv, &mut rgba_simd, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_simd, + "AVX2 P016→RGBA u8 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +#[test] +fn avx2_yuv420p_n_rgba_matches_scalar_all_bits() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_planar_u8_avx2_rgba_equivalence_n::<9>(32, m, full); + check_planar_u8_avx2_rgba_equivalence_n::<10>(32, m, full); + check_planar_u8_avx2_rgba_equivalence_n::<12>(32, m, full); + check_planar_u8_avx2_rgba_equivalence_n::<14>(32, m, full); + } + } +} + +#[test] +fn avx2_yuv420p_n_rgba_matches_scalar_tail_and_1920() { + for w in [34usize, 48, 62, 1920, 1922] { + check_planar_u8_avx2_rgba_equivalence_n::<9>(w, ColorMatrix::Bt601, false); + check_planar_u8_avx2_rgba_equivalence_n::<10>(w, ColorMatrix::Bt709, true); + check_planar_u8_avx2_rgba_equivalence_n::<12>(w, ColorMatrix::Bt2020Ncl, false); + 
check_planar_u8_avx2_rgba_equivalence_n::<14>(w, ColorMatrix::YCgCo, true); + } +} + +#[test] +fn avx2_pn_rgba_matches_scalar_all_bits() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_pn_u8_avx2_rgba_equivalence_n::<10>(32, m, full); + check_pn_u8_avx2_rgba_equivalence_n::<12>(32, m, full); + } + } +} + +#[test] +fn avx2_pn_rgba_matches_scalar_tail_and_1920() { + for w in [34usize, 48, 62, 1920, 1922] { + check_pn_u8_avx2_rgba_equivalence_n::<10>(w, ColorMatrix::Bt601, false); + check_pn_u8_avx2_rgba_equivalence_n::<12>(w, ColorMatrix::Bt709, true); + } +} + +#[test] +fn avx2_yuv420p16_rgba_matches_scalar_all_matrices() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_yuv420p16_u8_avx2_rgba_equivalence(32, m, full); + } + } + for w in [34usize, 48, 62, 1920, 1922] { + check_yuv420p16_u8_avx2_rgba_equivalence(w, ColorMatrix::Bt709, false); + } +} + +#[test] +fn avx2_p016_rgba_matches_scalar_all_matrices() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_p16_u8_avx2_rgba_equivalence(32, m, full); + } + } + for w in [34usize, 48, 62, 1920, 1922] { + check_p16_u8_avx2_rgba_equivalence(w, ColorMatrix::Bt709, false); + } +} + // ---- Pn 4:4:4 (P410 / P412 / P416) AVX2 equivalence ----------------- fn high_bit_plane_avx2(n: usize, seed: usize) -> std::vec::Vec { diff --git a/src/row/arch/x86_avx512.rs b/src/row/arch/x86_avx512.rs index e0385c4..48e7cfa 100644 --- a/src/row/arch/x86_avx512.rs +++ b/src/row/arch/x86_avx512.rs @@ -307,6 +307,8 @@ unsafe fn yuv_420_to_rgb_or_rgba_row( /// 2. `width & 1 == 0`. /// 3. `y.len() >= width`, `u_half.len() >= width / 2`, /// `v_half.len() >= width / 2`, `rgb_out.len() >= 3 * width`. +/// +/// Thin wrapper over [`yuv_420p_n_to_rgb_or_rgba_row`] with `ALPHA = false`. #[inline] #[target_feature(enable = "avx512f,avx512bw")] pub(crate) unsafe fn yuv_420p_n_to_rgb_row( @@ -317,13 +319,65 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_row( width: usize, matrix: ColorMatrix, full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + yuv_420p_n_to_rgb_or_rgba_row::( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } +} + +/// AVX-512 high-bit-depth YUV 4:2:0 → packed **8-bit RGBA** (`R, G, B, 0xFF`). +/// +/// Thin wrapper over [`yuv_420p_n_to_rgb_or_rgba_row`] with `ALPHA = true`. +#[inline] +#[target_feature(enable = "avx512f,avx512bw")] +pub(crate) unsafe fn yuv_420p_n_to_rgba_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + yuv_420p_n_to_rgb_or_rgba_row::( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } +} + +/// Shared AVX-512 high-bit YUV 4:2:0 kernel. `ALPHA = false` uses +/// `write_rgb_64`; `ALPHA = true` uses `write_rgba_64` with constant +/// `0xFF` alpha. +/// +/// # Safety +/// +/// 1. **AVX-512F + AVX-512BW must be available.** +/// 2. `width & 1 == 0`. +/// 3. slices long enough + `out.len() >= width * if ALPHA { 4 } else { 3 }`. +/// 4. `BITS` ∈ `{9, 10, 12, 14}`. 
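+///
+/// ("Slices long enough" in item 3 means `y.len() >= width`,
+/// `u_half.len() >= width / 2`, `v_half.len() >= width / 2`,
+/// mirroring the kernel's `debug_assert!`s.)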
+#[inline] +#[target_feature(enable = "avx512f,avx512bw")] +pub(crate) unsafe fn yuv_420p_n_to_rgb_or_rgba_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, ) { const { assert!(BITS == 9 || BITS == 10 || BITS == 12 || BITS == 14) }; + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert_eq!(width & 1, 0); debug_assert!(y.len() >= width); debug_assert!(u_half.len() >= width / 2); debug_assert!(v_half.len() >= width / 2); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); @@ -344,6 +398,7 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_row( let cgv = _mm512_set1_epi32(coeffs.g_v()); let cbu = _mm512_set1_epi32(coeffs.b_u()); let cbv = _mm512_set1_epi32(coeffs.b_v()); + let alpha_u8 = _mm512_set1_epi8(-1); let pack_fixup = _mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7); let dup_lo_idx = _mm512_setr_epi64(0, 1, 8, 9, 2, 3, 10, 11); @@ -411,21 +466,30 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_row( let g_u8 = narrow_u8x64(g_lo, g_hi, pack_fixup); let r_u8 = narrow_u8x64(r_lo, r_hi, pack_fixup); - write_rgb_64(r_u8, g_u8, b_u8, rgb_out.as_mut_ptr().add(x * 3)); + if ALPHA { + write_rgba_64(r_u8, g_u8, b_u8, alpha_u8, out.as_mut_ptr().add(x * 4)); + } else { + write_rgb_64(r_u8, g_u8, b_u8, out.as_mut_ptr().add(x * 3)); + } x += 64; } if x < width { - scalar::yuv_420p_n_to_rgb_row::( - &y[x..width], - &u_half[x / 2..width / 2], - &v_half[x / 2..width / 2], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_u = &u_half[x / 2..width / 2]; + let tail_v = &v_half[x / 2..width / 2]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::yuv_420p_n_to_rgba_row::( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); + } else { + scalar::yuv_420p_n_to_rgb_row::( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); + } } } } @@ -1272,6 +1336,8 @@ pub(crate) unsafe fn yuv_444p16_to_rgb_u16_row( /// 2. `width & 1 == 0`. /// 3. `y.len() >= width`, `uv_half.len() >= width`, /// `rgb_out.len() >= 3 * width`. +/// +/// Thin wrapper over [`p_n_to_rgb_or_rgba_row`] with `ALPHA = false`. #[inline] #[target_feature(enable = "avx512f,avx512bw")] pub(crate) unsafe fn p_n_to_rgb_row( @@ -1282,10 +1348,57 @@ pub(crate) unsafe fn p_n_to_rgb_row( matrix: ColorMatrix, full_range: bool, ) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + p_n_to_rgb_or_rgba_row::(y, uv_half, rgb_out, width, matrix, full_range); + } +} + +/// AVX-512 high-bit-packed semi-planar 4:2:0 → packed **8-bit RGBA** +/// (`R, G, B, 0xFF`). +/// +/// Thin wrapper over [`p_n_to_rgb_or_rgba_row`] with `ALPHA = true`. +#[inline] +#[target_feature(enable = "avx512f,avx512bw")] +pub(crate) unsafe fn p_n_to_rgba_row( + y: &[u16], + uv_half: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + p_n_to_rgb_or_rgba_row::(y, uv_half, rgba_out, width, matrix, full_range); + } +} + +/// Shared AVX-512 P010/P012 kernel. `ALPHA = false` uses +/// `write_rgb_64`; `ALPHA = true` uses `write_rgba_64` with constant +/// `0xFF` alpha. +/// +/// # Safety +/// +/// 1. **AVX-512F + AVX-512BW must be available.** +/// 2. `width & 1 == 0`. 3. 
slices long enough + +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. 4. `BITS` ∈ `{10, 12}`. +#[inline] +#[target_feature(enable = "avx512f,avx512bw")] +pub(crate) unsafe fn p_n_to_rgb_or_rgba_row( + y: &[u16], + uv_half: &[u16], + out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + const { assert!(BITS == 10 || BITS == 12) }; + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert_eq!(width & 1, 0); debug_assert!(y.len() >= width); debug_assert!(uv_half.len() >= width); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); @@ -1307,6 +1420,7 @@ pub(crate) unsafe fn p_n_to_rgb_row( let cgv = _mm512_set1_epi32(coeffs.g_v()); let cbu = _mm512_set1_epi32(coeffs.b_u()); let cbv = _mm512_set1_epi32(coeffs.b_v()); + let alpha_u8 = _mm512_set1_epi8(-1); let pack_fixup = _mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7); let dup_lo_idx = _mm512_setr_epi64(0, 1, 8, 9, 2, 3, 10, 11); @@ -1368,20 +1482,25 @@ pub(crate) unsafe fn p_n_to_rgb_row( let g_u8 = narrow_u8x64(g_lo, g_hi, pack_fixup); let r_u8 = narrow_u8x64(r_lo, r_hi, pack_fixup); - write_rgb_64(r_u8, g_u8, b_u8, rgb_out.as_mut_ptr().add(x * 3)); + if ALPHA { + write_rgba_64(r_u8, g_u8, b_u8, alpha_u8, out.as_mut_ptr().add(x * 4)); + } else { + write_rgb_64(r_u8, g_u8, b_u8, out.as_mut_ptr().add(x * 3)); + } x += 64; } if x < width { - scalar::p_n_to_rgb_row::( - &y[x..width], - &uv_half[x..width], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_uv = &uv_half[x..width]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::p_n_to_rgba_row::(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + } else { + scalar::p_n_to_rgb_row::(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + } } } } @@ -2652,6 +2771,8 @@ fn scale_y_u16_avx512( /// 2. `width & 1 == 0`. /// 3. `y.len() >= width`, `u_half.len() >= width / 2`, /// `v_half.len() >= width / 2`, `rgb_out.len() >= 3 * width`. +/// +/// Thin wrapper over [`yuv_420p16_to_rgb_or_rgba_row`] with `ALPHA = false`. #[inline] #[target_feature(enable = "avx512f,avx512bw")] pub(crate) unsafe fn yuv_420p16_to_rgb_row( @@ -2663,11 +2784,52 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_row( matrix: ColorMatrix, full_range: bool, ) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + yuv_420p16_to_rgb_or_rgba_row::(y, u_half, v_half, rgb_out, width, matrix, full_range); + } +} + +/// AVX-512 16-bit YUV 4:2:0 → packed **8-bit RGBA** (`R, G, B, 0xFF`). +/// +/// Thin wrapper over [`yuv_420p16_to_rgb_or_rgba_row`] with `ALPHA = true`. +#[inline] +#[target_feature(enable = "avx512f,avx512bw")] +pub(crate) unsafe fn yuv_420p16_to_rgba_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + yuv_420p16_to_rgb_or_rgba_row::(y, u_half, v_half, rgba_out, width, matrix, full_range); + } +} + +/// Shared AVX-512 16-bit YUV 4:2:0 kernel. `ALPHA = false` uses +/// `write_rgb_64`; `ALPHA = true` uses `write_rgba_64` with constant +/// `0xFF` alpha. 
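+///
+/// # Safety
+///
+/// 1. **AVX-512F + AVX-512BW must be available.**
+/// 2. `width & 1 == 0`.
+/// 3. `y.len() >= width`, `u_half.len() >= width / 2`,
+///    `v_half.len() >= width / 2`,
+///    `out.len() >= width * if ALPHA { 4 } else { 3 }`.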
+#[inline] +#[target_feature(enable = "avx512f,avx512bw")] +pub(crate) unsafe fn yuv_420p16_to_rgb_or_rgba_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert_eq!(width & 1, 0); debug_assert!(y.len() >= width); debug_assert!(u_half.len() >= width / 2); debug_assert!(v_half.len() >= width / 2); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::<16, 8>(full_range); @@ -2685,6 +2847,7 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_row( let cgv = _mm512_set1_epi32(coeffs.g_v()); let cbu = _mm512_set1_epi32(coeffs.b_u()); let cbv = _mm512_set1_epi32(coeffs.b_v()); + let alpha_u8 = _mm512_set1_epi8(-1); let pack_fixup = _mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7); let dup_lo_idx = _mm512_setr_epi64(0, 1, 8, 9, 2, 3, 10, 11); let dup_hi_idx = _mm512_setr_epi64(4, 5, 12, 13, 6, 7, 14, 15); @@ -2743,20 +2906,27 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_row( let g_u8 = narrow_u8x64(g_lo, g_hi, pack_fixup); let b_u8 = narrow_u8x64(b_lo, b_hi, pack_fixup); - write_rgb_64(r_u8, g_u8, b_u8, rgb_out.as_mut_ptr().add(x * 3)); + if ALPHA { + write_rgba_64(r_u8, g_u8, b_u8, alpha_u8, out.as_mut_ptr().add(x * 4)); + } else { + write_rgb_64(r_u8, g_u8, b_u8, out.as_mut_ptr().add(x * 3)); + } x += 64; } if x < width { - scalar::yuv_420p16_to_rgb_row( - &y[x..width], - &u_half[x / 2..width / 2], - &v_half[x / 2..width / 2], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_u = &u_half[x / 2..width / 2]; + let tail_v = &v_half[x / 2..width / 2]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::yuv_420p16_to_rgba_row( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); + } else { + scalar::yuv_420p16_to_rgb_row(tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range); + } } } } @@ -2938,6 +3108,8 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_u16_row( /// 1. **AVX-512F + AVX-512BW must be available.** /// 2. `width & 1 == 0`. /// 3. `y.len() >= width`, `uv_half.len() >= width`, `rgb_out.len() >= 3 * width`. +/// +/// Thin wrapper over [`p16_to_rgb_or_rgba_row`] with `ALPHA = false`. #[inline] #[target_feature(enable = "avx512f,avx512bw")] pub(crate) unsafe fn p16_to_rgb_row( @@ -2948,10 +3120,48 @@ pub(crate) unsafe fn p16_to_rgb_row( matrix: ColorMatrix, full_range: bool, ) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + p16_to_rgb_or_rgba_row::(y, uv_half, rgb_out, width, matrix, full_range); + } +} + +/// AVX-512 P016 → packed **8-bit RGBA** (`R, G, B, 0xFF`). +/// +/// Thin wrapper over [`p16_to_rgb_or_rgba_row`] with `ALPHA = true`. +#[inline] +#[target_feature(enable = "avx512f,avx512bw")] +pub(crate) unsafe fn p16_to_rgba_row( + y: &[u16], + uv_half: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + p16_to_rgb_or_rgba_row::(y, uv_half, rgba_out, width, matrix, full_range); + } +} + +/// Shared AVX-512 P016 kernel. `ALPHA = false` uses `write_rgb_64`; +/// `ALPHA = true` uses `write_rgba_64` with constant `0xFF` alpha. 
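+///
+/// # Safety
+///
+/// 1. **AVX-512F + AVX-512BW must be available.**
+/// 2. `width & 1 == 0`.
+/// 3. `y.len() >= width`, `uv_half.len() >= width`,
+///    `out.len() >= width * if ALPHA { 4 } else { 3 }`.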
+#[inline] +#[target_feature(enable = "avx512f,avx512bw")] +pub(crate) unsafe fn p16_to_rgb_or_rgba_row( + y: &[u16], + uv_half: &[u16], + out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert_eq!(width & 1, 0); debug_assert!(y.len() >= width); debug_assert!(uv_half.len() >= width); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::<16, 8>(full_range); @@ -2969,6 +3179,7 @@ pub(crate) unsafe fn p16_to_rgb_row( let cgv = _mm512_set1_epi32(coeffs.g_v()); let cbu = _mm512_set1_epi32(coeffs.b_u()); let cbv = _mm512_set1_epi32(coeffs.b_v()); + let alpha_u8 = _mm512_set1_epi8(-1); let pack_fixup = _mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7); let dup_lo_idx = _mm512_setr_epi64(0, 1, 8, 9, 2, 3, 10, 11); let dup_hi_idx = _mm512_setr_epi64(4, 5, 12, 13, 6, 7, 14, 15); @@ -3026,19 +3237,24 @@ pub(crate) unsafe fn p16_to_rgb_row( let g_u8 = narrow_u8x64(g_lo, g_hi, pack_fixup); let b_u8 = narrow_u8x64(b_lo, b_hi, pack_fixup); - write_rgb_64(r_u8, g_u8, b_u8, rgb_out.as_mut_ptr().add(x * 3)); + if ALPHA { + write_rgba_64(r_u8, g_u8, b_u8, alpha_u8, out.as_mut_ptr().add(x * 4)); + } else { + write_rgb_64(r_u8, g_u8, b_u8, out.as_mut_ptr().add(x * 3)); + } x += 64; } if x < width { - scalar::p16_to_rgb_row( - &y[x..width], - &uv_half[x..width], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_uv = &uv_half[x..width]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + if ALPHA { + scalar::p16_to_rgba_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + } else { + scalar::p16_to_rgb_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + } } } } diff --git a/src/row/arch/x86_avx512/tests.rs b/src/row/arch/x86_avx512/tests.rs index 5ee3453..9afb142 100644 --- a/src/row/arch/x86_avx512/tests.rs +++ b/src/row/arch/x86_avx512/tests.rs @@ -1602,6 +1602,179 @@ fn avx512_p16_matches_scalar_1920() { check_p16_u16_avx512_equivalence(1920, ColorMatrix::Bt2020Ncl, false); } +// ---- High-bit 4:2:0 RGBA equivalence (Ship 8 Tranche 5a) ---------- +// +// RGBA wrappers share the math of their RGB siblings — only the store +// (and tail dispatch) branches on `ALPHA`. These tests pin that the +// SIMD RGBA path produces byte-identical output to the scalar RGBA +// reference. 
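+
+// Note that "branches on `ALPHA`" above is a compile-time branch: `ALPHA`
+// is a const generic, so each monomorphized kernel keeps exactly one
+// store arm. Simplified illustration of the pattern (a sketch only;
+// `store_pixel_sketch` is a made-up name, not a kernel API):
+#[allow(dead_code)]
+fn store_pixel_sketch<const ALPHA: bool>(out: &mut std::vec::Vec<u8>, r: u8, g: u8, b: u8) {
+    if ALPHA {
+        // RGBA: 4 bytes per pixel, constant opaque alpha.
+        out.extend_from_slice(&[r, g, b, 0xFF]);
+    } else {
+        // RGB: 3 bytes per pixel.
+        out.extend_from_slice(&[r, g, b]);
+    }
+}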
+ +fn check_planar_u8_avx512_rgba_equivalence_n( + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let y = planar_n_plane::(width, 37); + let u = planar_n_plane::(width / 2, 53); + let v = planar_n_plane::(width / 2, 71); + let mut rgba_scalar = std::vec![0u8; width * 4]; + let mut rgba_simd = std::vec![0u8; width * 4]; + scalar::yuv_420p_n_to_rgba_row::(&y, &u, &v, &mut rgba_scalar, width, matrix, full_range); + unsafe { + yuv_420p_n_to_rgba_row::(&y, &u, &v, &mut rgba_simd, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_simd, + "AVX-512 yuv_420p_n<{BITS}>→RGBA u8 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +fn check_pn_u8_avx512_rgba_equivalence_n( + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let y = p_n_packed_plane::(width, 37); + let u = p_n_packed_plane::(width / 2, 53); + let v = p_n_packed_plane::(width / 2, 71); + let uv = p010_uv_interleave(&u, &v); + let mut rgba_scalar = std::vec![0u8; width * 4]; + let mut rgba_simd = std::vec![0u8; width * 4]; + scalar::p_n_to_rgba_row::(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + unsafe { + p_n_to_rgba_row::(&y, &uv, &mut rgba_simd, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_simd, + "AVX-512 Pn<{BITS}>→RGBA u8 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +fn check_yuv420p16_u8_avx512_rgba_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { + let y = p16_plane_avx512(width, 37); + let u = p16_plane_avx512(width / 2, 53); + let v = p16_plane_avx512(width / 2, 71); + let mut rgba_scalar = std::vec![0u8; width * 4]; + let mut rgba_simd = std::vec![0u8; width * 4]; + scalar::yuv_420p16_to_rgba_row(&y, &u, &v, &mut rgba_scalar, width, matrix, full_range); + unsafe { + yuv_420p16_to_rgba_row(&y, &u, &v, &mut rgba_simd, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_simd, + "AVX-512 yuv_420p16→RGBA u8 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +fn check_p16_u8_avx512_rgba_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { + let y = p16_plane_avx512(width, 37); + let u = p16_plane_avx512(width / 2, 53); + let v = p16_plane_avx512(width / 2, 71); + let uv = p010_uv_interleave(&u, &v); + let mut rgba_scalar = std::vec![0u8; width * 4]; + let mut rgba_simd = std::vec![0u8; width * 4]; + scalar::p16_to_rgba_row(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + unsafe { + p16_to_rgba_row(&y, &uv, &mut rgba_simd, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_simd, + "AVX-512 P016→RGBA u8 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +#[test] +fn avx512_yuv420p_n_rgba_matches_scalar_all_bits() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_planar_u8_avx512_rgba_equivalence_n::<9>(64, m, full); + check_planar_u8_avx512_rgba_equivalence_n::<10>(64, m, full); + check_planar_u8_avx512_rgba_equivalence_n::<12>(64, m, full); + check_planar_u8_avx512_rgba_equivalence_n::<14>(64, m, full); + } + } +} + +#[test] +fn avx512_yuv420p_n_rgba_matches_scalar_tail_and_1920() { + for w in [66usize, 96, 126, 1920, 1922] { + check_planar_u8_avx512_rgba_equivalence_n::<9>(w, ColorMatrix::Bt601, false); + check_planar_u8_avx512_rgba_equivalence_n::<10>(w, ColorMatrix::Bt709, true); + 
check_planar_u8_avx512_rgba_equivalence_n::<12>(w, ColorMatrix::Bt2020Ncl, false); + check_planar_u8_avx512_rgba_equivalence_n::<14>(w, ColorMatrix::YCgCo, true); + } +} + +#[test] +fn avx512_pn_rgba_matches_scalar_all_bits() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_pn_u8_avx512_rgba_equivalence_n::<10>(64, m, full); + check_pn_u8_avx512_rgba_equivalence_n::<12>(64, m, full); + } + } +} + +#[test] +fn avx512_pn_rgba_matches_scalar_tail_and_1920() { + for w in [66usize, 96, 126, 1920, 1922] { + check_pn_u8_avx512_rgba_equivalence_n::<10>(w, ColorMatrix::Bt601, false); + check_pn_u8_avx512_rgba_equivalence_n::<12>(w, ColorMatrix::Bt709, true); + } +} + +#[test] +fn avx512_yuv420p16_rgba_matches_scalar_all_matrices() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_yuv420p16_u8_avx512_rgba_equivalence(64, m, full); + } + } + for w in [66usize, 96, 126, 1920, 1922] { + check_yuv420p16_u8_avx512_rgba_equivalence(w, ColorMatrix::Bt709, false); + } +} + +#[test] +fn avx512_p016_rgba_matches_scalar_all_matrices() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_p16_u8_avx512_rgba_equivalence(64, m, full); + } + } + for w in [66usize, 96, 126, 1920, 1922] { + check_p16_u8_avx512_rgba_equivalence(w, ColorMatrix::Bt709, false); + } +} + // ---- Pn 4:4:4 (P410 / P412 / P416) AVX-512 equivalence ------------- fn high_bit_plane_avx512(n: usize, seed: usize) -> std::vec::Vec { diff --git a/src/row/arch/x86_sse41/tests.rs b/src/row/arch/x86_sse41/tests.rs index 513341c..239b943 100644 --- a/src/row/arch/x86_sse41/tests.rs +++ b/src/row/arch/x86_sse41/tests.rs @@ -1624,6 +1624,179 @@ fn sse41_p16_matches_scalar_1920() { check_p16_u16_sse41_equivalence(1920, ColorMatrix::Bt2020Ncl, false); } +// ---- High-bit 4:2:0 RGBA equivalence (Ship 8 Tranche 5a) ---------- +// +// RGBA wrappers share the math of their RGB siblings — only the store +// (and tail dispatch) branches on `ALPHA`. These tests pin that the +// SIMD RGBA path produces byte-identical output to the scalar RGBA +// reference. 
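+
+// `p010_uv_interleave` (shared test helper) zips the two half-width
+// chroma planes into the semi-planar `U0, V0, U1, V1, ...` order the
+// P010/P012/P016 kernels consume. Illustrative sketch under that
+// assumption; the crate's actual helper may differ:
+#[allow(dead_code)]
+fn p010_uv_interleave_sketch(u: &[u16], v: &[u16]) -> std::vec::Vec<u16> {
+    assert_eq!(u.len(), v.len(), "U and V half-planes must match");
+    // One U sample followed by its paired V sample.
+    u.iter().zip(v.iter()).flat_map(|(&cu, &cv)| [cu, cv]).collect()
+}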
+ +fn check_planar_u8_sse41_rgba_equivalence_n( + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let y = planar_n_plane::(width, 37); + let u = planar_n_plane::(width / 2, 53); + let v = planar_n_plane::(width / 2, 71); + let mut rgba_scalar = std::vec![0u8; width * 4]; + let mut rgba_simd = std::vec![0u8; width * 4]; + scalar::yuv_420p_n_to_rgba_row::(&y, &u, &v, &mut rgba_scalar, width, matrix, full_range); + unsafe { + yuv_420p_n_to_rgba_row::(&y, &u, &v, &mut rgba_simd, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_simd, + "SSE4.1 yuv_420p_n<{BITS}>→RGBA u8 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +fn check_pn_u8_sse41_rgba_equivalence_n( + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + let y = p_n_packed_plane::(width, 37); + let u = p_n_packed_plane::(width / 2, 53); + let v = p_n_packed_plane::(width / 2, 71); + let uv = p010_uv_interleave(&u, &v); + let mut rgba_scalar = std::vec![0u8; width * 4]; + let mut rgba_simd = std::vec![0u8; width * 4]; + scalar::p_n_to_rgba_row::(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + unsafe { + p_n_to_rgba_row::(&y, &uv, &mut rgba_simd, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_simd, + "SSE4.1 Pn<{BITS}>→RGBA u8 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +fn check_yuv420p16_u8_sse41_rgba_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { + let y = p16_plane(width, 37); + let u = p16_plane(width / 2, 53); + let v = p16_plane(width / 2, 71); + let mut rgba_scalar = std::vec![0u8; width * 4]; + let mut rgba_simd = std::vec![0u8; width * 4]; + scalar::yuv_420p16_to_rgba_row(&y, &u, &v, &mut rgba_scalar, width, matrix, full_range); + unsafe { + yuv_420p16_to_rgba_row(&y, &u, &v, &mut rgba_simd, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_simd, + "SSE4.1 yuv_420p16→RGBA u8 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +fn check_p16_u8_sse41_rgba_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { + let y = p16_plane(width, 37); + let u = p16_plane(width / 2, 53); + let v = p16_plane(width / 2, 71); + let uv = p010_uv_interleave(&u, &v); + let mut rgba_scalar = std::vec![0u8; width * 4]; + let mut rgba_simd = std::vec![0u8; width * 4]; + scalar::p16_to_rgba_row(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + unsafe { + p16_to_rgba_row(&y, &uv, &mut rgba_simd, width, matrix, full_range); + } + assert_eq!( + rgba_scalar, rgba_simd, + "SSE4.1 P016→RGBA u8 diverges (width={width}, matrix={matrix:?}, full_range={full_range})" + ); +} + +#[test] +fn sse41_yuv420p_n_rgba_matches_scalar_all_bits() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_planar_u8_sse41_rgba_equivalence_n::<9>(16, m, full); + check_planar_u8_sse41_rgba_equivalence_n::<10>(16, m, full); + check_planar_u8_sse41_rgba_equivalence_n::<12>(16, m, full); + check_planar_u8_sse41_rgba_equivalence_n::<14>(16, m, full); + } + } +} + +#[test] +fn sse41_yuv420p_n_rgba_matches_scalar_tail_and_1920() { + for w in [18usize, 30, 34, 1920, 1922] { + check_planar_u8_sse41_rgba_equivalence_n::<9>(w, ColorMatrix::Bt601, false); + check_planar_u8_sse41_rgba_equivalence_n::<10>(w, ColorMatrix::Bt709, true); + check_planar_u8_sse41_rgba_equivalence_n::<12>(w, ColorMatrix::Bt2020Ncl, false); + 
check_planar_u8_sse41_rgba_equivalence_n::<14>(w, ColorMatrix::YCgCo, true); + } +} + +#[test] +fn sse41_pn_rgba_matches_scalar_all_bits() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_pn_u8_sse41_rgba_equivalence_n::<10>(16, m, full); + check_pn_u8_sse41_rgba_equivalence_n::<12>(16, m, full); + } + } +} + +#[test] +fn sse41_pn_rgba_matches_scalar_tail_and_1920() { + for w in [18usize, 30, 34, 1920, 1922] { + check_pn_u8_sse41_rgba_equivalence_n::<10>(w, ColorMatrix::Bt601, false); + check_pn_u8_sse41_rgba_equivalence_n::<12>(w, ColorMatrix::Bt709, true); + } +} + +#[test] +fn sse41_yuv420p16_rgba_matches_scalar_all_matrices() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_yuv420p16_u8_sse41_rgba_equivalence(16, m, full); + } + } + for w in [18usize, 30, 34, 1920, 1922] { + check_yuv420p16_u8_sse41_rgba_equivalence(w, ColorMatrix::Bt709, false); + } +} + +#[test] +fn sse41_p016_rgba_matches_scalar_all_matrices() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_p16_u8_sse41_rgba_equivalence(16, m, full); + } + } + for w in [18usize, 30, 34, 1920, 1922] { + check_p16_u8_sse41_rgba_equivalence(w, ColorMatrix::Bt709, false); + } +} + // ---- Pn 4:4:4 (P410 / P412 / P416) SSE4.1 equivalence --------------- fn high_bit_plane_sse41(n: usize, seed: usize) -> std::vec::Vec { diff --git a/src/row/mod.rs b/src/row/mod.rs index 15f8c10..d67ce1b 100644 --- a/src/row/mod.rs +++ b/src/row/mod.rs @@ -2659,12 +2659,12 @@ pub fn p016_to_rgb_u16_row( scalar::p16_to_rgb_u16_row(y, uv_half, rgb_out, width, matrix, full_range); } -// ---- High-bit 4:2:0 RGBA dispatchers (Ship 8 Tranche 5 prep) ---------- +// ---- High-bit 4:2:0 RGBA dispatchers (Ship 8 Tranche 5) --------------- // -// Scalar prep: dispatchers route through the new RGBA scalar kernels -// (`scalar::*_to_rgba*_row`). The `use_simd` parameter is held in the -// signature so the follow-up SIMD/backend PRs (Ship 8 Tranche 5a/5b) -// can fill in per-arch branches without breaking callers. +// u8 RGBA dispatchers route to per-arch SIMD kernels (Ship 8 Tranche +// 5a). u16 RGBA dispatchers stay scalar-only until the follow-up Ship +// 8 Tranche 5b PR adds u16 SIMD; their `use_simd` parameter is held in +// the signature so wiring 5b is a non-breaking change. /// Converts one row of **9-bit** YUV 4:2:0 to packed **8-bit** /// **RGBA** (`R, G, B, 0xFF`; alpha defaults to opaque since the @@ -2674,9 +2674,7 @@ pub fn p016_to_rgb_u16_row( /// for the per-pixel stride (4 vs 3) and the constant alpha byte. See /// `scalar::yuv_420p_n_to_rgba_row` for the reference. /// -/// `use_simd = false` forces scalar. SIMD per-arch routes land in the -/// follow-up Ship 8 Tranche 5a PR — for now this dispatcher always -/// runs the scalar reference regardless of `use_simd`. +/// `use_simd = false` forces the scalar reference path. 
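+///
+/// Illustrative caller sketch (argument order assumed to mirror the
+/// scalar kernel with the `use_simd` flag last; marked `ignore` because
+/// it is a sketch, not a doctest of the exact signature):
+///
+/// ```ignore
+/// let width = 16usize;
+/// let y = vec![256u16; width]; // 9-bit mid-gray luma
+/// let (u, v) = (vec![256u16; width / 2], vec![256u16; width / 2]); // neutral chroma
+/// let mut rgba = vec![0u8; width * 4];
+/// yuv420p9_to_rgba_row(&y, &u, &v, &mut rgba, width, ColorMatrix::Bt709, false, true);
+/// ```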
#[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] pub fn yuv420p9_to_rgba_row( @@ -2696,7 +2694,62 @@ pub fn yuv420p9_to_rgba_row( assert!(v_half.len() >= width / 2, "v_half row too short"); assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - let _ = use_simd; // SIMD per-arch routes land in Ship 8 Tranche 5a. + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified on this CPU; bounds / parity are + // the caller's obligation (asserted above). + unsafe { + arch::neon::yuv_420p_n_to_rgba_row::<9>(y, u_half, v_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_420p_n_to_rgba_row::<9>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_420p_n_to_rgba_row::<9>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_420p_n_to_rgba_row::<9>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_420p_n_to_rgba_row::<9>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + scalar::yuv_420p_n_to_rgba_row::<9>(y, u_half, v_half, rgba_out, width, matrix, full_range); } @@ -2740,9 +2793,7 @@ pub fn yuv420p9_to_rgba_u16_row( /// for the per-pixel stride (4 vs 3) and the constant alpha byte. See /// `scalar::yuv_420p_n_to_rgba_row` for the reference. /// -/// `use_simd = false` forces scalar. SIMD per-arch routes land in the -/// follow-up Ship 8 Tranche 5a PR — for now this dispatcher always -/// runs the scalar reference regardless of `use_simd`. +/// `use_simd = false` forces the scalar reference path. #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] pub fn yuv420p10_to_rgba_row( @@ -2762,7 +2813,61 @@ pub fn yuv420p10_to_rgba_row( assert!(v_half.len() >= width / 2, "v_half row too short"); assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - let _ = use_simd; // SIMD per-arch routes land in Ship 8 Tranche 5a. + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_420p_n_to_rgba_row::<10>(y, u_half, v_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_420p_n_to_rgba_row::<10>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_420p_n_to_rgba_row::<10>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_420p_n_to_rgba_row::<10>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. 
+ unsafe { + arch::wasm_simd128::yuv_420p_n_to_rgba_row::<10>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + scalar::yuv_420p_n_to_rgba_row::<10>(y, u_half, v_half, rgba_out, width, matrix, full_range); } @@ -2802,10 +2907,8 @@ pub fn yuv420p10_to_rgba_u16_row( /// high-bit-packed) to packed **8-bit** **RGBA**. Alpha defaults to /// `0xFF` (opaque). /// -/// See `scalar::p_n_to_rgba_row::<10>` for the reference. SIMD -/// per-arch routes land in the follow-up Ship 8 Tranche 5a PR — for -/// now this dispatcher always runs the scalar reference regardless of -/// `use_simd`. +/// See `scalar::p_n_to_rgba_row::<10>` for the reference. +/// `use_simd = false` forces the scalar reference path. #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] pub fn p010_to_rgba_row( @@ -2823,7 +2926,53 @@ pub fn p010_to_rgba_row( assert!(uv_half.len() >= width, "uv_half row too short"); assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - let _ = use_simd; // SIMD per-arch routes land in Ship 8 Tranche 5a. + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::p_n_to_rgba_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::p_n_to_rgba_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::p_n_to_rgba_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::p_n_to_rgba_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::p_n_to_rgba_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + scalar::p_n_to_rgba_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); } @@ -2864,9 +3013,7 @@ pub fn p010_to_rgba_u16_row( /// for the per-pixel stride (4 vs 3) and the constant alpha byte. See /// `scalar::yuv_420p_n_to_rgba_row` for the reference. /// -/// `use_simd = false` forces scalar. SIMD per-arch routes land in the -/// follow-up Ship 8 Tranche 5a PR — for now this dispatcher always -/// runs the scalar reference regardless of `use_simd`. +/// `use_simd = false` forces the scalar reference path. #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] pub fn yuv420p12_to_rgba_row( @@ -2886,7 +3033,61 @@ pub fn yuv420p12_to_rgba_row( assert!(v_half.len() >= width / 2, "v_half row too short"); assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - let _ = use_simd; // SIMD per-arch routes land in Ship 8 Tranche 5a. + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_420p_n_to_rgba_row::<12>(y, u_half, v_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. 
+ unsafe { + arch::x86_avx512::yuv_420p_n_to_rgba_row::<12>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_420p_n_to_rgba_row::<12>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_420p_n_to_rgba_row::<12>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_420p_n_to_rgba_row::<12>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + scalar::yuv_420p_n_to_rgba_row::<12>(y, u_half, v_half, rgba_out, width, matrix, full_range); } @@ -2930,9 +3131,7 @@ pub fn yuv420p12_to_rgba_u16_row( /// for the per-pixel stride (4 vs 3) and the constant alpha byte. See /// `scalar::yuv_420p_n_to_rgba_row` for the reference. /// -/// `use_simd = false` forces scalar. SIMD per-arch routes land in the -/// follow-up Ship 8 Tranche 5a PR — for now this dispatcher always -/// runs the scalar reference regardless of `use_simd`. +/// `use_simd = false` forces the scalar reference path. #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] pub fn yuv420p14_to_rgba_row( @@ -2952,7 +3151,61 @@ pub fn yuv420p14_to_rgba_row( assert!(v_half.len() >= width / 2, "v_half row too short"); assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - let _ = use_simd; // SIMD per-arch routes land in Ship 8 Tranche 5a. + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_420p_n_to_rgba_row::<14>(y, u_half, v_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_420p_n_to_rgba_row::<14>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_420p_n_to_rgba_row::<14>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_420p_n_to_rgba_row::<14>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_420p_n_to_rgba_row::<14>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + scalar::yuv_420p_n_to_rgba_row::<14>(y, u_half, v_half, rgba_out, width, matrix, full_range); } @@ -2992,10 +3245,8 @@ pub fn yuv420p14_to_rgba_u16_row( /// high-bit-packed) to packed **8-bit** **RGBA**. Alpha defaults to /// `0xFF` (opaque). /// -/// See `scalar::p_n_to_rgba_row::<12>` for the reference. SIMD -/// per-arch routes land in the follow-up Ship 8 Tranche 5a PR — for -/// now this dispatcher always runs the scalar reference regardless of -/// `use_simd`. +/// See `scalar::p_n_to_rgba_row::<12>` for the reference. +/// `use_simd = false` forces the scalar reference path. 
#[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] pub fn p012_to_rgba_row( @@ -3013,7 +3264,53 @@ pub fn p012_to_rgba_row( assert!(uv_half.len() >= width, "uv_half row too short"); assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - let _ = use_simd; // SIMD per-arch routes land in Ship 8 Tranche 5a. + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::p_n_to_rgba_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::p_n_to_rgba_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::p_n_to_rgba_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::p_n_to_rgba_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::p_n_to_rgba_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + scalar::p_n_to_rgba_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); } @@ -3051,8 +3348,8 @@ pub fn p012_to_rgba_u16_row( /// /// Routes through the dedicated 16-bit scalar kernel /// (`scalar::yuv_420p16_to_rgba_row`) — i32 chroma family is sufficient -/// for u8 output even at 16-bit input. SIMD per-arch routes land in -/// the follow-up Ship 8 Tranche 5a PR. +/// for u8 output even at 16-bit input. `use_simd = false` forces the +/// scalar reference path. #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] pub fn yuv420p16_to_rgba_row( @@ -3072,7 +3369,48 @@ pub fn yuv420p16_to_rgba_row( assert!(v_half.len() >= width / 2, "v_half row too short"); assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - let _ = use_simd; // SIMD per-arch routes land in Ship 8 Tranche 5a. + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + unsafe { + arch::neon::yuv_420p16_to_rgba_row(y, u_half, v_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + unsafe { + arch::x86_avx512::yuv_420p16_to_rgba_row(y, u_half, v_half, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + unsafe { + arch::x86_avx2::yuv_420p16_to_rgba_row(y, u_half, v_half, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + unsafe { + arch::x86_sse41::yuv_420p16_to_rgba_row(y, u_half, v_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + unsafe { + arch::wasm_simd128::yuv_420p16_to_rgba_row(y, u_half, v_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + scalar::yuv_420p16_to_rgba_row(y, u_half, v_half, rgba_out, width, matrix, full_range); } @@ -3111,8 +3449,8 @@ pub fn yuv420p16_to_rgba_u16_row( /// samples) to packed **8-bit** **RGBA**. Alpha defaults to `0xFF`. /// /// Routes through the dedicated 16-bit P016 scalar kernel -/// (`scalar::p16_to_rgba_row`). SIMD per-arch routes land in the -/// follow-up Ship 8 Tranche 5a PR. 
 #[cfg_attr(not(tarpaulin), inline(always))]
 #[allow(clippy::too_many_arguments)]
 pub fn yuv420p16_to_rgba_row(
@@ -3072,7 +3369,48 @@ pub fn yuv420p16_to_rgba_row(
     assert!(v_half.len() >= width / 2, "v_half row too short");
     assert!(rgba_out.len() >= rgba_min, "rgba_out row too short");
 
-    let _ = use_simd; // SIMD per-arch routes land in Ship 8 Tranche 5a.
+    if use_simd {
+        cfg_select! {
+            target_arch = "aarch64" => {
+                if neon_available() {
+                    // SAFETY: NEON verified.
+                    unsafe {
+                        arch::neon::yuv_420p16_to_rgba_row(y, u_half, v_half, rgba_out, width, matrix, full_range);
+                    }
+                    return;
+                }
+            },
+            target_arch = "x86_64" => {
+                if avx512_available() {
+                    // SAFETY: AVX-512BW verified.
+                    unsafe {
+                        arch::x86_avx512::yuv_420p16_to_rgba_row(y, u_half, v_half, rgba_out, width, matrix, full_range);
+                    }
+                    return;
+                }
+                if avx2_available() {
+                    // SAFETY: AVX2 verified.
+                    unsafe {
+                        arch::x86_avx2::yuv_420p16_to_rgba_row(y, u_half, v_half, rgba_out, width, matrix, full_range);
+                    }
+                    return;
+                }
+                if sse41_available() {
+                    // SAFETY: SSE4.1 verified.
+                    unsafe {
+                        arch::x86_sse41::yuv_420p16_to_rgba_row(y, u_half, v_half, rgba_out, width, matrix, full_range);
+                    }
+                    return;
+                }
+            },
+            target_arch = "wasm32" => {
+                if simd128_available() {
+                    // SAFETY: simd128 compile-time verified.
+                    unsafe {
+                        arch::wasm_simd128::yuv_420p16_to_rgba_row(y, u_half, v_half, rgba_out, width, matrix, full_range);
+                    }
+                    return;
+                }
+            },
+            _ => {}
+        }
+    }
     scalar::yuv_420p16_to_rgba_row(y, u_half, v_half, rgba_out, width, matrix, full_range);
 }
@@ -3111,8 +3449,8 @@ pub fn yuv420p16_to_rgba_u16_row(
 /// samples) to packed **8-bit** **RGBA**. Alpha defaults to `0xFF`.
 ///
 /// Routes through the dedicated 16-bit P016 scalar kernel
-/// (`scalar::p16_to_rgba_row`). SIMD per-arch routes land in the
-/// follow-up Ship 8 Tranche 5a PR.
+/// (`scalar::p16_to_rgba_row`). `use_simd = false` forces the scalar
+/// reference path.
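+///
+/// # Example
+///
+/// A minimal sketch (illustrative mid-range P016 samples; the
+/// assertion is limited to the constant alpha byte, and parameter
+/// order follows the scalar call, with `use_simd` last):
+///
+/// ```ignore
+/// let y = [32768u16, 32768]; // width = 2
+/// let uv = [32768u16, 32768]; // interleaved U0, V0
+/// let mut rgba = [0u8; 2 * 4];
+/// p016_to_rgba_row(&y, &uv, &mut rgba, 2, ColorMatrix::Bt601, false, true);
+/// assert!(rgba.chunks_exact(4).all(|px| px[3] == 0xFF));
+/// ```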
 #[cfg_attr(not(tarpaulin), inline(always))]
 #[allow(clippy::too_many_arguments)]
 pub fn p016_to_rgba_row(
@@ -3130,7 +3468,48 @@ pub fn p016_to_rgba_row(
     assert!(uv_half.len() >= width, "uv_half row too short");
     assert!(rgba_out.len() >= rgba_min, "rgba_out row too short");
 
-    let _ = use_simd; // SIMD per-arch routes land in Ship 8 Tranche 5a.
+    if use_simd {
+        cfg_select! {
+            target_arch = "aarch64" => {
+                if neon_available() {
+                    // SAFETY: NEON verified.
+                    unsafe {
+                        arch::neon::p16_to_rgba_row(y, uv_half, rgba_out, width, matrix, full_range);
+                    }
+                    return;
+                }
+            },
+            target_arch = "x86_64" => {
+                if avx512_available() {
+                    // SAFETY: AVX-512BW verified.
+                    unsafe {
+                        arch::x86_avx512::p16_to_rgba_row(y, uv_half, rgba_out, width, matrix, full_range);
+                    }
+                    return;
+                }
+                if avx2_available() {
+                    // SAFETY: AVX2 verified.
+                    unsafe {
+                        arch::x86_avx2::p16_to_rgba_row(y, uv_half, rgba_out, width, matrix, full_range);
+                    }
+                    return;
+                }
+                if sse41_available() {
+                    // SAFETY: SSE4.1 verified.
+                    unsafe {
+                        arch::x86_sse41::p16_to_rgba_row(y, uv_half, rgba_out, width, matrix, full_range);
+                    }
+                    return;
+                }
+            },
+            target_arch = "wasm32" => {
+                if simd128_available() {
+                    // SAFETY: simd128 compile-time verified.
+                    unsafe {
+                        arch::wasm_simd128::p16_to_rgba_row(y, uv_half, rgba_out, width, matrix, full_range);
+                    }
+                    return;
+                }
+            },
+            _ => {}
+        }
+    }
     scalar::p16_to_rgba_row(y, uv_half, rgba_out, width, matrix, full_range);
 }

From 00c73d62a6c7514a3ed46394706e5d0dd54deaf4 Mon Sep 17 00:00:00 2001
From: uqio <276879906+uqio@users.noreply.github.com>
Date: Mon, 27 Apr 2026 00:06:34 +1200
Subject: [PATCH 5/5] update

---
 src/row/arch/x86_avx2/tests.rs   | 18 ++++++++++++++++++
 src/row/arch/x86_avx512/tests.rs | 18 ++++++++++++++++++
 src/row/arch/x86_sse41/tests.rs  | 18 ++++++++++++++++++
 3 files changed, 54 insertions(+)

diff --git a/src/row/arch/x86_avx2/tests.rs b/src/row/arch/x86_avx2/tests.rs
index b0cd2ef..f8f28eb 100644
--- a/src/row/arch/x86_avx2/tests.rs
+++ b/src/row/arch/x86_avx2/tests.rs
@@ -1671,6 +1671,9 @@ fn check_p16_u8_avx2_rgba_equivalence(width: usize, matrix: ColorMatrix, full_ra
 
 #[test]
 fn avx2_yuv420p_n_rgba_matches_scalar_all_bits() {
+    if !std::arch::is_x86_feature_detected!("avx2") {
+        return;
+    }
     for m in [
         ColorMatrix::Bt601,
         ColorMatrix::Bt709,
@@ -1690,6 +1693,9 @@ fn avx2_yuv420p_n_rgba_matches_scalar_all_bits() {
 
 #[test]
 fn avx2_yuv420p_n_rgba_matches_scalar_tail_and_1920() {
+    if !std::arch::is_x86_feature_detected!("avx2") {
+        return;
+    }
     for w in [34usize, 48, 62, 1920, 1922] {
         check_planar_u8_avx2_rgba_equivalence_n::<9>(w, ColorMatrix::Bt601, false);
         check_planar_u8_avx2_rgba_equivalence_n::<10>(w, ColorMatrix::Bt709, true);
@@ -1700,6 +1706,9 @@ fn avx2_yuv420p_n_rgba_matches_scalar_tail_and_1920() {
 
 #[test]
 fn avx2_pn_rgba_matches_scalar_all_bits() {
+    if !std::arch::is_x86_feature_detected!("avx2") {
+        return;
+    }
     for m in [
         ColorMatrix::Bt601,
         ColorMatrix::Bt709,
@@ -1717,6 +1726,9 @@ fn avx2_pn_rgba_matches_scalar_all_bits() {
 
 #[test]
 fn avx2_pn_rgba_matches_scalar_tail_and_1920() {
+    if !std::arch::is_x86_feature_detected!("avx2") {
+        return;
+    }
     for w in [34usize, 48, 62, 1920, 1922] {
         check_pn_u8_avx2_rgba_equivalence_n::<10>(w, ColorMatrix::Bt601, false);
         check_pn_u8_avx2_rgba_equivalence_n::<12>(w, ColorMatrix::Bt709, true);
@@ -1725,6 +1737,9 @@ fn avx2_pn_rgba_matches_scalar_tail_and_1920() {
 
 #[test]
 fn avx2_yuv420p16_rgba_matches_scalar_all_matrices() {
+    if !std::arch::is_x86_feature_detected!("avx2") {
+        return;
+    }
     for m in [
         ColorMatrix::Bt601,
         ColorMatrix::Bt709,
@@ -1744,6 +1759,9 @@ fn avx2_yuv420p16_rgba_matches_scalar_all_matrices() {
 
 #[test]
 fn avx2_p016_rgba_matches_scalar_all_matrices() {
+    if !std::arch::is_x86_feature_detected!("avx2") {
+        return;
+    }
     for m in [
         ColorMatrix::Bt601,
         ColorMatrix::Bt709,
diff --git a/src/row/arch/x86_avx512/tests.rs b/src/row/arch/x86_avx512/tests.rs
index 9afb142..2591060 100644
--- a/src/row/arch/x86_avx512/tests.rs
+++ b/src/row/arch/x86_avx512/tests.rs
@@ -1685,6 +1685,9 @@ fn check_p16_u8_avx512_rgba_equivalence(width: usize, matrix: ColorMatrix, full_
 
 #[test]
 fn avx512_yuv420p_n_rgba_matches_scalar_all_bits() {
+    if !std::arch::is_x86_feature_detected!("avx512bw") {
+        return;
+    }
     for m in [
         ColorMatrix::Bt601,
         ColorMatrix::Bt709,
@@ -1704,6 +1707,9 @@ fn avx512_yuv420p_n_rgba_matches_scalar_all_bits() {
 
 #[test]
 fn avx512_yuv420p_n_rgba_matches_scalar_tail_and_1920() {
+    if !std::arch::is_x86_feature_detected!("avx512bw") {
+        return;
+    }
     for w in [66usize, 96, 126, 1920, 1922] {
         check_planar_u8_avx512_rgba_equivalence_n::<9>(w, ColorMatrix::Bt601, false);
         check_planar_u8_avx512_rgba_equivalence_n::<10>(w, ColorMatrix::Bt709, true);
@@ -1714,6 +1720,9 @@ fn avx512_yuv420p_n_rgba_matches_scalar_tail_and_1920() {
 
 #[test]
 fn avx512_pn_rgba_matches_scalar_all_bits() {
+    if !std::arch::is_x86_feature_detected!("avx512bw") {
+        return;
+    }
     for m in [
         ColorMatrix::Bt601,
         ColorMatrix::Bt709,
@@ -1731,6 +1740,9 @@ fn avx512_pn_rgba_matches_scalar_all_bits() {
 
 #[test]
 fn avx512_pn_rgba_matches_scalar_tail_and_1920() {
+    if !std::arch::is_x86_feature_detected!("avx512bw") {
+        return;
+    }
     for w in [66usize, 96, 126, 1920, 1922] {
         check_pn_u8_avx512_rgba_equivalence_n::<10>(w, ColorMatrix::Bt601, false);
         check_pn_u8_avx512_rgba_equivalence_n::<12>(w, ColorMatrix::Bt709, true);
@@ -1739,6 +1751,9 @@ fn avx512_pn_rgba_matches_scalar_tail_and_1920() {
 
 #[test]
 fn avx512_yuv420p16_rgba_matches_scalar_all_matrices() {
+    if !std::arch::is_x86_feature_detected!("avx512bw") {
+        return;
+    }
     for m in [
         ColorMatrix::Bt601,
         ColorMatrix::Bt709,
@@ -1758,6 +1773,9 @@ fn avx512_yuv420p16_rgba_matches_scalar_all_matrices() {
 
 #[test]
 fn avx512_p016_rgba_matches_scalar_all_matrices() {
+    if !std::arch::is_x86_feature_detected!("avx512bw") {
+        return;
+    }
     for m in [
         ColorMatrix::Bt601,
         ColorMatrix::Bt709,
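The same three-line guard now opens every gated test in all three files. A possible follow-up, sketched here only (`require_x86_feature!` is a hypothetical name, not part of this patch), would factor the repetition into a small macro:

macro_rules! require_x86_feature {
    // `tt` so the literal can be re-matched by `is_x86_feature_detected!`.
    ($feat:tt) => {
        if !std::arch::is_x86_feature_detected!($feat) {
            return; // silently skip on hosts without the feature
        }
    };
}

#[test]
fn avx2_example_gated() {
    require_x86_feature!("avx2");
    // equivalence checks run only on AVX2-capable hosts
}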
diff --git a/src/row/arch/x86_sse41/tests.rs b/src/row/arch/x86_sse41/tests.rs
index 239b943..284f2b3 100644
--- a/src/row/arch/x86_sse41/tests.rs
+++ b/src/row/arch/x86_sse41/tests.rs
@@ -1707,6 +1707,9 @@ fn check_p16_u8_sse41_rgba_equivalence(width: usize, matrix: ColorMatrix, full_r
 
 #[test]
 fn sse41_yuv420p_n_rgba_matches_scalar_all_bits() {
+    if !std::arch::is_x86_feature_detected!("sse4.1") {
+        return;
+    }
     for m in [
         ColorMatrix::Bt601,
         ColorMatrix::Bt709,
@@ -1726,6 +1729,9 @@ fn sse41_yuv420p_n_rgba_matches_scalar_all_bits() {
 
 #[test]
 fn sse41_yuv420p_n_rgba_matches_scalar_tail_and_1920() {
+    if !std::arch::is_x86_feature_detected!("sse4.1") {
+        return;
+    }
     for w in [18usize, 30, 34, 1920, 1922] {
         check_planar_u8_sse41_rgba_equivalence_n::<9>(w, ColorMatrix::Bt601, false);
         check_planar_u8_sse41_rgba_equivalence_n::<10>(w, ColorMatrix::Bt709, true);
@@ -1736,6 +1742,9 @@ fn sse41_yuv420p_n_rgba_matches_scalar_tail_and_1920() {
 
 #[test]
 fn sse41_pn_rgba_matches_scalar_all_bits() {
+    if !std::arch::is_x86_feature_detected!("sse4.1") {
+        return;
+    }
     for m in [
         ColorMatrix::Bt601,
         ColorMatrix::Bt709,
@@ -1753,6 +1762,9 @@ fn sse41_pn_rgba_matches_scalar_all_bits() {
 
 #[test]
 fn sse41_pn_rgba_matches_scalar_tail_and_1920() {
+    if !std::arch::is_x86_feature_detected!("sse4.1") {
+        return;
+    }
     for w in [18usize, 30, 34, 1920, 1922] {
         check_pn_u8_sse41_rgba_equivalence_n::<10>(w, ColorMatrix::Bt601, false);
         check_pn_u8_sse41_rgba_equivalence_n::<12>(w, ColorMatrix::Bt709, true);
@@ -1761,6 +1773,9 @@ fn sse41_pn_rgba_matches_scalar_tail_and_1920() {
 
 #[test]
 fn sse41_yuv420p16_rgba_matches_scalar_all_matrices() {
+    if !std::arch::is_x86_feature_detected!("sse4.1") {
+        return;
+    }
     for m in [
         ColorMatrix::Bt601,
         ColorMatrix::Bt709,
@@ -1780,6 +1795,9 @@ fn sse41_yuv420p16_rgba_matches_scalar_all_matrices() {
 
 #[test]
 fn sse41_p016_rgba_matches_scalar_all_matrices() {
+    if !std::arch::is_x86_feature_detected!("sse4.1") {
+        return;
+    }
     for m in [
         ColorMatrix::Bt601,
         ColorMatrix::Bt709,
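For reference, the `check_*` helpers these guards protect all follow the same scalar-vs-SIMD shape. A condensed sketch under stated assumptions (the helper name and sample values are illustrative, kernel signatures are as in the dispatchers above, and `width` is even per 4:2:0):

fn check_yuv420p16_rgba_equivalence_sketch(width: usize) {
    assert_eq!(width & 1, 0, "4:2:0 rows must be even-width");
    // Deterministic, non-trivial 16-bit samples.
    let y: Vec<u16> = (0..width).map(|i| (i as u16).wrapping_mul(2557)).collect();
    let u: Vec<u16> = (0..width / 2)
        .map(|i| 32768u16.wrapping_add((i as u16).wrapping_mul(97)))
        .collect();
    let v: Vec<u16> = (0..width / 2)
        .map(|i| 32768u16.wrapping_sub((i as u16).wrapping_mul(53)))
        .collect();

    let mut scalar_out = vec![0u8; width * 4];
    let mut simd_out = vec![0u8; width * 4];

    scalar::yuv_420p16_to_rgba_row(&y, &u, &v, &mut scalar_out, width, ColorMatrix::Bt709, false);
    // SAFETY: callers gate on is_x86_feature_detected!("sse4.1") first.
    unsafe {
        arch::x86_sse41::yuv_420p16_to_rgba_row(&y, &u, &v, &mut simd_out, width, ColorMatrix::Bt709, false);
    }

    assert_eq!(simd_out, scalar_out, "SIMD row must match the scalar reference byte-for-byte");
}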