From 44a23b9037bd5bef0f7358c68e3626b9512582bc Mon Sep 17 00:00:00 2001 From: uqio <276879906+uqio@users.noreply.github.com> Date: Sun, 26 Apr 2026 15:53:47 +1200 Subject: [PATCH 1/2] update --- src/row/arch/neon.rs | 288 ++++++++++++++++---- src/row/arch/wasm_simd128.rs | 237 ++++++++++++++--- src/row/arch/x86_avx2.rs | 281 +++++++++++++++++--- src/row/arch/x86_avx512.rs | 281 +++++++++++++++++--- src/row/arch/x86_sse41.rs | 278 +++++++++++++++++--- src/row/mod.rs | 144 ++++++++++ src/row/scalar.rs | 94 +++++-- src/sinker/mixed.rs | 490 +++++++++++++++++++++++++++++++++-- 8 files changed, 1844 insertions(+), 249 deletions(-) diff --git a/src/row/arch/neon.rs b/src/row/arch/neon.rs index 24f4ae1..ae95884 100644 --- a/src/row/arch/neon.rs +++ b/src/row/arch/neon.rs @@ -1078,12 +1078,9 @@ pub(crate) unsafe fn p_n_to_rgb_u16_row( } } -/// NEON NV12 → packed RGB (UV-ordered chroma). Thin wrapper over the -/// shared [`nv12_or_nv21_to_rgb_row_impl`] with `SWAP_UV = false`. -/// -/// # Safety -/// -/// Same as [`nv12_or_nv21_to_rgb_row_impl`]. +/// NEON NV12 → packed RGB (UV-ordered chroma). Thin wrapper over +/// [`nv12_or_nv21_to_rgb_or_rgba_row_impl`] with +/// `SWAP_UV = false, ALPHA = false`. #[inline] #[target_feature(enable = "neon")] pub(crate) unsafe fn nv12_to_rgb_row( @@ -1096,16 +1093,15 @@ pub(crate) unsafe fn nv12_to_rgb_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - nv12_or_nv21_to_rgb_row_impl::(y, uv_half, rgb_out, width, matrix, full_range); + nv12_or_nv21_to_rgb_or_rgba_row_impl::( + y, uv_half, rgb_out, width, matrix, full_range, + ); } } /// NEON NV21 → packed RGB (VU-ordered chroma). Thin wrapper over -/// [`nv12_or_nv21_to_rgb_row_impl`] with `SWAP_UV = true`. -/// -/// # Safety -/// -/// Same as [`nv12_or_nv21_to_rgb_row_impl`]. +/// [`nv12_or_nv21_to_rgb_or_rgba_row_impl`] with +/// `SWAP_UV = true, ALPHA = false`. #[inline] #[target_feature(enable = "neon")] pub(crate) unsafe fn nv21_to_rgb_row( @@ -1118,15 +1114,62 @@ pub(crate) unsafe fn nv21_to_rgb_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - nv12_or_nv21_to_rgb_row_impl::(y, vu_half, rgb_out, width, matrix, full_range); + nv12_or_nv21_to_rgb_or_rgba_row_impl::( + y, vu_half, rgb_out, width, matrix, full_range, + ); + } +} + +/// NEON NV12 → packed RGBA (R, G, B, `0xFF` per pixel). Same +/// contract as [`nv12_to_rgb_row`] but writes 4 bytes per pixel via +/// `vst4q_u8`. `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "neon")] +pub(crate) unsafe fn nv12_to_rgba_row( + y: &[u8], + uv_half: &[u8], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + nv12_or_nv21_to_rgb_or_rgba_row_impl::( + y, uv_half, rgba_out, width, matrix, full_range, + ); + } +} + +/// NEON NV21 → packed RGBA (R, G, B, `0xFF` per pixel). Same +/// contract as [`nv21_to_rgb_row`] but writes 4 bytes per pixel via +/// `vst4q_u8`. `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "neon")] +pub(crate) unsafe fn nv21_to_rgba_row( + y: &[u8], + vu_half: &[u8], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + nv12_or_nv21_to_rgb_or_rgba_row_impl::( + y, vu_half, rgba_out, width, matrix, full_range, + ); } } -/// Shared NEON NV12/NV21 kernel. 
`SWAP_UV = false` selects NV12 -/// (even byte = U, odd = V); `SWAP_UV = true` selects NV21 (even = -/// V, odd = U). The const generic drives monomorphization — the -/// branch is eliminated in each instantiation and both wrappers -/// produce byte‑identical output to the scalar reference. +/// Shared NEON NV12/NV21 kernel at 3 bpp (RGB) or 4 bpp + opaque +/// alpha (RGBA). `SWAP_UV = false` selects NV12 (even byte = U, odd = +/// V); `SWAP_UV = true` selects NV21 (even = V, odd = U). `ALPHA = +/// true` writes via `vst4q_u8` with constant `0xFF` alpha; `ALPHA = +/// false` writes via `vst3q_u8`. Both const generics drive +/// compile-time monomorphization — branches are eliminated and each +/// of the four wrappers produces byte‑identical output to the scalar +/// reference. /// /// # Safety /// @@ -1135,17 +1178,20 @@ pub(crate) unsafe fn nv21_to_rgb_row( /// 2. `width & 1 == 0`. /// 3. `y.len() >= width`. /// 4. `uv_or_vu_half.len() >= width` (2 × (width / 2) interleaved bytes). -/// 5. `rgb_out.len() >= 3 * width`. +/// 5. `out.len() >= width * (if ALPHA { 4 } else { 3 })`. /// /// Bounds are `debug_assert`-checked; release builds trust the caller /// because the kernel uses unchecked pointer arithmetic (`vld1q_u8`, -/// `vld2_u8`, `vst3q_u8`). +/// `vld2_u8`, `vst3q_u8` / `vst4q_u8`). #[inline] #[target_feature(enable = "neon")] -unsafe fn nv12_or_nv21_to_rgb_row_impl( +pub(crate) unsafe fn nv12_or_nv21_to_rgb_or_rgba_row_impl< + const SWAP_UV: bool, + const ALPHA: bool, +>( y: &[u8], uv_or_vu_half: &[u8], - rgb_out: &mut [u8], + out: &mut [u8], width: usize, matrix: ColorMatrix, full_range: bool, @@ -1153,7 +1199,8 @@ unsafe fn nv12_or_nv21_to_rgb_row_impl( debug_assert_eq!(width & 1, 0, "NV12/NV21 require even width"); debug_assert!(y.len() >= width); debug_assert!(uv_or_vu_half.len() >= width); - debug_assert!(rgb_out.len() >= width * 3); + let bpp: usize = if ALPHA { 4 } else { 3 }; + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params(full_range); @@ -1174,6 +1221,9 @@ unsafe fn nv12_or_nv21_to_rgb_row_impl( let cgv = vdupq_n_s32(coeffs.g_v()); let cbu = vdupq_n_s32(coeffs.b_u()); let cbv = vdupq_n_s32(coeffs.b_v()); + // Constant opaque-alpha vector for the RGBA path; DCE'd when + // ALPHA = false. + let alpha_u8 = vdupq_n_u8(0xFF); let mut x = 0usize; while x + 16 <= width { @@ -1233,33 +1283,37 @@ unsafe fn nv12_or_nv21_to_rgb_row_impl( vqmovun_s16(vqaddq_s16(y_scaled_hi, r_dup_hi)), ); - let rgb = uint8x16x3_t(r_u8, g_u8, b_u8); - vst3q_u8(rgb_out.as_mut_ptr().add(x * 3), rgb); + if ALPHA { + let rgba = uint8x16x4_t(r_u8, g_u8, b_u8, alpha_u8); + vst4q_u8(out.as_mut_ptr().add(x * 4), rgba); + } else { + let rgb = uint8x16x3_t(r_u8, g_u8, b_u8); + vst3q_u8(out.as_mut_ptr().add(x * 3), rgb); + } x += 16; } // Scalar tail for the 0..14 leftover pixels. Dispatch to the - // matching scalar kernel based on SWAP_UV. + // matching scalar kernel based on SWAP_UV × ALPHA. 
if x < width { - if SWAP_UV { - scalar::nv21_to_rgb_row( - &y[x..width], - &uv_or_vu_half[x..width], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); - } else { - scalar::nv12_to_rgb_row( - &y[x..width], - &uv_or_vu_half[x..width], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_uv = &uv_or_vu_half[x..width]; + let tail_w = width - x; + let tail_out = &mut out[x * bpp..width * bpp]; + match (SWAP_UV, ALPHA) { + (false, false) => { + scalar::nv12_to_rgb_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range) + } + (true, false) => { + scalar::nv21_to_rgb_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range) + } + (false, true) => { + scalar::nv12_to_rgba_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range) + } + (true, true) => { + scalar::nv21_to_rgba_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range) + } } } } @@ -3699,6 +3753,152 @@ mod tests { } } + // ---- nv12_to_rgba_row / nv21_to_rgba_row equivalence ---------------- + // + // Direct backend tests for the new RGBA path, mirroring the RGB + // pattern above. Bypasses the dispatcher so the NEON `vst4q_u8` + // store is exercised regardless of what tier the dispatcher picks. + + fn check_nv12_rgba_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { + let y: std::vec::Vec = (0..width).map(|i| ((i * 37 + 11) & 0xFF) as u8).collect(); + let uv: std::vec::Vec = (0..width / 2) + .flat_map(|i| [((i * 53 + 23) & 0xFF) as u8, ((i * 71 + 91) & 0xFF) as u8]) + .collect(); + let mut rgba_scalar = std::vec![0u8; width * 4]; + let mut rgba_neon = std::vec![0u8; width * 4]; + + scalar::nv12_to_rgba_row(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + unsafe { + nv12_to_rgba_row(&y, &uv, &mut rgba_neon, width, matrix, full_range); + } + + if rgba_scalar != rgba_neon { + let first_diff = rgba_scalar + .iter() + .zip(rgba_neon.iter()) + .position(|(a, b)| a != b) + .unwrap(); + let pixel = first_diff / 4; + let channel = ["R", "G", "B", "A"][first_diff % 4]; + panic!( + "NEON NV12 RGBA diverges from scalar at byte {first_diff} (px {pixel} {channel}, width={width}, matrix={matrix:?}, full_range={full_range}): scalar={} neon={}", + rgba_scalar[first_diff], rgba_neon[first_diff] + ); + } + } + + fn check_nv21_rgba_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { + let y: std::vec::Vec = (0..width).map(|i| ((i * 37 + 11) & 0xFF) as u8).collect(); + let vu: std::vec::Vec = (0..width / 2) + .flat_map(|i| [((i * 53 + 23) & 0xFF) as u8, ((i * 71 + 91) & 0xFF) as u8]) + .collect(); + let mut rgba_scalar = std::vec![0u8; width * 4]; + let mut rgba_neon = std::vec![0u8; width * 4]; + + scalar::nv21_to_rgba_row(&y, &vu, &mut rgba_scalar, width, matrix, full_range); + unsafe { + nv21_to_rgba_row(&y, &vu, &mut rgba_neon, width, matrix, full_range); + } + + if rgba_scalar != rgba_neon { + let first_diff = rgba_scalar + .iter() + .zip(rgba_neon.iter()) + .position(|(a, b)| a != b) + .unwrap(); + let pixel = first_diff / 4; + let channel = ["R", "G", "B", "A"][first_diff % 4]; + panic!( + "NEON NV21 RGBA diverges from scalar at byte {first_diff} (px {pixel} {channel}, width={width}, matrix={matrix:?}, full_range={full_range}): scalar={} neon={}", + rgba_scalar[first_diff], rgba_neon[first_diff] + ); + } + } + + /// Cross-format invariant: NV12 RGBA must match Yuv420p RGBA on + /// equivalent UV bytes. Catches U/V swap regressions specific to + /// the new RGBA store path. 
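For reference, a minimal sketch (hypothetical sample values) of the layout relationship the invariant above relies on: NV12 interleaves the planar U and V rows UV-first, NV21 VU-first.

    // Planar 4:2:0 chroma for a 4-pixel row: 2 U samples, 2 V samples.
    let u = [90u8, 110];
    let v = [240u8, 16];
    // NV12 interleaves them UV-first; NV21 stores the same pairs VU-first.
    let uv: Vec<u8> = u.iter().zip(v.iter()).flat_map(|(&a, &b)| [a, b]).collect();
    assert_eq!(uv, [90, 240, 110, 16]);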
+ fn check_nv12_rgba_matches_yuv420p_rgba(width: usize, matrix: ColorMatrix, full_range: bool) { + let y: std::vec::Vec = (0..width).map(|i| ((i * 37 + 11) & 0xFF) as u8).collect(); + let u: std::vec::Vec = (0..width / 2) + .map(|i| ((i * 53 + 23) & 0xFF) as u8) + .collect(); + let v: std::vec::Vec = (0..width / 2) + .map(|i| ((i * 71 + 91) & 0xFF) as u8) + .collect(); + let uv: std::vec::Vec = u.iter().zip(v.iter()).flat_map(|(a, b)| [*a, *b]).collect(); + + let mut rgba_yuv420p = std::vec![0u8; width * 4]; + let mut rgba_nv12 = std::vec![0u8; width * 4]; + unsafe { + yuv_420_to_rgba_row(&y, &u, &v, &mut rgba_yuv420p, width, matrix, full_range); + nv12_to_rgba_row(&y, &uv, &mut rgba_nv12, width, matrix, full_range); + } + assert_eq!( + rgba_yuv420p, rgba_nv12, + "NEON NV12 RGBA must match Yuv420p RGBA for equivalent UV (width={width}, matrix={matrix:?}, full_range={full_range})" + ); + } + + #[test] + #[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] + fn nv12_neon_rgba_matches_scalar_all_matrices_16() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_nv12_rgba_equivalence(16, m, full); + } + } + } + + #[test] + #[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] + fn nv12_neon_rgba_matches_scalar_widths() { + for w in [18usize, 30, 34, 1920, 1922] { + check_nv12_rgba_equivalence(w, ColorMatrix::Bt601, false); + } + } + + #[test] + #[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] + fn nv12_neon_rgba_matches_yuv420p_rgba_neon() { + for w in [16usize, 30, 64, 1920] { + check_nv12_rgba_matches_yuv420p_rgba(w, ColorMatrix::Bt709, false); + check_nv12_rgba_matches_yuv420p_rgba(w, ColorMatrix::YCgCo, true); + } + } + + #[test] + #[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] + fn nv21_neon_rgba_matches_scalar_all_matrices_16() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_nv21_rgba_equivalence(16, m, full); + } + } + } + + #[test] + #[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] + fn nv21_neon_rgba_matches_scalar_widths() { + for w in [18usize, 30, 34, 1920, 1922] { + check_nv21_rgba_equivalence(w, ColorMatrix::Bt601, false); + } + } + // ---- nv24_to_rgb_row / nv42_to_rgb_row equivalence ------------------ fn check_nv24_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { diff --git a/src/row/arch/wasm_simd128.rs b/src/row/arch/wasm_simd128.rs index 2ff0619..f8a794d 100644 --- a/src/row/arch/wasm_simd128.rs +++ b/src/row/arch/wasm_simd128.rs @@ -1317,12 +1317,9 @@ unsafe fn deinterleave_uv_u16_wasm(ptr: *const u16) -> (v128, v128) { } } -/// WASM simd128 NV12 → packed RGB (UV-ordered chroma). Thin wrapper -/// over [`nv12_or_nv21_to_rgb_row_impl`] with `SWAP_UV = false`. -/// -/// # Safety -/// -/// Same as [`nv12_or_nv21_to_rgb_row_impl`]. +/// WASM simd128 NV12 → packed RGB. Thin wrapper over +/// [`nv12_or_nv21_to_rgb_or_rgba_row_impl`] with +/// `SWAP_UV = false, ALPHA = false`. #[inline] #[target_feature(enable = "simd128")] pub(crate) unsafe fn nv12_to_rgb_row( @@ -1333,18 +1330,16 @@ pub(crate) unsafe fn nv12_to_rgb_row( matrix: ColorMatrix, full_range: bool, ) { - // SAFETY: caller obligations forwarded to the shared impl. 
unsafe { - nv12_or_nv21_to_rgb_row_impl::(y, uv_half, rgb_out, width, matrix, full_range); + nv12_or_nv21_to_rgb_or_rgba_row_impl::( + y, uv_half, rgb_out, width, matrix, full_range, + ); } } -/// WASM simd128 NV21 → packed RGB (VU-ordered chroma). Thin wrapper -/// over [`nv12_or_nv21_to_rgb_row_impl`] with `SWAP_UV = true`. -/// -/// # Safety -/// -/// Same as [`nv12_or_nv21_to_rgb_row_impl`]. +/// WASM simd128 NV21 → packed RGB. Thin wrapper over +/// [`nv12_or_nv21_to_rgb_or_rgba_row_impl`] with +/// `SWAP_UV = true, ALPHA = false`. #[inline] #[target_feature(enable = "simd128")] pub(crate) unsafe fn nv21_to_rgb_row( @@ -1355,29 +1350,75 @@ pub(crate) unsafe fn nv21_to_rgb_row( matrix: ColorMatrix, full_range: bool, ) { - // SAFETY: caller obligations forwarded to the shared impl. unsafe { - nv12_or_nv21_to_rgb_row_impl::(y, vu_half, rgb_out, width, matrix, full_range); + nv12_or_nv21_to_rgb_or_rgba_row_impl::( + y, vu_half, rgb_out, width, matrix, full_range, + ); + } +} + +/// WASM simd128 NV12 → packed RGBA. Same contract as +/// [`nv12_to_rgb_row`] but writes 4 bytes per pixel via +/// [`write_rgba_16`]. `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "simd128")] +pub(crate) unsafe fn nv12_to_rgba_row( + y: &[u8], + uv_half: &[u8], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + unsafe { + nv12_or_nv21_to_rgb_or_rgba_row_impl::( + y, uv_half, rgba_out, width, matrix, full_range, + ); + } +} + +/// WASM simd128 NV21 → packed RGBA. Same contract as +/// [`nv21_to_rgb_row`] but writes 4 bytes per pixel via +/// [`write_rgba_16`]. `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "simd128")] +pub(crate) unsafe fn nv21_to_rgba_row( + y: &[u8], + vu_half: &[u8], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + unsafe { + nv12_or_nv21_to_rgb_or_rgba_row_impl::( + y, vu_half, rgba_out, width, matrix, full_range, + ); } } -/// Shared wasm simd128 NV12/NV21 kernel. `SWAP_UV` selects chroma -/// byte order at compile time. +/// Shared wasm simd128 NV12/NV21 kernel at 3 bpp (RGB) or 4 bpp + +/// opaque alpha (RGBA). `SWAP_UV` selects chroma byte order; +/// `ALPHA = true` writes via [`write_rgba_16`], `ALPHA = false` via +/// [`write_rgb_16`]. Both const generics drive compile-time +/// monomorphization. /// /// # Safety /// -/// 1. **simd128 must be enabled at compile time** (same obligation as -/// [`yuv_420_to_rgb_row`]). +/// 1. **simd128 must be enabled at compile time.** /// 2. `width & 1 == 0`. /// 3. `y.len() >= width`. /// 4. `uv_or_vu_half.len() >= width` (16 interleaved bytes per 16 Y pixels). -/// 5. `rgb_out.len() >= 3 * width`. +/// 5. `out.len() >= width * (if ALPHA { 4 } else { 3 })`. 
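A minimal standalone sketch of the `ALPHA` const-generic store pattern shared by these kernels (hypothetical one-pixel helper; the vector kernels do the equivalent 16 pixels at a time via `write_rgb_16` / `write_rgba_16`):

    // `ALPHA` is resolved at compile time, so each monomorphized
    // instantiation keeps a single store path with a fixed stride.
    fn store_px<const ALPHA: bool>(out: &mut [u8], i: usize, r: u8, g: u8, b: u8) {
        let bpp = if ALPHA { 4 } else { 3 };
        out[i * bpp] = r;
        out[i * bpp + 1] = g;
        out[i * bpp + 2] = b;
        if ALPHA {
            out[i * bpp + 3] = 0xFF; // constant opaque alpha
        }
    }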
#[inline] #[target_feature(enable = "simd128")] -unsafe fn nv12_or_nv21_to_rgb_row_impl( +pub(crate) unsafe fn nv12_or_nv21_to_rgb_or_rgba_row_impl< + const SWAP_UV: bool, + const ALPHA: bool, +>( y: &[u8], uv_or_vu_half: &[u8], - rgb_out: &mut [u8], + out: &mut [u8], width: usize, matrix: ColorMatrix, full_range: bool, @@ -1385,7 +1426,8 @@ unsafe fn nv12_or_nv21_to_rgb_row_impl( debug_assert_eq!(width & 1, 0, "NV12/NV21 require even width"); debug_assert!(y.len() >= width); debug_assert!(uv_or_vu_half.len() >= width); - debug_assert!(rgb_out.len() >= width * 3); + let bpp: usize = if ALPHA { 4 } else { 3 }; + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params(full_range); @@ -1407,6 +1449,7 @@ unsafe fn nv12_or_nv21_to_rgb_row_impl( let cgv = i32x4_splat(coeffs.g_v()); let cbu = i32x4_splat(coeffs.b_u()); let cbv = i32x4_splat(coeffs.b_v()); + let alpha_u8 = u8x16_splat(0xFF); let mut x = 0usize; while x + 16 <= width { @@ -1502,30 +1545,33 @@ unsafe fn nv12_or_nv21_to_rgb_row_impl( let g_u8 = u8x16_narrow_i16x8(g_lo, g_hi); let r_u8 = u8x16_narrow_i16x8(r_lo, r_hi); - write_rgb_16(r_u8, g_u8, b_u8, rgb_out.as_mut_ptr().add(x * 3)); + if ALPHA { + write_rgba_16(r_u8, g_u8, b_u8, alpha_u8, out.as_mut_ptr().add(x * 4)); + } else { + write_rgb_16(r_u8, g_u8, b_u8, out.as_mut_ptr().add(x * 3)); + } x += 16; } if x < width { - if SWAP_UV { - scalar::nv21_to_rgb_row( - &y[x..width], - &uv_or_vu_half[x..width], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); - } else { - scalar::nv12_to_rgb_row( - &y[x..width], - &uv_or_vu_half[x..width], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_uv = &uv_or_vu_half[x..width]; + let tail_w = width - x; + let tail_out = &mut out[x * bpp..width * bpp]; + match (SWAP_UV, ALPHA) { + (false, false) => { + scalar::nv12_to_rgb_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range) + } + (true, false) => { + scalar::nv21_to_rgb_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range) + } + (false, true) => { + scalar::nv12_to_rgba_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range) + } + (true, true) => { + scalar::nv21_to_rgba_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range) + } } } } @@ -3877,6 +3923,111 @@ mod tests { check_nv21_matches_nv12_swapped(w, ColorMatrix::YCgCo, true); } } + + // ---- nv12_to_rgba_row / nv21_to_rgba_row equivalence ---------------- + + fn check_nv12_rgba_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { + let y: std::vec::Vec = (0..width).map(|i| ((i * 37 + 11) & 0xFF) as u8).collect(); + let uv: std::vec::Vec = (0..width / 2) + .flat_map(|i| [((i * 53 + 23) & 0xFF) as u8, ((i * 71 + 91) & 0xFF) as u8]) + .collect(); + let mut rgba_scalar = std::vec![0u8; width * 4]; + let mut rgba_wasm = std::vec![0u8; width * 4]; + + scalar::nv12_to_rgba_row(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + unsafe { + nv12_to_rgba_row(&y, &uv, &mut rgba_wasm, width, matrix, full_range); + } + + if rgba_scalar != rgba_wasm { + let first_diff = rgba_scalar + .iter() + .zip(rgba_wasm.iter()) + .position(|(a, b)| a != b) + .unwrap(); + let pixel = first_diff / 4; + let channel = ["R", "G", "B", "A"][first_diff % 4]; + panic!( + "wasm simd128 NV12 RGBA diverges from scalar at byte {first_diff} (px {pixel} {channel}, width={width}, matrix={matrix:?}, full_range={full_range}): scalar={} wasm={}", + 
rgba_scalar[first_diff], rgba_wasm[first_diff] + ); + } + } + + fn check_nv21_rgba_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { + let y: std::vec::Vec = (0..width).map(|i| ((i * 37 + 11) & 0xFF) as u8).collect(); + let vu: std::vec::Vec = (0..width / 2) + .flat_map(|i| [((i * 53 + 23) & 0xFF) as u8, ((i * 71 + 91) & 0xFF) as u8]) + .collect(); + let mut rgba_scalar = std::vec![0u8; width * 4]; + let mut rgba_wasm = std::vec![0u8; width * 4]; + + scalar::nv21_to_rgba_row(&y, &vu, &mut rgba_scalar, width, matrix, full_range); + unsafe { + nv21_to_rgba_row(&y, &vu, &mut rgba_wasm, width, matrix, full_range); + } + + if rgba_scalar != rgba_wasm { + let first_diff = rgba_scalar + .iter() + .zip(rgba_wasm.iter()) + .position(|(a, b)| a != b) + .unwrap(); + let pixel = first_diff / 4; + let channel = ["R", "G", "B", "A"][first_diff % 4]; + panic!( + "wasm simd128 NV21 RGBA diverges from scalar at byte {first_diff} (px {pixel} {channel}, width={width}, matrix={matrix:?}, full_range={full_range}): scalar={} wasm={}", + rgba_scalar[first_diff], rgba_wasm[first_diff] + ); + } + } + + #[test] + fn nv12_wasm_rgba_matches_scalar_all_matrices_16() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_nv12_rgba_equivalence(16, m, full); + } + } + } + + #[test] + fn nv12_wasm_rgba_matches_scalar_widths() { + for w in [18usize, 30, 34, 1920, 1922] { + check_nv12_rgba_equivalence(w, ColorMatrix::Bt601, false); + } + } + + #[test] + fn nv21_wasm_rgba_matches_scalar_all_matrices_16() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_nv21_rgba_equivalence(16, m, full); + } + } + } + + #[test] + fn nv21_wasm_rgba_matches_scalar_widths() { + for w in [18usize, 30, 34, 1920, 1922] { + check_nv21_rgba_equivalence(w, ColorMatrix::Bt601, false); + } + } + // ---- rgb_to_hsv_row equivalence -------------------------------------- fn check_hsv_equivalence(rgb: &[u8], width: usize) { diff --git a/src/row/arch/x86_avx2.rs b/src/row/arch/x86_avx2.rs index 54742dc..3312a79 100644 --- a/src/row/arch/x86_avx2.rs +++ b/src/row/arch/x86_avx2.rs @@ -1523,12 +1523,9 @@ unsafe fn deinterleave_uv_u16_avx2(ptr: *const u16) -> (__m256i, __m256i) { } } -/// AVX2 NV12 → packed RGB (UV-ordered chroma). Thin wrapper over -/// [`nv12_or_nv21_to_rgb_row_impl`] with `SWAP_UV = false`. -/// -/// # Safety -/// -/// Same as [`nv12_or_nv21_to_rgb_row_impl`]. +/// AVX2 NV12 → packed RGB. Thin wrapper over +/// [`nv12_or_nv21_to_rgb_or_rgba_row_impl`] with +/// `SWAP_UV = false, ALPHA = false`. #[inline] #[target_feature(enable = "avx2")] pub(crate) unsafe fn nv12_to_rgb_row( @@ -1539,18 +1536,16 @@ pub(crate) unsafe fn nv12_to_rgb_row( matrix: ColorMatrix, full_range: bool, ) { - // SAFETY: caller obligations forwarded to the shared impl. unsafe { - nv12_or_nv21_to_rgb_row_impl::(y, uv_half, rgb_out, width, matrix, full_range); + nv12_or_nv21_to_rgb_or_rgba_row_impl::( + y, uv_half, rgb_out, width, matrix, full_range, + ); } } -/// AVX2 NV21 → packed RGB (VU-ordered chroma). Thin wrapper over -/// [`nv12_or_nv21_to_rgb_row_impl`] with `SWAP_UV = true`. -/// -/// # Safety -/// -/// Same as [`nv12_or_nv21_to_rgb_row_impl`]. +/// AVX2 NV21 → packed RGB. 
Thin wrapper over +/// [`nv12_or_nv21_to_rgb_or_rgba_row_impl`] with +/// `SWAP_UV = true, ALPHA = false`. #[inline] #[target_feature(enable = "avx2")] pub(crate) unsafe fn nv21_to_rgb_row( @@ -1561,29 +1556,74 @@ pub(crate) unsafe fn nv21_to_rgb_row( matrix: ColorMatrix, full_range: bool, ) { - // SAFETY: caller obligations forwarded to the shared impl. unsafe { - nv12_or_nv21_to_rgb_row_impl::(y, vu_half, rgb_out, width, matrix, full_range); + nv12_or_nv21_to_rgb_or_rgba_row_impl::( + y, vu_half, rgb_out, width, matrix, full_range, + ); + } +} + +/// AVX2 NV12 → packed RGBA. Same contract as [`nv12_to_rgb_row`] +/// but writes 4 bytes per pixel via [`write_rgba_32`]. +/// `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "avx2")] +pub(crate) unsafe fn nv12_to_rgba_row( + y: &[u8], + uv_half: &[u8], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + unsafe { + nv12_or_nv21_to_rgb_or_rgba_row_impl::( + y, uv_half, rgba_out, width, matrix, full_range, + ); } } -/// Shared AVX2 NV12/NV21 kernel. `SWAP_UV` selects chroma byte order -/// at compile time. +/// AVX2 NV21 → packed RGBA. Same contract as [`nv21_to_rgb_row`] +/// but writes 4 bytes per pixel via [`write_rgba_32`]. +/// `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "avx2")] +pub(crate) unsafe fn nv21_to_rgba_row( + y: &[u8], + vu_half: &[u8], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + unsafe { + nv12_or_nv21_to_rgb_or_rgba_row_impl::( + y, vu_half, rgba_out, width, matrix, full_range, + ); + } +} + +/// Shared AVX2 NV12/NV21 kernel at 3 bpp (RGB) or 4 bpp + opaque +/// alpha (RGBA). `SWAP_UV` selects chroma byte order; `ALPHA = true` +/// writes via [`write_rgba_32`], `ALPHA = false` via [`write_rgb_32`]. +/// Both const generics drive compile-time monomorphization. /// /// # Safety /// -/// 1. **AVX2 must be available on the current CPU** (same obligation -/// as [`yuv_420_to_rgb_row`]). +/// 1. **AVX2 must be available on the current CPU.** /// 2. `width & 1 == 0`. /// 3. `y.len() >= width`. /// 4. `uv_or_vu_half.len() >= width` (32 interleaved bytes per 32 Y pixels). -/// 5. `rgb_out.len() >= 3 * width`. +/// 5. `out.len() >= width * (if ALPHA { 4 } else { 3 })`. #[inline] #[target_feature(enable = "avx2")] -unsafe fn nv12_or_nv21_to_rgb_row_impl( +pub(crate) unsafe fn nv12_or_nv21_to_rgb_or_rgba_row_impl< + const SWAP_UV: bool, + const ALPHA: bool, +>( y: &[u8], uv_or_vu_half: &[u8], - rgb_out: &mut [u8], + out: &mut [u8], width: usize, matrix: ColorMatrix, full_range: bool, @@ -1591,7 +1631,8 @@ unsafe fn nv12_or_nv21_to_rgb_row_impl( debug_assert_eq!(width & 1, 0, "NV12/NV21 require even width"); debug_assert!(y.len() >= width); debug_assert!(uv_or_vu_half.len() >= width); - debug_assert!(rgb_out.len() >= width * 3); + let bpp: usize = if ALPHA { 4 } else { 3 }; + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params(full_range); @@ -1612,6 +1653,7 @@ unsafe fn nv12_or_nv21_to_rgb_row_impl( let cgv = _mm256_set1_epi32(coeffs.g_v()); let cbu = _mm256_set1_epi32(coeffs.b_u()); let cbv = _mm256_set1_epi32(coeffs.b_v()); + let alpha_u8 = _mm256_set1_epi8(-1); // 0xFF as i8 // Per‑lane shuffle: pack U bytes (even offsets) into low 8 of each // 128‑bit lane, V bytes (odd offsets) into the high 8. 
Applied to a @@ -1696,30 +1738,33 @@ unsafe fn nv12_or_nv21_to_rgb_row_impl( let g_u8 = narrow_u8x32(g_lo, g_hi); let r_u8 = narrow_u8x32(r_lo, r_hi); - write_rgb_32(r_u8, g_u8, b_u8, rgb_out.as_mut_ptr().add(x * 3)); + if ALPHA { + write_rgba_32(r_u8, g_u8, b_u8, alpha_u8, out.as_mut_ptr().add(x * 4)); + } else { + write_rgb_32(r_u8, g_u8, b_u8, out.as_mut_ptr().add(x * 3)); + } x += 32; } if x < width { - if SWAP_UV { - scalar::nv21_to_rgb_row( - &y[x..width], - &uv_or_vu_half[x..width], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); - } else { - scalar::nv12_to_rgb_row( - &y[x..width], - &uv_or_vu_half[x..width], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_uv = &uv_or_vu_half[x..width]; + let tail_w = width - x; + let tail_out = &mut out[x * bpp..width * bpp]; + match (SWAP_UV, ALPHA) { + (false, false) => { + scalar::nv12_to_rgb_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range) + } + (true, false) => { + scalar::nv21_to_rgb_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range) + } + (false, true) => { + scalar::nv12_to_rgba_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range) + } + (true, true) => { + scalar::nv21_to_rgba_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range) + } } } } @@ -4240,6 +4285,156 @@ mod tests { check_nv21_matches_nv12_swapped(w, ColorMatrix::YCgCo, true); } } + + // ---- nv12_to_rgba_row / nv21_to_rgba_row equivalence ---------------- + + fn check_nv12_rgba_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { + let y: std::vec::Vec = (0..width).map(|i| ((i * 37 + 11) & 0xFF) as u8).collect(); + let uv: std::vec::Vec = (0..width / 2) + .flat_map(|i| [((i * 53 + 23) & 0xFF) as u8, ((i * 71 + 91) & 0xFF) as u8]) + .collect(); + let mut rgba_scalar = std::vec![0u8; width * 4]; + let mut rgba_avx2 = std::vec![0u8; width * 4]; + + scalar::nv12_to_rgba_row(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + unsafe { + nv12_to_rgba_row(&y, &uv, &mut rgba_avx2, width, matrix, full_range); + } + + if rgba_scalar != rgba_avx2 { + let first_diff = rgba_scalar + .iter() + .zip(rgba_avx2.iter()) + .position(|(a, b)| a != b) + .unwrap(); + let pixel = first_diff / 4; + let channel = ["R", "G", "B", "A"][first_diff % 4]; + panic!( + "AVX2 NV12 RGBA diverges from scalar at byte {first_diff} (px {pixel} {channel}, width={width}, matrix={matrix:?}, full_range={full_range}): scalar={} avx2={}", + rgba_scalar[first_diff], rgba_avx2[first_diff] + ); + } + } + + fn check_nv21_rgba_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { + let y: std::vec::Vec = (0..width).map(|i| ((i * 37 + 11) & 0xFF) as u8).collect(); + let vu: std::vec::Vec = (0..width / 2) + .flat_map(|i| [((i * 53 + 23) & 0xFF) as u8, ((i * 71 + 91) & 0xFF) as u8]) + .collect(); + let mut rgba_scalar = std::vec![0u8; width * 4]; + let mut rgba_avx2 = std::vec![0u8; width * 4]; + + scalar::nv21_to_rgba_row(&y, &vu, &mut rgba_scalar, width, matrix, full_range); + unsafe { + nv21_to_rgba_row(&y, &vu, &mut rgba_avx2, width, matrix, full_range); + } + + if rgba_scalar != rgba_avx2 { + let first_diff = rgba_scalar + .iter() + .zip(rgba_avx2.iter()) + .position(|(a, b)| a != b) + .unwrap(); + let pixel = first_diff / 4; + let channel = ["R", "G", "B", "A"][first_diff % 4]; + panic!( + "AVX2 NV21 RGBA diverges from scalar at byte {first_diff} (px {pixel} {channel}, width={width}, matrix={matrix:?}, full_range={full_range}): scalar={} avx2={}", + 
rgba_scalar[first_diff], rgba_avx2[first_diff] + ); + } + } + + fn check_nv12_rgba_matches_yuv420p_rgba(width: usize, matrix: ColorMatrix, full_range: bool) { + let y: std::vec::Vec = (0..width).map(|i| ((i * 37 + 11) & 0xFF) as u8).collect(); + let u: std::vec::Vec = (0..width / 2) + .map(|i| ((i * 53 + 23) & 0xFF) as u8) + .collect(); + let v: std::vec::Vec = (0..width / 2) + .map(|i| ((i * 71 + 91) & 0xFF) as u8) + .collect(); + let uv: std::vec::Vec = u.iter().zip(v.iter()).flat_map(|(a, b)| [*a, *b]).collect(); + + let mut rgba_yuv420p = std::vec![0u8; width * 4]; + let mut rgba_nv12 = std::vec![0u8; width * 4]; + unsafe { + yuv_420_to_rgba_row(&y, &u, &v, &mut rgba_yuv420p, width, matrix, full_range); + nv12_to_rgba_row(&y, &uv, &mut rgba_nv12, width, matrix, full_range); + } + assert_eq!( + rgba_yuv420p, rgba_nv12, + "AVX2 NV12 RGBA must match Yuv420p RGBA for equivalent UV (width={width}, matrix={matrix:?}, full_range={full_range})" + ); + } + + #[test] + fn nv12_avx2_rgba_matches_scalar_all_matrices_32() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_nv12_rgba_equivalence(32, m, full); + } + } + } + + #[test] + fn nv12_avx2_rgba_matches_scalar_widths() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + for w in [34usize, 46, 62, 1920, 1922] { + check_nv12_rgba_equivalence(w, ColorMatrix::Bt601, false); + } + } + + #[test] + fn nv12_avx2_rgba_matches_yuv420p_rgba_avx2() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + for w in [32usize, 64, 1920] { + check_nv12_rgba_matches_yuv420p_rgba(w, ColorMatrix::Bt709, false); + check_nv12_rgba_matches_yuv420p_rgba(w, ColorMatrix::YCgCo, true); + } + } + + #[test] + fn nv21_avx2_rgba_matches_scalar_all_matrices_32() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_nv21_rgba_equivalence(32, m, full); + } + } + } + + #[test] + fn nv21_avx2_rgba_matches_scalar_widths() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + for w in [34usize, 46, 62, 1920, 1922] { + check_nv21_rgba_equivalence(w, ColorMatrix::Bt601, false); + } + } + // ---- rgb_to_hsv_row equivalence -------------------------------------- fn check_hsv_equivalence(rgb: &[u8], width: usize) { diff --git a/src/row/arch/x86_avx512.rs b/src/row/arch/x86_avx512.rs index 9e17ee9..77ce177 100644 --- a/src/row/arch/x86_avx512.rs +++ b/src/row/arch/x86_avx512.rs @@ -1570,12 +1570,9 @@ unsafe fn deinterleave_uv_u16_avx512(ptr: *const u16) -> (__m512i, __m512i) { } } -/// AVX‑512 NV12 → packed RGB (UV-ordered chroma). Thin wrapper over -/// [`nv12_or_nv21_to_rgb_row_impl`] with `SWAP_UV = false`. -/// -/// # Safety -/// -/// Same as [`nv12_or_nv21_to_rgb_row_impl`]. +/// AVX‑512 NV12 → packed RGB. Thin wrapper over +/// [`nv12_or_nv21_to_rgb_or_rgba_row_impl`] with +/// `SWAP_UV = false, ALPHA = false`. #[inline] #[target_feature(enable = "avx512f,avx512bw")] pub(crate) unsafe fn nv12_to_rgb_row( @@ -1586,18 +1583,16 @@ pub(crate) unsafe fn nv12_to_rgb_row( matrix: ColorMatrix, full_range: bool, ) { - // SAFETY: caller obligations forwarded to the shared impl. 
unsafe { - nv12_or_nv21_to_rgb_row_impl::(y, uv_half, rgb_out, width, matrix, full_range); + nv12_or_nv21_to_rgb_or_rgba_row_impl::( + y, uv_half, rgb_out, width, matrix, full_range, + ); } } -/// AVX‑512 NV21 → packed RGB (VU-ordered chroma). Thin wrapper over -/// [`nv12_or_nv21_to_rgb_row_impl`] with `SWAP_UV = true`. -/// -/// # Safety -/// -/// Same as [`nv12_or_nv21_to_rgb_row_impl`]. +/// AVX‑512 NV21 → packed RGB. Thin wrapper over +/// [`nv12_or_nv21_to_rgb_or_rgba_row_impl`] with +/// `SWAP_UV = true, ALPHA = false`. #[inline] #[target_feature(enable = "avx512f,avx512bw")] pub(crate) unsafe fn nv21_to_rgb_row( @@ -1608,29 +1603,74 @@ pub(crate) unsafe fn nv21_to_rgb_row( matrix: ColorMatrix, full_range: bool, ) { - // SAFETY: caller obligations forwarded to the shared impl. unsafe { - nv12_or_nv21_to_rgb_row_impl::(y, vu_half, rgb_out, width, matrix, full_range); + nv12_or_nv21_to_rgb_or_rgba_row_impl::( + y, vu_half, rgb_out, width, matrix, full_range, + ); + } +} + +/// AVX‑512 NV12 → packed RGBA. Same contract as [`nv12_to_rgb_row`] +/// but writes 4 bytes per pixel via [`write_rgba_64`]. +/// `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "avx512f,avx512bw")] +pub(crate) unsafe fn nv12_to_rgba_row( + y: &[u8], + uv_half: &[u8], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + unsafe { + nv12_or_nv21_to_rgb_or_rgba_row_impl::( + y, uv_half, rgba_out, width, matrix, full_range, + ); } } -/// Shared AVX‑512 NV12/NV21 kernel. `SWAP_UV` selects chroma byte -/// order at compile time. +/// AVX‑512 NV21 → packed RGBA. Same contract as [`nv21_to_rgb_row`] +/// but writes 4 bytes per pixel via [`write_rgba_64`]. +/// `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "avx512f,avx512bw")] +pub(crate) unsafe fn nv21_to_rgba_row( + y: &[u8], + vu_half: &[u8], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + unsafe { + nv12_or_nv21_to_rgb_or_rgba_row_impl::( + y, vu_half, rgba_out, width, matrix, full_range, + ); + } +} + +/// Shared AVX‑512 NV12/NV21 kernel at 3 bpp (RGB) or 4 bpp + opaque +/// alpha (RGBA). `SWAP_UV` selects chroma byte order; `ALPHA = true` +/// writes via [`write_rgba_64`], `ALPHA = false` via [`write_rgb_64`]. +/// Both const generics drive compile-time monomorphization. /// /// # Safety /// -/// 1. **AVX‑512F + AVX‑512BW must be available on the current CPU** -/// (same obligation as [`yuv_420_to_rgb_row`]). +/// 1. **AVX‑512F + AVX‑512BW must be available on the current CPU.** /// 2. `width & 1 == 0`. /// 3. `y.len() >= width`. /// 4. `uv_or_vu_half.len() >= width` (64 interleaved bytes per 64 Y pixels). -/// 5. `rgb_out.len() >= 3 * width`. +/// 5. `out.len() >= width * (if ALPHA { 4 } else { 3 })`. 
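Stripped of the vector math, every backend's kernel follows the same shape; a hedged sketch with hypothetical closures (the AVX-512 kernel below steps 64 pixels per iteration):

    fn convert_row(width: usize, mut simd_block: impl FnMut(usize), scalar_tail: impl FnOnce(usize)) {
        let mut x = 0;
        while x + 64 <= width {
            simd_block(x); // one full-width vector iteration
            x += 64;
        }
        if x < width {
            // Hand the 0..63 leftover pixels to the scalar reference so
            // tails stay byte-identical for any even width.
            scalar_tail(x);
        }
    }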
#[inline] #[target_feature(enable = "avx512f,avx512bw")] -unsafe fn nv12_or_nv21_to_rgb_row_impl( +pub(crate) unsafe fn nv12_or_nv21_to_rgb_or_rgba_row_impl< + const SWAP_UV: bool, + const ALPHA: bool, +>( y: &[u8], uv_or_vu_half: &[u8], - rgb_out: &mut [u8], + out: &mut [u8], width: usize, matrix: ColorMatrix, full_range: bool, @@ -1638,7 +1678,8 @@ unsafe fn nv12_or_nv21_to_rgb_row_impl( debug_assert_eq!(width & 1, 0, "NV12/NV21 require even width"); debug_assert!(y.len() >= width); debug_assert!(uv_or_vu_half.len() >= width); - debug_assert!(rgb_out.len() >= width * 3); + let bpp: usize = if ALPHA { 4 } else { 3 }; + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params(full_range); @@ -1659,6 +1700,7 @@ unsafe fn nv12_or_nv21_to_rgb_row_impl( let cgv = _mm512_set1_epi32(coeffs.g_v()); let cbu = _mm512_set1_epi32(coeffs.b_u()); let cbv = _mm512_set1_epi32(coeffs.b_v()); + let alpha_u8 = _mm512_set1_epi8(-1); // 0xFF as i8 let pack_fixup = _mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7); let dup_lo_idx = _mm512_setr_epi64(0, 1, 8, 9, 2, 3, 10, 11); @@ -1748,30 +1790,33 @@ unsafe fn nv12_or_nv21_to_rgb_row_impl( let g_u8 = narrow_u8x64(g_lo, g_hi, pack_fixup); let r_u8 = narrow_u8x64(r_lo, r_hi, pack_fixup); - write_rgb_64(r_u8, g_u8, b_u8, rgb_out.as_mut_ptr().add(x * 3)); + if ALPHA { + write_rgba_64(r_u8, g_u8, b_u8, alpha_u8, out.as_mut_ptr().add(x * 4)); + } else { + write_rgb_64(r_u8, g_u8, b_u8, out.as_mut_ptr().add(x * 3)); + } x += 64; } if x < width { - if SWAP_UV { - scalar::nv21_to_rgb_row( - &y[x..width], - &uv_or_vu_half[x..width], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); - } else { - scalar::nv12_to_rgb_row( - &y[x..width], - &uv_or_vu_half[x..width], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_uv = &uv_or_vu_half[x..width]; + let tail_w = width - x; + let tail_out = &mut out[x * bpp..width * bpp]; + match (SWAP_UV, ALPHA) { + (false, false) => { + scalar::nv12_to_rgb_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range) + } + (true, false) => { + scalar::nv21_to_rgb_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range) + } + (false, true) => { + scalar::nv12_to_rgba_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range) + } + (true, true) => { + scalar::nv21_to_rgba_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range) + } } } } @@ -4420,6 +4465,156 @@ mod tests { check_nv21_matches_nv12_swapped(w, ColorMatrix::YCgCo, true); } } + + // ---- nv12_to_rgba_row / nv21_to_rgba_row equivalence ---------------- + + fn check_nv12_rgba_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { + let y: std::vec::Vec = (0..width).map(|i| ((i * 37 + 11) & 0xFF) as u8).collect(); + let uv: std::vec::Vec = (0..width / 2) + .flat_map(|i| [((i * 53 + 23) & 0xFF) as u8, ((i * 71 + 91) & 0xFF) as u8]) + .collect(); + let mut rgba_scalar = std::vec![0u8; width * 4]; + let mut rgba_avx512 = std::vec![0u8; width * 4]; + + scalar::nv12_to_rgba_row(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + unsafe { + nv12_to_rgba_row(&y, &uv, &mut rgba_avx512, width, matrix, full_range); + } + + if rgba_scalar != rgba_avx512 { + let first_diff = rgba_scalar + .iter() + .zip(rgba_avx512.iter()) + .position(|(a, b)| a != b) + .unwrap(); + let pixel = first_diff / 4; + let channel = ["R", "G", "B", "A"][first_diff % 4]; + panic!( + "AVX-512 NV12 RGBA diverges from scalar at byte 
{first_diff} (px {pixel} {channel}, width={width}, matrix={matrix:?}, full_range={full_range}): scalar={} avx512={}", + rgba_scalar[first_diff], rgba_avx512[first_diff] + ); + } + } + + fn check_nv21_rgba_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { + let y: std::vec::Vec = (0..width).map(|i| ((i * 37 + 11) & 0xFF) as u8).collect(); + let vu: std::vec::Vec = (0..width / 2) + .flat_map(|i| [((i * 53 + 23) & 0xFF) as u8, ((i * 71 + 91) & 0xFF) as u8]) + .collect(); + let mut rgba_scalar = std::vec![0u8; width * 4]; + let mut rgba_avx512 = std::vec![0u8; width * 4]; + + scalar::nv21_to_rgba_row(&y, &vu, &mut rgba_scalar, width, matrix, full_range); + unsafe { + nv21_to_rgba_row(&y, &vu, &mut rgba_avx512, width, matrix, full_range); + } + + if rgba_scalar != rgba_avx512 { + let first_diff = rgba_scalar + .iter() + .zip(rgba_avx512.iter()) + .position(|(a, b)| a != b) + .unwrap(); + let pixel = first_diff / 4; + let channel = ["R", "G", "B", "A"][first_diff % 4]; + panic!( + "AVX-512 NV21 RGBA diverges from scalar at byte {first_diff} (px {pixel} {channel}, width={width}, matrix={matrix:?}, full_range={full_range}): scalar={} avx512={}", + rgba_scalar[first_diff], rgba_avx512[first_diff] + ); + } + } + + fn check_nv12_rgba_matches_yuv420p_rgba(width: usize, matrix: ColorMatrix, full_range: bool) { + let y: std::vec::Vec = (0..width).map(|i| ((i * 37 + 11) & 0xFF) as u8).collect(); + let u: std::vec::Vec = (0..width / 2) + .map(|i| ((i * 53 + 23) & 0xFF) as u8) + .collect(); + let v: std::vec::Vec = (0..width / 2) + .map(|i| ((i * 71 + 91) & 0xFF) as u8) + .collect(); + let uv: std::vec::Vec = u.iter().zip(v.iter()).flat_map(|(a, b)| [*a, *b]).collect(); + + let mut rgba_yuv420p = std::vec![0u8; width * 4]; + let mut rgba_nv12 = std::vec![0u8; width * 4]; + unsafe { + yuv_420_to_rgba_row(&y, &u, &v, &mut rgba_yuv420p, width, matrix, full_range); + nv12_to_rgba_row(&y, &uv, &mut rgba_nv12, width, matrix, full_range); + } + assert_eq!( + rgba_yuv420p, rgba_nv12, + "AVX-512 NV12 RGBA must match Yuv420p RGBA for equivalent UV (width={width}, matrix={matrix:?}, full_range={full_range})" + ); + } + + #[test] + fn nv12_avx512_rgba_matches_scalar_all_matrices_64() { + if !std::arch::is_x86_feature_detected!("avx512bw") { + return; + } + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_nv12_rgba_equivalence(64, m, full); + } + } + } + + #[test] + fn nv12_avx512_rgba_matches_scalar_widths() { + if !std::arch::is_x86_feature_detected!("avx512bw") { + return; + } + for w in [66usize, 94, 126, 1920, 1922] { + check_nv12_rgba_equivalence(w, ColorMatrix::Bt601, false); + } + } + + #[test] + fn nv12_avx512_rgba_matches_yuv420p_rgba_avx512() { + if !std::arch::is_x86_feature_detected!("avx512bw") { + return; + } + for w in [64usize, 128, 1920] { + check_nv12_rgba_matches_yuv420p_rgba(w, ColorMatrix::Bt709, false); + check_nv12_rgba_matches_yuv420p_rgba(w, ColorMatrix::YCgCo, true); + } + } + + #[test] + fn nv21_avx512_rgba_matches_scalar_all_matrices_64() { + if !std::arch::is_x86_feature_detected!("avx512bw") { + return; + } + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_nv21_rgba_equivalence(64, m, full); + } + } + } + + #[test] + fn nv21_avx512_rgba_matches_scalar_widths() { + if 
!std::arch::is_x86_feature_detected!("avx512bw") { + return; + } + for w in [66usize, 94, 126, 1920, 1922] { + check_nv21_rgba_equivalence(w, ColorMatrix::Bt601, false); + } + } + // ---- rgb_to_hsv_row equivalence -------------------------------------- fn check_hsv_equivalence(rgb: &[u8], width: usize) { diff --git a/src/row/arch/x86_sse41.rs b/src/row/arch/x86_sse41.rs index c431c72..ad8925b 100644 --- a/src/row/arch/x86_sse41.rs +++ b/src/row/arch/x86_sse41.rs @@ -1336,7 +1336,9 @@ pub(crate) unsafe fn yuv_444p16_to_rgb_u16_row( } } -/// Same as [`nv12_or_nv21_to_rgb_row_impl`]. +/// SSE4.1 NV12 → packed RGB. Thin wrapper over +/// [`nv12_or_nv21_to_rgb_or_rgba_row_impl`] with +/// `SWAP_UV = false, ALPHA = false`. #[inline] #[target_feature(enable = "sse4.1")] pub(crate) unsafe fn nv12_to_rgb_row( @@ -1347,18 +1349,16 @@ pub(crate) unsafe fn nv12_to_rgb_row( matrix: ColorMatrix, full_range: bool, ) { - // SAFETY: caller obligations forwarded to the shared impl. unsafe { - nv12_or_nv21_to_rgb_row_impl::(y, uv_half, rgb_out, width, matrix, full_range); + nv12_or_nv21_to_rgb_or_rgba_row_impl::( + y, uv_half, rgb_out, width, matrix, full_range, + ); } } -/// SSE4.1 NV21 → packed RGB (VU-ordered chroma). Thin wrapper over -/// [`nv12_or_nv21_to_rgb_row_impl`] with `SWAP_UV = true`. -/// -/// # Safety -/// -/// Same as [`nv12_or_nv21_to_rgb_row_impl`]. +/// SSE4.1 NV21 → packed RGB. Thin wrapper over +/// [`nv12_or_nv21_to_rgb_or_rgba_row_impl`] with +/// `SWAP_UV = true, ALPHA = false`. #[inline] #[target_feature(enable = "sse4.1")] pub(crate) unsafe fn nv21_to_rgb_row( @@ -1369,30 +1369,75 @@ pub(crate) unsafe fn nv21_to_rgb_row( matrix: ColorMatrix, full_range: bool, ) { - // SAFETY: caller obligations forwarded to the shared impl. unsafe { - nv12_or_nv21_to_rgb_row_impl::(y, vu_half, rgb_out, width, matrix, full_range); + nv12_or_nv21_to_rgb_or_rgba_row_impl::( + y, vu_half, rgb_out, width, matrix, full_range, + ); + } +} + +/// SSE4.1 NV12 → packed RGBA. Same contract as [`nv12_to_rgb_row`] +/// but writes 4 bytes per pixel via [`write_rgba_16`]. +/// `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "sse4.1")] +pub(crate) unsafe fn nv12_to_rgba_row( + y: &[u8], + uv_half: &[u8], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + unsafe { + nv12_or_nv21_to_rgb_or_rgba_row_impl::( + y, uv_half, rgba_out, width, matrix, full_range, + ); + } +} + +/// SSE4.1 NV21 → packed RGBA. Same contract as [`nv21_to_rgb_row`] +/// but writes 4 bytes per pixel via [`write_rgba_16`]. +/// `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "sse4.1")] +pub(crate) unsafe fn nv21_to_rgba_row( + y: &[u8], + vu_half: &[u8], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + unsafe { + nv12_or_nv21_to_rgb_or_rgba_row_impl::( + y, vu_half, rgba_out, width, matrix, full_range, + ); } } -/// Shared SSE4.1 NV12/NV21 kernel. `SWAP_UV = false` → NV12, -/// `SWAP_UV = true` → NV21. Const generic drives monomorphization — -/// the swap is resolved at compile time. +/// Shared SSE4.1 NV12/NV21 kernel at 3 bpp (RGB) or 4 bpp + opaque +/// alpha (RGBA). `SWAP_UV = false` → NV12; `SWAP_UV = true` → NV21. +/// `ALPHA = true` writes via [`write_rgba_16`]; `ALPHA = false` via +/// [`write_rgb_16`]. Both const generics drive compile-time +/// monomorphization. /// /// # Safety /// -/// 1. **SSE4.1 must be available on the current CPU** (same obligation -/// as [`yuv_420_to_rgb_row`]). +/// 1. 
**SSE4.1 must be available on the current CPU.** /// 2. `width & 1 == 0`. /// 3. `y.len() >= width`. /// 4. `uv_or_vu_half.len() >= width` (2 × (width / 2) interleaved bytes). -/// 5. `rgb_out.len() >= 3 * width`. +/// 5. `out.len() >= width * (if ALPHA { 4 } else { 3 })`. #[inline] #[target_feature(enable = "sse4.1")] -unsafe fn nv12_or_nv21_to_rgb_row_impl( +pub(crate) unsafe fn nv12_or_nv21_to_rgb_or_rgba_row_impl< + const SWAP_UV: bool, + const ALPHA: bool, +>( y: &[u8], uv_or_vu_half: &[u8], - rgb_out: &mut [u8], + out: &mut [u8], width: usize, matrix: ColorMatrix, full_range: bool, @@ -1400,7 +1445,8 @@ unsafe fn nv12_or_nv21_to_rgb_row_impl( debug_assert_eq!(width & 1, 0, "NV12/NV21 require even width"); debug_assert!(y.len() >= width); debug_assert!(uv_or_vu_half.len() >= width); - debug_assert!(rgb_out.len() >= width * 3); + let bpp: usize = if ALPHA { 4 } else { 3 }; + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params(full_range); @@ -1421,6 +1467,7 @@ unsafe fn nv12_or_nv21_to_rgb_row_impl( let cgv = _mm_set1_epi32(coeffs.g_v()); let cbu = _mm_set1_epi32(coeffs.b_u()); let cbv = _mm_set1_epi32(coeffs.b_v()); + let alpha_u8 = _mm_set1_epi8(-1); // 0xFF as i8 // Deinterleave masks: `even_mask` pulls even-offset bytes into // lanes 0..7, `odd_mask` pulls odd-offset bytes. For NV12 that's @@ -1486,30 +1533,33 @@ unsafe fn nv12_or_nv21_to_rgb_row_impl( let g_u8 = _mm_packus_epi16(g_lo, g_hi); let r_u8 = _mm_packus_epi16(r_lo, r_hi); - write_rgb_16(r_u8, g_u8, b_u8, rgb_out.as_mut_ptr().add(x * 3)); + if ALPHA { + write_rgba_16(r_u8, g_u8, b_u8, alpha_u8, out.as_mut_ptr().add(x * 4)); + } else { + write_rgb_16(r_u8, g_u8, b_u8, out.as_mut_ptr().add(x * 3)); + } x += 16; } if x < width { - if SWAP_UV { - scalar::nv21_to_rgb_row( - &y[x..width], - &uv_or_vu_half[x..width], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); - } else { - scalar::nv12_to_rgb_row( - &y[x..width], - &uv_or_vu_half[x..width], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_uv = &uv_or_vu_half[x..width]; + let tail_w = width - x; + let tail_out = &mut out[x * bpp..width * bpp]; + match (SWAP_UV, ALPHA) { + (false, false) => { + scalar::nv12_to_rgb_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range) + } + (true, false) => { + scalar::nv21_to_rgb_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range) + } + (false, true) => { + scalar::nv12_to_rgba_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range) + } + (true, true) => { + scalar::nv21_to_rgba_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range) + } } } } @@ -3744,6 +3794,156 @@ mod tests { check_nv21_matches_nv12_swapped(w, ColorMatrix::YCgCo, true); } } + + // ---- nv12_to_rgba_row / nv21_to_rgba_row equivalence ---------------- + + fn check_nv12_rgba_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { + let y: std::vec::Vec = (0..width).map(|i| ((i * 37 + 11) & 0xFF) as u8).collect(); + let uv: std::vec::Vec = (0..width / 2) + .flat_map(|i| [((i * 53 + 23) & 0xFF) as u8, ((i * 71 + 91) & 0xFF) as u8]) + .collect(); + let mut rgba_scalar = std::vec![0u8; width * 4]; + let mut rgba_sse41 = std::vec![0u8; width * 4]; + + scalar::nv12_to_rgba_row(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + unsafe { + nv12_to_rgba_row(&y, &uv, &mut rgba_sse41, width, matrix, full_range); + } + + if rgba_scalar != rgba_sse41 { + let 
first_diff = rgba_scalar + .iter() + .zip(rgba_sse41.iter()) + .position(|(a, b)| a != b) + .unwrap(); + let pixel = first_diff / 4; + let channel = ["R", "G", "B", "A"][first_diff % 4]; + panic!( + "SSE4.1 NV12 RGBA diverges from scalar at byte {first_diff} (px {pixel} {channel}, width={width}, matrix={matrix:?}, full_range={full_range}): scalar={} sse41={}", + rgba_scalar[first_diff], rgba_sse41[first_diff] + ); + } + } + + fn check_nv21_rgba_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { + let y: std::vec::Vec = (0..width).map(|i| ((i * 37 + 11) & 0xFF) as u8).collect(); + let vu: std::vec::Vec = (0..width / 2) + .flat_map(|i| [((i * 53 + 23) & 0xFF) as u8, ((i * 71 + 91) & 0xFF) as u8]) + .collect(); + let mut rgba_scalar = std::vec![0u8; width * 4]; + let mut rgba_sse41 = std::vec![0u8; width * 4]; + + scalar::nv21_to_rgba_row(&y, &vu, &mut rgba_scalar, width, matrix, full_range); + unsafe { + nv21_to_rgba_row(&y, &vu, &mut rgba_sse41, width, matrix, full_range); + } + + if rgba_scalar != rgba_sse41 { + let first_diff = rgba_scalar + .iter() + .zip(rgba_sse41.iter()) + .position(|(a, b)| a != b) + .unwrap(); + let pixel = first_diff / 4; + let channel = ["R", "G", "B", "A"][first_diff % 4]; + panic!( + "SSE4.1 NV21 RGBA diverges from scalar at byte {first_diff} (px {pixel} {channel}, width={width}, matrix={matrix:?}, full_range={full_range}): scalar={} sse41={}", + rgba_scalar[first_diff], rgba_sse41[first_diff] + ); + } + } + + fn check_nv12_rgba_matches_yuv420p_rgba(width: usize, matrix: ColorMatrix, full_range: bool) { + let y: std::vec::Vec = (0..width).map(|i| ((i * 37 + 11) & 0xFF) as u8).collect(); + let u: std::vec::Vec = (0..width / 2) + .map(|i| ((i * 53 + 23) & 0xFF) as u8) + .collect(); + let v: std::vec::Vec = (0..width / 2) + .map(|i| ((i * 71 + 91) & 0xFF) as u8) + .collect(); + let uv: std::vec::Vec = u.iter().zip(v.iter()).flat_map(|(a, b)| [*a, *b]).collect(); + + let mut rgba_yuv420p = std::vec![0u8; width * 4]; + let mut rgba_nv12 = std::vec![0u8; width * 4]; + unsafe { + yuv_420_to_rgba_row(&y, &u, &v, &mut rgba_yuv420p, width, matrix, full_range); + nv12_to_rgba_row(&y, &uv, &mut rgba_nv12, width, matrix, full_range); + } + assert_eq!( + rgba_yuv420p, rgba_nv12, + "SSE4.1 NV12 RGBA must match Yuv420p RGBA for equivalent UV (width={width}, matrix={matrix:?}, full_range={full_range})" + ); + } + + #[test] + fn nv12_sse41_rgba_matches_scalar_all_matrices_16() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_nv12_rgba_equivalence(16, m, full); + } + } + } + + #[test] + fn nv12_sse41_rgba_matches_scalar_widths() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + for w in [18usize, 30, 34, 1920, 1922] { + check_nv12_rgba_equivalence(w, ColorMatrix::Bt601, false); + } + } + + #[test] + fn nv12_sse41_rgba_matches_yuv420p_rgba_sse41() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + for w in [16usize, 30, 64, 1920] { + check_nv12_rgba_matches_yuv420p_rgba(w, ColorMatrix::Bt709, false); + check_nv12_rgba_matches_yuv420p_rgba(w, ColorMatrix::YCgCo, true); + } + } + + #[test] + fn nv21_sse41_rgba_matches_scalar_all_matrices_16() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + 
ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_nv21_rgba_equivalence(16, m, full); + } + } + } + + #[test] + fn nv21_sse41_rgba_matches_scalar_widths() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + for w in [18usize, 30, 34, 1920, 1922] { + check_nv21_rgba_equivalence(w, ColorMatrix::Bt601, false); + } + } + // ---- rgb_to_hsv_row equivalence -------------------------------------- fn check_hsv_equivalence(rgb: &[u8], width: usize) { diff --git a/src/row/mod.rs b/src/row/mod.rs index 2523eda..f30667d 100644 --- a/src/row/mod.rs +++ b/src/row/mod.rs @@ -406,6 +406,150 @@ pub fn nv21_to_rgb_row( scalar::nv21_to_rgb_row(y, vu_half, rgb_out, width, matrix, full_range); } +/// Converts one row of NV12 (semi‑planar 4:2:0) to packed **RGBA** +/// (8-bit). Same numerical contract as [`nv12_to_rgb_row`]; the only +/// differences are the per-pixel stride (4 vs 3) and the alpha byte +/// (`0xFF`, opaque, for every pixel — sources without an alpha plane +/// produce opaque output). +/// +/// `rgba_out.len() >= 4 * width`. `use_simd = false` forces scalar. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn nv12_to_rgba_row( + y: &[u8], + uv_half: &[u8], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + // Runtime asserts at the dispatcher boundary — see + // [`yuv_420_to_rgba_row`] for rationale, including the checked + // `width × 4` multiplication via [`rgba_row_bytes`]. + assert_eq!(width & 1, 0, "NV12 requires even width"); + let rgba_min = rgba_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(uv_half.len() >= width, "uv_half row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: `neon_available()` verified NEON is present. + unsafe { + arch::neon::nv12_to_rgba_row(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::nv12_to_rgba_row(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::nv12_to_rgba_row(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::nv12_to_rgba_row(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 verified at compile time. + unsafe { + arch::wasm_simd128::nv12_to_rgba_row(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::nv12_to_rgba_row(y, uv_half, rgba_out, width, matrix, full_range); +} + +/// Converts one row of NV21 (semi‑planar 4:2:0, VU-ordered) to +/// packed **RGBA** (8-bit). Same numerical contract as +/// [`nv21_to_rgb_row`]; alpha defaults to `0xFF` (opaque). +/// +/// `rgba_out.len() >= 4 * width`. `use_simd = false` forces scalar. 
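A usage sketch for the two public dispatchers (hypothetical sample values; neutral chroma, BT.601 limited range):

    let y = [16u8, 81, 145, 235];
    let vu = [128u8, 128, 128, 128]; // VU-interleaved, neutral chroma
    let mut rgba = [0u8; 16];
    nv21_to_rgba_row(&y, &vu, &mut rgba, 4, ColorMatrix::Bt601, false, true);
    // Alpha is the constant 0xFF regardless of which backend ran.
    assert!(rgba.chunks_exact(4).all(|px| px[3] == 0xFF));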
+#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn nv21_to_rgba_row( + y: &[u8], + vu_half: &[u8], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "NV21 requires even width"); + let rgba_min = rgba_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(vu_half.len() >= width, "vu_half row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + unsafe { + arch::neon::nv21_to_rgba_row(y, vu_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + unsafe { + arch::x86_avx512::nv21_to_rgba_row(y, vu_half, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + unsafe { + arch::x86_avx2::nv21_to_rgba_row(y, vu_half, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + unsafe { + arch::x86_sse41::nv21_to_rgba_row(y, vu_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + unsafe { + arch::wasm_simd128::nv21_to_rgba_row(y, vu_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::nv21_to_rgba_row(y, vu_half, rgba_out, width, matrix, full_range); +} + /// Converts one row of NV24 (semi‑planar 4:4:4, UV‑ordered) to packed /// RGB. Dispatches to the best available SIMD backend for the current /// target (NEON / SSE4.1 / AVX2 / AVX-512 / wasm simd128), falling diff --git a/src/row/scalar.rs b/src/row/scalar.rs index c41a2c8..e6d07ac 100644 --- a/src/row/scalar.rs +++ b/src/row/scalar.rs @@ -139,7 +139,8 @@ pub(crate) fn yuv_420_to_rgb_or_rgba_row( } /// NV12 (semi‑planar 4:2:0, UV-ordered) → packed RGB. Thin wrapper -/// over [`nv12_or_nv21_to_rgb_row_impl`] with `SWAP_UV = false`. +/// over [`nv12_or_nv21_to_rgb_or_rgba_row_impl`] with +/// `SWAP_UV = false, ALPHA = false`. #[cfg_attr(not(tarpaulin), inline(always))] pub(crate) fn nv12_to_rgb_row( y: &[u8], @@ -149,11 +150,14 @@ pub(crate) fn nv12_to_rgb_row( matrix: ColorMatrix, full_range: bool, ) { - nv12_or_nv21_to_rgb_row_impl::(y, uv_half, rgb_out, width, matrix, full_range); + nv12_or_nv21_to_rgb_or_rgba_row_impl::( + y, uv_half, rgb_out, width, matrix, full_range, + ); } /// NV21 (semi‑planar 4:2:0, VU-ordered) → packed RGB. Thin wrapper -/// over [`nv12_or_nv21_to_rgb_row_impl`] with `SWAP_UV = true`. +/// over [`nv12_or_nv21_to_rgb_or_rgba_row_impl`] with +/// `SWAP_UV = true, ALPHA = false`. #[cfg_attr(not(tarpaulin), inline(always))] pub(crate) fn nv21_to_rgb_row( y: &[u8], @@ -163,26 +167,63 @@ pub(crate) fn nv21_to_rgb_row( matrix: ColorMatrix, full_range: bool, ) { - nv12_or_nv21_to_rgb_row_impl::(y, vu_half, rgb_out, width, matrix, full_range); + nv12_or_nv21_to_rgb_or_rgba_row_impl::( + y, vu_half, rgb_out, width, matrix, full_range, + ); } -/// Shared scalar kernel for NV12 (SWAP_UV=false) and NV21 -/// (SWAP_UV=true). Identical math and numerical contract to -/// [`yuv_420_to_rgb_row`]; the only difference is chroma byte order -/// in the interleaved plane. `const` generic drives compile-time -/// monomorphization — each wrapper is inlined with the branch -/// eliminated. +/// NV12 → packed `R, G, B, A` quadruplets with constant `A = 0xFF`. +/// First three bytes per pixel are byte-identical to +/// [`nv12_to_rgb_row`]. 
`rgba_out.len() >= 4 * width`. +#[cfg_attr(not(tarpaulin), inline(always))] +pub(crate) fn nv12_to_rgba_row( + y: &[u8], + uv_half: &[u8], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + nv12_or_nv21_to_rgb_or_rgba_row_impl::( + y, uv_half, rgba_out, width, matrix, full_range, + ); +} + +/// NV21 → packed `R, G, B, A` quadruplets with constant `A = 0xFF`. +/// First three bytes per pixel are byte-identical to +/// [`nv21_to_rgb_row`]. `rgba_out.len() >= 4 * width`. +#[cfg_attr(not(tarpaulin), inline(always))] +pub(crate) fn nv21_to_rgba_row( + y: &[u8], + vu_half: &[u8], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + nv12_or_nv21_to_rgb_or_rgba_row_impl::( + y, vu_half, rgba_out, width, matrix, full_range, + ); +} + +/// Shared scalar kernel for NV12 (`SWAP_UV = false`) / NV21 +/// (`SWAP_UV = true`) at 3 bpp (`ALPHA = false`) or 4 bpp + opaque +/// alpha (`ALPHA = true`). Identical math to [`yuv_420_to_rgb_row`]; +/// the only differences are chroma byte order in the interleaved +/// plane and the per-pixel store stride. Both `const` generics drive +/// compile-time monomorphization — each wrapper is inlined with both +/// branches eliminated. /// /// # Panics (debug builds) /// /// - `width` must be even (4:2:0 pairs pixel columns). /// - `y.len() >= width`, `uv_or_vu_half.len() >= width`, -/// `rgb_out.len() >= 3 * width`. +/// `out.len() >= width * (if ALPHA { 4 } else { 3 })`. #[cfg_attr(not(tarpaulin), inline(always))] -fn nv12_or_nv21_to_rgb_row_impl( +pub(crate) fn nv12_or_nv21_to_rgb_or_rgba_row_impl( y: &[u8], uv_or_vu_half: &[u8], - rgb_out: &mut [u8], + out: &mut [u8], width: usize, matrix: ColorMatrix, full_range: bool, @@ -190,7 +231,8 @@ fn nv12_or_nv21_to_rgb_row_impl( debug_assert_eq!(width & 1, 0, "NV12/NV21 require even width"); debug_assert!(y.len() >= width, "y row too short"); debug_assert!(uv_or_vu_half.len() >= width, "chroma row too short"); - debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); + let bpp: usize = if ALPHA { 4 } else { 3 }; + debug_assert!(out.len() >= width * bpp, "out row too short for {bpp}bpp"); let coeffs = Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = range_params(full_range); @@ -214,14 +256,26 @@ fn nv12_or_nv21_to_rgb_row_impl( let b_chroma = (coeffs.b_u() * u_d + coeffs.b_v() * v_d + RND) >> 15; let y0 = ((y[x] as i32 - y_off) * y_scale + RND) >> 15; - rgb_out[x * 3] = clamp_u8(y0 + r_chroma); - rgb_out[x * 3 + 1] = clamp_u8(y0 + g_chroma); - rgb_out[x * 3 + 2] = clamp_u8(y0 + b_chroma); + let r0 = clamp_u8(y0 + r_chroma); + let g0 = clamp_u8(y0 + g_chroma); + let b0 = clamp_u8(y0 + b_chroma); + out[x * bpp] = r0; + out[x * bpp + 1] = g0; + out[x * bpp + 2] = b0; + if ALPHA { + out[x * bpp + 3] = 0xFF; + } let y1 = ((y[x + 1] as i32 - y_off) * y_scale + RND) >> 15; - rgb_out[(x + 1) * 3] = clamp_u8(y1 + r_chroma); - rgb_out[(x + 1) * 3 + 1] = clamp_u8(y1 + g_chroma); - rgb_out[(x + 1) * 3 + 2] = clamp_u8(y1 + b_chroma); + let r1 = clamp_u8(y1 + r_chroma); + let g1 = clamp_u8(y1 + g_chroma); + let b1 = clamp_u8(y1 + b_chroma); + out[(x + 1) * bpp] = r1; + out[(x + 1) * bpp + 1] = g1; + out[(x + 1) * bpp + 2] = b1; + if ALPHA { + out[(x + 1) * bpp + 3] = 0xFF; + } x += 2; } diff --git a/src/sinker/mixed.rs b/src/sinker/mixed.rs index 5e852fb..53c6e2c 100644 --- a/src/sinker/mixed.rs +++ b/src/sinker/mixed.rs @@ -62,17 +62,17 @@ use crate::{ HsvBuffers, PixelSink, SourceFormat, raw::{Bayer, Bayer16, BayerRow, BayerRow16, 
BayerSink, BayerSink16}, row::{ - bayer_to_rgb_row, bayer16_to_rgb_row, bayer16_to_rgb_u16_row, nv12_to_rgb_row, nv21_to_rgb_row, - nv24_to_rgb_row, nv42_to_rgb_row, p010_to_rgb_row, p010_to_rgb_u16_row, p012_to_rgb_row, - p012_to_rgb_u16_row, p016_to_rgb_row, p016_to_rgb_u16_row, p410_to_rgb_row, - p410_to_rgb_u16_row, p412_to_rgb_row, p412_to_rgb_u16_row, p416_to_rgb_row, - p416_to_rgb_u16_row, rgb_to_hsv_row, yuv_420_to_rgb_row, yuv_420_to_rgba_row, - yuv_444_to_rgb_row, yuv420p9_to_rgb_row, yuv420p9_to_rgb_u16_row, yuv420p10_to_rgb_row, - yuv420p10_to_rgb_u16_row, yuv420p12_to_rgb_row, yuv420p12_to_rgb_u16_row, yuv420p14_to_rgb_row, - yuv420p14_to_rgb_u16_row, yuv420p16_to_rgb_row, yuv420p16_to_rgb_u16_row, yuv444p9_to_rgb_row, - yuv444p9_to_rgb_u16_row, yuv444p10_to_rgb_row, yuv444p10_to_rgb_u16_row, yuv444p12_to_rgb_row, - yuv444p12_to_rgb_u16_row, yuv444p14_to_rgb_row, yuv444p14_to_rgb_u16_row, yuv444p16_to_rgb_row, - yuv444p16_to_rgb_u16_row, + bayer_to_rgb_row, bayer16_to_rgb_row, bayer16_to_rgb_u16_row, nv12_to_rgb_row, + nv12_to_rgba_row, nv21_to_rgb_row, nv21_to_rgba_row, nv24_to_rgb_row, nv42_to_rgb_row, + p010_to_rgb_row, p010_to_rgb_u16_row, p012_to_rgb_row, p012_to_rgb_u16_row, p016_to_rgb_row, + p016_to_rgb_u16_row, p410_to_rgb_row, p410_to_rgb_u16_row, p412_to_rgb_row, + p412_to_rgb_u16_row, p416_to_rgb_row, p416_to_rgb_u16_row, rgb_to_hsv_row, yuv_420_to_rgb_row, + yuv_420_to_rgba_row, yuv_444_to_rgb_row, yuv420p9_to_rgb_row, yuv420p9_to_rgb_u16_row, + yuv420p10_to_rgb_row, yuv420p10_to_rgb_u16_row, yuv420p12_to_rgb_row, yuv420p12_to_rgb_u16_row, + yuv420p14_to_rgb_row, yuv420p14_to_rgb_u16_row, yuv420p16_to_rgb_row, yuv420p16_to_rgb_u16_row, + yuv444p9_to_rgb_row, yuv444p9_to_rgb_u16_row, yuv444p10_to_rgb_row, yuv444p10_to_rgb_u16_row, + yuv444p12_to_rgb_row, yuv444p12_to_rgb_u16_row, yuv444p14_to_rgb_row, yuv444p14_to_rgb_u16_row, + yuv444p16_to_rgb_row, yuv444p16_to_rgb_u16_row, }, yuv::{ Nv12, Nv12Row, Nv12Sink, Nv16, Nv16Row, Nv16Sink, Nv21, Nv21Row, Nv21Sink, Nv24, Nv24Row, @@ -1110,10 +1110,12 @@ impl<'a> MixedSinker<'a, Yuv420p> { /// /// ```compile_fail /// // Attaching RGBA to a sink that doesn't write it is rejected - /// // at compile time: - /// use colconv::{sinker::MixedSinker, yuv::Nv12}; + /// // at compile time. Nv16 (4:2:2 semi‑planar) has not yet been + /// // wired for RGBA; once that lands the negative example here + /// // moves to the next not‑yet‑wired format. + /// use colconv::{sinker::MixedSinker, yuv::Nv16}; /// let mut buf = vec![0u8; 16 * 8 * 4]; - /// let _ = MixedSinker::::new(16, 8).with_rgba(&mut buf); + /// let _ = MixedSinker::::new(16, 8).with_rgba(&mut buf); /// ``` #[cfg_attr(not(tarpaulin), inline(always))] pub fn with_rgba(mut self, buf: &'a mut [u8]) -> Result { @@ -1586,6 +1588,44 @@ impl PixelSink for MixedSinker<'_, Yuv444p> { // ---- Nv12 impl ---------------------------------------------------------- +impl<'a> MixedSinker<'a, Nv12> { + /// Attaches a packed 32‑bit RGBA output buffer. + /// + /// Only available on sinker types whose `PixelSink` impl writes + /// RGBA — calling `with_rgba` on a sink that doesn't (e.g. a + /// not‑yet‑wired `MixedSinker` today) is a compile error + /// rather than a silent no‑op. Each format that adds RGBA support + /// adds its own impl block here. + /// + /// The fourth byte per pixel is alpha. NV12 has no alpha plane, + /// so every alpha byte is filled with `0xFF` (opaque). Future + /// YUVA source impls will copy alpha through from the source + /// plane. 
+ /// + /// Returns `Err(RgbaBufferTooShort)` if + /// `buf.len() < width × height × 4`, or `Err(GeometryOverflow)` on + /// 32‑bit targets when the product overflows. + #[cfg_attr(not(tarpaulin), inline(always))] + pub fn with_rgba(mut self, buf: &'a mut [u8]) -> Result { + self.set_rgba(buf)?; + Ok(self) + } + + /// In-place variant of [`with_rgba`](Self::with_rgba). + #[cfg_attr(not(tarpaulin), inline(always))] + pub fn set_rgba(&mut self, buf: &'a mut [u8]) -> Result<&mut Self, MixedSinkerError> { + let expected = self.frame_bytes(4)?; + if buf.len() < expected { + return Err(MixedSinkerError::RgbaBufferTooShort { + expected, + actual: buf.len(), + }); + } + self.rgba = Some(buf); + Ok(self) + } +} + impl Nv12Sink for MixedSinker<'_, Nv12> {} impl PixelSink for MixedSinker<'_, Nv12> { @@ -1641,15 +1681,16 @@ impl PixelSink for MixedSinker<'_, Nv12> { let Self { rgb, + rgba, luma, hsv, rgb_scratch, .. } = self; - // Single-plane row ranges are guaranteed to fit; RGB ranges use - // checked arithmetic (see the Yuv420p impl above for the full - // rationale — hsv-only attachment never validated `× 3`). + // Single-plane row ranges are guaranteed to fit; RGB / RGBA + // ranges use checked arithmetic (see the Yuv420p impl above for + // the full rationale — hsv-only attachment never validated × 3). let one_plane_start = idx * w; let one_plane_end = one_plane_start + w; @@ -1658,6 +1699,29 @@ impl PixelSink for MixedSinker<'_, Nv12> { luma[one_plane_start..one_plane_end].copy_from_slice(&row.y()[..w]); } + // Native RGBA: independent kernel run, separate from RGB. Default + // alpha = 0xFF since NV12 has no alpha plane. + if let Some(buf) = rgba.as_deref_mut() { + let rgba_plane_end = + one_plane_end + .checked_mul(4) + .ok_or(MixedSinkerError::GeometryOverflow { + width: w, + height: h, + channels: 4, + })?; + let rgba_plane_start = one_plane_start * 4; + nv12_to_rgba_row( + row.y(), + row.uv_half(), + &mut buf[rgba_plane_start..rgba_plane_end], + w, + row.matrix(), + row.full_range(), + use_simd, + ); + } + let want_rgb = rgb.is_some(); let want_hsv = hsv.is_some(); if !want_rgb && !want_hsv { @@ -1849,6 +1913,38 @@ impl PixelSink for MixedSinker<'_, Nv16> { // the U/V byte-order difference. Only the trait `Input<'r>` and the // primitive name change. +impl<'a> MixedSinker<'a, Nv21> { + /// Attaches a packed 32‑bit RGBA output buffer. + /// + /// Only available on sinker types whose `PixelSink` impl writes + /// RGBA — see [`MixedSinker::::with_rgba`] for the same + /// rationale and constraints. NV21 has no alpha plane, so every + /// alpha byte is filled with `0xFF` (opaque). + /// + /// Returns `Err(RgbaBufferTooShort)` if + /// `buf.len() < width × height × 4`, or `Err(GeometryOverflow)` on + /// 32‑bit targets when the product overflows. + #[cfg_attr(not(tarpaulin), inline(always))] + pub fn with_rgba(mut self, buf: &'a mut [u8]) -> Result { + self.set_rgba(buf)?; + Ok(self) + } + + /// In-place variant of [`with_rgba`](Self::with_rgba). 
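Both impl blocks expose the same two-step surface. A hedged usage sketch (the `sinker` module path and a `Debug` derive on `MixedSinkerError` are assumptions; the variant semantics come from the docs above):

    use colconv::{sinker::MixedSinker, yuv::Nv21};

    fn attach_rgba<'a>(w: usize, h: usize, rgba: &'a mut [u8]) -> Option<MixedSinker<'a, Nv21>> {
        // Err(RgbaBufferTooShort) when rgba.len() < w * h * 4;
        // Err(GeometryOverflow) if w * h * 4 overflows usize (32-bit targets).
        match MixedSinker::<Nv21>::new(w, h).with_rgba(rgba) {
            Ok(sink) => Some(sink),
            Err(err) => {
                eprintln!("RGBA attach failed: {err:?}");
                None
            }
        }
    }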
+ #[cfg_attr(not(tarpaulin), inline(always))] + pub fn set_rgba(&mut self, buf: &'a mut [u8]) -> Result<&mut Self, MixedSinkerError> { + let expected = self.frame_bytes(4)?; + if buf.len() < expected { + return Err(MixedSinkerError::RgbaBufferTooShort { + expected, + actual: buf.len(), + }); + } + self.rgba = Some(buf); + Ok(self) + } +} + impl Nv21Sink for MixedSinker<'_, Nv21> {} impl PixelSink for MixedSinker<'_, Nv21> { @@ -1899,6 +1995,7 @@ impl PixelSink for MixedSinker<'_, Nv21> { let Self { rgb, + rgba, luma, hsv, rgb_scratch, @@ -1912,6 +2009,29 @@ impl PixelSink for MixedSinker<'_, Nv21> { luma[one_plane_start..one_plane_end].copy_from_slice(&row.y()[..w]); } + // Native RGBA: independent kernel run, separate from RGB. Default + // alpha = 0xFF since NV21 has no alpha plane. + if let Some(buf) = rgba.as_deref_mut() { + let rgba_plane_end = + one_plane_end + .checked_mul(4) + .ok_or(MixedSinkerError::GeometryOverflow { + width: w, + height: h, + channels: 4, + })?; + let rgba_plane_start = one_plane_start * 4; + nv21_to_rgba_row( + row.y(), + row.vu_half(), + &mut buf[rgba_plane_start..rgba_plane_end], + w, + row.matrix(), + row.full_range(), + use_simd, + ); + } + let want_rgb = rgb.is_some(); let want_hsv = hsv.is_some(); if !want_rgb && !want_hsv { @@ -8421,6 +8541,183 @@ mod tests { assert_eq!(rgb_yuv420p, rgb_nv12); } + // ---- NV12 RGBA (Ship 8 PR 2) tests -------------------------------------- + // + // Mirrors the Yuv420p RGBA test set. Adds a cross-format invariant + // proving NV12 RGBA is byte-identical to Yuv420p RGBA when fed the + // same pixels — catches U/V swap bugs in the new RGBA path that + // a pure RGB-path test would miss. + + #[test] + #[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" + )] + fn nv12_rgba_only_converts_gray_to_gray_with_opaque_alpha() { + let (yp, uvp) = solid_nv12_frame(16, 8, 128, 128, 128); + let src = Nv12Frame::new(&yp, &uvp, 16, 8, 16, 16); + + let mut rgba = std::vec![0u8; 16 * 8 * 4]; + let mut sink = MixedSinker::::new(16, 8) + .with_rgba(&mut rgba) + .unwrap(); + nv12_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); + + for px in rgba.chunks(4) { + assert!(px[0].abs_diff(128) <= 1, "R"); + assert_eq!(px[0], px[1], "RGB monochromatic"); + assert_eq!(px[1], px[2], "RGB monochromatic"); + assert_eq!(px[3], 0xFF, "alpha must default to opaque"); + } + } + + #[test] + #[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" + )] + fn nv12_with_rgb_and_with_rgba_produce_byte_identical_rgb_bytes() { + let w = 32usize; + let h = 16usize; + let (yp, uvp) = solid_nv12_frame(w as u32, h as u32, 180, 60, 200); + let src = Nv12Frame::new(&yp, &uvp, w as u32, h as u32, w as u32, w as u32); + + let mut rgb = std::vec![0u8; w * h * 3]; + let mut rgba = std::vec![0u8; w * h * 4]; + let mut sink = MixedSinker::::new(w, h) + .with_rgb(&mut rgb) + .unwrap() + .with_rgba(&mut rgba) + .unwrap(); + nv12_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); + + for i in 0..(w * h) { + assert_eq!(rgba[i * 4], rgb[i * 3], "R differs at pixel {i}"); + assert_eq!(rgba[i * 4 + 1], rgb[i * 3 + 1], "G differs at pixel {i}"); + assert_eq!(rgba[i * 4 + 2], rgb[i * 3 + 2], "B differs at pixel {i}"); + assert_eq!(rgba[i * 4 + 3], 0xFF, "A not opaque at pixel {i}"); + } + } + + #[test] + fn nv12_rgba_buffer_too_short_returns_err() { + let mut rgba_short = std::vec![0u8; 16 * 8 * 4 - 1]; + let result = MixedSinker::::new(16, 8).with_rgba(&mut rgba_short); + let Err(err) = 
result else { + panic!("expected RgbaBufferTooShort error"); + }; + assert!(matches!( + err, + MixedSinkerError::RgbaBufferTooShort { + expected: 512, + actual: 511, + } + )); + } + + #[test] + #[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" + )] + fn nv12_rgba_simd_matches_scalar_with_random_yuv() { + // Pseudo-random per-pixel YUV across all 4 matrices × both + // ranges. Width 1922 forces both the SIMD main loop AND a scalar + // tail across every backend block size (16 / 32 / 64). + let w = 1922usize; + let h = 4usize; + let mut yp = std::vec![0u8; w * h]; + let mut uvp = std::vec![0u8; w * (h / 2)]; + pseudo_random_u8(&mut yp, 0xC001_C0DE); + pseudo_random_u8(&mut uvp, 0xCAFE_F00D); + let src = Nv12Frame::new(&yp, &uvp, w as u32, h as u32, w as u32, w as u32); + + for &matrix in &[ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::YCgCo, + ] { + for &full_range in &[true, false] { + let mut rgba_simd = std::vec![0u8; w * h * 4]; + let mut rgba_scalar = std::vec![0u8; w * h * 4]; + + let mut s_simd = MixedSinker::::new(w, h) + .with_rgba(&mut rgba_simd) + .unwrap(); + nv12_to(&src, full_range, matrix, &mut s_simd).unwrap(); + + let mut s_scalar = MixedSinker::::new(w, h) + .with_rgba(&mut rgba_scalar) + .unwrap(); + s_scalar.set_simd(false); + nv12_to(&src, full_range, matrix, &mut s_scalar).unwrap(); + + if rgba_simd != rgba_scalar { + let mismatch = rgba_simd + .iter() + .zip(rgba_scalar.iter()) + .position(|(a, b)| a != b) + .unwrap(); + let pixel = mismatch / 4; + let channel = ["R", "G", "B", "A"][mismatch % 4]; + panic!( + "NV12 RGBA SIMD ≠ scalar at byte {mismatch} (px {pixel} {channel}) for matrix={matrix:?} full_range={full_range}: simd={} scalar={}", + rgba_simd[mismatch], rgba_scalar[mismatch] + ); + } + } + } + } + + #[test] + #[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" + )] + fn nv12_rgba_matches_yuv420p_rgba_with_same_pixels() { + // Cross-format invariant: NV12 RGBA byte-identical to Yuv420p + // RGBA when the chroma is the same. Mirrors the existing + // `nv12_matches_yuv420p_mixed_sinker` RGB-path test for the new + // RGBA path. Catches U/V swap bugs in the NV12 RGBA kernel that + // would silently differ from the planar reference. 
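        // The pseudo-random tests above rely on a pseudo_random_u8(buf,
        // seed) helper defined elsewhere in this test module. A minimal
        // stand-in with the same shape, deterministic, seedable, and
        // dependency-free, is a SplitMix64 filler. A sketch only; the
        // real helper may use a different generator.
        fn pseudo_random_u8_sketch(buf: &mut [u8], mut state: u64) {
            for b in buf.iter_mut() {
                // SplitMix64 step: strong avalanche, trivially reproducible.
                state = state.wrapping_add(0x9E37_79B9_7F4A_7C15);
                let mut z = state;
                z = (z ^ (z >> 30)).wrapping_mul(0xBF58_476D_1CE4_E5B9);
                z = (z ^ (z >> 27)).wrapping_mul(0x94D0_49BB_1331_11EB);
                *b = (z ^ (z >> 31)) as u8;
            }
        }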
+ let w = 32u32; + let h = 16u32; + let ws = w as usize; + let hs = h as usize; + let yp: Vec = (0..ws * hs).map(|i| ((i * 37 + 11) & 0xFF) as u8).collect(); + let up: Vec = (0..(ws / 2) * (hs / 2)) + .map(|i| ((i * 53 + 23) & 0xFF) as u8) + .collect(); + let vp: Vec = (0..(ws / 2) * (hs / 2)) + .map(|i| ((i * 71 + 91) & 0xFF) as u8) + .collect(); + let mut uvp: Vec = std::vec![0u8; ws * (hs / 2)]; + for r in 0..hs / 2 { + for c in 0..ws / 2 { + uvp[r * ws + 2 * c] = up[r * (ws / 2) + c]; + uvp[r * ws + 2 * c + 1] = vp[r * (ws / 2) + c]; + } + } + + let yuv420p_src = Yuv420pFrame::new(&yp, &up, &vp, w, h, w, w / 2, w / 2); + let nv12_src = Nv12Frame::new(&yp, &uvp, w, h, w, w); + + let mut rgba_yuv420p = std::vec![0u8; ws * hs * 4]; + let mut sink_yuv420p = MixedSinker::::new(ws, hs) + .with_rgba(&mut rgba_yuv420p) + .unwrap(); + yuv420p_to(&yuv420p_src, true, ColorMatrix::Bt709, &mut sink_yuv420p).unwrap(); + + let mut rgba_nv12 = std::vec![0u8; ws * hs * 4]; + let mut sink_nv12 = MixedSinker::::new(ws, hs) + .with_rgba(&mut rgba_nv12) + .unwrap(); + nv12_to(&nv12_src, true, ColorMatrix::Bt709, &mut sink_nv12).unwrap(); + + assert_eq!(rgba_yuv420p, rgba_nv12); + } + // ---- NV16 MixedSinker --------------------------------------------------- // // 4:2:2: chroma is half-width, full-height. Per-row math is @@ -8742,6 +9039,165 @@ mod tests { assert_eq!(rgb_nv12, rgb_nv21); } + // ---- NV21 RGBA (Ship 8 PR 2) tests -------------------------------------- + // + // Mirrors the NV12 RGBA tests. The cross-format invariant against + // NV12 RGBA (with byte-swapped chroma) catches the case where + // SWAP_UV is wired through correctly for the RGB path but not the + // RGBA path. + + #[test] + #[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" + )] + fn nv21_rgba_only_converts_gray_to_gray_with_opaque_alpha() { + let (yp, vup) = solid_nv21_frame(16, 8, 128, 128, 128); + let src = Nv21Frame::new(&yp, &vup, 16, 8, 16, 16); + + let mut rgba = std::vec![0u8; 16 * 8 * 4]; + let mut sink = MixedSinker::::new(16, 8) + .with_rgba(&mut rgba) + .unwrap(); + nv21_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); + + for px in rgba.chunks(4) { + assert!(px[0].abs_diff(128) <= 1, "R"); + assert_eq!(px[0], px[1], "RGB monochromatic"); + assert_eq!(px[1], px[2], "RGB monochromatic"); + assert_eq!(px[3], 0xFF, "alpha must default to opaque"); + } + } + + #[test] + #[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" + )] + fn nv21_with_rgb_and_with_rgba_produce_byte_identical_rgb_bytes() { + let w = 32usize; + let h = 16usize; + let (yp, vup) = solid_nv21_frame(w as u32, h as u32, 180, 60, 200); + let src = Nv21Frame::new(&yp, &vup, w as u32, h as u32, w as u32, w as u32); + + let mut rgb = std::vec![0u8; w * h * 3]; + let mut rgba = std::vec![0u8; w * h * 4]; + let mut sink = MixedSinker::::new(w, h) + .with_rgb(&mut rgb) + .unwrap() + .with_rgba(&mut rgba) + .unwrap(); + nv21_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); + + for i in 0..(w * h) { + assert_eq!(rgba[i * 4], rgb[i * 3], "R differs at pixel {i}"); + assert_eq!(rgba[i * 4 + 1], rgb[i * 3 + 1], "G differs at pixel {i}"); + assert_eq!(rgba[i * 4 + 2], rgb[i * 3 + 2], "B differs at pixel {i}"); + assert_eq!(rgba[i * 4 + 3], 0xFF, "A not opaque at pixel {i}"); + } + } + + #[test] + #[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" + )] + fn nv21_rgba_simd_matches_scalar_with_random_yuv() { + let w = 
1922usize; + let h = 4usize; + let mut yp = std::vec![0u8; w * h]; + let mut vup = std::vec![0u8; w * (h / 2)]; + pseudo_random_u8(&mut yp, 0xC001_C0DE); + pseudo_random_u8(&mut vup, 0xCAFE_F00D); + let src = Nv21Frame::new(&yp, &vup, w as u32, h as u32, w as u32, w as u32); + + for &matrix in &[ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::YCgCo, + ] { + for &full_range in &[true, false] { + let mut rgba_simd = std::vec![0u8; w * h * 4]; + let mut rgba_scalar = std::vec![0u8; w * h * 4]; + + let mut s_simd = MixedSinker::::new(w, h) + .with_rgba(&mut rgba_simd) + .unwrap(); + nv21_to(&src, full_range, matrix, &mut s_simd).unwrap(); + + let mut s_scalar = MixedSinker::::new(w, h) + .with_rgba(&mut rgba_scalar) + .unwrap(); + s_scalar.set_simd(false); + nv21_to(&src, full_range, matrix, &mut s_scalar).unwrap(); + + if rgba_simd != rgba_scalar { + let mismatch = rgba_simd + .iter() + .zip(rgba_scalar.iter()) + .position(|(a, b)| a != b) + .unwrap(); + let pixel = mismatch / 4; + let channel = ["R", "G", "B", "A"][mismatch % 4]; + panic!( + "NV21 RGBA SIMD ≠ scalar at byte {mismatch} (px {pixel} {channel}) for matrix={matrix:?} full_range={full_range}: simd={} scalar={}", + rgba_simd[mismatch], rgba_scalar[mismatch] + ); + } + } + } + } + + #[test] + #[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" + )] + fn nv21_rgba_matches_nv12_rgba_with_swapped_chroma() { + // Cross-format invariant on the RGBA path. Same shape as + // `nv21_matches_nv12_mixed_sinker_with_swapped_chroma` for RGB: + // building NV21 from NV12's bytes with the chroma pairs swapped + // must produce byte-identical RGBA. Catches cases where SWAP_UV + // is honored for RGB but not RGBA. + let w = 32u32; + let h = 16u32; + let ws = w as usize; + let hs = h as usize; + + let yp: Vec = (0..ws * hs).map(|i| ((i * 37 + 11) & 0xFF) as u8).collect(); + let mut uvp: Vec = std::vec![0u8; ws * (hs / 2)]; + for r in 0..hs / 2 { + for c in 0..ws / 2 { + uvp[r * ws + 2 * c] = ((c + r * 53) & 0xFF) as u8; + uvp[r * ws + 2 * c + 1] = ((c + r * 71) & 0xFF) as u8; + } + } + let mut vup: Vec = uvp.clone(); + for r in 0..hs / 2 { + for c in 0..ws / 2 { + vup[r * ws + 2 * c] = uvp[r * ws + 2 * c + 1]; + vup[r * ws + 2 * c + 1] = uvp[r * ws + 2 * c]; + } + } + + let nv12_src = Nv12Frame::new(&yp, &uvp, w, h, w, w); + let nv21_src = Nv21Frame::new(&yp, &vup, w, h, w, w); + + let mut rgba_nv12 = std::vec![0u8; ws * hs * 4]; + let mut rgba_nv21 = std::vec![0u8; ws * hs * 4]; + let mut s_nv12 = MixedSinker::::new(ws, hs) + .with_rgba(&mut rgba_nv12) + .unwrap(); + let mut s_nv21 = MixedSinker::::new(ws, hs) + .with_rgba(&mut rgba_nv21) + .unwrap(); + nv12_to(&nv12_src, false, ColorMatrix::Bt709, &mut s_nv12).unwrap(); + nv21_to(&nv21_src, false, ColorMatrix::Bt709, &mut s_nv21).unwrap(); + + assert_eq!(rgba_nv12, rgba_nv21); + } + // ---- NV24 MixedSinker --------------------------------------------------- // // 4:4:4 semi-planar: UV row is `2 * width` bytes (one UV pair per From d8c0f6421e94c93bb2923238ea029635c6dfe091 Mon Sep 17 00:00:00 2001 From: uqio <276879906+uqio@users.noreply.github.com> Date: Sun, 26 Apr 2026 16:40:32 +1200 Subject: [PATCH 2/2] update --- .github/workflows/benchmark.yml | 2 +- .github/workflows/ci.yml | 2 +- .github/workflows/coverage.yml | 2 +- .github/workflows/loc.yml | 2 +- src/row/arch/neon.rs | 54 ++++++++++++++++++++++++++++++--- src/row/arch/wasm_simd128.rs | 29 ++++++++++++++++++ src/row/arch/x86_avx2.rs | 29 ++++++++++++++++++ 
src/row/arch/x86_avx512.rs | 29 ++++++++++++++++++ src/row/arch/x86_sse41.rs | 29 ++++++++++++++++++ 9 files changed, 169 insertions(+), 9 deletions(-) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 2b1cf96..81e5159 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -247,7 +247,7 @@ jobs: if: always() steps: - name: Download all benchmark results - uses: actions/download-artifact@v6 + uses: actions/download-artifact@v8 with: path: all-results diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index d0fc00b..fdf5548 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -186,7 +186,7 @@ jobs: - name: Install Rust run: rustup update stable --no-self-update && rustup default stable - name: Install Intel SDE - uses: petarpetrovt/setup-sde@v2.4 + uses: petarpetrovt/setup-sde@v3.0 with: sdeVersion: 9.33.0 environmentVariableName: SDE_PATH diff --git a/.github/workflows/coverage.yml b/.github/workflows/coverage.yml index 3e65542..fdac3f0 100644 --- a/.github/workflows/coverage.yml +++ b/.github/workflows/coverage.yml @@ -156,7 +156,7 @@ jobs: - uses: actions/checkout@v6 - name: Download ${{ matrix.label }} report - uses: actions/download-artifact@v6 + uses: actions/download-artifact@v8 with: name: coverage-${{ matrix.label }} path: coverage/ diff --git a/.github/workflows/loc.yml b/.github/workflows/loc.yml index 669041e..c53d960 100644 --- a/.github/workflows/loc.yml +++ b/.github/workflows/loc.yml @@ -41,7 +41,7 @@ jobs: run: | tokeit --lang rust - name: Upload total loc to GitHub Gist - uses: actions/github-script@v8 + uses: actions/github-script@v9 with: github-token: ${{ secrets.GIST_PAT }} script: | diff --git a/src/row/arch/neon.rs b/src/row/arch/neon.rs index ae95884..d9563c9 100644 --- a/src/row/arch/neon.rs +++ b/src/row/arch/neon.rs @@ -1081,6 +1081,18 @@ pub(crate) unsafe fn p_n_to_rgb_u16_row( /// NEON NV12 → packed RGB (UV-ordered chroma). Thin wrapper over /// [`nv12_or_nv21_to_rgb_or_rgba_row_impl`] with /// `SWAP_UV = false, ALPHA = false`. +/// +/// # Safety +/// +/// Same contract as [`nv12_or_nv21_to_rgb_or_rgba_row_impl`]: +/// +/// 1. **NEON must be available on the current CPU.** Direct callers +/// are responsible for verifying this; the dispatcher in +/// [`crate::row::nv12_to_rgb_row`] checks it. +/// 2. `width & 1 == 0` (4:2:0 requires even width). +/// 3. `y.len() >= width`. +/// 4. `uv_half.len() >= width` (interleaved UV bytes, 2 per chroma pair). +/// 5. `rgb_out.len() >= 3 * width`. #[inline] #[target_feature(enable = "neon")] pub(crate) unsafe fn nv12_to_rgb_row( @@ -1102,6 +1114,12 @@ pub(crate) unsafe fn nv12_to_rgb_row( /// NEON NV21 → packed RGB (VU-ordered chroma). Thin wrapper over /// [`nv12_or_nv21_to_rgb_or_rgba_row_impl`] with /// `SWAP_UV = true, ALPHA = false`. +/// +/// # Safety +/// +/// Same contract as [`nv12_to_rgb_row`]; `vu_half` carries the same +/// number of bytes (`>= width`) but in V-then-U order per chroma +/// pair. #[inline] #[target_feature(enable = "neon")] pub(crate) unsafe fn nv21_to_rgb_row( @@ -1123,6 +1141,12 @@ pub(crate) unsafe fn nv21_to_rgb_row( /// NEON NV12 → packed RGBA (R, G, B, `0xFF` per pixel). Same /// contract as [`nv12_to_rgb_row`] but writes 4 bytes per pixel via /// `vst4q_u8`. `rgba_out.len() >= 4 * width`. +/// +/// # Safety +/// +/// Same as [`nv12_to_rgb_row`] except the output slice must be +/// `>= 4 * width` bytes (one extra byte per pixel for the opaque +/// alpha). 
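These wrappers are `pub(crate)` and elide bounds checks in release builds, so a direct crate-internal caller must discharge the contract itself before entering the kernel. A sketch of that check-then-call shape (hypothetical helper, not part of this patch; `ColorMatrix` and the wrapper are assumed to be in scope inside this module):

    #[cfg(target_arch = "aarch64")]
    fn nv12_rgba_direct(
        y: &[u8],
        uv: &[u8],
        out: &mut [u8],
        width: usize,
        matrix: ColorMatrix,
        full_range: bool,
    ) {
        assert_eq!(width & 1, 0, "4:2:0 requires even width");
        assert!(y.len() >= width && uv.len() >= width && out.len() >= 4 * width);
        if std::arch::is_aarch64_feature_detected!("neon") {
            // SAFETY: NEON verified; slice bounds asserted just above.
            unsafe { nv12_to_rgba_row(y, uv, out, width, matrix, full_range) };
        }
    }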
#[inline] #[target_feature(enable = "neon")] pub(crate) unsafe fn nv12_to_rgba_row( @@ -1144,6 +1168,11 @@ pub(crate) unsafe fn nv12_to_rgba_row( /// NEON NV21 → packed RGBA (R, G, B, `0xFF` per pixel). Same /// contract as [`nv21_to_rgb_row`] but writes 4 bytes per pixel via /// `vst4q_u8`. `rgba_out.len() >= 4 * width`. +/// +/// # Safety +/// +/// Same as [`nv21_to_rgb_row`] except the output slice must be +/// `>= 4 * width` bytes. #[inline] #[target_feature(enable = "neon")] pub(crate) unsafe fn nv21_to_rgba_row( @@ -4108,7 +4137,22 @@ mod tests { } // ---- rgb_to_hsv_row equivalence ------------------------------------ - + // + // The NEON HSV kernel uses `vmaxq_f32` / `vminq_f32` / `vdivq_f32` + // (true f32 ops). Miri's interpreter does not currently implement + // these aarch64 NEON f32 intrinsics — under + // `cargo miri test --target aarch64-unknown-linux-gnu` the calls + // raise `unsupported operation: can't call foreign function + // "llvm.aarch64.neon.fmax.v4f32"`. The previous + // `#[cfg_attr(miri, ignore = ...)]` annotations didn't suffice in + // CI (Miri tried to evaluate them anyway). Compiling the helper + // and the tests *out* under `cfg(miri)` removes the f32 + // intrinsics from the binary entirely so Miri can't trip on them. + // The other backends (wasm / x86) are tested by their respective + // arch modules; correctness of the NEON HSV path is still covered + // by host-arch CI runs that don't go through Miri. + + #[cfg(not(miri))] fn check_hsv_equivalence(rgb: &[u8], width: usize) { let mut h_scalar = std::vec![0u8; width]; let mut s_scalar = std::vec![0u8; width]; @@ -4160,21 +4204,21 @@ mod tests { } #[test] - #[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] + #[cfg(not(miri))] fn hsv_neon_matches_scalar_pseudo_random_16() { let rgb = pseudo_random_bgr(16); check_hsv_equivalence(&rgb, 16); } #[test] - #[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] + #[cfg(not(miri))] fn hsv_neon_matches_scalar_pseudo_random_1920() { let rgb = pseudo_random_bgr(1920); check_hsv_equivalence(&rgb, 1920); } #[test] - #[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] + #[cfg(not(miri))] fn hsv_neon_matches_scalar_tail_widths() { // Widths that force a non‑trivial scalar tail (non‑multiple of 16). for w in [1usize, 7, 15, 17, 31, 1921] { @@ -4184,7 +4228,7 @@ mod tests { } #[test] - #[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] + #[cfg(not(miri))] fn hsv_neon_matches_scalar_primaries_and_edges() { // Primary colors, grays, near‑saturation — exercise each hue branch // and the v==0, delta==0, h<0 wrap paths. diff --git a/src/row/arch/wasm_simd128.rs b/src/row/arch/wasm_simd128.rs index f8a794d..ffdd522 100644 --- a/src/row/arch/wasm_simd128.rs +++ b/src/row/arch/wasm_simd128.rs @@ -1320,6 +1320,18 @@ unsafe fn deinterleave_uv_u16_wasm(ptr: *const u16) -> (v128, v128) { /// WASM simd128 NV12 → packed RGB. Thin wrapper over /// [`nv12_or_nv21_to_rgb_or_rgba_row_impl`] with /// `SWAP_UV = false, ALPHA = false`. +/// +/// # Safety +/// +/// Same contract as [`nv12_or_nv21_to_rgb_or_rgba_row_impl`]: +/// +/// 1. **simd128 must be enabled at compile time.** WASM has no +/// runtime CPU detection — the module's SIMD support is fixed at +/// produce time. +/// 2. `width & 1 == 0` (4:2:0 requires even width). +/// 3. `y.len() >= width`. +/// 4. `uv_half.len() >= width` (interleaved UV bytes, 2 per chroma pair). +/// 5. `rgb_out.len() >= 3 * width`. 
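Point 1 has no runtime escape hatch on wasm32: the feature set is fixed when the module is built. One typical way to discharge it for the whole crate graph (an assumed invocation; a `.cargo/config.toml` `rustflags` entry works too):

    RUSTFLAGS="-C target-feature=+simd128" \
        cargo build --target wasm32-unknown-unknown --release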
#[inline] #[target_feature(enable = "simd128")] pub(crate) unsafe fn nv12_to_rgb_row( @@ -1340,6 +1352,12 @@ pub(crate) unsafe fn nv12_to_rgb_row( /// WASM simd128 NV21 → packed RGB. Thin wrapper over /// [`nv12_or_nv21_to_rgb_or_rgba_row_impl`] with /// `SWAP_UV = true, ALPHA = false`. +/// +/// # Safety +/// +/// Same contract as [`nv12_to_rgb_row`]; `vu_half` carries the same +/// number of bytes (`>= width`) but in V-then-U order per chroma +/// pair. #[inline] #[target_feature(enable = "simd128")] pub(crate) unsafe fn nv21_to_rgb_row( @@ -1360,6 +1378,12 @@ pub(crate) unsafe fn nv21_to_rgb_row( /// WASM simd128 NV12 → packed RGBA. Same contract as /// [`nv12_to_rgb_row`] but writes 4 bytes per pixel via /// [`write_rgba_16`]. `rgba_out.len() >= 4 * width`. +/// +/// # Safety +/// +/// Same as [`nv12_to_rgb_row`] except the output slice must be +/// `>= 4 * width` bytes (one extra byte per pixel for the opaque +/// alpha). #[inline] #[target_feature(enable = "simd128")] pub(crate) unsafe fn nv12_to_rgba_row( @@ -1380,6 +1404,11 @@ pub(crate) unsafe fn nv12_to_rgba_row( /// WASM simd128 NV21 → packed RGBA. Same contract as /// [`nv21_to_rgb_row`] but writes 4 bytes per pixel via /// [`write_rgba_16`]. `rgba_out.len() >= 4 * width`. +/// +/// # Safety +/// +/// Same as [`nv21_to_rgb_row`] except the output slice must be +/// `>= 4 * width` bytes. #[inline] #[target_feature(enable = "simd128")] pub(crate) unsafe fn nv21_to_rgba_row( diff --git a/src/row/arch/x86_avx2.rs b/src/row/arch/x86_avx2.rs index 3312a79..50c7aaf 100644 --- a/src/row/arch/x86_avx2.rs +++ b/src/row/arch/x86_avx2.rs @@ -1526,6 +1526,18 @@ unsafe fn deinterleave_uv_u16_avx2(ptr: *const u16) -> (__m256i, __m256i) { /// AVX2 NV12 → packed RGB. Thin wrapper over /// [`nv12_or_nv21_to_rgb_or_rgba_row_impl`] with /// `SWAP_UV = false, ALPHA = false`. +/// +/// # Safety +/// +/// Same contract as [`nv12_or_nv21_to_rgb_or_rgba_row_impl`]: +/// +/// 1. **AVX2 must be available on the current CPU.** Direct callers +/// are responsible for verifying this; the dispatcher in +/// [`crate::row::nv12_to_rgb_row`] checks it. +/// 2. `width & 1 == 0` (4:2:0 requires even width). +/// 3. `y.len() >= width`. +/// 4. `uv_half.len() >= width` (interleaved UV bytes, 2 per chroma pair). +/// 5. `rgb_out.len() >= 3 * width`. #[inline] #[target_feature(enable = "avx2")] pub(crate) unsafe fn nv12_to_rgb_row( @@ -1546,6 +1558,12 @@ pub(crate) unsafe fn nv12_to_rgb_row( /// AVX2 NV21 → packed RGB. Thin wrapper over /// [`nv12_or_nv21_to_rgb_or_rgba_row_impl`] with /// `SWAP_UV = true, ALPHA = false`. +/// +/// # Safety +/// +/// Same contract as [`nv12_to_rgb_row`]; `vu_half` carries the same +/// number of bytes (`>= width`) but in V-then-U order per chroma +/// pair. #[inline] #[target_feature(enable = "avx2")] pub(crate) unsafe fn nv21_to_rgb_row( @@ -1566,6 +1584,12 @@ pub(crate) unsafe fn nv21_to_rgb_row( /// AVX2 NV12 → packed RGBA. Same contract as [`nv12_to_rgb_row`] /// but writes 4 bytes per pixel via [`write_rgba_32`]. /// `rgba_out.len() >= 4 * width`. +/// +/// # Safety +/// +/// Same as [`nv12_to_rgb_row`] except the output slice must be +/// `>= 4 * width` bytes (one extra byte per pixel for the opaque +/// alpha). #[inline] #[target_feature(enable = "avx2")] pub(crate) unsafe fn nv12_to_rgba_row( @@ -1586,6 +1610,11 @@ pub(crate) unsafe fn nv12_to_rgba_row( /// AVX2 NV21 → packed RGBA. Same contract as [`nv21_to_rgb_row`] /// but writes 4 bytes per pixel via [`write_rgba_32`]. /// `rgba_out.len() >= 4 * width`. 
+/// +/// # Safety +/// +/// Same as [`nv21_to_rgb_row`] except the output slice must be +/// `>= 4 * width` bytes. #[inline] #[target_feature(enable = "avx2")] pub(crate) unsafe fn nv21_to_rgba_row( diff --git a/src/row/arch/x86_avx512.rs b/src/row/arch/x86_avx512.rs index 77ce177..e1bd915 100644 --- a/src/row/arch/x86_avx512.rs +++ b/src/row/arch/x86_avx512.rs @@ -1573,6 +1573,18 @@ unsafe fn deinterleave_uv_u16_avx512(ptr: *const u16) -> (__m512i, __m512i) { /// AVX‑512 NV12 → packed RGB. Thin wrapper over /// [`nv12_or_nv21_to_rgb_or_rgba_row_impl`] with /// `SWAP_UV = false, ALPHA = false`. +/// +/// # Safety +/// +/// Same contract as [`nv12_or_nv21_to_rgb_or_rgba_row_impl`]: +/// +/// 1. **AVX‑512F + AVX‑512BW must be available on the current CPU.** +/// Direct callers are responsible for verifying this; the +/// dispatcher in [`crate::row::nv12_to_rgb_row`] checks it. +/// 2. `width & 1 == 0` (4:2:0 requires even width). +/// 3. `y.len() >= width`. +/// 4. `uv_half.len() >= width` (interleaved UV bytes, 2 per chroma pair). +/// 5. `rgb_out.len() >= 3 * width`. #[inline] #[target_feature(enable = "avx512f,avx512bw")] pub(crate) unsafe fn nv12_to_rgb_row( @@ -1593,6 +1605,12 @@ pub(crate) unsafe fn nv12_to_rgb_row( /// AVX‑512 NV21 → packed RGB. Thin wrapper over /// [`nv12_or_nv21_to_rgb_or_rgba_row_impl`] with /// `SWAP_UV = true, ALPHA = false`. +/// +/// # Safety +/// +/// Same contract as [`nv12_to_rgb_row`]; `vu_half` carries the same +/// number of bytes (`>= width`) but in V-then-U order per chroma +/// pair. #[inline] #[target_feature(enable = "avx512f,avx512bw")] pub(crate) unsafe fn nv21_to_rgb_row( @@ -1613,6 +1631,12 @@ pub(crate) unsafe fn nv21_to_rgb_row( /// AVX‑512 NV12 → packed RGBA. Same contract as [`nv12_to_rgb_row`] /// but writes 4 bytes per pixel via [`write_rgba_64`]. /// `rgba_out.len() >= 4 * width`. +/// +/// # Safety +/// +/// Same as [`nv12_to_rgb_row`] except the output slice must be +/// `>= 4 * width` bytes (one extra byte per pixel for the opaque +/// alpha). #[inline] #[target_feature(enable = "avx512f,avx512bw")] pub(crate) unsafe fn nv12_to_rgba_row( @@ -1633,6 +1657,11 @@ pub(crate) unsafe fn nv12_to_rgba_row( /// AVX‑512 NV21 → packed RGBA. Same contract as [`nv21_to_rgb_row`] /// but writes 4 bytes per pixel via [`write_rgba_64`]. /// `rgba_out.len() >= 4 * width`. +/// +/// # Safety +/// +/// Same as [`nv21_to_rgb_row`] except the output slice must be +/// `>= 4 * width` bytes. #[inline] #[target_feature(enable = "avx512f,avx512bw")] pub(crate) unsafe fn nv21_to_rgba_row( diff --git a/src/row/arch/x86_sse41.rs b/src/row/arch/x86_sse41.rs index ad8925b..d4a342e 100644 --- a/src/row/arch/x86_sse41.rs +++ b/src/row/arch/x86_sse41.rs @@ -1339,6 +1339,18 @@ pub(crate) unsafe fn yuv_444p16_to_rgb_u16_row( /// SSE4.1 NV12 → packed RGB. Thin wrapper over /// [`nv12_or_nv21_to_rgb_or_rgba_row_impl`] with /// `SWAP_UV = false, ALPHA = false`. +/// +/// # Safety +/// +/// Same contract as [`nv12_or_nv21_to_rgb_or_rgba_row_impl`]: +/// +/// 1. **SSE4.1 must be available on the current CPU.** Direct +/// callers are responsible for verifying this; the dispatcher in +/// [`crate::row::nv12_to_rgb_row`] checks it. +/// 2. `width & 1 == 0` (4:2:0 requires even width). +/// 3. `y.len() >= width`. +/// 4. `uv_half.len() >= width` (interleaved UV bytes, 2 per chroma pair). +/// 5. `rgb_out.len() >= 3 * width`. 
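The x86 analogue of the same check-then-call shape, using the `std::arch::is_x86_feature_detected!` probe the tests above already use (hypothetical helper, not part of this patch; `ColorMatrix` and the wrapper are assumed to be in scope inside this module):

    #[cfg(target_arch = "x86_64")]
    fn nv12_rgb_direct(
        y: &[u8],
        uv: &[u8],
        out: &mut [u8],
        width: usize,
        matrix: ColorMatrix,
        full_range: bool,
    ) {
        assert_eq!(width & 1, 0, "4:2:0 requires even width");
        assert!(y.len() >= width && uv.len() >= width && out.len() >= 3 * width);
        if std::arch::is_x86_feature_detected!("sse4.1") {
            // SAFETY: SSE4.1 verified; slice bounds asserted just above.
            unsafe { nv12_to_rgb_row(y, uv, out, width, matrix, full_range) };
        }
    }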
#[inline] #[target_feature(enable = "sse4.1")] pub(crate) unsafe fn nv12_to_rgb_row( @@ -1359,6 +1371,12 @@ pub(crate) unsafe fn nv12_to_rgb_row( /// SSE4.1 NV21 → packed RGB. Thin wrapper over /// [`nv12_or_nv21_to_rgb_or_rgba_row_impl`] with /// `SWAP_UV = true, ALPHA = false`. +/// +/// # Safety +/// +/// Same contract as [`nv12_to_rgb_row`]; `vu_half` carries the same +/// number of bytes (`>= width`) but in V-then-U order per chroma +/// pair. #[inline] #[target_feature(enable = "sse4.1")] pub(crate) unsafe fn nv21_to_rgb_row( @@ -1379,6 +1397,12 @@ pub(crate) unsafe fn nv21_to_rgb_row( /// SSE4.1 NV12 → packed RGBA. Same contract as [`nv12_to_rgb_row`] /// but writes 4 bytes per pixel via [`write_rgba_16`]. /// `rgba_out.len() >= 4 * width`. +/// +/// # Safety +/// +/// Same as [`nv12_to_rgb_row`] except the output slice must be +/// `>= 4 * width` bytes (one extra byte per pixel for the opaque +/// alpha). #[inline] #[target_feature(enable = "sse4.1")] pub(crate) unsafe fn nv12_to_rgba_row( @@ -1399,6 +1423,11 @@ pub(crate) unsafe fn nv12_to_rgba_row( /// SSE4.1 NV21 → packed RGBA. Same contract as [`nv21_to_rgb_row`] /// but writes 4 bytes per pixel via [`write_rgba_16`]. /// `rgba_out.len() >= 4 * width`. +/// +/// # Safety +/// +/// Same as [`nv21_to_rgb_row`] except the output slice must be +/// `>= 4 * width` bytes. #[inline] #[target_feature(enable = "sse4.1")] pub(crate) unsafe fn nv21_to_rgba_row(