From f639f4274d01f1e499a872ae09dd1fbd1c99fcbd Mon Sep 17 00:00:00 2001 From: uqio <276879906+uqio@users.noreply.github.com> Date: Sun, 26 Apr 2026 18:32:12 +1200 Subject: [PATCH] update --- src/row/arch/neon.rs | 253 ++++++++++-- src/row/arch/wasm_simd128.rs | 211 ++++++++-- src/row/arch/x86_avx2.rs | 225 ++++++++-- src/row/arch/x86_avx512.rs | 225 ++++++++-- src/row/arch/x86_sse41.rs | 225 ++++++++-- src/row/mod.rs | 150 +++++++ src/row/scalar.rs | 144 ++++++- src/sinker/mixed.rs | 770 ++++++++++++++++++++++++++++++++--- 8 files changed, 1966 insertions(+), 237 deletions(-) diff --git a/src/row/arch/neon.rs b/src/row/arch/neon.rs index c5c72c7..ce5e882 100644 --- a/src/row/arch/neon.rs +++ b/src/row/arch/neon.rs @@ -1349,11 +1349,19 @@ pub(crate) unsafe fn nv12_or_nv21_to_rgb_or_rgba_row_impl< } /// NEON NV24 → packed RGB (UV-ordered, 4:4:4). Thin wrapper over -/// [`nv24_or_nv42_to_rgb_row_impl`] with `SWAP_UV = false`. +/// [`nv24_or_nv42_to_rgb_or_rgba_row_impl`] with +/// `SWAP_UV = false, ALPHA = false`. /// /// # Safety /// -/// Same as [`nv24_or_nv42_to_rgb_row_impl`]. +/// Same contract as [`nv24_or_nv42_to_rgb_or_rgba_row_impl`] with +/// `ALPHA = false` (so `out.len() >= width * 3` specializes to +/// `rgb_out.len() >= 3 * width`): +/// +/// 1. **NEON must be available on the current CPU.** +/// 2. `y.len() >= width`. +/// 3. `uv.len() >= 2 * width`. +/// 4. `rgb_out.len() >= 3 * width`. #[inline] #[target_feature(enable = "neon")] pub(crate) unsafe fn nv24_to_rgb_row( @@ -1366,16 +1374,18 @@ pub(crate) unsafe fn nv24_to_rgb_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - nv24_or_nv42_to_rgb_row_impl::(y, uv, rgb_out, width, matrix, full_range); + nv24_or_nv42_to_rgb_or_rgba_row_impl::(y, uv, rgb_out, width, matrix, full_range); } } /// NEON NV42 → packed RGB (VU-ordered, 4:4:4). Thin wrapper over -/// [`nv24_or_nv42_to_rgb_row_impl`] with `SWAP_UV = true`. +/// [`nv24_or_nv42_to_rgb_or_rgba_row_impl`] with +/// `SWAP_UV = true, ALPHA = false`. /// /// # Safety /// -/// Same as [`nv24_or_nv42_to_rgb_row_impl`]. +/// Same contract as [`nv24_to_rgb_row`]; `vu` carries the same +/// `2 * width` bytes but in V-then-U order per chroma pair. #[inline] #[target_feature(enable = "neon")] pub(crate) unsafe fn nv42_to_rgb_row( @@ -1388,40 +1398,96 @@ pub(crate) unsafe fn nv42_to_rgb_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - nv24_or_nv42_to_rgb_row_impl::(y, vu, rgb_out, width, matrix, full_range); + nv24_or_nv42_to_rgb_or_rgba_row_impl::(y, vu, rgb_out, width, matrix, full_range); } } -/// Shared NEON NV24/NV42 kernel (4:4:4 semi-planar). Unlike -/// [`nv12_or_nv21_to_rgb_row_impl`], chroma is not subsampled — one -/// UV pair per Y pixel, so the chroma-duplication step (`vzip*`) -/// disappears: compute 16 chroma values per 16 Y pixels directly. +/// NEON NV24 → packed RGBA (R, G, B, `0xFF` per pixel). Same +/// contract as [`nv24_to_rgb_row`] but writes 4 bytes per pixel via +/// `vst4q_u8`. `rgba_out.len() >= 4 * width`. +/// +/// # Safety +/// +/// Same as [`nv24_to_rgb_row`] except the output slice must be +/// `>= 4 * width` bytes. +#[inline] +#[target_feature(enable = "neon")] +pub(crate) unsafe fn nv24_to_rgba_row( + y: &[u8], + uv: &[u8], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. 
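+    // `SWAP_UV = false` keeps NV24's U-then-V interleave and
+    // `ALPHA = true` selects the `vst4q_u8` store path; both are
+    // const generics, so the call monomorphizes with no runtime
+    // format checks.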
+ unsafe { + nv24_or_nv42_to_rgb_or_rgba_row_impl::(y, uv, rgba_out, width, matrix, full_range); + } +} + +/// NEON NV42 → packed RGBA (R, G, B, `0xFF` per pixel). Same +/// contract as [`nv42_to_rgb_row`] but writes 4 bytes per pixel via +/// `vst4q_u8`. `rgba_out.len() >= 4 * width`. +/// +/// # Safety +/// +/// Same as [`nv42_to_rgb_row`] except the output slice must be +/// `>= 4 * width` bytes. +#[inline] +#[target_feature(enable = "neon")] +pub(crate) unsafe fn nv42_to_rgba_row( + y: &[u8], + vu: &[u8], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + nv24_or_nv42_to_rgb_or_rgba_row_impl::(y, vu, rgba_out, width, matrix, full_range); + } +} + +/// Shared NEON NV24/NV42 kernel (4:4:4 semi-planar) at 3 bpp (RGB) +/// or 4 bpp + opaque alpha (RGBA). Unlike +/// [`nv12_or_nv21_to_rgb_or_rgba_row_impl`], chroma is not +/// subsampled — one UV pair per Y pixel, so the chroma-duplication +/// step (`vzip*`) disappears: compute 16 chroma values per 16 Y +/// pixels directly. /// /// `SWAP_UV = false` selects NV24 (even byte = U, odd = V); -/// `SWAP_UV = true` selects NV42 (even = V, odd = U). +/// `SWAP_UV = true` selects NV42 (even = V, odd = U). `ALPHA = true` +/// writes via `vst4q_u8` with constant `0xFF` alpha; `ALPHA = false` +/// writes via `vst3q_u8`. Both const generics drive compile-time +/// monomorphization. /// /// # Safety /// /// 1. **NEON must be available on the current CPU.** /// 2. `y.len() >= width`. -/// 3. `uv_or_vu.len() >= 2 * width` (one UV pair per Y pixel = -/// `2 * width` bytes). -/// 4. `rgb_out.len() >= 3 * width`. +/// 3. `uv_or_vu.len() >= 2 * width` (one UV pair per Y pixel). +/// 4. `out.len() >= width * (if ALPHA { 4 } else { 3 })`. /// /// No width parity constraint (4:4:4). #[inline] #[target_feature(enable = "neon")] -unsafe fn nv24_or_nv42_to_rgb_row_impl( +pub(crate) unsafe fn nv24_or_nv42_to_rgb_or_rgba_row_impl< + const SWAP_UV: bool, + const ALPHA: bool, +>( y: &[u8], uv_or_vu: &[u8], - rgb_out: &mut [u8], + out: &mut [u8], width: usize, matrix: ColorMatrix, full_range: bool, ) { debug_assert!(y.len() >= width); debug_assert!(uv_or_vu.len() >= 2 * width); - debug_assert!(rgb_out.len() >= width * 3); + let bpp: usize = if ALPHA { 4 } else { 3 }; + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params(full_range); @@ -1442,6 +1508,7 @@ unsafe fn nv24_or_nv42_to_rgb_row_impl( let cgv = vdupq_n_s32(coeffs.g_v()); let cbu = vdupq_n_s32(coeffs.b_u()); let cbv = vdupq_n_s32(coeffs.b_v()); + let alpha_u8 = vdupq_n_u8(0xFF); let mut x = 0usize; while x + 16 <= width { @@ -1510,32 +1577,36 @@ unsafe fn nv24_or_nv42_to_rgb_row_impl( vqmovun_s16(vqaddq_s16(y_scaled_hi, r_chroma_hi)), ); - let rgb = uint8x16x3_t(r_u8, g_u8, b_u8); - vst3q_u8(rgb_out.as_mut_ptr().add(x * 3), rgb); + if ALPHA { + let rgba = uint8x16x4_t(r_u8, g_u8, b_u8, alpha_u8); + vst4q_u8(out.as_mut_ptr().add(x * 4), rgba); + } else { + let rgb = uint8x16x3_t(r_u8, g_u8, b_u8); + vst3q_u8(out.as_mut_ptr().add(x * 3), rgb); + } x += 16; } // Scalar tail for 0..15 leftover pixels. 
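    // The vector loop above consumed every full 16-pixel group, so at
    // most 15 pixels remain; delegating them to the scalar kernels
    // keeps the tail byte-identical to the scalar reference.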
if x < width { - if SWAP_UV { - scalar::nv42_to_rgb_row( - &y[x..width], - &uv_or_vu[x * 2..width * 2], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); - } else { - scalar::nv24_to_rgb_row( - &y[x..width], - &uv_or_vu[x * 2..width * 2], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_uv = &uv_or_vu[x * 2..width * 2]; + let tail_w = width - x; + let tail_out = &mut out[x * bpp..width * bpp]; + match (SWAP_UV, ALPHA) { + (false, false) => { + scalar::nv24_to_rgb_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range) + } + (true, false) => { + scalar::nv42_to_rgb_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range) + } + (false, true) => { + scalar::nv24_to_rgba_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range) + } + (true, true) => { + scalar::nv42_to_rgba_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range) + } } } } @@ -4150,6 +4221,114 @@ mod tests { } } + // ---- nv24_to_rgba_row / nv42_to_rgba_row equivalence ---------------- + + fn check_nv24_rgba_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { + let y: std::vec::Vec = (0..width).map(|i| ((i * 37 + 11) & 0xFF) as u8).collect(); + let uv: std::vec::Vec = (0..width) + .flat_map(|i| [((i * 53 + 23) & 0xFF) as u8, ((i * 71 + 91) & 0xFF) as u8]) + .collect(); + let mut rgba_scalar = std::vec![0u8; width * 4]; + let mut rgba_neon = std::vec![0u8; width * 4]; + + scalar::nv24_to_rgba_row(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + unsafe { + nv24_to_rgba_row(&y, &uv, &mut rgba_neon, width, matrix, full_range); + } + + if rgba_scalar != rgba_neon { + let first_diff = rgba_scalar + .iter() + .zip(rgba_neon.iter()) + .position(|(a, b)| a != b) + .unwrap(); + let pixel = first_diff / 4; + let channel = ["R", "G", "B", "A"][first_diff % 4]; + panic!( + "NEON NV24 RGBA diverges from scalar at byte {first_diff} (px {pixel} {channel}, width={width}, matrix={matrix:?}, full_range={full_range}): scalar={} neon={}", + rgba_scalar[first_diff], rgba_neon[first_diff] + ); + } + } + + fn check_nv42_rgba_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { + let y: std::vec::Vec = (0..width).map(|i| ((i * 37 + 11) & 0xFF) as u8).collect(); + let vu: std::vec::Vec = (0..width) + .flat_map(|i| [((i * 53 + 23) & 0xFF) as u8, ((i * 71 + 91) & 0xFF) as u8]) + .collect(); + let mut rgba_scalar = std::vec![0u8; width * 4]; + let mut rgba_neon = std::vec![0u8; width * 4]; + + scalar::nv42_to_rgba_row(&y, &vu, &mut rgba_scalar, width, matrix, full_range); + unsafe { + nv42_to_rgba_row(&y, &vu, &mut rgba_neon, width, matrix, full_range); + } + + if rgba_scalar != rgba_neon { + let first_diff = rgba_scalar + .iter() + .zip(rgba_neon.iter()) + .position(|(a, b)| a != b) + .unwrap(); + let pixel = first_diff / 4; + let channel = ["R", "G", "B", "A"][first_diff % 4]; + panic!( + "NEON NV42 RGBA diverges from scalar at byte {first_diff} (px {pixel} {channel}, width={width}, matrix={matrix:?}, full_range={full_range}): scalar={} neon={}", + rgba_scalar[first_diff], rgba_neon[first_diff] + ); + } + } + + #[test] + #[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] + fn nv24_neon_rgba_matches_scalar_all_matrices_16() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_nv24_rgba_equivalence(16, m, full); + } + } + } + + #[test] + #[cfg_attr(miri, 
ignore = "NEON SIMD intrinsics unsupported by Miri")] + fn nv24_neon_rgba_matches_scalar_widths() { + for w in [1usize, 3, 15, 17, 32, 33, 1920, 1921] { + check_nv24_rgba_equivalence(w, ColorMatrix::Bt709, false); + } + } + + #[test] + #[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] + fn nv42_neon_rgba_matches_scalar_all_matrices_16() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_nv42_rgba_equivalence(16, m, full); + } + } + } + + #[test] + #[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] + fn nv42_neon_rgba_matches_scalar_widths() { + for w in [1usize, 3, 15, 17, 32, 33, 1920, 1921] { + check_nv42_rgba_equivalence(w, ColorMatrix::Bt709, false); + } + } + // ---- yuv_444_to_rgb_row equivalence --------------------------------- fn check_yuv_444_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { diff --git a/src/row/arch/wasm_simd128.rs b/src/row/arch/wasm_simd128.rs index 7286e0b..92ca897 100644 --- a/src/row/arch/wasm_simd128.rs +++ b/src/row/arch/wasm_simd128.rs @@ -1606,12 +1606,11 @@ pub(crate) unsafe fn nv12_or_nv21_to_rgb_or_rgba_row_impl< } } -/// wasm simd128 NV24 → packed RGB (UV-ordered, 4:4:4). Thin wrapper -/// over [`nv24_or_nv42_to_rgb_row_impl`] with `SWAP_UV = false`. +/// wasm simd128 NV24 → packed RGB (UV-ordered, 4:4:4). /// /// # Safety /// -/// Same as [`nv24_or_nv42_to_rgb_row_impl`]. +/// Same as [`nv24_or_nv42_to_rgb_or_rgba_row_impl`]. #[inline] #[target_feature(enable = "simd128")] pub(crate) unsafe fn nv24_to_rgb_row( @@ -1624,7 +1623,7 @@ pub(crate) unsafe fn nv24_to_rgb_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - nv24_or_nv42_to_rgb_row_impl::(y, uv, rgb_out, width, matrix, full_range); + nv24_or_nv42_to_rgb_or_rgba_row_impl::(y, uv, rgb_out, width, matrix, full_range); } } @@ -1632,7 +1631,7 @@ pub(crate) unsafe fn nv24_to_rgb_row( /// /// # Safety /// -/// Same as [`nv24_or_nv42_to_rgb_row_impl`]. +/// Same as [`nv24_or_nv42_to_rgb_or_rgba_row_impl`]. #[inline] #[target_feature(enable = "simd128")] pub(crate) unsafe fn nv42_to_rgb_row( @@ -1645,7 +1644,49 @@ pub(crate) unsafe fn nv42_to_rgb_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - nv24_or_nv42_to_rgb_row_impl::(y, vu, rgb_out, width, matrix, full_range); + nv24_or_nv42_to_rgb_or_rgba_row_impl::(y, vu, rgb_out, width, matrix, full_range); + } +} + +/// wasm simd128 NV24 → packed RGBA (UV-ordered, 4:4:4, opaque alpha). +/// +/// # Safety +/// +/// Same as [`nv24_or_nv42_to_rgb_or_rgba_row_impl`] but `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "simd128")] +pub(crate) unsafe fn nv24_to_rgba_row( + y: &[u8], + uv: &[u8], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + nv24_or_nv42_to_rgb_or_rgba_row_impl::(y, uv, rgba_out, width, matrix, full_range); + } +} + +/// wasm simd128 NV42 → packed RGBA (VU-ordered, 4:4:4, opaque alpha). +/// +/// # Safety +/// +/// Same as [`nv24_or_nv42_to_rgb_or_rgba_row_impl`] but `rgba_out.len() >= 4 * width`. 
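+/// (`SWAP_UV = true` only changes which interleaved byte feeds U and
+/// V; `vu` still carries `2 * width` bytes, V first in each pair.)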
+#[inline] +#[target_feature(enable = "simd128")] +pub(crate) unsafe fn nv42_to_rgba_row( + y: &[u8], + vu: &[u8], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + nv24_or_nv42_to_rgb_or_rgba_row_impl::(y, vu, rgba_out, width, matrix, full_range); } } @@ -1658,20 +1699,24 @@ pub(crate) unsafe fn nv42_to_rgb_row( /// /// 1. **simd128 must be available** (compile-time `target_feature`). /// 2. `y.len() >= width`, `uv_or_vu.len() >= 2 * width`, -/// `rgb_out.len() >= 3 * width`. +/// `out.len() >= width * if ALPHA { 4 } else { 3 }`. #[inline] #[target_feature(enable = "simd128")] -unsafe fn nv24_or_nv42_to_rgb_row_impl( +pub(crate) unsafe fn nv24_or_nv42_to_rgb_or_rgba_row_impl< + const SWAP_UV: bool, + const ALPHA: bool, +>( y: &[u8], uv_or_vu: &[u8], - rgb_out: &mut [u8], + out: &mut [u8], width: usize, matrix: ColorMatrix, full_range: bool, ) { + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert!(y.len() >= width); debug_assert!(uv_or_vu.len() >= 2 * width); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params(full_range); @@ -1691,6 +1736,7 @@ unsafe fn nv24_or_nv42_to_rgb_row_impl( let cgv = i32x4_splat(coeffs.g_v()); let cbu = i32x4_splat(coeffs.b_u()); let cbv = i32x4_splat(coeffs.b_v()); + let alpha_u8 = u8x16_splat(0xFF); let mut x = 0usize; while x + 16 <= width { @@ -1828,30 +1874,33 @@ unsafe fn nv24_or_nv42_to_rgb_row_impl( let g_u8 = u8x16_narrow_i16x8(g_lo, g_hi); let r_u8 = u8x16_narrow_i16x8(r_lo, r_hi); - write_rgb_16(r_u8, g_u8, b_u8, rgb_out.as_mut_ptr().add(x * 3)); + if ALPHA { + write_rgba_16(r_u8, g_u8, b_u8, alpha_u8, out.as_mut_ptr().add(x * 4)); + } else { + write_rgb_16(r_u8, g_u8, b_u8, out.as_mut_ptr().add(x * 3)); + } x += 16; } if x < width { - if SWAP_UV { - scalar::nv42_to_rgb_row( - &y[x..width], - &uv_or_vu[x * 2..width * 2], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); - } else { - scalar::nv24_to_rgb_row( - &y[x..width], - &uv_or_vu[x * 2..width * 2], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_uv = &uv_or_vu[x * 2..width * 2]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + match (SWAP_UV, ALPHA) { + (false, false) => { + scalar::nv24_to_rgb_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range) + } + (true, false) => { + scalar::nv42_to_rgb_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range) + } + (false, true) => { + scalar::nv24_to_rgba_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range) + } + (true, true) => { + scalar::nv42_to_rgba_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range) + } } } } @@ -3737,6 +3786,110 @@ mod tests { } } + // ---- nv24_to_rgba_row / nv42_to_rgba_row equivalence ---------------- + + fn check_nv24_rgba_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { + let y: std::vec::Vec = (0..width).map(|i| ((i * 37 + 11) & 0xFF) as u8).collect(); + let uv: std::vec::Vec = (0..width) + .flat_map(|i| [((i * 53 + 23) & 0xFF) as u8, ((i * 71 + 91) & 0xFF) as u8]) + .collect(); + let mut rgba_scalar = std::vec![0u8; width * 4]; + let mut rgba_simd = std::vec![0u8; width * 4]; + + scalar::nv24_to_rgba_row(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + unsafe { + nv24_to_rgba_row(&y, &uv, &mut rgba_simd, 
width, matrix, full_range); + } + + if rgba_scalar != rgba_simd { + let first_diff = rgba_scalar + .iter() + .zip(rgba_simd.iter()) + .position(|(a, b)| a != b) + .unwrap(); + let pixel = first_diff / 4; + let channel = ["R", "G", "B", "A"][first_diff % 4]; + panic!( + "simd128 NV24 RGBA diverges from scalar at byte {first_diff} (px {pixel} {channel}, width={width}, matrix={matrix:?}, full_range={full_range}): scalar={} simd={}", + rgba_scalar[first_diff], rgba_simd[first_diff] + ); + } + } + + fn check_nv42_rgba_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { + let y: std::vec::Vec = (0..width).map(|i| ((i * 37 + 11) & 0xFF) as u8).collect(); + let vu: std::vec::Vec = (0..width) + .flat_map(|i| [((i * 53 + 23) & 0xFF) as u8, ((i * 71 + 91) & 0xFF) as u8]) + .collect(); + let mut rgba_scalar = std::vec![0u8; width * 4]; + let mut rgba_simd = std::vec![0u8; width * 4]; + + scalar::nv42_to_rgba_row(&y, &vu, &mut rgba_scalar, width, matrix, full_range); + unsafe { + nv42_to_rgba_row(&y, &vu, &mut rgba_simd, width, matrix, full_range); + } + + if rgba_scalar != rgba_simd { + let first_diff = rgba_scalar + .iter() + .zip(rgba_simd.iter()) + .position(|(a, b)| a != b) + .unwrap(); + let pixel = first_diff / 4; + let channel = ["R", "G", "B", "A"][first_diff % 4]; + panic!( + "simd128 NV42 RGBA diverges from scalar at byte {first_diff} (px {pixel} {channel}, width={width}, matrix={matrix:?}, full_range={full_range}): scalar={} simd={}", + rgba_scalar[first_diff], rgba_simd[first_diff] + ); + } + } + + #[test] + fn simd128_nv24_rgba_matches_scalar_all_matrices_16() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_nv24_rgba_equivalence(16, m, full); + } + } + } + + #[test] + fn simd128_nv24_rgba_matches_scalar_widths() { + for w in [1usize, 3, 15, 17, 32, 33, 1920, 1921] { + check_nv24_rgba_equivalence(w, ColorMatrix::Bt709, false); + } + } + + #[test] + fn simd128_nv42_rgba_matches_scalar_all_matrices_16() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_nv42_rgba_equivalence(16, m, full); + } + } + } + + #[test] + fn simd128_nv42_rgba_matches_scalar_widths() { + for w in [1usize, 3, 15, 17, 32, 33, 1920, 1921] { + check_nv42_rgba_equivalence(w, ColorMatrix::Bt709, false); + } + } + // ---- yuv_444_to_rgb_row equivalence --------------------------------- fn check_yuv_444_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { diff --git a/src/row/arch/x86_avx2.rs b/src/row/arch/x86_avx2.rs index 24ceda0..53d55b6 100644 --- a/src/row/arch/x86_avx2.rs +++ b/src/row/arch/x86_avx2.rs @@ -1799,12 +1799,11 @@ pub(crate) unsafe fn nv12_or_nv21_to_rgb_or_rgba_row_impl< } } -/// AVX2 NV24 → packed RGB (UV-ordered, 4:4:4). Thin wrapper over -/// [`nv24_or_nv42_to_rgb_row_impl`] with `SWAP_UV = false`. +/// AVX2 NV24 → packed RGB (UV-ordered, 4:4:4). /// /// # Safety /// -/// Same as [`nv24_or_nv42_to_rgb_row_impl`]. +/// Same as [`nv24_or_nv42_to_rgb_or_rgba_row_impl`]. #[inline] #[target_feature(enable = "avx2")] pub(crate) unsafe fn nv24_to_rgb_row( @@ -1817,7 +1816,7 @@ pub(crate) unsafe fn nv24_to_rgb_row( ) { // SAFETY: caller obligations forwarded to the shared impl. 
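    // The shared AVX2 kernel consumes 32 Y pixels (64 UV bytes) per
    // iteration, twice the SSE4.1/NEON group size, ahead of the same
    // scalar tail dispatch.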
unsafe { - nv24_or_nv42_to_rgb_row_impl::(y, uv, rgb_out, width, matrix, full_range); + nv24_or_nv42_to_rgb_or_rgba_row_impl::(y, uv, rgb_out, width, matrix, full_range); } } @@ -1825,7 +1824,7 @@ pub(crate) unsafe fn nv24_to_rgb_row( /// /// # Safety /// -/// Same as [`nv24_or_nv42_to_rgb_row_impl`]. +/// Same as [`nv24_or_nv42_to_rgb_or_rgba_row_impl`]. #[inline] #[target_feature(enable = "avx2")] pub(crate) unsafe fn nv42_to_rgb_row( @@ -1838,13 +1837,55 @@ pub(crate) unsafe fn nv42_to_rgb_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - nv24_or_nv42_to_rgb_row_impl::(y, vu, rgb_out, width, matrix, full_range); + nv24_or_nv42_to_rgb_or_rgba_row_impl::(y, vu, rgb_out, width, matrix, full_range); + } +} + +/// AVX2 NV24 → packed RGBA (UV-ordered, 4:4:4, opaque alpha). +/// +/// # Safety +/// +/// Same as [`nv24_or_nv42_to_rgb_or_rgba_row_impl`] but `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "avx2")] +pub(crate) unsafe fn nv24_to_rgba_row( + y: &[u8], + uv: &[u8], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + nv24_or_nv42_to_rgb_or_rgba_row_impl::(y, uv, rgba_out, width, matrix, full_range); + } +} + +/// AVX2 NV42 → packed RGBA (VU-ordered, 4:4:4, opaque alpha). +/// +/// # Safety +/// +/// Same as [`nv24_or_nv42_to_rgb_or_rgba_row_impl`] but `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "avx2")] +pub(crate) unsafe fn nv42_to_rgba_row( + y: &[u8], + vu: &[u8], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + nv24_or_nv42_to_rgb_or_rgba_row_impl::(y, vu, rgba_out, width, matrix, full_range); } } /// Shared AVX2 NV24/NV42 kernel (4:4:4 semi-planar). 32 Y pixels / 32 /// chroma pairs / 64 UV bytes per iteration. Unlike -/// [`nv12_or_nv21_to_rgb_row_impl`], chroma is not subsampled — one +/// [`nv12_or_nv21_to_rgb_or_rgba_row_impl`], chroma is not subsampled — one /// UV pair per Y pixel — so the `chroma_dup` step disappears; two /// `chroma_i16x16` calls per channel produce 32 chroma values /// directly. @@ -1854,20 +1895,24 @@ pub(crate) unsafe fn nv42_to_rgb_row( /// 1. **AVX2 must be available on the current CPU.** /// 2. `y.len() >= width`. /// 3. `uv_or_vu.len() >= 2 * width`. -/// 4. `rgb_out.len() >= 3 * width`. +/// 4. `out.len() >= width * if ALPHA { 4 } else { 3 }`. #[inline] #[target_feature(enable = "avx2")] -unsafe fn nv24_or_nv42_to_rgb_row_impl( +pub(crate) unsafe fn nv24_or_nv42_to_rgb_or_rgba_row_impl< + const SWAP_UV: bool, + const ALPHA: bool, +>( y: &[u8], uv_or_vu: &[u8], - rgb_out: &mut [u8], + out: &mut [u8], width: usize, matrix: ColorMatrix, full_range: bool, ) { + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert!(y.len() >= width); debug_assert!(uv_or_vu.len() >= 2 * width); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params(full_range); @@ -1888,6 +1933,7 @@ unsafe fn nv24_or_nv42_to_rgb_row_impl( let cgv = _mm256_set1_epi32(coeffs.g_v()); let cbu = _mm256_set1_epi32(coeffs.b_u()); let cbv = _mm256_set1_epi32(coeffs.b_v()); + let alpha_u8 = _mm256_set1_epi8(-1); // Same per-lane deinterleave mask as the NV12 kernel: within each // 128-bit lane, pack even bytes into low 8, odd bytes into high 8. 
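The even/odd per-lane split that this mask implements can be shown in isolation. A minimal sketch, assuming x86_64 with AVX2; the function name and mask constants here are illustrative, not the kernel's actual values:

```rust
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::{__m256i, _mm256_setr_epi8, _mm256_shuffle_epi8};

/// Sketch only: place each 128-bit lane's eight U bytes (even source
/// bytes) in its low half and eight V bytes (odd source bytes) in its
/// high half, the same per-lane trick the kernel's mask encodes.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn split_uv_per_lane(uv: __m256i) -> __m256i {
    // `_mm256_shuffle_epi8` indexes within each 128-bit lane, so the
    // same 16-byte pattern is repeated for both lanes.
    let mask = _mm256_setr_epi8(
        0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
        0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
    );
    _mm256_shuffle_epi8(uv, mask)
}
```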
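The `if ALPHA` store selection in the next hunk is resolved per instantiation. A minimal model of the pattern, with hypothetical names (`store_px` is not part of the crate):

```rust
/// Hypothetical stand-in for the kernels' store step: `ALPHA` is a
/// const generic, so each instantiation compiles to straight-line
/// code for exactly one pixel stride.
fn store_px<const ALPHA: bool>(out: &mut [u8], x: usize, r: u8, g: u8, b: u8) {
    let bpp = if ALPHA { 4 } else { 3 };
    out[x * bpp] = r;
    out[x * bpp + 1] = g;
    out[x * bpp + 2] = b;
    if ALPHA {
        // Dead code in the 3 bpp instantiation; unconditional in 4 bpp.
        out[x * bpp + 3] = 0xFF;
    }
}
```

Calling `store_px::<false>` writes 3-byte pixels and `store_px::<true>` writes 4-byte pixels with opaque alpha, mirroring the `vst3q_u8`/`vst4q_u8` (NEON) and `write_rgb_*`/`write_rgba_*` (x86/wasm) pairs in this patch.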
@@ -2003,30 +2049,33 @@ unsafe fn nv24_or_nv42_to_rgb_row_impl( let g_u8 = narrow_u8x32(g_lo, g_hi); let r_u8 = narrow_u8x32(r_lo, r_hi); - write_rgb_32(r_u8, g_u8, b_u8, rgb_out.as_mut_ptr().add(x * 3)); + if ALPHA { + write_rgba_32(r_u8, g_u8, b_u8, alpha_u8, out.as_mut_ptr().add(x * 4)); + } else { + write_rgb_32(r_u8, g_u8, b_u8, out.as_mut_ptr().add(x * 3)); + } x += 32; } if x < width { - if SWAP_UV { - scalar::nv42_to_rgb_row( - &y[x..width], - &uv_or_vu[x * 2..width * 2], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); - } else { - scalar::nv24_to_rgb_row( - &y[x..width], - &uv_or_vu[x * 2..width * 2], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_uv = &uv_or_vu[x * 2..width * 2]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + match (SWAP_UV, ALPHA) { + (false, false) => { + scalar::nv24_to_rgb_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range) + } + (true, false) => { + scalar::nv42_to_rgb_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range) + } + (false, true) => { + scalar::nv24_to_rgba_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range) + } + (true, true) => { + scalar::nv42_to_rgba_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range) + } } } } @@ -4062,6 +4111,122 @@ mod tests { } } + // ---- nv24_to_rgba_row / nv42_to_rgba_row equivalence ---------------- + + fn check_nv24_rgba_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { + let y: std::vec::Vec = (0..width).map(|i| ((i * 37 + 11) & 0xFF) as u8).collect(); + let uv: std::vec::Vec = (0..width) + .flat_map(|i| [((i * 53 + 23) & 0xFF) as u8, ((i * 71 + 91) & 0xFF) as u8]) + .collect(); + let mut rgba_scalar = std::vec![0u8; width * 4]; + let mut rgba_avx2 = std::vec![0u8; width * 4]; + + scalar::nv24_to_rgba_row(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + unsafe { + nv24_to_rgba_row(&y, &uv, &mut rgba_avx2, width, matrix, full_range); + } + + if rgba_scalar != rgba_avx2 { + let first_diff = rgba_scalar + .iter() + .zip(rgba_avx2.iter()) + .position(|(a, b)| a != b) + .unwrap(); + let pixel = first_diff / 4; + let channel = ["R", "G", "B", "A"][first_diff % 4]; + panic!( + "AVX2 NV24 RGBA diverges from scalar at byte {first_diff} (px {pixel} {channel}, width={width}, matrix={matrix:?}, full_range={full_range}): scalar={} avx2={}", + rgba_scalar[first_diff], rgba_avx2[first_diff] + ); + } + } + + fn check_nv42_rgba_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { + let y: std::vec::Vec = (0..width).map(|i| ((i * 37 + 11) & 0xFF) as u8).collect(); + let vu: std::vec::Vec = (0..width) + .flat_map(|i| [((i * 53 + 23) & 0xFF) as u8, ((i * 71 + 91) & 0xFF) as u8]) + .collect(); + let mut rgba_scalar = std::vec![0u8; width * 4]; + let mut rgba_avx2 = std::vec![0u8; width * 4]; + + scalar::nv42_to_rgba_row(&y, &vu, &mut rgba_scalar, width, matrix, full_range); + unsafe { + nv42_to_rgba_row(&y, &vu, &mut rgba_avx2, width, matrix, full_range); + } + + if rgba_scalar != rgba_avx2 { + let first_diff = rgba_scalar + .iter() + .zip(rgba_avx2.iter()) + .position(|(a, b)| a != b) + .unwrap(); + let pixel = first_diff / 4; + let channel = ["R", "G", "B", "A"][first_diff % 4]; + panic!( + "AVX2 NV42 RGBA diverges from scalar at byte {first_diff} (px {pixel} {channel}, width={width}, matrix={matrix:?}, full_range={full_range}): scalar={} avx2={}", + rgba_scalar[first_diff], rgba_avx2[first_diff] + ); + } + } + + #[test] + fn 
avx2_nv24_rgba_matches_scalar_all_matrices_32() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_nv24_rgba_equivalence(32, m, full); + } + } + } + + #[test] + fn avx2_nv24_rgba_matches_scalar_widths() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + for w in [31usize, 32, 33, 63, 64, 65, 1920, 1921] { + check_nv24_rgba_equivalence(w, ColorMatrix::Bt709, false); + } + } + + #[test] + fn avx2_nv42_rgba_matches_scalar_all_matrices_32() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_nv42_rgba_equivalence(32, m, full); + } + } + } + + #[test] + fn avx2_nv42_rgba_matches_scalar_widths() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + for w in [31usize, 32, 33, 63, 64, 65, 1920, 1921] { + check_nv42_rgba_equivalence(w, ColorMatrix::Bt709, false); + } + } + // ---- yuv_444_to_rgb_row equivalence --------------------------------- fn check_yuv_444_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { diff --git a/src/row/arch/x86_avx512.rs b/src/row/arch/x86_avx512.rs index 04de26d..3e56ed4 100644 --- a/src/row/arch/x86_avx512.rs +++ b/src/row/arch/x86_avx512.rs @@ -1851,12 +1851,11 @@ pub(crate) unsafe fn nv12_or_nv21_to_rgb_or_rgba_row_impl< } } -/// AVX-512 NV24 → packed RGB (UV-ordered, 4:4:4). Thin wrapper over -/// [`nv24_or_nv42_to_rgb_row_impl`] with `SWAP_UV = false`. +/// AVX-512 NV24 → packed RGB (UV-ordered, 4:4:4). /// /// # Safety /// -/// Same as [`nv24_or_nv42_to_rgb_row_impl`]. +/// Same as [`nv24_or_nv42_to_rgb_or_rgba_row_impl`]. #[inline] #[target_feature(enable = "avx512f,avx512bw")] pub(crate) unsafe fn nv24_to_rgb_row( @@ -1869,7 +1868,7 @@ pub(crate) unsafe fn nv24_to_rgb_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - nv24_or_nv42_to_rgb_row_impl::(y, uv, rgb_out, width, matrix, full_range); + nv24_or_nv42_to_rgb_or_rgba_row_impl::(y, uv, rgb_out, width, matrix, full_range); } } @@ -1877,7 +1876,7 @@ pub(crate) unsafe fn nv24_to_rgb_row( /// /// # Safety /// -/// Same as [`nv24_or_nv42_to_rgb_row_impl`]. +/// Same as [`nv24_or_nv42_to_rgb_or_rgba_row_impl`]. #[inline] #[target_feature(enable = "avx512f,avx512bw")] pub(crate) unsafe fn nv42_to_rgb_row( @@ -1890,13 +1889,55 @@ pub(crate) unsafe fn nv42_to_rgb_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - nv24_or_nv42_to_rgb_row_impl::(y, vu, rgb_out, width, matrix, full_range); + nv24_or_nv42_to_rgb_or_rgba_row_impl::(y, vu, rgb_out, width, matrix, full_range); + } +} + +/// AVX-512 NV24 → packed RGBA (UV-ordered, 4:4:4, opaque alpha). +/// +/// # Safety +/// +/// Same as [`nv24_or_nv42_to_rgb_or_rgba_row_impl`] but `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "avx512f,avx512bw")] +pub(crate) unsafe fn nv24_to_rgba_row( + y: &[u8], + uv: &[u8], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + nv24_or_nv42_to_rgb_or_rgba_row_impl::(y, uv, rgba_out, width, matrix, full_range); + } +} + +/// AVX-512 NV42 → packed RGBA (VU-ordered, 4:4:4, opaque alpha). 
+/// +/// # Safety +/// +/// Same as [`nv24_or_nv42_to_rgb_or_rgba_row_impl`] but `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "avx512f,avx512bw")] +pub(crate) unsafe fn nv42_to_rgba_row( + y: &[u8], + vu: &[u8], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + nv24_or_nv42_to_rgb_or_rgba_row_impl::(y, vu, rgba_out, width, matrix, full_range); } } /// Shared AVX-512 NV24/NV42 kernel (4:4:4 semi-planar). 64 Y pixels / /// 64 chroma pairs / 128 UV bytes per iteration. Unlike -/// [`nv12_or_nv21_to_rgb_row_impl`], chroma is not subsampled — one +/// [`nv12_or_nv21_to_rgb_or_rgba_row_impl`], chroma is not subsampled — one /// UV pair per Y pixel — so the `chroma_dup` step disappears; two /// `chroma_i16x32` calls per channel produce 64 chroma values /// directly. @@ -1906,20 +1947,24 @@ pub(crate) unsafe fn nv42_to_rgb_row( /// 1. **AVX-512F + AVX-512BW must be available.** /// 2. `y.len() >= width`. /// 3. `uv_or_vu.len() >= 2 * width`. -/// 4. `rgb_out.len() >= 3 * width`. +/// 4. `out.len() >= width * if ALPHA { 4 } else { 3 }`. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -unsafe fn nv24_or_nv42_to_rgb_row_impl( +pub(crate) unsafe fn nv24_or_nv42_to_rgb_or_rgba_row_impl< + const SWAP_UV: bool, + const ALPHA: bool, +>( y: &[u8], uv_or_vu: &[u8], - rgb_out: &mut [u8], + out: &mut [u8], width: usize, matrix: ColorMatrix, full_range: bool, ) { + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert!(y.len() >= width); debug_assert!(uv_or_vu.len() >= 2 * width); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params(full_range); @@ -1940,6 +1985,7 @@ unsafe fn nv24_or_nv42_to_rgb_row_impl( let cgv = _mm512_set1_epi32(coeffs.g_v()); let cbu = _mm512_set1_epi32(coeffs.b_u()); let cbv = _mm512_set1_epi32(coeffs.b_v()); + let alpha_u8 = _mm512_set1_epi8(-1); // Same lane fixups as NV12 kernel — inherited verbatim. 
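    // (`_mm512_packus_epi16` narrows within each 128-bit lane, leaving
    // qwords interleaved as a0,b0,a1,b1,...; permuting qwords by
    // 0,2,4,6,1,3,5,7 restores the linear a0..a3,b0..b3 order.)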
let pack_fixup = _mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7); @@ -2068,30 +2114,33 @@ unsafe fn nv24_or_nv42_to_rgb_row_impl( let g_u8 = narrow_u8x64(g_lo, g_hi, pack_fixup); let r_u8 = narrow_u8x64(r_lo, r_hi, pack_fixup); - write_rgb_64(r_u8, g_u8, b_u8, rgb_out.as_mut_ptr().add(x * 3)); + if ALPHA { + write_rgba_64(r_u8, g_u8, b_u8, alpha_u8, out.as_mut_ptr().add(x * 4)); + } else { + write_rgb_64(r_u8, g_u8, b_u8, out.as_mut_ptr().add(x * 3)); + } x += 64; } if x < width { - if SWAP_UV { - scalar::nv42_to_rgb_row( - &y[x..width], - &uv_or_vu[x * 2..width * 2], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); - } else { - scalar::nv24_to_rgb_row( - &y[x..width], - &uv_or_vu[x * 2..width * 2], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_uv = &uv_or_vu[x * 2..width * 2]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + match (SWAP_UV, ALPHA) { + (false, false) => { + scalar::nv24_to_rgb_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range) + } + (true, false) => { + scalar::nv42_to_rgb_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range) + } + (false, true) => { + scalar::nv24_to_rgba_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range) + } + (true, true) => { + scalar::nv42_to_rgba_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range) + } } } } @@ -4232,6 +4281,122 @@ mod tests { } } + // ---- nv24_to_rgba_row / nv42_to_rgba_row equivalence ---------------- + + fn check_nv24_rgba_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { + let y: std::vec::Vec = (0..width).map(|i| ((i * 37 + 11) & 0xFF) as u8).collect(); + let uv: std::vec::Vec = (0..width) + .flat_map(|i| [((i * 53 + 23) & 0xFF) as u8, ((i * 71 + 91) & 0xFF) as u8]) + .collect(); + let mut rgba_scalar = std::vec![0u8; width * 4]; + let mut rgba_avx512 = std::vec![0u8; width * 4]; + + scalar::nv24_to_rgba_row(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + unsafe { + nv24_to_rgba_row(&y, &uv, &mut rgba_avx512, width, matrix, full_range); + } + + if rgba_scalar != rgba_avx512 { + let first_diff = rgba_scalar + .iter() + .zip(rgba_avx512.iter()) + .position(|(a, b)| a != b) + .unwrap(); + let pixel = first_diff / 4; + let channel = ["R", "G", "B", "A"][first_diff % 4]; + panic!( + "AVX-512 NV24 RGBA diverges from scalar at byte {first_diff} (px {pixel} {channel}, width={width}, matrix={matrix:?}, full_range={full_range}): scalar={} avx512={}", + rgba_scalar[first_diff], rgba_avx512[first_diff] + ); + } + } + + fn check_nv42_rgba_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { + let y: std::vec::Vec = (0..width).map(|i| ((i * 37 + 11) & 0xFF) as u8).collect(); + let vu: std::vec::Vec = (0..width) + .flat_map(|i| [((i * 53 + 23) & 0xFF) as u8, ((i * 71 + 91) & 0xFF) as u8]) + .collect(); + let mut rgba_scalar = std::vec![0u8; width * 4]; + let mut rgba_avx512 = std::vec![0u8; width * 4]; + + scalar::nv42_to_rgba_row(&y, &vu, &mut rgba_scalar, width, matrix, full_range); + unsafe { + nv42_to_rgba_row(&y, &vu, &mut rgba_avx512, width, matrix, full_range); + } + + if rgba_scalar != rgba_avx512 { + let first_diff = rgba_scalar + .iter() + .zip(rgba_avx512.iter()) + .position(|(a, b)| a != b) + .unwrap(); + let pixel = first_diff / 4; + let channel = ["R", "G", "B", "A"][first_diff % 4]; + panic!( + "AVX-512 NV42 RGBA diverges from scalar at byte {first_diff} (px {pixel} {channel}, width={width}, matrix={matrix:?}, full_range={full_range}): scalar={} 
avx512={}", + rgba_scalar[first_diff], rgba_avx512[first_diff] + ); + } + } + + #[test] + fn avx512_nv24_rgba_matches_scalar_all_matrices_64() { + if !std::arch::is_x86_feature_detected!("avx512bw") { + return; + } + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_nv24_rgba_equivalence(64, m, full); + } + } + } + + #[test] + fn avx512_nv24_rgba_matches_scalar_widths() { + if !std::arch::is_x86_feature_detected!("avx512bw") { + return; + } + for w in [63usize, 64, 65, 127, 128, 129, 1920, 1921] { + check_nv24_rgba_equivalence(w, ColorMatrix::Bt709, false); + } + } + + #[test] + fn avx512_nv42_rgba_matches_scalar_all_matrices_64() { + if !std::arch::is_x86_feature_detected!("avx512bw") { + return; + } + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_nv42_rgba_equivalence(64, m, full); + } + } + } + + #[test] + fn avx512_nv42_rgba_matches_scalar_widths() { + if !std::arch::is_x86_feature_detected!("avx512bw") { + return; + } + for w in [63usize, 64, 65, 127, 128, 129, 1920, 1921] { + check_nv42_rgba_equivalence(w, ColorMatrix::Bt709, false); + } + } + // ---- yuv_444_to_rgb_row equivalence --------------------------------- fn check_yuv_444_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { diff --git a/src/row/arch/x86_sse41.rs b/src/row/arch/x86_sse41.rs index 35a9453..e3077d7 100644 --- a/src/row/arch/x86_sse41.rs +++ b/src/row/arch/x86_sse41.rs @@ -1594,12 +1594,11 @@ pub(crate) unsafe fn nv12_or_nv21_to_rgb_or_rgba_row_impl< } } -/// SSE4.1 NV24 → packed RGB (UV-ordered, 4:4:4). Thin wrapper over -/// [`nv24_or_nv42_to_rgb_row_impl`] with `SWAP_UV = false`. +/// SSE4.1 NV24 → packed RGB (UV-ordered, 4:4:4). /// /// # Safety /// -/// Same as [`nv24_or_nv42_to_rgb_row_impl`]. +/// Same as [`nv24_or_nv42_to_rgb_or_rgba_row_impl`]. #[inline] #[target_feature(enable = "sse4.1")] pub(crate) unsafe fn nv24_to_rgb_row( @@ -1612,7 +1611,7 @@ pub(crate) unsafe fn nv24_to_rgb_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - nv24_or_nv42_to_rgb_row_impl::(y, uv, rgb_out, width, matrix, full_range); + nv24_or_nv42_to_rgb_or_rgba_row_impl::(y, uv, rgb_out, width, matrix, full_range); } } @@ -1620,7 +1619,7 @@ pub(crate) unsafe fn nv24_to_rgb_row( /// /// # Safety /// -/// Same as [`nv24_or_nv42_to_rgb_row_impl`]. +/// Same as [`nv24_or_nv42_to_rgb_or_rgba_row_impl`]. #[inline] #[target_feature(enable = "sse4.1")] pub(crate) unsafe fn nv42_to_rgb_row( @@ -1633,12 +1632,54 @@ pub(crate) unsafe fn nv42_to_rgb_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - nv24_or_nv42_to_rgb_row_impl::(y, vu, rgb_out, width, matrix, full_range); + nv24_or_nv42_to_rgb_or_rgba_row_impl::(y, vu, rgb_out, width, matrix, full_range); + } +} + +/// SSE4.1 NV24 → packed RGBA (UV-ordered, 4:4:4, opaque alpha). +/// +/// # Safety +/// +/// Same as [`nv24_or_nv42_to_rgb_or_rgba_row_impl`] but `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "sse4.1")] +pub(crate) unsafe fn nv24_to_rgba_row( + y: &[u8], + uv: &[u8], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. 
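+    // (`_mm_set1_epi8(-1)` in the shared impl supplies the alpha
+    // vector: all-ones bytes, i.e. 0xFF in every lane.)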
+ unsafe { + nv24_or_nv42_to_rgb_or_rgba_row_impl::(y, uv, rgba_out, width, matrix, full_range); + } +} + +/// SSE4.1 NV42 → packed RGBA (VU-ordered, 4:4:4, opaque alpha). +/// +/// # Safety +/// +/// Same as [`nv24_or_nv42_to_rgb_or_rgba_row_impl`] but `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "sse4.1")] +pub(crate) unsafe fn nv42_to_rgba_row( + y: &[u8], + vu: &[u8], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + nv24_or_nv42_to_rgb_or_rgba_row_impl::(y, vu, rgba_out, width, matrix, full_range); } } /// Shared SSE4.1 NV24/NV42 kernel (4:4:4 semi-planar). Unlike -/// [`nv12_or_nv21_to_rgb_row_impl`], chroma is not subsampled — one +/// [`nv12_or_nv21_to_rgb_or_rgba_row_impl`], chroma is not subsampled — one /// UV pair per Y pixel. Per 16 Y pixels, load 32 UV bytes (two /// `_mm_loadu_si128`), deinterleave, compute 16 chroma values per /// channel directly, and skip the `_mm_unpacklo/hi_epi16` chroma @@ -1649,20 +1690,24 @@ pub(crate) unsafe fn nv42_to_rgb_row( /// 1. **SSE4.1 must be available on the current CPU.** /// 2. `y.len() >= width`. /// 3. `uv_or_vu.len() >= 2 * width`. -/// 4. `rgb_out.len() >= 3 * width`. +/// 4. `out.len() >= width * if ALPHA { 4 } else { 3 }`. #[inline] #[target_feature(enable = "sse4.1")] -unsafe fn nv24_or_nv42_to_rgb_row_impl( +pub(crate) unsafe fn nv24_or_nv42_to_rgb_or_rgba_row_impl< + const SWAP_UV: bool, + const ALPHA: bool, +>( y: &[u8], uv_or_vu: &[u8], - rgb_out: &mut [u8], + out: &mut [u8], width: usize, matrix: ColorMatrix, full_range: bool, ) { + let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert!(y.len() >= width); debug_assert!(uv_or_vu.len() >= 2 * width); - debug_assert!(rgb_out.len() >= width * 3); + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params(full_range); @@ -1683,6 +1728,7 @@ unsafe fn nv24_or_nv42_to_rgb_row_impl( let cgv = _mm_set1_epi32(coeffs.g_v()); let cbu = _mm_set1_epi32(coeffs.b_u()); let cbv = _mm_set1_epi32(coeffs.b_v()); + let alpha_u8 = _mm_set1_epi8(-1); // Shuffle masks to deinterleave 16 UV bytes into 8 U + 8 V (low // lanes). 
The upper 8 lanes are zeroed by `_mm_shuffle_epi8` @@ -1766,30 +1812,33 @@ unsafe fn nv24_or_nv42_to_rgb_row_impl( let g_u8 = _mm_packus_epi16(g_lo, g_hi); let r_u8 = _mm_packus_epi16(r_lo, r_hi); - write_rgb_16(r_u8, g_u8, b_u8, rgb_out.as_mut_ptr().add(x * 3)); + if ALPHA { + write_rgba_16(r_u8, g_u8, b_u8, alpha_u8, out.as_mut_ptr().add(x * 4)); + } else { + write_rgb_16(r_u8, g_u8, b_u8, out.as_mut_ptr().add(x * 3)); + } x += 16; } if x < width { - if SWAP_UV { - scalar::nv42_to_rgb_row( - &y[x..width], - &uv_or_vu[x * 2..width * 2], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); - } else { - scalar::nv24_to_rgb_row( - &y[x..width], - &uv_or_vu[x * 2..width * 2], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_uv = &uv_or_vu[x * 2..width * 2]; + let tail_out = &mut out[x * bpp..width * bpp]; + let tail_w = width - x; + match (SWAP_UV, ALPHA) { + (false, false) => { + scalar::nv24_to_rgb_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range) + } + (true, false) => { + scalar::nv42_to_rgb_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range) + } + (false, true) => { + scalar::nv24_to_rgba_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range) + } + (true, true) => { + scalar::nv42_to_rgba_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range) + } } } } @@ -3561,6 +3610,122 @@ mod tests { } } + // ---- nv24_to_rgba_row / nv42_to_rgba_row equivalence ---------------- + + fn check_nv24_rgba_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { + let y: std::vec::Vec = (0..width).map(|i| ((i * 37 + 11) & 0xFF) as u8).collect(); + let uv: std::vec::Vec = (0..width) + .flat_map(|i| [((i * 53 + 23) & 0xFF) as u8, ((i * 71 + 91) & 0xFF) as u8]) + .collect(); + let mut rgba_scalar = std::vec![0u8; width * 4]; + let mut rgba_sse41 = std::vec![0u8; width * 4]; + + scalar::nv24_to_rgba_row(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + unsafe { + nv24_to_rgba_row(&y, &uv, &mut rgba_sse41, width, matrix, full_range); + } + + if rgba_scalar != rgba_sse41 { + let first_diff = rgba_scalar + .iter() + .zip(rgba_sse41.iter()) + .position(|(a, b)| a != b) + .unwrap(); + let pixel = first_diff / 4; + let channel = ["R", "G", "B", "A"][first_diff % 4]; + panic!( + "SSE4.1 NV24 RGBA diverges from scalar at byte {first_diff} (px {pixel} {channel}, width={width}, matrix={matrix:?}, full_range={full_range}): scalar={} sse41={}", + rgba_scalar[first_diff], rgba_sse41[first_diff] + ); + } + } + + fn check_nv42_rgba_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { + let y: std::vec::Vec = (0..width).map(|i| ((i * 37 + 11) & 0xFF) as u8).collect(); + let vu: std::vec::Vec = (0..width) + .flat_map(|i| [((i * 53 + 23) & 0xFF) as u8, ((i * 71 + 91) & 0xFF) as u8]) + .collect(); + let mut rgba_scalar = std::vec![0u8; width * 4]; + let mut rgba_sse41 = std::vec![0u8; width * 4]; + + scalar::nv42_to_rgba_row(&y, &vu, &mut rgba_scalar, width, matrix, full_range); + unsafe { + nv42_to_rgba_row(&y, &vu, &mut rgba_sse41, width, matrix, full_range); + } + + if rgba_scalar != rgba_sse41 { + let first_diff = rgba_scalar + .iter() + .zip(rgba_sse41.iter()) + .position(|(a, b)| a != b) + .unwrap(); + let pixel = first_diff / 4; + let channel = ["R", "G", "B", "A"][first_diff % 4]; + panic!( + "SSE4.1 NV42 RGBA diverges from scalar at byte {first_diff} (px {pixel} {channel}, width={width}, matrix={matrix:?}, full_range={full_range}): scalar={} sse41={}", + 
rgba_scalar[first_diff], rgba_sse41[first_diff] + ); + } + } + + #[test] + fn sse41_nv24_rgba_matches_scalar_all_matrices_16() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_nv24_rgba_equivalence(16, m, full); + } + } + } + + #[test] + fn sse41_nv24_rgba_matches_scalar_widths() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + for w in [1usize, 3, 15, 17, 32, 33, 1920, 1921] { + check_nv24_rgba_equivalence(w, ColorMatrix::Bt709, false); + } + } + + #[test] + fn sse41_nv42_rgba_matches_scalar_all_matrices_16() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_nv42_rgba_equivalence(16, m, full); + } + } + } + + #[test] + fn sse41_nv42_rgba_matches_scalar_widths() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + for w in [1usize, 3, 15, 17, 32, 33, 1920, 1921] { + check_nv42_rgba_equivalence(w, ColorMatrix::Bt709, false); + } + } + // ---- yuv_444_to_rgb_row equivalence --------------------------------- fn check_yuv_444_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { diff --git a/src/row/mod.rs b/src/row/mod.rs index 6499cec..c64403c 100644 --- a/src/row/mod.rs +++ b/src/row/mod.rs @@ -34,6 +34,8 @@ pub(crate) mod arch; pub(crate) mod scalar; +pub(crate) use scalar::expand_rgb_to_rgba_row; + use crate::ColorMatrix; /// Converts one row of 4:2:0 YUV to packed RGB. @@ -713,6 +715,154 @@ pub fn nv42_to_rgb_row( scalar::nv42_to_rgb_row(y, vu, rgb_out, width, matrix, full_range); } +/// Converts one row of NV24 (semi‑planar 4:4:4, UV-ordered) to packed +/// **RGBA** (8-bit). Same numerical contract as [`nv24_to_rgb_row`]; +/// alpha defaults to `0xFF` (opaque). +/// +/// `rgba_out.len() >= 4 * width`. `use_simd = false` forces scalar. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn nv24_to_rgba_row( + y: &[u8], + uv: &[u8], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + let rgba_min = rgba_row_bytes(width); + let uv_min = match width.checked_mul(2) { + Some(n) => n, + None => panic!("width ({width}) × 2 overflows usize"), + }; + assert!(y.len() >= width, "y row too short"); + assert!(uv.len() >= uv_min, "uv row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: `neon_available()` verified NEON is present. + unsafe { + arch::neon::nv24_to_rgba_row(y, uv, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::nv24_to_rgba_row(y, uv, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::nv24_to_rgba_row(y, uv, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. 
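+                // Tiers are tried widest-first (AVX-512, AVX2, then
+                // SSE4.1); if none is available, or `use_simd` is
+                // false, control falls through to the scalar row.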
+ unsafe { + arch::x86_sse41::nv24_to_rgba_row(y, uv, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 verified at compile time. + unsafe { + arch::wasm_simd128::nv24_to_rgba_row(y, uv, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::nv24_to_rgba_row(y, uv, rgba_out, width, matrix, full_range); +} + +/// Converts one row of NV42 (semi‑planar 4:4:4, VU-ordered) to packed +/// **RGBA** (8-bit). Same as [`nv24_to_rgba_row`] but with swapped +/// chroma byte order. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn nv42_to_rgba_row( + y: &[u8], + vu: &[u8], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + let rgba_min = rgba_row_bytes(width); + let vu_min = match width.checked_mul(2) { + Some(n) => n, + None => panic!("width ({width}) × 2 overflows usize"), + }; + assert!(y.len() >= width, "y row too short"); + assert!(vu.len() >= vu_min, "vu row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::nv42_to_rgba_row(y, vu, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::nv42_to_rgba_row(y, vu, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::nv42_to_rgba_row(y, vu, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::nv42_to_rgba_row(y, vu, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 verified at compile time. + unsafe { + arch::wasm_simd128::nv42_to_rgba_row(y, vu, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::nv42_to_rgba_row(y, vu, rgba_out, width, matrix, full_range); +} + /// Converts one row of YUV 4:4:4 planar to packed RGB. Dispatches /// to the best available SIMD backend for the current target. /// diff --git a/src/row/scalar.rs b/src/row/scalar.rs index 1a76503..efd7f59 100644 --- a/src/row/scalar.rs +++ b/src/row/scalar.rs @@ -282,7 +282,8 @@ pub(crate) fn nv12_or_nv21_to_rgb_or_rgba_row_impl(y, uv, rgb_out, width, matrix, full_range); + nv24_or_nv42_to_rgb_or_rgba_row_impl::(y, uv, rgb_out, width, matrix, full_range); } /// NV42 (semi-planar 4:4:4, VU-ordered) → packed RGB. Thin wrapper -/// over [`nv24_or_nv42_to_rgb_row_impl`] with `SWAP_UV = true`. +/// over [`nv24_or_nv42_to_rgb_or_rgba_row_impl`] with +/// `SWAP_UV = true, ALPHA = false`. #[cfg_attr(not(tarpaulin), inline(always))] pub(crate) fn nv42_to_rgb_row( y: &[u8], @@ -306,31 +308,63 @@ pub(crate) fn nv42_to_rgb_row( matrix: ColorMatrix, full_range: bool, ) { - nv24_or_nv42_to_rgb_row_impl::(y, vu, rgb_out, width, matrix, full_range); + nv24_or_nv42_to_rgb_or_rgba_row_impl::(y, vu, rgb_out, width, matrix, full_range); } -/// Shared scalar kernel for NV24 (SWAP_UV=false) and NV42 -/// (SWAP_UV=true). Identical math and numerical contract to -/// [`yuv_420_to_rgb_row`]; the difference from NV12/NV21 is -/// 4:4:4 — one UV pair per Y pixel, no chroma upsampling. 
-/// No width parity constraint. +/// NV24 → packed `R, G, B, A` quadruplets with constant `A = 0xFF`. +/// First three bytes per pixel are byte-identical to +/// [`nv24_to_rgb_row`]. `rgba_out.len() >= 4 * width`. +#[cfg_attr(not(tarpaulin), inline(always))] +pub(crate) fn nv24_to_rgba_row( + y: &[u8], + uv: &[u8], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + nv24_or_nv42_to_rgb_or_rgba_row_impl::(y, uv, rgba_out, width, matrix, full_range); +} + +/// NV42 → packed `R, G, B, A` quadruplets with constant `A = 0xFF`. +/// First three bytes per pixel are byte-identical to +/// [`nv42_to_rgb_row`]. `rgba_out.len() >= 4 * width`. +#[cfg_attr(not(tarpaulin), inline(always))] +pub(crate) fn nv42_to_rgba_row( + y: &[u8], + vu: &[u8], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + nv24_or_nv42_to_rgb_or_rgba_row_impl::(y, vu, rgba_out, width, matrix, full_range); +} + +/// Shared scalar kernel for NV24 (`SWAP_UV = false`) / NV42 +/// (`SWAP_UV = true`) at 3 bpp (`ALPHA = false`) or 4 bpp + opaque +/// alpha (`ALPHA = true`). Identical math to [`yuv_444_to_rgb_row`] +/// (4:4:4 — one UV pair per Y pixel, no chroma upsampling); only +/// the per-pixel store stride differs. Both `const` generics drive +/// compile-time monomorphization. /// /// # Panics (debug builds) /// /// - `y.len() >= width`, `uv_or_vu.len() >= 2 * width`, -/// `rgb_out.len() >= 3 * width`. +/// `out.len() >= width * (if ALPHA { 4 } else { 3 })`. #[cfg_attr(not(tarpaulin), inline(always))] -fn nv24_or_nv42_to_rgb_row_impl( +pub(crate) fn nv24_or_nv42_to_rgb_or_rgba_row_impl( y: &[u8], uv_or_vu: &[u8], - rgb_out: &mut [u8], + out: &mut [u8], width: usize, matrix: ColorMatrix, full_range: bool, ) { debug_assert!(y.len() >= width, "y row too short"); debug_assert!(uv_or_vu.len() >= 2 * width, "chroma row too short"); - debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); + let bpp: usize = if ALPHA { 4 } else { 3 }; + debug_assert!(out.len() >= width * bpp, "out row too short for {bpp}bpp"); let coeffs = Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = range_params(full_range); @@ -351,9 +385,12 @@ fn nv24_or_nv42_to_rgb_row_impl( let b_chroma = (coeffs.b_u() * u_d + coeffs.b_v() * v_d + RND) >> 15; let y0 = ((y[x] as i32 - y_off) * y_scale + RND) >> 15; - rgb_out[x * 3] = clamp_u8(y0 + r_chroma); - rgb_out[x * 3 + 1] = clamp_u8(y0 + g_chroma); - rgb_out[x * 3 + 2] = clamp_u8(y0 + b_chroma); + out[x * bpp] = clamp_u8(y0 + r_chroma); + out[x * bpp + 1] = clamp_u8(y0 + g_chroma); + out[x * bpp + 2] = clamp_u8(y0 + b_chroma); + if ALPHA { + out[x * bpp + 3] = 0xFF; + } } } @@ -451,6 +488,39 @@ fn clamp_u8(v: i32) -> u8 { v.clamp(0, 255) as u8 } +// ---- RGB → RGBA expand (Strategy A combined-buffer optimization) ------ + +/// Reads packed `R, G, B` triples and writes packed `R, G, B, A` +/// quadruplets with `A = 0xFF` (opaque). Used by `MixedSinker` impls +/// when callers attach **both** `with_rgb` and `with_rgba`: instead +/// of running the YUV→RGB math twice (once per output format), we +/// run the RGB kernel into the user's RGB buffer and then expand +/// here to derive the RGBA buffer with a single per-byte pass. +/// +/// The 3W read is L1-hot from the just-completed RGB write, so the +/// effective memory traffic is roughly 3W RGB write + 4W RGBA write +/// = 7W per row — same as the existing native-RGBA path, but with +/// only one pass through the YUV→RGB math instead of two. 
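+/// (Concretely, for a 1920-pixel row that is 5760 bytes of RGB store
+/// plus 7680 bytes of RGBA store, about 13.1 KiB per row either way;
+/// Strategy A only removes the second run of the per-pixel math.)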
See +/// `docs/color-conversion-functions.md` § Ship 8 for the full +/// design discussion (Strategy A vs the alternative B "combined +/// kernel writes both per pixel" deferred to a future PR). +/// +/// # Panics (debug builds) +/// +/// - `rgb.len() >= 3 * width` +/// - `rgba_out.len() >= 4 * width` +#[cfg_attr(not(tarpaulin), inline(always))] +pub(crate) fn expand_rgb_to_rgba_row(rgb: &[u8], rgba_out: &mut [u8], width: usize) { + debug_assert!(rgb.len() >= width * 3, "rgb row too short"); + debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); + for x in 0..width { + rgba_out[x * 4] = rgb[x * 3]; + rgba_out[x * 4 + 1] = rgb[x * 3 + 1]; + rgba_out[x * 4 + 2] = rgb[x * 3 + 2]; + rgba_out[x * 4 + 3] = 0xFF; + } +} + // ---- High-bit-depth YUV 4:2:0 → RGB (BITS ∈ {10, 12, 14}) ------------- /// Converts one row of high-bit-depth 4:2:0 YUV (`u16` samples in the @@ -2005,6 +2075,48 @@ fn clamp_u16_round(v: f32, max: f32) -> u16 { mod tests { use super::*; + // ---- expand_rgb_to_rgba_row ----------------------------------------- + + #[test] + fn expand_rgb_to_rgba_row_pads_alpha_and_preserves_rgb() { + // Each source pixel's R/G/B must land in the matching slot, with + // alpha forced to 0xFF — Strategy A's correctness depends on this. + let rgb: std::vec::Vec = (0..16 * 3).map(|i| i as u8).collect(); + let mut rgba = std::vec![0u8; 16 * 4]; + expand_rgb_to_rgba_row(&rgb, &mut rgba, 16); + for x in 0..16 { + assert_eq!(rgba[x * 4], rgb[x * 3], "R at px {x}"); + assert_eq!(rgba[x * 4 + 1], rgb[x * 3 + 1], "G at px {x}"); + assert_eq!(rgba[x * 4 + 2], rgb[x * 3 + 2], "B at px {x}"); + assert_eq!(rgba[x * 4 + 3], 0xFF, "A at px {x}"); + } + } + + #[test] + fn expand_rgb_to_rgba_row_only_writes_first_width_pixels() { + // Caller may pass over-sized RGBA buffers; we must not stomp on + // the trailing region. Pre-fill 0xAA, expand into the head, and + // verify the tail still reads 0xAA. + let rgb: std::vec::Vec = (0..8 * 3).map(|i| (i + 1) as u8).collect(); + let mut rgba = std::vec![0xAAu8; 16 * 4]; + expand_rgb_to_rgba_row(&rgb, &mut rgba, 8); + for x in 0..8 { + assert_eq!(rgba[x * 4], rgb[x * 3]); + assert_eq!(rgba[x * 4 + 3], 0xFF); + } + for &b in &rgba[8 * 4..] 
+            assert_eq!(b, 0xAA, "tail must be untouched");
+        }
+    }
+
+    #[test]
+    fn expand_rgb_to_rgba_row_zero_width_is_noop() {
+        let rgb: std::vec::Vec<u8> = std::vec::Vec::new();
+        let mut rgba = std::vec![0u8; 0];
+        expand_rgb_to_rgba_row(&rgb, &mut rgba, 0);
+        assert!(rgba.is_empty());
+    }
+
     // ---- yuv_420_to_rgb_row ----------------------------------------------
 
     #[test]
diff --git a/src/sinker/mixed.rs b/src/sinker/mixed.rs
index 94ca4e4..524c54f 100644
--- a/src/sinker/mixed.rs
+++ b/src/sinker/mixed.rs
@@ -62,17 +62,18 @@ use crate::{
     HsvBuffers, PixelSink, SourceFormat,
     raw::{Bayer, Bayer16, BayerRow, BayerRow16, BayerSink, BayerSink16},
     row::{
-        bayer_to_rgb_row, bayer16_to_rgb_row, bayer16_to_rgb_u16_row, nv12_to_rgb_row,
-        nv12_to_rgba_row, nv21_to_rgb_row, nv21_to_rgba_row, nv24_to_rgb_row, nv42_to_rgb_row,
-        p010_to_rgb_row, p010_to_rgb_u16_row, p012_to_rgb_row, p012_to_rgb_u16_row, p016_to_rgb_row,
-        p016_to_rgb_u16_row, p410_to_rgb_row, p410_to_rgb_u16_row, p412_to_rgb_row,
-        p412_to_rgb_u16_row, p416_to_rgb_row, p416_to_rgb_u16_row, rgb_to_hsv_row, yuv_420_to_rgb_row,
-        yuv_420_to_rgba_row, yuv_444_to_rgb_row, yuv_444_to_rgba_row, yuv420p9_to_rgb_row,
-        yuv420p9_to_rgb_u16_row, yuv420p10_to_rgb_row, yuv420p10_to_rgb_u16_row, yuv420p12_to_rgb_row,
-        yuv420p12_to_rgb_u16_row, yuv420p14_to_rgb_row, yuv420p14_to_rgb_u16_row, yuv420p16_to_rgb_row,
-        yuv420p16_to_rgb_u16_row, yuv444p9_to_rgb_row, yuv444p9_to_rgb_u16_row, yuv444p10_to_rgb_row,
-        yuv444p10_to_rgb_u16_row, yuv444p12_to_rgb_row, yuv444p12_to_rgb_u16_row, yuv444p14_to_rgb_row,
-        yuv444p14_to_rgb_u16_row, yuv444p16_to_rgb_row, yuv444p16_to_rgb_u16_row,
+        bayer_to_rgb_row, bayer16_to_rgb_row, bayer16_to_rgb_u16_row, expand_rgb_to_rgba_row,
+        nv12_to_rgb_row, nv12_to_rgba_row, nv21_to_rgb_row, nv21_to_rgba_row, nv24_to_rgb_row,
+        nv24_to_rgba_row, nv42_to_rgb_row, nv42_to_rgba_row, p010_to_rgb_row, p010_to_rgb_u16_row,
+        p012_to_rgb_row, p012_to_rgb_u16_row, p016_to_rgb_row, p016_to_rgb_u16_row, p410_to_rgb_row,
+        p410_to_rgb_u16_row, p412_to_rgb_row, p412_to_rgb_u16_row, p416_to_rgb_row,
+        p416_to_rgb_u16_row, rgb_to_hsv_row, yuv_420_to_rgb_row, yuv_420_to_rgba_row,
+        yuv_444_to_rgb_row, yuv_444_to_rgba_row, yuv420p9_to_rgb_row, yuv420p9_to_rgb_u16_row,
+        yuv420p10_to_rgb_row, yuv420p10_to_rgb_u16_row, yuv420p12_to_rgb_row, yuv420p12_to_rgb_u16_row,
+        yuv420p14_to_rgb_row, yuv420p14_to_rgb_u16_row, yuv420p16_to_rgb_row, yuv420p16_to_rgb_u16_row,
+        yuv444p9_to_rgb_row, yuv444p9_to_rgb_u16_row, yuv444p10_to_rgb_row, yuv444p10_to_rgb_u16_row,
+        yuv444p12_to_rgb_row, yuv444p12_to_rgb_u16_row, yuv444p14_to_rgb_row, yuv444p14_to_rgb_u16_row,
+        yuv444p16_to_rgb_row, yuv444p16_to_rgb_u16_row,
     },
     yuv::{
         Nv12, Nv12Row, Nv12Sink, Nv16, Nv16Row, Nv16Sink, Nv21, Nv21Row, Nv21Sink, Nv24, Nv24Row,
@@ -1110,12 +1111,12 @@ impl<'a> MixedSinker<'a, Yuv420p> {
     ///
     /// ```compile_fail
     /// // Attaching RGBA to a sink that doesn't write it is rejected
-    /// // at compile time. Nv24 (4:4:4 semi‑planar) has not yet been
+    /// // at compile time. Yuv440p (4:4:0 planar) has not yet been
     /// // wired for RGBA; once that lands the negative example here
     /// // moves to the next not‑yet‑wired format.
-    /// use colconv::{sinker::MixedSinker, yuv::Nv24};
+    /// use colconv::{sinker::MixedSinker, yuv::Yuv440p};
     /// let mut buf = vec![0u8; 16 * 8 * 4];
-    /// let _ = MixedSinker::<Nv24>::new(16, 8).with_rgba(&mut buf);
+    /// let _ = MixedSinker::<Yuv440p>::new(16, 8).with_rgba(&mut buf);
     /// ```
     #[cfg_attr(not(tarpaulin), inline(always))]
     pub fn with_rgba(mut self, buf: &'a mut [u8]) -> Result<Self, MixedSinkerError> {
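Caller-side, the combined-buffer path is exercised exactly as in the tests further down; a condensed usage sketch follows (the only assumptions beyond the API shown in this patch are the buffer sizes and the hypothetical wrapper name `attach_both`):

use colconv::{sinker::MixedSinker, yuv::Yuv420p};

fn attach_both(w: usize, h: usize, rgb: &mut [u8], rgba: &mut [u8]) {
    // rgb must hold w*h*3 bytes and rgba w*h*4; shorter buffers error.
    // With both attached, each row runs the YUV→RGB kernel once into
    // `rgb`, then derives `rgba` via expand_rgb_to_rgba_row.
    let _sink = MixedSinker::<Yuv420p>::new(w, h)
        .with_rgb(rgb)
        .unwrap()
        .with_rgba(rgba)
        .unwrap();
    // ...then feed frames through the usual driver (e.g. yuv420p_to).
}
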
@@ -1229,12 +1230,20 @@ impl PixelSink for MixedSinker<'_, Yuv420p> {
             luma[one_plane_start..one_plane_end].copy_from_slice(&row.y()[..w]);
         }
 
-        // Native RGBA: independent kernel run, separate from RGB. Avoids
-        // the compose-and-expand cost — the const-generic
-        // `yuv_420_to_rgba_row` writes 4 bytes per pixel directly.
-        // Default alpha = 0xFF (opaque); future YUVA source impls will
-        // copy alpha through from the source plane.
-        if let Some(buf) = rgba.as_deref_mut() {
+        // Output mode resolution (Strategy A):
+        // - RGBA-only: run dedicated `yuv_420_to_rgba_row` (4 bpp store).
+        // - RGB / HSV (with or without RGBA): run RGB kernel once, then if
+        //   RGBA is also requested, fan it out via `expand_rgb_to_rgba_row`
+        //   (memory-bound copy + 0xFF alpha pad). Saves the second YUV→RGB
+        //   per-pixel math when both buffers are attached.
+        // - None of the above: nothing to do beyond luma above.
+        let want_rgb = rgb.is_some();
+        let want_rgba = rgba.is_some();
+        let want_hsv = hsv.is_some();
+        let need_rgb_kernel = want_rgb || want_hsv;
+
+        if want_rgba && !need_rgb_kernel {
+            let rgba_buf = rgba.as_deref_mut().unwrap();
             let rgba_plane_end =
                 one_plane_end
                     .checked_mul(4)
                     .ok_or(MixedSinkerError::GeometryOverflow {
                         width: w,
                         height: h,
                         channels: 4,
                     })?;
-            let rgba_plane_start = one_plane_start * 4; // ≤ rgba_plane_end.
+            let rgba_plane_start = one_plane_start * 4;
             yuv_420_to_rgba_row(
                 row.y(),
                 row.u_half(),
                 row.v_half(),
-                &mut buf[rgba_plane_start..rgba_plane_end],
+                &mut rgba_buf[rgba_plane_start..rgba_plane_end],
                 w,
                 row.matrix(),
                 row.full_range(),
                 use_simd,
             );
+            return Ok(());
         }
 
-        let want_rgb = rgb.is_some();
-        let want_hsv = hsv.is_some();
-        if !want_rgb && !want_hsv {
+        if !need_rgb_kernel {
             return Ok(());
         }
@@ -1324,6 +1332,20 @@ impl PixelSink for MixedSinker<'_, Yuv420p> {
                 use_simd,
             );
         }
+
+        if let Some(buf) = rgba.as_deref_mut() {
+            let rgba_plane_end =
+                one_plane_end
+                    .checked_mul(4)
+                    .ok_or(MixedSinkerError::GeometryOverflow {
+                        width: w,
+                        height: h,
+                        channels: 4,
+                    })?;
+            let rgba_plane_start = one_plane_start * 4;
+            expand_rgb_to_rgba_row(rgb_row, &mut buf[rgba_plane_start..rgba_plane_end], w);
+        }
+
         Ok(())
     }
 }
@@ -1437,11 +1459,16 @@ impl PixelSink for MixedSinker<'_, Yuv422p> {
             luma[one_plane_start..one_plane_end].copy_from_slice(&row.y()[..w]);
         }
 
-        // Native RGBA: independent kernel run, separate from RGB. Default
-        // alpha = 0xFF since Yuv422p has no alpha plane. Reuses the
-        // Yuv420p RGBA dispatcher — 4:2:2's per-row contract is
-        // identical (half-width chroma, one pair per Y pair).
+        // Strategy A output mode resolution — see Yuv420p impl above.
+        // Reuses Yuv420p dispatchers (RGB and RGBA) since 4:2:2's per-row
+        // contract is identical (half-width chroma, one pair per Y pair).
+        let want_rgb = rgb.is_some();
+        let want_rgba = rgba.is_some();
+        let want_hsv = hsv.is_some();
+        let need_rgb_kernel = want_rgb || want_hsv;
+
+        if want_rgba && !need_rgb_kernel {
+            let rgba_buf = rgba.as_deref_mut().unwrap();
             let rgba_plane_end =
                 one_plane_end
                     .checked_mul(4)
@@ -1455,17 +1482,16 @@ impl PixelSink for MixedSinker<'_, Yuv422p> {
             yuv_420_to_rgba_row(
                 row.y(),
                 row.u_half(),
                 row.v_half(),
-                &mut buf[rgba_plane_start..rgba_plane_end],
+                &mut rgba_buf[rgba_plane_start..rgba_plane_end],
                 w,
                 row.matrix(),
                 row.full_range(),
                 use_simd,
             );
+            return Ok(());
         }
 
-        let want_rgb = rgb.is_some();
-        let want_hsv = hsv.is_some();
-        if !want_rgb && !want_hsv {
+        if !need_rgb_kernel {
             return Ok(());
         }
@@ -1518,6 +1544,20 @@ impl PixelSink for MixedSinker<'_, Yuv422p> {
                 use_simd,
             );
         }
+
+        if let Some(buf) = rgba.as_deref_mut() {
+            let rgba_plane_end =
+                one_plane_end
+                    .checked_mul(4)
+                    .ok_or(MixedSinkerError::GeometryOverflow {
+                        width: w,
+                        height: h,
+                        channels: 4,
+                    })?;
+            let rgba_plane_start = one_plane_start * 4;
+            expand_rgb_to_rgba_row(rgb_row, &mut buf[rgba_plane_start..rgba_plane_end], w);
+        }
+
         Ok(())
     }
 }
@@ -1623,9 +1663,14 @@ impl PixelSink for MixedSinker<'_, Yuv444p> {
             luma[one_plane_start..one_plane_end].copy_from_slice(&row.y()[..w]);
         }
 
-        // Native RGBA: independent kernel run, separate from RGB. Default
-        // alpha = 0xFF since Yuv444p has no alpha plane.
-        if let Some(buf) = rgba.as_deref_mut() {
+        // Strategy A output mode resolution — see Yuv420p impl above.
+        let want_rgb = rgb.is_some();
+        let want_rgba = rgba.is_some();
+        let want_hsv = hsv.is_some();
+        let need_rgb_kernel = want_rgb || want_hsv;
+
+        if want_rgba && !need_rgb_kernel {
+            let rgba_buf = rgba.as_deref_mut().unwrap();
             let rgba_plane_end =
                 one_plane_end
                     .checked_mul(4)
@@ -1639,17 +1684,16 @@ impl PixelSink for MixedSinker<'_, Yuv444p> {
                 row.y(),
                 row.u(),
                 row.v(),
-                &mut buf[rgba_plane_start..rgba_plane_end],
+                &mut rgba_buf[rgba_plane_start..rgba_plane_end],
                 w,
                 row.matrix(),
                 row.full_range(),
                 use_simd,
             );
+            return Ok(());
         }
 
-        let want_rgb = rgb.is_some();
-        let want_hsv = hsv.is_some();
-        if !want_rgb && !want_hsv {
+        if !need_rgb_kernel {
             return Ok(());
         }
@@ -1700,6 +1744,20 @@ impl PixelSink for MixedSinker<'_, Yuv444p> {
                 use_simd,
             );
         }
+
+        if let Some(buf) = rgba.as_deref_mut() {
+            let rgba_plane_end =
+                one_plane_end
+                    .checked_mul(4)
+                    .ok_or(MixedSinkerError::GeometryOverflow {
+                        width: w,
+                        height: h,
+                        channels: 4,
+                    })?;
+            let rgba_plane_start = one_plane_start * 4;
+            expand_rgb_to_rgba_row(rgb_row, &mut buf[rgba_plane_start..rgba_plane_end], w);
+        }
+
         Ok(())
     }
 }
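A note for reviewers on the repeated `checked_mul(4)` guard: the RGBA plane length is computed in `usize`, which on 32‑bit targets can overflow for large frames. A minimal sketch of the guard's shape (plain Rust; the dimensions are hypothetical):

// Mirrors the GeometryOverflow guard: byte length of an RGBA plane,
// refusing to wrap instead of silently truncating on 32-bit targets.
fn rgba_bytes(w: usize, h: usize) -> Option<usize> {
    w.checked_mul(h)?.checked_mul(4)
}

fn main() {
    assert_eq!(rgba_bytes(16, 8), Some(512));
    // On a 32-bit usize, 40_000 × 30_000 × 4 would exceed u32::MAX:
    assert!(40_000u64 * 30_000 * 4 > u32::MAX as u64);
}
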
@@ -1817,9 +1875,14 @@ impl PixelSink for MixedSinker<'_, Nv12> {
             luma[one_plane_start..one_plane_end].copy_from_slice(&row.y()[..w]);
         }
 
-        // Native RGBA: independent kernel run, separate from RGB. Default
-        // alpha = 0xFF since NV12 has no alpha plane.
-        if let Some(buf) = rgba.as_deref_mut() {
+        // Strategy A output mode resolution — see Yuv420p impl above.
+        let want_rgb = rgb.is_some();
+        let want_rgba = rgba.is_some();
+        let want_hsv = hsv.is_some();
+        let need_rgb_kernel = want_rgb || want_hsv;
+
+        if want_rgba && !need_rgb_kernel {
+            let rgba_buf = rgba.as_deref_mut().unwrap();
             let rgba_plane_end =
                 one_plane_end
                     .checked_mul(4)
@@ -1832,17 +1895,16 @@ impl PixelSink for MixedSinker<'_, Nv12> {
             nv12_to_rgba_row(
                 row.y(),
                 row.uv_half(),
-                &mut buf[rgba_plane_start..rgba_plane_end],
+                &mut rgba_buf[rgba_plane_start..rgba_plane_end],
                 w,
                 row.matrix(),
                 row.full_range(),
                 use_simd,
             );
+            return Ok(());
         }
 
-        let want_rgb = rgb.is_some();
-        let want_hsv = hsv.is_some();
-        if !want_rgb && !want_hsv {
+        if !need_rgb_kernel {
             return Ok(());
         }
@@ -1894,6 +1956,20 @@ impl PixelSink for MixedSinker<'_, Nv12> {
                 use_simd,
             );
         }
+
+        if let Some(buf) = rgba.as_deref_mut() {
+            let rgba_plane_end =
+                one_plane_end
+                    .checked_mul(4)
+                    .ok_or(MixedSinkerError::GeometryOverflow {
+                        width: w,
+                        height: h,
+                        channels: 4,
+                    })?;
+            let rgba_plane_start = one_plane_start * 4;
+            expand_rgb_to_rgba_row(rgb_row, &mut buf[rgba_plane_start..rgba_plane_end], w);
+        }
+
         Ok(())
     }
 }
@@ -2002,10 +2078,16 @@ impl PixelSink for MixedSinker<'_, Nv16> {
             luma[one_plane_start..one_plane_end].copy_from_slice(&row.y()[..w]);
         }
 
-        // Native RGBA: independent kernel run, separate from RGB. Default
-        // alpha = 0xFF since NV16 has no alpha plane. Reuses the NV12
-        // RGBA dispatcher — 4:2:2's row contract is identical.
-        if let Some(buf) = rgba.as_deref_mut() {
+        // Strategy A output mode resolution — see Yuv420p impl above.
+        // Reuses NV12 dispatchers (RGB and RGBA) since 4:2:2's row
+        // contract is identical.
+        let want_rgb = rgb.is_some();
+        let want_rgba = rgba.is_some();
+        let want_hsv = hsv.is_some();
+        let need_rgb_kernel = want_rgb || want_hsv;
+
+        if want_rgba && !need_rgb_kernel {
+            let rgba_buf = rgba.as_deref_mut().unwrap();
             let rgba_plane_end =
                 one_plane_end
                     .checked_mul(4)
@@ -2018,17 +2100,16 @@ impl PixelSink for MixedSinker<'_, Nv16> {
             nv12_to_rgba_row(
                 row.y(),
                 row.uv(),
-                &mut buf[rgba_plane_start..rgba_plane_end],
+                &mut rgba_buf[rgba_plane_start..rgba_plane_end],
                 w,
                 row.matrix(),
                 row.full_range(),
                 use_simd,
             );
+            return Ok(());
         }
 
-        let want_rgb = rgb.is_some();
-        let want_hsv = hsv.is_some();
-        if !want_rgb && !want_hsv {
+        if !need_rgb_kernel {
             return Ok(());
         }
@@ -2079,6 +2160,20 @@ impl PixelSink for MixedSinker<'_, Nv16> {
                 use_simd,
             );
         }
+
+        if let Some(buf) = rgba.as_deref_mut() {
+            let rgba_plane_end =
+                one_plane_end
+                    .checked_mul(4)
+                    .ok_or(MixedSinkerError::GeometryOverflow {
+                        width: w,
+                        height: h,
+                        channels: 4,
+                    })?;
+            let rgba_plane_start = one_plane_start * 4;
+            expand_rgb_to_rgba_row(rgb_row, &mut buf[rgba_plane_start..rgba_plane_end], w);
+        }
+
         Ok(())
     }
 }
@@ -2185,9 +2280,14 @@ impl PixelSink for MixedSinker<'_, Nv21> {
             luma[one_plane_start..one_plane_end].copy_from_slice(&row.y()[..w]);
         }
 
-        // Native RGBA: independent kernel run, separate from RGB. Default
-        // alpha = 0xFF since NV21 has no alpha plane.
-        if let Some(buf) = rgba.as_deref_mut() {
+        // Strategy A output mode resolution — see Yuv420p impl above.
+        let want_rgb = rgb.is_some();
+        let want_rgba = rgba.is_some();
+        let want_hsv = hsv.is_some();
+        let need_rgb_kernel = want_rgb || want_hsv;
+
+        if want_rgba && !need_rgb_kernel {
+            let rgba_buf = rgba.as_deref_mut().unwrap();
             let rgba_plane_end =
                 one_plane_end
                     .checked_mul(4)
@@ -2200,17 +2300,16 @@ impl PixelSink for MixedSinker<'_, Nv21> {
             nv21_to_rgba_row(
                 row.y(),
                 row.vu_half(),
-                &mut buf[rgba_plane_start..rgba_plane_end],
+                &mut rgba_buf[rgba_plane_start..rgba_plane_end],
                 w,
                 row.matrix(),
                 row.full_range(),
                 use_simd,
             );
+            return Ok(());
         }
 
-        let want_rgb = rgb.is_some();
-        let want_hsv = hsv.is_some();
-        if !want_rgb && !want_hsv {
+        if !need_rgb_kernel {
             return Ok(());
         }
@@ -2262,6 +2361,20 @@ impl PixelSink for MixedSinker<'_, Nv21> {
                 use_simd,
             );
         }
+
+        if let Some(buf) = rgba.as_deref_mut() {
+            let rgba_plane_end =
+                one_plane_end
+                    .checked_mul(4)
+                    .ok_or(MixedSinkerError::GeometryOverflow {
+                        width: w,
+                        height: h,
+                        channels: 4,
+                    })?;
+            let rgba_plane_start = one_plane_start * 4;
+            expand_rgb_to_rgba_row(rgb_row, &mut buf[rgba_plane_start..rgba_plane_end], w);
+        }
+
         Ok(())
     }
 }
@@ -2273,6 +2386,38 @@ impl PixelSink for MixedSinker<'_, Nv21> {
 // is its own family (`nv24_to_rgb_row`) since chroma is no longer
 // duplicated across columns.
 
+impl<'a> MixedSinker<'a, Nv24> {
+    /// Attaches a packed 32‑bit RGBA output buffer.
+    ///
+    /// Only available on sinker types whose `PixelSink` impl writes
+    /// RGBA — see [`MixedSinker::<Yuv420p>::with_rgba`] for the same
+    /// rationale and constraints. Nv24 has no alpha plane, so every
+    /// alpha byte is filled with `0xFF` (opaque).
+    ///
+    /// Returns `Err(RgbaBufferTooShort)` if
+    /// `buf.len() < width × height × 4`, or `Err(GeometryOverflow)` on
+    /// 32‑bit targets when the product overflows.
+    #[cfg_attr(not(tarpaulin), inline(always))]
+    pub fn with_rgba(mut self, buf: &'a mut [u8]) -> Result<Self, MixedSinkerError> {
+        self.set_rgba(buf)?;
+        Ok(self)
+    }
+
+    /// In-place variant of [`with_rgba`](Self::with_rgba).
+    #[cfg_attr(not(tarpaulin), inline(always))]
+    pub fn set_rgba(&mut self, buf: &'a mut [u8]) -> Result<&mut Self, MixedSinkerError> {
+        let expected = self.frame_bytes(4)?;
+        if buf.len() < expected {
+            return Err(MixedSinkerError::RgbaBufferTooShort {
+                expected,
+                actual: buf.len(),
+            });
+        }
+        self.rgba = Some(buf);
+        Ok(self)
+    }
+}
+
 impl Nv24Sink for MixedSinker<'_, Nv24> {}
 
 impl PixelSink for MixedSinker<'_, Nv24> {
@@ -2322,6 +2467,7 @@ impl PixelSink for MixedSinker<'_, Nv24> {
         let Self {
             rgb,
+            rgba,
             luma,
             hsv,
             rgb_scratch,
@@ -2336,8 +2482,37 @@ impl PixelSink for MixedSinker<'_, Nv24> {
         }
 
        let want_rgb = rgb.is_some();
+        let want_rgba = rgba.is_some();
         let want_hsv = hsv.is_some();
-        if !want_rgb && !want_hsv {
+        let need_rgb_kernel = want_rgb || want_hsv;
+
+        // Standalone RGBA path: the caller wants only RGBA (no RGB / HSV),
+        // so run the dedicated RGBA kernel directly into the output buffer.
+        // Avoids both the scratch allocation and the expand-pad pass.
+        if want_rgba && !need_rgb_kernel {
+            let rgba_buf = rgba.as_deref_mut().unwrap();
+            let rgba_plane_end =
+                one_plane_end
+                    .checked_mul(4)
+                    .ok_or(MixedSinkerError::GeometryOverflow {
+                        width: w,
+                        height: h,
+                        channels: 4,
+                    })?;
+            let rgba_plane_start = one_plane_start * 4;
+            nv24_to_rgba_row(
+                row.y(),
+                row.uv(),
+                &mut rgba_buf[rgba_plane_start..rgba_plane_end],
+                w,
+                row.matrix(),
+                row.full_range(),
+                use_simd,
+            );
+            return Ok(());
+        }
+
+        if !need_rgb_kernel {
             return Ok(());
         }
@@ -2387,6 +2562,23 @@ impl PixelSink for MixedSinker<'_, Nv24> {
                 use_simd,
             );
         }
+
+        // Strategy A: when both RGB-side and RGBA outputs are requested,
+        // derive RGBA from the just-computed RGB row (memory-bound copy +
+        // 0xFF alpha pad) instead of running a second YUV→RGB kernel.
+        if let Some(buf) = rgba.as_deref_mut() {
+            let rgba_plane_end =
+                one_plane_end
+                    .checked_mul(4)
+                    .ok_or(MixedSinkerError::GeometryOverflow {
+                        width: w,
+                        height: h,
+                        channels: 4,
+                    })?;
+            let rgba_plane_start = one_plane_start * 4;
+            expand_rgb_to_rgba_row(rgb_row, &mut buf[rgba_plane_start..rgba_plane_end], w);
+        }
+
         Ok(())
     }
 }
@@ -2396,6 +2588,33 @@ impl PixelSink for MixedSinker<'_, Nv24> {
 // Structurally identical to the Nv24 impl — the row primitive hides
 // the V/U byte-order difference.
 
+impl<'a> MixedSinker<'a, Nv42> {
+    /// Attaches a packed 32‑bit RGBA output buffer.
+    ///
+    /// See [`MixedSinker::<Nv24>::with_rgba`] for the same rationale and
+    /// constraints; Nv42 differs only in chroma byte order (V before U).
+    /// Returns `Err(RgbaBufferTooShort)` if `buf.len() < width × height × 4`.
+    #[cfg_attr(not(tarpaulin), inline(always))]
+    pub fn with_rgba(mut self, buf: &'a mut [u8]) -> Result<Self, MixedSinkerError> {
+        self.set_rgba(buf)?;
+        Ok(self)
+    }
+
+    /// In-place variant of [`with_rgba`](Self::with_rgba).
+    #[cfg_attr(not(tarpaulin), inline(always))]
+    pub fn set_rgba(&mut self, buf: &'a mut [u8]) -> Result<&mut Self, MixedSinkerError> {
+        let expected = self.frame_bytes(4)?;
+        if buf.len() < expected {
+            return Err(MixedSinkerError::RgbaBufferTooShort {
+                expected,
+                actual: buf.len(),
+            });
+        }
+        self.rgba = Some(buf);
+        Ok(self)
+    }
+}
+
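Since this same three-flag resolution now appears in every wired impl, a compact pure-logic sketch of the decision table may help reviewers (plain Rust; the names mirror the locals above, and the strings are descriptive only):

// Per-row output-mode resolution shared by the wired PixelSink impls.
fn mode(want_rgb: bool, want_rgba: bool, want_hsv: bool) -> &'static str {
    let need_rgb_kernel = want_rgb || want_hsv;
    if want_rgba && !need_rgb_kernel {
        "dedicated RGBA kernel (4 bpp store), early return"
    } else if need_rgb_kernel && want_rgba {
        "RGB kernel once, then expand_rgb_to_rgba_row fan-out"
    } else if need_rgb_kernel {
        "RGB kernel only"
    } else {
        "luma only, nothing else to do"
    }
}

fn main() {
    assert_eq!(mode(false, true, false), "dedicated RGBA kernel (4 bpp store), early return");
    assert_eq!(mode(true, true, false), "RGB kernel once, then expand_rgb_to_rgba_row fan-out");
}
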
 impl Nv42Sink for MixedSinker<'_, Nv42> {}
 
 impl PixelSink for MixedSinker<'_, Nv42> {
@@ -2442,6 +2661,7 @@ impl PixelSink for MixedSinker<'_, Nv42> {
         let Self {
             rgb,
+            rgba,
             luma,
             hsv,
             rgb_scratch,
@@ -2456,8 +2676,34 @@ impl PixelSink for MixedSinker<'_, Nv42> {
         }
 
         let want_rgb = rgb.is_some();
+        let want_rgba = rgba.is_some();
         let want_hsv = hsv.is_some();
-        if !want_rgb && !want_hsv {
+        let need_rgb_kernel = want_rgb || want_hsv;
+
+        if want_rgba && !need_rgb_kernel {
+            let rgba_buf = rgba.as_deref_mut().unwrap();
+            let rgba_plane_end =
+                one_plane_end
+                    .checked_mul(4)
+                    .ok_or(MixedSinkerError::GeometryOverflow {
+                        width: w,
+                        height: h,
+                        channels: 4,
+                    })?;
+            let rgba_plane_start = one_plane_start * 4;
+            nv42_to_rgba_row(
+                row.y(),
+                row.vu(),
+                &mut rgba_buf[rgba_plane_start..rgba_plane_end],
+                w,
+                row.matrix(),
+                row.full_range(),
+                use_simd,
+            );
+            return Ok(());
+        }
+
+        if !need_rgb_kernel {
             return Ok(());
         }
@@ -2507,6 +2753,20 @@ impl PixelSink for MixedSinker<'_, Nv42> {
                 use_simd,
             );
         }
+
+        if let Some(buf) = rgba.as_deref_mut() {
+            let rgba_plane_end =
+                one_plane_end
+                    .checked_mul(4)
+                    .ok_or(MixedSinkerError::GeometryOverflow {
+                        width: w,
+                        height: h,
+                        channels: 4,
+                    })?;
+            let rgba_plane_start = one_plane_start * 4;
+            expand_rgb_to_rgba_row(rgb_row, &mut buf[rgba_plane_start..rgba_plane_end], w);
+        }
+
         Ok(())
     }
 }
@@ -9876,6 +10136,386 @@ mod tests {
             );
     }
 
+    // ---- Nv24/Nv42 RGBA (Ship 8 PR 4b) tests --------------------------------
+
+    #[test]
+    #[cfg_attr(
+        miri,
+        ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri"
+    )]
+    fn nv24_rgba_only_converts_gray_to_gray_with_opaque_alpha() {
+        let (yp, uvp) = solid_nv24_frame(16, 8, 128, 128, 128);
+        let src = Nv24Frame::new(&yp, &uvp, 16, 8, 16, 32);
+
+        let mut rgba = std::vec![0u8; 16 * 8 * 4];
+        let mut sink = MixedSinker::<Nv24>::new(16, 8)
+            .with_rgba(&mut rgba)
+            .unwrap();
+        nv24_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap();
+
+        for px in rgba.chunks(4) {
+            assert!(px[0].abs_diff(128) <= 1, "R");
+            assert_eq!(px[0], px[1], "RGB monochromatic");
+            assert_eq!(px[1], px[2], "RGB monochromatic");
+            assert_eq!(px[3], 0xFF, "alpha must default to opaque");
+        }
+    }
+
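Why gray input is the right smoke test: with U = V = 128 every chroma delta is zero, so regardless of matrix the row kernel reduces to R = G = B = f(Y). A one-line arithmetic sketch (plain Rust, mirroring the `u_d`/`v_d` midpoint deltas in the scalar kernel):

fn main() {
    let (u, v) = (128i32, 128i32);
    // The kernels work on deltas from the 128 chroma midpoint; zero
    // deltas zero out every chroma term, leaving only the Y path.
    assert_eq!((u - 128, v - 128), (0, 0));
}
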
+    #[test]
+    #[cfg_attr(
+        miri,
+        ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri"
+    )]
+    fn nv24_with_rgb_and_with_rgba_produce_byte_identical_rgb_bytes() {
+        // Strategy A invariant: when both RGB and RGBA are attached, the
+        // RGBA bytes must be byte-for-byte identical to the RGB row +
+        // 0xFF alpha. This is the cross-format guarantee that holds even
+        // after we replaced the dual-kernel path with the
+        // expand_rgb_to_rgba_row fan-out.
+        let w = 32u32;
+        let h = 16u32;
+        let ws = w as usize;
+        let hs = h as usize;
+        let (yp, uvp) = solid_nv24_frame(w, h, 180, 60, 200);
+        let src = Nv24Frame::new(&yp, &uvp, w, h, w, 2 * w);
+
+        let mut rgb = std::vec![0u8; ws * hs * 3];
+        let mut rgba = std::vec![0u8; ws * hs * 4];
+        let mut sink = MixedSinker::<Nv24>::new(ws, hs)
+            .with_rgb(&mut rgb)
+            .unwrap()
+            .with_rgba(&mut rgba)
+            .unwrap();
+        nv24_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap();
+
+        for i in 0..(ws * hs) {
+            assert_eq!(rgba[i * 4], rgb[i * 3], "R differs at pixel {i}");
+            assert_eq!(rgba[i * 4 + 1], rgb[i * 3 + 1], "G differs at pixel {i}");
+            assert_eq!(rgba[i * 4 + 2], rgb[i * 3 + 2], "B differs at pixel {i}");
+            assert_eq!(rgba[i * 4 + 3], 0xFF, "A not opaque at pixel {i}");
+        }
+    }
+
+    #[test]
+    fn nv24_rgba_buffer_too_short_returns_err() {
+        let mut rgba_short = std::vec![0u8; 16 * 8 * 4 - 1];
+        let result = MixedSinker::<Nv24>::new(16, 8).with_rgba(&mut rgba_short);
+        let Err(err) = result else {
+            panic!("expected RgbaBufferTooShort error");
+        };
+        assert!(matches!(
+            err,
+            MixedSinkerError::RgbaBufferTooShort {
+                expected: 512,
+                actual: 511,
+            }
+        ));
+    }
+
+    #[test]
+    #[cfg_attr(
+        miri,
+        ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri"
+    )]
+    fn nv24_rgba_simd_matches_scalar_with_random_yuv() {
+        // Width 1922 forces both the SIMD main loop AND scalar tail across
+        // every backend block size (16/32/64).
+        let w = 1922usize;
+        let h = 4usize;
+        let mut yp = std::vec![0u8; w * h];
+        let mut uvp = std::vec![0u8; 2 * w * h];
+        pseudo_random_u8(&mut yp, 0xC001_C0DE);
+        pseudo_random_u8(&mut uvp, 0xCAFE_F00D);
+        let src = Nv24Frame::new(&yp, &uvp, w as u32, h as u32, w as u32, (2 * w) as u32);
+
+        for &matrix in &[
+            ColorMatrix::Bt601,
+            ColorMatrix::Bt709,
+            ColorMatrix::Bt2020Ncl,
+            ColorMatrix::YCgCo,
+        ] {
+            for &full_range in &[true, false] {
+                let mut rgba_simd = std::vec![0u8; w * h * 4];
+                let mut rgba_scalar = std::vec![0u8; w * h * 4];
+
+                let mut s_simd = MixedSinker::<Nv24>::new(w, h)
+                    .with_rgba(&mut rgba_simd)
+                    .unwrap();
+                nv24_to(&src, full_range, matrix, &mut s_simd).unwrap();
+
+                let mut s_scalar = MixedSinker::<Nv24>::new(w, h)
+                    .with_rgba(&mut rgba_scalar)
+                    .unwrap();
+                s_scalar.set_simd(false);
+                nv24_to(&src, full_range, matrix, &mut s_scalar).unwrap();
+
+                assert_eq!(
+                    rgba_simd, rgba_scalar,
+                    "Nv24 RGBA SIMD ≠ scalar (matrix={matrix:?}, full_range={full_range})"
+                );
+            }
+        }
+    }
+
+    #[test]
+    #[cfg_attr(
+        miri,
+        ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri"
+    )]
+    fn nv42_rgba_only_converts_gray_to_gray_with_opaque_alpha() {
+        let (yp, vup) = solid_nv42_frame(16, 8, 128, 128, 128);
+        let src = Nv42Frame::new(&yp, &vup, 16, 8, 16, 32);
+
+        let mut rgba = std::vec![0u8; 16 * 8 * 4];
+        let mut sink = MixedSinker::<Nv42>::new(16, 8)
+            .with_rgba(&mut rgba)
+            .unwrap();
+        nv42_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap();
+
+        for px in rgba.chunks(4) {
+            assert!(px[0].abs_diff(128) <= 1);
+            assert_eq!(px[0], px[1]);
+            assert_eq!(px[1], px[2]);
+            assert_eq!(px[3], 0xFF);
+        }
+    }
+
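The 1922-pixel width in the SIMD-vs-scalar tests is easy to sanity-check: it leaves the same 2-pixel scalar tail for every vector block size named in the test comment. A tiny verification of the arithmetic (plain Rust):

fn main() {
    // 1922 = 120×16 + 2 = 60×32 + 2 = 30×64 + 2, so each backend's
    // main loop runs and a 2-pixel tail exercises the scalar path.
    for block in [16usize, 32, 64] {
        assert_eq!(1922 % block, 2, "tail for block size {block}");
    }
}
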
+    #[test]
+    #[cfg_attr(
+        miri,
+        ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri"
+    )]
+    fn nv42_with_rgb_and_with_rgba_produce_byte_identical_rgb_bytes() {
+        let w = 32u32;
+        let h = 16u32;
+        let ws = w as usize;
+        let hs = h as usize;
+        let (yp, vup) = solid_nv42_frame(w, h, 180, 60, 200);
+        let src = Nv42Frame::new(&yp, &vup, w, h, w, 2 * w);
+
+        let mut rgb = std::vec![0u8; ws * hs * 3];
+        let mut rgba = std::vec![0u8; ws * hs * 4];
+        let mut sink = MixedSinker::<Nv42>::new(ws, hs)
+            .with_rgb(&mut rgb)
+            .unwrap()
+            .with_rgba(&mut rgba)
+            .unwrap();
+        nv42_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap();
+
+        for i in 0..(ws * hs) {
+            assert_eq!(rgba[i * 4], rgb[i * 3]);
+            assert_eq!(rgba[i * 4 + 1], rgb[i * 3 + 1]);
+            assert_eq!(rgba[i * 4 + 2], rgb[i * 3 + 2]);
+            assert_eq!(rgba[i * 4 + 3], 0xFF);
+        }
+    }
+
+    #[test]
+    fn nv42_rgba_buffer_too_short_returns_err() {
+        let mut rgba_short = std::vec![0u8; 16 * 8 * 4 - 1];
+        let result = MixedSinker::<Nv42>::new(16, 8).with_rgba(&mut rgba_short);
+        let Err(err) = result else {
+            panic!("expected RgbaBufferTooShort error");
+        };
+        assert!(matches!(
+            err,
+            MixedSinkerError::RgbaBufferTooShort {
+                expected: 512,
+                actual: 511,
+            }
+        ));
+    }
+
+    #[test]
+    #[cfg_attr(
+        miri,
+        ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri"
+    )]
+    fn nv42_rgba_simd_matches_scalar_with_random_yuv() {
+        let w = 1922usize;
+        let h = 4usize;
+        let mut yp = std::vec![0u8; w * h];
+        let mut vup = std::vec![0u8; 2 * w * h];
+        pseudo_random_u8(&mut yp, 0xC001_C0DE);
+        pseudo_random_u8(&mut vup, 0xCAFE_F00D);
+        let src = Nv42Frame::new(&yp, &vup, w as u32, h as u32, w as u32, (2 * w) as u32);
+
+        for &matrix in &[
+            ColorMatrix::Bt601,
+            ColorMatrix::Bt709,
+            ColorMatrix::Bt2020Ncl,
+            ColorMatrix::YCgCo,
+        ] {
+            for &full_range in &[true, false] {
+                let mut rgba_simd = std::vec![0u8; w * h * 4];
+                let mut rgba_scalar = std::vec![0u8; w * h * 4];
+
+                let mut s_simd = MixedSinker::<Nv42>::new(w, h)
+                    .with_rgba(&mut rgba_simd)
+                    .unwrap();
+                nv42_to(&src, full_range, matrix, &mut s_simd).unwrap();
+
+                let mut s_scalar = MixedSinker::<Nv42>::new(w, h)
+                    .with_rgba(&mut rgba_scalar)
+                    .unwrap();
+                s_scalar.set_simd(false);
+                nv42_to(&src, full_range, matrix, &mut s_scalar).unwrap();
+
+                assert_eq!(
+                    rgba_simd, rgba_scalar,
+                    "Nv42 RGBA SIMD ≠ scalar (matrix={matrix:?}, full_range={full_range})"
+                );
+            }
+        }
+    }
+
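The cross-format test that follows asserts one invariant per pixel; stated standalone (plain Rust, independent of the crate, helper name illustrative):

// RGBA must equal the RGB triple plus a forced 0xFF alpha byte.
fn rgba_matches_rgb(rgb: &[u8], rgba: &[u8]) -> bool {
    rgb.chunks_exact(3)
        .zip(rgba.chunks_exact(4))
        .all(|(p3, p4)| p4[..3] == *p3 && p4[3] == 0xFF)
}

fn main() {
    assert!(rgba_matches_rgb(&[1, 2, 3], &[1, 2, 3, 0xFF]));
    assert!(!rgba_matches_rgb(&[1, 2, 3], &[1, 2, 3, 0x00]));
}
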
+    // Cross-format Strategy A invariant: when both RGB+RGBA are
+    // attached, all 8 wired families derive RGBA from the RGB row via
+    // expand_rgb_to_rgba_row. This test runs all 8 process methods with
+    // the same gray input and asserts every RGBA sample equals the RGB
+    // sample with alpha = 0xFF — proving the fan-out shape never
+    // diverges from the kernel output.
+    #[test]
+    #[cfg_attr(
+        miri,
+        ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri"
+    )]
+    fn strategy_a_rgb_and_rgba_byte_identical_for_all_wired_families() {
+        let w: u32 = 32;
+        let h: u32 = 8;
+        let ws = w as usize;
+        let hs = h as usize;
+
+        let assert_match = |rgb: &[u8], rgba: &[u8], who: &str| {
+            for i in 0..(ws * hs) {
+                assert_eq!(rgba[i * 4], rgb[i * 3], "{who}: R differs at px {i}");
+                assert_eq!(
+                    rgba[i * 4 + 1],
+                    rgb[i * 3 + 1],
+                    "{who}: G differs at px {i}"
+                );
+                assert_eq!(
+                    rgba[i * 4 + 2],
+                    rgb[i * 3 + 2],
+                    "{who}: B differs at px {i}"
+                );
+                assert_eq!(rgba[i * 4 + 3], 0xFF, "{who}: alpha not opaque at px {i}");
+            }
+        };
+
+        {
+            let (yp, up, vp) = solid_yuv420p_frame(w, h, 200, 128, 128);
+            let src = Yuv420pFrame::new(&yp, &up, &vp, w, h, w, w / 2, w / 2);
+            let mut rgb = std::vec![0u8; ws * hs * 3];
+            let mut rgba = std::vec![0u8; ws * hs * 4];
+            let mut sink = MixedSinker::<Yuv420p>::new(ws, hs)
+                .with_rgb(&mut rgb)
+                .unwrap()
+                .with_rgba(&mut rgba)
+                .unwrap();
+            yuv420p_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap();
+            assert_match(&rgb, &rgba, "Yuv420p");
+        }
+
+        {
+            let (yp, up, vp) = solid_yuv422p_frame(w, h, 200, 128, 128);
+            let src = Yuv422pFrame::new(&yp, &up, &vp, w, h, w, w / 2, w / 2);
+            let mut rgb = std::vec![0u8; ws * hs * 3];
+            let mut rgba = std::vec![0u8; ws * hs * 4];
+            let mut sink = MixedSinker::<Yuv422p>::new(ws, hs)
+                .with_rgb(&mut rgb)
+                .unwrap()
+                .with_rgba(&mut rgba)
+                .unwrap();
+            yuv422p_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap();
+            assert_match(&rgb, &rgba, "Yuv422p");
+        }
+
+        {
+            let (yp, up, vp) = solid_yuv444p_frame(w, h, 200, 128, 128);
+            let src = Yuv444pFrame::new(&yp, &up, &vp, w, h, w, w, w);
+            let mut rgb = std::vec![0u8; ws * hs * 3];
+            let mut rgba = std::vec![0u8; ws * hs * 4];
+            let mut sink = MixedSinker::<Yuv444p>::new(ws, hs)
+                .with_rgb(&mut rgb)
+                .unwrap()
+                .with_rgba(&mut rgba)
+                .unwrap();
+            yuv444p_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap();
+            assert_match(&rgb, &rgba, "Yuv444p");
+        }
+
+        {
+            let (yp, uvp) = solid_nv12_frame(w, h, 200, 128, 128);
+            let src = Nv12Frame::new(&yp, &uvp, w, h, w, w);
+            let mut rgb = std::vec![0u8; ws * hs * 3];
+            let mut rgba = std::vec![0u8; ws * hs * 4];
+            let mut sink = MixedSinker::<Nv12>::new(ws, hs)
+                .with_rgb(&mut rgb)
+                .unwrap()
+                .with_rgba(&mut rgba)
+                .unwrap();
+            nv12_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap();
+            assert_match(&rgb, &rgba, "Nv12");
+        }
+
+        {
+            let (yp, vup) = solid_nv21_frame(w, h, 200, 128, 128);
+            let src = Nv21Frame::new(&yp, &vup, w, h, w, w);
+            let mut rgb = std::vec![0u8; ws * hs * 3];
+            let mut rgba = std::vec![0u8; ws * hs * 4];
+            let mut sink = MixedSinker::<Nv21>::new(ws, hs)
+                .with_rgb(&mut rgb)
+                .unwrap()
+                .with_rgba(&mut rgba)
+                .unwrap();
+            nv21_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap();
+            assert_match(&rgb, &rgba, "Nv21");
+        }
+
+        {
+            let (yp, uvp) = solid_nv16_frame(w, h, 200, 128, 128);
+            let src = Nv16Frame::new(&yp, &uvp, w, h, w, w);
+            let mut rgb = std::vec![0u8; ws * hs * 3];
+            let mut rgba = std::vec![0u8; ws * hs * 4];
+            let mut sink = MixedSinker::<Nv16>::new(ws, hs)
+                .with_rgb(&mut rgb)
+                .unwrap()
+                .with_rgba(&mut rgba)
+                .unwrap();
+            nv16_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap();
+            assert_match(&rgb, &rgba, "Nv16");
+        }
+
+        {
+            let (yp, uvp) = solid_nv24_frame(w, h, 200, 128, 128);
+            let src = Nv24Frame::new(&yp, &uvp, w, h, w, 2 * w);
+            let mut rgb = std::vec![0u8; ws * hs * 3];
+            let mut rgba = std::vec![0u8; ws * hs * 4];
+            let mut sink = MixedSinker::<Nv24>::new(ws, hs)
+                .with_rgb(&mut rgb)
+                .unwrap()
+                .with_rgba(&mut rgba)
+                .unwrap();
+            nv24_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap();
+            assert_match(&rgb, &rgba, "Nv24");
+        }
+
+        {
+            let (yp, vup) = solid_nv42_frame(w, h, 200, 128, 128);
+            let src = Nv42Frame::new(&yp, &vup, w, h, w, 2 * w);
+            let mut rgb = std::vec![0u8; ws * hs * 3];
+            let mut rgba = std::vec![0u8; ws * hs * 4];
+            let mut sink = MixedSinker::<Nv42>::new(ws, hs)
+                .with_rgb(&mut rgb)
+                .unwrap()
+                .with_rgba(&mut rgba)
+                .unwrap();
+            nv42_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap();
+            assert_match(&rgb, &rgba, "Nv42");
+        }
+    }
+
     // ---- Yuv420p10 --------------------------------------------------------
 
     fn solid_yuv420p10_frame(