From 9bd491b96fffd8052c06ef074af35550b6fb99af Mon Sep 17 00:00:00 2001 From: uqio <276879906+uqio@users.noreply.github.com> Date: Sun, 26 Apr 2026 13:46:56 +1200 Subject: [PATCH 1/5] update --- src/row/arch/neon.rs | 82 +++++++++- src/row/arch/wasm_simd128.rs | 145 +++++++++++++++++- src/row/arch/x86_avx2.rs | 104 ++++++++++++- src/row/arch/x86_avx512.rs | 113 +++++++++++++- src/row/arch/x86_common.rs | 86 +++++++++++ src/row/arch/x86_sse41.rs | 82 +++++++++- src/row/mod.rs | 103 +++++++++++++ src/row/scalar.rs | 71 ++++++++- src/sinker/mixed.rs | 279 ++++++++++++++++++++++++++++++++++- 9 files changed, 1023 insertions(+), 42 deletions(-) diff --git a/src/row/arch/neon.rs b/src/row/arch/neon.rs index 8d40e61..bfc52f6 100644 --- a/src/row/arch/neon.rs +++ b/src/row/arch/neon.rs @@ -69,12 +69,71 @@ pub(crate) unsafe fn yuv_420_to_rgb_row( width: usize, matrix: ColorMatrix, full_range: bool, +) { + // SAFETY: caller-checked NEON availability + slice bounds — see + // [`yuv_420_to_rgb_or_rgba_row`] safety contract. + unsafe { + yuv_420_to_rgb_or_rgba_row::(y, u_half, v_half, rgb_out, width, matrix, full_range); + } +} + +/// NEON YUV 4:2:0 → packed **RGBA** (8-bit). Same contract as +/// [`yuv_420_to_rgb_row`] but writes 4 bytes per pixel (R, G, B, +/// `0xFF`). +/// +/// # Safety +/// +/// 1. NEON must be available on the current CPU. +/// 2. `width & 1 == 0`. +/// 3. `y.len() >= width`, `u_half.len() >= width / 2`, +/// `v_half.len() >= width / 2`, `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "neon")] +pub(crate) unsafe fn yuv_420_to_rgba_row( + y: &[u8], + u_half: &[u8], + v_half: &[u8], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller-checked NEON availability + slice bounds — see + // [`yuv_420_to_rgb_or_rgba_row`] safety contract. + unsafe { + yuv_420_to_rgb_or_rgba_row::(y, u_half, v_half, rgba_out, width, matrix, full_range); + } +} + +/// Shared NEON kernel for [`yuv_420_to_rgb_row`] (`ALPHA = false`, +/// `vst3q_u8`) and [`yuv_420_to_rgba_row`] (`ALPHA = true`, +/// `vst4q_u8` with constant `0xFF` alpha). Math is byte-identical to +/// `scalar::yuv_420_to_rgb_or_rgba_row::`; only the per-block +/// store intrinsic differs. `const` generic monomorphizes per call +/// site, so the `if ALPHA` branches are eliminated. +/// +/// # Safety +/// +/// Same as [`yuv_420_to_rgb_row`] / [`yuv_420_to_rgba_row`]; the +/// `out` slice must be `>= width * (if ALPHA { 4 } else { 3 })` +/// bytes long. +#[inline] +#[target_feature(enable = "neon")] +pub(crate) unsafe fn yuv_420_to_rgb_or_rgba_row( + y: &[u8], + u_half: &[u8], + v_half: &[u8], + out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, ) { debug_assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); debug_assert!(y.len() >= width); debug_assert!(u_half.len() >= width / 2); debug_assert!(v_half.len() >= width / 2); - debug_assert!(rgb_out.len() >= width * 3); + let bpp: usize = if ALPHA { 4 } else { 3 }; + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params(full_range); @@ -97,6 +156,10 @@ pub(crate) unsafe fn yuv_420_to_rgb_row( let cgv = vdupq_n_s32(coeffs.g_v()); let cbu = vdupq_n_s32(coeffs.b_u()); let cbv = vdupq_n_s32(coeffs.b_v()); + // Constant opaque-alpha vector for the RGBA path. Materializing + // it outside the loop costs one `vdupq_n_u8` regardless of + // ALPHA; the compiler DCE's it when ALPHA = false. 
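+    // (vdupq_n_u8 splats one byte across all 16 lanes; when ALPHA is
+    // true, the store below passes this register through unchanged as
+    // the fourth channel of the uint8x16x4_t handed to vst4q_u8.)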
+ let alpha_u8 = vdupq_n_u8(0xFF); let mut x = 0usize; while x + 16 <= width { @@ -158,9 +221,16 @@ pub(crate) unsafe fn yuv_420_to_rgb_row( vqmovun_s16(vqaddq_s16(y_scaled_hi, r_dup_hi)), ); - // vst3q_u8 writes 48 bytes as interleaved R, G, B triples. - let rgb = uint8x16x3_t(r_u8, g_u8, b_u8); - vst3q_u8(rgb_out.as_mut_ptr().add(x * 3), rgb); + if ALPHA { + // vst4q_u8 writes 64 bytes as interleaved R, G, B, A + // quadruplets — native AArch64 4-channel store. + let rgba = uint8x16x4_t(r_u8, g_u8, b_u8, alpha_u8); + vst4q_u8(out.as_mut_ptr().add(x * 4), rgba); + } else { + // vst3q_u8 writes 48 bytes as interleaved R, G, B triples. + let rgb = uint8x16x3_t(r_u8, g_u8, b_u8); + vst3q_u8(out.as_mut_ptr().add(x * 3), rgb); + } x += 16; } @@ -168,11 +238,11 @@ pub(crate) unsafe fn yuv_420_to_rgb_row( // Scalar tail for the 0..14 leftover pixels (always even, 4:2:0 // requires even width so x/2 and width/2 are well‑defined). if x < width { - scalar::yuv_420_to_rgb_row( + scalar::yuv_420_to_rgb_or_rgba_row::( &y[x..width], &u_half[x / 2..width / 2], &v_half[x / 2..width / 2], - &mut rgb_out[x * 3..width * 3], + &mut out[x * bpp..width * bpp], width - x, matrix, full_range, diff --git a/src/row/arch/wasm_simd128.rs b/src/row/arch/wasm_simd128.rs index b13a9a2..26b8363 100644 --- a/src/row/arch/wasm_simd128.rs +++ b/src/row/arch/wasm_simd128.rs @@ -73,12 +73,70 @@ pub(crate) unsafe fn yuv_420_to_rgb_row( width: usize, matrix: ColorMatrix, full_range: bool, +) { + // SAFETY: caller-checked simd128 availability + slice bounds — + // see [`yuv_420_to_rgb_or_rgba_row`] safety contract. + unsafe { + yuv_420_to_rgb_or_rgba_row::(y, u_half, v_half, rgb_out, width, matrix, full_range); + } +} + +/// WASM simd128 YUV 4:2:0 → packed **RGBA** (8-bit). Same contract +/// as [`yuv_420_to_rgb_row`] but writes 4 bytes per pixel (R, G, B, +/// `0xFF`). +/// +/// # Safety +/// +/// 1. simd128 must be enabled at compile time. +/// 2. `width & 1 == 0`. +/// 3. `y.len() >= width`, `u_half.len() >= width / 2`, +/// `v_half.len() >= width / 2`, `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "simd128")] +pub(crate) unsafe fn yuv_420_to_rgba_row( + y: &[u8], + u_half: &[u8], + v_half: &[u8], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller-checked simd128 availability + slice bounds — + // see [`yuv_420_to_rgb_or_rgba_row`] safety contract. + unsafe { + yuv_420_to_rgb_or_rgba_row::(y, u_half, v_half, rgba_out, width, matrix, full_range); + } +} + +/// Shared WASM simd128 kernel for [`yuv_420_to_rgb_row`] +/// (`ALPHA = false`, [`write_rgb_16`]) and [`yuv_420_to_rgba_row`] +/// (`ALPHA = true`, [`write_rgba_16`] with constant `0xFF` alpha). +/// Math is byte-identical to +/// `scalar::yuv_420_to_rgb_or_rgba_row::`. +/// +/// # Safety +/// +/// Same as [`yuv_420_to_rgb_row`] / [`yuv_420_to_rgba_row`]; the +/// `out` slice must be `>= width * (if ALPHA { 4 } else { 3 })` +/// bytes long. 
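Since every backend repeats this shape, the const-generic pattern is worth seeing in isolation. A minimal, dependency-free sketch with illustrative names, not code from this patch:

```rust
// One kernel, two zero-cost wrappers: `ALPHA` is a compile-time
// constant, so each turbofish instantiation is monomorphized
// separately and the `if ALPHA` test is resolved before codegen.
fn store_pixel<const ALPHA: bool>(out: &mut [u8], i: usize, r: u8, g: u8, b: u8) {
    let bpp = if ALPHA { 4 } else { 3 };
    out[i * bpp] = r;
    out[i * bpp + 1] = g;
    out[i * bpp + 2] = b;
    if ALPHA {
        out[i * bpp + 3] = 0xFF; // constant opaque alpha
    }
}

fn main() {
    let (mut rgb, mut rgba) = ([0u8; 3], [0u8; 4]);
    store_pixel::<false>(&mut rgb, 0, 1, 2, 3);
    store_pixel::<true>(&mut rgba, 0, 1, 2, 3);
    assert_eq!(rgb, [1, 2, 3]);
    assert_eq!(rgba, [1, 2, 3, 0xFF]);
}
```

This is why the NEON and wasm kernels can keep one shared loop yet emit either the 3-channel or the 4-channel store with no per-pixel branch.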
+#[inline] +#[target_feature(enable = "simd128")] +pub(crate) unsafe fn yuv_420_to_rgb_or_rgba_row( + y: &[u8], + u_half: &[u8], + v_half: &[u8], + out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, ) { debug_assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); debug_assert!(y.len() >= width); debug_assert!(u_half.len() >= width / 2); debug_assert!(v_half.len() >= width / 2); - debug_assert!(rgb_out.len() >= width * 3); + let bpp: usize = if ALPHA { 4 } else { 3 }; + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params(full_range); @@ -100,6 +158,9 @@ pub(crate) unsafe fn yuv_420_to_rgb_row( let cgv = i32x4_splat(coeffs.g_v()); let cbu = i32x4_splat(coeffs.b_u()); let cbv = i32x4_splat(coeffs.b_v()); + // Constant opaque-alpha vector for the RGBA path; DCE'd when + // ALPHA = false. + let alpha_u8 = u8x16_splat(0xFF); let mut x = 0usize; while x + 16 <= width { @@ -160,19 +221,24 @@ pub(crate) unsafe fn yuv_420_to_rgb_row( let g_u8 = u8x16_narrow_i16x8(g_lo, g_hi); let r_u8 = u8x16_narrow_i16x8(r_lo, r_hi); - // 3‑way interleave → packed RGB (48 bytes). - write_rgb_16(r_u8, g_u8, b_u8, rgb_out.as_mut_ptr().add(x * 3)); + if ALPHA { + // 4‑way interleave → packed RGBA (64 bytes). + write_rgba_16(r_u8, g_u8, b_u8, alpha_u8, out.as_mut_ptr().add(x * 4)); + } else { + // 3‑way interleave → packed RGB (48 bytes). + write_rgb_16(r_u8, g_u8, b_u8, out.as_mut_ptr().add(x * 3)); + } x += 16; } // Scalar tail for the 0..14 leftover pixels. if x < width { - scalar::yuv_420_to_rgb_row( + scalar::yuv_420_to_rgb_or_rgba_row::( &y[x..width], &u_half[x / 2..width / 2], &v_half[x / 2..width / 2], - &mut rgb_out[x * 3..width * 3], + &mut out[x * bpp..width * bpp], width - x, matrix, full_range, @@ -1958,6 +2024,75 @@ unsafe fn write_rgb_16(r: v128, g: v128, b: v128, ptr: *mut u8) { } } +/// Writes 16 pixels of packed RGBA (64 bytes) from four u8x16 channel +/// vectors. Mirror of [`write_rgb_16`] for the 4-channel output path. +/// +/// The 4-byte stride aligns cleanly with the 16-byte register width: +/// each output block holds exactly 4 RGBA quads (16 bytes), with R, +/// G, B, A interleaved at positions `(0, 1, 2, 3)`, `(4, 5, 6, 7)`, +/// etc. `u8x16_swizzle` indices ≥ 16 zero the lane. +/// +/// # Safety +/// +/// `ptr` must point to at least 64 writable bytes. +#[inline(always)] +unsafe fn write_rgba_16(r: v128, g: v128, b: v128, a: v128, ptr: *mut u8) { + unsafe { + // Block 0 (bytes 0..16): pixels 0..3, source bytes 0..3. + let r0 = i8x16(0, -1, -1, -1, 1, -1, -1, -1, 2, -1, -1, -1, 3, -1, -1, -1); + let g0 = i8x16(-1, 0, -1, -1, -1, 1, -1, -1, -1, 2, -1, -1, -1, 3, -1, -1); + let b0 = i8x16(-1, -1, 0, -1, -1, -1, 1, -1, -1, -1, 2, -1, -1, -1, 3, -1); + let a0 = i8x16(-1, -1, -1, 0, -1, -1, -1, 1, -1, -1, -1, 2, -1, -1, -1, 3); + let out0 = v128_or( + v128_or(u8x16_swizzle(r, r0), u8x16_swizzle(g, g0)), + v128_or(u8x16_swizzle(b, b0), u8x16_swizzle(a, a0)), + ); + + // Block 1 (bytes 16..32): pixels 4..7, source bytes 4..7. 
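+        // (Same rule for every block: block k holds pixels 4k..4k+3;
+        // channel c of pixel p gathers source byte p and lands at
+        // output byte 16*k + 4*(p - 4k) + c. After the u8 cast the -1
+        // entries become indices >= 16, so those lanes stay zero.)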
+ let r1 = i8x16(4, -1, -1, -1, 5, -1, -1, -1, 6, -1, -1, -1, 7, -1, -1, -1); + let g1 = i8x16(-1, 4, -1, -1, -1, 5, -1, -1, -1, 6, -1, -1, -1, 7, -1, -1); + let b1 = i8x16(-1, -1, 4, -1, -1, -1, 5, -1, -1, -1, 6, -1, -1, -1, 7, -1); + let a1 = i8x16(-1, -1, -1, 4, -1, -1, -1, 5, -1, -1, -1, 6, -1, -1, -1, 7); + let out1 = v128_or( + v128_or(u8x16_swizzle(r, r1), u8x16_swizzle(g, g1)), + v128_or(u8x16_swizzle(b, b1), u8x16_swizzle(a, a1)), + ); + + // Block 2 (bytes 32..48): pixels 8..11, source bytes 8..11. + let r2 = i8x16(8, -1, -1, -1, 9, -1, -1, -1, 10, -1, -1, -1, 11, -1, -1, -1); + let g2 = i8x16(-1, 8, -1, -1, -1, 9, -1, -1, -1, 10, -1, -1, -1, 11, -1, -1); + let b2 = i8x16(-1, -1, 8, -1, -1, -1, 9, -1, -1, -1, 10, -1, -1, -1, 11, -1); + let a2 = i8x16(-1, -1, -1, 8, -1, -1, -1, 9, -1, -1, -1, 10, -1, -1, -1, 11); + let out2 = v128_or( + v128_or(u8x16_swizzle(r, r2), u8x16_swizzle(g, g2)), + v128_or(u8x16_swizzle(b, b2), u8x16_swizzle(a, a2)), + ); + + // Block 3 (bytes 48..64): pixels 12..15, source bytes 12..15. + let r3 = i8x16( + 12, -1, -1, -1, 13, -1, -1, -1, 14, -1, -1, -1, 15, -1, -1, -1, + ); + let g3 = i8x16( + -1, 12, -1, -1, -1, 13, -1, -1, -1, 14, -1, -1, -1, 15, -1, -1, + ); + let b3 = i8x16( + -1, -1, 12, -1, -1, -1, 13, -1, -1, -1, 14, -1, -1, -1, 15, -1, + ); + let a3 = i8x16( + -1, -1, -1, 12, -1, -1, -1, 13, -1, -1, -1, 14, -1, -1, -1, 15, + ); + let out3 = v128_or( + v128_or(u8x16_swizzle(r, r3), u8x16_swizzle(g, g3)), + v128_or(u8x16_swizzle(b, b3), u8x16_swizzle(a, a3)), + ); + + v128_store(ptr.cast(), out0); + v128_store(ptr.add(16).cast(), out1); + v128_store(ptr.add(32).cast(), out2); + v128_store(ptr.add(48).cast(), out3); + } +} + // ===== 16-bit YUV → RGB ================================================== /// `(Y_u16x8 - y_off) * y_scale + RND >> 15` for full u16 Y samples. diff --git a/src/row/arch/x86_avx2.rs b/src/row/arch/x86_avx2.rs index 87c65a3..dd384cb 100644 --- a/src/row/arch/x86_avx2.rs +++ b/src/row/arch/x86_avx2.rs @@ -43,7 +43,9 @@ use core::arch::x86_64::*; use crate::{ ColorMatrix, row::{ - arch::x86_common::{rgb_to_hsv_16_pixels, swap_rb_16_pixels, write_rgb_16, write_rgb_u16_8}, + arch::x86_common::{ + rgb_to_hsv_16_pixels, swap_rb_16_pixels, write_rgb_16, write_rgb_u16_8, write_rgba_16, + }, scalar, }, }; @@ -82,12 +84,69 @@ pub(crate) unsafe fn yuv_420_to_rgb_row( width: usize, matrix: ColorMatrix, full_range: bool, +) { + // SAFETY: caller-checked AVX2 availability + slice bounds — see + // [`yuv_420_to_rgb_or_rgba_row`] safety contract. + unsafe { + yuv_420_to_rgb_or_rgba_row::(y, u_half, v_half, rgb_out, width, matrix, full_range); + } +} + +/// AVX2 YUV 4:2:0 → packed **RGBA** (8-bit). Same contract as +/// [`yuv_420_to_rgb_row`] but writes 4 bytes per pixel (R, G, B, +/// `0xFF`). +/// +/// # Safety +/// +/// 1. AVX2 must be available on the current CPU. +/// 2. `width & 1 == 0`. +/// 3. `y.len() >= width`, `u_half.len() >= width / 2`, +/// `v_half.len() >= width / 2`, `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "avx2")] +pub(crate) unsafe fn yuv_420_to_rgba_row( + y: &[u8], + u_half: &[u8], + v_half: &[u8], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller-checked AVX2 availability + slice bounds — see + // [`yuv_420_to_rgb_or_rgba_row`] safety contract. 
+    unsafe {
+        yuv_420_to_rgb_or_rgba_row::<true>(y, u_half, v_half, rgba_out, width, matrix, full_range);
+    }
+}
+
+/// Shared AVX2 kernel for [`yuv_420_to_rgb_row`] (`ALPHA = false`,
+/// [`write_rgb_32`]) and [`yuv_420_to_rgba_row`] (`ALPHA = true`,
+/// [`write_rgba_32`] with constant `0xFF` alpha). Math is
+/// byte-identical to `scalar::yuv_420_to_rgb_or_rgba_row::<ALPHA>`.
+///
+/// # Safety
+///
+/// Same as [`yuv_420_to_rgb_row`] / [`yuv_420_to_rgba_row`]; the
+/// `out` slice must be `>= width * (if ALPHA { 4 } else { 3 })`
+/// bytes long.
+#[inline]
+#[target_feature(enable = "avx2")]
+pub(crate) unsafe fn yuv_420_to_rgb_or_rgba_row<const ALPHA: bool>(
+    y: &[u8],
+    u_half: &[u8],
+    v_half: &[u8],
+    out: &mut [u8],
+    width: usize,
+    matrix: ColorMatrix,
+    full_range: bool,
 ) {
     debug_assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width");
     debug_assert!(y.len() >= width);
     debug_assert!(u_half.len() >= width / 2);
     debug_assert!(v_half.len() >= width / 2);
-    debug_assert!(rgb_out.len() >= width * 3);
+    let bpp: usize = if ALPHA { 4 } else { 3 };
+    debug_assert!(out.len() >= width * bpp);
 
     let coeffs = scalar::Coefficients::for_matrix(matrix);
     let (y_off, y_scale, c_scale) = scalar::range_params(full_range);
@@ -109,6 +168,9 @@ pub(crate) unsafe fn yuv_420_to_rgb_row(
     let cgv = _mm256_set1_epi32(coeffs.g_v());
     let cbu = _mm256_set1_epi32(coeffs.b_u());
     let cbv = _mm256_set1_epi32(coeffs.b_v());
+    // Constant opaque-alpha vector for the RGBA path; DCE'd when
+    // ALPHA = false.
+    let alpha_u8 = _mm256_set1_epi8(-1); // 0xFF as i8
 
     let mut x = 0usize;
     while x + 32 <= width {
@@ -178,8 +240,13 @@ pub(crate) unsafe fn yuv_420_to_rgb_row(
         let g_u8 = narrow_u8x32(g_lo, g_hi);
         let r_u8 = narrow_u8x32(r_lo, r_hi);
 
-        // 3‑way interleave → packed RGB (96 bytes = 3 × 32).
-        write_rgb_32(r_u8, g_u8, b_u8, rgb_out.as_mut_ptr().add(x * 3));
+        if ALPHA {
+            // 4‑way interleave → packed RGBA (128 bytes = 4 × 32).
+            write_rgba_32(r_u8, g_u8, b_u8, alpha_u8, out.as_mut_ptr().add(x * 4));
+        } else {
+            // 3‑way interleave → packed RGB (96 bytes = 3 × 32).
+            write_rgb_32(r_u8, g_u8, b_u8, out.as_mut_ptr().add(x * 3));
+        }
 
         x += 32;
     }
@@ -187,11 +254,11 @@ pub(crate) unsafe fn yuv_420_to_rgb_row(
     // Scalar tail for the 0..30 leftover pixels (always even; 4:2:0
     // requires even width so x/2 and width/2 are well‑defined).
     if x < width {
-        scalar::yuv_420_to_rgb_row(
+        scalar::yuv_420_to_rgb_or_rgba_row::<ALPHA>(
             &y[x..width],
             &u_half[x / 2..width / 2],
             &v_half[x / 2..width / 2],
-            &mut rgb_out[x * 3..width * 3],
+            &mut out[x * bpp..width * bpp],
             width - x,
             matrix,
             full_range,
@@ -2162,6 +2229,31 @@ unsafe fn write_rgb_32(r: __m256i, g: __m256i, b: __m256i, ptr: *mut u8) {
     }
 }
 
+/// Writes 32 pixels of packed RGBA (128 bytes) by interleaving four
+/// u8x32 R/G/B/A channel vectors. Processed as two 16‑pixel halves
+/// via the shared
+/// [`write_rgba_16`](super::x86_common::write_rgba_16) helper.
+///
+/// # Safety
+///
+/// `ptr` must point to at least 128 writable bytes.
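Before the body, a portable model of the zeroing-shuffle-and-OR composition this helper inherits from `write_rgba_16`; a sketch for illustration only, where `swizzle` stands in for `_mm_shuffle_epi8` / `u8x16_swizzle` and only block 0 is checked:

```rust
/// Stand-in for `_mm_shuffle_epi8`: a negative index (top bit set)
/// zeroes the output lane instead of gathering a source byte.
fn swizzle(src: &[u8; 16], idx: &[i8; 16]) -> [u8; 16] {
    let mut out = [0u8; 16];
    for (o, &i) in out.iter_mut().zip(idx) {
        if i >= 0 {
            *o = src[i as usize];
        }
    }
    out
}

fn main() {
    let r: [u8; 16] = core::array::from_fn(|i| i as u8); // R0..R15
    let g: [u8; 16] = core::array::from_fn(|i| 0x40 + i as u8);
    let b: [u8; 16] = core::array::from_fn(|i| 0x80 + i as u8);
    let a = [0xFFu8; 16];
    // Block-0 masks, same values as the write_rgba_16 tables.
    let r0: [i8; 16] = [0, -1, -1, -1, 1, -1, -1, -1, 2, -1, -1, -1, 3, -1, -1, -1];
    let g0: [i8; 16] = [-1, 0, -1, -1, -1, 1, -1, -1, -1, 2, -1, -1, -1, 3, -1, -1];
    let b0: [i8; 16] = [-1, -1, 0, -1, -1, -1, 1, -1, -1, -1, 2, -1, -1, -1, 3, -1];
    let a0: [i8; 16] = [-1, -1, -1, 0, -1, -1, -1, 1, -1, -1, -1, 2, -1, -1, -1, 3];
    // Each output lane is claimed by exactly one mask, so the three
    // ORs compose the four gathers losslessly.
    let mut block0 = [0u8; 16];
    for i in 0..16 {
        block0[i] = swizzle(&r, &r0)[i]
            | swizzle(&g, &g0)[i]
            | swizzle(&b, &b0)[i]
            | swizzle(&a, &a0)[i];
    }
    assert_eq!(&block0[..8], &[0x00, 0x40, 0x80, 0xFF, 0x01, 0x41, 0x81, 0xFF]);
}
```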
+#[inline(always)] +unsafe fn write_rgba_32(r: __m256i, g: __m256i, b: __m256i, a: __m256i, ptr: *mut u8) { + unsafe { + let r_lo = _mm256_castsi256_si128(r); + let r_hi = _mm256_extracti128_si256::<1>(r); + let g_lo = _mm256_castsi256_si128(g); + let g_hi = _mm256_extracti128_si256::<1>(g); + let b_lo = _mm256_castsi256_si128(b); + let b_hi = _mm256_extracti128_si256::<1>(b); + let a_lo = _mm256_castsi256_si128(a); + let a_hi = _mm256_extracti128_si256::<1>(a); + + write_rgba_16(r_lo, g_lo, b_lo, a_lo, ptr); + write_rgba_16(r_hi, g_hi, b_hi, a_hi, ptr.add(64)); + } +} + // ===== 16-bit YUV → RGB ================================================== /// `(Y_u16x16 - y_off) * y_scale + RND >> 15` for full u16 Y samples. diff --git a/src/row/arch/x86_avx512.rs b/src/row/arch/x86_avx512.rs index 1b8fae1..3e01120 100644 --- a/src/row/arch/x86_avx512.rs +++ b/src/row/arch/x86_avx512.rs @@ -57,7 +57,9 @@ use core::arch::x86_64::*; use crate::{ ColorMatrix, row::{ - arch::x86_common::{rgb_to_hsv_16_pixels, swap_rb_16_pixels, write_rgb_16, write_rgb_u16_8}, + arch::x86_common::{ + rgb_to_hsv_16_pixels, swap_rb_16_pixels, write_rgb_16, write_rgb_u16_8, write_rgba_16, + }, scalar, }, }; @@ -96,12 +98,69 @@ pub(crate) unsafe fn yuv_420_to_rgb_row( width: usize, matrix: ColorMatrix, full_range: bool, +) { + // SAFETY: caller-checked AVX-512BW availability + slice bounds — + // see [`yuv_420_to_rgb_or_rgba_row`] safety contract. + unsafe { + yuv_420_to_rgb_or_rgba_row::(y, u_half, v_half, rgb_out, width, matrix, full_range); + } +} + +/// AVX‑512 YUV 4:2:0 → packed **RGBA** (8-bit). Same contract as +/// [`yuv_420_to_rgb_row`] but writes 4 bytes per pixel (R, G, B, +/// `0xFF`). +/// +/// # Safety +/// +/// 1. AVX‑512F + AVX‑512BW must be available on the current CPU. +/// 2. `width & 1 == 0`. +/// 3. `y.len() >= width`, `u_half.len() >= width / 2`, +/// `v_half.len() >= width / 2`, `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "avx512f,avx512bw")] +pub(crate) unsafe fn yuv_420_to_rgba_row( + y: &[u8], + u_half: &[u8], + v_half: &[u8], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller-checked AVX-512BW availability + slice bounds — + // see [`yuv_420_to_rgb_or_rgba_row`] safety contract. + unsafe { + yuv_420_to_rgb_or_rgba_row::(y, u_half, v_half, rgba_out, width, matrix, full_range); + } +} + +/// Shared AVX‑512 kernel for [`yuv_420_to_rgb_row`] (`ALPHA = false`, +/// [`write_rgb_64`]) and [`yuv_420_to_rgba_row`] (`ALPHA = true`, +/// [`write_rgba_64`] with constant `0xFF` alpha). Math is +/// byte-identical to `scalar::yuv_420_to_rgb_or_rgba_row::`. +/// +/// # Safety +/// +/// Same as [`yuv_420_to_rgb_row`] / [`yuv_420_to_rgba_row`]; the +/// `out` slice must be `>= width * (if ALPHA { 4 } else { 3 })` +/// bytes long. 
+#[inline] +#[target_feature(enable = "avx512f,avx512bw")] +pub(crate) unsafe fn yuv_420_to_rgb_or_rgba_row( + y: &[u8], + u_half: &[u8], + v_half: &[u8], + out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, ) { debug_assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); debug_assert!(y.len() >= width); debug_assert!(u_half.len() >= width / 2); debug_assert!(v_half.len() >= width / 2); - debug_assert!(rgb_out.len() >= width * 3); + let bpp: usize = if ALPHA { 4 } else { 3 }; + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params(full_range); @@ -123,6 +182,9 @@ pub(crate) unsafe fn yuv_420_to_rgb_row( let cgv = _mm512_set1_epi32(coeffs.g_v()); let cbu = _mm512_set1_epi32(coeffs.b_u()); let cbv = _mm512_set1_epi32(coeffs.b_v()); + // Constant opaque-alpha vector for the RGBA path; DCE'd when + // ALPHA = false. + let alpha_u8 = _mm512_set1_epi8(-1); // 0xFF as i8 // Lane‑fixup permute indices, computed once per call. let pack_fixup = _mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7); @@ -193,8 +255,13 @@ pub(crate) unsafe fn yuv_420_to_rgb_row( let g_u8 = narrow_u8x64(g_lo, g_hi, pack_fixup); let r_u8 = narrow_u8x64(r_lo, r_hi, pack_fixup); - // 3‑way interleave → packed RGB (192 bytes = 4 × 48). - write_rgb_64(r_u8, g_u8, b_u8, rgb_out.as_mut_ptr().add(x * 3)); + if ALPHA { + // 4‑way interleave → packed RGBA (256 bytes = 4 × 64). + write_rgba_64(r_u8, g_u8, b_u8, alpha_u8, out.as_mut_ptr().add(x * 4)); + } else { + // 3‑way interleave → packed RGB (192 bytes = 4 × 48). + write_rgb_64(r_u8, g_u8, b_u8, out.as_mut_ptr().add(x * 3)); + } x += 64; } @@ -202,11 +269,11 @@ pub(crate) unsafe fn yuv_420_to_rgb_row( // Scalar tail for the 0..62 leftover pixels (always even; 4:2:0 // requires even width so x/2 and width/2 are well‑defined). if x < width { - scalar::yuv_420_to_rgb_row( + scalar::yuv_420_to_rgb_or_rgba_row::( &y[x..width], &u_half[x / 2..width / 2], &v_half[x / 2..width / 2], - &mut rgb_out[x * 3..width * 3], + &mut out[x * bpp..width * bpp], width - x, matrix, full_range, @@ -2230,6 +2297,40 @@ unsafe fn write_rgb_64(r: __m512i, g: __m512i, b: __m512i, ptr: *mut u8) { } } +/// Writes 64 pixels of packed RGBA (256 bytes) by splitting the +/// u8x64 channel vectors into four 128‑bit quarters and calling the +/// shared [`write_rgba_16`] helper four times. +/// +/// # Safety +/// +/// `ptr` must point to at least 256 writable bytes. 
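The quarter-split addressing is easy to cross-check in scalar Rust; a reference model (not the crate's code):

```rust
/// Reference model of the store pattern documented above: quarter `q`
/// of each 64-lane channel vector feeds the 64-byte output block at
/// offset 64 * q, with 16 pixels interleaved 4 bytes apart inside it.
fn write_rgba_64_model(r: &[u8; 64], g: &[u8; 64], b: &[u8; 64], a: &[u8; 64], out: &mut [u8; 256]) {
    for q in 0..4 {
        for p in 0..16 {
            let src = q * 16 + p; // lane in the 512-bit register
            let dst = q * 64 + p * 4; // byte offset in the output
            out[dst] = r[src];
            out[dst + 1] = g[src];
            out[dst + 2] = b[src];
            out[dst + 3] = a[src];
        }
    }
}

fn main() {
    let r: [u8; 64] = core::array::from_fn(|i| i as u8);
    let (g, b, a) = ([0u8; 64], [0u8; 64], [0xFFu8; 64]);
    let mut out = [0u8; 256];
    write_rgba_64_model(&r, &g, &b, &a, &mut out);
    // Pixel 17 lives in quarter 1, lane 1 → output bytes 68..72.
    assert_eq!(out[68..72], [17, 0, 0, 0xFF]);
}
```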
+#[inline(always)] +unsafe fn write_rgba_64(r: __m512i, g: __m512i, b: __m512i, a: __m512i, ptr: *mut u8) { + unsafe { + let r0: __m128i = _mm512_castsi512_si128(r); + let r1: __m128i = _mm512_extracti32x4_epi32::<1>(r); + let r2: __m128i = _mm512_extracti32x4_epi32::<2>(r); + let r3: __m128i = _mm512_extracti32x4_epi32::<3>(r); + let g0: __m128i = _mm512_castsi512_si128(g); + let g1: __m128i = _mm512_extracti32x4_epi32::<1>(g); + let g2: __m128i = _mm512_extracti32x4_epi32::<2>(g); + let g3: __m128i = _mm512_extracti32x4_epi32::<3>(g); + let b0: __m128i = _mm512_castsi512_si128(b); + let b1: __m128i = _mm512_extracti32x4_epi32::<1>(b); + let b2: __m128i = _mm512_extracti32x4_epi32::<2>(b); + let b3: __m128i = _mm512_extracti32x4_epi32::<3>(b); + let a0: __m128i = _mm512_castsi512_si128(a); + let a1: __m128i = _mm512_extracti32x4_epi32::<1>(a); + let a2: __m128i = _mm512_extracti32x4_epi32::<2>(a); + let a3: __m128i = _mm512_extracti32x4_epi32::<3>(a); + + write_rgba_16(r0, g0, b0, a0, ptr); + write_rgba_16(r1, g1, b1, a1, ptr.add(64)); + write_rgba_16(r2, g2, b2, a2, ptr.add(128)); + write_rgba_16(r3, g3, b3, a3, ptr.add(192)); + } +} + // ===== 16-bit u16-output helpers ======================================== /// `(c_u * u_d + c_v * v_d + RND) >> 15` in i64 via two diff --git a/src/row/arch/x86_common.rs b/src/row/arch/x86_common.rs index 2f8807b..7cf7aca 100644 --- a/src/row/arch/x86_common.rs +++ b/src/row/arch/x86_common.rs @@ -79,6 +79,92 @@ pub(super) unsafe fn write_rgb_16(r: __m128i, g: __m128i, b: __m128i, ptr: *mut } } +/// Writes 16 pixels of packed RGBA (64 bytes) from four u8x16 channel +/// vectors. Mirrors [`write_rgb_16`] for the 4-channel output path. +/// +/// The 4-byte stride aligns cleanly with the 16-byte register width: +/// each output block holds exactly 4 RGBA quads (16 bytes), with R, +/// G, B, A interleaved at positions `(0, 1, 2, 3)`, `(4, 5, 6, 7)`, +/// etc. The shuffle masks are simpler than the 3-channel pattern +/// because a single source byte goes to a single output byte (no +/// channel "split across blocks" boundary). +/// +/// Conceptually: +/// - Block 0 (bytes 0..16): R0,G0,B0,A0, R1,G1,B1,A1, R2,G2,B2,A2, +/// R3,G3,B3,A3 +/// - Block 1 (bytes 16..32): pixels 4..7 +/// - Block 2 (bytes 32..48): pixels 8..11 +/// - Block 3 (bytes 48..64): pixels 12..15 +/// +/// Each block is the OR of four `_mm_shuffle_epi8` gathers — one per +/// channel — with `0x80` (`-1` as i8) zeroing lanes that another +/// channel's shuffle will fill. +/// +/// # Safety +/// +/// - `ptr` must point to at least 64 writable bytes. +/// - The calling function must have SSSE3 available (via +/// `#[target_feature(enable = "ssse3")]` or a superset). +#[inline(always)] +pub(super) unsafe fn write_rgba_16(r: __m128i, g: __m128i, b: __m128i, a: __m128i, ptr: *mut u8) { + unsafe { + // Block 0 (bytes 0..16): pixels 0..3, source bytes 0..3 from + // each channel placed at output positions + // (0, 1, 2, 3) for pixel 0, (4, 5, 6, 7) for pixel 1, etc. 
+ let r0 = _mm_setr_epi8(0, -1, -1, -1, 1, -1, -1, -1, 2, -1, -1, -1, 3, -1, -1, -1); + let g0 = _mm_setr_epi8(-1, 0, -1, -1, -1, 1, -1, -1, -1, 2, -1, -1, -1, 3, -1, -1); + let b0 = _mm_setr_epi8(-1, -1, 0, -1, -1, -1, 1, -1, -1, -1, 2, -1, -1, -1, 3, -1); + let a0 = _mm_setr_epi8(-1, -1, -1, 0, -1, -1, -1, 1, -1, -1, -1, 2, -1, -1, -1, 3); + let out0 = _mm_or_si128( + _mm_or_si128(_mm_shuffle_epi8(r, r0), _mm_shuffle_epi8(g, g0)), + _mm_or_si128(_mm_shuffle_epi8(b, b0), _mm_shuffle_epi8(a, a0)), + ); + + // Block 1 (bytes 16..32): pixels 4..7, source bytes 4..7. + let r1 = _mm_setr_epi8(4, -1, -1, -1, 5, -1, -1, -1, 6, -1, -1, -1, 7, -1, -1, -1); + let g1 = _mm_setr_epi8(-1, 4, -1, -1, -1, 5, -1, -1, -1, 6, -1, -1, -1, 7, -1, -1); + let b1 = _mm_setr_epi8(-1, -1, 4, -1, -1, -1, 5, -1, -1, -1, 6, -1, -1, -1, 7, -1); + let a1 = _mm_setr_epi8(-1, -1, -1, 4, -1, -1, -1, 5, -1, -1, -1, 6, -1, -1, -1, 7); + let out1 = _mm_or_si128( + _mm_or_si128(_mm_shuffle_epi8(r, r1), _mm_shuffle_epi8(g, g1)), + _mm_or_si128(_mm_shuffle_epi8(b, b1), _mm_shuffle_epi8(a, a1)), + ); + + // Block 2 (bytes 32..48): pixels 8..11, source bytes 8..11. + let r2 = _mm_setr_epi8(8, -1, -1, -1, 9, -1, -1, -1, 10, -1, -1, -1, 11, -1, -1, -1); + let g2 = _mm_setr_epi8(-1, 8, -1, -1, -1, 9, -1, -1, -1, 10, -1, -1, -1, 11, -1, -1); + let b2 = _mm_setr_epi8(-1, -1, 8, -1, -1, -1, 9, -1, -1, -1, 10, -1, -1, -1, 11, -1); + let a2 = _mm_setr_epi8(-1, -1, -1, 8, -1, -1, -1, 9, -1, -1, -1, 10, -1, -1, -1, 11); + let out2 = _mm_or_si128( + _mm_or_si128(_mm_shuffle_epi8(r, r2), _mm_shuffle_epi8(g, g2)), + _mm_or_si128(_mm_shuffle_epi8(b, b2), _mm_shuffle_epi8(a, a2)), + ); + + // Block 3 (bytes 48..64): pixels 12..15, source bytes 12..15. + let r3 = _mm_setr_epi8( + 12, -1, -1, -1, 13, -1, -1, -1, 14, -1, -1, -1, 15, -1, -1, -1, + ); + let g3 = _mm_setr_epi8( + -1, 12, -1, -1, -1, 13, -1, -1, -1, 14, -1, -1, -1, 15, -1, -1, + ); + let b3 = _mm_setr_epi8( + -1, -1, 12, -1, -1, -1, 13, -1, -1, -1, 14, -1, -1, -1, 15, -1, + ); + let a3 = _mm_setr_epi8( + -1, -1, -1, 12, -1, -1, -1, 13, -1, -1, -1, 14, -1, -1, -1, 15, + ); + let out3 = _mm_or_si128( + _mm_or_si128(_mm_shuffle_epi8(r, r3), _mm_shuffle_epi8(g, g3)), + _mm_or_si128(_mm_shuffle_epi8(b, b3), _mm_shuffle_epi8(a, a3)), + ); + + _mm_storeu_si128(ptr.cast(), out0); + _mm_storeu_si128(ptr.add(16).cast(), out1); + _mm_storeu_si128(ptr.add(32).cast(), out2); + _mm_storeu_si128(ptr.add(48).cast(), out3); + } +} + /// Writes 8 pixels of packed **`u16`** RGB (48 bytes = 24 `u16`) /// from three `u16x8` channel vectors. Drives the SSE4.1 / AVX2 / /// AVX‑512 high‑bit‑depth kernels' u16 output path. diff --git a/src/row/arch/x86_sse41.rs b/src/row/arch/x86_sse41.rs index 73efdb5..6d79ad8 100644 --- a/src/row/arch/x86_sse41.rs +++ b/src/row/arch/x86_sse41.rs @@ -40,7 +40,9 @@ use core::arch::x86_64::*; use crate::{ ColorMatrix, row::{ - arch::x86_common::{rgb_to_hsv_16_pixels, swap_rb_16_pixels, write_rgb_16, write_rgb_u16_8}, + arch::x86_common::{ + rgb_to_hsv_16_pixels, swap_rb_16_pixels, write_rgb_16, write_rgb_u16_8, write_rgba_16, + }, scalar, }, }; @@ -79,12 +81,72 @@ pub(crate) unsafe fn yuv_420_to_rgb_row( width: usize, matrix: ColorMatrix, full_range: bool, +) { + // SAFETY: caller-checked SSE4.1 availability + slice bounds — see + // [`yuv_420_to_rgb_or_rgba_row`] safety contract. + unsafe { + yuv_420_to_rgb_or_rgba_row::(y, u_half, v_half, rgb_out, width, matrix, full_range); + } +} + +/// SSE4.1 YUV 4:2:0 → packed **RGBA** (8-bit). 
Same contract as +/// [`yuv_420_to_rgb_row`] but writes 4 bytes per pixel (R, G, B, +/// `0xFF`). +/// +/// # Safety +/// +/// 1. SSE4.1 must be available on the current CPU. +/// 2. `width & 1 == 0`. +/// 3. `y.len() >= width`, `u_half.len() >= width / 2`, +/// `v_half.len() >= width / 2`, `rgba_out.len() >= 4 * width`. +#[inline] +#[target_feature(enable = "sse4.1")] +pub(crate) unsafe fn yuv_420_to_rgba_row( + y: &[u8], + u_half: &[u8], + v_half: &[u8], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller-checked SSE4.1 availability + slice bounds — see + // [`yuv_420_to_rgb_or_rgba_row`] safety contract. + unsafe { + yuv_420_to_rgb_or_rgba_row::(y, u_half, v_half, rgba_out, width, matrix, full_range); + } +} + +/// Shared SSE4.1 kernel for [`yuv_420_to_rgb_row`] (`ALPHA = false`, +/// [`write_rgb_16`]) and [`yuv_420_to_rgba_row`] (`ALPHA = true`, +/// [`write_rgba_16`] with constant `0xFF` alpha). Math is +/// byte-identical to `scalar::yuv_420_to_rgb_or_rgba_row::`; +/// only the per-block store helper differs. `const` generic +/// monomorphizes per call site, so the `if ALPHA` branches are +/// eliminated. +/// +/// # Safety +/// +/// Same as [`yuv_420_to_rgb_row`] / [`yuv_420_to_rgba_row`]; the +/// `out` slice must be `>= width * (if ALPHA { 4 } else { 3 })` +/// bytes long. +#[inline] +#[target_feature(enable = "sse4.1")] +pub(crate) unsafe fn yuv_420_to_rgb_or_rgba_row( + y: &[u8], + u_half: &[u8], + v_half: &[u8], + out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, ) { debug_assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); debug_assert!(y.len() >= width); debug_assert!(u_half.len() >= width / 2); debug_assert!(v_half.len() >= width / 2); - debug_assert!(rgb_out.len() >= width * 3); + let bpp: usize = if ALPHA { 4 } else { 3 }; + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params(full_range); @@ -106,6 +168,9 @@ pub(crate) unsafe fn yuv_420_to_rgb_row( let cgv = _mm_set1_epi32(coeffs.g_v()); let cbu = _mm_set1_epi32(coeffs.b_u()); let cbv = _mm_set1_epi32(coeffs.b_v()); + // Constant opaque-alpha vector for the RGBA path; DCE'd when + // ALPHA = false. + let alpha_u8 = _mm_set1_epi8(-1); // 0xFF as i8 let mut x = 0usize; while x + 16 <= width { @@ -166,19 +231,24 @@ pub(crate) unsafe fn yuv_420_to_rgb_row( let g_u8 = _mm_packus_epi16(g_lo, g_hi); let r_u8 = _mm_packus_epi16(r_lo, r_hi); - // 3‑way interleave → packed RGB (48 bytes). - write_rgb_16(r_u8, g_u8, b_u8, rgb_out.as_mut_ptr().add(x * 3)); + if ALPHA { + // 4‑way interleave → packed RGBA (64 bytes). + write_rgba_16(r_u8, g_u8, b_u8, alpha_u8, out.as_mut_ptr().add(x * 4)); + } else { + // 3‑way interleave → packed RGB (48 bytes). + write_rgb_16(r_u8, g_u8, b_u8, out.as_mut_ptr().add(x * 3)); + } x += 16; } // Scalar tail for the 0..14 leftover pixels. if x < width { - scalar::yuv_420_to_rgb_row( + scalar::yuv_420_to_rgb_or_rgba_row::( &y[x..width], &u_half[x / 2..width / 2], &v_half[x / 2..width / 2], - &mut rgb_out[x * 3..width * 3], + &mut out[x * bpp..width * bpp], width - x, matrix, full_range, diff --git a/src/row/mod.rs b/src/row/mod.rs index 35b15d5..2523eda 100644 --- a/src/row/mod.rs +++ b/src/row/mod.rs @@ -152,6 +152,98 @@ pub fn yuv_420_to_rgb_row( scalar::yuv_420_to_rgb_row(y, u_half, v_half, rgb_out, width, matrix, full_range); } +/// Converts one row of 4:2:0 YUV to packed **RGBA** (8-bit). 
+/// +/// Same numerical contract as [`yuv_420_to_rgb_row`]; the only +/// differences are the per-pixel stride (4 vs 3) and the alpha byte +/// (`0xFF`, opaque, for every pixel — sources without an alpha plane +/// produce opaque output). The first three bytes per pixel are +/// byte-identical to what [`yuv_420_to_rgb_row`] would write. +/// +/// `rgba_out.len() >= 4 * width`. `use_simd = false` forces the +/// scalar reference path. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv_420_to_rgba_row( + y: &[u8], + u_half: &[u8], + v_half: &[u8], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + // Runtime asserts at the dispatcher boundary — see + // [`yuv_420_to_rgb_row`] for rationale, including the checked + // `width × 4` multiplication via [`rgba_row_bytes`]. + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgba_min = rgba_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: `neon_available()` verified NEON is present. + unsafe { + arch::neon::yuv_420_to_rgba_row(y, u_half, v_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: `avx512_available()` verified AVX‑512BW is present. + unsafe { + arch::x86_avx512::yuv_420_to_rgba_row( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + // SAFETY: `avx2_available()` verified AVX2 is present. + unsafe { + arch::x86_avx2::yuv_420_to_rgba_row( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + // SAFETY: `sse41_available()` verified SSE4.1 is present. + unsafe { + arch::x86_sse41::yuv_420_to_rgba_row( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time availability verified. + unsafe { + arch::wasm_simd128::yuv_420_to_rgba_row( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => { + // Targets without a SIMD backend fall through to scalar. + } + } + } + + scalar::yuv_420_to_rgba_row(y, u_half, v_half, rgba_out, width, matrix, full_range); +} + /// Converts one row of NV12 (semi‑planar 4:2:0) to packed RGB. /// /// Same numerical contract as [`yuv_420_to_rgb_row`]; the only @@ -2719,6 +2811,17 @@ fn rgb_row_bytes(width: usize) -> usize { } } +/// Byte length of one packed‑RGBA row (`width × 4`) with overflow +/// checking. Same purpose as [`rgb_row_bytes`] for the 4-channel +/// path used by the RGBA dispatchers. +#[cfg_attr(not(tarpaulin), inline(always))] +fn rgba_row_bytes(width: usize) -> usize { + match width.checked_mul(4) { + Some(n) => n, + None => panic!("width ({width}) × 4 overflows usize"), + } +} + /// Element count of one packed `u16`‑RGB row (`width × 3`). Identical /// math to [`rgb_row_bytes`] — the returned value is in `u16` /// elements, not bytes. 
Callers use it to size `&mut [u16]` buffers
diff --git a/src/row/scalar.rs b/src/row/scalar.rs
index 82e9582..c41a2c8 100644
--- a/src/row/scalar.rs
+++ b/src/row/scalar.rs
@@ -35,12 +35,57 @@ pub(crate) fn yuv_420_to_rgb_row(
     width: usize,
     matrix: ColorMatrix,
     full_range: bool,
+) {
+    yuv_420_to_rgb_or_rgba_row::<false>(y, u_half, v_half, rgb_out, width, matrix, full_range);
+}
+
+/// Same as [`yuv_420_to_rgb_row`] but writes packed `R, G, B, A`
+/// quadruplets, with `A = 0xFF` (opaque) for every pixel. The first
+/// three bytes per pixel are byte-identical to what
+/// [`yuv_420_to_rgb_row`] would write — only the per-pixel stride
+/// (4 vs 3) and the alpha byte differ. `rgba_out.len() >= 4 * width`.
+#[cfg_attr(not(tarpaulin), inline(always))]
+pub(crate) fn yuv_420_to_rgba_row(
+    y: &[u8],
+    u_half: &[u8],
+    v_half: &[u8],
+    rgba_out: &mut [u8],
+    width: usize,
+    matrix: ColorMatrix,
+    full_range: bool,
+) {
+    yuv_420_to_rgb_or_rgba_row::<true>(y, u_half, v_half, rgba_out, width, matrix, full_range);
+}
+
+/// Shared scalar kernel for [`yuv_420_to_rgb_row`] (`ALPHA = false`,
+/// 3 bytes / pixel) and [`yuv_420_to_rgba_row`] (`ALPHA = true`,
+/// 4 bytes / pixel — 4th is opaque `0xFF`). The math is identical;
+/// only the per-pixel store differs. `const` generic drives
+/// compile-time monomorphization — each public wrapper is inlined
+/// with the branch eliminated.
+///
+/// # Panics (debug builds)
+///
+/// - `width` must be even.
+/// - `y.len() >= width`, `u_half.len() >= width / 2`,
+///   `v_half.len() >= width / 2`,
+///   `out.len() >= width * (if ALPHA { 4 } else { 3 })`.
+#[cfg_attr(not(tarpaulin), inline(always))]
+pub(crate) fn yuv_420_to_rgb_or_rgba_row<const ALPHA: bool>(
+    y: &[u8],
+    u_half: &[u8],
+    v_half: &[u8],
+    out: &mut [u8],
+    width: usize,
+    matrix: ColorMatrix,
+    full_range: bool,
 ) {
     debug_assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width");
     debug_assert!(y.len() >= width, "y row too short");
     debug_assert!(u_half.len() >= width / 2, "u_half row too short");
     debug_assert!(v_half.len() >= width / 2, "v_half row too short");
-    debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
+    let bpp: usize = if ALPHA { 4 } else { 3 };
+    debug_assert!(out.len() >= width * bpp, "out row too short for {bpp}bpp");
 
     let coeffs = Coefficients::for_matrix(matrix);
     let (y_off, y_scale, c_scale) = range_params(full_range);
@@ -67,15 +112,27 @@ pub(crate) fn yuv_420_to_rgb_row(
         // Pixel x.
         let y0 = ((y[x] as i32 - y_off) * y_scale + RND) >> 15;
-        rgb_out[x * 3] = clamp_u8(y0 + r_chroma);
-        rgb_out[x * 3 + 1] = clamp_u8(y0 + g_chroma);
-        rgb_out[x * 3 + 2] = clamp_u8(y0 + b_chroma);
+        let r0 = clamp_u8(y0 + r_chroma);
+        let g0 = clamp_u8(y0 + g_chroma);
+        let b0 = clamp_u8(y0 + b_chroma);
+        out[x * bpp] = r0;
+        out[x * bpp + 1] = g0;
+        out[x * bpp + 2] = b0;
+        if ALPHA {
+            out[x * bpp + 3] = 0xFF;
+        }
 
         // Pixel x+1 shares chroma.
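+        // (Worked Q15 example with illustrative limited-range constants:
+        // y_off = 16, y_scale = 38154 ≈ 1.164 * 2^15, RND = 1 << 14.
+        // Then Y = 235 gives (219 * 38154 + 16384) >> 15 = 255 and
+        // Y = 16 gives 0, so the studio range lands exactly on 0..=255.)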
let y1 = ((y[x + 1] as i32 - y_off) * y_scale + RND) >> 15; - rgb_out[(x + 1) * 3] = clamp_u8(y1 + r_chroma); - rgb_out[(x + 1) * 3 + 1] = clamp_u8(y1 + g_chroma); - rgb_out[(x + 1) * 3 + 2] = clamp_u8(y1 + b_chroma); + let r1 = clamp_u8(y1 + r_chroma); + let g1 = clamp_u8(y1 + g_chroma); + let b1 = clamp_u8(y1 + b_chroma); + out[(x + 1) * bpp] = r1; + out[(x + 1) * bpp + 1] = g1; + out[(x + 1) * bpp + 2] = b1; + if ALPHA { + out[(x + 1) * bpp + 3] = 0xFF; + } x += 2; } diff --git a/src/sinker/mixed.rs b/src/sinker/mixed.rs index d37146d..e2907a2 100644 --- a/src/sinker/mixed.rs +++ b/src/sinker/mixed.rs @@ -66,12 +66,13 @@ use crate::{ nv24_to_rgb_row, nv42_to_rgb_row, p010_to_rgb_row, p010_to_rgb_u16_row, p012_to_rgb_row, p012_to_rgb_u16_row, p016_to_rgb_row, p016_to_rgb_u16_row, p410_to_rgb_row, p410_to_rgb_u16_row, p412_to_rgb_row, p412_to_rgb_u16_row, p416_to_rgb_row, - p416_to_rgb_u16_row, rgb_to_hsv_row, yuv_420_to_rgb_row, yuv_444_to_rgb_row, - yuv420p9_to_rgb_row, yuv420p9_to_rgb_u16_row, yuv420p10_to_rgb_row, yuv420p10_to_rgb_u16_row, - yuv420p12_to_rgb_row, yuv420p12_to_rgb_u16_row, yuv420p14_to_rgb_row, yuv420p14_to_rgb_u16_row, - yuv420p16_to_rgb_row, yuv420p16_to_rgb_u16_row, yuv444p9_to_rgb_row, yuv444p9_to_rgb_u16_row, - yuv444p10_to_rgb_row, yuv444p10_to_rgb_u16_row, yuv444p12_to_rgb_row, yuv444p12_to_rgb_u16_row, - yuv444p14_to_rgb_row, yuv444p14_to_rgb_u16_row, yuv444p16_to_rgb_row, yuv444p16_to_rgb_u16_row, + p416_to_rgb_u16_row, rgb_to_hsv_row, yuv_420_to_rgb_row, yuv_420_to_rgba_row, + yuv_444_to_rgb_row, yuv420p9_to_rgb_row, yuv420p9_to_rgb_u16_row, yuv420p10_to_rgb_row, + yuv420p10_to_rgb_u16_row, yuv420p12_to_rgb_row, yuv420p12_to_rgb_u16_row, yuv420p14_to_rgb_row, + yuv420p14_to_rgb_u16_row, yuv420p16_to_rgb_row, yuv420p16_to_rgb_u16_row, yuv444p9_to_rgb_row, + yuv444p9_to_rgb_u16_row, yuv444p10_to_rgb_row, yuv444p10_to_rgb_u16_row, yuv444p12_to_rgb_row, + yuv444p12_to_rgb_u16_row, yuv444p14_to_rgb_row, yuv444p14_to_rgb_u16_row, yuv444p16_to_rgb_row, + yuv444p16_to_rgb_u16_row, }, yuv::{ Nv12, Nv12Row, Nv12Sink, Nv16, Nv16Row, Nv16Sink, Nv21, Nv21Row, Nv21Sink, Nv24, Nv24Row, @@ -139,6 +140,32 @@ pub enum MixedSinkerError { actual: usize, }, + /// RGBA buffer attached via [`MixedSinker::with_rgba`] / + /// [`MixedSinker::set_rgba`] is shorter than `width × height × 4`. + /// The fourth byte per pixel is alpha — opaque (`0xFF`) by default + /// when the source has no alpha plane. + #[error("MixedSinker rgba buffer too short: expected >= {expected} bytes, got {actual}")] + RgbaBufferTooShort { + /// Minimum bytes required (`width × height × 4`). + expected: usize, + /// Bytes supplied. + actual: usize, + }, + + /// `u16` RGBA buffer attached via [`MixedSinker::with_rgba_u16`] / + /// [`MixedSinker::set_rgba_u16`] is shorter than `width × height × 4` + /// `u16` elements. Only high‑bit‑depth source impls write into this + /// buffer; the fourth `u16` per pixel is alpha — opaque + /// (`(1 << BITS) - 1`) by default when the source has no alpha + /// plane. + #[error("MixedSinker rgba_u16 buffer too short: expected >= {expected} elements, got {actual}")] + RgbaU16BufferTooShort { + /// Minimum `u16` elements required (`width × height × 4`). + expected: usize, + /// `u16` elements supplied. + actual: usize, + }, + /// Luma buffer is shorter than `width × height`. 
#[error("MixedSinker luma buffer too short: expected >= {expected} bytes, got {actual}")] LumaBufferTooShort { @@ -473,6 +500,8 @@ pub enum RowSlice { pub struct MixedSinker<'a, F: SourceFormat> { rgb: Option<&'a mut [u8]>, rgb_u16: Option<&'a mut [u16]>, + rgba: Option<&'a mut [u8]>, + rgba_u16: Option<&'a mut [u16]>, luma: Option<&'a mut [u8]>, hsv: Option>, width: usize, @@ -818,6 +847,8 @@ impl MixedSinker<'_, F> { Self { rgb: None, rgb_u16: None, + rgba: None, + rgba_u16: None, luma: None, hsv: None, width, @@ -850,6 +881,24 @@ impl MixedSinker<'_, F> { self.rgb_u16.is_some() } + /// Returns `true` iff the sinker will write 8‑bit RGBA. The + /// fourth byte per pixel is alpha — opaque (`0xFF`) by default + /// when the source has no alpha plane. + #[cfg_attr(not(tarpaulin), inline(always))] + pub const fn produces_rgba(&self) -> bool { + self.rgba.is_some() + } + + /// Returns `true` iff the sinker will write `u16` RGBA at the + /// source's native bit depth. The fourth `u16` per pixel is alpha + /// — opaque (`(1 << BITS) - 1`) by default when the source has no + /// alpha plane. Only high‑bit‑depth source impls honor this + /// buffer. + #[cfg_attr(not(tarpaulin), inline(always))] + pub const fn produces_rgba_u16(&self) -> bool { + self.rgba_u16.is_some() + } + /// Returns `true` iff the sinker will write luma. #[cfg_attr(not(tarpaulin), inline(always))] pub const fn produces_luma(&self) -> bool { @@ -949,6 +998,41 @@ impl<'a, F: SourceFormat> MixedSinker<'a, F> { // compile error, not a silent stale‑state bug. Future high‑bit‑depth // markers (12‑bit, 14‑bit, P010) will add their own impl blocks. + /// Attaches a packed 32-bit RGBA output buffer. + /// + /// The fourth byte per pixel is alpha. For sources that **don't** + /// carry an alpha plane (every YUV format shipped today), every + /// alpha byte is filled with `0xFF` (opaque). Future YUVA source + /// impls will copy alpha through from the source plane. + /// + /// Returns `Err(RgbaBufferTooShort)` if + /// `buf.len() < width × height × 4`, or `Err(GeometryOverflow)` on + /// 32‑bit targets when the product overflows. + #[cfg_attr(not(tarpaulin), inline(always))] + pub fn with_rgba(mut self, buf: &'a mut [u8]) -> Result { + self.set_rgba(buf)?; + Ok(self) + } + + /// In-place variant of [`with_rgba`](Self::with_rgba). + #[cfg_attr(not(tarpaulin), inline(always))] + pub fn set_rgba(&mut self, buf: &'a mut [u8]) -> Result<&mut Self, MixedSinkerError> { + let expected = self.frame_bytes(4)?; + if buf.len() < expected { + return Err(MixedSinkerError::RgbaBufferTooShort { + expected, + actual: buf.len(), + }); + } + self.rgba = Some(buf); + Ok(self) + } + + // NOTE: `with_rgba_u16` / `set_rgba_u16` are **not** declared here + // for the same reason as `with_rgb_u16` — they live on the + // format‑specific impl blocks for high‑bit‑depth sources so the + // buffer can only be attached to sinks that actually write it. + /// Attaches a single-plane luma output buffer. /// Returns `Err(LumaBufferTooShort)` if `buf.len() < width × height`, /// or `Err(GeometryOverflow)` on 32‑bit overflow. @@ -1094,6 +1178,7 @@ impl PixelSink for MixedSinker<'_, Yuv420p> { // collide with the `rgb` read-after-write chain below. let Self { rgb, + rgba, luma, hsv, rgb_scratch, @@ -1113,6 +1198,33 @@ impl PixelSink for MixedSinker<'_, Yuv420p> { luma[one_plane_start..one_plane_end].copy_from_slice(&row.y()[..w]); } + // Native RGBA: independent kernel run, separate from RGB. 
Avoids + // the compose-and-expand cost — the const-generic + // `yuv_420_to_rgba_row` writes 4 bytes per pixel directly. + // Default alpha = 0xFF (opaque); future YUVA source impls will + // copy alpha through from the source plane. + if let Some(buf) = rgba.as_deref_mut() { + let rgba_plane_end = + one_plane_end + .checked_mul(4) + .ok_or(MixedSinkerError::GeometryOverflow { + width: w, + height: h, + channels: 4, + })?; + let rgba_plane_start = one_plane_start * 4; // ≤ rgba_plane_end. + yuv_420_to_rgba_row( + row.y(), + row.u_half(), + row.v_half(), + &mut buf[rgba_plane_start..rgba_plane_end], + w, + row.matrix(), + row.full_range(), + use_simd, + ); + } + let want_rgb = rgb.is_some(); let want_hsv = hsv.is_some(); if !want_rgb && !want_hsv { @@ -7298,6 +7410,161 @@ mod tests { assert!(v.iter().all(|&b| b.abs_diff(200) <= 1)); } + // ---- RGBA (Ship 8) tests ------------------------------------------------ + // + // Yuv420p is the template format for the const-generic-ALPHA + // refactor — proves the kernel writes 4 bytes per pixel correctly, + // alpha defaults to 0xFF (sources with no alpha plane), the RGB + // bytes match what `with_rgb` would have written, and SIMD ≡ + // scalar bit-for-bit. Future formats inherit the pattern. + + #[test] + #[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" + )] + fn rgba_only_converts_gray_to_gray_with_opaque_alpha() { + let (yp, up, vp) = solid_yuv420p_frame(16, 8, 128, 128, 128); + let src = Yuv420pFrame::new(&yp, &up, &vp, 16, 8, 16, 8, 8); + + let mut rgba = std::vec![0u8; 16 * 8 * 4]; + let mut sink = MixedSinker::::new(16, 8) + .with_rgba(&mut rgba) + .unwrap(); + yuv420p_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); + + for px in rgba.chunks(4) { + assert!(px[0].abs_diff(128) <= 1, "R"); + assert_eq!(px[0], px[1], "RGB monochromatic"); + assert_eq!(px[1], px[2], "RGB monochromatic"); + assert_eq!(px[3], 0xFF, "alpha must default to opaque"); + } + } + + #[test] + #[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" + )] + fn rgba_alpha_is_opaque_for_arbitrary_color() { + // Non-gray content. The RGB three bytes will vary by pixel; alpha + // must stay 0xFF because Yuv420p has no alpha plane. + let (yp, up, vp) = solid_yuv420p_frame(16, 8, 180, 60, 200); + let src = Yuv420pFrame::new(&yp, &up, &vp, 16, 8, 16, 8, 8); + + let mut rgba = std::vec![0u8; 16 * 8 * 4]; + let mut sink = MixedSinker::::new(16, 8) + .with_rgba(&mut rgba) + .unwrap(); + yuv420p_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); + + for (i, px) in rgba.chunks(4).enumerate() { + assert_eq!(px[3], 0xFF, "alpha must be opaque (px {i})"); + } + } + + #[test] + #[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" + )] + fn with_rgb_and_with_rgba_produce_byte_identical_rgb_bytes() { + // Cross-format invariant: alpha is the *only* difference between + // the two output buffers. RGBA bytes 0..3 of each pixel must + // equal the corresponding RGB pixel exactly. 
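+        // (This invariant is also what would make a compose-then-expand
+        // fallback (3-byte convert, then widen each pixel with a 0xFF
+        // alpha) behaviorally equivalent; the dedicated RGBA kernel
+        // exists only to skip that second pass.)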
+ let w = 32usize; + let h = 16usize; + let (yp, up, vp) = solid_yuv420p_frame(w as u32, h as u32, 180, 60, 200); + let src = Yuv420pFrame::new( + &yp, + &up, + &vp, + w as u32, + h as u32, + w as u32, + (w / 2) as u32, + (w / 2) as u32, + ); + + let mut rgb = std::vec![0u8; w * h * 3]; + let mut rgba = std::vec![0u8; w * h * 4]; + let mut sink = MixedSinker::::new(w, h) + .with_rgb(&mut rgb) + .unwrap() + .with_rgba(&mut rgba) + .unwrap(); + yuv420p_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); + + for i in 0..(w * h) { + assert_eq!(rgba[i * 4], rgb[i * 3], "R differs at pixel {i}"); + assert_eq!(rgba[i * 4 + 1], rgb[i * 3 + 1], "G differs at pixel {i}"); + assert_eq!(rgba[i * 4 + 2], rgb[i * 3 + 2], "B differs at pixel {i}"); + assert_eq!(rgba[i * 4 + 3], 0xFF, "A not opaque at pixel {i}"); + } + } + + #[test] + #[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" + )] + fn rgba_with_simd_false_matches_with_simd_true() { + // SIMD ≡ scalar parity for the RGBA path. Widths chosen to force + // both the SIMD main loop and the scalar tail across every + // backend block size (16 / 32 / 64). 4:2:0 requires even width, + // so the tail is exercised via `block + 2/4/6` rather than odd + // widths. + for &w in &[16usize, 18, 32, 34, 64, 66, 128, 130] { + let h = 8usize; + let (yp, up, vp) = solid_yuv420p_frame(w as u32, h as u32, 180, 60, 200); + let src = Yuv420pFrame::new( + &yp, + &up, + &vp, + w as u32, + h as u32, + w as u32, + (w / 2) as u32, + (w / 2) as u32, + ); + + let mut rgba_simd = std::vec![0u8; w * h * 4]; + let mut rgba_scalar = std::vec![0u8; w * h * 4]; + + let mut sink_simd = MixedSinker::::new(w, h) + .with_rgba(&mut rgba_simd) + .unwrap(); + yuv420p_to(&src, true, ColorMatrix::Bt601, &mut sink_simd).unwrap(); + + let mut sink_scalar = MixedSinker::::new(w, h) + .with_rgba(&mut rgba_scalar) + .unwrap(); + sink_scalar.set_simd(false); + yuv420p_to(&src, true, ColorMatrix::Bt601, &mut sink_scalar).unwrap(); + + assert_eq!( + rgba_simd, rgba_scalar, + "SIMD vs scalar diverged at width {w}" + ); + } + } + + #[test] + fn rgba_buffer_too_short_returns_err() { + let mut rgba_short = std::vec![0u8; 16 * 8 * 4 - 1]; + let result = MixedSinker::::new(16, 8).with_rgba(&mut rgba_short); + let Err(err) = result else { + panic!("expected RgbaBufferTooShort error"); + }; + assert!(matches!( + err, + MixedSinkerError::RgbaBufferTooShort { + expected: 512, + actual: 511, + } + )); + } + #[test] #[cfg_attr( miri, From aa62bff0520c9d14c49f5ee25ce7ea469f8a53b0 Mon Sep 17 00:00:00 2001 From: uqio <276879906+uqio@users.noreply.github.com> Date: Sun, 26 Apr 2026 14:01:21 +1200 Subject: [PATCH 2/5] update --- src/sinker/mixed.rs | 93 +++++++++++++++++++++++++++++---------------- 1 file changed, 61 insertions(+), 32 deletions(-) diff --git a/src/sinker/mixed.rs b/src/sinker/mixed.rs index e2907a2..cd680c5 100644 --- a/src/sinker/mixed.rs +++ b/src/sinker/mixed.rs @@ -998,40 +998,20 @@ impl<'a, F: SourceFormat> MixedSinker<'a, F> { // compile error, not a silent stale‑state bug. Future high‑bit‑depth // markers (12‑bit, 14‑bit, P010) will add their own impl blocks. - /// Attaches a packed 32-bit RGBA output buffer. - /// - /// The fourth byte per pixel is alpha. For sources that **don't** - /// carry an alpha plane (every YUV format shipped today), every - /// alpha byte is filled with `0xFF` (opaque). Future YUVA source - /// impls will copy alpha through from the source plane. 
-    ///
-    /// Returns `Err(RgbaBufferTooShort)` if
-    /// `buf.len() < width × height × 4`, or `Err(GeometryOverflow)` on
-    /// 32‑bit targets when the product overflows.
-    #[cfg_attr(not(tarpaulin), inline(always))]
-    pub fn with_rgba(mut self, buf: &'a mut [u8]) -> Result<Self, MixedSinkerError> {
-        self.set_rgba(buf)?;
-        Ok(self)
-    }
-
-    /// In-place variant of [`with_rgba`](Self::with_rgba).
-    #[cfg_attr(not(tarpaulin), inline(always))]
-    pub fn set_rgba(&mut self, buf: &'a mut [u8]) -> Result<&mut Self, MixedSinkerError> {
-        let expected = self.frame_bytes(4)?;
-        if buf.len() < expected {
-            return Err(MixedSinkerError::RgbaBufferTooShort {
-                expected,
-                actual: buf.len(),
-            });
-        }
-        self.rgba = Some(buf);
-        Ok(self)
-    }
+    // NOTE: `with_rgba` / `set_rgba` are **not** declared here either —
+    // same rationale as `with_rgb_u16` above. The Ship 8 RGBA path is
+    // currently wired only on [`MixedSinker<Yuv420p>`]; attaching an
+    // RGBA buffer to a sink whose `PixelSink::process` doesn't write
+    // it would silently leave the caller buffer untouched while
+    // `produces_rgba()` returned `true`. Each format that writes RGBA
+    // gets its own format‑specific impl block exposing the accessors.
+    // Future formats (NV12 / NV21 / Yuv422p / Yuv444p / P010 / etc.)
+    // add their own impl blocks as RGBA support lands.
+
+    // NOTE: `with_rgba_u16` / `set_rgba_u16` are **not** declared here
+    // for the same reason — they live on the format‑specific impl
+    // blocks for high‑bit‑depth sources that actually write
+    // native‑depth RGBA.

     /// Attaches a single-plane luma output buffer.
     /// Returns `Err(LumaBufferTooShort)` if `buf.len() < width × height`,
     /// or `Err(GeometryOverflow)` on 32‑bit overflow.
@@ -1107,6 +1087,55 @@ impl<'a, F: SourceFormat> MixedSinker<'a, F> {
 
 // ---- Yuv420p impl --------------------------------------------------------
 
+impl<'a> MixedSinker<'a, Yuv420p> {
+    /// Attaches a packed 32‑bit RGBA output buffer.
+    ///
+    /// Only available on sinker types whose `PixelSink` impl writes
+    /// RGBA — calling `with_rgba` on a sink that doesn't (e.g.
+    /// [`MixedSinker<Nv12>`] today) is a compile error rather than a
+    /// silent no‑op that would leave the caller's buffer stale while
+    /// [`Self::produces_rgba`] returned `true`. The compile-time
+    /// scoping is load-bearing: if a future format adds RGBA, it must
+    /// add its own impl block here, which both wires the new path and
+    /// prevents accidental cross-format leakage.
+    ///
+    /// The fourth byte per pixel is alpha. [`Yuv420p`] has no alpha
+    /// plane, so every alpha byte is filled with `0xFF` (opaque).
+    /// Future YUVA source impls will copy alpha through from the
+    /// source plane.
+    ///
+    /// Returns `Err(RgbaBufferTooShort)` if
+    /// `buf.len() < width × height × 4`, or `Err(GeometryOverflow)` on
+    /// 32‑bit targets when the product overflows.
+    ///
+    /// ```compile_fail
+    /// // Attaching RGBA to a sink that doesn't write it is rejected
+    /// // at compile time:
+    /// use colconv::{sinker::MixedSinker, yuv::Nv12};
+    /// let mut buf = vec![0u8; 16 * 8 * 4];
+    /// let _ = MixedSinker::<Nv12>::new(16, 8).with_rgba(&mut buf);
+    /// ```
+    #[cfg_attr(not(tarpaulin), inline(always))]
+    pub fn with_rgba(mut self, buf: &'a mut [u8]) -> Result<Self, MixedSinkerError> {
+        self.set_rgba(buf)?;
+        Ok(self)
+    }
+
+    /// In-place variant of [`with_rgba`](Self::with_rgba).
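The scoping trick generalizes; a self-contained sketch with hypothetical names, separate from the crate's types:

```rust
use std::marker::PhantomData;

struct Yuv420p; // marker for a format whose sink writes RGBA
struct Nv12; // marker for a format that does not (yet)

struct Sink<F>(PhantomData<F>);

impl<F> Sink<F> {
    fn new() -> Self {
        Sink(PhantomData)
    }
}

// The accessor exists only for the marker whose `process` actually
// writes RGBA, so misuse is a type error instead of a silent no-op.
impl Sink<Yuv420p> {
    fn with_rgba(self) -> Self {
        self
    }
}

fn main() {
    let _ok = Sink::<Yuv420p>::new().with_rgba();
    let _no_rgba = Sink::<Nv12>::new();
    // `Sink::<Nv12>::new().with_rgba()` would not compile.
}
```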
+ #[cfg_attr(not(tarpaulin), inline(always))] + pub fn set_rgba(&mut self, buf: &'a mut [u8]) -> Result<&mut Self, MixedSinkerError> { + let expected = self.frame_bytes(4)?; + if buf.len() < expected { + return Err(MixedSinkerError::RgbaBufferTooShort { + expected, + actual: buf.len(), + }); + } + self.rgba = Some(buf); + Ok(self) + } +} + impl PixelSink for MixedSinker<'_, Yuv420p> { type Input<'r> = Yuv420pRow<'r>; type Error = MixedSinkerError; From 36fee2633545a25956adb15d05a5741c0b1fe6bf Mon Sep 17 00:00:00 2001 From: uqio <276879906+uqio@users.noreply.github.com> Date: Sun, 26 Apr 2026 14:12:42 +1200 Subject: [PATCH 3/5] update --- src/sinker/mixed.rs | 88 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 88 insertions(+) diff --git a/src/sinker/mixed.rs b/src/sinker/mixed.rs index cd680c5..bf52196 100644 --- a/src/sinker/mixed.rs +++ b/src/sinker/mixed.rs @@ -7594,6 +7594,86 @@ mod tests { )); } + #[test] + #[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" + )] + fn yuv_420_to_rgba_simd_matches_scalar_with_random_yuv() { + // The earlier `rgba_with_simd_false_matches_with_simd_true` test + // uses solid Y/U/V, so every pixel collapses to the same RGBA + // quad and the new RGBA shuffle masks could permute / duplicate + // lanes within a SIMD block undetected. This test uses + // **pseudo-random per-pixel Y/U/V** so a bad shuffle in any of + // `write_rgba_16` (SSE4.1 / AVX2 / AVX-512 / wasm), `vst4q_u8` + // (NEON), or the scalar-tail handoff produces a measurable + // diff against the scalar reference. Width 1922 forces both + // the SIMD main loop AND a scalar tail across every backend + // block size (16 / 32 / 64). All four `ColorMatrix` variants + // exercise different `(r_u, r_v, g_u, g_v, b_u, b_v)` + // coefficient sets, and both ranges exercise the `y_off` / + // `y_scale` / `c_scale` parameter shape. + let w = 1922usize; + let h = 4usize; + let mut yp = std::vec![0u8; w * h]; + let mut up = std::vec![0u8; (w / 2) * (h / 2)]; + let mut vp = std::vec![0u8; (w / 2) * (h / 2)]; + pseudo_random_u8(&mut yp, 0xC001_C0DE); + pseudo_random_u8(&mut up, 0xCAFE_F00D); + pseudo_random_u8(&mut vp, 0xDEAD_BEEF); + let src = Yuv420pFrame::new( + &yp, + &up, + &vp, + w as u32, + h as u32, + w as u32, + (w / 2) as u32, + (w / 2) as u32, + ); + + for &matrix in &[ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::YCgCo, + ] { + for &full_range in &[true, false] { + let mut rgba_simd = std::vec![0u8; w * h * 4]; + let mut rgba_scalar = std::vec![0u8; w * h * 4]; + + let mut s_simd = MixedSinker::::new(w, h) + .with_rgba(&mut rgba_simd) + .unwrap(); + yuv420p_to(&src, full_range, matrix, &mut s_simd).unwrap(); + + let mut s_scalar = MixedSinker::::new(w, h) + .with_rgba(&mut rgba_scalar) + .unwrap(); + s_scalar.set_simd(false); + yuv420p_to(&src, full_range, matrix, &mut s_scalar).unwrap(); + + // Locate the first divergence to make backend-bug + // diagnosis tractable instead of dumping ~30 KB of bytes. 
+ if rgba_simd != rgba_scalar {
+ let mismatch = rgba_simd
+ .iter()
+ .zip(rgba_scalar.iter())
+ .position(|(a, b)| a != b)
+ .unwrap();
+ let pixel = mismatch / 4;
+ let channel = ["R", "G", "B", "A"][mismatch % 4];
+ panic!(
+ "RGBA SIMD ≠ scalar at byte {mismatch} (px {pixel} {channel}) \
+ for matrix={matrix:?} full_range={full_range}: \
+ simd={} scalar={}",
+ rgba_simd[mismatch], rgba_scalar[mismatch]
+ );
+ }
+ }
+ }
+ }
+
 #[test]
 #[cfg_attr(
 miri,
@@ -10798,6 +10878,14 @@ mod tests {
 }
 }

+ fn pseudo_random_u8(buf: &mut [u8], seed: u32) {
+ let mut state = seed;
+ for b in buf {
+ state = state.wrapping_mul(1_664_525).wrapping_add(1_013_904_223);
+ *b = (state >> 16) as u8;
+ }
+ }
+
 #[test]
 #[cfg_attr(
 miri,

From c6b0526ad589509f0b61761a234c73939391b174 Mon Sep 17 00:00:00 2001
From: uqio <276879906+uqio@users.noreply.github.com>
Date: Sun, 26 Apr 2026 14:28:05 +1200
Subject: [PATCH 4/5] update

---
 src/row/arch/neon.rs | 78 ++++++++++++++++++++++++++++++++
 src/row/arch/wasm_simd128.rs | 60 +++++++++++++++++++++++++
 src/row/arch/x86_avx2.rs | 86 +++++++++++++++++++++++++++++++++++
 src/row/arch/x86_avx512.rs | 86 +++++++++++++++++++++++++++++++++++
 src/row/arch/x86_sse41.rs | 87 ++++++++++++++++++++++++++++++++++++
 5 files changed, 397 insertions(+)

diff --git a/src/row/arch/neon.rs b/src/row/arch/neon.rs
index bfc52f6..24f4ae1 100644
--- a/src/row/arch/neon.rs
+++ b/src/row/arch/neon.rs
@@ -3416,6 +3416,84 @@ mod tests {
 }
 }

+ // ---- yuv_420_to_rgba_row equivalence --------------------------------
+ //
+ // Direct backend test for the new RGBA path: bypasses the public
+ // dispatcher so the NEON `vst4q_u8` write is exercised regardless
+ // of what tier the dispatcher would pick on the current runner.
+ // Catches lane-order or alpha-splat corruption in `vst4q_u8` that
+ // a dispatcher-routed test on a different host would miss.
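+ //
+ // The fixture below synthesizes deterministic per-channel ramps
+ // (`(i * 37 + 11) & 0xFF` and friends) rather than RNG output, so
+ // any failure reproduces bit-for-bit across runs and hosts.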
+
+ fn check_rgba_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) {
+ let y: std::vec::Vec<u8> = (0..width).map(|i| ((i * 37 + 11) & 0xFF) as u8).collect();
+ let u: std::vec::Vec<u8> = (0..width / 2)
+ .map(|i| ((i * 53 + 23) & 0xFF) as u8)
+ .collect();
+ let v: std::vec::Vec<u8> = (0..width / 2)
+ .map(|i| ((i * 71 + 91) & 0xFF) as u8)
+ .collect();
+ let mut rgba_scalar = std::vec![0u8; width * 4];
+ let mut rgba_neon = std::vec![0u8; width * 4];
+
+ scalar::yuv_420_to_rgba_row(&y, &u, &v, &mut rgba_scalar, width, matrix, full_range);
+ unsafe {
+ yuv_420_to_rgba_row(&y, &u, &v, &mut rgba_neon, width, matrix, full_range);
+ }
+
+ if rgba_scalar != rgba_neon {
+ let first_diff = rgba_scalar
+ .iter()
+ .zip(rgba_neon.iter())
+ .position(|(a, b)| a != b)
+ .unwrap();
+ let pixel = first_diff / 4;
+ let channel = ["R", "G", "B", "A"][first_diff % 4];
+ panic!(
+ "NEON RGBA diverges from scalar at byte {first_diff} (px {pixel} {channel}, width={width}, matrix={matrix:?}, full_range={full_range}): scalar={} neon={}",
+ rgba_scalar[first_diff], rgba_neon[first_diff]
+ );
+ }
+ }
+
+ #[test]
+ #[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")]
+ fn neon_rgba_matches_scalar_all_matrices_16() {
+ for m in [
+ ColorMatrix::Bt601,
+ ColorMatrix::Bt709,
+ ColorMatrix::Bt2020Ncl,
+ ColorMatrix::Smpte240m,
+ ColorMatrix::Fcc,
+ ColorMatrix::YCgCo,
+ ] {
+ for full in [true, false] {
+ check_rgba_equivalence(16, m, full);
+ }
+ }
+ }
+
+ #[test]
+ #[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")]
+ fn neon_rgba_matches_scalar_width_32() {
+ check_rgba_equivalence(32, ColorMatrix::Bt601, true);
+ check_rgba_equivalence(32, ColorMatrix::Bt709, false);
+ check_rgba_equivalence(32, ColorMatrix::YCgCo, true);
+ }
+
+ #[test]
+ #[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")]
+ fn neon_rgba_matches_scalar_width_1920() {
+ check_rgba_equivalence(1920, ColorMatrix::Bt709, false);
+ }
+
+ #[test]
+ #[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")]
+ fn neon_rgba_matches_scalar_odd_tail_widths() {
+ for w in [18usize, 30, 34, 1922] {
+ check_rgba_equivalence(w, ColorMatrix::Bt601, false);
+ }
+ }
+
 // ---- nv12_to_rgb_row equivalence ------------------------------------

 /// Scalar‑equivalence fixture for NV12. Builds an interleaved UV row
diff --git a/src/row/arch/wasm_simd128.rs b/src/row/arch/wasm_simd128.rs
index 26b8363..2ff0619 100644
--- a/src/row/arch/wasm_simd128.rs
+++ b/src/row/arch/wasm_simd128.rs
@@ -3419,6 +3419,66 @@ mod tests {
 }
 }

+ // ---- yuv_420_to_rgba_row equivalence --------------------------------
+ //
+ // Direct backend test for the new RGBA path: bypasses the public
+ // dispatcher so the wasm `write_rgba_16` swizzle (4-mask + 4
+ // store) is exercised on every wasm32+simd128 target.
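+ // Unlike the x86 backends there is no runtime feature probe:
+ // simd128 is a compile-time target feature, so these tests need
+ // no `is_x86_feature_detected!`-style guard.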
+
+ fn check_rgba_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) {
+ let y: std::vec::Vec<u8> = (0..width).map(|i| ((i * 37 + 11) & 0xFF) as u8).collect();
+ let u: std::vec::Vec<u8> = (0..width / 2)
+ .map(|i| ((i * 53 + 23) & 0xFF) as u8)
+ .collect();
+ let v: std::vec::Vec<u8> = (0..width / 2)
+ .map(|i| ((i * 71 + 91) & 0xFF) as u8)
+ .collect();
+ let mut rgba_scalar = std::vec![0u8; width * 4];
+ let mut rgba_wasm = std::vec![0u8; width * 4];
+
+ scalar::yuv_420_to_rgba_row(&y, &u, &v, &mut rgba_scalar, width, matrix, full_range);
+ unsafe {
+ yuv_420_to_rgba_row(&y, &u, &v, &mut rgba_wasm, width, matrix, full_range);
+ }
+
+ if rgba_scalar != rgba_wasm {
+ let first_diff = rgba_scalar
+ .iter()
+ .zip(rgba_wasm.iter())
+ .position(|(a, b)| a != b)
+ .unwrap();
+ let pixel = first_diff / 4;
+ let channel = ["R", "G", "B", "A"][first_diff % 4];
+ panic!(
+ "wasm simd128 RGBA diverges from scalar at byte {first_diff} (px {pixel} {channel}, width={width}, matrix={matrix:?}, full_range={full_range}): scalar={} wasm={}",
+ rgba_scalar[first_diff], rgba_wasm[first_diff]
+ );
+ }
+ }
+
+ #[test]
+ fn simd128_rgba_matches_scalar_all_matrices_16() {
+ for m in [
+ ColorMatrix::Bt601,
+ ColorMatrix::Bt709,
+ ColorMatrix::Bt2020Ncl,
+ ColorMatrix::Smpte240m,
+ ColorMatrix::Fcc,
+ ColorMatrix::YCgCo,
+ ] {
+ for full in [true, false] {
+ check_rgba_equivalence(16, m, full);
+ }
+ }
+ }
+
+ #[test]
+ fn simd128_rgba_matches_scalar_tail_widths() {
+ for w in [18usize, 30, 34, 1922] {
+ check_rgba_equivalence(w, ColorMatrix::Bt601, false);
+ }
+ }
+
 // ---- nv12_to_rgb_row equivalence ------------------------------------

 fn check_nv12_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) {
diff --git a/src/row/arch/x86_avx2.rs b/src/row/arch/x86_avx2.rs
index dd384cb..54742dc 100644
--- a/src/row/arch/x86_avx2.rs
+++ b/src/row/arch/x86_avx2.rs
@@ -3663,6 +3663,92 @@ mod tests {
 }
 }

+ // ---- yuv_420_to_rgba_row equivalence --------------------------------
+ //
+ // Direct backend test for the new RGBA path: bypasses the public
+ // dispatcher so the AVX2 `write_rgba_32` path (two halves through
+ // `write_rgba_16`) is exercised regardless of what tier the
+ // dispatcher would pick on the current runner.
+
+ fn check_rgba_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) {
+ let y: std::vec::Vec<u8> = (0..width).map(|i| ((i * 37 + 11) & 0xFF) as u8).collect();
+ let u: std::vec::Vec<u8> = (0..width / 2)
+ .map(|i| ((i * 53 + 23) & 0xFF) as u8)
+ .collect();
+ let v: std::vec::Vec<u8> = (0..width / 2)
+ .map(|i| ((i * 71 + 91) & 0xFF) as u8)
+ .collect();
+ let mut rgba_scalar = std::vec![0u8; width * 4];
+ let mut rgba_avx2 = std::vec![0u8; width * 4];
+
+ scalar::yuv_420_to_rgba_row(&y, &u, &v, &mut rgba_scalar, width, matrix, full_range);
+ unsafe {
+ yuv_420_to_rgba_row(&y, &u, &v, &mut rgba_avx2, width, matrix, full_range);
+ }
+
+ if rgba_scalar != rgba_avx2 {
+ let first_diff = rgba_scalar
+ .iter()
+ .zip(rgba_avx2.iter())
+ .position(|(a, b)| a != b)
+ .unwrap();
+ let pixel = first_diff / 4;
+ let channel = ["R", "G", "B", "A"][first_diff % 4];
+ panic!(
+ "AVX2 RGBA diverges from scalar at byte {first_diff} (px {pixel} {channel}, width={width}, matrix={matrix:?}, full_range={full_range}): scalar={} avx2={}",
+ rgba_scalar[first_diff], rgba_avx2[first_diff]
+ );
+ }
+ }
+
+ #[test]
+ fn avx2_rgba_matches_scalar_all_matrices_32() {
+ if !std::arch::is_x86_feature_detected!("avx2") {
+ return;
+ }
+ for m in [
+ ColorMatrix::Bt601,
+ ColorMatrix::Bt709,
+ ColorMatrix::Bt2020Ncl,
+ ColorMatrix::Smpte240m,
+ ColorMatrix::Fcc,
+ ColorMatrix::YCgCo,
+ ] {
+ for full in [true, false] {
+ check_rgba_equivalence(32, m, full);
+ }
+ }
+ }
+
+ #[test]
+ fn avx2_rgba_matches_scalar_width_64() {
+ if !std::arch::is_x86_feature_detected!("avx2") {
+ return;
+ }
+ check_rgba_equivalence(64, ColorMatrix::Bt601, true);
+ check_rgba_equivalence(64, ColorMatrix::Bt709, false);
+ check_rgba_equivalence(64, ColorMatrix::YCgCo, true);
+ }
+
+ #[test]
+ fn avx2_rgba_matches_scalar_width_1920() {
+ if !std::arch::is_x86_feature_detected!("avx2") {
+ return;
+ }
+ check_rgba_equivalence(1920, ColorMatrix::Bt709, false);
+ }
+
+ #[test]
+ fn avx2_rgba_matches_scalar_odd_tail_widths() {
+ if !std::arch::is_x86_feature_detected!("avx2") {
+ return;
+ }
+ // Widths that leave a non‑trivial scalar tail (non‑multiple of 32).
+ for w in [34usize, 46, 62, 1922] {
+ check_rgba_equivalence(w, ColorMatrix::Bt601, false);
+ }
+ }
+
 // ---- nv12_to_rgb_row equivalence ------------------------------------

 fn check_nv12_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) {
diff --git a/src/row/arch/x86_avx512.rs b/src/row/arch/x86_avx512.rs
index 3e01120..9e17ee9 100644
--- a/src/row/arch/x86_avx512.rs
+++ b/src/row/arch/x86_avx512.rs
@@ -3832,6 +3832,92 @@ mod tests {
 }
 }

+ // ---- yuv_420_to_rgba_row equivalence --------------------------------
+ //
+ // Direct backend test for the new RGBA path: bypasses the public
+ // dispatcher so the AVX‑512 `write_rgba_64` path (four quarters
+ // through `write_rgba_16`) is exercised regardless of what tier
+ // the dispatcher would pick on the current runner.
+
+ fn check_rgba_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) {
+ let y: std::vec::Vec<u8> = (0..width).map(|i| ((i * 37 + 11) & 0xFF) as u8).collect();
+ let u: std::vec::Vec<u8> = (0..width / 2)
+ .map(|i| ((i * 53 + 23) & 0xFF) as u8)
+ .collect();
+ let v: std::vec::Vec<u8> = (0..width / 2)
+ .map(|i| ((i * 71 + 91) & 0xFF) as u8)
+ .collect();
+ let mut rgba_scalar = std::vec![0u8; width * 4];
+ let mut rgba_avx512 = std::vec![0u8; width * 4];
+
+ scalar::yuv_420_to_rgba_row(&y, &u, &v, &mut rgba_scalar, width, matrix, full_range);
+ unsafe {
+ yuv_420_to_rgba_row(&y, &u, &v, &mut rgba_avx512, width, matrix, full_range);
+ }
+
+ if rgba_scalar != rgba_avx512 {
+ let first_diff = rgba_scalar
+ .iter()
+ .zip(rgba_avx512.iter())
+ .position(|(a, b)| a != b)
+ .unwrap();
+ let pixel = first_diff / 4;
+ let channel = ["R", "G", "B", "A"][first_diff % 4];
+ panic!(
+ "AVX‑512 RGBA diverges from scalar at byte {first_diff} (px {pixel} {channel}, width={width}, matrix={matrix:?}, full_range={full_range}): scalar={} avx512={}",
+ rgba_scalar[first_diff], rgba_avx512[first_diff]
+ );
+ }
+ }
+
+ #[test]
+ fn avx512_rgba_matches_scalar_all_matrices_64() {
+ if !std::arch::is_x86_feature_detected!("avx512bw") {
+ return;
+ }
+ for m in [
+ ColorMatrix::Bt601,
+ ColorMatrix::Bt709,
+ ColorMatrix::Bt2020Ncl,
+ ColorMatrix::Smpte240m,
+ ColorMatrix::Fcc,
+ ColorMatrix::YCgCo,
+ ] {
+ for full in [true, false] {
+ check_rgba_equivalence(64, m, full);
+ }
+ }
+ }
+
+ #[test]
+ fn avx512_rgba_matches_scalar_width_128() {
+ if !std::arch::is_x86_feature_detected!("avx512bw") {
+ return;
+ }
+ check_rgba_equivalence(128, ColorMatrix::Bt601, true);
+ check_rgba_equivalence(128, ColorMatrix::Bt709, false);
+ check_rgba_equivalence(128, ColorMatrix::YCgCo, true);
+ }
+
+ #[test]
+ fn avx512_rgba_matches_scalar_width_1920() {
+ if !std::arch::is_x86_feature_detected!("avx512bw") {
+ return;
+ }
+ check_rgba_equivalence(1920, ColorMatrix::Bt709, false);
+ }
+
+ #[test]
+ fn avx512_rgba_matches_scalar_odd_tail_widths() {
+ if !std::arch::is_x86_feature_detected!("avx512bw") {
+ return;
+ }
+ // Widths that leave a non‑trivial scalar tail (non‑multiple of 64).
+ for w in [66usize, 94, 126, 1922] {
+ check_rgba_equivalence(w, ColorMatrix::Bt601, false);
+ }
+ }
+
 // ---- nv12_to_rgb_row equivalence ------------------------------------

 fn check_nv12_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) {
diff --git a/src/row/arch/x86_sse41.rs b/src/row/arch/x86_sse41.rs
index 6d79ad8..c431c72 100644
--- a/src/row/arch/x86_sse41.rs
+++ b/src/row/arch/x86_sse41.rs
@@ -3142,6 +3142,93 @@ mod tests {
 }
 }

+ // ---- yuv_420_to_rgba_row equivalence --------------------------------
+ //
+ // Direct backend test for the new RGBA path: bypasses the public
+ // dispatcher so the SSE4.1 `write_rgba_16` shuffle masks are
+ // exercised regardless of what tier the dispatcher would pick on
+ // the current runner. Catches lane-order, shuffle-mask, or alpha
+ // splat corruption that an AVX2- or AVX-512-routed test would
+ // miss.
+
+ fn check_rgba_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) {
+ let y: std::vec::Vec<u8> = (0..width).map(|i| ((i * 37 + 11) & 0xFF) as u8).collect();
+ let u: std::vec::Vec<u8> = (0..width / 2)
+ .map(|i| ((i * 53 + 23) & 0xFF) as u8)
+ .collect();
+ let v: std::vec::Vec<u8> = (0..width / 2)
+ .map(|i| ((i * 71 + 91) & 0xFF) as u8)
+ .collect();
+ let mut rgba_scalar = std::vec![0u8; width * 4];
+ let mut rgba_sse41 = std::vec![0u8; width * 4];
+
+ scalar::yuv_420_to_rgba_row(&y, &u, &v, &mut rgba_scalar, width, matrix, full_range);
+ unsafe {
+ yuv_420_to_rgba_row(&y, &u, &v, &mut rgba_sse41, width, matrix, full_range);
+ }
+
+ if rgba_scalar != rgba_sse41 {
+ let first_diff = rgba_scalar
+ .iter()
+ .zip(rgba_sse41.iter())
+ .position(|(a, b)| a != b)
+ .unwrap();
+ let pixel = first_diff / 4;
+ let channel = ["R", "G", "B", "A"][first_diff % 4];
+ panic!(
+ "SSE4.1 RGBA diverges from scalar at byte {first_diff} (px {pixel} {channel}, width={width}, matrix={matrix:?}, full_range={full_range}): scalar={} sse41={}",
+ rgba_scalar[first_diff], rgba_sse41[first_diff]
+ );
+ }
+ }
+
+ #[test]
+ fn sse41_rgba_matches_scalar_all_matrices_16() {
+ if !std::arch::is_x86_feature_detected!("sse4.1") {
+ return;
+ }
+ for m in [
+ ColorMatrix::Bt601,
+ ColorMatrix::Bt709,
+ ColorMatrix::Bt2020Ncl,
+ ColorMatrix::Smpte240m,
+ ColorMatrix::Fcc,
+ ColorMatrix::YCgCo,
+ ] {
+ for full in [true, false] {
+ check_rgba_equivalence(16, m, full);
+ }
+ }
+ }
+
+ #[test]
+ fn sse41_rgba_matches_scalar_width_32() {
+ if !std::arch::is_x86_feature_detected!("sse4.1") {
+ return;
+ }
+ check_rgba_equivalence(32, ColorMatrix::Bt601, true);
+ check_rgba_equivalence(32, ColorMatrix::Bt709, false);
+ check_rgba_equivalence(32, ColorMatrix::YCgCo, true);
+ }
+
+ #[test]
+ fn sse41_rgba_matches_scalar_width_1920() {
+ if !std::arch::is_x86_feature_detected!("sse4.1") {
+ return;
+ }
+ check_rgba_equivalence(1920, ColorMatrix::Bt709, false);
+ }
+
+ #[test]
+ fn sse41_rgba_matches_scalar_odd_tail_widths() {
+ if !std::arch::is_x86_feature_detected!("sse4.1") {
+ return;
+ }
+ for w in [18usize, 30, 34, 1922] {
+ check_rgba_equivalence(w, ColorMatrix::Bt601, false);
+ }
+ }
+
 // ---- nv12_to_rgb_row equivalence ------------------------------------

 fn check_nv12_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) {

From be024521109b771a32cc8ad6b26b9d378980a3a0 Mon Sep 17 00:00:00 2001
From: uqio <276879906+uqio@users.noreply.github.com>
Date: Sun, 26 Apr 2026 15:04:19 +1200
Subject: [PATCH 5/5] update

---
 src/sinker/mixed.rs | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/sinker/mixed.rs b/src/sinker/mixed.rs
index bf52196..5e852fb 100644
--- a/src/sinker/mixed.rs
+++ b/src/sinker/mixed.rs
@@ -152,12 +152,12 @@ pub enum MixedSinkerError {
 actual: usize,
 },

- /// `u16` RGBA buffer attached via [`MixedSinker::with_rgba_u16`] /
- /// [`MixedSinker::set_rgba_u16`] is shorter than `width × height × 4`
- /// `u16` elements. Only high‑bit‑depth source impls write into this
- /// buffer; the fourth `u16` per pixel is alpha — opaque
- /// (`(1 << BITS) - 1`) by default when the source has no alpha
- /// plane.
+ /// `u16` RGBA buffer attached via `with_rgba_u16` / `set_rgba_u16`
+ /// (per-format impl, not yet shipped on any sink) is shorter than
+ /// `width × height × 4` `u16` elements.
Only high‑bit‑depth source + /// impls write into this buffer; the fourth `u16` per pixel is + /// alpha — opaque (`(1 << BITS) - 1`) by default when the source + /// has no alpha plane. #[error("MixedSinker rgba_u16 buffer too short: expected >= {expected} elements, got {actual}")] RgbaU16BufferTooShort { /// Minimum `u16` elements required (`width × height × 4`).