From 443efa44ebfa95a21ebaaca35a5699acf5f46a45 Mon Sep 17 00:00:00 2001
From: uqio <276879906+uqio@users.noreply.github.com>
Date: Sun, 26 Apr 2026 17:31:31 +1200
Subject: [PATCH 1/2] update

---
 src/row/arch/neon.rs         | 145 ++++++++++++++++++++++--
 src/row/arch/wasm_simd128.rs | 139 ++++++++++++++++++++---
 src/row/arch/x86_avx2.rs     | 145 +++++++++++++++++++++---
 src/row/arch/x86_avx512.rs   | 145 +++++++++++++++++++++---
 src/row/arch/x86_sse41.rs    | 147 ++++++++++++++++++++++---
 src/row/mod.rs               |  68 ++++++++++++
 src/row/scalar.rs            |  61 ++++++++--
 src/sinker/mixed.rs          | 208 +++++++++++++++++++++++++++++++++--
 8 files changed, 972 insertions(+), 86 deletions(-)

diff --git a/src/row/arch/neon.rs b/src/row/arch/neon.rs
index d9563c9..a8ea1e9 100644
--- a/src/row/arch/neon.rs
+++ b/src/row/arch/neon.rs
@@ -1548,6 +1548,8 @@ unsafe fn nv24_or_nv42_to_rgb_row_impl(
 ///
 /// # Safety
 ///
+/// Same contract as [`yuv_444_to_rgb_or_rgba_row`]:
+///
 /// 1. **NEON must be available on the current CPU.**
 /// 2. `y.len() >= width`, `u.len() >= width`, `v.len() >= width`.
 /// 3. `rgb_out.len() >= 3 * width`.
@@ -1563,11 +1565,69 @@ pub(crate) unsafe fn yuv_444_to_rgb_row(
     width: usize,
     matrix: ColorMatrix,
     full_range: bool,
+) {
+    // SAFETY: caller-checked NEON availability + slice bounds — see
+    // [`yuv_444_to_rgb_or_rgba_row`] safety contract.
+    unsafe {
+        yuv_444_to_rgb_or_rgba_row::<false>(y, u, v, rgb_out, width, matrix, full_range);
+    }
+}
+
+/// NEON YUV 4:4:4 planar → packed **RGBA** (8-bit). Same contract
+/// as [`yuv_444_to_rgb_row`] but writes 4 bytes per pixel via
+/// `vst4q_u8` (R, G, B, `0xFF`).
+///
+/// # Safety
+///
+/// Same as [`yuv_444_to_rgb_row`] except the output slice must be
+/// `>= 4 * width` bytes.
+#[inline]
+#[target_feature(enable = "neon")]
+pub(crate) unsafe fn yuv_444_to_rgba_row(
+    y: &[u8],
+    u: &[u8],
+    v: &[u8],
+    rgba_out: &mut [u8],
+    width: usize,
+    matrix: ColorMatrix,
+    full_range: bool,
+) {
+    // SAFETY: caller-checked NEON availability + slice bounds — see
+    // [`yuv_444_to_rgb_or_rgba_row`] safety contract.
+    unsafe {
+        yuv_444_to_rgb_or_rgba_row::<true>(y, u, v, rgba_out, width, matrix, full_range);
+    }
+}
+
+/// Shared NEON YUV 4:4:4 kernel for [`yuv_444_to_rgb_row`]
+/// (`ALPHA = false`, `vst3q_u8`) and [`yuv_444_to_rgba_row`]
+/// (`ALPHA = true`, `vst4q_u8` with constant `0xFF` alpha). Math is
+/// byte-identical to `scalar::yuv_444_to_rgb_or_rgba_row::<ALPHA>`;
+/// only the per-block store intrinsic differs.
+///
+/// # Safety
+///
+/// 1. **NEON must be available on the current CPU.**
+/// 2. `y.len() >= width`, `u.len() >= width`, `v.len() >= width`.
+/// 3. `out.len() >= width * (if ALPHA { 4 } else { 3 })`.
+///
+/// No width parity constraint (4:4:4).
+#[inline]
+#[target_feature(enable = "neon")]
+pub(crate) unsafe fn yuv_444_to_rgb_or_rgba_row<const ALPHA: bool>(
+    y: &[u8],
+    u: &[u8],
+    v: &[u8],
+    out: &mut [u8],
+    width: usize,
+    matrix: ColorMatrix,
+    full_range: bool,
 ) {
     debug_assert!(y.len() >= width);
     debug_assert!(u.len() >= width);
     debug_assert!(v.len() >= width);
-    debug_assert!(rgb_out.len() >= width * 3);
+    let bpp: usize = if ALPHA { 4 } else { 3 };
+    debug_assert!(out.len() >= width * bpp);
 
     let coeffs = scalar::Coefficients::for_matrix(matrix);
     let (y_off, y_scale, c_scale) = scalar::range_params(full_range);
@@ -1588,6 +1648,7 @@ pub(crate) unsafe fn yuv_444_to_rgb_row(
         let cgv = vdupq_n_s32(coeffs.g_v());
         let cbu = vdupq_n_s32(coeffs.b_u());
         let cbv = vdupq_n_s32(coeffs.b_v());
+        let alpha_u8 = vdupq_n_u8(0xFF);
 
         let mut x = 0usize;
         while x + 16 <= width {
@@ -1646,22 +1707,28 @@ pub(crate) unsafe fn yuv_444_to_rgb_row(
                 vqmovun_s16(vqaddq_s16(y_scaled_hi, r_chroma_hi)),
             );
 
-            let rgb = uint8x16x3_t(r_u8, g_u8, b_u8);
-            vst3q_u8(rgb_out.as_mut_ptr().add(x * 3), rgb);
+            if ALPHA {
+                let rgba = uint8x16x4_t(r_u8, g_u8, b_u8, alpha_u8);
+                vst4q_u8(out.as_mut_ptr().add(x * 4), rgba);
+            } else {
+                let rgb = uint8x16x3_t(r_u8, g_u8, b_u8);
+                vst3q_u8(out.as_mut_ptr().add(x * 3), rgb);
+            }
 
             x += 16;
         }
 
         if x < width {
-            scalar::yuv_444_to_rgb_row(
-                &y[x..width],
-                &u[x..width],
-                &v[x..width],
-                &mut rgb_out[x * 3..width * 3],
-                width - x,
-                matrix,
-                full_range,
-            );
+            let tail_y = &y[x..width];
+            let tail_u = &u[x..width];
+            let tail_v = &v[x..width];
+            let tail_w = width - x;
+            let tail_out = &mut out[x * bpp..width * bpp];
+            if ALPHA {
+                scalar::yuv_444_to_rgba_row(tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range);
+            } else {
+                scalar::yuv_444_to_rgb_row(tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range);
+            }
         }
     }
 }
@@ -4136,6 +4203,60 @@ mod tests {
         }
     }
 
+    // ---- yuv_444_to_rgba_row equivalence --------------------------------
+
+    fn check_yuv_444_rgba_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) {
+        let y: std::vec::Vec<u8> = (0..width).map(|i| ((i * 37 + 11) & 0xFF) as u8).collect();
+        let u: std::vec::Vec<u8> = (0..width).map(|i| ((i * 53 + 23) & 0xFF) as u8).collect();
+        let v: std::vec::Vec<u8> = (0..width).map(|i| ((i * 71 + 91) & 0xFF) as u8).collect();
+        let mut rgba_scalar = std::vec![0u8; width * 4];
+        let mut rgba_neon = std::vec![0u8; width * 4];
+
+        scalar::yuv_444_to_rgba_row(&y, &u, &v, &mut rgba_scalar, width, matrix, full_range);
+        unsafe {
+            yuv_444_to_rgba_row(&y, &u, &v, &mut rgba_neon, width, matrix, full_range);
+        }
+
+        if rgba_scalar != rgba_neon {
+            let first_diff = rgba_scalar
+                .iter()
+                .zip(rgba_neon.iter())
+                .position(|(a, b)| a != b)
+                .unwrap();
+            let pixel = first_diff / 4;
+            let channel = ["R", "G", "B", "A"][first_diff % 4];
+            panic!(
+                "NEON yuv_444 RGBA diverges from scalar at byte {first_diff} (px {pixel} {channel}, width={width}, matrix={matrix:?}, full_range={full_range}): scalar={} neon={}",
+                rgba_scalar[first_diff], rgba_neon[first_diff]
+            );
+        }
+    }
+
+    #[test]
+    #[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")]
+    fn yuv_444_neon_rgba_matches_scalar_all_matrices_16() {
+        for m in [
+            ColorMatrix::Bt601,
+            ColorMatrix::Bt709,
+            ColorMatrix::Bt2020Ncl,
+            ColorMatrix::Smpte240m,
+            ColorMatrix::Fcc,
+            ColorMatrix::YCgCo,
+        ] {
+            for full in [true, false] {
+                check_yuv_444_rgba_equivalence(16, m, full);
+            }
+        }
+    }
+
+    #[test]
+    #[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")]
+    fn yuv_444_neon_rgba_matches_scalar_widths() {
+        for w 
in [1usize, 3, 15, 17, 32, 33, 1920, 1921] { + check_yuv_444_rgba_equivalence(w, ColorMatrix::Bt709, false); + } + } + // ---- rgb_to_hsv_row equivalence ------------------------------------ // // The NEON HSV kernel uses `vmaxq_f32` / `vminq_f32` / `vdivq_f32` diff --git a/src/row/arch/wasm_simd128.rs b/src/row/arch/wasm_simd128.rs index ffdd522..7286e0b 100644 --- a/src/row/arch/wasm_simd128.rs +++ b/src/row/arch/wasm_simd128.rs @@ -1857,9 +1857,8 @@ unsafe fn nv24_or_nv42_to_rgb_row_impl( } } -/// wasm simd128 YUV 4:4:4 planar → packed RGB. 16 Y + 16 U + 16 V -/// per iteration. Same arithmetic as [`nv24_to_rgb_row`] but U and V -/// come from separate planes (no deinterleave). +/// wasm simd128 YUV 4:4:4 planar → packed RGB. Thin wrapper over +/// [`yuv_444_to_rgb_or_rgba_row`] with `ALPHA = false`. /// /// # Safety /// @@ -1876,11 +1875,65 @@ pub(crate) unsafe fn yuv_444_to_rgb_row( width: usize, matrix: ColorMatrix, full_range: bool, +) { + // SAFETY: caller-checked simd128 availability + slice bounds — see + // [`yuv_444_to_rgb_or_rgba_row`] safety contract. + unsafe { + yuv_444_to_rgb_or_rgba_row::(y, u, v, rgb_out, width, matrix, full_range); + } +} + +/// wasm simd128 YUV 4:4:4 planar → packed **RGBA** (8-bit). Same +/// contract as [`yuv_444_to_rgb_row`] but writes 4 bytes per pixel +/// via [`write_rgba_16`] (R, G, B, `0xFF`). +/// +/// # Safety +/// +/// Same as [`yuv_444_to_rgb_row`] except the output slice must be +/// `>= 4 * width` bytes. +#[inline] +#[target_feature(enable = "simd128")] +pub(crate) unsafe fn yuv_444_to_rgba_row( + y: &[u8], + u: &[u8], + v: &[u8], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller-checked simd128 availability + slice bounds — see + // [`yuv_444_to_rgb_or_rgba_row`] safety contract. + unsafe { + yuv_444_to_rgb_or_rgba_row::(y, u, v, rgba_out, width, matrix, full_range); + } +} + +/// Shared wasm simd128 YUV 4:4:4 kernel for [`yuv_444_to_rgb_row`] +/// (`ALPHA = false`, [`write_rgb_16`]) and [`yuv_444_to_rgba_row`] +/// (`ALPHA = true`, [`write_rgba_16`] with constant `0xFF` alpha). +/// +/// # Safety +/// +/// 1. **simd128 must be enabled at compile time.** +/// 2. `y.len() >= width`, `u.len() >= width`, `v.len() >= width`. +/// 3. `out.len() >= width * (if ALPHA { 4 } else { 3 })`. 
+#[inline] +#[target_feature(enable = "simd128")] +pub(crate) unsafe fn yuv_444_to_rgb_or_rgba_row( + y: &[u8], + u: &[u8], + v: &[u8], + out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, ) { debug_assert!(y.len() >= width); debug_assert!(u.len() >= width); debug_assert!(v.len() >= width); - debug_assert!(rgb_out.len() >= width * 3); + let bpp: usize = if ALPHA { 4 } else { 3 }; + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params(full_range); @@ -1898,6 +1951,7 @@ pub(crate) unsafe fn yuv_444_to_rgb_row( let cgv = i32x4_splat(coeffs.g_v()); let cbu = i32x4_splat(coeffs.b_u()); let cbv = i32x4_splat(coeffs.b_v()); + let alpha_u8 = u8x16_splat(0xFF); let mut x = 0usize; while x + 16 <= width { @@ -1953,21 +2007,26 @@ pub(crate) unsafe fn yuv_444_to_rgb_row( let g_u8 = u8x16_narrow_i16x8(g_lo, g_hi); let r_u8 = u8x16_narrow_i16x8(r_lo, r_hi); - write_rgb_16(r_u8, g_u8, b_u8, rgb_out.as_mut_ptr().add(x * 3)); + if ALPHA { + write_rgba_16(r_u8, g_u8, b_u8, alpha_u8, out.as_mut_ptr().add(x * 4)); + } else { + write_rgb_16(r_u8, g_u8, b_u8, out.as_mut_ptr().add(x * 3)); + } x += 16; } if x < width { - scalar::yuv_444_to_rgb_row( - &y[x..width], - &u[x..width], - &v[x..width], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_u = &u[x..width]; + let tail_v = &v[x..width]; + let tail_w = width - x; + let tail_out = &mut out[x * bpp..width * bpp]; + if ALPHA { + scalar::yuv_444_to_rgba_row(tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range); + } else { + scalar::yuv_444_to_rgb_row(tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range); + } } } } @@ -3721,6 +3780,58 @@ mod tests { } } + // ---- yuv_444_to_rgba_row equivalence -------------------------------- + + fn check_yuv_444_rgba_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { + let y: std::vec::Vec = (0..width).map(|i| ((i * 37 + 11) & 0xFF) as u8).collect(); + let u: std::vec::Vec = (0..width).map(|i| ((i * 53 + 23) & 0xFF) as u8).collect(); + let v: std::vec::Vec = (0..width).map(|i| ((i * 71 + 91) & 0xFF) as u8).collect(); + let mut rgba_scalar = std::vec![0u8; width * 4]; + let mut rgba_wasm = std::vec![0u8; width * 4]; + + scalar::yuv_444_to_rgba_row(&y, &u, &v, &mut rgba_scalar, width, matrix, full_range); + unsafe { + yuv_444_to_rgba_row(&y, &u, &v, &mut rgba_wasm, width, matrix, full_range); + } + + if rgba_scalar != rgba_wasm { + let first_diff = rgba_scalar + .iter() + .zip(rgba_wasm.iter()) + .position(|(a, b)| a != b) + .unwrap(); + let pixel = first_diff / 4; + let channel = ["R", "G", "B", "A"][first_diff % 4]; + panic!( + "wasm simd128 yuv_444 RGBA diverges from scalar at byte {first_diff} (px {pixel} {channel}, width={width}, matrix={matrix:?}, full_range={full_range}): scalar={} wasm={}", + rgba_scalar[first_diff], rgba_wasm[first_diff] + ); + } + } + + #[test] + fn simd128_yuv_444_rgba_matches_scalar_all_matrices_16() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_yuv_444_rgba_equivalence(16, m, full); + } + } + } + + #[test] + fn simd128_yuv_444_rgba_matches_scalar_widths() { + for w in [1usize, 3, 15, 17, 32, 33, 1920, 1921] { + check_yuv_444_rgba_equivalence(w, ColorMatrix::Bt709, false); + } + } + // ---- yuv_444p_n + yuv_444p16 equivalence 
---------------------- fn check_yuv_444p_n_equivalence( diff --git a/src/row/arch/x86_avx2.rs b/src/row/arch/x86_avx2.rs index 50c7aaf..24ceda0 100644 --- a/src/row/arch/x86_avx2.rs +++ b/src/row/arch/x86_avx2.rs @@ -2032,9 +2032,8 @@ unsafe fn nv24_or_nv42_to_rgb_row_impl( } } -/// AVX2 YUV 4:4:4 planar → packed RGB. 32 Y pixels + 32 U + 32 V -/// per iteration. Same arithmetic as [`nv24_to_rgb_row`] with U / V -/// loaded directly from separate planes (no deinterleave step). +/// AVX2 YUV 4:4:4 planar → packed RGB. Thin wrapper over +/// [`yuv_444_to_rgb_or_rgba_row`] with `ALPHA = false`. /// /// # Safety /// @@ -2051,11 +2050,65 @@ pub(crate) unsafe fn yuv_444_to_rgb_row( width: usize, matrix: ColorMatrix, full_range: bool, +) { + // SAFETY: caller-checked AVX2 availability + slice bounds — see + // [`yuv_444_to_rgb_or_rgba_row`] safety contract. + unsafe { + yuv_444_to_rgb_or_rgba_row::(y, u, v, rgb_out, width, matrix, full_range); + } +} + +/// AVX2 YUV 4:4:4 planar → packed **RGBA** (8-bit). Same contract +/// as [`yuv_444_to_rgb_row`] but writes 4 bytes per pixel via +/// [`write_rgba_32`] (R, G, B, `0xFF`). +/// +/// # Safety +/// +/// Same as [`yuv_444_to_rgb_row`] except the output slice must be +/// `>= 4 * width` bytes. +#[inline] +#[target_feature(enable = "avx2")] +pub(crate) unsafe fn yuv_444_to_rgba_row( + y: &[u8], + u: &[u8], + v: &[u8], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller-checked AVX2 availability + slice bounds — see + // [`yuv_444_to_rgb_or_rgba_row`] safety contract. + unsafe { + yuv_444_to_rgb_or_rgba_row::(y, u, v, rgba_out, width, matrix, full_range); + } +} + +/// Shared AVX2 YUV 4:4:4 kernel for [`yuv_444_to_rgb_row`] +/// (`ALPHA = false`, [`write_rgb_32`]) and [`yuv_444_to_rgba_row`] +/// (`ALPHA = true`, [`write_rgba_32`] with constant `0xFF` alpha). +/// +/// # Safety +/// +/// 1. **AVX2 must be available on the current CPU.** +/// 2. `y.len() >= width`, `u.len() >= width`, `v.len() >= width`. +/// 3. `out.len() >= width * (if ALPHA { 4 } else { 3 })`. 
+#[inline] +#[target_feature(enable = "avx2")] +pub(crate) unsafe fn yuv_444_to_rgb_or_rgba_row( + y: &[u8], + u: &[u8], + v: &[u8], + out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, ) { debug_assert!(y.len() >= width); debug_assert!(u.len() >= width); debug_assert!(v.len() >= width); - debug_assert!(rgb_out.len() >= width * 3); + let bpp: usize = if ALPHA { 4 } else { 3 }; + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params(full_range); @@ -2073,6 +2126,7 @@ pub(crate) unsafe fn yuv_444_to_rgb_row( let cgv = _mm256_set1_epi32(coeffs.g_v()); let cbu = _mm256_set1_epi32(coeffs.b_u()); let cbv = _mm256_set1_epi32(coeffs.b_v()); + let alpha_u8 = _mm256_set1_epi8(-1); // 0xFF as i8 let mut x = 0usize; while x + 32 <= width { @@ -2158,21 +2212,26 @@ pub(crate) unsafe fn yuv_444_to_rgb_row( let g_u8 = narrow_u8x32(g_lo, g_hi); let r_u8 = narrow_u8x32(r_lo, r_hi); - write_rgb_32(r_u8, g_u8, b_u8, rgb_out.as_mut_ptr().add(x * 3)); + if ALPHA { + write_rgba_32(r_u8, g_u8, b_u8, alpha_u8, out.as_mut_ptr().add(x * 4)); + } else { + write_rgb_32(r_u8, g_u8, b_u8, out.as_mut_ptr().add(x * 3)); + } x += 32; } if x < width { - scalar::yuv_444_to_rgb_row( - &y[x..width], - &u[x..width], - &v[x..width], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_u = &u[x..width]; + let tail_v = &v[x..width]; + let tail_w = width - x; + let tail_out = &mut out[x * bpp..width * bpp]; + if ALPHA { + scalar::yuv_444_to_rgba_row(tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range); + } else { + scalar::yuv_444_to_rgb_row(tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range); + } } } } @@ -4053,6 +4112,64 @@ mod tests { } } + // ---- yuv_444_to_rgba_row equivalence -------------------------------- + + fn check_yuv_444_rgba_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { + let y: std::vec::Vec = (0..width).map(|i| ((i * 37 + 11) & 0xFF) as u8).collect(); + let u: std::vec::Vec = (0..width).map(|i| ((i * 53 + 23) & 0xFF) as u8).collect(); + let v: std::vec::Vec = (0..width).map(|i| ((i * 71 + 91) & 0xFF) as u8).collect(); + let mut rgba_scalar = std::vec![0u8; width * 4]; + let mut rgba_avx2 = std::vec![0u8; width * 4]; + + scalar::yuv_444_to_rgba_row(&y, &u, &v, &mut rgba_scalar, width, matrix, full_range); + unsafe { + yuv_444_to_rgba_row(&y, &u, &v, &mut rgba_avx2, width, matrix, full_range); + } + + if rgba_scalar != rgba_avx2 { + let first_diff = rgba_scalar + .iter() + .zip(rgba_avx2.iter()) + .position(|(a, b)| a != b) + .unwrap(); + let pixel = first_diff / 4; + let channel = ["R", "G", "B", "A"][first_diff % 4]; + panic!( + "AVX2 yuv_444 RGBA diverges from scalar at byte {first_diff} (px {pixel} {channel}, width={width}, matrix={matrix:?}, full_range={full_range}): scalar={} avx2={}", + rgba_scalar[first_diff], rgba_avx2[first_diff] + ); + } + } + + #[test] + fn avx2_yuv_444_rgba_matches_scalar_all_matrices_32() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_yuv_444_rgba_equivalence(32, m, full); + } + } + } + + #[test] + fn avx2_yuv_444_rgba_matches_scalar_widths() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + for w in [31usize, 32, 33, 63, 64, 65, 1920, 
1921] { + check_yuv_444_rgba_equivalence(w, ColorMatrix::Bt709, false); + } + } + // ---- yuv_444p_n + yuv_444p16 equivalence ---------------------- fn check_yuv_444p_n_equivalence( diff --git a/src/row/arch/x86_avx512.rs b/src/row/arch/x86_avx512.rs index e1bd915..04de26d 100644 --- a/src/row/arch/x86_avx512.rs +++ b/src/row/arch/x86_avx512.rs @@ -2097,9 +2097,8 @@ unsafe fn nv24_or_nv42_to_rgb_row_impl( } } -/// AVX-512 YUV 4:4:4 planar → packed RGB. 64 Y pixels + 64 U + 64 V -/// per iteration. Same arithmetic as [`nv24_to_rgb_row`] with U / V -/// loaded directly from separate planes (no deinterleave step). +/// AVX-512 YUV 4:4:4 planar → packed RGB. Thin wrapper over +/// [`yuv_444_to_rgb_or_rgba_row`] with `ALPHA = false`. /// /// # Safety /// @@ -2116,11 +2115,65 @@ pub(crate) unsafe fn yuv_444_to_rgb_row( width: usize, matrix: ColorMatrix, full_range: bool, +) { + // SAFETY: caller-checked AVX-512BW availability + slice bounds — + // see [`yuv_444_to_rgb_or_rgba_row`] safety contract. + unsafe { + yuv_444_to_rgb_or_rgba_row::(y, u, v, rgb_out, width, matrix, full_range); + } +} + +/// AVX-512 YUV 4:4:4 planar → packed **RGBA** (8-bit). Same contract +/// as [`yuv_444_to_rgb_row`] but writes 4 bytes per pixel via +/// [`write_rgba_64`] (R, G, B, `0xFF`). +/// +/// # Safety +/// +/// Same as [`yuv_444_to_rgb_row`] except the output slice must be +/// `>= 4 * width` bytes. +#[inline] +#[target_feature(enable = "avx512f,avx512bw")] +pub(crate) unsafe fn yuv_444_to_rgba_row( + y: &[u8], + u: &[u8], + v: &[u8], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller-checked AVX-512BW availability + slice bounds — + // see [`yuv_444_to_rgb_or_rgba_row`] safety contract. + unsafe { + yuv_444_to_rgb_or_rgba_row::(y, u, v, rgba_out, width, matrix, full_range); + } +} + +/// Shared AVX-512 YUV 4:4:4 kernel for [`yuv_444_to_rgb_row`] +/// (`ALPHA = false`, [`write_rgb_64`]) and [`yuv_444_to_rgba_row`] +/// (`ALPHA = true`, [`write_rgba_64`] with constant `0xFF` alpha). +/// +/// # Safety +/// +/// 1. **AVX-512F + AVX-512BW must be available.** +/// 2. `y.len() >= width`, `u.len() >= width`, `v.len() >= width`. +/// 3. `out.len() >= width * (if ALPHA { 4 } else { 3 })`. 
+#[inline] +#[target_feature(enable = "avx512f,avx512bw")] +pub(crate) unsafe fn yuv_444_to_rgb_or_rgba_row( + y: &[u8], + u: &[u8], + v: &[u8], + out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, ) { debug_assert!(y.len() >= width); debug_assert!(u.len() >= width); debug_assert!(v.len() >= width); - debug_assert!(rgb_out.len() >= width * 3); + let bpp: usize = if ALPHA { 4 } else { 3 }; + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params(full_range); @@ -2138,6 +2191,7 @@ pub(crate) unsafe fn yuv_444_to_rgb_row( let cgv = _mm512_set1_epi32(coeffs.g_v()); let cbu = _mm512_set1_epi32(coeffs.b_u()); let cbv = _mm512_set1_epi32(coeffs.b_v()); + let alpha_u8 = _mm512_set1_epi8(-1); // 0xFF as i8 let pack_fixup = _mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7); @@ -2237,21 +2291,26 @@ pub(crate) unsafe fn yuv_444_to_rgb_row( let g_u8 = narrow_u8x64(g_lo, g_hi, pack_fixup); let r_u8 = narrow_u8x64(r_lo, r_hi, pack_fixup); - write_rgb_64(r_u8, g_u8, b_u8, rgb_out.as_mut_ptr().add(x * 3)); + if ALPHA { + write_rgba_64(r_u8, g_u8, b_u8, alpha_u8, out.as_mut_ptr().add(x * 4)); + } else { + write_rgb_64(r_u8, g_u8, b_u8, out.as_mut_ptr().add(x * 3)); + } x += 64; } if x < width { - scalar::yuv_444_to_rgb_row( - &y[x..width], - &u[x..width], - &v[x..width], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_u = &u[x..width]; + let tail_v = &v[x..width]; + let tail_w = width - x; + let tail_out = &mut out[x * bpp..width * bpp]; + if ALPHA { + scalar::yuv_444_to_rgba_row(tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range); + } else { + scalar::yuv_444_to_rgb_row(tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range); + } } } } @@ -4224,6 +4283,64 @@ mod tests { } } + // ---- yuv_444_to_rgba_row equivalence -------------------------------- + + fn check_yuv_444_rgba_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { + let y: std::vec::Vec = (0..width).map(|i| ((i * 37 + 11) & 0xFF) as u8).collect(); + let u: std::vec::Vec = (0..width).map(|i| ((i * 53 + 23) & 0xFF) as u8).collect(); + let v: std::vec::Vec = (0..width).map(|i| ((i * 71 + 91) & 0xFF) as u8).collect(); + let mut rgba_scalar = std::vec![0u8; width * 4]; + let mut rgba_avx512 = std::vec![0u8; width * 4]; + + scalar::yuv_444_to_rgba_row(&y, &u, &v, &mut rgba_scalar, width, matrix, full_range); + unsafe { + yuv_444_to_rgba_row(&y, &u, &v, &mut rgba_avx512, width, matrix, full_range); + } + + if rgba_scalar != rgba_avx512 { + let first_diff = rgba_scalar + .iter() + .zip(rgba_avx512.iter()) + .position(|(a, b)| a != b) + .unwrap(); + let pixel = first_diff / 4; + let channel = ["R", "G", "B", "A"][first_diff % 4]; + panic!( + "AVX-512 yuv_444 RGBA diverges from scalar at byte {first_diff} (px {pixel} {channel}, width={width}, matrix={matrix:?}, full_range={full_range}): scalar={} avx512={}", + rgba_scalar[first_diff], rgba_avx512[first_diff] + ); + } + } + + #[test] + fn avx512_yuv_444_rgba_matches_scalar_all_matrices_64() { + if !std::arch::is_x86_feature_detected!("avx512bw") { + return; + } + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_yuv_444_rgba_equivalence(64, m, full); + } + } + } + + #[test] + fn avx512_yuv_444_rgba_matches_scalar_widths() { + if 
!std::arch::is_x86_feature_detected!("avx512bw") { + return; + } + for w in [63usize, 64, 65, 127, 128, 129, 1920, 1921] { + check_yuv_444_rgba_equivalence(w, ColorMatrix::Bt709, false); + } + } + // ---- yuv_444p_n + yuv_444p16 equivalence ---------------------- fn check_yuv_444p_n_equivalence( diff --git a/src/row/arch/x86_sse41.rs b/src/row/arch/x86_sse41.rs index d4a342e..35a9453 100644 --- a/src/row/arch/x86_sse41.rs +++ b/src/row/arch/x86_sse41.rs @@ -1795,9 +1795,8 @@ unsafe fn nv24_or_nv42_to_rgb_row_impl( } } -/// SSE4.1 YUV 4:4:4 planar → packed RGB. 16 Y pixels + 16 U + 16 V -/// per iteration. Same arithmetic as [`nv24_to_rgb_row`] but U and V -/// come from separate planes (no deinterleave). +/// SSE4.1 YUV 4:4:4 planar → packed RGB. Thin wrapper over +/// [`yuv_444_to_rgb_or_rgba_row`] with `ALPHA = false`. /// /// # Safety /// @@ -1814,11 +1813,67 @@ pub(crate) unsafe fn yuv_444_to_rgb_row( width: usize, matrix: ColorMatrix, full_range: bool, +) { + // SAFETY: caller-checked SSE4.1 availability + slice bounds — see + // [`yuv_444_to_rgb_or_rgba_row`] safety contract. + unsafe { + yuv_444_to_rgb_or_rgba_row::(y, u, v, rgb_out, width, matrix, full_range); + } +} + +/// SSE4.1 YUV 4:4:4 planar → packed **RGBA** (8-bit). Same contract +/// as [`yuv_444_to_rgb_row`] but writes 4 bytes per pixel via +/// [`write_rgba_16`] (R, G, B, `0xFF`). +/// +/// # Safety +/// +/// Same as [`yuv_444_to_rgb_row`] except the output slice must be +/// `>= 4 * width` bytes. +#[inline] +#[target_feature(enable = "sse4.1")] +pub(crate) unsafe fn yuv_444_to_rgba_row( + y: &[u8], + u: &[u8], + v: &[u8], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller-checked SSE4.1 availability + slice bounds — see + // [`yuv_444_to_rgb_or_rgba_row`] safety contract. + unsafe { + yuv_444_to_rgb_or_rgba_row::(y, u, v, rgba_out, width, matrix, full_range); + } +} + +/// Shared SSE4.1 YUV 4:4:4 kernel for [`yuv_444_to_rgb_row`] +/// (`ALPHA = false`, [`write_rgb_16`]) and [`yuv_444_to_rgba_row`] +/// (`ALPHA = true`, [`write_rgba_16`] with constant `0xFF` alpha). +/// Math is byte-identical to +/// `scalar::yuv_444_to_rgb_or_rgba_row::`. +/// +/// # Safety +/// +/// 1. **SSE4.1 must be available on the current CPU.** +/// 2. `y.len() >= width`, `u.len() >= width`, `v.len() >= width`. +/// 3. `out.len() >= width * (if ALPHA { 4 } else { 3 })`. 
+#[inline] +#[target_feature(enable = "sse4.1")] +pub(crate) unsafe fn yuv_444_to_rgb_or_rgba_row( + y: &[u8], + u: &[u8], + v: &[u8], + out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, ) { debug_assert!(y.len() >= width); debug_assert!(u.len() >= width); debug_assert!(v.len() >= width); - debug_assert!(rgb_out.len() >= width * 3); + let bpp: usize = if ALPHA { 4 } else { 3 }; + debug_assert!(out.len() >= width * bpp); let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params(full_range); @@ -1836,6 +1891,7 @@ pub(crate) unsafe fn yuv_444_to_rgb_row( let cgv = _mm_set1_epi32(coeffs.g_v()); let cbu = _mm_set1_epi32(coeffs.b_u()); let cbv = _mm_set1_epi32(coeffs.b_v()); + let alpha_u8 = _mm_set1_epi8(-1); // 0xFF as i8 let mut x = 0usize; while x + 16 <= width { @@ -1892,21 +1948,26 @@ pub(crate) unsafe fn yuv_444_to_rgb_row( let g_u8 = _mm_packus_epi16(g_lo, g_hi); let r_u8 = _mm_packus_epi16(r_lo, r_hi); - write_rgb_16(r_u8, g_u8, b_u8, rgb_out.as_mut_ptr().add(x * 3)); + if ALPHA { + write_rgba_16(r_u8, g_u8, b_u8, alpha_u8, out.as_mut_ptr().add(x * 4)); + } else { + write_rgb_16(r_u8, g_u8, b_u8, out.as_mut_ptr().add(x * 3)); + } x += 16; } if x < width { - scalar::yuv_444_to_rgb_row( - &y[x..width], - &u[x..width], - &v[x..width], - &mut rgb_out[x * 3..width * 3], - width - x, - matrix, - full_range, - ); + let tail_y = &y[x..width]; + let tail_u = &u[x..width]; + let tail_v = &v[x..width]; + let tail_w = width - x; + let tail_out = &mut out[x * bpp..width * bpp]; + if ALPHA { + scalar::yuv_444_to_rgba_row(tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range); + } else { + scalar::yuv_444_to_rgb_row(tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range); + } } } } @@ -3549,6 +3610,64 @@ mod tests { } } + // ---- yuv_444_to_rgba_row equivalence -------------------------------- + + fn check_yuv_444_rgba_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) { + let y: std::vec::Vec = (0..width).map(|i| ((i * 37 + 11) & 0xFF) as u8).collect(); + let u: std::vec::Vec = (0..width).map(|i| ((i * 53 + 23) & 0xFF) as u8).collect(); + let v: std::vec::Vec = (0..width).map(|i| ((i * 71 + 91) & 0xFF) as u8).collect(); + let mut rgba_scalar = std::vec![0u8; width * 4]; + let mut rgba_sse41 = std::vec![0u8; width * 4]; + + scalar::yuv_444_to_rgba_row(&y, &u, &v, &mut rgba_scalar, width, matrix, full_range); + unsafe { + yuv_444_to_rgba_row(&y, &u, &v, &mut rgba_sse41, width, matrix, full_range); + } + + if rgba_scalar != rgba_sse41 { + let first_diff = rgba_scalar + .iter() + .zip(rgba_sse41.iter()) + .position(|(a, b)| a != b) + .unwrap(); + let pixel = first_diff / 4; + let channel = ["R", "G", "B", "A"][first_diff % 4]; + panic!( + "SSE4.1 yuv_444 RGBA diverges from scalar at byte {first_diff} (px {pixel} {channel}, width={width}, matrix={matrix:?}, full_range={full_range}): scalar={} sse41={}", + rgba_scalar[first_diff], rgba_sse41[first_diff] + ); + } + } + + #[test] + fn sse41_yuv_444_rgba_matches_scalar_all_matrices_16() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_yuv_444_rgba_equivalence(16, m, full); + } + } + } + + #[test] + fn sse41_yuv_444_rgba_matches_scalar_widths() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + for w in [1usize, 3, 15, 17, 32, 
33, 1920, 1921] { + check_yuv_444_rgba_equivalence(w, ColorMatrix::Bt709, false); + } + } + // ---- yuv_444p_n + yuv_444p16 equivalence ---------------------- fn check_yuv_444p_n_equivalence( diff --git a/src/row/mod.rs b/src/row/mod.rs index f30667d..6499cec 100644 --- a/src/row/mod.rs +++ b/src/row/mod.rs @@ -790,6 +790,74 @@ pub fn yuv_444_to_rgb_row( scalar::yuv_444_to_rgb_row(y, u, v, rgb_out, width, matrix, full_range); } +/// Converts one row of YUV 4:4:4 planar to packed **RGBA** (8-bit). +/// Same numerical contract as [`yuv_444_to_rgb_row`]; the only +/// differences are the per-pixel stride (4 vs 3) and the alpha byte +/// (`0xFF`, opaque, for every pixel). `rgba_out.len() >= 4 * width`. +/// `use_simd = false` forces scalar. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv_444_to_rgba_row( + y: &[u8], + u: &[u8], + v: &[u8], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + let rgba_min = rgba_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(u.len() >= width, "u row too short"); + assert!(v.len() >= width, "v row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + unsafe { + arch::neon::yuv_444_to_rgba_row(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + unsafe { + arch::x86_avx512::yuv_444_to_rgba_row(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + unsafe { + arch::x86_avx2::yuv_444_to_rgba_row(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + unsafe { + arch::x86_sse41::yuv_444_to_rgba_row(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + unsafe { + arch::wasm_simd128::yuv_444_to_rgba_row(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_444_to_rgba_row(y, u, v, rgba_out, width, matrix, full_range); +} + /// YUV 4:4:4 planar 10/12/14-bit → **u8** RGB dispatcher. Const /// generic over `BITS ∈ {10, 12, 14}`. Dispatches to the best /// available backend for the current target (NEON / SSE4.1 / AVX2 / diff --git a/src/row/scalar.rs b/src/row/scalar.rs index e6d07ac..1a76503 100644 --- a/src/row/scalar.rs +++ b/src/row/scalar.rs @@ -357,10 +357,12 @@ fn nv24_or_nv42_to_rgb_row_impl( } } -/// YUV 4:4:4 planar → packed RGB. One UV pair per Y pixel, U/V from -/// separate planes. Same arithmetic as -/// [`nv24_to_rgb_row`] (4:4:4 semi-planar) but without the -/// deinterleave step — U and V come pre-separated. +/// YUV 4:4:4 planar → packed RGB. Thin wrapper over +/// [`yuv_444_to_rgb_or_rgba_row`] with `ALPHA = false`. +/// +/// One UV pair per Y pixel, U/V from separate planes. Same +/// arithmetic as [`nv24_to_rgb_row`] (4:4:4 semi-planar) but +/// without the deinterleave step — U and V come pre-separated. /// /// # Panics (debug builds) /// @@ -375,11 +377,51 @@ pub(crate) fn yuv_444_to_rgb_row( width: usize, matrix: ColorMatrix, full_range: bool, +) { + yuv_444_to_rgb_or_rgba_row::(y, u, v, rgb_out, width, matrix, full_range); +} + +/// YUV 4:4:4 planar → packed `R, G, B, A` quadruplets with constant +/// `A = 0xFF`. First three bytes per pixel are byte-identical to +/// [`yuv_444_to_rgb_row`]. `rgba_out.len() >= 4 * width`. 
+#[cfg_attr(not(tarpaulin), inline(always))]
+pub(crate) fn yuv_444_to_rgba_row(
+    y: &[u8],
+    u: &[u8],
+    v: &[u8],
+    rgba_out: &mut [u8],
+    width: usize,
+    matrix: ColorMatrix,
+    full_range: bool,
+) {
+    yuv_444_to_rgb_or_rgba_row::<true>(y, u, v, rgba_out, width, matrix, full_range);
+}
+
+/// Shared scalar kernel for [`yuv_444_to_rgb_row`] (`ALPHA = false`,
+/// 3 bpp) and [`yuv_444_to_rgba_row`] (`ALPHA = true`, 4 bpp + opaque
+/// alpha). Math is identical; only the per-pixel store stride
+/// differs. `const` generic monomorphizes per call site, so the
+/// `if ALPHA` branches are eliminated.
+///
+/// # Panics (debug builds)
+///
+/// - `y.len() >= width`, `u.len() >= width`, `v.len() >= width`,
+///   `out.len() >= width * (if ALPHA { 4 } else { 3 })`.
+#[cfg_attr(not(tarpaulin), inline(always))]
+pub(crate) fn yuv_444_to_rgb_or_rgba_row<const ALPHA: bool>(
+    y: &[u8],
+    u: &[u8],
+    v: &[u8],
+    out: &mut [u8],
+    width: usize,
+    matrix: ColorMatrix,
+    full_range: bool,
 ) {
     debug_assert!(y.len() >= width, "y row too short");
     debug_assert!(u.len() >= width, "u row too short");
     debug_assert!(v.len() >= width, "v row too short");
-    debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
+    let bpp: usize = if ALPHA { 4 } else { 3 };
+    debug_assert!(out.len() >= width * bpp, "out row too short for {bpp}bpp");
 
     let coeffs = Coefficients::for_matrix(matrix);
     let (y_off, y_scale, c_scale) = range_params(full_range);
@@ -395,9 +437,12 @@ pub(crate) fn yuv_444_to_rgb_row(
         let b_chroma = (coeffs.b_u() * u_d + coeffs.b_v() * v_d + RND) >> 15;
 
         let y0 = ((y[x] as i32 - y_off) * y_scale + RND) >> 15;
-        rgb_out[x * 3] = clamp_u8(y0 + r_chroma);
-        rgb_out[x * 3 + 1] = clamp_u8(y0 + g_chroma);
-        rgb_out[x * 3 + 2] = clamp_u8(y0 + b_chroma);
+        out[x * bpp] = clamp_u8(y0 + r_chroma);
+        out[x * bpp + 1] = clamp_u8(y0 + g_chroma);
+        out[x * bpp + 2] = clamp_u8(y0 + b_chroma);
+        if ALPHA {
+            out[x * bpp + 3] = 0xFF;
+        }
     }
 }
 
diff --git a/src/sinker/mixed.rs b/src/sinker/mixed.rs
index 146db35..94ca4e4 100644
--- a/src/sinker/mixed.rs
+++ b/src/sinker/mixed.rs
@@ -67,12 +67,12 @@ use crate::{
        p010_to_rgb_row, p010_to_rgb_u16_row, p012_to_rgb_row, p012_to_rgb_u16_row, p016_to_rgb_row,
        p016_to_rgb_u16_row, p410_to_rgb_row, p410_to_rgb_u16_row, p412_to_rgb_row, p412_to_rgb_u16_row,
        p416_to_rgb_row, p416_to_rgb_u16_row, rgb_to_hsv_row, yuv_420_to_rgb_row,
-        yuv_420_to_rgba_row, yuv_444_to_rgb_row, yuv420p9_to_rgb_row, yuv420p9_to_rgb_u16_row,
-        yuv420p10_to_rgb_row, yuv420p10_to_rgb_u16_row, yuv420p12_to_rgb_row, yuv420p12_to_rgb_u16_row,
-        yuv420p14_to_rgb_row, yuv420p14_to_rgb_u16_row, yuv420p16_to_rgb_row, yuv420p16_to_rgb_u16_row,
-        yuv444p9_to_rgb_row, yuv444p9_to_rgb_u16_row, yuv444p10_to_rgb_row, yuv444p10_to_rgb_u16_row,
-        yuv444p12_to_rgb_row, yuv444p12_to_rgb_u16_row, yuv444p14_to_rgb_row, yuv444p14_to_rgb_u16_row,
-        yuv444p16_to_rgb_row, yuv444p16_to_rgb_u16_row,
+        yuv_420_to_rgba_row, yuv_444_to_rgb_row, yuv_444_to_rgba_row, yuv420p9_to_rgb_row,
+        yuv420p9_to_rgb_u16_row, yuv420p10_to_rgb_row, yuv420p10_to_rgb_u16_row, yuv420p12_to_rgb_row,
+        yuv420p12_to_rgb_u16_row, yuv420p14_to_rgb_row, yuv420p14_to_rgb_u16_row, yuv420p16_to_rgb_row,
+        yuv420p16_to_rgb_u16_row, yuv444p9_to_rgb_row, yuv444p9_to_rgb_u16_row, yuv444p10_to_rgb_row,
+        yuv444p10_to_rgb_u16_row, yuv444p12_to_rgb_row, yuv444p12_to_rgb_u16_row, yuv444p14_to_rgb_row,
+        yuv444p14_to_rgb_u16_row, yuv444p16_to_rgb_row, yuv444p16_to_rgb_u16_row,
     },
     yuv::{
         Nv12, Nv12Row, Nv12Sink, Nv16, Nv16Row, Nv16Sink, Nv21, Nv21Row, Nv21Sink, Nv24, Nv24Row,
@@ -1110,12 
+1110,12 @@ impl<'a> MixedSinker<'a, Yuv420p> { /// /// ```compile_fail /// // Attaching RGBA to a sink that doesn't write it is rejected - /// // at compile time. Yuv444p (4:4:4 planar) has not yet been + /// // at compile time. Nv24 (4:4:4 semi‑planar) has not yet been /// // wired for RGBA; once that lands the negative example here /// // moves to the next not‑yet‑wired format. - /// use colconv::{sinker::MixedSinker, yuv::Yuv444p}; + /// use colconv::{sinker::MixedSinker, yuv::Nv24}; /// let mut buf = vec![0u8; 16 * 8 * 4]; - /// let _ = MixedSinker::::new(16, 8).with_rgba(&mut buf); + /// let _ = MixedSinker::::new(16, 8).with_rgba(&mut buf); /// ``` #[cfg_attr(not(tarpaulin), inline(always))] pub fn with_rgba(mut self, buf: &'a mut [u8]) -> Result { @@ -1525,7 +1525,40 @@ impl PixelSink for MixedSinker<'_, Yuv422p> { // ---- Yuv444p impl ------------------------------------------------------- // // 4:4:4 planar: U and V are full-width, full-height. No width parity -// constraint. Uses the new `yuv_444_to_rgb_row` kernel family. +// constraint. Uses the `yuv_444_to_rgb_row` / `yuv_444_to_rgba_row` +// kernel family. + +impl<'a> MixedSinker<'a, Yuv444p> { + /// Attaches a packed 32‑bit RGBA output buffer. + /// + /// Only available on sinker types whose `PixelSink` impl writes + /// RGBA — see [`MixedSinker::::with_rgba`] for the same + /// rationale and constraints. Yuv444p has no alpha plane, so every + /// alpha byte is filled with `0xFF` (opaque). + /// + /// Returns `Err(RgbaBufferTooShort)` if + /// `buf.len() < width × height × 4`, or `Err(GeometryOverflow)` on + /// 32‑bit targets when the product overflows. + #[cfg_attr(not(tarpaulin), inline(always))] + pub fn with_rgba(mut self, buf: &'a mut [u8]) -> Result { + self.set_rgba(buf)?; + Ok(self) + } + + /// In-place variant of [`with_rgba`](Self::with_rgba). + #[cfg_attr(not(tarpaulin), inline(always))] + pub fn set_rgba(&mut self, buf: &'a mut [u8]) -> Result<&mut Self, MixedSinkerError> { + let expected = self.frame_bytes(4)?; + if buf.len() < expected { + return Err(MixedSinkerError::RgbaBufferTooShort { + expected, + actual: buf.len(), + }); + } + self.rgba = Some(buf); + Ok(self) + } +} impl Yuv444pSink for MixedSinker<'_, Yuv444p> {} @@ -1576,6 +1609,7 @@ impl PixelSink for MixedSinker<'_, Yuv444p> { let Self { rgb, + rgba, luma, hsv, rgb_scratch, @@ -1589,6 +1623,30 @@ impl PixelSink for MixedSinker<'_, Yuv444p> { luma[one_plane_start..one_plane_end].copy_from_slice(&row.y()[..w]); } + // Native RGBA: independent kernel run, separate from RGB. Default + // alpha = 0xFF since Yuv444p has no alpha plane. 
+ if let Some(buf) = rgba.as_deref_mut() { + let rgba_plane_end = + one_plane_end + .checked_mul(4) + .ok_or(MixedSinkerError::GeometryOverflow { + width: w, + height: h, + channels: 4, + })?; + let rgba_plane_start = one_plane_start * 4; + yuv_444_to_rgba_row( + row.y(), + row.u(), + row.v(), + &mut buf[rgba_plane_start..rgba_plane_end], + w, + row.matrix(), + row.full_range(), + use_simd, + ); + } + let want_rgb = rgb.is_some(); let want_hsv = hsv.is_some(); if !want_rgb && !want_hsv { @@ -11181,6 +11239,136 @@ mod tests { } } + // ---- Yuv444p RGBA (Ship 8 PR 4a) tests ---------------------------------- + + #[test] + #[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" + )] + fn yuv444p_rgba_only_converts_gray_to_gray_with_opaque_alpha() { + let (yp, up, vp) = solid_yuv444p_frame(16, 8, 128, 128, 128); + let src = Yuv444pFrame::new(&yp, &up, &vp, 16, 8, 16, 16, 16); + + let mut rgba = std::vec![0u8; 16 * 8 * 4]; + let mut sink = MixedSinker::::new(16, 8) + .with_rgba(&mut rgba) + .unwrap(); + yuv444p_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); + + for px in rgba.chunks(4) { + assert!(px[0].abs_diff(128) <= 1, "R"); + assert_eq!(px[0], px[1], "RGB monochromatic"); + assert_eq!(px[1], px[2], "RGB monochromatic"); + assert_eq!(px[3], 0xFF, "alpha must default to opaque"); + } + } + + #[test] + #[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" + )] + fn yuv444p_with_rgb_and_with_rgba_produce_byte_identical_rgb_bytes() { + let w = 32u32; + let h = 16u32; + let ws = w as usize; + let hs = h as usize; + let (yp, up, vp) = solid_yuv444p_frame(w, h, 180, 60, 200); + let src = Yuv444pFrame::new(&yp, &up, &vp, w, h, w, w, w); + + let mut rgb = std::vec![0u8; ws * hs * 3]; + let mut rgba = std::vec![0u8; ws * hs * 4]; + let mut sink = MixedSinker::::new(ws, hs) + .with_rgb(&mut rgb) + .unwrap() + .with_rgba(&mut rgba) + .unwrap(); + yuv444p_to(&src, true, ColorMatrix::Bt601, &mut sink).unwrap(); + + for i in 0..(ws * hs) { + assert_eq!(rgba[i * 4], rgb[i * 3], "R differs at pixel {i}"); + assert_eq!(rgba[i * 4 + 1], rgb[i * 3 + 1], "G differs at pixel {i}"); + assert_eq!(rgba[i * 4 + 2], rgb[i * 3 + 2], "B differs at pixel {i}"); + assert_eq!(rgba[i * 4 + 3], 0xFF, "A not opaque at pixel {i}"); + } + } + + #[test] + fn yuv444p_rgba_buffer_too_short_returns_err() { + let mut rgba_short = std::vec![0u8; 16 * 8 * 4 - 1]; + let result = MixedSinker::::new(16, 8).with_rgba(&mut rgba_short); + let Err(err) = result else { + panic!("expected RgbaBufferTooShort error"); + }; + assert!(matches!( + err, + MixedSinkerError::RgbaBufferTooShort { + expected: 512, + actual: 511, + } + )); + } + + #[test] + #[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" + )] + fn yuv444p_rgba_simd_matches_scalar_with_random_yuv() { + // 4:4:4 has full-width chroma — U / V are width-sized per row. + // Width 1922 forces both the SIMD main loop AND scalar tail + // across every backend block size (16/32/64). 
+        let w = 1922usize;
+        let h = 4usize;
+        let mut yp = std::vec![0u8; w * h];
+        let mut up = std::vec![0u8; w * h];
+        let mut vp = std::vec![0u8; w * h];
+        pseudo_random_u8(&mut yp, 0xC001_C0DE);
+        pseudo_random_u8(&mut up, 0xCAFE_F00D);
+        pseudo_random_u8(&mut vp, 0xDEAD_BEEF);
+        let src = Yuv444pFrame::new(
+            &yp, &up, &vp, w as u32, h as u32, w as u32, w as u32, w as u32,
+        );
+
+        for &matrix in &[
+            ColorMatrix::Bt601,
+            ColorMatrix::Bt709,
+            ColorMatrix::Bt2020Ncl,
+            ColorMatrix::YCgCo,
+        ] {
+            for &full_range in &[true, false] {
+                let mut rgba_simd = std::vec![0u8; w * h * 4];
+                let mut rgba_scalar = std::vec![0u8; w * h * 4];
+
+                let mut s_simd = MixedSinker::<Yuv444p>::new(w, h)
+                    .with_rgba(&mut rgba_simd)
+                    .unwrap();
+                yuv444p_to(&src, full_range, matrix, &mut s_simd).unwrap();
+
+                let mut s_scalar = MixedSinker::<Yuv444p>::new(w, h)
+                    .with_rgba(&mut rgba_scalar)
+                    .unwrap();
+                s_scalar.set_simd(false);
+                yuv444p_to(&src, full_range, matrix, &mut s_scalar).unwrap();
+
+                if rgba_simd != rgba_scalar {
+                    let mismatch = rgba_simd
+                        .iter()
+                        .zip(rgba_scalar.iter())
+                        .position(|(a, b)| a != b)
+                        .unwrap();
+                    let pixel = mismatch / 4;
+                    let channel = ["R", "G", "B", "A"][mismatch % 4];
+                    panic!(
+                        "Yuv444p RGBA SIMD ≠ scalar at byte {mismatch} (px {pixel} {channel}) for matrix={matrix:?} full_range={full_range}: simd={} scalar={}",
+                        rgba_simd[mismatch], rgba_scalar[mismatch]
+                    );
+                }
+            }
+        }
+    }
+
     #[test]
     #[cfg_attr(
         miri,

From fa48ba17f5871e3e20b9822359fa8217af0d9815 Mon Sep 17 00:00:00 2001
From: uqio <276879906+uqio@users.noreply.github.com>
Date: Sun, 26 Apr 2026 17:49:01 +1200
Subject: [PATCH 2/2] update

---
 src/row/arch/neon.rs | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/row/arch/neon.rs b/src/row/arch/neon.rs
index a8ea1e9..c5c72c7 100644
--- a/src/row/arch/neon.rs
+++ b/src/row/arch/neon.rs
@@ -1548,7 +1548,9 @@ unsafe fn nv24_or_nv42_to_rgb_row_impl(
 ///
 /// # Safety
 ///
-/// Same contract as [`yuv_444_to_rgb_or_rgba_row`]:
+/// Same contract as [`yuv_444_to_rgb_or_rgba_row`] with
+/// `ALPHA = false` (so `out.len() >= width * 3` specializes to
+/// `rgb_out.len() >= 3 * width`):
 ///
 /// 1. **NEON must be available on the current CPU.**
 /// 2. `y.len() >= width`, `u.len() >= width`, `v.len() >= width`.
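
Usage sketch (editor's note, not part of either patch): the row-level entry point added to src/row/mod.rs picks a NEON/SSE4.1/AVX2/AVX-512/simd128 backend at runtime and otherwise falls back to the scalar kernel. The snippet below takes the yuv_444_to_rgba_row signature (y, u, v, rgba_out, width, matrix, full_range, use_simd) directly from the patch, but assumes the `row` module and `ColorMatrix` are publicly reachable from the `colconv` crate named in the existing doctest; those paths are not confirmed by the diff.

use colconv::row::{yuv_444_to_rgba_row, ColorMatrix}; // assumed public paths

fn rgba_row_demo() {
    // Width 33 is deliberately not a multiple of any SIMD block size
    // (16/32/64), so the dispatcher exercises both the vector main
    // loop and the scalar tail.
    let width = 33usize;
    let y = vec![120u8; width];
    let u = vec![90u8; width];
    let v = vec![200u8; width];

    let mut rgba_simd = vec![0u8; width * 4];
    let mut rgba_scalar = vec![0u8; width * 4];

    // use_simd = true picks the best backend detected at runtime.
    yuv_444_to_rgba_row(&y, &u, &v, &mut rgba_simd, width, ColorMatrix::Bt709, false, true);
    // use_simd = false forces the scalar reference kernel.
    yuv_444_to_rgba_row(&y, &u, &v, &mut rgba_scalar, width, ColorMatrix::Bt709, false, false);

    assert_eq!(rgba_simd, rgba_scalar);
    assert!(rgba_simd.chunks(4).all(|px| px[3] == 0xFF)); // alpha is always opaque
}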
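A second sketch for the sinker-level surface wired up in src/sinker/mixed.rs, mirroring the new in-tree tests. The crate name `colconv` and the `sinker`/`yuv` module paths come from the existing doctest; treating `Yuv444pFrame`, `yuv444p_to`, and `ColorMatrix` as public exports is an assumption, since the tests reach them through crate-internal paths.

use colconv::{sinker::MixedSinker, yuv::Yuv444p};
use colconv::{yuv::Yuv444pFrame, yuv444p_to, ColorMatrix}; // assumed exports

fn rgba_sinker_demo() {
    let (w, h) = (16usize, 8usize);
    // Solid mid-grey 4:4:4 planes; chroma is full width, stride == width.
    let yp = vec![128u8; w * h];
    let up = vec![128u8; w * h];
    let vp = vec![128u8; w * h];
    let src = Yuv444pFrame::new(
        &yp, &up, &vp, w as u32, h as u32, w as u32, w as u32, w as u32,
    );

    // with_rgba rejects buffers shorter than width * height * 4 bytes
    // with Err(RgbaBufferTooShort).
    let mut rgba = vec![0u8; w * h * 4];
    let mut sink = MixedSinker::<Yuv444p>::new(w, h)
        .with_rgba(&mut rgba)
        .expect("buffer is exactly w * h * 4 bytes");
    yuv444p_to(&src, true, ColorMatrix::Bt601, &mut sink).expect("conversion succeeds");

    // Yuv444p has no alpha plane, so every alpha byte is written as 0xFF.
    assert!(rgba.chunks(4).all(|px| px[3] == 0xFF));
}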