From e6696fcb09e17674e82bc0038f7cf7123d44346e Mon Sep 17 00:00:00 2001 From: karthikbhargavan Date: Fri, 25 Jul 2025 09:53:53 -0400 Subject: [PATCH 1/3] Testable Models for SIMD Intrinsics --- .gitignore | 1 + testable-simd-models/Cargo.toml | 16 + testable-simd-models/README.md | 127 + testable-simd-models/src/abstractions/bit.rs | 204 ++ .../src/abstractions/bitvec.rs | 155 + .../src/abstractions/funarr.rs | 79 + testable-simd-models/src/abstractions/mod.rs | 26 + testable-simd-models/src/abstractions/simd.rs | 938 +++++++ testable-simd-models/src/core_arch.rs | 5 + .../src/core_arch/arm_shared/mod.rs | 4 + .../src/core_arch/arm_shared/models/mod.rs | 44 + .../src/core_arch/arm_shared/models/neon.rs | 873 ++++++ .../src/core_arch/arm_shared/tests/mod.rs | 112 + .../src/core_arch/arm_shared/tests/neon.rs | 218 ++ testable-simd-models/src/core_arch/x86/mod.rs | 4 + .../src/core_arch/x86/models/avx.rs | 432 +++ .../src/core_arch/x86/models/avx2.rs | 2493 +++++++++++++++++ .../src/core_arch/x86/models/mod.rs | 37 + .../src/core_arch/x86/models/sse2.rs | 1303 +++++++++ .../src/core_arch/x86/models/ssse3.rs | 369 +++ .../src/core_arch/x86/tests/avx.rs | 132 + .../src/core_arch/x86/tests/avx2.rs | 531 ++++ .../src/core_arch/x86/tests/mod.rs | 113 + .../src/core_arch/x86/tests/sse2.rs | 201 ++ .../src/core_arch/x86/tests/ssse3.rs | 51 + testable-simd-models/src/helpers.rs | 55 + testable-simd-models/src/lib.rs | 35 + testable-simd-models/test.sh | 2 + 28 files changed, 8560 insertions(+) create mode 100644 testable-simd-models/Cargo.toml create mode 100644 testable-simd-models/README.md create mode 100644 testable-simd-models/src/abstractions/bit.rs create mode 100644 testable-simd-models/src/abstractions/bitvec.rs create mode 100644 testable-simd-models/src/abstractions/funarr.rs create mode 100644 testable-simd-models/src/abstractions/mod.rs create mode 100644 testable-simd-models/src/abstractions/simd.rs create mode 100644 testable-simd-models/src/core_arch.rs create mode 100644 testable-simd-models/src/core_arch/arm_shared/mod.rs create mode 100644 testable-simd-models/src/core_arch/arm_shared/models/mod.rs create mode 100644 testable-simd-models/src/core_arch/arm_shared/models/neon.rs create mode 100644 testable-simd-models/src/core_arch/arm_shared/tests/mod.rs create mode 100644 testable-simd-models/src/core_arch/arm_shared/tests/neon.rs create mode 100644 testable-simd-models/src/core_arch/x86/mod.rs create mode 100644 testable-simd-models/src/core_arch/x86/models/avx.rs create mode 100644 testable-simd-models/src/core_arch/x86/models/avx2.rs create mode 100644 testable-simd-models/src/core_arch/x86/models/mod.rs create mode 100644 testable-simd-models/src/core_arch/x86/models/sse2.rs create mode 100644 testable-simd-models/src/core_arch/x86/models/ssse3.rs create mode 100644 testable-simd-models/src/core_arch/x86/tests/avx.rs create mode 100644 testable-simd-models/src/core_arch/x86/tests/avx2.rs create mode 100644 testable-simd-models/src/core_arch/x86/tests/mod.rs create mode 100644 testable-simd-models/src/core_arch/x86/tests/sse2.rs create mode 100644 testable-simd-models/src/core_arch/x86/tests/ssse3.rs create mode 100644 testable-simd-models/src/helpers.rs create mode 100644 testable-simd-models/src/lib.rs create mode 100755 testable-simd-models/test.sh diff --git a/.gitignore b/.gitignore index 39ad701a8883f..82d2291fd22b0 100644 --- a/.gitignore +++ b/.gitignore @@ -56,3 +56,4 @@ goto-transcoder # already existing elements were commented out #/target +testable-simd-models/target 
diff --git a/testable-simd-models/Cargo.toml b/testable-simd-models/Cargo.toml
new file mode 100644
index 0000000000000..6e2116fec82e0
--- /dev/null
+++ b/testable-simd-models/Cargo.toml
@@ -0,0 +1,16 @@
[package]
name = "testable-simd-models"
version = "0.0.2"
authors = ["Cryspen"]
license = "Apache-2.0"
homepage = "https://github.com/cryspen/verify-rust-std/testable-simd-models"
edition = "2021"
repository = "https://github.com/cryspen/verify-rust-std/testable-simd-models"
readme = "README.md"

[dependencies]
rand = "0.9"
pastey = "0.1.0"

[lints.rust]
unexpected_cfgs = { level = "warn" }
diff --git a/testable-simd-models/README.md b/testable-simd-models/README.md
new file mode 100644
index 0000000000000..d051de6145f4a
--- /dev/null
+++ b/testable-simd-models/README.md
@@ -0,0 +1,127 @@
# testable-simd-models

This crate contains executable, independently testable specifications
for the SIMD intrinsics provided by the `core::arch` library in Rust.
The structure of this crate is based on [rust-lang/stdarch/crates/core_arch](https://github.com/rust-lang/stdarch/tree/master/crates/core_arch).

## Code Structure
Within the `core_arch` folder in this crate, there is a separate
folder for each architecture for which we have written models.
In particular, it contains folders for `x86` and `arm_shared`.
Each such folder has two sub-folders, `models` and `tests`.

The `models` folder contains the models of the intrinsics, with one
file per target feature. The models are written using the various
abstractions implemented in `crate::abstractions`, especially those in
`crate::abstractions::simd`, and are meant to closely resemble their
implementations within the Rust core itself.

The `tests` folder contains the tests of these models, and is
structured the same way as `models`. Each file additionally contains
the definition of a macro that makes writing these tests easier. The
tests work by running each model and the corresponding intrinsic in
the Rust core on random inputs (generally 1000) and comparing their
outputs.

## Modeling Process
The process of adding a specific intrinsic's model goes as follows.
For this example, let us say the intrinsic we are adding is
`_mm256_bsrli_epi128` from the avx2 feature set.

1. We go to [rust-lang/stdarch/crates/core_arch/src/x86/](https://github.com/rust-lang/stdarch/tree/master/crates/core_arch/src/x86/), and find the implementation of the intrinsic in `avx2.rs`.

2. We see that the implementation looks like this:
```rust
/// Shifts 128-bit lanes in `a` right by `imm8` bytes while shifting in zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_bsrli_epi128)
#[inline]
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpsrldq, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_bsrli_epi128<const IMM8: i32>(a: __m256i) -> __m256i {
    static_assert_uimm_bits!(IMM8, 8);
    const fn mask(shift: i32, i: u32) -> u32 {
        let shift = shift as u32 & 0xff;
        if shift > 15 || (15 - (i % 16)) < shift {
            0
        } else {
            32 + (i + shift)
        }
    }
    unsafe {
        let a = a.as_i8x32();
        let r: i8x32 = simd_shuffle!(
            i8x32::ZERO,
            a,
            [
                mask(IMM8, 0),
                mask(IMM8, 1),
                mask(IMM8, 2),
                mask(IMM8, 3),
                ...
                mask(IMM8, 31),
            ],
        );
        transmute(r)
    }
}
```
Thus, we then go to `core_arch/x86/models/avx2.rs`, and add the implementation.
After some modification, it ends up looking like this:
```rust
/// Shifts 128-bit lanes in `a` right by `imm8` bytes while shifting in zeros.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_bsrli_epi128)

pub fn _mm256_bsrli_epi128<const IMM8: i32>(a: __m256i) -> __m256i {
    const fn mask(shift: i32, i: u32) -> u64 {
        let shift = shift as u32 & 0xff;
        if shift > 15 || (15 - (i % 16)) < shift {
            0 as u64
        } else {
            (32 + (i + shift)) as u64
        }
    }

    let a = BitVec::to_i8x32(a);
    let r: i8x32 = simd_shuffle(
        i8x32::from_fn(|_| 0),
        a,
        [
            mask(IMM8, 0),
            mask(IMM8, 1),
            mask(IMM8, 2),
            mask(IMM8, 3),
            ...
            mask(IMM8, 31),
        ],
    );
    r.into()
}
```

3. Next, we add a test for this intrinsic. For this, we navigate to `core_arch/x86/tests/avx2.rs`. Since the value of
   `IMM8` can be up to 8 bits, we want to test constant arguments up to 255. Thus, we write the following macro invocation:
   ```rust
   mk!([100]_mm256_bsrli_epi128{<0>,<1>,<2>,<3>,...,<255>}(a: BitVec));
   ```
   Here, the `[100]` means we test 100 random inputs for each constant value. This concludes the necessary steps for implementing an intrinsic.
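Conceptually, each test generated by `mk!` behaves like the following sketch. This is illustrative only: the real macro is defined alongside each test file, and the module path and the conversions between `BitVec<256>` and `__m256i` shown here are schematic, not the crate's actual API.

```rust
// Hypothetical expansion of the invocation above, for one constant value of IMM8.
#[test]
fn test_mm256_bsrli_epi128_imm8_3() {
    for _ in 0..100 {
        // Draw a random 256-bit input.
        let a: BitVec<256> = BitVec::rand();
        // Run the model on it...
        let model_out = models::avx2::_mm256_bsrli_epi128::<3>(a);
        // ...and the real intrinsic on the same bits, then compare.
        let real_out: BitVec<256> =
            unsafe { core::arch::x86_64::_mm256_bsrli_epi128::<3>(a.into()) }.into();
        assert_eq!(model_out, real_out);
    }
}
```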
## Contributing Models

To contribute new models of intrinsics, we expect the author to follow
the above steps and provide comprehensive tests. It is important that
the model author look carefully at both the Intel/ARM specification
and the Rust `stdarch` implementation, because the Rust implementation
may not necessarily be correct.

Indeed, the previous implementation of `_mm256_bsrli_epi128` (and a
similar intrinsic called `_mm512_bsrli_epi128`) in `stdarch` had a
bug, which we found during the process of modeling and testing this
intrinsic. This bug was [reported by
us](https://github.com/rust-lang/stdarch/issues/1822) using a failing
test case generated from the testable model and then fixed by [our
PR](https://github.com/rust-lang/stdarch/pull/1823) in the 2025-06-30
version of `stdarch`.
diff --git a/testable-simd-models/src/abstractions/bit.rs b/testable-simd-models/src/abstractions/bit.rs
new file mode 100644
index 0000000000000..4fac19fdcd567
--- /dev/null
+++ b/testable-simd-models/src/abstractions/bit.rs
@@ -0,0 +1,204 @@
//! # Bit Manipulation and Machine Integer Utilities
//!
//! This module provides utilities for working with individual bits and machine integer types.
//! It defines a [`Bit`] enum to represent a single bit (`0` or `1`) along with convenient
//! conversion implementations between `Bit`, [`bool`], and various primitive integer types.
//!
//! In addition, the module introduces the [`MachineInteger`] trait which abstracts over
//! integer types, providing:
//!
//! - `bits()`: the size of the integer type in bits.
//! - `SIGNED`: a flag indicating whether the type is signed.
//!
//! The [`Bit`] type includes methods for extracting the value of a specific bit from an integer.
//! For example, [`Bit::of_int`] returns the bit at a given position for a provided integer,
//! handling both positive and negative values (assuming a two's complement representation).
//!
//! # Examples
//!
//! ```rust
//! use testable_simd_models::abstractions::bit::{Bit, MachineInteger};
//!
//! // Extract the 3rd bit (0-indexed) from an integer.
//! let bit = Bit::of_int(42, 2);
//! println!("The extracted bit is: {:?}", bit);
//!
//! // Convert Bit to a primitive integer type.
//! let num: u8 = bit.into();
//! println!("As an integer: {}", num);
//! ```
//!
//! [`bool`]: https://doc.rust-lang.org/std/primitive.bool.html
//! [`Bit::of_int`]: enum.Bit.html#method.of_int

/// Represents a bit: `0` or `1`.
#[derive(Copy, Clone, Eq, PartialEq, Debug)]
pub enum Bit {
    Zero,
    One,
}
impl std::ops::BitAnd for Bit {
    type Output = Self;
    fn bitand(self, rhs: Self) -> Self {
        match self {
            Bit::Zero => Bit::Zero,
            Bit::One => rhs,
        }
    }
}

impl std::ops::BitOr for Bit {
    type Output = Self;
    fn bitor(self, rhs: Self) -> Self {
        match self {
            Bit::Zero => rhs,
            Bit::One => Bit::One,
        }
    }
}

impl std::ops::BitXor for Bit {
    type Output = Self;
    fn bitxor(self, rhs: Self) -> Self {
        match (self, rhs) {
            (Bit::Zero, Bit::Zero) => Bit::Zero,
            (Bit::One, Bit::One) => Bit::Zero,
            _ => Bit::One,
        }
    }
}

impl std::ops::Neg for Bit {
    type Output = Self;
    fn neg(self) -> Self {
        match self {
            Bit::One => Bit::Zero,
            Bit::Zero => Bit::One,
        }
    }
}
macro_rules! generate_from_bit_impls {
    ($($ty:ident),*) => {
        $(impl From<Bit> for $ty {
            fn from(bit: Bit) -> Self {
                bool::from(bit) as $ty
            }
        })*
    };
}
generate_from_bit_impls!(u8, u16, u32, u64, u128, i8, i16, i32, i64, i128);

impl From<Bit> for bool {
    fn from(bit: Bit) -> Self {
        match bit {
            Bit::Zero => false,
            Bit::One => true,
        }
    }
}

impl From<bool> for Bit {
    fn from(b: bool) -> Bit {
        match b {
            false => Bit::Zero,
            true => Bit::One,
        }
    }
}

/// A trait for types that represent machine integers.
pub trait MachineInteger {
    /// The size of this integer type in bits.
    fn bits() -> u32;

    /// The signedness of this integer type.
    const SIGNED: bool;
    /// Element of the integer type with every bit as 0.
    const ZEROS: Self;
    /// Element of the integer type with every bit as 1.
    const ONES: Self;
    /// Minimum value of the integer type.
    const MIN: Self;
    /// Maximum value of the integer type.
    const MAX: Self;

    /// Implements functionality for `simd_add` in `crate::abstractions::simd`.
    fn wrapping_add(self, rhs: Self) -> Self;
    /// Implements functionality for `simd_sub` in `crate::abstractions::simd`.
    fn wrapping_sub(self, rhs: Self) -> Self;
    /// Implements functionality for `simd_mul` in `crate::abstractions::simd`.
    fn overflowing_mul(self, rhs: Self) -> Self;
    /// Implements functionality for `simd_saturating_add` in `crate::abstractions::simd`.
    fn saturating_add(self, rhs: Self) -> Self;
    /// Implements functionality for `simd_saturating_sub` in `crate::abstractions::simd`.
    fn saturating_sub(self, rhs: Self) -> Self;
    /// Implements functionality for `simd_abs_diff` in `crate::abstractions::simd`.
    fn absolute_diff(self, rhs: Self) -> Self;
    /// Implements functionality for `simd_abs` in `crate::abstractions::simd`.
    fn absolute_val(self) -> Self;
}
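// Example (illustrative; these assertions are not part of the original file):
// the wrapping and saturating behaviour that `MachineInteger` exposes and the
// simd models below rely on.
//
//     assert_eq!(100i8.wrapping_add(100), -56);
//     assert_eq!(100i8.saturating_add(100), 127);
//     assert_eq!((-1i8).absolute_diff(2i8), 3);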
macro_rules! generate_imachine_integer_impls {
    ($($ty:ident),*) => {
        $(
            impl MachineInteger for $ty {
                const SIGNED: bool = true;
                const ZEROS: $ty = 0;
                const ONES: $ty = -1;
                const MIN: $ty = $ty::MIN;
                const MAX: $ty = $ty::MAX;
                fn bits() -> u32 { $ty::BITS }
                fn wrapping_add(self, rhs: Self) -> Self { self.wrapping_add(rhs) }
                fn wrapping_sub(self, rhs: Self) -> Self { self.wrapping_sub(rhs) }
                fn overflowing_mul(self, rhs: Self) -> Self { self.overflowing_mul(rhs).0 }
                fn saturating_add(self, rhs: Self) -> Self { self.saturating_add(rhs) }
                fn saturating_sub(self, rhs: Self) -> Self { self.saturating_sub(rhs) }
                fn absolute_diff(self, rhs: Self) -> Self {
                    if self > rhs {
                        $ty::wrapping_sub(self, rhs)
                    } else {
                        $ty::wrapping_sub(rhs, self)
                    }
                }
                fn absolute_val(self) -> Self {
                    if self == $ty::MIN { self } else { self.abs() }
                }
            }
        )*
    };
}

macro_rules! generate_umachine_integer_impls {
    ($($ty:ident),*) => {
        $(
            impl MachineInteger for $ty {
                const SIGNED: bool = false;
                const ZEROS: $ty = 0;
                const ONES: $ty = $ty::MAX;
                const MIN: $ty = $ty::MIN;
                const MAX: $ty = $ty::MAX;

                fn bits() -> u32 { $ty::BITS }
                fn wrapping_add(self, rhs: Self) -> Self { self.wrapping_add(rhs) }
                fn wrapping_sub(self, rhs: Self) -> Self { self.wrapping_sub(rhs) }
                fn overflowing_mul(self, rhs: Self) -> Self { self.overflowing_mul(rhs).0 }
                fn saturating_add(self, rhs: Self) -> Self { self.saturating_add(rhs) }
                fn saturating_sub(self, rhs: Self) -> Self { self.saturating_sub(rhs) }
                fn absolute_diff(self, rhs: Self) -> Self {
                    if self > rhs { self - rhs } else { rhs - self }
                }
                fn absolute_val(self) -> Self { self }
            }
        )*
    };
}
generate_imachine_integer_impls!(i8, i16, i32, i64, i128);
generate_umachine_integer_impls!(u8, u16, u32, u64, u128);

impl Bit {
    fn of_raw_int(x: u128, nth: u32) -> Self {
        if x / 2u128.pow(nth) % 2 == 1 {
            Self::One
        } else {
            Self::Zero
        }
    }

    pub fn of_int<T: Into<i128> + MachineInteger>(x: T, nth: u32) -> Bit {
        let x: i128 = x.into();
        if x >= 0 {
            Self::of_raw_int(x as u128, nth)
        } else {
            Self::of_raw_int((2i128.pow(T::bits()) + x) as u128, nth)
        }
    }
}
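// Example (illustrative, not part of the original file): `of_int` reads the
// bits of negative values in their two's complement representation.
//
//     assert_eq!(Bit::of_int(-1i8, 0), Bit::One); // all eight bits of -1i8 are one
//     assert_eq!(Bit::of_int(-1i8, 7), Bit::One);
//     assert_eq!(Bit::of_int(2i8, 0), Bit::Zero); // 2 = 0b10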
diff --git a/testable-simd-models/src/abstractions/bitvec.rs b/testable-simd-models/src/abstractions/bitvec.rs
new file mode 100644
index 0000000000000..0f3003f4beadc
--- /dev/null
+++ b/testable-simd-models/src/abstractions/bitvec.rs
@@ -0,0 +1,155 @@
//! This module provides a specification-friendly bit vector type.
use super::bit::{Bit, MachineInteger};
use super::funarr::*;

use std::fmt::Formatter;

/// A fixed-size bit vector type.
///
/// `BitVec<N>` is a specification-friendly, fixed-length bit vector that internally
/// stores an array of [`Bit`] values, where each `Bit` represents a single binary digit (0 or 1).
///
/// This type provides several utility methods for constructing and converting bit vectors.
///
/// The [`Debug`] implementation for `BitVec` pretty-prints the bits in groups of eight,
/// making the bit pattern more human-readable. The type also implements indexing,
/// allowing for easy access to individual bits.
#[derive(Copy, Clone, Eq, PartialEq)]
pub struct BitVec<const N: u64>(FunArray<N, Bit>);

/// Pretty-prints a bit slice in groups of 8.
fn bit_slice_to_string(bits: &[Bit]) -> String {
    bits.iter()
        .map(|bit| match bit {
            Bit::Zero => '0',
            Bit::One => '1',
        })
        .collect::<Vec<_>>()
        .chunks(8)
        .map(|bits| bits.iter().collect::<String>())
        .map(|s| format!("{s} "))
        .collect::<String>()
        .trim()
        .into()
}

impl<const N: u64> core::fmt::Debug for BitVec<N> {
    fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), std::fmt::Error> {
        write!(f, "{}", bit_slice_to_string(&self.0.as_vec()))
    }
}

impl<const N: u64> core::ops::Index<u64> for BitVec<N> {
    type Output = Bit;
    fn index(&self, index: u64) -> &Self::Output {
        self.0.get(index)
    }
}

/// Converts a bit slice into an unsigned number.
fn u128_int_from_bit_slice(bits: &[Bit]) -> u128 {
    bits.iter()
        .enumerate()
        .map(|(i, bit)| u128::from(*bit) << i)
        .sum::<u128>()
}

/// Converts a bit slice into a machine integer of type `T`.
fn int_from_bit_slice<T: TryFrom<i128> + MachineInteger + Copy>(bits: &[Bit]) -> T {
    debug_assert!(bits.len() <= T::bits() as usize);
    let result = if T::SIGNED {
        let is_negative = matches!(bits[T::bits() as usize - 1], Bit::One);
        let s = u128_int_from_bit_slice(&bits[0..T::bits() as usize - 1]) as i128;
        if is_negative {
            s + (-2i128).pow(T::bits() - 1)
        } else {
            s
        }
    } else {
        u128_int_from_bit_slice(bits) as i128
    };
    let Ok(n) = result.try_into() else {
        // Conversion must succeed as `result` is guaranteed to be in range due to the bit-length check.
        unreachable!()
    };
    n
}
impl<const N: u64> BitVec<N> {
    /// Constructor for `BitVec`. `BitVec::<N>::from_fn` constructs a bit vector out of a function
    /// that takes indices smaller than `N` and produces bits.
    pub fn from_fn<F: Fn(u64) -> Bit>(f: F) -> Self {
        Self(FunArray::from_fn(f))
    }
    /// Converts a slice of machine integers, of which only the `d` least significant bits are relevant.
    pub fn from_slice<T: Into<i128> + MachineInteger + Copy>(x: &[T], d: u64) -> Self {
        Self::from_fn(|i| Bit::of_int::<T>(x[(i / d) as usize], (i % d) as u32))
    }

    /// Constructs a BitVec out of a machine integer.
    pub fn from_int<T: Into<i128> + MachineInteger + Copy>(n: T) -> Self {
        Self::from_slice::<T>(&[n], T::bits() as u64)
    }

    /// Converts a BitVec into a machine integer of type `T`.
    pub fn to_int<T: TryFrom<i128> + MachineInteger + Copy>(self) -> T {
        int_from_bit_slice(&self.0.as_vec())
    }

    /// Converts a BitVec into a vector of machine integers of type `T`.
    pub fn to_vec<T: TryFrom<i128> + MachineInteger + Copy>(&self) -> Vec<T> {
        self.0
            .as_vec()
            .chunks(T::bits() as usize)
            .map(int_from_bit_slice)
            .collect()
    }

    /// Generates a random BitVec.
    pub fn rand() -> Self {
        use rand::prelude::*;
        let random_source: Vec<_> = {
            let mut rng = rand::rng();
            (0..N).map(|_| rng.random::<bool>()).collect()
        };
        Self::from_fn(|i| random_source[i as usize].into())
    }
}

impl<const N: u64> BitVec<N> {
    pub fn chunked_shift<const CHUNK: u64, const SHIFTS: u64>(
        self,
        shl: FunArray<SHIFTS, i128>,
    ) -> BitVec<N> {
        fn chunked_shift<const N: u64, const CHUNK: u64, const SHIFTS: u64>(
            bitvec: BitVec<N>,
            shl: FunArray<SHIFTS, i128>,
        ) -> BitVec<N> {
            BitVec::from_fn(|i| {
                let nth_bit = i % CHUNK;
                let nth_chunk = i / CHUNK;
                let shift: i128 = if nth_chunk < SHIFTS {
                    shl[nth_chunk]
                } else {
                    0
                };
                let local_index = (nth_bit as i128).wrapping_sub(shift);
                if local_index < CHUNK as i128 && local_index >= 0 {
                    let local_index = local_index as u64;
                    bitvec[nth_chunk * CHUNK + local_index]
                } else {
                    Bit::Zero
                }
            })
        }
        chunked_shift::<N, CHUNK, SHIFTS>(self, shl)
    }
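    // Illustrative usage sketch (not part of the original file), assuming the
    // generic parameters reconstructed above: shift the two 8-bit chunks of a
    // 16-bit vector left by 1 and 3 bits respectively.
    //
    //     let v = BitVec::<16>::from_int(0x0101u16);
    //     let shifts = FunArray::<2, i128>::from_fn(|i| if i == 0 { 1 } else { 3 });
    //     assert_eq!(v.chunked_shift::<8, 2>(shifts).to_int::<u16>(), 0x0802);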
    /// Folds over the array, accumulating a result.
    ///
    /// # Arguments
    /// * `init` - The initial value of the accumulator.
    /// * `f` - A function combining the accumulator and each element.
    pub fn fold<A>(&self, init: A, f: fn(A, Bit) -> A) -> A {
        self.0.fold(init, f)
    }
}
diff --git a/testable-simd-models/src/abstractions/funarr.rs b/testable-simd-models/src/abstractions/funarr.rs
new file mode 100644
index 0000000000000..4c120addcb0c5
--- /dev/null
+++ b/testable-simd-models/src/abstractions/funarr.rs
@@ -0,0 +1,79 @@
//! This module implements a fixed-size array wrapper with functional semantics,
//! which is used in formulating the abstractions.

/// `FunArray<N, T>` represents an array of `T` values of length `N`, where `N` is a compile-time constant.
/// Internally, it uses a fixed-length array of `Option<T>` with a maximum capacity of 512 elements.
/// Unused elements beyond `N` are filled with `None`.
#[derive(Copy, Clone, Eq, PartialEq)]
pub struct FunArray<const N: u64, T>([Option<T>; 512]);

impl<const N: u64, T> FunArray<N, T> {
    /// Gets a reference to the element at index `i`.
    pub fn get(&self, i: u64) -> &T {
        self.0[i as usize].as_ref().unwrap()
    }
    /// Constructor for `FunArray`. `FunArray::<N, T>::from_fn` constructs a funarray out of a
    /// function that takes indices smaller than `N` and produces an element of type `T`.
    pub fn from_fn<F: Fn(u64) -> T>(f: F) -> Self {
        // let vec = (0..N).map(f).collect();
        let arr = core::array::from_fn(|i| {
            if (i as u64) < N {
                Some(f(i as u64))
            } else {
                None
            }
        });
        Self(arr)
    }

    /// Converts the `FunArray` into a `Vec<T>`.
    pub fn as_vec(&self) -> Vec<T>
    where
        T: Clone,
    {
        self.0[0..(N as usize)]
            .iter()
            .cloned()
            .map(|x| x.unwrap())
            .collect()
    }

    /// Folds over the array, accumulating a result.
    ///
    /// # Arguments
    /// * `init` - The initial value of the accumulator.
    /// * `f` - A function combining the accumulator and each element.
    pub fn fold<A>(&self, mut init: A, f: fn(A, T) -> A) -> A
    where
        T: Clone,
    {
        for i in 0..N {
            init = f(init, self[i].clone());
        }
        init
    }
}

impl<const N: u64, T: Clone> TryFrom<Vec<T>> for FunArray<N, T> {
    type Error = ();
    fn try_from(v: Vec<T>) -> Result<Self, Self::Error> {
        if (v.len() as u64) < N {
            Err(())
        } else {
            Ok(Self::from_fn(|i| v[i as usize].clone()))
        }
    }
}

impl<const N: u64, T: Clone + core::fmt::Debug> core::fmt::Debug for FunArray<N, T> {
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        write!(f, "{:?}", self.as_vec())
    }
}

impl<const N: u64, T> core::ops::Index<u64> for FunArray<N, T> {
    type Output = T;

    fn index(&self, index: u64) -> &Self::Output {
        self.get(index)
    }
}
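// Example (illustrative, not part of the original file): constructing,
// indexing, and folding a small FunArray.
//
//     let a = FunArray::<4, u64>::from_fn(|i| i * 10); // [0, 10, 20, 30]
//     assert_eq!(a[2], 20);
//     assert_eq!(a.fold(0, |acc, x| acc + x), 60);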
diff --git a/testable-simd-models/src/abstractions/mod.rs b/testable-simd-models/src/abstractions/mod.rs
new file mode 100644
index 0000000000000..b3018a8189569
--- /dev/null
+++ b/testable-simd-models/src/abstractions/mod.rs
@@ -0,0 +1,26 @@
//! This module provides abstractions that are useful for writing
//! specifications for the intrinsics: bits, bit vectors, and functional
//! arrays, along with models of the SIMD compiler intrinsics.
//!
//! # Examples
//!
//! Converting an integer to a bit vector and back:
//!
//! ```rust
//! use testable_simd_models::abstractions::{bit::{Bit, MachineInteger}, bitvec::BitVec};
//!
//! // Create a BitVec from a machine integer (using the integer's bit-width)
//! let bv = BitVec::<16>::from_int(42u16);
//! println!("BitVec: {:?}", bv);
//!
//! // Convert the BitVec back into a machine integer
//! let n: u16 = bv.to_int();
//! println!("Integer: {}", n);
//!
//! assert!(n == 42);
//! ```

pub mod bit;
pub mod bitvec;
pub mod funarr;
pub mod simd;
diff --git a/testable-simd-models/src/abstractions/simd.rs b/testable-simd-models/src/abstractions/simd.rs
new file mode 100644
index 0000000000000..08b1b21bce34d
--- /dev/null
+++ b/testable-simd-models/src/abstractions/simd.rs
@@ -0,0 +1,938 @@
//! Models of SIMD compiler intrinsics.
//!
//! Operations are defined on [`FunArray`]s.

use crate::abstractions::{bit::MachineInteger, bitvec::*, funarr::*};
use std::convert::*;
use std::ops::*;

#[allow(dead_code)]
/// Derives interpretation functions and type synonyms.
macro_rules! interpretations {
($n:literal; $($name:ident [$ty:ty; $m:literal]),*) => {
    $(
        #[doc = concat!(stringify!($ty), " vectors of size ", stringify!($m))]
        #[allow(non_camel_case_types)]
        pub type $name = FunArray<$m, $ty>;
        pastey::paste! {
            const _: () = {
                impl BitVec<$n> {
                    #[doc = concat!("Conversion from ", stringify!($ty), " vectors of size ", stringify!($m), " to bit vectors of size ", stringify!($n))]
                    pub fn [< from_ $name >](iv: $name) -> BitVec<$n> {
                        let vec: Vec<$ty> = iv.as_vec();
                        Self::from_slice(&vec[..], <$ty>::bits() as u64)
                    }
                    #[doc = concat!("Conversion from bit vectors of size ", stringify!($n), " to ", stringify!($ty), " vectors of size ", stringify!($m))]
                    pub fn [< to_ $name >](bv: BitVec<$n>) -> $name {
                        let vec: Vec<$ty> = bv.to_vec();
                        $name::from_fn(|i| vec[i as usize])
                    }
                }

                impl From<BitVec<$n>> for $name {
                    fn from(bv: BitVec<$n>) -> Self {
                        BitVec::[< to_ $name >](bv)
                    }
                }

                impl From<$name> for BitVec<$n> {
                    fn from(iv: $name) -> Self {
                        BitVec::[< from_ $name >](iv)
                    }
                }

                impl $name {
                    pub fn splat(value: $ty) -> Self {
                        FunArray::from_fn(|_| value)
                    }
                }
            };
        }
    )*
};
}

interpretations!(256; i32x8 [i32; 8], i64x4 [i64; 4], i16x16 [i16; 16], i128x2 [i128; 2], i8x32 [i8; 32],
    u32x8 [u32; 8], u64x4 [u64; 4], u16x16 [u16; 16], u8x32 [u8; 32]);
interpretations!(128; i32x4 [i32; 4], i64x2 [i64; 2], i16x8 [i16; 8], i128x1 [i128; 1], i8x16 [i8; 16],
    u32x4 [u32; 4], u64x2 [u64; 2], u16x8 [u16; 8], u8x16 [u8; 16]);

interpretations!(512; u32x16 [u32; 16], u16x32 [u16; 32], i32x16 [i32; 16], i16x32 [i16; 32]);
interpretations!(64; i64x1 [i64; 1], i32x2 [i32; 2], i16x4 [i16; 4], i8x8 [i8; 8], u64x1 [u64; 1], u32x2 [u32; 2], u16x4 [u16; 4], u8x8 [u8; 8]);
interpretations!(32; i8x4 [i8; 4], u8x4 [u8; 4]);

/// Inserts an element into a vector, returning the updated vector.
///
/// # Safety
///
/// `idx` must be in-bounds of the vector, i.e. `idx < N`.
pub fn simd_insert<const N: u64, T: Copy>(x: FunArray<N, T>, idx: u64, val: T) -> FunArray<N, T> {
    FunArray::from_fn(|i| if i == idx { val } else { x[i] })
}

/// Extracts an element from a vector.
///
/// # Safety
///
/// `idx` must be in-bounds of the vector, i.e. `idx < N`.
pub fn simd_extract<const N: u64, T: Clone>(x: FunArray<N, T>, idx: u64) -> T {
    x.get(idx).clone()
}

/// Adds two vectors elementwise with wrapping on overflow/underflow.
pub fn simd_add<const N: u64, T: MachineInteger + Copy>(
    x: FunArray<N, T>,
    y: FunArray<N, T>,
) -> FunArray<N, T> {
    FunArray::from_fn(|i| x[i].wrapping_add(y[i]))
}

/// Subtracts `rhs` from `lhs` elementwise with wrapping on overflow/underflow.
pub fn simd_sub<const N: u64, T: MachineInteger + Copy>(
    x: FunArray<N, T>,
    y: FunArray<N, T>,
) -> FunArray<N, T> {
    FunArray::from_fn(|i| x[i].wrapping_sub(y[i]))
}

/// Multiplies two vectors elementwise with wrapping on overflow/underflow.
pub fn simd_mul<const N: u64, T: MachineInteger + Copy>(
    x: FunArray<N, T>,
    y: FunArray<N, T>,
) -> FunArray<N, T> {
    FunArray::from_fn(|i| x[i].overflowing_mul(y[i]))
}
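// Example (illustrative, not part of the original file): elementwise wrapping
// arithmetic on one of the small vector types defined above.
//
//     let x = u8x4::from_fn(|i| 250 + i as u8); // [250, 251, 252, 253]
//     let y = u8x4::splat(10);
//     let s = simd_add(x, y);                   // [4, 5, 6, 7], wrapping mod 256
//     assert_eq!(s.as_vec(), vec![4, 5, 6, 7]);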
/// Produces the elementwise absolute values.
///
/// For vectors of unsigned integers it returns the vector untouched.
/// If an element is the minimum value of a signed integer, it is returned as is.
pub fn simd_abs<const N: u64, T: MachineInteger + Copy>(x: FunArray<N, T>) -> FunArray<N, T> {
    FunArray::from_fn(|i| x[i].absolute_val())
}

/// Produces the elementwise absolute difference of two vectors.
///
/// Note: the absolute difference here is simply the element with the smaller value
/// subtracted from the element with the larger value, with overflow/underflow.
/// For example, with `i8` elements, the absolute difference of 127 and -2 is 129,
/// which wraps to -127.
pub fn simd_abs_diff<const N: u64, T: MachineInteger + Copy>(
    x: FunArray<N, T>,
    y: FunArray<N, T>,
) -> FunArray<N, T> {
    FunArray::from_fn(|i| x[i].absolute_diff(y[i]))
}

/// Shifts vector left elementwise, with UB on overflow.
///
/// # Safety
///
/// Each element of `rhs` must be less than the bit width of the element type.
pub fn simd_shl<const N: u64, T: Shl + Copy>(
    x: FunArray<N, T>,
    y: FunArray<N, T>,
) -> FunArray<N, <T as Shl>::Output> {
    FunArray::from_fn(|i| x[i] << y[i])
}

/// Shifts vector right elementwise, with UB on overflow.
///
/// Shifts `lhs` right by `rhs`, shifting in sign bits for signed types.
///
/// # Safety
///
/// Each element of `rhs` must be less than the bit width of the element type.
pub fn simd_shr<const N: u64, T: Shr + Copy>(
    x: FunArray<N, T>,
    y: FunArray<N, T>,
) -> FunArray<N, <T as Shr>::Output> {
    FunArray::from_fn(|i| x[i] >> y[i])
}

/// "Ands" vectors elementwise.
pub fn simd_and<const N: u64, T: BitAnd + Copy>(
    x: FunArray<N, T>,
    y: FunArray<N, T>,
) -> FunArray<N, <T as BitAnd>::Output> {
    FunArray::from_fn(|i| x[i] & y[i])
}

/// "Ors" vectors elementwise.
pub fn simd_or<const N: u64, T: BitOr + Copy>(
    x: FunArray<N, T>,
    y: FunArray<N, T>,
) -> FunArray<N, <T as BitOr>::Output> {
    FunArray::from_fn(|i| x[i] | y[i])
}

/// "Exclusive ors" vectors elementwise.
pub fn simd_xor<const N: u64, T: BitXor + Copy>(
    x: FunArray<N, T>,
    y: FunArray<N, T>,
) -> FunArray<N, <T as BitXor>::Output> {
    FunArray::from_fn(|i| x[i] ^ y[i])
}

pub trait CastsFrom<T> {
    fn cast(a: T) -> Self;
}
pub trait TruncateFrom<T> {
    /// Truncates into [`Self`] from a larger integer.
    fn truncate_from(v: T) -> Self;
}

macro_rules! from_impls {
    ($([$ty1:ty, $ty2: ty]),*) => {
        $(
            impl CastsFrom<$ty2> for $ty1 {
                fn cast(a: $ty2) -> $ty1 {
                    <$ty1>::from(a)
                }
            }
        )*
    };
}
macro_rules! truncate_from_order {
    ($t:ty, $($from:ty),+) => {
        $(
            impl TruncateFrom<$from> for $t {
                #[inline]
                fn truncate_from(v: $from) -> $t { v as $t }
            }
        )*
        truncate_from_order!($($from),+);
    };

    ($t:ty) => {};
}
truncate_from_order!(u8, u16, u32, u64, u128);
truncate_from_order!(i8, i16, i32, i64, i128);

macro_rules! truncate_from_impls {
    ($([$ty1:ty, $ty2: ty]),*) => {
        $(
            impl CastsFrom<$ty2> for $ty1 {
                fn cast(a: $ty2) -> $ty1 {
                    <$ty1>::truncate_from(a)
                }
            }
        )*
    };
}

macro_rules! symm_impls {
    ($([$ty1:ty, $ty2: ty]),*) => {
        $(
            impl CastsFrom<$ty2> for $ty1 {
                fn cast(a: $ty2) -> $ty1 {
                    a as $ty1
                }
            }
            impl CastsFrom<$ty1> for $ty2 {
                fn cast(a: $ty1) -> $ty2 {
                    a as $ty2
                }
            }
        )*
    };
}
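// Example (illustrative, not part of the original file): truncation keeps the
// low bits of the wider value.
//
//     assert_eq!(u8::truncate_from(0x1234u16), 0x34);
//     assert_eq!(i8::truncate_from(-300i16), -44); // -300 mod 256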
macro_rules! self_impls {
    ($($ty1:ty),*) => {
        $(
            impl CastsFrom<$ty1> for $ty1 {
                fn cast(a: $ty1) -> $ty1 {
                    a
                }
            }
        )*
    };
}
from_impls!(
    [u16, u8],
    [u32, u8],
    [u32, u16],
    [u64, u8],
    [u64, u16],
    [u64, u32],
    [u128, u8],
    [u128, u16],
    [u128, u32],
    [u128, u64],
    [i16, i8],
    [i32, i8],
    [i32, i16],
    [i64, i8],
    [i64, i16],
    [i64, i32],
    [i128, i8],
    [i128, i16],
    [i128, i32],
    [i128, i64]
);
truncate_from_impls!(
    [u8, u16],
    [u8, u32],
    [u16, u32],
    [u8, u64],
    [u16, u64],
    [u32, u64],
    [u8, u128],
    [u16, u128],
    [u32, u128],
    [u64, u128],
    [i8, i16],
    [i8, i32],
    [i16, i32],
    [i8, i64],
    [i16, i64],
    [i32, i64],
    [i8, i128],
    [i16, i128],
    [i32, i128],
    [i64, i128]
);

symm_impls!([u8, i8], [u16, i16], [u32, i32], [u64, i64], [u128, i128]);

self_impls!(u8, u16, u32, u64, u128, i8, i16, i32, i64, i128);

// Would like to do the below instead of using the above macros, but currently this is an active issue in Rust (#31844)
// impl<T, U> CastsFrom<T> for U
// where
//     U: From<T>,
// {
//     fn cast(a: T) -> U {
//         U::from(a)
//     }
// }

// impl<T, U> CastsFrom<T> for U
// where
//     U: TruncateFrom<T>,
// {
//     fn cast(a: T) -> U {
//         U::truncate_from(a)
//     }
// }

/// Numerically casts a vector, elementwise.
///
/// Casting is defined between integers of the same signedness (widening or
/// narrowing) and between integers of the same width (of either signedness).
///
/// When casting to a narrower type, the higher bits are removed; when casting
/// to a wider type, the value is extended, following the signedness of the source.
pub fn simd_cast<const N: u64, T1: Copy, T2: CastsFrom<T1>>(x: FunArray<N, T1>) -> FunArray<N, T2> {
    FunArray::from_fn(|i| T2::cast(x[i]))
}

/// Negates a vector elementwise.
///
/// Rust's negation panics for `T::MIN` due to overflow, but here, `T::MIN` is
/// returned as is.
pub fn simd_neg<const N: u64, T: From<<T as Neg>::Output> + MachineInteger + Eq + Neg + Copy>(
    x: FunArray<N, T>,
) -> FunArray<N, T> {
    FunArray::from_fn(|i| {
        if x[i] == T::MIN {
            T::MIN
        } else {
            T::from(-x[i])
        }
    })
}
/// Tests elementwise equality of two vectors.
///
/// Returns `0` (all zeros) for false and `!0` (all ones) for true.
pub fn simd_eq<const N: u64, T: MachineInteger + PartialEq + Copy>(
    x: FunArray<N, T>,
    y: FunArray<N, T>,
) -> FunArray<N, T> {
    FunArray::from_fn(|i| if x[i] == y[i] { T::ONES } else { T::ZEROS })
}

/// Tests elementwise inequality of two vectors.
///
/// Returns `0` (all zeros) for false and `!0` (all ones) for true.
pub fn simd_ne<const N: u64, T: MachineInteger + PartialEq + Copy>(
    x: FunArray<N, T>,
    y: FunArray<N, T>,
) -> FunArray<N, T> {
    FunArray::from_fn(|i| if x[i] != y[i] { T::ONES } else { T::ZEROS })
}

/// Tests if `x` is less than `y`, elementwise.
///
/// Returns `0` (all zeros) for false and `!0` (all ones) for true.
pub fn simd_lt<const N: u64, T: MachineInteger + PartialOrd + Copy>(
    x: FunArray<N, T>,
    y: FunArray<N, T>,
) -> FunArray<N, T> {
    FunArray::from_fn(|i| if x[i] < y[i] { T::ONES } else { T::ZEROS })
}

/// Tests if `x` is less than or equal to `y`, elementwise.
///
/// Returns `0` (all zeros) for false and `!0` (all ones) for true.
pub fn simd_le<const N: u64, T: MachineInteger + PartialOrd + Copy>(
    x: FunArray<N, T>,
    y: FunArray<N, T>,
) -> FunArray<N, T> {
    FunArray::from_fn(|i| if x[i] <= y[i] { T::ONES } else { T::ZEROS })
}

/// Tests if `x` is greater than `y`, elementwise.
///
/// Returns `0` (all zeros) for false and `!0` (all ones) for true.
pub fn simd_gt<const N: u64, T: MachineInteger + PartialOrd + Copy>(
    x: FunArray<N, T>,
    y: FunArray<N, T>,
) -> FunArray<N, T> {
    FunArray::from_fn(|i| if x[i] > y[i] { T::ONES } else { T::ZEROS })
}
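// Example (illustrative, not part of the original file): comparisons produce
// lane masks, which `simd_select` (defined further below) can consume.
//
//     let x = i8x4::from_fn(|i| i as i8); // [0, 1, 2, 3]
//     let y = i8x4::splat(2);
//     let m = simd_lt(x, y);              // [-1, -1, 0, 0]
//     let z = simd_select(m, x, y);       // elementwise min: [0, 1, 2, 2]
//     assert_eq!(z.as_vec(), vec![0, 1, 2, 2]);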
/// Tests if `x` is greater than or equal to `y`, elementwise.
///
/// Returns `0` (all zeros) for false and `!0` (all ones) for true.
pub fn simd_ge<const N: u64, T: MachineInteger + PartialOrd + Copy>(
    x: FunArray<N, T>,
    y: FunArray<N, T>,
) -> FunArray<N, T> {
    FunArray::from_fn(|i| if x[i] >= y[i] { T::ONES } else { T::ZEROS })
}

/// Shuffles two vectors by the indices in `idx`.
///
/// For safety, every index in `idx` must be less than `N1 + N3`, and the
/// output length must not exceed the length `N2` of `idx`.
pub fn simd_shuffle<const N1: u64, const N2: usize, const N3: u64, const N4: u64, T: Copy>(
    x: FunArray<N1, T>,
    y: FunArray<N3, T>,
    idx: [u64; N2],
) -> FunArray<N4, T> {
    FunArray::from_fn(|i| {
        let i = idx[i as usize];
        if i < N1 {
            x[i]
        } else {
            y[i - N1]
        }
    })
}

/// Adds two vectors elementwise, with saturation.
pub fn simd_saturating_add<const N: u64, T: MachineInteger + Copy>(
    x: FunArray<N, T>,
    y: FunArray<N, T>,
) -> FunArray<N, T> {
    FunArray::from_fn(|i| x[i].saturating_add(y[i]))
}

/// Subtracts `y` from `x` elementwise, with saturation.
pub fn simd_saturating_sub<const N: u64, T: MachineInteger + Copy>(
    x: FunArray<N, T>,
    y: FunArray<N, T>,
) -> FunArray<N, T> {
    FunArray::from_fn(|i| x[i].saturating_sub(y[i]))
}

/// Truncates an integer vector to a bitmask.
///
/// This macro expands to an expression equivalent to truncating an integer
/// vector to a bitmask, as it would on little-endian systems.
///
/// The macro takes 3 arguments.
/// The first is the highest index of the vector.
/// The second is the vector itself, which should contain only `0` and `!0`.
/// The third is the type to which the truncation happens, which should be at
/// least as wide (in bits) as the number of elements in the vector.
///
/// Thus, for example, to truncate the vector
/// `a = [!0, 0, 0, 0, 0, 0, 0, 0, !0, !0, 0, 0, 0, 0, !0, 0]` (with `i32` elements)
/// to `u16`, you would call
/// `simd_bitmask_little!(15, a, u16)`
/// to get
/// `0b0100001100000001u16`.
///
/// # Safety
/// The second argument must be a vector of signed integer types.
/// The length of the vector must be at most 64.

// The numbers in here are powers of 2. If it is needed to extend the length of the vector, simply add more cases in the same manner.
// The reason for doing this is that the expression becomes easier to work with when compiled for a proof assistant.
macro_rules!
simd_bitmask_little { + (63, $a:ident, $ty:ty) => { + 9223372036854775808 * ((if $a[63] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_little!(62, $a, $ty) + }; + (62, $a:ident, $ty:ty) => { + 4611686018427387904 * ((if $a[62] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_little!(61, $a, $ty) + }; + (61, $a:ident, $ty:ty) => { + 2305843009213693952 * ((if $a[61] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_little!(60, $a, $ty) + }; + (60, $a:ident, $ty:ty) => { + 1152921504606846976 * ((if $a[60] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_little!(59, $a, $ty) + }; + (59, $a:ident, $ty:ty) => { + 576460752303423488 * ((if $a[59] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_little!(58, $a, $ty) + }; + (58, $a:ident, $ty:ty) => { + 288230376151711744 * ((if $a[58] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_little!(57, $a, $ty) + }; + (57, $a:ident, $ty:ty) => { + 144115188075855872 * ((if $a[57] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_little!(56, $a, $ty) + }; + (56, $a:ident, $ty:ty) => { + 72057594037927936 * ((if $a[56] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_little!(55, $a, $ty) + }; + (55, $a:ident, $ty:ty) => { + 36028797018963968 * ((if $a[55] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_little!(54, $a, $ty) + }; + (54, $a:ident, $ty:ty) => { + 18014398509481984 * ((if $a[54] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_little!(53, $a, $ty) + }; + (53, $a:ident, $ty:ty) => { + 9007199254740992 * ((if $a[53] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_little!(52, $a, $ty) + }; + (52, $a:ident, $ty:ty) => { + 4503599627370496 * ((if $a[52] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_little!(51, $a, $ty) + }; + (51, $a:ident, $ty:ty) => { + 2251799813685248 * ((if $a[51] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_little!(50, $a, $ty) + }; + (50, $a:ident, $ty:ty) => { + 1125899906842624 * ((if $a[50] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_little!(49, $a, $ty) + }; + (49, $a:ident, $ty:ty) => { + 562949953421312 * ((if $a[49] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_little!(48, $a, $ty) + }; + (48, $a:ident, $ty:ty) => { + 281474976710656 * ((if $a[48] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_little!(47, $a, $ty) + }; + (47, $a:ident, $ty:ty) => { + 140737488355328 * ((if $a[47] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_little!(46, $a, $ty) + }; + (46, $a:ident, $ty:ty) => { + 70368744177664 * ((if $a[46] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_little!(45, $a, $ty) + }; + (45, $a:ident, $ty:ty) => { + 35184372088832 * ((if $a[45] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_little!(44, $a, $ty) + }; + (44, $a:ident, $ty:ty) => { + 17592186044416 * ((if $a[44] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_little!(43, $a, $ty) + }; + (43, $a:ident, $ty:ty) => { + 8796093022208 * ((if $a[43] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_little!(42, $a, $ty) + }; + (42, $a:ident, $ty:ty) => { + 4398046511104 * ((if $a[42] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_little!(41, $a, $ty) + }; + (41, $a:ident, $ty:ty) => { + 2199023255552 * ((if $a[41] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_little!(40, $a, $ty) + }; + (40, $a:ident, $ty:ty) => { + 1099511627776 * ((if $a[40] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_little!(39, $a, $ty) + }; + (39, $a:ident, $ty:ty) => { + 549755813888 * ((if $a[39] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(38, $a, $ty) + }; + (38, $a:ident, $ty:ty) => { + 274877906944 * ((if $a[38] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(37, $a, $ty) + }; + (37, 
$a:ident, $ty:ty) => { + 137438953472 * ((if $a[37] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(36, $a, $ty) + }; + (36, $a:ident, $ty:ty) => { + 68719476736 * ((if $a[36] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(35, $a, $ty) + }; + (35, $a:ident, $ty:ty) => { + 34359738368 * ((if $a[35] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(34, $a, $ty) + }; + (34, $a:ident, $ty:ty) => { + 17179869184 * ((if $a[34] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(33, $a, $ty) + }; + (33, $a:ident, $ty:ty) => { + 8589934592 * ((if $a[33] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(32, $a, $ty) + }; + (32, $a:ident, $ty:ty) => { + 4294967296 * ((if $a[32] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(31, $a, $ty) + }; + (31, $a:ident, $ty:ty) => { + 2147483648 * ((if $a[31] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(30, $a, $ty) + }; + (30, $a:ident, $ty:ty) => { + 1073741824 * ((if $a[30] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(29, $a, $ty) + }; + (29, $a:ident, $ty:ty) => { + 536870912 * ((if $a[29] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(28, $a, $ty) + }; + (28, $a:ident, $ty:ty) => { + 268435456 * ((if $a[28] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(27, $a, $ty) + }; + (27, $a:ident, $ty:ty) => { + 134217728 * ((if $a[27] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(26, $a, $ty) + }; + (26, $a:ident, $ty:ty) => { + 67108864 * ((if $a[26] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(25, $a, $ty) + }; + (25, $a:ident, $ty:ty) => { + 33554432 * ((if $a[25] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(24, $a, $ty) + }; + (24, $a:ident, $ty:ty) => { + 16777216 * ((if $a[24] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(23, $a, $ty) + }; + (23, $a:ident, $ty:ty) => { + 8388608 * ((if $a[23] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(22, $a, $ty) + }; + (22, $a:ident, $ty:ty) => { + 4194304 * ((if $a[22] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(21, $a, $ty) + }; + (21, $a:ident, $ty:ty) => { + 2097152 * ((if $a[21] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(20, $a, $ty) + }; + (20, $a:ident, $ty:ty) => { + 1048576 * ((if $a[20] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(19, $a, $ty) + }; + (19, $a:ident, $ty:ty) => { + 524288 * ((if $a[19] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(18, $a, $ty) + }; + (18, $a:ident, $ty:ty) => { + 262144 * ((if $a[18] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(17, $a, $ty) + }; + (17, $a:ident, $ty:ty) => { + 131072 * ((if $a[17] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(16, $a, $ty) + }; + (16, $a:ident, $ty:ty) => { + 65536 * ((if $a[16] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(15, $a, $ty) + }; + (15, $a:ident, $ty:ty) => { + 32768 * ((if $a[15] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(14, $a, $ty) + }; + (14, $a:ident, $ty:ty) => { + 16384 * ((if $a[14] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(13, $a, $ty) + }; + (13, $a:ident, $ty:ty) => { + 8192 * ((if $a[13] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(12, $a, $ty) + }; + (12, $a:ident, $ty:ty) => { + 4096 * ((if $a[12] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(11, $a, $ty) + }; + (11, $a:ident, $ty:ty) => { + 2048 * ((if $a[11] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(10, $a, $ty) + }; + (10, $a:ident, $ty:ty) => { + 1024 * ((if $a[10] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(9, $a, $ty) + }; + (9, $a:ident, $ty:ty) => { + 512 * ((if 
$a[9] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(8, $a, $ty)
    };
    (8, $a:ident, $ty:ty) => {
        256 * ((if $a[8] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(7, $a, $ty)
    };
    (7, $a:ident, $ty:ty) => {
        128 * ((if $a[7] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(6, $a, $ty)
    };
    (6, $a:ident, $ty:ty) => {
        64 * ((if $a[6] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(5, $a, $ty)
    };
    (5, $a:ident, $ty:ty) => {
        32 * ((if $a[5] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(4, $a, $ty)
    };
    (4, $a:ident, $ty:ty) => {
        16 * ((if $a[4] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(3, $a, $ty)
    };
    (3, $a:ident, $ty:ty) => {
        8 * ((if $a[3] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(2, $a, $ty)
    };
    (2, $a:ident, $ty:ty) => {
        4 * ((if $a[2] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(1, $a, $ty)
    };
    (1, $a:ident, $ty:ty) => {
        2 * ((if $a[1] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_little!(0, $a, $ty)
    };
    (0, $a:ident, $ty:ty) => {
        ((if $a[0] < 0 { 1 } else { 0 }) as $ty)
    };
}
pub(crate) use simd_bitmask_little;

/// Truncates an integer vector to a bitmask.
///
/// This macro expands to an expression equivalent to truncating an integer
/// vector to a bitmask, as it would on big-endian systems.
///
/// The macro takes 3 arguments.
/// The first is the highest index of the vector.
/// The second is the vector itself, which should contain only `0` and `!0`.
/// The third is the type to which the truncation happens, which should be at
/// least as wide (in bits) as the number of elements in the vector.
///
/// Thus, for example, to truncate the vector
/// `a = [!0, 0, 0, 0, 0, 0, 0, 0, !0, !0, 0, 0, 0, 0, !0, 0]` (with `i32` elements)
/// to `u16`, you would call
/// `simd_bitmask_big!(15, a, u16)`
/// to get
/// `0b1000000011000010u16`.
///
/// # Safety
/// The second argument must be a vector of signed integer types.

#[allow(unused)]
macro_rules!
simd_bitmask_big { + (63, $a:ident, $ty:ty) => { + 1 * ((if $a[63] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(62, $a, $ty) + }; + (62, $a:ident, $ty:ty) => { + 2 * ((if $a[62] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(61, $a, $ty) + }; + (61, $a:ident, $ty:ty) => { + 4 * ((if $a[61] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(60, $a, $ty) + }; + (60, $a:ident, $ty:ty) => { + 8 * ((if $a[60] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(59, $a, $ty) + }; + (59, $a:ident, $ty:ty) => { + 16 * ((if $a[59] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(58, $a, $ty) + }; + (58, $a:ident, $ty:ty) => { + 32 * ((if $a[58] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(57, $a, $ty) + }; + (57, $a:ident, $ty:ty) => { + 64 * ((if $a[57] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(56, $a, $ty) + }; + (56, $a:ident, $ty:ty) => { + 128 * ((if $a[56] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(55, $a, $ty) + }; + (55, $a:ident, $ty:ty) => { + 256 * ((if $a[55] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(54, $a, $ty) + }; + (54, $a:ident, $ty:ty) => { + 512 * ((if $a[54] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(53, $a, $ty) + }; + (53, $a:ident, $ty:ty) => { + 1024 * ((if $a[53] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(52, $a, $ty) + }; + (52, $a:ident, $ty:ty) => { + 2048 * ((if $a[52] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(51, $a, $ty) + }; + (51, $a:ident, $ty:ty) => { + 4096 * ((if $a[51] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(50, $a, $ty) + }; + (50, $a:ident, $ty:ty) => { + 8192 * ((if $a[50] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(49, $a, $ty) + }; + (49, $a:ident, $ty:ty) => { + 16384 * ((if $a[49] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(48, $a, $ty) + }; + (48, $a:ident, $ty:ty) => { + 32768 * ((if $a[48] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(47, $a, $ty) + }; + (47, $a:ident, $ty:ty) => { + 65536 * ((if $a[47] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(46, $a, $ty) + }; + (46, $a:ident, $ty:ty) => { + 131072 * ((if $a[46] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(45, $a, $ty) + }; + (45, $a:ident, $ty:ty) => { + 262144 * ((if $a[45] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(44, $a, $ty) + }; + (44, $a:ident, $ty:ty) => { + 524288 * ((if $a[44] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(43, $a, $ty) + }; + (43, $a:ident, $ty:ty) => { + 1048576 * ((if $a[43] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(42, $a, $ty) + }; + (42, $a:ident, $ty:ty) => { + 2097152 * ((if $a[42] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(41, $a, $ty) + }; + (41, $a:ident, $ty:ty) => { + 4194304 * ((if $a[41] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(40, $a, $ty) + }; + (40, $a:ident, $ty:ty) => { + 8388608 * ((if $a[40] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(39, $a, $ty) + }; + (39, $a:ident, $ty:ty) => { + 16777216 * ((if $a[39] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(38, $a, $ty) + }; + (38, $a:ident, $ty:ty) => { + 33554432 * ((if $a[38] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(37, $a, $ty) + }; + (37, $a:ident, $ty:ty) => { + 67108864 * ((if $a[37] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(36, $a, $ty) + }; + (36, $a:ident, $ty:ty) => { + 134217728 * ((if $a[36] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(35, $a, $ty) + }; + (35, $a:ident, $ty:ty) => { + 268435456 * ((if $a[35] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(34, $a, $ty) + }; + (34, $a:ident, $ty:ty) => { + 536870912 * ((if $a[34] < 0 { 
1 } else { 0 }) as $ty) + simd_bitmask_big!(33, $a, $ty) + }; + (33, $a:ident, $ty:ty) => { + 1073741824 * ((if $a[33] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(32, $a, $ty) + }; + (32, $a:ident, $ty:ty) => { + 2147483648 * ((if $a[32] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(31, $a, $ty) + }; + (31, $a:ident, $ty:ty) => { + 4294967296 * ((if $a[31] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(30, $a, $ty) + }; + (30, $a:ident, $ty:ty) => { + 8589934592 * ((if $a[30] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(29, $a, $ty) + }; + (29, $a:ident, $ty:ty) => { + 17179869184 * ((if $a[29] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(28, $a, $ty) + }; + (28, $a:ident, $ty:ty) => { + 34359738368 * ((if $a[28] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(27, $a, $ty) + }; + (27, $a:ident, $ty:ty) => { + 68719476736 * ((if $a[27] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(26, $a, $ty) + }; + (26, $a:ident, $ty:ty) => { + 137438953472 * ((if $a[26] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(25, $a, $ty) + }; + (25, $a:ident, $ty:ty) => { + 274877906944 * ((if $a[25] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(24, $a, $ty) + }; + (24, $a:ident, $ty:ty) => { + 549755813888 * ((if $a[24] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(23, $a, $ty) + }; + (23, $a:ident, $ty:ty) => { + 1099511627776 * ((if $a[23] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(22, $a, $ty) + }; + (22, $a:ident, $ty:ty) => { + 2199023255552 * ((if $a[22] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(21, $a, $ty) + }; + (21, $a:ident, $ty:ty) => { + 4398046511104 * ((if $a[21] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(20, $a, $ty) + }; + (20, $a:ident, $ty:ty) => { + 8796093022208 * ((if $a[20] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(19, $a, $ty) + }; + (19, $a:ident, $ty:ty) => { + 17592186044416 * ((if $a[19] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(18, $a, $ty) + }; + (18, $a:ident, $ty:ty) => { + 35184372088832 * ((if $a[18] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(17, $a, $ty) + }; + (17, $a:ident, $ty:ty) => { + 70368744177664 * ((if $a[17] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(16, $a, $ty) + }; + (16, $a:ident, $ty:ty) => { + 140737488355328 * ((if $a[16] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(15, $a, $ty) + }; + (15, $a:ident, $ty:ty) => { + 281474976710656 * ((if $a[15] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(14, $a, $ty) + }; + (14, $a:ident, $ty:ty) => { + 562949953421312 * ((if $a[14] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(13, $a, $ty) + }; + (13, $a:ident, $ty:ty) => { + 1125899906842624 * ((if $a[13] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_big!(12, $a, $ty) + }; + (12, $a:ident, $ty:ty) => { + 2251799813685248 * ((if $a[12] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_big!(11, $a, $ty) + }; + (11, $a:ident, $ty:ty) => { + 4503599627370496 * ((if $a[11] < 0 { 1 } else { 0 }) as $ty) + + simd_bitmask_big!(10, $a, $ty) + }; + (10, $a:ident, $ty:ty) => { + 9007199254740992 * ((if $a[10] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(9, $a, $ty) + }; + (9, $a:ident, $ty:ty) => { + 18014398509481984 * ((if $a[9] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(8, $a, $ty) + }; + (8, $a:ident, $ty:ty) => { + 36028797018963968 * ((if $a[8] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(7, $a, $ty) + }; + (7, $a:ident, $ty:ty) => { + 72057594037927936 * ((if $a[7] < 0 { 1 } else { 0 }) as $ty) + simd_bitmask_big!(6, $a, $ty) + }; + (6, $a:ident, $ty:ty) => { 
144115188075855872 * ((if $a[6] < 0 { 1 } else { 0 }) as $ty)
            + simd_bitmask_big!(5, $a, $ty)
    };
    (5, $a:ident, $ty:ty) => {
        288230376151711744 * ((if $a[5] < 0 { 1 } else { 0 }) as $ty)
            + simd_bitmask_big!(4, $a, $ty)
    };
    (4, $a:ident, $ty:ty) => {
        576460752303423488 * ((if $a[4] < 0 { 1 } else { 0 }) as $ty)
            + simd_bitmask_big!(3, $a, $ty)
    };
    (3, $a:ident, $ty:ty) => {
        1152921504606846976 * ((if $a[3] < 0 { 1 } else { 0 }) as $ty)
            + simd_bitmask_big!(2, $a, $ty)
    };
    (2, $a:ident, $ty:ty) => {
        2305843009213693952 * ((if $a[2] < 0 { 1 } else { 0 }) as $ty)
            + simd_bitmask_big!(1, $a, $ty)
    };
    (1, $a:ident, $ty:ty) => {
        4611686018427387904 * ((if $a[1] < 0 { 1 } else { 0 }) as $ty)
            + simd_bitmask_big!(0, $a, $ty)
    };
    (0, $a:ident, $ty:ty) => {
        9223372036854775808 * ((if $a[0] < 0 { 1 } else { 0 }) as $ty)
    };
}
#[allow(unused)]
pub(crate) use simd_bitmask_big;

/// Selects elements from a mask.
///
/// For each element, if the corresponding value in `mask` is `!0`, select the element from
/// `if_true`. If the corresponding value in `mask` is `0`, select the element from
/// `if_false`.
///
/// # Safety
/// `mask` must only contain `0` and `!0`.
pub fn simd_select<const N: u64, T1: MachineInteger + PartialEq + Copy, T2: Copy>(
    mask: FunArray<N, T1>,
    if_true: FunArray<N, T2>,
    if_false: FunArray<N, T2>,
) -> FunArray<N, T2> {
    FunArray::from_fn(|i| {
        if mask[i] == T1::ONES {
            if_true[i]
        } else {
            if_false[i]
        }
    })
}
diff --git a/testable-simd-models/src/core_arch.rs b/testable-simd-models/src/core_arch.rs
new file mode 100644
index 0000000000000..19e643885f4ce
--- /dev/null
+++ b/testable-simd-models/src/core_arch.rs
@@ -0,0 +1,5 @@
/// This is a (partial) mirror of [`core::arch`].
pub mod x86;
pub use x86 as x86_64;

pub mod arm_shared;
diff --git a/testable-simd-models/src/core_arch/arm_shared/mod.rs b/testable-simd-models/src/core_arch/arm_shared/mod.rs
new file mode 100644
index 0000000000000..6e2272ec0e50a
--- /dev/null
+++ b/testable-simd-models/src/core_arch/arm_shared/mod.rs
@@ -0,0 +1,4 @@
pub mod models;
#[cfg(test)]
#[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
pub mod tests;
diff --git a/testable-simd-models/src/core_arch/arm_shared/models/mod.rs b/testable-simd-models/src/core_arch/arm_shared/models/mod.rs
new file mode 100644
index 0000000000000..fb7844c6d0441
--- /dev/null
+++ b/testable-simd-models/src/core_arch/arm_shared/models/mod.rs
@@ -0,0 +1,44 @@
//! Rust models for ARM intrinsics.
//!
//! This module contains models for the intrinsics as they are defined in the Rust core.
//! Since this is supposed to model the Rust core, the implemented functions must
//! mirror the Rust implementations as closely as they can.
//!
//! For example, calls to simd functions like `simd_add` and `simd_sub` are left as is,
//! with their implementations defined in `crate::abstractions::simd`. Some other
//! operations like `simd_cast` or `simd_shuffle` might need a little modification
//! for correct compilation.
//!
//! Calls to `transmute` are replaced with either an explicit call to a `BitVec::from_*`
//! function, or with `.into()`.
//!
//! Sometimes, an intrinsic in Rust is implemented by directly using the corresponding
//! LLVM instruction via an `unsafe extern "C"` module. In those cases, the corresponding
//! function is defined in the `c_extern` module in each file, which contains manually
//! written implementations made by consulting the appropriate ARM documentation.
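//!
//! For instance (an illustrative sketch, not code from this crate): where
//! `core::arch` implements an intrinsic as
//! `transmute(simd_add(a.as_i16x8(), b.as_i16x8()))`, the model would instead
//! be written as `simd_add(a, b).into()`, using the `From`/`Into` conversions
//! between vectors and bit vectors defined in `crate::abstractions::simd`.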
//!
//! In general, it is best to gain an idea of how an implementation should be written by looking
//! at how other functions are implemented. Also see `core::arch::arm` for [reference](https://github.com/rust-lang/stdarch/tree/master/crates/core_arch).
#![allow(unused)]
#[allow(non_camel_case_types)]
mod types {
    use crate::abstractions::simd::*;
    pub type int32x4_t = i32x4;
    pub type int64x1_t = i64x1;
    pub type int64x2_t = i64x2;
    pub type int16x8_t = i16x8;
    pub type int8x16_t = i8x16;
    pub type uint32x4_t = u32x4;
    pub type uint64x1_t = u64x1;
    pub type uint64x2_t = u64x2;
    pub type uint16x8_t = u16x8;
    pub type uint8x16_t = u8x16;
    pub type int32x2_t = i32x2;
    pub type int16x4_t = i16x4;
    pub type int8x8_t = i8x8;
    pub type uint32x2_t = u32x2;
    pub type uint16x4_t = u16x4;
    pub type uint8x8_t = u8x8;
}

pub mod neon;
diff --git a/testable-simd-models/src/core_arch/arm_shared/models/neon.rs b/testable-simd-models/src/core_arch/arm_shared/models/neon.rs
new file mode 100644
index 0000000000000..794fd25285b47
--- /dev/null
+++ b/testable-simd-models/src/core_arch/arm_shared/models/neon.rs
@@ -0,0 +1,873 @@
use super::types::*;
use crate::abstractions::simd::*;

pub fn vaba_s16(a: int16x4_t, b: int16x4_t, c: int16x4_t) -> int16x4_t {
    simd_add(a, vabd_s16(b, c))
}

pub fn vaba_s32(a: int32x2_t, b: int32x2_t, c: int32x2_t) -> int32x2_t {
    simd_add(a, vabd_s32(b, c))
}

pub fn vaba_s8(a: int8x8_t, b: int8x8_t, c: int8x8_t) -> int8x8_t {
    simd_add(a, vabd_s8(b, c))
}

pub fn vaba_u16(a: uint16x4_t, b: uint16x4_t, c: uint16x4_t) -> uint16x4_t {
    simd_add(a, vabd_u16(b, c))
}

pub fn vaba_u32(a: uint32x2_t, b: uint32x2_t, c: uint32x2_t) -> uint32x2_t {
    simd_add(a, vabd_u32(b, c))
}

pub fn vaba_u8(a: uint8x8_t, b: uint8x8_t, c: uint8x8_t) -> uint8x8_t {
    simd_add(a, vabd_u8(b, c))
}

pub fn vabal_u8(a: uint16x8_t, b: uint8x8_t, c: uint8x8_t) -> uint16x8_t {
    let d: uint8x8_t = vabd_u8(b, c);
    simd_add(a, simd_cast(d))
}

pub fn vabal_u16(a: uint32x4_t, b: uint16x4_t, c: uint16x4_t) -> uint32x4_t {
    let d: uint16x4_t = vabd_u16(b, c);
    simd_add(a, simd_cast(d))
}

pub fn vabal_u32(a: uint64x2_t, b: uint32x2_t, c: uint32x2_t) -> uint64x2_t {
    let d: uint32x2_t = vabd_u32(b, c);
    simd_add(a, simd_cast(d))
}

pub fn vabaq_s16(a: int16x8_t, b: int16x8_t, c: int16x8_t) -> int16x8_t {
    simd_add(a, vabdq_s16(b, c))
}

pub fn vabaq_s32(a: int32x4_t, b: int32x4_t, c: int32x4_t) -> int32x4_t {
    simd_add(a, vabdq_s32(b, c))
}

pub fn vabaq_s8(a: int8x16_t, b: int8x16_t, c: int8x16_t) -> int8x16_t {
    simd_add(a, vabdq_s8(b, c))
}

pub fn vabaq_u16(a: uint16x8_t, b: uint16x8_t, c: uint16x8_t) -> uint16x8_t {
    simd_add(a, vabdq_u16(b, c))
}

pub fn vabaq_u32(a: uint32x4_t, b: uint32x4_t, c: uint32x4_t) -> uint32x4_t {
    simd_add(a, vabdq_u32(b, c))
}

pub fn vabaq_u8(a: uint8x16_t, b: uint8x16_t, c: uint8x16_t) -> uint8x16_t {
    simd_add(a, vabdq_u8(b, c))
}

pub fn vabd_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t {
    simd_abs_diff(a, b)
}

pub fn vabdq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t {
    simd_abs_diff(a, b)
}

pub fn vabd_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t {
    simd_abs_diff(a, b)
}

pub fn vabdq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t {
    simd_abs_diff(a, b)
}

pub fn vabd_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t {
    simd_abs_diff(a, b)
}

pub fn vabdq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t {
    simd_abs_diff(a, b)
}

pub fn vabd_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t
{ + simd_abs_diff(a, b) +} + +pub fn vabdq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { + simd_abs_diff(a, b) +} + +pub fn vabd_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t { + simd_abs_diff(a, b) +} + +pub fn vabdq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t { + simd_abs_diff(a, b) +} + +pub fn vabd_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t { + simd_abs_diff(a, b) +} + +pub fn vabdq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t { + simd_abs_diff(a, b) +} + +pub fn vabdl_u8(a: uint8x8_t, b: uint8x8_t) -> uint16x8_t { + simd_cast(vabd_u8(a, b)) +} + +pub fn vabdl_u16(a: uint16x4_t, b: uint16x4_t) -> uint32x4_t { + simd_cast(vabd_u16(a, b)) +} + +pub fn vabdl_u32(a: uint32x2_t, b: uint32x2_t) -> uint64x2_t { + simd_cast(vabd_u32(a, b)) +} + +pub fn vabs_s8(a: int8x8_t) -> int8x8_t { + simd_abs(a) +} + +pub fn vabsq_s8(a: int8x16_t) -> int8x16_t { + simd_abs(a) +} + +pub fn vabs_s16(a: int16x4_t) -> int16x4_t { + simd_abs(a) +} + +pub fn vabsq_s16(a: int16x8_t) -> int16x8_t { + simd_abs(a) +} + +pub fn vabs_s32(a: int32x2_t) -> int32x2_t { + simd_abs(a) +} + +pub fn vabsq_s32(a: int32x4_t) -> int32x4_t { + simd_abs(a) +} + +pub fn vadd_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t { + simd_add(a, b) +} + +pub fn vadd_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t { + simd_add(a, b) +} + +pub fn vadd_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t { + simd_add(a, b) +} + +pub fn vadd_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t { + simd_add(a, b) +} + +pub fn vadd_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t { + simd_add(a, b) +} + +pub fn vadd_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t { + simd_add(a, b) +} + +pub fn vaddq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t { + simd_add(a, b) +} + +pub fn vaddq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t { + simd_add(a, b) +} + +pub fn vaddq_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t { + simd_add(a, b) +} + +pub fn vaddq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { + simd_add(a, b) +} + +pub fn vaddq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t { + simd_add(a, b) +} + +pub fn vaddq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t { + simd_add(a, b) +} + +pub fn vaddq_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t { + simd_add(a, b) +} + +pub fn vaddq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { + simd_add(a, b) +} + +pub fn vaddhn_high_s16(r: int8x8_t, a: int16x8_t, b: int16x8_t) -> int8x16_t { + let x = simd_cast(simd_shr(simd_add(a, b), int16x8_t::splat(8))); + simd_shuffle(r, x, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]) +} + +pub fn vaddhn_high_s32(r: int16x4_t, a: int32x4_t, b: int32x4_t) -> int16x8_t { + let x = simd_cast(simd_shr(simd_add(a, b), int32x4_t::splat(16))); + simd_shuffle(r, x, [0, 1, 2, 3, 4, 5, 6, 7]) +} + +pub fn vaddhn_high_s64(r: int32x2_t, a: int64x2_t, b: int64x2_t) -> int32x4_t { + let x = simd_cast(simd_shr(simd_add(a, b), int64x2_t::splat(32))); + simd_shuffle(r, x, [0, 1, 2, 3]) +} + +pub fn vaddhn_high_u16(r: uint8x8_t, a: uint16x8_t, b: uint16x8_t) -> uint8x16_t { + let x = simd_cast(simd_shr(simd_add(a, b), uint16x8_t::splat(8))); + simd_shuffle(r, x, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]) +} + +pub fn vaddhn_high_u32(r: uint16x4_t, a: uint32x4_t, b: uint32x4_t) -> uint16x8_t { + let x = simd_cast(simd_shr(simd_add(a, b), uint32x4_t::splat(16))); + simd_shuffle(r, x, [0, 1, 2, 3, 4, 5, 6, 7]) +} + +pub fn vaddhn_high_u64(r: uint32x2_t, a: uint64x2_t, b: uint64x2_t) -> uint32x4_t { + let x = simd_cast(simd_shr(simd_add(a, b), uint64x2_t::splat(32))); + 
simd_shuffle(r, x, [0, 1, 2, 3]) +} + +pub fn vaddhn_s16(a: int16x8_t, b: int16x8_t) -> int8x8_t { + simd_cast(simd_shr(simd_add(a, b), int16x8_t::splat(8))) +} + +pub fn vaddhn_s32(a: int32x4_t, b: int32x4_t) -> int16x4_t { + simd_cast(simd_shr(simd_add(a, b), int32x4_t::splat(16))) +} + +pub fn vaddhn_s64(a: int64x2_t, b: int64x2_t) -> int32x2_t { + simd_cast(simd_shr(simd_add(a, b), int64x2_t::splat(32))) +} + +pub fn vaddhn_u16(a: uint16x8_t, b: uint16x8_t) -> uint8x8_t { + simd_cast(simd_shr(simd_add(a, b), uint16x8_t::splat(8))) +} + +pub fn vaddhn_u32(a: uint32x4_t, b: uint32x4_t) -> uint16x4_t { + simd_cast(simd_shr(simd_add(a, b), uint32x4_t::splat(16))) +} + +pub fn vaddhn_u64(a: uint64x2_t, b: uint64x2_t) -> uint32x2_t { + simd_cast(simd_shr(simd_add(a, b), uint64x2_t::splat(32))) +} + +pub fn vaddl_high_s16(a: int16x8_t, b: int16x8_t) -> int32x4_t { + let a: int16x4_t = simd_shuffle(a, a, [4, 5, 6, 7]); + let b: int16x4_t = simd_shuffle(b, b, [4, 5, 6, 7]); + let a: int32x4_t = simd_cast(a); + let b: int32x4_t = simd_cast(b); + simd_add(a, b) +} + +pub fn vaddl_high_s32(a: int32x4_t, b: int32x4_t) -> int64x2_t { + let a: int32x2_t = simd_shuffle(a, a, [2, 3]); + let b: int32x2_t = simd_shuffle(b, b, [2, 3]); + let a: int64x2_t = simd_cast(a); + let b: int64x2_t = simd_cast(b); + simd_add(a, b) +} + +pub fn vaddl_high_s8(a: int8x16_t, b: int8x16_t) -> int16x8_t { + let a: int8x8_t = simd_shuffle(a, a, [8, 9, 10, 11, 12, 13, 14, 15]); + let b: int8x8_t = simd_shuffle(b, b, [8, 9, 10, 11, 12, 13, 14, 15]); + let a: int16x8_t = simd_cast(a); + let b: int16x8_t = simd_cast(b); + simd_add(a, b) +} + +pub fn vaddl_high_u16(a: uint16x8_t, b: uint16x8_t) -> uint32x4_t { + let a: uint16x4_t = simd_shuffle(a, a, [4, 5, 6, 7]); + let b: uint16x4_t = simd_shuffle(b, b, [4, 5, 6, 7]); + let a: uint32x4_t = simd_cast(a); + let b: uint32x4_t = simd_cast(b); + simd_add(a, b) +} + +pub fn vaddl_high_u32(a: uint32x4_t, b: uint32x4_t) -> uint64x2_t { + let a: uint32x2_t = simd_shuffle(a, a, [2, 3]); + let b: uint32x2_t = simd_shuffle(b, b, [2, 3]); + let a: uint64x2_t = simd_cast(a); + let b: uint64x2_t = simd_cast(b); + simd_add(a, b) +} + +pub fn vaddl_high_u8(a: uint8x16_t, b: uint8x16_t) -> uint16x8_t { + let a: uint8x8_t = simd_shuffle(a, a, [8, 9, 10, 11, 12, 13, 14, 15]); + let b: uint8x8_t = simd_shuffle(b, b, [8, 9, 10, 11, 12, 13, 14, 15]); + let a: uint16x8_t = simd_cast(a); + let b: uint16x8_t = simd_cast(b); + simd_add(a, b) +} + +pub fn vaddl_s16(a: int16x4_t, b: int16x4_t) -> int32x4_t { + let a: int32x4_t = simd_cast(a); + let b: int32x4_t = simd_cast(b); + simd_add(a, b) +} + +pub fn vaddl_s32(a: int32x2_t, b: int32x2_t) -> int64x2_t { + let a: int64x2_t = simd_cast(a); + let b: int64x2_t = simd_cast(b); + simd_add(a, b) +} + +pub fn vaddl_s8(a: int8x8_t, b: int8x8_t) -> int16x8_t { + let a: int16x8_t = simd_cast(a); + let b: int16x8_t = simd_cast(b); + simd_add(a, b) +} + +pub fn vaddl_u16(a: uint16x4_t, b: uint16x4_t) -> uint32x4_t { + let a: uint32x4_t = simd_cast(a); + let b: uint32x4_t = simd_cast(b); + simd_add(a, b) +} + +pub fn vaddl_u32(a: uint32x2_t, b: uint32x2_t) -> uint64x2_t { + let a: uint64x2_t = simd_cast(a); + let b: uint64x2_t = simd_cast(b); + simd_add(a, b) +} + +pub fn vaddl_u8(a: uint8x8_t, b: uint8x8_t) -> uint16x8_t { + let a: uint16x8_t = simd_cast(a); + let b: uint16x8_t = simd_cast(b); + simd_add(a, b) +} + +pub fn vaddw_high_s16(a: int32x4_t, b: int16x8_t) -> int32x4_t { + let b: int16x4_t = simd_shuffle(b, b, [4, 5, 6, 7]); + let b: int32x4_t = 
simd_cast(b); + simd_add(a, b) +} + +pub fn vaddw_high_s32(a: int64x2_t, b: int32x4_t) -> int64x2_t { + let b: int32x2_t = simd_shuffle(b, b, [2, 3]); + let b: int64x2_t = simd_cast(b); + simd_add(a, b) +} + +pub fn vaddw_high_s8(a: int16x8_t, b: int8x16_t) -> int16x8_t { + let b: int8x8_t = simd_shuffle(b, b, [8, 9, 10, 11, 12, 13, 14, 15]); + let b: int16x8_t = simd_cast(b); + simd_add(a, b) +} + +pub fn vaddw_high_u16(a: uint32x4_t, b: uint16x8_t) -> uint32x4_t { + let b: uint16x4_t = simd_shuffle(b, b, [4, 5, 6, 7]); + let b: uint32x4_t = simd_cast(b); + simd_add(a, b) +} + +pub fn vaddw_high_u32(a: uint64x2_t, b: uint32x4_t) -> uint64x2_t { + let b: uint32x2_t = simd_shuffle(b, b, [2, 3]); + let b: uint64x2_t = simd_cast(b); + simd_add(a, b) +} + +pub fn vaddw_high_u8(a: uint16x8_t, b: uint8x16_t) -> uint16x8_t { + let b: uint8x8_t = simd_shuffle(b, b, [8, 9, 10, 11, 12, 13, 14, 15]); + let b: uint16x8_t = simd_cast(b); + simd_add(a, b) +} + +pub fn vaddw_s16(a: int32x4_t, b: int16x4_t) -> int32x4_t { + let b: int32x4_t = simd_cast(b); + simd_add(a, b) +} + +pub fn vaddw_s32(a: int64x2_t, b: int32x2_t) -> int64x2_t { + let b: int64x2_t = simd_cast(b); + simd_add(a, b) +} + +pub fn vaddw_s8(a: int16x8_t, b: int8x8_t) -> int16x8_t { + let b: int16x8_t = simd_cast(b); + simd_add(a, b) +} + +pub fn vaddw_u16(a: uint32x4_t, b: uint16x4_t) -> uint32x4_t { + let b: uint32x4_t = simd_cast(b); + simd_add(a, b) +} + +pub fn vaddw_u32(a: uint64x2_t, b: uint32x2_t) -> uint64x2_t { + let b: uint64x2_t = simd_cast(b); + simd_add(a, b) +} + +pub fn vaddw_u8(a: uint16x8_t, b: uint8x8_t) -> uint16x8_t { + let b: uint16x8_t = simd_cast(b); + simd_add(a, b) +} + +pub fn vand_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t { + simd_and(a, b) +} + +pub fn vandq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { + simd_and(a, b) +} + +pub fn vand_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t { + simd_and(a, b) +} + +pub fn vandq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t { + simd_and(a, b) +} + +pub fn vand_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t { + simd_and(a, b) +} + +pub fn vandq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t { + simd_and(a, b) +} + +pub fn vand_s64(a: int64x1_t, b: int64x1_t) -> int64x1_t { + simd_and(a, b) +} + +pub fn vandq_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t { + simd_and(a, b) +} + +pub fn vand_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t { + simd_and(a, b) +} + +pub fn vandq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { + simd_and(a, b) +} + +pub fn vand_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t { + simd_and(a, b) +} + +pub fn vandq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t { + simd_and(a, b) +} + +pub fn vand_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t { + simd_and(a, b) +} + +pub fn vandq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t { + simd_and(a, b) +} + +pub fn vand_u64(a: uint64x1_t, b: uint64x1_t) -> uint64x1_t { + simd_and(a, b) +} + +pub fn vandq_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t { + simd_and(a, b) +} + +pub fn vbic_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t { + let c = int16x4_t::splat(-1); + simd_and(simd_xor(b, c), a) +} + +pub fn vbic_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t { + let c = int32x2_t::splat(-1); + simd_and(simd_xor(b, c), a) +} + +pub fn vbic_s64(a: int64x1_t, b: int64x1_t) -> int64x1_t { + let c = int64x1_t::splat(-1); + simd_and(simd_xor(b, c), a) +} + +pub fn vbic_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t { + let c = int8x8_t::splat(-1); + simd_and(simd_xor(b, c), a) +} + +pub fn vbicq_s16(a: int16x8_t, b: 
int16x8_t) -> int16x8_t { + let c = int16x8_t::splat(-1); + simd_and(simd_xor(b, c), a) +} + +pub fn vbicq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t { + let c = int32x4_t::splat(-1); + simd_and(simd_xor(b, c), a) +} + +pub fn vbicq_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t { + let c = int64x2_t::splat(-1); + simd_and(simd_xor(b, c), a) +} + +pub fn vbicq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t { + let c = int8x16_t::splat(-1); + simd_and(simd_xor(b, c), a) +} + +pub fn vbic_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t { + let c = int16x4_t::splat(-1); + simd_and(simd_xor(b, simd_cast(c)), a) +} + +pub fn vbic_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t { + let c = int32x2_t::splat(-1); + simd_and(simd_xor(b, simd_cast(c)), a) +} + +pub fn vbic_u64(a: uint64x1_t, b: uint64x1_t) -> uint64x1_t { + let c = int64x1_t::splat(-1); + simd_and(simd_xor(b, simd_cast(c)), a) +} + +pub fn vbic_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t { + let c = int8x8_t::splat(-1); + simd_and(simd_xor(b, simd_cast(c)), a) +} + +pub fn vbicq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t { + let c = int16x8_t::splat(-1); + simd_and(simd_xor(b, simd_cast(c)), a) +} + +pub fn vbicq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t { + let c = int32x4_t::splat(-1); + simd_and(simd_xor(b, simd_cast(c)), a) +} + +pub fn vbicq_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t { + let c = int64x2_t::splat(-1); + simd_and(simd_xor(b, simd_cast(c)), a) +} + +pub fn vbicq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { + let c = int8x16_t::splat(-1); + simd_and(simd_xor(b, simd_cast(c)), a) +} + +pub fn vbsl_s16(a: uint16x4_t, b: int16x4_t, c: int16x4_t) -> int16x4_t { + let not = int16x4_t::splat(-1); + simd_cast(simd_or( + simd_and(a, simd_cast(b)), + simd_and(simd_xor(a, simd_cast(not)), simd_cast(c)), + )) +} + +pub fn vbsl_s32(a: uint32x2_t, b: int32x2_t, c: int32x2_t) -> int32x2_t { + let not = int32x2_t::splat(-1); + simd_cast(simd_or( + simd_and(a, simd_cast(b)), + simd_and(simd_xor(a, simd_cast(not)), simd_cast(c)), + )) +} + +pub fn vbsl_s64(a: uint64x1_t, b: int64x1_t, c: int64x1_t) -> int64x1_t { + let not = int64x1_t::splat(-1); + simd_cast(simd_or( + simd_and(a, simd_cast(b)), + simd_and(simd_xor(a, simd_cast(not)), simd_cast(c)), + )) +} + +pub fn vbsl_s8(a: uint8x8_t, b: int8x8_t, c: int8x8_t) -> int8x8_t { + let not = int8x8_t::splat(-1); + simd_cast(simd_or( + simd_and(a, simd_cast(b)), + simd_and(simd_xor(a, simd_cast(not)), simd_cast(c)), + )) +} + +pub fn vbslq_s16(a: uint16x8_t, b: int16x8_t, c: int16x8_t) -> int16x8_t { + let not = int16x8_t::splat(-1); + simd_cast(simd_or( + simd_and(a, simd_cast(b)), + simd_and(simd_xor(a, simd_cast(not)), simd_cast(c)), + )) +} + +pub fn vbslq_s32(a: uint32x4_t, b: int32x4_t, c: int32x4_t) -> int32x4_t { + let not = int32x4_t::splat(-1); + simd_cast(simd_or( + simd_and(a, simd_cast(b)), + simd_and(simd_xor(a, simd_cast(not)), simd_cast(c)), + )) +} + +pub fn vbslq_s64(a: uint64x2_t, b: int64x2_t, c: int64x2_t) -> int64x2_t { + let not = int64x2_t::splat(-1); + simd_cast(simd_or( + simd_and(a, simd_cast(b)), + simd_and(simd_xor(a, simd_cast(not)), simd_cast(c)), + )) +} + +pub fn vbslq_s8(a: uint8x16_t, b: int8x16_t, c: int8x16_t) -> int8x16_t { + let not = int8x16_t::splat(-1); + simd_cast(simd_or( + simd_and(a, simd_cast(b)), + simd_and(simd_xor(a, simd_cast(not)), simd_cast(c)), + )) +} + +pub fn vbsl_u16(a: uint16x4_t, b: uint16x4_t, c: uint16x4_t) -> uint16x4_t { + let not = int16x4_t::splat(-1); + simd_or( + simd_and(a, simd_cast(b)), + 
simd_and(simd_xor(a, simd_cast(not)), c), + ) +} + +pub fn vbsl_u32(a: uint32x2_t, b: uint32x2_t, c: uint32x2_t) -> uint32x2_t { + let not = int32x2_t::splat(-1); + simd_or( + simd_and(a, simd_cast(b)), + simd_and(simd_xor(a, simd_cast(not)), c), + ) +} + +pub fn vbsl_u64(a: uint64x1_t, b: uint64x1_t, c: uint64x1_t) -> uint64x1_t { + let not = int64x1_t::splat(-1); + simd_or( + simd_and(a, simd_cast(b)), + simd_and(simd_xor(a, simd_cast(not)), c), + ) +} + +pub fn vbsl_u8(a: uint8x8_t, b: uint8x8_t, c: uint8x8_t) -> uint8x8_t { + let not = int8x8_t::splat(-1); + simd_or( + simd_and(a, simd_cast(b)), + simd_and(simd_xor(a, simd_cast(not)), c), + ) +} + +pub fn vbslq_u16(a: uint16x8_t, b: uint16x8_t, c: uint16x8_t) -> uint16x8_t { + let not = int16x8_t::splat(-1); + simd_or( + simd_and(a, simd_cast(b)), + simd_and(simd_xor(a, simd_cast(not)), c), + ) +} + +pub fn vbslq_u32(a: uint32x4_t, b: uint32x4_t, c: uint32x4_t) -> uint32x4_t { + let not = int32x4_t::splat(-1); + simd_or( + simd_and(a, simd_cast(b)), + simd_and(simd_xor(a, simd_cast(not)), c), + ) +} + +pub fn vbslq_u64(a: uint64x2_t, b: uint64x2_t, c: uint64x2_t) -> uint64x2_t { + let not = int64x2_t::splat(-1); + simd_or( + simd_and(a, simd_cast(b)), + simd_and(simd_xor(a, simd_cast(not)), c), + ) +} + +pub fn vbslq_u8(a: uint8x16_t, b: uint8x16_t, c: uint8x16_t) -> uint8x16_t { + let not = int8x16_t::splat(-1); + simd_or( + simd_and(a, simd_cast(b)), + simd_and(simd_xor(a, simd_cast(not)), c), + ) +} + +pub fn vceq_s8(a: int8x8_t, b: int8x8_t) -> uint8x8_t { + simd_cast(simd_eq(a, b)) +} + +pub fn vceqq_s8(a: int8x16_t, b: int8x16_t) -> uint8x16_t { + simd_cast(simd_eq(a, b)) +} + +pub fn vceq_s16(a: int16x4_t, b: int16x4_t) -> uint16x4_t { + simd_cast(simd_eq(a, b)) +} + +pub fn vceqq_s16(a: int16x8_t, b: int16x8_t) -> uint16x8_t { + simd_cast(simd_eq(a, b)) +} + +pub fn vceq_s32(a: int32x2_t, b: int32x2_t) -> uint32x2_t { + simd_cast(simd_eq(a, b)) +} + +pub fn vceqq_s32(a: int32x4_t, b: int32x4_t) -> uint32x4_t { + simd_cast(simd_eq(a, b)) +} + +pub fn vceq_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t { + simd_eq(a, b) +} + +pub fn vceqq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { + simd_eq(a, b) +} + +pub fn vceq_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t { + simd_eq(a, b) +} + +pub fn vceqq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t { + simd_eq(a, b) +} + +pub fn vceq_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t { + simd_eq(a, b) +} + +pub fn vceqq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t { + simd_eq(a, b) +} + +pub fn vcge_s8(a: int8x8_t, b: int8x8_t) -> uint8x8_t { + simd_cast(simd_ge(a, b)) +} + +pub fn vcgeq_s8(a: int8x16_t, b: int8x16_t) -> uint8x16_t { + simd_cast(simd_ge(a, b)) +} + +pub fn vcge_s16(a: int16x4_t, b: int16x4_t) -> uint16x4_t { + simd_cast(simd_ge(a, b)) +} + +pub fn vcgeq_s16(a: int16x8_t, b: int16x8_t) -> uint16x8_t { + simd_cast(simd_ge(a, b)) +} + +pub fn vcge_s32(a: int32x2_t, b: int32x2_t) -> uint32x2_t { + simd_cast(simd_ge(a, b)) +} + +pub fn vcgeq_s32(a: int32x4_t, b: int32x4_t) -> uint32x4_t { + simd_cast(simd_ge(a, b)) +} + +pub fn vcge_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t { + simd_ge(a, b) +} + +pub fn vcgeq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t { + simd_ge(a, b) +} + +pub fn vcge_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t { + simd_ge(a, b) +} + +pub fn vcgeq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t { + simd_ge(a, b) +} + +pub fn vcge_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t { + simd_ge(a, b) +} + +pub fn vcgeq_u32(a: uint32x4_t, b: 
uint32x4_t) -> uint32x4_t {
+    simd_ge(a, b)
+}
+
+pub fn vcgt_s8(a: int8x8_t, b: int8x8_t) -> uint8x8_t {
+    simd_cast(simd_gt(a, b))
+}
+
+pub fn vcgtq_s8(a: int8x16_t, b: int8x16_t) -> uint8x16_t {
+    simd_cast(simd_gt(a, b))
+}
+
+pub fn vcgt_s16(a: int16x4_t, b: int16x4_t) -> uint16x4_t {
+    simd_cast(simd_gt(a, b))
+}
+
+pub fn vcgtq_s16(a: int16x8_t, b: int16x8_t) -> uint16x8_t {
+    simd_cast(simd_gt(a, b))
+}
+
+pub fn vcgt_s32(a: int32x2_t, b: int32x2_t) -> uint32x2_t {
+    simd_cast(simd_gt(a, b))
+}
+
+pub fn vcgtq_s32(a: int32x4_t, b: int32x4_t) -> uint32x4_t {
+    simd_cast(simd_gt(a, b))
+}
+
+pub fn vcgt_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t {
+    simd_gt(a, b)
+}
+
+pub fn vcgtq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t {
+    simd_gt(a, b)
+}
+
+pub fn vcgt_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t {
+    simd_gt(a, b)
+}
+
+pub fn vcgtq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t {
+    simd_gt(a, b)
+}
+
+pub fn vcgt_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t {
+    simd_gt(a, b)
+}
+
+pub fn vcgtq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t {
+    simd_gt(a, b)
+}
+
+pub fn vcle_s8(a: int8x8_t, b: int8x8_t) -> uint8x8_t {
+    simd_cast(simd_le(a, b))
+}
+
+pub fn vcleq_s8(a: int8x16_t, b: int8x16_t) -> uint8x16_t {
+    simd_cast(simd_le(a, b))
+}
+
+pub fn vcle_s16(a: int16x4_t, b: int16x4_t) -> uint16x4_t {
+    simd_cast(simd_le(a, b))
+}
+
+pub fn vcleq_s16(a: int16x8_t, b: int16x8_t) -> uint16x8_t {
+    simd_cast(simd_le(a, b))
+}
+
+pub fn vcle_s32(a: int32x2_t, b: int32x2_t) -> uint32x2_t {
+    simd_cast(simd_le(a, b))
+}
+
+pub fn vcleq_s32(a: int32x4_t, b: int32x4_t) -> uint32x4_t {
+    simd_cast(simd_le(a, b))
+}
+
+pub fn vcle_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t {
+    simd_le(a, b)
+}
+
+pub fn vcleq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t {
+    simd_le(a, b)
+}
+
+pub fn vcle_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t {
+    simd_le(a, b)
+}
+
+pub fn vcleq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t {
+    simd_le(a, b)
+}
+
+pub fn vcle_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t {
+    simd_le(a, b)
+}
+
+pub fn vcleq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t {
+    simd_le(a, b)
+}
diff --git a/testable-simd-models/src/core_arch/arm_shared/tests/mod.rs b/testable-simd-models/src/core_arch/arm_shared/tests/mod.rs
new file mode 100644
index 0000000000000..7ec0df1263b7f
--- /dev/null
+++ b/testable-simd-models/src/core_arch/arm_shared/tests/mod.rs
@@ -0,0 +1,112 @@
+//! Tests for the intrinsics modelled in `crate::core_arch::arm_shared::models`.
+//!
+//! Each and every modelled intrinsic is tested against the Rust
+//! implementation here. For the most part, the tests work by
+//! generating random inputs, passing them to both the models in this
+//! crate and the corresponding intrinsics in the Rust core, and then
+//! comparing their outputs.
+//!
+//! To add a test for a modelled intrinsic, go to the appropriate file and
+//! use the `mk!` macro to define it.
+//!
+//! A `mk!` macro invocation has the shape
+//!
+//! `mk!([<number of tests>]<function name>{<<const values>>, ...}(<argument>: <type>, ...))`
+//!
+//! For example, some valid invocations are
+//!
+//! `mk!([100]_mm256_extracti128_si256{<0>,<1>}(a: BitVec));`
+//! `mk!(_mm256_extracti128_si256{<0>,<1>}(a: BitVec));`
+//! `mk!(_mm256_abs_epi16(a: BitVec));`
+//!
+//! The number of random tests is optional. If not provided, it is taken to be 1000 by default.
+//! The const values are necessary if the function has constant arguments, and must be omitted otherwise.
+//! The function name and the function arguments are necessary in all cases.
+//!
+//! Note: This only works if the function returns a bit-vector or funarray. If it returns an integer, the
+//! test has to be written manually. It is recommended that a manually defined test follow
+//! the pattern of the tests defined via `mk!` invocations. It is also recommended that, in the
+//! case that the intrinsic takes constant arguments, each and every possible constant value
+//! (up to a maximum of 255) that can be passed to the function be used for testing. The number
+//! of constant values passed depends on whether the Rust intrinsic statically asserts that the
+//! length of the constant argument be less than or equal to a certain number of bits.
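+//!
+//! For instance, a hand-written test for an intrinsic that returns a plain
+//! integer could mirror the shape of a `mk!` expansion. The following is an
+//! illustrative sketch only (it borrows the x86 `_mm256_testz_si256` model
+//! and assumes the usual `models`/`upstream` import conventions of these
+//! test modules):
+//!
+//! ```ignore
+//! #[test]
+//! fn _mm256_testz_si256() {
+//!     for _ in 0..1000 {
+//!         let a = BitVec::random();
+//!         let b = BitVec::random();
+//!         // The result is an integer, so it is compared directly; no
+//!         // conversion back from a FunArray or BitVec is needed.
+//!         assert_eq!(
+//!             models::_mm256_testz_si256(a, b),
+//!             unsafe { upstream::_mm256_testz_si256(a.into(), b.into()) },
+//!         );
+//!     }
+//! }
+//! ```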
+
+pub mod neon;
+
+#[allow(non_camel_case_types)]
+mod types {
+    use crate::abstractions::simd::*;
+    pub type int32x4_t = i32x4;
+    pub type int64x1_t = i64x1;
+    pub type int64x2_t = i64x2;
+    pub type int16x8_t = i16x8;
+    pub type int8x16_t = i8x16;
+    pub type uint32x4_t = u32x4;
+    pub type uint64x1_t = u64x1;
+    pub type uint64x2_t = u64x2;
+    pub type uint16x8_t = u16x8;
+    pub type uint8x16_t = u8x16;
+    pub type int32x2_t = i32x2;
+    pub type int16x4_t = i16x4;
+    pub type int8x8_t = i8x8;
+    pub type uint32x2_t = u32x2;
+    pub type uint16x4_t = u16x4;
+    pub type uint8x8_t = u8x8;
+}
+
+pub(crate) mod upstream {
+    #[cfg(target_arch = "aarch64")]
+    pub use core::arch::aarch64::*;
+    #[cfg(target_arch = "arm")]
+    pub use core::arch::arm::*;
+}
+
+#[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
+pub mod conversions {
+    use super::upstream::*;
+
+    use super::types;
+    use crate::abstractions::bitvec::BitVec;
+    use crate::abstractions::funarr::FunArray;
+
+    macro_rules! convert{
+        ($($ty1:ident [$ty2:ty ; $n:literal]),*) => {
+            $(
+                impl From<$ty1> for types::$ty1 {
+                    fn from (arg: $ty1) -> types::$ty1 {
+                        let stuff = unsafe { *(&arg as *const $ty1 as *const [$ty2; $n])};
+                        FunArray::from_fn(|i|
+                            stuff[i as usize]
+                        )
+                    }
+                }
+                impl From<types::$ty1> for $ty1 {
+                    fn from (arg: types::$ty1) -> $ty1 {
+                        let bv: &[u8] = &(BitVec::from(arg)).to_vec()[..];
+                        unsafe {
+                            *(bv.as_ptr() as *const [$ty2; $n] as *const _)
+                        }
+                    }
+                }
+            )*
+        }
+    }
+
+    convert!(
+        int32x4_t [i32; 4],
+        int64x1_t [i64; 1],
+        int64x2_t [i64; 2],
+        int16x8_t [i16; 8],
+        int8x16_t [i8; 16],
+        uint32x4_t [u32; 4],
+        uint64x1_t [u64; 1],
+        uint64x2_t [u64; 2],
+        uint16x8_t [u16; 8],
+        uint8x16_t [u8; 16],
+        int32x2_t [i32; 2],
+        int16x4_t [i16; 4],
+        int8x8_t [i8; 8],
+        uint32x2_t [u32; 2],
+        uint16x4_t [u16; 4],
+        uint8x8_t [u8; 8]
+    );
+}
diff --git a/testable-simd-models/src/core_arch/arm_shared/tests/neon.rs b/testable-simd-models/src/core_arch/arm_shared/tests/neon.rs
new file mode 100644
index 0000000000000..e07d385f656f6
--- /dev/null
+++ b/testable-simd-models/src/core_arch/arm_shared/tests/neon.rs
@@ -0,0 +1,218 @@
+#[cfg(test)]
+use super::upstream;
+use crate::abstractions::funarr::FunArray;
+use crate::helpers::test::HasRandom;
+/// Derives a test for a given intrinsic: the generated test checks that the
+/// intrinsic and its model compute the same thing over random inputs (1000 by
+/// default).
macro_rules! mk {
    ($([$N:literal])?$name:ident$({$(<$($c:literal),*>),*})?($($x:ident : $ty:ident),*)) => {
        #[test]
        fn $name() {
            #[allow(unused)]
            const N: usize = {
                let n: usize = 1000;
                $(let n: usize = $N;)?
+ n + }; + mk!(@[N]$name$($(<$($c),*>)*)?($($x : $ty),*)); + } + }; + (@[$N:ident]$name:ident$(<$($c:literal),*>)?($($x:ident : $ty:ident),*)) => { + for _ in 0..$N { + $(let $x = $ty::random();)* + assert_eq!(super::super::models::neon::$name$(::<$($c,)*>)?($($x.into(),)*), unsafe { + FunArray::from(upstream::$name$(::<$($c,)*>)?($($x.into(),)*)).into() + }); + } + }; + (@[$N:ident]$name:ident<$($c1:literal),*>$(<$($c:literal),*>)*($($x:ident : $ty:ident),*)) => { + let one = || { + mk!(@[$N]$name<$($c1),*>($($x : $ty),*)); + }; + one(); + mk!(@[$N]$name$(<$($c),*>)*($($x : $ty),*)); + } + +} + +use super::types::*; +mk!(vaba_s16(a: int16x4_t, b: int16x4_t, c: int16x4_t)); +mk!(vaba_s32(a: int32x2_t, b: int32x2_t, c: int32x2_t)); +mk!(vaba_s8(a: int8x8_t, b: int8x8_t, c: int8x8_t)); +mk!(vaba_u16(a: uint16x4_t, b: uint16x4_t, c: uint16x4_t)); +mk!(vaba_u32(a: uint32x2_t, b: uint32x2_t, c: uint32x2_t)); +mk!(vaba_u8(a: uint8x8_t, b: uint8x8_t, c: uint8x8_t)); +mk!(vabal_u8(a: uint16x8_t, b: uint8x8_t, c: uint8x8_t)); +mk!(vabal_u16(a: uint32x4_t, b: uint16x4_t, c: uint16x4_t)); +mk!(vabal_u32(a: uint64x2_t, b: uint32x2_t, c: uint32x2_t)); +mk!(vabaq_s16(a: int16x8_t, b: int16x8_t, c: int16x8_t)); +mk!(vabaq_s32(a: int32x4_t, b: int32x4_t, c: int32x4_t)); +mk!(vabaq_s8(a: int8x16_t, b: int8x16_t, c: int8x16_t)); +mk!(vabaq_u16(a: uint16x8_t, b: uint16x8_t, c: uint16x8_t)); +mk!(vabaq_u32(a: uint32x4_t, b: uint32x4_t, c: uint32x4_t)); +mk!(vabaq_u8(a: uint8x16_t, b: uint8x16_t, c: uint8x16_t)); +mk!(vabd_s8(a: int8x8_t, b: int8x8_t)); +mk!(vabdq_s8(a: int8x16_t, b: int8x16_t)); +mk!(vabd_s16(a: int16x4_t, b: int16x4_t)); +mk!(vabdq_s16(a: int16x8_t, b: int16x8_t)); +mk!(vabd_s32(a: int32x2_t, b: int32x2_t)); +mk!(vabdq_s32(a: int32x4_t, b: int32x4_t)); +mk!(vabd_u8(a: uint8x8_t, b: uint8x8_t)); +mk!(vabdq_u8(a: uint8x16_t, b: uint8x16_t)); +mk!(vabd_u16(a: uint16x4_t, b: uint16x4_t)); +mk!(vabdq_u16(a: uint16x8_t, b: uint16x8_t)); +mk!(vabd_u32(a: uint32x2_t, b: uint32x2_t)); +mk!(vabdq_u32(a: uint32x4_t, b: uint32x4_t)); +mk!(vabdl_u8(a: uint8x8_t, b: uint8x8_t)); +mk!(vabdl_u16(a: uint16x4_t, b: uint16x4_t)); +mk!(vabdl_u32(a: uint32x2_t, b: uint32x2_t)); +mk!(vabs_s8(a: int8x8_t)); +mk!(vabsq_s8(a: int8x16_t)); +mk!(vabs_s16(a: int16x4_t)); +mk!(vabsq_s16(a: int16x8_t)); +mk!(vabs_s32(a: int32x2_t)); +mk!(vabsq_s32(a: int32x4_t)); +mk!(vadd_s16(a: int16x4_t, b: int16x4_t)); +mk!(vadd_s32(a: int32x2_t, b: int32x2_t)); +mk!(vadd_s8(a: int8x8_t, b: int8x8_t)); +mk!(vadd_u16(a: uint16x4_t, b: uint16x4_t)); +mk!(vadd_u32(a: uint32x2_t, b: uint32x2_t)); +mk!(vadd_u8(a: uint8x8_t, b: uint8x8_t)); +mk!(vaddq_s16(a: int16x8_t, b: int16x8_t)); +mk!(vaddq_s32(a: int32x4_t, b: int32x4_t)); +mk!(vaddq_s64(a: int64x2_t, b: int64x2_t)); +mk!(vaddq_s8(a: int8x16_t, b: int8x16_t)); +mk!(vaddq_u16(a: uint16x8_t, b: uint16x8_t)); +mk!(vaddq_u32(a: uint32x4_t, b: uint32x4_t)); +mk!(vaddq_u64(a: uint64x2_t, b: uint64x2_t)); +mk!(vaddq_u8(a: uint8x16_t, b: uint8x16_t)); +mk!(vaddhn_high_s16(r: int8x8_t, a: int16x8_t, b: int16x8_t)); +mk!(vaddhn_high_s32(r: int16x4_t, a: int32x4_t, b: int32x4_t)); +mk!(vaddhn_high_s64(r: int32x2_t, a: int64x2_t, b: int64x2_t)); +mk!(vaddhn_high_u16(r: uint8x8_t, a: uint16x8_t, b: uint16x8_t)); +mk!(vaddhn_high_u32(r: uint16x4_t, a: uint32x4_t, b: uint32x4_t)); +mk!(vaddhn_high_u64(r: uint32x2_t, a: uint64x2_t, b: uint64x2_t)); +mk!(vaddhn_s16(a: int16x8_t, b: int16x8_t)); +mk!(vaddhn_s32(a: int32x4_t, b: int32x4_t)); +mk!(vaddhn_s64(a: int64x2_t, b: int64x2_t)); 
+mk!(vaddhn_u16(a: uint16x8_t, b: uint16x8_t)); +mk!(vaddhn_u32(a: uint32x4_t, b: uint32x4_t)); +mk!(vaddhn_u64(a: uint64x2_t, b: uint64x2_t)); +mk!(vaddl_high_s16(a: int16x8_t, b: int16x8_t)); +mk!(vaddl_high_s32(a: int32x4_t, b: int32x4_t)); +mk!(vaddl_high_s8(a: int8x16_t, b: int8x16_t)); +mk!(vaddl_high_u16(a: uint16x8_t, b: uint16x8_t)); +mk!(vaddl_high_u32(a: uint32x4_t, b: uint32x4_t)); +mk!(vaddl_high_u8(a: uint8x16_t, b: uint8x16_t)); +mk!(vaddl_s16(a: int16x4_t, b: int16x4_t)); +mk!(vaddl_s32(a: int32x2_t, b: int32x2_t)); +mk!(vaddl_s8(a: int8x8_t, b: int8x8_t)); +mk!(vaddl_u16(a: uint16x4_t, b: uint16x4_t)); +mk!(vaddl_u32(a: uint32x2_t, b: uint32x2_t)); +mk!(vaddl_u8(a: uint8x8_t, b: uint8x8_t)); +mk!(vaddw_high_s16(a: int32x4_t, b: int16x8_t)); +mk!(vaddw_high_s32(a: int64x2_t, b: int32x4_t)); +mk!(vaddw_high_s8(a: int16x8_t, b: int8x16_t)); +mk!(vaddw_high_u16(a: uint32x4_t, b: uint16x8_t)); +mk!(vaddw_high_u32(a: uint64x2_t, b: uint32x4_t)); +mk!(vaddw_high_u8(a: uint16x8_t, b: uint8x16_t)); +mk!(vaddw_s16(a: int32x4_t, b: int16x4_t)); +mk!(vaddw_s32(a: int64x2_t, b: int32x2_t)); +mk!(vaddw_s8(a: int16x8_t, b: int8x8_t)); +mk!(vaddw_u16(a: uint32x4_t, b: uint16x4_t)); +mk!(vaddw_u32(a: uint64x2_t, b: uint32x2_t)); +mk!(vaddw_u8(a: uint16x8_t, b: uint8x8_t)); +mk!(vand_s8(a: int8x8_t, b: int8x8_t)); +mk!(vandq_s8(a: int8x16_t, b: int8x16_t)); +mk!(vand_s16(a: int16x4_t, b: int16x4_t)); +mk!(vandq_s16(a: int16x8_t, b: int16x8_t)); +mk!(vand_s32(a: int32x2_t, b: int32x2_t)); +mk!(vandq_s32(a: int32x4_t, b: int32x4_t)); +mk!(vand_s64(a: int64x1_t, b: int64x1_t)); +mk!(vandq_s64(a: int64x2_t, b: int64x2_t)); +mk!(vand_u8(a: uint8x8_t, b: uint8x8_t)); +mk!(vandq_u8(a: uint8x16_t, b: uint8x16_t)); +mk!(vand_u16(a: uint16x4_t, b: uint16x4_t)); +mk!(vandq_u16(a: uint16x8_t, b: uint16x8_t)); +mk!(vand_u32(a: uint32x2_t, b: uint32x2_t)); +mk!(vandq_u32(a: uint32x4_t, b: uint32x4_t)); +mk!(vand_u64(a: uint64x1_t, b: uint64x1_t)); +mk!(vandq_u64(a: uint64x2_t, b: uint64x2_t)); +mk!(vbic_s16(a: int16x4_t, b: int16x4_t)); +mk!(vbic_s32(a: int32x2_t, b: int32x2_t)); +mk!(vbic_s8(a: int8x8_t, b: int8x8_t)); +mk!(vbicq_s16(a: int16x8_t, b: int16x8_t)); +mk!(vbicq_s32(a: int32x4_t, b: int32x4_t)); +mk!(vbicq_s64(a: int64x2_t, b: int64x2_t)); +mk!(vbicq_s8(a: int8x16_t, b: int8x16_t)); +mk!(vbic_u16(a: uint16x4_t, b: uint16x4_t)); +mk!(vbic_u32(a: uint32x2_t, b: uint32x2_t)); +mk!(vbic_u64(a: uint64x1_t, b: uint64x1_t)); +mk!(vbic_u8(a: uint8x8_t, b: uint8x8_t)); +mk!(vbicq_u16(a: uint16x8_t, b: uint16x8_t)); +mk!(vbicq_u32(a: uint32x4_t, b: uint32x4_t)); +mk!(vbicq_u64(a: uint64x2_t, b: uint64x2_t)); +mk!(vbicq_u8(a: uint8x16_t, b: uint8x16_t)); +mk!(vbsl_s16(a: uint16x4_t, b: int16x4_t, c: int16x4_t)); +mk!(vbsl_s32(a: uint32x2_t, b: int32x2_t, c: int32x2_t)); +mk!(vbsl_s64(a: uint64x1_t, b: int64x1_t, c: int64x1_t)); +mk!(vbsl_s8(a: uint8x8_t, b: int8x8_t, c: int8x8_t)); +mk!(vbslq_s16(a: uint16x8_t, b: int16x8_t, c: int16x8_t)); +mk!(vbslq_s32(a: uint32x4_t, b: int32x4_t, c: int32x4_t)); +mk!(vbslq_s64(a: uint64x2_t, b: int64x2_t, c: int64x2_t)); +mk!(vbslq_s8(a: uint8x16_t, b: int8x16_t, c: int8x16_t)); +mk!(vbsl_u16(a: uint16x4_t, b: uint16x4_t, c: uint16x4_t)); +mk!(vbsl_u32(a: uint32x2_t, b: uint32x2_t, c: uint32x2_t)); +mk!(vbsl_u64(a: uint64x1_t, b: uint64x1_t, c: uint64x1_t)); +mk!(vbsl_u8(a: uint8x8_t, b: uint8x8_t, c: uint8x8_t)); +mk!(vbslq_u16(a: uint16x8_t, b: uint16x8_t, c: uint16x8_t)); +mk!(vbslq_u32(a: uint32x4_t, b: uint32x4_t, c: uint32x4_t)); +mk!(vbslq_u64(a: uint64x2_t, 
b: uint64x2_t, c: uint64x2_t));
+mk!(vbslq_u8(a: uint8x16_t, b: uint8x16_t, c: uint8x16_t));
+mk!(vceq_s8(a: int8x8_t, b: int8x8_t));
+mk!(vceqq_s8(a: int8x16_t, b: int8x16_t));
+mk!(vceq_s16(a: int16x4_t, b: int16x4_t));
+mk!(vceqq_s16(a: int16x8_t, b: int16x8_t));
+mk!(vceq_s32(a: int32x2_t, b: int32x2_t));
+mk!(vceqq_s32(a: int32x4_t, b: int32x4_t));
+mk!(vceq_u8(a: uint8x8_t, b: uint8x8_t));
+mk!(vceqq_u8(a: uint8x16_t, b: uint8x16_t));
+mk!(vceq_u16(a: uint16x4_t, b: uint16x4_t));
+mk!(vceqq_u16(a: uint16x8_t, b: uint16x8_t));
+mk!(vceq_u32(a: uint32x2_t, b: uint32x2_t));
+mk!(vceqq_u32(a: uint32x4_t, b: uint32x4_t));
+mk!(vcge_s8(a: int8x8_t, b: int8x8_t));
+mk!(vcgeq_s8(a: int8x16_t, b: int8x16_t));
+mk!(vcge_s16(a: int16x4_t, b: int16x4_t));
+mk!(vcgeq_s16(a: int16x8_t, b: int16x8_t));
+mk!(vcge_s32(a: int32x2_t, b: int32x2_t));
+mk!(vcgeq_s32(a: int32x4_t, b: int32x4_t));
+mk!(vcge_u8(a: uint8x8_t, b: uint8x8_t));
+mk!(vcgeq_u8(a: uint8x16_t, b: uint8x16_t));
+mk!(vcge_u16(a: uint16x4_t, b: uint16x4_t));
+mk!(vcgeq_u16(a: uint16x8_t, b: uint16x8_t));
+mk!(vcge_u32(a: uint32x2_t, b: uint32x2_t));
+mk!(vcgeq_u32(a: uint32x4_t, b: uint32x4_t));
+mk!(vcgt_s8(a: int8x8_t, b: int8x8_t));
+mk!(vcgtq_s8(a: int8x16_t, b: int8x16_t));
+mk!(vcgt_s16(a: int16x4_t, b: int16x4_t));
+mk!(vcgtq_s16(a: int16x8_t, b: int16x8_t));
+mk!(vcgt_s32(a: int32x2_t, b: int32x2_t));
+mk!(vcgtq_s32(a: int32x4_t, b: int32x4_t));
+mk!(vcgt_u8(a: uint8x8_t, b: uint8x8_t));
+mk!(vcgtq_u8(a: uint8x16_t, b: uint8x16_t));
+mk!(vcgt_u16(a: uint16x4_t, b: uint16x4_t));
+mk!(vcgtq_u16(a: uint16x8_t, b: uint16x8_t));
+mk!(vcgt_u32(a: uint32x2_t, b: uint32x2_t));
+mk!(vcgtq_u32(a: uint32x4_t, b: uint32x4_t));
+mk!(vcle_s8(a: int8x8_t, b: int8x8_t));
+mk!(vcleq_s8(a: int8x16_t, b: int8x16_t));
+mk!(vcle_s16(a: int16x4_t, b: int16x4_t));
+mk!(vcleq_s16(a: int16x8_t, b: int16x8_t));
+mk!(vcle_s32(a: int32x2_t, b: int32x2_t));
+mk!(vcleq_s32(a: int32x4_t, b: int32x4_t));
+mk!(vcle_u8(a: uint8x8_t, b: uint8x8_t));
+mk!(vcleq_u8(a: uint8x16_t, b: uint8x16_t));
+mk!(vcle_u16(a: uint16x4_t, b: uint16x4_t));
+mk!(vcleq_u16(a: uint16x8_t, b: uint16x8_t));
+mk!(vcle_u32(a: uint32x2_t, b: uint32x2_t));
+mk!(vcleq_u32(a: uint32x4_t, b: uint32x4_t));
diff --git a/testable-simd-models/src/core_arch/x86/mod.rs b/testable-simd-models/src/core_arch/x86/mod.rs
new file mode 100644
index 0000000000000..3c5cd51d9c56b
--- /dev/null
+++ b/testable-simd-models/src/core_arch/x86/mod.rs
@@ -0,0 +1,4 @@
+pub mod models;
+#[cfg(test)]
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+mod tests;
diff --git a/testable-simd-models/src/core_arch/x86/models/avx.rs b/testable-simd-models/src/core_arch/x86/models/avx.rs
new file mode 100644
index 0000000000000..f392a7abf05b0
--- /dev/null
+++ b/testable-simd-models/src/core_arch/x86/models/avx.rs
@@ -0,0 +1,432 @@
+//! Advanced Vector Extensions (AVX)
+//!
+//! The references are:
+//!
+//! - [Intel 64 and IA-32 Architectures Software Developer's Manual Volume 2:
+//!   Instruction Set Reference, A-Z][intel64_ref].
+//! - [AMD64 Architecture Programmer's Manual, Volume 3: General-Purpose and
+//!   System Instructions][amd64_ref].
+//!
+//! [Wikipedia][wiki] provides a quick overview of the instructions available.
+//!
+//! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
+//! [amd64_ref]: http://support.amd.com/TechDocs/24594.pdf
+//! [wiki]: https://en.wikipedia.org/wiki/Advanced_Vector_Extensions
+
+use super::types::*;
+use crate::abstractions::{bit::Bit, bitvec::BitVec, simd::*};
+
+mod c_extern {
+    use crate::abstractions::simd::*;
+
+    pub fn vperm2f128si256(a: i32x8, b: i32x8, imm8: i8) -> i32x8 {
+        // Packs the four i32 lanes of the selected 128-bit half into an i128.
+        let half = |v: i32x8, hi: u64| -> i128 {
+            (v[4 * hi] as u32 as i128)
+                | ((v[4 * hi + 1] as u32 as i128) << 32)
+                | ((v[4 * hi + 2] as u32 as i128) << 64)
+                | ((v[4 * hi + 3] as u32 as i128) << 96)
+        };
+        // Each output half is picked by one 4-bit control nibble of `imm8`:
+        // bits 0-1 select one of the four input halves, and bit 3 zeroes the
+        // output half instead.
+        let temp = i128x2::from_fn(|i| {
+            let control = (imm8 as u8) >> (i * 4);
+            if control & 0b1000 != 0 {
+                0
+            } else {
+                match control & 0b11 {
+                    0 => half(a, 0),
+                    1 => half(a, 1),
+                    2 => half(b, 0),
+                    3 => half(b, 1),
+                    _ => unreachable!(),
+                }
+            }
+        });
+
+        i32x8::from_fn(|i| (temp[if i < 4 { 0 } else { 1 }] >> (32 * (i % 4))) as i32)
+    }
+}
+
+use c_extern::*;
+/// Blends packed single-precision (32-bit) floating-point elements from
+/// `a` and `b` using `c` as a mask.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blendv_ps)
+pub fn _mm256_blendv_ps(a: __m256, b: __m256, c: __m256) -> __m256 {
+    let mask: i32x8 = simd_lt(BitVec::to_i32x8(c), i32x8::from_fn(|_| 0));
+    BitVec::from_i32x8(simd_select(mask, BitVec::to_i32x8(b), BitVec::to_i32x8(a)))
+}
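+
+// A quick illustration of the blend semantics above: each 32-bit lane of the
+// result is taken from `b` exactly when the corresponding lane of the mask
+// `c` has its sign bit set. This extra check is a sketch for readers, not
+// part of the model itself.
+#[cfg(test)]
+#[test]
+fn blendv_ps_example() {
+    let a = BitVec::from_i32x8(i32x8::from_fn(|_| 1));
+    let b = BitVec::from_i32x8(i32x8::from_fn(|_| 2));
+    // Sign bit set (lane = -1) in the even lanes only.
+    let c = BitVec::from_i32x8(i32x8::from_fn(|i| if i % 2 == 0 { -1 } else { 0 }));
+    let r = BitVec::to_i32x8(_mm256_blendv_ps(a, b, c));
+    for i in 0..8 {
+        assert_eq!(r[i], if i % 2 == 0 { 2 } else { 1 });
+    }
+}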
+
+/// Equal (ordered, non-signaling)
+pub const _CMP_EQ_OQ: i32 = 0x00;
+/// Less-than (ordered, signaling)
+pub const _CMP_LT_OS: i32 = 0x01;
+/// Less-than-or-equal (ordered, signaling)
+pub const _CMP_LE_OS: i32 = 0x02;
+/// Unordered (non-signaling)
+pub const _CMP_UNORD_Q: i32 = 0x03;
+/// Not-equal (unordered, non-signaling)
+pub const _CMP_NEQ_UQ: i32 = 0x04;
+/// Not-less-than (unordered, signaling)
+pub const _CMP_NLT_US: i32 = 0x05;
+/// Not-less-than-or-equal (unordered, signaling)
+pub const _CMP_NLE_US: i32 = 0x06;
+/// Ordered (non-signaling)
+pub const _CMP_ORD_Q: i32 = 0x07;
+/// Equal (unordered, non-signaling)
+pub const _CMP_EQ_UQ: i32 = 0x08;
+/// Not-greater-than-or-equal (unordered, signaling)
+pub const _CMP_NGE_US: i32 = 0x09;
+/// Not-greater-than (unordered, signaling)
+pub const _CMP_NGT_US: i32 = 0x0a;
+/// False (ordered, non-signaling)
+pub const _CMP_FALSE_OQ: i32 = 0x0b;
+/// Not-equal (ordered, non-signaling)
+pub const _CMP_NEQ_OQ: i32 = 0x0c;
+/// Greater-than-or-equal (ordered, signaling)
+pub const _CMP_GE_OS: i32 = 0x0d;
+/// Greater-than (ordered, signaling)
+pub const _CMP_GT_OS: i32 = 0x0e;
+/// True (unordered, non-signaling)
+pub const _CMP_TRUE_UQ: i32 = 0x0f;
+/// Equal (ordered, signaling)
+pub const _CMP_EQ_OS: i32 = 0x10;
+/// Less-than (ordered, non-signaling)
+pub const _CMP_LT_OQ: i32 = 0x11;
+/// Less-than-or-equal (ordered, non-signaling)
+pub const _CMP_LE_OQ: i32 = 0x12;
+/// Unordered (signaling)
+pub const _CMP_UNORD_S: i32 = 0x13;
+/// Not-equal (unordered, signaling)
+pub const _CMP_NEQ_US: i32 = 0x14;
+/// Not-less-than (unordered, non-signaling)
+pub const _CMP_NLT_UQ: i32 = 0x15;
+/// Not-less-than-or-equal (unordered, non-signaling)
+pub const _CMP_NLE_UQ: i32 = 0x16;
+/// Ordered (signaling)
+pub const _CMP_ORD_S: i32 = 0x17;
+/// Equal (unordered, signaling)
+pub const _CMP_EQ_US: i32 = 0x18;
+/// Not-greater-than-or-equal (unordered, non-signaling)
+pub const _CMP_NGE_UQ: i32 = 0x19;
+/// Not-greater-than (unordered, non-signaling)
+pub const _CMP_NGT_UQ: i32 = 0x1a;
+/// False (ordered, signaling)
+pub const _CMP_FALSE_OS: i32 = 0x1b;
+/// Not-equal (ordered, signaling)
+pub const _CMP_NEQ_OS: i32 = 0x1c;
+/// Greater-than-or-equal (ordered, non-signaling)
+pub const _CMP_GE_OQ: i32 = 0x1d;
+/// Greater-than (ordered, non-signaling)
+pub const _CMP_GT_OQ: i32 = 0x1e;
+/// True (unordered, signaling)
+pub const _CMP_TRUE_US: i32 = 0x1f;
+
+pub fn _mm256_permute2f128_si256<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
+    // static_assert_uimm_bits!(IMM8, 8);
+    vperm2f128si256(BitVec::to_i32x8(a), BitVec::to_i32x8(b), IMM8 as i8).into()
+}
+
+/// Copies `a` to result, then inserts 128 bits from `b` into result
+/// at the location specified by `imm8`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insertf128_si256)
+pub fn _mm256_insertf128_si256<const IMM1: i32>(a: __m256i, b: __m128i) -> __m256i {
+    // static_assert_uimm_bits!(IMM1, 1);
+    let dst: i64x4 = simd_shuffle(
+        BitVec::to_i64x4(a),
+        BitVec::to_i64x4(_mm256_castsi128_si256(b)),
+        [[4, 5, 2, 3], [0, 1, 4, 5]][IMM1 as usize],
+    );
+    dst.into()
+}
+
+/// Copies `a` to result, and inserts the 8-bit integer `i` into result
+/// at the location specified by `index`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insert_epi8)
+// This intrinsic has no corresponding instruction.
+pub fn _mm256_insert_epi8<const INDEX: i32>(a: __m256i, i: i8) -> __m256i {
+    // static_assert_uimm_bits!(INDEX, 5);
+    simd_insert(BitVec::to_i8x32(a), INDEX as u64, i).into()
+}
+
+/// Copies `a` to result, and inserts the 16-bit integer `i` into result
+/// at the location specified by `index`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insert_epi16)
+// This intrinsic has no corresponding instruction.
+pub fn _mm256_insert_epi16<const INDEX: i32>(a: __m256i, i: i16) -> __m256i {
+    // static_assert_uimm_bits!(INDEX, 4);
+    simd_insert(BitVec::to_i16x16(a), INDEX as u64, i).into()
+}
+
+/// Computes the bitwise AND of 256 bits (representing integer data) in `a` and
+/// `b`, and set `ZF` to 1 if the result is zero, otherwise set `ZF` to 0.
+/// Computes the bitwise NOT of `a` and then AND with `b`, and set `CF` to 1 if
+/// the result is zero, otherwise set `CF` to 0. Return the `ZF` value.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testz_si256)
+pub fn _mm256_testz_si256(a: __m256i, b: __m256i) -> i32 {
+    let c = BitVec::<256>::from_fn(|i| match (a[i], b[i]) {
+        (Bit::One, Bit::One) => Bit::One,
+        _ => Bit::Zero,
+    });
+    let all_zero = c.fold(true, |acc, bit| acc && bit == Bit::Zero);
+    if all_zero {
+        1
+    } else {
+        0
+    }
+}
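+
+// The ZF convention modelled above in a nutshell: the result is 1 exactly
+// when `a AND b` has no set bits. The following is an illustrative sketch,
+// not part of the model; it reuses `_mm256_set1_epi8` and
+// `_mm256_setzero_si256` from this file.
+#[cfg(test)]
+#[test]
+fn testz_si256_example() {
+    let ones = _mm256_set1_epi8(-1);
+    let zeros = _mm256_setzero_si256();
+    // Zero shares no set bits with anything, so ZF = 1.
+    assert_eq!(_mm256_testz_si256(zeros, ones), 1);
+    // All-ones overlaps itself, so ZF = 0.
+    assert_eq!(_mm256_testz_si256(ones, ones), 0);
+}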
+
+/// Sets each bit of the returned mask based on the most significant bit of the
+/// corresponding packed single-precision (32-bit) floating-point element in
+/// `a`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movemask_ps)
+pub fn _mm256_movemask_ps(a: __m256) -> i32 {
+    // Propagate the highest bit to the rest, because simd_bitmask
+    // requires all-1 or all-0.
+    let mask: i32x8 = simd_lt(BitVec::to_i32x8(a), i32x8::from_fn(|_| 0));
+    let r = simd_bitmask_little!(7, mask, u8);
+    r as u32 as i32
+}
+
+/// Returns vector of type __m256 with all elements set to zero.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setzero_ps)
+pub fn _mm256_setzero_ps() -> __m256 {
+    BitVec::from_fn(|_| Bit::Zero)
+}
+
+/// Returns vector of type __m256i with all elements set to zero.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setzero_si256)
+pub fn _mm256_setzero_si256() -> __m256i {
+    BitVec::from_fn(|_| Bit::Zero)
+}
+
+/// Sets packed 8-bit integers in returned vector with the supplied values.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_epi8)
+// This intrinsic has no corresponding instruction.
+pub fn _mm256_set_epi8(
+    e00: i8,
+    e01: i8,
+    e02: i8,
+    e03: i8,
+    e04: i8,
+    e05: i8,
+    e06: i8,
+    e07: i8,
+    e08: i8,
+    e09: i8,
+    e10: i8,
+    e11: i8,
+    e12: i8,
+    e13: i8,
+    e14: i8,
+    e15: i8,
+    e16: i8,
+    e17: i8,
+    e18: i8,
+    e19: i8,
+    e20: i8,
+    e21: i8,
+    e22: i8,
+    e23: i8,
+    e24: i8,
+    e25: i8,
+    e26: i8,
+    e27: i8,
+    e28: i8,
+    e29: i8,
+    e30: i8,
+    e31: i8,
+) -> __m256i {
+    let vec = [
+        e00, e01, e02, e03, e04, e05, e06, e07, e08, e09, e10, e11, e12, e13, e14, e15, e16, e17,
+        e18, e19, e20, e21, e22, e23, e24, e25, e26, e27, e28, e29, e30, e31,
+    ];
+    BitVec::from_i8x32(i8x32::from_fn(|i| vec[(31 - i) as usize]))
+}
+
+/// Sets packed 16-bit integers in returned vector with the supplied values.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_epi16)
+// This intrinsic has no corresponding instruction.
+pub fn _mm256_set_epi16(
+    e00: i16,
+    e01: i16,
+    e02: i16,
+    e03: i16,
+    e04: i16,
+    e05: i16,
+    e06: i16,
+    e07: i16,
+    e08: i16,
+    e09: i16,
+    e10: i16,
+    e11: i16,
+    e12: i16,
+    e13: i16,
+    e14: i16,
+    e15: i16,
+) -> __m256i {
+    let vec = [
+        e00, e01, e02, e03, e04, e05, e06, e07, e08, e09, e10, e11, e12, e13, e14, e15,
+    ];
+    BitVec::from_i16x16(i16x16::from_fn(|i| vec[(15 - i) as usize]))
+}
+
+/// Sets packed 32-bit integers in returned vector with the supplied values.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_epi32)
+// This intrinsic has no corresponding instruction.
+pub fn _mm256_set_epi32(
+    e0: i32,
+    e1: i32,
+    e2: i32,
+    e3: i32,
+    e4: i32,
+    e5: i32,
+    e6: i32,
+    e7: i32,
+) -> __m256i {
+    let vec = [e0, e1, e2, e3, e4, e5, e6, e7];
+    BitVec::from_i32x8(i32x8::from_fn(|i| vec[(7 - i) as usize]))
+}
+
+/// Sets packed 64-bit integers in returned vector with the supplied values.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_epi64x)
+// This intrinsic has no corresponding instruction.
+pub fn _mm256_set_epi64x(a: i64, b: i64, c: i64, d: i64) -> __m256i {
+    let vec = [d, c, b, a];
+    BitVec::from_i64x4(i64x4::from_fn(|i| vec[i as usize]))
+}
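+
+// The `_mm256_set_*` family lists its arguments from the highest lane down to
+// the lowest, which is why the models above index their argument array in
+// reverse. A small sanity sketch (illustrative only, not part of the model):
+#[cfg(test)]
+#[test]
+fn set_epi32_lane_order() {
+    let v = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
+    let lanes = BitVec::to_i32x8(v);
+    // Lane 0 holds the last argument, lane 7 the first.
+    for i in 0..8 {
+        assert_eq!(lanes[i], i as i32);
+    }
+}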
+
+/// Broadcasts 8-bit integer `a` to all elements of returned vector.
+/// This intrinsic may generate the `vpbroadcastb`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_epi8)
+// This intrinsic has no corresponding instruction.
+pub fn _mm256_set1_epi8(val: i8) -> __m256i {
+    BitVec::from_i8x32(i8x32::from_fn(|_| val))
+}
+
+/// Broadcasts 16-bit integer `a` to all elements of returned vector.
+/// This intrinsic may generate the `vpbroadcastw`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_epi16)
+// This intrinsic has no corresponding instruction.
+pub fn _mm256_set1_epi16(a: i16) -> __m256i {
+    BitVec::from_i16x16(i16x16::from_fn(|_| a))
+}
+
+/// Broadcasts 32-bit integer `a` to all elements of returned vector.
+/// This intrinsic may generate the `vpbroadcastd`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_epi32)
+// This intrinsic has no corresponding instruction.
+pub fn _mm256_set1_epi32(a: i32) -> __m256i {
+    BitVec::from_i32x8(i32x8::from_fn(|_| a))
+}
+
+/// Broadcasts 64-bit integer `a` to all elements of returned vector.
+/// This intrinsic may generate the `vpbroadcastq`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_epi64x)
+// This intrinsic has no corresponding instruction.
+pub fn _mm256_set1_epi64x(a: i64) -> __m256i {
+    BitVec::from_i64x4(i64x4::from_fn(|_| a))
+}
+
+/// Casts vector of type __m256 to type __m256i.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castps_si256)
+// This intrinsic is only used for compilation and does not generate any
+// instructions, thus it has zero latency.
+pub fn _mm256_castps_si256(a: __m256) -> __m256i {
+    a
+}
+
+/// Casts vector of type __m256i to type __m256.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castsi256_ps)
+// This intrinsic is only used for compilation and does not generate any
+// instructions, thus it has zero latency.
+pub fn _mm256_castsi256_ps(a: __m256i) -> __m256 {
+    a
+}
+
+/// Casts vector of type __m256i to type __m128i.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castsi256_si128)
+// This intrinsic is only used for compilation and does not generate any
+// instructions, thus it has zero latency.
+pub fn _mm256_castsi256_si128(a: __m256i) -> __m128i {
+    BitVec::from_fn(|i| a[i])
+}
+
+/// Casts vector of type __m128i to type __m256i;
+/// the upper 128 bits of the result are undefined.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castsi128_si256)
+// This intrinsic is only used for compilation and does not generate any
+// instructions, thus it has zero latency.
+pub fn _mm256_castsi128_si256(a: __m128i) -> __m256i {
+    let a = BitVec::to_i64x2(a);
+    let undefined = i64x2::from_fn(|_| 0);
+    let dst: i64x4 = simd_shuffle(a, undefined, [0, 1, 2, 2]);
+    BitVec::from_i64x4(dst)
+}
+
+/// Sets packed __m256i returned vector with the supplied values.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_m128i)
+pub fn _mm256_set_m128i(hi: __m128i, lo: __m128i) -> __m256i {
+    BitVec::from_fn(|i| if i < 128 { lo[i] } else { hi[i - 128] })
+}
diff --git a/testable-simd-models/src/core_arch/x86/models/avx2.rs b/testable-simd-models/src/core_arch/x86/models/avx2.rs
new file mode 100644
index 0000000000000..05173b19a8c58
--- /dev/null
+++ b/testable-simd-models/src/core_arch/x86/models/avx2.rs
@@ -0,0 +1,2493 @@
+//! Advanced Vector Extensions 2 (AVX2)
+//!
+//! This module contains models for AVX2 intrinsics.
+//! AVX2 expands most AVX commands to 256-bit wide vector registers and
+//! adds [FMA](https://en.wikipedia.org/wiki/Fused_multiply-accumulate).
+//!
+//! The references are:
+//!
+//! - [Intel 64 and IA-32 Architectures Software Developer's Manual Volume 2:
+//!   Instruction Set Reference, A-Z][intel64_ref].
+//! - [AMD64 Architecture Programmer's Manual, Volume 3: General-Purpose and
+//!   System Instructions][amd64_ref].
+//!
+//! Wikipedia's [AVX][wiki_avx] and [FMA][wiki_fma] pages provide a quick
+//! 
overview of the instructions available. +//! +//! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf +//! [amd64_ref]: http://support.amd.com/TechDocs/24594.pdf +//! [wiki_avx]: https://en.wikipedia.org/wiki/Advanced_Vector_Extensions +//! [wiki_fma]: https://en.wikipedia.org/wiki/Fused_multiply-accumulate +use crate::abstractions::{bitvec::BitVec, simd::*}; + +mod c_extern { + use crate::abstractions::{bit::MachineInteger, simd::*}; + pub fn phaddw(a: i16x16, b: i16x16) -> i16x16 { + i16x16::from_fn(|i| { + if i < 4 { + a[2 * i].wrapping_add(a[2 * i + 1]) + } else if i < 8 { + b[2 * (i - 4)].wrapping_add(b[2 * (i - 4) + 1]) + } else if i < 12 { + a[2 * (i - 4)].wrapping_add(a[2 * (i - 4) + 1]) + } else { + b[2 * (i - 8)].wrapping_add(b[2 * (i - 8) + 1]) + } + }) + } + + pub fn phaddd(a: i32x8, b: i32x8) -> i32x8 { + i32x8::from_fn(|i| { + if i < 2 { + a[2 * i].wrapping_add(a[2 * i + 1]) + } else if i < 4 { + b[2 * (i - 2)].wrapping_add(b[2 * (i - 2) + 1]) + } else if i < 6 { + a[2 * (i - 2)].wrapping_add(a[2 * (i - 2) + 1]) + } else { + b[2 * (i - 4)].wrapping_add(b[2 * (i - 4) + 1]) + } + }) + } + + pub fn phaddsw(a: i16x16, b: i16x16) -> i16x16 { + i16x16::from_fn(|i| { + if i < 4 { + a[2 * i].saturating_add(a[2 * i + 1]) + } else if i < 8 { + b[2 * (i - 4)].saturating_add(b[2 * (i - 4) + 1]) + } else if i < 12 { + a[2 * (i - 4)].saturating_add(a[2 * (i - 4) + 1]) + } else { + b[2 * (i - 8)].saturating_add(b[2 * (i - 8) + 1]) + } + }) + } + + pub fn phsubw(a: i16x16, b: i16x16) -> i16x16 { + i16x16::from_fn(|i| { + if i < 4 { + a[2 * i].wrapping_sub(a[2 * i + 1]) + } else if i < 8 { + b[2 * (i - 4)].wrapping_sub(b[2 * (i - 4) + 1]) + } else if i < 12 { + a[2 * (i - 4)].wrapping_sub(a[2 * (i - 4) + 1]) + } else { + b[2 * (i - 8)].wrapping_sub(b[2 * (i - 8) + 1]) + } + }) + } + + pub fn phsubd(a: i32x8, b: i32x8) -> i32x8 { + i32x8::from_fn(|i| { + if i < 2 { + a[2 * i].wrapping_sub(a[2 * i + 1]) + } else if i < 4 { + b[2 * (i - 2)].wrapping_sub(b[2 * (i - 2) + 1]) + } else if i < 6 { + a[2 * (i - 2)].wrapping_sub(a[2 * (i - 2) + 1]) + } else { + b[2 * (i - 4)].wrapping_sub(b[2 * (i - 4) + 1]) + } + }) + } + + pub fn phsubsw(a: i16x16, b: i16x16) -> i16x16 { + i16x16::from_fn(|i| { + if i < 4 { + a[2 * i].saturating_sub(a[2 * i + 1]) + } else if i < 8 { + b[2 * (i - 4)].saturating_sub(b[2 * (i - 4) + 1]) + } else if i < 12 { + a[2 * (i - 4)].saturating_sub(a[2 * (i - 4) + 1]) + } else { + b[2 * (i - 8)].saturating_sub(b[2 * (i - 8) + 1]) + } + }) + } + pub fn pmaddwd(a: i16x16, b: i16x16) -> i32x8 { + i32x8::from_fn(|i| { + (a[2 * i] as i32) * (b[2 * i] as i32) + (a[2 * i + 1] as i32) * (b[2 * i + 1] as i32) + }) + } + + pub fn pmaddubsw(a: u8x32, b: u8x32) -> i16x16 { + i16x16::from_fn(|i| { + ((a[2 * i] as u8 as u16 as i16) * (b[2 * i] as i8 as i16)) + .saturating_add((a[2 * i + 1] as u8 as u16 as i16) * (b[2 * i + 1] as i8 as i16)) + }) + } + pub fn packsswb(a: i16x16, b: i16x16) -> i8x32 { + i8x32::from_fn(|i| { + if i < 8 { + if a[i] > (i8::MAX as i16) { + i8::MAX + } else if a[i] < (i8::MIN as i16) { + i8::MIN + } else { + a[i] as i8 + } + } else if i < 16 { + if b[i - 8] > (i8::MAX as i16) { + i8::MAX + } else if b[i - 8] < (i8::MIN as i16) { + i8::MIN + } else { + b[i - 8] as i8 + } + } else if i < 24 { + if a[i - 8] > (i8::MAX as i16) { + i8::MAX + } else if a[i - 8] < (i8::MIN as i16) { + i8::MIN + } else { + a[i - 8] as i8 + } + } else { + if b[i - 16] > (i8::MAX 
as i16) { + i8::MAX + } else if b[i - 16] < (i8::MIN as i16) { + i8::MIN + } else { + b[i - 16] as i8 + } + } + }) + } + + pub fn packssdw(a: i32x8, b: i32x8) -> i16x16 { + i16x16::from_fn(|i| { + if i < 4 { + if a[i] > (i16::MAX as i32) { + i16::MAX + } else if a[i] < (i16::MIN as i32) { + i16::MIN + } else { + a[i] as i16 + } + } else if i < 8 { + if b[i - 4] > (i16::MAX as i32) { + i16::MAX + } else if b[i - 4] < (i16::MIN as i32) { + i16::MIN + } else { + b[i - 4] as i16 + } + } else if i < 12 { + if a[i - 4] > (i16::MAX as i32) { + i16::MAX + } else if a[i - 4] < (i16::MIN as i32) { + i16::MIN + } else { + a[i - 4] as i16 + } + } else { + if b[i - 8] > (i16::MAX as i32) { + i16::MAX + } else if b[i - 8] < (i16::MIN as i32) { + i16::MIN + } else { + b[i - 8] as i16 + } + } + }) + } + + pub fn packuswb(a: i16x16, b: i16x16) -> u8x32 { + u8x32::from_fn(|i| { + if i < 8 { + if a[i] > (u8::MAX as i16) { + u8::MAX + } else if a[i] < (u8::MIN as i16) { + u8::MIN + } else { + a[i] as u8 + } + } else if i < 16 { + if b[i - 8] > (u8::MAX as i16) { + u8::MAX + } else if b[i - 8] < (u8::MIN as i16) { + u8::MIN + } else { + b[i - 8] as u8 + } + } else if i < 24 { + if a[i - 8] > (u8::MAX as i16) { + u8::MAX + } else if a[i - 8] < (u8::MIN as i16) { + u8::MIN + } else { + a[i - 8] as u8 + } + } else { + if b[i - 16] > (u8::MAX as i16) { + u8::MAX + } else if b[i - 16] < (u8::MIN as i16) { + u8::MIN + } else { + b[i - 16] as u8 + } + } + }) + } + + pub fn packusdw(a: i32x8, b: i32x8) -> u16x16 { + u16x16::from_fn(|i| { + if i < 4 { + if a[i] > (u16::MAX as i32) { + u16::MAX + } else if a[i] < (u16::MIN as i32) { + u16::MIN + } else { + a[i] as u16 + } + } else if i < 8 { + if b[i - 4] > (u16::MAX as i32) { + u16::MAX + } else if b[i - 4] < (u16::MIN as i32) { + u16::MIN + } else { + b[i - 4] as u16 + } + } else if i < 12 { + if a[i - 4] > (u16::MAX as i32) { + u16::MAX + } else if a[i - 4] < (u16::MIN as i32) { + u16::MIN + } else { + a[i - 4] as u16 + } + } else { + if b[i - 8] > (u16::MAX as i32) { + u16::MAX + } else if b[i - 8] < (u16::MIN as i32) { + u16::MIN + } else { + b[i - 8] as u16 + } + } + }) + } + + pub fn psignb(a: i8x32, b: i8x32) -> i8x32 { + i8x32::from_fn(|i| { + if b[i] < 0 { + if a[i] == i8::MIN { + a[i] + } else { + -a[i] + } + } else if b[i] > 0 { + a[i] + } else { + 0 + } + }) + } + pub fn psignw(a: i16x16, b: i16x16) -> i16x16 { + i16x16::from_fn(|i| { + if b[i] < 0 { + if a[i] == i16::MIN { + a[i] + } else { + -a[i] + } + } else if b[i] > 0 { + a[i] + } else { + 0 + } + }) + } + + pub fn psignd(a: i32x8, b: i32x8) -> i32x8 { + i32x8::from_fn(|i| { + if b[i] < 0 { + if a[i] == i32::MIN { + a[i] + } else { + -a[i] + } + } else if b[i] > 0 { + a[i] + } else { + 0 + } + }) + } + + pub fn psllw(a: i16x16, count: i16x8) -> i16x16 { + let count4: u64 = (count[0] as u16) as u64; + let count3: u64 = ((count[1] as u16) as u64) * 65536; + let count2: u64 = ((count[2] as u16) as u64) * 4294967296; + let count1: u64 = ((count[3] as u16) as u64) * 281474976710656; + let count = count1 + count2 + count3 + count4; + i16x16::from_fn(|i| { + if count > 15 { + 0 + } else { + ((a[i] as u16) << count) as i16 + } + }) + } + + pub fn pslld(a: i32x8, count: i32x4) -> i32x8 { + let count: u64 = ((count[1] as u32) as u64) * 4294967296 + ((count[0] as u32) as u64); + + i32x8::from_fn(|i| { + if count > 31 { + 0 + } else { + ((a[i] as u32) << count) as i32 + } + }) + } + pub fn psllq(a: i64x4, count: i64x2) -> i64x4 { + let count: u64 = count[0] as u64; + + i64x4::from_fn(|i| { + if count > 63 { + 0 + 
} else {
+                ((a[i] as u64) << count) as i64
+            }
+        })
+    }
+
+    pub fn psllvd(a: i32x4, count: i32x4) -> i32x4 {
+        i32x4::from_fn(|i| {
+            if count[i] > 31 || count[i] < 0 {
+                0
+            } else {
+                ((a[i] as u32) << count[i]) as i32
+            }
+        })
+    }
+    pub fn psllvd256(a: i32x8, count: i32x8) -> i32x8 {
+        i32x8::from_fn(|i| {
+            if count[i] > 31 || count[i] < 0 {
+                0
+            } else {
+                ((a[i] as u32) << count[i]) as i32
+            }
+        })
+    }
+
+    pub fn psllvq(a: i64x2, count: i64x2) -> i64x2 {
+        i64x2::from_fn(|i| {
+            if count[i] > 63 || count[i] < 0 {
+                0
+            } else {
+                ((a[i] as u64) << count[i]) as i64
+            }
+        })
+    }
+    pub fn psllvq256(a: i64x4, count: i64x4) -> i64x4 {
+        i64x4::from_fn(|i| {
+            if count[i] > 63 || count[i] < 0 {
+                0
+            } else {
+                ((a[i] as u64) << count[i]) as i64
+            }
+        })
+    }
+
+    pub fn psraw(a: i16x16, count: i16x8) -> i16x16 {
+        let count: u64 = ((count[3] as u16) as u64) * 281474976710656
+            + ((count[2] as u16) as u64) * 4294967296
+            + ((count[1] as u16) as u64) * 65536
+            + ((count[0] as u16) as u64);
+
+        i16x16::from_fn(|i| {
+            if count > 15 {
+                if a[i] < 0 {
+                    -1
+                } else {
+                    0
+                }
+            } else {
+                a[i] >> count
+            }
+        })
+    }
+
+    pub fn psrad(a: i32x8, count: i32x4) -> i32x8 {
+        let count: u64 = ((count[1] as u32) as u64) * 4294967296 + ((count[0] as u32) as u64);
+
+        i32x8::from_fn(|i| {
+            if count > 31 {
+                if a[i] < 0 {
+                    -1
+                } else {
+                    0
+                }
+            } else {
+                // Arithmetic shift right: fills with the sign bit.
+                a[i] >> count
+            }
+        })
+    }
+
+    pub fn psravd(a: i32x4, count: i32x4) -> i32x4 {
+        i32x4::from_fn(|i| {
+            if count[i] > 31 || count[i] < 0 {
+                if a[i] < 0 {
+                    -1
+                } else {
+                    0
+                }
+            } else {
+                a[i] >> count[i]
+            }
+        })
+    }
+
+    pub fn psravd256(a: i32x8, count: i32x8) -> i32x8 {
+        i32x8::from_fn(|i| {
+            if count[i] > 31 || count[i] < 0 {
+                if a[i] < 0 {
+                    -1
+                } else {
+                    0
+                }
+            } else {
+                a[i] >> count[i]
+            }
+        })
+    }
+
+    pub fn psrlw(a: i16x16, count: i16x8) -> i16x16 {
+        let count: u64 = (count[3] as u16 as u64) * 281474976710656
+            + (count[2] as u16 as u64) * 4294967296
+            + (count[1] as u16 as u64) * 65536
+            + (count[0] as u16 as u64);
+
+        i16x16::from_fn(|i| {
+            if count > 15 {
+                0
+            } else {
+                ((a[i] as u16) >> count) as i16
+            }
+        })
+    }
+
+    pub fn psrld(a: i32x8, count: i32x4) -> i32x8 {
+        let count: u64 = (count[1] as u32 as u64) * 4294967296 + (count[0] as u32 as u64);
+
+        i32x8::from_fn(|i| {
+            if count > 31 {
+                0
+            } else {
+                ((a[i] as u32) >> count) as i32
+            }
+        })
+    }
+
+    pub fn psrlq(a: i64x4, count: i64x2) -> i64x4 {
+        let count: u64 = count[0] as u64;
+
+        i64x4::from_fn(|i| {
+            if count > 63 {
+                0
+            } else {
+                ((a[i] as u64) >> count) as i64
+            }
+        })
+    }
+
+    pub fn psrlvd(a: i32x4, count: i32x4) -> i32x4 {
+        i32x4::from_fn(|i| {
+            if count[i] > 31 || count[i] < 0 {
+                0
+            } else {
+                ((a[i] as u32) >> count[i]) as i32
+            }
+        })
+    }
+    pub fn psrlvd256(a: i32x8, count: i32x8) -> i32x8 {
+        i32x8::from_fn(|i| {
+            if count[i] > 31 || count[i] < 0 {
+                0
+            } else {
+                ((a[i] as u32) >> count[i]) as i32
+            }
+        })
+    }
+
+    pub fn psrlvq(a: i64x2, count: i64x2) -> i64x2 {
+        i64x2::from_fn(|i| {
+            if count[i] > 63 || count[i] < 0 {
+                0
+            } else {
+                ((a[i] as u64) >> count[i]) as i64
+            }
+        })
+    }
+    pub fn psrlvq256(a: i64x4, count: i64x4) -> i64x4 {
+        i64x4::from_fn(|i| {
+            if count[i] > 63 || count[i] < 0 {
+                0
+            } else {
+                ((a[i] as u64) >> count[i]) as i64
+            }
+        })
+    }
+
+    pub fn pshufb(a: u8x32, b: u8x32) -> u8x32 {
+        u8x32::from_fn(|i| {
+            if i < 16 {
+                if b[i] > 127 {
+                    0
+                } else {
+                    let index: u64 = (b[i] % 16) as u64;
+                    a[index]
+                }
+            } else {
+                if b[i] > 127 {
+                    0
+                } else {
+                    let index: u64 = (b[i] % 16) as u64;
+                    a[index +
16] + } + } + }) + } + + pub fn permd(a: u32x8, b: u32x8) -> u32x8 { + u32x8::from_fn(|i| { + let id = b[i] % 8; + a[id as u64] + }) + } + + pub fn mpsadbw(a: u8x32, b: u8x32, imm8: i32) -> u16x16 { + u16x16::from_fn(|i| { + if i < 8 { + let a_offset = (((imm8 & 4) >> 2) * 4) as u32 as u64; + let b_offset = ((imm8 & 3) * 4) as u32 as u64; + let k = a_offset + i; + let l = b_offset; + ((a[k].absolute_diff(b[l]) as i8) as u8 as u16) + + ((a[k + 1].absolute_diff(b[l + 1]) as i8) as u8 as u16) + + ((a[k + 2].absolute_diff(b[l + 2]) as i8) as u8 as u16) + + ((a[k + 3].absolute_diff(b[l + 3]) as i8) as u8 as u16) + } else { + let i = i - 8; + let imm8 = imm8 >> 3; + let a_offset = (((imm8 & 4) >> 2) * 4) as u32 as u64; + let b_offset = ((imm8 & 3) * 4) as u32 as u64; + let k = a_offset + i; + let l = b_offset; + ((a[16 + k].absolute_diff(b[16 + l]) as i8) as u8 as u16) + + ((a[16 + k + 1].absolute_diff(b[16 + l + 1]) as i8) as u8 as u16) + + ((a[16 + k + 2].absolute_diff(b[16 + l + 2]) as i8) as u8 as u16) + + ((a[16 + k + 3].absolute_diff(b[16 + l + 3]) as i8) as u8 as u16) + } + }) + } + + pub fn vperm2i128(a: i64x4, b: i64x4, imm8: i8) -> i64x4 { + let a = i128x2::from_fn(|i| { + ((a[2 * i] as u64 as u128) + ((a[2 * i + 1] as u64 as u128) << 64)) as i128 + }); + let b = i128x2::from_fn(|i| { + ((b[2 * i] as u64 as u128) + ((b[2 * i + 1] as u64 as u128) << 64)) as i128 + }); + let imm8 = imm8 as u8 as u32 as i32; + let r = i128x2::from_fn(|i| { + let control = imm8 >> (i * 4); + if (control >> 3) % 2 == 1 { + 0 + } else { + match control % 4 { + 0 => a[0], + 1 => a[1], + 2 => b[0], + 3 => b[1], + _ => unreachable!(), + } + } + }); + i64x4::from_fn(|i| { + let index = i >> 1; + let hilo = i.rem_euclid(2); + let val = r[index]; + if hilo == 0 { + i64::cast(val) + } else { + i64::cast(val >> 64) + } + }) + } + pub fn pmulhrsw(a: i16x16, b: i16x16) -> i16x16 { + i16x16::from_fn(|i| { + let temp = (a[i] as i32) * (b[i] as i32); + let temp = (temp >> 14).wrapping_add(1) >> 1; + temp as i16 + }) + } + + pub fn psadbw(a: u8x32, b: u8x32) -> u64x4 { + let tmp = u8x32::from_fn(|i| a[i].absolute_diff(b[i])); + u64x4::from_fn(|i| { + (tmp[i * 8] as u16) + .wrapping_add(tmp[i * 8 + 1] as u16) + .wrapping_add(tmp[i * 8 + 2] as u16) + .wrapping_add(tmp[i * 8 + 3] as u16) + .wrapping_add(tmp[i * 8 + 4] as u16) + .wrapping_add(tmp[i * 8 + 5] as u16) + .wrapping_add(tmp[i * 8 + 6] as u16) + .wrapping_add(tmp[i * 8 + 7] as u16) as u64 + }) + } +} +use c_extern::*; + +use super::avx::*; +use super::types::*; +use crate::abstractions::simd::*; +/// Computes the absolute values of packed 32-bit integers in `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_abs_epi32) + +pub fn _mm256_abs_epi32(a: __m256i) -> __m256i { + let a = BitVec::to_i32x8(a); + let r = simd_select(simd_lt(a, i32x8::from_fn(|_| 0)), simd_neg(a), a); + BitVec::from_i32x8(r) +} + +/// Computes the absolute values of packed 16-bit integers in `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_abs_epi16) + +pub fn _mm256_abs_epi16(a: __m256i) -> __m256i { + let a = BitVec::to_i16x16(a); + let r = simd_select(simd_lt(a, i16x16::from_fn(|_| 0)), simd_neg(a), a); + BitVec::from_i16x16(r) +} + +/// Computes the absolute values of packed 8-bit integers in `a`. 
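+///
+/// Note that, as with the scalar operation, the absolute value of the most
+/// negative lane wraps around (a quick plain-Rust check of that corner case):
+///
+/// ```
+/// assert_eq!((-128i8).wrapping_abs(), -128);
+/// ```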
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_abs_epi8)
+
+pub fn _mm256_abs_epi8(a: __m256i) -> __m256i {
+    let a = BitVec::to_i8x32(a);
+    let r = simd_select(simd_lt(a, i8x32::from_fn(|_| 0)), simd_neg(a), a);
+    BitVec::from_i8x32(r)
+}
+
+/// Adds packed 64-bit integers in `a` and `b`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_epi64)
+
+pub fn _mm256_add_epi64(a: __m256i, b: __m256i) -> __m256i {
+    BitVec::from_i64x4(simd_add(BitVec::to_i64x4(a), BitVec::to_i64x4(b)))
+}
+
+/// Adds packed 32-bit integers in `a` and `b`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_epi32)
+
+pub fn _mm256_add_epi32(a: __m256i, b: __m256i) -> __m256i {
+    BitVec::from_i32x8(simd_add(BitVec::to_i32x8(a), BitVec::to_i32x8(b)))
+}
+
+/// Adds packed 16-bit integers in `a` and `b`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_epi16)
+
+pub fn _mm256_add_epi16(a: __m256i, b: __m256i) -> __m256i {
+    BitVec::from_i16x16(simd_add(BitVec::to_i16x16(a), BitVec::to_i16x16(b)))
+}
+
+/// Adds packed 8-bit integers in `a` and `b`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_epi8)
+
+pub fn _mm256_add_epi8(a: __m256i, b: __m256i) -> __m256i {
+    BitVec::from_i8x32(simd_add(BitVec::to_i8x32(a), BitVec::to_i8x32(b)))
+}
+
+/// Adds packed 8-bit integers in `a` and `b` using saturation.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_adds_epi8)
+
+pub fn _mm256_adds_epi8(a: __m256i, b: __m256i) -> __m256i {
+    BitVec::from_i8x32(simd_saturating_add(
+        BitVec::to_i8x32(a),
+        BitVec::to_i8x32(b),
+    ))
+}
+
+/// Adds packed 16-bit integers in `a` and `b` using saturation.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_adds_epi16)
+
+pub fn _mm256_adds_epi16(a: __m256i, b: __m256i) -> __m256i {
+    BitVec::from_i16x16(simd_saturating_add(
+        BitVec::to_i16x16(a),
+        BitVec::to_i16x16(b),
+    ))
+}
+
+/// Adds packed unsigned 8-bit integers in `a` and `b` using saturation.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_adds_epu8)
+
+pub fn _mm256_adds_epu8(a: __m256i, b: __m256i) -> __m256i {
+    simd_saturating_add(BitVec::to_u8x32(a), BitVec::to_u8x32(b)).into()
+}
+
+/// Adds packed unsigned 16-bit integers in `a` and `b` using saturation.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_adds_epu16)
+
+pub fn _mm256_adds_epu16(a: __m256i, b: __m256i) -> __m256i {
+    simd_saturating_add(BitVec::to_u16x16(a), BitVec::to_u16x16(b)).into()
+}
+
+/// Concatenates pairs of 16-byte blocks in `a` and `b` into a 32-byte temporary
+/// result, shifts the result right by `IMM8` bytes, and returns the low 16 bytes.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_alignr_epi8)
+
+pub fn _mm256_alignr_epi8<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
+    // If palignr is shifting the pair of vectors more than the size of two
+    // lanes, emit zero.
+ if IMM8 >= 32 { + return _mm256_setzero_si256(); + } + // If palignr is shifting the pair of input vectors more than one lane, + // but less than two lanes, convert to shifting in zeroes. + let (a, b) = if IMM8 > 16 { + (_mm256_setzero_si256(), a) + } else { + (a, b) + }; + + let a = BitVec::to_i8x32(a); + let b = BitVec::to_i8x32(b); + + if IMM8 == 16 { + return a.into(); + } + + let r: i8x32 = match IMM8 % 16 { + 0 => simd_shuffle( + b, + a, + [ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, + 23, 24, 25, 26, 27, 28, 29, 30, 31, + ], + ), + 1 => simd_shuffle( + b, + a, + [ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, 48, + ], + ), + 2 => simd_shuffle( + b, + a, + [ + 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 48, 49, + ], + ), + 3 => simd_shuffle( + b, + a, + [ + 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 48, 49, 50, + ], + ), + 4 => simd_shuffle( + b, + a, + [ + 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 20, 21, 22, 23, 24, 25, + 26, 27, 28, 29, 30, 31, 48, 49, 50, 51, + ], + ), + 5 => simd_shuffle( + b, + a, + [ + 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 21, 22, 23, 24, 25, 26, + 27, 28, 29, 30, 31, 48, 49, 50, 51, 52, + ], + ), + 6 => simd_shuffle( + b, + a, + [ + 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 22, 23, 24, 25, 26, 27, + 28, 29, 30, 31, 48, 49, 50, 51, 52, 53, + ], + ), + 7 => simd_shuffle( + b, + a, + [ + 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 23, 24, 25, 26, 27, + 28, 29, 30, 31, 48, 49, 50, 51, 52, 53, 54, + ], + ), + 8 => simd_shuffle( + b, + a, + [ + 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 24, 25, 26, 27, 28, + 29, 30, 31, 48, 49, 50, 51, 52, 53, 54, 55, + ], + ), + 9 => simd_shuffle( + b, + a, + [ + 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 25, 26, 27, 28, 29, + 30, 31, 48, 49, 50, 51, 52, 53, 54, 55, 56, + ], + ), + 10 => simd_shuffle( + b, + a, + [ + 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 26, 27, 28, 29, 30, + 31, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, + ], + ), + 11 => simd_shuffle( + b, + a, + [ + 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 27, 28, 29, 30, 31, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, + ], + ), + 12 => simd_shuffle( + b, + a, + [ + 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 28, 29, 30, 31, 48, + 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, + ], + ), + 13 => simd_shuffle( + b, + a, + [ + 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 29, 30, 31, 48, 49, + 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, + ], + ), + 14 => simd_shuffle( + b, + a, + [ + 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 30, 31, 48, 49, 50, + 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, + ], + ), + 15 => simd_shuffle( + b, + a, + [ + 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 31, 48, 49, 50, 51, + 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, + ], + ), + _ => unreachable!(), + }; + r.into() +} + +/// Computes the bitwise AND of 256 bits (representing integer data) +/// in `a` and `b`. 
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_and_si256)
+
+pub fn _mm256_and_si256(a: __m256i, b: __m256i) -> __m256i {
+    simd_and(BitVec::to_i64x4(a), BitVec::to_i64x4(b)).into()
+}
+
+/// Computes the bitwise NOT of 256 bits (representing integer data)
+/// in `a` and then AND with `b`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_andnot_si256)
+
+pub fn _mm256_andnot_si256(a: __m256i, b: __m256i) -> __m256i {
+    let all_ones = _mm256_set1_epi8(-1);
+    simd_and(
+        simd_xor(BitVec::to_i64x4(a), BitVec::to_i64x4(all_ones)),
+        BitVec::to_i64x4(b),
+    )
+    .into()
+}
+
+/// Averages packed unsigned 16-bit integers in `a` and `b`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_avg_epu16)
+
+pub fn _mm256_avg_epu16(a: __m256i, b: __m256i) -> __m256i {
+    let a = simd_cast::<16, _, u32>(BitVec::to_u16x16(a));
+    let b = simd_cast::<16, _, u32>(BitVec::to_u16x16(b));
+    let r = simd_shr(simd_add(simd_add(a, b), u32x16::splat(1)), u32x16::splat(1));
+    simd_cast::<16, _, u16>(r).into()
+}
+
+/// Averages packed unsigned 8-bit integers in `a` and `b`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_avg_epu8)
+
+pub fn _mm256_avg_epu8(a: __m256i, b: __m256i) -> __m256i {
+    let a = simd_cast::<32, _, u16>(BitVec::to_u8x32(a));
+    let b = simd_cast::<32, _, u16>(BitVec::to_u8x32(b));
+    let r = simd_shr(simd_add(simd_add(a, b), u16x32::splat(1)), u16x32::splat(1));
+    simd_cast::<32, _, u8>(r).into()
+}
+
+/// Blends packed 32-bit integers from `a` and `b` using control mask `IMM4`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_epi32)
+
+pub fn _mm_blend_epi32<const IMM4: i32>(a: __m128i, b: __m128i) -> __m128i {
+    let a = BitVec::to_i32x4(a);
+    let b = BitVec::to_i32x4(b);
+    let r: i32x4 = simd_shuffle(
+        a,
+        b,
+        [
+            [0, 4, 0, 4][IMM4 as usize & 0b11],
+            [1, 1, 5, 5][IMM4 as usize & 0b11],
+            [2, 6, 2, 6][(IMM4 as usize >> 2) & 0b11],
+            [3, 3, 7, 7][(IMM4 as usize >> 2) & 0b11],
+        ],
+    );
+    r.into()
+}
+
+/// Blends packed 32-bit integers from `a` and `b` using control mask `IMM8`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blend_epi32)
+
+pub fn _mm256_blend_epi32<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
+    let a = BitVec::to_i32x8(a);
+    let b = BitVec::to_i32x8(b);
+    let r: i32x8 = simd_shuffle(
+        a,
+        b,
+        [
+            [0, 8, 0, 8][IMM8 as usize & 0b11],
+            [1, 1, 9, 9][IMM8 as usize & 0b11],
+            [2, 10, 2, 10][(IMM8 as usize >> 2) & 0b11],
+            [3, 3, 11, 11][(IMM8 as usize >> 2) & 0b11],
+            [4, 12, 4, 12][(IMM8 as usize >> 4) & 0b11],
+            [5, 5, 13, 13][(IMM8 as usize >> 4) & 0b11],
+            [6, 14, 6, 14][(IMM8 as usize >> 6) & 0b11],
+            [7, 7, 15, 15][(IMM8 as usize >> 6) & 0b11],
+        ],
+    );
+    r.into()
+}
+
+/// Blends packed 16-bit integers from `a` and `b` using control mask `IMM8`.
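+///
+/// Bit `i` of `IMM8` picks element `i` from `b` (set) or `a` (clear); the
+/// 8-bit mask is reused for the upper 128-bit lane. A scalar sketch of one
+/// lane (`blend16_lane` is illustrative, not part of the model):
+///
+/// ```
+/// fn blend16_lane(a: i16, b: i16, imm8: u8, i: usize) -> i16 {
+///     if (imm8 >> (i % 8)) & 1 == 1 { b } else { a }
+/// }
+/// assert_eq!(blend16_lane(1, 2, 0b0000_0010, 1), 2); // bit 1 set: take b
+/// assert_eq!(blend16_lane(1, 2, 0b0000_0010, 0), 1); // bit 0 clear: take a
+/// ```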
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blend_epi16)
+pub fn _mm256_blend_epi16<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
+    let a = BitVec::to_i16x16(a);
+    let b = BitVec::to_i16x16(b);
+
+    let r: i16x16 = simd_shuffle(
+        a,
+        b,
+        [
+            [0, 16, 0, 16][IMM8 as usize & 0b11],
+            [1, 1, 17, 17][IMM8 as usize & 0b11],
+            [2, 18, 2, 18][(IMM8 as usize >> 2) & 0b11],
+            [3, 3, 19, 19][(IMM8 as usize >> 2) & 0b11],
+            [4, 20, 4, 20][(IMM8 as usize >> 4) & 0b11],
+            [5, 5, 21, 21][(IMM8 as usize >> 4) & 0b11],
+            [6, 22, 6, 22][(IMM8 as usize >> 6) & 0b11],
+            [7, 7, 23, 23][(IMM8 as usize >> 6) & 0b11],
+            [8, 24, 8, 24][IMM8 as usize & 0b11],
+            [9, 9, 25, 25][IMM8 as usize & 0b11],
+            [10, 26, 10, 26][(IMM8 as usize >> 2) & 0b11],
+            [11, 11, 27, 27][(IMM8 as usize >> 2) & 0b11],
+            [12, 28, 12, 28][(IMM8 as usize >> 4) & 0b11],
+            [13, 13, 29, 29][(IMM8 as usize >> 4) & 0b11],
+            [14, 30, 14, 30][(IMM8 as usize >> 6) & 0b11],
+            [15, 15, 31, 31][(IMM8 as usize >> 6) & 0b11],
+        ],
+    );
+    r.into()
+}
+
+/// Blends packed 8-bit integers from `a` and `b` using `mask`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blendv_epi8)
+pub fn _mm256_blendv_epi8(a: __m256i, b: __m256i, mask: __m256i) -> __m256i {
+    let mask: i8x32 = simd_lt(BitVec::to_i8x32(mask), i8x32::from_fn(|_| 0));
+    simd_select(mask, BitVec::to_i8x32(b), BitVec::to_i8x32(a)).into()
+}
+
+/// Broadcasts the low packed 8-bit integer from `a` to all elements of
+/// the 128-bit returned value.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastb_epi8)
+pub fn _mm_broadcastb_epi8(a: __m128i) -> __m128i {
+    let ret = simd_shuffle(BitVec::to_i8x16(a), i8x16::from_fn(|_| 0), [0_u64; 16]);
+    ret.into()
+}
+
+/// Broadcasts the low packed 8-bit integer from `a` to all elements of
+/// the 256-bit returned value.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastb_epi8)
+pub fn _mm256_broadcastb_epi8(a: __m128i) -> __m256i {
+    let ret = simd_shuffle(BitVec::to_i8x16(a), i8x16::from_fn(|_| 0), [0_u64; 32]);
+    ret.into()
+}
+
+// N.B., `simd_shuffle4` with integer data types for `a` and `b` is
+// often compiled to `vbroadcastss`.
+/// Broadcasts the low packed 32-bit integer from `a` to all elements of
+/// the 128-bit returned value.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastd_epi32)
+
+pub fn _mm_broadcastd_epi32(a: __m128i) -> __m128i {
+    let ret = simd_shuffle(BitVec::to_i32x4(a), i32x4::from_fn(|_| 0), [0_u64; 4]);
+    ret.into()
+}
+
+// N.B., `simd_shuffle4` with integer data types for `a` and `b` is
+// often compiled to `vbroadcastss`.
+/// Broadcasts the low packed 32-bit integer from `a` to all elements of
+/// the 256-bit returned value.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastd_epi32)
+
+pub fn _mm256_broadcastd_epi32(a: __m128i) -> __m256i {
+    let ret = simd_shuffle(BitVec::to_i32x4(a), i32x4::from_fn(|_| 0), [0_u64; 8]);
+    ret.into()
+}
+
+/// Broadcasts the low packed 64-bit integer from `a` to all elements of
+/// the 128-bit returned value.
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastq_epi64) + +// Emits `vmovddup` instead of `vpbroadcastq` +// See https://github.com/rust-lang/stdarch/issues/791 + +pub fn _mm_broadcastq_epi64(a: __m128i) -> __m128i { + let ret = simd_shuffle(BitVec::to_i64x2(a), BitVec::to_i64x2(a), [0_u64; 2]); + ret.into() +} + +/// Broadcasts the low packed 64-bit integer from `a` to all elements of +/// the 256-bit returned value. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastq_epi64) + +pub fn _mm256_broadcastq_epi64(a: __m128i) -> __m256i { + let ret = simd_shuffle(BitVec::to_i64x2(a), BitVec::to_i64x2(a), [0_u64; 4]); + ret.into() +} + +/// Broadcasts 128 bits of integer data from a to all 128-bit lanes in +/// the 256-bit returned value. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastsi128_si256) + +pub fn _mm_broadcastsi128_si256(a: __m128i) -> __m256i { + let ret = simd_shuffle(BitVec::to_i64x2(a), i64x2::from_fn(|_| 0), [0, 1, 0, 1]); + ret.into() +} + +// N.B., `broadcastsi128_si256` is often compiled to `vinsertf128` or +// `vbroadcastf128`. +/// Broadcasts 128 bits of integer data from a to all 128-bit lanes in +/// the 256-bit returned value. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastsi128_si256) + +pub fn _mm256_broadcastsi128_si256(a: __m128i) -> __m256i { + let ret = simd_shuffle(BitVec::to_i64x2(a), i64x2::from_fn(|_| 0), [0, 1, 0, 1]); + ret.into() +} + +/// Broadcasts the low packed 16-bit integer from a to all elements of +/// the 128-bit returned value +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastw_epi16) + +pub fn _mm_broadcastw_epi16(a: __m128i) -> __m128i { + let ret = simd_shuffle(BitVec::to_i16x8(a), i16x8::from_fn(|_| 0), [0_u64; 8]); + ret.into() +} + +/// Broadcasts the low packed 16-bit integer from a to all elements of +/// the 256-bit returned value +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastw_epi16) + +pub fn _mm256_broadcastw_epi16(a: __m128i) -> __m256i { + let ret = simd_shuffle(BitVec::to_i16x8(a), i16x8::from_fn(|_| 0), [0_u64; 16]); + ret.into() +} + +/// Compares packed 64-bit integers in `a` and `b` for equality. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpeq_epi64) + +pub fn _mm256_cmpeq_epi64(a: __m256i, b: __m256i) -> __m256i { + simd_eq(BitVec::to_i64x4(a), BitVec::to_i64x4(b)).into() +} + +/// Compares packed 32-bit integers in `a` and `b` for equality. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpeq_epi32) + +pub fn _mm256_cmpeq_epi32(a: __m256i, b: __m256i) -> __m256i { + simd_eq(BitVec::to_i32x8(a), BitVec::to_i32x8(b)).into() +} + +/// Compares packed 16-bit integers in `a` and `b` for equality. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpeq_epi16) + +pub fn _mm256_cmpeq_epi16(a: __m256i, b: __m256i) -> __m256i { + simd_eq(BitVec::to_i16x16(a), BitVec::to_i16x16(b)).into() +} + +/// Compares packed 8-bit integers in `a` and `b` for equality. 
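+///
+/// Each output lane is an all-ones (`-1`) or all-zeros (`0`) mask rather
+/// than a boolean; a one-lane sketch of that convention:
+///
+/// ```
+/// let lane = |x: i8, y: i8| -> i8 { if x == y { -1 } else { 0 } };
+/// assert_eq!(lane(7, 7), -1);
+/// assert_eq!(lane(7, 8), 0);
+/// ```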
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpeq_epi8) + +pub fn _mm256_cmpeq_epi8(a: __m256i, b: __m256i) -> __m256i { + simd_eq(BitVec::to_i8x32(a), BitVec::to_i8x32(b)).into() +} + +/// Compares packed 64-bit integers in `a` and `b` for greater-than. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpgt_epi64) + +pub fn _mm256_cmpgt_epi64(a: __m256i, b: __m256i) -> __m256i { + simd_gt(BitVec::to_i64x4(a), BitVec::to_i64x4(b)).into() +} + +/// Compares packed 32-bit integers in `a` and `b` for greater-than. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpgt_epi32) + +pub fn _mm256_cmpgt_epi32(a: __m256i, b: __m256i) -> __m256i { + simd_gt(BitVec::to_i32x8(a), BitVec::to_i32x8(b)).into() +} + +/// Compares packed 16-bit integers in `a` and `b` for greater-than. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpgt_epi16) + +pub fn _mm256_cmpgt_epi16(a: __m256i, b: __m256i) -> __m256i { + simd_gt(BitVec::to_i16x16(a), BitVec::to_i16x16(b)).into() +} + +/// Compares packed 8-bit integers in `a` and `b` for greater-than. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpgt_epi8) + +pub fn _mm256_cmpgt_epi8(a: __m256i, b: __m256i) -> __m256i { + simd_gt(BitVec::to_i8x32(a), BitVec::to_i8x32(b)).into() +} + +/// Sign-extend 16-bit integers to 32-bit integers. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi16_epi32) + +pub fn _mm256_cvtepi16_epi32(a: __m128i) -> __m256i { + simd_cast::<8, _, i32>(BitVec::to_i16x8(a)).into() +} + +/// Sign-extend 16-bit integers to 64-bit integers. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi16_epi64) + +pub fn _mm256_cvtepi16_epi64(a: __m128i) -> __m256i { + let a = BitVec::to_i16x8(a); + let v64: i16x4 = simd_shuffle(a, a, [0, 1, 2, 3]); + simd_cast::<4, i16, i64>(v64).into() +} + +/// Sign-extend 32-bit integers to 64-bit integers. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi32_epi64) + +pub fn _mm256_cvtepi32_epi64(a: __m128i) -> __m256i { + simd_cast::<4, i32, i64>(BitVec::to_i32x4(a)).into() +} + +/// Sign-extend 8-bit integers to 16-bit integers. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi8_epi16) + +pub fn _mm256_cvtepi8_epi16(a: __m128i) -> __m256i { + simd_cast::<16, i8, i16>(BitVec::to_i8x16(a)).into() +} + +/// Sign-extend 8-bit integers to 32-bit integers. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi8_epi32) + +pub fn _mm256_cvtepi8_epi32(a: __m128i) -> __m256i { + let a = BitVec::to_i8x16(a); + let v64: i8x8 = simd_shuffle(a, a, [0, 1, 2, 3, 4, 5, 6, 7]); + simd_cast::<8, i8, i32>(v64).into() +} + +/// Sign-extend 8-bit integers to 64-bit integers. 
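+///
+/// Sign extension preserves the numeric value of each lane, so negative
+/// bytes stay negative (scalar illustration):
+///
+/// ```
+/// assert_eq!(-5i8 as i64, -5i64);
+/// assert_eq!(0x80u8 as i8 as i64, -128i64);
+/// ```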
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi8_epi64)
+pub fn _mm256_cvtepi8_epi64(a: __m128i) -> __m256i {
+    let a = BitVec::to_i8x16(a);
+    let v32: i8x4 = simd_shuffle(a, a, [0, 1, 2, 3]);
+    simd_cast::<4, i8, i64>(v32).into()
+}
+
+/// Zero-extends packed unsigned 16-bit integers in `a` to packed 32-bit
+/// integers, and stores the results in `dst`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu16_epi32)
+
+pub fn _mm256_cvtepu16_epi32(a: __m128i) -> __m256i {
+    simd_cast::<8, u16, u32>(BitVec::to_u16x8(a)).into()
+}
+
+/// Zero-extend the lower four unsigned 16-bit integers in `a` to 64-bit
+/// integers. The upper four elements of `a` are unused.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu16_epi64)
+
+pub fn _mm256_cvtepu16_epi64(a: __m128i) -> __m256i {
+    let a = BitVec::to_u16x8(a);
+    let v64: u16x4 = simd_shuffle(a, a, [0, 1, 2, 3]);
+    simd_cast::<4, u16, u64>(v64).into()
+}
+
+/// Zero-extend unsigned 32-bit integers in `a` to 64-bit integers.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu32_epi64)
+
+pub fn _mm256_cvtepu32_epi64(a: __m128i) -> __m256i {
+    simd_cast::<4, u32, u64>(BitVec::to_u32x4(a)).into()
+}
+
+/// Zero-extend unsigned 8-bit integers in `a` to 16-bit integers.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu8_epi16)
+
+pub fn _mm256_cvtepu8_epi16(a: __m128i) -> __m256i {
+    simd_cast::<16, u8, u16>(BitVec::to_u8x16(a)).into()
+}
+
+/// Zero-extend the lower eight unsigned 8-bit integers in `a` to 32-bit
+/// integers. The upper eight elements of `a` are unused.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu8_epi32)
+
+pub fn _mm256_cvtepu8_epi32(a: __m128i) -> __m256i {
+    let a = BitVec::to_u8x16(a);
+    let v64: u8x8 = simd_shuffle(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
+    simd_cast::<8, u8, u32>(v64).into()
+}
+
+/// Zero-extend the lower four unsigned 8-bit integers in `a` to 64-bit
+/// integers. The upper twelve elements of `a` are unused.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu8_epi64)
+
+pub fn _mm256_cvtepu8_epi64(a: __m128i) -> __m256i {
+    let a = BitVec::to_u8x16(a);
+    let v32: u8x4 = simd_shuffle(a, a, [0, 1, 2, 3]);
+    simd_cast::<4, u8, u64>(v32).into()
+}
+
+/// Extracts 128 bits (of integer data) from `a` selected with `IMM1`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extracti128_si256)
+
+pub fn _mm256_extracti128_si256<const IMM1: i32>(a: __m256i) -> __m128i {
+    let a = BitVec::to_i64x4(a);
+    let b = i64x4::from_fn(|_| 0);
+    let dst: i64x2 = simd_shuffle(a, b, [[0, 1], [2, 3]][IMM1 as usize]);
+    dst.into()
+}
+
+/// Horizontally adds adjacent pairs of 16-bit integers in `a` and `b`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hadd_epi16)
+
+pub fn _mm256_hadd_epi16(a: __m256i, b: __m256i) -> __m256i {
+    phaddw(BitVec::to_i16x16(a), BitVec::to_i16x16(b)).into()
+}
+
+/// Horizontally adds adjacent pairs of 32-bit integers in `a` and `b`.
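+///
+/// Within each 128-bit lane the output order is `a` pairs first, then `b`
+/// pairs, which an array sketch of one lane makes concrete:
+///
+/// ```
+/// let (a, b) = ([1i32, 2, 3, 4], [10i32, 20, 30, 40]);
+/// let lane = [a[0] + a[1], a[2] + a[3], b[0] + b[1], b[2] + b[3]];
+/// assert_eq!(lane, [3, 7, 30, 70]);
+/// ```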
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hadd_epi32)
+
+pub fn _mm256_hadd_epi32(a: __m256i, b: __m256i) -> __m256i {
+    phaddd(BitVec::to_i32x8(a), BitVec::to_i32x8(b)).into()
+}
+
+/// Horizontally adds adjacent pairs of 16-bit integers in `a` and `b`
+/// using saturation.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hadds_epi16)
+
+pub fn _mm256_hadds_epi16(a: __m256i, b: __m256i) -> __m256i {
+    phaddsw(BitVec::to_i16x16(a), BitVec::to_i16x16(b)).into()
+}
+
+/// Horizontally subtract adjacent pairs of 16-bit integers in `a` and `b`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hsub_epi16)
+
+pub fn _mm256_hsub_epi16(a: __m256i, b: __m256i) -> __m256i {
+    phsubw(BitVec::to_i16x16(a), BitVec::to_i16x16(b)).into()
+}
+
+/// Horizontally subtract adjacent pairs of 32-bit integers in `a` and `b`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hsub_epi32)
+
+pub fn _mm256_hsub_epi32(a: __m256i, b: __m256i) -> __m256i {
+    phsubd(BitVec::to_i32x8(a), BitVec::to_i32x8(b)).into()
+}
+
+/// Horizontally subtract adjacent pairs of 16-bit integers in `a` and `b`
+/// using saturation.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hsubs_epi16)
+
+pub fn _mm256_hsubs_epi16(a: __m256i, b: __m256i) -> __m256i {
+    phsubsw(BitVec::to_i16x16(a), BitVec::to_i16x16(b)).into()
+}
+
+/// Copies `a` to `dst`, then inserts 128 bits (of integer data) from `b` at the
+/// location specified by `IMM1`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_inserti128_si256)
+
+pub fn _mm256_inserti128_si256<const IMM1: i32>(a: __m256i, b: __m128i) -> __m256i {
+    let a = BitVec::to_i64x4(a);
+    let b = BitVec::to_i64x4(_mm256_castsi128_si256(b));
+    let dst: i64x4 = simd_shuffle(a, b, [[4, 5, 2, 3], [0, 1, 4, 5]][IMM1 as usize]);
+    dst.into()
+}
+
+/// Multiplies packed signed 16-bit integers in `a` and `b`, producing
+/// intermediate signed 32-bit integers. Horizontally add adjacent pairs
+/// of intermediate 32-bit integers.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_madd_epi16)
+
+pub fn _mm256_madd_epi16(a: __m256i, b: __m256i) -> __m256i {
+    pmaddwd(BitVec::to_i16x16(a), BitVec::to_i16x16(b)).into()
+}
+
+/// Vertically multiplies each unsigned 8-bit integer from `a` with the
+/// corresponding signed 8-bit integer from `b`, producing intermediate
+/// signed 16-bit integers. Horizontally add adjacent pairs of intermediate
+/// signed 16-bit integers.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maddubs_epi16)
+
+pub fn _mm256_maddubs_epi16(a: __m256i, b: __m256i) -> __m256i {
+    pmaddubsw(BitVec::to_u8x32(a), BitVec::to_u8x32(b)).into()
+}
+
+/// Compares packed 16-bit integers in `a` and `b`, and returns the packed
+/// maximum values.
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epi16) + +pub fn _mm256_max_epi16(a: __m256i, b: __m256i) -> __m256i { + let a = BitVec::to_i16x16(a); + let b = BitVec::to_i16x16(b); + simd_select::<16, i16, _>(simd_gt(a, b), a, b).into() +} + +/// Compares packed 32-bit integers in `a` and `b`, and returns the packed +/// maximum values. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epi32) + +pub fn _mm256_max_epi32(a: __m256i, b: __m256i) -> __m256i { + let a = BitVec::to_i32x8(a); + let b = BitVec::to_i32x8(b); + simd_select::<8, i32, _>(simd_gt(a, b), a, b).into() +} + +/// Compares packed 8-bit integers in `a` and `b`, and returns the packed +/// maximum values. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epi8) + +pub fn _mm256_max_epi8(a: __m256i, b: __m256i) -> __m256i { + let a = BitVec::to_i8x32(a); + let b = BitVec::to_i8x32(b); + simd_select::<32, i8, _>(simd_gt(a, b), a, b).into() +} + +/// Compares packed unsigned 16-bit integers in `a` and `b`, and returns +/// the packed maximum values. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epu16) + +pub fn _mm256_max_epu16(a: __m256i, b: __m256i) -> __m256i { + let a = BitVec::to_u16x16(a); + let b = BitVec::to_u16x16(b); + simd_select::<16, _, u16>(simd_gt(a, b), a, b).into() +} + +/// Compares packed unsigned 32-bit integers in `a` and `b`, and returns +/// the packed maximum values. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epu32) + +pub fn _mm256_max_epu32(a: __m256i, b: __m256i) -> __m256i { + let a = BitVec::to_u32x8(a); + let b = BitVec::to_u32x8(b); + simd_select::<8, _, u32>(simd_gt(a, b), a, b).into() +} + +/// Compares packed unsigned 8-bit integers in `a` and `b`, and returns +/// the packed maximum values. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epu8) + +pub fn _mm256_max_epu8(a: __m256i, b: __m256i) -> __m256i { + let a = BitVec::to_u8x32(a); + let b = BitVec::to_u8x32(b); + simd_select::<32, _, u8>(simd_gt(a, b), a, b).into() +} + +/// Compares packed 16-bit integers in `a` and `b`, and returns the packed +/// minimum values. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epi16) + +pub fn _mm256_min_epi16(a: __m256i, b: __m256i) -> __m256i { + let a = BitVec::to_i16x16(a); + let b = BitVec::to_i16x16(b); + simd_select::<16, _, i16>(simd_lt(a, b), a, b).into() +} + +/// Compares packed 32-bit integers in `a` and `b`, and returns the packed +/// minimum values. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epi32) + +pub fn _mm256_min_epi32(a: __m256i, b: __m256i) -> __m256i { + let a = BitVec::to_i32x8(a); + let b = BitVec::to_i32x8(b); + simd_select::<8, i32, _>(simd_lt(a, b), a, b).into() +} + +/// Compares packed 8-bit integers in `a` and `b`, and returns the packed +/// minimum values. 
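+///
+/// The comparison is signed; the same bit pattern can order differently when
+/// viewed as unsigned (compare `_mm256_min_epu8`):
+///
+/// ```
+/// assert_eq!(core::cmp::min(-1i8, 1i8), -1); // signed: 0xff is -1
+/// assert_eq!(core::cmp::min(0xffu8, 1u8), 1); // unsigned: 0xff is 255
+/// ```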
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epi8)
+
+pub fn _mm256_min_epi8(a: __m256i, b: __m256i) -> __m256i {
+    let a = BitVec::to_i8x32(a);
+    let b = BitVec::to_i8x32(b);
+    simd_select::<32, i8, _>(simd_lt(a, b), a, b).into()
+}
+
+/// Compares packed unsigned 16-bit integers in `a` and `b`, and returns
+/// the packed minimum values.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epu16)
+
+pub fn _mm256_min_epu16(a: __m256i, b: __m256i) -> __m256i {
+    let a = BitVec::to_u16x16(a);
+    let b = BitVec::to_u16x16(b);
+    simd_select::<16, _, u16>(simd_lt(a, b), a, b).into()
+}
+
+/// Compares packed unsigned 32-bit integers in `a` and `b`, and returns
+/// the packed minimum values.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epu32)
+
+pub fn _mm256_min_epu32(a: __m256i, b: __m256i) -> __m256i {
+    let a = BitVec::to_u32x8(a);
+    let b = BitVec::to_u32x8(b);
+    simd_select::<8, _, u32>(simd_lt(a, b), a, b).into()
+}
+
+/// Compares packed unsigned 8-bit integers in `a` and `b`, and returns
+/// the packed minimum values.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epu8)
+
+pub fn _mm256_min_epu8(a: __m256i, b: __m256i) -> __m256i {
+    let a = BitVec::to_u8x32(a);
+    let b = BitVec::to_u8x32(b);
+    simd_select::<32, _, u8>(simd_lt(a, b), a, b).into()
+}
+
+/// Creates a mask from the most significant bit of each 8-bit element in `a`,
+/// and returns the result.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movemask_epi8)
+
+pub fn _mm256_movemask_epi8(a: __m256i) -> i32 {
+    let z = i8x32::from_fn(|_| 0);
+    let m: i8x32 = simd_lt(BitVec::to_i8x32(a), z);
+    let r = simd_bitmask_little!(31, m, u32);
+    r as i32
+}
+
+/// Computes the sum of absolute differences (SADs) of quadruplets of unsigned
+/// 8-bit integers in `a` compared to those in `b`, and stores the 16-bit
+/// results in dst. Eight SADs are performed for each 128-bit lane using one
+/// quadruplet from `b` and eight quadruplets from `a`. One quadruplet is
+/// selected from `b` starting at the offset specified in `imm8`. Eight
+/// quadruplets are formed from sequential 8-bit integers selected from `a`
+/// starting at the offset specified in `imm8`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mpsadbw_epu8)
+
+pub fn _mm256_mpsadbw_epu8<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
+    mpsadbw(BitVec::to_u8x32(a), BitVec::to_u8x32(b), IMM8).into()
+}
+
+/// Multiplies the low 32-bit integers from each packed 64-bit element in
+/// `a` and `b`.
+///
+/// Returns the 64-bit results.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mul_epi32)
+
+pub fn _mm256_mul_epi32(a: __m256i, b: __m256i) -> __m256i {
+    let a = simd_cast::<4, _, i64>(simd_cast::<4, _, i32>(BitVec::to_i64x4(a)));
+    let b = simd_cast::<4, _, i64>(simd_cast::<4, _, i32>(BitVec::to_i64x4(b)));
+    simd_mul(a, b).into()
+}
+
+/// Multiplies the low unsigned 32-bit integers from each packed 64-bit
+/// element in `a` and `b`
+///
+/// Returns the unsigned 64-bit results.
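+///
+/// Only the low 32 bits of each 64-bit lane enter the product, which the
+/// masking in the model mirrors (scalar sketch):
+///
+/// ```
+/// let (a, b) = (0xdead_beef_0000_0002u64, 0x1_0000_0003u64);
+/// assert_eq!((a & 0xffff_ffff) * (b & 0xffff_ffff), 6);
+/// ```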
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mul_epu32) + +pub fn _mm256_mul_epu32(a: __m256i, b: __m256i) -> __m256i { + let a = BitVec::to_u64x4(a); + let b = BitVec::to_u64x4(b); + let mask = u64x4::splat(u32::MAX.into()); + BitVec::from_u64x4(simd_mul(simd_and(a, mask), simd_and(b, mask))) +} + +/// Multiplies the packed 16-bit integers in `a` and `b`, producing +/// intermediate 32-bit integers and returning the high 16 bits of the +/// intermediate integers. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mulhi_epi16) + +pub fn _mm256_mulhi_epi16(a: __m256i, b: __m256i) -> __m256i { + let a = simd_cast::<16, _, i32>(BitVec::to_i16x16(a)); + let b = simd_cast::<16, _, i32>(BitVec::to_i16x16(b)); + let r = simd_shr(simd_mul(a, b), i32x16::splat(16)); + simd_cast::<16, i32, i16>(r).into() +} + +/// Multiplies the packed unsigned 16-bit integers in `a` and `b`, producing +/// intermediate 32-bit integers and returning the high 16 bits of the +/// intermediate integers. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mulhi_epu16) + +pub fn _mm256_mulhi_epu16(a: __m256i, b: __m256i) -> __m256i { + let a = simd_cast::<16, _, u32>(BitVec::to_u16x16(a)); + let b = simd_cast::<16, _, u32>(BitVec::to_u16x16(b)); + let r = simd_shr(simd_mul(a, b), u32x16::splat(16)); + simd_cast::<16, u32, u16>(r).into() +} + +/// Multiplies the packed 16-bit integers in `a` and `b`, producing +/// intermediate 32-bit integers, and returns the low 16 bits of the +/// intermediate integers +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mullo_epi16) + +pub fn _mm256_mullo_epi16(a: __m256i, b: __m256i) -> __m256i { + simd_mul(BitVec::to_i16x16(a), BitVec::to_i16x16(b)).into() +} + +/// Multiplies the packed 32-bit integers in `a` and `b`, producing +/// intermediate 64-bit integers, and returns the low 32 bits of the +/// intermediate integers +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mullo_epi32) + +pub fn _mm256_mullo_epi32(a: __m256i, b: __m256i) -> __m256i { + simd_mul(BitVec::to_i32x8(a), BitVec::to_i32x8(b)).into() +} + +/// Multiplies packed 16-bit integers in `a` and `b`, producing +/// intermediate signed 32-bit integers. Truncate each intermediate +/// integer to the 18 most significant bits, round by adding 1, and +/// return bits `[16:1]`. 
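+///
+/// In Q15 fixed-point terms this is a rounding multiply; a scalar sketch of
+/// one lane (`mulhrs_lane` is illustrative, not part of the model):
+///
+/// ```
+/// fn mulhrs_lane(a: i16, b: i16) -> i16 {
+///     let t = (a as i32) * (b as i32);
+///     (((t >> 14) + 1) >> 1) as i16
+/// }
+/// // 0.5 * 0.5 = 0.25 in Q15:
+/// assert_eq!(mulhrs_lane(0x4000, 0x4000), 0x2000);
+/// ```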
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mulhrs_epi16)
+
+pub fn _mm256_mulhrs_epi16(a: __m256i, b: __m256i) -> __m256i {
+    pmulhrsw(BitVec::to_i16x16(a), BitVec::to_i16x16(b)).into()
+}
+
+/// Computes the bitwise OR of 256 bits (representing integer data) in `a`
+/// and `b`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_or_si256)
+
+pub fn _mm256_or_si256(a: __m256i, b: __m256i) -> __m256i {
+    simd_or(BitVec::to_i32x8(a), BitVec::to_i32x8(b)).into()
+}
+
+/// Converts packed 16-bit integers from `a` and `b` to packed 8-bit integers
+/// using signed saturation
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packs_epi16)
+
+pub fn _mm256_packs_epi16(a: __m256i, b: __m256i) -> __m256i {
+    packsswb(BitVec::to_i16x16(a), BitVec::to_i16x16(b)).into()
+}
+
+/// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers
+/// using signed saturation
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packs_epi32)
+
+pub fn _mm256_packs_epi32(a: __m256i, b: __m256i) -> __m256i {
+    packssdw(BitVec::to_i32x8(a), BitVec::to_i32x8(b)).into()
+}
+
+/// Converts packed 16-bit integers from `a` and `b` to packed 8-bit integers
+/// using unsigned saturation
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packus_epi16)
+
+pub fn _mm256_packus_epi16(a: __m256i, b: __m256i) -> __m256i {
+    packuswb(BitVec::to_i16x16(a), BitVec::to_i16x16(b)).into()
+}
+
+/// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers
+/// using unsigned saturation
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packus_epi32)
+
+pub fn _mm256_packus_epi32(a: __m256i, b: __m256i) -> __m256i {
+    packusdw(BitVec::to_i32x8(a), BitVec::to_i32x8(b)).into()
+}
+
+/// Permutes packed 32-bit integers from `a` according to the content of `b`.
+///
+/// The last 3 bits of each integer of `b` are used as addresses into the 8
+/// integers of `a`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutevar8x32_epi32)
+
+pub fn _mm256_permutevar8x32_epi32(a: __m256i, b: __m256i) -> __m256i {
+    permd(BitVec::to_u32x8(a), BitVec::to_u32x8(b)).into()
+}
+
+/// Permutes 64-bit integers from `a` using control mask `IMM8`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute4x64_epi64)
+
+pub fn _mm256_permute4x64_epi64<const IMM8: i32>(a: __m256i) -> __m256i {
+    let zero = i64x4::from_fn(|_| 0);
+    let r: i64x4 = simd_shuffle(
+        BitVec::to_i64x4(a),
+        zero,
+        [
+            IMM8 as u64 & 0b11,
+            (IMM8 as u64 >> 2) & 0b11,
+            (IMM8 as u64 >> 4) & 0b11,
+            (IMM8 as u64 >> 6) & 0b11,
+        ],
+    );
+    r.into()
+}
+
+/// Shuffles 128-bits of integer data selected by `imm8` from `a` and `b`.
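+///
+/// Each 128-bit half of the result is chosen by a 4-bit control: bits `1:0`
+/// select among `a.lo`, `a.hi`, `b.lo`, `b.hi`, and bit `3` forces zero
+/// (`half` below is an illustrative sketch, not part of the model):
+///
+/// ```
+/// fn half(control: u8, halves: [u128; 4]) -> u128 {
+///     if control & 0b1000 != 0 { 0 } else { halves[(control & 0b11) as usize] }
+/// }
+/// assert_eq!(half(0b1000, [1, 2, 3, 4]), 0); // bit 3 set: zeroed
+/// assert_eq!(half(0b0010, [1, 2, 3, 4]), 3); // selects b.lo
+/// ```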
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2x128_si256)
+
+pub fn _mm256_permute2x128_si256<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
+    vperm2i128(BitVec::to_i64x4(a), BitVec::to_i64x4(b), IMM8 as i8).into()
+}
+
+/// Computes the absolute differences of packed unsigned 8-bit integers in `a`
+/// and `b`, then horizontally sum each consecutive 8 differences to
+/// produce four unsigned 16-bit integers, and pack these unsigned 16-bit
+/// integers in the low 16 bits of the 64-bit return value
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sad_epu8)
+
+pub fn _mm256_sad_epu8(a: __m256i, b: __m256i) -> __m256i {
+    psadbw(BitVec::to_u8x32(a), BitVec::to_u8x32(b)).into()
+}
+
+/// Shuffles bytes from `a` according to the content of `b`.
+///
+/// For each of the 128-bit low and high halves of the vectors, the last
+/// 4 bits of each byte of `b` are used as addresses into the respective
+/// low or high 16 bytes of `a`. That is, the halves are shuffled separately.
+///
+/// In addition, if the highest significant bit of a byte of `b` is set, the
+/// respective destination byte is set to 0.
+///
+/// Picturing `a` and `b` as `[u8; 32]`, `_mm256_shuffle_epi8` is logically
+/// equivalent to:
+///
+/// ```
+/// fn mm256_shuffle_epi8(a: [u8; 32], b: [u8; 32]) -> [u8; 32] {
+///     let mut r = [0; 32];
+///     for i in 0..16 {
+///         if b[i] & 0x80 == 0u8 {
+///             r[i] = a[(b[i] % 16) as usize];
+///         }
+///         if b[i + 16] & 0x80 == 0u8 {
+///             r[i + 16] = a[(b[i + 16] % 16 + 16) as usize];
+///         }
+///     }
+///     r
+/// }
+/// ```
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shuffle_epi8)
+
+pub fn _mm256_shuffle_epi8(a: __m256i, b: __m256i) -> __m256i {
+    pshufb(BitVec::to_u8x32(a), BitVec::to_u8x32(b)).into()
+}
+
+/// Shuffles 32-bit integers in 128-bit lanes of `a` using the control in
+/// `imm8`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shuffle_epi32)
+
+pub fn _mm256_shuffle_epi32<const MASK: i32>(a: __m256i) -> __m256i {
+    let r: i32x8 = simd_shuffle(
+        BitVec::to_i32x8(a),
+        BitVec::to_i32x8(a),
+        [
+            MASK as u64 & 0b11,
+            (MASK as u64 >> 2) & 0b11,
+            (MASK as u64 >> 4) & 0b11,
+            (MASK as u64 >> 6) & 0b11,
+            (MASK as u64 & 0b11) + 4,
+            ((MASK as u64 >> 2) & 0b11) + 4,
+            ((MASK as u64 >> 4) & 0b11) + 4,
+            ((MASK as u64 >> 6) & 0b11) + 4,
+        ],
+    );
+    r.into()
+}
+
+/// Shuffles 16-bit integers in the high 64 bits of 128-bit lanes of `a` using
+/// the control in `imm8`. The low 64 bits of 128-bit lanes of `a` are copied
+/// to the output.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shufflehi_epi16)
+
+pub fn _mm256_shufflehi_epi16<const IMM8: i32>(a: __m256i) -> __m256i {
+    let a = BitVec::to_i16x16(a);
+    let r: i16x16 = simd_shuffle(
+        a,
+        a,
+        [
+            0,
+            1,
+            2,
+            3,
+            4 + (IMM8 as u64 & 0b11),
+            4 + ((IMM8 as u64 >> 2) & 0b11),
+            4 + ((IMM8 as u64 >> 4) & 0b11),
+            4 + ((IMM8 as u64 >> 6) & 0b11),
+            8,
+            9,
+            10,
+            11,
+            12 + (IMM8 as u64 & 0b11),
+            12 + ((IMM8 as u64 >> 2) & 0b11),
+            12 + ((IMM8 as u64 >> 4) & 0b11),
+            12 + ((IMM8 as u64 >> 6) & 0b11),
+        ],
+    );
+    r.into()
+}
+
+/// Shuffles 16-bit integers in the low 64 bits of 128-bit lanes of `a` using
+/// the control in `imm8`. The high 64 bits of 128-bit lanes of `a` are copied
+/// to the output.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shufflelo_epi16)
+
+pub fn _mm256_shufflelo_epi16<const IMM8: i32>(a: __m256i) -> __m256i {
+    let a = BitVec::to_i16x16(a);
+    let r: i16x16 = simd_shuffle(
+        a,
+        a,
+        [
+            0 + (IMM8 as u64 & 0b11),
+            0 + ((IMM8 as u64 >> 2) & 0b11),
+            0 + ((IMM8 as u64 >> 4) & 0b11),
+            0 + ((IMM8 as u64 >> 6) & 0b11),
+            4,
+            5,
+            6,
+            7,
+            8 + (IMM8 as u64 & 0b11),
+            8 + ((IMM8 as u64 >> 2) & 0b11),
+            8 + ((IMM8 as u64 >> 4) & 0b11),
+            8 + ((IMM8 as u64 >> 6) & 0b11),
+            12,
+            13,
+            14,
+            15,
+        ],
+    );
+    r.into()
+}
+
+/// Negates packed 16-bit integers in `a` when the corresponding signed
+/// 16-bit integer in `b` is negative, and returns the results.
+/// Results are zeroed out when the corresponding element in `b` is zero.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sign_epi16)
+
+pub fn _mm256_sign_epi16(a: __m256i, b: __m256i) -> __m256i {
+    psignw(BitVec::to_i16x16(a), BitVec::to_i16x16(b)).into()
+}
+
+/// Negates packed 32-bit integers in `a` when the corresponding signed
+/// 32-bit integer in `b` is negative, and returns the results.
+/// Results are zeroed out when the corresponding element in `b` is zero.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sign_epi32)
+
+pub fn _mm256_sign_epi32(a: __m256i, b: __m256i) -> __m256i {
+    psignd(BitVec::to_i32x8(a), BitVec::to_i32x8(b)).into()
+}
+
+/// Negates packed 8-bit integers in `a` when the corresponding signed
+/// 8-bit integer in `b` is negative, and returns the results.
+/// Results are zeroed out when the corresponding element in `b` is zero.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sign_epi8)
+
+pub fn _mm256_sign_epi8(a: __m256i, b: __m256i) -> __m256i {
+    psignb(BitVec::to_i8x32(a), BitVec::to_i8x32(b)).into()
+}
+
+/// Shifts packed 16-bit integers in `a` left by `count` while
+/// shifting in zeros, and returns the result
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sll_epi16)
+
+pub fn _mm256_sll_epi16(a: __m256i, count: __m128i) -> __m256i {
+    psllw(BitVec::to_i16x16(a), BitVec::to_i16x8(count)).into()
+}
+
+/// Shifts packed 32-bit integers in `a` left by `count` while
+/// shifting in zeros, and returns the result
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sll_epi32)
+
+pub fn _mm256_sll_epi32(a: __m256i, count: __m128i) -> __m256i {
+    pslld(BitVec::to_i32x8(a), BitVec::to_i32x4(count)).into()
+}
+
+/// Shifts packed 64-bit integers in `a` left by `count` while
+/// shifting in zeros, and returns the result
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sll_epi64)
+
+pub fn _mm256_sll_epi64(a: __m256i, count: __m128i) -> __m256i {
+    psllq(BitVec::to_i64x4(a), BitVec::to_i64x2(count)).into()
+}
+
+/// Shifts packed 16-bit integers in `a` left by `IMM8` while
+/// shifting in zeros, returning the results.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_slli_epi16)
+
+pub fn _mm256_slli_epi16<const IMM8: i32>(a: __m256i) -> __m256i {
+    if IMM8 >= 16 {
+        _mm256_setzero_si256()
+    } else {
+        simd_shl(BitVec::to_u16x16(a), u16x16::splat(IMM8 as u16)).into()
+    }
+}
+
+/// Shifts packed 32-bit integers in `a` left by `IMM8` while
+/// shifting in zeros, returning the results.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_slli_epi32)
+
+pub fn _mm256_slli_epi32<const IMM8: i32>(a: __m256i) -> __m256i {
+    if IMM8 >= 32 {
+        _mm256_setzero_si256()
+    } else {
+        simd_shl(BitVec::to_u32x8(a), u32x8::splat(IMM8 as u32)).into()
+    }
+}
+
+/// Shifts packed 64-bit integers in `a` left by `IMM8` while
+/// shifting in zeros, returning the results.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_slli_epi64)
+
+pub fn _mm256_slli_epi64<const IMM8: i32>(a: __m256i) -> __m256i {
+    if IMM8 >= 64 {
+        _mm256_setzero_si256()
+    } else {
+        simd_shl(BitVec::to_u64x4(a), u64x4::splat(IMM8 as u64)).into()
+    }
+}
+
+/// Shifts 128-bit lanes in `a` left by `imm8` bytes while shifting in zeros.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_slli_si256)
+
+pub fn _mm256_slli_si256<const IMM8: i32>(a: __m256i) -> __m256i {
+    _mm256_bslli_epi128::<IMM8>(a)
+}
+
+/// Shifts 128-bit lanes in `a` left by `imm8` bytes while shifting in zeros.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_bslli_epi128)
+
+pub fn _mm256_bslli_epi128<const IMM8: i32>(a: __m256i) -> __m256i {
+    const fn mask(shift: i32, i: u32) -> u32 {
+        let shift = shift as u32 & 0xff;
+        if shift > 15 || i % 16 < shift {
+            0
+        } else {
+            32 + (i - shift)
+        }
+    }
+    let a = BitVec::to_i8x32(a);
+    let r: i8x32 = simd_shuffle(
+        i8x32::from_fn(|_| 0),
+        a,
+        [
+            mask(IMM8, 0) as u64,
+            mask(IMM8, 1) as u64,
+            mask(IMM8, 2) as u64,
+            mask(IMM8, 3) as u64,
+            mask(IMM8, 4) as u64,
+            mask(IMM8, 5) as u64,
+            mask(IMM8, 6) as u64,
+            mask(IMM8, 7) as u64,
+            mask(IMM8, 8) as u64,
+            mask(IMM8, 9) as u64,
+            mask(IMM8, 10) as u64,
+            mask(IMM8, 11) as u64,
+            mask(IMM8, 12) as u64,
+            mask(IMM8, 13) as u64,
+            mask(IMM8, 14) as u64,
+            mask(IMM8, 15) as u64,
+            mask(IMM8, 16) as u64,
+            mask(IMM8, 17) as u64,
+            mask(IMM8, 18) as u64,
+            mask(IMM8, 19) as u64,
+            mask(IMM8, 20) as u64,
+            mask(IMM8, 21) as u64,
+            mask(IMM8, 22) as u64,
+            mask(IMM8, 23) as u64,
+            mask(IMM8, 24) as u64,
+            mask(IMM8, 25) as u64,
+            mask(IMM8, 26) as u64,
+            mask(IMM8, 27) as u64,
+            mask(IMM8, 28) as u64,
+            mask(IMM8, 29) as u64,
+            mask(IMM8, 30) as u64,
+            mask(IMM8, 31) as u64,
+        ],
+    );
+    r.into()
+}
+
+/// Shifts packed 32-bit integers in `a` left by the amount
+/// specified by the corresponding element in `count` while
+/// shifting in zeros, and returns the result.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sllv_epi32)
+
+pub fn _mm_sllv_epi32(a: __m128i, count: __m128i) -> __m128i {
+    psllvd(BitVec::to_i32x4(a), BitVec::to_i32x4(count)).into()
+}
+
+/// Shifts packed 32-bit integers in `a` left by the amount
+/// specified by the corresponding element in `count` while
+/// shifting in zeros, and returns the result.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sllv_epi32)
+
+pub fn _mm256_sllv_epi32(a: __m256i, count: __m256i) -> __m256i {
+    psllvd256(BitVec::to_i32x8(a), BitVec::to_i32x8(count)).into()
+}
+
+/// Shifts packed 64-bit integers in `a` left by the amount
+/// specified by the corresponding element in `count` while
+/// shifting in zeros, and returns the result.
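+///
+/// Unlike the immediate forms, the shift count is per lane, and any count
+/// outside `0..=63` zeroes that lane (`sllv64_lane` is an illustrative
+/// sketch):
+///
+/// ```
+/// fn sllv64_lane(a: u64, count: i64) -> u64 {
+///     if (0..=63).contains(&count) { a << count } else { 0 }
+/// }
+/// assert_eq!(sllv64_lane(1, 3), 8);
+/// assert_eq!(sllv64_lane(1, 64), 0);
+/// ```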
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sllv_epi64)
+
+pub fn _mm_sllv_epi64(a: __m128i, count: __m128i) -> __m128i {
+    psllvq(BitVec::to_i64x2(a), BitVec::to_i64x2(count)).into()
+}
+
+/// Shifts packed 64-bit integers in `a` left by the amount
+/// specified by the corresponding element in `count` while
+/// shifting in zeros, and returns the result.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sllv_epi64)
+
+pub fn _mm256_sllv_epi64(a: __m256i, count: __m256i) -> __m256i {
+    psllvq256(BitVec::to_i64x4(a), BitVec::to_i64x4(count)).into()
+}
+
+/// Shifts packed 16-bit integers in `a` right by `count` while
+/// shifting in sign bits.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sra_epi16)
+
+pub fn _mm256_sra_epi16(a: __m256i, count: __m128i) -> __m256i {
+    psraw(BitVec::to_i16x16(a), BitVec::to_i16x8(count)).into()
+}
+
+/// Shifts packed 32-bit integers in `a` right by `count` while
+/// shifting in sign bits.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sra_epi32)
+
+pub fn _mm256_sra_epi32(a: __m256i, count: __m128i) -> __m256i {
+    psrad(BitVec::to_i32x8(a), BitVec::to_i32x4(count)).into()
+}
+
+/// Shifts packed 16-bit integers in `a` right by `IMM8` while
+/// shifting in sign bits.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srai_epi16)
+
+pub fn _mm256_srai_epi16<const IMM8: i32>(a: __m256i) -> __m256i {
+    simd_shr(BitVec::to_i16x16(a), i16x16::splat(IMM8.min(15) as i16)).into()
+}
+
+/// Shifts packed 32-bit integers in `a` right by `IMM8` while
+/// shifting in sign bits.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srai_epi32)
+
+pub fn _mm256_srai_epi32<const IMM8: i32>(a: __m256i) -> __m256i {
+    simd_shr(BitVec::to_i32x8(a), i32x8::splat(IMM8.min(31))).into()
+}
+
+/// Shifts packed 32-bit integers in `a` right by the amount specified by the
+/// corresponding element in `count` while shifting in sign bits.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srav_epi32)
+
+pub fn _mm_srav_epi32(a: __m128i, count: __m128i) -> __m128i {
+    psravd(BitVec::to_i32x4(a), BitVec::to_i32x4(count)).into()
+}
+
+/// Shifts packed 32-bit integers in `a` right by the amount specified by the
+/// corresponding element in `count` while shifting in sign bits.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srav_epi32)
+
+pub fn _mm256_srav_epi32(a: __m256i, count: __m256i) -> __m256i {
+    psravd256(BitVec::to_i32x8(a), BitVec::to_i32x8(count)).into()
+}
+
+/// Shifts 128-bit lanes in `a` right by `imm8` bytes while shifting in zeros.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srli_si256)
+
+pub fn _mm256_srli_si256<const IMM8: i32>(a: __m256i) -> __m256i {
+    _mm256_bsrli_epi128::<IMM8>(a)
+}
+
+/// Shifts 128-bit lanes in `a` right by `imm8` bytes while shifting in zeros.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_bsrli_epi128)
+
+pub fn _mm256_bsrli_epi128<const IMM8: i32>(a: __m256i) -> __m256i {
+    const fn mask(shift: i32, i: u32) -> u64 {
+        let shift = shift as u32 & 0xff;
+        if shift > 15 || (15 - (i % 16)) < shift {
+            0 as u64
+        } else {
+            (32 + (i + shift)) as u64
+        }
+    }
+
+    let a = BitVec::to_i8x32(a);
+    let r: i8x32 = simd_shuffle(
+        i8x32::from_fn(|_| 0),
+        a,
+        [
+            mask(IMM8, 0),
+            mask(IMM8, 1),
+            mask(IMM8, 2),
+            mask(IMM8, 3),
+            mask(IMM8, 4),
+            mask(IMM8, 5),
+            mask(IMM8, 6),
+            mask(IMM8, 7),
+            mask(IMM8, 8),
+            mask(IMM8, 9),
+            mask(IMM8, 10),
+            mask(IMM8, 11),
+            mask(IMM8, 12),
+            mask(IMM8, 13),
+            mask(IMM8, 14),
+            mask(IMM8, 15),
+            mask(IMM8, 16),
+            mask(IMM8, 17),
+            mask(IMM8, 18),
+            mask(IMM8, 19),
+            mask(IMM8, 20),
+            mask(IMM8, 21),
+            mask(IMM8, 22),
+            mask(IMM8, 23),
+            mask(IMM8, 24),
+            mask(IMM8, 25),
+            mask(IMM8, 26),
+            mask(IMM8, 27),
+            mask(IMM8, 28),
+            mask(IMM8, 29),
+            mask(IMM8, 30),
+            mask(IMM8, 31),
+        ],
+    );
+
+    r.into()
+}
+
+/// Shifts packed 16-bit integers in `a` right by `count` while shifting in
+/// zeros.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srl_epi16)
+
+pub fn _mm256_srl_epi16(a: __m256i, count: __m128i) -> __m256i {
+    psrlw(BitVec::to_i16x16(a), BitVec::to_i16x8(count)).into()
+}
+
+/// Shifts packed 32-bit integers in `a` right by `count` while shifting in
+/// zeros.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srl_epi32)
+
+pub fn _mm256_srl_epi32(a: __m256i, count: __m128i) -> __m256i {
+    psrld(BitVec::to_i32x8(a), BitVec::to_i32x4(count)).into()
+}
+
+/// Shifts packed 64-bit integers in `a` right by `count` while shifting in
+/// zeros.
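+///
+/// The shift is logical, and a count above 63 clears every lane (scalar
+/// sketch):
+///
+/// ```
+/// let shift = |x: u64, n: u32| if n > 63 { 0 } else { x >> n };
+/// assert_eq!(shift(16, 2), 4);
+/// assert_eq!(shift(u64::MAX, 64), 0);
+/// ```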
+
+/// Shifts packed 16-bit integers in `a` right by `count` while shifting in
+/// zeros.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srl_epi16)
+
+pub fn _mm256_srl_epi16(a: __m256i, count: __m128i) -> __m256i {
+    psrlw(BitVec::to_i16x16(a), BitVec::to_i16x8(count)).into()
+}
+
+/// Shifts packed 32-bit integers in `a` right by `count` while shifting in
+/// zeros.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srl_epi32)
+
+pub fn _mm256_srl_epi32(a: __m256i, count: __m128i) -> __m256i {
+    psrld(BitVec::to_i32x8(a), BitVec::to_i32x4(count)).into()
+}
+
+/// Shifts packed 64-bit integers in `a` right by `count` while shifting in
+/// zeros.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srl_epi64)
+
+pub fn _mm256_srl_epi64(a: __m256i, count: __m128i) -> __m256i {
+    psrlq(BitVec::to_i64x4(a), BitVec::to_i64x2(count)).into()
+}
+
+/// Shifts packed 16-bit integers in `a` right by `IMM8` while shifting in
+/// zeros.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srli_epi16)
+
+pub fn _mm256_srli_epi16<const IMM8: i32>(a: __m256i) -> __m256i {
+    if IMM8 >= 16 {
+        _mm256_setzero_si256()
+    } else {
+        simd_shr(BitVec::to_u16x16(a), u16x16::splat(IMM8 as u16)).into()
+    }
+}
+
+/// Shifts packed 32-bit integers in `a` right by `IMM8` while shifting in
+/// zeros.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srli_epi32)
+
+pub fn _mm256_srli_epi32<const IMM8: i32>(a: __m256i) -> __m256i {
+    if IMM8 >= 32 {
+        _mm256_setzero_si256()
+    } else {
+        simd_shr(BitVec::to_u32x8(a), u32x8::splat(IMM8 as u32)).into()
+    }
+}
+
+/// Shifts packed 64-bit integers in `a` right by `IMM8` while shifting in
+/// zeros.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srli_epi64)
+
+pub fn _mm256_srli_epi64<const IMM8: i32>(a: __m256i) -> __m256i {
+    if IMM8 >= 64 {
+        _mm256_setzero_si256()
+    } else {
+        simd_shr(BitVec::to_u64x4(a), u64x4::splat(IMM8 as u64)).into()
+    }
+}
+
+/// Shifts packed 32-bit integers in `a` right by the amount specified by
+/// the corresponding element in `count` while shifting in zeros.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srlv_epi32)
+
+pub fn _mm_srlv_epi32(a: __m128i, count: __m128i) -> __m128i {
+    psrlvd(BitVec::to_i32x4(a), BitVec::to_i32x4(count)).into()
+}
+
+/// Shifts packed 32-bit integers in `a` right by the amount specified by
+/// the corresponding element in `count` while shifting in zeros.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srlv_epi32)
+
+pub fn _mm256_srlv_epi32(a: __m256i, count: __m256i) -> __m256i {
+    psrlvd256(BitVec::to_i32x8(a), BitVec::to_i32x8(count)).into()
+}
+
+/// Shifts packed 64-bit integers in `a` right by the amount specified by
+/// the corresponding element in `count` while shifting in zeros.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srlv_epi64)
+
+pub fn _mm_srlv_epi64(a: __m128i, count: __m128i) -> __m128i {
+    psrlvq(BitVec::to_i64x2(a), BitVec::to_i64x2(count)).into()
+}
+
+/// Shifts packed 64-bit integers in `a` right by the amount specified by
+/// the corresponding element in `count` while shifting in zeros.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srlv_epi64)
+
+pub fn _mm256_srlv_epi64(a: __m256i, count: __m256i) -> __m256i {
+    psrlvq256(BitVec::to_i64x4(a), BitVec::to_i64x4(count)).into()
+}
+
+/// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a`
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_epi16)
+
+pub fn _mm256_sub_epi16(a: __m256i, b: __m256i) -> __m256i {
+    simd_sub(BitVec::to_i16x16(a), BitVec::to_i16x16(b)).into()
+}
+
+/// Subtract packed 32-bit integers in `b` from packed 32-bit integers in `a`
+///
+/// [Intel's
documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_epi32) + +pub fn _mm256_sub_epi32(a: __m256i, b: __m256i) -> __m256i { + simd_sub(BitVec::to_i32x8(a), BitVec::to_i32x8(b)).into() +} + +/// Subtract packed 64-bit integers in `b` from packed 64-bit integers in `a` +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_epi64) + +pub fn _mm256_sub_epi64(a: __m256i, b: __m256i) -> __m256i { + simd_sub(BitVec::to_i64x4(a), BitVec::to_i64x4(b)).into() +} + +/// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a` +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_epi8) + +pub fn _mm256_sub_epi8(a: __m256i, b: __m256i) -> __m256i { + simd_sub(BitVec::to_i8x32(a), BitVec::to_i8x32(b)).into() +} + +/// Subtract packed 16-bit integers in `b` from packed 16-bit integers in +/// `a` using saturation. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_subs_epi16) + +pub fn _mm256_subs_epi16(a: __m256i, b: __m256i) -> __m256i { + simd_saturating_sub(BitVec::to_i16x16(a), BitVec::to_i16x16(b)).into() +} + +/// Subtract packed 8-bit integers in `b` from packed 8-bit integers in +/// `a` using saturation. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_subs_epi8) + +pub fn _mm256_subs_epi8(a: __m256i, b: __m256i) -> __m256i { + simd_saturating_sub(BitVec::to_i8x32(a), BitVec::to_i8x32(b)).into() +} + +/// Subtract packed unsigned 16-bit integers in `b` from packed 16-bit +/// integers in `a` using saturation. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_subs_epu16) + +pub fn _mm256_subs_epu16(a: __m256i, b: __m256i) -> __m256i { + simd_saturating_sub(BitVec::to_u16x16(a), BitVec::to_u16x16(b)).into() +} + +/// Subtract packed unsigned 8-bit integers in `b` from packed 8-bit +/// integers in `a` using saturation. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_subs_epu8) + +pub fn _mm256_subs_epu8(a: __m256i, b: __m256i) -> __m256i { + simd_saturating_sub(BitVec::to_u8x32(a), BitVec::to_u8x32(b)).into() +} + +/// Unpacks and interleave 8-bit integers from the high half of each +/// 128-bit lane in `a` and `b`. +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_epi8) + +pub fn _mm256_unpackhi_epi8(a: __m256i, b: __m256i) -> __m256i { + #[rustfmt::skip] + let r: i8x32 = simd_shuffle(BitVec::to_i8x32(a), BitVec::to_i8x32(b), [ + 8, 40, 9, 41, 10, 42, 11, 43, + 12, 44, 13, 45, 14, 46, 15, 47, + 24, 56, 25, 57, 26, 58, 27, 59, + 28, 60, 29, 61, 30, 62, 31, 63, + ]); + r.into() +} + +/// Unpacks and interleave 8-bit integers from the low half of each +/// 128-bit lane of `a` and `b`. 
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_epi8) + +pub fn _mm256_unpacklo_epi8(a: __m256i, b: __m256i) -> __m256i { + #[rustfmt::skip] + let r: i8x32 = simd_shuffle(BitVec::to_i8x32(a), BitVec::to_i8x32(b), [ + 0, 32, 1, 33, 2, 34, 3, 35, + 4, 36, 5, 37, 6, 38, 7, 39, + 16, 48, 17, 49, 18, 50, 19, 51, + 20, 52, 21, 53, 22, 54, 23, 55, + ]); + r.into() +} + +/// Unpacks and interleave 16-bit integers from the high half of each +/// 128-bit lane of `a` and `b`. +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_epi16) + +pub fn _mm256_unpackhi_epi16(a: __m256i, b: __m256i) -> __m256i { + let r: i16x16 = simd_shuffle( + BitVec::to_i16x16(a), + BitVec::to_i16x16(b), + [4, 20, 5, 21, 6, 22, 7, 23, 12, 28, 13, 29, 14, 30, 15, 31], + ); + r.into() +} + +/// Unpacks and interleave 16-bit integers from the low half of each +/// 128-bit lane of `a` and `b`. +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_epi16) + +pub fn _mm256_unpacklo_epi16(a: __m256i, b: __m256i) -> __m256i { + let r: i16x16 = simd_shuffle( + BitVec::to_i16x16(a), + BitVec::to_i16x16(b), + [0, 16, 1, 17, 2, 18, 3, 19, 8, 24, 9, 25, 10, 26, 11, 27], + ); + r.into() +} + +/// Unpacks and interleave 32-bit integers from the high half of each +/// 128-bit lane of `a` and `b`. +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_epi32) + +pub fn _mm256_unpackhi_epi32(a: __m256i, b: __m256i) -> __m256i { + let r: i32x8 = simd_shuffle( + BitVec::to_i32x8(a), + BitVec::to_i32x8(b), + [2, 10, 3, 11, 6, 14, 7, 15], + ); + r.into() +} + +/// Unpacks and interleave 32-bit integers from the low half of each +/// 128-bit lane of `a` and `b`. +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_epi32) + +pub fn _mm256_unpacklo_epi32(a: __m256i, b: __m256i) -> __m256i { + let r: i32x8 = simd_shuffle( + BitVec::to_i32x8(a), + BitVec::to_i32x8(b), + [0, 8, 1, 9, 4, 12, 5, 13], + ); + r.into() +} + +/// Unpacks and interleave 64-bit integers from the high half of each +/// 128-bit lane of `a` and `b`. +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_epi64) + +pub fn _mm256_unpackhi_epi64(a: __m256i, b: __m256i) -> __m256i { + let r: i64x4 = simd_shuffle(BitVec::to_i64x4(a), BitVec::to_i64x4(b), [1, 5, 3, 7]); + r.into() +} + +/// Unpacks and interleave 64-bit integers from the low half of each +/// 128-bit lane of `a` and `b`. +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_epi64) + +pub fn _mm256_unpacklo_epi64(a: __m256i, b: __m256i) -> __m256i { + let r: i64x4 = simd_shuffle(BitVec::to_i64x4(a), BitVec::to_i64x4(b), [0, 4, 2, 6]); + r.into() +} + +/// Computes the bitwise XOR of 256 bits (representing integer data) +/// in `a` and `b` +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_xor_si256) + +pub fn _mm256_xor_si256(a: __m256i, b: __m256i) -> __m256i { + simd_xor(BitVec::to_i64x4(a), BitVec::to_i64x4(b)).into() +} + +/// Extracts an 8-bit integer from `a`, selected with `INDEX`. Returns a 32-bit +/// integer containing the zero-extended integer data. 
+///
+/// See [LLVM commit D20468](https://reviews.llvm.org/D20468).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extract_epi8)
+
+// This intrinsic has no corresponding instruction.
+
+pub fn _mm256_extract_epi8<const INDEX: i32>(a: __m256i) -> i32 {
+    simd_extract(BitVec::to_u8x32(a), INDEX as u64) as u32 as i32
+}
+
+/// Extracts a 16-bit integer from `a`, selected with `INDEX`. Returns a 32-bit
+/// integer containing the zero-extended integer data.
+///
+/// See [LLVM commit D20468](https://reviews.llvm.org/D20468).
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extract_epi16)
+
+// This intrinsic has no corresponding instruction.
+
+pub fn _mm256_extract_epi16<const INDEX: i32>(a: __m256i) -> i32 {
+    simd_extract(BitVec::to_u16x16(a), INDEX as u64) as u32 as i32
+}
diff --git a/testable-simd-models/src/core_arch/x86/models/mod.rs b/testable-simd-models/src/core_arch/x86/models/mod.rs
new file mode 100644
index 0000000000000..95c9eb4061b6a
--- /dev/null
+++ b/testable-simd-models/src/core_arch/x86/models/mod.rs
@@ -0,0 +1,37 @@
+//! Rust models for x86 intrinsics.
+//!
+//! This module contains models for the intrinsics as they are defined in the Rust core.
+//! Since this is supposed to model the Rust core, the implemented functions must
+//! mirror the Rust implementations as closely as possible.
+//!
+//! For example, calls to SIMD functions like `simd_add` and `simd_sub` are left as is,
+//! with their implementations defined in `crate::abstractions::simd`. Some other
+//! operations like `simd_cast` or `simd_shuffle` might need a little modification
+//! for correct compilation.
+//!
+//! Calls to `transmute` are replaced with either an explicit call to a `BitVec::from_*`
+//! function, or with `.into()`.
+//!
+//! Sometimes, an intrinsic in Rust is implemented by directly using the corresponding
+//! LLVM instruction via an `unsafe extern "C"` module. In those cases, the corresponding
+//! function is defined in the `c_extern` module in each file, which contains manually
+//! written implementations made by consulting the appropriate Intel documentation.
+//!
+//! In general, it is best to gain an idea of how an implementation should be written by
+//! looking at how other functions are implemented. Also see `core::arch::x86` for
+//! [reference](https://github.com/rust-lang/stdarch/tree/master/crates/core_arch).
+
+pub mod avx;
+pub mod avx2;
+pub mod sse2;
+pub mod ssse3;
+
+pub(crate) mod types {
+    use crate::abstractions::bitvec::*;
+
+    #[allow(non_camel_case_types)]
+    pub type __m256i = BitVec<256>;
+    #[allow(non_camel_case_types)]
+    pub type __m256 = BitVec<256>;
+    #[allow(non_camel_case_types)]
+    pub type __m128i = BitVec<128>;
+}
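+
+// Illustrative sketch of the convention described in the module docs, using
+// `_mm_add_epi16`; the `core` line paraphrases the stdarch source, while the
+// model line is the actual definition in this crate's `sse2.rs`:
+//
+//   core:  unsafe { transmute(simd_add(a.as_i16x8(), b.as_i16x8())) }
+//   model: BitVec::from_i16x8(simd_add(BitVec::to_i16x8(a), BitVec::to_i16x8(b)))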
diff --git a/testable-simd-models/src/core_arch/x86/models/sse2.rs b/testable-simd-models/src/core_arch/x86/models/sse2.rs
new file mode 100644
index 0000000000000..ed57f03cfd5d8
--- /dev/null
+++ b/testable-simd-models/src/core_arch/x86/models/sse2.rs
@@ -0,0 +1,1303 @@
+//! Streaming SIMD Extensions 2 (SSE2)
+use super::types::*;
+use crate::abstractions::{bit::Bit, bitvec::BitVec, simd::*};
+mod c_extern {
+    use crate::abstractions::{bit::MachineInteger, simd::*};
+    pub fn packsswb(a: i16x8, b: i16x8) -> i8x16 {
+        i8x16::from_fn(|i| {
+            if i < 8 {
+                if a[i] > (i8::MAX as i16) {
+                    i8::MAX
+                } else if a[i] < (i8::MIN as i16) {
+                    i8::MIN
+                } else {
+                    a[i] as i8
+                }
+            } else {
+                if b[i - 8] > (i8::MAX as i16) {
+                    i8::MAX
+                } else if b[i - 8] < (i8::MIN as i16) {
+                    i8::MIN
+                } else {
+                    b[i - 8] as i8
+                }
+            }
+        })
+    }
+    pub fn pmaddwd(a: i16x8, b: i16x8) -> i32x4 {
+        i32x4::from_fn(|i| {
+            (a[2 * i] as i32) * (b[2 * i] as i32) + (a[2 * i + 1] as i32) * (b[2 * i + 1] as i32)
+        })
+    }
+    pub fn psadbw(a: u8x16, b: u8x16) -> u64x2 {
+        let tmp = u8x16::from_fn(|i| a[i].absolute_diff(b[i]));
+        u64x2::from_fn(|i| {
+            (tmp[i * 8] as u16)
+                .wrapping_add(tmp[i * 8 + 1] as u16)
+                .wrapping_add(tmp[i * 8 + 2] as u16)
+                .wrapping_add(tmp[i * 8 + 3] as u16)
+                .wrapping_add(tmp[i * 8 + 4] as u16)
+                .wrapping_add(tmp[i * 8 + 5] as u16)
+                .wrapping_add(tmp[i * 8 + 6] as u16)
+                .wrapping_add(tmp[i * 8 + 7] as u16) as u64
+        })
+    }
+    pub fn psllw(a: i16x8, count: i16x8) -> i16x8 {
+        // The shift amount is the low 64 bits of `count`, reassembled here from
+        // its four 16-bit lanes (the multipliers are 2^16, 2^32, and 2^48).
+        let count4: u64 = (count[0] as u16) as u64;
+        let count3: u64 = ((count[1] as u16) as u64) * 65536;
+        let count2: u64 = ((count[2] as u16) as u64) * 4294967296;
+        let count1: u64 = ((count[3] as u16) as u64) * 281474976710656;
+        let count = count1 + count2 + count3 + count4;
+        i16x8::from_fn(|i| {
+            if count > 15 {
+                0
+            } else {
+                ((a[i] as u16) << count) as i16
+            }
+        })
+    }
+
+    pub fn pslld(a: i32x4, count: i32x4) -> i32x4 {
+        let count: u64 = ((count[1] as u32) as u64) * 4294967296 + ((count[0] as u32) as u64);
+
+        i32x4::from_fn(|i| {
+            if count > 31 {
+                0
+            } else {
+                ((a[i] as u32) << count) as i32
+            }
+        })
+    }
+
+    pub fn psllq(a: i64x2, count: i64x2) -> i64x2 {
+        let count: u64 = count[0] as u64;
+
+        i64x2::from_fn(|i| {
+            if count > 63 {
+                0
+            } else {
+                ((a[i] as u64) << count) as i64
+            }
+        })
+    }
+
+    pub fn psraw(a: i16x8, count: i16x8) -> i16x8 {
+        let count: u64 = ((count[3] as u16) as u64) * 281474976710656
+            + ((count[2] as u16) as u64) * 4294967296
+            + ((count[1] as u16) as u64) * 65536
+            + ((count[0] as u16) as u64);
+
+        i16x8::from_fn(|i| {
+            if count > 15 {
+                if a[i] < 0 {
+                    -1
+                } else {
+                    0
+                }
+            } else {
+                a[i] >> count
+            }
+        })
+    }
+
+    pub fn psrad(a: i32x4, count: i32x4) -> i32x4 {
+        let count: u64 = ((count[1] as u32) as u64) * 4294967296 + ((count[0] as u32) as u64);
+
+        i32x4::from_fn(|i| {
+            if count > 31 {
+                if a[i] < 0 {
+                    -1
+                } else {
+                    0
+                }
+            } else {
+                a[i] >> count
+            }
+        })
+    }
+
+    pub fn psrlw(a: i16x8, count: i16x8) -> i16x8 {
+        let count: u64 = (count[3] as u16 as u64) * 281474976710656
+            + (count[2] as u16 as u64) * 4294967296
+            + (count[1] as u16 as u64) * 65536
+            + (count[0] as u16 as u64);
+
+        i16x8::from_fn(|i| {
+            if count > 15 {
+                0
+            } else {
+                ((a[i] as u16) >> count) as i16
+            }
+        })
+    }
+
+    pub fn psrld(a: i32x4, count: i32x4) -> i32x4 {
+        let count: u64 = (count[1] as u32 as u64) * 4294967296 + (count[0] as u32 as u64);
+
+        i32x4::from_fn(|i| {
+            if count > 31 {
+                0
+            } else {
+                ((a[i] as u32) >> count) as i32
+            }
+        })
+    }
+
+    pub fn psrlq(a: i64x2, count: i64x2) -> i64x2 {
+        let count: u64 = count[0] as u64;
+
+        i64x2::from_fn(|i| {
+            if count > 63 {
+                0
+            } else {
+                ((a[i] as u64) >> count) as i64
+            }
+        })
+    }
+
+    pub fn packssdw(a: i32x4, b: i32x4) -> i16x8 {
+        i16x8::from_fn(|i| {
+            if i < 4 {
+                if a[i] > (i16::MAX as i32) {
+                    i16::MAX
+                } else if a[i]
< (i16::MIN as i32) { + i16::MIN + } else { + a[i] as i16 + } + } else { + if b[i - 4] > (i16::MAX as i32) { + i16::MAX + } else if b[i - 4] < (i16::MIN as i32) { + i16::MIN + } else { + b[i - 4] as i16 + } + } + }) + } + + pub fn packuswb(a: i16x8, b: i16x8) -> u8x16 { + u8x16::from_fn(|i| { + if i < 8 { + if a[i] > (u8::MAX as i16) { + u8::MAX + } else if a[i] < (u8::MIN as i16) { + u8::MIN + } else { + a[i] as u8 + } + } else { + if b[i - 8] > (u8::MAX as i16) { + u8::MAX + } else if b[i - 8] < (u8::MIN as i16) { + u8::MIN + } else { + b[i - 8] as u8 + } + } + }) + } +} + +use c_extern::*; + +/// Adds packed 8-bit integers in `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi8) + +pub fn _mm_add_epi8(a: __m128i, b: __m128i) -> __m128i { + simd_add(BitVec::to_i8x16(a), BitVec::to_i8x16(b)).into() +} + +/// Adds packed 16-bit integers in `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi16) + +pub fn _mm_add_epi16(a: __m128i, b: __m128i) -> __m128i { + BitVec::from_i16x8(simd_add(BitVec::to_i16x8(a), BitVec::to_i16x8(b))) +} + +/// Adds packed 32-bit integers in `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi32) + +pub fn _mm_add_epi32(a: __m128i, b: __m128i) -> __m128i { + simd_add(BitVec::to_i32x4(a), BitVec::to_i32x4(b)).into() +} + +/// Adds packed 64-bit integers in `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi64) + +pub fn _mm_add_epi64(a: __m128i, b: __m128i) -> __m128i { + simd_add(BitVec::to_i64x2(a), BitVec::to_i64x2(b)).into() +} + +/// Adds packed 8-bit integers in `a` and `b` using saturation. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epi8) + +pub fn _mm_adds_epi8(a: __m128i, b: __m128i) -> __m128i { + simd_saturating_add(BitVec::to_i8x16(a), BitVec::to_i8x16(b)).into() +} + +/// Adds packed 16-bit integers in `a` and `b` using saturation. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epi16) + +pub fn _mm_adds_epi16(a: __m128i, b: __m128i) -> __m128i { + simd_saturating_add(BitVec::to_i16x8(a), BitVec::to_i16x8(b)).into() +} + +/// Adds packed unsigned 8-bit integers in `a` and `b` using saturation. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epu8) + +pub fn _mm_adds_epu8(a: __m128i, b: __m128i) -> __m128i { + simd_saturating_add(BitVec::to_u8x16(a), BitVec::to_u8x16(b)).into() +} + +/// Adds packed unsigned 16-bit integers in `a` and `b` using saturation. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epu16) + +pub fn _mm_adds_epu16(a: __m128i, b: __m128i) -> __m128i { + simd_saturating_add(BitVec::to_u16x8(a), BitVec::to_u16x8(b)).into() +} + +/// Averages packed unsigned 8-bit integers in `a` and `b`. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_epu8) + +pub fn _mm_avg_epu8(a: __m128i, b: __m128i) -> __m128i { + let a = simd_cast::<16, _, u16>(BitVec::to_u8x16(a)); + let b = simd_cast::<16, _, u16>(BitVec::to_u8x16(b)); + let r = simd_shr(simd_add(simd_add(a, b), u16x16::splat(1)), u16x16::splat(1)); + simd_cast::<16, _, u8>(r).into() +} + +/// Averages packed unsigned 16-bit integers in `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_epu16) + +pub fn _mm_avg_epu16(a: __m128i, b: __m128i) -> __m128i { + let a = simd_cast::<8, _, u32>(BitVec::to_u16x8(a)); + let b = simd_cast::<8, _, u32>(BitVec::to_u16x8(b)); + let r = simd_shr(simd_add(simd_add(a, b), u32x8::splat(1)), u32x8::splat(1)); + simd_cast::<8, _, u16>(r).into() +} + +/// Multiplies and then horizontally add signed 16 bit integers in `a` and `b`. +/// +/// Multiplies packed signed 16-bit integers in `a` and `b`, producing +/// intermediate signed 32-bit integers. Horizontally add adjacent pairs of +/// intermediate 32-bit integers. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_madd_epi16) + +pub fn _mm_madd_epi16(a: __m128i, b: __m128i) -> __m128i { + pmaddwd(BitVec::to_i16x8(a), BitVec::to_i16x8(b)).into() +} + +/// Compares packed 16-bit integers in `a` and `b`, and returns the packed +/// maximum values. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi16) + +pub fn _mm_max_epi16(a: __m128i, b: __m128i) -> __m128i { + let a = BitVec::to_i16x8(a); + let b = BitVec::to_i16x8(b); + simd_select(simd_gt(a, b), a, b).into() +} + +/// Compares packed unsigned 8-bit integers in `a` and `b`, and returns the +/// packed maximum values. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu8) + +pub fn _mm_max_epu8(a: __m128i, b: __m128i) -> __m128i { + let a = BitVec::to_u8x16(a); + let b = BitVec::to_u8x16(b); + simd_select(simd_gt(a, b), a, b).into() +} + +/// Compares packed 16-bit integers in `a` and `b`, and returns the packed +/// minimum values. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi16) + +pub fn _mm_min_epi16(a: __m128i, b: __m128i) -> __m128i { + let a = BitVec::to_i16x8(a); + let b = BitVec::to_i16x8(b); + simd_select(simd_lt(a, b), a, b).into() +} + +/// Compares packed unsigned 8-bit integers in `a` and `b`, and returns the +/// packed minimum values. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu8) + +pub fn _mm_min_epu8(a: __m128i, b: __m128i) -> __m128i { + let a = BitVec::to_u8x16(a); + let b = BitVec::to_u8x16(b); + simd_select(simd_lt(a, b), a, b).into() +} + +/// Multiplies the packed 16-bit integers in `a` and `b`. +/// +/// The multiplication produces intermediate 32-bit integers, and returns the +/// high 16 bits of the intermediate integers. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_epi16) + +pub fn _mm_mulhi_epi16(a: __m128i, b: __m128i) -> __m128i { + let a = simd_cast::<8, i16, i32>(BitVec::to_i16x8(a)); + let b = simd_cast::<8, i16, i32>(BitVec::to_i16x8(b)); + let r = simd_shr(simd_mul(a, b), i32x8::splat(16)); + BitVec::from_i16x8(simd_cast::<8, i32, i16>(r)) +} + +/// Multiplies the packed unsigned 16-bit integers in `a` and `b`. +/// +/// The multiplication produces intermediate 32-bit integers, and returns the +/// high 16 bits of the intermediate integers. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_epu16) + +pub fn _mm_mulhi_epu16(a: __m128i, b: __m128i) -> __m128i { + let a = simd_cast::<8, _, u32>(BitVec::to_u16x8(a)); + let b = simd_cast::<8, _, u32>(BitVec::to_u16x8(b)); + let r = simd_shr(simd_mul(a, b), u32x8::splat(16)); + simd_cast::<8, u32, u16>(r).into() +} + +/// Multiplies the packed 16-bit integers in `a` and `b`. +/// +/// The multiplication produces intermediate 32-bit integers, and returns the +/// low 16 bits of the intermediate integers. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mullo_epi16) + +pub fn _mm_mullo_epi16(a: __m128i, b: __m128i) -> __m128i { + BitVec::from_i16x8(simd_mul(BitVec::to_i16x8(a), BitVec::to_i16x8(b))) +} + +/// Multiplies the low unsigned 32-bit integers from each packed 64-bit element +/// in `a` and `b`. +/// +/// Returns the unsigned 64-bit results. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_epu32) + +pub fn _mm_mul_epu32(a: __m128i, b: __m128i) -> __m128i { + let a = BitVec::to_u64x2(a); + let b = BitVec::to_u64x2(b); + let mask = u64x2::splat(u32::MAX.into()); + simd_mul(simd_and(a, mask), simd_and(b, mask)).into() +} + +/// Sum the absolute differences of packed unsigned 8-bit integers. +/// +/// Computes the absolute differences of packed unsigned 8-bit integers in `a` +/// and `b`, then horizontally sum each consecutive 8 differences to produce +/// two unsigned 16-bit integers, and pack these unsigned 16-bit integers in +/// the low 16 bits of 64-bit elements returned. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sad_epu8) + +pub fn _mm_sad_epu8(a: __m128i, b: __m128i) -> __m128i { + psadbw(BitVec::to_u8x16(a), BitVec::to_u8x16(b)).into() +} + +/// Subtracts packed 8-bit integers in `b` from packed 8-bit integers in `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi8) + +pub fn _mm_sub_epi8(a: __m128i, b: __m128i) -> __m128i { + BitVec::from_i8x16(simd_sub(BitVec::to_i8x16(a), BitVec::to_i8x16(b))) +} + +/// Subtracts packed 16-bit integers in `b` from packed 16-bit integers in `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi16) + +pub fn _mm_sub_epi16(a: __m128i, b: __m128i) -> __m128i { + BitVec::from_i16x8(simd_sub(BitVec::to_i16x8(a), BitVec::to_i16x8(b))) +} + +/// Subtract packed 32-bit integers in `b` from packed 32-bit integers in `a`. 
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi32)
+
+pub fn _mm_sub_epi32(a: __m128i, b: __m128i) -> __m128i {
+    simd_sub(BitVec::to_i32x4(a), BitVec::to_i32x4(b)).into()
+}
+
+/// Subtract packed 64-bit integers in `b` from packed 64-bit integers in `a`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi64)
+
+pub fn _mm_sub_epi64(a: __m128i, b: __m128i) -> __m128i {
+    simd_sub(BitVec::to_i64x2(a), BitVec::to_i64x2(b)).into()
+}
+
+/// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a`
+/// using saturation.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epi8)
+
+pub fn _mm_subs_epi8(a: __m128i, b: __m128i) -> __m128i {
+    simd_saturating_sub(BitVec::to_i8x16(a), BitVec::to_i8x16(b)).into()
+}
+
+/// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a`
+/// using saturation.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epi16)
+
+pub fn _mm_subs_epi16(a: __m128i, b: __m128i) -> __m128i {
+    simd_saturating_sub(BitVec::to_i16x8(a), BitVec::to_i16x8(b)).into()
+}
+
+/// Subtract packed unsigned 8-bit integers in `b` from packed unsigned 8-bit
+/// integers in `a` using saturation.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epu8)
+
+pub fn _mm_subs_epu8(a: __m128i, b: __m128i) -> __m128i {
+    simd_saturating_sub(BitVec::to_u8x16(a), BitVec::to_u8x16(b)).into()
+}
+
+/// Subtract packed unsigned 16-bit integers in `b` from packed unsigned 16-bit
+/// integers in `a` using saturation.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epu16)
+
+pub fn _mm_subs_epu16(a: __m128i, b: __m128i) -> __m128i {
+    simd_saturating_sub(BitVec::to_u16x8(a), BitVec::to_u16x8(b)).into()
+}
+
+/// Shifts `a` left by `IMM8` bytes while shifting in zeros.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_si128)
+
+pub fn _mm_slli_si128<const IMM8: i32>(a: __m128i) -> __m128i {
+    // static_assert_uimm_bits!(IMM8, 8);
+    _mm_slli_si128_impl::<IMM8>(a)
+}
+
+/// Implementation detail: converts the immediate argument of the
+/// `_mm_slli_si128` intrinsic into a compile-time constant.
+
+fn _mm_slli_si128_impl<const IMM8: i32>(a: __m128i) -> __m128i {
+    const fn mask(shift: i32, i: u32) -> u64 {
+        let shift = shift as u32 & 0xff;
+        if shift > 15 {
+            i as u64
+        } else {
+            (16 - shift + i) as u64
+        }
+    }
+    (simd_shuffle(
+        i8x16::from_fn(|_| 0),
+        BitVec::to_i8x16(a),
+        [
+            mask(IMM8, 0),
+            mask(IMM8, 1),
+            mask(IMM8, 2),
+            mask(IMM8, 3),
+            mask(IMM8, 4),
+            mask(IMM8, 5),
+            mask(IMM8, 6),
+            mask(IMM8, 7),
+            mask(IMM8, 8),
+            mask(IMM8, 9),
+            mask(IMM8, 10),
+            mask(IMM8, 11),
+            mask(IMM8, 12),
+            mask(IMM8, 13),
+            mask(IMM8, 14),
+            mask(IMM8, 15),
+        ],
+    ))
+    .into()
+}
+
+/// Shifts `a` left by `IMM8` bytes while shifting in zeros.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_bslli_si128)
+
+pub fn _mm_bslli_si128<const IMM8: i32>(a: __m128i) -> __m128i {
+    // static_assert_uimm_bits!(IMM8, 8);
+    _mm_slli_si128_impl::<IMM8>(a)
+}
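+
+// Illustrative sketch (hypothetical usage): the shuffle in
+// `_mm_slli_si128_impl` reads indices 0..=15 from the zero vector and
+// 16..=31 from `a`, so shifted-in bytes come out as zero.
+//
+//   let a: __m128i = i8x16::from_fn(|i| i as i8).into();
+//   let r = _mm_slli_si128::<4>(a);
+//   // bytes: [0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]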
+
+/// Shifts `a` right by `IMM8` bytes while shifting in zeros.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_bsrli_si128)
+
+pub fn _mm_bsrli_si128<const IMM8: i32>(a: __m128i) -> __m128i {
+    // static_assert_uimm_bits!(IMM8, 8);
+    _mm_srli_si128_impl::<IMM8>(a)
+}
+
+/// Shifts packed 16-bit integers in `a` left by `IMM8` while shifting in zeros.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi16)
+
+pub fn _mm_slli_epi16<const IMM8: i32>(a: __m128i) -> __m128i {
+    // static_assert_uimm_bits!(IMM8, 8);
+
+    if IMM8 >= 16 {
+        _mm_setzero_si128()
+    } else {
+        simd_shl(BitVec::to_u16x8(a), u16x8::splat(IMM8 as u16)).into()
+    }
+}
+
+/// Shifts packed 16-bit integers in `a` left by `count` while shifting in
+/// zeros.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi16)
+
+pub fn _mm_sll_epi16(a: __m128i, count: __m128i) -> __m128i {
+    psllw(BitVec::to_i16x8(a), BitVec::to_i16x8(count)).into()
+}
+
+/// Shifts packed 32-bit integers in `a` left by `IMM8` while shifting in zeros.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi32)
+
+pub fn _mm_slli_epi32<const IMM8: i32>(a: __m128i) -> __m128i {
+    // static_assert_uimm_bits!(IMM8, 8);
+
+    if IMM8 >= 32 {
+        _mm_setzero_si128()
+    } else {
+        simd_shl(BitVec::to_u32x4(a), u32x4::splat(IMM8 as u32)).into()
+    }
+}
+
+/// Shifts packed 32-bit integers in `a` left by `count` while shifting in
+/// zeros.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi32)
+
+pub fn _mm_sll_epi32(a: __m128i, count: __m128i) -> __m128i {
+    pslld(BitVec::to_i32x4(a), BitVec::to_i32x4(count)).into()
+}
+
+/// Shifts packed 64-bit integers in `a` left by `IMM8` while shifting in zeros.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi64)
+
+pub fn _mm_slli_epi64<const IMM8: i32>(a: __m128i) -> __m128i {
+    // static_assert_uimm_bits!(IMM8, 8);
+
+    if IMM8 >= 64 {
+        _mm_setzero_si128()
+    } else {
+        simd_shl(BitVec::to_u64x2(a), u64x2::splat(IMM8 as u64)).into()
+    }
+}
+
+/// Shifts packed 64-bit integers in `a` left by `count` while shifting in
+/// zeros.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi64)
+
+pub fn _mm_sll_epi64(a: __m128i, count: __m128i) -> __m128i {
+    psllq(BitVec::to_i64x2(a), BitVec::to_i64x2(count)).into()
+}
+
+/// Shifts packed 16-bit integers in `a` right by `IMM8` while shifting in sign
+/// bits.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srai_epi16)
+
+pub fn _mm_srai_epi16<const IMM8: i32>(a: __m128i) -> __m128i {
+    // static_assert_uimm_bits!(IMM8, 8);
+    simd_shr(BitVec::to_i16x8(a), i16x8::splat(IMM8.min(15) as i16)).into()
+}
+
+/// Shifts packed 16-bit integers in `a` right by `count` while shifting in sign
+/// bits.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sra_epi16)
+
+pub fn _mm_sra_epi16(a: __m128i, count: __m128i) -> __m128i {
+    psraw(BitVec::to_i16x8(a), BitVec::to_i16x8(count)).into()
+}
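+
+// Illustrative sketch (hypothetical usage): the non-immediate shifts take
+// their amount from the low 64 bits of `count`, and any amount above the lane
+// width drains each lane to its sign bit.
+//
+//   let a: __m128i = i16x8::splat(-1).into();
+//   let r = _mm_sra_epi16(a, _mm_set_epi64x(0, 99)); // 99 > 15
+//   // every lane stays -1: the result is filled with sign bits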
+
+/// Shifts packed 32-bit integers in `a` right by `IMM8` while shifting in sign
+/// bits.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srai_epi32)
+
+pub fn _mm_srai_epi32<const IMM8: i32>(a: __m128i) -> __m128i {
+    // static_assert_uimm_bits!(IMM8, 8);
+    simd_shr(BitVec::to_i32x4(a), i32x4::splat(IMM8.min(31))).into()
+}
+
+/// Shifts packed 32-bit integers in `a` right by `count` while shifting in sign
+/// bits.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sra_epi32)
+
+pub fn _mm_sra_epi32(a: __m128i, count: __m128i) -> __m128i {
+    psrad(BitVec::to_i32x4(a), BitVec::to_i32x4(count)).into()
+}
+
+/// Shifts `a` right by `IMM8` bytes while shifting in zeros.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_si128)
+
+pub fn _mm_srli_si128<const IMM8: i32>(a: __m128i) -> __m128i {
+    // static_assert_uimm_bits!(IMM8, 8);
+    _mm_srli_si128_impl::<IMM8>(a)
+}
+
+/// Implementation detail: converts the immediate argument of the
+/// `_mm_srli_si128` intrinsic into a compile-time constant.
+
+fn _mm_srli_si128_impl<const IMM8: i32>(a: __m128i) -> __m128i {
+    const fn mask(shift: i32, i: u32) -> u64 {
+        if (shift as u32) > 15 {
+            (i + 16) as u64
+        } else {
+            (i + (shift as u32)) as u64
+        }
+    }
+    let x: i8x16 = simd_shuffle(
+        BitVec::to_i8x16(a),
+        i8x16::from_fn(|_| 0),
+        [
+            mask(IMM8, 0),
+            mask(IMM8, 1),
+            mask(IMM8, 2),
+            mask(IMM8, 3),
+            mask(IMM8, 4),
+            mask(IMM8, 5),
+            mask(IMM8, 6),
+            mask(IMM8, 7),
+            mask(IMM8, 8),
+            mask(IMM8, 9),
+            mask(IMM8, 10),
+            mask(IMM8, 11),
+            mask(IMM8, 12),
+            mask(IMM8, 13),
+            mask(IMM8, 14),
+            mask(IMM8, 15),
+        ],
+    );
+    x.into()
+}
+
+/// Shifts packed 16-bit integers in `a` right by `IMM8` while shifting in
+/// zeros.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi16)
+
+pub fn _mm_srli_epi16<const IMM8: i32>(a: __m128i) -> __m128i {
+    // static_assert_uimm_bits!(IMM8, 8);
+
+    if IMM8 >= 16 {
+        _mm_setzero_si128()
+    } else {
+        simd_shr(BitVec::to_u16x8(a), u16x8::splat(IMM8 as u16)).into()
+    }
+}
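+
+// Illustrative sketch (hypothetical usage): immediate right shifts saturate
+// to zero once `IMM8` reaches the lane width, matching the `IMM8 >= 16` guard
+// above.
+//
+//   let a: __m128i = u16x8::splat(0xFFFF).into();
+//   let r = _mm_srli_epi16::<16>(a); // IMM8 >= 16, so every lane is zero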
+
+/// Shifts packed 16-bit integers in `a` right by `count` while shifting in
+/// zeros.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi16)
+
+pub fn _mm_srl_epi16(a: __m128i, count: __m128i) -> __m128i {
+    psrlw(BitVec::to_i16x8(a), BitVec::to_i16x8(count)).into()
+}
+
+/// Shifts packed 32-bit integers in `a` right by `IMM8` while shifting in
+/// zeros.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi32)
+
+pub fn _mm_srli_epi32<const IMM8: i32>(a: __m128i) -> __m128i {
+    // static_assert_uimm_bits!(IMM8, 8);
+
+    if IMM8 >= 32 {
+        _mm_setzero_si128()
+    } else {
+        simd_shr(BitVec::to_u32x4(a), u32x4::splat(IMM8 as u32)).into()
+    }
+}
+
+/// Shifts packed 32-bit integers in `a` right by `count` while shifting in
+/// zeros.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi32)
+
+pub fn _mm_srl_epi32(a: __m128i, count: __m128i) -> __m128i {
+    psrld(BitVec::to_i32x4(a), BitVec::to_i32x4(count)).into()
+}
+
+/// Shifts packed 64-bit integers in `a` right by `IMM8` while shifting in
+/// zeros.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi64)
+
+pub fn _mm_srli_epi64<const IMM8: i32>(a: __m128i) -> __m128i {
+    // TODO // static_assert_uimm_bits!(IMM8, 8);
+
+    if IMM8 >= 64 {
+        BitVec::from_fn(|_| Bit::Zero)
+    } else {
+        BitVec::from_u64x2(simd_shr(BitVec::to_u64x2(a), u64x2::splat(IMM8 as u64)))
+    }
+}
+
+/// Shifts packed 64-bit integers in `a` right by `count` while shifting in
+/// zeros.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi64)
+
+pub fn _mm_srl_epi64(a: __m128i, count: __m128i) -> __m128i {
+    psrlq(BitVec::to_i64x2(a), BitVec::to_i64x2(count)).into()
+}
+
+/// Computes the bitwise AND of 128 bits (representing integer data) in `a` and
+/// `b`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_si128)
+
+pub fn _mm_and_si128(a: __m128i, b: __m128i) -> __m128i {
+    BitVec::from_fn(|i| a[i] & b[i])
+}
+
+/// Computes the bitwise NOT of 128 bits (representing integer data) in `a` and
+/// then AND with `b`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_si128)
+
+pub fn _mm_andnot_si128(a: __m128i, b: __m128i) -> __m128i {
+    // The inner bit-vector is all-ones XOR `a`, i.e. NOT `a`; each of its bits
+    // is then ANDed with the corresponding bit of `b`.
+    BitVec::from_fn(|i| BitVec::<128>::from_fn(|i| _mm_set1_epi8(-1)[i] ^ a[i])[i] & b[i])
+}
+
+/// Computes the bitwise OR of 128 bits (representing integer data) in `a` and
+/// `b`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_or_si128)
+
+pub fn _mm_or_si128(a: __m128i, b: __m128i) -> __m128i {
+    BitVec::from_fn(|i| a[i] | b[i])
+}
+
+/// Computes the bitwise XOR of 128 bits (representing integer data) in `a` and
+/// `b`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_si128)
+
+pub fn _mm_xor_si128(a: __m128i, b: __m128i) -> __m128i {
+    BitVec::from_fn(|i| a[i] ^ b[i])
+}
+
+/// Compares packed 8-bit integers in `a` and `b` for equality.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi8)
+
+pub fn _mm_cmpeq_epi8(a: __m128i, b: __m128i) -> __m128i {
+    (simd_eq(BitVec::to_i8x16(a), BitVec::to_i8x16(b))).into()
+}
+
+/// Compares packed 16-bit integers in `a` and `b` for equality.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi16)
+
+pub fn _mm_cmpeq_epi16(a: __m128i, b: __m128i) -> __m128i {
+    (simd_eq(BitVec::to_i16x8(a), BitVec::to_i16x8(b))).into()
+}
+
+/// Compares packed 32-bit integers in `a` and `b` for equality.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi32)
+
+pub fn _mm_cmpeq_epi32(a: __m128i, b: __m128i) -> __m128i {
+    (simd_eq(BitVec::to_i32x4(a), BitVec::to_i32x4(b))).into()
+}
+
+/// Compares packed 8-bit integers in `a` and `b` for greater-than.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi8)
+
+pub fn _mm_cmpgt_epi8(a: __m128i, b: __m128i) -> __m128i {
+    (simd_gt(BitVec::to_i8x16(a), BitVec::to_i8x16(b))).into()
+}
+
+/// Compares packed 16-bit integers in `a` and `b` for greater-than.
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi16) + +pub fn _mm_cmpgt_epi16(a: __m128i, b: __m128i) -> __m128i { + (simd_gt(BitVec::to_i16x8(a), BitVec::to_i16x8(b))).into() +} + +/// Compares packed 32-bit integers in `a` and `b` for greater-than. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi32) + +pub fn _mm_cmpgt_epi32(a: __m128i, b: __m128i) -> __m128i { + (simd_gt(BitVec::to_i32x4(a), BitVec::to_i32x4(b))).into() +} + +/// Compares packed 8-bit integers in `a` and `b` for less-than. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi8) + +pub fn _mm_cmplt_epi8(a: __m128i, b: __m128i) -> __m128i { + (simd_lt(BitVec::to_i8x16(a), BitVec::to_i8x16(b))).into() +} + +/// Compares packed 16-bit integers in `a` and `b` for less-than. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi16) + +pub fn _mm_cmplt_epi16(a: __m128i, b: __m128i) -> __m128i { + (simd_lt(BitVec::to_i16x8(a), BitVec::to_i16x8(b))).into() +} + +/// Compares packed 32-bit integers in `a` and `b` for less-than. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi32) + +pub fn _mm_cmplt_epi32(a: __m128i, b: __m128i) -> __m128i { + (simd_lt(BitVec::to_i32x4(a), BitVec::to_i32x4(b))).into() +} + +pub fn _mm_cvtsi32_si128(a: i32) -> __m128i { + i32x4::from_fn(|i| if i == 0 { a } else { 0 }).into() +} + +/// Returns the lowest element of `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si32) + +pub fn _mm_cvtsi128_si32(a: __m128i) -> i32 { + simd_extract(BitVec::to_i32x4(a), 0) +} + +/// Sets packed 64-bit integers with the supplied values, from highest to +/// lowest. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi64x) + +// no particular instruction to test + +pub fn _mm_set_epi64x(e1: i64, e0: i64) -> __m128i { + i64x2::from_fn(|i| if i == 0 { e0 } else { e1 }).into() +} + +/// Sets packed 32-bit integers with the supplied values. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi32) +// no particular instruction to test +pub fn _mm_set_epi32(e3: i32, e2: i32, e1: i32, e0: i32) -> __m128i { + let vec = [e0, e1, e2, e3]; + BitVec::from_i32x4(i32x4::from_fn(|i| vec[i as usize])) +} + +/// Sets packed 16-bit integers with the supplied values. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi16) + +// no particular instruction to test + +pub fn _mm_set_epi16( + e7: i16, + e6: i16, + e5: i16, + e4: i16, + e3: i16, + e2: i16, + e1: i16, + e0: i16, +) -> __m128i { + let vec = [e0, e1, e2, e3, e4, e5, e6, e7]; + BitVec::from_i16x8(i16x8::from_fn(|i| vec[i as usize])) +} + +/// Sets packed 8-bit integers with the supplied values. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi8) +// no particular instruction to test +pub fn _mm_set_epi8( + e15: i8, + e14: i8, + e13: i8, + e12: i8, + e11: i8, + e10: i8, + e9: i8, + e8: i8, + e7: i8, + e6: i8, + e5: i8, + e4: i8, + e3: i8, + e2: i8, + e1: i8, + e0: i8, +) -> __m128i { + let vec = [ + e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15, + ]; + BitVec::from_i8x16(i8x16::from_fn(|i| vec[i as usize])) +} + +/// Broadcasts 64-bit integer `a` to all elements. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi64x) + +// no particular instruction to test + +pub fn _mm_set1_epi64x(a: i64) -> __m128i { + _mm_set_epi64x(a, a) +} + +/// Broadcasts 32-bit integer `a` to all elements. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi32) + +// no particular instruction to test + +pub fn _mm_set1_epi32(a: i32) -> __m128i { + _mm_set_epi32(a, a, a, a) +} + +/// Broadcasts 16-bit integer `a` to all elements. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi16) + +// no particular instruction to test + +pub fn _mm_set1_epi16(a: i16) -> __m128i { + BitVec::from_i16x8(i16x8::from_fn(|_| a)) +} + +/// Broadcasts 8-bit integer `a` to all elements. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi8) + +// no particular instruction to test + +pub fn _mm_set1_epi8(a: i8) -> __m128i { + _mm_set_epi8(a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a) +} + +/// Sets packed 32-bit integers with the supplied values in reverse order. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi32) + +// no particular instruction to test + +pub fn _mm_setr_epi32(e3: i32, e2: i32, e1: i32, e0: i32) -> __m128i { + _mm_set_epi32(e0, e1, e2, e3) +} + +/// Sets packed 16-bit integers with the supplied values in reverse order. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi16) + +// no particular instruction to test + +pub fn _mm_setr_epi16( + e7: i16, + e6: i16, + e5: i16, + e4: i16, + e3: i16, + e2: i16, + e1: i16, + e0: i16, +) -> __m128i { + _mm_set_epi16(e0, e1, e2, e3, e4, e5, e6, e7) +} + +/// Sets packed 8-bit integers with the supplied values in reverse order. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi8) + +// no particular instruction to test + +pub fn _mm_setr_epi8( + e15: i8, + e14: i8, + e13: i8, + e12: i8, + e11: i8, + e10: i8, + e9: i8, + e8: i8, + e7: i8, + e6: i8, + e5: i8, + e4: i8, + e3: i8, + e2: i8, + e1: i8, + e0: i8, +) -> __m128i { + _mm_set_epi8( + e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15, + ) +} + +/// Returns a vector with all elements set to zero. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_si128) + +pub fn _mm_setzero_si128() -> __m128i { + BitVec::from_fn(|_| Bit::Zero) +} + +/// Returns a vector where the low element is extracted from `a` and its upper +/// element is zero. 
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_epi64)
+
+// FIXME movd on msvc, movd on i686
+
+pub fn _mm_move_epi64(a: __m128i) -> __m128i {
+    let r: i64x2 = simd_shuffle(BitVec::to_i64x2(a), i64x2::from_fn(|_| 0), [0, 2]);
+    r.into()
+}
+
+/// Converts packed 16-bit integers from `a` and `b` to packed 8-bit integers
+/// using signed saturation.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi16)
+
+pub fn _mm_packs_epi16(a: __m128i, b: __m128i) -> __m128i {
+    packsswb(BitVec::to_i16x8(a), BitVec::to_i16x8(b)).into()
+}
+
+/// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers
+/// using signed saturation.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi32)
+
+pub fn _mm_packs_epi32(a: __m128i, b: __m128i) -> __m128i {
+    packssdw(BitVec::to_i32x4(a), BitVec::to_i32x4(b)).into()
+}
+
+/// Converts packed 16-bit integers from `a` and `b` to packed 8-bit integers
+/// using unsigned saturation.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi16)
+
+pub fn _mm_packus_epi16(a: __m128i, b: __m128i) -> __m128i {
+    packuswb(BitVec::to_i16x8(a), BitVec::to_i16x8(b)).into()
+}
+
+/// Returns the `IMM8` element of `a`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi16)
+
+pub fn _mm_extract_epi16<const IMM8: i32>(a: __m128i) -> i32 {
+    // static_assert_uimm_bits!(IMM8, 3);
+    simd_extract(BitVec::to_u16x8(a), IMM8 as u64) as i32
+}
+
+/// Returns a new vector where the `IMM8` element of `a` is replaced with `i`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi16)
+
+pub fn _mm_insert_epi16<const IMM8: i32>(a: __m128i, i: i32) -> __m128i {
+    // static_assert_uimm_bits!(IMM8, 3);
+    simd_insert(BitVec::to_i16x8(a), IMM8 as u64, i as i16).into()
+}
+
+/// Returns a mask of the most significant bit of each element in `a`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_epi8)
+
+pub fn _mm_movemask_epi8(a: __m128i) -> i32 {
+    let z = i8x16::from_fn(|_| 0);
+    let m: i8x16 = simd_lt(BitVec::to_i8x16(a), z);
+    let r = simd_bitmask_little!(15, m, u16);
+    r as u32 as i32
+}
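+
+// Illustrative sketch (hypothetical usage): each bit of the result is the
+// sign bit of the corresponding byte of `a`.
+//
+//   let m = _mm_movemask_epi8(_mm_set1_epi8(-1));
+//   // every byte has its sign bit set, so m == 0xFFFF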
+
+/// Shuffles 32-bit integers in `a` using the control in `IMM8`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_epi32)
+
+pub fn _mm_shuffle_epi32<const IMM8: i32>(a: __m128i) -> __m128i {
+    // static_assert_uimm_bits!(IMM8, 8);
+
+    let a = BitVec::to_i32x4(a);
+    let x: i32x4 = simd_shuffle(
+        a,
+        a,
+        [
+            IMM8 as u64 & 0b11,
+            (IMM8 as u64 >> 2) & 0b11,
+            (IMM8 as u64 >> 4) & 0b11,
+            (IMM8 as u64 >> 6) & 0b11,
+        ],
+    );
+    x.into()
+}
+
+/// Shuffles 16-bit integers in the high 64 bits of `a` using the control in
+/// `IMM8`.
+///
+/// Put the results in the high 64 bits of the returned vector, with the low 64
+/// bits being copied from `a`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shufflehi_epi16)
+
+pub fn _mm_shufflehi_epi16<const IMM8: i32>(a: __m128i) -> __m128i {
+    // static_assert_uimm_bits!(IMM8, 8);
+
+    let a = BitVec::to_i16x8(a);
+    let x: i16x8 = simd_shuffle(
+        a,
+        a,
+        [
+            0,
+            1,
+            2,
+            3,
+            (IMM8 as u64 & 0b11) + 4,
+            ((IMM8 as u64 >> 2) & 0b11) + 4,
+            ((IMM8 as u64 >> 4) & 0b11) + 4,
+            ((IMM8 as u64 >> 6) & 0b11) + 4,
+        ],
+    );
+    x.into()
+}
+
+/// Shuffles 16-bit integers in the low 64 bits of `a` using the control in
+/// `IMM8`.
+///
+/// Put the results in the low 64 bits of the returned vector, with the high 64
+/// bits being copied from `a`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shufflelo_epi16)
+
+pub fn _mm_shufflelo_epi16<const IMM8: i32>(a: __m128i) -> __m128i {
+    // static_assert_uimm_bits!(IMM8, 8);
+
+    let a = BitVec::to_i16x8(a);
+    let x: i16x8 = simd_shuffle(
+        a,
+        a,
+        [
+            IMM8 as u64 & 0b11,
+            (IMM8 as u64 >> 2) & 0b11,
+            (IMM8 as u64 >> 4) & 0b11,
+            (IMM8 as u64 >> 6) & 0b11,
+            4,
+            5,
+            6,
+            7,
+        ],
+    );
+    x.into()
+}
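+
+// Illustrative sketch (hypothetical usage): `IMM8` packs four 2-bit lane
+// selectors, lowest bits first. Given some `a: __m128i` with low lanes
+// [a0, a1, a2, a3]:
+//
+//   let r = _mm_shufflelo_epi16::<0b00_01_10_11>(a);
+//   // low lanes become [a3, a2, a1, a0]; the high lanes copy through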
+
+/// Unpacks and interleave 8-bit integers from the high half of `a` and `b`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi8)
+
+pub fn _mm_unpackhi_epi8(a: __m128i, b: __m128i) -> __m128i {
+    (simd_shuffle(
+        BitVec::to_i8x16(a),
+        BitVec::to_i8x16(b),
+        [8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31],
+    ))
+    .into()
+}
+
+/// Unpacks and interleave 16-bit integers from the high half of `a` and `b`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi16)
+
+pub fn _mm_unpackhi_epi16(a: __m128i, b: __m128i) -> __m128i {
+    let x = simd_shuffle(
+        BitVec::to_i16x8(a),
+        BitVec::to_i16x8(b),
+        [4, 12, 5, 13, 6, 14, 7, 15],
+    );
+    (x).into()
+}
+
+/// Unpacks and interleave 32-bit integers from the high half of `a` and `b`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi32)
+
+pub fn _mm_unpackhi_epi32(a: __m128i, b: __m128i) -> __m128i {
+    (simd_shuffle(BitVec::to_i32x4(a), BitVec::to_i32x4(b), [2, 6, 3, 7])).into()
+}
+
+/// Unpacks and interleave 64-bit integers from the high half of `a` and `b`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi64)
+
+pub fn _mm_unpackhi_epi64(a: __m128i, b: __m128i) -> __m128i {
+    (simd_shuffle(BitVec::to_i64x2(a), BitVec::to_i64x2(b), [1, 3])).into()
+}
+
+/// Unpacks and interleave 8-bit integers from the low half of `a` and `b`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi8)
+
+pub fn _mm_unpacklo_epi8(a: __m128i, b: __m128i) -> __m128i {
+    (simd_shuffle(
+        BitVec::to_i8x16(a),
+        BitVec::to_i8x16(b),
+        [0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23],
+    ))
+    .into()
+}
+
+/// Unpacks and interleave 16-bit integers from the low half of `a` and `b`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi16)
+
+pub fn _mm_unpacklo_epi16(a: __m128i, b: __m128i) -> __m128i {
+    let x = simd_shuffle(
+        BitVec::to_i16x8(a),
+        BitVec::to_i16x8(b),
+        [0, 8, 1, 9, 2, 10, 3, 11],
+    );
+    x.into()
+}
+
+/// Unpacks and interleave 32-bit integers from the low half of `a` and `b`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi32)
+
+pub fn _mm_unpacklo_epi32(a: __m128i, b: __m128i) -> __m128i {
+    simd_shuffle(BitVec::to_i32x4(a), BitVec::to_i32x4(b), [0, 4, 1, 5]).into()
+}
+
+/// Unpacks and interleave 64-bit integers from the low half of `a` and `b`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi64)
+
+pub fn _mm_unpacklo_epi64(a: __m128i, b: __m128i) -> __m128i {
+    simd_shuffle(BitVec::to_i64x2(a), BitVec::to_i64x2(b), [0, 2]).into()
+}
+
+/// Returns a vector of type `__m128i` with indeterminate elements.
+/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically
+/// picks some valid value and is not equivalent to [`core::mem::MaybeUninit`].
+/// In practice, this is typically equivalent to [`core::mem::zeroed`].
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_si128)
+
+pub fn _mm_undefined_si128() -> __m128i {
+    BitVec::from_fn(|_| Bit::Zero)
+}
diff --git a/testable-simd-models/src/core_arch/x86/models/ssse3.rs b/testable-simd-models/src/core_arch/x86/models/ssse3.rs
new file mode 100644
index 0000000000000..8d0488430756c
--- /dev/null
+++ b/testable-simd-models/src/core_arch/x86/models/ssse3.rs
@@ -0,0 +1,369 @@
+//!
Supplemental Streaming SIMD Extensions 3 (SSSE3) + +use crate::abstractions::{bitvec::BitVec, simd::*}; + +use super::types::*; + +mod c_extern { + use crate::abstractions::simd::*; + pub fn pshufb128(a: u8x16, b: u8x16) -> u8x16 { + u8x16::from_fn(|i| if b[i] > 127 { 0 } else { a[(b[i] % 16) as u64] }) + } + + pub fn phaddw128(a: i16x8, b: i16x8) -> i16x8 { + i16x8::from_fn(|i| { + if i < 4 { + a[2 * i].wrapping_add(a[2 * i + 1]) + } else { + b[2 * (i - 4)].wrapping_add(b[2 * (i - 4) + 1]) + } + }) + } + + pub fn phaddsw128(a: i16x8, b: i16x8) -> i16x8 { + i16x8::from_fn(|i| { + if i < 4 { + a[2 * i].saturating_add(a[2 * i + 1]) + } else { + b[2 * (i - 4)].saturating_add(b[2 * (i - 4) + 1]) + } + }) + } + + pub fn phaddd128(a: i32x4, b: i32x4) -> i32x4 { + i32x4::from_fn(|i| { + if i < 2 { + a[2 * i].wrapping_add(a[2 * i + 1]) + } else { + b[2 * (i - 2)].wrapping_add(b[2 * (i - 2) + 1]) + } + }) + } + + pub fn phsubw128(a: i16x8, b: i16x8) -> i16x8 { + i16x8::from_fn(|i| { + if i < 4 { + a[2 * i].wrapping_sub(a[2 * i + 1]) + } else { + b[2 * (i - 4)].wrapping_sub(b[2 * (i - 4) + 1]) + } + }) + } + + pub fn phsubsw128(a: i16x8, b: i16x8) -> i16x8 { + i16x8::from_fn(|i| { + if i < 4 { + a[2 * i].saturating_sub(a[2 * i + 1]) + } else { + b[2 * (i - 4)].saturating_sub(b[2 * (i - 4) + 1]) + } + }) + } + + pub fn phsubd128(a: i32x4, b: i32x4) -> i32x4 { + i32x4::from_fn(|i| { + if i < 2 { + a[2 * i].wrapping_sub(a[2 * i + 1]) + } else { + b[2 * (i - 2)].wrapping_sub(b[2 * (i - 2) + 1]) + } + }) + } + + pub fn pmaddubsw128(a: u8x16, b: i8x16) -> i16x8 { + i16x8::from_fn(|i| { + ((a[2 * i] as u8 as u16 as i16) * (b[2 * i] as i8 as i16)) + .saturating_add((a[2 * i + 1] as u8 as u16 as i16) * (b[2 * i + 1] as i8 as i16)) + }) + } + + pub fn pmulhrsw128(a: i16x8, b: i16x8) -> i16x8 { + i16x8::from_fn(|i| { + let temp = (a[i] as i32) * (b[i] as i32); + let temp = (temp >> 14).wrapping_add(1) >> 1; + temp as i16 + }) + } + + pub fn psignb128(a: i8x16, b: i8x16) -> i8x16 { + i8x16::from_fn(|i| { + if b[i] < 0 { + if a[i] == i8::MIN { + a[i] + } else { + -a[i] + } + } else if b[i] > 0 { + a[i] + } else { + 0 + } + }) + } + + pub fn psignw128(a: i16x8, b: i16x8) -> i16x8 { + i16x8::from_fn(|i| { + if b[i] < 0 { + if a[i] == i16::MIN { + a[i] + } else { + -a[i] + } + } else if b[i] > 0 { + a[i] + } else { + 0 + } + }) + } + + pub fn psignd128(a: i32x4, b: i32x4) -> i32x4 { + i32x4::from_fn(|i| { + if b[i] < 0 { + if a[i] == i32::MIN { + a[i] + } else { + -a[i] + } + } else if b[i] > 0 { + a[i] + } else { + 0 + } + }) + } +} + +use super::sse2::*; +use c_extern::*; +/// Computes the absolute value of packed 8-bit signed integers in `a` and +/// return the unsigned results. 
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi8)
+pub fn _mm_abs_epi8(a: __m128i) -> __m128i {
+    let a = BitVec::to_i8x16(a);
+    let zero = i8x16::from_fn(|_| 0);
+    let r = simd_select(simd_lt(a, zero), simd_neg(a), a);
+    BitVec::from_i8x16(r)
+}
+
+/// Computes the absolute value of each of the packed 16-bit signed integers in
+/// `a` and returns the 16-bit unsigned results.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi16)
+pub fn _mm_abs_epi16(a: __m128i) -> __m128i {
+    let a = BitVec::to_i16x8(a);
+    let zero = i16x8::from_fn(|_| 0);
+    let r = simd_select(simd_lt(a, zero), simd_neg(a), a);
+    BitVec::from_i16x8(r)
+}
+
+/// Computes the absolute value of each of the packed 32-bit signed integers in
+/// `a` and returns the 32-bit unsigned results.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi32)
+pub fn _mm_abs_epi32(a: __m128i) -> __m128i {
+    let a = BitVec::to_i32x4(a);
+    let zero = i32x4::from_fn(|_| 0);
+    let r = simd_select(simd_lt(a, zero), simd_neg(a), a);
+    BitVec::from_i32x4(r)
+}
+
+/// Shuffles bytes from `a` according to the content of `b`.
+///
+/// The low 4 bits of each byte of `b` are used as addresses
+/// into the 16 bytes of `a`.
+///
+/// In addition, if the most significant bit of a byte of `b`
+/// is set, the respective destination byte is set to 0.
+///
+/// Picturing `a` and `b` as `[u8; 16]`, `_mm_shuffle_epi8` is
+/// logically equivalent to:
+///
+/// ```
+/// fn mm_shuffle_epi8(a: [u8; 16], b: [u8; 16]) -> [u8; 16] {
+///     let mut r = [0u8; 16];
+///     for i in 0..16 {
+///         // if the most significant bit of b is set,
+///         // then the destination byte is set to 0.
+///         if b[i] & 0x80 == 0u8 {
+///             r[i] = a[(b[i] % 16) as usize];
+///         }
+///     }
+///     r
+/// }
+/// ```
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_epi8)
+pub fn _mm_shuffle_epi8(a: __m128i, b: __m128i) -> __m128i {
+    BitVec::from_u8x16(pshufb128(BitVec::to_u8x16(a), BitVec::to_u8x16(b)))
+}
+
+/// Concatenates 16-byte blocks in `a` and `b` into a 32-byte temporary result,
+/// shifts the result right by `IMM8` bytes, and returns the low 16 bytes.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_alignr_epi8)
+
+pub fn _mm_alignr_epi8<const IMM8: i32>(a: __m128i, b: __m128i) -> __m128i {
+    // TODO static_assert_uimm_bits!(IMM8, 8);
+    // If palignr is shifting the pair of vectors more than the size of two
+    // lanes, emit zero.
+    if IMM8 > 32 {
+        return _mm_setzero_si128();
+    }
+    // If palignr is shifting the pair of input vectors more than one lane,
+    // but less than two lanes, convert to shifting in zeroes.
+    let (a, b) = if IMM8 > 16 {
+        (_mm_setzero_si128(), a)
+    } else {
+        (a, b)
+    };
+    const fn mask(shift: u64, i: u64) -> u64 {
+        if shift > 32 {
+            // Unused, but needs to be a valid index.
+            i
+        } else if shift > 16 {
+            shift - 16 + i
+        } else {
+            shift + i
+        }
+    }
+
+    let r: i8x16 = simd_shuffle(
+        BitVec::to_i8x16(b),
+        BitVec::to_i8x16(a),
+        [
+            mask(IMM8 as u64, 0),
+            mask(IMM8 as u64, 1),
+            mask(IMM8 as u64, 2),
+            mask(IMM8 as u64, 3),
+            mask(IMM8 as u64, 4),
+            mask(IMM8 as u64, 5),
+            mask(IMM8 as u64, 6),
+            mask(IMM8 as u64, 7),
+            mask(IMM8 as u64, 8),
+            mask(IMM8 as u64, 9),
+            mask(IMM8 as u64, 10),
+            mask(IMM8 as u64, 11),
+            mask(IMM8 as u64, 12),
+            mask(IMM8 as u64, 13),
+            mask(IMM8 as u64, 14),
+            mask(IMM8 as u64, 15),
+        ],
+    );
+    r.into()
+}
+
+/// Horizontally adds the adjacent pairs of values contained in 2 packed
+/// 128-bit vectors of `[8 x i16]`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_epi16)
+
+pub fn _mm_hadd_epi16(a: __m128i, b: __m128i) -> __m128i {
+    phaddw128(BitVec::to_i16x8(a), BitVec::to_i16x8(b)).into()
+}
+
+/// Horizontally adds the adjacent pairs of values contained in 2 packed
+/// 128-bit vectors of `[8 x i16]`. Positive sums greater than 7FFFh are
+/// saturated to 7FFFh. Negative sums less than 8000h are saturated to 8000h.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadds_epi16)
+
+pub fn _mm_hadds_epi16(a: __m128i, b: __m128i) -> __m128i {
+    phaddsw128(BitVec::to_i16x8(a), BitVec::to_i16x8(b)).into()
+}
+
+/// Horizontally adds the adjacent pairs of values contained in 2 packed
+/// 128-bit vectors of `[4 x i32]`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_epi32)
+
+pub fn _mm_hadd_epi32(a: __m128i, b: __m128i) -> __m128i {
+    phaddd128(BitVec::to_i32x4(a), BitVec::to_i32x4(b)).into()
+}
+
+/// Horizontally subtracts the adjacent pairs of values contained in 2
+/// packed 128-bit vectors of `[8 x i16]`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_epi16)
+
+pub fn _mm_hsub_epi16(a: __m128i, b: __m128i) -> __m128i {
+    phsubw128(BitVec::to_i16x8(a), BitVec::to_i16x8(b)).into()
+}
+
+/// Horizontally subtracts the adjacent pairs of values contained in 2
+/// packed 128-bit vectors of `[8 x i16]`. Positive differences greater than
+/// 7FFFh are saturated to 7FFFh. Negative differences less than 8000h are
+/// saturated to 8000h.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsubs_epi16)
+
+pub fn _mm_hsubs_epi16(a: __m128i, b: __m128i) -> __m128i {
+    phsubsw128(BitVec::to_i16x8(a), BitVec::to_i16x8(b)).into()
+}
+
+/// Horizontally subtracts the adjacent pairs of values contained in 2
+/// packed 128-bit vectors of `[4 x i32]`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_epi32)
+
+pub fn _mm_hsub_epi32(a: __m128i, b: __m128i) -> __m128i {
+    phsubd128(BitVec::to_i32x4(a), BitVec::to_i32x4(b)).into()
+}
+
+/// Multiplies corresponding pairs of packed 8-bit unsigned integer
+/// values contained in the first source operand and packed 8-bit signed
+/// integer values contained in the second source operand, adds pairs of
+/// contiguous products with signed saturation, and writes the 16-bit sums to
+/// the corresponding bits in the destination.
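+///
+/// For example (with made-up lane values): if the first pair of bytes is
+/// `a = [2, 3, ...]` (unsigned) and `b = [-1, 4, ...]` (signed), lane 0 of
+/// the result is `2 * (-1) + 3 * 4 = 10`, saturated to the `i16` range.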
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maddubs_epi16)
+
+pub fn _mm_maddubs_epi16(a: __m128i, b: __m128i) -> __m128i {
+    pmaddubsw128(BitVec::to_u8x16(a), BitVec::to_i8x16(b)).into()
+}
+
+/// Multiplies packed 16-bit signed integer values, truncates the 32-bit
+/// product to the 18 most significant bits by right-shifting, rounds the
+/// truncated value by adding 1, and writes bits `[16:1]` to the destination.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhrs_epi16)
+
+pub fn _mm_mulhrs_epi16(a: __m128i, b: __m128i) -> __m128i {
+    pmulhrsw128(BitVec::to_i16x8(a), BitVec::to_i16x8(b)).into()
+}
+
+/// Negates packed 8-bit integers in `a` when the corresponding signed 8-bit
+/// integer in `b` is negative, and returns the results.
+/// Elements in the result are zeroed out when the corresponding element in `b`
+/// is zero.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_epi8)
+
+pub fn _mm_sign_epi8(a: __m128i, b: __m128i) -> __m128i {
+    psignb128(BitVec::to_i8x16(a), BitVec::to_i8x16(b)).into()
+}
+
+/// Negates packed 16-bit integers in `a` when the corresponding signed 16-bit
+/// integer in `b` is negative, and returns the results.
+/// Elements in the result are zeroed out when the corresponding element in `b`
+/// is zero.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_epi16)
+
+pub fn _mm_sign_epi16(a: __m128i, b: __m128i) -> __m128i {
+    psignw128(BitVec::to_i16x8(a), BitVec::to_i16x8(b)).into()
+}
+
+/// Negates packed 32-bit integers in `a` when the corresponding signed 32-bit
+/// integer in `b` is negative, and returns the results.
+/// Elements in the result are zeroed out when the corresponding element in `b`
+/// is zero.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_epi32)
+
+pub fn _mm_sign_epi32(a: __m128i, b: __m128i) -> __m128i {
+    psignd128(BitVec::to_i32x4(a), BitVec::to_i32x4(b)).into()
+}
diff --git a/testable-simd-models/src/core_arch/x86/tests/avx.rs b/testable-simd-models/src/core_arch/x86/tests/avx.rs
new file mode 100644
index 0000000000000..4ffa0dc139b9d
--- /dev/null
+++ b/testable-simd-models/src/core_arch/x86/tests/avx.rs
@@ -0,0 +1,132 @@
+use super::types::*;
+use super::upstream;
+use crate::abstractions::bitvec::BitVec;
+use crate::helpers::test::HasRandom;
+
+/// Derives a test for a given intrinsic: the generated test checks that the intrinsic and its model compute the same thing over random values (1000 by default).
+macro_rules! mk {
+    ($([$N:literal])?$name:ident$({$(<$($c:literal),*>),*})?($($x:ident : $ty:ident),*)) => {
+        #[test]
+        fn $name() {
+            #[allow(unused)]
+            const N: usize = {
+                let n: usize = 1000;
+                $(let n: usize = $N;)?
+                n
+            };
+            mk!(@[N]$name$($(<$($c),*>)*)?($($x : $ty),*));
+        }
+    };
+    (@[$N:ident]$name:ident$(<$($c:literal),*>)?($($x:ident : $ty:ident),*)) => {
+        for _ in 0..$N {
+            $(let $x = $ty::random();)*
+            assert_eq!(super::super::models::avx::$name$(::<$($c,)*>)?($($x.into(),)*), unsafe {
+                BitVec::from(upstream::$name$(::<$($c,)*>)?($($x.into(),)*)).into()
+            });
+        }
+    };
+    (@[$N:ident]$name:ident<$($c1:literal),*>$(<$($c:literal),*>)*($($x:ident : $ty:ident),*)) => {
+        let one = || {
+            mk!(@[$N]$name<$($c1),*>($($x : $ty),*));
+        };
+        one();
+        mk!(@[$N]$name$(<$($c),*>)*($($x : $ty),*));
+    }
+}
+mk!(_mm256_blendv_ps(a: __m256, b: __m256, c: __m256));
+
+#[test]
+fn _mm256_movemask_ps() {
+    let n = 1000;
+
+    for _ in 0..n {
+        let a: BitVec<256> = BitVec::random();
+        assert_eq!(
+            super::super::models::avx::_mm256_movemask_ps(a.into()),
+            unsafe { upstream::_mm256_movemask_ps(a.into()) }
+        );
+    }
+}
+
+#[test]
+fn _mm256_testz_si256() {
+    let n = 1000;
+
+    for _ in 0..n {
+        let a: BitVec<256> = BitVec::random();
+        let b: BitVec<256> = BitVec::random();
+        assert_eq!(
+            super::super::models::avx::_mm256_testz_si256(a.into(), b.into()),
+            unsafe { upstream::_mm256_testz_si256(a.into(), b.into()) }
+        );
+    }
+}
+
+mk!(_mm256_setzero_ps());
+mk!(_mm256_setzero_si256());
+mk!(_mm256_set_epi8(
+    e00: i8,
+    e01: i8,
+    e02: i8,
+    e03: i8,
+    e04: i8,
+    e05: i8,
+    e06: i8,
+    e07: i8,
+    e08: i8,
+    e09: i8,
+    e10: i8,
+    e11: i8,
+    e12: i8,
+    e13: i8,
+    e14: i8,
+    e15: i8,
+    e16: i8,
+    e17: i8,
+    e18: i8,
+    e19: i8,
+    e20: i8,
+    e21: i8,
+    e22: i8,
+    e23: i8,
+    e24: i8,
+    e25: i8,
+    e26: i8,
+    e27: i8,
+    e28: i8,
+    e29: i8,
+    e30: i8,
+    e31: i8
+));
+mk!(_mm256_set_epi16(
+    e00: i16,
+    e01: i16,
+    e02: i16,
+    e03: i16,
+    e04: i16,
+    e05: i16,
+    e06: i16,
+    e07: i16,
+    e08: i16,
+    e09: i16,
+    e10: i16,
+    e11: i16,
+    e12: i16,
+    e13: i16,
+    e14: i16,
+    e15: i16
+));
+mk!(_mm256_set_epi32(
+    e0: i32,
+    e1: i32,
+    e2: i32,
+    e3: i32,
+    e4: i32,
+    e5: i32,
+    e6: i32,
+    e7: i32
+));
+mk!(_mm256_set_epi64x(a: i64, b: i64, c: i64, d: i64));
+mk!(_mm256_set1_epi8(a: i8));
+mk!(_mm256_set1_epi16(a: i16));
+mk!(_mm256_set1_epi32(a: i32));
diff --git a/testable-simd-models/src/core_arch/x86/tests/avx2.rs b/testable-simd-models/src/core_arch/x86/tests/avx2.rs
new file mode 100644
index 0000000000000..a1b8378566403
--- /dev/null
+++ b/testable-simd-models/src/core_arch/x86/tests/avx2.rs
@@ -0,0 +1,531 @@
+use super::upstream;
+use crate::abstractions::bitvec::BitVec;
+use crate::helpers::test::HasRandom;
+
+/// Derives a test for a given intrinsic: the generated test checks that the intrinsic and its model compute the same thing over random values (1000 by default).
+macro_rules! mk {
+    ($([$N:literal])?$name:ident$({$(<$($c:literal),*>),*})?($($x:ident : $ty:ident),*)) => {
+        #[test]
+        fn $name() {
+            #[allow(unused)]
+            const N: usize = {
+                let n: usize = 1000;
+                $(let n: usize = $N;)?
+ n + }; + mk!(@[N]$name$($(<$($c),*>)*)?($($x : $ty),*)); + } + }; + (@[$N:ident]$name:ident$(<$($c:literal),*>)?($($x:ident : $ty:ident),*)) => { + for _ in 0..$N { + $(let $x = $ty::random();)* + assert_eq!(super::super::models::avx2::$name$(::<$($c,)*>)?($($x.into(),)*), unsafe { + BitVec::from(upstream::$name$(::<$($c,)*>)?($($x.into(),)*)).into() + }); + } + }; + (@[$N:ident]$name:ident<$($c1:literal),*>$(<$($c:literal),*>)*($($x:ident : $ty:ident),*)) => { + let one = || { + mk!(@[$N]$name<$($c1),*>($($x : $ty),*)); + }; + one(); + mk!(@[$N]$name$(<$($c),*>)*($($x : $ty),*)); + } +} + +mk!(_mm256_abs_epi32(a: BitVec)); +mk!(_mm256_abs_epi16(a: BitVec)); +mk!(_mm256_abs_epi8(a: BitVec)); +mk!(_mm256_add_epi64(a: BitVec, b: BitVec)); +mk!(_mm256_add_epi32(a: BitVec, b: BitVec)); +mk!(_mm256_add_epi16(a: BitVec, b: BitVec)); +mk!(_mm256_add_epi8(a: BitVec, b: BitVec)); +mk!(_mm256_adds_epi8(a: BitVec, b: BitVec)); +mk!(_mm256_adds_epi16(a: BitVec, b: BitVec)); +mk!(_mm256_adds_epu8(a: BitVec, b: BitVec)); +mk!(_mm256_adds_epu16(a: BitVec, b: BitVec)); +mk!([100]_mm256_alignr_epi8{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: BitVec, b: BitVec)); 
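+// Note: `_mm256_alignr_epi8` takes an 8-bit immediate, so the invocation
+// above sweeps every const value 0..=255 with 100 random input pairs each,
+// following the conventions documented in `tests/mod.rs`.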
+mk!([100]_mm256_permute2x128_si256{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: BitVec, b: BitVec)); +mk!(_mm256_blendv_epi8(a: BitVec, b: BitVec, mask: BitVec)); +mk!(_mm_broadcastb_epi8(a: BitVec)); +mk!(_mm256_broadcastb_epi8(a: BitVec)); +mk!(_mm_broadcastd_epi32(a: BitVec)); +mk!(_mm256_broadcastd_epi32(a: BitVec)); +mk!(_mm_broadcastq_epi64(a: BitVec)); +mk!(_mm256_broadcastq_epi64(a: BitVec)); +mk!(_mm_broadcastsi128_si256(a: BitVec)); +mk!(_mm256_broadcastsi128_si256(a: BitVec)); +mk!(_mm_broadcastw_epi16(a: BitVec)); +mk!(_mm256_broadcastw_epi16(a: BitVec)); +mk!(_mm256_cmpeq_epi64(a: BitVec, b: BitVec)); +mk!(_mm256_cmpeq_epi32(a: BitVec, b: BitVec)); +mk!(_mm256_cmpeq_epi16(a: BitVec, b: BitVec)); +mk!(_mm256_cmpeq_epi8(a: BitVec, b: BitVec)); +mk!(_mm256_cmpgt_epi64(a: BitVec, b: BitVec)); +mk!(_mm256_cmpgt_epi32(a: BitVec, b: BitVec)); +mk!(_mm256_cmpgt_epi16(a: BitVec, b: BitVec)); +mk!(_mm256_cmpgt_epi8(a: BitVec, b: BitVec)); +mk!(_mm256_cvtepi16_epi32(a: BitVec)); +mk!(_mm256_cvtepi16_epi64(a: BitVec)); +mk!(_mm256_cvtepi32_epi64(a: BitVec)); +mk!(_mm256_cvtepi8_epi16(a: BitVec)); +mk!(_mm256_cvtepi8_epi32(a: BitVec)); +mk!(_mm256_cvtepi8_epi64(a: BitVec)); +mk!(_mm256_cvtepu16_epi32(a: BitVec)); +mk!(_mm256_cvtepu16_epi64(a: BitVec)); +mk!(_mm256_cvtepu32_epi64(a: BitVec)); +mk!(_mm256_cvtepu8_epi16(a: BitVec)); +mk!(_mm256_cvtepu8_epi32(a: BitVec)); +mk!(_mm256_cvtepu8_epi64(a: BitVec)); +mk!(_mm256_extracti128_si256{<0>,<1>}(a: BitVec)); +mk!(_mm256_hadd_epi16(a: BitVec, b: BitVec)); +mk!(_mm256_hadd_epi32(a: BitVec, b: BitVec)); +mk!(_mm256_hadds_epi16(a: BitVec, b: BitVec)); +mk!(_mm256_hsub_epi16(a: BitVec, b: BitVec)); +mk!(_mm256_hsub_epi32(a: BitVec, b: BitVec)); +mk!(_mm256_hsubs_epi16(a: BitVec, b: BitVec)); +mk!(_mm256_inserti128_si256{<0>,<1>}(a: BitVec, b: BitVec)); +mk!(_mm256_madd_epi16(a: BitVec, b: BitVec)); +mk!(_mm256_maddubs_epi16(a: BitVec, b: BitVec)); +mk!(_mm256_max_epi16(a: BitVec, b: BitVec)); +mk!(_mm256_max_epi32(a: BitVec, b: BitVec)); +mk!(_mm256_max_epi8(a: BitVec, b: BitVec)); +mk!(_mm256_max_epu16(a: BitVec, b: BitVec)); +mk!(_mm256_max_epu32(a: BitVec, b: BitVec)); 
+mk!(_mm256_max_epu8(a: BitVec, b: BitVec)); +mk!(_mm256_min_epi16(a: BitVec, b: BitVec)); +mk!(_mm256_min_epi32(a: BitVec, b: BitVec)); +mk!(_mm256_min_epi8(a: BitVec, b: BitVec)); +mk!(_mm256_min_epu16(a: BitVec, b: BitVec)); +mk!(_mm256_min_epu32(a: BitVec, b: BitVec)); +mk!(_mm256_min_epu8(a: BitVec, b: BitVec)); +mk!(_mm256_mul_epi32(a: BitVec, b: BitVec)); +mk!(_mm256_mul_epu32(a: BitVec, b: BitVec)); +mk!(_mm256_mulhi_epi16(a: BitVec, b: BitVec)); +mk!(_mm256_mulhi_epu16(a: BitVec, b: BitVec)); +mk!(_mm256_mullo_epi16(a: BitVec, b: BitVec)); +mk!(_mm256_mullo_epi32(a: BitVec, b: BitVec)); +mk!(_mm256_mulhrs_epi16(a: BitVec, b: BitVec)); +mk!(_mm256_or_si256(a: BitVec, b: BitVec)); +mk!(_mm256_packs_epi16(a: BitVec, b: BitVec)); +mk!(_mm256_packs_epi32(a: BitVec, b: BitVec)); +mk!(_mm256_packus_epi16(a: BitVec, b: BitVec)); +mk!(_mm256_packus_epi32(a: BitVec, b: BitVec)); +mk!(_mm256_permutevar8x32_epi32(a: BitVec, b: BitVec)); +#[test] +fn _mm256_movemask_epi8() { + let n = 1000; + + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_movemask_epi8(a.into()), + unsafe { upstream::_mm256_movemask_epi8(a.into()) } + ); + } +} +mk!([100]_mm256_mpsadbw_epu8{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: BitVec, b: BitVec)); + 
+mk!([100]_mm256_permute4x64_epi64{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: BitVec)); +mk!([100]_mm256_shuffle_epi32{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: BitVec)); 
+mk!([100]_mm256_shufflehi_epi16{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: BitVec)); +mk!([100]_mm256_shufflelo_epi16{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: BitVec)); +mk!(_mm256_sad_epu8(a: BitVec, b: BitVec)); +mk!(_mm256_shuffle_epi8(a: BitVec, b: BitVec)); +mk!(_mm256_sign_epi16(a: BitVec, b: BitVec)); +mk!(_mm256_sign_epi32(a: BitVec, b: BitVec)); +mk!(_mm256_sign_epi8(a: BitVec, b: BitVec)); +mk!(_mm256_sll_epi16(a: BitVec, count: BitVec)); +mk!(_mm256_sll_epi32(a: BitVec, count: BitVec)); +mk!(_mm256_sll_epi64(a: BitVec, count: BitVec)); 
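+// The `_mm256_sll_*` tests above take the shift count as a vector argument,
+// so random inputs cover it; the `*_slli_*`/`*_bslli_*` tests below take the
+// count as a const-generic immediate and therefore sweep all 256 values.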
+mk!([100]_mm256_slli_epi16{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: BitVec)); +mk!([100]_mm256_slli_epi32{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: BitVec)); 
+mk!([100]_mm256_slli_epi64{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: BitVec)); +mk!([100]_mm256_slli_si256{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: BitVec)); 
+mk!([100]_mm256_bslli_epi128{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: BitVec)); +mk!(_mm_sllv_epi32(a: BitVec, count: BitVec)); +mk!(_mm256_sllv_epi32(a: BitVec, count: BitVec)); +mk!(_mm_sllv_epi64(a: BitVec, count: BitVec)); +mk!(_mm256_sllv_epi64(a: BitVec, count: BitVec)); +mk!(_mm256_sra_epi16(a: BitVec, count: BitVec)); +mk!(_mm256_sra_epi32(a: BitVec, count: BitVec)); +mk!([100]_mm256_srai_epi16{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: BitVec)); 
+mk!(_mm256_srai_epi32{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: BitVec)); +mk!(_mm_srav_epi32(a: BitVec, count: BitVec)); +mk!(_mm256_srav_epi32(a: BitVec, count: BitVec)); +mk!([100]_mm256_srli_si256{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: BitVec)); 
+mk!([100]_mm256_bsrli_epi128{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: BitVec)); +mk!(_mm256_srl_epi16(a: BitVec, count: BitVec)); +mk!(_mm256_srl_epi32(a: BitVec, count: BitVec)); +mk!(_mm256_srl_epi64(a: BitVec, count: BitVec)); +mk!([100]_mm256_srli_epi16{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: BitVec)); 
+mk!([100]_mm256_srli_epi32{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: BitVec)); +mk!([100]_mm256_srli_epi64{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: BitVec)); +mk!(_mm_srlv_epi32(a: BitVec, count: BitVec)); +mk!(_mm256_srlv_epi32(a: BitVec, count: BitVec)); +mk!(_mm_srlv_epi64(a: BitVec, count: BitVec)); +mk!(_mm256_srlv_epi64(a: BitVec, count: BitVec)); +mk!(_mm256_sub_epi16(a: BitVec, b: BitVec)); +mk!(_mm256_sub_epi32(a: BitVec, b: BitVec)); +mk!(_mm256_sub_epi64(a: BitVec, b: BitVec)); +mk!(_mm256_sub_epi8(a: BitVec, b: BitVec)); +mk!(_mm256_subs_epi16(a: BitVec, b: BitVec)); +mk!(_mm256_subs_epi8(a: BitVec, b: BitVec)); +mk!(_mm256_subs_epu16(a: BitVec, b: BitVec)); +mk!(_mm256_subs_epu8(a: BitVec, b: BitVec)); +mk!(_mm256_unpackhi_epi8(a: BitVec, b: BitVec)); 
+mk!(_mm256_unpacklo_epi8(a: BitVec, b: BitVec)); +mk!(_mm256_unpackhi_epi16(a: BitVec, b: BitVec)); +mk!(_mm256_unpacklo_epi16(a: BitVec, b: BitVec)); +mk!(_mm256_unpackhi_epi32(a: BitVec, b: BitVec)); +mk!(_mm256_unpacklo_epi32(a: BitVec, b: BitVec)); +mk!(_mm256_unpackhi_epi64(a: BitVec, b: BitVec)); +mk!(_mm256_unpacklo_epi64(a: BitVec, b: BitVec)); +mk!(_mm256_xor_si256(a: BitVec, b: BitVec)); +#[test] +fn _mm256_extract_epi8() { + let n = 100; + + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<0>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<0>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<1>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<1>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<2>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<2>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<3>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<3>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<4>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<4>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<5>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<5>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<6>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<6>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<7>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<7>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<8>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<8>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<9>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<9>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<10>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<10>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<11>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<11>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<12>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<12>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<13>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<13>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<14>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<14>(a.into()) } + ); + 
} + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<15>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<15>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<16>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<16>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<17>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<17>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<18>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<18>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<19>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<19>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<20>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<20>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<21>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<21>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<22>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<22>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<23>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<23>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<24>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<24>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<25>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<25>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<26>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<26>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<27>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<27>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<28>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<28>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<29>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<29>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<30>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<30>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi8::<31>(a.into()), + unsafe { upstream::_mm256_extract_epi8::<31>(a.into()) } + ); + } +} + +#[test] 
+fn _mm256_extract_epi16() { + let n = 100; + + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi16::<0>(a.into()), + unsafe { upstream::_mm256_extract_epi16::<0>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi16::<1>(a.into()), + unsafe { upstream::_mm256_extract_epi16::<1>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi16::<2>(a.into()), + unsafe { upstream::_mm256_extract_epi16::<2>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi16::<3>(a.into()), + unsafe { upstream::_mm256_extract_epi16::<3>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi16::<4>(a.into()), + unsafe { upstream::_mm256_extract_epi16::<4>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi16::<5>(a.into()), + unsafe { upstream::_mm256_extract_epi16::<5>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi16::<6>(a.into()), + unsafe { upstream::_mm256_extract_epi16::<6>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi16::<7>(a.into()), + unsafe { upstream::_mm256_extract_epi16::<7>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi16::<8>(a.into()), + unsafe { upstream::_mm256_extract_epi16::<8>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi16::<9>(a.into()), + unsafe { upstream::_mm256_extract_epi16::<9>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi16::<10>(a.into()), + unsafe { upstream::_mm256_extract_epi16::<10>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi16::<11>(a.into()), + unsafe { upstream::_mm256_extract_epi16::<11>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi16::<12>(a.into()), + unsafe { upstream::_mm256_extract_epi16::<12>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi16::<13>(a.into()), + unsafe { upstream::_mm256_extract_epi16::<13>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi16::<14>(a.into()), + unsafe { upstream::_mm256_extract_epi16::<14>(a.into()) } + ); + } + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx2::_mm256_extract_epi16::<15>(a.into()), + unsafe { upstream::_mm256_extract_epi16::<15>(a.into()) } + ); + } +} diff --git a/testable-simd-models/src/core_arch/x86/tests/mod.rs b/testable-simd-models/src/core_arch/x86/tests/mod.rs new file mode 100644 index 
0000000000000..b5a0c3a449715
--- /dev/null
+++ b/testable-simd-models/src/core_arch/x86/tests/mod.rs
@@ -0,0 +1,113 @@
+//! Tests for intrinsics defined in `crate::core_arch::x86::models`
+//!
+//! Each and every modelled intrinsic is tested against the Rust
+//! implementation here. For the most part, the tests work by
+//! generating random inputs, passing them as arguments to both
+//! the models in this crate and the corresponding intrinsics in
+//! the Rust core, and then comparing their outputs.
+//!
+//! To add a test for a modelled intrinsic, go to the appropriate file, and
+//! use the `mk!` macro to define it.
+//!
+//! A `mk!` macro invocation has the following shape:
+//! `mk!([<number of tests>]<intrinsic name>{<const generic values>}(<argument name> : <argument type>, ...))`
+//!
+//! For example, some valid invocations are
+//!
+//! `mk!([100]_mm256_extracti128_si256{<0>,<1>}(a: BitVec));`
+//! `mk!(_mm256_extracti128_si256{<0>,<1>}(a: BitVec));`
+//! `mk!(_mm256_abs_epi16(a: BitVec));`
+//!
+//! The number of random tests is optional. If not provided, it is taken to be 1000 by default.
+//! The const values are necessary if the function has constant arguments, but must be omitted if not.
+//! The function name and the function arguments are necessary in all cases.
+//!
+//! Note: This only works if the function returns a bit-vector or funarray. If it returns an integer, the
+//! test has to be written manually. It is recommended that a manually defined test follow
+//! the pattern of the tests defined via a `mk!` invocation. It is also recommended that, in the
+//! case that the intrinsic takes constant arguments, each and every possible constant value
+//! (up to a maximum of 255) that can be passed to the function be used for testing. The number
+//! of constant values to pass depends on whether the Rust intrinsic statically asserts that the
+//! constant argument fits in a certain number of bits.
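+//!
+//! As a rough sketch (eliding the macro's handling of const-generic
+//! arguments), an invocation such as `mk!(_mm256_abs_epi16(a: BitVec));` in
+//! `avx2.rs` expands to a test of the following shape:
+//!
+//! ```ignore
+//! #[test]
+//! fn _mm256_abs_epi16() {
+//!     for _ in 0..1000 {
+//!         let a = BitVec::random();
+//!         assert_eq!(
+//!             super::super::models::avx2::_mm256_abs_epi16(a.into()),
+//!             unsafe { BitVec::from(upstream::_mm256_abs_epi16(a.into())).into() },
+//!         );
+//!     }
+//! }
+//! ```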
+ +mod avx; +mod avx2; +mod sse2; +mod ssse3; +use crate::abstractions::bitvec::*; + +pub(crate) mod types { + use crate::abstractions::bitvec::*; + + #[allow(non_camel_case_types)] + pub type __m256i = BitVec<256>; + #[allow(non_camel_case_types)] + pub type __m256 = BitVec<256>; + #[allow(non_camel_case_types)] + pub type __m128i = BitVec<128>; +} + +pub(crate) mod upstream { + #[cfg(target_arch = "x86")] + pub use core::arch::x86::*; + #[cfg(target_arch = "x86_64")] + pub use core::arch::x86_64::*; +} + +mod conversions { + use super::upstream::{ + __m128i, __m256, __m256i, _mm256_castps_si256, _mm256_castsi256_ps, _mm256_loadu_si256, + _mm256_storeu_si256, _mm_loadu_si128, _mm_storeu_si128, + }; + use super::BitVec; + + impl From<BitVec<256>> for __m256i { + fn from(bv: BitVec<256>) -> __m256i { + let bv: &[u8] = &bv.to_vec()[..]; + unsafe { _mm256_loadu_si256(bv.as_ptr() as *const _) } + } + } + impl From<BitVec<256>> for __m256 { + fn from(bv: BitVec<256>) -> __m256 { + let bv: &[u8] = &bv.to_vec()[..]; + unsafe { _mm256_castsi256_ps(_mm256_loadu_si256(bv.as_ptr() as *const _)) } + } + } + + impl From<BitVec<128>> for __m128i { + fn from(bv: BitVec<128>) -> __m128i { + let slice: &[u8] = &bv.to_vec()[..]; + unsafe { _mm_loadu_si128(slice.as_ptr() as *const __m128i) } + } + } + + impl From<__m256i> for BitVec<256> { + fn from(vec: __m256i) -> BitVec<256> { + let mut v = [0u8; 32]; + unsafe { + _mm256_storeu_si256(v.as_mut_ptr() as *mut _, vec); + } + BitVec::from_slice(&v[..], 8) + } + } + + impl From<__m256> for BitVec<256> { + fn from(vec: __m256) -> BitVec<256> { + let mut v = [0u8; 32]; + unsafe { + _mm256_storeu_si256(v.as_mut_ptr() as *mut _, _mm256_castps_si256(vec)); + } + BitVec::from_slice(&v[..], 8) + } + } + + impl From<__m128i> for BitVec<128> { + fn from(vec: __m128i) -> BitVec<128> { + let mut v = [0u8; 16]; + unsafe { + _mm_storeu_si128(v.as_mut_ptr() as *mut _, vec); + } + BitVec::from_slice(&v[..], 8) + } + } +} diff --git a/testable-simd-models/src/core_arch/x86/tests/sse2.rs b/testable-simd-models/src/core_arch/x86/tests/sse2.rs new file mode 100644 index 0000000000000..ed387f5938524 --- /dev/null +++ b/testable-simd-models/src/core_arch/x86/tests/sse2.rs @@ -0,0 +1,201 @@ +use super::types::*; +use super::upstream; +use crate::abstractions::bitvec::BitVec; +use crate::helpers::test::HasRandom; + +/// Derives a test for a given intrinsic: the test checks that the intrinsic and its model compute the same thing over random values (1000 by default). +macro_rules! mk { + ($([$N:literal])?$name:ident$({$(<$($c:literal),*>),*})?($($x:ident : $ty:ident),*)) => { + #[test] + fn $name() { + #[allow(unused)] + const N: usize = { + let n: usize = 1000; + $(let n: usize = $N;)?
+ n + }; + mk!(@[N]$name$($(<$($c),*>)*)?($($x : $ty),*)); + } + }; + (@[$N:ident]$name:ident$(<$($c:literal),*>)?($($x:ident : $ty:ident),*)) => { + for _ in 0..$N { + $(let $x = $ty::random();)* + assert_eq!(super::super::models::sse2::$name$(::<$($c,)*>)?($($x.into(),)*), unsafe { + BitVec::from(upstream::$name$(::<$($c,)*>)?($($x.into(),)*)).into() + }); + } + }; + (@[$N:ident]$name:ident<$($c1:literal),*>$(<$($c:literal),*>)*($($x:ident : $ty:ident),*)) => { + let one = || { + mk!(@[$N]$name<$($c1),*>($($x : $ty),*)); + }; + one(); + mk!(@[$N]$name$(<$($c),*>)*($($x : $ty),*)); + } +} +mk!(_mm_add_epi8(a: __m128i, b: __m128i)); +mk!(_mm_add_epi16(a: __m128i, b: __m128i)); +mk!(_mm_add_epi32(a: __m128i, b: __m128i)); +mk!(_mm_add_epi64(a: __m128i, b: __m128i)); +mk!(_mm_adds_epi8(a: __m128i, b: __m128i)); +mk!(_mm_adds_epi16(a: __m128i, b: __m128i)); +mk!(_mm_adds_epu8(a: __m128i, b: __m128i)); +mk!(_mm_adds_epu16(a: __m128i, b: __m128i)); +mk!(_mm_avg_epu8(a: __m128i, b: __m128i)); +mk!(_mm_avg_epu16(a: __m128i, b: __m128i)); +mk!(_mm_madd_epi16(a: __m128i, b: __m128i)); +mk!(_mm_max_epi16(a: __m128i, b: __m128i)); +mk!(_mm_max_epu8(a: __m128i, b: __m128i)); +mk!(_mm_min_epi16(a: __m128i, b: __m128i)); +mk!(_mm_min_epu8(a: __m128i, b: __m128i)); +mk!(_mm_mulhi_epi16(a: __m128i, b: __m128i)); +mk!(_mm_mulhi_epu16(a: __m128i, b: __m128i)); +mk!(_mm_mullo_epi16(a: __m128i, b: __m128i)); +mk!(_mm_mul_epu32(a: __m128i, b: __m128i)); +mk!(_mm_sad_epu8(a: __m128i, b: __m128i)); +mk!(_mm_sub_epi8(a: __m128i, b: __m128i)); +mk!(_mm_sub_epi16(a: __m128i, b: __m128i)); +mk!(_mm_sub_epi32(a: __m128i, b: __m128i)); +mk!(_mm_sub_epi64(a: __m128i, b: __m128i)); +mk!(_mm_subs_epi8(a: __m128i, b: __m128i)); +mk!(_mm_subs_epi16(a: __m128i, b: __m128i)); +mk!(_mm_subs_epu8(a: __m128i, b: __m128i)); +mk!(_mm_subs_epu16(a: __m128i, b: __m128i)); + +mk!([100]_mm_slli_si128{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: __m128i)); 
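+// Note: _mm_bslli_si128 below is an alternative name for the same byte-wise
+// left shift as _mm_slli_si128 above; for both, Intel specifies that shift
+// counts greater than 15 produce an all-zero vector, so enumerating the
+// constants 0..=255 also exercises that saturating path.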
+mk!([100]_mm_bslli_si128{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: __m128i)); +mk!([100]_mm_bsrli_si128{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: __m128i)); 
+mk!([100]_mm_slli_epi16{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: __m128i)); + +mk!([100]_mm_sll_epi16(a: __m128i, count: __m128i)); + +mk!([100]_mm_slli_epi32{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: __m128i)); + +mk!([100]_mm_sll_epi32(a: __m128i, count: __m128i)); + 
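+// The _mm_sll_*/_mm_srl_*/_mm_sra_* variants tested with a `count` argument
+// take the shift count from the low 64 bits of a second vector rather than
+// from a const generic, so no enumeration of constant values is needed.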
+mk!([100]_mm_slli_epi64{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: __m128i)); + +mk!([100]_mm_sll_epi64(a: __m128i, count: __m128i)); + +mk!([100]_mm_srai_epi16{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: __m128i)); + +mk!([100]_mm_sra_epi16(a: __m128i, count: __m128i)); + 
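+// Arithmetic right shifts (_mm_srai_*/_mm_sra_*) shift in copies of the sign
+// bit, so an out-of-range count turns each element into all zeros or all ones
+// depending on its sign, unlike the logical shifts, which shift in zeros.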
+mk!([100]_mm_srai_epi32{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: __m128i)); + +mk!([100]_mm_sra_epi32(a: __m128i, count: __m128i)); +mk!([100]_mm_srli_si128{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: __m128i)); 
+mk!([100]_mm_srli_epi16{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: __m128i)); + +mk!([100]_mm_srl_epi16(a: __m128i, count: __m128i)); + +mk!([100]_mm_srli_epi32{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: __m128i)); + +mk!([100]_mm_srl_epi32(a: __m128i, count: __m128i)); + 
+mk!([100]_mm_srli_epi64{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: __m128i)); + +mk!(_mm_srl_epi64(a: __m128i, count: __m128i)); +mk!(_mm_and_si128(a: __m128i, b: __m128i)); +mk!(_mm_andnot_si128(a: __m128i, b: __m128i)); +mk!(_mm_or_si128(a: __m128i, b: __m128i)); +mk!(_mm_xor_si128(a: __m128i, b: __m128i)); +mk!(_mm_cmpeq_epi8(a: __m128i, b: __m128i)); +mk!(_mm_cmpeq_epi16(a: __m128i, b: __m128i)); +mk!(_mm_cmpeq_epi32(a: __m128i, b: __m128i)); +mk!(_mm_cmpgt_epi8(a: __m128i, b: __m128i)); +mk!(_mm_cmpgt_epi16(a: __m128i, b: __m128i)); +mk!(_mm_cmpgt_epi32(a: __m128i, b: __m128i)); +mk!(_mm_cmplt_epi8(a: __m128i, b: __m128i)); +mk!(_mm_cmplt_epi16(a: __m128i, b: __m128i)); +mk!(_mm_cmplt_epi32(a: __m128i, b: __m128i)); +mk!(_mm_cvtsi32_si128(a: i32)); + +// mk!(_mm_cvtsi128_si32(a: __m128i)); + +mk!(_mm_set_epi64x(e1: i64, e0: i64)); +mk!(_mm_set_epi32(e3: i32, e2: i32, e1: i32, e0: i32)); +mk!(_mm_set_epi16( + e7: i16, + e6: i16, + e5: i16, + e4: i16, + e3: i16, + e2: i16, + e1: i16, + e0: i16 +)); +mk!(_mm_set_epi8( + e15: i8, + e14: i8, + e13: i8, + e12: i8, + e11: i8, + e10: i8, + e9: i8, + e8: i8, + e7: i8, + e6: i8, + e5: i8, + e4: i8, + e3: i8, + e2: i8, + e1: i8, + e0: i8 +)); +mk!(_mm_set1_epi64x(a: i64)); +mk!(_mm_set1_epi32(a: i32)); +mk!(_mm_set1_epi16(a: i16)); +mk!(_mm_set1_epi8(a: i8)); +mk!(_mm_setr_epi32(e3: i32, e2: i32, e1: i32, e0: i32)); +mk!(_mm_setr_epi16( + e7: i16, + e6: i16, + e5: i16, + e4: i16, + e3: i16, + e2: i16, + e1: i16, + e0: i16 +)); +mk!(_mm_setr_epi8( + e15: i8, + e14: i8, + e13: i8, + e12: i8, + e11: i8, + e10: i8, + e9: i8, + e8: i8, + e7: i8, + e6: i8, + e5: i8, + e4: i8, + e3: i8, + e2: i8, + e1: i8, + e0: i8 +)); +mk!(_mm_setzero_si128()); +mk!(_mm_move_epi64(a: __m128i)); +mk!(_mm_packs_epi16(a: __m128i, b: __m128i)); +mk!(_mm_packs_epi32(a: __m128i, b: __m128i)); +mk!(_mm_packus_epi16(a: __m128i, b: __m128i)); + +// mk!([100]_mm_extract_epi16(a: __m128i)); +mk!([100]_mm_insert_epi16{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>}(a: __m128i, i: i32)); + +// mk!([100]_mm_movemask_epi8(a: __m128i)); + 
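+// _mm_cvtsi128_si32, _mm_extract_epi16 and _mm_movemask_epi8 above return
+// plain integers, so the `mk!` macro does not apply (see the note in
+// `tests/mod.rs`). A hand-written replacement in the style of
+// `_mm256_extract_epi16` in `avx2.rs` would look roughly like this (sketch,
+// assuming the corresponding model exists in `models::sse2`):
+//
+// #[test]
+// fn _mm_movemask_epi8() {
+//     for _ in 0..1000 {
+//         let a: BitVec<128> = BitVec::random();
+//         assert_eq!(
+//             super::super::models::sse2::_mm_movemask_epi8(a.into()),
+//             unsafe { upstream::_mm_movemask_epi8(a.into()) }
+//         );
+//     }
+// }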
+mk!([100]_mm_shuffle_epi32{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: __m128i)); +mk!([100]_mm_shufflehi_epi16{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: __m128i)); 
+mk!([100]_mm_shufflelo_epi16{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: __m128i)); +mk!(_mm_unpackhi_epi8(a: __m128i, b: __m128i)); +mk!(_mm_unpackhi_epi16(a: __m128i, b: __m128i)); +mk!(_mm_unpackhi_epi32(a: __m128i, b: __m128i)); +mk!(_mm_unpackhi_epi64(a: __m128i, b: __m128i)); +mk!(_mm_unpacklo_epi8(a: __m128i, b: __m128i)); +mk!(_mm_unpacklo_epi16(a: __m128i, b: __m128i)); +mk!(_mm_unpacklo_epi32(a: __m128i, b: __m128i)); +mk!(_mm_unpacklo_epi64(a: __m128i, b: __m128i)); +mk!(_mm_undefined_si128()); diff --git a/testable-simd-models/src/core_arch/x86/tests/ssse3.rs b/testable-simd-models/src/core_arch/x86/tests/ssse3.rs new file mode 100644 index 0000000000000..6382f953f2063 --- /dev/null +++ b/testable-simd-models/src/core_arch/x86/tests/ssse3.rs @@ -0,0 +1,51 @@ +use super::types::*; +use super::upstream; +use crate::abstractions::bitvec::BitVec; +use crate::helpers::test::HasRandom; + +/// Derives tests for a given intrinsics. Test that a given intrinsics and its model compute the same thing over random values (1000 by default). +macro_rules! mk { + ($([$N:literal])?$name:ident$({$(<$($c:literal),*>),*})?($($x:ident : $ty:ident),*)) => { + #[test] + fn $name() { + #[allow(unused)] + const N: usize = { + let n: usize = 1000; + $(let n: usize = $N;)? 
+ n + }; + mk!(@[N]$name$($(<$($c),*>)*)?($($x : $ty),*)); + } + }; + (@[$N:ident]$name:ident$(<$($c:literal),*>)?($($x:ident : $ty:ident),*)) => { + for _ in 0..$N { + $(let $x = $ty::random();)* + assert_eq!(super::super::models::ssse3::$name$(::<$($c,)*>)?($($x.into(),)*), unsafe { + BitVec::from(upstream::$name$(::<$($c,)*>)?($($x.into(),)*)).into() + }); + } + }; + (@[$N:ident]$name:ident<$($c1:literal),*>$(<$($c:literal),*>)*($($x:ident : $ty:ident),*)) => { + let one = || { + mk!(@[$N]$name<$($c1),*>($($x : $ty),*)); + }; + one(); + mk!(@[$N]$name$(<$($c),*>)*($($x : $ty),*)); + } +} +mk!(_mm_abs_epi8(a: __m128i)); +mk!(_mm_abs_epi16(a: __m128i)); +mk!(_mm_abs_epi32(a: __m128i)); +mk!(_mm_shuffle_epi8(a: __m128i, b: __m128i)); +mk!([100]_mm_alignr_epi8{<0>,<1>,<2>,<3>,<4>,<5>,<6>,<7>,<8>,<9>,<10>,<11>,<12>,<13>,<14>,<15>,<16>,<17>,<18>,<19>,<20>,<21>,<22>,<23>,<24>,<25>,<26>,<27>,<28>,<29>,<30>,<31>,<32>,<33>,<34>,<35>,<36>,<37>,<38>,<39>,<40>,<41>,<42>,<43>,<44>,<45>,<46>,<47>,<48>,<49>,<50>,<51>,<52>,<53>,<54>,<55>,<56>,<57>,<58>,<59>,<60>,<61>,<62>,<63>,<64>,<65>,<66>,<67>,<68>,<69>,<70>,<71>,<72>,<73>,<74>,<75>,<76>,<77>,<78>,<79>,<80>,<81>,<82>,<83>,<84>,<85>,<86>,<87>,<88>,<89>,<90>,<91>,<92>,<93>,<94>,<95>,<96>,<97>,<98>,<99>,<100>,<101>,<102>,<103>,<104>,<105>,<106>,<107>,<108>,<109>,<110>,<111>,<112>,<113>,<114>,<115>,<116>,<117>,<118>,<119>,<120>,<121>,<122>,<123>,<124>,<125>,<126>,<127>,<128>,<129>,<130>,<131>,<132>,<133>,<134>,<135>,<136>,<137>,<138>,<139>,<140>,<141>,<142>,<143>,<144>,<145>,<146>,<147>,<148>,<149>,<150>,<151>,<152>,<153>,<154>,<155>,<156>,<157>,<158>,<159>,<160>,<161>,<162>,<163>,<164>,<165>,<166>,<167>,<168>,<169>,<170>,<171>,<172>,<173>,<174>,<175>,<176>,<177>,<178>,<179>,<180>,<181>,<182>,<183>,<184>,<185>,<186>,<187>,<188>,<189>,<190>,<191>,<192>,<193>,<194>,<195>,<196>,<197>,<198>,<199>,<200>,<201>,<202>,<203>,<204>,<205>,<206>,<207>,<208>,<209>,<210>,<211>,<212>,<213>,<214>,<215>,<216>,<217>,<218>,<219>,<220>,<221>,<222>,<223>,<224>,<225>,<226>,<227>,<228>,<229>,<230>,<231>,<232>,<233>,<234>,<235>,<236>,<237>,<238>,<239>,<240>,<241>,<242>,<243>,<244>,<245>,<246>,<247>,<248>,<249>,<250>,<251>,<252>,<253>,<254>,<255>}(a: __m128i, b: __m128i)); +mk!(_mm_hadd_epi16(a: __m128i, b: __m128i)); +mk!(_mm_hadds_epi16(a: __m128i, b: __m128i)); +mk!(_mm_hadd_epi32(a: __m128i, b: __m128i)); +mk!(_mm_hsub_epi16(a: __m128i, b: __m128i)); +mk!(_mm_hsubs_epi16(a: __m128i, b: __m128i)); +mk!(_mm_hsub_epi32(a: __m128i, b: __m128i)); +mk!(_mm_maddubs_epi16(a: __m128i, b: __m128i)); +mk!(_mm_mulhrs_epi16(a: __m128i, b: __m128i)); +mk!(_mm_sign_epi8(a: __m128i, b: __m128i)); +mk!(_mm_sign_epi16(a: __m128i, b: __m128i)); +mk!(_mm_sign_epi32(a: __m128i, b: __m128i)); diff --git a/testable-simd-models/src/helpers.rs b/testable-simd-models/src/helpers.rs new file mode 100644 index 0000000000000..6c5e84e2a8dbd --- /dev/null +++ b/testable-simd-models/src/helpers.rs @@ -0,0 +1,55 @@ +#[cfg(test)] +pub mod test { + use crate::abstractions::{bit::Bit, bitvec::BitVec, funarr::FunArray}; + use rand::prelude::*; + + /// Helper trait to generate random values + pub trait HasRandom { + fn random() -> Self; + } + macro_rules! 
mk_has_random { + ($($ty:ty),*) => { + $(impl HasRandom for $ty { + fn random() -> Self { + let mut rng = rand::rng(); + rng.random() + } + })* + }; + } + + mk_has_random!(bool); + mk_has_random!(i8, i16, i32, i64, i128); + mk_has_random!(u8, u16, u32, u64, u128); + + impl HasRandom for isize { + fn random() -> Self { + i128::random() as isize + } + } + impl HasRandom for usize { + fn random() -> Self { + i128::random() as usize + } + } + + impl HasRandom for Bit { + fn random() -> Self { + crate::abstractions::bit::Bit::from(bool::random()) + } + } + impl<const N: u64> HasRandom for BitVec<N> { + fn random() -> Self { + Self::from_fn(|_| Bit::random()) + } + } + + impl<T: HasRandom, const N: u64> HasRandom for FunArray<T, N> { + fn random() -> Self { + FunArray::from_fn(|_| T::random()) + } + } +} + +#[cfg(test)] +pub use test::*; diff --git a/testable-simd-models/src/lib.rs b/testable-simd-models/src/lib.rs new file mode 100644 index 0000000000000..fc76194526e20 --- /dev/null +++ b/testable-simd-models/src/lib.rs @@ -0,0 +1,35 @@ +//! `testable-simd-models`: A Rust Model for the `core` Library +//! +//! `testable-simd-models` is a simplified, self-contained model of Rust’s `core` library. It aims to provide +//! a purely Rust-based specification of `core`'s fundamental operations, making them easier to +//! understand, analyze, and formally verify. Unlike `core`, which may rely on platform-specific +//! intrinsics and compiler magic, `testable-simd-models` expresses everything in plain Rust, prioritizing +//! clarity and explicitness over efficiency. +//! +//! ## Key Features +//! +//! - **Partial Modeling**: `testable-simd-models` includes only a subset of `core`, focusing on modeling +//! fundamental operations rather than providing a complete replacement. +//! - **Exact Signatures**: Any item that exists in both `testable-simd-models` and `core` has the same type signature, +//! ensuring compatibility with formal verification efforts. +//! - **Purely Functional Approach**: Where possible, `testable-simd-models` favors functional programming principles, +//! avoiding unnecessary mutation and side effects to facilitate formal reasoning. +//! - **Explicit Implementations**: Even low-level operations, such as SIMD, are modeled explicitly using +//! Rust constructs like bit arrays and partial maps. +//! - **Extra Abstractions**: `testable-simd-models` includes additional helper types and functions to support +//! modeling. These extra items are marked appropriately to distinguish them from `core` definitions. +//! +//! ## Intended Use +//! +//! `testable-simd-models` is designed as a reference model for formal verification and reasoning about Rust programs. +//! By providing a readable, testable, well-specified version of `core`'s behavior, it serves as a foundation for +//! proof assistants and other verification tools. + +// This recursion limit is necessary for the `mk!` macro used for tests. +// We test functions with const generics; the macro generates a test per possible (const generic) control value.
+#![recursion_limit = "4096"] +pub mod abstractions; +pub mod core_arch; + +pub use core_arch as arch; +pub mod helpers; diff --git a/testable-simd-models/test.sh b/testable-simd-models/test.sh new file mode 100755 index 0000000000000..8f521735122c3 --- /dev/null +++ b/testable-simd-models/test.sh @@ -0,0 +1,2 @@ +cross test --target aarch64-unknown-linux-gnu +cross test --target x86_64-unknown-linux-gnu From 22385cc6fb437f8667e8516390a669d07f20b502 Mon Sep 17 00:00:00 2001 From: karthikbhargavan Date: Fri, 1 Aug 2025 08:33:01 -0400 Subject: [PATCH 2/3] Make models closer to upstream core (#3) * Provided more detailed description for how to model and test intrinsics * Restored static asserts which were in the upstream code * Switched the use of u64 back to u32 to make it closer to upstream * Defined functions like `transpose` to reduce visual diffs --- library/Cargo.lock | 90 + testable-simd-models/README.md | 197 +- testable-simd-models/src/abstractions/bit.rs | 100 +- .../src/abstractions/bitvec.rs | 59 +- .../src/abstractions/funarr.rs | 128 +- testable-simd-models/src/abstractions/mod.rs | 1 + testable-simd-models/src/abstractions/simd.rs | 97 +- .../src/abstractions/utilities.rs | 59 + .../src/core_arch/x86/models/avx.rs | 1780 ++++++++++-- .../src/core_arch/x86/models/avx2.rs | 2494 +++++++---------- .../core_arch/x86/models/avx2_handwritten.rs | 620 ++++ .../core_arch/x86/models/avx_handwritten.rs | 31 + .../src/core_arch/x86/models/mod.rs | 11 + .../src/core_arch/x86/models/sse.rs | 21 + .../src/core_arch/x86/models/sse2.rs | 1667 ++++++----- .../core_arch/x86/models/sse2_handwritten.rs | 196 ++ .../src/core_arch/x86/models/ssse3.rs | 301 +- .../core_arch/x86/models/ssse3_handwritten.rs | 127 + .../src/core_arch/x86/tests/avx.rs | 126 + .../src/core_arch/x86/tests/avx2.rs | 12 +- .../src/core_arch/x86/tests/mod.rs | 69 +- testable-simd-models/src/helpers.rs | 16 +- testable-simd-models/src/lib.rs | 2 +- testable-simd-models/test.sh | 2 - 24 files changed, 5394 insertions(+), 2812 deletions(-) create mode 100644 testable-simd-models/src/abstractions/utilities.rs create mode 100644 testable-simd-models/src/core_arch/x86/models/avx2_handwritten.rs create mode 100644 testable-simd-models/src/core_arch/x86/models/avx_handwritten.rs create mode 100644 testable-simd-models/src/core_arch/x86/models/sse.rs create mode 100644 testable-simd-models/src/core_arch/x86/models/sse2_handwritten.rs create mode 100644 testable-simd-models/src/core_arch/x86/models/ssse3_handwritten.rs delete mode 100755 testable-simd-models/test.sh diff --git a/library/Cargo.lock b/library/Cargo.lock index c681c5935df5f..aa756e9e6aa94 100644 --- a/library/Cargo.lock +++ b/library/Cargo.lock @@ -28,6 +28,7 @@ version = "0.0.0" dependencies = [ "compiler_builtins", "core", + "safety", ] [[package]] @@ -67,6 +68,9 @@ dependencies = [ [[package]] name = "core" version = "0.0.0" +dependencies = [ + "safety", +] [[package]] name = "coretests" @@ -201,6 +205,39 @@ dependencies = [ "unwind", ] +[[package]] +name = "proc-macro-error" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "da25490ff9892aab3fcf7c36f08cfb902dd3e71ca0f9f9517bea02a73a5ce38c" +dependencies = [ + "proc-macro-error-attr", + "proc-macro2", + "quote", + "syn 1.0.109", + "version_check", +] + +[[package]] +name = "proc-macro-error-attr" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1be40180e52ecc98ad80b184934baf3d0d29f979574e439af5a55274b35f869" +dependencies = 
[ + "proc-macro2", + "quote", + "version_check", +] + +[[package]] +name = "proc-macro2" +version = "1.0.95" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "02b3e5e68a3a1a02aad3ec490a98007cbc13c37cbe84a3cd7b8e406d76e7f778" +dependencies = [ + "unicode-ident", +] + [[package]] name = "proc_macro" version = "0.0.0" @@ -217,6 +254,15 @@ dependencies = [ "cc", ] +[[package]] +name = "quote" +version = "1.0.40" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1885c039570dc00dcb4ff087a89e185fd56bae234ddc7f056a945bf36467248d" +dependencies = [ + "proc-macro2", +] + [[package]] name = "r-efi" version = "5.3.0" @@ -301,6 +347,16 @@ dependencies = [ "std", ] +[[package]] +name = "safety" +version = "0.1.0" +dependencies = [ + "proc-macro-error", + "proc-macro2", + "quote", + "syn 2.0.104", +] + [[package]] name = "shlex" version = "1.3.0" @@ -330,6 +386,7 @@ dependencies = [ "rand", "rand_xorshift", "rustc-demangle", + "safety", "std_detect", "unwind", "wasi", @@ -346,6 +403,27 @@ dependencies = [ "rustc-std-workspace-core", ] +[[package]] +name = "syn" +version = "1.0.109" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" +dependencies = [ + "proc-macro2", + "unicode-ident", +] + +[[package]] +name = "syn" +version = "2.0.104" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "17b6f705963418cdb9927482fa304bc562ece2fdd4f616084c50b7023b435a40" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + [[package]] name = "sysroot" version = "0.0.0" @@ -366,6 +444,12 @@ dependencies = [ "std", ] +[[package]] +name = "unicode-ident" +version = "1.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512" + [[package]] name = "unicode-width" version = "0.2.1" @@ -398,6 +482,12 @@ dependencies = [ "rustc-std-workspace-core", ] +[[package]] +name = "version_check" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" + [[package]] name = "wasi" version = "0.11.1+wasi-snapshot-preview1" diff --git a/testable-simd-models/README.md b/testable-simd-models/README.md index d051de6145f4a..470c51072c8e5 100644 --- a/testable-simd-models/README.md +++ b/testable-simd-models/README.md @@ -6,26 +6,132 @@ The structure of this crate is based on [rust-lang/stdarch/crates/core_arch](htt ## Code Structure Within the `core_arch` folder in this crate, there is a different -folder for each architecture for which we have wrtten models. +folder for each architecture for which we have written models. In particular, it contains folders for `x86` and `arm_shared`. -Each such folder has 3 sub-folders, `models`, `tests`, and `specs`. +Each such folder has 2 sub-folders: `models` and `tests`. -The `models` folder contains the models of the intrinsics, with a file -corresponding to different target features, and are written using the -various abstractions implementedin `crate::abstractions`, especially -those in `crate::abstractions::simd`. These models are meant to -closely resemble their implementations within the Rust core itself. +The `models` folder contains the models of the intrinsics, with +different files for different target features (e.g. `sse2`, `avx2` +etc.). 
The code in this folder is written using the various +abstractions implemented in `abstractions`, especially those in +`abstractions::simd`. These models are meant to closely +resemble their implementations within the Rust core itself. The `tests` folder contains the tests of these models, and is -structured the same way as `models`. Each file additionally contains +structured the same way as `models`. Each file additionally includes the definition of a macro that makes writing these tests easier. The tests work by testing the models against the intrinsics in the Rust core, trying out random inputs (generally 1000), and comparing their outputs. -## Modeling Process -The process of adding a specific intrinsic's model goes as follows. -For this example, let us say the intrinsic we are adding is +All tests can be run by executing `cargo test`, and we expect this to be +run as part of CI. + +## Modeling a SIMD Intrinsic + +There are three kinds of SIMD intrinsics in `core::arch`. + +The first kind are builtin Rust compiler intrinsics, some of which are +in the [`intrinsics/simd.rs` file](https://github.com/model-checking/verify-rust-std/blob/main/library/core/src/intrinsics/simd.rs) +in the `core` crate, and others are in the [`simd.rs` file of `core_arch`](https://github.com/model-checking/verify-rust-std/blob/main/library/stdarch/crates/core_arch/src/simd.rs). +These builtin intrinsics define generic SIMD operations that the Rust compiler knows how to implement on each platform. + +The second kind are `extern` intrinsics that are links to definitions in LLVM. +See, for example, [this list](https://github.com/rust-lang/stdarch/blob/master/crates/core_arch/src/x86/avx2.rs#L3596C8-L3596C14) +of `extern` intrinsics used in the Intel x86 AVX2 library. +These extern intrinsics are typically platform-specific functions that map to low-level instructions. + +The third kind are `defined` intrinsics that are given proper definitions in Rust, and their code may +depend on the builtin intrinsics or the extern intrinsics. These defined intrinsics represent higher-level +operations that are wrappers around one or more assembly instructions. + +### Modeling builtin intrinsics manually + +We model all three kinds of intrinsics, but in slightly different +ways. For the builtin intrinsics, we can write implementations once +and for all, and to this end, we use a library within the +`abstractions/simd.rs` file, where we copy the signatures of the +intrinsics from Rust but give them our own implementation. In +particular, we model each SIMD vector as an array of scalars, and +define each generic operation as a function over such arrays. This can +be seen as a reference implementation of the builtin intrinsics. + +Hence, for example, the SIMD add intrinsic `simd_add` is modeled as follows: +it takes two arrays of machine integers and adds them pointwise using a +`wrapping_add` operation: + +```rust +pub fn simd_add<T: MachineInteger, const N: u32>( + x: FunArray<T, N>, + y: FunArray<T, N>, +) -> FunArray<T, N> { + FunArray::from_fn(|i| (x[i].wrapping_add(y[i]))) +} +``` + +Notably, we model a strongly typed version of `simd_add`, in contrast to the compiler +intrinsic, which is too generic and unimplementable in safe Rust: + +```rust +/// Adds two simd vectors elementwise. +/// +/// `T` must be a vector of integers or floats.
+#[rustc_intrinsic] +#[rustc_nounwind] +pub unsafe fn simd_add<T>(x: T, y: T) -> T; +``` + +The main rules for writing these models are that they should be simple and self-contained, +relying only on the libraries in `abstractions`, on builtin Rust language features, or +other testable models. In particular, they should not themselves directly call Rust libraries +or external crates without going through the abstractions API. + + +### Modeling extern intrinsics manually + +For each file in `core::arch`, we split the code into extern +intrinsics that must be modeled by hand and defined intrinsics whose +models can be derived semi-automatically. The extern intrinsics are +placed in a module suffixed with `_handwritten`. Hence, for example, +the extern intrinsics used in `avx2.rs` can be found in `avx2_handwritten.rs`. + +Modeling extern intrinsics is similar to modeling the builtin ones, +in that the models are written by hand and treat the SIMD vectors +as arrays of machine integers. The main difference is that these intrinsics +are platform-specific and so their modeling requires looking at the Intel or ARM +documentation for the underlying operation. + +For example, the extern intrinsic `phaddw` used in `avx2` corresponds to an +Intel instruction called "Packed Horizontal Add" and is used in AVX2 intrinsics +like `_mm256_hadd_epi16` documented [here](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hadd_epi16&ig_expand=3667_). +By inspecting the Intel documentation, we can write a Rust model for it +as follows: + +```rust +pub fn phaddw(a: i16x16, b: i16x16) -> i16x16 { + i16x16::from_fn(|i| { + if i < 4 { + a[2 * i].wrapping_add(a[2 * i + 1]) + } else if i < 8 { + b[2 * (i - 4)].wrapping_add(b[2 * (i - 4) + 1]) + } else if i < 12 { + a[2 * (i - 4)].wrapping_add(a[2 * (i - 4) + 1]) + } else { + b[2 * (i - 8)].wrapping_add(b[2 * (i - 8) + 1]) + } + }) +} +``` + +### Modeling defined intrinsics semi-automatically + +To model a defined intrinsic, we essentially copy the Rust code of +the intrinsic from `core::arch` and adapt it to use our underlying abstractions. The +changes needed to the code are sometimes scriptable, and indeed most +of our models were generated from a script, but some changes are still +needed by hand. + +For example, let us say the intrinsic we are modeling is +`_mm256_bsrli_epi128` from the avx2 feature set. 1. We go to [rust-lang/stdarch/crates/core_arch/src/x86/](https://github.com/rust-lang/stdarch/tree/master/crates/core_arch/src/x86/), and find the implementation of the intrinsic in `avx2.rs`. @@ -67,56 +173,49 @@ pub fn _mm256_bsrli_epi128<const IMM8: i32>(a: __m256i) -> __m256i { transmute(r) } } - ``` -Thus, we then go to to `core_arch/x86/models/avx2.rs`, and add the implementation. After some modification, it ends up looking like this. -``` rust -/// Shifts 128-bit lanes in `a` right by `imm8` bytes while shifting in zeros. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_bsrli_epi128) -pub fn _mm256_bsrli_epi128<const IMM8: i32>(a: __m256i) -> __m256i { - const fn mask(shift: i32, i: u32) -> u64 { - let shift = shift as u32 & 0xff; - if shift > 15 || (15 - (i % 16)) < shift { - 0 as u64 - } else { - (32 + (i + shift)) as u64 - } - } - - let a = BitVec::to_i8x32(a); - let r: i8x32 = simd_shuffle( - i8x32::from_fn(|_| 0), - a, - [ - mask(IMM8, 0), - mask(IMM8, 1), - mask(IMM8, 2), - mask(IMM8, 3), - ...
- mask(IMM8, 31), - ], - ); - r.into() -} - ``` +``` +Thus, we then go to `core_arch/x86/models/avx2.rs`, and add this implementation. +The only change it requires here is that the `simd_shuffle` macro is a function in our model, +and we discard all the function attributes. + +For other intrinsics, we sometimes need to make more changes. Since our model of the builtin intrinsics +is more precise concerning the type of their arguments compared to their Rust counterparts, we +sometimes need to add more type annotations in our defined models. We also remove all `unsafe` guards, +since our models are always in safe Rust. Otherwise, our code for the defined intrinsics looks very +similar to the upstream code in `core::arch`. -3. Next, we add a test for this intrinsic. For this, we navigate to `core_arch/avx2/tests/avx2.rs`. Since the value of - `IMM8` can be up to 8 bits, we want to test constant arguments up to 255. Thus, we write the following macro invocation. +3. Next, we add a test for this intrinsic in `core_arch/x86/tests/avx2.rs`. For convenience, we have defined a `mk!` macro, which can be used to automatically generate + tests. The generated test runs both the model and the intrinsic in upstream `core::arch` on a number of random inputs (by default, 1000) and compares their outputs. A valid test of the intrinsic above looks like this. ```rust mk!([100]_mm256_bsrli_epi128{<0>,<1>,<2>,<3>,...,<255>}(a: BitVec)); ``` - Here, the `[100]` means we test 100 random inputs for each constant value. This concludes the necessary steps for implementing an intrinsic. + The macro invocation has four parts. + 1. `mk!([100]...`: By default, the macro tests a thousand randomly generated inputs. If needed, this can be modified, such as here, where the `[100]` is used so that + only 100 inputs are generated. + 2. `_mm256_bsrli_epi128`: This is the name of the intrinsic being tested, and is necessary in all cases. + 3. `{<0>,<1>,<2>,<3>,...,<255>}`: This part only appears when the intrinsic has a const generic argument, like the `IMM8` in this intrinsic. + As the name indicates, this constant argument is supposed to be at most 8 bits wide. + We can confirm this by looking at the implementation and spotting the `static_assert_uimm_bits!(IMM8, 8);` + line, which asserts that the constant argument is positive and fits in 8 bits. Thus, we add `{<0>,<1>,<2>,<3>,...,<255>}` to test every possible + value of the constant argument. + 4. `(a: BitVec)`: This part contains all the arguments of the intrinsic and their types. + + This summarizes the steps needed to use the `mk!` macro to generate a test. There is a caveat: in the case that the output of an intrinsic is _not_ + a bit-vector (and is instead, say, an integer like `i32`), then the macro will not work, and a manual test has to be written. You can see examples in the test files. + ## Contributing Models To contribute new models of intrinsics, we expect the author to follow the above steps and provide comprehensive tests. It is important that -the model author look carefully at both the Intel/ARM specification -and the Rust `stdarch` implementation, because the Rust implementation -may not necessarily be correct.
Indeed, the previous implementation of `_mm256_bsrli_epi128` (and a similar intrinsic called `_mm512_bsrli_epi128`) in `stdarch` had a bug, which we found during the process of modeling and testing this diff --git a/testable-simd-models/src/abstractions/bit.rs b/testable-simd-models/src/abstractions/bit.rs index 4fac19fdcd567..f8b67f2ca20f1 100644 --- a/testable-simd-models/src/abstractions/bit.rs +++ b/testable-simd-models/src/abstractions/bit.rs @@ -20,7 +20,7 @@ //! use testable_simd_models::abstractions::bit::{Bit, MachineInteger}; //! //! // Extract the 3rd bit (0-indexed) from an integer. -//! let bit = Bit::of_int(42, 2); +//! let bit = Bit::nth_bit(42, 2); //! println!("The extracted bit is: {:?}", bit); //! //! // Convert Bit to a primitive integer type. @@ -68,6 +68,16 @@ impl std::ops::BitXor for Bit { } } +impl std::ops::Not for Bit { + type Output = Self; + fn not(self) -> Self { + match self { + Bit::One => Bit::Zero, + Bit::Zero => Bit::One, + } + } +} + impl std::ops::Neg for Bit { type Output = Self; fn neg(self) -> Self { @@ -106,11 +116,11 @@ impl From for Bit { } } -/// A trait for types that represent machine integers. -pub trait MachineInteger { - /// The size of this integer type in bits. - fn bits() -> u32; +/// A trait for integers and floats +pub trait MachineNumeric { + /// The size of this integer type in bits. + const BITS: u32; /// The signedness of this integer type. const SIGNED: bool; /// Element of the integer type with every bit as 0. @@ -121,7 +131,14 @@ pub trait MachineInteger { const MIN: Self; /// Maximum value of the integer type. const MAX: Self; + /// Raw transmutation of bits to u128 + fn to_u128(self) -> u128; + /// Raw transmutation of bits from u128 + fn from_u128(x: u128) -> Self; +} +/// A trait for types that represent machine integers. +pub trait MachineInteger: MachineNumeric { /// Implements functionality for `simd_add` in `crate::abstractions::simd`. fn wrapping_add(self, rhs: Self) -> Self; /// Implements functionality for `simd_sub` in `crate::abstractions::simd`. @@ -133,28 +150,32 @@ pub trait MachineInteger { /// Implements functionality for `simd_saturating_sub` in `crate::abstractions::simd`. fn saturating_sub(self, rhs: Self) -> Self; /// Implements functionality for `simd_abs_diff` in `crate::abstractions::simd`. - fn absolute_diff(self, rhs: Self) -> Self; + fn wrapping_abs_diff(self, rhs: Self) -> Self; /// Implements functionality for `simd_abs` in `crate::abstractions::simd`. - fn absolute_val(self) -> Self; + fn wrapping_abs(self) -> Self; } macro_rules! 
generate_imachine_integer_impls { ($($ty:ident),*) => { $( - impl MachineInteger for $ty { + impl MachineNumeric for $ty { + const BITS: u32 = $ty::BITS; const SIGNED: bool = true; const ZEROS: $ty = 0; const ONES: $ty = -1; const MIN: $ty = $ty::MIN; const MAX: $ty = $ty::MAX; - fn bits() -> u32 { $ty::BITS } + fn to_u128(self) -> u128 {self as u128} + fn from_u128(x:u128) -> Self {x as $ty} + } + impl MachineInteger for $ty { fn wrapping_add(self, rhs: Self) -> Self { self.wrapping_add(rhs) } fn wrapping_sub(self, rhs: Self) -> Self { self.wrapping_sub(rhs) } fn overflowing_mul(self, rhs: Self) -> Self { self.overflowing_mul(rhs).0 } fn saturating_add(self, rhs: Self) -> Self { self.saturating_add(rhs)} fn saturating_sub(self, rhs: Self) -> Self { self.saturating_sub(rhs) } - fn absolute_diff(self, rhs: Self) -> Self {if self > rhs {$ty::wrapping_sub(self, rhs)} else {$ty::wrapping_sub(rhs, self)}} - fn absolute_val(self) -> Self {if self == $ty::MIN {self} else {self.abs()}} + fn wrapping_abs_diff(self, rhs: Self) -> Self {if self > rhs {$ty::wrapping_sub(self, rhs)} else {$ty::wrapping_sub(rhs, self)}} + fn wrapping_abs(self) -> Self {if self == $ty::MIN {self} else {self.abs()}} })* }; } @@ -162,43 +183,66 @@ macro_rules! generate_imachine_integer_impls { macro_rules! generate_umachine_integer_impls { ($($ty:ident),*) => { $( - impl MachineInteger for $ty { + impl MachineNumeric for $ty { + const BITS: u32 = $ty::BITS; const SIGNED: bool = false; const ZEROS: $ty = 0; const ONES: $ty = $ty::MAX; const MIN: $ty = $ty::MIN; const MAX: $ty = $ty::MAX; - - - fn bits() -> u32 { $ty::BITS } + fn to_u128(self) -> u128 {self as u128} + fn from_u128(x:u128) -> Self {x as $ty} + } + impl MachineInteger for $ty { fn wrapping_add(self, rhs: Self) -> Self { self.wrapping_add(rhs) } fn wrapping_sub(self, rhs: Self) -> Self { self.wrapping_sub(rhs) } fn overflowing_mul(self, rhs: Self) -> Self { self.overflowing_mul(rhs).0 } fn saturating_add(self, rhs: Self) -> Self { self.saturating_add(rhs)} fn saturating_sub(self, rhs: Self) -> Self { self.saturating_sub(rhs)} - fn absolute_diff(self, rhs: Self) -> Self {if self > rhs {self - rhs} else {rhs - self}} - fn absolute_val(self) -> Self {self} + fn wrapping_abs_diff(self, rhs: Self) -> Self {if self > rhs {self - rhs} else {rhs - self}} + fn wrapping_abs(self) -> Self {self} })* }; } generate_imachine_integer_impls!(i8, i16, i32, i64, i128); generate_umachine_integer_impls!(u8, u16, u32, u64, u128); +impl MachineNumeric for f32 { + const BITS: u32 = 32; + const SIGNED: bool = false; + const ZEROS: f32 = 0.0; + const ONES: f32 = f32::from_bits(0xffffffffu32); + const MIN: f32 = f32::MIN; + const MAX: f32 = f32::MAX; + fn to_u128(self) -> u128 { + self.to_bits() as u128 + } + fn from_u128(x: u128) -> Self { + f32::from_bits(x as u32) + } +} + +impl MachineNumeric for f64 { + const BITS: u32 = 64; + const SIGNED: bool = false; + const ZEROS: f64 = 0.0; + const ONES: f64 = f64::from_bits(0xffffffffffffffffu64); + const MIN: f64 = f64::MIN; + const MAX: f64 = f64::MAX; + fn to_u128(self) -> u128 { + self.to_bits() as u128 + } + fn from_u128(x: u128) -> Self { + f64::from_bits(x as u64) + } +} + impl Bit { - fn of_raw_int(x: u128, nth: u32) -> Self { - if x / 2u128.pow(nth) % 2 == 1 { + pub fn nth_bit<T: MachineNumeric>(x: T, nth: usize) -> Self { + if (x.to_u128() >> nth) % 2 == 1 { Self::One } else { Self::Zero } } - - pub fn of_int<T: Into<i128> + MachineInteger>(x: T, nth: u32) -> Bit { - let x: i128 = x.into(); - if x >= 0 { - Self::of_raw_int(x as u128, nth) - } else { - 
Self::of_raw_int((2i128.pow(T::bits()) + x) as u128, nth) - } - } } diff --git a/testable-simd-models/src/abstractions/bitvec.rs b/testable-simd-models/src/abstractions/bitvec.rs index 0f3003f4beadc..ac73749482e37 100644 --- a/testable-simd-models/src/abstractions/bitvec.rs +++ b/testable-simd-models/src/abstractions/bitvec.rs @@ -1,5 +1,5 @@ //! This module provides a specification-friendly bit vector type. -use super::bit::{Bit, MachineInteger}; +use super::bit::{Bit, MachineNumeric}; use super::funarr::*; use std::fmt::Formatter; @@ -15,7 +15,14 @@ use std::fmt::Formatter; /// making the bit pattern more human-readable. The type also implements indexing, /// allowing for easy access to individual bits. #[derive(Copy, Clone, Eq, PartialEq)] -pub struct BitVec(FunArray); +pub struct BitVec(FunArray); + +impl BitVec { + #[allow(non_snake_case)] + pub fn ZERO() -> Self { + Self::from_fn(|_| Bit::Zero) + } +} /// Pretty prints a bit slice by group of 8 fn bit_slice_to_string(bits: &[Bit]) -> String { @@ -33,15 +40,15 @@ fn bit_slice_to_string(bits: &[Bit]) -> String { .into() } -impl core::fmt::Debug for BitVec { +impl core::fmt::Debug for BitVec { fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), std::fmt::Error> { write!(f, "{}", bit_slice_to_string(&self.0.as_vec())) } } -impl core::ops::Index for BitVec { +impl core::ops::Index for BitVec { type Output = Bit; - fn index(&self, index: u64) -> &Self::Output { + fn index(&self, index: u32) -> &Self::Output { self.0.get(index) } } @@ -56,50 +63,46 @@ fn u128_int_from_bit_slice(bits: &[Bit]) -> u128 { } /// Convert a bit slice into a machine integer of type `T`. -fn int_from_bit_slice + MachineInteger + Copy>(bits: &[Bit]) -> T { - debug_assert!(bits.len() <= T::bits() as usize); +fn int_from_bit_slice(bits: &[Bit]) -> T { + debug_assert!(bits.len() <= T::BITS as usize); let result = if T::SIGNED { - let is_negative = matches!(bits[T::bits() as usize - 1], Bit::One); - let s = u128_int_from_bit_slice(&bits[0..T::bits() as usize - 1]) as i128; + let is_negative = matches!(bits[T::BITS as usize - 1], Bit::One); + let s = u128_int_from_bit_slice(&bits[0..T::BITS as usize - 1]) as i128; if is_negative { - s + (-2i128).pow(T::bits() - 1) + s + (-2i128).pow(T::BITS - 1) } else { s } } else { u128_int_from_bit_slice(bits) as i128 }; - let Ok(n) = result.try_into() else { - // Conversion must succeed as `result` is guaranteed to be in range due to the bit-length check. - unreachable!() - }; - n + T::from_u128(result as u128) } -impl BitVec { +impl BitVec { /// Constructor for BitVec. `BitVec::::from_fn` constructs a bitvector out of a function that takes usizes smaller than `N` and produces bits. - pub fn from_fn Bit>(f: F) -> Self { + pub fn from_fn Bit>(f: F) -> Self { Self(FunArray::from_fn(f)) } /// Convert a slice of machine integers where only the `d` least significant bits are relevant. - pub fn from_slice + MachineInteger + Copy>(x: &[T], d: u64) -> Self { - Self::from_fn(|i| Bit::of_int::(x[(i / d) as usize], (i % d) as u32)) + pub fn from_slice(x: &[T], d: u32) -> Self { + Self::from_fn(|i| Bit::nth_bit::(x[(i / d) as usize], (i % d) as usize)) } /// Construct a BitVec out of a machine integer. - pub fn from_int + MachineInteger + Copy>(n: T) -> Self { - Self::from_slice::(&[n], T::bits() as u64) + pub fn from_int(n: T) -> Self { + Self::from_slice::(&[n], T::BITS as u32) } /// Convert a BitVec into a machine integer of type `T`. 
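To make the integer/bit-vector round-trip concrete, here is a small sketch (not part of the patch) that exercises `BitVec::from_int`, bit indexing, and `to_int`, assuming the crate is used under its `testable_simd_models` name as in the README example:

```rust
use testable_simd_models::abstractions::{bit::Bit, bitvec::BitVec};

fn main() {
    // 42 = 0b101010: bit 0 is clear, bit 1 is set.
    let bv = BitVec::<32>::from_int(42i32);
    assert_eq!(bv[0], Bit::Zero);
    assert_eq!(bv[1], Bit::One);
    // `to_int` folds the 32 bits back into the original integer.
    assert_eq!(bv.to_int::<i32>(), 42i32);
}
```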
- pub fn to_int + MachineInteger + Copy>(self) -> T { + pub fn to_int(self) -> T { int_from_bit_slice(&self.0.as_vec()) } /// Convert a BitVec into a vector of machine integers of type `T`. - pub fn to_vec + MachineInteger + Copy>(&self) -> Vec { + pub fn to_vec(&self) -> Vec { self.0 .as_vec() - .chunks(T::bits() as usize) + .chunks(T::BITS as usize) .map(int_from_bit_slice) .collect() } @@ -115,12 +118,12 @@ impl BitVec { } } -impl BitVec { - pub fn chunked_shift( +impl BitVec { + pub fn chunked_shift( self, shl: FunArray, ) -> BitVec { - fn chunked_shift( + fn chunked_shift( bitvec: BitVec, shl: FunArray, ) -> BitVec { @@ -134,7 +137,7 @@ impl BitVec { }; let local_index = (nth_bit as i128).wrapping_sub(shift); if local_index < CHUNK as i128 && local_index >= 0 { - let local_index = local_index as u64; + let local_index = local_index as u32; bitvec[nth_chunk * CHUNK + local_index] } else { Bit::Zero diff --git a/testable-simd-models/src/abstractions/funarr.rs b/testable-simd-models/src/abstractions/funarr.rs index 4c120addcb0c5..4026efb66c1f5 100644 --- a/testable-simd-models/src/abstractions/funarr.rs +++ b/testable-simd-models/src/abstractions/funarr.rs @@ -1,23 +1,25 @@ //! This module implements a fixed-size array wrapper with functional semantics //! which are used in formulating abstractions. +use crate::abstractions::bit::MachineNumeric; + /// `FunArray` represents an array of `T` values of length `N`, where `N` is a compile-time constant. /// Internally, it uses a fixed-length array of `Option` with a maximum capacity of 512 elements. /// Unused elements beyond `N` are filled with `None`. #[derive(Copy, Clone, Eq, PartialEq)] -pub struct FunArray([Option; 512]); +pub struct FunArray([Option; 512]); -impl FunArray { +impl FunArray { /// Gets a reference to the element at index `i`. - pub fn get(&self, i: u64) -> &T { + pub fn get(&self, i: u32) -> &T { self.0[i as usize].as_ref().unwrap() } /// Constructor for FunArray. `FunArray::from_fn` constructs a funarray out of a function that takes usizes smaller than `N` and produces an element of type T. 
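As a quick illustration of these functional-array semantics (a sketch, not from the patch): `FunArray::from_fn` tabulates `f(i)` for every index below `N`, and indexing reads the elements back.

```rust
use testable_simd_models::abstractions::funarr::FunArray;

fn main() {
    // An 8-element array whose i-th element is i squared.
    let squares: FunArray<8, u32> = FunArray::from_fn(|i| i * i);
    assert_eq!(squares[3], 9);
    assert_eq!(squares.as_vec(), vec![0, 1, 4, 9, 16, 25, 36, 49]);
}
```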
- pub fn from_fn T>(f: F) -> Self { + pub fn from_fn T>(f: F) -> Self { // let vec = (0..N).map(f).collect(); let arr = core::array::from_fn(|i| { - if (i as u64) < N { - Some(f(i as u64)) + if (i as u32) < N { + Some(f(i as u32)) } else { None } @@ -53,10 +55,17 @@ impl FunArray { } } -impl TryFrom> for FunArray { +impl FunArray { + #[allow(non_snake_case)] + pub fn ZERO() -> Self { + Self::from_fn(|_| T::ZEROS) + } +} + +impl TryFrom> for FunArray { type Error = (); fn try_from(v: Vec) -> Result { - if (v.len() as u64) < N { + if (v.len() as u32) < N { Err(()) } else { Ok(Self::from_fn(|i| v[i as usize].clone())) @@ -64,16 +73,113 @@ impl TryFrom> for FunArray { } } -impl core::fmt::Debug for FunArray { +impl core::fmt::Debug for FunArray { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { write!(f, "{:?}", self.as_vec()) } } -impl core::ops::Index for FunArray { +impl core::ops::Index for FunArray { type Output = T; - fn index(&self, index: u64) -> &Self::Output { + fn index(&self, index: u32) -> &Self::Output { self.get(index) } } + +impl FunArray<1, T> { + pub fn new(x: T) -> Self { + let v = [x]; + Self::from_fn(|i| v[i as usize]) + } +} + +impl FunArray<2, T> { + pub fn new(x0: T, x1: T) -> Self { + let v = [x0, x1]; + Self::from_fn(|i| v[i as usize]) + } +} + +impl FunArray<4, T> { + pub fn new(x0: T, x1: T, x2: T, x3: T) -> Self { + let v = [x0, x1, x2, x3]; + Self::from_fn(|i| v[i as usize]) + } +} + +impl FunArray<8, T> { + pub fn new(x0: T, x1: T, x2: T, x3: T, x4: T, x5: T, x6: T, x7: T) -> Self { + let v = [x0, x1, x2, x3, x4, x5, x6, x7]; + Self::from_fn(|i| v[i as usize]) + } +} + +impl FunArray<16, T> { + pub fn new( + x0: T, + x1: T, + x2: T, + x3: T, + x4: T, + x5: T, + x6: T, + x7: T, + x8: T, + x9: T, + x10: T, + x11: T, + x12: T, + x13: T, + x14: T, + x15: T, + ) -> Self { + let v = [ + x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, + ]; + Self::from_fn(|i| v[i as usize]) + } +} + +impl FunArray<32, T> { + pub fn new( + x0: T, + x1: T, + x2: T, + x3: T, + x4: T, + x5: T, + x6: T, + x7: T, + x8: T, + x9: T, + x10: T, + x11: T, + x12: T, + x13: T, + x14: T, + x15: T, + x16: T, + x17: T, + x18: T, + x19: T, + x20: T, + x21: T, + x22: T, + x23: T, + x24: T, + x25: T, + x26: T, + x27: T, + x28: T, + x29: T, + x30: T, + x31: T, + ) -> Self { + let v = [ + x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16, x17, x18, + x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, x30, x31, + ]; + Self::from_fn(|i| v[i as usize]) + } +} diff --git a/testable-simd-models/src/abstractions/mod.rs b/testable-simd-models/src/abstractions/mod.rs index b3018a8189569..4f840ab60235d 100644 --- a/testable-simd-models/src/abstractions/mod.rs +++ b/testable-simd-models/src/abstractions/mod.rs @@ -24,3 +24,4 @@ pub mod bit; pub mod bitvec; pub mod funarr; pub mod simd; +pub mod utilities; diff --git a/testable-simd-models/src/abstractions/simd.rs b/testable-simd-models/src/abstractions/simd.rs index 08b1b21bce34d..70e0556618288 100644 --- a/testable-simd-models/src/abstractions/simd.rs +++ b/testable-simd-models/src/abstractions/simd.rs @@ -2,7 +2,7 @@ //! //! Operations are defined on FunArrs. -use crate::abstractions::{bit::MachineInteger, bitvec::*, funarr::*}; +use crate::abstractions::{bit::*, bitvec::*, funarr::*}; use std::convert::*; use std::ops::*; @@ -20,11 +20,16 @@ macro_rules! 
interpretations { #[doc = concat!("Conversion from ", stringify!($ty), " vectors of size ", stringify!($m), "to bit vectors of size ", stringify!($n))] pub fn [< from_ $name >](iv: $name) -> BitVec<$n> { let vec: Vec<$ty> = iv.as_vec(); - Self::from_slice(&vec[..], <$ty>::bits() as u64) + Self::from_slice(&vec[..], <$ty>::BITS as u32) } #[doc = concat!("Conversion from bit vectors of size ", stringify!($n), " to ", stringify!($ty), " vectors of size ", stringify!($m))] pub fn [< to_ $name >](bv: BitVec<$n>) -> $name { let vec: Vec<$ty> = bv.to_vec(); + $name::from_fn(|i| vec[i as usize]) + } + #[doc = concat!("Conversion from bit vectors of size ", stringify!($n), " to ", stringify!($ty), " vectors of size ", stringify!($m))] + pub fn [< as_ $name >](self) -> $name { + let vec: Vec<$ty> = self.to_vec(); $name::from_fn(|i| vec[i as usize]) } @@ -57,22 +62,20 @@ macro_rules! interpretations { } interpretations!(256; i32x8 [i32; 8], i64x4 [i64; 4], i16x16 [i16; 16], i128x2 [i128; 2], i8x32 [i8; 32], - u32x8 [u32; 8], u64x4 [u64; 4], u16x16 [u16; 16], u8x32 [u8; 32]); + u32x8 [u32; 8], u64x4 [u64; 4], u16x16 [u16; 16], u8x32 [u8; 32], f32x8 [f32; 8], f64x4 [f64; 4]); interpretations!(128; i32x4 [i32; 4], i64x2 [i64; 2], i16x8 [i16; 8], i128x1 [i128; 1], i8x16 [i8; 16], - u32x4 [u32; 4], u64x2 [u64; 2], u16x8 [u16; 8], u8x16 [u8; 16]); + u32x4 [u32; 4], u64x2 [u64; 2], u16x8 [u16; 8], u8x16 [u8; 16], f32x4 [f32; 4], f64x2 [f64; 2]); interpretations!(512; u32x16 [u32; 16], u16x32 [u16; 32], i32x16 [i32; 16], i16x32 [i16; 32]); -interpretations!(64; i64x1 [i64; 1], i32x2 [i32; 2], i16x4 [i16; 4], i8x8 [i8; 8], u64x1 [u64; 1], u32x2 [u32; 2],u16x4 [u16; 4], u8x8 [u8; 8]); +interpretations!(64; i64x1 [i64; 1], i32x2 [i32; 2], i16x4 [i16; 4], i8x8 [i8; 8], u64x1 [u64; 1], u32x2 [u32; 2],u16x4 [u16; 4], u8x8 [u8; 8], f32x2 [f32; 2], f64x1 [f64; 1]); interpretations!(32; i8x4 [i8; 4], u8x4 [u8; 4]); - /// Inserts an element into a vector, returning the updated vector. /// /// # Safety /// /// `idx` must be in-bounds of the vector, ie. idx < N - -pub fn simd_insert(x: FunArray, idx: u64, val: T) -> FunArray { +pub fn simd_insert(x: FunArray, idx: u32, val: T) -> FunArray { FunArray::from_fn(|i| if i == idx { val } else { x[i] }) } @@ -81,49 +84,49 @@ pub fn simd_insert(x: FunArray, idx: u64, val: T) - /// # Safety /// /// `idx` must be in-bounds of the vector, ie. idx < N -pub fn simd_extract(x: FunArray, idx: u64) -> T { +pub fn simd_extract(x: FunArray, idx: u32) -> T { x.get(idx).clone() } /// Adds two vectors elementwise with wrapping on overflow/underflow. -pub fn simd_add( +pub fn simd_add( x: FunArray, y: FunArray, ) -> FunArray { - FunArray::from_fn(|i| (x[i].wrapping_add(y[i]))) + FunArray::from_fn(|i| x[i].wrapping_add(y[i])) } /// Subtracts `rhs` from `lhs` elementwise with wrapping on overflow/underflow. -pub fn simd_sub( +pub fn simd_sub( x: FunArray, y: FunArray, ) -> FunArray { - FunArray::from_fn(|i| (x[i].wrapping_sub(y[i]))) + FunArray::from_fn(|i| x[i].wrapping_sub(y[i])) } /// Multiplies two vectors elementwise with wrapping on overflow/underflow. -pub fn simd_mul( +pub fn simd_mul( x: FunArray, y: FunArray, ) -> FunArray { - FunArray::from_fn(|i| (x[i].overflowing_mul(y[i]))) + FunArray::from_fn(|i| x[i].overflowing_mul(y[i])) } /// Produces the elementwise absolute values. /// For vectors of unsigned integers it returns the vector untouched. /// If the element is the minimum value of a signed integer, it returns the element as is. 
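Concretely, the `i8::MIN` edge case described above can be exercised like this (an illustrative sketch using the model's own vector types):

```rust
use testable_simd_models::abstractions::simd::*;

fn main() {
    let v = i8x4::from_fn(|i| [i8::MIN, -5, 0, 5][i as usize]);
    // i8::MIN has no positive counterpart, so the model returns it unchanged
    // instead of panicking the way `i8::abs` would in a debug build.
    assert_eq!(simd_abs(v).as_vec(), vec![i8::MIN, 5, 0, 5]);
}
```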
-pub fn simd_abs(x: FunArray) -> FunArray { - FunArray::from_fn(|i| x[i].absolute_val()) +pub fn simd_abs(x: FunArray) -> FunArray { + FunArray::from_fn(|i| x[i].wrapping_abs()) } /// Produces the elementwise absolute difference of two vectors. /// Note: Absolute difference in this case is simply the element with the smaller value subtracted from the element with the larger value, with overflow/underflow. /// For example, if the elements are i8, the absolute difference of 255 and -2 is -255. -pub fn simd_abs_diff( +pub fn simd_abs_diff( x: FunArray, y: FunArray, ) -> FunArray { - FunArray::from_fn(|i| (x[i].absolute_diff(y[i]))) + FunArray::from_fn(|i| x[i].wrapping_abs_diff(y[i])) } /// Shifts vector left elementwise, with UB on overflow. @@ -131,11 +134,11 @@ pub fn simd_abs_diff( /// # Safety /// /// Each element of `rhs` must be less than `::BITS`. -pub fn simd_shl( +pub fn simd_shl( x: FunArray, y: FunArray, ) -> FunArray::Output> { - FunArray::from_fn(|i| (x[i] << y[i])) + FunArray::from_fn(|i| x[i] << y[i]) } /// Shifts vector right elementwise, with UB on overflow. @@ -146,38 +149,38 @@ pub fn simd_shl( /// /// Each element of `rhs` must be less than `::BITS`. -pub fn simd_shr( +pub fn simd_shr( x: FunArray, y: FunArray, ) -> FunArray::Output> { - FunArray::from_fn(|i| (x[i] >> y[i])) + FunArray::from_fn(|i| x[i] >> y[i]) } /// "Ands" vectors elementwise. -pub fn simd_and( +pub fn simd_and( x: FunArray, y: FunArray, ) -> FunArray::Output> { - FunArray::from_fn(|i| (x[i] & y[i])) + FunArray::from_fn(|i| x[i] & y[i]) } /// "Ors" vectors elementwise. -pub fn simd_or( +pub fn simd_or( x: FunArray, y: FunArray, ) -> FunArray::Output> { - FunArray::from_fn(|i| (x[i] | y[i])) + FunArray::from_fn(|i| x[i] | y[i]) } /// "Exclusive ors" vectors elementwise. -pub fn simd_xor( +pub fn simd_xor( x: FunArray, y: FunArray, ) -> FunArray::Output> { - FunArray::from_fn(|i| (x[i] ^ y[i])) + FunArray::from_fn(|i| x[i] ^ y[i]) } pub trait CastsFrom { @@ -193,7 +196,7 @@ macro_rules! from_impls{ $( impl CastsFrom<$ty2> for $ty1 { fn cast(a: $ty2) -> $ty1 { - <$ty1>::from(a) + a as $ty1 } } )* @@ -275,7 +278,13 @@ from_impls!( [i128, i8], [i128, i16], [i128, i32], - [i128, i64] + [i128, i64], + [f64, u32], + [f64, i32], + [f32, u32], + [f32, i32], + [f32, f64], + [f64, f32] ); truncate_from_impls!( [u8, u16], @@ -327,7 +336,7 @@ self_impls!(u8, u16, u32, u64, u128, i8, i16, i32, i64, i128); /// /// When casting from a wider number to a smaller number, the higher bits are removed. /// Otherwise, it extends the number, following signedness. -pub fn simd_cast>(x: FunArray) -> FunArray { +pub fn simd_cast>(x: FunArray) -> FunArray { FunArray::from_fn(|i| T2::cast(x[i])) } @@ -335,7 +344,7 @@ pub fn simd_cast>(x: FunArray) /// /// Rust panics for `-::Min` due to overflow, but here, it just returns the element as is. -pub fn simd_neg::Output> + MachineInteger + Eq + Neg + Copy>( +pub fn simd_neg::Output> + MachineInteger + Eq + Neg + Copy>( x: FunArray, ) -> FunArray { FunArray::from_fn(|i| { @@ -350,7 +359,7 @@ pub fn simd_neg::Output> + MachineInteger + Eq /// /// Returns `0` (all zeros) for false and `!0` (all ones) for true. -pub fn simd_eq( +pub fn simd_eq( x: FunArray, y: FunArray, ) -> FunArray { @@ -361,7 +370,7 @@ pub fn simd_eq( /// /// Returns `0` (all zeros) for false and `!0` (all ones) for true. -pub fn simd_ne( +pub fn simd_ne( x: FunArray, y: FunArray, ) -> FunArray { @@ -372,7 +381,7 @@ pub fn simd_ne( /// /// Returns `0` (all zeros) for false and `!0` (all ones) for true. 
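For example (an illustrative sketch), on `i32x4` lanes the comparison models return `-1` (all ones) in every lane where the predicate holds:

```rust
use testable_simd_models::abstractions::simd::*;

fn main() {
    let x = i32x4::from_fn(|i| i as i32); // [0, 1, 2, 3]
    let y = i32x4::from_fn(|_| 2);        // [2, 2, 2, 2]
    // Lanes 0 and 1 satisfy x < y, so they come back as !0 == -1.
    assert_eq!(simd_lt(x, y).as_vec(), vec![-1, -1, 0, 0]);
}
```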
-pub fn simd_lt( +pub fn simd_lt( x: FunArray, y: FunArray, ) -> FunArray { @@ -383,7 +392,7 @@ pub fn simd_lt( /// /// Returns `0` (all zeros) for false and `!0` (all ones) for true. -pub fn simd_le( +pub fn simd_le( x: FunArray, y: FunArray, ) -> FunArray { @@ -394,7 +403,7 @@ pub fn simd_le( /// /// Returns `0` (all zeros) for false and `!0` (all ones) for true. -pub fn simd_gt( +pub fn simd_gt( x: FunArray, y: FunArray, ) -> FunArray { @@ -405,7 +414,7 @@ pub fn simd_gt( /// /// Returns `0` (all zeros) for false and `!0` (all ones) for true. -pub fn simd_ge( +pub fn simd_ge( x: FunArray, y: FunArray, ) -> FunArray { @@ -415,10 +424,10 @@ pub fn simd_ge( /// Shuffles two vectors by the indices in idx. /// /// For safety, `N2 <= N1 + N3` must hold. -pub fn simd_shuffle( +pub fn simd_shuffle( x: FunArray, y: FunArray, - idx: [u64; N2], + idx: [u32; N2], ) -> FunArray { FunArray::from_fn(|i| { let i = idx[i as usize]; @@ -432,7 +441,7 @@ pub fn simd_shuffle( /// Adds two vectors elementwise, with saturation. -pub fn simd_saturating_add( +pub fn simd_saturating_add( x: FunArray, y: FunArray, ) -> FunArray { @@ -441,7 +450,7 @@ pub fn simd_saturating_add( /// Subtracts `y` from `x` elementwise, with saturation. -pub fn simd_saturating_sub( +pub fn simd_saturating_sub( x: FunArray, y: FunArray, ) -> FunArray { @@ -923,7 +932,7 @@ pub(crate) use simd_bitmask_big; /// # Safety /// `mask` must only contain `0` and `!0`. -pub fn simd_select( +pub fn simd_select( mask: FunArray, if_true: FunArray, if_false: FunArray, diff --git a/testable-simd-models/src/abstractions/utilities.rs b/testable-simd-models/src/abstractions/utilities.rs new file mode 100644 index 0000000000000..86e1c0ba52de1 --- /dev/null +++ b/testable-simd-models/src/abstractions/utilities.rs @@ -0,0 +1,59 @@ +/// Converts one type to another +pub fn transmute>(a: T) -> U { + a.into() +} + +#[allow(unused)] +#[macro_export] +macro_rules! static_assert { + ($e:expr) => { + const { + assert!($e); + } + }; + ($e:expr, $msg:expr) => { + const { + assert!($e, $msg); + } + }; +} + +#[allow(unused_macros)] +#[macro_export] +macro_rules! static_assert_uimm_bits { + ($imm:ident, $bits:expr) => { + // `0 <= $imm` produces a warning if the immediate has an unsigned type + #[allow(unused_comparisons)] + { + static_assert!( + 0 <= $imm && $imm < (1 << $bits), + concat!( + stringify!($imm), + " doesn't fit in ", + stringify!($bits), + " bits", + ) + ) + } + }; +} + +#[allow(unused_macros)] +#[macro_export] +macro_rules! static_assert_simm_bits { + ($imm:ident, $bits:expr) => { + static_assert!( + (-1 << ($bits - 1)) - 1 <= $imm && $imm < (1 << ($bits - 1)), + concat!( + stringify!($imm), + " doesn't fit in ", + stringify!($bits), + " bits", + ) + ) + }; +} + +pub use static_assert; +pub use static_assert_simm_bits; +pub use static_assert_uimm_bits; diff --git a/testable-simd-models/src/core_arch/x86/models/avx.rs b/testable-simd-models/src/core_arch/x86/models/avx.rs index f392a7abf05b0..8e2fb37319d36 100644 --- a/testable-simd-models/src/core_arch/x86/models/avx.rs +++ b/testable-simd-models/src/core_arch/x86/models/avx.rs @@ -13,231 +13,1362 @@ //! [amd64_ref]: http://support.amd.com/TechDocs/24594.pdf //! 
[wiki]: https://en.wikipedia.org/wiki/Advanced_Vector_Extensions +use super::avx_handwritten::*; +use super::sse::*; +use super::sse2::*; use super::types::*; -use crate::abstractions::{bit::Bit, bitvec::BitVec, simd::*}; +use crate::abstractions::simd::*; +use crate::abstractions::utilities::*; -mod c_extern { - use crate::abstractions::simd::*; - - pub fn vperm2f128si256(a: i32x8, b: i32x8, imm8: i8) -> i32x8 { - let temp = i128x2::from_fn(|i| match (imm8 as u8) >> (i * 4) { - 0 => (a[4 * i] as i128) + 16 * (a[4 * i + 1] as i128), - 1 => (a[4 * i + 2] as i128) + 16 * (a[4 * i + 3] as i128), - 2 => (b[4 * i] as i128) + 16 * (b[4 * i + 1] as i128), - 3 => (b[4 * i + 2] as i128) + 16 * (b[4 * i + 3] as i128), - _ => unreachable!(), - }); - - i32x8::from_fn(|i| (temp[if i < 4 { 0 } else { 1 }] >> (i % 4)) as i32) +/// Adds packed double-precision (64-bit) floating-point elements +/// in `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_add_pd) +// NOTE: Not modeled yet +// pub fn _mm256_add_pd(a: __m256d, b: __m256d) -> __m256d { +// { transmute(simd_add(a.as_f64x4(), b.as_f64x4())) } +// } + +/// Adds packed single-precision (32-bit) floating-point elements in `a` and +/// `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_add_ps) +// NOTE: Not modeled yet +// pub fn _mm256_add_ps(a: __m256, b: __m256) -> __m256 { +// { transmute(simd_add(a.as_f32x8(), b.as_f32x8())) } +// } + +/// Computes the bitwise AND of a packed double-precision (64-bit) +/// floating-point elements in `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_and_pd) +pub fn _mm256_and_pd(a: __m256d, b: __m256d) -> __m256d { + { + let a: u64x4 = transmute(a); + let b: u64x4 = transmute(b); + transmute(simd_and(a, b)) + } +} +/// Computes the bitwise AND of packed single-precision (32-bit) floating-point +/// elements in `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_and_ps) +pub fn _mm256_and_ps(a: __m256, b: __m256) -> __m256 { + { + let a: u32x8 = transmute(a); + let b: u32x8 = transmute(b); + transmute(simd_and(a, b)) + } +} +/// Computes the bitwise OR packed double-precision (64-bit) floating-point +/// elements in `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_or_pd) +pub fn _mm256_or_pd(a: __m256d, b: __m256d) -> __m256d { + { + let a: u64x4 = transmute(a); + let b: u64x4 = transmute(b); + transmute(simd_or(a, b)) + } +} +/// Computes the bitwise OR packed single-precision (32-bit) floating-point +/// elements in `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_or_ps) +pub fn _mm256_or_ps(a: __m256, b: __m256) -> __m256 { + { + let a: u32x8 = transmute(a); + let b: u32x8 = transmute(b); + transmute(simd_or(a, b)) + } +} +/// Shuffles double-precision (64-bit) floating-point elements within 128-bit +/// lanes using the control in `imm8`. 
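As a sanity check of the bitwise models above, here is a hedged sketch; it assumes `__m256` is the `BitVec<256>`-backed type from this patch and that the `interpretations!` conversions (`from_u32x8`, `as_u32x8`) are in scope:

```rust
use testable_simd_models::abstractions::bitvec::BitVec;
use testable_simd_models::abstractions::simd::*;
use testable_simd_models::core_arch::x86::models::avx::_mm256_and_ps;

fn main() {
    // Build two 256-bit vectors from their u32 lane interpretations.
    let a = BitVec::from_u32x8(u32x8::from_fn(|i| i));
    let b = BitVec::from_u32x8(u32x8::from_fn(|_| 0b0110));
    // The "float" AND model is pure bit arithmetic, so lanes AND elementwise.
    let r = _mm256_and_ps(a, b).as_u32x8();
    assert_eq!(r.as_vec(), vec![0, 0, 2, 2, 4, 4, 6, 6]);
}
```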
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_shuffle_pd) +pub fn _mm256_shuffle_pd(a: __m256d, b: __m256d) -> __m256d { + static_assert_uimm_bits!(MASK, 8); + { + transmute(simd_shuffle( + a.as_f64x4(), + b.as_f64x4(), + [ + MASK as u32 & 0b1, + ((MASK as u32 >> 1) & 0b1) + 4, + ((MASK as u32 >> 2) & 0b1) + 2, + ((MASK as u32 >> 3) & 0b1) + 6, + ], + )) + } +} +/// Shuffles single-precision (32-bit) floating-point elements in `a` within +/// 128-bit lanes using the control in `imm8`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_shuffle_ps) +pub fn _mm256_shuffle_ps(a: __m256, b: __m256) -> __m256 { + static_assert_uimm_bits!(MASK, 8); + { + transmute(simd_shuffle( + a.as_f32x8(), + b.as_f32x8(), + [ + MASK as u32 & 0b11, + (MASK as u32 >> 2) & 0b11, + ((MASK as u32 >> 4) & 0b11) + 8, + ((MASK as u32 >> 6) & 0b11) + 8, + (MASK as u32 & 0b11) + 4, + ((MASK as u32 >> 2) & 0b11) + 4, + ((MASK as u32 >> 4) & 0b11) + 12, + ((MASK as u32 >> 6) & 0b11) + 12, + ], + )) + } +} +/// Computes the bitwise NOT of packed double-precision (64-bit) floating-point +/// elements in `a`, and then AND with `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_andnot_pd) +pub fn _mm256_andnot_pd(a: __m256d, b: __m256d) -> __m256d { + { + let a: u64x4 = transmute(a); + let b: u64x4 = transmute(b); + transmute(simd_and(simd_xor(u64x4::splat(!(0_u64)), a), b)) + } +} +/// Computes the bitwise NOT of packed single-precision (32-bit) floating-point +/// elements in `a` +/// and then AND with `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_andnot_ps) +pub fn _mm256_andnot_ps(a: __m256, b: __m256) -> __m256 { + { + let a: u32x8 = transmute(a); + let b: u32x8 = transmute(b); + transmute(simd_and(simd_xor(u32x8::splat(!(0_u32)), a), b)) + } +} +/// Compares packed double-precision (64-bit) floating-point elements +/// in `a` and `b`, and returns packed maximum values +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_max_pd) +// NOTE: Not modeled yet +// pub fn _mm256_max_pd(a: __m256d, b: __m256d) -> __m256d { +// { vmaxpd(a, b) } +// } + +/// Compares packed single-precision (32-bit) floating-point elements in `a` +/// and `b`, and returns packed maximum values +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_max_ps) +// NOTE: Not modeled yet +// pub fn _mm256_max_ps(a: __m256, b: __m256) -> __m256 { +// { vmaxps(a, b) } +// } + +/// Compares packed double-precision (64-bit) floating-point elements +/// in `a` and `b`, and returns packed minimum values +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_min_pd) +// NOTE: Not modeled yet +// pub fn _mm256_min_pd(a: __m256d, b: __m256d) -> __m256d { +// { vminpd(a, b) } +// } + +/// Compares packed single-precision (32-bit) floating-point elements in `a` +/// and `b`, and returns packed minimum values +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_min_ps) +// NOTE: Not modeled yet +// pub fn _mm256_min_ps(a: __m256, b: __m256) -> __m256 { +// { vminps(a, b) } +// } + +/// Multiplies packed double-precision (64-bit) 
floating-point elements +/// in `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_mul_pd) +// NOTE: Not modeled yet +// pub fn _mm256_mul_pd(a: __m256d, b: __m256d) -> __m256d { +// { transmute(simd_mul(a.as_f64x4(), b.as_f64x4())) } +// } + +/// Multiplies packed single-precision (32-bit) floating-point elements in `a` and +/// `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_mul_ps) +// NOTE: Not modeled yet +// pub fn _mm256_mul_ps(a: __m256, b: __m256) -> __m256 { +// { transmute(simd_mul(a.as_f32x8(), b.as_f32x8())) } +// } + +/// Alternatively adds and subtracts packed double-precision (64-bit) +/// floating-point elements in `a` to/from packed elements in `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_addsub_pd) +// NOTE: Not modeled yet +// pub fn _mm256_addsub_pd(a: __m256d, b: __m256d) -> __m256d { +// { +// let a = a.as_f64x4(); +// let b = b.as_f64x4(); +// let add = simd_add(a, b); +// let sub = simd_sub(a, b); +// simd_shuffle(add, sub, [4, 1, 6, 3]) +// } +// } + +/// Alternatively adds and subtracts packed single-precision (32-bit) +/// floating-point elements in `a` to/from packed elements in `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_addsub_ps) +// NOTE: Not modeled yet +// pub fn _mm256_addsub_ps(a: __m256, b: __m256) -> __m256 { +// { +// let a = a.as_f32x8(); +// let b = b.as_f32x8(); +// let add = simd_add(a, b); +// let sub = simd_sub(a, b); +// simd_shuffle(add, sub, [8, 1, 10, 3, 12, 5, 14, 7]) +// } +// } + +/// Subtracts packed double-precision (64-bit) floating-point elements in `b` +/// from packed elements in `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_sub_pd) +// NOTE: Not modeled yet +// pub fn _mm256_sub_pd(a: __m256d, b: __m256d) -> __m256d { +// { simd_sub(a, b) } +// } + +/// Subtracts packed single-precision (32-bit) floating-point elements in `b` +/// from packed elements in `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_sub_ps) +// NOTE: Not modeled yet +// pub fn _mm256_sub_ps(a: __m256, b: __m256) -> __m256 { +// { simd_sub(a, b) } +// } + +/// Computes the division of each of the 8 packed 32-bit floating-point elements +/// in `a` by the corresponding packed elements in `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_div_ps) +// NOTE: Not modeled yet +// pub fn _mm256_div_ps(a: __m256, b: __m256) -> __m256 { +// { simd_div(a, b) } +// } + +/// Computes the division of each of the 4 packed 64-bit floating-point elements +/// in `a` by the corresponding packed elements in `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_div_pd) +// NOTE: Not modeled yet +// pub fn _mm256_div_pd(a: __m256d, b: __m256d) -> __m256d { +// { simd_div(a, b) } +// } + +/// Rounds packed double-precision (64-bit) floating point elements in `a` +/// according to the flag `ROUNDING`. The value of `ROUNDING` may be as follows: +/// +/// - `0x00`: Round to the nearest whole number. +/// - `0x01`: Round down, toward negative infinity. +/// - `0x02`: Round up, toward positive infinity. 
+/// - `0x03`: Truncate the values. +/// +/// For a complete list of options, check [the LLVM docs][llvm_docs]. +/// +/// [llvm_docs]: https://github.com/llvm-mirror/clang/blob/dcd8d797b20291f1a6b3e0ddda085aa2bbb382a8/lib/Headers/avxintrin.h#L382 +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_round_pd) +// NOTE: Not modeled yet +// pub fn _mm256_round_pd(a: __m256d) -> __m256d { +// static_assert_uimm_bits!(ROUNDING, 4); +// { roundpd256(a, ROUNDING) } +// } + +/// Rounds packed double-precision (64-bit) floating point elements in `a` +/// toward positive infinity. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_ceil_pd) +// NOTE: Not modeled yet +// pub fn _mm256_ceil_pd(a: __m256d) -> __m256d { +// { simd_ceil(a) } +// } + +/// Rounds packed double-precision (64-bit) floating point elements in `a` +/// toward negative infinity. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_floor_pd) +// NOTE: Not modeled yet +// pub fn _mm256_floor_pd(a: __m256d) -> __m256d { +// { simd_floor(a) } +// } + +/// Rounds packed single-precision (32-bit) floating point elements in `a` +/// according to the flag `ROUNDING`. The value of `ROUNDING` may be as follows: +/// +/// - `0x00`: Round to the nearest whole number. +/// - `0x01`: Round down, toward negative infinity. +/// - `0x02`: Round up, toward positive infinity. +/// - `0x03`: Truncate the values. +/// +/// For a complete list of options, check [the LLVM docs][llvm_docs]. +/// +/// [llvm_docs]: https://github.com/llvm-mirror/clang/blob/dcd8d797b20291f1a6b3e0ddda085aa2bbb382a8/lib/Headers/avxintrin.h#L382 +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_round_ps) +// NOTE: Not modeled yet +// pub fn _mm256_round_ps(a: __m256) -> __m256 { +// static_assert_uimm_bits!(ROUNDING, 4); +// { roundps256(a, ROUNDING) } +// } + +/// Rounds packed single-precision (32-bit) floating point elements in `a` +/// toward positive infinity. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_ceil_ps) +// NOTE: Not modeled yet +// pub fn _mm256_ceil_ps(a: __m256) -> __m256 { +// { simd_ceil(a) } +// } + +/// Rounds packed single-precision (32-bit) floating point elements in `a` +/// toward negative infinity. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_floor_ps) +// NOTE: Not modeled yet +// pub fn _mm256_floor_ps(a: __m256) -> __m256 { +// { simd_floor(a) } +// } + +/// Returns the square root of packed single-precision (32-bit) floating point +/// elements in `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_sqrt_ps) +// NOTE: Not modeled yet +// pub fn _mm256_sqrt_ps(a: __m256) -> __m256 { +// { simd_fsqrt(a) } +// } + +/// Returns the square root of packed double-precision (64-bit) floating point +/// elements in `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_sqrt_pd) +// NOTE: Not modeled yet +// pub fn _mm256_sqrt_pd(a: __m256d) -> __m256d { +// { simd_fsqrt(a) } +// } + +/// Blends packed double-precision (64-bit) floating-point elements from +/// `a` and `b` using control mask `imm8`. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_blend_pd) +pub fn _mm256_blend_pd(a: __m256d, b: __m256d) -> __m256d { + static_assert_uimm_bits!(IMM4, 4); + { + transmute(simd_shuffle( + a.as_f64x4(), + b.as_f64x4(), + [ + ((IMM4 as u32 >> 0) & 1) * 4 + 0, + ((IMM4 as u32 >> 1) & 1) * 4 + 1, + ((IMM4 as u32 >> 2) & 1) * 4 + 2, + ((IMM4 as u32 >> 3) & 1) * 4 + 3, + ], + )) } } - -use c_extern::*; /// Blends packed single-precision (32-bit) floating-point elements from +/// `a` and `b` using control mask `imm8`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_blend_ps) +pub fn _mm256_blend_ps(a: __m256, b: __m256) -> __m256 { + static_assert_uimm_bits!(IMM8, 8); + { + transmute(simd_shuffle( + a.as_f32x8(), + b.as_f32x8(), + [ + ((IMM8 as u32 >> 0) & 1) * 8 + 0, + ((IMM8 as u32 >> 1) & 1) * 8 + 1, + ((IMM8 as u32 >> 2) & 1) * 8 + 2, + ((IMM8 as u32 >> 3) & 1) * 8 + 3, + ((IMM8 as u32 >> 4) & 1) * 8 + 4, + ((IMM8 as u32 >> 5) & 1) * 8 + 5, + ((IMM8 as u32 >> 6) & 1) * 8 + 6, + ((IMM8 as u32 >> 7) & 1) * 8 + 7, + ], + )) + } +} +/// Blends packed double-precision (64-bit) floating-point elements from /// `a` and `b` using `c` as a mask. /// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blendv_ps) +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_blendv_pd) +pub fn _mm256_blendv_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d { + { + let mask: i64x4 = simd_lt(transmute::<_, i64x4>(c), i64x4::ZERO()); + transmute(simd_select(mask, b.as_f64x4(), a.as_f64x4())) + } +} +/// Blends packed single-precision (32-bit) floating-point elements from +/// `a` and `b` using `c` as a mask. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_blendv_ps) pub fn _mm256_blendv_ps(a: __m256, b: __m256, c: __m256) -> __m256 { - let mask: i32x8 = simd_lt(BitVec::to_i32x8(c), i32x8::from_fn(|_| 0)); - BitVec::from_i32x8(simd_select(mask, BitVec::to_i32x8(b), BitVec::to_i32x8(a))) + { + let mask: i32x8 = simd_lt(transmute::<_, i32x8>(c), i32x8::ZERO()); + transmute(simd_select(mask, b.as_f32x8(), a.as_f32x8())) + } +} +/// Conditionally multiplies the packed single-precision (32-bit) floating-point +/// elements in `a` and `b` using the high 4 bits in `imm8`, +/// sum the four products, and conditionally return the sum +/// using the low 4 bits of `imm8`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_dp_ps) +// NOTE: Not modeled yet +// pub fn _mm256_dp_ps(a: __m256, b: __m256) -> __m256 { +// static_assert_uimm_bits!(IMM8, 8); +// { vdpps(a, b, IMM8 as i8) } +// } + +/// Horizontal addition of adjacent pairs in the two packed vectors +/// of 4 64-bit floating points `a` and `b`. +/// In the result, sums of elements from `a` are returned in even locations, +/// while sums of elements from `b` are returned in odd locations. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_hadd_pd) +// NOTE: Not modeled yet +// pub fn _mm256_hadd_pd(a: __m256d, b: __m256d) -> __m256d { +// { vhaddpd(a, b) } +// } + +/// Horizontal addition of adjacent pairs in the two packed vectors +/// of 8 32-bit floating points `a` and `b`. 
+/// In the result, sums of elements from `a` are returned in locations of +/// indices 0, 1, 4, 5; while sums of elements from `b` are locations +/// 2, 3, 6, 7. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_hadd_ps) +// NOTE: Not modeled yet +// pub fn _mm256_hadd_ps(a: __m256, b: __m256) -> __m256 { +// { vhaddps(a, b) } +// } + +/// Horizontal subtraction of adjacent pairs in the two packed vectors +/// of 4 64-bit floating points `a` and `b`. +/// In the result, sums of elements from `a` are returned in even locations, +/// while sums of elements from `b` are returned in odd locations. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_hsub_pd) +// NOTE: Not modeled yet +// pub fn _mm256_hsub_pd(a: __m256d, b: __m256d) -> __m256d { +// { vhsubpd(a, b) } +// } + +/// Horizontal subtraction of adjacent pairs in the two packed vectors +/// of 8 32-bit floating points `a` and `b`. +/// In the result, sums of elements from `a` are returned in locations of +/// indices 0, 1, 4, 5; while sums of elements from `b` are locations +/// 2, 3, 6, 7. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_hsub_ps) +// NOTE: Not modeled yet +// pub fn _mm256_hsub_ps(a: __m256, b: __m256) -> __m256 { +// { vhsubps(a, b) } +// } + +/// Computes the bitwise XOR of packed double-precision (64-bit) floating-point +/// elements in `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_xor_pd) +pub fn _mm256_xor_pd(a: __m256d, b: __m256d) -> __m256d { + { + let a: u64x4 = transmute(a); + let b: u64x4 = transmute(b); + transmute(simd_xor(a, b)) + } +} +/// Computes the bitwise XOR of packed single-precision (32-bit) floating-point +/// elements in `a` and `b`. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_xor_ps) +pub fn _mm256_xor_ps(a: __m256, b: __m256) -> __m256 { + { + let a: u32x8 = transmute(a); + let b: u32x8 = transmute(b); + transmute(simd_xor(a, b)) + } } - /// Equal (ordered, non-signaling) - pub const _CMP_EQ_OQ: i32 = 0x00; /// Less-than (ordered, signaling) - pub const _CMP_LT_OS: i32 = 0x01; /// Less-than-or-equal (ordered, signaling) - pub const _CMP_LE_OS: i32 = 0x02; /// Unordered (non-signaling) - pub const _CMP_UNORD_Q: i32 = 0x03; /// Not-equal (unordered, non-signaling) - pub const _CMP_NEQ_UQ: i32 = 0x04; /// Not-less-than (unordered, signaling) - pub const _CMP_NLT_US: i32 = 0x05; /// Not-less-than-or-equal (unordered, signaling) - pub const _CMP_NLE_US: i32 = 0x06; /// Ordered (non-signaling) - pub const _CMP_ORD_Q: i32 = 0x07; /// Equal (unordered, non-signaling) - pub const _CMP_EQ_UQ: i32 = 0x08; /// Not-greater-than-or-equal (unordered, signaling) - pub const _CMP_NGE_US: i32 = 0x09; /// Not-greater-than (unordered, signaling) - pub const _CMP_NGT_US: i32 = 0x0a; /// False (ordered, non-signaling) - pub const _CMP_FALSE_OQ: i32 = 0x0b; /// Not-equal (ordered, non-signaling) - pub const _CMP_NEQ_OQ: i32 = 0x0c; /// Greater-than-or-equal (ordered, signaling) - pub const _CMP_GE_OS: i32 = 0x0d; /// Greater-than (ordered, signaling) - pub const _CMP_GT_OS: i32 = 0x0e; /// True (unordered, non-signaling) - pub const _CMP_TRUE_UQ: i32 = 0x0f; /// Equal (ordered, signaling) - pub const _CMP_EQ_OS: i32 = 0x10; /// Less-than (ordered, non-signaling) - pub const _CMP_LT_OQ: i32 = 0x11; /// Less-than-or-equal (ordered, non-signaling) - pub const _CMP_LE_OQ: i32 = 0x12; /// Unordered (signaling) - pub const _CMP_UNORD_S: i32 = 0x13; /// Not-equal (unordered, signaling) - pub const _CMP_NEQ_US: i32 = 0x14; /// Not-less-than (unordered, non-signaling) - pub const _CMP_NLT_UQ: i32 = 0x15; /// Not-less-than-or-equal (unordered, non-signaling) - pub const _CMP_NLE_UQ: i32 = 0x16; /// Ordered (signaling) - pub const _CMP_ORD_S: i32 = 0x17; /// Equal (unordered, signaling) - pub const _CMP_EQ_US: i32 = 0x18; /// Not-greater-than-or-equal (unordered, non-signaling) - pub const _CMP_NGE_UQ: i32 = 0x19; /// Not-greater-than (unordered, non-signaling) - pub const _CMP_NGT_UQ: i32 = 0x1a; /// False (ordered, signaling) - pub const _CMP_FALSE_OS: i32 = 0x1b; /// Not-equal (ordered, signaling) - pub const _CMP_NEQ_OS: i32 = 0x1c; /// Greater-than-or-equal (ordered, non-signaling) - pub const _CMP_GE_OQ: i32 = 0x1d; /// Greater-than (ordered, non-signaling) - pub const _CMP_GT_OQ: i32 = 0x1e; /// True (unordered, signaling) - pub const _CMP_TRUE_US: i32 = 0x1f; +/// Compares packed double-precision (64-bit) floating-point +/// elements in `a` and `b` based on the comparison operand +/// specified by `IMM5`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm_cmp_pd) +// NOTE: Not modeled yet +// pub fn _mm_cmp_pd(a: __m128d, b: __m128d) -> __m128d { +// static_assert_uimm_bits!(IMM5, 5); +// { vcmppd(a, b, const { IMM5 as i8 }) } +// } + +/// Compares packed double-precision (64-bit) floating-point +/// elements in `a` and `b` based on the comparison operand +/// specified by `IMM5`. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_cmp_pd) +// NOTE: Not modeled yet +// pub fn _mm256_cmp_pd(a: __m256d, b: __m256d) -> __m256d { +// static_assert_uimm_bits!(IMM5, 5); +// { vcmppd256(a, b, IMM5 as u8) } +// } + +/// Compares packed single-precision (32-bit) floating-point +/// elements in `a` and `b` based on the comparison operand +/// specified by `IMM5`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm_cmp_ps) +// NOTE: Not modeled yet +// pub fn _mm_cmp_ps(a: __m128, b: __m128) -> __m128 { +// static_assert_uimm_bits!(IMM5, 5); +// { vcmpps(a, b, const { IMM5 as i8 }) } +// } + +/// Compares packed single-precision (32-bit) floating-point +/// elements in `a` and `b` based on the comparison operand +/// specified by `IMM5`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_cmp_ps) +// NOTE: Not modeled yet +// pub fn _mm256_cmp_ps(a: __m256, b: __m256) -> __m256 { +// static_assert_uimm_bits!(IMM5, 5); +// { vcmpps256(a, b, const { IMM5 as u8 }) } +// } + +/// Compares the lower double-precision (64-bit) floating-point element in +/// `a` and `b` based on the comparison operand specified by `IMM5`, +/// store the result in the lower element of returned vector, +/// and copies the upper element from `a` to the upper element of returned +/// vector. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm_cmp_sd) +// NOTE: Not modeled yet +// pub fn _mm_cmp_sd(a: __m128d, b: __m128d) -> __m128d { +// static_assert_uimm_bits!(IMM5, 5); +// { vcmpsd(a, b, IMM5 as i8) } +// } + +/// Compares the lower single-precision (32-bit) floating-point element in +/// `a` and `b` based on the comparison operand specified by `IMM5`, +/// store the result in the lower element of returned vector, +/// and copies the upper 3 packed elements from `a` to the upper elements of +/// returned vector. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm_cmp_ss) +// NOTE: Not modeled yet +// pub fn _mm_cmp_ss(a: __m128, b: __m128) -> __m128 { +// static_assert_uimm_bits!(IMM5, 5); +// { vcmpss(a, b, IMM5 as i8) } +// } + +/// Converts packed 32-bit integers in `a` to packed double-precision (64-bit) +/// floating-point elements. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_cvtepi32_pd) +pub fn _mm256_cvtepi32_pd(a: __m128i) -> __m256d { + transmute(simd_cast::<4, i32, f64>(a.as_i32x4())) +} +/// Converts packed 32-bit integers in `a` to packed single-precision (32-bit) +/// floating-point elements. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_cvtepi32_ps) +pub fn _mm256_cvtepi32_ps(a: __m256i) -> __m256 { + transmute(simd_cast::<8, _, f32>(a.as_i32x8())) +} +/// Converts packed double-precision (64-bit) floating-point elements in `a` +/// to packed single-precision (32-bit) floating-point elements. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_cvtpd_ps) +pub fn _mm256_cvtpd_ps(a: __m256d) -> __m128 { + transmute(simd_cast::<4, _, f32>(a.as_f64x4())) +} +/// Converts packed single-precision (32-bit) floating-point elements in `a` +/// to packed 32-bit integers. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_cvtps_epi32) +// NOTE: Not modeled yet +// pub fn _mm256_cvtps_epi32(a: __m256) -> __m256i { +// { transmute(vcvtps2dq(a)) } +// } + +/// Converts packed single-precision (32-bit) floating-point elements in `a` +/// to packed double-precision (64-bit) floating-point elements. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_cvtps_pd) +pub fn _mm256_cvtps_pd(a: __m128) -> __m256d { + transmute(simd_cast::<4, _, f64>(a.as_f32x4())) +} +/// Returns the first element of the input vector of `[4 x double]`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_cvtsd_f64) +pub fn _mm256_cvtsd_f64(a: __m256d) -> f64 { + simd_extract(a.as_f64x4(), 0) +} -pub fn _mm256_permute2f128_si256(a: __m256i, b: __m256i) -> __m256i { - // // static_assert_uimm_bits!(IMM8, 8); - vperm2f128si256(BitVec::to_i32x8(a), BitVec::to_i32x8(b), IMM8 as i8).into() +/// Converts packed double-precision (64-bit) floating-point elements in `a` +/// to packed 32-bit integers with truncation. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_cvttpd_epi32) +// NOTE: Not modeled yet +// pub fn _mm256_cvttpd_epi32(a: __m256d) -> __m128i { +// { transmute(vcvttpd2dq(a)) } +// } + +/// Converts packed double-precision (64-bit) floating-point elements in `a` +/// to packed 32-bit integers. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_cvtpd_epi32) +// NOTE: Not modeled yet +// pub fn _mm256_cvtpd_epi32(a: __m256d) -> __m128i { +// { transmute(vcvtpd2dq(a)) } +// } + +/// Converts packed single-precision (32-bit) floating-point elements in `a` +/// to packed 32-bit integers with truncation. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_cvttps_epi32) +// NOTE: Not modeled yet +// pub fn _mm256_cvttps_epi32(a: __m256) -> __m256i { +// { transmute(vcvttps2dq(a)) } +// } + +/// Extracts 128 bits (composed of 4 packed single-precision (32-bit) +/// floating-point elements) from `a`, selected with `imm8`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_extractf128_ps) +pub fn _mm256_extractf128_ps(a: __m256) -> __m128 { + static_assert_uimm_bits!(IMM1, 1); + { + transmute(simd_shuffle( + a.as_f32x8(), + _mm256_undefined_ps().as_f32x8(), + [[0, 1, 2, 3], [4, 5, 6, 7]][IMM1 as usize], + )) + } +} +/// Extracts 128 bits (composed of 2 packed double-precision (64-bit) +/// floating-point elements) from `a`, selected with `imm8`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_extractf128_pd) +pub fn _mm256_extractf128_pd(a: __m256d) -> __m128d { + static_assert_uimm_bits!(IMM1, 1); + transmute(simd_shuffle( + a.as_f64x4(), + _mm256_undefined_pd().as_f64x4(), + [[0, 1], [2, 3]][IMM1 as usize], + )) +} +/// Extracts 128 bits (composed of integer data) from `a`, selected with `imm8`. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_extractf128_si256) +pub fn _mm256_extractf128_si256(a: __m256i) -> __m128i { + static_assert_uimm_bits!(IMM1, 1); + { + let dst: i64x2 = simd_shuffle(a.as_i64x4(), i64x4::ZERO(), [[0, 1], [2, 3]][IMM1 as usize]); + transmute(dst) + } +} +/// Extracts a 32-bit integer from `a`, selected with `INDEX`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_extract_epi32) +pub fn _mm256_extract_epi32(a: __m256i) -> i32 { + static_assert_uimm_bits!(INDEX, 3); + simd_extract(a.as_i32x8(), INDEX as u32) +} +/// Returns the first element of the input vector of `[8 x i32]`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_cvtsi256_si32) +pub fn _mm256_cvtsi256_si32(a: __m256i) -> i32 { + simd_extract(a.as_i32x8(), 0) +} +/// Zeroes the contents of all XMM or YMM registers. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_zeroall) +// NOTE: Not modeled yet +// pub fn _mm256_zeroall() { +// { vzeroall() } +// } + +/// Zeroes the upper 128 bits of all YMM registers; +/// the lower 128-bits of the registers are unmodified. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_zeroupper) +// NOTE: Not modeled yet +// pub fn _mm256_zeroupper() { +// { vzeroupper() } +// } + +/// Shuffles single-precision (32-bit) floating-point elements in `a` +/// within 128-bit lanes using the control in `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_permutevar_ps) +// NOTE: Not modeled yet +// pub fn _mm256_permutevar_ps(a: __m256, b: __m256i) -> __m256 { +// { vpermilps256(a, b.as_i32x8()) } +// } + +/// Shuffles single-precision (32-bit) floating-point elements in `a` +/// using the control in `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm_permutevar_ps) +// NOTE: Not modeled yet +// pub fn _mm_permutevar_ps(a: __m128, b: __m128i) -> __m128 { +// { vpermilps(a, b.as_i32x4()) } +// } + +/// Shuffles single-precision (32-bit) floating-point elements in `a` +/// within 128-bit lanes using the control in `imm8`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_permute_ps) +pub fn _mm256_permute_ps(a: __m256) -> __m256 { + static_assert_uimm_bits!(IMM8, 8); + { + transmute(simd_shuffle( + a.as_f32x8(), + _mm256_undefined_ps().as_f32x8(), + [ + (IMM8 as u32 >> 0) & 0b11, + (IMM8 as u32 >> 2) & 0b11, + (IMM8 as u32 >> 4) & 0b11, + (IMM8 as u32 >> 6) & 0b11, + ((IMM8 as u32 >> 0) & 0b11) + 4, + ((IMM8 as u32 >> 2) & 0b11) + 4, + ((IMM8 as u32 >> 4) & 0b11) + 4, + ((IMM8 as u32 >> 6) & 0b11) + 4, + ], + )) + } +} +/// Shuffles single-precision (32-bit) floating-point elements in `a` +/// using the control in `imm8`. 
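Because the models take their immediate operands as ordinary const generics, callers supply them via turbofish; a hypothetical usage sketch (again assuming `__m256i` is the `BitVec<256>` alias used by this patch):

```rust
use testable_simd_models::abstractions::bitvec::BitVec;
use testable_simd_models::abstractions::simd::*;
use testable_simd_models::core_arch::x86::models::avx::_mm256_extract_epi32;

fn main() {
    // Lanes are [0, 1, 2, 3, 4, 5, 6, 7]; INDEX = 5 selects the sixth lane.
    let a = BitVec::from_i32x8(i32x8::from_fn(|i| i as i32));
    assert_eq!(_mm256_extract_epi32::<5>(a), 5);
    // An out-of-range INDEX (>= 8) is rejected at compile time by
    // static_assert_uimm_bits!(INDEX, 3).
}
```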
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm_permute_ps) +pub fn _mm_permute_ps(a: __m128) -> __m128 { + static_assert_uimm_bits!(IMM8, 8); + { + transmute(simd_shuffle( + a.as_f32x4(), + _mm_undefined_ps().as_f32x4(), + [ + (IMM8 as u32 >> 0) & 0b11, + (IMM8 as u32 >> 2) & 0b11, + (IMM8 as u32 >> 4) & 0b11, + (IMM8 as u32 >> 6) & 0b11, + ], + )) + } } +/// Shuffles double-precision (64-bit) floating-point elements in `a` +/// within 256-bit lanes using the control in `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_permutevar_pd) +// NOTE: Not modeled yet +// pub fn _mm256_permutevar_pd(a: __m256d, b: __m256i) -> __m256d { +// { vpermilpd256(a, b.as_i64x4()) } +// } + +/// Shuffles double-precision (64-bit) floating-point elements in `a` +/// using the control in `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm_permutevar_pd) +// NOTE: Not modeled yet +// pub fn _mm_permutevar_pd(a: __m128d, b: __m128i) -> __m128d { +// { vpermilpd(a, b.as_i64x2()) } +// } + +/// Shuffles double-precision (64-bit) floating-point elements in `a` +/// within 128-bit lanes using the control in `imm8`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_permute_pd) +pub fn _mm256_permute_pd(a: __m256d) -> __m256d { + static_assert_uimm_bits!(IMM4, 4); + { + transmute(simd_shuffle( + a.as_f64x4(), + _mm256_undefined_pd().as_f64x4(), + [ + ((IMM4 as u32 >> 0) & 1), + ((IMM4 as u32 >> 1) & 1), + ((IMM4 as u32 >> 2) & 1) + 2, + ((IMM4 as u32 >> 3) & 1) + 2, + ], + )) + } +} +/// Shuffles double-precision (64-bit) floating-point elements in `a` +/// using the control in `imm8`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm_permute_pd) +pub fn _mm_permute_pd(a: __m128d) -> __m128d { + static_assert_uimm_bits!(IMM2, 2); + { + transmute(simd_shuffle( + a.as_f64x2(), + _mm_undefined_pd().as_f64x2(), + [(IMM2 as u32) & 1, (IMM2 as u32 >> 1) & 1], + )) + } +} +/// Shuffles 256 bits (composed of 8 packed single-precision (32-bit) +/// floating-point elements) selected by `imm8` from `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_permute2f128_ps) +// NOTE: Not modeled yet +// pub fn _mm256_permute2f128_ps(a: __m256, b: __m256) -> __m256 { +// static_assert_uimm_bits!(IMM8, 8); +// { vperm2f128ps256(a, b, IMM8 as i8) } +// } +/// Shuffles 256 bits (composed of 4 packed double-precision (64-bit) +/// floating-point elements) selected by `imm8` from `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_permute2f128_pd) +// NOTE: Not modeled yet +// pub fn _mm256_permute2f128_pd(a: __m256d, b: __m256d) -> __m256d { +// static_assert_uimm_bits!(IMM8, 8); +// { vperm2f128pd256(a, b, IMM8 as i8) } +// } +/// Shuffles 128-bits (composed of integer data) selected by `imm8` +/// from `a` and `b`. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_permute2f128_si256) +pub fn _mm256_permute2f128_si256(a: __m256i, b: __m256i) -> __m256i { + static_assert_uimm_bits!(IMM8, 8); + transmute(vperm2f128si256(a.as_i32x8(), b.as_i32x8(), IMM8 as i8)) +} +/// Broadcasts a single-precision (32-bit) floating-point element from memory +/// to all elements of the returned vector. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_broadcast_ss) +pub fn _mm256_broadcast_ss(f: &f32) -> __m256 { + _mm256_set1_ps(*f) +} +/// Broadcasts a single-precision (32-bit) floating-point element from memory +/// to all elements of the returned vector. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm_broadcast_ss) +// NOTE: Not modeled yet +// pub fn _mm_broadcast_ss(f: &f32) -> __m128 { +// _mm_set1_ps(*f) +// } +/// Broadcasts a double-precision (64-bit) floating-point element from memory +/// to all elements of the returned vector. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_broadcast_sd) +// NOTE: Not modeled yet +// pub fn _mm256_broadcast_sd(f: &f64) -> __m256d { +// _mm256_set1_pd(*f) +// } +/// Broadcasts 128 bits from memory (composed of 4 packed single-precision +/// (32-bit) floating-point elements) to all elements of the returned vector. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_broadcast_ps) +pub fn _mm256_broadcast_ps(a: &__m128) -> __m256 { + { + transmute(simd_shuffle( + (*a).as_f32x4(), + _mm_setzero_ps().as_f32x4(), + [0, 1, 2, 3, 0, 1, 2, 3], + )) + } +} +/// Broadcasts 128 bits from memory (composed of 2 packed double-precision +/// (64-bit) floating-point elements) to all elements of the returned vector. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_broadcast_pd) +pub fn _mm256_broadcast_pd(a: &__m128d) -> __m256d { + transmute(simd_shuffle( + (*a).as_f64x2(), + _mm_setzero_pd().as_f64x2(), + [0, 1, 0, 1], + )) +} +/// Copies `a` to result, then inserts 128 bits (composed of 4 packed +/// single-precision (32-bit) floating-point elements) from `b` into result +/// at the location specified by `imm8`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_insertf128_ps) +pub fn _mm256_insertf128_ps(a: __m256, b: __m128) -> __m256 { + static_assert_uimm_bits!(IMM1, 1); + { + transmute(simd_shuffle( + a.as_f32x8(), + _mm256_castps128_ps256(b).as_f32x8(), + [[8, 9, 10, 11, 4, 5, 6, 7], [0, 1, 2, 3, 8, 9, 10, 11]][IMM1 as usize], + )) + } +} +/// Copies `a` to result, then inserts 128 bits (composed of 2 packed +/// double-precision (64-bit) floating-point elements) from `b` into result +/// at the location specified by `imm8`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htmlext=_mm256_insertf128_pd) +pub fn _mm256_insertf128_pd(a: __m256d, b: __m128d) -> __m256d { + static_assert_uimm_bits!(IMM1, 1); + { + transmute(simd_shuffle( + a.as_f64x4(), + _mm256_castpd128_pd256(b).as_f64x4(), + [[4, 5, 2, 3], [0, 1, 4, 5]][IMM1 as usize], + )) + } +} /// Copies `a` to result, then inserts 128 bits from `b` into result /// at the location specified by `imm8`. 
 /// Copies `a` to result, then inserts 128 bits from `b` into result
 /// at the location specified by `imm8`.
 ///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insertf128_si256)
-
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insertf128_si256)
 pub fn _mm256_insertf128_si256<const IMM1: i32>(a: __m256i, b: __m128i) -> __m256i {
-    // // static_assert_uimm_bits!(IMM1, 1);
-
-    let dst: i64x4 = simd_shuffle(
-        BitVec::to_i64x4(a),
-        BitVec::to_i64x4(_mm256_castsi128_si256(b)),
-        [[4, 5, 2, 3], [0, 1, 4, 5]][IMM1 as usize],
-    );
-    dst.into()
+    static_assert_uimm_bits!(IMM1, 1);
+    {
+        let dst: i64x4 = simd_shuffle(
+            a.as_i64x4(),
+            _mm256_castsi128_si256(b).as_i64x4(),
+            [[4, 5, 2, 3], [0, 1, 4, 5]][IMM1 as usize],
+        );
+        transmute(dst)
+    }
 }
-
 /// Copies `a` to result, and inserts the 8-bit integer `i` into result
 /// at the location specified by `index`.
 ///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insert_epi8)
-
-// This intrinsic has no corresponding instruction.
-
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insert_epi8)
 pub fn _mm256_insert_epi8<const INDEX: i32>(a: __m256i, i: i8) -> __m256i {
-    // // static_assert_uimm_bits!(INDEX, 5);
-    simd_insert(BitVec::to_i8x32(a), INDEX as u64, i).into()
+    static_assert_uimm_bits!(INDEX, 5);
+    transmute(simd_insert(a.as_i8x32(), INDEX as u32, i))
 }
-
 /// Copies `a` to result, and inserts the 16-bit integer `i` into result
 /// at the location specified by `index`.
 ///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insert_epi16)
-
-// This intrinsic has no corresponding instruction.
-
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insert_epi16)
 pub fn _mm256_insert_epi16<const INDEX: i32>(a: __m256i, i: i16) -> __m256i {
-    // // static_assert_uimm_bits!(INDEX, 4);
-    simd_insert(BitVec::to_i16x16(a), INDEX as u64, i).into()
+    static_assert_uimm_bits!(INDEX, 4);
+    transmute(simd_insert(a.as_i16x16(), INDEX as u32, i))
+}
+/// Copies `a` to result, and inserts the 32-bit integer `i` into result
+/// at the location specified by `index`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insert_epi32)
+pub fn _mm256_insert_epi32<const INDEX: i32>(a: __m256i, i: i32) -> __m256i {
+    static_assert_uimm_bits!(INDEX, 3);
+    transmute(simd_insert(a.as_i32x8(), INDEX as u32, i))
+}
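+// Editorial sketch (not part of the upstream patch): `INDEX` is a const
+// generic, so the lane is fixed at compile time and checked by
+// `static_assert_uimm_bits!`. For instance:
+//
+//     let v = _mm256_insert_epi32::<3>(_mm256_setzero_si256(), 42);
+//     // v, viewed as i32x8, is [0, 0, 0, 42, 0, 0, 0, 0]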
+/// Duplicates odd-indexed single-precision (32-bit) floating-point elements
+/// from `a`, and returns the results.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movehdup_ps)
+pub fn _mm256_movehdup_ps(a: __m256) -> __m256 {
+    transmute(simd_shuffle(
+        a.as_f32x8(),
+        a.as_f32x8(),
+        [1, 1, 3, 3, 5, 5, 7, 7],
+    ))
+}
+/// Duplicates even-indexed single-precision (32-bit) floating-point elements
+/// from `a`, and returns the results.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_moveldup_ps)
+pub fn _mm256_moveldup_ps(a: __m256) -> __m256 {
+    transmute(simd_shuffle(
+        a.as_f32x8(),
+        a.as_f32x8(),
+        [0, 0, 2, 2, 4, 4, 6, 6],
+    ))
+}
+/// Duplicates even-indexed double-precision (64-bit) floating-point elements
+/// from `a`, and returns the results.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movedup_pd)
+pub fn _mm256_movedup_pd(a: __m256d) -> __m256d {
+    transmute(simd_shuffle(a.as_f64x4(), a.as_f64x4(), [0, 0, 2, 2]))
+}
+/// Computes the approximate reciprocal of packed single-precision (32-bit)
+/// floating-point elements in `a`, and returns the results. The maximum
+/// relative error for this approximation is less than 1.5*2^-12.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_rcp_ps)
+// NOTE: Not modeled yet
+// pub fn _mm256_rcp_ps(a: __m256) -> __m256 {
+//     { vrcpps(a) }
+// }
+/// Computes the approximate reciprocal square root of packed single-precision
+/// (32-bit) floating-point elements in `a`, and returns the results.
+/// The maximum relative error for this approximation is less than 1.5*2^-12.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_rsqrt_ps)
+// NOTE: Not modeled yet
+// pub fn _mm256_rsqrt_ps(a: __m256) -> __m256 {
+//     { vrsqrtps(a) }
+// }
+/// Unpacks and interleaves double-precision (64-bit) floating-point elements
+/// from the high half of each 128-bit lane in `a` and `b`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_pd)
+pub fn _mm256_unpackhi_pd(a: __m256d, b: __m256d) -> __m256d {
+    transmute(simd_shuffle(a.as_f64x4(), b.as_f64x4(), [1, 5, 3, 7]))
+}
+/// Unpacks and interleaves single-precision (32-bit) floating-point elements
+/// from the high half of each 128-bit lane in `a` and `b`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_ps)
+pub fn _mm256_unpackhi_ps(a: __m256, b: __m256) -> __m256 {
+    transmute(simd_shuffle(
+        a.as_f32x8(),
+        b.as_f32x8(),
+        [2, 10, 3, 11, 6, 14, 7, 15],
+    ))
+}
+/// Unpacks and interleaves double-precision (64-bit) floating-point elements
+/// from the low half of each 128-bit lane in `a` and `b`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_pd)
+pub fn _mm256_unpacklo_pd(a: __m256d, b: __m256d) -> __m256d {
+    transmute(simd_shuffle(a.as_f64x4(), b.as_f64x4(), [0, 4, 2, 6]))
+}
+/// Unpacks and interleaves single-precision (32-bit) floating-point elements
+/// from the low half of each 128-bit lane in `a` and `b`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_ps)
+pub fn _mm256_unpacklo_ps(a: __m256, b: __m256) -> __m256 {
+    transmute(simd_shuffle(
+        a.as_f32x8(),
+        b.as_f32x8(),
+        [0, 8, 1, 9, 4, 12, 5, 13],
+    ))
 }
-
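+// Editorial sketch (not part of the upstream patch): the unpack shuffles
+// interleave within each 128-bit lane. E.g. for `_mm256_unpacklo_pd`, the
+// index vector [0, 4, 2, 6] means:
+//
+//     // result = [a0, b0, a2, b2]  (lane 0 interleaves a0/b0, lane 1 a2/b2)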
 /// Computes the bitwise AND of 256 bits (representing integer data) in `a` and
 /// `b`, and set `ZF` to 1 if the result is zero, otherwise set `ZF` to 0.
 /// Computes the bitwise NOT of `a` and then AND with `b`, and set `CF` to 1 if
 /// the result is zero, otherwise set `CF` to 0. Return the `ZF` value.
 ///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testz_si256)
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testz_si256)
 pub fn _mm256_testz_si256(a: __m256i, b: __m256i) -> i32 {
-    let c = BitVec::<256>::from_fn(|i| match (a[i], b[i]) {
-        (Bit::One, Bit::One) => Bit::One,
-        _ => Bit::Zero,
-    });
-    let all_zero = c.fold(true, |acc, bit| acc && bit == Bit::Zero);
-    if all_zero {
-        1
-    } else {
-        0
-    }
+    ptestz256(a.as_i64x4(), b.as_i64x4())
 }
+/// Computes the bitwise AND of 256 bits (representing integer data) in `a` and
+/// `b`, and set `ZF` to 1 if the result is zero, otherwise set `ZF` to 0.
+/// Computes the bitwise NOT of `a` and then AND with `b`, and set `CF` to 1 if
+/// the result is zero, otherwise set `CF` to 0. Return the `CF` value.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testc_si256)
+pub fn _mm256_testc_si256(a: __m256i, b: __m256i) -> i32 {
+    ptestc256(a.as_i64x4(), b.as_i64x4())
+}
+
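+// Editorial sketch (not part of the upstream patch): ptestz256/ptestc256 model
+// the VPTEST flag semantics described above. In scalar terms, for 256-bit a, b:
+//
+//     // ZF = ((a & b) == 0)   -> _mm256_testz_si256 returns ZF as 0/1
+//     // CF = ((!a & b) == 0)  -> _mm256_testc_si256 returns CF as 0/1
+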
+/// Computes the bitwise AND of 256 bits (representing integer data) in `a` and
+/// `b`, and set `ZF` to 1 if the result is zero, otherwise set `ZF` to 0.
+/// Computes the bitwise NOT of `a` and then AND with `b`, and set `CF` to 1 if
+/// the result is zero, otherwise set `CF` to 0. Return 1 if both the `ZF` and
+/// `CF` values are zero, otherwise return 0.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testnzc_si256)
+// NOTE: Not modeled yet
+// pub fn _mm256_testnzc_si256(a: __m256i, b: __m256i) -> i32 {
+//     { ptestnzc256(a.as_i64x4(), b.as_i64x4()) }
+// }
+
+/// Computes the bitwise AND of 256 bits (representing double-precision (64-bit)
+/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit
+/// value, and set `ZF` to 1 if the sign bit of each 64-bit element in the
+/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
+/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
+/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value
+/// is zero, otherwise set `CF` to 0. Return the `ZF` value.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testz_pd)
+// NOTE: Not modeled yet
+// pub fn _mm256_testz_pd(a: __m256d, b: __m256d) -> i32 {
+//     { vtestzpd256(a, b) }
+// }
+
+/// Computes the bitwise AND of 256 bits (representing double-precision (64-bit)
+/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit
+/// value, and set `ZF` to 1 if the sign bit of each 64-bit element in the
+/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
+/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
+/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value
+/// is zero, otherwise set `CF` to 0. Return the `CF` value.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testc_pd)
+// NOTE: Not modeled yet
+// pub fn _mm256_testc_pd(a: __m256d, b: __m256d) -> i32 {
+//     { vtestcpd256(a, b) }
+// }
+
+/// Computes the bitwise AND of 256 bits (representing double-precision (64-bit)
+/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit
+/// value, and set `ZF` to 1 if the sign bit of each 64-bit element in the
+/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
+/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
+/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value
+/// is zero, otherwise set `CF` to 0. Return 1 if both the `ZF` and `CF` values
+/// are zero, otherwise return 0.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testnzc_pd)
+// NOTE: Not modeled yet
+// pub fn _mm256_testnzc_pd(a: __m256d, b: __m256d) -> i32 {
+//     { vtestnzcpd256(a, b) }
+// }
+
+/// Computes the bitwise AND of 128 bits (representing double-precision (64-bit)
+/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit
+/// value, and set `ZF` to 1 if the sign bit of each 64-bit element in the
+/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
+/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
+/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value
+/// is zero, otherwise set `CF` to 0. Return the `ZF` value.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testz_pd)
+// NOTE: Not modeled yet
+// pub fn _mm_testz_pd(a: __m128d, b: __m128d) -> i32 {
+//     { vtestzpd(a, b) }
+// }
+
+/// Computes the bitwise AND of 128 bits (representing double-precision (64-bit)
+/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit
+/// value, and set `ZF` to 1 if the sign bit of each 64-bit element in the
+/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
+/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
+/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value
+/// is zero, otherwise set `CF` to 0. Return the `CF` value.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testc_pd)
+// NOTE: Not modeled yet
+// pub fn _mm_testc_pd(a: __m128d, b: __m128d) -> i32 {
+//     { vtestcpd(a, b) }
+// }
+
+/// Computes the bitwise AND of 128 bits (representing double-precision (64-bit)
+/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit
+/// value, and set `ZF` to 1 if the sign bit of each 64-bit element in the
+/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
+/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
+/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value
+/// is zero, otherwise set `CF` to 0. Return 1 if both the `ZF` and `CF` values
+/// are zero, otherwise return 0.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testnzc_pd)
+// NOTE: Not modeled yet
+// pub fn _mm_testnzc_pd(a: __m128d, b: __m128d) -> i32 {
+//     { vtestnzcpd(a, b) }
+// }
+
+/// Computes the bitwise AND of 256 bits (representing single-precision (32-bit)
+/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit
+/// value, and set `ZF` to 1 if the sign bit of each 32-bit element in the
+/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
+/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
+/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value
+/// is zero, otherwise set `CF` to 0. Return the `ZF` value.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testz_ps)
+// NOTE: Not modeled yet
+// pub fn _mm256_testz_ps(a: __m256, b: __m256) -> i32 {
+//     { vtestzps256(a, b) }
+// }
+
+/// Computes the bitwise AND of 256 bits (representing single-precision (32-bit)
+/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit
+/// value, and set `ZF` to 1 if the sign bit of each 32-bit element in the
+/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
+/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
+/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value
+/// is zero, otherwise set `CF` to 0. Return the `CF` value.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testc_ps)
+// NOTE: Not modeled yet
+// pub fn _mm256_testc_ps(a: __m256, b: __m256) -> i32 {
+//     { vtestcps256(a, b) }
+// }
+
+/// Computes the bitwise AND of 256 bits (representing single-precision (32-bit)
+/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit
+/// value, and set `ZF` to 1 if the sign bit of each 32-bit element in the
+/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
+/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
+/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value
+/// is zero, otherwise set `CF` to 0. Return 1 if both the `ZF` and `CF` values
+/// are zero, otherwise return 0.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testnzc_ps)
+// NOTE: Not modeled yet
+// pub fn _mm256_testnzc_ps(a: __m256, b: __m256) -> i32 {
+//     { vtestnzcps256(a, b) }
+// }
+
+/// Computes the bitwise AND of 128 bits (representing single-precision (32-bit)
+/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit
+/// value, and set `ZF` to 1 if the sign bit of each 32-bit element in the
+/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
+/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
+/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value
+/// is zero, otherwise set `CF` to 0. Return the `ZF` value.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testz_ps)
+// NOTE: Not modeled yet
+// pub fn _mm_testz_ps(a: __m128, b: __m128) -> i32 {
+//     { vtestzps(a, b) }
+// }
+
+/// Computes the bitwise AND of 128 bits (representing single-precision (32-bit)
+/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit
+/// value, and set `ZF` to 1 if the sign bit of each 32-bit element in the
+/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
+/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
+/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value
+/// is zero, otherwise set `CF` to 0. Return the `CF` value.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testc_ps)
+// NOTE: Not modeled yet
+// pub fn _mm_testc_ps(a: __m128, b: __m128) -> i32 {
+//     { vtestcps(a, b) }
+// }
+
+/// Computes the bitwise AND of 128 bits (representing single-precision (32-bit)
+/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit
+/// value, and set `ZF` to 1 if the sign bit of each 32-bit element in the
+/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
+/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
+/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value
+/// is zero, otherwise set `CF` to 0. Return 1 if both the `ZF` and `CF` values
+/// are zero, otherwise return 0.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testnzc_ps)
+// NOTE: Not modeled yet
+// pub fn _mm_testnzc_ps(a: __m128, b: __m128) -> i32 {
+//     { vtestnzcps(a, b) }
+// }
+/// Sets each bit of the returned mask based on the most significant bit of the
+/// corresponding packed double-precision (64-bit) floating-point element in
+/// `a`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movemask_pd)
+pub fn _mm256_movemask_pd(a: __m256d) -> i32 {
+    {
+        let mask: i64x4 = simd_lt(a.as_i64x4(), i64x4::ZERO());
+        simd_bitmask_little!(3, mask, u8) as i32
+    }
+}
 /// Sets each bit of the returned mask based on the most significant bit of the
 /// corresponding packed single-precision (32-bit) floating-point element in
 /// `a`.
 ///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movemask_ps)
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movemask_ps)
 pub fn _mm256_movemask_ps(a: __m256) -> i32 {
-    // Propagate the highest bit to the rest, because simd_bitmask
-    // requires all-1 or all-0.
-    let mask: i32x8 = simd_lt(BitVec::to_i32x8(a), i32x8::from_fn(|_| 0));
-    let r = simd_bitmask_little!(7, mask, u8);
-    r as u32 as i32
+    {
+        let mask: i32x8 = simd_lt(transmute(a), i32x8::ZERO());
+        simd_bitmask_little!(7, mask, u8) as i32
+    }
+}
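+// Editorial sketch (not part of the upstream patch): `simd_lt(x, 0)` yields an
+// all-ones lane exactly where the sign bit is set, and `simd_bitmask_little!`
+// packs those lane flags little-endian into an integer. E.g. if only element 0
+// of the f32x8 input is negative, the mask is 0b0000_0001 and the intrinsic
+// returns 1.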
+/// Returns vector of type __m256d with all elements set to zero.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setzero_pd)
+pub fn _mm256_setzero_pd() -> __m256d {
+    transmute(f64x4::ZERO())
 }
-
 /// Returns vector of type __m256 with all elements set to zero.
 ///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setzero_ps)
-
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setzero_ps)
 pub fn _mm256_setzero_ps() -> __m256 {
-    BitVec::from_fn(|_| Bit::Zero)
+    transmute(f32x8::ZERO())
 }
-
 /// Returns vector of type __m256i with all elements set to zero.
 ///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setzero_si256)
-
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setzero_si256)
 pub fn _mm256_setzero_si256() -> __m256i {
-    BitVec::from_fn(|_| Bit::Zero)
+    transmute(i64x4::ZERO())
+}
+/// Sets packed double-precision (64-bit) floating-point elements in returned
+/// vector with the supplied values.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_pd)
+pub fn _mm256_set_pd(a: f64, b: f64, c: f64, d: f64) -> __m256d {
+    _mm256_setr_pd(d, c, b, a)
+}
+/// Sets packed single-precision (32-bit) floating-point elements in returned
+/// vector with the supplied values.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_ps)
+pub fn _mm256_set_ps(a: f32, b: f32, c: f32, d: f32, e: f32, f: f32, g: f32, h: f32) -> __m256 {
+    _mm256_setr_ps(h, g, f, e, d, c, b, a)
 }
-
 /// Sets packed 8-bit integers in returned vector with the supplied values.
 ///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_epi8)
-
-// This intrinsic has no corresponding instruction.
-
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_epi8)
 pub fn _mm256_set_epi8(
     e00: i8,
     e01: i8,
@@ -272,19 +1403,14 @@ pub fn _mm256_set_epi8(
     e30: i8,
     e31: i8,
 ) -> __m256i {
-    let vec = [
-        e00, e01, e02, e03, e04, e05, e06, e07, e08, e09, e10, e11, e12, e13, e14, e15, e16, e17,
-        e18, e19, e20, e21, e22, e23, e24, e25, e26, e27, e28, e29, e30, e31,
-    ];
-    BitVec::from_i8x32(i8x32::from_fn(|i| vec[(31 - i) as usize]))
+    _mm256_setr_epi8(
+        e31, e30, e29, e28, e27, e26, e25, e24, e23, e22, e21, e20, e19, e18, e17, e16, e15, e14,
+        e13, e12, e11, e10, e09, e08, e07, e06, e05, e04, e03, e02, e01, e00,
+    )
 }
-
 /// Sets packed 16-bit integers in returned vector with the supplied values.
 ///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_epi16)
-
-// This intrinsic has no corresponding instruction.
-
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_epi16)
 pub fn _mm256_set_epi16(
     e00: i16,
     e01: i16,
@@ -303,18 +1429,13 @@ pub fn _mm256_set_epi16(
     e14: i16,
     e15: i16,
 ) -> __m256i {
-    let vec = [
-        e00, e01, e02, e03, e04, e05, e06, e07, e08, e09, e10, e11, e12, e13, e14, e15,
-    ];
-    BitVec::from_i16x16(i16x16::from_fn(|i| vec[(15 - i) as usize]))
+    _mm256_setr_epi16(
+        e15, e14, e13, e12, e11, e10, e09, e08, e07, e06, e05, e04, e03, e02, e01, e00,
+    )
 }
-
 /// Sets packed 32-bit integers in returned vector with the supplied values.
 ///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_epi32)
-
-// This intrinsic has no corresponding instruction.
-
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_epi32)
 pub fn _mm256_set_epi32(
     e0: i32,
     e1: i32,
@@ -325,108 +1446,383 @@ pub fn _mm256_set_epi32(
     e6: i32,
     e7: i32,
 ) -> __m256i {
-    let vec = [e0, e1, e2, e3, e4, e5, e6, e7];
-    BitVec::from_i32x8(i32x8::from_fn(|i| vec[(7 - i) as usize]))
+    _mm256_setr_epi32(e7, e6, e5, e4, e3, e2, e1, e0)
 }
-
 /// Sets packed 64-bit integers in returned vector with the supplied values.
 ///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_epi64x)
-// This intrinsic has no corresponding instruction.
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_epi64x)
 pub fn _mm256_set_epi64x(a: i64, b: i64, c: i64, d: i64) -> __m256i {
-    let vec = [d, c, b, a];
-    BitVec::from_i64x4(i64x4::from_fn(|i| vec[i as usize]))
+    _mm256_setr_epi64x(d, c, b, a)
+}
+/// Sets packed double-precision (64-bit) floating-point elements in returned
+/// vector with the supplied values in reverse order.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_pd)
+pub fn _mm256_setr_pd(a: f64, b: f64, c: f64, d: f64) -> __m256d {
+    transmute(f64x4::new(a, b, c, d))
+}
+/// Sets packed single-precision (32-bit) floating-point elements in returned
+/// vector with the supplied values in reverse order.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_ps)
+pub fn _mm256_setr_ps(a: f32, b: f32, c: f32, d: f32, e: f32, f: f32, g: f32, h: f32) -> __m256 {
+    transmute(f32x8::new(a, b, c, d, e, f, g, h))
+}
+/// Sets packed 8-bit integers in returned vector with the supplied values in
+/// reverse order.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_epi8)
+pub fn _mm256_setr_epi8(
+    e00: i8,
+    e01: i8,
+    e02: i8,
+    e03: i8,
+    e04: i8,
+    e05: i8,
+    e06: i8,
+    e07: i8,
+    e08: i8,
+    e09: i8,
+    e10: i8,
+    e11: i8,
+    e12: i8,
+    e13: i8,
+    e14: i8,
+    e15: i8,
+    e16: i8,
+    e17: i8,
+    e18: i8,
+    e19: i8,
+    e20: i8,
+    e21: i8,
+    e22: i8,
+    e23: i8,
+    e24: i8,
+    e25: i8,
+    e26: i8,
+    e27: i8,
+    e28: i8,
+    e29: i8,
+    e30: i8,
+    e31: i8,
+) -> __m256i {
+    {
+        transmute(i8x32::new(
+            e00, e01, e02, e03, e04, e05, e06, e07, e08, e09, e10, e11, e12, e13, e14, e15, e16,
+            e17, e18, e19, e20, e21, e22, e23, e24, e25, e26, e27, e28, e29, e30, e31,
+        ))
+    }
+}
+/// Sets packed 16-bit integers in returned vector with the supplied values in
+/// reverse order.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_epi16)
+pub fn _mm256_setr_epi16(
+    e00: i16,
+    e01: i16,
+    e02: i16,
+    e03: i16,
+    e04: i16,
+    e05: i16,
+    e06: i16,
+    e07: i16,
+    e08: i16,
+    e09: i16,
+    e10: i16,
+    e11: i16,
+    e12: i16,
+    e13: i16,
+    e14: i16,
+    e15: i16,
+) -> __m256i {
+    {
+        transmute(i16x16::new(
+            e00, e01, e02, e03, e04, e05, e06, e07, e08, e09, e10, e11, e12, e13, e14, e15,
+        ))
+    }
+}
+/// Sets packed 32-bit integers in returned vector with the supplied values in
+/// reverse order.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_epi32)
+pub fn _mm256_setr_epi32(
+    e0: i32,
+    e1: i32,
+    e2: i32,
+    e3: i32,
+    e4: i32,
+    e5: i32,
+    e6: i32,
+    e7: i32,
+) -> __m256i {
+    transmute(i32x8::new(e0, e1, e2, e3, e4, e5, e6, e7))
+}
+/// Sets packed 64-bit integers in returned vector with the supplied values in
+/// reverse order.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_epi64x)
+pub fn _mm256_setr_epi64x(a: i64, b: i64, c: i64, d: i64) -> __m256i {
+    transmute(i64x4::new(a, b, c, d))
+}
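+// Editorial sketch (not part of the upstream patch): `_mm256_set_*` takes its
+// arguments from the most significant element down, while `_mm256_setr_*`
+// takes them in element (memory) order, so these two calls agree:
+//
+//     let x = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
+//     let y = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
+//     // x == y, with element i equal to i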
+/// Broadcasts double-precision (64-bit) floating-point value `a` to all
+/// elements of returned vector.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_pd)
+pub fn _mm256_set1_pd(a: f64) -> __m256d {
+    _mm256_setr_pd(a, a, a, a)
+}
+/// Broadcasts single-precision (32-bit) floating-point value `a` to all
+/// elements of returned vector.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_ps)
+pub fn _mm256_set1_ps(a: f32) -> __m256 {
+    _mm256_setr_ps(a, a, a, a, a, a, a, a)
 }
-
 /// Broadcasts 8-bit integer `a` to all elements of returned vector.
-/// This intrinsic may generate the `vpbroadcastw`.
+/// This intrinsic may generate the `vpbroadcastb`.
 ///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_epi16)
-
-//
-
-// This intrinsic has no corresponding instruction.
-
-pub fn _mm256_set1_epi8(val: i8) -> BitVec<256> {
-    BitVec::from_i8x32(i8x32::from_fn(|_| val))
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_epi8)
+pub fn _mm256_set1_epi8(a: i8) -> __m256i {
+    _mm256_setr_epi8(
+        a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a,
+        a, a,
+    )
 }
-
 /// Broadcasts 16-bit integer `a` to all elements of returned vector.
 /// This intrinsic may generate the `vpbroadcastw`.
 ///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_epi16)
-
-//
-
-// This intrinsic has no corresponding instruction.
-
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_epi16)
 pub fn _mm256_set1_epi16(a: i16) -> __m256i {
-    BitVec::from_i16x16(i16x16::from_fn(|_| a))
+    _mm256_setr_epi16(a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a)
 }
-
 /// Broadcasts 32-bit integer `a` to all elements of returned vector.
 /// This intrinsic may generate the `vpbroadcastd`.
 ///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_epi32)
-
-// This intrinsic has no corresponding instruction.
-
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_epi32)
 pub fn _mm256_set1_epi32(a: i32) -> __m256i {
-    BitVec::from_i32x8(i32x8::from_fn(|_| a))
+    _mm256_setr_epi32(a, a, a, a, a, a, a, a)
 }
-
 /// Broadcasts 64-bit integer `a` to all elements of returned vector.
 /// This intrinsic may generate the `vpbroadcastq`.
 ///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_epi64x)
-// This intrinsic has no corresponding instruction.
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_epi64x)
 pub fn _mm256_set1_epi64x(a: i64) -> __m256i {
-    BitVec::from_i64x4(i64x4::from_fn(|_| a))
+    _mm256_setr_epi64x(a, a, a, a)
 }
-
+/// Casts vector of type __m256d to type __m256.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castpd_ps)
+pub fn _mm256_castpd_ps(a: __m256d) -> __m256 {
+    transmute(a)
+}
+/// Casts vector of type __m256 to type __m256d.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castps_pd)
+pub fn _mm256_castps_pd(a: __m256) -> __m256d {
+    transmute(a)
+}
+/// Casts vector of type __m256 to type __m256i.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castps_si256)
 pub fn _mm256_castps_si256(a: __m256) -> __m256i {
-    a
+    transmute(a)
 }
-
 /// Casts vector of type __m256i to type __m256.
 ///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castsi256_ps)
-// This intrinsic is only used for compilation and does not generate any
-// instructions, thus it has zero latency.
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castsi256_ps)
 pub fn _mm256_castsi256_ps(a: __m256i) -> __m256 {
-    a
+    transmute(a)
+}
+/// Casts vector of type __m256d to type __m256i.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castpd_si256)
+pub fn _mm256_castpd_si256(a: __m256d) -> __m256i {
+    transmute(a)
+}
+/// Casts vector of type __m256i to type __m256d.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castsi256_pd)
+pub fn _mm256_castsi256_pd(a: __m256i) -> __m256d {
+    transmute(a)
+}
+/// Casts vector of type __m256 to type __m128.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castps256_ps128)
+pub fn _mm256_castps256_ps128(a: __m256) -> __m128 {
+    transmute(simd_shuffle(a.as_f32x8(), a.as_f32x8(), [0, 1, 2, 3]))
+}
+/// Casts vector of type __m256d to type __m128d.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castpd256_pd128)
+pub fn _mm256_castpd256_pd128(a: __m256d) -> __m128d {
+    transmute(simd_shuffle(a.as_f64x4(), a.as_f64x4(), [0, 1]))
 }
-
 /// Casts vector of type __m256i to type __m128i.
 ///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castsi256_si128)
-
-// This intrinsic is only used for compilation and does not generate any
-// instructions, thus it has zero latency.
-
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castsi256_si128)
 pub fn _mm256_castsi256_si128(a: __m256i) -> __m128i {
-    BitVec::from_fn(|i| a[i])
+    {
+        let a = a.as_i64x4();
+        let dst: i64x2 = simd_shuffle(a, a, [0, 1]);
+        transmute(dst)
+    }
+}
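+// Editorial sketch (not part of the upstream patch): the 256-to-128 casts keep
+// the low half bit-for-bit. For example:
+//
+//     let v = _mm256_setr_epi64x(1, 2, 3, 4);
+//     let lo = _mm256_castsi256_si128(v); // lo, viewed as i64x2, is [1, 2]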
+/// Casts vector of type __m128 to type __m256;
+/// the upper 128 bits of the result are undefined.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castps128_ps256)
+pub fn _mm256_castps128_ps256(a: __m128) -> __m256 {
+    {
+        transmute(simd_shuffle(
+            a.as_f32x4(),
+            _mm_undefined_ps().as_f32x4(),
+            [0, 1, 2, 3, 4, 4, 4, 4],
+        ))
+    }
+}
+/// Casts vector of type __m128d to type __m256d;
+/// the upper 128 bits of the result are undefined.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castpd128_pd256)
+pub fn _mm256_castpd128_pd256(a: __m128d) -> __m256d {
+    transmute(simd_shuffle(
+        a.as_f64x2(),
+        _mm_undefined_pd().as_f64x2(),
+        [0, 1, 2, 2],
+    ))
 }
-
 /// Casts vector of type __m128i to type __m256i;
 /// the upper 128 bits of the result are undefined.
 ///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castsi128_si256)
-
-// This intrinsic is only used for compilation and does not generate any
-// instructions, thus it has zero latency.
-
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castsi128_si256)
 pub fn _mm256_castsi128_si256(a: __m128i) -> __m256i {
-    let a = BitVec::to_i64x2(a);
-    let undefined = i64x2::from_fn(|_| 0);
-    let dst: i64x4 = simd_shuffle(a, undefined, [0, 1, 2, 2]);
-    BitVec::from_i64x4(dst)
+    {
+        let a = a.as_i64x2();
+        let undefined = i64x2::ZERO();
+        let dst: i64x4 = simd_shuffle(a, undefined, [0, 1, 2, 2]);
+        transmute(dst)
+    }
+}
+/// Constructs a 256-bit floating-point vector of `[8 x float]` from a
+/// 128-bit floating-point vector of `[4 x float]`. The lower 128 bits contain
+/// the value of the source vector. The upper 128 bits are set to zero.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_zextps128_ps256)
+pub fn _mm256_zextps128_ps256(a: __m128) -> __m256 {
+    {
+        transmute(simd_shuffle(
+            a.as_f32x4(),
+            _mm_setzero_ps().as_f32x4(),
+            [0, 1, 2, 3, 4, 5, 6, 7],
+        ))
+    }
+}
+/// Constructs a 256-bit integer vector from a 128-bit integer vector.
+/// The lower 128 bits contain the value of the source vector. The upper
+/// 128 bits are set to zero.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_zextsi128_si256)
+pub fn _mm256_zextsi128_si256(a: __m128i) -> __m256i {
+    {
+        let b = i64x2::ZERO();
+        let dst: i64x4 = simd_shuffle(a.as_i64x2(), b, [0, 1, 2, 3]);
+        transmute(dst)
+    }
+}
+/// Constructs a 256-bit floating-point vector of `[4 x double]` from a
+/// 128-bit floating-point vector of `[2 x double]`. The lower 128 bits
+/// contain the value of the source vector. The upper 128 bits are set
+/// to zero.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_zextpd128_pd256)
+pub fn _mm256_zextpd128_pd256(a: __m128d) -> __m256d {
+    {
+        transmute(simd_shuffle(
+            a.as_f64x2(),
+            _mm_setzero_pd().as_f64x2(),
+            [0, 1, 2, 3],
+        ))
+    }
+}
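+// Editorial sketch (not part of the upstream patch): the `cast*128*256`
+// functions leave the upper 128 bits unspecified (the model just reuses an
+// arbitrary source lane), while the `zext*128*256` functions guarantee zeros:
+//
+//     let z = _mm256_zextsi128_si256(x); // high i64 lanes are 0
+//     let c = _mm256_castsi128_si256(x); // high i64 lanes are unspecified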
+/// Returns vector of type `__m256` with indeterminate elements.
+/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically
+/// picks some valid value and is not equivalent to [`mem::MaybeUninit`].
+/// In practice, this is typically equivalent to [`mem::zeroed`].
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_undefined_ps)
+pub fn _mm256_undefined_ps() -> __m256 {
+    transmute(f32x8::ZERO())
+}
+/// Returns vector of type `__m256d` with indeterminate elements.
+/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically
+/// picks some valid value and is not equivalent to [`mem::MaybeUninit`].
+/// In practice, this is typically equivalent to [`mem::zeroed`].
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_undefined_pd)
+pub fn _mm256_undefined_pd() -> __m256d {
+    transmute(f32x8::ZERO())
+}
+/// Returns vector of type `__m256i` with indeterminate elements.
+/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically
+/// picks some valid value and is not equivalent to [`mem::MaybeUninit`].
+/// In practice, this is typically equivalent to [`mem::zeroed`].
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_undefined_si256)
+pub fn _mm256_undefined_si256() -> __m256i {
+    transmute(i32x8::ZERO())
+}
+/// Sets packed __m256 returned vector with the supplied values.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_m128)
+pub fn _mm256_set_m128(hi: __m128, lo: __m128) -> __m256 {
+    transmute(simd_shuffle(
+        lo.as_i32x4(),
+        hi.as_i32x4(),
+        [0, 1, 2, 3, 4, 5, 6, 7],
+    ))
+}
+/// Sets packed __m256d returned vector with the supplied values.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_m128d)
+pub fn _mm256_set_m128d(hi: __m128d, lo: __m128d) -> __m256d {
+    {
+        let hi: __m128 = transmute(hi);
+        let lo: __m128 = transmute(lo);
+        transmute(_mm256_set_m128(hi, lo))
+    }
 }
-
 /// Sets packed __m256i returned vector with the supplied values.
 ///
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_m128i)
-
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_m128i)
 pub fn _mm256_set_m128i(hi: __m128i, lo: __m128i) -> __m256i {
-    BitVec::from_fn(|i| if i < 128 { lo[i] } else { hi[i - 128] })
+    {
+        let hi: __m128 = transmute(hi);
+        let lo: __m128 = transmute(lo);
+        transmute(_mm256_set_m128(hi, lo))
+    }
+}
+/// Sets packed __m256 returned vector with the supplied values.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_m128)
+pub fn _mm256_setr_m128(lo: __m128, hi: __m128) -> __m256 {
+    _mm256_set_m128(hi, lo)
+}
+/// Sets packed __m256d returned vector with the supplied values.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_m128d)
+pub fn _mm256_setr_m128d(lo: __m128d, hi: __m128d) -> __m256d {
+    _mm256_set_m128d(hi, lo)
+}
+/// Sets packed __m256i returned vector with the supplied values.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_m128i)
+pub fn _mm256_setr_m128i(lo: __m128i, hi: __m128i) -> __m256i {
+    _mm256_set_m128i(hi, lo)
+}
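+// Editorial sketch (not part of the upstream patch): `_mm256_set_m128i(hi, lo)`
+// places `lo` in bits 0..127 and `hi` in bits 128..255; the `setr` variant
+// simply swaps the argument order, so:
+//
+//     // _mm256_set_m128i(hi, lo) == _mm256_setr_m128i(lo, hi)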
+/// Returns the first element of the input vector of `[8 x float]`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtss_f32)
+pub fn _mm256_cvtss_f32(a: __m256) -> f32 {
+    simd_extract(a.as_f32x8(), 0)
 }
diff --git a/testable-simd-models/src/core_arch/x86/models/avx2.rs b/testable-simd-models/src/core_arch/x86/models/avx2.rs
index 05173b19a8c58..2626d04635bd6 100644
--- a/testable-simd-models/src/core_arch/x86/models/avx2.rs
+++ b/testable-simd-models/src/core_arch/x86/models/avx2.rs
@@ -19,1543 +19,878 @@
 //! [amd64_ref]: http://support.amd.com/TechDocs/24594.pdf
 //! [wiki_avx]: https://en.wikipedia.org/wiki/Advanced_Vector_Extensions
 //! [wiki_fma]: https://en.wikipedia.org/wiki/Fused_multiply-accumulate
-use crate::abstractions::{bitvec::BitVec, simd::*};
-
-mod c_extern {
-    use crate::abstractions::{bit::MachineInteger, simd::*};
-    pub fn phaddw(a: i16x16, b: i16x16) -> i16x16 {
-        i16x16::from_fn(|i| {
-            if i < 4 {
-                a[2 * i].wrapping_add(a[2 * i + 1])
-            } else if i < 8 {
-                b[2 * (i - 4)].wrapping_add(b[2 * (i - 4) + 1])
-            } else if i < 12 {
-                a[2 * (i - 4)].wrapping_add(a[2 * (i - 4) + 1])
-            } else {
-                b[2 * (i - 8)].wrapping_add(b[2 * (i - 8) + 1])
-            }
-        })
-    }
-
-    pub fn phaddd(a: i32x8, b: i32x8) -> i32x8 {
-        i32x8::from_fn(|i| {
-            if i < 2 {
-                a[2 * i].wrapping_add(a[2 * i + 1])
-            } else if i < 4 {
-                b[2 * (i - 2)].wrapping_add(b[2 * (i - 2) + 1])
-            } else if i < 6 {
-                a[2 * (i - 2)].wrapping_add(a[2 * (i - 2) + 1])
-            } else {
-                b[2 * (i - 4)].wrapping_add(b[2 * (i - 4) + 1])
-            }
-        })
-    }
-
-    pub fn phaddsw(a: i16x16, b: i16x16) -> i16x16 {
-        i16x16::from_fn(|i| {
-            if i < 4 {
-                a[2 * i].saturating_add(a[2 * i + 1])
-            } else if i < 8 {
-                b[2 * (i - 4)].saturating_add(b[2 * (i - 4) + 1])
-            } else if i < 12 {
-                a[2 * (i - 4)].saturating_add(a[2 * (i - 4) + 1])
-            } else {
-                b[2 * (i - 8)].saturating_add(b[2 * (i - 8) + 1])
-            }
-        })
-    }
-
-    pub fn phsubw(a: i16x16, b: i16x16) -> i16x16 {
-        i16x16::from_fn(|i| {
-            if i < 4 {
-                a[2 * i].wrapping_sub(a[2 * i + 1])
-            } else if i < 8 {
-                b[2 * (i - 4)].wrapping_sub(b[2 * (i - 4) + 1])
-            } else if i < 12 {
-                a[2 * (i - 4)].wrapping_sub(a[2 * (i - 4) + 1])
-            } else {
-                b[2 * (i - 8)].wrapping_sub(b[2 * (i - 8) + 1])
-            }
-        })
-    }
-
-    pub fn phsubd(a: i32x8, b: i32x8) -> i32x8 {
-        i32x8::from_fn(|i| {
-            if i < 2 {
-                a[2 * i].wrapping_sub(a[2 * i + 1])
-            } else if i < 4 {
-                b[2 * (i - 2)].wrapping_sub(b[2 * (i - 2) + 1])
-            } else if i < 6 {
-                a[2 * (i - 2)].wrapping_sub(a[2 * (i - 2) + 1])
-            } else {
-                b[2 * (i - 4)].wrapping_sub(b[2 * (i - 4) + 1])
-            }
-        })
-    }
-
-    pub fn phsubsw(a: i16x16, b: i16x16) -> i16x16 {
-        i16x16::from_fn(|i| {
-            if i < 4 {
-                a[2 * i].saturating_sub(a[2 * i + 1])
-            } else if i < 8 {
-                b[2 * (i - 4)].saturating_sub(b[2 * (i - 4) + 1])
-            } else if i < 12 {
-                a[2 * (i - 4)].saturating_sub(a[2 * (i - 4) + 1])
-            } else {
-                b[2 * (i - 8)].saturating_sub(b[2 * (i - 8) + 1])
-            }
-        })
-    }
-    pub fn pmaddwd(a: i16x16, b: i16x16) -> i32x8 {
-        i32x8::from_fn(|i| {
-            (a[2 * i] as i32) * (b[2 * i] as i32) + (a[2 * i + 1] as i32) * (b[2 * i + 1] as i32)
-        })
-    }
-
-    pub fn pmaddubsw(a: u8x32, b: u8x32) -> i16x16 {
-        i16x16::from_fn(|i| {
-            ((a[2 * i] as u8 as u16 as i16) * (b[2 * i] as i8 as i16))
-                .saturating_add((a[2 * i + 1] as u8 as u16 as i16) * (b[2 * i + 1] as i8 as i16))
-        })
-    }
-    pub fn packsswb(a: i16x16, b: i16x16) -> i8x32 {
-        i8x32::from_fn(|i| {
-            if i < 8 {
-                if a[i] > (i8::MAX as i16) {
-                    i8::MAX
-                } else if a[i] < (i8::MIN as i16) {
-                    i8::MIN
-                } else {
-                    a[i] as i8
-                }
-            } else if i < 16 {
-                if b[i - 8] > 
(i8::MAX as i16) { - i8::MAX - } else if b[i - 8] < (i8::MIN as i16) { - i8::MIN - } else { - b[i - 8] as i8 - } - } else if i < 24 { - if a[i - 8] > (i8::MAX as i16) { - i8::MAX - } else if a[i - 8] < (i8::MIN as i16) { - i8::MIN - } else { - a[i - 8] as i8 - } - } else { - if b[i - 16] > (i8::MAX as i16) { - i8::MAX - } else if b[i - 16] < (i8::MIN as i16) { - i8::MIN - } else { - b[i - 16] as i8 - } - } - }) - } - - pub fn packssdw(a: i32x8, b: i32x8) -> i16x16 { - i16x16::from_fn(|i| { - if i < 4 { - if a[i] > (i16::MAX as i32) { - i16::MAX - } else if a[i] < (i16::MIN as i32) { - i16::MIN - } else { - a[i] as i16 - } - } else if i < 8 { - if b[i - 4] > (i16::MAX as i32) { - i16::MAX - } else if b[i - 4] < (i16::MIN as i32) { - i16::MIN - } else { - b[i - 4] as i16 - } - } else if i < 12 { - if a[i - 4] > (i16::MAX as i32) { - i16::MAX - } else if a[i - 4] < (i16::MIN as i32) { - i16::MIN - } else { - a[i - 4] as i16 - } - } else { - if b[i - 8] > (i16::MAX as i32) { - i16::MAX - } else if b[i - 8] < (i16::MIN as i32) { - i16::MIN - } else { - b[i - 8] as i16 - } - } - }) - } - - pub fn packuswb(a: i16x16, b: i16x16) -> u8x32 { - u8x32::from_fn(|i| { - if i < 8 { - if a[i] > (u8::MAX as i16) { - u8::MAX - } else if a[i] < (u8::MIN as i16) { - u8::MIN - } else { - a[i] as u8 - } - } else if i < 16 { - if b[i - 8] > (u8::MAX as i16) { - u8::MAX - } else if b[i - 8] < (u8::MIN as i16) { - u8::MIN - } else { - b[i - 8] as u8 - } - } else if i < 24 { - if a[i - 8] > (u8::MAX as i16) { - u8::MAX - } else if a[i - 8] < (u8::MIN as i16) { - u8::MIN - } else { - a[i - 8] as u8 - } - } else { - if b[i - 16] > (u8::MAX as i16) { - u8::MAX - } else if b[i - 16] < (u8::MIN as i16) { - u8::MIN - } else { - b[i - 16] as u8 - } - } - }) - } - - pub fn packusdw(a: i32x8, b: i32x8) -> u16x16 { - u16x16::from_fn(|i| { - if i < 4 { - if a[i] > (u16::MAX as i32) { - u16::MAX - } else if a[i] < (u16::MIN as i32) { - u16::MIN - } else { - a[i] as u16 - } - } else if i < 8 { - if b[i - 4] > (u16::MAX as i32) { - u16::MAX - } else if b[i - 4] < (u16::MIN as i32) { - u16::MIN - } else { - b[i - 4] as u16 - } - } else if i < 12 { - if a[i - 4] > (u16::MAX as i32) { - u16::MAX - } else if a[i - 4] < (u16::MIN as i32) { - u16::MIN - } else { - a[i - 4] as u16 - } - } else { - if b[i - 8] > (u16::MAX as i32) { - u16::MAX - } else if b[i - 8] < (u16::MIN as i32) { - u16::MIN - } else { - b[i - 8] as u16 - } - } - }) - } - - pub fn psignb(a: i8x32, b: i8x32) -> i8x32 { - i8x32::from_fn(|i| { - if b[i] < 0 { - if a[i] == i8::MIN { - a[i] - } else { - -a[i] - } - } else if b[i] > 0 { - a[i] - } else { - 0 - } - }) - } - pub fn psignw(a: i16x16, b: i16x16) -> i16x16 { - i16x16::from_fn(|i| { - if b[i] < 0 { - if a[i] == i16::MIN { - a[i] - } else { - -a[i] - } - } else if b[i] > 0 { - a[i] - } else { - 0 - } - }) - } - - pub fn psignd(a: i32x8, b: i32x8) -> i32x8 { - i32x8::from_fn(|i| { - if b[i] < 0 { - if a[i] == i32::MIN { - a[i] - } else { - -a[i] - } - } else if b[i] > 0 { - a[i] - } else { - 0 - } - }) - } - - pub fn psllw(a: i16x16, count: i16x8) -> i16x16 { - let count4: u64 = (count[0] as u16) as u64; - let count3: u64 = ((count[1] as u16) as u64) * 65536; - let count2: u64 = ((count[2] as u16) as u64) * 4294967296; - let count1: u64 = ((count[3] as u16) as u64) * 281474976710656; - let count = count1 + count2 + count3 + count4; - i16x16::from_fn(|i| { - if count > 15 { - 0 - } else { - ((a[i] as u16) << count) as i16 - } - }) - } - - pub fn pslld(a: i32x8, count: i32x4) -> i32x8 { - let count: u64 = 
((count[1] as u32) as u64) * 4294967296 + ((count[0] as u32) as u64); - - i32x8::from_fn(|i| { - if count > 31 { - 0 - } else { - ((a[i] as u32) << count) as i32 - } - }) - } - pub fn psllq(a: i64x4, count: i64x2) -> i64x4 { - let count: u64 = count[0] as u64; - - i64x4::from_fn(|i| { - if count > 63 { - 0 - } else { - ((a[i] as u64) << count) as i64 - } - }) - } - - pub fn psllvd(a: i32x4, count: i32x4) -> i32x4 { - i32x4::from_fn(|i| { - if count[i] > 31 || count[i] < 0 { - 0 - } else { - ((a[i] as u32) << count[i]) as i32 - } - }) - } - pub fn psllvd256(a: i32x8, count: i32x8) -> i32x8 { - i32x8::from_fn(|i| { - if count[i] > 31 || count[i] < 0 { - 0 - } else { - ((a[i] as u32) << count[i]) as i32 - } - }) - } - - pub fn psllvq(a: i64x2, count: i64x2) -> i64x2 { - i64x2::from_fn(|i| { - if count[i] > 63 || count[i] < 0 { - 0 - } else { - ((a[i] as u64) << count[i]) as i64 - } - }) - } - pub fn psllvq256(a: i64x4, count: i64x4) -> i64x4 { - i64x4::from_fn(|i| { - if count[i] > 63 || count[i] < 0 { - 0 - } else { - ((a[i] as u64) << count[i]) as i64 - } - }) - } - - pub fn psraw(a: i16x16, count: i16x8) -> i16x16 { - let count: u64 = ((count[3] as u16) as u64) * 281474976710656 - + ((count[2] as u16) as u64) * 4294967296 - + ((count[1] as u16) as u64) * 65536 - + ((count[0] as u16) as u64); - - i16x16::from_fn(|i| { - if count > 15 { - if a[i] < 0 { - -1 - } else { - 0 - } - } else { - a[i] >> count - } - }) - } - - pub fn psrad(a: i32x8, count: i32x4) -> i32x8 { - let count: u64 = ((count[1] as u32) as u64) * 4294967296 + ((count[0] as u32) as u64); - - i32x8::from_fn(|i| { - if count > 31 { - if a[i] < 0 { - -1 - } else { - 0 - } - } else { - a[i] << count - } - }) - } - - pub fn psravd(a: i32x4, count: i32x4) -> i32x4 { - i32x4::from_fn(|i| { - if count[i] > 31 || count[i] < 0 { - if a[i] < 0 { - -1 - } else { - 0 - } - } else { - a[i] >> count[i] - } - }) - } - - pub fn psravd256(a: i32x8, count: i32x8) -> i32x8 { - dbg!(a, count); - i32x8::from_fn(|i| { - if count[i] > 31 || count[i] < 0 { - if a[i] < 0 { - -1 - } else { - 0 - } - } else { - a[i] >> count[i] - } - }) - } - - pub fn psrlw(a: i16x16, count: i16x8) -> i16x16 { - let count: u64 = (count[3] as u16 as u64) * 281474976710656 - + (count[2] as u16 as u64) * 4294967296 - + (count[1] as u16 as u64) * 65536 - + (count[0] as u16 as u64); - - i16x16::from_fn(|i| { - if count > 15 { - 0 - } else { - ((a[i] as u16) >> count) as i16 - } - }) - } - - pub fn psrld(a: i32x8, count: i32x4) -> i32x8 { - let count: u64 = (count[1] as u32 as u64) * 4294967296 + (count[0] as u32 as u64); - - i32x8::from_fn(|i| { - if count > 31 { - 0 - } else { - ((a[i] as u32) >> count) as i32 - } - }) - } - - pub fn psrlq(a: i64x4, count: i64x2) -> i64x4 { - let count: u64 = count[0] as u64; - - i64x4::from_fn(|i| { - if count > 63 { - 0 - } else { - ((a[i] as u64) >> count) as i64 - } - }) - } - - pub fn psrlvd(a: i32x4, count: i32x4) -> i32x4 { - i32x4::from_fn(|i| { - if count[i] > 31 || count[i] < 0 { - 0 - } else { - ((a[i] as u32) >> count[i]) as i32 - } - }) - } - pub fn psrlvd256(a: i32x8, count: i32x8) -> i32x8 { - i32x8::from_fn(|i| { - if count[i] > 31 || count[i] < 0 { - 0 - } else { - ((a[i] as u32) >> count[i]) as i32 - } - }) - } - - pub fn psrlvq(a: i64x2, count: i64x2) -> i64x2 { - i64x2::from_fn(|i| { - if count[i] > 63 || count[i] < 0 { - 0 - } else { - ((a[i] as u64) >> count[i]) as i64 - } - }) - } - pub fn psrlvq256(a: i64x4, count: i64x4) -> i64x4 { - i64x4::from_fn(|i| { - if count[i] > 63 || count[i] < 0 { - 0 - } else { - ((a[i] 
as u64) >> count[i]) as i64 - } - }) - } - - pub fn pshufb(a: u8x32, b: u8x32) -> u8x32 { - u8x32::from_fn(|i| { - if i < 16 { - if b[i] > 127 { - 0 - } else { - let index: u64 = (b[i] % 16) as u64; - a[index] - } - } else { - if b[i] > 127 { - 0 - } else { - let index: u64 = (b[i] % 16) as u64; - a[index + 16] - } - } - }) - } - - pub fn permd(a: u32x8, b: u32x8) -> u32x8 { - u32x8::from_fn(|i| { - let id = b[i] % 8; - a[id as u64] - }) - } - - pub fn mpsadbw(a: u8x32, b: u8x32, imm8: i32) -> u16x16 { - u16x16::from_fn(|i| { - if i < 8 { - let a_offset = (((imm8 & 4) >> 2) * 4) as u32 as u64; - let b_offset = ((imm8 & 3) * 4) as u32 as u64; - let k = a_offset + i; - let l = b_offset; - ((a[k].absolute_diff(b[l]) as i8) as u8 as u16) - + ((a[k + 1].absolute_diff(b[l + 1]) as i8) as u8 as u16) - + ((a[k + 2].absolute_diff(b[l + 2]) as i8) as u8 as u16) - + ((a[k + 3].absolute_diff(b[l + 3]) as i8) as u8 as u16) - } else { - let i = i - 8; - let imm8 = imm8 >> 3; - let a_offset = (((imm8 & 4) >> 2) * 4) as u32 as u64; - let b_offset = ((imm8 & 3) * 4) as u32 as u64; - let k = a_offset + i; - let l = b_offset; - ((a[16 + k].absolute_diff(b[16 + l]) as i8) as u8 as u16) - + ((a[16 + k + 1].absolute_diff(b[16 + l + 1]) as i8) as u8 as u16) - + ((a[16 + k + 2].absolute_diff(b[16 + l + 2]) as i8) as u8 as u16) - + ((a[16 + k + 3].absolute_diff(b[16 + l + 3]) as i8) as u8 as u16) - } - }) - } - - pub fn vperm2i128(a: i64x4, b: i64x4, imm8: i8) -> i64x4 { - let a = i128x2::from_fn(|i| { - ((a[2 * i] as u64 as u128) + ((a[2 * i + 1] as u64 as u128) << 64)) as i128 - }); - let b = i128x2::from_fn(|i| { - ((b[2 * i] as u64 as u128) + ((b[2 * i + 1] as u64 as u128) << 64)) as i128 - }); - let imm8 = imm8 as u8 as u32 as i32; - let r = i128x2::from_fn(|i| { - let control = imm8 >> (i * 4); - if (control >> 3) % 2 == 1 { - 0 - } else { - match control % 4 { - 0 => a[0], - 1 => a[1], - 2 => b[0], - 3 => b[1], - _ => unreachable!(), - } - } - }); - i64x4::from_fn(|i| { - let index = i >> 1; - let hilo = i.rem_euclid(2); - let val = r[index]; - if hilo == 0 { - i64::cast(val) - } else { - i64::cast(val >> 64) - } - }) - } - pub fn pmulhrsw(a: i16x16, b: i16x16) -> i16x16 { - i16x16::from_fn(|i| { - let temp = (a[i] as i32) * (b[i] as i32); - let temp = (temp >> 14).wrapping_add(1) >> 1; - temp as i16 - }) - } - - pub fn psadbw(a: u8x32, b: u8x32) -> u64x4 { - let tmp = u8x32::from_fn(|i| a[i].absolute_diff(b[i])); - u64x4::from_fn(|i| { - (tmp[i * 8] as u16) - .wrapping_add(tmp[i * 8 + 1] as u16) - .wrapping_add(tmp[i * 8 + 2] as u16) - .wrapping_add(tmp[i * 8 + 3] as u16) - .wrapping_add(tmp[i * 8 + 4] as u16) - .wrapping_add(tmp[i * 8 + 5] as u16) - .wrapping_add(tmp[i * 8 + 6] as u16) - .wrapping_add(tmp[i * 8 + 7] as u16) as u64 - }) - } -} -use c_extern::*; +use crate::abstractions::simd::*; +use crate::abstractions::utilities::*; use super::avx::*; +use super::avx2_handwritten::*; +use super::sse::*; +use super::sse2::*; use super::types::*; -use crate::abstractions::simd::*; + /// Computes the absolute values of packed 32-bit integers in `a`. 
/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_abs_epi32) - pub fn _mm256_abs_epi32(a: __m256i) -> __m256i { - let a = BitVec::to_i32x8(a); - let r = simd_select(simd_lt(a, i32x8::from_fn(|_| 0)), simd_neg(a), a); - BitVec::from_i32x8(r) + { + let a = a.as_i32x8(); + let r = simd_select(simd_lt(a, i32x8::ZERO()), simd_neg(a), a); + transmute(r) + } } - /// Computes the absolute values of packed 16-bit integers in `a`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_abs_epi16) - pub fn _mm256_abs_epi16(a: __m256i) -> __m256i { - let a = BitVec::to_i16x16(a); - let r = simd_select(simd_lt(a, i16x16::from_fn(|_| 0)), simd_neg(a), a); - BitVec::from_i16x16(r) + { + let a = a.as_i16x16(); + let r = simd_select(simd_lt(a, i16x16::ZERO()), simd_neg(a), a); + transmute(r) + } } - /// Computes the absolute values of packed 8-bit integers in `a`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_abs_epi8) - pub fn _mm256_abs_epi8(a: __m256i) -> __m256i { - let a = BitVec::to_i8x32(a); - let r = simd_select(simd_lt(a, i8x32::from_fn(|_| 0)), simd_neg(a), a); - BitVec::from_i8x32(r) + { + let a = a.as_i8x32(); + let r = simd_select(simd_lt(a, i8x32::ZERO()), simd_neg(a), a); + transmute(r) + } } - /// Adds packed 64-bit integers in `a` and `b`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_epi64) - pub fn _mm256_add_epi64(a: __m256i, b: __m256i) -> __m256i { - BitVec::from_i64x4(simd_add(BitVec::to_i64x4(a), BitVec::to_i64x4(b))) + { + transmute(simd_add(a.as_i64x4(), b.as_i64x4())) + } } - /// Adds packed 32-bit integers in `a` and `b`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_epi32) - pub fn _mm256_add_epi32(a: __m256i, b: __m256i) -> __m256i { - BitVec::from_i32x8(simd_add(BitVec::to_i32x8(a), BitVec::to_i32x8(b))) + { + transmute(simd_add(a.as_i32x8(), b.as_i32x8())) + } } - /// Adds packed 16-bit integers in `a` and `b`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_epi16) - pub fn _mm256_add_epi16(a: __m256i, b: __m256i) -> __m256i { - BitVec::from_i16x16(simd_add(BitVec::to_i16x16(a), BitVec::to_i16x16(b))) + { + transmute(simd_add(a.as_i16x16(), b.as_i16x16())) + } } - /// Adds packed 8-bit integers in `a` and `b`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_epi8) - pub fn _mm256_add_epi8(a: __m256i, b: __m256i) -> __m256i { - BitVec::from_i8x32(simd_add(BitVec::to_i8x32(a), BitVec::to_i8x32(b))) + { + transmute(simd_add(a.as_i8x32(), b.as_i8x32())) + } } - /// Adds packed 8-bit integers in `a` and `b` using saturation. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_adds_epi8) - pub fn _mm256_adds_epi8(a: __m256i, b: __m256i) -> __m256i { - BitVec::from_i8x32(simd_saturating_add( - BitVec::to_i8x32(a), - BitVec::to_i8x32(b), - )) + { + transmute(simd_saturating_add(a.as_i8x32(), b.as_i8x32())) + } } - /// Adds packed 16-bit integers in `a` and `b` using saturation. 
/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_adds_epi16) - pub fn _mm256_adds_epi16(a: __m256i, b: __m256i) -> __m256i { - BitVec::from_i16x16(simd_saturating_add( - BitVec::to_i16x16(a), - BitVec::to_i16x16(b), - )) + { + transmute(simd_saturating_add(a.as_i16x16(), b.as_i16x16())) + } } - /// Adds packed unsigned 8-bit integers in `a` and `b` using saturation. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_adds_epu8) - pub fn _mm256_adds_epu8(a: __m256i, b: __m256i) -> __m256i { - simd_saturating_add(BitVec::to_u8x32(a), BitVec::to_u8x32(b)).into() + { + transmute(simd_saturating_add(a.as_u8x32(), b.as_u8x32())) + } } - /// Adds packed unsigned 16-bit integers in `a` and `b` using saturation. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_adds_epu16) - pub fn _mm256_adds_epu16(a: __m256i, b: __m256i) -> __m256i { - simd_saturating_add(BitVec::to_u16x16(a), BitVec::to_u16x16(b)).into() + { + transmute(simd_saturating_add(a.as_u16x16(), b.as_u16x16())) + } } - /// Concatenates pairs of 16-byte blocks in `a` and `b` into a 32-byte temporary /// result, shifts the result right by `n` bytes, and returns the low 16 bytes. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_alignr_epi8) - pub fn _mm256_alignr_epi8(a: __m256i, b: __m256i) -> __m256i { - // If palignr is shifting the pair of vectors more than the size of two - // lanes, emit zero. + static_assert_uimm_bits!(IMM8, 8); if IMM8 >= 32 { return _mm256_setzero_si256(); } - // If palignr is shifting the pair of input vectors more than one lane, - // but less than two lanes, convert to shifting in zeroes. 
let (a, b) = if IMM8 > 16 { (_mm256_setzero_si256(), a) } else { (a, b) }; - - let a = BitVec::to_i8x32(a); - let b = BitVec::to_i8x32(b); - - if IMM8 == 16 { - return a.into(); + { + if IMM8 == 16 { + return transmute(a); + } } - - let r: i8x32 = match IMM8 % 16 { - 0 => simd_shuffle( - b, - a, - [ - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, - 23, 24, 25, 26, 27, 28, 29, 30, 31, - ], - ), - 1 => simd_shuffle( - b, - a, - [ - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 17, 18, 19, 20, 21, 22, 23, - 24, 25, 26, 27, 28, 29, 30, 31, 48, - ], - ), - 2 => simd_shuffle( - b, - a, - [ - 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 18, 19, 20, 21, 22, 23, 24, - 25, 26, 27, 28, 29, 30, 31, 48, 49, - ], - ), - 3 => simd_shuffle( - b, - a, - [ - 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 19, 20, 21, 22, 23, 24, - 25, 26, 27, 28, 29, 30, 31, 48, 49, 50, - ], - ), - 4 => simd_shuffle( - b, - a, - [ - 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 20, 21, 22, 23, 24, 25, - 26, 27, 28, 29, 30, 31, 48, 49, 50, 51, - ], - ), - 5 => simd_shuffle( - b, - a, - [ - 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 21, 22, 23, 24, 25, 26, - 27, 28, 29, 30, 31, 48, 49, 50, 51, 52, - ], - ), - 6 => simd_shuffle( - b, - a, - [ - 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 22, 23, 24, 25, 26, 27, - 28, 29, 30, 31, 48, 49, 50, 51, 52, 53, - ], - ), - 7 => simd_shuffle( - b, - a, - [ - 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 23, 24, 25, 26, 27, - 28, 29, 30, 31, 48, 49, 50, 51, 52, 53, 54, - ], - ), - 8 => simd_shuffle( - b, - a, - [ - 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 24, 25, 26, 27, 28, - 29, 30, 31, 48, 49, 50, 51, 52, 53, 54, 55, - ], - ), - 9 => simd_shuffle( - b, - a, - [ - 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 25, 26, 27, 28, 29, - 30, 31, 48, 49, 50, 51, 52, 53, 54, 55, 56, - ], - ), - 10 => simd_shuffle( - b, - a, - [ - 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 26, 27, 28, 29, 30, - 31, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, - ], - ), - 11 => simd_shuffle( - b, - a, - [ - 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 27, 28, 29, 30, 31, - 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, - ], - ), - 12 => simd_shuffle( - b, - a, - [ - 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 28, 29, 30, 31, 48, - 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, - ], - ), - 13 => simd_shuffle( - b, - a, - [ - 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 29, 30, 31, 48, 49, - 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, - ], - ), - 14 => simd_shuffle( - b, - a, - [ - 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 30, 31, 48, 49, 50, - 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, - ], - ), - 15 => simd_shuffle( - b, - a, + const fn mask(shift: u32, i: u32) -> u32 { + let shift = shift % 16; + let mod_i = i % 16; + if mod_i < (16 - shift) { + i + shift + } else { + i + 16 + shift + } + } + { + let r: i8x32 = simd_shuffle( + b.as_i8x32(), + a.as_i8x32(), [ - 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 31, 48, 49, 50, 51, - 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, + mask(IMM8 as u32, 0), + mask(IMM8 as u32, 1), + mask(IMM8 as u32, 2), + mask(IMM8 as u32, 3), + mask(IMM8 as u32, 4), + mask(IMM8 as u32, 5), + mask(IMM8 as u32, 6), + mask(IMM8 as u32, 7), + mask(IMM8 as u32, 8), + mask(IMM8 as u32, 9), + mask(IMM8 as u32, 10), + mask(IMM8 as 
u32, 11), + mask(IMM8 as u32, 12), + mask(IMM8 as u32, 13), + mask(IMM8 as u32, 14), + mask(IMM8 as u32, 15), + mask(IMM8 as u32, 16), + mask(IMM8 as u32, 17), + mask(IMM8 as u32, 18), + mask(IMM8 as u32, 19), + mask(IMM8 as u32, 20), + mask(IMM8 as u32, 21), + mask(IMM8 as u32, 22), + mask(IMM8 as u32, 23), + mask(IMM8 as u32, 24), + mask(IMM8 as u32, 25), + mask(IMM8 as u32, 26), + mask(IMM8 as u32, 27), + mask(IMM8 as u32, 28), + mask(IMM8 as u32, 29), + mask(IMM8 as u32, 30), + mask(IMM8 as u32, 31), ], - ), - _ => unreachable!(), - }; - r.into() + ); + transmute(r) + } } - /// Computes the bitwise AND of 256 bits (representing integer data) /// in `a` and `b`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_and_si256) - pub fn _mm256_and_si256(a: __m256i, b: __m256i) -> __m256i { - simd_and(BitVec::to_i64x4(a), BitVec::to_i64x4(b)).into() + { + transmute(simd_and(a.as_i64x4(), b.as_i64x4())) + } } - /// Computes the bitwise NOT of 256 bits (representing integer data) /// in `a` and then AND with `b`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_andnot_si256) - pub fn _mm256_andnot_si256(a: __m256i, b: __m256i) -> __m256i { - let all_ones = _mm256_set1_epi8(-1); - simd_and( - simd_xor(BitVec::to_i64x4(a), BitVec::to_i64x4(all_ones)), - BitVec::to_i64x4(b), - ) - .into() + { + let all_ones = _mm256_set1_epi8(-1); + transmute(simd_and( + simd_xor(a.as_i64x4(), all_ones.as_i64x4()), + b.as_i64x4(), + )) + } } - /// Averages packed unsigned 16-bit integers in `a` and `b`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_avg_epu16) - pub fn _mm256_avg_epu16(a: __m256i, b: __m256i) -> __m256i { - let a = simd_cast::<16, _, u32>(BitVec::to_u16x16(a)); - let b = simd_cast::<16, _, u32>(BitVec::to_u16x16(b)); - let r = simd_shr(simd_add(simd_add(a, b), u32x16::splat(1)), u32x16::splat(1)); - simd_cast::<16, _, u16>(r).into() + { + let a = simd_cast::<16, _, u32>(a.as_u16x16()); + let b = simd_cast::<16, _, u32>(b.as_u16x16()); + let r = simd_shr(simd_add(simd_add(a, b), u32x16::splat(1)), u32x16::splat(1)); + transmute(simd_cast::<16, _, u16>(r)) + } } - /// Averages packed unsigned 8-bit integers in `a` and `b`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_avg_epu8) - pub fn _mm256_avg_epu8(a: __m256i, b: __m256i) -> __m256i { - let a = simd_cast::<32, _, u16>(BitVec::to_u8x32(a)); - let b = simd_cast::<32, _, u16>(BitVec::to_u8x32(b)); - let r = simd_shr(simd_add(simd_add(a, b), u16x32::splat(1)), u16x32::splat(1)); - simd_cast::<32, _, u8>(r).into() + { + let a = simd_cast::<32, _, u16>(a.as_u8x32()); + let b = simd_cast::<32, _, u16>(b.as_u8x32()); + let r = simd_shr(simd_add(simd_add(a, b), u16x32::splat(1)), u16x32::splat(1)); + transmute(simd_cast::<32, _, u8>(r)) + } } - /// Blends packed 32-bit integers from `a` and `b` using control mask `IMM4`. 
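The averaging models above widen before adding so that the `+ 1` rounding bias cannot overflow. A one-lane scalar sketch of `_mm256_avg_epu8` (the `avg_epu8_lane` helper is hypothetical, not part of this patch):

```rust
// Scalar reference for one lane of _mm256_avg_epu8: widen to u16, add 1,
// shift right by 1 -- a round-half-up average that cannot overflow.
fn avg_epu8_lane(a: u8, b: u8) -> u8 {
    ((a as u16 + b as u16 + 1) >> 1) as u8
}

fn main() {
    assert_eq!(avg_epu8_lane(0, 1), 1);       // rounds up on .5
    assert_eq!(avg_epu8_lane(255, 255), 255); // no overflow thanks to widening
}
```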
/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_epi32) - pub fn _mm_blend_epi32(a: __m128i, b: __m128i) -> __m128i { - let a = BitVec::to_i32x4(a); - let b = BitVec::to_i32x4(b); - let r: i32x4 = simd_shuffle( - a, - b, - [ - [0, 4, 0, 4][IMM4 as usize & 0b11], - [1, 1, 5, 5][IMM4 as usize & 0b11], - [2, 6, 2, 6][(IMM4 as usize >> 2) & 0b11], - [3, 3, 7, 7][(IMM4 as usize >> 2) & 0b11], - ], - ); - r.into() + static_assert_uimm_bits!(IMM4, 4); + { + let a = a.as_i32x4(); + let b = b.as_i32x4(); + let r: i32x4 = simd_shuffle( + a, + b, + [ + [0, 4, 0, 4][IMM4 as usize & 0b11], + [1, 1, 5, 5][IMM4 as usize & 0b11], + [2, 6, 2, 6][(IMM4 as usize >> 2) & 0b11], + [3, 3, 7, 7][(IMM4 as usize >> 2) & 0b11], + ], + ); + transmute(r) + } } - /// Blends packed 32-bit integers from `a` and `b` using control mask `IMM8`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blend_epi32) - pub fn _mm256_blend_epi32(a: __m256i, b: __m256i) -> __m256i { - let a = BitVec::to_i32x8(a); - let b = BitVec::to_i32x8(b); - let r: i32x8 = simd_shuffle( - a, - b, - [ - [0, 8, 0, 8][IMM8 as usize & 0b11], - [1, 1, 9, 9][IMM8 as usize & 0b11], - [2, 10, 2, 10][(IMM8 as usize >> 2) & 0b11], - [3, 3, 11, 11][(IMM8 as usize >> 2) & 0b11], - [4, 12, 4, 12][(IMM8 as usize >> 4) & 0b11], - [5, 5, 13, 13][(IMM8 as usize >> 4) & 0b11], - [6, 14, 6, 14][(IMM8 as usize >> 6) & 0b11], - [7, 7, 15, 15][(IMM8 as usize >> 6) & 0b11], - ], - ); - r.into() + static_assert_uimm_bits!(IMM8, 8); + { + let a = a.as_i32x8(); + let b = b.as_i32x8(); + let r: i32x8 = simd_shuffle( + a, + b, + [ + [0, 8, 0, 8][IMM8 as usize & 0b11], + [1, 1, 9, 9][IMM8 as usize & 0b11], + [2, 10, 2, 10][(IMM8 as usize >> 2) & 0b11], + [3, 3, 11, 11][(IMM8 as usize >> 2) & 0b11], + [4, 12, 4, 12][(IMM8 as usize >> 4) & 0b11], + [5, 5, 13, 13][(IMM8 as usize >> 4) & 0b11], + [6, 14, 6, 14][(IMM8 as usize >> 6) & 0b11], + [7, 7, 15, 15][(IMM8 as usize >> 6) & 0b11], + ], + ); + transmute(r) + } } - /// Blends packed 16-bit integers from `a` and `b` using control mask `IMM8`. 
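The shuffle tables in the blend models encode a simple rule: bit `i` of the immediate picks lane `i` from `b`, otherwise from `a`. A scalar sketch of `_mm_blend_epi32` under that reading (`blend_epi32_ref` is illustrative, not part of the patch):

```rust
// Scalar reference for _mm_blend_epi32: bit i of the 4-bit immediate
// selects lane i from b (bit set) or from a (bit clear).
fn blend_epi32_ref(a: [i32; 4], b: [i32; 4], imm4: u8) -> [i32; 4] {
    let mut r = [0i32; 4];
    for i in 0..4 {
        r[i] = if (imm4 >> i) & 1 == 1 { b[i] } else { a[i] };
    }
    r
}

fn main() {
    let (a, b) = ([0, 1, 2, 3], [10, 11, 12, 13]);
    assert_eq!(blend_epi32_ref(a, b, 0b0101), [10, 1, 12, 3]);
}
```

The nested index tables in the model above encode exactly this per-bit selection, two control bits at a time.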
/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blend_epi16) pub fn _mm256_blend_epi16(a: __m256i, b: __m256i) -> __m256i { - let a = BitVec::to_i16x16(a); - let b = BitVec::to_i16x16(b); - - let r: i16x16 = simd_shuffle( - a, - b, - [ - [0, 16, 0, 16][IMM8 as usize & 0b11], - [1, 1, 17, 17][IMM8 as usize & 0b11], - [2, 18, 2, 18][(IMM8 as usize >> 2) & 0b11], - [3, 3, 19, 19][(IMM8 as usize >> 2) & 0b11], - [4, 20, 4, 20][(IMM8 as usize >> 4) & 0b11], - [5, 5, 21, 21][(IMM8 as usize >> 4) & 0b11], - [6, 22, 6, 22][(IMM8 as usize >> 6) & 0b11], - [7, 7, 23, 23][(IMM8 as usize >> 6) & 0b11], - [8, 24, 8, 24][IMM8 as usize & 0b11], - [9, 9, 25, 25][IMM8 as usize & 0b11], - [10, 26, 10, 26][(IMM8 as usize >> 2) & 0b11], - [11, 11, 27, 27][(IMM8 as usize >> 2) & 0b11], - [12, 28, 12, 28][(IMM8 as usize >> 4) & 0b11], - [13, 13, 29, 29][(IMM8 as usize >> 4) & 0b11], - [14, 30, 14, 30][(IMM8 as usize >> 6) & 0b11], - [15, 15, 31, 31][(IMM8 as usize >> 6) & 0b11], - ], - ); - r.into() + static_assert_uimm_bits!(IMM8, 8); + { + let a = a.as_i16x16(); + let b = b.as_i16x16(); + let r: i16x16 = simd_shuffle( + a, + b, + [ + [0, 16, 0, 16][IMM8 as usize & 0b11], + [1, 1, 17, 17][IMM8 as usize & 0b11], + [2, 18, 2, 18][(IMM8 as usize >> 2) & 0b11], + [3, 3, 19, 19][(IMM8 as usize >> 2) & 0b11], + [4, 20, 4, 20][(IMM8 as usize >> 4) & 0b11], + [5, 5, 21, 21][(IMM8 as usize >> 4) & 0b11], + [6, 22, 6, 22][(IMM8 as usize >> 6) & 0b11], + [7, 7, 23, 23][(IMM8 as usize >> 6) & 0b11], + [8, 24, 8, 24][IMM8 as usize & 0b11], + [9, 9, 25, 25][IMM8 as usize & 0b11], + [10, 26, 10, 26][(IMM8 as usize >> 2) & 0b11], + [11, 11, 27, 27][(IMM8 as usize >> 2) & 0b11], + [12, 28, 12, 28][(IMM8 as usize >> 4) & 0b11], + [13, 13, 29, 29][(IMM8 as usize >> 4) & 0b11], + [14, 30, 14, 30][(IMM8 as usize >> 6) & 0b11], + [15, 15, 31, 31][(IMM8 as usize >> 6) & 0b11], + ], + ); + transmute(r) + } } - /// Blends packed 8-bit integers from `a` and `b` using `mask`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blendv_epi8) pub fn _mm256_blendv_epi8(a: __m256i, b: __m256i, mask: __m256i) -> __m256i { - let mask: i8x32 = simd_lt(BitVec::to_i8x32(mask), i8x32::from_fn(|_| 0)); - simd_select(mask, BitVec::to_i8x32(b), BitVec::to_i8x32(a)).into() + { + let mask: i8x32 = simd_lt(mask.as_i8x32(), i8x32::ZERO()); + transmute(simd_select(mask, b.as_i8x32(), a.as_i8x32())) + } } - /// Broadcasts the low packed 8-bit integer from `a` to all elements of /// the 128-bit returned value. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastb_epi8) pub fn _mm_broadcastb_epi8(a: __m128i) -> __m128i { - let ret = simd_shuffle(BitVec::to_i8x16(a), i8x16::from_fn(|_| 0), [0_u64; 16]); - ret.into() + { + let ret = simd_shuffle(a.as_i8x16(), i8x16::ZERO(), [0_u32; 16]); + transmute::(ret) + } } - /// Broadcasts the low packed 8-bit integer from `a` to all elements of /// the 256-bit returned value. 
 ///
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastb_epi8)
 pub fn _mm256_broadcastb_epi8(a: __m128i) -> __m256i {
-    let ret = simd_shuffle(BitVec::to_i8x16(a), i8x16::from_fn(|_| 0), [0_u64; 32]);
-    ret.into()
+    {
+        let ret = simd_shuffle(a.as_i8x16(), i8x16::ZERO(), [0_u32; 32]);
+        transmute::<i8x32, _>(ret)
+    }
 }
-
-// N.B., `simd_shuffle4` with integer data types for `a` and `b` is
-// often compiled to `vbroadcastss`.
 /// Broadcasts the low packed 32-bit integer from `a` to all elements of
 /// the 128-bit returned value.
 ///
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastd_epi32)
-pub fn _mm_broadcastd_epi32(a: __m128i) -> __m128i {
-    let ret = simd_shuffle(BitVec::to_i32x4(a), i32x4::from_fn(|_| 0), [0_u64; 4]);
-    ret.into()
+pub fn _mm_broadcastd_epi32(a: __m128i) -> __m128i {
+    {
+        let ret = simd_shuffle(a.as_i32x4(), i32x4::ZERO(), [0_u32; 4]);
+        transmute::<i32x4, _>(ret)
+    }
 }
-
-// N.B., `simd_shuffle4` with integer data types for `a` and `b` is
-// often compiled to `vbroadcastss`.
 /// Broadcasts the low packed 32-bit integer from `a` to all elements of
 /// the 256-bit returned value.
 ///
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastd_epi32)
-pub fn _mm256_broadcastd_epi32(a: __m128i) -> __m256i {
-    let ret = simd_shuffle(BitVec::to_i32x4(a), i32x4::from_fn(|_| 0), [0_u64; 8]);
-    ret.into()
+pub fn _mm256_broadcastd_epi32(a: __m128i) -> __m256i {
+    {
+        let ret = simd_shuffle(a.as_i32x4(), i32x4::ZERO(), [0_u32; 8]);
+        transmute::<i32x8, _>(ret)
+    }
 }
-
 /// Broadcasts the low packed 64-bit integer from `a` to all elements of
 /// the 128-bit returned value.
 ///
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastq_epi64)
-
-// Emits `vmovddup` instead of `vpbroadcastq`
-// See https://github.com/rust-lang/stdarch/issues/791
-pub fn _mm_broadcastq_epi64(a: __m128i) -> __m128i {
-    let ret = simd_shuffle(BitVec::to_i64x2(a), BitVec::to_i64x2(a), [0_u64; 2]);
-    ret.into()
+pub fn _mm_broadcastq_epi64(a: __m128i) -> __m128i {
+    {
+        let ret = simd_shuffle(a.as_i64x2(), a.as_i64x2(), [0_u32; 2]);
+        transmute::<i64x2, _>(ret)
+    }
 }
-
 /// Broadcasts the low packed 64-bit integer from `a` to all elements of
 /// the 256-bit returned value.
 ///
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastq_epi64)
-pub fn _mm256_broadcastq_epi64(a: __m128i) -> __m256i {
-    let ret = simd_shuffle(BitVec::to_i64x2(a), BitVec::to_i64x2(a), [0_u64; 4]);
-    ret.into()
+pub fn _mm256_broadcastq_epi64(a: __m128i) -> __m256i {
+    {
+        let ret = simd_shuffle(a.as_i64x2(), a.as_i64x2(), [0_u32; 4]);
+        transmute::<i64x4, _>(ret)
+    }
+}
+/// Broadcasts the low double-precision (64-bit) floating-point element
+/// from `a` to all elements of the 128-bit returned value.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastsd_pd)
+pub fn _mm_broadcastsd_pd(a: __m128d) -> __m128d {
+    {
+        transmute(simd_shuffle(
+            a.as_f64x2(),
+            _mm_setzero_pd().as_f64x2(),
+            [0_u32; 2],
+        ))
+    }
+}
+/// Broadcasts the low double-precision (64-bit) floating-point element
+/// from `a` to all elements of the 256-bit returned value.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastsd_pd)
+pub fn _mm256_broadcastsd_pd(a: __m128d) -> __m256d {
+    {
+        transmute(simd_shuffle(
+            a.as_f64x2(),
+            _mm_setzero_pd().as_f64x2(),
+            [0_u32; 4],
+        ))
+    }
 }
-
 /// Broadcasts 128 bits of integer data from a to all 128-bit lanes in
 /// the 256-bit returned value.
 ///
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastsi128_si256)
-pub fn _mm_broadcastsi128_si256(a: __m128i) -> __m256i {
-    let ret = simd_shuffle(BitVec::to_i64x2(a), i64x2::from_fn(|_| 0), [0, 1, 0, 1]);
-    ret.into()
+pub fn _mm_broadcastsi128_si256(a: __m128i) -> __m256i {
+    {
+        let ret = simd_shuffle(a.as_i64x2(), i64x2::ZERO(), [0, 1, 0, 1]);
+        transmute::<i64x4, _>(ret)
+    }
 }
-
-// N.B., `broadcastsi128_si256` is often compiled to `vinsertf128` or
-// `vbroadcastf128`.
 /// Broadcasts 128 bits of integer data from a to all 128-bit lanes in
 /// the 256-bit returned value.
 ///
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastsi128_si256)
-pub fn _mm256_broadcastsi128_si256(a: __m128i) -> __m256i {
-    let ret = simd_shuffle(BitVec::to_i64x2(a), i64x2::from_fn(|_| 0), [0, 1, 0, 1]);
-    ret.into()
+pub fn _mm256_broadcastsi128_si256(a: __m128i) -> __m256i {
+    {
+        let ret = simd_shuffle(a.as_i64x2(), i64x2::ZERO(), [0, 1, 0, 1]);
+        transmute::<i64x4, _>(ret)
+    }
+}
+/// Broadcasts the low single-precision (32-bit) floating-point element
+/// from `a` to all elements of the 128-bit returned value.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastss_ps)
+pub fn _mm_broadcastss_ps(a: __m128) -> __m128 {
+    {
+        transmute(simd_shuffle(
+            a.as_f32x4(),
+            _mm_setzero_ps().as_f32x4(),
+            [0_u32; 4],
+        ))
+    }
+}
+/// Broadcasts the low single-precision (32-bit) floating-point element
+/// from `a` to all elements of the 256-bit returned value.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastss_ps)
+pub fn _mm256_broadcastss_ps(a: __m128) -> __m256 {
+    {
+        transmute(simd_shuffle(
+            a.as_f32x4(),
+            _mm_setzero_ps().as_f32x4(),
+            [0_u32; 8],
+        ))
+    }
 }
-
 /// Broadcasts the low packed 16-bit integer from a to all elements of
 /// the 128-bit returned value
 ///
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastw_epi16)
-pub fn _mm_broadcastw_epi16(a: __m128i) -> __m128i {
-    let ret = simd_shuffle(BitVec::to_i16x8(a), i16x8::from_fn(|_| 0), [0_u64; 8]);
-    ret.into()
+pub fn _mm_broadcastw_epi16(a: __m128i) -> __m128i {
+    {
+        let ret = simd_shuffle(a.as_i16x8(), i16x8::ZERO(), [0_u32; 8]);
+        transmute::<i16x8, _>(ret)
+    }
 }
-
 /// Broadcasts the low packed 16-bit integer from a to all elements of
 /// the 256-bit returned value
 ///
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastw_epi16)
-pub fn _mm256_broadcastw_epi16(a: __m128i) -> __m256i {
-    let ret = simd_shuffle(BitVec::to_i16x8(a), i16x8::from_fn(|_| 0), [0_u64; 16]);
-    ret.into()
+pub fn _mm256_broadcastw_epi16(a: __m128i) -> __m256i {
+    {
+        let ret = simd_shuffle(a.as_i16x8(), i16x8::ZERO(), [0_u32; 16]);
+        transmute::<i16x16, _>(ret)
+    }
 }
-
 /// Compares packed 64-bit integers in `a` and `b` for equality.
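All of the broadcast models above reduce to the same shape: an all-zero shuffle index array, i.e. every output lane copies input lane 0. A trivial scalar sketch (`broadcastd_epi32_ref` is hypothetical, not part of this patch):

```rust
// Scalar reference for _mm256_broadcastd_epi32: every output lane is a
// copy of input lane 0, which the [0_u32; N] index arrays encode.
fn broadcastd_epi32_ref(a: [i32; 4]) -> [i32; 8] {
    [a[0]; 8]
}

fn main() {
    assert_eq!(broadcastd_epi32_ref([7, 1, 2, 3]), [7; 8]);
}
```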
 ///
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpeq_epi64)
-pub fn _mm256_cmpeq_epi64(a: __m256i, b: __m256i) -> __m256i {
-    simd_eq(BitVec::to_i64x4(a), BitVec::to_i64x4(b)).into()
+pub fn _mm256_cmpeq_epi64(a: __m256i, b: __m256i) -> __m256i {
+    {
+        transmute::<i64x4, _>(simd_eq(a.as_i64x4(), b.as_i64x4()))
+    }
 }
-
 /// Compares packed 32-bit integers in `a` and `b` for equality.
 ///
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpeq_epi32)
-pub fn _mm256_cmpeq_epi32(a: __m256i, b: __m256i) -> __m256i {
-    simd_eq(BitVec::to_i32x8(a), BitVec::to_i32x8(b)).into()
+pub fn _mm256_cmpeq_epi32(a: __m256i, b: __m256i) -> __m256i {
+    {
+        transmute::<i32x8, _>(simd_eq(a.as_i32x8(), b.as_i32x8()))
+    }
 }
-
 /// Compares packed 16-bit integers in `a` and `b` for equality.
 ///
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpeq_epi16)
-pub fn _mm256_cmpeq_epi16(a: __m256i, b: __m256i) -> __m256i {
-    simd_eq(BitVec::to_i16x16(a), BitVec::to_i16x16(b)).into()
+pub fn _mm256_cmpeq_epi16(a: __m256i, b: __m256i) -> __m256i {
+    {
+        transmute::<i16x16, _>(simd_eq(a.as_i16x16(), b.as_i16x16()))
+    }
 }
-
 /// Compares packed 8-bit integers in `a` and `b` for equality.
 ///
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpeq_epi8)
-pub fn _mm256_cmpeq_epi8(a: __m256i, b: __m256i) -> __m256i {
-    simd_eq(BitVec::to_i8x32(a), BitVec::to_i8x32(b)).into()
+pub fn _mm256_cmpeq_epi8(a: __m256i, b: __m256i) -> __m256i {
+    {
+        transmute::<i8x32, _>(simd_eq(a.as_i8x32(), b.as_i8x32()))
+    }
 }
-
 /// Compares packed 64-bit integers in `a` and `b` for greater-than.
 ///
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpgt_epi64)
-pub fn _mm256_cmpgt_epi64(a: __m256i, b: __m256i) -> __m256i {
-    simd_gt(BitVec::to_i64x4(a), BitVec::to_i64x4(b)).into()
+pub fn _mm256_cmpgt_epi64(a: __m256i, b: __m256i) -> __m256i {
+    {
+        transmute::<i64x4, _>(simd_gt(a.as_i64x4(), b.as_i64x4()))
+    }
 }
-
 /// Compares packed 32-bit integers in `a` and `b` for greater-than.
 ///
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpgt_epi32)
-pub fn _mm256_cmpgt_epi32(a: __m256i, b: __m256i) -> __m256i {
-    simd_gt(BitVec::to_i32x8(a), BitVec::to_i32x8(b)).into()
+pub fn _mm256_cmpgt_epi32(a: __m256i, b: __m256i) -> __m256i {
+    {
+        transmute::<i32x8, _>(simd_gt(a.as_i32x8(), b.as_i32x8()))
+    }
 }
-
 /// Compares packed 16-bit integers in `a` and `b` for greater-than.
 ///
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpgt_epi16)
-pub fn _mm256_cmpgt_epi16(a: __m256i, b: __m256i) -> __m256i {
-    simd_gt(BitVec::to_i16x16(a), BitVec::to_i16x16(b)).into()
+pub fn _mm256_cmpgt_epi16(a: __m256i, b: __m256i) -> __m256i {
+    {
+        transmute::<i16x16, _>(simd_gt(a.as_i16x16(), b.as_i16x16()))
+    }
 }
-
 /// Compares packed 8-bit integers in `a` and `b` for greater-than.
 ///
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpgt_epi8)
-pub fn _mm256_cmpgt_epi8(a: __m256i, b: __m256i) -> __m256i {
-    simd_gt(BitVec::to_i8x32(a), BitVec::to_i8x32(b)).into()
+pub fn _mm256_cmpgt_epi8(a: __m256i, b: __m256i) -> __m256i {
+    {
+        transmute::<i8x32, _>(simd_gt(a.as_i8x32(), b.as_i8x32()))
+    }
 }
-
 /// Sign-extend 16-bit integers to 32-bit integers.
 ///
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi16_epi32)
-pub fn _mm256_cvtepi16_epi32(a: __m128i) -> __m256i {
-    simd_cast::<8, _, i32>(BitVec::to_i16x8(a)).into()
+pub fn _mm256_cvtepi16_epi32(a: __m128i) -> __m256i {
+    {
+        transmute::<i32x8, _>(simd_cast(a.as_i16x8()))
+    }
 }
-
 /// Sign-extend 16-bit integers to 64-bit integers.
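One detail of the comparison models worth spelling out: SIMD comparisons produce lane masks, not booleans. A one-lane scalar sketch of `_mm256_cmpeq_epi32` (`cmpeq_epi32_lane` is illustrative, not part of this patch):

```rust
// Scalar reference for one lane of _mm256_cmpeq_epi32: SIMD compares
// return all-ones (-1) for true and all-zeros for false.
fn cmpeq_epi32_lane(a: i32, b: i32) -> i32 {
    if a == b { -1 } else { 0 }
}

fn main() {
    assert_eq!(cmpeq_epi32_lane(5, 5), -1i32); // 0xFFFF_FFFF as a lane mask
    assert_eq!(cmpeq_epi32_lane(5, 6), 0);
}
```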
/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi16_epi64) - pub fn _mm256_cvtepi16_epi64(a: __m128i) -> __m256i { - let a = BitVec::to_i16x8(a); - let v64: i16x4 = simd_shuffle(a, a, [0, 1, 2, 3]); - simd_cast::<4, i16, i64>(v64).into() + { + let a = a.as_i16x8(); + let v64: i16x4 = simd_shuffle(a, a, [0, 1, 2, 3]); + transmute::(simd_cast(v64)) + } } - /// Sign-extend 32-bit integers to 64-bit integers. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi32_epi64) - pub fn _mm256_cvtepi32_epi64(a: __m128i) -> __m256i { - simd_cast::<4, i32, i64>(BitVec::to_i32x4(a)).into() + { + transmute::(simd_cast(a.as_i32x4())) + } } - /// Sign-extend 8-bit integers to 16-bit integers. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi8_epi16) - pub fn _mm256_cvtepi8_epi16(a: __m128i) -> __m256i { - simd_cast::<16, i8, i16>(BitVec::to_i8x16(a)).into() + { + transmute::(simd_cast(a.as_i8x16())) + } } - /// Sign-extend 8-bit integers to 32-bit integers. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi8_epi32) - pub fn _mm256_cvtepi8_epi32(a: __m128i) -> __m256i { - let a = BitVec::to_i8x16(a); - let v64: i8x8 = simd_shuffle(a, a, [0, 1, 2, 3, 4, 5, 6, 7]); - simd_cast::<8, i8, i32>(v64).into() + { + let a = a.as_i8x16(); + let v64: i8x8 = simd_shuffle(a, a, [0, 1, 2, 3, 4, 5, 6, 7]); + transmute::(simd_cast(v64)) + } } - /// Sign-extend 8-bit integers to 64-bit integers. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi8_epi64) pub fn _mm256_cvtepi8_epi64(a: __m128i) -> __m256i { - let a = BitVec::to_i8x16(a); - let v32: i8x4 = simd_shuffle(a, a, [0, 1, 2, 3]); - simd_cast::<4, i8, i64>(v32).into() + { + let a = a.as_i8x16(); + let v32: i8x4 = simd_shuffle(a, a, [0, 1, 2, 3]); + transmute::(simd_cast(v32)) + } } - /// Zeroes extend packed unsigned 16-bit integers in `a` to packed 32-bit /// integers, and stores the results in `dst`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu16_epi32) - pub fn _mm256_cvtepu16_epi32(a: __m128i) -> __m256i { - simd_cast::<8, u16, u32>(BitVec::to_u16x8(a)).into() + { + transmute(simd_cast::<8, _, u32>(a.as_u16x8())) + } } - /// Zero-extend the lower four unsigned 16-bit integers in `a` to 64-bit /// integers. The upper four elements of `a` are unused. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu16_epi64) - pub fn _mm256_cvtepu16_epi64(a: __m128i) -> __m256i { - let a = BitVec::to_u16x8(a); - let v64: u16x4 = simd_shuffle(a, a, [0, 1, 2, 3]); - simd_cast::<4, u16, u64>(v64).into() + { + let a = a.as_u16x8(); + let v64: u16x4 = simd_shuffle(a, a, [0, 1, 2, 3]); + transmute(simd_cast::<4, _, u64>(v64)) + } } - /// Zero-extend unsigned 32-bit integers in `a` to 64-bit integers. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu32_epi64) - pub fn _mm256_cvtepu32_epi64(a: __m128i) -> __m256i { - simd_cast::<4, u32, u64>(BitVec::to_u32x4(a)).into() + { + transmute(simd_cast::<4, _, u64>(a.as_u32x4())) + } } - /// Zero-extend unsigned 8-bit integers in `a` to 16-bit integers. 
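The `cvtepi*` models above sign-extend while the `cvtepu*` models zero-extend, and both read only as many low lanes of `a` as fit the wider result. The difference in two scalar lines (plain Rust, no crate types):

```rust
fn main() {
    let x: i8 = -1; // bit pattern 0xFF
    // Sign extension (cvtepi8_* family): the sign bit is replicated.
    assert_eq!(x as i64, -1);
    // Zero extension (cvtepu8_* family): reinterpret as unsigned, then widen.
    assert_eq!(x as u8 as u64, 255);
}
```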
/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu8_epi16) - pub fn _mm256_cvtepu8_epi16(a: __m128i) -> __m256i { - simd_cast::<16, u8, u16>(BitVec::to_u8x16(a)).into() + { + transmute(simd_cast::<16, _, u16>(a.as_u8x16())) + } } - /// Zero-extend the lower eight unsigned 8-bit integers in `a` to 32-bit /// integers. The upper eight elements of `a` are unused. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu8_epi32) - pub fn _mm256_cvtepu8_epi32(a: __m128i) -> __m256i { - let a = BitVec::to_u8x16(a); - let v64: u8x8 = simd_shuffle(a, a, [0, 1, 2, 3, 4, 5, 6, 7]); - simd_cast::<8, u8, u32>(v64).into() + { + let a = a.as_u8x16(); + let v64: u8x8 = simd_shuffle(a, a, [0, 1, 2, 3, 4, 5, 6, 7]); + transmute(simd_cast::<8, _, u32>(v64)) + } } - /// Zero-extend the lower four unsigned 8-bit integers in `a` to 64-bit /// integers. The upper twelve elements of `a` are unused. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu8_epi64) - pub fn _mm256_cvtepu8_epi64(a: __m128i) -> __m256i { - let a = BitVec::to_u8x16(a); - let v32: u8x4 = simd_shuffle(a, a, [0, 1, 2, 3]); - simd_cast::<4, u8, u64>(v32).into() + { + let a = a.as_u8x16(); + let v32: u8x4 = simd_shuffle(a, a, [0, 1, 2, 3]); + transmute(simd_cast::<4, _, u64>(v32)) + } } - /// Extracts 128 bits (of integer data) from `a` selected with `IMM1`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extracti128_si256) - pub fn _mm256_extracti128_si256(a: __m256i) -> __m128i { - let a = BitVec::to_i64x4(a); - let b = i64x4::from_fn(|_| 0); - let dst: i64x2 = simd_shuffle(a, b, [[0, 1], [2, 3]][IMM1 as usize]); - dst.into() + static_assert_uimm_bits!(IMM1, 1); + { + let a = a.as_i64x4(); + let b = i64x4::ZERO(); + let dst: i64x2 = simd_shuffle(a, b, [[0, 1], [2, 3]][IMM1 as usize]); + transmute(dst) + } } - /// Horizontally adds adjacent pairs of 16-bit integers in `a` and `b`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hadd_epi16) - pub fn _mm256_hadd_epi16(a: __m256i, b: __m256i) -> __m256i { - phaddw(BitVec::to_i16x16(a), BitVec::to_i16x16(b)).into() + { + transmute(phaddw(a.as_i16x16(), b.as_i16x16())) + } } - /// Horizontally adds adjacent pairs of 32-bit integers in `a` and `b`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hadd_epi32) - pub fn _mm256_hadd_epi32(a: __m256i, b: __m256i) -> __m256i { - phaddd(BitVec::to_i32x8(a), BitVec::to_i32x8(b)).into() + { + transmute(phaddd(a.as_i32x8(), b.as_i32x8())) + } } - /// Horizontally adds adjacent pairs of 16-bit integers in `a` and `b` /// using saturation. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hadds_epi16) - pub fn _mm256_hadds_epi16(a: __m256i, b: __m256i) -> __m256i { - phaddsw(BitVec::to_i16x16(a), BitVec::to_i16x16(b)).into() + { + transmute(phaddsw(a.as_i16x16(), b.as_i16x16())) + } } - /// Horizontally subtract adjacent pairs of 16-bit integers in `a` and `b`. 
/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hsub_epi16) - pub fn _mm256_hsub_epi16(a: __m256i, b: __m256i) -> __m256i { - phsubw(BitVec::to_i16x16(a), BitVec::to_i16x16(b)).into() + { + transmute(phsubw(a.as_i16x16(), b.as_i16x16())) + } } - /// Horizontally subtract adjacent pairs of 32-bit integers in `a` and `b`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hsub_epi32) - pub fn _mm256_hsub_epi32(a: __m256i, b: __m256i) -> __m256i { - phsubd(BitVec::to_i32x8(a), BitVec::to_i32x8(b)).into() + { + transmute(phsubd(a.as_i32x8(), b.as_i32x8())) + } } - /// Horizontally subtract adjacent pairs of 16-bit integers in `a` and `b` /// using saturation. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hsubs_epi16) - pub fn _mm256_hsubs_epi16(a: __m256i, b: __m256i) -> __m256i { - phsubsw(BitVec::to_i16x16(a), BitVec::to_i16x16(b)).into() + { + transmute(phsubsw(a.as_i16x16(), b.as_i16x16())) + } } - /// Copies `a` to `dst`, then insert 128 bits (of integer data) from `b` at the /// location specified by `IMM1`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_inserti128_si256) - pub fn _mm256_inserti128_si256(a: __m256i, b: __m128i) -> __m256i { - let a = BitVec::to_i64x4(a); - let b = BitVec::to_i64x4(_mm256_castsi128_si256(b)); - let dst: i64x4 = simd_shuffle(a, b, [[4, 5, 2, 3], [0, 1, 4, 5]][IMM1 as usize]); - dst.into() + static_assert_uimm_bits!(IMM1, 1); + { + let a = a.as_i64x4(); + let b = _mm256_castsi128_si256(b).as_i64x4(); + let dst: i64x4 = simd_shuffle(a, b, [[4, 5, 2, 3], [0, 1, 4, 5]][IMM1 as usize]); + transmute(dst) + } } - /// Multiplies packed signed 16-bit integers in `a` and `b`, producing /// intermediate signed 32-bit integers. Horizontally add adjacent pairs /// of intermediate 32-bit integers. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_madd_epi16) - pub fn _mm256_madd_epi16(a: __m256i, b: __m256i) -> __m256i { - pmaddwd(BitVec::to_i16x16(a), BitVec::to_i16x16(b)).into() + { + transmute(pmaddwd(a.as_i16x16(), b.as_i16x16())) + } } - /// Vertically multiplies each unsigned 8-bit integer from `a` with the /// corresponding signed 8-bit integer from `b`, producing intermediate /// signed 16-bit integers. Horizontally add adjacent pairs of intermediate /// signed 16-bit integers /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maddubs_epi16) - pub fn _mm256_maddubs_epi16(a: __m256i, b: __m256i) -> __m256i { - pmaddubsw(BitVec::to_u8x32(a), BitVec::to_u8x32(b)).into() + { + transmute(pmaddubsw(a.as_u8x32(), b.as_u8x32())) + } } - /// Compares packed 16-bit integers in `a` and `b`, and returns the packed /// maximum values. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epi16) - pub fn _mm256_max_epi16(a: __m256i, b: __m256i) -> __m256i { - let a = BitVec::to_i16x16(a); - let b = BitVec::to_i16x16(b); - simd_select::<16, i16, _>(simd_gt(a, b), a, b).into() + { + let a = a.as_i16x16(); + let b = b.as_i16x16(); + transmute(simd_select(simd_gt(a, b), a, b)) + } } - /// Compares packed 32-bit integers in `a` and `b`, and returns the packed /// maximum values. 
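A scalar reference for the `pmaddwd` behaviour behind `_mm256_madd_epi16`: widen each 16-bit product to 32 bits, then add adjacent pairs, halving the lane count (`madd_epi16_ref` is a hypothetical reference, not part of this patch):

```rust
// Scalar reference for _mm256_madd_epi16: multiply 16-bit lanes into
// 32-bit products, then add adjacent products; 16 inputs -> 8 outputs.
fn madd_epi16_ref(a: [i16; 16], b: [i16; 16]) -> [i32; 8] {
    let mut r = [0i32; 8];
    for i in 0..8 {
        r[i] = (a[2 * i] as i32 * b[2 * i] as i32)
            .wrapping_add(a[2 * i + 1] as i32 * b[2 * i + 1] as i32);
    }
    r
}

fn main() {
    let a = [1i16; 16];
    let b = [2i16; 16];
    assert_eq!(madd_epi16_ref(a, b), [4i32; 8]); // 1*2 + 1*2 per pair
}
```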
/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epi32) - pub fn _mm256_max_epi32(a: __m256i, b: __m256i) -> __m256i { - let a = BitVec::to_i32x8(a); - let b = BitVec::to_i32x8(b); - simd_select::<8, i32, _>(simd_gt(a, b), a, b).into() + { + let a = a.as_i32x8(); + let b = b.as_i32x8(); + transmute(simd_select(simd_gt(a, b), a, b)) + } } - /// Compares packed 8-bit integers in `a` and `b`, and returns the packed /// maximum values. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epi8) - pub fn _mm256_max_epi8(a: __m256i, b: __m256i) -> __m256i { - let a = BitVec::to_i8x32(a); - let b = BitVec::to_i8x32(b); - simd_select::<32, i8, _>(simd_gt(a, b), a, b).into() + { + let a = a.as_i8x32(); + let b = b.as_i8x32(); + transmute(simd_select(simd_gt(a, b), a, b)) + } } - /// Compares packed unsigned 16-bit integers in `a` and `b`, and returns /// the packed maximum values. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epu16) - pub fn _mm256_max_epu16(a: __m256i, b: __m256i) -> __m256i { - let a = BitVec::to_u16x16(a); - let b = BitVec::to_u16x16(b); - simd_select::<16, _, u16>(simd_gt(a, b), a, b).into() + { + let a = a.as_u16x16(); + let b = b.as_u16x16(); + transmute(simd_select(simd_gt(a, b), a, b)) + } } - /// Compares packed unsigned 32-bit integers in `a` and `b`, and returns /// the packed maximum values. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epu32) - pub fn _mm256_max_epu32(a: __m256i, b: __m256i) -> __m256i { - let a = BitVec::to_u32x8(a); - let b = BitVec::to_u32x8(b); - simd_select::<8, _, u32>(simd_gt(a, b), a, b).into() + { + let a = a.as_u32x8(); + let b = b.as_u32x8(); + transmute(simd_select(simd_gt(a, b), a, b)) + } } - /// Compares packed unsigned 8-bit integers in `a` and `b`, and returns /// the packed maximum values. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epu8) - pub fn _mm256_max_epu8(a: __m256i, b: __m256i) -> __m256i { - let a = BitVec::to_u8x32(a); - let b = BitVec::to_u8x32(b); - simd_select::<32, _, u8>(simd_gt(a, b), a, b).into() + { + let a = a.as_u8x32(); + let b = b.as_u8x32(); + transmute(simd_select(simd_gt(a, b), a, b)) + } } - /// Compares packed 16-bit integers in `a` and `b`, and returns the packed /// minimum values. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epi16) - pub fn _mm256_min_epi16(a: __m256i, b: __m256i) -> __m256i { - let a = BitVec::to_i16x16(a); - let b = BitVec::to_i16x16(b); - simd_select::<16, _, i16>(simd_lt(a, b), a, b).into() + { + let a = a.as_i16x16(); + let b = b.as_i16x16(); + transmute(simd_select(simd_lt(a, b), a, b)) + } } - /// Compares packed 32-bit integers in `a` and `b`, and returns the packed /// minimum values. 
/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epi32) - pub fn _mm256_min_epi32(a: __m256i, b: __m256i) -> __m256i { - let a = BitVec::to_i32x8(a); - let b = BitVec::to_i32x8(b); - simd_select::<8, i32, _>(simd_lt(a, b), a, b).into() + { + let a = a.as_i32x8(); + let b = b.as_i32x8(); + transmute(simd_select(simd_lt(a, b), a, b)) + } } - /// Compares packed 8-bit integers in `a` and `b`, and returns the packed /// minimum values. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epi8) - pub fn _mm256_min_epi8(a: __m256i, b: __m256i) -> __m256i { - let a = BitVec::to_i8x32(a); - let b = BitVec::to_i8x32(b); - simd_select::<32, i8, _>(simd_lt(a, b), a, b).into() + { + let a = a.as_i8x32(); + let b = b.as_i8x32(); + transmute(simd_select(simd_lt(a, b), a, b)) + } } - /// Compares packed unsigned 16-bit integers in `a` and `b`, and returns /// the packed minimum values. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epu16) - pub fn _mm256_min_epu16(a: __m256i, b: __m256i) -> __m256i { - let a = BitVec::to_u16x16(a); - let b = BitVec::to_u16x16(b); - simd_select::<16, _, u16>(simd_lt(a, b), a, b).into() + { + let a = a.as_u16x16(); + let b = b.as_u16x16(); + transmute(simd_select(simd_lt(a, b), a, b)) + } } - /// Compares packed unsigned 32-bit integers in `a` and `b`, and returns /// the packed minimum values. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epu32) - pub fn _mm256_min_epu32(a: __m256i, b: __m256i) -> __m256i { - let a = BitVec::to_u32x8(a); - let b = BitVec::to_u32x8(b); - simd_select::<8, _, u32>(simd_lt(a, b), a, b).into() + { + let a = a.as_u32x8(); + let b = b.as_u32x8(); + transmute(simd_select(simd_lt(a, b), a, b)) + } } - /// Compares packed unsigned 8-bit integers in `a` and `b`, and returns /// the packed minimum values. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epu8) - pub fn _mm256_min_epu8(a: __m256i, b: __m256i) -> __m256i { - let a = BitVec::to_u8x32(a); - let b = BitVec::to_u8x32(b); - simd_select::<32, _, u8>(simd_lt(a, b), a, b).into() + { + let a = a.as_u8x32(); + let b = b.as_u8x32(); + transmute(simd_select(simd_lt(a, b), a, b)) + } } - /// Creates mask from the most significant bit of each 8-bit element in `a`, /// return the result. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movemask_epi8) - pub fn _mm256_movemask_epi8(a: __m256i) -> i32 { - let z = i8x32::from_fn(|_| 0); - let m: i8x32 = simd_lt(BitVec::to_i8x32(a), z); - let r = simd_bitmask_little!(31, m, u32); - r as i32 + { + let z = i8x32::ZERO(); + let m: i8x32 = simd_lt(a.as_i8x32(), z); + simd_bitmask_little!(31, m, u32) as i32 + } } - /// Computes the sum of absolute differences (SADs) of quadruplets of unsigned /// 8-bit integers in `a` compared to those in `b`, and stores the 16-bit /// results in dst. Eight SADs are performed for each 128-bit lane using one @@ -1565,177 +900,204 @@ pub fn _mm256_movemask_epi8(a: __m256i) -> i32 { /// starting at the offset specified in `imm8`. 
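`_mm256_movemask_epi8` packs the sign bit of each of the 32 bytes into one integer, least-significant lane first, which is what the `simd_bitmask_little!` macro in the model computes. A scalar sketch (`movemask_epi8_ref` is illustrative, not part of this patch):

```rust
// Scalar reference for _mm256_movemask_epi8: bit i of the result is the
// most significant bit of byte i, packed little-endian over 32 lanes.
fn movemask_epi8_ref(a: [u8; 32]) -> i32 {
    let mut m: u32 = 0;
    for i in 0..32 {
        m |= ((a[i] >> 7) as u32) << i;
    }
    m as i32
}

fn main() {
    let mut v = [0u8; 32];
    v[0] = 0x80;
    v[31] = 0xFF;
    assert_eq!(movemask_epi8_ref(v) as u32, 0x8000_0001);
}
```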
/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mpsadbw_epu8) - pub fn _mm256_mpsadbw_epu8(a: __m256i, b: __m256i) -> __m256i { - mpsadbw(BitVec::to_u8x32(a), BitVec::to_u8x32(b), IMM8).into() + static_assert_uimm_bits!(IMM8, 8); + { + transmute(mpsadbw(a.as_u8x32(), b.as_u8x32(), IMM8 as i8)) + } } - /// Multiplies the low 32-bit integers from each packed 64-bit element in /// `a` and `b` /// /// Returns the 64-bit results. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mul_epi32) - pub fn _mm256_mul_epi32(a: __m256i, b: __m256i) -> __m256i { - let a = simd_cast::<4, _, i64>(simd_cast::<4, _, i32>(BitVec::to_i64x4(a))); - let b = simd_cast::<4, _, i64>(simd_cast::<4, _, i32>(BitVec::to_i64x4(b))); - simd_mul(a, b).into() + { + let a = simd_cast::<4, _, i64>(simd_cast::<4, _, i32>(a.as_i64x4())); + let b = simd_cast::<4, _, i64>(simd_cast::<4, _, i32>(b.as_i64x4())); + transmute(simd_mul(a, b)) + } } - /// Multiplies the low unsigned 32-bit integers from each packed 64-bit /// element in `a` and `b` /// /// Returns the unsigned 64-bit results. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mul_epu32) - pub fn _mm256_mul_epu32(a: __m256i, b: __m256i) -> __m256i { - let a = BitVec::to_u64x4(a); - let b = BitVec::to_u64x4(b); - let mask = u64x4::splat(u32::MAX.into()); - BitVec::from_u64x4(simd_mul(simd_and(a, mask), simd_and(b, mask))) + { + let a = a.as_u64x4(); + let b = b.as_u64x4(); + let mask = u64x4::splat(u32::MAX.into()); + transmute(simd_mul(simd_and(a, mask), simd_and(b, mask))) + } } - /// Multiplies the packed 16-bit integers in `a` and `b`, producing /// intermediate 32-bit integers and returning the high 16 bits of the /// intermediate integers. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mulhi_epi16) - pub fn _mm256_mulhi_epi16(a: __m256i, b: __m256i) -> __m256i { - let a = simd_cast::<16, _, i32>(BitVec::to_i16x16(a)); - let b = simd_cast::<16, _, i32>(BitVec::to_i16x16(b)); - let r = simd_shr(simd_mul(a, b), i32x16::splat(16)); - simd_cast::<16, i32, i16>(r).into() + { + let a = simd_cast::<16, _, i32>(a.as_i16x16()); + let b = simd_cast::<16, _, i32>(b.as_i16x16()); + let r = simd_shr(simd_mul(a, b), i32x16::splat(16)); + transmute(simd_cast::<16, i32, i16>(r)) + } } - /// Multiplies the packed unsigned 16-bit integers in `a` and `b`, producing /// intermediate 32-bit integers and returning the high 16 bits of the /// intermediate integers. 
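The `mulhi` models avoid any special-casing by widening, multiplying exactly, and shifting the high half back down. One lane of `_mm256_mulhi_epi16` in plain Rust (`mulhi_epi16_lane` is hypothetical, not part of this patch):

```rust
// Scalar reference for one lane of _mm256_mulhi_epi16: widen to i32,
// multiply exactly, keep the high 16 bits.
fn mulhi_epi16_lane(a: i16, b: i16) -> i16 {
    ((a as i32 * b as i32) >> 16) as i16
}

fn main() {
    assert_eq!(mulhi_epi16_lane(0x4000, 0x4000), 0x1000); // 2^14 * 2^14 >> 16
    assert_eq!(mulhi_epi16_lane(-1, 1), -1); // the high half of -1 is all ones
}
```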
 ///
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mulhi_epu16)
-pub fn _mm256_mulhi_epu16(a: __m256i, b: __m256i) -> __m256i {
-    let a = simd_cast::<16, _, u32>(BitVec::to_u16x16(a));
-    let b = simd_cast::<16, _, u32>(BitVec::to_u16x16(b));
-    let r = simd_shr(simd_mul(a, b), u32x16::splat(16));
-    simd_cast::<16, u32, u16>(r).into()
+pub fn _mm256_mulhi_epu16(a: __m256i, b: __m256i) -> __m256i {
+    {
+        let a = simd_cast::<16, _, u32>(a.as_u16x16());
+        let b = simd_cast::<16, _, u32>(b.as_u16x16());
+        let r = simd_shr(simd_mul(a, b), u32x16::splat(16));
+        transmute(simd_cast::<16, u32, u16>(r))
+    }
 }
-
 /// Multiplies the packed 16-bit integers in `a` and `b`, producing
 /// intermediate 32-bit integers, and returns the low 16 bits of the
 /// intermediate integers
 ///
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mullo_epi16)
-pub fn _mm256_mullo_epi16(a: __m256i, b: __m256i) -> __m256i {
-    simd_mul(BitVec::to_i16x16(a), BitVec::to_i16x16(b)).into()
+pub fn _mm256_mullo_epi16(a: __m256i, b: __m256i) -> __m256i {
+    {
+        transmute(simd_mul(a.as_i16x16(), b.as_i16x16()))
+    }
 }
-
 /// Multiplies the packed 32-bit integers in `a` and `b`, producing
 /// intermediate 64-bit integers, and returns the low 32 bits of the
 /// intermediate integers
 ///
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mullo_epi32)
-pub fn _mm256_mullo_epi32(a: __m256i, b: __m256i) -> __m256i {
-    simd_mul(BitVec::to_i32x8(a), BitVec::to_i32x8(b)).into()
+pub fn _mm256_mullo_epi32(a: __m256i, b: __m256i) -> __m256i {
+    {
+        transmute(simd_mul(a.as_i32x8(), b.as_i32x8()))
+    }
 }
-
 /// Multiplies packed 16-bit integers in `a` and `b`, producing
 /// intermediate signed 32-bit integers. Truncate each intermediate
 /// integer to the 18 most significant bits, round by adding 1, and
 /// return bits `[16:1]`.
/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mulhrs_epi16) - pub fn _mm256_mulhrs_epi16(a: __m256i, b: __m256i) -> __m256i { - pmulhrsw(BitVec::to_i16x16(a), BitVec::to_i16x16(b)).into() + { + transmute(pmulhrsw(a.as_i16x16(), b.as_i16x16())) + } } - /// Computes the bitwise OR of 256 bits (representing integer data) in `a` /// and `b` /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_or_si256) - pub fn _mm256_or_si256(a: __m256i, b: __m256i) -> __m256i { - simd_or(BitVec::to_i32x8(a), BitVec::to_i32x8(b)).into() + { + transmute(simd_or(a.as_i32x8(), b.as_i32x8())) + } } - /// Converts packed 16-bit integers from `a` and `b` to packed 8-bit integers /// using signed saturation /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packs_epi16) - pub fn _mm256_packs_epi16(a: __m256i, b: __m256i) -> __m256i { - packsswb(BitVec::to_i16x16(a), BitVec::to_i16x16(b)).into() + { + transmute(packsswb(a.as_i16x16(), b.as_i16x16())) + } } - /// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers /// using signed saturation /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packs_epi32) - pub fn _mm256_packs_epi32(a: __m256i, b: __m256i) -> __m256i { - packssdw(BitVec::to_i32x8(a), BitVec::to_i32x8(b)).into() + { + transmute(packssdw(a.as_i32x8(), b.as_i32x8())) + } } - /// Converts packed 16-bit integers from `a` and `b` to packed 8-bit integers /// using unsigned saturation /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packus_epi16) - pub fn _mm256_packus_epi16(a: __m256i, b: __m256i) -> __m256i { - packuswb(BitVec::to_i16x16(a), BitVec::to_i16x16(b)).into() + { + transmute(packuswb(a.as_i16x16(), b.as_i16x16())) + } } - /// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers /// using unsigned saturation /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packus_epi32) - pub fn _mm256_packus_epi32(a: __m256i, b: __m256i) -> __m256i { - packusdw(BitVec::to_i32x8(a), BitVec::to_i32x8(b)).into() + { + transmute(packusdw(a.as_i32x8(), b.as_i32x8())) + } } - /// Permutes packed 32-bit integers from `a` according to the content of `b`. /// /// The last 3 bits of each integer of `b` are used as addresses into the 8 /// integers of `a`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutevar8x32_epi32) - pub fn _mm256_permutevar8x32_epi32(a: __m256i, b: __m256i) -> __m256i { - permd(BitVec::to_u32x8(a), BitVec::to_u32x8(b)).into() + { + transmute(permd(a.as_u32x8(), b.as_u32x8())) + } } - /// Permutes 64-bit integers from `a` using control mask `imm8`. 
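The `pmulhrsw` fixed-point rounding step modeled above is easy to misread, so here it is for a single lane in plain Rust (`mulhrs_epi16_lane` is illustrative, not part of this patch): take the full 32-bit product, shift down to bit 14, add the rounding 1, and drop the last bit.

```rust
// Scalar reference for one lane of _mm256_mulhrs_epi16, mirroring the
// pmulhrsw helper: ((a * b >> 14) + 1) >> 1.
fn mulhrs_epi16_lane(a: i16, b: i16) -> i16 {
    let t = (a as i32) * (b as i32);
    (((t >> 14).wrapping_add(1)) >> 1) as i16
}

fn main() {
    // 0.5 * 0.5 in Q15 fixed point is 0.25:
    assert_eq!(mulhrs_epi16_lane(0x4000, 0x4000), 0x2000);
}
```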
/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute4x64_epi64) - pub fn _mm256_permute4x64_epi64(a: __m256i) -> __m256i { - let zero = i64x4::from_fn(|_| 0); - let r: i64x4 = simd_shuffle( - BitVec::to_i64x4(a), - zero, - [ - IMM8 as u64 & 0b11, - (IMM8 as u64 >> 2) & 0b11, - (IMM8 as u64 >> 4) & 0b11, - (IMM8 as u64 >> 6) & 0b11, - ], - ); - r.into() + static_assert_uimm_bits!(IMM8, 8); + { + let zero = i64x4::ZERO(); + let r: i64x4 = simd_shuffle( + a.as_i64x4(), + zero, + [ + IMM8 as u32 & 0b11, + (IMM8 as u32 >> 2) & 0b11, + (IMM8 as u32 >> 4) & 0b11, + (IMM8 as u32 >> 6) & 0b11, + ], + ); + transmute(r) + } } - /// Shuffles 128-bits of integer data selected by `imm8` from `a` and `b`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2x128_si256) - pub fn _mm256_permute2x128_si256(a: __m256i, b: __m256i) -> __m256i { - vperm2i128(BitVec::to_i64x4(a), BitVec::to_i64x4(b), IMM8 as i8).into() + static_assert_uimm_bits!(IMM8, 8); + { + transmute(vperm2i128(a.as_i64x4(), b.as_i64x4(), IMM8 as i8)) + } } +/// Shuffles 64-bit floating-point elements in `a` across lanes using the +/// control in `imm8`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute4x64_pd) +// NOTE: Not modeled yet +// pub fn _mm256_permute4x64_pd(a: __m256d) -> __m256d { +// static_assert_uimm_bits!(IMM8, 8); +// { +// transmute(simd_shuffle( +// a, _mm256_undefined_pd(), [IMM8 as u32 & 0b11, (IMM8 as u32 >> 2) & 0b11, +// (IMM8 as u32 >> 4) & 0b11, (IMM8 as u32 >> 6) & 0b11,], +// )) +// } +// } + +/// Shuffles eight 32-bit floating-point elements in `a` across lanes using +/// the corresponding 32-bit integer index in `idx`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutevar8x32_ps) +// NOTE: Not modeled yet +// pub fn _mm256_permutevar8x32_ps(a: __m256, idx: __m256i) -> __m256 { +// { permps(a, idx.as_i32x8()) } +// } /// Computes the absolute differences of packed unsigned 8-bit integers in `a` /// and `b`, then horizontally sum each consecutive 8 differences to @@ -1743,11 +1105,11 @@ pub fn _mm256_permute2x128_si256(a: __m256i, b: __m256i) -> __m /// integers in the low 16 bits of the 64-bit return value /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sad_epu8) - pub fn _mm256_sad_epu8(a: __m256i, b: __m256i) -> __m256i { - psadbw(BitVec::to_u8x32(a), BitVec::to_u8x32(b)).into() + { + transmute(psadbw(a.as_u8x32(), b.as_u8x32())) + } } - /// Shuffles bytes from `a` according to the content of `b`. /// /// For each of the 128-bit low and high halves of the vectors, the last @@ -1764,6 +1126,8 @@ pub fn _mm256_sad_epu8(a: __m256i, b: __m256i) -> __m256i { /// fn mm256_shuffle_epi8(a: [u8; 32], b: [u8; 32]) -> [u8; 32] { /// let mut r = [0; 32]; /// for i in 0..16 { +/// // if the most significant bit of b is set, +/// // then the destination byte is set to 0. 
/// if b[i] & 0x80 == 0u8 { /// r[i] = a[(b[i] % 16) as usize]; /// } @@ -1776,208 +1140,216 @@ pub fn _mm256_sad_epu8(a: __m256i, b: __m256i) -> __m256i { /// ``` /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shuffle_epi8) - pub fn _mm256_shuffle_epi8(a: __m256i, b: __m256i) -> __m256i { - pshufb(BitVec::to_u8x32(a), BitVec::to_u8x32(b)).into() + { + transmute(pshufb(a.as_u8x32(), b.as_u8x32())) + } } - /// Shuffles 32-bit integers in 128-bit lanes of `a` using the control in /// `imm8`. +/// +/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shuffle_epi32) - pub fn _mm256_shuffle_epi32(a: __m256i) -> __m256i { - let r: i32x8 = simd_shuffle( - BitVec::to_i32x8(a), - BitVec::to_i32x8(a), - [ - MASK as u64 & 0b11, - (MASK as u64 >> 2) & 0b11, - (MASK as u64 >> 4) & 0b11, - (MASK as u64 >> 6) & 0b11, - (MASK as u64 & 0b11) + 4, - ((MASK as u64 >> 2) & 0b11) + 4, - ((MASK as u64 >> 4) & 0b11) + 4, - ((MASK as u64 >> 6) & 0b11) + 4, - ], - ); - r.into() + static_assert_uimm_bits!(MASK, 8); + { + let r: i32x8 = simd_shuffle( + a.as_i32x8(), + a.as_i32x8(), + [ + MASK as u32 & 0b11, + (MASK as u32 >> 2) & 0b11, + (MASK as u32 >> 4) & 0b11, + (MASK as u32 >> 6) & 0b11, + (MASK as u32 & 0b11) + 4, + ((MASK as u32 >> 2) & 0b11) + 4, + ((MASK as u32 >> 4) & 0b11) + 4, + ((MASK as u32 >> 6) & 0b11) + 4, + ], + ); + transmute(r) + } } - /// Shuffles 16-bit integers in the high 64 bits of 128-bit lanes of `a` using /// the control in `imm8`. The low 64 bits of 128-bit lanes of `a` are copied /// to the output. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shufflehi_epi16) - pub fn _mm256_shufflehi_epi16(a: __m256i) -> __m256i { - let a = BitVec::to_i16x16(a); - let r: i16x16 = simd_shuffle( - a, - a, - [ - 0, - 1, - 2, - 3, - 4 + (IMM8 as u64 & 0b11), - 4 + ((IMM8 as u64 >> 2) & 0b11), - 4 + ((IMM8 as u64 >> 4) & 0b11), - 4 + ((IMM8 as u64 >> 6) & 0b11), - 8, - 9, - 10, - 11, - 12 + (IMM8 as u64 & 0b11), - 12 + ((IMM8 as u64 >> 2) & 0b11), - 12 + ((IMM8 as u64 >> 4) & 0b11), - 12 + ((IMM8 as u64 >> 6) & 0b11), - ], - ); - r.into() + static_assert_uimm_bits!(IMM8, 8); + { + let a = a.as_i16x16(); + let r: i16x16 = simd_shuffle( + a, + a, + [ + 0, + 1, + 2, + 3, + 4 + (IMM8 as u32 & 0b11), + 4 + ((IMM8 as u32 >> 2) & 0b11), + 4 + ((IMM8 as u32 >> 4) & 0b11), + 4 + ((IMM8 as u32 >> 6) & 0b11), + 8, + 9, + 10, + 11, + 12 + (IMM8 as u32 & 0b11), + 12 + ((IMM8 as u32 >> 2) & 0b11), + 12 + ((IMM8 as u32 >> 4) & 0b11), + 12 + ((IMM8 as u32 >> 6) & 0b11), + ], + ); + transmute(r) + } } - /// Shuffles 16-bit integers in the low 64 bits of 128-bit lanes of `a` using /// the control in `imm8`. The high 64 bits of 128-bit lanes of `a` are copied /// to the output. 
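The two-bit index decoding shared by `_mm256_shuffle_epi32`, `_mm256_shufflehi_epi16`, and `_mm256_shufflelo_epi16` can be checked in isolation (`shuffle_epi32_indices` is a hypothetical helper, not part of this patch):

```rust
// Scalar reference for the index computation in _mm256_shuffle_epi32:
// MASK packs four 2-bit source indices, one per destination lane, and the
// same pattern is applied within each 128-bit lane.
fn shuffle_epi32_indices(mask: u8) -> [usize; 4] {
    [
        (mask & 0b11) as usize,
        ((mask >> 2) & 0b11) as usize,
        ((mask >> 4) & 0b11) as usize,
        ((mask >> 6) & 0b11) as usize,
    ]
}

fn main() {
    // 0b00_01_10_11 reverses the four 32-bit lanes of each half.
    assert_eq!(shuffle_epi32_indices(0b00_01_10_11), [3, 2, 1, 0]);
}
```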
/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shufflelo_epi16) - pub fn _mm256_shufflelo_epi16(a: __m256i) -> __m256i { - let a = BitVec::to_i16x16(a); - let r: i16x16 = simd_shuffle( - a, - a, - [ - 0 + (IMM8 as u64 & 0b11), - 0 + ((IMM8 as u64 >> 2) & 0b11), - 0 + ((IMM8 as u64 >> 4) & 0b11), - 0 + ((IMM8 as u64 >> 6) & 0b11), - 4, - 5, - 6, - 7, - 8 + (IMM8 as u64 & 0b11), - 8 + ((IMM8 as u64 >> 2) & 0b11), - 8 + ((IMM8 as u64 >> 4) & 0b11), - 8 + ((IMM8 as u64 >> 6) & 0b11), - 12, - 13, - 14, - 15, - ], - ); - r.into() + static_assert_uimm_bits!(IMM8, 8); + { + let a = a.as_i16x16(); + let r: i16x16 = simd_shuffle( + a, + a, + [ + 0 + (IMM8 as u32 & 0b11), + 0 + ((IMM8 as u32 >> 2) & 0b11), + 0 + ((IMM8 as u32 >> 4) & 0b11), + 0 + ((IMM8 as u32 >> 6) & 0b11), + 4, + 5, + 6, + 7, + 8 + (IMM8 as u32 & 0b11), + 8 + ((IMM8 as u32 >> 2) & 0b11), + 8 + ((IMM8 as u32 >> 4) & 0b11), + 8 + ((IMM8 as u32 >> 6) & 0b11), + 12, + 13, + 14, + 15, + ], + ); + transmute(r) + } } - /// Negates packed 16-bit integers in `a` when the corresponding signed /// 16-bit integer in `b` is negative, and returns the results. /// Results are zeroed out when the corresponding element in `b` is zero. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sign_epi16) - pub fn _mm256_sign_epi16(a: __m256i, b: __m256i) -> __m256i { - psignw(BitVec::to_i16x16(a), BitVec::to_i16x16(b)).into() + { + transmute(psignw(a.as_i16x16(), b.as_i16x16())) + } } - /// Negates packed 32-bit integers in `a` when the corresponding signed /// 32-bit integer in `b` is negative, and returns the results. /// Results are zeroed out when the corresponding element in `b` is zero. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sign_epi32) - pub fn _mm256_sign_epi32(a: __m256i, b: __m256i) -> __m256i { - psignd(BitVec::to_i32x8(a), BitVec::to_i32x8(b)).into() + { + transmute(psignd(a.as_i32x8(), b.as_i32x8())) + } } - /// Negates packed 8-bit integers in `a` when the corresponding signed /// 8-bit integer in `b` is negative, and returns the results. /// Results are zeroed out when the corresponding element in `b` is zero. 
/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sign_epi8) - pub fn _mm256_sign_epi8(a: __m256i, b: __m256i) -> __m256i { - psignb(BitVec::to_i8x32(a), BitVec::to_i8x32(b)).into() + { + transmute(psignb(a.as_i8x32(), b.as_i8x32())) + } } - /// Shifts packed 16-bit integers in `a` left by `count` while /// shifting in zeros, and returns the result /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sll_epi16) - pub fn _mm256_sll_epi16(a: __m256i, count: __m128i) -> __m256i { - psllw(BitVec::to_i16x16(a), BitVec::to_i16x8(count)).into() + { + transmute(psllw(a.as_i16x16(), count.as_i16x8())) + } } - /// Shifts packed 32-bit integers in `a` left by `count` while /// shifting in zeros, and returns the result /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sll_epi32) - pub fn _mm256_sll_epi32(a: __m256i, count: __m128i) -> __m256i { - pslld(BitVec::to_i32x8(a), BitVec::to_i32x4(count)).into() + { + transmute(pslld(a.as_i32x8(), count.as_i32x4())) + } } - /// Shifts packed 64-bit integers in `a` left by `count` while /// shifting in zeros, and returns the result /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sll_epi64) - pub fn _mm256_sll_epi64(a: __m256i, count: __m128i) -> __m256i { - psllq(BitVec::to_i64x4(a), BitVec::to_i64x2(count)).into() + { + transmute(psllq(a.as_i64x4(), count.as_i64x2())) + } } - /// Shifts packed 16-bit integers in `a` left by `IMM8` while /// shifting in zeros, return the results; /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_slli_epi16) - pub fn _mm256_slli_epi16(a: __m256i) -> __m256i { - if IMM8 >= 16 { - _mm256_setzero_si256() - } else { - simd_shl(BitVec::to_u16x16(a), u16x16::splat(IMM8 as u16)).into() + static_assert_uimm_bits!(IMM8, 8); + { + if IMM8 >= 16 { + _mm256_setzero_si256() + } else { + transmute(simd_shl(a.as_u16x16(), u16x16::splat(IMM8 as u16))) + } } } - /// Shifts packed 32-bit integers in `a` left by `IMM8` while /// shifting in zeros, return the results; /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_slli_epi32) - pub fn _mm256_slli_epi32(a: __m256i) -> __m256i { - if IMM8 >= 32 { - _mm256_setzero_si256() - } else { - simd_shl(BitVec::to_u32x8(a), u32x8::splat(IMM8 as u32)).into() + { + static_assert_uimm_bits!(IMM8, 8); + if IMM8 >= 32 { + _mm256_setzero_si256() + } else { + transmute(simd_shl(a.as_u32x8(), u32x8::splat(IMM8 as u32))) + } } } - /// Shifts packed 64-bit integers in `a` left by `IMM8` while /// shifting in zeros, return the results; /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_slli_epi64) - pub fn _mm256_slli_epi64(a: __m256i) -> __m256i { - if IMM8 >= 64 { - _mm256_setzero_si256() - } else { - simd_shl(BitVec::to_u64x4(a), u64x4::splat(IMM8 as u64)).into() + { + static_assert_uimm_bits!(IMM8, 8); + if IMM8 >= 64 { + _mm256_setzero_si256() + } else { + transmute(simd_shl(a.as_u64x4(), u64x4::splat(IMM8 as u64))) + } } } - /// Shifts 128-bit lanes in `a` left by `imm8` bytes while shifting in zeros. 
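The guards like `if IMM8 >= 16` in the `slli` models above matter: the intrinsic defines an out-of-range shift to produce zero, whereas a plain Rust `<<` by the lane width is a shift overflow (a panic in debug builds). A one-lane sketch (`slli_epi16_lane` is illustrative, not part of this patch):

```rust
// Scalar reference for one lane of _mm256_slli_epi16: a shift by the lane
// width or more yields zero, matching the model's explicit guard.
fn slli_epi16_lane(a: u16, imm8: u32) -> u16 {
    if imm8 >= 16 { 0 } else { a << imm8 }
}

fn main() {
    assert_eq!(slli_epi16_lane(0x00FF, 4), 0x0FF0);
    assert_eq!(slli_epi16_lane(0x00FF, 16), 0); // out-of-range shift is zero
}
```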
/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_slli_si256) - pub fn _mm256_slli_si256(a: __m256i) -> __m256i { + static_assert_uimm_bits!(IMM8, 8); _mm256_bslli_epi128::(a) } /// Shifts 128-bit lanes in `a` left by `imm8` bytes while shifting in zeros. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_bslli_epi128) - pub fn _mm256_bslli_epi128(a: __m256i) -> __m256i { + static_assert_uimm_bits!(IMM8, 8); const fn mask(shift: i32, i: u32) -> u32 { let shift = shift as u32 & 0xff; if shift > 15 || i % 16 < shift { @@ -1986,508 +1358,516 @@ pub fn _mm256_bslli_epi128(a: __m256i) -> __m256i { 32 + (i - shift) } } - let a = BitVec::to_i8x32(a); - let r: i8x32 = simd_shuffle( - i8x32::from_fn(|_| 0), - a, - [ - mask(IMM8, 0) as u64, - mask(IMM8, 1) as u64, - mask(IMM8, 2) as u64, - mask(IMM8, 3) as u64, - mask(IMM8, 4) as u64, - mask(IMM8, 5) as u64, - mask(IMM8, 6) as u64, - mask(IMM8, 7) as u64, - mask(IMM8, 8) as u64, - mask(IMM8, 9) as u64, - mask(IMM8, 10) as u64, - mask(IMM8, 11) as u64, - mask(IMM8, 12) as u64, - mask(IMM8, 13) as u64, - mask(IMM8, 14) as u64, - mask(IMM8, 15) as u64, - mask(IMM8, 16) as u64, - mask(IMM8, 17) as u64, - mask(IMM8, 18) as u64, - mask(IMM8, 19) as u64, - mask(IMM8, 20) as u64, - mask(IMM8, 21) as u64, - mask(IMM8, 22) as u64, - mask(IMM8, 23) as u64, - mask(IMM8, 24) as u64, - mask(IMM8, 25) as u64, - mask(IMM8, 26) as u64, - mask(IMM8, 27) as u64, - mask(IMM8, 28) as u64, - mask(IMM8, 29) as u64, - mask(IMM8, 30) as u64, - mask(IMM8, 31) as u64, - ], - ); - r.into() + { + let a = a.as_i8x32(); + let r: i8x32 = simd_shuffle( + i8x32::ZERO(), + a, + [ + mask(IMM8, 0), + mask(IMM8, 1), + mask(IMM8, 2), + mask(IMM8, 3), + mask(IMM8, 4), + mask(IMM8, 5), + mask(IMM8, 6), + mask(IMM8, 7), + mask(IMM8, 8), + mask(IMM8, 9), + mask(IMM8, 10), + mask(IMM8, 11), + mask(IMM8, 12), + mask(IMM8, 13), + mask(IMM8, 14), + mask(IMM8, 15), + mask(IMM8, 16), + mask(IMM8, 17), + mask(IMM8, 18), + mask(IMM8, 19), + mask(IMM8, 20), + mask(IMM8, 21), + mask(IMM8, 22), + mask(IMM8, 23), + mask(IMM8, 24), + mask(IMM8, 25), + mask(IMM8, 26), + mask(IMM8, 27), + mask(IMM8, 28), + mask(IMM8, 29), + mask(IMM8, 30), + mask(IMM8, 31), + ], + ); + transmute(r) + } } - /// Shifts packed 32-bit integers in `a` left by the amount /// specified by the corresponding element in `count` while /// shifting in zeros, and returns the result. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sllv_epi32) - pub fn _mm_sllv_epi32(a: __m128i, count: __m128i) -> __m128i { - psllvd(BitVec::to_i32x4(a), BitVec::to_i32x4(count)).into() + { + transmute(psllvd(a.as_i32x4(), count.as_i32x4())) + } } - /// Shifts packed 32-bit integers in `a` left by the amount /// specified by the corresponding element in `count` while /// shifting in zeros, and returns the result. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sllv_epi32) - pub fn _mm256_sllv_epi32(a: __m256i, count: __m256i) -> __m256i { - psllvd256(BitVec::to_i32x8(a), BitVec::to_i32x8(count)).into() + { + transmute(psllvd256(a.as_i32x8(), count.as_i32x8())) + } } - /// Shifts packed 64-bit integers in `a` left by the amount /// specified by the corresponding element in `count` while /// shifting in zeros, and returns the result. 
/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sllv_epi64) - pub fn _mm_sllv_epi64(a: __m128i, count: __m128i) -> __m128i { - psllvq(BitVec::to_i64x2(a), BitVec::to_i64x2(count)).into() + { + transmute(psllvq(a.as_i64x2(), count.as_i64x2())) + } } - /// Shifts packed 64-bit integers in `a` left by the amount /// specified by the corresponding element in `count` while /// shifting in zeros, and returns the result. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sllv_epi64) - pub fn _mm256_sllv_epi64(a: __m256i, count: __m256i) -> __m256i { - psllvq256(BitVec::to_i64x4(a), BitVec::to_i64x4(count)).into() + { + transmute(psllvq256(a.as_i64x4(), count.as_i64x4())) + } } - /// Shifts packed 16-bit integers in `a` right by `count` while /// shifting in sign bits. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sra_epi16) - pub fn _mm256_sra_epi16(a: __m256i, count: __m128i) -> __m256i { - psraw(BitVec::to_i16x16(a), BitVec::to_i16x8(count)).into() + { + transmute(psraw(a.as_i16x16(), count.as_i16x8())) + } } - /// Shifts packed 32-bit integers in `a` right by `count` while /// shifting in sign bits. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sra_epi32) - pub fn _mm256_sra_epi32(a: __m256i, count: __m128i) -> __m256i { - psrad(BitVec::to_i32x8(a), BitVec::to_i32x4(count)).into() + { + transmute(psrad(a.as_i32x8(), count.as_i32x4())) + } } - /// Shifts packed 16-bit integers in `a` right by `IMM8` while /// shifting in sign bits. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srai_epi16) - pub fn _mm256_srai_epi16(a: __m256i) -> __m256i { - simd_shr(BitVec::to_i16x16(a), i16x16::splat(IMM8.min(15) as i16)).into() + static_assert_uimm_bits!(IMM8, 8); + { + transmute(simd_shr(a.as_i16x16(), i16x16::splat(IMM8.min(15) as i16))) + } } - /// Shifts packed 32-bit integers in `a` right by `IMM8` while /// shifting in sign bits. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srai_epi32) - pub fn _mm256_srai_epi32(a: __m256i) -> __m256i { - simd_shr(BitVec::to_i32x8(a), i32x8::splat(IMM8.min(31))).into() + static_assert_uimm_bits!(IMM8, 8); + { + transmute(simd_shr(a.as_i32x8(), i32x8::splat(IMM8.min(31)))) + } } - /// Shifts packed 32-bit integers in `a` right by the amount specified by the /// corresponding element in `count` while shifting in sign bits. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srav_epi32) - pub fn _mm_srav_epi32(a: __m128i, count: __m128i) -> __m128i { - psravd(BitVec::to_i32x4(a), BitVec::to_i32x4(count)).into() + { + transmute(psravd(a.as_i32x4(), count.as_i32x4())) + } } - /// Shifts packed 32-bit integers in `a` right by the amount specified by the /// corresponding element in `count` while shifting in sign bits. 
/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srav_epi32) - pub fn _mm256_srav_epi32(a: __m256i, count: __m256i) -> __m256i { - psravd256(BitVec::to_i32x8(a), BitVec::to_i32x8(count)).into() + { + transmute(psravd256(a.as_i32x8(), count.as_i32x8())) + } } - /// Shifts 128-bit lanes in `a` right by `imm8` bytes while shifting in zeros. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srli_si256) - pub fn _mm256_srli_si256(a: __m256i) -> __m256i { + static_assert_uimm_bits!(IMM8, 8); _mm256_bsrli_epi128::(a) } - /// Shifts 128-bit lanes in `a` right by `imm8` bytes while shifting in zeros. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_bsrli_epi128) - pub fn _mm256_bsrli_epi128(a: __m256i) -> __m256i { - const fn mask(shift: i32, i: u32) -> u64 { + static_assert_uimm_bits!(IMM8, 8); + const fn mask(shift: i32, i: u32) -> u32 { let shift = shift as u32 & 0xff; if shift > 15 || (15 - (i % 16)) < shift { - 0 as u64 + 0 } else { - (32 + (i + shift)) as u64 + 32 + (i + shift) } } - - let a = BitVec::to_i8x32(a); - let r: i8x32 = simd_shuffle( - i8x32::from_fn(|_| 0), - a, - [ - mask(IMM8, 0), - mask(IMM8, 1), - mask(IMM8, 2), - mask(IMM8, 3), - mask(IMM8, 4), - mask(IMM8, 5), - mask(IMM8, 6), - mask(IMM8, 7), - mask(IMM8, 8), - mask(IMM8, 9), - mask(IMM8, 10), - mask(IMM8, 11), - mask(IMM8, 12), - mask(IMM8, 13), - mask(IMM8, 14), - mask(IMM8, 15), - mask(IMM8, 16), - mask(IMM8, 17), - mask(IMM8, 18), - mask(IMM8, 19), - mask(IMM8, 20), - mask(IMM8, 21), - mask(IMM8, 22), - mask(IMM8, 23), - mask(IMM8, 24), - mask(IMM8, 25), - mask(IMM8, 26), - mask(IMM8, 27), - mask(IMM8, 28), - mask(IMM8, 29), - mask(IMM8, 30), - mask(IMM8, 31), - ], - ); - - r.into() + { + let a = a.as_i8x32(); + let r: i8x32 = simd_shuffle( + i8x32::ZERO(), + a, + [ + mask(IMM8, 0), + mask(IMM8, 1), + mask(IMM8, 2), + mask(IMM8, 3), + mask(IMM8, 4), + mask(IMM8, 5), + mask(IMM8, 6), + mask(IMM8, 7), + mask(IMM8, 8), + mask(IMM8, 9), + mask(IMM8, 10), + mask(IMM8, 11), + mask(IMM8, 12), + mask(IMM8, 13), + mask(IMM8, 14), + mask(IMM8, 15), + mask(IMM8, 16), + mask(IMM8, 17), + mask(IMM8, 18), + mask(IMM8, 19), + mask(IMM8, 20), + mask(IMM8, 21), + mask(IMM8, 22), + mask(IMM8, 23), + mask(IMM8, 24), + mask(IMM8, 25), + mask(IMM8, 26), + mask(IMM8, 27), + mask(IMM8, 28), + mask(IMM8, 29), + mask(IMM8, 30), + mask(IMM8, 31), + ], + ); + transmute(r) + } } - /// Shifts packed 16-bit integers in `a` right by `count` while shifting in /// zeros. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srl_epi16) - pub fn _mm256_srl_epi16(a: __m256i, count: __m128i) -> __m256i { - psrlw(BitVec::to_i16x16(a), BitVec::to_i16x8(count)).into() + { + transmute(psrlw(a.as_i16x16(), count.as_i16x8())) + } } - /// Shifts packed 32-bit integers in `a` right by `count` while shifting in /// zeros. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srl_epi32) - pub fn _mm256_srl_epi32(a: __m256i, count: __m128i) -> __m256i { - psrld(BitVec::to_i32x8(a), BitVec::to_i32x4(count)).into() + { + transmute(psrld(a.as_i32x8(), count.as_i32x4())) + } } - /// Shifts packed 64-bit integers in `a` right by `count` while shifting in /// zeros. 
/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srl_epi64) - pub fn _mm256_srl_epi64(a: __m256i, count: __m128i) -> __m256i { - psrlq(BitVec::to_i64x4(a), BitVec::to_i64x2(count)).into() + { + transmute(psrlq(a.as_i64x4(), count.as_i64x2())) + } } - /// Shifts packed 16-bit integers in `a` right by `IMM8` while shifting in /// zeros /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srli_epi16) - pub fn _mm256_srli_epi16(a: __m256i) -> __m256i { - if IMM8 >= 16 { - _mm256_setzero_si256() - } else { - simd_shr(BitVec::to_u16x16(a), u16x16::splat(IMM8 as u16)).into() + static_assert_uimm_bits!(IMM8, 8); + { + if IMM8 >= 16 { + _mm256_setzero_si256() + } else { + transmute(simd_shr(a.as_u16x16(), u16x16::splat(IMM8 as u16))) + } } } - /// Shifts packed 32-bit integers in `a` right by `IMM8` while shifting in /// zeros /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srli_epi32) - pub fn _mm256_srli_epi32(a: __m256i) -> __m256i { - if IMM8 >= 32 { - _mm256_setzero_si256() - } else { - simd_shr(BitVec::to_u32x8(a), u32x8::splat(IMM8 as u32)).into() + static_assert_uimm_bits!(IMM8, 8); + { + if IMM8 >= 32 { + _mm256_setzero_si256() + } else { + transmute(simd_shr(a.as_u32x8(), u32x8::splat(IMM8 as u32))) + } } } - /// Shifts packed 64-bit integers in `a` right by `IMM8` while shifting in /// zeros /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srli_epi64) - pub fn _mm256_srli_epi64(a: __m256i) -> __m256i { - if IMM8 >= 64 { - _mm256_setzero_si256() - } else { - simd_shr(BitVec::to_u64x4(a), u64x4::splat(IMM8 as u64)).into() + static_assert_uimm_bits!(IMM8, 8); + { + if IMM8 >= 64 { + _mm256_setzero_si256() + } else { + transmute(simd_shr(a.as_u64x4(), u64x4::splat(IMM8 as u64))) + } } } - /// Shifts packed 32-bit integers in `a` right by the amount specified by /// the corresponding element in `count` while shifting in zeros, /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srlv_epi32) - pub fn _mm_srlv_epi32(a: __m128i, count: __m128i) -> __m128i { - psrlvd(BitVec::to_i32x4(a), BitVec::to_i32x4(count)).into() + { + transmute(psrlvd(a.as_i32x4(), count.as_i32x4())) + } } - /// Shifts packed 32-bit integers in `a` right by the amount specified by /// the corresponding element in `count` while shifting in zeros, /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srlv_epi32) - pub fn _mm256_srlv_epi32(a: __m256i, count: __m256i) -> __m256i { - psrlvd256(BitVec::to_i32x8(a), BitVec::to_i32x8(count)).into() + { + transmute(psrlvd256(a.as_i32x8(), count.as_i32x8())) + } } - /// Shifts packed 64-bit integers in `a` right by the amount specified by /// the corresponding element in `count` while shifting in zeros, /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srlv_epi64) - pub fn _mm_srlv_epi64(a: __m128i, count: __m128i) -> __m128i { - psrlvq(BitVec::to_i64x2(a), BitVec::to_i64x2(count)).into() + { + transmute(psrlvq(a.as_i64x2(), count.as_i64x2())) + } } - /// Shifts packed 64-bit integers in `a` right by the amount specified by /// the corresponding element in `count` while shifting in zeros, /// /// [Intel's 
documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srlv_epi64) - pub fn _mm256_srlv_epi64(a: __m256i, count: __m256i) -> __m256i { - psrlvq256(BitVec::to_i64x4(a), BitVec::to_i64x4(count)).into() + { + transmute(psrlvq256(a.as_i64x4(), count.as_i64x4())) + } } - /// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a` /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_epi16) - pub fn _mm256_sub_epi16(a: __m256i, b: __m256i) -> __m256i { - simd_sub(BitVec::to_i16x16(a), BitVec::to_i16x16(b)).into() + { + transmute(simd_sub(a.as_i16x16(), b.as_i16x16())) + } } - /// Subtract packed 32-bit integers in `b` from packed 32-bit integers in `a` /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_epi32) - pub fn _mm256_sub_epi32(a: __m256i, b: __m256i) -> __m256i { - simd_sub(BitVec::to_i32x8(a), BitVec::to_i32x8(b)).into() + { + transmute(simd_sub(a.as_i32x8(), b.as_i32x8())) + } } - /// Subtract packed 64-bit integers in `b` from packed 64-bit integers in `a` /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_epi64) - pub fn _mm256_sub_epi64(a: __m256i, b: __m256i) -> __m256i { - simd_sub(BitVec::to_i64x4(a), BitVec::to_i64x4(b)).into() + { + transmute(simd_sub(a.as_i64x4(), b.as_i64x4())) + } } - /// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a` /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_epi8) - pub fn _mm256_sub_epi8(a: __m256i, b: __m256i) -> __m256i { - simd_sub(BitVec::to_i8x32(a), BitVec::to_i8x32(b)).into() + { + transmute(simd_sub(a.as_i8x32(), b.as_i8x32())) + } } - /// Subtract packed 16-bit integers in `b` from packed 16-bit integers in /// `a` using saturation. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_subs_epi16) - pub fn _mm256_subs_epi16(a: __m256i, b: __m256i) -> __m256i { - simd_saturating_sub(BitVec::to_i16x16(a), BitVec::to_i16x16(b)).into() + { + transmute(simd_saturating_sub(a.as_i16x16(), b.as_i16x16())) + } } - /// Subtract packed 8-bit integers in `b` from packed 8-bit integers in /// `a` using saturation. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_subs_epi8) - pub fn _mm256_subs_epi8(a: __m256i, b: __m256i) -> __m256i { - simd_saturating_sub(BitVec::to_i8x32(a), BitVec::to_i8x32(b)).into() + { + transmute(simd_saturating_sub(a.as_i8x32(), b.as_i8x32())) + } } - /// Subtract packed unsigned 16-bit integers in `b` from packed 16-bit /// integers in `a` using saturation. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_subs_epu16) - pub fn _mm256_subs_epu16(a: __m256i, b: __m256i) -> __m256i { - simd_saturating_sub(BitVec::to_u16x16(a), BitVec::to_u16x16(b)).into() + { + transmute(simd_saturating_sub(a.as_u16x16(), b.as_u16x16())) + } } - /// Subtract packed unsigned 8-bit integers in `b` from packed 8-bit /// integers in `a` using saturation. 
/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_subs_epu8) - pub fn _mm256_subs_epu8(a: __m256i, b: __m256i) -> __m256i { - simd_saturating_sub(BitVec::to_u8x32(a), BitVec::to_u8x32(b)).into() + { + transmute(simd_saturating_sub(a.as_u8x32(), b.as_u8x32())) + } } - /// Unpacks and interleave 8-bit integers from the high half of each /// 128-bit lane in `a` and `b`. +/// +/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_epi8) - pub fn _mm256_unpackhi_epi8(a: __m256i, b: __m256i) -> __m256i { - #[rustfmt::skip] - let r: i8x32 = simd_shuffle(BitVec::to_i8x32(a), BitVec::to_i8x32(b), [ - 8, 40, 9, 41, 10, 42, 11, 43, - 12, 44, 13, 45, 14, 46, 15, 47, - 24, 56, 25, 57, 26, 58, 27, 59, - 28, 60, 29, 61, 30, 62, 31, 63, - ]); - r.into() + { + #[rustfmt::skip] + let r: i8x32 = simd_shuffle( + a.as_i8x32(), b.as_i8x32(), [8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, + 14, 46, 15, 47, 24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, + 63,] + ); + transmute(r) + } } - /// Unpacks and interleave 8-bit integers from the low half of each /// 128-bit lane of `a` and `b`. +/// +/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_epi8) - pub fn _mm256_unpacklo_epi8(a: __m256i, b: __m256i) -> __m256i { - #[rustfmt::skip] - let r: i8x32 = simd_shuffle(BitVec::to_i8x32(a), BitVec::to_i8x32(b), [ - 0, 32, 1, 33, 2, 34, 3, 35, - 4, 36, 5, 37, 6, 38, 7, 39, - 16, 48, 17, 49, 18, 50, 19, 51, - 20, 52, 21, 53, 22, 54, 23, 55, - ]); - r.into() + { + #[rustfmt::skip] + let r: i8x32 = simd_shuffle( + a.as_i8x32(), b.as_i8x32(), [0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, + 7, 39, 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55,] + ); + transmute(r) + } } - /// Unpacks and interleave 16-bit integers from the high half of each /// 128-bit lane of `a` and `b`. +/// +/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_epi16) - pub fn _mm256_unpackhi_epi16(a: __m256i, b: __m256i) -> __m256i { - let r: i16x16 = simd_shuffle( - BitVec::to_i16x16(a), - BitVec::to_i16x16(b), - [4, 20, 5, 21, 6, 22, 7, 23, 12, 28, 13, 29, 14, 30, 15, 31], - ); - r.into() + { + let r: i16x16 = simd_shuffle( + a.as_i16x16(), + b.as_i16x16(), + [4, 20, 5, 21, 6, 22, 7, 23, 12, 28, 13, 29, 14, 30, 15, 31], + ); + transmute(r) + } } - /// Unpacks and interleave 16-bit integers from the low half of each /// 128-bit lane of `a` and `b`. +/// +/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_epi16) - pub fn _mm256_unpacklo_epi16(a: __m256i, b: __m256i) -> __m256i { - let r: i16x16 = simd_shuffle( - BitVec::to_i16x16(a), - BitVec::to_i16x16(b), - [0, 16, 1, 17, 2, 18, 3, 19, 8, 24, 9, 25, 10, 26, 11, 27], - ); - r.into() + { + let r: i16x16 = simd_shuffle( + a.as_i16x16(), + b.as_i16x16(), + [0, 16, 1, 17, 2, 18, 3, 19, 8, 24, 9, 25, 10, 26, 11, 27], + ); + transmute(r) + } } - /// Unpacks and interleave 32-bit integers from the high half of each /// 128-bit lane of `a` and `b`. 
+/// +/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_epi32) - pub fn _mm256_unpackhi_epi32(a: __m256i, b: __m256i) -> __m256i { - let r: i32x8 = simd_shuffle( - BitVec::to_i32x8(a), - BitVec::to_i32x8(b), - [2, 10, 3, 11, 6, 14, 7, 15], - ); - r.into() + { + let r: i32x8 = simd_shuffle(a.as_i32x8(), b.as_i32x8(), [2, 10, 3, 11, 6, 14, 7, 15]); + transmute(r) + } } - /// Unpacks and interleave 32-bit integers from the low half of each /// 128-bit lane of `a` and `b`. +/// +/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_epi32) - pub fn _mm256_unpacklo_epi32(a: __m256i, b: __m256i) -> __m256i { - let r: i32x8 = simd_shuffle( - BitVec::to_i32x8(a), - BitVec::to_i32x8(b), - [0, 8, 1, 9, 4, 12, 5, 13], - ); - r.into() + { + let r: i32x8 = simd_shuffle(a.as_i32x8(), b.as_i32x8(), [0, 8, 1, 9, 4, 12, 5, 13]); + transmute(r) + } } - /// Unpacks and interleave 64-bit integers from the high half of each /// 128-bit lane of `a` and `b`. +/// +/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_epi64) - pub fn _mm256_unpackhi_epi64(a: __m256i, b: __m256i) -> __m256i { - let r: i64x4 = simd_shuffle(BitVec::to_i64x4(a), BitVec::to_i64x4(b), [1, 5, 3, 7]); - r.into() + { + let r: i64x4 = simd_shuffle(a.as_i64x4(), b.as_i64x4(), [1, 5, 3, 7]); + transmute(r) + } } - /// Unpacks and interleave 64-bit integers from the low half of each /// 128-bit lane of `a` and `b`. +/// +/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_epi64) - pub fn _mm256_unpacklo_epi64(a: __m256i, b: __m256i) -> __m256i { - let r: i64x4 = simd_shuffle(BitVec::to_i64x4(a), BitVec::to_i64x4(b), [0, 4, 2, 6]); - r.into() + { + let r: i64x4 = simd_shuffle(a.as_i64x4(), b.as_i64x4(), [0, 4, 2, 6]); + transmute(r) + } } - /// Computes the bitwise XOR of 256 bits (representing integer data) /// in `a` and `b` /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_xor_si256) - pub fn _mm256_xor_si256(a: __m256i, b: __m256i) -> __m256i { - simd_xor(BitVec::to_i64x4(a), BitVec::to_i64x4(b)).into() + { + transmute(simd_xor(a.as_i64x4(), b.as_i64x4())) + } } - /// Extracts an 8-bit integer from `a`, selected with `INDEX`. Returns a 32-bit /// integer containing the zero-extended integer data. /// /// See [LLVM commit D20468](https://reviews.llvm.org/D20468). /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extract_epi8) - -// This intrinsic has no corresponding instruction. - pub fn _mm256_extract_epi8(a: __m256i) -> i32 { - simd_extract(BitVec::to_u8x32(a), INDEX as u64) as u32 as i32 + static_assert_uimm_bits!(INDEX, 5); + { + simd_extract(a.as_u8x32(), INDEX as u32) as i32 + } } - /// Extracts a 16-bit integer from `a`, selected with `INDEX`. Returns a 32-bit /// integer containing the zero-extended integer data. /// /// See [LLVM commit D20468](https://reviews.llvm.org/D20468). /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extract_epi16) - -// This intrinsic has no corresponding instruction. 
- pub fn _mm256_extract_epi16(a: __m256i) -> i32 { - simd_extract(BitVec::to_u16x16(a), INDEX as u64) as u32 as i32 + static_assert_uimm_bits!(INDEX, 4); + { + simd_extract(a.as_u16x16(), INDEX as u32) as i32 + } } diff --git a/testable-simd-models/src/core_arch/x86/models/avx2_handwritten.rs b/testable-simd-models/src/core_arch/x86/models/avx2_handwritten.rs new file mode 100644 index 0000000000000..43f0a840b54bd --- /dev/null +++ b/testable-simd-models/src/core_arch/x86/models/avx2_handwritten.rs @@ -0,0 +1,620 @@ +use crate::abstractions::{bit::MachineInteger, simd::*}; +pub fn phaddw(a: i16x16, b: i16x16) -> i16x16 { + i16x16::from_fn(|i| { + if i < 4 { + a[2 * i].wrapping_add(a[2 * i + 1]) + } else if i < 8 { + b[2 * (i - 4)].wrapping_add(b[2 * (i - 4) + 1]) + } else if i < 12 { + a[2 * (i - 4)].wrapping_add(a[2 * (i - 4) + 1]) + } else { + b[2 * (i - 8)].wrapping_add(b[2 * (i - 8) + 1]) + } + }) +} + +pub fn phaddd(a: i32x8, b: i32x8) -> i32x8 { + i32x8::from_fn(|i| { + if i < 2 { + a[2 * i].wrapping_add(a[2 * i + 1]) + } else if i < 4 { + b[2 * (i - 2)].wrapping_add(b[2 * (i - 2) + 1]) + } else if i < 6 { + a[2 * (i - 2)].wrapping_add(a[2 * (i - 2) + 1]) + } else { + b[2 * (i - 4)].wrapping_add(b[2 * (i - 4) + 1]) + } + }) +} + +pub fn phaddsw(a: i16x16, b: i16x16) -> i16x16 { + i16x16::from_fn(|i| { + if i < 4 { + a[2 * i].saturating_add(a[2 * i + 1]) + } else if i < 8 { + b[2 * (i - 4)].saturating_add(b[2 * (i - 4) + 1]) + } else if i < 12 { + a[2 * (i - 4)].saturating_add(a[2 * (i - 4) + 1]) + } else { + b[2 * (i - 8)].saturating_add(b[2 * (i - 8) + 1]) + } + }) +} + +pub fn phsubw(a: i16x16, b: i16x16) -> i16x16 { + i16x16::from_fn(|i| { + if i < 4 { + a[2 * i].wrapping_sub(a[2 * i + 1]) + } else if i < 8 { + b[2 * (i - 4)].wrapping_sub(b[2 * (i - 4) + 1]) + } else if i < 12 { + a[2 * (i - 4)].wrapping_sub(a[2 * (i - 4) + 1]) + } else { + b[2 * (i - 8)].wrapping_sub(b[2 * (i - 8) + 1]) + } + }) +} + +pub fn phsubd(a: i32x8, b: i32x8) -> i32x8 { + i32x8::from_fn(|i| { + if i < 2 { + a[2 * i].wrapping_sub(a[2 * i + 1]) + } else if i < 4 { + b[2 * (i - 2)].wrapping_sub(b[2 * (i - 2) + 1]) + } else if i < 6 { + a[2 * (i - 2)].wrapping_sub(a[2 * (i - 2) + 1]) + } else { + b[2 * (i - 4)].wrapping_sub(b[2 * (i - 4) + 1]) + } + }) +} + +pub fn phsubsw(a: i16x16, b: i16x16) -> i16x16 { + i16x16::from_fn(|i| { + if i < 4 { + a[2 * i].saturating_sub(a[2 * i + 1]) + } else if i < 8 { + b[2 * (i - 4)].saturating_sub(b[2 * (i - 4) + 1]) + } else if i < 12 { + a[2 * (i - 4)].saturating_sub(a[2 * (i - 4) + 1]) + } else { + b[2 * (i - 8)].saturating_sub(b[2 * (i - 8) + 1]) + } + }) +} +pub fn pmaddwd(a: i16x16, b: i16x16) -> i32x8 { + i32x8::from_fn(|i| { + (a[2 * i] as i32) * (b[2 * i] as i32) + (a[2 * i + 1] as i32) * (b[2 * i + 1] as i32) + }) +} + +pub fn pmaddubsw(a: u8x32, b: u8x32) -> i16x16 { + i16x16::from_fn(|i| { + ((a[2 * i] as u8 as u16 as i16) * (b[2 * i] as i8 as i16)) + .saturating_add((a[2 * i + 1] as u8 as u16 as i16) * (b[2 * i + 1] as i8 as i16)) + }) +} +pub fn packsswb(a: i16x16, b: i16x16) -> i8x32 { + i8x32::from_fn(|i| { + if i < 8 { + if a[i] > (i8::MAX as i16) { + i8::MAX + } else if a[i] < (i8::MIN as i16) { + i8::MIN + } else { + a[i] as i8 + } + } else if i < 16 { + if b[i - 8] > (i8::MAX as i16) { + i8::MAX + } else if b[i - 8] < (i8::MIN as i16) { + i8::MIN + } else { + b[i - 8] as i8 + } + } else if i < 24 { + if a[i - 8] > (i8::MAX as i16) { + i8::MAX + } else if a[i - 8] < (i8::MIN as i16) { + i8::MIN + } else { + a[i - 8] as i8 + } + } else { + if b[i - 16] > 
(i8::MAX as i16) {
+                i8::MAX
+            } else if b[i - 16] < (i8::MIN as i16) {
+                i8::MIN
+            } else {
+                b[i - 16] as i8
+            }
+        }
+    })
+}
+
+pub fn packssdw(a: i32x8, b: i32x8) -> i16x16 {
+    i16x16::from_fn(|i| {
+        if i < 4 {
+            if a[i] > (i16::MAX as i32) {
+                i16::MAX
+            } else if a[i] < (i16::MIN as i32) {
+                i16::MIN
+            } else {
+                a[i] as i16
+            }
+        } else if i < 8 {
+            if b[i - 4] > (i16::MAX as i32) {
+                i16::MAX
+            } else if b[i - 4] < (i16::MIN as i32) {
+                i16::MIN
+            } else {
+                b[i - 4] as i16
+            }
+        } else if i < 12 {
+            if a[i - 4] > (i16::MAX as i32) {
+                i16::MAX
+            } else if a[i - 4] < (i16::MIN as i32) {
+                i16::MIN
+            } else {
+                a[i - 4] as i16
+            }
+        } else {
+            if b[i - 8] > (i16::MAX as i32) {
+                i16::MAX
+            } else if b[i - 8] < (i16::MIN as i32) {
+                i16::MIN
+            } else {
+                b[i - 8] as i16
+            }
+        }
+    })
+}
+
+pub fn packuswb(a: i16x16, b: i16x16) -> u8x32 {
+    u8x32::from_fn(|i| {
+        if i < 8 {
+            if a[i] > (u8::MAX as i16) {
+                u8::MAX
+            } else if a[i] < (u8::MIN as i16) {
+                u8::MIN
+            } else {
+                a[i] as u8
+            }
+        } else if i < 16 {
+            if b[i - 8] > (u8::MAX as i16) {
+                u8::MAX
+            } else if b[i - 8] < (u8::MIN as i16) {
+                u8::MIN
+            } else {
+                b[i - 8] as u8
+            }
+        } else if i < 24 {
+            if a[i - 8] > (u8::MAX as i16) {
+                u8::MAX
+            } else if a[i - 8] < (u8::MIN as i16) {
+                u8::MIN
+            } else {
+                a[i - 8] as u8
+            }
+        } else {
+            if b[i - 16] > (u8::MAX as i16) {
+                u8::MAX
+            } else if b[i - 16] < (u8::MIN as i16) {
+                u8::MIN
+            } else {
+                b[i - 16] as u8
+            }
+        }
+    })
+}
+
+pub fn packusdw(a: i32x8, b: i32x8) -> u16x16 {
+    u16x16::from_fn(|i| {
+        if i < 4 {
+            if a[i] > (u16::MAX as i32) {
+                u16::MAX
+            } else if a[i] < (u16::MIN as i32) {
+                u16::MIN
+            } else {
+                a[i] as u16
+            }
+        } else if i < 8 {
+            if b[i - 4] > (u16::MAX as i32) {
+                u16::MAX
+            } else if b[i - 4] < (u16::MIN as i32) {
+                u16::MIN
+            } else {
+                b[i - 4] as u16
+            }
+        } else if i < 12 {
+            if a[i - 4] > (u16::MAX as i32) {
+                u16::MAX
+            } else if a[i - 4] < (u16::MIN as i32) {
+                u16::MIN
+            } else {
+                a[i - 4] as u16
+            }
+        } else {
+            if b[i - 8] > (u16::MAX as i32) {
+                u16::MAX
+            } else if b[i - 8] < (u16::MIN as i32) {
+                u16::MIN
+            } else {
+                b[i - 8] as u16
+            }
+        }
+    })
+}
+
+pub fn psignb(a: i8x32, b: i8x32) -> i8x32 {
+    i8x32::from_fn(|i| {
+        if b[i] < 0 {
+            if a[i] == i8::MIN {
+                a[i]
+            } else {
+                -a[i]
+            }
+        } else if b[i] > 0 {
+            a[i]
+        } else {
+            0
+        }
+    })
+}
+pub fn psignw(a: i16x16, b: i16x16) -> i16x16 {
+    i16x16::from_fn(|i| {
+        if b[i] < 0 {
+            if a[i] == i16::MIN {
+                a[i]
+            } else {
+                -a[i]
+            }
+        } else if b[i] > 0 {
+            a[i]
+        } else {
+            0
+        }
+    })
+}
+
+pub fn psignd(a: i32x8, b: i32x8) -> i32x8 {
+    i32x8::from_fn(|i| {
+        if b[i] < 0 {
+            if a[i] == i32::MIN {
+                a[i]
+            } else {
+                -a[i]
+            }
+        } else if b[i] > 0 {
+            a[i]
+        } else {
+            0
+        }
+    })
+}
+
+pub fn psllw(a: i16x16, count: i16x8) -> i16x16 {
+    let count4 = (count[0] as u16) as u64;
+    let count3 = ((count[1] as u16) as u64) * 65536;
+    let count2 = ((count[2] as u16) as u64) * 4294967296;
+    let count1 = ((count[3] as u16) as u64) * 281474976710656;
+    let count = count1 + count2 + count3 + count4;
+    i16x16::from_fn(|i| {
+        if count > 15 {
+            0
+        } else {
+            ((a[i] as u16) << count) as i16
+        }
+    })
+}
+
+pub fn pslld(a: i32x8, count: i32x4) -> i32x8 {
+    let count = ((count[1] as u32) as u64) * 4294967296 + ((count[0] as u32) as u64);
+
+    i32x8::from_fn(|i| {
+        if count > 31 {
+            0
+        } else {
+            ((a[i] as u32) << count) as i32
+        }
+    })
+}
+pub fn psllq(a: i64x4, count: i64x2) -> i64x4 {
+    let count = count[0] as u64;
+
+    i64x4::from_fn(|i| {
+        if count > 63 {
+            0
+        } else {
+            ((a[i] as u64) << count) as i64
+        }
+    })
+}
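+
+// The psllv*/psrlv* helpers below model the AVX2 per-lane variable shifts.
+// Unlike psllw/pslld/psllq above, which take a single shift amount from the
+// low 64 bits of `count`, these shift each lane by its own count and produce
+// 0 whenever that count is out of range or negative (the arithmetic psrav*
+// variants fill with the sign bit instead).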
+pub fn psllvd(a: i32x4, count: i32x4) -> i32x4 {
+    i32x4::from_fn(|i| {
+        if count[i] > 31 || count[i] < 0 {
+            0
+        } else {
+            ((a[i] as u32) << count[i]) as i32
+        }
+    })
+}
+pub fn psllvd256(a: i32x8, count: i32x8) -> i32x8 {
+    i32x8::from_fn(|i| {
+        if count[i] > 31 || count[i] < 0 {
+            0
+        } else {
+            ((a[i] as u32) << count[i]) as i32
+        }
+    })
+}
+
+pub fn psllvq(a: i64x2, count: i64x2) -> i64x2 {
+    i64x2::from_fn(|i| {
+        if count[i] > 63 || count[i] < 0 {
+            0
+        } else {
+            ((a[i] as u64) << count[i]) as i64
+        }
+    })
+}
+pub fn psllvq256(a: i64x4, count: i64x4) -> i64x4 {
+    i64x4::from_fn(|i| {
+        if count[i] > 63 || count[i] < 0 {
+            0
+        } else {
+            ((a[i] as u64) << count[i]) as i64
+        }
+    })
+}
+
+pub fn psraw(a: i16x16, count: i16x8) -> i16x16 {
+    let count = ((count[3] as u16) as u64) * 281474976710656
+        + ((count[2] as u16) as u64) * 4294967296
+        + ((count[1] as u16) as u64) * 65536
+        + ((count[0] as u16) as u64);
+
+    i16x16::from_fn(|i| {
+        if count > 15 {
+            if a[i] < 0 {
+                -1
+            } else {
+                0
+            }
+        } else {
+            a[i] >> count
+        }
+    })
+}
+
+pub fn psrad(a: i32x8, count: i32x4) -> i32x8 {
+    let count = ((count[1] as u32) as u64) * 4294967296 + ((count[0] as u32) as u64);
+
+    i32x8::from_fn(|i| {
+        if count > 31 {
+            if a[i] < 0 {
+                -1
+            } else {
+                0
+            }
+        } else {
+            a[i] >> count
+        }
+    })
+}
+
+pub fn psravd(a: i32x4, count: i32x4) -> i32x4 {
+    i32x4::from_fn(|i| {
+        if count[i] > 31 || count[i] < 0 {
+            if a[i] < 0 {
+                -1
+            } else {
+                0
+            }
+        } else {
+            a[i] >> count[i]
+        }
+    })
+}
+
+pub fn psravd256(a: i32x8, count: i32x8) -> i32x8 {
+    i32x8::from_fn(|i| {
+        if count[i] > 31 || count[i] < 0 {
+            if a[i] < 0 {
+                -1
+            } else {
+                0
+            }
+        } else {
+            a[i] >> count[i]
+        }
+    })
+}
+
+pub fn psrlw(a: i16x16, count: i16x8) -> i16x16 {
+    let count = (count[3] as u16 as u64) * 281474976710656
+        + (count[2] as u16 as u64) * 4294967296
+        + (count[1] as u16 as u64) * 65536
+        + (count[0] as u16 as u64);
+
+    i16x16::from_fn(|i| {
+        if count > 15 {
+            0
+        } else {
+            ((a[i] as u16) >> count) as i16
+        }
+    })
+}
+
+pub fn psrld(a: i32x8, count: i32x4) -> i32x8 {
+    let count = ((count[1] as u32) as u64) * 4294967296 + ((count[0] as u32) as u64);
+
+    i32x8::from_fn(|i| {
+        if count > 31 {
+            0
+        } else {
+            ((a[i] as u32) >> count) as i32
+        }
+    })
+}
+
+pub fn psrlq(a: i64x4, count: i64x2) -> i64x4 {
+    let count: u64 = count[0] as u64;
+
+    i64x4::from_fn(|i| {
+        if count > 63 {
+            0
+        } else {
+            ((a[i] as u64) >> count) as i64
+        }
+    })
+}
+
+pub fn psrlvd(a: i32x4, count: i32x4) -> i32x4 {
+    i32x4::from_fn(|i| {
+        if count[i] > 31 || count[i] < 0 {
+            0
+        } else {
+            ((a[i] as u32) >> count[i]) as i32
+        }
+    })
+}
+
+pub fn psrlvd256(a: i32x8, count: i32x8) -> i32x8 {
+    i32x8::from_fn(|i| {
+        if count[i] > 31 || count[i] < 0 {
+            0
+        } else {
+            ((a[i] as u32) >> count[i]) as i32
+        }
+    })
+}
+
+pub fn psrlvq(a: i64x2, count: i64x2) -> i64x2 {
+    i64x2::from_fn(|i| {
+        if count[i] > 63 || count[i] < 0 {
+            0
+        } else {
+            ((a[i] as u64) >> count[i]) as i64
+        }
+    })
+}
+pub fn psrlvq256(a: i64x4, count: i64x4) -> i64x4 {
+    i64x4::from_fn(|i| {
+        if count[i] > 63 || count[i] < 0 {
+            0
+        } else {
+            ((a[i] as u64) >> count[i]) as i64
+        }
+    })
+}
+
+pub fn pshufb(a: u8x32, b: u8x32) -> u8x32 {
+    u8x32::from_fn(|i| {
+        if i < 16 {
+            if b[i] > 127 {
+                0
+            } else {
+                let index = (b[i] % 16) as u32;
+                a[index]
+            }
+        } else {
+            if b[i] > 127 {
+                0
+            } else {
+                let index = (b[i] % 16) as u32;
+                a[index + 16]
+            }
+        }
+    })
+}
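+
+// permd models VPERMD: each destination lane copies the `a` element selected
+// by the low three bits of the corresponding lane of `b` (hence the `% 8`).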
+pub fn permd(a: u32x8, b: u32x8) -> u32x8 {
+    u32x8::from_fn(|i| {
+        let id = b[i] % 8;
+        a[id]
+    })
+}
+
+pub fn mpsadbw(a: u8x32, b: u8x32, imm8: i8) -> u16x16 {
+    u16x16::from_fn(|i| {
+        if i < 8 {
+            let a_offset = (((imm8 & 4) >> 2) * 4) as u32;
+            let b_offset = ((imm8 & 3) * 4) as u32;
+            let k = a_offset + i;
+            let l = b_offset;
+            ((a[k].wrapping_abs_diff(b[l]) as i8) as u8 as u16)
+                + ((a[k + 1].wrapping_abs_diff(b[l + 1]) as i8) as u8 as u16)
+                + ((a[k + 2].wrapping_abs_diff(b[l + 2]) as i8) as u8 as u16)
+                + ((a[k + 3].wrapping_abs_diff(b[l + 3]) as i8) as u8 as u16)
+        } else {
+            let i = i - 8;
+            let imm8 = imm8 >> 3;
+            let a_offset = (((imm8 & 4) >> 2) * 4) as u32;
+            let b_offset = ((imm8 & 3) * 4) as u32;
+            let k = a_offset + i;
+            let l = b_offset;
+            ((a[16 + k].wrapping_abs_diff(b[16 + l]) as i8) as u8 as u16)
+                + ((a[16 + k + 1].wrapping_abs_diff(b[16 + l + 1]) as i8) as u8 as u16)
+                + ((a[16 + k + 2].wrapping_abs_diff(b[16 + l + 2]) as i8) as u8 as u16)
+                + ((a[16 + k + 3].wrapping_abs_diff(b[16 + l + 3]) as i8) as u8 as u16)
+        }
+    })
+}
+
+pub fn vperm2i128(a: i64x4, b: i64x4, imm8: i8) -> i64x4 {
+    let a = i128x2::from_fn(|i| {
+        ((a[2 * i] as u64 as u128) + ((a[2 * i + 1] as u64 as u128) << 64)) as i128
+    });
+    let b = i128x2::from_fn(|i| {
+        ((b[2 * i] as u64 as u128) + ((b[2 * i + 1] as u64 as u128) << 64)) as i128
+    });
+    let imm8 = imm8 as u8 as u32 as i32;
+    let r = i128x2::from_fn(|i| {
+        let control = imm8 >> (i * 4);
+        if (control >> 3) % 2 == 1 {
+            0
+        } else {
+            match control % 4 {
+                0 => a[0],
+                1 => a[1],
+                2 => b[0],
+                3 => b[1],
+                _ => unreachable!(),
+            }
+        }
+    });
+    i64x4::from_fn(|i| {
+        let index = i >> 1;
+        let hilo = i.rem_euclid(2);
+        let val = r[index];
+        if hilo == 0 {
+            i64::cast(val)
+        } else {
+            i64::cast(val >> 64)
+        }
+    })
+}
+pub fn pmulhrsw(a: i16x16, b: i16x16) -> i16x16 {
+    i16x16::from_fn(|i| {
+        let temp = (a[i] as i32) * (b[i] as i32);
+        let temp = (temp >> 14).wrapping_add(1) >> 1;
+        temp as i16
+    })
+}
+
+pub fn psadbw(a: u8x32, b: u8x32) -> u64x4 {
+    let tmp = u8x32::from_fn(|i| a[i].wrapping_abs_diff(b[i]));
+    u64x4::from_fn(|i| {
+        (tmp[i * 8] as u16)
+            .wrapping_add(tmp[i * 8 + 1] as u16)
+            .wrapping_add(tmp[i * 8 + 2] as u16)
+            .wrapping_add(tmp[i * 8 + 3] as u16)
+            .wrapping_add(tmp[i * 8 + 4] as u16)
+            .wrapping_add(tmp[i * 8 + 5] as u16)
+            .wrapping_add(tmp[i * 8 + 6] as u16)
+            .wrapping_add(tmp[i * 8 + 7] as u16) as u64
+    })
+}
diff --git a/testable-simd-models/src/core_arch/x86/models/avx_handwritten.rs b/testable-simd-models/src/core_arch/x86/models/avx_handwritten.rs
new file mode 100644
index 0000000000000..ba61996851392
--- /dev/null
+++ b/testable-simd-models/src/core_arch/x86/models/avx_handwritten.rs
@@ -0,0 +1,43 @@
+use crate::abstractions::simd::*;
+
+pub fn vperm2f128si256(a: i32x8, b: i32x8, imm8: i8) -> i32x8 {
+    // Pack 128-bit lane `l` of a vector into an i128, elements in little-endian order.
+    let lane = |v: i32x8, l: u32| -> i128 {
+        (0u32..4).fold(0i128, |acc, j| acc | ((v[4 * l + j] as u32 as i128) << (32 * j)))
+    };
+    let temp = i128x2::from_fn(|i| {
+        // Each result lane is driven by a 4-bit field of imm8: bits [1:0]
+        // select the source lane, and bit 3 zeroes the lane outright.
+        let control = (imm8 as u8) >> (i * 4);
+        if control & 8 != 0 {
+            0
+        } else {
+            match control & 3 {
+                0 => lane(a, 0),
+                1 => lane(a, 1),
+                2 => lane(b, 0),
+                3 => lane(b, 1),
+                _ => unreachable!(),
+            }
+        }
+    });
+    i32x8::from_fn(|i| (temp[if i < 4 { 0 } else { 1 }] >> (32 * (i % 4))) as i32)
+}
+
+pub fn ptestz256(a: i64x4, b: i64x4) -> i32 {
+    let c = i64x4::from_fn(|i| a[i] & b[i]);
+    if c == i64x4::ZERO() {
+        1
+    } else {
+        0
+    }
+}
+
+pub fn ptestc256(a: i64x4, b: i64x4) -> i32 {
+    let c = i64x4::from_fn(|i| !a[i] & b[i]);
+    if c == i64x4::ZERO() {
+        1
+    } else {
+        0
+    }
+}
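The `ptest` models above reduce VPTEST's flag outputs to an `i32`. As a quick illustration (hypothetical values, using only the `from_fn` constructor and lane indexing already used in these files), `ptestz256` returns 1 exactly when `a & b` is zero in every lane:

let a = i64x4::from_fn(|i| if i == 0 { 0b1010 } else { 0 });
let b = i64x4::from_fn(|_| 0b0101);
assert_eq!(ptestz256(a, b), 1); // no lane has overlapping set bits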
diff --git a/testable-simd-models/src/core_arch/x86/models/mod.rs b/testable-simd-models/src/core_arch/x86/models/mod.rs
index 95c9eb4061b6a..79b660019c07c 100644
--- a/testable-simd-models/src/core_arch/x86/models/mod.rs
+++ b/testable-simd-models/src/core_arch/x86/models/mod.rs
@@ -22,8 +22,13 @@ pub mod avx;
 pub mod avx2;
+pub mod avx2_handwritten;
+pub mod avx_handwritten;
+pub mod sse;
 pub mod sse2;
+pub mod sse2_handwritten;
 pub mod ssse3;
+pub mod ssse3_handwritten;
 
 pub(crate) mod types {
     use crate::abstractions::bitvec::*;
@@ -33,5 +38,11 @@ pub(crate) mod types {
     #[allow(non_camel_case_types)]
     pub type __m256 = BitVec<256>;
     #[allow(non_camel_case_types)]
+    pub type __m256d = BitVec<256>;
+    #[allow(non_camel_case_types)]
+    pub type __m128 = BitVec<128>;
+    #[allow(non_camel_case_types)]
     pub type __m128i = BitVec<128>;
+    #[allow(non_camel_case_types)]
+    pub type __m128d = BitVec<128>;
 }
diff --git a/testable-simd-models/src/core_arch/x86/models/sse.rs b/testable-simd-models/src/core_arch/x86/models/sse.rs
new file mode 100644
index 0000000000000..f975c2814438a
--- /dev/null
+++ b/testable-simd-models/src/core_arch/x86/models/sse.rs
@@ -0,0 +1,21 @@
+//! Streaming SIMD Extensions (SSE)
+use super::types::*;
+use crate::abstractions::simd::*;
+use crate::abstractions::utilities::*;
+
+/// Returns a vector of type `__m128` with indeterminate elements.
+/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically
+/// picks some valid value and is not equivalent to [`mem::MaybeUninit`].
+/// In practice, this is typically equivalent to [`mem::zeroed`].
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_ps)
+pub fn _mm_undefined_ps() -> __m128 {
+    transmute(f32x4::ZERO())
+}
+
+/// Construct a `__m128` with all elements initialized to zero.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_ps)
+pub fn _mm_setzero_ps() -> __m128 {
+    transmute(f32x4::ZERO())
+}
diff --git a/testable-simd-models/src/core_arch/x86/models/sse2.rs b/testable-simd-models/src/core_arch/x86/models/sse2.rs
index ed57f03cfd5d8..c9c90e3e9e267 100644
--- a/testable-simd-models/src/core_arch/x86/models/sse2.rs
+++ b/testable-simd-models/src/core_arch/x86/models/sse2.rs
@@ -1,293 +1,79 @@ //!
Streaming SIMD Extensions 2 (SSE2) +use super::sse2_handwritten::*; use super::types::*; -use crate::abstractions::{bit::Bit, bitvec::BitVec, simd::*}; -mod c_extern { - use crate::abstractions::{bit::MachineInteger, simd::*}; - pub fn packsswb(a: i16x8, b: i16x8) -> i8x16 { - i8x16::from_fn(|i| { - if i < 8 { - if a[i] > (i8::MAX as i16) { - i8::MAX - } else if a[i] < (i8::MIN as i16) { - i8::MIN - } else { - a[i] as i8 - } - } else { - if b[i - 8] > (i8::MAX as i16) { - i8::MAX - } else if b[i - 8] < (i8::MIN as i16) { - i8::MIN - } else { - b[i - 8] as i8 - } - } - }) - } - pub fn pmaddwd(a: i16x8, b: i16x8) -> i32x4 { - i32x4::from_fn(|i| { - (a[2 * i] as i32) * (b[2 * i] as i32) + (a[2 * i + 1] as i32) * (b[2 * i + 1] as i32) - }) - } - pub fn psadbw(a: u8x16, b: u8x16) -> u64x2 { - let tmp = u8x16::from_fn(|i| a[i].absolute_diff(b[i])); - u64x2::from_fn(|i| { - (tmp[i * 8] as u16) - .wrapping_add(tmp[i * 8 + 1] as u16) - .wrapping_add(tmp[i * 8 + 2] as u16) - .wrapping_add(tmp[i * 8 + 3] as u16) - .wrapping_add(tmp[i * 8 + 4] as u16) - .wrapping_add(tmp[i * 8 + 5] as u16) - .wrapping_add(tmp[i * 8 + 6] as u16) - .wrapping_add(tmp[i * 8 + 7] as u16) as u64 - }) - } - pub fn psllw(a: i16x8, count: i16x8) -> i16x8 { - let count4: u64 = (count[0] as u16) as u64; - let count3: u64 = ((count[1] as u16) as u64) * 65536; - let count2: u64 = ((count[2] as u16) as u64) * 4294967296; - let count1: u64 = ((count[3] as u16) as u64) * 281474976710656; - let count = count1 + count2 + count3 + count4; - i16x8::from_fn(|i| { - if count > 15 { - 0 - } else { - ((a[i] as u16) << count) as i16 - } - }) - } - - pub fn pslld(a: i32x4, count: i32x4) -> i32x4 { - let count: u64 = ((count[1] as u32) as u64) * 4294967296 + ((count[0] as u32) as u64); - - i32x4::from_fn(|i| { - if count > 31 { - 0 - } else { - ((a[i] as u32) << count) as i32 - } - }) - } - - pub fn psllq(a: i64x2, count: i64x2) -> i64x2 { - let count: u64 = count[0] as u64; - - i64x2::from_fn(|i| { - if count > 63 { - 0 - } else { - ((a[i] as u64) << count) as i64 - } - }) - } - - pub fn psraw(a: i16x8, count: i16x8) -> i16x8 { - let count: u64 = ((count[3] as u16) as u64) * 281474976710656 - + ((count[2] as u16) as u64) * 4294967296 - + ((count[1] as u16) as u64) * 65536 - + ((count[0] as u16) as u64); - - i16x8::from_fn(|i| { - if count > 15 { - if a[i] < 0 { - -1 - } else { - 0 - } - } else { - a[i] >> count - } - }) - } - - pub fn psrad(a: i32x4, count: i32x4) -> i32x4 { - let count: u64 = ((count[1] as u32) as u64) * 4294967296 + ((count[0] as u32) as u64); - - i32x4::from_fn(|i| { - if count > 31 { - if a[i] < 0 { - -1 - } else { - 0 - } - } else { - a[i] << count - } - }) - } - - pub fn psrlw(a: i16x8, count: i16x8) -> i16x8 { - let count: u64 = (count[3] as u16 as u64) * 281474976710656 - + (count[2] as u16 as u64) * 4294967296 - + (count[1] as u16 as u64) * 65536 - + (count[0] as u16 as u64); - - i16x8::from_fn(|i| { - if count > 15 { - 0 - } else { - ((a[i] as u16) >> count) as i16 - } - }) - } - - pub fn psrld(a: i32x4, count: i32x4) -> i32x4 { - let count: u64 = (count[1] as u32 as u64) * 4294967296 + (count[0] as u32 as u64); - - i32x4::from_fn(|i| { - if count > 31 { - 0 - } else { - ((a[i] as u32) >> count) as i32 - } - }) - } - - pub fn psrlq(a: i64x2, count: i64x2) -> i64x2 { - let count: u64 = count[0] as u64; - - i64x2::from_fn(|i| { - if count > 63 { - 0 - } else { - ((a[i] as u64) >> count) as i64 - } - }) - } - - pub fn packssdw(a: i32x4, b: i32x4) -> i16x8 { - i16x8::from_fn(|i| { - if i < 4 { - if a[i] > (i16::MAX as 
i32) { - i16::MAX - } else if a[i] < (i16::MIN as i32) { - i16::MIN - } else { - a[i] as i16 - } - } else { - if b[i - 4] > (i16::MAX as i32) { - i16::MAX - } else if b[i - 4] < (i16::MIN as i32) { - i16::MIN - } else { - b[i - 4] as i16 - } - } - }) - } - - pub fn packuswb(a: i16x8, b: i16x8) -> u8x16 { - u8x16::from_fn(|i| { - if i < 8 { - if a[i] > (u8::MAX as i16) { - u8::MAX - } else if a[i] < (u8::MIN as i16) { - u8::MIN - } else { - a[i] as u8 - } - } else { - if b[i - 8] > (u8::MAX as i16) { - u8::MAX - } else if b[i - 8] < (u8::MIN as i16) { - u8::MIN - } else { - b[i - 8] as u8 - } - } - }) - } -} - -use c_extern::*; +use crate::abstractions::simd::*; +use crate::abstractions::utilities::*; /// Adds packed 8-bit integers in `a` and `b`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi8) - pub fn _mm_add_epi8(a: __m128i, b: __m128i) -> __m128i { - simd_add(BitVec::to_i8x16(a), BitVec::to_i8x16(b)).into() + transmute(simd_add(a.as_i8x16(), b.as_i8x16())) } - /// Adds packed 16-bit integers in `a` and `b`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi16) - pub fn _mm_add_epi16(a: __m128i, b: __m128i) -> __m128i { - BitVec::from_i16x8(simd_add(BitVec::to_i16x8(a), BitVec::to_i16x8(b))) + transmute(simd_add(a.as_i16x8(), b.as_i16x8())) } - /// Adds packed 32-bit integers in `a` and `b`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi32) - pub fn _mm_add_epi32(a: __m128i, b: __m128i) -> __m128i { - simd_add(BitVec::to_i32x4(a), BitVec::to_i32x4(b)).into() + transmute(simd_add(a.as_i32x4(), b.as_i32x4())) } - /// Adds packed 64-bit integers in `a` and `b`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi64) - pub fn _mm_add_epi64(a: __m128i, b: __m128i) -> __m128i { - simd_add(BitVec::to_i64x2(a), BitVec::to_i64x2(b)).into() + transmute(simd_add(a.as_i64x2(), b.as_i64x2())) } - /// Adds packed 8-bit integers in `a` and `b` using saturation. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epi8) - pub fn _mm_adds_epi8(a: __m128i, b: __m128i) -> __m128i { - simd_saturating_add(BitVec::to_i8x16(a), BitVec::to_i8x16(b)).into() + transmute(simd_saturating_add(a.as_i8x16(), b.as_i8x16())) } - /// Adds packed 16-bit integers in `a` and `b` using saturation. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epi16) - pub fn _mm_adds_epi16(a: __m128i, b: __m128i) -> __m128i { - simd_saturating_add(BitVec::to_i16x8(a), BitVec::to_i16x8(b)).into() + transmute(simd_saturating_add(a.as_i16x8(), b.as_i16x8())) } - /// Adds packed unsigned 8-bit integers in `a` and `b` using saturation. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epu8) - pub fn _mm_adds_epu8(a: __m128i, b: __m128i) -> __m128i { - simd_saturating_add(BitVec::to_u8x16(a), BitVec::to_u8x16(b)).into() + transmute(simd_saturating_add(a.as_u8x16(), b.as_u8x16())) } - /// Adds packed unsigned 16-bit integers in `a` and `b` using saturation. 
/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epu16) - pub fn _mm_adds_epu16(a: __m128i, b: __m128i) -> __m128i { - simd_saturating_add(BitVec::to_u16x8(a), BitVec::to_u16x8(b)).into() + transmute(simd_saturating_add(a.as_u16x8(), b.as_u16x8())) } - /// Averages packed unsigned 8-bit integers in `a` and `b`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_epu8) - pub fn _mm_avg_epu8(a: __m128i, b: __m128i) -> __m128i { - let a = simd_cast::<16, _, u16>(BitVec::to_u8x16(a)); - let b = simd_cast::<16, _, u16>(BitVec::to_u8x16(b)); - let r = simd_shr(simd_add(simd_add(a, b), u16x16::splat(1)), u16x16::splat(1)); - simd_cast::<16, _, u8>(r).into() + { + let a = simd_cast::<16, _, u16>(a.as_u8x16()); + let b = simd_cast::<16, _, u16>(b.as_u8x16()); + let r = simd_shr(simd_add(simd_add(a, b), u16x16::splat(1)), u16x16::splat(1)); + transmute(simd_cast::<16, _, u8>(r)) + } } - /// Averages packed unsigned 16-bit integers in `a` and `b`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_epu16) - pub fn _mm_avg_epu16(a: __m128i, b: __m128i) -> __m128i { - let a = simd_cast::<8, _, u32>(BitVec::to_u16x8(a)); - let b = simd_cast::<8, _, u32>(BitVec::to_u16x8(b)); - let r = simd_shr(simd_add(simd_add(a, b), u32x8::splat(1)), u32x8::splat(1)); - simd_cast::<8, _, u16>(r).into() + { + let a = simd_cast::<8, _, u32>(a.as_u16x8()); + let b = simd_cast::<8, _, u32>(b.as_u16x8()); + let r = simd_shr(simd_add(simd_add(a, b), u32x8::splat(1)), u32x8::splat(1)); + transmute(simd_cast::<8, _, u16>(r)) + } } - /// Multiplies and then horizontally add signed 16 bit integers in `a` and `b`. /// /// Multiplies packed signed 16-bit integers in `a` and `b`, producing @@ -295,108 +81,104 @@ pub fn _mm_avg_epu16(a: __m128i, b: __m128i) -> __m128i { /// intermediate 32-bit integers. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_madd_epi16) - pub fn _mm_madd_epi16(a: __m128i, b: __m128i) -> __m128i { - pmaddwd(BitVec::to_i16x8(a), BitVec::to_i16x8(b)).into() + transmute(pmaddwd(a.as_i16x8(), b.as_i16x8())) } - /// Compares packed 16-bit integers in `a` and `b`, and returns the packed /// maximum values. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi16) - pub fn _mm_max_epi16(a: __m128i, b: __m128i) -> __m128i { - let a = BitVec::to_i16x8(a); - let b = BitVec::to_i16x8(b); - simd_select(simd_gt(a, b), a, b).into() + { + let a = a.as_i16x8(); + let b = b.as_i16x8(); + transmute(simd_select(simd_gt(a, b), a, b)) + } } - /// Compares packed unsigned 8-bit integers in `a` and `b`, and returns the /// packed maximum values. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu8) - pub fn _mm_max_epu8(a: __m128i, b: __m128i) -> __m128i { - let a = BitVec::to_u8x16(a); - let b = BitVec::to_u8x16(b); - simd_select(simd_gt(a, b), a, b).into() + { + let a = a.as_u8x16(); + let b = b.as_u8x16(); + transmute(simd_select(simd_gt(a, b), a, b)) + } } - /// Compares packed 16-bit integers in `a` and `b`, and returns the packed /// minimum values. 
/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi16) - pub fn _mm_min_epi16(a: __m128i, b: __m128i) -> __m128i { - let a = BitVec::to_i16x8(a); - let b = BitVec::to_i16x8(b); - simd_select(simd_lt(a, b), a, b).into() + { + let a = a.as_i16x8(); + let b = b.as_i16x8(); + transmute(simd_select(simd_lt(a, b), a, b)) + } } - /// Compares packed unsigned 8-bit integers in `a` and `b`, and returns the /// packed minimum values. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu8) - pub fn _mm_min_epu8(a: __m128i, b: __m128i) -> __m128i { - let a = BitVec::to_u8x16(a); - let b = BitVec::to_u8x16(b); - simd_select(simd_lt(a, b), a, b).into() + { + let a = a.as_u8x16(); + let b = b.as_u8x16(); + transmute(simd_select(simd_lt(a, b), a, b)) + } } - /// Multiplies the packed 16-bit integers in `a` and `b`. /// /// The multiplication produces intermediate 32-bit integers, and returns the /// high 16 bits of the intermediate integers. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_epi16) - pub fn _mm_mulhi_epi16(a: __m128i, b: __m128i) -> __m128i { - let a = simd_cast::<8, i16, i32>(BitVec::to_i16x8(a)); - let b = simd_cast::<8, i16, i32>(BitVec::to_i16x8(b)); - let r = simd_shr(simd_mul(a, b), i32x8::splat(16)); - BitVec::from_i16x8(simd_cast::<8, i32, i16>(r)) + { + let a = simd_cast::<8, _, i32>(a.as_i16x8()); + let b = simd_cast::<8, _, i32>(b.as_i16x8()); + let r = simd_shr(simd_mul(a, b), i32x8::splat(16)); + transmute(simd_cast::<8, i32, i16>(r)) + } } - /// Multiplies the packed unsigned 16-bit integers in `a` and `b`. /// /// The multiplication produces intermediate 32-bit integers, and returns the /// high 16 bits of the intermediate integers. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_epu16) - pub fn _mm_mulhi_epu16(a: __m128i, b: __m128i) -> __m128i { - let a = simd_cast::<8, _, u32>(BitVec::to_u16x8(a)); - let b = simd_cast::<8, _, u32>(BitVec::to_u16x8(b)); - let r = simd_shr(simd_mul(a, b), u32x8::splat(16)); - simd_cast::<8, u32, u16>(r).into() + { + let a = simd_cast::<8, _, u32>(a.as_u16x8()); + let b = simd_cast::<8, _, u32>(b.as_u16x8()); + let r = simd_shr(simd_mul(a, b), u32x8::splat(16)); + transmute(simd_cast::<8, u32, u16>(r)) + } } - /// Multiplies the packed 16-bit integers in `a` and `b`. /// /// The multiplication produces intermediate 32-bit integers, and returns the /// low 16 bits of the intermediate integers. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mullo_epi16) - pub fn _mm_mullo_epi16(a: __m128i, b: __m128i) -> __m128i { - BitVec::from_i16x8(simd_mul(BitVec::to_i16x8(a), BitVec::to_i16x8(b))) + transmute(simd_mul(a.as_i16x8(), b.as_i16x8())) } - /// Multiplies the low unsigned 32-bit integers from each packed 64-bit element /// in `a` and `b`. /// /// Returns the unsigned 64-bit results. 
/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_epu32) - pub fn _mm_mul_epu32(a: __m128i, b: __m128i) -> __m128i { - let a = BitVec::to_u64x2(a); - let b = BitVec::to_u64x2(b); - let mask = u64x2::splat(u32::MAX.into()); - simd_mul(simd_and(a, mask), simd_and(b, mask)).into() + { + let a = a.as_u64x2(); + let b = b.as_u64x2(); + let mask = u64x2::splat(u32::MAX.into()); + transmute(simd_mul(simd_and(a, mask), simd_and(b, mask))) + } } - /// Sum the absolute differences of packed unsigned 8-bit integers. /// /// Computes the absolute differences of packed unsigned 8-bit integers in `a` @@ -405,103 +187,81 @@ pub fn _mm_mul_epu32(a: __m128i, b: __m128i) -> __m128i { /// the low 16 bits of 64-bit elements returned. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sad_epu8) - pub fn _mm_sad_epu8(a: __m128i, b: __m128i) -> __m128i { - psadbw(BitVec::to_u8x16(a), BitVec::to_u8x16(b)).into() + transmute(psadbw(a.as_u8x16(), b.as_u8x16())) } - /// Subtracts packed 8-bit integers in `b` from packed 8-bit integers in `a`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi8) - pub fn _mm_sub_epi8(a: __m128i, b: __m128i) -> __m128i { - BitVec::from_i8x16(simd_sub(BitVec::to_i8x16(a), BitVec::to_i8x16(b))) + transmute(simd_sub(a.as_i8x16(), b.as_i8x16())) } - /// Subtracts packed 16-bit integers in `b` from packed 16-bit integers in `a`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi16) - pub fn _mm_sub_epi16(a: __m128i, b: __m128i) -> __m128i { - BitVec::from_i16x8(simd_sub(BitVec::to_i16x8(a), BitVec::to_i16x8(b))) + transmute(simd_sub(a.as_i16x8(), b.as_i16x8())) } - /// Subtract packed 32-bit integers in `b` from packed 32-bit integers in `a`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi32) - pub fn _mm_sub_epi32(a: __m128i, b: __m128i) -> __m128i { - simd_sub(BitVec::to_i32x4(a), BitVec::to_i32x4(b)).into() + transmute(simd_sub(a.as_i32x4(), b.as_i32x4())) } - /// Subtract packed 64-bit integers in `b` from packed 64-bit integers in `a`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi64) - pub fn _mm_sub_epi64(a: __m128i, b: __m128i) -> __m128i { - simd_sub(BitVec::to_i64x2(a), BitVec::to_i64x2(b)).into() + transmute(simd_sub(a.as_i64x2(), b.as_i64x2())) } - /// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a` /// using saturation. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epi8) - pub fn _mm_subs_epi8(a: __m128i, b: __m128i) -> __m128i { - simd_saturating_sub(BitVec::to_i8x16(a), BitVec::to_i8x16(b)).into() + transmute(simd_saturating_sub(a.as_i8x16(), b.as_i8x16())) } - /// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a` /// using saturation. 
/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epi16) - pub fn _mm_subs_epi16(a: __m128i, b: __m128i) -> __m128i { - simd_saturating_sub(BitVec::to_i16x8(a), BitVec::to_i16x8(b)).into() + transmute(simd_saturating_sub(a.as_i16x8(), b.as_i16x8())) } - /// Subtract packed unsigned 8-bit integers in `b` from packed unsigned 8-bit /// integers in `a` using saturation. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epu8) - pub fn _mm_subs_epu8(a: __m128i, b: __m128i) -> __m128i { - simd_saturating_sub(BitVec::to_u8x16(a), BitVec::to_u8x16(b)).into() + transmute(simd_saturating_sub(a.as_u8x16(), b.as_u8x16())) } - /// Subtract packed unsigned 16-bit integers in `b` from packed unsigned 16-bit /// integers in `a` using saturation. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epu16) - pub fn _mm_subs_epu16(a: __m128i, b: __m128i) -> __m128i { - simd_saturating_sub(BitVec::to_u16x8(a), BitVec::to_u16x8(b)).into() + transmute(simd_saturating_sub(a.as_u16x8(), b.as_u16x8())) } - /// Shifts `a` left by `IMM8` bytes while shifting in zeros. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_si128) - pub fn _mm_slli_si128(a: __m128i) -> __m128i { - // static_assert_uimm_bits!(IMM8, 8); + static_assert_uimm_bits!(IMM8, 8); _mm_slli_si128_impl::(a) } -/// Implementation detail: converts the immediate argument of the -/// `_mm_slli_si128` intrinsic into a compile-time constant. - fn _mm_slli_si128_impl(a: __m128i) -> __m128i { - const fn mask(shift: i32, i: u32) -> u64 { + const fn mask(shift: i32, i: u32) -> u32 { let shift = shift as u32 & 0xff; if shift > 15 { - i as u64 + i } else { - (16 - shift + i) as u64 + 16 - shift + i } } - (simd_shuffle( - i8x16::from_fn(|_| 0), - BitVec::to_i8x16(a), + transmute::(simd_shuffle( + i8x16::ZERO(), + a.as_i8x16(), [ mask(IMM8, 0), mask(IMM8, 1), @@ -521,397 +281,365 @@ fn _mm_slli_si128_impl(a: __m128i) -> __m128i { mask(IMM8, 15), ], )) - .into() } /// Shifts `a` left by `IMM8` bytes while shifting in zeros. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_bslli_si128) - pub fn _mm_bslli_si128(a: __m128i) -> __m128i { - // static_assert_uimm_bits!(IMM8, 8); - _mm_slli_si128_impl::(a) + { + static_assert_uimm_bits!(IMM8, 8); + _mm_slli_si128_impl::(a) + } } - /// Shifts `a` right by `IMM8` bytes while shifting in zeros. 
/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_bsrli_si128) - pub fn _mm_bsrli_si128<const IMM8: i32>(a: __m128i) -> __m128i { - // static_assert_uimm_bits!(IMM8, 8); - _mm_srli_si128_impl::<IMM8>(a) + { + static_assert_uimm_bits!(IMM8, 8); + _mm_srli_si128_impl::<IMM8>(a) + } } +fn _mm_srli_si128_impl<const IMM8: i32>(a: __m128i) -> __m128i { + const fn mask(shift: i32, i: u32) -> u32 { + if (shift as u32) > 15 { + i + 16 + } else { + i + (shift as u32) + } + } + let x: i8x16 = simd_shuffle( + a.as_i8x16(), + i8x16::ZERO(), + [ + mask(IMM8, 0), + mask(IMM8, 1), + mask(IMM8, 2), + mask(IMM8, 3), + mask(IMM8, 4), + mask(IMM8, 5), + mask(IMM8, 6), + mask(IMM8, 7), + mask(IMM8, 8), + mask(IMM8, 9), + mask(IMM8, 10), + mask(IMM8, 11), + mask(IMM8, 12), + mask(IMM8, 13), + mask(IMM8, 14), + mask(IMM8, 15), + ], + ); + transmute(x) +} /// Shifts packed 16-bit integers in `a` left by `IMM8` while shifting in zeros. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi16) - pub fn _mm_slli_epi16<const IMM8: i32>(a: __m128i) -> __m128i { - // static_assert_uimm_bits!(IMM8, 8); - - if IMM8 >= 16 { - _mm_setzero_si128() - } else { - simd_shl(BitVec::to_u16x8(a), u16x8::splat(IMM8 as u16)).into() + static_assert_uimm_bits!(IMM8, 8); + { + if IMM8 >= 16 { + _mm_setzero_si128() + } else { + transmute(simd_shl(a.as_u16x8(), u16x8::splat(IMM8 as u16))) + } } } - /// Shifts packed 16-bit integers in `a` left by `count` while shifting in /// zeros. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi16) - pub fn _mm_sll_epi16(a: __m128i, count: __m128i) -> __m128i { - psllw(BitVec::to_i16x8(a), BitVec::to_i16x8(count)).into() + transmute(psllw(a.as_i16x8(), count.as_i16x8())) } - /// Shifts packed 32-bit integers in `a` left by `IMM8` while shifting in zeros. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi32) - pub fn _mm_slli_epi32<const IMM8: i32>(a: __m128i) -> __m128i { - // static_assert_uimm_bits!(IMM8, 8); - - if IMM8 >= 32 { - _mm_setzero_si128() - } else { - simd_shl(BitVec::to_u32x4(a), u32x4::splat(IMM8 as u32)).into() + static_assert_uimm_bits!(IMM8, 8); + { + if IMM8 >= 32 { + _mm_setzero_si128() + } else { + transmute(simd_shl(a.as_u32x4(), u32x4::splat(IMM8 as u32))) + } } } - /// Shifts packed 32-bit integers in `a` left by `count` while shifting in /// zeros. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi32) - pub fn _mm_sll_epi32(a: __m128i, count: __m128i) -> __m128i { - pslld(BitVec::to_i32x4(a), BitVec::to_i32x4(count)).into() + transmute(pslld(a.as_i32x4(), count.as_i32x4())) } - /// Shifts packed 64-bit integers in `a` left by `IMM8` while shifting in zeros. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi64) - pub fn _mm_slli_epi64<const IMM8: i32>(a: __m128i) -> __m128i { - // static_assert_uimm_bits!(IMM8, 8); - - if IMM8 >= 64 { - _mm_setzero_si128() - } else { - simd_shl(BitVec::to_u64x2(a), u64x2::splat(IMM8 as u64)).into() + static_assert_uimm_bits!(IMM8, 8); + { + if IMM8 >= 64 { + _mm_setzero_si128() + } else { + transmute(simd_shl(a.as_u64x2(), u64x2::splat(IMM8 as u64))) + } } } - /// Shifts packed 64-bit integers in `a` left by `count` while shifting in /// zeros.
/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi64) - pub fn _mm_sll_epi64(a: __m128i, count: __m128i) -> __m128i { - psllq(BitVec::to_i64x2(a), BitVec::to_i64x2(count)).into() + transmute(psllq(a.as_i64x2(), count.as_i64x2())) } - /// Shifts packed 16-bit integers in `a` right by `IMM8` while shifting in sign /// bits. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srai_epi16) - pub fn _mm_srai_epi16<const IMM8: i32>(a: __m128i) -> __m128i { - // static_assert_uimm_bits!(IMM8, 8); - simd_shr(BitVec::to_i16x8(a), i16x8::splat(IMM8.min(15) as i16)).into() + static_assert_uimm_bits!(IMM8, 8); + transmute(simd_shr(a.as_i16x8(), i16x8::splat(IMM8.min(15) as i16))) } - /// Shifts packed 16-bit integers in `a` right by `count` while shifting in sign /// bits. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sra_epi16) - pub fn _mm_sra_epi16(a: __m128i, count: __m128i) -> __m128i { - psraw(BitVec::to_i16x8(a), BitVec::to_i16x8(count)).into() + transmute(psraw(a.as_i16x8(), count.as_i16x8())) } - /// Shifts packed 32-bit integers in `a` right by `IMM8` while shifting in sign /// bits. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srai_epi32) - pub fn _mm_srai_epi32<const IMM8: i32>(a: __m128i) -> __m128i { - // static_assert_uimm_bits!(IMM8, 8); - simd_shr(BitVec::to_i32x4(a), i32x4::splat(IMM8.min(31))).into() + static_assert_uimm_bits!(IMM8, 8); + transmute(simd_shr(a.as_i32x4(), i32x4::splat(IMM8.min(31)))) } - /// Shifts packed 32-bit integers in `a` right by `count` while shifting in sign /// bits. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sra_epi32) - pub fn _mm_sra_epi32(a: __m128i, count: __m128i) -> __m128i { - psrad(BitVec::to_i32x4(a), BitVec::to_i32x4(count)).into() + transmute(psrad(a.as_i32x4(), count.as_i32x4())) } - /// Shifts `a` right by `IMM8` bytes while shifting in zeros. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_si128) - pub fn _mm_srli_si128<const IMM8: i32>(a: __m128i) -> __m128i { - // static_assert_uimm_bits!(IMM8, 8); + static_assert_uimm_bits!(IMM8, 8); _mm_srli_si128_impl::<IMM8>(a) } - -/// Implementation detail: converts the immediate argument of the -/// `_mm_srli_si128` intrinsic into a compile-time constant. - -fn _mm_srli_si128_impl<const IMM8: i32>(a: __m128i) -> __m128i { - const fn mask(shift: i32, i: u32) -> u64 { - if (shift as u32) > 15 { - (i + 16) as u64 - } else { - (i + (shift as u32)) as u64 - } - } - let x: i8x16 = simd_shuffle( - BitVec::to_i8x16(a), - i8x16::from_fn(|_| 0), - [ - mask(IMM8, 0), - mask(IMM8, 1), - mask(IMM8, 2), - mask(IMM8, 3), - mask(IMM8, 4), - mask(IMM8, 5), - mask(IMM8, 6), - mask(IMM8, 7), - mask(IMM8, 8), - mask(IMM8, 9), - mask(IMM8, 10), - mask(IMM8, 11), - mask(IMM8, 12), - mask(IMM8, 13), - mask(IMM8, 14), - mask(IMM8, 15), - ], - ); - x.into() -} - /// Shifts packed 16-bit integers in `a` right by `IMM8` while shifting in /// zeros.
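// A scalar sketch of the two right-shift flavors modeled here: arithmetic
// shifts clamp the count (`IMM8.min(15)`) and replicate the sign bit, while
// the logical shifts below are defined to return zero once the count reaches
// the lane width (plain Rust `>>` refuses such counts).
fn right_shift_semantics() {
    let x = -2i16;
    assert_eq!(x >> 15, -1); // _mm_srai_epi16 with a large IMM8: sign fill
    assert_eq!((x as u16) >> 15, 1); // logical shift: zero fill
    assert_eq!((x as u16).checked_shr(16), None); // count == lane width
}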
/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi16) - pub fn _mm_srli_epi16<const IMM8: i32>(a: __m128i) -> __m128i { - // static_assert_uimm_bits!(IMM8, 8); - - if IMM8 >= 16 { - _mm_setzero_si128() - } else { - simd_shr(BitVec::to_u16x8(a), u16x8::splat(IMM8 as u16)).into() + static_assert_uimm_bits!(IMM8, 8); + { + if IMM8 >= 16 { + _mm_setzero_si128() + } else { + transmute(simd_shr(a.as_u16x8(), u16x8::splat(IMM8 as u16))) + } } } - /// Shifts packed 16-bit integers in `a` right by `count` while shifting in /// zeros. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi16) - pub fn _mm_srl_epi16(a: __m128i, count: __m128i) -> __m128i { - psrlw(BitVec::to_i16x8(a), BitVec::to_i16x8(count)).into() + transmute(psrlw(a.as_i16x8(), count.as_i16x8())) } - /// Shifts packed 32-bit integers in `a` right by `IMM8` while shifting in /// zeros. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi32) - pub fn _mm_srli_epi32<const IMM8: i32>(a: __m128i) -> __m128i { - // static_assert_uimm_bits!(IMM8, 8); - - if IMM8 >= 32 { - _mm_setzero_si128() - } else { - simd_shr(BitVec::to_u32x4(a), u32x4::splat(IMM8 as u32)).into() + static_assert_uimm_bits!(IMM8, 8); + { + if IMM8 >= 32 { + _mm_setzero_si128() + } else { + transmute(simd_shr(a.as_u32x4(), u32x4::splat(IMM8 as u32))) + } } } - /// Shifts packed 32-bit integers in `a` right by `count` while shifting in /// zeros. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi32) - pub fn _mm_srl_epi32(a: __m128i, count: __m128i) -> __m128i { - psrld(BitVec::to_i32x4(a), BitVec::to_i32x4(count)).into() + transmute(psrld(a.as_i32x4(), count.as_i32x4())) } - /// Shifts packed 64-bit integers in `a` right by `IMM8` while shifting in /// zeros. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi64) - pub fn _mm_srli_epi64<const IMM8: i32>(a: __m128i) -> __m128i { - // TODO // static_assert_uimm_bits!(IMM8, 8); - - if IMM8 >= 64 { - BitVec::from_fn(|_| Bit::Zero) - } else { - BitVec::from_u64x2(simd_shr(BitVec::to_u64x2(a), u64x2::splat(IMM8 as u64))) + static_assert_uimm_bits!(IMM8, 8); + { + if IMM8 >= 64 { + _mm_setzero_si128() + } else { + transmute(simd_shr(a.as_u64x2(), u64x2::splat(IMM8 as u64))) + } } } - /// Shifts packed 64-bit integers in `a` right by `count` while shifting in /// zeros. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi64) - pub fn _mm_srl_epi64(a: __m128i, count: __m128i) -> __m128i { - psrlq(BitVec::to_i64x2(a), BitVec::to_i64x2(count)).into() + transmute(psrlq(a.as_i64x2(), count.as_i64x2())) } - /// Computes the bitwise AND of 128 bits (representing integer data) in `a` and /// `b`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_si128) - pub fn _mm_and_si128(a: __m128i, b: __m128i) -> __m128i { - BitVec::from_fn(|i| a[i] & b[i]) + transmute(simd_and(a.as_i32x4(), b.as_i32x4())) } - /// Computes the bitwise NOT of 128 bits (representing integer data) in `a` and /// then AND with `b`.
/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_si128) - pub fn _mm_andnot_si128(a: __m128i, b: __m128i) -> __m128i { - BitVec::from_fn(|i| BitVec::<128>::from_fn(|i| _mm_set1_epi8(-1)[i] ^ a[i])[i] & b[i]) + transmute(simd_and( + simd_xor(_mm_set1_epi8(-1).as_i32x4(), a.as_i32x4()), + b.as_i32x4(), + )) } - /// Computes the bitwise OR of 128 bits (representing integer data) in `a` and /// `b`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_or_si128) - pub fn _mm_or_si128(a: __m128i, b: __m128i) -> __m128i { - BitVec::from_fn(|i| a[i] | b[i]) + transmute(simd_or(a.as_i32x4(), b.as_i32x4())) } - /// Computes the bitwise XOR of 128 bits (representing integer data) in `a` and /// `b`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_si128) - pub fn _mm_xor_si128(a: __m128i, b: __m128i) -> __m128i { - BitVec::from_fn(|i| a[i] ^ b[i]) + transmute(simd_xor(a.as_i32x4(), b.as_i32x4())) } - /// Compares packed 8-bit integers in `a` and `b` for equality. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi8) - pub fn _mm_cmpeq_epi8(a: __m128i, b: __m128i) -> __m128i { - (simd_eq(BitVec::to_i8x16(a), BitVec::to_i8x16(b))).into() + transmute::<i8x16, _>(simd_eq(a.as_i8x16(), b.as_i8x16())) } - /// Compares packed 16-bit integers in `a` and `b` for equality. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi16) - pub fn _mm_cmpeq_epi16(a: __m128i, b: __m128i) -> __m128i { - (simd_eq(BitVec::to_i16x8(a), BitVec::to_i16x8(b))).into() + transmute::<i16x8, _>(simd_eq(a.as_i16x8(), b.as_i16x8())) } - /// Compares packed 32-bit integers in `a` and `b` for equality. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi32) - pub fn _mm_cmpeq_epi32(a: __m128i, b: __m128i) -> __m128i { - (simd_eq(BitVec::to_i32x4(a), BitVec::to_i32x4(b))).into() + transmute::<i32x4, _>(simd_eq(a.as_i32x4(), b.as_i32x4())) } - /// Compares packed 8-bit integers in `a` and `b` for greater-than. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi8) - pub fn _mm_cmpgt_epi8(a: __m128i, b: __m128i) -> __m128i { - (simd_gt(BitVec::to_i8x16(a), BitVec::to_i8x16(b))).into() + transmute::<i8x16, _>(simd_gt(a.as_i8x16(), b.as_i8x16())) } - /// Compares packed 16-bit integers in `a` and `b` for greater-than. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi16) - pub fn _mm_cmpgt_epi16(a: __m128i, b: __m128i) -> __m128i { - (simd_gt(BitVec::to_i16x8(a), BitVec::to_i16x8(b))).into() + transmute::<i16x8, _>(simd_gt(a.as_i16x8(), b.as_i16x8())) } - /// Compares packed 32-bit integers in `a` and `b` for greater-than. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi32) - pub fn _mm_cmpgt_epi32(a: __m128i, b: __m128i) -> __m128i { - (simd_gt(BitVec::to_i32x4(a), BitVec::to_i32x4(b))).into() + transmute::<i32x4, _>(simd_gt(a.as_i32x4(), b.as_i32x4())) } - /// Compares packed 8-bit integers in `a` and `b` for less-than.
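// A hypothetical helper, not part of the patch: the classic mask-select
// idiom built from the three bitwise models above. The `_mm_cmp*` models
// produce all-ones or all-zeros lanes, so this returns `if_set` wherever the
// mask lane is set and `if_clear` elsewhere.
fn select_si128(mask: __m128i, if_set: __m128i, if_clear: __m128i) -> __m128i {
    _mm_or_si128(_mm_and_si128(mask, if_set), _mm_andnot_si128(mask, if_clear))
}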
/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi8) - pub fn _mm_cmplt_epi8(a: __m128i, b: __m128i) -> __m128i { - (simd_lt(BitVec::to_i8x16(a), BitVec::to_i8x16(b))).into() + transmute::<i8x16, _>(simd_lt(a.as_i8x16(), b.as_i8x16())) } - /// Compares packed 16-bit integers in `a` and `b` for less-than. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi16) - pub fn _mm_cmplt_epi16(a: __m128i, b: __m128i) -> __m128i { - (simd_lt(BitVec::to_i16x8(a), BitVec::to_i16x8(b))).into() + transmute::<i16x8, _>(simd_lt(a.as_i16x8(), b.as_i16x8())) } - /// Compares packed 32-bit integers in `a` and `b` for less-than. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi32) - pub fn _mm_cmplt_epi32(a: __m128i, b: __m128i) -> __m128i { - (simd_lt(BitVec::to_i32x4(a), BitVec::to_i32x4(b))).into() + transmute::<i32x4, _>(simd_lt(a.as_i32x4(), b.as_i32x4())) } - +/// Converts the lower two packed 32-bit integers in `a` to packed +/// double-precision (64-bit) floating-point elements. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_pd) +pub fn _mm_cvtepi32_pd(a: __m128i) -> __m128d { + { + let a = a.as_i32x4(); + transmute(simd_cast::<2, i32, f64>(simd_shuffle(a, a, [0, 1]))) + } +} +/// Returns `a` with its lower element replaced by `b` after converting it to +/// an `f64`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_sd) +pub fn _mm_cvtsi32_sd(a: __m128d, b: i32) -> __m128d { + transmute(simd_insert(a.as_f64x2(), 0, b as f64)) +} +/// Converts packed 32-bit integers in `a` to packed single-precision (32-bit) +/// floating-point elements. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_ps) +pub fn _mm_cvtepi32_ps(a: __m128i) -> __m128 { + transmute(simd_cast::<4, _, f32>(a.as_i32x4())) +} +/// Converts packed single-precision (32-bit) floating-point elements in `a` +/// to packed 32-bit integers. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_epi32) +// NOTE: Not modeled yet +// pub fn _mm_cvtps_epi32(a: __m128) -> __m128i { +// { transmute(cvtps2dq(a)) } +// } +/// Returns a vector whose lowest element is `a` and all higher elements are +/// `0`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_si128) pub fn _mm_cvtsi32_si128(a: i32) -> __m128i { - i32x4::from_fn(|i| if i == 0 { a } else { 0 }).into() + transmute(i32x4::new(a, 0, 0, 0)) } - /// Returns the lowest element of `a`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si32) - pub fn _mm_cvtsi128_si32(a: __m128i) -> i32 { - simd_extract(BitVec::to_i32x4(a), 0) + simd_extract(a.as_i32x4(), 0) } - /// Sets packed 64-bit integers with the supplied values, from highest to /// lowest.
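// A test-style sketch of the roundtrip these two moves should satisfy:
// `_mm_cvtsi32_si128` writes lane 0 (zeroing lanes 1..=3) and
// `_mm_cvtsi128_si32` reads it back.
fn cvtsi32_roundtrip(x: i32) {
    assert_eq!(_mm_cvtsi128_si32(_mm_cvtsi32_si128(x)), x);
}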
/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi64x) - -// no particular instruction to test - pub fn _mm_set_epi64x(e1: i64, e0: i64) -> __m128i { - i64x2::from_fn(|i| if i == 0 { e0 } else { e1 }).into() + transmute(i64x2::new(e0, e1)) } - /// Sets packed 32-bit integers with the supplied values. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi32) -// no particular instruction to test pub fn _mm_set_epi32(e3: i32, e2: i32, e1: i32, e0: i32) -> __m128i { - let vec = [e0, e1, e2, e3]; - BitVec::from_i32x4(i32x4::from_fn(|i| vec[i as usize])) + transmute(i32x4::new(e0, e1, e2, e3)) } - /// Sets packed 16-bit integers with the supplied values. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi16) - -// no particular instruction to test - pub fn _mm_set_epi16( e7: i16, e6: i16, @@ -922,14 +650,11 @@ pub fn _mm_set_epi16( e1: i16, e0: i16, ) -> __m128i { - let vec = [e0, e1, e2, e3, e4, e5, e6, e7]; - BitVec::from_i16x8(i16x8::from_fn(|i| vec[i as usize])) + transmute(i16x8::new(e0, e1, e2, e3, e4, e5, e6, e7)) } - /// Sets packed 8-bit integers with the supplied values. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi8) -// no particular instruction to test pub fn _mm_set_epi8( e15: i8, e14: i8, @@ -948,68 +673,45 @@ pub fn _mm_set_epi8( e1: i8, e0: i8, ) -> __m128i { - let vec = [ - e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15, - ]; - BitVec::from_i8x16(i8x16::from_fn(|i| vec[i as usize])) + { + transmute(i8x16::new( + e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15, + )) + } } - /// Broadcasts 64-bit integer `a` to all elements. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi64x) - -// no particular instruction to test - pub fn _mm_set1_epi64x(a: i64) -> __m128i { _mm_set_epi64x(a, a) } - /// Broadcasts 32-bit integer `a` to all elements. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi32) - -// no particular instruction to test - pub fn _mm_set1_epi32(a: i32) -> __m128i { _mm_set_epi32(a, a, a, a) } - /// Broadcasts 16-bit integer `a` to all elements. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi16) - -// no particular instruction to test - pub fn _mm_set1_epi16(a: i16) -> __m128i { - BitVec::from_i16x8(i16x8::from_fn(|_| a)) + _mm_set_epi16(a, a, a, a, a, a, a, a) } - /// Broadcasts 8-bit integer `a` to all elements. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi8) - -// no particular instruction to test - pub fn _mm_set1_epi8(a: i8) -> __m128i { _mm_set_epi8(a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a) } - /// Sets packed 32-bit integers with the supplied values in reverse order. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi32) - -// no particular instruction to test - pub fn _mm_setr_epi32(e3: i32, e2: i32, e1: i32, e0: i32) -> __m128i { _mm_set_epi32(e0, e1, e2, e3) } - /// Sets packed 16-bit integers with the supplied values in reverse order. 
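// A sketch of the argument convention used by the `_mm_set_*` constructors
// above: arguments run from the highest lane down to lane 0, so `e0` lands
// in lane 0; the `_mm_setr_*` variants reverse this.
fn set_epi32_order() {
    let v = _mm_set_epi32(3, 2, 1, 0);
    assert_eq!(_mm_cvtsi128_si32(v), 0); // lane 0 holds the final argument
}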
/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi16) - -// no particular instruction to test - pub fn _mm_setr_epi16( e7: i16, e6: i16, @@ -1022,13 +724,9 @@ pub fn _mm_setr_epi16( e1: i16, e0: i16, ) -> __m128i { _mm_set_epi16(e0, e1, e2, e3, e4, e5, e6, e7) } - /// Sets packed 8-bit integers with the supplied values in reverse order. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi8) - -// no particular instruction to test - pub fn _mm_setr_epi8( e15: i8, e14: i8, @@ -1051,104 +749,87 @@ pub fn _mm_setr_epi8( e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15, ) } - /// Returns a vector with all elements set to zero. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_si128) - pub fn _mm_setzero_si128() -> __m128i { - BitVec::from_fn(|_| Bit::Zero) + transmute(i32x4::ZERO()) } - /// Returns a vector where the low element is extracted from `a` and its upper /// element is zero. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_epi64) - -// FIXME movd on msvc, movd on i686 - pub fn _mm_move_epi64(a: __m128i) -> __m128i { - let r: i64x2 = simd_shuffle(BitVec::to_i64x2(a), i64x2::from_fn(|_| 0), [0, 2]); - r.into() + { + let r: i64x2 = simd_shuffle(a.as_i64x2(), i64x2::ZERO(), [0, 2]); + transmute(r) + } } - /// Converts packed 16-bit integers from `a` and `b` to packed 8-bit integers /// using signed saturation. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi16) - pub fn _mm_packs_epi16(a: __m128i, b: __m128i) -> __m128i { - packsswb(BitVec::to_i16x8(a), BitVec::to_i16x8(b)).into() + transmute(packsswb(a.as_i16x8(), b.as_i16x8())) } - /// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers /// using signed saturation. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi32) - pub fn _mm_packs_epi32(a: __m128i, b: __m128i) -> __m128i { - packssdw(BitVec::to_i32x4(a), BitVec::to_i32x4(b)).into() + transmute(packssdw(a.as_i32x4(), b.as_i32x4())) } - /// Converts packed 16-bit integers from `a` and `b` to packed 8-bit integers /// using unsigned saturation. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi16) - pub fn _mm_packus_epi16(a: __m128i, b: __m128i) -> __m128i { - packuswb(BitVec::to_i16x8(a), BitVec::to_i16x8(b)).into() + transmute(packuswb(a.as_i16x8(), b.as_i16x8())) } - /// Returns the `imm8` element of `a`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi16) - pub fn _mm_extract_epi16<const IMM8: i32>(a: __m128i) -> i32 { - // static_assert_uimm_bits!(IMM8, 3); - simd_extract(BitVec::to_u16x8(a), IMM8 as u64) as i32 + static_assert_uimm_bits!(IMM8, 3); + simd_extract(a.as_u16x8(), IMM8 as u32) as i32 } - /// Returns a new vector where the `imm8` element of `a` is replaced with `i`.
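// A scalar sketch of the saturation behind `_mm_packs_epi16` (signed) and
// `_mm_packus_epi16` (unsigned), mirroring what the `packsswb` and
// `packuswb` helpers compute per lane.
fn pack_lane(x: i16) -> (i8, u8) {
    (
        x.clamp(i8::MIN as i16, i8::MAX as i16) as i8, // packs: clamp to [-128, 127]
        x.clamp(0, u8::MAX as i16) as u8,              // packus: clamp to [0, 255]
    )
}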
/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi16) - pub fn _mm_insert_epi16<const IMM8: i32>(a: __m128i, i: i32) -> __m128i { - // static_assert_uimm_bits!(IMM8, 3); - simd_insert(BitVec::to_i16x8(a), IMM8 as u64, i as i16).into() + static_assert_uimm_bits!(IMM8, 3); + transmute(simd_insert(a.as_i16x8(), IMM8 as u32, i as i16)) } - /// Returns a mask of the most significant bit of each element in `a`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_epi8) - pub fn _mm_movemask_epi8(a: __m128i) -> i32 { - let z = i8x16::from_fn(|_| 0); - let m: i8x16 = simd_lt(BitVec::to_i8x16(a), z); - let r = simd_bitmask_little!(15, m, u16); - r as u32 as i32 + { + let z = i8x16::ZERO(); + let m: i8x16 = simd_lt(a.as_i8x16(), z); + simd_bitmask_little!(15, m, u16) as u32 as i32 + } } - /// Shuffles 32-bit integers in `a` using the control in `IMM8`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_epi32) - pub fn _mm_shuffle_epi32<const IMM8: i32>(a: __m128i) -> __m128i { - // static_assert_uimm_bits!(IMM8, 8); - - let a = BitVec::to_i32x4(a); - let x: i32x4 = simd_shuffle( - a, - a, - [ - IMM8 as u64 & 0b11, - (IMM8 as u64 >> 2) & 0b11, - (IMM8 as u64 >> 4) & 0b11, - (IMM8 as u64 >> 6) & 0b11, - ], - ); - x.into() + static_assert_uimm_bits!(IMM8, 8); + { + let a = a.as_i32x4(); + let x: i32x4 = simd_shuffle( + a, + a, + [ + IMM8 as u32 & 0b11, + (IMM8 as u32 >> 2) & 0b11, + (IMM8 as u32 >> 4) & 0b11, + (IMM8 as u32 >> 6) & 0b11, + ], + ); + transmute(x) + } } - /// Shuffles 16-bit integers in the high 64 bits of `a` using the control in /// `IMM8`. /// @@ -1156,28 +837,27 @@ pub fn _mm_shuffle_epi32<const IMM8: i32>(a: __m128i) -> __m128i { /// bits being copied from `a`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shufflehi_epi16) - pub fn _mm_shufflehi_epi16<const IMM8: i32>(a: __m128i) -> __m128i { - // static_assert_uimm_bits!(IMM8, 8); - - let a = BitVec::to_i16x8(a); - let x: i16x8 = simd_shuffle( - a, - a, - [ - 0, - 1, - 2, - 3, - (IMM8 as u64 & 0b11) + 4, - ((IMM8 as u64 >> 2) & 0b11) + 4, - ((IMM8 as u64 >> 4) & 0b11) + 4, - ((IMM8 as u64 >> 6) & 0b11) + 4, - ], - ); - x.into() + static_assert_uimm_bits!(IMM8, 8); + { + let a = a.as_i16x8(); + let x: i16x8 = simd_shuffle( + a, + a, + [ + 0, + 1, + 2, + 3, + (IMM8 as u32 & 0b11) + 4, + ((IMM8 as u32 >> 2) & 0b11) + 4, + ((IMM8 as u32 >> 4) & 0b11) + 4, + ((IMM8 as u32 >> 6) & 0b11) + 4, + ], + ); + transmute(x) + } } - /// Shuffles 16-bit integers in the low 64 bits of `a` using the control in /// `IMM8`. /// @@ -1185,119 +865,754 @@ pub fn _mm_shufflehi_epi16<const IMM8: i32>(a: __m128i) -> __m128i { /// bits being copied from `a`.
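// A reference sketch of the immediate decoding in `_mm_shuffle_epi32` above:
// `IMM8` holds four 2-bit source-lane selectors, one per destination lane.
// For example, IMM8 = 0b00_01_10_11 (0x1B) reverses the four lanes.
fn shuffle_epi32_reference(a: [i32; 4], imm8: u8) -> [i32; 4] {
    let mut r = [0i32; 4];
    for i in 0..4 {
        r[i] = a[((imm8 >> (2 * i)) & 0b11) as usize];
    }
    r
}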
/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shufflelo_epi16) - pub fn _mm_shufflelo_epi16<const IMM8: i32>(a: __m128i) -> __m128i { - // static_assert_uimm_bits!(IMM8, 8); - - let a = BitVec::to_i16x8(a); - let x: i16x8 = simd_shuffle( - a, - a, - [ - IMM8 as u64 & 0b11, - (IMM8 as u64 >> 2) & 0b11, - (IMM8 as u64 >> 4) & 0b11, - (IMM8 as u64 >> 6) & 0b11, - 4, - 5, - 6, - 7, - ], - ); - x.into() + static_assert_uimm_bits!(IMM8, 8); + { + let a = a.as_i16x8(); + let x: i16x8 = simd_shuffle( + a, + a, + [ + IMM8 as u32 & 0b11, + (IMM8 as u32 >> 2) & 0b11, + (IMM8 as u32 >> 4) & 0b11, + (IMM8 as u32 >> 6) & 0b11, + 4, + 5, + 6, + 7, + ], + ); + transmute(x) + } } - /// Unpacks and interleave 8-bit integers from the high half of `a` and `b`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi8) - pub fn _mm_unpackhi_epi8(a: __m128i, b: __m128i) -> __m128i { - (simd_shuffle( - BitVec::to_i8x16(a), - BitVec::to_i8x16(b), - [8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31], - )) - .into() + { + transmute::<i8x16, _>(simd_shuffle( + a.as_i8x16(), + b.as_i8x16(), + [8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31], + )) + } } - /// Unpacks and interleave 16-bit integers from the high half of `a` and `b`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi16) - pub fn _mm_unpackhi_epi16(a: __m128i, b: __m128i) -> __m128i { - let x = simd_shuffle( - BitVec::to_i16x8(a), - BitVec::to_i16x8(b), - [4, 12, 5, 13, 6, 14, 7, 15], - ); - (x).into() + { + let x = simd_shuffle(a.as_i16x8(), b.as_i16x8(), [4, 12, 5, 13, 6, 14, 7, 15]); + transmute::<i16x8, _>(x) + } } - /// Unpacks and interleave 32-bit integers from the high half of `a` and `b`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi32) - pub fn _mm_unpackhi_epi32(a: __m128i, b: __m128i) -> __m128i { - (simd_shuffle(BitVec::to_i32x4(a), BitVec::to_i32x4(b), [2, 6, 3, 7])).into() + transmute::<i32x4, _>(simd_shuffle(a.as_i32x4(), b.as_i32x4(), [2, 6, 3, 7])) } - /// Unpacks and interleave 64-bit integers from the high half of `a` and `b`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi64) - pub fn _mm_unpackhi_epi64(a: __m128i, b: __m128i) -> __m128i { - (simd_shuffle(BitVec::to_i64x2(a), BitVec::to_i64x2(b), [1, 3])).into() + transmute::<i64x2, _>(simd_shuffle(a.as_i64x2(), b.as_i64x2(), [1, 3])) } - /// Unpacks and interleave 8-bit integers from the low half of `a` and `b`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi8) - pub fn _mm_unpacklo_epi8(a: __m128i, b: __m128i) -> __m128i { - (simd_shuffle( - BitVec::to_i8x16(a), - BitVec::to_i8x16(b), - [0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23], - )) - .into() + { + transmute::<i8x16, _>(simd_shuffle( + a.as_i8x16(), + b.as_i8x16(), + [0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23], + )) + } } - /// Unpacks and interleave 16-bit integers from the low half of `a` and `b`.
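// A scalar sketch of the interleaving pattern shared by the unpack models:
// `_mm_unpacklo_epi8` alternates the low eight bytes of `a` and `b` (shuffle
// indices 0, 16, 1, 17, ...), and the `unpackhi` variants do the same with
// the high eight bytes.
fn unpacklo_bytes(a: [u8; 16], b: [u8; 16]) -> [u8; 16] {
    let mut r = [0u8; 16];
    for i in 0..8 {
        r[2 * i] = a[i];
        r[2 * i + 1] = b[i];
    }
    r
}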
/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi16) - pub fn _mm_unpacklo_epi16(a: __m128i, b: __m128i) -> __m128i { - let x = simd_shuffle( - BitVec::to_i16x8(a), - BitVec::to_i16x8(b), - [0, 8, 1, 9, 2, 10, 3, 11], - ); - x.into() + { + let x = simd_shuffle(a.as_i16x8(), b.as_i16x8(), [0, 8, 1, 9, 2, 10, 3, 11]); + transmute::<i16x8, _>(x) + } } - /// Unpacks and interleave 32-bit integers from the low half of `a` and `b`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi32) - pub fn _mm_unpacklo_epi32(a: __m128i, b: __m128i) -> __m128i { - simd_shuffle(BitVec::to_i32x4(a), BitVec::to_i32x4(b), [0, 4, 1, 5]).into() + transmute::<i32x4, _>(simd_shuffle(a.as_i32x4(), b.as_i32x4(), [0, 4, 1, 5])) } - /// Unpacks and interleave 64-bit integers from the low half of `a` and `b`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi64) - pub fn _mm_unpacklo_epi64(a: __m128i, b: __m128i) -> __m128i { - simd_shuffle(BitVec::to_i64x2(a), BitVec::to_i64x2(b), [0, 2]).into() + transmute::<i64x2, _>(simd_shuffle(a.as_i64x2(), b.as_i64x2(), [0, 2])) +} +/// Returns a new vector with the low element of `a` replaced by the sum of the +/// low elements of `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_sd) +// NOTE: Not modeled yet +// pub fn _mm_add_sd(a: __m128d, b: __m128d) -> __m128d { +// { transmute(simd_insert(a.as_f64x2(), 0, _mm_cvtsd_f64(a) + _mm_cvtsd_f64(b))) } +// } +/// Adds packed double-precision (64-bit) floating-point elements in `a` and +/// `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_pd) +// NOTE: Not modeled yet +// pub fn _mm_add_pd(a: __m128d, b: __m128d) -> __m128d { +// { simd_add(a, b) } +// } +/// Returns a new vector with the low element of `a` replaced by the result of +/// dividing the lower element of `a` by the lower element of `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_sd) +// NOTE: Not modeled yet +// pub fn _mm_div_sd(a: __m128d, b: __m128d) -> __m128d { +// { transmute(simd_insert(a.as_f64x2(), 0, _mm_cvtsd_f64(a) / _mm_cvtsd_f64(b))) } +// } +/// Divide packed double-precision (64-bit) floating-point elements in `a` by +/// packed elements in `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_pd) +// NOTE: Not modeled yet +// pub fn _mm_div_pd(a: __m128d, b: __m128d) -> __m128d { +// { simd_div(a, b) } +// } +/// Returns a new vector with the low element of `a` replaced by the maximum +/// of the lower elements of `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_sd) +// NOTE: Not modeled yet +// pub fn _mm_max_sd(a: __m128d, b: __m128d) -> __m128d { +// { maxsd(a, b) } +// } +/// Returns a new vector with the maximum values from corresponding elements in +/// `a` and `b`.
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pd) +// NOTE: Not modeled yet +// pub fn _mm_max_pd(a: __m128d, b: __m128d) -> __m128d { +// { maxpd(a, b) } +// } +/// Returns a new vector with the low element of `a` replaced by the minimum +/// of the lower elements of `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_sd) +// NOTE: Not modeled yet +// pub fn _mm_min_sd(a: __m128d, b: __m128d) -> __m128d { +// { minsd(a, b) } +// } +/// Returns a new vector with the minimum values from corresponding elements in +/// `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pd) +// NOTE: Not modeled yet +// pub fn _mm_min_pd(a: __m128d, b: __m128d) -> __m128d { +// { minpd(a, b) } +// } +/// Returns a new vector with the low element of `a` replaced by multiplying the +/// low elements of `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_sd) +// NOTE: Not modeled yet +// pub fn _mm_mul_sd(a: __m128d, b: __m128d) -> __m128d { +// { transmute(simd_insert(a.as_f64x2(), 0, _mm_cvtsd_f64(a) * _mm_cvtsd_f64(b))) } +// } +/// Multiplies packed double-precision (64-bit) floating-point elements in `a` +/// and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_pd) +// NOTE: Not modeled yet +// pub fn _mm_mul_pd(a: __m128d, b: __m128d) -> __m128d { +// { transmute(simd_mul(a.as_f64x2(), b.as_f64x2())) } +// } +/// Returns a new vector with the low element of `a` replaced by the square +/// root of the lower element of `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_sd) +// NOTE: Not modeled yet +// pub fn _mm_sqrt_sd(a: __m128d, b: __m128d) -> __m128d { +// { simd_insert(a, 0, sqrtf64(_mm_cvtsd_f64(b))) } +// } +/// Returns a new vector with the square root of each of the values in `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_pd) +// NOTE: Not modeled yet +// pub fn _mm_sqrt_pd(a: __m128d) -> __m128d { +// { simd_fsqrt(a) } +// } +/// Returns a new vector with the low element of `a` replaced by subtracting the +/// low element of `b` from the low element of `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_sd) +// NOTE: Not modeled yet +// pub fn _mm_sub_sd(a: __m128d, b: __m128d) -> __m128d { +// { transmute(simd_insert(a.as_f64x2(), 0, _mm_cvtsd_f64(a) - _mm_cvtsd_f64(b))) } +// } +/// Subtract packed double-precision (64-bit) floating-point elements in `b` +/// from `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_pd) +// NOTE: Not modeled yet +// pub fn _mm_sub_pd(a: __m128d, b: __m128d) -> __m128d { +// { simd_sub(a, b) } +// } +/// Computes the bitwise AND of packed double-precision (64-bit) floating-point +/// elements in `a` and `b`.
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_pd) +pub fn _mm_and_pd(a: __m128d, b: __m128d) -> __m128d { + { + let a: __m128i = transmute(a); + let b: __m128i = transmute(b); + transmute(_mm_and_si128(a, b)) + } +} +/// Computes the bitwise NOT of `a` and then AND with `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_pd) +pub fn _mm_andnot_pd(a: __m128d, b: __m128d) -> __m128d { + { + let a: __m128i = transmute(a); + let b: __m128i = transmute(b); + transmute(_mm_andnot_si128(a, b)) + } +} +/// Computes the bitwise OR of `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_or_pd) +pub fn _mm_or_pd(a: __m128d, b: __m128d) -> __m128d { + { + let a: __m128i = transmute(a); + let b: __m128i = transmute(b); + transmute(_mm_or_si128(a, b)) + } +} +/// Computes the bitwise XOR of `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_pd) +pub fn _mm_xor_pd(a: __m128d, b: __m128d) -> __m128d { + { + let a: __m128i = transmute(a); + let b: __m128i = transmute(b); + transmute(_mm_xor_si128(a, b)) + } +} +/// Returns a new vector with the low element of `a` replaced by the equality +/// comparison of the lower elements of `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_sd) +// NOTE: Not modeled yet +// pub fn _mm_cmpeq_sd(a: __m128d, b: __m128d) -> __m128d { +// { cmpsd(a, b, 0) } +// } +/// Returns a new vector with the low element of `a` replaced by the less-than +/// comparison of the lower elements of `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_sd) +// NOTE: Not modeled yet +// pub fn _mm_cmplt_sd(a: __m128d, b: __m128d) -> __m128d { +// { cmpsd(a, b, 1) } +// } +/// Returns a new vector with the low element of `a` replaced by the +/// less-than-or-equal comparison of the lower elements of `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_sd) +// NOTE: Not modeled yet +// pub fn _mm_cmple_sd(a: __m128d, b: __m128d) -> __m128d { +// { cmpsd(a, b, 2) } +// } +/// Returns a new vector with the low element of `a` replaced by the +/// greater-than comparison of the lower elements of `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_sd) +// NOTE: Not modeled yet +// pub fn _mm_cmpgt_sd(a: __m128d, b: __m128d) -> __m128d { +// { transmute(simd_insert(_mm_cmplt_sd(b, a), 1, simd_extract(a, 1))) } +// } +/// Returns a new vector with the low element of `a` replaced by the +/// greater-than-or-equal comparison of the lower elements of `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_sd) +// NOTE: Not modeled yet +// pub fn _mm_cmpge_sd(a: __m128d, b: __m128d) -> __m128d { +// { simd_insert(_mm_cmple_sd(b, a), 1, simd_extract(a, 1)) } +// } +/// Returns a new vector with the low element of `a` replaced by the result +/// of comparing both of the lower elements of `a` and `b` to `NaN`. If +/// neither are equal to `NaN` then `0xFFFFFFFFFFFFFFFF` is used and `0` +/// otherwise. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_sd) +// NOTE: Not modeled yet +// pub fn _mm_cmpord_sd(a: __m128d, b: __m128d) -> __m128d { +// { cmpsd(a, b, 7) } +// } +/// Returns a new vector with the low element of `a` replaced by the result of +/// comparing both of the lower elements of `a` and `b` to `NaN`. If either is +/// equal to `NaN` then `0xFFFFFFFFFFFFFFFF` is used and `0` otherwise. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_sd) +// NOTE: Not modeled yet +// pub fn _mm_cmpunord_sd(a: __m128d, b: __m128d) -> __m128d { +// { cmpsd(a, b, 3) } +// } +/// Returns a new vector with the low element of `a` replaced by the not-equal +/// comparison of the lower elements of `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_sd) +// NOTE: Not modeled yet +// pub fn _mm_cmpneq_sd(a: __m128d, b: __m128d) -> __m128d { +// { cmpsd(a, b, 4) } +// } +/// Returns a new vector with the low element of `a` replaced by the +/// not-less-than comparison of the lower elements of `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_sd) +// NOTE: Not modeled yet +// pub fn _mm_cmpnlt_sd(a: __m128d, b: __m128d) -> __m128d { +// { cmpsd(a, b, 5) } +// } +/// Returns a new vector with the low element of `a` replaced by the +/// not-less-than-or-equal comparison of the lower elements of `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_sd) +// NOTE: Not modeled yet +// pub fn _mm_cmpnle_sd(a: __m128d, b: __m128d) -> __m128d { +// { cmpsd(a, b, 6) } +// } +/// Returns a new vector with the low element of `a` replaced by the +/// not-greater-than comparison of the lower elements of `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_sd) +// NOTE: Not modeled yet +// pub fn _mm_cmpngt_sd(a: __m128d, b: __m128d) -> __m128d { +// { simd_insert(_mm_cmpnlt_sd(b, a), 1, simd_extract(a, 1)) } +// } +/// Returns a new vector with the low element of `a` replaced by the +/// not-greater-than-or-equal comparison of the lower elements of `a` and `b`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_sd) +// NOTE: Not modeled yet +// pub fn _mm_cmpnge_sd(a: __m128d, b: __m128d) -> __m128d { +// { simd_insert(_mm_cmpnle_sd(b, a), 1, simd_extract(a, 1)) } +// } +/// Compares corresponding elements in `a` and `b` for equality. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_pd) +// NOTE: Not modeled yet +// pub fn _mm_cmpeq_pd(a: __m128d, b: __m128d) -> __m128d { +// { cmppd(a, b, 0) } +// } +/// Compares corresponding elements in `a` and `b` for less-than. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_pd) +// NOTE: Not modeled yet +// pub fn _mm_cmplt_pd(a: __m128d, b: __m128d) -> __m128d { +// { cmppd(a, b, 1) } +// } +/// Compares corresponding elements in `a` and `b` for less-than-or-equal +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_pd) +// NOTE: Not modeled yet +// pub fn _mm_cmple_pd(a: __m128d, b: __m128d) -> __m128d { +// { cmppd(a, b, 2) } +// } +/// Compares corresponding elements in `a` and `b` for greater-than. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_pd) +// NOTE: Not modeled yet +// pub fn _mm_cmpgt_pd(a: __m128d, b: __m128d) -> __m128d { +// _mm_cmplt_pd(b, a) +// } +/// Compares corresponding elements in `a` and `b` for greater-than-or-equal. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_pd) +// NOTE: Not modeled yet +// pub fn _mm_cmpge_pd(a: __m128d, b: __m128d) -> __m128d { +// _mm_cmple_pd(b, a) +// } +/// Compares corresponding elements in `a` and `b` to see if neither is `NaN`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_pd) +// NOTE: Not modeled yet +// pub fn _mm_cmpord_pd(a: __m128d, b: __m128d) -> __m128d { +// { cmppd(a, b, 7) } +// } +/// Compares corresponding elements in `a` and `b` to see if either is `NaN`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_pd) +// NOTE: Not modeled yet +// pub fn _mm_cmpunord_pd(a: __m128d, b: __m128d) -> __m128d { +// { cmppd(a, b, 3) } +// } +/// Compares corresponding elements in `a` and `b` for not-equal. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_pd) +// NOTE: Not modeled yet +// pub fn _mm_cmpneq_pd(a: __m128d, b: __m128d) -> __m128d { +// { cmppd(a, b, 4) } +// } +/// Compares corresponding elements in `a` and `b` for not-less-than. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_pd) +// NOTE: Not modeled yet +// pub fn _mm_cmpnlt_pd(a: __m128d, b: __m128d) -> __m128d { +// { cmppd(a, b, 5) } +// } +/// Compares corresponding elements in `a` and `b` for not-less-than-or-equal. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_pd) +// NOTE: Not modeled yet +// pub fn _mm_cmpnle_pd(a: __m128d, b: __m128d) -> __m128d { +// { cmppd(a, b, 6) } +// } +/// Compares corresponding elements in `a` and `b` for not-greater-than. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_pd) +// NOTE: Not modeled yet +// pub fn _mm_cmpngt_pd(a: __m128d, b: __m128d) -> __m128d { +// _mm_cmpnlt_pd(b, a) +// } +/// Compares corresponding elements in `a` and `b` for +/// not-greater-than-or-equal. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_pd) +// NOTE: Not modeled yet +// pub fn _mm_cmpnge_pd(a: __m128d, b: __m128d) -> __m128d { +// _mm_cmpnle_pd(b, a) +// } +/// Compares the lower element of `a` and `b` for equality. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comieq_sd) +// NOTE: Not modeled yet +// pub fn _mm_comieq_sd(a: __m128d, b: __m128d) -> i32 { +// { comieqsd(a, b) } +// } +/// Compares the lower element of `a` and `b` for less-than. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comilt_sd) +// NOTE: Not modeled yet +// pub fn _mm_comilt_sd(a: __m128d, b: __m128d) -> i32 { +// { comiltsd(a, b) } +// } +/// Compares the lower element of `a` and `b` for less-than-or-equal. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comile_sd) +// NOTE: Not modeled yet +// pub fn _mm_comile_sd(a: __m128d, b: __m128d) -> i32 { +// { comilesd(a, b) } +// } +/// Compares the lower element of `a` and `b` for greater-than. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comigt_sd) +// NOTE: Not modeled yet +// pub fn _mm_comigt_sd(a: __m128d, b: __m128d) -> i32 { +// { comigtsd(a, b) } +// } +/// Compares the lower element of `a` and `b` for greater-than-or-equal. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comige_sd) +// NOTE: Not modeled yet +// pub fn _mm_comige_sd(a: __m128d, b: __m128d) -> i32 { +// { comigesd(a, b) } +// } +/// Compares the lower element of `a` and `b` for not-equal. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comineq_sd) +// NOTE: Not modeled yet +// pub fn _mm_comineq_sd(a: __m128d, b: __m128d) -> i32 { +// { comineqsd(a, b) } +// } +/// Compares the lower element of `a` and `b` for equality. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomieq_sd) +// NOTE: Not modeled yet +// pub fn _mm_ucomieq_sd(a: __m128d, b: __m128d) -> i32 { +// { ucomieqsd(a, b) } +// } +/// Compares the lower element of `a` and `b` for less-than. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomilt_sd) +// NOTE: Not modeled yet +// pub fn _mm_ucomilt_sd(a: __m128d, b: __m128d) -> i32 { +// { ucomiltsd(a, b) } +// } +/// Compares the lower element of `a` and `b` for less-than-or-equal. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomile_sd) +// NOTE: Not modeled yet +// pub fn _mm_ucomile_sd(a: __m128d, b: __m128d) -> i32 { +// { ucomilesd(a, b) } +// } +/// Compares the lower element of `a` and `b` for greater-than. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomigt_sd) +// NOTE: Not modeled yet +// pub fn _mm_ucomigt_sd(a: __m128d, b: __m128d) -> i32 { +// { ucomigtsd(a, b) } +// } +/// Compares the lower element of `a` and `b` for greater-than-or-equal. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomige_sd) +// NOTE: Not modeled yet +// pub fn _mm_ucomige_sd(a: __m128d, b: __m128d) -> i32 { +// { ucomigesd(a, b) } +// } +/// Compares the lower element of `a` and `b` for not-equal. 
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomineq_sd) +// NOTE: Not modeled yet +// pub fn _mm_ucomineq_sd(a: __m128d, b: __m128d) -> i32 { +// { ucomineqsd(a, b) } +// } +/// Converts packed double-precision (64-bit) floating-point elements in `a` to +/// packed single-precision (32-bit) floating-point elements. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_ps) +pub fn _mm_cvtpd_ps(a: __m128d) -> __m128 { + { + let r = simd_cast::<2, _, f32>(a.as_f64x2()); + let zero = f32x2::ZERO(); + transmute::<f32x4, _>(simd_shuffle(r, zero, [0, 1, 2, 3])) + } +} +/// Converts packed single-precision (32-bit) floating-point elements in `a` to +/// packed +/// double-precision (64-bit) floating-point elements. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pd) +pub fn _mm_cvtps_pd(a: __m128) -> __m128d { + { + let a = a.as_f32x4(); + transmute(simd_cast::<2, f32, f64>(simd_shuffle(a, a, [0, 1]))) + } +} +/// Converts packed double-precision (64-bit) floating-point elements in `a` to +/// packed 32-bit integers. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_epi32) +// NOTE: Not modeled yet +// pub fn _mm_cvtpd_epi32(a: __m128d) -> __m128i { +// { transmute(cvtpd2dq(a)) } +// } +/// Converts the lower double-precision (64-bit) floating-point element in a to +/// a 32-bit integer. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si32) +// NOTE: Not modeled yet +// pub fn _mm_cvtsd_si32(a: __m128d) -> i32 { +// { cvtsd2si(a) } +// } +/// Converts the lower double-precision (64-bit) floating-point element in `b` +/// to a single-precision (32-bit) floating-point element, store the result in +/// the lower element of the return value, and copies the upper element from `a` +/// to the upper element of the return value. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_ss) +// NOTE: Not modeled yet +// pub fn _mm_cvtsd_ss(a: __m128, b: __m128d) -> __m128 { +// { cvtsd2ss(a, b) } +// } +/// Returns the lower double-precision (64-bit) floating-point element of `a`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_f64) +// NOTE: Not modeled yet +// pub fn _mm_cvtsd_f64(a: __m128d) -> f64 { +// { simd_extract(a, 0) } +// } +/// Converts the lower single-precision (32-bit) floating-point element in `b` +/// to a double-precision (64-bit) floating-point element, store the result in +/// the lower element of the return value, and copies the upper element from `a` +/// to the upper element of the return value. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_sd) +// NOTE: Not modeled yet +// pub fn _mm_cvtss_sd(a: __m128d, b: __m128) -> __m128d { +// { cvtss2sd(a, b) } +// } +/// Converts packed double-precision (64-bit) floating-point elements in `a` to +/// packed 32-bit integers with truncation.
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttpd_epi32) +// NOTE: Not modeled yet +// pub fn _mm_cvttpd_epi32(a: __m128d) -> __m128i { +// { transmute(cvttpd2dq(a)) } +// } +/// Converts the lower double-precision (64-bit) floating-point element in `a` +/// to a 32-bit integer with truncation. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si32) +// NOTE: Not modeled yet +// pub fn _mm_cvttsd_si32(a: __m128d) -> i32 { +// { cvttsd2si(a) } +// } +/// Converts packed single-precision (32-bit) floating-point elements in `a` to +/// packed 32-bit integers with truncation. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttps_epi32) +// NOTE: Not modeled yet +// pub fn _mm_cvttps_epi32(a: __m128) -> __m128i { +// { transmute(cvttps2dq(a)) } +// } +/// Copies double-precision (64-bit) floating-point element `a` to the lower +/// element of the packed 64-bit return value. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_sd) +pub fn _mm_set_sd(a: f64) -> __m128d { + _mm_set_pd(0.0, a) +} +/// Broadcasts double-precision (64-bit) floating-point value a to all elements +/// of the return value. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_pd) +pub fn _mm_set1_pd(a: f64) -> __m128d { + _mm_set_pd(a, a) +} +/// Broadcasts double-precision (64-bit) floating-point value a to all elements +/// of the return value. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_pd1) +pub fn _mm_set_pd1(a: f64) -> __m128d { + _mm_set_pd(a, a) +} +/// Sets packed double-precision (64-bit) floating-point elements in the return +/// value with the supplied values. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_pd) +pub fn _mm_set_pd(a: f64, b: f64) -> __m128d { + transmute(f64x2::new(b, a)) +} +/// Sets packed double-precision (64-bit) floating-point elements in the return +/// value with the supplied values in reverse order. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_pd) +pub fn _mm_setr_pd(a: f64, b: f64) -> __m128d { + _mm_set_pd(b, a) +} +/// Returns packed double-precision (64-bit) floating-point elements with all +/// zeros. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_pd) +pub fn _mm_setzero_pd() -> __m128d { + transmute(f64x2::ZERO()) +} +/// Returns a mask of the most significant bit of each element in `a`. +/// +/// The mask is stored in the 2 least significant bits of the return value. +/// All other bits are set to `0`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_pd) +pub fn _mm_movemask_pd(a: __m128d) -> i32 { + { + let mask: i64x2 = simd_lt(transmute(a), i64x2::ZERO()); + simd_bitmask_little!(1, mask, u8) as i32 + } +} +/// Constructs a 128-bit floating-point vector of `[2 x double]` from two +/// 128-bit vector parameters of `[2 x double]`, using the immediate-value +/// parameter as a specifier. 
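// A scalar sketch of `_mm_movemask_pd` above: bit i of the result is the
// sign bit of lane i, which is what `simd_lt` against zero on the i64 bit
// pattern followed by `simd_bitmask_little!` extracts.
fn movemask_pd_reference(lanes: [f64; 2]) -> i32 {
    (lanes[0].is_sign_negative() as i32) | ((lanes[1].is_sign_negative() as i32) << 1)
}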
+/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pd) +pub fn _mm_shuffle_pd<const MASK: i32>(a: __m128d, b: __m128d) -> __m128d { + static_assert_uimm_bits!(MASK, 8); + transmute(simd_shuffle( + a.as_f64x2(), + b.as_f64x2(), + [MASK as u32 & 0b1, ((MASK as u32 >> 1) & 0b1) + 2], + )) +} +/// Constructs a 128-bit floating-point vector of `[2 x double]`. The lower +/// 64 bits are set to the lower 64 bits of the second parameter. The upper +/// 64 bits are set to the upper 64 bits of the first parameter. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_sd) +pub fn _mm_move_sd(a: __m128d, b: __m128d) -> __m128d { + _mm_setr_pd(simd_extract(b.as_f64x2(), 0), simd_extract(a.as_f64x2(), 1)) +} +/// Casts a 128-bit floating-point vector of `[2 x double]` into a 128-bit +/// floating-point vector of `[4 x float]`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castpd_ps) +pub fn _mm_castpd_ps(a: __m128d) -> __m128 { + transmute(a) +} +/// Casts a 128-bit floating-point vector of `[2 x double]` into a 128-bit +/// integer vector. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castpd_si128) +pub fn _mm_castpd_si128(a: __m128d) -> __m128i { + transmute(a) +} +/// Casts a 128-bit floating-point vector of `[4 x float]` into a 128-bit +/// floating-point vector of `[2 x double]`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castps_pd) +pub fn _mm_castps_pd(a: __m128) -> __m128d { + transmute(a) +} +/// Casts a 128-bit floating-point vector of `[4 x float]` into a 128-bit +/// integer vector. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castps_si128) +pub fn _mm_castps_si128(a: __m128) -> __m128i { + transmute(a) +} +/// Casts a 128-bit integer vector into a 128-bit floating-point vector +/// of `[2 x double]`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castsi128_pd) +pub fn _mm_castsi128_pd(a: __m128i) -> __m128d { + transmute(a) +} +/// Casts a 128-bit integer vector into a 128-bit floating-point vector +/// of `[4 x float]`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castsi128_ps) +pub fn _mm_castsi128_ps(a: __m128i) -> __m128 { + transmute(a) +} +/// Returns vector of type __m128d with indeterminate elements. +/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically +/// picks some valid value and is not equivalent to [`mem::MaybeUninit`]. +/// In practice, this is typically equivalent to [`mem::zeroed`]. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_pd) +pub fn _mm_undefined_pd() -> __m128d { + transmute(f32x4::ZERO()) } - /// Returns vector of type __m128i with indeterminate elements.with indetermination elements. /// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically -/// picks some valid value and is not equivalent to [`core::mem::MaybeUninit`]. -/// In practice, this is typically equivalent to [`core::mem::zeroed`].
+/// picks some valid value and is not equivalent to [`mem::MaybeUninit`].
+/// In practice, this is typically equivalent to [`mem::zeroed`].
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_si128)
-
pub fn _mm_undefined_si128() -> __m128i {
-    BitVec::from_fn(|_| Bit::Zero)
+    transmute(u32x4::ZERO())
+}
+/// The resulting `__m128d` element is composed by the high-order values of
+/// the two `__m128d` interleaved input elements, i.e.:
+///
+/// * The `[127:64]` bits are copied from the `[127:64]` bits of the second input
+/// * The `[63:0]` bits are copied from the `[127:64]` bits of the first input
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_pd)
+pub fn _mm_unpackhi_pd(a: __m128d, b: __m128d) -> __m128d {
+    transmute(simd_shuffle(a.as_f64x2(), b.as_f64x2(), [1, 3]))
+}
+/// The resulting `__m128d` element is composed by the low-order values of
+/// the two `__m128d` interleaved input elements, i.e.:
+///
+/// * The `[127:64]` bits are copied from the `[63:0]` bits of the second input
+/// * The `[63:0]` bits are copied from the `[63:0]` bits of the first input
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_pd)
+pub fn _mm_unpacklo_pd(a: __m128d, b: __m128d) -> __m128d {
+    transmute(simd_shuffle(a.as_f64x2(), b.as_f64x2(), [0, 2]))
+}
diff --git a/testable-simd-models/src/core_arch/x86/models/sse2_handwritten.rs b/testable-simd-models/src/core_arch/x86/models/sse2_handwritten.rs
new file mode 100644
index 0000000000000..217298286968c
--- /dev/null
+++ b/testable-simd-models/src/core_arch/x86/models/sse2_handwritten.rs
@@ -0,0 +1,196 @@
+use crate::abstractions::{bit::MachineInteger, simd::*};
+pub fn packsswb(a: i16x8, b: i16x8) -> i8x16 {
+    i8x16::from_fn(|i| {
+        if i < 8 {
+            if a[i] > (i8::MAX as i16) {
+                i8::MAX
+            } else if a[i] < (i8::MIN as i16) {
+                i8::MIN
+            } else {
+                a[i] as i8
+            }
+        } else {
+            if b[i - 8] > (i8::MAX as i16) {
+                i8::MAX
+            } else if b[i - 8] < (i8::MIN as i16) {
+                i8::MIN
+            } else {
+                b[i - 8] as i8
+            }
+        }
+    })
+}
+pub fn pmaddwd(a: i16x8, b: i16x8) -> i32x4 {
+    i32x4::from_fn(|i| {
+        (a[2 * i] as i32) * (b[2 * i] as i32) + (a[2 * i + 1] as i32) * (b[2 * i + 1] as i32)
+    })
+}
+pub fn psadbw(a: u8x16, b: u8x16) -> u64x2 {
+    let tmp = u8x16::from_fn(|i| a[i].wrapping_abs_diff(b[i]));
+    u64x2::from_fn(|i| {
+        (tmp[i * 8] as u16)
+            .wrapping_add(tmp[i * 8 + 1] as u16)
+            .wrapping_add(tmp[i * 8 + 2] as u16)
+            .wrapping_add(tmp[i * 8 + 3] as u16)
+            .wrapping_add(tmp[i * 8 + 4] as u16)
+            .wrapping_add(tmp[i * 8 + 5] as u16)
+            .wrapping_add(tmp[i * 8 + 6] as u16)
+            .wrapping_add(tmp[i * 8 + 7] as u16) as u64
+    })
+}
+pub fn psllw(a: i16x8, count: i16x8) -> i16x8 {
+    // The 64-bit shift count is assembled from the four low 16-bit lanes of
+    // `count`; the multipliers are 2^16, 2^32 and 2^48.
+    let count4: u64 = (count[0] as u16) as u64;
+    let count3: u64 = ((count[1] as u16) as u64) * 65536;
+    let count2: u64 = ((count[2] as u16) as u64) * 4294967296;
+    let count1: u64 = ((count[3] as u16) as u64) * 281474976710656;
+    let count = count1 + count2 + count3 + count4;
+    i16x8::from_fn(|i| {
+        if count > 15 {
+            0
+        } else {
+            ((a[i] as u16) << count) as i16
+        }
+    })
+}
+
+pub fn pslld(a: i32x4, count: i32x4) -> i32x4 {
+    let count: u64 = ((count[1] as u32) as u64) * 4294967296 + ((count[0] as u32) as u64);
+
+    i32x4::from_fn(|i| {
+        if count > 31 {
+            0
+        } else {
+            ((a[i] as u32) << count) as i32
+        }
+    })
+}
+
+pub fn psllq(a: i64x2, count: i64x2) -> i64x2 {
+    let count: u64
= count[0] as u64;
+
+    i64x2::from_fn(|i| {
+        if count > 63 {
+            0
+        } else {
+            ((a[i] as u64) << count) as i64
+        }
+    })
+}
+
+pub fn psraw(a: i16x8, count: i16x8) -> i16x8 {
+    let count: u64 = ((count[3] as u16) as u64) * 281474976710656
+        + ((count[2] as u16) as u64) * 4294967296
+        + ((count[1] as u16) as u64) * 65536
+        + ((count[0] as u16) as u64);
+
+    i16x8::from_fn(|i| {
+        if count > 15 {
+            if a[i] < 0 {
+                -1
+            } else {
+                0
+            }
+        } else {
+            a[i] >> count
+        }
+    })
+}
+
+pub fn psrad(a: i32x4, count: i32x4) -> i32x4 {
+    let count: u64 = ((count[1] as u32) as u64) * 4294967296 + ((count[0] as u32) as u64);
+
+    i32x4::from_fn(|i| {
+        if count > 31 {
+            if a[i] < 0 {
+                -1
+            } else {
+                0
+            }
+        } else {
+            // Arithmetic right shift: the sign bit fills in from the left.
+            a[i] >> count
+        }
+    })
+}
+
+pub fn psrlw(a: i16x8, count: i16x8) -> i16x8 {
+    let count: u64 = (count[3] as u16 as u64) * 281474976710656
+        + (count[2] as u16 as u64) * 4294967296
+        + (count[1] as u16 as u64) * 65536
+        + (count[0] as u16 as u64);
+
+    i16x8::from_fn(|i| {
+        if count > 15 {
+            0
+        } else {
+            ((a[i] as u16) >> count) as i16
+        }
+    })
+}
+
+pub fn psrld(a: i32x4, count: i32x4) -> i32x4 {
+    let count: u64 = (count[1] as u32 as u64) * 4294967296 + (count[0] as u32 as u64);
+
+    i32x4::from_fn(|i| {
+        if count > 31 {
+            0
+        } else {
+            ((a[i] as u32) >> count) as i32
+        }
+    })
+}
+
+pub fn psrlq(a: i64x2, count: i64x2) -> i64x2 {
+    let count: u64 = count[0] as u64;
+
+    i64x2::from_fn(|i| {
+        if count > 63 {
+            0
+        } else {
+            ((a[i] as u64) >> count) as i64
+        }
+    })
+}
+
+pub fn packssdw(a: i32x4, b: i32x4) -> i16x8 {
+    i16x8::from_fn(|i| {
+        if i < 4 {
+            if a[i] > (i16::MAX as i32) {
+                i16::MAX
+            } else if a[i] < (i16::MIN as i32) {
+                i16::MIN
+            } else {
+                a[i] as i16
+            }
+        } else {
+            if b[i - 4] > (i16::MAX as i32) {
+                i16::MAX
+            } else if b[i - 4] < (i16::MIN as i32) {
+                i16::MIN
+            } else {
+                b[i - 4] as i16
+            }
+        }
+    })
+}
+
+pub fn packuswb(a: i16x8, b: i16x8) -> u8x16 {
+    u8x16::from_fn(|i| {
+        if i < 8 {
+            if a[i] > (u8::MAX as i16) {
+                u8::MAX
+            } else if a[i] < (u8::MIN as i16) {
+                u8::MIN
+            } else {
+                a[i] as u8
+            }
+        } else {
+            if b[i - 8] > (u8::MAX as i16) {
+                u8::MAX
+            } else if b[i - 8] < (u8::MIN as i16) {
+                u8::MIN
+            } else {
+                b[i - 8] as u8
+            }
+        }
+    })
+}
diff --git a/testable-simd-models/src/core_arch/x86/models/ssse3.rs b/testable-simd-models/src/core_arch/x86/models/ssse3.rs
index 8d0488430756c..665e83460fca6 100644
--- a/testable-simd-models/src/core_arch/x86/models/ssse3.rs
+++ b/testable-simd-models/src/core_arch/x86/models/ssse3.rs
@@ -1,176 +1,49 @@
-//!
Supplemental Streaming SIMD Extensions 3 (SSSE3) +use crate::abstractions::simd::*; +use crate::abstractions::utilities::*; -use crate::abstractions::{bitvec::BitVec, simd::*}; - +use super::sse2::*; +use super::ssse3_handwritten::*; use super::types::*; -mod c_extern { - use crate::abstractions::simd::*; - pub fn pshufb128(a: u8x16, b: u8x16) -> u8x16 { - u8x16::from_fn(|i| if b[i] > 127 { 0 } else { a[(b[i] % 16) as u64] }) - } - - pub fn phaddw128(a: i16x8, b: i16x8) -> i16x8 { - i16x8::from_fn(|i| { - if i < 4 { - a[2 * i].wrapping_add(a[2 * i + 1]) - } else { - b[2 * (i - 4)].wrapping_add(b[2 * (i - 4) + 1]) - } - }) - } - - pub fn phaddsw128(a: i16x8, b: i16x8) -> i16x8 { - i16x8::from_fn(|i| { - if i < 4 { - a[2 * i].saturating_add(a[2 * i + 1]) - } else { - b[2 * (i - 4)].saturating_add(b[2 * (i - 4) + 1]) - } - }) - } - - pub fn phaddd128(a: i32x4, b: i32x4) -> i32x4 { - i32x4::from_fn(|i| { - if i < 2 { - a[2 * i].wrapping_add(a[2 * i + 1]) - } else { - b[2 * (i - 2)].wrapping_add(b[2 * (i - 2) + 1]) - } - }) - } - - pub fn phsubw128(a: i16x8, b: i16x8) -> i16x8 { - i16x8::from_fn(|i| { - if i < 4 { - a[2 * i].wrapping_sub(a[2 * i + 1]) - } else { - b[2 * (i - 4)].wrapping_sub(b[2 * (i - 4) + 1]) - } - }) - } - - pub fn phsubsw128(a: i16x8, b: i16x8) -> i16x8 { - i16x8::from_fn(|i| { - if i < 4 { - a[2 * i].saturating_sub(a[2 * i + 1]) - } else { - b[2 * (i - 4)].saturating_sub(b[2 * (i - 4) + 1]) - } - }) - } - - pub fn phsubd128(a: i32x4, b: i32x4) -> i32x4 { - i32x4::from_fn(|i| { - if i < 2 { - a[2 * i].wrapping_sub(a[2 * i + 1]) - } else { - b[2 * (i - 2)].wrapping_sub(b[2 * (i - 2) + 1]) - } - }) - } - - pub fn pmaddubsw128(a: u8x16, b: i8x16) -> i16x8 { - i16x8::from_fn(|i| { - ((a[2 * i] as u8 as u16 as i16) * (b[2 * i] as i8 as i16)) - .saturating_add((a[2 * i + 1] as u8 as u16 as i16) * (b[2 * i + 1] as i8 as i16)) - }) - } - - pub fn pmulhrsw128(a: i16x8, b: i16x8) -> i16x8 { - i16x8::from_fn(|i| { - let temp = (a[i] as i32) * (b[i] as i32); - let temp = (temp >> 14).wrapping_add(1) >> 1; - temp as i16 - }) - } - - pub fn psignb128(a: i8x16, b: i8x16) -> i8x16 { - i8x16::from_fn(|i| { - if b[i] < 0 { - if a[i] == i8::MIN { - a[i] - } else { - -a[i] - } - } else if b[i] > 0 { - a[i] - } else { - 0 - } - }) - } - - pub fn psignw128(a: i16x8, b: i16x8) -> i16x8 { - i16x8::from_fn(|i| { - if b[i] < 0 { - if a[i] == i16::MIN { - a[i] - } else { - -a[i] - } - } else if b[i] > 0 { - a[i] - } else { - 0 - } - }) - } - - pub fn psignd128(a: i32x4, b: i32x4) -> i32x4 { - i32x4::from_fn(|i| { - if b[i] < 0 { - if a[i] == i32::MIN { - a[i] - } else { - -a[i] - } - } else if b[i] > 0 { - a[i] - } else { - 0 - } - }) - } -} - -use super::sse2::*; -use c_extern::*; /// Computes the absolute value of packed 8-bit signed integers in `a` and /// return the unsigned results. 
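+/// Note that `i8::MIN` has no positive counterpart in `i8`: the hardware
+/// `pabsb` instruction yields `0x80` (`128` read as unsigned) for it, and the
+/// wrapping negation performed by `simd_neg` in the model below reproduces
+/// that behavior.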
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi8)
pub fn _mm_abs_epi8(a: __m128i) -> __m128i {
-    let a = BitVec::to_i8x16(a);
-    let zero = i8x16::from_fn(|_| 0);
-    let r = simd_select(simd_lt(a, zero), simd_neg(a), a);
-    BitVec::from_i8x16(r)
+    {
+        let a = a.as_i8x16();
+        let zero = i8x16::ZERO();
+        let r = simd_select(simd_lt(a, zero), simd_neg(a), a);
+        transmute(r)
+    }
}
-
/// Computes the absolute value of each of the packed 16-bit signed integers in
/// `a` and
/// return the 16-bit unsigned integer
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi16)
pub fn _mm_abs_epi16(a: __m128i) -> __m128i {
-    let a = BitVec::to_i16x8(a);
-    let zero = i16x8::from_fn(|_| 0);
-    let r = simd_select(simd_lt(a, zero), simd_neg(a), a);
-    BitVec::from_i16x8(r)
+    {
+        let a = a.as_i16x8();
+        let zero = i16x8::ZERO();
+        let r = simd_select(simd_lt(a, zero), simd_neg(a), a);
+        transmute(r)
+    }
}
-
/// Computes the absolute value of each of the packed 32-bit signed integers in
/// `a` and
/// return the 32-bit unsigned integer
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi32)
pub fn _mm_abs_epi32(a: __m128i) -> __m128i {
-    let a = BitVec::to_i32x4(a);
-    let zero = i32x4::from_fn(|_| 0);
-    let r = simd_select(simd_lt(a, zero), simd_neg(a), a);
-    BitVec::from_i32x4(r)
+    {
+        let a = a.as_i32x4();
+        let zero = i32x4::ZERO();
+        let r = simd_select(simd_lt(a, zero), simd_neg(a), a);
+        transmute(r)
+    }
}
-
/// Shuffles bytes from `a` according to the content of `b`.
///
/// The last 4 bits of each byte of `b` are used as addresses
@@ -198,31 +71,26 @@ pub fn _mm_abs_epi32(a: __m128i) -> __m128i {
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_epi8)
pub fn _mm_shuffle_epi8(a: __m128i, b: __m128i) -> __m128i {
-    BitVec::from_u8x16(pshufb128(BitVec::to_u8x16(a), BitVec::to_u8x16(b)))
+    {
+        transmute(pshufb128(a.as_u8x16(), b.as_u8x16()))
+    }
}
-
/// Concatenate 16-byte blocks in `a` and `b` into a 32-byte temporary result,
/// shift the result right by `n` bytes, and returns the low 16 bytes.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_alignr_epi8)
-
pub fn _mm_alignr_epi8<const IMM8: i32>(a: __m128i, b: __m128i) -> __m128i {
-    // TODO static_assert_uimm_bits!(IMM8, 8);
-    // If palignr is shifting the pair of vectors more than the size of two
-    // lanes, emit zero.
+    static_assert_uimm_bits!(IMM8, 8);
    if IMM8 > 32 {
        return _mm_setzero_si128();
    }
-    // If palignr is shifting the pair of input vectors more than one lane,
-    // but less than two lanes, convert to shifting in zeroes.
    let (a, b) = if IMM8 > 16 {
        (_mm_setzero_si128(), a)
    } else {
        (a, b)
    };
-    const fn mask(shift: u64, i: u64) -> u64 {
+    const fn mask(shift: u32, i: u32) -> u32 {
        if shift > 32 {
-            // Unused, but needs to be a valid index.
i } else if shift > 16 { shift - 16 + i @@ -230,89 +98,89 @@ pub fn _mm_alignr_epi8(a: __m128i, b: __m128i) -> __m128i { shift + i } } - - let r: i8x16 = simd_shuffle( - BitVec::to_i8x16(b), - BitVec::to_i8x16(a), - [ - mask(IMM8 as u64, 0), - mask(IMM8 as u64, 1), - mask(IMM8 as u64, 2), - mask(IMM8 as u64, 3), - mask(IMM8 as u64, 4), - mask(IMM8 as u64, 5), - mask(IMM8 as u64, 6), - mask(IMM8 as u64, 7), - mask(IMM8 as u64, 8), - mask(IMM8 as u64, 9), - mask(IMM8 as u64, 10), - mask(IMM8 as u64, 11), - mask(IMM8 as u64, 12), - mask(IMM8 as u64, 13), - mask(IMM8 as u64, 14), - mask(IMM8 as u64, 15), - ], - ); - r.into() + { + let r: i8x16 = simd_shuffle( + b.as_i8x16(), + a.as_i8x16(), + [ + mask(IMM8 as u32, 0), + mask(IMM8 as u32, 1), + mask(IMM8 as u32, 2), + mask(IMM8 as u32, 3), + mask(IMM8 as u32, 4), + mask(IMM8 as u32, 5), + mask(IMM8 as u32, 6), + mask(IMM8 as u32, 7), + mask(IMM8 as u32, 8), + mask(IMM8 as u32, 9), + mask(IMM8 as u32, 10), + mask(IMM8 as u32, 11), + mask(IMM8 as u32, 12), + mask(IMM8 as u32, 13), + mask(IMM8 as u32, 14), + mask(IMM8 as u32, 15), + ], + ); + transmute(r) + } } - /// Horizontally adds the adjacent pairs of values contained in 2 packed /// 128-bit vectors of `[8 x i16]`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_epi16) - pub fn _mm_hadd_epi16(a: __m128i, b: __m128i) -> __m128i { - phaddw128(BitVec::to_i16x8(a), BitVec::to_i16x8(b)).into() + { + transmute(phaddw128(a.as_i16x8(), b.as_i16x8())) + } } - /// Horizontally adds the adjacent pairs of values contained in 2 packed /// 128-bit vectors of `[8 x i16]`. Positive sums greater than 7FFFh are /// saturated to 7FFFh. Negative sums less than 8000h are saturated to 8000h. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadds_epi16) - pub fn _mm_hadds_epi16(a: __m128i, b: __m128i) -> __m128i { - phaddsw128(BitVec::to_i16x8(a), BitVec::to_i16x8(b)).into() + { + transmute(phaddsw128(a.as_i16x8(), b.as_i16x8())) + } } - /// Horizontally adds the adjacent pairs of values contained in 2 packed /// 128-bit vectors of `[4 x i32]`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_epi32) - pub fn _mm_hadd_epi32(a: __m128i, b: __m128i) -> __m128i { - phaddd128(BitVec::to_i32x4(a), BitVec::to_i32x4(b)).into() + { + transmute(phaddd128(a.as_i32x4(), b.as_i32x4())) + } } - /// Horizontally subtract the adjacent pairs of values contained in 2 /// packed 128-bit vectors of `[8 x i16]`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_epi16) - pub fn _mm_hsub_epi16(a: __m128i, b: __m128i) -> __m128i { - phsubw128(BitVec::to_i16x8(a), BitVec::to_i16x8(b)).into() + { + transmute(phsubw128(a.as_i16x8(), b.as_i16x8())) + } } - /// Horizontally subtract the adjacent pairs of values contained in 2 /// packed 128-bit vectors of `[8 x i16]`. Positive differences greater than /// 7FFFh are saturated to 7FFFh. Negative differences less than 8000h are /// saturated to 8000h. 
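+/// For example, lane `0` of the result is `a[0].saturating_sub(a[1])`, so with
+/// `a[0] = i16::MIN` and `a[1] = 1` the difference saturates to 8000h (`i16::MIN`).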
/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsubs_epi16) - pub fn _mm_hsubs_epi16(a: __m128i, b: __m128i) -> __m128i { - phsubsw128(BitVec::to_i16x8(a), BitVec::to_i16x8(b)).into() + { + transmute(phsubsw128(a.as_i16x8(), b.as_i16x8())) + } } - /// Horizontally subtract the adjacent pairs of values contained in 2 /// packed 128-bit vectors of `[4 x i32]`. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_epi32) - pub fn _mm_hsub_epi32(a: __m128i, b: __m128i) -> __m128i { - phsubd128(BitVec::to_i32x4(a), BitVec::to_i32x4(b)).into() + { + transmute(phsubd128(a.as_i32x4(), b.as_i32x4())) + } } - /// Multiplies corresponding pairs of packed 8-bit unsigned integer /// values contained in the first source operand and packed 8-bit signed /// integer values contained in the second source operand, add pairs of @@ -320,50 +188,51 @@ pub fn _mm_hsub_epi32(a: __m128i, b: __m128i) -> __m128i { /// the corresponding bits in the destination. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maddubs_epi16) - pub fn _mm_maddubs_epi16(a: __m128i, b: __m128i) -> __m128i { - pmaddubsw128(BitVec::to_u8x16(a), BitVec::to_i8x16(b)).into() + { + transmute(pmaddubsw128(a.as_u8x16(), b.as_i8x16())) + } } - /// Multiplies packed 16-bit signed integer values, truncate the 32-bit /// product to the 18 most significant bits by right-shifting, round the /// truncated value by adding 1, and write bits `[16:1]` to the destination. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhrs_epi16) - pub fn _mm_mulhrs_epi16(a: __m128i, b: __m128i) -> __m128i { - pmulhrsw128(BitVec::to_i16x8(a), BitVec::to_i16x8(b)).into() + { + transmute(pmulhrsw128(a.as_i16x8(), b.as_i16x8())) + } } - /// Negates packed 8-bit integers in `a` when the corresponding signed 8-bit /// integer in `b` is negative, and returns the result. /// Elements in result are zeroed out when the corresponding element in `b` /// is zero. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_epi8) - pub fn _mm_sign_epi8(a: __m128i, b: __m128i) -> __m128i { - psignb128(BitVec::to_i8x16(a), BitVec::to_i8x16(b)).into() + { + transmute(psignb128(a.as_i8x16(), b.as_i8x16())) + } } - /// Negates packed 16-bit integers in `a` when the corresponding signed 16-bit /// integer in `b` is negative, and returns the results. /// Elements in result are zeroed out when the corresponding element in `b` /// is zero. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_epi16) - pub fn _mm_sign_epi16(a: __m128i, b: __m128i) -> __m128i { - psignw128(BitVec::to_i16x8(a), BitVec::to_i16x8(b)).into() + { + transmute(psignw128(a.as_i16x8(), b.as_i16x8())) + } } - /// Negates packed 32-bit integers in `a` when the corresponding signed 32-bit /// integer in `b` is negative, and returns the results. /// Element in result are zeroed out when the corresponding element in `b` /// is zero. 
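+/// For example, a negative `b[i]` yields `-a[i]` (wrapping at `i32::MIN`), a
+/// zero `b[i]` yields `0`, and a positive `b[i]` passes `a[i]` through
+/// unchanged, as in the `psignd128` model below.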
/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_epi32) - pub fn _mm_sign_epi32(a: __m128i, b: __m128i) -> __m128i { - psignd128(BitVec::to_i32x4(a), BitVec::to_i32x4(b)).into() + { + transmute(psignd128(a.as_i32x4(), b.as_i32x4())) + } } diff --git a/testable-simd-models/src/core_arch/x86/models/ssse3_handwritten.rs b/testable-simd-models/src/core_arch/x86/models/ssse3_handwritten.rs new file mode 100644 index 0000000000000..4e911a83fb457 --- /dev/null +++ b/testable-simd-models/src/core_arch/x86/models/ssse3_handwritten.rs @@ -0,0 +1,127 @@ +use crate::abstractions::simd::*; +pub fn pshufb128(a: u8x16, b: u8x16) -> u8x16 { + u8x16::from_fn(|i| if b[i] > 127 { 0 } else { a[(b[i] % 16) as u32] }) +} + +pub fn phaddw128(a: i16x8, b: i16x8) -> i16x8 { + i16x8::from_fn(|i| { + if i < 4 { + a[2 * i].wrapping_add(a[2 * i + 1]) + } else { + b[2 * (i - 4)].wrapping_add(b[2 * (i - 4) + 1]) + } + }) +} + +pub fn phaddsw128(a: i16x8, b: i16x8) -> i16x8 { + i16x8::from_fn(|i| { + if i < 4 { + a[2 * i].saturating_add(a[2 * i + 1]) + } else { + b[2 * (i - 4)].saturating_add(b[2 * (i - 4) + 1]) + } + }) +} + +pub fn phaddd128(a: i32x4, b: i32x4) -> i32x4 { + i32x4::from_fn(|i| { + if i < 2 { + a[2 * i].wrapping_add(a[2 * i + 1]) + } else { + b[2 * (i - 2)].wrapping_add(b[2 * (i - 2) + 1]) + } + }) +} + +pub fn phsubw128(a: i16x8, b: i16x8) -> i16x8 { + i16x8::from_fn(|i| { + if i < 4 { + a[2 * i].wrapping_sub(a[2 * i + 1]) + } else { + b[2 * (i - 4)].wrapping_sub(b[2 * (i - 4) + 1]) + } + }) +} + +pub fn phsubsw128(a: i16x8, b: i16x8) -> i16x8 { + i16x8::from_fn(|i| { + if i < 4 { + a[2 * i].saturating_sub(a[2 * i + 1]) + } else { + b[2 * (i - 4)].saturating_sub(b[2 * (i - 4) + 1]) + } + }) +} + +pub fn phsubd128(a: i32x4, b: i32x4) -> i32x4 { + i32x4::from_fn(|i| { + if i < 2 { + a[2 * i].wrapping_sub(a[2 * i + 1]) + } else { + b[2 * (i - 2)].wrapping_sub(b[2 * (i - 2) + 1]) + } + }) +} + +pub fn pmaddubsw128(a: u8x16, b: i8x16) -> i16x8 { + i16x8::from_fn(|i| { + ((a[2 * i] as u8 as u16 as i16) * (b[2 * i] as i8 as i16)) + .saturating_add((a[2 * i + 1] as u8 as u16 as i16) * (b[2 * i + 1] as i8 as i16)) + }) +} + +pub fn pmulhrsw128(a: i16x8, b: i16x8) -> i16x8 { + i16x8::from_fn(|i| { + let temp = (a[i] as i32) * (b[i] as i32); + let temp = (temp >> 14).wrapping_add(1) >> 1; + temp as i16 + }) +} + +pub fn psignb128(a: i8x16, b: i8x16) -> i8x16 { + i8x16::from_fn(|i| { + if b[i] < 0 { + if a[i] == i8::MIN { + a[i] + } else { + -a[i] + } + } else if b[i] > 0 { + a[i] + } else { + 0 + } + }) +} + +pub fn psignw128(a: i16x8, b: i16x8) -> i16x8 { + i16x8::from_fn(|i| { + if b[i] < 0 { + if a[i] == i16::MIN { + a[i] + } else { + -a[i] + } + } else if b[i] > 0 { + a[i] + } else { + 0 + } + }) +} + +pub fn psignd128(a: i32x4, b: i32x4) -> i32x4 { + i32x4::from_fn(|i| { + if b[i] < 0 { + if a[i] == i32::MIN { + a[i] + } else { + -a[i] + } + } else if b[i] > 0 { + a[i] + } else { + 0 + } + }) +} diff --git a/testable-simd-models/src/core_arch/x86/tests/avx.rs b/testable-simd-models/src/core_arch/x86/tests/avx.rs index 4ffa0dc139b9d..02b1d81173ad0 100644 --- a/testable-simd-models/src/core_arch/x86/tests/avx.rs +++ b/testable-simd-models/src/core_arch/x86/tests/avx.rs @@ -3,6 +3,12 @@ use super::upstream; use crate::abstractions::bitvec::BitVec; use crate::helpers::test::HasRandom; +macro_rules! 
assert_feq { + ($lhs:expr, $rhs:expr) => { + assert!(($lhs.is_nan() && $rhs.is_nan()) || $lhs == $rhs) + }; +} + /// Derives tests for a given intrinsics. Test that a given intrinsics and its model compute the same thing over random values (1000 by default). macro_rules! mk { ($([$N:literal])?$name:ident$({$(<$($c:literal),*>),*})?($($x:ident : $ty:ident),*)) => { @@ -48,6 +54,19 @@ fn _mm256_movemask_ps() { } } +#[test] +fn _mm256_movemask_pd() { + let n = 1000; + + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx::_mm256_movemask_pd(a.into()), + unsafe { upstream::_mm256_movemask_pd(a.into()) } + ); + } +} + #[test] fn _mm256_testz_si256() { let n = 1000; @@ -62,6 +81,59 @@ fn _mm256_testz_si256() { } } +#[test] +fn _mm256_testc_si256() { + let n = 1000; + + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + let b: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx::_mm256_testc_si256(a.into(), b.into()), + unsafe { upstream::_mm256_testc_si256(a.into(), b.into()) } + ); + } +} + +#[test] +fn _mm256_cvtsd_f64() { + let n = 1000; + + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_feq!( + super::super::models::avx::_mm256_cvtsd_f64(a.into()), + unsafe { upstream::_mm256_cvtsd_f64(a.into()) } + ); + } +} + +#[test] +fn _mm256_cvtsi256_si32() { + let n = 1000; + + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_eq!( + super::super::models::avx::_mm256_cvtsi256_si32(a.into()), + unsafe { upstream::_mm256_cvtsi256_si32(a.into()) } + ); + } +} + +#[test] +fn _mm256_cvtss_f32() { + let n = 1000; + + for _ in 0..n { + let a: BitVec<256> = BitVec::random(); + assert_feq!( + super::super::models::avx::_mm256_cvtss_f32(a.into()), + unsafe { upstream::_mm256_cvtss_f32(a.into()) } + ); + } +} + mk!(_mm256_setzero_ps()); mk!(_mm256_setzero_si256()); mk!(_mm256_set_epi8( @@ -130,3 +202,57 @@ mk!(_mm256_set_epi64x(a: i64, b: i64, c: i64, d: i64)); mk!(_mm256_set1_epi8(a: i8)); mk!(_mm256_set1_epi16(a: i16)); mk!(_mm256_set1_epi32(a: i32)); +mk!(_mm256_set1_epi64x(a: i64)); +mk!(_mm256_set_pd(a: f64, b: f64, c: f64, d: f64)); +mk!(_mm256_set_ps(a: f32, b: f32, c: f32, d: f32, e: f32, f: f32, g: f32, h: f32)); +mk!(_mm256_setr_pd(a: f64, b: f64, c: f64, d: f64)); +mk!(_mm256_setr_ps(a: f32, b: f32, c: f32, d: f32, e: f32, f: f32, g: f32, h: f32)); +mk!(_mm256_setr_epi64x(a: i64, b: i64, c: i64, d: i64)); +mk!(_mm256_set1_pd(a: f64)); +mk!(_mm256_set1_ps(a: f32)); + +mk!(_mm256_and_pd(a: __m256d, b: __m256d)); +mk!(_mm256_and_ps(a: __m256, b: __m256)); +mk!(_mm256_or_pd(a: __m256d, b: __m256d)); +mk!(_mm256_or_ps(a: __m256, b: __m256)); +mk!(_mm256_andnot_pd(a: __m256d, b: __m256d)); +mk!(_mm256_andnot_ps(a: __m256, b: __m256)); +mk!(_mm256_blendv_pd(a: __m256d, b: __m256d, c: __m256d)); +mk!(_mm256_xor_pd(a: __m256d, b: __m256d)); +mk!(_mm256_xor_ps(a: __m256, b: __m256)); +mk!(_mm256_cvtepi32_pd(a: __m128i)); +mk!(_mm256_cvtepi32_ps(a: __m256i)); +mk!(_mm256_cvtpd_ps(a: __m256d)); +mk!(_mm256_cvtps_pd(a: __m128)); +mk!(_mm256_movehdup_ps(a: __m256)); +mk!(_mm256_moveldup_ps(a: __m256)); +mk!(_mm256_movedup_pd(a: __m256d)); +mk!(_mm256_unpackhi_pd(a: __m256d, b: __m256d)); +mk!(_mm256_unpackhi_ps(a: __m256, b: __m256)); +mk!(_mm256_unpacklo_pd(a: __m256d, b: __m256d)); +mk!(_mm256_unpacklo_ps(a: __m256, b: __m256)); +mk!(_mm256_setzero_pd()); +mk!(_mm256_castpd_ps(a: __m256d)); +mk!(_mm256_castps_pd(a: __m256)); +mk!(_mm256_castps_si256(a: __m256)); +mk!(_mm256_castsi256_ps(a: __m256i)); 
+mk!(_mm256_castpd_si256(a: __m256d)); +mk!(_mm256_castsi256_pd(a: __m256i)); +mk!(_mm256_castps256_ps128(a: __m256)); +mk!(_mm256_castpd256_pd128(a: __m256d)); +mk!(_mm256_castsi256_si128(a: __m256i)); +mk!(_mm256_castps128_ps256(a: __m128)); +mk!(_mm256_castpd128_pd256(a: __m128d)); +mk!(_mm256_castsi128_si256(a: __m128i)); +mk!(_mm256_zextps128_ps256(a: __m128)); +mk!(_mm256_zextsi128_si256(a: __m128i)); +mk!(_mm256_zextpd128_pd256(a: __m128d)); +mk!(_mm256_undefined_ps()); +mk!(_mm256_undefined_pd()); +mk!(_mm256_undefined_si256()); +mk!(_mm256_set_m128(hi: __m128, lo: __m128)); +mk!(_mm256_set_m128d(hi: __m128d, lo: __m128d)); +mk!(_mm256_set_m128i(hi: __m128i, lo: __m128i)); +mk!(_mm256_setr_m128(lo: __m128, hi: __m128)); +mk!(_mm256_setr_m128d(lo: __m128d, hi: __m128d)); +mk!(_mm256_setr_m128i(lo: __m128i, hi: __m128i)); diff --git a/testable-simd-models/src/core_arch/x86/tests/avx2.rs b/testable-simd-models/src/core_arch/x86/tests/avx2.rs index a1b8378566403..dcabcbb58b1e0 100644 --- a/testable-simd-models/src/core_arch/x86/tests/avx2.rs +++ b/testable-simd-models/src/core_arch/x86/tests/avx2.rs @@ -1,3 +1,4 @@ +use super::types::*; use super::upstream; use crate::abstractions::bitvec::BitVec; use crate::helpers::test::HasRandom; @@ -80,7 +81,6 @@ mk!(_mm256_cvtepu8_epi64(a: BitVec)); mk!(_mm256_extracti128_si256{<0>,<1>}(a: BitVec)); mk!(_mm256_hadd_epi16(a: BitVec, b: BitVec)); mk!(_mm256_hadd_epi32(a: BitVec, b: BitVec)); -mk!(_mm256_hadds_epi16(a: BitVec, b: BitVec)); mk!(_mm256_hsub_epi16(a: BitVec, b: BitVec)); mk!(_mm256_hsub_epi32(a: BitVec, b: BitVec)); mk!(_mm256_hsubs_epi16(a: BitVec, b: BitVec)); @@ -182,6 +182,7 @@ mk!(_mm256_unpacklo_epi32(a: BitVec, b: BitVec)); mk!(_mm256_unpackhi_epi64(a: BitVec, b: BitVec)); mk!(_mm256_unpacklo_epi64(a: BitVec, b: BitVec)); mk!(_mm256_xor_si256(a: BitVec, b: BitVec)); + #[test] fn _mm256_extract_epi8() { let n = 100; @@ -529,3 +530,12 @@ fn _mm256_extract_epi16() { ); } } + +mk!(_mm256_and_si256(a: __m256i, b: __m256i)); +mk!(_mm256_andnot_si256(a: __m256i, b: __m256i)); +mk!(_mm256_avg_epu16(a: __m256i, b: __m256i)); +mk!(_mm256_avg_epu8(a: __m256i, b: __m256i)); +mk!(_mm_broadcastsd_pd(a: __m128d)); +mk!(_mm256_broadcastsd_pd(a: __m128d)); +mk!(_mm_broadcastss_ps(a: __m128)); +mk!(_mm256_broadcastss_ps(a: __m128)); diff --git a/testable-simd-models/src/core_arch/x86/tests/mod.rs b/testable-simd-models/src/core_arch/x86/tests/mod.rs index b5a0c3a449715..217ff55623dbf 100644 --- a/testable-simd-models/src/core_arch/x86/tests/mod.rs +++ b/testable-simd-models/src/core_arch/x86/tests/mod.rs @@ -14,9 +14,9 @@ //! //! For example, some valid invocations are //! -//! `mk!([100]_mm256_extracti128_si256{<0>,<1>}(a: BitVec));` -//! `mk!(_mm256_extracti128_si256{<0>,<1>}(a: BitVec));` -//! `mk!(_mm256_abs_epi16(a: BitVec));` +//! `mk!([100]_mm256_extracti128_si256{<0>,<1>}(a: __m256i));` +//! `mk!(_mm256_extracti128_si256{<0>,<1>}(a: __m256i));` +//! `mk!(_mm256_abs_epi16(a: __m256i));` //! //! The number of random tests is optional. If not provided, it is taken to be 1000 by default. //! The const values are necessary if the function has constant arguments, but should be discarded if not. 
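+//!
+//! For intuition, `mk!(_mm256_abs_epi16(a: __m256i));` expands to roughly the
+//! following test. This is a sketch only: the real expansion also threads
+//! through the optional iteration count and any const-generic control values.
+//!
+//! ```ignore
+//! #[test]
+//! fn _mm256_abs_epi16() {
+//!     for _ in 0..1000 {
+//!         let a: BitVec<256> = BitVec::random();
+//!         assert_eq!(
+//!             super::super::models::avx2::_mm256_abs_epi16(a.into()),
+//!             unsafe { upstream::_mm256_abs_epi16(a.into()) }.into(),
+//!             "Failed with input value: {:?}",
+//!             a
+//!         );
+//!     }
+//! }
+//! ```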
@@ -45,6 +45,12 @@ pub(crate) mod types {
     pub type __m256 = BitVec<256>;
     #[allow(non_camel_case_types)]
     pub type __m128i = BitVec<128>;
+    #[allow(non_camel_case_types)]
+    pub type __m256d = BitVec<256>;
+    #[allow(non_camel_case_types)]
+    pub type __m128 = BitVec<128>;
+    #[allow(non_camel_case_types)]
+    pub type __m128d = BitVec<128>;
 }
 
 pub(crate) mod upstream {
@@ -56,8 +62,10 @@
 
 mod conversions {
     use super::upstream::{
-        __m128i, __m256, __m256i, _mm256_castps_si256, _mm256_castsi256_ps, _mm256_loadu_si256,
-        _mm256_storeu_si256, _mm_loadu_si128, _mm_storeu_si128,
+        __m128, __m128d, __m128i, __m256, __m256d, __m256i, _mm256_castpd_si256,
+        _mm256_castps_si256, _mm256_castsi256_pd, _mm256_castsi256_ps, _mm256_loadu_si256,
+        _mm256_storeu_si256, _mm_castpd_si128, _mm_castps_si128, _mm_castsi128_pd,
+        _mm_castsi128_ps, _mm_loadu_si128, _mm_storeu_si128,
     };
 
     use super::BitVec;
@@ -81,6 +89,27 @@ mod conversions {
         }
     }
 
+    impl From<BitVec<128>> for __m128 {
+        fn from(bv: BitVec<128>) -> __m128 {
+            let slice: &[u8] = &bv.to_vec()[..];
+            unsafe { _mm_castsi128_ps(_mm_loadu_si128(slice.as_ptr() as *const __m128i)) }
+        }
+    }
+
+    impl From<BitVec<128>> for __m128d {
+        fn from(bv: BitVec<128>) -> __m128d {
+            let slice: &[u8] = &bv.to_vec()[..];
+            unsafe { _mm_castsi128_pd(_mm_loadu_si128(slice.as_ptr() as *const __m128i)) }
+        }
+    }
+
+    impl From<BitVec<256>> for __m256d {
+        fn from(bv: BitVec<256>) -> __m256d {
+            let bv: &[u8] = &bv.to_vec()[..];
+            unsafe { _mm256_castsi256_pd(_mm256_loadu_si256(bv.as_ptr() as *const _)) }
+        }
+    }
+
     impl From<__m256i> for BitVec<256> {
         fn from(vec: __m256i) -> BitVec<256> {
             let mut v = [0u8; 32];
@@ -101,6 +130,16 @@
         }
     }
 
+    impl From<__m256d> for BitVec<256> {
+        fn from(vec: __m256d) -> BitVec<256> {
+            let mut v = [0u8; 32];
+            unsafe {
+                _mm256_storeu_si256(v.as_mut_ptr() as *mut _, _mm256_castpd_si256(vec));
+            }
+            BitVec::from_slice(&v[..], 8)
+        }
+    }
+
     impl From<__m128i> for BitVec<128> {
         fn from(vec: __m128i) -> BitVec<128> {
             let mut v = [0u8; 16];
@@ -110,4 +149,24 @@
         }
     }
+
+    impl From<__m128> for BitVec<128> {
+        fn from(vec: __m128) -> BitVec<128> {
+            let mut v = [0u8; 16];
+            unsafe {
+                _mm_storeu_si128(v.as_mut_ptr() as *mut _, _mm_castps_si128(vec));
+            }
+            BitVec::from_slice(&v[..], 8)
+        }
+    }
+
+    impl From<__m128d> for BitVec<128> {
+        fn from(vec: __m128d) -> BitVec<128> {
+            let mut v = [0u8; 16];
+            unsafe {
+                _mm_storeu_si128(v.as_mut_ptr() as *mut _, _mm_castpd_si128(vec));
+            }
+            BitVec::from_slice(&v[..], 8)
+        }
+    }
 }
diff --git a/testable-simd-models/src/helpers.rs b/testable-simd-models/src/helpers.rs
index 6c5e84e2a8dbd..1a30bf251a877 100644
--- a/testable-simd-models/src/helpers.rs
+++ b/testable-simd-models/src/helpers.rs
@@ -33,18 +33,30 @@ pub mod test {
         }
     }
 
+    impl HasRandom for f32 {
+        fn random() -> Self {
+            u32::random() as f32
+        }
+    }
+
+    impl HasRandom for f64 {
+        fn random() -> Self {
+            u64::random() as f64
+        }
+    }
+
     impl HasRandom for Bit {
         fn random() -> Self {
            crate::abstractions::bit::Bit::from(bool::random())
         }
     }
 
-    impl HasRandom for BitVec {
         fn random() -> Self {
             Self::from_fn(|_| Bit::random())
         }
     }
 
-    impl HasRandom for FunArray {
         fn random() -> Self {
             FunArray::from_fn(|_| T::random())
         }
     }
diff --git a/testable-simd-models/src/lib.rs b/testable-simd-models/src/lib.rs
index fc76194526e20..13d6ba2e6e7cd 100644
--- a/testable-simd-models/src/lib.rs
+++ b/testable-simd-models/src/lib.rs
@@ -25,7 +25,7 @@ //!
By providing a readable, testable, well-specified version of `core`'s behavior, it serves as a foundation for //! proof assistants and other verification tools. -// This recursion limit is necessary for mk! macro sued for tests. +// This recursion limit is necessary for mk! macro used for tests. // We test functions with const generics, the macro generate a test per possible (const generic) control value. #![recursion_limit = "4096"] pub mod abstractions; diff --git a/testable-simd-models/test.sh b/testable-simd-models/test.sh deleted file mode 100755 index 8f521735122c3..0000000000000 --- a/testable-simd-models/test.sh +++ /dev/null @@ -1,2 +0,0 @@ -cross test --target aarch64-unknown-linux-gnu -cross test --target x86_64-unknown-linux-gnu From 13ef5d9fd182a63481c6c004becc9663fb9b627b Mon Sep 17 00:00:00 2001 From: Maxime Buyse Date: Tue, 19 Aug 2025 17:54:00 +0200 Subject: [PATCH 3/3] Add ci job for simd models tests, with logging of rng seed. --- .github/workflows/testable-simd-models.yml | 30 +++ testable-simd-models/Cargo.toml | 1 + .../src/abstractions/bitvec.rs | 10 - .../core_arch/x86/models/avx2_handwritten.rs | 1 - .../src/core_arch/x86/tests/avx.rs | 22 +- .../src/core_arch/x86/tests/avx2.rs | 196 +++++++++++++----- testable-simd-models/src/helpers.rs | 12 +- 7 files changed, 205 insertions(+), 67 deletions(-) create mode 100644 .github/workflows/testable-simd-models.yml diff --git a/.github/workflows/testable-simd-models.yml b/.github/workflows/testable-simd-models.yml new file mode 100644 index 0000000000000..02014bfbc9d77 --- /dev/null +++ b/.github/workflows/testable-simd-models.yml @@ -0,0 +1,30 @@ +# This workflow runs the tests for testable simd models. + +name: Testable simd models + +on: + workflow_dispatch: + merge_group: + pull_request: + branches: [ main ] + push: + paths: + - '.github/workflows/testable-simd-models.yml' + - 'testable-simd-models/**' + +defaults: + run: + shell: bash + +jobs: + testable-simd-models: + name: Test testable simd models + runs-on: ubuntu-latest + + steps: + - name: Checkout Repository + uses: actions/checkout@v4 + + - name: Run tests + run: cargo test -- --test-threads=1 --nocapture + \ No newline at end of file diff --git a/testable-simd-models/Cargo.toml b/testable-simd-models/Cargo.toml index 6e2116fec82e0..1a95f75c56851 100644 --- a/testable-simd-models/Cargo.toml +++ b/testable-simd-models/Cargo.toml @@ -11,6 +11,7 @@ readme = "README.md" [dependencies] rand = "0.9" pastey = "0.1.0" +lazy_static = "1.5.0" [lints.rust] unexpected_cfgs = { level = "warn" } diff --git a/testable-simd-models/src/abstractions/bitvec.rs b/testable-simd-models/src/abstractions/bitvec.rs index ac73749482e37..74e7141fc5266 100644 --- a/testable-simd-models/src/abstractions/bitvec.rs +++ b/testable-simd-models/src/abstractions/bitvec.rs @@ -106,16 +106,6 @@ impl BitVec { .map(int_from_bit_slice) .collect() } - - /// Generate a random BitVec. 
- pub fn rand() -> Self { - use rand::prelude::*; - let random_source: Vec<_> = { - let mut rng = rand::rng(); - (0..N).map(|_| rng.random::()).collect() - }; - Self::from_fn(|i| random_source[i as usize].into()) - } } impl BitVec { diff --git a/testable-simd-models/src/core_arch/x86/models/avx2_handwritten.rs b/testable-simd-models/src/core_arch/x86/models/avx2_handwritten.rs index 43f0a840b54bd..1183eb2524edf 100644 --- a/testable-simd-models/src/core_arch/x86/models/avx2_handwritten.rs +++ b/testable-simd-models/src/core_arch/x86/models/avx2_handwritten.rs @@ -419,7 +419,6 @@ pub fn psravd(a: i32x4, count: i32x4) -> i32x4 { } pub fn psravd256(a: i32x8, count: i32x8) -> i32x8 { - dbg!(a, count); i32x8::from_fn(|i| { if count[i] > 31 || count[i] < 0 { if a[i] < 0 { diff --git a/testable-simd-models/src/core_arch/x86/tests/avx.rs b/testable-simd-models/src/core_arch/x86/tests/avx.rs index 02b1d81173ad0..c6e8684d32ca5 100644 --- a/testable-simd-models/src/core_arch/x86/tests/avx.rs +++ b/testable-simd-models/src/core_arch/x86/tests/avx.rs @@ -49,7 +49,9 @@ fn _mm256_movemask_ps() { let a: BitVec<256> = BitVec::random(); assert_eq!( super::super::models::avx::_mm256_movemask_ps(a.into()), - unsafe { upstream::_mm256_movemask_ps(a.into()) } + unsafe { upstream::_mm256_movemask_ps(a.into()) }, + "Failed with input value: {:?}", + a ); } } @@ -62,7 +64,9 @@ fn _mm256_movemask_pd() { let a: BitVec<256> = BitVec::random(); assert_eq!( super::super::models::avx::_mm256_movemask_pd(a.into()), - unsafe { upstream::_mm256_movemask_pd(a.into()) } + unsafe { upstream::_mm256_movemask_pd(a.into()) }, + "Failed with input value: {:?}", + a ); } } @@ -76,7 +80,10 @@ fn _mm256_testz_si256() { let b: BitVec<256> = BitVec::random(); assert_eq!( super::super::models::avx::_mm256_testz_si256(a.into(), b.into()), - unsafe { upstream::_mm256_testz_si256(a.into(), b.into()) } + unsafe { upstream::_mm256_testz_si256(a.into(), b.into()) }, + "Failed with input values: {:?}, {:?}", + a, + b ); } } @@ -90,7 +97,10 @@ fn _mm256_testc_si256() { let b: BitVec<256> = BitVec::random(); assert_eq!( super::super::models::avx::_mm256_testc_si256(a.into(), b.into()), - unsafe { upstream::_mm256_testc_si256(a.into(), b.into()) } + unsafe { upstream::_mm256_testc_si256(a.into(), b.into()) }, + "Failed with input values: {:?}, {:?}", + a, + b ); } } @@ -116,7 +126,9 @@ fn _mm256_cvtsi256_si32() { let a: BitVec<256> = BitVec::random(); assert_eq!( super::super::models::avx::_mm256_cvtsi256_si32(a.into()), - unsafe { upstream::_mm256_cvtsi256_si32(a.into()) } + unsafe { upstream::_mm256_cvtsi256_si32(a.into()) }, + "Failed with input value: {:?}", + a ); } } diff --git a/testable-simd-models/src/core_arch/x86/tests/avx2.rs b/testable-simd-models/src/core_arch/x86/tests/avx2.rs index dcabcbb58b1e0..50ca3c63b4813 100644 --- a/testable-simd-models/src/core_arch/x86/tests/avx2.rs +++ b/testable-simd-models/src/core_arch/x86/tests/avx2.rs @@ -120,7 +120,9 @@ fn _mm256_movemask_epi8() { let a: BitVec<256> = BitVec::random(); assert_eq!( super::super::models::avx2::_mm256_movemask_epi8(a.into()), - unsafe { upstream::_mm256_movemask_epi8(a.into()) } + unsafe { upstream::_mm256_movemask_epi8(a.into()) }, + "Failed with input value: {:?}", + a ); } } @@ -191,224 +193,288 @@ fn _mm256_extract_epi8() { let a: BitVec<256> = BitVec::random(); assert_eq!( super::super::models::avx2::_mm256_extract_epi8::<0>(a.into()), - unsafe { upstream::_mm256_extract_epi8::<0>(a.into()) } + unsafe { upstream::_mm256_extract_epi8::<0>(a.into()) }, + "Failed 
with input value: {:?}", + a ); } for _ in 0..n { let a: BitVec<256> = BitVec::random(); assert_eq!( super::super::models::avx2::_mm256_extract_epi8::<1>(a.into()), - unsafe { upstream::_mm256_extract_epi8::<1>(a.into()) } + unsafe { upstream::_mm256_extract_epi8::<1>(a.into()) }, + "Failed with input value: {:?}", + a ); } for _ in 0..n { let a: BitVec<256> = BitVec::random(); assert_eq!( super::super::models::avx2::_mm256_extract_epi8::<2>(a.into()), - unsafe { upstream::_mm256_extract_epi8::<2>(a.into()) } + unsafe { upstream::_mm256_extract_epi8::<2>(a.into()) }, + "Failed with input value: {:?}", + a ); } for _ in 0..n { let a: BitVec<256> = BitVec::random(); assert_eq!( super::super::models::avx2::_mm256_extract_epi8::<3>(a.into()), - unsafe { upstream::_mm256_extract_epi8::<3>(a.into()) } + unsafe { upstream::_mm256_extract_epi8::<3>(a.into()) }, + "Failed with input value: {:?}", + a ); } for _ in 0..n { let a: BitVec<256> = BitVec::random(); assert_eq!( super::super::models::avx2::_mm256_extract_epi8::<4>(a.into()), - unsafe { upstream::_mm256_extract_epi8::<4>(a.into()) } + unsafe { upstream::_mm256_extract_epi8::<4>(a.into()) }, + "Failed with input value: {:?}", + a ); } for _ in 0..n { let a: BitVec<256> = BitVec::random(); assert_eq!( super::super::models::avx2::_mm256_extract_epi8::<5>(a.into()), - unsafe { upstream::_mm256_extract_epi8::<5>(a.into()) } + unsafe { upstream::_mm256_extract_epi8::<5>(a.into()) }, + "Failed with input value: {:?}", + a ); } for _ in 0..n { let a: BitVec<256> = BitVec::random(); assert_eq!( super::super::models::avx2::_mm256_extract_epi8::<6>(a.into()), - unsafe { upstream::_mm256_extract_epi8::<6>(a.into()) } + unsafe { upstream::_mm256_extract_epi8::<6>(a.into()) }, + "Failed with input value: {:?}", + a ); } for _ in 0..n { let a: BitVec<256> = BitVec::random(); assert_eq!( super::super::models::avx2::_mm256_extract_epi8::<7>(a.into()), - unsafe { upstream::_mm256_extract_epi8::<7>(a.into()) } + unsafe { upstream::_mm256_extract_epi8::<7>(a.into()) }, + "Failed with input value: {:?}", + a ); } for _ in 0..n { let a: BitVec<256> = BitVec::random(); assert_eq!( super::super::models::avx2::_mm256_extract_epi8::<8>(a.into()), - unsafe { upstream::_mm256_extract_epi8::<8>(a.into()) } + unsafe { upstream::_mm256_extract_epi8::<8>(a.into()) }, + "Failed with input value: {:?}", + a ); } for _ in 0..n { let a: BitVec<256> = BitVec::random(); assert_eq!( super::super::models::avx2::_mm256_extract_epi8::<9>(a.into()), - unsafe { upstream::_mm256_extract_epi8::<9>(a.into()) } + unsafe { upstream::_mm256_extract_epi8::<9>(a.into()) }, + "Failed with input value: {:?}", + a ); } for _ in 0..n { let a: BitVec<256> = BitVec::random(); assert_eq!( super::super::models::avx2::_mm256_extract_epi8::<10>(a.into()), - unsafe { upstream::_mm256_extract_epi8::<10>(a.into()) } + unsafe { upstream::_mm256_extract_epi8::<10>(a.into()) }, + "Failed with input value: {:?}", + a ); } for _ in 0..n { let a: BitVec<256> = BitVec::random(); assert_eq!( super::super::models::avx2::_mm256_extract_epi8::<11>(a.into()), - unsafe { upstream::_mm256_extract_epi8::<11>(a.into()) } + unsafe { upstream::_mm256_extract_epi8::<11>(a.into()) }, + "Failed with input value: {:?}", + a ); } for _ in 0..n { let a: BitVec<256> = BitVec::random(); assert_eq!( super::super::models::avx2::_mm256_extract_epi8::<12>(a.into()), - unsafe { upstream::_mm256_extract_epi8::<12>(a.into()) } + unsafe { upstream::_mm256_extract_epi8::<12>(a.into()) }, + "Failed with input value: {:?}", + a ); } for _ 
in 0..n { let a: BitVec<256> = BitVec::random(); assert_eq!( super::super::models::avx2::_mm256_extract_epi8::<13>(a.into()), - unsafe { upstream::_mm256_extract_epi8::<13>(a.into()) } + unsafe { upstream::_mm256_extract_epi8::<13>(a.into()) }, + "Failed with input value: {:?}", + a ); } for _ in 0..n { let a: BitVec<256> = BitVec::random(); assert_eq!( super::super::models::avx2::_mm256_extract_epi8::<14>(a.into()), - unsafe { upstream::_mm256_extract_epi8::<14>(a.into()) } + unsafe { upstream::_mm256_extract_epi8::<14>(a.into()) }, + "Failed with input value: {:?}", + a ); } for _ in 0..n { let a: BitVec<256> = BitVec::random(); assert_eq!( super::super::models::avx2::_mm256_extract_epi8::<15>(a.into()), - unsafe { upstream::_mm256_extract_epi8::<15>(a.into()) } + unsafe { upstream::_mm256_extract_epi8::<15>(a.into()) }, + "Failed with input value: {:?}", + a ); } for _ in 0..n { let a: BitVec<256> = BitVec::random(); assert_eq!( super::super::models::avx2::_mm256_extract_epi8::<16>(a.into()), - unsafe { upstream::_mm256_extract_epi8::<16>(a.into()) } + unsafe { upstream::_mm256_extract_epi8::<16>(a.into()) }, + "Failed with input value: {:?}", + a ); } for _ in 0..n { let a: BitVec<256> = BitVec::random(); assert_eq!( super::super::models::avx2::_mm256_extract_epi8::<17>(a.into()), - unsafe { upstream::_mm256_extract_epi8::<17>(a.into()) } + unsafe { upstream::_mm256_extract_epi8::<17>(a.into()) }, + "Failed with input value: {:?}", + a ); } for _ in 0..n { let a: BitVec<256> = BitVec::random(); assert_eq!( super::super::models::avx2::_mm256_extract_epi8::<18>(a.into()), - unsafe { upstream::_mm256_extract_epi8::<18>(a.into()) } + unsafe { upstream::_mm256_extract_epi8::<18>(a.into()) }, + "Failed with input value: {:?}", + a ); } for _ in 0..n { let a: BitVec<256> = BitVec::random(); assert_eq!( super::super::models::avx2::_mm256_extract_epi8::<19>(a.into()), - unsafe { upstream::_mm256_extract_epi8::<19>(a.into()) } + unsafe { upstream::_mm256_extract_epi8::<19>(a.into()) }, + "Failed with input value: {:?}", + a ); } for _ in 0..n { let a: BitVec<256> = BitVec::random(); assert_eq!( super::super::models::avx2::_mm256_extract_epi8::<20>(a.into()), - unsafe { upstream::_mm256_extract_epi8::<20>(a.into()) } + unsafe { upstream::_mm256_extract_epi8::<20>(a.into()) }, + "Failed with input value: {:?}", + a ); } for _ in 0..n { let a: BitVec<256> = BitVec::random(); assert_eq!( super::super::models::avx2::_mm256_extract_epi8::<21>(a.into()), - unsafe { upstream::_mm256_extract_epi8::<21>(a.into()) } + unsafe { upstream::_mm256_extract_epi8::<21>(a.into()) }, + "Failed with input value: {:?}", + a ); } for _ in 0..n { let a: BitVec<256> = BitVec::random(); assert_eq!( super::super::models::avx2::_mm256_extract_epi8::<22>(a.into()), - unsafe { upstream::_mm256_extract_epi8::<22>(a.into()) } + unsafe { upstream::_mm256_extract_epi8::<22>(a.into()) }, + "Failed with input value: {:?}", + a ); } for _ in 0..n { let a: BitVec<256> = BitVec::random(); assert_eq!( super::super::models::avx2::_mm256_extract_epi8::<23>(a.into()), - unsafe { upstream::_mm256_extract_epi8::<23>(a.into()) } + unsafe { upstream::_mm256_extract_epi8::<23>(a.into()) }, + "Failed with input value: {:?}", + a ); } for _ in 0..n { let a: BitVec<256> = BitVec::random(); assert_eq!( super::super::models::avx2::_mm256_extract_epi8::<24>(a.into()), - unsafe { upstream::_mm256_extract_epi8::<24>(a.into()) } + unsafe { upstream::_mm256_extract_epi8::<24>(a.into()) }, + "Failed with input value: {:?}", + a ); } for _ in 0..n { let 
a: BitVec<256> = BitVec::random(); assert_eq!( super::super::models::avx2::_mm256_extract_epi8::<25>(a.into()), - unsafe { upstream::_mm256_extract_epi8::<25>(a.into()) } + unsafe { upstream::_mm256_extract_epi8::<25>(a.into()) }, + "Failed with input value: {:?}", + a ); } for _ in 0..n { let a: BitVec<256> = BitVec::random(); assert_eq!( super::super::models::avx2::_mm256_extract_epi8::<26>(a.into()), - unsafe { upstream::_mm256_extract_epi8::<26>(a.into()) } + unsafe { upstream::_mm256_extract_epi8::<26>(a.into()) }, + "Failed with input value: {:?}", + a ); } for _ in 0..n { let a: BitVec<256> = BitVec::random(); assert_eq!( super::super::models::avx2::_mm256_extract_epi8::<27>(a.into()), - unsafe { upstream::_mm256_extract_epi8::<27>(a.into()) } + unsafe { upstream::_mm256_extract_epi8::<27>(a.into()) }, + "Failed with input value: {:?}", + a ); } for _ in 0..n { let a: BitVec<256> = BitVec::random(); assert_eq!( super::super::models::avx2::_mm256_extract_epi8::<28>(a.into()), - unsafe { upstream::_mm256_extract_epi8::<28>(a.into()) } + unsafe { upstream::_mm256_extract_epi8::<28>(a.into()) }, + "Failed with input value: {:?}", + a ); } for _ in 0..n { let a: BitVec<256> = BitVec::random(); assert_eq!( super::super::models::avx2::_mm256_extract_epi8::<29>(a.into()), - unsafe { upstream::_mm256_extract_epi8::<29>(a.into()) } + unsafe { upstream::_mm256_extract_epi8::<29>(a.into()) }, + "Failed with input value: {:?}", + a ); } for _ in 0..n { let a: BitVec<256> = BitVec::random(); assert_eq!( super::super::models::avx2::_mm256_extract_epi8::<30>(a.into()), - unsafe { upstream::_mm256_extract_epi8::<30>(a.into()) } + unsafe { upstream::_mm256_extract_epi8::<30>(a.into()) }, + "Failed with input value: {:?}", + a ); } for _ in 0..n { let a: BitVec<256> = BitVec::random(); assert_eq!( super::super::models::avx2::_mm256_extract_epi8::<31>(a.into()), - unsafe { upstream::_mm256_extract_epi8::<31>(a.into()) } + unsafe { upstream::_mm256_extract_epi8::<31>(a.into()) }, + "Failed with input value: {:?}", + a ); } } @@ -421,112 +487,144 @@ fn _mm256_extract_epi16() { let a: BitVec<256> = BitVec::random(); assert_eq!( super::super::models::avx2::_mm256_extract_epi16::<0>(a.into()), - unsafe { upstream::_mm256_extract_epi16::<0>(a.into()) } + unsafe { upstream::_mm256_extract_epi16::<0>(a.into()) }, + "Failed with input value: {:?}", + a ); } for _ in 0..n { let a: BitVec<256> = BitVec::random(); assert_eq!( super::super::models::avx2::_mm256_extract_epi16::<1>(a.into()), - unsafe { upstream::_mm256_extract_epi16::<1>(a.into()) } + unsafe { upstream::_mm256_extract_epi16::<1>(a.into()) }, + "Failed with input value: {:?}", + a ); } for _ in 0..n { let a: BitVec<256> = BitVec::random(); assert_eq!( super::super::models::avx2::_mm256_extract_epi16::<2>(a.into()), - unsafe { upstream::_mm256_extract_epi16::<2>(a.into()) } + unsafe { upstream::_mm256_extract_epi16::<2>(a.into()) }, + "Failed with input value: {:?}", + a ); } for _ in 0..n { let a: BitVec<256> = BitVec::random(); assert_eq!( super::super::models::avx2::_mm256_extract_epi16::<3>(a.into()), - unsafe { upstream::_mm256_extract_epi16::<3>(a.into()) } + unsafe { upstream::_mm256_extract_epi16::<3>(a.into()) }, + "Failed with input value: {:?}", + a ); } for _ in 0..n { let a: BitVec<256> = BitVec::random(); assert_eq!( super::super::models::avx2::_mm256_extract_epi16::<4>(a.into()), - unsafe { upstream::_mm256_extract_epi16::<4>(a.into()) } + unsafe { upstream::_mm256_extract_epi16::<4>(a.into()) }, + "Failed with input value: {:?}", + a 
); } for _ in 0..n { let a: BitVec<256> = BitVec::random(); assert_eq!( super::super::models::avx2::_mm256_extract_epi16::<5>(a.into()), - unsafe { upstream::_mm256_extract_epi16::<5>(a.into()) } + unsafe { upstream::_mm256_extract_epi16::<5>(a.into()) }, + "Failed with input value: {:?}", + a ); } for _ in 0..n { let a: BitVec<256> = BitVec::random(); assert_eq!( super::super::models::avx2::_mm256_extract_epi16::<6>(a.into()), - unsafe { upstream::_mm256_extract_epi16::<6>(a.into()) } + unsafe { upstream::_mm256_extract_epi16::<6>(a.into()) }, + "Failed with input value: {:?}", + a ); } for _ in 0..n { let a: BitVec<256> = BitVec::random(); assert_eq!( super::super::models::avx2::_mm256_extract_epi16::<7>(a.into()), - unsafe { upstream::_mm256_extract_epi16::<7>(a.into()) } + unsafe { upstream::_mm256_extract_epi16::<7>(a.into()) }, + "Failed with input value: {:?}", + a ); } for _ in 0..n { let a: BitVec<256> = BitVec::random(); assert_eq!( super::super::models::avx2::_mm256_extract_epi16::<8>(a.into()), - unsafe { upstream::_mm256_extract_epi16::<8>(a.into()) } + unsafe { upstream::_mm256_extract_epi16::<8>(a.into()) }, + "Failed with input value: {:?}", + a ); } for _ in 0..n { let a: BitVec<256> = BitVec::random(); assert_eq!( super::super::models::avx2::_mm256_extract_epi16::<9>(a.into()), - unsafe { upstream::_mm256_extract_epi16::<9>(a.into()) } + unsafe { upstream::_mm256_extract_epi16::<9>(a.into()) }, + "Failed with input value: {:?}", + a ); } for _ in 0..n { let a: BitVec<256> = BitVec::random(); assert_eq!( super::super::models::avx2::_mm256_extract_epi16::<10>(a.into()), - unsafe { upstream::_mm256_extract_epi16::<10>(a.into()) } + unsafe { upstream::_mm256_extract_epi16::<10>(a.into()) }, + "Failed with input value: {:?}", + a ); } for _ in 0..n { let a: BitVec<256> = BitVec::random(); assert_eq!( super::super::models::avx2::_mm256_extract_epi16::<11>(a.into()), - unsafe { upstream::_mm256_extract_epi16::<11>(a.into()) } + unsafe { upstream::_mm256_extract_epi16::<11>(a.into()) }, + "Failed with input value: {:?}", + a ); } for _ in 0..n { let a: BitVec<256> = BitVec::random(); assert_eq!( super::super::models::avx2::_mm256_extract_epi16::<12>(a.into()), - unsafe { upstream::_mm256_extract_epi16::<12>(a.into()) } + unsafe { upstream::_mm256_extract_epi16::<12>(a.into()) }, + "Failed with input value: {:?}", + a ); } for _ in 0..n { let a: BitVec<256> = BitVec::random(); assert_eq!( super::super::models::avx2::_mm256_extract_epi16::<13>(a.into()), - unsafe { upstream::_mm256_extract_epi16::<13>(a.into()) } + unsafe { upstream::_mm256_extract_epi16::<13>(a.into()) }, + "Failed with input value: {:?}", + a ); } for _ in 0..n { let a: BitVec<256> = BitVec::random(); assert_eq!( super::super::models::avx2::_mm256_extract_epi16::<14>(a.into()), - unsafe { upstream::_mm256_extract_epi16::<14>(a.into()) } + unsafe { upstream::_mm256_extract_epi16::<14>(a.into()) }, + "Failed with input value: {:?}", + a ); } for _ in 0..n { let a: BitVec<256> = BitVec::random(); assert_eq!( super::super::models::avx2::_mm256_extract_epi16::<15>(a.into()), - unsafe { upstream::_mm256_extract_epi16::<15>(a.into()) } + unsafe { upstream::_mm256_extract_epi16::<15>(a.into()) }, + "Failed with input value: {:?}", + a ); } } diff --git a/testable-simd-models/src/helpers.rs b/testable-simd-models/src/helpers.rs index 1a30bf251a877..fb5d27373eb69 100644 --- a/testable-simd-models/src/helpers.rs +++ b/testable-simd-models/src/helpers.rs @@ -2,6 +2,15 @@ pub mod test { use crate::abstractions::{bit::Bit, 
bitvec::BitVec, funarr::FunArray};
     use rand::prelude::*;
+    use std::sync::Mutex;
+
+    lazy_static::lazy_static! {
+        // The RNG is seeded once per test run and the seed is printed, so a
+        // failing randomized run can be reproduced.
+        static ref RNG: Mutex<StdRng> = {
+            let seed = rand::rng().random();
+            println!("\nRandomness seed set to: {:?}", seed);
+            Mutex::new(StdRng::from_seed(seed))
+        };
+    }
 
     /// Helper trait to generate random values
     pub trait HasRandom {
@@ -11,8 +20,7 @@ pub mod test {
         ($($ty:ty),*) => {
             $(impl HasRandom for $ty {
                 fn random() -> Self {
-                    let mut rng = rand::rng();
-                    rng.random()
+                    RNG.lock().unwrap().random()
                 }
             })*
         };