@@ -1683,8 +1683,9 @@ pub fn _mm_maskz_add_round_sh<const ROUNDING: i32>(k: __mmask8, a: __m128h, b: _
16831683#[target_feature(enable = "avx512fp16")]
16841684#[cfg_attr(test, assert_instr(vaddsh))]
16851685#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1686- pub fn _mm_add_sh(a: __m128h, b: __m128h) -> __m128h {
1687-     _mm_add_round_sh::<_MM_FROUND_CUR_DIRECTION>(a, b)
1686+ #[rustc_const_unstable(feature = "stdarch_const_intrinsics", issue = "none")]
1687+ pub const fn _mm_add_sh(a: __m128h, b: __m128h) -> __m128h {
1688+     unsafe { simd_insert!(a, 0, _mm_cvtsh_h(a) + _mm_cvtsh_h(b)) }
16881689}
16891690
16901691/// Add the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
@@ -1696,8 +1697,18 @@ pub fn _mm_add_sh(a: __m128h, b: __m128h) -> __m128h {
16961697#[target_feature(enable = "avx512fp16")]
16971698#[cfg_attr(test, assert_instr(vaddsh))]
16981699#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1699- pub fn _mm_mask_add_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
1700-     _mm_mask_add_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
1700+ #[rustc_const_unstable(feature = "stdarch_const_intrinsics", issue = "none")]
1701+ pub const fn _mm_mask_add_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
1702+     unsafe {
1703+         let extractsrc: f16 = simd_extract!(src, 0);
1704+         let mut add: f16 = extractsrc;
1705+         if (k & 0b00000001) != 0 {
1706+             let extracta: f16 = simd_extract!(a, 0);
1707+             let extractb: f16 = simd_extract!(b, 0);
1708+             add = extracta + extractb;
1709+         }
1710+         simd_insert!(a, 0, add)
1711+     }
17011712}
17021713
17031714/// Add the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
@@ -1709,8 +1720,17 @@ pub fn _mm_mask_add_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m
17091720#[target_feature(enable = "avx512fp16")]
17101721#[cfg_attr(test, assert_instr(vaddsh))]
17111722#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1712- pub fn _mm_maskz_add_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
1713-     _mm_maskz_add_round_sh::<_MM_FROUND_CUR_DIRECTION>(k, a, b)
1723+ #[rustc_const_unstable(feature = "stdarch_const_intrinsics", issue = "none")]
1724+ pub const fn _mm_maskz_add_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
1725+     unsafe {
1726+         let mut add: f16 = 0.;
1727+         if (k & 0b00000001) != 0 {
1728+             let extracta: f16 = simd_extract!(a, 0);
1729+             let extractb: f16 = simd_extract!(b, 0);
1730+             add = extracta + extractb;
1731+         }
1732+         simd_insert!(a, 0, add)
1733+     }
17141734}
17151735
17161736/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst.
@@ -2004,8 +2024,9 @@ pub fn _mm_maskz_sub_round_sh<const ROUNDING: i32>(k: __mmask8, a: __m128h, b: _
20042024#[target_feature(enable = "avx512fp16")]
20052025#[cfg_attr(test, assert_instr(vsubsh))]
20062026#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2007- pub fn _mm_sub_sh(a: __m128h, b: __m128h) -> __m128h {
2008-     _mm_sub_round_sh::<_MM_FROUND_CUR_DIRECTION>(a, b)
2027+ #[rustc_const_unstable(feature = "stdarch_const_intrinsics", issue = "none")]
2028+ pub const fn _mm_sub_sh(a: __m128h, b: __m128h) -> __m128h {
2029+     unsafe { simd_insert!(a, 0, _mm_cvtsh_h(a) - _mm_cvtsh_h(b)) }
20092030}
20102031
20112032/// Subtract the lower half-precision (16-bit) floating-point elements in b from a, store the result in the
@@ -2017,8 +2038,18 @@ pub fn _mm_sub_sh(a: __m128h, b: __m128h) -> __m128h {
20172038#[target_feature(enable = "avx512fp16")]
20182039#[cfg_attr(test, assert_instr(vsubsh))]
20192040#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2020- pub fn _mm_mask_sub_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
2021-     _mm_mask_sub_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
2041+ #[rustc_const_unstable(feature = "stdarch_const_intrinsics", issue = "none")]
2042+ pub const fn _mm_mask_sub_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
2043+     unsafe {
2044+         let extractsrc: f16 = simd_extract!(src, 0);
2045+         let mut add: f16 = extractsrc;
2046+         if (k & 0b00000001) != 0 {
2047+             let extracta: f16 = simd_extract!(a, 0);
2048+             let extractb: f16 = simd_extract!(b, 0);
2049+             add = extracta - extractb;
2050+         }
2051+         simd_insert!(a, 0, add)
2052+     }
20222053}
20232054
20242055/// Subtract the lower half-precision (16-bit) floating-point elements in b from a, store the result in the
@@ -2030,8 +2061,17 @@ pub fn _mm_mask_sub_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m
20302061#[target_feature(enable = "avx512fp16")]
20312062#[cfg_attr(test, assert_instr(vsubsh))]
20322063#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2033- pub fn _mm_maskz_sub_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
2034-     _mm_maskz_sub_round_sh::<_MM_FROUND_CUR_DIRECTION>(k, a, b)
2064+ #[rustc_const_unstable(feature = "stdarch_const_intrinsics", issue = "none")]
2065+ pub const fn _mm_maskz_sub_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
2066+     unsafe {
2067+         let mut add: f16 = 0.;
2068+         if (k & 0b00000001) != 0 {
2069+             let extracta: f16 = simd_extract!(a, 0);
2070+             let extractb: f16 = simd_extract!(b, 0);
2071+             add = extracta - extractb;
2072+         }
2073+         simd_insert!(a, 0, add)
2074+     }
20352075}
20362076
20372077/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst.
@@ -2325,8 +2365,9 @@ pub fn _mm_maskz_mul_round_sh<const ROUNDING: i32>(k: __mmask8, a: __m128h, b: _
23252365#[target_feature(enable = "avx512fp16")]
23262366#[cfg_attr(test, assert_instr(vmulsh))]
23272367#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2328- pub fn _mm_mul_sh(a: __m128h, b: __m128h) -> __m128h {
2329-     _mm_mul_round_sh::<_MM_FROUND_CUR_DIRECTION>(a, b)
2368+ #[rustc_const_unstable(feature = "stdarch_const_intrinsics", issue = "none")]
2369+ pub const fn _mm_mul_sh(a: __m128h, b: __m128h) -> __m128h {
2370+     unsafe { simd_insert!(a, 0, _mm_cvtsh_h(a) * _mm_cvtsh_h(b)) }
23302371}
23312372
23322373/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
@@ -2338,8 +2379,18 @@ pub fn _mm_mul_sh(a: __m128h, b: __m128h) -> __m128h {
23382379#[target_feature(enable = "avx512fp16")]
23392380#[cfg_attr(test, assert_instr(vmulsh))]
23402381#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2341- pub fn _mm_mask_mul_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
2342-     _mm_mask_mul_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
2382+ #[rustc_const_unstable(feature = "stdarch_const_intrinsics", issue = "none")]
2383+ pub const fn _mm_mask_mul_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
2384+     unsafe {
2385+         let extractsrc: f16 = simd_extract!(src, 0);
2386+         let mut add: f16 = extractsrc;
2387+         if (k & 0b00000001) != 0 {
2388+             let extracta: f16 = simd_extract!(a, 0);
2389+             let extractb: f16 = simd_extract!(b, 0);
2390+             add = extracta * extractb;
2391+         }
2392+         simd_insert!(a, 0, add)
2393+     }
23432394}
23442395
23452396/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
@@ -2351,8 +2402,17 @@ pub fn _mm_mask_mul_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m
23512402#[target_feature(enable = "avx512fp16")]
23522403#[cfg_attr(test, assert_instr(vmulsh))]
23532404#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2354- pub fn _mm_maskz_mul_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
2355-     _mm_maskz_mul_round_sh::<_MM_FROUND_CUR_DIRECTION>(k, a, b)
2405+ #[rustc_const_unstable(feature = "stdarch_const_intrinsics", issue = "none")]
2406+ pub const fn _mm_maskz_mul_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
2407+     unsafe {
2408+         let mut add: f16 = 0.;
2409+         if (k & 0b00000001) != 0 {
2410+             let extracta: f16 = simd_extract!(a, 0);
2411+             let extractb: f16 = simd_extract!(b, 0);
2412+             add = extracta * extractb;
2413+         }
2414+         simd_insert!(a, 0, add)
2415+     }
23562416}
23572417
23582418/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst.
@@ -2646,8 +2706,9 @@ pub fn _mm_maskz_div_round_sh<const ROUNDING: i32>(k: __mmask8, a: __m128h, b: _
26462706#[target_feature(enable = "avx512fp16")]
26472707#[cfg_attr(test, assert_instr(vdivsh))]
26482708#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2649- pub fn _mm_div_sh(a: __m128h, b: __m128h) -> __m128h {
2650-     _mm_div_round_sh::<_MM_FROUND_CUR_DIRECTION>(a, b)
2709+ #[rustc_const_unstable(feature = "stdarch_const_intrinsics", issue = "none")]
2710+ pub const fn _mm_div_sh(a: __m128h, b: __m128h) -> __m128h {
2711+     unsafe { simd_insert!(a, 0, _mm_cvtsh_h(a) / _mm_cvtsh_h(b)) }
26512712}
26522713
26532714/// Divide the lower half-precision (16-bit) floating-point elements in a by b, store the result in the
@@ -2659,8 +2720,18 @@ pub fn _mm_div_sh(a: __m128h, b: __m128h) -> __m128h {
26592720#[target_feature(enable = "avx512fp16")]
26602721#[cfg_attr(test, assert_instr(vdivsh))]
26612722#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2662- pub fn _mm_mask_div_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
2663-     _mm_mask_div_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
2723+ #[rustc_const_unstable(feature = "stdarch_const_intrinsics", issue = "none")]
2724+ pub const fn _mm_mask_div_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
2725+     unsafe {
2726+         let extractsrc: f16 = simd_extract!(src, 0);
2727+         let mut add: f16 = extractsrc;
2728+         if (k & 0b00000001) != 0 {
2729+             let extracta: f16 = simd_extract!(a, 0);
2730+             let extractb: f16 = simd_extract!(b, 0);
2731+             add = extracta / extractb;
2732+         }
2733+         simd_insert!(a, 0, add)
2734+     }
26642735}
26652736
26662737/// Divide the lower half-precision (16-bit) floating-point elements in a by b, store the result in the
@@ -2672,8 +2743,17 @@ pub fn _mm_mask_div_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m
26722743#[target_feature(enable = "avx512fp16")]
26732744#[cfg_attr(test, assert_instr(vdivsh))]
26742745#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2675- pub fn _mm_maskz_div_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
2676-     _mm_maskz_div_round_sh::<_MM_FROUND_CUR_DIRECTION>(k, a, b)
2746+ #[rustc_const_unstable(feature = "stdarch_const_intrinsics", issue = "none")]
2747+ pub const fn _mm_maskz_div_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
2748+     unsafe {
2749+         let mut add: f16 = 0.;
2750+         if (k & 0b00000001) != 0 {
2751+             let extracta: f16 = simd_extract!(a, 0);
2752+             let extractb: f16 = simd_extract!(b, 0);
2753+             add = extracta / extractb;
2754+         }
2755+         simd_insert!(a, 0, add)
2756+     }
26772757}
26782758
26792759/// Multiply packed complex numbers in a and b, and store the results in dst. Each complex number is
0 commit comments