Skip to content

Commit 3b7dd43

Browse files
committed
Make madd, hadd and hsub const
1 parent 77d3a8e commit 3b7dd43

File tree

6 files changed: 188 additions, 86 deletions

crates/core_arch/src/x86/avx.rs

Lines changed: 28 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -612,8 +612,13 @@ pub fn _mm256_dp_ps<const IMM8: i32>(a: __m256, b: __m256) -> __m256 {
612612
#[target_feature(enable = "avx")]
613613
#[cfg_attr(test, assert_instr(vhaddpd))]
614614
#[stable(feature = "simd_x86", since = "1.27.0")]
615-
pub fn _mm256_hadd_pd(a: __m256d, b: __m256d) -> __m256d {
616-
unsafe { vhaddpd(a, b) }
615+
#[rustc_const_unstable(feature = "stdarch_const_intrinsics", issue = "none")]
616+
pub const fn _mm256_hadd_pd(a: __m256d, b: __m256d) -> __m256d {
617+
unsafe {
618+
let even = simd_shuffle!(a, b, [0, 4, 2, 6]);
619+
let odd = simd_shuffle!(a, b, [1, 5, 3, 7]);
620+
simd_add(even, odd)
621+
}
617622
}
618623

619624
/// Horizontal addition of adjacent pairs in the two packed vectors
@@ -627,8 +632,13 @@ pub fn _mm256_hadd_pd(a: __m256d, b: __m256d) -> __m256d {
627632
#[target_feature(enable = "avx")]
628633
#[cfg_attr(test, assert_instr(vhaddps))]
629634
#[stable(feature = "simd_x86", since = "1.27.0")]
630-
pub fn _mm256_hadd_ps(a: __m256, b: __m256) -> __m256 {
631-
unsafe { vhaddps(a, b) }
635+
#[rustc_const_unstable(feature = "stdarch_const_intrinsics", issue = "none")]
636+
pub const fn _mm256_hadd_ps(a: __m256, b: __m256) -> __m256 {
637+
unsafe {
638+
let even = simd_shuffle!(a, b, [0, 2, 8, 10, 4, 6, 12, 14]);
639+
let odd = simd_shuffle!(a, b, [1, 3, 9, 11, 5, 7, 13, 15]);
640+
simd_add(even, odd)
641+
}
632642
}
633643

634644
/// Horizontal subtraction of adjacent pairs in the two packed vectors
@@ -641,8 +651,13 @@ pub fn _mm256_hadd_ps(a: __m256, b: __m256) -> __m256 {
641651
#[target_feature(enable = "avx")]
642652
#[cfg_attr(test, assert_instr(vhsubpd))]
643653
#[stable(feature = "simd_x86", since = "1.27.0")]
644-
pub fn _mm256_hsub_pd(a: __m256d, b: __m256d) -> __m256d {
645-
unsafe { vhsubpd(a, b) }
654+
#[rustc_const_unstable(feature = "stdarch_const_intrinsics", issue = "none")]
655+
pub const fn _mm256_hsub_pd(a: __m256d, b: __m256d) -> __m256d {
656+
unsafe {
657+
let even = simd_shuffle!(a, b, [0, 4, 2, 6]);
658+
let odd = simd_shuffle!(a, b, [1, 5, 3, 7]);
659+
simd_sub(even, odd)
660+
}
646661
}
647662

648663
/// Horizontal subtraction of adjacent pairs in the two packed vectors
@@ -656,8 +671,13 @@ pub fn _mm256_hsub_pd(a: __m256d, b: __m256d) -> __m256d {
656671
#[target_feature(enable = "avx")]
657672
#[cfg_attr(test, assert_instr(vhsubps))]
658673
#[stable(feature = "simd_x86", since = "1.27.0")]
659-
pub fn _mm256_hsub_ps(a: __m256, b: __m256) -> __m256 {
660-
unsafe { vhsubps(a, b) }
674+
#[rustc_const_unstable(feature = "stdarch_const_intrinsics", issue = "none")]
675+
pub const fn _mm256_hsub_ps(a: __m256, b: __m256) -> __m256 {
676+
unsafe {
677+
let even = simd_shuffle!(a, b, [0, 2, 8, 10, 4, 6, 12, 14]);
678+
let odd = simd_shuffle!(a, b, [1, 3, 9, 11, 5, 7, 13, 15]);
679+
simd_sub(even, odd)
680+
}
661681
}
662682

663683
/// Computes the bitwise XOR of packed double-precision (64-bit) floating-point
@@ -3188,14 +3208,6 @@ unsafe extern "C" {
31883208
fn roundps256(a: __m256, b: i32) -> __m256;
31893209
#[link_name = "llvm.x86.avx.dp.ps.256"]
31903210
fn vdpps(a: __m256, b: __m256, imm8: i8) -> __m256;
3191-
#[link_name = "llvm.x86.avx.hadd.pd.256"]
3192-
fn vhaddpd(a: __m256d, b: __m256d) -> __m256d;
3193-
#[link_name = "llvm.x86.avx.hadd.ps.256"]
3194-
fn vhaddps(a: __m256, b: __m256) -> __m256;
3195-
#[link_name = "llvm.x86.avx.hsub.pd.256"]
3196-
fn vhsubpd(a: __m256d, b: __m256d) -> __m256d;
3197-
#[link_name = "llvm.x86.avx.hsub.ps.256"]
3198-
fn vhsubps(a: __m256, b: __m256) -> __m256;
31993211
#[link_name = "llvm.x86.sse2.cmp.pd"]
32003212
fn vcmppd(a: __m128d, b: __m128d, imm8: i8) -> __m128d;
32013213
#[link_name = "llvm.x86.avx.cmp.pd.256"]

crates/core_arch/src/x86/avx2.rs

Lines changed: 60 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -945,8 +945,23 @@ pub const fn _mm256_extracti128_si256<const IMM1: i32>(a: __m256i) -> __m128i {
945945
#[target_feature(enable = "avx2")]
946946
#[cfg_attr(test, assert_instr(vphaddw))]
947947
#[stable(feature = "simd_x86", since = "1.27.0")]
948-
pub fn _mm256_hadd_epi16(a: __m256i, b: __m256i) -> __m256i {
949-
unsafe { transmute(phaddw(a.as_i16x16(), b.as_i16x16())) }
948+
#[rustc_const_unstable(feature = "stdarch_const_intrinsics", issue = "none")]
949+
pub const fn _mm256_hadd_epi16(a: __m256i, b: __m256i) -> __m256i {
950+
let a = a.as_i16x16();
951+
let b = b.as_i16x16();
952+
unsafe {
953+
let even: i16x16 = simd_shuffle!(
954+
a,
955+
b,
956+
[0, 2, 4, 6, 16, 18, 20, 22, 8, 10, 12, 14, 24, 26, 28, 30]
957+
);
958+
let odd: i16x16 = simd_shuffle!(
959+
a,
960+
b,
961+
[1, 3, 5, 7, 17, 19, 21, 23, 9, 11, 13, 15, 25, 27, 29, 31]
962+
);
963+
simd_add(even, odd).as_m256i()
964+
}
950965
}
951966

952967
/// Horizontally adds adjacent pairs of 32-bit integers in `a` and `b`.
@@ -956,8 +971,15 @@ pub fn _mm256_hadd_epi16(a: __m256i, b: __m256i) -> __m256i {
956971
#[target_feature(enable = "avx2")]
957972
#[cfg_attr(test, assert_instr(vphaddd))]
958973
#[stable(feature = "simd_x86", since = "1.27.0")]
959-
pub fn _mm256_hadd_epi32(a: __m256i, b: __m256i) -> __m256i {
960-
unsafe { transmute(phaddd(a.as_i32x8(), b.as_i32x8())) }
974+
#[rustc_const_unstable(feature = "stdarch_const_intrinsics", issue = "none")]
975+
pub const fn _mm256_hadd_epi32(a: __m256i, b: __m256i) -> __m256i {
976+
let a = a.as_i32x8();
977+
let b = b.as_i32x8();
978+
unsafe {
979+
let even: i32x8 = simd_shuffle!(a, b, [0, 2, 8, 10, 4, 6, 12, 14]);
980+
let odd: i32x8 = simd_shuffle!(a, b, [1, 3, 9, 11, 5, 7, 13, 15]);
981+
simd_add(even, odd).as_m256i()
982+
}
961983
}
962984

963985
/// Horizontally adds adjacent pairs of 16-bit integers in `a` and `b`
@@ -979,8 +1001,23 @@ pub fn _mm256_hadds_epi16(a: __m256i, b: __m256i) -> __m256i {
9791001
#[target_feature(enable = "avx2")]
9801002
#[cfg_attr(test, assert_instr(vphsubw))]
9811003
#[stable(feature = "simd_x86", since = "1.27.0")]
982-
pub fn _mm256_hsub_epi16(a: __m256i, b: __m256i) -> __m256i {
983-
unsafe { transmute(phsubw(a.as_i16x16(), b.as_i16x16())) }
1004+
#[rustc_const_unstable(feature = "stdarch_const_intrinsics", issue = "none")]
1005+
pub const fn _mm256_hsub_epi16(a: __m256i, b: __m256i) -> __m256i {
1006+
let a = a.as_i16x16();
1007+
let b = b.as_i16x16();
1008+
unsafe {
1009+
let even: i16x16 = simd_shuffle!(
1010+
a,
1011+
b,
1012+
[0, 2, 4, 6, 16, 18, 20, 22, 8, 10, 12, 14, 24, 26, 28, 30]
1013+
);
1014+
let odd: i16x16 = simd_shuffle!(
1015+
a,
1016+
b,
1017+
[1, 3, 5, 7, 17, 19, 21, 23, 9, 11, 13, 15, 25, 27, 29, 31]
1018+
);
1019+
simd_sub(even, odd).as_m256i()
1020+
}
9841021
}
9851022

9861023
/// Horizontally subtract adjacent pairs of 32-bit integers in `a` and `b`.
@@ -990,8 +1027,15 @@ pub fn _mm256_hsub_epi16(a: __m256i, b: __m256i) -> __m256i {
9901027
#[target_feature(enable = "avx2")]
9911028
#[cfg_attr(test, assert_instr(vphsubd))]
9921029
#[stable(feature = "simd_x86", since = "1.27.0")]
993-
pub fn _mm256_hsub_epi32(a: __m256i, b: __m256i) -> __m256i {
994-
unsafe { transmute(phsubd(a.as_i32x8(), b.as_i32x8())) }
1030+
#[rustc_const_unstable(feature = "stdarch_const_intrinsics", issue = "none")]
1031+
pub const fn _mm256_hsub_epi32(a: __m256i, b: __m256i) -> __m256i {
1032+
let a = a.as_i32x8();
1033+
let b = b.as_i32x8();
1034+
unsafe {
1035+
let even: i32x8 = simd_shuffle!(a, b, [0, 2, 8, 10, 4, 6, 12, 14]);
1036+
let odd: i32x8 = simd_shuffle!(a, b, [1, 3, 9, 11, 5, 7, 13, 15]);
1037+
simd_sub(even, odd).as_m256i()
1038+
}
9951039
}
9961040

9971041
/// Horizontally subtract adjacent pairs of 16-bit integers in `a` and `b`
@@ -1769,8 +1813,14 @@ pub const fn _mm256_inserti128_si256<const IMM1: i32>(a: __m256i, b: __m128i) ->
17691813
#[target_feature(enable = "avx2")]
17701814
#[cfg_attr(test, assert_instr(vpmaddwd))]
17711815
#[stable(feature = "simd_x86", since = "1.27.0")]
1772-
pub fn _mm256_madd_epi16(a: __m256i, b: __m256i) -> __m256i {
1773-
unsafe { transmute(pmaddwd(a.as_i16x16(), b.as_i16x16())) }
1816+
#[rustc_const_unstable(feature = "stdarch_const_intrinsics", issue = "none")]
1817+
pub const fn _mm256_madd_epi16(a: __m256i, b: __m256i) -> __m256i {
1818+
unsafe {
1819+
let r: i32x16 = simd_mul(simd_cast(a.as_i16x16()), simd_cast(b.as_i16x16()));
1820+
let even: i32x8 = simd_shuffle!(r, r, [0, 2, 4, 6, 8, 10, 12, 14]);
1821+
let odd: i32x8 = simd_shuffle!(r, r, [1, 3, 5, 7, 9, 11, 13, 15]);
1822+
simd_add(even, odd).as_m256i()
1823+
}
17741824
}
17751825

17761826
/// Vertically multiplies each unsigned 8-bit integer from `a` with the
@@ -3716,20 +3766,10 @@ pub const fn _mm256_extract_epi16<const INDEX: i32>(a: __m256i) -> i32 {
37163766

37173767
#[allow(improper_ctypes)]
37183768
unsafe extern "C" {
3719-
#[link_name = "llvm.x86.avx2.phadd.w"]
3720-
fn phaddw(a: i16x16, b: i16x16) -> i16x16;
3721-
#[link_name = "llvm.x86.avx2.phadd.d"]
3722-
fn phaddd(a: i32x8, b: i32x8) -> i32x8;
37233769
#[link_name = "llvm.x86.avx2.phadd.sw"]
37243770
fn phaddsw(a: i16x16, b: i16x16) -> i16x16;
3725-
#[link_name = "llvm.x86.avx2.phsub.w"]
3726-
fn phsubw(a: i16x16, b: i16x16) -> i16x16;
3727-
#[link_name = "llvm.x86.avx2.phsub.d"]
3728-
fn phsubd(a: i32x8, b: i32x8) -> i32x8;
37293771
#[link_name = "llvm.x86.avx2.phsub.sw"]
37303772
fn phsubsw(a: i16x16, b: i16x16) -> i16x16;
3731-
#[link_name = "llvm.x86.avx2.pmadd.wd"]
3732-
fn pmaddwd(a: i16x16, b: i16x16) -> i32x8;
37333773
#[link_name = "llvm.x86.avx2.pmadd.ub.sw"]
37343774
fn pmaddubsw(a: u8x32, b: u8x32) -> i16x16;
37353775
#[link_name = "llvm.x86.avx2.maskload.d"]

crates/core_arch/src/x86/avx512bw.rs

Lines changed: 28 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -6299,8 +6299,22 @@ pub unsafe fn _mm_mask_storeu_epi8(mem_addr: *mut i8, mask: __mmask16, a: __m128
62996299
#[target_feature(enable = "avx512bw")]
63006300
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
63016301
#[cfg_attr(test, assert_instr(vpmaddwd))]
6302-
pub fn _mm512_madd_epi16(a: __m512i, b: __m512i) -> __m512i {
6303-
unsafe { transmute(vpmaddwd(a.as_i16x32(), b.as_i16x32())) }
6302+
#[rustc_const_unstable(feature = "stdarch_const_intrinsics", issue = "none")]
6303+
pub const fn _mm512_madd_epi16(a: __m512i, b: __m512i) -> __m512i {
6304+
unsafe {
6305+
let r: i32x32 = simd_mul(simd_cast(a.as_i16x32()), simd_cast(b.as_i16x32()));
6306+
let even: i32x16 = simd_shuffle!(
6307+
r,
6308+
r,
6309+
[0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30]
6310+
);
6311+
let odd: i32x16 = simd_shuffle!(
6312+
r,
6313+
r,
6314+
[1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31]
6315+
);
6316+
simd_add(even, odd).as_m512i()
6317+
}
63046318
}
63056319

63066320
/// Multiply packed signed 16-bit integers in a and b, producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -6310,7 +6324,8 @@ pub fn _mm512_madd_epi16(a: __m512i, b: __m512i) -> __m512i {
63106324
#[target_feature(enable = "avx512bw")]
63116325
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
63126326
#[cfg_attr(test, assert_instr(vpmaddwd))]
6313-
pub fn _mm512_mask_madd_epi16(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
6327+
#[rustc_const_unstable(feature = "stdarch_const_intrinsics", issue = "none")]
6328+
pub const fn _mm512_mask_madd_epi16(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
63146329
unsafe {
63156330
let madd = _mm512_madd_epi16(a, b).as_i32x16();
63166331
transmute(simd_select_bitmask(k, madd, src.as_i32x16()))
@@ -6324,7 +6339,8 @@ pub fn _mm512_mask_madd_epi16(src: __m512i, k: __mmask16, a: __m512i, b: __m512i
63246339
#[target_feature(enable = "avx512bw")]
63256340
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
63266341
#[cfg_attr(test, assert_instr(vpmaddwd))]
6327-
pub fn _mm512_maskz_madd_epi16(k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
6342+
#[rustc_const_unstable(feature = "stdarch_const_intrinsics", issue = "none")]
6343+
pub const fn _mm512_maskz_madd_epi16(k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
63286344
unsafe {
63296345
let madd = _mm512_madd_epi16(a, b).as_i32x16();
63306346
transmute(simd_select_bitmask(k, madd, i32x16::ZERO))
@@ -6338,7 +6354,8 @@ pub fn _mm512_maskz_madd_epi16(k: __mmask16, a: __m512i, b: __m512i) -> __m512i
63386354
#[target_feature(enable = "avx512bw,avx512vl")]
63396355
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
63406356
#[cfg_attr(test, assert_instr(vpmaddwd))]
6341-
pub fn _mm256_mask_madd_epi16(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
6357+
#[rustc_const_unstable(feature = "stdarch_const_intrinsics", issue = "none")]
6358+
pub const fn _mm256_mask_madd_epi16(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
63426359
unsafe {
63436360
let madd = _mm256_madd_epi16(a, b).as_i32x8();
63446361
transmute(simd_select_bitmask(k, madd, src.as_i32x8()))
@@ -6352,7 +6369,8 @@ pub fn _mm256_mask_madd_epi16(src: __m256i, k: __mmask8, a: __m256i, b: __m256i)
63526369
#[target_feature(enable = "avx512bw,avx512vl")]
63536370
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
63546371
#[cfg_attr(test, assert_instr(vpmaddwd))]
6355-
pub fn _mm256_maskz_madd_epi16(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
6372+
#[rustc_const_unstable(feature = "stdarch_const_intrinsics", issue = "none")]
6373+
pub const fn _mm256_maskz_madd_epi16(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
63566374
unsafe {
63576375
let madd = _mm256_madd_epi16(a, b).as_i32x8();
63586376
transmute(simd_select_bitmask(k, madd, i32x8::ZERO))
@@ -6366,7 +6384,8 @@ pub fn _mm256_maskz_madd_epi16(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
63666384
#[target_feature(enable = "avx512bw,avx512vl")]
63676385
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
63686386
#[cfg_attr(test, assert_instr(vpmaddwd))]
6369-
pub fn _mm_mask_madd_epi16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
6387+
#[rustc_const_unstable(feature = "stdarch_const_intrinsics", issue = "none")]
6388+
pub const fn _mm_mask_madd_epi16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
63706389
unsafe {
63716390
let madd = _mm_madd_epi16(a, b).as_i32x4();
63726391
transmute(simd_select_bitmask(k, madd, src.as_i32x4()))
@@ -6380,7 +6399,8 @@ pub fn _mm_mask_madd_epi16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) ->
63806399
#[target_feature(enable = "avx512bw,avx512vl")]
63816400
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
63826401
#[cfg_attr(test, assert_instr(vpmaddwd))]
6383-
pub fn _mm_maskz_madd_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
6402+
#[rustc_const_unstable(feature = "stdarch_const_intrinsics", issue = "none")]
6403+
pub const fn _mm_maskz_madd_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
63846404
unsafe {
63856405
let madd = _mm_madd_epi16(a, b).as_i32x4();
63866406
transmute(simd_select_bitmask(k, madd, i32x4::ZERO))
@@ -12464,8 +12484,6 @@ unsafe extern "C" {
1246412484
#[link_name = "llvm.x86.avx512.pmul.hr.sw.512"]
1246512485
fn vpmulhrsw(a: i16x32, b: i16x32) -> i16x32;
1246612486

12467-
#[link_name = "llvm.x86.avx512.pmaddw.d.512"]
12468-
fn vpmaddwd(a: i16x32, b: i16x32) -> i32x16;
1246912487
#[link_name = "llvm.x86.avx512.pmaddubs.w.512"]
1247012488
fn vpmaddubsw(a: i8x64, b: i8x64) -> i16x32;
1247112489

crates/core_arch/src/x86/sse2.rs

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -210,8 +210,14 @@ pub const fn _mm_avg_epu16(a: __m128i, b: __m128i) -> __m128i {
210210
#[target_feature(enable = "sse2")]
211211
#[cfg_attr(test, assert_instr(pmaddwd))]
212212
#[stable(feature = "simd_x86", since = "1.27.0")]
213-
pub fn _mm_madd_epi16(a: __m128i, b: __m128i) -> __m128i {
214-
unsafe { transmute(pmaddwd(a.as_i16x8(), b.as_i16x8())) }
213+
#[rustc_const_unstable(feature = "stdarch_const_intrinsics", issue = "none")]
214+
pub const fn _mm_madd_epi16(a: __m128i, b: __m128i) -> __m128i {
215+
unsafe {
216+
let r: i32x8 = simd_mul(simd_cast(a.as_i16x8()), simd_cast(b.as_i16x8()));
217+
let even: i32x4 = simd_shuffle!(r, r, [0, 2, 4, 6]);
218+
let odd: i32x4 = simd_shuffle!(r, r, [1, 3, 5, 7]);
219+
simd_add(even, odd).as_m128i()
220+
}
215221
}
216222

217223
/// Compares packed 16-bit integers in `a` and `b`, and returns the packed
@@ -3190,8 +3196,6 @@ unsafe extern "C" {
31903196
fn lfence();
31913197
#[link_name = "llvm.x86.sse2.mfence"]
31923198
fn mfence();
3193-
#[link_name = "llvm.x86.sse2.pmadd.wd"]
3194-
fn pmaddwd(a: i16x8, b: i16x8) -> i32x4;
31953199
#[link_name = "llvm.x86.sse2.psad.bw"]
31963200
fn psadbw(a: u8x16, b: u8x16) -> u64x2;
31973201
#[link_name = "llvm.x86.sse2.psll.w"]

0 commit comments

Comments
 (0)