Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion library/std/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -652,7 +652,11 @@ pub mod task {
pub use core::task::*;
}

#[doc = include_str!("../../stdarch/crates/core_arch/src/core_arch_docs.md")]
#[doc = concat!(
include_str!("../../stdarch/crates/core_arch/src/core_arch_docs_prefix.md"),
include_str!("../../stdarch/crates/core_arch/src/core_arch_docs_other_architectures_std.md"),
include_str!("../../stdarch/crates/core_arch/src/core_arch_docs_examples.md"),
)]
#[stable(feature = "simd_arch", since = "1.27.0")]
pub mod arch {
#[stable(feature = "simd_arch", since = "1.27.0")]
Expand Down
8 changes: 8 additions & 0 deletions library/stdarch/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,3 +18,11 @@ incremental = true
debug = 1
opt-level = 3
incremental = true

# The "dist" profile is used by bootstrap when building stdarch docs as part
# of the distribution pipeline. Keep it aligned with the library workspace so
# `cargo doc --profile=dist` works when stdarch is built as its own workspace.
[profile.dist]
inherits = "release"
codegen-units = 1
debug = 1
34 changes: 17 additions & 17 deletions library/stdarch/crates/core_arch/src/core_arch_docs.md
Original file line number Diff line number Diff line change
Expand Up @@ -199,23 +199,23 @@ others at:
* [`loongarch64`]
* [`s390x`]

[`x86`]: ../../core/arch/x86/index.html
[`x86_64`]: ../../core/arch/x86_64/index.html
[`arm`]: ../../core/arch/arm/index.html
[`aarch64`]: ../../core/arch/aarch64/index.html
[`amdgpu`]: ../../core/arch/amdgpu/index.html
[`hexagon`]: ../../core/arch/hexagon/index.html
[`riscv32`]: ../../core/arch/riscv32/index.html
[`riscv64`]: ../../core/arch/riscv64/index.html
[`mips`]: ../../core/arch/mips/index.html
[`mips64`]: ../../core/arch/mips64/index.html
[`powerpc`]: ../../core/arch/powerpc/index.html
[`powerpc64`]: ../../core/arch/powerpc64/index.html
[`nvptx`]: ../../core/arch/nvptx/index.html
[`wasm32`]: ../../core/arch/wasm32/index.html
[`loongarch32`]: ../../core/arch/loongarch32/index.html
[`loongarch64`]: ../../core/arch/loongarch64/index.html
[`s390x`]: ../../core/arch/s390x/index.html
[`x86`]: crate::arch::x86
[`x86_64`]: crate::arch::x86_64
[`arm`]: crate::arch::arm
[`aarch64`]: crate::arch::aarch64
[`amdgpu`]: crate::arch::amdgpu
[`hexagon`]: crate::arch::hexagon
[`riscv32`]: crate::arch::riscv32
[`riscv64`]: crate::arch::riscv64
[`mips`]: crate::arch::mips
[`mips64`]: crate::arch::mips64
[`powerpc`]: crate::arch::powerpc
[`powerpc64`]: crate::arch::powerpc64
[`nvptx`]: crate::arch::nvptx
[`wasm32`]: crate::arch::wasm32
[`loongarch32`]: crate::arch::loongarch32
[`loongarch64`]: crate::arch::loongarch64
[`s390x`]: crate::arch::s390x

# Examples

Expand Down
137 changes: 137 additions & 0 deletions library/stdarch/crates/core_arch/src/core_arch_docs_examples.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
# Examples

First let's take a look at not actually using any intrinsics but instead
using LLVM's auto-vectorization to produce optimized vectorized code for
AVX2 and also for the default platform.

```rust
fn main() {
    // One-element smoke test: 1 + 2 must end up in the output slot.
    let mut out = [0];
    add_quickly(&[1], &[2], &mut out);
    assert_eq!(out[0], 3);
}

/// Adds `a` and `b` element-wise into `c`, using the AVX2-enabled copy of the
/// loop when the running CPU supports it and the plain fallback otherwise.
fn add_quickly(a: &[u8], b: &[u8], c: &mut [u8]) {
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    {
        let has_avx2 = is_x86_feature_detected!("avx2");
        if has_avx2 {
            // SAFETY: the `avx2` feature was just confirmed to be available
            // on this CPU, which is what `add_quickly_avx2` is compiled for.
            unsafe { add_quickly_avx2(a, b, c) };
            return;
        }
    }

    // Reached on non-x86 targets and on x86 CPUs without AVX2.
    add_quickly_fallback(a, b, c);
}

/// Copy of the addition loop that is compiled with the `avx2` target feature
/// enabled.
///
/// # Safety
///
/// Must only be called on a CPU that supports AVX2.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
unsafe fn add_quickly_avx2(a: &[u8], b: &[u8], c: &mut [u8]) {
    // `add_quickly_fallback` is meant to be inlined at this call site, so its
    // loop gets compiled with AVX2 available.
    add_quickly_fallback(a, b, c);
}

/// Portable element-wise addition: `c[i] = a[i] + b[i]` for every index that
/// exists in all three slices.
fn add_quickly_fallback(a: &[u8], b: &[u8], c: &mut [u8]) {
    let inputs = a.iter().zip(b);
    for ((&lhs, &rhs), out) in inputs.zip(c) {
        *out = lhs + rhs;
    }
}
```

Next up let's take a look at an example of manually using intrinsics. Here
we'll be using SSE4.1 features to implement hex encoding.

```
fn main() {
    // Three bytes: too short for a full 16-byte SIMD iteration.
    let mut dst = [0; 32];
    hex_encode(b"\x01\x02\x03", &mut dst);
    assert_eq!(&dst[..6], b"010203");

    // Sixteen bytes holding 1..=16: exactly one full 16-byte block.
    let mut src = [0; 16];
    for (idx, byte) in src.iter_mut().enumerate() {
        *byte = (idx + 1) as u8;
    }
    hex_encode(&src, &mut dst);
    assert_eq!(&dst, b"0102030405060708090a0b0c0d0e0f10");
}

/// Writes the lowercase hexadecimal encoding of `src` into the front of `dst`,
/// using the SSE4.1 implementation when the running CPU supports it.
///
/// # Panics
///
/// Panics if `dst` is shorter than twice the length of `src`.
pub fn hex_encode(src: &[u8], dst: &mut [u8]) {
    // Every input byte expands to two output characters.
    let len = src.len().checked_mul(2).unwrap();
    assert!(dst.len() >= len);

    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    {
        let has_sse41 = is_x86_feature_detected!("sse4.1");
        if has_sse41 {
            // SAFETY: SSE4.1 was just detected at runtime, and the assertion
            // above guarantees `dst` can hold two bytes per byte of `src`.
            unsafe { hex_encode_sse41(src, dst) };
            return;
        }
    }

    // Reached on non-x86 targets and on x86 CPUs without SSE4.1.
    hex_encode_fallback(src, dst);
}

// translated from
// <https://github.com/Matherunner/bin2hex-sse/blob/master/base16_sse4.cpp>
/// Hex-encodes `src` into `dst` sixteen input bytes at a time, then hands any
/// trailing bytes (fewer than sixteen) to `hex_encode_fallback`.
///
/// # Safety
///
/// * The running CPU must support SSE4.1.
/// * `dst.len()` must be at least `2 * src.len()`; the vector stores write
///   through raw pointers and are not bounds-checked.
#[target_feature(enable = "sse4.1")]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
unsafe fn hex_encode_sse41(src: &[u8], dst: &mut [u8]) {
    #[cfg(target_arch = "x86")]
    use std::arch::x86::*;
    #[cfg(target_arch = "x86_64")]
    use std::arch::x86_64::*;

    // Surface contract violations in debug builds before any raw store runs.
    debug_assert!(dst.len() >= src.len() * 2);

    let mut blocks = src.chunks_exact(16);
    // Number of bytes of `dst` that have been filled so far.
    let mut written = 0_usize;

    // SAFETY: the caller guarantees SSE4.1 is available, so the intrinsics may
    // be executed. Each load reads exactly the 16 bytes of `block`. Each
    // iteration stores 32 bytes starting at `written`, which stays in bounds
    // because the caller guarantees `dst.len() >= 2 * src.len()`. The
    // unaligned load/store intrinsics have no alignment requirement.
    unsafe {
        let ascii_zero = _mm_set1_epi8(b'0' as i8);
        let nines = _mm_set1_epi8(9);
        // Added to nibbles 10..=15 so that they land on b'a'..=b'f'.
        let ascii_a = _mm_set1_epi8((b'a' - 9 - 1) as i8);
        let and4bits = _mm_set1_epi8(0xf);

        for block in blocks.by_ref() {
            let invec = _mm_loadu_si128(block.as_ptr() as *const _);

            // masked1 = low nibble of every byte, masked2 = high nibble
            let masked1 = _mm_and_si128(invec, and4bits);
            let masked2 = _mm_and_si128(_mm_srli_epi64(invec, 4), and4bits);

            // return 0xff corresponding to the elements > 9, or 0x00 otherwise
            let cmpmask1 = _mm_cmpgt_epi8(masked1, nines);
            let cmpmask2 = _mm_cmpgt_epi8(masked2, nines);

            // add '0' or the offset depending on the masks
            let masked1 = _mm_add_epi8(
                masked1,
                _mm_blendv_epi8(ascii_zero, ascii_a, cmpmask1),
            );
            let masked2 = _mm_add_epi8(
                masked2,
                _mm_blendv_epi8(ascii_zero, ascii_a, cmpmask2),
            );

            // interleave masked1 and masked2 bytes (high-nibble digit first)
            let res1 = _mm_unpacklo_epi8(masked2, masked1);
            let res2 = _mm_unpackhi_epi8(masked2, masked1);

            let out = dst.as_mut_ptr().add(written);
            _mm_storeu_si128(out as *mut _, res1);
            _mm_storeu_si128(out.add(16) as *mut _, res2);
            written += 32;
        }
    }

    // Fewer than 16 bytes remain; encode them with the scalar implementation.
    hex_encode_fallback(blocks.remainder(), &mut dst[written..]);
}

/// Portable hex encoder: writes two lowercase hex digits into `dst` for each
/// byte of `src`, high nibble first.
fn hex_encode_fallback(src: &[u8], dst: &mut [u8]) {
    const DIGITS: &[u8; 16] = b"0123456789abcdef";
    // Maps a nibble (0..=15) to its ASCII hex digit.
    let digit = |nibble: u8| DIGITS[usize::from(nibble)];

    for (&byte, pair) in src.iter().zip(dst.chunks_mut(2)) {
        pair[0] = digit(byte >> 4);
        pair[1] = digit(byte & 0xf);
    }
}
```
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Other architectures

This documentation is only for one particular architecture; you can find
others at:

* [`x86`]
* [`x86_64`]
* `arm`
* [`aarch64`]
* `amdgpu`
* `hexagon`
* `riscv32`
* `riscv64`
* `mips`
* `mips64`
* `powerpc`
* `powerpc64`
* `nvptx`
* [`wasm32`]
* `loongarch32`
* `loongarch64`
* `s390x`

[`x86`]: crate::arch::x86
[`x86_64`]: crate::arch::x86_64
[`aarch64`]: crate::arch::aarch64
[`wasm32`]: crate::arch::wasm32
Loading
Loading