diff --git a/salsa20/benches/mod.rs b/salsa20/benches/mod.rs
index f07b023a..c33b40ba 100644
--- a/salsa20/benches/mod.rs
+++ b/salsa20/benches/mod.rs
@@ -1,9 +1,15 @@
 #![feature(test)]
+
+use cipher::{
+    Array,
+    consts::{U1, U4, U64},
+};
 extern crate test;
 
 cipher::stream_cipher_bench!(
     salsa20::Salsa8;
     salsa8_bench1_16b 16;
+    salsa8_bench1_64b 64;
     salsa8_bench2_256b 256;
     salsa8_bench3_1kib 1024;
     salsa8_bench4_16kib 16384;
@@ -12,6 +18,7 @@ cipher::stream_cipher_bench!(
 cipher::stream_cipher_bench!(
     salsa20::Salsa12;
     salsa12_bench1_16b 16;
+    salsa12_bench1_64b 64;
     salsa12_bench2_256b 256;
     salsa12_bench3_1kib 1024;
     salsa12_bench4_16kib 16384;
@@ -20,7 +27,52 @@ cipher::stream_cipher_bench!(
 cipher::stream_cipher_bench!(
     salsa20::Salsa20;
     salsa20_bench1_16b 16;
+    salsa20_bench1_64b 64;
     salsa20_bench2_256b 256;
     salsa20_bench3_1kib 1024;
     salsa20_bench4_16kib 16384;
 );
+
+#[bench]
+fn salsa8_bench1_ks_altn(b: &mut test::Bencher) {
+    use salsa20::SalsaChaining;
+    use std::hash::{BuildHasher, Hasher};
+
+    let seed = std::hash::RandomState::new().build_hasher().finish();
+
+    let mut buf: Array<[u32; 16], U1> = [[0u32; 16]].into();
+    buf[0][0] = seed as u32;
+    buf[0][1] = (seed >> 32) as u32;
+
+    b.iter(|| {
+        let mut cipher = salsa20::SalsaCore::<U4>::from_raw_state_cv(buf);
+        cipher.write_keystream_block_cv([&mut buf[0]].into());
+        test::black_box(&buf);
+    });
+
+    b.bytes = buf[0].len() as u64 * core::mem::size_of::<u32>() as u64;
+}
+
+#[bench]
+fn salsa8_bench1_ks(b: &mut test::Bencher) {
+    use cipher::StreamCipherCore;
+    use std::hash::{BuildHasher, Hasher};
+
+    let seed = std::hash::RandomState::new().build_hasher().finish();
+
+    let mut buf = [0u32; 16];
+    buf[0] = seed as u32;
+    buf[1] = (seed >> 32) as u32;
+
+    b.iter(|| {
+        let mut cipher = salsa20::SalsaCore::<U4>::from_raw_state(buf);
+        cipher.write_keystream_block(unsafe {
+            // SAFETY: relies on `Array<u8, U64>` having the layout of `[u8; 64]`, i.e. the
+            // same size as `[u32; 16]` with a weaker alignment, and any bit pattern is valid.
+            core::mem::transmute::<&mut [u32; 16], &mut Array<u8, U64>>(&mut buf)
+        });
+        test::black_box(&buf);
+    });
+
+    b.bytes = buf.len() as u64 * core::mem::size_of::<u32>() as u64;
+}
diff --git a/salsa20/src/backends.rs b/salsa20/src/backends.rs
index fbc9393c..4623d655 100644
--- a/salsa20/src/backends.rs
+++ b/salsa20/src/backends.rs
@@ -1 +1,30 @@
-pub(crate) mod soft;
+use cfg_if::cfg_if;
+
+cfg_if! {
+    if #[cfg(all(target_feature = "sse2", any(target_arch = "x86", target_arch = "x86_64")))] {
+        pub(crate) mod sse2;
+        pub(crate) type Backend<'a, R> = sse2::Backend<'a, R>;
+    } else {
+        pub(crate) mod soft;
+        pub(crate) type Backend<'a, R> = soft::Backend<'a, R>;
+    }
+}
+
+#[inline]
+#[allow(clippy::many_single_char_names)]
+pub(crate) fn quarter_round(
+    a: usize,
+    b: usize,
+    c: usize,
+    d: usize,
+    state: &mut [u32; crate::STATE_WORDS],
+) {
+    let a = crate::DATA_LAYOUT_INVERSE[a];
+    let b = crate::DATA_LAYOUT_INVERSE[b];
+    let c = crate::DATA_LAYOUT_INVERSE[c];
+    let d = crate::DATA_LAYOUT_INVERSE[d];
+    state[b] ^= state[a].wrapping_add(state[d]).rotate_left(7);
+    state[c] ^= state[b].wrapping_add(state[a]).rotate_left(9);
+    state[d] ^= state[c].wrapping_add(state[b]).rotate_left(13);
+    state[a] ^= state[d].wrapping_add(state[c]).rotate_left(18);
+}
diff --git a/salsa20/src/backends/soft.rs b/salsa20/src/backends/soft.rs
index caf2693f..7207020f 100644
--- a/salsa20/src/backends/soft.rs
+++ b/salsa20/src/backends/soft.rs
@@ -7,8 +7,16 @@ use cipher::{
     consts::{U1, U64},
 };
 
+use super::quarter_round;
+
 pub(crate) struct Backend<'a, R: Unsigned>(pub(crate) &'a mut SalsaCore<R>);
 
+impl<'a, R: Unsigned> From<&'a mut SalsaCore<R>> for Backend<'a, R> {
+    fn from(core: &'a mut SalsaCore<R>) -> Self {
+        Backend(core)
+    }
+}
+
 impl<R: Unsigned> BlockSizeUser for Backend<'_, R> {
     type BlockSize = U64;
 }
@@ -17,6 +25,17 @@ impl<R: Unsigned> ParBlocksSizeUser for Backend<'_, R> {
     type ParBlocksSize = U1;
 }
 
+impl<R: Unsigned> Backend<'_, R> {
+    #[inline(always)]
+    pub(crate) fn gen_ks_block_altn(&mut self, block: &mut [u32; STATE_WORDS]) {
+        let res = run_rounds::<R>(&self.0.state);
+
+        self.0.set_block_pos(self.0.get_block_pos() + 1);
+
+        block.copy_from_slice(&res);
+    }
+}
+
 impl<R: Unsigned> StreamCipherBackend for Backend<'_, R> {
     #[inline(always)]
     fn gen_ks_block(&mut self, block: &mut Block<Self>) {
@@ -24,27 +43,13 @@ impl<R: Unsigned> StreamCipherBackend for Backend<'_, R> {
 
         self.0.set_block_pos(self.0.get_block_pos() + 1);
 
-        for (chunk, val) in block.chunks_exact_mut(4).zip(res.iter()) {
-            chunk.copy_from_slice(&val.to_le_bytes());
+        for i in 0..16 {
+            block[i * 4..(i + 1) * 4]
+                .copy_from_slice(&res[crate::DATA_LAYOUT_INVERSE[i]].to_le_bytes());
         }
     }
 }
 
-#[inline]
-#[allow(clippy::many_single_char_names)]
-pub(crate) fn quarter_round(
-    a: usize,
-    b: usize,
-    c: usize,
-    d: usize,
-    state: &mut [u32; STATE_WORDS],
-) {
-    state[b] ^= state[a].wrapping_add(state[d]).rotate_left(7);
-    state[c] ^= state[b].wrapping_add(state[a]).rotate_left(9);
-    state[d] ^= state[c].wrapping_add(state[b]).rotate_left(13);
-    state[a] ^= state[d].wrapping_add(state[c]).rotate_left(18);
-}
-
 #[inline(always)]
 fn run_rounds<R: Unsigned>(state: &[u32; STATE_WORDS]) -> [u32; STATE_WORDS] {
     let mut res = *state;
diff --git a/salsa20/src/backends/sse2.rs b/salsa20/src/backends/sse2.rs
new file mode 100644
index 00000000..1276695a
--- /dev/null
+++ b/salsa20/src/backends/sse2.rs
@@ -0,0 +1,117 @@
+//! SSE2 backend for Salsa20.
+
+use crate::{Block, STATE_WORDS, SalsaCore, Unsigned};
+use cipher::{
+    BlockSizeUser, ParBlocksSizeUser, StreamCipherBackend, StreamCipherSeekCore,
+    consts::{U1, U64},
+};
+
+pub(crate) struct Backend<'a, R: Unsigned>(pub(crate) &'a mut SalsaCore<R>);
+
+impl<'a, R: Unsigned> From<&'a mut SalsaCore<R>> for Backend<'a, R> {
+    fn from(core: &'a mut SalsaCore<R>) -> Self {
+        Backend(core)
+    }
+}
+
+impl<R: Unsigned> BlockSizeUser for Backend<'_, R> {
+    type BlockSize = U64;
+}
+
+impl<R: Unsigned> ParBlocksSizeUser for Backend<'_, R> {
+    type ParBlocksSize = U1;
+}
+
+impl<R: Unsigned> Backend<'_, R> {
+    #[inline(always)]
+    pub(crate) fn gen_ks_block_altn(&mut self, block: &mut [u32; STATE_WORDS]) {
+        let res = run_rounds_sse2::<R>(&self.0.state);
+
+        self.0.set_block_pos(self.0.get_block_pos() + 1);
+
+        block.copy_from_slice(&res);
+    }
+}
+
+impl<R: Unsigned> StreamCipherBackend for Backend<'_, R> {
+    #[inline(always)]
+    fn gen_ks_block(&mut self, block: &mut Block<Self>) {
+        let res = run_rounds_sse2::<R>(&self.0.state);
+
+        self.0.set_block_pos(self.0.get_block_pos() + 1);
+
+        for i in 0..16 {
+            block[i * 4..(i + 1) * 4]
+                .copy_from_slice(&res[crate::DATA_LAYOUT_INVERSE[i]].to_le_bytes());
+        }
+    }
+}
+
+#[inline(always)]
+fn run_rounds_sse2<R: Unsigned>(state: &[u32; STATE_WORDS]) -> [u32; STATE_WORDS] {
+    #[cfg(target_arch = "x86")]
+    use core::arch::x86::*;
+    #[cfg(target_arch = "x86_64")]
+    use core::arch::x86_64::*;
+
+    // SAFETY: this module is only compiled when `target_feature = "sse2"` is enabled,
+    // and every unaligned load/store touches four `u32`s inside a 16-word array.
+    unsafe {
+        let [a_save, b_save, d_save, c_save] = [
+            _mm_loadu_si128(state.as_ptr().add(0).cast()),
+            _mm_loadu_si128(state.as_ptr().add(4).cast()),
+            _mm_loadu_si128(state.as_ptr().add(8).cast()),
+            _mm_loadu_si128(state.as_ptr().add(12).cast()),
+        ];
+        let [mut a, mut b, mut c, mut d] = [a_save, b_save, c_save, d_save];
+
+        macro_rules! mm_rol_epi32x {
+            ($w:expr, $amt:literal) => {{
+                let w = $w;
+                _mm_xor_si128(_mm_slli_epi32(w, $amt), _mm_srli_epi32(w, 32 - $amt))
+            }};
+        }
+
+        macro_rules! quarter_xmmwords {
+            ($a:expr, $b:expr, $c:expr, $d:expr) => {
+                $b = _mm_xor_si128($b, mm_rol_epi32x!(_mm_add_epi32($a, $d), 7));
+                $c = _mm_xor_si128($c, mm_rol_epi32x!(_mm_add_epi32($b, $a), 9));
+                $d = _mm_xor_si128($d, mm_rol_epi32x!(_mm_add_epi32($c, $b), 13));
+                $a = _mm_xor_si128($a, mm_rol_epi32x!(_mm_add_epi32($d, $c), 18));
+            };
+        }
+
+        for _ in 0..R::USIZE {
+            quarter_xmmwords!(a, b, c, d);
+
+            // a stays in place
+            // b = left shuffle d by 1 element
+            d = _mm_shuffle_epi32(d, 0b00_11_10_01);
+            // c = left shuffle c by 2 elements
+            c = _mm_shuffle_epi32(c, 0b01_00_11_10);
+            // d = left shuffle b by 3 elements
+            b = _mm_shuffle_epi32(b, 0b10_01_00_11);
+
+            (b, d) = (d, b);
+
+            quarter_xmmwords!(a, b, c, d);
+
+            // a stays in place
+            // b = left shuffle d by 1 element
+            d = _mm_shuffle_epi32(d, 0b00_11_10_01);
+            // c = left shuffle c by 2 elements
+            c = _mm_shuffle_epi32(c, 0b01_00_11_10);
+            // d = left shuffle b by 3 elements
+            b = _mm_shuffle_epi32(b, 0b10_01_00_11);
+
+            (b, d) = (d, b);
+        }
+
+        let mut res = [0u32; STATE_WORDS];
+        _mm_storeu_si128(res.as_mut_ptr().add(0).cast(), _mm_add_epi32(a, a_save));
+        _mm_storeu_si128(res.as_mut_ptr().add(4).cast(), _mm_add_epi32(b, b_save));
+        _mm_storeu_si128(res.as_mut_ptr().add(8).cast(), _mm_add_epi32(d, d_save));
+        _mm_storeu_si128(res.as_mut_ptr().add(12).cast(), _mm_add_epi32(c, c_save));
+        res
+    }
+}
diff --git a/salsa20/src/lib.rs b/salsa20/src/lib.rs
index c75246f0..9adbfcd2 100644
--- a/salsa20/src/lib.rs
+++ b/salsa20/src/lib.rs
@@ -78,8 +78,8 @@ pub use cipher;
 use cipher::{
     Block, BlockSizeUser, IvSizeUser, KeyIvInit, KeySizeUser, StreamCipherClosure, StreamCipherCore,
     StreamCipherCoreWrapper, StreamCipherSeekCore,
-    array::{Array, typenum::Unsigned},
-    consts::{U4, U6, U8, U10, U24, U32, U64},
+    array::{Array, ArraySize, typenum::Unsigned},
+    consts::{U1, U4, U6, U8, U10, U24, U32, U64},
 };
 
 use core::marker::PhantomData;
@@ -118,7 +118,29 @@ const STATE_WORDS: usize = 16;
 /// State initialization constant ("expand 32-byte k")
 const CONSTANTS: [u32; 4] = [0x6170_7865, 0x3320_646e, 0x7962_2d32, 0x6b20_6574];
 
+/// Internal word order: `state[i]` holds word `DATA_LAYOUT[i]` of the natural Salsa20 state.
+const DATA_LAYOUT: [usize; 16] = [0, 5, 10, 15, 4, 9, 14, 3, 12, 1, 6, 11, 8, 13, 2, 7];
+
+/// Inverse of [`DATA_LAYOUT`]: natural word `i` is stored at `state[DATA_LAYOUT_INVERSE[i]]`.
+const DATA_LAYOUT_INVERSE: [usize; 16] = {
+    let mut index = [0; 16];
+    let mut i = 0;
+    while i < 16 {
+        let mut inverse = 0;
+        while inverse < 16 {
+            if DATA_LAYOUT[inverse] == i {
+                index[i] = inverse;
+                break;
+            }
+            inverse += 1;
+        }
+        i += 1;
+    }
+    index
+};
+
 /// The Salsa20 core function.
+#[repr(transparent)]
 pub struct SalsaCore<R: Unsigned> {
     /// Internal state of the core function
     state: [u32; STATE_WORDS],
@@ -126,6 +148,56 @@ pub struct SalsaCore<R: Unsigned> {
     rounds: PhantomData<R>,
 }
 
+#[expect(unused)]
+const STATIC_ASSERT_CORE_IS_64_BYTES: [(); size_of::<SalsaCore<U10>>()] = [(); 64];
+
+/// Salsa20 chaining operations.
+pub trait SalsaChaining: BlockSizeUser {
+    /// Number of lanes
+    type LaneCount: ArraySize;
+
+    /// Permutation table for shuffling the natural order state into the internal order.
+    const ALTN_DATA_LAYOUT: [usize; STATE_WORDS];
+
+    /// Inverse permutation table.
+    const INVERSE_ALTN_DATA_LAYOUT: [usize; STATE_WORDS] = {
+        let mut index = [0; 16];
+        let mut i = 0;
+        while i < 16 {
+            let mut inverse = 0;
+            while inverse < 16 {
+                if Self::ALTN_DATA_LAYOUT[inverse] == i {
+                    index[i] = inverse;
+                    break;
+                }
+                inverse += 1;
+            }
+            i += 1;
+        }
+        index
+    };
+
+    /// Shuffle the state into the internal data layout.
+    fn shuffle_state_into_altn(state: &mut [u32; STATE_WORDS]) {
+        // Permute from a copy: an in-place loop would read words it has already overwritten.
+        let natural = *state;
+        *state = core::array::from_fn(|i| natural[Self::ALTN_DATA_LAYOUT[i]]);
+    }
+
+    /// Shuffle the state from the internal data layout.
+    fn shuffle_state_from_altn(state: &mut [u32; STATE_WORDS]) {
+        // Permute from a copy: an in-place loop would read words it has already overwritten.
+        let internal = *state;
+        *state = core::array::from_fn(|i| internal[Self::INVERSE_ALTN_DATA_LAYOUT[i]]);
+    }
+
+    /// Instantiate new Salsa core from raw state in internal order.
+    fn from_raw_state_cv(state: Array<[u32; STATE_WORDS], Self::LaneCount>) -> Self;
+
+    /// Generate keystream block in internal order.
+    fn write_keystream_block_cv(&mut self, block: Array<&mut [u32; STATE_WORDS], Self::LaneCount>);
+}
+
 impl<R: Unsigned> SalsaCore<R> {
     /// Create new Salsa core from raw state.
     ///
@@ -133,12 +205,40 @@ impl<R: Unsigned> SalsaCore<R> {
     /// Other users generally should not use this method.
     pub fn from_raw_state(state: [u32; STATE_WORDS]) -> Self {
         Self {
-            state,
+            state: core::array::from_fn(|i| state[DATA_LAYOUT[i]]),
             rounds: PhantomData,
         }
     }
 }
 
+impl<R: Unsigned> SalsaChaining for SalsaCore<R> {
+    type LaneCount = U1;
+
+    const ALTN_DATA_LAYOUT: [usize; STATE_WORDS] = DATA_LAYOUT;
+
+    /// Create new Salsa core from raw state with alternative data layout.
+    ///
+    /// This method is mainly intended for the `scrypt` crate.
+    /// Other users generally should not use this method.
+    fn from_raw_state_cv(state: Array<[u32; STATE_WORDS], Self::LaneCount>) -> Self {
+        Self {
+            state: state[0],
+            rounds: PhantomData,
+        }
+    }
+
+    /// Generate keystream block with alternative data layout.
+    ///
+    /// The block position is advanced by one once the block has been generated.
+    fn write_keystream_block_cv(
+        &mut self,
+        mut block: Array<&mut [u32; STATE_WORDS], Self::LaneCount>,
+    ) {
+        let mut backend = backends::Backend::<'_, R>::from(self);
+        backend.gen_ks_block_altn(block[0]);
+    }
+}
+
 impl<R: Unsigned> KeySizeUser for SalsaCore<R> {
     type KeySize = U32;
 }
@@ -177,7 +277,7 @@ impl<R: Unsigned> KeyIvInit for SalsaCore<R> {
         state[15] = CONSTANTS[3];
 
         Self {
-            state,
+            state: core::array::from_fn(|i| state[DATA_LAYOUT[i]]),
             rounds: PhantomData,
         }
     }
@@ -190,7 +290,7 @@ impl<R: Unsigned> StreamCipherCore for SalsaCore<R> {
         rem.try_into().ok()
     }
     fn process_with_backend(&mut self, f: impl StreamCipherClosure<BlockSize = Self::BlockSize>) {
-        f.call(&mut backends::soft::Backend(self));
+        f.call(&mut backends::Backend::<'_, R>::from(self));
     }
 }
 
@@ -199,13 +299,14 @@ impl<R: Unsigned> StreamCipherSeekCore for SalsaCore<R> {
 
     #[inline(always)]
     fn get_block_pos(&self) -> u64 {
-        (self.state[8] as u64) + ((self.state[9] as u64) << 32)
+        (self.state[DATA_LAYOUT_INVERSE[8]] as u64)
+            + ((self.state[DATA_LAYOUT_INVERSE[9]] as u64) << 32)
     }
 
     #[inline(always)]
     fn set_block_pos(&mut self, pos: u64) {
-        self.state[8] = (pos & 0xffff_ffff) as u32;
-        self.state[9] = ((pos >> 32) & 0xffff_ffff) as u32;
+        self.state[DATA_LAYOUT_INVERSE[8]] = (pos & 0xffff_ffff) as u32;
+        self.state[DATA_LAYOUT_INVERSE[9]] = ((pos >> 32) & 0xffff_ffff) as u32;
     }
 }
 
diff --git a/salsa20/src/xsalsa.rs b/salsa20/src/xsalsa.rs
index 9f84cef5..aa30349b 100644
--- a/salsa20/src/xsalsa.rs
+++ b/salsa20/src/xsalsa.rs
@@ -8,7 +8,7 @@ use cipher::{
     consts::{U4, U6, U10, U16, U24, U32, U64},
 };
 
-use crate::backends::soft::quarter_round;
+use crate::backends::quarter_round;
 
 #[cfg(feature = "zeroize")]
 use cipher::zeroize::ZeroizeOnDrop;
@@ -90,28 +90,32 @@ impl<R: Unsigned> ZeroizeOnDrop for XSalsaCore<R> {}
 ///
 /// It produces 256-bits of output suitable for use as a Salsa20 key
 pub fn hsalsa<R: Unsigned>(key: &Key<XSalsaCore<R>>, input: &Array<u8, U16>) -> Array<u8, U32> {
+    const KEY_IDX: [usize; 8] = [0, 5, 10, 15, 6, 7, 8, 9];
+
     #[inline(always)]
     fn to_u32(chunk: &[u8]) -> u32 {
         u32::from_le_bytes(chunk.try_into().unwrap())
     }
 
-    let mut state = [0u32; 16];
-    state[0] = CONSTANTS[0];
-    state[1..5]
+    let mut t = [0u32; 16];
+    t[0] = CONSTANTS[0];
+    t[1..5]
         .iter_mut()
         .zip(key[0..16].chunks_exact(4))
         .for_each(|(v, chunk)| *v = to_u32(chunk));
-    state[5] = CONSTANTS[1];
-    state[6..10]
+    t[5] = CONSTANTS[1];
+    t[6..10]
         .iter_mut()
         .zip(input.chunks_exact(4))
         .for_each(|(v, chunk)| *v = to_u32(chunk));
-    state[10] = CONSTANTS[2];
-    state[11..15]
+    t[10] = CONSTANTS[2];
+    t[11..15]
         .iter_mut()
         .zip(key[16..].chunks_exact(4))
         .for_each(|(v, chunk)| *v = to_u32(chunk));
-    state[15] = CONSTANTS[3];
+    t[15] = CONSTANTS[3];
+
+    let mut state = core::array::from_fn(|i| t[crate::DATA_LAYOUT[i]]);
 
     // 20 rounds consisting of 10 column rounds and 10 diagonal rounds
     for _ in 0..R::USIZE {
@@ -129,10 +133,10 @@ pub fn hsalsa<R: Unsigned>(key: &Key<XSalsaCore<R>>, input: &Array<u8, U16>) -> Array<u8, U32> {
     }
 
     let mut output = Array::default();
-    let key_idx: [usize; 8] = [0, 5, 10, 15, 6, 7, 8, 9];
 
-    for (i, chunk) in output.chunks_exact_mut(4).enumerate() {
-        chunk.copy_from_slice(&state[key_idx[i]].to_le_bytes());
+    for i in 0..8 {
+        output[i * 4..(i + 1) * 4]
+            .copy_from_slice(&state[crate::DATA_LAYOUT_INVERSE[KEY_IDX[i]]].to_le_bytes());
     }
 
     output