Skip to content

Commit 23f496d

Browse files
layout data in simd friendly way
Signed-off-by: eternal-flame-AD <yume@yumechi.jp>
1 parent 20b423b commit 23f496d

File tree

3 files changed

+47
-18
lines changed

3 files changed

+47
-18
lines changed

salsa20/src/backends/soft.rs

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,9 @@ impl<R: Unsigned> StreamCipherBackend for Backend<'_, R> {
2424

2525
self.0.set_block_pos(self.0.get_block_pos() + 1);
2626

27-
for (chunk, val) in block.chunks_exact_mut(4).zip(res.iter()) {
28-
chunk.copy_from_slice(&val.to_le_bytes());
27+
for i in 0..16 {
28+
block[i * 4..(i + 1) * 4]
29+
.copy_from_slice(&res[crate::DATA_LAYOUT_INVERSE[i]].to_le_bytes());
2930
}
3031
}
3132
}
@@ -39,6 +40,10 @@ pub(crate) fn quarter_round(
3940
d: usize,
4041
state: &mut [u32; STATE_WORDS],
4142
) {
43+
let a = crate::DATA_LAYOUT_INVERSE[a];
44+
let b = crate::DATA_LAYOUT_INVERSE[b];
45+
let c = crate::DATA_LAYOUT_INVERSE[c];
46+
let d = crate::DATA_LAYOUT_INVERSE[d];
4247
state[b] ^= state[a].wrapping_add(state[d]).rotate_left(7);
4348
state[c] ^= state[b].wrapping_add(state[a]).rotate_left(9);
4449
state[d] ^= state[c].wrapping_add(state[b]).rotate_left(13);

salsa20/src/lib.rs

Lines changed: 25 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,25 @@ const STATE_WORDS: usize = 16;
118118
/// State initialization constant ("expand 32-byte k")
119119
const CONSTANTS: [u32; 4] = [0x6170_7865, 0x3320_646e, 0x7962_2d32, 0x6b20_6574];
120120

121+
/// Permutation describing how state words are stored internally.
///
/// Stored position `i` holds the word whose conventional (unpermuted) index is
/// `DATA_LAYOUT[i]`; the constructors in this file build their state with
/// `core::array::from_fn(|i| state[DATA_LAYOUT[i]])`.
// NOTE(review): the commit message describes this ordering as "simd friendly";
// each group of four entries appears to be one diagonal of the 4x4 state
// matrix — confirm against the SIMD backend.
const DATA_LAYOUT: [usize; 16] = [0, 5, 10, 15, 4, 9, 14, 3, 12, 1, 6, 11, 8, 13, 2, 7];
122+
123+
/// Inverse of [`DATA_LAYOUT`], computed at compile time.
///
/// `DATA_LAYOUT_INVERSE[j]` is the stored position of the word whose
/// conventional index is `j`, i.e. `DATA_LAYOUT[DATA_LAYOUT_INVERSE[j]] == j`.
const DATA_LAYOUT_INVERSE: [usize; 16] = {
124+
// Filled in below; every slot is overwritten because `DATA_LAYOUT` contains
// each value in 0..16 exactly once.
let mut index = [0; 16];
125+
let mut i = 0;
126+
// `while` loops are used because `for` loops are not permitted in a const
// initializer.
while i < 16 {
127+
let mut inverse = 0;
128+
// Linear search for the stored position that holds conventional index `i`.
while inverse < 16 {
129+
if DATA_LAYOUT[inverse] == i {
130+
index[i] = inverse;
131+
// Found the unique match; move on to the next `i`.
break;
132+
}
133+
inverse += 1;
134+
}
135+
i += 1;
136+
}
137+
index
138+
};
139+
121140
/// The Salsa20 core function.
122141
pub struct SalsaCore<R: Unsigned> {
123142
/// Internal state of the core function
@@ -133,7 +152,7 @@ impl<R: Unsigned> SalsaCore<R> {
133152
/// Other users generally should not use this method.
134153
pub fn from_raw_state(state: [u32; STATE_WORDS]) -> Self {
135154
Self {
136-
state,
155+
state: core::array::from_fn(|i| state[DATA_LAYOUT[i]]),
137156
rounds: PhantomData,
138157
}
139158
}
@@ -177,7 +196,7 @@ impl<R: Unsigned> KeyIvInit for SalsaCore<R> {
177196
state[15] = CONSTANTS[3];
178197

179198
Self {
180-
state,
199+
state: core::array::from_fn(|i| state[DATA_LAYOUT[i]]),
181200
rounds: PhantomData,
182201
}
183202
}
@@ -199,13 +218,14 @@ impl<R: Unsigned> StreamCipherSeekCore for SalsaCore<R> {
199218

200219
// Returns the 64-bit block position assembled from two 32-bit state words:
// conventional word 8 supplies the low half and conventional word 9 the high
// half.
#[inline(always)]
201220
fn get_block_pos(&self) -> u64 {
202-
(self.state[8] as u64) + ((self.state[9] as u64) << 32)
221+
// The state is stored permuted, so the conventional indices 8 and 9 are
// translated to their stored positions through `DATA_LAYOUT_INVERSE`.
(self.state[DATA_LAYOUT_INVERSE[8]] as u64)
222+
+ ((self.state[DATA_LAYOUT_INVERSE[9]] as u64) << 32)
203223
}
204224

205225
// Stores the 64-bit block position `pos` into two 32-bit state words: the low
// 32 bits go to conventional word 8 and the high 32 bits to conventional
// word 9.
#[inline(always)]
206226
fn set_block_pos(&mut self, pos: u64) {
207-
self.state[8] = (pos & 0xffff_ffff) as u32;
208-
self.state[9] = ((pos >> 32) & 0xffff_ffff) as u32;
227+
// The state is stored permuted, so the conventional indices 8 and 9 are
// translated to their stored positions through `DATA_LAYOUT_INVERSE`.
self.state[DATA_LAYOUT_INVERSE[8]] = (pos & 0xffff_ffff) as u32;
228+
self.state[DATA_LAYOUT_INVERSE[9]] = ((pos >> 32) & 0xffff_ffff) as u32;
209229
}
210230
}
211231

salsa20/src/xsalsa.rs

Lines changed: 15 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -90,28 +90,32 @@ impl<R: Unsigned> ZeroizeOnDrop for XSalsaCore<R> {}
9090
///
9191
/// It produces 256-bits of output suitable for use as a Salsa20 key
9292
pub fn hsalsa<R: Unsigned>(key: &Key, input: &Array<u8, U16>) -> Array<u8, U32> {
93+
const KEY_IDX: [usize; 8] = [0, 5, 10, 15, 6, 7, 8, 9];
94+
9395
#[inline(always)]
9496
fn to_u32(chunk: &[u8]) -> u32 {
9597
u32::from_le_bytes(chunk.try_into().unwrap())
9698
}
9799

98-
let mut state = [0u32; 16];
99-
state[0] = CONSTANTS[0];
100-
state[1..5]
100+
let mut t = [0u32; 16];
101+
t[0] = CONSTANTS[0];
102+
t[1..5]
101103
.iter_mut()
102104
.zip(key[0..16].chunks_exact(4))
103105
.for_each(|(v, chunk)| *v = to_u32(chunk));
104-
state[5] = CONSTANTS[1];
105-
state[6..10]
106+
t[5] = CONSTANTS[1];
107+
t[6..10]
106108
.iter_mut()
107109
.zip(input.chunks_exact(4))
108110
.for_each(|(v, chunk)| *v = to_u32(chunk));
109-
state[10] = CONSTANTS[2];
110-
state[11..15]
111+
t[10] = CONSTANTS[2];
112+
t[11..15]
111113
.iter_mut()
112114
.zip(key[16..].chunks_exact(4))
113115
.for_each(|(v, chunk)| *v = to_u32(chunk));
114-
state[15] = CONSTANTS[3];
116+
t[15] = CONSTANTS[3];
117+
118+
let mut state = core::array::from_fn(|i| t[crate::DATA_LAYOUT[i]]);
115119

116120
// 20 rounds consisting of 10 column rounds and 10 diagonal rounds
117121
for _ in 0..R::USIZE {
@@ -129,10 +133,10 @@ pub fn hsalsa<R: Unsigned>(key: &Key, input: &Array<u8, U16>) -> Array<u8, U32>
129133
}
130134

131135
let mut output = Array::default();
132-
let key_idx: [usize; 8] = [0, 5, 10, 15, 6, 7, 8, 9];
133136

134-
for (i, chunk) in output.chunks_exact_mut(4).enumerate() {
135-
chunk.copy_from_slice(&state[key_idx[i]].to_le_bytes());
137+
for i in 0..8 {
138+
output[i * 4..(i + 1) * 4]
139+
.copy_from_slice(&state[crate::DATA_LAYOUT_INVERSE[KEY_IDX[i]]].to_le_bytes());
136140
}
137141

138142
output

0 commit comments

Comments
 (0)