Skip to content

Commit e1269b1

Browse files
authored
salsa20: re-add SSE2 support with reduced round soft fallback (#348)
- Reverts #346 - Add regression tests created in response to #346 - Add soft fallback for non salsa20/20 variants
1 parent adfead3 commit e1269b1

File tree

7 files changed

+289
-70
lines changed

7 files changed

+289
-70
lines changed

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

salsa20/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ keywords = ["crypto", "stream-cipher", "trait", "xsalsa20"]
1313
categories = ["cryptography", "no-std"]
1414

1515
[dependencies]
16+
cfg-if = "1"
1617
cipher = "=0.5.0-pre.4"
1718

1819
[dev-dependencies]

salsa20/src/backends.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
pub(crate) mod soft;
2+
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
3+
pub(crate) mod sse2;

salsa20/src/backends/soft.rs

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
//! Portable implementation which does not rely on architecture-specific
2+
//! intrinsics.
3+
4+
use crate::{Block, SalsaCore, Unsigned, STATE_WORDS};
5+
use cipher::{
6+
consts::{U1, U64},
7+
BlockSizeUser, ParBlocksSizeUser, StreamBackend, StreamCipherSeekCore,
8+
};
9+
10+
pub(crate) struct Backend<'a, R: Unsigned>(pub(crate) &'a mut SalsaCore<R>);
11+
12+
impl<'a, R: Unsigned> BlockSizeUser for Backend<'a, R> {
13+
type BlockSize = U64;
14+
}
15+
16+
impl<'a, R: Unsigned> ParBlocksSizeUser for Backend<'a, R> {
17+
type ParBlocksSize = U1;
18+
}
19+
20+
impl<'a, R: Unsigned> StreamBackend for Backend<'a, R> {
21+
#[inline(always)]
22+
fn gen_ks_block(&mut self, block: &mut Block<Self>) {
23+
let res = run_rounds::<R>(&self.0.state);
24+
25+
self.0.set_block_pos(self.0.get_block_pos() + 1);
26+
27+
for (chunk, val) in block.chunks_exact_mut(4).zip(res.iter()) {
28+
chunk.copy_from_slice(&val.to_le_bytes());
29+
}
30+
}
31+
}
32+
33+
#[inline]
34+
#[allow(clippy::many_single_char_names)]
35+
pub(crate) fn quarter_round(
36+
a: usize,
37+
b: usize,
38+
c: usize,
39+
d: usize,
40+
state: &mut [u32; STATE_WORDS],
41+
) {
42+
state[b] ^= state[a].wrapping_add(state[d]).rotate_left(7);
43+
state[c] ^= state[b].wrapping_add(state[a]).rotate_left(9);
44+
state[d] ^= state[c].wrapping_add(state[b]).rotate_left(13);
45+
state[a] ^= state[d].wrapping_add(state[c]).rotate_left(18);
46+
}
47+
48+
#[inline(always)]
49+
fn run_rounds<R: Unsigned>(state: &[u32; STATE_WORDS]) -> [u32; STATE_WORDS] {
50+
let mut res = *state;
51+
52+
for _ in 0..R::USIZE {
53+
// column rounds
54+
quarter_round(0, 4, 8, 12, &mut res);
55+
quarter_round(5, 9, 13, 1, &mut res);
56+
quarter_round(10, 14, 2, 6, &mut res);
57+
quarter_round(15, 3, 7, 11, &mut res);
58+
59+
// diagonal rounds
60+
quarter_round(0, 1, 2, 3, &mut res);
61+
quarter_round(5, 6, 7, 4, &mut res);
62+
quarter_round(10, 11, 8, 9, &mut res);
63+
quarter_round(15, 12, 13, 14, &mut res);
64+
}
65+
66+
for (s1, s0) in res.iter_mut().zip(state.iter()) {
67+
*s1 = s1.wrapping_add(*s0);
68+
}
69+
res
70+
}

salsa20/src/backends/sse2.rs

Lines changed: 165 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,165 @@
1+
use crate::{
2+
backends::soft::Backend as SoftBackend, Block, SalsaCore, StreamClosure, Unsigned, STATE_WORDS,
3+
};
4+
use cipher::{
5+
consts::{U1, U64},
6+
BlockSizeUser, ParBlocksSizeUser, StreamBackend,
7+
};
8+
use core::marker::PhantomData;
9+
10+
#[cfg(target_arch = "x86")]
11+
use core::arch::x86::*;
12+
#[cfg(target_arch = "x86_64")]
13+
use core::arch::x86_64::*;
14+
15+
#[inline]
16+
#[target_feature(enable = "sse2")]
17+
pub(crate) unsafe fn inner<R, F>(state: &mut [u32; STATE_WORDS], f: F)
18+
where
19+
R: Unsigned,
20+
F: StreamClosure<BlockSize = U64>,
21+
{
22+
let state_ptr = state.as_ptr() as *const __m128i;
23+
let mut backend = Backend::<R> {
24+
v: [
25+
_mm_loadu_si128(state_ptr.add(0)),
26+
_mm_loadu_si128(state_ptr.add(1)),
27+
_mm_loadu_si128(state_ptr.add(2)),
28+
_mm_loadu_si128(state_ptr.add(3)),
29+
],
30+
_pd: PhantomData,
31+
};
32+
33+
// The SSE2 backend only works for Salsa20/20. Any other variant will fallback to the soft backend.
34+
if R::USIZE == 10 {
35+
f.call(&mut backend);
36+
state[8] = _mm_cvtsi128_si32(backend.v[2]) as u32;
37+
} else {
38+
f.call(&mut SoftBackend(&mut SalsaCore::<R> {
39+
state: *state,
40+
rounds: PhantomData,
41+
}));
42+
}
43+
}
44+
45+
struct Backend<R: Unsigned> {
46+
v: [__m128i; 4],
47+
_pd: PhantomData<R>,
48+
}
49+
50+
impl<R: Unsigned> BlockSizeUser for Backend<R> {
51+
type BlockSize = U64;
52+
}
53+
54+
impl<R: Unsigned> ParBlocksSizeUser for Backend<R> {
55+
type ParBlocksSize = U1;
56+
}
57+
58+
impl<R: Unsigned> StreamBackend for Backend<R> {
59+
#[inline(always)]
60+
fn gen_ks_block(&mut self, block: &mut Block<Self>) {
61+
unsafe {
62+
let res = rounds::<R>(&self.v);
63+
64+
self.v[2] = _mm_add_epi32(self.v[2], _mm_set_epi32(0, 0, 0, 1));
65+
let block_ptr = block.as_mut_ptr() as *mut __m128i;
66+
67+
for (i, v) in res.iter().enumerate() {
68+
_mm_storeu_si128(block_ptr.add(i), *v);
69+
}
70+
}
71+
}
72+
}
73+
74+
#[inline]
75+
#[target_feature(enable = "sse2")]
76+
unsafe fn rounds<R: Unsigned>(v: &[__m128i; 4]) -> [__m128i; 4] {
77+
let mut res = *v;
78+
79+
for _ in 0..R::USIZE {
80+
double_round(&mut res);
81+
}
82+
83+
for i in 0..4 {
84+
res[i] = _mm_add_epi32(res[i], v[i]);
85+
}
86+
87+
transpose(&mut res);
88+
res[1] = _mm_shuffle_epi32(res[1], 0b_10_01_00_11);
89+
res[2] = _mm_shuffle_epi32(res[2], 0b_01_00_11_10);
90+
res[3] = _mm_shuffle_epi32(res[3], 0b_00_11_10_01);
91+
transpose(&mut res);
92+
93+
res
94+
}
95+
96+
/// One Salsa20 double round on four SSE2 vectors, updated in place.
///
/// Every step adds two vectors lane-wise, rotates each 32-bit lane of the sum
/// left (by 7, 9, 13, then 18 bits) and XORs the result into a third vector.
/// SSE2 has no 32-bit rotate, so each rotation is built from a left shift and
/// the complementary right shift. Between the two halves the lanes of `b`,
/// `c` and `d` are rotated so the same vector code can process the other
/// direction, and they are rotated back at the end.
///
/// https://users.rust-lang.org/t/can-the-compiler-infer-sse-instructions/59976
///
/// # Safety
///
/// The caller must ensure that the executing CPU supports SSE2.
#[inline]
#[target_feature(enable = "sse2")]
unsafe fn double_round([a, b, c, d]: &mut [__m128i; 4]) {
    // First half: operate on "columns".
    let sum = _mm_add_epi32(*a, *d);
    *b = _mm_xor_si128(
        *b,
        _mm_xor_si128(_mm_slli_epi32(sum, 7), _mm_srli_epi32(sum, 25)),
    );

    let sum = _mm_add_epi32(*b, *a);
    *c = _mm_xor_si128(
        *c,
        _mm_xor_si128(_mm_slli_epi32(sum, 9), _mm_srli_epi32(sum, 23)),
    );

    let sum = _mm_add_epi32(*c, *b);
    *d = _mm_xor_si128(
        *d,
        _mm_xor_si128(_mm_slli_epi32(sum, 13), _mm_srli_epi32(sum, 19)),
    );

    let sum = _mm_add_epi32(*d, *c);
    *a = _mm_xor_si128(
        *a,
        _mm_xor_si128(_mm_slli_epi32(sum, 18), _mm_srli_epi32(sum, 14)),
    );

    // Rotate the lanes of b, c and d so the second half lines up.
    *b = _mm_shuffle_epi32(*b, 0b10_01_00_11);
    *c = _mm_shuffle_epi32(*c, 0b01_00_11_10);
    *d = _mm_shuffle_epi32(*d, 0b00_11_10_01);

    // Second half: operate on "rows".
    let sum = _mm_add_epi32(*a, *b);
    *d = _mm_xor_si128(
        *d,
        _mm_xor_si128(_mm_slli_epi32(sum, 7), _mm_srli_epi32(sum, 25)),
    );

    let sum = _mm_add_epi32(*d, *a);
    *c = _mm_xor_si128(
        *c,
        _mm_xor_si128(_mm_slli_epi32(sum, 9), _mm_srli_epi32(sum, 23)),
    );

    let sum = _mm_add_epi32(*c, *d);
    *b = _mm_xor_si128(
        *b,
        _mm_xor_si128(_mm_slli_epi32(sum, 13), _mm_srli_epi32(sum, 19)),
    );

    let sum = _mm_add_epi32(*b, *c);
    *a = _mm_xor_si128(
        *a,
        _mm_xor_si128(_mm_slli_epi32(sum, 18), _mm_srli_epi32(sum, 14)),
    );

    // Undo the lane rotation applied before the second half.
    *b = _mm_shuffle_epi32(*b, 0b00_11_10_01);
    *c = _mm_shuffle_epi32(*c, 0b01_00_11_10);
    *d = _mm_shuffle_epi32(*d, 0b10_01_00_11);
}
149+
150+
/// Transposes, in place, the 4x4 matrix of 32-bit integers whose rows are the
/// four given SSE2 vectors.
///
/// 32-bit lanes of row pairs are interleaved first, then the resulting 64-bit
/// halves are combined into the output rows.
///
/// https://randombit.net/bitbashing/posts/integer_matrix_transpose_in_sse2.html
///
/// # Safety
///
/// The caller must ensure that the executing CPU supports SSE2.
#[inline]
#[target_feature(enable = "sse2")]
unsafe fn transpose([a, b, c, d]: &mut [__m128i; 4]) {
    // Interleave the low and high halves of rows (a, b) and (c, d).
    let ab_lo = _mm_unpacklo_epi32(*a, *b);
    let cd_lo = _mm_unpacklo_epi32(*c, *d);
    let ab_hi = _mm_unpackhi_epi32(*a, *b);
    let cd_hi = _mm_unpackhi_epi32(*c, *d);

    // Stitch the 64-bit halves together to form the transposed rows.
    *a = _mm_unpacklo_epi64(ab_lo, cd_lo);
    *b = _mm_unpackhi_epi64(ab_lo, cd_lo);
    *c = _mm_unpacklo_epi64(ab_hi, cd_hi);
    *d = _mm_unpackhi_epi64(ab_hi, cd_hi);
}

0 commit comments

Comments
 (0)