Skip to content

Commit 0f4fc6b

Browse files
authored
Merge pull request #1192 from kazcw/chacha-faster
Chacha: performance improvements
2 parents fa6638b + fb7af73 commit 0f4fc6b

File tree

3 files changed

+51
-51
lines changed

3 files changed

+51
-51
lines changed

rand_chacha/CHANGELOG.md

+1
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
66

77
## [Unreleased]
88
- Made `rand_chacha` propagate the `std` feature down to `rand_core`
9+
- Performance improvements for AVX2: ~4-7%
910

1011
## [0.3.1] - 2021-06-09
1112
- add getters corresponding to existing setters: `get_seed`, `get_stream` (#1124)

rand_chacha/Cargo.toml

+1-1
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ edition = "2018"
1616

1717
[dependencies]
1818
rand_core = { path = "../rand_core", version = "0.6.0" }
19-
ppv-lite86 = { version = "0.2.8", default-features = false, features = ["simd"] }
19+
ppv-lite86 = { version = "0.2.14", default-features = false, features = ["simd"] }
2020
serde = { version = "1.0", features = ["derive"], optional = true }
2121

2222
[dev-dependencies]

rand_chacha/src/guts.rs

+49-50
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
use ppv_lite86::{dispatch, dispatch_light128};
1313

1414
pub use ppv_lite86::Machine;
15-
use ppv_lite86::{vec128_storage, ArithOps, BitOps32, LaneWords4, MultiLane, StoreBytes, Vec4};
15+
use ppv_lite86::{vec128_storage, ArithOps, BitOps32, LaneWords4, MultiLane, StoreBytes, Vec4, Vec4Ext, Vector};
1616

1717
pub(crate) const BLOCK: usize = 16;
1818
pub(crate) const BLOCK64: u64 = BLOCK as u64;
@@ -73,12 +73,6 @@ impl ChaCha {
7373
init_chacha(key, nonce)
7474
}
7575

76-
#[inline(always)]
77-
fn pos64<M: Machine>(&self, m: M) -> u64 {
78-
let d: M::u32x4 = m.unpack(self.d);
79-
((d.extract(1) as u64) << 32) | d.extract(0) as u64
80-
}
81-
8276
/// Produce 4 blocks of output, advancing the state
8377
#[inline(always)]
8478
pub fn refill4(&mut self, drounds: u32, out: &mut [u32; BUFSZ]) {
@@ -111,70 +105,75 @@ impl ChaCha {
111105
}
112106
}
113107

114-
#[allow(clippy::many_single_char_names)]
108+
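// This implementation is platform-independent, but it is only compiled on big-endian targets; little-endian targets use the vectorized versions below.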
115109
#[inline(always)]
116-
fn refill_wide_impl<Mach: Machine>(
117-
m: Mach, state: &mut ChaCha, drounds: u32, out: &mut [u32; BUFSZ],
118-
) {
119-
let k = m.vec([0x6170_7865, 0x3320_646e, 0x7962_2d32, 0x6b20_6574]);
120-
let mut pos = state.pos64(m);
121-
let d0: Mach::u32x4 = m.unpack(state.d);
110+
#[cfg(target_endian = "big")]
111+
fn add_pos<Mach: Machine>(_m: Mach, d0: Mach::u32x4, i: u64) -> Mach::u32x4 {
112+
let pos0 = ((d0.extract(1) as u64) << 32) | d0.extract(0) as u64;
113+
let pos = pos0.wrapping_add(i);
114+
d0.insert((pos >> 32) as u32, 1).insert(pos as u32, 0)
115+
}
116+
#[inline(always)]
117+
#[cfg(target_endian = "big")]
118+
fn d0123<Mach: Machine>(m: Mach, d: vec128_storage) -> Mach::u32x4x4 {
119+
let d0: Mach::u32x4 = m.unpack(d);
120+
let mut pos = ((d0.extract(1) as u64) << 32) | d0.extract(0) as u64;
122121
pos = pos.wrapping_add(1);
123122
let d1 = d0.insert((pos >> 32) as u32, 1).insert(pos as u32, 0);
124123
pos = pos.wrapping_add(1);
125124
let d2 = d0.insert((pos >> 32) as u32, 1).insert(pos as u32, 0);
126125
pos = pos.wrapping_add(1);
127126
let d3 = d0.insert((pos >> 32) as u32, 1).insert(pos as u32, 0);
127+
Mach::u32x4x4::from_lanes([d0, d1, d2, d3])
128+
}
129+
130+
// Pos is packed into the state vectors as a little-endian u64,
131+
// so on LE platforms we can use native vector ops to increment it.
132+
#[inline(always)]
133+
#[cfg(target_endian = "little")]
134+
fn add_pos<Mach: Machine>(m: Mach, d: Mach::u32x4, i: u64) -> Mach::u32x4 {
135+
let d0: Mach::u64x2 = m.unpack(d.into());
136+
let incr = m.vec([i, 0]);
137+
m.unpack((d0 + incr).into())
138+
}
139+
#[inline(always)]
140+
#[cfg(target_endian = "little")]
141+
fn d0123<Mach: Machine>(m: Mach, d: vec128_storage) -> Mach::u32x4x4 {
142+
let d0: Mach::u64x2 = m.unpack(d);
143+
let incr = Mach::u64x2x4::from_lanes([m.vec([0, 0]), m.vec([1, 0]), m.vec([2, 0]), m.vec([3, 0])]);
144+
m.unpack((Mach::u64x2x4::from_lanes([d0, d0, d0, d0]) + incr).into())
145+
}
128146

147+
#[allow(clippy::many_single_char_names)]
148+
#[inline(always)]
149+
fn refill_wide_impl<Mach: Machine>(
150+
m: Mach, state: &mut ChaCha, drounds: u32, out: &mut [u32; BUFSZ],
151+
) {
152+
let k = m.vec([0x6170_7865, 0x3320_646e, 0x7962_2d32, 0x6b20_6574]);
129153
let b = m.unpack(state.b);
130154
let c = m.unpack(state.c);
131155
let mut x = State {
132156
a: Mach::u32x4x4::from_lanes([k, k, k, k]),
133157
b: Mach::u32x4x4::from_lanes([b, b, b, b]),
134158
c: Mach::u32x4x4::from_lanes([c, c, c, c]),
135-
d: m.unpack(Mach::u32x4x4::from_lanes([d0, d1, d2, d3]).into()),
159+
d: d0123(m, state.d),
136160
};
137161
for _ in 0..drounds {
138162
x = round(x);
139163
x = undiagonalize(round(diagonalize(x)));
140164
}
141-
let mut pos = state.pos64(m);
142-
let d0: Mach::u32x4 = m.unpack(state.d);
143-
pos = pos.wrapping_add(1);
144-
let d1 = d0.insert((pos >> 32) as u32, 1).insert(pos as u32, 0);
145-
pos = pos.wrapping_add(1);
146-
let d2 = d0.insert((pos >> 32) as u32, 1).insert(pos as u32, 0);
147-
pos = pos.wrapping_add(1);
148-
let d3 = d0.insert((pos >> 32) as u32, 1).insert(pos as u32, 0);
149-
pos = pos.wrapping_add(1);
150-
let d4 = d0.insert((pos >> 32) as u32, 1).insert(pos as u32, 0);
151-
152-
let (a, b, c, d) = (
153-
x.a.to_lanes(),
154-
x.b.to_lanes(),
155-
x.c.to_lanes(),
156-
x.d.to_lanes(),
157-
);
165+
let kk = Mach::u32x4x4::from_lanes([k, k, k, k]);
158166
let sb = m.unpack(state.b);
167+
let sb = Mach::u32x4x4::from_lanes([sb, sb, sb, sb]);
159168
let sc = m.unpack(state.c);
160-
let sd = [m.unpack(state.d), d1, d2, d3];
161-
state.d = d4.into();
162-
out[0..4].copy_from_slice(&(a[0] + k).to_lanes());
163-
out[4..8].copy_from_slice(&(b[0] + sb).to_lanes());
164-
out[8..12].copy_from_slice(&(c[0] + sc).to_lanes());
165-
out[12..16].copy_from_slice(&(d[0] + sd[0]).to_lanes());
166-
out[16..20].copy_from_slice(&(a[1] + k).to_lanes());
167-
out[20..24].copy_from_slice(&(b[1] + sb).to_lanes());
168-
out[24..28].copy_from_slice(&(c[1] + sc).to_lanes());
169-
out[28..32].copy_from_slice(&(d[1] + sd[1]).to_lanes());
170-
out[32..36].copy_from_slice(&(a[2] + k).to_lanes());
171-
out[36..40].copy_from_slice(&(b[2] + sb).to_lanes());
172-
out[40..44].copy_from_slice(&(c[2] + sc).to_lanes());
173-
out[44..48].copy_from_slice(&(d[2] + sd[2]).to_lanes());
174-
out[48..52].copy_from_slice(&(a[3] + k).to_lanes());
175-
out[52..56].copy_from_slice(&(b[3] + sb).to_lanes());
176-
out[56..60].copy_from_slice(&(c[3] + sc).to_lanes());
177-
out[60..64].copy_from_slice(&(d[3] + sd[3]).to_lanes());
169+
let sc = Mach::u32x4x4::from_lanes([sc, sc, sc, sc]);
170+
let sd = d0123(m, state.d);
171+
let results = Mach::u32x4x4::transpose4(x.a + kk, x.b + sb, x.c + sc, x.d + sd);
172+
out[0..16].copy_from_slice(&results.0.to_scalars());
173+
out[16..32].copy_from_slice(&results.1.to_scalars());
174+
out[32..48].copy_from_slice(&results.2.to_scalars());
175+
out[48..64].copy_from_slice(&results.3.to_scalars());
176+
state.d = add_pos(m, sd.to_lanes()[0], 4).into();
178177
}
179178

180179
dispatch!(m, Mach, {

0 commit comments

Comments
 (0)