// Flattened unified diff (git patch) against the `rand_chacha` crate. It touches
// rand_chacha/CHANGELOG.md, rand_chacha/Cargo.toml and rand_chacha/src/guts.rs.
//
// Summary of the hunks below:
// - CHANGELOG.md: adds "Performance improvements for AVX2: ~4-7%" under [Unreleased].
// - Cargo.toml: raises the `ppv-lite86` requirement from 0.2.8 to 0.2.14.
// - src/guts.rs:
//   * imports `Vec4Ext` and `Vector` from `ppv_lite86` next to the existing names;
//   * removes the `ChaCha::pos64` helper;
//   * adds free functions `add_pos` and `d0123`, each in two variants selected by
//     `#[cfg(target_endian = ...)]`: the big-endian variant rebuilds a u64 from
//     u32 lanes 1 and 0, uses `wrapping_add`, and writes both halves back with
//     `insert`; the little-endian variant unpacks the same storage as `u64x2`
//     and adds `[i, 0]` with a vector addition;
//   * `refill_wide_impl` now takes the four `d` lanes from `d0123`, adds the
//     input state back with whole-`u32x4x4` additions, passes the four sums
//     through `transpose4`, copies 16 words at a time with `to_scalars`, and
//     stores `add_pos(m, sd.to_lanes()[0], 4)` into `state.d`.
//
// NOTE(review): every hunk is collapsed onto a handful of physical lines, so
// line-oriented tools (`patch`, `git apply`) presumably cannot consume this
// text as-is -- confirm against the upstream change before relying on it.
// NOTE(review): signatures such as `fn add_pos(_m: Mach, ...)` and
// `fn pos64(&self, m: M)` use `Mach` / `M` without a visible generic parameter
// list; angle-bracketed text looks to have been stripped during extraction --
// TODO confirm.
diff --git a/rand_chacha/CHANGELOG.md b/rand_chacha/CHANGELOG.md index 1367b34a3d7..7ef621f6781 100644 --- a/rand_chacha/CHANGELOG.md +++ b/rand_chacha/CHANGELOG.md @@ -6,6 +6,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] - Made `rand_chacha` propagate the `std` feature down to `rand_core` +- Performance improvements for AVX2: ~4-7% ## [0.3.1] - 2021-06-09 - add getters corresponding to existing setters: `get_seed`, `get_stream` (#1124) diff --git a/rand_chacha/Cargo.toml b/rand_chacha/Cargo.toml index f99d1967133..c4f5c113142 100644 --- a/rand_chacha/Cargo.toml +++ b/rand_chacha/Cargo.toml @@ -16,7 +16,7 @@ edition = "2018" [dependencies] rand_core = { path = "../rand_core", version = "0.6.0" } -ppv-lite86 = { version = "0.2.8", default-features = false, features = ["simd"] } +ppv-lite86 = { version = "0.2.14", default-features = false, features = ["simd"] } serde = { version = "1.0", features = ["derive"], optional = true } [dev-dependencies] diff --git a/rand_chacha/src/guts.rs b/rand_chacha/src/guts.rs index eeabd9f4c1d..797ded6fa73 100644 --- a/rand_chacha/src/guts.rs +++ b/rand_chacha/src/guts.rs @@ -12,7 +12,7 @@ use ppv_lite86::{dispatch, dispatch_light128}; pub use ppv_lite86::Machine; -use ppv_lite86::{vec128_storage, ArithOps, BitOps32, LaneWords4, MultiLane, StoreBytes, Vec4}; +use ppv_lite86::{vec128_storage, ArithOps, BitOps32, LaneWords4, MultiLane, StoreBytes, Vec4, Vec4Ext, Vector}; pub(crate) const BLOCK: usize = 16; pub(crate) const BLOCK64: u64 = BLOCK as u64; @@ -73,12 +73,6 @@ impl ChaCha { init_chacha(key, nonce) } - #[inline(always)] - fn pos64(&self, m: M) -> u64 { - let d: M::u32x4 = m.unpack(self.d); - ((d.extract(1) as u64) << 32) | d.extract(0) as u64 - } - /// Produce 4 blocks of output, advancing the state #[inline(always)] pub fn refill4(&mut self, drounds: u32, out: &mut [u32; BUFSZ]) { @@ -111,70 +105,75 @@ impl ChaCha { } } -#[allow(clippy::many_single_char_names)] +// This 
implementation is platform-independent. #[inline(always)] -fn refill_wide_impl( - m: Mach, state: &mut ChaCha, drounds: u32, out: &mut [u32; BUFSZ], -) { - let k = m.vec([0x6170_7865, 0x3320_646e, 0x7962_2d32, 0x6b20_6574]); - let mut pos = state.pos64(m); - let d0: Mach::u32x4 = m.unpack(state.d); +#[cfg(target_endian = "big")] +fn add_pos(_m: Mach, d0: Mach::u32x4, i: u64) -> Mach::u32x4 { + let pos0 = ((d0.extract(1) as u64) << 32) | d0.extract(0) as u64; + let pos = pos0.wrapping_add(i); + d0.insert((pos >> 32) as u32, 1).insert(pos as u32, 0) +} +#[inline(always)] +#[cfg(target_endian = "big")] +fn d0123(m: Mach, d: vec128_storage) -> Mach::u32x4x4 { + let d0: Mach::u32x4 = m.unpack(d); + let mut pos = ((d0.extract(1) as u64) << 32) | d0.extract(0) as u64; pos = pos.wrapping_add(1); let d1 = d0.insert((pos >> 32) as u32, 1).insert(pos as u32, 0); pos = pos.wrapping_add(1); let d2 = d0.insert((pos >> 32) as u32, 1).insert(pos as u32, 0); pos = pos.wrapping_add(1); let d3 = d0.insert((pos >> 32) as u32, 1).insert(pos as u32, 0); + Mach::u32x4x4::from_lanes([d0, d1, d2, d3]) +} + +// Pos is packed into the state vectors as a little-endian u64, +// so on LE platforms we can use native vector ops to increment it. 
+#[inline(always)] +#[cfg(target_endian = "little")] +fn add_pos(m: Mach, d: Mach::u32x4, i: u64) -> Mach::u32x4 { + let d0: Mach::u64x2 = m.unpack(d.into()); + let incr = m.vec([i, 0]); + m.unpack((d0 + incr).into()) +} +#[inline(always)] +#[cfg(target_endian = "little")] +fn d0123(m: Mach, d: vec128_storage) -> Mach::u32x4x4 { + let d0: Mach::u64x2 = m.unpack(d); + let incr = Mach::u64x2x4::from_lanes([m.vec([0, 0]), m.vec([1, 0]), m.vec([2, 0]), m.vec([3, 0])]); + m.unpack((Mach::u64x2x4::from_lanes([d0, d0, d0, d0]) + incr).into()) +} +#[allow(clippy::many_single_char_names)] +#[inline(always)] +fn refill_wide_impl( + m: Mach, state: &mut ChaCha, drounds: u32, out: &mut [u32; BUFSZ], +) { + let k = m.vec([0x6170_7865, 0x3320_646e, 0x7962_2d32, 0x6b20_6574]); let b = m.unpack(state.b); let c = m.unpack(state.c); let mut x = State { a: Mach::u32x4x4::from_lanes([k, k, k, k]), b: Mach::u32x4x4::from_lanes([b, b, b, b]), c: Mach::u32x4x4::from_lanes([c, c, c, c]), - d: m.unpack(Mach::u32x4x4::from_lanes([d0, d1, d2, d3]).into()), + d: d0123(m, state.d), }; for _ in 0..drounds { x = round(x); x = undiagonalize(round(diagonalize(x))); } - let mut pos = state.pos64(m); - let d0: Mach::u32x4 = m.unpack(state.d); - pos = pos.wrapping_add(1); - let d1 = d0.insert((pos >> 32) as u32, 1).insert(pos as u32, 0); - pos = pos.wrapping_add(1); - let d2 = d0.insert((pos >> 32) as u32, 1).insert(pos as u32, 0); - pos = pos.wrapping_add(1); - let d3 = d0.insert((pos >> 32) as u32, 1).insert(pos as u32, 0); - pos = pos.wrapping_add(1); - let d4 = d0.insert((pos >> 32) as u32, 1).insert(pos as u32, 0); - - let (a, b, c, d) = ( - x.a.to_lanes(), - x.b.to_lanes(), - x.c.to_lanes(), - x.d.to_lanes(), - ); + let kk = Mach::u32x4x4::from_lanes([k, k, k, k]); let sb = m.unpack(state.b); + let sb = Mach::u32x4x4::from_lanes([sb, sb, sb, sb]); let sc = m.unpack(state.c); - let sd = [m.unpack(state.d), d1, d2, d3]; - state.d = d4.into(); - out[0..4].copy_from_slice(&(a[0] + k).to_lanes()); - 
out[4..8].copy_from_slice(&(b[0] + sb).to_lanes()); - out[8..12].copy_from_slice(&(c[0] + sc).to_lanes()); - out[12..16].copy_from_slice(&(d[0] + sd[0]).to_lanes()); - out[16..20].copy_from_slice(&(a[1] + k).to_lanes()); - out[20..24].copy_from_slice(&(b[1] + sb).to_lanes()); - out[24..28].copy_from_slice(&(c[1] + sc).to_lanes()); - out[28..32].copy_from_slice(&(d[1] + sd[1]).to_lanes()); - out[32..36].copy_from_slice(&(a[2] + k).to_lanes()); - out[36..40].copy_from_slice(&(b[2] + sb).to_lanes()); - out[40..44].copy_from_slice(&(c[2] + sc).to_lanes()); - out[44..48].copy_from_slice(&(d[2] + sd[2]).to_lanes()); - out[48..52].copy_from_slice(&(a[3] + k).to_lanes()); - out[52..56].copy_from_slice(&(b[3] + sb).to_lanes()); - out[56..60].copy_from_slice(&(c[3] + sc).to_lanes()); - out[60..64].copy_from_slice(&(d[3] + sd[3]).to_lanes()); + let sc = Mach::u32x4x4::from_lanes([sc, sc, sc, sc]); + let sd = d0123(m, state.d); + let results = Mach::u32x4x4::transpose4(x.a + kk, x.b + sb, x.c + sc, x.d + sd); + out[0..16].copy_from_slice(&results.0.to_scalars()); + out[16..32].copy_from_slice(&results.1.to_scalars()); + out[32..48].copy_from_slice(&results.2.to_scalars()); + out[48..64].copy_from_slice(&results.3.to_scalars()); + state.d = add_pos(m, sd.to_lanes()[0], 4).into(); } dispatch!(m, Mach, {