Skip to content

Commit f92197f

Browse files
committed
rand_chacha: optimize outputting
Improve AVX2 vectorizability of copying results to the output buffer. Measured performance gain: 15% (ChaCha20) to 37% (ChaCha8).
1 parent 4b4abf2 commit f92197f

File tree

1 file changed

+11
-25
lines changed

1 file changed

+11
-25
lines changed

rand_chacha/src/guts.rs

+11-25
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
use ppv_lite86::{dispatch, dispatch_light128};
1313

1414
pub use ppv_lite86::Machine;
15-
use ppv_lite86::{vec128_storage, ArithOps, BitOps32, LaneWords4, MultiLane, StoreBytes, Vec4};
15+
use ppv_lite86::{vec128_storage, ArithOps, BitOps32, LaneWords4, MultiLane, StoreBytes, Vec4, Vec4Ext, Vector};
1616

1717
pub(crate) const BLOCK: usize = 16;
1818
pub(crate) const BLOCK64: u64 = BLOCK as u64;
@@ -162,32 +162,18 @@ fn refill_wide_impl<Mach: Machine>(
162162
x = round(x);
163163
x = undiagonalize(round(diagonalize(x)));
164164
}
165-
let (a, b, c, d) = (
166-
x.a.to_lanes(),
167-
x.b.to_lanes(),
168-
x.c.to_lanes(),
169-
x.d.to_lanes(),
170-
);
165+
let kk = Mach::u32x4x4::from_lanes([k, k, k, k]);
171166
let sb = m.unpack(state.b);
167+
let sb = Mach::u32x4x4::from_lanes([sb, sb, sb, sb]);
172168
let sc = m.unpack(state.c);
173-
let sd = d0123(m, state.d).to_lanes();
174-
state.d = add_pos(m, sd[0], 4).into();
175-
out[0..4].copy_from_slice(&(a[0] + k).to_lanes());
176-
out[4..8].copy_from_slice(&(b[0] + sb).to_lanes());
177-
out[8..12].copy_from_slice(&(c[0] + sc).to_lanes());
178-
out[12..16].copy_from_slice(&(d[0] + sd[0]).to_lanes());
179-
out[16..20].copy_from_slice(&(a[1] + k).to_lanes());
180-
out[20..24].copy_from_slice(&(b[1] + sb).to_lanes());
181-
out[24..28].copy_from_slice(&(c[1] + sc).to_lanes());
182-
out[28..32].copy_from_slice(&(d[1] + sd[1]).to_lanes());
183-
out[32..36].copy_from_slice(&(a[2] + k).to_lanes());
184-
out[36..40].copy_from_slice(&(b[2] + sb).to_lanes());
185-
out[40..44].copy_from_slice(&(c[2] + sc).to_lanes());
186-
out[44..48].copy_from_slice(&(d[2] + sd[2]).to_lanes());
187-
out[48..52].copy_from_slice(&(a[3] + k).to_lanes());
188-
out[52..56].copy_from_slice(&(b[3] + sb).to_lanes());
189-
out[56..60].copy_from_slice(&(c[3] + sc).to_lanes());
190-
out[60..64].copy_from_slice(&(d[3] + sd[3]).to_lanes());
169+
let sc = Mach::u32x4x4::from_lanes([sc, sc, sc, sc]);
170+
let sd = d0123(m, state.d);
171+
let results = Mach::u32x4x4::transpose4(x.a + kk, x.b + sb, x.c + sc, x.d + sd);
172+
out[0..16].copy_from_slice(&results.0.to_scalars());
173+
out[16..32].copy_from_slice(&results.1.to_scalars());
174+
out[32..48].copy_from_slice(&results.2.to_scalars());
175+
out[48..64].copy_from_slice(&results.3.to_scalars());
176+
state.d = add_pos(m, sd.to_lanes()[0], 4).into();
191177
}
192178

193179
dispatch!(m, Mach, {

0 commit comments

Comments
 (0)