|
12 | 12 | use ppv_lite86::{dispatch, dispatch_light128};
|
13 | 13 |
|
14 | 14 | pub use ppv_lite86::Machine;
|
15 |
| -use ppv_lite86::{vec128_storage, ArithOps, BitOps32, LaneWords4, MultiLane, StoreBytes, Vec4}; |
| 15 | +use ppv_lite86::{vec128_storage, ArithOps, BitOps32, LaneWords4, MultiLane, StoreBytes, Vec4, Vec4Ext, Vector}; |
16 | 16 |
|
17 | 17 | pub(crate) const BLOCK: usize = 16;
|
18 | 18 | pub(crate) const BLOCK64: u64 = BLOCK as u64;
|
@@ -73,12 +73,6 @@ impl ChaCha {
|
73 | 73 | init_chacha(key, nonce)
|
74 | 74 | }
|
75 | 75 |
|
76 |
| - #[inline(always)] |
77 |
| - fn pos64<M: Machine>(&self, m: M) -> u64 { |
78 |
| - let d: M::u32x4 = m.unpack(self.d); |
79 |
| - ((d.extract(1) as u64) << 32) | d.extract(0) as u64 |
80 |
| - } |
81 |
| - |
82 | 76 | /// Produce 4 blocks of output, advancing the state
|
83 | 77 | #[inline(always)]
|
84 | 78 | pub fn refill4(&mut self, drounds: u32, out: &mut [u32; BUFSZ]) {
|
@@ -111,70 +105,75 @@ impl ChaCha {
|
111 | 105 | }
|
112 | 106 | }
|
113 | 107 |
|
114 |
| -#[allow(clippy::many_single_char_names)] |
| 108 | +// This implementation is platform-independent. |
115 | 109 | #[inline(always)]
|
116 |
| -fn refill_wide_impl<Mach: Machine>( |
117 |
| - m: Mach, state: &mut ChaCha, drounds: u32, out: &mut [u32; BUFSZ], |
118 |
| -) { |
119 |
| - let k = m.vec([0x6170_7865, 0x3320_646e, 0x7962_2d32, 0x6b20_6574]); |
120 |
| - let mut pos = state.pos64(m); |
121 |
| - let d0: Mach::u32x4 = m.unpack(state.d); |
| 110 | +#[cfg(target_endian = "big")] |
| 111 | +fn add_pos<Mach: Machine>(_m: Mach, d0: Mach::u32x4, i: u64) -> Mach::u32x4 { |
| 112 | + let pos0 = ((d0.extract(1) as u64) << 32) | d0.extract(0) as u64; |
| 113 | + let pos = pos0.wrapping_add(i); |
| 114 | + d0.insert((pos >> 32) as u32, 1).insert(pos as u32, 0) |
| 115 | +} |
| 116 | +#[inline(always)] |
| 117 | +#[cfg(target_endian = "big")] |
| 118 | +fn d0123<Mach: Machine>(m: Mach, d: vec128_storage) -> Mach::u32x4x4 { |
| 119 | + let d0: Mach::u32x4 = m.unpack(d); |
| 120 | + let mut pos = ((d0.extract(1) as u64) << 32) | d0.extract(0) as u64; |
122 | 121 | pos = pos.wrapping_add(1);
|
123 | 122 | let d1 = d0.insert((pos >> 32) as u32, 1).insert(pos as u32, 0);
|
124 | 123 | pos = pos.wrapping_add(1);
|
125 | 124 | let d2 = d0.insert((pos >> 32) as u32, 1).insert(pos as u32, 0);
|
126 | 125 | pos = pos.wrapping_add(1);
|
127 | 126 | let d3 = d0.insert((pos >> 32) as u32, 1).insert(pos as u32, 0);
|
| 127 | + Mach::u32x4x4::from_lanes([d0, d1, d2, d3]) |
| 128 | +} |
| 129 | + |
| 130 | +// Pos is packed into the state vectors as a little-endian u64, |
| 131 | +// so on LE platforms we can use native vector ops to increment it. |
| 132 | +#[inline(always)] |
| 133 | +#[cfg(target_endian = "little")] |
| 134 | +fn add_pos<Mach: Machine>(m: Mach, d: Mach::u32x4, i: u64) -> Mach::u32x4 { |
| 135 | + let d0: Mach::u64x2 = m.unpack(d.into()); |
| 136 | + let incr = m.vec([i, 0]); |
| 137 | + m.unpack((d0 + incr).into()) |
| 138 | +} |
| 139 | +#[inline(always)] |
| 140 | +#[cfg(target_endian = "little")] |
| 141 | +fn d0123<Mach: Machine>(m: Mach, d: vec128_storage) -> Mach::u32x4x4 { |
| 142 | + let d0: Mach::u64x2 = m.unpack(d); |
| 143 | + let incr = Mach::u64x2x4::from_lanes([m.vec([0, 0]), m.vec([1, 0]), m.vec([2, 0]), m.vec([3, 0])]); |
| 144 | + m.unpack((Mach::u64x2x4::from_lanes([d0, d0, d0, d0]) + incr).into()) |
| 145 | +} |
128 | 146 |
|
| 147 | +#[allow(clippy::many_single_char_names)] |
| 148 | +#[inline(always)] |
| 149 | +fn refill_wide_impl<Mach: Machine>( |
| 150 | + m: Mach, state: &mut ChaCha, drounds: u32, out: &mut [u32; BUFSZ], |
| 151 | +) { |
| 152 | + let k = m.vec([0x6170_7865, 0x3320_646e, 0x7962_2d32, 0x6b20_6574]); |
129 | 153 | let b = m.unpack(state.b);
|
130 | 154 | let c = m.unpack(state.c);
|
131 | 155 | let mut x = State {
|
132 | 156 | a: Mach::u32x4x4::from_lanes([k, k, k, k]),
|
133 | 157 | b: Mach::u32x4x4::from_lanes([b, b, b, b]),
|
134 | 158 | c: Mach::u32x4x4::from_lanes([c, c, c, c]),
|
135 |
| - d: m.unpack(Mach::u32x4x4::from_lanes([d0, d1, d2, d3]).into()), |
| 159 | + d: d0123(m, state.d), |
136 | 160 | };
|
137 | 161 | for _ in 0..drounds {
|
138 | 162 | x = round(x);
|
139 | 163 | x = undiagonalize(round(diagonalize(x)));
|
140 | 164 | }
|
141 |
| - let mut pos = state.pos64(m); |
142 |
| - let d0: Mach::u32x4 = m.unpack(state.d); |
143 |
| - pos = pos.wrapping_add(1); |
144 |
| - let d1 = d0.insert((pos >> 32) as u32, 1).insert(pos as u32, 0); |
145 |
| - pos = pos.wrapping_add(1); |
146 |
| - let d2 = d0.insert((pos >> 32) as u32, 1).insert(pos as u32, 0); |
147 |
| - pos = pos.wrapping_add(1); |
148 |
| - let d3 = d0.insert((pos >> 32) as u32, 1).insert(pos as u32, 0); |
149 |
| - pos = pos.wrapping_add(1); |
150 |
| - let d4 = d0.insert((pos >> 32) as u32, 1).insert(pos as u32, 0); |
151 |
| - |
152 |
| - let (a, b, c, d) = ( |
153 |
| - x.a.to_lanes(), |
154 |
| - x.b.to_lanes(), |
155 |
| - x.c.to_lanes(), |
156 |
| - x.d.to_lanes(), |
157 |
| - ); |
| 165 | + let kk = Mach::u32x4x4::from_lanes([k, k, k, k]); |
158 | 166 | let sb = m.unpack(state.b);
|
| 167 | + let sb = Mach::u32x4x4::from_lanes([sb, sb, sb, sb]); |
159 | 168 | let sc = m.unpack(state.c);
|
160 |
| - let sd = [m.unpack(state.d), d1, d2, d3]; |
161 |
| - state.d = d4.into(); |
162 |
| - out[0..4].copy_from_slice(&(a[0] + k).to_lanes()); |
163 |
| - out[4..8].copy_from_slice(&(b[0] + sb).to_lanes()); |
164 |
| - out[8..12].copy_from_slice(&(c[0] + sc).to_lanes()); |
165 |
| - out[12..16].copy_from_slice(&(d[0] + sd[0]).to_lanes()); |
166 |
| - out[16..20].copy_from_slice(&(a[1] + k).to_lanes()); |
167 |
| - out[20..24].copy_from_slice(&(b[1] + sb).to_lanes()); |
168 |
| - out[24..28].copy_from_slice(&(c[1] + sc).to_lanes()); |
169 |
| - out[28..32].copy_from_slice(&(d[1] + sd[1]).to_lanes()); |
170 |
| - out[32..36].copy_from_slice(&(a[2] + k).to_lanes()); |
171 |
| - out[36..40].copy_from_slice(&(b[2] + sb).to_lanes()); |
172 |
| - out[40..44].copy_from_slice(&(c[2] + sc).to_lanes()); |
173 |
| - out[44..48].copy_from_slice(&(d[2] + sd[2]).to_lanes()); |
174 |
| - out[48..52].copy_from_slice(&(a[3] + k).to_lanes()); |
175 |
| - out[52..56].copy_from_slice(&(b[3] + sb).to_lanes()); |
176 |
| - out[56..60].copy_from_slice(&(c[3] + sc).to_lanes()); |
177 |
| - out[60..64].copy_from_slice(&(d[3] + sd[3]).to_lanes()); |
| 169 | + let sc = Mach::u32x4x4::from_lanes([sc, sc, sc, sc]); |
| 170 | + let sd = d0123(m, state.d); |
| 171 | + let results = Mach::u32x4x4::transpose4(x.a + kk, x.b + sb, x.c + sc, x.d + sd); |
| 172 | + out[0..16].copy_from_slice(&results.0.to_scalars()); |
| 173 | + out[16..32].copy_from_slice(&results.1.to_scalars()); |
| 174 | + out[32..48].copy_from_slice(&results.2.to_scalars()); |
| 175 | + out[48..64].copy_from_slice(&results.3.to_scalars()); |
| 176 | + state.d = add_pos(m, sd.to_lanes()[0], 4).into(); |
178 | 177 | }
|
179 | 178 |
|
180 | 179 | dispatch!(m, Mach, {
|
|
0 commit comments