|
12 | 12 | use ppv_lite86::{dispatch, dispatch_light128};
|
13 | 13 |
|
14 | 14 | pub use ppv_lite86::Machine;
|
15 |
| -use ppv_lite86::{vec128_storage, ArithOps, BitOps32, LaneWords4, MultiLane, StoreBytes, Vec4}; |
| 15 | +use ppv_lite86::{vec128_storage, ArithOps, BitOps32, LaneWords4, MultiLane, StoreBytes, Vec4, Vec4Ext, Vector}; |
16 | 16 |
|
17 | 17 | pub(crate) const BLOCK: usize = 16;
|
18 | 18 | pub(crate) const BLOCK64: u64 = BLOCK as u64;
|
@@ -162,32 +162,18 @@ fn refill_wide_impl<Mach: Machine>(
|
162 | 162 | x = round(x);
|
163 | 163 | x = undiagonalize(round(diagonalize(x)));
|
164 | 164 | }
|
165 |
| - let (a, b, c, d) = ( |
166 |
| - x.a.to_lanes(), |
167 |
| - x.b.to_lanes(), |
168 |
| - x.c.to_lanes(), |
169 |
| - x.d.to_lanes(), |
170 |
| - ); |
| 165 | + let kk = Mach::u32x4x4::from_lanes([k, k, k, k]); |
171 | 166 | let sb = m.unpack(state.b);
|
| 167 | + let sb = Mach::u32x4x4::from_lanes([sb, sb, sb, sb]); |
172 | 168 | let sc = m.unpack(state.c);
|
173 |
| - let sd = d0123(m, state.d).to_lanes(); |
174 |
| - state.d = add_pos(m, sd[0], 4).into(); |
175 |
| - out[0..4].copy_from_slice(&(a[0] + k).to_lanes()); |
176 |
| - out[4..8].copy_from_slice(&(b[0] + sb).to_lanes()); |
177 |
| - out[8..12].copy_from_slice(&(c[0] + sc).to_lanes()); |
178 |
| - out[12..16].copy_from_slice(&(d[0] + sd[0]).to_lanes()); |
179 |
| - out[16..20].copy_from_slice(&(a[1] + k).to_lanes()); |
180 |
| - out[20..24].copy_from_slice(&(b[1] + sb).to_lanes()); |
181 |
| - out[24..28].copy_from_slice(&(c[1] + sc).to_lanes()); |
182 |
| - out[28..32].copy_from_slice(&(d[1] + sd[1]).to_lanes()); |
183 |
| - out[32..36].copy_from_slice(&(a[2] + k).to_lanes()); |
184 |
| - out[36..40].copy_from_slice(&(b[2] + sb).to_lanes()); |
185 |
| - out[40..44].copy_from_slice(&(c[2] + sc).to_lanes()); |
186 |
| - out[44..48].copy_from_slice(&(d[2] + sd[2]).to_lanes()); |
187 |
| - out[48..52].copy_from_slice(&(a[3] + k).to_lanes()); |
188 |
| - out[52..56].copy_from_slice(&(b[3] + sb).to_lanes()); |
189 |
| - out[56..60].copy_from_slice(&(c[3] + sc).to_lanes()); |
190 |
| - out[60..64].copy_from_slice(&(d[3] + sd[3]).to_lanes()); |
| 169 | + let sc = Mach::u32x4x4::from_lanes([sc, sc, sc, sc]); |
| 170 | + let sd = d0123(m, state.d); |
| 171 | + let results = Mach::u32x4x4::transpose4(x.a + kk, x.b + sb, x.c + sc, x.d + sd); |
| 172 | + out[0..16].copy_from_slice(&results.0.to_scalars()); |
| 173 | + out[16..32].copy_from_slice(&results.1.to_scalars()); |
| 174 | + out[32..48].copy_from_slice(&results.2.to_scalars()); |
| 175 | + out[48..64].copy_from_slice(&results.3.to_scalars()); |
| 176 | + state.d = add_pos(m, sd.to_lanes()[0], 4).into(); |
191 | 177 | }
|
192 | 178 |
|
193 | 179 | dispatch!(m, Mach, {
|
|
0 commit comments