Skip to content

Commit 47afd9c

Browse files
authored
Optimize sha256 for aarch64 (#27)
1 parent 24ac47c commit 47afd9c

File tree

1 file changed

+119
-128
lines changed

1 file changed

+119
-128
lines changed

sha2/src/sha256_aarch64.S

Lines changed: 119 additions & 128 deletions
Original file line numberDiff line numberDiff line change
@@ -30,184 +30,175 @@ sha256_compress:
3030
* 4 x0 state argument
3131
* 4 x1 block argument
3232
* 4 x2 pointer to k
33-
* 16 q0 W0
34-
* 16 q1 W1
35-
* 16 q2 W2
36-
* 16 q3 W3
33+
* 16 q0 state0
34+
* 16 q1 state1
35+
* 16 q2 abef
36+
* 16 q3 cdgh
3737
* 16 q4 k0
3838
* 16 q5 k1
39-
* 16 q6 state0
40-
* 16 q7 state1
41-
* 16 q16 abef
42-
* 16 q17 cdgh
43-
* 16 q18 cdgh0
39+
* 16 q8 W0
40+
* 16 q9 W1
41+
* 16 q10 W2
42+
* 16 q11 W3
4443
*/
4544

45+
// Save d8-d11 (the low 64 bits of v8-v11), which are callee-saved, before using v8-v11 for W0-W3
46+
stp d8, d9, [sp,#-32]!
47+
stp d10, d11, [sp,#16]
48+
4649
// Load state in registers
47-
ldr q16, [x0]
48-
ldr q17, [x0, 16]
49-
mov v18.16b, v17.16b
50+
ldp q0, q1, [x0]
51+
mov v2.16b, v0.16b
52+
mov v3.16b, v1.16b
5053

5154
// Load block in registers
52-
ldr q0, [x1]
53-
ldr q1, [x1, 16]
54-
ldr q2, [x1, 32]
55-
ldr q3, [x1, 48]
55+
ld1 {v8.4s-v11.4s}, [x1]
5656

5757
// TODO: only byte-swap the block words (rev32) on little-endian targets
58-
rev32 v0.16b, v0.16b
59-
rev32 v1.16b, v1.16b
60-
rev32 v2.16b, v2.16b
61-
rev32 v3.16b, v3.16b
58+
rev32 v8.16b, v8.16b
59+
rev32 v9.16b, v9.16b
60+
rev32 v10.16b, v10.16b
61+
rev32 v11.16b, v11.16b
6262

6363
// Compute the pointer to k
6464
adrp x2, .K
6565
add x2, x2, :lo12:.K
6666

6767
// load k
68-
ldr q4, [x2]
69-
add v4.4s, v4.4s, v0.4s
68+
ld1 {v16.4s-v19.4s}, [x2], #64
69+
ld1 {v20.4s-v23.4s}, [x2], #64
70+
ld1 {v24.4s-v27.4s}, [x2], #64
71+
ld1 {v28.4s-v31.4s}, [x2]
72+
add v6.4s, v8.4s, v16.4s
7073

7174
// Rounds 0-3
72-
sha256su0 v0.4s, v1.4s
73-
ldr q5, [x2, 16]
74-
add v5.4s, v5.4s, v1.4s
75-
mov v6.16b, v16.16b
76-
sha256h q6, q17, v4.4s
77-
sha256h2 q17, q16, v4.4s
78-
sha256su1 v0.4s, v2.4s, v3.4s
75+
sha256su0 v8.4s, v9.4s
76+
mov v4.16b, v2.16b
77+
add v7.4s, v9.4s, v17.4s
78+
sha256h q2, q3, v6.4s
79+
sha256h2 q3, q4, v6.4s
80+
sha256su1 v8.4s, v10.4s, v11.4s
7981

8082
// Rounds 4-7
81-
sha256su0 v1.4s, v2.4s
82-
ldr q4, [x2, 32]
83-
add v4.4s, v4.4s, v2.4s
84-
mov v7.16b, v6.16b
85-
sha256h q7, q17, v5.4s
86-
sha256h2 q17, q6, v5.4s
87-
sha256su1 v1.4s, v3.4s, v0.4s
83+
sha256su0 v9.4s, v10.4s
84+
mov v4.16b, v2.16b
85+
add v6.4s, v10.4s, v18.4s
86+
sha256h q2, q3, v7.4s
87+
sha256h2 q3, q4, v7.4s
88+
sha256su1 v9.4s, v11.4s, v8.4s
8889

8990
// Rounds 8-11
90-
sha256su0 v2.4s, v3.4s
91-
ldr q5, [x2, 48]
92-
add v5.4s, v5.4s, v3.4s
93-
mov v6.16b, v7.16b
94-
sha256h q6, q17, v4.4s
95-
sha256h2 q17, q7, v4.4s
96-
sha256su1 v2.4s, v0.4s, v1.4s
91+
sha256su0 v10.4s, v11.4s
92+
mov v4.16b, v2.16b
93+
add v7.4s, v11.4s, v19.4s
94+
sha256h q2, q3, v6.4s
95+
sha256h2 q3, q4, v6.4s
96+
sha256su1 v10.4s, v8.4s, v9.4s
9797

9898
// Rounds 12-15
99-
sha256su0 v3.4s, v0.4s
100-
ldr q4, [x2, 64]
101-
add v4.4s, v4.4s, v0.4s
102-
mov v7.16b, v6.16b
103-
sha256h q7, q17, v5.4s
104-
sha256h2 q17, q6, v5.4s
105-
sha256su1 v3.4s, v1.4s, v2.4s
99+
sha256su0 v11.4s, v8.4s
100+
mov v4.16b, v2.16b
101+
add v6.4s, v8.4s, v20.4s
102+
sha256h q2, q3, v7.4s
103+
sha256h2 q3, q4, v7.4s
104+
sha256su1 v11.4s, v9.4s, v10.4s
106105

107106
// Rounds 16-19
108-
sha256su0 v0.4s, v1.4s
109-
ldr q5, [x2, 80]
110-
add v5.4s, v5.4s, v1.4s
111-
mov v6.16b, v7.16b
112-
sha256h q6, q17, v4.4s
113-
sha256h2 q17, q7, v4.4s
114-
sha256su1 v0.4s, v2.4s, v3.4s
107+
sha256su0 v8.4s, v9.4s
108+
mov v4.16b, v2.16b
109+
add v7.4s, v9.4s, v21.4s
110+
sha256h q2, q3, v6.4s
111+
sha256h2 q3, q4, v6.4s
112+
sha256su1 v8.4s, v10.4s, v11.4s
115113

116114
// Rounds 20-23
117-
sha256su0 v1.4s, v2.4s
118-
ldr q4, [x2, 96]
119-
add v4.4s, v4.4s, v2.4s
120-
mov v7.16b, v6.16b
121-
sha256h q7, q17, v5.4s
122-
sha256h2 q17, q6, v5.4s
123-
sha256su1 v1.4s, v3.4s, v0.4s
115+
sha256su0 v9.4s, v10.4s
116+
mov v4.16b, v2.16b
117+
add v6.4s, v10.4s, v22.4s
118+
sha256h q2, q3, v7.4s
119+
sha256h2 q3, q4, v7.4s
120+
sha256su1 v9.4s, v11.4s, v8.4s
124121

125122
// Rounds 24-27
126-
sha256su0 v2.4s, v3.4s
127-
ldr q5, [x2, 112]
128-
add v5.4s, v5.4s, v3.4s
129-
mov v6.16b, v7.16b
130-
sha256h q6, q17, v4.4s
131-
sha256h2 q17, q7, v4.4s
132-
sha256su1 v2.4s, v0.4s, v1.4s
123+
sha256su0 v10.4s, v11.4s
124+
mov v4.16b, v2.16b
125+
add v7.4s, v11.4s, v23.4s
126+
sha256h q2, q3, v6.4s
127+
sha256h2 q3, q4, v6.4s
128+
sha256su1 v10.4s, v8.4s, v9.4s
133129

134130
// Rounds 28-31
135-
sha256su0 v3.4s, v0.4s
136-
ldr q4, [x2, 128]
137-
add v4.4s, v4.4s, v0.4s
138-
mov v7.16b, v6.16b
139-
sha256h q7, q17, v5.4s
140-
sha256h2 q17, q6, v5.4s
141-
sha256su1 v3.4s, v1.4s, v2.4s
131+
sha256su0 v11.4s, v8.4s
132+
mov v4.16b, v2.16b
133+
add v6.4s, v8.4s, v24.4s
134+
sha256h q2, q3, v7.4s
135+
sha256h2 q3, q4, v7.4s
136+
sha256su1 v11.4s, v9.4s, v10.4s
142137

143138
// Rounds 32-35
144-
sha256su0 v0.4s, v1.4s
145-
ldr q5, [x2, 144]
146-
add v5.4s, v5.4s, v1.4s
147-
mov v6.16b, v7.16b
148-
sha256h q6, q17, v4.4s
149-
sha256h2 q17, q7, v4.4s
150-
sha256su1 v0.4s, v2.4s, v3.4s
139+
sha256su0 v8.4s, v9.4s
140+
mov v4.16b, v2.16b
141+
add v7.4s, v9.4s, v25.4s
142+
sha256h q2, q3, v6.4s
143+
sha256h2 q3, q4, v6.4s
144+
sha256su1 v8.4s, v10.4s, v11.4s
151145

152146
// Rounds 36-39
153-
sha256su0 v1.4s, v2.4s
154-
ldr q4, [x2, 160]
155-
add v4.4s, v4.4s, v2.4s
156-
mov v7.16b, v6.16b
157-
sha256h q7, q17, v5.4s
158-
sha256h2 q17, q6, v5.4s
159-
sha256su1 v1.4s, v3.4s, v0.4s
147+
sha256su0 v9.4s, v10.4s
148+
mov v4.16b, v2.16b
149+
add v6.4s, v10.4s, v26.4s
150+
sha256h q2, q3, v7.4s
151+
sha256h2 q3, q4, v7.4s
152+
sha256su1 v9.4s, v11.4s, v8.4s
160153

161154
// Rounds 40-43
162-
sha256su0 v2.4s, v3.4s
163-
ldr q5, [x2, 176]
164-
add v5.4s, v5.4s, v3.4s
165-
mov v6.16b, v7.16b
166-
sha256h q6, q17, v4.4s
167-
sha256h2 q17, q7, v4.4s
168-
sha256su1 v2.4s, v0.4s, v1.4s
155+
sha256su0 v10.4s, v11.4s
156+
mov v4.16b, v2.16b
157+
add v7.4s, v11.4s, v27.4s
158+
sha256h q2, q3, v6.4s
159+
sha256h2 q3, q4, v6.4s
160+
sha256su1 v10.4s, v8.4s, v9.4s
169161

170162
// Rounds 44-47
171-
sha256su0 v3.4s, v0.4s
172-
ldr q4, [x2, 192]
173-
add v4.4s, v4.4s, v0.4s
174-
mov v7.16b, v6.16b
175-
sha256h q7, q17, v5.4s
176-
sha256h2 q17, q6, v5.4s
177-
sha256su1 v3.4s, v1.4s, v2.4s
163+
sha256su0 v11.4s, v8.4s
164+
mov v4.16b, v2.16b
165+
add v6.4s, v8.4s, v28.4s
166+
sha256h q2, q3, v7.4s
167+
sha256h2 q3, q4, v7.4s
168+
sha256su1 v11.4s, v9.4s, v10.4s
178169

179170
// Rounds 48-51
180-
ldr q5, [x2, 208]
181-
add v5.4s, v5.4s, v1.4s
182-
mov v6.16b, v7.16b
183-
sha256h q6, q17, v4.4s
184-
sha256h2 q17, q7, v4.4s
171+
mov v4.16b, v2.16b
172+
add v7.4s, v9.4s, v29.4s
173+
sha256h q2, q3, v6.4s
174+
sha256h2 q3, q4, v6.4s
185175

186176
// Rounds 52-55
187-
ldr q4, [x2, 224]
188-
add v4.4s, v4.4s, v2.4s
189-
mov v7.16b, v6.16b
190-
sha256h q7, q17, v5.4s
191-
sha256h2 q17, q6, v5.4s
177+
mov v4.16b, v2.16b
178+
add v6.4s, v10.4s, v30.4s
179+
sha256h q2, q3, v7.4s
180+
sha256h2 q3, q4, v7.4s
192181

193182
// Rounds 56-59
194-
ldr q5, [x2, 240]
195-
add v5.4s, v5.4s, v3.4s
196-
mov v6.16b, v7.16b
197-
sha256h q6, q17, v4.4s
198-
sha256h2 q17, q7, v4.4s
183+
mov v4.16b, v2.16b
184+
add v7.4s, v11.4s, v31.4s
185+
sha256h q2, q3, v6.4s
186+
sha256h2 q3, q4, v6.4s
199187

200188
// Rounds 60-63
201-
mov v7.16b, v6.16b
202-
sha256h q7, q17, v5.4s
203-
sha256h2 q17, q6, v5.4s
189+
mov v4.16b, v2.16b
190+
sha256h q2, q3, v7.4s
191+
sha256h2 q3, q4, v7.4s
204192

205193
// Update state
206-
add v16.4s, v16.4s, v7.4s
207-
str q16, [x0]
208-
add v18.4s, v18.4s, v17.4s
209-
str q18, [x0, 16]
194+
add v0.4s, v0.4s, v2.4s
195+
add v1.4s, v1.4s, v3.4s
196+
stp q0, q1, [x0]
210197

198+
// Restore d8-d11 saved on entry and release the 32-byte stack slot
199+
ldp d10, d11, [sp,#16]
200+
ldp d8, d9, [sp],#32
201+
211202
ret
212203
.align 4
213204
.K:

0 commit comments

Comments
 (0)