@@ -30,184 +30,175 @@ sha256_compress:
30
30
* 4 x0 state argument
31
31
* 4 x1 block argument
32
32
* 4 x2 pointer to k
33
- * 16 q0 W0
34
- * 16 q1 W1
35
- * 16 q2 W2
36
- * 16 q3 W3
33
+ * 16 q0 state0
34
+ * 16 q1 state1
35
+ * 16 q2 abef
36
+ * 16 q3 cdgh
37
37
* 16 q4 k0
38
38
* 16 q5 k1
39
- * 16 q6 state0
40
- * 16 q7 state1
41
- * 16 q16 abef
42
- * 16 q17 cdgh
43
- * 16 q18 cdgh0
39
+ * 16 q8 W0
40
+ * 16 q9 W1
41
+ * 16 q10 W2
42
+ * 16 q11 W3
44
43
*/
45
44
45
+ // save the lower half of q8 - q11
46
+ stp d8 , d9 , [ sp , # - 32 ] !
47
+ stp d10 , d11 , [ sp , # 16 ]
48
+
46
49
// Load state in registers
47
- ldr q16 , [ x0 ]
48
- ldr q17 , [ x0 , 16 ]
49
- mov v18 .16b , v17 .16b
50
+ ldp q0 , q1 , [ x0 ]
51
+ mov v2.16b , v0.16b
52
+ mov v3 .16b , v1 .16b
50
53
51
54
// Load block in registers
52
- ldr q0 , [ x1 ]
53
- ldr q1 , [ x1 , 16 ]
54
- ldr q2 , [ x1 , 32 ]
55
- ldr q3 , [ x1 , 48 ]
55
+ ld1 {v8.4s - v11.4s} , [ x1 ]
56
56
57
57
// TODO: only do that on little endian
58
- rev32 v0 .16b , v0 .16b
59
- rev32 v1 .16b , v1 .16b
60
- rev32 v2 .16b , v2 .16b
61
- rev32 v3 .16b , v3 .16b
58
+ rev32 v8 .16b , v8 .16b
59
+ rev32 v9 .16b , v9 .16b
60
+ rev32 v10 .16b , v10 .16b
61
+ rev32 v11 .16b , v11 .16b
62
62
63
63
// Compute the pointer to k
64
64
adrp x2 , .K
65
65
add x2 , x2 , :lo12:.K
66
66
67
67
// load k
68
- ldr q4 , [ x2 ]
69
- add v4.4s , v4.4s , v0.4s
68
+ ld1 {v16.4s - v19.4s} , [ x2 ], # 64
69
+ ld1 {v20.4s - v23.4s} , [ x2 ], # 64
70
+ ld1 {v24.4s - v27.4s} , [ x2 ], # 64
71
+ ld1 {v28.4s - v31.4s} , [ x2 ]
72
+ add v6.4s , v8.4s , v16.4s
70
73
71
74
// Rounds 0 - 3
72
- sha256su0 v0.4s , v1.4s
73
- ldr q5 , [ x2 , 16 ]
74
- add v5.4s , v5.4s , v1.4s
75
- mov v6.16b , v16.16b
76
- sha256h q6 , q17 , v4.4s
77
- sha256h2 q17 , q16 , v4.4s
78
- sha256su1 v0.4s , v2.4s , v3.4s
75
+ sha256su0 v8.4s , v9.4s
76
+ mov v4.16b , v2.16b
77
+ add v7.4s , v9.4s , v17.4s
78
+ sha256h q2 , q3 , v6.4s
79
+ sha256h2 q3 , q4 , v6.4s
80
+ sha256su1 v8.4s , v10.4s , v11.4s
79
81
80
82
// Rounds 4 - 7
81
- sha256su0 v1.4s , v2.4s
82
- ldr q4 , [ x2 , 32 ]
83
- add v4.4s , v4.4s , v2.4s
84
- mov v7.16b , v6.16b
85
- sha256h q7 , q17 , v5.4s
86
- sha256h2 q17 , q6 , v5.4s
87
- sha256su1 v1.4s , v3.4s , v0.4s
83
+ sha256su0 v9.4s , v10.4s
84
+ mov v4.16b , v2.16b
85
+ add v6.4s , v10.4s , v18.4s
86
+ sha256h q2 , q3 , v7.4s
87
+ sha256h2 q3 , q4 , v7.4s
88
+ sha256su1 v9.4s , v11.4s , v8.4s
88
89
89
90
// Rounds 8 - 11
90
- sha256su0 v2.4s , v3.4s
91
- ldr q5 , [ x2 , 48 ]
92
- add v5.4s , v5.4s , v3.4s
93
- mov v6.16b , v7.16b
94
- sha256h q6 , q17 , v4.4s
95
- sha256h2 q17 , q7 , v4.4s
96
- sha256su1 v2.4s , v0.4s , v1.4s
91
+ sha256su0 v10.4s , v11.4s
92
+ mov v4.16b , v2.16b
93
+ add v7.4s , v11.4s , v19.4s
94
+ sha256h q2 , q3 , v6.4s
95
+ sha256h2 q3 , q4 , v6.4s
96
+ sha256su1 v10.4s , v8.4s , v9.4s
97
97
98
98
// Rounds 12 - 15
99
- sha256su0 v3.4s , v0.4s
100
- ldr q4 , [ x2 , 64 ]
101
- add v4.4s , v4.4s , v0.4s
102
- mov v7.16b , v6.16b
103
- sha256h q7 , q17 , v5.4s
104
- sha256h2 q17 , q6 , v5.4s
105
- sha256su1 v3.4s , v1.4s , v2.4s
99
+ sha256su0 v11.4s , v8.4s
100
+ mov v4.16b , v2.16b
101
+ add v6.4s , v8.4s , v20.4s
102
+ sha256h q2 , q3 , v7.4s
103
+ sha256h2 q3 , q4 , v7.4s
104
+ sha256su1 v11.4s , v9.4s , v10.4s
106
105
107
106
// Rounds 16 - 19
108
- sha256su0 v0.4s , v1.4s
109
- ldr q5 , [ x2 , 80 ]
110
- add v5.4s , v5.4s , v1.4s
111
- mov v6.16b , v7.16b
112
- sha256h q6 , q17 , v4.4s
113
- sha256h2 q17 , q7 , v4.4s
114
- sha256su1 v0.4s , v2.4s , v3.4s
107
+ sha256su0 v8.4s , v9.4s
108
+ mov v4.16b , v2.16b
109
+ add v7.4s , v9.4s , v21.4s
110
+ sha256h q2 , q3 , v6.4s
111
+ sha256h2 q3 , q4 , v6.4s
112
+ sha256su1 v8.4s , v10.4s , v11.4s
115
113
116
114
// Rounds 20 - 23
117
- sha256su0 v1.4s , v2.4s
118
- ldr q4 , [ x2 , 96 ]
119
- add v4.4s , v4.4s , v2.4s
120
- mov v7.16b , v6.16b
121
- sha256h q7 , q17 , v5.4s
122
- sha256h2 q17 , q6 , v5.4s
123
- sha256su1 v1.4s , v3.4s , v0.4s
115
+ sha256su0 v9.4s , v10.4s
116
+ mov v4.16b , v2.16b
117
+ add v6.4s , v10.4s , v22.4s
118
+ sha256h q2 , q3 , v7.4s
119
+ sha256h2 q3 , q4 , v7.4s
120
+ sha256su1 v9.4s , v11.4s , v8.4s
124
121
125
122
// Rounds 24 - 27
126
- sha256su0 v2.4s , v3.4s
127
- ldr q5 , [ x2 , 112 ]
128
- add v5.4s , v5.4s , v3.4s
129
- mov v6.16b , v7.16b
130
- sha256h q6 , q17 , v4.4s
131
- sha256h2 q17 , q7 , v4.4s
132
- sha256su1 v2.4s , v0.4s , v1.4s
123
+ sha256su0 v10.4s , v11.4s
124
+ mov v4.16b , v2.16b
125
+ add v7.4s , v11.4s , v23.4s
126
+ sha256h q2 , q3 , v6.4s
127
+ sha256h2 q3 , q4 , v6.4s
128
+ sha256su1 v10.4s , v8.4s , v9.4s
133
129
134
130
// Rounds 28 - 31
135
- sha256su0 v3.4s , v0.4s
136
- ldr q4 , [ x2 , 128 ]
137
- add v4.4s , v4.4s , v0.4s
138
- mov v7.16b , v6.16b
139
- sha256h q7 , q17 , v5.4s
140
- sha256h2 q17 , q6 , v5.4s
141
- sha256su1 v3.4s , v1.4s , v2.4s
131
+ sha256su0 v11.4s , v8.4s
132
+ mov v4.16b , v2.16b
133
+ add v6.4s , v8.4s , v24.4s
134
+ sha256h q2 , q3 , v7.4s
135
+ sha256h2 q3 , q4 , v7.4s
136
+ sha256su1 v11.4s , v9.4s , v10.4s
142
137
143
138
// Rounds 32 - 35
144
- sha256su0 v0.4s , v1.4s
145
- ldr q5 , [ x2 , 144 ]
146
- add v5.4s , v5.4s , v1.4s
147
- mov v6.16b , v7.16b
148
- sha256h q6 , q17 , v4.4s
149
- sha256h2 q17 , q7 , v4.4s
150
- sha256su1 v0.4s , v2.4s , v3.4s
139
+ sha256su0 v8.4s , v9.4s
140
+ mov v4.16b , v2.16b
141
+ add v7.4s , v9.4s , v25.4s
142
+ sha256h q2 , q3 , v6.4s
143
+ sha256h2 q3 , q4 , v6.4s
144
+ sha256su1 v8.4s , v10.4s , v11.4s
151
145
152
146
// Rounds 36 - 39
153
- sha256su0 v1.4s , v2.4s
154
- ldr q4 , [ x2 , 160 ]
155
- add v4.4s , v4.4s , v2.4s
156
- mov v7.16b , v6.16b
157
- sha256h q7 , q17 , v5.4s
158
- sha256h2 q17 , q6 , v5.4s
159
- sha256su1 v1.4s , v3.4s , v0.4s
147
+ sha256su0 v9.4s , v10.4s
148
+ mov v4.16b , v2.16b
149
+ add v6.4s , v10.4s , v26.4s
150
+ sha256h q2 , q3 , v7.4s
151
+ sha256h2 q3 , q4 , v7.4s
152
+ sha256su1 v9.4s , v11.4s , v8.4s
160
153
161
154
// Rounds 40 - 43
162
- sha256su0 v2.4s , v3.4s
163
- ldr q5 , [ x2 , 176 ]
164
- add v5.4s , v5.4s , v3.4s
165
- mov v6.16b , v7.16b
166
- sha256h q6 , q17 , v4.4s
167
- sha256h2 q17 , q7 , v4.4s
168
- sha256su1 v2.4s , v0.4s , v1.4s
155
+ sha256su0 v10.4s , v11.4s
156
+ mov v4.16b , v2.16b
157
+ add v7.4s , v11.4s , v27.4s
158
+ sha256h q2 , q3 , v6.4s
159
+ sha256h2 q3 , q4 , v6.4s
160
+ sha256su1 v10.4s , v8.4s , v9.4s
169
161
170
162
// Rounds 44 - 47
171
- sha256su0 v3.4s , v0.4s
172
- ldr q4 , [ x2 , 192 ]
173
- add v4.4s , v4.4s , v0.4s
174
- mov v7.16b , v6.16b
175
- sha256h q7 , q17 , v5.4s
176
- sha256h2 q17 , q6 , v5.4s
177
- sha256su1 v3.4s , v1.4s , v2.4s
163
+ sha256su0 v11.4s , v8.4s
164
+ mov v4.16b , v2.16b
165
+ add v6.4s , v8.4s , v28.4s
166
+ sha256h q2 , q3 , v7.4s
167
+ sha256h2 q3 , q4 , v7.4s
168
+ sha256su1 v11.4s , v9.4s , v10.4s
178
169
179
170
// Rounds 48 - 51
180
- ldr q5 , [ x2 , 208 ]
181
- add v5.4s , v5.4s , v1.4s
182
- mov v6.16b , v7.16b
183
- sha256h q6 , q17 , v4.4s
184
- sha256h2 q17 , q7 , v4.4s
171
+ mov v4.16b , v2.16b
172
+ add v7.4s , v9.4s , v29.4s
173
+ sha256h q2 , q3 , v6.4s
174
+ sha256h2 q3 , q4 , v6.4s
185
175
186
176
// Rounds 52 - 55
187
- ldr q4 , [ x2 , 224 ]
188
- add v4.4s , v4.4s , v2.4s
189
- mov v7.16b , v6.16b
190
- sha256h q7 , q17 , v5.4s
191
- sha256h2 q17 , q6 , v5.4s
177
+ mov v4.16b , v2.16b
178
+ add v6.4s , v10.4s , v30.4s
179
+ sha256h q2 , q3 , v7.4s
180
+ sha256h2 q3 , q4 , v7.4s
192
181
193
182
// Rounds 56 - 59
194
- ldr q5 , [ x2 , 240 ]
195
- add v5.4s , v5.4s , v3.4s
196
- mov v6.16b , v7.16b
197
- sha256h q6 , q17 , v4.4s
198
- sha256h2 q17 , q7 , v4.4s
183
+ mov v4.16b , v2.16b
184
+ add v7.4s , v11.4s , v31.4s
185
+ sha256h q2 , q3 , v6.4s
186
+ sha256h2 q3 , q4 , v6.4s
199
187
200
188
// Rounds 60 - 63
201
- mov v7 .16b , v6 .16b
202
- sha256h q7 , q17 , v5 .4s
203
- sha256h2 q17 , q6 , v5 .4s
189
+ mov v4 .16b , v2 .16b
190
+ sha256h q2 , q3 , v7 .4s
191
+ sha256h2 q3 , q4 , v7 .4s
204
192
205
193
// Update state
206
- add v16.4s , v16.4s , v7.4s
207
- str q16 , [ x0 ]
208
- add v18.4s , v18.4s , v17.4s
209
- str q18 , [ x0 , 16 ]
194
+ add v0.4s , v0.4s , v2.4s
195
+ add v1.4s , v1.4s , v3.4s
196
+ stp q0 , q1 , [ x0 ]
210
197
198
+ // restore
199
+ ldp d10 , d11 , [ sp , # 16 ]
200
+ ldp d8 , d9 , [ sp ], # 32
201
+
211
202
ret
212
203
. align 4
213
204
.K:
0 commit comments