Commit | Line | Data |
---|---|---|
f0be44f4 DM |
1 | #define __ARM_ARCH__ __LINUX_ARM_ARCH__ |
2 | @ ==================================================================== | |
3 | @ Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | |
4 | @ project. The module is, however, dual licensed under OpenSSL and | |
5 | @ CRYPTOGAMS licenses depending on where you obtain it. For further | |
6 | @ details see http://www.openssl.org/~appro/cryptogams/. | |
7 | @ ==================================================================== | |
8 | ||
9 | @ sha1_block procedure for ARMv4. | |
10 | @ | |
11 | @ January 2007. | |
12 | ||
13 | @ Size/performance trade-off | |
14 | @ ==================================================================== | |
15 | @ impl size in bytes comp cycles[*] measured performance | |
16 | @ ==================================================================== | |
17 | @ thumb 304 3212 4420 | |
18 | @ armv4-small 392/+29% 1958/+64% 2250/+96% | |
19 | @ armv4-compact 740/+89% 1552/+26% 1840/+22% | |
20 | @ armv4-large 1420/+92% 1307/+19% 1370/+34%[***] | |
21 | @ full unroll ~5100/+260% ~1260/+4% ~1300/+5% | |
22 | @ ==================================================================== | |
23 | @ thumb = same as 'small' but in Thumb instructions[**] and | |
24 | @ with recurring code in two private functions; | |
25 | @ small = detached Xload/update, loops are folded; | |
26 | @ compact = detached Xload/update, 5x unroll; | |
27 | @ large = interleaved Xload/update, 5x unroll; | |
28 | @ full unroll = interleaved Xload/update, full unroll, estimated[!]; | |
29 | @ | |
30 | @ [*] Manually counted instructions in "grand" loop body. Measured | |
31 | @ performance is affected by prologue and epilogue overhead, | |
32 | @ i-cache availability, branch penalties, etc. | |
33 | @ [**] While each Thumb instruction is half the size, they are not as | |
34 | @ diverse as ARM ones: e.g., there are only two arithmetic | |
35 | @ instructions with 3 arguments, no [fixed] rotate, addressing | |
36 | @ modes are limited. As a result it takes more instructions to do | |
37 | @ the same job in Thumb, therefore the code is never twice as | |
38 | @ small and is always slower. | |
39 | @ [***] which is also ~35% better than compiler generated code. Dual- | |
40 | @ issue Cortex A8 core was measured to process input block in | |
41 | @ ~990 cycles. | |
42 | ||
43 | @ August 2010. | |
44 | @ | |
45 | @ Rescheduling for dual-issue pipeline resulted in 13% improvement on | |
46 | @ Cortex A8 core and in absolute terms ~870 cycles per input block | |
47 | @ [or 13.6 cycles per byte]. | |
48 | ||
49 | @ February 2011. | |
50 | @ | |
51 | @ Profiler-assisted and platform-specific optimization resulted in 10% | |
52 | @ improvement on Cortex A8 core and 12.2 cycles per byte. | |
53 | ||
638591cd | 54 | #include <linux/linkage.h> |
f0be44f4 | 55 | |
638591cd | 56 | .text |
f0be44f4 DM |
57 | |
58 | .align 2 | |
@ sha1_block_data_order: SHA-1 compression over consecutive 64-byte blocks.
@ Register roles, as used by the code in this routine:
@   r0      pointer to five 32-bit state words (loaded here, stored back at .L_done)
@   r1      input pointer, advanced by 4 for every message word fetched
@   r2      scaled by 64 and added to r1 below, giving the end-of-input address
@   r3-r7   working variables A,B,C,D,E (the roles rotate from round to round)
@   r8      round constant K for the current group of 20 rounds
@   r9-r12  scratch (message word, F value, schedule operands)
@   r14     descending pointer into the message schedule kept on the stack
@ r4-r12 and lr are pushed on entry; the routine returns by popping them with pc.
@ NOTE(review): presumably the C prototype is (state, data, num_blocks); confirm against the caller.
@ NOTE: the end-of-input test is made only after a block has been processed,
@ so a block count of zero is not handled by this code.
638591cd | 59 | ENTRY(sha1_block_data_order) |
f0be44f4 DM |
60 | stmdb sp!,{r4-r12,lr} |
61 | add r2,r1,r2,lsl#6 @ r2 to point at the end of r1 | |
62 | ldmia r0,{r3,r4,r5,r6,r7} | |
63 | .Lloop: | |
@ Per-block setup: r8 = K for rounds 0..19, r14 = top of the schedule area.
@ sp is lowered by 15 words so that "r14 == sp" marks the end of the first 15 rounds.
64 | ldr r8,.LK_00_19 | |
65 | mov r14,sp | |
66 | sub sp,sp,#15*4 | |
@ C, D and E are pre-rotated right by 30 bits; every later use applies ror#2,
@ which brings the original value back (30 + 2 = 32 bits of rotation).
67 | mov r5,r5,ror#30 | |
68 | mov r6,r6,ror#30 | |
69 | mov r7,r7,ror#30 @ [6] | |
@ Rounds 0..14: five rounds per pass, three passes (until r14 reaches sp).
@ Each round fetches one message word into r9 with the first input byte as the
@ most significant byte, pushes it onto the schedule with a pre-decrement store
@ through r14, and adds K, ROR(A,27), X[i] and F_00_19 = (B & (C ^ D)) ^ D into
@ the register currently acting as E.
@ B is never rotated in place: the following round reads it as C with ror#2.
70 | .L_00_15: | |
@ round with E=r7 A=r3 B=r4 C=r5 D=r6
@ Below ARMv7 the word is assembled from four byte loads; otherwise a single
@ word load is used and byte-swapped when __ARMEL__ is defined.
71 | #if __ARM_ARCH__<7 | |
72 | ldrb r10,[r1,#2] | |
73 | ldrb r9,[r1,#3] | |
74 | ldrb r11,[r1,#1] | |
75 | add r7,r8,r7,ror#2 @ E+=K_00_19 | |
76 | ldrb r12,[r1],#4 | |
77 | orr r9,r9,r10,lsl#8 | |
78 | eor r10,r5,r6 @ F_xx_xx | |
79 | orr r9,r9,r11,lsl#16 | |
80 | add r7,r7,r3,ror#27 @ E+=ROR(A,27) | |
81 | orr r9,r9,r12,lsl#24 | |
82 | #else | |
83 | ldr r9,[r1],#4 @ handles unaligned | |
84 | add r7,r8,r7,ror#2 @ E+=K_00_19 | |
85 | eor r10,r5,r6 @ F_xx_xx | |
86 | add r7,r7,r3,ror#27 @ E+=ROR(A,27) | |
87 | #ifdef __ARMEL__ | |
88 | rev r9,r9 @ byte swap | |
89 | #endif | |
90 | #endif | |
91 | and r10,r4,r10,ror#2 | |
92 | add r7,r7,r9 @ E+=X[i] | |
93 | eor r10,r10,r6,ror#2 @ F_00_19(B,C,D) | |
94 | str r9,[r14,#-4]! | |
95 | add r7,r7,r10 @ E+=F_00_19(B,C,D) | |
@ round with E=r6 A=r7 B=r3 C=r4 D=r5
96 | #if __ARM_ARCH__<7 | |
97 | ldrb r10,[r1,#2] | |
98 | ldrb r9,[r1,#3] | |
99 | ldrb r11,[r1,#1] | |
100 | add r6,r8,r6,ror#2 @ E+=K_00_19 | |
101 | ldrb r12,[r1],#4 | |
102 | orr r9,r9,r10,lsl#8 | |
103 | eor r10,r4,r5 @ F_xx_xx | |
104 | orr r9,r9,r11,lsl#16 | |
105 | add r6,r6,r7,ror#27 @ E+=ROR(A,27) | |
106 | orr r9,r9,r12,lsl#24 | |
107 | #else | |
108 | ldr r9,[r1],#4 @ handles unaligned | |
109 | add r6,r8,r6,ror#2 @ E+=K_00_19 | |
110 | eor r10,r4,r5 @ F_xx_xx | |
111 | add r6,r6,r7,ror#27 @ E+=ROR(A,27) | |
112 | #ifdef __ARMEL__ | |
113 | rev r9,r9 @ byte swap | |
114 | #endif | |
115 | #endif | |
116 | and r10,r3,r10,ror#2 | |
117 | add r6,r6,r9 @ E+=X[i] | |
118 | eor r10,r10,r5,ror#2 @ F_00_19(B,C,D) | |
119 | str r9,[r14,#-4]! | |
120 | add r6,r6,r10 @ E+=F_00_19(B,C,D) | |
@ round with E=r5 A=r6 B=r7 C=r3 D=r4
121 | #if __ARM_ARCH__<7 | |
122 | ldrb r10,[r1,#2] | |
123 | ldrb r9,[r1,#3] | |
124 | ldrb r11,[r1,#1] | |
125 | add r5,r8,r5,ror#2 @ E+=K_00_19 | |
126 | ldrb r12,[r1],#4 | |
127 | orr r9,r9,r10,lsl#8 | |
128 | eor r10,r3,r4 @ F_xx_xx | |
129 | orr r9,r9,r11,lsl#16 | |
130 | add r5,r5,r6,ror#27 @ E+=ROR(A,27) | |
131 | orr r9,r9,r12,lsl#24 | |
132 | #else | |
133 | ldr r9,[r1],#4 @ handles unaligned | |
134 | add r5,r8,r5,ror#2 @ E+=K_00_19 | |
135 | eor r10,r3,r4 @ F_xx_xx | |
136 | add r5,r5,r6,ror#27 @ E+=ROR(A,27) | |
137 | #ifdef __ARMEL__ | |
138 | rev r9,r9 @ byte swap | |
139 | #endif | |
140 | #endif | |
141 | and r10,r7,r10,ror#2 | |
142 | add r5,r5,r9 @ E+=X[i] | |
143 | eor r10,r10,r4,ror#2 @ F_00_19(B,C,D) | |
144 | str r9,[r14,#-4]! | |
145 | add r5,r5,r10 @ E+=F_00_19(B,C,D) | |
@ round with E=r4 A=r5 B=r6 C=r7 D=r3
146 | #if __ARM_ARCH__<7 | |
147 | ldrb r10,[r1,#2] | |
148 | ldrb r9,[r1,#3] | |
149 | ldrb r11,[r1,#1] | |
150 | add r4,r8,r4,ror#2 @ E+=K_00_19 | |
151 | ldrb r12,[r1],#4 | |
152 | orr r9,r9,r10,lsl#8 | |
153 | eor r10,r7,r3 @ F_xx_xx | |
154 | orr r9,r9,r11,lsl#16 | |
155 | add r4,r4,r5,ror#27 @ E+=ROR(A,27) | |
156 | orr r9,r9,r12,lsl#24 | |
157 | #else | |
158 | ldr r9,[r1],#4 @ handles unaligned | |
159 | add r4,r8,r4,ror#2 @ E+=K_00_19 | |
160 | eor r10,r7,r3 @ F_xx_xx | |
161 | add r4,r4,r5,ror#27 @ E+=ROR(A,27) | |
162 | #ifdef __ARMEL__ | |
163 | rev r9,r9 @ byte swap | |
164 | #endif | |
165 | #endif | |
166 | and r10,r6,r10,ror#2 | |
167 | add r4,r4,r9 @ E+=X[i] | |
168 | eor r10,r10,r3,ror#2 @ F_00_19(B,C,D) | |
169 | str r9,[r14,#-4]! | |
170 | add r4,r4,r10 @ E+=F_00_19(B,C,D) | |
@ round with E=r3 A=r4 B=r5 C=r6 D=r7
171 | #if __ARM_ARCH__<7 | |
172 | ldrb r10,[r1,#2] | |
173 | ldrb r9,[r1,#3] | |
174 | ldrb r11,[r1,#1] | |
175 | add r3,r8,r3,ror#2 @ E+=K_00_19 | |
176 | ldrb r12,[r1],#4 | |
177 | orr r9,r9,r10,lsl#8 | |
178 | eor r10,r6,r7 @ F_xx_xx | |
179 | orr r9,r9,r11,lsl#16 | |
180 | add r3,r3,r4,ror#27 @ E+=ROR(A,27) | |
181 | orr r9,r9,r12,lsl#24 | |
182 | #else | |
183 | ldr r9,[r1],#4 @ handles unaligned | |
184 | add r3,r8,r3,ror#2 @ E+=K_00_19 | |
185 | eor r10,r6,r7 @ F_xx_xx | |
186 | add r3,r3,r4,ror#27 @ E+=ROR(A,27) | |
187 | #ifdef __ARMEL__ | |
188 | rev r9,r9 @ byte swap | |
189 | #endif | |
190 | #endif | |
191 | and r10,r5,r10,ror#2 | |
192 | add r3,r3,r9 @ E+=X[i] | |
193 | eor r10,r10,r7,ror#2 @ F_00_19(B,C,D) | |
194 | str r9,[r14,#-4]! | |
195 | add r3,r3,r10 @ E+=F_00_19(B,C,D) | |
@ Loop until the 15 reserved schedule words have been filled.
638591cd | 196 | cmp r14,sp |
f0be44f4 DM |
197 | bne .L_00_15 @ [((11+4)*5+2)*3] |
@ Round 15: the last message word taken directly from the input
@ (E=r7 A=r3 B=r4 C=r5 D=r6). Its store goes one word below sp.
198 | #if __ARM_ARCH__<7 | |
199 | ldrb r10,[r1,#2] | |
200 | ldrb r9,[r1,#3] | |
201 | ldrb r11,[r1,#1] | |
202 | add r7,r8,r7,ror#2 @ E+=K_00_19 | |
203 | ldrb r12,[r1],#4 | |
204 | orr r9,r9,r10,lsl#8 | |
205 | eor r10,r5,r6 @ F_xx_xx | |
206 | orr r9,r9,r11,lsl#16 | |
207 | add r7,r7,r3,ror#27 @ E+=ROR(A,27) | |
208 | orr r9,r9,r12,lsl#24 | |
209 | #else | |
210 | ldr r9,[r1],#4 @ handles unaligned | |
211 | add r7,r8,r7,ror#2 @ E+=K_00_19 | |
212 | eor r10,r5,r6 @ F_xx_xx | |
213 | add r7,r7,r3,ror#27 @ E+=ROR(A,27) | |
214 | #ifdef __ARMEL__ | |
215 | rev r9,r9 @ byte swap | |
216 | #endif | |
217 | #endif | |
218 | and r10,r4,r10,ror#2 | |
219 | add r7,r7,r9 @ E+=X[i] | |
220 | eor r10,r10,r6,ror#2 @ F_00_19(B,C,D) | |
221 | str r9,[r14,#-4]! | |
222 | add r7,r7,r10 @ E+=F_00_19(B,C,D) | |
@ Rounds 16..19: from here on X[i] is derived from the schedule instead of the input.
@ r14 points at X[i-1] and the words are stored at descending addresses, so the
@ offsets 15*4, 13*4, 7*4 and 2*4 address X[i-16], X[i-14], X[i-8] and X[i-3].
@ The four words are XORed and rotated left by one bit (written as ror#31),
@ then pushed as the new X[i]. F is still F_00_19.
@ round 16: E=r6 A=r7 B=r3 C=r4 D=r5
223 | ldr r9,[r14,#15*4] | |
224 | ldr r10,[r14,#13*4] | |
225 | ldr r11,[r14,#7*4] | |
226 | add r6,r8,r6,ror#2 @ E+=K_xx_xx | |
227 | ldr r12,[r14,#2*4] | |
228 | eor r9,r9,r10 | |
229 | eor r11,r11,r12 @ 1 cycle stall | |
230 | eor r10,r4,r5 @ F_xx_xx | |
231 | mov r9,r9,ror#31 | |
232 | add r6,r6,r7,ror#27 @ E+=ROR(A,27) | |
233 | eor r9,r9,r11,ror#31 | |
234 | str r9,[r14,#-4]! | |
235 | and r10,r3,r10,ror#2 @ F_xx_xx | |
236 | @ F_xx_xx | |
237 | add r6,r6,r9 @ E+=X[i] | |
238 | eor r10,r10,r5,ror#2 @ F_00_19(B,C,D) | |
239 | add r6,r6,r10 @ E+=F_00_19(B,C,D) | |
@ round 17: E=r5 A=r6 B=r7 C=r3 D=r4
240 | ldr r9,[r14,#15*4] | |
241 | ldr r10,[r14,#13*4] | |
242 | ldr r11,[r14,#7*4] | |
243 | add r5,r8,r5,ror#2 @ E+=K_xx_xx | |
244 | ldr r12,[r14,#2*4] | |
245 | eor r9,r9,r10 | |
246 | eor r11,r11,r12 @ 1 cycle stall | |
247 | eor r10,r3,r4 @ F_xx_xx | |
248 | mov r9,r9,ror#31 | |
249 | add r5,r5,r6,ror#27 @ E+=ROR(A,27) | |
250 | eor r9,r9,r11,ror#31 | |
251 | str r9,[r14,#-4]! | |
252 | and r10,r7,r10,ror#2 @ F_xx_xx | |
253 | @ F_xx_xx | |
254 | add r5,r5,r9 @ E+=X[i] | |
255 | eor r10,r10,r4,ror#2 @ F_00_19(B,C,D) | |
256 | add r5,r5,r10 @ E+=F_00_19(B,C,D) | |
@ round 18: E=r4 A=r5 B=r6 C=r7 D=r3
257 | ldr r9,[r14,#15*4] | |
258 | ldr r10,[r14,#13*4] | |
259 | ldr r11,[r14,#7*4] | |
260 | add r4,r8,r4,ror#2 @ E+=K_xx_xx | |
261 | ldr r12,[r14,#2*4] | |
262 | eor r9,r9,r10 | |
263 | eor r11,r11,r12 @ 1 cycle stall | |
264 | eor r10,r7,r3 @ F_xx_xx | |
265 | mov r9,r9,ror#31 | |
266 | add r4,r4,r5,ror#27 @ E+=ROR(A,27) | |
267 | eor r9,r9,r11,ror#31 | |
268 | str r9,[r14,#-4]! | |
269 | and r10,r6,r10,ror#2 @ F_xx_xx | |
270 | @ F_xx_xx | |
271 | add r4,r4,r9 @ E+=X[i] | |
272 | eor r10,r10,r3,ror#2 @ F_00_19(B,C,D) | |
273 | add r4,r4,r10 @ E+=F_00_19(B,C,D) | |
@ round 19: E=r3 A=r4 B=r5 C=r6 D=r7
274 | ldr r9,[r14,#15*4] | |
275 | ldr r10,[r14,#13*4] | |
276 | ldr r11,[r14,#7*4] | |
277 | add r3,r8,r3,ror#2 @ E+=K_xx_xx | |
278 | ldr r12,[r14,#2*4] | |
279 | eor r9,r9,r10 | |
280 | eor r11,r11,r12 @ 1 cycle stall | |
281 | eor r10,r6,r7 @ F_xx_xx | |
282 | mov r9,r9,ror#31 | |
283 | add r3,r3,r4,ror#27 @ E+=ROR(A,27) | |
284 | eor r9,r9,r11,ror#31 | |
285 | str r9,[r14,#-4]! | |
286 | and r10,r5,r10,ror#2 @ F_xx_xx | |
287 | @ F_xx_xx | |
288 | add r3,r3,r9 @ E+=X[i] | |
289 | eor r10,r10,r7,ror#2 @ F_00_19(B,C,D) | |
290 | add r3,r3,r10 @ E+=F_00_19(B,C,D) | |
291 | ||
@ Setup for rounds 20..39: new K in r8; sp drops a further 25 words, which
@ leaves it 20 words below r14 (20 words have been pushed so far, 40 reserved).
292 | ldr r8,.LK_20_39 @ [+15+16*4] | |
293 | sub sp,sp,#25*4 | |
294 | cmn sp,#0 @ [+3], clear carry to denote 20_39 | |
@ Loop body shared by rounds 20..39 and rounds 60..79; both use F = B ^ C ^ D.
@ K is whatever the entering code left in r8. The carry flag tells the two uses
@ apart (clear for 20..39, set for 60..79) and is only tested after the loop.
@ Five rounds per pass; the loop runs until r14 reaches sp (20 schedule words).
@ X[i] is produced exactly as in rounds 16..19: XOR of the words at r14+15*4,
@ +13*4, +7*4 and +2*4, rotated left by one bit, pushed with pre-decrement.
295 | .L_20_39_or_60_79: | |
@ round with E=r7 A=r3 B=r4 C=r5 D=r6
296 | ldr r9,[r14,#15*4] | |
297 | ldr r10,[r14,#13*4] | |
298 | ldr r11,[r14,#7*4] | |
299 | add r7,r8,r7,ror#2 @ E+=K_xx_xx | |
300 | ldr r12,[r14,#2*4] | |
301 | eor r9,r9,r10 | |
302 | eor r11,r11,r12 @ 1 cycle stall | |
303 | eor r10,r5,r6 @ F_xx_xx | |
304 | mov r9,r9,ror#31 | |
305 | add r7,r7,r3,ror#27 @ E+=ROR(A,27) | |
306 | eor r9,r9,r11,ror#31 | |
307 | str r9,[r14,#-4]! | |
308 | eor r10,r4,r10,ror#2 @ F_xx_xx | |
309 | @ F_xx_xx | |
310 | add r7,r7,r9 @ E+=X[i] | |
311 | add r7,r7,r10 @ E+=F_20_39(B,C,D) | |
@ round with E=r6 A=r7 B=r3 C=r4 D=r5
312 | ldr r9,[r14,#15*4] | |
313 | ldr r10,[r14,#13*4] | |
314 | ldr r11,[r14,#7*4] | |
315 | add r6,r8,r6,ror#2 @ E+=K_xx_xx | |
316 | ldr r12,[r14,#2*4] | |
317 | eor r9,r9,r10 | |
318 | eor r11,r11,r12 @ 1 cycle stall | |
319 | eor r10,r4,r5 @ F_xx_xx | |
320 | mov r9,r9,ror#31 | |
321 | add r6,r6,r7,ror#27 @ E+=ROR(A,27) | |
322 | eor r9,r9,r11,ror#31 | |
323 | str r9,[r14,#-4]! | |
324 | eor r10,r3,r10,ror#2 @ F_xx_xx | |
325 | @ F_xx_xx | |
326 | add r6,r6,r9 @ E+=X[i] | |
327 | add r6,r6,r10 @ E+=F_20_39(B,C,D) | |
@ round with E=r5 A=r6 B=r7 C=r3 D=r4
328 | ldr r9,[r14,#15*4] | |
329 | ldr r10,[r14,#13*4] | |
330 | ldr r11,[r14,#7*4] | |
331 | add r5,r8,r5,ror#2 @ E+=K_xx_xx | |
332 | ldr r12,[r14,#2*4] | |
333 | eor r9,r9,r10 | |
334 | eor r11,r11,r12 @ 1 cycle stall | |
335 | eor r10,r3,r4 @ F_xx_xx | |
336 | mov r9,r9,ror#31 | |
337 | add r5,r5,r6,ror#27 @ E+=ROR(A,27) | |
338 | eor r9,r9,r11,ror#31 | |
339 | str r9,[r14,#-4]! | |
340 | eor r10,r7,r10,ror#2 @ F_xx_xx | |
341 | @ F_xx_xx | |
342 | add r5,r5,r9 @ E+=X[i] | |
343 | add r5,r5,r10 @ E+=F_20_39(B,C,D) | |
@ round with E=r4 A=r5 B=r6 C=r7 D=r3
344 | ldr r9,[r14,#15*4] | |
345 | ldr r10,[r14,#13*4] | |
346 | ldr r11,[r14,#7*4] | |
347 | add r4,r8,r4,ror#2 @ E+=K_xx_xx | |
348 | ldr r12,[r14,#2*4] | |
349 | eor r9,r9,r10 | |
350 | eor r11,r11,r12 @ 1 cycle stall | |
351 | eor r10,r7,r3 @ F_xx_xx | |
352 | mov r9,r9,ror#31 | |
353 | add r4,r4,r5,ror#27 @ E+=ROR(A,27) | |
354 | eor r9,r9,r11,ror#31 | |
355 | str r9,[r14,#-4]! | |
356 | eor r10,r6,r10,ror#2 @ F_xx_xx | |
357 | @ F_xx_xx | |
358 | add r4,r4,r9 @ E+=X[i] | |
359 | add r4,r4,r10 @ E+=F_20_39(B,C,D) | |
@ round with E=r3 A=r4 B=r5 C=r6 D=r7
360 | ldr r9,[r14,#15*4] | |
361 | ldr r10,[r14,#13*4] | |
362 | ldr r11,[r14,#7*4] | |
363 | add r3,r8,r3,ror#2 @ E+=K_xx_xx | |
364 | ldr r12,[r14,#2*4] | |
365 | eor r9,r9,r10 | |
366 | eor r11,r11,r12 @ 1 cycle stall | |
367 | eor r10,r6,r7 @ F_xx_xx | |
368 | mov r9,r9,ror#31 | |
369 | add r3,r3,r4,ror#27 @ E+=ROR(A,27) | |
370 | eor r9,r9,r11,ror#31 | |
371 | str r9,[r14,#-4]! | |
372 | eor r10,r5,r10,ror#2 @ F_xx_xx | |
373 | @ F_xx_xx | |
374 | add r3,r3,r9 @ E+=X[i] | |
375 | add r3,r3,r10 @ E+=F_20_39(B,C,D) | |
@ Loop test uses teq rather than cmp so that the carry marker set up before
@ entry is still intact for the bcs below. The THUMB variant first copies sp
@ into r11 and compares against that.
638591cd DM |
376 | ARM( teq r14,sp ) @ preserve carry |
377 | THUMB( mov r11,sp ) | |
378 | THUMB( teq r14,r11 ) @ preserve carry | |
f0be44f4 DM |
379 | bne .L_20_39_or_60_79 @ [+((12+3)*5+2)*4] |
@ Carry set means this was the 60..79 pass, so the block is finished.
380 | bcs .L_done @ [+((12+3)*5+2)*4], spare 300 bytes | |
381 | ||
@ Otherwise fall through to rounds 40..59: new K, 20 more schedule words reserved.
382 | ldr r8,.LK_40_59 | |
383 | sub sp,sp,#20*4 @ [+2] | |
@ Rounds 40..59: five rounds per pass until r14 reaches sp (20 schedule words).
@ F_40_59 is accumulated as two separate terms, (B & (C ^ D)) in r10 and
@ (C & D) in r11. The two terms can never have the same bit set, so adding
@ them one after the other gives the same result as OR-ing them together.
@ r11 is formed from the stored (rotated) C and D, hence the ror#2 on its add.
@ X[i] is produced as in the earlier schedule rounds: XOR of the words at
@ r14+15*4, +13*4, +7*4 and +2*4, rotated left by one bit (ror#31).
384 | .L_40_59: | |
@ round with E=r7 A=r3 B=r4 C=r5 D=r6
385 | ldr r9,[r14,#15*4] | |
386 | ldr r10,[r14,#13*4] | |
387 | ldr r11,[r14,#7*4] | |
388 | add r7,r8,r7,ror#2 @ E+=K_xx_xx | |
389 | ldr r12,[r14,#2*4] | |
390 | eor r9,r9,r10 | |
391 | eor r11,r11,r12 @ 1 cycle stall | |
392 | eor r10,r5,r6 @ F_xx_xx | |
393 | mov r9,r9,ror#31 | |
394 | add r7,r7,r3,ror#27 @ E+=ROR(A,27) | |
395 | eor r9,r9,r11,ror#31 | |
396 | str r9,[r14,#-4]! | |
397 | and r10,r4,r10,ror#2 @ F_xx_xx | |
398 | and r11,r5,r6 @ F_xx_xx | |
399 | add r7,r7,r9 @ E+=X[i] | |
400 | add r7,r7,r10 @ E+=F_40_59(B,C,D) | |
401 | add r7,r7,r11,ror#2 | |
@ round with E=r6 A=r7 B=r3 C=r4 D=r5
402 | ldr r9,[r14,#15*4] | |
403 | ldr r10,[r14,#13*4] | |
404 | ldr r11,[r14,#7*4] | |
405 | add r6,r8,r6,ror#2 @ E+=K_xx_xx | |
406 | ldr r12,[r14,#2*4] | |
407 | eor r9,r9,r10 | |
408 | eor r11,r11,r12 @ 1 cycle stall | |
409 | eor r10,r4,r5 @ F_xx_xx | |
410 | mov r9,r9,ror#31 | |
411 | add r6,r6,r7,ror#27 @ E+=ROR(A,27) | |
412 | eor r9,r9,r11,ror#31 | |
413 | str r9,[r14,#-4]! | |
414 | and r10,r3,r10,ror#2 @ F_xx_xx | |
415 | and r11,r4,r5 @ F_xx_xx | |
416 | add r6,r6,r9 @ E+=X[i] | |
417 | add r6,r6,r10 @ E+=F_40_59(B,C,D) | |
418 | add r6,r6,r11,ror#2 | |
@ round with E=r5 A=r6 B=r7 C=r3 D=r4
419 | ldr r9,[r14,#15*4] | |
420 | ldr r10,[r14,#13*4] | |
421 | ldr r11,[r14,#7*4] | |
422 | add r5,r8,r5,ror#2 @ E+=K_xx_xx | |
423 | ldr r12,[r14,#2*4] | |
424 | eor r9,r9,r10 | |
425 | eor r11,r11,r12 @ 1 cycle stall | |
426 | eor r10,r3,r4 @ F_xx_xx | |
427 | mov r9,r9,ror#31 | |
428 | add r5,r5,r6,ror#27 @ E+=ROR(A,27) | |
429 | eor r9,r9,r11,ror#31 | |
430 | str r9,[r14,#-4]! | |
431 | and r10,r7,r10,ror#2 @ F_xx_xx | |
432 | and r11,r3,r4 @ F_xx_xx | |
433 | add r5,r5,r9 @ E+=X[i] | |
434 | add r5,r5,r10 @ E+=F_40_59(B,C,D) | |
435 | add r5,r5,r11,ror#2 | |
@ round with E=r4 A=r5 B=r6 C=r7 D=r3
436 | ldr r9,[r14,#15*4] | |
437 | ldr r10,[r14,#13*4] | |
438 | ldr r11,[r14,#7*4] | |
439 | add r4,r8,r4,ror#2 @ E+=K_xx_xx | |
440 | ldr r12,[r14,#2*4] | |
441 | eor r9,r9,r10 | |
442 | eor r11,r11,r12 @ 1 cycle stall | |
443 | eor r10,r7,r3 @ F_xx_xx | |
444 | mov r9,r9,ror#31 | |
445 | add r4,r4,r5,ror#27 @ E+=ROR(A,27) | |
446 | eor r9,r9,r11,ror#31 | |
447 | str r9,[r14,#-4]! | |
448 | and r10,r6,r10,ror#2 @ F_xx_xx | |
449 | and r11,r7,r3 @ F_xx_xx | |
450 | add r4,r4,r9 @ E+=X[i] | |
451 | add r4,r4,r10 @ E+=F_40_59(B,C,D) | |
452 | add r4,r4,r11,ror#2 | |
@ round with E=r3 A=r4 B=r5 C=r6 D=r7
453 | ldr r9,[r14,#15*4] | |
454 | ldr r10,[r14,#13*4] | |
455 | ldr r11,[r14,#7*4] | |
456 | add r3,r8,r3,ror#2 @ E+=K_xx_xx | |
457 | ldr r12,[r14,#2*4] | |
458 | eor r9,r9,r10 | |
459 | eor r11,r11,r12 @ 1 cycle stall | |
460 | eor r10,r6,r7 @ F_xx_xx | |
461 | mov r9,r9,ror#31 | |
462 | add r3,r3,r4,ror#27 @ E+=ROR(A,27) | |
463 | eor r9,r9,r11,ror#31 | |
464 | str r9,[r14,#-4]! | |
465 | and r10,r5,r10,ror#2 @ F_xx_xx | |
466 | and r11,r6,r7 @ F_xx_xx | |
467 | add r3,r3,r9 @ E+=X[i] | |
468 | add r3,r3,r10 @ E+=F_40_59(B,C,D) | |
469 | add r3,r3,r11,ror#2 | |
@ No flag needs to survive this loop, so a plain cmp is used for the test.
638591cd | 470 | cmp r14,sp |
f0be44f4 DM |
471 | bne .L_40_59 @ [+((12+5)*5+2)*4] |
472 | ||
@ Setup for rounds 60..79: new K, 20 more schedule words reserved, carry set,
@ then re-enter the loop shared with rounds 20..39.
473 | ldr r8,.LK_60_79 | |
474 | sub sp,sp,#20*4 | |
475 | cmp sp,#0 @ set carry to denote 60_79 | |
476 | b .L_20_39_or_60_79 @ [+4], spare 300 bytes | |
@ End of one 64-byte block. The 80 schedule words reserved during the block
@ (15 + 25 + 20 + 20) are released, the working variables are added to the
@ state words reloaded from r0, and the sums are stored back through r0.
@ C, D and E are still held in rotated form, hence the ror#2 on their adds.
477 | .L_done: | |
478 | add sp,sp,#80*4 @ "deallocate" stack frame | |
479 | ldmia r0,{r8,r9,r10,r11,r12} | |
480 | add r3,r8,r3 | |
481 | add r4,r9,r4 | |
482 | add r5,r10,r5,ror#2 | |
483 | add r6,r11,r6,ror#2 | |
484 | add r7,r12,r7,ror#2 | |
485 | stmia r0,{r3,r4,r5,r6,r7} | |
@ Continue with the next block until the input pointer reaches the end address.
486 | teq r1,r2 | |
487 | bne .Lloop @ [+18], total 1307 | |
488 | ||
@ Restore the registers pushed on entry; loading the saved lr into pc returns.
f0be44f4 | 489 | ldmia sp!,{r4-r12,pc} |
f0be44f4 DM |
490 | .align 2 |
@ Round constants, one per group of 20 rounds, loaded into r8 by label.
491 | .LK_00_19: .word 0x5a827999 | |
492 | .LK_20_39: .word 0x6ed9eba1 | |
493 | .LK_40_59: .word 0x8f1bbcdc | |
494 | .LK_60_79: .word 0xca62c1d6 | |
638591cd | 495 | ENDPROC(sha1_block_data_order) |
f0be44f4 DM |
@ Identification string emitted after the routine.
496 | .asciz "SHA1 block transform for ARMv4, CRYPTOGAMS by <appro@openssl.org>" |
497 | .align 2 |