1 /* $FreeBSD$ */
2 /* Do not modify. This file is auto-generated from chacha-armv8.pl. */
3 #include "arm_arch.h"
4
5 .text
6
7
8 .hidden OPENSSL_armcap_P
9
// Constant pool used by the ChaCha20 routines below. It sits in the section
// opened by the .text directive above and is addressed PC-relatively (adr).
10 .align 5
// First row of the ChaCha state. Stored little-endian, the two quads spell
// the ASCII string "expand 32-byte k".
11 .Lsigma:
12 .quad 0x3320646e61707865,0x6b20657479622d32 // endian-neutral
// Vector {1,0,0,0}: adding it to a counter row bumps only the first 32-bit
// word. It must stay directly behind .Lsigma, because the NEON code reads it
// through the .Lsigma pointer after post-incrementing that pointer by 16.
13 .Lone:
14 .long 1,0,0,0
// Distance from this slot to the OPENSSL_armcap_P variable. ChaCha20_ctr32
// loads this value and adds the slot's own address to locate the variable.
// The slot is 32 bits wide under __ILP32__ and 64 bits wide otherwise.
15 .LOPENSSL_armcap_P:
16 #ifdef __ILP32__
17 .long OPENSSL_armcap_P-.
18 #else
19 .quad OPENSSL_armcap_P-.
20 #endif
// NUL-terminated ASCII identification string:
// "ChaCha20 for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
21 .byte 67,104,97,67,104,97,50,48,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
// Restore 4-byte alignment after the odd-length string.
22 .align 2
23
// ChaCha20_ctr32 -- scalar (general-purpose register) ChaCha20 keystream XOR.
//
// Register use on entry, as read from the code below:
//   x0 = output pointer (written 64 bytes at a time, byte-wise in the tail)
//   x1 = input pointer
//   x2 = length in bytes (0 returns immediately)
//   x3 = pointer to the 32-byte key
//   x4 = pointer to the 16-byte counter block
// NOTE(review): presumably this matches OpenSSL's C prototype
//   void ChaCha20_ctr32(unsigned char *out, const unsigned char *inp,
//                       size_t len, const unsigned int key[8],
//                       const unsigned int counter[4]);
// -- confirm against the declaring header.
//
// Inputs of 192 bytes or more are handed to ChaCha20_neon when the
// ARMV7_NEON bit is set in OPENSSL_armcap_P; everything else runs here.
// Frame: 96 bytes holding x29/x30 and x19-x28, plus 64 bytes of scratch at sp
// that the tail path uses to hold one keystream block.
//
// Working-state layout inside the round loop (one 32-bit word per register):
//   row a: w5  w6  w7  w8      row b: w9  w10 w11 w12
//   row c: w13 w14 w15 w16     row d: w17 w19 w20 w21
// x22-x28 and x30 keep the input block packed two words per register.
24 .globl ChaCha20_ctr32
25 .type ChaCha20_ctr32,%function
26 .align 5
27 ChaCha20_ctr32:
28 cbz x2,.Labort // zero length: return before any frame is built
29 adr x5,.LOPENSSL_armcap_P
30 cmp x2,#192
31 b.lo .Lshort // short inputs never take the NEON path
32 #ifdef __ILP32__
33 ldrsw x6,[x5]
34 #else
35 ldr x6,[x5]
36 #endif
37 ldr w17,[x6,x5] // slot address + stored offset = &OPENSSL_armcap_P
38 tst w17,#ARMV7_NEON
39 b.ne ChaCha20_neon // tail-branch: no frame has been set up yet
40
41 .Lshort:
42 .inst 0xd503233f // paciasp
43 stp x29,x30,[sp,#-96]!
44 add x29,sp,#0 // x29 anchors the save area; the epilogue reloads through it
45
46 adr x5,.Lsigma
47 stp x19,x20,[sp,#16]
48 stp x21,x22,[sp,#32]
49 stp x23,x24,[sp,#48]
50 stp x25,x26,[sp,#64]
51 stp x27,x28,[sp,#80]
52 sub sp,sp,#64 // 64-byte scratch block, used only by the tail path
53
54 ldp x22,x23,[x5] // load sigma
55 ldp x24,x25,[x3] // load key
56 ldp x26,x27,[x3,#16]
// x30 (link register) doubles as a data register from here on; its saved
// copy is reloaded in the epilogue.
57 ldp x28,x30,[x4] // load counter
58 #ifdef __ARMEB__
// Big-endian build: swap the 32-bit halves of each packed pair so the
// lower-addressed word sits in bits 31:0, as the unpack code below expects.
59 ror x24,x24,#32
60 ror x25,x25,#32
61 ror x26,x26,#32
62 ror x27,x27,#32
63 ror x28,x28,#32
64 ror x30,x30,#32
65 #endif
66
// One pass of .Loop_outer produces one 64-byte keystream block.
67 .Loop_outer:
68 mov w5,w22 // unpack key block
69 lsr x6,x22,#32
70 mov w7,w23
71 lsr x8,x23,#32
72 mov w9,w24
73 lsr x10,x24,#32
74 mov w11,w25
75 lsr x12,x25,#32
76 mov w13,w26
77 lsr x14,x26,#32
78 mov w15,w27
79 lsr x16,x27,#32
80 mov w17,w28
81 lsr x19,x28,#32
82 mov w20,w30
83 lsr x21,x30,#32
84
85 mov x4,#10 // x4 is now the loop counter: 10 iterations of two rounds each
// The flags set by this subs are consumed much later, by "b.lo .Ltail" and
// "b.hi .Loop_outer". Nothing in between may modify NZCV; the loop uses only
// add/sub/eor/ror/cbnz for that reason.
86 subs x2,x2,#64
87 .Loop:
88 sub x4,x4,#1
// Column round. Per column (a,b,c,d): a+=b; d^=a; d<<<=16; c+=d; b^=c;
// b<<<=12; a+=b; d^=a; d<<<=8; c+=d; b^=c; b<<<=7. The left rotations are
// written as right rotations by 32-n (ror #16/#20/#24/#25).
89 add w5,w5,w9
90 add w6,w6,w10
91 add w7,w7,w11
92 add w8,w8,w12
93 eor w17,w17,w5
94 eor w19,w19,w6
95 eor w20,w20,w7
96 eor w21,w21,w8
97 ror w17,w17,#16
98 ror w19,w19,#16
99 ror w20,w20,#16
100 ror w21,w21,#16
101 add w13,w13,w17
102 add w14,w14,w19
103 add w15,w15,w20
104 add w16,w16,w21
105 eor w9,w9,w13
106 eor w10,w10,w14
107 eor w11,w11,w15
108 eor w12,w12,w16
109 ror w9,w9,#20
110 ror w10,w10,#20
111 ror w11,w11,#20
112 ror w12,w12,#20
113 add w5,w5,w9
114 add w6,w6,w10
115 add w7,w7,w11
116 add w8,w8,w12
117 eor w17,w17,w5
118 eor w19,w19,w6
119 eor w20,w20,w7
120 eor w21,w21,w8
121 ror w17,w17,#24
122 ror w19,w19,#24
123 ror w20,w20,#24
124 ror w21,w21,#24
125 add w13,w13,w17
126 add w14,w14,w19
127 add w15,w15,w20
128 add w16,w16,w21
129 eor w9,w9,w13
130 eor w10,w10,w14
131 eor w11,w11,w15
132 eor w12,w12,w16
133 ror w9,w9,#25
134 ror w10,w10,#25
135 ror w11,w11,#25
136 ror w12,w12,#25
// Diagonal round: same operation sequence, but rows b, c and d are taken
// one, two and three positions further along (e.g. w5,w10,w15,w21).
137 add w5,w5,w10
138 add w6,w6,w11
139 add w7,w7,w12
140 add w8,w8,w9
141 eor w21,w21,w5
142 eor w17,w17,w6
143 eor w19,w19,w7
144 eor w20,w20,w8
145 ror w21,w21,#16
146 ror w17,w17,#16
147 ror w19,w19,#16
148 ror w20,w20,#16
149 add w15,w15,w21
150 add w16,w16,w17
151 add w13,w13,w19
152 add w14,w14,w20
153 eor w10,w10,w15
154 eor w11,w11,w16
155 eor w12,w12,w13
156 eor w9,w9,w14
157 ror w10,w10,#20
158 ror w11,w11,#20
159 ror w12,w12,#20
160 ror w9,w9,#20
161 add w5,w5,w10
162 add w6,w6,w11
163 add w7,w7,w12
164 add w8,w8,w9
165 eor w21,w21,w5
166 eor w17,w17,w6
167 eor w19,w19,w7
168 eor w20,w20,w8
169 ror w21,w21,#24
170 ror w17,w17,#24
171 ror w19,w19,#24
172 ror w20,w20,#24
173 add w15,w15,w21
174 add w16,w16,w17
175 add w13,w13,w19
176 add w14,w14,w20
177 eor w10,w10,w15
178 eor w11,w11,w16
179 eor w12,w12,w13
180 eor w9,w9,w14
181 ror w10,w10,#25
182 ror w11,w11,#25
183 ror w12,w12,#25
184 ror w9,w9,#25
185 cbnz x4,.Loop
186
// Add the original input words back in. Even-numbered words use 32-bit adds
// (upper half zeroed); odd-numbered words use 64-bit adds with the packed
// register shifted down, so a carry can land in bit 32. The "lsl #32" in the
// pack step below shifts that bit out again.
187 add w5,w5,w22 // accumulate key block
188 add x6,x6,x22,lsr#32
189 add w7,w7,w23
190 add x8,x8,x23,lsr#32
191 add w9,w9,w24
192 add x10,x10,x24,lsr#32
193 add w11,w11,w25
194 add x12,x12,x25,lsr#32
195 add w13,w13,w26
196 add x14,x14,x26,lsr#32
197 add w15,w15,w27
198 add x16,x16,x27,lsr#32
199 add w17,w17,w28
200 add x19,x19,x28,lsr#32
201 add w20,w20,w30
202 add x21,x21,x30,lsr#32
203
// Flags still come from "subs x2,x2,#64": lo means fewer than 64 bytes were
// left, so this block is a partial one.
204 b.lo .Ltail
205
// Full block: merge word pairs into 64-bit values and XOR eight bytes at a
// time. The input loads reuse the odd-word registers once they are packed.
206 add x5,x5,x6,lsl#32 // pack
207 add x7,x7,x8,lsl#32
208 ldp x6,x8,[x1,#0] // load input
209 add x9,x9,x10,lsl#32
210 add x11,x11,x12,lsl#32
211 ldp x10,x12,[x1,#16]
212 add x13,x13,x14,lsl#32
213 add x15,x15,x16,lsl#32
214 ldp x14,x16,[x1,#32]
215 add x17,x17,x19,lsl#32
216 add x20,x20,x21,lsl#32
217 ldp x19,x21,[x1,#48]
218 add x1,x1,#64
219 #ifdef __ARMEB__
220 rev x5,x5
221 rev x7,x7
222 rev x9,x9
223 rev x11,x11
224 rev x13,x13
225 rev x15,x15
226 rev x17,x17
227 rev x20,x20
228 #endif
229 eor x5,x5,x6
230 eor x7,x7,x8
231 eor x9,x9,x10
232 eor x11,x11,x12
233 eor x13,x13,x14
234 eor x15,x15,x16
235 eor x17,x17,x19
236 eor x20,x20,x21
237
238 stp x5,x7,[x0,#0] // store output
// 64-bit add on the packed pair: a carry out of the low counter word
// propagates into the word held in the upper half of x28.
239 add x28,x28,#1 // increment counter
240 stp x9,x11,[x0,#16]
241 stp x13,x15,[x0,#32]
242 stp x17,x20,[x0,#48]
243 add x0,x0,#64
244
245 b.hi .Loop_outer // flags from the subs: more input remains
246
// Epilogue for an exact multiple of 64 bytes. The scratch area was never
// written on this path, so it is not wiped.
247 ldp x19,x20,[x29,#16]
248 add sp,sp,#64
249 ldp x21,x22,[x29,#32]
250 ldp x23,x24,[x29,#48]
251 ldp x25,x26,[x29,#64]
252 ldp x27,x28,[x29,#80]
253 ldp x29,x30,[sp],#96
254 .inst 0xd50323bf // autiasp
// .Labort is also the zero-length exit; that path skips paciasp and the
// frame, so it joins after autiasp.
255 .Labort:
256 ret
257
258 .align 4
259 .Ltail:
260 add x2,x2,#64 // undo the subs: x2 = bytes remaining (1..63)
// .Less_than_64 is also entered from ChaCha20_neon's tail with the same
// register state: keystream words in w5..w21, x2 = remaining byte count,
// x29 frame and 64-byte scratch at sp.
261 .Less_than_64:
// Set up a negative index that counts up to zero. x1 and x4 point one past
// the end of input and keystream. x0 is biased by -1 because the store in
// the loop uses the index after it has been incremented.
262 sub x0,x0,#1
263 add x1,x1,x2
264 add x0,x0,x2
265 add x4,sp,x2
266 neg x2,x2
267
268 add x5,x5,x6,lsl#32 // pack
269 add x7,x7,x8,lsl#32
270 add x9,x9,x10,lsl#32
271 add x11,x11,x12,lsl#32
272 add x13,x13,x14,lsl#32
273 add x15,x15,x16,lsl#32
274 add x17,x17,x19,lsl#32
275 add x20,x20,x21,lsl#32
276 #ifdef __ARMEB__
277 rev x5,x5
278 rev x7,x7
279 rev x9,x9
280 rev x11,x11
281 rev x13,x13
282 rev x15,x15
283 rev x17,x17
284 rev x20,x20
285 #endif
// Spill the whole keystream block so it can be consumed one byte at a time.
286 stp x5,x7,[sp,#0]
287 stp x9,x11,[sp,#16]
288 stp x13,x15,[sp,#32]
289 stp x17,x20,[sp,#48]
290
291 .Loop_tail:
292 ldrb w10,[x1,x2] // input byte
293 ldrb w11,[x4,x2] // keystream byte
294 add x2,x2,#1
295 eor w10,w10,w11
296 strb w10,[x0,x2]
297 cbnz x2,.Loop_tail
298
// Overwrite the spilled keystream with zeros before the stack is released.
299 stp xzr,xzr,[sp,#0]
300 stp xzr,xzr,[sp,#16]
301 stp xzr,xzr,[sp,#32]
302 stp xzr,xzr,[sp,#48]
303
304 ldp x19,x20,[x29,#16]
305 add sp,sp,#64
306 ldp x21,x22,[x29,#32]
307 ldp x23,x24,[x29,#48]
308 ldp x25,x26,[x29,#64]
309 ldp x27,x28,[x29,#80]
310 ldp x29,x30,[sp],#96
311 .inst 0xd50323bf // autiasp
312 ret
313 .size ChaCha20_ctr32,.-ChaCha20_ctr32
314
// ChaCha20_neon -- handles 256 bytes per outer iteration: one block in
// general-purpose registers interleaved with three blocks in NEON registers.
// In the visible source it is reached only by the branch from ChaCha20_ctr32
// (no .globl directive appears for it), so x0-x4 hold the same values as
// there: output, input, length, key pointer, counter pointer.
// Lengths of 512 bytes or more are forwarded to .L512_or_more_neon once the
// common 96-byte register save area has been written.
//
// Vector register roles in this routine:
//   v24          sigma row; v25/v26 key rows (loaded once, read-only here)
//   v27/v28/v29  counter rows of the three vector blocks (scalar counter
//                plus 1, 2 and 3)
//   v31          counter step: {1,0,0,0} during setup, {4,0,0,0} afterwards
//   v0-v3, v4-v7, v16-v19   working state of the three vector blocks, one
//                state row per register
//   v20-v23      temporaries (rotation staging, input data)
// Only v0-v7 and v16-v31 are written in this routine; no d8-d15 spill
// appears here.
// The scalar block uses the same register layout as ChaCha20_ctr32 and is
// the first of the four blocks in output order.
315 .type ChaCha20_neon,%function
316 .align 5
317 ChaCha20_neon:
318 .inst 0xd503233f // paciasp
319 stp x29,x30,[sp,#-96]!
320 add x29,sp,#0
321
322 adr x5,.Lsigma
323 stp x19,x20,[sp,#16]
324 stp x21,x22,[sp,#32]
325 stp x23,x24,[sp,#48]
326 stp x25,x26,[sp,#64]
327 stp x27,x28,[sp,#80]
328 cmp x2,#512
329 b.hs .L512_or_more_neon // shares the frame built above; x5 = .Lsigma
330
331 sub sp,sp,#64 // same 64-byte scratch layout as ChaCha20_ctr32
332
333 ldp x22,x23,[x5] // load sigma
334 ld1 {v24.4s},[x5],#16 // x5 now points at .Lone
335 ldp x24,x25,[x3] // load key
336 ldp x26,x27,[x3,#16]
337 ld1 {v25.4s,v26.4s},[x3]
338 ldp x28,x30,[x4] // load counter
339 ld1 {v27.4s},[x4]
340 ld1 {v31.4s},[x5] // v31 = {1,0,0,0}
341 #ifdef __ARMEB__
// Big-endian build: swap the 32-bit words within each 64-bit half of the
// sigma vector, and the halves of each packed scalar pair.
342 rev64 v24.4s,v24.4s
343 ror x24,x24,#32
344 ror x25,x25,#32
345 ror x26,x26,#32
346 ror x27,x27,#32
347 ror x28,x28,#32
348 ror x30,x30,#32
349 #endif
// The scalar block keeps the caller's counter; the vector blocks take the
// next three values.
350 add v27.4s,v27.4s,v31.4s // += 1
351 add v28.4s,v27.4s,v31.4s
352 add v29.4s,v28.4s,v31.4s
353 shl v31.4s,v31.4s,#2 // 1 -> 4
354
355 .Loop_outer_neon:
// Initialise the scalar state from the packed x22-x30 and the three vector
// states from v24-v29; scalar and vector instructions alternate.
356 mov w5,w22 // unpack key block
357 lsr x6,x22,#32
358 mov v0.16b,v24.16b
359 mov w7,w23
360 lsr x8,x23,#32
361 mov v4.16b,v24.16b
362 mov w9,w24
363 lsr x10,x24,#32
364 mov v16.16b,v24.16b
365 mov w11,w25
366 mov v1.16b,v25.16b
367 lsr x12,x25,#32
368 mov v5.16b,v25.16b
369 mov w13,w26
370 mov v17.16b,v25.16b
371 lsr x14,x26,#32
372 mov v3.16b,v27.16b
373 mov w15,w27
374 mov v7.16b,v28.16b
375 lsr x16,x27,#32
376 mov v19.16b,v29.16b
377 mov w17,w28
378 mov v2.16b,v26.16b
379 lsr x19,x28,#32
380 mov v6.16b,v26.16b
381 mov w20,w30
382 mov v18.16b,v26.16b
383 lsr x21,x30,#32
384
385 mov x4,#10 // 10 iterations of two rounds each
// The flags from this subs are consumed after the round loop by
// "b.lo .Ltail_neon" and "b.hi .Loop_outer_neon"; nothing in between
// modifies NZCV.
386 subs x2,x2,#256
// Round loop. The scalar instructions are the same column/diagonal sequence
// as in ChaCha20_ctr32. The vector instructions apply the same steps to
// whole rows: rev32 on .8h lanes rotates each word by 16, and each
// ushr #(32-n) / sli #n pair forms a left rotation by n (12, 8, 7) through
// a temporary in v20-v22.
387 .Loop_neon:
388 sub x4,x4,#1
389 add v0.4s,v0.4s,v1.4s
390 add w5,w5,w9
391 add v4.4s,v4.4s,v5.4s
392 add w6,w6,w10
393 add v16.4s,v16.4s,v17.4s
394 add w7,w7,w11
395 eor v3.16b,v3.16b,v0.16b
396 add w8,w8,w12
397 eor v7.16b,v7.16b,v4.16b
398 eor w17,w17,w5
399 eor v19.16b,v19.16b,v16.16b
400 eor w19,w19,w6
401 rev32 v3.8h,v3.8h
402 eor w20,w20,w7
403 rev32 v7.8h,v7.8h
404 eor w21,w21,w8
405 rev32 v19.8h,v19.8h
406 ror w17,w17,#16
407 add v2.4s,v2.4s,v3.4s
408 ror w19,w19,#16
409 add v6.4s,v6.4s,v7.4s
410 ror w20,w20,#16
411 add v18.4s,v18.4s,v19.4s
412 ror w21,w21,#16
413 eor v20.16b,v1.16b,v2.16b
414 add w13,w13,w17
415 eor v21.16b,v5.16b,v6.16b
416 add w14,w14,w19
417 eor v22.16b,v17.16b,v18.16b
418 add w15,w15,w20
419 ushr v1.4s,v20.4s,#20
420 add w16,w16,w21
421 ushr v5.4s,v21.4s,#20
422 eor w9,w9,w13
423 ushr v17.4s,v22.4s,#20
424 eor w10,w10,w14
425 sli v1.4s,v20.4s,#12
426 eor w11,w11,w15
427 sli v5.4s,v21.4s,#12
428 eor w12,w12,w16
429 sli v17.4s,v22.4s,#12
430 ror w9,w9,#20
431 add v0.4s,v0.4s,v1.4s
432 ror w10,w10,#20
433 add v4.4s,v4.4s,v5.4s
434 ror w11,w11,#20
435 add v16.4s,v16.4s,v17.4s
436 ror w12,w12,#20
437 eor v20.16b,v3.16b,v0.16b
438 add w5,w5,w9
439 eor v21.16b,v7.16b,v4.16b
440 add w6,w6,w10
441 eor v22.16b,v19.16b,v16.16b
442 add w7,w7,w11
443 ushr v3.4s,v20.4s,#24
444 add w8,w8,w12
445 ushr v7.4s,v21.4s,#24
446 eor w17,w17,w5
447 ushr v19.4s,v22.4s,#24
448 eor w19,w19,w6
449 sli v3.4s,v20.4s,#8
450 eor w20,w20,w7
451 sli v7.4s,v21.4s,#8
452 eor w21,w21,w8
453 sli v19.4s,v22.4s,#8
454 ror w17,w17,#24
455 add v2.4s,v2.4s,v3.4s
456 ror w19,w19,#24
457 add v6.4s,v6.4s,v7.4s
458 ror w20,w20,#24
459 add v18.4s,v18.4s,v19.4s
460 ror w21,w21,#24
461 eor v20.16b,v1.16b,v2.16b
462 add w13,w13,w17
463 eor v21.16b,v5.16b,v6.16b
464 add w14,w14,w19
465 eor v22.16b,v17.16b,v18.16b
466 add w15,w15,w20
467 ushr v1.4s,v20.4s,#25
468 add w16,w16,w21
469 ushr v5.4s,v21.4s,#25
470 eor w9,w9,w13
471 ushr v17.4s,v22.4s,#25
472 eor w10,w10,w14
473 sli v1.4s,v20.4s,#7
474 eor w11,w11,w15
475 sli v5.4s,v21.4s,#7
476 eor w12,w12,w16
477 sli v17.4s,v22.4s,#7
478 ror w9,w9,#25
// Rotate rows c, d and b by 8, 12 and 4 bytes so the next round's lane-wise
// operations act on the diagonals.
479 ext v2.16b,v2.16b,v2.16b,#8
480 ror w10,w10,#25
481 ext v6.16b,v6.16b,v6.16b,#8
482 ror w11,w11,#25
483 ext v18.16b,v18.16b,v18.16b,#8
484 ror w12,w12,#25
485 ext v3.16b,v3.16b,v3.16b,#12
486 ext v7.16b,v7.16b,v7.16b,#12
487 ext v19.16b,v19.16b,v19.16b,#12
488 ext v1.16b,v1.16b,v1.16b,#4
489 ext v5.16b,v5.16b,v5.16b,#4
490 ext v17.16b,v17.16b,v17.16b,#4
// Second (diagonal) round of this iteration.
491 add v0.4s,v0.4s,v1.4s
492 add w5,w5,w10
493 add v4.4s,v4.4s,v5.4s
494 add w6,w6,w11
495 add v16.4s,v16.4s,v17.4s
496 add w7,w7,w12
497 eor v3.16b,v3.16b,v0.16b
498 add w8,w8,w9
499 eor v7.16b,v7.16b,v4.16b
500 eor w21,w21,w5
501 eor v19.16b,v19.16b,v16.16b
502 eor w17,w17,w6
503 rev32 v3.8h,v3.8h
504 eor w19,w19,w7
505 rev32 v7.8h,v7.8h
506 eor w20,w20,w8
507 rev32 v19.8h,v19.8h
508 ror w21,w21,#16
509 add v2.4s,v2.4s,v3.4s
510 ror w17,w17,#16
511 add v6.4s,v6.4s,v7.4s
512 ror w19,w19,#16
513 add v18.4s,v18.4s,v19.4s
514 ror w20,w20,#16
515 eor v20.16b,v1.16b,v2.16b
516 add w15,w15,w21
517 eor v21.16b,v5.16b,v6.16b
518 add w16,w16,w17
519 eor v22.16b,v17.16b,v18.16b
520 add w13,w13,w19
521 ushr v1.4s,v20.4s,#20
522 add w14,w14,w20
523 ushr v5.4s,v21.4s,#20
524 eor w10,w10,w15
525 ushr v17.4s,v22.4s,#20
526 eor w11,w11,w16
527 sli v1.4s,v20.4s,#12
528 eor w12,w12,w13
529 sli v5.4s,v21.4s,#12
530 eor w9,w9,w14
531 sli v17.4s,v22.4s,#12
532 ror w10,w10,#20
533 add v0.4s,v0.4s,v1.4s
534 ror w11,w11,#20
535 add v4.4s,v4.4s,v5.4s
536 ror w12,w12,#20
537 add v16.4s,v16.4s,v17.4s
538 ror w9,w9,#20
539 eor v20.16b,v3.16b,v0.16b
540 add w5,w5,w10
541 eor v21.16b,v7.16b,v4.16b
542 add w6,w6,w11
543 eor v22.16b,v19.16b,v16.16b
544 add w7,w7,w12
545 ushr v3.4s,v20.4s,#24
546 add w8,w8,w9
547 ushr v7.4s,v21.4s,#24
548 eor w21,w21,w5
549 ushr v19.4s,v22.4s,#24
550 eor w17,w17,w6
551 sli v3.4s,v20.4s,#8
552 eor w19,w19,w7
553 sli v7.4s,v21.4s,#8
554 eor w20,w20,w8
555 sli v19.4s,v22.4s,#8
556 ror w21,w21,#24
557 add v2.4s,v2.4s,v3.4s
558 ror w17,w17,#24
559 add v6.4s,v6.4s,v7.4s
560 ror w19,w19,#24
561 add v18.4s,v18.4s,v19.4s
562 ror w20,w20,#24
563 eor v20.16b,v1.16b,v2.16b
564 add w15,w15,w21
565 eor v21.16b,v5.16b,v6.16b
566 add w16,w16,w17
567 eor v22.16b,v17.16b,v18.16b
568 add w13,w13,w19
569 ushr v1.4s,v20.4s,#25
570 add w14,w14,w20
571 ushr v5.4s,v21.4s,#25
572 eor w10,w10,w15
573 ushr v17.4s,v22.4s,#25
574 eor w11,w11,w16
575 sli v1.4s,v20.4s,#7
576 eor w12,w12,w13
577 sli v5.4s,v21.4s,#7
578 eor w9,w9,w14
579 sli v17.4s,v22.4s,#7
580 ror w10,w10,#25
// Undo the lane rotation (8, 4 and 12 bytes) so rows are back in column
// order for the next iteration.
581 ext v2.16b,v2.16b,v2.16b,#8
582 ror w11,w11,#25
583 ext v6.16b,v6.16b,v6.16b,#8
584 ror w12,w12,#25
585 ext v18.16b,v18.16b,v18.16b,#8
586 ror w9,w9,#25
587 ext v3.16b,v3.16b,v3.16b,#4
588 ext v7.16b,v7.16b,v7.16b,#4
589 ext v19.16b,v19.16b,v19.16b,#4
590 ext v1.16b,v1.16b,v1.16b,#12
591 ext v5.16b,v5.16b,v5.16b,#12
592 ext v17.16b,v17.16b,v17.16b,#12
593 cbnz x4,.Loop_neon
594
// Add the input block back into all four states. Each vector block adds its
// own counter row (v27, v28 or v29).
595 add w5,w5,w22 // accumulate key block
596 add v0.4s,v0.4s,v24.4s
597 add x6,x6,x22,lsr#32
598 add v4.4s,v4.4s,v24.4s
599 add w7,w7,w23
600 add v16.4s,v16.4s,v24.4s
601 add x8,x8,x23,lsr#32
602 add v2.4s,v2.4s,v26.4s
603 add w9,w9,w24
604 add v6.4s,v6.4s,v26.4s
605 add x10,x10,x24,lsr#32
606 add v18.4s,v18.4s,v26.4s
607 add w11,w11,w25
608 add v3.4s,v3.4s,v27.4s
609 add x12,x12,x25,lsr#32
610 add w13,w13,w26
611 add v7.4s,v7.4s,v28.4s
612 add x14,x14,x26,lsr#32
613 add w15,w15,w27
614 add v19.4s,v19.4s,v29.4s
615 add x16,x16,x27,lsr#32
616 add w17,w17,w28
617 add v1.4s,v1.4s,v25.4s
618 add x19,x19,x28,lsr#32
619 add w20,w20,w30
620 add v5.4s,v5.4s,v25.4s
621 add x21,x21,x30,lsr#32
622 add v17.4s,v17.4s,v25.4s
623
// Flags from "subs x2,x2,#256": lo means fewer than 256 bytes were left.
624 b.lo .Ltail_neon
625
// Full 256-byte group: the scalar block covers bytes 0-63, then the vector
// blocks v0-v3, v4-v7 and v16-v19 cover the following 64-byte pieces.
626 add x5,x5,x6,lsl#32 // pack
627 add x7,x7,x8,lsl#32
628 ldp x6,x8,[x1,#0] // load input
629 add x9,x9,x10,lsl#32
630 add x11,x11,x12,lsl#32
631 ldp x10,x12,[x1,#16]
632 add x13,x13,x14,lsl#32
633 add x15,x15,x16,lsl#32
634 ldp x14,x16,[x1,#32]
635 add x17,x17,x19,lsl#32
636 add x20,x20,x21,lsl#32
637 ldp x19,x21,[x1,#48]
638 add x1,x1,#64
639 #ifdef __ARMEB__
640 rev x5,x5
641 rev x7,x7
642 rev x9,x9
643 rev x11,x11
644 rev x13,x13
645 rev x15,x15
646 rev x17,x17
647 rev x20,x20
648 #endif
649 ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
650 eor x5,x5,x6
651 eor x7,x7,x8
652 eor x9,x9,x10
653 eor x11,x11,x12
654 eor x13,x13,x14
655 eor v0.16b,v0.16b,v20.16b
656 eor x15,x15,x16
657 eor v1.16b,v1.16b,v21.16b
658 eor x17,x17,x19
659 eor v2.16b,v2.16b,v22.16b
660 eor x20,x20,x21
661 eor v3.16b,v3.16b,v23.16b
662 ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
663
664 stp x5,x7,[x0,#0] // store output
// Scalar and vector counters all advance by 4, one step per block consumed.
665 add x28,x28,#4 // increment counter
666 stp x9,x11,[x0,#16]
667 add v27.4s,v27.4s,v31.4s // += 4
668 stp x13,x15,[x0,#32]
669 add v28.4s,v28.4s,v31.4s
670 stp x17,x20,[x0,#48]
671 add v29.4s,v29.4s,v31.4s
672 add x0,x0,#64
673
// v0-v3 are stored first and then reused as the input buffer for the last
// 64-byte piece.
674 st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
675 ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64
676
677 eor v4.16b,v4.16b,v20.16b
678 eor v5.16b,v5.16b,v21.16b
679 eor v6.16b,v6.16b,v22.16b
680 eor v7.16b,v7.16b,v23.16b
681 st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
682
683 eor v16.16b,v16.16b,v0.16b
684 eor v17.16b,v17.16b,v1.16b
685 eor v18.16b,v18.16b,v2.16b
686 eor v19.16b,v19.16b,v3.16b
687 st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
688
689 b.hi .Loop_outer_neon // flags from the subs: more input remains
690
// Exit for an exact multiple of 256 bytes; the scratch area was not written.
691 ldp x19,x20,[x29,#16]
692 add sp,sp,#64
693 ldp x21,x22,[x29,#32]
694 ldp x23,x24,[x29,#48]
695 ldp x25,x26,[x29,#64]
696 ldp x27,x28,[x29,#80]
697 ldp x29,x30,[sp],#96
698 .inst 0xd50323bf // autiasp
699 ret
700
// Fewer than 256 bytes remain and keystream for four blocks is available.
// Whole 64-byte pieces are consumed in block order; a final partial piece
// has its keystream block written to the scratch area and is finished
// byte-wise.
701 .Ltail_neon:
702 add x2,x2,#256 // undo the subs: x2 = bytes remaining (1..255)
703 cmp x2,#64
// Under 64 bytes: only the scalar block is needed, so reuse the tail code
// in ChaCha20_ctr32 (identical frame and scratch layout).
704 b.lo .Less_than_64
705
706 add x5,x5,x6,lsl#32 // pack
707 add x7,x7,x8,lsl#32
708 ldp x6,x8,[x1,#0] // load input
709 add x9,x9,x10,lsl#32
710 add x11,x11,x12,lsl#32
711 ldp x10,x12,[x1,#16]
712 add x13,x13,x14,lsl#32
713 add x15,x15,x16,lsl#32
714 ldp x14,x16,[x1,#32]
715 add x17,x17,x19,lsl#32
716 add x20,x20,x21,lsl#32
717 ldp x19,x21,[x1,#48]
718 add x1,x1,#64
719 #ifdef __ARMEB__
720 rev x5,x5
721 rev x7,x7
722 rev x9,x9
723 rev x11,x11
724 rev x13,x13
725 rev x15,x15
726 rev x17,x17
727 rev x20,x20
728 #endif
729 eor x5,x5,x6
730 eor x7,x7,x8
731 eor x9,x9,x10
732 eor x11,x11,x12
733 eor x13,x13,x14
734 eor x15,x15,x16
735 eor x17,x17,x19
736 eor x20,x20,x21
737
738 stp x5,x7,[x0,#0] // store output
739 add x28,x28,#4 // increment counter
740 stp x9,x11,[x0,#16]
741 stp x13,x15,[x0,#32]
742 stp x17,x20,[x0,#48]
743 add x0,x0,#64
// Each b.eq below tests the most recent "cmp x2,#64"; none of the
// instructions in between modifies the flags. eq means exactly 64 bytes
// were left before the piece just written, so nothing remains.
744 b.eq .Ldone_neon
745 sub x2,x2,#64
746 cmp x2,#64
747 b.lo .Less_than_128
748
749 ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
750 eor v0.16b,v0.16b,v20.16b
751 eor v1.16b,v1.16b,v21.16b
752 eor v2.16b,v2.16b,v22.16b
753 eor v3.16b,v3.16b,v23.16b
754 st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
755 b.eq .Ldone_neon
756 sub x2,x2,#64
757 cmp x2,#64
758 b.lo .Less_than_192
759
760 ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
761 eor v4.16b,v4.16b,v20.16b
762 eor v5.16b,v5.16b,v21.16b
763 eor v6.16b,v6.16b,v22.16b
764 eor v7.16b,v7.16b,v23.16b
765 st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
766 b.eq .Ldone_neon
767 sub x2,x2,#64
768
// Three whole pieces are done; the 1..63 leftover bytes use the fourth
// keystream block.
769 st1 {v16.16b,v17.16b,v18.16b,v19.16b},[sp]
770 b .Last_neon
771
772 .Less_than_128:
773 st1 {v0.16b,v1.16b,v2.16b,v3.16b},[sp] // leftover uses the second block
774 b .Last_neon
775 .Less_than_192:
776 st1 {v4.16b,v5.16b,v6.16b,v7.16b},[sp] // leftover uses the third block
777 b .Last_neon
778
779 .align 4
// Byte-wise finish: x2 = leftover count, keystream block at sp. Same
// negative-index scheme as .Loop_tail in ChaCha20_ctr32 (x0 biased by -1
// because the store uses the index after it has been incremented).
780 .Last_neon:
781 sub x0,x0,#1
782 add x1,x1,x2
783 add x0,x0,x2
784 add x4,sp,x2
785 neg x2,x2
786
787 .Loop_tail_neon:
788 ldrb w10,[x1,x2] // input byte
789 ldrb w11,[x4,x2] // keystream byte
790 add x2,x2,#1
791 eor w10,w10,w11
792 strb w10,[x0,x2]
793 cbnz x2,.Loop_tail_neon
794
// Overwrite the spilled keystream with zeros before the stack is released.
795 stp xzr,xzr,[sp,#0]
796 stp xzr,xzr,[sp,#16]
797 stp xzr,xzr,[sp,#32]
798 stp xzr,xzr,[sp,#48]
799
800 .Ldone_neon:
801 ldp x19,x20,[x29,#16]
802 add sp,sp,#64
803 ldp x21,x22,[x29,#32]
804 ldp x23,x24,[x29,#48]
805 ldp x25,x26,[x29,#64]
806 ldp x27,x28,[x29,#80]
807 ldp x29,x30,[sp],#96
808 .inst 0xd50323bf // autiasp
809 ret
810 .size ChaCha20_neon,.-ChaCha20_neon
811 .type ChaCha20_512_neon,%function
812 .align 5
813 ChaCha20_512_neon:
814 .inst 0xd503233f // paciasp
815 stp x29,x30,[sp,#-96]!
816 add x29,sp,#0
817
818 adr x5,.Lsigma
819 stp x19,x20,[sp,#16]
820 stp x21,x22,[sp,#32]
821 stp x23,x24,[sp,#48]
822 stp x25,x26,[sp,#64]
823 stp x27,x28,[sp,#80]
824
825 .L512_or_more_neon:
826 sub sp,sp,#128+64
827
828 ldp x22,x23,[x5] // load sigma
829 ld1 {v24.4s},[x5],#16
830 ldp x24,x25,[x3] // load key
831 ldp x26,x27,[x3,#16]
832 ld1 {v25.4s,v26.4s},[x3]
833 ldp x28,x30,[x4] // load counter
834 ld1 {v27.4s},[x4]
835 ld1 {v31.4s},[x5]
836 #ifdef __ARMEB__
837 rev64 v24.4s,v24.4s
838 ror x24,x24,#32
839 ror x25,x25,#32
840 ror x26,x26,#32
841 ror x27,x27,#32
842 ror x28,x28,#32
843 ror x30,x30,#32
844 #endif
845 add v27.4s,v27.4s,v31.4s // += 1
846 stp q24,q25,[sp,#0] // off-load key block, invariant part
847 add v27.4s,v27.4s,v31.4s // not typo
848 str q26,[sp,#32]
849 add v28.4s,v27.4s,v31.4s
850 add v29.4s,v28.4s,v31.4s
851 add v30.4s,v29.4s,v31.4s
852 shl v31.4s,v31.4s,#2 // 1 -> 4
853
854 stp d8,d9,[sp,#128+0] // meet ABI requirements
855 stp d10,d11,[sp,#128+16]
856 stp d12,d13,[sp,#128+32]
857 stp d14,d15,[sp,#128+48]
858
859 sub x2,x2,#512 // not typo
860
861 .Loop_outer_512_neon:
862 mov v0.16b,v24.16b
863 mov v4.16b,v24.16b
864 mov v8.16b,v24.16b
865 mov v12.16b,v24.16b
866 mov v16.16b,v24.16b
867 mov v20.16b,v24.16b
868 mov v1.16b,v25.16b
869 mov w5,w22 // unpack key block
870 mov v5.16b,v25.16b
871 lsr x6,x22,#32
872 mov v9.16b,v25.16b
873 mov w7,w23
874 mov v13.16b,v25.16b
875 lsr x8,x23,#32
876 mov v17.16b,v25.16b
877 mov w9,w24
878 mov v21.16b,v25.16b
879 lsr x10,x24,#32
880 mov v3.16b,v27.16b
881 mov w11,w25
882 mov v7.16b,v28.16b
883 lsr x12,x25,#32
884 mov v11.16b,v29.16b
885 mov w13,w26
886 mov v15.16b,v30.16b
887 lsr x14,x26,#32
888 mov v2.16b,v26.16b
889 mov w15,w27
890 mov v6.16b,v26.16b
891 lsr x16,x27,#32
892 add v19.4s,v3.4s,v31.4s // +4
893 mov w17,w28
894 add v23.4s,v7.4s,v31.4s // +4
895 lsr x19,x28,#32
896 mov v10.16b,v26.16b
897 mov w20,w30
898 mov v14.16b,v26.16b
899 lsr x21,x30,#32
900 mov v18.16b,v26.16b
901 stp q27,q28,[sp,#48] // off-load key block, variable part
902 mov v22.16b,v26.16b
903 str q29,[sp,#80]
904
905 mov x4,#5
906 subs x2,x2,#512
907 .Loop_upper_neon:
908 sub x4,x4,#1
909 add v0.4s,v0.4s,v1.4s
910 add w5,w5,w9
911 add v4.4s,v4.4s,v5.4s
912 add w6,w6,w10
913 add v8.4s,v8.4s,v9.4s
914 add w7,w7,w11
915 add v12.4s,v12.4s,v13.4s
916 add w8,w8,w12
917 add v16.4s,v16.4s,v17.4s
918 eor w17,w17,w5
919 add v20.4s,v20.4s,v21.4s
920 eor w19,w19,w6
921 eor v3.16b,v3.16b,v0.16b
922 eor w20,w20,w7
923 eor v7.16b,v7.16b,v4.16b
924 eor w21,w21,w8
925 eor v11.16b,v11.16b,v8.16b
926 ror w17,w17,#16
927 eor v15.16b,v15.16b,v12.16b
928 ror w19,w19,#16
929 eor v19.16b,v19.16b,v16.16b
930 ror w20,w20,#16
931 eor v23.16b,v23.16b,v20.16b
932 ror w21,w21,#16
933 rev32 v3.8h,v3.8h
934 add w13,w13,w17
935 rev32 v7.8h,v7.8h
936 add w14,w14,w19
937 rev32 v11.8h,v11.8h
938 add w15,w15,w20
939 rev32 v15.8h,v15.8h
940 add w16,w16,w21
941 rev32 v19.8h,v19.8h
942 eor w9,w9,w13
943 rev32 v23.8h,v23.8h
944 eor w10,w10,w14
945 add v2.4s,v2.4s,v3.4s
946 eor w11,w11,w15
947 add v6.4s,v6.4s,v7.4s
948 eor w12,w12,w16
949 add v10.4s,v10.4s,v11.4s
950 ror w9,w9,#20
951 add v14.4s,v14.4s,v15.4s
952 ror w10,w10,#20
953 add v18.4s,v18.4s,v19.4s
954 ror w11,w11,#20
955 add v22.4s,v22.4s,v23.4s
956 ror w12,w12,#20
957 eor v24.16b,v1.16b,v2.16b
958 add w5,w5,w9
959 eor v25.16b,v5.16b,v6.16b
960 add w6,w6,w10
961 eor v26.16b,v9.16b,v10.16b
962 add w7,w7,w11
963 eor v27.16b,v13.16b,v14.16b
964 add w8,w8,w12
965 eor v28.16b,v17.16b,v18.16b
966 eor w17,w17,w5
967 eor v29.16b,v21.16b,v22.16b
968 eor w19,w19,w6
969 ushr v1.4s,v24.4s,#20
970 eor w20,w20,w7
971 ushr v5.4s,v25.4s,#20
972 eor w21,w21,w8
973 ushr v9.4s,v26.4s,#20
974 ror w17,w17,#24
975 ushr v13.4s,v27.4s,#20
976 ror w19,w19,#24
977 ushr v17.4s,v28.4s,#20
978 ror w20,w20,#24
979 ushr v21.4s,v29.4s,#20
980 ror w21,w21,#24
981 sli v1.4s,v24.4s,#12
982 add w13,w13,w17
983 sli v5.4s,v25.4s,#12
984 add w14,w14,w19
985 sli v9.4s,v26.4s,#12
986 add w15,w15,w20
987 sli v13.4s,v27.4s,#12
988 add w16,w16,w21
989 sli v17.4s,v28.4s,#12
990 eor w9,w9,w13
991 sli v21.4s,v29.4s,#12
992 eor w10,w10,w14
993 add v0.4s,v0.4s,v1.4s
994 eor w11,w11,w15
995 add v4.4s,v4.4s,v5.4s
996 eor w12,w12,w16
997 add v8.4s,v8.4s,v9.4s
998 ror w9,w9,#25
999 add v12.4s,v12.4s,v13.4s
1000 ror w10,w10,#25
1001 add v16.4s,v16.4s,v17.4s
1002 ror w11,w11,#25
1003 add v20.4s,v20.4s,v21.4s
1004 ror w12,w12,#25
1005 eor v24.16b,v3.16b,v0.16b
1006 add w5,w5,w10
1007 eor v25.16b,v7.16b,v4.16b
1008 add w6,w6,w11
1009 eor v26.16b,v11.16b,v8.16b
1010 add w7,w7,w12
1011 eor v27.16b,v15.16b,v12.16b
1012 add w8,w8,w9
1013 eor v28.16b,v19.16b,v16.16b
1014 eor w21,w21,w5
1015 eor v29.16b,v23.16b,v20.16b
1016 eor w17,w17,w6
1017 ushr v3.4s,v24.4s,#24
1018 eor w19,w19,w7
1019 ushr v7.4s,v25.4s,#24
1020 eor w20,w20,w8
1021 ushr v11.4s,v26.4s,#24
1022 ror w21,w21,#16
1023 ushr v15.4s,v27.4s,#24
1024 ror w17,w17,#16
1025 ushr v19.4s,v28.4s,#24
1026 ror w19,w19,#16
1027 ushr v23.4s,v29.4s,#24
1028 ror w20,w20,#16
1029 sli v3.4s,v24.4s,#8
1030 add w15,w15,w21
1031 sli v7.4s,v25.4s,#8
1032 add w16,w16,w17
1033 sli v11.4s,v26.4s,#8
1034 add w13,w13,w19
1035 sli v15.4s,v27.4s,#8
1036 add w14,w14,w20
1037 sli v19.4s,v28.4s,#8
1038 eor w10,w10,w15
1039 sli v23.4s,v29.4s,#8
1040 eor w11,w11,w16
1041 add v2.4s,v2.4s,v3.4s
1042 eor w12,w12,w13
1043 add v6.4s,v6.4s,v7.4s
1044 eor w9,w9,w14
1045 add v10.4s,v10.4s,v11.4s
1046 ror w10,w10,#20
1047 add v14.4s,v14.4s,v15.4s
1048 ror w11,w11,#20
1049 add v18.4s,v18.4s,v19.4s
1050 ror w12,w12,#20
1051 add v22.4s,v22.4s,v23.4s
1052 ror w9,w9,#20
1053 eor v24.16b,v1.16b,v2.16b
1054 add w5,w5,w10
1055 eor v25.16b,v5.16b,v6.16b
1056 add w6,w6,w11
1057 eor v26.16b,v9.16b,v10.16b
1058 add w7,w7,w12
1059 eor v27.16b,v13.16b,v14.16b
1060 add w8,w8,w9
1061 eor v28.16b,v17.16b,v18.16b
1062 eor w21,w21,w5
1063 eor v29.16b,v21.16b,v22.16b
1064 eor w17,w17,w6
1065 ushr v1.4s,v24.4s,#25
1066 eor w19,w19,w7
1067 ushr v5.4s,v25.4s,#25
1068 eor w20,w20,w8
1069 ushr v9.4s,v26.4s,#25
1070 ror w21,w21,#24
1071 ushr v13.4s,v27.4s,#25
1072 ror w17,w17,#24
1073 ushr v17.4s,v28.4s,#25
1074 ror w19,w19,#24
1075 ushr v21.4s,v29.4s,#25
1076 ror w20,w20,#24
1077 sli v1.4s,v24.4s,#7
1078 add w15,w15,w21
1079 sli v5.4s,v25.4s,#7
1080 add w16,w16,w17
1081 sli v9.4s,v26.4s,#7
1082 add w13,w13,w19
1083 sli v13.4s,v27.4s,#7
1084 add w14,w14,w20
1085 sli v17.4s,v28.4s,#7
1086 eor w10,w10,w15
1087 sli v21.4s,v29.4s,#7
1088 eor w11,w11,w16
1089 ext v2.16b,v2.16b,v2.16b,#8
1090 eor w12,w12,w13
1091 ext v6.16b,v6.16b,v6.16b,#8
1092 eor w9,w9,w14
1093 ext v10.16b,v10.16b,v10.16b,#8
1094 ror w10,w10,#25
1095 ext v14.16b,v14.16b,v14.16b,#8
1096 ror w11,w11,#25
1097 ext v18.16b,v18.16b,v18.16b,#8
1098 ror w12,w12,#25
1099 ext v22.16b,v22.16b,v22.16b,#8
1100 ror w9,w9,#25
1101 ext v3.16b,v3.16b,v3.16b,#12
1102 ext v7.16b,v7.16b,v7.16b,#12
1103 ext v11.16b,v11.16b,v11.16b,#12
1104 ext v15.16b,v15.16b,v15.16b,#12
1105 ext v19.16b,v19.16b,v19.16b,#12
1106 ext v23.16b,v23.16b,v23.16b,#12
1107 ext v1.16b,v1.16b,v1.16b,#4
1108 ext v5.16b,v5.16b,v5.16b,#4
1109 ext v9.16b,v9.16b,v9.16b,#4
1110 ext v13.16b,v13.16b,v13.16b,#4
1111 ext v17.16b,v17.16b,v17.16b,#4
1112 ext v21.16b,v21.16b,v21.16b,#4
1113 add v0.4s,v0.4s,v1.4s
1114 add w5,w5,w9
1115 add v4.4s,v4.4s,v5.4s
1116 add w6,w6,w10
1117 add v8.4s,v8.4s,v9.4s
1118 add w7,w7,w11
1119 add v12.4s,v12.4s,v13.4s
1120 add w8,w8,w12
1121 add v16.4s,v16.4s,v17.4s
1122 eor w17,w17,w5
1123 add v20.4s,v20.4s,v21.4s
1124 eor w19,w19,w6
1125 eor v3.16b,v3.16b,v0.16b
1126 eor w20,w20,w7
1127 eor v7.16b,v7.16b,v4.16b
1128 eor w21,w21,w8
1129 eor v11.16b,v11.16b,v8.16b
1130 ror w17,w17,#16
1131 eor v15.16b,v15.16b,v12.16b
1132 ror w19,w19,#16
1133 eor v19.16b,v19.16b,v16.16b
1134 ror w20,w20,#16
1135 eor v23.16b,v23.16b,v20.16b
1136 ror w21,w21,#16
1137 rev32 v3.8h,v3.8h
1138 add w13,w13,w17
1139 rev32 v7.8h,v7.8h
1140 add w14,w14,w19
1141 rev32 v11.8h,v11.8h
1142 add w15,w15,w20
1143 rev32 v15.8h,v15.8h
1144 add w16,w16,w21
1145 rev32 v19.8h,v19.8h
1146 eor w9,w9,w13
1147 rev32 v23.8h,v23.8h
1148 eor w10,w10,w14
1149 add v2.4s,v2.4s,v3.4s
1150 eor w11,w11,w15
1151 add v6.4s,v6.4s,v7.4s
1152 eor w12,w12,w16
1153 add v10.4s,v10.4s,v11.4s
1154 ror w9,w9,#20
1155 add v14.4s,v14.4s,v15.4s
1156 ror w10,w10,#20
1157 add v18.4s,v18.4s,v19.4s
1158 ror w11,w11,#20
1159 add v22.4s,v22.4s,v23.4s
1160 ror w12,w12,#20
1161 eor v24.16b,v1.16b,v2.16b
1162 add w5,w5,w9
1163 eor v25.16b,v5.16b,v6.16b
1164 add w6,w6,w10
1165 eor v26.16b,v9.16b,v10.16b
1166 add w7,w7,w11
1167 eor v27.16b,v13.16b,v14.16b
1168 add w8,w8,w12
1169 eor v28.16b,v17.16b,v18.16b
1170 eor w17,w17,w5
1171 eor v29.16b,v21.16b,v22.16b
1172 eor w19,w19,w6
1173 ushr v1.4s,v24.4s,#20
1174 eor w20,w20,w7
1175 ushr v5.4s,v25.4s,#20
1176 eor w21,w21,w8
1177 ushr v9.4s,v26.4s,#20
1178 ror w17,w17,#24
1179 ushr v13.4s,v27.4s,#20
1180 ror w19,w19,#24
1181 ushr v17.4s,v28.4s,#20
1182 ror w20,w20,#24
1183 ushr v21.4s,v29.4s,#20
1184 ror w21,w21,#24
1185 sli v1.4s,v24.4s,#12
1186 add w13,w13,w17
1187 sli v5.4s,v25.4s,#12
1188 add w14,w14,w19
1189 sli v9.4s,v26.4s,#12
1190 add w15,w15,w20
1191 sli v13.4s,v27.4s,#12
1192 add w16,w16,w21
1193 sli v17.4s,v28.4s,#12
1194 eor w9,w9,w13
1195 sli v21.4s,v29.4s,#12
1196 eor w10,w10,w14
1197 add v0.4s,v0.4s,v1.4s
1198 eor w11,w11,w15
1199 add v4.4s,v4.4s,v5.4s
1200 eor w12,w12,w16
1201 add v8.4s,v8.4s,v9.4s
1202 ror w9,w9,#25
1203 add v12.4s,v12.4s,v13.4s
1204 ror w10,w10,#25
1205 add v16.4s,v16.4s,v17.4s
1206 ror w11,w11,#25
1207 add v20.4s,v20.4s,v21.4s
1208 ror w12,w12,#25
1209 eor v24.16b,v3.16b,v0.16b
1210 add w5,w5,w10
1211 eor v25.16b,v7.16b,v4.16b
1212 add w6,w6,w11
1213 eor v26.16b,v11.16b,v8.16b
1214 add w7,w7,w12
1215 eor v27.16b,v15.16b,v12.16b
1216 add w8,w8,w9
1217 eor v28.16b,v19.16b,v16.16b
1218 eor w21,w21,w5
1219 eor v29.16b,v23.16b,v20.16b
1220 eor w17,w17,w6
1221 ushr v3.4s,v24.4s,#24
1222 eor w19,w19,w7
1223 ushr v7.4s,v25.4s,#24
1224 eor w20,w20,w8
1225 ushr v11.4s,v26.4s,#24
1226 ror w21,w21,#16
1227 ushr v15.4s,v27.4s,#24
1228 ror w17,w17,#16
1229 ushr v19.4s,v28.4s,#24
1230 ror w19,w19,#16
1231 ushr v23.4s,v29.4s,#24
1232 ror w20,w20,#16
1233 sli v3.4s,v24.4s,#8
1234 add w15,w15,w21
1235 sli v7.4s,v25.4s,#8
1236 add w16,w16,w17
1237 sli v11.4s,v26.4s,#8
1238 add w13,w13,w19
1239 sli v15.4s,v27.4s,#8
1240 add w14,w14,w20
1241 sli v19.4s,v28.4s,#8
1242 eor w10,w10,w15
1243 sli v23.4s,v29.4s,#8
1244 eor w11,w11,w16
1245 add v2.4s,v2.4s,v3.4s
1246 eor w12,w12,w13
1247 add v6.4s,v6.4s,v7.4s
1248 eor w9,w9,w14
1249 add v10.4s,v10.4s,v11.4s
1250 ror w10,w10,#20
1251 add v14.4s,v14.4s,v15.4s
1252 ror w11,w11,#20
1253 add v18.4s,v18.4s,v19.4s
1254 ror w12,w12,#20
1255 add v22.4s,v22.4s,v23.4s
1256 ror w9,w9,#20
1257 eor v24.16b,v1.16b,v2.16b
1258 add w5,w5,w10
1259 eor v25.16b,v5.16b,v6.16b
1260 add w6,w6,w11
1261 eor v26.16b,v9.16b,v10.16b
1262 add w7,w7,w12
1263 eor v27.16b,v13.16b,v14.16b
1264 add w8,w8,w9
1265 eor v28.16b,v17.16b,v18.16b
1266 eor w21,w21,w5
1267 eor v29.16b,v21.16b,v22.16b
1268 eor w17,w17,w6
1269 ushr v1.4s,v24.4s,#25
1270 eor w19,w19,w7
1271 ushr v5.4s,v25.4s,#25
1272 eor w20,w20,w8
1273 ushr v9.4s,v26.4s,#25
1274 ror w21,w21,#24
1275 ushr v13.4s,v27.4s,#25
1276 ror w17,w17,#24
1277 ushr v17.4s,v28.4s,#25
1278 ror w19,w19,#24
1279 ushr v21.4s,v29.4s,#25
1280 ror w20,w20,#24
1281 sli v1.4s,v24.4s,#7
1282 add w15,w15,w21
1283 sli v5.4s,v25.4s,#7
1284 add w16,w16,w17
1285 sli v9.4s,v26.4s,#7
1286 add w13,w13,w19
1287 sli v13.4s,v27.4s,#7
1288 add w14,w14,w20
1289 sli v17.4s,v28.4s,#7
1290 eor w10,w10,w15
1291 sli v21.4s,v29.4s,#7
1292 eor w11,w11,w16
1293 ext v2.16b,v2.16b,v2.16b,#8
1294 eor w12,w12,w13
1295 ext v6.16b,v6.16b,v6.16b,#8
1296 eor w9,w9,w14
1297 ext v10.16b,v10.16b,v10.16b,#8
1298 ror w10,w10,#25
1299 ext v14.16b,v14.16b,v14.16b,#8
1300 ror w11,w11,#25
1301 ext v18.16b,v18.16b,v18.16b,#8
1302 ror w12,w12,#25
1303 ext v22.16b,v22.16b,v22.16b,#8
1304 ror w9,w9,#25
1305 ext v3.16b,v3.16b,v3.16b,#4
1306 ext v7.16b,v7.16b,v7.16b,#4
1307 ext v11.16b,v11.16b,v11.16b,#4
1308 ext v15.16b,v15.16b,v15.16b,#4
1309 ext v19.16b,v19.16b,v19.16b,#4
1310 ext v23.16b,v23.16b,v23.16b,#4
1311 ext v1.16b,v1.16b,v1.16b,#12
1312 ext v5.16b,v5.16b,v5.16b,#12
1313 ext v9.16b,v9.16b,v9.16b,#12
1314 ext v13.16b,v13.16b,v13.16b,#12
1315 ext v17.16b,v17.16b,v17.16b,#12
1316 ext v21.16b,v21.16b,v21.16b,#12
1317 cbnz x4,.Loop_upper_neon
1318
1319 add w5,w5,w22 // accumulate key block
1320 add x6,x6,x22,lsr#32
1321 add w7,w7,w23
1322 add x8,x8,x23,lsr#32
1323 add w9,w9,w24
1324 add x10,x10,x24,lsr#32
1325 add w11,w11,w25
1326 add x12,x12,x25,lsr#32
1327 add w13,w13,w26
1328 add x14,x14,x26,lsr#32
1329 add w15,w15,w27
1330 add x16,x16,x27,lsr#32
1331 add w17,w17,w28
1332 add x19,x19,x28,lsr#32
1333 add w20,w20,w30
1334 add x21,x21,x30,lsr#32
1335
1336 add x5,x5,x6,lsl#32 // pack
1337 add x7,x7,x8,lsl#32
1338 ldp x6,x8,[x1,#0] // load input
1339 add x9,x9,x10,lsl#32
1340 add x11,x11,x12,lsl#32
1341 ldp x10,x12,[x1,#16]
1342 add x13,x13,x14,lsl#32
1343 add x15,x15,x16,lsl#32
1344 ldp x14,x16,[x1,#32]
1345 add x17,x17,x19,lsl#32
1346 add x20,x20,x21,lsl#32
1347 ldp x19,x21,[x1,#48]
1348 add x1,x1,#64
1349 #ifdef __ARMEB__
1350 rev x5,x5
1351 rev x7,x7
1352 rev x9,x9
1353 rev x11,x11
1354 rev x13,x13
1355 rev x15,x15
1356 rev x17,x17
1357 rev x20,x20
1358 #endif
1359 eor x5,x5,x6
1360 eor x7,x7,x8
1361 eor x9,x9,x10
1362 eor x11,x11,x12
1363 eor x13,x13,x14
1364 eor x15,x15,x16
1365 eor x17,x17,x19
1366 eor x20,x20,x21
1367
1368 stp x5,x7,[x0,#0] // store output
1369 add x28,x28,#1 // increment counter
1370 mov w5,w22 // unpack key block
1371 lsr x6,x22,#32
1372 stp x9,x11,[x0,#16]
1373 mov w7,w23
1374 lsr x8,x23,#32
1375 stp x13,x15,[x0,#32]
1376 mov w9,w24
1377 lsr x10,x24,#32
1378 stp x17,x20,[x0,#48]
1379 add x0,x0,#64
1380 mov w11,w25
1381 lsr x12,x25,#32
1382 mov w13,w26
1383 lsr x14,x26,#32
1384 mov w15,w27
1385 lsr x16,x27,#32
1386 mov w17,w28
1387 lsr x19,x28,#32
1388 mov w20,w30
1389 lsr x21,x30,#32
1390
1391 mov x4,#5
1392 .Loop_lower_neon:
1393 sub x4,x4,#1
1394 add v0.4s,v0.4s,v1.4s
1395 add w5,w5,w9
1396 add v4.4s,v4.4s,v5.4s
1397 add w6,w6,w10
1398 add v8.4s,v8.4s,v9.4s
1399 add w7,w7,w11
1400 add v12.4s,v12.4s,v13.4s
1401 add w8,w8,w12
1402 add v16.4s,v16.4s,v17.4s
1403 eor w17,w17,w5
1404 add v20.4s,v20.4s,v21.4s
1405 eor w19,w19,w6
1406 eor v3.16b,v3.16b,v0.16b
1407 eor w20,w20,w7
1408 eor v7.16b,v7.16b,v4.16b
1409 eor w21,w21,w8
1410 eor v11.16b,v11.16b,v8.16b
1411 ror w17,w17,#16
1412 eor v15.16b,v15.16b,v12.16b
1413 ror w19,w19,#16
1414 eor v19.16b,v19.16b,v16.16b
1415 ror w20,w20,#16
1416 eor v23.16b,v23.16b,v20.16b
1417 ror w21,w21,#16
1418 rev32 v3.8h,v3.8h
1419 add w13,w13,w17
1420 rev32 v7.8h,v7.8h
1421 add w14,w14,w19
1422 rev32 v11.8h,v11.8h
1423 add w15,w15,w20
1424 rev32 v15.8h,v15.8h
1425 add w16,w16,w21
1426 rev32 v19.8h,v19.8h
1427 eor w9,w9,w13
1428 rev32 v23.8h,v23.8h
1429 eor w10,w10,w14
1430 add v2.4s,v2.4s,v3.4s
1431 eor w11,w11,w15
1432 add v6.4s,v6.4s,v7.4s
1433 eor w12,w12,w16
1434 add v10.4s,v10.4s,v11.4s
1435 ror w9,w9,#20
1436 add v14.4s,v14.4s,v15.4s
1437 ror w10,w10,#20
1438 add v18.4s,v18.4s,v19.4s
1439 ror w11,w11,#20
1440 add v22.4s,v22.4s,v23.4s
1441 ror w12,w12,#20
1442 eor v24.16b,v1.16b,v2.16b
1443 add w5,w5,w9
1444 eor v25.16b,v5.16b,v6.16b
1445 add w6,w6,w10
1446 eor v26.16b,v9.16b,v10.16b
1447 add w7,w7,w11
1448 eor v27.16b,v13.16b,v14.16b
1449 add w8,w8,w12
1450 eor v28.16b,v17.16b,v18.16b
1451 eor w17,w17,w5
1452 eor v29.16b,v21.16b,v22.16b
1453 eor w19,w19,w6
1454 ushr v1.4s,v24.4s,#20
1455 eor w20,w20,w7
1456 ushr v5.4s,v25.4s,#20
1457 eor w21,w21,w8
1458 ushr v9.4s,v26.4s,#20
1459 ror w17,w17,#24
1460 ushr v13.4s,v27.4s,#20
1461 ror w19,w19,#24
1462 ushr v17.4s,v28.4s,#20
1463 ror w20,w20,#24
1464 ushr v21.4s,v29.4s,#20
1465 ror w21,w21,#24
1466 sli v1.4s,v24.4s,#12
1467 add w13,w13,w17
1468 sli v5.4s,v25.4s,#12
1469 add w14,w14,w19
1470 sli v9.4s,v26.4s,#12
1471 add w15,w15,w20
1472 sli v13.4s,v27.4s,#12
1473 add w16,w16,w21
1474 sli v17.4s,v28.4s,#12
1475 eor w9,w9,w13
1476 sli v21.4s,v29.4s,#12
1477 eor w10,w10,w14
1478 add v0.4s,v0.4s,v1.4s
1479 eor w11,w11,w15
1480 add v4.4s,v4.4s,v5.4s
1481 eor w12,w12,w16
1482 add v8.4s,v8.4s,v9.4s
1483 ror w9,w9,#25
1484 add v12.4s,v12.4s,v13.4s
1485 ror w10,w10,#25
1486 add v16.4s,v16.4s,v17.4s
1487 ror w11,w11,#25
1488 add v20.4s,v20.4s,v21.4s
1489 ror w12,w12,#25
1490 eor v24.16b,v3.16b,v0.16b
1491 add w5,w5,w10
1492 eor v25.16b,v7.16b,v4.16b
1493 add w6,w6,w11
1494 eor v26.16b,v11.16b,v8.16b
1495 add w7,w7,w12
1496 eor v27.16b,v15.16b,v12.16b
1497 add w8,w8,w9
1498 eor v28.16b,v19.16b,v16.16b
1499 eor w21,w21,w5
1500 eor v29.16b,v23.16b,v20.16b
1501 eor w17,w17,w6
1502 ushr v3.4s,v24.4s,#24
1503 eor w19,w19,w7
1504 ushr v7.4s,v25.4s,#24
1505 eor w20,w20,w8
1506 ushr v11.4s,v26.4s,#24
1507 ror w21,w21,#16
1508 ushr v15.4s,v27.4s,#24
1509 ror w17,w17,#16
1510 ushr v19.4s,v28.4s,#24
1511 ror w19,w19,#16
1512 ushr v23.4s,v29.4s,#24
1513 ror w20,w20,#16
1514 sli v3.4s,v24.4s,#8
1515 add w15,w15,w21
1516 sli v7.4s,v25.4s,#8
1517 add w16,w16,w17
1518 sli v11.4s,v26.4s,#8
1519 add w13,w13,w19
1520 sli v15.4s,v27.4s,#8
1521 add w14,w14,w20
1522 sli v19.4s,v28.4s,#8
1523 eor w10,w10,w15
1524 sli v23.4s,v29.4s,#8
1525 eor w11,w11,w16
1526 add v2.4s,v2.4s,v3.4s
1527 eor w12,w12,w13
1528 add v6.4s,v6.4s,v7.4s
1529 eor w9,w9,w14
1530 add v10.4s,v10.4s,v11.4s
1531 ror w10,w10,#20
1532 add v14.4s,v14.4s,v15.4s
1533 ror w11,w11,#20
1534 add v18.4s,v18.4s,v19.4s
1535 ror w12,w12,#20
1536 add v22.4s,v22.4s,v23.4s
1537 ror w9,w9,#20
1538 eor v24.16b,v1.16b,v2.16b
1539 add w5,w5,w10
1540 eor v25.16b,v5.16b,v6.16b
1541 add w6,w6,w11
1542 eor v26.16b,v9.16b,v10.16b
1543 add w7,w7,w12
1544 eor v27.16b,v13.16b,v14.16b
1545 add w8,w8,w9
1546 eor v28.16b,v17.16b,v18.16b
1547 eor w21,w21,w5
1548 eor v29.16b,v21.16b,v22.16b
1549 eor w17,w17,w6
1550 ushr v1.4s,v24.4s,#25
1551 eor w19,w19,w7
1552 ushr v5.4s,v25.4s,#25
1553 eor w20,w20,w8
1554 ushr v9.4s,v26.4s,#25
1555 ror w21,w21,#24
1556 ushr v13.4s,v27.4s,#25
1557 ror w17,w17,#24
1558 ushr v17.4s,v28.4s,#25
1559 ror w19,w19,#24
1560 ushr v21.4s,v29.4s,#25
1561 ror w20,w20,#24
1562 sli v1.4s,v24.4s,#7
1563 add w15,w15,w21
1564 sli v5.4s,v25.4s,#7
1565 add w16,w16,w17
1566 sli v9.4s,v26.4s,#7
1567 add w13,w13,w19
1568 sli v13.4s,v27.4s,#7
1569 add w14,w14,w20
1570 sli v17.4s,v28.4s,#7
1571 eor w10,w10,w15
1572 sli v21.4s,v29.4s,#7
1573 eor w11,w11,w16
1574 ext v2.16b,v2.16b,v2.16b,#8
1575 eor w12,w12,w13
1576 ext v6.16b,v6.16b,v6.16b,#8
1577 eor w9,w9,w14
1578 ext v10.16b,v10.16b,v10.16b,#8
1579 ror w10,w10,#25
1580 ext v14.16b,v14.16b,v14.16b,#8
1581 ror w11,w11,#25
1582 ext v18.16b,v18.16b,v18.16b,#8
1583 ror w12,w12,#25
1584 ext v22.16b,v22.16b,v22.16b,#8
1585 ror w9,w9,#25
1586 ext v3.16b,v3.16b,v3.16b,#12
1587 ext v7.16b,v7.16b,v7.16b,#12
1588 ext v11.16b,v11.16b,v11.16b,#12
1589 ext v15.16b,v15.16b,v15.16b,#12
1590 ext v19.16b,v19.16b,v19.16b,#12
1591 ext v23.16b,v23.16b,v23.16b,#12
1592 ext v1.16b,v1.16b,v1.16b,#4
1593 ext v5.16b,v5.16b,v5.16b,#4
1594 ext v9.16b,v9.16b,v9.16b,#4
1595 ext v13.16b,v13.16b,v13.16b,#4
1596 ext v17.16b,v17.16b,v17.16b,#4
1597 ext v21.16b,v21.16b,v21.16b,#4
1598 add v0.4s,v0.4s,v1.4s
1599 add w5,w5,w9
1600 add v4.4s,v4.4s,v5.4s
1601 add w6,w6,w10
1602 add v8.4s,v8.4s,v9.4s
1603 add w7,w7,w11
1604 add v12.4s,v12.4s,v13.4s
1605 add w8,w8,w12
1606 add v16.4s,v16.4s,v17.4s
1607 eor w17,w17,w5
1608 add v20.4s,v20.4s,v21.4s
1609 eor w19,w19,w6
1610 eor v3.16b,v3.16b,v0.16b
1611 eor w20,w20,w7
1612 eor v7.16b,v7.16b,v4.16b
1613 eor w21,w21,w8
1614 eor v11.16b,v11.16b,v8.16b
1615 ror w17,w17,#16
1616 eor v15.16b,v15.16b,v12.16b
1617 ror w19,w19,#16
1618 eor v19.16b,v19.16b,v16.16b
1619 ror w20,w20,#16
1620 eor v23.16b,v23.16b,v20.16b
1621 ror w21,w21,#16
1622 rev32 v3.8h,v3.8h
1623 add w13,w13,w17
1624 rev32 v7.8h,v7.8h
1625 add w14,w14,w19
1626 rev32 v11.8h,v11.8h
1627 add w15,w15,w20
1628 rev32 v15.8h,v15.8h
1629 add w16,w16,w21
1630 rev32 v19.8h,v19.8h
1631 eor w9,w9,w13
1632 rev32 v23.8h,v23.8h
1633 eor w10,w10,w14
1634 add v2.4s,v2.4s,v3.4s
1635 eor w11,w11,w15
1636 add v6.4s,v6.4s,v7.4s
1637 eor w12,w12,w16
1638 add v10.4s,v10.4s,v11.4s
1639 ror w9,w9,#20
1640 add v14.4s,v14.4s,v15.4s
1641 ror w10,w10,#20
1642 add v18.4s,v18.4s,v19.4s
1643 ror w11,w11,#20
1644 add v22.4s,v22.4s,v23.4s
1645 ror w12,w12,#20
1646 eor v24.16b,v1.16b,v2.16b
1647 add w5,w5,w9
1648 eor v25.16b,v5.16b,v6.16b
1649 add w6,w6,w10
1650 eor v26.16b,v9.16b,v10.16b
1651 add w7,w7,w11
1652 eor v27.16b,v13.16b,v14.16b
1653 add w8,w8,w12
1654 eor v28.16b,v17.16b,v18.16b
1655 eor w17,w17,w5
1656 eor v29.16b,v21.16b,v22.16b
1657 eor w19,w19,w6
1658 ushr v1.4s,v24.4s,#20
1659 eor w20,w20,w7
1660 ushr v5.4s,v25.4s,#20
1661 eor w21,w21,w8
1662 ushr v9.4s,v26.4s,#20
1663 ror w17,w17,#24
1664 ushr v13.4s,v27.4s,#20
1665 ror w19,w19,#24
1666 ushr v17.4s,v28.4s,#20
1667 ror w20,w20,#24
1668 ushr v21.4s,v29.4s,#20
1669 ror w21,w21,#24
1670 sli v1.4s,v24.4s,#12
1671 add w13,w13,w17
1672 sli v5.4s,v25.4s,#12
1673 add w14,w14,w19
1674 sli v9.4s,v26.4s,#12
1675 add w15,w15,w20
1676 sli v13.4s,v27.4s,#12
1677 add w16,w16,w21
1678 sli v17.4s,v28.4s,#12
1679 eor w9,w9,w13
1680 sli v21.4s,v29.4s,#12
1681 eor w10,w10,w14
1682 add v0.4s,v0.4s,v1.4s
1683 eor w11,w11,w15
1684 add v4.4s,v4.4s,v5.4s
1685 eor w12,w12,w16
1686 add v8.4s,v8.4s,v9.4s
1687 ror w9,w9,#25
1688 add v12.4s,v12.4s,v13.4s
1689 ror w10,w10,#25
1690 add v16.4s,v16.4s,v17.4s
1691 ror w11,w11,#25
1692 add v20.4s,v20.4s,v21.4s
1693 ror w12,w12,#25
1694 eor v24.16b,v3.16b,v0.16b
1695 add w5,w5,w10
1696 eor v25.16b,v7.16b,v4.16b
1697 add w6,w6,w11
1698 eor v26.16b,v11.16b,v8.16b
1699 add w7,w7,w12
1700 eor v27.16b,v15.16b,v12.16b
1701 add w8,w8,w9
1702 eor v28.16b,v19.16b,v16.16b
1703 eor w21,w21,w5
1704 eor v29.16b,v23.16b,v20.16b
1705 eor w17,w17,w6
1706 ushr v3.4s,v24.4s,#24
1707 eor w19,w19,w7
1708 ushr v7.4s,v25.4s,#24
1709 eor w20,w20,w8
1710 ushr v11.4s,v26.4s,#24
1711 ror w21,w21,#16
1712 ushr v15.4s,v27.4s,#24
1713 ror w17,w17,#16
1714 ushr v19.4s,v28.4s,#24
1715 ror w19,w19,#16
1716 ushr v23.4s,v29.4s,#24
1717 ror w20,w20,#16
1718 sli v3.4s,v24.4s,#8
1719 add w15,w15,w21
1720 sli v7.4s,v25.4s,#8
1721 add w16,w16,w17
1722 sli v11.4s,v26.4s,#8
1723 add w13,w13,w19
1724 sli v15.4s,v27.4s,#8
1725 add w14,w14,w20
1726 sli v19.4s,v28.4s,#8
1727 eor w10,w10,w15
1728 sli v23.4s,v29.4s,#8
1729 eor w11,w11,w16
1730 add v2.4s,v2.4s,v3.4s
1731 eor w12,w12,w13
1732 add v6.4s,v6.4s,v7.4s
1733 eor w9,w9,w14
1734 add v10.4s,v10.4s,v11.4s
1735 ror w10,w10,#20
1736 add v14.4s,v14.4s,v15.4s
1737 ror w11,w11,#20
1738 add v18.4s,v18.4s,v19.4s
1739 ror w12,w12,#20
1740 add v22.4s,v22.4s,v23.4s
1741 ror w9,w9,#20
1742 eor v24.16b,v1.16b,v2.16b
1743 add w5,w5,w10
1744 eor v25.16b,v5.16b,v6.16b
1745 add w6,w6,w11
1746 eor v26.16b,v9.16b,v10.16b
1747 add w7,w7,w12
1748 eor v27.16b,v13.16b,v14.16b
1749 add w8,w8,w9
1750 eor v28.16b,v17.16b,v18.16b
1751 eor w21,w21,w5
1752 eor v29.16b,v21.16b,v22.16b
1753 eor w17,w17,w6
1754 ushr v1.4s,v24.4s,#25
1755 eor w19,w19,w7
1756 ushr v5.4s,v25.4s,#25
1757 eor w20,w20,w8
1758 ushr v9.4s,v26.4s,#25
1759 ror w21,w21,#24
1760 ushr v13.4s,v27.4s,#25
1761 ror w17,w17,#24
1762 ushr v17.4s,v28.4s,#25
1763 ror w19,w19,#24
1764 ushr v21.4s,v29.4s,#25
1765 ror w20,w20,#24
1766 sli v1.4s,v24.4s,#7
1767 add w15,w15,w21
1768 sli v5.4s,v25.4s,#7
1769 add w16,w16,w17
1770 sli v9.4s,v26.4s,#7
1771 add w13,w13,w19
1772 sli v13.4s,v27.4s,#7
1773 add w14,w14,w20
1774 sli v17.4s,v28.4s,#7
1775 eor w10,w10,w15
1776 sli v21.4s,v29.4s,#7
1777 eor w11,w11,w16
1778 ext v2.16b,v2.16b,v2.16b,#8
1779 eor w12,w12,w13
1780 ext v6.16b,v6.16b,v6.16b,#8
1781 eor w9,w9,w14
1782 ext v10.16b,v10.16b,v10.16b,#8
1783 ror w10,w10,#25
1784 ext v14.16b,v14.16b,v14.16b,#8
1785 ror w11,w11,#25
1786 ext v18.16b,v18.16b,v18.16b,#8
1787 ror w12,w12,#25
1788 ext v22.16b,v22.16b,v22.16b,#8
1789 ror w9,w9,#25
1790 ext v3.16b,v3.16b,v3.16b,#4
1791 ext v7.16b,v7.16b,v7.16b,#4
1792 ext v11.16b,v11.16b,v11.16b,#4
1793 ext v15.16b,v15.16b,v15.16b,#4
1794 ext v19.16b,v19.16b,v19.16b,#4
1795 ext v23.16b,v23.16b,v23.16b,#4
1796 ext v1.16b,v1.16b,v1.16b,#12
1797 ext v5.16b,v5.16b,v5.16b,#12
1798 ext v9.16b,v9.16b,v9.16b,#12
1799 ext v13.16b,v13.16b,v13.16b,#12
1800 ext v17.16b,v17.16b,v17.16b,#12
1801 ext v21.16b,v21.16b,v21.16b,#12
1802 cbnz x4,.Loop_lower_neon
1803
1804 add w5,w5,w22 // accumulate key block
1805 ldp q24,q25,[sp,#0]
1806 add x6,x6,x22,lsr#32
1807 ldp q26,q27,[sp,#32]
1808 add w7,w7,w23
1809 ldp q28,q29,[sp,#64]
1810 add x8,x8,x23,lsr#32
1811 add v0.4s,v0.4s,v24.4s
1812 add w9,w9,w24
1813 add v4.4s,v4.4s,v24.4s
1814 add x10,x10,x24,lsr#32
1815 add v8.4s,v8.4s,v24.4s
1816 add w11,w11,w25
1817 add v12.4s,v12.4s,v24.4s
1818 add x12,x12,x25,lsr#32
1819 add v16.4s,v16.4s,v24.4s
1820 add w13,w13,w26
1821 add v20.4s,v20.4s,v24.4s
1822 add x14,x14,x26,lsr#32
1823 add v2.4s,v2.4s,v26.4s
1824 add w15,w15,w27
1825 add v6.4s,v6.4s,v26.4s
1826 add x16,x16,x27,lsr#32
1827 add v10.4s,v10.4s,v26.4s
1828 add w17,w17,w28
1829 add v14.4s,v14.4s,v26.4s
1830 add x19,x19,x28,lsr#32
1831 add v18.4s,v18.4s,v26.4s
1832 add w20,w20,w30
1833 add v22.4s,v22.4s,v26.4s
1834 add x21,x21,x30,lsr#32
1835 add v19.4s,v19.4s,v31.4s // +4
1836 add x5,x5,x6,lsl#32 // pack
1837 add v23.4s,v23.4s,v31.4s // +4
1838 add x7,x7,x8,lsl#32
1839 add v3.4s,v3.4s,v27.4s
1840 ldp x6,x8,[x1,#0] // load input
1841 add v7.4s,v7.4s,v28.4s
1842 add x9,x9,x10,lsl#32
1843 add v11.4s,v11.4s,v29.4s
1844 add x11,x11,x12,lsl#32
1845 add v15.4s,v15.4s,v30.4s
1846 ldp x10,x12,[x1,#16]
1847 add v19.4s,v19.4s,v27.4s
1848 add x13,x13,x14,lsl#32
1849 add v23.4s,v23.4s,v28.4s
1850 add x15,x15,x16,lsl#32
1851 add v1.4s,v1.4s,v25.4s
1852 ldp x14,x16,[x1,#32]
1853 add v5.4s,v5.4s,v25.4s
1854 add x17,x17,x19,lsl#32
1855 add v9.4s,v9.4s,v25.4s
1856 add x20,x20,x21,lsl#32
1857 add v13.4s,v13.4s,v25.4s
1858 ldp x19,x21,[x1,#48]
1859 add v17.4s,v17.4s,v25.4s
1860 add x1,x1,#64
1861 add v21.4s,v21.4s,v25.4s
1862
1863 #ifdef __ARMEB__
1864 rev x5,x5
1865 rev x7,x7
1866 rev x9,x9
1867 rev x11,x11
1868 rev x13,x13
1869 rev x15,x15
1870 rev x17,x17
1871 rev x20,x20
1872 #endif
1873 ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64
1874 eor x5,x5,x6
1875 eor x7,x7,x8
1876 eor x9,x9,x10
1877 eor x11,x11,x12
1878 eor x13,x13,x14
1879 eor v0.16b,v0.16b,v24.16b
1880 eor x15,x15,x16
1881 eor v1.16b,v1.16b,v25.16b
1882 eor x17,x17,x19
1883 eor v2.16b,v2.16b,v26.16b
1884 eor x20,x20,x21
1885 eor v3.16b,v3.16b,v27.16b
1886 ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64
1887
1888 stp x5,x7,[x0,#0] // store output
1889 add x28,x28,#7 // increment counter
1890 stp x9,x11,[x0,#16]
1891 stp x13,x15,[x0,#32]
1892 stp x17,x20,[x0,#48]
1893 add x0,x0,#64
1894 st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
1895
1896 ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64
1897 eor v4.16b,v4.16b,v24.16b
1898 eor v5.16b,v5.16b,v25.16b
1899 eor v6.16b,v6.16b,v26.16b
1900 eor v7.16b,v7.16b,v27.16b
1901 st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
1902
1903 ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
1904 eor v8.16b,v8.16b,v0.16b
1905 ldp q24,q25,[sp,#0]
1906 eor v9.16b,v9.16b,v1.16b
1907 ldp q26,q27,[sp,#32]
1908 eor v10.16b,v10.16b,v2.16b
1909 eor v11.16b,v11.16b,v3.16b
1910 st1 {v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64
1911
1912 ld1 {v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64
1913 eor v12.16b,v12.16b,v4.16b
1914 eor v13.16b,v13.16b,v5.16b
1915 eor v14.16b,v14.16b,v6.16b
1916 eor v15.16b,v15.16b,v7.16b
1917 st1 {v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64
1918
1919 ld1 {v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64
1920 eor v16.16b,v16.16b,v8.16b
1921 eor v17.16b,v17.16b,v9.16b
1922 eor v18.16b,v18.16b,v10.16b
1923 eor v19.16b,v19.16b,v11.16b
1924 st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
1925
1926 shl v0.4s,v31.4s,#1 // 4 -> 8
1927 eor v20.16b,v20.16b,v12.16b
1928 eor v21.16b,v21.16b,v13.16b
1929 eor v22.16b,v22.16b,v14.16b
1930 eor v23.16b,v23.16b,v15.16b
1931 st1 {v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64
1932
1933 add v27.4s,v27.4s,v0.4s // += 8
1934 add v28.4s,v28.4s,v0.4s
1935 add v29.4s,v29.4s,v0.4s
1936 add v30.4s,v30.4s,v0.4s
1937
1938 b.hs .Loop_outer_512_neon
1939
1940 adds x2,x2,#512
1941 ushr v0.4s,v31.4s,#2 // 4 -> 1
1942
1943 ldp d8,d9,[sp,#128+0] // meet ABI requirements
1944 ldp d10,d11,[sp,#128+16]
1945 ldp d12,d13,[sp,#128+32]
1946 ldp d14,d15,[sp,#128+48]
1947
1948 stp q24,q31,[sp,#0] // wipe off-load area
1949 stp q24,q31,[sp,#32]
1950 stp q24,q31,[sp,#64]
1951
1952 b.eq .Ldone_512_neon
1953
1954 cmp x2,#192
1955 sub v27.4s,v27.4s,v0.4s // -= 1
1956 sub v28.4s,v28.4s,v0.4s
1957 sub v29.4s,v29.4s,v0.4s
1958 add sp,sp,#128
1959 b.hs .Loop_outer_neon
1960
1961 eor v25.16b,v25.16b,v25.16b
1962 eor v26.16b,v26.16b,v26.16b
1963 eor v27.16b,v27.16b,v27.16b
1964 eor v28.16b,v28.16b,v28.16b
1965 eor v29.16b,v29.16b,v29.16b
1966 eor v30.16b,v30.16b,v30.16b
1967 b .Loop_outer
1968
1969 .Ldone_512_neon:
1970 ldp x19,x20,[x29,#16]
1971 add sp,sp,#128+64
1972 ldp x21,x22,[x29,#32]
1973 ldp x23,x24,[x29,#48]
1974 ldp x25,x26,[x29,#64]
1975 ldp x27,x28,[x29,#80]
1976 ldp x29,x30,[sp],#96
1977 .inst 0xd50323bf // autiasp
1978 ret
1979 .size ChaCha20_512_neon,.-ChaCha20_512_neon
// Cache object: 146e534c15ca818fdc691afc63d7853a