/* $FreeBSD$ */
/* Do not modify. This file is auto-generated from poly1305-armv8.pl. */
#include "arm_arch.h"

.text

// forward "declarations" are required for Apple

.hidden OPENSSL_armcap_P
.globl poly1305_init
.hidden poly1305_init
.globl poly1305_blocks
.hidden poly1305_blocks
.globl poly1305_emit
.hidden poly1305_emit

.type poly1305_init,%function
.align 5
poly1305_init:
cmp x1,xzr
stp xzr,xzr,[x0] // zero hash value
stp xzr,xzr,[x0,#16] // [along with is_base2_26]

csel x0,xzr,x0,eq
b.eq .Lno_key

#ifdef __ILP32__
ldrsw x11,.LOPENSSL_armcap_P
#else
ldr x11,.LOPENSSL_armcap_P
#endif
adr x10,.LOPENSSL_armcap_P

ldp x7,x8,[x1] // load key
mov x9,#0xfffffffc0fffffff
movk x9,#0x0fff,lsl#48
ldr w17,[x10,x11]
#ifdef __ARMEB__
rev x7,x7 // flip bytes
rev x8,x8
#endif
and x7,x7,x9 // &=0ffffffc0fffffff
and x9,x9,#-4
and x8,x8,x9 // &=0ffffffc0ffffffc
stp x7,x8,[x0,#32] // save key value
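// The two masks above implement the Poly1305 "clamp":
// r &= 0x0ffffffc0ffffffc0ffffffc0fffffff, i.e. the top four bits of
// every 32-bit word of r are cleared, and the bottom two bits of the
// upper three words are cleared; the latter is what makes the
// s1 = r1 + (r1>>2) shortcut below exact.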

tst w17,#ARMV7_NEON

adr x12,poly1305_blocks
adr x7,poly1305_blocks_neon
adr x13,poly1305_emit
adr x8,poly1305_emit_neon

csel x12,x12,x7,eq
csel x13,x13,x8,eq

#ifdef __ILP32__
stp w12,w13,[x2]
#else
stp x12,x13,[x2]
#endif

mov x0,#1
.Lno_key:
ret
.size poly1305_init,.-poly1305_init

.type poly1305_blocks,%function
.align 5
poly1305_blocks:
ands x2,x2,#-16
b.eq .Lno_data

ldp x4,x5,[x0] // load hash value
ldp x7,x8,[x0,#32] // load key value
ldr x6,[x0,#16]
add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2)
b .Loop
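// Note on s1: clamping forces r1 = 0 (mod 4), so r1 + (r1>>2) is an
// exact 5*r1/4.  The h1*r1 cross product carries weight 2^128, and
// 2^128 = 5/4 (mod 2^130-5), so it can be folded into the low limbs
// as h1*s1.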

.align 5
.Loop:
ldp x10,x11,[x1],#16 // load input
sub x2,x2,#16
#ifdef __ARMEB__
rev x10,x10
rev x11,x11
#endif
adds x4,x4,x10 // accumulate input
adcs x5,x5,x11

mul x12,x4,x7 // h0*r0
adc x6,x6,x3
umulh x13,x4,x7

mul x10,x5,x9 // h1*5*r1
umulh x11,x5,x9

adds x12,x12,x10
mul x10,x4,x8 // h0*r1
adc x13,x13,x11
umulh x14,x4,x8

adds x13,x13,x10
mul x10,x5,x7 // h1*r0
adc x14,x14,xzr
umulh x11,x5,x7

adds x13,x13,x10
mul x10,x6,x9 // h2*5*r1
adc x14,x14,x11
mul x11,x6,x7 // h2*r0

adds x13,x13,x10
adc x14,x14,x11

and x10,x14,#-4 // final reduction
and x6,x14,#3
add x10,x10,x14,lsr#2
adds x4,x12,x10
adcs x5,x13,xzr
adc x6,x6,xzr
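// Partial reduction: 2^130 = 5 (mod 2^130-5), so the bits of h2 above
// bit 1 are folded back as x10 = (x14 & ~3) + (x14>>2) = 5*(x14>>2),
// while h2 keeps only its low two bits.  The result may still exceed
// the modulus; the final fix-up happens in poly1305_emit.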

cbnz x2,.Loop

stp x4,x5,[x0] // store hash value
str x6,[x0,#16]

.Lno_data:
ret
.size poly1305_blocks,.-poly1305_blocks

.type poly1305_emit,%function
.align 5
poly1305_emit:
ldp x4,x5,[x0] // load hash base 2^64
ldr x6,[x0,#16]
ldp x10,x11,[x2] // load nonce

adds x12,x4,#5 // compare to modulus
adcs x13,x5,xzr
adc x14,x6,xzr

tst x14,#-4 // see if it's carried/borrowed

csel x4,x4,x12,eq
csel x5,x5,x13,eq
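// h+5 overflows into bit 130 exactly when h >= 2^130-5, so testing
// the upper bits of x14 selects between h and h-(2^130-5) without a
// data-dependent branch.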

#ifdef __ARMEB__
ror x10,x10,#32 // flip nonce words
ror x11,x11,#32
#endif
adds x4,x4,x10 // accumulate nonce
adc x5,x5,x11
#ifdef __ARMEB__
rev x4,x4 // flip output bytes
rev x5,x5
#endif
stp x4,x5,[x1] // write result

ret
.size poly1305_emit,.-poly1305_emit
.type poly1305_mult,%function
.align 5
poly1305_mult:
mul x12,x4,x7 // h0*r0
umulh x13,x4,x7

mul x10,x5,x9 // h1*5*r1
umulh x11,x5,x9

adds x12,x12,x10
mul x10,x4,x8 // h0*r1
adc x13,x13,x11
umulh x14,x4,x8

adds x13,x13,x10
mul x10,x5,x7 // h1*r0
adc x14,x14,xzr
umulh x11,x5,x7

adds x13,x13,x10
mul x10,x6,x9 // h2*5*r1
adc x14,x14,x11
mul x11,x6,x7 // h2*r0

adds x13,x13,x10
adc x14,x14,x11

and x10,x14,#-4 // final reduction
and x6,x14,#3
add x10,x10,x14,lsr#2
adds x4,x12,x10
adcs x5,x13,xzr
adc x6,x6,xzr

ret
.size poly1305_mult,.-poly1305_mult

.type poly1305_splat,%function
.align 5
poly1305_splat:
and x12,x4,#0x03ffffff // base 2^64 -> base 2^26
ubfx x13,x4,#26,#26
extr x14,x5,x4,#52
and x14,x14,#0x03ffffff
ubfx x15,x5,#14,#26
extr x16,x6,x5,#40
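// x4:x5:x6 has been split into five 26-bit limbs.  They are stored
// below as 32-bit words at a 16-byte stride, together with the 5*r_i
// values the NEON code needs; the caller steps x0 back by 4 between
// calls, so the powers r^1..r^4 land in the four lanes of each .4s
// vector loaded later.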

str w12,[x0,#16*0] // r0
add w12,w13,w13,lsl#2 // r1*5
str w13,[x0,#16*1] // r1
add w13,w14,w14,lsl#2 // r2*5
str w12,[x0,#16*2] // s1
str w14,[x0,#16*3] // r2
add w14,w15,w15,lsl#2 // r3*5
str w13,[x0,#16*4] // s2
str w15,[x0,#16*5] // r3
add w15,w16,w16,lsl#2 // r4*5
str w14,[x0,#16*6] // s3
str w16,[x0,#16*7] // r4
str w15,[x0,#16*8] // s4

ret
.size poly1305_splat,.-poly1305_splat

.type poly1305_blocks_neon,%function
.align 5
poly1305_blocks_neon:
ldr x17,[x0,#24]
cmp x2,#128
b.hs .Lblocks_neon
cbz x17,poly1305_blocks

.Lblocks_neon:
.inst 0xd503233f // paciasp
stp x29,x30,[sp,#-80]!
add x29,sp,#0

ands x2,x2,#-16
b.eq .Lno_data_neon

cbz x17,.Lbase2_64_neon

ldp w10,w11,[x0] // load hash value base 2^26
ldp w12,w13,[x0,#8]
ldr w14,[x0,#16]

tst x2,#31
b.eq .Leven_neon

ldp x7,x8,[x0,#32] // load key value

add x4,x10,x11,lsl#26 // base 2^26 -> base 2^64
lsr x5,x12,#12
adds x4,x4,x12,lsl#52
add x5,x5,x13,lsl#14
adc x5,x5,xzr
lsr x6,x14,#24
adds x5,x5,x14,lsl#40
adc x14,x6,xzr // can be partially reduced...
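// Radix conversion just performed:
//   x4  = h0 + (h1<<26) + (h2<<52)            (low 64 bits)
//   x5  = (h2>>12) + (h3<<14) + (h4<<40) + carry
//   x14 = (h4>>24) + carry                    (top limb, not yet reduced)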

ldp x12,x13,[x1],#16 // load input
sub x2,x2,#16
add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2)

and x10,x14,#-4 // ... so reduce
and x6,x14,#3
add x10,x10,x14,lsr#2
adds x4,x4,x10
adcs x5,x5,xzr
adc x6,x6,xzr

#ifdef __ARMEB__
rev x12,x12
rev x13,x13
#endif
adds x4,x4,x12 // accumulate input
adcs x5,x5,x13
adc x6,x6,x3

bl poly1305_mult
ldr x30,[sp,#8]

cbz x3,.Lstore_base2_64_neon

and x10,x4,#0x03ffffff // base 2^64 -> base 2^26
ubfx x11,x4,#26,#26
extr x12,x5,x4,#52
and x12,x12,#0x03ffffff
ubfx x13,x5,#14,#26
extr x14,x6,x5,#40

cbnz x2,.Leven_neon

stp w10,w11,[x0] // store hash value base 2^26
stp w12,w13,[x0,#8]
str w14,[x0,#16]
b .Lno_data_neon

.align 4
.Lstore_base2_64_neon:
stp x4,x5,[x0] // store hash value base 2^64
stp x6,xzr,[x0,#16] // note that is_base2_26 is zeroed
b .Lno_data_neon

.align 4
.Lbase2_64_neon:
ldp x7,x8,[x0,#32] // load key value

ldp x4,x5,[x0] // load hash value base 2^64
ldr x6,[x0,#16]

tst x2,#31
b.eq .Linit_neon

ldp x12,x13,[x1],#16 // load input
sub x2,x2,#16
add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2)
#ifdef __ARMEB__
rev x12,x12
rev x13,x13
#endif
adds x4,x4,x12 // accumulate input
adcs x5,x5,x13
adc x6,x6,x3

bl poly1305_mult

.Linit_neon:
and x10,x4,#0x03ffffff // base 2^64 -> base 2^26
ubfx x11,x4,#26,#26
extr x12,x5,x4,#52
and x12,x12,#0x03ffffff
ubfx x13,x5,#14,#26
extr x14,x6,x5,#40

stp d8,d9,[sp,#16] // meet ABI requirements
stp d10,d11,[sp,#32]
stp d12,d13,[sp,#48]
stp d14,d15,[sp,#64]

fmov d24,x10
fmov d25,x11
fmov d26,x12
fmov d27,x13
fmov d28,x14

////////////////////////////////// initialize r^n table
mov x4,x7 // r^1
add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2)
mov x5,x8
mov x6,xzr
add x0,x0,#48+12
bl poly1305_splat

bl poly1305_mult // r^2
sub x0,x0,#4
bl poly1305_splat

bl poly1305_mult // r^3
sub x0,x0,#4
bl poly1305_splat

bl poly1305_mult // r^4
sub x0,x0,#4
bl poly1305_splat
ldr x30,[sp,#8]
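// The table now holds r^1..r^4 in base 2^26, one power per 32-bit
// lane.  The vector loop consumes four blocks per iteration with two
// interleaved accumulators; the tail multiplies them by the matching
// powers (r^4:r^3 and r^2:r^1) and folds the lanes together.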

add x16,x1,#32
adr x17,.Lzeros
subs x2,x2,#64
csel x16,x17,x16,lo

mov x4,#1
str x4,[x0,#-24] // set is_base2_26
sub x0,x0,#48 // restore original x0
b .Ldo_neon

.align 4
.Leven_neon:
add x16,x1,#32
adr x17,.Lzeros
subs x2,x2,#64
csel x16,x17,x16,lo

stp d8,d9,[sp,#16] // meet ABI requirements
stp d10,d11,[sp,#32]
stp d12,d13,[sp,#48]
stp d14,d15,[sp,#64]

fmov d24,x10
fmov d25,x11
fmov d26,x12
fmov d27,x13
fmov d28,x14

.Ldo_neon:
ldp x8,x12,[x16],#16 // inp[2:3] (or zero)
ldp x9,x13,[x16],#48

lsl x3,x3,#24
add x15,x0,#48

#ifdef __ARMEB__
rev x8,x8
rev x12,x12
rev x9,x9
rev x13,x13
#endif
and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
and x5,x9,#0x03ffffff
ubfx x6,x8,#26,#26
ubfx x7,x9,#26,#26
add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
extr x8,x12,x8,#52
extr x9,x13,x9,#52
add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
fmov d14,x4
and x8,x8,#0x03ffffff
and x9,x9,#0x03ffffff
ubfx x10,x12,#14,#26
ubfx x11,x13,#14,#26
add x12,x3,x12,lsr#40
add x13,x3,x13,lsr#40
add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
fmov d15,x6
add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
fmov d16,x8
fmov d17,x10
fmov d18,x12

ldp x8,x12,[x1],#16 // inp[0:1]
ldp x9,x13,[x1],#48

ld1 {v0.4s,v1.4s,v2.4s,v3.4s},[x15],#64
ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x15],#64
ld1 {v8.4s},[x15]

#ifdef __ARMEB__
rev x8,x8
rev x12,x12
rev x9,x9
rev x13,x13
#endif
and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
and x5,x9,#0x03ffffff
ubfx x6,x8,#26,#26
ubfx x7,x9,#26,#26
add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
extr x8,x12,x8,#52
extr x9,x13,x9,#52
add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
fmov d9,x4
and x8,x8,#0x03ffffff
and x9,x9,#0x03ffffff
ubfx x10,x12,#14,#26
ubfx x11,x13,#14,#26
add x12,x3,x12,lsr#40
add x13,x3,x13,lsr#40
add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
fmov d10,x6
add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
movi v31.2d,#-1
fmov d11,x8
fmov d12,x10
fmov d13,x12
ushr v31.2d,v31.2d,#38
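// v31 = 2^26-1 in each 64-bit lane (all-ones shifted right by 38);
// it is the limb mask used by the lazy reductions below.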

b.ls .Lskip_loop

.align 4
.Loop_neon:
////////////////////////////////////////////////////////////////
// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
// ___________________/
// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
// ___________________/ ____________________/
//
// Note that we start with inp[2:3]*r^2. This is because it
// doesn't depend on reduction in previous iteration.
////////////////////////////////////////////////////////////////
// d4 = h0*r4 + h1*r3 + h2*r2 + h3*r1 + h4*r0
// d3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*5*r4
// d2 = h0*r2 + h1*r1 + h2*r0 + h3*5*r4 + h4*5*r3
// d1 = h0*r1 + h1*r0 + h2*5*r4 + h3*5*r3 + h4*5*r2
// d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1
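// The 5*r_j factors appear because a partial product landing at
// weight 2^130 or above is equivalent to 5 times its value at
// weight 2^0 (2^130 = 5 mod 2^130-5), so the reduction is folded
// straight into the multiplication.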

subs x2,x2,#64
umull v23.2d,v14.2s,v7.s[2]
csel x16,x17,x16,lo
umull v22.2d,v14.2s,v5.s[2]
umull v21.2d,v14.2s,v3.s[2]
ldp x8,x12,[x16],#16 // inp[2:3] (or zero)
umull v20.2d,v14.2s,v1.s[2]
ldp x9,x13,[x16],#48
umull v19.2d,v14.2s,v0.s[2]
#ifdef __ARMEB__
rev x8,x8
rev x12,x12
rev x9,x9
rev x13,x13
#endif

umlal v23.2d,v15.2s,v5.s[2]
and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
umlal v22.2d,v15.2s,v3.s[2]
and x5,x9,#0x03ffffff
umlal v21.2d,v15.2s,v1.s[2]
ubfx x6,x8,#26,#26
umlal v20.2d,v15.2s,v0.s[2]
ubfx x7,x9,#26,#26
umlal v19.2d,v15.2s,v8.s[2]
add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32

umlal v23.2d,v16.2s,v3.s[2]
extr x8,x12,x8,#52
umlal v22.2d,v16.2s,v1.s[2]
extr x9,x13,x9,#52
umlal v21.2d,v16.2s,v0.s[2]
add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
umlal v20.2d,v16.2s,v8.s[2]
fmov d14,x4
umlal v19.2d,v16.2s,v6.s[2]
and x8,x8,#0x03ffffff

umlal v23.2d,v17.2s,v1.s[2]
and x9,x9,#0x03ffffff
umlal v22.2d,v17.2s,v0.s[2]
ubfx x10,x12,#14,#26
umlal v21.2d,v17.2s,v8.s[2]
ubfx x11,x13,#14,#26
umlal v20.2d,v17.2s,v6.s[2]
add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
umlal v19.2d,v17.2s,v4.s[2]
fmov d15,x6

add v11.2s,v11.2s,v26.2s
add x12,x3,x12,lsr#40
umlal v23.2d,v18.2s,v0.s[2]
add x13,x3,x13,lsr#40
umlal v22.2d,v18.2s,v8.s[2]
add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
umlal v21.2d,v18.2s,v6.s[2]
add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
umlal v20.2d,v18.2s,v4.s[2]
fmov d16,x8
umlal v19.2d,v18.2s,v2.s[2]
fmov d17,x10

////////////////////////////////////////////////////////////////
// (hash+inp[0:1])*r^4 and accumulate

add v9.2s,v9.2s,v24.2s
fmov d18,x12
umlal v22.2d,v11.2s,v1.s[0]
ldp x8,x12,[x1],#16 // inp[0:1]
umlal v19.2d,v11.2s,v6.s[0]
ldp x9,x13,[x1],#48
umlal v23.2d,v11.2s,v3.s[0]
umlal v20.2d,v11.2s,v8.s[0]
umlal v21.2d,v11.2s,v0.s[0]
#ifdef __ARMEB__
rev x8,x8
rev x12,x12
rev x9,x9
rev x13,x13
#endif

add v10.2s,v10.2s,v25.2s
umlal v22.2d,v9.2s,v5.s[0]
umlal v23.2d,v9.2s,v7.s[0]
and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
umlal v21.2d,v9.2s,v3.s[0]
and x5,x9,#0x03ffffff
umlal v19.2d,v9.2s,v0.s[0]
ubfx x6,x8,#26,#26
umlal v20.2d,v9.2s,v1.s[0]
ubfx x7,x9,#26,#26

add v12.2s,v12.2s,v27.2s
add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
umlal v22.2d,v10.2s,v3.s[0]
extr x8,x12,x8,#52
umlal v23.2d,v10.2s,v5.s[0]
extr x9,x13,x9,#52
umlal v19.2d,v10.2s,v8.s[0]
add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
umlal v21.2d,v10.2s,v1.s[0]
fmov d9,x4
umlal v20.2d,v10.2s,v0.s[0]
and x8,x8,#0x03ffffff

add v13.2s,v13.2s,v28.2s
and x9,x9,#0x03ffffff
umlal v22.2d,v12.2s,v0.s[0]
ubfx x10,x12,#14,#26
umlal v19.2d,v12.2s,v4.s[0]
ubfx x11,x13,#14,#26
umlal v23.2d,v12.2s,v1.s[0]
add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
umlal v20.2d,v12.2s,v6.s[0]
fmov d10,x6
umlal v21.2d,v12.2s,v8.s[0]
add x12,x3,x12,lsr#40

umlal v22.2d,v13.2s,v8.s[0]
add x13,x3,x13,lsr#40
umlal v19.2d,v13.2s,v2.s[0]
add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
umlal v23.2d,v13.2s,v0.s[0]
add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
umlal v20.2d,v13.2s,v4.s[0]
fmov d11,x8
umlal v21.2d,v13.2s,v6.s[0]
fmov d12,x10
fmov d13,x12

/////////////////////////////////////////////////////////////////
// lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
// and P. Schwabe
//
// [see discussion in poly1305-armv4 module]
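// Carry order: h3->h4 and h0->h1, then h1->h2, then h4->h0 (the
// carry is added once plus once shifted left by 2, i.e. times 5,
// since 2^130 = 5) and h2->h3, and finally one more h0->h1 and
// h3->h4.  Limbs end up only slightly above 26 bits, which the next
// iteration's 32x32->64 multiplications absorb without overflow.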

ushr v29.2d,v22.2d,#26
xtn v27.2s,v22.2d
ushr v30.2d,v19.2d,#26
and v19.16b,v19.16b,v31.16b
add v23.2d,v23.2d,v29.2d // h3 -> h4
bic v27.2s,#0xfc,lsl#24 // &=0x03ffffff
add v20.2d,v20.2d,v30.2d // h0 -> h1

ushr v29.2d,v23.2d,#26
xtn v28.2s,v23.2d
ushr v30.2d,v20.2d,#26
xtn v25.2s,v20.2d
bic v28.2s,#0xfc,lsl#24
add v21.2d,v21.2d,v30.2d // h1 -> h2

add v19.2d,v19.2d,v29.2d
shl v29.2d,v29.2d,#2
shrn v30.2s,v21.2d,#26
xtn v26.2s,v21.2d
add v19.2d,v19.2d,v29.2d // h4 -> h0
bic v25.2s,#0xfc,lsl#24
add v27.2s,v27.2s,v30.2s // h2 -> h3
bic v26.2s,#0xfc,lsl#24

shrn v29.2s,v19.2d,#26
xtn v24.2s,v19.2d
ushr v30.2s,v27.2s,#26
bic v27.2s,#0xfc,lsl#24
bic v24.2s,#0xfc,lsl#24
add v25.2s,v25.2s,v29.2s // h0 -> h1
add v28.2s,v28.2s,v30.2s // h3 -> h4

b.hi .Loop_neon

.Lskip_loop:
dup v16.2d,v16.d[0]
add v11.2s,v11.2s,v26.2s

////////////////////////////////////////////////////////////////
// multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1

adds x2,x2,#32
b.ne .Long_tail

dup v16.2d,v11.d[0]
add v14.2s,v9.2s,v24.2s
add v17.2s,v12.2s,v27.2s
add v15.2s,v10.2s,v25.2s
add v18.2s,v13.2s,v28.2s

.Long_tail:
dup v14.2d,v14.d[0]
umull2 v19.2d,v16.4s,v6.4s
umull2 v22.2d,v16.4s,v1.4s
umull2 v23.2d,v16.4s,v3.4s
umull2 v21.2d,v16.4s,v0.4s
umull2 v20.2d,v16.4s,v8.4s

dup v15.2d,v15.d[0]
umlal2 v19.2d,v14.4s,v0.4s
umlal2 v21.2d,v14.4s,v3.4s
umlal2 v22.2d,v14.4s,v5.4s
umlal2 v23.2d,v14.4s,v7.4s
umlal2 v20.2d,v14.4s,v1.4s

dup v17.2d,v17.d[0]
umlal2 v19.2d,v15.4s,v8.4s
umlal2 v22.2d,v15.4s,v3.4s
umlal2 v21.2d,v15.4s,v1.4s
umlal2 v23.2d,v15.4s,v5.4s
umlal2 v20.2d,v15.4s,v0.4s

dup v18.2d,v18.d[0]
umlal2 v22.2d,v17.4s,v0.4s
umlal2 v23.2d,v17.4s,v1.4s
umlal2 v19.2d,v17.4s,v4.4s
umlal2 v20.2d,v17.4s,v6.4s
umlal2 v21.2d,v17.4s,v8.4s

umlal2 v22.2d,v18.4s,v8.4s
umlal2 v19.2d,v18.4s,v2.4s
umlal2 v23.2d,v18.4s,v0.4s
umlal2 v20.2d,v18.4s,v4.4s
umlal2 v21.2d,v18.4s,v6.4s

b.eq .Lshort_tail

////////////////////////////////////////////////////////////////
// (hash+inp[0:1])*r^4:r^3 and accumulate

add v9.2s,v9.2s,v24.2s
umlal v22.2d,v11.2s,v1.2s
umlal v19.2d,v11.2s,v6.2s
umlal v23.2d,v11.2s,v3.2s
umlal v20.2d,v11.2s,v8.2s
umlal v21.2d,v11.2s,v0.2s

add v10.2s,v10.2s,v25.2s
umlal v22.2d,v9.2s,v5.2s
umlal v19.2d,v9.2s,v0.2s
umlal v23.2d,v9.2s,v7.2s
umlal v20.2d,v9.2s,v1.2s
umlal v21.2d,v9.2s,v3.2s

add v12.2s,v12.2s,v27.2s
umlal v22.2d,v10.2s,v3.2s
umlal v19.2d,v10.2s,v8.2s
umlal v23.2d,v10.2s,v5.2s
umlal v20.2d,v10.2s,v0.2s
umlal v21.2d,v10.2s,v1.2s

add v13.2s,v13.2s,v28.2s
umlal v22.2d,v12.2s,v0.2s
umlal v19.2d,v12.2s,v4.2s
umlal v23.2d,v12.2s,v1.2s
umlal v20.2d,v12.2s,v6.2s
umlal v21.2d,v12.2s,v8.2s

umlal v22.2d,v13.2s,v8.2s
umlal v19.2d,v13.2s,v2.2s
umlal v23.2d,v13.2s,v0.2s
umlal v20.2d,v13.2s,v4.2s
umlal v21.2d,v13.2s,v6.2s

.Lshort_tail:
////////////////////////////////////////////////////////////////
// horizontal add

addp v22.2d,v22.2d,v22.2d
ldp d8,d9,[sp,#16] // meet ABI requirements
addp v19.2d,v19.2d,v19.2d
ldp d10,d11,[sp,#32]
addp v23.2d,v23.2d,v23.2d
ldp d12,d13,[sp,#48]
addp v20.2d,v20.2d,v20.2d
ldp d14,d15,[sp,#64]
addp v21.2d,v21.2d,v21.2d

////////////////////////////////////////////////////////////////
// lazy reduction, but without narrowing

ushr v29.2d,v22.2d,#26
and v22.16b,v22.16b,v31.16b
ushr v30.2d,v19.2d,#26
and v19.16b,v19.16b,v31.16b

add v23.2d,v23.2d,v29.2d // h3 -> h4
add v20.2d,v20.2d,v30.2d // h0 -> h1

ushr v29.2d,v23.2d,#26
and v23.16b,v23.16b,v31.16b
ushr v30.2d,v20.2d,#26
and v20.16b,v20.16b,v31.16b
add v21.2d,v21.2d,v30.2d // h1 -> h2

add v19.2d,v19.2d,v29.2d
shl v29.2d,v29.2d,#2
ushr v30.2d,v21.2d,#26
and v21.16b,v21.16b,v31.16b
add v19.2d,v19.2d,v29.2d // h4 -> h0
add v22.2d,v22.2d,v30.2d // h2 -> h3

ushr v29.2d,v19.2d,#26
and v19.16b,v19.16b,v31.16b
ushr v30.2d,v22.2d,#26
and v22.16b,v22.16b,v31.16b
add v20.2d,v20.2d,v29.2d // h0 -> h1
add v23.2d,v23.2d,v30.2d // h3 -> h4

////////////////////////////////////////////////////////////////
// write the result, can be partially reduced

st4 {v19.s,v20.s,v21.s,v22.s}[0],[x0],#16
st1 {v23.s}[0],[x0]

.Lno_data_neon:
ldr x29,[sp],#80
.inst 0xd50323bf // autiasp
ret
.size poly1305_blocks_neon,.-poly1305_blocks_neon

.type poly1305_emit_neon,%function
.align 5
poly1305_emit_neon:
ldr x17,[x0,#24]
cbz x17,poly1305_emit

ldp w10,w11,[x0] // load hash value base 2^26
ldp w12,w13,[x0,#8]
ldr w14,[x0,#16]

add x4,x10,x11,lsl#26 // base 2^26 -> base 2^64
lsr x5,x12,#12
adds x4,x4,x12,lsl#52
add x5,x5,x13,lsl#14
adc x5,x5,xzr
lsr x6,x14,#24
adds x5,x5,x14,lsl#40
adc x6,x6,xzr // can be partially reduced...

ldp x10,x11,[x2] // load nonce

and x12,x6,#-4 // ... so reduce
add x12,x12,x6,lsr#2
and x6,x6,#3
adds x4,x4,x12
adcs x5,x5,xzr
adc x6,x6,xzr

adds x12,x4,#5 // compare to modulus
adcs x13,x5,xzr
adc x14,x6,xzr

tst x14,#-4 // see if it's carried/borrowed

csel x4,x4,x12,eq
csel x5,x5,x13,eq

#ifdef __ARMEB__
ror x10,x10,#32 // flip nonce words
ror x11,x11,#32
#endif
adds x4,x4,x10 // accumulate nonce
adc x5,x5,x11
#ifdef __ARMEB__
rev x4,x4 // flip output bytes
rev x5,x5
#endif
stp x4,x5,[x1] // write result

ret
.size poly1305_emit_neon,.-poly1305_emit_neon

.align 5
.Lzeros:
.long 0,0,0,0,0,0,0,0
.LOPENSSL_armcap_P:
#ifdef __ILP32__
.long OPENSSL_armcap_P-.
#else
.quad OPENSSL_armcap_P-.
#endif
.byte 80,111,108,121,49,51,48,53,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 2
.align 2