1 /* $FreeBSD$ */
2 /* Do not modify. This file is auto-generated from poly1305-x86_64.pl. */
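/*
 * Poly1305 MAC, CRYPTOGAMS implementation for x86_64 with scalar, AVX
 * and AVX2 code paths.  As used by this code, the context at %rdi
 * appears to be laid out as follows:
 *
 *   0..16   accumulator h: three 64-bit words while in base 2^64, or
 *           five 32-bit words holding 26-bit limbs once the vector
 *           code has converted it
 *   20      32-bit flag, set to nonzero after conversion to base 2^26
 *   24, 32  clamped key words r0 and r1
 *   48...   powers r, r^2, r^3, r^4 (and their 5x multiples) stored as
 *           26-bit limbs by __poly1305_init_avx for the SIMD paths
 */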
3 .text
4
5
6
7 .globl poly1305_init
8 .hidden poly1305_init
9 .globl poly1305_blocks
10 .hidden poly1305_blocks
11 .globl poly1305_emit
12 .hidden poly1305_emit
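
/*
 * poly1305_init(ctx=%rdi, key=%rsi, func[2]=%rdx): zero the
 * accumulator and, unless key is NULL, clamp the first 16 key bytes
 * into r with the masks 0x0ffffffc0fffffff/0x0ffffffc0ffffffc and
 * store them at 24(%rdi)/32(%rdi).  The blocks/emit pointers written
 * to the two-entry table at %rdx are the AVX pair when bit 28 (AVX) of
 * the capability quad loaded from OPENSSL_ia32cap_P+4 is set, with
 * poly1305_blocks_avx2 substituted when bit 37 (AVX2) is also set.
 * Returns 0 for a NULL key, 1 otherwise.
 */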
13
14 .type poly1305_init,@function
15 .align 32
16 poly1305_init:
17 .cfi_startproc
18 xorq %rax,%rax
19 movq %rax,0(%rdi)
20 movq %rax,8(%rdi)
21 movq %rax,16(%rdi)
22
23 cmpq $0,%rsi
24 je .Lno_key
25
26 leaq poly1305_blocks(%rip),%r10
27 leaq poly1305_emit(%rip),%r11
28 movq OPENSSL_ia32cap_P+4(%rip),%r9
29 leaq poly1305_blocks_avx(%rip),%rax
30 leaq poly1305_emit_avx(%rip),%rcx
31 btq $28,%r9
32 cmovcq %rax,%r10
33 cmovcq %rcx,%r11
34 leaq poly1305_blocks_avx2(%rip),%rax
35 btq $37,%r9
36 cmovcq %rax,%r10
37 movq $0x0ffffffc0fffffff,%rax
38 movq $0x0ffffffc0ffffffc,%rcx
39 andq 0(%rsi),%rax
40 andq 8(%rsi),%rcx
41 movq %rax,24(%rdi)
42 movq %rcx,32(%rdi)
43 movq %r10,0(%rdx)
44 movq %r11,8(%rdx)
45 movl $1,%eax
46 .Lno_key:
47 .byte 0xf3,0xc3
48 .cfi_endproc
49 .size poly1305_init,.-poly1305_init
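
/*
 * poly1305_blocks(ctx=%rdi, inp=%rsi, len=%rdx, padbit=%rcx): scalar
 * base 2^64 path.  For each full 16-byte block, h += block (with
 * padbit supplying the 2^128 bit), then h = h*r with a partial
 * reduction modulo 2^130-5.  The accumulator lives in %r14:%rbx:%rbp;
 * %r11 holds r0 and %r13 holds r1+(r1>>2), the precomputed value that
 * folds the modular reduction into the multiplication (exact because
 * clamping clears the low bits of r1).
 */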
50
51 .type poly1305_blocks,@function
52 .align 32
53 poly1305_blocks:
54 .cfi_startproc
55 .Lblocks:
56 shrq $4,%rdx
57 jz .Lno_data
58
59 pushq %rbx
60 .cfi_adjust_cfa_offset 8
61 .cfi_offset %rbx,-16
62 pushq %rbp
63 .cfi_adjust_cfa_offset 8
64 .cfi_offset %rbp,-24
65 pushq %r12
66 .cfi_adjust_cfa_offset 8
67 .cfi_offset %r12,-32
68 pushq %r13
69 .cfi_adjust_cfa_offset 8
70 .cfi_offset %r13,-40
71 pushq %r14
72 .cfi_adjust_cfa_offset 8
73 .cfi_offset %r14,-48
74 pushq %r15
75 .cfi_adjust_cfa_offset 8
76 .cfi_offset %r15,-56
77 .Lblocks_body:
78
79 movq %rdx,%r15
80
81 movq 24(%rdi),%r11
82 movq 32(%rdi),%r13
83
84 movq 0(%rdi),%r14
85 movq 8(%rdi),%rbx
86 movq 16(%rdi),%rbp
87
88 movq %r13,%r12
89 shrq $2,%r13
90 movq %r12,%rax
91 addq %r12,%r13
92 jmp .Loop
93
94 .align 32
95 .Loop:
96 addq 0(%rsi),%r14
97 adcq 8(%rsi),%rbx
98 leaq 16(%rsi),%rsi
99 adcq %rcx,%rbp
100 mulq %r14
101 movq %rax,%r9
102 movq %r11,%rax
103 movq %rdx,%r10
104
105 mulq %r14
106 movq %rax,%r14
107 movq %r11,%rax
108 movq %rdx,%r8
109
110 mulq %rbx
111 addq %rax,%r9
112 movq %r13,%rax
113 adcq %rdx,%r10
114
115 mulq %rbx
116 movq %rbp,%rbx
117 addq %rax,%r14
118 adcq %rdx,%r8
119
120 imulq %r13,%rbx
121 addq %rbx,%r9
122 movq %r8,%rbx
123 adcq $0,%r10
124
125 imulq %r11,%rbp
126 addq %r9,%rbx
127 movq $-4,%rax
128 adcq %rbp,%r10
129
130 andq %r10,%rax
131 movq %r10,%rbp
132 shrq $2,%r10
133 andq $3,%rbp
134 addq %r10,%rax
135 addq %rax,%r14
136 adcq $0,%rbx
137 adcq $0,%rbp
138 movq %r12,%rax
139 decq %r15
140 jnz .Loop
141
142 movq %r14,0(%rdi)
143 movq %rbx,8(%rdi)
144 movq %rbp,16(%rdi)
145
146 movq 0(%rsp),%r15
147 .cfi_restore %r15
148 movq 8(%rsp),%r14
149 .cfi_restore %r14
150 movq 16(%rsp),%r13
151 .cfi_restore %r13
152 movq 24(%rsp),%r12
153 .cfi_restore %r12
154 movq 32(%rsp),%rbp
155 .cfi_restore %rbp
156 movq 40(%rsp),%rbx
157 .cfi_restore %rbx
158 leaq 48(%rsp),%rsp
159 .cfi_adjust_cfa_offset -48
160 .Lno_data:
161 .Lblocks_epilogue:
162 .byte 0xf3,0xc3
163 .cfi_endproc
164 .size poly1305_blocks,.-poly1305_blocks
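
/*
 * poly1305_emit(ctx=%rdi, mac=%rsi, nonce=%rdx): final reduction of
 * the base 2^64 accumulator modulo 2^130-5 (selecting h or h+5 via the
 * carry into bit 130), addition of the 128-bit nonce, and store of the
 * 16-byte tag at %rsi.
 */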
165
166 .type poly1305_emit,@function
167 .align 32
168 poly1305_emit:
169 .cfi_startproc
170 .Lemit:
171 movq 0(%rdi),%r8
172 movq 8(%rdi),%r9
173 movq 16(%rdi),%r10
174
175 movq %r8,%rax
176 addq $5,%r8
177 movq %r9,%rcx
178 adcq $0,%r9
179 adcq $0,%r10
180 shrq $2,%r10
181 cmovnzq %r8,%rax
182 cmovnzq %r9,%rcx
183
184 addq 0(%rdx),%rax
185 adcq 8(%rdx),%rcx
186 movq %rax,0(%rsi)
187 movq %rcx,8(%rsi)
188
189 .byte 0xf3,0xc3
190 .cfi_endproc
191 .size poly1305_emit,.-poly1305_emit
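
/*
 * __poly1305_block: multiply the 130-bit accumulator in %r14:%rbx:%rbp
 * by r and partially reduce modulo 2^130-5, leaving the result in the
 * same registers.  Expects r0 in %r11, r1+(r1>>2) in %r13 and r1 in
 * %rax on entry; this is the same sequence that is inlined as .Loop
 * above.
 */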
192 .type __poly1305_block,@function
193 .align 32
194 __poly1305_block:
195 .cfi_startproc
196 mulq %r14
197 movq %rax,%r9
198 movq %r11,%rax
199 movq %rdx,%r10
200
201 mulq %r14
202 movq %rax,%r14
203 movq %r11,%rax
204 movq %rdx,%r8
205
206 mulq %rbx
207 addq %rax,%r9
208 movq %r13,%rax
209 adcq %rdx,%r10
210
211 mulq %rbx
212 movq %rbp,%rbx
213 addq %rax,%r14
214 adcq %rdx,%r8
215
216 imulq %r13,%rbx
217 addq %rbx,%r9
218 movq %r8,%rbx
219 adcq $0,%r10
220
221 imulq %r11,%rbp
222 addq %r9,%rbx
223 movq $-4,%rax
224 adcq %rbp,%r10
225
226 andq %r10,%rax
227 movq %r10,%rbp
228 shrq $2,%r10
229 andq $3,%rbp
230 addq %r10,%rax
231 addq %rax,%r14
232 adcq $0,%rbx
233 adcq $0,%rbp
234 .byte 0xf3,0xc3
235 .cfi_endproc
236 .size __poly1305_block,.-__poly1305_block
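
/*
 * __poly1305_init_avx: set h = r, then compute r^2, r^3 and r^4 with
 * __poly1305_block and store r through r^4 (plus the 5x multiples used
 * for lazy reduction) as 26-bit limbs in the power table starting at
 * 48(%rdi), in the interleaved layout expected by the vector code.
 */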
237
238 .type __poly1305_init_avx,@function
239 .align 32
240 __poly1305_init_avx:
241 .cfi_startproc
242 movq %r11,%r14
243 movq %r12,%rbx
244 xorq %rbp,%rbp
245
246 leaq 48+64(%rdi),%rdi
247
248 movq %r12,%rax
249 call __poly1305_block
250
251 movl $0x3ffffff,%eax
252 movl $0x3ffffff,%edx
253 movq %r14,%r8
254 andl %r14d,%eax
255 movq %r11,%r9
256 andl %r11d,%edx
257 movl %eax,-64(%rdi)
258 shrq $26,%r8
259 movl %edx,-60(%rdi)
260 shrq $26,%r9
261
262 movl $0x3ffffff,%eax
263 movl $0x3ffffff,%edx
264 andl %r8d,%eax
265 andl %r9d,%edx
266 movl %eax,-48(%rdi)
267 leal (%rax,%rax,4),%eax
268 movl %edx,-44(%rdi)
269 leal (%rdx,%rdx,4),%edx
270 movl %eax,-32(%rdi)
271 shrq $26,%r8
272 movl %edx,-28(%rdi)
273 shrq $26,%r9
274
275 movq %rbx,%rax
276 movq %r12,%rdx
277 shlq $12,%rax
278 shlq $12,%rdx
279 orq %r8,%rax
280 orq %r9,%rdx
281 andl $0x3ffffff,%eax
282 andl $0x3ffffff,%edx
283 movl %eax,-16(%rdi)
284 leal (%rax,%rax,4),%eax
285 movl %edx,-12(%rdi)
286 leal (%rdx,%rdx,4),%edx
287 movl %eax,0(%rdi)
288 movq %rbx,%r8
289 movl %edx,4(%rdi)
290 movq %r12,%r9
291
292 movl $0x3ffffff,%eax
293 movl $0x3ffffff,%edx
294 shrq $14,%r8
295 shrq $14,%r9
296 andl %r8d,%eax
297 andl %r9d,%edx
298 movl %eax,16(%rdi)
299 leal (%rax,%rax,4),%eax
300 movl %edx,20(%rdi)
301 leal (%rdx,%rdx,4),%edx
302 movl %eax,32(%rdi)
303 shrq $26,%r8
304 movl %edx,36(%rdi)
305 shrq $26,%r9
306
307 movq %rbp,%rax
308 shlq $24,%rax
309 orq %rax,%r8
310 movl %r8d,48(%rdi)
311 leaq (%r8,%r8,4),%r8
312 movl %r9d,52(%rdi)
313 leaq (%r9,%r9,4),%r9
314 movl %r8d,64(%rdi)
315 movl %r9d,68(%rdi)
316
317 movq %r12,%rax
318 call __poly1305_block
319
320 movl $0x3ffffff,%eax
321 movq %r14,%r8
322 andl %r14d,%eax
323 shrq $26,%r8
324 movl %eax,-52(%rdi)
325
326 movl $0x3ffffff,%edx
327 andl %r8d,%edx
328 movl %edx,-36(%rdi)
329 leal (%rdx,%rdx,4),%edx
330 shrq $26,%r8
331 movl %edx,-20(%rdi)
332
333 movq %rbx,%rax
334 shlq $12,%rax
335 orq %r8,%rax
336 andl $0x3ffffff,%eax
337 movl %eax,-4(%rdi)
338 leal (%rax,%rax,4),%eax
339 movq %rbx,%r8
340 movl %eax,12(%rdi)
341
342 movl $0x3ffffff,%edx
343 shrq $14,%r8
344 andl %r8d,%edx
345 movl %edx,28(%rdi)
346 leal (%rdx,%rdx,4),%edx
347 shrq $26,%r8
348 movl %edx,44(%rdi)
349
350 movq %rbp,%rax
351 shlq $24,%rax
352 orq %rax,%r8
353 movl %r8d,60(%rdi)
354 leaq (%r8,%r8,4),%r8
355 movl %r8d,76(%rdi)
356
357 movq %r12,%rax
358 call __poly1305_block
359
360 movl $0x3ffffff,%eax
361 movq %r14,%r8
362 andl %r14d,%eax
363 shrq $26,%r8
364 movl %eax,-56(%rdi)
365
366 movl $0x3ffffff,%edx
367 andl %r8d,%edx
368 movl %edx,-40(%rdi)
369 leal (%rdx,%rdx,4),%edx
370 shrq $26,%r8
371 movl %edx,-24(%rdi)
372
373 movq %rbx,%rax
374 shlq $12,%rax
375 orq %r8,%rax
376 andl $0x3ffffff,%eax
377 movl %eax,-8(%rdi)
378 leal (%rax,%rax,4),%eax
379 movq %rbx,%r8
380 movl %eax,8(%rdi)
381
382 movl $0x3ffffff,%edx
383 shrq $14,%r8
384 andl %r8d,%edx
385 movl %edx,24(%rdi)
386 leal (%rdx,%rdx,4),%edx
387 shrq $26,%r8
388 movl %edx,40(%rdi)
389
390 movq %rbp,%rax
391 shlq $24,%rax
392 orq %rax,%r8
393 movl %r8d,56(%rdi)
394 leaq (%r8,%r8,4),%r8
395 movl %r8d,72(%rdi)
396
397 leaq -48-64(%rdi),%rdi
398 .byte 0xf3,0xc3
399 .cfi_endproc
400 .size __poly1305_init_avx,.-__poly1305_init_avx
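
/*
 * poly1305_blocks_avx(ctx, inp, len, padbit): AVX path, two-way SIMD
 * over 26-bit limbs, consuming 64 bytes per main-loop iteration.
 * Inputs shorter than 128 bytes fall back to the scalar .Lblocks while
 * the state is still in base 2^64.  Straggler blocks that would leave
 * an odd count for the vector code go through __poly1305_block, with
 * the state converted between base 2^64 and base 2^26 as needed; on
 * first use the power table is built via __poly1305_init_avx and the
 * flag at 20(%rdi) is set.
 */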
401
402 .type poly1305_blocks_avx,@function
403 .align 32
404 poly1305_blocks_avx:
405 .cfi_startproc
406 movl 20(%rdi),%r8d
407 cmpq $128,%rdx
408 jae .Lblocks_avx
409 testl %r8d,%r8d
410 jz .Lblocks
411
412 .Lblocks_avx:
413 andq $-16,%rdx
414 jz .Lno_data_avx
415
416 vzeroupper
417
418 testl %r8d,%r8d
419 jz .Lbase2_64_avx
420
421 testq $31,%rdx
422 jz .Leven_avx
423
424 pushq %rbx
425 .cfi_adjust_cfa_offset 8
426 .cfi_offset %rbx,-16
427 pushq %rbp
428 .cfi_adjust_cfa_offset 8
429 .cfi_offset %rbp,-24
430 pushq %r12
431 .cfi_adjust_cfa_offset 8
432 .cfi_offset %r12,-32
433 pushq %r13
434 .cfi_adjust_cfa_offset 8
435 .cfi_offset %r13,-40
436 pushq %r14
437 .cfi_adjust_cfa_offset 8
438 .cfi_offset %r14,-48
439 pushq %r15
440 .cfi_adjust_cfa_offset 8
441 .cfi_offset %r15,-56
442 .Lblocks_avx_body:
443
444 movq %rdx,%r15
445
446 movq 0(%rdi),%r8
447 movq 8(%rdi),%r9
448 movl 16(%rdi),%ebp
449
450 movq 24(%rdi),%r11
451 movq 32(%rdi),%r13
452
453
454 movl %r8d,%r14d
455 andq $-2147483648,%r8
456 movq %r9,%r12
457 movl %r9d,%ebx
458 andq $-2147483648,%r9
459
460 shrq $6,%r8
461 shlq $52,%r12
462 addq %r8,%r14
463 shrq $12,%rbx
464 shrq $18,%r9
465 addq %r12,%r14
466 adcq %r9,%rbx
467
468 movq %rbp,%r8
469 shlq $40,%r8
470 shrq $24,%rbp
471 addq %r8,%rbx
472 adcq $0,%rbp
473
474 movq $-4,%r9
475 movq %rbp,%r8
476 andq %rbp,%r9
477 shrq $2,%r8
478 andq $3,%rbp
479 addq %r9,%r8
480 addq %r8,%r14
481 adcq $0,%rbx
482 adcq $0,%rbp
483
484 movq %r13,%r12
485 movq %r13,%rax
486 shrq $2,%r13
487 addq %r12,%r13
488
489 addq 0(%rsi),%r14
490 adcq 8(%rsi),%rbx
491 leaq 16(%rsi),%rsi
492 adcq %rcx,%rbp
493
494 call __poly1305_block
495
496 testq %rcx,%rcx
497 jz .Lstore_base2_64_avx
498
499
500 movq %r14,%rax
501 movq %r14,%rdx
502 shrq $52,%r14
503 movq %rbx,%r11
504 movq %rbx,%r12
505 shrq $26,%rdx
506 andq $0x3ffffff,%rax
507 shlq $12,%r11
508 andq $0x3ffffff,%rdx
509 shrq $14,%rbx
510 orq %r11,%r14
511 shlq $24,%rbp
512 andq $0x3ffffff,%r14
513 shrq $40,%r12
514 andq $0x3ffffff,%rbx
515 orq %r12,%rbp
516
517 subq $16,%r15
518 jz .Lstore_base2_26_avx
519
520 vmovd %eax,%xmm0
521 vmovd %edx,%xmm1
522 vmovd %r14d,%xmm2
523 vmovd %ebx,%xmm3
524 vmovd %ebp,%xmm4
525 jmp .Lproceed_avx
526
527 .align 32
528 .Lstore_base2_64_avx:
529 movq %r14,0(%rdi)
530 movq %rbx,8(%rdi)
531 movq %rbp,16(%rdi)
532 jmp .Ldone_avx
533
534 .align 16
535 .Lstore_base2_26_avx:
536 movl %eax,0(%rdi)
537 movl %edx,4(%rdi)
538 movl %r14d,8(%rdi)
539 movl %ebx,12(%rdi)
540 movl %ebp,16(%rdi)
541 .align 16
542 .Ldone_avx:
543 movq 0(%rsp),%r15
544 .cfi_restore %r15
545 movq 8(%rsp),%r14
546 .cfi_restore %r14
547 movq 16(%rsp),%r13
548 .cfi_restore %r13
549 movq 24(%rsp),%r12
550 .cfi_restore %r12
551 movq 32(%rsp),%rbp
552 .cfi_restore %rbp
553 movq 40(%rsp),%rbx
554 .cfi_restore %rbx
555 leaq 48(%rsp),%rsp
556 .cfi_adjust_cfa_offset -48
557 .Lno_data_avx:
558 .Lblocks_avx_epilogue:
559 .byte 0xf3,0xc3
560 .cfi_endproc
561
562 .align 32
563 .Lbase2_64_avx:
564 .cfi_startproc
565 pushq %rbx
566 .cfi_adjust_cfa_offset 8
567 .cfi_offset %rbx,-16
568 pushq %rbp
569 .cfi_adjust_cfa_offset 8
570 .cfi_offset %rbp,-24
571 pushq %r12
572 .cfi_adjust_cfa_offset 8
573 .cfi_offset %r12,-32
574 pushq %r13
575 .cfi_adjust_cfa_offset 8
576 .cfi_offset %r13,-40
577 pushq %r14
578 .cfi_adjust_cfa_offset 8
579 .cfi_offset %r14,-48
580 pushq %r15
581 .cfi_adjust_cfa_offset 8
582 .cfi_offset %r15,-56
583 .Lbase2_64_avx_body:
584
585 movq %rdx,%r15
586
587 movq 24(%rdi),%r11
588 movq 32(%rdi),%r13
589
590 movq 0(%rdi),%r14
591 movq 8(%rdi),%rbx
592 movl 16(%rdi),%ebp
593
594 movq %r13,%r12
595 movq %r13,%rax
596 shrq $2,%r13
597 addq %r12,%r13
598
599 testq $31,%rdx
600 jz .Linit_avx
601
602 addq 0(%rsi),%r14
603 adcq 8(%rsi),%rbx
604 leaq 16(%rsi),%rsi
605 adcq %rcx,%rbp
606 subq $16,%r15
607
608 call __poly1305_block
609
610 .Linit_avx:
611
612 movq %r14,%rax
613 movq %r14,%rdx
614 shrq $52,%r14
615 movq %rbx,%r8
616 movq %rbx,%r9
617 shrq $26,%rdx
618 andq $0x3ffffff,%rax
619 shlq $12,%r8
620 andq $0x3ffffff,%rdx
621 shrq $14,%rbx
622 orq %r8,%r14
623 shlq $24,%rbp
624 andq $0x3ffffff,%r14
625 shrq $40,%r9
626 andq $0x3ffffff,%rbx
627 orq %r9,%rbp
628
629 vmovd %eax,%xmm0
630 vmovd %edx,%xmm1
631 vmovd %r14d,%xmm2
632 vmovd %ebx,%xmm3
633 vmovd %ebp,%xmm4
634 movl $1,20(%rdi)
635
636 call __poly1305_init_avx
637
638 .Lproceed_avx:
639 movq %r15,%rdx
640
641 movq 0(%rsp),%r15
642 .cfi_restore %r15
643 movq 8(%rsp),%r14
644 .cfi_restore %r14
645 movq 16(%rsp),%r13
646 .cfi_restore %r13
647 movq 24(%rsp),%r12
648 .cfi_restore %r12
649 movq 32(%rsp),%rbp
650 .cfi_restore %rbp
651 movq 40(%rsp),%rbx
652 .cfi_restore %rbx
653 leaq 48(%rsp),%rax
654 leaq 48(%rsp),%rsp
655 .cfi_adjust_cfa_offset -48
656 .Lbase2_64_avx_epilogue:
657 jmp .Ldo_avx
658 .cfi_endproc
659
660 .align 32
661 .Leven_avx:
662 .cfi_startproc
663 vmovd 0(%rdi),%xmm0
664 vmovd 4(%rdi),%xmm1
665 vmovd 8(%rdi),%xmm2
666 vmovd 12(%rdi),%xmm3
667 vmovd 16(%rdi),%xmm4
668
669 .Ldo_avx:
670 leaq -88(%rsp),%r11
671 .cfi_def_cfa %r11,0x60
672 subq $0x178,%rsp
673 subq $64,%rdx
674 leaq -32(%rsi),%rax
675 cmovcq %rax,%rsi
676
677 vmovdqu 48(%rdi),%xmm14
678 leaq 112(%rdi),%rdi
679 leaq .Lconst(%rip),%rcx
680
681
682
683 vmovdqu 32(%rsi),%xmm5
684 vmovdqu 48(%rsi),%xmm6
685 vmovdqa 64(%rcx),%xmm15
686
687 vpsrldq $6,%xmm5,%xmm7
688 vpsrldq $6,%xmm6,%xmm8
689 vpunpckhqdq %xmm6,%xmm5,%xmm9
690 vpunpcklqdq %xmm6,%xmm5,%xmm5
691 vpunpcklqdq %xmm8,%xmm7,%xmm8
692
693 vpsrlq $40,%xmm9,%xmm9
694 vpsrlq $26,%xmm5,%xmm6
695 vpand %xmm15,%xmm5,%xmm5
696 vpsrlq $4,%xmm8,%xmm7
697 vpand %xmm15,%xmm6,%xmm6
698 vpsrlq $30,%xmm8,%xmm8
699 vpand %xmm15,%xmm7,%xmm7
700 vpand %xmm15,%xmm8,%xmm8
701 vpor 32(%rcx),%xmm9,%xmm9
702
703 jbe .Lskip_loop_avx
704
705
706 vmovdqu -48(%rdi),%xmm11
707 vmovdqu -32(%rdi),%xmm12
708 vpshufd $0xEE,%xmm14,%xmm13
709 vpshufd $0x44,%xmm14,%xmm10
710 vmovdqa %xmm13,-144(%r11)
711 vmovdqa %xmm10,0(%rsp)
712 vpshufd $0xEE,%xmm11,%xmm14
713 vmovdqu -16(%rdi),%xmm10
714 vpshufd $0x44,%xmm11,%xmm11
715 vmovdqa %xmm14,-128(%r11)
716 vmovdqa %xmm11,16(%rsp)
717 vpshufd $0xEE,%xmm12,%xmm13
718 vmovdqu 0(%rdi),%xmm11
719 vpshufd $0x44,%xmm12,%xmm12
720 vmovdqa %xmm13,-112(%r11)
721 vmovdqa %xmm12,32(%rsp)
722 vpshufd $0xEE,%xmm10,%xmm14
723 vmovdqu 16(%rdi),%xmm12
724 vpshufd $0x44,%xmm10,%xmm10
725 vmovdqa %xmm14,-96(%r11)
726 vmovdqa %xmm10,48(%rsp)
727 vpshufd $0xEE,%xmm11,%xmm13
728 vmovdqu 32(%rdi),%xmm10
729 vpshufd $0x44,%xmm11,%xmm11
730 vmovdqa %xmm13,-80(%r11)
731 vmovdqa %xmm11,64(%rsp)
732 vpshufd $0xEE,%xmm12,%xmm14
733 vmovdqu 48(%rdi),%xmm11
734 vpshufd $0x44,%xmm12,%xmm12
735 vmovdqa %xmm14,-64(%r11)
736 vmovdqa %xmm12,80(%rsp)
737 vpshufd $0xEE,%xmm10,%xmm13
738 vmovdqu 64(%rdi),%xmm12
739 vpshufd $0x44,%xmm10,%xmm10
740 vmovdqa %xmm13,-48(%r11)
741 vmovdqa %xmm10,96(%rsp)
742 vpshufd $0xEE,%xmm11,%xmm14
743 vpshufd $0x44,%xmm11,%xmm11
744 vmovdqa %xmm14,-32(%r11)
745 vmovdqa %xmm11,112(%rsp)
746 vpshufd $0xEE,%xmm12,%xmm13
747 vmovdqa 0(%rsp),%xmm14
748 vpshufd $0x44,%xmm12,%xmm12
749 vmovdqa %xmm13,-16(%r11)
750 vmovdqa %xmm12,128(%rsp)
751
752 jmp .Loop_avx
753
754 .align 32
755 .Loop_avx:
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776 vpmuludq %xmm5,%xmm14,%xmm10
777 vpmuludq %xmm6,%xmm14,%xmm11
778 vmovdqa %xmm2,32(%r11)
779 vpmuludq %xmm7,%xmm14,%xmm12
780 vmovdqa 16(%rsp),%xmm2
781 vpmuludq %xmm8,%xmm14,%xmm13
782 vpmuludq %xmm9,%xmm14,%xmm14
783
784 vmovdqa %xmm0,0(%r11)
785 vpmuludq 32(%rsp),%xmm9,%xmm0
786 vmovdqa %xmm1,16(%r11)
787 vpmuludq %xmm8,%xmm2,%xmm1
788 vpaddq %xmm0,%xmm10,%xmm10
789 vpaddq %xmm1,%xmm14,%xmm14
790 vmovdqa %xmm3,48(%r11)
791 vpmuludq %xmm7,%xmm2,%xmm0
792 vpmuludq %xmm6,%xmm2,%xmm1
793 vpaddq %xmm0,%xmm13,%xmm13
794 vmovdqa 48(%rsp),%xmm3
795 vpaddq %xmm1,%xmm12,%xmm12
796 vmovdqa %xmm4,64(%r11)
797 vpmuludq %xmm5,%xmm2,%xmm2
798 vpmuludq %xmm7,%xmm3,%xmm0
799 vpaddq %xmm2,%xmm11,%xmm11
800
801 vmovdqa 64(%rsp),%xmm4
802 vpaddq %xmm0,%xmm14,%xmm14
803 vpmuludq %xmm6,%xmm3,%xmm1
804 vpmuludq %xmm5,%xmm3,%xmm3
805 vpaddq %xmm1,%xmm13,%xmm13
806 vmovdqa 80(%rsp),%xmm2
807 vpaddq %xmm3,%xmm12,%xmm12
808 vpmuludq %xmm9,%xmm4,%xmm0
809 vpmuludq %xmm8,%xmm4,%xmm4
810 vpaddq %xmm0,%xmm11,%xmm11
811 vmovdqa 96(%rsp),%xmm3
812 vpaddq %xmm4,%xmm10,%xmm10
813
814 vmovdqa 128(%rsp),%xmm4
815 vpmuludq %xmm6,%xmm2,%xmm1
816 vpmuludq %xmm5,%xmm2,%xmm2
817 vpaddq %xmm1,%xmm14,%xmm14
818 vpaddq %xmm2,%xmm13,%xmm13
819 vpmuludq %xmm9,%xmm3,%xmm0
820 vpmuludq %xmm8,%xmm3,%xmm1
821 vpaddq %xmm0,%xmm12,%xmm12
822 vmovdqu 0(%rsi),%xmm0
823 vpaddq %xmm1,%xmm11,%xmm11
824 vpmuludq %xmm7,%xmm3,%xmm3
825 vpmuludq %xmm7,%xmm4,%xmm7
826 vpaddq %xmm3,%xmm10,%xmm10
827
828 vmovdqu 16(%rsi),%xmm1
829 vpaddq %xmm7,%xmm11,%xmm11
830 vpmuludq %xmm8,%xmm4,%xmm8
831 vpmuludq %xmm9,%xmm4,%xmm9
832 vpsrldq $6,%xmm0,%xmm2
833 vpaddq %xmm8,%xmm12,%xmm12
834 vpaddq %xmm9,%xmm13,%xmm13
835 vpsrldq $6,%xmm1,%xmm3
836 vpmuludq 112(%rsp),%xmm5,%xmm9
837 vpmuludq %xmm6,%xmm4,%xmm5
838 vpunpckhqdq %xmm1,%xmm0,%xmm4
839 vpaddq %xmm9,%xmm14,%xmm14
840 vmovdqa -144(%r11),%xmm9
841 vpaddq %xmm5,%xmm10,%xmm10
842
843 vpunpcklqdq %xmm1,%xmm0,%xmm0
844 vpunpcklqdq %xmm3,%xmm2,%xmm3
845
846
847 vpsrldq $5,%xmm4,%xmm4
848 vpsrlq $26,%xmm0,%xmm1
849 vpand %xmm15,%xmm0,%xmm0
850 vpsrlq $4,%xmm3,%xmm2
851 vpand %xmm15,%xmm1,%xmm1
852 vpand 0(%rcx),%xmm4,%xmm4
853 vpsrlq $30,%xmm3,%xmm3
854 vpand %xmm15,%xmm2,%xmm2
855 vpand %xmm15,%xmm3,%xmm3
856 vpor 32(%rcx),%xmm4,%xmm4
857
858 vpaddq 0(%r11),%xmm0,%xmm0
859 vpaddq 16(%r11),%xmm1,%xmm1
860 vpaddq 32(%r11),%xmm2,%xmm2
861 vpaddq 48(%r11),%xmm3,%xmm3
862 vpaddq 64(%r11),%xmm4,%xmm4
863
864 leaq 32(%rsi),%rax
865 leaq 64(%rsi),%rsi
866 subq $64,%rdx
867 cmovcq %rax,%rsi
868
869
870
871
872
873
874
875
876
877
878 vpmuludq %xmm0,%xmm9,%xmm5
879 vpmuludq %xmm1,%xmm9,%xmm6
880 vpaddq %xmm5,%xmm10,%xmm10
881 vpaddq %xmm6,%xmm11,%xmm11
882 vmovdqa -128(%r11),%xmm7
883 vpmuludq %xmm2,%xmm9,%xmm5
884 vpmuludq %xmm3,%xmm9,%xmm6
885 vpaddq %xmm5,%xmm12,%xmm12
886 vpaddq %xmm6,%xmm13,%xmm13
887 vpmuludq %xmm4,%xmm9,%xmm9
888 vpmuludq -112(%r11),%xmm4,%xmm5
889 vpaddq %xmm9,%xmm14,%xmm14
890
891 vpaddq %xmm5,%xmm10,%xmm10
892 vpmuludq %xmm2,%xmm7,%xmm6
893 vpmuludq %xmm3,%xmm7,%xmm5
894 vpaddq %xmm6,%xmm13,%xmm13
895 vmovdqa -96(%r11),%xmm8
896 vpaddq %xmm5,%xmm14,%xmm14
897 vpmuludq %xmm1,%xmm7,%xmm6
898 vpmuludq %xmm0,%xmm7,%xmm7
899 vpaddq %xmm6,%xmm12,%xmm12
900 vpaddq %xmm7,%xmm11,%xmm11
901
902 vmovdqa -80(%r11),%xmm9
903 vpmuludq %xmm2,%xmm8,%xmm5
904 vpmuludq %xmm1,%xmm8,%xmm6
905 vpaddq %xmm5,%xmm14,%xmm14
906 vpaddq %xmm6,%xmm13,%xmm13
907 vmovdqa -64(%r11),%xmm7
908 vpmuludq %xmm0,%xmm8,%xmm8
909 vpmuludq %xmm4,%xmm9,%xmm5
910 vpaddq %xmm8,%xmm12,%xmm12
911 vpaddq %xmm5,%xmm11,%xmm11
912 vmovdqa -48(%r11),%xmm8
913 vpmuludq %xmm3,%xmm9,%xmm9
914 vpmuludq %xmm1,%xmm7,%xmm6
915 vpaddq %xmm9,%xmm10,%xmm10
916
917 vmovdqa -16(%r11),%xmm9
918 vpaddq %xmm6,%xmm14,%xmm14
919 vpmuludq %xmm0,%xmm7,%xmm7
920 vpmuludq %xmm4,%xmm8,%xmm5
921 vpaddq %xmm7,%xmm13,%xmm13
922 vpaddq %xmm5,%xmm12,%xmm12
923 vmovdqu 32(%rsi),%xmm5
924 vpmuludq %xmm3,%xmm8,%xmm7
925 vpmuludq %xmm2,%xmm8,%xmm8
926 vpaddq %xmm7,%xmm11,%xmm11
927 vmovdqu 48(%rsi),%xmm6
928 vpaddq %xmm8,%xmm10,%xmm10
929
930 vpmuludq %xmm2,%xmm9,%xmm2
931 vpmuludq %xmm3,%xmm9,%xmm3
932 vpsrldq $6,%xmm5,%xmm7
933 vpaddq %xmm2,%xmm11,%xmm11
934 vpmuludq %xmm4,%xmm9,%xmm4
935 vpsrldq $6,%xmm6,%xmm8
936 vpaddq %xmm3,%xmm12,%xmm2
937 vpaddq %xmm4,%xmm13,%xmm3
938 vpmuludq -32(%r11),%xmm0,%xmm4
939 vpmuludq %xmm1,%xmm9,%xmm0
940 vpunpckhqdq %xmm6,%xmm5,%xmm9
941 vpaddq %xmm4,%xmm14,%xmm4
942 vpaddq %xmm0,%xmm10,%xmm0
943
944 vpunpcklqdq %xmm6,%xmm5,%xmm5
945 vpunpcklqdq %xmm8,%xmm7,%xmm8
946
947
948 vpsrldq $5,%xmm9,%xmm9
949 vpsrlq $26,%xmm5,%xmm6
950 vmovdqa 0(%rsp),%xmm14
951 vpand %xmm15,%xmm5,%xmm5
952 vpsrlq $4,%xmm8,%xmm7
953 vpand %xmm15,%xmm6,%xmm6
954 vpand 0(%rcx),%xmm9,%xmm9
955 vpsrlq $30,%xmm8,%xmm8
956 vpand %xmm15,%xmm7,%xmm7
957 vpand %xmm15,%xmm8,%xmm8
958 vpor 32(%rcx),%xmm9,%xmm9
959
960
961
962
963
964 vpsrlq $26,%xmm3,%xmm13
965 vpand %xmm15,%xmm3,%xmm3
966 vpaddq %xmm13,%xmm4,%xmm4
967
968 vpsrlq $26,%xmm0,%xmm10
969 vpand %xmm15,%xmm0,%xmm0
970 vpaddq %xmm10,%xmm11,%xmm1
971
972 vpsrlq $26,%xmm4,%xmm10
973 vpand %xmm15,%xmm4,%xmm4
974
975 vpsrlq $26,%xmm1,%xmm11
976 vpand %xmm15,%xmm1,%xmm1
977 vpaddq %xmm11,%xmm2,%xmm2
978
979 vpaddq %xmm10,%xmm0,%xmm0
980 vpsllq $2,%xmm10,%xmm10
981 vpaddq %xmm10,%xmm0,%xmm0
982
983 vpsrlq $26,%xmm2,%xmm12
984 vpand %xmm15,%xmm2,%xmm2
985 vpaddq %xmm12,%xmm3,%xmm3
986
987 vpsrlq $26,%xmm0,%xmm10
988 vpand %xmm15,%xmm0,%xmm0
989 vpaddq %xmm10,%xmm1,%xmm1
990
991 vpsrlq $26,%xmm3,%xmm13
992 vpand %xmm15,%xmm3,%xmm3
993 vpaddq %xmm13,%xmm4,%xmm4
994
995 ja .Loop_avx
996
997 .Lskip_loop_avx:
998
999
1000
1001 vpshufd $0x10,%xmm14,%xmm14
1002 addq $32,%rdx
1003 jnz .Long_tail_avx
1004
1005 vpaddq %xmm2,%xmm7,%xmm7
1006 vpaddq %xmm0,%xmm5,%xmm5
1007 vpaddq %xmm1,%xmm6,%xmm6
1008 vpaddq %xmm3,%xmm8,%xmm8
1009 vpaddq %xmm4,%xmm9,%xmm9
1010
1011 .Long_tail_avx:
1012 vmovdqa %xmm2,32(%r11)
1013 vmovdqa %xmm0,0(%r11)
1014 vmovdqa %xmm1,16(%r11)
1015 vmovdqa %xmm3,48(%r11)
1016 vmovdqa %xmm4,64(%r11)
1017
1018
1019
1020
1021
1022
1023
1024 vpmuludq %xmm7,%xmm14,%xmm12
1025 vpmuludq %xmm5,%xmm14,%xmm10
1026 vpshufd $0x10,-48(%rdi),%xmm2
1027 vpmuludq %xmm6,%xmm14,%xmm11
1028 vpmuludq %xmm8,%xmm14,%xmm13
1029 vpmuludq %xmm9,%xmm14,%xmm14
1030
1031 vpmuludq %xmm8,%xmm2,%xmm0
1032 vpaddq %xmm0,%xmm14,%xmm14
1033 vpshufd $0x10,-32(%rdi),%xmm3
1034 vpmuludq %xmm7,%xmm2,%xmm1
1035 vpaddq %xmm1,%xmm13,%xmm13
1036 vpshufd $0x10,-16(%rdi),%xmm4
1037 vpmuludq %xmm6,%xmm2,%xmm0
1038 vpaddq %xmm0,%xmm12,%xmm12
1039 vpmuludq %xmm5,%xmm2,%xmm2
1040 vpaddq %xmm2,%xmm11,%xmm11
1041 vpmuludq %xmm9,%xmm3,%xmm3
1042 vpaddq %xmm3,%xmm10,%xmm10
1043
1044 vpshufd $0x10,0(%rdi),%xmm2
1045 vpmuludq %xmm7,%xmm4,%xmm1
1046 vpaddq %xmm1,%xmm14,%xmm14
1047 vpmuludq %xmm6,%xmm4,%xmm0
1048 vpaddq %xmm0,%xmm13,%xmm13
1049 vpshufd $0x10,16(%rdi),%xmm3
1050 vpmuludq %xmm5,%xmm4,%xmm4
1051 vpaddq %xmm4,%xmm12,%xmm12
1052 vpmuludq %xmm9,%xmm2,%xmm1
1053 vpaddq %xmm1,%xmm11,%xmm11
1054 vpshufd $0x10,32(%rdi),%xmm4
1055 vpmuludq %xmm8,%xmm2,%xmm2
1056 vpaddq %xmm2,%xmm10,%xmm10
1057
1058 vpmuludq %xmm6,%xmm3,%xmm0
1059 vpaddq %xmm0,%xmm14,%xmm14
1060 vpmuludq %xmm5,%xmm3,%xmm3
1061 vpaddq %xmm3,%xmm13,%xmm13
1062 vpshufd $0x10,48(%rdi),%xmm2
1063 vpmuludq %xmm9,%xmm4,%xmm1
1064 vpaddq %xmm1,%xmm12,%xmm12
1065 vpshufd $0x10,64(%rdi),%xmm3
1066 vpmuludq %xmm8,%xmm4,%xmm0
1067 vpaddq %xmm0,%xmm11,%xmm11
1068 vpmuludq %xmm7,%xmm4,%xmm4
1069 vpaddq %xmm4,%xmm10,%xmm10
1070
1071 vpmuludq %xmm5,%xmm2,%xmm2
1072 vpaddq %xmm2,%xmm14,%xmm14
1073 vpmuludq %xmm9,%xmm3,%xmm1
1074 vpaddq %xmm1,%xmm13,%xmm13
1075 vpmuludq %xmm8,%xmm3,%xmm0
1076 vpaddq %xmm0,%xmm12,%xmm12
1077 vpmuludq %xmm7,%xmm3,%xmm1
1078 vpaddq %xmm1,%xmm11,%xmm11
1079 vpmuludq %xmm6,%xmm3,%xmm3
1080 vpaddq %xmm3,%xmm10,%xmm10
1081
1082 jz .Lshort_tail_avx
1083
1084 vmovdqu 0(%rsi),%xmm0
1085 vmovdqu 16(%rsi),%xmm1
1086
1087 vpsrldq $6,%xmm0,%xmm2
1088 vpsrldq $6,%xmm1,%xmm3
1089 vpunpckhqdq %xmm1,%xmm0,%xmm4
1090 vpunpcklqdq %xmm1,%xmm0,%xmm0
1091 vpunpcklqdq %xmm3,%xmm2,%xmm3
1092
1093 vpsrlq $40,%xmm4,%xmm4
1094 vpsrlq $26,%xmm0,%xmm1
1095 vpand %xmm15,%xmm0,%xmm0
1096 vpsrlq $4,%xmm3,%xmm2
1097 vpand %xmm15,%xmm1,%xmm1
1098 vpsrlq $30,%xmm3,%xmm3
1099 vpand %xmm15,%xmm2,%xmm2
1100 vpand %xmm15,%xmm3,%xmm3
1101 vpor 32(%rcx),%xmm4,%xmm4
1102
1103 vpshufd $0x32,-64(%rdi),%xmm9
1104 vpaddq 0(%r11),%xmm0,%xmm0
1105 vpaddq 16(%r11),%xmm1,%xmm1
1106 vpaddq 32(%r11),%xmm2,%xmm2
1107 vpaddq 48(%r11),%xmm3,%xmm3
1108 vpaddq 64(%r11),%xmm4,%xmm4
1109
1110
1111
1112
1113 vpmuludq %xmm0,%xmm9,%xmm5
1114 vpaddq %xmm5,%xmm10,%xmm10
1115 vpmuludq %xmm1,%xmm9,%xmm6
1116 vpaddq %xmm6,%xmm11,%xmm11
1117 vpmuludq %xmm2,%xmm9,%xmm5
1118 vpaddq %xmm5,%xmm12,%xmm12
1119 vpshufd $0x32,-48(%rdi),%xmm7
1120 vpmuludq %xmm3,%xmm9,%xmm6
1121 vpaddq %xmm6,%xmm13,%xmm13
1122 vpmuludq %xmm4,%xmm9,%xmm9
1123 vpaddq %xmm9,%xmm14,%xmm14
1124
1125 vpmuludq %xmm3,%xmm7,%xmm5
1126 vpaddq %xmm5,%xmm14,%xmm14
1127 vpshufd $0x32,-32(%rdi),%xmm8
1128 vpmuludq %xmm2,%xmm7,%xmm6
1129 vpaddq %xmm6,%xmm13,%xmm13
1130 vpshufd $0x32,-16(%rdi),%xmm9
1131 vpmuludq %xmm1,%xmm7,%xmm5
1132 vpaddq %xmm5,%xmm12,%xmm12
1133 vpmuludq %xmm0,%xmm7,%xmm7
1134 vpaddq %xmm7,%xmm11,%xmm11
1135 vpmuludq %xmm4,%xmm8,%xmm8
1136 vpaddq %xmm8,%xmm10,%xmm10
1137
1138 vpshufd $0x32,0(%rdi),%xmm7
1139 vpmuludq %xmm2,%xmm9,%xmm6
1140 vpaddq %xmm6,%xmm14,%xmm14
1141 vpmuludq %xmm1,%xmm9,%xmm5
1142 vpaddq %xmm5,%xmm13,%xmm13
1143 vpshufd $0x32,16(%rdi),%xmm8
1144 vpmuludq %xmm0,%xmm9,%xmm9
1145 vpaddq %xmm9,%xmm12,%xmm12
1146 vpmuludq %xmm4,%xmm7,%xmm6
1147 vpaddq %xmm6,%xmm11,%xmm11
1148 vpshufd $0x32,32(%rdi),%xmm9
1149 vpmuludq %xmm3,%xmm7,%xmm7
1150 vpaddq %xmm7,%xmm10,%xmm10
1151
1152 vpmuludq %xmm1,%xmm8,%xmm5
1153 vpaddq %xmm5,%xmm14,%xmm14
1154 vpmuludq %xmm0,%xmm8,%xmm8
1155 vpaddq %xmm8,%xmm13,%xmm13
1156 vpshufd $0x32,48(%rdi),%xmm7
1157 vpmuludq %xmm4,%xmm9,%xmm6
1158 vpaddq %xmm6,%xmm12,%xmm12
1159 vpshufd $0x32,64(%rdi),%xmm8
1160 vpmuludq %xmm3,%xmm9,%xmm5
1161 vpaddq %xmm5,%xmm11,%xmm11
1162 vpmuludq %xmm2,%xmm9,%xmm9
1163 vpaddq %xmm9,%xmm10,%xmm10
1164
1165 vpmuludq %xmm0,%xmm7,%xmm7
1166 vpaddq %xmm7,%xmm14,%xmm14
1167 vpmuludq %xmm4,%xmm8,%xmm6
1168 vpaddq %xmm6,%xmm13,%xmm13
1169 vpmuludq %xmm3,%xmm8,%xmm5
1170 vpaddq %xmm5,%xmm12,%xmm12
1171 vpmuludq %xmm2,%xmm8,%xmm6
1172 vpaddq %xmm6,%xmm11,%xmm11
1173 vpmuludq %xmm1,%xmm8,%xmm8
1174 vpaddq %xmm8,%xmm10,%xmm10
1175
1176 .Lshort_tail_avx:
1177
1178
1179
1180 vpsrldq $8,%xmm14,%xmm9
1181 vpsrldq $8,%xmm13,%xmm8
1182 vpsrldq $8,%xmm11,%xmm6
1183 vpsrldq $8,%xmm10,%xmm5
1184 vpsrldq $8,%xmm12,%xmm7
1185 vpaddq %xmm8,%xmm13,%xmm13
1186 vpaddq %xmm9,%xmm14,%xmm14
1187 vpaddq %xmm5,%xmm10,%xmm10
1188 vpaddq %xmm6,%xmm11,%xmm11
1189 vpaddq %xmm7,%xmm12,%xmm12
1190
1191
1192
1193
1194 vpsrlq $26,%xmm13,%xmm3
1195 vpand %xmm15,%xmm13,%xmm13
1196 vpaddq %xmm3,%xmm14,%xmm14
1197
1198 vpsrlq $26,%xmm10,%xmm0
1199 vpand %xmm15,%xmm10,%xmm10
1200 vpaddq %xmm0,%xmm11,%xmm11
1201
1202 vpsrlq $26,%xmm14,%xmm4
1203 vpand %xmm15,%xmm14,%xmm14
1204
1205 vpsrlq $26,%xmm11,%xmm1
1206 vpand %xmm15,%xmm11,%xmm11
1207 vpaddq %xmm1,%xmm12,%xmm12
1208
1209 vpaddq %xmm4,%xmm10,%xmm10
1210 vpsllq $2,%xmm4,%xmm4
1211 vpaddq %xmm4,%xmm10,%xmm10
1212
1213 vpsrlq $26,%xmm12,%xmm2
1214 vpand %xmm15,%xmm12,%xmm12
1215 vpaddq %xmm2,%xmm13,%xmm13
1216
1217 vpsrlq $26,%xmm10,%xmm0
1218 vpand %xmm15,%xmm10,%xmm10
1219 vpaddq %xmm0,%xmm11,%xmm11
1220
1221 vpsrlq $26,%xmm13,%xmm3
1222 vpand %xmm15,%xmm13,%xmm13
1223 vpaddq %xmm3,%xmm14,%xmm14
1224
1225 vmovd %xmm10,-112(%rdi)
1226 vmovd %xmm11,-108(%rdi)
1227 vmovd %xmm12,-104(%rdi)
1228 vmovd %xmm13,-100(%rdi)
1229 vmovd %xmm14,-96(%rdi)
1230 leaq 88(%r11),%rsp
1231 .cfi_def_cfa %rsp,8
1232 vzeroupper
1233 .byte 0xf3,0xc3
1234 .cfi_endproc
1235 .size poly1305_blocks_avx,.-poly1305_blocks_avx
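
/*
 * poly1305_emit_avx(ctx=%rdi, mac=%rsi, nonce=%rdx): if the flag at
 * 20(%rdi) is still zero the accumulator is in base 2^64 and the
 * scalar .Lemit is used directly; otherwise the five 26-bit limbs are
 * recombined into base 2^64 first.  Either way the value is fully
 * reduced modulo 2^130-5, the 128-bit nonce is added and the 16-byte
 * tag is stored at %rsi.
 */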
1236
1237 .type poly1305_emit_avx,@function
1238 .align 32
1239 poly1305_emit_avx:
1240 .cfi_startproc
1241 cmpl $0,20(%rdi)
1242 je .Lemit
1243
1244 movl 0(%rdi),%eax
1245 movl 4(%rdi),%ecx
1246 movl 8(%rdi),%r8d
1247 movl 12(%rdi),%r11d
1248 movl 16(%rdi),%r10d
1249
1250 shlq $26,%rcx
1251 movq %r8,%r9
1252 shlq $52,%r8
1253 addq %rcx,%rax
1254 shrq $12,%r9
1255 addq %rax,%r8
1256 adcq $0,%r9
1257
1258 shlq $14,%r11
1259 movq %r10,%rax
1260 shrq $24,%r10
1261 addq %r11,%r9
1262 shlq $40,%rax
1263 addq %rax,%r9
1264 adcq $0,%r10
1265
1266 movq %r10,%rax
1267 movq %r10,%rcx
1268 andq $3,%r10
1269 shrq $2,%rax
1270 andq $-4,%rcx
1271 addq %rcx,%rax
1272 addq %rax,%r8
1273 adcq $0,%r9
1274 adcq $0,%r10
1275
1276 movq %r8,%rax
1277 addq $5,%r8
1278 movq %r9,%rcx
1279 adcq $0,%r9
1280 adcq $0,%r10
1281 shrq $2,%r10
1282 cmovnzq %r8,%rax
1283 cmovnzq %r9,%rcx
1284
1285 addq 0(%rdx),%rax
1286 adcq 8(%rdx),%rcx
1287 movq %rax,0(%rsi)
1288 movq %rcx,8(%rsi)
1289
1290 .byte 0xf3,0xc3
1291 .cfi_endproc
1292 .size poly1305_emit_avx,.-poly1305_emit_avx
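
/*
 * poly1305_blocks_avx2(ctx, inp, len, padbit): AVX2 path, four blocks
 * in parallel per 64-byte loop iteration using 256-bit registers, with
 * the same 26-bit limb representation and power table as the AVX code
 * and the same scalar fallbacks for short or not-yet-converted input.
 */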
1293 .type poly1305_blocks_avx2,@function
1294 .align 32
1295 poly1305_blocks_avx2:
1296 .cfi_startproc
1297 movl 20(%rdi),%r8d
1298 cmpq $128,%rdx
1299 jae .Lblocks_avx2
1300 testl %r8d,%r8d
1301 jz .Lblocks
1302
1303 .Lblocks_avx2:
1304 andq $-16,%rdx
1305 jz .Lno_data_avx2
1306
1307 vzeroupper
1308
1309 testl %r8d,%r8d
1310 jz .Lbase2_64_avx2
1311
1312 testq $63,%rdx
1313 jz .Leven_avx2
1314
1315 pushq %rbx
1316 .cfi_adjust_cfa_offset 8
1317 .cfi_offset %rbx,-16
1318 pushq %rbp
1319 .cfi_adjust_cfa_offset 8
1320 .cfi_offset %rbp,-24
1321 pushq %r12
1322 .cfi_adjust_cfa_offset 8
1323 .cfi_offset %r12,-32
1324 pushq %r13
1325 .cfi_adjust_cfa_offset 8
1326 .cfi_offset %r13,-40
1327 pushq %r14
1328 .cfi_adjust_cfa_offset 8
1329 .cfi_offset %r14,-48
1330 pushq %r15
1331 .cfi_adjust_cfa_offset 8
1332 .cfi_offset %r15,-56
1333 .Lblocks_avx2_body:
1334
1335 movq %rdx,%r15
1336
1337 movq 0(%rdi),%r8
1338 movq 8(%rdi),%r9
1339 movl 16(%rdi),%ebp
1340
1341 movq 24(%rdi),%r11
1342 movq 32(%rdi),%r13
1343
1344
1345 movl %r8d,%r14d
1346 andq $-2147483648,%r8
1347 movq %r9,%r12
1348 movl %r9d,%ebx
1349 andq $-2147483648,%r9
1350
1351 shrq $6,%r8
1352 shlq $52,%r12
1353 addq %r8,%r14
1354 shrq $12,%rbx
1355 shrq $18,%r9
1356 addq %r12,%r14
1357 adcq %r9,%rbx
1358
1359 movq %rbp,%r8
1360 shlq $40,%r8
1361 shrq $24,%rbp
1362 addq %r8,%rbx
1363 adcq $0,%rbp
1364
1365 movq $-4,%r9
1366 movq %rbp,%r8
1367 andq %rbp,%r9
1368 shrq $2,%r8
1369 andq $3,%rbp
1370 addq %r9,%r8
1371 addq %r8,%r14
1372 adcq $0,%rbx
1373 adcq $0,%rbp
1374
1375 movq %r13,%r12
1376 movq %r13,%rax
1377 shrq $2,%r13
1378 addq %r12,%r13
1379
1380 .Lbase2_26_pre_avx2:
1381 addq 0(%rsi),%r14
1382 adcq 8(%rsi),%rbx
1383 leaq 16(%rsi),%rsi
1384 adcq %rcx,%rbp
1385 subq $16,%r15
1386
1387 call __poly1305_block
1388 movq %r12,%rax
1389
1390 testq $63,%r15
1391 jnz .Lbase2_26_pre_avx2
1392
1393 testq %rcx,%rcx
1394 jz .Lstore_base2_64_avx2
1395
1396
1397 movq %r14,%rax
1398 movq %r14,%rdx
1399 shrq $52,%r14
1400 movq %rbx,%r11
1401 movq %rbx,%r12
1402 shrq $26,%rdx
1403 andq $0x3ffffff,%rax
1404 shlq $12,%r11
1405 andq $0x3ffffff,%rdx
1406 shrq $14,%rbx
1407 orq %r11,%r14
1408 shlq $24,%rbp
1409 andq $0x3ffffff,%r14
1410 shrq $40,%r12
1411 andq $0x3ffffff,%rbx
1412 orq %r12,%rbp
1413
1414 testq %r15,%r15
1415 jz .Lstore_base2_26_avx2
1416
1417 vmovd %eax,%xmm0
1418 vmovd %edx,%xmm1
1419 vmovd %r14d,%xmm2
1420 vmovd %ebx,%xmm3
1421 vmovd %ebp,%xmm4
1422 jmp .Lproceed_avx2
1423
1424 .align 32
1425 .Lstore_base2_64_avx2:
1426 movq %r14,0(%rdi)
1427 movq %rbx,8(%rdi)
1428 movq %rbp,16(%rdi)
1429 jmp .Ldone_avx2
1430
1431 .align 16
1432 .Lstore_base2_26_avx2:
1433 movl %eax,0(%rdi)
1434 movl %edx,4(%rdi)
1435 movl %r14d,8(%rdi)
1436 movl %ebx,12(%rdi)
1437 movl %ebp,16(%rdi)
1438 .align 16
1439 .Ldone_avx2:
1440 movq 0(%rsp),%r15
1441 .cfi_restore %r15
1442 movq 8(%rsp),%r14
1443 .cfi_restore %r14
1444 movq 16(%rsp),%r13
1445 .cfi_restore %r13
1446 movq 24(%rsp),%r12
1447 .cfi_restore %r12
1448 movq 32(%rsp),%rbp
1449 .cfi_restore %rbp
1450 movq 40(%rsp),%rbx
1451 .cfi_restore %rbx
1452 leaq 48(%rsp),%rsp
1453 .cfi_adjust_cfa_offset -48
1454 .Lno_data_avx2:
1455 .Lblocks_avx2_epilogue:
1456 .byte 0xf3,0xc3
1457 .cfi_endproc
1458
1459 .align 32
1460 .Lbase2_64_avx2:
1461 .cfi_startproc
1462 pushq %rbx
1463 .cfi_adjust_cfa_offset 8
1464 .cfi_offset %rbx,-16
1465 pushq %rbp
1466 .cfi_adjust_cfa_offset 8
1467 .cfi_offset %rbp,-24
1468 pushq %r12
1469 .cfi_adjust_cfa_offset 8
1470 .cfi_offset %r12,-32
1471 pushq %r13
1472 .cfi_adjust_cfa_offset 8
1473 .cfi_offset %r13,-40
1474 pushq %r14
1475 .cfi_adjust_cfa_offset 8
1476 .cfi_offset %r14,-48
1477 pushq %r15
1478 .cfi_adjust_cfa_offset 8
1479 .cfi_offset %r15,-56
1480 .Lbase2_64_avx2_body:
1481
1482 movq %rdx,%r15
1483
1484 movq 24(%rdi),%r11
1485 movq 32(%rdi),%r13
1486
1487 movq 0(%rdi),%r14
1488 movq 8(%rdi),%rbx
1489 movl 16(%rdi),%ebp
1490
1491 movq %r13,%r12
1492 movq %r13,%rax
1493 shrq $2,%r13
1494 addq %r12,%r13
1495
1496 testq $63,%rdx
1497 jz .Linit_avx2
1498
1499 .Lbase2_64_pre_avx2:
1500 addq 0(%rsi),%r14
1501 adcq 8(%rsi),%rbx
1502 leaq 16(%rsi),%rsi
1503 adcq %rcx,%rbp
1504 subq $16,%r15
1505
1506 call __poly1305_block
1507 movq %r12,%rax
1508
1509 testq $63,%r15
1510 jnz .Lbase2_64_pre_avx2
1511
1512 .Linit_avx2:
1513
1514 movq %r14,%rax
1515 movq %r14,%rdx
1516 shrq $52,%r14
1517 movq %rbx,%r8
1518 movq %rbx,%r9
1519 shrq $26,%rdx
1520 andq $0x3ffffff,%rax
1521 shlq $12,%r8
1522 andq $0x3ffffff,%rdx
1523 shrq $14,%rbx
1524 orq %r8,%r14
1525 shlq $24,%rbp
1526 andq $0x3ffffff,%r14
1527 shrq $40,%r9
1528 andq $0x3ffffff,%rbx
1529 orq %r9,%rbp
1530
1531 vmovd %eax,%xmm0
1532 vmovd %edx,%xmm1
1533 vmovd %r14d,%xmm2
1534 vmovd %ebx,%xmm3
1535 vmovd %ebp,%xmm4
1536 movl $1,20(%rdi)
1537
1538 call __poly1305_init_avx
1539
1540 .Lproceed_avx2:
1541 movq %r15,%rdx
1542 movl OPENSSL_ia32cap_P+8(%rip),%r10d
1543 movl $3221291008,%r11d
1544
1545 movq 0(%rsp),%r15
1546 .cfi_restore %r15
1547 movq 8(%rsp),%r14
1548 .cfi_restore %r14
1549 movq 16(%rsp),%r13
1550 .cfi_restore %r13
1551 movq 24(%rsp),%r12
1552 .cfi_restore %r12
1553 movq 32(%rsp),%rbp
1554 .cfi_restore %rbp
1555 movq 40(%rsp),%rbx
1556 .cfi_restore %rbx
1557 leaq 48(%rsp),%rax
1558 leaq 48(%rsp),%rsp
1559 .cfi_adjust_cfa_offset -48
1560 .Lbase2_64_avx2_epilogue:
1561 jmp .Ldo_avx2
1562 .cfi_endproc
1563
1564 .align 32
1565 .Leven_avx2:
1566 .cfi_startproc
1567 movl OPENSSL_ia32cap_P+8(%rip),%r10d
1568 vmovd 0(%rdi),%xmm0
1569 vmovd 4(%rdi),%xmm1
1570 vmovd 8(%rdi),%xmm2
1571 vmovd 12(%rdi),%xmm3
1572 vmovd 16(%rdi),%xmm4
1573
1574 .Ldo_avx2:
1575 leaq -8(%rsp),%r11
1576 .cfi_def_cfa %r11,16
1577 subq $0x128,%rsp
1578 leaq .Lconst(%rip),%rcx
1579 leaq 48+64(%rdi),%rdi
1580 vmovdqa 96(%rcx),%ymm7
1581
1582
1583 vmovdqu -64(%rdi),%xmm9
1584 andq $-512,%rsp
1585 vmovdqu -48(%rdi),%xmm10
1586 vmovdqu -32(%rdi),%xmm6
1587 vmovdqu -16(%rdi),%xmm11
1588 vmovdqu 0(%rdi),%xmm12
1589 vmovdqu 16(%rdi),%xmm13
1590 leaq 144(%rsp),%rax
1591 vmovdqu 32(%rdi),%xmm14
1592 vpermd %ymm9,%ymm7,%ymm9
1593 vmovdqu 48(%rdi),%xmm15
1594 vpermd %ymm10,%ymm7,%ymm10
1595 vmovdqu 64(%rdi),%xmm5
1596 vpermd %ymm6,%ymm7,%ymm6
1597 vmovdqa %ymm9,0(%rsp)
1598 vpermd %ymm11,%ymm7,%ymm11
1599 vmovdqa %ymm10,32-144(%rax)
1600 vpermd %ymm12,%ymm7,%ymm12
1601 vmovdqa %ymm6,64-144(%rax)
1602 vpermd %ymm13,%ymm7,%ymm13
1603 vmovdqa %ymm11,96-144(%rax)
1604 vpermd %ymm14,%ymm7,%ymm14
1605 vmovdqa %ymm12,128-144(%rax)
1606 vpermd %ymm15,%ymm7,%ymm15
1607 vmovdqa %ymm13,160-144(%rax)
1608 vpermd %ymm5,%ymm7,%ymm5
1609 vmovdqa %ymm14,192-144(%rax)
1610 vmovdqa %ymm15,224-144(%rax)
1611 vmovdqa %ymm5,256-144(%rax)
1612 vmovdqa 64(%rcx),%ymm5
1613
1614
1615
1616 vmovdqu 0(%rsi),%xmm7
1617 vmovdqu 16(%rsi),%xmm8
1618 vinserti128 $1,32(%rsi),%ymm7,%ymm7
1619 vinserti128 $1,48(%rsi),%ymm8,%ymm8
1620 leaq 64(%rsi),%rsi
1621
1622 vpsrldq $6,%ymm7,%ymm9
1623 vpsrldq $6,%ymm8,%ymm10
1624 vpunpckhqdq %ymm8,%ymm7,%ymm6
1625 vpunpcklqdq %ymm10,%ymm9,%ymm9
1626 vpunpcklqdq %ymm8,%ymm7,%ymm7
1627
1628 vpsrlq $30,%ymm9,%ymm10
1629 vpsrlq $4,%ymm9,%ymm9
1630 vpsrlq $26,%ymm7,%ymm8
1631 vpsrlq $40,%ymm6,%ymm6
1632 vpand %ymm5,%ymm9,%ymm9
1633 vpand %ymm5,%ymm7,%ymm7
1634 vpand %ymm5,%ymm8,%ymm8
1635 vpand %ymm5,%ymm10,%ymm10
1636 vpor 32(%rcx),%ymm6,%ymm6
1637
1638 vpaddq %ymm2,%ymm9,%ymm2
1639 subq $64,%rdx
1640 jz .Ltail_avx2
1641 jmp .Loop_avx2
1642
1643 .align 32
1644 .Loop_avx2:
1645
1646
1647
1648
1649
1650
1651
1652
1653 vpaddq %ymm0,%ymm7,%ymm0
1654 vmovdqa 0(%rsp),%ymm7
1655 vpaddq %ymm1,%ymm8,%ymm1
1656 vmovdqa 32(%rsp),%ymm8
1657 vpaddq %ymm3,%ymm10,%ymm3
1658 vmovdqa 96(%rsp),%ymm9
1659 vpaddq %ymm4,%ymm6,%ymm4
1660 vmovdqa 48(%rax),%ymm10
1661 vmovdqa 112(%rax),%ymm5
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678 vpmuludq %ymm2,%ymm7,%ymm13
1679 vpmuludq %ymm2,%ymm8,%ymm14
1680 vpmuludq %ymm2,%ymm9,%ymm15
1681 vpmuludq %ymm2,%ymm10,%ymm11
1682 vpmuludq %ymm2,%ymm5,%ymm12
1683
1684 vpmuludq %ymm0,%ymm8,%ymm6
1685 vpmuludq %ymm1,%ymm8,%ymm2
1686 vpaddq %ymm6,%ymm12,%ymm12
1687 vpaddq %ymm2,%ymm13,%ymm13
1688 vpmuludq %ymm3,%ymm8,%ymm6
1689 vpmuludq 64(%rsp),%ymm4,%ymm2
1690 vpaddq %ymm6,%ymm15,%ymm15
1691 vpaddq %ymm2,%ymm11,%ymm11
1692 vmovdqa -16(%rax),%ymm8
1693
1694 vpmuludq %ymm0,%ymm7,%ymm6
1695 vpmuludq %ymm1,%ymm7,%ymm2
1696 vpaddq %ymm6,%ymm11,%ymm11
1697 vpaddq %ymm2,%ymm12,%ymm12
1698 vpmuludq %ymm3,%ymm7,%ymm6
1699 vpmuludq %ymm4,%ymm7,%ymm2
1700 vmovdqu 0(%rsi),%xmm7
1701 vpaddq %ymm6,%ymm14,%ymm14
1702 vpaddq %ymm2,%ymm15,%ymm15
1703 vinserti128 $1,32(%rsi),%ymm7,%ymm7
1704
1705 vpmuludq %ymm3,%ymm8,%ymm6
1706 vpmuludq %ymm4,%ymm8,%ymm2
1707 vmovdqu 16(%rsi),%xmm8
1708 vpaddq %ymm6,%ymm11,%ymm11
1709 vpaddq %ymm2,%ymm12,%ymm12
1710 vmovdqa 16(%rax),%ymm2
1711 vpmuludq %ymm1,%ymm9,%ymm6
1712 vpmuludq %ymm0,%ymm9,%ymm9
1713 vpaddq %ymm6,%ymm14,%ymm14
1714 vpaddq %ymm9,%ymm13,%ymm13
1715 vinserti128 $1,48(%rsi),%ymm8,%ymm8
1716 leaq 64(%rsi),%rsi
1717
1718 vpmuludq %ymm1,%ymm2,%ymm6
1719 vpmuludq %ymm0,%ymm2,%ymm2
1720 vpsrldq $6,%ymm7,%ymm9
1721 vpaddq %ymm6,%ymm15,%ymm15
1722 vpaddq %ymm2,%ymm14,%ymm14
1723 vpmuludq %ymm3,%ymm10,%ymm6
1724 vpmuludq %ymm4,%ymm10,%ymm2
1725 vpsrldq $6,%ymm8,%ymm10
1726 vpaddq %ymm6,%ymm12,%ymm12
1727 vpaddq %ymm2,%ymm13,%ymm13
1728 vpunpckhqdq %ymm8,%ymm7,%ymm6
1729
1730 vpmuludq %ymm3,%ymm5,%ymm3
1731 vpmuludq %ymm4,%ymm5,%ymm4
1732 vpunpcklqdq %ymm8,%ymm7,%ymm7
1733 vpaddq %ymm3,%ymm13,%ymm2
1734 vpaddq %ymm4,%ymm14,%ymm3
1735 vpunpcklqdq %ymm10,%ymm9,%ymm10
1736 vpmuludq 80(%rax),%ymm0,%ymm4
1737 vpmuludq %ymm1,%ymm5,%ymm0
1738 vmovdqa 64(%rcx),%ymm5
1739 vpaddq %ymm4,%ymm15,%ymm4
1740 vpaddq %ymm0,%ymm11,%ymm0
1741
1742
1743
1744
1745 vpsrlq $26,%ymm3,%ymm14
1746 vpand %ymm5,%ymm3,%ymm3
1747 vpaddq %ymm14,%ymm4,%ymm4
1748
1749 vpsrlq $26,%ymm0,%ymm11
1750 vpand %ymm5,%ymm0,%ymm0
1751 vpaddq %ymm11,%ymm12,%ymm1
1752
1753 vpsrlq $26,%ymm4,%ymm15
1754 vpand %ymm5,%ymm4,%ymm4
1755
1756 vpsrlq $4,%ymm10,%ymm9
1757
1758 vpsrlq $26,%ymm1,%ymm12
1759 vpand %ymm5,%ymm1,%ymm1
1760 vpaddq %ymm12,%ymm2,%ymm2
1761
1762 vpaddq %ymm15,%ymm0,%ymm0
1763 vpsllq $2,%ymm15,%ymm15
1764 vpaddq %ymm15,%ymm0,%ymm0
1765
1766 vpand %ymm5,%ymm9,%ymm9
1767 vpsrlq $26,%ymm7,%ymm8
1768
1769 vpsrlq $26,%ymm2,%ymm13
1770 vpand %ymm5,%ymm2,%ymm2
1771 vpaddq %ymm13,%ymm3,%ymm3
1772
1773 vpaddq %ymm9,%ymm2,%ymm2
1774 vpsrlq $30,%ymm10,%ymm10
1775
1776 vpsrlq $26,%ymm0,%ymm11
1777 vpand %ymm5,%ymm0,%ymm0
1778 vpaddq %ymm11,%ymm1,%ymm1
1779
1780 vpsrlq $40,%ymm6,%ymm6
1781
1782 vpsrlq $26,%ymm3,%ymm14
1783 vpand %ymm5,%ymm3,%ymm3
1784 vpaddq %ymm14,%ymm4,%ymm4
1785
1786 vpand %ymm5,%ymm7,%ymm7
1787 vpand %ymm5,%ymm8,%ymm8
1788 vpand %ymm5,%ymm10,%ymm10
1789 vpor 32(%rcx),%ymm6,%ymm6
1790
1791 subq $64,%rdx
1792 jnz .Loop_avx2
1793
1794 .byte 0x66,0x90
1795 .Ltail_avx2:
1796
1797
1798
1799
1800
1801
1802
1803 vpaddq %ymm0,%ymm7,%ymm0
1804 vmovdqu 4(%rsp),%ymm7
1805 vpaddq %ymm1,%ymm8,%ymm1
1806 vmovdqu 36(%rsp),%ymm8
1807 vpaddq %ymm3,%ymm10,%ymm3
1808 vmovdqu 100(%rsp),%ymm9
1809 vpaddq %ymm4,%ymm6,%ymm4
1810 vmovdqu 52(%rax),%ymm10
1811 vmovdqu 116(%rax),%ymm5
1812
1813 vpmuludq %ymm2,%ymm7,%ymm13
1814 vpmuludq %ymm2,%ymm8,%ymm14
1815 vpmuludq %ymm2,%ymm9,%ymm15
1816 vpmuludq %ymm2,%ymm10,%ymm11
1817 vpmuludq %ymm2,%ymm5,%ymm12
1818
1819 vpmuludq %ymm0,%ymm8,%ymm6
1820 vpmuludq %ymm1,%ymm8,%ymm2
1821 vpaddq %ymm6,%ymm12,%ymm12
1822 vpaddq %ymm2,%ymm13,%ymm13
1823 vpmuludq %ymm3,%ymm8,%ymm6
1824 vpmuludq 68(%rsp),%ymm4,%ymm2
1825 vpaddq %ymm6,%ymm15,%ymm15
1826 vpaddq %ymm2,%ymm11,%ymm11
1827
1828 vpmuludq %ymm0,%ymm7,%ymm6
1829 vpmuludq %ymm1,%ymm7,%ymm2
1830 vpaddq %ymm6,%ymm11,%ymm11
1831 vmovdqu -12(%rax),%ymm8
1832 vpaddq %ymm2,%ymm12,%ymm12
1833 vpmuludq %ymm3,%ymm7,%ymm6
1834 vpmuludq %ymm4,%ymm7,%ymm2
1835 vpaddq %ymm6,%ymm14,%ymm14
1836 vpaddq %ymm2,%ymm15,%ymm15
1837
1838 vpmuludq %ymm3,%ymm8,%ymm6
1839 vpmuludq %ymm4,%ymm8,%ymm2
1840 vpaddq %ymm6,%ymm11,%ymm11
1841 vpaddq %ymm2,%ymm12,%ymm12
1842 vmovdqu 20(%rax),%ymm2
1843 vpmuludq %ymm1,%ymm9,%ymm6
1844 vpmuludq %ymm0,%ymm9,%ymm9
1845 vpaddq %ymm6,%ymm14,%ymm14
1846 vpaddq %ymm9,%ymm13,%ymm13
1847
1848 vpmuludq %ymm1,%ymm2,%ymm6
1849 vpmuludq %ymm0,%ymm2,%ymm2
1850 vpaddq %ymm6,%ymm15,%ymm15
1851 vpaddq %ymm2,%ymm14,%ymm14
1852 vpmuludq %ymm3,%ymm10,%ymm6
1853 vpmuludq %ymm4,%ymm10,%ymm2
1854 vpaddq %ymm6,%ymm12,%ymm12
1855 vpaddq %ymm2,%ymm13,%ymm13
1856
1857 vpmuludq %ymm3,%ymm5,%ymm3
1858 vpmuludq %ymm4,%ymm5,%ymm4
1859 vpaddq %ymm3,%ymm13,%ymm2
1860 vpaddq %ymm4,%ymm14,%ymm3
1861 vpmuludq 84(%rax),%ymm0,%ymm4
1862 vpmuludq %ymm1,%ymm5,%ymm0
1863 vmovdqa 64(%rcx),%ymm5
1864 vpaddq %ymm4,%ymm15,%ymm4
1865 vpaddq %ymm0,%ymm11,%ymm0
1866
1867
1868
1869
1870 vpsrldq $8,%ymm12,%ymm8
1871 vpsrldq $8,%ymm2,%ymm9
1872 vpsrldq $8,%ymm3,%ymm10
1873 vpsrldq $8,%ymm4,%ymm6
1874 vpsrldq $8,%ymm0,%ymm7
1875 vpaddq %ymm8,%ymm12,%ymm12
1876 vpaddq %ymm9,%ymm2,%ymm2
1877 vpaddq %ymm10,%ymm3,%ymm3
1878 vpaddq %ymm6,%ymm4,%ymm4
1879 vpaddq %ymm7,%ymm0,%ymm0
1880
1881 vpermq $0x2,%ymm3,%ymm10
1882 vpermq $0x2,%ymm4,%ymm6
1883 vpermq $0x2,%ymm0,%ymm7
1884 vpermq $0x2,%ymm12,%ymm8
1885 vpermq $0x2,%ymm2,%ymm9
1886 vpaddq %ymm10,%ymm3,%ymm3
1887 vpaddq %ymm6,%ymm4,%ymm4
1888 vpaddq %ymm7,%ymm0,%ymm0
1889 vpaddq %ymm8,%ymm12,%ymm12
1890 vpaddq %ymm9,%ymm2,%ymm2
1891
1892
1893
1894
1895 vpsrlq $26,%ymm3,%ymm14
1896 vpand %ymm5,%ymm3,%ymm3
1897 vpaddq %ymm14,%ymm4,%ymm4
1898
1899 vpsrlq $26,%ymm0,%ymm11
1900 vpand %ymm5,%ymm0,%ymm0
1901 vpaddq %ymm11,%ymm12,%ymm1
1902
1903 vpsrlq $26,%ymm4,%ymm15
1904 vpand %ymm5,%ymm4,%ymm4
1905
1906 vpsrlq $26,%ymm1,%ymm12
1907 vpand %ymm5,%ymm1,%ymm1
1908 vpaddq %ymm12,%ymm2,%ymm2
1909
1910 vpaddq %ymm15,%ymm0,%ymm0
1911 vpsllq $2,%ymm15,%ymm15
1912 vpaddq %ymm15,%ymm0,%ymm0
1913
1914 vpsrlq $26,%ymm2,%ymm13
1915 vpand %ymm5,%ymm2,%ymm2
1916 vpaddq %ymm13,%ymm3,%ymm3
1917
1918 vpsrlq $26,%ymm0,%ymm11
1919 vpand %ymm5,%ymm0,%ymm0
1920 vpaddq %ymm11,%ymm1,%ymm1
1921
1922 vpsrlq $26,%ymm3,%ymm14
1923 vpand %ymm5,%ymm3,%ymm3
1924 vpaddq %ymm14,%ymm4,%ymm4
1925
1926 vmovd %xmm0,-112(%rdi)
1927 vmovd %xmm1,-108(%rdi)
1928 vmovd %xmm2,-104(%rdi)
1929 vmovd %xmm3,-100(%rdi)
1930 vmovd %xmm4,-96(%rdi)
1931 leaq 8(%r11),%rsp
1932 .cfi_def_cfa %rsp,8
1933 vzeroupper
1934 .byte 0xf3,0xc3
1935 .cfi_endproc
1936 .size poly1305_blocks_avx2,.-poly1305_blocks_avx2
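
/*
 * Constant pool for the vector code: .Lmask24 and .Lmask26 are the
 * 24- and 26-bit limb masks, .L129 holds 2^24, which becomes the 2^128
 * padding bit once placed in the top limb, and .Lpermd_avx2 is the
 * lane permutation used with vpermd.  The .Lpermd_avx512, .L2_44_* and
 * .Lx_mask* tables belong to the AVX-512/base 2^44 variants generated
 * from the same perl script and appear to be unreferenced here; the
 * trailing .byte string is the CRYPTOGAMS banner.
 */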
1937 .align 64
1938 .Lconst:
1939 .Lmask24:
1940 .long 0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0
1941 .L129:
1942 .long 16777216,0,16777216,0,16777216,0,16777216,0
1943 .Lmask26:
1944 .long 0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0
1945 .Lpermd_avx2:
1946 .long 2,2,2,3,2,0,2,1
1947 .Lpermd_avx512:
1948 .long 0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7
1949
1950 .L2_44_inp_permd:
1951 .long 0,1,1,2,2,3,7,7
1952 .L2_44_inp_shift:
1953 .quad 0,12,24,64
1954 .L2_44_mask:
1955 .quad 0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff
1956 .L2_44_shift_rgt:
1957 .quad 44,44,42,64
1958 .L2_44_shift_lft:
1959 .quad 8,8,10,64
1960
1961 .align 64
1962 .Lx_mask44:
1963 .quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
1964 .quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
1965 .Lx_mask42:
1966 .quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
1967 .quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
1968 .byte 80,111,108,121,49,51,48,53,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
1969 .align 16
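/*
 * xor128_encrypt_n_pad(%rdi=out, %rsi=inp, %rdx=key-stream buffer,
 * %rcx=len): XOR len bytes of inp with the key stream at %rdx, write
 * the ciphertext to out, leave a copy of the ciphertext zero-padded to
 * a 16-byte boundary in the key-stream buffer (ready to be
 * authenticated), and return a pointer just past the padded data.
 */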
1970 .globl xor128_encrypt_n_pad
1971 .type xor128_encrypt_n_pad,@function
1972 .align 16
1973 xor128_encrypt_n_pad:
1974 .cfi_startproc
1975 subq %rdx,%rsi
1976 subq %rdx,%rdi
1977 movq %rcx,%r10
1978 shrq $4,%rcx
1979 jz .Ltail_enc
1980 nop
1981 .Loop_enc_xmm:
1982 movdqu (%rsi,%rdx,1),%xmm0
1983 pxor (%rdx),%xmm0
1984 movdqu %xmm0,(%rdi,%rdx,1)
1985 movdqa %xmm0,(%rdx)
1986 leaq 16(%rdx),%rdx
1987 decq %rcx
1988 jnz .Loop_enc_xmm
1989
1990 andq $15,%r10
1991 jz .Ldone_enc
1992
1993 .Ltail_enc:
1994 movq $16,%rcx
1995 subq %r10,%rcx
1996 xorl %eax,%eax
1997 .Loop_enc_byte:
1998 movb (%rsi,%rdx,1),%al
1999 xorb (%rdx),%al
2000 movb %al,(%rdi,%rdx,1)
2001 movb %al,(%rdx)
2002 leaq 1(%rdx),%rdx
2003 decq %r10
2004 jnz .Loop_enc_byte
2005
2006 xorl %eax,%eax
2007 .Loop_enc_pad:
2008 movb %al,(%rdx)
2009 leaq 1(%rdx),%rdx
2010 decq %rcx
2011 jnz .Loop_enc_pad
2012
2013 .Ldone_enc:
2014 movq %rdx,%rax
2015 .byte 0xf3,0xc3
2016 .cfi_endproc
2017 .size xor128_encrypt_n_pad,.-xor128_encrypt_n_pad
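
/*
 * xor128_decrypt_n_pad(%rdi=out, %rsi=inp, %rdx=key-stream buffer,
 * %rcx=len): decryption counterpart of the above: XOR the ciphertext
 * at inp with the key stream at %rdx, write the plaintext to out,
 * store the ciphertext zero-padded to a 16-byte boundary back into the
 * key-stream buffer for authentication, and return a pointer just past
 * the padded data.
 */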
2018
2019 .globl xor128_decrypt_n_pad
2020 .type xor128_decrypt_n_pad,@function
2021 .align 16
2022 xor128_decrypt_n_pad:
2023 .cfi_startproc
2024 subq %rdx,%rsi
2025 subq %rdx,%rdi
2026 movq %rcx,%r10
2027 shrq $4,%rcx
2028 jz .Ltail_dec
2029 nop
2030 .Loop_dec_xmm:
2031 movdqu (%rsi,%rdx,1),%xmm0
2032 movdqa (%rdx),%xmm1
2033 pxor %xmm0,%xmm1
2034 movdqu %xmm1,(%rdi,%rdx,1)
2035 movdqa %xmm0,(%rdx)
2036 leaq 16(%rdx),%rdx
2037 decq %rcx
2038 jnz .Loop_dec_xmm
2039
2040 pxor %xmm1,%xmm1
2041 andq $15,%r10
2042 jz .Ldone_dec
2043
2044 .Ltail_dec:
2045 movq $16,%rcx
2046 subq %r10,%rcx
2047 xorl %eax,%eax
2048 xorq %r11,%r11
2049 .Loop_dec_byte:
2050 movb (%rsi,%rdx,1),%r11b
2051 movb (%rdx),%al
2052 xorb %r11b,%al
2053 movb %al,(%rdi,%rdx,1)
2054 movb %r11b,(%rdx)
2055 leaq 1(%rdx),%rdx
2056 decq %r10
2057 jnz .Loop_dec_byte
2058
2059 xorl %eax,%eax
2060 .Loop_dec_pad:
2061 movb %al,(%rdx)
2062 leaq 1(%rdx),%rdx
2063 decq %rcx
2064 jnz .Loop_dec_pad
2065
2066 .Ldone_dec:
2067 movq %rdx,%rax
2068 .byte 0xf3,0xc3
2069 .cfi_endproc
2070 .size xor128_decrypt_n_pad,.-xor128_decrypt_n_pad