1 /* $FreeBSD$ */
2 /* Do not modify. This file is auto-generated from poly1305-x86.pl. */
3 #ifdef PIC
4 .text
5 .align 64
/*
 * poly1305_init: reset the state block and optionally load a key.
 *
 * Stack arguments (read at 20/24/28(%esp) once the four registers below
 * have been pushed):
 *   arg0 -> %edi  state block; its first 24 bytes are zeroed
 *   arg1 -> %esi  16 bytes of key material, or NULL
 *   arg2 -> %ebp  two-slot table that receives code pointers
 *
 * Returns 0 in %eax when arg1 is NULL (only the zeroing is done), else 1.
 * %ebp, %ebx, %esi and %edi are saved on entry and restored on exit.
 */
6 .globl poly1305_init
7 .type poly1305_init,@function
8 .align 16
9 poly1305_init:
10 .L_poly1305_init_begin:
11 pushl %ebp
12 pushl %ebx
13 pushl %esi
14 pushl %edi
15 movl 20(%esp),%edi
16 movl 24(%esp),%esi
17 movl 28(%esp),%ebp
/* Clear state words 0..5 (offsets 0-20). %eax stays 0 and doubles as the
   return value of the NULL-key path. */
18 xorl %eax,%eax
19 movl %eax,(%edi)
20 movl %eax,4(%edi)
21 movl %eax,8(%edi)
22 movl %eax,12(%edi)
23 movl %eax,16(%edi)
24 movl %eax,20(%edi)
25 cmpl $0,%esi
26 je .L000nokey
/* call/pop leaves the run-time address of .L001pic_point in %ebx so the
   symbols below can be addressed position-independently. */
27 call .L001pic_point
28 .L001pic_point:
29 popl %ebx
/* Default choice: the integer-only routines. */
30 leal poly1305_blocks-.L001pic_point(%ebx),%eax
31 leal poly1305_emit-.L001pic_point(%ebx),%edx
/* Switch to the SSE2 routines only when both bits of mask 0x05000000
   (83886080) are set in the first word of OPENSSL_ia32cap_P. */
32 leal OPENSSL_ia32cap_P-.L001pic_point(%ebx),%edi
33 movl (%edi),%ecx
34 andl $83886080,%ecx
35 cmpl $83886080,%ecx
36 jne .L002no_sse2
37 leal _poly1305_blocks_sse2-.L001pic_point(%ebx),%eax
38 leal _poly1305_emit_sse2-.L001pic_point(%ebx),%edx
/* Additionally select the AVX2 block routine when bit 5 (value 32) of the
   capability word at offset 8 is set; the emit routine stays the SSE2 one. */
39 movl 8(%edi),%ecx
40 testl $32,%ecx
41 jz .L002no_sse2
42 leal _poly1305_blocks_avx2-.L001pic_point(%ebx),%eax
43 .L002no_sse2:
/* %edi was reused for the capability vector; reload arg0. */
44 movl 20(%esp),%edi
/* Publish the chosen block routine in slot 0 and emit routine in slot 1. */
45 movl %eax,(%ebp)
46 movl %edx,4(%ebp)
/* Load the four key words, clear the top four bits of every word
   (0x0fffffff / 0x0ffffffc) and also the low two bits of words 1-3, then
   keep the result at state+24..36. */
47 movl (%esi),%eax
48 movl 4(%esi),%ebx
49 movl 8(%esi),%ecx
50 movl 12(%esi),%edx
51 andl $268435455,%eax
52 andl $268435452,%ebx
53 andl $268435452,%ecx
54 andl $268435452,%edx
55 movl %eax,24(%edi)
56 movl %ebx,28(%edi)
57 movl %ecx,32(%edi)
58 movl %edx,36(%edi)
/* Key installed: return 1. */
59 movl $1,%eax
60 .L000nokey:
61 popl %edi
62 popl %esi
63 popl %ebx
64 popl %ebp
65 ret
66 .size poly1305_init,.-.L_poly1305_init_begin
/*
 * poly1305_blocks: integer-only block routine.
 *
 * Stack arguments (at 20/24/28/32(%esp) after the four pushes):
 *   arg0 -> %edi  state block (five accumulator words at +0..+16, the four
 *                 masked key words at +24..+36)
 *   arg1 -> %esi  input pointer
 *   arg2 -> %ecx  input length in bytes; rounded down to a multiple of 16
 *   arg3          word added, with carry, into the fifth accumulator word
 *                 for every block (read at 96(%esp) inside the loop)
 *
 * .Lenter_blocks is a second entry point: _poly1305_blocks_sse2 jumps here
 * with %edi/%esi/%ecx already loaded and the same four registers pushed.
 *
 * Fix relative to the generated code: the length used to be masked with
 * -15 (0xfffffff1), which leaves bit 0 intact.  An odd length then produced
 * an end pointer that the 16-byte-stepping loop below can never hit (the
 * loop exits on equality only), so it ran past the input.  The mask is now
 * -16, matching _poly1305_blocks_sse2; multiple-of-16 lengths behave
 * exactly as before.
 *
 * Scratch frame (64 bytes; offsets after "subl $64,%esp"):
 *    0,12,16   saved accumulator words 0, 3, 4 for the current block
 *   20,24,28   low three words of the product
 *   36..48     key words r0..r3
 *   52..60     r1+(r1>>2), r2+(r2>>2), r3+(r3>>2)
 *   84         arg0,  92  end-of-input pointer (overwrites arg2),  96  arg3
 */
67 .globl poly1305_blocks
68 .type poly1305_blocks,@function
69 .align 16
70 poly1305_blocks:
71 .L_poly1305_blocks_begin:
72 pushl %ebp
73 pushl %ebx
74 pushl %esi
75 pushl %edi
76 movl 20(%esp),%edi
77 movl 24(%esp),%esi
78 movl 28(%esp),%ecx
79 .Lenter_blocks:
/* Round the length down to whole 16-byte blocks; nothing to do if zero. */
80 andl $-16,%ecx
81 jz .L003nodata
82 subl $64,%esp
83 movl 24(%edi),%eax
84 movl 28(%edi),%ebx
/* %ebp = input + length is the loop's termination pointer. */
85 leal (%esi,%ecx,1),%ebp
86 movl 32(%edi),%ecx
87 movl 36(%edi),%edx
88 movl %ebp,92(%esp)
89 movl %esi,%ebp
/* Spill r0..r3 and build r + (r>>2) for r1..r3.  poly1305_init cleared
   the low two bits of those words, so the shift loses nothing. */
90 movl %eax,36(%esp)
91 movl %ebx,%eax
92 shrl $2,%eax
93 movl %ebx,40(%esp)
94 addl %ebx,%eax
95 movl %ecx,%ebx
96 shrl $2,%ebx
97 movl %ecx,44(%esp)
98 addl %ecx,%ebx
99 movl %edx,%ecx
100 shrl $2,%ecx
101 movl %edx,48(%esp)
102 addl %edx,%ecx
103 movl %eax,52(%esp)
104 movl %ebx,56(%esp)
105 movl %ecx,60(%esp)
/* Accumulator words h0..h4 live in %eax,%ebx,%ecx,%esi,%edi. */
106 movl (%edi),%eax
107 movl 4(%edi),%ebx
108 movl 8(%edi),%ecx
109 movl 12(%edi),%esi
110 movl 16(%edi),%edi
111 jmp .L004loop
112 .align 32
113 .L004loop:
/* h += 16 input bytes; arg3 plus the carry goes into h4. */
114 addl (%ebp),%eax
115 adcl 4(%ebp),%ebx
116 adcl 8(%ebp),%ecx
117 adcl 12(%ebp),%esi
118 leal 16(%ebp),%ebp
119 adcl 96(%esp),%edi
120 movl %eax,(%esp)
121 movl %esi,12(%esp)
/* Product word 0: h0*r0 + h1*52.. + h2*56.. + h3*60.. terms, i.e.
   h0*[36] + h1*[60] + h2*[56] + h3*[52]; low word kept at 20(%esp). */
122 mull 36(%esp)
123 movl %edi,16(%esp)
124 movl %eax,%edi
125 movl %ebx,%eax
126 movl %edx,%esi
127 mull 60(%esp)
128 addl %eax,%edi
129 movl %ecx,%eax
130 adcl %edx,%esi
131 mull 56(%esp)
132 addl %eax,%edi
133 movl 12(%esp),%eax
134 adcl %edx,%esi
135 mull 52(%esp)
136 addl %eax,%edi
137 movl (%esp),%eax
138 adcl %edx,%esi
/* Product word 1: h0*[40] + h1*[36] + h2*[60] + h3*[56] + h4*[52];
   low word kept at 24(%esp). */
139 mull 40(%esp)
140 movl %edi,20(%esp)
141 xorl %edi,%edi
142 addl %eax,%esi
143 movl %ebx,%eax
144 adcl %edx,%edi
145 mull 36(%esp)
146 addl %eax,%esi
147 movl %ecx,%eax
148 adcl %edx,%edi
149 mull 60(%esp)
150 addl %eax,%esi
151 movl 12(%esp),%eax
152 adcl %edx,%edi
153 mull 56(%esp)
154 addl %eax,%esi
155 movl 16(%esp),%eax
156 adcl %edx,%edi
157 imull 52(%esp),%eax
158 addl %eax,%esi
159 movl (%esp),%eax
160 adcl $0,%edi
/* Product word 2: h0*[44] + h1*[40] + h2*[36] + h3*[60] + h4*[56];
   low word kept at 28(%esp). */
161 mull 44(%esp)
162 movl %esi,24(%esp)
163 xorl %esi,%esi
164 addl %eax,%edi
165 movl %ebx,%eax
166 adcl %edx,%esi
167 mull 40(%esp)
168 addl %eax,%edi
169 movl %ecx,%eax
170 adcl %edx,%esi
171 mull 36(%esp)
172 addl %eax,%edi
173 movl 12(%esp),%eax
174 adcl %edx,%esi
175 mull 60(%esp)
176 addl %eax,%edi
177 movl 16(%esp),%eax
178 adcl %edx,%esi
179 imull 56(%esp),%eax
180 addl %eax,%edi
181 movl (%esp),%eax
182 adcl $0,%esi
/* Product word 3: h0*[48] + h1*[44] + h2*[40] + h3*[36] + h4*[60];
   stays in %esi.  The top word is h4*[36] plus the carry. */
183 mull 48(%esp)
184 movl %edi,28(%esp)
185 xorl %edi,%edi
186 addl %eax,%esi
187 movl %ebx,%eax
188 adcl %edx,%edi
189 mull 44(%esp)
190 addl %eax,%esi
191 movl %ecx,%eax
192 adcl %edx,%edi
193 mull 40(%esp)
194 addl %eax,%esi
195 movl 12(%esp),%eax
196 adcl %edx,%edi
197 mull 36(%esp)
198 addl %eax,%esi
199 movl 16(%esp),%ecx
200 adcl %edx,%edi
201 movl %ecx,%edx
202 imull 60(%esp),%ecx
203 addl %ecx,%esi
204 movl 20(%esp),%eax
205 adcl $0,%edi
206 imull 36(%esp),%edx
207 addl %edi,%edx
208 movl 24(%esp),%ebx
209 movl 28(%esp),%ecx
/* Partial reduction: keep the low two bits of the top word as the new h4
   and fold the remaining bits, times 5, back into the low word. */
210 movl %edx,%edi
211 shrl $2,%edx
212 andl $3,%edi
213 leal (%edx,%edx,4),%edx
214 addl %edx,%eax
215 adcl $0,%ebx
216 adcl $0,%ecx
217 adcl $0,%esi
218 adcl $0,%edi
/* Continue until the input pointer reaches the end pointer. */
219 cmpl 92(%esp),%ebp
220 jne .L004loop
/* Reload arg0 (still at 84(%esp) before the frame is dropped) and write
   the five accumulator words back. */
221 movl 84(%esp),%edx
222 addl $64,%esp
223 movl %eax,(%edx)
224 movl %ebx,4(%edx)
225 movl %ecx,8(%edx)
226 movl %esi,12(%edx)
227 movl %edi,16(%edx)
228 .L003nodata:
229 popl %edi
230 popl %esi
231 popl %ebx
232 popl %ebp
233 ret
234 .size poly1305_blocks,.-.L_poly1305_blocks_begin
/*
 * poly1305_emit: integer-only finalisation.
 *
 * Stack arguments (at 20/24/28(%esp) after the four pushes):
 *   arg0 -> %ebp  state block; accumulator words are read from +0..+16
 *   arg1 -> %edi  16-byte output buffer (also used as scratch below)
 *   arg2 -> %ebp  (loaded later) four words added to the result
 *
 * .Lenter_emit is a second entry point: _poly1305_emit_sse2 jumps here
 * with arg0 already in %ebp and the same four registers pushed.
 */
235 .globl poly1305_emit
236 .type poly1305_emit,@function
237 .align 16
238 poly1305_emit:
239 .L_poly1305_emit_begin:
240 pushl %ebp
241 pushl %ebx
242 pushl %esi
243 pushl %edi
244 movl 20(%esp),%ebp
245 .Lenter_emit:
246 movl 24(%esp),%edi
247 movl (%ebp),%eax
248 movl 4(%ebp),%ebx
249 movl 8(%ebp),%ecx
250 movl 12(%ebp),%edx
251 movl 16(%ebp),%esi
/* Compute accumulator + 5 across all five words. */
252 addl $5,%eax
253 adcl $0,%ebx
254 adcl $0,%ecx
255 adcl $0,%edx
256 adcl $0,%esi
/* %esi = -(top word >> 2): used as a select mask.  It is all-ones when
   the shifted value is 1 and zero when it is 0 (assumes the top word is
   small enough that no other value occurs - TODO confirm with callers). */
257 shrl $2,%esi
258 negl %esi
/* Park (accumulator + 5) & mask in the output buffer. */
259 andl %esi,%eax
260 andl %esi,%ebx
261 andl %esi,%ecx
262 andl %esi,%edx
263 movl %eax,(%edi)
264 movl %ebx,4(%edi)
265 movl %ecx,8(%edi)
266 movl %edx,12(%edi)
/* Reload the untouched accumulator, keep it under the inverted mask and
   merge with the parked value: a branch-free choice between the two. */
267 notl %esi
268 movl (%ebp),%eax
269 movl 4(%ebp),%ebx
270 movl 8(%ebp),%ecx
271 movl 12(%ebp),%edx
272 movl 28(%esp),%ebp
273 andl %esi,%eax
274 andl %esi,%ebx
275 andl %esi,%ecx
276 andl %esi,%edx
277 orl (%edi),%eax
278 orl 4(%edi),%ebx
279 orl 8(%edi),%ecx
280 orl 12(%edi),%edx
/* Add the four words at arg2 with carry propagation (the final carry is
   dropped) and store the 16-byte result. */
281 addl (%ebp),%eax
282 adcl 4(%ebp),%ebx
283 adcl 8(%ebp),%ecx
284 adcl 12(%ebp),%edx
285 movl %eax,(%edi)
286 movl %ebx,4(%edi)
287 movl %ecx,8(%edi)
288 movl %edx,12(%edi)
289 popl %edi
290 popl %esi
291 popl %ebx
292 popl %ebp
293 ret
294 .size poly1305_emit,.-.L_poly1305_emit_begin
/*
 * _poly1305_init_sse2: file-local helper with a private register
 * convention (it is called from _poly1305_blocks_sse2).
 *
 * In:   %edi = state block; the 16 key bytes are read from 24(%edi)
 *       %ebx = constant area; 64(%ebx) is loaded and used as the limb mask
 *              (presumably 2^26-1 per lane; the constants are not visible
 *              in this part of the file)
 * Out:  144 bytes written at 48(%edi): one 16-byte vector per limb 0..4 at
 *       +0..+64, then limb*5 vectors for limbs 1..4 at +80..+128.
 *       %edi is restored to its entry value.
 *       %xmm7 still holds the 64(%ebx) mask on return.
 * Clobbers: %ebp (holds the caller's %esp), %ecx, %edx, %xmm0-%xmm7.
 * Works in a 16-byte aligned scratch frame carved below the entry %esp.
 */
295 .align 32
296 .type _poly1305_init_sse2,@function
297 .align 16
298 _poly1305_init_sse2:
299 movdqu 24(%edi),%xmm4
300 leal 48(%edi),%edi
301 movl %esp,%ebp
302 subl $224,%esp
303 andl $-16,%esp
/* Split the 128-bit key into five limbs taken at bit offsets 0, 26,
   52 (48+4), 78 (48+30) and 104 (13 bytes); the first four are masked. */
304 movq 64(%ebx),%xmm7
305 movdqa %xmm4,%xmm0
306 movdqa %xmm4,%xmm1
307 movdqa %xmm4,%xmm2
308 pand %xmm7,%xmm0
309 psrlq $26,%xmm1
310 psrldq $6,%xmm2
311 pand %xmm7,%xmm1
312 movdqa %xmm2,%xmm3
313 psrlq $4,%xmm2
314 psrlq $30,%xmm3
315 pand %xmm7,%xmm2
316 pand %xmm7,%xmm3
317 psrldq $13,%xmm4
318 leal 144(%esp),%edx
/* Two passes through the multiply below. */
319 movl $2,%ecx
320 .L005square:
/* Save the current limbs at 0..64(%esp) ... */
321 movdqa %xmm0,(%esp)
322 movdqa %xmm1,16(%esp)
323 movdqa %xmm2,32(%esp)
324 movdqa %xmm3,48(%esp)
325 movdqa %xmm4,64(%esp)
/* ... and limb*5 (x*4 + x) for limbs 1..4 at 80..128(%esp). */
326 movdqa %xmm1,%xmm6
327 movdqa %xmm2,%xmm5
328 pslld $2,%xmm6
329 pslld $2,%xmm5
330 paddd %xmm1,%xmm6
331 paddd %xmm2,%xmm5
332 movdqa %xmm6,80(%esp)
333 movdqa %xmm5,96(%esp)
334 movdqa %xmm3,%xmm6
335 movdqa %xmm4,%xmm5
336 pslld $2,%xmm6
337 pslld $2,%xmm5
338 paddd %xmm3,%xmm6
339 paddd %xmm4,%xmm5
340 movdqa %xmm6,112(%esp)
341 movdqa %xmm5,128(%esp)
/* Broadcast the low quadword of every limb into both halves and keep the
   copies at 0..64(%edx); they act as the second multiplicand. */
342 pshufd $68,%xmm0,%xmm6
343 movdqa %xmm1,%xmm5
344 pshufd $68,%xmm1,%xmm1
345 pshufd $68,%xmm2,%xmm2
346 pshufd $68,%xmm3,%xmm3
347 pshufd $68,%xmm4,%xmm4
348 movdqa %xmm6,(%edx)
349 movdqa %xmm1,16(%edx)
350 movdqa %xmm2,32(%edx)
351 movdqa %xmm3,48(%edx)
352 movdqa %xmm4,64(%edx)
/* Five-limb schoolbook multiply; the limb*5 values are used for the
   terms that wrap around.  Sums accumulate in %xmm0..%xmm4. */
353 pmuludq %xmm0,%xmm4
354 pmuludq %xmm0,%xmm3
355 pmuludq %xmm0,%xmm2
356 pmuludq %xmm0,%xmm1
357 pmuludq %xmm6,%xmm0
358 movdqa %xmm5,%xmm6
359 pmuludq 48(%edx),%xmm5
360 movdqa %xmm6,%xmm7
361 pmuludq 32(%edx),%xmm6
362 paddq %xmm5,%xmm4
363 movdqa %xmm7,%xmm5
364 pmuludq 16(%edx),%xmm7
365 paddq %xmm6,%xmm3
366 movdqa 80(%esp),%xmm6
367 pmuludq (%edx),%xmm5
368 paddq %xmm7,%xmm2
369 pmuludq 64(%edx),%xmm6
370 movdqa 32(%esp),%xmm7
371 paddq %xmm5,%xmm1
372 movdqa %xmm7,%xmm5
373 pmuludq 32(%edx),%xmm7
374 paddq %xmm6,%xmm0
375 movdqa %xmm5,%xmm6
376 pmuludq 16(%edx),%xmm5
377 paddq %xmm7,%xmm4
378 movdqa 96(%esp),%xmm7
379 pmuludq (%edx),%xmm6
380 paddq %xmm5,%xmm3
381 movdqa %xmm7,%xmm5
382 pmuludq 64(%edx),%xmm7
383 paddq %xmm6,%xmm2
384 pmuludq 48(%edx),%xmm5
385 movdqa 48(%esp),%xmm6
386 paddq %xmm7,%xmm1
387 movdqa %xmm6,%xmm7
388 pmuludq 16(%edx),%xmm6
389 paddq %xmm5,%xmm0
390 movdqa 112(%esp),%xmm5
391 pmuludq (%edx),%xmm7
392 paddq %xmm6,%xmm4
393 movdqa %xmm5,%xmm6
394 pmuludq 64(%edx),%xmm5
395 paddq %xmm7,%xmm3
396 movdqa %xmm6,%xmm7
397 pmuludq 48(%edx),%xmm6
398 paddq %xmm5,%xmm2
399 pmuludq 32(%edx),%xmm7
400 movdqa 64(%esp),%xmm5
401 paddq %xmm6,%xmm1
402 movdqa 128(%esp),%xmm6
403 pmuludq (%edx),%xmm5
404 paddq %xmm7,%xmm0
405 movdqa %xmm6,%xmm7
406 pmuludq 64(%edx),%xmm6
407 paddq %xmm5,%xmm4
408 movdqa %xmm7,%xmm5
409 pmuludq 16(%edx),%xmm7
410 paddq %xmm6,%xmm3
411 movdqa %xmm5,%xmm6
412 pmuludq 32(%edx),%xmm5
413 paddq %xmm7,%xmm0
414 pmuludq 48(%edx),%xmm6
415 movdqa 64(%ebx),%xmm7
416 paddq %xmm5,%xmm1
417 paddq %xmm6,%xmm2
/* Carry propagation: shift each sum right by 26 into the next limb and
   mask; the carry out of limb 4 re-enters limb 0 as carry + carry*4. */
418 movdqa %xmm3,%xmm5
419 pand %xmm7,%xmm3
420 psrlq $26,%xmm5
421 paddq %xmm4,%xmm5
422 movdqa %xmm0,%xmm6
423 pand %xmm7,%xmm0
424 psrlq $26,%xmm6
425 movdqa %xmm5,%xmm4
426 paddq %xmm1,%xmm6
427 psrlq $26,%xmm5
428 pand %xmm7,%xmm4
429 movdqa %xmm6,%xmm1
430 psrlq $26,%xmm6
431 paddd %xmm5,%xmm0
432 psllq $2,%xmm5
433 paddq %xmm2,%xmm6
434 paddq %xmm0,%xmm5
435 pand %xmm7,%xmm1
436 movdqa %xmm6,%xmm2
437 psrlq $26,%xmm6
438 pand %xmm7,%xmm2
439 paddd %xmm3,%xmm6
440 movdqa %xmm5,%xmm0
441 psrlq $26,%xmm5
442 movdqa %xmm6,%xmm3
443 psrlq $26,%xmm6
444 pand %xmm7,%xmm0
445 paddd %xmm5,%xmm1
446 pand %xmm7,%xmm3
447 paddd %xmm6,%xmm4
448 decl %ecx
449 jz .L006square_break
/* After the first pass: put the operand saved at 0..64(%esp) into the
   high quadword beside the new product, so the second pass multiplies
   two values at once. */
450 punpcklqdq (%esp),%xmm0
451 punpcklqdq 16(%esp),%xmm1
452 punpcklqdq 32(%esp),%xmm2
453 punpcklqdq 48(%esp),%xmm3
454 punpcklqdq 64(%esp),%xmm4
455 jmp .L005square
456 .L006square_break:
/* Interleave the second-pass products (moved to the odd dwords) with the
   second-pass operands and reorder the four dwords of each limb with
   pshufd $141 (source dwords 1,3,0,2). */
457 psllq $32,%xmm0
458 psllq $32,%xmm1
459 psllq $32,%xmm2
460 psllq $32,%xmm3
461 psllq $32,%xmm4
462 por (%esp),%xmm0
463 por 16(%esp),%xmm1
464 por 32(%esp),%xmm2
465 por 48(%esp),%xmm3
466 por 64(%esp),%xmm4
467 pshufd $141,%xmm0,%xmm0
468 pshufd $141,%xmm1,%xmm1
469 pshufd $141,%xmm2,%xmm2
470 pshufd $141,%xmm3,%xmm3
471 pshufd $141,%xmm4,%xmm4
/* Store the limb table (state+48 on entry), followed by limb*5 for
   limbs 1..4. */
472 movdqu %xmm0,(%edi)
473 movdqu %xmm1,16(%edi)
474 movdqu %xmm2,32(%edi)
475 movdqu %xmm3,48(%edi)
476 movdqu %xmm4,64(%edi)
477 movdqa %xmm1,%xmm6
478 movdqa %xmm2,%xmm5
479 pslld $2,%xmm6
480 pslld $2,%xmm5
481 paddd %xmm1,%xmm6
482 paddd %xmm2,%xmm5
483 movdqu %xmm6,80(%edi)
484 movdqu %xmm5,96(%edi)
485 movdqa %xmm3,%xmm6
486 movdqa %xmm4,%xmm5
487 pslld $2,%xmm6
488 pslld $2,%xmm5
489 paddd %xmm3,%xmm6
490 paddd %xmm4,%xmm5
491 movdqu %xmm6,112(%edi)
492 movdqu %xmm5,128(%edi)
/* Drop the scratch frame and undo the +48 adjustment of %edi. */
493 movl %ebp,%esp
494 leal -48(%edi),%edi
495 ret
496 .size _poly1305_init_sse2,.-_poly1305_init_sse2
/*
 * _poly1305_blocks_sse2: SSE2 block routine (installed by poly1305_init).
 *
 * Stack arguments (at 20/24/28/32(%esp) after the four pushes):
 *   arg0 -> %edi  state block: accumulator at +0..+16, a format flag at
 *                 +20, key words at +24..+36, limb table at +48 onwards
 *   arg1 -> %esi  input pointer
 *   arg2 -> %ecx  input length in bytes (masked with -16 on entry)
 *   arg3          shifted left by 24 and added to the top limb of an odd
 *                 leading block
 *
 * The flag at +20 is zero while the accumulator is kept as 32-bit words
 * and is set to 1 once it has been converted to 26-bit limbs here.
 * Inputs shorter than 64 bytes are handed to the integer code at
 * .Lenter_blocks as long as that conversion has not happened yet.
 *
 * Scratch frame (16-byte aligned, %ebp keeps the entry %esp):
 *     0..64(%esp)    temporaries
 *    80..144(%esp)   accumulator limbs 0..4
 *   160..224(%esp)   current input limbs (%eax points at 160(%esp))
 *   240..368(%esp)   table copy built with pshufd $238 (-144..-16(%edx))
 *   384..512(%esp)   table copy built with pshufd $68  (0..128(%edx))
 */
497 .align 32
498 .type _poly1305_blocks_sse2,@function
499 .align 16
500 _poly1305_blocks_sse2:
501 pushl %ebp
502 pushl %ebx
503 pushl %esi
504 pushl %edi
505 movl 20(%esp),%edi
506 movl 24(%esp),%esi
507 movl 28(%esp),%ecx
508 movl 20(%edi),%eax
/* Whole 16-byte blocks only; return at once when there are none. */
509 andl $-16,%ecx
510 jz .L007nodata
/* Fewer than 64 bytes and flag still zero: use the integer routine. */
511 cmpl $64,%ecx
512 jae .L008enter_sse2
513 testl %eax,%eax
514 jz .Lenter_blocks
515 .align 16
516 .L008enter_sse2:
/* %ebx = run-time address of .Lconst_sse2 (PIC call/pop idiom). */
517 call .L009pic_point
518 .L009pic_point:
519 popl %ebx
520 leal .Lconst_sse2-.L009pic_point(%ebx),%ebx
521 testl %eax,%eax
522 jnz .L010base2_26
/* First SIMD use: build the limb table, then convert the accumulator from
   32-bit words to five limbs using overlapping loads at byte offsets
   0, 3, 6, 9, 13 with shifts 0, 2, 4, 6, 0 (bit offsets 0/26/52/78/104).
   %xmm7 is not reloaded here; it still holds 64(%ebx) from the helper. */
523 call _poly1305_init_sse2
524 movl (%edi),%eax
525 movl 3(%edi),%ecx
526 movl 6(%edi),%edx
527 movl 9(%edi),%esi
528 movl 13(%edi),%ebp
529 movl $1,20(%edi)
530 shrl $2,%ecx
531 andl $67108863,%eax
532 shrl $4,%edx
533 andl $67108863,%ecx
534 shrl $6,%esi
535 andl $67108863,%edx
536 movd %eax,%xmm0
537 movd %ecx,%xmm1
538 movd %edx,%xmm2
539 movd %esi,%xmm3
540 movd %ebp,%xmm4
/* NOTE(review): the length is reloaded from the stack here without the
   -16 mask applied at function entry; harmless only if callers always
   pass a multiple of 16 - confirm. */
541 movl 24(%esp),%esi
542 movl 28(%esp),%ecx
543 jmp .L011base2_32
544 .align 16
545 .L010base2_26:
/* Accumulator is already stored as five limbs; load them and the mask. */
546 movd (%edi),%xmm0
547 movd 4(%edi),%xmm1
548 movd 8(%edi),%xmm2
549 movd 12(%edi),%xmm3
550 movd 16(%edi),%xmm4
551 movdqa 64(%ebx),%xmm7
552 .L011base2_32:
/* %eax = arg3 << 24; set up the aligned frame; %edi now points at the
   limb table (state+48). */
553 movl 32(%esp),%eax
554 movl %esp,%ebp
555 subl $528,%esp
556 andl $-16,%esp
557 leal 48(%edi),%edi
558 shll $24,%eax
/* An even number of blocks goes straight to the two-lane code. */
559 testl $31,%ecx
560 jz .L012even
/* Odd block count: absorb one block in the low lane.  Split the 16 input
   bytes into limbs and add them to the accumulator. */
561 movdqu (%esi),%xmm6
562 leal 16(%esi),%esi
563 movdqa %xmm6,%xmm5
564 pand %xmm7,%xmm6
565 paddd %xmm6,%xmm0
566 movdqa %xmm5,%xmm6
567 psrlq $26,%xmm5
568 psrldq $6,%xmm6
569 pand %xmm7,%xmm5
570 paddd %xmm5,%xmm1
571 movdqa %xmm6,%xmm5
572 psrlq $4,%xmm6
573 pand %xmm7,%xmm6
574 paddd %xmm6,%xmm2
575 movdqa %xmm5,%xmm6
576 psrlq $30,%xmm5
577 pand %xmm7,%xmm5
578 psrldq $7,%xmm6
579 paddd %xmm5,%xmm3
580 movd %eax,%xmm5
581 paddd %xmm6,%xmm4
582 movd 12(%edi),%xmm6
583 paddd %xmm5,%xmm4
584 movdqa %xmm0,(%esp)
585 movdqa %xmm1,16(%esp)
586 movdqa %xmm2,32(%esp)
587 movdqa %xmm3,48(%esp)
588 movdqa %xmm4,64(%esp)
/* Multiply by the fourth dword of every table entry (offsets 12, 28, 44,
   60, 76 and the limb*5 entries at 92, 108, 124, 140). */
589 pmuludq %xmm6,%xmm0
590 pmuludq %xmm6,%xmm1
591 pmuludq %xmm6,%xmm2
592 movd 28(%edi),%xmm5
593 pmuludq %xmm6,%xmm3
594 pmuludq %xmm6,%xmm4
595 movdqa %xmm5,%xmm6
596 pmuludq 48(%esp),%xmm5
597 movdqa %xmm6,%xmm7
598 pmuludq 32(%esp),%xmm6
599 paddq %xmm5,%xmm4
600 movdqa %xmm7,%xmm5
601 pmuludq 16(%esp),%xmm7
602 paddq %xmm6,%xmm3
603 movd 92(%edi),%xmm6
604 pmuludq (%esp),%xmm5
605 paddq %xmm7,%xmm2
606 pmuludq 64(%esp),%xmm6
607 movd 44(%edi),%xmm7
608 paddq %xmm5,%xmm1
609 movdqa %xmm7,%xmm5
610 pmuludq 32(%esp),%xmm7
611 paddq %xmm6,%xmm0
612 movdqa %xmm5,%xmm6
613 pmuludq 16(%esp),%xmm5
614 paddq %xmm7,%xmm4
615 movd 108(%edi),%xmm7
616 pmuludq (%esp),%xmm6
617 paddq %xmm5,%xmm3
618 movdqa %xmm7,%xmm5
619 pmuludq 64(%esp),%xmm7
620 paddq %xmm6,%xmm2
621 pmuludq 48(%esp),%xmm5
622 movd 60(%edi),%xmm6
623 paddq %xmm7,%xmm1
624 movdqa %xmm6,%xmm7
625 pmuludq 16(%esp),%xmm6
626 paddq %xmm5,%xmm0
627 movd 124(%edi),%xmm5
628 pmuludq (%esp),%xmm7
629 paddq %xmm6,%xmm4
630 movdqa %xmm5,%xmm6
631 pmuludq 64(%esp),%xmm5
632 paddq %xmm7,%xmm3
633 movdqa %xmm6,%xmm7
634 pmuludq 48(%esp),%xmm6
635 paddq %xmm5,%xmm2
636 pmuludq 32(%esp),%xmm7
637 movd 76(%edi),%xmm5
638 paddq %xmm6,%xmm1
639 movd 140(%edi),%xmm6
640 pmuludq (%esp),%xmm5
641 paddq %xmm7,%xmm0
642 movdqa %xmm6,%xmm7
643 pmuludq 64(%esp),%xmm6
644 paddq %xmm5,%xmm4
645 movdqa %xmm7,%xmm5
646 pmuludq 16(%esp),%xmm7
647 paddq %xmm6,%xmm3
648 movdqa %xmm5,%xmm6
649 pmuludq 32(%esp),%xmm5
650 paddq %xmm7,%xmm0
651 pmuludq 48(%esp),%xmm6
652 movdqa 64(%ebx),%xmm7
653 paddq %xmm5,%xmm1
654 paddq %xmm6,%xmm2
/* Carry propagation back to masked limbs (same pattern as the helper). */
655 movdqa %xmm3,%xmm5
656 pand %xmm7,%xmm3
657 psrlq $26,%xmm5
658 paddq %xmm4,%xmm5
659 movdqa %xmm0,%xmm6
660 pand %xmm7,%xmm0
661 psrlq $26,%xmm6
662 movdqa %xmm5,%xmm4
663 paddq %xmm1,%xmm6
664 psrlq $26,%xmm5
665 pand %xmm7,%xmm4
666 movdqa %xmm6,%xmm1
667 psrlq $26,%xmm6
668 paddd %xmm5,%xmm0
669 psllq $2,%xmm5
670 paddq %xmm2,%xmm6
671 paddq %xmm0,%xmm5
672 pand %xmm7,%xmm1
673 movdqa %xmm6,%xmm2
674 psrlq $26,%xmm6
675 pand %xmm7,%xmm2
676 paddd %xmm3,%xmm6
677 movdqa %xmm5,%xmm0
678 psrlq $26,%xmm5
679 movdqa %xmm6,%xmm3
680 psrlq $26,%xmm6
681 pand %xmm7,%xmm0
682 paddd %xmm5,%xmm1
683 pand %xmm7,%xmm3
684 paddd %xmm6,%xmm4
/* One block consumed; finished if that was all. */
685 subl $16,%ecx
686 jz .L013done
687 .L012even:
/* The flags of "subl $64,%ecx" are consumed by the cmovbl just below and
   by the jbe at the end of this set-up; none of the lea, mov or SSE
   instructions in between modify them.  On borrow (fewer than 64 bytes
   left) %esi is moved back by 32, presumably so the +32/+48 loads below
   stay inside the input. */
688 leal 384(%esp),%edx
689 leal -32(%esi),%eax
690 subl $64,%ecx
/* Expand every 16-byte table entry into two stack copies: low quadword
   duplicated (pshufd $68) at k(%edx), high quadword duplicated
   (pshufd $238) at k-144(%edx). */
691 movdqu (%edi),%xmm5
692 pshufd $68,%xmm5,%xmm6
693 cmovbl %eax,%esi
694 pshufd $238,%xmm5,%xmm5
695 movdqa %xmm6,(%edx)
696 leal 160(%esp),%eax
697 movdqu 16(%edi),%xmm6
698 movdqa %xmm5,-144(%edx)
699 pshufd $68,%xmm6,%xmm5
700 pshufd $238,%xmm6,%xmm6
701 movdqa %xmm5,16(%edx)
702 movdqu 32(%edi),%xmm5
703 movdqa %xmm6,-128(%edx)
704 pshufd $68,%xmm5,%xmm6
705 pshufd $238,%xmm5,%xmm5
706 movdqa %xmm6,32(%edx)
707 movdqu 48(%edi),%xmm6
708 movdqa %xmm5,-112(%edx)
709 pshufd $68,%xmm6,%xmm5
710 pshufd $238,%xmm6,%xmm6
711 movdqa %xmm5,48(%edx)
712 movdqu 64(%edi),%xmm5
713 movdqa %xmm6,-96(%edx)
714 pshufd $68,%xmm5,%xmm6
715 pshufd $238,%xmm5,%xmm5
716 movdqa %xmm6,64(%edx)
717 movdqu 80(%edi),%xmm6
718 movdqa %xmm5,-80(%edx)
719 pshufd $68,%xmm6,%xmm5
720 pshufd $238,%xmm6,%xmm6
721 movdqa %xmm5,80(%edx)
722 movdqu 96(%edi),%xmm5
723 movdqa %xmm6,-64(%edx)
724 pshufd $68,%xmm5,%xmm6
725 pshufd $238,%xmm5,%xmm5
726 movdqa %xmm6,96(%edx)
727 movdqu 112(%edi),%xmm6
728 movdqa %xmm5,-48(%edx)
729 pshufd $68,%xmm6,%xmm5
730 pshufd $238,%xmm6,%xmm6
731 movdqa %xmm5,112(%edx)
732 movdqu 128(%edi),%xmm5
733 movdqa %xmm6,-32(%edx)
734 pshufd $68,%xmm5,%xmm6
735 pshufd $238,%xmm5,%xmm5
736 movdqa %xmm6,128(%edx)
737 movdqa %xmm5,-16(%edx)
/* Load two blocks (at +32 and +48), spill accumulator limbs 2..4 and
   split the input into five two-lane limb vectors in %xmm5, %xmm6,
   %xmm2, %xmm3, %xmm4.  The constant at (%ebx) is OR-ed into the top
   limb (presumably the per-block pad bit; the constant itself is outside
   this view). */
738 movdqu 32(%esi),%xmm5
739 movdqu 48(%esi),%xmm6
740 leal 32(%esi),%esi
741 movdqa %xmm2,112(%esp)
742 movdqa %xmm3,128(%esp)
743 movdqa %xmm4,144(%esp)
744 movdqa %xmm5,%xmm2
745 movdqa %xmm6,%xmm3
746 psrldq $6,%xmm2
747 psrldq $6,%xmm3
748 movdqa %xmm5,%xmm4
749 punpcklqdq %xmm3,%xmm2
750 punpckhqdq %xmm6,%xmm4
751 punpcklqdq %xmm6,%xmm5
752 movdqa %xmm2,%xmm3
753 psrlq $4,%xmm2
754 psrlq $30,%xmm3
755 movdqa %xmm5,%xmm6
756 psrlq $40,%xmm4
757 psrlq $26,%xmm6
758 pand %xmm7,%xmm5
759 pand %xmm7,%xmm6
760 pand %xmm7,%xmm2
761 pand %xmm7,%xmm3
762 por (%ebx),%xmm4
763 movdqa %xmm0,80(%esp)
764 movdqa %xmm1,96(%esp)
/* Still using the flags of the subl at the top of .L012even. */
765 jbe .L014skip_loop
766 jmp .L015loop
767 .align 32
/* Main loop: 64 bytes per iteration.
   First half - multiply the two blocks already split into limbs by the
   table copy at -144..-16(%edx). */
768 .L015loop:
769 movdqa -144(%edx),%xmm7
770 movdqa %xmm6,16(%eax)
771 movdqa %xmm2,32(%eax)
772 movdqa %xmm3,48(%eax)
773 movdqa %xmm4,64(%eax)
774 movdqa %xmm5,%xmm1
775 pmuludq %xmm7,%xmm5
776 movdqa %xmm6,%xmm0
777 pmuludq %xmm7,%xmm6
778 pmuludq %xmm7,%xmm2
779 pmuludq %xmm7,%xmm3
780 pmuludq %xmm7,%xmm4
781 pmuludq -16(%edx),%xmm0
782 movdqa %xmm1,%xmm7
783 pmuludq -128(%edx),%xmm1
784 paddq %xmm5,%xmm0
785 movdqa %xmm7,%xmm5
786 pmuludq -112(%edx),%xmm7
787 paddq %xmm6,%xmm1
788 movdqa %xmm5,%xmm6
789 pmuludq -96(%edx),%xmm5
790 paddq %xmm7,%xmm2
791 movdqa 16(%eax),%xmm7
792 pmuludq -80(%edx),%xmm6
793 paddq %xmm5,%xmm3
794 movdqa %xmm7,%xmm5
795 pmuludq -128(%edx),%xmm7
796 paddq %xmm6,%xmm4
797 movdqa %xmm5,%xmm6
798 pmuludq -112(%edx),%xmm5
799 paddq %xmm7,%xmm2
800 movdqa 32(%eax),%xmm7
801 pmuludq -96(%edx),%xmm6
802 paddq %xmm5,%xmm3
803 movdqa %xmm7,%xmm5
804 pmuludq -32(%edx),%xmm7
805 paddq %xmm6,%xmm4
806 movdqa %xmm5,%xmm6
807 pmuludq -16(%edx),%xmm5
808 paddq %xmm7,%xmm0
809 movdqa %xmm6,%xmm7
810 pmuludq -128(%edx),%xmm6
811 paddq %xmm5,%xmm1
812 movdqa 48(%eax),%xmm5
813 pmuludq -112(%edx),%xmm7
814 paddq %xmm6,%xmm3
815 movdqa %xmm5,%xmm6
816 pmuludq -48(%edx),%xmm5
817 paddq %xmm7,%xmm4
818 movdqa %xmm6,%xmm7
819 pmuludq -32(%edx),%xmm6
820 paddq %xmm5,%xmm0
821 movdqa %xmm7,%xmm5
822 pmuludq -16(%edx),%xmm7
823 paddq %xmm6,%xmm1
824 movdqa 64(%eax),%xmm6
825 pmuludq -128(%edx),%xmm5
826 paddq %xmm7,%xmm2
827 movdqa %xmm6,%xmm7
828 pmuludq -16(%edx),%xmm6
829 paddq %xmm5,%xmm4
830 movdqa %xmm7,%xmm5
831 pmuludq -64(%edx),%xmm7
832 paddq %xmm6,%xmm3
833 movdqa %xmm5,%xmm6
834 pmuludq -48(%edx),%xmm5
835 paddq %xmm7,%xmm0
836 movdqa 64(%ebx),%xmm7
837 pmuludq -32(%edx),%xmm6
838 paddq %xmm5,%xmm1
839 paddq %xmm6,%xmm2
/* Load the two blocks that precede the pair just multiplied (offsets -32
   and -16 from the advanced pointer), split them into limbs and add the
   accumulator saved at 80..144(%esp). */
840 movdqu -32(%esi),%xmm5
841 movdqu -16(%esi),%xmm6
842 leal 32(%esi),%esi
843 movdqa %xmm2,32(%esp)
844 movdqa %xmm3,48(%esp)
845 movdqa %xmm4,64(%esp)
846 movdqa %xmm5,%xmm2
847 movdqa %xmm6,%xmm3
848 psrldq $6,%xmm2
849 psrldq $6,%xmm3
850 movdqa %xmm5,%xmm4
851 punpcklqdq %xmm3,%xmm2
852 punpckhqdq %xmm6,%xmm4
853 punpcklqdq %xmm6,%xmm5
854 movdqa %xmm2,%xmm3
855 psrlq $4,%xmm2
856 psrlq $30,%xmm3
857 movdqa %xmm5,%xmm6
858 psrlq $40,%xmm4
859 psrlq $26,%xmm6
860 pand %xmm7,%xmm5
861 pand %xmm7,%xmm6
862 pand %xmm7,%xmm2
863 pand %xmm7,%xmm3
864 por (%ebx),%xmm4
/* Length bookkeeping for the next iteration; the flags of this subl feed
   the cmovbl below and the ja at the bottom of the loop. */
865 leal -32(%esi),%eax
866 subl $64,%ecx
867 paddd 80(%esp),%xmm5
868 paddd 96(%esp),%xmm6
869 paddd 112(%esp),%xmm2
870 paddd 128(%esp),%xmm3
871 paddd 144(%esp),%xmm4
872 cmovbl %eax,%esi
873 leal 160(%esp),%eax
/* Second half - multiply (accumulator + blocks) by the table copy at
   0..128(%edx) and add the first-half products. */
874 movdqa (%edx),%xmm7
875 movdqa %xmm1,16(%esp)
876 movdqa %xmm6,16(%eax)
877 movdqa %xmm2,32(%eax)
878 movdqa %xmm3,48(%eax)
879 movdqa %xmm4,64(%eax)
880 movdqa %xmm5,%xmm1
881 pmuludq %xmm7,%xmm5
882 paddq %xmm0,%xmm5
883 movdqa %xmm6,%xmm0
884 pmuludq %xmm7,%xmm6
885 pmuludq %xmm7,%xmm2
886 pmuludq %xmm7,%xmm3
887 pmuludq %xmm7,%xmm4
888 paddq 16(%esp),%xmm6
889 paddq 32(%esp),%xmm2
890 paddq 48(%esp),%xmm3
891 paddq 64(%esp),%xmm4
892 pmuludq 128(%edx),%xmm0
893 movdqa %xmm1,%xmm7
894 pmuludq 16(%edx),%xmm1
895 paddq %xmm5,%xmm0
896 movdqa %xmm7,%xmm5
897 pmuludq 32(%edx),%xmm7
898 paddq %xmm6,%xmm1
899 movdqa %xmm5,%xmm6
900 pmuludq 48(%edx),%xmm5
901 paddq %xmm7,%xmm2
902 movdqa 16(%eax),%xmm7
903 pmuludq 64(%edx),%xmm6
904 paddq %xmm5,%xmm3
905 movdqa %xmm7,%xmm5
906 pmuludq 16(%edx),%xmm7
907 paddq %xmm6,%xmm4
908 movdqa %xmm5,%xmm6
909 pmuludq 32(%edx),%xmm5
910 paddq %xmm7,%xmm2
911 movdqa 32(%eax),%xmm7
912 pmuludq 48(%edx),%xmm6
913 paddq %xmm5,%xmm3
914 movdqa %xmm7,%xmm5
915 pmuludq 112(%edx),%xmm7
916 paddq %xmm6,%xmm4
917 movdqa %xmm5,%xmm6
918 pmuludq 128(%edx),%xmm5
919 paddq %xmm7,%xmm0
920 movdqa %xmm6,%xmm7
921 pmuludq 16(%edx),%xmm6
922 paddq %xmm5,%xmm1
923 movdqa 48(%eax),%xmm5
924 pmuludq 32(%edx),%xmm7
925 paddq %xmm6,%xmm3
926 movdqa %xmm5,%xmm6
927 pmuludq 96(%edx),%xmm5
928 paddq %xmm7,%xmm4
929 movdqa %xmm6,%xmm7
930 pmuludq 112(%edx),%xmm6
931 paddq %xmm5,%xmm0
932 movdqa %xmm7,%xmm5
933 pmuludq 128(%edx),%xmm7
934 paddq %xmm6,%xmm1
935 movdqa 64(%eax),%xmm6
936 pmuludq 16(%edx),%xmm5
937 paddq %xmm7,%xmm2
938 movdqa %xmm6,%xmm7
939 pmuludq 128(%edx),%xmm6
940 paddq %xmm5,%xmm4
941 movdqa %xmm7,%xmm5
942 pmuludq 80(%edx),%xmm7
943 paddq %xmm6,%xmm3
944 movdqa %xmm5,%xmm6
945 pmuludq 96(%edx),%xmm5
946 paddq %xmm7,%xmm0
947 movdqa 64(%ebx),%xmm7
948 pmuludq 112(%edx),%xmm6
949 paddq %xmm5,%xmm1
950 paddq %xmm6,%xmm2
/* Carry propagation: the accumulator is back in masked limbs. */
951 movdqa %xmm3,%xmm5
952 pand %xmm7,%xmm3
953 psrlq $26,%xmm5
954 paddq %xmm4,%xmm5
955 movdqa %xmm0,%xmm6
956 pand %xmm7,%xmm0
957 psrlq $26,%xmm6
958 movdqa %xmm5,%xmm4
959 paddq %xmm1,%xmm6
960 psrlq $26,%xmm5
961 pand %xmm7,%xmm4
962 movdqa %xmm6,%xmm1
963 psrlq $26,%xmm6
964 paddd %xmm5,%xmm0
965 psllq $2,%xmm5
966 paddq %xmm2,%xmm6
967 paddq %xmm0,%xmm5
968 pand %xmm7,%xmm1
969 movdqa %xmm6,%xmm2
970 psrlq $26,%xmm6
971 pand %xmm7,%xmm2
972 paddd %xmm3,%xmm6
973 movdqa %xmm5,%xmm0
974 psrlq $26,%xmm5
975 movdqa %xmm6,%xmm3
976 psrlq $26,%xmm6
977 pand %xmm7,%xmm0
978 paddd %xmm5,%xmm1
979 pand %xmm7,%xmm3
980 paddd %xmm6,%xmm4
/* Pre-load and split the next pair of blocks (same sequence as before
   the loop) and spill the accumulator. */
981 movdqu 32(%esi),%xmm5
982 movdqu 48(%esi),%xmm6
983 leal 32(%esi),%esi
984 movdqa %xmm2,112(%esp)
985 movdqa %xmm3,128(%esp)
986 movdqa %xmm4,144(%esp)
987 movdqa %xmm5,%xmm2
988 movdqa %xmm6,%xmm3
989 psrldq $6,%xmm2
990 psrldq $6,%xmm3
991 movdqa %xmm5,%xmm4
992 punpcklqdq %xmm3,%xmm2
993 punpckhqdq %xmm6,%xmm4
994 punpcklqdq %xmm6,%xmm5
995 movdqa %xmm2,%xmm3
996 psrlq $4,%xmm2
997 psrlq $30,%xmm3
998 movdqa %xmm5,%xmm6
999 psrlq $40,%xmm4
1000 psrlq $26,%xmm6
1001 pand %xmm7,%xmm5
1002 pand %xmm7,%xmm6
1003 pand %xmm7,%xmm2
1004 pand %xmm7,%xmm3
1005 por (%ebx),%xmm4
1006 movdqa %xmm0,80(%esp)
1007 movdqa %xmm1,96(%esp)
/* Loop while the subl at the middle of the iteration left more than zero
   bytes (unsigned "above"). */
1008 ja .L015loop
1009 .L014skip_loop:
/* Tail.  pshufd $16 places source dwords 0 and 1 in the two multiplier
   positions, so the two lanes are multiplied by different table values.
   After "addl $32,%ecx" a zero result means exactly two blocks remain:
   the accumulator is added to them first.  The flags of this addl are
   tested again by the jz further down. */
1010 pshufd $16,-144(%edx),%xmm7
1011 addl $32,%ecx
1012 jnz .L016long_tail
1013 paddd %xmm0,%xmm5
1014 paddd %xmm1,%xmm6
1015 paddd 112(%esp),%xmm2
1016 paddd 128(%esp),%xmm3
1017 paddd 144(%esp),%xmm4
1018 .L016long_tail:
/* Multiply the pending pair by the -144..-16(%edx) table copy. */
1019 movdqa %xmm5,(%eax)
1020 movdqa %xmm6,16(%eax)
1021 movdqa %xmm2,32(%eax)
1022 movdqa %xmm3,48(%eax)
1023 movdqa %xmm4,64(%eax)
1024 pmuludq %xmm7,%xmm5
1025 pmuludq %xmm7,%xmm6
1026 pmuludq %xmm7,%xmm2
1027 movdqa %xmm5,%xmm0
1028 pshufd $16,-128(%edx),%xmm5
1029 pmuludq %xmm7,%xmm3
1030 movdqa %xmm6,%xmm1
1031 pmuludq %xmm7,%xmm4
1032 movdqa %xmm5,%xmm6
1033 pmuludq 48(%eax),%xmm5
1034 movdqa %xmm6,%xmm7
1035 pmuludq 32(%eax),%xmm6
1036 paddq %xmm5,%xmm4
1037 movdqa %xmm7,%xmm5
1038 pmuludq 16(%eax),%xmm7
1039 paddq %xmm6,%xmm3
1040 pshufd $16,-64(%edx),%xmm6
1041 pmuludq (%eax),%xmm5
1042 paddq %xmm7,%xmm2
1043 pmuludq 64(%eax),%xmm6
1044 pshufd $16,-112(%edx),%xmm7
1045 paddq %xmm5,%xmm1
1046 movdqa %xmm7,%xmm5
1047 pmuludq 32(%eax),%xmm7
1048 paddq %xmm6,%xmm0
1049 movdqa %xmm5,%xmm6
1050 pmuludq 16(%eax),%xmm5
1051 paddq %xmm7,%xmm4
1052 pshufd $16,-48(%edx),%xmm7
1053 pmuludq (%eax),%xmm6
1054 paddq %xmm5,%xmm3
1055 movdqa %xmm7,%xmm5
1056 pmuludq 64(%eax),%xmm7
1057 paddq %xmm6,%xmm2
1058 pmuludq 48(%eax),%xmm5
1059 pshufd $16,-96(%edx),%xmm6
1060 paddq %xmm7,%xmm1
1061 movdqa %xmm6,%xmm7
1062 pmuludq 16(%eax),%xmm6
1063 paddq %xmm5,%xmm0
1064 pshufd $16,-32(%edx),%xmm5
1065 pmuludq (%eax),%xmm7
1066 paddq %xmm6,%xmm4
1067 movdqa %xmm5,%xmm6
1068 pmuludq 64(%eax),%xmm5
1069 paddq %xmm7,%xmm3
1070 movdqa %xmm6,%xmm7
1071 pmuludq 48(%eax),%xmm6
1072 paddq %xmm5,%xmm2
1073 pmuludq 32(%eax),%xmm7
1074 pshufd $16,-80(%edx),%xmm5
1075 paddq %xmm6,%xmm1
1076 pshufd $16,-16(%edx),%xmm6
1077 pmuludq (%eax),%xmm5
1078 paddq %xmm7,%xmm0
1079 movdqa %xmm6,%xmm7
1080 pmuludq 64(%eax),%xmm6
1081 paddq %xmm5,%xmm4
1082 movdqa %xmm7,%xmm5
1083 pmuludq 16(%eax),%xmm7
1084 paddq %xmm6,%xmm3
1085 movdqa %xmm5,%xmm6
1086 pmuludq 32(%eax),%xmm5
1087 paddq %xmm7,%xmm0
1088 pmuludq 48(%eax),%xmm6
1089 movdqa 64(%ebx),%xmm7
1090 paddq %xmm5,%xmm1
1091 paddq %xmm6,%xmm2
/* Zero flag still comes from the addl at .L014skip_loop: only two blocks
   were left, so skip the second multiply. */
1092 jz .L017short_tail
/* Four blocks were left: load the earlier pair, add the accumulator and
   multiply by the 0..128(%edx) table copy, summing into the products. */
1093 movdqu -32(%esi),%xmm5
1094 movdqu -16(%esi),%xmm6
1095 leal 32(%esi),%esi
1096 movdqa %xmm2,32(%esp)
1097 movdqa %xmm3,48(%esp)
1098 movdqa %xmm4,64(%esp)
1099 movdqa %xmm5,%xmm2
1100 movdqa %xmm6,%xmm3
1101 psrldq $6,%xmm2
1102 psrldq $6,%xmm3
1103 movdqa %xmm5,%xmm4
1104 punpcklqdq %xmm3,%xmm2
1105 punpckhqdq %xmm6,%xmm4
1106 punpcklqdq %xmm6,%xmm5
1107 movdqa %xmm2,%xmm3
1108 psrlq $4,%xmm2
1109 psrlq $30,%xmm3
1110 movdqa %xmm5,%xmm6
1111 psrlq $40,%xmm4
1112 psrlq $26,%xmm6
1113 pand %xmm7,%xmm5
1114 pand %xmm7,%xmm6
1115 pand %xmm7,%xmm2
1116 pand %xmm7,%xmm3
1117 por (%ebx),%xmm4
1118 pshufd $16,(%edx),%xmm7
1119 paddd 80(%esp),%xmm5
1120 paddd 96(%esp),%xmm6
1121 paddd 112(%esp),%xmm2
1122 paddd 128(%esp),%xmm3
1123 paddd 144(%esp),%xmm4
1124 movdqa %xmm5,(%esp)
1125 pmuludq %xmm7,%xmm5
1126 movdqa %xmm6,16(%esp)
1127 pmuludq %xmm7,%xmm6
1128 paddq %xmm5,%xmm0
1129 movdqa %xmm2,%xmm5
1130 pmuludq %xmm7,%xmm2
1131 paddq %xmm6,%xmm1
1132 movdqa %xmm3,%xmm6
1133 pmuludq %xmm7,%xmm3
1134 paddq 32(%esp),%xmm2
1135 movdqa %xmm5,32(%esp)
1136 pshufd $16,16(%edx),%xmm5
1137 paddq 48(%esp),%xmm3
1138 movdqa %xmm6,48(%esp)
1139 movdqa %xmm4,%xmm6
1140 pmuludq %xmm7,%xmm4
1141 paddq 64(%esp),%xmm4
1142 movdqa %xmm6,64(%esp)
1143 movdqa %xmm5,%xmm6
1144 pmuludq 48(%esp),%xmm5
1145 movdqa %xmm6,%xmm7
1146 pmuludq 32(%esp),%xmm6
1147 paddq %xmm5,%xmm4
1148 movdqa %xmm7,%xmm5
1149 pmuludq 16(%esp),%xmm7
1150 paddq %xmm6,%xmm3
1151 pshufd $16,80(%edx),%xmm6
1152 pmuludq (%esp),%xmm5
1153 paddq %xmm7,%xmm2
1154 pmuludq 64(%esp),%xmm6
1155 pshufd $16,32(%edx),%xmm7
1156 paddq %xmm5,%xmm1
1157 movdqa %xmm7,%xmm5
1158 pmuludq 32(%esp),%xmm7
1159 paddq %xmm6,%xmm0
1160 movdqa %xmm5,%xmm6
1161 pmuludq 16(%esp),%xmm5
1162 paddq %xmm7,%xmm4
1163 pshufd $16,96(%edx),%xmm7
1164 pmuludq (%esp),%xmm6
1165 paddq %xmm5,%xmm3
1166 movdqa %xmm7,%xmm5
1167 pmuludq 64(%esp),%xmm7
1168 paddq %xmm6,%xmm2
1169 pmuludq 48(%esp),%xmm5
1170 pshufd $16,48(%edx),%xmm6
1171 paddq %xmm7,%xmm1
1172 movdqa %xmm6,%xmm7
1173 pmuludq 16(%esp),%xmm6
1174 paddq %xmm5,%xmm0
1175 pshufd $16,112(%edx),%xmm5
1176 pmuludq (%esp),%xmm7
1177 paddq %xmm6,%xmm4
1178 movdqa %xmm5,%xmm6
1179 pmuludq 64(%esp),%xmm5
1180 paddq %xmm7,%xmm3
1181 movdqa %xmm6,%xmm7
1182 pmuludq 48(%esp),%xmm6
1183 paddq %xmm5,%xmm2
1184 pmuludq 32(%esp),%xmm7
1185 pshufd $16,64(%edx),%xmm5
1186 paddq %xmm6,%xmm1
1187 pshufd $16,128(%edx),%xmm6
1188 pmuludq (%esp),%xmm5
1189 paddq %xmm7,%xmm0
1190 movdqa %xmm6,%xmm7
1191 pmuludq 64(%esp),%xmm6
1192 paddq %xmm5,%xmm4
1193 movdqa %xmm7,%xmm5
1194 pmuludq 16(%esp),%xmm7
1195 paddq %xmm6,%xmm3
1196 movdqa %xmm5,%xmm6
1197 pmuludq 32(%esp),%xmm5
1198 paddq %xmm7,%xmm0
1199 pmuludq 48(%esp),%xmm6
1200 movdqa 64(%ebx),%xmm7
1201 paddq %xmm5,%xmm1
1202 paddq %xmm6,%xmm2
1203 .L017short_tail:
/* Horizontal add: pshufd $78 swaps the quadwords, so each sum ends up in
   the low quadword.  Then one more carry propagation. */
1204 pshufd $78,%xmm4,%xmm6
1205 pshufd $78,%xmm3,%xmm5
1206 paddq %xmm6,%xmm4
1207 paddq %xmm5,%xmm3
1208 pshufd $78,%xmm0,%xmm6
1209 pshufd $78,%xmm1,%xmm5
1210 paddq %xmm6,%xmm0
1211 paddq %xmm5,%xmm1
1212 pshufd $78,%xmm2,%xmm6
1213 movdqa %xmm3,%xmm5
1214 pand %xmm7,%xmm3
1215 psrlq $26,%xmm5
1216 paddq %xmm6,%xmm2
1217 paddq %xmm4,%xmm5
1218 movdqa %xmm0,%xmm6
1219 pand %xmm7,%xmm0
1220 psrlq $26,%xmm6
1221 movdqa %xmm5,%xmm4
1222 paddq %xmm1,%xmm6
1223 psrlq $26,%xmm5
1224 pand %xmm7,%xmm4
1225 movdqa %xmm6,%xmm1
1226 psrlq $26,%xmm6
1227 paddd %xmm5,%xmm0
1228 psllq $2,%xmm5
1229 paddq %xmm2,%xmm6
1230 paddq %xmm0,%xmm5
1231 pand %xmm7,%xmm1
1232 movdqa %xmm6,%xmm2
1233 psrlq $26,%xmm6
1234 pand %xmm7,%xmm2
1235 paddd %xmm3,%xmm6
1236 movdqa %xmm5,%xmm0
1237 psrlq $26,%xmm5
1238 movdqa %xmm6,%xmm3
1239 psrlq $26,%xmm6
1240 pand %xmm7,%xmm0
1241 paddd %xmm5,%xmm1
1242 pand %xmm7,%xmm3
1243 paddd %xmm6,%xmm4
1244 .L013done:
/* Store the five limbs at state+0..+16 (%edi points at state+48) and
   release the scratch frame. */
1245 movd %xmm0,-48(%edi)
1246 movd %xmm1,-44(%edi)
1247 movd %xmm2,-40(%edi)
1248 movd %xmm3,-36(%edi)
1249 movd %xmm4,-32(%edi)
1250 movl %ebp,%esp
1251 .L007nodata:
1252 popl %edi
1253 popl %esi
1254 popl %ebx
1255 popl %ebp
1256 ret
1257 .size _poly1305_blocks_sse2,.-_poly1305_blocks_sse2
/*
 * _poly1305_emit_sse2: finalisation counterpart of _poly1305_blocks_sse2.
 *
 * Stack arguments (at 20/24/28(%esp) after the four pushes):
 *   arg0 -> %ebp  state block
 *   arg1 -> %edi  16-byte output buffer (also used as scratch)
 *   arg2 -> %ebp  (loaded later) four words added to the result
 *
 * When the flag at state+20 is zero the accumulator is still held as
 * 32-bit words and control transfers to .Lenter_emit in poly1305_emit.
 * Otherwise the five limbs at state+0..+16 are first packed back into
 * 32-bit words.
 */
1258 .align 32
1259 .type _poly1305_emit_sse2,@function
1260 .align 16
1261 _poly1305_emit_sse2:
1262 pushl %ebp
1263 pushl %ebx
1264 pushl %esi
1265 pushl %edi
1266 movl 20(%esp),%ebp
1267 cmpl $0,20(%ebp)
1268 je .Lenter_emit
1269 movl (%ebp),%eax
1270 movl 4(%ebp),%edi
1271 movl 8(%ebp),%ecx
1272 movl 12(%ebp),%edx
1273 movl 16(%ebp),%esi
/* Pack limbs into words: limb k is split by the shift pairs 26/6, 20/12,
   14/18 and 8/24; the low part is added to the previous word and the
   carry is propagated into the high part. */
1274 movl %edi,%ebx
1275 shll $26,%edi
1276 shrl $6,%ebx
1277 addl %edi,%eax
1278 movl %ecx,%edi
1279 adcl $0,%ebx
1280 shll $20,%edi
1281 shrl $12,%ecx
1282 addl %edi,%ebx
1283 movl %edx,%edi
1284 adcl $0,%ecx
1285 shll $14,%edi
1286 shrl $18,%edx
1287 addl %edi,%ecx
1288 movl %esi,%edi
1289 adcl $0,%edx
1290 shll $8,%edi
1291 shrl $24,%esi
1292 addl %edi,%edx
1293 adcl $0,%esi
/* Keep the low two bits of the top word; fold the rest, times 5, into the
   low word.  %edi and %ebp are then loaded with arg1 and arg2. */
1294 movl %esi,%edi
1295 andl $3,%esi
1296 shrl $2,%edi
1297 leal (%edi,%edi,4),%ebp
1298 movl 24(%esp),%edi
1299 addl %ebp,%eax
1300 movl 28(%esp),%ebp
1301 adcl $0,%ebx
1302 adcl $0,%ecx
1303 adcl $0,%edx
1304 adcl $0,%esi
/* Stash the value in %xmm0-%xmm3 while computing value + 5 in place. */
1305 movd %eax,%xmm0
1306 addl $5,%eax
1307 movd %ebx,%xmm1
1308 adcl $0,%ebx
1309 movd %ecx,%xmm2
1310 adcl $0,%ecx
1311 movd %edx,%xmm3
1312 adcl $0,%edx
1313 adcl $0,%esi
/* %esi = -(top word >> 2) is the select mask, as in poly1305_emit. */
1314 shrl $2,%esi
1315 negl %esi
1316 andl %esi,%eax
1317 andl %esi,%ebx
1318 andl %esi,%ecx
1319 andl %esi,%edx
/* Park (value + 5) & mask in the output buffer, bring the stashed value
   back, keep it under the inverted mask and merge the two. */
1320 movl %eax,(%edi)
1321 movd %xmm0,%eax
1322 movl %ebx,4(%edi)
1323 movd %xmm1,%ebx
1324 movl %ecx,8(%edi)
1325 movd %xmm2,%ecx
1326 movl %edx,12(%edi)
1327 movd %xmm3,%edx
1328 notl %esi
1329 andl %esi,%eax
1330 andl %esi,%ebx
1331 orl (%edi),%eax
1332 andl %esi,%ecx
1333 orl 4(%edi),%ebx
1334 andl %esi,%edx
1335 orl 8(%edi),%ecx
1336 orl 12(%edi),%edx
/* Add the four words at arg2 with carry propagation (the final carry is
   dropped) and store the 16-byte result. */
1337 addl (%ebp),%eax
1338 adcl 4(%ebp),%ebx
1339 movl %eax,(%edi)
1340 adcl 8(%ebp),%ecx
1341 movl %ebx,4(%edi)
1342 adcl 12(%ebp),%edx
1343 movl %ecx,8(%edi)
1344 movl %edx,12(%edi)
1345 popl %edi
1346 popl %esi
1347 popl %ebx
1348 popl %ebp
1349 ret
1350 .size _poly1305_emit_sse2,.-_poly1305_emit_sse2
/*
 * _poly1305_init_avx2 -- file-local helper (no .globl directive); it is
 * called from _poly1305_blocks_avx2.
 *
 * In:   %edi  base pointer; the 16 bytes at 24(%edi) are the input value
 *       %ebx  constant-table pointer; 64(%ebx) is loaded as the limb mask
 * Out:  nine 16-byte rows written at 48..191 relative to the incoming %edi:
 *       rows 0-4 hold the five limbs, rows 5-8 hold limbs 1-4 multiplied by 5
 *       %edi and %esp are restored before returning
 * Clobbers: %ebp (holds the incoming %esp), %ecx, %edx, %xmm0-%xmm7, flags
 */
1351 .align 32
1352 .type _poly1305_init_avx2,@function
1353 .align 16
1354 _poly1305_init_avx2:
/* Load the 128-bit input, point %edi at the output rows, and carve out a
 * 16-byte-aligned scratch frame of at least 224 bytes. */
1355 vmovdqu 24(%edi),%xmm4
1356 leal 48(%edi),%edi
1357 movl %esp,%ebp
1358 subl $224,%esp
1359 andl $-16,%esp
/* Split the value into five limbs taken at bit offsets 0, 26, 52, 78 and
 * 104 (byte shift 6 + bit shifts 4/30, byte shift 13); the first four are
 * masked with 64(%ebx). */
1360 vmovdqa 64(%ebx),%xmm7
1361 vpand %xmm7,%xmm4,%xmm0
1362 vpsrlq $26,%xmm4,%xmm1
1363 vpsrldq $6,%xmm4,%xmm3
1364 vpand %xmm7,%xmm1,%xmm1
1365 vpsrlq $4,%xmm3,%xmm2
1366 vpsrlq $30,%xmm3,%xmm3
1367 vpand %xmm7,%xmm2,%xmm2
1368 vpand %xmm7,%xmm3,%xmm3
1369 vpsrldq $13,%xmm4,%xmm4
/* %edx -> second scratch area (broadcast operands); %ecx = two passes. */
1370 leal 144(%esp),%edx
1371 movl $2,%ecx
1372 .L018square:
/* Spill the current limbs to 0..64(%esp) ... */
1373 vmovdqa %xmm0,(%esp)
1374 vmovdqa %xmm1,16(%esp)
1375 vmovdqa %xmm2,32(%esp)
1376 vmovdqa %xmm3,48(%esp)
1377 vmovdqa %xmm4,64(%esp)
/* ... and limbs 1-4 times 5 (x<<2 + x) to 80..128(%esp). */
1378 vpslld $2,%xmm1,%xmm6
1379 vpslld $2,%xmm2,%xmm5
1380 vpaddd %xmm1,%xmm6,%xmm6
1381 vpaddd %xmm2,%xmm5,%xmm5
1382 vmovdqa %xmm6,80(%esp)
1383 vmovdqa %xmm5,96(%esp)
1384 vpslld $2,%xmm3,%xmm6
1385 vpslld $2,%xmm4,%xmm5
1386 vpaddd %xmm3,%xmm6,%xmm6
1387 vpaddd %xmm4,%xmm5,%xmm5
1388 vmovdqa %xmm6,112(%esp)
1389 vmovdqa %xmm5,128(%esp)
/* Broadcast the low qword of every limb (vpshufd $68) into 0..64(%edx);
 * %xmm6 keeps the un-broadcast copy of limb 1. */
1390 vpshufd $68,%xmm0,%xmm5
1391 vmovdqa %xmm1,%xmm6
1392 vpshufd $68,%xmm1,%xmm1
1393 vpshufd $68,%xmm2,%xmm2
1394 vpshufd $68,%xmm3,%xmm3
1395 vpshufd $68,%xmm4,%xmm4
1396 vmovdqa %xmm5,(%edx)
1397 vmovdqa %xmm1,16(%edx)
1398 vmovdqa %xmm2,32(%edx)
1399 vmovdqa %xmm3,48(%edx)
1400 vmovdqa %xmm4,64(%edx)
/* 5x5 limb product accumulated into %xmm0-%xmm4 (result limbs 0-4).
 * Partial products that would land above limb 4 use the times-5 copies
 * from 80..128(%esp) and are folded into the lower limbs. */
1401 vpmuludq %xmm0,%xmm4,%xmm4
1402 vpmuludq %xmm0,%xmm3,%xmm3
1403 vpmuludq %xmm0,%xmm2,%xmm2
1404 vpmuludq %xmm0,%xmm1,%xmm1
1405 vpmuludq %xmm0,%xmm5,%xmm0
/* Terms with limb 1 (and 5*limb 1 from 80(%esp)). */
1406 vpmuludq 48(%edx),%xmm6,%xmm5
1407 vpaddq %xmm5,%xmm4,%xmm4
1408 vpmuludq 32(%edx),%xmm6,%xmm7
1409 vpaddq %xmm7,%xmm3,%xmm3
1410 vpmuludq 16(%edx),%xmm6,%xmm5
1411 vpaddq %xmm5,%xmm2,%xmm2
1412 vmovdqa 80(%esp),%xmm7
1413 vpmuludq (%edx),%xmm6,%xmm6
1414 vpaddq %xmm6,%xmm1,%xmm1
/* Terms with limb 2 (and 5*limb 2 from 96(%esp)). */
1415 vmovdqa 32(%esp),%xmm5
1416 vpmuludq 64(%edx),%xmm7,%xmm7
1417 vpaddq %xmm7,%xmm0,%xmm0
1418 vpmuludq 32(%edx),%xmm5,%xmm6
1419 vpaddq %xmm6,%xmm4,%xmm4
1420 vpmuludq 16(%edx),%xmm5,%xmm7
1421 vpaddq %xmm7,%xmm3,%xmm3
1422 vmovdqa 96(%esp),%xmm6
1423 vpmuludq (%edx),%xmm5,%xmm5
1424 vpaddq %xmm5,%xmm2,%xmm2
1425 vpmuludq 64(%edx),%xmm6,%xmm7
1426 vpaddq %xmm7,%xmm1,%xmm1
/* Terms with limb 3 (and 5*limb 3 from 112(%esp)). */
1427 vmovdqa 48(%esp),%xmm5
1428 vpmuludq 48(%edx),%xmm6,%xmm6
1429 vpaddq %xmm6,%xmm0,%xmm0
1430 vpmuludq 16(%edx),%xmm5,%xmm7
1431 vpaddq %xmm7,%xmm4,%xmm4
1432 vmovdqa 112(%esp),%xmm6
1433 vpmuludq (%edx),%xmm5,%xmm5
1434 vpaddq %xmm5,%xmm3,%xmm3
1435 vpmuludq 64(%edx),%xmm6,%xmm7
1436 vpaddq %xmm7,%xmm2,%xmm2
1437 vpmuludq 48(%edx),%xmm6,%xmm5
1438 vpaddq %xmm5,%xmm1,%xmm1
/* Terms with limb 4 (and 5*limb 4 from 128(%esp)). */
1439 vmovdqa 64(%esp),%xmm7
1440 vpmuludq 32(%edx),%xmm6,%xmm6
1441 vpaddq %xmm6,%xmm0,%xmm0
1442 vmovdqa 128(%esp),%xmm5
1443 vpmuludq (%edx),%xmm7,%xmm7
1444 vpaddq %xmm7,%xmm4,%xmm4
1445 vpmuludq 64(%edx),%xmm5,%xmm6
1446 vpaddq %xmm6,%xmm3,%xmm3
1447 vpmuludq 16(%edx),%xmm5,%xmm7
1448 vpaddq %xmm7,%xmm0,%xmm0
1449 vpmuludq 32(%edx),%xmm5,%xmm6
1450 vpaddq %xmm6,%xmm1,%xmm1
1451 vmovdqa 64(%ebx),%xmm7
1452 vpmuludq 48(%edx),%xmm5,%xmm5
1453 vpaddq %xmm5,%xmm2,%xmm2
/* Carry propagation: each limb is cut back with the mask in %xmm7 and the
 * bits above 26 are added to the next limb.  The carry out of limb 4 is
 * added to limb 0 five times in total (once, then again shifted left by 2). */
1454 vpsrlq $26,%xmm3,%xmm5
1455 vpand %xmm7,%xmm3,%xmm3
1456 vpsrlq $26,%xmm0,%xmm6
1457 vpand %xmm7,%xmm0,%xmm0
1458 vpaddq %xmm5,%xmm4,%xmm4
1459 vpaddq %xmm6,%xmm1,%xmm1
1460 vpsrlq $26,%xmm4,%xmm5
1461 vpand %xmm7,%xmm4,%xmm4
1462 vpsrlq $26,%xmm1,%xmm6
1463 vpand %xmm7,%xmm1,%xmm1
1464 vpaddq %xmm6,%xmm2,%xmm2
1465 vpaddd %xmm5,%xmm0,%xmm0
1466 vpsllq $2,%xmm5,%xmm5
1467 vpsrlq $26,%xmm2,%xmm6
1468 vpand %xmm7,%xmm2,%xmm2
1469 vpaddd %xmm5,%xmm0,%xmm0
1470 vpaddd %xmm6,%xmm3,%xmm3
1471 vpsrlq $26,%xmm3,%xmm6
1472 vpsrlq $26,%xmm0,%xmm5
1473 vpand %xmm7,%xmm0,%xmm0
1474 vpand %xmm7,%xmm3,%xmm3
1475 vpaddd %xmm5,%xmm1,%xmm1
1476 vpaddd %xmm6,%xmm4,%xmm4
/* After the first pass, pair the fresh result (low qword) with the limbs
 * saved at 0..64(%esp) (high qword) and go round once more. */
1477 decl %ecx
1478 jz .L019square_break
1479 vpunpcklqdq (%esp),%xmm0,%xmm0
1480 vpunpcklqdq 16(%esp),%xmm1,%xmm1
1481 vpunpcklqdq 32(%esp),%xmm2,%xmm2
1482 vpunpcklqdq 48(%esp),%xmm3,%xmm3
1483 vpunpcklqdq 64(%esp),%xmm4,%xmm4
1484 jmp .L018square
1485 .L019square_break:
/* Move the second-pass results into the high dword of each qword, merge
 * with the limbs saved at 0..64(%esp), then reorder the four dwords with
 * vpshufd $141.
 * NOTE(review): each row then appears to hold one limb of four successive
 * powers of the input value -- confirm against the perlasm source. */
1486 vpsllq $32,%xmm0,%xmm0
1487 vpsllq $32,%xmm1,%xmm1
1488 vpsllq $32,%xmm2,%xmm2
1489 vpsllq $32,%xmm3,%xmm3
1490 vpsllq $32,%xmm4,%xmm4
1491 vpor (%esp),%xmm0,%xmm0
1492 vpor 16(%esp),%xmm1,%xmm1
1493 vpor 32(%esp),%xmm2,%xmm2
1494 vpor 48(%esp),%xmm3,%xmm3
1495 vpor 64(%esp),%xmm4,%xmm4
1496 vpshufd $141,%xmm0,%xmm0
1497 vpshufd $141,%xmm1,%xmm1
1498 vpshufd $141,%xmm2,%xmm2
1499 vpshufd $141,%xmm3,%xmm3
1500 vpshufd $141,%xmm4,%xmm4
/* Rows 0-4: the five limbs. */
1501 vmovdqu %xmm0,(%edi)
1502 vmovdqu %xmm1,16(%edi)
1503 vmovdqu %xmm2,32(%edi)
1504 vmovdqu %xmm3,48(%edi)
1505 vmovdqu %xmm4,64(%edi)
/* Rows 5-8: limbs 1-4 multiplied by 5 (x<<2 + x). */
1506 vpslld $2,%xmm1,%xmm6
1507 vpslld $2,%xmm2,%xmm5
1508 vpaddd %xmm1,%xmm6,%xmm6
1509 vpaddd %xmm2,%xmm5,%xmm5
1510 vmovdqu %xmm6,80(%edi)
1511 vmovdqu %xmm5,96(%edi)
1512 vpslld $2,%xmm3,%xmm6
1513 vpslld $2,%xmm4,%xmm5
1514 vpaddd %xmm3,%xmm6,%xmm6
1515 vpaddd %xmm4,%xmm5,%xmm5
1516 vmovdqu %xmm6,112(%edi)
1517 vmovdqu %xmm5,128(%edi)
/* Drop the scratch frame and undo the +48 applied to %edi on entry. */
1518 movl %ebp,%esp
1519 leal -48(%edi),%edi
1520 ret
1521 .size _poly1305_init_avx2,.-_poly1305_init_avx2
/*
 * _poly1305_blocks_avx2 -- file-local (no .globl); poly1305_init stores its
 * address in the caller's function table when its capability test passes.
 *
 * Stack arguments (offsets are after the four pushes below):
 *   20(%esp) -> %edi  state pointer
 *   24(%esp) -> %esi  input pointer
 *   28(%esp) -> %ecx  length in bytes (rounded down to a multiple of 16)
 *   32(%esp) -> %eax  fourth argument; negated and used to choose which
 *                     constant-table window is ORed into the top limb
 * 20(%edi) is a flag: while it is zero the accumulator at 0..16(%edi) is in
 * base 2^32 form and the tables at 48(%edi) have not been built yet.
 * %ebp, %ebx, %esi and %edi are saved and restored here.
 */
1522 .align 32
1523 .type _poly1305_blocks_avx2,@function
1524 .align 16
1525 _poly1305_blocks_avx2:
1526 pushl %ebp
1527 pushl %ebx
1528 pushl %esi
1529 pushl %edi
1530 movl 20(%esp),%edi
1531 movl 24(%esp),%esi
1532 movl 28(%esp),%ecx
1533 movl 20(%edi),%eax
1534 andl $-16,%ecx
1535 jz .L020nodata
/* Fewer than 64 bytes and the flag is still zero: hand over to the scalar
 * loop.  .Lenter_blocks sits in poly1305_blocks after the same four pushes
 * and the same %edi/%esi/%ecx loads, so the frame is compatible. */
1536 cmpl $64,%ecx
1537 jae .L021enter_avx2
1538 testl %eax,%eax
1539 jz .Lenter_blocks
1540 .L021enter_avx2:
1541 vzeroupper
/* call/pop pair: %ebx = address of .Lconst_sse2, computed PC-relatively. */
1542 call .L022pic_point
1543 .L022pic_point:
1544 popl %ebx
1545 leal .Lconst_sse2-.L022pic_point(%ebx),%ebx
1546 testl %eax,%eax
1547 jnz .L023base2_26
/* First vector use: build the tables at 48(%edi), then repack the
 * accumulator.  Unaligned loads at byte offsets 0,3,6,9,13 followed by right
 * shifts of 0,2,4,6,0 select bit offsets 0,26,52,78,104; the first three are
 * masked to 26 bits (67108863 = 2^26-1). */
1548 call _poly1305_init_avx2
1549 movl (%edi),%eax
1550 movl 3(%edi),%ecx
1551 movl 6(%edi),%edx
1552 movl 9(%edi),%esi
1553 movl 13(%edi),%ebp
1554 shrl $2,%ecx
1555 andl $67108863,%eax
1556 shrl $4,%edx
1557 andl $67108863,%ecx
1558 shrl $6,%esi
1559 andl $67108863,%edx
1560 movl %eax,(%edi)
1561 movl %ecx,4(%edi)
1562 movl %edx,8(%edi)
1563 movl %esi,12(%edi)
1564 movl %ebp,16(%edi)
1565 movl $1,20(%edi)
/* %esi and %ecx were used as temporaries above; reload them.
 * NOTE(review): %ecx is reloaded without the `andl $-16` applied on entry,
 * so this path assumes the caller passes a multiple of 16 -- confirm. */
1566 movl 24(%esp),%esi
1567 movl 28(%esp),%ecx
1568 .L023base2_26:
/* %eax = fourth argument; %ebp remembers %esp; the scratch frame is at
 * least 448 bytes and 512-byte aligned. */
1569 movl 32(%esp),%eax
1570 movl %esp,%ebp
1571 subl $448,%esp
1572 andl $-512,%esp
/* Expand the nine 16-byte table rows into nine 32-byte entries at
 * -128..128(%edx), with %edx = 288(%esp), using vpermq $64 / vpshufd $200.
 * %edi is advanced by 48 part-way through, so the later loads at 80..128(%edi)
 * read the last four rows. */
1573 vmovdqu 48(%edi),%xmm0
1574 leal 288(%esp),%edx
1575 vmovdqu 64(%edi),%xmm1
1576 vmovdqu 80(%edi),%xmm2
1577 vmovdqu 96(%edi),%xmm3
1578 vmovdqu 112(%edi),%xmm4
1579 leal 48(%edi),%edi
1580 vpermq $64,%ymm0,%ymm0
1581 vpermq $64,%ymm1,%ymm1
1582 vpermq $64,%ymm2,%ymm2
1583 vpermq $64,%ymm3,%ymm3
1584 vpermq $64,%ymm4,%ymm4
1585 vpshufd $200,%ymm0,%ymm0
1586 vpshufd $200,%ymm1,%ymm1
1587 vpshufd $200,%ymm2,%ymm2
1588 vpshufd $200,%ymm3,%ymm3
1589 vpshufd $200,%ymm4,%ymm4
1590 vmovdqa %ymm0,-128(%edx)
1591 vmovdqu 80(%edi),%xmm0
1592 vmovdqa %ymm1,-96(%edx)
1593 vmovdqu 96(%edi),%xmm1
1594 vmovdqa %ymm2,-64(%edx)
1595 vmovdqu 112(%edi),%xmm2
1596 vmovdqa %ymm3,-32(%edx)
1597 vmovdqu 128(%edi),%xmm3
1598 vmovdqa %ymm4,(%edx)
1599 vpermq $64,%ymm0,%ymm0
1600 vpermq $64,%ymm1,%ymm1
1601 vpermq $64,%ymm2,%ymm2
1602 vpermq $64,%ymm3,%ymm3
1603 vpshufd $200,%ymm0,%ymm0
1604 vpshufd $200,%ymm1,%ymm1
1605 vpshufd $200,%ymm2,%ymm2
1606 vpshufd $200,%ymm3,%ymm3
/* Interleaved with the last table stores: load the five accumulator limbs
 * (one dword each) from offsets 0..16 of the state (%edi is state+48 now). */
1607 vmovdqa %ymm0,32(%edx)
1608 vmovd -48(%edi),%xmm0
1609 vmovdqa %ymm1,64(%edx)
1610 vmovd -44(%edi),%xmm1
1611 vmovdqa %ymm2,96(%edx)
1612 vmovd -40(%edi),%xmm2
1613 vmovdqa %ymm3,128(%edx)
1614 vmovd -36(%edi),%xmm3
1615 vmovd -32(%edi),%xmm4
/* %ymm7 = 26-bit limb mask; %eax = -(fourth argument). */
1616 vmovdqa 64(%ebx),%ymm7
1617 negl %eax
/* If the length is not a multiple of 64, process the first 1-3 blocks via
 * the tail code: %edx = length mod 64, %ecx = remaining multiple of 64. */
1618 testl $63,%ecx
1619 jz .L024even
1620 movl %ecx,%edx
1621 andl $-64,%ecx
1622 andl $63,%edx
1623 vmovdqu (%esi),%xmm5
1624 cmpl $32,%edx
1625 jb .L025one
1626 vmovdqu 16(%esi),%xmm6
1627 je .L026two
/* Three blocks: third block goes into the upper half of %ymm5; the table
 * pointers %ebx and %edx are advanced by 8 bytes each. */
1628 vinserti128 $1,32(%esi),%ymm5,%ymm5
1629 leal 48(%esi),%esi
1630 leal 8(%ebx),%ebx
1631 leal 296(%esp),%edx
1632 jmp .L027tail
1633 .L026two:
/* Two blocks: table pointers advanced by 16 bytes. */
1634 leal 32(%esi),%esi
1635 leal 16(%ebx),%ebx
1636 leal 304(%esp),%edx
1637 jmp .L027tail
1638 .L025one:
/* One block: %ymm6 is cleared; %ebx = table + 32 + 8*%eax, so the window
 * depends on the (negated) fourth argument; %edx advanced by 24 bytes. */
1639 leal 16(%esi),%esi
1640 vpxor %ymm6,%ymm6,%ymm6
1641 leal 32(%ebx,%eax,8),%ebx
1642 leal 312(%esp),%edx
1643 jmp .L027tail
1644 .align 32
1645 .L024even:
/* Load 64 input bytes: %ymm5 = bytes 0-15 | 32-47, %ymm6 = bytes 16-31 | 48-63. */
1646 vmovdqu (%esi),%xmm5
1647 vmovdqu 16(%esi),%xmm6
1648 vinserti128 $1,32(%esi),%ymm5,%ymm5
1649 vinserti128 $1,48(%esi),%ymm6,%ymm6
1650 leal 64(%esi),%esi
1651 subl $64,%ecx
1652 jz .L027tail
1653 .L028loop:
/* Park accumulator limbs 0-2 at 0/32/64(%esp) while the input is split into
 * 26-bit limbs: %ymm5/%ymm6 = limbs 0/1, %ymm2/%ymm0 = limbs 2/3 (via a
 * 6-byte shift then >>4 / >>30), %ymm1 = top limb (>>40 of the high qwords). */
1654 vmovdqa %ymm2,64(%esp)
1655 vpsrldq $6,%ymm5,%ymm2
1656 vmovdqa %ymm0,(%esp)
1657 vpsrldq $6,%ymm6,%ymm0
1658 vmovdqa %ymm1,32(%esp)
1659 vpunpckhqdq %ymm6,%ymm5,%ymm1
1660 vpunpcklqdq %ymm6,%ymm5,%ymm5
1661 vpunpcklqdq %ymm0,%ymm2,%ymm2
1662 vpsrlq $30,%ymm2,%ymm0
1663 vpsrlq $4,%ymm2,%ymm2
1664 vpsrlq $26,%ymm5,%ymm6
1665 vpsrlq $40,%ymm1,%ymm1
1666 vpand %ymm7,%ymm2,%ymm2
1667 vpand %ymm7,%ymm5,%ymm5
1668 vpand %ymm7,%ymm6,%ymm6
1669 vpand %ymm7,%ymm0,%ymm0
/* OR the table row at (%ebx) into the top limb, then add the accumulator:
 * %ymm5,%ymm6,%ymm2,%ymm0,%ymm1 = limbs 0..4 of (accumulator + input). */
1670 vpor (%ebx),%ymm1,%ymm1
1671 vpaddq 64(%esp),%ymm2,%ymm2
1672 vpaddq (%esp),%ymm5,%ymm5
1673 vpaddq 32(%esp),%ymm6,%ymm6
1674 vpaddq %ymm3,%ymm0,%ymm0
1675 vpaddq %ymm4,%ymm1,%ymm1
/* Multiply by the table entries at -128..128(%edx); products accumulate in
 * %ymm0-%ymm4.  Limbs 1, 3 and 4 are spilled to 32/96/128(%esp) and reloaded
 * as their turn comes. */
1676 vpmuludq -96(%edx),%ymm2,%ymm3
1677 vmovdqa %ymm6,32(%esp)
1678 vpmuludq -64(%edx),%ymm2,%ymm4
1679 vmovdqa %ymm0,96(%esp)
1680 vpmuludq 96(%edx),%ymm2,%ymm0
1681 vmovdqa %ymm1,128(%esp)
1682 vpmuludq 128(%edx),%ymm2,%ymm1
1683 vpmuludq -128(%edx),%ymm2,%ymm2
1684 vpmuludq -32(%edx),%ymm5,%ymm7
1685 vpaddq %ymm7,%ymm3,%ymm3
1686 vpmuludq (%edx),%ymm5,%ymm6
1687 vpaddq %ymm6,%ymm4,%ymm4
1688 vpmuludq -128(%edx),%ymm5,%ymm7
1689 vpaddq %ymm7,%ymm0,%ymm0
1690 vmovdqa 32(%esp),%ymm7
1691 vpmuludq -96(%edx),%ymm5,%ymm6
1692 vpaddq %ymm6,%ymm1,%ymm1
1693 vpmuludq -64(%edx),%ymm5,%ymm5
1694 vpaddq %ymm5,%ymm2,%ymm2
1695 vpmuludq -64(%edx),%ymm7,%ymm6
1696 vpaddq %ymm6,%ymm3,%ymm3
1697 vpmuludq -32(%edx),%ymm7,%ymm5
1698 vpaddq %ymm5,%ymm4,%ymm4
1699 vpmuludq 128(%edx),%ymm7,%ymm6
1700 vpaddq %ymm6,%ymm0,%ymm0
1701 vmovdqa 96(%esp),%ymm6
1702 vpmuludq -128(%edx),%ymm7,%ymm5
1703 vpaddq %ymm5,%ymm1,%ymm1
1704 vpmuludq -96(%edx),%ymm7,%ymm7
1705 vpaddq %ymm7,%ymm2,%ymm2
1706 vpmuludq -128(%edx),%ymm6,%ymm5
1707 vpaddq %ymm5,%ymm3,%ymm3
1708 vpmuludq -96(%edx),%ymm6,%ymm7
1709 vpaddq %ymm7,%ymm4,%ymm4
1710 vpmuludq 64(%edx),%ymm6,%ymm5
1711 vpaddq %ymm5,%ymm0,%ymm0
1712 vmovdqa 128(%esp),%ymm5
1713 vpmuludq 96(%edx),%ymm6,%ymm7
1714 vpaddq %ymm7,%ymm1,%ymm1
1715 vpmuludq 128(%edx),%ymm6,%ymm6
1716 vpaddq %ymm6,%ymm2,%ymm2
1717 vpmuludq 128(%edx),%ymm5,%ymm7
1718 vpaddq %ymm7,%ymm3,%ymm3
1719 vpmuludq 32(%edx),%ymm5,%ymm6
1720 vpaddq %ymm6,%ymm0,%ymm0
1721 vpmuludq -128(%edx),%ymm5,%ymm7
1722 vpaddq %ymm7,%ymm4,%ymm4
1723 vmovdqa 64(%ebx),%ymm7
1724 vpmuludq 64(%edx),%ymm5,%ymm6
1725 vpaddq %ymm6,%ymm1,%ymm1
1726 vpmuludq 96(%edx),%ymm5,%ymm5
1727 vpaddq %ymm5,%ymm2,%ymm2
/* Carry propagation between limbs (mask in %ymm7, reloaded above); the
 * carry out of limb 4 is added to limb 0 once and again shifted left by 2,
 * i.e. five times in total. */
1728 vpsrlq $26,%ymm3,%ymm5
1729 vpand %ymm7,%ymm3,%ymm3
1730 vpsrlq $26,%ymm0,%ymm6
1731 vpand %ymm7,%ymm0,%ymm0
1732 vpaddq %ymm5,%ymm4,%ymm4
1733 vpaddq %ymm6,%ymm1,%ymm1
1734 vpsrlq $26,%ymm4,%ymm5
1735 vpand %ymm7,%ymm4,%ymm4
1736 vpsrlq $26,%ymm1,%ymm6
1737 vpand %ymm7,%ymm1,%ymm1
1738 vpaddq %ymm6,%ymm2,%ymm2
1739 vpaddq %ymm5,%ymm0,%ymm0
1740 vpsllq $2,%ymm5,%ymm5
1741 vpsrlq $26,%ymm2,%ymm6
1742 vpand %ymm7,%ymm2,%ymm2
1743 vpaddq %ymm5,%ymm0,%ymm0
1744 vpaddq %ymm6,%ymm3,%ymm3
1745 vpsrlq $26,%ymm3,%ymm6
1746 vpsrlq $26,%ymm0,%ymm5
1747 vpand %ymm7,%ymm0,%ymm0
1748 vpand %ymm7,%ymm3,%ymm3
1749 vpaddq %ymm5,%ymm1,%ymm1
1750 vpaddq %ymm6,%ymm4,%ymm4
/* Fetch the next 64 input bytes and loop while more than one group remains;
 * the final group falls through to the tail. */
1751 vmovdqu (%esi),%xmm5
1752 vmovdqu 16(%esi),%xmm6
1753 vinserti128 $1,32(%esi),%ymm5,%ymm5
1754 vinserti128 $1,48(%esi),%ymm6,%ymm6
1755 leal 64(%esi),%esi
1756 subl $64,%ecx
1757 jnz .L028loop
1758 .L027tail:
/* Same limb split and accumulate as the loop body. */
1759 vmovdqa %ymm2,64(%esp)
1760 vpsrldq $6,%ymm5,%ymm2
1761 vmovdqa %ymm0,(%esp)
1762 vpsrldq $6,%ymm6,%ymm0
1763 vmovdqa %ymm1,32(%esp)
1764 vpunpckhqdq %ymm6,%ymm5,%ymm1
1765 vpunpcklqdq %ymm6,%ymm5,%ymm5
1766 vpunpcklqdq %ymm0,%ymm2,%ymm2
1767 vpsrlq $30,%ymm2,%ymm0
1768 vpsrlq $4,%ymm2,%ymm2
1769 vpsrlq $26,%ymm5,%ymm6
1770 vpsrlq $40,%ymm1,%ymm1
1771 vpand %ymm7,%ymm2,%ymm2
1772 vpand %ymm7,%ymm5,%ymm5
1773 vpand %ymm7,%ymm6,%ymm6
1774 vpand %ymm7,%ymm0,%ymm0
/* %ebx may have been offset into the table for a partial group; after using
 * it, round it back down to the 64-byte-aligned table base. */
1775 vpor (%ebx),%ymm1,%ymm1
1776 andl $-64,%ebx
1777 vpaddq 64(%esp),%ymm2,%ymm2
1778 vpaddq (%esp),%ymm5,%ymm5
1779 vpaddq 32(%esp),%ymm6,%ymm6
1780 vpaddq %ymm3,%ymm0,%ymm0
1781 vpaddq %ymm4,%ymm1,%ymm1
/* Same multiplication schedule as the loop, but every table displacement is
 * 4 bytes higher (on top of any 8/16/24-byte advance of %edx made for a
 * partial group), so each lane is multiplied by a different table column. */
1782 vpmuludq -92(%edx),%ymm2,%ymm3
1783 vmovdqa %ymm6,32(%esp)
1784 vpmuludq -60(%edx),%ymm2,%ymm4
1785 vmovdqa %ymm0,96(%esp)
1786 vpmuludq 100(%edx),%ymm2,%ymm0
1787 vmovdqa %ymm1,128(%esp)
1788 vpmuludq 132(%edx),%ymm2,%ymm1
1789 vpmuludq -124(%edx),%ymm2,%ymm2
1790 vpmuludq -28(%edx),%ymm5,%ymm7
1791 vpaddq %ymm7,%ymm3,%ymm3
1792 vpmuludq 4(%edx),%ymm5,%ymm6
1793 vpaddq %ymm6,%ymm4,%ymm4
1794 vpmuludq -124(%edx),%ymm5,%ymm7
1795 vpaddq %ymm7,%ymm0,%ymm0
1796 vmovdqa 32(%esp),%ymm7
1797 vpmuludq -92(%edx),%ymm5,%ymm6
1798 vpaddq %ymm6,%ymm1,%ymm1
1799 vpmuludq -60(%edx),%ymm5,%ymm5
1800 vpaddq %ymm5,%ymm2,%ymm2
1801 vpmuludq -60(%edx),%ymm7,%ymm6
1802 vpaddq %ymm6,%ymm3,%ymm3
1803 vpmuludq -28(%edx),%ymm7,%ymm5
1804 vpaddq %ymm5,%ymm4,%ymm4
1805 vpmuludq 132(%edx),%ymm7,%ymm6
1806 vpaddq %ymm6,%ymm0,%ymm0
1807 vmovdqa 96(%esp),%ymm6
1808 vpmuludq -124(%edx),%ymm7,%ymm5
1809 vpaddq %ymm5,%ymm1,%ymm1
1810 vpmuludq -92(%edx),%ymm7,%ymm7
1811 vpaddq %ymm7,%ymm2,%ymm2
1812 vpmuludq -124(%edx),%ymm6,%ymm5
1813 vpaddq %ymm5,%ymm3,%ymm3
1814 vpmuludq -92(%edx),%ymm6,%ymm7
1815 vpaddq %ymm7,%ymm4,%ymm4
1816 vpmuludq 68(%edx),%ymm6,%ymm5
1817 vpaddq %ymm5,%ymm0,%ymm0
1818 vmovdqa 128(%esp),%ymm5
1819 vpmuludq 100(%edx),%ymm6,%ymm7
1820 vpaddq %ymm7,%ymm1,%ymm1
1821 vpmuludq 132(%edx),%ymm6,%ymm6
1822 vpaddq %ymm6,%ymm2,%ymm2
1823 vpmuludq 132(%edx),%ymm5,%ymm7
1824 vpaddq %ymm7,%ymm3,%ymm3
1825 vpmuludq 36(%edx),%ymm5,%ymm6
1826 vpaddq %ymm6,%ymm0,%ymm0
1827 vpmuludq -124(%edx),%ymm5,%ymm7
1828 vpaddq %ymm7,%ymm4,%ymm4
1829 vmovdqa 64(%ebx),%ymm7
1830 vpmuludq 68(%edx),%ymm5,%ymm6
1831 vpaddq %ymm6,%ymm1,%ymm1
1832 vpmuludq 100(%edx),%ymm5,%ymm5
1833 vpaddq %ymm5,%ymm2,%ymm2
/* Horizontal sum of the four 64-bit lanes of every limb: first fold the
 * high qword of each 128-bit half onto the low one (vpsrldq $8), then add
 * the upper half's low qword to lane 0 (vpermq $2). */
1834 vpsrldq $8,%ymm4,%ymm5
1835 vpsrldq $8,%ymm3,%ymm6
1836 vpaddq %ymm5,%ymm4,%ymm4
1837 vpsrldq $8,%ymm0,%ymm5
1838 vpaddq %ymm6,%ymm3,%ymm3
1839 vpsrldq $8,%ymm1,%ymm6
1840 vpaddq %ymm5,%ymm0,%ymm0
1841 vpsrldq $8,%ymm2,%ymm5
1842 vpaddq %ymm6,%ymm1,%ymm1
1843 vpermq $2,%ymm4,%ymm6
1844 vpaddq %ymm5,%ymm2,%ymm2
1845 vpermq $2,%ymm3,%ymm5
1846 vpaddq %ymm6,%ymm4,%ymm4
1847 vpermq $2,%ymm0,%ymm6
1848 vpaddq %ymm5,%ymm3,%ymm3
1849 vpermq $2,%ymm1,%ymm5
1850 vpaddq %ymm6,%ymm0,%ymm0
1851 vpermq $2,%ymm2,%ymm6
1852 vpaddq %ymm5,%ymm1,%ymm1
1853 vpaddq %ymm6,%ymm2,%ymm2
/* Carry propagation, same sequence as in the loop. */
1854 vpsrlq $26,%ymm3,%ymm5
1855 vpand %ymm7,%ymm3,%ymm3
1856 vpsrlq $26,%ymm0,%ymm6
1857 vpand %ymm7,%ymm0,%ymm0
1858 vpaddq %ymm5,%ymm4,%ymm4
1859 vpaddq %ymm6,%ymm1,%ymm1
1860 vpsrlq $26,%ymm4,%ymm5
1861 vpand %ymm7,%ymm4,%ymm4
1862 vpsrlq $26,%ymm1,%ymm6
1863 vpand %ymm7,%ymm1,%ymm1
1864 vpaddq %ymm6,%ymm2,%ymm2
1865 vpaddq %ymm5,%ymm0,%ymm0
1866 vpsllq $2,%ymm5,%ymm5
1867 vpsrlq $26,%ymm2,%ymm6
1868 vpand %ymm7,%ymm2,%ymm2
1869 vpaddq %ymm5,%ymm0,%ymm0
1870 vpaddq %ymm6,%ymm3,%ymm3
1871 vpsrlq $26,%ymm3,%ymm6
1872 vpsrlq $26,%ymm0,%ymm5
1873 vpand %ymm7,%ymm0,%ymm0
1874 vpand %ymm7,%ymm3,%ymm3
1875 vpaddq %ymm5,%ymm1,%ymm1
1876 vpaddq %ymm6,%ymm4,%ymm4
/* A non-zero %ecx means the tail handled a leading partial group and whole
 * 64-byte groups remain: isolate the low dword of each limb (vpshufd $252),
 * reset %edx to the unshifted table and continue at .L024even. */
1877 cmpl $0,%ecx
1878 je .L029done
1879 vpshufd $252,%xmm0,%xmm0
1880 leal 288(%esp),%edx
1881 vpshufd $252,%xmm1,%xmm1
1882 vpshufd $252,%xmm2,%xmm2
1883 vpshufd $252,%xmm3,%xmm3
1884 vpshufd $252,%xmm4,%xmm4
1885 jmp .L024even
1886 .align 16
1887 .L029done:
/* Write the five accumulator limbs back to offsets 0..16 of the state,
 * clear the upper YMM halves and release the scratch frame. */
1888 vmovd %xmm0,-48(%edi)
1889 vmovd %xmm1,-44(%edi)
1890 vmovd %xmm2,-40(%edi)
1891 vmovd %xmm3,-36(%edi)
1892 vmovd %xmm4,-32(%edi)
1893 vzeroupper
1894 movl %ebp,%esp
1895 .L020nodata:
1896 popl %edi
1897 popl %esi
1898 popl %ebx
1899 popl %ebp
1900 ret
1901 .size _poly1305_blocks_avx2,.-_poly1305_blocks_avx2
/*
 * Constant table shared by the SSE2 and AVX2 routines; both address it as
 * offsets from .Lconst_sse2.  The 64-byte alignment is relied upon by
 * _poly1305_blocks_avx2, which recovers the base with `andl $-64,%ebx`.
 */
1902 .align 64
1903 .Lconst_sse2:
/* offset  0: 16777216 = 1<<24 in the low dword of each of four qwords */
1904 .long 16777216,0,16777216,0,16777216,0,16777216,0
/* offset 32: 32 bytes of zero */
1905 .long 0,0,0,0,0,0,0,0
/* offset 64: 67108863 = 2^26-1, the 26-bit limb mask, once per qword */
1906 .long 67108863,0,67108863,0,67108863,0,67108863,0
/* offset 96: 0x0fffffff followed by three 0x0ffffffc -- the same values
 * poly1305_init applies as immediates to the four key words */
1907 .long 268435455,268435452,268435452,268435452
/* NUL-terminated ASCII: "Poly1305 for x86, CRYPTOGAMS by <appro@openssl.org>" */
1908 .byte 80,111,108,121,49,51,48,53,32,102,111,114,32,120,56,54
1909 .byte 44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32
1910 .byte 60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111
1911 .byte 114,103,62,0
1912 .align 4
/* Common symbol, 16 bytes, 4-byte aligned; poly1305_init reads its first
 * and third dwords to choose an implementation. */
1913 .comm OPENSSL_ia32cap_P,16,4
1914 #else
1915 .text
1916 .align 64
/*
 * poly1305_init (non-PIC build of the routine).
 *
 * Stack arguments (offsets are after the four pushes below):
 *   20(%esp) -> %edi  state pointer; its first 24 bytes are zeroed
 *   24(%esp) -> %esi  key pointer, may be NULL
 *   28(%esp) -> %ebp  two-slot table that receives function addresses
 * Returns %eax = 0 when the key pointer is NULL (nothing else is written),
 * otherwise 1 after storing the masked key words at 24..36 of the state and
 * the chosen block/emit routines in the table.
 * %ebp, %ebx, %esi and %edi are saved and restored.
 */
1917 .globl poly1305_init
1918 .type poly1305_init,@function
1919 .align 16
1920 poly1305_init:
1921 .L_poly1305_init_begin:
1922 pushl %ebp
1923 pushl %ebx
1924 pushl %esi
1925 pushl %edi
1926 movl 20(%esp),%edi
1927 movl 24(%esp),%esi
1928 movl 28(%esp),%ebp
/* Clear state words 0..5; %eax = 0 doubles as the NULL-key return value. */
1929 xorl %eax,%eax
1930 movl %eax,(%edi)
1931 movl %eax,4(%edi)
1932 movl %eax,8(%edi)
1933 movl %eax,12(%edi)
1934 movl %eax,16(%edi)
1935 movl %eax,20(%edi)
1936 cmpl $0,%esi
1937 je .L000nokey
/* call/pop pair: %ebx = address of .L001pic_point; routine addresses are
 * formed relative to it.  Defaults are the scalar poly1305_blocks/emit. */
1938 call .L001pic_point
1939 .L001pic_point:
1940 popl %ebx
1941 leal poly1305_blocks-.L001pic_point(%ebx),%eax
1942 leal poly1305_emit-.L001pic_point(%ebx),%edx
/* Non-PIC: the capability vector is addressed absolutely. */
1943 leal OPENSSL_ia32cap_P,%edi
/* 83886080 = 0x05000000: both bit 24 and bit 26 of the first capability
 * word must be set to select the SSE2 routines.
 * NOTE(review): presumably the FXSR and SSE2 flags -- confirm against the
 * OPENSSL_ia32cap documentation. */
1944 movl (%edi),%ecx
1945 andl $83886080,%ecx
1946 cmpl $83886080,%ecx
1947 jne .L002no_sse2
1948 leal _poly1305_blocks_sse2-.L001pic_point(%ebx),%eax
1949 leal _poly1305_emit_sse2-.L001pic_point(%ebx),%edx
/* Bit 5 of the third capability word upgrades the block routine to AVX2
 * (the emit routine stays the SSE2 one). */
1950 movl 8(%edi),%ecx
1951 testl $32,%ecx
1952 jz .L002no_sse2
1953 leal _poly1305_blocks_avx2-.L001pic_point(%ebx),%eax
1954 .L002no_sse2:
/* %edi was reused for the capability pointer; reload the state pointer,
 * then publish the two routine addresses. */
1955 movl 20(%esp),%edi
1956 movl %eax,(%ebp)
1957 movl %edx,4(%ebp)
/* Load the first 16 key bytes and mask them with 0x0fffffff / 0x0ffffffc.
 * NOTE(review): these masks match the Poly1305 "r" clamp -- confirm. */
1958 movl (%esi),%eax
1959 movl 4(%esi),%ebx
1960 movl 8(%esi),%ecx
1961 movl 12(%esi),%edx
1962 andl $268435455,%eax
1963 andl $268435452,%ebx
1964 andl $268435452,%ecx
1965 andl $268435452,%edx
1966 movl %eax,24(%edi)
1967 movl %ebx,28(%edi)
1968 movl %ecx,32(%edi)
1969 movl %edx,36(%edi)
1970 movl $1,%eax
1971 .L000nokey:
1972 popl %edi
1973 popl %esi
1974 popl %ebx
1975 popl %ebp
1976 ret
1977 .size poly1305_init,.-.L_poly1305_init_begin
/*
 * poly1305_blocks -- scalar (integer-only) block routine.
 *
 * Stack arguments (offsets are after the four pushes below):
 *   20(%esp) -> %edi  state pointer: accumulator words at 0..16,
 *                     multiplier words at 24..36
 *   24(%esp) -> %esi  input pointer
 *   28(%esp) -> %ecx  length in bytes; only whole 16-byte blocks are used
 *   32(%esp)          fourth argument, added into the top accumulator word
 *                     for every block
 * .Lenter_blocks is also a jump target for the SSE2/AVX2 routines, which
 * arrive with the same four registers pushed and %edi/%esi/%ecx loaded.
 *
 * Frame after `subl $64,%esp`:
 *    0,12,16(%esp)  copies of h0, h3, h4 for the current block
 *   20..28(%esp)    product words d0..d2
 *   36..48(%esp)    r0..r3 (state words 24..36)
 *   52..60(%esp)    s1..s3, where s = r + (r >> 2)
 *   84(%esp)        first argument (state pointer)
 *   92(%esp)        end-of-input pointer (overwrites the length argument)
 *   96(%esp)        fourth argument
 * %ebp, %ebx, %esi and %edi are saved and restored.
 */
1978 .globl poly1305_blocks
1979 .type poly1305_blocks,@function
1980 .align 16
1981 poly1305_blocks:
1982 .L_poly1305_blocks_begin:
1983 pushl %ebp
1984 pushl %ebx
1985 pushl %esi
1986 pushl %edi
1987 movl 20(%esp),%edi
1988 movl 24(%esp),%esi
1989 movl 28(%esp),%ecx
1990 .Lenter_blocks:
/* Round the length down to whole 16-byte blocks.  The mask must be -16: if
 * any of the low four bits survived, the end pointer computed below would
 * not be a multiple of 16 away from the start, and the `jne` test at the
 * bottom of the loop (which advances by 16) could never match. */
1991 andl $-16,%ecx
1992 jz .L003nodata
1993 subl $64,%esp
/* Copy r0..r3 into the frame, compute s1..s3 = r + (r >> 2), and store the
 * end-of-input pointer in the slot that held the length argument. */
1994 movl 24(%edi),%eax
1995 movl 28(%edi),%ebx
1996 leal (%esi,%ecx,1),%ebp
1997 movl 32(%edi),%ecx
1998 movl 36(%edi),%edx
1999 movl %ebp,92(%esp)
2000 movl %esi,%ebp
2001 movl %eax,36(%esp)
2002 movl %ebx,%eax
2003 shrl $2,%eax
2004 movl %ebx,40(%esp)
2005 addl %ebx,%eax
2006 movl %ecx,%ebx
2007 shrl $2,%ebx
2008 movl %ecx,44(%esp)
2009 addl %ecx,%ebx
2010 movl %edx,%ecx
2011 shrl $2,%ecx
2012 movl %edx,48(%esp)
2013 addl %edx,%ecx
2014 movl %eax,52(%esp)
2015 movl %ebx,56(%esp)
2016 movl %ecx,60(%esp)
/* Accumulator h0..h4 lives in %eax,%ebx,%ecx,%esi,%edi inside the loop;
 * %ebp walks the input. */
2017 movl (%edi),%eax
2018 movl 4(%edi),%ebx
2019 movl 8(%edi),%ecx
2020 movl 12(%edi),%esi
2021 movl 16(%edi),%edi
2022 jmp .L004loop
2023 .align 32
2024 .L004loop:
/* h += next 16 input bytes; the fourth argument (plus carry) goes into h4. */
2025 addl (%ebp),%eax
2026 adcl 4(%ebp),%ebx
2027 adcl 8(%ebp),%ecx
2028 adcl 12(%ebp),%esi
2029 leal 16(%ebp),%ebp
2030 adcl 96(%esp),%edi
/* d0 = h0*r0 + h1*s3 + h2*s2 + h3*s1, accumulated in %esi:%edi (hi:lo). */
2031 movl %eax,(%esp)
2032 movl %esi,12(%esp)
2033 mull 36(%esp)
2034 movl %edi,16(%esp)
2035 movl %eax,%edi
2036 movl %ebx,%eax
2037 movl %edx,%esi
2038 mull 60(%esp)
2039 addl %eax,%edi
2040 movl %ecx,%eax
2041 adcl %edx,%esi
2042 mull 56(%esp)
2043 addl %eax,%edi
2044 movl 12(%esp),%eax
2045 adcl %edx,%esi
2046 mull 52(%esp)
2047 addl %eax,%edi
2048 movl (%esp),%eax
2049 adcl %edx,%esi
/* d1 = carry + h0*r1 + h1*r0 + h2*s3 + h3*s2 + h4*s1; the h4 term uses a
 * 32-bit imull. */
2050 mull 40(%esp)
2051 movl %edi,20(%esp)
2052 xorl %edi,%edi
2053 addl %eax,%esi
2054 movl %ebx,%eax
2055 adcl %edx,%edi
2056 mull 36(%esp)
2057 addl %eax,%esi
2058 movl %ecx,%eax
2059 adcl %edx,%edi
2060 mull 60(%esp)
2061 addl %eax,%esi
2062 movl 12(%esp),%eax
2063 adcl %edx,%edi
2064 mull 56(%esp)
2065 addl %eax,%esi
2066 movl 16(%esp),%eax
2067 adcl %edx,%edi
2068 imull 52(%esp),%eax
2069 addl %eax,%esi
2070 movl (%esp),%eax
2071 adcl $0,%edi
/* d2 = carry + h0*r2 + h1*r1 + h2*r0 + h3*s3 + h4*s2. */
2072 mull 44(%esp)
2073 movl %esi,24(%esp)
2074 xorl %esi,%esi
2075 addl %eax,%edi
2076 movl %ebx,%eax
2077 adcl %edx,%esi
2078 mull 40(%esp)
2079 addl %eax,%edi
2080 movl %ecx,%eax
2081 adcl %edx,%esi
2082 mull 36(%esp)
2083 addl %eax,%edi
2084 movl 12(%esp),%eax
2085 adcl %edx,%esi
2086 mull 60(%esp)
2087 addl %eax,%edi
2088 movl 16(%esp),%eax
2089 adcl %edx,%esi
2090 imull 56(%esp),%eax
2091 addl %eax,%edi
2092 movl (%esp),%eax
2093 adcl $0,%esi
/* d3 = carry + h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*s3, kept in %esi. */
2094 mull 48(%esp)
2095 movl %edi,28(%esp)
2096 xorl %edi,%edi
2097 addl %eax,%esi
2098 movl %ebx,%eax
2099 adcl %edx,%edi
2100 mull 44(%esp)
2101 addl %eax,%esi
2102 movl %ecx,%eax
2103 adcl %edx,%edi
2104 mull 40(%esp)
2105 addl %eax,%esi
2106 movl 12(%esp),%eax
2107 adcl %edx,%edi
2108 mull 36(%esp)
2109 addl %eax,%esi
2110 movl 16(%esp),%ecx
2111 adcl %edx,%edi
2112 movl %ecx,%edx
2113 imull 60(%esp),%ecx
2114 addl %ecx,%esi
2115 movl 20(%esp),%eax
2116 adcl $0,%edi
/* d4 = carry + h4*r0.  Keep its low two bits as the new h4 and fold the
 * rest back into h0 multiplied by 5 (leal x+4x), rippling the carry up. */
2117 imull 36(%esp),%edx
2118 addl %edi,%edx
2119 movl 24(%esp),%ebx
2120 movl 28(%esp),%ecx
2121 movl %edx,%edi
2122 shrl $2,%edx
2123 andl $3,%edi
2124 leal (%edx,%edx,4),%edx
2125 addl %edx,%eax
2126 adcl $0,%ebx
2127 adcl $0,%ecx
2128 adcl $0,%esi
2129 adcl $0,%edi
2130 cmpl 92(%esp),%ebp
2131 jne .L004loop
/* Store the accumulator back through the state pointer (first argument,
 * at 84(%esp) while the 64-byte frame is still allocated). */
2132 movl 84(%esp),%edx
2133 addl $64,%esp
2134 movl %eax,(%edx)
2135 movl %ebx,4(%edx)
2136 movl %ecx,8(%edx)
2137 movl %esi,12(%edx)
2138 movl %edi,16(%edx)
2139 .L003nodata:
2140 popl %edi
2141 popl %esi
2142 popl %ebx
2143 popl %ebp
2144 ret
2145 .size poly1305_blocks,.-.L_poly1305_blocks_begin
/*
 * poly1305_emit -- scalar finalisation.
 *
 * Stack arguments (offsets are after the four pushes below):
 *   20(%esp) -> %ebp  state pointer; accumulator words h0..h4 at 0..16
 *   24(%esp) -> %edi  16-byte output buffer
 *   28(%esp)          pointer to four 32-bit words added to the result
 * Writes 16 bytes to the output buffer; the accumulator itself is not
 * modified.  %ebp, %ebx, %esi and %edi are saved and restored.
 */
2146 .globl poly1305_emit
2147 .type poly1305_emit,@function
2148 .align 16
2149 poly1305_emit:
2150 .L_poly1305_emit_begin:
2151 pushl %ebp
2152 pushl %ebx
2153 pushl %esi
2154 pushl %edi
2155 movl 20(%esp),%ebp
2156 .Lenter_emit:
2157 movl 24(%esp),%edi
2158 movl (%ebp),%eax
2159 movl 4(%ebp),%ebx
2160 movl 8(%ebp),%ecx
2161 movl 12(%ebp),%edx
2162 movl 16(%ebp),%esi
/* Compute h + 5 across all five words. */
2163 addl $5,%eax
2164 adcl $0,%ebx
2165 adcl $0,%ecx
2166 adcl $0,%edx
2167 adcl $0,%esi
/* Bit 2 and up of the top word show whether h + 5 reached 2^130; negate to
 * turn that into a select mask.
 * NOTE(review): the mask is all-ones only if the shifted value is exactly 1,
 * which assumes h4 was small on entry -- confirm the accumulator invariant. */
2168 shrl $2,%esi
2169 negl %esi
/* Output buffer temporarily holds (h + 5) & mask. */
2170 andl %esi,%eax
2171 andl %esi,%ebx
2172 andl %esi,%ecx
2173 andl %esi,%edx
2174 movl %eax,(%edi)
2175 movl %ebx,4(%edi)
2176 movl %ecx,8(%edi)
2177 movl %edx,12(%edi)
/* Invert the mask, reload the original h and merge: the result is h + 5
 * when the mask was set and h otherwise, selected without a branch. */
2178 notl %esi
2179 movl (%ebp),%eax
2180 movl 4(%ebp),%ebx
2181 movl 8(%ebp),%ecx
2182 movl 12(%ebp),%edx
2183 movl 28(%esp),%ebp
2184 andl %esi,%eax
2185 andl %esi,%ebx
2186 andl %esi,%ecx
2187 andl %esi,%edx
2188 orl (%edi),%eax
2189 orl 4(%edi),%ebx
2190 orl 8(%edi),%ecx
2191 orl 12(%edi),%edx
/* Add the four words from the third argument (carry out of the top word is
 * discarded) and store the final 16 bytes. */
2192 addl (%ebp),%eax
2193 adcl 4(%ebp),%ebx
2194 adcl 8(%ebp),%ecx
2195 adcl 12(%ebp),%edx
2196 movl %eax,(%edi)
2197 movl %ebx,4(%edi)
2198 movl %ecx,8(%edi)
2199 movl %edx,12(%edi)
2200 popl %edi
2201 popl %esi
2202 popl %ebx
2203 popl %ebp
2204 ret
2205 .size poly1305_emit,.-.L_poly1305_emit_begin
/*
 * _poly1305_init_sse2 -- file-local helper (no .globl directive); called
 * from _poly1305_blocks_sse2.  SSE2 counterpart of _poly1305_init_avx2 with
 * the same inputs and outputs, written with two-operand instructions.
 *
 * In:   %edi  base pointer; the 16 bytes at 24(%edi) are the input value
 *       %ebx  constant-table pointer; 64(%ebx) is loaded as the limb mask
 * Out:  nine 16-byte rows written at 48..191 relative to the incoming %edi:
 *       rows 0-4 hold the five limbs, rows 5-8 hold limbs 1-4 multiplied by 5
 *       %edi and %esp are restored before returning
 * Clobbers: %ebp (holds the incoming %esp), %ecx, %edx, %xmm0-%xmm7, flags
 */
2206 .align 32
2207 .type _poly1305_init_sse2,@function
2208 .align 16
2209 _poly1305_init_sse2:
/* Load the input, point %edi at the output rows and carve out a
 * 16-byte-aligned scratch frame of at least 224 bytes. */
2210 movdqu 24(%edi),%xmm4
2211 leal 48(%edi),%edi
2212 movl %esp,%ebp
2213 subl $224,%esp
2214 andl $-16,%esp
/* movq loads only the low 64 bits of the mask, so the pand operations below
 * also clear the upper qword of limbs 0-3. */
2215 movq 64(%ebx),%xmm7
/* Split into five limbs at bit offsets 0, 26, 52, 78 and 104. */
2216 movdqa %xmm4,%xmm0
2217 movdqa %xmm4,%xmm1
2218 movdqa %xmm4,%xmm2
2219 pand %xmm7,%xmm0
2220 psrlq $26,%xmm1
2221 psrldq $6,%xmm2
2222 pand %xmm7,%xmm1
2223 movdqa %xmm2,%xmm3
2224 psrlq $4,%xmm2
2225 psrlq $30,%xmm3
2226 pand %xmm7,%xmm2
2227 pand %xmm7,%xmm3
2228 psrldq $13,%xmm4
/* %edx -> second scratch area (broadcast operands); %ecx = two passes. */
2229 leal 144(%esp),%edx
2230 movl $2,%ecx
2231 .L005square:
/* Spill the current limbs to 0..64(%esp) ... */
2232 movdqa %xmm0,(%esp)
2233 movdqa %xmm1,16(%esp)
2234 movdqa %xmm2,32(%esp)
2235 movdqa %xmm3,48(%esp)
2236 movdqa %xmm4,64(%esp)
/* ... and limbs 1-4 times 5 (x<<2 + x) to 80..128(%esp). */
2237 movdqa %xmm1,%xmm6
2238 movdqa %xmm2,%xmm5
2239 pslld $2,%xmm6
2240 pslld $2,%xmm5
2241 paddd %xmm1,%xmm6
2242 paddd %xmm2,%xmm5
2243 movdqa %xmm6,80(%esp)
2244 movdqa %xmm5,96(%esp)
2245 movdqa %xmm3,%xmm6
2246 movdqa %xmm4,%xmm5
2247 pslld $2,%xmm6
2248 pslld $2,%xmm5
2249 paddd %xmm3,%xmm6
2250 paddd %xmm4,%xmm5
2251 movdqa %xmm6,112(%esp)
2252 movdqa %xmm5,128(%esp)
/* Broadcast the low qword of every limb (pshufd $68) into 0..64(%edx);
 * %xmm5 keeps the un-broadcast copy of limb 1. */
2253 pshufd $68,%xmm0,%xmm6
2254 movdqa %xmm1,%xmm5
2255 pshufd $68,%xmm1,%xmm1
2256 pshufd $68,%xmm2,%xmm2
2257 pshufd $68,%xmm3,%xmm3
2258 pshufd $68,%xmm4,%xmm4
2259 movdqa %xmm6,(%edx)
2260 movdqa %xmm1,16(%edx)
2261 movdqa %xmm2,32(%edx)
2262 movdqa %xmm3,48(%edx)
2263 movdqa %xmm4,64(%edx)
/* 5x5 limb product accumulated into %xmm0-%xmm4 (result limbs 0-4).
 * Because pmuludq overwrites its destination, each multiplicand is copied
 * before use; %xmm5-%xmm7 rotate as temporaries.  Partial products that
 * would land above limb 4 use the times-5 copies from 80..128(%esp). */
2264 pmuludq %xmm0,%xmm4
2265 pmuludq %xmm0,%xmm3
2266 pmuludq %xmm0,%xmm2
2267 pmuludq %xmm0,%xmm1
2268 pmuludq %xmm6,%xmm0
2269 movdqa %xmm5,%xmm6
2270 pmuludq 48(%edx),%xmm5
2271 movdqa %xmm6,%xmm7
2272 pmuludq 32(%edx),%xmm6
2273 paddq %xmm5,%xmm4
2274 movdqa %xmm7,%xmm5
2275 pmuludq 16(%edx),%xmm7
2276 paddq %xmm6,%xmm3
2277 movdqa 80(%esp),%xmm6
2278 pmuludq (%edx),%xmm5
2279 paddq %xmm7,%xmm2
2280 pmuludq 64(%edx),%xmm6
2281 movdqa 32(%esp),%xmm7
2282 paddq %xmm5,%xmm1
2283 movdqa %xmm7,%xmm5
2284 pmuludq 32(%edx),%xmm7
2285 paddq %xmm6,%xmm0
2286 movdqa %xmm5,%xmm6
2287 pmuludq 16(%edx),%xmm5
2288 paddq %xmm7,%xmm4
2289 movdqa 96(%esp),%xmm7
2290 pmuludq (%edx),%xmm6
2291 paddq %xmm5,%xmm3
2292 movdqa %xmm7,%xmm5
2293 pmuludq 64(%edx),%xmm7
2294 paddq %xmm6,%xmm2
2295 pmuludq 48(%edx),%xmm5
2296 movdqa 48(%esp),%xmm6
2297 paddq %xmm7,%xmm1
2298 movdqa %xmm6,%xmm7
2299 pmuludq 16(%edx),%xmm6
2300 paddq %xmm5,%xmm0
2301 movdqa 112(%esp),%xmm5
2302 pmuludq (%edx),%xmm7
2303 paddq %xmm6,%xmm4
2304 movdqa %xmm5,%xmm6
2305 pmuludq 64(%edx),%xmm5
2306 paddq %xmm7,%xmm3
2307 movdqa %xmm6,%xmm7
2308 pmuludq 48(%edx),%xmm6
2309 paddq %xmm5,%xmm2
2310 pmuludq 32(%edx),%xmm7
2311 movdqa 64(%esp),%xmm5
2312 paddq %xmm6,%xmm1
2313 movdqa 128(%esp),%xmm6
2314 pmuludq (%edx),%xmm5
2315 paddq %xmm7,%xmm0
2316 movdqa %xmm6,%xmm7
2317 pmuludq 64(%edx),%xmm6
2318 paddq %xmm5,%xmm4
2319 movdqa %xmm7,%xmm5
2320 pmuludq 16(%edx),%xmm7
2321 paddq %xmm6,%xmm3
2322 movdqa %xmm5,%xmm6
2323 pmuludq 32(%edx),%xmm5
2324 paddq %xmm7,%xmm0
2325 pmuludq 48(%edx),%xmm6
/* Full 128-bit mask this time (both qword lanes are live). */
2326 movdqa 64(%ebx),%xmm7
2327 paddq %xmm5,%xmm1
2328 paddq %xmm6,%xmm2
/* Carry propagation: each limb is cut back to 26 bits and the excess is
 * added to the next limb.  The carry out of limb 4 is added to limb 0 five
 * times in total (once, then again shifted left by 2). */
2329 movdqa %xmm3,%xmm5
2330 pand %xmm7,%xmm3
2331 psrlq $26,%xmm5
2332 paddq %xmm4,%xmm5
2333 movdqa %xmm0,%xmm6
2334 pand %xmm7,%xmm0
2335 psrlq $26,%xmm6
2336 movdqa %xmm5,%xmm4
2337 paddq %xmm1,%xmm6
2338 psrlq $26,%xmm5
2339 pand %xmm7,%xmm4
2340 movdqa %xmm6,%xmm1
2341 psrlq $26,%xmm6
2342 paddd %xmm5,%xmm0
2343 psllq $2,%xmm5
2344 paddq %xmm2,%xmm6
2345 paddq %xmm0,%xmm5
2346 pand %xmm7,%xmm1
2347 movdqa %xmm6,%xmm2
2348 psrlq $26,%xmm6
2349 pand %xmm7,%xmm2
2350 paddd %xmm3,%xmm6
2351 movdqa %xmm5,%xmm0
2352 psrlq $26,%xmm5
2353 movdqa %xmm6,%xmm3
2354 psrlq $26,%xmm6
2355 pand %xmm7,%xmm0
2356 paddd %xmm5,%xmm1
2357 pand %xmm7,%xmm3
2358 paddd %xmm6,%xmm4
/* After the first pass, pair the fresh result (low qword) with the limbs
 * saved at 0..64(%esp) (high qword) and go round once more. */
2359 decl %ecx
2360 jz .L006square_break
2361 punpcklqdq (%esp),%xmm0
2362 punpcklqdq 16(%esp),%xmm1
2363 punpcklqdq 32(%esp),%xmm2
2364 punpcklqdq 48(%esp),%xmm3
2365 punpcklqdq 64(%esp),%xmm4
2366 jmp .L005square
2367 .L006square_break:
/* Move the second-pass results into the high dword of each qword, merge
 * with the limbs saved at 0..64(%esp), then reorder the four dwords with
 * pshufd $141.
 * NOTE(review): each row then appears to hold one limb of four successive
 * powers of the input value -- confirm against the perlasm source. */
2368 psllq $32,%xmm0
2369 psllq $32,%xmm1
2370 psllq $32,%xmm2
2371 psllq $32,%xmm3
2372 psllq $32,%xmm4
2373 por (%esp),%xmm0
2374 por 16(%esp),%xmm1
2375 por 32(%esp),%xmm2
2376 por 48(%esp),%xmm3
2377 por 64(%esp),%xmm4
2378 pshufd $141,%xmm0,%xmm0
2379 pshufd $141,%xmm1,%xmm1
2380 pshufd $141,%xmm2,%xmm2
2381 pshufd $141,%xmm3,%xmm3
2382 pshufd $141,%xmm4,%xmm4
/* Rows 0-4: the five limbs. */
2383 movdqu %xmm0,(%edi)
2384 movdqu %xmm1,16(%edi)
2385 movdqu %xmm2,32(%edi)
2386 movdqu %xmm3,48(%edi)
2387 movdqu %xmm4,64(%edi)
/* Rows 5-8: limbs 1-4 multiplied by 5 (x<<2 + x). */
2388 movdqa %xmm1,%xmm6
2389 movdqa %xmm2,%xmm5
2390 pslld $2,%xmm6
2391 pslld $2,%xmm5
2392 paddd %xmm1,%xmm6
2393 paddd %xmm2,%xmm5
2394 movdqu %xmm6,80(%edi)
2395 movdqu %xmm5,96(%edi)
2396 movdqa %xmm3,%xmm6
2397 movdqa %xmm4,%xmm5
2398 pslld $2,%xmm6
2399 pslld $2,%xmm5
2400 paddd %xmm3,%xmm6
2401 paddd %xmm4,%xmm5
2402 movdqu %xmm6,112(%edi)
2403 movdqu %xmm5,128(%edi)
/* Drop the scratch frame and undo the +48 applied to %edi on entry. */
2404 movl %ebp,%esp
2405 leal -48(%edi),%edi
2406 ret
2407 .size _poly1305_init_sse2,.-_poly1305_init_sse2
2408 .align 32
2409 .type _poly1305_blocks_sse2,@function
2410 .align 16
2411 _poly1305_blocks_sse2:
2412 pushl %ebp
2413 pushl %ebx
2414 pushl %esi
2415 pushl %edi
2416 movl 20(%esp),%edi
2417 movl 24(%esp),%esi
2418 movl 28(%esp),%ecx
2419 movl 20(%edi),%eax
2420 andl $-16,%ecx
2421 jz .L007nodata
2422 cmpl $64,%ecx
2423 jae .L008enter_sse2
2424 testl %eax,%eax
2425 jz .Lenter_blocks
2426 .align 16
2427 .L008enter_sse2:
2428 call .L009pic_point
2429 .L009pic_point:
2430 popl %ebx
2431 leal .Lconst_sse2-.L009pic_point(%ebx),%ebx
2432 testl %eax,%eax
2433 jnz .L010base2_26
2434 call _poly1305_init_sse2
2435 movl (%edi),%eax
2436 movl 3(%edi),%ecx
2437 movl 6(%edi),%edx
2438 movl 9(%edi),%esi
2439 movl 13(%edi),%ebp
2440 movl $1,20(%edi)
2441 shrl $2,%ecx
2442 andl $67108863,%eax
2443 shrl $4,%edx
2444 andl $67108863,%ecx
2445 shrl $6,%esi
2446 andl $67108863,%edx
2447 movd %eax,%xmm0
2448 movd %ecx,%xmm1
2449 movd %edx,%xmm2
2450 movd %esi,%xmm3
2451 movd %ebp,%xmm4
2452 movl 24(%esp),%esi
2453 movl 28(%esp),%ecx
2454 jmp .L011base2_32
2455 .align 16
2456 .L010base2_26:
2457 movd (%edi),%xmm0
2458 movd 4(%edi),%xmm1
2459 movd 8(%edi),%xmm2
2460 movd 12(%edi),%xmm3
2461 movd 16(%edi),%xmm4
2462 movdqa 64(%ebx),%xmm7
2463 .L011base2_32:
2464 movl 32(%esp),%eax
2465 movl %esp,%ebp
2466 subl $528,%esp
2467 andl $-16,%esp
2468 leal 48(%edi),%edi
2469 shll $24,%eax
2470 testl $31,%ecx
2471 jz .L012even
2472 movdqu (%esi),%xmm6
2473 leal 16(%esi),%esi
2474 movdqa %xmm6,%xmm5
2475 pand %xmm7,%xmm6
2476 paddd %xmm6,%xmm0
2477 movdqa %xmm5,%xmm6
2478 psrlq $26,%xmm5
2479 psrldq $6,%xmm6
2480 pand %xmm7,%xmm5
2481 paddd %xmm5,%xmm1
2482 movdqa %xmm6,%xmm5
2483 psrlq $4,%xmm6
2484 pand %xmm7,%xmm6
2485 paddd %xmm6,%xmm2
2486 movdqa %xmm5,%xmm6
2487 psrlq $30,%xmm5
2488 pand %xmm7,%xmm5
2489 psrldq $7,%xmm6
2490 paddd %xmm5,%xmm3
2491 movd %eax,%xmm5
2492 paddd %xmm6,%xmm4
2493 movd 12(%edi),%xmm6
2494 paddd %xmm5,%xmm4
2495 movdqa %xmm0,(%esp)
2496 movdqa %xmm1,16(%esp)
2497 movdqa %xmm2,32(%esp)
2498 movdqa %xmm3,48(%esp)
2499 movdqa %xmm4,64(%esp)
2500 pmuludq %xmm6,%xmm0
2501 pmuludq %xmm6,%xmm1
2502 pmuludq %xmm6,%xmm2
2503 movd 28(%edi),%xmm5
2504 pmuludq %xmm6,%xmm3
2505 pmuludq %xmm6,%xmm4
2506 movdqa %xmm5,%xmm6
2507 pmuludq 48(%esp),%xmm5
2508 movdqa %xmm6,%xmm7
2509 pmuludq 32(%esp),%xmm6
2510 paddq %xmm5,%xmm4
2511 movdqa %xmm7,%xmm5
2512 pmuludq 16(%esp),%xmm7
2513 paddq %xmm6,%xmm3
2514 movd 92(%edi),%xmm6
2515 pmuludq (%esp),%xmm5
2516 paddq %xmm7,%xmm2
2517 pmuludq 64(%esp),%xmm6
2518 movd 44(%edi),%xmm7
2519 paddq %xmm5,%xmm1
2520 movdqa %xmm7,%xmm5
2521 pmuludq 32(%esp),%xmm7
2522 paddq %xmm6,%xmm0
2523 movdqa %xmm5,%xmm6
2524 pmuludq 16(%esp),%xmm5
2525 paddq %xmm7,%xmm4
2526 movd 108(%edi),%xmm7
2527 pmuludq (%esp),%xmm6
2528 paddq %xmm5,%xmm3
2529 movdqa %xmm7,%xmm5
2530 pmuludq 64(%esp),%xmm7
2531 paddq %xmm6,%xmm2
2532 pmuludq 48(%esp),%xmm5
2533 movd 60(%edi),%xmm6
2534 paddq %xmm7,%xmm1
2535 movdqa %xmm6,%xmm7
2536 pmuludq 16(%esp),%xmm6
2537 paddq %xmm5,%xmm0
2538 movd 124(%edi),%xmm5
2539 pmuludq (%esp),%xmm7
2540 paddq %xmm6,%xmm4
2541 movdqa %xmm5,%xmm6
2542 pmuludq 64(%esp),%xmm5
2543 paddq %xmm7,%xmm3
2544 movdqa %xmm6,%xmm7
2545 pmuludq 48(%esp),%xmm6
2546 paddq %xmm5,%xmm2
2547 pmuludq 32(%esp),%xmm7
2548 movd 76(%edi),%xmm5
2549 paddq %xmm6,%xmm1
2550 movd 140(%edi),%xmm6
2551 pmuludq (%esp),%xmm5
2552 paddq %xmm7,%xmm0
2553 movdqa %xmm6,%xmm7
2554 pmuludq 64(%esp),%xmm6
2555 paddq %xmm5,%xmm4
2556 movdqa %xmm7,%xmm5
2557 pmuludq 16(%esp),%xmm7
2558 paddq %xmm6,%xmm3
2559 movdqa %xmm5,%xmm6
2560 pmuludq 32(%esp),%xmm5
2561 paddq %xmm7,%xmm0
2562 pmuludq 48(%esp),%xmm6
2563 movdqa 64(%ebx),%xmm7
2564 paddq %xmm5,%xmm1
2565 paddq %xmm6,%xmm2
2566 movdqa %xmm3,%xmm5
2567 pand %xmm7,%xmm3
2568 psrlq $26,%xmm5
2569 paddq %xmm4,%xmm5
2570 movdqa %xmm0,%xmm6
2571 pand %xmm7,%xmm0
2572 psrlq $26,%xmm6
2573 movdqa %xmm5,%xmm4
2574 paddq %xmm1,%xmm6
2575 psrlq $26,%xmm5
2576 pand %xmm7,%xmm4
2577 movdqa %xmm6,%xmm1
2578 psrlq $26,%xmm6
2579 paddd %xmm5,%xmm0
2580 psllq $2,%xmm5
2581 paddq %xmm2,%xmm6
2582 paddq %xmm0,%xmm5
2583 pand %xmm7,%xmm1
2584 movdqa %xmm6,%xmm2
2585 psrlq $26,%xmm6
2586 pand %xmm7,%xmm2
2587 paddd %xmm3,%xmm6
2588 movdqa %xmm5,%xmm0
2589 psrlq $26,%xmm5
2590 movdqa %xmm6,%xmm3
2591 psrlq $26,%xmm6
2592 pand %xmm7,%xmm0
2593 paddd %xmm5,%xmm1
2594 pand %xmm7,%xmm3
2595 paddd %xmm6,%xmm4
2596 subl $16,%ecx
2597 jz .L013done
2598 .L012even:
2599 leal 384(%esp),%edx
2600 leal -32(%esi),%eax
2601 subl $64,%ecx
2602 movdqu (%edi),%xmm5
2603 pshufd $68,%xmm5,%xmm6
2604 cmovbl %eax,%esi
2605 pshufd $238,%xmm5,%xmm5
2606 movdqa %xmm6,(%edx)
2607 leal 160(%esp),%eax
2608 movdqu 16(%edi),%xmm6
2609 movdqa %xmm5,-144(%edx)
2610 pshufd $68,%xmm6,%xmm5
2611 pshufd $238,%xmm6,%xmm6
2612 movdqa %xmm5,16(%edx)
2613 movdqu 32(%edi),%xmm5
2614 movdqa %xmm6,-128(%edx)
2615 pshufd $68,%xmm5,%xmm6
2616 pshufd $238,%xmm5,%xmm5
2617 movdqa %xmm6,32(%edx)
2618 movdqu 48(%edi),%xmm6
2619 movdqa %xmm5,-112(%edx)
2620 pshufd $68,%xmm6,%xmm5
2621 pshufd $238,%xmm6,%xmm6
2622 movdqa %xmm5,48(%edx)
2623 movdqu 64(%edi),%xmm5
2624 movdqa %xmm6,-96(%edx)
2625 pshufd $68,%xmm5,%xmm6
2626 pshufd $238,%xmm5,%xmm5
2627 movdqa %xmm6,64(%edx)
2628 movdqu 80(%edi),%xmm6
2629 movdqa %xmm5,-80(%edx)
2630 pshufd $68,%xmm6,%xmm5
2631 pshufd $238,%xmm6,%xmm6
2632 movdqa %xmm5,80(%edx)
2633 movdqu 96(%edi),%xmm5
2634 movdqa %xmm6,-64(%edx)
2635 pshufd $68,%xmm5,%xmm6
2636 pshufd $238,%xmm5,%xmm5
2637 movdqa %xmm6,96(%edx)
2638 movdqu 112(%edi),%xmm6
2639 movdqa %xmm5,-48(%edx)
2640 pshufd $68,%xmm6,%xmm5
2641 pshufd $238,%xmm6,%xmm6
2642 movdqa %xmm5,112(%edx)
2643 movdqu 128(%edi),%xmm5
2644 movdqa %xmm6,-32(%edx)
2645 pshufd $68,%xmm5,%xmm6
2646 pshufd $238,%xmm5,%xmm5
2647 movdqa %xmm6,128(%edx)
2648 movdqa %xmm5,-16(%edx)
2649 movdqu 32(%esi),%xmm5
2650 movdqu 48(%esi),%xmm6
2651 leal 32(%esi),%esi
2652 movdqa %xmm2,112(%esp)
2653 movdqa %xmm3,128(%esp)
2654 movdqa %xmm4,144(%esp)
2655 movdqa %xmm5,%xmm2
2656 movdqa %xmm6,%xmm3
2657 psrldq $6,%xmm2
2658 psrldq $6,%xmm3
2659 movdqa %xmm5,%xmm4
2660 punpcklqdq %xmm3,%xmm2
2661 punpckhqdq %xmm6,%xmm4
2662 punpcklqdq %xmm6,%xmm5
2663 movdqa %xmm2,%xmm3
2664 psrlq $4,%xmm2
2665 psrlq $30,%xmm3
2666 movdqa %xmm5,%xmm6
2667 psrlq $40,%xmm4
2668 psrlq $26,%xmm6
2669 pand %xmm7,%xmm5
2670 pand %xmm7,%xmm6
2671 pand %xmm7,%xmm2
2672 pand %xmm7,%xmm3
2673 por (%ebx),%xmm4
2674 movdqa %xmm0,80(%esp)
2675 movdqa %xmm1,96(%esp)
2676 jbe .L014skip_loop
2677 jmp .L015loop
2678 .align 32
2679 .L015loop:
2680 movdqa -144(%edx),%xmm7
2681 movdqa %xmm6,16(%eax)
2682 movdqa %xmm2,32(%eax)
2683 movdqa %xmm3,48(%eax)
2684 movdqa %xmm4,64(%eax)
2685 movdqa %xmm5,%xmm1
2686 pmuludq %xmm7,%xmm5
2687 movdqa %xmm6,%xmm0
2688 pmuludq %xmm7,%xmm6
2689 pmuludq %xmm7,%xmm2
2690 pmuludq %xmm7,%xmm3
2691 pmuludq %xmm7,%xmm4
2692 pmuludq -16(%edx),%xmm0
2693 movdqa %xmm1,%xmm7
2694 pmuludq -128(%edx),%xmm1
2695 paddq %xmm5,%xmm0
2696 movdqa %xmm7,%xmm5
2697 pmuludq -112(%edx),%xmm7
2698 paddq %xmm6,%xmm1
2699 movdqa %xmm5,%xmm6
2700 pmuludq -96(%edx),%xmm5
2701 paddq %xmm7,%xmm2
2702 movdqa 16(%eax),%xmm7
2703 pmuludq -80(%edx),%xmm6
2704 paddq %xmm5,%xmm3
2705 movdqa %xmm7,%xmm5
2706 pmuludq -128(%edx),%xmm7
2707 paddq %xmm6,%xmm4
2708 movdqa %xmm5,%xmm6
2709 pmuludq -112(%edx),%xmm5
2710 paddq %xmm7,%xmm2
2711 movdqa 32(%eax),%xmm7
2712 pmuludq -96(%edx),%xmm6
2713 paddq %xmm5,%xmm3
2714 movdqa %xmm7,%xmm5
2715 pmuludq -32(%edx),%xmm7
2716 paddq %xmm6,%xmm4
2717 movdqa %xmm5,%xmm6
2718 pmuludq -16(%edx),%xmm5
2719 paddq %xmm7,%xmm0
2720 movdqa %xmm6,%xmm7
2721 pmuludq -128(%edx),%xmm6
2722 paddq %xmm5,%xmm1
2723 movdqa 48(%eax),%xmm5
2724 pmuludq -112(%edx),%xmm7
2725 paddq %xmm6,%xmm3
2726 movdqa %xmm5,%xmm6
2727 pmuludq -48(%edx),%xmm5
2728 paddq %xmm7,%xmm4
2729 movdqa %xmm6,%xmm7
2730 pmuludq -32(%edx),%xmm6
2731 paddq %xmm5,%xmm0
2732 movdqa %xmm7,%xmm5
2733 pmuludq -16(%edx),%xmm7
2734 paddq %xmm6,%xmm1
2735 movdqa 64(%eax),%xmm6
2736 pmuludq -128(%edx),%xmm5
2737 paddq %xmm7,%xmm2
2738 movdqa %xmm6,%xmm7
2739 pmuludq -16(%edx),%xmm6
2740 paddq %xmm5,%xmm4
2741 movdqa %xmm7,%xmm5
2742 pmuludq -64(%edx),%xmm7
2743 paddq %xmm6,%xmm3
2744 movdqa %xmm5,%xmm6
2745 pmuludq -48(%edx),%xmm5
2746 paddq %xmm7,%xmm0
2747 movdqa 64(%ebx),%xmm7
2748 pmuludq -32(%edx),%xmm6
2749 paddq %xmm5,%xmm1
2750 paddq %xmm6,%xmm2
2751 movdqu -32(%esi),%xmm5
2752 movdqu -16(%esi),%xmm6
2753 leal 32(%esi),%esi
2754 movdqa %xmm2,32(%esp)
2755 movdqa %xmm3,48(%esp)
2756 movdqa %xmm4,64(%esp)
2757 movdqa %xmm5,%xmm2
2758 movdqa %xmm6,%xmm3
2759 psrldq $6,%xmm2
2760 psrldq $6,%xmm3
2761 movdqa %xmm5,%xmm4
2762 punpcklqdq %xmm3,%xmm2
2763 punpckhqdq %xmm6,%xmm4
2764 punpcklqdq %xmm6,%xmm5
2765 movdqa %xmm2,%xmm3
2766 psrlq $4,%xmm2
2767 psrlq $30,%xmm3
2768 movdqa %xmm5,%xmm6
2769 psrlq $40,%xmm4
2770 psrlq $26,%xmm6
2771 pand %xmm7,%xmm5
2772 pand %xmm7,%xmm6
2773 pand %xmm7,%xmm2
2774 pand %xmm7,%xmm3
2775 por (%ebx),%xmm4
2776 leal -32(%esi),%eax
2777 subl $64,%ecx
2778 paddd 80(%esp),%xmm5
2779 paddd 96(%esp),%xmm6
2780 paddd 112(%esp),%xmm2
2781 paddd 128(%esp),%xmm3
2782 paddd 144(%esp),%xmm4
2783 cmovbl %eax,%esi
2784 leal 160(%esp),%eax
2785 movdqa (%edx),%xmm7
2786 movdqa %xmm1,16(%esp)
2787 movdqa %xmm6,16(%eax)
2788 movdqa %xmm2,32(%eax)
2789 movdqa %xmm3,48(%eax)
2790 movdqa %xmm4,64(%eax)
2791 movdqa %xmm5,%xmm1
2792 pmuludq %xmm7,%xmm5
2793 paddq %xmm0,%xmm5
2794 movdqa %xmm6,%xmm0
2795 pmuludq %xmm7,%xmm6
2796 pmuludq %xmm7,%xmm2
2797 pmuludq %xmm7,%xmm3
2798 pmuludq %xmm7,%xmm4
2799 paddq 16(%esp),%xmm6
2800 paddq 32(%esp),%xmm2
2801 paddq 48(%esp),%xmm3
2802 paddq 64(%esp),%xmm4
2803 pmuludq 128(%edx),%xmm0
2804 movdqa %xmm1,%xmm7
2805 pmuludq 16(%edx),%xmm1
2806 paddq %xmm5,%xmm0
2807 movdqa %xmm7,%xmm5
2808 pmuludq 32(%edx),%xmm7
2809 paddq %xmm6,%xmm1
2810 movdqa %xmm5,%xmm6
2811 pmuludq 48(%edx),%xmm5
2812 paddq %xmm7,%xmm2
2813 movdqa 16(%eax),%xmm7
2814 pmuludq 64(%edx),%xmm6
2815 paddq %xmm5,%xmm3
2816 movdqa %xmm7,%xmm5
2817 pmuludq 16(%edx),%xmm7
2818 paddq %xmm6,%xmm4
2819 movdqa %xmm5,%xmm6
2820 pmuludq 32(%edx),%xmm5
2821 paddq %xmm7,%xmm2
2822 movdqa 32(%eax),%xmm7
2823 pmuludq 48(%edx),%xmm6
2824 paddq %xmm5,%xmm3
2825 movdqa %xmm7,%xmm5
2826 pmuludq 112(%edx),%xmm7
2827 paddq %xmm6,%xmm4
2828 movdqa %xmm5,%xmm6
2829 pmuludq 128(%edx),%xmm5
2830 paddq %xmm7,%xmm0
2831 movdqa %xmm6,%xmm7
2832 pmuludq 16(%edx),%xmm6
2833 paddq %xmm5,%xmm1
2834 movdqa 48(%eax),%xmm5
2835 pmuludq 32(%edx),%xmm7
2836 paddq %xmm6,%xmm3
2837 movdqa %xmm5,%xmm6
2838 pmuludq 96(%edx),%xmm5
2839 paddq %xmm7,%xmm4
2840 movdqa %xmm6,%xmm7
2841 pmuludq 112(%edx),%xmm6
2842 paddq %xmm5,%xmm0
2843 movdqa %xmm7,%xmm5
2844 pmuludq 128(%edx),%xmm7
2845 paddq %xmm6,%xmm1
2846 movdqa 64(%eax),%xmm6
2847 pmuludq 16(%edx),%xmm5
2848 paddq %xmm7,%xmm2
2849 movdqa %xmm6,%xmm7
2850 pmuludq 128(%edx),%xmm6
2851 paddq %xmm5,%xmm4
2852 movdqa %xmm7,%xmm5
2853 pmuludq 80(%edx),%xmm7
2854 paddq %xmm6,%xmm3
2855 movdqa %xmm5,%xmm6
2856 pmuludq 96(%edx),%xmm5
2857 paddq %xmm7,%xmm0
2858 movdqa 64(%ebx),%xmm7
2859 pmuludq 112(%edx),%xmm6
2860 paddq %xmm5,%xmm1
2861 paddq %xmm6,%xmm2
2862 movdqa %xmm3,%xmm5
2863 pand %xmm7,%xmm3
2864 psrlq $26,%xmm5
2865 paddq %xmm4,%xmm5
2866 movdqa %xmm0,%xmm6
2867 pand %xmm7,%xmm0
2868 psrlq $26,%xmm6
2869 movdqa %xmm5,%xmm4
2870 paddq %xmm1,%xmm6
2871 psrlq $26,%xmm5
2872 pand %xmm7,%xmm4
2873 movdqa %xmm6,%xmm1
2874 psrlq $26,%xmm6
2875 paddd %xmm5,%xmm0
2876 psllq $2,%xmm5
2877 paddq %xmm2,%xmm6
2878 paddq %xmm0,%xmm5
2879 pand %xmm7,%xmm1
2880 movdqa %xmm6,%xmm2
2881 psrlq $26,%xmm6
2882 pand %xmm7,%xmm2
2883 paddd %xmm3,%xmm6
2884 movdqa %xmm5,%xmm0
2885 psrlq $26,%xmm5
2886 movdqa %xmm6,%xmm3
2887 psrlq $26,%xmm6
2888 pand %xmm7,%xmm0
2889 paddd %xmm5,%xmm1
2890 pand %xmm7,%xmm3
2891 paddd %xmm6,%xmm4
2892 movdqu 32(%esi),%xmm5
2893 movdqu 48(%esi),%xmm6
2894 leal 32(%esi),%esi
2895 movdqa %xmm2,112(%esp)
2896 movdqa %xmm3,128(%esp)
2897 movdqa %xmm4,144(%esp)
2898 movdqa %xmm5,%xmm2
2899 movdqa %xmm6,%xmm3
2900 psrldq $6,%xmm2
2901 psrldq $6,%xmm3
2902 movdqa %xmm5,%xmm4
2903 punpcklqdq %xmm3,%xmm2
2904 punpckhqdq %xmm6,%xmm4
2905 punpcklqdq %xmm6,%xmm5
2906 movdqa %xmm2,%xmm3
2907 psrlq $4,%xmm2
2908 psrlq $30,%xmm3
2909 movdqa %xmm5,%xmm6
2910 psrlq $40,%xmm4
2911 psrlq $26,%xmm6
2912 pand %xmm7,%xmm5
2913 pand %xmm7,%xmm6
2914 pand %xmm7,%xmm2
2915 pand %xmm7,%xmm3
2916 por (%ebx),%xmm4
2917 movdqa %xmm0,80(%esp)
2918 movdqa %xmm1,96(%esp)
2919 ja .L015loop
2920 .L014skip_loop:
2921 pshufd $16,-144(%edx),%xmm7
2922 addl $32,%ecx
2923 jnz .L016long_tail
2924 paddd %xmm0,%xmm5
2925 paddd %xmm1,%xmm6
2926 paddd 112(%esp),%xmm2
2927 paddd 128(%esp),%xmm3
2928 paddd 144(%esp),%xmm4
2929 .L016long_tail:
2930 movdqa %xmm5,(%eax)
2931 movdqa %xmm6,16(%eax)
2932 movdqa %xmm2,32(%eax)
2933 movdqa %xmm3,48(%eax)
2934 movdqa %xmm4,64(%eax)
2935 pmuludq %xmm7,%xmm5
2936 pmuludq %xmm7,%xmm6
2937 pmuludq %xmm7,%xmm2
2938 movdqa %xmm5,%xmm0
2939 pshufd $16,-128(%edx),%xmm5
2940 pmuludq %xmm7,%xmm3
2941 movdqa %xmm6,%xmm1
2942 pmuludq %xmm7,%xmm4
2943 movdqa %xmm5,%xmm6
2944 pmuludq 48(%eax),%xmm5
2945 movdqa %xmm6,%xmm7
2946 pmuludq 32(%eax),%xmm6
2947 paddq %xmm5,%xmm4
2948 movdqa %xmm7,%xmm5
2949 pmuludq 16(%eax),%xmm7
2950 paddq %xmm6,%xmm3
2951 pshufd $16,-64(%edx),%xmm6
2952 pmuludq (%eax),%xmm5
2953 paddq %xmm7,%xmm2
2954 pmuludq 64(%eax),%xmm6
2955 pshufd $16,-112(%edx),%xmm7
2956 paddq %xmm5,%xmm1
2957 movdqa %xmm7,%xmm5
2958 pmuludq 32(%eax),%xmm7
2959 paddq %xmm6,%xmm0
2960 movdqa %xmm5,%xmm6
2961 pmuludq 16(%eax),%xmm5
2962 paddq %xmm7,%xmm4
2963 pshufd $16,-48(%edx),%xmm7
2964 pmuludq (%eax),%xmm6
2965 paddq %xmm5,%xmm3
2966 movdqa %xmm7,%xmm5
2967 pmuludq 64(%eax),%xmm7
2968 paddq %xmm6,%xmm2
2969 pmuludq 48(%eax),%xmm5
2970 pshufd $16,-96(%edx),%xmm6
2971 paddq %xmm7,%xmm1
2972 movdqa %xmm6,%xmm7
2973 pmuludq 16(%eax),%xmm6
2974 paddq %xmm5,%xmm0
2975 pshufd $16,-32(%edx),%xmm5
2976 pmuludq (%eax),%xmm7
2977 paddq %xmm6,%xmm4
2978 movdqa %xmm5,%xmm6
2979 pmuludq 64(%eax),%xmm5
2980 paddq %xmm7,%xmm3
2981 movdqa %xmm6,%xmm7
2982 pmuludq 48(%eax),%xmm6
2983 paddq %xmm5,%xmm2
2984 pmuludq 32(%eax),%xmm7
2985 pshufd $16,-80(%edx),%xmm5
2986 paddq %xmm6,%xmm1
2987 pshufd $16,-16(%edx),%xmm6
2988 pmuludq (%eax),%xmm5
2989 paddq %xmm7,%xmm0
2990 movdqa %xmm6,%xmm7
2991 pmuludq 64(%eax),%xmm6
2992 paddq %xmm5,%xmm4
2993 movdqa %xmm7,%xmm5
2994 pmuludq 16(%eax),%xmm7
2995 paddq %xmm6,%xmm3
2996 movdqa %xmm5,%xmm6
2997 pmuludq 32(%eax),%xmm5
2998 paddq %xmm7,%xmm0
2999 pmuludq 48(%eax),%xmm6
3000 movdqa 64(%ebx),%xmm7
3001 paddq %xmm5,%xmm1
3002 paddq %xmm6,%xmm2
3003 jz .L017short_tail
3004 movdqu -32(%esi),%xmm5
3005 movdqu -16(%esi),%xmm6
3006 leal 32(%esi),%esi
3007 movdqa %xmm2,32(%esp)
3008 movdqa %xmm3,48(%esp)
3009 movdqa %xmm4,64(%esp)
3010 movdqa %xmm5,%xmm2
3011 movdqa %xmm6,%xmm3
3012 psrldq $6,%xmm2
3013 psrldq $6,%xmm3
3014 movdqa %xmm5,%xmm4
3015 punpcklqdq %xmm3,%xmm2
3016 punpckhqdq %xmm6,%xmm4
3017 punpcklqdq %xmm6,%xmm5
3018 movdqa %xmm2,%xmm3
3019 psrlq $4,%xmm2
3020 psrlq $30,%xmm3
3021 movdqa %xmm5,%xmm6
3022 psrlq $40,%xmm4
3023 psrlq $26,%xmm6
3024 pand %xmm7,%xmm5
3025 pand %xmm7,%xmm6
3026 pand %xmm7,%xmm2
3027 pand %xmm7,%xmm3
3028 por (%ebx),%xmm4
3029 pshufd $16,(%edx),%xmm7
3030 paddd 80(%esp),%xmm5
3031 paddd 96(%esp),%xmm6
3032 paddd 112(%esp),%xmm2
3033 paddd 128(%esp),%xmm3
3034 paddd 144(%esp),%xmm4
3035 movdqa %xmm5,(%esp)
3036 pmuludq %xmm7,%xmm5
3037 movdqa %xmm6,16(%esp)
3038 pmuludq %xmm7,%xmm6
3039 paddq %xmm5,%xmm0
3040 movdqa %xmm2,%xmm5
3041 pmuludq %xmm7,%xmm2
3042 paddq %xmm6,%xmm1
3043 movdqa %xmm3,%xmm6
3044 pmuludq %xmm7,%xmm3
3045 paddq 32(%esp),%xmm2
3046 movdqa %xmm5,32(%esp)
3047 pshufd $16,16(%edx),%xmm5
3048 paddq 48(%esp),%xmm3
3049 movdqa %xmm6,48(%esp)
3050 movdqa %xmm4,%xmm6
3051 pmuludq %xmm7,%xmm4
3052 paddq 64(%esp),%xmm4
3053 movdqa %xmm6,64(%esp)
3054 movdqa %xmm5,%xmm6
3055 pmuludq 48(%esp),%xmm5
3056 movdqa %xmm6,%xmm7
3057 pmuludq 32(%esp),%xmm6
3058 paddq %xmm5,%xmm4
3059 movdqa %xmm7,%xmm5
3060 pmuludq 16(%esp),%xmm7
3061 paddq %xmm6,%xmm3
3062 pshufd $16,80(%edx),%xmm6
3063 pmuludq (%esp),%xmm5
3064 paddq %xmm7,%xmm2
3065 pmuludq 64(%esp),%xmm6
3066 pshufd $16,32(%edx),%xmm7
3067 paddq %xmm5,%xmm1
3068 movdqa %xmm7,%xmm5
3069 pmuludq 32(%esp),%xmm7
3070 paddq %xmm6,%xmm0
3071 movdqa %xmm5,%xmm6
3072 pmuludq 16(%esp),%xmm5
3073 paddq %xmm7,%xmm4
3074 pshufd $16,96(%edx),%xmm7
3075 pmuludq (%esp),%xmm6
3076 paddq %xmm5,%xmm3
3077 movdqa %xmm7,%xmm5
3078 pmuludq 64(%esp),%xmm7
3079 paddq %xmm6,%xmm2
3080 pmuludq 48(%esp),%xmm5
3081 pshufd $16,48(%edx),%xmm6
3082 paddq %xmm7,%xmm1
3083 movdqa %xmm6,%xmm7
3084 pmuludq 16(%esp),%xmm6
3085 paddq %xmm5,%xmm0
3086 pshufd $16,112(%edx),%xmm5
3087 pmuludq (%esp),%xmm7
3088 paddq %xmm6,%xmm4
3089 movdqa %xmm5,%xmm6
3090 pmuludq 64(%esp),%xmm5
3091 paddq %xmm7,%xmm3
3092 movdqa %xmm6,%xmm7
3093 pmuludq 48(%esp),%xmm6
3094 paddq %xmm5,%xmm2
3095 pmuludq 32(%esp),%xmm7
3096 pshufd $16,64(%edx),%xmm5
3097 paddq %xmm6,%xmm1
3098 pshufd $16,128(%edx),%xmm6
3099 pmuludq (%esp),%xmm5
3100 paddq %xmm7,%xmm0
3101 movdqa %xmm6,%xmm7
3102 pmuludq 64(%esp),%xmm6
3103 paddq %xmm5,%xmm4
3104 movdqa %xmm7,%xmm5
3105 pmuludq 16(%esp),%xmm7
3106 paddq %xmm6,%xmm3
3107 movdqa %xmm5,%xmm6
3108 pmuludq 32(%esp),%xmm5
3109 paddq %xmm7,%xmm0
3110 pmuludq 48(%esp),%xmm6
3111 movdqa 64(%ebx),%xmm7
3112 paddq %xmm5,%xmm1
3113 paddq %xmm6,%xmm2
3114 .L017short_tail:
3115 pshufd $78,%xmm4,%xmm6
3116 pshufd $78,%xmm3,%xmm5
3117 paddq %xmm6,%xmm4
3118 paddq %xmm5,%xmm3
3119 pshufd $78,%xmm0,%xmm6
3120 pshufd $78,%xmm1,%xmm5
3121 paddq %xmm6,%xmm0
3122 paddq %xmm5,%xmm1
3123 pshufd $78,%xmm2,%xmm6
3124 movdqa %xmm3,%xmm5
3125 pand %xmm7,%xmm3
3126 psrlq $26,%xmm5
3127 paddq %xmm6,%xmm2
3128 paddq %xmm4,%xmm5
3129 movdqa %xmm0,%xmm6
3130 pand %xmm7,%xmm0
3131 psrlq $26,%xmm6
3132 movdqa %xmm5,%xmm4
3133 paddq %xmm1,%xmm6
3134 psrlq $26,%xmm5
3135 pand %xmm7,%xmm4
3136 movdqa %xmm6,%xmm1
3137 psrlq $26,%xmm6
3138 paddd %xmm5,%xmm0
3139 psllq $2,%xmm5
3140 paddq %xmm2,%xmm6
3141 paddq %xmm0,%xmm5
3142 pand %xmm7,%xmm1
3143 movdqa %xmm6,%xmm2
3144 psrlq $26,%xmm6
3145 pand %xmm7,%xmm2
3146 paddd %xmm3,%xmm6
3147 movdqa %xmm5,%xmm0
3148 psrlq $26,%xmm5
3149 movdqa %xmm6,%xmm3
3150 psrlq $26,%xmm6
3151 pand %xmm7,%xmm0
3152 paddd %xmm5,%xmm1
3153 pand %xmm7,%xmm3
3154 paddd %xmm6,%xmm4
3155 .L013done:
3156 movd %xmm0,-48(%edi)
3157 movd %xmm1,-44(%edi)
3158 movd %xmm2,-40(%edi)
3159 movd %xmm3,-36(%edi)
3160 movd %xmm4,-32(%edi)
3161 movl %ebp,%esp
3162 .L007nodata:
3163 popl %edi
3164 popl %esi
3165 popl %ebx
3166 popl %ebp
3167 ret
3168 .size _poly1305_blocks_sse2,.-_poly1305_blocks_sse2
/*
 * _poly1305_emit_sse2
 *
 * Stack arguments, as read after the four register pushes below:
 *   20(%esp)  pointer to the state block holding the hash limbs
 *   24(%esp)  pointer to the 16-byte output buffer
 *   28(%esp)  pointer to four 32-bit words that are added to the result
 *
 * If the dword at state+20 is zero, control transfers to .Lenter_emit
 * (defined outside this function) with the four registers still pushed and
 * %ebp holding the state pointer.  Otherwise the five dwords at state+0..16
 * are treated as 26-bit limbs, repacked into 32-bit words, reduced modulo
 * 2^130-5, summed with the four words at 28(%esp) and stored to the output.
 * %xmm0-%xmm3 are used as scratch; %ebp/%ebx/%esi/%edi are restored.
 */
3169 .align 32
3170 .type _poly1305_emit_sse2,@function
3171 .align 16
3172 _poly1305_emit_sse2:
3173 pushl %ebp
3174 pushl %ebx
3175 pushl %esi
3176 pushl %edi
/* %ebp = first argument (state pointer). */
3177 movl 20(%esp),%ebp
/* Flag at state+20 clear: let .Lenter_emit handle the state instead. */
3178 cmpl $0,20(%ebp)
3179 je .Lenter_emit
/* Load the five limbs: l0=%eax l1=%edi l2=%ecx l3=%edx l4=%esi. */
3180 movl (%ebp),%eax
3181 movl 4(%ebp),%edi
3182 movl 8(%ebp),%ecx
3183 movl 12(%ebp),%edx
3184 movl 16(%ebp),%esi
/*
 * Repack 5 x 26-bit limbs into 32-bit words, propagating carries:
 *   %eax = l0 + (l1 << 26)
 *   %ebx = (l1 >> 6)  + carry + (l2 << 20)
 *   %ecx = (l2 >> 12) + carry + (l3 << 14)
 *   %edx = (l3 >> 18) + carry + (l4 << 8)
 *   %esi = (l4 >> 24) + carry            (bits 128 and up)
 * %edi is the scratch register for the shifted-left halves.
 */
3185 movl %edi,%ebx
3186 shll $26,%edi
3187 shrl $6,%ebx
3188 addl %edi,%eax
3189 movl %ecx,%edi
3190 adcl $0,%ebx
3191 shll $20,%edi
3192 shrl $12,%ecx
3193 addl %edi,%ebx
3194 movl %edx,%edi
3195 adcl $0,%ecx
3196 shll $14,%edi
3197 shrl $18,%edx
3198 addl %edi,%ecx
3199 movl %esi,%edi
3200 adcl $0,%edx
3201 shll $8,%edi
3202 shrl $24,%esi
3203 addl %edi,%edx
3204 adcl $0,%esi
/*
 * Fold everything at bit 130 and above back into the low word multiplied
 * by 5 (2^130 == 5 mod 2^130-5); %esi keeps only bits 128..129.
 * The loads of the output pointer (%edi) and of the third argument (%ebp)
 * are interleaved here; movl does not disturb the carry chain.
 */
3205 movl %esi,%edi
3206 andl $3,%esi
3207 shrl $2,%edi
3208 leal (%edi,%edi,4),%ebp
3209 movl 24(%esp),%edi
3210 addl %ebp,%eax
3211 movl 28(%esp),%ebp
3212 adcl $0,%ebx
3213 adcl $0,%ecx
3214 adcl $0,%edx
3215 adcl $0,%esi
/*
 * Keep a copy of h in %xmm0-%xmm3 and compute h+5 in the integer
 * registers.  After the shift, %esi is 1 exactly when h+5 carried into
 * bit 130, i.e. when h >= 2^130-5.
 */
3216 movd %eax,%xmm0
3217 addl $5,%eax
3218 movd %ebx,%xmm1
3219 adcl $0,%ebx
3220 movd %ecx,%xmm2
3221 adcl $0,%ecx
3222 movd %edx,%xmm3
3223 adcl $0,%edx
3224 adcl $0,%esi
3225 shrl $2,%esi
/*
 * Branch-free select: %esi becomes an all-ones mask when the carry
 * occurred.  (h+5) & mask is parked in the output buffer, the saved h is
 * reloaded and masked with ~mask, and the two halves are OR-ed together.
 */
3226 negl %esi
3227 andl %esi,%eax
3228 andl %esi,%ebx
3229 andl %esi,%ecx
3230 andl %esi,%edx
3231 movl %eax,(%edi)
3232 movd %xmm0,%eax
3233 movl %ebx,4(%edi)
3234 movd %xmm1,%ebx
3235 movl %ecx,8(%edi)
3236 movd %xmm2,%ecx
3237 movl %edx,12(%edi)
3238 movd %xmm3,%edx
3239 notl %esi
3240 andl %esi,%eax
3241 andl %esi,%ebx
3242 orl (%edi),%eax
3243 andl %esi,%ecx
3244 orl 4(%edi),%ebx
3245 andl %esi,%edx
3246 orl 8(%edi),%ecx
3247 orl 12(%edi),%edx
/*
 * Add the four words at the third argument as one 128-bit addition (the
 * final carry out is discarded) and store the 16-byte result.
 */
3248 addl (%ebp),%eax
3249 adcl 4(%ebp),%ebx
3250 movl %eax,(%edi)
3251 adcl 8(%ebp),%ecx
3252 movl %ebx,4(%edi)
3253 adcl 12(%ebp),%edx
3254 movl %ecx,8(%edi)
3255 movl %edx,12(%edi)
3256 popl %edi
3257 popl %esi
3258 popl %ebx
3259 popl %ebp
3260 ret
3261 .size _poly1305_emit_sse2,.-_poly1305_emit_sse2
/*
 * _poly1305_init_avx2
 *
 * Internal helper with a register-based interface (not a C-callable entry):
 *   in:   %edi = state pointer; the 16 bytes at state+24 are the masked key
 *                words that poly1305_init stores at 24..36(%edi)
 *         %ebx = address of .Lconst_sse2 (64(%ebx) is the 0x3ffffff mask);
 *                _poly1305_blocks_avx2 sets this up before calling
 *   out:  a table of ten 16-byte vectors written at state+48..state+191:
 *         five 26-bit limb vectors followed by 5x limbs 1..4
 *   clobbers: %ebp (used to save %esp), %ecx, %edx, %xmm0-%xmm7, flags
 *   %edi is advanced by 48 internally and restored before returning.
 *
 * The .L018square loop runs twice.  From the lane bookkeeping below, the
 * first pass yields r^2 and the second yields [r^4, r^3], so each stored
 * limb vector ends up holding the dwords r^4, r^3, r^2, r^1 (low to high).
 * NOTE(review): the power layout is derived from the shuffles in this
 * block; confirm against the generator before relying on it.
 */
3262 .align 32
3263 .type _poly1305_init_avx2,@function
3264 .align 16
3265 _poly1305_init_avx2:
/* Load the 128-bit key and point %edi at the table area (state+48). */
3266 vmovdqu 24(%edi),%xmm4
3267 leal 48(%edi),%edi
/* 224-byte scratch frame, 16-byte aligned; the original %esp lives in %ebp. */
3268 movl %esp,%ebp
3269 subl $224,%esp
3270 andl $-16,%esp
/*
 * Split the key into five 26-bit limbs held in the low qword of
 * %xmm0..%xmm4: bit offsets 0, 26, 52 (6-byte shift + 4), 78 (6-byte
 * shift + 30) and 104 (13-byte shift).  The upper qwords hold leftovers
 * that are replaced after the first pass.
 */
3271 vmovdqa 64(%ebx),%xmm7
3272 vpand %xmm7,%xmm4,%xmm0
3273 vpsrlq $26,%xmm4,%xmm1
3274 vpsrldq $6,%xmm4,%xmm3
3275 vpand %xmm7,%xmm1,%xmm1
3276 vpsrlq $4,%xmm3,%xmm2
3277 vpsrlq $30,%xmm3,%xmm3
3278 vpand %xmm7,%xmm2,%xmm2
3279 vpand %xmm7,%xmm3,%xmm3
3280 vpsrldq $13,%xmm4,%xmm4
/* %edx -> second scratch area (144(%esp)); %ecx = number of passes. */
3281 leal 144(%esp),%edx
3282 movl $2,%ecx
3283 .L018square:
/* Save the current limb vectors at 0..64(%esp). */
3284 vmovdqa %xmm0,(%esp)
3285 vmovdqa %xmm1,16(%esp)
3286 vmovdqa %xmm2,32(%esp)
3287 vmovdqa %xmm3,48(%esp)
3288 vmovdqa %xmm4,64(%esp)
/* 5*limb1..5*limb4 (x*4 + x) saved at 80..128(%esp). */
3289 vpslld $2,%xmm1,%xmm6
3290 vpslld $2,%xmm2,%xmm5
3291 vpaddd %xmm1,%xmm6,%xmm6
3292 vpaddd %xmm2,%xmm5,%xmm5
3293 vmovdqa %xmm6,80(%esp)
3294 vmovdqa %xmm5,96(%esp)
3295 vpslld $2,%xmm3,%xmm6
3296 vpslld $2,%xmm4,%xmm5
3297 vpaddd %xmm3,%xmm6,%xmm6
3298 vpaddd %xmm4,%xmm5,%xmm5
3299 vmovdqa %xmm6,112(%esp)
3300 vmovdqa %xmm5,128(%esp)
/*
 * Broadcast the low qword of every limb to both qwords (shuffle 68) and
 * save the broadcast copies at 0..64(%edx).  %xmm6 keeps the
 * un-broadcast limb 1.
 */
3301 vpshufd $68,%xmm0,%xmm5
3302 vmovdqa %xmm1,%xmm6
3303 vpshufd $68,%xmm1,%xmm1
3304 vpshufd $68,%xmm2,%xmm2
3305 vpshufd $68,%xmm3,%xmm3
3306 vpshufd $68,%xmm4,%xmm4
3307 vmovdqa %xmm5,(%edx)
3308 vmovdqa %xmm1,16(%edx)
3309 vmovdqa %xmm2,32(%edx)
3310 vmovdqa %xmm3,48(%edx)
3311 vmovdqa %xmm4,64(%edx)
/*
 * Schoolbook multiply of the broadcast value (at (%edx)) by the per-lane
 * value (saved at (%esp)), with the 5x copies supplying the terms that
 * wrap around 2^130.  Accumulators d0..d4 are %xmm0..%xmm4.
 * First: broadcast limbs 4..0 times lane limb 0.
 */
3312 vpmuludq %xmm0,%xmm4,%xmm4
3313 vpmuludq %xmm0,%xmm3,%xmm3
3314 vpmuludq %xmm0,%xmm2,%xmm2
3315 vpmuludq %xmm0,%xmm1,%xmm1
3316 vpmuludq %xmm0,%xmm5,%xmm0
/* Lane limb 1 (%xmm6) times broadcast limbs 3,2,1,0; 5*limb1 times limb 4. */
3317 vpmuludq 48(%edx),%xmm6,%xmm5
3318 vpaddq %xmm5,%xmm4,%xmm4
3319 vpmuludq 32(%edx),%xmm6,%xmm7
3320 vpaddq %xmm7,%xmm3,%xmm3
3321 vpmuludq 16(%edx),%xmm6,%xmm5
3322 vpaddq %xmm5,%xmm2,%xmm2
3323 vmovdqa 80(%esp),%xmm7
3324 vpmuludq (%edx),%xmm6,%xmm6
3325 vpaddq %xmm6,%xmm1,%xmm1
3326 vmovdqa 32(%esp),%xmm5
3327 vpmuludq 64(%edx),%xmm7,%xmm7
3328 vpaddq %xmm7,%xmm0,%xmm0
/* Lane limb 2 times broadcast limbs 2,1,0; 5*limb2 times limbs 4,3. */
3329 vpmuludq 32(%edx),%xmm5,%xmm6
3330 vpaddq %xmm6,%xmm4,%xmm4
3331 vpmuludq 16(%edx),%xmm5,%xmm7
3332 vpaddq %xmm7,%xmm3,%xmm3
3333 vmovdqa 96(%esp),%xmm6
3334 vpmuludq (%edx),%xmm5,%xmm5
3335 vpaddq %xmm5,%xmm2,%xmm2
3336 vpmuludq 64(%edx),%xmm6,%xmm7
3337 vpaddq %xmm7,%xmm1,%xmm1
3338 vmovdqa 48(%esp),%xmm5
3339 vpmuludq 48(%edx),%xmm6,%xmm6
3340 vpaddq %xmm6,%xmm0,%xmm0
/* Lane limb 3 times broadcast limbs 1,0; 5*limb3 times limbs 4,3,2. */
3341 vpmuludq 16(%edx),%xmm5,%xmm7
3342 vpaddq %xmm7,%xmm4,%xmm4
3343 vmovdqa 112(%esp),%xmm6
3344 vpmuludq (%edx),%xmm5,%xmm5
3345 vpaddq %xmm5,%xmm3,%xmm3
3346 vpmuludq 64(%edx),%xmm6,%xmm7
3347 vpaddq %xmm7,%xmm2,%xmm2
3348 vpmuludq 48(%edx),%xmm6,%xmm5
3349 vpaddq %xmm5,%xmm1,%xmm1
3350 vmovdqa 64(%esp),%xmm7
3351 vpmuludq 32(%edx),%xmm6,%xmm6
3352 vpaddq %xmm6,%xmm0,%xmm0
/* Lane limb 4 times broadcast limb 0; 5*limb4 times limbs 4,1,2,3. */
3353 vmovdqa 128(%esp),%xmm5
3354 vpmuludq (%edx),%xmm7,%xmm7
3355 vpaddq %xmm7,%xmm4,%xmm4
3356 vpmuludq 64(%edx),%xmm5,%xmm6
3357 vpaddq %xmm6,%xmm3,%xmm3
3358 vpmuludq 16(%edx),%xmm5,%xmm7
3359 vpaddq %xmm7,%xmm0,%xmm0
3360 vpmuludq 32(%edx),%xmm5,%xmm6
3361 vpaddq %xmm6,%xmm1,%xmm1
/* Reload the 26-bit mask; %xmm7 was used as a temporary above. */
3362 vmovdqa 64(%ebx),%xmm7
3363 vpmuludq 48(%edx),%xmm5,%xmm5
3364 vpaddq %xmm5,%xmm2,%xmm2
/*
 * Carry propagation back to 26-bit limbs, two interleaved chains:
 * d3->d4->d0 (the carry out of d4 is added to d0 five times: once plus
 * the copy shifted left by 2) and d0->d1->d2->d3, then one more short
 * step d0->d1 and d3->d4.
 */
3365 vpsrlq $26,%xmm3,%xmm5
3366 vpand %xmm7,%xmm3,%xmm3
3367 vpsrlq $26,%xmm0,%xmm6
3368 vpand %xmm7,%xmm0,%xmm0
3369 vpaddq %xmm5,%xmm4,%xmm4
3370 vpaddq %xmm6,%xmm1,%xmm1
3371 vpsrlq $26,%xmm4,%xmm5
3372 vpand %xmm7,%xmm4,%xmm4
3373 vpsrlq $26,%xmm1,%xmm6
3374 vpand %xmm7,%xmm1,%xmm1
3375 vpaddq %xmm6,%xmm2,%xmm2
3376 vpaddd %xmm5,%xmm0,%xmm0
3377 vpsllq $2,%xmm5,%xmm5
3378 vpsrlq $26,%xmm2,%xmm6
3379 vpand %xmm7,%xmm2,%xmm2
3380 vpaddd %xmm5,%xmm0,%xmm0
3381 vpaddd %xmm6,%xmm3,%xmm3
3382 vpsrlq $26,%xmm3,%xmm6
3383 vpsrlq $26,%xmm0,%xmm5
3384 vpand %xmm7,%xmm0,%xmm0
3385 vpand %xmm7,%xmm3,%xmm3
3386 vpaddd %xmm5,%xmm1,%xmm1
3387 vpaddd %xmm6,%xmm4,%xmm4
3388 decl %ecx
3389 jz .L019square_break
/*
 * After the first pass: pair the new low qword with the low qword of the
 * value saved at (%esp), giving [new, previous] per limb, and go round
 * once more.
 */
3390 vpunpcklqdq (%esp),%xmm0,%xmm0
3391 vpunpcklqdq 16(%esp),%xmm1,%xmm1
3392 vpunpcklqdq 32(%esp),%xmm2,%xmm2
3393 vpunpcklqdq 48(%esp),%xmm3,%xmm3
3394 vpunpcklqdq 64(%esp),%xmm4,%xmm4
3395 jmp .L018square
3396 .L019square_break:
/*
 * Move the second-pass results into the odd dwords, merge with the
 * first-pass pair saved at (%esp) in the even dwords, then reorder the
 * four dwords with shuffle 141 (selects source dwords 1,3,0,2).
 */
3397 vpsllq $32,%xmm0,%xmm0
3398 vpsllq $32,%xmm1,%xmm1
3399 vpsllq $32,%xmm2,%xmm2
3400 vpsllq $32,%xmm3,%xmm3
3401 vpsllq $32,%xmm4,%xmm4
3402 vpor (%esp),%xmm0,%xmm0
3403 vpor 16(%esp),%xmm1,%xmm1
3404 vpor 32(%esp),%xmm2,%xmm2
3405 vpor 48(%esp),%xmm3,%xmm3
3406 vpor 64(%esp),%xmm4,%xmm4
3407 vpshufd $141,%xmm0,%xmm0
3408 vpshufd $141,%xmm1,%xmm1
3409 vpshufd $141,%xmm2,%xmm2
3410 vpshufd $141,%xmm3,%xmm3
3411 vpshufd $141,%xmm4,%xmm4
/* Store limbs 0..4 of the table (state+48 .. state+127). */
3412 vmovdqu %xmm0,(%edi)
3413 vmovdqu %xmm1,16(%edi)
3414 vmovdqu %xmm2,32(%edi)
3415 vmovdqu %xmm3,48(%edi)
3416 vmovdqu %xmm4,64(%edi)
/* Store 5*limb1 .. 5*limb4 right after them (state+128 .. state+191). */
3417 vpslld $2,%xmm1,%xmm6
3418 vpslld $2,%xmm2,%xmm5
3419 vpaddd %xmm1,%xmm6,%xmm6
3420 vpaddd %xmm2,%xmm5,%xmm5
3421 vmovdqu %xmm6,80(%edi)
3422 vmovdqu %xmm5,96(%edi)
3423 vpslld $2,%xmm3,%xmm6
3424 vpslld $2,%xmm4,%xmm5
3425 vpaddd %xmm3,%xmm6,%xmm6
3426 vpaddd %xmm4,%xmm5,%xmm5
3427 vmovdqu %xmm6,112(%edi)
3428 vmovdqu %xmm5,128(%edi)
/* Drop the scratch frame and undo the +48 applied to %edi on entry. */
3429 movl %ebp,%esp
3430 leal -48(%edi),%edi
3431 ret
3432 .size _poly1305_init_avx2,.-_poly1305_init_avx2
/*
 * _poly1305_blocks_avx2
 *
 * Stack arguments, as read after the four register pushes below:
 *   20(%esp)  state pointer      24(%esp)  input pointer
 *   28(%esp)  length in bytes    32(%esp)  pad-bit flag
 *
 * The length is rounded down to a multiple of 16.  Inputs shorter than 64
 * bytes whose state flag (state+20) is still zero are handed to
 * .Lenter_blocks inside poly1305_blocks; everything else is processed
 * here, four 16-byte blocks per iteration, on 26-bit limbs.
 *
 * Register roles in the vector part:
 *   %ymm0-%ymm4  accumulator limbs 0..4 / product accumulators d0..d4
 *   %ymm5,%ymm6  incoming data, later multiply temporaries
 *   %ymm7        0x3ffffff limb mask (also a temporary; reloaded from
 *                64(%ebx) when needed)
 *   %ebx         .Lconst_sse2, possibly offset to pick a pad-bit window
 *   %edx         stack copy of the multiplier table, biased by +128
 *   %ebp         saved %esp
 * NOTE(review): the lane/power layout described in the comments below is
 * derived from the shuffles visible in this file; confirm against the
 * generator before relying on it.
 */
3433 .align 32
3434 .type _poly1305_blocks_avx2,@function
3435 .align 16
3436 _poly1305_blocks_avx2:
3437 pushl %ebp
3438 pushl %ebx
3439 pushl %esi
3440 pushl %edi
3441 movl 20(%esp),%edi
3442 movl 24(%esp),%esi
3443 movl 28(%esp),%ecx
/* %eax = state flag: non-zero once the hash is held as 26-bit limbs. */
3444 movl 20(%edi),%eax
/* Only whole 16-byte blocks are consumed; nothing to do if none. */
3445 andl $-16,%ecx
3446 jz .L020nodata
/*
 * len >= 64: vector path.  len < 64 with the flag clear: fall back to the
 * code at .Lenter_blocks (same register/stack state as at that label in
 * poly1305_blocks).  len < 64 with the flag set: vector path.
 */
3447 cmpl $64,%ecx
3448 jae .L021enter_avx2
3449 testl %eax,%eax
3450 jz .Lenter_blocks
3451 .L021enter_avx2:
3452 vzeroupper
/* Position-independent address of .Lconst_sse2 via call/pop. */
3453 call .L022pic_point
3454 .L022pic_point:
3455 popl %ebx
3456 leal .Lconst_sse2-.L022pic_point(%ebx),%ebx
3457 testl %eax,%eax
3458 jnz .L023base2_26
/* First use: build the multiplier table at state+48. */
3459 call _poly1305_init_avx2
/*
 * Convert the hash at state+0.. to five 26-bit limbs using overlapping
 * unaligned dword loads at byte offsets 0,3,6,9,13 (bits 0,24,48,72,104)
 * shifted right by 0,2,4,6,0.  The last two values are stored without
 * masking.  Then set the flag at state+20.
 */
3460 movl (%edi),%eax
3461 movl 3(%edi),%ecx
3462 movl 6(%edi),%edx
3463 movl 9(%edi),%esi
3464 movl 13(%edi),%ebp
3465 shrl $2,%ecx
3466 andl $67108863,%eax
3467 shrl $4,%edx
3468 andl $67108863,%ecx
3469 shrl $6,%esi
3470 andl $67108863,%edx
3471 movl %eax,(%edi)
3472 movl %ecx,4(%edi)
3473 movl %edx,8(%edi)
3474 movl %esi,12(%edi)
3475 movl %ebp,16(%edi)
3476 movl $1,20(%edi)
/* %esi and %ecx were used as scratch above: reload input pointer/length. */
3477 movl 24(%esp),%esi
3478 movl 28(%esp),%ecx
3479 .L023base2_26:
/* %eax = pad-bit flag (4th argument); %ebp keeps the caller's %esp. */
3480 movl 32(%esp),%eax
3481 movl %esp,%ebp
/*
 * 448-byte frame aligned down to 512 bytes: 0..159(%esp) is scratch for
 * five 32-byte vectors, 160..447(%esp) holds nine 32-byte table entries.
 * Presumably the 512-byte alignment keeps the misaligned table reads of
 * the tail (offsets +4/+8/+16/+24) inside one page -- TODO confirm.
 */
3482 subl $448,%esp
3483 andl $-512,%esp
/*
 * Expand the table from state+48 onto the stack.  Each 16-byte entry
 * (dwords a,b,c,d) becomes the 32-byte pattern a,a,a,b,a,c,a,d
 * (vpermq 64 then vpshufd 200).  Even dwords, which are the ones
 * vpmuludq reads at the aligned offsets, all hold a; the odd dwords hold
 * a,b,c,d and are what the tail reads at offset +4.
 * %edx = frame+288, i.e. table base + 128, so entry i is at 32*i-128(%edx).
 */
3484 vmovdqu 48(%edi),%xmm0
3485 leal 288(%esp),%edx
3486 vmovdqu 64(%edi),%xmm1
3487 vmovdqu 80(%edi),%xmm2
3488 vmovdqu 96(%edi),%xmm3
3489 vmovdqu 112(%edi),%xmm4
/* From here on %edi = state+48; the hash is at -48..-32(%edi). */
3490 leal 48(%edi),%edi
3491 vpermq $64,%ymm0,%ymm0
3492 vpermq $64,%ymm1,%ymm1
3493 vpermq $64,%ymm2,%ymm2
3494 vpermq $64,%ymm3,%ymm3
3495 vpermq $64,%ymm4,%ymm4
3496 vpshufd $200,%ymm0,%ymm0
3497 vpshufd $200,%ymm1,%ymm1
3498 vpshufd $200,%ymm2,%ymm2
3499 vpshufd $200,%ymm3,%ymm3
3500 vpshufd $200,%ymm4,%ymm4
/* Entries 0..4 (limbs 0..4) stored; entries 5..8 (5*limb1..4) loaded. */
3501 vmovdqa %ymm0,-128(%edx)
3502 vmovdqu 80(%edi),%xmm0
3503 vmovdqa %ymm1,-96(%edx)
3504 vmovdqu 96(%edi),%xmm1
3505 vmovdqa %ymm2,-64(%edx)
3506 vmovdqu 112(%edi),%xmm2
3507 vmovdqa %ymm3,-32(%edx)
3508 vmovdqu 128(%edi),%xmm3
3509 vmovdqa %ymm4,(%edx)
3510 vpermq $64,%ymm0,%ymm0
3511 vpermq $64,%ymm1,%ymm1
3512 vpermq $64,%ymm2,%ymm2
3513 vpermq $64,%ymm3,%ymm3
3514 vpshufd $200,%ymm0,%ymm0
3515 vpshufd $200,%ymm1,%ymm1
3516 vpshufd $200,%ymm2,%ymm2
3517 vpshufd $200,%ymm3,%ymm3
/* Store entries 5..8 while loading the five hash limbs into %xmm0-4. */
3518 vmovdqa %ymm0,32(%edx)
3519 vmovd -48(%edi),%xmm0
3520 vmovdqa %ymm1,64(%edx)
3521 vmovd -44(%edi),%xmm1
3522 vmovdqa %ymm2,96(%edx)
3523 vmovd -40(%edi),%xmm2
3524 vmovdqa %ymm3,128(%edx)
3525 vmovd -36(%edi),%xmm3
3526 vmovd -32(%edi),%xmm4
3527 vmovdqa 64(%ebx),%ymm7
/* %eax = -(pad flag); only used to index the pad constants in .L025one. */
3528 negl %eax
/* Length a multiple of 64: straight to the 4-block path. */
3529 testl $63,%ecx
3530 jz .L024even
/*
 * Otherwise process the leading 1..3 blocks first through the tail code:
 * %edx = len mod 64 (16, 32 or 48), %ecx = remaining multiple of 64.
 */
3531 movl %ecx,%edx
3532 andl $-64,%ecx
3533 andl $63,%edx
3534 vmovdqu (%esi),%xmm5
3535 cmpl $32,%edx
3536 jb .L025one
/* vmovdqu leaves the flags from the cmpl above intact for the je. */
3537 vmovdqu 16(%esi),%xmm6
3538 je .L026two
/*
 * Three blocks: third block goes to the high half of %ymm5.  %ebx+8
 * selects a window whose first three qwords carry the 2^24 pad word and
 * whose fourth is zero; %edx+8 shifts the table window by two dwords.
 */
3539 vinserti128 $1,32(%esi),%ymm5,%ymm5
3540 leal 48(%esi),%esi
3541 leal 8(%ebx),%ebx
3542 leal 296(%esp),%edx
3543 jmp .L027tail
3544 .L026two:
/* Two blocks: pad word in the first two qwords only; table window +16. */
3545 leal 32(%esi),%esi
3546 leal 16(%ebx),%ebx
3547 leal 304(%esp),%edx
3548 jmp .L027tail
3549 .L025one:
/*
 * One block: second data register cleared.  %ebx+32+8*%eax is
 * .Lconst_sse2+24 when the pad flag is 1 (pad word in the first qword
 * only) and +32 when it is 0 (all zero); table window +24.
 */
3550 leal 16(%esi),%esi
3551 vpxor %ymm6,%ymm6,%ymm6
3552 leal 32(%ebx,%eax,8),%ebx
3553 leal 312(%esp),%edx
3554 jmp .L027tail
3555 .align 32
3556 .L024even:
/* Load four consecutive blocks: %ymm5 = {blk0, blk2}, %ymm6 = {blk1, blk3}. */
3557 vmovdqu (%esi),%xmm5
3558 vmovdqu 16(%esi),%xmm6
3559 vinserti128 $1,32(%esi),%ymm5,%ymm5
3560 vinserti128 $1,48(%esi),%ymm6,%ymm6
3561 leal 64(%esi),%esi
/* If these are the last four blocks, finish them in the tail. */
3562 subl $64,%ecx
3563 jz .L027tail
3564 .L028loop:
/*
 * Park accumulator limbs 2,0,1 at 64/0/32(%esp) and split the four
 * loaded blocks into 26-bit limbs, one block per qword lane:
 *   limb0 -> %ymm5, limb1 -> %ymm6 (>>26), limb2 -> %ymm2 (bit 52),
 *   limb3 -> %ymm0 (bit 78), limb4 -> %ymm1 (high qword >> 40).
 */
3565 vmovdqa %ymm2,64(%esp)
3566 vpsrldq $6,%ymm5,%ymm2
3567 vmovdqa %ymm0,(%esp)
3568 vpsrldq $6,%ymm6,%ymm0
3569 vmovdqa %ymm1,32(%esp)
3570 vpunpckhqdq %ymm6,%ymm5,%ymm1
3571 vpunpcklqdq %ymm6,%ymm5,%ymm5
3572 vpunpcklqdq %ymm0,%ymm2,%ymm2
3573 vpsrlq $30,%ymm2,%ymm0
3574 vpsrlq $4,%ymm2,%ymm2
3575 vpsrlq $26,%ymm5,%ymm6
3576 vpsrlq $40,%ymm1,%ymm1
3577 vpand %ymm7,%ymm2,%ymm2
3578 vpand %ymm7,%ymm5,%ymm5
3579 vpand %ymm7,%ymm6,%ymm6
3580 vpand %ymm7,%ymm0,%ymm0
/* OR 2^24 into limb 4 of every lane (bit 128 of each block). */
3581 vpor (%ebx),%ymm1,%ymm1
/* Add the accumulator: h2=%ymm2 h0=%ymm5 h1=%ymm6 h3=%ymm0 h4=%ymm1. */
3582 vpaddq 64(%esp),%ymm2,%ymm2
3583 vpaddq (%esp),%ymm5,%ymm5
3584 vpaddq 32(%esp),%ymm6,%ymm6
3585 vpaddq %ymm3,%ymm0,%ymm0
3586 vpaddq %ymm4,%ymm1,%ymm1
/*
 * Multiply by the table (r_i at 32*i-128(%edx), 5*r_1..5*r_4 at
 * 32..128(%edx)); d0..d4 accumulate in %ymm0..%ymm4.
 * h2 * {r1,r2,5r3,5r4,r0} start d3,d4,d0,d1,d2 while h1,h3,h4 are
 * spilled to 32/96/128(%esp).
 */
3587 vpmuludq -96(%edx),%ymm2,%ymm3
3588 vmovdqa %ymm6,32(%esp)
3589 vpmuludq -64(%edx),%ymm2,%ymm4
3590 vmovdqa %ymm0,96(%esp)
3591 vpmuludq 96(%edx),%ymm2,%ymm0
3592 vmovdqa %ymm1,128(%esp)
3593 vpmuludq 128(%edx),%ymm2,%ymm1
3594 vpmuludq -128(%edx),%ymm2,%ymm2
/* h0 * {r3,r4,r0,r1,r2} -> d3,d4,d0,d1,d2. */
3595 vpmuludq -32(%edx),%ymm5,%ymm7
3596 vpaddq %ymm7,%ymm3,%ymm3
3597 vpmuludq (%edx),%ymm5,%ymm6
3598 vpaddq %ymm6,%ymm4,%ymm4
3599 vpmuludq -128(%edx),%ymm5,%ymm7
3600 vpaddq %ymm7,%ymm0,%ymm0
3601 vmovdqa 32(%esp),%ymm7
3602 vpmuludq -96(%edx),%ymm5,%ymm6
3603 vpaddq %ymm6,%ymm1,%ymm1
3604 vpmuludq -64(%edx),%ymm5,%ymm5
3605 vpaddq %ymm5,%ymm2,%ymm2
/* h1 (%ymm7) * {r2,r3,5r4,r0,r1} -> d3,d4,d0,d1,d2. */
3606 vpmuludq -64(%edx),%ymm7,%ymm6
3607 vpaddq %ymm6,%ymm3,%ymm3
3608 vpmuludq -32(%edx),%ymm7,%ymm5
3609 vpaddq %ymm5,%ymm4,%ymm4
3610 vpmuludq 128(%edx),%ymm7,%ymm6
3611 vpaddq %ymm6,%ymm0,%ymm0
3612 vmovdqa 96(%esp),%ymm6
3613 vpmuludq -128(%edx),%ymm7,%ymm5
3614 vpaddq %ymm5,%ymm1,%ymm1
3615 vpmuludq -96(%edx),%ymm7,%ymm7
3616 vpaddq %ymm7,%ymm2,%ymm2
/* h3 (%ymm6) * {r0,r1,5r2,5r3,5r4} -> d3,d4,d0,d1,d2. */
3617 vpmuludq -128(%edx),%ymm6,%ymm5
3618 vpaddq %ymm5,%ymm3,%ymm3
3619 vpmuludq -96(%edx),%ymm6,%ymm7
3620 vpaddq %ymm7,%ymm4,%ymm4
3621 vpmuludq 64(%edx),%ymm6,%ymm5
3622 vpaddq %ymm5,%ymm0,%ymm0
3623 vmovdqa 128(%esp),%ymm5
3624 vpmuludq 96(%edx),%ymm6,%ymm7
3625 vpaddq %ymm7,%ymm1,%ymm1
3626 vpmuludq 128(%edx),%ymm6,%ymm6
3627 vpaddq %ymm6,%ymm2,%ymm2
/* h4 (%ymm5) * {5r4,5r1,r0,5r2,5r3} -> d3,d0,d4,d1,d2; mask reloaded. */
3628 vpmuludq 128(%edx),%ymm5,%ymm7
3629 vpaddq %ymm7,%ymm3,%ymm3
3630 vpmuludq 32(%edx),%ymm5,%ymm6
3631 vpaddq %ymm6,%ymm0,%ymm0
3632 vpmuludq -128(%edx),%ymm5,%ymm7
3633 vpaddq %ymm7,%ymm4,%ymm4
3634 vmovdqa 64(%ebx),%ymm7
3635 vpmuludq 64(%edx),%ymm5,%ymm6
3636 vpaddq %ymm6,%ymm1,%ymm1
3637 vpmuludq 96(%edx),%ymm5,%ymm5
3638 vpaddq %ymm5,%ymm2,%ymm2
/*
 * Carry propagation back to 26-bit limbs: d3->d4->d0 (carry out of d4 is
 * added to d0 five times: once plus the copy shifted left by 2)
 * interleaved with d0->d1->d2->d3, then d0->d1 and d3->d4 once more.
 */
3639 vpsrlq $26,%ymm3,%ymm5
3640 vpand %ymm7,%ymm3,%ymm3
3641 vpsrlq $26,%ymm0,%ymm6
3642 vpand %ymm7,%ymm0,%ymm0
3643 vpaddq %ymm5,%ymm4,%ymm4
3644 vpaddq %ymm6,%ymm1,%ymm1
3645 vpsrlq $26,%ymm4,%ymm5
3646 vpand %ymm7,%ymm4,%ymm4
3647 vpsrlq $26,%ymm1,%ymm6
3648 vpand %ymm7,%ymm1,%ymm1
3649 vpaddq %ymm6,%ymm2,%ymm2
3650 vpaddq %ymm5,%ymm0,%ymm0
3651 vpsllq $2,%ymm5,%ymm5
3652 vpsrlq $26,%ymm2,%ymm6
3653 vpand %ymm7,%ymm2,%ymm2
3654 vpaddq %ymm5,%ymm0,%ymm0
3655 vpaddq %ymm6,%ymm3,%ymm3
3656 vpsrlq $26,%ymm3,%ymm6
3657 vpsrlq $26,%ymm0,%ymm5
3658 vpand %ymm7,%ymm0,%ymm0
3659 vpand %ymm7,%ymm3,%ymm3
3660 vpaddq %ymm5,%ymm1,%ymm1
3661 vpaddq %ymm6,%ymm4,%ymm4
/* Fetch the next four blocks; loop while more than these remain. */
3662 vmovdqu (%esi),%xmm5
3663 vmovdqu 16(%esi),%xmm6
3664 vinserti128 $1,32(%esi),%ymm5,%ymm5
3665 vinserti128 $1,48(%esi),%ymm6,%ymm6
3666 leal 64(%esi),%esi
3667 subl $64,%ecx
3668 jnz .L028loop
3669 .L027tail:
/*
 * Same limb split and multiply as the loop body, except that every table
 * operand is read 4 bytes higher, i.e. from the odd dwords of the
 * expanded entries, so each lane gets its own multiplier.  For the
 * 1..3-block entries %edx was additionally advanced by 24/16/8.
 */
3670 vmovdqa %ymm2,64(%esp)
3671 vpsrldq $6,%ymm5,%ymm2
3672 vmovdqa %ymm0,(%esp)
3673 vpsrldq $6,%ymm6,%ymm0
3674 vmovdqa %ymm1,32(%esp)
3675 vpunpckhqdq %ymm6,%ymm5,%ymm1
3676 vpunpcklqdq %ymm6,%ymm5,%ymm5
3677 vpunpcklqdq %ymm0,%ymm2,%ymm2
3678 vpsrlq $30,%ymm2,%ymm0
3679 vpsrlq $4,%ymm2,%ymm2
3680 vpsrlq $26,%ymm5,%ymm6
3681 vpsrlq $40,%ymm1,%ymm1
3682 vpand %ymm7,%ymm2,%ymm2
3683 vpand %ymm7,%ymm5,%ymm5
3684 vpand %ymm7,%ymm6,%ymm6
3685 vpand %ymm7,%ymm0,%ymm0
3686 vpor (%ebx),%ymm1,%ymm1
/*
 * Undo any +8/+16/+24/+32 applied to %ebx above: .Lconst_sse2 is
 * 64-byte aligned, so masking restores the base address.
 */
3687 andl $-64,%ebx
3688 vpaddq 64(%esp),%ymm2,%ymm2
3689 vpaddq (%esp),%ymm5,%ymm5
3690 vpaddq 32(%esp),%ymm6,%ymm6
3691 vpaddq %ymm3,%ymm0,%ymm0
3692 vpaddq %ymm4,%ymm1,%ymm1
/* h2 products; h1,h3,h4 spilled to 32/96/128(%esp). */
3693 vpmuludq -92(%edx),%ymm2,%ymm3
3694 vmovdqa %ymm6,32(%esp)
3695 vpmuludq -60(%edx),%ymm2,%ymm4
3696 vmovdqa %ymm0,96(%esp)
3697 vpmuludq 100(%edx),%ymm2,%ymm0
3698 vmovdqa %ymm1,128(%esp)
3699 vpmuludq 132(%edx),%ymm2,%ymm1
3700 vpmuludq -124(%edx),%ymm2,%ymm2
/* h0 products. */
3701 vpmuludq -28(%edx),%ymm5,%ymm7
3702 vpaddq %ymm7,%ymm3,%ymm3
3703 vpmuludq 4(%edx),%ymm5,%ymm6
3704 vpaddq %ymm6,%ymm4,%ymm4
3705 vpmuludq -124(%edx),%ymm5,%ymm7
3706 vpaddq %ymm7,%ymm0,%ymm0
3707 vmovdqa 32(%esp),%ymm7
3708 vpmuludq -92(%edx),%ymm5,%ymm6
3709 vpaddq %ymm6,%ymm1,%ymm1
3710 vpmuludq -60(%edx),%ymm5,%ymm5
3711 vpaddq %ymm5,%ymm2,%ymm2
/* h1 products. */
3712 vpmuludq -60(%edx),%ymm7,%ymm6
3713 vpaddq %ymm6,%ymm3,%ymm3
3714 vpmuludq -28(%edx),%ymm7,%ymm5
3715 vpaddq %ymm5,%ymm4,%ymm4
3716 vpmuludq 132(%edx),%ymm7,%ymm6
3717 vpaddq %ymm6,%ymm0,%ymm0
3718 vmovdqa 96(%esp),%ymm6
3719 vpmuludq -124(%edx),%ymm7,%ymm5
3720 vpaddq %ymm5,%ymm1,%ymm1
3721 vpmuludq -92(%edx),%ymm7,%ymm7
3722 vpaddq %ymm7,%ymm2,%ymm2
/* h3 products. */
3723 vpmuludq -124(%edx),%ymm6,%ymm5
3724 vpaddq %ymm5,%ymm3,%ymm3
3725 vpmuludq -92(%edx),%ymm6,%ymm7
3726 vpaddq %ymm7,%ymm4,%ymm4
3727 vpmuludq 68(%edx),%ymm6,%ymm5
3728 vpaddq %ymm5,%ymm0,%ymm0
3729 vmovdqa 128(%esp),%ymm5
3730 vpmuludq 100(%edx),%ymm6,%ymm7
3731 vpaddq %ymm7,%ymm1,%ymm1
3732 vpmuludq 132(%edx),%ymm6,%ymm6
3733 vpaddq %ymm6,%ymm2,%ymm2
/* h4 products; mask reloaded into %ymm7. */
3734 vpmuludq 132(%edx),%ymm5,%ymm7
3735 vpaddq %ymm7,%ymm3,%ymm3
3736 vpmuludq 36(%edx),%ymm5,%ymm6
3737 vpaddq %ymm6,%ymm0,%ymm0
3738 vpmuludq -124(%edx),%ymm5,%ymm7
3739 vpaddq %ymm7,%ymm4,%ymm4
3740 vmovdqa 64(%ebx),%ymm7
3741 vpmuludq 68(%edx),%ymm5,%ymm6
3742 vpaddq %ymm6,%ymm1,%ymm1
3743 vpmuludq 100(%edx),%ymm5,%ymm5
3744 vpaddq %ymm5,%ymm2,%ymm2
/*
 * Horizontal sum of the four qword lanes of each accumulator: first add
 * the upper qword of each 128-bit half onto the lower one, then add
 * qword 2 onto qword 0 (vpermq 2).  The total ends up in qword 0.
 */
3745 vpsrldq $8,%ymm4,%ymm5
3746 vpsrldq $8,%ymm3,%ymm6
3747 vpaddq %ymm5,%ymm4,%ymm4
3748 vpsrldq $8,%ymm0,%ymm5
3749 vpaddq %ymm6,%ymm3,%ymm3
3750 vpsrldq $8,%ymm1,%ymm6
3751 vpaddq %ymm5,%ymm0,%ymm0
3752 vpsrldq $8,%ymm2,%ymm5
3753 vpaddq %ymm6,%ymm1,%ymm1
3754 vpermq $2,%ymm4,%ymm6
3755 vpaddq %ymm5,%ymm2,%ymm2
3756 vpermq $2,%ymm3,%ymm5
3757 vpaddq %ymm6,%ymm4,%ymm4
3758 vpermq $2,%ymm0,%ymm6
3759 vpaddq %ymm5,%ymm3,%ymm3
3760 vpermq $2,%ymm1,%ymm5
3761 vpaddq %ymm6,%ymm0,%ymm0
3762 vpermq $2,%ymm2,%ymm6
3763 vpaddq %ymm5,%ymm1,%ymm1
3764 vpaddq %ymm6,%ymm2,%ymm2
/* Carry propagation, same sequence as at the end of the loop body. */
3765 vpsrlq $26,%ymm3,%ymm5
3766 vpand %ymm7,%ymm3,%ymm3
3767 vpsrlq $26,%ymm0,%ymm6
3768 vpand %ymm7,%ymm0,%ymm0
3769 vpaddq %ymm5,%ymm4,%ymm4
3770 vpaddq %ymm6,%ymm1,%ymm1
3771 vpsrlq $26,%ymm4,%ymm5
3772 vpand %ymm7,%ymm4,%ymm4
3773 vpsrlq $26,%ymm1,%ymm6
3774 vpand %ymm7,%ymm1,%ymm1
3775 vpaddq %ymm6,%ymm2,%ymm2
3776 vpaddq %ymm5,%ymm0,%ymm0
3777 vpsllq $2,%ymm5,%ymm5
3778 vpsrlq $26,%ymm2,%ymm6
3779 vpand %ymm7,%ymm2,%ymm2
3780 vpaddq %ymm5,%ymm0,%ymm0
3781 vpaddq %ymm6,%ymm3,%ymm3
3782 vpsrlq $26,%ymm3,%ymm6
3783 vpsrlq $26,%ymm0,%ymm5
3784 vpand %ymm7,%ymm0,%ymm0
3785 vpand %ymm7,%ymm3,%ymm3
3786 vpaddq %ymm5,%ymm1,%ymm1
3787 vpaddq %ymm6,%ymm4,%ymm4
/* %ecx == 0: all input consumed. */
3788 cmpl $0,%ecx
3789 je .L029done
/*
 * More 64-byte groups follow (the tail was entered for a leading partial
 * group).  Shuffle 252 keeps dword 0 of each limb and copies dword 3
 * into dwords 1..3 (expected to be zero after the reduction above --
 * TODO confirm); the VEX xmm form also zeroes the upper half of the ymm
 * register.  %edx is reset to the unshifted table base.
 */
3790 vpshufd $252,%xmm0,%xmm0
3791 leal 288(%esp),%edx
3792 vpshufd $252,%xmm1,%xmm1
3793 vpshufd $252,%xmm2,%xmm2
3794 vpshufd $252,%xmm3,%xmm3
3795 vpshufd $252,%xmm4,%xmm4
3796 jmp .L024even
3797 .align 16
3798 .L029done:
/* Write the five limbs back to state+0..16 (%edi is state+48 here). */
3799 vmovd %xmm0,-48(%edi)
3800 vmovd %xmm1,-44(%edi)
3801 vmovd %xmm2,-40(%edi)
3802 vmovd %xmm3,-36(%edi)
3803 vmovd %xmm4,-32(%edi)
3804 vzeroupper
/* Release the aligned frame. */
3805 movl %ebp,%esp
3806 .L020nodata:
3807 popl %edi
3808 popl %esi
3809 popl %ebx
3810 popl %ebp
3811 ret
3812 .size _poly1305_blocks_avx2,.-_poly1305_blocks_avx2
/*
 * Constant pool shared by the vector code (addressed through %ebx).
 * The 64-byte alignment is relied upon: _poly1305_blocks_avx2 recovers
 * the base address with "andl $-64,%ebx" after offsetting it.
 */
3813 .align 64
3814 .Lconst_sse2:
/* +0:  1<<24 in the low dword of four qwords (OR-ed into limb 4). */
3815 .long 16777216,0,16777216,0,16777216,0,16777216,0
/*
 * +32: zeros.  Reading 32 bytes at +8/+16/+24/+32 therefore yields the
 * 1<<24 word in the first 3/2/1/0 qwords and zero in the rest.
 */
3816 .long 0,0,0,0,0,0,0,0
/* +64: 0x3ffffff, the 26-bit limb mask, in the low dword of four qwords. */
3817 .long 67108863,0,67108863,0,67108863,0,67108863,0
/*
 * +96: 0x0fffffff,0x0ffffffc,0x0ffffffc,0x0ffffffc -- the same values
 * poly1305_init applies to the four key words with andl.
 */
3818 .long 268435455,268435452,268435452,268435452
/* ASCII: "Poly1305 for x86, CRYPTOGAMS by <appro@openssl.org>", NUL-terminated. */
3819 .byte 80,111,108,121,49,51,48,53,32,102,111,114,32,120,56,54
3820 .byte 44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32
3821 .byte 60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111
3822 .byte 114,103,62,0
3823 .align 4
/*
 * 16-byte, 4-byte-aligned common symbol; poly1305_init reads its dwords
 * at offsets 0 and 8 to choose between the scalar, SSE2 and AVX2 routines.
 */
3824 .comm OPENSSL_ia32cap_P,16,4
3825 #endif
Cache object: 52d1e58cbca27ccda0b7df76a3548961
|