The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/crypto/openssl/i386/poly1305-x86.S

Version: -  FREEBSD  -  FREEBSD-13-STABLE  -  FREEBSD-13-0  -  FREEBSD-12-STABLE  -  FREEBSD-12-0  -  FREEBSD-11-STABLE  -  FREEBSD-11-0  -  FREEBSD-10-STABLE  -  FREEBSD-10-0  -  FREEBSD-9-STABLE  -  FREEBSD-9-0  -  FREEBSD-8-STABLE  -  FREEBSD-8-0  -  FREEBSD-7-STABLE  -  FREEBSD-7-0  -  FREEBSD-6-STABLE  -  FREEBSD-6-0  -  FREEBSD-5-STABLE  -  FREEBSD-5-0  -  FREEBSD-4-STABLE  -  FREEBSD-3-STABLE  -  FREEBSD22  -  l41  -  OPENBSD  -  linux-2.6  -  MK84  -  PLAN9  -  xnu-8792 
SearchContext: -  none  -  3  -  10 

    1 /* $FreeBSD$ */
    2 /* Do not modify. This file is auto-generated from poly1305-x86.pl. */
    3 #ifdef PIC
    4 .text
    5 .align  64
    6 .globl  poly1305_init
    7 .type   poly1305_init,@function
    8 .align  16
      /*
       * poly1305_init: clear the accumulator part of the state and, when a key
       * pointer is supplied, store the masked key and hand back the addresses
       * of the block/emit routines to use.
       *
       * Stack arguments (offsets valid after the four register pushes):
       *   20(%esp) -> %edi  state block; bytes 0..23 are zeroed
       *   24(%esp) -> %esi  key pointer; if zero, return right after zeroing
       *   28(%esp) -> %ebp  two-word table that receives the routine addresses
       *
       * Returns %eax = 1 once a key has been stored, %eax = 0 on the no-key
       * path. %ebp, %ebx, %esi and %edi are saved and restored.
       */
    9 poly1305_init:
   10 .L_poly1305_init_begin:
   11         pushl   %ebp
   12         pushl   %ebx
   13         pushl   %esi
   14         pushl   %edi
   15         movl    20(%esp),%edi
   16         movl    24(%esp),%esi
   17         movl    28(%esp),%ebp
   18         xorl    %eax,%eax            /* zero: clears the state and doubles as the no-key return value */
   19         movl    %eax,(%edi)
   20         movl    %eax,4(%edi)
   21         movl    %eax,8(%edi)
   22         movl    %eax,12(%edi)
   23         movl    %eax,16(%edi)
   24         movl    %eax,20(%edi)
   25         cmpl    $0,%esi              /* key pointer absent? */
   26         je      .L000nokey
   27         call    .L001pic_point       /* PIC idiom: push our own address ... */
   28 .L001pic_point:
   29         popl    %ebx                 /* ... %ebx = run-time address of .L001pic_point */
   30         leal    poly1305_blocks-.L001pic_point(%ebx),%eax   /* default: integer-only routines */
   31         leal    poly1305_emit-.L001pic_point(%ebx),%edx
   32         leal    OPENSSL_ia32cap_P-.L001pic_point(%ebx),%edi
   33         movl    (%edi),%ecx          /* capability word 0 */
   34         andl    $83886080,%ecx       /* 0x05000000 = bits 24 and 26 (presumably FXSR and SSE2 -- confirm) */
   35         cmpl    $83886080,%ecx       /* both bits must be set */
   36         jne     .L002no_sse2
   37         leal    _poly1305_blocks_sse2-.L001pic_point(%ebx),%eax   /* upgrade to the SSE2 routines */
   38         leal    _poly1305_emit_sse2-.L001pic_point(%ebx),%edx
   39         movl    8(%edi),%ecx         /* capability word at byte offset 8 */
   40         testl   $32,%ecx             /* bit 5 (presumably AVX2, going by the routine name -- confirm) */
   41         jz      .L002no_sse2
   42         leal    _poly1305_blocks_avx2-.L001pic_point(%ebx),%eax   /* only the blocks routine is replaced */
   43 .L002no_sse2:
   44         movl    20(%esp),%edi        /* reload the state pointer; %edi was reused above */
   45         movl    %eax,(%ebp)          /* table[0] = blocks routine */
   46         movl    %edx,4(%ebp)         /* table[1] = emit routine */
   47         movl    (%esi),%eax          /* load the first 16 key bytes as four words */
   48         movl    4(%esi),%ebx
   49         movl    8(%esi),%ecx
   50         movl    12(%esi),%edx
   51         andl    $268435455,%eax      /* 0x0fffffff: clear the top 4 bits */
   52         andl    $268435452,%ebx      /* 0x0ffffffc: clear the top 4 and bottom 2 bits */
   53         andl    $268435452,%ecx
   54         andl    $268435452,%edx
   55         movl    %eax,24(%edi)        /* masked key words live at state+24..39 */
   56         movl    %ebx,28(%edi)
   57         movl    %ecx,32(%edi)
   58         movl    %edx,36(%edi)
   59         movl    $1,%eax              /* return 1: routine table was filled in */
   60 .L000nokey:
   61         popl    %edi
   62         popl    %esi
   63         popl    %ebx
   64         popl    %ebp
   65         ret
   66 .size   poly1305_init,.-.L_poly1305_init_begin
   67 .globl  poly1305_blocks
   68 .type   poly1305_blocks,@function
   69 .align  16
      /*
       * poly1305_blocks: fold 16-byte input blocks into the five-word
       * accumulator using 32x32->64 integer multiplies.
       *
       * Stack arguments (offsets valid after the four register pushes):
       *   20(%esp) -> %edi  state: h0..h4 at 0..16, r0..r3 at 24..36
       *   24(%esp) -> %esi  input pointer
       *   28(%esp) -> %ecx  input length in bytes
       *   32(%esp)          word added into the top limb for every block
       *                     (read as 96(%esp) once the frame is allocated)
       *
       * .Lenter_blocks is a second entry point: _poly1305_blocks_sse2 jumps
       * here with %edi/%esi/%ecx loaded and the same four registers pushed.
       *
       * 64-byte scratch frame:
       *    0,12,16(%esp)  h0, h3, h4 of the current sum (h1/h2 stay in %ebx/%ecx)
       *   20..28(%esp)    low words d0..d2 of the product
       *   36..48(%esp)    r0..r3
       *   52..60(%esp)    s1..s3, where s_i = r_i + (r_i >> 2)
       *   92(%esp)        end-of-input pointer (reuses the length argument slot)
       */
   70 poly1305_blocks:
   71 .L_poly1305_blocks_begin:
   72         pushl   %ebp
   73         pushl   %ebx
   74         pushl   %esi
   75         pushl   %edi
   76         movl    20(%esp),%edi
   77         movl    24(%esp),%esi
   78         movl    28(%esp),%ecx
   79 .Lenter_blocks:
      /*
       * NOTE(review): the mask below is 0xfffffff1, so bit 0 of the length
       * survives. The loop advances by 16 and exits only on exact pointer
       * equality, so this relies on the length being a multiple of 16.
       * _poly1305_blocks_sse2 masks with $-16 before jumping here. Confirm
       * that every caller of the public entry guarantees this.
       */
   80         andl    $-15,%ecx
   81         jz      .L003nodata
   82         subl    $64,%esp             /* scratch frame; arguments are now 64 bytes further up */
   83         movl    24(%edi),%eax        /* r0 */
   84         movl    28(%edi),%ebx        /* r1 */
   85         leal    (%esi,%ecx,1),%ebp   /* end pointer = input + length */
   86         movl    32(%edi),%ecx        /* r2 */
   87         movl    36(%edi),%edx        /* r3 */
   88         movl    %ebp,92(%esp)        /* keep the end pointer in the length slot */
   89         movl    %esi,%ebp            /* %ebp = running input pointer */
   90         movl    %eax,36(%esp)
   91         movl    %ebx,%eax
   92         shrl    $2,%eax
   93         movl    %ebx,40(%esp)
   94         addl    %ebx,%eax            /* s1 = r1 + (r1 >> 2) */
   95         movl    %ecx,%ebx
   96         shrl    $2,%ebx
   97         movl    %ecx,44(%esp)
   98         addl    %ecx,%ebx            /* s2 = r2 + (r2 >> 2) */
   99         movl    %edx,%ecx
  100         shrl    $2,%ecx
  101         movl    %edx,48(%esp)
  102         addl    %edx,%ecx            /* s3 = r3 + (r3 >> 2) */
  103         movl    %eax,52(%esp)
  104         movl    %ebx,56(%esp)
  105         movl    %ecx,60(%esp)
  106         movl    (%edi),%eax          /* load h0..h4 into %eax,%ebx,%ecx,%esi,%edi */
  107         movl    4(%edi),%ebx
  108         movl    8(%edi),%ecx
  109         movl    12(%edi),%esi
  110         movl    16(%edi),%edi
  111         jmp     .L004loop
  112 .align  32
  113 .L004loop:
  114         addl    (%ebp),%eax          /* h += next 16 input bytes, carry rippling upward */
  115         adcl    4(%ebp),%ebx
  116         adcl    8(%ebp),%ecx
  117         adcl    12(%ebp),%esi
  118         leal    16(%ebp),%ebp        /* advance input; leal leaves the carry flag intact */
  119         adcl    96(%esp),%edi        /* top limb += fourth argument + carry */
  120         movl    %eax,(%esp)
  121         movl    %esi,12(%esp)
  122         mull    36(%esp)             /* d0 = h0*r0 ... */
  123         movl    %edi,16(%esp)
  124         movl    %eax,%edi
  125         movl    %ebx,%eax
  126         movl    %edx,%esi
  127         mull    60(%esp)             /* ... + h1*s3 */
  128         addl    %eax,%edi
  129         movl    %ecx,%eax
  130         adcl    %edx,%esi
  131         mull    56(%esp)             /* ... + h2*s2 */
  132         addl    %eax,%edi
  133         movl    12(%esp),%eax
  134         adcl    %edx,%esi
  135         mull    52(%esp)             /* ... + h3*s1 */
  136         addl    %eax,%edi
  137         movl    (%esp),%eax
  138         adcl    %edx,%esi
  139         mull    40(%esp)             /* d1 = carry + h0*r1 ... */
  140         movl    %edi,20(%esp)        /* save low word of d0 */
  141         xorl    %edi,%edi
  142         addl    %eax,%esi
  143         movl    %ebx,%eax
  144         adcl    %edx,%edi
  145         mull    36(%esp)             /* ... + h1*r0 */
  146         addl    %eax,%esi
  147         movl    %ecx,%eax
  148         adcl    %edx,%edi
  149         mull    60(%esp)             /* ... + h2*s3 */
  150         addl    %eax,%esi
  151         movl    12(%esp),%eax
  152         adcl    %edx,%edi
  153         mull    56(%esp)             /* ... + h3*s2 */
  154         addl    %eax,%esi
  155         movl    16(%esp),%eax
  156         adcl    %edx,%edi
  157         imull   52(%esp),%eax        /* ... + h4*s1 (low 32 bits only) */
  158         addl    %eax,%esi
  159         movl    (%esp),%eax
  160         adcl    $0,%edi
  161         mull    44(%esp)             /* d2 = carry + h0*r2 ... */
  162         movl    %esi,24(%esp)        /* save low word of d1 */
  163         xorl    %esi,%esi
  164         addl    %eax,%edi
  165         movl    %ebx,%eax
  166         adcl    %edx,%esi
  167         mull    40(%esp)             /* ... + h1*r1 */
  168         addl    %eax,%edi
  169         movl    %ecx,%eax
  170         adcl    %edx,%esi
  171         mull    36(%esp)             /* ... + h2*r0 */
  172         addl    %eax,%edi
  173         movl    12(%esp),%eax
  174         adcl    %edx,%esi
  175         mull    60(%esp)             /* ... + h3*s3 */
  176         addl    %eax,%edi
  177         movl    16(%esp),%eax
  178         adcl    %edx,%esi
  179         imull   56(%esp),%eax        /* ... + h4*s2 (low 32 bits only) */
  180         addl    %eax,%edi
  181         movl    (%esp),%eax
  182         adcl    $0,%esi
  183         mull    48(%esp)             /* d3 = carry + h0*r3 ... */
  184         movl    %edi,28(%esp)        /* save low word of d2 */
  185         xorl    %edi,%edi
  186         addl    %eax,%esi
  187         movl    %ebx,%eax
  188         adcl    %edx,%edi
  189         mull    44(%esp)             /* ... + h1*r2 */
  190         addl    %eax,%esi
  191         movl    %ecx,%eax
  192         adcl    %edx,%edi
  193         mull    40(%esp)             /* ... + h2*r1 */
  194         addl    %eax,%esi
  195         movl    12(%esp),%eax
  196         adcl    %edx,%edi
  197         mull    36(%esp)             /* ... + h3*r0 */
  198         addl    %eax,%esi
  199         movl    16(%esp),%ecx
  200         adcl    %edx,%edi
  201         movl    %ecx,%edx
  202         imull   60(%esp),%ecx        /* ... + h4*s3 (low 32 bits only) */
  203         addl    %ecx,%esi
  204         movl    20(%esp),%eax        /* reload d0 */
  205         adcl    $0,%edi
  206         imull   36(%esp),%edx        /* d4 = h4*r0 ... */
  207         addl    %edi,%edx            /* ... + carry out of d3 */
  208         movl    24(%esp),%ebx        /* reload d1 */
  209         movl    28(%esp),%ecx        /* reload d2 */
  210         movl    %edx,%edi
  211         shrl    $2,%edx              /* bits of d4 above bit 1 ... */
  212         andl    $3,%edi              /* new h4 keeps only the low two bits of d4 */
  213         leal    (%edx,%edx,4),%edx   /* ... times 5 ... */
  214         addl    %edx,%eax            /* ... are folded back into the bottom word */
  215         adcl    $0,%ebx
  216         adcl    $0,%ecx
  217         adcl    $0,%esi
  218         adcl    $0,%edi
  219         cmpl    92(%esp),%ebp        /* reached the end pointer? */
  220         jne     .L004loop
  221         movl    84(%esp),%edx        /* first argument (state pointer), read through the frame */
  222         addl    $64,%esp
  223         movl    %eax,(%edx)          /* write h0..h4 back to the state */
  224         movl    %ebx,4(%edx)
  225         movl    %ecx,8(%edx)
  226         movl    %esi,12(%edx)
  227         movl    %edi,16(%edx)
  228 .L003nodata:
  229         popl    %edi
  230         popl    %esi
  231         popl    %ebx
  232         popl    %ebp
  233         ret
  234 .size   poly1305_blocks,.-.L_poly1305_blocks_begin
  235 .globl  poly1305_emit
  236 .type   poly1305_emit,@function
  237 .align  16
      /*
       * poly1305_emit: produce the 16-byte result from the accumulator.
       *
       * Stack arguments (offsets valid after the four register pushes):
       *   20(%esp) -> %ebp  state; h0..h4 are read from 0..16
       *   24(%esp) -> %edi  16-byte output buffer (also used as scratch)
       *   28(%esp) -> %ebp  pointer to four words added to the result
       *
       * Computes h+5 and uses bit 2 and up of its top word to choose, without
       * branching, between h+5 and h; then adds the four words from the third
       * argument with carry and stores the low 128 bits. The carry out of the
       * last addition is discarded.
       *
       * .Lenter_emit is an alternate entry label; no jump to it is visible in
       * this part of the file.
       */
  238 poly1305_emit:
  239 .L_poly1305_emit_begin:
  240         pushl   %ebp
  241         pushl   %ebx
  242         pushl   %esi
  243         pushl   %edi
  244         movl    20(%esp),%ebp
  245 .Lenter_emit:
  246         movl    24(%esp),%edi
  247         movl    (%ebp),%eax
  248         movl    4(%ebp),%ebx
  249         movl    8(%ebp),%ecx
  250         movl    12(%ebp),%edx
  251         movl    16(%ebp),%esi
  252         addl    $5,%eax              /* candidate = h + 5, carry rippling through all five words */
  253         adcl    $0,%ebx
  254         adcl    $0,%ecx
  255         adcl    $0,%edx
  256         adcl    $0,%esi
  257         shrl    $2,%esi              /* keep what lies above bit 1 of the top word */
  258         negl    %esi                 /* NOTE(review): all-ones mask only if the shifted value is exactly 1, i.e. assumes h4 < 4 on entry -- confirm */
  259         andl    %esi,%eax            /* (h + 5) & mask */
  260         andl    %esi,%ebx
  261         andl    %esi,%ecx
  262         andl    %esi,%edx
  263         movl    %eax,(%edi)          /* park the masked candidate in the output buffer */
  264         movl    %ebx,4(%edi)
  265         movl    %ecx,8(%edi)
  266         movl    %edx,12(%edi)
  267         notl    %esi                 /* inverted mask selects the original h instead */
  268         movl    (%ebp),%eax
  269         movl    4(%ebp),%ebx
  270         movl    8(%ebp),%ecx
  271         movl    12(%ebp),%edx
  272         movl    28(%esp),%ebp        /* state pointer no longer needed; %ebp = third argument */
  273         andl    %esi,%eax            /* h & ~mask */
  274         andl    %esi,%ebx
  275         andl    %esi,%ecx
  276         andl    %esi,%edx
  277         orl     (%edi),%eax          /* merge the two masked values: exactly one is non-zero */
  278         orl     4(%edi),%ebx
  279         orl     8(%edi),%ecx
  280         orl     12(%edi),%edx
  281         addl    (%ebp),%eax          /* add the four words from the third argument */
  282         adcl    4(%ebp),%ebx
  283         adcl    8(%ebp),%ecx
  284         adcl    12(%ebp),%edx
  285         movl    %eax,(%edi)          /* final 16-byte result */
  286         movl    %ebx,4(%edi)
  287         movl    %ecx,8(%edi)
  288         movl    %edx,12(%edi)
  289         popl    %edi
  290         popl    %esi
  291         popl    %ebx
  292         popl    %ebp
  293         ret
  294 .size   poly1305_emit,.-.L_poly1305_emit_begin
  295 .align  32
  296 .type   _poly1305_init_sse2,@function
  297 .align  16
      /*
       * _poly1305_init_sse2: file-local helper (no .globl) that fills the
       * table read by the SSE2 block routine.
       *
       * Register interface (this routine takes no stack arguments):
       *   %edi  state pointer; the 16 bytes at 24(%edi) are the value that
       *         gets split into limbs and multiplied by itself
       *   %ebx  base of the constants block; the limb mask is read from
       *         64(%ebx). The constants are defined outside this part of the
       *         file; presumably the mask is 0x3ffffff (26 bits) -- confirm.
       * Output: nine 16-byte rows at 48(%edi)..191(%edi): five limb rows
       *         followed by four rows holding 5 * limb for limbs 1..4.
       *         %edi is restored to its entry value before returning.
       * Clobbers: %ecx, %edx, %ebp (holds the caller's %esp), %xmm0-%xmm7,
       *         flags.
       *
       * Scratch (224 bytes, 16-byte aligned):
       *     0..79(%esp)   current five limbs
       *    80..143(%esp)  5 * limb for limbs 1..4
       *   144..223(%esp)  (%edx) frame: low quadword of each limb duplicated
       *
       * The .L005square loop runs twice. NOTE(review): by my reading the
       * first pass yields the square of the input and the second pass the
       * third and fourth powers, so each stored row is ordered
       * [x^4, x^3, x^2, x] by 32-bit lane -- confirm against the generator.
       */
  298 _poly1305_init_sse2:
  299         movdqu  24(%edi),%xmm4       /* 16-byte source value (unaligned load) */
  300         leal    48(%edi),%edi        /* %edi = start of the output table */
  301         movl    %esp,%ebp            /* remember the caller's stack pointer */
  302         subl    $224,%esp
  303         andl    $-16,%esp            /* movdqa below needs 16-byte alignment */
  304         movq    64(%ebx),%xmm7       /* limb mask in the low quadword */
  305         movdqa  %xmm4,%xmm0
  306         movdqa  %xmm4,%xmm1
  307         movdqa  %xmm4,%xmm2
  308         pand    %xmm7,%xmm0          /* limb 0: lowest bits */
  309         psrlq   $26,%xmm1
  310         psrldq  $6,%xmm2             /* byte shift: drop the low 48 bits */
  311         pand    %xmm7,%xmm1          /* limb 1: from bit 26 */
  312         movdqa  %xmm2,%xmm3
  313         psrlq   $4,%xmm2
  314         psrlq   $30,%xmm3
  315         pand    %xmm7,%xmm2          /* limb 2: from bit 48+4 = 52 */
  316         pand    %xmm7,%xmm3          /* limb 3: from bit 48+30 = 78 */
  317         psrldq  $13,%xmm4            /* limb 4: from bit 104 (13 bytes) */
  318         leal    144(%esp),%edx       /* %edx = broadcast frame */
  319         movl    $2,%ecx              /* two multiplication passes */
  320 .L005square:
  321         movdqa  %xmm0,(%esp)         /* save the current limbs */
  322         movdqa  %xmm1,16(%esp)
  323         movdqa  %xmm2,32(%esp)
  324         movdqa  %xmm3,48(%esp)
  325         movdqa  %xmm4,64(%esp)
  326         movdqa  %xmm1,%xmm6
  327         movdqa  %xmm2,%xmm5
  328         pslld   $2,%xmm6
  329         pslld   $2,%xmm5
  330         paddd   %xmm1,%xmm6          /* 5 * limb1 = (limb1 << 2) + limb1 */
  331         paddd   %xmm2,%xmm5          /* 5 * limb2 */
  332         movdqa  %xmm6,80(%esp)
  333         movdqa  %xmm5,96(%esp)
  334         movdqa  %xmm3,%xmm6
  335         movdqa  %xmm4,%xmm5
  336         pslld   $2,%xmm6
  337         pslld   $2,%xmm5
  338         paddd   %xmm3,%xmm6          /* 5 * limb3 */
  339         paddd   %xmm4,%xmm5          /* 5 * limb4 */
  340         movdqa  %xmm6,112(%esp)
  341         movdqa  %xmm5,128(%esp)
  342         pshufd  $68,%xmm0,%xmm6      /* $68 copies the low quadword into both halves */
  343         movdqa  %xmm1,%xmm5          /* keep an unshuffled copy of limb 1 */
  344         pshufd  $68,%xmm1,%xmm1
  345         pshufd  $68,%xmm2,%xmm2
  346         pshufd  $68,%xmm3,%xmm3
  347         pshufd  $68,%xmm4,%xmm4
  348         movdqa  %xmm6,(%edx)         /* broadcast limbs -> (%edx) frame */
  349         movdqa  %xmm1,16(%edx)
  350         movdqa  %xmm2,32(%edx)
  351         movdqa  %xmm3,48(%edx)
  352         movdqa  %xmm4,64(%edx)
  353         pmuludq %xmm0,%xmm4          /* start the five column sums with the limb-0 products */
  354         pmuludq %xmm0,%xmm3
  355         pmuludq %xmm0,%xmm2
  356         pmuludq %xmm0,%xmm1
  357         pmuludq %xmm6,%xmm0
  358         movdqa  %xmm5,%xmm6          /* limb-1 products */
  359         pmuludq 48(%edx),%xmm5
  360         movdqa  %xmm6,%xmm7
  361         pmuludq 32(%edx),%xmm6
  362         paddq   %xmm5,%xmm4
  363         movdqa  %xmm7,%xmm5
  364         pmuludq 16(%edx),%xmm7
  365         paddq   %xmm6,%xmm3
  366         movdqa  80(%esp),%xmm6       /* 5 * limb1 feeds the wrapped-around term */
  367         pmuludq (%edx),%xmm5
  368         paddq   %xmm7,%xmm2
  369         pmuludq 64(%edx),%xmm6
  370         movdqa  32(%esp),%xmm7       /* limb-2 products */
  371         paddq   %xmm5,%xmm1
  372         movdqa  %xmm7,%xmm5
  373         pmuludq 32(%edx),%xmm7
  374         paddq   %xmm6,%xmm0
  375         movdqa  %xmm5,%xmm6
  376         pmuludq 16(%edx),%xmm5
  377         paddq   %xmm7,%xmm4
  378         movdqa  96(%esp),%xmm7       /* 5 * limb2 */
  379         pmuludq (%edx),%xmm6
  380         paddq   %xmm5,%xmm3
  381         movdqa  %xmm7,%xmm5
  382         pmuludq 64(%edx),%xmm7
  383         paddq   %xmm6,%xmm2
  384         pmuludq 48(%edx),%xmm5
  385         movdqa  48(%esp),%xmm6       /* limb-3 products */
  386         paddq   %xmm7,%xmm1
  387         movdqa  %xmm6,%xmm7
  388         pmuludq 16(%edx),%xmm6
  389         paddq   %xmm5,%xmm0
  390         movdqa  112(%esp),%xmm5      /* 5 * limb3 */
  391         pmuludq (%edx),%xmm7
  392         paddq   %xmm6,%xmm4
  393         movdqa  %xmm5,%xmm6
  394         pmuludq 64(%edx),%xmm5
  395         paddq   %xmm7,%xmm3
  396         movdqa  %xmm6,%xmm7
  397         pmuludq 48(%edx),%xmm6
  398         paddq   %xmm5,%xmm2
  399         pmuludq 32(%edx),%xmm7
  400         movdqa  64(%esp),%xmm5       /* limb-4 products */
  401         paddq   %xmm6,%xmm1
  402         movdqa  128(%esp),%xmm6      /* 5 * limb4 */
  403         pmuludq (%edx),%xmm5
  404         paddq   %xmm7,%xmm0
  405         movdqa  %xmm6,%xmm7
  406         pmuludq 64(%edx),%xmm6
  407         paddq   %xmm5,%xmm4
  408         movdqa  %xmm7,%xmm5
  409         pmuludq 16(%edx),%xmm7
  410         paddq   %xmm6,%xmm3
  411         movdqa  %xmm5,%xmm6
  412         pmuludq 32(%edx),%xmm5
  413         paddq   %xmm7,%xmm0
  414         pmuludq 48(%edx),%xmm6
  415         movdqa  64(%ebx),%xmm7       /* reload the limb mask (%xmm7 was used as scratch) */
  416         paddq   %xmm5,%xmm1
  417         paddq   %xmm6,%xmm2
  418         movdqa  %xmm3,%xmm5          /* carry pass: move bits above the mask up one limb */
  419         pand    %xmm7,%xmm3
  420         psrlq   $26,%xmm5
  421         paddq   %xmm4,%xmm5          /* column 4 += carry from column 3 */
  422         movdqa  %xmm0,%xmm6
  423         pand    %xmm7,%xmm0
  424         psrlq   $26,%xmm6
  425         movdqa  %xmm5,%xmm4
  426         paddq   %xmm1,%xmm6          /* column 1 += carry from column 0 */
  427         psrlq   $26,%xmm5            /* carry out of the top column ... */
  428         pand    %xmm7,%xmm4
  429         movdqa  %xmm6,%xmm1
  430         psrlq   $26,%xmm6
  431         paddd   %xmm5,%xmm0          /* ... is added to column 0 once ... */
  432         psllq   $2,%xmm5
  433         paddq   %xmm2,%xmm6          /* column 2 += carry from column 1 */
  434         paddq   %xmm0,%xmm5          /* ... plus four more times: 5 * carry in total */
  435         pand    %xmm7,%xmm1
  436         movdqa  %xmm6,%xmm2
  437         psrlq   $26,%xmm6
  438         pand    %xmm7,%xmm2
  439         paddd   %xmm3,%xmm6          /* column 3 += carry from column 2 */
  440         movdqa  %xmm5,%xmm0
  441         psrlq   $26,%xmm5
  442         movdqa  %xmm6,%xmm3
  443         psrlq   $26,%xmm6
  444         pand    %xmm7,%xmm0
  445         paddd   %xmm5,%xmm1          /* second carry out of column 0 */
  446         pand    %xmm7,%xmm3
  447         paddd   %xmm6,%xmm4          /* second carry out of column 3 */
  448         decl    %ecx
  449         jz      .L006square_break
  450         punpcklqdq      (%esp),%xmm0     /* low half = new product, high half = saved input limb */
  451         punpcklqdq      16(%esp),%xmm1
  452         punpcklqdq      32(%esp),%xmm2
  453         punpcklqdq      48(%esp),%xmm3
  454         punpcklqdq      64(%esp),%xmm4
  455         jmp     .L005square
  456 .L006square_break:
  457         psllq   $32,%xmm0            /* move the second-pass results into the odd 32-bit lanes */
  458         psllq   $32,%xmm1
  459         psllq   $32,%xmm2
  460         psllq   $32,%xmm3
  461         psllq   $32,%xmm4
  462         por     (%esp),%xmm0         /* merge with the values saved at the start of that pass */
  463         por     16(%esp),%xmm1
  464         por     32(%esp),%xmm2
  465         por     48(%esp),%xmm3
  466         por     64(%esp),%xmm4
  467         pshufd  $141,%xmm0,%xmm0     /* $141 reorders the lanes as source [1,3,0,2] */
  468         pshufd  $141,%xmm1,%xmm1
  469         pshufd  $141,%xmm2,%xmm2
  470         pshufd  $141,%xmm3,%xmm3
  471         pshufd  $141,%xmm4,%xmm4
  472         movdqu  %xmm0,(%edi)         /* limb rows 0..4 of the table */
  473         movdqu  %xmm1,16(%edi)
  474         movdqu  %xmm2,32(%edi)
  475         movdqu  %xmm3,48(%edi)
  476         movdqu  %xmm4,64(%edi)
  477         movdqa  %xmm1,%xmm6
  478         movdqa  %xmm2,%xmm5
  479         pslld   $2,%xmm6
  480         pslld   $2,%xmm5
  481         paddd   %xmm1,%xmm6          /* 5 * limb1 row */
  482         paddd   %xmm2,%xmm5          /* 5 * limb2 row */
  483         movdqu  %xmm6,80(%edi)
  484         movdqu  %xmm5,96(%edi)
  485         movdqa  %xmm3,%xmm6
  486         movdqa  %xmm4,%xmm5
  487         pslld   $2,%xmm6
  488         pslld   $2,%xmm5
  489         paddd   %xmm3,%xmm6          /* 5 * limb3 row */
  490         paddd   %xmm4,%xmm5          /* 5 * limb4 row */
  491         movdqu  %xmm6,112(%edi)
  492         movdqu  %xmm5,128(%edi)
  493         movl    %ebp,%esp            /* release the scratch area */
  494         leal    -48(%edi),%edi       /* restore the state pointer */
  495         ret
  496 .size   _poly1305_init_sse2,.-_poly1305_init_sse2
  497 .align  32
  498 .type   _poly1305_blocks_sse2,@function
  499 .align  16
  500 _poly1305_blocks_sse2:
  501         pushl   %ebp
  502         pushl   %ebx
  503         pushl   %esi
  504         pushl   %edi
  505         movl    20(%esp),%edi
  506         movl    24(%esp),%esi
  507         movl    28(%esp),%ecx
  508         movl    20(%edi),%eax
  509         andl    $-16,%ecx
  510         jz      .L007nodata
  511         cmpl    $64,%ecx
  512         jae     .L008enter_sse2
  513         testl   %eax,%eax
  514         jz      .Lenter_blocks
  515 .align  16
  516 .L008enter_sse2:
  517         call    .L009pic_point
  518 .L009pic_point:
  519         popl    %ebx
  520         leal    .Lconst_sse2-.L009pic_point(%ebx),%ebx
  521         testl   %eax,%eax
  522         jnz     .L010base2_26
  523         call    _poly1305_init_sse2
  524         movl    (%edi),%eax
  525         movl    3(%edi),%ecx
  526         movl    6(%edi),%edx
  527         movl    9(%edi),%esi
  528         movl    13(%edi),%ebp
  529         movl    $1,20(%edi)
  530         shrl    $2,%ecx
  531         andl    $67108863,%eax
  532         shrl    $4,%edx
  533         andl    $67108863,%ecx
  534         shrl    $6,%esi
  535         andl    $67108863,%edx
  536         movd    %eax,%xmm0
  537         movd    %ecx,%xmm1
  538         movd    %edx,%xmm2
  539         movd    %esi,%xmm3
  540         movd    %ebp,%xmm4
  541         movl    24(%esp),%esi
  542         movl    28(%esp),%ecx
  543         jmp     .L011base2_32
  544 .align  16
  545 .L010base2_26:
  546         movd    (%edi),%xmm0
  547         movd    4(%edi),%xmm1
  548         movd    8(%edi),%xmm2
  549         movd    12(%edi),%xmm3
  550         movd    16(%edi),%xmm4
  551         movdqa  64(%ebx),%xmm7
  552 .L011base2_32:
  553         movl    32(%esp),%eax
  554         movl    %esp,%ebp
  555         subl    $528,%esp
  556         andl    $-16,%esp
  557         leal    48(%edi),%edi
  558         shll    $24,%eax
  559         testl   $31,%ecx
  560         jz      .L012even
  561         movdqu  (%esi),%xmm6
  562         leal    16(%esi),%esi
  563         movdqa  %xmm6,%xmm5
  564         pand    %xmm7,%xmm6
  565         paddd   %xmm6,%xmm0
  566         movdqa  %xmm5,%xmm6
  567         psrlq   $26,%xmm5
  568         psrldq  $6,%xmm6
  569         pand    %xmm7,%xmm5
  570         paddd   %xmm5,%xmm1
  571         movdqa  %xmm6,%xmm5
  572         psrlq   $4,%xmm6
  573         pand    %xmm7,%xmm6
  574         paddd   %xmm6,%xmm2
  575         movdqa  %xmm5,%xmm6
  576         psrlq   $30,%xmm5
  577         pand    %xmm7,%xmm5
  578         psrldq  $7,%xmm6
  579         paddd   %xmm5,%xmm3
  580         movd    %eax,%xmm5
  581         paddd   %xmm6,%xmm4
  582         movd    12(%edi),%xmm6
  583         paddd   %xmm5,%xmm4
  584         movdqa  %xmm0,(%esp)
  585         movdqa  %xmm1,16(%esp)
  586         movdqa  %xmm2,32(%esp)
  587         movdqa  %xmm3,48(%esp)
  588         movdqa  %xmm4,64(%esp)
  589         pmuludq %xmm6,%xmm0
  590         pmuludq %xmm6,%xmm1
  591         pmuludq %xmm6,%xmm2
  592         movd    28(%edi),%xmm5
  593         pmuludq %xmm6,%xmm3
  594         pmuludq %xmm6,%xmm4
  595         movdqa  %xmm5,%xmm6
  596         pmuludq 48(%esp),%xmm5
  597         movdqa  %xmm6,%xmm7
  598         pmuludq 32(%esp),%xmm6
  599         paddq   %xmm5,%xmm4
  600         movdqa  %xmm7,%xmm5
  601         pmuludq 16(%esp),%xmm7
  602         paddq   %xmm6,%xmm3
  603         movd    92(%edi),%xmm6
  604         pmuludq (%esp),%xmm5
  605         paddq   %xmm7,%xmm2
  606         pmuludq 64(%esp),%xmm6
  607         movd    44(%edi),%xmm7
  608         paddq   %xmm5,%xmm1
  609         movdqa  %xmm7,%xmm5
  610         pmuludq 32(%esp),%xmm7
  611         paddq   %xmm6,%xmm0
  612         movdqa  %xmm5,%xmm6
  613         pmuludq 16(%esp),%xmm5
  614         paddq   %xmm7,%xmm4
  615         movd    108(%edi),%xmm7
  616         pmuludq (%esp),%xmm6
  617         paddq   %xmm5,%xmm3
  618         movdqa  %xmm7,%xmm5
  619         pmuludq 64(%esp),%xmm7
  620         paddq   %xmm6,%xmm2
  621         pmuludq 48(%esp),%xmm5
  622         movd    60(%edi),%xmm6
  623         paddq   %xmm7,%xmm1
  624         movdqa  %xmm6,%xmm7
  625         pmuludq 16(%esp),%xmm6
  626         paddq   %xmm5,%xmm0
  627         movd    124(%edi),%xmm5
  628         pmuludq (%esp),%xmm7
  629         paddq   %xmm6,%xmm4
  630         movdqa  %xmm5,%xmm6
  631         pmuludq 64(%esp),%xmm5
  632         paddq   %xmm7,%xmm3
  633         movdqa  %xmm6,%xmm7
  634         pmuludq 48(%esp),%xmm6
  635         paddq   %xmm5,%xmm2
  636         pmuludq 32(%esp),%xmm7
  637         movd    76(%edi),%xmm5
  638         paddq   %xmm6,%xmm1
  639         movd    140(%edi),%xmm6
  640         pmuludq (%esp),%xmm5
  641         paddq   %xmm7,%xmm0
  642         movdqa  %xmm6,%xmm7
  643         pmuludq 64(%esp),%xmm6
  644         paddq   %xmm5,%xmm4
  645         movdqa  %xmm7,%xmm5
  646         pmuludq 16(%esp),%xmm7
  647         paddq   %xmm6,%xmm3
  648         movdqa  %xmm5,%xmm6
  649         pmuludq 32(%esp),%xmm5
  650         paddq   %xmm7,%xmm0
  651         pmuludq 48(%esp),%xmm6
  652         movdqa  64(%ebx),%xmm7
  653         paddq   %xmm5,%xmm1
  654         paddq   %xmm6,%xmm2
  655         movdqa  %xmm3,%xmm5
  656         pand    %xmm7,%xmm3
  657         psrlq   $26,%xmm5
  658         paddq   %xmm4,%xmm5
  659         movdqa  %xmm0,%xmm6
  660         pand    %xmm7,%xmm0
  661         psrlq   $26,%xmm6
  662         movdqa  %xmm5,%xmm4
  663         paddq   %xmm1,%xmm6
  664         psrlq   $26,%xmm5
  665         pand    %xmm7,%xmm4
  666         movdqa  %xmm6,%xmm1
  667         psrlq   $26,%xmm6
  668         paddd   %xmm5,%xmm0
  669         psllq   $2,%xmm5
  670         paddq   %xmm2,%xmm6
  671         paddq   %xmm0,%xmm5
  672         pand    %xmm7,%xmm1
  673         movdqa  %xmm6,%xmm2
  674         psrlq   $26,%xmm6
  675         pand    %xmm7,%xmm2
  676         paddd   %xmm3,%xmm6
  677         movdqa  %xmm5,%xmm0
  678         psrlq   $26,%xmm5
  679         movdqa  %xmm6,%xmm3
  680         psrlq   $26,%xmm6
  681         pand    %xmm7,%xmm0
  682         paddd   %xmm5,%xmm1
  683         pand    %xmm7,%xmm3
  684         paddd   %xmm6,%xmm4
  685         subl    $16,%ecx
  686         jz      .L013done
  687 .L012even:
  688         leal    384(%esp),%edx
  689         leal    -32(%esi),%eax
  690         subl    $64,%ecx
  691         movdqu  (%edi),%xmm5
  692         pshufd  $68,%xmm5,%xmm6
  693         cmovbl  %eax,%esi
  694         pshufd  $238,%xmm5,%xmm5
  695         movdqa  %xmm6,(%edx)
  696         leal    160(%esp),%eax
  697         movdqu  16(%edi),%xmm6
  698         movdqa  %xmm5,-144(%edx)
  699         pshufd  $68,%xmm6,%xmm5
  700         pshufd  $238,%xmm6,%xmm6
  701         movdqa  %xmm5,16(%edx)
  702         movdqu  32(%edi),%xmm5
  703         movdqa  %xmm6,-128(%edx)
  704         pshufd  $68,%xmm5,%xmm6
  705         pshufd  $238,%xmm5,%xmm5
  706         movdqa  %xmm6,32(%edx)
  707         movdqu  48(%edi),%xmm6
  708         movdqa  %xmm5,-112(%edx)
  709         pshufd  $68,%xmm6,%xmm5
  710         pshufd  $238,%xmm6,%xmm6
  711         movdqa  %xmm5,48(%edx)
  712         movdqu  64(%edi),%xmm5
  713         movdqa  %xmm6,-96(%edx)
  714         pshufd  $68,%xmm5,%xmm6
  715         pshufd  $238,%xmm5,%xmm5
  716         movdqa  %xmm6,64(%edx)
  717         movdqu  80(%edi),%xmm6
  718         movdqa  %xmm5,-80(%edx)
  719         pshufd  $68,%xmm6,%xmm5
  720         pshufd  $238,%xmm6,%xmm6
  721         movdqa  %xmm5,80(%edx)
  722         movdqu  96(%edi),%xmm5
  723         movdqa  %xmm6,-64(%edx)
  724         pshufd  $68,%xmm5,%xmm6
  725         pshufd  $238,%xmm5,%xmm5
  726         movdqa  %xmm6,96(%edx)
  727         movdqu  112(%edi),%xmm6
  728         movdqa  %xmm5,-48(%edx)
  729         pshufd  $68,%xmm6,%xmm5
  730         pshufd  $238,%xmm6,%xmm6
  731         movdqa  %xmm5,112(%edx)
  732         movdqu  128(%edi),%xmm5
  733         movdqa  %xmm6,-32(%edx)
  734         pshufd  $68,%xmm5,%xmm6
  735         pshufd  $238,%xmm5,%xmm5
  736         movdqa  %xmm6,128(%edx)
  737         movdqa  %xmm5,-16(%edx)
  738         movdqu  32(%esi),%xmm5
  739         movdqu  48(%esi),%xmm6
  740         leal    32(%esi),%esi
  741         movdqa  %xmm2,112(%esp)
  742         movdqa  %xmm3,128(%esp)
  743         movdqa  %xmm4,144(%esp)
  744         movdqa  %xmm5,%xmm2
  745         movdqa  %xmm6,%xmm3
  746         psrldq  $6,%xmm2
  747         psrldq  $6,%xmm3
  748         movdqa  %xmm5,%xmm4
  749         punpcklqdq      %xmm3,%xmm2
  750         punpckhqdq      %xmm6,%xmm4
  751         punpcklqdq      %xmm6,%xmm5
  752         movdqa  %xmm2,%xmm3
  753         psrlq   $4,%xmm2
  754         psrlq   $30,%xmm3
  755         movdqa  %xmm5,%xmm6
  756         psrlq   $40,%xmm4
  757         psrlq   $26,%xmm6
  758         pand    %xmm7,%xmm5
  759         pand    %xmm7,%xmm6
  760         pand    %xmm7,%xmm2
  761         pand    %xmm7,%xmm3
  762         por     (%ebx),%xmm4
  763         movdqa  %xmm0,80(%esp)
  764         movdqa  %xmm1,96(%esp)
  765         jbe     .L014skip_loop
  766         jmp     .L015loop
  767 .align  32
  768 .L015loop:
  769         movdqa  -144(%edx),%xmm7
  770         movdqa  %xmm6,16(%eax)
  771         movdqa  %xmm2,32(%eax)
  772         movdqa  %xmm3,48(%eax)
  773         movdqa  %xmm4,64(%eax)
  774         movdqa  %xmm5,%xmm1
  775         pmuludq %xmm7,%xmm5
  776         movdqa  %xmm6,%xmm0
  777         pmuludq %xmm7,%xmm6
  778         pmuludq %xmm7,%xmm2
  779         pmuludq %xmm7,%xmm3
  780         pmuludq %xmm7,%xmm4
  781         pmuludq -16(%edx),%xmm0
  782         movdqa  %xmm1,%xmm7
  783         pmuludq -128(%edx),%xmm1
  784         paddq   %xmm5,%xmm0
  785         movdqa  %xmm7,%xmm5
  786         pmuludq -112(%edx),%xmm7
  787         paddq   %xmm6,%xmm1
  788         movdqa  %xmm5,%xmm6
  789         pmuludq -96(%edx),%xmm5
  790         paddq   %xmm7,%xmm2
  791         movdqa  16(%eax),%xmm7
  792         pmuludq -80(%edx),%xmm6
  793         paddq   %xmm5,%xmm3
  794         movdqa  %xmm7,%xmm5
  795         pmuludq -128(%edx),%xmm7
  796         paddq   %xmm6,%xmm4
  797         movdqa  %xmm5,%xmm6
  798         pmuludq -112(%edx),%xmm5
  799         paddq   %xmm7,%xmm2
  800         movdqa  32(%eax),%xmm7
  801         pmuludq -96(%edx),%xmm6
  802         paddq   %xmm5,%xmm3
  803         movdqa  %xmm7,%xmm5
  804         pmuludq -32(%edx),%xmm7
  805         paddq   %xmm6,%xmm4
  806         movdqa  %xmm5,%xmm6
  807         pmuludq -16(%edx),%xmm5
  808         paddq   %xmm7,%xmm0
  809         movdqa  %xmm6,%xmm7
  810         pmuludq -128(%edx),%xmm6
  811         paddq   %xmm5,%xmm1
  812         movdqa  48(%eax),%xmm5
  813         pmuludq -112(%edx),%xmm7
  814         paddq   %xmm6,%xmm3
  815         movdqa  %xmm5,%xmm6
  816         pmuludq -48(%edx),%xmm5
  817         paddq   %xmm7,%xmm4
  818         movdqa  %xmm6,%xmm7
  819         pmuludq -32(%edx),%xmm6
  820         paddq   %xmm5,%xmm0
  821         movdqa  %xmm7,%xmm5
  822         pmuludq -16(%edx),%xmm7
  823         paddq   %xmm6,%xmm1
  824         movdqa  64(%eax),%xmm6
  825         pmuludq -128(%edx),%xmm5
  826         paddq   %xmm7,%xmm2
  827         movdqa  %xmm6,%xmm7
  828         pmuludq -16(%edx),%xmm6
  829         paddq   %xmm5,%xmm4
  830         movdqa  %xmm7,%xmm5
  831         pmuludq -64(%edx),%xmm7
  832         paddq   %xmm6,%xmm3
  833         movdqa  %xmm5,%xmm6
  834         pmuludq -48(%edx),%xmm5
  835         paddq   %xmm7,%xmm0
  836         movdqa  64(%ebx),%xmm7
  837         pmuludq -32(%edx),%xmm6
  838         paddq   %xmm5,%xmm1
  839         paddq   %xmm6,%xmm2
  840         movdqu  -32(%esi),%xmm5
  841         movdqu  -16(%esi),%xmm6
  842         leal    32(%esi),%esi
  843         movdqa  %xmm2,32(%esp)
  844         movdqa  %xmm3,48(%esp)
  845         movdqa  %xmm4,64(%esp)
  846         movdqa  %xmm5,%xmm2
  847         movdqa  %xmm6,%xmm3
  848         psrldq  $6,%xmm2
  849         psrldq  $6,%xmm3
  850         movdqa  %xmm5,%xmm4
  851         punpcklqdq      %xmm3,%xmm2
  852         punpckhqdq      %xmm6,%xmm4
  853         punpcklqdq      %xmm6,%xmm5
  854         movdqa  %xmm2,%xmm3
  855         psrlq   $4,%xmm2
  856         psrlq   $30,%xmm3
  857         movdqa  %xmm5,%xmm6
  858         psrlq   $40,%xmm4
  859         psrlq   $26,%xmm6
  860         pand    %xmm7,%xmm5
  861         pand    %xmm7,%xmm6
  862         pand    %xmm7,%xmm2
  863         pand    %xmm7,%xmm3
  864         por     (%ebx),%xmm4
  865         leal    -32(%esi),%eax
  866         subl    $64,%ecx
  867         paddd   80(%esp),%xmm5
  868         paddd   96(%esp),%xmm6
  869         paddd   112(%esp),%xmm2
  870         paddd   128(%esp),%xmm3
  871         paddd   144(%esp),%xmm4
  872         cmovbl  %eax,%esi
  873         leal    160(%esp),%eax
  874         movdqa  (%edx),%xmm7
  875         movdqa  %xmm1,16(%esp)
  876         movdqa  %xmm6,16(%eax)
  877         movdqa  %xmm2,32(%eax)
  878         movdqa  %xmm3,48(%eax)
  879         movdqa  %xmm4,64(%eax)
  880         movdqa  %xmm5,%xmm1
  881         pmuludq %xmm7,%xmm5
  882         paddq   %xmm0,%xmm5
  883         movdqa  %xmm6,%xmm0
  884         pmuludq %xmm7,%xmm6
  885         pmuludq %xmm7,%xmm2
  886         pmuludq %xmm7,%xmm3
  887         pmuludq %xmm7,%xmm4
  888         paddq   16(%esp),%xmm6
  889         paddq   32(%esp),%xmm2
  890         paddq   48(%esp),%xmm3
  891         paddq   64(%esp),%xmm4
  892         pmuludq 128(%edx),%xmm0
  893         movdqa  %xmm1,%xmm7
  894         pmuludq 16(%edx),%xmm1
  895         paddq   %xmm5,%xmm0
  896         movdqa  %xmm7,%xmm5
  897         pmuludq 32(%edx),%xmm7
  898         paddq   %xmm6,%xmm1
  899         movdqa  %xmm5,%xmm6
  900         pmuludq 48(%edx),%xmm5
  901         paddq   %xmm7,%xmm2
  902         movdqa  16(%eax),%xmm7
  903         pmuludq 64(%edx),%xmm6
  904         paddq   %xmm5,%xmm3
  905         movdqa  %xmm7,%xmm5
  906         pmuludq 16(%edx),%xmm7
  907         paddq   %xmm6,%xmm4
  908         movdqa  %xmm5,%xmm6
  909         pmuludq 32(%edx),%xmm5
  910         paddq   %xmm7,%xmm2
  911         movdqa  32(%eax),%xmm7
  912         pmuludq 48(%edx),%xmm6
  913         paddq   %xmm5,%xmm3
  914         movdqa  %xmm7,%xmm5
  915         pmuludq 112(%edx),%xmm7
  916         paddq   %xmm6,%xmm4
  917         movdqa  %xmm5,%xmm6
  918         pmuludq 128(%edx),%xmm5
  919         paddq   %xmm7,%xmm0
  920         movdqa  %xmm6,%xmm7
  921         pmuludq 16(%edx),%xmm6
  922         paddq   %xmm5,%xmm1
  923         movdqa  48(%eax),%xmm5
  924         pmuludq 32(%edx),%xmm7
  925         paddq   %xmm6,%xmm3
  926         movdqa  %xmm5,%xmm6
  927         pmuludq 96(%edx),%xmm5
  928         paddq   %xmm7,%xmm4
  929         movdqa  %xmm6,%xmm7
  930         pmuludq 112(%edx),%xmm6
  931         paddq   %xmm5,%xmm0
  932         movdqa  %xmm7,%xmm5
  933         pmuludq 128(%edx),%xmm7
  934         paddq   %xmm6,%xmm1
  935         movdqa  64(%eax),%xmm6
  936         pmuludq 16(%edx),%xmm5
  937         paddq   %xmm7,%xmm2
  938         movdqa  %xmm6,%xmm7
  939         pmuludq 128(%edx),%xmm6
  940         paddq   %xmm5,%xmm4
  941         movdqa  %xmm7,%xmm5
  942         pmuludq 80(%edx),%xmm7
  943         paddq   %xmm6,%xmm3
  944         movdqa  %xmm5,%xmm6
  945         pmuludq 96(%edx),%xmm5
  946         paddq   %xmm7,%xmm0
  947         movdqa  64(%ebx),%xmm7
  948         pmuludq 112(%edx),%xmm6
  949         paddq   %xmm5,%xmm1
  950         paddq   %xmm6,%xmm2
  951         movdqa  %xmm3,%xmm5
  952         pand    %xmm7,%xmm3
  953         psrlq   $26,%xmm5
  954         paddq   %xmm4,%xmm5
  955         movdqa  %xmm0,%xmm6
  956         pand    %xmm7,%xmm0
  957         psrlq   $26,%xmm6
  958         movdqa  %xmm5,%xmm4
  959         paddq   %xmm1,%xmm6
  960         psrlq   $26,%xmm5
  961         pand    %xmm7,%xmm4
  962         movdqa  %xmm6,%xmm1
  963         psrlq   $26,%xmm6
  964         paddd   %xmm5,%xmm0
  965         psllq   $2,%xmm5
  966         paddq   %xmm2,%xmm6
  967         paddq   %xmm0,%xmm5
  968         pand    %xmm7,%xmm1
  969         movdqa  %xmm6,%xmm2
  970         psrlq   $26,%xmm6
  971         pand    %xmm7,%xmm2
  972         paddd   %xmm3,%xmm6
  973         movdqa  %xmm5,%xmm0
  974         psrlq   $26,%xmm5
  975         movdqa  %xmm6,%xmm3
  976         psrlq   $26,%xmm6
  977         pand    %xmm7,%xmm0
  978         paddd   %xmm5,%xmm1
  979         pand    %xmm7,%xmm3
  980         paddd   %xmm6,%xmm4
  981         movdqu  32(%esi),%xmm5
  982         movdqu  48(%esi),%xmm6
  983         leal    32(%esi),%esi
  984         movdqa  %xmm2,112(%esp)
  985         movdqa  %xmm3,128(%esp)
  986         movdqa  %xmm4,144(%esp)
  987         movdqa  %xmm5,%xmm2
  988         movdqa  %xmm6,%xmm3
  989         psrldq  $6,%xmm2
  990         psrldq  $6,%xmm3
  991         movdqa  %xmm5,%xmm4
  992         punpcklqdq      %xmm3,%xmm2
  993         punpckhqdq      %xmm6,%xmm4
  994         punpcklqdq      %xmm6,%xmm5
  995         movdqa  %xmm2,%xmm3
  996         psrlq   $4,%xmm2
  997         psrlq   $30,%xmm3
  998         movdqa  %xmm5,%xmm6
  999         psrlq   $40,%xmm4
 1000         psrlq   $26,%xmm6
 1001         pand    %xmm7,%xmm5
 1002         pand    %xmm7,%xmm6
 1003         pand    %xmm7,%xmm2
 1004         pand    %xmm7,%xmm3
 1005         por     (%ebx),%xmm4
 1006         movdqa  %xmm0,80(%esp)
 1007         movdqa  %xmm1,96(%esp)
 1008         ja      .L015loop
 1009 .L014skip_loop:
 1010         pshufd  $16,-144(%edx),%xmm7
 1011         addl    $32,%ecx
 1012         jnz     .L016long_tail
 1013         paddd   %xmm0,%xmm5
 1014         paddd   %xmm1,%xmm6
 1015         paddd   112(%esp),%xmm2
 1016         paddd   128(%esp),%xmm3
 1017         paddd   144(%esp),%xmm4
 1018 .L016long_tail:
 1019         movdqa  %xmm5,(%eax)
 1020         movdqa  %xmm6,16(%eax)
 1021         movdqa  %xmm2,32(%eax)
 1022         movdqa  %xmm3,48(%eax)
 1023         movdqa  %xmm4,64(%eax)
 1024         pmuludq %xmm7,%xmm5
 1025         pmuludq %xmm7,%xmm6
 1026         pmuludq %xmm7,%xmm2
 1027         movdqa  %xmm5,%xmm0
 1028         pshufd  $16,-128(%edx),%xmm5
 1029         pmuludq %xmm7,%xmm3
 1030         movdqa  %xmm6,%xmm1
 1031         pmuludq %xmm7,%xmm4
 1032         movdqa  %xmm5,%xmm6
 1033         pmuludq 48(%eax),%xmm5
 1034         movdqa  %xmm6,%xmm7
 1035         pmuludq 32(%eax),%xmm6
 1036         paddq   %xmm5,%xmm4
 1037         movdqa  %xmm7,%xmm5
 1038         pmuludq 16(%eax),%xmm7
 1039         paddq   %xmm6,%xmm3
 1040         pshufd  $16,-64(%edx),%xmm6
 1041         pmuludq (%eax),%xmm5
 1042         paddq   %xmm7,%xmm2
 1043         pmuludq 64(%eax),%xmm6
 1044         pshufd  $16,-112(%edx),%xmm7
 1045         paddq   %xmm5,%xmm1
 1046         movdqa  %xmm7,%xmm5
 1047         pmuludq 32(%eax),%xmm7
 1048         paddq   %xmm6,%xmm0
 1049         movdqa  %xmm5,%xmm6
 1050         pmuludq 16(%eax),%xmm5
 1051         paddq   %xmm7,%xmm4
 1052         pshufd  $16,-48(%edx),%xmm7
 1053         pmuludq (%eax),%xmm6
 1054         paddq   %xmm5,%xmm3
 1055         movdqa  %xmm7,%xmm5
 1056         pmuludq 64(%eax),%xmm7
 1057         paddq   %xmm6,%xmm2
 1058         pmuludq 48(%eax),%xmm5
 1059         pshufd  $16,-96(%edx),%xmm6
 1060         paddq   %xmm7,%xmm1
 1061         movdqa  %xmm6,%xmm7
 1062         pmuludq 16(%eax),%xmm6
 1063         paddq   %xmm5,%xmm0
 1064         pshufd  $16,-32(%edx),%xmm5
 1065         pmuludq (%eax),%xmm7
 1066         paddq   %xmm6,%xmm4
 1067         movdqa  %xmm5,%xmm6
 1068         pmuludq 64(%eax),%xmm5
 1069         paddq   %xmm7,%xmm3
 1070         movdqa  %xmm6,%xmm7
 1071         pmuludq 48(%eax),%xmm6
 1072         paddq   %xmm5,%xmm2
 1073         pmuludq 32(%eax),%xmm7
 1074         pshufd  $16,-80(%edx),%xmm5
 1075         paddq   %xmm6,%xmm1
 1076         pshufd  $16,-16(%edx),%xmm6
 1077         pmuludq (%eax),%xmm5
 1078         paddq   %xmm7,%xmm0
 1079         movdqa  %xmm6,%xmm7
 1080         pmuludq 64(%eax),%xmm6
 1081         paddq   %xmm5,%xmm4
 1082         movdqa  %xmm7,%xmm5
 1083         pmuludq 16(%eax),%xmm7
 1084         paddq   %xmm6,%xmm3
 1085         movdqa  %xmm5,%xmm6
 1086         pmuludq 32(%eax),%xmm5
 1087         paddq   %xmm7,%xmm0
 1088         pmuludq 48(%eax),%xmm6
 1089         movdqa  64(%ebx),%xmm7
 1090         paddq   %xmm5,%xmm1
 1091         paddq   %xmm6,%xmm2
 1092         jz      .L017short_tail
 1093         movdqu  -32(%esi),%xmm5
 1094         movdqu  -16(%esi),%xmm6
 1095         leal    32(%esi),%esi
 1096         movdqa  %xmm2,32(%esp)
 1097         movdqa  %xmm3,48(%esp)
 1098         movdqa  %xmm4,64(%esp)
 1099         movdqa  %xmm5,%xmm2
 1100         movdqa  %xmm6,%xmm3
 1101         psrldq  $6,%xmm2
 1102         psrldq  $6,%xmm3
 1103         movdqa  %xmm5,%xmm4
 1104         punpcklqdq      %xmm3,%xmm2
 1105         punpckhqdq      %xmm6,%xmm4
 1106         punpcklqdq      %xmm6,%xmm5
 1107         movdqa  %xmm2,%xmm3
 1108         psrlq   $4,%xmm2
 1109         psrlq   $30,%xmm3
 1110         movdqa  %xmm5,%xmm6
 1111         psrlq   $40,%xmm4
 1112         psrlq   $26,%xmm6
 1113         pand    %xmm7,%xmm5
 1114         pand    %xmm7,%xmm6
 1115         pand    %xmm7,%xmm2
 1116         pand    %xmm7,%xmm3
 1117         por     (%ebx),%xmm4
 1118         pshufd  $16,(%edx),%xmm7
 1119         paddd   80(%esp),%xmm5
 1120         paddd   96(%esp),%xmm6
 1121         paddd   112(%esp),%xmm2
 1122         paddd   128(%esp),%xmm3
 1123         paddd   144(%esp),%xmm4
 1124         movdqa  %xmm5,(%esp)
 1125         pmuludq %xmm7,%xmm5
 1126         movdqa  %xmm6,16(%esp)
 1127         pmuludq %xmm7,%xmm6
 1128         paddq   %xmm5,%xmm0
 1129         movdqa  %xmm2,%xmm5
 1130         pmuludq %xmm7,%xmm2
 1131         paddq   %xmm6,%xmm1
 1132         movdqa  %xmm3,%xmm6
 1133         pmuludq %xmm7,%xmm3
 1134         paddq   32(%esp),%xmm2
 1135         movdqa  %xmm5,32(%esp)
 1136         pshufd  $16,16(%edx),%xmm5
 1137         paddq   48(%esp),%xmm3
 1138         movdqa  %xmm6,48(%esp)
 1139         movdqa  %xmm4,%xmm6
 1140         pmuludq %xmm7,%xmm4
 1141         paddq   64(%esp),%xmm4
 1142         movdqa  %xmm6,64(%esp)
 1143         movdqa  %xmm5,%xmm6
 1144         pmuludq 48(%esp),%xmm5
 1145         movdqa  %xmm6,%xmm7
 1146         pmuludq 32(%esp),%xmm6
 1147         paddq   %xmm5,%xmm4
 1148         movdqa  %xmm7,%xmm5
 1149         pmuludq 16(%esp),%xmm7
 1150         paddq   %xmm6,%xmm3
 1151         pshufd  $16,80(%edx),%xmm6
 1152         pmuludq (%esp),%xmm5
 1153         paddq   %xmm7,%xmm2
 1154         pmuludq 64(%esp),%xmm6
 1155         pshufd  $16,32(%edx),%xmm7
 1156         paddq   %xmm5,%xmm1
 1157         movdqa  %xmm7,%xmm5
 1158         pmuludq 32(%esp),%xmm7
 1159         paddq   %xmm6,%xmm0
 1160         movdqa  %xmm5,%xmm6
 1161         pmuludq 16(%esp),%xmm5
 1162         paddq   %xmm7,%xmm4
 1163         pshufd  $16,96(%edx),%xmm7
 1164         pmuludq (%esp),%xmm6
 1165         paddq   %xmm5,%xmm3
 1166         movdqa  %xmm7,%xmm5
 1167         pmuludq 64(%esp),%xmm7
 1168         paddq   %xmm6,%xmm2
 1169         pmuludq 48(%esp),%xmm5
 1170         pshufd  $16,48(%edx),%xmm6
 1171         paddq   %xmm7,%xmm1
 1172         movdqa  %xmm6,%xmm7
 1173         pmuludq 16(%esp),%xmm6
 1174         paddq   %xmm5,%xmm0
 1175         pshufd  $16,112(%edx),%xmm5
 1176         pmuludq (%esp),%xmm7
 1177         paddq   %xmm6,%xmm4
 1178         movdqa  %xmm5,%xmm6
 1179         pmuludq 64(%esp),%xmm5
 1180         paddq   %xmm7,%xmm3
 1181         movdqa  %xmm6,%xmm7
 1182         pmuludq 48(%esp),%xmm6
 1183         paddq   %xmm5,%xmm2
 1184         pmuludq 32(%esp),%xmm7
 1185         pshufd  $16,64(%edx),%xmm5
 1186         paddq   %xmm6,%xmm1
 1187         pshufd  $16,128(%edx),%xmm6
 1188         pmuludq (%esp),%xmm5
 1189         paddq   %xmm7,%xmm0
 1190         movdqa  %xmm6,%xmm7
 1191         pmuludq 64(%esp),%xmm6
 1192         paddq   %xmm5,%xmm4
 1193         movdqa  %xmm7,%xmm5
 1194         pmuludq 16(%esp),%xmm7
 1195         paddq   %xmm6,%xmm3
 1196         movdqa  %xmm5,%xmm6
 1197         pmuludq 32(%esp),%xmm5
 1198         paddq   %xmm7,%xmm0
 1199         pmuludq 48(%esp),%xmm6
 1200         movdqa  64(%ebx),%xmm7
 1201         paddq   %xmm5,%xmm1
 1202         paddq   %xmm6,%xmm2
 1203 .L017short_tail:
 1204         pshufd  $78,%xmm4,%xmm6
 1205         pshufd  $78,%xmm3,%xmm5
 1206         paddq   %xmm6,%xmm4
 1207         paddq   %xmm5,%xmm3
 1208         pshufd  $78,%xmm0,%xmm6
 1209         pshufd  $78,%xmm1,%xmm5
 1210         paddq   %xmm6,%xmm0
 1211         paddq   %xmm5,%xmm1
 1212         pshufd  $78,%xmm2,%xmm6
 1213         movdqa  %xmm3,%xmm5
 1214         pand    %xmm7,%xmm3
 1215         psrlq   $26,%xmm5
 1216         paddq   %xmm6,%xmm2
 1217         paddq   %xmm4,%xmm5
 1218         movdqa  %xmm0,%xmm6
 1219         pand    %xmm7,%xmm0
 1220         psrlq   $26,%xmm6
 1221         movdqa  %xmm5,%xmm4
 1222         paddq   %xmm1,%xmm6
 1223         psrlq   $26,%xmm5
 1224         pand    %xmm7,%xmm4
 1225         movdqa  %xmm6,%xmm1
 1226         psrlq   $26,%xmm6
 1227         paddd   %xmm5,%xmm0
 1228         psllq   $2,%xmm5
 1229         paddq   %xmm2,%xmm6
 1230         paddq   %xmm0,%xmm5
 1231         pand    %xmm7,%xmm1
 1232         movdqa  %xmm6,%xmm2
 1233         psrlq   $26,%xmm6
 1234         pand    %xmm7,%xmm2
 1235         paddd   %xmm3,%xmm6
 1236         movdqa  %xmm5,%xmm0
 1237         psrlq   $26,%xmm5
 1238         movdqa  %xmm6,%xmm3
 1239         psrlq   $26,%xmm6
 1240         pand    %xmm7,%xmm0
 1241         paddd   %xmm5,%xmm1
 1242         pand    %xmm7,%xmm3
 1243         paddd   %xmm6,%xmm4
 1244 .L013done:
 1245         movd    %xmm0,-48(%edi)
 1246         movd    %xmm1,-44(%edi)
 1247         movd    %xmm2,-40(%edi)
 1248         movd    %xmm3,-36(%edi)
 1249         movd    %xmm4,-32(%edi)
 1250         movl    %ebp,%esp
 1251 .L007nodata:
 1252         popl    %edi
 1253         popl    %esi
 1254         popl    %ebx
 1255         popl    %ebp
 1256         ret
 1257 .size   _poly1305_blocks_sse2,.-_poly1305_blocks_sse2
  1258 .align  32
  1259 .type   _poly1305_emit_sse2,@function
  1260 .align  16
       /*
        * _poly1305_emit_sse2 -- produce the 16-byte result from the state.
        *
        * Stack arguments (offsets are valid after the four pushes below):
        *   20(%esp)  pointer to the state: five dwords at +0..+16 are read,
        *             and the dword at +20 selects the code path
        *   24(%esp)  pointer to the 16-byte output buffer
        *   28(%esp)  pointer to four dwords that are added to the result
        *             (presumably the nonce/"s" half of the key -- confirm
        *             against the caller)
        *
        * If the dword at state+20 is zero, control goes to .Lenter_emit
        * (a label defined outside this block).  Otherwise the five state
        * dwords are taken as limbs positioned 26 bits apart, repacked into
        * 32-bit words, reduced, and the addend is added.
        *
        * %ebp, %ebx, %esi and %edi are saved and restored.  %eax, %ecx,
        * %edx, %xmm0-%xmm3 and the flags are overwritten.  The output
        * buffer is also used as scratch before the final result is stored.
        */
  1261 _poly1305_emit_sse2:
  1262         pushl   %ebp
  1263         pushl   %ebx
  1264         pushl   %esi
  1265         pushl   %edi
               /* 16 bytes of saved registers plus the return address put the
                * first argument at 20(%esp). */
  1266         movl    20(%esp),%ebp
  1267         cmpl    $0,20(%ebp)
  1268         je      .Lenter_emit
               /* Load limbs h0..h4 into %eax, %edi, %ecx, %edx, %esi. */
  1269         movl    (%ebp),%eax
  1270         movl    4(%ebp),%edi
  1271         movl    8(%ebp),%ecx
  1272         movl    12(%ebp),%edx
  1273         movl    16(%ebp),%esi
               /*
                * Repack the limbs (at bit offsets 0, 26, 52, 78, 104) into
                * 32-bit words %eax:%ebx:%ecx:%edx with the overflow in %esi.
                * Each limb is split into the part that lands in the current
                * word (shll) and the part that lands in the next one (shrl);
                * the carry of every addl is picked up by the following adcl.
                * The movl placed between addl and adcl does not touch CF.
                */
  1274         movl    %edi,%ebx
  1275         shll    $26,%edi
  1276         shrl    $6,%ebx
  1277         addl    %edi,%eax
  1278         movl    %ecx,%edi
  1279         adcl    $0,%ebx
  1280         shll    $20,%edi
  1281         shrl    $12,%ecx
  1282         addl    %edi,%ebx
  1283         movl    %edx,%edi
  1284         adcl    $0,%ecx
  1285         shll    $14,%edi
  1286         shrl    $18,%edx
  1287         addl    %edi,%ecx
  1288         movl    %esi,%edi
  1289         adcl    $0,%edx
  1290         shll    $8,%edi
  1291         shrl    $24,%esi
  1292         addl    %edi,%edx
  1293         adcl    $0,%esi
               /*
                * Fold the excess: %esi keeps only its low two bits (bits
                * 128..129 of the value); everything above is multiplied by 5
                * (leal x+4x) and added into the low word, with the carry
                * rippling up.  %edi and %ebp are reloaded with the output and
                * addend pointers in between; those movl's leave CF alone.
                */
  1294         movl    %esi,%edi
  1295         andl    $3,%esi
  1296         shrl    $2,%edi
  1297         leal    (%edi,%edi,4),%ebp
  1298         movl    24(%esp),%edi
  1299         addl    %ebp,%eax
  1300         movl    28(%esp),%ebp
  1301         adcl    $0,%ebx
  1302         adcl    $0,%ecx
  1303         adcl    $0,%edx
  1304         adcl    $0,%esi
               /* Stash h in %xmm0..%xmm3 (movd does not affect flags) while
                * the integer registers are turned into h + 5. */
  1305         movd    %eax,%xmm0
  1306         addl    $5,%eax
  1307         movd    %ebx,%xmm1
  1308         adcl    $0,%ebx
  1309         movd    %ecx,%xmm2
  1310         adcl    $0,%ecx
  1311         movd    %edx,%xmm3
  1312         adcl    $0,%edx
  1313         adcl    $0,%esi
               /* %esi >> 2 is non-zero exactly when h + 5 reached bit 130;
                * negl turns that 0/1 into a 0 / all-ones mask. */
  1314         shrl    $2,%esi
  1315         negl    %esi
               /* Keep h + 5 only where the mask is set and park it in the
                * output buffer, reloading h from the xmm registers. */
  1316         andl    %esi,%eax
  1317         andl    %esi,%ebx
  1318         andl    %esi,%ecx
  1319         andl    %esi,%edx
  1320         movl    %eax,(%edi)
  1321         movd    %xmm0,%eax
  1322         movl    %ebx,4(%edi)
  1323         movd    %xmm1,%ebx
  1324         movl    %ecx,8(%edi)
  1325         movd    %xmm2,%ecx
  1326         movl    %edx,12(%edi)
  1327         movd    %xmm3,%edx
               /* Inverted mask keeps h otherwise; OR-ing the two halves is a
                * branch-free select between h and h + 5. */
  1328         notl    %esi
  1329         andl    %esi,%eax
  1330         andl    %esi,%ebx
  1331         orl     (%edi),%eax
  1332         andl    %esi,%ecx
  1333         orl     4(%edi),%ebx
  1334         andl    %esi,%edx
  1335         orl     8(%edi),%ecx
  1336         orl     12(%edi),%edx
               /* Add the four addend dwords with a carry chain (the stores in
                * between are plain movl and keep CF); the carry out of the top
                * word is dropped.  Write the 16-byte result. */
  1337         addl    (%ebp),%eax
  1338         adcl    4(%ebp),%ebx
  1339         movl    %eax,(%edi)
  1340         adcl    8(%ebp),%ecx
  1341         movl    %ebx,4(%edi)
  1342         adcl    12(%ebp),%edx
  1343         movl    %ecx,8(%edi)
  1344         movl    %edx,12(%edi)
  1345         popl    %edi
  1346         popl    %esi
  1347         popl    %ebx
  1348         popl    %ebp
  1349         ret
  1350 .size   _poly1305_emit_sse2,.-_poly1305_emit_sse2
  1351 .align  32
  1352 .type   _poly1305_init_avx2,@function
  1353 .align  16
       /*
        * _poly1305_init_avx2 -- internal helper that builds a table of limb
        * vectors from a 128-bit value.  It uses a register-based convention;
        * no stack arguments are read.
        *
        * In:   %edi  base pointer; the 16 bytes at 24(%edi) are the input
        *             (presumably the key part "r" stored by poly1305_init --
        *             confirm against that routine)
        *       %ebx  pointer to a constant table; the 16 bytes at 64(%ebx)
        *             are used as the per-limb AND mask (read with vmovdqa,
        *             so they must be 16-byte aligned)
        * Out:  nine 16-byte vectors at 48(%edi)..176(%edi) relative to the
        *       entry value of %edi: limbs 0..4, then 5*limb for limbs 1..4.
        *
        * %edi and %esp are returned unchanged.  %ebp is used to hold the
        * entry %esp and is NOT restored; %ecx, %edx, %xmm0-%xmm7 and the
        * flags are overwritten as well.
        *
        * Scratch frame (224 bytes, 16-byte aligned):
        *     0..64(%esp)   current limbs 0..4
        *    80..128(%esp)  5 * limbs 1..4
        *   144..208(%esp)  limbs 0..4 with the low qword copied into both
        *                   halves (addressed through %edx)
        */
  1354 _poly1305_init_avx2:
               /* Unaligned load of the input; %edi is moved up by 48 for the
                * final stores and moved back just before ret. */
  1355         vmovdqu 24(%edi),%xmm4
  1356         leal    48(%edi),%edi
  1357         movl    %esp,%ebp
  1358         subl    $224,%esp
  1359         andl    $-16,%esp
               /*
                * Split the value into five pieces taken at bit offsets 0, 26,
                * 52, 78 and 104: vpsrldq $6 drops 48 bits and the following
                * vpsrlq $4 / $30 make that 52 / 78; vpsrldq $13 drops 104.
                * Pieces 0..3 are ANDed with the mask in %xmm7 (presumably
                * 2^26-1 per qword -- confirm against the table at %ebx);
                * piece 4 is left unmasked.
                */
  1360         vmovdqa 64(%ebx),%xmm7
  1361         vpand   %xmm7,%xmm4,%xmm0
  1362         vpsrlq  $26,%xmm4,%xmm1
  1363         vpsrldq $6,%xmm4,%xmm3
  1364         vpand   %xmm7,%xmm1,%xmm1
  1365         vpsrlq  $4,%xmm3,%xmm2
  1366         vpsrlq  $30,%xmm3,%xmm3
  1367         vpand   %xmm7,%xmm2,%xmm2
  1368         vpand   %xmm7,%xmm3,%xmm3
  1369         vpsrldq $13,%xmm4,%xmm4
  1370         leal    144(%esp),%edx
               /* Two passes through the multiply below. */
  1371         movl    $2,%ecx
  1372 .L018square:
               /* Save the current limbs ... */
  1373         vmovdqa %xmm0,(%esp)
  1374         vmovdqa %xmm1,16(%esp)
  1375         vmovdqa %xmm2,32(%esp)
  1376         vmovdqa %xmm3,48(%esp)
  1377         vmovdqa %xmm4,64(%esp)
               /* ... and 5*limb (x<<2 + x) for limbs 1..4, used for the
                * product terms that wrap around past limb 4. */
  1378         vpslld  $2,%xmm1,%xmm6
  1379         vpslld  $2,%xmm2,%xmm5
  1380         vpaddd  %xmm1,%xmm6,%xmm6
  1381         vpaddd  %xmm2,%xmm5,%xmm5
  1382         vmovdqa %xmm6,80(%esp)
  1383         vmovdqa %xmm5,96(%esp)
  1384         vpslld  $2,%xmm3,%xmm6
  1385         vpslld  $2,%xmm4,%xmm5
  1386         vpaddd  %xmm3,%xmm6,%xmm6
  1387         vpaddd  %xmm4,%xmm5,%xmm5
  1388         vmovdqa %xmm6,112(%esp)
  1389         vmovdqa %xmm5,128(%esp)
               /* vpshufd $68 copies the low qword into both halves, so both
                * qword lanes get multiplied by the low-lane value.  %xmm6
                * keeps the un-duplicated limb 1 for the first cross terms. */
  1390         vpshufd $68,%xmm0,%xmm5
  1391         vmovdqa %xmm1,%xmm6
  1392         vpshufd $68,%xmm1,%xmm1
  1393         vpshufd $68,%xmm2,%xmm2
  1394         vpshufd $68,%xmm3,%xmm3
  1395         vpshufd $68,%xmm4,%xmm4
  1396         vmovdqa %xmm5,(%edx)
  1397         vmovdqa %xmm1,16(%edx)
  1398         vmovdqa %xmm2,32(%edx)
  1399         vmovdqa %xmm3,48(%edx)
  1400         vmovdqa %xmm4,64(%edx)
               /*
                * 5x5 limb product.  %xmm0..%xmm4 become the accumulators
                * d0..d4; vpmuludq multiplies the low dword of each qword lane
                * into a 64-bit product.  Below, a_i / 5a_i are the values
                * saved at (%esp) and B_j the duplicated values at (%edx).
                *
                * a0 * B4..B0 -> d4..d0
                */
  1401         vpmuludq        %xmm0,%xmm4,%xmm4
  1402         vpmuludq        %xmm0,%xmm3,%xmm3
  1403         vpmuludq        %xmm0,%xmm2,%xmm2
  1404         vpmuludq        %xmm0,%xmm1,%xmm1
  1405         vpmuludq        %xmm0,%xmm5,%xmm0
               /* a1 * B3..B0 -> d4..d1, 5a1 * B4 -> d0 */
  1406         vpmuludq        48(%edx),%xmm6,%xmm5
  1407         vpaddq  %xmm5,%xmm4,%xmm4
  1408         vpmuludq        32(%edx),%xmm6,%xmm7
  1409         vpaddq  %xmm7,%xmm3,%xmm3
  1410         vpmuludq        16(%edx),%xmm6,%xmm5
  1411         vpaddq  %xmm5,%xmm2,%xmm2
  1412         vmovdqa 80(%esp),%xmm7
  1413         vpmuludq        (%edx),%xmm6,%xmm6
  1414         vpaddq  %xmm6,%xmm1,%xmm1
  1415         vmovdqa 32(%esp),%xmm5
  1416         vpmuludq        64(%edx),%xmm7,%xmm7
  1417         vpaddq  %xmm7,%xmm0,%xmm0
               /* a2 * B2..B0 -> d4..d2, 5a2 * B4,B3 -> d1,d0 */
  1418         vpmuludq        32(%edx),%xmm5,%xmm6
  1419         vpaddq  %xmm6,%xmm4,%xmm4
  1420         vpmuludq        16(%edx),%xmm5,%xmm7
  1421         vpaddq  %xmm7,%xmm3,%xmm3
  1422         vmovdqa 96(%esp),%xmm6
  1423         vpmuludq        (%edx),%xmm5,%xmm5
  1424         vpaddq  %xmm5,%xmm2,%xmm2
  1425         vpmuludq        64(%edx),%xmm6,%xmm7
  1426         vpaddq  %xmm7,%xmm1,%xmm1
  1427         vmovdqa 48(%esp),%xmm5
  1428         vpmuludq        48(%edx),%xmm6,%xmm6
  1429         vpaddq  %xmm6,%xmm0,%xmm0
               /* a3 * B1,B0 -> d4,d3, 5a3 * B4..B2 -> d2..d0 */
  1430         vpmuludq        16(%edx),%xmm5,%xmm7
  1431         vpaddq  %xmm7,%xmm4,%xmm4
  1432         vmovdqa 112(%esp),%xmm6
  1433         vpmuludq        (%edx),%xmm5,%xmm5
  1434         vpaddq  %xmm5,%xmm3,%xmm3
  1435         vpmuludq        64(%edx),%xmm6,%xmm7
  1436         vpaddq  %xmm7,%xmm2,%xmm2
  1437         vpmuludq        48(%edx),%xmm6,%xmm5
  1438         vpaddq  %xmm5,%xmm1,%xmm1
  1439         vmovdqa 64(%esp),%xmm7
  1440         vpmuludq        32(%edx),%xmm6,%xmm6
  1441         vpaddq  %xmm6,%xmm0,%xmm0
               /* a4 * B0 -> d4, 5a4 * B4..B1 -> d3..d0; the limb mask is
                * reloaded into %xmm7 along the way. */
  1442         vmovdqa 128(%esp),%xmm5
  1443         vpmuludq        (%edx),%xmm7,%xmm7
  1444         vpaddq  %xmm7,%xmm4,%xmm4
  1445         vpmuludq        64(%edx),%xmm5,%xmm6
  1446         vpaddq  %xmm6,%xmm3,%xmm3
  1447         vpmuludq        16(%edx),%xmm5,%xmm7
  1448         vpaddq  %xmm7,%xmm0,%xmm0
  1449         vpmuludq        32(%edx),%xmm5,%xmm6
  1450         vpaddq  %xmm6,%xmm1,%xmm1
  1451         vmovdqa 64(%ebx),%xmm7
  1452         vpmuludq        48(%edx),%xmm5,%xmm5
  1453         vpaddq  %xmm5,%xmm2,%xmm2
               /*
                * Carry propagation in two interleaved chains (d3->d4->d0->d1
                * and d0->d1->d2->d3->d4): each step moves the bits above 26
                * (vpsrlq $26) to the next limb and masks the source.  The
                * carry out of d4 is added to d0 once directly and once
                * shifted left by 2, i.e. multiplied by 5.
                */
  1454         vpsrlq  $26,%xmm3,%xmm5
  1455         vpand   %xmm7,%xmm3,%xmm3
  1456         vpsrlq  $26,%xmm0,%xmm6
  1457         vpand   %xmm7,%xmm0,%xmm0
  1458         vpaddq  %xmm5,%xmm4,%xmm4
  1459         vpaddq  %xmm6,%xmm1,%xmm1
  1460         vpsrlq  $26,%xmm4,%xmm5
  1461         vpand   %xmm7,%xmm4,%xmm4
  1462         vpsrlq  $26,%xmm1,%xmm6
  1463         vpand   %xmm7,%xmm1,%xmm1
  1464         vpaddq  %xmm6,%xmm2,%xmm2
  1465         vpaddd  %xmm5,%xmm0,%xmm0
  1466         vpsllq  $2,%xmm5,%xmm5
  1467         vpsrlq  $26,%xmm2,%xmm6
  1468         vpand   %xmm7,%xmm2,%xmm2
  1469         vpaddd  %xmm5,%xmm0,%xmm0
  1470         vpaddd  %xmm6,%xmm3,%xmm3
  1471         vpsrlq  $26,%xmm3,%xmm6
  1472         vpsrlq  $26,%xmm0,%xmm5
  1473         vpand   %xmm7,%xmm0,%xmm0
  1474         vpand   %xmm7,%xmm3,%xmm3
  1475         vpaddd  %xmm5,%xmm1,%xmm1
  1476         vpaddd  %xmm6,%xmm4,%xmm4
  1477         decl    %ecx
  1478         jz      .L019square_break
               /* After the first pass: keep only the low qword of the product
                * and put the low qword of the saved input limbs above it,
                * then go round once more. */
  1479         vpunpcklqdq     (%esp),%xmm0,%xmm0
  1480         vpunpcklqdq     16(%esp),%xmm1,%xmm1
  1481         vpunpcklqdq     32(%esp),%xmm2,%xmm2
  1482         vpunpcklqdq     48(%esp),%xmm3,%xmm3
  1483         vpunpcklqdq     64(%esp),%xmm4,%xmm4
  1484         jmp     .L018square
  1485 .L019square_break:
               /*
                * Interleave the second-pass products (moved to the odd
                * dwords) with the vectors saved at the start of that pass
                * (even dwords), then reorder with vpshufd $141, which picks
                * source dwords 1,3,0,2.  Tracing the lanes, dwords 0..3 of
                * each limb vector end up holding the 4th, 3rd, 2nd and 1st
                * power of the input, respectively.
                */
  1486         vpsllq  $32,%xmm0,%xmm0
  1487         vpsllq  $32,%xmm1,%xmm1
  1488         vpsllq  $32,%xmm2,%xmm2
  1489         vpsllq  $32,%xmm3,%xmm3
  1490         vpsllq  $32,%xmm4,%xmm4
  1491         vpor    (%esp),%xmm0,%xmm0
  1492         vpor    16(%esp),%xmm1,%xmm1
  1493         vpor    32(%esp),%xmm2,%xmm2
  1494         vpor    48(%esp),%xmm3,%xmm3
  1495         vpor    64(%esp),%xmm4,%xmm4
  1496         vpshufd $141,%xmm0,%xmm0
  1497         vpshufd $141,%xmm1,%xmm1
  1498         vpshufd $141,%xmm2,%xmm2
  1499         vpshufd $141,%xmm3,%xmm3
  1500         vpshufd $141,%xmm4,%xmm4
               /* Store limbs 0..4 (unaligned stores; %edi was advanced by 48
                * on entry). */
  1501         vmovdqu %xmm0,(%edi)
  1502         vmovdqu %xmm1,16(%edi)
  1503         vmovdqu %xmm2,32(%edi)
  1504         vmovdqu %xmm3,48(%edi)
  1505         vmovdqu %xmm4,64(%edi)
               /* Store 5*limb for limbs 1..4 right after them. */
  1506         vpslld  $2,%xmm1,%xmm6
  1507         vpslld  $2,%xmm2,%xmm5
  1508         vpaddd  %xmm1,%xmm6,%xmm6
  1509         vpaddd  %xmm2,%xmm5,%xmm5
  1510         vmovdqu %xmm6,80(%edi)
  1511         vmovdqu %xmm5,96(%edi)
  1512         vpslld  $2,%xmm3,%xmm6
  1513         vpslld  $2,%xmm4,%xmm5
  1514         vpaddd  %xmm3,%xmm6,%xmm6
  1515         vpaddd  %xmm4,%xmm5,%xmm5
  1516         vmovdqu %xmm6,112(%edi)
  1517         vmovdqu %xmm5,128(%edi)
               /* Drop the scratch frame and undo the +48 on %edi. */
  1518         movl    %ebp,%esp
  1519         leal    -48(%edi),%edi
  1520         ret
  1521 .size   _poly1305_init_avx2,.-_poly1305_init_avx2
 1522 .align  32
 1523 .type   _poly1305_blocks_avx2,@function
 1524 .align  16
 1525 _poly1305_blocks_avx2:
 1526         pushl   %ebp
 1527         pushl   %ebx
 1528         pushl   %esi
 1529         pushl   %edi
 1530         movl    20(%esp),%edi
 1531         movl    24(%esp),%esi
 1532         movl    28(%esp),%ecx
 1533         movl    20(%edi),%eax
 1534         andl    $-16,%ecx
 1535         jz      .L020nodata
 1536         cmpl    $64,%ecx
 1537         jae     .L021enter_avx2
 1538         testl   %eax,%eax
 1539         jz      .Lenter_blocks
 1540 .L021enter_avx2:
 1541         vzeroupper
 1542         call    .L022pic_point
 1543 .L022pic_point:
 1544         popl    %ebx
 1545         leal    .Lconst_sse2-.L022pic_point(%ebx),%ebx
 1546         testl   %eax,%eax
 1547         jnz     .L023base2_26
 1548         call    _poly1305_init_avx2
 1549         movl    (%edi),%eax
 1550         movl    3(%edi),%ecx
 1551         movl    6(%edi),%edx
 1552         movl    9(%edi),%esi
 1553         movl    13(%edi),%ebp
 1554         shrl    $2,%ecx
 1555         andl    $67108863,%eax
 1556         shrl    $4,%edx
 1557         andl    $67108863,%ecx
 1558         shrl    $6,%esi
 1559         andl    $67108863,%edx
 1560         movl    %eax,(%edi)
 1561         movl    %ecx,4(%edi)
 1562         movl    %edx,8(%edi)
 1563         movl    %esi,12(%edi)
 1564         movl    %ebp,16(%edi)
 1565         movl    $1,20(%edi)
 1566         movl    24(%esp),%esi
 1567         movl    28(%esp),%ecx
 1568 .L023base2_26:
 1569         movl    32(%esp),%eax
 1570         movl    %esp,%ebp
 1571         subl    $448,%esp
 1572         andl    $-512,%esp
 1573         vmovdqu 48(%edi),%xmm0
 1574         leal    288(%esp),%edx
 1575         vmovdqu 64(%edi),%xmm1
 1576         vmovdqu 80(%edi),%xmm2
 1577         vmovdqu 96(%edi),%xmm3
 1578         vmovdqu 112(%edi),%xmm4
 1579         leal    48(%edi),%edi
 1580         vpermq  $64,%ymm0,%ymm0
 1581         vpermq  $64,%ymm1,%ymm1
 1582         vpermq  $64,%ymm2,%ymm2
 1583         vpermq  $64,%ymm3,%ymm3
 1584         vpermq  $64,%ymm4,%ymm4
 1585         vpshufd $200,%ymm0,%ymm0
 1586         vpshufd $200,%ymm1,%ymm1
 1587         vpshufd $200,%ymm2,%ymm2
 1588         vpshufd $200,%ymm3,%ymm3
 1589         vpshufd $200,%ymm4,%ymm4
 1590         vmovdqa %ymm0,-128(%edx)
 1591         vmovdqu 80(%edi),%xmm0
 1592         vmovdqa %ymm1,-96(%edx)
 1593         vmovdqu 96(%edi),%xmm1
 1594         vmovdqa %ymm2,-64(%edx)
 1595         vmovdqu 112(%edi),%xmm2
 1596         vmovdqa %ymm3,-32(%edx)
 1597         vmovdqu 128(%edi),%xmm3
 1598         vmovdqa %ymm4,(%edx)
 1599         vpermq  $64,%ymm0,%ymm0
 1600         vpermq  $64,%ymm1,%ymm1
 1601         vpermq  $64,%ymm2,%ymm2
 1602         vpermq  $64,%ymm3,%ymm3
 1603         vpshufd $200,%ymm0,%ymm0
 1604         vpshufd $200,%ymm1,%ymm1
 1605         vpshufd $200,%ymm2,%ymm2
 1606         vpshufd $200,%ymm3,%ymm3
 1607         vmovdqa %ymm0,32(%edx)
 1608         vmovd   -48(%edi),%xmm0
 1609         vmovdqa %ymm1,64(%edx)
 1610         vmovd   -44(%edi),%xmm1
 1611         vmovdqa %ymm2,96(%edx)
 1612         vmovd   -40(%edi),%xmm2
 1613         vmovdqa %ymm3,128(%edx)
 1614         vmovd   -36(%edi),%xmm3
 1615         vmovd   -32(%edi),%xmm4
 1616         vmovdqa 64(%ebx),%ymm7
 1617         negl    %eax
 1618         testl   $63,%ecx
 1619         jz      .L024even
 1620         movl    %ecx,%edx
 1621         andl    $-64,%ecx
 1622         andl    $63,%edx
 1623         vmovdqu (%esi),%xmm5
 1624         cmpl    $32,%edx
 1625         jb      .L025one
 1626         vmovdqu 16(%esi),%xmm6
 1627         je      .L026two
 1628         vinserti128     $1,32(%esi),%ymm5,%ymm5
 1629         leal    48(%esi),%esi
 1630         leal    8(%ebx),%ebx
 1631         leal    296(%esp),%edx
 1632         jmp     .L027tail
 1633 .L026two:
 1634         leal    32(%esi),%esi
 1635         leal    16(%ebx),%ebx
 1636         leal    304(%esp),%edx
 1637         jmp     .L027tail
 1638 .L025one:
 1639         leal    16(%esi),%esi
 1640         vpxor   %ymm6,%ymm6,%ymm6
 1641         leal    32(%ebx,%eax,8),%ebx
 1642         leal    312(%esp),%edx
 1643         jmp     .L027tail
 1644 .align  32
 1645 .L024even:
 1646         vmovdqu (%esi),%xmm5
 1647         vmovdqu 16(%esi),%xmm6
 1648         vinserti128     $1,32(%esi),%ymm5,%ymm5
 1649         vinserti128     $1,48(%esi),%ymm6,%ymm6
 1650         leal    64(%esi),%esi
 1651         subl    $64,%ecx
 1652         jz      .L027tail
 1653 .L028loop:
 1654         vmovdqa %ymm2,64(%esp)
 1655         vpsrldq $6,%ymm5,%ymm2
 1656         vmovdqa %ymm0,(%esp)
 1657         vpsrldq $6,%ymm6,%ymm0
 1658         vmovdqa %ymm1,32(%esp)
 1659         vpunpckhqdq     %ymm6,%ymm5,%ymm1
 1660         vpunpcklqdq     %ymm6,%ymm5,%ymm5
 1661         vpunpcklqdq     %ymm0,%ymm2,%ymm2
 1662         vpsrlq  $30,%ymm2,%ymm0
 1663         vpsrlq  $4,%ymm2,%ymm2
 1664         vpsrlq  $26,%ymm5,%ymm6
 1665         vpsrlq  $40,%ymm1,%ymm1
 1666         vpand   %ymm7,%ymm2,%ymm2
 1667         vpand   %ymm7,%ymm5,%ymm5
 1668         vpand   %ymm7,%ymm6,%ymm6
 1669         vpand   %ymm7,%ymm0,%ymm0
 1670         vpor    (%ebx),%ymm1,%ymm1
 1671         vpaddq  64(%esp),%ymm2,%ymm2
 1672         vpaddq  (%esp),%ymm5,%ymm5
 1673         vpaddq  32(%esp),%ymm6,%ymm6
 1674         vpaddq  %ymm3,%ymm0,%ymm0
 1675         vpaddq  %ymm4,%ymm1,%ymm1
 1676         vpmuludq        -96(%edx),%ymm2,%ymm3
 1677         vmovdqa %ymm6,32(%esp)
 1678         vpmuludq        -64(%edx),%ymm2,%ymm4
 1679         vmovdqa %ymm0,96(%esp)
 1680         vpmuludq        96(%edx),%ymm2,%ymm0
 1681         vmovdqa %ymm1,128(%esp)
 1682         vpmuludq        128(%edx),%ymm2,%ymm1
 1683         vpmuludq        -128(%edx),%ymm2,%ymm2
 1684         vpmuludq        -32(%edx),%ymm5,%ymm7
 1685         vpaddq  %ymm7,%ymm3,%ymm3
 1686         vpmuludq        (%edx),%ymm5,%ymm6
 1687         vpaddq  %ymm6,%ymm4,%ymm4
 1688         vpmuludq        -128(%edx),%ymm5,%ymm7
 1689         vpaddq  %ymm7,%ymm0,%ymm0
 1690         vmovdqa 32(%esp),%ymm7
 1691         vpmuludq        -96(%edx),%ymm5,%ymm6
 1692         vpaddq  %ymm6,%ymm1,%ymm1
 1693         vpmuludq        -64(%edx),%ymm5,%ymm5
 1694         vpaddq  %ymm5,%ymm2,%ymm2
 1695         vpmuludq        -64(%edx),%ymm7,%ymm6
 1696         vpaddq  %ymm6,%ymm3,%ymm3
 1697         vpmuludq        -32(%edx),%ymm7,%ymm5
 1698         vpaddq  %ymm5,%ymm4,%ymm4
 1699         vpmuludq        128(%edx),%ymm7,%ymm6
 1700         vpaddq  %ymm6,%ymm0,%ymm0
 1701         vmovdqa 96(%esp),%ymm6
 1702         vpmuludq        -128(%edx),%ymm7,%ymm5
 1703         vpaddq  %ymm5,%ymm1,%ymm1
 1704         vpmuludq        -96(%edx),%ymm7,%ymm7
 1705         vpaddq  %ymm7,%ymm2,%ymm2
 1706         vpmuludq        -128(%edx),%ymm6,%ymm5
 1707         vpaddq  %ymm5,%ymm3,%ymm3
 1708         vpmuludq        -96(%edx),%ymm6,%ymm7
 1709         vpaddq  %ymm7,%ymm4,%ymm4
 1710         vpmuludq        64(%edx),%ymm6,%ymm5
 1711         vpaddq  %ymm5,%ymm0,%ymm0
 1712         vmovdqa 128(%esp),%ymm5
 1713         vpmuludq        96(%edx),%ymm6,%ymm7
 1714         vpaddq  %ymm7,%ymm1,%ymm1
 1715         vpmuludq        128(%edx),%ymm6,%ymm6
 1716         vpaddq  %ymm6,%ymm2,%ymm2
 1717         vpmuludq        128(%edx),%ymm5,%ymm7
 1718         vpaddq  %ymm7,%ymm3,%ymm3
 1719         vpmuludq        32(%edx),%ymm5,%ymm6
 1720         vpaddq  %ymm6,%ymm0,%ymm0
 1721         vpmuludq        -128(%edx),%ymm5,%ymm7
 1722         vpaddq  %ymm7,%ymm4,%ymm4
 1723         vmovdqa 64(%ebx),%ymm7
 1724         vpmuludq        64(%edx),%ymm5,%ymm6
 1725         vpaddq  %ymm6,%ymm1,%ymm1
 1726         vpmuludq        96(%edx),%ymm5,%ymm5
 1727         vpaddq  %ymm5,%ymm2,%ymm2
 1728         vpsrlq  $26,%ymm3,%ymm5
 1729         vpand   %ymm7,%ymm3,%ymm3
 1730         vpsrlq  $26,%ymm0,%ymm6
 1731         vpand   %ymm7,%ymm0,%ymm0
 1732         vpaddq  %ymm5,%ymm4,%ymm4
 1733         vpaddq  %ymm6,%ymm1,%ymm1
 1734         vpsrlq  $26,%ymm4,%ymm5
 1735         vpand   %ymm7,%ymm4,%ymm4
 1736         vpsrlq  $26,%ymm1,%ymm6
 1737         vpand   %ymm7,%ymm1,%ymm1
 1738         vpaddq  %ymm6,%ymm2,%ymm2
 1739         vpaddq  %ymm5,%ymm0,%ymm0
 1740         vpsllq  $2,%ymm5,%ymm5
 1741         vpsrlq  $26,%ymm2,%ymm6
 1742         vpand   %ymm7,%ymm2,%ymm2
 1743         vpaddq  %ymm5,%ymm0,%ymm0
 1744         vpaddq  %ymm6,%ymm3,%ymm3
 1745         vpsrlq  $26,%ymm3,%ymm6
 1746         vpsrlq  $26,%ymm0,%ymm5
 1747         vpand   %ymm7,%ymm0,%ymm0
 1748         vpand   %ymm7,%ymm3,%ymm3
 1749         vpaddq  %ymm5,%ymm1,%ymm1
 1750         vpaddq  %ymm6,%ymm4,%ymm4
 1751         vmovdqu (%esi),%xmm5
 1752         vmovdqu 16(%esi),%xmm6
 1753         vinserti128     $1,32(%esi),%ymm5,%ymm5
 1754         vinserti128     $1,48(%esi),%ymm6,%ymm6
 1755         leal    64(%esi),%esi
 1756         subl    $64,%ecx
 1757         jnz     .L028loop
 1758 .L027tail:
 1759         vmovdqa %ymm2,64(%esp)
 1760         vpsrldq $6,%ymm5,%ymm2
 1761         vmovdqa %ymm0,(%esp)
 1762         vpsrldq $6,%ymm6,%ymm0
 1763         vmovdqa %ymm1,32(%esp)
 1764         vpunpckhqdq     %ymm6,%ymm5,%ymm1
 1765         vpunpcklqdq     %ymm6,%ymm5,%ymm5
 1766         vpunpcklqdq     %ymm0,%ymm2,%ymm2
 1767         vpsrlq  $30,%ymm2,%ymm0
 1768         vpsrlq  $4,%ymm2,%ymm2
 1769         vpsrlq  $26,%ymm5,%ymm6
 1770         vpsrlq  $40,%ymm1,%ymm1
 1771         vpand   %ymm7,%ymm2,%ymm2
 1772         vpand   %ymm7,%ymm5,%ymm5
 1773         vpand   %ymm7,%ymm6,%ymm6
 1774         vpand   %ymm7,%ymm0,%ymm0
 1775         vpor    (%ebx),%ymm1,%ymm1
 1776         andl    $-64,%ebx
 1777         vpaddq  64(%esp),%ymm2,%ymm2
 1778         vpaddq  (%esp),%ymm5,%ymm5
 1779         vpaddq  32(%esp),%ymm6,%ymm6
 1780         vpaddq  %ymm3,%ymm0,%ymm0
 1781         vpaddq  %ymm4,%ymm1,%ymm1
 1782         vpmuludq        -92(%edx),%ymm2,%ymm3
 1783         vmovdqa %ymm6,32(%esp)
 1784         vpmuludq        -60(%edx),%ymm2,%ymm4
 1785         vmovdqa %ymm0,96(%esp)
 1786         vpmuludq        100(%edx),%ymm2,%ymm0
 1787         vmovdqa %ymm1,128(%esp)
 1788         vpmuludq        132(%edx),%ymm2,%ymm1
 1789         vpmuludq        -124(%edx),%ymm2,%ymm2
 1790         vpmuludq        -28(%edx),%ymm5,%ymm7
 1791         vpaddq  %ymm7,%ymm3,%ymm3
 1792         vpmuludq        4(%edx),%ymm5,%ymm6
 1793         vpaddq  %ymm6,%ymm4,%ymm4
 1794         vpmuludq        -124(%edx),%ymm5,%ymm7
 1795         vpaddq  %ymm7,%ymm0,%ymm0
 1796         vmovdqa 32(%esp),%ymm7
 1797         vpmuludq        -92(%edx),%ymm5,%ymm6
 1798         vpaddq  %ymm6,%ymm1,%ymm1
 1799         vpmuludq        -60(%edx),%ymm5,%ymm5
 1800         vpaddq  %ymm5,%ymm2,%ymm2
 1801         vpmuludq        -60(%edx),%ymm7,%ymm6
 1802         vpaddq  %ymm6,%ymm3,%ymm3
 1803         vpmuludq        -28(%edx),%ymm7,%ymm5
 1804         vpaddq  %ymm5,%ymm4,%ymm4
 1805         vpmuludq        132(%edx),%ymm7,%ymm6
 1806         vpaddq  %ymm6,%ymm0,%ymm0
 1807         vmovdqa 96(%esp),%ymm6
 1808         vpmuludq        -124(%edx),%ymm7,%ymm5
 1809         vpaddq  %ymm5,%ymm1,%ymm1
 1810         vpmuludq        -92(%edx),%ymm7,%ymm7
 1811         vpaddq  %ymm7,%ymm2,%ymm2
 1812         vpmuludq        -124(%edx),%ymm6,%ymm5
 1813         vpaddq  %ymm5,%ymm3,%ymm3
 1814         vpmuludq        -92(%edx),%ymm6,%ymm7
 1815         vpaddq  %ymm7,%ymm4,%ymm4
 1816         vpmuludq        68(%edx),%ymm6,%ymm5
 1817         vpaddq  %ymm5,%ymm0,%ymm0
 1818         vmovdqa 128(%esp),%ymm5
 1819         vpmuludq        100(%edx),%ymm6,%ymm7
 1820         vpaddq  %ymm7,%ymm1,%ymm1
 1821         vpmuludq        132(%edx),%ymm6,%ymm6
 1822         vpaddq  %ymm6,%ymm2,%ymm2
 1823         vpmuludq        132(%edx),%ymm5,%ymm7
 1824         vpaddq  %ymm7,%ymm3,%ymm3
 1825         vpmuludq        36(%edx),%ymm5,%ymm6
 1826         vpaddq  %ymm6,%ymm0,%ymm0
 1827         vpmuludq        -124(%edx),%ymm5,%ymm7
 1828         vpaddq  %ymm7,%ymm4,%ymm4
 1829         vmovdqa 64(%ebx),%ymm7
 1830         vpmuludq        68(%edx),%ymm5,%ymm6
 1831         vpaddq  %ymm6,%ymm1,%ymm1
 1832         vpmuludq        100(%edx),%ymm5,%ymm5
 1833         vpaddq  %ymm5,%ymm2,%ymm2
 1834         vpsrldq $8,%ymm4,%ymm5
 1835         vpsrldq $8,%ymm3,%ymm6
 1836         vpaddq  %ymm5,%ymm4,%ymm4
 1837         vpsrldq $8,%ymm0,%ymm5
 1838         vpaddq  %ymm6,%ymm3,%ymm3
 1839         vpsrldq $8,%ymm1,%ymm6
 1840         vpaddq  %ymm5,%ymm0,%ymm0
 1841         vpsrldq $8,%ymm2,%ymm5
 1842         vpaddq  %ymm6,%ymm1,%ymm1
 1843         vpermq  $2,%ymm4,%ymm6
 1844         vpaddq  %ymm5,%ymm2,%ymm2
 1845         vpermq  $2,%ymm3,%ymm5
 1846         vpaddq  %ymm6,%ymm4,%ymm4
 1847         vpermq  $2,%ymm0,%ymm6
 1848         vpaddq  %ymm5,%ymm3,%ymm3
 1849         vpermq  $2,%ymm1,%ymm5
 1850         vpaddq  %ymm6,%ymm0,%ymm0
 1851         vpermq  $2,%ymm2,%ymm6
 1852         vpaddq  %ymm5,%ymm1,%ymm1
 1853         vpaddq  %ymm6,%ymm2,%ymm2
 1854         vpsrlq  $26,%ymm3,%ymm5
 1855         vpand   %ymm7,%ymm3,%ymm3
 1856         vpsrlq  $26,%ymm0,%ymm6
 1857         vpand   %ymm7,%ymm0,%ymm0
 1858         vpaddq  %ymm5,%ymm4,%ymm4
 1859         vpaddq  %ymm6,%ymm1,%ymm1
 1860         vpsrlq  $26,%ymm4,%ymm5
 1861         vpand   %ymm7,%ymm4,%ymm4
 1862         vpsrlq  $26,%ymm1,%ymm6
 1863         vpand   %ymm7,%ymm1,%ymm1
 1864         vpaddq  %ymm6,%ymm2,%ymm2
 1865         vpaddq  %ymm5,%ymm0,%ymm0
 1866         vpsllq  $2,%ymm5,%ymm5
 1867         vpsrlq  $26,%ymm2,%ymm6
 1868         vpand   %ymm7,%ymm2,%ymm2
 1869         vpaddq  %ymm5,%ymm0,%ymm0
 1870         vpaddq  %ymm6,%ymm3,%ymm3
 1871         vpsrlq  $26,%ymm3,%ymm6
 1872         vpsrlq  $26,%ymm0,%ymm5
 1873         vpand   %ymm7,%ymm0,%ymm0
 1874         vpand   %ymm7,%ymm3,%ymm3
 1875         vpaddq  %ymm5,%ymm1,%ymm1
 1876         vpaddq  %ymm6,%ymm4,%ymm4
 1877         cmpl    $0,%ecx
 1878         je      .L029done
 1879         vpshufd $252,%xmm0,%xmm0
 1880         leal    288(%esp),%edx
 1881         vpshufd $252,%xmm1,%xmm1
 1882         vpshufd $252,%xmm2,%xmm2
 1883         vpshufd $252,%xmm3,%xmm3
 1884         vpshufd $252,%xmm4,%xmm4
 1885         jmp     .L024even
 1886 .align  16
 1887 .L029done:
 1888         vmovd   %xmm0,-48(%edi)
 1889         vmovd   %xmm1,-44(%edi)
 1890         vmovd   %xmm2,-40(%edi)
 1891         vmovd   %xmm3,-36(%edi)
 1892         vmovd   %xmm4,-32(%edi)
 1893         vzeroupper
 1894         movl    %ebp,%esp
 1895 .L020nodata:
 1896         popl    %edi
 1897         popl    %esi
 1898         popl    %ebx
 1899         popl    %ebp
 1900         ret
 1901 .size   _poly1305_blocks_avx2,.-_poly1305_blocks_avx2
 1902 .align  64
 1903 .Lconst_sse2:
      /*
       * Constant table for the vector code.  The vector routines visible in
       * this file address it through %ebx, reading (%ebx) as an OR operand
       * and 64(%ebx) as an AND mask.
       */
      /* +0:  16777216 = 1<<24 in the low dword of every 64-bit lane. */
 1904 .long   16777216,0,16777216,0,16777216,0,16777216,0
      /* +32: all zero. */
 1905 .long   0,0,0,0,0,0,0,0
      /* +64: 67108863 = 2^26-1 in the low dword of every 64-bit lane. */
 1906 .long   67108863,0,67108863,0,67108863,0,67108863,0
      /*
       * +96: 0x0fffffff, 0x0ffffffc, 0x0ffffffc, 0x0ffffffc -- the same values
       * poly1305_init applies as immediates to the four key words.  No read of
       * this row is visible in the surrounding code -- TODO confirm its user.
       */
 1907 .long   268435455,268435452,268435452,268435452
      /* ASCII "Poly1305 for x86, CRYPTOGAMS by <appro@openssl.org>", NUL-terminated. */
 1908 .byte   80,111,108,121,49,51,48,53,32,102,111,114,32,120,56,54
 1909 .byte   44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32
 1910 .byte   60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111
 1911 .byte   114,103,62,0
 1912 .align  4
      /* 16-byte, 4-aligned common symbol; poly1305_init reads its words at +0 and +8. */
 1913 .comm   OPENSSL_ia32cap_P,16,4
 1914 #else
 1915 .text
 1916 .align  64
 1917 .globl  poly1305_init
 1918 .type   poly1305_init,@function
 1919 .align  16
      /*
       * poly1305_init (non-PIC build).  Three stack arguments, read after the
       * four register pushes as:
       *   20(%esp) -> %edi  state block; its first 24 bytes are zeroed
       *   24(%esp) -> %esi  pointer to 16 bytes of key material, may be NULL
       *   28(%esp) -> %ebp  two-word table that receives the addresses of the
       *                     selected block routine and emit routine
       * Returns %eax = 0 when the key pointer is NULL (table untouched),
       * %eax = 1 otherwise.  %ebp, %ebx, %esi and %edi are saved and restored.
       */
 1920 poly1305_init:
 1921 .L_poly1305_init_begin:
 1922         pushl   %ebp
 1923         pushl   %ebx
 1924         pushl   %esi
 1925         pushl   %edi
 1926         movl    20(%esp),%edi
 1927         movl    24(%esp),%esi
 1928         movl    28(%esp),%ebp
 1929         xorl    %eax,%eax       /* zero source; also the return value on the NULL-key path */
 1930         movl    %eax,(%edi)
 1931         movl    %eax,4(%edi)
 1932         movl    %eax,8(%edi)
 1933         movl    %eax,12(%edi)
 1934         movl    %eax,16(%edi)
 1935         movl    %eax,20(%edi)
 1936         cmpl    $0,%esi
 1937         je      .L000nokey      /* NULL key: only the zeroing above is performed */
 1938         call    .L001pic_point  /* call/pop pair leaves the address of .L001pic_point in %ebx */
 1939 .L001pic_point:
 1940         popl    %ebx
 1941         leal    poly1305_blocks-.L001pic_point(%ebx),%eax       /* default block routine */
 1942         leal    poly1305_emit-.L001pic_point(%ebx),%edx /* default emit routine */
 1943         leal    OPENSSL_ia32cap_P,%edi  /* absolute address; %edi is reloaded with the state pointer below */
 1944         movl    (%edi),%ecx
 1945         andl    $83886080,%ecx  /* 0x05000000: isolate bits 24 and 26 of capability word 0 */
 1946         cmpl    $83886080,%ecx
 1947         jne     .L002no_sse2    /* both bits must be set; presumably FXSR and SSE2 -- confirm against the ia32cap layout */
 1948         leal    _poly1305_blocks_sse2-.L001pic_point(%ebx),%eax
 1949         leal    _poly1305_emit_sse2-.L001pic_point(%ebx),%edx
 1950         movl    8(%edi),%ecx
 1951         testl   $32,%ecx        /* bit 5 of the capability word at +8; presumably AVX2 -- confirm */
 1952         jz      .L002no_sse2
 1953         leal    _poly1305_blocks_avx2-.L001pic_point(%ebx),%eax /* only the block routine changes; %edx keeps _poly1305_emit_sse2 */
 1954 .L002no_sse2:
 1955         movl    20(%esp),%edi   /* state pointer again */
 1956         movl    %eax,(%ebp)     /* table[0] = block routine */
 1957         movl    %edx,4(%ebp)    /* table[1] = emit routine */
 1958         movl    (%esi),%eax
 1959         movl    4(%esi),%ebx
 1960         movl    8(%esi),%ecx
 1961         movl    12(%esi),%edx
 1962         andl    $268435455,%eax /* 0x0fffffff */
 1963         andl    $268435452,%ebx /* 0x0ffffffc */
 1964         andl    $268435452,%ecx /* 0x0ffffffc */
 1965         andl    $268435452,%edx /* 0x0ffffffc */
 1966         movl    %eax,24(%edi)   /* masked key words are kept at state+24..+39 */
 1967         movl    %ebx,28(%edi)
 1968         movl    %ecx,32(%edi)
 1969         movl    %edx,36(%edi)
 1970         movl    $1,%eax
 1971 .L000nokey:
 1972         popl    %edi
 1973         popl    %esi
 1974         popl    %ebx
 1975         popl    %ebp
 1976         ret
 1977 .size   poly1305_init,.-.L_poly1305_init_begin
 1978 .globl  poly1305_blocks
 1979 .type   poly1305_blocks,@function
 1980 .align  16
      /*
       * poly1305_blocks (scalar path).  Four stack arguments, read after the
       * four register pushes as:
       *   20(%esp) -> %edi  state block: accumulator words h0..h4 at +0..+16,
       *                     multiplier words r0..r3 at +24..+36
       *   24(%esp) -> %esi  input pointer
       *   28(%esp) -> %ecx  input length in bytes
       *   32(%esp)          word added, with carry, into h4 for every block
       * Consumes floor(len/16) blocks of 16 bytes.  Each iteration adds one
       * block to h, multiplies by r using s1..s3 = r + (r >> 2) for the
       * wrapped terms, and folds the bits above bit 1 of the top word back
       * into h0 multiplied by 5.  The five accumulator words are written
       * back to the state block when the input is exhausted.
       *
       * .Lenter_blocks is a secondary entry: _poly1305_blocks_sse2 jumps here
       * after the same four pushes with %edi/%esi/%ecx already loaded.
       *
       * The length is masked with -16 so that only whole blocks are counted,
       * matching _poly1305_blocks_sse2.  A mask of -15 would keep bit 0, and
       * an odd length would then yield an end pointer that the 16-byte
       * stride can never equal, so the loop would not terminate.  The same
       * change belongs in the generator script for this file.
       */
 1981 poly1305_blocks:
 1982 .L_poly1305_blocks_begin:
 1983         pushl   %ebp
 1984         pushl   %ebx
 1985         pushl   %esi
 1986         pushl   %edi
 1987         movl    20(%esp),%edi
 1988         movl    24(%esp),%esi
 1989         movl    28(%esp),%ecx
 1990 .Lenter_blocks:
 1991         andl    $-16,%ecx       /* whole 16-byte blocks only */
 1992         jz      .L003nodata
 1993         subl    $64,%esp
      /*
       * Frame after the 64-byte allocation:
       *    0(%esp)           h0 + block word 0
       *   12(%esp), 16(%esp) h3, h4 after the block was added
       *   20..28(%esp)       product words d0..d2
       *   36..48(%esp)       r0..r3
       *   52..60(%esp)       s1..s3
       *   84(%esp)           state pointer argument
       *   92(%esp)           length argument slot, reused for the end pointer
       *   96(%esp)           fourth argument
       */
 1994         movl    24(%edi),%eax
 1995         movl    28(%edi),%ebx
 1996         leal    (%esi,%ecx,1),%ebp      /* end pointer = input + masked length */
 1997         movl    32(%edi),%ecx
 1998         movl    36(%edi),%edx
 1999         movl    %ebp,92(%esp)
 2000         movl    %esi,%ebp       /* %ebp walks the input from here on */
 2001         movl    %eax,36(%esp)
 2002         movl    %ebx,%eax
 2003         shrl    $2,%eax
 2004         movl    %ebx,40(%esp)
 2005         addl    %ebx,%eax       /* s1 = r1 + (r1 >> 2) */
 2006         movl    %ecx,%ebx
 2007         shrl    $2,%ebx
 2008         movl    %ecx,44(%esp)
 2009         addl    %ecx,%ebx       /* s2 = r2 + (r2 >> 2) */
 2010         movl    %edx,%ecx
 2011         shrl    $2,%ecx
 2012         movl    %edx,48(%esp)
 2013         addl    %edx,%ecx       /* s3 = r3 + (r3 >> 2) */
 2014         movl    %eax,52(%esp)
 2015         movl    %ebx,56(%esp)
 2016         movl    %ecx,60(%esp)
 2017         movl    (%edi),%eax
 2018         movl    4(%edi),%ebx
 2019         movl    8(%edi),%ecx
 2020         movl    12(%edi),%esi
 2021         movl    16(%edi),%edi
 2022         jmp     .L004loop
 2023 .align  32
 2024 .L004loop:
      /* h += block; h0..h4 live in %eax,%ebx,%ecx,%esi,%edi */
 2025         addl    (%ebp),%eax
 2026         adcl    4(%ebp),%ebx
 2027         adcl    8(%ebp),%ecx
 2028         adcl    12(%ebp),%esi
 2029         leal    16(%ebp),%ebp   /* advance input; leal leaves the carry flag intact */
 2030         adcl    96(%esp),%edi
 2031         movl    %eax,(%esp)
 2032         movl    %esi,12(%esp)
      /* d0 = h0*r0 + h1*s3 + h2*s2 + h3*s1, accumulated in %esi:%edi */
 2033         mull    36(%esp)
 2034         movl    %edi,16(%esp)
 2035         movl    %eax,%edi
 2036         movl    %ebx,%eax
 2037         movl    %edx,%esi
 2038         mull    60(%esp)
 2039         addl    %eax,%edi
 2040         movl    %ecx,%eax
 2041         adcl    %edx,%esi
 2042         mull    56(%esp)
 2043         addl    %eax,%edi
 2044         movl    12(%esp),%eax
 2045         adcl    %edx,%esi
 2046         mull    52(%esp)
 2047         addl    %eax,%edi
 2048         movl    (%esp),%eax
 2049         adcl    %edx,%esi
      /* d1 = carry + h0*r1 + h1*r0 + h2*s3 + h3*s2 + h4*s1, in %edi:%esi */
 2050         mull    40(%esp)
 2051         movl    %edi,20(%esp)
 2052         xorl    %edi,%edi
 2053         addl    %eax,%esi
 2054         movl    %ebx,%eax
 2055         adcl    %edx,%edi
 2056         mull    36(%esp)
 2057         addl    %eax,%esi
 2058         movl    %ecx,%eax
 2059         adcl    %edx,%edi
 2060         mull    60(%esp)
 2061         addl    %eax,%esi
 2062         movl    12(%esp),%eax
 2063         adcl    %edx,%edi
 2064         mull    56(%esp)
 2065         addl    %eax,%esi
 2066         movl    16(%esp),%eax
 2067         adcl    %edx,%edi
 2068         imull   52(%esp),%eax   /* h4 term: only the low 32 bits of the product are kept */
 2069         addl    %eax,%esi
 2070         movl    (%esp),%eax
 2071         adcl    $0,%edi
      /* d2 = carry + h0*r2 + h1*r1 + h2*r0 + h3*s3 + h4*s2, in %esi:%edi */
 2072         mull    44(%esp)
 2073         movl    %esi,24(%esp)
 2074         xorl    %esi,%esi
 2075         addl    %eax,%edi
 2076         movl    %ebx,%eax
 2077         adcl    %edx,%esi
 2078         mull    40(%esp)
 2079         addl    %eax,%edi
 2080         movl    %ecx,%eax
 2081         adcl    %edx,%esi
 2082         mull    36(%esp)
 2083         addl    %eax,%edi
 2084         movl    12(%esp),%eax
 2085         adcl    %edx,%esi
 2086         mull    60(%esp)
 2087         addl    %eax,%edi
 2088         movl    16(%esp),%eax
 2089         adcl    %edx,%esi
 2090         imull   56(%esp),%eax
 2091         addl    %eax,%edi
 2092         movl    (%esp),%eax
 2093         adcl    $0,%esi
      /* d3 = carry + h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*s3, in %edi:%esi */
 2094         mull    48(%esp)
 2095         movl    %edi,28(%esp)
 2096         xorl    %edi,%edi
 2097         addl    %eax,%esi
 2098         movl    %ebx,%eax
 2099         adcl    %edx,%edi
 2100         mull    44(%esp)
 2101         addl    %eax,%esi
 2102         movl    %ecx,%eax
 2103         adcl    %edx,%edi
 2104         mull    40(%esp)
 2105         addl    %eax,%esi
 2106         movl    12(%esp),%eax
 2107         adcl    %edx,%edi
 2108         mull    36(%esp)
 2109         addl    %eax,%esi
 2110         movl    16(%esp),%ecx
 2111         adcl    %edx,%edi
 2112         movl    %ecx,%edx
 2113         imull   60(%esp),%ecx
 2114         addl    %ecx,%esi
 2115         movl    20(%esp),%eax
 2116         adcl    $0,%edi
 2117         imull   36(%esp),%edx
 2118         addl    %edi,%edx       /* d4 = carry + h4*r0 */
 2119         movl    24(%esp),%ebx
 2120         movl    28(%esp),%ecx
      /* fold: h4 = d4 & 3, then add (d4 >> 2) * 5 into the low word and ripple the carry */
 2121         movl    %edx,%edi
 2122         shrl    $2,%edx
 2123         andl    $3,%edi
 2124         leal    (%edx,%edx,4),%edx
 2125         addl    %edx,%eax
 2126         adcl    $0,%ebx
 2127         adcl    $0,%ecx
 2128         adcl    $0,%esi
 2129         adcl    $0,%edi
 2130         cmpl    92(%esp),%ebp   /* reached the end pointer? */
 2131         jne     .L004loop
 2132         movl    84(%esp),%edx   /* state pointer argument */
 2133         addl    $64,%esp
 2134         movl    %eax,(%edx)
 2135         movl    %ebx,4(%edx)
 2136         movl    %ecx,8(%edx)
 2137         movl    %esi,12(%edx)
 2138         movl    %edi,16(%edx)
 2139 .L003nodata:
 2140         popl    %edi
 2141         popl    %esi
 2142         popl    %ebx
 2143         popl    %ebp
 2144         ret
 2145 .size   poly1305_blocks,.-.L_poly1305_blocks_begin
 2146 .globl  poly1305_emit
 2147 .type   poly1305_emit,@function
 2148 .align  16
      /*
       * poly1305_emit (scalar path).  Three stack arguments, read after the
       * four register pushes as:
       *   20(%esp) -> %ebp  state block; accumulator words h0..h4 at +0..+16
       *   24(%esp) -> %edi  16-byte output buffer (also used as scratch)
       *   28(%esp) -> %ebp  (loaded later) four words added to the result
       * Computes h + 5 across all five words, uses the overflow out of the
       * top word to choose between h and h + 5 without branching, adds the
       * four words from the third argument with carry (the final carry is
       * dropped) and stores the 16-byte result.
       *
       * .Lenter_emit sits after the state pointer load; nothing in the
       * visible part of the file jumps to it -- presumably an entry for the
       * vector emit routine, verify against its caller.
       */
 2149 poly1305_emit:
 2150 .L_poly1305_emit_begin:
 2151         pushl   %ebp
 2152         pushl   %ebx
 2153         pushl   %esi
 2154         pushl   %edi
 2155         movl    20(%esp),%ebp
 2156 .Lenter_emit:
 2157         movl    24(%esp),%edi
 2158         movl    (%ebp),%eax
 2159         movl    4(%ebp),%ebx
 2160         movl    8(%ebp),%ecx
 2161         movl    12(%ebp),%edx
 2162         movl    16(%ebp),%esi
 2163         addl    $5,%eax         /* g = h + 5, carried through all five words */
 2164         adcl    $0,%ebx
 2165         adcl    $0,%ecx
 2166         adcl    $0,%edx
 2167         adcl    $0,%esi
 2168         shrl    $2,%esi         /* keep what lies above bit 1 of the top word */
 2169         negl    %esi            /* mask = 0 or all-ones; assumes the shifted value is 0 or 1 -- TODO confirm bound on h4 */
 2170         andl    %esi,%eax       /* g & mask */
 2171         andl    %esi,%ebx
 2172         andl    %esi,%ecx
 2173         andl    %esi,%edx
 2174         movl    %eax,(%edi)     /* park g & mask in the output buffer */
 2175         movl    %ebx,4(%edi)
 2176         movl    %ecx,8(%edi)
 2177         movl    %edx,12(%edi)
 2178         notl    %esi            /* complementary mask for the original h */
 2179         movl    (%ebp),%eax
 2180         movl    4(%ebp),%ebx
 2181         movl    8(%ebp),%ecx
 2182         movl    12(%ebp),%edx
 2183         movl    28(%esp),%ebp   /* state pointer no longer needed; %ebp = third argument */
 2184         andl    %esi,%eax       /* h & ~mask */
 2185         andl    %esi,%ebx
 2186         andl    %esi,%ecx
 2187         andl    %esi,%edx
 2188         orl     (%edi),%eax     /* merge: exactly one of the two halves is non-zero */
 2189         orl     4(%edi),%ebx
 2190         orl     8(%edi),%ecx
 2191         orl     12(%edi),%edx
 2192         addl    (%ebp),%eax     /* add the four words from the third argument */
 2193         adcl    4(%ebp),%ebx
 2194         adcl    8(%ebp),%ecx
 2195         adcl    12(%ebp),%edx
 2196         movl    %eax,(%edi)
 2197         movl    %ebx,4(%edi)
 2198         movl    %ecx,8(%edi)
 2199         movl    %edx,12(%edi)
 2200         popl    %edi
 2201         popl    %esi
 2202         popl    %ebx
 2203         popl    %ebp
 2204         ret
 2205 .size   poly1305_emit,.-.L_poly1305_emit_begin
 2206 .align  32
 2207 .type   _poly1305_init_sse2,@function
 2208 .align  16
      /*
       * _poly1305_init_sse2: file-local helper (no .globl) with a register
       * calling convention, invoked from _poly1305_blocks_sse2.
       *   in:  %edi = state block; the 16 bytes at +24 are the multiplier
       *        %ebx = .Lconst_sse2; 64(%ebx) supplies the 26-bit lane mask
       *   out: 144 bytes written at state+48: five 16-byte limb vectors at
       *        +48..+127, then the limb 1..4 vectors multiplied by 5 at
       *        +128..+191
       *   clobbers %ebp (holds the caller's %esp), %ecx, %edx, %xmm0-%xmm7;
       *   %edi and %esp are restored before returning.
       * The multiplier is split into five 26-bit limbs and run through the
       * multiply-and-reduce loop twice.  Following the lane shuffles, each
       * stored vector appears to hold that limb of r^4, r^3, r^2, r in
       * dwords 0..3 -- derived from the data flow, confirm with the consumer.
       */
 2209 _poly1305_init_sse2:
 2210         movdqu  24(%edi),%xmm4
 2211         leal    48(%edi),%edi   /* %edi = destination table; undone before ret */
 2212         movl    %esp,%ebp
 2213         subl    $224,%esp
 2214         andl    $-16,%esp       /* 16-byte aligned scratch for the movdqa accesses */
 2215         movq    64(%ebx),%xmm7  /* mask in the low qword only; high qword is zero */
 2216         movdqa  %xmm4,%xmm0
 2217         movdqa  %xmm4,%xmm1
 2218         movdqa  %xmm4,%xmm2
 2219         pand    %xmm7,%xmm0     /* limb 0: bits 0..25 */
 2220         psrlq   $26,%xmm1
 2221         psrldq  $6,%xmm2        /* byte shift by 6 = 48 bits */
 2222         pand    %xmm7,%xmm1     /* limb 1: bits 26..51 */
 2223         movdqa  %xmm2,%xmm3
 2224         psrlq   $4,%xmm2
 2225         psrlq   $30,%xmm3
 2226         pand    %xmm7,%xmm2     /* limb 2: bits 52..77 */
 2227         pand    %xmm7,%xmm3     /* limb 3: bits 78..103 */
 2228         psrldq  $13,%xmm4       /* limb 4: bits 104..127 */
 2229         leal    144(%esp),%edx  /* broadcast multiplier table lives at 144(%esp) */
 2230         movl    $2,%ecx         /* two passes */
 2231 .L005square:
      /* save the current limbs at 0..64(%esp) */
 2232         movdqa  %xmm0,(%esp)
 2233         movdqa  %xmm1,16(%esp)
 2234         movdqa  %xmm2,32(%esp)
 2235         movdqa  %xmm3,48(%esp)
 2236         movdqa  %xmm4,64(%esp)
      /* 5 * limb1..4 (x + 4x) at 80..128(%esp) */
 2237         movdqa  %xmm1,%xmm6
 2238         movdqa  %xmm2,%xmm5
 2239         pslld   $2,%xmm6
 2240         pslld   $2,%xmm5
 2241         paddd   %xmm1,%xmm6
 2242         paddd   %xmm2,%xmm5
 2243         movdqa  %xmm6,80(%esp)
 2244         movdqa  %xmm5,96(%esp)
 2245         movdqa  %xmm3,%xmm6
 2246         movdqa  %xmm4,%xmm5
 2247         pslld   $2,%xmm6
 2248         pslld   $2,%xmm5
 2249         paddd   %xmm3,%xmm6
 2250         paddd   %xmm4,%xmm5
 2251         movdqa  %xmm6,112(%esp)
 2252         movdqa  %xmm5,128(%esp)
      /* pshufd $68 copies the low qword into both halves; results go to the table at (%edx) */
 2253         pshufd  $68,%xmm0,%xmm6
 2254         movdqa  %xmm1,%xmm5
 2255         pshufd  $68,%xmm1,%xmm1
 2256         pshufd  $68,%xmm2,%xmm2
 2257         pshufd  $68,%xmm3,%xmm3
 2258         pshufd  $68,%xmm4,%xmm4
 2259         movdqa  %xmm6,(%edx)
 2260         movdqa  %xmm1,16(%edx)
 2261         movdqa  %xmm2,32(%edx)
 2262         movdqa  %xmm3,48(%edx)
 2263         movdqa  %xmm4,64(%edx)
      /*
       * Multiply the saved limbs (and their 5x forms) by the broadcast table,
       * accumulating 64-bit column sums in %xmm0..%xmm4.
       */
 2264         pmuludq %xmm0,%xmm4
 2265         pmuludq %xmm0,%xmm3
 2266         pmuludq %xmm0,%xmm2
 2267         pmuludq %xmm0,%xmm1
 2268         pmuludq %xmm6,%xmm0
 2269         movdqa  %xmm5,%xmm6
 2270         pmuludq 48(%edx),%xmm5
 2271         movdqa  %xmm6,%xmm7
 2272         pmuludq 32(%edx),%xmm6
 2273         paddq   %xmm5,%xmm4
 2274         movdqa  %xmm7,%xmm5
 2275         pmuludq 16(%edx),%xmm7
 2276         paddq   %xmm6,%xmm3
 2277         movdqa  80(%esp),%xmm6
 2278         pmuludq (%edx),%xmm5
 2279         paddq   %xmm7,%xmm2
 2280         pmuludq 64(%edx),%xmm6
 2281         movdqa  32(%esp),%xmm7
 2282         paddq   %xmm5,%xmm1
 2283         movdqa  %xmm7,%xmm5
 2284         pmuludq 32(%edx),%xmm7
 2285         paddq   %xmm6,%xmm0
 2286         movdqa  %xmm5,%xmm6
 2287         pmuludq 16(%edx),%xmm5
 2288         paddq   %xmm7,%xmm4
 2289         movdqa  96(%esp),%xmm7
 2290         pmuludq (%edx),%xmm6
 2291         paddq   %xmm5,%xmm3
 2292         movdqa  %xmm7,%xmm5
 2293         pmuludq 64(%edx),%xmm7
 2294         paddq   %xmm6,%xmm2
 2295         pmuludq 48(%edx),%xmm5
 2296         movdqa  48(%esp),%xmm6
 2297         paddq   %xmm7,%xmm1
 2298         movdqa  %xmm6,%xmm7
 2299         pmuludq 16(%edx),%xmm6
 2300         paddq   %xmm5,%xmm0
 2301         movdqa  112(%esp),%xmm5
 2302         pmuludq (%edx),%xmm7
 2303         paddq   %xmm6,%xmm4
 2304         movdqa  %xmm5,%xmm6
 2305         pmuludq 64(%edx),%xmm5
 2306         paddq   %xmm7,%xmm3
 2307         movdqa  %xmm6,%xmm7
 2308         pmuludq 48(%edx),%xmm6
 2309         paddq   %xmm5,%xmm2
 2310         pmuludq 32(%edx),%xmm7
 2311         movdqa  64(%esp),%xmm5
 2312         paddq   %xmm6,%xmm1
 2313         movdqa  128(%esp),%xmm6
 2314         pmuludq (%edx),%xmm5
 2315         paddq   %xmm7,%xmm0
 2316         movdqa  %xmm6,%xmm7
 2317         pmuludq 64(%edx),%xmm6
 2318         paddq   %xmm5,%xmm4
 2319         movdqa  %xmm7,%xmm5
 2320         pmuludq 16(%edx),%xmm7
 2321         paddq   %xmm6,%xmm3
 2322         movdqa  %xmm5,%xmm6
 2323         pmuludq 32(%edx),%xmm5
 2324         paddq   %xmm7,%xmm0
 2325         pmuludq 48(%edx),%xmm6
 2326         movdqa  64(%ebx),%xmm7  /* full-width 26-bit mask for the carry pass */
 2327         paddq   %xmm5,%xmm1
 2328         paddq   %xmm6,%xmm2
      /*
       * Carry pass back to 26-bit limbs: 3->4, 0->1, 4->0 (carry times 5,
       * formed as c + 4c), 1->2, 2->3, then 0->1 and 3->4 once more.
       */
 2329         movdqa  %xmm3,%xmm5
 2330         pand    %xmm7,%xmm3
 2331         psrlq   $26,%xmm5
 2332         paddq   %xmm4,%xmm5
 2333         movdqa  %xmm0,%xmm6
 2334         pand    %xmm7,%xmm0
 2335         psrlq   $26,%xmm6
 2336         movdqa  %xmm5,%xmm4
 2337         paddq   %xmm1,%xmm6
 2338         psrlq   $26,%xmm5
 2339         pand    %xmm7,%xmm4
 2340         movdqa  %xmm6,%xmm1
 2341         psrlq   $26,%xmm6
 2342         paddd   %xmm5,%xmm0
 2343         psllq   $2,%xmm5
 2344         paddq   %xmm2,%xmm6
 2345         paddq   %xmm0,%xmm5
 2346         pand    %xmm7,%xmm1
 2347         movdqa  %xmm6,%xmm2
 2348         psrlq   $26,%xmm6
 2349         pand    %xmm7,%xmm2
 2350         paddd   %xmm3,%xmm6
 2351         movdqa  %xmm5,%xmm0
 2352         psrlq   $26,%xmm5
 2353         movdqa  %xmm6,%xmm3
 2354         psrlq   $26,%xmm6
 2355         pand    %xmm7,%xmm0
 2356         paddd   %xmm5,%xmm1
 2357         pand    %xmm7,%xmm3
 2358         paddd   %xmm6,%xmm4
 2359         decl    %ecx
 2360         jz      .L006square_break
      /* first pass done: new limbs in the low qword, saved low qword in the high qword */
 2361         punpcklqdq      (%esp),%xmm0
 2362         punpcklqdq      16(%esp),%xmm1
 2363         punpcklqdq      32(%esp),%xmm2
 2364         punpcklqdq      48(%esp),%xmm3
 2365         punpcklqdq      64(%esp),%xmm4
 2366         jmp     .L005square
 2367 .L006square_break:
      /* second-pass results move to the odd dwords, the saved limbs fill the even ones */
 2368         psllq   $32,%xmm0
 2369         psllq   $32,%xmm1
 2370         psllq   $32,%xmm2
 2371         psllq   $32,%xmm3
 2372         psllq   $32,%xmm4
 2373         por     (%esp),%xmm0
 2374         por     16(%esp),%xmm1
 2375         por     32(%esp),%xmm2
 2376         por     48(%esp),%xmm3
 2377         por     64(%esp),%xmm4
      /* pshufd $141 reorders the dwords to source positions 1,3,0,2 */
 2378         pshufd  $141,%xmm0,%xmm0
 2379         pshufd  $141,%xmm1,%xmm1
 2380         pshufd  $141,%xmm2,%xmm2
 2381         pshufd  $141,%xmm3,%xmm3
 2382         pshufd  $141,%xmm4,%xmm4
 2383         movdqu  %xmm0,(%edi)
 2384         movdqu  %xmm1,16(%edi)
 2385         movdqu  %xmm2,32(%edi)
 2386         movdqu  %xmm3,48(%edi)
 2387         movdqu  %xmm4,64(%edi)
      /* 5x forms of limb vectors 1..4 follow at +80..+143 from the table base */
 2388         movdqa  %xmm1,%xmm6
 2389         movdqa  %xmm2,%xmm5
 2390         pslld   $2,%xmm6
 2391         pslld   $2,%xmm5
 2392         paddd   %xmm1,%xmm6
 2393         paddd   %xmm2,%xmm5
 2394         movdqu  %xmm6,80(%edi)
 2395         movdqu  %xmm5,96(%edi)
 2396         movdqa  %xmm3,%xmm6
 2397         movdqa  %xmm4,%xmm5
 2398         pslld   $2,%xmm6
 2399         pslld   $2,%xmm5
 2400         paddd   %xmm3,%xmm6
 2401         paddd   %xmm4,%xmm5
 2402         movdqu  %xmm6,112(%edi)
 2403         movdqu  %xmm5,128(%edi)
 2404         movl    %ebp,%esp       /* drop the aligned scratch area */
 2405         leal    -48(%edi),%edi  /* back to the state block base */
 2406         ret
 2407 .size   _poly1305_init_sse2,.-_poly1305_init_sse2
 2408 .align  32
 2409 .type   _poly1305_blocks_sse2,@function
 2410 .align  16
 2411 _poly1305_blocks_sse2:
 2412         pushl   %ebp
 2413         pushl   %ebx
 2414         pushl   %esi
 2415         pushl   %edi
 2416         movl    20(%esp),%edi
 2417         movl    24(%esp),%esi
 2418         movl    28(%esp),%ecx
 2419         movl    20(%edi),%eax
 2420         andl    $-16,%ecx
 2421         jz      .L007nodata
 2422         cmpl    $64,%ecx
 2423         jae     .L008enter_sse2
 2424         testl   %eax,%eax
 2425         jz      .Lenter_blocks
 2426 .align  16
 2427 .L008enter_sse2:
 2428         call    .L009pic_point
 2429 .L009pic_point:
 2430         popl    %ebx
 2431         leal    .Lconst_sse2-.L009pic_point(%ebx),%ebx
 2432         testl   %eax,%eax
 2433         jnz     .L010base2_26
 2434         call    _poly1305_init_sse2
 2435         movl    (%edi),%eax
 2436         movl    3(%edi),%ecx
 2437         movl    6(%edi),%edx
 2438         movl    9(%edi),%esi
 2439         movl    13(%edi),%ebp
 2440         movl    $1,20(%edi)
 2441         shrl    $2,%ecx
 2442         andl    $67108863,%eax
 2443         shrl    $4,%edx
 2444         andl    $67108863,%ecx
 2445         shrl    $6,%esi
 2446         andl    $67108863,%edx
 2447         movd    %eax,%xmm0
 2448         movd    %ecx,%xmm1
 2449         movd    %edx,%xmm2
 2450         movd    %esi,%xmm3
 2451         movd    %ebp,%xmm4
 2452         movl    24(%esp),%esi
 2453         movl    28(%esp),%ecx
 2454         jmp     .L011base2_32
 2455 .align  16
 2456 .L010base2_26:
 2457         movd    (%edi),%xmm0
 2458         movd    4(%edi),%xmm1
 2459         movd    8(%edi),%xmm2
 2460         movd    12(%edi),%xmm3
 2461         movd    16(%edi),%xmm4
 2462         movdqa  64(%ebx),%xmm7
 2463 .L011base2_32:
 2464         movl    32(%esp),%eax
 2465         movl    %esp,%ebp
 2466         subl    $528,%esp
 2467         andl    $-16,%esp
 2468         leal    48(%edi),%edi
 2469         shll    $24,%eax
 2470         testl   $31,%ecx
 2471         jz      .L012even
 2472         movdqu  (%esi),%xmm6
 2473         leal    16(%esi),%esi
 2474         movdqa  %xmm6,%xmm5
 2475         pand    %xmm7,%xmm6
 2476         paddd   %xmm6,%xmm0
 2477         movdqa  %xmm5,%xmm6
 2478         psrlq   $26,%xmm5
 2479         psrldq  $6,%xmm6
 2480         pand    %xmm7,%xmm5
 2481         paddd   %xmm5,%xmm1
 2482         movdqa  %xmm6,%xmm5
 2483         psrlq   $4,%xmm6
 2484         pand    %xmm7,%xmm6
 2485         paddd   %xmm6,%xmm2
 2486         movdqa  %xmm5,%xmm6
 2487         psrlq   $30,%xmm5
 2488         pand    %xmm7,%xmm5
 2489         psrldq  $7,%xmm6
 2490         paddd   %xmm5,%xmm3
 2491         movd    %eax,%xmm5
 2492         paddd   %xmm6,%xmm4
 2493         movd    12(%edi),%xmm6
 2494         paddd   %xmm5,%xmm4
 2495         movdqa  %xmm0,(%esp)
 2496         movdqa  %xmm1,16(%esp)
 2497         movdqa  %xmm2,32(%esp)
 2498         movdqa  %xmm3,48(%esp)
 2499         movdqa  %xmm4,64(%esp)
 2500         pmuludq %xmm6,%xmm0
 2501         pmuludq %xmm6,%xmm1
 2502         pmuludq %xmm6,%xmm2
 2503         movd    28(%edi),%xmm5
 2504         pmuludq %xmm6,%xmm3
 2505         pmuludq %xmm6,%xmm4
 2506         movdqa  %xmm5,%xmm6
 2507         pmuludq 48(%esp),%xmm5
 2508         movdqa  %xmm6,%xmm7
 2509         pmuludq 32(%esp),%xmm6
 2510         paddq   %xmm5,%xmm4
 2511         movdqa  %xmm7,%xmm5
 2512         pmuludq 16(%esp),%xmm7
 2513         paddq   %xmm6,%xmm3
 2514         movd    92(%edi),%xmm6
 2515         pmuludq (%esp),%xmm5
 2516         paddq   %xmm7,%xmm2
 2517         pmuludq 64(%esp),%xmm6
 2518         movd    44(%edi),%xmm7
 2519         paddq   %xmm5,%xmm1
 2520         movdqa  %xmm7,%xmm5
 2521         pmuludq 32(%esp),%xmm7
 2522         paddq   %xmm6,%xmm0
 2523         movdqa  %xmm5,%xmm6
 2524         pmuludq 16(%esp),%xmm5
 2525         paddq   %xmm7,%xmm4
 2526         movd    108(%edi),%xmm7
 2527         pmuludq (%esp),%xmm6
 2528         paddq   %xmm5,%xmm3
 2529         movdqa  %xmm7,%xmm5
 2530         pmuludq 64(%esp),%xmm7
 2531         paddq   %xmm6,%xmm2
 2532         pmuludq 48(%esp),%xmm5
 2533         movd    60(%edi),%xmm6
 2534         paddq   %xmm7,%xmm1
 2535         movdqa  %xmm6,%xmm7
 2536         pmuludq 16(%esp),%xmm6
 2537         paddq   %xmm5,%xmm0
 2538         movd    124(%edi),%xmm5
 2539         pmuludq (%esp),%xmm7
 2540         paddq   %xmm6,%xmm4
 2541         movdqa  %xmm5,%xmm6
 2542         pmuludq 64(%esp),%xmm5
 2543         paddq   %xmm7,%xmm3
 2544         movdqa  %xmm6,%xmm7
 2545         pmuludq 48(%esp),%xmm6
 2546         paddq   %xmm5,%xmm2
 2547         pmuludq 32(%esp),%xmm7
 2548         movd    76(%edi),%xmm5
 2549         paddq   %xmm6,%xmm1
 2550         movd    140(%edi),%xmm6
 2551         pmuludq (%esp),%xmm5
 2552         paddq   %xmm7,%xmm0
 2553         movdqa  %xmm6,%xmm7
 2554         pmuludq 64(%esp),%xmm6
 2555         paddq   %xmm5,%xmm4
 2556         movdqa  %xmm7,%xmm5
 2557         pmuludq 16(%esp),%xmm7
 2558         paddq   %xmm6,%xmm3
 2559         movdqa  %xmm5,%xmm6
 2560         pmuludq 32(%esp),%xmm5
 2561         paddq   %xmm7,%xmm0
 2562         pmuludq 48(%esp),%xmm6
 2563         movdqa  64(%ebx),%xmm7
 2564         paddq   %xmm5,%xmm1
 2565         paddq   %xmm6,%xmm2
 2566         movdqa  %xmm3,%xmm5
 2567         pand    %xmm7,%xmm3
 2568         psrlq   $26,%xmm5
 2569         paddq   %xmm4,%xmm5
 2570         movdqa  %xmm0,%xmm6
 2571         pand    %xmm7,%xmm0
 2572         psrlq   $26,%xmm6
 2573         movdqa  %xmm5,%xmm4
 2574         paddq   %xmm1,%xmm6
 2575         psrlq   $26,%xmm5
 2576         pand    %xmm7,%xmm4
 2577         movdqa  %xmm6,%xmm1
 2578         psrlq   $26,%xmm6
 2579         paddd   %xmm5,%xmm0
 2580         psllq   $2,%xmm5
 2581         paddq   %xmm2,%xmm6
 2582         paddq   %xmm0,%xmm5
 2583         pand    %xmm7,%xmm1
 2584         movdqa  %xmm6,%xmm2
 2585         psrlq   $26,%xmm6
 2586         pand    %xmm7,%xmm2
 2587         paddd   %xmm3,%xmm6
 2588         movdqa  %xmm5,%xmm0
 2589         psrlq   $26,%xmm5
 2590         movdqa  %xmm6,%xmm3
 2591         psrlq   $26,%xmm6
 2592         pand    %xmm7,%xmm0
 2593         paddd   %xmm5,%xmm1
 2594         pand    %xmm7,%xmm3
 2595         paddd   %xmm6,%xmm4
 2596         subl    $16,%ecx
 2597         jz      .L013done
 2598 .L012even:
 2599         leal    384(%esp),%edx
 2600         leal    -32(%esi),%eax
 2601         subl    $64,%ecx
 2602         movdqu  (%edi),%xmm5
 2603         pshufd  $68,%xmm5,%xmm6
 2604         cmovbl  %eax,%esi
 2605         pshufd  $238,%xmm5,%xmm5
 2606         movdqa  %xmm6,(%edx)
 2607         leal    160(%esp),%eax
 2608         movdqu  16(%edi),%xmm6
 2609         movdqa  %xmm5,-144(%edx)
 2610         pshufd  $68,%xmm6,%xmm5
 2611         pshufd  $238,%xmm6,%xmm6
 2612         movdqa  %xmm5,16(%edx)
 2613         movdqu  32(%edi),%xmm5
 2614         movdqa  %xmm6,-128(%edx)
 2615         pshufd  $68,%xmm5,%xmm6
 2616         pshufd  $238,%xmm5,%xmm5
 2617         movdqa  %xmm6,32(%edx)
 2618         movdqu  48(%edi),%xmm6
 2619         movdqa  %xmm5,-112(%edx)
 2620         pshufd  $68,%xmm6,%xmm5
 2621         pshufd  $238,%xmm6,%xmm6
 2622         movdqa  %xmm5,48(%edx)
 2623         movdqu  64(%edi),%xmm5
 2624         movdqa  %xmm6,-96(%edx)
 2625         pshufd  $68,%xmm5,%xmm6
 2626         pshufd  $238,%xmm5,%xmm5
 2627         movdqa  %xmm6,64(%edx)
 2628         movdqu  80(%edi),%xmm6
 2629         movdqa  %xmm5,-80(%edx)
 2630         pshufd  $68,%xmm6,%xmm5
 2631         pshufd  $238,%xmm6,%xmm6
 2632         movdqa  %xmm5,80(%edx)
 2633         movdqu  96(%edi),%xmm5
 2634         movdqa  %xmm6,-64(%edx)
 2635         pshufd  $68,%xmm5,%xmm6
 2636         pshufd  $238,%xmm5,%xmm5
 2637         movdqa  %xmm6,96(%edx)
 2638         movdqu  112(%edi),%xmm6
 2639         movdqa  %xmm5,-48(%edx)
 2640         pshufd  $68,%xmm6,%xmm5
 2641         pshufd  $238,%xmm6,%xmm6
 2642         movdqa  %xmm5,112(%edx)
 2643         movdqu  128(%edi),%xmm5
 2644         movdqa  %xmm6,-32(%edx)
 2645         pshufd  $68,%xmm5,%xmm6
 2646         pshufd  $238,%xmm5,%xmm5
 2647         movdqa  %xmm6,128(%edx)
 2648         movdqa  %xmm5,-16(%edx)
 2649         movdqu  32(%esi),%xmm5
 2650         movdqu  48(%esi),%xmm6
 2651         leal    32(%esi),%esi
 2652         movdqa  %xmm2,112(%esp)
 2653         movdqa  %xmm3,128(%esp)
 2654         movdqa  %xmm4,144(%esp)
 2655         movdqa  %xmm5,%xmm2
 2656         movdqa  %xmm6,%xmm3
 2657         psrldq  $6,%xmm2
 2658         psrldq  $6,%xmm3
 2659         movdqa  %xmm5,%xmm4
 2660         punpcklqdq      %xmm3,%xmm2
 2661         punpckhqdq      %xmm6,%xmm4
 2662         punpcklqdq      %xmm6,%xmm5
 2663         movdqa  %xmm2,%xmm3
 2664         psrlq   $4,%xmm2
 2665         psrlq   $30,%xmm3
 2666         movdqa  %xmm5,%xmm6
 2667         psrlq   $40,%xmm4
 2668         psrlq   $26,%xmm6
 2669         pand    %xmm7,%xmm5
 2670         pand    %xmm7,%xmm6
 2671         pand    %xmm7,%xmm2
 2672         pand    %xmm7,%xmm3
 2673         por     (%ebx),%xmm4
 2674         movdqa  %xmm0,80(%esp)
 2675         movdqa  %xmm1,96(%esp)
 2676         jbe     .L014skip_loop
 2677         jmp     .L015loop
 2678 .align  32
 2679 .L015loop:
 2680         movdqa  -144(%edx),%xmm7
 2681         movdqa  %xmm6,16(%eax)
 2682         movdqa  %xmm2,32(%eax)
 2683         movdqa  %xmm3,48(%eax)
 2684         movdqa  %xmm4,64(%eax)
 2685         movdqa  %xmm5,%xmm1
 2686         pmuludq %xmm7,%xmm5
 2687         movdqa  %xmm6,%xmm0
 2688         pmuludq %xmm7,%xmm6
 2689         pmuludq %xmm7,%xmm2
 2690         pmuludq %xmm7,%xmm3
 2691         pmuludq %xmm7,%xmm4
 2692         pmuludq -16(%edx),%xmm0
 2693         movdqa  %xmm1,%xmm7
 2694         pmuludq -128(%edx),%xmm1
 2695         paddq   %xmm5,%xmm0
 2696         movdqa  %xmm7,%xmm5
 2697         pmuludq -112(%edx),%xmm7
 2698         paddq   %xmm6,%xmm1
 2699         movdqa  %xmm5,%xmm6
 2700         pmuludq -96(%edx),%xmm5
 2701         paddq   %xmm7,%xmm2
 2702         movdqa  16(%eax),%xmm7
 2703         pmuludq -80(%edx),%xmm6
 2704         paddq   %xmm5,%xmm3
 2705         movdqa  %xmm7,%xmm5
 2706         pmuludq -128(%edx),%xmm7
 2707         paddq   %xmm6,%xmm4
 2708         movdqa  %xmm5,%xmm6
 2709         pmuludq -112(%edx),%xmm5
 2710         paddq   %xmm7,%xmm2
 2711         movdqa  32(%eax),%xmm7
 2712         pmuludq -96(%edx),%xmm6
 2713         paddq   %xmm5,%xmm3
 2714         movdqa  %xmm7,%xmm5
 2715         pmuludq -32(%edx),%xmm7
 2716         paddq   %xmm6,%xmm4
 2717         movdqa  %xmm5,%xmm6
 2718         pmuludq -16(%edx),%xmm5
 2719         paddq   %xmm7,%xmm0
 2720         movdqa  %xmm6,%xmm7
 2721         pmuludq -128(%edx),%xmm6
 2722         paddq   %xmm5,%xmm1
 2723         movdqa  48(%eax),%xmm5
 2724         pmuludq -112(%edx),%xmm7
 2725         paddq   %xmm6,%xmm3
 2726         movdqa  %xmm5,%xmm6
 2727         pmuludq -48(%edx),%xmm5
 2728         paddq   %xmm7,%xmm4
 2729         movdqa  %xmm6,%xmm7
 2730         pmuludq -32(%edx),%xmm6
 2731         paddq   %xmm5,%xmm0
 2732         movdqa  %xmm7,%xmm5
 2733         pmuludq -16(%edx),%xmm7
 2734         paddq   %xmm6,%xmm1
 2735         movdqa  64(%eax),%xmm6
 2736         pmuludq -128(%edx),%xmm5
 2737         paddq   %xmm7,%xmm2
 2738         movdqa  %xmm6,%xmm7
 2739         pmuludq -16(%edx),%xmm6
 2740         paddq   %xmm5,%xmm4
 2741         movdqa  %xmm7,%xmm5
 2742         pmuludq -64(%edx),%xmm7
 2743         paddq   %xmm6,%xmm3
 2744         movdqa  %xmm5,%xmm6
 2745         pmuludq -48(%edx),%xmm5
 2746         paddq   %xmm7,%xmm0
 2747         movdqa  64(%ebx),%xmm7
 2748         pmuludq -32(%edx),%xmm6
 2749         paddq   %xmm5,%xmm1
 2750         paddq   %xmm6,%xmm2
 2751         movdqu  -32(%esi),%xmm5
 2752         movdqu  -16(%esi),%xmm6
 2753         leal    32(%esi),%esi
 2754         movdqa  %xmm2,32(%esp)
 2755         movdqa  %xmm3,48(%esp)
 2756         movdqa  %xmm4,64(%esp)
 2757         movdqa  %xmm5,%xmm2
 2758         movdqa  %xmm6,%xmm3
 2759         psrldq  $6,%xmm2
 2760         psrldq  $6,%xmm3
 2761         movdqa  %xmm5,%xmm4
 2762         punpcklqdq      %xmm3,%xmm2
 2763         punpckhqdq      %xmm6,%xmm4
 2764         punpcklqdq      %xmm6,%xmm5
 2765         movdqa  %xmm2,%xmm3
 2766         psrlq   $4,%xmm2
 2767         psrlq   $30,%xmm3
 2768         movdqa  %xmm5,%xmm6
 2769         psrlq   $40,%xmm4
 2770         psrlq   $26,%xmm6
 2771         pand    %xmm7,%xmm5
 2772         pand    %xmm7,%xmm6
 2773         pand    %xmm7,%xmm2
 2774         pand    %xmm7,%xmm3
 2775         por     (%ebx),%xmm4
 2776         leal    -32(%esi),%eax
 2777         subl    $64,%ecx
 2778         paddd   80(%esp),%xmm5
 2779         paddd   96(%esp),%xmm6
 2780         paddd   112(%esp),%xmm2
 2781         paddd   128(%esp),%xmm3
 2782         paddd   144(%esp),%xmm4
 2783         cmovbl  %eax,%esi
 2784         leal    160(%esp),%eax
 2785         movdqa  (%edx),%xmm7
 2786         movdqa  %xmm1,16(%esp)
 2787         movdqa  %xmm6,16(%eax)
 2788         movdqa  %xmm2,32(%eax)
 2789         movdqa  %xmm3,48(%eax)
 2790         movdqa  %xmm4,64(%eax)
 2791         movdqa  %xmm5,%xmm1
 2792         pmuludq %xmm7,%xmm5
 2793         paddq   %xmm0,%xmm5
 2794         movdqa  %xmm6,%xmm0
 2795         pmuludq %xmm7,%xmm6
 2796         pmuludq %xmm7,%xmm2
 2797         pmuludq %xmm7,%xmm3
 2798         pmuludq %xmm7,%xmm4
 2799         paddq   16(%esp),%xmm6
 2800         paddq   32(%esp),%xmm2
 2801         paddq   48(%esp),%xmm3
 2802         paddq   64(%esp),%xmm4
 2803         pmuludq 128(%edx),%xmm0
 2804         movdqa  %xmm1,%xmm7
 2805         pmuludq 16(%edx),%xmm1
 2806         paddq   %xmm5,%xmm0
 2807         movdqa  %xmm7,%xmm5
 2808         pmuludq 32(%edx),%xmm7
 2809         paddq   %xmm6,%xmm1
 2810         movdqa  %xmm5,%xmm6
 2811         pmuludq 48(%edx),%xmm5
 2812         paddq   %xmm7,%xmm2
 2813         movdqa  16(%eax),%xmm7
 2814         pmuludq 64(%edx),%xmm6
 2815         paddq   %xmm5,%xmm3
 2816         movdqa  %xmm7,%xmm5
 2817         pmuludq 16(%edx),%xmm7
 2818         paddq   %xmm6,%xmm4
 2819         movdqa  %xmm5,%xmm6
 2820         pmuludq 32(%edx),%xmm5
 2821         paddq   %xmm7,%xmm2
 2822         movdqa  32(%eax),%xmm7
 2823         pmuludq 48(%edx),%xmm6
 2824         paddq   %xmm5,%xmm3
 2825         movdqa  %xmm7,%xmm5
 2826         pmuludq 112(%edx),%xmm7
 2827         paddq   %xmm6,%xmm4
 2828         movdqa  %xmm5,%xmm6
 2829         pmuludq 128(%edx),%xmm5
 2830         paddq   %xmm7,%xmm0
 2831         movdqa  %xmm6,%xmm7
 2832         pmuludq 16(%edx),%xmm6
 2833         paddq   %xmm5,%xmm1
 2834         movdqa  48(%eax),%xmm5
 2835         pmuludq 32(%edx),%xmm7
 2836         paddq   %xmm6,%xmm3
 2837         movdqa  %xmm5,%xmm6
 2838         pmuludq 96(%edx),%xmm5
 2839         paddq   %xmm7,%xmm4
 2840         movdqa  %xmm6,%xmm7
 2841         pmuludq 112(%edx),%xmm6
 2842         paddq   %xmm5,%xmm0
 2843         movdqa  %xmm7,%xmm5
 2844         pmuludq 128(%edx),%xmm7
 2845         paddq   %xmm6,%xmm1
 2846         movdqa  64(%eax),%xmm6
 2847         pmuludq 16(%edx),%xmm5
 2848         paddq   %xmm7,%xmm2
 2849         movdqa  %xmm6,%xmm7
 2850         pmuludq 128(%edx),%xmm6
 2851         paddq   %xmm5,%xmm4
 2852         movdqa  %xmm7,%xmm5
 2853         pmuludq 80(%edx),%xmm7
 2854         paddq   %xmm6,%xmm3
 2855         movdqa  %xmm5,%xmm6
 2856         pmuludq 96(%edx),%xmm5
 2857         paddq   %xmm7,%xmm0
 2858         movdqa  64(%ebx),%xmm7
 2859         pmuludq 112(%edx),%xmm6
 2860         paddq   %xmm5,%xmm1
 2861         paddq   %xmm6,%xmm2
 2862         movdqa  %xmm3,%xmm5
 2863         pand    %xmm7,%xmm3
 2864         psrlq   $26,%xmm5
 2865         paddq   %xmm4,%xmm5
 2866         movdqa  %xmm0,%xmm6
 2867         pand    %xmm7,%xmm0
 2868         psrlq   $26,%xmm6
 2869         movdqa  %xmm5,%xmm4
 2870         paddq   %xmm1,%xmm6
 2871         psrlq   $26,%xmm5
 2872         pand    %xmm7,%xmm4
 2873         movdqa  %xmm6,%xmm1
 2874         psrlq   $26,%xmm6
 2875         paddd   %xmm5,%xmm0
 2876         psllq   $2,%xmm5
 2877         paddq   %xmm2,%xmm6
 2878         paddq   %xmm0,%xmm5
 2879         pand    %xmm7,%xmm1
 2880         movdqa  %xmm6,%xmm2
 2881         psrlq   $26,%xmm6
 2882         pand    %xmm7,%xmm2
 2883         paddd   %xmm3,%xmm6
 2884         movdqa  %xmm5,%xmm0
 2885         psrlq   $26,%xmm5
 2886         movdqa  %xmm6,%xmm3
 2887         psrlq   $26,%xmm6
 2888         pand    %xmm7,%xmm0
 2889         paddd   %xmm5,%xmm1
 2890         pand    %xmm7,%xmm3
 2891         paddd   %xmm6,%xmm4
 2892         movdqu  32(%esi),%xmm5
 2893         movdqu  48(%esi),%xmm6
 2894         leal    32(%esi),%esi
 2895         movdqa  %xmm2,112(%esp)
 2896         movdqa  %xmm3,128(%esp)
 2897         movdqa  %xmm4,144(%esp)
 2898         movdqa  %xmm5,%xmm2
 2899         movdqa  %xmm6,%xmm3
 2900         psrldq  $6,%xmm2
 2901         psrldq  $6,%xmm3
 2902         movdqa  %xmm5,%xmm4
 2903         punpcklqdq      %xmm3,%xmm2
 2904         punpckhqdq      %xmm6,%xmm4
 2905         punpcklqdq      %xmm6,%xmm5
 2906         movdqa  %xmm2,%xmm3
 2907         psrlq   $4,%xmm2
 2908         psrlq   $30,%xmm3
 2909         movdqa  %xmm5,%xmm6
 2910         psrlq   $40,%xmm4
 2911         psrlq   $26,%xmm6
 2912         pand    %xmm7,%xmm5
 2913         pand    %xmm7,%xmm6
 2914         pand    %xmm7,%xmm2
 2915         pand    %xmm7,%xmm3
 2916         por     (%ebx),%xmm4
 2917         movdqa  %xmm0,80(%esp)
 2918         movdqa  %xmm1,96(%esp)
 2919         ja      .L015loop
 2920 .L014skip_loop:
 2921         pshufd  $16,-144(%edx),%xmm7
 2922         addl    $32,%ecx
 2923         jnz     .L016long_tail
 2924         paddd   %xmm0,%xmm5
 2925         paddd   %xmm1,%xmm6
 2926         paddd   112(%esp),%xmm2
 2927         paddd   128(%esp),%xmm3
 2928         paddd   144(%esp),%xmm4
 2929 .L016long_tail:
 2930         movdqa  %xmm5,(%eax)
 2931         movdqa  %xmm6,16(%eax)
 2932         movdqa  %xmm2,32(%eax)
 2933         movdqa  %xmm3,48(%eax)
 2934         movdqa  %xmm4,64(%eax)
 2935         pmuludq %xmm7,%xmm5
 2936         pmuludq %xmm7,%xmm6
 2937         pmuludq %xmm7,%xmm2
 2938         movdqa  %xmm5,%xmm0
 2939         pshufd  $16,-128(%edx),%xmm5
 2940         pmuludq %xmm7,%xmm3
 2941         movdqa  %xmm6,%xmm1
 2942         pmuludq %xmm7,%xmm4
 2943         movdqa  %xmm5,%xmm6
 2944         pmuludq 48(%eax),%xmm5
 2945         movdqa  %xmm6,%xmm7
 2946         pmuludq 32(%eax),%xmm6
 2947         paddq   %xmm5,%xmm4
 2948         movdqa  %xmm7,%xmm5
 2949         pmuludq 16(%eax),%xmm7
 2950         paddq   %xmm6,%xmm3
 2951         pshufd  $16,-64(%edx),%xmm6
 2952         pmuludq (%eax),%xmm5
 2953         paddq   %xmm7,%xmm2
 2954         pmuludq 64(%eax),%xmm6
 2955         pshufd  $16,-112(%edx),%xmm7
 2956         paddq   %xmm5,%xmm1
 2957         movdqa  %xmm7,%xmm5
 2958         pmuludq 32(%eax),%xmm7
 2959         paddq   %xmm6,%xmm0
 2960         movdqa  %xmm5,%xmm6
 2961         pmuludq 16(%eax),%xmm5
 2962         paddq   %xmm7,%xmm4
 2963         pshufd  $16,-48(%edx),%xmm7
 2964         pmuludq (%eax),%xmm6
 2965         paddq   %xmm5,%xmm3
 2966         movdqa  %xmm7,%xmm5
 2967         pmuludq 64(%eax),%xmm7
 2968         paddq   %xmm6,%xmm2
 2969         pmuludq 48(%eax),%xmm5
 2970         pshufd  $16,-96(%edx),%xmm6
 2971         paddq   %xmm7,%xmm1
 2972         movdqa  %xmm6,%xmm7
 2973         pmuludq 16(%eax),%xmm6
 2974         paddq   %xmm5,%xmm0
 2975         pshufd  $16,-32(%edx),%xmm5
 2976         pmuludq (%eax),%xmm7
 2977         paddq   %xmm6,%xmm4
 2978         movdqa  %xmm5,%xmm6
 2979         pmuludq 64(%eax),%xmm5
 2980         paddq   %xmm7,%xmm3
 2981         movdqa  %xmm6,%xmm7
 2982         pmuludq 48(%eax),%xmm6
 2983         paddq   %xmm5,%xmm2
 2984         pmuludq 32(%eax),%xmm7
 2985         pshufd  $16,-80(%edx),%xmm5
 2986         paddq   %xmm6,%xmm1
 2987         pshufd  $16,-16(%edx),%xmm6
 2988         pmuludq (%eax),%xmm5
 2989         paddq   %xmm7,%xmm0
 2990         movdqa  %xmm6,%xmm7
 2991         pmuludq 64(%eax),%xmm6
 2992         paddq   %xmm5,%xmm4
 2993         movdqa  %xmm7,%xmm5
 2994         pmuludq 16(%eax),%xmm7
 2995         paddq   %xmm6,%xmm3
 2996         movdqa  %xmm5,%xmm6
 2997         pmuludq 32(%eax),%xmm5
 2998         paddq   %xmm7,%xmm0
 2999         pmuludq 48(%eax),%xmm6
 3000         movdqa  64(%ebx),%xmm7
 3001         paddq   %xmm5,%xmm1
 3002         paddq   %xmm6,%xmm2
 3003         jz      .L017short_tail
 3004         movdqu  -32(%esi),%xmm5
 3005         movdqu  -16(%esi),%xmm6
 3006         leal    32(%esi),%esi
 3007         movdqa  %xmm2,32(%esp)
 3008         movdqa  %xmm3,48(%esp)
 3009         movdqa  %xmm4,64(%esp)
 3010         movdqa  %xmm5,%xmm2
 3011         movdqa  %xmm6,%xmm3
 3012         psrldq  $6,%xmm2
 3013         psrldq  $6,%xmm3
 3014         movdqa  %xmm5,%xmm4
 3015         punpcklqdq      %xmm3,%xmm2
 3016         punpckhqdq      %xmm6,%xmm4
 3017         punpcklqdq      %xmm6,%xmm5
 3018         movdqa  %xmm2,%xmm3
 3019         psrlq   $4,%xmm2
 3020         psrlq   $30,%xmm3
 3021         movdqa  %xmm5,%xmm6
 3022         psrlq   $40,%xmm4
 3023         psrlq   $26,%xmm6
 3024         pand    %xmm7,%xmm5
 3025         pand    %xmm7,%xmm6
 3026         pand    %xmm7,%xmm2
 3027         pand    %xmm7,%xmm3
 3028         por     (%ebx),%xmm4
 3029         pshufd  $16,(%edx),%xmm7
 3030         paddd   80(%esp),%xmm5
 3031         paddd   96(%esp),%xmm6
 3032         paddd   112(%esp),%xmm2
 3033         paddd   128(%esp),%xmm3
 3034         paddd   144(%esp),%xmm4
 3035         movdqa  %xmm5,(%esp)
 3036         pmuludq %xmm7,%xmm5
 3037         movdqa  %xmm6,16(%esp)
 3038         pmuludq %xmm7,%xmm6
 3039         paddq   %xmm5,%xmm0
 3040         movdqa  %xmm2,%xmm5
 3041         pmuludq %xmm7,%xmm2
 3042         paddq   %xmm6,%xmm1
 3043         movdqa  %xmm3,%xmm6
 3044         pmuludq %xmm7,%xmm3
 3045         paddq   32(%esp),%xmm2
 3046         movdqa  %xmm5,32(%esp)
 3047         pshufd  $16,16(%edx),%xmm5
 3048         paddq   48(%esp),%xmm3
 3049         movdqa  %xmm6,48(%esp)
 3050         movdqa  %xmm4,%xmm6
 3051         pmuludq %xmm7,%xmm4
 3052         paddq   64(%esp),%xmm4
 3053         movdqa  %xmm6,64(%esp)
 3054         movdqa  %xmm5,%xmm6
 3055         pmuludq 48(%esp),%xmm5
 3056         movdqa  %xmm6,%xmm7
 3057         pmuludq 32(%esp),%xmm6
 3058         paddq   %xmm5,%xmm4
 3059         movdqa  %xmm7,%xmm5
 3060         pmuludq 16(%esp),%xmm7
 3061         paddq   %xmm6,%xmm3
 3062         pshufd  $16,80(%edx),%xmm6
 3063         pmuludq (%esp),%xmm5
 3064         paddq   %xmm7,%xmm2
 3065         pmuludq 64(%esp),%xmm6
 3066         pshufd  $16,32(%edx),%xmm7
 3067         paddq   %xmm5,%xmm1
 3068         movdqa  %xmm7,%xmm5
 3069         pmuludq 32(%esp),%xmm7
 3070         paddq   %xmm6,%xmm0
 3071         movdqa  %xmm5,%xmm6
 3072         pmuludq 16(%esp),%xmm5
 3073         paddq   %xmm7,%xmm4
 3074         pshufd  $16,96(%edx),%xmm7
 3075         pmuludq (%esp),%xmm6
 3076         paddq   %xmm5,%xmm3
 3077         movdqa  %xmm7,%xmm5
 3078         pmuludq 64(%esp),%xmm7
 3079         paddq   %xmm6,%xmm2
 3080         pmuludq 48(%esp),%xmm5
 3081         pshufd  $16,48(%edx),%xmm6
 3082         paddq   %xmm7,%xmm1
 3083         movdqa  %xmm6,%xmm7
 3084         pmuludq 16(%esp),%xmm6
 3085         paddq   %xmm5,%xmm0
 3086         pshufd  $16,112(%edx),%xmm5
 3087         pmuludq (%esp),%xmm7
 3088         paddq   %xmm6,%xmm4
 3089         movdqa  %xmm5,%xmm6
 3090         pmuludq 64(%esp),%xmm5
 3091         paddq   %xmm7,%xmm3
 3092         movdqa  %xmm6,%xmm7
 3093         pmuludq 48(%esp),%xmm6
 3094         paddq   %xmm5,%xmm2
 3095         pmuludq 32(%esp),%xmm7
 3096         pshufd  $16,64(%edx),%xmm5
 3097         paddq   %xmm6,%xmm1
 3098         pshufd  $16,128(%edx),%xmm6
 3099         pmuludq (%esp),%xmm5
 3100         paddq   %xmm7,%xmm0
 3101         movdqa  %xmm6,%xmm7
 3102         pmuludq 64(%esp),%xmm6
 3103         paddq   %xmm5,%xmm4
 3104         movdqa  %xmm7,%xmm5
 3105         pmuludq 16(%esp),%xmm7
 3106         paddq   %xmm6,%xmm3
 3107         movdqa  %xmm5,%xmm6
 3108         pmuludq 32(%esp),%xmm5
 3109         paddq   %xmm7,%xmm0
 3110         pmuludq 48(%esp),%xmm6
 3111         movdqa  64(%ebx),%xmm7
 3112         paddq   %xmm5,%xmm1
 3113         paddq   %xmm6,%xmm2
 3114 .L017short_tail:
 3115         pshufd  $78,%xmm4,%xmm6
 3116         pshufd  $78,%xmm3,%xmm5
 3117         paddq   %xmm6,%xmm4
 3118         paddq   %xmm5,%xmm3
 3119         pshufd  $78,%xmm0,%xmm6
 3120         pshufd  $78,%xmm1,%xmm5
 3121         paddq   %xmm6,%xmm0
 3122         paddq   %xmm5,%xmm1
 3123         pshufd  $78,%xmm2,%xmm6
 3124         movdqa  %xmm3,%xmm5
 3125         pand    %xmm7,%xmm3
 3126         psrlq   $26,%xmm5
 3127         paddq   %xmm6,%xmm2
 3128         paddq   %xmm4,%xmm5
 3129         movdqa  %xmm0,%xmm6
 3130         pand    %xmm7,%xmm0
 3131         psrlq   $26,%xmm6
 3132         movdqa  %xmm5,%xmm4
 3133         paddq   %xmm1,%xmm6
 3134         psrlq   $26,%xmm5
 3135         pand    %xmm7,%xmm4
 3136         movdqa  %xmm6,%xmm1
 3137         psrlq   $26,%xmm6
 3138         paddd   %xmm5,%xmm0
 3139         psllq   $2,%xmm5
 3140         paddq   %xmm2,%xmm6
 3141         paddq   %xmm0,%xmm5
 3142         pand    %xmm7,%xmm1
 3143         movdqa  %xmm6,%xmm2
 3144         psrlq   $26,%xmm6
 3145         pand    %xmm7,%xmm2
 3146         paddd   %xmm3,%xmm6
 3147         movdqa  %xmm5,%xmm0
 3148         psrlq   $26,%xmm5
 3149         movdqa  %xmm6,%xmm3
 3150         psrlq   $26,%xmm6
 3151         pand    %xmm7,%xmm0
 3152         paddd   %xmm5,%xmm1
 3153         pand    %xmm7,%xmm3
 3154         paddd   %xmm6,%xmm4
 3155 .L013done:
 3156         movd    %xmm0,-48(%edi)
 3157         movd    %xmm1,-44(%edi)
 3158         movd    %xmm2,-40(%edi)
 3159         movd    %xmm3,-36(%edi)
 3160         movd    %xmm4,-32(%edi)
 3161         movl    %ebp,%esp
 3162 .L007nodata:
 3163         popl    %edi
 3164         popl    %esi
 3165         popl    %ebx
 3166         popl    %ebp
 3167         ret
 3168 .size   _poly1305_blocks_sse2,.-_poly1305_blocks_sse2
 3169 .align  32
 3170 .type   _poly1305_emit_sse2,@function
 3171 .align  16
      /*
       * _poly1305_emit_sse2
       *
       * Takes three stack arguments, read at 20/24/28(%esp) once the four
       * register pushes below have been made:
       *   arg0: state block; five 32-bit words at offsets 0..16 and a flag
       *         word at offset 20.
       *   arg1: 16-byte output buffer (also used as scratch storage here).
       *   arg2: pointer to four 32-bit words that are added to the result.
       *
       * When the flag at 20(arg0) is zero, control is handed to
       * .Lenter_emit, which is defined outside this block.  Otherwise the
       * five state words are treated as limbs of 26 bits each (see the
       * 26/20/14/8 shift counts), packed into 32-bit words, folded and
       * conditionally reduced, then added to the arg2 words and stored.
       * Presumably arg0 is the Poly1305 context, arg1 the tag and arg2 the
       * nonce -- confirm against the C caller.
       *
       * %ebp, %ebx, %esi and %edi are saved and restored; %eax, %ecx, %edx
       * and %xmm0-%xmm3 are overwritten.
       *
       * The add/adc carry chains rely on only flag-preserving instructions
       * (mov, movd, lea) sitting between them; do not reorder.
       */
 3172 _poly1305_emit_sse2:
 3173         pushl   %ebp
 3174         pushl   %ebx
 3175         pushl   %esi
 3176         pushl   %edi
 3177         movl    20(%esp),%ebp
              /* Flag word zero: leave for the path at .Lenter_emit. */
 3178         cmpl    $0,20(%ebp)
 3179         je      .Lenter_emit
              /* Load the five limbs w0..w4 into eax, edi, ecx, edx, esi. */
 3180         movl    (%ebp),%eax
 3181         movl    4(%ebp),%edi
 3182         movl    8(%ebp),%ecx
 3183         movl    12(%ebp),%edx
 3184         movl    16(%ebp),%esi
              /*
               * Pack w0 + w1<<26 + w2<<52 + w3<<78 + w4<<104 into
               * eax:ebx:ecx:edx (low to high) with the overflow in esi.
               * Each limb is split into the part that lands in the current
               * word (shll) and the part that starts the next word (shrl);
               * the carry of each addl is picked up by the following adcl.
               */
 3185         movl    %edi,%ebx
 3186         shll    $26,%edi
 3187         shrl    $6,%ebx
 3188         addl    %edi,%eax
 3189         movl    %ecx,%edi
 3190         adcl    $0,%ebx
 3191         shll    $20,%edi
 3192         shrl    $12,%ecx
 3193         addl    %edi,%ebx
 3194         movl    %edx,%edi
 3195         adcl    $0,%ecx
 3196         shll    $14,%edi
 3197         shrl    $18,%edx
 3198         addl    %edi,%ecx
 3199         movl    %esi,%edi
 3200         adcl    $0,%edx
 3201         shll    $8,%edi
 3202         shrl    $24,%esi
 3203         addl    %edi,%edx
 3204         adcl    $0,%esi
              /*
               * Fold the bits at and above 2^130 back into the bottom:
               * esi keeps bits 128..129, ebp = 5 * (esi >> 2) is added to
               * the low word and the carry is rippled through to esi.
               * This matches reduction modulo 2^130 - 5.  The two loads
               * from the stack fetch arg1 into edi and arg2 into ebp without
               * disturbing the flags.
               */
 3205         movl    %esi,%edi
 3206         andl    $3,%esi
 3207         shrl    $2,%edi
 3208         leal    (%edi,%edi,4),%ebp
 3209         movl    24(%esp),%edi
 3210         addl    %ebp,%eax
 3211         movl    28(%esp),%ebp
 3212         adcl    $0,%ebx
 3213         adcl    $0,%ecx
 3214         adcl    $0,%edx
 3215         adcl    $0,%esi
              /*
               * Keep a copy of the current value in xmm0..xmm3 while
               * computing value + 5 in eax:ebx:ecx:edx:esi.
               */
 3216         movd    %eax,%xmm0
 3217         addl    $5,%eax
 3218         movd    %ebx,%xmm1
 3219         adcl    $0,%ebx
 3220         movd    %ecx,%xmm2
 3221         adcl    $0,%ecx
 3222         movd    %edx,%xmm3
 3223         adcl    $0,%edx
 3224         adcl    $0,%esi
              /*
               * esi = 0 - (esi >> 2): all ones when value + 5 reached bit
               * 130, zero otherwise.  Used as a select mask so that no
               * branch depends on the value.
               */
 3225         shrl    $2,%esi
 3226         negl    %esi
              /*
               * Masked value + 5 is parked in the output buffer, the saved
               * value is restored from xmm0..xmm3 and masked with the
               * complement, and the two halves are OR-ed together.
               */
 3227         andl    %esi,%eax
 3228         andl    %esi,%ebx
 3229         andl    %esi,%ecx
 3230         andl    %esi,%edx
 3231         movl    %eax,(%edi)
 3232         movd    %xmm0,%eax
 3233         movl    %ebx,4(%edi)
 3234         movd    %xmm1,%ebx
 3235         movl    %ecx,8(%edi)
 3236         movd    %xmm2,%ecx
 3237         movl    %edx,12(%edi)
 3238         movd    %xmm3,%edx
 3239         notl    %esi
 3240         andl    %esi,%eax
 3241         andl    %esi,%ebx
 3242         orl     (%edi),%eax
 3243         andl    %esi,%ecx
 3244         orl     4(%edi),%ebx
 3245         andl    %esi,%edx
 3246         orl     8(%edi),%ecx
 3247         orl     12(%edi),%edx
              /*
               * Add the four arg2 words as one 128-bit addition and store
               * the 16-byte result; the carry out of the top word is
               * dropped.
               */
 3248         addl    (%ebp),%eax
 3249         adcl    4(%ebp),%ebx
 3250         movl    %eax,(%edi)
 3251         adcl    8(%ebp),%ecx
 3252         movl    %ebx,4(%edi)
 3253         adcl    12(%ebp),%edx
 3254         movl    %ecx,8(%edi)
 3255         movl    %edx,12(%edi)
 3256         popl    %edi
 3257         popl    %esi
 3258         popl    %ebx
 3259         popl    %ebp
 3260         ret
 3261 .size   _poly1305_emit_sse2,.-_poly1305_emit_sse2
 3262 .align  32
 3263 .type   _poly1305_init_avx2,@function
 3264 .align  16
      /*
       * _poly1305_init_avx2
       *
       * Internal helper with a register-based interface; it reads no stack
       * arguments.
       *   In:  %edi = base of the state block.  16 bytes are read from
       *               24(%edi); presumably the clamped key r -- confirm
       *               against poly1305_init.
       *        %ebx = base of a constant table; the one visible caller,
       *               _poly1305_blocks_avx2, passes the address of
       *               .Lconst_sse2.  The vector at 64(%ebx) is used as the
       *               limb mask (presumably 2^26-1 per 64-bit lane; the
       *               table itself is not visible here).
       *   Out: 144 bytes written at 48(%edi)..191(%edi): five vectors of
       *        limbs 0..4 followed by four vectors holding 5x limbs 1..4.
       *        %edi is returned unchanged.
       *   Clobbers: %ebp (holds the entry %esp), %ecx, %edx, %xmm0-%xmm7,
       *        flags.  %esp is lowered and 16-byte aligned for a 224-byte
       *        scratch frame and restored before ret.
       *
       * The input is split into five 26-bit limbs and multiplied by itself
       * twice (limb products plus the x5 wrap-around of the high limbs,
       * i.e. arithmetic modulo 2^130 - 5 with partially reduced limbs).
       * Pass 1 yields x^2; pass 2 multiplies the pair [x^2, x] by x^2 to
       * yield [x^4, x^3].
       */
 3265 _poly1305_init_avx2:
 3266         vmovdqu 24(%edi),%xmm4
 3267         leal    48(%edi),%edi
              /* Scratch frame: %ebp remembers %esp, which still points at
               * the return address pushed by the caller. */
 3268         movl    %esp,%ebp
 3269         subl    $224,%esp
 3270         andl    $-16,%esp
              /*
               * Split the 128-bit input into limbs xmm0..xmm4 at bit offsets
               * 0, 26, 52 (= 48+4), 78 (= 48+30) and 104 (= 13 bytes).  The
               * top limb has only 24 bits, so it needs no mask.  Only the
               * low 64-bit lane of each register matters on the first pass;
               * the high lane is replaced by vpunpcklqdq before pass 2.
               */
 3271         vmovdqa 64(%ebx),%xmm7
 3272         vpand   %xmm7,%xmm4,%xmm0
 3273         vpsrlq  $26,%xmm4,%xmm1
 3274         vpsrldq $6,%xmm4,%xmm3
 3275         vpand   %xmm7,%xmm1,%xmm1
 3276         vpsrlq  $4,%xmm3,%xmm2
 3277         vpsrlq  $30,%xmm3,%xmm3
 3278         vpand   %xmm7,%xmm2,%xmm2
 3279         vpand   %xmm7,%xmm3,%xmm3
 3280         vpsrldq $13,%xmm4,%xmm4
              /* %edx -> second scratch area at 144(%esp); %ecx = pass count. */
 3281         leal    144(%esp),%edx
 3282         movl    $2,%ecx
 3283 .L018square:
              /* Save the multiplicand limbs a0..a4 at 0..64(%esp). */
 3284         vmovdqa %xmm0,(%esp)
 3285         vmovdqa %xmm1,16(%esp)
 3286         vmovdqa %xmm2,32(%esp)
 3287         vmovdqa %xmm3,48(%esp)
 3288         vmovdqa %xmm4,64(%esp)
              /* 5*a1..5*a4 (computed as (a<<2)+a) at 80..128(%esp). */
 3289         vpslld  $2,%xmm1,%xmm6
 3290         vpslld  $2,%xmm2,%xmm5
 3291         vpaddd  %xmm1,%xmm6,%xmm6
 3292         vpaddd  %xmm2,%xmm5,%xmm5
 3293         vmovdqa %xmm6,80(%esp)
 3294         vmovdqa %xmm5,96(%esp)
 3295         vpslld  $2,%xmm3,%xmm6
 3296         vpslld  $2,%xmm4,%xmm5
 3297         vpaddd  %xmm3,%xmm6,%xmm6
 3298         vpaddd  %xmm4,%xmm5,%xmm5
 3299         vmovdqa %xmm6,112(%esp)
 3300         vmovdqa %xmm5,128(%esp)
              /*
               * Multiplier b0..b4: the low 64-bit lane of each limb copied
               * into both lanes (vpshufd $68) and stored at 0..64(%edx).
               * xmm6 keeps the unshuffled a1.
               */
 3301         vpshufd $68,%xmm0,%xmm5
 3302         vmovdqa %xmm1,%xmm6
 3303         vpshufd $68,%xmm1,%xmm1
 3304         vpshufd $68,%xmm2,%xmm2
 3305         vpshufd $68,%xmm3,%xmm3
 3306         vpshufd $68,%xmm4,%xmm4
 3307         vmovdqa %xmm5,(%edx)
 3308         vmovdqa %xmm1,16(%edx)
 3309         vmovdqa %xmm2,32(%edx)
 3310         vmovdqa %xmm3,48(%edx)
 3311         vmovdqa %xmm4,64(%edx)
              /*
               * Accumulate the product limbs d0..d4 in xmm0..xmm4:
               *   d0 = a0*b0 + 5*(a1*b4 + a2*b3 + a3*b2 + a4*b1)
               *   d1 = a0*b1 + a1*b0 + 5*(a2*b4 + a3*b3 + a4*b2)
               *   d2 = a0*b2 + a1*b1 + a2*b0 + 5*(a3*b4 + a4*b3)
               *   d3 = a0*b3 + a1*b2 + a2*b1 + a3*b0 + 5*(a4*b4)
               *   d4 = a0*b4 + a1*b3 + a2*b2 + a3*b1 + a4*b0
               * First the a0 row, then a1, a2, a3, a4 with their 5x forms
               * reloaded from the %esp area as needed.
               */
 3312         vpmuludq        %xmm0,%xmm4,%xmm4
 3313         vpmuludq        %xmm0,%xmm3,%xmm3
 3314         vpmuludq        %xmm0,%xmm2,%xmm2
 3315         vpmuludq        %xmm0,%xmm1,%xmm1
 3316         vpmuludq        %xmm0,%xmm5,%xmm0
 3317         vpmuludq        48(%edx),%xmm6,%xmm5
 3318         vpaddq  %xmm5,%xmm4,%xmm4
 3319         vpmuludq        32(%edx),%xmm6,%xmm7
 3320         vpaddq  %xmm7,%xmm3,%xmm3
 3321         vpmuludq        16(%edx),%xmm6,%xmm5
 3322         vpaddq  %xmm5,%xmm2,%xmm2
 3323         vmovdqa 80(%esp),%xmm7
 3324         vpmuludq        (%edx),%xmm6,%xmm6
 3325         vpaddq  %xmm6,%xmm1,%xmm1
 3326         vmovdqa 32(%esp),%xmm5
 3327         vpmuludq        64(%edx),%xmm7,%xmm7
 3328         vpaddq  %xmm7,%xmm0,%xmm0
 3329         vpmuludq        32(%edx),%xmm5,%xmm6
 3330         vpaddq  %xmm6,%xmm4,%xmm4
 3331         vpmuludq        16(%edx),%xmm5,%xmm7
 3332         vpaddq  %xmm7,%xmm3,%xmm3
 3333         vmovdqa 96(%esp),%xmm6
 3334         vpmuludq        (%edx),%xmm5,%xmm5
 3335         vpaddq  %xmm5,%xmm2,%xmm2
 3336         vpmuludq        64(%edx),%xmm6,%xmm7
 3337         vpaddq  %xmm7,%xmm1,%xmm1
 3338         vmovdqa 48(%esp),%xmm5
 3339         vpmuludq        48(%edx),%xmm6,%xmm6
 3340         vpaddq  %xmm6,%xmm0,%xmm0
 3341         vpmuludq        16(%edx),%xmm5,%xmm7
 3342         vpaddq  %xmm7,%xmm4,%xmm4
 3343         vmovdqa 112(%esp),%xmm6
 3344         vpmuludq        (%edx),%xmm5,%xmm5
 3345         vpaddq  %xmm5,%xmm3,%xmm3
 3346         vpmuludq        64(%edx),%xmm6,%xmm7
 3347         vpaddq  %xmm7,%xmm2,%xmm2
 3348         vpmuludq        48(%edx),%xmm6,%xmm5
 3349         vpaddq  %xmm5,%xmm1,%xmm1
 3350         vmovdqa 64(%esp),%xmm7
 3351         vpmuludq        32(%edx),%xmm6,%xmm6
 3352         vpaddq  %xmm6,%xmm0,%xmm0
 3353         vmovdqa 128(%esp),%xmm5
 3354         vpmuludq        (%edx),%xmm7,%xmm7
 3355         vpaddq  %xmm7,%xmm4,%xmm4
 3356         vpmuludq        64(%edx),%xmm5,%xmm6
 3357         vpaddq  %xmm6,%xmm3,%xmm3
 3358         vpmuludq        16(%edx),%xmm5,%xmm7
 3359         vpaddq  %xmm7,%xmm0,%xmm0
 3360         vpmuludq        32(%edx),%xmm5,%xmm6
 3361         vpaddq  %xmm6,%xmm1,%xmm1
 3362         vmovdqa 64(%ebx),%xmm7
 3363         vpmuludq        48(%edx),%xmm5,%xmm5
 3364         vpaddq  %xmm5,%xmm2,%xmm2
              /*
               * Carry propagation back to limbs of roughly 26 bits, in the
               * order d3->d4, d0->d1, d4->d0 (carry times 5, added as
               * c + 4c), d1->d2, d2->d3, d0->d1, d3->d4.  xmm7 is the mask
               * reloaded from 64(%ebx) above.
               */
 3365         vpsrlq  $26,%xmm3,%xmm5
 3366         vpand   %xmm7,%xmm3,%xmm3
 3367         vpsrlq  $26,%xmm0,%xmm6
 3368         vpand   %xmm7,%xmm0,%xmm0
 3369         vpaddq  %xmm5,%xmm4,%xmm4
 3370         vpaddq  %xmm6,%xmm1,%xmm1
 3371         vpsrlq  $26,%xmm4,%xmm5
 3372         vpand   %xmm7,%xmm4,%xmm4
 3373         vpsrlq  $26,%xmm1,%xmm6
 3374         vpand   %xmm7,%xmm1,%xmm1
 3375         vpaddq  %xmm6,%xmm2,%xmm2
 3376         vpaddd  %xmm5,%xmm0,%xmm0
 3377         vpsllq  $2,%xmm5,%xmm5
 3378         vpsrlq  $26,%xmm2,%xmm6
 3379         vpand   %xmm7,%xmm2,%xmm2
 3380         vpaddd  %xmm5,%xmm0,%xmm0
 3381         vpaddd  %xmm6,%xmm3,%xmm3
 3382         vpsrlq  $26,%xmm3,%xmm6
 3383         vpsrlq  $26,%xmm0,%xmm5
 3384         vpand   %xmm7,%xmm0,%xmm0
 3385         vpand   %xmm7,%xmm3,%xmm3
 3386         vpaddd  %xmm5,%xmm1,%xmm1
 3387         vpaddd  %xmm6,%xmm4,%xmm4
 3388         decl    %ecx
 3389         jz      .L019square_break
              /*
               * End of pass 1: build the pass-2 multiplicand with the new
               * product in the low lane and the saved pass-1 input (low lane
               * of the %esp copy) in the high lane.
               */
 3390         vpunpcklqdq     (%esp),%xmm0,%xmm0
 3391         vpunpcklqdq     16(%esp),%xmm1,%xmm1
 3392         vpunpcklqdq     32(%esp),%xmm2,%xmm2
 3393         vpunpcklqdq     48(%esp),%xmm3,%xmm3
 3394         vpunpcklqdq     64(%esp),%xmm4,%xmm4
 3395         jmp     .L018square
 3396 .L019square_break:
              /*
               * Interleave the pass-2 results (moved to the odd dwords) with
               * the pass-2 inputs still at 0..64(%esp) (even dwords), then
               * vpshufd $141 reorders the dwords as [1,3,0,2].  Each vector
               * therefore holds, from dword 0 upward, one limb of
               * x^4, x^3, x^2, x.
               */
 3397         vpsllq  $32,%xmm0,%xmm0
 3398         vpsllq  $32,%xmm1,%xmm1
 3399         vpsllq  $32,%xmm2,%xmm2
 3400         vpsllq  $32,%xmm3,%xmm3
 3401         vpsllq  $32,%xmm4,%xmm4
 3402         vpor    (%esp),%xmm0,%xmm0
 3403         vpor    16(%esp),%xmm1,%xmm1
 3404         vpor    32(%esp),%xmm2,%xmm2
 3405         vpor    48(%esp),%xmm3,%xmm3
 3406         vpor    64(%esp),%xmm4,%xmm4
 3407         vpshufd $141,%xmm0,%xmm0
 3408         vpshufd $141,%xmm1,%xmm1
 3409         vpshufd $141,%xmm2,%xmm2
 3410         vpshufd $141,%xmm3,%xmm3
 3411         vpshufd $141,%xmm4,%xmm4
              /* Limbs 0..4 go to 0..64(%edi), i.e. offsets 48..112 of the
               * state block as seen by the caller. */
 3412         vmovdqu %xmm0,(%edi)
 3413         vmovdqu %xmm1,16(%edi)
 3414         vmovdqu %xmm2,32(%edi)
 3415         vmovdqu %xmm3,48(%edi)
 3416         vmovdqu %xmm4,64(%edi)
              /* 5x limbs 1..4 go to 80..128(%edi). */
 3417         vpslld  $2,%xmm1,%xmm6
 3418         vpslld  $2,%xmm2,%xmm5
 3419         vpaddd  %xmm1,%xmm6,%xmm6
 3420         vpaddd  %xmm2,%xmm5,%xmm5
 3421         vmovdqu %xmm6,80(%edi)
 3422         vmovdqu %xmm5,96(%edi)
 3423         vpslld  $2,%xmm3,%xmm6
 3424         vpslld  $2,%xmm4,%xmm5
 3425         vpaddd  %xmm3,%xmm6,%xmm6
 3426         vpaddd  %xmm4,%xmm5,%xmm5
 3427         vmovdqu %xmm6,112(%edi)
 3428         vmovdqu %xmm5,128(%edi)
              /* Drop the scratch frame and undo the +48 bias on %edi. */
 3429         movl    %ebp,%esp
 3430         leal    -48(%edi),%edi
 3431         ret
 3432 .size   _poly1305_init_avx2,.-_poly1305_init_avx2
 3433 .align  32
 3434 .type   _poly1305_blocks_avx2,@function
 3435 .align  16
 3436 _poly1305_blocks_avx2:
 3437         pushl   %ebp
 3438         pushl   %ebx
 3439         pushl   %esi
 3440         pushl   %edi
 3441         movl    20(%esp),%edi
 3442         movl    24(%esp),%esi
 3443         movl    28(%esp),%ecx
 3444         movl    20(%edi),%eax
 3445         andl    $-16,%ecx
 3446         jz      .L020nodata
 3447         cmpl    $64,%ecx
 3448         jae     .L021enter_avx2
 3449         testl   %eax,%eax
 3450         jz      .Lenter_blocks
 3451 .L021enter_avx2:
 3452         vzeroupper
 3453         call    .L022pic_point
 3454 .L022pic_point:
 3455         popl    %ebx
 3456         leal    .Lconst_sse2-.L022pic_point(%ebx),%ebx
 3457         testl   %eax,%eax
 3458         jnz     .L023base2_26
 3459         call    _poly1305_init_avx2
 3460         movl    (%edi),%eax
 3461         movl    3(%edi),%ecx
 3462         movl    6(%edi),%edx
 3463         movl    9(%edi),%esi
 3464         movl    13(%edi),%ebp
 3465         shrl    $2,%ecx
 3466         andl    $67108863,%eax
 3467         shrl    $4,%edx
 3468         andl    $67108863,%ecx
 3469         shrl    $6,%esi
 3470         andl    $67108863,%edx
 3471         movl    %eax,(%edi)
 3472         movl    %ecx,4(%edi)
 3473         movl    %edx,8(%edi)
 3474         movl    %esi,12(%edi)
 3475         movl    %ebp,16(%edi)
 3476         movl    $1,20(%edi)
 3477         movl    24(%esp),%esi
 3478         movl    28(%esp),%ecx
 3479 .L023base2_26:
 3480         movl    32(%esp),%eax
 3481         movl    %esp,%ebp
 3482         subl    $448,%esp
 3483         andl    $-512,%esp
 3484         vmovdqu 48(%edi),%xmm0
 3485         leal    288(%esp),%edx
 3486         vmovdqu 64(%edi),%xmm1
 3487         vmovdqu 80(%edi),%xmm2
 3488         vmovdqu 96(%edi),%xmm3
 3489         vmovdqu 112(%edi),%xmm4
 3490         leal    48(%edi),%edi
 3491         vpermq  $64,%ymm0,%ymm0
 3492         vpermq  $64,%ymm1,%ymm1
 3493         vpermq  $64,%ymm2,%ymm2
 3494         vpermq  $64,%ymm3,%ymm3
 3495         vpermq  $64,%ymm4,%ymm4
 3496         vpshufd $200,%ymm0,%ymm0
 3497         vpshufd $200,%ymm1,%ymm1
 3498         vpshufd $200,%ymm2,%ymm2
 3499         vpshufd $200,%ymm3,%ymm3
 3500         vpshufd $200,%ymm4,%ymm4
 3501         vmovdqa %ymm0,-128(%edx)
 3502         vmovdqu 80(%edi),%xmm0
 3503         vmovdqa %ymm1,-96(%edx)
 3504         vmovdqu 96(%edi),%xmm1
 3505         vmovdqa %ymm2,-64(%edx)
 3506         vmovdqu 112(%edi),%xmm2
 3507         vmovdqa %ymm3,-32(%edx)
 3508         vmovdqu 128(%edi),%xmm3
 3509         vmovdqa %ymm4,(%edx)
 3510         vpermq  $64,%ymm0,%ymm0
 3511         vpermq  $64,%ymm1,%ymm1
 3512         vpermq  $64,%ymm2,%ymm2
 3513         vpermq  $64,%ymm3,%ymm3
 3514         vpshufd $200,%ymm0,%ymm0
 3515         vpshufd $200,%ymm1,%ymm1
 3516         vpshufd $200,%ymm2,%ymm2
 3517         vpshufd $200,%ymm3,%ymm3
 3518         vmovdqa %ymm0,32(%edx)
 3519         vmovd   -48(%edi),%xmm0
 3520         vmovdqa %ymm1,64(%edx)
 3521         vmovd   -44(%edi),%xmm1
 3522         vmovdqa %ymm2,96(%edx)
 3523         vmovd   -40(%edi),%xmm2
 3524         vmovdqa %ymm3,128(%edx)
 3525         vmovd   -36(%edi),%xmm3
 3526         vmovd   -32(%edi),%xmm4
 3527         vmovdqa 64(%ebx),%ymm7
 3528         negl    %eax
 3529         testl   $63,%ecx
 3530         jz      .L024even
 3531         movl    %ecx,%edx
 3532         andl    $-64,%ecx
 3533         andl    $63,%edx
 3534         vmovdqu (%esi),%xmm5
 3535         cmpl    $32,%edx
 3536         jb      .L025one
 3537         vmovdqu 16(%esi),%xmm6
 3538         je      .L026two
 3539         vinserti128     $1,32(%esi),%ymm5,%ymm5
 3540         leal    48(%esi),%esi
 3541         leal    8(%ebx),%ebx
 3542         leal    296(%esp),%edx
 3543         jmp     .L027tail
 3544 .L026two:
 3545         leal    32(%esi),%esi
 3546         leal    16(%ebx),%ebx
 3547         leal    304(%esp),%edx
 3548         jmp     .L027tail
 3549 .L025one:
 3550         leal    16(%esi),%esi
 3551         vpxor   %ymm6,%ymm6,%ymm6
 3552         leal    32(%ebx,%eax,8),%ebx
 3553         leal    312(%esp),%edx
 3554         jmp     .L027tail
 3555 .align  32
 3556 .L024even:
 3557         vmovdqu (%esi),%xmm5
 3558         vmovdqu 16(%esi),%xmm6
 3559         vinserti128     $1,32(%esi),%ymm5,%ymm5
 3560         vinserti128     $1,48(%esi),%ymm6,%ymm6
 3561         leal    64(%esi),%esi
 3562         subl    $64,%ecx
 3563         jz      .L027tail
 3564 .L028loop:
 3565         vmovdqa %ymm2,64(%esp)
 3566         vpsrldq $6,%ymm5,%ymm2
 3567         vmovdqa %ymm0,(%esp)
 3568         vpsrldq $6,%ymm6,%ymm0
 3569         vmovdqa %ymm1,32(%esp)
 3570         vpunpckhqdq     %ymm6,%ymm5,%ymm1
 3571         vpunpcklqdq     %ymm6,%ymm5,%ymm5
 3572         vpunpcklqdq     %ymm0,%ymm2,%ymm2
 3573         vpsrlq  $30,%ymm2,%ymm0
 3574         vpsrlq  $4,%ymm2,%ymm2
 3575         vpsrlq  $26,%ymm5,%ymm6
 3576         vpsrlq  $40,%ymm1,%ymm1
 3577         vpand   %ymm7,%ymm2,%ymm2
 3578         vpand   %ymm7,%ymm5,%ymm5
 3579         vpand   %ymm7,%ymm6,%ymm6
 3580         vpand   %ymm7,%ymm0,%ymm0
 3581         vpor    (%ebx),%ymm1,%ymm1
 3582         vpaddq  64(%esp),%ymm2,%ymm2
 3583         vpaddq  (%esp),%ymm5,%ymm5
 3584         vpaddq  32(%esp),%ymm6,%ymm6
 3585         vpaddq  %ymm3,%ymm0,%ymm0
 3586         vpaddq  %ymm4,%ymm1,%ymm1
 3587         vpmuludq        -96(%edx),%ymm2,%ymm3
 3588         vmovdqa %ymm6,32(%esp)
 3589         vpmuludq        -64(%edx),%ymm2,%ymm4
 3590         vmovdqa %ymm0,96(%esp)
 3591         vpmuludq        96(%edx),%ymm2,%ymm0
 3592         vmovdqa %ymm1,128(%esp)
 3593         vpmuludq        128(%edx),%ymm2,%ymm1
 3594         vpmuludq        -128(%edx),%ymm2,%ymm2
 3595         vpmuludq        -32(%edx),%ymm5,%ymm7
 3596         vpaddq  %ymm7,%ymm3,%ymm3
 3597         vpmuludq        (%edx),%ymm5,%ymm6
 3598         vpaddq  %ymm6,%ymm4,%ymm4
 3599         vpmuludq        -128(%edx),%ymm5,%ymm7
 3600         vpaddq  %ymm7,%ymm0,%ymm0
 3601         vmovdqa 32(%esp),%ymm7
 3602         vpmuludq        -96(%edx),%ymm5,%ymm6
 3603         vpaddq  %ymm6,%ymm1,%ymm1
 3604         vpmuludq        -64(%edx),%ymm5,%ymm5
 3605         vpaddq  %ymm5,%ymm2,%ymm2
 3606         vpmuludq        -64(%edx),%ymm7,%ymm6
 3607         vpaddq  %ymm6,%ymm3,%ymm3
 3608         vpmuludq        -32(%edx),%ymm7,%ymm5
 3609         vpaddq  %ymm5,%ymm4,%ymm4
 3610         vpmuludq        128(%edx),%ymm7,%ymm6
 3611         vpaddq  %ymm6,%ymm0,%ymm0
 3612         vmovdqa 96(%esp),%ymm6
 3613         vpmuludq        -128(%edx),%ymm7,%ymm5
 3614         vpaddq  %ymm5,%ymm1,%ymm1
 3615         vpmuludq        -96(%edx),%ymm7,%ymm7
 3616         vpaddq  %ymm7,%ymm2,%ymm2
 3617         vpmuludq        -128(%edx),%ymm6,%ymm5
 3618         vpaddq  %ymm5,%ymm3,%ymm3
 3619         vpmuludq        -96(%edx),%ymm6,%ymm7
 3620         vpaddq  %ymm7,%ymm4,%ymm4
 3621         vpmuludq        64(%edx),%ymm6,%ymm5
 3622         vpaddq  %ymm5,%ymm0,%ymm0
 3623         vmovdqa 128(%esp),%ymm5
 3624         vpmuludq        96(%edx),%ymm6,%ymm7
 3625         vpaddq  %ymm7,%ymm1,%ymm1
 3626         vpmuludq        128(%edx),%ymm6,%ymm6
 3627         vpaddq  %ymm6,%ymm2,%ymm2
 3628         vpmuludq        128(%edx),%ymm5,%ymm7
 3629         vpaddq  %ymm7,%ymm3,%ymm3
 3630         vpmuludq        32(%edx),%ymm5,%ymm6
 3631         vpaddq  %ymm6,%ymm0,%ymm0
 3632         vpmuludq        -128(%edx),%ymm5,%ymm7
 3633         vpaddq  %ymm7,%ymm4,%ymm4
 3634         vmovdqa 64(%ebx),%ymm7
 3635         vpmuludq        64(%edx),%ymm5,%ymm6
 3636         vpaddq  %ymm6,%ymm1,%ymm1
 3637         vpmuludq        96(%edx),%ymm5,%ymm5
 3638         vpaddq  %ymm5,%ymm2,%ymm2
 3639         vpsrlq  $26,%ymm3,%ymm5
 3640         vpand   %ymm7,%ymm3,%ymm3
 3641         vpsrlq  $26,%ymm0,%ymm6
 3642         vpand   %ymm7,%ymm0,%ymm0
 3643         vpaddq  %ymm5,%ymm4,%ymm4
 3644         vpaddq  %ymm6,%ymm1,%ymm1
 3645         vpsrlq  $26,%ymm4,%ymm5
 3646         vpand   %ymm7,%ymm4,%ymm4
 3647         vpsrlq  $26,%ymm1,%ymm6
 3648         vpand   %ymm7,%ymm1,%ymm1
 3649         vpaddq  %ymm6,%ymm2,%ymm2
 3650         vpaddq  %ymm5,%ymm0,%ymm0
 3651         vpsllq  $2,%ymm5,%ymm5
 3652         vpsrlq  $26,%ymm2,%ymm6
 3653         vpand   %ymm7,%ymm2,%ymm2
 3654         vpaddq  %ymm5,%ymm0,%ymm0
 3655         vpaddq  %ymm6,%ymm3,%ymm3
 3656         vpsrlq  $26,%ymm3,%ymm6
 3657         vpsrlq  $26,%ymm0,%ymm5
 3658         vpand   %ymm7,%ymm0,%ymm0
 3659         vpand   %ymm7,%ymm3,%ymm3
 3660         vpaddq  %ymm5,%ymm1,%ymm1
 3661         vpaddq  %ymm6,%ymm4,%ymm4
 3662         vmovdqu (%esi),%xmm5
 3663         vmovdqu 16(%esi),%xmm6
 3664         vinserti128     $1,32(%esi),%ymm5,%ymm5
 3665         vinserti128     $1,48(%esi),%ymm6,%ymm6
 3666         leal    64(%esi),%esi
 3667         subl    $64,%ecx
 3668         jnz     .L028loop
 3669 .L027tail:
 3670         vmovdqa %ymm2,64(%esp)
 3671         vpsrldq $6,%ymm5,%ymm2
 3672         vmovdqa %ymm0,(%esp)
 3673         vpsrldq $6,%ymm6,%ymm0
 3674         vmovdqa %ymm1,32(%esp)
 3675         vpunpckhqdq     %ymm6,%ymm5,%ymm1
 3676         vpunpcklqdq     %ymm6,%ymm5,%ymm5
 3677         vpunpcklqdq     %ymm0,%ymm2,%ymm2
 3678         vpsrlq  $30,%ymm2,%ymm0
 3679         vpsrlq  $4,%ymm2,%ymm2
 3680         vpsrlq  $26,%ymm5,%ymm6
 3681         vpsrlq  $40,%ymm1,%ymm1
 3682         vpand   %ymm7,%ymm2,%ymm2
 3683         vpand   %ymm7,%ymm5,%ymm5
 3684         vpand   %ymm7,%ymm6,%ymm6
 3685         vpand   %ymm7,%ymm0,%ymm0
 3686         vpor    (%ebx),%ymm1,%ymm1
 3687         andl    $-64,%ebx
 3688         vpaddq  64(%esp),%ymm2,%ymm2
 3689         vpaddq  (%esp),%ymm5,%ymm5
 3690         vpaddq  32(%esp),%ymm6,%ymm6
 3691         vpaddq  %ymm3,%ymm0,%ymm0
 3692         vpaddq  %ymm4,%ymm1,%ymm1
 3693         vpmuludq        -92(%edx),%ymm2,%ymm3
 3694         vmovdqa %ymm6,32(%esp)
 3695         vpmuludq        -60(%edx),%ymm2,%ymm4
 3696         vmovdqa %ymm0,96(%esp)
 3697         vpmuludq        100(%edx),%ymm2,%ymm0
 3698         vmovdqa %ymm1,128(%esp)
 3699         vpmuludq        132(%edx),%ymm2,%ymm1
 3700         vpmuludq        -124(%edx),%ymm2,%ymm2
 3701         vpmuludq        -28(%edx),%ymm5,%ymm7
 3702         vpaddq  %ymm7,%ymm3,%ymm3
 3703         vpmuludq        4(%edx),%ymm5,%ymm6
 3704         vpaddq  %ymm6,%ymm4,%ymm4
 3705         vpmuludq        -124(%edx),%ymm5,%ymm7
 3706         vpaddq  %ymm7,%ymm0,%ymm0
 3707         vmovdqa 32(%esp),%ymm7
 3708         vpmuludq        -92(%edx),%ymm5,%ymm6
 3709         vpaddq  %ymm6,%ymm1,%ymm1
 3710         vpmuludq        -60(%edx),%ymm5,%ymm5
 3711         vpaddq  %ymm5,%ymm2,%ymm2
 3712         vpmuludq        -60(%edx),%ymm7,%ymm6
 3713         vpaddq  %ymm6,%ymm3,%ymm3
 3714         vpmuludq        -28(%edx),%ymm7,%ymm5
 3715         vpaddq  %ymm5,%ymm4,%ymm4
 3716         vpmuludq        132(%edx),%ymm7,%ymm6
 3717         vpaddq  %ymm6,%ymm0,%ymm0
 3718         vmovdqa 96(%esp),%ymm6
 3719         vpmuludq        -124(%edx),%ymm7,%ymm5
 3720         vpaddq  %ymm5,%ymm1,%ymm1
 3721         vpmuludq        -92(%edx),%ymm7,%ymm7
 3722         vpaddq  %ymm7,%ymm2,%ymm2
 3723         vpmuludq        -124(%edx),%ymm6,%ymm5
 3724         vpaddq  %ymm5,%ymm3,%ymm3
 3725         vpmuludq        -92(%edx),%ymm6,%ymm7
 3726         vpaddq  %ymm7,%ymm4,%ymm4
 3727         vpmuludq        68(%edx),%ymm6,%ymm5
 3728         vpaddq  %ymm5,%ymm0,%ymm0
 3729         vmovdqa 128(%esp),%ymm5
 3730         vpmuludq        100(%edx),%ymm6,%ymm7
 3731         vpaddq  %ymm7,%ymm1,%ymm1
 3732         vpmuludq        132(%edx),%ymm6,%ymm6
 3733         vpaddq  %ymm6,%ymm2,%ymm2
 3734         vpmuludq        132(%edx),%ymm5,%ymm7
 3735         vpaddq  %ymm7,%ymm3,%ymm3
 3736         vpmuludq        36(%edx),%ymm5,%ymm6
 3737         vpaddq  %ymm6,%ymm0,%ymm0
 3738         vpmuludq        -124(%edx),%ymm5,%ymm7
 3739         vpaddq  %ymm7,%ymm4,%ymm4
 3740         vmovdqa 64(%ebx),%ymm7
 3741         vpmuludq        68(%edx),%ymm5,%ymm6
 3742         vpaddq  %ymm6,%ymm1,%ymm1
 3743         vpmuludq        100(%edx),%ymm5,%ymm5
 3744         vpaddq  %ymm5,%ymm2,%ymm2
 3745         vpsrldq $8,%ymm4,%ymm5
 3746         vpsrldq $8,%ymm3,%ymm6
 3747         vpaddq  %ymm5,%ymm4,%ymm4
 3748         vpsrldq $8,%ymm0,%ymm5
 3749         vpaddq  %ymm6,%ymm3,%ymm3
 3750         vpsrldq $8,%ymm1,%ymm6
 3751         vpaddq  %ymm5,%ymm0,%ymm0
 3752         vpsrldq $8,%ymm2,%ymm5
 3753         vpaddq  %ymm6,%ymm1,%ymm1
 3754         vpermq  $2,%ymm4,%ymm6
 3755         vpaddq  %ymm5,%ymm2,%ymm2
 3756         vpermq  $2,%ymm3,%ymm5
 3757         vpaddq  %ymm6,%ymm4,%ymm4
 3758         vpermq  $2,%ymm0,%ymm6
 3759         vpaddq  %ymm5,%ymm3,%ymm3
 3760         vpermq  $2,%ymm1,%ymm5
 3761         vpaddq  %ymm6,%ymm0,%ymm0
 3762         vpermq  $2,%ymm2,%ymm6
 3763         vpaddq  %ymm5,%ymm1,%ymm1
 3764         vpaddq  %ymm6,%ymm2,%ymm2
 3765         vpsrlq  $26,%ymm3,%ymm5
 3766         vpand   %ymm7,%ymm3,%ymm3
 3767         vpsrlq  $26,%ymm0,%ymm6
 3768         vpand   %ymm7,%ymm0,%ymm0
 3769         vpaddq  %ymm5,%ymm4,%ymm4
 3770         vpaddq  %ymm6,%ymm1,%ymm1
 3771         vpsrlq  $26,%ymm4,%ymm5
 3772         vpand   %ymm7,%ymm4,%ymm4
 3773         vpsrlq  $26,%ymm1,%ymm6
 3774         vpand   %ymm7,%ymm1,%ymm1
 3775         vpaddq  %ymm6,%ymm2,%ymm2
 3776         vpaddq  %ymm5,%ymm0,%ymm0
 3777         vpsllq  $2,%ymm5,%ymm5
 3778         vpsrlq  $26,%ymm2,%ymm6
 3779         vpand   %ymm7,%ymm2,%ymm2
 3780         vpaddq  %ymm5,%ymm0,%ymm0
 3781         vpaddq  %ymm6,%ymm3,%ymm3
 3782         vpsrlq  $26,%ymm3,%ymm6
 3783         vpsrlq  $26,%ymm0,%ymm5
 3784         vpand   %ymm7,%ymm0,%ymm0
 3785         vpand   %ymm7,%ymm3,%ymm3
 3786         vpaddq  %ymm5,%ymm1,%ymm1
 3787         vpaddq  %ymm6,%ymm4,%ymm4
 3788         cmpl    $0,%ecx
 3789         je      .L029done
 3790         vpshufd $252,%xmm0,%xmm0
 3791         leal    288(%esp),%edx
 3792         vpshufd $252,%xmm1,%xmm1
 3793         vpshufd $252,%xmm2,%xmm2
 3794         vpshufd $252,%xmm3,%xmm3
 3795         vpshufd $252,%xmm4,%xmm4
 3796         jmp     .L024even
 3797 .align  16
 3798 .L029done:
 3799         vmovd   %xmm0,-48(%edi)
 3800         vmovd   %xmm1,-44(%edi)
 3801         vmovd   %xmm2,-40(%edi)
 3802         vmovd   %xmm3,-36(%edi)
 3803         vmovd   %xmm4,-32(%edi)
 3804         vzeroupper
 3805         movl    %ebp,%esp
 3806 .L020nodata:
 3807         popl    %edi
 3808         popl    %esi
 3809         popl    %ebx
 3810         popl    %ebp
 3811         ret
 3812 .size   _poly1305_blocks_avx2,.-_poly1305_blocks_avx2
 3813 .align  64  /* start the table on a 64-byte boundary */
 3814 .Lconst_sse2:  /* constant table: four .long rows at byte offsets 0, 32, 64 and 96, followed by an ASCII tag at offset 112 */
 3815 .long   16777216,0,16777216,0,16777216,0,16777216,0  /* 16777216 = 1<<24 in the low dword of each of four qwords; NOTE(review): presumably what the AVX2 code ORs in via (%ebx) -- confirm %ebx is based on this table */
 3816 .long   0,0,0,0,0,0,0,0  /* 32 bytes of zero */
 3817 .long   67108863,0,67108863,0,67108863,0,67108863,0  /* 67108863 = 0x3ffffff (2^26-1) in each of four qwords; NOTE(review): presumably the mask the AVX2 code loads from 64(%ebx) -- confirm */
 3818 .long   268435455,268435452,268435452,268435452  /* 0x0fffffff, then 0x0ffffffc three times; NOTE(review): looks like the Poly1305 clamp mask for r -- confirm at the use site */
 3819 .byte   80,111,108,121,49,51,48,53,32,102,111,114,32,120,56,54  /* ASCII "Poly1305 for x86" */
 3820 .byte   44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32  /* ASCII ", CRYPTOGAMS by " */
 3821 .byte   60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111  /* ASCII less-than sign, then "appro@openssl.o" */
 3822 .byte   114,103,62,0  /* ASCII "rg", greater-than sign, terminating NUL */
 3823 .align  4
 3824 .comm   OPENSSL_ia32cap_P,16,4  /* common symbol: 16 bytes, alignment 4; poly1305_init takes its address and loads the first dword */
 3825 #endif

Cache object: 52d1e58cbca27ccda0b7df76a3548961


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.