FreeBSD/Linux Kernel Cross Reference
sys/crypto/openssl/amd64/poly1305-x86_64.S


    1 /* $FreeBSD$ */
    2 /* Do not modify. This file is auto-generated from poly1305-x86_64.pl. */
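      /*
       * Poly1305 one-time authenticator, generated from OpenSSL's
       * poly1305-x86_64.pl.  Three code paths are provided: a scalar
       * base 2^64 path and vectorized AVX/AVX2 base 2^26 paths;
       * poly1305_init records which blocks/emit pair the caller should use.
       */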
    3 .text   
    4 
    5 
    6 
    7 .globl  poly1305_init
    8 .hidden poly1305_init
    9 .globl  poly1305_blocks
   10 .hidden poly1305_blocks
   11 .globl  poly1305_emit
   12 .hidden poly1305_emit
   13 
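      /*
       * poly1305_init(ctx=%rdi, key=%rsi, func_table=%rdx)
       * Zeroes the accumulator, clamps the first 16 key bytes into r using
       * the masks 0x0ffffffc0fffffff / 0x0ffffffc0ffffffc, and stores
       * pointers to the AVX or AVX2 blocks/emit routines in the function
       * table when OPENSSL_ia32cap_P advertises those extensions.
       * Returns 1 if a key was supplied, 0 otherwise.
       */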
   14 .type   poly1305_init,@function
   15 .align  32
   16 poly1305_init:
   17 .cfi_startproc  
   18         xorq    %rax,%rax
   19         movq    %rax,0(%rdi)
   20         movq    %rax,8(%rdi)
   21         movq    %rax,16(%rdi)
   22 
   23         cmpq    $0,%rsi
   24         je      .Lno_key
   25 
   26         leaq    poly1305_blocks(%rip),%r10
   27         leaq    poly1305_emit(%rip),%r11
   28         movq    OPENSSL_ia32cap_P+4(%rip),%r9
   29         leaq    poly1305_blocks_avx(%rip),%rax
   30         leaq    poly1305_emit_avx(%rip),%rcx
   31         btq     $28,%r9
   32         cmovcq  %rax,%r10
   33         cmovcq  %rcx,%r11
   34         leaq    poly1305_blocks_avx2(%rip),%rax
   35         btq     $37,%r9
   36         cmovcq  %rax,%r10
   37         movq    $0x0ffffffc0fffffff,%rax
   38         movq    $0x0ffffffc0ffffffc,%rcx
   39         andq    0(%rsi),%rax
   40         andq    8(%rsi),%rcx
   41         movq    %rax,24(%rdi)
   42         movq    %rcx,32(%rdi)
   43         movq    %r10,0(%rdx)
   44         movq    %r11,8(%rdx)
   45         movl    $1,%eax
   46 .Lno_key:
   47         .byte   0xf3,0xc3
   48 .cfi_endproc    
   49 .size   poly1305_init,.-poly1305_init
   50 
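      /*
       * poly1305_blocks(ctx=%rdi, inp=%rsi, len=%rdx, padbit=%rcx)
       * Scalar base 2^64 path: for every 16-byte block, add the block and
       * the pad bit to the accumulator, multiply by r and perform the lazy
       * reduction modulo 2^130-5.
       */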
   51 .type   poly1305_blocks,@function
   52 .align  32
   53 poly1305_blocks:
   54 .cfi_startproc  
   55 .Lblocks:
   56         shrq    $4,%rdx
   57         jz      .Lno_data
   58 
   59         pushq   %rbx
   60 .cfi_adjust_cfa_offset  8
   61 .cfi_offset     %rbx,-16
   62         pushq   %rbp
   63 .cfi_adjust_cfa_offset  8
   64 .cfi_offset     %rbp,-24
   65         pushq   %r12
   66 .cfi_adjust_cfa_offset  8
   67 .cfi_offset     %r12,-32
   68         pushq   %r13
   69 .cfi_adjust_cfa_offset  8
   70 .cfi_offset     %r13,-40
   71         pushq   %r14
   72 .cfi_adjust_cfa_offset  8
   73 .cfi_offset     %r14,-48
   74         pushq   %r15
   75 .cfi_adjust_cfa_offset  8
   76 .cfi_offset     %r15,-56
   77 .Lblocks_body:
   78 
   79         movq    %rdx,%r15
   80 
   81         movq    24(%rdi),%r11
   82         movq    32(%rdi),%r13
   83 
   84         movq    0(%rdi),%r14
   85         movq    8(%rdi),%rbx
   86         movq    16(%rdi),%rbp
   87 
   88         movq    %r13,%r12
   89         shrq    $2,%r13
   90         movq    %r12,%rax
   91         addq    %r12,%r13
   92         jmp     .Loop
   93 
   94 .align  32
   95 .Loop:
   96         addq    0(%rsi),%r14
   97         adcq    8(%rsi),%rbx
   98         leaq    16(%rsi),%rsi
   99         adcq    %rcx,%rbp
  100         mulq    %r14
  101         movq    %rax,%r9
  102         movq    %r11,%rax
  103         movq    %rdx,%r10
  104 
  105         mulq    %r14
  106         movq    %rax,%r14
  107         movq    %r11,%rax
  108         movq    %rdx,%r8
  109 
  110         mulq    %rbx
  111         addq    %rax,%r9
  112         movq    %r13,%rax
  113         adcq    %rdx,%r10
  114 
  115         mulq    %rbx
  116         movq    %rbp,%rbx
  117         addq    %rax,%r14
  118         adcq    %rdx,%r8
  119 
  120         imulq   %r13,%rbx
  121         addq    %rbx,%r9
  122         movq    %r8,%rbx
  123         adcq    $0,%r10
  124 
  125         imulq   %r11,%rbp
  126         addq    %r9,%rbx
  127         movq    $-4,%rax
  128         adcq    %rbp,%r10
  129 
  130         andq    %r10,%rax
  131         movq    %r10,%rbp
  132         shrq    $2,%r10
  133         andq    $3,%rbp
  134         addq    %r10,%rax
  135         addq    %rax,%r14
  136         adcq    $0,%rbx
  137         adcq    $0,%rbp
  138         movq    %r12,%rax
  139         decq    %r15
  140         jnz     .Loop
  141 
  142         movq    %r14,0(%rdi)
  143         movq    %rbx,8(%rdi)
  144         movq    %rbp,16(%rdi)
  145 
  146         movq    0(%rsp),%r15
  147 .cfi_restore    %r15
  148         movq    8(%rsp),%r14
  149 .cfi_restore    %r14
  150         movq    16(%rsp),%r13
  151 .cfi_restore    %r13
  152         movq    24(%rsp),%r12
  153 .cfi_restore    %r12
  154         movq    32(%rsp),%rbp
  155 .cfi_restore    %rbp
  156         movq    40(%rsp),%rbx
  157 .cfi_restore    %rbx
  158         leaq    48(%rsp),%rsp
  159 .cfi_adjust_cfa_offset  -48
  160 .Lno_data:
  161 .Lblocks_epilogue:
  162         .byte   0xf3,0xc3
  163 .cfi_endproc    
  164 .size   poly1305_blocks,.-poly1305_blocks
  165 
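      /*
       * poly1305_emit(ctx=%rdi, mac=%rsi, nonce=%rdx)
       * Final step of the scalar path: reduce the accumulator modulo
       * 2^130-5, add the 128-bit nonce and store the 16-byte tag.
       */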
  166 .type   poly1305_emit,@function
  167 .align  32
  168 poly1305_emit:
  169 .cfi_startproc  
  170 .Lemit:
  171         movq    0(%rdi),%r8
  172         movq    8(%rdi),%r9
  173         movq    16(%rdi),%r10
  174 
  175         movq    %r8,%rax
  176         addq    $5,%r8
  177         movq    %r9,%rcx
  178         adcq    $0,%r9
  179         adcq    $0,%r10
  180         shrq    $2,%r10
  181         cmovnzq %r8,%rax
  182         cmovnzq %r9,%rcx
  183 
  184         addq    0(%rdx),%rax
  185         adcq    8(%rdx),%rcx
  186         movq    %rax,0(%rsi)
  187         movq    %rcx,8(%rsi)
  188 
  189         .byte   0xf3,0xc3
  190 .cfi_endproc    
  191 .size   poly1305_emit,.-poly1305_emit
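      /*
       * __poly1305_block: one accumulate-multiply-reduce step on the
       * base 2^64 accumulator held in %r14:%rbx:%rbp, with r0 in %r11,
       * r1 in %rax/%r12 and the precomputed r1+(r1>>2) in %r13.
       */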
  192 .type   __poly1305_block,@function
  193 .align  32
  194 __poly1305_block:
  195 .cfi_startproc  
  196         mulq    %r14
  197         movq    %rax,%r9
  198         movq    %r11,%rax
  199         movq    %rdx,%r10
  200 
  201         mulq    %r14
  202         movq    %rax,%r14
  203         movq    %r11,%rax
  204         movq    %rdx,%r8
  205 
  206         mulq    %rbx
  207         addq    %rax,%r9
  208         movq    %r13,%rax
  209         adcq    %rdx,%r10
  210 
  211         mulq    %rbx
  212         movq    %rbp,%rbx
  213         addq    %rax,%r14
  214         adcq    %rdx,%r8
  215 
  216         imulq   %r13,%rbx
  217         addq    %rbx,%r9
  218         movq    %r8,%rbx
  219         adcq    $0,%r10
  220 
  221         imulq   %r11,%rbp
  222         addq    %r9,%rbx
  223         movq    $-4,%rax
  224         adcq    %rbp,%r10
  225 
  226         andq    %r10,%rax
  227         movq    %r10,%rbp
  228         shrq    $2,%r10
  229         andq    $3,%rbp
  230         addq    %r10,%rax
  231         addq    %rax,%r14
  232         adcq    $0,%rbx
  233         adcq    $0,%rbp
  234         .byte   0xf3,0xc3
  235 .cfi_endproc    
  236 .size   __poly1305_block,.-__poly1305_block
  237 
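      /*
       * __poly1305_init_avx: computes r^2, r^3 and r^4 via __poly1305_block
       * and stores the powers of r as base 2^26 limbs, together with their
       * 5*limb multiples, for use by the vectorized code paths.
       */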
  238 .type   __poly1305_init_avx,@function
  239 .align  32
  240 __poly1305_init_avx:
  241 .cfi_startproc  
  242         movq    %r11,%r14
  243         movq    %r12,%rbx
  244         xorq    %rbp,%rbp
  245 
  246         leaq    48+64(%rdi),%rdi
  247 
  248         movq    %r12,%rax
  249         call    __poly1305_block
  250 
  251         movl    $0x3ffffff,%eax
  252         movl    $0x3ffffff,%edx
  253         movq    %r14,%r8
  254         andl    %r14d,%eax
  255         movq    %r11,%r9
  256         andl    %r11d,%edx
  257         movl    %eax,-64(%rdi)
  258         shrq    $26,%r8
  259         movl    %edx,-60(%rdi)
  260         shrq    $26,%r9
  261 
  262         movl    $0x3ffffff,%eax
  263         movl    $0x3ffffff,%edx
  264         andl    %r8d,%eax
  265         andl    %r9d,%edx
  266         movl    %eax,-48(%rdi)
  267         leal    (%rax,%rax,4),%eax
  268         movl    %edx,-44(%rdi)
  269         leal    (%rdx,%rdx,4),%edx
  270         movl    %eax,-32(%rdi)
  271         shrq    $26,%r8
  272         movl    %edx,-28(%rdi)
  273         shrq    $26,%r9
  274 
  275         movq    %rbx,%rax
  276         movq    %r12,%rdx
  277         shlq    $12,%rax
  278         shlq    $12,%rdx
  279         orq     %r8,%rax
  280         orq     %r9,%rdx
  281         andl    $0x3ffffff,%eax
  282         andl    $0x3ffffff,%edx
  283         movl    %eax,-16(%rdi)
  284         leal    (%rax,%rax,4),%eax
  285         movl    %edx,-12(%rdi)
  286         leal    (%rdx,%rdx,4),%edx
  287         movl    %eax,0(%rdi)
  288         movq    %rbx,%r8
  289         movl    %edx,4(%rdi)
  290         movq    %r12,%r9
  291 
  292         movl    $0x3ffffff,%eax
  293         movl    $0x3ffffff,%edx
  294         shrq    $14,%r8
  295         shrq    $14,%r9
  296         andl    %r8d,%eax
  297         andl    %r9d,%edx
  298         movl    %eax,16(%rdi)
  299         leal    (%rax,%rax,4),%eax
  300         movl    %edx,20(%rdi)
  301         leal    (%rdx,%rdx,4),%edx
  302         movl    %eax,32(%rdi)
  303         shrq    $26,%r8
  304         movl    %edx,36(%rdi)
  305         shrq    $26,%r9
  306 
  307         movq    %rbp,%rax
  308         shlq    $24,%rax
  309         orq     %rax,%r8
  310         movl    %r8d,48(%rdi)
  311         leaq    (%r8,%r8,4),%r8
  312         movl    %r9d,52(%rdi)
  313         leaq    (%r9,%r9,4),%r9
  314         movl    %r8d,64(%rdi)
  315         movl    %r9d,68(%rdi)
  316 
  317         movq    %r12,%rax
  318         call    __poly1305_block
  319 
  320         movl    $0x3ffffff,%eax
  321         movq    %r14,%r8
  322         andl    %r14d,%eax
  323         shrq    $26,%r8
  324         movl    %eax,-52(%rdi)
  325 
  326         movl    $0x3ffffff,%edx
  327         andl    %r8d,%edx
  328         movl    %edx,-36(%rdi)
  329         leal    (%rdx,%rdx,4),%edx
  330         shrq    $26,%r8
  331         movl    %edx,-20(%rdi)
  332 
  333         movq    %rbx,%rax
  334         shlq    $12,%rax
  335         orq     %r8,%rax
  336         andl    $0x3ffffff,%eax
  337         movl    %eax,-4(%rdi)
  338         leal    (%rax,%rax,4),%eax
  339         movq    %rbx,%r8
  340         movl    %eax,12(%rdi)
  341 
  342         movl    $0x3ffffff,%edx
  343         shrq    $14,%r8
  344         andl    %r8d,%edx
  345         movl    %edx,28(%rdi)
  346         leal    (%rdx,%rdx,4),%edx
  347         shrq    $26,%r8
  348         movl    %edx,44(%rdi)
  349 
  350         movq    %rbp,%rax
  351         shlq    $24,%rax
  352         orq     %rax,%r8
  353         movl    %r8d,60(%rdi)
  354         leaq    (%r8,%r8,4),%r8
  355         movl    %r8d,76(%rdi)
  356 
  357         movq    %r12,%rax
  358         call    __poly1305_block
  359 
  360         movl    $0x3ffffff,%eax
  361         movq    %r14,%r8
  362         andl    %r14d,%eax
  363         shrq    $26,%r8
  364         movl    %eax,-56(%rdi)
  365 
  366         movl    $0x3ffffff,%edx
  367         andl    %r8d,%edx
  368         movl    %edx,-40(%rdi)
  369         leal    (%rdx,%rdx,4),%edx
  370         shrq    $26,%r8
  371         movl    %edx,-24(%rdi)
  372 
  373         movq    %rbx,%rax
  374         shlq    $12,%rax
  375         orq     %r8,%rax
  376         andl    $0x3ffffff,%eax
  377         movl    %eax,-8(%rdi)
  378         leal    (%rax,%rax,4),%eax
  379         movq    %rbx,%r8
  380         movl    %eax,8(%rdi)
  381 
  382         movl    $0x3ffffff,%edx
  383         shrq    $14,%r8
  384         andl    %r8d,%edx
  385         movl    %edx,24(%rdi)
  386         leal    (%rdx,%rdx,4),%edx
  387         shrq    $26,%r8
  388         movl    %edx,40(%rdi)
  389 
  390         movq    %rbp,%rax
  391         shlq    $24,%rax
  392         orq     %rax,%r8
  393         movl    %r8d,56(%rdi)
  394         leaq    (%r8,%r8,4),%r8
  395         movl    %r8d,72(%rdi)
  396 
  397         leaq    -48-64(%rdi),%rdi
  398         .byte   0xf3,0xc3
  399 .cfi_endproc    
  400 .size   __poly1305_init_avx,.-__poly1305_init_avx
  401 
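      /*
       * poly1305_blocks_avx: 128-bit SIMD path.  Falls back to the scalar
       * .Lblocks code for short inputs that are still in base 2^64,
       * otherwise converts the accumulator to base 2^26 (initializing the
       * power-of-r tables on first use) and processes 64 bytes per
       * main-loop iteration.
       */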
  402 .type   poly1305_blocks_avx,@function
  403 .align  32
  404 poly1305_blocks_avx:
  405 .cfi_startproc  
  406         movl    20(%rdi),%r8d
  407         cmpq    $128,%rdx
  408         jae     .Lblocks_avx
  409         testl   %r8d,%r8d
  410         jz      .Lblocks
  411 
  412 .Lblocks_avx:
  413         andq    $-16,%rdx
  414         jz      .Lno_data_avx
  415 
  416         vzeroupper
  417 
  418         testl   %r8d,%r8d
  419         jz      .Lbase2_64_avx
  420 
  421         testq   $31,%rdx
  422         jz      .Leven_avx
  423 
  424         pushq   %rbx
  425 .cfi_adjust_cfa_offset  8
  426 .cfi_offset     %rbx,-16
  427         pushq   %rbp
  428 .cfi_adjust_cfa_offset  8
  429 .cfi_offset     %rbp,-24
  430         pushq   %r12
  431 .cfi_adjust_cfa_offset  8
  432 .cfi_offset     %r12,-32
  433         pushq   %r13
  434 .cfi_adjust_cfa_offset  8
  435 .cfi_offset     %r13,-40
  436         pushq   %r14
  437 .cfi_adjust_cfa_offset  8
  438 .cfi_offset     %r14,-48
  439         pushq   %r15
  440 .cfi_adjust_cfa_offset  8
  441 .cfi_offset     %r15,-56
  442 .Lblocks_avx_body:
  443 
  444         movq    %rdx,%r15
  445 
  446         movq    0(%rdi),%r8
  447         movq    8(%rdi),%r9
  448         movl    16(%rdi),%ebp
  449 
  450         movq    24(%rdi),%r11
  451         movq    32(%rdi),%r13
  452 
  453 
  454         movl    %r8d,%r14d
  455         andq    $-2147483648,%r8
  456         movq    %r9,%r12
  457         movl    %r9d,%ebx
  458         andq    $-2147483648,%r9
  459 
  460         shrq    $6,%r8
  461         shlq    $52,%r12
  462         addq    %r8,%r14
  463         shrq    $12,%rbx
  464         shrq    $18,%r9
  465         addq    %r12,%r14
  466         adcq    %r9,%rbx
  467 
  468         movq    %rbp,%r8
  469         shlq    $40,%r8
  470         shrq    $24,%rbp
  471         addq    %r8,%rbx
  472         adcq    $0,%rbp
  473 
  474         movq    $-4,%r9
  475         movq    %rbp,%r8
  476         andq    %rbp,%r9
  477         shrq    $2,%r8
  478         andq    $3,%rbp
  479         addq    %r9,%r8
  480         addq    %r8,%r14
  481         adcq    $0,%rbx
  482         adcq    $0,%rbp
  483 
  484         movq    %r13,%r12
  485         movq    %r13,%rax
  486         shrq    $2,%r13
  487         addq    %r12,%r13
  488 
  489         addq    0(%rsi),%r14
  490         adcq    8(%rsi),%rbx
  491         leaq    16(%rsi),%rsi
  492         adcq    %rcx,%rbp
  493 
  494         call    __poly1305_block
  495 
  496         testq   %rcx,%rcx
  497         jz      .Lstore_base2_64_avx
  498 
  499 
  500         movq    %r14,%rax
  501         movq    %r14,%rdx
  502         shrq    $52,%r14
  503         movq    %rbx,%r11
  504         movq    %rbx,%r12
  505         shrq    $26,%rdx
  506         andq    $0x3ffffff,%rax
  507         shlq    $12,%r11
  508         andq    $0x3ffffff,%rdx
  509         shrq    $14,%rbx
  510         orq     %r11,%r14
  511         shlq    $24,%rbp
  512         andq    $0x3ffffff,%r14
  513         shrq    $40,%r12
  514         andq    $0x3ffffff,%rbx
  515         orq     %r12,%rbp
  516 
  517         subq    $16,%r15
  518         jz      .Lstore_base2_26_avx
  519 
  520         vmovd   %eax,%xmm0
  521         vmovd   %edx,%xmm1
  522         vmovd   %r14d,%xmm2
  523         vmovd   %ebx,%xmm3
  524         vmovd   %ebp,%xmm4
  525         jmp     .Lproceed_avx
  526 
  527 .align  32
  528 .Lstore_base2_64_avx:
  529         movq    %r14,0(%rdi)
  530         movq    %rbx,8(%rdi)
  531         movq    %rbp,16(%rdi)
  532         jmp     .Ldone_avx
  533 
  534 .align  16
  535 .Lstore_base2_26_avx:
  536         movl    %eax,0(%rdi)
  537         movl    %edx,4(%rdi)
  538         movl    %r14d,8(%rdi)
  539         movl    %ebx,12(%rdi)
  540         movl    %ebp,16(%rdi)
  541 .align  16
  542 .Ldone_avx:
  543         movq    0(%rsp),%r15
  544 .cfi_restore    %r15
  545         movq    8(%rsp),%r14
  546 .cfi_restore    %r14
  547         movq    16(%rsp),%r13
  548 .cfi_restore    %r13
  549         movq    24(%rsp),%r12
  550 .cfi_restore    %r12
  551         movq    32(%rsp),%rbp
  552 .cfi_restore    %rbp
  553         movq    40(%rsp),%rbx
  554 .cfi_restore    %rbx
  555         leaq    48(%rsp),%rsp
  556 .cfi_adjust_cfa_offset  -48
  557 .Lno_data_avx:
  558 .Lblocks_avx_epilogue:
  559         .byte   0xf3,0xc3
  560 .cfi_endproc    
  561 
  562 .align  32
  563 .Lbase2_64_avx:
  564 .cfi_startproc  
  565         pushq   %rbx
  566 .cfi_adjust_cfa_offset  8
  567 .cfi_offset     %rbx,-16
  568         pushq   %rbp
  569 .cfi_adjust_cfa_offset  8
  570 .cfi_offset     %rbp,-24
  571         pushq   %r12
  572 .cfi_adjust_cfa_offset  8
  573 .cfi_offset     %r12,-32
  574         pushq   %r13
  575 .cfi_adjust_cfa_offset  8
  576 .cfi_offset     %r13,-40
  577         pushq   %r14
  578 .cfi_adjust_cfa_offset  8
  579 .cfi_offset     %r14,-48
  580         pushq   %r15
  581 .cfi_adjust_cfa_offset  8
  582 .cfi_offset     %r15,-56
  583 .Lbase2_64_avx_body:
  584 
  585         movq    %rdx,%r15
  586 
  587         movq    24(%rdi),%r11
  588         movq    32(%rdi),%r13
  589 
  590         movq    0(%rdi),%r14
  591         movq    8(%rdi),%rbx
  592         movl    16(%rdi),%ebp
  593 
  594         movq    %r13,%r12
  595         movq    %r13,%rax
  596         shrq    $2,%r13
  597         addq    %r12,%r13
  598 
  599         testq   $31,%rdx
  600         jz      .Linit_avx
  601 
  602         addq    0(%rsi),%r14
  603         adcq    8(%rsi),%rbx
  604         leaq    16(%rsi),%rsi
  605         adcq    %rcx,%rbp
  606         subq    $16,%r15
  607 
  608         call    __poly1305_block
  609 
  610 .Linit_avx:
  611 
  612         movq    %r14,%rax
  613         movq    %r14,%rdx
  614         shrq    $52,%r14
  615         movq    %rbx,%r8
  616         movq    %rbx,%r9
  617         shrq    $26,%rdx
  618         andq    $0x3ffffff,%rax
  619         shlq    $12,%r8
  620         andq    $0x3ffffff,%rdx
  621         shrq    $14,%rbx
  622         orq     %r8,%r14
  623         shlq    $24,%rbp
  624         andq    $0x3ffffff,%r14
  625         shrq    $40,%r9
  626         andq    $0x3ffffff,%rbx
  627         orq     %r9,%rbp
  628 
  629         vmovd   %eax,%xmm0
  630         vmovd   %edx,%xmm1
  631         vmovd   %r14d,%xmm2
  632         vmovd   %ebx,%xmm3
  633         vmovd   %ebp,%xmm4
  634         movl    $1,20(%rdi)
  635 
  636         call    __poly1305_init_avx
  637 
  638 .Lproceed_avx:
  639         movq    %r15,%rdx
  640 
  641         movq    0(%rsp),%r15
  642 .cfi_restore    %r15
  643         movq    8(%rsp),%r14
  644 .cfi_restore    %r14
  645         movq    16(%rsp),%r13
  646 .cfi_restore    %r13
  647         movq    24(%rsp),%r12
  648 .cfi_restore    %r12
  649         movq    32(%rsp),%rbp
  650 .cfi_restore    %rbp
  651         movq    40(%rsp),%rbx
  652 .cfi_restore    %rbx
  653         leaq    48(%rsp),%rax
  654         leaq    48(%rsp),%rsp
  655 .cfi_adjust_cfa_offset  -48
  656 .Lbase2_64_avx_epilogue:
  657         jmp     .Ldo_avx
  658 .cfi_endproc    
  659 
  660 .align  32
  661 .Leven_avx:
  662 .cfi_startproc  
  663         vmovd   0(%rdi),%xmm0
  664         vmovd   4(%rdi),%xmm1
  665         vmovd   8(%rdi),%xmm2
  666         vmovd   12(%rdi),%xmm3
  667         vmovd   16(%rdi),%xmm4
  668 
  669 .Ldo_avx:
  670         leaq    -88(%rsp),%r11
  671 .cfi_def_cfa    %r11,0x60
  672         subq    $0x178,%rsp
  673         subq    $64,%rdx
  674         leaq    -32(%rsi),%rax
  675         cmovcq  %rax,%rsi
  676 
  677         vmovdqu 48(%rdi),%xmm14
  678         leaq    112(%rdi),%rdi
  679         leaq    .Lconst(%rip),%rcx
  680 
  681 
  682 
  683         vmovdqu 32(%rsi),%xmm5
  684         vmovdqu 48(%rsi),%xmm6
  685         vmovdqa 64(%rcx),%xmm15
  686 
  687         vpsrldq $6,%xmm5,%xmm7
  688         vpsrldq $6,%xmm6,%xmm8
  689         vpunpckhqdq     %xmm6,%xmm5,%xmm9
  690         vpunpcklqdq     %xmm6,%xmm5,%xmm5
  691         vpunpcklqdq     %xmm8,%xmm7,%xmm8
  692 
  693         vpsrlq  $40,%xmm9,%xmm9
  694         vpsrlq  $26,%xmm5,%xmm6
  695         vpand   %xmm15,%xmm5,%xmm5
  696         vpsrlq  $4,%xmm8,%xmm7
  697         vpand   %xmm15,%xmm6,%xmm6
  698         vpsrlq  $30,%xmm8,%xmm8
  699         vpand   %xmm15,%xmm7,%xmm7
  700         vpand   %xmm15,%xmm8,%xmm8
  701         vpor    32(%rcx),%xmm9,%xmm9
  702 
  703         jbe     .Lskip_loop_avx
  704 
  705 
  706         vmovdqu -48(%rdi),%xmm11
  707         vmovdqu -32(%rdi),%xmm12
  708         vpshufd $0xEE,%xmm14,%xmm13
  709         vpshufd $0x44,%xmm14,%xmm10
  710         vmovdqa %xmm13,-144(%r11)
  711         vmovdqa %xmm10,0(%rsp)
  712         vpshufd $0xEE,%xmm11,%xmm14
  713         vmovdqu -16(%rdi),%xmm10
  714         vpshufd $0x44,%xmm11,%xmm11
  715         vmovdqa %xmm14,-128(%r11)
  716         vmovdqa %xmm11,16(%rsp)
  717         vpshufd $0xEE,%xmm12,%xmm13
  718         vmovdqu 0(%rdi),%xmm11
  719         vpshufd $0x44,%xmm12,%xmm12
  720         vmovdqa %xmm13,-112(%r11)
  721         vmovdqa %xmm12,32(%rsp)
  722         vpshufd $0xEE,%xmm10,%xmm14
  723         vmovdqu 16(%rdi),%xmm12
  724         vpshufd $0x44,%xmm10,%xmm10
  725         vmovdqa %xmm14,-96(%r11)
  726         vmovdqa %xmm10,48(%rsp)
  727         vpshufd $0xEE,%xmm11,%xmm13
  728         vmovdqu 32(%rdi),%xmm10
  729         vpshufd $0x44,%xmm11,%xmm11
  730         vmovdqa %xmm13,-80(%r11)
  731         vmovdqa %xmm11,64(%rsp)
  732         vpshufd $0xEE,%xmm12,%xmm14
  733         vmovdqu 48(%rdi),%xmm11
  734         vpshufd $0x44,%xmm12,%xmm12
  735         vmovdqa %xmm14,-64(%r11)
  736         vmovdqa %xmm12,80(%rsp)
  737         vpshufd $0xEE,%xmm10,%xmm13
  738         vmovdqu 64(%rdi),%xmm12
  739         vpshufd $0x44,%xmm10,%xmm10
  740         vmovdqa %xmm13,-48(%r11)
  741         vmovdqa %xmm10,96(%rsp)
  742         vpshufd $0xEE,%xmm11,%xmm14
  743         vpshufd $0x44,%xmm11,%xmm11
  744         vmovdqa %xmm14,-32(%r11)
  745         vmovdqa %xmm11,112(%rsp)
  746         vpshufd $0xEE,%xmm12,%xmm13
  747         vmovdqa 0(%rsp),%xmm14
  748         vpshufd $0x44,%xmm12,%xmm12
  749         vmovdqa %xmm13,-16(%r11)
  750         vmovdqa %xmm12,128(%rsp)
  751 
  752         jmp     .Loop_avx
  753 
  754 .align  32
  755 .Loop_avx:
  756 
  757 
  758 
  759 
  760 
  761 
  762 
  763 
  764 
  765 
  766 
  767 
  768 
  769 
  770 
  771 
  772 
  773 
  774 
  775 
  776         vpmuludq        %xmm5,%xmm14,%xmm10
  777         vpmuludq        %xmm6,%xmm14,%xmm11
  778         vmovdqa %xmm2,32(%r11)
  779         vpmuludq        %xmm7,%xmm14,%xmm12
  780         vmovdqa 16(%rsp),%xmm2
  781         vpmuludq        %xmm8,%xmm14,%xmm13
  782         vpmuludq        %xmm9,%xmm14,%xmm14
  783 
  784         vmovdqa %xmm0,0(%r11)
  785         vpmuludq        32(%rsp),%xmm9,%xmm0
  786         vmovdqa %xmm1,16(%r11)
  787         vpmuludq        %xmm8,%xmm2,%xmm1
  788         vpaddq  %xmm0,%xmm10,%xmm10
  789         vpaddq  %xmm1,%xmm14,%xmm14
  790         vmovdqa %xmm3,48(%r11)
  791         vpmuludq        %xmm7,%xmm2,%xmm0
  792         vpmuludq        %xmm6,%xmm2,%xmm1
  793         vpaddq  %xmm0,%xmm13,%xmm13
  794         vmovdqa 48(%rsp),%xmm3
  795         vpaddq  %xmm1,%xmm12,%xmm12
  796         vmovdqa %xmm4,64(%r11)
  797         vpmuludq        %xmm5,%xmm2,%xmm2
  798         vpmuludq        %xmm7,%xmm3,%xmm0
  799         vpaddq  %xmm2,%xmm11,%xmm11
  800 
  801         vmovdqa 64(%rsp),%xmm4
  802         vpaddq  %xmm0,%xmm14,%xmm14
  803         vpmuludq        %xmm6,%xmm3,%xmm1
  804         vpmuludq        %xmm5,%xmm3,%xmm3
  805         vpaddq  %xmm1,%xmm13,%xmm13
  806         vmovdqa 80(%rsp),%xmm2
  807         vpaddq  %xmm3,%xmm12,%xmm12
  808         vpmuludq        %xmm9,%xmm4,%xmm0
  809         vpmuludq        %xmm8,%xmm4,%xmm4
  810         vpaddq  %xmm0,%xmm11,%xmm11
  811         vmovdqa 96(%rsp),%xmm3
  812         vpaddq  %xmm4,%xmm10,%xmm10
  813 
  814         vmovdqa 128(%rsp),%xmm4
  815         vpmuludq        %xmm6,%xmm2,%xmm1
  816         vpmuludq        %xmm5,%xmm2,%xmm2
  817         vpaddq  %xmm1,%xmm14,%xmm14
  818         vpaddq  %xmm2,%xmm13,%xmm13
  819         vpmuludq        %xmm9,%xmm3,%xmm0
  820         vpmuludq        %xmm8,%xmm3,%xmm1
  821         vpaddq  %xmm0,%xmm12,%xmm12
  822         vmovdqu 0(%rsi),%xmm0
  823         vpaddq  %xmm1,%xmm11,%xmm11
  824         vpmuludq        %xmm7,%xmm3,%xmm3
  825         vpmuludq        %xmm7,%xmm4,%xmm7
  826         vpaddq  %xmm3,%xmm10,%xmm10
  827 
  828         vmovdqu 16(%rsi),%xmm1
  829         vpaddq  %xmm7,%xmm11,%xmm11
  830         vpmuludq        %xmm8,%xmm4,%xmm8
  831         vpmuludq        %xmm9,%xmm4,%xmm9
  832         vpsrldq $6,%xmm0,%xmm2
  833         vpaddq  %xmm8,%xmm12,%xmm12
  834         vpaddq  %xmm9,%xmm13,%xmm13
  835         vpsrldq $6,%xmm1,%xmm3
  836         vpmuludq        112(%rsp),%xmm5,%xmm9
  837         vpmuludq        %xmm6,%xmm4,%xmm5
  838         vpunpckhqdq     %xmm1,%xmm0,%xmm4
  839         vpaddq  %xmm9,%xmm14,%xmm14
  840         vmovdqa -144(%r11),%xmm9
  841         vpaddq  %xmm5,%xmm10,%xmm10
  842 
  843         vpunpcklqdq     %xmm1,%xmm0,%xmm0
  844         vpunpcklqdq     %xmm3,%xmm2,%xmm3
  845 
  846 
  847         vpsrldq $5,%xmm4,%xmm4
  848         vpsrlq  $26,%xmm0,%xmm1
  849         vpand   %xmm15,%xmm0,%xmm0
  850         vpsrlq  $4,%xmm3,%xmm2
  851         vpand   %xmm15,%xmm1,%xmm1
  852         vpand   0(%rcx),%xmm4,%xmm4
  853         vpsrlq  $30,%xmm3,%xmm3
  854         vpand   %xmm15,%xmm2,%xmm2
  855         vpand   %xmm15,%xmm3,%xmm3
  856         vpor    32(%rcx),%xmm4,%xmm4
  857 
  858         vpaddq  0(%r11),%xmm0,%xmm0
  859         vpaddq  16(%r11),%xmm1,%xmm1
  860         vpaddq  32(%r11),%xmm2,%xmm2
  861         vpaddq  48(%r11),%xmm3,%xmm3
  862         vpaddq  64(%r11),%xmm4,%xmm4
  863 
  864         leaq    32(%rsi),%rax
  865         leaq    64(%rsi),%rsi
  866         subq    $64,%rdx
  867         cmovcq  %rax,%rsi
  868 
  869 
  870 
  871 
  872 
  873 
  874 
  875 
  876 
  877 
  878         vpmuludq        %xmm0,%xmm9,%xmm5
  879         vpmuludq        %xmm1,%xmm9,%xmm6
  880         vpaddq  %xmm5,%xmm10,%xmm10
  881         vpaddq  %xmm6,%xmm11,%xmm11
  882         vmovdqa -128(%r11),%xmm7
  883         vpmuludq        %xmm2,%xmm9,%xmm5
  884         vpmuludq        %xmm3,%xmm9,%xmm6
  885         vpaddq  %xmm5,%xmm12,%xmm12
  886         vpaddq  %xmm6,%xmm13,%xmm13
  887         vpmuludq        %xmm4,%xmm9,%xmm9
  888         vpmuludq        -112(%r11),%xmm4,%xmm5
  889         vpaddq  %xmm9,%xmm14,%xmm14
  890 
  891         vpaddq  %xmm5,%xmm10,%xmm10
  892         vpmuludq        %xmm2,%xmm7,%xmm6
  893         vpmuludq        %xmm3,%xmm7,%xmm5
  894         vpaddq  %xmm6,%xmm13,%xmm13
  895         vmovdqa -96(%r11),%xmm8
  896         vpaddq  %xmm5,%xmm14,%xmm14
  897         vpmuludq        %xmm1,%xmm7,%xmm6
  898         vpmuludq        %xmm0,%xmm7,%xmm7
  899         vpaddq  %xmm6,%xmm12,%xmm12
  900         vpaddq  %xmm7,%xmm11,%xmm11
  901 
  902         vmovdqa -80(%r11),%xmm9
  903         vpmuludq        %xmm2,%xmm8,%xmm5
  904         vpmuludq        %xmm1,%xmm8,%xmm6
  905         vpaddq  %xmm5,%xmm14,%xmm14
  906         vpaddq  %xmm6,%xmm13,%xmm13
  907         vmovdqa -64(%r11),%xmm7
  908         vpmuludq        %xmm0,%xmm8,%xmm8
  909         vpmuludq        %xmm4,%xmm9,%xmm5
  910         vpaddq  %xmm8,%xmm12,%xmm12
  911         vpaddq  %xmm5,%xmm11,%xmm11
  912         vmovdqa -48(%r11),%xmm8
  913         vpmuludq        %xmm3,%xmm9,%xmm9
  914         vpmuludq        %xmm1,%xmm7,%xmm6
  915         vpaddq  %xmm9,%xmm10,%xmm10
  916 
  917         vmovdqa -16(%r11),%xmm9
  918         vpaddq  %xmm6,%xmm14,%xmm14
  919         vpmuludq        %xmm0,%xmm7,%xmm7
  920         vpmuludq        %xmm4,%xmm8,%xmm5
  921         vpaddq  %xmm7,%xmm13,%xmm13
  922         vpaddq  %xmm5,%xmm12,%xmm12
  923         vmovdqu 32(%rsi),%xmm5
  924         vpmuludq        %xmm3,%xmm8,%xmm7
  925         vpmuludq        %xmm2,%xmm8,%xmm8
  926         vpaddq  %xmm7,%xmm11,%xmm11
  927         vmovdqu 48(%rsi),%xmm6
  928         vpaddq  %xmm8,%xmm10,%xmm10
  929 
  930         vpmuludq        %xmm2,%xmm9,%xmm2
  931         vpmuludq        %xmm3,%xmm9,%xmm3
  932         vpsrldq $6,%xmm5,%xmm7
  933         vpaddq  %xmm2,%xmm11,%xmm11
  934         vpmuludq        %xmm4,%xmm9,%xmm4
  935         vpsrldq $6,%xmm6,%xmm8
  936         vpaddq  %xmm3,%xmm12,%xmm2
  937         vpaddq  %xmm4,%xmm13,%xmm3
  938         vpmuludq        -32(%r11),%xmm0,%xmm4
  939         vpmuludq        %xmm1,%xmm9,%xmm0
  940         vpunpckhqdq     %xmm6,%xmm5,%xmm9
  941         vpaddq  %xmm4,%xmm14,%xmm4
  942         vpaddq  %xmm0,%xmm10,%xmm0
  943 
  944         vpunpcklqdq     %xmm6,%xmm5,%xmm5
  945         vpunpcklqdq     %xmm8,%xmm7,%xmm8
  946 
  947 
  948         vpsrldq $5,%xmm9,%xmm9
  949         vpsrlq  $26,%xmm5,%xmm6
  950         vmovdqa 0(%rsp),%xmm14
  951         vpand   %xmm15,%xmm5,%xmm5
  952         vpsrlq  $4,%xmm8,%xmm7
  953         vpand   %xmm15,%xmm6,%xmm6
  954         vpand   0(%rcx),%xmm9,%xmm9
  955         vpsrlq  $30,%xmm8,%xmm8
  956         vpand   %xmm15,%xmm7,%xmm7
  957         vpand   %xmm15,%xmm8,%xmm8
  958         vpor    32(%rcx),%xmm9,%xmm9
  959 
  960 
  961 
  962 
  963 
  964         vpsrlq  $26,%xmm3,%xmm13
  965         vpand   %xmm15,%xmm3,%xmm3
  966         vpaddq  %xmm13,%xmm4,%xmm4
  967 
  968         vpsrlq  $26,%xmm0,%xmm10
  969         vpand   %xmm15,%xmm0,%xmm0
  970         vpaddq  %xmm10,%xmm11,%xmm1
  971 
  972         vpsrlq  $26,%xmm4,%xmm10
  973         vpand   %xmm15,%xmm4,%xmm4
  974 
  975         vpsrlq  $26,%xmm1,%xmm11
  976         vpand   %xmm15,%xmm1,%xmm1
  977         vpaddq  %xmm11,%xmm2,%xmm2
  978 
  979         vpaddq  %xmm10,%xmm0,%xmm0
  980         vpsllq  $2,%xmm10,%xmm10
  981         vpaddq  %xmm10,%xmm0,%xmm0
  982 
  983         vpsrlq  $26,%xmm2,%xmm12
  984         vpand   %xmm15,%xmm2,%xmm2
  985         vpaddq  %xmm12,%xmm3,%xmm3
  986 
  987         vpsrlq  $26,%xmm0,%xmm10
  988         vpand   %xmm15,%xmm0,%xmm0
  989         vpaddq  %xmm10,%xmm1,%xmm1
  990 
  991         vpsrlq  $26,%xmm3,%xmm13
  992         vpand   %xmm15,%xmm3,%xmm3
  993         vpaddq  %xmm13,%xmm4,%xmm4
  994 
  995         ja      .Loop_avx
  996 
  997 .Lskip_loop_avx:
  998 
  999 
 1000 
 1001         vpshufd $0x10,%xmm14,%xmm14
 1002         addq    $32,%rdx
 1003         jnz     .Long_tail_avx
 1004 
 1005         vpaddq  %xmm2,%xmm7,%xmm7
 1006         vpaddq  %xmm0,%xmm5,%xmm5
 1007         vpaddq  %xmm1,%xmm6,%xmm6
 1008         vpaddq  %xmm3,%xmm8,%xmm8
 1009         vpaddq  %xmm4,%xmm9,%xmm9
 1010 
 1011 .Long_tail_avx:
 1012         vmovdqa %xmm2,32(%r11)
 1013         vmovdqa %xmm0,0(%r11)
 1014         vmovdqa %xmm1,16(%r11)
 1015         vmovdqa %xmm3,48(%r11)
 1016         vmovdqa %xmm4,64(%r11)
 1017 
 1018 
 1019 
 1020 
 1021 
 1022 
 1023 
 1024         vpmuludq        %xmm7,%xmm14,%xmm12
 1025         vpmuludq        %xmm5,%xmm14,%xmm10
 1026         vpshufd $0x10,-48(%rdi),%xmm2
 1027         vpmuludq        %xmm6,%xmm14,%xmm11
 1028         vpmuludq        %xmm8,%xmm14,%xmm13
 1029         vpmuludq        %xmm9,%xmm14,%xmm14
 1030 
 1031         vpmuludq        %xmm8,%xmm2,%xmm0
 1032         vpaddq  %xmm0,%xmm14,%xmm14
 1033         vpshufd $0x10,-32(%rdi),%xmm3
 1034         vpmuludq        %xmm7,%xmm2,%xmm1
 1035         vpaddq  %xmm1,%xmm13,%xmm13
 1036         vpshufd $0x10,-16(%rdi),%xmm4
 1037         vpmuludq        %xmm6,%xmm2,%xmm0
 1038         vpaddq  %xmm0,%xmm12,%xmm12
 1039         vpmuludq        %xmm5,%xmm2,%xmm2
 1040         vpaddq  %xmm2,%xmm11,%xmm11
 1041         vpmuludq        %xmm9,%xmm3,%xmm3
 1042         vpaddq  %xmm3,%xmm10,%xmm10
 1043 
 1044         vpshufd $0x10,0(%rdi),%xmm2
 1045         vpmuludq        %xmm7,%xmm4,%xmm1
 1046         vpaddq  %xmm1,%xmm14,%xmm14
 1047         vpmuludq        %xmm6,%xmm4,%xmm0
 1048         vpaddq  %xmm0,%xmm13,%xmm13
 1049         vpshufd $0x10,16(%rdi),%xmm3
 1050         vpmuludq        %xmm5,%xmm4,%xmm4
 1051         vpaddq  %xmm4,%xmm12,%xmm12
 1052         vpmuludq        %xmm9,%xmm2,%xmm1
 1053         vpaddq  %xmm1,%xmm11,%xmm11
 1054         vpshufd $0x10,32(%rdi),%xmm4
 1055         vpmuludq        %xmm8,%xmm2,%xmm2
 1056         vpaddq  %xmm2,%xmm10,%xmm10
 1057 
 1058         vpmuludq        %xmm6,%xmm3,%xmm0
 1059         vpaddq  %xmm0,%xmm14,%xmm14
 1060         vpmuludq        %xmm5,%xmm3,%xmm3
 1061         vpaddq  %xmm3,%xmm13,%xmm13
 1062         vpshufd $0x10,48(%rdi),%xmm2
 1063         vpmuludq        %xmm9,%xmm4,%xmm1
 1064         vpaddq  %xmm1,%xmm12,%xmm12
 1065         vpshufd $0x10,64(%rdi),%xmm3
 1066         vpmuludq        %xmm8,%xmm4,%xmm0
 1067         vpaddq  %xmm0,%xmm11,%xmm11
 1068         vpmuludq        %xmm7,%xmm4,%xmm4
 1069         vpaddq  %xmm4,%xmm10,%xmm10
 1070 
 1071         vpmuludq        %xmm5,%xmm2,%xmm2
 1072         vpaddq  %xmm2,%xmm14,%xmm14
 1073         vpmuludq        %xmm9,%xmm3,%xmm1
 1074         vpaddq  %xmm1,%xmm13,%xmm13
 1075         vpmuludq        %xmm8,%xmm3,%xmm0
 1076         vpaddq  %xmm0,%xmm12,%xmm12
 1077         vpmuludq        %xmm7,%xmm3,%xmm1
 1078         vpaddq  %xmm1,%xmm11,%xmm11
 1079         vpmuludq        %xmm6,%xmm3,%xmm3
 1080         vpaddq  %xmm3,%xmm10,%xmm10
 1081 
 1082         jz      .Lshort_tail_avx
 1083 
 1084         vmovdqu 0(%rsi),%xmm0
 1085         vmovdqu 16(%rsi),%xmm1
 1086 
 1087         vpsrldq $6,%xmm0,%xmm2
 1088         vpsrldq $6,%xmm1,%xmm3
 1089         vpunpckhqdq     %xmm1,%xmm0,%xmm4
 1090         vpunpcklqdq     %xmm1,%xmm0,%xmm0
 1091         vpunpcklqdq     %xmm3,%xmm2,%xmm3
 1092 
 1093         vpsrlq  $40,%xmm4,%xmm4
 1094         vpsrlq  $26,%xmm0,%xmm1
 1095         vpand   %xmm15,%xmm0,%xmm0
 1096         vpsrlq  $4,%xmm3,%xmm2
 1097         vpand   %xmm15,%xmm1,%xmm1
 1098         vpsrlq  $30,%xmm3,%xmm3
 1099         vpand   %xmm15,%xmm2,%xmm2
 1100         vpand   %xmm15,%xmm3,%xmm3
 1101         vpor    32(%rcx),%xmm4,%xmm4
 1102 
 1103         vpshufd $0x32,-64(%rdi),%xmm9
 1104         vpaddq  0(%r11),%xmm0,%xmm0
 1105         vpaddq  16(%r11),%xmm1,%xmm1
 1106         vpaddq  32(%r11),%xmm2,%xmm2
 1107         vpaddq  48(%r11),%xmm3,%xmm3
 1108         vpaddq  64(%r11),%xmm4,%xmm4
 1109 
 1110 
 1111 
 1112 
 1113         vpmuludq        %xmm0,%xmm9,%xmm5
 1114         vpaddq  %xmm5,%xmm10,%xmm10
 1115         vpmuludq        %xmm1,%xmm9,%xmm6
 1116         vpaddq  %xmm6,%xmm11,%xmm11
 1117         vpmuludq        %xmm2,%xmm9,%xmm5
 1118         vpaddq  %xmm5,%xmm12,%xmm12
 1119         vpshufd $0x32,-48(%rdi),%xmm7
 1120         vpmuludq        %xmm3,%xmm9,%xmm6
 1121         vpaddq  %xmm6,%xmm13,%xmm13
 1122         vpmuludq        %xmm4,%xmm9,%xmm9
 1123         vpaddq  %xmm9,%xmm14,%xmm14
 1124 
 1125         vpmuludq        %xmm3,%xmm7,%xmm5
 1126         vpaddq  %xmm5,%xmm14,%xmm14
 1127         vpshufd $0x32,-32(%rdi),%xmm8
 1128         vpmuludq        %xmm2,%xmm7,%xmm6
 1129         vpaddq  %xmm6,%xmm13,%xmm13
 1130         vpshufd $0x32,-16(%rdi),%xmm9
 1131         vpmuludq        %xmm1,%xmm7,%xmm5
 1132         vpaddq  %xmm5,%xmm12,%xmm12
 1133         vpmuludq        %xmm0,%xmm7,%xmm7
 1134         vpaddq  %xmm7,%xmm11,%xmm11
 1135         vpmuludq        %xmm4,%xmm8,%xmm8
 1136         vpaddq  %xmm8,%xmm10,%xmm10
 1137 
 1138         vpshufd $0x32,0(%rdi),%xmm7
 1139         vpmuludq        %xmm2,%xmm9,%xmm6
 1140         vpaddq  %xmm6,%xmm14,%xmm14
 1141         vpmuludq        %xmm1,%xmm9,%xmm5
 1142         vpaddq  %xmm5,%xmm13,%xmm13
 1143         vpshufd $0x32,16(%rdi),%xmm8
 1144         vpmuludq        %xmm0,%xmm9,%xmm9
 1145         vpaddq  %xmm9,%xmm12,%xmm12
 1146         vpmuludq        %xmm4,%xmm7,%xmm6
 1147         vpaddq  %xmm6,%xmm11,%xmm11
 1148         vpshufd $0x32,32(%rdi),%xmm9
 1149         vpmuludq        %xmm3,%xmm7,%xmm7
 1150         vpaddq  %xmm7,%xmm10,%xmm10
 1151 
 1152         vpmuludq        %xmm1,%xmm8,%xmm5
 1153         vpaddq  %xmm5,%xmm14,%xmm14
 1154         vpmuludq        %xmm0,%xmm8,%xmm8
 1155         vpaddq  %xmm8,%xmm13,%xmm13
 1156         vpshufd $0x32,48(%rdi),%xmm7
 1157         vpmuludq        %xmm4,%xmm9,%xmm6
 1158         vpaddq  %xmm6,%xmm12,%xmm12
 1159         vpshufd $0x32,64(%rdi),%xmm8
 1160         vpmuludq        %xmm3,%xmm9,%xmm5
 1161         vpaddq  %xmm5,%xmm11,%xmm11
 1162         vpmuludq        %xmm2,%xmm9,%xmm9
 1163         vpaddq  %xmm9,%xmm10,%xmm10
 1164 
 1165         vpmuludq        %xmm0,%xmm7,%xmm7
 1166         vpaddq  %xmm7,%xmm14,%xmm14
 1167         vpmuludq        %xmm4,%xmm8,%xmm6
 1168         vpaddq  %xmm6,%xmm13,%xmm13
 1169         vpmuludq        %xmm3,%xmm8,%xmm5
 1170         vpaddq  %xmm5,%xmm12,%xmm12
 1171         vpmuludq        %xmm2,%xmm8,%xmm6
 1172         vpaddq  %xmm6,%xmm11,%xmm11
 1173         vpmuludq        %xmm1,%xmm8,%xmm8
 1174         vpaddq  %xmm8,%xmm10,%xmm10
 1175 
 1176 .Lshort_tail_avx:
 1177 
 1178 
 1179 
 1180         vpsrldq $8,%xmm14,%xmm9
 1181         vpsrldq $8,%xmm13,%xmm8
 1182         vpsrldq $8,%xmm11,%xmm6
 1183         vpsrldq $8,%xmm10,%xmm5
 1184         vpsrldq $8,%xmm12,%xmm7
 1185         vpaddq  %xmm8,%xmm13,%xmm13
 1186         vpaddq  %xmm9,%xmm14,%xmm14
 1187         vpaddq  %xmm5,%xmm10,%xmm10
 1188         vpaddq  %xmm6,%xmm11,%xmm11
 1189         vpaddq  %xmm7,%xmm12,%xmm12
 1190 
 1191 
 1192 
 1193 
 1194         vpsrlq  $26,%xmm13,%xmm3
 1195         vpand   %xmm15,%xmm13,%xmm13
 1196         vpaddq  %xmm3,%xmm14,%xmm14
 1197 
 1198         vpsrlq  $26,%xmm10,%xmm0
 1199         vpand   %xmm15,%xmm10,%xmm10
 1200         vpaddq  %xmm0,%xmm11,%xmm11
 1201 
 1202         vpsrlq  $26,%xmm14,%xmm4
 1203         vpand   %xmm15,%xmm14,%xmm14
 1204 
 1205         vpsrlq  $26,%xmm11,%xmm1
 1206         vpand   %xmm15,%xmm11,%xmm11
 1207         vpaddq  %xmm1,%xmm12,%xmm12
 1208 
 1209         vpaddq  %xmm4,%xmm10,%xmm10
 1210         vpsllq  $2,%xmm4,%xmm4
 1211         vpaddq  %xmm4,%xmm10,%xmm10
 1212 
 1213         vpsrlq  $26,%xmm12,%xmm2
 1214         vpand   %xmm15,%xmm12,%xmm12
 1215         vpaddq  %xmm2,%xmm13,%xmm13
 1216 
 1217         vpsrlq  $26,%xmm10,%xmm0
 1218         vpand   %xmm15,%xmm10,%xmm10
 1219         vpaddq  %xmm0,%xmm11,%xmm11
 1220 
 1221         vpsrlq  $26,%xmm13,%xmm3
 1222         vpand   %xmm15,%xmm13,%xmm13
 1223         vpaddq  %xmm3,%xmm14,%xmm14
 1224 
 1225         vmovd   %xmm10,-112(%rdi)
 1226         vmovd   %xmm11,-108(%rdi)
 1227         vmovd   %xmm12,-104(%rdi)
 1228         vmovd   %xmm13,-100(%rdi)
 1229         vmovd   %xmm14,-96(%rdi)
 1230         leaq    88(%r11),%rsp
 1231 .cfi_def_cfa    %rsp,8
 1232         vzeroupper
 1233         .byte   0xf3,0xc3
 1234 .cfi_endproc    
 1235 .size   poly1305_blocks_avx,.-poly1305_blocks_avx
 1236 
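      /*
       * poly1305_emit_avx: tail-calls the scalar .Lemit path while the
       * context is still in base 2^64; otherwise recombines the five
       * base 2^26 limbs into base 2^64 before the final reduction and
       * nonce addition.
       */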
 1237 .type   poly1305_emit_avx,@function
 1238 .align  32
 1239 poly1305_emit_avx:
 1240 .cfi_startproc  
 1241         cmpl    $0,20(%rdi)
 1242         je      .Lemit
 1243 
 1244         movl    0(%rdi),%eax
 1245         movl    4(%rdi),%ecx
 1246         movl    8(%rdi),%r8d
 1247         movl    12(%rdi),%r11d
 1248         movl    16(%rdi),%r10d
 1249 
 1250         shlq    $26,%rcx
 1251         movq    %r8,%r9
 1252         shlq    $52,%r8
 1253         addq    %rcx,%rax
 1254         shrq    $12,%r9
 1255         addq    %rax,%r8
 1256         adcq    $0,%r9
 1257 
 1258         shlq    $14,%r11
 1259         movq    %r10,%rax
 1260         shrq    $24,%r10
 1261         addq    %r11,%r9
 1262         shlq    $40,%rax
 1263         addq    %rax,%r9
 1264         adcq    $0,%r10
 1265 
 1266         movq    %r10,%rax
 1267         movq    %r10,%rcx
 1268         andq    $3,%r10
 1269         shrq    $2,%rax
 1270         andq    $-4,%rcx
 1271         addq    %rcx,%rax
 1272         addq    %rax,%r8
 1273         adcq    $0,%r9
 1274         adcq    $0,%r10
 1275 
 1276         movq    %r8,%rax
 1277         addq    $5,%r8
 1278         movq    %r9,%rcx
 1279         adcq    $0,%r9
 1280         adcq    $0,%r10
 1281         shrq    $2,%r10
 1282         cmovnzq %r8,%rax
 1283         cmovnzq %r9,%rcx
 1284 
 1285         addq    0(%rdx),%rax
 1286         adcq    8(%rdx),%rcx
 1287         movq    %rax,0(%rsi)
 1288         movq    %rcx,8(%rsi)
 1289 
 1290         .byte   0xf3,0xc3
 1291 .cfi_endproc    
 1292 .size   poly1305_emit_avx,.-poly1305_emit_avx
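      /*
       * poly1305_blocks_avx2: 256-bit SIMD path, consuming 64 bytes per
       * main-loop iteration.  Shares __poly1305_block and
       * __poly1305_init_avx with the AVX path for the base 2^64 prologue
       * and the power-of-r setup.
       */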
 1293 .type   poly1305_blocks_avx2,@function
 1294 .align  32
 1295 poly1305_blocks_avx2:
 1296 .cfi_startproc  
 1297         movl    20(%rdi),%r8d
 1298         cmpq    $128,%rdx
 1299         jae     .Lblocks_avx2
 1300         testl   %r8d,%r8d
 1301         jz      .Lblocks
 1302 
 1303 .Lblocks_avx2:
 1304         andq    $-16,%rdx
 1305         jz      .Lno_data_avx2
 1306 
 1307         vzeroupper
 1308 
 1309         testl   %r8d,%r8d
 1310         jz      .Lbase2_64_avx2
 1311 
 1312         testq   $63,%rdx
 1313         jz      .Leven_avx2
 1314 
 1315         pushq   %rbx
 1316 .cfi_adjust_cfa_offset  8
 1317 .cfi_offset     %rbx,-16
 1318         pushq   %rbp
 1319 .cfi_adjust_cfa_offset  8
 1320 .cfi_offset     %rbp,-24
 1321         pushq   %r12
 1322 .cfi_adjust_cfa_offset  8
 1323 .cfi_offset     %r12,-32
 1324         pushq   %r13
 1325 .cfi_adjust_cfa_offset  8
 1326 .cfi_offset     %r13,-40
 1327         pushq   %r14
 1328 .cfi_adjust_cfa_offset  8
 1329 .cfi_offset     %r14,-48
 1330         pushq   %r15
 1331 .cfi_adjust_cfa_offset  8
 1332 .cfi_offset     %r15,-56
 1333 .Lblocks_avx2_body:
 1334 
 1335         movq    %rdx,%r15
 1336 
 1337         movq    0(%rdi),%r8
 1338         movq    8(%rdi),%r9
 1339         movl    16(%rdi),%ebp
 1340 
 1341         movq    24(%rdi),%r11
 1342         movq    32(%rdi),%r13
 1343 
 1344 
 1345         movl    %r8d,%r14d
 1346         andq    $-2147483648,%r8
 1347         movq    %r9,%r12
 1348         movl    %r9d,%ebx
 1349         andq    $-2147483648,%r9
 1350 
 1351         shrq    $6,%r8
 1352         shlq    $52,%r12
 1353         addq    %r8,%r14
 1354         shrq    $12,%rbx
 1355         shrq    $18,%r9
 1356         addq    %r12,%r14
 1357         adcq    %r9,%rbx
 1358 
 1359         movq    %rbp,%r8
 1360         shlq    $40,%r8
 1361         shrq    $24,%rbp
 1362         addq    %r8,%rbx
 1363         adcq    $0,%rbp
 1364 
 1365         movq    $-4,%r9
 1366         movq    %rbp,%r8
 1367         andq    %rbp,%r9
 1368         shrq    $2,%r8
 1369         andq    $3,%rbp
 1370         addq    %r9,%r8
 1371         addq    %r8,%r14
 1372         adcq    $0,%rbx
 1373         adcq    $0,%rbp
 1374 
 1375         movq    %r13,%r12
 1376         movq    %r13,%rax
 1377         shrq    $2,%r13
 1378         addq    %r12,%r13
 1379 
 1380 .Lbase2_26_pre_avx2:
 1381         addq    0(%rsi),%r14
 1382         adcq    8(%rsi),%rbx
 1383         leaq    16(%rsi),%rsi
 1384         adcq    %rcx,%rbp
 1385         subq    $16,%r15
 1386 
 1387         call    __poly1305_block
 1388         movq    %r12,%rax
 1389 
 1390         testq   $63,%r15
 1391         jnz     .Lbase2_26_pre_avx2
 1392 
 1393         testq   %rcx,%rcx
 1394         jz      .Lstore_base2_64_avx2
 1395 
 1396 
 1397         movq    %r14,%rax
 1398         movq    %r14,%rdx
 1399         shrq    $52,%r14
 1400         movq    %rbx,%r11
 1401         movq    %rbx,%r12
 1402         shrq    $26,%rdx
 1403         andq    $0x3ffffff,%rax
 1404         shlq    $12,%r11
 1405         andq    $0x3ffffff,%rdx
 1406         shrq    $14,%rbx
 1407         orq     %r11,%r14
 1408         shlq    $24,%rbp
 1409         andq    $0x3ffffff,%r14
 1410         shrq    $40,%r12
 1411         andq    $0x3ffffff,%rbx
 1412         orq     %r12,%rbp
 1413 
 1414         testq   %r15,%r15
 1415         jz      .Lstore_base2_26_avx2
 1416 
 1417         vmovd   %eax,%xmm0
 1418         vmovd   %edx,%xmm1
 1419         vmovd   %r14d,%xmm2
 1420         vmovd   %ebx,%xmm3
 1421         vmovd   %ebp,%xmm4
 1422         jmp     .Lproceed_avx2
 1423 
 1424 .align  32
 1425 .Lstore_base2_64_avx2:
 1426         movq    %r14,0(%rdi)
 1427         movq    %rbx,8(%rdi)
 1428         movq    %rbp,16(%rdi)
 1429         jmp     .Ldone_avx2
 1430 
 1431 .align  16
 1432 .Lstore_base2_26_avx2:
 1433         movl    %eax,0(%rdi)
 1434         movl    %edx,4(%rdi)
 1435         movl    %r14d,8(%rdi)
 1436         movl    %ebx,12(%rdi)
 1437         movl    %ebp,16(%rdi)
 1438 .align  16
 1439 .Ldone_avx2:
 1440         movq    0(%rsp),%r15
 1441 .cfi_restore    %r15
 1442         movq    8(%rsp),%r14
 1443 .cfi_restore    %r14
 1444         movq    16(%rsp),%r13
 1445 .cfi_restore    %r13
 1446         movq    24(%rsp),%r12
 1447 .cfi_restore    %r12
 1448         movq    32(%rsp),%rbp
 1449 .cfi_restore    %rbp
 1450         movq    40(%rsp),%rbx
 1451 .cfi_restore    %rbx
 1452         leaq    48(%rsp),%rsp
 1453 .cfi_adjust_cfa_offset  -48
 1454 .Lno_data_avx2:
 1455 .Lblocks_avx2_epilogue:
 1456         .byte   0xf3,0xc3
 1457 .cfi_endproc    
 1458 
 1459 .align  32
 1460 .Lbase2_64_avx2:
 1461 .cfi_startproc  
 1462         pushq   %rbx
 1463 .cfi_adjust_cfa_offset  8
 1464 .cfi_offset     %rbx,-16
 1465         pushq   %rbp
 1466 .cfi_adjust_cfa_offset  8
 1467 .cfi_offset     %rbp,-24
 1468         pushq   %r12
 1469 .cfi_adjust_cfa_offset  8
 1470 .cfi_offset     %r12,-32
 1471         pushq   %r13
 1472 .cfi_adjust_cfa_offset  8
 1473 .cfi_offset     %r13,-40
 1474         pushq   %r14
 1475 .cfi_adjust_cfa_offset  8
 1476 .cfi_offset     %r14,-48
 1477         pushq   %r15
 1478 .cfi_adjust_cfa_offset  8
 1479 .cfi_offset     %r15,-56
 1480 .Lbase2_64_avx2_body:
 1481 
 1482         movq    %rdx,%r15
 1483 
 1484         movq    24(%rdi),%r11
 1485         movq    32(%rdi),%r13
 1486 
 1487         movq    0(%rdi),%r14
 1488         movq    8(%rdi),%rbx
 1489         movl    16(%rdi),%ebp
 1490 
 1491         movq    %r13,%r12
 1492         movq    %r13,%rax
 1493         shrq    $2,%r13
 1494         addq    %r12,%r13
 1495 
 1496         testq   $63,%rdx
 1497         jz      .Linit_avx2
 1498 
 1499 .Lbase2_64_pre_avx2:
 1500         addq    0(%rsi),%r14
 1501         adcq    8(%rsi),%rbx
 1502         leaq    16(%rsi),%rsi
 1503         adcq    %rcx,%rbp
 1504         subq    $16,%r15
 1505 
 1506         call    __poly1305_block
 1507         movq    %r12,%rax
 1508 
 1509         testq   $63,%r15
 1510         jnz     .Lbase2_64_pre_avx2
 1511 
 1512 .Linit_avx2:
 1513 
 1514         movq    %r14,%rax
 1515         movq    %r14,%rdx
 1516         shrq    $52,%r14
 1517         movq    %rbx,%r8
 1518         movq    %rbx,%r9
 1519         shrq    $26,%rdx
 1520         andq    $0x3ffffff,%rax
 1521         shlq    $12,%r8
 1522         andq    $0x3ffffff,%rdx
 1523         shrq    $14,%rbx
 1524         orq     %r8,%r14
 1525         shlq    $24,%rbp
 1526         andq    $0x3ffffff,%r14
 1527         shrq    $40,%r9
 1528         andq    $0x3ffffff,%rbx
 1529         orq     %r9,%rbp
 1530 
 1531         vmovd   %eax,%xmm0
 1532         vmovd   %edx,%xmm1
 1533         vmovd   %r14d,%xmm2
 1534         vmovd   %ebx,%xmm3
 1535         vmovd   %ebp,%xmm4
 1536         movl    $1,20(%rdi)
 1537 
 1538         call    __poly1305_init_avx
 1539 
 1540 .Lproceed_avx2:
 1541         movq    %r15,%rdx
 1542         movl    OPENSSL_ia32cap_P+8(%rip),%r10d
 1543         movl    $3221291008,%r11d
 1544 
 1545         movq    0(%rsp),%r15
 1546 .cfi_restore    %r15
 1547         movq    8(%rsp),%r14
 1548 .cfi_restore    %r14
 1549         movq    16(%rsp),%r13
 1550 .cfi_restore    %r13
 1551         movq    24(%rsp),%r12
 1552 .cfi_restore    %r12
 1553         movq    32(%rsp),%rbp
 1554 .cfi_restore    %rbp
 1555         movq    40(%rsp),%rbx
 1556 .cfi_restore    %rbx
 1557         leaq    48(%rsp),%rax
 1558         leaq    48(%rsp),%rsp
 1559 .cfi_adjust_cfa_offset  -48
 1560 .Lbase2_64_avx2_epilogue:
 1561         jmp     .Ldo_avx2
 1562 .cfi_endproc    
 1563 
 1564 .align  32
 1565 .Leven_avx2:
 1566 .cfi_startproc  
 1567         movl    OPENSSL_ia32cap_P+8(%rip),%r10d
 1568         vmovd   0(%rdi),%xmm0
 1569         vmovd   4(%rdi),%xmm1
 1570         vmovd   8(%rdi),%xmm2
 1571         vmovd   12(%rdi),%xmm3
 1572         vmovd   16(%rdi),%xmm4
 1573 
 1574 .Ldo_avx2:
 1575         leaq    -8(%rsp),%r11
 1576 .cfi_def_cfa    %r11,16
 1577         subq    $0x128,%rsp
 1578         leaq    .Lconst(%rip),%rcx
 1579         leaq    48+64(%rdi),%rdi
 1580         vmovdqa 96(%rcx),%ymm7
 1581 
 1582 
 1583         vmovdqu -64(%rdi),%xmm9
 1584         andq    $-512,%rsp
 1585         vmovdqu -48(%rdi),%xmm10
 1586         vmovdqu -32(%rdi),%xmm6
 1587         vmovdqu -16(%rdi),%xmm11
 1588         vmovdqu 0(%rdi),%xmm12
 1589         vmovdqu 16(%rdi),%xmm13
 1590         leaq    144(%rsp),%rax
 1591         vmovdqu 32(%rdi),%xmm14
 1592         vpermd  %ymm9,%ymm7,%ymm9
 1593         vmovdqu 48(%rdi),%xmm15
 1594         vpermd  %ymm10,%ymm7,%ymm10
 1595         vmovdqu 64(%rdi),%xmm5
 1596         vpermd  %ymm6,%ymm7,%ymm6
 1597         vmovdqa %ymm9,0(%rsp)
 1598         vpermd  %ymm11,%ymm7,%ymm11
 1599         vmovdqa %ymm10,32-144(%rax)
 1600         vpermd  %ymm12,%ymm7,%ymm12
 1601         vmovdqa %ymm6,64-144(%rax)
 1602         vpermd  %ymm13,%ymm7,%ymm13
 1603         vmovdqa %ymm11,96-144(%rax)
 1604         vpermd  %ymm14,%ymm7,%ymm14
 1605         vmovdqa %ymm12,128-144(%rax)
 1606         vpermd  %ymm15,%ymm7,%ymm15
 1607         vmovdqa %ymm13,160-144(%rax)
 1608         vpermd  %ymm5,%ymm7,%ymm5
 1609         vmovdqa %ymm14,192-144(%rax)
 1610         vmovdqa %ymm15,224-144(%rax)
 1611         vmovdqa %ymm5,256-144(%rax)
 1612         vmovdqa 64(%rcx),%ymm5
 1613 
 1614 
 1615 
 1616         vmovdqu 0(%rsi),%xmm7
 1617         vmovdqu 16(%rsi),%xmm8
 1618         vinserti128     $1,32(%rsi),%ymm7,%ymm7
 1619         vinserti128     $1,48(%rsi),%ymm8,%ymm8
 1620         leaq    64(%rsi),%rsi
 1621 
 1622         vpsrldq $6,%ymm7,%ymm9
 1623         vpsrldq $6,%ymm8,%ymm10
 1624         vpunpckhqdq     %ymm8,%ymm7,%ymm6
 1625         vpunpcklqdq     %ymm10,%ymm9,%ymm9
 1626         vpunpcklqdq     %ymm8,%ymm7,%ymm7
 1627 
 1628         vpsrlq  $30,%ymm9,%ymm10
 1629         vpsrlq  $4,%ymm9,%ymm9
 1630         vpsrlq  $26,%ymm7,%ymm8
 1631         vpsrlq  $40,%ymm6,%ymm6
 1632         vpand   %ymm5,%ymm9,%ymm9
 1633         vpand   %ymm5,%ymm7,%ymm7
 1634         vpand   %ymm5,%ymm8,%ymm8
 1635         vpand   %ymm5,%ymm10,%ymm10
 1636         vpor    32(%rcx),%ymm6,%ymm6
 1637 
 1638         vpaddq  %ymm2,%ymm9,%ymm2
 1639         subq    $64,%rdx
 1640         jz      .Ltail_avx2
 1641         jmp     .Loop_avx2
 1642 
 1643 .align  32
 1644 .Loop_avx2:
 1645 
 1646 
 1647 
 1648 
 1649 
 1650 
 1651 
 1652 
 1653         vpaddq  %ymm0,%ymm7,%ymm0
 1654         vmovdqa 0(%rsp),%ymm7
 1655         vpaddq  %ymm1,%ymm8,%ymm1
 1656         vmovdqa 32(%rsp),%ymm8
 1657         vpaddq  %ymm3,%ymm10,%ymm3
 1658         vmovdqa 96(%rsp),%ymm9
 1659         vpaddq  %ymm4,%ymm6,%ymm4
 1660         vmovdqa 48(%rax),%ymm10
 1661         vmovdqa 112(%rax),%ymm5
 1662 
 1663 
 1664 
 1665 
 1666 
 1667 
 1668 
 1669 
 1670 
 1671 
 1672 
 1673 
 1674 
 1675 
 1676 
 1677 
 1678         vpmuludq        %ymm2,%ymm7,%ymm13
 1679         vpmuludq        %ymm2,%ymm8,%ymm14
 1680         vpmuludq        %ymm2,%ymm9,%ymm15
 1681         vpmuludq        %ymm2,%ymm10,%ymm11
 1682         vpmuludq        %ymm2,%ymm5,%ymm12
 1683 
 1684         vpmuludq        %ymm0,%ymm8,%ymm6
 1685         vpmuludq        %ymm1,%ymm8,%ymm2
 1686         vpaddq  %ymm6,%ymm12,%ymm12
 1687         vpaddq  %ymm2,%ymm13,%ymm13
 1688         vpmuludq        %ymm3,%ymm8,%ymm6
 1689         vpmuludq        64(%rsp),%ymm4,%ymm2
 1690         vpaddq  %ymm6,%ymm15,%ymm15
 1691         vpaddq  %ymm2,%ymm11,%ymm11
 1692         vmovdqa -16(%rax),%ymm8
 1693 
 1694         vpmuludq        %ymm0,%ymm7,%ymm6
 1695         vpmuludq        %ymm1,%ymm7,%ymm2
 1696         vpaddq  %ymm6,%ymm11,%ymm11
 1697         vpaddq  %ymm2,%ymm12,%ymm12
 1698         vpmuludq        %ymm3,%ymm7,%ymm6
 1699         vpmuludq        %ymm4,%ymm7,%ymm2
 1700         vmovdqu 0(%rsi),%xmm7
 1701         vpaddq  %ymm6,%ymm14,%ymm14
 1702         vpaddq  %ymm2,%ymm15,%ymm15
 1703         vinserti128     $1,32(%rsi),%ymm7,%ymm7
 1704 
 1705         vpmuludq        %ymm3,%ymm8,%ymm6
 1706         vpmuludq        %ymm4,%ymm8,%ymm2
 1707         vmovdqu 16(%rsi),%xmm8
 1708         vpaddq  %ymm6,%ymm11,%ymm11
 1709         vpaddq  %ymm2,%ymm12,%ymm12
 1710         vmovdqa 16(%rax),%ymm2
 1711         vpmuludq        %ymm1,%ymm9,%ymm6
 1712         vpmuludq        %ymm0,%ymm9,%ymm9
 1713         vpaddq  %ymm6,%ymm14,%ymm14
 1714         vpaddq  %ymm9,%ymm13,%ymm13
 1715         vinserti128     $1,48(%rsi),%ymm8,%ymm8
 1716         leaq    64(%rsi),%rsi
 1717 
 1718         vpmuludq        %ymm1,%ymm2,%ymm6
 1719         vpmuludq        %ymm0,%ymm2,%ymm2
 1720         vpsrldq $6,%ymm7,%ymm9
 1721         vpaddq  %ymm6,%ymm15,%ymm15
 1722         vpaddq  %ymm2,%ymm14,%ymm14
 1723         vpmuludq        %ymm3,%ymm10,%ymm6
 1724         vpmuludq        %ymm4,%ymm10,%ymm2
 1725         vpsrldq $6,%ymm8,%ymm10
 1726         vpaddq  %ymm6,%ymm12,%ymm12
 1727         vpaddq  %ymm2,%ymm13,%ymm13
 1728         vpunpckhqdq     %ymm8,%ymm7,%ymm6
 1729 
 1730         vpmuludq        %ymm3,%ymm5,%ymm3
 1731         vpmuludq        %ymm4,%ymm5,%ymm4
 1732         vpunpcklqdq     %ymm8,%ymm7,%ymm7
 1733         vpaddq  %ymm3,%ymm13,%ymm2
 1734         vpaddq  %ymm4,%ymm14,%ymm3
 1735         vpunpcklqdq     %ymm10,%ymm9,%ymm10
 1736         vpmuludq        80(%rax),%ymm0,%ymm4
 1737         vpmuludq        %ymm1,%ymm5,%ymm0
 1738         vmovdqa 64(%rcx),%ymm5
 1739         vpaddq  %ymm4,%ymm15,%ymm4
 1740         vpaddq  %ymm0,%ymm11,%ymm0
 1741 
 1742 
 1743 
 1744 
 1745         vpsrlq  $26,%ymm3,%ymm14
 1746         vpand   %ymm5,%ymm3,%ymm3
 1747         vpaddq  %ymm14,%ymm4,%ymm4
 1748 
 1749         vpsrlq  $26,%ymm0,%ymm11
 1750         vpand   %ymm5,%ymm0,%ymm0
 1751         vpaddq  %ymm11,%ymm12,%ymm1
 1752 
 1753         vpsrlq  $26,%ymm4,%ymm15
 1754         vpand   %ymm5,%ymm4,%ymm4
 1755 
 1756         vpsrlq  $4,%ymm10,%ymm9
 1757 
 1758         vpsrlq  $26,%ymm1,%ymm12
 1759         vpand   %ymm5,%ymm1,%ymm1
 1760         vpaddq  %ymm12,%ymm2,%ymm2
 1761 
 1762         vpaddq  %ymm15,%ymm0,%ymm0
 1763         vpsllq  $2,%ymm15,%ymm15
 1764         vpaddq  %ymm15,%ymm0,%ymm0
 1765 
 1766         vpand   %ymm5,%ymm9,%ymm9
 1767         vpsrlq  $26,%ymm7,%ymm8
 1768 
 1769         vpsrlq  $26,%ymm2,%ymm13
 1770         vpand   %ymm5,%ymm2,%ymm2
 1771         vpaddq  %ymm13,%ymm3,%ymm3
 1772 
 1773         vpaddq  %ymm9,%ymm2,%ymm2
 1774         vpsrlq  $30,%ymm10,%ymm10
 1775 
 1776         vpsrlq  $26,%ymm0,%ymm11
 1777         vpand   %ymm5,%ymm0,%ymm0
 1778         vpaddq  %ymm11,%ymm1,%ymm1
 1779 
 1780         vpsrlq  $40,%ymm6,%ymm6
 1781 
 1782         vpsrlq  $26,%ymm3,%ymm14
 1783         vpand   %ymm5,%ymm3,%ymm3
 1784         vpaddq  %ymm14,%ymm4,%ymm4
 1785 
 1786         vpand   %ymm5,%ymm7,%ymm7
 1787         vpand   %ymm5,%ymm8,%ymm8
 1788         vpand   %ymm5,%ymm10,%ymm10
 1789         vpor    32(%rcx),%ymm6,%ymm6
 1790 
 1791         subq    $64,%rdx
 1792         jnz     .Loop_avx2
 1793 
 1794 .byte   0x66,0x90
 1795 .Ltail_avx2:
 1796 
 1797 
 1798 
 1799 
 1800 
 1801 
 1802 
 1803         vpaddq  %ymm0,%ymm7,%ymm0
 1804         vmovdqu 4(%rsp),%ymm7
 1805         vpaddq  %ymm1,%ymm8,%ymm1
 1806         vmovdqu 36(%rsp),%ymm8
 1807         vpaddq  %ymm3,%ymm10,%ymm3
 1808         vmovdqu 100(%rsp),%ymm9
 1809         vpaddq  %ymm4,%ymm6,%ymm4
 1810         vmovdqu 52(%rax),%ymm10
 1811         vmovdqu 116(%rax),%ymm5
 1812 
 1813         vpmuludq        %ymm2,%ymm7,%ymm13
 1814         vpmuludq        %ymm2,%ymm8,%ymm14
 1815         vpmuludq        %ymm2,%ymm9,%ymm15
 1816         vpmuludq        %ymm2,%ymm10,%ymm11
 1817         vpmuludq        %ymm2,%ymm5,%ymm12
 1818 
 1819         vpmuludq        %ymm0,%ymm8,%ymm6
 1820         vpmuludq        %ymm1,%ymm8,%ymm2
 1821         vpaddq  %ymm6,%ymm12,%ymm12
 1822         vpaddq  %ymm2,%ymm13,%ymm13
 1823         vpmuludq        %ymm3,%ymm8,%ymm6
 1824         vpmuludq        68(%rsp),%ymm4,%ymm2
 1825         vpaddq  %ymm6,%ymm15,%ymm15
 1826         vpaddq  %ymm2,%ymm11,%ymm11
 1827 
 1828         vpmuludq        %ymm0,%ymm7,%ymm6
 1829         vpmuludq        %ymm1,%ymm7,%ymm2
 1830         vpaddq  %ymm6,%ymm11,%ymm11
 1831         vmovdqu -12(%rax),%ymm8
 1832         vpaddq  %ymm2,%ymm12,%ymm12
 1833         vpmuludq        %ymm3,%ymm7,%ymm6
 1834         vpmuludq        %ymm4,%ymm7,%ymm2
 1835         vpaddq  %ymm6,%ymm14,%ymm14
 1836         vpaddq  %ymm2,%ymm15,%ymm15
 1837 
 1838         vpmuludq        %ymm3,%ymm8,%ymm6
 1839         vpmuludq        %ymm4,%ymm8,%ymm2
 1840         vpaddq  %ymm6,%ymm11,%ymm11
 1841         vpaddq  %ymm2,%ymm12,%ymm12
 1842         vmovdqu 20(%rax),%ymm2
 1843         vpmuludq        %ymm1,%ymm9,%ymm6
 1844         vpmuludq        %ymm0,%ymm9,%ymm9
 1845         vpaddq  %ymm6,%ymm14,%ymm14
 1846         vpaddq  %ymm9,%ymm13,%ymm13
 1847 
 1848         vpmuludq        %ymm1,%ymm2,%ymm6
 1849         vpmuludq        %ymm0,%ymm2,%ymm2
 1850         vpaddq  %ymm6,%ymm15,%ymm15
 1851         vpaddq  %ymm2,%ymm14,%ymm14
 1852         vpmuludq        %ymm3,%ymm10,%ymm6
 1853         vpmuludq        %ymm4,%ymm10,%ymm2
 1854         vpaddq  %ymm6,%ymm12,%ymm12
 1855         vpaddq  %ymm2,%ymm13,%ymm13
 1856 
 1857         vpmuludq        %ymm3,%ymm5,%ymm3
 1858         vpmuludq        %ymm4,%ymm5,%ymm4
 1859         vpaddq  %ymm3,%ymm13,%ymm2
 1860         vpaddq  %ymm4,%ymm14,%ymm3
 1861         vpmuludq        84(%rax),%ymm0,%ymm4
 1862         vpmuludq        %ymm1,%ymm5,%ymm0
 1863         vmovdqa 64(%rcx),%ymm5
 1864         vpaddq  %ymm4,%ymm15,%ymm4
 1865         vpaddq  %ymm0,%ymm11,%ymm0
 1866 
 1867 
 1868 
 1869 
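/* Horizontal fold: vpsrldq $8 adds the odd 64-bit element of each 128-bit
   lane into the even one, then vpermq $0x2 brings the upper lane's partial
   sum down so the following vpaddq leaves the total of all four lanes in
   the low 64 bits of each accumulator. */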
 1870         vpsrldq $8,%ymm12,%ymm8
 1871         vpsrldq $8,%ymm2,%ymm9
 1872         vpsrldq $8,%ymm3,%ymm10
 1873         vpsrldq $8,%ymm4,%ymm6
 1874         vpsrldq $8,%ymm0,%ymm7
 1875         vpaddq  %ymm8,%ymm12,%ymm12
 1876         vpaddq  %ymm9,%ymm2,%ymm2
 1877         vpaddq  %ymm10,%ymm3,%ymm3
 1878         vpaddq  %ymm6,%ymm4,%ymm4
 1879         vpaddq  %ymm7,%ymm0,%ymm0
 1880 
 1881         vpermq  $0x2,%ymm3,%ymm10
 1882         vpermq  $0x2,%ymm4,%ymm6
 1883         vpermq  $0x2,%ymm0,%ymm7
 1884         vpermq  $0x2,%ymm12,%ymm8
 1885         vpermq  $0x2,%ymm2,%ymm9
 1886         vpaddq  %ymm10,%ymm3,%ymm3
 1887         vpaddq  %ymm6,%ymm4,%ymm4
 1888         vpaddq  %ymm7,%ymm0,%ymm0
 1889         vpaddq  %ymm8,%ymm12,%ymm12
 1890         vpaddq  %ymm9,%ymm2,%ymm2
 1891 
 1892 
 1893 
 1894 
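/* Final carry propagation back to canonical 26-bit limbs (the same
   shift/mask/times-5 pattern as in the loop), after which the five 32-bit
   limbs are written to the context with vmovd; the negative %rdi offsets
   suggest %rdi was advanced past the limb area earlier in the function. */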
 1895         vpsrlq  $26,%ymm3,%ymm14
 1896         vpand   %ymm5,%ymm3,%ymm3
 1897         vpaddq  %ymm14,%ymm4,%ymm4
 1898 
 1899         vpsrlq  $26,%ymm0,%ymm11
 1900         vpand   %ymm5,%ymm0,%ymm0
 1901         vpaddq  %ymm11,%ymm12,%ymm1
 1902 
 1903         vpsrlq  $26,%ymm4,%ymm15
 1904         vpand   %ymm5,%ymm4,%ymm4
 1905 
 1906         vpsrlq  $26,%ymm1,%ymm12
 1907         vpand   %ymm5,%ymm1,%ymm1
 1908         vpaddq  %ymm12,%ymm2,%ymm2
 1909 
 1910         vpaddq  %ymm15,%ymm0,%ymm0
 1911         vpsllq  $2,%ymm15,%ymm15
 1912         vpaddq  %ymm15,%ymm0,%ymm0
 1913 
 1914         vpsrlq  $26,%ymm2,%ymm13
 1915         vpand   %ymm5,%ymm2,%ymm2
 1916         vpaddq  %ymm13,%ymm3,%ymm3
 1917 
 1918         vpsrlq  $26,%ymm0,%ymm11
 1919         vpand   %ymm5,%ymm0,%ymm0
 1920         vpaddq  %ymm11,%ymm1,%ymm1
 1921 
 1922         vpsrlq  $26,%ymm3,%ymm14
 1923         vpand   %ymm5,%ymm3,%ymm3
 1924         vpaddq  %ymm14,%ymm4,%ymm4
 1925 
 1926         vmovd   %xmm0,-112(%rdi)
 1927         vmovd   %xmm1,-108(%rdi)
 1928         vmovd   %xmm2,-104(%rdi)
 1929         vmovd   %xmm3,-100(%rdi)
 1930         vmovd   %xmm4,-96(%rdi)
 1931         leaq    8(%r11),%rsp
 1932 .cfi_def_cfa    %rsp,8
 1933         vzeroupper
 1934         .byte   0xf3,0xc3
 1935 .cfi_endproc    
 1936 .size   poly1305_blocks_avx2,.-poly1305_blocks_avx2
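/* Constant pool shared by the vector code paths: a 24-bit mask, the value
   2^24 (.L129, which appears to serve as the 2^128 padding bit in the
   26-bit-limb layout), the 26-bit limb mask, and vpermd index vectors for
   the AVX2/AVX-512 lane shuffles. */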
 1937 .align  64
 1938 .Lconst:
 1939 .Lmask24:
 1940 .long   0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0
 1941 .L129:
 1942 .long   16777216,0,16777216,0,16777216,0,16777216,0
 1943 .Lmask26:
 1944 .long   0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0
 1945 .Lpermd_avx2:
 1946 .long   2,2,2,3,2,0,2,1
 1947 .Lpermd_avx512:
 1948 .long   0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7
 1949 
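/* The .L2_44_* tables and the 44/42-bit masks below appear to belong to a
   base 2^44 representation used by the generator's VPMADD52 (IFMA) code
   path; they are part of the generated constant area even though no such
   code appears in this section. */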
 1950 .L2_44_inp_permd:
 1951 .long   0,1,1,2,2,3,7,7
 1952 .L2_44_inp_shift:
 1953 .quad   0,12,24,64
 1954 .L2_44_mask:
 1955 .quad   0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff
 1956 .L2_44_shift_rgt:
 1957 .quad   44,44,42,64
 1958 .L2_44_shift_lft:
 1959 .quad   8,8,10,64
 1960 
 1961 .align  64
 1962 .Lx_mask44:
 1963 .quad   0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
 1964 .quad   0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
 1965 .Lx_mask42:
 1966 .quad   0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
 1967 .quad   0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
 1968 .byte   80,111,108,121,49,51,48,53,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
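/* The .byte string above is the NUL-terminated ASCII banner
   "Poly1305 for x86_64, CRYPTOGAMS by <appro@openssl.org>". */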
 1969 .align  16
 1970 .globl  xor128_encrypt_n_pad
 1971 .type   xor128_encrypt_n_pad,@function
 1972 .align  16
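/* xor128_encrypt_n_pad(%rdi = output, %rsi = input, %rdx = pad/key-stream
   buffer, %rcx = length): XORs len bytes of input with the buffer at %rdx,
   writes the result to the output and also back into that buffer
   (presumably so the ciphertext can be MACed in place), zero-pads the
   buffer to the next 16-byte boundary, and returns the advanced buffer
   pointer in %rax.  The initial subq of %rdx from %rsi/%rdi lets the single
   advancing index %rdx address all three buffers. */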
 1973 xor128_encrypt_n_pad:
 1974 .cfi_startproc  
 1975         subq    %rdx,%rsi
 1976         subq    %rdx,%rdi
 1977         movq    %rcx,%r10
 1978         shrq    $4,%rcx
 1979         jz      .Ltail_enc
 1980         nop
 1981 .Loop_enc_xmm:
 1982         movdqu  (%rsi,%rdx,1),%xmm0
 1983         pxor    (%rdx),%xmm0
 1984         movdqu  %xmm0,(%rdi,%rdx,1)
 1985         movdqa  %xmm0,(%rdx)
 1986         leaq    16(%rdx),%rdx
 1987         decq    %rcx
 1988         jnz     .Loop_enc_xmm
 1989 
 1990         andq    $15,%r10
 1991         jz      .Ldone_enc
 1992 
 1993 .Ltail_enc:
 1994         movq    $16,%rcx
 1995         subq    %r10,%rcx
 1996         xorl    %eax,%eax
 1997 .Loop_enc_byte:
 1998         movb    (%rsi,%rdx,1),%al
 1999         xorb    (%rdx),%al
 2000         movb    %al,(%rdi,%rdx,1)
 2001         movb    %al,(%rdx)
 2002         leaq    1(%rdx),%rdx
 2003         decq    %r10
 2004         jnz     .Loop_enc_byte
 2005 
 2006         xorl    %eax,%eax
 2007 .Loop_enc_pad:
 2008         movb    %al,(%rdx)
 2009         leaq    1(%rdx),%rdx
 2010         decq    %rcx
 2011         jnz     .Loop_enc_pad
 2012 
 2013 .Ldone_enc:
 2014         movq    %rdx,%rax
 2015         .byte   0xf3,0xc3
 2016 .cfi_endproc    
 2017 .size   xor128_encrypt_n_pad,.-xor128_encrypt_n_pad
 2018 
 2019 .globl  xor128_decrypt_n_pad
 2020 .type   xor128_decrypt_n_pad,@function
 2021 .align  16
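/* xor128_decrypt_n_pad: same calling convention as the encrypt variant,
   but here the XOR result (plaintext, in %xmm1 / %al) goes only to the
   output buffer while the original input bytes are kept in the third
   buffer, so the data left for MACing is still the ciphertext; the tail is
   again zero-padded to 16 bytes and the advanced pointer returned in %rax. */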
 2022 xor128_decrypt_n_pad:
 2023 .cfi_startproc  
 2024         subq    %rdx,%rsi
 2025         subq    %rdx,%rdi
 2026         movq    %rcx,%r10
 2027         shrq    $4,%rcx
 2028         jz      .Ltail_dec
 2029         nop
 2030 .Loop_dec_xmm:
 2031         movdqu  (%rsi,%rdx,1),%xmm0
 2032         movdqa  (%rdx),%xmm1
 2033         pxor    %xmm0,%xmm1
 2034         movdqu  %xmm1,(%rdi,%rdx,1)
 2035         movdqa  %xmm0,(%rdx)
 2036         leaq    16(%rdx),%rdx
 2037         decq    %rcx
 2038         jnz     .Loop_dec_xmm
 2039 
 2040         pxor    %xmm1,%xmm1
 2041         andq    $15,%r10
 2042         jz      .Ldone_dec
 2043 
 2044 .Ltail_dec:
 2045         movq    $16,%rcx
 2046         subq    %r10,%rcx
 2047         xorl    %eax,%eax
 2048         xorq    %r11,%r11
 2049 .Loop_dec_byte:
 2050         movb    (%rsi,%rdx,1),%r11b
 2051         movb    (%rdx),%al
 2052         xorb    %r11b,%al
 2053         movb    %al,(%rdi,%rdx,1)
 2054         movb    %r11b,(%rdx)
 2055         leaq    1(%rdx),%rdx
 2056         decq    %r10
 2057         jnz     .Loop_dec_byte
 2058 
 2059         xorl    %eax,%eax
 2060 .Loop_dec_pad:
 2061         movb    %al,(%rdx)
 2062         leaq    1(%rdx),%rdx
 2063         decq    %rcx
 2064         jnz     .Loop_dec_pad
 2065 
 2066 .Ldone_dec:
 2067         movq    %rdx,%rax
 2068         .byte   0xf3,0xc3
 2069 .cfi_endproc    
 2070 .size   xor128_decrypt_n_pad,.-xor128_decrypt_n_pad
