FreeBSD/Linux Kernel Cross Reference
sys/crypto/openssl/amd64/chacha-x86_64.S


    1 /* $FreeBSD$ */
    2 /* Do not modify. This file is auto-generated from chacha-x86_64.pl. */
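/*
 * ChaCha20 stream cipher for x86_64, produced by the CRYPTOGAMS perlasm
 * framework (chacha-x86_64.pl).  The file carries several implementations;
 * ChaCha20_ctr32() picks one at run time from the OPENSSL_ia32cap_P
 * capability vector: a scalar integer-register path, an SSSE3 one-block
 * path, a fixed two-block path for 128-byte inputs, a four-block SSE path,
 * an AMD XOP variant, and an AVX2 eight-block path.
 */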
    3 .text   
    4 
    5 
    6 
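/*
 * Constant tables: block-counter increments of various widths and strides
 * (.Lone, .Linc, .Lfour, .Lincy, .Leight, ...), the byte-shuffle masks
 * .Lrot16/.Lrot24 used with pshufb to rotate 32-bit lanes left by 16 and
 * by 8 bits, and the ChaCha "expand 32-byte k" constant at .Lsigma.
 */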
    7 .align  64
    8 .Lzero:
    9 .long   0,0,0,0
   10 .Lone:
   11 .long   1,0,0,0
   12 .Linc:
   13 .long   0,1,2,3
   14 .Lfour:
   15 .long   4,4,4,4
   16 .Lincy:
   17 .long   0,2,4,6,1,3,5,7
   18 .Leight:
   19 .long   8,8,8,8,8,8,8,8
   20 .Lrot16:
   21 .byte   0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd
   22 .Lrot24:
   23 .byte   0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
   24 .Ltwoy:
   25 .long   2,0,0,0, 2,0,0,0
   26 .align  64
   27 .Lzeroz:
   28 .long   0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0
   29 .Lfourz:
   30 .long   4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0
   31 .Lincz:
   32 .long   0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
   33 .Lsixteen:
   34 .long   16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
   35 .Lsigma:
   36 .byte   101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107,0
   37 .byte   67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
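/*
 * ChaCha20_ctr32(out = %rdi, inp = %rsi, len = %rdx, key = %rcx,
 *                counter = %r8)
 *
 * Scalar (integer-register) implementation.  A zero length returns
 * immediately; if the SSSE3 bit is set in OPENSSL_ia32cap_P the routine
 * branches to the SIMD code further down.
 */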
   38 .globl  ChaCha20_ctr32
   39 .type   ChaCha20_ctr32,@function
   40 .align  64
   41 ChaCha20_ctr32:
   42 .cfi_startproc  
   43         cmpq    $0,%rdx
   44         je      .Lno_data
   45         movq    OPENSSL_ia32cap_P+4(%rip),%r10
   46         testl   $512,%r10d
   47         jnz     .LChaCha20_ssse3
   48 
   49         pushq   %rbx
   50 .cfi_adjust_cfa_offset  8
   51 .cfi_offset     %rbx,-16
   52         pushq   %rbp
   53 .cfi_adjust_cfa_offset  8
   54 .cfi_offset     %rbp,-24
   55         pushq   %r12
   56 .cfi_adjust_cfa_offset  8
   57 .cfi_offset     %r12,-32
   58         pushq   %r13
   59 .cfi_adjust_cfa_offset  8
   60 .cfi_offset     %r13,-40
   61         pushq   %r14
   62 .cfi_adjust_cfa_offset  8
   63 .cfi_offset     %r14,-48
   64         pushq   %r15
   65 .cfi_adjust_cfa_offset  8
   66 .cfi_offset     %r15,-56
   67         subq    $64+24,%rsp
   68 .cfi_adjust_cfa_offset  64+24
   69 .Lctr32_body:
   70 
   71 
   72         movdqu  (%rcx),%xmm1
   73         movdqu  16(%rcx),%xmm2
   74         movdqu  (%r8),%xmm3
   75         movdqa  .Lone(%rip),%xmm4
   76 
   77 
   78         movdqa  %xmm1,16(%rsp)
   79         movdqa  %xmm2,32(%rsp)
   80         movdqa  %xmm3,48(%rsp)
   81         movq    %rdx,%rbp
   82         jmp     .Loop_outer
   83 
   84 .align  32
   85 .Loop_outer:
   86         movl    $0x61707865,%eax
   87         movl    $0x3320646e,%ebx
   88         movl    $0x79622d32,%ecx
   89         movl    $0x6b206574,%edx
   90         movl    16(%rsp),%r8d
   91         movl    20(%rsp),%r9d
   92         movl    24(%rsp),%r10d
   93         movl    28(%rsp),%r11d
   94         movd    %xmm3,%r12d
   95         movl    52(%rsp),%r13d
   96         movl    56(%rsp),%r14d
   97         movl    60(%rsp),%r15d
   98 
   99         movq    %rbp,64+0(%rsp)
  100         movl    $10,%ebp
  101         movq    %rsi,64+8(%rsp)
  102 .byte   102,72,15,126,214
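/* the .byte sequence above encodes movq %xmm2,%rsi (state words 8..9 into %rsi) */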
  103         movq    %rdi,64+16(%rsp)
  104         movq    %rsi,%rdi
  105         shrq    $32,%rdi
  106         jmp     .Loop
  107 
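/*
 * Inner loop: each pass is one ChaCha double round (four column
 * quarter-rounds, then four diagonal quarter-rounds); %ebp counts ten
 * passes for the full 20 rounds.  A quarter-round on (a,b,c,d) is
 *   a += b; d ^= a; d <<<= 16;   c += d; b ^= c; b <<<= 12;
 *   a += b; d ^= a; d <<<=  8;   c += d; b ^= c; b <<<=  7;
 * The third row of the state (words 8..11) is shared between %esi/%edi
 * and 32..44(%rsp), since sixteen words do not fit in the available
 * general-purpose registers.
 */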
  108 .align  32
  109 .Loop:
  110         addl    %r8d,%eax
  111         xorl    %eax,%r12d
  112         roll    $16,%r12d
  113         addl    %r9d,%ebx
  114         xorl    %ebx,%r13d
  115         roll    $16,%r13d
  116         addl    %r12d,%esi
  117         xorl    %esi,%r8d
  118         roll    $12,%r8d
  119         addl    %r13d,%edi
  120         xorl    %edi,%r9d
  121         roll    $12,%r9d
  122         addl    %r8d,%eax
  123         xorl    %eax,%r12d
  124         roll    $8,%r12d
  125         addl    %r9d,%ebx
  126         xorl    %ebx,%r13d
  127         roll    $8,%r13d
  128         addl    %r12d,%esi
  129         xorl    %esi,%r8d
  130         roll    $7,%r8d
  131         addl    %r13d,%edi
  132         xorl    %edi,%r9d
  133         roll    $7,%r9d
  134         movl    %esi,32(%rsp)
  135         movl    %edi,36(%rsp)
  136         movl    40(%rsp),%esi
  137         movl    44(%rsp),%edi
  138         addl    %r10d,%ecx
  139         xorl    %ecx,%r14d
  140         roll    $16,%r14d
  141         addl    %r11d,%edx
  142         xorl    %edx,%r15d
  143         roll    $16,%r15d
  144         addl    %r14d,%esi
  145         xorl    %esi,%r10d
  146         roll    $12,%r10d
  147         addl    %r15d,%edi
  148         xorl    %edi,%r11d
  149         roll    $12,%r11d
  150         addl    %r10d,%ecx
  151         xorl    %ecx,%r14d
  152         roll    $8,%r14d
  153         addl    %r11d,%edx
  154         xorl    %edx,%r15d
  155         roll    $8,%r15d
  156         addl    %r14d,%esi
  157         xorl    %esi,%r10d
  158         roll    $7,%r10d
  159         addl    %r15d,%edi
  160         xorl    %edi,%r11d
  161         roll    $7,%r11d
  162         addl    %r9d,%eax
  163         xorl    %eax,%r15d
  164         roll    $16,%r15d
  165         addl    %r10d,%ebx
  166         xorl    %ebx,%r12d
  167         roll    $16,%r12d
  168         addl    %r15d,%esi
  169         xorl    %esi,%r9d
  170         roll    $12,%r9d
  171         addl    %r12d,%edi
  172         xorl    %edi,%r10d
  173         roll    $12,%r10d
  174         addl    %r9d,%eax
  175         xorl    %eax,%r15d
  176         roll    $8,%r15d
  177         addl    %r10d,%ebx
  178         xorl    %ebx,%r12d
  179         roll    $8,%r12d
  180         addl    %r15d,%esi
  181         xorl    %esi,%r9d
  182         roll    $7,%r9d
  183         addl    %r12d,%edi
  184         xorl    %edi,%r10d
  185         roll    $7,%r10d
  186         movl    %esi,40(%rsp)
  187         movl    %edi,44(%rsp)
  188         movl    32(%rsp),%esi
  189         movl    36(%rsp),%edi
  190         addl    %r11d,%ecx
  191         xorl    %ecx,%r13d
  192         roll    $16,%r13d
  193         addl    %r8d,%edx
  194         xorl    %edx,%r14d
  195         roll    $16,%r14d
  196         addl    %r13d,%esi
  197         xorl    %esi,%r11d
  198         roll    $12,%r11d
  199         addl    %r14d,%edi
  200         xorl    %edi,%r8d
  201         roll    $12,%r8d
  202         addl    %r11d,%ecx
  203         xorl    %ecx,%r13d
  204         roll    $8,%r13d
  205         addl    %r8d,%edx
  206         xorl    %edx,%r14d
  207         roll    $8,%r14d
  208         addl    %r13d,%esi
  209         xorl    %esi,%r11d
  210         roll    $7,%r11d
  211         addl    %r14d,%edi
  212         xorl    %edi,%r8d
  213         roll    $7,%r8d
  214         decl    %ebp
  215         jnz     .Loop
  216         movl    %edi,36(%rsp)
  217         movl    %esi,32(%rsp)
  218         movq    64(%rsp),%rbp
  219         movdqa  %xmm2,%xmm1
  220         movq    64+8(%rsp),%rsi
  221         paddd   %xmm4,%xmm3
  222         movq    64+16(%rsp),%rdi
  223 
  224         addl    $0x61707865,%eax
  225         addl    $0x3320646e,%ebx
  226         addl    $0x79622d32,%ecx
  227         addl    $0x6b206574,%edx
  228         addl    16(%rsp),%r8d
  229         addl    20(%rsp),%r9d
  230         addl    24(%rsp),%r10d
  231         addl    28(%rsp),%r11d
  232         addl    48(%rsp),%r12d
  233         addl    52(%rsp),%r13d
  234         addl    56(%rsp),%r14d
  235         addl    60(%rsp),%r15d
  236         paddd   32(%rsp),%xmm1
  237 
  238         cmpq    $64,%rbp
  239         jb      .Ltail
  240 
  241         xorl    0(%rsi),%eax
  242         xorl    4(%rsi),%ebx
  243         xorl    8(%rsi),%ecx
  244         xorl    12(%rsi),%edx
  245         xorl    16(%rsi),%r8d
  246         xorl    20(%rsi),%r9d
  247         xorl    24(%rsi),%r10d
  248         xorl    28(%rsi),%r11d
  249         movdqu  32(%rsi),%xmm0
  250         xorl    48(%rsi),%r12d
  251         xorl    52(%rsi),%r13d
  252         xorl    56(%rsi),%r14d
  253         xorl    60(%rsi),%r15d
  254         leaq    64(%rsi),%rsi
  255         pxor    %xmm1,%xmm0
  256 
  257         movdqa  %xmm2,32(%rsp)
  258         movd    %xmm3,48(%rsp)
  259 
  260         movl    %eax,0(%rdi)
  261         movl    %ebx,4(%rdi)
  262         movl    %ecx,8(%rdi)
  263         movl    %edx,12(%rdi)
  264         movl    %r8d,16(%rdi)
  265         movl    %r9d,20(%rdi)
  266         movl    %r10d,24(%rdi)
  267         movl    %r11d,28(%rdi)
  268         movdqu  %xmm0,32(%rdi)
  269         movl    %r12d,48(%rdi)
  270         movl    %r13d,52(%rdi)
  271         movl    %r14d,56(%rdi)
  272         movl    %r15d,60(%rdi)
  273         leaq    64(%rdi),%rdi
  274 
  275         subq    $64,%rbp
  276         jnz     .Loop_outer
  277 
  278         jmp     .Ldone
  279 
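/*
 * Tail: fewer than 64 bytes remain.  The final keystream block is spilled
 * to the stack and XORed into the output one byte at a time.
 */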
  280 .align  16
  281 .Ltail:
  282         movl    %eax,0(%rsp)
  283         movl    %ebx,4(%rsp)
  284         xorq    %rbx,%rbx
  285         movl    %ecx,8(%rsp)
  286         movl    %edx,12(%rsp)
  287         movl    %r8d,16(%rsp)
  288         movl    %r9d,20(%rsp)
  289         movl    %r10d,24(%rsp)
  290         movl    %r11d,28(%rsp)
  291         movdqa  %xmm1,32(%rsp)
  292         movl    %r12d,48(%rsp)
  293         movl    %r13d,52(%rsp)
  294         movl    %r14d,56(%rsp)
  295         movl    %r15d,60(%rsp)
  296 
  297 .Loop_tail:
  298         movzbl  (%rsi,%rbx,1),%eax
  299         movzbl  (%rsp,%rbx,1),%edx
  300         leaq    1(%rbx),%rbx
  301         xorl    %edx,%eax
  302         movb    %al,-1(%rdi,%rbx,1)
  303         decq    %rbp
  304         jnz     .Loop_tail
  305 
  306 .Ldone:
  307         leaq    64+24+48(%rsp),%rsi
  308 .cfi_def_cfa    %rsi,8
  309         movq    -48(%rsi),%r15
  310 .cfi_restore    %r15
  311         movq    -40(%rsi),%r14
  312 .cfi_restore    %r14
  313         movq    -32(%rsi),%r13
  314 .cfi_restore    %r13
  315         movq    -24(%rsi),%r12
  316 .cfi_restore    %r12
  317         movq    -16(%rsi),%rbp
  318 .cfi_restore    %rbp
  319         movq    -8(%rsi),%rbx
  320 .cfi_restore    %rbx
  321         leaq    (%rsi),%rsp
  322 .cfi_def_cfa_register   %rsp
  323 .Lno_data:
  324         .byte   0xf3,0xc3
  325 .cfi_endproc    
  326 .size   ChaCha20_ctr32,.-ChaCha20_ctr32
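/*
 * ChaCha20_ssse3: one-block SSSE3 path.  Each xmm register holds one row
 * of the state; rotations by 16 and 8 bits use pshufb with .Lrot16/.Lrot24
 * (the .byte 102,15,56,0,... sequences), rotations by 12 and 7 use
 * shift-and-or, and pshufd $57/$78/$147 re-diagonalizes the rows between
 * the column and diagonal halves of each double round.  Inputs shorter
 * than 128 bytes are handled here one 64-byte block at a time; exactly
 * 128 bytes goes to ChaCha20_128 and longer inputs to ChaCha20_4x.
 */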
  327 .type   ChaCha20_ssse3,@function
  328 .align  32
  329 ChaCha20_ssse3:
  330 .cfi_startproc  
  331 .LChaCha20_ssse3:
  332         movq    %rsp,%r9
  333 .cfi_def_cfa_register   %r9
  334         testl   $2048,%r10d
  335         jnz     .LChaCha20_4xop
  336         cmpq    $128,%rdx
  337         je      .LChaCha20_128
  338         ja      .LChaCha20_4x
  339 
  340 .Ldo_sse3_after_all:
  341         subq    $64+8,%rsp
  342         movdqa  .Lsigma(%rip),%xmm0
  343         movdqu  (%rcx),%xmm1
  344         movdqu  16(%rcx),%xmm2
  345         movdqu  (%r8),%xmm3
  346         movdqa  .Lrot16(%rip),%xmm6
  347         movdqa  .Lrot24(%rip),%xmm7
  348 
  349         movdqa  %xmm0,0(%rsp)
  350         movdqa  %xmm1,16(%rsp)
  351         movdqa  %xmm2,32(%rsp)
  352         movdqa  %xmm3,48(%rsp)
  353         movq    $10,%r8
  354         jmp     .Loop_ssse3
  355 
  356 .align  32
  357 .Loop_outer_ssse3:
  358         movdqa  .Lone(%rip),%xmm3
  359         movdqa  0(%rsp),%xmm0
  360         movdqa  16(%rsp),%xmm1
  361         movdqa  32(%rsp),%xmm2
  362         paddd   48(%rsp),%xmm3
  363         movq    $10,%r8
  364         movdqa  %xmm3,48(%rsp)
  365         jmp     .Loop_ssse3
  366 
  367 .align  32
  368 .Loop_ssse3:
  369         paddd   %xmm1,%xmm0
  370         pxor    %xmm0,%xmm3
  371 .byte   102,15,56,0,222
  372         paddd   %xmm3,%xmm2
  373         pxor    %xmm2,%xmm1
  374         movdqa  %xmm1,%xmm4
  375         psrld   $20,%xmm1
  376         pslld   $12,%xmm4
  377         por     %xmm4,%xmm1
  378         paddd   %xmm1,%xmm0
  379         pxor    %xmm0,%xmm3
  380 .byte   102,15,56,0,223
  381         paddd   %xmm3,%xmm2
  382         pxor    %xmm2,%xmm1
  383         movdqa  %xmm1,%xmm4
  384         psrld   $25,%xmm1
  385         pslld   $7,%xmm4
  386         por     %xmm4,%xmm1
  387         pshufd  $78,%xmm2,%xmm2
  388         pshufd  $57,%xmm1,%xmm1
  389         pshufd  $147,%xmm3,%xmm3
  390         nop
  391         paddd   %xmm1,%xmm0
  392         pxor    %xmm0,%xmm3
  393 .byte   102,15,56,0,222
  394         paddd   %xmm3,%xmm2
  395         pxor    %xmm2,%xmm1
  396         movdqa  %xmm1,%xmm4
  397         psrld   $20,%xmm1
  398         pslld   $12,%xmm4
  399         por     %xmm4,%xmm1
  400         paddd   %xmm1,%xmm0
  401         pxor    %xmm0,%xmm3
  402 .byte   102,15,56,0,223
  403         paddd   %xmm3,%xmm2
  404         pxor    %xmm2,%xmm1
  405         movdqa  %xmm1,%xmm4
  406         psrld   $25,%xmm1
  407         pslld   $7,%xmm4
  408         por     %xmm4,%xmm1
  409         pshufd  $78,%xmm2,%xmm2
  410         pshufd  $147,%xmm1,%xmm1
  411         pshufd  $57,%xmm3,%xmm3
  412         decq    %r8
  413         jnz     .Loop_ssse3
  414         paddd   0(%rsp),%xmm0
  415         paddd   16(%rsp),%xmm1
  416         paddd   32(%rsp),%xmm2
  417         paddd   48(%rsp),%xmm3
  418 
  419         cmpq    $64,%rdx
  420         jb      .Ltail_ssse3
  421 
  422         movdqu  0(%rsi),%xmm4
  423         movdqu  16(%rsi),%xmm5
  424         pxor    %xmm4,%xmm0
  425         movdqu  32(%rsi),%xmm4
  426         pxor    %xmm5,%xmm1
  427         movdqu  48(%rsi),%xmm5
  428         leaq    64(%rsi),%rsi
  429         pxor    %xmm4,%xmm2
  430         pxor    %xmm5,%xmm3
  431 
  432         movdqu  %xmm0,0(%rdi)
  433         movdqu  %xmm1,16(%rdi)
  434         movdqu  %xmm2,32(%rdi)
  435         movdqu  %xmm3,48(%rdi)
  436         leaq    64(%rdi),%rdi
  437 
  438         subq    $64,%rdx
  439         jnz     .Loop_outer_ssse3
  440 
  441         jmp     .Ldone_ssse3
  442 
  443 .align  16
  444 .Ltail_ssse3:
  445         movdqa  %xmm0,0(%rsp)
  446         movdqa  %xmm1,16(%rsp)
  447         movdqa  %xmm2,32(%rsp)
  448         movdqa  %xmm3,48(%rsp)
  449         xorq    %r8,%r8
  450 
  451 .Loop_tail_ssse3:
  452         movzbl  (%rsi,%r8,1),%eax
  453         movzbl  (%rsp,%r8,1),%ecx
  454         leaq    1(%r8),%r8
  455         xorl    %ecx,%eax
  456         movb    %al,-1(%rdi,%r8,1)
  457         decq    %rdx
  458         jnz     .Loop_tail_ssse3
  459 
  460 .Ldone_ssse3:
  461         leaq    (%r9),%rsp
  462 .cfi_def_cfa_register   %rsp
  463 .Lssse3_epilogue:
  464         .byte   0xf3,0xc3
  465 .cfi_endproc    
  466 .size   ChaCha20_ssse3,.-ChaCha20_ssse3
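/*
 * ChaCha20_128: SSSE3 path for inputs of exactly 128 bytes; two blocks
 * are computed in parallel in separate xmm register sets.
 */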
  467 .type   ChaCha20_128,@function
  468 .align  32
  469 ChaCha20_128:
  470 .cfi_startproc  
  471 .LChaCha20_128:
  472         movq    %rsp,%r9
  473 .cfi_def_cfa_register   %r9
  474         subq    $64+8,%rsp
  475         movdqa  .Lsigma(%rip),%xmm8
  476         movdqu  (%rcx),%xmm9
  477         movdqu  16(%rcx),%xmm2
  478         movdqu  (%r8),%xmm3
  479         movdqa  .Lone(%rip),%xmm1
  480         movdqa  .Lrot16(%rip),%xmm6
  481         movdqa  .Lrot24(%rip),%xmm7
  482 
  483         movdqa  %xmm8,%xmm10
  484         movdqa  %xmm8,0(%rsp)
  485         movdqa  %xmm9,%xmm11
  486         movdqa  %xmm9,16(%rsp)
  487         movdqa  %xmm2,%xmm0
  488         movdqa  %xmm2,32(%rsp)
  489         paddd   %xmm3,%xmm1
  490         movdqa  %xmm3,48(%rsp)
  491         movq    $10,%r8
  492         jmp     .Loop_128
  493 
  494 .align  32
  495 .Loop_128:
  496         paddd   %xmm9,%xmm8
  497         pxor    %xmm8,%xmm3
  498         paddd   %xmm11,%xmm10
  499         pxor    %xmm10,%xmm1
  500 .byte   102,15,56,0,222
  501 .byte   102,15,56,0,206
  502         paddd   %xmm3,%xmm2
  503         paddd   %xmm1,%xmm0
  504         pxor    %xmm2,%xmm9
  505         pxor    %xmm0,%xmm11
  506         movdqa  %xmm9,%xmm4
  507         psrld   $20,%xmm9
  508         movdqa  %xmm11,%xmm5
  509         pslld   $12,%xmm4
  510         psrld   $20,%xmm11
  511         por     %xmm4,%xmm9
  512         pslld   $12,%xmm5
  513         por     %xmm5,%xmm11
  514         paddd   %xmm9,%xmm8
  515         pxor    %xmm8,%xmm3
  516         paddd   %xmm11,%xmm10
  517         pxor    %xmm10,%xmm1
  518 .byte   102,15,56,0,223
  519 .byte   102,15,56,0,207
  520         paddd   %xmm3,%xmm2
  521         paddd   %xmm1,%xmm0
  522         pxor    %xmm2,%xmm9
  523         pxor    %xmm0,%xmm11
  524         movdqa  %xmm9,%xmm4
  525         psrld   $25,%xmm9
  526         movdqa  %xmm11,%xmm5
  527         pslld   $7,%xmm4
  528         psrld   $25,%xmm11
  529         por     %xmm4,%xmm9
  530         pslld   $7,%xmm5
  531         por     %xmm5,%xmm11
  532         pshufd  $78,%xmm2,%xmm2
  533         pshufd  $57,%xmm9,%xmm9
  534         pshufd  $147,%xmm3,%xmm3
  535         pshufd  $78,%xmm0,%xmm0
  536         pshufd  $57,%xmm11,%xmm11
  537         pshufd  $147,%xmm1,%xmm1
  538         paddd   %xmm9,%xmm8
  539         pxor    %xmm8,%xmm3
  540         paddd   %xmm11,%xmm10
  541         pxor    %xmm10,%xmm1
  542 .byte   102,15,56,0,222
  543 .byte   102,15,56,0,206
  544         paddd   %xmm3,%xmm2
  545         paddd   %xmm1,%xmm0
  546         pxor    %xmm2,%xmm9
  547         pxor    %xmm0,%xmm11
  548         movdqa  %xmm9,%xmm4
  549         psrld   $20,%xmm9
  550         movdqa  %xmm11,%xmm5
  551         pslld   $12,%xmm4
  552         psrld   $20,%xmm11
  553         por     %xmm4,%xmm9
  554         pslld   $12,%xmm5
  555         por     %xmm5,%xmm11
  556         paddd   %xmm9,%xmm8
  557         pxor    %xmm8,%xmm3
  558         paddd   %xmm11,%xmm10
  559         pxor    %xmm10,%xmm1
  560 .byte   102,15,56,0,223
  561 .byte   102,15,56,0,207
  562         paddd   %xmm3,%xmm2
  563         paddd   %xmm1,%xmm0
  564         pxor    %xmm2,%xmm9
  565         pxor    %xmm0,%xmm11
  566         movdqa  %xmm9,%xmm4
  567         psrld   $25,%xmm9
  568         movdqa  %xmm11,%xmm5
  569         pslld   $7,%xmm4
  570         psrld   $25,%xmm11
  571         por     %xmm4,%xmm9
  572         pslld   $7,%xmm5
  573         por     %xmm5,%xmm11
  574         pshufd  $78,%xmm2,%xmm2
  575         pshufd  $147,%xmm9,%xmm9
  576         pshufd  $57,%xmm3,%xmm3
  577         pshufd  $78,%xmm0,%xmm0
  578         pshufd  $147,%xmm11,%xmm11
  579         pshufd  $57,%xmm1,%xmm1
  580         decq    %r8
  581         jnz     .Loop_128
  582         paddd   0(%rsp),%xmm8
  583         paddd   16(%rsp),%xmm9
  584         paddd   32(%rsp),%xmm2
  585         paddd   48(%rsp),%xmm3
  586         paddd   .Lone(%rip),%xmm1
  587         paddd   0(%rsp),%xmm10
  588         paddd   16(%rsp),%xmm11
  589         paddd   32(%rsp),%xmm0
  590         paddd   48(%rsp),%xmm1
  591 
  592         movdqu  0(%rsi),%xmm4
  593         movdqu  16(%rsi),%xmm5
  594         pxor    %xmm4,%xmm8
  595         movdqu  32(%rsi),%xmm4
  596         pxor    %xmm5,%xmm9
  597         movdqu  48(%rsi),%xmm5
  598         pxor    %xmm4,%xmm2
  599         movdqu  64(%rsi),%xmm4
  600         pxor    %xmm5,%xmm3
  601         movdqu  80(%rsi),%xmm5
  602         pxor    %xmm4,%xmm10
  603         movdqu  96(%rsi),%xmm4
  604         pxor    %xmm5,%xmm11
  605         movdqu  112(%rsi),%xmm5
  606         pxor    %xmm4,%xmm0
  607         pxor    %xmm5,%xmm1
  608 
  609         movdqu  %xmm8,0(%rdi)
  610         movdqu  %xmm9,16(%rdi)
  611         movdqu  %xmm2,32(%rdi)
  612         movdqu  %xmm3,48(%rdi)
  613         movdqu  %xmm10,64(%rdi)
  614         movdqu  %xmm11,80(%rdi)
  615         movdqu  %xmm0,96(%rdi)
  616         movdqu  %xmm1,112(%rdi)
  617         leaq    (%r9),%rsp
  618 .cfi_def_cfa_register   %rsp
  619 .L128_epilogue:
  620         .byte   0xf3,0xc3
  621 .cfi_endproc    
  622 .size   ChaCha20_128,.-ChaCha20_128
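/*
 * ChaCha20_4x: four blocks in parallel.  The state is transposed so that
 * each xmm register holds the same state word across four consecutive
 * blocks (counters from .Linc, advanced by .Lfour), producing 256 bytes
 * of keystream per outer iteration.  If the AVX2 bit is set the routine
 * forwards to ChaCha20_8x; for inputs of at most 192 bytes a check for
 * MOVBE-without-XSAVE (seemingly to spot Atom-class cores) falls back to
 * the one-block SSSE3 code.
 */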
  623 .type   ChaCha20_4x,@function
  624 .align  32
  625 ChaCha20_4x:
  626 .cfi_startproc  
  627 .LChaCha20_4x:
  628         movq    %rsp,%r9
  629 .cfi_def_cfa_register   %r9
  630         movq    %r10,%r11
  631         shrq    $32,%r10
  632         testq   $32,%r10
  633         jnz     .LChaCha20_8x
  634         cmpq    $192,%rdx
  635         ja      .Lproceed4x
  636 
  637         andq    $71303168,%r11
  638         cmpq    $4194304,%r11
  639         je      .Ldo_sse3_after_all
  640 
  641 .Lproceed4x:
  642         subq    $0x140+8,%rsp
  643         movdqa  .Lsigma(%rip),%xmm11
  644         movdqu  (%rcx),%xmm15
  645         movdqu  16(%rcx),%xmm7
  646         movdqu  (%r8),%xmm3
  647         leaq    256(%rsp),%rcx
  648         leaq    .Lrot16(%rip),%r10
  649         leaq    .Lrot24(%rip),%r11
  650 
  651         pshufd  $0x00,%xmm11,%xmm8
  652         pshufd  $0x55,%xmm11,%xmm9
  653         movdqa  %xmm8,64(%rsp)
  654         pshufd  $0xaa,%xmm11,%xmm10
  655         movdqa  %xmm9,80(%rsp)
  656         pshufd  $0xff,%xmm11,%xmm11
  657         movdqa  %xmm10,96(%rsp)
  658         movdqa  %xmm11,112(%rsp)
  659 
  660         pshufd  $0x00,%xmm15,%xmm12
  661         pshufd  $0x55,%xmm15,%xmm13
  662         movdqa  %xmm12,128-256(%rcx)
  663         pshufd  $0xaa,%xmm15,%xmm14
  664         movdqa  %xmm13,144-256(%rcx)
  665         pshufd  $0xff,%xmm15,%xmm15
  666         movdqa  %xmm14,160-256(%rcx)
  667         movdqa  %xmm15,176-256(%rcx)
  668 
  669         pshufd  $0x00,%xmm7,%xmm4
  670         pshufd  $0x55,%xmm7,%xmm5
  671         movdqa  %xmm4,192-256(%rcx)
  672         pshufd  $0xaa,%xmm7,%xmm6
  673         movdqa  %xmm5,208-256(%rcx)
  674         pshufd  $0xff,%xmm7,%xmm7
  675         movdqa  %xmm6,224-256(%rcx)
  676         movdqa  %xmm7,240-256(%rcx)
  677 
  678         pshufd  $0x00,%xmm3,%xmm0
  679         pshufd  $0x55,%xmm3,%xmm1
  680         paddd   .Linc(%rip),%xmm0
  681         pshufd  $0xaa,%xmm3,%xmm2
  682         movdqa  %xmm1,272-256(%rcx)
  683         pshufd  $0xff,%xmm3,%xmm3
  684         movdqa  %xmm2,288-256(%rcx)
  685         movdqa  %xmm3,304-256(%rcx)
  686 
  687         jmp     .Loop_enter4x
  688 
  689 .align  32
  690 .Loop_outer4x:
  691         movdqa  64(%rsp),%xmm8
  692         movdqa  80(%rsp),%xmm9
  693         movdqa  96(%rsp),%xmm10
  694         movdqa  112(%rsp),%xmm11
  695         movdqa  128-256(%rcx),%xmm12
  696         movdqa  144-256(%rcx),%xmm13
  697         movdqa  160-256(%rcx),%xmm14
  698         movdqa  176-256(%rcx),%xmm15
  699         movdqa  192-256(%rcx),%xmm4
  700         movdqa  208-256(%rcx),%xmm5
  701         movdqa  224-256(%rcx),%xmm6
  702         movdqa  240-256(%rcx),%xmm7
  703         movdqa  256-256(%rcx),%xmm0
  704         movdqa  272-256(%rcx),%xmm1
  705         movdqa  288-256(%rcx),%xmm2
  706         movdqa  304-256(%rcx),%xmm3
  707         paddd   .Lfour(%rip),%xmm0
  708 
  709 .Loop_enter4x:
  710         movdqa  %xmm6,32(%rsp)
  711         movdqa  %xmm7,48(%rsp)
  712         movdqa  (%r10),%xmm7
  713         movl    $10,%eax
  714         movdqa  %xmm0,256-256(%rcx)
  715         jmp     .Loop4x
  716 
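/*
 * The sixteen transposed state vectors plus the rotate masks do not all
 * fit in sixteen xmm registers, so the four "c"-row vectors cycle through
 * 0..48(%rsp) while %xmm6/%xmm7 double as scratch and as the pshufb masks
 * reloaded from (%r10)/(%r11).
 */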
  717 .align  32
  718 .Loop4x:
  719         paddd   %xmm12,%xmm8
  720         paddd   %xmm13,%xmm9
  721         pxor    %xmm8,%xmm0
  722         pxor    %xmm9,%xmm1
  723 .byte   102,15,56,0,199
  724 .byte   102,15,56,0,207
  725         paddd   %xmm0,%xmm4
  726         paddd   %xmm1,%xmm5
  727         pxor    %xmm4,%xmm12
  728         pxor    %xmm5,%xmm13
  729         movdqa  %xmm12,%xmm6
  730         pslld   $12,%xmm12
  731         psrld   $20,%xmm6
  732         movdqa  %xmm13,%xmm7
  733         pslld   $12,%xmm13
  734         por     %xmm6,%xmm12
  735         psrld   $20,%xmm7
  736         movdqa  (%r11),%xmm6
  737         por     %xmm7,%xmm13
  738         paddd   %xmm12,%xmm8
  739         paddd   %xmm13,%xmm9
  740         pxor    %xmm8,%xmm0
  741         pxor    %xmm9,%xmm1
  742 .byte   102,15,56,0,198
  743 .byte   102,15,56,0,206
  744         paddd   %xmm0,%xmm4
  745         paddd   %xmm1,%xmm5
  746         pxor    %xmm4,%xmm12
  747         pxor    %xmm5,%xmm13
  748         movdqa  %xmm12,%xmm7
  749         pslld   $7,%xmm12
  750         psrld   $25,%xmm7
  751         movdqa  %xmm13,%xmm6
  752         pslld   $7,%xmm13
  753         por     %xmm7,%xmm12
  754         psrld   $25,%xmm6
  755         movdqa  (%r10),%xmm7
  756         por     %xmm6,%xmm13
  757         movdqa  %xmm4,0(%rsp)
  758         movdqa  %xmm5,16(%rsp)
  759         movdqa  32(%rsp),%xmm4
  760         movdqa  48(%rsp),%xmm5
  761         paddd   %xmm14,%xmm10
  762         paddd   %xmm15,%xmm11
  763         pxor    %xmm10,%xmm2
  764         pxor    %xmm11,%xmm3
  765 .byte   102,15,56,0,215
  766 .byte   102,15,56,0,223
  767         paddd   %xmm2,%xmm4
  768         paddd   %xmm3,%xmm5
  769         pxor    %xmm4,%xmm14
  770         pxor    %xmm5,%xmm15
  771         movdqa  %xmm14,%xmm6
  772         pslld   $12,%xmm14
  773         psrld   $20,%xmm6
  774         movdqa  %xmm15,%xmm7
  775         pslld   $12,%xmm15
  776         por     %xmm6,%xmm14
  777         psrld   $20,%xmm7
  778         movdqa  (%r11),%xmm6
  779         por     %xmm7,%xmm15
  780         paddd   %xmm14,%xmm10
  781         paddd   %xmm15,%xmm11
  782         pxor    %xmm10,%xmm2
  783         pxor    %xmm11,%xmm3
  784 .byte   102,15,56,0,214
  785 .byte   102,15,56,0,222
  786         paddd   %xmm2,%xmm4
  787         paddd   %xmm3,%xmm5
  788         pxor    %xmm4,%xmm14
  789         pxor    %xmm5,%xmm15
  790         movdqa  %xmm14,%xmm7
  791         pslld   $7,%xmm14
  792         psrld   $25,%xmm7
  793         movdqa  %xmm15,%xmm6
  794         pslld   $7,%xmm15
  795         por     %xmm7,%xmm14
  796         psrld   $25,%xmm6
  797         movdqa  (%r10),%xmm7
  798         por     %xmm6,%xmm15
  799         paddd   %xmm13,%xmm8
  800         paddd   %xmm14,%xmm9
  801         pxor    %xmm8,%xmm3
  802         pxor    %xmm9,%xmm0
  803 .byte   102,15,56,0,223
  804 .byte   102,15,56,0,199
  805         paddd   %xmm3,%xmm4
  806         paddd   %xmm0,%xmm5
  807         pxor    %xmm4,%xmm13
  808         pxor    %xmm5,%xmm14
  809         movdqa  %xmm13,%xmm6
  810         pslld   $12,%xmm13
  811         psrld   $20,%xmm6
  812         movdqa  %xmm14,%xmm7
  813         pslld   $12,%xmm14
  814         por     %xmm6,%xmm13
  815         psrld   $20,%xmm7
  816         movdqa  (%r11),%xmm6
  817         por     %xmm7,%xmm14
  818         paddd   %xmm13,%xmm8
  819         paddd   %xmm14,%xmm9
  820         pxor    %xmm8,%xmm3
  821         pxor    %xmm9,%xmm0
  822 .byte   102,15,56,0,222
  823 .byte   102,15,56,0,198
  824         paddd   %xmm3,%xmm4
  825         paddd   %xmm0,%xmm5
  826         pxor    %xmm4,%xmm13
  827         pxor    %xmm5,%xmm14
  828         movdqa  %xmm13,%xmm7
  829         pslld   $7,%xmm13
  830         psrld   $25,%xmm7
  831         movdqa  %xmm14,%xmm6
  832         pslld   $7,%xmm14
  833         por     %xmm7,%xmm13
  834         psrld   $25,%xmm6
  835         movdqa  (%r10),%xmm7
  836         por     %xmm6,%xmm14
  837         movdqa  %xmm4,32(%rsp)
  838         movdqa  %xmm5,48(%rsp)
  839         movdqa  0(%rsp),%xmm4
  840         movdqa  16(%rsp),%xmm5
  841         paddd   %xmm15,%xmm10
  842         paddd   %xmm12,%xmm11
  843         pxor    %xmm10,%xmm1
  844         pxor    %xmm11,%xmm2
  845 .byte   102,15,56,0,207
  846 .byte   102,15,56,0,215
  847         paddd   %xmm1,%xmm4
  848         paddd   %xmm2,%xmm5
  849         pxor    %xmm4,%xmm15
  850         pxor    %xmm5,%xmm12
  851         movdqa  %xmm15,%xmm6
  852         pslld   $12,%xmm15
  853         psrld   $20,%xmm6
  854         movdqa  %xmm12,%xmm7
  855         pslld   $12,%xmm12
  856         por     %xmm6,%xmm15
  857         psrld   $20,%xmm7
  858         movdqa  (%r11),%xmm6
  859         por     %xmm7,%xmm12
  860         paddd   %xmm15,%xmm10
  861         paddd   %xmm12,%xmm11
  862         pxor    %xmm10,%xmm1
  863         pxor    %xmm11,%xmm2
  864 .byte   102,15,56,0,206
  865 .byte   102,15,56,0,214
  866         paddd   %xmm1,%xmm4
  867         paddd   %xmm2,%xmm5
  868         pxor    %xmm4,%xmm15
  869         pxor    %xmm5,%xmm12
  870         movdqa  %xmm15,%xmm7
  871         pslld   $7,%xmm15
  872         psrld   $25,%xmm7
  873         movdqa  %xmm12,%xmm6
  874         pslld   $7,%xmm12
  875         por     %xmm7,%xmm15
  876         psrld   $25,%xmm6
  877         movdqa  (%r10),%xmm7
  878         por     %xmm6,%xmm12
  879         decl    %eax
  880         jnz     .Loop4x
  881 
  882         paddd   64(%rsp),%xmm8
  883         paddd   80(%rsp),%xmm9
  884         paddd   96(%rsp),%xmm10
  885         paddd   112(%rsp),%xmm11
  886 
  887         movdqa  %xmm8,%xmm6
  888         punpckldq       %xmm9,%xmm8
  889         movdqa  %xmm10,%xmm7
  890         punpckldq       %xmm11,%xmm10
  891         punpckhdq       %xmm9,%xmm6
  892         punpckhdq       %xmm11,%xmm7
  893         movdqa  %xmm8,%xmm9
  894         punpcklqdq      %xmm10,%xmm8
  895         movdqa  %xmm6,%xmm11
  896         punpcklqdq      %xmm7,%xmm6
  897         punpckhqdq      %xmm10,%xmm9
  898         punpckhqdq      %xmm7,%xmm11
  899         paddd   128-256(%rcx),%xmm12
  900         paddd   144-256(%rcx),%xmm13
  901         paddd   160-256(%rcx),%xmm14
  902         paddd   176-256(%rcx),%xmm15
  903 
  904         movdqa  %xmm8,0(%rsp)
  905         movdqa  %xmm9,16(%rsp)
  906         movdqa  32(%rsp),%xmm8
  907         movdqa  48(%rsp),%xmm9
  908 
  909         movdqa  %xmm12,%xmm10
  910         punpckldq       %xmm13,%xmm12
  911         movdqa  %xmm14,%xmm7
  912         punpckldq       %xmm15,%xmm14
  913         punpckhdq       %xmm13,%xmm10
  914         punpckhdq       %xmm15,%xmm7
  915         movdqa  %xmm12,%xmm13
  916         punpcklqdq      %xmm14,%xmm12
  917         movdqa  %xmm10,%xmm15
  918         punpcklqdq      %xmm7,%xmm10
  919         punpckhqdq      %xmm14,%xmm13
  920         punpckhqdq      %xmm7,%xmm15
  921         paddd   192-256(%rcx),%xmm4
  922         paddd   208-256(%rcx),%xmm5
  923         paddd   224-256(%rcx),%xmm8
  924         paddd   240-256(%rcx),%xmm9
  925 
  926         movdqa  %xmm6,32(%rsp)
  927         movdqa  %xmm11,48(%rsp)
  928 
  929         movdqa  %xmm4,%xmm14
  930         punpckldq       %xmm5,%xmm4
  931         movdqa  %xmm8,%xmm7
  932         punpckldq       %xmm9,%xmm8
  933         punpckhdq       %xmm5,%xmm14
  934         punpckhdq       %xmm9,%xmm7
  935         movdqa  %xmm4,%xmm5
  936         punpcklqdq      %xmm8,%xmm4
  937         movdqa  %xmm14,%xmm9
  938         punpcklqdq      %xmm7,%xmm14
  939         punpckhqdq      %xmm8,%xmm5
  940         punpckhqdq      %xmm7,%xmm9
  941         paddd   256-256(%rcx),%xmm0
  942         paddd   272-256(%rcx),%xmm1
  943         paddd   288-256(%rcx),%xmm2
  944         paddd   304-256(%rcx),%xmm3
  945 
  946         movdqa  %xmm0,%xmm8
  947         punpckldq       %xmm1,%xmm0
  948         movdqa  %xmm2,%xmm7
  949         punpckldq       %xmm3,%xmm2
  950         punpckhdq       %xmm1,%xmm8
  951         punpckhdq       %xmm3,%xmm7
  952         movdqa  %xmm0,%xmm1
  953         punpcklqdq      %xmm2,%xmm0
  954         movdqa  %xmm8,%xmm3
  955         punpcklqdq      %xmm7,%xmm8
  956         punpckhqdq      %xmm2,%xmm1
  957         punpckhqdq      %xmm7,%xmm3
  958         cmpq    $256,%rdx
  959         jb      .Ltail4x
  960 
  961         movdqu  0(%rsi),%xmm6
  962         movdqu  16(%rsi),%xmm11
  963         movdqu  32(%rsi),%xmm2
  964         movdqu  48(%rsi),%xmm7
  965         pxor    0(%rsp),%xmm6
  966         pxor    %xmm12,%xmm11
  967         pxor    %xmm4,%xmm2
  968         pxor    %xmm0,%xmm7
  969 
  970         movdqu  %xmm6,0(%rdi)
  971         movdqu  64(%rsi),%xmm6
  972         movdqu  %xmm11,16(%rdi)
  973         movdqu  80(%rsi),%xmm11
  974         movdqu  %xmm2,32(%rdi)
  975         movdqu  96(%rsi),%xmm2
  976         movdqu  %xmm7,48(%rdi)
  977         movdqu  112(%rsi),%xmm7
  978         leaq    128(%rsi),%rsi
  979         pxor    16(%rsp),%xmm6
  980         pxor    %xmm13,%xmm11
  981         pxor    %xmm5,%xmm2
  982         pxor    %xmm1,%xmm7
  983 
  984         movdqu  %xmm6,64(%rdi)
  985         movdqu  0(%rsi),%xmm6
  986         movdqu  %xmm11,80(%rdi)
  987         movdqu  16(%rsi),%xmm11
  988         movdqu  %xmm2,96(%rdi)
  989         movdqu  32(%rsi),%xmm2
  990         movdqu  %xmm7,112(%rdi)
  991         leaq    128(%rdi),%rdi
  992         movdqu  48(%rsi),%xmm7
  993         pxor    32(%rsp),%xmm6
  994         pxor    %xmm10,%xmm11
  995         pxor    %xmm14,%xmm2
  996         pxor    %xmm8,%xmm7
  997 
  998         movdqu  %xmm6,0(%rdi)
  999         movdqu  64(%rsi),%xmm6
 1000         movdqu  %xmm11,16(%rdi)
 1001         movdqu  80(%rsi),%xmm11
 1002         movdqu  %xmm2,32(%rdi)
 1003         movdqu  96(%rsi),%xmm2
 1004         movdqu  %xmm7,48(%rdi)
 1005         movdqu  112(%rsi),%xmm7
 1006         leaq    128(%rsi),%rsi
 1007         pxor    48(%rsp),%xmm6
 1008         pxor    %xmm15,%xmm11
 1009         pxor    %xmm9,%xmm2
 1010         pxor    %xmm3,%xmm7
 1011         movdqu  %xmm6,64(%rdi)
 1012         movdqu  %xmm11,80(%rdi)
 1013         movdqu  %xmm2,96(%rdi)
 1014         movdqu  %xmm7,112(%rdi)
 1015         leaq    128(%rdi),%rdi
 1016 
 1017         subq    $256,%rdx
 1018         jnz     .Loop_outer4x
 1019 
 1020         jmp     .Ldone4x
 1021 
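/*
 * Fewer than 256 bytes remain: store/XOR whole 64-byte blocks for the
 * 192/128/64-byte cases, then finish the last partial block one byte at a
 * time in .Loop_tail4x.
 */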
 1022 .Ltail4x:
 1023         cmpq    $192,%rdx
 1024         jae     .L192_or_more4x
 1025         cmpq    $128,%rdx
 1026         jae     .L128_or_more4x
 1027         cmpq    $64,%rdx
 1028         jae     .L64_or_more4x
 1029 
 1030 
 1031         xorq    %r10,%r10
 1032 
 1033         movdqa  %xmm12,16(%rsp)
 1034         movdqa  %xmm4,32(%rsp)
 1035         movdqa  %xmm0,48(%rsp)
 1036         jmp     .Loop_tail4x
 1037 
 1038 .align  32
 1039 .L64_or_more4x:
 1040         movdqu  0(%rsi),%xmm6
 1041         movdqu  16(%rsi),%xmm11
 1042         movdqu  32(%rsi),%xmm2
 1043         movdqu  48(%rsi),%xmm7
 1044         pxor    0(%rsp),%xmm6
 1045         pxor    %xmm12,%xmm11
 1046         pxor    %xmm4,%xmm2
 1047         pxor    %xmm0,%xmm7
 1048         movdqu  %xmm6,0(%rdi)
 1049         movdqu  %xmm11,16(%rdi)
 1050         movdqu  %xmm2,32(%rdi)
 1051         movdqu  %xmm7,48(%rdi)
 1052         je      .Ldone4x
 1053 
 1054         movdqa  16(%rsp),%xmm6
 1055         leaq    64(%rsi),%rsi
 1056         xorq    %r10,%r10
 1057         movdqa  %xmm6,0(%rsp)
 1058         movdqa  %xmm13,16(%rsp)
 1059         leaq    64(%rdi),%rdi
 1060         movdqa  %xmm5,32(%rsp)
 1061         subq    $64,%rdx
 1062         movdqa  %xmm1,48(%rsp)
 1063         jmp     .Loop_tail4x
 1064 
 1065 .align  32
 1066 .L128_or_more4x:
 1067         movdqu  0(%rsi),%xmm6
 1068         movdqu  16(%rsi),%xmm11
 1069         movdqu  32(%rsi),%xmm2
 1070         movdqu  48(%rsi),%xmm7
 1071         pxor    0(%rsp),%xmm6
 1072         pxor    %xmm12,%xmm11
 1073         pxor    %xmm4,%xmm2
 1074         pxor    %xmm0,%xmm7
 1075 
 1076         movdqu  %xmm6,0(%rdi)
 1077         movdqu  64(%rsi),%xmm6
 1078         movdqu  %xmm11,16(%rdi)
 1079         movdqu  80(%rsi),%xmm11
 1080         movdqu  %xmm2,32(%rdi)
 1081         movdqu  96(%rsi),%xmm2
 1082         movdqu  %xmm7,48(%rdi)
 1083         movdqu  112(%rsi),%xmm7
 1084         pxor    16(%rsp),%xmm6
 1085         pxor    %xmm13,%xmm11
 1086         pxor    %xmm5,%xmm2
 1087         pxor    %xmm1,%xmm7
 1088         movdqu  %xmm6,64(%rdi)
 1089         movdqu  %xmm11,80(%rdi)
 1090         movdqu  %xmm2,96(%rdi)
 1091         movdqu  %xmm7,112(%rdi)
 1092         je      .Ldone4x
 1093 
 1094         movdqa  32(%rsp),%xmm6
 1095         leaq    128(%rsi),%rsi
 1096         xorq    %r10,%r10
 1097         movdqa  %xmm6,0(%rsp)
 1098         movdqa  %xmm10,16(%rsp)
 1099         leaq    128(%rdi),%rdi
 1100         movdqa  %xmm14,32(%rsp)
 1101         subq    $128,%rdx
 1102         movdqa  %xmm8,48(%rsp)
 1103         jmp     .Loop_tail4x
 1104 
 1105 .align  32
 1106 .L192_or_more4x:
 1107         movdqu  0(%rsi),%xmm6
 1108         movdqu  16(%rsi),%xmm11
 1109         movdqu  32(%rsi),%xmm2
 1110         movdqu  48(%rsi),%xmm7
 1111         pxor    0(%rsp),%xmm6
 1112         pxor    %xmm12,%xmm11
 1113         pxor    %xmm4,%xmm2
 1114         pxor    %xmm0,%xmm7
 1115 
 1116         movdqu  %xmm6,0(%rdi)
 1117         movdqu  64(%rsi),%xmm6
 1118         movdqu  %xmm11,16(%rdi)
 1119         movdqu  80(%rsi),%xmm11
 1120         movdqu  %xmm2,32(%rdi)
 1121         movdqu  96(%rsi),%xmm2
 1122         movdqu  %xmm7,48(%rdi)
 1123         movdqu  112(%rsi),%xmm7
 1124         leaq    128(%rsi),%rsi
 1125         pxor    16(%rsp),%xmm6
 1126         pxor    %xmm13,%xmm11
 1127         pxor    %xmm5,%xmm2
 1128         pxor    %xmm1,%xmm7
 1129 
 1130         movdqu  %xmm6,64(%rdi)
 1131         movdqu  0(%rsi),%xmm6
 1132         movdqu  %xmm11,80(%rdi)
 1133         movdqu  16(%rsi),%xmm11
 1134         movdqu  %xmm2,96(%rdi)
 1135         movdqu  32(%rsi),%xmm2
 1136         movdqu  %xmm7,112(%rdi)
 1137         leaq    128(%rdi),%rdi
 1138         movdqu  48(%rsi),%xmm7
 1139         pxor    32(%rsp),%xmm6
 1140         pxor    %xmm10,%xmm11
 1141         pxor    %xmm14,%xmm2
 1142         pxor    %xmm8,%xmm7
 1143         movdqu  %xmm6,0(%rdi)
 1144         movdqu  %xmm11,16(%rdi)
 1145         movdqu  %xmm2,32(%rdi)
 1146         movdqu  %xmm7,48(%rdi)
 1147         je      .Ldone4x
 1148 
 1149         movdqa  48(%rsp),%xmm6
 1150         leaq    64(%rsi),%rsi
 1151         xorq    %r10,%r10
 1152         movdqa  %xmm6,0(%rsp)
 1153         movdqa  %xmm15,16(%rsp)
 1154         leaq    64(%rdi),%rdi
 1155         movdqa  %xmm9,32(%rsp)
 1156         subq    $192,%rdx
 1157         movdqa  %xmm3,48(%rsp)
 1158 
 1159 .Loop_tail4x:
 1160         movzbl  (%rsi,%r10,1),%eax
 1161         movzbl  (%rsp,%r10,1),%ecx
 1162         leaq    1(%r10),%r10
 1163         xorl    %ecx,%eax
 1164         movb    %al,-1(%rdi,%r10,1)
 1165         decq    %rdx
 1166         jnz     .Loop_tail4x
 1167 
 1168 .Ldone4x:
 1169         leaq    (%r9),%rsp
 1170 .cfi_def_cfa_register   %rsp
 1171 .L4x_epilogue:
 1172         .byte   0xf3,0xc3
 1173 .cfi_endproc    
 1174 .size   ChaCha20_4x,.-ChaCha20_4x
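/*
 * ChaCha20_4xop: the four-block path for AMD CPUs with XOP.  The
 * .byte 143,232,120,194,... sequences encode vprotd (XOP rotate) with
 * immediate counts 16/12/8/7, so every rotation is a single instruction
 * and no shuffle masks or shift-and-or pairs are needed.
 */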
 1175 .type   ChaCha20_4xop,@function
 1176 .align  32
 1177 ChaCha20_4xop:
 1178 .cfi_startproc  
 1179 .LChaCha20_4xop:
 1180         movq    %rsp,%r9
 1181 .cfi_def_cfa_register   %r9
 1182         subq    $0x140+8,%rsp
 1183         vzeroupper
 1184 
 1185         vmovdqa .Lsigma(%rip),%xmm11
 1186         vmovdqu (%rcx),%xmm3
 1187         vmovdqu 16(%rcx),%xmm15
 1188         vmovdqu (%r8),%xmm7
 1189         leaq    256(%rsp),%rcx
 1190 
 1191         vpshufd $0x00,%xmm11,%xmm8
 1192         vpshufd $0x55,%xmm11,%xmm9
 1193         vmovdqa %xmm8,64(%rsp)
 1194         vpshufd $0xaa,%xmm11,%xmm10
 1195         vmovdqa %xmm9,80(%rsp)
 1196         vpshufd $0xff,%xmm11,%xmm11
 1197         vmovdqa %xmm10,96(%rsp)
 1198         vmovdqa %xmm11,112(%rsp)
 1199 
 1200         vpshufd $0x00,%xmm3,%xmm0
 1201         vpshufd $0x55,%xmm3,%xmm1
 1202         vmovdqa %xmm0,128-256(%rcx)
 1203         vpshufd $0xaa,%xmm3,%xmm2
 1204         vmovdqa %xmm1,144-256(%rcx)
 1205         vpshufd $0xff,%xmm3,%xmm3
 1206         vmovdqa %xmm2,160-256(%rcx)
 1207         vmovdqa %xmm3,176-256(%rcx)
 1208 
 1209         vpshufd $0x00,%xmm15,%xmm12
 1210         vpshufd $0x55,%xmm15,%xmm13
 1211         vmovdqa %xmm12,192-256(%rcx)
 1212         vpshufd $0xaa,%xmm15,%xmm14
 1213         vmovdqa %xmm13,208-256(%rcx)
 1214         vpshufd $0xff,%xmm15,%xmm15
 1215         vmovdqa %xmm14,224-256(%rcx)
 1216         vmovdqa %xmm15,240-256(%rcx)
 1217 
 1218         vpshufd $0x00,%xmm7,%xmm4
 1219         vpshufd $0x55,%xmm7,%xmm5
 1220         vpaddd  .Linc(%rip),%xmm4,%xmm4
 1221         vpshufd $0xaa,%xmm7,%xmm6
 1222         vmovdqa %xmm5,272-256(%rcx)
 1223         vpshufd $0xff,%xmm7,%xmm7
 1224         vmovdqa %xmm6,288-256(%rcx)
 1225         vmovdqa %xmm7,304-256(%rcx)
 1226 
 1227         jmp     .Loop_enter4xop
 1228 
 1229 .align  32
 1230 .Loop_outer4xop:
 1231         vmovdqa 64(%rsp),%xmm8
 1232         vmovdqa 80(%rsp),%xmm9
 1233         vmovdqa 96(%rsp),%xmm10
 1234         vmovdqa 112(%rsp),%xmm11
 1235         vmovdqa 128-256(%rcx),%xmm0
 1236         vmovdqa 144-256(%rcx),%xmm1
 1237         vmovdqa 160-256(%rcx),%xmm2
 1238         vmovdqa 176-256(%rcx),%xmm3
 1239         vmovdqa 192-256(%rcx),%xmm12
 1240         vmovdqa 208-256(%rcx),%xmm13
 1241         vmovdqa 224-256(%rcx),%xmm14
 1242         vmovdqa 240-256(%rcx),%xmm15
 1243         vmovdqa 256-256(%rcx),%xmm4
 1244         vmovdqa 272-256(%rcx),%xmm5
 1245         vmovdqa 288-256(%rcx),%xmm6
 1246         vmovdqa 304-256(%rcx),%xmm7
 1247         vpaddd  .Lfour(%rip),%xmm4,%xmm4
 1248 
 1249 .Loop_enter4xop:
 1250         movl    $10,%eax
 1251         vmovdqa %xmm4,256-256(%rcx)
 1252         jmp     .Loop4xop
 1253 
 1254 .align  32
 1255 .Loop4xop:
 1256         vpaddd  %xmm0,%xmm8,%xmm8
 1257         vpaddd  %xmm1,%xmm9,%xmm9
 1258         vpaddd  %xmm2,%xmm10,%xmm10
 1259         vpaddd  %xmm3,%xmm11,%xmm11
 1260         vpxor   %xmm4,%xmm8,%xmm4
 1261         vpxor   %xmm5,%xmm9,%xmm5
 1262         vpxor   %xmm6,%xmm10,%xmm6
 1263         vpxor   %xmm7,%xmm11,%xmm7
 1264 .byte   143,232,120,194,228,16
 1265 .byte   143,232,120,194,237,16
 1266 .byte   143,232,120,194,246,16
 1267 .byte   143,232,120,194,255,16
 1268         vpaddd  %xmm4,%xmm12,%xmm12
 1269         vpaddd  %xmm5,%xmm13,%xmm13
 1270         vpaddd  %xmm6,%xmm14,%xmm14
 1271         vpaddd  %xmm7,%xmm15,%xmm15
 1272         vpxor   %xmm0,%xmm12,%xmm0
 1273         vpxor   %xmm1,%xmm13,%xmm1
 1274         vpxor   %xmm14,%xmm2,%xmm2
 1275         vpxor   %xmm15,%xmm3,%xmm3
 1276 .byte   143,232,120,194,192,12
 1277 .byte   143,232,120,194,201,12
 1278 .byte   143,232,120,194,210,12
 1279 .byte   143,232,120,194,219,12
 1280         vpaddd  %xmm8,%xmm0,%xmm8
 1281         vpaddd  %xmm9,%xmm1,%xmm9
 1282         vpaddd  %xmm2,%xmm10,%xmm10
 1283         vpaddd  %xmm3,%xmm11,%xmm11
 1284         vpxor   %xmm4,%xmm8,%xmm4
 1285         vpxor   %xmm5,%xmm9,%xmm5
 1286         vpxor   %xmm6,%xmm10,%xmm6
 1287         vpxor   %xmm7,%xmm11,%xmm7
 1288 .byte   143,232,120,194,228,8
 1289 .byte   143,232,120,194,237,8
 1290 .byte   143,232,120,194,246,8
 1291 .byte   143,232,120,194,255,8
 1292         vpaddd  %xmm4,%xmm12,%xmm12
 1293         vpaddd  %xmm5,%xmm13,%xmm13
 1294         vpaddd  %xmm6,%xmm14,%xmm14
 1295         vpaddd  %xmm7,%xmm15,%xmm15
 1296         vpxor   %xmm0,%xmm12,%xmm0
 1297         vpxor   %xmm1,%xmm13,%xmm1
 1298         vpxor   %xmm14,%xmm2,%xmm2
 1299         vpxor   %xmm15,%xmm3,%xmm3
 1300 .byte   143,232,120,194,192,7
 1301 .byte   143,232,120,194,201,7
 1302 .byte   143,232,120,194,210,7
 1303 .byte   143,232,120,194,219,7
 1304         vpaddd  %xmm1,%xmm8,%xmm8
 1305         vpaddd  %xmm2,%xmm9,%xmm9
 1306         vpaddd  %xmm3,%xmm10,%xmm10
 1307         vpaddd  %xmm0,%xmm11,%xmm11
 1308         vpxor   %xmm7,%xmm8,%xmm7
 1309         vpxor   %xmm4,%xmm9,%xmm4
 1310         vpxor   %xmm5,%xmm10,%xmm5
 1311         vpxor   %xmm6,%xmm11,%xmm6
 1312 .byte   143,232,120,194,255,16
 1313 .byte   143,232,120,194,228,16
 1314 .byte   143,232,120,194,237,16
 1315 .byte   143,232,120,194,246,16
 1316         vpaddd  %xmm7,%xmm14,%xmm14
 1317         vpaddd  %xmm4,%xmm15,%xmm15
 1318         vpaddd  %xmm5,%xmm12,%xmm12
 1319         vpaddd  %xmm6,%xmm13,%xmm13
 1320         vpxor   %xmm1,%xmm14,%xmm1
 1321         vpxor   %xmm2,%xmm15,%xmm2
 1322         vpxor   %xmm12,%xmm3,%xmm3
 1323         vpxor   %xmm13,%xmm0,%xmm0
 1324 .byte   143,232,120,194,201,12
 1325 .byte   143,232,120,194,210,12
 1326 .byte   143,232,120,194,219,12
 1327 .byte   143,232,120,194,192,12
 1328         vpaddd  %xmm8,%xmm1,%xmm8
 1329         vpaddd  %xmm9,%xmm2,%xmm9
 1330         vpaddd  %xmm3,%xmm10,%xmm10
 1331         vpaddd  %xmm0,%xmm11,%xmm11
 1332         vpxor   %xmm7,%xmm8,%xmm7
 1333         vpxor   %xmm4,%xmm9,%xmm4
 1334         vpxor   %xmm5,%xmm10,%xmm5
 1335         vpxor   %xmm6,%xmm11,%xmm6
 1336 .byte   143,232,120,194,255,8
 1337 .byte   143,232,120,194,228,8
 1338 .byte   143,232,120,194,237,8
 1339 .byte   143,232,120,194,246,8
 1340         vpaddd  %xmm7,%xmm14,%xmm14
 1341         vpaddd  %xmm4,%xmm15,%xmm15
 1342         vpaddd  %xmm5,%xmm12,%xmm12
 1343         vpaddd  %xmm6,%xmm13,%xmm13
 1344         vpxor   %xmm1,%xmm14,%xmm1
 1345         vpxor   %xmm2,%xmm15,%xmm2
 1346         vpxor   %xmm12,%xmm3,%xmm3
 1347         vpxor   %xmm13,%xmm0,%xmm0
 1348 .byte   143,232,120,194,201,7
 1349 .byte   143,232,120,194,210,7
 1350 .byte   143,232,120,194,219,7
 1351 .byte   143,232,120,194,192,7
 1352         decl    %eax
 1353         jnz     .Loop4xop
 1354 
 1355         vpaddd  64(%rsp),%xmm8,%xmm8
 1356         vpaddd  80(%rsp),%xmm9,%xmm9
 1357         vpaddd  96(%rsp),%xmm10,%xmm10
 1358         vpaddd  112(%rsp),%xmm11,%xmm11
 1359 
 1360         vmovdqa %xmm14,32(%rsp)
 1361         vmovdqa %xmm15,48(%rsp)
 1362 
 1363         vpunpckldq      %xmm9,%xmm8,%xmm14
 1364         vpunpckldq      %xmm11,%xmm10,%xmm15
 1365         vpunpckhdq      %xmm9,%xmm8,%xmm8
 1366         vpunpckhdq      %xmm11,%xmm10,%xmm10
 1367         vpunpcklqdq     %xmm15,%xmm14,%xmm9
 1368         vpunpckhqdq     %xmm15,%xmm14,%xmm14
 1369         vpunpcklqdq     %xmm10,%xmm8,%xmm11
 1370         vpunpckhqdq     %xmm10,%xmm8,%xmm8
 1371         vpaddd  128-256(%rcx),%xmm0,%xmm0
 1372         vpaddd  144-256(%rcx),%xmm1,%xmm1
 1373         vpaddd  160-256(%rcx),%xmm2,%xmm2
 1374         vpaddd  176-256(%rcx),%xmm3,%xmm3
 1375 
 1376         vmovdqa %xmm9,0(%rsp)
 1377         vmovdqa %xmm14,16(%rsp)
 1378         vmovdqa 32(%rsp),%xmm9
 1379         vmovdqa 48(%rsp),%xmm14
 1380 
 1381         vpunpckldq      %xmm1,%xmm0,%xmm10
 1382         vpunpckldq      %xmm3,%xmm2,%xmm15
 1383         vpunpckhdq      %xmm1,%xmm0,%xmm0
 1384         vpunpckhdq      %xmm3,%xmm2,%xmm2
 1385         vpunpcklqdq     %xmm15,%xmm10,%xmm1
 1386         vpunpckhqdq     %xmm15,%xmm10,%xmm10
 1387         vpunpcklqdq     %xmm2,%xmm0,%xmm3
 1388         vpunpckhqdq     %xmm2,%xmm0,%xmm0
 1389         vpaddd  192-256(%rcx),%xmm12,%xmm12
 1390         vpaddd  208-256(%rcx),%xmm13,%xmm13
 1391         vpaddd  224-256(%rcx),%xmm9,%xmm9
 1392         vpaddd  240-256(%rcx),%xmm14,%xmm14
 1393 
 1394         vpunpckldq      %xmm13,%xmm12,%xmm2
 1395         vpunpckldq      %xmm14,%xmm9,%xmm15
 1396         vpunpckhdq      %xmm13,%xmm12,%xmm12
 1397         vpunpckhdq      %xmm14,%xmm9,%xmm9
 1398         vpunpcklqdq     %xmm15,%xmm2,%xmm13
 1399         vpunpckhqdq     %xmm15,%xmm2,%xmm2
 1400         vpunpcklqdq     %xmm9,%xmm12,%xmm14
 1401         vpunpckhqdq     %xmm9,%xmm12,%xmm12
 1402         vpaddd  256-256(%rcx),%xmm4,%xmm4
 1403         vpaddd  272-256(%rcx),%xmm5,%xmm5
 1404         vpaddd  288-256(%rcx),%xmm6,%xmm6
 1405         vpaddd  304-256(%rcx),%xmm7,%xmm7
 1406 
 1407         vpunpckldq      %xmm5,%xmm4,%xmm9
 1408         vpunpckldq      %xmm7,%xmm6,%xmm15
 1409         vpunpckhdq      %xmm5,%xmm4,%xmm4
 1410         vpunpckhdq      %xmm7,%xmm6,%xmm6
 1411         vpunpcklqdq     %xmm15,%xmm9,%xmm5
 1412         vpunpckhqdq     %xmm15,%xmm9,%xmm9
 1413         vpunpcklqdq     %xmm6,%xmm4,%xmm7
 1414         vpunpckhqdq     %xmm6,%xmm4,%xmm4
 1415         vmovdqa 0(%rsp),%xmm6
 1416         vmovdqa 16(%rsp),%xmm15
 1417 
 1418         cmpq    $256,%rdx
 1419         jb      .Ltail4xop
 1420 
 1421         vpxor   0(%rsi),%xmm6,%xmm6
 1422         vpxor   16(%rsi),%xmm1,%xmm1
 1423         vpxor   32(%rsi),%xmm13,%xmm13
 1424         vpxor   48(%rsi),%xmm5,%xmm5
 1425         vpxor   64(%rsi),%xmm15,%xmm15
 1426         vpxor   80(%rsi),%xmm10,%xmm10
 1427         vpxor   96(%rsi),%xmm2,%xmm2
 1428         vpxor   112(%rsi),%xmm9,%xmm9
 1429         leaq    128(%rsi),%rsi
 1430         vpxor   0(%rsi),%xmm11,%xmm11
 1431         vpxor   16(%rsi),%xmm3,%xmm3
 1432         vpxor   32(%rsi),%xmm14,%xmm14
 1433         vpxor   48(%rsi),%xmm7,%xmm7
 1434         vpxor   64(%rsi),%xmm8,%xmm8
 1435         vpxor   80(%rsi),%xmm0,%xmm0
 1436         vpxor   96(%rsi),%xmm12,%xmm12
 1437         vpxor   112(%rsi),%xmm4,%xmm4
 1438         leaq    128(%rsi),%rsi
 1439 
 1440         vmovdqu %xmm6,0(%rdi)
 1441         vmovdqu %xmm1,16(%rdi)
 1442         vmovdqu %xmm13,32(%rdi)
 1443         vmovdqu %xmm5,48(%rdi)
 1444         vmovdqu %xmm15,64(%rdi)
 1445         vmovdqu %xmm10,80(%rdi)
 1446         vmovdqu %xmm2,96(%rdi)
 1447         vmovdqu %xmm9,112(%rdi)
 1448         leaq    128(%rdi),%rdi
 1449         vmovdqu %xmm11,0(%rdi)
 1450         vmovdqu %xmm3,16(%rdi)
 1451         vmovdqu %xmm14,32(%rdi)
 1452         vmovdqu %xmm7,48(%rdi)
 1453         vmovdqu %xmm8,64(%rdi)
 1454         vmovdqu %xmm0,80(%rdi)
 1455         vmovdqu %xmm12,96(%rdi)
 1456         vmovdqu %xmm4,112(%rdi)
 1457         leaq    128(%rdi),%rdi
 1458 
 1459         subq    $256,%rdx
 1460         jnz     .Loop_outer4xop
 1461 
 1462         jmp     .Ldone4xop
 1463 
 1464 .align  32
 1465 .Ltail4xop:
 1466         cmpq    $192,%rdx
 1467         jae     .L192_or_more4xop
 1468         cmpq    $128,%rdx
 1469         jae     .L128_or_more4xop
 1470         cmpq    $64,%rdx
 1471         jae     .L64_or_more4xop
 1472 
 1473         xorq    %r10,%r10
 1474         vmovdqa %xmm6,0(%rsp)
 1475         vmovdqa %xmm1,16(%rsp)
 1476         vmovdqa %xmm13,32(%rsp)
 1477         vmovdqa %xmm5,48(%rsp)
 1478         jmp     .Loop_tail4xop
 1479 
 1480 .align  32
 1481 .L64_or_more4xop:
 1482         vpxor   0(%rsi),%xmm6,%xmm6
 1483         vpxor   16(%rsi),%xmm1,%xmm1
 1484         vpxor   32(%rsi),%xmm13,%xmm13
 1485         vpxor   48(%rsi),%xmm5,%xmm5
 1486         vmovdqu %xmm6,0(%rdi)
 1487         vmovdqu %xmm1,16(%rdi)
 1488         vmovdqu %xmm13,32(%rdi)
 1489         vmovdqu %xmm5,48(%rdi)
 1490         je      .Ldone4xop
 1491 
 1492         leaq    64(%rsi),%rsi
 1493         vmovdqa %xmm15,0(%rsp)
 1494         xorq    %r10,%r10
 1495         vmovdqa %xmm10,16(%rsp)
 1496         leaq    64(%rdi),%rdi
 1497         vmovdqa %xmm2,32(%rsp)
 1498         subq    $64,%rdx
 1499         vmovdqa %xmm9,48(%rsp)
 1500         jmp     .Loop_tail4xop
 1501 
 1502 .align  32
 1503 .L128_or_more4xop:
 1504         vpxor   0(%rsi),%xmm6,%xmm6
 1505         vpxor   16(%rsi),%xmm1,%xmm1
 1506         vpxor   32(%rsi),%xmm13,%xmm13
 1507         vpxor   48(%rsi),%xmm5,%xmm5
 1508         vpxor   64(%rsi),%xmm15,%xmm15
 1509         vpxor   80(%rsi),%xmm10,%xmm10
 1510         vpxor   96(%rsi),%xmm2,%xmm2
 1511         vpxor   112(%rsi),%xmm9,%xmm9
 1512 
 1513         vmovdqu %xmm6,0(%rdi)
 1514         vmovdqu %xmm1,16(%rdi)
 1515         vmovdqu %xmm13,32(%rdi)
 1516         vmovdqu %xmm5,48(%rdi)
 1517         vmovdqu %xmm15,64(%rdi)
 1518         vmovdqu %xmm10,80(%rdi)
 1519         vmovdqu %xmm2,96(%rdi)
 1520         vmovdqu %xmm9,112(%rdi)
 1521         je      .Ldone4xop
 1522 
 1523         leaq    128(%rsi),%rsi
 1524         vmovdqa %xmm11,0(%rsp)
 1525         xorq    %r10,%r10
 1526         vmovdqa %xmm3,16(%rsp)
 1527         leaq    128(%rdi),%rdi
 1528         vmovdqa %xmm14,32(%rsp)
 1529         subq    $128,%rdx
 1530         vmovdqa %xmm7,48(%rsp)
 1531         jmp     .Loop_tail4xop
 1532 
 1533 .align  32
 1534 .L192_or_more4xop:
 1535         vpxor   0(%rsi),%xmm6,%xmm6
 1536         vpxor   16(%rsi),%xmm1,%xmm1
 1537         vpxor   32(%rsi),%xmm13,%xmm13
 1538         vpxor   48(%rsi),%xmm5,%xmm5
 1539         vpxor   64(%rsi),%xmm15,%xmm15
 1540         vpxor   80(%rsi),%xmm10,%xmm10
 1541         vpxor   96(%rsi),%xmm2,%xmm2
 1542         vpxor   112(%rsi),%xmm9,%xmm9
 1543         leaq    128(%rsi),%rsi
 1544         vpxor   0(%rsi),%xmm11,%xmm11
 1545         vpxor   16(%rsi),%xmm3,%xmm3
 1546         vpxor   32(%rsi),%xmm14,%xmm14
 1547         vpxor   48(%rsi),%xmm7,%xmm7
 1548 
 1549         vmovdqu %xmm6,0(%rdi)
 1550         vmovdqu %xmm1,16(%rdi)
 1551         vmovdqu %xmm13,32(%rdi)
 1552         vmovdqu %xmm5,48(%rdi)
 1553         vmovdqu %xmm15,64(%rdi)
 1554         vmovdqu %xmm10,80(%rdi)
 1555         vmovdqu %xmm2,96(%rdi)
 1556         vmovdqu %xmm9,112(%rdi)
 1557         leaq    128(%rdi),%rdi
 1558         vmovdqu %xmm11,0(%rdi)
 1559         vmovdqu %xmm3,16(%rdi)
 1560         vmovdqu %xmm14,32(%rdi)
 1561         vmovdqu %xmm7,48(%rdi)
 1562         je      .Ldone4xop
 1563 
 1564         leaq    64(%rsi),%rsi
 1565         vmovdqa %xmm8,0(%rsp)
 1566         xorq    %r10,%r10
 1567         vmovdqa %xmm0,16(%rsp)
 1568         leaq    64(%rdi),%rdi
 1569         vmovdqa %xmm12,32(%rsp)
 1570         subq    $192,%rdx
 1571         vmovdqa %xmm4,48(%rsp)
 1572 
 1573 .Loop_tail4xop:
 1574         movzbl  (%rsi,%r10,1),%eax
 1575         movzbl  (%rsp,%r10,1),%ecx
 1576         leaq    1(%r10),%r10
 1577         xorl    %ecx,%eax
 1578         movb    %al,-1(%rdi,%r10,1)
 1579         decq    %rdx
 1580         jnz     .Loop_tail4xop
 1581 
 1582 .Ldone4xop:
 1583         vzeroupper
 1584         leaq    (%r9),%rsp
 1585 .cfi_def_cfa_register   %rsp
 1586 .L4xop_epilogue:
 1587         .byte   0xf3,0xc3
 1588 .cfi_endproc    
 1589 .size   ChaCha20_4xop,.-ChaCha20_4xop
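/*
 * ChaCha20_8x: AVX2 path.  Each 256-bit ymm register holds one state word
 * across eight blocks (counters from .Lincy, advanced by .Leight), i.e.
 * eight 64-byte blocks per outer iteration; the rot16/rot24 masks are
 * broadcast with vbroadcasti128 and applied with vpshufb.
 */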
 1590 .type   ChaCha20_8x,@function
 1591 .align  32
 1592 ChaCha20_8x:
 1593 .cfi_startproc  
 1594 .LChaCha20_8x:
 1595         movq    %rsp,%r9
 1596 .cfi_def_cfa_register   %r9
 1597         subq    $0x280+8,%rsp
 1598         andq    $-32,%rsp
 1599         vzeroupper
 1600 
 1601 
 1602 
 1603 
 1604 
 1605 
 1606 
 1607 
 1608 
 1609 
 1610         vbroadcasti128  .Lsigma(%rip),%ymm11
 1611         vbroadcasti128  (%rcx),%ymm3
 1612         vbroadcasti128  16(%rcx),%ymm15
 1613         vbroadcasti128  (%r8),%ymm7
 1614         leaq    256(%rsp),%rcx
 1615         leaq    512(%rsp),%rax
 1616         leaq    .Lrot16(%rip),%r10
 1617         leaq    .Lrot24(%rip),%r11
 1618 
 1619         vpshufd $0x00,%ymm11,%ymm8
 1620         vpshufd $0x55,%ymm11,%ymm9
 1621         vmovdqa %ymm8,128-256(%rcx)
 1622         vpshufd $0xaa,%ymm11,%ymm10
 1623         vmovdqa %ymm9,160-256(%rcx)
 1624         vpshufd $0xff,%ymm11,%ymm11
 1625         vmovdqa %ymm10,192-256(%rcx)
 1626         vmovdqa %ymm11,224-256(%rcx)
 1627 
 1628         vpshufd $0x00,%ymm3,%ymm0
 1629         vpshufd $0x55,%ymm3,%ymm1
 1630         vmovdqa %ymm0,256-256(%rcx)
 1631         vpshufd $0xaa,%ymm3,%ymm2
 1632         vmovdqa %ymm1,288-256(%rcx)
 1633         vpshufd $0xff,%ymm3,%ymm3
 1634         vmovdqa %ymm2,320-256(%rcx)
 1635         vmovdqa %ymm3,352-256(%rcx)
 1636 
 1637         vpshufd $0x00,%ymm15,%ymm12
 1638         vpshufd $0x55,%ymm15,%ymm13
 1639         vmovdqa %ymm12,384-512(%rax)
 1640         vpshufd $0xaa,%ymm15,%ymm14
 1641         vmovdqa %ymm13,416-512(%rax)
 1642         vpshufd $0xff,%ymm15,%ymm15
 1643         vmovdqa %ymm14,448-512(%rax)
 1644         vmovdqa %ymm15,480-512(%rax)
 1645 
 1646         vpshufd $0x00,%ymm7,%ymm4
 1647         vpshufd $0x55,%ymm7,%ymm5
 1648         vpaddd  .Lincy(%rip),%ymm4,%ymm4
 1649         vpshufd $0xaa,%ymm7,%ymm6
 1650         vmovdqa %ymm5,544-512(%rax)
 1651         vpshufd $0xff,%ymm7,%ymm7
 1652         vmovdqa %ymm6,576-512(%rax)
 1653         vmovdqa %ymm7,608-512(%rax)
 1654 
 1655         jmp     .Loop_enter8x
 1656 
 1657 .align  32
 1658 .Loop_outer8x:
 1659         vmovdqa 128-256(%rcx),%ymm8
 1660         vmovdqa 160-256(%rcx),%ymm9
 1661         vmovdqa 192-256(%rcx),%ymm10
 1662         vmovdqa 224-256(%rcx),%ymm11
 1663         vmovdqa 256-256(%rcx),%ymm0
 1664         vmovdqa 288-256(%rcx),%ymm1
 1665         vmovdqa 320-256(%rcx),%ymm2
 1666         vmovdqa 352-256(%rcx),%ymm3
 1667         vmovdqa 384-512(%rax),%ymm12
 1668         vmovdqa 416-512(%rax),%ymm13
 1669         vmovdqa 448-512(%rax),%ymm14
 1670         vmovdqa 480-512(%rax),%ymm15
 1671         vmovdqa 512-512(%rax),%ymm4
 1672         vmovdqa 544-512(%rax),%ymm5
 1673         vmovdqa 576-512(%rax),%ymm6
 1674         vmovdqa 608-512(%rax),%ymm7
 1675         vpaddd  .Leight(%rip),%ymm4,%ymm4
 1676 
 1677 .Loop_enter8x:
 1678         vmovdqa %ymm14,64(%rsp)
 1679         vmovdqa %ymm15,96(%rsp)
 1680         vbroadcasti128  (%r10),%ymm15
 1681         vmovdqa %ymm4,512-512(%rax)
 1682         movl    $10,%eax
 1683         jmp     .Loop8x
 1684 
 1685 .align  32
 1686 .Loop8x:
 1687         vpaddd  %ymm0,%ymm8,%ymm8
 1688         vpxor   %ymm4,%ymm8,%ymm4
 1689         vpshufb %ymm15,%ymm4,%ymm4
 1690         vpaddd  %ymm1,%ymm9,%ymm9
 1691         vpxor   %ymm5,%ymm9,%ymm5
 1692         vpshufb %ymm15,%ymm5,%ymm5
 1693         vpaddd  %ymm4,%ymm12,%ymm12
 1694         vpxor   %ymm0,%ymm12,%ymm0
 1695         vpslld  $12,%ymm0,%ymm14
 1696         vpsrld  $20,%ymm0,%ymm0
 1697         vpor    %ymm0,%ymm14,%ymm0
 1698         vbroadcasti128  (%r11),%ymm14
 1699         vpaddd  %ymm5,%ymm13,%ymm13
 1700         vpxor   %ymm1,%ymm13,%ymm1
 1701         vpslld  $12,%ymm1,%ymm15
 1702         vpsrld  $20,%ymm1,%ymm1
 1703         vpor    %ymm1,%ymm15,%ymm1
 1704         vpaddd  %ymm0,%ymm8,%ymm8
 1705         vpxor   %ymm4,%ymm8,%ymm4
 1706         vpshufb %ymm14,%ymm4,%ymm4
 1707         vpaddd  %ymm1,%ymm9,%ymm9
 1708         vpxor   %ymm5,%ymm9,%ymm5
 1709         vpshufb %ymm14,%ymm5,%ymm5
 1710         vpaddd  %ymm4,%ymm12,%ymm12
 1711         vpxor   %ymm0,%ymm12,%ymm0
 1712         vpslld  $7,%ymm0,%ymm15
 1713         vpsrld  $25,%ymm0,%ymm0
 1714         vpor    %ymm0,%ymm15,%ymm0
 1715         vbroadcasti128  (%r10),%ymm15
 1716         vpaddd  %ymm5,%ymm13,%ymm13
 1717         vpxor   %ymm1,%ymm13,%ymm1
 1718         vpslld  $7,%ymm1,%ymm14
 1719         vpsrld  $25,%ymm1,%ymm1
 1720         vpor    %ymm1,%ymm14,%ymm1
 1721         vmovdqa %ymm12,0(%rsp)
 1722         vmovdqa %ymm13,32(%rsp)
 1723         vmovdqa 64(%rsp),%ymm12
 1724         vmovdqa 96(%rsp),%ymm13
 1725         vpaddd  %ymm2,%ymm10,%ymm10
 1726         vpxor   %ymm6,%ymm10,%ymm6
 1727         vpshufb %ymm15,%ymm6,%ymm6
 1728         vpaddd  %ymm3,%ymm11,%ymm11
 1729         vpxor   %ymm7,%ymm11,%ymm7
 1730         vpshufb %ymm15,%ymm7,%ymm7
 1731         vpaddd  %ymm6,%ymm12,%ymm12
 1732         vpxor   %ymm2,%ymm12,%ymm2
 1733         vpslld  $12,%ymm2,%ymm14
 1734         vpsrld  $20,%ymm2,%ymm2
 1735         vpor    %ymm2,%ymm14,%ymm2
 1736         vbroadcasti128  (%r11),%ymm14
 1737         vpaddd  %ymm7,%ymm13,%ymm13
 1738         vpxor   %ymm3,%ymm13,%ymm3
 1739         vpslld  $12,%ymm3,%ymm15
 1740         vpsrld  $20,%ymm3,%ymm3
 1741         vpor    %ymm3,%ymm15,%ymm3
 1742         vpaddd  %ymm2,%ymm10,%ymm10
 1743         vpxor   %ymm6,%ymm10,%ymm6
 1744         vpshufb %ymm14,%ymm6,%ymm6
 1745         vpaddd  %ymm3,%ymm11,%ymm11
 1746         vpxor   %ymm7,%ymm11,%ymm7
 1747         vpshufb %ymm14,%ymm7,%ymm7
 1748         vpaddd  %ymm6,%ymm12,%ymm12
 1749         vpxor   %ymm2,%ymm12,%ymm2
 1750         vpslld  $7,%ymm2,%ymm15
 1751         vpsrld  $25,%ymm2,%ymm2
 1752         vpor    %ymm2,%ymm15,%ymm2
 1753         vbroadcasti128  (%r10),%ymm15
 1754         vpaddd  %ymm7,%ymm13,%ymm13
 1755         vpxor   %ymm3,%ymm13,%ymm3
 1756         vpslld  $7,%ymm3,%ymm14
 1757         vpsrld  $25,%ymm3,%ymm3
 1758         vpor    %ymm3,%ymm14,%ymm3
 1759         vpaddd  %ymm1,%ymm8,%ymm8
 1760         vpxor   %ymm7,%ymm8,%ymm7
 1761         vpshufb %ymm15,%ymm7,%ymm7
 1762         vpaddd  %ymm2,%ymm9,%ymm9
 1763         vpxor   %ymm4,%ymm9,%ymm4
 1764         vpshufb %ymm15,%ymm4,%ymm4
 1765         vpaddd  %ymm7,%ymm12,%ymm12
 1766         vpxor   %ymm1,%ymm12,%ymm1
 1767         vpslld  $12,%ymm1,%ymm14
 1768         vpsrld  $20,%ymm1,%ymm1
 1769         vpor    %ymm1,%ymm14,%ymm1
 1770         vbroadcasti128  (%r11),%ymm14
 1771         vpaddd  %ymm4,%ymm13,%ymm13
 1772         vpxor   %ymm2,%ymm13,%ymm2
 1773         vpslld  $12,%ymm2,%ymm15
 1774         vpsrld  $20,%ymm2,%ymm2
 1775         vpor    %ymm2,%ymm15,%ymm2
 1776         vpaddd  %ymm1,%ymm8,%ymm8
 1777         vpxor   %ymm7,%ymm8,%ymm7
 1778         vpshufb %ymm14,%ymm7,%ymm7
 1779         vpaddd  %ymm2,%ymm9,%ymm9
 1780         vpxor   %ymm4,%ymm9,%ymm4
 1781         vpshufb %ymm14,%ymm4,%ymm4
 1782         vpaddd  %ymm7,%ymm12,%ymm12
 1783         vpxor   %ymm1,%ymm12,%ymm1
 1784         vpslld  $7,%ymm1,%ymm15
 1785         vpsrld  $25,%ymm1,%ymm1
 1786         vpor    %ymm1,%ymm15,%ymm1
 1787         vbroadcasti128  (%r10),%ymm15
 1788         vpaddd  %ymm4,%ymm13,%ymm13
 1789         vpxor   %ymm2,%ymm13,%ymm2
 1790         vpslld  $7,%ymm2,%ymm14
 1791         vpsrld  $25,%ymm2,%ymm2
 1792         vpor    %ymm2,%ymm14,%ymm2
 1793         vmovdqa %ymm12,64(%rsp)
 1794         vmovdqa %ymm13,96(%rsp)
 1795         vmovdqa 0(%rsp),%ymm12
 1796         vmovdqa 32(%rsp),%ymm13
 1797         vpaddd  %ymm3,%ymm10,%ymm10
 1798         vpxor   %ymm5,%ymm10,%ymm5
 1799         vpshufb %ymm15,%ymm5,%ymm5
 1800         vpaddd  %ymm0,%ymm11,%ymm11
 1801         vpxor   %ymm6,%ymm11,%ymm6
 1802         vpshufb %ymm15,%ymm6,%ymm6
 1803         vpaddd  %ymm5,%ymm12,%ymm12
 1804         vpxor   %ymm3,%ymm12,%ymm3
 1805         vpslld  $12,%ymm3,%ymm14
 1806         vpsrld  $20,%ymm3,%ymm3
 1807         vpor    %ymm3,%ymm14,%ymm3
 1808         vbroadcasti128  (%r11),%ymm14
 1809         vpaddd  %ymm6,%ymm13,%ymm13
 1810         vpxor   %ymm0,%ymm13,%ymm0
 1811         vpslld  $12,%ymm0,%ymm15
 1812         vpsrld  $20,%ymm0,%ymm0
 1813         vpor    %ymm0,%ymm15,%ymm0
 1814         vpaddd  %ymm3,%ymm10,%ymm10
 1815         vpxor   %ymm5,%ymm10,%ymm5
 1816         vpshufb %ymm14,%ymm5,%ymm5
 1817         vpaddd  %ymm0,%ymm11,%ymm11
 1818         vpxor   %ymm6,%ymm11,%ymm6
 1819         vpshufb %ymm14,%ymm6,%ymm6
 1820         vpaddd  %ymm5,%ymm12,%ymm12
 1821         vpxor   %ymm3,%ymm12,%ymm3
 1822         vpslld  $7,%ymm3,%ymm15
 1823         vpsrld  $25,%ymm3,%ymm3
 1824         vpor    %ymm3,%ymm15,%ymm3
 1825         vbroadcasti128  (%r10),%ymm15
 1826         vpaddd  %ymm6,%ymm13,%ymm13
 1827         vpxor   %ymm0,%ymm13,%ymm0
 1828         vpslld  $7,%ymm0,%ymm14
 1829         vpsrld  $25,%ymm0,%ymm0
 1830         vpor    %ymm0,%ymm14,%ymm0
 1831         decl    %eax
 1832         jnz     .Loop8x
 1833 
 1834         leaq    512(%rsp),%rax
 1835         vpaddd  128-256(%rcx),%ymm8,%ymm8
 1836         vpaddd  160-256(%rcx),%ymm9,%ymm9
 1837         vpaddd  192-256(%rcx),%ymm10,%ymm10
 1838         vpaddd  224-256(%rcx),%ymm11,%ymm11
 1839 
 1840         vpunpckldq      %ymm9,%ymm8,%ymm14
 1841         vpunpckldq      %ymm11,%ymm10,%ymm15
 1842         vpunpckhdq      %ymm9,%ymm8,%ymm8
 1843         vpunpckhdq      %ymm11,%ymm10,%ymm10
 1844         vpunpcklqdq     %ymm15,%ymm14,%ymm9
 1845         vpunpckhqdq     %ymm15,%ymm14,%ymm14
 1846         vpunpcklqdq     %ymm10,%ymm8,%ymm11
 1847         vpunpckhqdq     %ymm10,%ymm8,%ymm8
 1848         vpaddd  256-256(%rcx),%ymm0,%ymm0
 1849         vpaddd  288-256(%rcx),%ymm1,%ymm1
 1850         vpaddd  320-256(%rcx),%ymm2,%ymm2
 1851         vpaddd  352-256(%rcx),%ymm3,%ymm3
 1852 
 1853         vpunpckldq      %ymm1,%ymm0,%ymm10
 1854         vpunpckldq      %ymm3,%ymm2,%ymm15
 1855         vpunpckhdq      %ymm1,%ymm0,%ymm0
 1856         vpunpckhdq      %ymm3,%ymm2,%ymm2
 1857         vpunpcklqdq     %ymm15,%ymm10,%ymm1
 1858         vpunpckhqdq     %ymm15,%ymm10,%ymm10
 1859         vpunpcklqdq     %ymm2,%ymm0,%ymm3
 1860         vpunpckhqdq     %ymm2,%ymm0,%ymm0
 1861         vperm2i128      $0x20,%ymm1,%ymm9,%ymm15
 1862         vperm2i128      $0x31,%ymm1,%ymm9,%ymm1
 1863         vperm2i128      $0x20,%ymm10,%ymm14,%ymm9
 1864         vperm2i128      $0x31,%ymm10,%ymm14,%ymm10
 1865         vperm2i128      $0x20,%ymm3,%ymm11,%ymm14
 1866         vperm2i128      $0x31,%ymm3,%ymm11,%ymm3
 1867         vperm2i128      $0x20,%ymm0,%ymm8,%ymm11
 1868         vperm2i128      $0x31,%ymm0,%ymm8,%ymm0
 1869         vmovdqa %ymm15,0(%rsp)
 1870         vmovdqa %ymm9,32(%rsp)
 1871         vmovdqa 64(%rsp),%ymm15
 1872         vmovdqa 96(%rsp),%ymm9
 1873 
 1874         vpaddd  384-512(%rax),%ymm12,%ymm12
 1875         vpaddd  416-512(%rax),%ymm13,%ymm13
 1876         vpaddd  448-512(%rax),%ymm15,%ymm15
 1877         vpaddd  480-512(%rax),%ymm9,%ymm9
 1878 
 1879         vpunpckldq      %ymm13,%ymm12,%ymm2
 1880         vpunpckldq      %ymm9,%ymm15,%ymm8
 1881         vpunpckhdq      %ymm13,%ymm12,%ymm12
 1882         vpunpckhdq      %ymm9,%ymm15,%ymm15
 1883         vpunpcklqdq     %ymm8,%ymm2,%ymm13
 1884         vpunpckhqdq     %ymm8,%ymm2,%ymm2
 1885         vpunpcklqdq     %ymm15,%ymm12,%ymm9
 1886         vpunpckhqdq     %ymm15,%ymm12,%ymm12
 1887         vpaddd  512-512(%rax),%ymm4,%ymm4
 1888         vpaddd  544-512(%rax),%ymm5,%ymm5
 1889         vpaddd  576-512(%rax),%ymm6,%ymm6
 1890         vpaddd  608-512(%rax),%ymm7,%ymm7
 1891 
 1892         vpunpckldq      %ymm5,%ymm4,%ymm15
 1893         vpunpckldq      %ymm7,%ymm6,%ymm8
 1894         vpunpckhdq      %ymm5,%ymm4,%ymm4
 1895         vpunpckhdq      %ymm7,%ymm6,%ymm6
 1896         vpunpcklqdq     %ymm8,%ymm15,%ymm5
 1897         vpunpckhqdq     %ymm8,%ymm15,%ymm15
 1898         vpunpcklqdq     %ymm6,%ymm4,%ymm7
 1899         vpunpckhqdq     %ymm6,%ymm4,%ymm4
 1900         vperm2i128      $0x20,%ymm5,%ymm13,%ymm8
 1901         vperm2i128      $0x31,%ymm5,%ymm13,%ymm5
 1902         vperm2i128      $0x20,%ymm15,%ymm2,%ymm13
 1903         vperm2i128      $0x31,%ymm15,%ymm2,%ymm15
 1904         vperm2i128      $0x20,%ymm7,%ymm9,%ymm2
 1905         vperm2i128      $0x31,%ymm7,%ymm9,%ymm7
 1906         vperm2i128      $0x20,%ymm4,%ymm12,%ymm9
 1907         vperm2i128      $0x31,%ymm4,%ymm12,%ymm4
 1908         vmovdqa 0(%rsp),%ymm6
 1909         vmovdqa 32(%rsp),%ymm12
 1910 
 1911         cmpq    $512,%rdx
 1912         jb      .Ltail8x
 1913 
 1914         vpxor   0(%rsi),%ymm6,%ymm6
 1915         vpxor   32(%rsi),%ymm8,%ymm8
 1916         vpxor   64(%rsi),%ymm1,%ymm1
 1917         vpxor   96(%rsi),%ymm5,%ymm5
 1918         leaq    128(%rsi),%rsi
 1919         vmovdqu %ymm6,0(%rdi)
 1920         vmovdqu %ymm8,32(%rdi)
 1921         vmovdqu %ymm1,64(%rdi)
 1922         vmovdqu %ymm5,96(%rdi)
 1923         leaq    128(%rdi),%rdi
 1924 
 1925         vpxor   0(%rsi),%ymm12,%ymm12
 1926         vpxor   32(%rsi),%ymm13,%ymm13
 1927         vpxor   64(%rsi),%ymm10,%ymm10
 1928         vpxor   96(%rsi),%ymm15,%ymm15
 1929         leaq    128(%rsi),%rsi
 1930         vmovdqu %ymm12,0(%rdi)
 1931         vmovdqu %ymm13,32(%rdi)
 1932         vmovdqu %ymm10,64(%rdi)
 1933         vmovdqu %ymm15,96(%rdi)
 1934         leaq    128(%rdi),%rdi
 1935 
 1936         vpxor   0(%rsi),%ymm14,%ymm14
 1937         vpxor   32(%rsi),%ymm2,%ymm2
 1938         vpxor   64(%rsi),%ymm3,%ymm3
 1939         vpxor   96(%rsi),%ymm7,%ymm7
 1940         leaq    128(%rsi),%rsi
 1941         vmovdqu %ymm14,0(%rdi)
 1942         vmovdqu %ymm2,32(%rdi)
 1943         vmovdqu %ymm3,64(%rdi)
 1944         vmovdqu %ymm7,96(%rdi)
 1945         leaq    128(%rdi),%rdi
 1946 
 1947         vpxor   0(%rsi),%ymm11,%ymm11
 1948         vpxor   32(%rsi),%ymm9,%ymm9
 1949         vpxor   64(%rsi),%ymm0,%ymm0
 1950         vpxor   96(%rsi),%ymm4,%ymm4
 1951         leaq    128(%rsi),%rsi
 1952         vmovdqu %ymm11,0(%rdi)
 1953         vmovdqu %ymm9,32(%rdi)
 1954         vmovdqu %ymm0,64(%rdi)
 1955         vmovdqu %ymm4,96(%rdi)
 1956         leaq    128(%rdi),%rdi
 1957 
 1958         subq    $512,%rdx
 1959         jnz     .Loop_outer8x
 1960 
 1961         jmp     .Ldone8x
 1962 
 1963 .Ltail8x:
 1964         cmpq    $448,%rdx
 1965         jae     .L448_or_more8x
 1966         cmpq    $384,%rdx
 1967         jae     .L384_or_more8x
 1968         cmpq    $320,%rdx
 1969         jae     .L320_or_more8x
 1970         cmpq    $256,%rdx
 1971         jae     .L256_or_more8x
 1972         cmpq    $192,%rdx
 1973         jae     .L192_or_more8x
 1974         cmpq    $128,%rdx
 1975         jae     .L128_or_more8x
 1976         cmpq    $64,%rdx
 1977         jae     .L64_or_more8x
 1978 
 1979         xorq    %r10,%r10
 1980         vmovdqa %ymm6,0(%rsp)
 1981         vmovdqa %ymm8,32(%rsp)
 1982         jmp     .Loop_tail8x
 1983 
 1984 .align  32
 1985 .L64_or_more8x:
 1986         vpxor   0(%rsi),%ymm6,%ymm6
 1987         vpxor   32(%rsi),%ymm8,%ymm8
 1988         vmovdqu %ymm6,0(%rdi)
 1989         vmovdqu %ymm8,32(%rdi)
 1990         je      .Ldone8x
 1991 
 1992         leaq    64(%rsi),%rsi
 1993         xorq    %r10,%r10
 1994         vmovdqa %ymm1,0(%rsp)
 1995         leaq    64(%rdi),%rdi
 1996         subq    $64,%rdx
 1997         vmovdqa %ymm5,32(%rsp)
 1998         jmp     .Loop_tail8x
 1999 
 2000 .align  32
 2001 .L128_or_more8x:
 2002         vpxor   0(%rsi),%ymm6,%ymm6
 2003         vpxor   32(%rsi),%ymm8,%ymm8
 2004         vpxor   64(%rsi),%ymm1,%ymm1
 2005         vpxor   96(%rsi),%ymm5,%ymm5
 2006         vmovdqu %ymm6,0(%rdi)
 2007         vmovdqu %ymm8,32(%rdi)
 2008         vmovdqu %ymm1,64(%rdi)
 2009         vmovdqu %ymm5,96(%rdi)
 2010         je      .Ldone8x
 2011 
 2012         leaq    128(%rsi),%rsi
 2013         xorq    %r10,%r10
 2014         vmovdqa %ymm12,0(%rsp)
 2015         leaq    128(%rdi),%rdi
 2016         subq    $128,%rdx
 2017         vmovdqa %ymm13,32(%rsp)
 2018         jmp     .Loop_tail8x
 2019 
 2020 .align  32
 2021 .L192_or_more8x:
 2022         vpxor   0(%rsi),%ymm6,%ymm6
 2023         vpxor   32(%rsi),%ymm8,%ymm8
 2024         vpxor   64(%rsi),%ymm1,%ymm1
 2025         vpxor   96(%rsi),%ymm5,%ymm5
 2026         vpxor   128(%rsi),%ymm12,%ymm12
 2027         vpxor   160(%rsi),%ymm13,%ymm13
 2028         vmovdqu %ymm6,0(%rdi)
 2029         vmovdqu %ymm8,32(%rdi)
 2030         vmovdqu %ymm1,64(%rdi)
 2031         vmovdqu %ymm5,96(%rdi)
 2032         vmovdqu %ymm12,128(%rdi)
 2033         vmovdqu %ymm13,160(%rdi)
 2034         je      .Ldone8x
 2035 
 2036         leaq    192(%rsi),%rsi
 2037         xorq    %r10,%r10
 2038         vmovdqa %ymm10,0(%rsp)
 2039         leaq    192(%rdi),%rdi
 2040         subq    $192,%rdx
 2041         vmovdqa %ymm15,32(%rsp)
 2042         jmp     .Loop_tail8x
 2043 
 2044 .align  32
 2045 .L256_or_more8x:
 2046         vpxor   0(%rsi),%ymm6,%ymm6
 2047         vpxor   32(%rsi),%ymm8,%ymm8
 2048         vpxor   64(%rsi),%ymm1,%ymm1
 2049         vpxor   96(%rsi),%ymm5,%ymm5
 2050         vpxor   128(%rsi),%ymm12,%ymm12
 2051         vpxor   160(%rsi),%ymm13,%ymm13
 2052         vpxor   192(%rsi),%ymm10,%ymm10
 2053         vpxor   224(%rsi),%ymm15,%ymm15
 2054         vmovdqu %ymm6,0(%rdi)
 2055         vmovdqu %ymm8,32(%rdi)
 2056         vmovdqu %ymm1,64(%rdi)
 2057         vmovdqu %ymm5,96(%rdi)
 2058         vmovdqu %ymm12,128(%rdi)
 2059         vmovdqu %ymm13,160(%rdi)
 2060         vmovdqu %ymm10,192(%rdi)
 2061         vmovdqu %ymm15,224(%rdi)
 2062         je      .Ldone8x
 2063 
 2064         leaq    256(%rsi),%rsi
 2065         xorq    %r10,%r10
 2066         vmovdqa %ymm14,0(%rsp)
 2067         leaq    256(%rdi),%rdi
 2068         subq    $256,%rdx
 2069         vmovdqa %ymm2,32(%rsp)
 2070         jmp     .Loop_tail8x
 2071 
 2072 .align  32
 2073 .L320_or_more8x:
 2074         vpxor   0(%rsi),%ymm6,%ymm6
 2075         vpxor   32(%rsi),%ymm8,%ymm8
 2076         vpxor   64(%rsi),%ymm1,%ymm1
 2077         vpxor   96(%rsi),%ymm5,%ymm5
 2078         vpxor   128(%rsi),%ymm12,%ymm12
 2079         vpxor   160(%rsi),%ymm13,%ymm13
 2080         vpxor   192(%rsi),%ymm10,%ymm10
 2081         vpxor   224(%rsi),%ymm15,%ymm15
 2082         vpxor   256(%rsi),%ymm14,%ymm14
 2083         vpxor   288(%rsi),%ymm2,%ymm2
 2084         vmovdqu %ymm6,0(%rdi)
 2085         vmovdqu %ymm8,32(%rdi)
 2086         vmovdqu %ymm1,64(%rdi)
 2087         vmovdqu %ymm5,96(%rdi)
 2088         vmovdqu %ymm12,128(%rdi)
 2089         vmovdqu %ymm13,160(%rdi)
 2090         vmovdqu %ymm10,192(%rdi)
 2091         vmovdqu %ymm15,224(%rdi)
 2092         vmovdqu %ymm14,256(%rdi)
 2093         vmovdqu %ymm2,288(%rdi)
 2094         je      .Ldone8x
 2095 
 2096         leaq    320(%rsi),%rsi
 2097         xorq    %r10,%r10
 2098         vmovdqa %ymm3,0(%rsp)
 2099         leaq    320(%rdi),%rdi
 2100         subq    $320,%rdx
 2101         vmovdqa %ymm7,32(%rsp)
 2102         jmp     .Loop_tail8x
 2103 
 2104 .align  32
 2105 .L384_or_more8x:
 2106         vpxor   0(%rsi),%ymm6,%ymm6
 2107         vpxor   32(%rsi),%ymm8,%ymm8
 2108         vpxor   64(%rsi),%ymm1,%ymm1
 2109         vpxor   96(%rsi),%ymm5,%ymm5
 2110         vpxor   128(%rsi),%ymm12,%ymm12
 2111         vpxor   160(%rsi),%ymm13,%ymm13
 2112         vpxor   192(%rsi),%ymm10,%ymm10
 2113         vpxor   224(%rsi),%ymm15,%ymm15
 2114         vpxor   256(%rsi),%ymm14,%ymm14
 2115         vpxor   288(%rsi),%ymm2,%ymm2
 2116         vpxor   320(%rsi),%ymm3,%ymm3
 2117         vpxor   352(%rsi),%ymm7,%ymm7
 2118         vmovdqu %ymm6,0(%rdi)
 2119         vmovdqu %ymm8,32(%rdi)
 2120         vmovdqu %ymm1,64(%rdi)
 2121         vmovdqu %ymm5,96(%rdi)
 2122         vmovdqu %ymm12,128(%rdi)
 2123         vmovdqu %ymm13,160(%rdi)
 2124         vmovdqu %ymm10,192(%rdi)
 2125         vmovdqu %ymm15,224(%rdi)
 2126         vmovdqu %ymm14,256(%rdi)
 2127         vmovdqu %ymm2,288(%rdi)
 2128         vmovdqu %ymm3,320(%rdi)
 2129         vmovdqu %ymm7,352(%rdi)
 2130         je      .Ldone8x
 2131 
 2132         leaq    384(%rsi),%rsi
 2133         xorq    %r10,%r10
 2134         vmovdqa %ymm11,0(%rsp)
 2135         leaq    384(%rdi),%rdi
 2136         subq    $384,%rdx
 2137         vmovdqa %ymm9,32(%rsp)
 2138         jmp     .Loop_tail8x
 2139 
 2140 .align  32
 2141 .L448_or_more8x:
 2142         vpxor   0(%rsi),%ymm6,%ymm6
 2143         vpxor   32(%rsi),%ymm8,%ymm8
 2144         vpxor   64(%rsi),%ymm1,%ymm1
 2145         vpxor   96(%rsi),%ymm5,%ymm5
 2146         vpxor   128(%rsi),%ymm12,%ymm12
 2147         vpxor   160(%rsi),%ymm13,%ymm13
 2148         vpxor   192(%rsi),%ymm10,%ymm10
 2149         vpxor   224(%rsi),%ymm15,%ymm15
 2150         vpxor   256(%rsi),%ymm14,%ymm14
 2151         vpxor   288(%rsi),%ymm2,%ymm2
 2152         vpxor   320(%rsi),%ymm3,%ymm3
 2153         vpxor   352(%rsi),%ymm7,%ymm7
 2154         vpxor   384(%rsi),%ymm11,%ymm11
 2155         vpxor   416(%rsi),%ymm9,%ymm9
 2156         vmovdqu %ymm6,0(%rdi)
 2157         vmovdqu %ymm8,32(%rdi)
 2158         vmovdqu %ymm1,64(%rdi)
 2159         vmovdqu %ymm5,96(%rdi)
 2160         vmovdqu %ymm12,128(%rdi)
 2161         vmovdqu %ymm13,160(%rdi)
 2162         vmovdqu %ymm10,192(%rdi)
 2163         vmovdqu %ymm15,224(%rdi)
 2164         vmovdqu %ymm14,256(%rdi)
 2165         vmovdqu %ymm2,288(%rdi)
 2166         vmovdqu %ymm3,320(%rdi)
 2167         vmovdqu %ymm7,352(%rdi)
 2168         vmovdqu %ymm11,384(%rdi)
 2169         vmovdqu %ymm9,416(%rdi)
 2170         je      .Ldone8x
 2171 
 2172         leaq    448(%rsi),%rsi
 2173         xorq    %r10,%r10
 2174         vmovdqa %ymm0,0(%rsp)
 2175         leaq    448(%rdi),%rdi
 2176         subq    $448,%rdx
 2177         vmovdqa %ymm4,32(%rsp)
 2178 
 2179 .Loop_tail8x:
 2180         movzbl  (%rsi,%r10,1),%eax
 2181         movzbl  (%rsp,%r10,1),%ecx
 2182         leaq    1(%r10),%r10
 2183         xorl    %ecx,%eax
 2184         movb    %al,-1(%rdi,%r10,1)
 2185         decq    %rdx
 2186         jnz     .Loop_tail8x
 2187 
 2188 .Ldone8x:
 2189         vzeroall
 2190         leaq    (%r9),%rsp
 2191 .cfi_def_cfa_register   %rsp
 2192 .L8x_epilogue:
 2193         .byte   0xf3,0xc3
 2194 .cfi_endproc    
 2195 .size   ChaCha20_8x,.-ChaCha20_8x
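For lengths that are not a multiple of 64 bytes, the .LNNN_or_more8x branches above write out the full 64-byte chunks, spill the next block of keystream from ymm registers to the stack, and fall into .Loop_tail8x, which XORs the final (fewer than 64) bytes one at a time. A rough C equivalent of that byte loop (the function name is ours, not OpenSSL's):

#include <stddef.h>
#include <stdint.h>

/* Byte-at-a-time tail, as in .Loop_tail8x: XOR the last len (< 64)
 * input bytes with keystream previously spilled to a stack buffer.
 * Illustrative only; the generated code does this inline. */
static void chacha_tail_xor(uint8_t *out, const uint8_t *in,
                            const uint8_t *keystream, size_t len)
{
        size_t i;

        for (i = 0; i < len; i++)
                out[i] = in[i] ^ keystream[i];
}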
