FreeBSD/Linux Kernel Cross Reference
sys/crypto/openssl/amd64/x86_64-mont5.S


    1 /* $FreeBSD$ */
    2 /* Do not modify. This file is auto-generated from x86_64-mont5.pl. */
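      /*
       * Montgomery multiplication with "gather5" table lookups, generated
       * from OpenSSL's crypto/bn/asm/x86_64-mont5.pl.  The multiplier is
       * fetched from a table of 2^5 = 32 pre-computed entries using
       * branch-free SSE2 masking, so the secret 5-bit window value never
       * selects a data-dependent address (a cache-timing countermeasure).
       *
       * Approximate C prototypes, as declared in OpenSSL's bn_exp.c:
       *
       *   void bn_mul_mont_gather5(BN_ULONG *rp, const BN_ULONG *ap,
       *                            const void *table, const BN_ULONG *np,
       *                            const BN_ULONG *n0, int num, int power);
       *   void bn_power5(BN_ULONG *rp, const BN_ULONG *ap,
       *                  const void *table, const BN_ULONG *np,
       *                  const BN_ULONG *n0, int num, int power);
       */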
    3 .text   
    4 
    5 
    6 
    7 .globl  bn_mul_mont_gather5
    8 .type   bn_mul_mont_gather5,@function
    9 .align  64
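      /*
       * SysV AMD64 arguments: %rdi = rp, %rsi = ap, %rdx = table (bp),
       * %rcx = np, %r8 = &n0, %r9d = num (limbs), 8(%rsp) = power.
       * If num is not a multiple of 8, the generic one-limb loop below is
       * used; otherwise control transfers to the 4x code, which may in
       * turn pick the MULX/ADX path based on OPENSSL_ia32cap_P.
       */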
   10 bn_mul_mont_gather5:
   11 .cfi_startproc  
   12         movl    %r9d,%r9d
   13         movq    %rsp,%rax
   14 .cfi_def_cfa_register   %rax
   15         testl   $7,%r9d
   16         jnz     .Lmul_enter
   17         movl    OPENSSL_ia32cap_P+8(%rip),%r11d
   18         jmp     .Lmul4x_enter
   19 
   20 .align  16
   21 .Lmul_enter:
   22         movd    8(%rsp),%xmm5
   23         pushq   %rbx
   24 .cfi_offset     %rbx,-16
   25         pushq   %rbp
   26 .cfi_offset     %rbp,-24
   27         pushq   %r12
   28 .cfi_offset     %r12,-32
   29         pushq   %r13
   30 .cfi_offset     %r13,-40
   31         pushq   %r14
   32 .cfi_offset     %r14,-48
   33         pushq   %r15
   34 .cfi_offset     %r15,-56
   35 
   36         negq    %r9
   37         movq    %rsp,%r11
   38         leaq    -280(%rsp,%r9,8),%r10
   39         negq    %r9
   40         andq    $-1024,%r10
   41 
   42 
   43 
   44 
   45 
   46 
   47 
   48 
   49 
   50         subq    %r10,%r11
   51         andq    $-4096,%r11
   52         leaq    (%r10,%r11,1),%rsp
   53         movq    (%rsp),%r11
   54         cmpq    %r10,%rsp
   55         ja      .Lmul_page_walk
   56         jmp     .Lmul_page_walk_done
   57 
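      /*
       * Touch the freshly reserved stack one 4 KiB page at a time so the
       * guard page is hit in order; a single large %rsp drop could skip it.
       */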
   58 .Lmul_page_walk:
   59         leaq    -4096(%rsp),%rsp
   60         movq    (%rsp),%r11
   61         cmpq    %r10,%rsp
   62         ja      .Lmul_page_walk
   63 .Lmul_page_walk_done:
   64 
   65         leaq    .Linc(%rip),%r10
   66         movq    %rax,8(%rsp,%r9,8)
   67 .cfi_escape     0x0f,0x0a,0x77,0x08,0x79,0x00,0x38,0x1e,0x22,0x06,0x23,0x08
   68 .Lmul_body:
   69 
   70         leaq    128(%rdx),%r12
   71         movdqa  0(%r10),%xmm0
   72         movdqa  16(%r10),%xmm1
   73         leaq    24-112(%rsp,%r9,8),%r10
   74         andq    $-16,%r10
   75 
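      /*
       * Build sixteen 128-bit masks covering indices 0..31: %xmm5 holds
       * the requested power broadcast to every lane, .Linc supplies the
       * increments, and pcmpeqd turns exactly the matching lane into
       * all-ones.  The masks (stored at 112..352(%r10)) are then ANDed
       * with all 32 table entries and the results ORed together, which
       * extracts one entry without a secret-dependent load address.
       */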
   76         pshufd  $0,%xmm5,%xmm5
   77         movdqa  %xmm1,%xmm4
   78         movdqa  %xmm1,%xmm2
   79         paddd   %xmm0,%xmm1
   80         pcmpeqd %xmm5,%xmm0
   81 .byte   0x67
   82         movdqa  %xmm4,%xmm3
   83         paddd   %xmm1,%xmm2
   84         pcmpeqd %xmm5,%xmm1
   85         movdqa  %xmm0,112(%r10)
   86         movdqa  %xmm4,%xmm0
   87 
   88         paddd   %xmm2,%xmm3
   89         pcmpeqd %xmm5,%xmm2
   90         movdqa  %xmm1,128(%r10)
   91         movdqa  %xmm4,%xmm1
   92 
   93         paddd   %xmm3,%xmm0
   94         pcmpeqd %xmm5,%xmm3
   95         movdqa  %xmm2,144(%r10)
   96         movdqa  %xmm4,%xmm2
   97 
   98         paddd   %xmm0,%xmm1
   99         pcmpeqd %xmm5,%xmm0
  100         movdqa  %xmm3,160(%r10)
  101         movdqa  %xmm4,%xmm3
  102         paddd   %xmm1,%xmm2
  103         pcmpeqd %xmm5,%xmm1
  104         movdqa  %xmm0,176(%r10)
  105         movdqa  %xmm4,%xmm0
  106 
  107         paddd   %xmm2,%xmm3
  108         pcmpeqd %xmm5,%xmm2
  109         movdqa  %xmm1,192(%r10)
  110         movdqa  %xmm4,%xmm1
  111 
  112         paddd   %xmm3,%xmm0
  113         pcmpeqd %xmm5,%xmm3
  114         movdqa  %xmm2,208(%r10)
  115         movdqa  %xmm4,%xmm2
  116 
  117         paddd   %xmm0,%xmm1
  118         pcmpeqd %xmm5,%xmm0
  119         movdqa  %xmm3,224(%r10)
  120         movdqa  %xmm4,%xmm3
  121         paddd   %xmm1,%xmm2
  122         pcmpeqd %xmm5,%xmm1
  123         movdqa  %xmm0,240(%r10)
  124         movdqa  %xmm4,%xmm0
  125 
  126         paddd   %xmm2,%xmm3
  127         pcmpeqd %xmm5,%xmm2
  128         movdqa  %xmm1,256(%r10)
  129         movdqa  %xmm4,%xmm1
  130 
  131         paddd   %xmm3,%xmm0
  132         pcmpeqd %xmm5,%xmm3
  133         movdqa  %xmm2,272(%r10)
  134         movdqa  %xmm4,%xmm2
  135 
  136         paddd   %xmm0,%xmm1
  137         pcmpeqd %xmm5,%xmm0
  138         movdqa  %xmm3,288(%r10)
  139         movdqa  %xmm4,%xmm3
  140         paddd   %xmm1,%xmm2
  141         pcmpeqd %xmm5,%xmm1
  142         movdqa  %xmm0,304(%r10)
  143 
  144         paddd   %xmm2,%xmm3
  145 .byte   0x67
  146         pcmpeqd %xmm5,%xmm2
  147         movdqa  %xmm1,320(%r10)
  148 
  149         pcmpeqd %xmm5,%xmm3
  150         movdqa  %xmm2,336(%r10)
  151         pand    64(%r12),%xmm0
  152 
  153         pand    80(%r12),%xmm1
  154         pand    96(%r12),%xmm2
  155         movdqa  %xmm3,352(%r10)
  156         pand    112(%r12),%xmm3
  157         por     %xmm2,%xmm0
  158         por     %xmm3,%xmm1
  159         movdqa  -128(%r12),%xmm4
  160         movdqa  -112(%r12),%xmm5
  161         movdqa  -96(%r12),%xmm2
  162         pand    112(%r10),%xmm4
  163         movdqa  -80(%r12),%xmm3
  164         pand    128(%r10),%xmm5
  165         por     %xmm4,%xmm0
  166         pand    144(%r10),%xmm2
  167         por     %xmm5,%xmm1
  168         pand    160(%r10),%xmm3
  169         por     %xmm2,%xmm0
  170         por     %xmm3,%xmm1
  171         movdqa  -64(%r12),%xmm4
  172         movdqa  -48(%r12),%xmm5
  173         movdqa  -32(%r12),%xmm2
  174         pand    176(%r10),%xmm4
  175         movdqa  -16(%r12),%xmm3
  176         pand    192(%r10),%xmm5
  177         por     %xmm4,%xmm0
  178         pand    208(%r10),%xmm2
  179         por     %xmm5,%xmm1
  180         pand    224(%r10),%xmm3
  181         por     %xmm2,%xmm0
  182         por     %xmm3,%xmm1
  183         movdqa  0(%r12),%xmm4
  184         movdqa  16(%r12),%xmm5
  185         movdqa  32(%r12),%xmm2
  186         pand    240(%r10),%xmm4
  187         movdqa  48(%r12),%xmm3
  188         pand    256(%r10),%xmm5
  189         por     %xmm4,%xmm0
  190         pand    272(%r10),%xmm2
  191         por     %xmm5,%xmm1
  192         pand    288(%r10),%xmm3
  193         por     %xmm2,%xmm0
  194         por     %xmm3,%xmm1
  195         por     %xmm1,%xmm0
  196         pshufd  $0x4e,%xmm0,%xmm1
  197         por     %xmm1,%xmm0
  198         leaq    256(%r12),%r12
  199 .byte   102,72,15,126,195
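      /* the .byte string encodes movq %xmm0,%rbx: the gathered multiplier
         limb is moved to %rbx for the scalar loop below */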
  200 
  201         movq    (%r8),%r8
  202         movq    (%rsi),%rax
  203 
  204         xorq    %r14,%r14
  205         xorq    %r15,%r15
  206 
  207         movq    %r8,%rbp
  208         mulq    %rbx
  209         movq    %rax,%r10
  210         movq    (%rcx),%rax
  211 
  212         imulq   %r10,%rbp
  213         movq    %rdx,%r11
  214 
  215         mulq    %rbp
  216         addq    %rax,%r10
  217         movq    8(%rsi),%rax
  218         adcq    $0,%rdx
  219         movq    %rdx,%r13
  220 
  221         leaq    1(%r15),%r15
  222         jmp     .L1st_enter
  223 
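      /*
       * First pass (i == 0): for each limb j accumulate a[j]*b[0]
       * (mulq %rbx) plus n[j]*m (mulq %rbp), where m = t[0]*n0 mod 2^64
       * was chosen above so that the low limb of the sum cancels.
       */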
  224 .align  16
  225 .L1st:
  226         addq    %rax,%r13
  227         movq    (%rsi,%r15,8),%rax
  228         adcq    $0,%rdx
  229         addq    %r11,%r13
  230         movq    %r10,%r11
  231         adcq    $0,%rdx
  232         movq    %r13,-16(%rsp,%r15,8)
  233         movq    %rdx,%r13
  234 
  235 .L1st_enter:
  236         mulq    %rbx
  237         addq    %rax,%r11
  238         movq    (%rcx,%r15,8),%rax
  239         adcq    $0,%rdx
  240         leaq    1(%r15),%r15
  241         movq    %rdx,%r10
  242 
  243         mulq    %rbp
  244         cmpq    %r9,%r15
  245         jne     .L1st
  246 
  247 
  248         addq    %rax,%r13
  249         adcq    $0,%rdx
  250         addq    %r11,%r13
  251         adcq    $0,%rdx
  252         movq    %r13,-16(%rsp,%r9,8)
  253         movq    %rdx,%r13
  254         movq    %r10,%r11
  255 
  256         xorq    %rdx,%rdx
  257         addq    %r11,%r13
  258         adcq    $0,%rdx
  259         movq    %r13,-8(%rsp,%r9,8)
  260         movq    %rdx,(%rsp,%r9,8)
  261 
  262         leaq    1(%r14),%r14
  263         jmp     .Louter
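      /*
       * Outer loop over the remaining multiplier limbs: gather b[i] with
       * the same mask trick, then fold a[]*b[i] + n[]*m into the running
       * total t[] kept on the stack, dropping one limb per iteration.
       */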
  264 .align  16
  265 .Louter:
  266         leaq    24+128(%rsp,%r9,8),%rdx
  267         andq    $-16,%rdx
  268         pxor    %xmm4,%xmm4
  269         pxor    %xmm5,%xmm5
  270         movdqa  -128(%r12),%xmm0
  271         movdqa  -112(%r12),%xmm1
  272         movdqa  -96(%r12),%xmm2
  273         movdqa  -80(%r12),%xmm3
  274         pand    -128(%rdx),%xmm0
  275         pand    -112(%rdx),%xmm1
  276         por     %xmm0,%xmm4
  277         pand    -96(%rdx),%xmm2
  278         por     %xmm1,%xmm5
  279         pand    -80(%rdx),%xmm3
  280         por     %xmm2,%xmm4
  281         por     %xmm3,%xmm5
  282         movdqa  -64(%r12),%xmm0
  283         movdqa  -48(%r12),%xmm1
  284         movdqa  -32(%r12),%xmm2
  285         movdqa  -16(%r12),%xmm3
  286         pand    -64(%rdx),%xmm0
  287         pand    -48(%rdx),%xmm1
  288         por     %xmm0,%xmm4
  289         pand    -32(%rdx),%xmm2
  290         por     %xmm1,%xmm5
  291         pand    -16(%rdx),%xmm3
  292         por     %xmm2,%xmm4
  293         por     %xmm3,%xmm5
  294         movdqa  0(%r12),%xmm0
  295         movdqa  16(%r12),%xmm1
  296         movdqa  32(%r12),%xmm2
  297         movdqa  48(%r12),%xmm3
  298         pand    0(%rdx),%xmm0
  299         pand    16(%rdx),%xmm1
  300         por     %xmm0,%xmm4
  301         pand    32(%rdx),%xmm2
  302         por     %xmm1,%xmm5
  303         pand    48(%rdx),%xmm3
  304         por     %xmm2,%xmm4
  305         por     %xmm3,%xmm5
  306         movdqa  64(%r12),%xmm0
  307         movdqa  80(%r12),%xmm1
  308         movdqa  96(%r12),%xmm2
  309         movdqa  112(%r12),%xmm3
  310         pand    64(%rdx),%xmm0
  311         pand    80(%rdx),%xmm1
  312         por     %xmm0,%xmm4
  313         pand    96(%rdx),%xmm2
  314         por     %xmm1,%xmm5
  315         pand    112(%rdx),%xmm3
  316         por     %xmm2,%xmm4
  317         por     %xmm3,%xmm5
  318         por     %xmm5,%xmm4
  319         pshufd  $0x4e,%xmm4,%xmm0
  320         por     %xmm4,%xmm0
  321         leaq    256(%r12),%r12
  322 
  323         movq    (%rsi),%rax
  324 .byte   102,72,15,126,195
  325 
  326         xorq    %r15,%r15
  327         movq    %r8,%rbp
  328         movq    (%rsp),%r10
  329 
  330         mulq    %rbx
  331         addq    %rax,%r10
  332         movq    (%rcx),%rax
  333         adcq    $0,%rdx
  334 
  335         imulq   %r10,%rbp
  336         movq    %rdx,%r11
  337 
  338         mulq    %rbp
  339         addq    %rax,%r10
  340         movq    8(%rsi),%rax
  341         adcq    $0,%rdx
  342         movq    8(%rsp),%r10
  343         movq    %rdx,%r13
  344 
  345         leaq    1(%r15),%r15
  346         jmp     .Linner_enter
  347 
  348 .align  16
  349 .Linner:
  350         addq    %rax,%r13
  351         movq    (%rsi,%r15,8),%rax
  352         adcq    $0,%rdx
  353         addq    %r10,%r13
  354         movq    (%rsp,%r15,8),%r10
  355         adcq    $0,%rdx
  356         movq    %r13,-16(%rsp,%r15,8)
  357         movq    %rdx,%r13
  358 
  359 .Linner_enter:
  360         mulq    %rbx
  361         addq    %rax,%r11
  362         movq    (%rcx,%r15,8),%rax
  363         adcq    $0,%rdx
  364         addq    %r11,%r10
  365         movq    %rdx,%r11
  366         adcq    $0,%r11
  367         leaq    1(%r15),%r15
  368 
  369         mulq    %rbp
  370         cmpq    %r9,%r15
  371         jne     .Linner
  372 
  373         addq    %rax,%r13
  374         adcq    $0,%rdx
  375         addq    %r10,%r13
  376         movq    (%rsp,%r9,8),%r10
  377         adcq    $0,%rdx
  378         movq    %r13,-16(%rsp,%r9,8)
  379         movq    %rdx,%r13
  380 
  381         xorq    %rdx,%rdx
  382         addq    %r11,%r13
  383         adcq    $0,%rdx
  384         addq    %r10,%r13
  385         adcq    $0,%rdx
  386         movq    %r13,-8(%rsp,%r9,8)
  387         movq    %rdx,(%rsp,%r9,8)
  388 
  389         leaq    1(%r14),%r14
  390         cmpq    %r9,%r14
  391         jb      .Louter
  392 
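      /*
       * Final reduction: .Lsub computes t - n with borrow propagation into
       * rp.  The final borrow leaves %rax as a 0/-1 mask (%rbx is its
       * complement), and .Lcopy selects between t and t - n branch-free,
       * overwriting the stack temporary with the loop index as it goes.
       */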
  393         xorq    %r14,%r14
  394         movq    (%rsp),%rax
  395         leaq    (%rsp),%rsi
  396         movq    %r9,%r15
  397         jmp     .Lsub
  398 .align  16
  399 .Lsub:  sbbq    (%rcx,%r14,8),%rax
  400         movq    %rax,(%rdi,%r14,8)
  401         movq    8(%rsi,%r14,8),%rax
  402         leaq    1(%r14),%r14
  403         decq    %r15
  404         jnz     .Lsub
  405 
  406         sbbq    $0,%rax
  407         movq    $-1,%rbx
  408         xorq    %rax,%rbx
  409         xorq    %r14,%r14
  410         movq    %r9,%r15
  411 
  412 .Lcopy:
  413         movq    (%rdi,%r14,8),%rcx
  414         movq    (%rsp,%r14,8),%rdx
  415         andq    %rbx,%rcx
  416         andq    %rax,%rdx
  417         movq    %r14,(%rsp,%r14,8)
  418         orq     %rcx,%rdx
  419         movq    %rdx,(%rdi,%r14,8)
  420         leaq    1(%r14),%r14
  421         subq    $1,%r15
  422         jnz     .Lcopy
  423 
  424         movq    8(%rsp,%r9,8),%rsi
  425 .cfi_def_cfa    %rsi,8
  426         movq    $1,%rax
  427 
  428         movq    -48(%rsi),%r15
  429 .cfi_restore    %r15
  430         movq    -40(%rsi),%r14
  431 .cfi_restore    %r14
  432         movq    -32(%rsi),%r13
  433 .cfi_restore    %r13
  434         movq    -24(%rsi),%r12
  435 .cfi_restore    %r12
  436         movq    -16(%rsi),%rbp
  437 .cfi_restore    %rbp
  438         movq    -8(%rsi),%rbx
  439 .cfi_restore    %rbx
  440         leaq    (%rsi),%rsp
  441 .cfi_def_cfa_register   %rsp
  442 .Lmul_epilogue:
  443         .byte   0xf3,0xc3
  444 .cfi_endproc    
  445 .size   bn_mul_mont_gather5,.-bn_mul_mont_gather5
  446 .type   bn_mul4x_mont_gather5,@function
  447 .align  32
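      /*
       * 4x unrolled variant, reached when num is a multiple of 8.  The
       * 0x80108 test checks BMI1|BMI2|ADX in the third OPENSSL_ia32cap_P
       * word and, when all are present, diverts to the MULX/ADX code.
       */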
  448 bn_mul4x_mont_gather5:
  449 .cfi_startproc  
  450 .byte   0x67
  451         movq    %rsp,%rax
  452 .cfi_def_cfa_register   %rax
  453 .Lmul4x_enter:
  454         andl    $0x80108,%r11d
  455         cmpl    $0x80108,%r11d
  456         je      .Lmulx4x_enter
  457         pushq   %rbx
  458 .cfi_offset     %rbx,-16
  459         pushq   %rbp
  460 .cfi_offset     %rbp,-24
  461         pushq   %r12
  462 .cfi_offset     %r12,-32
  463         pushq   %r13
  464 .cfi_offset     %r13,-40
  465         pushq   %r14
  466 .cfi_offset     %r14,-48
  467         pushq   %r15
  468 .cfi_offset     %r15,-56
  469 .Lmul4x_prologue:
  470 
  471 .byte   0x67
  472         shll    $3,%r9d
  473         leaq    (%r9,%r9,2),%r10
  474         negq    %r9
  475 
  476 
  477 
  478 
  479 
  480 
  481 
  482 
  483 
  484 
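      /*
       * Pick a frame address whose distance from the output buffer is not
       * a multiple of 4096, so stores to the frame cannot 4K-alias loads
       * from the result area; .Lmul4xsp_alt handles the wrap-around case.
       */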
  485         leaq    -320(%rsp,%r9,2),%r11
  486         movq    %rsp,%rbp
  487         subq    %rdi,%r11
  488         andq    $4095,%r11
  489         cmpq    %r11,%r10
  490         jb      .Lmul4xsp_alt
  491         subq    %r11,%rbp
  492         leaq    -320(%rbp,%r9,2),%rbp
  493         jmp     .Lmul4xsp_done
  494 
  495 .align  32
  496 .Lmul4xsp_alt:
  497         leaq    4096-320(,%r9,2),%r10
  498         leaq    -320(%rbp,%r9,2),%rbp
  499         subq    %r10,%r11
  500         movq    $0,%r10
  501         cmovcq  %r10,%r11
  502         subq    %r11,%rbp
  503 .Lmul4xsp_done:
  504         andq    $-64,%rbp
  505         movq    %rsp,%r11
  506         subq    %rbp,%r11
  507         andq    $-4096,%r11
  508         leaq    (%r11,%rbp,1),%rsp
  509         movq    (%rsp),%r10
  510         cmpq    %rbp,%rsp
  511         ja      .Lmul4x_page_walk
  512         jmp     .Lmul4x_page_walk_done
  513 
  514 .Lmul4x_page_walk:
  515         leaq    -4096(%rsp),%rsp
  516         movq    (%rsp),%r10
  517         cmpq    %rbp,%rsp
  518         ja      .Lmul4x_page_walk
  519 .Lmul4x_page_walk_done:
  520 
  521         negq    %r9
  522 
  523         movq    %rax,40(%rsp)
  524 .cfi_escape     0x0f,0x05,0x77,0x28,0x06,0x23,0x08
  525 .Lmul4x_body:
  526 
  527         call    mul4x_internal
  528 
  529         movq    40(%rsp),%rsi
  530 .cfi_def_cfa    %rsi,8
  531         movq    $1,%rax
  532 
  533         movq    -48(%rsi),%r15
  534 .cfi_restore    %r15
  535         movq    -40(%rsi),%r14
  536 .cfi_restore    %r14
  537         movq    -32(%rsi),%r13
  538 .cfi_restore    %r13
  539         movq    -24(%rsi),%r12
  540 .cfi_restore    %r12
  541         movq    -16(%rsi),%rbp
  542 .cfi_restore    %rbp
  543         movq    -8(%rsi),%rbx
  544 .cfi_restore    %rbx
  545         leaq    (%rsi),%rsp
  546 .cfi_def_cfa_register   %rsp
  547 .Lmul4x_epilogue:
  548         .byte   0xf3,0xc3
  549 .cfi_endproc    
  550 .size   bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5
  551 
  552 .type   mul4x_internal,@function
  553 .align  32
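      /*
       * Core of the 4x path: the same gather-then-multiply-accumulate
       * structure as above, four limbs per iteration.  16+8(%rsp) caches
       * the end of the gather table (the outer-loop bound) and 56+8(%rsp)
       * the result pointer; the routine exits through .Lsqr4x_sub_entry
       * in __bn_post4x_internal for the final conditional subtraction.
       */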
  554 mul4x_internal:
  555 .cfi_startproc  
  556         shlq    $5,%r9
  557         movd    8(%rax),%xmm5
  558         leaq    .Linc(%rip),%rax
  559         leaq    128(%rdx,%r9,1),%r13
  560         shrq    $5,%r9
  561         movdqa  0(%rax),%xmm0
  562         movdqa  16(%rax),%xmm1
  563         leaq    88-112(%rsp,%r9,1),%r10
  564         leaq    128(%rdx),%r12
  565 
  566         pshufd  $0,%xmm5,%xmm5
  567         movdqa  %xmm1,%xmm4
  568 .byte   0x67,0x67
  569         movdqa  %xmm1,%xmm2
  570         paddd   %xmm0,%xmm1
  571         pcmpeqd %xmm5,%xmm0
  572 .byte   0x67
  573         movdqa  %xmm4,%xmm3
  574         paddd   %xmm1,%xmm2
  575         pcmpeqd %xmm5,%xmm1
  576         movdqa  %xmm0,112(%r10)
  577         movdqa  %xmm4,%xmm0
  578 
  579         paddd   %xmm2,%xmm3
  580         pcmpeqd %xmm5,%xmm2
  581         movdqa  %xmm1,128(%r10)
  582         movdqa  %xmm4,%xmm1
  583 
  584         paddd   %xmm3,%xmm0
  585         pcmpeqd %xmm5,%xmm3
  586         movdqa  %xmm2,144(%r10)
  587         movdqa  %xmm4,%xmm2
  588 
  589         paddd   %xmm0,%xmm1
  590         pcmpeqd %xmm5,%xmm0
  591         movdqa  %xmm3,160(%r10)
  592         movdqa  %xmm4,%xmm3
  593         paddd   %xmm1,%xmm2
  594         pcmpeqd %xmm5,%xmm1
  595         movdqa  %xmm0,176(%r10)
  596         movdqa  %xmm4,%xmm0
  597 
  598         paddd   %xmm2,%xmm3
  599         pcmpeqd %xmm5,%xmm2
  600         movdqa  %xmm1,192(%r10)
  601         movdqa  %xmm4,%xmm1
  602 
  603         paddd   %xmm3,%xmm0
  604         pcmpeqd %xmm5,%xmm3
  605         movdqa  %xmm2,208(%r10)
  606         movdqa  %xmm4,%xmm2
  607 
  608         paddd   %xmm0,%xmm1
  609         pcmpeqd %xmm5,%xmm0
  610         movdqa  %xmm3,224(%r10)
  611         movdqa  %xmm4,%xmm3
  612         paddd   %xmm1,%xmm2
  613         pcmpeqd %xmm5,%xmm1
  614         movdqa  %xmm0,240(%r10)
  615         movdqa  %xmm4,%xmm0
  616 
  617         paddd   %xmm2,%xmm3
  618         pcmpeqd %xmm5,%xmm2
  619         movdqa  %xmm1,256(%r10)
  620         movdqa  %xmm4,%xmm1
  621 
  622         paddd   %xmm3,%xmm0
  623         pcmpeqd %xmm5,%xmm3
  624         movdqa  %xmm2,272(%r10)
  625         movdqa  %xmm4,%xmm2
  626 
  627         paddd   %xmm0,%xmm1
  628         pcmpeqd %xmm5,%xmm0
  629         movdqa  %xmm3,288(%r10)
  630         movdqa  %xmm4,%xmm3
  631         paddd   %xmm1,%xmm2
  632         pcmpeqd %xmm5,%xmm1
  633         movdqa  %xmm0,304(%r10)
  634 
  635         paddd   %xmm2,%xmm3
  636 .byte   0x67
  637         pcmpeqd %xmm5,%xmm2
  638         movdqa  %xmm1,320(%r10)
  639 
  640         pcmpeqd %xmm5,%xmm3
  641         movdqa  %xmm2,336(%r10)
  642         pand    64(%r12),%xmm0
  643 
  644         pand    80(%r12),%xmm1
  645         pand    96(%r12),%xmm2
  646         movdqa  %xmm3,352(%r10)
  647         pand    112(%r12),%xmm3
  648         por     %xmm2,%xmm0
  649         por     %xmm3,%xmm1
  650         movdqa  -128(%r12),%xmm4
  651         movdqa  -112(%r12),%xmm5
  652         movdqa  -96(%r12),%xmm2
  653         pand    112(%r10),%xmm4
  654         movdqa  -80(%r12),%xmm3
  655         pand    128(%r10),%xmm5
  656         por     %xmm4,%xmm0
  657         pand    144(%r10),%xmm2
  658         por     %xmm5,%xmm1
  659         pand    160(%r10),%xmm3
  660         por     %xmm2,%xmm0
  661         por     %xmm3,%xmm1
  662         movdqa  -64(%r12),%xmm4
  663         movdqa  -48(%r12),%xmm5
  664         movdqa  -32(%r12),%xmm2
  665         pand    176(%r10),%xmm4
  666         movdqa  -16(%r12),%xmm3
  667         pand    192(%r10),%xmm5
  668         por     %xmm4,%xmm0
  669         pand    208(%r10),%xmm2
  670         por     %xmm5,%xmm1
  671         pand    224(%r10),%xmm3
  672         por     %xmm2,%xmm0
  673         por     %xmm3,%xmm1
  674         movdqa  0(%r12),%xmm4
  675         movdqa  16(%r12),%xmm5
  676         movdqa  32(%r12),%xmm2
  677         pand    240(%r10),%xmm4
  678         movdqa  48(%r12),%xmm3
  679         pand    256(%r10),%xmm5
  680         por     %xmm4,%xmm0
  681         pand    272(%r10),%xmm2
  682         por     %xmm5,%xmm1
  683         pand    288(%r10),%xmm3
  684         por     %xmm2,%xmm0
  685         por     %xmm3,%xmm1
  686         por     %xmm1,%xmm0
  687         pshufd  $0x4e,%xmm0,%xmm1
  688         por     %xmm1,%xmm0
  689         leaq    256(%r12),%r12
  690 .byte   102,72,15,126,195
  691 
  692         movq    %r13,16+8(%rsp)
  693         movq    %rdi,56+8(%rsp)
  694 
  695         movq    (%r8),%r8
  696         movq    (%rsi),%rax
  697         leaq    (%rsi,%r9,1),%rsi
  698         negq    %r9
  699 
  700         movq    %r8,%rbp
  701         mulq    %rbx
  702         movq    %rax,%r10
  703         movq    (%rcx),%rax
  704 
  705         imulq   %r10,%rbp
  706         leaq    64+8(%rsp),%r14
  707         movq    %rdx,%r11
  708 
  709         mulq    %rbp
  710         addq    %rax,%r10
  711         movq    8(%rsi,%r9,1),%rax
  712         adcq    $0,%rdx
  713         movq    %rdx,%rdi
  714 
  715         mulq    %rbx
  716         addq    %rax,%r11
  717         movq    8(%rcx),%rax
  718         adcq    $0,%rdx
  719         movq    %rdx,%r10
  720 
  721         mulq    %rbp
  722         addq    %rax,%rdi
  723         movq    16(%rsi,%r9,1),%rax
  724         adcq    $0,%rdx
  725         addq    %r11,%rdi
  726         leaq    32(%r9),%r15
  727         leaq    32(%rcx),%rcx
  728         adcq    $0,%rdx
  729         movq    %rdi,(%r14)
  730         movq    %rdx,%r13
  731         jmp     .L1st4x
  732 
  733 .align  32
  734 .L1st4x:
  735         mulq    %rbx
  736         addq    %rax,%r10
  737         movq    -16(%rcx),%rax
  738         leaq    32(%r14),%r14
  739         adcq    $0,%rdx
  740         movq    %rdx,%r11
  741 
  742         mulq    %rbp
  743         addq    %rax,%r13
  744         movq    -8(%rsi,%r15,1),%rax
  745         adcq    $0,%rdx
  746         addq    %r10,%r13
  747         adcq    $0,%rdx
  748         movq    %r13,-24(%r14)
  749         movq    %rdx,%rdi
  750 
  751         mulq    %rbx
  752         addq    %rax,%r11
  753         movq    -8(%rcx),%rax
  754         adcq    $0,%rdx
  755         movq    %rdx,%r10
  756 
  757         mulq    %rbp
  758         addq    %rax,%rdi
  759         movq    (%rsi,%r15,1),%rax
  760         adcq    $0,%rdx
  761         addq    %r11,%rdi
  762         adcq    $0,%rdx
  763         movq    %rdi,-16(%r14)
  764         movq    %rdx,%r13
  765 
  766         mulq    %rbx
  767         addq    %rax,%r10
  768         movq    0(%rcx),%rax
  769         adcq    $0,%rdx
  770         movq    %rdx,%r11
  771 
  772         mulq    %rbp
  773         addq    %rax,%r13
  774         movq    8(%rsi,%r15,1),%rax
  775         adcq    $0,%rdx
  776         addq    %r10,%r13
  777         adcq    $0,%rdx
  778         movq    %r13,-8(%r14)
  779         movq    %rdx,%rdi
  780 
  781         mulq    %rbx
  782         addq    %rax,%r11
  783         movq    8(%rcx),%rax
  784         adcq    $0,%rdx
  785         movq    %rdx,%r10
  786 
  787         mulq    %rbp
  788         addq    %rax,%rdi
  789         movq    16(%rsi,%r15,1),%rax
  790         adcq    $0,%rdx
  791         addq    %r11,%rdi
  792         leaq    32(%rcx),%rcx
  793         adcq    $0,%rdx
  794         movq    %rdi,(%r14)
  795         movq    %rdx,%r13
  796 
  797         addq    $32,%r15
  798         jnz     .L1st4x
  799 
  800         mulq    %rbx
  801         addq    %rax,%r10
  802         movq    -16(%rcx),%rax
  803         leaq    32(%r14),%r14
  804         adcq    $0,%rdx
  805         movq    %rdx,%r11
  806 
  807         mulq    %rbp
  808         addq    %rax,%r13
  809         movq    -8(%rsi),%rax
  810         adcq    $0,%rdx
  811         addq    %r10,%r13
  812         adcq    $0,%rdx
  813         movq    %r13,-24(%r14)
  814         movq    %rdx,%rdi
  815 
  816         mulq    %rbx
  817         addq    %rax,%r11
  818         movq    -8(%rcx),%rax
  819         adcq    $0,%rdx
  820         movq    %rdx,%r10
  821 
  822         mulq    %rbp
  823         addq    %rax,%rdi
  824         movq    (%rsi,%r9,1),%rax
  825         adcq    $0,%rdx
  826         addq    %r11,%rdi
  827         adcq    $0,%rdx
  828         movq    %rdi,-16(%r14)
  829         movq    %rdx,%r13
  830 
  831         leaq    (%rcx,%r9,1),%rcx
  832 
  833         xorq    %rdi,%rdi
  834         addq    %r10,%r13
  835         adcq    $0,%rdi
  836         movq    %r13,-8(%r14)
  837 
  838         jmp     .Louter4x
  839 
  840 .align  32
  841 .Louter4x:
  842         leaq    16+128(%r14),%rdx
  843         pxor    %xmm4,%xmm4
  844         pxor    %xmm5,%xmm5
  845         movdqa  -128(%r12),%xmm0
  846         movdqa  -112(%r12),%xmm1
  847         movdqa  -96(%r12),%xmm2
  848         movdqa  -80(%r12),%xmm3
  849         pand    -128(%rdx),%xmm0
  850         pand    -112(%rdx),%xmm1
  851         por     %xmm0,%xmm4
  852         pand    -96(%rdx),%xmm2
  853         por     %xmm1,%xmm5
  854         pand    -80(%rdx),%xmm3
  855         por     %xmm2,%xmm4
  856         por     %xmm3,%xmm5
  857         movdqa  -64(%r12),%xmm0
  858         movdqa  -48(%r12),%xmm1
  859         movdqa  -32(%r12),%xmm2
  860         movdqa  -16(%r12),%xmm3
  861         pand    -64(%rdx),%xmm0
  862         pand    -48(%rdx),%xmm1
  863         por     %xmm0,%xmm4
  864         pand    -32(%rdx),%xmm2
  865         por     %xmm1,%xmm5
  866         pand    -16(%rdx),%xmm3
  867         por     %xmm2,%xmm4
  868         por     %xmm3,%xmm5
  869         movdqa  0(%r12),%xmm0
  870         movdqa  16(%r12),%xmm1
  871         movdqa  32(%r12),%xmm2
  872         movdqa  48(%r12),%xmm3
  873         pand    0(%rdx),%xmm0
  874         pand    16(%rdx),%xmm1
  875         por     %xmm0,%xmm4
  876         pand    32(%rdx),%xmm2
  877         por     %xmm1,%xmm5
  878         pand    48(%rdx),%xmm3
  879         por     %xmm2,%xmm4
  880         por     %xmm3,%xmm5
  881         movdqa  64(%r12),%xmm0
  882         movdqa  80(%r12),%xmm1
  883         movdqa  96(%r12),%xmm2
  884         movdqa  112(%r12),%xmm3
  885         pand    64(%rdx),%xmm0
  886         pand    80(%rdx),%xmm1
  887         por     %xmm0,%xmm4
  888         pand    96(%rdx),%xmm2
  889         por     %xmm1,%xmm5
  890         pand    112(%rdx),%xmm3
  891         por     %xmm2,%xmm4
  892         por     %xmm3,%xmm5
  893         por     %xmm5,%xmm4
  894         pshufd  $0x4e,%xmm4,%xmm0
  895         por     %xmm4,%xmm0
  896         leaq    256(%r12),%r12
  897 .byte   102,72,15,126,195
  898 
  899         movq    (%r14,%r9,1),%r10
  900         movq    %r8,%rbp
  901         mulq    %rbx
  902         addq    %rax,%r10
  903         movq    (%rcx),%rax
  904         adcq    $0,%rdx
  905 
  906         imulq   %r10,%rbp
  907         movq    %rdx,%r11
  908         movq    %rdi,(%r14)
  909 
  910         leaq    (%r14,%r9,1),%r14
  911 
  912         mulq    %rbp
  913         addq    %rax,%r10
  914         movq    8(%rsi,%r9,1),%rax
  915         adcq    $0,%rdx
  916         movq    %rdx,%rdi
  917 
  918         mulq    %rbx
  919         addq    %rax,%r11
  920         movq    8(%rcx),%rax
  921         adcq    $0,%rdx
  922         addq    8(%r14),%r11
  923         adcq    $0,%rdx
  924         movq    %rdx,%r10
  925 
  926         mulq    %rbp
  927         addq    %rax,%rdi
  928         movq    16(%rsi,%r9,1),%rax
  929         adcq    $0,%rdx
  930         addq    %r11,%rdi
  931         leaq    32(%r9),%r15
  932         leaq    32(%rcx),%rcx
  933         adcq    $0,%rdx
  934         movq    %rdx,%r13
  935         jmp     .Linner4x
  936 
  937 .align  32
  938 .Linner4x:
  939         mulq    %rbx
  940         addq    %rax,%r10
  941         movq    -16(%rcx),%rax
  942         adcq    $0,%rdx
  943         addq    16(%r14),%r10
  944         leaq    32(%r14),%r14
  945         adcq    $0,%rdx
  946         movq    %rdx,%r11
  947 
  948         mulq    %rbp
  949         addq    %rax,%r13
  950         movq    -8(%rsi,%r15,1),%rax
  951         adcq    $0,%rdx
  952         addq    %r10,%r13
  953         adcq    $0,%rdx
  954         movq    %rdi,-32(%r14)
  955         movq    %rdx,%rdi
  956 
  957         mulq    %rbx
  958         addq    %rax,%r11
  959         movq    -8(%rcx),%rax
  960         adcq    $0,%rdx
  961         addq    -8(%r14),%r11
  962         adcq    $0,%rdx
  963         movq    %rdx,%r10
  964 
  965         mulq    %rbp
  966         addq    %rax,%rdi
  967         movq    (%rsi,%r15,1),%rax
  968         adcq    $0,%rdx
  969         addq    %r11,%rdi
  970         adcq    $0,%rdx
  971         movq    %r13,-24(%r14)
  972         movq    %rdx,%r13
  973 
  974         mulq    %rbx
  975         addq    %rax,%r10
  976         movq    0(%rcx),%rax
  977         adcq    $0,%rdx
  978         addq    (%r14),%r10
  979         adcq    $0,%rdx
  980         movq    %rdx,%r11
  981 
  982         mulq    %rbp
  983         addq    %rax,%r13
  984         movq    8(%rsi,%r15,1),%rax
  985         adcq    $0,%rdx
  986         addq    %r10,%r13
  987         adcq    $0,%rdx
  988         movq    %rdi,-16(%r14)
  989         movq    %rdx,%rdi
  990 
  991         mulq    %rbx
  992         addq    %rax,%r11
  993         movq    8(%rcx),%rax
  994         adcq    $0,%rdx
  995         addq    8(%r14),%r11
  996         adcq    $0,%rdx
  997         movq    %rdx,%r10
  998 
  999         mulq    %rbp
 1000         addq    %rax,%rdi
 1001         movq    16(%rsi,%r15,1),%rax
 1002         adcq    $0,%rdx
 1003         addq    %r11,%rdi
 1004         leaq    32(%rcx),%rcx
 1005         adcq    $0,%rdx
 1006         movq    %r13,-8(%r14)
 1007         movq    %rdx,%r13
 1008 
 1009         addq    $32,%r15
 1010         jnz     .Linner4x
 1011 
 1012         mulq    %rbx
 1013         addq    %rax,%r10
 1014         movq    -16(%rcx),%rax
 1015         adcq    $0,%rdx
 1016         addq    16(%r14),%r10
 1017         leaq    32(%r14),%r14
 1018         adcq    $0,%rdx
 1019         movq    %rdx,%r11
 1020 
 1021         mulq    %rbp
 1022         addq    %rax,%r13
 1023         movq    -8(%rsi),%rax
 1024         adcq    $0,%rdx
 1025         addq    %r10,%r13
 1026         adcq    $0,%rdx
 1027         movq    %rdi,-32(%r14)
 1028         movq    %rdx,%rdi
 1029 
 1030         mulq    %rbx
 1031         addq    %rax,%r11
 1032         movq    %rbp,%rax
 1033         movq    -8(%rcx),%rbp
 1034         adcq    $0,%rdx
 1035         addq    -8(%r14),%r11
 1036         adcq    $0,%rdx
 1037         movq    %rdx,%r10
 1038 
 1039         mulq    %rbp
 1040         addq    %rax,%rdi
 1041         movq    (%rsi,%r9,1),%rax
 1042         adcq    $0,%rdx
 1043         addq    %r11,%rdi
 1044         adcq    $0,%rdx
 1045         movq    %r13,-24(%r14)
 1046         movq    %rdx,%r13
 1047 
 1048         movq    %rdi,-16(%r14)
 1049         leaq    (%rcx,%r9,1),%rcx
 1050 
 1051         xorq    %rdi,%rdi
 1052         addq    %r10,%r13
 1053         adcq    $0,%rdi
 1054         addq    (%r14),%r13
 1055         adcq    $0,%rdi
 1056         movq    %r13,-8(%r14)
 1057 
 1058         cmpq    16+8(%rsp),%r12
 1059         jb      .Louter4x
 1060         xorq    %rax,%rax
 1061         subq    %r13,%rbp
 1062         adcq    %r15,%r15
 1063         orq     %r15,%rdi
 1064         subq    %rdi,%rax
 1065         leaq    (%r14,%r9,1),%rbx
 1066         movq    (%rcx),%r12
 1067         leaq    (%rcx),%rbp
 1068         movq    %r9,%rcx
 1069         sarq    $3+2,%rcx
 1070         movq    56+8(%rsp),%rdi
 1071         decq    %r12
 1072         xorq    %r10,%r10
 1073         movq    8(%rbp),%r13
 1074         movq    16(%rbp),%r14
 1075         movq    24(%rbp),%r15
 1076         jmp     .Lsqr4x_sub_entry
 1077 .cfi_endproc    
 1078 .size   mul4x_internal,.-mul4x_internal
 1079 .globl  bn_power5
 1080 .type   bn_power5,@function
 1081 .align  32
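      /*
       * One step of 5-bit fixed-window exponentiation: square the
       * accumulator five times, then multiply by the table entry selected
       * by 'power', all in Montgomery form.  The .byte sequences at
       * .Lpower5_body encode movq %rdi,%xmm1 / %rcx,%xmm2 / %r10,%xmm3 /
       * %rdx,%xmm4, parking pointers across the internal calls.
       */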
 1082 bn_power5:
 1083 .cfi_startproc  
 1084         movq    %rsp,%rax
 1085 .cfi_def_cfa_register   %rax
 1086         movl    OPENSSL_ia32cap_P+8(%rip),%r11d
 1087         andl    $0x80108,%r11d
 1088         cmpl    $0x80108,%r11d
 1089         je      .Lpowerx5_enter
 1090         pushq   %rbx
 1091 .cfi_offset     %rbx,-16
 1092         pushq   %rbp
 1093 .cfi_offset     %rbp,-24
 1094         pushq   %r12
 1095 .cfi_offset     %r12,-32
 1096         pushq   %r13
 1097 .cfi_offset     %r13,-40
 1098         pushq   %r14
 1099 .cfi_offset     %r14,-48
 1100         pushq   %r15
 1101 .cfi_offset     %r15,-56
 1102 .Lpower5_prologue:
 1103 
 1104         shll    $3,%r9d
 1105         leal    (%r9,%r9,2),%r10d
 1106         negq    %r9
 1107         movq    (%r8),%r8
 1108 
 1109 
 1110 
 1111 
 1112 
 1113 
 1114 
 1115 
 1116         leaq    -320(%rsp,%r9,2),%r11
 1117         movq    %rsp,%rbp
 1118         subq    %rdi,%r11
 1119         andq    $4095,%r11
 1120         cmpq    %r11,%r10
 1121         jb      .Lpwr_sp_alt
 1122         subq    %r11,%rbp
 1123         leaq    -320(%rbp,%r9,2),%rbp
 1124         jmp     .Lpwr_sp_done
 1125 
 1126 .align  32
 1127 .Lpwr_sp_alt:
 1128         leaq    4096-320(,%r9,2),%r10
 1129         leaq    -320(%rbp,%r9,2),%rbp
 1130         subq    %r10,%r11
 1131         movq    $0,%r10
 1132         cmovcq  %r10,%r11
 1133         subq    %r11,%rbp
 1134 .Lpwr_sp_done:
 1135         andq    $-64,%rbp
 1136         movq    %rsp,%r11
 1137         subq    %rbp,%r11
 1138         andq    $-4096,%r11
 1139         leaq    (%r11,%rbp,1),%rsp
 1140         movq    (%rsp),%r10
 1141         cmpq    %rbp,%rsp
 1142         ja      .Lpwr_page_walk
 1143         jmp     .Lpwr_page_walk_done
 1144 
 1145 .Lpwr_page_walk:
 1146         leaq    -4096(%rsp),%rsp
 1147         movq    (%rsp),%r10
 1148         cmpq    %rbp,%rsp
 1149         ja      .Lpwr_page_walk
 1150 .Lpwr_page_walk_done:
 1151 
 1152         movq    %r9,%r10
 1153         negq    %r9
 1154 
 1155 
 1156 
 1157 
 1158 
 1159 
 1160 
 1161 
 1162 
 1163 
 1164         movq    %r8,32(%rsp)
 1165         movq    %rax,40(%rsp)
 1166 .cfi_escape     0x0f,0x05,0x77,0x28,0x06,0x23,0x08
 1167 .Lpower5_body:
 1168 .byte   102,72,15,110,207
 1169 .byte   102,72,15,110,209
 1170 .byte   102,73,15,110,218
 1171 .byte   102,72,15,110,226
 1172 
 1173         call    __bn_sqr8x_internal
 1174         call    __bn_post4x_internal
 1175         call    __bn_sqr8x_internal
 1176         call    __bn_post4x_internal
 1177         call    __bn_sqr8x_internal
 1178         call    __bn_post4x_internal
 1179         call    __bn_sqr8x_internal
 1180         call    __bn_post4x_internal
 1181         call    __bn_sqr8x_internal
 1182         call    __bn_post4x_internal
 1183 
 1184 .byte   102,72,15,126,209
 1185 .byte   102,72,15,126,226
 1186         movq    %rsi,%rdi
 1187         movq    40(%rsp),%rax
 1188         leaq    32(%rsp),%r8
 1189 
 1190         call    mul4x_internal
 1191 
 1192         movq    40(%rsp),%rsi
 1193 .cfi_def_cfa    %rsi,8
 1194         movq    $1,%rax
 1195         movq    -48(%rsi),%r15
 1196 .cfi_restore    %r15
 1197         movq    -40(%rsi),%r14
 1198 .cfi_restore    %r14
 1199         movq    -32(%rsi),%r13
 1200 .cfi_restore    %r13
 1201         movq    -24(%rsi),%r12
 1202 .cfi_restore    %r12
 1203         movq    -16(%rsi),%rbp
 1204 .cfi_restore    %rbp
 1205         movq    -8(%rsi),%rbx
 1206 .cfi_restore    %rbx
 1207         leaq    (%rsi),%rsp
 1208 .cfi_def_cfa_register   %rsp
 1209 .Lpower5_epilogue:
 1210         .byte   0xf3,0xc3
 1211 .cfi_endproc    
 1212 .size   bn_power5,.-bn_power5
 1213 
 1214 .globl  bn_sqr8x_internal
 1215 .hidden bn_sqr8x_internal
 1216 .type   bn_sqr8x_internal,@function
 1217 .align  32
 1218 bn_sqr8x_internal:
 1219 __bn_sqr8x_internal:
 1220 .cfi_startproc  
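      /*
       * Schoolbook squaring: accumulate the cross products a[i]*a[j], i<j,
       * into the 2*num-limb buffer (.Lsqr4x_1st/.Lsqr4x_outer/
       * .Lsqr4x_inner), then double them and add the diagonal squares in
       * .Lsqr4x_shift_n_add, and fall through to __bn_sqr8x_reduction for
       * the Montgomery reduction, eight limbs per pass.
       */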
 1221 
 1222 
 1223 
 1224 
 1225 
 1226 
 1227 
 1228 
 1229 
 1230 
 1231 
 1232 
 1233 
 1234 
 1235 
 1236 
 1237 
 1238 
 1239 
 1240 
 1241 
 1242 
 1243 
 1244 
 1245 
 1246 
 1247 
 1248 
 1249 
 1250 
 1251 
 1252 
 1253 
 1254 
 1255 
 1256 
 1257 
 1258 
 1259 
 1260 
 1261 
 1262 
 1263 
 1264 
 1265 
 1266 
 1267 
 1268 
 1269 
 1270 
 1271 
 1272 
 1273 
 1274 
 1275 
 1276 
 1277 
 1278 
 1279 
 1280 
 1281 
 1282 
 1283 
 1284 
 1285 
 1286 
 1287 
 1288 
 1289 
 1290 
 1291 
 1292 
 1293 
 1294         leaq    32(%r10),%rbp
 1295         leaq    (%rsi,%r9,1),%rsi
 1296 
 1297         movq    %r9,%rcx
 1298 
 1299 
 1300         movq    -32(%rsi,%rbp,1),%r14
 1301         leaq    48+8(%rsp,%r9,2),%rdi
 1302         movq    -24(%rsi,%rbp,1),%rax
 1303         leaq    -32(%rdi,%rbp,1),%rdi
 1304         movq    -16(%rsi,%rbp,1),%rbx
 1305         movq    %rax,%r15
 1306 
 1307         mulq    %r14
 1308         movq    %rax,%r10
 1309         movq    %rbx,%rax
 1310         movq    %rdx,%r11
 1311         movq    %r10,-24(%rdi,%rbp,1)
 1312 
 1313         mulq    %r14
 1314         addq    %rax,%r11
 1315         movq    %rbx,%rax
 1316         adcq    $0,%rdx
 1317         movq    %r11,-16(%rdi,%rbp,1)
 1318         movq    %rdx,%r10
 1319 
 1320 
 1321         movq    -8(%rsi,%rbp,1),%rbx
 1322         mulq    %r15
 1323         movq    %rax,%r12
 1324         movq    %rbx,%rax
 1325         movq    %rdx,%r13
 1326 
 1327         leaq    (%rbp),%rcx
 1328         mulq    %r14
 1329         addq    %rax,%r10
 1330         movq    %rbx,%rax
 1331         movq    %rdx,%r11
 1332         adcq    $0,%r11
 1333         addq    %r12,%r10
 1334         adcq    $0,%r11
 1335         movq    %r10,-8(%rdi,%rcx,1)
 1336         jmp     .Lsqr4x_1st
 1337 
 1338 .align  32
 1339 .Lsqr4x_1st:
 1340         movq    (%rsi,%rcx,1),%rbx
 1341         mulq    %r15
 1342         addq    %rax,%r13
 1343         movq    %rbx,%rax
 1344         movq    %rdx,%r12
 1345         adcq    $0,%r12
 1346 
 1347         mulq    %r14
 1348         addq    %rax,%r11
 1349         movq    %rbx,%rax
 1350         movq    8(%rsi,%rcx,1),%rbx
 1351         movq    %rdx,%r10
 1352         adcq    $0,%r10
 1353         addq    %r13,%r11
 1354         adcq    $0,%r10
 1355 
 1356 
 1357         mulq    %r15
 1358         addq    %rax,%r12
 1359         movq    %rbx,%rax
 1360         movq    %r11,(%rdi,%rcx,1)
 1361         movq    %rdx,%r13
 1362         adcq    $0,%r13
 1363 
 1364         mulq    %r14
 1365         addq    %rax,%r10
 1366         movq    %rbx,%rax
 1367         movq    16(%rsi,%rcx,1),%rbx
 1368         movq    %rdx,%r11
 1369         adcq    $0,%r11
 1370         addq    %r12,%r10
 1371         adcq    $0,%r11
 1372 
 1373         mulq    %r15
 1374         addq    %rax,%r13
 1375         movq    %rbx,%rax
 1376         movq    %r10,8(%rdi,%rcx,1)
 1377         movq    %rdx,%r12
 1378         adcq    $0,%r12
 1379 
 1380         mulq    %r14
 1381         addq    %rax,%r11
 1382         movq    %rbx,%rax
 1383         movq    24(%rsi,%rcx,1),%rbx
 1384         movq    %rdx,%r10
 1385         adcq    $0,%r10
 1386         addq    %r13,%r11
 1387         adcq    $0,%r10
 1388 
 1389 
 1390         mulq    %r15
 1391         addq    %rax,%r12
 1392         movq    %rbx,%rax
 1393         movq    %r11,16(%rdi,%rcx,1)
 1394         movq    %rdx,%r13
 1395         adcq    $0,%r13
 1396         leaq    32(%rcx),%rcx
 1397 
 1398         mulq    %r14
 1399         addq    %rax,%r10
 1400         movq    %rbx,%rax
 1401         movq    %rdx,%r11
 1402         adcq    $0,%r11
 1403         addq    %r12,%r10
 1404         adcq    $0,%r11
 1405         movq    %r10,-8(%rdi,%rcx,1)
 1406 
 1407         cmpq    $0,%rcx
 1408         jne     .Lsqr4x_1st
 1409 
 1410         mulq    %r15
 1411         addq    %rax,%r13
 1412         leaq    16(%rbp),%rbp
 1413         adcq    $0,%rdx
 1414         addq    %r11,%r13
 1415         adcq    $0,%rdx
 1416 
 1417         movq    %r13,(%rdi)
 1418         movq    %rdx,%r12
 1419         movq    %rdx,8(%rdi)
 1420         jmp     .Lsqr4x_outer
 1421 
 1422 .align  32
 1423 .Lsqr4x_outer:
 1424         movq    -32(%rsi,%rbp,1),%r14
 1425         leaq    48+8(%rsp,%r9,2),%rdi
 1426         movq    -24(%rsi,%rbp,1),%rax
 1427         leaq    -32(%rdi,%rbp,1),%rdi
 1428         movq    -16(%rsi,%rbp,1),%rbx
 1429         movq    %rax,%r15
 1430 
 1431         mulq    %r14
 1432         movq    -24(%rdi,%rbp,1),%r10
 1433         addq    %rax,%r10
 1434         movq    %rbx,%rax
 1435         adcq    $0,%rdx
 1436         movq    %r10,-24(%rdi,%rbp,1)
 1437         movq    %rdx,%r11
 1438 
 1439         mulq    %r14
 1440         addq    %rax,%r11
 1441         movq    %rbx,%rax
 1442         adcq    $0,%rdx
 1443         addq    -16(%rdi,%rbp,1),%r11
 1444         movq    %rdx,%r10
 1445         adcq    $0,%r10
 1446         movq    %r11,-16(%rdi,%rbp,1)
 1447 
 1448         xorq    %r12,%r12
 1449 
 1450         movq    -8(%rsi,%rbp,1),%rbx
 1451         mulq    %r15
 1452         addq    %rax,%r12
 1453         movq    %rbx,%rax
 1454         adcq    $0,%rdx
 1455         addq    -8(%rdi,%rbp,1),%r12
 1456         movq    %rdx,%r13
 1457         adcq    $0,%r13
 1458 
 1459         mulq    %r14
 1460         addq    %rax,%r10
 1461         movq    %rbx,%rax
 1462         adcq    $0,%rdx
 1463         addq    %r12,%r10
 1464         movq    %rdx,%r11
 1465         adcq    $0,%r11
 1466         movq    %r10,-8(%rdi,%rbp,1)
 1467 
 1468         leaq    (%rbp),%rcx
 1469         jmp     .Lsqr4x_inner
 1470 
 1471 .align  32
 1472 .Lsqr4x_inner:
 1473         movq    (%rsi,%rcx,1),%rbx
 1474         mulq    %r15
 1475         addq    %rax,%r13
 1476         movq    %rbx,%rax
 1477         movq    %rdx,%r12
 1478         adcq    $0,%r12
 1479         addq    (%rdi,%rcx,1),%r13
 1480         adcq    $0,%r12
 1481 
 1482 .byte   0x67
 1483         mulq    %r14
 1484         addq    %rax,%r11
 1485         movq    %rbx,%rax
 1486         movq    8(%rsi,%rcx,1),%rbx
 1487         movq    %rdx,%r10
 1488         adcq    $0,%r10
 1489         addq    %r13,%r11
 1490         adcq    $0,%r10
 1491 
 1492         mulq    %r15
 1493         addq    %rax,%r12
 1494         movq    %r11,(%rdi,%rcx,1)
 1495         movq    %rbx,%rax
 1496         movq    %rdx,%r13
 1497         adcq    $0,%r13
 1498         addq    8(%rdi,%rcx,1),%r12
 1499         leaq    16(%rcx),%rcx
 1500         adcq    $0,%r13
 1501 
 1502         mulq    %r14
 1503         addq    %rax,%r10
 1504         movq    %rbx,%rax
 1505         adcq    $0,%rdx
 1506         addq    %r12,%r10
 1507         movq    %rdx,%r11
 1508         adcq    $0,%r11
 1509         movq    %r10,-8(%rdi,%rcx,1)
 1510 
 1511         cmpq    $0,%rcx
 1512         jne     .Lsqr4x_inner
 1513 
 1514 .byte   0x67
 1515         mulq    %r15
 1516         addq    %rax,%r13
 1517         adcq    $0,%rdx
 1518         addq    %r11,%r13
 1519         adcq    $0,%rdx
 1520 
 1521         movq    %r13,(%rdi)
 1522         movq    %rdx,%r12
 1523         movq    %rdx,8(%rdi)
 1524 
 1525         addq    $16,%rbp
 1526         jnz     .Lsqr4x_outer
 1527 
 1528 
 1529         movq    -32(%rsi),%r14
 1530         leaq    48+8(%rsp,%r9,2),%rdi
 1531         movq    -24(%rsi),%rax
 1532         leaq    -32(%rdi,%rbp,1),%rdi
 1533         movq    -16(%rsi),%rbx
 1534         movq    %rax,%r15
 1535 
 1536         mulq    %r14
 1537         addq    %rax,%r10
 1538         movq    %rbx,%rax
 1539         movq    %rdx,%r11
 1540         adcq    $0,%r11
 1541 
 1542         mulq    %r14
 1543         addq    %rax,%r11
 1544         movq    %rbx,%rax
 1545         movq    %r10,-24(%rdi)
 1546         movq    %rdx,%r10
 1547         adcq    $0,%r10
 1548         addq    %r13,%r11
 1549         movq    -8(%rsi),%rbx
 1550         adcq    $0,%r10
 1551 
 1552         mulq    %r15
 1553         addq    %rax,%r12
 1554         movq    %rbx,%rax
 1555         movq    %r11,-16(%rdi)
 1556         movq    %rdx,%r13
 1557         adcq    $0,%r13
 1558 
 1559         mulq    %r14
 1560         addq    %rax,%r10
 1561         movq    %rbx,%rax
 1562         movq    %rdx,%r11
 1563         adcq    $0,%r11
 1564         addq    %r12,%r10
 1565         adcq    $0,%r11
 1566         movq    %r10,-8(%rdi)
 1567 
 1568         mulq    %r15
 1569         addq    %rax,%r13
 1570         movq    -16(%rsi),%rax
 1571         adcq    $0,%rdx
 1572         addq    %r11,%r13
 1573         adcq    $0,%rdx
 1574 
 1575         movq    %r13,(%rdi)
 1576         movq    %rdx,%r12
 1577         movq    %rdx,8(%rdi)
 1578 
 1579         mulq    %rbx
 1580         addq    $16,%rbp
 1581         xorq    %r14,%r14
 1582         subq    %r9,%rbp
 1583         xorq    %r15,%r15
 1584 
 1585         addq    %r12,%rax
 1586         adcq    $0,%rdx
 1587         movq    %rax,8(%rdi)
 1588         movq    %rdx,16(%rdi)
 1589         movq    %r15,24(%rdi)
 1590 
 1591         movq    -16(%rsi,%rbp,1),%rax
 1592         leaq    48+8(%rsp),%rdi
 1593         xorq    %r10,%r10
 1594         movq    8(%rdi),%r11
 1595 
 1596         leaq    (%r14,%r10,2),%r12
 1597         shrq    $63,%r10
 1598         leaq    (%rcx,%r11,2),%r13
 1599         shrq    $63,%r11
 1600         orq     %r10,%r13
 1601         movq    16(%rdi),%r10
 1602         movq    %r11,%r14
 1603         mulq    %rax
 1604         negq    %r15
 1605         movq    24(%rdi),%r11
 1606         adcq    %rax,%r12
 1607         movq    -8(%rsi,%rbp,1),%rax
 1608         movq    %r12,(%rdi)
 1609         adcq    %rdx,%r13
 1610 
 1611         leaq    (%r14,%r10,2),%rbx
 1612         movq    %r13,8(%rdi)
 1613         sbbq    %r15,%r15
 1614         shrq    $63,%r10
 1615         leaq    (%rcx,%r11,2),%r8
 1616         shrq    $63,%r11
 1617         orq     %r10,%r8
 1618         movq    32(%rdi),%r10
 1619         movq    %r11,%r14
 1620         mulq    %rax
 1621         negq    %r15
 1622         movq    40(%rdi),%r11
 1623         adcq    %rax,%rbx
 1624         movq    0(%rsi,%rbp,1),%rax
 1625         movq    %rbx,16(%rdi)
 1626         adcq    %rdx,%r8
 1627         leaq    16(%rbp),%rbp
 1628         movq    %r8,24(%rdi)
 1629         sbbq    %r15,%r15
 1630         leaq    64(%rdi),%rdi
 1631         jmp     .Lsqr4x_shift_n_add
 1632 
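      /*
       * Doubling pass: each step forms twice the cross-product sum (lea
       * with the bit shifted out of the previous limb) and adds a[i]^2
       * from mulq %rax, with %r15 re-materializing the carry chain around
       * the multiply.
       */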
 1633 .align  32
 1634 .Lsqr4x_shift_n_add:
 1635         leaq    (%r14,%r10,2),%r12
 1636         shrq    $63,%r10
 1637         leaq    (%rcx,%r11,2),%r13
 1638         shrq    $63,%r11
 1639         orq     %r10,%r13
 1640         movq    -16(%rdi),%r10
 1641         movq    %r11,%r14
 1642         mulq    %rax
 1643         negq    %r15
 1644         movq    -8(%rdi),%r11
 1645         adcq    %rax,%r12
 1646         movq    -8(%rsi,%rbp,1),%rax
 1647         movq    %r12,-32(%rdi)
 1648         adcq    %rdx,%r13
 1649 
 1650         leaq    (%r14,%r10,2),%rbx
 1651         movq    %r13,-24(%rdi)
 1652         sbbq    %r15,%r15
 1653         shrq    $63,%r10
 1654         leaq    (%rcx,%r11,2),%r8
 1655         shrq    $63,%r11
 1656         orq     %r10,%r8
 1657         movq    0(%rdi),%r10
 1658         movq    %r11,%r14
 1659         mulq    %rax
 1660         negq    %r15
 1661         movq    8(%rdi),%r11
 1662         adcq    %rax,%rbx
 1663         movq    0(%rsi,%rbp,1),%rax
 1664         movq    %rbx,-16(%rdi)
 1665         adcq    %rdx,%r8
 1666 
 1667         leaq    (%r14,%r10,2),%r12
 1668         movq    %r8,-8(%rdi)
 1669         sbbq    %r15,%r15
 1670         shrq    $63,%r10
 1671         leaq    (%rcx,%r11,2),%r13
 1672         shrq    $63,%r11
 1673         orq     %r10,%r13
 1674         movq    16(%rdi),%r10
 1675         movq    %r11,%r14
 1676         mulq    %rax
 1677         negq    %r15
 1678         movq    24(%rdi),%r11
 1679         adcq    %rax,%r12
 1680         movq    8(%rsi,%rbp,1),%rax
 1681         movq    %r12,0(%rdi)
 1682         adcq    %rdx,%r13
 1683 
 1684         leaq    (%r14,%r10,2),%rbx
 1685         movq    %r13,8(%rdi)
 1686         sbbq    %r15,%r15
 1687         shrq    $63,%r10
 1688         leaq    (%rcx,%r11,2),%r8
 1689         shrq    $63,%r11
 1690         orq     %r10,%r8
 1691         movq    32(%rdi),%r10
 1692         movq    %r11,%r14
 1693         mulq    %rax
 1694         negq    %r15
 1695         movq    40(%rdi),%r11
 1696         adcq    %rax,%rbx
 1697         movq    16(%rsi,%rbp,1),%rax
 1698         movq    %rbx,16(%rdi)
 1699         adcq    %rdx,%r8
 1700         movq    %r8,24(%rdi)
 1701         sbbq    %r15,%r15
 1702         leaq    64(%rdi),%rdi
 1703         addq    $32,%rbp
 1704         jnz     .Lsqr4x_shift_n_add
 1705 
 1706         leaq    (%r14,%r10,2),%r12
 1707 .byte   0x67
 1708         shrq    $63,%r10
 1709         leaq    (%rcx,%r11,2),%r13
 1710         shrq    $63,%r11
 1711         orq     %r10,%r13
 1712         movq    -16(%rdi),%r10
 1713         movq    %r11,%r14
 1714         mulq    %rax
 1715         negq    %r15
 1716         movq    -8(%rdi),%r11
 1717         adcq    %rax,%r12
 1718         movq    -8(%rsi),%rax
 1719         movq    %r12,-32(%rdi)
 1720         adcq    %rdx,%r13
 1721 
 1722         leaq    (%r14,%r10,2),%rbx
 1723         movq    %r13,-24(%rdi)
 1724         sbbq    %r15,%r15
 1725         shrq    $63,%r10
 1726         leaq    (%rcx,%r11,2),%r8
 1727         shrq    $63,%r11
 1728         orq     %r10,%r8
 1729         mulq    %rax
 1730         negq    %r15
 1731         adcq    %rax,%rbx
 1732         adcq    %rdx,%r8
 1733         movq    %rbx,-16(%rdi)
 1734         movq    %r8,-8(%rdi)
 1735 .byte   102,72,15,126,213
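      /* the .byte string encodes movq %xmm2,%rbp, recovering the modulus
         pointer parked by the caller */
      /*
       * Montgomery reduction of the 2*num-limb square, eight limbs per
       * pass: m = t[0]*n0 mod 2^64 (imulq 32+8(%rsp)) and m*n is added so
       * the bottom limb cancels (.L8x_reduce); .L8x_tail folds the upper
       * half of the buffer back in.
       */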
 1736 __bn_sqr8x_reduction:
 1737         xorq    %rax,%rax
 1738         leaq    (%r9,%rbp,1),%rcx
 1739         leaq    48+8(%rsp,%r9,2),%rdx
 1740         movq    %rcx,0+8(%rsp)
 1741         leaq    48+8(%rsp,%r9,1),%rdi
 1742         movq    %rdx,8+8(%rsp)
 1743         negq    %r9
 1744         jmp     .L8x_reduction_loop
 1745 
 1746 .align  32
 1747 .L8x_reduction_loop:
 1748         leaq    (%rdi,%r9,1),%rdi
 1749 .byte   0x66
 1750         movq    0(%rdi),%rbx
 1751         movq    8(%rdi),%r9
 1752         movq    16(%rdi),%r10
 1753         movq    24(%rdi),%r11
 1754         movq    32(%rdi),%r12
 1755         movq    40(%rdi),%r13
 1756         movq    48(%rdi),%r14
 1757         movq    56(%rdi),%r15
 1758         movq    %rax,(%rdx)
 1759         leaq    64(%rdi),%rdi
 1760 
 1761 .byte   0x67
 1762         movq    %rbx,%r8
 1763         imulq   32+8(%rsp),%rbx
 1764         movq    0(%rbp),%rax
 1765         movl    $8,%ecx
 1766         jmp     .L8x_reduce
 1767 
 1768 .align  32
 1769 .L8x_reduce:
 1770         mulq    %rbx
 1771         movq    8(%rbp),%rax
 1772         negq    %r8
 1773         movq    %rdx,%r8
 1774         adcq    $0,%r8
 1775 
 1776         mulq    %rbx
 1777         addq    %rax,%r9
 1778         movq    16(%rbp),%rax
 1779         adcq    $0,%rdx
 1780         addq    %r9,%r8
 1781         movq    %rbx,48-8+8(%rsp,%rcx,8)
 1782         movq    %rdx,%r9
 1783         adcq    $0,%r9
 1784 
 1785         mulq    %rbx
 1786         addq    %rax,%r10
 1787         movq    24(%rbp),%rax
 1788         adcq    $0,%rdx
 1789         addq    %r10,%r9
 1790         movq    32+8(%rsp),%rsi
 1791         movq    %rdx,%r10
 1792         adcq    $0,%r10
 1793 
 1794         mulq    %rbx
 1795         addq    %rax,%r11
 1796         movq    32(%rbp),%rax
 1797         adcq    $0,%rdx
 1798         imulq   %r8,%rsi
 1799         addq    %r11,%r10
 1800         movq    %rdx,%r11
 1801         adcq    $0,%r11
 1802 
 1803         mulq    %rbx
 1804         addq    %rax,%r12
 1805         movq    40(%rbp),%rax
 1806         adcq    $0,%rdx
 1807         addq    %r12,%r11
 1808         movq    %rdx,%r12
 1809         adcq    $0,%r12
 1810 
 1811         mulq    %rbx
 1812         addq    %rax,%r13
 1813         movq    48(%rbp),%rax
 1814         adcq    $0,%rdx
 1815         addq    %r13,%r12
 1816         movq    %rdx,%r13
 1817         adcq    $0,%r13
 1818 
 1819         mulq    %rbx
 1820         addq    %rax,%r14
 1821         movq    56(%rbp),%rax
 1822         adcq    $0,%rdx
 1823         addq    %r14,%r13
 1824         movq    %rdx,%r14
 1825         adcq    $0,%r14
 1826 
 1827         mulq    %rbx
 1828         movq    %rsi,%rbx
 1829         addq    %rax,%r15
 1830         movq    0(%rbp),%rax
 1831         adcq    $0,%rdx
 1832         addq    %r15,%r14
 1833         movq    %rdx,%r15
 1834         adcq    $0,%r15
 1835 
 1836         decl    %ecx
 1837         jnz     .L8x_reduce
 1838 
 1839         leaq    64(%rbp),%rbp
 1840         xorq    %rax,%rax
 1841         movq    8+8(%rsp),%rdx
 1842         cmpq    0+8(%rsp),%rbp
 1843         jae     .L8x_no_tail
 1844 
 1845 .byte   0x66
 1846         addq    0(%rdi),%r8
 1847         adcq    8(%rdi),%r9
 1848         adcq    16(%rdi),%r10
 1849         adcq    24(%rdi),%r11
 1850         adcq    32(%rdi),%r12
 1851         adcq    40(%rdi),%r13
 1852         adcq    48(%rdi),%r14
 1853         adcq    56(%rdi),%r15
 1854         sbbq    %rsi,%rsi
 1855 
 1856         movq    48+56+8(%rsp),%rbx
 1857         movl    $8,%ecx
 1858         movq    0(%rbp),%rax
 1859         jmp     .L8x_tail
 1860 
 1861 .align  32
 1862 .L8x_tail:
 1863         mulq    %rbx
 1864         addq    %rax,%r8
 1865         movq    8(%rbp),%rax
 1866         movq    %r8,(%rdi)
 1867         movq    %rdx,%r8
 1868         adcq    $0,%r8
 1869 
 1870         mulq    %rbx
 1871         addq    %rax,%r9
 1872         movq    16(%rbp),%rax
 1873         adcq    $0,%rdx
 1874         addq    %r9,%r8
 1875         leaq    8(%rdi),%rdi
 1876         movq    %rdx,%r9
 1877         adcq    $0,%r9
 1878 
 1879         mulq    %rbx
 1880         addq    %rax,%r10
 1881         movq    24(%rbp),%rax
 1882         adcq    $0,%rdx
 1883         addq    %r10,%r9
 1884         movq    %rdx,%r10
 1885         adcq    $0,%r10
 1886 
 1887         mulq    %rbx
 1888         addq    %rax,%r11
 1889         movq    32(%rbp),%rax
 1890         adcq    $0,%rdx
 1891         addq    %r11,%r10
 1892         movq    %rdx,%r11
 1893         adcq    $0,%r11
 1894 
 1895         mulq    %rbx
 1896         addq    %rax,%r12
 1897         movq    40(%rbp),%rax
 1898         adcq    $0,%rdx
 1899         addq    %r12,%r11
 1900         movq    %rdx,%r12
 1901         adcq    $0,%r12
 1902 
 1903         mulq    %rbx
 1904         addq    %rax,%r13
 1905         movq    48(%rbp),%rax
 1906         adcq    $0,%rdx
 1907         addq    %r13,%r12
 1908         movq    %rdx,%r13
 1909         adcq    $0,%r13
 1910 
 1911         mulq    %rbx
 1912         addq    %rax,%r14
 1913         movq    56(%rbp),%rax
 1914         adcq    $0,%rdx
 1915         addq    %r14,%r13
 1916         movq    %rdx,%r14
 1917         adcq    $0,%r14
 1918 
 1919         mulq    %rbx
 1920         movq    48-16+8(%rsp,%rcx,8),%rbx
 1921         addq    %rax,%r15
 1922         adcq    $0,%rdx
 1923         addq    %r15,%r14
 1924         movq    0(%rbp),%rax
 1925         movq    %rdx,%r15
 1926         adcq    $0,%r15
 1927 
 1928         decl    %ecx
 1929         jnz     .L8x_tail
 1930 
 1931         leaq    64(%rbp),%rbp
 1932         movq    8+8(%rsp),%rdx
 1933         cmpq    0+8(%rsp),%rbp
 1934         jae     .L8x_tail_done
 1935 
 1936         movq    48+56+8(%rsp),%rbx
 1937         negq    %rsi
 1938         movq    0(%rbp),%rax
 1939         adcq    0(%rdi),%r8
 1940         adcq    8(%rdi),%r9
 1941         adcq    16(%rdi),%r10
 1942         adcq    24(%rdi),%r11
 1943         adcq    32(%rdi),%r12
 1944         adcq    40(%rdi),%r13
 1945         adcq    48(%rdi),%r14
 1946         adcq    56(%rdi),%r15
 1947         sbbq    %rsi,%rsi
 1948 
 1949         movl    $8,%ecx
 1950         jmp     .L8x_tail
 1951 
 1952 .align  32
 1953 .L8x_tail_done:
 1954         xorq    %rax,%rax
 1955         addq    (%rdx),%r8
 1956         adcq    $0,%r9
 1957         adcq    $0,%r10
 1958         adcq    $0,%r11
 1959         adcq    $0,%r12
 1960         adcq    $0,%r13
 1961         adcq    $0,%r14
 1962         adcq    $0,%r15
 1963         adcq    $0,%rax
 1964 
 1965         negq    %rsi
 1966 .L8x_no_tail:
 1967         adcq    0(%rdi),%r8
 1968         adcq    8(%rdi),%r9
 1969         adcq    16(%rdi),%r10
 1970         adcq    24(%rdi),%r11
 1971         adcq    32(%rdi),%r12
 1972         adcq    40(%rdi),%r13
 1973         adcq    48(%rdi),%r14
 1974         adcq    56(%rdi),%r15
 1975         adcq    $0,%rax
 1976         movq    -8(%rbp),%rcx
 1977         xorq    %rsi,%rsi
 1978 
 1979 .byte   102,72,15,126,213
 1980 
 1981         movq    %r8,0(%rdi)
 1982         movq    %r9,8(%rdi)
 1983 .byte   102,73,15,126,217
 1984         movq    %r10,16(%rdi)
 1985         movq    %r11,24(%rdi)
 1986         movq    %r12,32(%rdi)
 1987         movq    %r13,40(%rdi)
 1988         movq    %r14,48(%rdi)
 1989         movq    %r15,56(%rdi)
 1990         leaq    64(%rdi),%rdi
 1991 
 1992         cmpq    %rdx,%rdi
 1993         jb      .L8x_reduction_loop
 1994         .byte   0xf3,0xc3
 1995 .cfi_endproc    
 1996 .size   bn_sqr8x_internal,.-bn_sqr8x_internal
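/*
 * Editor's note: the .L8x_reduce/.L8x_tail loops above are an unrolled
 * (eight limbs per pass) word-serial Montgomery reduction.  A hedged C
 * sketch of the same technique, with hypothetical names (n0 is
 * -n^{-1} mod 2^64, t holds 2*num limbs; the reduced value lands in
 * t[num..2*num-1] plus a final carry that feeds the conditional
 * subtraction):
 *
 *     static void mont_reduce_ref(uint64_t *t, const uint64_t *n,
 *                                 uint64_t n0, size_t num)
 *     {
 *         uint64_t carry = 0;
 *         for (size_t i = 0; i < num; i++) {
 *             uint64_t m = t[i] * n0;   // chosen so t[i] + m*n[0] == 0 mod 2^64
 *             unsigned __int128 acc = 0;
 *             for (size_t j = 0; j < num; j++) {
 *                 acc += (unsigned __int128)m * n[j] + t[i + j];
 *                 t[i + j] = (uint64_t)acc;
 *                 acc >>= 64;
 *             }
 *             // fold the row carry into the upper half of t
 *             unsigned __int128 s = (unsigned __int128)t[i + num]
 *                                 + (uint64_t)acc + carry;
 *             t[i + num] = (uint64_t)s;
 *             carry      = (uint64_t)(s >> 64);
 *         }
 *         // t[num..2*num-1] (+ carry) is now < 2n; one conditional
 *         // subtraction of n completes the reduction.
 *     }
 */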
 1997 .type   __bn_post4x_internal,@function
 1998 .align  32
 1999 __bn_post4x_internal:
 2000 .cfi_startproc  
 2001         movq    0(%rbp),%r12
 2002         leaq    (%rdi,%r9,1),%rbx
 2003         movq    %r9,%rcx
 2004 .byte   102,72,15,126,207
 2005         negq    %rax
 2006 .byte   102,72,15,126,206
 2007         sarq    $3+2,%rcx
 2008         decq    %r12
 2009         xorq    %r10,%r10
 2010         movq    8(%rbp),%r13
 2011         movq    16(%rbp),%r14
 2012         movq    24(%rbp),%r15
 2013         jmp     .Lsqr4x_sub_entry
 2014 
 2015 .align  16
 2016 .Lsqr4x_sub:
 2017         movq    0(%rbp),%r12
 2018         movq    8(%rbp),%r13
 2019         movq    16(%rbp),%r14
 2020         movq    24(%rbp),%r15
 2021 .Lsqr4x_sub_entry:
 2022         leaq    32(%rbp),%rbp
 2023         notq    %r12
 2024         notq    %r13
 2025         notq    %r14
 2026         notq    %r15
 2027         andq    %rax,%r12
 2028         andq    %rax,%r13
 2029         andq    %rax,%r14
 2030         andq    %rax,%r15
 2031 
 2032         negq    %r10
 2033         adcq    0(%rbx),%r12
 2034         adcq    8(%rbx),%r13
 2035         adcq    16(%rbx),%r14
 2036         adcq    24(%rbx),%r15
 2037         movq    %r12,0(%rdi)
 2038         leaq    32(%rbx),%rbx
 2039         movq    %r13,8(%rdi)
 2040         sbbq    %r10,%r10
 2041         movq    %r14,16(%rdi)
 2042         movq    %r15,24(%rdi)
 2043         leaq    32(%rdi),%rdi
 2044 
 2045         incq    %rcx
 2046         jnz     .Lsqr4x_sub
 2047 
 2048         movq    %r9,%r10
 2049         negq    %r9
 2050         .byte   0xf3,0xc3
 2051 .cfi_endproc    
 2052 .size   __bn_post4x_internal,.-__bn_post4x_internal
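/*
 * Editor's note: __bn_post4x_internal is the constant-time tail of the
 * reduction: it subtracts the modulus only when required, selecting it
 * with an all-ones/all-zero mask (the not/and pairs above) rather than a
 * branch, so the timing is independent of the data.  A hedged C sketch
 * with hypothetical names (mask is 0 or ~0):
 *
 *     static uint64_t cond_sub_ref(uint64_t *r, const uint64_t *t,
 *                                  const uint64_t *n, size_t num,
 *                                  uint64_t mask)
 *     {
 *         uint64_t borrow = 0;
 *         for (size_t i = 0; i < num; i++) {
 *             unsigned __int128 d = (unsigned __int128)t[i]
 *                                 - (n[i] & mask) - borrow;
 *             r[i]   = (uint64_t)d;
 *             borrow = (uint64_t)(d >> 64) & 1;  // 1 iff the limb underflowed
 *         }
 *         return borrow;
 *     }
 */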
 2053 .type   bn_mulx4x_mont_gather5,@function
 2054 .align  32
 2055 bn_mulx4x_mont_gather5:
 2056 .cfi_startproc  
 2057         movq    %rsp,%rax
 2058 .cfi_def_cfa_register   %rax
 2059 .Lmulx4x_enter:
 2060         pushq   %rbx
 2061 .cfi_offset     %rbx,-16
 2062         pushq   %rbp
 2063 .cfi_offset     %rbp,-24
 2064         pushq   %r12
 2065 .cfi_offset     %r12,-32
 2066         pushq   %r13
 2067 .cfi_offset     %r13,-40
 2068         pushq   %r14
 2069 .cfi_offset     %r14,-48
 2070         pushq   %r15
 2071 .cfi_offset     %r15,-56
 2072 .Lmulx4x_prologue:
 2073 
 2074         shll    $3,%r9d
 2075         leaq    (%r9,%r9,2),%r10
 2076         negq    %r9
 2077         movq    (%r8),%r8
 2078 
 2079 
 2080 
 2081 
 2082 
 2083 
 2084 
 2085 
 2086 
 2087 
 2088         leaq    -320(%rsp,%r9,2),%r11
 2089         movq    %rsp,%rbp
 2090         subq    %rdi,%r11
 2091         andq    $4095,%r11
 2092         cmpq    %r11,%r10
 2093         jb      .Lmulx4xsp_alt
 2094         subq    %r11,%rbp
 2095         leaq    -320(%rbp,%r9,2),%rbp
 2096         jmp     .Lmulx4xsp_done
 2097 
 2098 .Lmulx4xsp_alt:
 2099         leaq    4096-320(,%r9,2),%r10
 2100         leaq    -320(%rbp,%r9,2),%rbp
 2101         subq    %r10,%r11
 2102         movq    $0,%r10
 2103         cmovcq  %r10,%r11
 2104         subq    %r11,%rbp
 2105 .Lmulx4xsp_done:
 2106         andq    $-64,%rbp
 2107         movq    %rsp,%r11
 2108         subq    %rbp,%r11
 2109         andq    $-4096,%r11
 2110         leaq    (%r11,%rbp,1),%rsp
 2111         movq    (%rsp),%r10
 2112         cmpq    %rbp,%rsp
 2113         ja      .Lmulx4x_page_walk
 2114         jmp     .Lmulx4x_page_walk_done
 2115 
 2116 .Lmulx4x_page_walk:
 2117         leaq    -4096(%rsp),%rsp
 2118         movq    (%rsp),%r10
 2119         cmpq    %rbp,%rsp
 2120         ja      .Lmulx4x_page_walk
 2121 .Lmulx4x_page_walk_done:
 2122 
 2123 
 2124 
 2125 
 2126 
 2127 
 2128 
 2129 
 2130 
 2131 
 2132 
 2133 
 2134 
 2135         movq    %r8,32(%rsp)
 2136         movq    %rax,40(%rsp)
 2137 .cfi_escape     0x0f,0x05,0x77,0x28,0x06,0x23,0x08
 2138 .Lmulx4x_body:
 2139         call    mulx4x_internal
 2140 
 2141         movq    40(%rsp),%rsi
 2142 .cfi_def_cfa    %rsi,8
 2143         movq    $1,%rax
 2144 
 2145         movq    -48(%rsi),%r15
 2146 .cfi_restore    %r15
 2147         movq    -40(%rsi),%r14
 2148 .cfi_restore    %r14
 2149         movq    -32(%rsi),%r13
 2150 .cfi_restore    %r13
 2151         movq    -24(%rsi),%r12
 2152 .cfi_restore    %r12
 2153         movq    -16(%rsi),%rbp
 2154 .cfi_restore    %rbp
 2155         movq    -8(%rsi),%rbx
 2156 .cfi_restore    %rbx
 2157         leaq    (%rsi),%rsp
 2158 .cfi_def_cfa_register   %rsp
 2159 .Lmulx4x_epilogue:
 2160         .byte   0xf3,0xc3
 2161 .cfi_endproc    
 2162 .size   bn_mulx4x_mont_gather5,.-bn_mulx4x_mont_gather5
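/*
 * Editor's note: the mulx4x path relies on BMI2/ADX.  MULX multiplies
 * without touching flags, while ADCX and ADOX propagate carries through CF
 * and OF independently, so the low-half and high-half addition chains of a
 * multiply-accumulate row run interleaved.  A hedged intrinsics sketch
 * (hypothetical helper; C cannot express the two flag chains directly, so
 * the compiler is trusted to fuse _addcarry_u64 into adcx/adox):
 *
 *     #include <immintrin.h>
 *
 *     // acc[0..n+1] += a[0..n-1] * b; assumes acc[n+1] has headroom,
 *     // as the Montgomery loop guarantees for its accumulator
 *     static void mac_row(unsigned long long *acc,
 *                         const unsigned long long *a,
 *                         unsigned long long b, size_t n)
 *     {
 *         unsigned char cf = 0, of = 0;
 *         unsigned long long hi, lo;
 *         for (size_t i = 0; i < n; i++) {
 *             lo = _mulx_u64(a[i], b, &hi);               // flags untouched
 *             cf = _addcarry_u64(cf, acc[i], lo, &acc[i]);        // low chain
 *             of = _addcarry_u64(of, acc[i + 1], hi, &acc[i + 1]); // high chain
 *         }
 *         // drain both pending carries into the top limbs
 *         cf = _addcarry_u64(cf, acc[n], 0, &acc[n]);
 *         (void)_addcarry_u64(0, acc[n + 1],
 *                             (unsigned long long)cf + of, &acc[n + 1]);
 *     }
 */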
 2163 
 2164 .type   mulx4x_internal,@function
 2165 .align  32
 2166 mulx4x_internal:
 2167 .cfi_startproc  
 2168         movq    %r9,8(%rsp)
 2169         movq    %r9,%r10
 2170         negq    %r9
 2171         shlq    $5,%r9
 2172         negq    %r10
 2173         leaq    128(%rdx,%r9,1),%r13
 2174         shrq    $5+5,%r9
 2175         movd    8(%rax),%xmm5
 2176         subq    $1,%r9
 2177         leaq    .Linc(%rip),%rax
 2178         movq    %r13,16+8(%rsp)
 2179         movq    %r9,24+8(%rsp)
 2180         movq    %rdi,56+8(%rsp)
 2181         movdqa  0(%rax),%xmm0
 2182         movdqa  16(%rax),%xmm1
 2183         leaq    88-112(%rsp,%r10,1),%r10
 2184         leaq    128(%rdx),%rdi
 2185 
 2186         pshufd  $0,%xmm5,%xmm5
 2187         movdqa  %xmm1,%xmm4
 2188 .byte   0x67
 2189         movdqa  %xmm1,%xmm2
 2190 .byte   0x67
 2191         paddd   %xmm0,%xmm1
 2192         pcmpeqd %xmm5,%xmm0
 2193         movdqa  %xmm4,%xmm3
 2194         paddd   %xmm1,%xmm2
 2195         pcmpeqd %xmm5,%xmm1
 2196         movdqa  %xmm0,112(%r10)
 2197         movdqa  %xmm4,%xmm0
 2198 
 2199         paddd   %xmm2,%xmm3
 2200         pcmpeqd %xmm5,%xmm2
 2201         movdqa  %xmm1,128(%r10)
 2202         movdqa  %xmm4,%xmm1
 2203 
 2204         paddd   %xmm3,%xmm0
 2205         pcmpeqd %xmm5,%xmm3
 2206         movdqa  %xmm2,144(%r10)
 2207         movdqa  %xmm4,%xmm2
 2208 
 2209         paddd   %xmm0,%xmm1
 2210         pcmpeqd %xmm5,%xmm0
 2211         movdqa  %xmm3,160(%r10)
 2212         movdqa  %xmm4,%xmm3
 2213         paddd   %xmm1,%xmm2
 2214         pcmpeqd %xmm5,%xmm1
 2215         movdqa  %xmm0,176(%r10)
 2216         movdqa  %xmm4,%xmm0
 2217 
 2218         paddd   %xmm2,%xmm3
 2219         pcmpeqd %xmm5,%xmm2
 2220         movdqa  %xmm1,192(%r10)
 2221         movdqa  %xmm4,%xmm1
 2222 
 2223         paddd   %xmm3,%xmm0
 2224         pcmpeqd %xmm5,%xmm3
 2225         movdqa  %xmm2,208(%r10)
 2226         movdqa  %xmm4,%xmm2
 2227 
 2228         paddd   %xmm0,%xmm1
 2229         pcmpeqd %xmm5,%xmm0
 2230         movdqa  %xmm3,224(%r10)
 2231         movdqa  %xmm4,%xmm3
 2232         paddd   %xmm1,%xmm2
 2233         pcmpeqd %xmm5,%xmm1
 2234         movdqa  %xmm0,240(%r10)
 2235         movdqa  %xmm4,%xmm0
 2236 
 2237         paddd   %xmm2,%xmm3
 2238         pcmpeqd %xmm5,%xmm2
 2239         movdqa  %xmm1,256(%r10)
 2240         movdqa  %xmm4,%xmm1
 2241 
 2242         paddd   %xmm3,%xmm0
 2243         pcmpeqd %xmm5,%xmm3
 2244         movdqa  %xmm2,272(%r10)
 2245         movdqa  %xmm4,%xmm2
 2246 
 2247         paddd   %xmm0,%xmm1
 2248         pcmpeqd %xmm5,%xmm0
 2249         movdqa  %xmm3,288(%r10)
 2250         movdqa  %xmm4,%xmm3
 2251 .byte   0x67
 2252         paddd   %xmm1,%xmm2
 2253         pcmpeqd %xmm5,%xmm1
 2254         movdqa  %xmm0,304(%r10)
 2255 
 2256         paddd   %xmm2,%xmm3
 2257         pcmpeqd %xmm5,%xmm2
 2258         movdqa  %xmm1,320(%r10)
 2259 
 2260         pcmpeqd %xmm5,%xmm3
 2261         movdqa  %xmm2,336(%r10)
 2262 
 2263         pand    64(%rdi),%xmm0
 2264         pand    80(%rdi),%xmm1
 2265         pand    96(%rdi),%xmm2
 2266         movdqa  %xmm3,352(%r10)
 2267         pand    112(%rdi),%xmm3
 2268         por     %xmm2,%xmm0
 2269         por     %xmm3,%xmm1
 2270         movdqa  -128(%rdi),%xmm4
 2271         movdqa  -112(%rdi),%xmm5
 2272         movdqa  -96(%rdi),%xmm2
 2273         pand    112(%r10),%xmm4
 2274         movdqa  -80(%rdi),%xmm3
 2275         pand    128(%r10),%xmm5
 2276         por     %xmm4,%xmm0
 2277         pand    144(%r10),%xmm2
 2278         por     %xmm5,%xmm1
 2279         pand    160(%r10),%xmm3
 2280         por     %xmm2,%xmm0
 2281         por     %xmm3,%xmm1
 2282         movdqa  -64(%rdi),%xmm4
 2283         movdqa  -48(%rdi),%xmm5
 2284         movdqa  -32(%rdi),%xmm2
 2285         pand    176(%r10),%xmm4
 2286         movdqa  -16(%rdi),%xmm3
 2287         pand    192(%r10),%xmm5
 2288         por     %xmm4,%xmm0
 2289         pand    208(%r10),%xmm2
 2290         por     %xmm5,%xmm1
 2291         pand    224(%r10),%xmm3
 2292         por     %xmm2,%xmm0
 2293         por     %xmm3,%xmm1
 2294         movdqa  0(%rdi),%xmm4
 2295         movdqa  16(%rdi),%xmm5
 2296         movdqa  32(%rdi),%xmm2
 2297         pand    240(%r10),%xmm4
 2298         movdqa  48(%rdi),%xmm3
 2299         pand    256(%r10),%xmm5
 2300         por     %xmm4,%xmm0
 2301         pand    272(%r10),%xmm2
 2302         por     %xmm5,%xmm1
 2303         pand    288(%r10),%xmm3
 2304         por     %xmm2,%xmm0
 2305         por     %xmm3,%xmm1
 2306         pxor    %xmm1,%xmm0
 2307         pshufd  $0x4e,%xmm0,%xmm1
 2308         por     %xmm1,%xmm0
 2309         leaq    256(%rdi),%rdi
 2310 .byte   102,72,15,126,194
 2311         leaq    64+32+8(%rsp),%rbx
 2312 
 2313         movq    %rdx,%r9
 2314         mulxq   0(%rsi),%r8,%rax
 2315         mulxq   8(%rsi),%r11,%r12
 2316         addq    %rax,%r11
 2317         mulxq   16(%rsi),%rax,%r13
 2318         adcq    %rax,%r12
 2319         adcq    $0,%r13
 2320         mulxq   24(%rsi),%rax,%r14
 2321 
 2322         movq    %r8,%r15
 2323         imulq   32+8(%rsp),%r8
 2324         xorq    %rbp,%rbp
 2325         movq    %r8,%rdx
 2326 
 2327         movq    %rdi,8+8(%rsp)
 2328 
 2329         leaq    32(%rsi),%rsi
 2330         adcxq   %rax,%r13
 2331         adcxq   %rbp,%r14
 2332 
 2333         mulxq   0(%rcx),%rax,%r10
 2334         adcxq   %rax,%r15
 2335         adoxq   %r11,%r10
 2336         mulxq   8(%rcx),%rax,%r11
 2337         adcxq   %rax,%r10
 2338         adoxq   %r12,%r11
 2339         mulxq   16(%rcx),%rax,%r12
 2340         movq    24+8(%rsp),%rdi
 2341         movq    %r10,-32(%rbx)
 2342         adcxq   %rax,%r11
 2343         adoxq   %r13,%r12
 2344         mulxq   24(%rcx),%rax,%r15
 2345         movq    %r9,%rdx
 2346         movq    %r11,-24(%rbx)
 2347         adcxq   %rax,%r12
 2348         adoxq   %rbp,%r15
 2349         leaq    32(%rcx),%rcx
 2350         movq    %r12,-16(%rbx)
 2351         jmp     .Lmulx4x_1st
 2352 
 2353 .align  32
 2354 .Lmulx4x_1st:
 2355         adcxq   %rbp,%r15
 2356         mulxq   0(%rsi),%r10,%rax
 2357         adcxq   %r14,%r10
 2358         mulxq   8(%rsi),%r11,%r14
 2359         adcxq   %rax,%r11
 2360         mulxq   16(%rsi),%r12,%rax
 2361         adcxq   %r14,%r12
 2362         mulxq   24(%rsi),%r13,%r14
 2363 .byte   0x67,0x67
 2364         movq    %r8,%rdx
 2365         adcxq   %rax,%r13
 2366         adcxq   %rbp,%r14
 2367         leaq    32(%rsi),%rsi
 2368         leaq    32(%rbx),%rbx
 2369 
 2370         adoxq   %r15,%r10
 2371         mulxq   0(%rcx),%rax,%r15
 2372         adcxq   %rax,%r10
 2373         adoxq   %r15,%r11
 2374         mulxq   8(%rcx),%rax,%r15
 2375         adcxq   %rax,%r11
 2376         adoxq   %r15,%r12
 2377         mulxq   16(%rcx),%rax,%r15
 2378         movq    %r10,-40(%rbx)
 2379         adcxq   %rax,%r12
 2380         movq    %r11,-32(%rbx)
 2381         adoxq   %r15,%r13
 2382         mulxq   24(%rcx),%rax,%r15
 2383         movq    %r9,%rdx
 2384         movq    %r12,-24(%rbx)
 2385         adcxq   %rax,%r13
 2386         adoxq   %rbp,%r15
 2387         leaq    32(%rcx),%rcx
 2388         movq    %r13,-16(%rbx)
 2389 
 2390         decq    %rdi
 2391         jnz     .Lmulx4x_1st
 2392 
 2393         movq    8(%rsp),%rax
 2394         adcq    %rbp,%r15
 2395         leaq    (%rsi,%rax,1),%rsi
 2396         addq    %r15,%r14
 2397         movq    8+8(%rsp),%rdi
 2398         adcq    %rbp,%rbp
 2399         movq    %r14,-8(%rbx)
 2400         jmp     .Lmulx4x_outer
 2401 
 2402 .align  32
 2403 .Lmulx4x_outer:
 2404         leaq    16-256(%rbx),%r10
 2405         pxor    %xmm4,%xmm4
 2406 .byte   0x67,0x67
 2407         pxor    %xmm5,%xmm5
 2408         movdqa  -128(%rdi),%xmm0
 2409         movdqa  -112(%rdi),%xmm1
 2410         movdqa  -96(%rdi),%xmm2
 2411         pand    256(%r10),%xmm0
 2412         movdqa  -80(%rdi),%xmm3
 2413         pand    272(%r10),%xmm1
 2414         por     %xmm0,%xmm4
 2415         pand    288(%r10),%xmm2
 2416         por     %xmm1,%xmm5
 2417         pand    304(%r10),%xmm3
 2418         por     %xmm2,%xmm4
 2419         por     %xmm3,%xmm5
 2420         movdqa  -64(%rdi),%xmm0
 2421         movdqa  -48(%rdi),%xmm1
 2422         movdqa  -32(%rdi),%xmm2
 2423         pand    320(%r10),%xmm0
 2424         movdqa  -16(%rdi),%xmm3
 2425         pand    336(%r10),%xmm1
 2426         por     %xmm0,%xmm4
 2427         pand    352(%r10),%xmm2
 2428         por     %xmm1,%xmm5
 2429         pand    368(%r10),%xmm3
 2430         por     %xmm2,%xmm4
 2431         por     %xmm3,%xmm5
 2432         movdqa  0(%rdi),%xmm0
 2433         movdqa  16(%rdi),%xmm1
 2434         movdqa  32(%rdi),%xmm2
 2435         pand    384(%r10),%xmm0
 2436         movdqa  48(%rdi),%xmm3
 2437         pand    400(%r10),%xmm1
 2438         por     %xmm0,%xmm4
 2439         pand    416(%r10),%xmm2
 2440         por     %xmm1,%xmm5
 2441         pand    432(%r10),%xmm3
 2442         por     %xmm2,%xmm4
 2443         por     %xmm3,%xmm5
 2444         movdqa  64(%rdi),%xmm0
 2445         movdqa  80(%rdi),%xmm1
 2446         movdqa  96(%rdi),%xmm2
 2447         pand    448(%r10),%xmm0
 2448         movdqa  112(%rdi),%xmm3
 2449         pand    464(%r10),%xmm1
 2450         por     %xmm0,%xmm4
 2451         pand    480(%r10),%xmm2
 2452         por     %xmm1,%xmm5
 2453         pand    496(%r10),%xmm3
 2454         por     %xmm2,%xmm4
 2455         por     %xmm3,%xmm5
 2456         por     %xmm5,%xmm4
 2457         pshufd  $0x4e,%xmm4,%xmm0
 2458         por     %xmm4,%xmm0
 2459         leaq    256(%rdi),%rdi
 2460 .byte   102,72,15,126,194
 2461 
 2462         movq    %rbp,(%rbx)
 2463         leaq    32(%rbx,%rax,1),%rbx
 2464         mulxq   0(%rsi),%r8,%r11
 2465         xorq    %rbp,%rbp
 2466         movq    %rdx,%r9
 2467         mulxq   8(%rsi),%r14,%r12
 2468         adoxq   -32(%rbx),%r8
 2469         adcxq   %r14,%r11
 2470         mulxq   16(%rsi),%r15,%r13
 2471         adoxq   -24(%rbx),%r11
 2472         adcxq   %r15,%r12
 2473         mulxq   24(%rsi),%rdx,%r14
 2474         adoxq   -16(%rbx),%r12
 2475         adcxq   %rdx,%r13
 2476         leaq    (%rcx,%rax,1),%rcx
 2477         leaq    32(%rsi),%rsi
 2478         adoxq   -8(%rbx),%r13
 2479         adcxq   %rbp,%r14
 2480         adoxq   %rbp,%r14
 2481 
 2482         movq    %r8,%r15
 2483         imulq   32+8(%rsp),%r8
 2484 
 2485         movq    %r8,%rdx
 2486         xorq    %rbp,%rbp
 2487         movq    %rdi,8+8(%rsp)
 2488 
 2489         mulxq   0(%rcx),%rax,%r10
 2490         adcxq   %rax,%r15
 2491         adoxq   %r11,%r10
 2492         mulxq   8(%rcx),%rax,%r11
 2493         adcxq   %rax,%r10
 2494         adoxq   %r12,%r11
 2495         mulxq   16(%rcx),%rax,%r12
 2496         adcxq   %rax,%r11
 2497         adoxq   %r13,%r12
 2498         mulxq   24(%rcx),%rax,%r15
 2499         movq    %r9,%rdx
 2500         movq    24+8(%rsp),%rdi
 2501         movq    %r10,-32(%rbx)
 2502         adcxq   %rax,%r12
 2503         movq    %r11,-24(%rbx)
 2504         adoxq   %rbp,%r15
 2505         movq    %r12,-16(%rbx)
 2506         leaq    32(%rcx),%rcx
 2507         jmp     .Lmulx4x_inner
 2508 
 2509 .align  32
 2510 .Lmulx4x_inner:
 2511         mulxq   0(%rsi),%r10,%rax
 2512         adcxq   %rbp,%r15
 2513         adoxq   %r14,%r10
 2514         mulxq   8(%rsi),%r11,%r14
 2515         adcxq   0(%rbx),%r10
 2516         adoxq   %rax,%r11
 2517         mulxq   16(%rsi),%r12,%rax
 2518         adcxq   8(%rbx),%r11
 2519         adoxq   %r14,%r12
 2520         mulxq   24(%rsi),%r13,%r14
 2521         movq    %r8,%rdx
 2522         adcxq   16(%rbx),%r12
 2523         adoxq   %rax,%r13
 2524         adcxq   24(%rbx),%r13
 2525         adoxq   %rbp,%r14
 2526         leaq    32(%rsi),%rsi
 2527         leaq    32(%rbx),%rbx
 2528         adcxq   %rbp,%r14
 2529 
 2530         adoxq   %r15,%r10
 2531         mulxq   0(%rcx),%rax,%r15
 2532         adcxq   %rax,%r10
 2533         adoxq   %r15,%r11
 2534         mulxq   8(%rcx),%rax,%r15
 2535         adcxq   %rax,%r11
 2536         adoxq   %r15,%r12
 2537         mulxq   16(%rcx),%rax,%r15
 2538         movq    %r10,-40(%rbx)
 2539         adcxq   %rax,%r12
 2540         adoxq   %r15,%r13
 2541         movq    %r11,-32(%rbx)
 2542         mulxq   24(%rcx),%rax,%r15
 2543         movq    %r9,%rdx
 2544         leaq    32(%rcx),%rcx
 2545         movq    %r12,-24(%rbx)
 2546         adcxq   %rax,%r13
 2547         adoxq   %rbp,%r15
 2548         movq    %r13,-16(%rbx)
 2549 
 2550         decq    %rdi
 2551         jnz     .Lmulx4x_inner
 2552 
 2553         movq    0+8(%rsp),%rax
 2554         adcq    %rbp,%r15
 2555         subq    0(%rbx),%rdi
 2556         movq    8+8(%rsp),%rdi
 2557         movq    16+8(%rsp),%r10
 2558         adcq    %r15,%r14
 2559         leaq    (%rsi,%rax,1),%rsi
 2560         adcq    %rbp,%rbp
 2561         movq    %r14,-8(%rbx)
 2562 
 2563         cmpq    %r10,%rdi
 2564         jb      .Lmulx4x_outer
 2565 
 2566         movq    -8(%rcx),%r10
 2567         movq    %rbp,%r8
 2568         movq    (%rcx,%rax,1),%r12
 2569         leaq    (%rcx,%rax,1),%rbp
 2570         movq    %rax,%rcx
 2571         leaq    (%rbx,%rax,1),%rdi
 2572         xorl    %eax,%eax
 2573         xorq    %r15,%r15
 2574         subq    %r14,%r10
 2575         adcq    %r15,%r15
 2576         orq     %r15,%r8
 2577         sarq    $3+2,%rcx
 2578         subq    %r8,%rax
 2579         movq    56+8(%rsp),%rdx
 2580         decq    %r12
 2581         movq    8(%rbp),%r13
 2582         xorq    %r8,%r8
 2583         movq    16(%rbp),%r14
 2584         movq    24(%rbp),%r15
 2585         jmp     .Lsqrx4x_sub_entry
 2586 .cfi_endproc    
 2587 .size   mulx4x_internal,.-mulx4x_internal
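/*
 * Editor's note: mulx4x_internal interleaves multiplication and reduction,
 * a CIOS-style Montgomery multiply processed four limbs at a time.  A
 * hedged, unoptimized C reference of the same algorithm, hypothetical
 * names only (R = 2^(64*num), n0 = -n^{-1} mod 2^64):
 *
 *     static void mont_mul_ref(uint64_t *r, const uint64_t *a,
 *                              const uint64_t *b, const uint64_t *n,
 *                              uint64_t n0, size_t num)
 *     {
 *         uint64_t t[num + 2];                  // C99 VLA, illustration only
 *         memset(t, 0, sizeof(t));
 *         for (size_t i = 0; i < num; i++) {
 *             unsigned __int128 acc = 0;        // t += a * b[i]
 *             for (size_t j = 0; j < num; j++) {
 *                 acc += (unsigned __int128)a[j] * b[i] + t[j];
 *                 t[j] = (uint64_t)acc;
 *                 acc >>= 64;
 *             }
 *             acc += t[num];
 *             t[num]     = (uint64_t)acc;
 *             t[num + 1] = (uint64_t)(acc >> 64);
 *
 *             uint64_t m = t[0] * n0;           // t += m*n, then drop a limb
 *             acc = (unsigned __int128)m * n[0] + t[0];
 *             acc >>= 64;                       // low limb is 0 by choice of m
 *             for (size_t j = 1; j < num; j++) {
 *                 acc += (unsigned __int128)m * n[j] + t[j];
 *                 t[j - 1] = (uint64_t)acc;
 *                 acc >>= 64;
 *             }
 *             acc += t[num];
 *             t[num - 1] = (uint64_t)acc;
 *             t[num]     = t[num + 1] + (uint64_t)(acc >> 64);
 *         }
 *         // t < 2n here: copy to r after one conditional subtraction of n
 *     }
 */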
 2588 .type   bn_powerx5,@function
 2589 .align  32
 2590 bn_powerx5:
 2591 .cfi_startproc  
 2592         movq    %rsp,%rax
 2593 .cfi_def_cfa_register   %rax
 2594 .Lpowerx5_enter:
 2595         pushq   %rbx
 2596 .cfi_offset     %rbx,-16
 2597         pushq   %rbp
 2598 .cfi_offset     %rbp,-24
 2599         pushq   %r12
 2600 .cfi_offset     %r12,-32
 2601         pushq   %r13
 2602 .cfi_offset     %r13,-40
 2603         pushq   %r14
 2604 .cfi_offset     %r14,-48
 2605         pushq   %r15
 2606 .cfi_offset     %r15,-56
 2607 .Lpowerx5_prologue:
 2608 
 2609         shll    $3,%r9d
 2610         leaq    (%r9,%r9,2),%r10
 2611         negq    %r9
 2612         movq    (%r8),%r8
 2613 
 2614 
 2615 
 2616 
 2617 
 2618 
 2619 
 2620 
 2621         leaq    -320(%rsp,%r9,2),%r11
 2622         movq    %rsp,%rbp
 2623         subq    %rdi,%r11
 2624         andq    $4095,%r11
 2625         cmpq    %r11,%r10
 2626         jb      .Lpwrx_sp_alt
 2627         subq    %r11,%rbp
 2628         leaq    -320(%rbp,%r9,2),%rbp
 2629         jmp     .Lpwrx_sp_done
 2630 
 2631 .align  32
 2632 .Lpwrx_sp_alt:
 2633         leaq    4096-320(,%r9,2),%r10
 2634         leaq    -320(%rbp,%r9,2),%rbp
 2635         subq    %r10,%r11
 2636         movq    $0,%r10
 2637         cmovcq  %r10,%r11
 2638         subq    %r11,%rbp
 2639 .Lpwrx_sp_done:
 2640         andq    $-64,%rbp
 2641         movq    %rsp,%r11
 2642         subq    %rbp,%r11
 2643         andq    $-4096,%r11
 2644         leaq    (%r11,%rbp,1),%rsp
 2645         movq    (%rsp),%r10
 2646         cmpq    %rbp,%rsp
 2647         ja      .Lpwrx_page_walk
 2648         jmp     .Lpwrx_page_walk_done
 2649 
 2650 .Lpwrx_page_walk:
 2651         leaq    -4096(%rsp),%rsp
 2652         movq    (%rsp),%r10
 2653         cmpq    %rbp,%rsp
 2654         ja      .Lpwrx_page_walk
 2655 .Lpwrx_page_walk_done:
 2656 
 2657         movq    %r9,%r10
 2658         negq    %r9
 2659 
 2660 
 2661 
 2662 
 2663 
 2664 
 2665 
 2666 
 2667 
 2668 
 2669 
 2670 
 2671         pxor    %xmm0,%xmm0
 2672 .byte   102,72,15,110,207
 2673 .byte   102,72,15,110,209
 2674 .byte   102,73,15,110,218
 2675 .byte   102,72,15,110,226
 2676         movq    %r8,32(%rsp)
 2677         movq    %rax,40(%rsp)
 2678 .cfi_escape     0x0f,0x05,0x77,0x28,0x06,0x23,0x08
 2679 .Lpowerx5_body:
 2680 
 2681         call    __bn_sqrx8x_internal
 2682         call    __bn_postx4x_internal
 2683         call    __bn_sqrx8x_internal
 2684         call    __bn_postx4x_internal
 2685         call    __bn_sqrx8x_internal
 2686         call    __bn_postx4x_internal
 2687         call    __bn_sqrx8x_internal
 2688         call    __bn_postx4x_internal
 2689         call    __bn_sqrx8x_internal
 2690         call    __bn_postx4x_internal
 2691 
 2692         movq    %r10,%r9
 2693         movq    %rsi,%rdi
 2694 .byte   102,72,15,126,209
 2695 .byte   102,72,15,126,226
 2696         movq    40(%rsp),%rax
 2697 
 2698         call    mulx4x_internal
 2699 
 2700         movq    40(%rsp),%rsi
 2701 .cfi_def_cfa    %rsi,8
 2702         movq    $1,%rax
 2703 
 2704         movq    -48(%rsi),%r15
 2705 .cfi_restore    %r15
 2706         movq    -40(%rsi),%r14
 2707 .cfi_restore    %r14
 2708         movq    -32(%rsi),%r13
 2709 .cfi_restore    %r13
 2710         movq    -24(%rsi),%r12
 2711 .cfi_restore    %r12
 2712         movq    -16(%rsi),%rbp
 2713 .cfi_restore    %rbp
 2714         movq    -8(%rsi),%rbx
 2715 .cfi_restore    %rbx
 2716         leaq    (%rsi),%rsp
 2717 .cfi_def_cfa_register   %rsp
 2718 .Lpowerx5_epilogue:
 2719         .byte   0xf3,0xc3
 2720 .cfi_endproc    
 2721 .size   bn_powerx5,.-bn_powerx5
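/*
 * Editor's note: bn_powerx5 chains five Montgomery squarings
 * (__bn_sqrx8x_internal + __bn_postx4x_internal) and one Montgomery
 * multiply by a gathered table entry -- one step of a 5-bit fixed-window
 * exponentiation.  A hedged schematic of the surrounding window loop,
 * every helper name hypothetical:
 *
 *     // process the exponent five bits at a time, MSB first
 *     for (int bits = top - 5; bits >= 0; bits -= 5) {
 *         for (int k = 0; k < 5; k++)
 *             mont_sqr(r, r, n, n0, num);       // r = r^2 (Montgomery form)
 *         unsigned w = get_bits5(exp, bits);    // next 5 exponent bits
 *         gather5_ct(tmp, num, table, w);       // constant-time table read
 *         mont_mul(r, r, tmp, n, n0, num);      // r = r * g^w
 *     }
 */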
 2722 
 2723 .globl  bn_sqrx8x_internal
 2724 .hidden bn_sqrx8x_internal
 2725 .type   bn_sqrx8x_internal,@function
 2726 .align  32
 2727 bn_sqrx8x_internal:
 2728 __bn_sqrx8x_internal:
 2729 .cfi_startproc  
 2730 
 2731 
 2732 
 2733 
 2734 
 2735 
 2736 
 2737 
 2738 
 2739 
 2740 
 2741 
 2742 
 2743 
 2744 
 2745 
 2746 
 2747 
 2748 
 2749 
 2750 
 2751 
 2752 
 2753 
 2754 
 2755 
 2756 
 2757 
 2758 
 2759 
 2760 
 2761 
 2762 
 2763 
 2764 
 2765 
 2766 
 2767 
 2768 
 2769 
 2770         leaq    48+8(%rsp),%rdi
 2771         leaq    (%rsi,%r9,1),%rbp
 2772         movq    %r9,0+8(%rsp)
 2773         movq    %rbp,8+8(%rsp)
 2774         jmp     .Lsqr8x_zero_start
 2775 
 2776 .align  32
 2777 .byte   0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00
 2778 .Lsqrx8x_zero:
 2779 .byte   0x3e
 2780         movdqa  %xmm0,0(%rdi)
 2781         movdqa  %xmm0,16(%rdi)
 2782         movdqa  %xmm0,32(%rdi)
 2783         movdqa  %xmm0,48(%rdi)
 2784 .Lsqr8x_zero_start:
 2785         movdqa  %xmm0,64(%rdi)
 2786         movdqa  %xmm0,80(%rdi)
 2787         movdqa  %xmm0,96(%rdi)
 2788         movdqa  %xmm0,112(%rdi)
 2789         leaq    128(%rdi),%rdi
 2790         subq    $64,%r9
 2791         jnz     .Lsqrx8x_zero
 2792 
 2793         movq    0(%rsi),%rdx
 2794 
 2795         xorq    %r10,%r10
 2796         xorq    %r11,%r11
 2797         xorq    %r12,%r12
 2798         xorq    %r13,%r13
 2799         xorq    %r14,%r14
 2800         xorq    %r15,%r15
 2801         leaq    48+8(%rsp),%rdi
 2802         xorq    %rbp,%rbp
 2803         jmp     .Lsqrx8x_outer_loop
 2804 
 2805 .align  32
 2806 .Lsqrx8x_outer_loop:
 2807         mulxq   8(%rsi),%r8,%rax
 2808         adcxq   %r9,%r8
 2809         adoxq   %rax,%r10
 2810         mulxq   16(%rsi),%r9,%rax
 2811         adcxq   %r10,%r9
 2812         adoxq   %rax,%r11
 2813 .byte   0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00
 2814         adcxq   %r11,%r10
 2815         adoxq   %rax,%r12
 2816 .byte   0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00
 2817         adcxq   %r12,%r11
 2818         adoxq   %rax,%r13
 2819         mulxq   40(%rsi),%r12,%rax
 2820         adcxq   %r13,%r12
 2821         adoxq   %rax,%r14
 2822         mulxq   48(%rsi),%r13,%rax
 2823         adcxq   %r14,%r13
 2824         adoxq   %r15,%rax
 2825         mulxq   56(%rsi),%r14,%r15
 2826         movq    8(%rsi),%rdx
 2827         adcxq   %rax,%r14
 2828         adoxq   %rbp,%r15
 2829         adcq    64(%rdi),%r15
 2830         movq    %r8,8(%rdi)
 2831         movq    %r9,16(%rdi)
 2832         sbbq    %rcx,%rcx
 2833         xorq    %rbp,%rbp
 2834 
 2835 
 2836         mulxq   16(%rsi),%r8,%rbx
 2837         mulxq   24(%rsi),%r9,%rax
 2838         adcxq   %r10,%r8
 2839         adoxq   %rbx,%r9
 2840         mulxq   32(%rsi),%r10,%rbx
 2841         adcxq   %r11,%r9
 2842         adoxq   %rax,%r10
 2843 .byte   0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00
 2844         adcxq   %r12,%r10
 2845         adoxq   %rbx,%r11
 2846 .byte   0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00
 2847         adcxq   %r13,%r11
 2848         adoxq   %r14,%r12
 2849 .byte   0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00
 2850         movq    16(%rsi),%rdx
 2851         adcxq   %rax,%r12
 2852         adoxq   %rbx,%r13
 2853         adcxq   %r15,%r13
 2854         adoxq   %rbp,%r14
 2855         adcxq   %rbp,%r14
 2856 
 2857         movq    %r8,24(%rdi)
 2858         movq    %r9,32(%rdi)
 2859 
 2860         mulxq   24(%rsi),%r8,%rbx
 2861         mulxq   32(%rsi),%r9,%rax
 2862         adcxq   %r10,%r8
 2863         adoxq   %rbx,%r9
 2864         mulxq   40(%rsi),%r10,%rbx
 2865         adcxq   %r11,%r9
 2866         adoxq   %rax,%r10
 2867 .byte   0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00
 2868         adcxq   %r12,%r10
 2869         adoxq   %r13,%r11
 2870 .byte   0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00
 2871 .byte   0x3e
 2872         movq    24(%rsi),%rdx
 2873         adcxq   %rbx,%r11
 2874         adoxq   %rax,%r12
 2875         adcxq   %r14,%r12
 2876         movq    %r8,40(%rdi)
 2877         movq    %r9,48(%rdi)
 2878         mulxq   32(%rsi),%r8,%rax
 2879         adoxq   %rbp,%r13
 2880         adcxq   %rbp,%r13
 2881 
 2882         mulxq   40(%rsi),%r9,%rbx
 2883         adcxq   %r10,%r8
 2884         adoxq   %rax,%r9
 2885         mulxq   48(%rsi),%r10,%rax
 2886         adcxq   %r11,%r9
 2887         adoxq   %r12,%r10
 2888         mulxq   56(%rsi),%r11,%r12
 2889         movq    32(%rsi),%rdx
 2890         movq    40(%rsi),%r14
 2891         adcxq   %rbx,%r10
 2892         adoxq   %rax,%r11
 2893         movq    48(%rsi),%r15
 2894         adcxq   %r13,%r11
 2895         adoxq   %rbp,%r12
 2896         adcxq   %rbp,%r12
 2897 
 2898         movq    %r8,56(%rdi)
 2899         movq    %r9,64(%rdi)
 2900 
 2901         mulxq   %r14,%r9,%rax
 2902         movq    56(%rsi),%r8
 2903         adcxq   %r10,%r9
 2904         mulxq   %r15,%r10,%rbx
 2905         adoxq   %rax,%r10
 2906         adcxq   %r11,%r10
 2907         mulxq   %r8,%r11,%rax
 2908         movq    %r14,%rdx
 2909         adoxq   %rbx,%r11
 2910         adcxq   %r12,%r11
 2911 
 2912         adcxq   %rbp,%rax
 2913 
 2914         mulxq   %r15,%r14,%rbx
 2915         mulxq   %r8,%r12,%r13
 2916         movq    %r15,%rdx
 2917         leaq    64(%rsi),%rsi
 2918         adcxq   %r14,%r11
 2919         adoxq   %rbx,%r12
 2920         adcxq   %rax,%r12
 2921         adoxq   %rbp,%r13
 2922 
 2923 .byte   0x67,0x67
 2924         mulxq   %r8,%r8,%r14
 2925         adcxq   %r8,%r13
 2926         adcxq   %rbp,%r14
 2927 
 2928         cmpq    8+8(%rsp),%rsi
 2929         je      .Lsqrx8x_outer_break
 2930 
 2931         negq    %rcx
 2932         movq    $-8,%rcx
 2933         movq    %rbp,%r15
 2934         movq    64(%rdi),%r8
 2935         adcxq   72(%rdi),%r9
 2936         adcxq   80(%rdi),%r10
 2937         adcxq   88(%rdi),%r11
 2938         adcq    96(%rdi),%r12
 2939         adcq    104(%rdi),%r13
 2940         adcq    112(%rdi),%r14
 2941         adcq    120(%rdi),%r15
 2942         leaq    (%rsi),%rbp
 2943         leaq    128(%rdi),%rdi
 2944         sbbq    %rax,%rax
 2945 
 2946         movq    -64(%rsi),%rdx
 2947         movq    %rax,16+8(%rsp)
 2948         movq    %rdi,24+8(%rsp)
 2949 
 2950 
 2951         xorl    %eax,%eax
 2952         jmp     .Lsqrx8x_loop
 2953 
 2954 .align  32
 2955 .Lsqrx8x_loop:
 2956         movq    %r8,%rbx
 2957         mulxq   0(%rbp),%rax,%r8
 2958         adcxq   %rax,%rbx
 2959         adoxq   %r9,%r8
 2960 
 2961         mulxq   8(%rbp),%rax,%r9
 2962         adcxq   %rax,%r8
 2963         adoxq   %r10,%r9
 2964 
 2965         mulxq   16(%rbp),%rax,%r10
 2966         adcxq   %rax,%r9
 2967         adoxq   %r11,%r10
 2968 
 2969         mulxq   24(%rbp),%rax,%r11
 2970         adcxq   %rax,%r10
 2971         adoxq   %r12,%r11
 2972 
 2973 .byte   0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00
 2974         adcxq   %rax,%r11
 2975         adoxq   %r13,%r12
 2976 
 2977         mulxq   40(%rbp),%rax,%r13
 2978         adcxq   %rax,%r12
 2979         adoxq   %r14,%r13
 2980 
 2981         mulxq   48(%rbp),%rax,%r14
 2982         movq    %rbx,(%rdi,%rcx,8)
 2983         movl    $0,%ebx
 2984         adcxq   %rax,%r13
 2985         adoxq   %r15,%r14
 2986 
 2987 .byte   0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00
 2988         movq    8(%rsi,%rcx,8),%rdx
 2989         adcxq   %rax,%r14
 2990         adoxq   %rbx,%r15
 2991         adcxq   %rbx,%r15
 2992 
 2993 .byte   0x67
 2994         incq    %rcx
 2995         jnz     .Lsqrx8x_loop
 2996 
 2997         leaq    64(%rbp),%rbp
 2998         movq    $-8,%rcx
 2999         cmpq    8+8(%rsp),%rbp
 3000         je      .Lsqrx8x_break
 3001 
 3002         subq    16+8(%rsp),%rbx
 3003 .byte   0x66
 3004         movq    -64(%rsi),%rdx
 3005         adcxq   0(%rdi),%r8
 3006         adcxq   8(%rdi),%r9
 3007         adcq    16(%rdi),%r10
 3008         adcq    24(%rdi),%r11
 3009         adcq    32(%rdi),%r12
 3010         adcq    40(%rdi),%r13
 3011         adcq    48(%rdi),%r14
 3012         adcq    56(%rdi),%r15
 3013         leaq    64(%rdi),%rdi
 3014 .byte   0x67
 3015         sbbq    %rax,%rax
 3016         xorl    %ebx,%ebx
 3017         movq    %rax,16+8(%rsp)
 3018         jmp     .Lsqrx8x_loop
 3019 
 3020 .align  32
 3021 .Lsqrx8x_break:
 3022         xorq    %rbp,%rbp
 3023         subq    16+8(%rsp),%rbx
 3024         adcxq   %rbp,%r8
 3025         movq    24+8(%rsp),%rcx
 3026         adcxq   %rbp,%r9
 3027         movq    0(%rsi),%rdx
 3028         adcq    $0,%r10
 3029         movq    %r8,0(%rdi)
 3030         adcq    $0,%r11
 3031         adcq    $0,%r12
 3032         adcq    $0,%r13
 3033         adcq    $0,%r14
 3034         adcq    $0,%r15
 3035         cmpq    %rcx,%rdi
 3036         je      .Lsqrx8x_outer_loop
 3037 
 3038         movq    %r9,8(%rdi)
 3039         movq    8(%rcx),%r9
 3040         movq    %r10,16(%rdi)
 3041         movq    16(%rcx),%r10
 3042         movq    %r11,24(%rdi)
 3043         movq    24(%rcx),%r11
 3044         movq    %r12,32(%rdi)
 3045         movq    32(%rcx),%r12
 3046         movq    %r13,40(%rdi)
 3047         movq    40(%rcx),%r13
 3048         movq    %r14,48(%rdi)
 3049         movq    48(%rcx),%r14
 3050         movq    %r15,56(%rdi)
 3051         movq    56(%rcx),%r15
 3052         movq    %rcx,%rdi
 3053         jmp     .Lsqrx8x_outer_loop
 3054 
 3055 .align  32
 3056 .Lsqrx8x_outer_break:
 3057         movq    %r9,72(%rdi)
 3058 .byte   102,72,15,126,217
 3059         movq    %r10,80(%rdi)
 3060         movq    %r11,88(%rdi)
 3061         movq    %r12,96(%rdi)
 3062         movq    %r13,104(%rdi)
 3063         movq    %r14,112(%rdi)
 3064         leaq    48+8(%rsp),%rdi
 3065         movq    (%rsi,%rcx,1),%rdx
 3066 
 3067         movq    8(%rdi),%r11
 3068         xorq    %r10,%r10
 3069         movq    0+8(%rsp),%r9
 3070         adoxq   %r11,%r11
 3071         movq    16(%rdi),%r12
 3072         movq    24(%rdi),%r13
 3073 
 3074 
 3075 .align  32
 3076 .Lsqrx4x_shift_n_add:
 3077         mulxq   %rdx,%rax,%rbx
 3078         adoxq   %r12,%r12
 3079         adcxq   %r10,%rax
 3080 .byte   0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00
 3081 .byte   0x4c,0x8b,0x97,0x20,0x00,0x00,0x00
 3082         adoxq   %r13,%r13
 3083         adcxq   %r11,%rbx
 3084         movq    40(%rdi),%r11
 3085         movq    %rax,0(%rdi)
 3086         movq    %rbx,8(%rdi)
 3087 
 3088         mulxq   %rdx,%rax,%rbx
 3089         adoxq   %r10,%r10
 3090         adcxq   %r12,%rax
 3091         movq    16(%rsi,%rcx,1),%rdx
 3092         movq    48(%rdi),%r12
 3093         adoxq   %r11,%r11
 3094         adcxq   %r13,%rbx
 3095         movq    56(%rdi),%r13
 3096         movq    %rax,16(%rdi)
 3097         movq    %rbx,24(%rdi)
 3098 
 3099         mulxq   %rdx,%rax,%rbx
 3100         adoxq   %r12,%r12
 3101         adcxq   %r10,%rax
 3102         movq    24(%rsi,%rcx,1),%rdx
 3103         leaq    32(%rcx),%rcx
 3104         movq    64(%rdi),%r10
 3105         adoxq   %r13,%r13
 3106         adcxq   %r11,%rbx
 3107         movq    72(%rdi),%r11
 3108         movq    %rax,32(%rdi)
 3109         movq    %rbx,40(%rdi)
 3110 
 3111         mulxq   %rdx,%rax,%rbx
 3112         adoxq   %r10,%r10
 3113         adcxq   %r12,%rax
 3114         jrcxz   .Lsqrx4x_shift_n_add_break
 3115 .byte   0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00
 3116         adoxq   %r11,%r11
 3117         adcxq   %r13,%rbx
 3118         movq    80(%rdi),%r12
 3119         movq    88(%rdi),%r13
 3120         movq    %rax,48(%rdi)
 3121         movq    %rbx,56(%rdi)
 3122         leaq    64(%rdi),%rdi
 3123         nop
 3124         jmp     .Lsqrx4x_shift_n_add
 3125 
 3126 .align  32
 3127 .Lsqrx4x_shift_n_add_break:
 3128         adcxq   %r13,%rbx
 3129         movq    %rax,48(%rdi)
 3130         movq    %rbx,56(%rdi)
 3131         leaq    64(%rdi),%rdi
 3132 .byte   102,72,15,126,213
 3133 __bn_sqrx8x_reduction:
 3134         xorl    %eax,%eax
 3135         movq    32+8(%rsp),%rbx
 3136         movq    48+8(%rsp),%rdx
 3137         leaq    -64(%rbp,%r9,1),%rcx
 3138 
 3139         movq    %rcx,0+8(%rsp)
 3140         movq    %rdi,8+8(%rsp)
 3141 
 3142         leaq    48+8(%rsp),%rdi
 3143         jmp     .Lsqrx8x_reduction_loop
 3144 
 3145 .align  32
 3146 .Lsqrx8x_reduction_loop:
 3147         movq    8(%rdi),%r9
 3148         movq    16(%rdi),%r10
 3149         movq    24(%rdi),%r11
 3150         movq    32(%rdi),%r12
 3151         movq    %rdx,%r8
 3152         imulq   %rbx,%rdx
 3153         movq    40(%rdi),%r13
 3154         movq    48(%rdi),%r14
 3155         movq    56(%rdi),%r15
 3156         movq    %rax,24+8(%rsp)
 3157 
 3158         leaq    64(%rdi),%rdi
 3159         xorq    %rsi,%rsi
 3160         movq    $-8,%rcx
 3161         jmp     .Lsqrx8x_reduce
 3162 
 3163 .align  32
 3164 .Lsqrx8x_reduce:
 3165         movq    %r8,%rbx
 3166         mulxq   0(%rbp),%rax,%r8
 3167         adcxq   %rbx,%rax
 3168         adoxq   %r9,%r8
 3169 
 3170         mulxq   8(%rbp),%rbx,%r9
 3171         adcxq   %rbx,%r8
 3172         adoxq   %r10,%r9
 3173 
 3174         mulxq   16(%rbp),%rbx,%r10
 3175         adcxq   %rbx,%r9
 3176         adoxq   %r11,%r10
 3177 
 3178         mulxq   24(%rbp),%rbx,%r11
 3179         adcxq   %rbx,%r10
 3180         adoxq   %r12,%r11
 3181 
 3182 .byte   0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00
 3183         movq    %rdx,%rax
 3184         movq    %r8,%rdx
 3185         adcxq   %rbx,%r11
 3186         adoxq   %r13,%r12
 3187 
 3188         mulxq   32+8(%rsp),%rbx,%rdx
 3189         movq    %rax,%rdx
 3190         movq    %rax,64+48+8(%rsp,%rcx,8)
 3191 
 3192         mulxq   40(%rbp),%rax,%r13
 3193         adcxq   %rax,%r12
 3194         adoxq   %r14,%r13
 3195 
 3196         mulxq   48(%rbp),%rax,%r14
 3197         adcxq   %rax,%r13
 3198         adoxq   %r15,%r14
 3199 
 3200         mulxq   56(%rbp),%rax,%r15
 3201         movq    %rbx,%rdx
 3202         adcxq   %rax,%r14
 3203         adoxq   %rsi,%r15
 3204         adcxq   %rsi,%r15
 3205 
 3206 .byte   0x67,0x67,0x67
 3207         incq    %rcx
 3208         jnz     .Lsqrx8x_reduce
 3209 
 3210         movq    %rsi,%rax
 3211         cmpq    0+8(%rsp),%rbp
 3212         jae     .Lsqrx8x_no_tail
 3213 
 3214         movq    48+8(%rsp),%rdx
 3215         addq    0(%rdi),%r8
 3216         leaq    64(%rbp),%rbp
 3217         movq    $-8,%rcx
 3218         adcxq   8(%rdi),%r9
 3219         adcxq   16(%rdi),%r10
 3220         adcq    24(%rdi),%r11
 3221         adcq    32(%rdi),%r12
 3222         adcq    40(%rdi),%r13
 3223         adcq    48(%rdi),%r14
 3224         adcq    56(%rdi),%r15
 3225         leaq    64(%rdi),%rdi
 3226         sbbq    %rax,%rax
 3227 
 3228         xorq    %rsi,%rsi
 3229         movq    %rax,16+8(%rsp)
 3230         jmp     .Lsqrx8x_tail
 3231 
 3232 .align  32
 3233 .Lsqrx8x_tail:
 3234         movq    %r8,%rbx
 3235         mulxq   0(%rbp),%rax,%r8
 3236         adcxq   %rax,%rbx
 3237         adoxq   %r9,%r8
 3238 
 3239         mulxq   8(%rbp),%rax,%r9
 3240         adcxq   %rax,%r8
 3241         adoxq   %r10,%r9
 3242 
 3243         mulxq   16(%rbp),%rax,%r10
 3244         adcxq   %rax,%r9
 3245         adoxq   %r11,%r10
 3246 
 3247         mulxq   24(%rbp),%rax,%r11
 3248         adcxq   %rax,%r10
 3249         adoxq   %r12,%r11
 3250 
 3251 .byte   0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00
 3252         adcxq   %rax,%r11
 3253         adoxq   %r13,%r12
 3254 
 3255         mulxq   40(%rbp),%rax,%r13
 3256         adcxq   %rax,%r12
 3257         adoxq   %r14,%r13
 3258 
 3259         mulxq   48(%rbp),%rax,%r14
 3260         adcxq   %rax,%r13
 3261         adoxq   %r15,%r14
 3262 
 3263         mulxq   56(%rbp),%rax,%r15
 3264         movq    72+48+8(%rsp,%rcx,8),%rdx
 3265         adcxq   %rax,%r14
 3266         adoxq   %rsi,%r15
 3267         movq    %rbx,(%rdi,%rcx,8)
 3268         movq    %r8,%rbx
 3269         adcxq   %rsi,%r15
 3270 
 3271         incq    %rcx
 3272         jnz     .Lsqrx8x_tail
 3273 
 3274         cmpq    0+8(%rsp),%rbp
 3275         jae     .Lsqrx8x_tail_done
 3276 
 3277         subq    16+8(%rsp),%rsi
 3278         movq    48+8(%rsp),%rdx
 3279         leaq    64(%rbp),%rbp
 3280         adcq    0(%rdi),%r8
 3281         adcq    8(%rdi),%r9
 3282         adcq    16(%rdi),%r10
 3283         adcq    24(%rdi),%r11
 3284         adcq    32(%rdi),%r12
 3285         adcq    40(%rdi),%r13
 3286         adcq    48(%rdi),%r14
 3287         adcq    56(%rdi),%r15
 3288         leaq    64(%rdi),%rdi
 3289         sbbq    %rax,%rax
 3290         subq    $8,%rcx
 3291 
 3292         xorq    %rsi,%rsi
 3293         movq    %rax,16+8(%rsp)
 3294         jmp     .Lsqrx8x_tail
 3295 
 3296 .align  32
 3297 .Lsqrx8x_tail_done:
 3298         xorq    %rax,%rax
 3299         addq    24+8(%rsp),%r8
 3300         adcq    $0,%r9
 3301         adcq    $0,%r10
 3302         adcq    $0,%r11
 3303         adcq    $0,%r12
 3304         adcq    $0,%r13
 3305         adcq    $0,%r14
 3306         adcq    $0,%r15
 3307         adcq    $0,%rax
 3308 
 3309         subq    16+8(%rsp),%rsi
 3310 .Lsqrx8x_no_tail:
 3311         adcq    0(%rdi),%r8
 3312 .byte   102,72,15,126,217
 3313         adcq    8(%rdi),%r9
 3314         movq    56(%rbp),%rsi
 3315 .byte   102,72,15,126,213
 3316         adcq    16(%rdi),%r10
 3317         adcq    24(%rdi),%r11
 3318         adcq    32(%rdi),%r12
 3319         adcq    40(%rdi),%r13
 3320         adcq    48(%rdi),%r14
 3321         adcq    56(%rdi),%r15
 3322         adcq    $0,%rax
 3323 
 3324         movq    32+8(%rsp),%rbx
 3325         movq    64(%rdi,%rcx,1),%rdx
 3326 
 3327         movq    %r8,0(%rdi)
 3328         leaq    64(%rdi),%r8
 3329         movq    %r9,8(%rdi)
 3330         movq    %r10,16(%rdi)
 3331         movq    %r11,24(%rdi)
 3332         movq    %r12,32(%rdi)
 3333         movq    %r13,40(%rdi)
 3334         movq    %r14,48(%rdi)
 3335         movq    %r15,56(%rdi)
 3336 
 3337         leaq    64(%rdi,%rcx,1),%rdi
 3338         cmpq    8+8(%rsp),%r8
 3339         jb      .Lsqrx8x_reduction_loop
 3340         .byte   0xf3,0xc3
 3341 .cfi_endproc    
 3342 .size   bn_sqrx8x_internal,.-bn_sqrx8x_internal
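/*
 * Editor's note: the squaring above computes each cross product a[i]*a[j]
 * (i < j) once, doubles the partial result with a shift
 * (.Lsqrx4x_shift_n_add), then adds the diagonal squares a[i]^2 before
 * falling through into __bn_sqrx8x_reduction -- roughly halving the
 * multiply count versus a general multiply.  A hedged C reference of the
 * technique (hypothetical name, r has 2*num limbs):
 *
 *     static void sqr_ref(uint64_t *r, const uint64_t *a, size_t num)
 *     {
 *         memset(r, 0, 2 * num * sizeof *r);
 *         for (size_t i = 0; i < num; i++) {    // off-diagonal products
 *             unsigned __int128 acc = 0;
 *             for (size_t j = i + 1; j < num; j++) {
 *                 acc += (unsigned __int128)a[i] * a[j] + r[i + j];
 *                 r[i + j] = (uint64_t)acc;
 *                 acc >>= 64;
 *             }
 *             r[i + num] = (uint64_t)acc;
 *         }
 *         uint64_t c = 0;                       // r = 2*r
 *         for (size_t k = 0; k < 2 * num; k++) {
 *             uint64_t top = r[k] >> 63;
 *             r[k] = (r[k] << 1) | c;
 *             c = top;
 *         }
 *         unsigned __int128 acc = 0;            // add the diagonal a[i]^2
 *         for (size_t i = 0; i < num; i++) {
 *             acc += (unsigned __int128)a[i] * a[i] + r[2 * i];
 *             r[2 * i] = (uint64_t)acc;
 *             acc >>= 64;
 *             acc += r[2 * i + 1];
 *             r[2 * i + 1] = (uint64_t)acc;
 *             acc >>= 64;
 *         }
 *     }
 */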
 3343 .align  32
 3344 __bn_postx4x_internal:
 3345 .cfi_startproc  
 3346         movq    0(%rbp),%r12
 3347         movq    %rcx,%r10
 3348         movq    %rcx,%r9
 3349         negq    %rax
 3350         sarq    $3+2,%rcx
 3351 
 3352 .byte   102,72,15,126,202
 3353 .byte   102,72,15,126,206
 3354         decq    %r12
 3355         movq    8(%rbp),%r13
 3356         xorq    %r8,%r8
 3357         movq    16(%rbp),%r14
 3358         movq    24(%rbp),%r15
 3359         jmp     .Lsqrx4x_sub_entry
 3360 
 3361 .align  16
 3362 .Lsqrx4x_sub:
 3363         movq    0(%rbp),%r12
 3364         movq    8(%rbp),%r13
 3365         movq    16(%rbp),%r14
 3366         movq    24(%rbp),%r15
 3367 .Lsqrx4x_sub_entry:
 3368         andnq   %rax,%r12,%r12
 3369         leaq    32(%rbp),%rbp
 3370         andnq   %rax,%r13,%r13
 3371         andnq   %rax,%r14,%r14
 3372         andnq   %rax,%r15,%r15
 3373 
 3374         negq    %r8
 3375         adcq    0(%rdi),%r12
 3376         adcq    8(%rdi),%r13
 3377         adcq    16(%rdi),%r14
 3378         adcq    24(%rdi),%r15
 3379         movq    %r12,0(%rdx)
 3380         leaq    32(%rdi),%rdi
 3381         movq    %r13,8(%rdx)
 3382         sbbq    %r8,%r8
 3383         movq    %r14,16(%rdx)
 3384         movq    %r15,24(%rdx)
 3385         leaq    32(%rdx),%rdx
 3386 
 3387         incq    %rcx
 3388         jnz     .Lsqrx4x_sub
 3389 
 3390         negq    %r9
 3391 
 3392         .byte   0xf3,0xc3
 3393 .cfi_endproc    
 3394 .size   __bn_postx4x_internal,.-__bn_postx4x_internal
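/*
 * Editor's note: __bn_postx4x_internal is the same constant-time
 * conditional subtraction as __bn_post4x_internal, but on the BMI path a
 * single ANDN (dst = ~src1 & src2) replaces each not/and pair; e.g.
 * "andnq %rax,%r12,%r12" computes r12 = ~r12 & rax in one instruction.
 */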
 3395 .globl  bn_get_bits5
 3396 .type   bn_get_bits5,@function
 3397 .align  16
 3398 bn_get_bits5:
 3399 .cfi_startproc  
 3400         leaq    0(%rdi),%r10
 3401         leaq    1(%rdi),%r11
 3402         movl    %esi,%ecx
 3403         shrl    $4,%esi
 3404         andl    $15,%ecx
 3405         leal    -8(%rcx),%eax
 3406         cmpl    $11,%ecx
 3407         cmovaq  %r11,%r10
 3408         cmoval  %eax,%ecx
 3409         movzwl  (%r10,%rsi,2),%eax
 3410         shrl    %cl,%eax
 3411         andl    $31,%eax
 3412         .byte   0xf3,0xc3
 3413 .cfi_endproc    
 3414 .size   bn_get_bits5,.-bn_get_bits5
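/*
 * Editor's note: bn_get_bits5 returns the 5-bit exponent window starting
 * at bit 'off'; the cmov pair shifts the 16-bit load by one byte whenever
 * the window would straddle a 16-bit boundary, keeping the lookup
 * branch-free.  A hedged C equivalent (hypothetical name, little-endian
 * byte view of the number):
 *
 *     static unsigned get_bits5(const unsigned char *a, unsigned off)
 *     {
 *         unsigned idx = off >> 3, sh = off & 7;
 *         // a 16-bit window always contains all 5 bits when sh <= 7
 *         unsigned v = a[idx] | ((unsigned)a[idx + 1] << 8);
 *         return (v >> sh) & 31;
 *     }
 */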
 3415 
 3416 .globl  bn_scatter5
 3417 .type   bn_scatter5,@function
 3418 .align  16
 3419 bn_scatter5:
 3420 .cfi_startproc  
 3421         cmpl    $0,%esi
 3422         jz      .Lscatter_epilogue
 3423         leaq    (%rdx,%rcx,8),%rdx
 3424 .Lscatter:
 3425         movq    (%rdi),%rax
 3426         leaq    8(%rdi),%rdi
 3427         movq    %rax,(%rdx)
 3428         leaq    256(%rdx),%rdx
 3429         subl    $1,%esi
 3430         jnz     .Lscatter
 3431 .Lscatter_epilogue:
 3432         .byte   0xf3,0xc3
 3433 .cfi_endproc    
 3434 .size   bn_scatter5,.-bn_scatter5
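/*
 * Editor's note: bn_scatter5 stores number 'idx' column-wise into a table
 * of 32 numbers: limb i goes to tbl[i*32 + idx] (hence the 256-byte stride
 * above), so a later gather must touch every column of each row.  A hedged
 * C equivalent with hypothetical names:
 *
 *     static void scatter5(const uint64_t *inp, size_t num,
 *                          uint64_t *tbl, size_t idx)
 *     {
 *         while (num--) {
 *             tbl[idx] = *inp++;    // limb i of number idx
 *             tbl += 32;            // next row: 32 qwords = 256 bytes
 *         }
 *     }
 */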
 3435 
 3436 .globl  bn_gather5
 3437 .type   bn_gather5,@function
 3438 .align  32
 3439 bn_gather5:
 3440 .LSEH_begin_bn_gather5:
 3441 .cfi_startproc  
 3442 
 3443 .byte   0x4c,0x8d,0x14,0x24
 3444 .byte   0x48,0x81,0xec,0x08,0x01,0x00,0x00
 3445         leaq    .Linc(%rip),%rax
 3446         andq    $-16,%rsp
 3447 
 3448         movd    %ecx,%xmm5
 3449         movdqa  0(%rax),%xmm0
 3450         movdqa  16(%rax),%xmm1
 3451         leaq    128(%rdx),%r11
 3452         leaq    128(%rsp),%rax
 3453 
 3454         pshufd  $0,%xmm5,%xmm5
 3455         movdqa  %xmm1,%xmm4
 3456         movdqa  %xmm1,%xmm2
 3457         paddd   %xmm0,%xmm1
 3458         pcmpeqd %xmm5,%xmm0
 3459         movdqa  %xmm4,%xmm3
 3460 
 3461         paddd   %xmm1,%xmm2
 3462         pcmpeqd %xmm5,%xmm1
 3463         movdqa  %xmm0,-128(%rax)
 3464         movdqa  %xmm4,%xmm0
 3465 
 3466         paddd   %xmm2,%xmm3
 3467         pcmpeqd %xmm5,%xmm2
 3468         movdqa  %xmm1,-112(%rax)
 3469         movdqa  %xmm4,%xmm1
 3470 
 3471         paddd   %xmm3,%xmm0
 3472         pcmpeqd %xmm5,%xmm3
 3473         movdqa  %xmm2,-96(%rax)
 3474         movdqa  %xmm4,%xmm2
 3475         paddd   %xmm0,%xmm1
 3476         pcmpeqd %xmm5,%xmm0
 3477         movdqa  %xmm3,-80(%rax)
 3478         movdqa  %xmm4,%xmm3
 3479 
 3480         paddd   %xmm1,%xmm2
 3481         pcmpeqd %xmm5,%xmm1
 3482         movdqa  %xmm0,-64(%rax)
 3483         movdqa  %xmm4,%xmm0
 3484 
 3485         paddd   %xmm2,%xmm3
 3486         pcmpeqd %xmm5,%xmm2
 3487         movdqa  %xmm1,-48(%rax)
 3488         movdqa  %xmm4,%xmm1
 3489 
 3490         paddd   %xmm3,%xmm0
 3491         pcmpeqd %xmm5,%xmm3
 3492         movdqa  %xmm2,-32(%rax)
 3493         movdqa  %xmm4,%xmm2
 3494         paddd   %xmm0,%xmm1
 3495         pcmpeqd %xmm5,%xmm0
 3496         movdqa  %xmm3,-16(%rax)
 3497         movdqa  %xmm4,%xmm3
 3498 
 3499         paddd   %xmm1,%xmm2
 3500         pcmpeqd %xmm5,%xmm1
 3501         movdqa  %xmm0,0(%rax)
 3502         movdqa  %xmm4,%xmm0
 3503 
 3504         paddd   %xmm2,%xmm3
 3505         pcmpeqd %xmm5,%xmm2
 3506         movdqa  %xmm1,16(%rax)
 3507         movdqa  %xmm4,%xmm1
 3508 
 3509         paddd   %xmm3,%xmm0
 3510         pcmpeqd %xmm5,%xmm3
 3511         movdqa  %xmm2,32(%rax)
 3512         movdqa  %xmm4,%xmm2
 3513         paddd   %xmm0,%xmm1
 3514         pcmpeqd %xmm5,%xmm0
 3515         movdqa  %xmm3,48(%rax)
 3516         movdqa  %xmm4,%xmm3
 3517 
 3518         paddd   %xmm1,%xmm2
 3519         pcmpeqd %xmm5,%xmm1
 3520         movdqa  %xmm0,64(%rax)
 3521         movdqa  %xmm4,%xmm0
 3522 
 3523         paddd   %xmm2,%xmm3
 3524         pcmpeqd %xmm5,%xmm2
 3525         movdqa  %xmm1,80(%rax)
 3526         movdqa  %xmm4,%xmm1
 3527 
 3528         paddd   %xmm3,%xmm0
 3529         pcmpeqd %xmm5,%xmm3
 3530         movdqa  %xmm2,96(%rax)
 3531         movdqa  %xmm4,%xmm2
 3532         movdqa  %xmm3,112(%rax)
 3533         jmp     .Lgather
 3534 
 3535 .align  32
 3536 .Lgather:
 3537         pxor    %xmm4,%xmm4
 3538         pxor    %xmm5,%xmm5
 3539         movdqa  -128(%r11),%xmm0
 3540         movdqa  -112(%r11),%xmm1
 3541         movdqa  -96(%r11),%xmm2
 3542         pand    -128(%rax),%xmm0
 3543         movdqa  -80(%r11),%xmm3
 3544         pand    -112(%rax),%xmm1
 3545         por     %xmm0,%xmm4
 3546         pand    -96(%rax),%xmm2
 3547         por     %xmm1,%xmm5
 3548         pand    -80(%rax),%xmm3
 3549         por     %xmm2,%xmm4
 3550         por     %xmm3,%xmm5
 3551         movdqa  -64(%r11),%xmm0
 3552         movdqa  -48(%r11),%xmm1
 3553         movdqa  -32(%r11),%xmm2
 3554         pand    -64(%rax),%xmm0
 3555         movdqa  -16(%r11),%xmm3
 3556         pand    -48(%rax),%xmm1
 3557         por     %xmm0,%xmm4
 3558         pand    -32(%rax),%xmm2
 3559         por     %xmm1,%xmm5
 3560         pand    -16(%rax),%xmm3
 3561         por     %xmm2,%xmm4
 3562         por     %xmm3,%xmm5
 3563         movdqa  0(%r11),%xmm0
 3564         movdqa  16(%r11),%xmm1
 3565         movdqa  32(%r11),%xmm2
 3566         pand    0(%rax),%xmm0
 3567         movdqa  48(%r11),%xmm3
 3568         pand    16(%rax),%xmm1
 3569         por     %xmm0,%xmm4
 3570         pand    32(%rax),%xmm2
 3571         por     %xmm1,%xmm5
 3572         pand    48(%rax),%xmm3
 3573         por     %xmm2,%xmm4
 3574         por     %xmm3,%xmm5
 3575         movdqa  64(%r11),%xmm0
 3576         movdqa  80(%r11),%xmm1
 3577         movdqa  96(%r11),%xmm2
 3578         pand    64(%rax),%xmm0
 3579         movdqa  112(%r11),%xmm3
 3580         pand    80(%rax),%xmm1
 3581         por     %xmm0,%xmm4
 3582         pand    96(%rax),%xmm2
 3583         por     %xmm1,%xmm5
 3584         pand    112(%rax),%xmm3
 3585         por     %xmm2,%xmm4
 3586         por     %xmm3,%xmm5
 3587         por     %xmm5,%xmm4
 3588         leaq    256(%r11),%r11
 3589         pshufd  $0x4e,%xmm4,%xmm0
 3590         por     %xmm4,%xmm0
 3591         movq    %xmm0,(%rdi)
 3592         leaq    8(%rdi),%rdi
 3593         subl    $1,%esi
 3594         jnz     .Lgather
 3595 
 3596         leaq    (%r10),%rsp
 3597         .byte   0xf3,0xc3
 3598 .LSEH_end_bn_gather5:
 3599 .cfi_endproc    
 3600 .size   bn_gather5,.-bn_gather5
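/*
 * Editor's note: bn_gather5 is the cache-timing-safe inverse of
 * bn_scatter5: it builds 16 SSE2 compare masks for the requested index and
 * ORs together masked loads from all 32 columns, so the memory-access
 * pattern is independent of 'idx'.  A hedged scalar C equivalent of the
 * selection (hypothetical name):
 *
 *     static void gather5_ct(uint64_t *out, size_t num,
 *                            const uint64_t *tbl, size_t idx)
 *     {
 *         for (size_t i = 0; i < num; i++) {
 *             uint64_t v = 0;
 *             for (uint64_t j = 0; j < 32; j++) {
 *                 uint64_t mask = 0 - (uint64_t)(j == idx); // ~0 iff j == idx
 *                 v |= tbl[i * 32 + j] & mask;              // read every column
 *             }
 *             out[i] = v;
 *         }
 *     }
 */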
 3601 .align  64
 3602 .Linc:
 3603 .long   0,0, 1,1
 3604 .long   2,2, 2,2
 3605 .byte   77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
