FreeBSD/Linux Kernel Cross Reference
sys/contrib/openzfs/module/icp/asm-x86_64/modes/aesni-gcm-x86_64.S


    1 # Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
    2 #
    3 # Licensed under the Apache License 2.0 (the "License").  You may not use
    4 # this file except in compliance with the License.  You can obtain a copy
    5 # in the file LICENSE in the source distribution or at
    6 # https://www.openssl.org/source/license.html
    7 
    8 #
    9 # ====================================================================
   10 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
   11 # project. The module is, however, dual licensed under OpenSSL and
   12 # CRYPTOGAMS licenses depending on where you obtain it. For further
   13 # details see http://www.openssl.org/~appro/cryptogams/.
   14 # ====================================================================
   15 #
   16 #
   17 # AES-NI-CTR+GHASH stitch.
   18 #
   19 # February 2013
   20 #
   21 # The OpenSSL GCM implementation is organized in such a way that its
   22 # performance is rather close to the sum of its streamed components,
   23 # in this context parallelized AES-NI CTR and modulo-scheduled
   24 # PCLMULQDQ-enabled GHASH. Unfortunately, as no stitch implementation
   25 # was observed to perform significantly better than the sum of the
   26 # components on contemporary CPUs, the effort was deemed impossible to
   27 # justify. This module is based on a combination of Intel submissions,
   28 # [1] and [2], with a MOVBE twist suggested by Ilya Albrekht and Max
   29 # Locktyukhin of Intel Corp., who verified that it reduces shuffle
   30 # pressure with a notable relative improvement, achieving 1.0 cycle per
   31 # byte processed with a 128-bit key on a Haswell processor, 0.74 on
   32 # Broadwell, and 0.63 on Skylake... [The mentioned results are raw
   33 # profiled measurements for a favourable packet size, one divisible by 96.
   34 # Applications using the EVP interface will observe a few percent
   35 # worse performance.]
   36 #
   37 # Knights Landing processes 1 byte in 1.25 cycles (measured with EVP).
   38 #
   39 # [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
   40 # [2] http://www.intel.com/content/dam/www/public/us/en/documents/software-support/enabling-high-performance-gcm.pdf
   41 
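      #
      # For orientation only: a rough C-level sketch (hedged; the names are
      # illustrative and not part of this file or of any build) of what one
      # iteration of the stitched loops below computes per 96-byte step:
      #
      #     for (i = 0; i < 6; i++) {
      #         /* AES rounds for six counter blocks run interleaved ... */
      #         ks[i] = aes_encrypt_block(key, counter_block++);
      #         /* ... with folding six ciphertext blocks into GHASH.  The
      #          * code uses precomputed powers of H to compute all six
      #          * products with a single reduction; the serial form is
      #          * mathematically equivalent: */
      #         Xi = gf128_mul(Xi ^ ct[i], H);
      #     }
      #     for (i = 0; i < 6; i++)
      #         out[i] = in[i] ^ ks[i];
      #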
   42 # Generated once from
   43 # https://github.com/openssl/openssl/blob/5ffc3324/crypto/modes/asm/aesni-gcm-x86_64.pl
   44 # and modified for ICP. Modifications are kept to a bare minimum to ease later
   45 # upstream merges.
   46 
   47 #if defined(__x86_64__) && defined(HAVE_AVX) && \
   48     defined(HAVE_AES) && defined(HAVE_PCLMULQDQ)
   49 
   50 #define _ASM
   51 #include <sys/asm_linkage.h>
   52 
   53 /* Windows userland links with OpenSSL */
   54 #if !defined (_WIN32) || defined (_KERNEL)
   55 
   56 .extern gcm_avx_can_use_movbe
   57 
   58 .text
   59 
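      /*
       * _aesni_ctr32_ghash_6x below and its _no_movbe_6x twin further down
       * are the stitched inner loops: each iteration encrypts six 32-bit
       * counter blocks and folds six ciphertext blocks into the GHASH
       * state.  The two bodies differ only in how the hashed data is
       * loaded big-endian (MOVBE versus MOVQ+BSWAPQ); the callers select
       * one at run time based on gcm_avx_can_use_movbe.
       */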
   60 #ifdef HAVE_MOVBE
   61 .balign 32
   62 FUNCTION(_aesni_ctr32_ghash_6x)
   63 .cfi_startproc
   64         ENDBR
   65         vmovdqu 32(%r11),%xmm2
   66         subq    $6,%rdx
   67         vpxor   %xmm4,%xmm4,%xmm4
   68         vmovdqu 0-128(%rcx),%xmm15
   69         vpaddb  %xmm2,%xmm1,%xmm10
   70         vpaddb  %xmm2,%xmm10,%xmm11
   71         vpaddb  %xmm2,%xmm11,%xmm12
   72         vpaddb  %xmm2,%xmm12,%xmm13
   73         vpaddb  %xmm2,%xmm13,%xmm14
   74         vpxor   %xmm15,%xmm1,%xmm9
   75         vmovdqu %xmm4,16+8(%rsp)
   76         jmp     .Loop6x
   77 
   78 .balign 32
   79 .Loop6x:
   80         addl    $100663296,%ebx
   81         jc      .Lhandle_ctr32
   82         vmovdqu 0-32(%r9),%xmm3
   83         vpaddb  %xmm2,%xmm14,%xmm1
   84         vpxor   %xmm15,%xmm10,%xmm10
   85         vpxor   %xmm15,%xmm11,%xmm11
   86 
   87 .Lresume_ctr32:
   88         vmovdqu %xmm1,(%r8)
   89         vpclmulqdq      $0x10,%xmm3,%xmm7,%xmm5
   90         vpxor   %xmm15,%xmm12,%xmm12
   91         vmovups 16-128(%rcx),%xmm2
   92         vpclmulqdq      $0x01,%xmm3,%xmm7,%xmm6
   93         xorq    %r12,%r12
   94         cmpq    %r14,%r15
   95 
   96         vaesenc %xmm2,%xmm9,%xmm9
   97         vmovdqu 48+8(%rsp),%xmm0
   98         vpxor   %xmm15,%xmm13,%xmm13
   99         vpclmulqdq      $0x00,%xmm3,%xmm7,%xmm1
  100         vaesenc %xmm2,%xmm10,%xmm10
  101         vpxor   %xmm15,%xmm14,%xmm14
  102         setnc   %r12b
  103         vpclmulqdq      $0x11,%xmm3,%xmm7,%xmm7
  104         vaesenc %xmm2,%xmm11,%xmm11
  105         vmovdqu 16-32(%r9),%xmm3
  106         negq    %r12
  107         vaesenc %xmm2,%xmm12,%xmm12
  108         vpxor   %xmm5,%xmm6,%xmm6
  109         vpclmulqdq      $0x00,%xmm3,%xmm0,%xmm5
  110         vpxor   %xmm4,%xmm8,%xmm8
  111         vaesenc %xmm2,%xmm13,%xmm13
  112         vpxor   %xmm5,%xmm1,%xmm4
  113         andq    $0x60,%r12
  114         vmovups 32-128(%rcx),%xmm15
  115         vpclmulqdq      $0x10,%xmm3,%xmm0,%xmm1
  116         vaesenc %xmm2,%xmm14,%xmm14
  117 
  118         vpclmulqdq      $0x01,%xmm3,%xmm0,%xmm2
  119         leaq    (%r14,%r12,1),%r14
  120         vaesenc %xmm15,%xmm9,%xmm9
  121         vpxor   16+8(%rsp),%xmm8,%xmm8
  122         vpclmulqdq      $0x11,%xmm3,%xmm0,%xmm3
  123         vmovdqu 64+8(%rsp),%xmm0
  124         vaesenc %xmm15,%xmm10,%xmm10
  125         movbeq  88(%r14),%r13
  126         vaesenc %xmm15,%xmm11,%xmm11
  127         movbeq  80(%r14),%r12
  128         vaesenc %xmm15,%xmm12,%xmm12
  129         movq    %r13,32+8(%rsp)
  130         vaesenc %xmm15,%xmm13,%xmm13
  131         movq    %r12,40+8(%rsp)
  132         vmovdqu 48-32(%r9),%xmm5
  133         vaesenc %xmm15,%xmm14,%xmm14
  134 
  135         vmovups 48-128(%rcx),%xmm15
  136         vpxor   %xmm1,%xmm6,%xmm6
  137         vpclmulqdq      $0x00,%xmm5,%xmm0,%xmm1
  138         vaesenc %xmm15,%xmm9,%xmm9
  139         vpxor   %xmm2,%xmm6,%xmm6
  140         vpclmulqdq      $0x10,%xmm5,%xmm0,%xmm2
  141         vaesenc %xmm15,%xmm10,%xmm10
  142         vpxor   %xmm3,%xmm7,%xmm7
  143         vpclmulqdq      $0x01,%xmm5,%xmm0,%xmm3
  144         vaesenc %xmm15,%xmm11,%xmm11
  145         vpclmulqdq      $0x11,%xmm5,%xmm0,%xmm5
  146         vmovdqu 80+8(%rsp),%xmm0
  147         vaesenc %xmm15,%xmm12,%xmm12
  148         vaesenc %xmm15,%xmm13,%xmm13
  149         vpxor   %xmm1,%xmm4,%xmm4
  150         vmovdqu 64-32(%r9),%xmm1
  151         vaesenc %xmm15,%xmm14,%xmm14
  152 
  153         vmovups 64-128(%rcx),%xmm15
  154         vpxor   %xmm2,%xmm6,%xmm6
  155         vpclmulqdq      $0x00,%xmm1,%xmm0,%xmm2
  156         vaesenc %xmm15,%xmm9,%xmm9
  157         vpxor   %xmm3,%xmm6,%xmm6
  158         vpclmulqdq      $0x10,%xmm1,%xmm0,%xmm3
  159         vaesenc %xmm15,%xmm10,%xmm10
  160         movbeq  72(%r14),%r13
  161         vpxor   %xmm5,%xmm7,%xmm7
  162         vpclmulqdq      $0x01,%xmm1,%xmm0,%xmm5
  163         vaesenc %xmm15,%xmm11,%xmm11
  164         movbeq  64(%r14),%r12
  165         vpclmulqdq      $0x11,%xmm1,%xmm0,%xmm1
  166         vmovdqu 96+8(%rsp),%xmm0
  167         vaesenc %xmm15,%xmm12,%xmm12
  168         movq    %r13,48+8(%rsp)
  169         vaesenc %xmm15,%xmm13,%xmm13
  170         movq    %r12,56+8(%rsp)
  171         vpxor   %xmm2,%xmm4,%xmm4
  172         vmovdqu 96-32(%r9),%xmm2
  173         vaesenc %xmm15,%xmm14,%xmm14
  174 
  175         vmovups 80-128(%rcx),%xmm15
  176         vpxor   %xmm3,%xmm6,%xmm6
  177         vpclmulqdq      $0x00,%xmm2,%xmm0,%xmm3
  178         vaesenc %xmm15,%xmm9,%xmm9
  179         vpxor   %xmm5,%xmm6,%xmm6
  180         vpclmulqdq      $0x10,%xmm2,%xmm0,%xmm5
  181         vaesenc %xmm15,%xmm10,%xmm10
  182         movbeq  56(%r14),%r13
  183         vpxor   %xmm1,%xmm7,%xmm7
  184         vpclmulqdq      $0x01,%xmm2,%xmm0,%xmm1
  185         vpxor   112+8(%rsp),%xmm8,%xmm8
  186         vaesenc %xmm15,%xmm11,%xmm11
  187         movbeq  48(%r14),%r12
  188         vpclmulqdq      $0x11,%xmm2,%xmm0,%xmm2
  189         vaesenc %xmm15,%xmm12,%xmm12
  190         movq    %r13,64+8(%rsp)
  191         vaesenc %xmm15,%xmm13,%xmm13
  192         movq    %r12,72+8(%rsp)
  193         vpxor   %xmm3,%xmm4,%xmm4
  194         vmovdqu 112-32(%r9),%xmm3
  195         vaesenc %xmm15,%xmm14,%xmm14
  196 
  197         vmovups 96-128(%rcx),%xmm15
  198         vpxor   %xmm5,%xmm6,%xmm6
  199         vpclmulqdq      $0x10,%xmm3,%xmm8,%xmm5
  200         vaesenc %xmm15,%xmm9,%xmm9
  201         vpxor   %xmm1,%xmm6,%xmm6
  202         vpclmulqdq      $0x01,%xmm3,%xmm8,%xmm1
  203         vaesenc %xmm15,%xmm10,%xmm10
  204         movbeq  40(%r14),%r13
  205         vpxor   %xmm2,%xmm7,%xmm7
  206         vpclmulqdq      $0x00,%xmm3,%xmm8,%xmm2
  207         vaesenc %xmm15,%xmm11,%xmm11
  208         movbeq  32(%r14),%r12
  209         vpclmulqdq      $0x11,%xmm3,%xmm8,%xmm8
  210         vaesenc %xmm15,%xmm12,%xmm12
  211         movq    %r13,80+8(%rsp)
  212         vaesenc %xmm15,%xmm13,%xmm13
  213         movq    %r12,88+8(%rsp)
  214         vpxor   %xmm5,%xmm6,%xmm6
  215         vaesenc %xmm15,%xmm14,%xmm14
  216         vpxor   %xmm1,%xmm6,%xmm6
  217 
  218         vmovups 112-128(%rcx),%xmm15
  219         vpslldq $8,%xmm6,%xmm5
  220         vpxor   %xmm2,%xmm4,%xmm4
  221         vmovdqu 16(%r11),%xmm3
  222 
  223         vaesenc %xmm15,%xmm9,%xmm9
  224         vpxor   %xmm8,%xmm7,%xmm7
  225         vaesenc %xmm15,%xmm10,%xmm10
  226         vpxor   %xmm5,%xmm4,%xmm4
  227         movbeq  24(%r14),%r13
  228         vaesenc %xmm15,%xmm11,%xmm11
  229         movbeq  16(%r14),%r12
  230         vpalignr        $8,%xmm4,%xmm4,%xmm0
  231         vpclmulqdq      $0x10,%xmm3,%xmm4,%xmm4
  232         movq    %r13,96+8(%rsp)
  233         vaesenc %xmm15,%xmm12,%xmm12
  234         movq    %r12,104+8(%rsp)
  235         vaesenc %xmm15,%xmm13,%xmm13
  236         vmovups 128-128(%rcx),%xmm1
  237         vaesenc %xmm15,%xmm14,%xmm14
  238 
  239         vaesenc %xmm1,%xmm9,%xmm9
  240         vmovups 144-128(%rcx),%xmm15
  241         vaesenc %xmm1,%xmm10,%xmm10
  242         vpsrldq $8,%xmm6,%xmm6
  243         vaesenc %xmm1,%xmm11,%xmm11
  244         vpxor   %xmm6,%xmm7,%xmm7
  245         vaesenc %xmm1,%xmm12,%xmm12
  246         vpxor   %xmm0,%xmm4,%xmm4
  247         movbeq  8(%r14),%r13
  248         vaesenc %xmm1,%xmm13,%xmm13
  249         movbeq  0(%r14),%r12
  250         vaesenc %xmm1,%xmm14,%xmm14
  251         vmovups 160-128(%rcx),%xmm1
  252         cmpl    $12,%ebp        // ICP uses 10,12,14 not 9,11,13 for rounds.
  253         jb      .Lenc_tail
  254 
  255         vaesenc %xmm15,%xmm9,%xmm9
  256         vaesenc %xmm15,%xmm10,%xmm10
  257         vaesenc %xmm15,%xmm11,%xmm11
  258         vaesenc %xmm15,%xmm12,%xmm12
  259         vaesenc %xmm15,%xmm13,%xmm13
  260         vaesenc %xmm15,%xmm14,%xmm14
  261 
  262         vaesenc %xmm1,%xmm9,%xmm9
  263         vaesenc %xmm1,%xmm10,%xmm10
  264         vaesenc %xmm1,%xmm11,%xmm11
  265         vaesenc %xmm1,%xmm12,%xmm12
  266         vaesenc %xmm1,%xmm13,%xmm13
  267         vmovups 176-128(%rcx),%xmm15
  268         vaesenc %xmm1,%xmm14,%xmm14
  269         vmovups 192-128(%rcx),%xmm1
  270         cmpl    $14,%ebp        // ICP does not zero key schedule.
  271         jb      .Lenc_tail
  272 
  273         vaesenc %xmm15,%xmm9,%xmm9
  274         vaesenc %xmm15,%xmm10,%xmm10
  275         vaesenc %xmm15,%xmm11,%xmm11
  276         vaesenc %xmm15,%xmm12,%xmm12
  277         vaesenc %xmm15,%xmm13,%xmm13
  278         vaesenc %xmm15,%xmm14,%xmm14
  279 
  280         vaesenc %xmm1,%xmm9,%xmm9
  281         vaesenc %xmm1,%xmm10,%xmm10
  282         vaesenc %xmm1,%xmm11,%xmm11
  283         vaesenc %xmm1,%xmm12,%xmm12
  284         vaesenc %xmm1,%xmm13,%xmm13
  285         vmovups 208-128(%rcx),%xmm15
  286         vaesenc %xmm1,%xmm14,%xmm14
  287         vmovups 224-128(%rcx),%xmm1
  288         jmp     .Lenc_tail
  289 
  290 .balign 32
  291 .Lhandle_ctr32:
  292         vmovdqu (%r11),%xmm0
  293         vpshufb %xmm0,%xmm1,%xmm6
  294         vmovdqu 48(%r11),%xmm5
  295         vpaddd  64(%r11),%xmm6,%xmm10
  296         vpaddd  %xmm5,%xmm6,%xmm11
  297         vmovdqu 0-32(%r9),%xmm3
  298         vpaddd  %xmm5,%xmm10,%xmm12
  299         vpshufb %xmm0,%xmm10,%xmm10
  300         vpaddd  %xmm5,%xmm11,%xmm13
  301         vpshufb %xmm0,%xmm11,%xmm11
  302         vpxor   %xmm15,%xmm10,%xmm10
  303         vpaddd  %xmm5,%xmm12,%xmm14
  304         vpshufb %xmm0,%xmm12,%xmm12
  305         vpxor   %xmm15,%xmm11,%xmm11
  306         vpaddd  %xmm5,%xmm13,%xmm1
  307         vpshufb %xmm0,%xmm13,%xmm13
  308         vpshufb %xmm0,%xmm14,%xmm14
  309         vpshufb %xmm0,%xmm1,%xmm1
  310         jmp     .Lresume_ctr32
  311 
  312 .balign 32
  313 .Lenc_tail:
  314         vaesenc %xmm15,%xmm9,%xmm9
  315         vmovdqu %xmm7,16+8(%rsp)
  316         vpalignr        $8,%xmm4,%xmm4,%xmm8
  317         vaesenc %xmm15,%xmm10,%xmm10
  318         vpclmulqdq      $0x10,%xmm3,%xmm4,%xmm4
  319         vpxor   0(%rdi),%xmm1,%xmm2
  320         vaesenc %xmm15,%xmm11,%xmm11
  321         vpxor   16(%rdi),%xmm1,%xmm0
  322         vaesenc %xmm15,%xmm12,%xmm12
  323         vpxor   32(%rdi),%xmm1,%xmm5
  324         vaesenc %xmm15,%xmm13,%xmm13
  325         vpxor   48(%rdi),%xmm1,%xmm6
  326         vaesenc %xmm15,%xmm14,%xmm14
  327         vpxor   64(%rdi),%xmm1,%xmm7
  328         vpxor   80(%rdi),%xmm1,%xmm3
  329         vmovdqu (%r8),%xmm1
  330 
  331         vaesenclast     %xmm2,%xmm9,%xmm9
  332         vmovdqu 32(%r11),%xmm2
  333         vaesenclast     %xmm0,%xmm10,%xmm10
  334         vpaddb  %xmm2,%xmm1,%xmm0
  335         movq    %r13,112+8(%rsp)
  336         leaq    96(%rdi),%rdi
  337         vaesenclast     %xmm5,%xmm11,%xmm11
  338         vpaddb  %xmm2,%xmm0,%xmm5
  339         movq    %r12,120+8(%rsp)
  340         leaq    96(%rsi),%rsi
  341         vmovdqu 0-128(%rcx),%xmm15
  342         vaesenclast     %xmm6,%xmm12,%xmm12
  343         vpaddb  %xmm2,%xmm5,%xmm6
  344         vaesenclast     %xmm7,%xmm13,%xmm13
  345         vpaddb  %xmm2,%xmm6,%xmm7
  346         vaesenclast     %xmm3,%xmm14,%xmm14
  347         vpaddb  %xmm2,%xmm7,%xmm3
  348 
  349         addq    $0x60,%r10
  350         subq    $0x6,%rdx
  351         jc      .L6x_done
  352 
  353         vmovups %xmm9,-96(%rsi)
  354         vpxor   %xmm15,%xmm1,%xmm9
  355         vmovups %xmm10,-80(%rsi)
  356         vmovdqa %xmm0,%xmm10
  357         vmovups %xmm11,-64(%rsi)
  358         vmovdqa %xmm5,%xmm11
  359         vmovups %xmm12,-48(%rsi)
  360         vmovdqa %xmm6,%xmm12
  361         vmovups %xmm13,-32(%rsi)
  362         vmovdqa %xmm7,%xmm13
  363         vmovups %xmm14,-16(%rsi)
  364         vmovdqa %xmm3,%xmm14
  365         vmovdqu 32+8(%rsp),%xmm7
  366         jmp     .Loop6x
  367 
  368 .L6x_done:
  369         vpxor   16+8(%rsp),%xmm8,%xmm8
  370         vpxor   %xmm4,%xmm8,%xmm8
  371 
  372         RET
  373 .cfi_endproc
  374 SET_SIZE(_aesni_ctr32_ghash_6x)
  375 #endif /* ifdef HAVE_MOVBE */
  376 
  377 .balign 32
  378 FUNCTION(_aesni_ctr32_ghash_no_movbe_6x)
  379 .cfi_startproc
  380         ENDBR
  381         vmovdqu 32(%r11),%xmm2
  382         subq    $6,%rdx
  383         vpxor   %xmm4,%xmm4,%xmm4
  384         vmovdqu 0-128(%rcx),%xmm15
  385         vpaddb  %xmm2,%xmm1,%xmm10
  386         vpaddb  %xmm2,%xmm10,%xmm11
  387         vpaddb  %xmm2,%xmm11,%xmm12
  388         vpaddb  %xmm2,%xmm12,%xmm13
  389         vpaddb  %xmm2,%xmm13,%xmm14
  390         vpxor   %xmm15,%xmm1,%xmm9
  391         vmovdqu %xmm4,16+8(%rsp)
  392         jmp     .Loop6x_nmb
  393 
  394 .balign 32
  395 .Loop6x_nmb:
  396         addl    $100663296,%ebx
  397         jc      .Lhandle_ctr32_nmb
  398         vmovdqu 0-32(%r9),%xmm3
  399         vpaddb  %xmm2,%xmm14,%xmm1
  400         vpxor   %xmm15,%xmm10,%xmm10
  401         vpxor   %xmm15,%xmm11,%xmm11
  402 
  403 .Lresume_ctr32_nmb:
  404         vmovdqu %xmm1,(%r8)
  405         vpclmulqdq      $0x10,%xmm3,%xmm7,%xmm5
  406         vpxor   %xmm15,%xmm12,%xmm12
  407         vmovups 16-128(%rcx),%xmm2
  408         vpclmulqdq      $0x01,%xmm3,%xmm7,%xmm6
  409         xorq    %r12,%r12
  410         cmpq    %r14,%r15
  411 
  412         vaesenc %xmm2,%xmm9,%xmm9
  413         vmovdqu 48+8(%rsp),%xmm0
  414         vpxor   %xmm15,%xmm13,%xmm13
  415         vpclmulqdq      $0x00,%xmm3,%xmm7,%xmm1
  416         vaesenc %xmm2,%xmm10,%xmm10
  417         vpxor   %xmm15,%xmm14,%xmm14
  418         setnc   %r12b
  419         vpclmulqdq      $0x11,%xmm3,%xmm7,%xmm7
  420         vaesenc %xmm2,%xmm11,%xmm11
  421         vmovdqu 16-32(%r9),%xmm3
  422         negq    %r12
  423         vaesenc %xmm2,%xmm12,%xmm12
  424         vpxor   %xmm5,%xmm6,%xmm6
  425         vpclmulqdq      $0x00,%xmm3,%xmm0,%xmm5
  426         vpxor   %xmm4,%xmm8,%xmm8
  427         vaesenc %xmm2,%xmm13,%xmm13
  428         vpxor   %xmm5,%xmm1,%xmm4
  429         andq    $0x60,%r12
  430         vmovups 32-128(%rcx),%xmm15
  431         vpclmulqdq      $0x10,%xmm3,%xmm0,%xmm1
  432         vaesenc %xmm2,%xmm14,%xmm14
  433 
  434         vpclmulqdq      $0x01,%xmm3,%xmm0,%xmm2
  435         leaq    (%r14,%r12,1),%r14
  436         vaesenc %xmm15,%xmm9,%xmm9
  437         vpxor   16+8(%rsp),%xmm8,%xmm8
  438         vpclmulqdq      $0x11,%xmm3,%xmm0,%xmm3
  439         vmovdqu 64+8(%rsp),%xmm0
  440         vaesenc %xmm15,%xmm10,%xmm10
  441         movq    88(%r14),%r13
  442         bswapq  %r13
  443         vaesenc %xmm15,%xmm11,%xmm11
  444         movq    80(%r14),%r12
  445         bswapq  %r12
  446         vaesenc %xmm15,%xmm12,%xmm12
  447         movq    %r13,32+8(%rsp)
  448         vaesenc %xmm15,%xmm13,%xmm13
  449         movq    %r12,40+8(%rsp)
  450         vmovdqu 48-32(%r9),%xmm5
  451         vaesenc %xmm15,%xmm14,%xmm14
  452 
  453         vmovups 48-128(%rcx),%xmm15
  454         vpxor   %xmm1,%xmm6,%xmm6
  455         vpclmulqdq      $0x00,%xmm5,%xmm0,%xmm1
  456         vaesenc %xmm15,%xmm9,%xmm9
  457         vpxor   %xmm2,%xmm6,%xmm6
  458         vpclmulqdq      $0x10,%xmm5,%xmm0,%xmm2
  459         vaesenc %xmm15,%xmm10,%xmm10
  460         vpxor   %xmm3,%xmm7,%xmm7
  461         vpclmulqdq      $0x01,%xmm5,%xmm0,%xmm3
  462         vaesenc %xmm15,%xmm11,%xmm11
  463         vpclmulqdq      $0x11,%xmm5,%xmm0,%xmm5
  464         vmovdqu 80+8(%rsp),%xmm0
  465         vaesenc %xmm15,%xmm12,%xmm12
  466         vaesenc %xmm15,%xmm13,%xmm13
  467         vpxor   %xmm1,%xmm4,%xmm4
  468         vmovdqu 64-32(%r9),%xmm1
  469         vaesenc %xmm15,%xmm14,%xmm14
  470 
  471         vmovups 64-128(%rcx),%xmm15
  472         vpxor   %xmm2,%xmm6,%xmm6
  473         vpclmulqdq      $0x00,%xmm1,%xmm0,%xmm2
  474         vaesenc %xmm15,%xmm9,%xmm9
  475         vpxor   %xmm3,%xmm6,%xmm6
  476         vpclmulqdq      $0x10,%xmm1,%xmm0,%xmm3
  477         vaesenc %xmm15,%xmm10,%xmm10
  478         movq    72(%r14),%r13
  479         bswapq  %r13
  480         vpxor   %xmm5,%xmm7,%xmm7
  481         vpclmulqdq      $0x01,%xmm1,%xmm0,%xmm5
  482         vaesenc %xmm15,%xmm11,%xmm11
  483         movq    64(%r14),%r12
  484         bswapq  %r12
  485         vpclmulqdq      $0x11,%xmm1,%xmm0,%xmm1
  486         vmovdqu 96+8(%rsp),%xmm0
  487         vaesenc %xmm15,%xmm12,%xmm12
  488         movq    %r13,48+8(%rsp)
  489         vaesenc %xmm15,%xmm13,%xmm13
  490         movq    %r12,56+8(%rsp)
  491         vpxor   %xmm2,%xmm4,%xmm4
  492         vmovdqu 96-32(%r9),%xmm2
  493         vaesenc %xmm15,%xmm14,%xmm14
  494 
  495         vmovups 80-128(%rcx),%xmm15
  496         vpxor   %xmm3,%xmm6,%xmm6
  497         vpclmulqdq      $0x00,%xmm2,%xmm0,%xmm3
  498         vaesenc %xmm15,%xmm9,%xmm9
  499         vpxor   %xmm5,%xmm6,%xmm6
  500         vpclmulqdq      $0x10,%xmm2,%xmm0,%xmm5
  501         vaesenc %xmm15,%xmm10,%xmm10
  502         movq    56(%r14),%r13
  503         bswapq  %r13
  504         vpxor   %xmm1,%xmm7,%xmm7
  505         vpclmulqdq      $0x01,%xmm2,%xmm0,%xmm1
  506         vpxor   112+8(%rsp),%xmm8,%xmm8
  507         vaesenc %xmm15,%xmm11,%xmm11
  508         movq    48(%r14),%r12
  509         bswapq  %r12
  510         vpclmulqdq      $0x11,%xmm2,%xmm0,%xmm2
  511         vaesenc %xmm15,%xmm12,%xmm12
  512         movq    %r13,64+8(%rsp)
  513         vaesenc %xmm15,%xmm13,%xmm13
  514         movq    %r12,72+8(%rsp)
  515         vpxor   %xmm3,%xmm4,%xmm4
  516         vmovdqu 112-32(%r9),%xmm3
  517         vaesenc %xmm15,%xmm14,%xmm14
  518 
  519         vmovups 96-128(%rcx),%xmm15
  520         vpxor   %xmm5,%xmm6,%xmm6
  521         vpclmulqdq      $0x10,%xmm3,%xmm8,%xmm5
  522         vaesenc %xmm15,%xmm9,%xmm9
  523         vpxor   %xmm1,%xmm6,%xmm6
  524         vpclmulqdq      $0x01,%xmm3,%xmm8,%xmm1
  525         vaesenc %xmm15,%xmm10,%xmm10
  526         movq    40(%r14),%r13
  527         bswapq  %r13
  528         vpxor   %xmm2,%xmm7,%xmm7
  529         vpclmulqdq      $0x00,%xmm3,%xmm8,%xmm2
  530         vaesenc %xmm15,%xmm11,%xmm11
  531         movq    32(%r14),%r12
  532         bswapq  %r12
  533         vpclmulqdq      $0x11,%xmm3,%xmm8,%xmm8
  534         vaesenc %xmm15,%xmm12,%xmm12
  535         movq    %r13,80+8(%rsp)
  536         vaesenc %xmm15,%xmm13,%xmm13
  537         movq    %r12,88+8(%rsp)
  538         vpxor   %xmm5,%xmm6,%xmm6
  539         vaesenc %xmm15,%xmm14,%xmm14
  540         vpxor   %xmm1,%xmm6,%xmm6
  541 
  542         vmovups 112-128(%rcx),%xmm15
  543         vpslldq $8,%xmm6,%xmm5
  544         vpxor   %xmm2,%xmm4,%xmm4
  545         vmovdqu 16(%r11),%xmm3
  546 
  547         vaesenc %xmm15,%xmm9,%xmm9
  548         vpxor   %xmm8,%xmm7,%xmm7
  549         vaesenc %xmm15,%xmm10,%xmm10
  550         vpxor   %xmm5,%xmm4,%xmm4
  551         movq    24(%r14),%r13
  552         bswapq  %r13
  553         vaesenc %xmm15,%xmm11,%xmm11
  554         movq    16(%r14),%r12
  555         bswapq  %r12
  556         vpalignr        $8,%xmm4,%xmm4,%xmm0
  557         vpclmulqdq      $0x10,%xmm3,%xmm4,%xmm4
  558         movq    %r13,96+8(%rsp)
  559         vaesenc %xmm15,%xmm12,%xmm12
  560         movq    %r12,104+8(%rsp)
  561         vaesenc %xmm15,%xmm13,%xmm13
  562         vmovups 128-128(%rcx),%xmm1
  563         vaesenc %xmm15,%xmm14,%xmm14
  564 
  565         vaesenc %xmm1,%xmm9,%xmm9
  566         vmovups 144-128(%rcx),%xmm15
  567         vaesenc %xmm1,%xmm10,%xmm10
  568         vpsrldq $8,%xmm6,%xmm6
  569         vaesenc %xmm1,%xmm11,%xmm11
  570         vpxor   %xmm6,%xmm7,%xmm7
  571         vaesenc %xmm1,%xmm12,%xmm12
  572         vpxor   %xmm0,%xmm4,%xmm4
  573         movq    8(%r14),%r13
  574         bswapq  %r13
  575         vaesenc %xmm1,%xmm13,%xmm13
  576         movq    0(%r14),%r12
  577         bswapq  %r12
  578         vaesenc %xmm1,%xmm14,%xmm14
  579         vmovups 160-128(%rcx),%xmm1
  580         cmpl    $12,%ebp        // ICP uses 10,12,14 not 9,11,13 for rounds.
  581         jb      .Lenc_tail_nmb
  582 
  583         vaesenc %xmm15,%xmm9,%xmm9
  584         vaesenc %xmm15,%xmm10,%xmm10
  585         vaesenc %xmm15,%xmm11,%xmm11
  586         vaesenc %xmm15,%xmm12,%xmm12
  587         vaesenc %xmm15,%xmm13,%xmm13
  588         vaesenc %xmm15,%xmm14,%xmm14
  589 
  590         vaesenc %xmm1,%xmm9,%xmm9
  591         vaesenc %xmm1,%xmm10,%xmm10
  592         vaesenc %xmm1,%xmm11,%xmm11
  593         vaesenc %xmm1,%xmm12,%xmm12
  594         vaesenc %xmm1,%xmm13,%xmm13
  595         vmovups 176-128(%rcx),%xmm15
  596         vaesenc %xmm1,%xmm14,%xmm14
  597         vmovups 192-128(%rcx),%xmm1
  598         cmpl    $14,%ebp        // ICP does not zero key schedule.
  599         jb      .Lenc_tail_nmb
  600 
  601         vaesenc %xmm15,%xmm9,%xmm9
  602         vaesenc %xmm15,%xmm10,%xmm10
  603         vaesenc %xmm15,%xmm11,%xmm11
  604         vaesenc %xmm15,%xmm12,%xmm12
  605         vaesenc %xmm15,%xmm13,%xmm13
  606         vaesenc %xmm15,%xmm14,%xmm14
  607 
  608         vaesenc %xmm1,%xmm9,%xmm9
  609         vaesenc %xmm1,%xmm10,%xmm10
  610         vaesenc %xmm1,%xmm11,%xmm11
  611         vaesenc %xmm1,%xmm12,%xmm12
  612         vaesenc %xmm1,%xmm13,%xmm13
  613         vmovups 208-128(%rcx),%xmm15
  614         vaesenc %xmm1,%xmm14,%xmm14
  615         vmovups 224-128(%rcx),%xmm1
  616         jmp     .Lenc_tail_nmb
  617 
  618 .balign 32
  619 .Lhandle_ctr32_nmb:
  620         vmovdqu (%r11),%xmm0
  621         vpshufb %xmm0,%xmm1,%xmm6
  622         vmovdqu 48(%r11),%xmm5
  623         vpaddd  64(%r11),%xmm6,%xmm10
  624         vpaddd  %xmm5,%xmm6,%xmm11
  625         vmovdqu 0-32(%r9),%xmm3
  626         vpaddd  %xmm5,%xmm10,%xmm12
  627         vpshufb %xmm0,%xmm10,%xmm10
  628         vpaddd  %xmm5,%xmm11,%xmm13
  629         vpshufb %xmm0,%xmm11,%xmm11
  630         vpxor   %xmm15,%xmm10,%xmm10
  631         vpaddd  %xmm5,%xmm12,%xmm14
  632         vpshufb %xmm0,%xmm12,%xmm12
  633         vpxor   %xmm15,%xmm11,%xmm11
  634         vpaddd  %xmm5,%xmm13,%xmm1
  635         vpshufb %xmm0,%xmm13,%xmm13
  636         vpshufb %xmm0,%xmm14,%xmm14
  637         vpshufb %xmm0,%xmm1,%xmm1
  638         jmp     .Lresume_ctr32_nmb
  639 
  640 .balign 32
  641 .Lenc_tail_nmb:
  642         vaesenc %xmm15,%xmm9,%xmm9
  643         vmovdqu %xmm7,16+8(%rsp)
  644         vpalignr        $8,%xmm4,%xmm4,%xmm8
  645         vaesenc %xmm15,%xmm10,%xmm10
  646         vpclmulqdq      $0x10,%xmm3,%xmm4,%xmm4
  647         vpxor   0(%rdi),%xmm1,%xmm2
  648         vaesenc %xmm15,%xmm11,%xmm11
  649         vpxor   16(%rdi),%xmm1,%xmm0
  650         vaesenc %xmm15,%xmm12,%xmm12
  651         vpxor   32(%rdi),%xmm1,%xmm5
  652         vaesenc %xmm15,%xmm13,%xmm13
  653         vpxor   48(%rdi),%xmm1,%xmm6
  654         vaesenc %xmm15,%xmm14,%xmm14
  655         vpxor   64(%rdi),%xmm1,%xmm7
  656         vpxor   80(%rdi),%xmm1,%xmm3
  657         vmovdqu (%r8),%xmm1
  658 
  659         vaesenclast     %xmm2,%xmm9,%xmm9
  660         vmovdqu 32(%r11),%xmm2
  661         vaesenclast     %xmm0,%xmm10,%xmm10
  662         vpaddb  %xmm2,%xmm1,%xmm0
  663         movq    %r13,112+8(%rsp)
  664         leaq    96(%rdi),%rdi
  665         vaesenclast     %xmm5,%xmm11,%xmm11
  666         vpaddb  %xmm2,%xmm0,%xmm5
  667         movq    %r12,120+8(%rsp)
  668         leaq    96(%rsi),%rsi
  669         vmovdqu 0-128(%rcx),%xmm15
  670         vaesenclast     %xmm6,%xmm12,%xmm12
  671         vpaddb  %xmm2,%xmm5,%xmm6
  672         vaesenclast     %xmm7,%xmm13,%xmm13
  673         vpaddb  %xmm2,%xmm6,%xmm7
  674         vaesenclast     %xmm3,%xmm14,%xmm14
  675         vpaddb  %xmm2,%xmm7,%xmm3
  676 
  677         addq    $0x60,%r10
  678         subq    $0x6,%rdx
  679         jc      .L6x_done_nmb
  680 
  681         vmovups %xmm9,-96(%rsi)
  682         vpxor   %xmm15,%xmm1,%xmm9
  683         vmovups %xmm10,-80(%rsi)
  684         vmovdqa %xmm0,%xmm10
  685         vmovups %xmm11,-64(%rsi)
  686         vmovdqa %xmm5,%xmm11
  687         vmovups %xmm12,-48(%rsi)
  688         vmovdqa %xmm6,%xmm12
  689         vmovups %xmm13,-32(%rsi)
  690         vmovdqa %xmm7,%xmm13
  691         vmovups %xmm14,-16(%rsi)
  692         vmovdqa %xmm3,%xmm14
  693         vmovdqu 32+8(%rsp),%xmm7
  694         jmp     .Loop6x_nmb
  695 
  696 .L6x_done_nmb:
  697         vpxor   16+8(%rsp),%xmm8,%xmm8
  698         vpxor   %xmm4,%xmm8,%xmm8
  699 
  700         RET
  701 .cfi_endproc
  702 SET_SIZE(_aesni_ctr32_ghash_no_movbe_6x)
  703 
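      /*
       * aesni_gcm_decrypt: bulk stitched CTR decryption + GHASH.  Register
       * usage as inferred from the code below (the authoritative prototype
       * lives in the ICP C sources): %rdi = in, %rsi = out, %rdx = length
       * in bytes (at least 0x60), %rcx = expanded key schedule with the
       * round count at byte offset 504, %r8 = counter block (updated in
       * place), %r9 = GHASH state, with the H-table reached through
       * 32(%r9).  The number of bytes processed is returned in %rax.
       * aesni_gcm_encrypt further down uses the same interface but
       * requires at least 288 bytes of input.
       */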
  704 ENTRY_ALIGN(aesni_gcm_decrypt, 32)
  705 .cfi_startproc
  706         ENDBR
  707         xorq    %r10,%r10
  708         cmpq    $0x60,%rdx
  709         jb      .Lgcm_dec_abort
  710 
  711         leaq    (%rsp),%rax
  712 .cfi_def_cfa_register   %rax
  713         pushq   %rbx
  714 .cfi_offset     %rbx,-16
  715         pushq   %rbp
  716 .cfi_offset     %rbp,-24
  717         pushq   %r12
  718 .cfi_offset     %r12,-32
  719         pushq   %r13
  720 .cfi_offset     %r13,-40
  721         pushq   %r14
  722 .cfi_offset     %r14,-48
  723         pushq   %r15
  724 .cfi_offset     %r15,-56
  725         pushq   %r9
  726 .cfi_offset     %r9,-64
  727         vzeroupper
  728 
  729         vmovdqu (%r8),%xmm1
  730         addq    $-128,%rsp
  731         movl    12(%r8),%ebx
  732         leaq    .Lbswap_mask(%rip),%r11
  733         leaq    -128(%rcx),%r14
  734         movq    $0xf80,%r15
  735         vmovdqu (%r9),%xmm8
  736         andq    $-128,%rsp
  737         vmovdqu (%r11),%xmm0
  738         leaq    128(%rcx),%rcx
  739         movq    32(%r9),%r9
  740         leaq    32(%r9),%r9
  741         movl    504-128(%rcx),%ebp      // ICP has a larger offset for rounds.
  742         vpshufb %xmm0,%xmm8,%xmm8
  743 
  744         andq    %r15,%r14
  745         andq    %rsp,%r15
  746         subq    %r14,%r15
  747         jc      .Ldec_no_key_aliasing
  748         cmpq    $768,%r15
  749         jnc     .Ldec_no_key_aliasing
  750         subq    %r15,%rsp
  751 .Ldec_no_key_aliasing:
  752 
  753         vmovdqu 80(%rdi),%xmm7
  754         leaq    (%rdi),%r14
  755         vmovdqu 64(%rdi),%xmm4
  756         leaq    -192(%rdi,%rdx,1),%r15
  757         vmovdqu 48(%rdi),%xmm5
  758         shrq    $4,%rdx
  759         xorq    %r10,%r10
  760         vmovdqu 32(%rdi),%xmm6
  761         vpshufb %xmm0,%xmm7,%xmm7
  762         vmovdqu 16(%rdi),%xmm2
  763         vpshufb %xmm0,%xmm4,%xmm4
  764         vmovdqu (%rdi),%xmm3
  765         vpshufb %xmm0,%xmm5,%xmm5
  766         vmovdqu %xmm4,48(%rsp)
  767         vpshufb %xmm0,%xmm6,%xmm6
  768         vmovdqu %xmm5,64(%rsp)
  769         vpshufb %xmm0,%xmm2,%xmm2
  770         vmovdqu %xmm6,80(%rsp)
  771         vpshufb %xmm0,%xmm3,%xmm3
  772         vmovdqu %xmm2,96(%rsp)
  773         vmovdqu %xmm3,112(%rsp)
  774 
  775 #ifdef HAVE_MOVBE
  776 #ifdef _KERNEL
  777         testl   $1,gcm_avx_can_use_movbe(%rip)
  778 #else
  779         testl   $1,gcm_avx_can_use_movbe@GOTPCREL(%rip)
  780 #endif
  781         jz      1f
  782         call    _aesni_ctr32_ghash_6x
  783         jmp     2f
  784 1:
  785 #endif
  786         call    _aesni_ctr32_ghash_no_movbe_6x
  787 2:
  788         vmovups %xmm9,-96(%rsi)
  789         vmovups %xmm10,-80(%rsi)
  790         vmovups %xmm11,-64(%rsi)
  791         vmovups %xmm12,-48(%rsi)
  792         vmovups %xmm13,-32(%rsi)
  793         vmovups %xmm14,-16(%rsi)
  794 
  795         vpshufb (%r11),%xmm8,%xmm8
  796         movq    -56(%rax),%r9
  797 .cfi_restore    %r9
  798         vmovdqu %xmm8,(%r9)
  799 
  800         vzeroupper
  801         movq    -48(%rax),%r15
  802 .cfi_restore    %r15
  803         movq    -40(%rax),%r14
  804 .cfi_restore    %r14
  805         movq    -32(%rax),%r13
  806 .cfi_restore    %r13
  807         movq    -24(%rax),%r12
  808 .cfi_restore    %r12
  809         movq    -16(%rax),%rbp
  810 .cfi_restore    %rbp
  811         movq    -8(%rax),%rbx
  812 .cfi_restore    %rbx
  813         leaq    (%rax),%rsp
  814 .cfi_def_cfa_register   %rsp
  815 .Lgcm_dec_abort:
  816         movq    %r10,%rax
  817         RET
  818 .cfi_endproc
  819 SET_SIZE(aesni_gcm_decrypt)
  820 
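      /*
       * _aesni_ctr32_6x encrypts exactly six counter blocks and XORs them
       * into 96 bytes of input.  aesni_gcm_encrypt calls it twice to
       * produce the first twelve ciphertext blocks before entering the
       * stitched loop, since GHASH is computed over ciphertext and
       * therefore lags the encryption.
       */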
  821 .balign 32
  822 FUNCTION(_aesni_ctr32_6x)
  823 .cfi_startproc
  824         ENDBR
  825         vmovdqu 0-128(%rcx),%xmm4
  826         vmovdqu 32(%r11),%xmm2
  827         leaq    -2(%rbp),%r13   // ICP uses 10,12,14 not 9,11,13 for rounds.
  828         vmovups 16-128(%rcx),%xmm15
  829         leaq    32-128(%rcx),%r12
  830         vpxor   %xmm4,%xmm1,%xmm9
  831         addl    $100663296,%ebx
  832         jc      .Lhandle_ctr32_2
  833         vpaddb  %xmm2,%xmm1,%xmm10
  834         vpaddb  %xmm2,%xmm10,%xmm11
  835         vpxor   %xmm4,%xmm10,%xmm10
  836         vpaddb  %xmm2,%xmm11,%xmm12
  837         vpxor   %xmm4,%xmm11,%xmm11
  838         vpaddb  %xmm2,%xmm12,%xmm13
  839         vpxor   %xmm4,%xmm12,%xmm12
  840         vpaddb  %xmm2,%xmm13,%xmm14
  841         vpxor   %xmm4,%xmm13,%xmm13
  842         vpaddb  %xmm2,%xmm14,%xmm1
  843         vpxor   %xmm4,%xmm14,%xmm14
  844         jmp     .Loop_ctr32
  845 
  846 .balign 16
  847 .Loop_ctr32:
  848         vaesenc %xmm15,%xmm9,%xmm9
  849         vaesenc %xmm15,%xmm10,%xmm10
  850         vaesenc %xmm15,%xmm11,%xmm11
  851         vaesenc %xmm15,%xmm12,%xmm12
  852         vaesenc %xmm15,%xmm13,%xmm13
  853         vaesenc %xmm15,%xmm14,%xmm14
  854         vmovups (%r12),%xmm15
  855         leaq    16(%r12),%r12
  856         decl    %r13d
  857         jnz     .Loop_ctr32
  858 
  859         vmovdqu (%r12),%xmm3
  860         vaesenc %xmm15,%xmm9,%xmm9
  861         vpxor   0(%rdi),%xmm3,%xmm4
  862         vaesenc %xmm15,%xmm10,%xmm10
  863         vpxor   16(%rdi),%xmm3,%xmm5
  864         vaesenc %xmm15,%xmm11,%xmm11
  865         vpxor   32(%rdi),%xmm3,%xmm6
  866         vaesenc %xmm15,%xmm12,%xmm12
  867         vpxor   48(%rdi),%xmm3,%xmm8
  868         vaesenc %xmm15,%xmm13,%xmm13
  869         vpxor   64(%rdi),%xmm3,%xmm2
  870         vaesenc %xmm15,%xmm14,%xmm14
  871         vpxor   80(%rdi),%xmm3,%xmm3
  872         leaq    96(%rdi),%rdi
  873 
  874         vaesenclast     %xmm4,%xmm9,%xmm9
  875         vaesenclast     %xmm5,%xmm10,%xmm10
  876         vaesenclast     %xmm6,%xmm11,%xmm11
  877         vaesenclast     %xmm8,%xmm12,%xmm12
  878         vaesenclast     %xmm2,%xmm13,%xmm13
  879         vaesenclast     %xmm3,%xmm14,%xmm14
  880         vmovups %xmm9,0(%rsi)
  881         vmovups %xmm10,16(%rsi)
  882         vmovups %xmm11,32(%rsi)
  883         vmovups %xmm12,48(%rsi)
  884         vmovups %xmm13,64(%rsi)
  885         vmovups %xmm14,80(%rsi)
  886         leaq    96(%rsi),%rsi
  887 
  888         RET
  889 .balign 32
  890 .Lhandle_ctr32_2:
  891         vpshufb %xmm0,%xmm1,%xmm6
  892         vmovdqu 48(%r11),%xmm5
  893         vpaddd  64(%r11),%xmm6,%xmm10
  894         vpaddd  %xmm5,%xmm6,%xmm11
  895         vpaddd  %xmm5,%xmm10,%xmm12
  896         vpshufb %xmm0,%xmm10,%xmm10
  897         vpaddd  %xmm5,%xmm11,%xmm13
  898         vpshufb %xmm0,%xmm11,%xmm11
  899         vpxor   %xmm4,%xmm10,%xmm10
  900         vpaddd  %xmm5,%xmm12,%xmm14
  901         vpshufb %xmm0,%xmm12,%xmm12
  902         vpxor   %xmm4,%xmm11,%xmm11
  903         vpaddd  %xmm5,%xmm13,%xmm1
  904         vpshufb %xmm0,%xmm13,%xmm13
  905         vpxor   %xmm4,%xmm12,%xmm12
  906         vpshufb %xmm0,%xmm14,%xmm14
  907         vpxor   %xmm4,%xmm13,%xmm13
  908         vpshufb %xmm0,%xmm1,%xmm1
  909         vpxor   %xmm4,%xmm14,%xmm14
  910         jmp     .Loop_ctr32
  911 .cfi_endproc
  912 SET_SIZE(_aesni_ctr32_6x)
  913 
  914 ENTRY_ALIGN(aesni_gcm_encrypt, 32)
  915 .cfi_startproc
  916         ENDBR
  917         xorq    %r10,%r10
  918         cmpq    $288,%rdx
  919         jb      .Lgcm_enc_abort
  920 
  921         leaq    (%rsp),%rax
  922 .cfi_def_cfa_register   %rax
  923         pushq   %rbx
  924 .cfi_offset     %rbx,-16
  925         pushq   %rbp
  926 .cfi_offset     %rbp,-24
  927         pushq   %r12
  928 .cfi_offset     %r12,-32
  929         pushq   %r13
  930 .cfi_offset     %r13,-40
  931         pushq   %r14
  932 .cfi_offset     %r14,-48
  933         pushq   %r15
  934 .cfi_offset     %r15,-56
  935         pushq   %r9
  936 .cfi_offset     %r9,-64
  937         vzeroupper
  938 
  939         vmovdqu (%r8),%xmm1
  940         addq    $-128,%rsp
  941         movl    12(%r8),%ebx
  942         leaq    .Lbswap_mask(%rip),%r11
  943         leaq    -128(%rcx),%r14
  944         movq    $0xf80,%r15
  945         leaq    128(%rcx),%rcx
  946         vmovdqu (%r11),%xmm0
  947         andq    $-128,%rsp
  948         movl    504-128(%rcx),%ebp      // ICP has a larger offset for rounds.
  949 
  950         andq    %r15,%r14
  951         andq    %rsp,%r15
  952         subq    %r14,%r15
  953         jc      .Lenc_no_key_aliasing
  954         cmpq    $768,%r15
  955         jnc     .Lenc_no_key_aliasing
  956         subq    %r15,%rsp
  957 .Lenc_no_key_aliasing:
  958 
  959         leaq    (%rsi),%r14
  960         leaq    -192(%rsi,%rdx,1),%r15
  961         shrq    $4,%rdx
  962 
  963         call    _aesni_ctr32_6x
  964         vpshufb %xmm0,%xmm9,%xmm8
  965         vpshufb %xmm0,%xmm10,%xmm2
  966         vmovdqu %xmm8,112(%rsp)
  967         vpshufb %xmm0,%xmm11,%xmm4
  968         vmovdqu %xmm2,96(%rsp)
  969         vpshufb %xmm0,%xmm12,%xmm5
  970         vmovdqu %xmm4,80(%rsp)
  971         vpshufb %xmm0,%xmm13,%xmm6
  972         vmovdqu %xmm5,64(%rsp)
  973         vpshufb %xmm0,%xmm14,%xmm7
  974         vmovdqu %xmm6,48(%rsp)
  975 
  976         call    _aesni_ctr32_6x
  977 
  978         vmovdqu (%r9),%xmm8
  979         movq    32(%r9),%r9
  980         leaq    32(%r9),%r9
  981         subq    $12,%rdx
  982         movq    $192,%r10
  983         vpshufb %xmm0,%xmm8,%xmm8
  984 
  985 #ifdef HAVE_MOVBE
  986 #ifdef _KERNEL
  987         testl   $1,gcm_avx_can_use_movbe(%rip)
  988 #else
  989         testl   $1,gcm_avx_can_use_movbe@GOTPCREL(%rip)
  990 #endif
  991         jz      1f
  992         call    _aesni_ctr32_ghash_6x
  993         jmp     2f
  994 1:
  995 #endif
  996         call    _aesni_ctr32_ghash_no_movbe_6x
  997 2:
  998         vmovdqu 32(%rsp),%xmm7
  999         vmovdqu (%r11),%xmm0
 1000         vmovdqu 0-32(%r9),%xmm3
 1001         vpunpckhqdq     %xmm7,%xmm7,%xmm1
 1002         vmovdqu 32-32(%r9),%xmm15
 1003         vmovups %xmm9,-96(%rsi)
 1004         vpshufb %xmm0,%xmm9,%xmm9
 1005         vpxor   %xmm7,%xmm1,%xmm1
 1006         vmovups %xmm10,-80(%rsi)
 1007         vpshufb %xmm0,%xmm10,%xmm10
 1008         vmovups %xmm11,-64(%rsi)
 1009         vpshufb %xmm0,%xmm11,%xmm11
 1010         vmovups %xmm12,-48(%rsi)
 1011         vpshufb %xmm0,%xmm12,%xmm12
 1012         vmovups %xmm13,-32(%rsi)
 1013         vpshufb %xmm0,%xmm13,%xmm13
 1014         vmovups %xmm14,-16(%rsi)
 1015         vpshufb %xmm0,%xmm14,%xmm14
 1016         vmovdqu %xmm9,16(%rsp)
 1017         vmovdqu 48(%rsp),%xmm6
 1018         vmovdqu 16-32(%r9),%xmm0
 1019         vpunpckhqdq     %xmm6,%xmm6,%xmm2
 1020         vpclmulqdq      $0x00,%xmm3,%xmm7,%xmm5
 1021         vpxor   %xmm6,%xmm2,%xmm2
 1022         vpclmulqdq      $0x11,%xmm3,%xmm7,%xmm7
 1023         vpclmulqdq      $0x00,%xmm15,%xmm1,%xmm1
 1024 
 1025         vmovdqu 64(%rsp),%xmm9
 1026         vpclmulqdq      $0x00,%xmm0,%xmm6,%xmm4
 1027         vmovdqu 48-32(%r9),%xmm3
 1028         vpxor   %xmm5,%xmm4,%xmm4
 1029         vpunpckhqdq     %xmm9,%xmm9,%xmm5
 1030         vpclmulqdq      $0x11,%xmm0,%xmm6,%xmm6
 1031         vpxor   %xmm9,%xmm5,%xmm5
 1032         vpxor   %xmm7,%xmm6,%xmm6
 1033         vpclmulqdq      $0x10,%xmm15,%xmm2,%xmm2
 1034         vmovdqu 80-32(%r9),%xmm15
 1035         vpxor   %xmm1,%xmm2,%xmm2
 1036 
 1037         vmovdqu 80(%rsp),%xmm1
 1038         vpclmulqdq      $0x00,%xmm3,%xmm9,%xmm7
 1039         vmovdqu 64-32(%r9),%xmm0
 1040         vpxor   %xmm4,%xmm7,%xmm7
 1041         vpunpckhqdq     %xmm1,%xmm1,%xmm4
 1042         vpclmulqdq      $0x11,%xmm3,%xmm9,%xmm9
 1043         vpxor   %xmm1,%xmm4,%xmm4
 1044         vpxor   %xmm6,%xmm9,%xmm9
 1045         vpclmulqdq      $0x00,%xmm15,%xmm5,%xmm5
 1046         vpxor   %xmm2,%xmm5,%xmm5
 1047 
 1048         vmovdqu 96(%rsp),%xmm2
 1049         vpclmulqdq      $0x00,%xmm0,%xmm1,%xmm6
 1050         vmovdqu 96-32(%r9),%xmm3
 1051         vpxor   %xmm7,%xmm6,%xmm6
 1052         vpunpckhqdq     %xmm2,%xmm2,%xmm7
 1053         vpclmulqdq      $0x11,%xmm0,%xmm1,%xmm1
 1054         vpxor   %xmm2,%xmm7,%xmm7
 1055         vpxor   %xmm9,%xmm1,%xmm1
 1056         vpclmulqdq      $0x10,%xmm15,%xmm4,%xmm4
 1057         vmovdqu 128-32(%r9),%xmm15
 1058         vpxor   %xmm5,%xmm4,%xmm4
 1059 
 1060         vpxor   112(%rsp),%xmm8,%xmm8
 1061         vpclmulqdq      $0x00,%xmm3,%xmm2,%xmm5
 1062         vmovdqu 112-32(%r9),%xmm0
 1063         vpunpckhqdq     %xmm8,%xmm8,%xmm9
 1064         vpxor   %xmm6,%xmm5,%xmm5
 1065         vpclmulqdq      $0x11,%xmm3,%xmm2,%xmm2
 1066         vpxor   %xmm8,%xmm9,%xmm9
 1067         vpxor   %xmm1,%xmm2,%xmm2
 1068         vpclmulqdq      $0x00,%xmm15,%xmm7,%xmm7
 1069         vpxor   %xmm4,%xmm7,%xmm4
 1070 
 1071         vpclmulqdq      $0x00,%xmm0,%xmm8,%xmm6
 1072         vmovdqu 0-32(%r9),%xmm3
 1073         vpunpckhqdq     %xmm14,%xmm14,%xmm1
 1074         vpclmulqdq      $0x11,%xmm0,%xmm8,%xmm8
 1075         vpxor   %xmm14,%xmm1,%xmm1
 1076         vpxor   %xmm5,%xmm6,%xmm5
 1077         vpclmulqdq      $0x10,%xmm15,%xmm9,%xmm9
 1078         vmovdqu 32-32(%r9),%xmm15
 1079         vpxor   %xmm2,%xmm8,%xmm7
 1080         vpxor   %xmm4,%xmm9,%xmm6
 1081 
 1082         vmovdqu 16-32(%r9),%xmm0
 1083         vpxor   %xmm5,%xmm7,%xmm9
 1084         vpclmulqdq      $0x00,%xmm3,%xmm14,%xmm4
 1085         vpxor   %xmm9,%xmm6,%xmm6
 1086         vpunpckhqdq     %xmm13,%xmm13,%xmm2
 1087         vpclmulqdq      $0x11,%xmm3,%xmm14,%xmm14
 1088         vpxor   %xmm13,%xmm2,%xmm2
 1089         vpslldq $8,%xmm6,%xmm9
 1090         vpclmulqdq      $0x00,%xmm15,%xmm1,%xmm1
 1091         vpxor   %xmm9,%xmm5,%xmm8
 1092         vpsrldq $8,%xmm6,%xmm6
 1093         vpxor   %xmm6,%xmm7,%xmm7
 1094 
 1095         vpclmulqdq      $0x00,%xmm0,%xmm13,%xmm5
 1096         vmovdqu 48-32(%r9),%xmm3
 1097         vpxor   %xmm4,%xmm5,%xmm5
 1098         vpunpckhqdq     %xmm12,%xmm12,%xmm9
 1099         vpclmulqdq      $0x11,%xmm0,%xmm13,%xmm13
 1100         vpxor   %xmm12,%xmm9,%xmm9
 1101         vpxor   %xmm14,%xmm13,%xmm13
 1102         vpalignr        $8,%xmm8,%xmm8,%xmm14
 1103         vpclmulqdq      $0x10,%xmm15,%xmm2,%xmm2
 1104         vmovdqu 80-32(%r9),%xmm15
 1105         vpxor   %xmm1,%xmm2,%xmm2
 1106 
 1107         vpclmulqdq      $0x00,%xmm3,%xmm12,%xmm4
 1108         vmovdqu 64-32(%r9),%xmm0
 1109         vpxor   %xmm5,%xmm4,%xmm4
 1110         vpunpckhqdq     %xmm11,%xmm11,%xmm1
 1111         vpclmulqdq      $0x11,%xmm3,%xmm12,%xmm12
 1112         vpxor   %xmm11,%xmm1,%xmm1
 1113         vpxor   %xmm13,%xmm12,%xmm12
 1114         vxorps  16(%rsp),%xmm7,%xmm7
 1115         vpclmulqdq      $0x00,%xmm15,%xmm9,%xmm9
 1116         vpxor   %xmm2,%xmm9,%xmm9
 1117 
 1118         vpclmulqdq      $0x10,16(%r11),%xmm8,%xmm8
 1119         vxorps  %xmm14,%xmm8,%xmm8
 1120 
 1121         vpclmulqdq      $0x00,%xmm0,%xmm11,%xmm5
 1122         vmovdqu 96-32(%r9),%xmm3
 1123         vpxor   %xmm4,%xmm5,%xmm5
 1124         vpunpckhqdq     %xmm10,%xmm10,%xmm2
 1125         vpclmulqdq      $0x11,%xmm0,%xmm11,%xmm11
 1126         vpxor   %xmm10,%xmm2,%xmm2
 1127         vpalignr        $8,%xmm8,%xmm8,%xmm14
 1128         vpxor   %xmm12,%xmm11,%xmm11
 1129         vpclmulqdq      $0x10,%xmm15,%xmm1,%xmm1
 1130         vmovdqu 128-32(%r9),%xmm15
 1131         vpxor   %xmm9,%xmm1,%xmm1
 1132 
 1133         vxorps  %xmm7,%xmm14,%xmm14
 1134         vpclmulqdq      $0x10,16(%r11),%xmm8,%xmm8
 1135         vxorps  %xmm14,%xmm8,%xmm8
 1136 
 1137         vpclmulqdq      $0x00,%xmm3,%xmm10,%xmm4
 1138         vmovdqu 112-32(%r9),%xmm0
 1139         vpxor   %xmm5,%xmm4,%xmm4
 1140         vpunpckhqdq     %xmm8,%xmm8,%xmm9
 1141         vpclmulqdq      $0x11,%xmm3,%xmm10,%xmm10
 1142         vpxor   %xmm8,%xmm9,%xmm9
 1143         vpxor   %xmm11,%xmm10,%xmm10
 1144         vpclmulqdq      $0x00,%xmm15,%xmm2,%xmm2
 1145         vpxor   %xmm1,%xmm2,%xmm2
 1146 
 1147         vpclmulqdq      $0x00,%xmm0,%xmm8,%xmm5
 1148         vpclmulqdq      $0x11,%xmm0,%xmm8,%xmm7
 1149         vpxor   %xmm4,%xmm5,%xmm5
 1150         vpclmulqdq      $0x10,%xmm15,%xmm9,%xmm6
 1151         vpxor   %xmm10,%xmm7,%xmm7
 1152         vpxor   %xmm2,%xmm6,%xmm6
 1153 
 1154         vpxor   %xmm5,%xmm7,%xmm4
 1155         vpxor   %xmm4,%xmm6,%xmm6
 1156         vpslldq $8,%xmm6,%xmm1
 1157         vmovdqu 16(%r11),%xmm3
 1158         vpsrldq $8,%xmm6,%xmm6
 1159         vpxor   %xmm1,%xmm5,%xmm8
 1160         vpxor   %xmm6,%xmm7,%xmm7
 1161 
 1162         vpalignr        $8,%xmm8,%xmm8,%xmm2
 1163         vpclmulqdq      $0x10,%xmm3,%xmm8,%xmm8
 1164         vpxor   %xmm2,%xmm8,%xmm8
 1165 
 1166         vpalignr        $8,%xmm8,%xmm8,%xmm2
 1167         vpclmulqdq      $0x10,%xmm3,%xmm8,%xmm8
 1168         vpxor   %xmm7,%xmm2,%xmm2
 1169         vpxor   %xmm2,%xmm8,%xmm8
 1170         vpshufb (%r11),%xmm8,%xmm8
 1171         movq    -56(%rax),%r9
 1172 .cfi_restore    %r9
 1173         vmovdqu %xmm8,(%r9)
 1174 
 1175         vzeroupper
 1176         movq    -48(%rax),%r15
 1177 .cfi_restore    %r15
 1178         movq    -40(%rax),%r14
 1179 .cfi_restore    %r14
 1180         movq    -32(%rax),%r13
 1181 .cfi_restore    %r13
 1182         movq    -24(%rax),%r12
 1183 .cfi_restore    %r12
 1184         movq    -16(%rax),%rbp
 1185 .cfi_restore    %rbp
 1186         movq    -8(%rax),%rbx
 1187 .cfi_restore    %rbx
 1188         leaq    (%rax),%rsp
 1189 .cfi_def_cfa_register   %rsp
 1190 .Lgcm_enc_abort:
 1191         movq    %r10,%rax
 1192         RET
 1193 .cfi_endproc
 1194 SET_SIZE(aesni_gcm_encrypt)
 1195 
 1196 #endif /* !_WIN32 || _KERNEL */
 1197 
 1198 /* Some utility routines */
 1199 
 1200 /*
 1201  * clear all fpu registers
 1202  * void clear_fpu_regs_avx(void);
 1203  */
 1204 ENTRY_ALIGN(clear_fpu_regs_avx, 32)
 1205         vzeroall
 1206         RET
 1207 SET_SIZE(clear_fpu_regs_avx)
 1208 
 1209 /*
 1210  * void gcm_xor_avx(const uint8_t *src, uint8_t *dst);
 1211  *
 1212  * XORs one pair of unaligned 128-bit blocks from `src' and `dst' and
 1213  * stores the result at `dst'. The XOR is performed using FPU registers,
 1214  * so make sure FPU state is saved when running this in the kernel.
 1215  */
 1216 ENTRY_ALIGN(gcm_xor_avx, 32)
 1217         movdqu  (%rdi), %xmm0
 1218         movdqu  (%rsi), %xmm1
 1219         pxor    %xmm1, %xmm0
 1220         movdqu  %xmm0, (%rsi)
 1221         RET
 1222 SET_SIZE(gcm_xor_avx)
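
      /*
       * Rough C equivalent of gcm_xor_avx above (illustrative only):
       *
       *     void gcm_xor_avx(const uint8_t *src, uint8_t *dst)
       *     {
       *             for (int i = 0; i < 16; i++)
       *                     dst[i] ^= src[i];
       *     }
       */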
 1223 
 1224 /*
 1225  * Toggle a boolean_t value atomically and return the new value.
 1226  * boolean_t atomic_toggle_boolean_nv(volatile boolean_t *);
 1227  */
 1228 ENTRY_ALIGN(atomic_toggle_boolean_nv, 32)
 1229         xorl    %eax, %eax
 1230         lock
 1231         xorl    $1, (%rdi)
 1232         jz      1f
 1233         movl    $1, %eax
 1234 1:
 1235         RET
 1236 SET_SIZE(atomic_toggle_boolean_nv)
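
      /*
       * Rough C equivalent of atomic_toggle_boolean_nv above (illustrative
       * only; the assembly derives the return value from the zero flag of
       * the lock xor, and the memory-order choice here is an assumption):
       *
       *     boolean_t atomic_toggle_boolean_nv(volatile boolean_t *b)
       *     {
       *             return (__atomic_xor_fetch(b, 1, __ATOMIC_SEQ_CST) != 0);
       *     }
       */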
 1237 
 1238 SECTION_STATIC
 1239 
 1240 .balign 64
 1241 .Lbswap_mask:
 1242 .byte   15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
 1243 .Lpoly:
 1244 .byte   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
 1245 .Lone_msb:
 1246 .byte   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
 1247 .Ltwo_lsb:
 1248 .byte   2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
 1249 .Lone_lsb:
 1250 .byte   1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
 1251 .byte   65,69,83,45,78,73,32,71,67,77,32,109,111,100,117,108,101,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
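      /* The .byte string above is the NUL-terminated identification string
       * "AES-NI GCM module for x86_64, CRYPTOGAMS by <appro@openssl.org>". */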
 1252 .balign 64
 1253 
 1254 /* Mark the stack non-executable. */
 1255 #if defined(__linux__) && defined(__ELF__)
 1256 .section .note.GNU-stack,"",%progbits
 1257 #endif
 1258 
 1259 #endif /* defined(__x86_64__) && defined(HAVE_AVX) && defined(HAVE_AES) ... */
