The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/contrib/openzfs/module/icp/asm-x86_64/blake3/blake3_sse41.S

Version: -  FREEBSD  -  FREEBSD-13-STABLE  -  FREEBSD-13-0  -  FREEBSD-12-STABLE  -  FREEBSD-12-0  -  FREEBSD-11-STABLE  -  FREEBSD-11-0  -  FREEBSD-10-STABLE  -  FREEBSD-10-0  -  FREEBSD-9-STABLE  -  FREEBSD-9-0  -  FREEBSD-8-STABLE  -  FREEBSD-8-0  -  FREEBSD-7-STABLE  -  FREEBSD-7-0  -  FREEBSD-6-STABLE  -  FREEBSD-6-0  -  FREEBSD-5-STABLE  -  FREEBSD-5-0  -  FREEBSD-4-STABLE  -  FREEBSD-3-STABLE  -  FREEBSD22  -  l41  -  OPENBSD  -  linux-2.6  -  MK84  -  PLAN9  -  xnu-8792 
SearchContext: -  none  -  3  -  10 

    1 /*
    2  * CDDL HEADER START
    3  *
    4  * The contents of this file are subject to the terms of the
    5  * Common Development and Distribution License (the "License").
    6  * You may not use this file except in compliance with the License.
    7  *
    8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
    9  * or https://opensource.org/licenses/CDDL-1.0.
   10  * See the License for the specific language governing permissions
   11  * and limitations under the License.
   12  *
   13  * When distributing Covered Code, include this CDDL HEADER in each
   14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
   15  * If applicable, add the following below this CDDL HEADER, with the
   16  * fields enclosed by brackets "[]" replaced with your own identifying
   17  * information: Portions Copyright [yyyy] [name of copyright owner]
   18  *
   19  * CDDL HEADER END
   20  */
   21 
   22 /*
   23  * Based on BLAKE3 v1.3.1, https://github.com/BLAKE3-team/BLAKE3
   24  * Copyright (c) 2019-2020 Samuel Neves
   25  * Copyright (c) 2022 Tino Reichardt <milky-zfs@mcmilk.de>
   26  */
   27 
   28 #if defined(HAVE_SSE4_1)
   29 
   30 #define _ASM
   31 #include <sys/asm_linkage.h>
   32 
   33 .intel_syntax noprefix
   34 
   35 .text
   36 
   37 ENTRY_ALIGN(zfs_blake3_hash_many_sse41, 64)
   38         ENDBR
   39         push    r15
   40         push    r14
   41         push    r13
   42         push    r12
   43         push    rbx
   44         push    rbp
   45         mov     rbp, rsp
   46         sub     rsp, 360
   47         and     rsp, 0xFFFFFFFFFFFFFFC0
   48         neg     r9d
   49         movd    xmm0, r9d
   50         pshufd  xmm0, xmm0, 0x00
   51         movdqa  xmmword ptr [rsp+0x130], xmm0
   52         movdqa  xmm1, xmm0
   53         pand    xmm1, xmmword ptr [ADD0+rip]
   54         pand    xmm0, xmmword ptr [ADD1+rip]
   55         movdqa  xmmword ptr [rsp+0x150], xmm0
   56         movd    xmm0, r8d
   57         pshufd  xmm0, xmm0, 0x00
   58         paddd   xmm0, xmm1
   59         movdqa  xmmword ptr [rsp+0x110], xmm0
   60         pxor    xmm0, xmmword ptr [CMP_MSB_MASK+rip]
   61         pxor    xmm1, xmmword ptr [CMP_MSB_MASK+rip]
   62         pcmpgtd xmm1, xmm0
   63         shr     r8, 32
   64         movd    xmm2, r8d
   65         pshufd  xmm2, xmm2, 0x00
   66         psubd   xmm2, xmm1
   67         movdqa  xmmword ptr [rsp+0x120], xmm2
   68         mov     rbx, qword ptr [rbp+0x50]
   69         mov     r15, rdx
   70         shl     r15, 6
   71         movzx   r13d, byte ptr [rbp+0x38]
   72         movzx   r12d, byte ptr [rbp+0x48]
   73         cmp     rsi, 4
   74         jc      3f
   75 2:
   76         movdqu  xmm3, xmmword ptr [rcx]
   77         pshufd  xmm0, xmm3, 0x00
   78         pshufd  xmm1, xmm3, 0x55
   79         pshufd  xmm2, xmm3, 0xAA
   80         pshufd  xmm3, xmm3, 0xFF
   81         movdqu  xmm7, xmmword ptr [rcx+0x10]
   82         pshufd  xmm4, xmm7, 0x00
   83         pshufd  xmm5, xmm7, 0x55
   84         pshufd  xmm6, xmm7, 0xAA
   85         pshufd  xmm7, xmm7, 0xFF
   86         mov     r8, qword ptr [rdi]
   87         mov     r9, qword ptr [rdi+0x8]
   88         mov     r10, qword ptr [rdi+0x10]
   89         mov     r11, qword ptr [rdi+0x18]
   90         movzx   eax, byte ptr [rbp+0x40]
   91         or      eax, r13d
   92         xor     edx, edx
   93 9:
   94         mov     r14d, eax
   95         or      eax, r12d
   96         add     rdx, 64
   97         cmp     rdx, r15
   98         cmovne  eax, r14d
   99         movdqu  xmm8, xmmword ptr [r8+rdx-0x40]
  100         movdqu  xmm9, xmmword ptr [r9+rdx-0x40]
  101         movdqu  xmm10, xmmword ptr [r10+rdx-0x40]
  102         movdqu  xmm11, xmmword ptr [r11+rdx-0x40]
  103         movdqa  xmm12, xmm8
  104         punpckldq xmm8, xmm9
  105         punpckhdq xmm12, xmm9
  106         movdqa  xmm14, xmm10
  107         punpckldq xmm10, xmm11
  108         punpckhdq xmm14, xmm11
  109         movdqa  xmm9, xmm8
  110         punpcklqdq xmm8, xmm10
  111         punpckhqdq xmm9, xmm10
  112         movdqa  xmm13, xmm12
  113         punpcklqdq xmm12, xmm14
  114         punpckhqdq xmm13, xmm14
  115         movdqa  xmmword ptr [rsp], xmm8
  116         movdqa  xmmword ptr [rsp+0x10], xmm9
  117         movdqa  xmmword ptr [rsp+0x20], xmm12
  118         movdqa  xmmword ptr [rsp+0x30], xmm13
  119         movdqu  xmm8, xmmword ptr [r8+rdx-0x30]
  120         movdqu  xmm9, xmmword ptr [r9+rdx-0x30]
  121         movdqu  xmm10, xmmword ptr [r10+rdx-0x30]
  122         movdqu  xmm11, xmmword ptr [r11+rdx-0x30]
  123         movdqa  xmm12, xmm8
  124         punpckldq xmm8, xmm9
  125         punpckhdq xmm12, xmm9
  126         movdqa  xmm14, xmm10
  127         punpckldq xmm10, xmm11
  128         punpckhdq xmm14, xmm11
  129         movdqa  xmm9, xmm8
  130         punpcklqdq xmm8, xmm10
  131         punpckhqdq xmm9, xmm10
  132         movdqa  xmm13, xmm12
  133         punpcklqdq xmm12, xmm14
  134         punpckhqdq xmm13, xmm14
  135         movdqa  xmmword ptr [rsp+0x40], xmm8
  136         movdqa  xmmword ptr [rsp+0x50], xmm9
  137         movdqa  xmmword ptr [rsp+0x60], xmm12
  138         movdqa  xmmword ptr [rsp+0x70], xmm13
  139         movdqu  xmm8, xmmword ptr [r8+rdx-0x20]
  140         movdqu  xmm9, xmmword ptr [r9+rdx-0x20]
  141         movdqu  xmm10, xmmword ptr [r10+rdx-0x20]
  142         movdqu  xmm11, xmmword ptr [r11+rdx-0x20]
  143         movdqa  xmm12, xmm8
  144         punpckldq xmm8, xmm9
  145         punpckhdq xmm12, xmm9
  146         movdqa  xmm14, xmm10
  147         punpckldq xmm10, xmm11
  148         punpckhdq xmm14, xmm11
  149         movdqa  xmm9, xmm8
  150         punpcklqdq xmm8, xmm10
  151         punpckhqdq xmm9, xmm10
  152         movdqa  xmm13, xmm12
  153         punpcklqdq xmm12, xmm14
  154         punpckhqdq xmm13, xmm14
  155         movdqa  xmmword ptr [rsp+0x80], xmm8
  156         movdqa  xmmword ptr [rsp+0x90], xmm9
  157         movdqa  xmmword ptr [rsp+0xA0], xmm12
  158         movdqa  xmmword ptr [rsp+0xB0], xmm13
  159         movdqu  xmm8, xmmword ptr [r8+rdx-0x10]
  160         movdqu  xmm9, xmmword ptr [r9+rdx-0x10]
  161         movdqu  xmm10, xmmword ptr [r10+rdx-0x10]
  162         movdqu  xmm11, xmmword ptr [r11+rdx-0x10]
  163         movdqa  xmm12, xmm8
  164         punpckldq xmm8, xmm9
  165         punpckhdq xmm12, xmm9
  166         movdqa  xmm14, xmm10
  167         punpckldq xmm10, xmm11
  168         punpckhdq xmm14, xmm11
  169         movdqa  xmm9, xmm8
  170         punpcklqdq xmm8, xmm10
  171         punpckhqdq xmm9, xmm10
  172         movdqa  xmm13, xmm12
  173         punpcklqdq xmm12, xmm14
  174         punpckhqdq xmm13, xmm14
  175         movdqa  xmmword ptr [rsp+0xC0], xmm8
  176         movdqa  xmmword ptr [rsp+0xD0], xmm9
  177         movdqa  xmmword ptr [rsp+0xE0], xmm12
  178         movdqa  xmmword ptr [rsp+0xF0], xmm13
  179         movdqa  xmm9, xmmword ptr [BLAKE3_IV_1+rip]
  180         movdqa  xmm10, xmmword ptr [BLAKE3_IV_2+rip]
  181         movdqa  xmm11, xmmword ptr [BLAKE3_IV_3+rip]
  182         movdqa  xmm12, xmmword ptr [rsp+0x110]
  183         movdqa  xmm13, xmmword ptr [rsp+0x120]
  184         movdqa  xmm14, xmmword ptr [BLAKE3_BLOCK_LEN+rip]
  185         movd    xmm15, eax
  186         pshufd  xmm15, xmm15, 0x00
  187         prefetcht0 [r8+rdx+0x80]
  188         prefetcht0 [r9+rdx+0x80]
  189         prefetcht0 [r10+rdx+0x80]
  190         prefetcht0 [r11+rdx+0x80]
  191         paddd   xmm0, xmmword ptr [rsp]
  192         paddd   xmm1, xmmword ptr [rsp+0x20]
  193         paddd   xmm2, xmmword ptr [rsp+0x40]
  194         paddd   xmm3, xmmword ptr [rsp+0x60]
  195         paddd   xmm0, xmm4
  196         paddd   xmm1, xmm5
  197         paddd   xmm2, xmm6
  198         paddd   xmm3, xmm7
  199         pxor    xmm12, xmm0
  200         pxor    xmm13, xmm1
  201         pxor    xmm14, xmm2
  202         pxor    xmm15, xmm3
  203         movdqa  xmm8, xmmword ptr [ROT16+rip]
  204         pshufb  xmm12, xmm8
  205         pshufb  xmm13, xmm8
  206         pshufb  xmm14, xmm8
  207         pshufb  xmm15, xmm8
  208         movdqa  xmm8, xmmword ptr [BLAKE3_IV_0+rip]
  209         paddd   xmm8, xmm12
  210         paddd   xmm9, xmm13
  211         paddd   xmm10, xmm14
  212         paddd   xmm11, xmm15
  213         pxor    xmm4, xmm8
  214         pxor    xmm5, xmm9
  215         pxor    xmm6, xmm10
  216         pxor    xmm7, xmm11
  217         movdqa  xmmword ptr [rsp+0x100], xmm8
  218         movdqa  xmm8, xmm4
  219         psrld   xmm8, 12
  220         pslld   xmm4, 20
  221         por     xmm4, xmm8
  222         movdqa  xmm8, xmm5
  223         psrld   xmm8, 12
  224         pslld   xmm5, 20
  225         por     xmm5, xmm8
  226         movdqa  xmm8, xmm6
  227         psrld   xmm8, 12
  228         pslld   xmm6, 20
  229         por     xmm6, xmm8
  230         movdqa  xmm8, xmm7
  231         psrld   xmm8, 12
  232         pslld   xmm7, 20
  233         por     xmm7, xmm8
  234         paddd   xmm0, xmmword ptr [rsp+0x10]
  235         paddd   xmm1, xmmword ptr [rsp+0x30]
  236         paddd   xmm2, xmmword ptr [rsp+0x50]
  237         paddd   xmm3, xmmword ptr [rsp+0x70]
  238         paddd   xmm0, xmm4
  239         paddd   xmm1, xmm5
  240         paddd   xmm2, xmm6
  241         paddd   xmm3, xmm7
  242         pxor    xmm12, xmm0
  243         pxor    xmm13, xmm1
  244         pxor    xmm14, xmm2
  245         pxor    xmm15, xmm3
  246         movdqa  xmm8, xmmword ptr [ROT8+rip]
  247         pshufb  xmm12, xmm8
  248         pshufb  xmm13, xmm8
  249         pshufb  xmm14, xmm8
  250         pshufb  xmm15, xmm8
  251         movdqa  xmm8, xmmword ptr [rsp+0x100]
  252         paddd   xmm8, xmm12
  253         paddd   xmm9, xmm13
  254         paddd   xmm10, xmm14
  255         paddd   xmm11, xmm15
  256         pxor    xmm4, xmm8
  257         pxor    xmm5, xmm9
  258         pxor    xmm6, xmm10
  259         pxor    xmm7, xmm11
  260         movdqa  xmmword ptr [rsp+0x100], xmm8
  261         movdqa  xmm8, xmm4
  262         psrld   xmm8, 7
  263         pslld   xmm4, 25
  264         por     xmm4, xmm8
  265         movdqa  xmm8, xmm5
  266         psrld   xmm8, 7
  267         pslld   xmm5, 25
  268         por     xmm5, xmm8
  269         movdqa  xmm8, xmm6
  270         psrld   xmm8, 7
  271         pslld   xmm6, 25
  272         por     xmm6, xmm8
  273         movdqa  xmm8, xmm7
  274         psrld   xmm8, 7
  275         pslld   xmm7, 25
  276         por     xmm7, xmm8
  277         paddd   xmm0, xmmword ptr [rsp+0x80]
  278         paddd   xmm1, xmmword ptr [rsp+0xA0]
  279         paddd   xmm2, xmmword ptr [rsp+0xC0]
  280         paddd   xmm3, xmmword ptr [rsp+0xE0]
  281         paddd   xmm0, xmm5
  282         paddd   xmm1, xmm6
  283         paddd   xmm2, xmm7
  284         paddd   xmm3, xmm4
  285         pxor    xmm15, xmm0
  286         pxor    xmm12, xmm1
  287         pxor    xmm13, xmm2
  288         pxor    xmm14, xmm3
  289         movdqa  xmm8, xmmword ptr [ROT16+rip]
  290         pshufb  xmm15, xmm8
  291         pshufb  xmm12, xmm8
  292         pshufb  xmm13, xmm8
  293         pshufb  xmm14, xmm8
  294         paddd   xmm10, xmm15
  295         paddd   xmm11, xmm12
  296         movdqa  xmm8, xmmword ptr [rsp+0x100]
  297         paddd   xmm8, xmm13
  298         paddd   xmm9, xmm14
  299         pxor    xmm5, xmm10
  300         pxor    xmm6, xmm11
  301         pxor    xmm7, xmm8
  302         pxor    xmm4, xmm9
  303         movdqa  xmmword ptr [rsp+0x100], xmm8
  304         movdqa  xmm8, xmm5
  305         psrld   xmm8, 12
  306         pslld   xmm5, 20
  307         por     xmm5, xmm8
  308         movdqa  xmm8, xmm6
  309         psrld   xmm8, 12
  310         pslld   xmm6, 20
  311         por     xmm6, xmm8
  312         movdqa  xmm8, xmm7
  313         psrld   xmm8, 12
  314         pslld   xmm7, 20
  315         por     xmm7, xmm8
  316         movdqa  xmm8, xmm4
  317         psrld   xmm8, 12
  318         pslld   xmm4, 20
  319         por     xmm4, xmm8
  320         paddd   xmm0, xmmword ptr [rsp+0x90]
  321         paddd   xmm1, xmmword ptr [rsp+0xB0]
  322         paddd   xmm2, xmmword ptr [rsp+0xD0]
  323         paddd   xmm3, xmmword ptr [rsp+0xF0]
  324         paddd   xmm0, xmm5
  325         paddd   xmm1, xmm6
  326         paddd   xmm2, xmm7
  327         paddd   xmm3, xmm4
  328         pxor    xmm15, xmm0
  329         pxor    xmm12, xmm1
  330         pxor    xmm13, xmm2
  331         pxor    xmm14, xmm3
  332         movdqa  xmm8, xmmword ptr [ROT8+rip]
  333         pshufb  xmm15, xmm8
  334         pshufb  xmm12, xmm8
  335         pshufb  xmm13, xmm8
  336         pshufb  xmm14, xmm8
  337         paddd   xmm10, xmm15
  338         paddd   xmm11, xmm12
  339         movdqa  xmm8, xmmword ptr [rsp+0x100]
  340         paddd   xmm8, xmm13
  341         paddd   xmm9, xmm14
  342         pxor    xmm5, xmm10
  343         pxor    xmm6, xmm11
  344         pxor    xmm7, xmm8
  345         pxor    xmm4, xmm9
  346         movdqa  xmmword ptr [rsp+0x100], xmm8
  347         movdqa  xmm8, xmm5
  348         psrld   xmm8, 7
  349         pslld   xmm5, 25
  350         por     xmm5, xmm8
  351         movdqa  xmm8, xmm6
  352         psrld   xmm8, 7
  353         pslld   xmm6, 25
  354         por     xmm6, xmm8
  355         movdqa  xmm8, xmm7
  356         psrld   xmm8, 7
  357         pslld   xmm7, 25
  358         por     xmm7, xmm8
  359         movdqa  xmm8, xmm4
  360         psrld   xmm8, 7
  361         pslld   xmm4, 25
  362         por     xmm4, xmm8
  363         paddd   xmm0, xmmword ptr [rsp+0x20]
  364         paddd   xmm1, xmmword ptr [rsp+0x30]
  365         paddd   xmm2, xmmword ptr [rsp+0x70]
  366         paddd   xmm3, xmmword ptr [rsp+0x40]
  367         paddd   xmm0, xmm4
  368         paddd   xmm1, xmm5
  369         paddd   xmm2, xmm6
  370         paddd   xmm3, xmm7
  371         pxor    xmm12, xmm0
  372         pxor    xmm13, xmm1
  373         pxor    xmm14, xmm2
  374         pxor    xmm15, xmm3
  375         movdqa  xmm8, xmmword ptr [ROT16+rip]
  376         pshufb  xmm12, xmm8
  377         pshufb  xmm13, xmm8
  378         pshufb  xmm14, xmm8
  379         pshufb  xmm15, xmm8
  380         movdqa  xmm8, xmmword ptr [rsp+0x100]
  381         paddd   xmm8, xmm12
  382         paddd   xmm9, xmm13
  383         paddd   xmm10, xmm14
  384         paddd   xmm11, xmm15
  385         pxor    xmm4, xmm8
  386         pxor    xmm5, xmm9
  387         pxor    xmm6, xmm10
  388         pxor    xmm7, xmm11
  389         movdqa  xmmword ptr [rsp+0x100], xmm8
  390         movdqa  xmm8, xmm4
  391         psrld   xmm8, 12
  392         pslld   xmm4, 20
  393         por     xmm4, xmm8
  394         movdqa  xmm8, xmm5
  395         psrld   xmm8, 12
  396         pslld   xmm5, 20
  397         por     xmm5, xmm8
  398         movdqa  xmm8, xmm6
  399         psrld   xmm8, 12
  400         pslld   xmm6, 20
  401         por     xmm6, xmm8
  402         movdqa  xmm8, xmm7
  403         psrld   xmm8, 12
  404         pslld   xmm7, 20
  405         por     xmm7, xmm8
  406         paddd   xmm0, xmmword ptr [rsp+0x60]
  407         paddd   xmm1, xmmword ptr [rsp+0xA0]
  408         paddd   xmm2, xmmword ptr [rsp]
  409         paddd   xmm3, xmmword ptr [rsp+0xD0]
  410         paddd   xmm0, xmm4
  411         paddd   xmm1, xmm5
  412         paddd   xmm2, xmm6
  413         paddd   xmm3, xmm7
  414         pxor    xmm12, xmm0
  415         pxor    xmm13, xmm1
  416         pxor    xmm14, xmm2
  417         pxor    xmm15, xmm3
  418         movdqa  xmm8, xmmword ptr [ROT8+rip]
  419         pshufb  xmm12, xmm8
  420         pshufb  xmm13, xmm8
  421         pshufb  xmm14, xmm8
  422         pshufb  xmm15, xmm8
  423         movdqa  xmm8, xmmword ptr [rsp+0x100]
  424         paddd   xmm8, xmm12
  425         paddd   xmm9, xmm13
  426         paddd   xmm10, xmm14
  427         paddd   xmm11, xmm15
  428         pxor    xmm4, xmm8
  429         pxor    xmm5, xmm9
  430         pxor    xmm6, xmm10
  431         pxor    xmm7, xmm11
  432         movdqa  xmmword ptr [rsp+0x100], xmm8
  433         movdqa  xmm8, xmm4
  434         psrld   xmm8, 7
  435         pslld   xmm4, 25
  436         por     xmm4, xmm8
  437         movdqa  xmm8, xmm5
  438         psrld   xmm8, 7
  439         pslld   xmm5, 25
  440         por     xmm5, xmm8
  441         movdqa  xmm8, xmm6
  442         psrld   xmm8, 7
  443         pslld   xmm6, 25
  444         por     xmm6, xmm8
  445         movdqa  xmm8, xmm7
  446         psrld   xmm8, 7
  447         pslld   xmm7, 25
  448         por     xmm7, xmm8
  449         paddd   xmm0, xmmword ptr [rsp+0x10]
  450         paddd   xmm1, xmmword ptr [rsp+0xC0]
  451         paddd   xmm2, xmmword ptr [rsp+0x90]
  452         paddd   xmm3, xmmword ptr [rsp+0xF0]
  453         paddd   xmm0, xmm5
  454         paddd   xmm1, xmm6
  455         paddd   xmm2, xmm7
  456         paddd   xmm3, xmm4
  457         pxor    xmm15, xmm0
  458         pxor    xmm12, xmm1
  459         pxor    xmm13, xmm2
  460         pxor    xmm14, xmm3
  461         movdqa  xmm8, xmmword ptr [ROT16+rip]
  462         pshufb  xmm15, xmm8
  463         pshufb  xmm12, xmm8
  464         pshufb  xmm13, xmm8
  465         pshufb  xmm14, xmm8
  466         paddd   xmm10, xmm15
  467         paddd   xmm11, xmm12
  468         movdqa  xmm8, xmmword ptr [rsp+0x100]
  469         paddd   xmm8, xmm13
  470         paddd   xmm9, xmm14
  471         pxor    xmm5, xmm10
  472         pxor    xmm6, xmm11
  473         pxor    xmm7, xmm8
  474         pxor    xmm4, xmm9
  475         movdqa  xmmword ptr [rsp+0x100], xmm8
  476         movdqa  xmm8, xmm5
  477         psrld   xmm8, 12
  478         pslld   xmm5, 20
  479         por     xmm5, xmm8
  480         movdqa  xmm8, xmm6
  481         psrld   xmm8, 12
  482         pslld   xmm6, 20
  483         por     xmm6, xmm8
  484         movdqa  xmm8, xmm7
  485         psrld   xmm8, 12
  486         pslld   xmm7, 20
  487         por     xmm7, xmm8
  488         movdqa  xmm8, xmm4
  489         psrld   xmm8, 12
  490         pslld   xmm4, 20
  491         por     xmm4, xmm8
  492         paddd   xmm0, xmmword ptr [rsp+0xB0]
  493         paddd   xmm1, xmmword ptr [rsp+0x50]
  494         paddd   xmm2, xmmword ptr [rsp+0xE0]
  495         paddd   xmm3, xmmword ptr [rsp+0x80]
  496         paddd   xmm0, xmm5
  497         paddd   xmm1, xmm6
  498         paddd   xmm2, xmm7
  499         paddd   xmm3, xmm4
  500         pxor    xmm15, xmm0
  501         pxor    xmm12, xmm1
  502         pxor    xmm13, xmm2
  503         pxor    xmm14, xmm3
  504         movdqa  xmm8, xmmword ptr [ROT8+rip]
  505         pshufb  xmm15, xmm8
  506         pshufb  xmm12, xmm8
  507         pshufb  xmm13, xmm8
  508         pshufb  xmm14, xmm8
  509         paddd   xmm10, xmm15
  510         paddd   xmm11, xmm12
  511         movdqa  xmm8, xmmword ptr [rsp+0x100]
  512         paddd   xmm8, xmm13
  513         paddd   xmm9, xmm14
  514         pxor    xmm5, xmm10
  515         pxor    xmm6, xmm11
  516         pxor    xmm7, xmm8
  517         pxor    xmm4, xmm9
  518         movdqa  xmmword ptr [rsp+0x100], xmm8
  519         movdqa  xmm8, xmm5
  520         psrld   xmm8, 7
  521         pslld   xmm5, 25
  522         por     xmm5, xmm8
  523         movdqa  xmm8, xmm6
  524         psrld   xmm8, 7
  525         pslld   xmm6, 25
  526         por     xmm6, xmm8
  527         movdqa  xmm8, xmm7
  528         psrld   xmm8, 7
  529         pslld   xmm7, 25
  530         por     xmm7, xmm8
  531         movdqa  xmm8, xmm4
  532         psrld   xmm8, 7
  533         pslld   xmm4, 25
  534         por     xmm4, xmm8
  535         paddd   xmm0, xmmword ptr [rsp+0x30]
  536         paddd   xmm1, xmmword ptr [rsp+0xA0]
  537         paddd   xmm2, xmmword ptr [rsp+0xD0]
  538         paddd   xmm3, xmmword ptr [rsp+0x70]
  539         paddd   xmm0, xmm4
  540         paddd   xmm1, xmm5
  541         paddd   xmm2, xmm6
  542         paddd   xmm3, xmm7
  543         pxor    xmm12, xmm0
  544         pxor    xmm13, xmm1
  545         pxor    xmm14, xmm2
  546         pxor    xmm15, xmm3
  547         movdqa  xmm8, xmmword ptr [ROT16+rip]
  548         pshufb  xmm12, xmm8
  549         pshufb  xmm13, xmm8
  550         pshufb  xmm14, xmm8
  551         pshufb  xmm15, xmm8
  552         movdqa  xmm8, xmmword ptr [rsp+0x100]
  553         paddd   xmm8, xmm12
  554         paddd   xmm9, xmm13
  555         paddd   xmm10, xmm14
  556         paddd   xmm11, xmm15
  557         pxor    xmm4, xmm8
  558         pxor    xmm5, xmm9
  559         pxor    xmm6, xmm10
  560         pxor    xmm7, xmm11
  561         movdqa  xmmword ptr [rsp+0x100], xmm8
  562         movdqa  xmm8, xmm4
  563         psrld   xmm8, 12
  564         pslld   xmm4, 20
  565         por     xmm4, xmm8
  566         movdqa  xmm8, xmm5
  567         psrld   xmm8, 12
  568         pslld   xmm5, 20
  569         por     xmm5, xmm8
  570         movdqa  xmm8, xmm6
  571         psrld   xmm8, 12
  572         pslld   xmm6, 20
  573         por     xmm6, xmm8
  574         movdqa  xmm8, xmm7
  575         psrld   xmm8, 12
  576         pslld   xmm7, 20
  577         por     xmm7, xmm8
  578         paddd   xmm0, xmmword ptr [rsp+0x40]
  579         paddd   xmm1, xmmword ptr [rsp+0xC0]
  580         paddd   xmm2, xmmword ptr [rsp+0x20]
  581         paddd   xmm3, xmmword ptr [rsp+0xE0]
  582         paddd   xmm0, xmm4
  583         paddd   xmm1, xmm5
  584         paddd   xmm2, xmm6
  585         paddd   xmm3, xmm7
  586         pxor    xmm12, xmm0
  587         pxor    xmm13, xmm1
  588         pxor    xmm14, xmm2
  589         pxor    xmm15, xmm3
  590         movdqa  xmm8, xmmword ptr [ROT8+rip]
  591         pshufb  xmm12, xmm8
  592         pshufb  xmm13, xmm8
  593         pshufb  xmm14, xmm8
  594         pshufb  xmm15, xmm8
  595         movdqa  xmm8, xmmword ptr [rsp+0x100]
  596         paddd   xmm8, xmm12
  597         paddd   xmm9, xmm13
  598         paddd   xmm10, xmm14
  599         paddd   xmm11, xmm15
  600         pxor    xmm4, xmm8
  601         pxor    xmm5, xmm9
  602         pxor    xmm6, xmm10
  603         pxor    xmm7, xmm11
  604         movdqa  xmmword ptr [rsp+0x100], xmm8
  605         movdqa  xmm8, xmm4
  606         psrld   xmm8, 7
  607         pslld   xmm4, 25
  608         por     xmm4, xmm8
  609         movdqa  xmm8, xmm5
  610         psrld   xmm8, 7
  611         pslld   xmm5, 25
  612         por     xmm5, xmm8
  613         movdqa  xmm8, xmm6
  614         psrld   xmm8, 7
  615         pslld   xmm6, 25
  616         por     xmm6, xmm8
  617         movdqa  xmm8, xmm7
  618         psrld   xmm8, 7
  619         pslld   xmm7, 25
  620         por     xmm7, xmm8
  621         paddd   xmm0, xmmword ptr [rsp+0x60]
  622         paddd   xmm1, xmmword ptr [rsp+0x90]
  623         paddd   xmm2, xmmword ptr [rsp+0xB0]
  624         paddd   xmm3, xmmword ptr [rsp+0x80]
  625         paddd   xmm0, xmm5
  626         paddd   xmm1, xmm6
  627         paddd   xmm2, xmm7
  628         paddd   xmm3, xmm4
  629         pxor    xmm15, xmm0
  630         pxor    xmm12, xmm1
  631         pxor    xmm13, xmm2
  632         pxor    xmm14, xmm3
  633         movdqa  xmm8, xmmword ptr [ROT16+rip]
  634         pshufb  xmm15, xmm8
  635         pshufb  xmm12, xmm8
  636         pshufb  xmm13, xmm8
  637         pshufb  xmm14, xmm8
  638         paddd   xmm10, xmm15
  639         paddd   xmm11, xmm12
  640         movdqa  xmm8, xmmword ptr [rsp+0x100]
  641         paddd   xmm8, xmm13
  642         paddd   xmm9, xmm14
  643         pxor    xmm5, xmm10
  644         pxor    xmm6, xmm11
  645         pxor    xmm7, xmm8
  646         pxor    xmm4, xmm9
  647         movdqa  xmmword ptr [rsp+0x100], xmm8
  648         movdqa  xmm8, xmm5
  649         psrld   xmm8, 12
  650         pslld   xmm5, 20
  651         por     xmm5, xmm8
  652         movdqa  xmm8, xmm6
  653         psrld   xmm8, 12
  654         pslld   xmm6, 20
  655         por     xmm6, xmm8
  656         movdqa  xmm8, xmm7
  657         psrld   xmm8, 12
  658         pslld   xmm7, 20
  659         por     xmm7, xmm8
  660         movdqa  xmm8, xmm4
  661         psrld   xmm8, 12
  662         pslld   xmm4, 20
  663         por     xmm4, xmm8
  664         paddd   xmm0, xmmword ptr [rsp+0x50]
  665         paddd   xmm1, xmmword ptr [rsp]
  666         paddd   xmm2, xmmword ptr [rsp+0xF0]
  667         paddd   xmm3, xmmword ptr [rsp+0x10]
  668         paddd   xmm0, xmm5
  669         paddd   xmm1, xmm6
  670         paddd   xmm2, xmm7
  671         paddd   xmm3, xmm4
  672         pxor    xmm15, xmm0
  673         pxor    xmm12, xmm1
  674         pxor    xmm13, xmm2
  675         pxor    xmm14, xmm3
  676         movdqa  xmm8, xmmword ptr [ROT8+rip]
  677         pshufb  xmm15, xmm8
  678         pshufb  xmm12, xmm8
  679         pshufb  xmm13, xmm8
  680         pshufb  xmm14, xmm8
  681         paddd   xmm10, xmm15
  682         paddd   xmm11, xmm12
  683         movdqa  xmm8, xmmword ptr [rsp+0x100]
  684         paddd   xmm8, xmm13
  685         paddd   xmm9, xmm14
  686         pxor    xmm5, xmm10
  687         pxor    xmm6, xmm11
  688         pxor    xmm7, xmm8
  689         pxor    xmm4, xmm9
  690         movdqa  xmmword ptr [rsp+0x100], xmm8
  691         movdqa  xmm8, xmm5
  692         psrld   xmm8, 7
  693         pslld   xmm5, 25
  694         por     xmm5, xmm8
  695         movdqa  xmm8, xmm6
  696         psrld   xmm8, 7
  697         pslld   xmm6, 25
  698         por     xmm6, xmm8
  699         movdqa  xmm8, xmm7
  700         psrld   xmm8, 7
  701         pslld   xmm7, 25
  702         por     xmm7, xmm8
  703         movdqa  xmm8, xmm4
  704         psrld   xmm8, 7
  705         pslld   xmm4, 25
  706         por     xmm4, xmm8
  707         paddd   xmm0, xmmword ptr [rsp+0xA0]
  708         paddd   xmm1, xmmword ptr [rsp+0xC0]
  709         paddd   xmm2, xmmword ptr [rsp+0xE0]
  710         paddd   xmm3, xmmword ptr [rsp+0xD0]
  711         paddd   xmm0, xmm4
  712         paddd   xmm1, xmm5
  713         paddd   xmm2, xmm6
  714         paddd   xmm3, xmm7
  715         pxor    xmm12, xmm0
  716         pxor    xmm13, xmm1
  717         pxor    xmm14, xmm2
  718         pxor    xmm15, xmm3
  719         movdqa  xmm8, xmmword ptr [ROT16+rip]
  720         pshufb  xmm12, xmm8
  721         pshufb  xmm13, xmm8
  722         pshufb  xmm14, xmm8
  723         pshufb  xmm15, xmm8
  724         movdqa  xmm8, xmmword ptr [rsp+0x100]
  725         paddd   xmm8, xmm12
  726         paddd   xmm9, xmm13
  727         paddd   xmm10, xmm14
  728         paddd   xmm11, xmm15
  729         pxor    xmm4, xmm8
  730         pxor    xmm5, xmm9
  731         pxor    xmm6, xmm10
  732         pxor    xmm7, xmm11
  733         movdqa  xmmword ptr [rsp+0x100], xmm8
  734         movdqa  xmm8, xmm4
  735         psrld   xmm8, 12
  736         pslld   xmm4, 20
  737         por     xmm4, xmm8
  738         movdqa  xmm8, xmm5
  739         psrld   xmm8, 12
  740         pslld   xmm5, 20
  741         por     xmm5, xmm8
  742         movdqa  xmm8, xmm6
  743         psrld   xmm8, 12
  744         pslld   xmm6, 20
  745         por     xmm6, xmm8
  746         movdqa  xmm8, xmm7
  747         psrld   xmm8, 12
  748         pslld   xmm7, 20
  749         por     xmm7, xmm8
  750         paddd   xmm0, xmmword ptr [rsp+0x70]
  751         paddd   xmm1, xmmword ptr [rsp+0x90]
  752         paddd   xmm2, xmmword ptr [rsp+0x30]
  753         paddd   xmm3, xmmword ptr [rsp+0xF0]
  754         paddd   xmm0, xmm4
  755         paddd   xmm1, xmm5
  756         paddd   xmm2, xmm6
  757         paddd   xmm3, xmm7
  758         pxor    xmm12, xmm0
  759         pxor    xmm13, xmm1
  760         pxor    xmm14, xmm2
  761         pxor    xmm15, xmm3
  762         movdqa  xmm8, xmmword ptr [ROT8+rip]
  763         pshufb  xmm12, xmm8
  764         pshufb  xmm13, xmm8
  765         pshufb  xmm14, xmm8
  766         pshufb  xmm15, xmm8
  767         movdqa  xmm8, xmmword ptr [rsp+0x100]
  768         paddd   xmm8, xmm12
  769         paddd   xmm9, xmm13
  770         paddd   xmm10, xmm14
  771         paddd   xmm11, xmm15
  772         pxor    xmm4, xmm8
  773         pxor    xmm5, xmm9
  774         pxor    xmm6, xmm10
  775         pxor    xmm7, xmm11
  776         movdqa  xmmword ptr [rsp+0x100], xmm8
  777         movdqa  xmm8, xmm4
  778         psrld   xmm8, 7
  779         pslld   xmm4, 25
  780         por     xmm4, xmm8
  781         movdqa  xmm8, xmm5
  782         psrld   xmm8, 7
  783         pslld   xmm5, 25
  784         por     xmm5, xmm8
  785         movdqa  xmm8, xmm6
  786         psrld   xmm8, 7
  787         pslld   xmm6, 25
  788         por     xmm6, xmm8
  789         movdqa  xmm8, xmm7
  790         psrld   xmm8, 7
  791         pslld   xmm7, 25
  792         por     xmm7, xmm8
  793         paddd   xmm0, xmmword ptr [rsp+0x40]
  794         paddd   xmm1, xmmword ptr [rsp+0xB0]
  795         paddd   xmm2, xmmword ptr [rsp+0x50]
  796         paddd   xmm3, xmmword ptr [rsp+0x10]
  797         paddd   xmm0, xmm5
  798         paddd   xmm1, xmm6
  799         paddd   xmm2, xmm7
  800         paddd   xmm3, xmm4
  801         pxor    xmm15, xmm0
  802         pxor    xmm12, xmm1
  803         pxor    xmm13, xmm2
  804         pxor    xmm14, xmm3
  805         movdqa  xmm8, xmmword ptr [ROT16+rip]
  806         pshufb  xmm15, xmm8
  807         pshufb  xmm12, xmm8
  808         pshufb  xmm13, xmm8
  809         pshufb  xmm14, xmm8
  810         paddd   xmm10, xmm15
  811         paddd   xmm11, xmm12
  812         movdqa  xmm8, xmmword ptr [rsp+0x100]
  813         paddd   xmm8, xmm13
  814         paddd   xmm9, xmm14
  815         pxor    xmm5, xmm10
  816         pxor    xmm6, xmm11
  817         pxor    xmm7, xmm8
  818         pxor    xmm4, xmm9
  819         movdqa  xmmword ptr [rsp+0x100], xmm8
  820         movdqa  xmm8, xmm5
  821         psrld   xmm8, 12
  822         pslld   xmm5, 20
  823         por     xmm5, xmm8
  824         movdqa  xmm8, xmm6
  825         psrld   xmm8, 12
  826         pslld   xmm6, 20
  827         por     xmm6, xmm8
  828         movdqa  xmm8, xmm7
  829         psrld   xmm8, 12
  830         pslld   xmm7, 20
  831         por     xmm7, xmm8
  832         movdqa  xmm8, xmm4
  833         psrld   xmm8, 12
  834         pslld   xmm4, 20
  835         por     xmm4, xmm8
  836         paddd   xmm0, xmmword ptr [rsp]
  837         paddd   xmm1, xmmword ptr [rsp+0x20]
  838         paddd   xmm2, xmmword ptr [rsp+0x80]
  839         paddd   xmm3, xmmword ptr [rsp+0x60]
  840         paddd   xmm0, xmm5
  841         paddd   xmm1, xmm6
  842         paddd   xmm2, xmm7
  843         paddd   xmm3, xmm4
  844         pxor    xmm15, xmm0
  845         pxor    xmm12, xmm1
  846         pxor    xmm13, xmm2
  847         pxor    xmm14, xmm3
  848         movdqa  xmm8, xmmword ptr [ROT8+rip]
  849         pshufb  xmm15, xmm8
  850         pshufb  xmm12, xmm8
  851         pshufb  xmm13, xmm8
  852         pshufb  xmm14, xmm8
  853         paddd   xmm10, xmm15
  854         paddd   xmm11, xmm12
  855         movdqa  xmm8, xmmword ptr [rsp+0x100]
  856         paddd   xmm8, xmm13
  857         paddd   xmm9, xmm14
  858         pxor    xmm5, xmm10
  859         pxor    xmm6, xmm11
  860         pxor    xmm7, xmm8
  861         pxor    xmm4, xmm9
  862         movdqa  xmmword ptr [rsp+0x100], xmm8
  863         movdqa  xmm8, xmm5
  864         psrld   xmm8, 7
  865         pslld   xmm5, 25
  866         por     xmm5, xmm8
  867         movdqa  xmm8, xmm6
  868         psrld   xmm8, 7
  869         pslld   xmm6, 25
  870         por     xmm6, xmm8
  871         movdqa  xmm8, xmm7
  872         psrld   xmm8, 7
  873         pslld   xmm7, 25
  874         por     xmm7, xmm8
  875         movdqa  xmm8, xmm4
  876         psrld   xmm8, 7
  877         pslld   xmm4, 25
  878         por     xmm4, xmm8
  879         paddd   xmm0, xmmword ptr [rsp+0xC0]
  880         paddd   xmm1, xmmword ptr [rsp+0x90]
  881         paddd   xmm2, xmmword ptr [rsp+0xF0]
  882         paddd   xmm3, xmmword ptr [rsp+0xE0]
  883         paddd   xmm0, xmm4
  884         paddd   xmm1, xmm5
  885         paddd   xmm2, xmm6
  886         paddd   xmm3, xmm7
  887         pxor    xmm12, xmm0
  888         pxor    xmm13, xmm1
  889         pxor    xmm14, xmm2
  890         pxor    xmm15, xmm3
  891         movdqa  xmm8, xmmword ptr [ROT16+rip]
  892         pshufb  xmm12, xmm8
  893         pshufb  xmm13, xmm8
  894         pshufb  xmm14, xmm8
  895         pshufb  xmm15, xmm8
  896         movdqa  xmm8, xmmword ptr [rsp+0x100]
  897         paddd   xmm8, xmm12
  898         paddd   xmm9, xmm13
  899         paddd   xmm10, xmm14
  900         paddd   xmm11, xmm15
  901         pxor    xmm4, xmm8
  902         pxor    xmm5, xmm9
  903         pxor    xmm6, xmm10
  904         pxor    xmm7, xmm11
  905         movdqa  xmmword ptr [rsp+0x100], xmm8
  906         movdqa  xmm8, xmm4
  907         psrld   xmm8, 12
  908         pslld   xmm4, 20
  909         por     xmm4, xmm8
  910         movdqa  xmm8, xmm5
  911         psrld   xmm8, 12
  912         pslld   xmm5, 20
  913         por     xmm5, xmm8
  914         movdqa  xmm8, xmm6
  915         psrld   xmm8, 12
  916         pslld   xmm6, 20
  917         por     xmm6, xmm8
  918         movdqa  xmm8, xmm7
  919         psrld   xmm8, 12
  920         pslld   xmm7, 20
  921         por     xmm7, xmm8
  922         paddd   xmm0, xmmword ptr [rsp+0xD0]
  923         paddd   xmm1, xmmword ptr [rsp+0xB0]
  924         paddd   xmm2, xmmword ptr [rsp+0xA0]
  925         paddd   xmm3, xmmword ptr [rsp+0x80]
  926         paddd   xmm0, xmm4
  927         paddd   xmm1, xmm5
  928         paddd   xmm2, xmm6
  929         paddd   xmm3, xmm7
  930         pxor    xmm12, xmm0
  931         pxor    xmm13, xmm1
  932         pxor    xmm14, xmm2
  933         pxor    xmm15, xmm3
  934         movdqa  xmm8, xmmword ptr [ROT8+rip]
  935         pshufb  xmm12, xmm8
  936         pshufb  xmm13, xmm8
  937         pshufb  xmm14, xmm8
  938         pshufb  xmm15, xmm8
  939         movdqa  xmm8, xmmword ptr [rsp+0x100]
  940         paddd   xmm8, xmm12
  941         paddd   xmm9, xmm13
  942         paddd   xmm10, xmm14
  943         paddd   xmm11, xmm15
  944         pxor    xmm4, xmm8
  945         pxor    xmm5, xmm9
  946         pxor    xmm6, xmm10
  947         pxor    xmm7, xmm11
  948         movdqa  xmmword ptr [rsp+0x100], xmm8
  949         movdqa  xmm8, xmm4
  950         psrld   xmm8, 7
  951         pslld   xmm4, 25
  952         por     xmm4, xmm8
  953         movdqa  xmm8, xmm5
  954         psrld   xmm8, 7
  955         pslld   xmm5, 25
  956         por     xmm5, xmm8
  957         movdqa  xmm8, xmm6
  958         psrld   xmm8, 7
  959         pslld   xmm6, 25
  960         por     xmm6, xmm8
  961         movdqa  xmm8, xmm7
  962         psrld   xmm8, 7
  963         pslld   xmm7, 25
  964         por     xmm7, xmm8
  965         paddd   xmm0, xmmword ptr [rsp+0x70]
  966         paddd   xmm1, xmmword ptr [rsp+0x50]
  967         paddd   xmm2, xmmword ptr [rsp]
  968         paddd   xmm3, xmmword ptr [rsp+0x60]
  969         paddd   xmm0, xmm5
  970         paddd   xmm1, xmm6
  971         paddd   xmm2, xmm7
  972         paddd   xmm3, xmm4
  973         pxor    xmm15, xmm0
  974         pxor    xmm12, xmm1
  975         pxor    xmm13, xmm2
  976         pxor    xmm14, xmm3
  977         movdqa  xmm8, xmmword ptr [ROT16+rip]
  978         pshufb  xmm15, xmm8
  979         pshufb  xmm12, xmm8
  980         pshufb  xmm13, xmm8
  981         pshufb  xmm14, xmm8
  982         paddd   xmm10, xmm15
  983         paddd   xmm11, xmm12
  984         movdqa  xmm8, xmmword ptr [rsp+0x100]
  985         paddd   xmm8, xmm13
  986         paddd   xmm9, xmm14
  987         pxor    xmm5, xmm10
  988         pxor    xmm6, xmm11
  989         pxor    xmm7, xmm8
  990         pxor    xmm4, xmm9
  991         movdqa  xmmword ptr [rsp+0x100], xmm8
  992         movdqa  xmm8, xmm5
  993         psrld   xmm8, 12
  994         pslld   xmm5, 20
  995         por     xmm5, xmm8
  996         movdqa  xmm8, xmm6
  997         psrld   xmm8, 12
  998         pslld   xmm6, 20
  999         por     xmm6, xmm8
 1000         movdqa  xmm8, xmm7
 1001         psrld   xmm8, 12
 1002         pslld   xmm7, 20
 1003         por     xmm7, xmm8
 1004         movdqa  xmm8, xmm4
 1005         psrld   xmm8, 12
 1006         pslld   xmm4, 20
 1007         por     xmm4, xmm8
 1008         paddd   xmm0, xmmword ptr [rsp+0x20]
 1009         paddd   xmm1, xmmword ptr [rsp+0x30]
 1010         paddd   xmm2, xmmword ptr [rsp+0x10]
 1011         paddd   xmm3, xmmword ptr [rsp+0x40]
 1012         paddd   xmm0, xmm5
 1013         paddd   xmm1, xmm6
 1014         paddd   xmm2, xmm7
 1015         paddd   xmm3, xmm4
 1016         pxor    xmm15, xmm0
 1017         pxor    xmm12, xmm1
 1018         pxor    xmm13, xmm2
 1019         pxor    xmm14, xmm3
 1020         movdqa  xmm8, xmmword ptr [ROT8+rip]
 1021         pshufb  xmm15, xmm8
 1022         pshufb  xmm12, xmm8
 1023         pshufb  xmm13, xmm8
 1024         pshufb  xmm14, xmm8
 1025         paddd   xmm10, xmm15
 1026         paddd   xmm11, xmm12
 1027         movdqa  xmm8, xmmword ptr [rsp+0x100]
 1028         paddd   xmm8, xmm13
 1029         paddd   xmm9, xmm14
 1030         pxor    xmm5, xmm10
 1031         pxor    xmm6, xmm11
 1032         pxor    xmm7, xmm8
 1033         pxor    xmm4, xmm9
 1034         movdqa  xmmword ptr [rsp+0x100], xmm8
 1035         movdqa  xmm8, xmm5
 1036         psrld   xmm8, 7
 1037         pslld   xmm5, 25
 1038         por     xmm5, xmm8
 1039         movdqa  xmm8, xmm6
 1040         psrld   xmm8, 7
 1041         pslld   xmm6, 25
 1042         por     xmm6, xmm8
 1043         movdqa  xmm8, xmm7
 1044         psrld   xmm8, 7
 1045         pslld   xmm7, 25
 1046         por     xmm7, xmm8
 1047         movdqa  xmm8, xmm4
 1048         psrld   xmm8, 7
 1049         pslld   xmm4, 25
 1050         por     xmm4, xmm8
 1051         paddd   xmm0, xmmword ptr [rsp+0x90]
 1052         paddd   xmm1, xmmword ptr [rsp+0xB0]
 1053         paddd   xmm2, xmmword ptr [rsp+0x80]
 1054         paddd   xmm3, xmmword ptr [rsp+0xF0]
 1055         paddd   xmm0, xmm4
 1056         paddd   xmm1, xmm5
 1057         paddd   xmm2, xmm6
 1058         paddd   xmm3, xmm7
 1059         pxor    xmm12, xmm0
 1060         pxor    xmm13, xmm1
 1061         pxor    xmm14, xmm2
 1062         pxor    xmm15, xmm3
 1063         movdqa  xmm8, xmmword ptr [ROT16+rip]
 1064         pshufb  xmm12, xmm8
 1065         pshufb  xmm13, xmm8
 1066         pshufb  xmm14, xmm8
 1067         pshufb  xmm15, xmm8
 1068         movdqa  xmm8, xmmword ptr [rsp+0x100]
 1069         paddd   xmm8, xmm12
 1070         paddd   xmm9, xmm13
 1071         paddd   xmm10, xmm14
 1072         paddd   xmm11, xmm15
 1073         pxor    xmm4, xmm8
 1074         pxor    xmm5, xmm9
 1075         pxor    xmm6, xmm10
 1076         pxor    xmm7, xmm11
 1077         movdqa  xmmword ptr [rsp+0x100], xmm8
 1078         movdqa  xmm8, xmm4
 1079         psrld   xmm8, 12
 1080         pslld   xmm4, 20
 1081         por     xmm4, xmm8
 1082         movdqa  xmm8, xmm5
 1083         psrld   xmm8, 12
 1084         pslld   xmm5, 20
 1085         por     xmm5, xmm8
 1086         movdqa  xmm8, xmm6
 1087         psrld   xmm8, 12
 1088         pslld   xmm6, 20
 1089         por     xmm6, xmm8
 1090         movdqa  xmm8, xmm7
 1091         psrld   xmm8, 12
 1092         pslld   xmm7, 20
 1093         por     xmm7, xmm8
 1094         paddd   xmm0, xmmword ptr [rsp+0xE0]
 1095         paddd   xmm1, xmmword ptr [rsp+0x50]
 1096         paddd   xmm2, xmmword ptr [rsp+0xC0]
 1097         paddd   xmm3, xmmword ptr [rsp+0x10]
 1098         paddd   xmm0, xmm4
 1099         paddd   xmm1, xmm5
 1100         paddd   xmm2, xmm6
 1101         paddd   xmm3, xmm7
 1102         pxor    xmm12, xmm0
 1103         pxor    xmm13, xmm1
 1104         pxor    xmm14, xmm2
 1105         pxor    xmm15, xmm3
 1106         movdqa  xmm8, xmmword ptr [ROT8+rip]
 1107         pshufb  xmm12, xmm8
 1108         pshufb  xmm13, xmm8
 1109         pshufb  xmm14, xmm8
 1110         pshufb  xmm15, xmm8
 1111         movdqa  xmm8, xmmword ptr [rsp+0x100]
 1112         paddd   xmm8, xmm12
 1113         paddd   xmm9, xmm13
 1114         paddd   xmm10, xmm14
 1115         paddd   xmm11, xmm15
 1116         pxor    xmm4, xmm8
 1117         pxor    xmm5, xmm9
 1118         pxor    xmm6, xmm10
 1119         pxor    xmm7, xmm11
 1120         movdqa  xmmword ptr [rsp+0x100], xmm8
 1121         movdqa  xmm8, xmm4
 1122         psrld   xmm8, 7
 1123         pslld   xmm4, 25
 1124         por     xmm4, xmm8
 1125         movdqa  xmm8, xmm5
 1126         psrld   xmm8, 7
 1127         pslld   xmm5, 25
 1128         por     xmm5, xmm8
 1129         movdqa  xmm8, xmm6
 1130         psrld   xmm8, 7
 1131         pslld   xmm6, 25
 1132         por     xmm6, xmm8
 1133         movdqa  xmm8, xmm7
 1134         psrld   xmm8, 7
 1135         pslld   xmm7, 25
 1136         por     xmm7, xmm8
 1137         paddd   xmm0, xmmword ptr [rsp+0xD0]
 1138         paddd   xmm1, xmmword ptr [rsp]
 1139         paddd   xmm2, xmmword ptr [rsp+0x20]
 1140         paddd   xmm3, xmmword ptr [rsp+0x40]
 1141         paddd   xmm0, xmm5
 1142         paddd   xmm1, xmm6
 1143         paddd   xmm2, xmm7
 1144         paddd   xmm3, xmm4
 1145         pxor    xmm15, xmm0
 1146         pxor    xmm12, xmm1
 1147         pxor    xmm13, xmm2
 1148         pxor    xmm14, xmm3
 1149         movdqa  xmm8, xmmword ptr [ROT16+rip]
 1150         pshufb  xmm15, xmm8
 1151         pshufb  xmm12, xmm8
 1152         pshufb  xmm13, xmm8
 1153         pshufb  xmm14, xmm8
 1154         paddd   xmm10, xmm15
 1155         paddd   xmm11, xmm12
 1156         movdqa  xmm8, xmmword ptr [rsp+0x100]
 1157         paddd   xmm8, xmm13
 1158         paddd   xmm9, xmm14
 1159         pxor    xmm5, xmm10
 1160         pxor    xmm6, xmm11
 1161         pxor    xmm7, xmm8
 1162         pxor    xmm4, xmm9
 1163         movdqa  xmmword ptr [rsp+0x100], xmm8
 1164         movdqa  xmm8, xmm5
 1165         psrld   xmm8, 12
 1166         pslld   xmm5, 20
 1167         por     xmm5, xmm8
 1168         movdqa  xmm8, xmm6
 1169         psrld   xmm8, 12
 1170         pslld   xmm6, 20
 1171         por     xmm6, xmm8
 1172         movdqa  xmm8, xmm7
 1173         psrld   xmm8, 12
 1174         pslld   xmm7, 20
 1175         por     xmm7, xmm8
 1176         movdqa  xmm8, xmm4
 1177         psrld   xmm8, 12
 1178         pslld   xmm4, 20
 1179         por     xmm4, xmm8
 1180         paddd   xmm0, xmmword ptr [rsp+0x30]
 1181         paddd   xmm1, xmmword ptr [rsp+0xA0]
 1182         paddd   xmm2, xmmword ptr [rsp+0x60]
 1183         paddd   xmm3, xmmword ptr [rsp+0x70]
 1184         paddd   xmm0, xmm5
 1185         paddd   xmm1, xmm6
 1186         paddd   xmm2, xmm7
 1187         paddd   xmm3, xmm4
 1188         pxor    xmm15, xmm0
 1189         pxor    xmm12, xmm1
 1190         pxor    xmm13, xmm2
 1191         pxor    xmm14, xmm3
 1192         movdqa  xmm8, xmmword ptr [ROT8+rip]
 1193         pshufb  xmm15, xmm8
 1194         pshufb  xmm12, xmm8
 1195         pshufb  xmm13, xmm8
 1196         pshufb  xmm14, xmm8
 1197         paddd   xmm10, xmm15
 1198         paddd   xmm11, xmm12
 1199         movdqa  xmm8, xmmword ptr [rsp+0x100]
 1200         paddd   xmm8, xmm13
 1201         paddd   xmm9, xmm14
 1202         pxor    xmm5, xmm10
 1203         pxor    xmm6, xmm11
 1204         pxor    xmm7, xmm8
 1205         pxor    xmm4, xmm9
 1206         movdqa  xmmword ptr [rsp+0x100], xmm8
 1207         movdqa  xmm8, xmm5
 1208         psrld   xmm8, 7
 1209         pslld   xmm5, 25
 1210         por     xmm5, xmm8
 1211         movdqa  xmm8, xmm6
 1212         psrld   xmm8, 7
 1213         pslld   xmm6, 25
 1214         por     xmm6, xmm8
 1215         movdqa  xmm8, xmm7
 1216         psrld   xmm8, 7
 1217         pslld   xmm7, 25
 1218         por     xmm7, xmm8
 1219         movdqa  xmm8, xmm4
 1220         psrld   xmm8, 7
 1221         pslld   xmm4, 25
 1222         por     xmm4, xmm8
 1223         paddd   xmm0, xmmword ptr [rsp+0xB0]
 1224         paddd   xmm1, xmmword ptr [rsp+0x50]
 1225         paddd   xmm2, xmmword ptr [rsp+0x10]
 1226         paddd   xmm3, xmmword ptr [rsp+0x80]
 1227         paddd   xmm0, xmm4
 1228         paddd   xmm1, xmm5
 1229         paddd   xmm2, xmm6
 1230         paddd   xmm3, xmm7
 1231         pxor    xmm12, xmm0
 1232         pxor    xmm13, xmm1
 1233         pxor    xmm14, xmm2
 1234         pxor    xmm15, xmm3
 1235         movdqa  xmm8, xmmword ptr [ROT16+rip]
 1236         pshufb  xmm12, xmm8
 1237         pshufb  xmm13, xmm8
 1238         pshufb  xmm14, xmm8
 1239         pshufb  xmm15, xmm8
 1240         movdqa  xmm8, xmmword ptr [rsp+0x100]
 1241         paddd   xmm8, xmm12
 1242         paddd   xmm9, xmm13
 1243         paddd   xmm10, xmm14
 1244         paddd   xmm11, xmm15
 1245         pxor    xmm4, xmm8
 1246         pxor    xmm5, xmm9
 1247         pxor    xmm6, xmm10
 1248         pxor    xmm7, xmm11
 1249         movdqa  xmmword ptr [rsp+0x100], xmm8
 1250         movdqa  xmm8, xmm4
 1251         psrld   xmm8, 12
 1252         pslld   xmm4, 20
 1253         por     xmm4, xmm8
 1254         movdqa  xmm8, xmm5
 1255         psrld   xmm8, 12
 1256         pslld   xmm5, 20
 1257         por     xmm5, xmm8
 1258         movdqa  xmm8, xmm6
 1259         psrld   xmm8, 12
 1260         pslld   xmm6, 20
 1261         por     xmm6, xmm8
 1262         movdqa  xmm8, xmm7
 1263         psrld   xmm8, 12
 1264         pslld   xmm7, 20
 1265         por     xmm7, xmm8
 1266         paddd   xmm0, xmmword ptr [rsp+0xF0]
 1267         paddd   xmm1, xmmword ptr [rsp]
 1268         paddd   xmm2, xmmword ptr [rsp+0x90]
 1269         paddd   xmm3, xmmword ptr [rsp+0x60]
 1270         paddd   xmm0, xmm4
 1271         paddd   xmm1, xmm5
 1272         paddd   xmm2, xmm6
 1273         paddd   xmm3, xmm7
 1274         pxor    xmm12, xmm0
 1275         pxor    xmm13, xmm1
 1276         pxor    xmm14, xmm2
 1277         pxor    xmm15, xmm3
 1278         movdqa  xmm8, xmmword ptr [ROT8+rip]
 1279         pshufb  xmm12, xmm8
 1280         pshufb  xmm13, xmm8
 1281         pshufb  xmm14, xmm8
 1282         pshufb  xmm15, xmm8
 1283         movdqa  xmm8, xmmword ptr [rsp+0x100]
 1284         paddd   xmm8, xmm12
 1285         paddd   xmm9, xmm13
 1286         paddd   xmm10, xmm14
 1287         paddd   xmm11, xmm15
 1288         pxor    xmm4, xmm8
 1289         pxor    xmm5, xmm9
 1290         pxor    xmm6, xmm10
 1291         pxor    xmm7, xmm11
 1292         movdqa  xmmword ptr [rsp+0x100], xmm8
 1293         movdqa  xmm8, xmm4
 1294         psrld   xmm8, 7
 1295         pslld   xmm4, 25
 1296         por     xmm4, xmm8
 1297         movdqa  xmm8, xmm5
 1298         psrld   xmm8, 7
 1299         pslld   xmm5, 25
 1300         por     xmm5, xmm8
 1301         movdqa  xmm8, xmm6
 1302         psrld   xmm8, 7
 1303         pslld   xmm6, 25
 1304         por     xmm6, xmm8
 1305         movdqa  xmm8, xmm7
 1306         psrld   xmm8, 7
 1307         pslld   xmm7, 25
 1308         por     xmm7, xmm8
 1309         paddd   xmm0, xmmword ptr [rsp+0xE0]
 1310         paddd   xmm1, xmmword ptr [rsp+0x20]
 1311         paddd   xmm2, xmmword ptr [rsp+0x30]
 1312         paddd   xmm3, xmmword ptr [rsp+0x70]
 1313         paddd   xmm0, xmm5
 1314         paddd   xmm1, xmm6
 1315         paddd   xmm2, xmm7
 1316         paddd   xmm3, xmm4
 1317         pxor    xmm15, xmm0
 1318         pxor    xmm12, xmm1
 1319         pxor    xmm13, xmm2
 1320         pxor    xmm14, xmm3
 1321         movdqa  xmm8, xmmword ptr [ROT16+rip]
 1322         pshufb  xmm15, xmm8
 1323         pshufb  xmm12, xmm8
 1324         pshufb  xmm13, xmm8
 1325         pshufb  xmm14, xmm8
 1326         paddd   xmm10, xmm15
 1327         paddd   xmm11, xmm12
 1328         movdqa  xmm8, xmmword ptr [rsp+0x100]
 1329         paddd   xmm8, xmm13
 1330         paddd   xmm9, xmm14
 1331         pxor    xmm5, xmm10
 1332         pxor    xmm6, xmm11
 1333         pxor    xmm7, xmm8
 1334         pxor    xmm4, xmm9
 1335         movdqa  xmmword ptr [rsp+0x100], xmm8
 1336         movdqa  xmm8, xmm5
 1337         psrld   xmm8, 12
 1338         pslld   xmm5, 20
 1339         por     xmm5, xmm8
 1340         movdqa  xmm8, xmm6
 1341         psrld   xmm8, 12
 1342         pslld   xmm6, 20
 1343         por     xmm6, xmm8
 1344         movdqa  xmm8, xmm7
 1345         psrld   xmm8, 12
 1346         pslld   xmm7, 20
 1347         por     xmm7, xmm8
 1348         movdqa  xmm8, xmm4
 1349         psrld   xmm8, 12
 1350         pslld   xmm4, 20
 1351         por     xmm4, xmm8
 1352         paddd   xmm0, xmmword ptr [rsp+0xA0]
 1353         paddd   xmm1, xmmword ptr [rsp+0xC0]
 1354         paddd   xmm2, xmmword ptr [rsp+0x40]
 1355         paddd   xmm3, xmmword ptr [rsp+0xD0]
 1356         paddd   xmm0, xmm5
 1357         paddd   xmm1, xmm6
 1358         paddd   xmm2, xmm7
 1359         paddd   xmm3, xmm4
 1360         pxor    xmm15, xmm0
 1361         pxor    xmm12, xmm1
 1362         pxor    xmm13, xmm2
 1363         pxor    xmm14, xmm3
 1364         movdqa  xmm8, xmmword ptr [ROT8+rip]
 1365         pshufb  xmm15, xmm8
 1366         pshufb  xmm12, xmm8
 1367         pshufb  xmm13, xmm8
 1368         pshufb  xmm14, xmm8
 1369         paddd   xmm10, xmm15
 1370         paddd   xmm11, xmm12
 1371         movdqa  xmm8, xmmword ptr [rsp+0x100]
 1372         paddd   xmm8, xmm13
 1373         paddd   xmm9, xmm14
 1374         pxor    xmm5, xmm10
 1375         pxor    xmm6, xmm11
 1376         pxor    xmm7, xmm8
 1377         pxor    xmm4, xmm9
 1378         pxor    xmm0, xmm8
 1379         pxor    xmm1, xmm9
 1380         pxor    xmm2, xmm10
 1381         pxor    xmm3, xmm11
 1382         movdqa  xmm8, xmm5
 1383         psrld   xmm8, 7
 1384         pslld   xmm5, 25
 1385         por     xmm5, xmm8
 1386         movdqa  xmm8, xmm6
 1387         psrld   xmm8, 7
 1388         pslld   xmm6, 25
 1389         por     xmm6, xmm8
 1390         movdqa  xmm8, xmm7
 1391         psrld   xmm8, 7
 1392         pslld   xmm7, 25
 1393         por     xmm7, xmm8
 1394         movdqa  xmm8, xmm4
 1395         psrld   xmm8, 7
 1396         pslld   xmm4, 25
 1397         por     xmm4, xmm8
 1398         pxor    xmm4, xmm12
 1399         pxor    xmm5, xmm13
 1400         pxor    xmm6, xmm14
 1401         pxor    xmm7, xmm15
 1402         mov     eax, r13d
 1403         jne     9b
 1404         movdqa  xmm9, xmm0
 1405         punpckldq xmm0, xmm1
 1406         punpckhdq xmm9, xmm1
 1407         movdqa  xmm11, xmm2
 1408         punpckldq xmm2, xmm3
 1409         punpckhdq xmm11, xmm3
 1410         movdqa  xmm1, xmm0
 1411         punpcklqdq xmm0, xmm2
 1412         punpckhqdq xmm1, xmm2
 1413         movdqa  xmm3, xmm9
 1414         punpcklqdq xmm9, xmm11
 1415         punpckhqdq xmm3, xmm11
 1416         movdqu  xmmword ptr [rbx], xmm0
 1417         movdqu  xmmword ptr [rbx+0x20], xmm1
 1418         movdqu  xmmword ptr [rbx+0x40], xmm9
 1419         movdqu  xmmword ptr [rbx+0x60], xmm3
 1420         movdqa  xmm9, xmm4
 1421         punpckldq xmm4, xmm5
 1422         punpckhdq xmm9, xmm5
 1423         movdqa  xmm11, xmm6
 1424         punpckldq xmm6, xmm7
 1425         punpckhdq xmm11, xmm7
 1426         movdqa  xmm5, xmm4
 1427         punpcklqdq xmm4, xmm6
 1428         punpckhqdq xmm5, xmm6
 1429         movdqa  xmm7, xmm9
 1430         punpcklqdq xmm9, xmm11
 1431         punpckhqdq xmm7, xmm11
 1432         movdqu  xmmword ptr [rbx+0x10], xmm4
 1433         movdqu  xmmword ptr [rbx+0x30], xmm5
 1434         movdqu  xmmword ptr [rbx+0x50], xmm9
 1435         movdqu  xmmword ptr [rbx+0x70], xmm7
 1436         movdqa  xmm1, xmmword ptr [rsp+0x110]
 1437         movdqa  xmm0, xmm1
 1438         paddd   xmm1, xmmword ptr [rsp+0x150]
 1439         movdqa  xmmword ptr [rsp+0x110], xmm1
 1440         pxor    xmm0, xmmword ptr [CMP_MSB_MASK+rip]
 1441         pxor    xmm1, xmmword ptr [CMP_MSB_MASK+rip]
 1442         pcmpgtd xmm0, xmm1
 1443         movdqa  xmm1, xmmword ptr [rsp+0x120]
 1444         psubd   xmm1, xmm0
 1445         movdqa  xmmword ptr [rsp+0x120], xmm1
 1446         add     rbx, 128
 1447         add     rdi, 32
 1448         sub     rsi, 4
 1449         cmp     rsi, 4
 1450         jnc     2b
 1451         test    rsi, rsi
 1452         jnz     3f
 1453 4:
              /*
               * Common exit path.  Reached by fall-through when rsi is zero after
               * the 4-wide loop above, and by "je 4b" from the one-input tail
               * below.  The matching prologue is outside this excerpt, so the pop
               * order here is assumed to mirror its push order -- confirm there.
               */
 1454         mov     rsp, rbp        /* discard the rsp-relative scratch area; rbp presumably holds the rsp saved by the prologue */
 1455         pop     rbp
 1456         pop     rbx
 1457         pop     r12
 1458         pop     r13
 1459         pop     r14
 1460         pop     r15
 1461         RET                     /* upper-case: presumably a macro defined elsewhere rather than the bare mnemonic -- confirm */
 1462 .p2align 5
 1463 3:
              /*
               * Tail for fewer than four remaining inputs (rsi is the count that the
               * 4-wide loop above decrements by 4 per iteration).  When bit 1 of the
               * count is set, two inputs are processed side by side:
               *   lane A: state rows xmm0-xmm3,  message read through r8
               *   lane B: state rows xmm8-xmm11, message read through r9
               * Otherwise control skips ahead to the one-input tail (next "3:").
               */
 1464         test    esi, 0x2
 1465         je      3f
              /*
               * Rows 0-1 of both lanes start from the same 32 bytes at [rcx]
               * (presumably the key / input chaining value -- confirm at the caller).
               */
 1466         movups  xmm0, xmmword ptr [rcx]
 1467         movups  xmm1, xmmword ptr [rcx+0x10]
 1468         movaps  xmm8, xmm0
 1469         movaps  xmm9, xmm1
              /*
               * Row-3 template for lane A, cached at [rsp]:
               *   dword 0 = [rsp+0x110], dword 1 = [rsp+0x120],
               *   dword 2 = BLAKE3_BLOCK_LEN, dword 3 = 0 (movd zero-fills).
               * [rsp+0x110]/[rsp+0x120] are the per-lane vectors that the 4-wide loop
               * advances with an add at 0x110 and a carry into 0x120, i.e. the low and
               * high counter words.  Dword 3 is filled with the flag bits per block.
               */
 1470         movd    xmm13, dword ptr [rsp+0x110]
 1471         pinsrd  xmm13, dword ptr [rsp+0x120], 1
 1472         pinsrd  xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
 1473         movaps  xmmword ptr [rsp], xmm13
              /* Same template for lane B, built from element 1 of both vectors; cached at [rsp+0x10]. */
 1474         movd    xmm14, dword ptr [rsp+0x114]
 1475         pinsrd  xmm14, dword ptr [rsp+0x124], 1
 1476         pinsrd  xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
 1477         movaps  xmmword ptr [rsp+0x10], xmm14
              /* rdi points at an array of 8-byte input pointers; take the next two. */
 1478         mov     r8, qword ptr [rdi]
 1479         mov     r9, qword ptr [rdi+0x8]
              /*
               * Flag bits for the first block: the byte at [rbp+0x40] OR-ed with r13d.
               * Later blocks use r13d alone (eax is reloaded from r13d after each
               * block).  Presumably [rbp+0x40] is a stack-passed "start" flag
               * argument -- confirm against the prologue / C prototype.
               */
 1480         movzx   eax, byte ptr [rbp+0x40]
 1481         or      eax, r13d
 1482         xor     edx, edx        /* rdx = byte offset into both inputs, starting at 0 */
 1483 2:
              /*
               * Head of the per-block loop of the two-input tail.  On entry eax holds
               * the flag bits for this block and rdx the byte offset of the block.
               *
               * r12d is OR-ed into the flags only when the advanced offset equals r15;
               * otherwise cmovne restores the saved value from r14d.  Presumably r12d
               * carries the end-of-input flag bits and r15 the input length in bytes
               * -- both are set up outside this excerpt.
               */
 1484         mov     r14d, eax
 1485         or      eax, r12d
 1486         add     rdx, 64
 1487         cmp     rdx, r15
 1488         cmovne  eax, r14d
              /* Row 2 of both lanes = the 16 bytes at BLAKE3_IV. */
 1489         movaps  xmm2, xmmword ptr [BLAKE3_IV+rip]
 1490         movaps  xmm10, xmm2
              /*
               * Lane A message: rdx was already advanced, so the current 64-byte block
               * is [r8+rdx-0x40 .. r8+rdx-1].  With m0..m15 its sixteen dwords:
               *   shufps 136 (0x88) keeps the even dwords, 221 (0xDD) the odd ones
               *   xmm4 = m0,m2,m4,m6      xmm5 = m1,m3,m5,m7
               *   xmm6 = m14,m8,m10,m12   xmm7 = m15,m9,m11,m13
               * (the last two are rotated by one dword via pshufd 0x93; they are
               * consumed after the state rows have been rotated in the round body).
               * xmm3 is only a scratch register here; row 3 is loaded further down.
               */
 1491         movups  xmm4, xmmword ptr [r8+rdx-0x40]
 1492         movups  xmm5, xmmword ptr [r8+rdx-0x30]
 1493         movaps  xmm3, xmm4
 1494         shufps  xmm4, xmm5, 136
 1495         shufps  xmm3, xmm5, 221
 1496         movaps  xmm5, xmm3
 1497         movups  xmm6, xmmword ptr [r8+rdx-0x20]
 1498         movups  xmm7, xmmword ptr [r8+rdx-0x10]
 1499         movaps  xmm3, xmm6
 1500         shufps  xmm6, xmm7, 136
 1501         pshufd  xmm6, xmm6, 0x93
 1502         shufps  xmm3, xmm7, 221
 1503         pshufd  xmm7, xmm3, 0x93
              /* Lane B message, same layout in xmm12-xmm15 (xmm11 as scratch), read through r9. */
 1504         movups  xmm12, xmmword ptr [r9+rdx-0x40]
 1505         movups  xmm13, xmmword ptr [r9+rdx-0x30]
 1506         movaps  xmm11, xmm12
 1507         shufps  xmm12, xmm13, 136
 1508         shufps  xmm11, xmm13, 221
 1509         movaps  xmm13, xmm11
 1510         movups  xmm14, xmmword ptr [r9+rdx-0x20]
 1511         movups  xmm15, xmmword ptr [r9+rdx-0x10]
 1512         movaps  xmm11, xmm14
 1513         shufps  xmm14, xmm15, 136
 1514         pshufd  xmm14, xmm14, 0x93
 1515         shufps  xmm11, xmm15, 221
 1516         pshufd  xmm15, xmm11, 0x93
              /* Row 3 of each lane = cached template with this block's flags in dword 3. */
 1517         movaps  xmm3, xmmword ptr [rsp]
 1518         movaps  xmm11, xmmword ptr [rsp+0x10]
 1519         pinsrd  xmm3, eax, 3
 1520         pinsrd  xmm11, eax, 3
              /*
               * al = round counter (7 rounds; decremented at the end of each round).
               * The flags in eax are no longer needed: they were copied into the
               * row-3 vectors above and eax is reloaded from r13d after the rounds.
               */
 1521         mov     al, 7
 1522 9:
              /*
               * One round for both lanes; al counts the rounds still to run.
               *   lane A rows: xmm0, xmm1, xmm2, xmm3     messages: xmm4-xmm7
               *   lane B rows: xmm8, xmm9, xmm10, xmm11   messages: xmm12-xmm15
               * xmm4 doubles as the rotate temporary and xmm12/xmm13 as the ROT16/ROT8
               * pshufb masks, so the first two message vectors of each lane are
               * spilled to [rsp+0x20..0x5F] before those registers are overwritten.
               * ROT16/ROT8 are constants defined elsewhere; by name they rotate each
               * dword by 16 and 8 bits -- their contents are not visible here.
               */
              /* Column step, first half: row0 += msg + row1; row3 = shuffle(row3 ^ row0, ROT16). */
 1523         paddd   xmm0, xmm4
 1524         paddd   xmm8, xmm12
 1525         movaps  xmmword ptr [rsp+0x20], xmm4
 1526         movaps  xmmword ptr [rsp+0x30], xmm12
 1527         paddd   xmm0, xmm1
 1528         paddd   xmm8, xmm9
 1529         pxor    xmm3, xmm0
 1530         pxor    xmm11, xmm8
 1531         movaps  xmm12, xmmword ptr [ROT16+rip]
 1532         pshufb  xmm3, xmm12
 1533         pshufb  xmm11, xmm12
              /* row2 += row3; row1 ^= row2; row1 = (row1 << 20) | (row1 >> 12), a rotate right by 12. */
 1534         paddd   xmm2, xmm3
 1535         paddd   xmm10, xmm11
 1536         pxor    xmm1, xmm2
 1537         pxor    xmm9, xmm10
 1538         movdqa  xmm4, xmm1
 1539         pslld   xmm1, 20
 1540         psrld   xmm4, 12
 1541         por     xmm1, xmm4
 1542         movdqa  xmm4, xmm9
 1543         pslld   xmm9, 20
 1544         psrld   xmm4, 12
 1545         por     xmm9, xmm4
              /* Column step, second half: row0 += msg + row1; row3 = shuffle(row3 ^ row0, ROT8). */
 1546         paddd   xmm0, xmm5
 1547         paddd   xmm8, xmm13
 1548         movaps  xmmword ptr [rsp+0x40], xmm5
 1549         movaps  xmmword ptr [rsp+0x50], xmm13
 1550         paddd   xmm0, xmm1
 1551         paddd   xmm8, xmm9
 1552         pxor    xmm3, xmm0
 1553         pxor    xmm11, xmm8
 1554         movaps  xmm13, xmmword ptr [ROT8+rip]
 1555         pshufb  xmm3, xmm13
 1556         pshufb  xmm11, xmm13
              /* row2 += row3; row1 ^= row2; row1 = (row1 << 25) | (row1 >> 7), a rotate right by 7. */
 1557         paddd   xmm2, xmm3
 1558         paddd   xmm10, xmm11
 1559         pxor    xmm1, xmm2
 1560         pxor    xmm9, xmm10
 1561         movdqa  xmm4, xmm1
 1562         pslld   xmm1, 25
 1563         psrld   xmm4, 7
 1564         por     xmm1, xmm4
 1565         movdqa  xmm4, xmm9
 1566         pslld   xmm9, 25
 1567         psrld   xmm4, 7
 1568         por     xmm9, xmm4
              /*
               * Rotate rows 0, 3 and 2 by whole dwords (0x93, 0x4E, 0x39) relative to
               * row 1, so that the same vertical add/xor/rotate sequence below works
               * on the diagonals of the 4x4 state.
               */
 1569         pshufd  xmm0, xmm0, 0x93
 1570         pshufd  xmm8, xmm8, 0x93
 1571         pshufd  xmm3, xmm3, 0x4E
 1572         pshufd  xmm11, xmm11, 0x4E
 1573         pshufd  xmm2, xmm2, 0x39
 1574         pshufd  xmm10, xmm10, 0x39
              /* Diagonal step, first half; xmm12 still holds the ROT16 mask. */
 1575         paddd   xmm0, xmm6
 1576         paddd   xmm8, xmm14
 1577         paddd   xmm0, xmm1
 1578         paddd   xmm8, xmm9
 1579         pxor    xmm3, xmm0
 1580         pxor    xmm11, xmm8
 1581         pshufb  xmm3, xmm12
 1582         pshufb  xmm11, xmm12
 1583         paddd   xmm2, xmm3
 1584         paddd   xmm10, xmm11
 1585         pxor    xmm1, xmm2
 1586         pxor    xmm9, xmm10
 1587         movdqa  xmm4, xmm1
 1588         pslld   xmm1, 20
 1589         psrld   xmm4, 12
 1590         por     xmm1, xmm4
 1591         movdqa  xmm4, xmm9
 1592         pslld   xmm9, 20
 1593         psrld   xmm4, 12
 1594         por     xmm9, xmm4
              /* Diagonal step, second half; xmm13 still holds the ROT8 mask. */
 1595         paddd   xmm0, xmm7
 1596         paddd   xmm8, xmm15
 1597         paddd   xmm0, xmm1
 1598         paddd   xmm8, xmm9
 1599         pxor    xmm3, xmm0
 1600         pxor    xmm11, xmm8
 1601         pshufb  xmm3, xmm13
 1602         pshufb  xmm11, xmm13
 1603         paddd   xmm2, xmm3
 1604         paddd   xmm10, xmm11
 1605         pxor    xmm1, xmm2
 1606         pxor    xmm9, xmm10
 1607         movdqa  xmm4, xmm1
 1608         pslld   xmm1, 25
 1609         psrld   xmm4, 7
 1610         por     xmm1, xmm4
 1611         movdqa  xmm4, xmm9
 1612         pslld   xmm9, 25
 1613         psrld   xmm4, 7
 1614         por     xmm9, xmm4
              /* Undo the dword rotations (0x39, 0x4E, 0x93 are the inverses of the ones above). */
 1615         pshufd  xmm0, xmm0, 0x39
 1616         pshufd  xmm8, xmm8, 0x39
 1617         pshufd  xmm3, xmm3, 0x4E
 1618         pshufd  xmm11, xmm11, 0x4E
 1619         pshufd  xmm2, xmm2, 0x93
 1620         pshufd  xmm10, xmm10, 0x93
              /* Leave after the last round; the message shuffle below is only needed for a following round. */
 1621         dec     al
 1622         je      9f
              /*
               * Build lane A's message vectors for the next round from the current
               * ones: the two spilled vectors at [rsp+0x20]/[rsp+0x40] plus xmm6/xmm7.
               * Presumably this implements the BLAKE3 message permutation -- the word
               * order itself is not re-derived here.  The new first vector goes to
               * xmm4 and the new fourth to xmm7; the new second and third are parked
               * at [rsp+0x20]/[rsp+0x40] because xmm5/xmm6 are scratch for lane B.
               */
 1623         movdqa  xmm12, xmmword ptr [rsp+0x20]
 1624         movdqa  xmm5, xmmword ptr [rsp+0x40]
 1625         pshufd  xmm13, xmm12, 0x0F
 1626         shufps  xmm12, xmm5, 214
 1627         pshufd  xmm4, xmm12, 0x39
 1628         movdqa  xmm12, xmm6
 1629         shufps  xmm12, xmm7, 250
 1630         pblendw xmm13, xmm12, 0xCC
 1631         movdqa  xmm12, xmm7
 1632         punpcklqdq xmm12, xmm5
 1633         pblendw xmm12, xmm6, 0xC0
 1634         pshufd  xmm12, xmm12, 0x78
 1635         punpckhdq xmm5, xmm7
 1636         punpckldq xmm6, xmm5
 1637         pshufd  xmm7, xmm6, 0x1E
 1638         movdqa  xmmword ptr [rsp+0x20], xmm13
 1639         movdqa  xmmword ptr [rsp+0x40], xmm12
              /*
               * Same shuffle sequence for lane B, from [rsp+0x30]/[rsp+0x50] and
               * xmm14/xmm15, using xmm5/xmm6 as scratch.  Results end up in
               * xmm12, xmm13 (from xmm6), xmm14 (from xmm5) and xmm15.
               */
 1640         movdqa  xmm5, xmmword ptr [rsp+0x30]
 1641         movdqa  xmm13, xmmword ptr [rsp+0x50]
 1642         pshufd  xmm6, xmm5, 0x0F
 1643         shufps  xmm5, xmm13, 214
 1644         pshufd  xmm12, xmm5, 0x39
 1645         movdqa  xmm5, xmm14
 1646         shufps  xmm5, xmm15, 250
 1647         pblendw xmm6, xmm5, 0xCC
 1648         movdqa  xmm5, xmm15
 1649         punpcklqdq xmm5, xmm13
 1650         pblendw xmm5, xmm14, 0xC0
 1651         pshufd  xmm5, xmm5, 0x78
 1652         punpckhdq xmm13, xmm15
 1653         punpckldq xmm14, xmm13
 1654         pshufd  xmm15, xmm14, 0x1E
 1655         movdqa  xmm13, xmm6
 1656         movdqa  xmm14, xmm5
              /* Bring lane A's parked second and third vectors back into xmm5/xmm6. */
 1657         movdqa  xmm5, xmmword ptr [rsp+0x20]
 1658         movdqa  xmm6, xmmword ptr [rsp+0x40]
 1659         jmp     9b
 1660 9:
              /*
               * After the last round of a block: fold rows 2-3 into rows 0-1 for both
               * lanes.  Rows 0-1 (xmm0/xmm1 and xmm8/xmm9) are what the next block
               * starts from and what is finally stored.
               */
 1661         pxor    xmm0, xmm2
 1662         pxor    xmm1, xmm3
 1663         pxor    xmm8, xmm10
 1664         pxor    xmm9, xmm11
              /* Flags for any following block are r13d alone (the [rbp+0x40] byte applied to the first block only). */
 1665         mov     eax, r13d
 1666         cmp     rdx, r15
 1667         jne     2b              /* more blocks in these inputs while the offset has not reached r15 */
              /* Store 32 bytes per lane at rbx (unaligned stores). */
 1668         movups  xmmword ptr [rbx], xmm0
 1669         movups  xmmword ptr [rbx+0x10], xmm1
 1670         movups  xmmword ptr [rbx+0x20], xmm8
 1671         movups  xmmword ptr [rbx+0x30], xmm9
              /*
               * Shift the per-lane counter vectors down by two lanes under a mask.
               * The unaligned loads at +8 bytes read elements 2 and 3 into positions
               * 0 and 1; blendvps takes the shifted dword wherever the sign bit of the
               * matching mask dword in xmm0 is set.  The mask at [rsp+0x130] is set up
               * outside this excerpt -- presumably all-ones when the counter advances
               * per input; confirm.  The upper two dwords of the shifted vectors come
               * from the neighbouring stack slot; the one-input tail below reads
               * only element 0.
               */
 1672         movdqa  xmm0, xmmword ptr [rsp+0x130]
 1673         movdqa  xmm1, xmmword ptr [rsp+0x110]
 1674         movdqa  xmm2, xmmword ptr [rsp+0x120]
 1675         movdqu  xmm3, xmmword ptr [rsp+0x118]
 1676         movdqu  xmm4, xmmword ptr [rsp+0x128]
 1677         blendvps xmm1, xmm3, xmm0
 1678         blendvps xmm2, xmm4, xmm0
 1679         movdqa  xmmword ptr [rsp+0x110], xmm1
 1680         movdqa  xmmword ptr [rsp+0x120], xmm2
 1681         add     rdi, 16         /* past the two 8-byte input pointers just consumed */
 1682         add     rbx, 64         /* past the two 32-byte results just written */
 1683         sub     rsi, 2          /* two fewer inputs remain; falls through to the one-input tail */
 1684 3:
              /*
               * One-input tail: if bit 0 of the remaining count is clear there is
               * nothing left and control returns through the common exit at "4:".
               * Otherwise a single input is processed with state rows in xmm0-xmm3.
               */
 1685         test    esi, 0x1
 1686         je      4b
              /* Rows 0-1 start from the 32 bytes at [rcx] (presumably the key / chaining value -- confirm). */
 1687         movups  xmm0, xmmword ptr [rcx]
 1688         movups  xmm1, xmmword ptr [rcx+0x10]
              /*
               * Row-3 template kept in xmm13: dword 0 = [rsp+0x110], dword 1 =
               * [rsp+0x120] (element 0 of the counter low/high vectors), dword 2 =
               * BLAKE3_BLOCK_LEN, dword 3 = 0 until the per-block flags are inserted.
               */
 1689         movd    xmm13, dword ptr [rsp+0x110]
 1690         pinsrd  xmm13, dword ptr [rsp+0x120], 1
 1691         pinsrd  xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
              /* pshufb masks are loaded once here and stay in xmm14/xmm15 for the round loop below. */
 1692         movaps  xmm14, xmmword ptr [ROT8+rip]
 1693         movaps  xmm15, xmmword ptr [ROT16+rip]
 1694         mov     r8, qword ptr [rdi]     /* next input pointer from the array at rdi */
              /* Flag bits for the first block: byte at [rbp+0x40] OR r13d (same construction as the two-input tail). */
 1695         movzx   eax, byte ptr [rbp+0x40]
 1696         or      eax, r13d
 1697         xor     edx, edx        /* rdx = byte offset into the input, starting at 0 */
 1698 2:
              /*
               * Head of the per-block loop of the one-input tail.  On entry eax holds
               * the flag bits for this block and rdx the byte offset of the block.
               * r12d is OR-ed in only when the advanced offset equals r15; otherwise
               * cmovne restores the saved flags from r14d.  Presumably r12d carries the
               * end-of-input flag bits and r15 the input length in bytes -- both are
               * set up outside this excerpt.
               */
 1699         mov     r14d, eax
 1700         or      eax, r12d
 1701         add     rdx, 64
 1702         cmp     rdx, r15
 1703         cmovne  eax, r14d
              /* Row 2 = 16 bytes at BLAKE3_IV; row 3 = template from xmm13 with the flags in dword 3. */
 1704         movaps  xmm2, xmmword ptr [BLAKE3_IV+rip]
 1705         movaps  xmm3, xmm13
 1706         pinsrd  xmm3, eax, 3
              /*
               * Load the current 64-byte block ([r8+rdx-0x40 .. r8+rdx-1], rdx already
               * advanced) and de-interleave its dwords m0..m15, with xmm8 as scratch:
               *   shufps 136 (0x88) keeps the even dwords, 221 (0xDD) the odd ones
               *   xmm4 = m0,m2,m4,m6      xmm5 = m1,m3,m5,m7
               *   xmm6 = m14,m8,m10,m12   xmm7 = m15,m9,m11,m13
               * (the last two are rotated by one dword via pshufd 0x93).
               */
 1707         movups  xmm4, xmmword ptr [r8+rdx-0x40]
 1708         movups  xmm5, xmmword ptr [r8+rdx-0x30]
 1709         movaps  xmm8, xmm4
 1710         shufps  xmm4, xmm5, 136
 1711         shufps  xmm8, xmm5, 221
 1712         movaps  xmm5, xmm8
 1713         movups  xmm6, xmmword ptr [r8+rdx-0x20]
 1714         movups  xmm7, xmmword ptr [r8+rdx-0x10]
 1715         movaps  xmm8, xmm6
 1716         shufps  xmm6, xmm7, 136
 1717         pshufd  xmm6, xmm6, 0x93
 1718         shufps  xmm8, xmm7, 221
 1719         pshufd  xmm7, xmm8, 0x93
              /* al = round counter (7); the flags were already copied into xmm3, so overwriting al is safe here. */
 1720         mov     al, 7
 1721 9:
 1722         paddd   xmm0, xmm4
 1723         paddd   xmm0, xmm1
 1724         pxor    xmm3, xmm0
 1725         pshufb  xmm3, xmm15
 1726         paddd   xmm2, xmm3
 1727         pxor    xmm1, xmm2
 1728         movdqa  xmm11, xmm1
 1729         pslld   xmm1, 20
 1730         psrld   xmm11, 12
 1731         por     xmm1, xmm11
 1732         paddd   xmm0, xmm5
 1733         paddd   xmm0, xmm1
 1734         pxor    xmm3, xmm0
 1735         pshufb  xmm3, xmm14
 1736         paddd   xmm2, xmm3
 1737         pxor    xmm1, xmm2
 1738         movdqa  xmm11, xmm1
 1739         pslld   xmm1, 25
 1740         psrld   xmm11, 7
 1741         por     xmm1, xmm11
 1742         pshufd  xmm0, xmm0, 0x93
 1743         pshufd  xmm3, xmm3, 0x4E
 1744         pshufd  xmm2, xmm2, 0x39
 1745         paddd   xmm0, xmm6
 1746         paddd   xmm0, xmm1
 1747         pxor    xmm3, xmm0
 1748         pshufb  xmm3, xmm15
 1749         paddd   xmm2, xmm3
 1750         pxor    xmm1, xmm2
 1751         movdqa  xmm11, xmm1
 1752         pslld   xmm1, 20
 1753         psrld   xmm11, 12
 1754         por     xmm1, xmm11
 1755         paddd   xmm0, xmm7
 1756         paddd   xmm0, xmm1
 1757         pxor    xmm3, xmm0
 1758         pshufb  xmm3, xmm14
 1759         paddd   xmm2, xmm3
 1760         pxor    xmm1, xmm2
 1761         movdqa  xmm11, xmm1
 1762         pslld   xmm1, 25
 1763         psrld   xmm11, 7
 1764         por     xmm1, xmm11
 1765         pshufd  xmm0, xmm0, 0x39
 1766         pshufd  xmm3, xmm3, 0x4E
 1767         pshufd  xmm2, xmm2, 0x93
 1768         dec     al
 1769         jz      9f
 1770         movdqa  xmm8, xmm4
 1771         shufps  xmm8, xmm5, 214
 1772         pshufd  xmm9, xmm4, 0x0F
 1773         pshufd  xmm4, xmm8, 0x39
 1774         movdqa  xmm8, xmm6
 1775         shufps  xmm8, xmm7, 250
 1776         pblendw xmm9, xmm8, 0xCC
 1777         movdqa  xmm8, xmm7
 1778         punpcklqdq xmm8, xmm5
 1779         pblendw xmm8, xmm6, 0xC0
 1780         pshufd  xmm8, xmm8, 0x78
 1781         punpckhdq xmm5, xmm7
 1782         punpckldq xmm6, xmm5
 1783         pshufd  xmm7, xmm6, 0x1E
 1784         movdqa  xmm5, xmm9
 1785         movdqa  xmm6, xmm8
 1786         jmp     9b
 1787 9:
 1788         pxor    xmm0, xmm2
 1789         pxor    xmm1, xmm3
 1790         mov     eax, r13d
 1791         cmp     rdx, r15
 1792         jne     2b
 1793         movups  xmmword ptr [rbx], xmm0
 1794         movups  xmmword ptr [rbx+0x10], xmm1
 1795         jmp     4b
 1796 SET_SIZE(zfs_blake3_hash_many_sse41)
 1797 
 1798 ENTRY_ALIGN(zfs_blake3_compress_in_place_sse41, 64)
      /*
       * Compress one 64-byte block into the 32-byte state at rdi, in place.
       *
       * Registers as used below:
       *   rdi = 32-byte state: loaded into xmm0/xmm1 on entry and overwritten
       *         with the result on exit (unaligned accesses)
       *   rsi = 64-byte message block (unaligned loads)
       *   rcx = 64-bit value placed in lanes 0-1 of state row 3
       *   dl  = byte placed in lane 2 of state row 3
       *   r8b = byte placed in lane 3 of state row 3
       * NOTE(review): these look like BLAKE3's cv, block, counter, block_len
       * and flags; the C prototype is not in this part of the file - confirm.
       *
       * Overwrites rax, rdx, xmm0-xmm9, xmm11, xmm14, xmm15 and the flags.
       * No stack is used.
       */
 1799         ENDBR
              /* State rows: a = xmm0, b = xmm1 (from rdi), c = xmm2 (IV). */
 1800         movups  xmm0, xmmword ptr [rdi]
 1801         movups  xmm1, xmmword ptr [rdi+0x10]
 1802         movaps  xmm2, xmmword ptr [BLAKE3_IV+rip]
              /*
               * Row d = xmm3 = { rcx low, rcx high, dl, r8b }.
               * Only the low bytes of rdx and r8 are taken, exactly as
               * zfs_blake3_compress_xof_sse41 does, so stray upper register
               * bits can never leak into the state.
               */
              movzx   eax, r8b
              movzx   edx, dl
 1803         shl     rax, 32
 1804         add     rdx, rax
 1805         movq    xmm3, rcx
 1806         movq    xmm4, rdx
 1807         punpcklqdq xmm3, xmm4
              /*
               * Load the block and de-interleave it. shufps 136 (0x88) keeps
               * the even-indexed dwords of a register pair, 221 (0xDD) the
               * odd-indexed ones:
               *   xmm4 = m0, m2, m4, m6      xmm5 = m1, m3, m5, m7
               */
 1808         movups  xmm4, xmmword ptr [rsi]
 1809         movups  xmm5, xmmword ptr [rsi+0x10]
 1810         movaps  xmm8, xmm4
 1811         shufps  xmm4, xmm5, 136
 1812         shufps  xmm8, xmm5, 221
 1813         movaps  xmm5, xmm8
              /*
               * Same split for the second half, then pshufd 0x93 rotates the
               * lanes up by one:
               *   xmm6 = m14, m8, m10, m12   xmm7 = m15, m9, m11, m13
               */
 1814         movups  xmm6, xmmword ptr [rsi+0x20]
 1815         movups  xmm7, xmmword ptr [rsi+0x30]
 1816         movaps  xmm8, xmm6
 1817         shufps  xmm6, xmm7, 136
 1818         pshufd  xmm6, xmm6, 0x93
 1819         shufps  xmm8, xmm7, 221
 1820         pshufd  xmm7, xmm8, 0x93
              /* pshufb masks: xmm14 rotates each dword right by 8, xmm15 by 16. */
 1821         movaps  xmm14, xmmword ptr [ROT8+rip]
 1822         movaps  xmm15, xmmword ptr [ROT16+rip]
              /* al = rounds left; the loop body below runs 7 times. */
 1823         mov     al, 7
 1824 9:
              /* Columns, first half: a += xmm4 + b; d = (d ^ a) ror 16; c += d; b = (b ^ c) ror 12 */
 1825         paddd   xmm0, xmm4
 1826         paddd   xmm0, xmm1
 1827         pxor    xmm3, xmm0
 1828         pshufb  xmm3, xmm15
 1829         paddd   xmm2, xmm3
 1830         pxor    xmm1, xmm2
 1831         movdqa  xmm11, xmm1
 1832         pslld   xmm1, 20
 1833         psrld   xmm11, 12
 1834         por     xmm1, xmm11
              /* Columns, second half: a += xmm5 + b; d = (d ^ a) ror 8; c += d; b = (b ^ c) ror 7 */
 1835         paddd   xmm0, xmm5
 1836         paddd   xmm0, xmm1
 1837         pxor    xmm3, xmm0
 1838         pshufb  xmm3, xmm14
 1839         paddd   xmm2, xmm3
 1840         pxor    xmm1, xmm2
 1841         movdqa  xmm11, xmm1
 1842         pslld   xmm1, 25
 1843         psrld   xmm11, 7
 1844         por     xmm1, xmm11
              /* Rotate the lanes of rows a, d, c (b stays) so the diagonals sit in columns. */
 1845         pshufd  xmm0, xmm0, 0x93
 1846         pshufd  xmm3, xmm3, 0x4E
 1847         pshufd  xmm2, xmm2, 0x39
              /* Diagonals, first half: same mixing as above with message words from xmm6. */
 1848         paddd   xmm0, xmm6
 1849         paddd   xmm0, xmm1
 1850         pxor    xmm3, xmm0
 1851         pshufb  xmm3, xmm15
 1852         paddd   xmm2, xmm3
 1853         pxor    xmm1, xmm2
 1854         movdqa  xmm11, xmm1
 1855         pslld   xmm1, 20
 1856         psrld   xmm11, 12
 1857         por     xmm1, xmm11
              /* Diagonals, second half: message words from xmm7. */
 1858         paddd   xmm0, xmm7
 1859         paddd   xmm0, xmm1
 1860         pxor    xmm3, xmm0
 1861         pshufb  xmm3, xmm14
 1862         paddd   xmm2, xmm3
 1863         pxor    xmm1, xmm2
 1864         movdqa  xmm11, xmm1
 1865         pslld   xmm1, 25
 1866         psrld   xmm11, 7
 1867         por     xmm1, xmm11
              /* Undo the lane rotation (inverse of the three shuffles above). */
 1868         pshufd  xmm0, xmm0, 0x39
 1869         pshufd  xmm3, xmm3, 0x4E
 1870         pshufd  xmm2, xmm2, 0x93
              /* After the 7th round leave directly; no further permutation is needed. */
 1871         dec     al
 1872         jz      9f
              /*
               * Permute the message words held in xmm4-xmm7 into the order
               * the next round consumes; xmm8 and xmm9 are scratch.
               */
 1873         movdqa  xmm8, xmm4
 1874         shufps  xmm8, xmm5, 214
 1875         pshufd  xmm9, xmm4, 0x0F
 1876         pshufd  xmm4, xmm8, 0x39
 1877         movdqa  xmm8, xmm6
 1878         shufps  xmm8, xmm7, 250
 1879         pblendw xmm9, xmm8, 0xCC
 1880         movdqa  xmm8, xmm7
 1881         punpcklqdq xmm8, xmm5
 1882         pblendw xmm8, xmm6, 0xC0
 1883         pshufd  xmm8, xmm8, 0x78
 1884         punpckhdq xmm5, xmm7
 1885         punpckldq xmm6, xmm5
 1886         pshufd  xmm7, xmm6, 0x1E
 1887         movdqa  xmm5, xmm9
 1888         movdqa  xmm6, xmm8
 1889         jmp     9b
 1890 9:
              /* Fold rows c, d into a, b and write the 32-byte result back over the input state. */
 1891         pxor    xmm0, xmm2
 1892         pxor    xmm1, xmm3
 1893         movups  xmmword ptr [rdi], xmm0
 1894         movups  xmmword ptr [rdi+0x10], xmm1
 1895         RET
 1896 SET_SIZE(zfs_blake3_compress_in_place_sse41)
 1897 
 1898 ENTRY_ALIGN(zfs_blake3_compress_xof_sse41, 64)
      /*
       * Run the 7-round compression over one 64-byte block and write the
       * full 64-byte result to the buffer at r9. The state at rdi is only
       * read, never written.
       *
       * Registers as used below:
       *   rdi = 32-byte input state (loaded into xmm0/xmm1, re-read at the end)
       *   rsi = 64-byte message block (unaligned loads)
       *   rcx = 64-bit value placed in lanes 0-1 of state row 3
       *   dl  = byte placed in lane 2 of state row 3 (zero-extended here)
       *   r8b = byte placed in lane 3 of state row 3 (zero-extended here)
       *   r9  = 64-byte output buffer (unaligned stores)
       * NOTE(review): these look like BLAKE3's cv, block, counter, block_len,
       * flags and out; the C prototype is not in this part of the file - confirm.
       *
       * Overwrites rax, rdx, xmm0-xmm9, xmm11, xmm14, xmm15 and the flags.
       * No stack is used.
       */
 1899         ENDBR
              /* State rows: a = xmm0, b = xmm1 (from rdi), c = xmm2 (IV). */
 1900         movups  xmm0, xmmword ptr [rdi]
 1901         movups  xmm1, xmmword ptr [rdi+0x10]
 1902         movaps  xmm2, xmmword ptr [BLAKE3_IV+rip]
              /* Row d = xmm3 = { rcx low, rcx high, dl, r8b }; only the low bytes of rdx and r8 are used. */
 1903         movzx   eax, r8b
 1904         movzx   edx, dl
 1905         shl     rax, 32
 1906         add     rdx, rax
 1907         movq    xmm3, rcx
 1908         movq    xmm4, rdx
 1909         punpcklqdq xmm3, xmm4
              /*
               * Load the block and de-interleave it. shufps 136 (0x88) keeps
               * the even-indexed dwords of a register pair, 221 (0xDD) the
               * odd-indexed ones:
               *   xmm4 = m0, m2, m4, m6      xmm5 = m1, m3, m5, m7
               */
 1910         movups  xmm4, xmmword ptr [rsi]
 1911         movups  xmm5, xmmword ptr [rsi+0x10]
 1912         movaps  xmm8, xmm4
 1913         shufps  xmm4, xmm5, 136
 1914         shufps  xmm8, xmm5, 221
 1915         movaps  xmm5, xmm8
              /*
               * Same split for the second half, then pshufd 0x93 rotates the
               * lanes up by one:
               *   xmm6 = m14, m8, m10, m12   xmm7 = m15, m9, m11, m13
               */
 1916         movups  xmm6, xmmword ptr [rsi+0x20]
 1917         movups  xmm7, xmmword ptr [rsi+0x30]
 1918         movaps  xmm8, xmm6
 1919         shufps  xmm6, xmm7, 136
 1920         pshufd  xmm6, xmm6, 0x93
 1921         shufps  xmm8, xmm7, 221
 1922         pshufd  xmm7, xmm8, 0x93
              /* pshufb masks: xmm14 rotates each dword right by 8, xmm15 by 16. */
 1923         movaps  xmm14, xmmword ptr [ROT8+rip]
 1924         movaps  xmm15, xmmword ptr [ROT16+rip]
              /* al = rounds left; the loop body below runs 7 times. The upper bits of rax are not read again. */
 1925         mov     al, 7
 1926 9:
              /* Columns, first half: a += xmm4 + b; d = (d ^ a) ror 16; c += d; b = (b ^ c) ror 12 */
 1927         paddd   xmm0, xmm4
 1928         paddd   xmm0, xmm1
 1929         pxor    xmm3, xmm0
 1930         pshufb  xmm3, xmm15
 1931         paddd   xmm2, xmm3
 1932         pxor    xmm1, xmm2
 1933         movdqa  xmm11, xmm1
 1934         pslld   xmm1, 20
 1935         psrld   xmm11, 12
 1936         por     xmm1, xmm11
              /* Columns, second half: a += xmm5 + b; d = (d ^ a) ror 8; c += d; b = (b ^ c) ror 7 */
 1937         paddd   xmm0, xmm5
 1938         paddd   xmm0, xmm1
 1939         pxor    xmm3, xmm0
 1940         pshufb  xmm3, xmm14
 1941         paddd   xmm2, xmm3
 1942         pxor    xmm1, xmm2
 1943         movdqa  xmm11, xmm1
 1944         pslld   xmm1, 25
 1945         psrld   xmm11, 7
 1946         por     xmm1, xmm11
              /* Rotate the lanes of rows a, d, c (b stays) so the diagonals sit in columns. */
 1947         pshufd  xmm0, xmm0, 0x93
 1948         pshufd  xmm3, xmm3, 0x4E
 1949         pshufd  xmm2, xmm2, 0x39
              /* Diagonals, first half: same mixing as above with message words from xmm6. */
 1950         paddd   xmm0, xmm6
 1951         paddd   xmm0, xmm1
 1952         pxor    xmm3, xmm0
 1953         pshufb  xmm3, xmm15
 1954         paddd   xmm2, xmm3
 1955         pxor    xmm1, xmm2
 1956         movdqa  xmm11, xmm1
 1957         pslld   xmm1, 20
 1958         psrld   xmm11, 12
 1959         por     xmm1, xmm11
              /* Diagonals, second half: message words from xmm7. */
 1960         paddd   xmm0, xmm7
 1961         paddd   xmm0, xmm1
 1962         pxor    xmm3, xmm0
 1963         pshufb  xmm3, xmm14
 1964         paddd   xmm2, xmm3
 1965         pxor    xmm1, xmm2
 1966         movdqa  xmm11, xmm1
 1967         pslld   xmm1, 25
 1968         psrld   xmm11, 7
 1969         por     xmm1, xmm11
              /* Undo the lane rotation (inverse of the three shuffles above). */
 1970         pshufd  xmm0, xmm0, 0x39
 1971         pshufd  xmm3, xmm3, 0x4E
 1972         pshufd  xmm2, xmm2, 0x93
              /* After the 7th round leave directly; no further permutation is needed. */
 1973         dec     al
 1974         jz      9f
              /*
               * Permute the message words held in xmm4-xmm7 into the order
               * the next round consumes; xmm8 and xmm9 are scratch.
               */
 1975         movdqa  xmm8, xmm4
 1976         shufps  xmm8, xmm5, 214
 1977         pshufd  xmm9, xmm4, 0x0F
 1978         pshufd  xmm4, xmm8, 0x39
 1979         movdqa  xmm8, xmm6
 1980         shufps  xmm8, xmm7, 250
 1981         pblendw xmm9, xmm8, 0xCC
 1982         movdqa  xmm8, xmm7
 1983         punpcklqdq xmm8, xmm5
 1984         pblendw xmm8, xmm6, 0xC0
 1985         pshufd  xmm8, xmm8, 0x78
 1986         punpckhdq xmm5, xmm7
 1987         punpckldq xmm6, xmm5
 1988         pshufd  xmm7, xmm6, 0x1E
 1989         movdqa  xmm5, xmm9
 1990         movdqa  xmm6, xmm8
 1991         jmp     9b
 1992 9:
              /* The message registers are free now; reload the untouched input state into xmm4/xmm5. */
 1993         movdqu  xmm4, xmmword ptr [rdi]
 1994         movdqu  xmm5, xmmword ptr [rdi+0x10]
              /* Output bytes 0-31: rows a, b folded with rows c, d. */
 1995         pxor    xmm0, xmm2
 1996         pxor    xmm1, xmm3
              /* Output bytes 32-63: rows c, d folded with the input state. */
 1997         pxor    xmm2, xmm4
 1998         pxor    xmm3, xmm5
 1999         movups  xmmword ptr [r9], xmm0
 2000         movups  xmmword ptr [r9+0x10], xmm1
 2001         movups  xmmword ptr [r9+0x20], xmm2
 2002         movups  xmmword ptr [r9+0x30], xmm3
 2003         RET
 2004 SET_SIZE(zfs_blake3_compress_xof_sse41)
 2005 
 2006 SECTION_STATIC
 2007 
 2008 .p2align  6
      /*
       * Constant tables. The block starts on a 64-byte boundary and every
       * table is exactly 16 bytes, so each label is 16-byte aligned; the
       * movaps loads of BLAKE3_IV, ROT8 and ROT16 depend on that. Keep the
       * sizes intact when editing.
       */
      /* Four 32-bit IV words; loaded as state row c (xmm2) by the compress routines. */
 2009 BLAKE3_IV:
 2010         .long  0x6A09E667, 0xBB67AE85
 2011         .long  0x3C6EF372, 0xA54FF53A
      /* pshufb mask: rotates every 32-bit lane by 16 bits. */
 2012 ROT16:
 2013         .byte  2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13
      /* pshufb mask: rotates every 32-bit lane right by 8 bits. */
 2014 ROT8:
 2015         .byte  1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12
      /*
       * Lane offsets 0..3 and a step of 4 per lane.
       * NOTE(review): presumably the per-lane counter increments of the
       * 4-way hash_many path; their use is outside this part of the file.
       */
 2016 ADD0:
 2017         .long  0, 1, 2, 3
 2018 ADD1:
 2019         .long  4, 4, 4, 4
      /*
       * Each IV word replicated into all four lanes.
       * NOTE(review): presumably for the 4-way parallel path; not referenced
       * in this part of the file.
       */
 2020 BLAKE3_IV_0:
 2021         .long  0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667
 2022 BLAKE3_IV_1:
 2023         .long  0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85
 2024 BLAKE3_IV_2:
 2025         .long  0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372
 2026 BLAKE3_IV_3:
 2027         .long  0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A
      /* Block length 64 in every lane; hash_many's single-input path reads one dword of it. */
 2028 BLAKE3_BLOCK_LEN:
 2029         .long  64, 64, 64, 64
      /*
       * Sign bit set in every lane.
       * NOTE(review): not referenced in this part of the file; the name
       * suggests it biases values for a signed compare - confirm at the use site.
       */
 2030 CMP_MSB_MASK:
 2031         .long  0x80000000, 0x80000000, 0x80000000, 0x80000000
 2032 
 2033 #endif  /* HAVE_SSE4_1 */
 2034 
 2035 #ifdef __ELF__
      /* Empty .note.GNU-stack section: marks this object as not needing an executable stack. */
 2036 .section .note.GNU-stack,"",%progbits
 2037 #endif

Cache object: 2ebc0e1a0c092891cacf3a9774c906fa


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.