The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/contrib/openzfs/module/icp/asm-x86_64/blake3/blake3_sse2.S

Version: -  FREEBSD  -  FREEBSD-13-STABLE  -  FREEBSD-13-0  -  FREEBSD-12-STABLE  -  FREEBSD-12-0  -  FREEBSD-11-STABLE  -  FREEBSD-11-0  -  FREEBSD-10-STABLE  -  FREEBSD-10-0  -  FREEBSD-9-STABLE  -  FREEBSD-9-0  -  FREEBSD-8-STABLE  -  FREEBSD-8-0  -  FREEBSD-7-STABLE  -  FREEBSD-7-0  -  FREEBSD-6-STABLE  -  FREEBSD-6-0  -  FREEBSD-5-STABLE  -  FREEBSD-5-0  -  FREEBSD-4-STABLE  -  FREEBSD-3-STABLE  -  FREEBSD22  -  l41  -  OPENBSD  -  linux-2.6  -  MK84  -  PLAN9  -  xnu-8792 
SearchContext: -  none  -  3  -  10 

    1 /*
    2  * CDDL HEADER START
    3  *
    4  * The contents of this file are subject to the terms of the
    5  * Common Development and Distribution License (the "License").
    6  * You may not use this file except in compliance with the License.
    7  *
    8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
    9  * or https://opensource.org/licenses/CDDL-1.0.
   10  * See the License for the specific language governing permissions
   11  * and limitations under the License.
   12  *
   13  * When distributing Covered Code, include this CDDL HEADER in each
   14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
   15  * If applicable, add the following below this CDDL HEADER, with the
   16  * fields enclosed by brackets "[]" replaced with your own identifying
   17  * information: Portions Copyright [yyyy] [name of copyright owner]
   18  *
   19  * CDDL HEADER END
   20  */
   21 
   22 /*
   23  * Based on BLAKE3 v1.3.1, https://github.com/BLAKE3-team/BLAKE3
   24  * Copyright (c) 2019-2020 Samuel Neves and Matthew Krupcale
   25  * Copyright (c) 2022 Tino Reichardt <milky-zfs@mcmilk.de>
   26  */
   27 
   28 #if defined(HAVE_SSE2)  /* everything that follows is conditional on HAVE_SSE2 being defined */
   29 
   30 #define _ASM  /* set before the include below; presumably selects the assembler-only parts of that header -- TODO confirm */
   31 #include <sys/asm_linkage.h>  /* presumably supplies ENTRY_ALIGN, ENDBR and SECTION_TEXT, none of which are defined in this file -- TODO confirm */
   32 
   33 .intel_syntax noprefix  /* GNU as: Intel operand order (destination first), register names written without a '%' prefix */
   34 
   35 SECTION_TEXT  /* macro defined outside this file; by its name it selects the code (text) section -- TODO confirm */
   36 
   37 ENTRY_ALIGN(zfs_blake3_hash_many_sse2, 64)
   38         ENDBR
   39         push    r15
   40         push    r14
   41         push    r13
   42         push    r12
   43         push    rbx
   44         push    rbp
   45         mov     rbp, rsp
   46         sub     rsp, 360
   47         and     rsp, 0xFFFFFFFFFFFFFFC0
   48         neg     r9d
   49         movd    xmm0, r9d
   50         pshufd  xmm0, xmm0, 0x00
   51         movdqa  xmmword ptr [rsp+0x130], xmm0
   52         movdqa  xmm1, xmm0
   53         pand    xmm1, xmmword ptr [ADD0+rip]
   54         pand    xmm0, xmmword ptr [ADD1+rip]
   55         movdqa  xmmword ptr [rsp+0x150], xmm0
   56         movd    xmm0, r8d
   57         pshufd  xmm0, xmm0, 0x00
   58         paddd   xmm0, xmm1
   59         movdqa  xmmword ptr [rsp+0x110], xmm0
   60         pxor    xmm0, xmmword ptr [CMP_MSB_MASK+rip]
   61         pxor    xmm1, xmmword ptr [CMP_MSB_MASK+rip]
   62         pcmpgtd xmm1, xmm0
   63         shr     r8, 32
   64         movd    xmm2, r8d
   65         pshufd  xmm2, xmm2, 0x00
   66         psubd   xmm2, xmm1
   67         movdqa  xmmword ptr [rsp+0x120], xmm2
   68         mov     rbx, qword ptr [rbp+0x50]
   69         mov     r15, rdx
   70         shl     r15, 6
   71         movzx   r13d, byte ptr [rbp+0x38]
   72         movzx   r12d, byte ptr [rbp+0x48]
   73         cmp     rsi, 4
   74         jc      3f
   75 2:
   76         movdqu  xmm3, xmmword ptr [rcx]
   77         pshufd  xmm0, xmm3, 0x00
   78         pshufd  xmm1, xmm3, 0x55
   79         pshufd  xmm2, xmm3, 0xAA
   80         pshufd  xmm3, xmm3, 0xFF
   81         movdqu  xmm7, xmmword ptr [rcx+0x10]
   82         pshufd  xmm4, xmm7, 0x00
   83         pshufd  xmm5, xmm7, 0x55
   84         pshufd  xmm6, xmm7, 0xAA
   85         pshufd  xmm7, xmm7, 0xFF
   86         mov     r8, qword ptr [rdi]
   87         mov     r9, qword ptr [rdi+0x8]
   88         mov     r10, qword ptr [rdi+0x10]
   89         mov     r11, qword ptr [rdi+0x18]
   90         movzx   eax, byte ptr [rbp+0x40]
   91         or      eax, r13d
   92         xor     edx, edx
   93 9:
   94         mov     r14d, eax
   95         or      eax, r12d
   96         add     rdx, 64
   97         cmp     rdx, r15
   98         cmovne  eax, r14d
   99         movdqu  xmm8, xmmword ptr [r8+rdx-0x40]
  100         movdqu  xmm9, xmmword ptr [r9+rdx-0x40]
  101         movdqu  xmm10, xmmword ptr [r10+rdx-0x40]
  102         movdqu  xmm11, xmmword ptr [r11+rdx-0x40]
  103         movdqa  xmm12, xmm8
  104         punpckldq xmm8, xmm9
  105         punpckhdq xmm12, xmm9
  106         movdqa  xmm14, xmm10
  107         punpckldq xmm10, xmm11
  108         punpckhdq xmm14, xmm11
  109         movdqa  xmm9, xmm8
  110         punpcklqdq xmm8, xmm10
  111         punpckhqdq xmm9, xmm10
  112         movdqa  xmm13, xmm12
  113         punpcklqdq xmm12, xmm14
  114         punpckhqdq xmm13, xmm14
  115         movdqa  xmmword ptr [rsp], xmm8
  116         movdqa  xmmword ptr [rsp+0x10], xmm9
  117         movdqa  xmmword ptr [rsp+0x20], xmm12
  118         movdqa  xmmword ptr [rsp+0x30], xmm13
  119         movdqu  xmm8, xmmword ptr [r8+rdx-0x30]
  120         movdqu  xmm9, xmmword ptr [r9+rdx-0x30]
  121         movdqu  xmm10, xmmword ptr [r10+rdx-0x30]
  122         movdqu  xmm11, xmmword ptr [r11+rdx-0x30]
  123         movdqa  xmm12, xmm8
  124         punpckldq xmm8, xmm9
  125         punpckhdq xmm12, xmm9
  126         movdqa  xmm14, xmm10
  127         punpckldq xmm10, xmm11
  128         punpckhdq xmm14, xmm11
  129         movdqa  xmm9, xmm8
  130         punpcklqdq xmm8, xmm10
  131         punpckhqdq xmm9, xmm10
  132         movdqa  xmm13, xmm12
  133         punpcklqdq xmm12, xmm14
  134         punpckhqdq xmm13, xmm14
  135         movdqa  xmmword ptr [rsp+0x40], xmm8
  136         movdqa  xmmword ptr [rsp+0x50], xmm9
  137         movdqa  xmmword ptr [rsp+0x60], xmm12
  138         movdqa  xmmword ptr [rsp+0x70], xmm13
  139         movdqu  xmm8, xmmword ptr [r8+rdx-0x20]
  140         movdqu  xmm9, xmmword ptr [r9+rdx-0x20]
  141         movdqu  xmm10, xmmword ptr [r10+rdx-0x20]
  142         movdqu  xmm11, xmmword ptr [r11+rdx-0x20]
  143         movdqa  xmm12, xmm8
  144         punpckldq xmm8, xmm9
  145         punpckhdq xmm12, xmm9
  146         movdqa  xmm14, xmm10
  147         punpckldq xmm10, xmm11
  148         punpckhdq xmm14, xmm11
  149         movdqa  xmm9, xmm8
  150         punpcklqdq xmm8, xmm10
  151         punpckhqdq xmm9, xmm10
  152         movdqa  xmm13, xmm12
  153         punpcklqdq xmm12, xmm14
  154         punpckhqdq xmm13, xmm14
  155         movdqa  xmmword ptr [rsp+0x80], xmm8
  156         movdqa  xmmword ptr [rsp+0x90], xmm9
  157         movdqa  xmmword ptr [rsp+0xA0], xmm12
  158         movdqa  xmmword ptr [rsp+0xB0], xmm13
  159         movdqu  xmm8, xmmword ptr [r8+rdx-0x10]
  160         movdqu  xmm9, xmmword ptr [r9+rdx-0x10]
  161         movdqu  xmm10, xmmword ptr [r10+rdx-0x10]
  162         movdqu  xmm11, xmmword ptr [r11+rdx-0x10]
  163         movdqa  xmm12, xmm8
  164         punpckldq xmm8, xmm9
  165         punpckhdq xmm12, xmm9
  166         movdqa  xmm14, xmm10
  167         punpckldq xmm10, xmm11
  168         punpckhdq xmm14, xmm11
  169         movdqa  xmm9, xmm8
  170         punpcklqdq xmm8, xmm10
  171         punpckhqdq xmm9, xmm10
  172         movdqa  xmm13, xmm12
  173         punpcklqdq xmm12, xmm14
  174         punpckhqdq xmm13, xmm14
  175         movdqa  xmmword ptr [rsp+0xC0], xmm8
  176         movdqa  xmmword ptr [rsp+0xD0], xmm9
  177         movdqa  xmmword ptr [rsp+0xE0], xmm12
  178         movdqa  xmmword ptr [rsp+0xF0], xmm13
  179         movdqa  xmm9, xmmword ptr [BLAKE3_IV_1+rip]
  180         movdqa  xmm10, xmmword ptr [BLAKE3_IV_2+rip]
  181         movdqa  xmm11, xmmword ptr [BLAKE3_IV_3+rip]
  182         movdqa  xmm12, xmmword ptr [rsp+0x110]
  183         movdqa  xmm13, xmmword ptr [rsp+0x120]
  184         movdqa  xmm14, xmmword ptr [BLAKE3_BLOCK_LEN+rip]
  185         movd    xmm15, eax
  186         pshufd  xmm15, xmm15, 0x00
  187         prefetcht0 [r8+rdx+0x80]
  188         prefetcht0 [r9+rdx+0x80]
  189         prefetcht0 [r10+rdx+0x80]
  190         prefetcht0 [r11+rdx+0x80]
  191         paddd   xmm0, xmmword ptr [rsp]
  192         paddd   xmm1, xmmword ptr [rsp+0x20]
  193         paddd   xmm2, xmmword ptr [rsp+0x40]
  194         paddd   xmm3, xmmword ptr [rsp+0x60]
  195         paddd   xmm0, xmm4
  196         paddd   xmm1, xmm5
  197         paddd   xmm2, xmm6
  198         paddd   xmm3, xmm7
  199         pxor    xmm12, xmm0
  200         pxor    xmm13, xmm1
  201         pxor    xmm14, xmm2
  202         pxor    xmm15, xmm3
  203         pshuflw xmm12, xmm12, 0xB1
  204         pshufhw xmm12, xmm12, 0xB1
  205         pshuflw xmm13, xmm13, 0xB1
  206         pshufhw xmm13, xmm13, 0xB1
  207         pshuflw xmm14, xmm14, 0xB1
  208         pshufhw xmm14, xmm14, 0xB1
  209         pshuflw xmm15, xmm15, 0xB1
  210         pshufhw xmm15, xmm15, 0xB1
  211         movdqa  xmm8, xmmword ptr [BLAKE3_IV_0+rip]
  212         paddd   xmm8, xmm12
  213         paddd   xmm9, xmm13
  214         paddd   xmm10, xmm14
  215         paddd   xmm11, xmm15
  216         pxor    xmm4, xmm8
  217         pxor    xmm5, xmm9
  218         pxor    xmm6, xmm10
  219         pxor    xmm7, xmm11
  220         movdqa  xmmword ptr [rsp+0x100], xmm8
  221         movdqa  xmm8, xmm4
  222         psrld   xmm8, 12
  223         pslld   xmm4, 20
  224         por     xmm4, xmm8
  225         movdqa  xmm8, xmm5
  226         psrld   xmm8, 12
  227         pslld   xmm5, 20
  228         por     xmm5, xmm8
  229         movdqa  xmm8, xmm6
  230         psrld   xmm8, 12
  231         pslld   xmm6, 20
  232         por     xmm6, xmm8
  233         movdqa  xmm8, xmm7
  234         psrld   xmm8, 12
  235         pslld   xmm7, 20
  236         por     xmm7, xmm8
  237         paddd   xmm0, xmmword ptr [rsp+0x10]
  238         paddd   xmm1, xmmword ptr [rsp+0x30]
  239         paddd   xmm2, xmmword ptr [rsp+0x50]
  240         paddd   xmm3, xmmword ptr [rsp+0x70]
  241         paddd   xmm0, xmm4
  242         paddd   xmm1, xmm5
  243         paddd   xmm2, xmm6
  244         paddd   xmm3, xmm7
  245         pxor    xmm12, xmm0
  246         pxor    xmm13, xmm1
  247         pxor    xmm14, xmm2
  248         pxor    xmm15, xmm3
  249         movdqa  xmm8, xmm12
  250         psrld   xmm12, 8
  251         pslld   xmm8, 24
  252         pxor    xmm12, xmm8
  253         movdqa  xmm8, xmm13
  254         psrld   xmm13, 8
  255         pslld   xmm8, 24
  256         pxor    xmm13, xmm8
  257         movdqa  xmm8, xmm14
  258         psrld   xmm14, 8
  259         pslld   xmm8, 24
  260         pxor    xmm14, xmm8
  261         movdqa  xmm8, xmm15
  262         psrld   xmm15, 8
  263         pslld   xmm8, 24
  264         pxor    xmm15, xmm8
  265         movdqa  xmm8, xmmword ptr [rsp+0x100]
  266         paddd   xmm8, xmm12
  267         paddd   xmm9, xmm13
  268         paddd   xmm10, xmm14
  269         paddd   xmm11, xmm15
  270         pxor    xmm4, xmm8
  271         pxor    xmm5, xmm9
  272         pxor    xmm6, xmm10
  273         pxor    xmm7, xmm11
  274         movdqa  xmmword ptr [rsp+0x100], xmm8
  275         movdqa  xmm8, xmm4
  276         psrld   xmm8, 7
  277         pslld   xmm4, 25
  278         por     xmm4, xmm8
  279         movdqa  xmm8, xmm5
  280         psrld   xmm8, 7
  281         pslld   xmm5, 25
  282         por     xmm5, xmm8
  283         movdqa  xmm8, xmm6
  284         psrld   xmm8, 7
  285         pslld   xmm6, 25
  286         por     xmm6, xmm8
  287         movdqa  xmm8, xmm7
  288         psrld   xmm8, 7
  289         pslld   xmm7, 25
  290         por     xmm7, xmm8
  291         paddd   xmm0, xmmword ptr [rsp+0x80]
  292         paddd   xmm1, xmmword ptr [rsp+0xA0]
  293         paddd   xmm2, xmmword ptr [rsp+0xC0]
  294         paddd   xmm3, xmmword ptr [rsp+0xE0]
  295         paddd   xmm0, xmm5
  296         paddd   xmm1, xmm6
  297         paddd   xmm2, xmm7
  298         paddd   xmm3, xmm4
  299         pxor    xmm15, xmm0
  300         pxor    xmm12, xmm1
  301         pxor    xmm13, xmm2
  302         pxor    xmm14, xmm3
  303         pshuflw xmm15, xmm15, 0xB1
  304         pshufhw xmm15, xmm15, 0xB1
  305         pshuflw xmm12, xmm12, 0xB1
  306         pshufhw xmm12, xmm12, 0xB1
  307         pshuflw xmm13, xmm13, 0xB1
  308         pshufhw xmm13, xmm13, 0xB1
  309         pshuflw xmm14, xmm14, 0xB1
  310         pshufhw xmm14, xmm14, 0xB1
  311         paddd   xmm10, xmm15
  312         paddd   xmm11, xmm12
  313         movdqa  xmm8, xmmword ptr [rsp+0x100]
  314         paddd   xmm8, xmm13
  315         paddd   xmm9, xmm14
  316         pxor    xmm5, xmm10
  317         pxor    xmm6, xmm11
  318         pxor    xmm7, xmm8
  319         pxor    xmm4, xmm9
  320         movdqa  xmmword ptr [rsp+0x100], xmm8
  321         movdqa  xmm8, xmm5
  322         psrld   xmm8, 12
  323         pslld   xmm5, 20
  324         por     xmm5, xmm8
  325         movdqa  xmm8, xmm6
  326         psrld   xmm8, 12
  327         pslld   xmm6, 20
  328         por     xmm6, xmm8
  329         movdqa  xmm8, xmm7
  330         psrld   xmm8, 12
  331         pslld   xmm7, 20
  332         por     xmm7, xmm8
  333         movdqa  xmm8, xmm4
  334         psrld   xmm8, 12
  335         pslld   xmm4, 20
  336         por     xmm4, xmm8
  337         paddd   xmm0, xmmword ptr [rsp+0x90]
  338         paddd   xmm1, xmmword ptr [rsp+0xB0]
  339         paddd   xmm2, xmmword ptr [rsp+0xD0]
  340         paddd   xmm3, xmmword ptr [rsp+0xF0]
  341         paddd   xmm0, xmm5
  342         paddd   xmm1, xmm6
  343         paddd   xmm2, xmm7
  344         paddd   xmm3, xmm4
  345         pxor    xmm15, xmm0
  346         pxor    xmm12, xmm1
  347         pxor    xmm13, xmm2
  348         pxor    xmm14, xmm3
  349         movdqa  xmm8, xmm15
  350         psrld   xmm15, 8
  351         pslld   xmm8, 24
  352         pxor    xmm15, xmm8
  353         movdqa  xmm8, xmm12
  354         psrld   xmm12, 8
  355         pslld   xmm8, 24
  356         pxor    xmm12, xmm8
  357         movdqa  xmm8, xmm13
  358         psrld   xmm13, 8
  359         pslld   xmm8, 24
  360         pxor    xmm13, xmm8
  361         movdqa  xmm8, xmm14
  362         psrld   xmm14, 8
  363         pslld   xmm8, 24
  364         pxor    xmm14, xmm8
  365         paddd   xmm10, xmm15
  366         paddd   xmm11, xmm12
  367         movdqa  xmm8, xmmword ptr [rsp+0x100]
  368         paddd   xmm8, xmm13
  369         paddd   xmm9, xmm14
  370         pxor    xmm5, xmm10
  371         pxor    xmm6, xmm11
  372         pxor    xmm7, xmm8
  373         pxor    xmm4, xmm9
  374         movdqa  xmmword ptr [rsp+0x100], xmm8
  375         movdqa  xmm8, xmm5
  376         psrld   xmm8, 7
  377         pslld   xmm5, 25
  378         por     xmm5, xmm8
  379         movdqa  xmm8, xmm6
  380         psrld   xmm8, 7
  381         pslld   xmm6, 25
  382         por     xmm6, xmm8
  383         movdqa  xmm8, xmm7
  384         psrld   xmm8, 7
  385         pslld   xmm7, 25
  386         por     xmm7, xmm8
  387         movdqa  xmm8, xmm4
  388         psrld   xmm8, 7
  389         pslld   xmm4, 25
  390         por     xmm4, xmm8
  391         paddd   xmm0, xmmword ptr [rsp+0x20]
  392         paddd   xmm1, xmmword ptr [rsp+0x30]
  393         paddd   xmm2, xmmword ptr [rsp+0x70]
  394         paddd   xmm3, xmmword ptr [rsp+0x40]
  395         paddd   xmm0, xmm4
  396         paddd   xmm1, xmm5
  397         paddd   xmm2, xmm6
  398         paddd   xmm3, xmm7
  399         pxor    xmm12, xmm0
  400         pxor    xmm13, xmm1
  401         pxor    xmm14, xmm2
  402         pxor    xmm15, xmm3
  403         pshuflw xmm12, xmm12, 0xB1
  404         pshufhw xmm12, xmm12, 0xB1
  405         pshuflw xmm13, xmm13, 0xB1
  406         pshufhw xmm13, xmm13, 0xB1
  407         pshuflw xmm14, xmm14, 0xB1
  408         pshufhw xmm14, xmm14, 0xB1
  409         pshuflw xmm15, xmm15, 0xB1
  410         pshufhw xmm15, xmm15, 0xB1
  411         movdqa  xmm8, xmmword ptr [rsp+0x100]
  412         paddd   xmm8, xmm12
  413         paddd   xmm9, xmm13
  414         paddd   xmm10, xmm14
  415         paddd   xmm11, xmm15
  416         pxor    xmm4, xmm8
  417         pxor    xmm5, xmm9
  418         pxor    xmm6, xmm10
  419         pxor    xmm7, xmm11
  420         movdqa  xmmword ptr [rsp+0x100], xmm8
  421         movdqa  xmm8, xmm4
  422         psrld   xmm8, 12
  423         pslld   xmm4, 20
  424         por     xmm4, xmm8
  425         movdqa  xmm8, xmm5
  426         psrld   xmm8, 12
  427         pslld   xmm5, 20
  428         por     xmm5, xmm8
  429         movdqa  xmm8, xmm6
  430         psrld   xmm8, 12
  431         pslld   xmm6, 20
  432         por     xmm6, xmm8
  433         movdqa  xmm8, xmm7
  434         psrld   xmm8, 12
  435         pslld   xmm7, 20
  436         por     xmm7, xmm8
  437         paddd   xmm0, xmmword ptr [rsp+0x60]
  438         paddd   xmm1, xmmword ptr [rsp+0xA0]
  439         paddd   xmm2, xmmword ptr [rsp]
  440         paddd   xmm3, xmmword ptr [rsp+0xD0]
  441         paddd   xmm0, xmm4
  442         paddd   xmm1, xmm5
  443         paddd   xmm2, xmm6
  444         paddd   xmm3, xmm7
  445         pxor    xmm12, xmm0
  446         pxor    xmm13, xmm1
  447         pxor    xmm14, xmm2
  448         pxor    xmm15, xmm3
  449         movdqa  xmm8, xmm12
  450         psrld   xmm12, 8
  451         pslld   xmm8, 24
  452         pxor    xmm12, xmm8
  453         movdqa  xmm8, xmm13
  454         psrld   xmm13, 8
  455         pslld   xmm8, 24
  456         pxor    xmm13, xmm8
  457         movdqa  xmm8, xmm14
  458         psrld   xmm14, 8
  459         pslld   xmm8, 24
  460         pxor    xmm14, xmm8
  461         movdqa  xmm8, xmm15
  462         psrld   xmm15, 8
  463         pslld   xmm8, 24
  464         pxor    xmm15, xmm8
  465         movdqa  xmm8, xmmword ptr [rsp+0x100]
  466         paddd   xmm8, xmm12
  467         paddd   xmm9, xmm13
  468         paddd   xmm10, xmm14
  469         paddd   xmm11, xmm15
  470         pxor    xmm4, xmm8
  471         pxor    xmm5, xmm9
  472         pxor    xmm6, xmm10
  473         pxor    xmm7, xmm11
  474         movdqa  xmmword ptr [rsp+0x100], xmm8
  475         movdqa  xmm8, xmm4
  476         psrld   xmm8, 7
  477         pslld   xmm4, 25
  478         por     xmm4, xmm8
  479         movdqa  xmm8, xmm5
  480         psrld   xmm8, 7
  481         pslld   xmm5, 25
  482         por     xmm5, xmm8
  483         movdqa  xmm8, xmm6
  484         psrld   xmm8, 7
  485         pslld   xmm6, 25
  486         por     xmm6, xmm8
  487         movdqa  xmm8, xmm7
  488         psrld   xmm8, 7
  489         pslld   xmm7, 25
  490         por     xmm7, xmm8
  491         paddd   xmm0, xmmword ptr [rsp+0x10]
  492         paddd   xmm1, xmmword ptr [rsp+0xC0]
  493         paddd   xmm2, xmmword ptr [rsp+0x90]
  494         paddd   xmm3, xmmword ptr [rsp+0xF0]
  495         paddd   xmm0, xmm5
  496         paddd   xmm1, xmm6
  497         paddd   xmm2, xmm7
  498         paddd   xmm3, xmm4
  499         pxor    xmm15, xmm0
  500         pxor    xmm12, xmm1
  501         pxor    xmm13, xmm2
  502         pxor    xmm14, xmm3
  503         pshuflw xmm15, xmm15, 0xB1
  504         pshufhw xmm15, xmm15, 0xB1
  505         pshuflw xmm12, xmm12, 0xB1
  506         pshufhw xmm12, xmm12, 0xB1
  507         pshuflw xmm13, xmm13, 0xB1
  508         pshufhw xmm13, xmm13, 0xB1
  509         pshuflw xmm14, xmm14, 0xB1
  510         pshufhw xmm14, xmm14, 0xB1
  511         paddd   xmm10, xmm15
  512         paddd   xmm11, xmm12
  513         movdqa  xmm8, xmmword ptr [rsp+0x100]
  514         paddd   xmm8, xmm13
  515         paddd   xmm9, xmm14
  516         pxor    xmm5, xmm10
  517         pxor    xmm6, xmm11
  518         pxor    xmm7, xmm8
  519         pxor    xmm4, xmm9
  520         movdqa  xmmword ptr [rsp+0x100], xmm8
  521         movdqa  xmm8, xmm5
  522         psrld   xmm8, 12
  523         pslld   xmm5, 20
  524         por     xmm5, xmm8
  525         movdqa  xmm8, xmm6
  526         psrld   xmm8, 12
  527         pslld   xmm6, 20
  528         por     xmm6, xmm8
  529         movdqa  xmm8, xmm7
  530         psrld   xmm8, 12
  531         pslld   xmm7, 20
  532         por     xmm7, xmm8
  533         movdqa  xmm8, xmm4
  534         psrld   xmm8, 12
  535         pslld   xmm4, 20
  536         por     xmm4, xmm8
  537         paddd   xmm0, xmmword ptr [rsp+0xB0]
  538         paddd   xmm1, xmmword ptr [rsp+0x50]
  539         paddd   xmm2, xmmword ptr [rsp+0xE0]
  540         paddd   xmm3, xmmword ptr [rsp+0x80]
  541         paddd   xmm0, xmm5
  542         paddd   xmm1, xmm6
  543         paddd   xmm2, xmm7
  544         paddd   xmm3, xmm4
  545         pxor    xmm15, xmm0
  546         pxor    xmm12, xmm1
  547         pxor    xmm13, xmm2
  548         pxor    xmm14, xmm3
  549         movdqa  xmm8, xmm15
  550         psrld   xmm15, 8
  551         pslld   xmm8, 24
  552         pxor    xmm15, xmm8
  553         movdqa  xmm8, xmm12
  554         psrld   xmm12, 8
  555         pslld   xmm8, 24
  556         pxor    xmm12, xmm8
  557         movdqa  xmm8, xmm13
  558         psrld   xmm13, 8
  559         pslld   xmm8, 24
  560         pxor    xmm13, xmm8
  561         movdqa  xmm8, xmm14
  562         psrld   xmm14, 8
  563         pslld   xmm8, 24
  564         pxor    xmm14, xmm8
  565         paddd   xmm10, xmm15
  566         paddd   xmm11, xmm12
  567         movdqa  xmm8, xmmword ptr [rsp+0x100]
  568         paddd   xmm8, xmm13
  569         paddd   xmm9, xmm14
  570         pxor    xmm5, xmm10
  571         pxor    xmm6, xmm11
  572         pxor    xmm7, xmm8
  573         pxor    xmm4, xmm9
  574         movdqa  xmmword ptr [rsp+0x100], xmm8
  575         movdqa  xmm8, xmm5
  576         psrld   xmm8, 7
  577         pslld   xmm5, 25
  578         por     xmm5, xmm8
  579         movdqa  xmm8, xmm6
  580         psrld   xmm8, 7
  581         pslld   xmm6, 25
  582         por     xmm6, xmm8
  583         movdqa  xmm8, xmm7
  584         psrld   xmm8, 7
  585         pslld   xmm7, 25
  586         por     xmm7, xmm8
  587         movdqa  xmm8, xmm4
  588         psrld   xmm8, 7
  589         pslld   xmm4, 25
  590         por     xmm4, xmm8
  591         paddd   xmm0, xmmword ptr [rsp+0x30]
  592         paddd   xmm1, xmmword ptr [rsp+0xA0]
  593         paddd   xmm2, xmmword ptr [rsp+0xD0]
  594         paddd   xmm3, xmmword ptr [rsp+0x70]
  595         paddd   xmm0, xmm4
  596         paddd   xmm1, xmm5
  597         paddd   xmm2, xmm6
  598         paddd   xmm3, xmm7
  599         pxor    xmm12, xmm0
  600         pxor    xmm13, xmm1
  601         pxor    xmm14, xmm2
  602         pxor    xmm15, xmm3
  603         pshuflw xmm12, xmm12, 0xB1
  604         pshufhw xmm12, xmm12, 0xB1
  605         pshuflw xmm13, xmm13, 0xB1
  606         pshufhw xmm13, xmm13, 0xB1
  607         pshuflw xmm14, xmm14, 0xB1
  608         pshufhw xmm14, xmm14, 0xB1
  609         pshuflw xmm15, xmm15, 0xB1
  610         pshufhw xmm15, xmm15, 0xB1
  611         movdqa  xmm8, xmmword ptr [rsp+0x100]
  612         paddd   xmm8, xmm12
  613         paddd   xmm9, xmm13
  614         paddd   xmm10, xmm14
  615         paddd   xmm11, xmm15
  616         pxor    xmm4, xmm8
  617         pxor    xmm5, xmm9
  618         pxor    xmm6, xmm10
  619         pxor    xmm7, xmm11
  620         movdqa  xmmword ptr [rsp+0x100], xmm8
  621         movdqa  xmm8, xmm4
  622         psrld   xmm8, 12
  623         pslld   xmm4, 20
  624         por     xmm4, xmm8
  625         movdqa  xmm8, xmm5
  626         psrld   xmm8, 12
  627         pslld   xmm5, 20
  628         por     xmm5, xmm8
  629         movdqa  xmm8, xmm6
  630         psrld   xmm8, 12
  631         pslld   xmm6, 20
  632         por     xmm6, xmm8
  633         movdqa  xmm8, xmm7
  634         psrld   xmm8, 12
  635         pslld   xmm7, 20
  636         por     xmm7, xmm8
  637         paddd   xmm0, xmmword ptr [rsp+0x40]
  638         paddd   xmm1, xmmword ptr [rsp+0xC0]
  639         paddd   xmm2, xmmword ptr [rsp+0x20]
  640         paddd   xmm3, xmmword ptr [rsp+0xE0]
  641         paddd   xmm0, xmm4
  642         paddd   xmm1, xmm5
  643         paddd   xmm2, xmm6
  644         paddd   xmm3, xmm7
  645         pxor    xmm12, xmm0
  646         pxor    xmm13, xmm1
  647         pxor    xmm14, xmm2
  648         pxor    xmm15, xmm3
  649         movdqa  xmm8, xmm12
  650         psrld   xmm12, 8
  651         pslld   xmm8, 24
  652         pxor    xmm12, xmm8
  653         movdqa  xmm8, xmm13
  654         psrld   xmm13, 8
  655         pslld   xmm8, 24
  656         pxor    xmm13, xmm8
  657         movdqa  xmm8, xmm14
  658         psrld   xmm14, 8
  659         pslld   xmm8, 24
  660         pxor    xmm14, xmm8
  661         movdqa  xmm8, xmm15
  662         psrld   xmm15, 8
  663         pslld   xmm8, 24
  664         pxor    xmm15, xmm8
  665         movdqa  xmm8, xmmword ptr [rsp+0x100]
  666         paddd   xmm8, xmm12
  667         paddd   xmm9, xmm13
  668         paddd   xmm10, xmm14
  669         paddd   xmm11, xmm15
  670         pxor    xmm4, xmm8
  671         pxor    xmm5, xmm9
  672         pxor    xmm6, xmm10
  673         pxor    xmm7, xmm11
  674         movdqa  xmmword ptr [rsp+0x100], xmm8
  675         movdqa  xmm8, xmm4
  676         psrld   xmm8, 7
  677         pslld   xmm4, 25
  678         por     xmm4, xmm8
  679         movdqa  xmm8, xmm5
  680         psrld   xmm8, 7
  681         pslld   xmm5, 25
  682         por     xmm5, xmm8
  683         movdqa  xmm8, xmm6
  684         psrld   xmm8, 7
  685         pslld   xmm6, 25
  686         por     xmm6, xmm8
  687         movdqa  xmm8, xmm7
  688         psrld   xmm8, 7
  689         pslld   xmm7, 25
  690         por     xmm7, xmm8
  691         paddd   xmm0, xmmword ptr [rsp+0x60]
  692         paddd   xmm1, xmmword ptr [rsp+0x90]
  693         paddd   xmm2, xmmword ptr [rsp+0xB0]
  694         paddd   xmm3, xmmword ptr [rsp+0x80]
  695         paddd   xmm0, xmm5
  696         paddd   xmm1, xmm6
  697         paddd   xmm2, xmm7
  698         paddd   xmm3, xmm4
  699         pxor    xmm15, xmm0
  700         pxor    xmm12, xmm1
  701         pxor    xmm13, xmm2
  702         pxor    xmm14, xmm3
  703         pshuflw xmm15, xmm15, 0xB1
  704         pshufhw xmm15, xmm15, 0xB1
  705         pshuflw xmm12, xmm12, 0xB1
  706         pshufhw xmm12, xmm12, 0xB1
  707         pshuflw xmm13, xmm13, 0xB1
  708         pshufhw xmm13, xmm13, 0xB1
  709         pshuflw xmm14, xmm14, 0xB1
  710         pshufhw xmm14, xmm14, 0xB1
  711         paddd   xmm10, xmm15
  712         paddd   xmm11, xmm12
  713         movdqa  xmm8, xmmword ptr [rsp+0x100]
  714         paddd   xmm8, xmm13
  715         paddd   xmm9, xmm14
  716         pxor    xmm5, xmm10
  717         pxor    xmm6, xmm11
  718         pxor    xmm7, xmm8
  719         pxor    xmm4, xmm9
  720         movdqa  xmmword ptr [rsp+0x100], xmm8
  721         movdqa  xmm8, xmm5
  722         psrld   xmm8, 12
  723         pslld   xmm5, 20
  724         por     xmm5, xmm8
  725         movdqa  xmm8, xmm6
  726         psrld   xmm8, 12
  727         pslld   xmm6, 20
  728         por     xmm6, xmm8
  729         movdqa  xmm8, xmm7
  730         psrld   xmm8, 12
  731         pslld   xmm7, 20
  732         por     xmm7, xmm8
  733         movdqa  xmm8, xmm4
  734         psrld   xmm8, 12
  735         pslld   xmm4, 20
  736         por     xmm4, xmm8
  737         paddd   xmm0, xmmword ptr [rsp+0x50]
  738         paddd   xmm1, xmmword ptr [rsp]
  739         paddd   xmm2, xmmword ptr [rsp+0xF0]
  740         paddd   xmm3, xmmword ptr [rsp+0x10]
  741         paddd   xmm0, xmm5
  742         paddd   xmm1, xmm6
  743         paddd   xmm2, xmm7
  744         paddd   xmm3, xmm4
  745         pxor    xmm15, xmm0
  746         pxor    xmm12, xmm1
  747         pxor    xmm13, xmm2
  748         pxor    xmm14, xmm3
  749         movdqa  xmm8, xmm15
  750         psrld   xmm15, 8
  751         pslld   xmm8, 24
  752         pxor    xmm15, xmm8
  753         movdqa  xmm8, xmm12
  754         psrld   xmm12, 8
  755         pslld   xmm8, 24
  756         pxor    xmm12, xmm8
  757         movdqa  xmm8, xmm13
  758         psrld   xmm13, 8
  759         pslld   xmm8, 24
  760         pxor    xmm13, xmm8
  761         movdqa  xmm8, xmm14
  762         psrld   xmm14, 8
  763         pslld   xmm8, 24
  764         pxor    xmm14, xmm8
  765         paddd   xmm10, xmm15
  766         paddd   xmm11, xmm12
  767         movdqa  xmm8, xmmword ptr [rsp+0x100]
  768         paddd   xmm8, xmm13
  769         paddd   xmm9, xmm14
  770         pxor    xmm5, xmm10
  771         pxor    xmm6, xmm11
  772         pxor    xmm7, xmm8
  773         pxor    xmm4, xmm9
  774         movdqa  xmmword ptr [rsp+0x100], xmm8
  775         movdqa  xmm8, xmm5
  776         psrld   xmm8, 7
  777         pslld   xmm5, 25
  778         por     xmm5, xmm8
  779         movdqa  xmm8, xmm6
  780         psrld   xmm8, 7
  781         pslld   xmm6, 25
  782         por     xmm6, xmm8
  783         movdqa  xmm8, xmm7
  784         psrld   xmm8, 7
  785         pslld   xmm7, 25
  786         por     xmm7, xmm8
  787         movdqa  xmm8, xmm4
  788         psrld   xmm8, 7
  789         pslld   xmm4, 25
  790         por     xmm4, xmm8
  791         paddd   xmm0, xmmword ptr [rsp+0xA0]
  792         paddd   xmm1, xmmword ptr [rsp+0xC0]
  793         paddd   xmm2, xmmword ptr [rsp+0xE0]
  794         paddd   xmm3, xmmword ptr [rsp+0xD0]
  795         paddd   xmm0, xmm4
  796         paddd   xmm1, xmm5
  797         paddd   xmm2, xmm6
  798         paddd   xmm3, xmm7
  799         pxor    xmm12, xmm0
  800         pxor    xmm13, xmm1
  801         pxor    xmm14, xmm2
  802         pxor    xmm15, xmm3
  803         pshuflw xmm12, xmm12, 0xB1
  804         pshufhw xmm12, xmm12, 0xB1
  805         pshuflw xmm13, xmm13, 0xB1
  806         pshufhw xmm13, xmm13, 0xB1
  807         pshuflw xmm14, xmm14, 0xB1
  808         pshufhw xmm14, xmm14, 0xB1
  809         pshuflw xmm15, xmm15, 0xB1
  810         pshufhw xmm15, xmm15, 0xB1
  811         movdqa  xmm8, xmmword ptr [rsp+0x100]
  812         paddd   xmm8, xmm12
  813         paddd   xmm9, xmm13
  814         paddd   xmm10, xmm14
  815         paddd   xmm11, xmm15
  816         pxor    xmm4, xmm8
  817         pxor    xmm5, xmm9
  818         pxor    xmm6, xmm10
  819         pxor    xmm7, xmm11
  820         movdqa  xmmword ptr [rsp+0x100], xmm8
  821         movdqa  xmm8, xmm4
  822         psrld   xmm8, 12
  823         pslld   xmm4, 20
  824         por     xmm4, xmm8
  825         movdqa  xmm8, xmm5
  826         psrld   xmm8, 12
  827         pslld   xmm5, 20
  828         por     xmm5, xmm8
  829         movdqa  xmm8, xmm6
  830         psrld   xmm8, 12
  831         pslld   xmm6, 20
  832         por     xmm6, xmm8
  833         movdqa  xmm8, xmm7
  834         psrld   xmm8, 12
  835         pslld   xmm7, 20
  836         por     xmm7, xmm8
  837         paddd   xmm0, xmmword ptr [rsp+0x70]
  838         paddd   xmm1, xmmword ptr [rsp+0x90]
  839         paddd   xmm2, xmmword ptr [rsp+0x30]
  840         paddd   xmm3, xmmword ptr [rsp+0xF0]
  841         paddd   xmm0, xmm4
  842         paddd   xmm1, xmm5
  843         paddd   xmm2, xmm6
  844         paddd   xmm3, xmm7
  845         pxor    xmm12, xmm0
  846         pxor    xmm13, xmm1
  847         pxor    xmm14, xmm2
  848         pxor    xmm15, xmm3
  849         movdqa  xmm8, xmm12
  850         psrld   xmm12, 8
  851         pslld   xmm8, 24
  852         pxor    xmm12, xmm8
  853         movdqa  xmm8, xmm13
  854         psrld   xmm13, 8
  855         pslld   xmm8, 24
  856         pxor    xmm13, xmm8
  857         movdqa  xmm8, xmm14
  858         psrld   xmm14, 8
  859         pslld   xmm8, 24
  860         pxor    xmm14, xmm8
  861         movdqa  xmm8, xmm15
  862         psrld   xmm15, 8
  863         pslld   xmm8, 24
  864         pxor    xmm15, xmm8
  865         movdqa  xmm8, xmmword ptr [rsp+0x100]
  866         paddd   xmm8, xmm12
  867         paddd   xmm9, xmm13
  868         paddd   xmm10, xmm14
  869         paddd   xmm11, xmm15
  870         pxor    xmm4, xmm8
  871         pxor    xmm5, xmm9
  872         pxor    xmm6, xmm10
  873         pxor    xmm7, xmm11
  874         movdqa  xmmword ptr [rsp+0x100], xmm8
  875         movdqa  xmm8, xmm4
  876         psrld   xmm8, 7
  877         pslld   xmm4, 25
  878         por     xmm4, xmm8
  879         movdqa  xmm8, xmm5
  880         psrld   xmm8, 7
  881         pslld   xmm5, 25
  882         por     xmm5, xmm8
  883         movdqa  xmm8, xmm6
  884         psrld   xmm8, 7
  885         pslld   xmm6, 25
  886         por     xmm6, xmm8
  887         movdqa  xmm8, xmm7
  888         psrld   xmm8, 7
  889         pslld   xmm7, 25
  890         por     xmm7, xmm8
  891         paddd   xmm0, xmmword ptr [rsp+0x40]
  892         paddd   xmm1, xmmword ptr [rsp+0xB0]
  893         paddd   xmm2, xmmword ptr [rsp+0x50]
  894         paddd   xmm3, xmmword ptr [rsp+0x10]
  895         paddd   xmm0, xmm5
  896         paddd   xmm1, xmm6
  897         paddd   xmm2, xmm7
  898         paddd   xmm3, xmm4
  899         pxor    xmm15, xmm0
  900         pxor    xmm12, xmm1
  901         pxor    xmm13, xmm2
  902         pxor    xmm14, xmm3
  903         pshuflw xmm15, xmm15, 0xB1
  904         pshufhw xmm15, xmm15, 0xB1
  905         pshuflw xmm12, xmm12, 0xB1
  906         pshufhw xmm12, xmm12, 0xB1
  907         pshuflw xmm13, xmm13, 0xB1
  908         pshufhw xmm13, xmm13, 0xB1
  909         pshuflw xmm14, xmm14, 0xB1
  910         pshufhw xmm14, xmm14, 0xB1
  911         paddd   xmm10, xmm15
  912         paddd   xmm11, xmm12
  913         movdqa  xmm8, xmmword ptr [rsp+0x100]
  914         paddd   xmm8, xmm13
  915         paddd   xmm9, xmm14
  916         pxor    xmm5, xmm10
  917         pxor    xmm6, xmm11
  918         pxor    xmm7, xmm8
  919         pxor    xmm4, xmm9
  920         movdqa  xmmword ptr [rsp+0x100], xmm8
  921         movdqa  xmm8, xmm5
  922         psrld   xmm8, 12
  923         pslld   xmm5, 20
  924         por     xmm5, xmm8
  925         movdqa  xmm8, xmm6
  926         psrld   xmm8, 12
  927         pslld   xmm6, 20
  928         por     xmm6, xmm8
  929         movdqa  xmm8, xmm7
  930         psrld   xmm8, 12
  931         pslld   xmm7, 20
  932         por     xmm7, xmm8
  933         movdqa  xmm8, xmm4
  934         psrld   xmm8, 12
  935         pslld   xmm4, 20
  936         por     xmm4, xmm8
  937         paddd   xmm0, xmmword ptr [rsp]
  938         paddd   xmm1, xmmword ptr [rsp+0x20]
  939         paddd   xmm2, xmmword ptr [rsp+0x80]
  940         paddd   xmm3, xmmword ptr [rsp+0x60]
  941         paddd   xmm0, xmm5
  942         paddd   xmm1, xmm6
  943         paddd   xmm2, xmm7
  944         paddd   xmm3, xmm4
  945         pxor    xmm15, xmm0
  946         pxor    xmm12, xmm1
  947         pxor    xmm13, xmm2
  948         pxor    xmm14, xmm3
  949         movdqa  xmm8, xmm15
  950         psrld   xmm15, 8
  951         pslld   xmm8, 24
  952         pxor    xmm15, xmm8
  953         movdqa  xmm8, xmm12
  954         psrld   xmm12, 8
  955         pslld   xmm8, 24
  956         pxor    xmm12, xmm8
  957         movdqa  xmm8, xmm13
  958         psrld   xmm13, 8
  959         pslld   xmm8, 24
  960         pxor    xmm13, xmm8
  961         movdqa  xmm8, xmm14
  962         psrld   xmm14, 8
  963         pslld   xmm8, 24
  964         pxor    xmm14, xmm8
  965         paddd   xmm10, xmm15
  966         paddd   xmm11, xmm12
  967         movdqa  xmm8, xmmword ptr [rsp+0x100]
  968         paddd   xmm8, xmm13
  969         paddd   xmm9, xmm14
  970         pxor    xmm5, xmm10
  971         pxor    xmm6, xmm11
  972         pxor    xmm7, xmm8
  973         pxor    xmm4, xmm9
  974         movdqa  xmmword ptr [rsp+0x100], xmm8
  975         movdqa  xmm8, xmm5
  976         psrld   xmm8, 7
  977         pslld   xmm5, 25
  978         por     xmm5, xmm8
  979         movdqa  xmm8, xmm6
  980         psrld   xmm8, 7
  981         pslld   xmm6, 25
  982         por     xmm6, xmm8
  983         movdqa  xmm8, xmm7
  984         psrld   xmm8, 7
  985         pslld   xmm7, 25
  986         por     xmm7, xmm8
  987         movdqa  xmm8, xmm4
  988         psrld   xmm8, 7
  989         pslld   xmm4, 25
  990         por     xmm4, xmm8
  991         paddd   xmm0, xmmword ptr [rsp+0xC0]
  992         paddd   xmm1, xmmword ptr [rsp+0x90]
  993         paddd   xmm2, xmmword ptr [rsp+0xF0]
  994         paddd   xmm3, xmmword ptr [rsp+0xE0]
  995         paddd   xmm0, xmm4
  996         paddd   xmm1, xmm5
  997         paddd   xmm2, xmm6
  998         paddd   xmm3, xmm7
  999         pxor    xmm12, xmm0
 1000         pxor    xmm13, xmm1
 1001         pxor    xmm14, xmm2
 1002         pxor    xmm15, xmm3
 1003         pshuflw xmm12, xmm12, 0xB1
 1004         pshufhw xmm12, xmm12, 0xB1
 1005         pshuflw xmm13, xmm13, 0xB1
 1006         pshufhw xmm13, xmm13, 0xB1
 1007         pshuflw xmm14, xmm14, 0xB1
 1008         pshufhw xmm14, xmm14, 0xB1
 1009         pshuflw xmm15, xmm15, 0xB1
 1010         pshufhw xmm15, xmm15, 0xB1
 1011         movdqa  xmm8, xmmword ptr [rsp+0x100]
 1012         paddd   xmm8, xmm12
 1013         paddd   xmm9, xmm13
 1014         paddd   xmm10, xmm14
 1015         paddd   xmm11, xmm15
 1016         pxor    xmm4, xmm8
 1017         pxor    xmm5, xmm9
 1018         pxor    xmm6, xmm10
 1019         pxor    xmm7, xmm11
 1020         movdqa  xmmword ptr [rsp+0x100], xmm8
 1021         movdqa  xmm8, xmm4
 1022         psrld   xmm8, 12
 1023         pslld   xmm4, 20
 1024         por     xmm4, xmm8
 1025         movdqa  xmm8, xmm5
 1026         psrld   xmm8, 12
 1027         pslld   xmm5, 20
 1028         por     xmm5, xmm8
 1029         movdqa  xmm8, xmm6
 1030         psrld   xmm8, 12
 1031         pslld   xmm6, 20
 1032         por     xmm6, xmm8
 1033         movdqa  xmm8, xmm7
 1034         psrld   xmm8, 12
 1035         pslld   xmm7, 20
 1036         por     xmm7, xmm8
 1037         paddd   xmm0, xmmword ptr [rsp+0xD0]
 1038         paddd   xmm1, xmmword ptr [rsp+0xB0]
 1039         paddd   xmm2, xmmword ptr [rsp+0xA0]
 1040         paddd   xmm3, xmmword ptr [rsp+0x80]
 1041         paddd   xmm0, xmm4
 1042         paddd   xmm1, xmm5
 1043         paddd   xmm2, xmm6
 1044         paddd   xmm3, xmm7
 1045         pxor    xmm12, xmm0
 1046         pxor    xmm13, xmm1
 1047         pxor    xmm14, xmm2
 1048         pxor    xmm15, xmm3
 1049         movdqa  xmm8, xmm12
 1050         psrld   xmm12, 8
 1051         pslld   xmm8, 24
 1052         pxor    xmm12, xmm8
 1053         movdqa  xmm8, xmm13
 1054         psrld   xmm13, 8
 1055         pslld   xmm8, 24
 1056         pxor    xmm13, xmm8
 1057         movdqa  xmm8, xmm14
 1058         psrld   xmm14, 8
 1059         pslld   xmm8, 24
 1060         pxor    xmm14, xmm8
 1061         movdqa  xmm8, xmm15
 1062         psrld   xmm15, 8
 1063         pslld   xmm8, 24
 1064         pxor    xmm15, xmm8
 1065         movdqa  xmm8, xmmword ptr [rsp+0x100]
 1066         paddd   xmm8, xmm12
 1067         paddd   xmm9, xmm13
 1068         paddd   xmm10, xmm14
 1069         paddd   xmm11, xmm15
 1070         pxor    xmm4, xmm8
 1071         pxor    xmm5, xmm9
 1072         pxor    xmm6, xmm10
 1073         pxor    xmm7, xmm11
 1074         movdqa  xmmword ptr [rsp+0x100], xmm8
 1075         movdqa  xmm8, xmm4
 1076         psrld   xmm8, 7
 1077         pslld   xmm4, 25
 1078         por     xmm4, xmm8
 1079         movdqa  xmm8, xmm5
 1080         psrld   xmm8, 7
 1081         pslld   xmm5, 25
 1082         por     xmm5, xmm8
 1083         movdqa  xmm8, xmm6
 1084         psrld   xmm8, 7
 1085         pslld   xmm6, 25
 1086         por     xmm6, xmm8
 1087         movdqa  xmm8, xmm7
 1088         psrld   xmm8, 7
 1089         pslld   xmm7, 25
 1090         por     xmm7, xmm8
 1091         paddd   xmm0, xmmword ptr [rsp+0x70]
 1092         paddd   xmm1, xmmword ptr [rsp+0x50]
 1093         paddd   xmm2, xmmword ptr [rsp]
 1094         paddd   xmm3, xmmword ptr [rsp+0x60]
 1095         paddd   xmm0, xmm5
 1096         paddd   xmm1, xmm6
 1097         paddd   xmm2, xmm7
 1098         paddd   xmm3, xmm4
 1099         pxor    xmm15, xmm0
 1100         pxor    xmm12, xmm1
 1101         pxor    xmm13, xmm2
 1102         pxor    xmm14, xmm3
 1103         pshuflw xmm15, xmm15, 0xB1
 1104         pshufhw xmm15, xmm15, 0xB1
 1105         pshuflw xmm12, xmm12, 0xB1
 1106         pshufhw xmm12, xmm12, 0xB1
 1107         pshuflw xmm13, xmm13, 0xB1
 1108         pshufhw xmm13, xmm13, 0xB1
 1109         pshuflw xmm14, xmm14, 0xB1
 1110         pshufhw xmm14, xmm14, 0xB1
 1111         paddd   xmm10, xmm15
 1112         paddd   xmm11, xmm12
 1113         movdqa  xmm8, xmmword ptr [rsp+0x100]
 1114         paddd   xmm8, xmm13
 1115         paddd   xmm9, xmm14
 1116         pxor    xmm5, xmm10
 1117         pxor    xmm6, xmm11
 1118         pxor    xmm7, xmm8
 1119         pxor    xmm4, xmm9
 1120         movdqa  xmmword ptr [rsp+0x100], xmm8
 1121         movdqa  xmm8, xmm5
 1122         psrld   xmm8, 12
 1123         pslld   xmm5, 20
 1124         por     xmm5, xmm8
 1125         movdqa  xmm8, xmm6
 1126         psrld   xmm8, 12
 1127         pslld   xmm6, 20
 1128         por     xmm6, xmm8
 1129         movdqa  xmm8, xmm7
 1130         psrld   xmm8, 12
 1131         pslld   xmm7, 20
 1132         por     xmm7, xmm8
 1133         movdqa  xmm8, xmm4
 1134         psrld   xmm8, 12
 1135         pslld   xmm4, 20
 1136         por     xmm4, xmm8
 1137         paddd   xmm0, xmmword ptr [rsp+0x20]
 1138         paddd   xmm1, xmmword ptr [rsp+0x30]
 1139         paddd   xmm2, xmmword ptr [rsp+0x10]
 1140         paddd   xmm3, xmmword ptr [rsp+0x40]
 1141         paddd   xmm0, xmm5
 1142         paddd   xmm1, xmm6
 1143         paddd   xmm2, xmm7
 1144         paddd   xmm3, xmm4
 1145         pxor    xmm15, xmm0
 1146         pxor    xmm12, xmm1
 1147         pxor    xmm13, xmm2
 1148         pxor    xmm14, xmm3
 1149         movdqa  xmm8, xmm15
 1150         psrld   xmm15, 8
 1151         pslld   xmm8, 24
 1152         pxor    xmm15, xmm8
 1153         movdqa  xmm8, xmm12
 1154         psrld   xmm12, 8
 1155         pslld   xmm8, 24
 1156         pxor    xmm12, xmm8
 1157         movdqa  xmm8, xmm13
 1158         psrld   xmm13, 8
 1159         pslld   xmm8, 24
 1160         pxor    xmm13, xmm8
 1161         movdqa  xmm8, xmm14
 1162         psrld   xmm14, 8
 1163         pslld   xmm8, 24
 1164         pxor    xmm14, xmm8
 1165         paddd   xmm10, xmm15
 1166         paddd   xmm11, xmm12
 1167         movdqa  xmm8, xmmword ptr [rsp+0x100]
 1168         paddd   xmm8, xmm13
 1169         paddd   xmm9, xmm14
 1170         pxor    xmm5, xmm10
 1171         pxor    xmm6, xmm11
 1172         pxor    xmm7, xmm8
 1173         pxor    xmm4, xmm9
 1174         movdqa  xmmword ptr [rsp+0x100], xmm8
 1175         movdqa  xmm8, xmm5
 1176         psrld   xmm8, 7
 1177         pslld   xmm5, 25
 1178         por     xmm5, xmm8
 1179         movdqa  xmm8, xmm6
 1180         psrld   xmm8, 7
 1181         pslld   xmm6, 25
 1182         por     xmm6, xmm8
 1183         movdqa  xmm8, xmm7
 1184         psrld   xmm8, 7
 1185         pslld   xmm7, 25
 1186         por     xmm7, xmm8
 1187         movdqa  xmm8, xmm4
 1188         psrld   xmm8, 7
 1189         pslld   xmm4, 25
 1190         por     xmm4, xmm8
 1191         paddd   xmm0, xmmword ptr [rsp+0x90]
 1192         paddd   xmm1, xmmword ptr [rsp+0xB0]
 1193         paddd   xmm2, xmmword ptr [rsp+0x80]
 1194         paddd   xmm3, xmmword ptr [rsp+0xF0]
 1195         paddd   xmm0, xmm4
 1196         paddd   xmm1, xmm5
 1197         paddd   xmm2, xmm6
 1198         paddd   xmm3, xmm7
 1199         pxor    xmm12, xmm0
 1200         pxor    xmm13, xmm1
 1201         pxor    xmm14, xmm2
 1202         pxor    xmm15, xmm3
 1203         pshuflw xmm12, xmm12, 0xB1
 1204         pshufhw xmm12, xmm12, 0xB1
 1205         pshuflw xmm13, xmm13, 0xB1
 1206         pshufhw xmm13, xmm13, 0xB1
 1207         pshuflw xmm14, xmm14, 0xB1
 1208         pshufhw xmm14, xmm14, 0xB1
 1209         pshuflw xmm15, xmm15, 0xB1
 1210         pshufhw xmm15, xmm15, 0xB1
 1211         movdqa  xmm8, xmmword ptr [rsp+0x100]
 1212         paddd   xmm8, xmm12
 1213         paddd   xmm9, xmm13
 1214         paddd   xmm10, xmm14
 1215         paddd   xmm11, xmm15
 1216         pxor    xmm4, xmm8
 1217         pxor    xmm5, xmm9
 1218         pxor    xmm6, xmm10
 1219         pxor    xmm7, xmm11
 1220         movdqa  xmmword ptr [rsp+0x100], xmm8
 1221         movdqa  xmm8, xmm4
 1222         psrld   xmm8, 12
 1223         pslld   xmm4, 20
 1224         por     xmm4, xmm8
 1225         movdqa  xmm8, xmm5
 1226         psrld   xmm8, 12
 1227         pslld   xmm5, 20
 1228         por     xmm5, xmm8
 1229         movdqa  xmm8, xmm6
 1230         psrld   xmm8, 12
 1231         pslld   xmm6, 20
 1232         por     xmm6, xmm8
 1233         movdqa  xmm8, xmm7
 1234         psrld   xmm8, 12
 1235         pslld   xmm7, 20
 1236         por     xmm7, xmm8
 1237         paddd   xmm0, xmmword ptr [rsp+0xE0]
 1238         paddd   xmm1, xmmword ptr [rsp+0x50]
 1239         paddd   xmm2, xmmword ptr [rsp+0xC0]
 1240         paddd   xmm3, xmmword ptr [rsp+0x10]
 1241         paddd   xmm0, xmm4
 1242         paddd   xmm1, xmm5
 1243         paddd   xmm2, xmm6
 1244         paddd   xmm3, xmm7
 1245         pxor    xmm12, xmm0
 1246         pxor    xmm13, xmm1
 1247         pxor    xmm14, xmm2
 1248         pxor    xmm15, xmm3
 1249         movdqa  xmm8, xmm12
 1250         psrld   xmm12, 8
 1251         pslld   xmm8, 24
 1252         pxor    xmm12, xmm8
 1253         movdqa  xmm8, xmm13
 1254         psrld   xmm13, 8
 1255         pslld   xmm8, 24
 1256         pxor    xmm13, xmm8
 1257         movdqa  xmm8, xmm14
 1258         psrld   xmm14, 8
 1259         pslld   xmm8, 24
 1260         pxor    xmm14, xmm8
 1261         movdqa  xmm8, xmm15
 1262         psrld   xmm15, 8
 1263         pslld   xmm8, 24
 1264         pxor    xmm15, xmm8
 1265         movdqa  xmm8, xmmword ptr [rsp+0x100]
 1266         paddd   xmm8, xmm12
 1267         paddd   xmm9, xmm13
 1268         paddd   xmm10, xmm14
 1269         paddd   xmm11, xmm15
 1270         pxor    xmm4, xmm8
 1271         pxor    xmm5, xmm9
 1272         pxor    xmm6, xmm10
 1273         pxor    xmm7, xmm11
 1274         movdqa  xmmword ptr [rsp+0x100], xmm8
 1275         movdqa  xmm8, xmm4
 1276         psrld   xmm8, 7
 1277         pslld   xmm4, 25
 1278         por     xmm4, xmm8
 1279         movdqa  xmm8, xmm5
 1280         psrld   xmm8, 7
 1281         pslld   xmm5, 25
 1282         por     xmm5, xmm8
 1283         movdqa  xmm8, xmm6
 1284         psrld   xmm8, 7
 1285         pslld   xmm6, 25
 1286         por     xmm6, xmm8
 1287         movdqa  xmm8, xmm7
 1288         psrld   xmm8, 7
 1289         pslld   xmm7, 25
 1290         por     xmm7, xmm8
 1291         paddd   xmm0, xmmword ptr [rsp+0xD0]
 1292         paddd   xmm1, xmmword ptr [rsp]
 1293         paddd   xmm2, xmmword ptr [rsp+0x20]
 1294         paddd   xmm3, xmmword ptr [rsp+0x40]
 1295         paddd   xmm0, xmm5
 1296         paddd   xmm1, xmm6
 1297         paddd   xmm2, xmm7
 1298         paddd   xmm3, xmm4
 1299         pxor    xmm15, xmm0
 1300         pxor    xmm12, xmm1
 1301         pxor    xmm13, xmm2
 1302         pxor    xmm14, xmm3
 1303         pshuflw xmm15, xmm15, 0xB1
 1304         pshufhw xmm15, xmm15, 0xB1
 1305         pshuflw xmm12, xmm12, 0xB1
 1306         pshufhw xmm12, xmm12, 0xB1
 1307         pshuflw xmm13, xmm13, 0xB1
 1308         pshufhw xmm13, xmm13, 0xB1
 1309         pshuflw xmm14, xmm14, 0xB1
 1310         pshufhw xmm14, xmm14, 0xB1
 1311         paddd   xmm10, xmm15
 1312         paddd   xmm11, xmm12
 1313         movdqa  xmm8, xmmword ptr [rsp+0x100]
 1314         paddd   xmm8, xmm13
 1315         paddd   xmm9, xmm14
 1316         pxor    xmm5, xmm10
 1317         pxor    xmm6, xmm11
 1318         pxor    xmm7, xmm8
 1319         pxor    xmm4, xmm9
 1320         movdqa  xmmword ptr [rsp+0x100], xmm8
 1321         movdqa  xmm8, xmm5
 1322         psrld   xmm8, 12
 1323         pslld   xmm5, 20
 1324         por     xmm5, xmm8
 1325         movdqa  xmm8, xmm6
 1326         psrld   xmm8, 12
 1327         pslld   xmm6, 20
 1328         por     xmm6, xmm8
 1329         movdqa  xmm8, xmm7
 1330         psrld   xmm8, 12
 1331         pslld   xmm7, 20
 1332         por     xmm7, xmm8
 1333         movdqa  xmm8, xmm4
 1334         psrld   xmm8, 12
 1335         pslld   xmm4, 20
 1336         por     xmm4, xmm8
 1337         paddd   xmm0, xmmword ptr [rsp+0x30]
 1338         paddd   xmm1, xmmword ptr [rsp+0xA0]
 1339         paddd   xmm2, xmmword ptr [rsp+0x60]
 1340         paddd   xmm3, xmmword ptr [rsp+0x70]
 1341         paddd   xmm0, xmm5
 1342         paddd   xmm1, xmm6
 1343         paddd   xmm2, xmm7
 1344         paddd   xmm3, xmm4
 1345         pxor    xmm15, xmm0
 1346         pxor    xmm12, xmm1
 1347         pxor    xmm13, xmm2
 1348         pxor    xmm14, xmm3
 1349         movdqa  xmm8, xmm15
 1350         psrld   xmm15, 8
 1351         pslld   xmm8, 24
 1352         pxor    xmm15, xmm8
 1353         movdqa  xmm8, xmm12
 1354         psrld   xmm12, 8
 1355         pslld   xmm8, 24
 1356         pxor    xmm12, xmm8
 1357         movdqa  xmm8, xmm13
 1358         psrld   xmm13, 8
 1359         pslld   xmm8, 24
 1360         pxor    xmm13, xmm8
 1361         movdqa  xmm8, xmm14
 1362         psrld   xmm14, 8
 1363         pslld   xmm8, 24
 1364         pxor    xmm14, xmm8
 1365         paddd   xmm10, xmm15
 1366         paddd   xmm11, xmm12
 1367         movdqa  xmm8, xmmword ptr [rsp+0x100]
 1368         paddd   xmm8, xmm13
 1369         paddd   xmm9, xmm14
 1370         pxor    xmm5, xmm10
 1371         pxor    xmm6, xmm11
 1372         pxor    xmm7, xmm8
 1373         pxor    xmm4, xmm9
 1374         movdqa  xmmword ptr [rsp+0x100], xmm8
 1375         movdqa  xmm8, xmm5
 1376         psrld   xmm8, 7
 1377         pslld   xmm5, 25
 1378         por     xmm5, xmm8
 1379         movdqa  xmm8, xmm6
 1380         psrld   xmm8, 7
 1381         pslld   xmm6, 25
 1382         por     xmm6, xmm8
 1383         movdqa  xmm8, xmm7
 1384         psrld   xmm8, 7
 1385         pslld   xmm7, 25
 1386         por     xmm7, xmm8
 1387         movdqa  xmm8, xmm4
 1388         psrld   xmm8, 7
 1389         pslld   xmm4, 25
 1390         por     xmm4, xmm8
 1391         paddd   xmm0, xmmword ptr [rsp+0xB0]
 1392         paddd   xmm1, xmmword ptr [rsp+0x50]
 1393         paddd   xmm2, xmmword ptr [rsp+0x10]
 1394         paddd   xmm3, xmmword ptr [rsp+0x80]
 1395         paddd   xmm0, xmm4
 1396         paddd   xmm1, xmm5
 1397         paddd   xmm2, xmm6
 1398         paddd   xmm3, xmm7
 1399         pxor    xmm12, xmm0
 1400         pxor    xmm13, xmm1
 1401         pxor    xmm14, xmm2
 1402         pxor    xmm15, xmm3
 1403         pshuflw xmm12, xmm12, 0xB1
 1404         pshufhw xmm12, xmm12, 0xB1
 1405         pshuflw xmm13, xmm13, 0xB1
 1406         pshufhw xmm13, xmm13, 0xB1
 1407         pshuflw xmm14, xmm14, 0xB1
 1408         pshufhw xmm14, xmm14, 0xB1
 1409         pshuflw xmm15, xmm15, 0xB1
 1410         pshufhw xmm15, xmm15, 0xB1
 1411         movdqa  xmm8, xmmword ptr [rsp+0x100]
 1412         paddd   xmm8, xmm12
 1413         paddd   xmm9, xmm13
 1414         paddd   xmm10, xmm14
 1415         paddd   xmm11, xmm15
 1416         pxor    xmm4, xmm8
 1417         pxor    xmm5, xmm9
 1418         pxor    xmm6, xmm10
 1419         pxor    xmm7, xmm11
 1420         movdqa  xmmword ptr [rsp+0x100], xmm8
 1421         movdqa  xmm8, xmm4
 1422         psrld   xmm8, 12
 1423         pslld   xmm4, 20
 1424         por     xmm4, xmm8
 1425         movdqa  xmm8, xmm5
 1426         psrld   xmm8, 12
 1427         pslld   xmm5, 20
 1428         por     xmm5, xmm8
 1429         movdqa  xmm8, xmm6
 1430         psrld   xmm8, 12
 1431         pslld   xmm6, 20
 1432         por     xmm6, xmm8
 1433         movdqa  xmm8, xmm7
 1434         psrld   xmm8, 12
 1435         pslld   xmm7, 20
 1436         por     xmm7, xmm8
 1437         paddd   xmm0, xmmword ptr [rsp+0xF0]
 1438         paddd   xmm1, xmmword ptr [rsp]
 1439         paddd   xmm2, xmmword ptr [rsp+0x90]
 1440         paddd   xmm3, xmmword ptr [rsp+0x60]
 1441         paddd   xmm0, xmm4
 1442         paddd   xmm1, xmm5
 1443         paddd   xmm2, xmm6
 1444         paddd   xmm3, xmm7
 1445         pxor    xmm12, xmm0
 1446         pxor    xmm13, xmm1
 1447         pxor    xmm14, xmm2
 1448         pxor    xmm15, xmm3
 1449         movdqa  xmm8, xmm12
 1450         psrld   xmm12, 8
 1451         pslld   xmm8, 24
 1452         pxor    xmm12, xmm8
 1453         movdqa  xmm8, xmm13
 1454         psrld   xmm13, 8
 1455         pslld   xmm8, 24
 1456         pxor    xmm13, xmm8
 1457         movdqa  xmm8, xmm14
 1458         psrld   xmm14, 8
 1459         pslld   xmm8, 24
 1460         pxor    xmm14, xmm8
 1461         movdqa  xmm8, xmm15
 1462         psrld   xmm15, 8
 1463         pslld   xmm8, 24
 1464         pxor    xmm15, xmm8
 1465         movdqa  xmm8, xmmword ptr [rsp+0x100]
 1466         paddd   xmm8, xmm12
 1467         paddd   xmm9, xmm13
 1468         paddd   xmm10, xmm14
 1469         paddd   xmm11, xmm15
 1470         pxor    xmm4, xmm8
 1471         pxor    xmm5, xmm9
 1472         pxor    xmm6, xmm10
 1473         pxor    xmm7, xmm11
 1474         movdqa  xmmword ptr [rsp+0x100], xmm8
 1475         movdqa  xmm8, xmm4
 1476         psrld   xmm8, 7
 1477         pslld   xmm4, 25
 1478         por     xmm4, xmm8
 1479         movdqa  xmm8, xmm5
 1480         psrld   xmm8, 7
 1481         pslld   xmm5, 25
 1482         por     xmm5, xmm8
 1483         movdqa  xmm8, xmm6
 1484         psrld   xmm8, 7
 1485         pslld   xmm6, 25
 1486         por     xmm6, xmm8
 1487         movdqa  xmm8, xmm7
 1488         psrld   xmm8, 7
 1489         pslld   xmm7, 25
 1490         por     xmm7, xmm8
 1491         paddd   xmm0, xmmword ptr [rsp+0xE0]
 1492         paddd   xmm1, xmmword ptr [rsp+0x20]
 1493         paddd   xmm2, xmmword ptr [rsp+0x30]
 1494         paddd   xmm3, xmmword ptr [rsp+0x70]
 1495         paddd   xmm0, xmm5
 1496         paddd   xmm1, xmm6
 1497         paddd   xmm2, xmm7
 1498         paddd   xmm3, xmm4
 1499         pxor    xmm15, xmm0
 1500         pxor    xmm12, xmm1
 1501         pxor    xmm13, xmm2
 1502         pxor    xmm14, xmm3
 1503         pshuflw xmm15, xmm15, 0xB1
 1504         pshufhw xmm15, xmm15, 0xB1
 1505         pshuflw xmm12, xmm12, 0xB1
 1506         pshufhw xmm12, xmm12, 0xB1
 1507         pshuflw xmm13, xmm13, 0xB1
 1508         pshufhw xmm13, xmm13, 0xB1
 1509         pshuflw xmm14, xmm14, 0xB1
 1510         pshufhw xmm14, xmm14, 0xB1
 1511         paddd   xmm10, xmm15
 1512         paddd   xmm11, xmm12
 1513         movdqa  xmm8, xmmword ptr [rsp+0x100]
 1514         paddd   xmm8, xmm13
 1515         paddd   xmm9, xmm14
 1516         pxor    xmm5, xmm10
 1517         pxor    xmm6, xmm11
 1518         pxor    xmm7, xmm8
 1519         pxor    xmm4, xmm9
 1520         movdqa  xmmword ptr [rsp+0x100], xmm8
 1521         movdqa  xmm8, xmm5
 1522         psrld   xmm8, 12
 1523         pslld   xmm5, 20
 1524         por     xmm5, xmm8
 1525         movdqa  xmm8, xmm6
 1526         psrld   xmm8, 12
 1527         pslld   xmm6, 20
 1528         por     xmm6, xmm8
 1529         movdqa  xmm8, xmm7
 1530         psrld   xmm8, 12
 1531         pslld   xmm7, 20
 1532         por     xmm7, xmm8
 1533         movdqa  xmm8, xmm4
 1534         psrld   xmm8, 12
 1535         pslld   xmm4, 20
 1536         por     xmm4, xmm8
 1537         paddd   xmm0, xmmword ptr [rsp+0xA0]
 1538         paddd   xmm1, xmmword ptr [rsp+0xC0]
 1539         paddd   xmm2, xmmword ptr [rsp+0x40]
 1540         paddd   xmm3, xmmword ptr [rsp+0xD0]
 1541         paddd   xmm0, xmm5
 1542         paddd   xmm1, xmm6
 1543         paddd   xmm2, xmm7
 1544         paddd   xmm3, xmm4
 1545         pxor    xmm15, xmm0
 1546         pxor    xmm12, xmm1
 1547         pxor    xmm13, xmm2
 1548         pxor    xmm14, xmm3
 1549         movdqa  xmm8, xmm15
 1550         psrld   xmm15, 8
 1551         pslld   xmm8, 24
 1552         pxor    xmm15, xmm8
 1553         movdqa  xmm8, xmm12
 1554         psrld   xmm12, 8
 1555         pslld   xmm8, 24
 1556         pxor    xmm12, xmm8
 1557         movdqa  xmm8, xmm13
 1558         psrld   xmm13, 8
 1559         pslld   xmm8, 24
 1560         pxor    xmm13, xmm8
 1561         movdqa  xmm8, xmm14
 1562         psrld   xmm14, 8
 1563         pslld   xmm8, 24
 1564         pxor    xmm14, xmm8
 1565         paddd   xmm10, xmm15
 1566         paddd   xmm11, xmm12
 1567         movdqa  xmm8, xmmword ptr [rsp+0x100]
 1568         paddd   xmm8, xmm13
 1569         paddd   xmm9, xmm14
 1570         pxor    xmm5, xmm10
 1571         pxor    xmm6, xmm11
 1572         pxor    xmm7, xmm8
 1573         pxor    xmm4, xmm9
 1574         pxor    xmm0, xmm8
 1575         pxor    xmm1, xmm9
 1576         pxor    xmm2, xmm10
 1577         pxor    xmm3, xmm11
 1578         movdqa  xmm8, xmm5
 1579         psrld   xmm8, 7
 1580         pslld   xmm5, 25
 1581         por     xmm5, xmm8
 1582         movdqa  xmm8, xmm6
 1583         psrld   xmm8, 7
 1584         pslld   xmm6, 25
 1585         por     xmm6, xmm8
 1586         movdqa  xmm8, xmm7
 1587         psrld   xmm8, 7
 1588         pslld   xmm7, 25
 1589         por     xmm7, xmm8
 1590         movdqa  xmm8, xmm4
 1591         psrld   xmm8, 7
 1592         pslld   xmm4, 25
 1593         por     xmm4, xmm8
 1594         pxor    xmm4, xmm12
 1595         pxor    xmm5, xmm13
 1596         pxor    xmm6, xmm14
 1597         pxor    xmm7, xmm15
 1598         mov     eax, r13d
 1599         jne     9b
 1600         movdqa  xmm9, xmm0
 1601         punpckldq xmm0, xmm1
 1602         punpckhdq xmm9, xmm1
 1603         movdqa  xmm11, xmm2
 1604         punpckldq xmm2, xmm3
 1605         punpckhdq xmm11, xmm3
 1606         movdqa  xmm1, xmm0
 1607         punpcklqdq xmm0, xmm2
 1608         punpckhqdq xmm1, xmm2
 1609         movdqa  xmm3, xmm9
 1610         punpcklqdq xmm9, xmm11
 1611         punpckhqdq xmm3, xmm11
 1612         movdqu  xmmword ptr [rbx], xmm0
 1613         movdqu  xmmword ptr [rbx+0x20], xmm1
 1614         movdqu  xmmword ptr [rbx+0x40], xmm9
 1615         movdqu  xmmword ptr [rbx+0x60], xmm3
 1616         movdqa  xmm9, xmm4
 1617         punpckldq xmm4, xmm5
 1618         punpckhdq xmm9, xmm5
 1619         movdqa  xmm11, xmm6
 1620         punpckldq xmm6, xmm7
 1621         punpckhdq xmm11, xmm7
 1622         movdqa  xmm5, xmm4
 1623         punpcklqdq xmm4, xmm6
 1624         punpckhqdq xmm5, xmm6
 1625         movdqa  xmm7, xmm9
 1626         punpcklqdq xmm9, xmm11
 1627         punpckhqdq xmm7, xmm11
 1628         movdqu  xmmword ptr [rbx+0x10], xmm4
 1629         movdqu  xmmword ptr [rbx+0x30], xmm5
 1630         movdqu  xmmword ptr [rbx+0x50], xmm9
 1631         movdqu  xmmword ptr [rbx+0x70], xmm7
 1632         movdqa  xmm1, xmmword ptr [rsp+0x110]
 1633         movdqa  xmm0, xmm1
 1634         paddd   xmm1, xmmword ptr [rsp+0x150]
 1635         movdqa  xmmword ptr [rsp+0x110], xmm1
 1636         pxor    xmm0, xmmword ptr [CMP_MSB_MASK+rip]
 1637         pxor    xmm1, xmmword ptr [CMP_MSB_MASK+rip]
 1638         pcmpgtd xmm0, xmm1
 1639         movdqa  xmm1, xmmword ptr [rsp+0x120]
 1640         psubd   xmm1, xmm0
 1641         movdqa  xmmword ptr [rsp+0x120], xmm1
 1642         add     rbx, 128
 1643         add     rdi, 32
 1644         sub     rsi, 4
 1645         cmp     rsi, 4
 1646         jnc     2b
 1647         test    rsi, rsi
 1648         jnz     3f
 1649 4:
 1650         mov     rsp, rbp
 1651         pop     rbp
 1652         pop     rbx
 1653         pop     r12
 1654         pop     r13
 1655         pop     r14
 1656         pop     r15
 1657         RET
 1658 .p2align 5
 1659 3:
 1660         test    esi, 0x2
 1661         je      3f
 1662         movups  xmm0, xmmword ptr [rcx]
 1663         movups  xmm1, xmmword ptr [rcx+0x10]
 1664         movaps  xmm8, xmm0
 1665         movaps  xmm9, xmm1
 1666         movd    xmm13, dword ptr [rsp+0x110]
 1667         movd    xmm14, dword ptr [rsp+0x120]
 1668         punpckldq xmm13, xmm14
 1669         movaps  xmmword ptr [rsp], xmm13
 1670         movd    xmm14, dword ptr [rsp+0x114]
 1671         movd    xmm13, dword ptr [rsp+0x124]
 1672         punpckldq xmm14, xmm13
 1673         movaps  xmmword ptr [rsp+0x10], xmm14
 1674         mov     r8, qword ptr [rdi]
 1675         mov     r9, qword ptr [rdi+0x8]
 1676         movzx   eax, byte ptr [rbp+0x40]
 1677         or      eax, r13d
 1678         xor     edx, edx
 1679 2:
 1680         mov     r14d, eax
 1681         or      eax, r12d
 1682         add     rdx, 64
 1683         cmp     rdx, r15
 1684         cmovne  eax, r14d
 1685         movaps  xmm2, xmmword ptr [BLAKE3_IV+rip]
 1686         movaps  xmm10, xmm2
 1687         movups  xmm4, xmmword ptr [r8+rdx-0x40]
 1688         movups  xmm5, xmmword ptr [r8+rdx-0x30]
 1689         movaps  xmm3, xmm4
 1690         shufps  xmm4, xmm5, 136
 1691         shufps  xmm3, xmm5, 221
 1692         movaps  xmm5, xmm3
 1693         movups  xmm6, xmmword ptr [r8+rdx-0x20]
 1694         movups  xmm7, xmmword ptr [r8+rdx-0x10]
 1695         movaps  xmm3, xmm6
 1696         shufps  xmm6, xmm7, 136
 1697         pshufd  xmm6, xmm6, 0x93
 1698         shufps  xmm3, xmm7, 221
 1699         pshufd  xmm7, xmm3, 0x93
 1700         movups  xmm12, xmmword ptr [r9+rdx-0x40]
 1701         movups  xmm13, xmmword ptr [r9+rdx-0x30]
 1702         movaps  xmm11, xmm12
 1703         shufps  xmm12, xmm13, 136
 1704         shufps  xmm11, xmm13, 221
 1705         movaps  xmm13, xmm11
 1706         movups  xmm14, xmmword ptr [r9+rdx-0x20]
 1707         movups  xmm15, xmmword ptr [r9+rdx-0x10]
 1708         movaps  xmm11, xmm14
 1709         shufps  xmm14, xmm15, 136
 1710         pshufd  xmm14, xmm14, 0x93
 1711         shufps  xmm11, xmm15, 221
 1712         pshufd  xmm15, xmm11, 0x93
 1713         shl     rax, 0x20
 1714         or      rax, 0x40
 1715         movq    xmm3, rax
 1716         movdqa  xmmword ptr [rsp+0x20], xmm3
 1717         movaps  xmm3, xmmword ptr [rsp]
 1718         movaps  xmm11, xmmword ptr [rsp+0x10]
 1719         punpcklqdq xmm3, xmmword ptr [rsp+0x20]
 1720         punpcklqdq xmm11, xmmword ptr [rsp+0x20]
 1721         mov     al, 7
 1722 9:
 1723         paddd   xmm0, xmm4
 1724         paddd   xmm8, xmm12
 1725         movaps  xmmword ptr [rsp+0x20], xmm4
 1726         movaps  xmmword ptr [rsp+0x30], xmm12
 1727         paddd   xmm0, xmm1
 1728         paddd   xmm8, xmm9
 1729         pxor    xmm3, xmm0
 1730         pxor    xmm11, xmm8
 1731         pshuflw xmm3, xmm3, 0xB1
 1732         pshufhw xmm3, xmm3, 0xB1
 1733         pshuflw xmm11, xmm11, 0xB1
 1734         pshufhw xmm11, xmm11, 0xB1
 1735         paddd   xmm2, xmm3
 1736         paddd   xmm10, xmm11
 1737         pxor    xmm1, xmm2
 1738         pxor    xmm9, xmm10
 1739         movdqa  xmm4, xmm1
 1740         pslld   xmm1, 20
 1741         psrld   xmm4, 12
 1742         por     xmm1, xmm4
 1743         movdqa  xmm4, xmm9
 1744         pslld   xmm9, 20
 1745         psrld   xmm4, 12
 1746         por     xmm9, xmm4
 1747         paddd   xmm0, xmm5
 1748         paddd   xmm8, xmm13
 1749         movaps  xmmword ptr [rsp+0x40], xmm5
 1750         movaps  xmmword ptr [rsp+0x50], xmm13
 1751         paddd   xmm0, xmm1
 1752         paddd   xmm8, xmm9
 1753         pxor    xmm3, xmm0
 1754         pxor    xmm11, xmm8
 1755         movdqa  xmm13, xmm3
 1756         psrld   xmm3, 8
 1757         pslld   xmm13, 24
 1758         pxor    xmm3, xmm13
 1759         movdqa  xmm13, xmm11
 1760         psrld   xmm11, 8
 1761         pslld   xmm13, 24
 1762         pxor    xmm11, xmm13
 1763         paddd   xmm2, xmm3
 1764         paddd   xmm10, xmm11
 1765         pxor    xmm1, xmm2
 1766         pxor    xmm9, xmm10
 1767         movdqa  xmm4, xmm1
 1768         pslld   xmm1, 25
 1769         psrld   xmm4, 7
 1770         por     xmm1, xmm4
 1771         movdqa  xmm4, xmm9
 1772         pslld   xmm9, 25
 1773         psrld   xmm4, 7
 1774         por     xmm9, xmm4
 1775         pshufd  xmm0, xmm0, 0x93
 1776         pshufd  xmm8, xmm8, 0x93
 1777         pshufd  xmm3, xmm3, 0x4E
 1778         pshufd  xmm11, xmm11, 0x4E
 1779         pshufd  xmm2, xmm2, 0x39
 1780         pshufd  xmm10, xmm10, 0x39
 1781         paddd   xmm0, xmm6
 1782         paddd   xmm8, xmm14
 1783         paddd   xmm0, xmm1
 1784         paddd   xmm8, xmm9
 1785         pxor    xmm3, xmm0
 1786         pxor    xmm11, xmm8
 1787         pshuflw xmm3, xmm3, 0xB1
 1788         pshufhw xmm3, xmm3, 0xB1
 1789         pshuflw xmm11, xmm11, 0xB1
 1790         pshufhw xmm11, xmm11, 0xB1
 1791         paddd   xmm2, xmm3
 1792         paddd   xmm10, xmm11
 1793         pxor    xmm1, xmm2
 1794         pxor    xmm9, xmm10
 1795         movdqa  xmm4, xmm1
 1796         pslld   xmm1, 20
 1797         psrld   xmm4, 12
 1798         por     xmm1, xmm4
 1799         movdqa  xmm4, xmm9
 1800         pslld   xmm9, 20
 1801         psrld   xmm4, 12
 1802         por     xmm9, xmm4
 1803         paddd   xmm0, xmm7
 1804         paddd   xmm8, xmm15
 1805         paddd   xmm0, xmm1
 1806         paddd   xmm8, xmm9
 1807         pxor    xmm3, xmm0
 1808         pxor    xmm11, xmm8
 1809         movdqa  xmm13, xmm3
 1810         psrld   xmm3, 8
 1811         pslld   xmm13, 24
 1812         pxor    xmm3, xmm13
 1813         movdqa  xmm13, xmm11
 1814         psrld   xmm11, 8
 1815         pslld   xmm13, 24
 1816         pxor    xmm11, xmm13
 1817         paddd   xmm2, xmm3
 1818         paddd   xmm10, xmm11
 1819         pxor    xmm1, xmm2
 1820         pxor    xmm9, xmm10
 1821         movdqa  xmm4, xmm1
 1822         pslld   xmm1, 25
 1823         psrld   xmm4, 7
 1824         por     xmm1, xmm4
 1825         movdqa  xmm4, xmm9
 1826         pslld   xmm9, 25
 1827         psrld   xmm4, 7
 1828         por     xmm9, xmm4
 1829         pshufd  xmm0, xmm0, 0x39
 1830         pshufd  xmm8, xmm8, 0x39
 1831         pshufd  xmm3, xmm3, 0x4E
 1832         pshufd  xmm11, xmm11, 0x4E
 1833         pshufd  xmm2, xmm2, 0x93
 1834         pshufd  xmm10, xmm10, 0x93
 1835         dec     al
 1836         je      9f
 1837         movdqa  xmm12, xmmword ptr [rsp+0x20]
 1838         movdqa  xmm5, xmmword ptr [rsp+0x40]
 1839         pshufd  xmm13, xmm12, 0x0F
 1840         shufps  xmm12, xmm5, 214
 1841         pshufd  xmm4, xmm12, 0x39
 1842         movdqa  xmm12, xmm6
 1843         shufps  xmm12, xmm7, 250
 1844         pand    xmm13, xmmword ptr [PBLENDW_0x33_MASK+rip]
 1845         pand    xmm12, xmmword ptr [PBLENDW_0xCC_MASK+rip]
 1846         por     xmm13, xmm12
 1847         movdqa  xmmword ptr [rsp+0x20], xmm13
 1848         movdqa  xmm12, xmm7
 1849         punpcklqdq xmm12, xmm5
 1850         movdqa  xmm13, xmm6
 1851         pand    xmm12, xmmword ptr [PBLENDW_0x3F_MASK+rip]
 1852         pand    xmm13, xmmword ptr [PBLENDW_0xC0_MASK+rip]
 1853         por     xmm12, xmm13
 1854         pshufd  xmm12, xmm12, 0x78
 1855         punpckhdq xmm5, xmm7
 1856         punpckldq xmm6, xmm5
 1857         pshufd  xmm7, xmm6, 0x1E
 1858         movdqa  xmmword ptr [rsp+0x40], xmm12
 1859         movdqa  xmm5, xmmword ptr [rsp+0x30]
 1860         movdqa  xmm13, xmmword ptr [rsp+0x50]
 1861         pshufd  xmm6, xmm5, 0x0F
 1862         shufps  xmm5, xmm13, 214
 1863         pshufd  xmm12, xmm5, 0x39
 1864         movdqa  xmm5, xmm14
 1865         shufps  xmm5, xmm15, 250
 1866         pand    xmm6, xmmword ptr [PBLENDW_0x33_MASK+rip]
 1867         pand    xmm5, xmmword ptr [PBLENDW_0xCC_MASK+rip]
 1868         por     xmm6, xmm5
 1869         movdqa  xmm5, xmm15
 1870         punpcklqdq xmm5, xmm13
 1871         movdqa  xmmword ptr [rsp+0x30], xmm2
 1872         movdqa  xmm2, xmm14
 1873         pand    xmm5, xmmword ptr [PBLENDW_0x3F_MASK+rip]
 1874         pand    xmm2, xmmword ptr [PBLENDW_0xC0_MASK+rip]
 1875         por     xmm5, xmm2
 1876         movdqa  xmm2, xmmword ptr [rsp+0x30]
 1877         pshufd  xmm5, xmm5, 0x78
 1878         punpckhdq xmm13, xmm15
 1879         punpckldq xmm14, xmm13
 1880         pshufd  xmm15, xmm14, 0x1E
 1881         movdqa  xmm13, xmm6
 1882         movdqa  xmm14, xmm5
 1883         movdqa  xmm5, xmmword ptr [rsp+0x20]
 1884         movdqa  xmm6, xmmword ptr [rsp+0x40]
 1885         jmp     9b
 1886 9:
 1887         pxor    xmm0, xmm2
 1888         pxor    xmm1, xmm3
 1889         pxor    xmm8, xmm10
 1890         pxor    xmm9, xmm11
 1891         mov     eax, r13d
 1892         cmp     rdx, r15
 1893         jne     2b
 1894         movups  xmmword ptr [rbx], xmm0
 1895         movups  xmmword ptr [rbx+0x10], xmm1
 1896         movups  xmmword ptr [rbx+0x20], xmm8
 1897         movups  xmmword ptr [rbx+0x30], xmm9
 1898         mov     eax, dword ptr [rsp+0x130]
 1899         neg     eax
 1900         mov    r10d, dword ptr [rsp+0x110+8*rax]
 1901         mov    r11d, dword ptr [rsp+0x120+8*rax]
 1902         mov dword ptr [rsp+0x110], r10d
 1903         mov dword ptr [rsp+0x120], r11d
 1904         add     rdi, 16
 1905         add     rbx, 64
 1906         sub     rsi, 2
 1907 3:
 1908         test    esi, 0x1
 1909         je      4b
 1910         movups  xmm0, xmmword ptr [rcx]
 1911         movups  xmm1, xmmword ptr [rcx+0x10]
 1912         movd    xmm13, dword ptr [rsp+0x110]
 1913         movd    xmm14, dword ptr [rsp+0x120]
 1914         punpckldq xmm13, xmm14
 1915         mov     r8, qword ptr [rdi]
 1916         movzx   eax, byte ptr [rbp+0x40]
 1917         or      eax, r13d
 1918         xor     edx, edx
 1919 2:
 1920         mov     r14d, eax
 1921         or      eax, r12d
 1922         add     rdx, 64
 1923         cmp     rdx, r15
 1924         cmovne  eax, r14d
 1925         movaps  xmm2, xmmword ptr [BLAKE3_IV+rip]
 1926         shl     rax, 32
 1927         or      rax, 64
 1928         movq    xmm12, rax
 1929         movdqa  xmm3, xmm13
 1930         punpcklqdq xmm3, xmm12
 1931         movups  xmm4, xmmword ptr [r8+rdx-0x40]
 1932         movups  xmm5, xmmword ptr [r8+rdx-0x30]
 1933         movaps  xmm8, xmm4
 1934         shufps  xmm4, xmm5, 136
 1935         shufps  xmm8, xmm5, 221
 1936         movaps  xmm5, xmm8
 1937         movups  xmm6, xmmword ptr [r8+rdx-0x20]
 1938         movups  xmm7, xmmword ptr [r8+rdx-0x10]
 1939         movaps  xmm8, xmm6
 1940         shufps  xmm6, xmm7, 136
 1941         pshufd  xmm6, xmm6, 0x93
 1942         shufps  xmm8, xmm7, 221
 1943         pshufd  xmm7, xmm8, 0x93
 1944         mov     al, 7
 1945 9:
 1946         paddd   xmm0, xmm4
 1947         paddd   xmm0, xmm1
 1948         pxor    xmm3, xmm0
 1949         pshuflw xmm3, xmm3, 0xB1
 1950         pshufhw xmm3, xmm3, 0xB1
 1951         paddd   xmm2, xmm3
 1952         pxor    xmm1, xmm2
 1953         movdqa  xmm11, xmm1
 1954         pslld   xmm1, 20
 1955         psrld   xmm11, 12
 1956         por     xmm1, xmm11
 1957         paddd   xmm0, xmm5
 1958         paddd   xmm0, xmm1
 1959         pxor    xmm3, xmm0
 1960         movdqa  xmm14, xmm3
 1961         psrld   xmm3, 8
 1962         pslld   xmm14, 24
 1963         pxor    xmm3, xmm14
 1964         paddd   xmm2, xmm3
 1965         pxor    xmm1, xmm2
 1966         movdqa  xmm11, xmm1
 1967         pslld   xmm1, 25
 1968         psrld   xmm11, 7
 1969         por     xmm1, xmm11
 1970         pshufd  xmm0, xmm0, 0x93
 1971         pshufd  xmm3, xmm3, 0x4E
 1972         pshufd  xmm2, xmm2, 0x39
 1973         paddd   xmm0, xmm6
 1974         paddd   xmm0, xmm1
 1975         pxor    xmm3, xmm0
 1976         pshuflw xmm3, xmm3, 0xB1
 1977         pshufhw xmm3, xmm3, 0xB1
 1978         paddd   xmm2, xmm3
 1979         pxor    xmm1, xmm2
 1980         movdqa  xmm11, xmm1
 1981         pslld   xmm1, 20
 1982         psrld   xmm11, 12
 1983         por     xmm1, xmm11
 1984         paddd   xmm0, xmm7
 1985         paddd   xmm0, xmm1
 1986         pxor    xmm3, xmm0
 1987         movdqa  xmm14, xmm3
 1988         psrld   xmm3, 8
 1989         pslld   xmm14, 24
 1990         pxor    xmm3, xmm14
 1991         paddd   xmm2, xmm3
 1992         pxor    xmm1, xmm2
 1993         movdqa  xmm11, xmm1
 1994         pslld   xmm1, 25
 1995         psrld   xmm11, 7
 1996         por     xmm1, xmm11
 1997         pshufd  xmm0, xmm0, 0x39
 1998         pshufd  xmm3, xmm3, 0x4E
 1999         pshufd  xmm2, xmm2, 0x93
 2000         dec     al
 2001         jz      9f
 2002         movdqa  xmm8, xmm4
 2003         shufps  xmm8, xmm5, 214
 2004         pshufd  xmm9, xmm4, 0x0F
 2005         pshufd  xmm4, xmm8, 0x39
 2006         movdqa  xmm8, xmm6
 2007         shufps  xmm8, xmm7, 250
 2008         pand    xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip]
 2009         pand    xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip]
 2010         por     xmm9, xmm8
 2011         movdqa  xmm8, xmm7
 2012         punpcklqdq xmm8, xmm5
 2013         movdqa  xmm10, xmm6
 2014         pand    xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
 2015         pand    xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
 2016         por     xmm8, xmm10
 2017         pshufd  xmm8, xmm8, 0x78
 2018         punpckhdq xmm5, xmm7
 2019         punpckldq xmm6, xmm5
 2020         pshufd  xmm7, xmm6, 0x1E
 2021         movdqa  xmm5, xmm9
 2022         movdqa  xmm6, xmm8
 2023         jmp     9b
 2024 9:
 2025         pxor    xmm0, xmm2
 2026         pxor    xmm1, xmm3
 2027         mov     eax, r13d
 2028         cmp     rdx, r15
 2029         jne     2b
 2030         movups  xmmword ptr [rbx], xmm0
 2031         movups  xmmword ptr [rbx+0x10], xmm1
 2032         jmp     4b
 2033 SET_SIZE(zfs_blake3_hash_many_sse2)
 2034 
 2035 ENTRY_ALIGN(zfs_blake3_compress_in_place_sse2, 64)
      /*
       * zfs_blake3_compress_in_place_sse2
       *
       * Runs the 7-iteration round loop below over one 64-byte message
       * block and writes the 32-byte result back over the input at [rdi].
       *
       * Inputs, as used by the code (the names follow the BLAKE3
       * compression function; TODO confirm against the C prototype):
       *   rdi = 32 bytes of chaining value, read at entry, overwritten at exit
       *   rsi = 64-byte message block (read with unaligned movups)
       *   rcx = 64-bit value placed in state row 3, dwords 0-1 (counter)
       *   rdx = value placed in state row 3, dword 2 (block length)
       *   r8  = value placed in state row 3, dword 3 (flags)
       *
       * Register roles: xmm0-xmm3 hold state rows 0-3, xmm4-xmm7 hold the
       * four message vectors, xmm8-xmm11 and xmm14 are scratch.
       * Clobbers: al, rdx, r8, xmm0-xmm11, xmm14, flags.  No stack is used.
       *
       * NOTE(review): unlike zfs_blake3_compress_xof_sse2, rdx and r8 are
       * combined at full register width without a movzx, so bits 8 and up
       * of rdx, or bits 8-31 of r8, would end up in row 3.  Presumably
       * callers always pass zero-extended values -- confirm.
       */
 2036         ENDBR
 2037         movups  xmm0, xmmword ptr [rdi]  /* row 0 = bytes 0-15 at [rdi] */
 2038         movups  xmm1, xmmword ptr [rdi+0x10]  /* row 1 = bytes 16-31 at [rdi] */
 2039         movaps  xmm2, xmmword ptr [BLAKE3_IV+rip]  /* row 2 = the four dwords at BLAKE3_IV */
 2040         shl     r8, 32
 2041         add     rdx, r8  /* rdx = rdx + (r8 << 32) */
 2042         movq    xmm3, rcx
 2043         movq    xmm4, rdx
 2044         punpcklqdq xmm3, xmm4  /* row 3 = { rcx (low qword), rdx (high qword) } */
              /*
               * Load the block as 16 dwords m0..m15 and split them into
               * even- and odd-indexed vectors:
               *   xmm4 = m0,  m2,  m4,  m6      xmm5 = m1,  m3,  m5,  m7
               *   xmm6 = m14, m8,  m10, m12     xmm7 = m15, m9,  m11, m13
               * (shufps 136 picks dwords 0,2 of each source, 221 picks 1,3;
               * pshufd 0x93 then moves every lane up by one position.)
               */
 2045         movups  xmm4, xmmword ptr [rsi]
 2046         movups  xmm5, xmmword ptr [rsi+0x10]
 2047         movaps  xmm8, xmm4
 2048         shufps  xmm4, xmm5, 136
 2049         shufps  xmm8, xmm5, 221
 2050         movaps  xmm5, xmm8
 2051         movups  xmm6, xmmword ptr [rsi+0x20]
 2052         movups  xmm7, xmmword ptr [rsi+0x30]
 2053         movaps  xmm8, xmm6
 2054         shufps  xmm6, xmm7, 136
 2055         pshufd  xmm6, xmm6, 0x93
 2056         shufps  xmm8, xmm7, 221
 2057         pshufd  xmm7, xmm8, 0x93
 2058         mov     al, 7  /* loop counter: the body at 9: executes 7 times */
 2059 9:
              /* First half of the round: mix in xmm4, then xmm5. */
 2060         paddd   xmm0, xmm4
 2061         paddd   xmm0, xmm1
 2062         pxor    xmm3, xmm0
 2063         pshuflw xmm3, xmm3, 0xB1
 2064         pshufhw xmm3, xmm3, 0xB1  /* 16-bit halves of each dword swapped = rotate by 16 */
 2065         paddd   xmm2, xmm3
 2066         pxor    xmm1, xmm2
 2067         movdqa  xmm11, xmm1
 2068         pslld   xmm1, 20
 2069         psrld   xmm11, 12
 2070         por     xmm1, xmm11  /* row 1 rotated right by 12 */
 2071         paddd   xmm0, xmm5
 2072         paddd   xmm0, xmm1
 2073         pxor    xmm3, xmm0
 2074         movdqa  xmm14, xmm3
 2075         psrld   xmm3, 8
 2076         pslld   xmm14, 24
 2077         pxor    xmm3, xmm14  /* row 3 rotated right by 8 */
 2078         paddd   xmm2, xmm3
 2079         pxor    xmm1, xmm2
 2080         movdqa  xmm11, xmm1
 2081         pslld   xmm1, 25
 2082         psrld   xmm11, 7
 2083         por     xmm1, xmm11  /* row 1 rotated right by 7 */
              /* Rotate the lanes of rows 0, 3 and 2 by one, two and three positions. */
 2084         pshufd  xmm0, xmm0, 0x93
 2085         pshufd  xmm3, xmm3, 0x4E
 2086         pshufd  xmm2, xmm2, 0x39
              /* Second half of the round: same sequence with xmm6, then xmm7. */
 2087         paddd   xmm0, xmm6
 2088         paddd   xmm0, xmm1
 2089         pxor    xmm3, xmm0
 2090         pshuflw xmm3, xmm3, 0xB1
 2091         pshufhw xmm3, xmm3, 0xB1
 2092         paddd   xmm2, xmm3
 2093         pxor    xmm1, xmm2
 2094         movdqa  xmm11, xmm1
 2095         pslld   xmm1, 20
 2096         psrld   xmm11, 12
 2097         por     xmm1, xmm11
 2098         paddd   xmm0, xmm7
 2099         paddd   xmm0, xmm1
 2100         pxor    xmm3, xmm0
 2101         movdqa  xmm14, xmm3
 2102         psrld   xmm3, 8
 2103         pslld   xmm14, 24
 2104         pxor    xmm3, xmm14
 2105         paddd   xmm2, xmm3
 2106         pxor    xmm1, xmm2
 2107         movdqa  xmm11, xmm1
 2108         pslld   xmm1, 25
 2109         psrld   xmm11, 7
 2110         por     xmm1, xmm11
              /* Undo the lane rotation applied before the second half. */
 2111         pshufd  xmm0, xmm0, 0x39
 2112         pshufd  xmm3, xmm3, 0x4E
 2113         pshufd  xmm2, xmm2, 0x93
 2114         dec     al
 2115         jz      9f  /* after the 7th iteration skip the message reshuffle */
              /*
               * Rearrange the message dwords in xmm4-xmm7 for the next
               * iteration.  Each pand/pand/por triple merges selected dwords
               * of two registers using the PBLENDW_*_MASK constants.
               * xmm5 and xmm6 are overwritten last because the steps before
               * still read their old contents.
               */
 2116         movdqa  xmm8, xmm4
 2117         shufps  xmm8, xmm5, 214
 2118         pshufd  xmm9, xmm4, 0x0F
 2119         pshufd  xmm4, xmm8, 0x39  /* next xmm4 */
 2120         movdqa  xmm8, xmm6
 2121         shufps  xmm8, xmm7, 250
 2122         pand    xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip]
 2123         pand    xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip]
 2124         por     xmm9, xmm8  /* dwords 0,2 from xmm9, dwords 1,3 from xmm8 */
 2125         movdqa  xmm8, xmm7
 2126         punpcklqdq xmm8, xmm5
 2127         movdqa  xmm10, xmm6
 2128         pand    xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
 2129         pand    xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
 2130         por     xmm8, xmm10  /* dwords 0-2 from xmm8, dword 3 from xmm6 */
 2131         pshufd  xmm8, xmm8, 0x78
 2132         punpckhdq xmm5, xmm7
 2133         punpckldq xmm6, xmm5
 2134         pshufd  xmm7, xmm6, 0x1E  /* next xmm7 */
 2135         movdqa  xmm5, xmm9  /* next xmm5 */
 2136         movdqa  xmm6, xmm8  /* next xmm6 */
 2137         jmp     9b
 2138 9:
              /* Result = (row 0 ^ row 2, row 1 ^ row 3), stored over the input at [rdi]. */
 2139         pxor    xmm0, xmm2
 2140         pxor    xmm1, xmm3
 2141         movups  xmmword ptr [rdi], xmm0
 2142         movups  xmmword ptr [rdi+0x10], xmm1
 2143         RET
 2144 SET_SIZE(zfs_blake3_compress_in_place_sse2)
 2145 
 2146 ENTRY_ALIGN(zfs_blake3_compress_xof_sse2, 64)
      /*
       * zfs_blake3_compress_xof_sse2
       *
       * Runs the same 7-iteration round loop as the in-place variant over
       * one 64-byte message block, but leaves [rdi] untouched and writes a
       * 64-byte result to [r9].
       *
       * Inputs, as used by the code (the names follow the BLAKE3
       * compression function; TODO confirm against the C prototype):
       *   rdi = 32 bytes of chaining value (read at entry and again at exit)
       *   rsi = 64-byte message block (read with unaligned movups)
       *   rcx = 64-bit value placed in state row 3, dwords 0-1 (counter)
       *   dl  = byte placed in state row 3, dword 2 (block length)
       *   r8b = byte placed in state row 3, dword 3 (flags)
       *   r9  = 64-byte output buffer (written with unaligned movups)
       *
       * Register roles: xmm0-xmm3 hold state rows 0-3, xmm4-xmm7 hold the
       * four message vectors, xmm8-xmm11 and xmm14 are scratch.
       * Clobbers: rax, rdx, xmm0-xmm11, xmm14, flags.  No stack is used.
       */
 2147         ENDBR
 2148         movups  xmm0, xmmword ptr [rdi]  /* row 0 = bytes 0-15 at [rdi] */
 2149         movups  xmm1, xmmword ptr [rdi+0x10]  /* row 1 = bytes 16-31 at [rdi] */
 2150         movaps  xmm2, xmmword ptr [BLAKE3_IV+rip]  /* row 2 = the four dwords at BLAKE3_IV */
 2151         movzx   eax, r8b  /* only the low byte of r8 is used */
 2152         movzx   edx, dl  /* only the low byte of rdx is used */
 2153         shl     rax, 32
 2154         add     rdx, rax  /* rdx = dl + (r8b << 32) */
 2155         movq    xmm3, rcx
 2156         movq    xmm4, rdx
 2157         punpcklqdq xmm3, xmm4  /* row 3 = { rcx (low qword), rdx (high qword) } */
              /*
               * Load the block as 16 dwords m0..m15 and split them into
               * even- and odd-indexed vectors:
               *   xmm4 = m0,  m2,  m4,  m6      xmm5 = m1,  m3,  m5,  m7
               *   xmm6 = m14, m8,  m10, m12     xmm7 = m15, m9,  m11, m13
               * (shufps 136 picks dwords 0,2 of each source, 221 picks 1,3;
               * pshufd 0x93 then moves every lane up by one position.)
               */
 2158         movups  xmm4, xmmword ptr [rsi]
 2159         movups  xmm5, xmmword ptr [rsi+0x10]
 2160         movaps  xmm8, xmm4
 2161         shufps  xmm4, xmm5, 136
 2162         shufps  xmm8, xmm5, 221
 2163         movaps  xmm5, xmm8
 2164         movups  xmm6, xmmword ptr [rsi+0x20]
 2165         movups  xmm7, xmmword ptr [rsi+0x30]
 2166         movaps  xmm8, xmm6
 2167         shufps  xmm6, xmm7, 136
 2168         pshufd  xmm6, xmm6, 0x93
 2169         shufps  xmm8, xmm7, 221
 2170         pshufd  xmm7, xmm8, 0x93
 2171         mov     al, 7  /* loop counter: the body at 9: executes 7 times */
 2172 9:
              /* First half of the round: mix in xmm4, then xmm5. */
 2173         paddd   xmm0, xmm4
 2174         paddd   xmm0, xmm1
 2175         pxor    xmm3, xmm0
 2176         pshuflw xmm3, xmm3, 0xB1
 2177         pshufhw xmm3, xmm3, 0xB1  /* 16-bit halves of each dword swapped = rotate by 16 */
 2178         paddd   xmm2, xmm3
 2179         pxor    xmm1, xmm2
 2180         movdqa  xmm11, xmm1
 2181         pslld   xmm1, 20
 2182         psrld   xmm11, 12
 2183         por     xmm1, xmm11  /* row 1 rotated right by 12 */
 2184         paddd   xmm0, xmm5
 2185         paddd   xmm0, xmm1
 2186         pxor    xmm3, xmm0
 2187         movdqa  xmm14, xmm3
 2188         psrld   xmm3, 8
 2189         pslld   xmm14, 24
 2190         pxor    xmm3, xmm14  /* row 3 rotated right by 8 */
 2191         paddd   xmm2, xmm3
 2192         pxor    xmm1, xmm2
 2193         movdqa  xmm11, xmm1
 2194         pslld   xmm1, 25
 2195         psrld   xmm11, 7
 2196         por     xmm1, xmm11  /* row 1 rotated right by 7 */
              /* Rotate the lanes of rows 0, 3 and 2 by one, two and three positions. */
 2197         pshufd  xmm0, xmm0, 0x93
 2198         pshufd  xmm3, xmm3, 0x4E
 2199         pshufd  xmm2, xmm2, 0x39
              /* Second half of the round: same sequence with xmm6, then xmm7. */
 2200         paddd   xmm0, xmm6
 2201         paddd   xmm0, xmm1
 2202         pxor    xmm3, xmm0
 2203         pshuflw xmm3, xmm3, 0xB1
 2204         pshufhw xmm3, xmm3, 0xB1
 2205         paddd   xmm2, xmm3
 2206         pxor    xmm1, xmm2
 2207         movdqa  xmm11, xmm1
 2208         pslld   xmm1, 20
 2209         psrld   xmm11, 12
 2210         por     xmm1, xmm11
 2211         paddd   xmm0, xmm7
 2212         paddd   xmm0, xmm1
 2213         pxor    xmm3, xmm0
 2214         movdqa  xmm14, xmm3
 2215         psrld   xmm3, 8
 2216         pslld   xmm14, 24
 2217         pxor    xmm3, xmm14
 2218         paddd   xmm2, xmm3
 2219         pxor    xmm1, xmm2
 2220         movdqa  xmm11, xmm1
 2221         pslld   xmm1, 25
 2222         psrld   xmm11, 7
 2223         por     xmm1, xmm11
              /* Undo the lane rotation applied before the second half. */
 2224         pshufd  xmm0, xmm0, 0x39
 2225         pshufd  xmm3, xmm3, 0x4E
 2226         pshufd  xmm2, xmm2, 0x93
 2227         dec     al
 2228         jz      9f  /* after the 7th iteration skip the message reshuffle */
              /*
               * Rearrange the message dwords in xmm4-xmm7 for the next
               * iteration.  Each pand/pand/por triple merges selected dwords
               * of two registers using the PBLENDW_*_MASK constants.
               * xmm5 and xmm6 are overwritten last because the steps before
               * still read their old contents.
               */
 2229         movdqa  xmm8, xmm4
 2230         shufps  xmm8, xmm5, 214
 2231         pshufd  xmm9, xmm4, 0x0F
 2232         pshufd  xmm4, xmm8, 0x39  /* next xmm4 */
 2233         movdqa  xmm8, xmm6
 2234         shufps  xmm8, xmm7, 250
 2235         pand    xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip]
 2236         pand    xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip]
 2237         por     xmm9, xmm8  /* dwords 0,2 from xmm9, dwords 1,3 from xmm8 */
 2238         movdqa  xmm8, xmm7
 2239         punpcklqdq xmm8, xmm5
 2240         movdqa  xmm10, xmm6
 2241         pand    xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
 2242         pand    xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
 2243         por     xmm8, xmm10  /* dwords 0-2 from xmm8, dword 3 from xmm6 */
 2244         pshufd  xmm8, xmm8, 0x78
 2245         punpckhdq xmm5, xmm7
 2246         punpckldq xmm6, xmm5
 2247         pshufd  xmm7, xmm6, 0x1E  /* next xmm7 */
 2248         movdqa  xmm5, xmm9  /* next xmm5 */
 2249         movdqa  xmm6, xmm8  /* next xmm6 */
 2250         jmp     9b
 2251 9:
              /*
               * Output layout at [r9]:
               *   bytes  0-15 = row 0 ^ row 2      bytes 16-31 = row 1 ^ row 3
               *   bytes 32-47 = row 2 ^ [rdi]      bytes 48-63 = row 3 ^ [rdi+0x10]
               * Rows 0 and 1 are xored first so they see rows 2 and 3 before
               * those are combined with the reloaded input words.
               */
 2252         movdqu  xmm4, xmmword ptr [rdi]
 2253         movdqu  xmm5, xmmword ptr [rdi+0x10]
 2254         pxor    xmm0, xmm2
 2255         pxor    xmm1, xmm3
 2256         pxor    xmm2, xmm4
 2257         pxor    xmm3, xmm5
 2258         movups  xmmword ptr [r9], xmm0
 2259         movups  xmmword ptr [r9+0x10], xmm1
 2260         movups  xmmword ptr [r9+0x20], xmm2
 2261         movups  xmmword ptr [r9+0x30], xmm3
 2262         RET
 2263 SET_SIZE(zfs_blake3_compress_xof_sse2)
 2264 
 2265 SECTION_STATIC
      /*
       * Constant table for the SSE2 routines.  The table starts on a 64-byte
       * boundary (.p2align 6) and every entry is exactly four dwords, so each
       * label stays 16-byte aligned; the code relies on that when it uses
       * these labels as movaps / pand memory operands.
       */
 2266 .p2align  6
      /* Four IV dwords; loaded whole as state row 2 by the compress routines. */
 2267 BLAKE3_IV:
 2268         .long  0x6A09E667, 0xBB67AE85
 2269         .long  0x3C6EF372, 0xA54FF53A
      /*
       * ADD0 .. CMP_MSB_MASK are not referenced by the two compress routines;
       * presumably they serve the multi-input path of
       * zfs_blake3_hash_many_sse2 -- confirm against that function.
       */
 2270 ADD0:
 2271         .long  0, 1, 2, 3
 2272 ADD1:
 2273         .long  4, 4, 4, 4
      /* Each of the four IV dwords above, repeated in all four lanes. */
 2274 BLAKE3_IV_0:
 2275         .long  0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667
 2276 BLAKE3_IV_1:
 2277         .long  0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85
 2278 BLAKE3_IV_2:
 2279         .long  0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372
 2280 BLAKE3_IV_3:
 2281         .long  0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A
 2282 BLAKE3_BLOCK_LEN:
 2283         .long  64, 64, 64, 64
 2284 CMP_MSB_MASK:
 2285         .long  0x80000000, 0x80000000, 0x80000000, 0x80000000
      /*
       * Dword-select masks used in pand/pand/por sequences to merge two
       * registers.  The names suggest they stand in for the SSE4.1 pblendw
       * immediates 0x33, 0xCC, 0x3F and 0xC0, which SSE2 does not provide.
       */
 2286 PBLENDW_0x33_MASK:
 2287         .long  0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000  /* keep dwords 0 and 2 */
 2288 PBLENDW_0xCC_MASK:
 2289         .long  0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF  /* keep dwords 1 and 3 */
 2290 PBLENDW_0x3F_MASK:
 2291         .long  0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000  /* keep dwords 0-2 */
 2292 PBLENDW_0xC0_MASK:
 2293         .long  0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF  /* keep dword 3 */
 2294 
 2295 #endif  /* HAVE_SSE2 */
 2296 
 2297 #ifdef __ELF__
      /*
       * ELF targets only: emit an empty .note.GNU-stack section with no
       * flags (so it is not marked executable), the conventional way to tell
       * the linker this object does not need an executable stack.
       */
 2298 .section .note.GNU-stack,"",%progbits
 2299 #endif

Cache object: 2cc3f8888a9a15c6ac16c2573da44315


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.