1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or https://opensource.org/licenses/CDDL-1.0.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Based on BLAKE3 v1.3.1, https://github.com/BLAKE3-team/BLAKE3
24 * Copyright (c) 2019-2020 Samuel Neves and Matthew Krupcale
25 * Copyright (c) 2022 Tino Reichardt <milky-zfs@mcmilk.de>
26 */
27
28 #if defined(HAVE_SSE2)
29
30 #define _ASM
31 #include <sys/asm_linkage.h>
32
33 .intel_syntax noprefix
34
35 SECTION_TEXT
36
37 ENTRY_ALIGN(zfs_blake3_hash_many_sse2, 64)
38 ENDBR
39 push r15
40 push r14
41 push r13
42 push r12
43 push rbx
44 push rbp
45 mov rbp, rsp
46 sub rsp, 360
47 and rsp, 0xFFFFFFFFFFFFFFC0
48 neg r9d
49 movd xmm0, r9d
50 pshufd xmm0, xmm0, 0x00
51 movdqa xmmword ptr [rsp+0x130], xmm0
52 movdqa xmm1, xmm0
53 pand xmm1, xmmword ptr [ADD0+rip]
54 pand xmm0, xmmword ptr [ADD1+rip]
55 movdqa xmmword ptr [rsp+0x150], xmm0
56 movd xmm0, r8d
57 pshufd xmm0, xmm0, 0x00
58 paddd xmm0, xmm1
59 movdqa xmmword ptr [rsp+0x110], xmm0
60 pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip]
61 pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip]
62 pcmpgtd xmm1, xmm0
63 shr r8, 32
64 movd xmm2, r8d
65 pshufd xmm2, xmm2, 0x00
66 psubd xmm2, xmm1
67 movdqa xmmword ptr [rsp+0x120], xmm2
68 mov rbx, qword ptr [rbp+0x50]
69 mov r15, rdx
70 shl r15, 6
71 movzx r13d, byte ptr [rbp+0x38]
72 movzx r12d, byte ptr [rbp+0x48]
73 cmp rsi, 4
74 jc 3f
75 2:
76 movdqu xmm3, xmmword ptr [rcx]
77 pshufd xmm0, xmm3, 0x00
78 pshufd xmm1, xmm3, 0x55
79 pshufd xmm2, xmm3, 0xAA
80 pshufd xmm3, xmm3, 0xFF
81 movdqu xmm7, xmmword ptr [rcx+0x10]
82 pshufd xmm4, xmm7, 0x00
83 pshufd xmm5, xmm7, 0x55
84 pshufd xmm6, xmm7, 0xAA
85 pshufd xmm7, xmm7, 0xFF
86 mov r8, qword ptr [rdi]
87 mov r9, qword ptr [rdi+0x8]
88 mov r10, qword ptr [rdi+0x10]
89 mov r11, qword ptr [rdi+0x18]
90 movzx eax, byte ptr [rbp+0x40]
91 or eax, r13d
92 xor edx, edx
93 9:
94 mov r14d, eax
95 or eax, r12d
96 add rdx, 64
97 cmp rdx, r15
98 cmovne eax, r14d
99 movdqu xmm8, xmmword ptr [r8+rdx-0x40]
100 movdqu xmm9, xmmword ptr [r9+rdx-0x40]
101 movdqu xmm10, xmmword ptr [r10+rdx-0x40]
102 movdqu xmm11, xmmword ptr [r11+rdx-0x40]
103 movdqa xmm12, xmm8
104 punpckldq xmm8, xmm9
105 punpckhdq xmm12, xmm9
106 movdqa xmm14, xmm10
107 punpckldq xmm10, xmm11
108 punpckhdq xmm14, xmm11
109 movdqa xmm9, xmm8
110 punpcklqdq xmm8, xmm10
111 punpckhqdq xmm9, xmm10
112 movdqa xmm13, xmm12
113 punpcklqdq xmm12, xmm14
114 punpckhqdq xmm13, xmm14
115 movdqa xmmword ptr [rsp], xmm8
116 movdqa xmmword ptr [rsp+0x10], xmm9
117 movdqa xmmword ptr [rsp+0x20], xmm12
118 movdqa xmmword ptr [rsp+0x30], xmm13
119 movdqu xmm8, xmmword ptr [r8+rdx-0x30]
120 movdqu xmm9, xmmword ptr [r9+rdx-0x30]
121 movdqu xmm10, xmmword ptr [r10+rdx-0x30]
122 movdqu xmm11, xmmword ptr [r11+rdx-0x30]
123 movdqa xmm12, xmm8
124 punpckldq xmm8, xmm9
125 punpckhdq xmm12, xmm9
126 movdqa xmm14, xmm10
127 punpckldq xmm10, xmm11
128 punpckhdq xmm14, xmm11
129 movdqa xmm9, xmm8
130 punpcklqdq xmm8, xmm10
131 punpckhqdq xmm9, xmm10
132 movdqa xmm13, xmm12
133 punpcklqdq xmm12, xmm14
134 punpckhqdq xmm13, xmm14
135 movdqa xmmword ptr [rsp+0x40], xmm8
136 movdqa xmmword ptr [rsp+0x50], xmm9
137 movdqa xmmword ptr [rsp+0x60], xmm12
138 movdqa xmmword ptr [rsp+0x70], xmm13
139 movdqu xmm8, xmmword ptr [r8+rdx-0x20]
140 movdqu xmm9, xmmword ptr [r9+rdx-0x20]
141 movdqu xmm10, xmmword ptr [r10+rdx-0x20]
142 movdqu xmm11, xmmword ptr [r11+rdx-0x20]
143 movdqa xmm12, xmm8
144 punpckldq xmm8, xmm9
145 punpckhdq xmm12, xmm9
146 movdqa xmm14, xmm10
147 punpckldq xmm10, xmm11
148 punpckhdq xmm14, xmm11
149 movdqa xmm9, xmm8
150 punpcklqdq xmm8, xmm10
151 punpckhqdq xmm9, xmm10
152 movdqa xmm13, xmm12
153 punpcklqdq xmm12, xmm14
154 punpckhqdq xmm13, xmm14
155 movdqa xmmword ptr [rsp+0x80], xmm8
156 movdqa xmmword ptr [rsp+0x90], xmm9
157 movdqa xmmword ptr [rsp+0xA0], xmm12
158 movdqa xmmword ptr [rsp+0xB0], xmm13
159 movdqu xmm8, xmmword ptr [r8+rdx-0x10]
160 movdqu xmm9, xmmword ptr [r9+rdx-0x10]
161 movdqu xmm10, xmmword ptr [r10+rdx-0x10]
162 movdqu xmm11, xmmword ptr [r11+rdx-0x10]
163 movdqa xmm12, xmm8
164 punpckldq xmm8, xmm9
165 punpckhdq xmm12, xmm9
166 movdqa xmm14, xmm10
167 punpckldq xmm10, xmm11
168 punpckhdq xmm14, xmm11
169 movdqa xmm9, xmm8
170 punpcklqdq xmm8, xmm10
171 punpckhqdq xmm9, xmm10
172 movdqa xmm13, xmm12
173 punpcklqdq xmm12, xmm14
174 punpckhqdq xmm13, xmm14
175 movdqa xmmword ptr [rsp+0xC0], xmm8
176 movdqa xmmword ptr [rsp+0xD0], xmm9
177 movdqa xmmword ptr [rsp+0xE0], xmm12
178 movdqa xmmword ptr [rsp+0xF0], xmm13
179 movdqa xmm9, xmmword ptr [BLAKE3_IV_1+rip]
180 movdqa xmm10, xmmword ptr [BLAKE3_IV_2+rip]
181 movdqa xmm11, xmmword ptr [BLAKE3_IV_3+rip]
182 movdqa xmm12, xmmword ptr [rsp+0x110]
183 movdqa xmm13, xmmword ptr [rsp+0x120]
184 movdqa xmm14, xmmword ptr [BLAKE3_BLOCK_LEN+rip]
185 movd xmm15, eax
186 pshufd xmm15, xmm15, 0x00
187 prefetcht0 [r8+rdx+0x80]
188 prefetcht0 [r9+rdx+0x80]
189 prefetcht0 [r10+rdx+0x80]
190 prefetcht0 [r11+rdx+0x80]
191 paddd xmm0, xmmword ptr [rsp]
192 paddd xmm1, xmmword ptr [rsp+0x20]
193 paddd xmm2, xmmword ptr [rsp+0x40]
194 paddd xmm3, xmmword ptr [rsp+0x60]
195 paddd xmm0, xmm4
196 paddd xmm1, xmm5
197 paddd xmm2, xmm6
198 paddd xmm3, xmm7
199 pxor xmm12, xmm0
200 pxor xmm13, xmm1
201 pxor xmm14, xmm2
202 pxor xmm15, xmm3
203 pshuflw xmm12, xmm12, 0xB1
204 pshufhw xmm12, xmm12, 0xB1
205 pshuflw xmm13, xmm13, 0xB1
206 pshufhw xmm13, xmm13, 0xB1
207 pshuflw xmm14, xmm14, 0xB1
208 pshufhw xmm14, xmm14, 0xB1
209 pshuflw xmm15, xmm15, 0xB1
210 pshufhw xmm15, xmm15, 0xB1
211 movdqa xmm8, xmmword ptr [BLAKE3_IV_0+rip]
212 paddd xmm8, xmm12
213 paddd xmm9, xmm13
214 paddd xmm10, xmm14
215 paddd xmm11, xmm15
216 pxor xmm4, xmm8
217 pxor xmm5, xmm9
218 pxor xmm6, xmm10
219 pxor xmm7, xmm11
220 movdqa xmmword ptr [rsp+0x100], xmm8
221 movdqa xmm8, xmm4
222 psrld xmm8, 12
223 pslld xmm4, 20
224 por xmm4, xmm8
225 movdqa xmm8, xmm5
226 psrld xmm8, 12
227 pslld xmm5, 20
228 por xmm5, xmm8
229 movdqa xmm8, xmm6
230 psrld xmm8, 12
231 pslld xmm6, 20
232 por xmm6, xmm8
233 movdqa xmm8, xmm7
234 psrld xmm8, 12
235 pslld xmm7, 20
236 por xmm7, xmm8
237 paddd xmm0, xmmword ptr [rsp+0x10]
238 paddd xmm1, xmmword ptr [rsp+0x30]
239 paddd xmm2, xmmword ptr [rsp+0x50]
240 paddd xmm3, xmmword ptr [rsp+0x70]
241 paddd xmm0, xmm4
242 paddd xmm1, xmm5
243 paddd xmm2, xmm6
244 paddd xmm3, xmm7
245 pxor xmm12, xmm0
246 pxor xmm13, xmm1
247 pxor xmm14, xmm2
248 pxor xmm15, xmm3
249 movdqa xmm8, xmm12
250 psrld xmm12, 8
251 pslld xmm8, 24
252 pxor xmm12, xmm8
253 movdqa xmm8, xmm13
254 psrld xmm13, 8
255 pslld xmm8, 24
256 pxor xmm13, xmm8
257 movdqa xmm8, xmm14
258 psrld xmm14, 8
259 pslld xmm8, 24
260 pxor xmm14, xmm8
261 movdqa xmm8, xmm15
262 psrld xmm15, 8
263 pslld xmm8, 24
264 pxor xmm15, xmm8
265 movdqa xmm8, xmmword ptr [rsp+0x100]
266 paddd xmm8, xmm12
267 paddd xmm9, xmm13
268 paddd xmm10, xmm14
269 paddd xmm11, xmm15
270 pxor xmm4, xmm8
271 pxor xmm5, xmm9
272 pxor xmm6, xmm10
273 pxor xmm7, xmm11
274 movdqa xmmword ptr [rsp+0x100], xmm8
275 movdqa xmm8, xmm4
276 psrld xmm8, 7
277 pslld xmm4, 25
278 por xmm4, xmm8
279 movdqa xmm8, xmm5
280 psrld xmm8, 7
281 pslld xmm5, 25
282 por xmm5, xmm8
283 movdqa xmm8, xmm6
284 psrld xmm8, 7
285 pslld xmm6, 25
286 por xmm6, xmm8
287 movdqa xmm8, xmm7
288 psrld xmm8, 7
289 pslld xmm7, 25
290 por xmm7, xmm8
291 paddd xmm0, xmmword ptr [rsp+0x80]
292 paddd xmm1, xmmword ptr [rsp+0xA0]
293 paddd xmm2, xmmword ptr [rsp+0xC0]
294 paddd xmm3, xmmword ptr [rsp+0xE0]
295 paddd xmm0, xmm5
296 paddd xmm1, xmm6
297 paddd xmm2, xmm7
298 paddd xmm3, xmm4
299 pxor xmm15, xmm0
300 pxor xmm12, xmm1
301 pxor xmm13, xmm2
302 pxor xmm14, xmm3
303 pshuflw xmm15, xmm15, 0xB1
304 pshufhw xmm15, xmm15, 0xB1
305 pshuflw xmm12, xmm12, 0xB1
306 pshufhw xmm12, xmm12, 0xB1
307 pshuflw xmm13, xmm13, 0xB1
308 pshufhw xmm13, xmm13, 0xB1
309 pshuflw xmm14, xmm14, 0xB1
310 pshufhw xmm14, xmm14, 0xB1
311 paddd xmm10, xmm15
312 paddd xmm11, xmm12
313 movdqa xmm8, xmmword ptr [rsp+0x100]
314 paddd xmm8, xmm13
315 paddd xmm9, xmm14
316 pxor xmm5, xmm10
317 pxor xmm6, xmm11
318 pxor xmm7, xmm8
319 pxor xmm4, xmm9
320 movdqa xmmword ptr [rsp+0x100], xmm8
321 movdqa xmm8, xmm5
322 psrld xmm8, 12
323 pslld xmm5, 20
324 por xmm5, xmm8
325 movdqa xmm8, xmm6
326 psrld xmm8, 12
327 pslld xmm6, 20
328 por xmm6, xmm8
329 movdqa xmm8, xmm7
330 psrld xmm8, 12
331 pslld xmm7, 20
332 por xmm7, xmm8
333 movdqa xmm8, xmm4
334 psrld xmm8, 12
335 pslld xmm4, 20
336 por xmm4, xmm8
337 paddd xmm0, xmmword ptr [rsp+0x90]
338 paddd xmm1, xmmword ptr [rsp+0xB0]
339 paddd xmm2, xmmword ptr [rsp+0xD0]
340 paddd xmm3, xmmword ptr [rsp+0xF0]
341 paddd xmm0, xmm5
342 paddd xmm1, xmm6
343 paddd xmm2, xmm7
344 paddd xmm3, xmm4
345 pxor xmm15, xmm0
346 pxor xmm12, xmm1
347 pxor xmm13, xmm2
348 pxor xmm14, xmm3
349 movdqa xmm8, xmm15
350 psrld xmm15, 8
351 pslld xmm8, 24
352 pxor xmm15, xmm8
353 movdqa xmm8, xmm12
354 psrld xmm12, 8
355 pslld xmm8, 24
356 pxor xmm12, xmm8
357 movdqa xmm8, xmm13
358 psrld xmm13, 8
359 pslld xmm8, 24
360 pxor xmm13, xmm8
361 movdqa xmm8, xmm14
362 psrld xmm14, 8
363 pslld xmm8, 24
364 pxor xmm14, xmm8
365 paddd xmm10, xmm15
366 paddd xmm11, xmm12
367 movdqa xmm8, xmmword ptr [rsp+0x100]
368 paddd xmm8, xmm13
369 paddd xmm9, xmm14
370 pxor xmm5, xmm10
371 pxor xmm6, xmm11
372 pxor xmm7, xmm8
373 pxor xmm4, xmm9
374 movdqa xmmword ptr [rsp+0x100], xmm8
375 movdqa xmm8, xmm5
376 psrld xmm8, 7
377 pslld xmm5, 25
378 por xmm5, xmm8
379 movdqa xmm8, xmm6
380 psrld xmm8, 7
381 pslld xmm6, 25
382 por xmm6, xmm8
383 movdqa xmm8, xmm7
384 psrld xmm8, 7
385 pslld xmm7, 25
386 por xmm7, xmm8
387 movdqa xmm8, xmm4
388 psrld xmm8, 7
389 pslld xmm4, 25
390 por xmm4, xmm8
391 paddd xmm0, xmmword ptr [rsp+0x20]
392 paddd xmm1, xmmword ptr [rsp+0x30]
393 paddd xmm2, xmmword ptr [rsp+0x70]
394 paddd xmm3, xmmword ptr [rsp+0x40]
395 paddd xmm0, xmm4
396 paddd xmm1, xmm5
397 paddd xmm2, xmm6
398 paddd xmm3, xmm7
399 pxor xmm12, xmm0
400 pxor xmm13, xmm1
401 pxor xmm14, xmm2
402 pxor xmm15, xmm3
403 pshuflw xmm12, xmm12, 0xB1
404 pshufhw xmm12, xmm12, 0xB1
405 pshuflw xmm13, xmm13, 0xB1
406 pshufhw xmm13, xmm13, 0xB1
407 pshuflw xmm14, xmm14, 0xB1
408 pshufhw xmm14, xmm14, 0xB1
409 pshuflw xmm15, xmm15, 0xB1
410 pshufhw xmm15, xmm15, 0xB1
411 movdqa xmm8, xmmword ptr [rsp+0x100]
412 paddd xmm8, xmm12
413 paddd xmm9, xmm13
414 paddd xmm10, xmm14
415 paddd xmm11, xmm15
416 pxor xmm4, xmm8
417 pxor xmm5, xmm9
418 pxor xmm6, xmm10
419 pxor xmm7, xmm11
420 movdqa xmmword ptr [rsp+0x100], xmm8
421 movdqa xmm8, xmm4
422 psrld xmm8, 12
423 pslld xmm4, 20
424 por xmm4, xmm8
425 movdqa xmm8, xmm5
426 psrld xmm8, 12
427 pslld xmm5, 20
428 por xmm5, xmm8
429 movdqa xmm8, xmm6
430 psrld xmm8, 12
431 pslld xmm6, 20
432 por xmm6, xmm8
433 movdqa xmm8, xmm7
434 psrld xmm8, 12
435 pslld xmm7, 20
436 por xmm7, xmm8
437 paddd xmm0, xmmword ptr [rsp+0x60]
438 paddd xmm1, xmmword ptr [rsp+0xA0]
439 paddd xmm2, xmmword ptr [rsp]
440 paddd xmm3, xmmword ptr [rsp+0xD0]
441 paddd xmm0, xmm4
442 paddd xmm1, xmm5
443 paddd xmm2, xmm6
444 paddd xmm3, xmm7
445 pxor xmm12, xmm0
446 pxor xmm13, xmm1
447 pxor xmm14, xmm2
448 pxor xmm15, xmm3
449 movdqa xmm8, xmm12
450 psrld xmm12, 8
451 pslld xmm8, 24
452 pxor xmm12, xmm8
453 movdqa xmm8, xmm13
454 psrld xmm13, 8
455 pslld xmm8, 24
456 pxor xmm13, xmm8
457 movdqa xmm8, xmm14
458 psrld xmm14, 8
459 pslld xmm8, 24
460 pxor xmm14, xmm8
461 movdqa xmm8, xmm15
462 psrld xmm15, 8
463 pslld xmm8, 24
464 pxor xmm15, xmm8
465 movdqa xmm8, xmmword ptr [rsp+0x100]
466 paddd xmm8, xmm12
467 paddd xmm9, xmm13
468 paddd xmm10, xmm14
469 paddd xmm11, xmm15
470 pxor xmm4, xmm8
471 pxor xmm5, xmm9
472 pxor xmm6, xmm10
473 pxor xmm7, xmm11
474 movdqa xmmword ptr [rsp+0x100], xmm8
475 movdqa xmm8, xmm4
476 psrld xmm8, 7
477 pslld xmm4, 25
478 por xmm4, xmm8
479 movdqa xmm8, xmm5
480 psrld xmm8, 7
481 pslld xmm5, 25
482 por xmm5, xmm8
483 movdqa xmm8, xmm6
484 psrld xmm8, 7
485 pslld xmm6, 25
486 por xmm6, xmm8
487 movdqa xmm8, xmm7
488 psrld xmm8, 7
489 pslld xmm7, 25
490 por xmm7, xmm8
491 paddd xmm0, xmmword ptr [rsp+0x10]
492 paddd xmm1, xmmword ptr [rsp+0xC0]
493 paddd xmm2, xmmword ptr [rsp+0x90]
494 paddd xmm3, xmmword ptr [rsp+0xF0]
495 paddd xmm0, xmm5
496 paddd xmm1, xmm6
497 paddd xmm2, xmm7
498 paddd xmm3, xmm4
499 pxor xmm15, xmm0
500 pxor xmm12, xmm1
501 pxor xmm13, xmm2
502 pxor xmm14, xmm3
503 pshuflw xmm15, xmm15, 0xB1
504 pshufhw xmm15, xmm15, 0xB1
505 pshuflw xmm12, xmm12, 0xB1
506 pshufhw xmm12, xmm12, 0xB1
507 pshuflw xmm13, xmm13, 0xB1
508 pshufhw xmm13, xmm13, 0xB1
509 pshuflw xmm14, xmm14, 0xB1
510 pshufhw xmm14, xmm14, 0xB1
511 paddd xmm10, xmm15
512 paddd xmm11, xmm12
513 movdqa xmm8, xmmword ptr [rsp+0x100]
514 paddd xmm8, xmm13
515 paddd xmm9, xmm14
516 pxor xmm5, xmm10
517 pxor xmm6, xmm11
518 pxor xmm7, xmm8
519 pxor xmm4, xmm9
520 movdqa xmmword ptr [rsp+0x100], xmm8
521 movdqa xmm8, xmm5
522 psrld xmm8, 12
523 pslld xmm5, 20
524 por xmm5, xmm8
525 movdqa xmm8, xmm6
526 psrld xmm8, 12
527 pslld xmm6, 20
528 por xmm6, xmm8
529 movdqa xmm8, xmm7
530 psrld xmm8, 12
531 pslld xmm7, 20
532 por xmm7, xmm8
533 movdqa xmm8, xmm4
534 psrld xmm8, 12
535 pslld xmm4, 20
536 por xmm4, xmm8
537 paddd xmm0, xmmword ptr [rsp+0xB0]
538 paddd xmm1, xmmword ptr [rsp+0x50]
539 paddd xmm2, xmmword ptr [rsp+0xE0]
540 paddd xmm3, xmmword ptr [rsp+0x80]
541 paddd xmm0, xmm5
542 paddd xmm1, xmm6
543 paddd xmm2, xmm7
544 paddd xmm3, xmm4
545 pxor xmm15, xmm0
546 pxor xmm12, xmm1
547 pxor xmm13, xmm2
548 pxor xmm14, xmm3
549 movdqa xmm8, xmm15
550 psrld xmm15, 8
551 pslld xmm8, 24
552 pxor xmm15, xmm8
553 movdqa xmm8, xmm12
554 psrld xmm12, 8
555 pslld xmm8, 24
556 pxor xmm12, xmm8
557 movdqa xmm8, xmm13
558 psrld xmm13, 8
559 pslld xmm8, 24
560 pxor xmm13, xmm8
561 movdqa xmm8, xmm14
562 psrld xmm14, 8
563 pslld xmm8, 24
564 pxor xmm14, xmm8
565 paddd xmm10, xmm15
566 paddd xmm11, xmm12
567 movdqa xmm8, xmmword ptr [rsp+0x100]
568 paddd xmm8, xmm13
569 paddd xmm9, xmm14
570 pxor xmm5, xmm10
571 pxor xmm6, xmm11
572 pxor xmm7, xmm8
573 pxor xmm4, xmm9
574 movdqa xmmword ptr [rsp+0x100], xmm8
575 movdqa xmm8, xmm5
576 psrld xmm8, 7
577 pslld xmm5, 25
578 por xmm5, xmm8
579 movdqa xmm8, xmm6
580 psrld xmm8, 7
581 pslld xmm6, 25
582 por xmm6, xmm8
583 movdqa xmm8, xmm7
584 psrld xmm8, 7
585 pslld xmm7, 25
586 por xmm7, xmm8
587 movdqa xmm8, xmm4
588 psrld xmm8, 7
589 pslld xmm4, 25
590 por xmm4, xmm8
591 paddd xmm0, xmmword ptr [rsp+0x30]
592 paddd xmm1, xmmword ptr [rsp+0xA0]
593 paddd xmm2, xmmword ptr [rsp+0xD0]
594 paddd xmm3, xmmword ptr [rsp+0x70]
595 paddd xmm0, xmm4
596 paddd xmm1, xmm5
597 paddd xmm2, xmm6
598 paddd xmm3, xmm7
599 pxor xmm12, xmm0
600 pxor xmm13, xmm1
601 pxor xmm14, xmm2
602 pxor xmm15, xmm3
603 pshuflw xmm12, xmm12, 0xB1
604 pshufhw xmm12, xmm12, 0xB1
605 pshuflw xmm13, xmm13, 0xB1
606 pshufhw xmm13, xmm13, 0xB1
607 pshuflw xmm14, xmm14, 0xB1
608 pshufhw xmm14, xmm14, 0xB1
609 pshuflw xmm15, xmm15, 0xB1
610 pshufhw xmm15, xmm15, 0xB1
611 movdqa xmm8, xmmword ptr [rsp+0x100]
612 paddd xmm8, xmm12
613 paddd xmm9, xmm13
614 paddd xmm10, xmm14
615 paddd xmm11, xmm15
616 pxor xmm4, xmm8
617 pxor xmm5, xmm9
618 pxor xmm6, xmm10
619 pxor xmm7, xmm11
620 movdqa xmmword ptr [rsp+0x100], xmm8
621 movdqa xmm8, xmm4
622 psrld xmm8, 12
623 pslld xmm4, 20
624 por xmm4, xmm8
625 movdqa xmm8, xmm5
626 psrld xmm8, 12
627 pslld xmm5, 20
628 por xmm5, xmm8
629 movdqa xmm8, xmm6
630 psrld xmm8, 12
631 pslld xmm6, 20
632 por xmm6, xmm8
633 movdqa xmm8, xmm7
634 psrld xmm8, 12
635 pslld xmm7, 20
636 por xmm7, xmm8
637 paddd xmm0, xmmword ptr [rsp+0x40]
638 paddd xmm1, xmmword ptr [rsp+0xC0]
639 paddd xmm2, xmmword ptr [rsp+0x20]
640 paddd xmm3, xmmword ptr [rsp+0xE0]
641 paddd xmm0, xmm4
642 paddd xmm1, xmm5
643 paddd xmm2, xmm6
644 paddd xmm3, xmm7
645 pxor xmm12, xmm0
646 pxor xmm13, xmm1
647 pxor xmm14, xmm2
648 pxor xmm15, xmm3
649 movdqa xmm8, xmm12
650 psrld xmm12, 8
651 pslld xmm8, 24
652 pxor xmm12, xmm8
653 movdqa xmm8, xmm13
654 psrld xmm13, 8
655 pslld xmm8, 24
656 pxor xmm13, xmm8
657 movdqa xmm8, xmm14
658 psrld xmm14, 8
659 pslld xmm8, 24
660 pxor xmm14, xmm8
661 movdqa xmm8, xmm15
662 psrld xmm15, 8
663 pslld xmm8, 24
664 pxor xmm15, xmm8
665 movdqa xmm8, xmmword ptr [rsp+0x100]
666 paddd xmm8, xmm12
667 paddd xmm9, xmm13
668 paddd xmm10, xmm14
669 paddd xmm11, xmm15
670 pxor xmm4, xmm8
671 pxor xmm5, xmm9
672 pxor xmm6, xmm10
673 pxor xmm7, xmm11
674 movdqa xmmword ptr [rsp+0x100], xmm8
675 movdqa xmm8, xmm4
676 psrld xmm8, 7
677 pslld xmm4, 25
678 por xmm4, xmm8
679 movdqa xmm8, xmm5
680 psrld xmm8, 7
681 pslld xmm5, 25
682 por xmm5, xmm8
683 movdqa xmm8, xmm6
684 psrld xmm8, 7
685 pslld xmm6, 25
686 por xmm6, xmm8
687 movdqa xmm8, xmm7
688 psrld xmm8, 7
689 pslld xmm7, 25
690 por xmm7, xmm8
691 paddd xmm0, xmmword ptr [rsp+0x60]
692 paddd xmm1, xmmword ptr [rsp+0x90]
693 paddd xmm2, xmmword ptr [rsp+0xB0]
694 paddd xmm3, xmmword ptr [rsp+0x80]
695 paddd xmm0, xmm5
696 paddd xmm1, xmm6
697 paddd xmm2, xmm7
698 paddd xmm3, xmm4
699 pxor xmm15, xmm0
700 pxor xmm12, xmm1
701 pxor xmm13, xmm2
702 pxor xmm14, xmm3
703 pshuflw xmm15, xmm15, 0xB1
704 pshufhw xmm15, xmm15, 0xB1
705 pshuflw xmm12, xmm12, 0xB1
706 pshufhw xmm12, xmm12, 0xB1
707 pshuflw xmm13, xmm13, 0xB1
708 pshufhw xmm13, xmm13, 0xB1
709 pshuflw xmm14, xmm14, 0xB1
710 pshufhw xmm14, xmm14, 0xB1
711 paddd xmm10, xmm15
712 paddd xmm11, xmm12
713 movdqa xmm8, xmmword ptr [rsp+0x100]
714 paddd xmm8, xmm13
715 paddd xmm9, xmm14
716 pxor xmm5, xmm10
717 pxor xmm6, xmm11
718 pxor xmm7, xmm8
719 pxor xmm4, xmm9
720 movdqa xmmword ptr [rsp+0x100], xmm8
721 movdqa xmm8, xmm5
722 psrld xmm8, 12
723 pslld xmm5, 20
724 por xmm5, xmm8
725 movdqa xmm8, xmm6
726 psrld xmm8, 12
727 pslld xmm6, 20
728 por xmm6, xmm8
729 movdqa xmm8, xmm7
730 psrld xmm8, 12
731 pslld xmm7, 20
732 por xmm7, xmm8
733 movdqa xmm8, xmm4
734 psrld xmm8, 12
735 pslld xmm4, 20
736 por xmm4, xmm8
737 paddd xmm0, xmmword ptr [rsp+0x50]
738 paddd xmm1, xmmword ptr [rsp]
739 paddd xmm2, xmmword ptr [rsp+0xF0]
740 paddd xmm3, xmmword ptr [rsp+0x10]
741 paddd xmm0, xmm5
742 paddd xmm1, xmm6
743 paddd xmm2, xmm7
744 paddd xmm3, xmm4
745 pxor xmm15, xmm0
746 pxor xmm12, xmm1
747 pxor xmm13, xmm2
748 pxor xmm14, xmm3
749 movdqa xmm8, xmm15
750 psrld xmm15, 8
751 pslld xmm8, 24
752 pxor xmm15, xmm8
753 movdqa xmm8, xmm12
754 psrld xmm12, 8
755 pslld xmm8, 24
756 pxor xmm12, xmm8
757 movdqa xmm8, xmm13
758 psrld xmm13, 8
759 pslld xmm8, 24
760 pxor xmm13, xmm8
761 movdqa xmm8, xmm14
762 psrld xmm14, 8
763 pslld xmm8, 24
764 pxor xmm14, xmm8
765 paddd xmm10, xmm15
766 paddd xmm11, xmm12
767 movdqa xmm8, xmmword ptr [rsp+0x100]
768 paddd xmm8, xmm13
769 paddd xmm9, xmm14
770 pxor xmm5, xmm10
771 pxor xmm6, xmm11
772 pxor xmm7, xmm8
773 pxor xmm4, xmm9
774 movdqa xmmword ptr [rsp+0x100], xmm8
775 movdqa xmm8, xmm5
776 psrld xmm8, 7
777 pslld xmm5, 25
778 por xmm5, xmm8
779 movdqa xmm8, xmm6
780 psrld xmm8, 7
781 pslld xmm6, 25
782 por xmm6, xmm8
783 movdqa xmm8, xmm7
784 psrld xmm8, 7
785 pslld xmm7, 25
786 por xmm7, xmm8
787 movdqa xmm8, xmm4
788 psrld xmm8, 7
789 pslld xmm4, 25
790 por xmm4, xmm8
791 paddd xmm0, xmmword ptr [rsp+0xA0]
792 paddd xmm1, xmmword ptr [rsp+0xC0]
793 paddd xmm2, xmmword ptr [rsp+0xE0]
794 paddd xmm3, xmmword ptr [rsp+0xD0]
795 paddd xmm0, xmm4
796 paddd xmm1, xmm5
797 paddd xmm2, xmm6
798 paddd xmm3, xmm7
799 pxor xmm12, xmm0
800 pxor xmm13, xmm1
801 pxor xmm14, xmm2
802 pxor xmm15, xmm3
803 pshuflw xmm12, xmm12, 0xB1
804 pshufhw xmm12, xmm12, 0xB1
805 pshuflw xmm13, xmm13, 0xB1
806 pshufhw xmm13, xmm13, 0xB1
807 pshuflw xmm14, xmm14, 0xB1
808 pshufhw xmm14, xmm14, 0xB1
809 pshuflw xmm15, xmm15, 0xB1
810 pshufhw xmm15, xmm15, 0xB1
811 movdqa xmm8, xmmword ptr [rsp+0x100]
812 paddd xmm8, xmm12
813 paddd xmm9, xmm13
814 paddd xmm10, xmm14
815 paddd xmm11, xmm15
816 pxor xmm4, xmm8
817 pxor xmm5, xmm9
818 pxor xmm6, xmm10
819 pxor xmm7, xmm11
820 movdqa xmmword ptr [rsp+0x100], xmm8
821 movdqa xmm8, xmm4
822 psrld xmm8, 12
823 pslld xmm4, 20
824 por xmm4, xmm8
825 movdqa xmm8, xmm5
826 psrld xmm8, 12
827 pslld xmm5, 20
828 por xmm5, xmm8
829 movdqa xmm8, xmm6
830 psrld xmm8, 12
831 pslld xmm6, 20
832 por xmm6, xmm8
833 movdqa xmm8, xmm7
834 psrld xmm8, 12
835 pslld xmm7, 20
836 por xmm7, xmm8
837 paddd xmm0, xmmword ptr [rsp+0x70]
838 paddd xmm1, xmmword ptr [rsp+0x90]
839 paddd xmm2, xmmword ptr [rsp+0x30]
840 paddd xmm3, xmmword ptr [rsp+0xF0]
841 paddd xmm0, xmm4
842 paddd xmm1, xmm5
843 paddd xmm2, xmm6
844 paddd xmm3, xmm7
845 pxor xmm12, xmm0
846 pxor xmm13, xmm1
847 pxor xmm14, xmm2
848 pxor xmm15, xmm3
849 movdqa xmm8, xmm12
850 psrld xmm12, 8
851 pslld xmm8, 24
852 pxor xmm12, xmm8
853 movdqa xmm8, xmm13
854 psrld xmm13, 8
855 pslld xmm8, 24
856 pxor xmm13, xmm8
857 movdqa xmm8, xmm14
858 psrld xmm14, 8
859 pslld xmm8, 24
860 pxor xmm14, xmm8
861 movdqa xmm8, xmm15
862 psrld xmm15, 8
863 pslld xmm8, 24
864 pxor xmm15, xmm8
865 movdqa xmm8, xmmword ptr [rsp+0x100]
866 paddd xmm8, xmm12
867 paddd xmm9, xmm13
868 paddd xmm10, xmm14
869 paddd xmm11, xmm15
870 pxor xmm4, xmm8
871 pxor xmm5, xmm9
872 pxor xmm6, xmm10
873 pxor xmm7, xmm11
874 movdqa xmmword ptr [rsp+0x100], xmm8
875 movdqa xmm8, xmm4
876 psrld xmm8, 7
877 pslld xmm4, 25
878 por xmm4, xmm8
879 movdqa xmm8, xmm5
880 psrld xmm8, 7
881 pslld xmm5, 25
882 por xmm5, xmm8
883 movdqa xmm8, xmm6
884 psrld xmm8, 7
885 pslld xmm6, 25
886 por xmm6, xmm8
887 movdqa xmm8, xmm7
888 psrld xmm8, 7
889 pslld xmm7, 25
890 por xmm7, xmm8
891 paddd xmm0, xmmword ptr [rsp+0x40]
892 paddd xmm1, xmmword ptr [rsp+0xB0]
893 paddd xmm2, xmmword ptr [rsp+0x50]
894 paddd xmm3, xmmword ptr [rsp+0x10]
895 paddd xmm0, xmm5
896 paddd xmm1, xmm6
897 paddd xmm2, xmm7
898 paddd xmm3, xmm4
899 pxor xmm15, xmm0
900 pxor xmm12, xmm1
901 pxor xmm13, xmm2
902 pxor xmm14, xmm3
903 pshuflw xmm15, xmm15, 0xB1
904 pshufhw xmm15, xmm15, 0xB1
905 pshuflw xmm12, xmm12, 0xB1
906 pshufhw xmm12, xmm12, 0xB1
907 pshuflw xmm13, xmm13, 0xB1
908 pshufhw xmm13, xmm13, 0xB1
909 pshuflw xmm14, xmm14, 0xB1
910 pshufhw xmm14, xmm14, 0xB1
911 paddd xmm10, xmm15
912 paddd xmm11, xmm12
913 movdqa xmm8, xmmword ptr [rsp+0x100]
914 paddd xmm8, xmm13
915 paddd xmm9, xmm14
916 pxor xmm5, xmm10
917 pxor xmm6, xmm11
918 pxor xmm7, xmm8
919 pxor xmm4, xmm9
920 movdqa xmmword ptr [rsp+0x100], xmm8
921 movdqa xmm8, xmm5
922 psrld xmm8, 12
923 pslld xmm5, 20
924 por xmm5, xmm8
925 movdqa xmm8, xmm6
926 psrld xmm8, 12
927 pslld xmm6, 20
928 por xmm6, xmm8
929 movdqa xmm8, xmm7
930 psrld xmm8, 12
931 pslld xmm7, 20
932 por xmm7, xmm8
933 movdqa xmm8, xmm4
934 psrld xmm8, 12
935 pslld xmm4, 20
936 por xmm4, xmm8
937 paddd xmm0, xmmword ptr [rsp]
938 paddd xmm1, xmmword ptr [rsp+0x20]
939 paddd xmm2, xmmword ptr [rsp+0x80]
940 paddd xmm3, xmmword ptr [rsp+0x60]
941 paddd xmm0, xmm5
942 paddd xmm1, xmm6
943 paddd xmm2, xmm7
944 paddd xmm3, xmm4
945 pxor xmm15, xmm0
946 pxor xmm12, xmm1
947 pxor xmm13, xmm2
948 pxor xmm14, xmm3
949 movdqa xmm8, xmm15
950 psrld xmm15, 8
951 pslld xmm8, 24
952 pxor xmm15, xmm8
953 movdqa xmm8, xmm12
954 psrld xmm12, 8
955 pslld xmm8, 24
956 pxor xmm12, xmm8
957 movdqa xmm8, xmm13
958 psrld xmm13, 8
959 pslld xmm8, 24
960 pxor xmm13, xmm8
961 movdqa xmm8, xmm14
962 psrld xmm14, 8
963 pslld xmm8, 24
964 pxor xmm14, xmm8
965 paddd xmm10, xmm15
966 paddd xmm11, xmm12
967 movdqa xmm8, xmmword ptr [rsp+0x100]
968 paddd xmm8, xmm13
969 paddd xmm9, xmm14
970 pxor xmm5, xmm10
971 pxor xmm6, xmm11
972 pxor xmm7, xmm8
973 pxor xmm4, xmm9
974 movdqa xmmword ptr [rsp+0x100], xmm8
975 movdqa xmm8, xmm5
976 psrld xmm8, 7
977 pslld xmm5, 25
978 por xmm5, xmm8
979 movdqa xmm8, xmm6
980 psrld xmm8, 7
981 pslld xmm6, 25
982 por xmm6, xmm8
983 movdqa xmm8, xmm7
984 psrld xmm8, 7
985 pslld xmm7, 25
986 por xmm7, xmm8
987 movdqa xmm8, xmm4
988 psrld xmm8, 7
989 pslld xmm4, 25
990 por xmm4, xmm8
991 paddd xmm0, xmmword ptr [rsp+0xC0]
992 paddd xmm1, xmmword ptr [rsp+0x90]
993 paddd xmm2, xmmword ptr [rsp+0xF0]
994 paddd xmm3, xmmword ptr [rsp+0xE0]
995 paddd xmm0, xmm4
996 paddd xmm1, xmm5
997 paddd xmm2, xmm6
998 paddd xmm3, xmm7
999 pxor xmm12, xmm0
1000 pxor xmm13, xmm1
1001 pxor xmm14, xmm2
1002 pxor xmm15, xmm3
1003 pshuflw xmm12, xmm12, 0xB1
1004 pshufhw xmm12, xmm12, 0xB1
1005 pshuflw xmm13, xmm13, 0xB1
1006 pshufhw xmm13, xmm13, 0xB1
1007 pshuflw xmm14, xmm14, 0xB1
1008 pshufhw xmm14, xmm14, 0xB1
1009 pshuflw xmm15, xmm15, 0xB1
1010 pshufhw xmm15, xmm15, 0xB1
1011 movdqa xmm8, xmmword ptr [rsp+0x100]
1012 paddd xmm8, xmm12
1013 paddd xmm9, xmm13
1014 paddd xmm10, xmm14
1015 paddd xmm11, xmm15
1016 pxor xmm4, xmm8
1017 pxor xmm5, xmm9
1018 pxor xmm6, xmm10
1019 pxor xmm7, xmm11
1020 movdqa xmmword ptr [rsp+0x100], xmm8
1021 movdqa xmm8, xmm4
1022 psrld xmm8, 12
1023 pslld xmm4, 20
1024 por xmm4, xmm8
1025 movdqa xmm8, xmm5
1026 psrld xmm8, 12
1027 pslld xmm5, 20
1028 por xmm5, xmm8
1029 movdqa xmm8, xmm6
1030 psrld xmm8, 12
1031 pslld xmm6, 20
1032 por xmm6, xmm8
1033 movdqa xmm8, xmm7
1034 psrld xmm8, 12
1035 pslld xmm7, 20
1036 por xmm7, xmm8
1037 paddd xmm0, xmmword ptr [rsp+0xD0]
1038 paddd xmm1, xmmword ptr [rsp+0xB0]
1039 paddd xmm2, xmmword ptr [rsp+0xA0]
1040 paddd xmm3, xmmword ptr [rsp+0x80]
1041 paddd xmm0, xmm4
1042 paddd xmm1, xmm5
1043 paddd xmm2, xmm6
1044 paddd xmm3, xmm7
1045 pxor xmm12, xmm0
1046 pxor xmm13, xmm1
1047 pxor xmm14, xmm2
1048 pxor xmm15, xmm3
1049 movdqa xmm8, xmm12
1050 psrld xmm12, 8
1051 pslld xmm8, 24
1052 pxor xmm12, xmm8
1053 movdqa xmm8, xmm13
1054 psrld xmm13, 8
1055 pslld xmm8, 24
1056 pxor xmm13, xmm8
1057 movdqa xmm8, xmm14
1058 psrld xmm14, 8
1059 pslld xmm8, 24
1060 pxor xmm14, xmm8
1061 movdqa xmm8, xmm15
1062 psrld xmm15, 8
1063 pslld xmm8, 24
1064 pxor xmm15, xmm8
1065 movdqa xmm8, xmmword ptr [rsp+0x100]
1066 paddd xmm8, xmm12
1067 paddd xmm9, xmm13
1068 paddd xmm10, xmm14
1069 paddd xmm11, xmm15
1070 pxor xmm4, xmm8
1071 pxor xmm5, xmm9
1072 pxor xmm6, xmm10
1073 pxor xmm7, xmm11
1074 movdqa xmmword ptr [rsp+0x100], xmm8
1075 movdqa xmm8, xmm4
1076 psrld xmm8, 7
1077 pslld xmm4, 25
1078 por xmm4, xmm8
1079 movdqa xmm8, xmm5
1080 psrld xmm8, 7
1081 pslld xmm5, 25
1082 por xmm5, xmm8
1083 movdqa xmm8, xmm6
1084 psrld xmm8, 7
1085 pslld xmm6, 25
1086 por xmm6, xmm8
1087 movdqa xmm8, xmm7
1088 psrld xmm8, 7
1089 pslld xmm7, 25
1090 por xmm7, xmm8
1091 paddd xmm0, xmmword ptr [rsp+0x70]
1092 paddd xmm1, xmmword ptr [rsp+0x50]
1093 paddd xmm2, xmmword ptr [rsp]
1094 paddd xmm3, xmmword ptr [rsp+0x60]
1095 paddd xmm0, xmm5
1096 paddd xmm1, xmm6
1097 paddd xmm2, xmm7
1098 paddd xmm3, xmm4
1099 pxor xmm15, xmm0
1100 pxor xmm12, xmm1
1101 pxor xmm13, xmm2
1102 pxor xmm14, xmm3
1103 pshuflw xmm15, xmm15, 0xB1
1104 pshufhw xmm15, xmm15, 0xB1
1105 pshuflw xmm12, xmm12, 0xB1
1106 pshufhw xmm12, xmm12, 0xB1
1107 pshuflw xmm13, xmm13, 0xB1
1108 pshufhw xmm13, xmm13, 0xB1
1109 pshuflw xmm14, xmm14, 0xB1
1110 pshufhw xmm14, xmm14, 0xB1
1111 paddd xmm10, xmm15
1112 paddd xmm11, xmm12
1113 movdqa xmm8, xmmword ptr [rsp+0x100]
1114 paddd xmm8, xmm13
1115 paddd xmm9, xmm14
1116 pxor xmm5, xmm10
1117 pxor xmm6, xmm11
1118 pxor xmm7, xmm8
1119 pxor xmm4, xmm9
1120 movdqa xmmword ptr [rsp+0x100], xmm8
1121 movdqa xmm8, xmm5
1122 psrld xmm8, 12
1123 pslld xmm5, 20
1124 por xmm5, xmm8
1125 movdqa xmm8, xmm6
1126 psrld xmm8, 12
1127 pslld xmm6, 20
1128 por xmm6, xmm8
1129 movdqa xmm8, xmm7
1130 psrld xmm8, 12
1131 pslld xmm7, 20
1132 por xmm7, xmm8
1133 movdqa xmm8, xmm4
1134 psrld xmm8, 12
1135 pslld xmm4, 20
1136 por xmm4, xmm8
1137 paddd xmm0, xmmword ptr [rsp+0x20]
1138 paddd xmm1, xmmword ptr [rsp+0x30]
1139 paddd xmm2, xmmword ptr [rsp+0x10]
1140 paddd xmm3, xmmword ptr [rsp+0x40]
1141 paddd xmm0, xmm5
1142 paddd xmm1, xmm6
1143 paddd xmm2, xmm7
1144 paddd xmm3, xmm4
1145 pxor xmm15, xmm0
1146 pxor xmm12, xmm1
1147 pxor xmm13, xmm2
1148 pxor xmm14, xmm3
1149 movdqa xmm8, xmm15
1150 psrld xmm15, 8
1151 pslld xmm8, 24
1152 pxor xmm15, xmm8
1153 movdqa xmm8, xmm12
1154 psrld xmm12, 8
1155 pslld xmm8, 24
1156 pxor xmm12, xmm8
1157 movdqa xmm8, xmm13
1158 psrld xmm13, 8
1159 pslld xmm8, 24
1160 pxor xmm13, xmm8
1161 movdqa xmm8, xmm14
1162 psrld xmm14, 8
1163 pslld xmm8, 24
1164 pxor xmm14, xmm8
1165 paddd xmm10, xmm15
1166 paddd xmm11, xmm12
1167 movdqa xmm8, xmmword ptr [rsp+0x100]
1168 paddd xmm8, xmm13
1169 paddd xmm9, xmm14
1170 pxor xmm5, xmm10
1171 pxor xmm6, xmm11
1172 pxor xmm7, xmm8
1173 pxor xmm4, xmm9
1174 movdqa xmmword ptr [rsp+0x100], xmm8
1175 movdqa xmm8, xmm5
1176 psrld xmm8, 7
1177 pslld xmm5, 25
1178 por xmm5, xmm8
1179 movdqa xmm8, xmm6
1180 psrld xmm8, 7
1181 pslld xmm6, 25
1182 por xmm6, xmm8
1183 movdqa xmm8, xmm7
1184 psrld xmm8, 7
1185 pslld xmm7, 25
1186 por xmm7, xmm8
1187 movdqa xmm8, xmm4
1188 psrld xmm8, 7
1189 pslld xmm4, 25
1190 por xmm4, xmm8
1191 paddd xmm0, xmmword ptr [rsp+0x90]
1192 paddd xmm1, xmmword ptr [rsp+0xB0]
1193 paddd xmm2, xmmword ptr [rsp+0x80]
1194 paddd xmm3, xmmword ptr [rsp+0xF0]
1195 paddd xmm0, xmm4
1196 paddd xmm1, xmm5
1197 paddd xmm2, xmm6
1198 paddd xmm3, xmm7
1199 pxor xmm12, xmm0
1200 pxor xmm13, xmm1
1201 pxor xmm14, xmm2
1202 pxor xmm15, xmm3
1203 pshuflw xmm12, xmm12, 0xB1
1204 pshufhw xmm12, xmm12, 0xB1
1205 pshuflw xmm13, xmm13, 0xB1
1206 pshufhw xmm13, xmm13, 0xB1
1207 pshuflw xmm14, xmm14, 0xB1
1208 pshufhw xmm14, xmm14, 0xB1
1209 pshuflw xmm15, xmm15, 0xB1
1210 pshufhw xmm15, xmm15, 0xB1
1211 movdqa xmm8, xmmword ptr [rsp+0x100]
1212 paddd xmm8, xmm12
1213 paddd xmm9, xmm13
1214 paddd xmm10, xmm14
1215 paddd xmm11, xmm15
1216 pxor xmm4, xmm8
1217 pxor xmm5, xmm9
1218 pxor xmm6, xmm10
1219 pxor xmm7, xmm11
1220 movdqa xmmword ptr [rsp+0x100], xmm8
1221 movdqa xmm8, xmm4
1222 psrld xmm8, 12
1223 pslld xmm4, 20
1224 por xmm4, xmm8
1225 movdqa xmm8, xmm5
1226 psrld xmm8, 12
1227 pslld xmm5, 20
1228 por xmm5, xmm8
1229 movdqa xmm8, xmm6
1230 psrld xmm8, 12
1231 pslld xmm6, 20
1232 por xmm6, xmm8
1233 movdqa xmm8, xmm7
1234 psrld xmm8, 12
1235 pslld xmm7, 20
1236 por xmm7, xmm8
1237 paddd xmm0, xmmword ptr [rsp+0xE0]
1238 paddd xmm1, xmmword ptr [rsp+0x50]
1239 paddd xmm2, xmmword ptr [rsp+0xC0]
1240 paddd xmm3, xmmword ptr [rsp+0x10]
1241 paddd xmm0, xmm4
1242 paddd xmm1, xmm5
1243 paddd xmm2, xmm6
1244 paddd xmm3, xmm7
1245 pxor xmm12, xmm0
1246 pxor xmm13, xmm1
1247 pxor xmm14, xmm2
1248 pxor xmm15, xmm3
1249 movdqa xmm8, xmm12
1250 psrld xmm12, 8
1251 pslld xmm8, 24
1252 pxor xmm12, xmm8
1253 movdqa xmm8, xmm13
1254 psrld xmm13, 8
1255 pslld xmm8, 24
1256 pxor xmm13, xmm8
1257 movdqa xmm8, xmm14
1258 psrld xmm14, 8
1259 pslld xmm8, 24
1260 pxor xmm14, xmm8
1261 movdqa xmm8, xmm15
1262 psrld xmm15, 8
1263 pslld xmm8, 24
1264 pxor xmm15, xmm8
1265 movdqa xmm8, xmmword ptr [rsp+0x100]
1266 paddd xmm8, xmm12
1267 paddd xmm9, xmm13
1268 paddd xmm10, xmm14
1269 paddd xmm11, xmm15
1270 pxor xmm4, xmm8
1271 pxor xmm5, xmm9
1272 pxor xmm6, xmm10
1273 pxor xmm7, xmm11
1274 movdqa xmmword ptr [rsp+0x100], xmm8
1275 movdqa xmm8, xmm4
1276 psrld xmm8, 7
1277 pslld xmm4, 25
1278 por xmm4, xmm8
1279 movdqa xmm8, xmm5
1280 psrld xmm8, 7
1281 pslld xmm5, 25
1282 por xmm5, xmm8
1283 movdqa xmm8, xmm6
1284 psrld xmm8, 7
1285 pslld xmm6, 25
1286 por xmm6, xmm8
1287 movdqa xmm8, xmm7
1288 psrld xmm8, 7
1289 pslld xmm7, 25
1290 por xmm7, xmm8
1291 paddd xmm0, xmmword ptr [rsp+0xD0]
1292 paddd xmm1, xmmword ptr [rsp]
1293 paddd xmm2, xmmword ptr [rsp+0x20]
1294 paddd xmm3, xmmword ptr [rsp+0x40]
1295 paddd xmm0, xmm5
1296 paddd xmm1, xmm6
1297 paddd xmm2, xmm7
1298 paddd xmm3, xmm4
1299 pxor xmm15, xmm0
1300 pxor xmm12, xmm1
1301 pxor xmm13, xmm2
1302 pxor xmm14, xmm3
1303 pshuflw xmm15, xmm15, 0xB1
1304 pshufhw xmm15, xmm15, 0xB1
1305 pshuflw xmm12, xmm12, 0xB1
1306 pshufhw xmm12, xmm12, 0xB1
1307 pshuflw xmm13, xmm13, 0xB1
1308 pshufhw xmm13, xmm13, 0xB1
1309 pshuflw xmm14, xmm14, 0xB1
1310 pshufhw xmm14, xmm14, 0xB1
1311 paddd xmm10, xmm15
1312 paddd xmm11, xmm12
1313 movdqa xmm8, xmmword ptr [rsp+0x100]
1314 paddd xmm8, xmm13
1315 paddd xmm9, xmm14
1316 pxor xmm5, xmm10
1317 pxor xmm6, xmm11
1318 pxor xmm7, xmm8
1319 pxor xmm4, xmm9
1320 movdqa xmmword ptr [rsp+0x100], xmm8
1321 movdqa xmm8, xmm5
1322 psrld xmm8, 12
1323 pslld xmm5, 20
1324 por xmm5, xmm8
1325 movdqa xmm8, xmm6
1326 psrld xmm8, 12
1327 pslld xmm6, 20
1328 por xmm6, xmm8
1329 movdqa xmm8, xmm7
1330 psrld xmm8, 12
1331 pslld xmm7, 20
1332 por xmm7, xmm8
1333 movdqa xmm8, xmm4
1334 psrld xmm8, 12
1335 pslld xmm4, 20
1336 por xmm4, xmm8
1337 paddd xmm0, xmmword ptr [rsp+0x30]
1338 paddd xmm1, xmmword ptr [rsp+0xA0]
1339 paddd xmm2, xmmword ptr [rsp+0x60]
1340 paddd xmm3, xmmword ptr [rsp+0x70]
1341 paddd xmm0, xmm5
1342 paddd xmm1, xmm6
1343 paddd xmm2, xmm7
1344 paddd xmm3, xmm4
1345 pxor xmm15, xmm0
1346 pxor xmm12, xmm1
1347 pxor xmm13, xmm2
1348 pxor xmm14, xmm3
1349 movdqa xmm8, xmm15
1350 psrld xmm15, 8
1351 pslld xmm8, 24
1352 pxor xmm15, xmm8
1353 movdqa xmm8, xmm12
1354 psrld xmm12, 8
1355 pslld xmm8, 24
1356 pxor xmm12, xmm8
1357 movdqa xmm8, xmm13
1358 psrld xmm13, 8
1359 pslld xmm8, 24
1360 pxor xmm13, xmm8
1361 movdqa xmm8, xmm14
1362 psrld xmm14, 8
1363 pslld xmm8, 24
1364 pxor xmm14, xmm8
1365 paddd xmm10, xmm15
1366 paddd xmm11, xmm12
1367 movdqa xmm8, xmmword ptr [rsp+0x100]
1368 paddd xmm8, xmm13
1369 paddd xmm9, xmm14
1370 pxor xmm5, xmm10
1371 pxor xmm6, xmm11
1372 pxor xmm7, xmm8
1373 pxor xmm4, xmm9
1374 movdqa xmmword ptr [rsp+0x100], xmm8
1375 movdqa xmm8, xmm5
1376 psrld xmm8, 7
1377 pslld xmm5, 25
1378 por xmm5, xmm8
1379 movdqa xmm8, xmm6
1380 psrld xmm8, 7
1381 pslld xmm6, 25
1382 por xmm6, xmm8
1383 movdqa xmm8, xmm7
1384 psrld xmm8, 7
1385 pslld xmm7, 25
1386 por xmm7, xmm8
1387 movdqa xmm8, xmm4
1388 psrld xmm8, 7
1389 pslld xmm4, 25
1390 por xmm4, xmm8
1391 paddd xmm0, xmmword ptr [rsp+0xB0]
1392 paddd xmm1, xmmword ptr [rsp+0x50]
1393 paddd xmm2, xmmword ptr [rsp+0x10]
1394 paddd xmm3, xmmword ptr [rsp+0x80]
1395 paddd xmm0, xmm4
1396 paddd xmm1, xmm5
1397 paddd xmm2, xmm6
1398 paddd xmm3, xmm7
1399 pxor xmm12, xmm0
1400 pxor xmm13, xmm1
1401 pxor xmm14, xmm2
1402 pxor xmm15, xmm3
1403 pshuflw xmm12, xmm12, 0xB1
1404 pshufhw xmm12, xmm12, 0xB1
1405 pshuflw xmm13, xmm13, 0xB1
1406 pshufhw xmm13, xmm13, 0xB1
1407 pshuflw xmm14, xmm14, 0xB1
1408 pshufhw xmm14, xmm14, 0xB1
1409 pshuflw xmm15, xmm15, 0xB1
1410 pshufhw xmm15, xmm15, 0xB1
1411 movdqa xmm8, xmmword ptr [rsp+0x100]
1412 paddd xmm8, xmm12
1413 paddd xmm9, xmm13
1414 paddd xmm10, xmm14
1415 paddd xmm11, xmm15
1416 pxor xmm4, xmm8
1417 pxor xmm5, xmm9
1418 pxor xmm6, xmm10
1419 pxor xmm7, xmm11
1420 movdqa xmmword ptr [rsp+0x100], xmm8
1421 movdqa xmm8, xmm4
1422 psrld xmm8, 12
1423 pslld xmm4, 20
1424 por xmm4, xmm8
1425 movdqa xmm8, xmm5
1426 psrld xmm8, 12
1427 pslld xmm5, 20
1428 por xmm5, xmm8
1429 movdqa xmm8, xmm6
1430 psrld xmm8, 12
1431 pslld xmm6, 20
1432 por xmm6, xmm8
1433 movdqa xmm8, xmm7
1434 psrld xmm8, 12
1435 pslld xmm7, 20
1436 por xmm7, xmm8
1437 paddd xmm0, xmmword ptr [rsp+0xF0]
1438 paddd xmm1, xmmword ptr [rsp]
1439 paddd xmm2, xmmword ptr [rsp+0x90]
1440 paddd xmm3, xmmword ptr [rsp+0x60]
1441 paddd xmm0, xmm4
1442 paddd xmm1, xmm5
1443 paddd xmm2, xmm6
1444 paddd xmm3, xmm7
1445 pxor xmm12, xmm0
1446 pxor xmm13, xmm1
1447 pxor xmm14, xmm2
1448 pxor xmm15, xmm3
1449 movdqa xmm8, xmm12
1450 psrld xmm12, 8
1451 pslld xmm8, 24
1452 pxor xmm12, xmm8
1453 movdqa xmm8, xmm13
1454 psrld xmm13, 8
1455 pslld xmm8, 24
1456 pxor xmm13, xmm8
1457 movdqa xmm8, xmm14
1458 psrld xmm14, 8
1459 pslld xmm8, 24
1460 pxor xmm14, xmm8
1461 movdqa xmm8, xmm15
1462 psrld xmm15, 8
1463 pslld xmm8, 24
1464 pxor xmm15, xmm8
1465 movdqa xmm8, xmmword ptr [rsp+0x100]
1466 paddd xmm8, xmm12
1467 paddd xmm9, xmm13
1468 paddd xmm10, xmm14
1469 paddd xmm11, xmm15
1470 pxor xmm4, xmm8
1471 pxor xmm5, xmm9
1472 pxor xmm6, xmm10
1473 pxor xmm7, xmm11
1474 movdqa xmmword ptr [rsp+0x100], xmm8
1475 movdqa xmm8, xmm4
1476 psrld xmm8, 7
1477 pslld xmm4, 25
1478 por xmm4, xmm8
1479 movdqa xmm8, xmm5
1480 psrld xmm8, 7
1481 pslld xmm5, 25
1482 por xmm5, xmm8
1483 movdqa xmm8, xmm6
1484 psrld xmm8, 7
1485 pslld xmm6, 25
1486 por xmm6, xmm8
1487 movdqa xmm8, xmm7
1488 psrld xmm8, 7
1489 pslld xmm7, 25
1490 por xmm7, xmm8
1491 paddd xmm0, xmmword ptr [rsp+0xE0]
1492 paddd xmm1, xmmword ptr [rsp+0x20]
1493 paddd xmm2, xmmword ptr [rsp+0x30]
1494 paddd xmm3, xmmword ptr [rsp+0x70]
1495 paddd xmm0, xmm5
1496 paddd xmm1, xmm6
1497 paddd xmm2, xmm7
1498 paddd xmm3, xmm4
1499 pxor xmm15, xmm0
1500 pxor xmm12, xmm1
1501 pxor xmm13, xmm2
1502 pxor xmm14, xmm3
1503 pshuflw xmm15, xmm15, 0xB1
1504 pshufhw xmm15, xmm15, 0xB1
1505 pshuflw xmm12, xmm12, 0xB1
1506 pshufhw xmm12, xmm12, 0xB1
1507 pshuflw xmm13, xmm13, 0xB1
1508 pshufhw xmm13, xmm13, 0xB1
1509 pshuflw xmm14, xmm14, 0xB1
1510 pshufhw xmm14, xmm14, 0xB1
1511 paddd xmm10, xmm15
1512 paddd xmm11, xmm12
1513 movdqa xmm8, xmmword ptr [rsp+0x100]
1514 paddd xmm8, xmm13
1515 paddd xmm9, xmm14
1516 pxor xmm5, xmm10
1517 pxor xmm6, xmm11
1518 pxor xmm7, xmm8
1519 pxor xmm4, xmm9
1520 movdqa xmmword ptr [rsp+0x100], xmm8
1521 movdqa xmm8, xmm5
1522 psrld xmm8, 12
1523 pslld xmm5, 20
1524 por xmm5, xmm8
1525 movdqa xmm8, xmm6
1526 psrld xmm8, 12
1527 pslld xmm6, 20
1528 por xmm6, xmm8
1529 movdqa xmm8, xmm7
1530 psrld xmm8, 12
1531 pslld xmm7, 20
1532 por xmm7, xmm8
1533 movdqa xmm8, xmm4
1534 psrld xmm8, 12
1535 pslld xmm4, 20
1536 por xmm4, xmm8
1537 paddd xmm0, xmmword ptr [rsp+0xA0]
1538 paddd xmm1, xmmword ptr [rsp+0xC0]
1539 paddd xmm2, xmmword ptr [rsp+0x40]
1540 paddd xmm3, xmmword ptr [rsp+0xD0]
1541 paddd xmm0, xmm5
1542 paddd xmm1, xmm6
1543 paddd xmm2, xmm7
1544 paddd xmm3, xmm4
1545 pxor xmm15, xmm0
1546 pxor xmm12, xmm1
1547 pxor xmm13, xmm2
1548 pxor xmm14, xmm3
1549 movdqa xmm8, xmm15
1550 psrld xmm15, 8
1551 pslld xmm8, 24
1552 pxor xmm15, xmm8
1553 movdqa xmm8, xmm12
1554 psrld xmm12, 8
1555 pslld xmm8, 24
1556 pxor xmm12, xmm8
1557 movdqa xmm8, xmm13
1558 psrld xmm13, 8
1559 pslld xmm8, 24
1560 pxor xmm13, xmm8
1561 movdqa xmm8, xmm14
1562 psrld xmm14, 8
1563 pslld xmm8, 24
1564 pxor xmm14, xmm8
1565 paddd xmm10, xmm15
1566 paddd xmm11, xmm12
1567 movdqa xmm8, xmmword ptr [rsp+0x100]
1568 paddd xmm8, xmm13
1569 paddd xmm9, xmm14
1570 pxor xmm5, xmm10
1571 pxor xmm6, xmm11
1572 pxor xmm7, xmm8
1573 pxor xmm4, xmm9
1574 pxor xmm0, xmm8
1575 pxor xmm1, xmm9
1576 pxor xmm2, xmm10
1577 pxor xmm3, xmm11
1578 movdqa xmm8, xmm5
1579 psrld xmm8, 7
1580 pslld xmm5, 25
1581 por xmm5, xmm8
1582 movdqa xmm8, xmm6
1583 psrld xmm8, 7
1584 pslld xmm6, 25
1585 por xmm6, xmm8
1586 movdqa xmm8, xmm7
1587 psrld xmm8, 7
1588 pslld xmm7, 25
1589 por xmm7, xmm8
1590 movdqa xmm8, xmm4
1591 psrld xmm8, 7
1592 pslld xmm4, 25
1593 por xmm4, xmm8
1594 pxor xmm4, xmm12
1595 pxor xmm5, xmm13
1596 pxor xmm6, xmm14
1597 pxor xmm7, xmm15
1598 mov eax, r13d
1599 jne 9b
1600 movdqa xmm9, xmm0
1601 punpckldq xmm0, xmm1
1602 punpckhdq xmm9, xmm1
1603 movdqa xmm11, xmm2
1604 punpckldq xmm2, xmm3
1605 punpckhdq xmm11, xmm3
1606 movdqa xmm1, xmm0
1607 punpcklqdq xmm0, xmm2
1608 punpckhqdq xmm1, xmm2
1609 movdqa xmm3, xmm9
1610 punpcklqdq xmm9, xmm11
1611 punpckhqdq xmm3, xmm11
1612 movdqu xmmword ptr [rbx], xmm0
1613 movdqu xmmword ptr [rbx+0x20], xmm1
1614 movdqu xmmword ptr [rbx+0x40], xmm9
1615 movdqu xmmword ptr [rbx+0x60], xmm3
1616 movdqa xmm9, xmm4
1617 punpckldq xmm4, xmm5
1618 punpckhdq xmm9, xmm5
1619 movdqa xmm11, xmm6
1620 punpckldq xmm6, xmm7
1621 punpckhdq xmm11, xmm7
1622 movdqa xmm5, xmm4
1623 punpcklqdq xmm4, xmm6
1624 punpckhqdq xmm5, xmm6
1625 movdqa xmm7, xmm9
1626 punpcklqdq xmm9, xmm11
1627 punpckhqdq xmm7, xmm11
1628 movdqu xmmword ptr [rbx+0x10], xmm4
1629 movdqu xmmword ptr [rbx+0x30], xmm5
1630 movdqu xmmword ptr [rbx+0x50], xmm9
1631 movdqu xmmword ptr [rbx+0x70], xmm7
1632 movdqa xmm1, xmmword ptr [rsp+0x110]
1633 movdqa xmm0, xmm1
1634 paddd xmm1, xmmword ptr [rsp+0x150]
1635 movdqa xmmword ptr [rsp+0x110], xmm1
1636 pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip]
1637 pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip]
1638 pcmpgtd xmm0, xmm1
1639 movdqa xmm1, xmmword ptr [rsp+0x120]
1640 psubd xmm1, xmm0
1641 movdqa xmmword ptr [rsp+0x120], xmm1
1642 add rbx, 128
1643 add rdi, 32
1644 sub rsi, 4
1645 cmp rsi, 4
1646 jnc 2b
1647 test rsi, rsi
1648 jnz 3f
1649 4:
1650 mov rsp, rbp
1651 pop rbp
1652 pop rbx
1653 pop r12
1654 pop r13
1655 pop r14
1656 pop r15
1657 RET
1658 .p2align 5
1659 3:
1660 test esi, 0x2
1661 je 3f
1662 movups xmm0, xmmword ptr [rcx]
1663 movups xmm1, xmmword ptr [rcx+0x10]
1664 movaps xmm8, xmm0
1665 movaps xmm9, xmm1
1666 movd xmm13, dword ptr [rsp+0x110]
1667 movd xmm14, dword ptr [rsp+0x120]
1668 punpckldq xmm13, xmm14
1669 movaps xmmword ptr [rsp], xmm13
1670 movd xmm14, dword ptr [rsp+0x114]
1671 movd xmm13, dword ptr [rsp+0x124]
1672 punpckldq xmm14, xmm13
1673 movaps xmmword ptr [rsp+0x10], xmm14
1674 mov r8, qword ptr [rdi]
1675 mov r9, qword ptr [rdi+0x8]
1676 movzx eax, byte ptr [rbp+0x40]
1677 or eax, r13d
1678 xor edx, edx
1679 2:
1680 mov r14d, eax
1681 or eax, r12d
1682 add rdx, 64
1683 cmp rdx, r15
1684 cmovne eax, r14d
1685 movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
1686 movaps xmm10, xmm2
1687 movups xmm4, xmmword ptr [r8+rdx-0x40]
1688 movups xmm5, xmmword ptr [r8+rdx-0x30]
1689 movaps xmm3, xmm4
1690 shufps xmm4, xmm5, 136
1691 shufps xmm3, xmm5, 221
1692 movaps xmm5, xmm3
1693 movups xmm6, xmmword ptr [r8+rdx-0x20]
1694 movups xmm7, xmmword ptr [r8+rdx-0x10]
1695 movaps xmm3, xmm6
1696 shufps xmm6, xmm7, 136
1697 pshufd xmm6, xmm6, 0x93
1698 shufps xmm3, xmm7, 221
1699 pshufd xmm7, xmm3, 0x93
1700 movups xmm12, xmmword ptr [r9+rdx-0x40]
1701 movups xmm13, xmmword ptr [r9+rdx-0x30]
1702 movaps xmm11, xmm12
1703 shufps xmm12, xmm13, 136
1704 shufps xmm11, xmm13, 221
1705 movaps xmm13, xmm11
1706 movups xmm14, xmmword ptr [r9+rdx-0x20]
1707 movups xmm15, xmmword ptr [r9+rdx-0x10]
1708 movaps xmm11, xmm14
1709 shufps xmm14, xmm15, 136
1710 pshufd xmm14, xmm14, 0x93
1711 shufps xmm11, xmm15, 221
1712 pshufd xmm15, xmm11, 0x93
1713 shl rax, 0x20
1714 or rax, 0x40
1715 movq xmm3, rax
1716 movdqa xmmword ptr [rsp+0x20], xmm3
1717 movaps xmm3, xmmword ptr [rsp]
1718 movaps xmm11, xmmword ptr [rsp+0x10]
1719 punpcklqdq xmm3, xmmword ptr [rsp+0x20]
1720 punpcklqdq xmm11, xmmword ptr [rsp+0x20]
1721 mov al, 7
1722 9:
1723 paddd xmm0, xmm4
1724 paddd xmm8, xmm12
1725 movaps xmmword ptr [rsp+0x20], xmm4
1726 movaps xmmword ptr [rsp+0x30], xmm12
1727 paddd xmm0, xmm1
1728 paddd xmm8, xmm9
1729 pxor xmm3, xmm0
1730 pxor xmm11, xmm8
1731 pshuflw xmm3, xmm3, 0xB1
1732 pshufhw xmm3, xmm3, 0xB1
1733 pshuflw xmm11, xmm11, 0xB1
1734 pshufhw xmm11, xmm11, 0xB1
1735 paddd xmm2, xmm3
1736 paddd xmm10, xmm11
1737 pxor xmm1, xmm2
1738 pxor xmm9, xmm10
1739 movdqa xmm4, xmm1
1740 pslld xmm1, 20
1741 psrld xmm4, 12
1742 por xmm1, xmm4
1743 movdqa xmm4, xmm9
1744 pslld xmm9, 20
1745 psrld xmm4, 12
1746 por xmm9, xmm4
1747 paddd xmm0, xmm5
1748 paddd xmm8, xmm13
1749 movaps xmmword ptr [rsp+0x40], xmm5
1750 movaps xmmword ptr [rsp+0x50], xmm13
1751 paddd xmm0, xmm1
1752 paddd xmm8, xmm9
1753 pxor xmm3, xmm0
1754 pxor xmm11, xmm8
1755 movdqa xmm13, xmm3
1756 psrld xmm3, 8
1757 pslld xmm13, 24
1758 pxor xmm3, xmm13
1759 movdqa xmm13, xmm11
1760 psrld xmm11, 8
1761 pslld xmm13, 24
1762 pxor xmm11, xmm13
1763 paddd xmm2, xmm3
1764 paddd xmm10, xmm11
1765 pxor xmm1, xmm2
1766 pxor xmm9, xmm10
1767 movdqa xmm4, xmm1
1768 pslld xmm1, 25
1769 psrld xmm4, 7
1770 por xmm1, xmm4
1771 movdqa xmm4, xmm9
1772 pslld xmm9, 25
1773 psrld xmm4, 7
1774 por xmm9, xmm4
1775 pshufd xmm0, xmm0, 0x93
1776 pshufd xmm8, xmm8, 0x93
1777 pshufd xmm3, xmm3, 0x4E
1778 pshufd xmm11, xmm11, 0x4E
1779 pshufd xmm2, xmm2, 0x39
1780 pshufd xmm10, xmm10, 0x39
1781 paddd xmm0, xmm6
1782 paddd xmm8, xmm14
1783 paddd xmm0, xmm1
1784 paddd xmm8, xmm9
1785 pxor xmm3, xmm0
1786 pxor xmm11, xmm8
1787 pshuflw xmm3, xmm3, 0xB1
1788 pshufhw xmm3, xmm3, 0xB1
1789 pshuflw xmm11, xmm11, 0xB1
1790 pshufhw xmm11, xmm11, 0xB1
1791 paddd xmm2, xmm3
1792 paddd xmm10, xmm11
1793 pxor xmm1, xmm2
1794 pxor xmm9, xmm10
1795 movdqa xmm4, xmm1
1796 pslld xmm1, 20
1797 psrld xmm4, 12
1798 por xmm1, xmm4
1799 movdqa xmm4, xmm9
1800 pslld xmm9, 20
1801 psrld xmm4, 12
1802 por xmm9, xmm4
1803 paddd xmm0, xmm7
1804 paddd xmm8, xmm15
1805 paddd xmm0, xmm1
1806 paddd xmm8, xmm9
1807 pxor xmm3, xmm0
1808 pxor xmm11, xmm8
1809 movdqa xmm13, xmm3
1810 psrld xmm3, 8
1811 pslld xmm13, 24
1812 pxor xmm3, xmm13
1813 movdqa xmm13, xmm11
1814 psrld xmm11, 8
1815 pslld xmm13, 24
1816 pxor xmm11, xmm13
1817 paddd xmm2, xmm3
1818 paddd xmm10, xmm11
1819 pxor xmm1, xmm2
1820 pxor xmm9, xmm10
1821 movdqa xmm4, xmm1
1822 pslld xmm1, 25
1823 psrld xmm4, 7
1824 por xmm1, xmm4
1825 movdqa xmm4, xmm9
1826 pslld xmm9, 25
1827 psrld xmm4, 7
1828 por xmm9, xmm4
1829 pshufd xmm0, xmm0, 0x39
1830 pshufd xmm8, xmm8, 0x39
1831 pshufd xmm3, xmm3, 0x4E
1832 pshufd xmm11, xmm11, 0x4E
1833 pshufd xmm2, xmm2, 0x93
1834 pshufd xmm10, xmm10, 0x93
1835 dec al
1836 je 9f
1837 movdqa xmm12, xmmword ptr [rsp+0x20]
1838 movdqa xmm5, xmmword ptr [rsp+0x40]
1839 pshufd xmm13, xmm12, 0x0F
1840 shufps xmm12, xmm5, 214
1841 pshufd xmm4, xmm12, 0x39
1842 movdqa xmm12, xmm6
1843 shufps xmm12, xmm7, 250
1844 pand xmm13, xmmword ptr [PBLENDW_0x33_MASK+rip]
1845 pand xmm12, xmmword ptr [PBLENDW_0xCC_MASK+rip]
1846 por xmm13, xmm12
1847 movdqa xmmword ptr [rsp+0x20], xmm13
1848 movdqa xmm12, xmm7
1849 punpcklqdq xmm12, xmm5
1850 movdqa xmm13, xmm6
1851 pand xmm12, xmmword ptr [PBLENDW_0x3F_MASK+rip]
1852 pand xmm13, xmmword ptr [PBLENDW_0xC0_MASK+rip]
1853 por xmm12, xmm13
1854 pshufd xmm12, xmm12, 0x78
1855 punpckhdq xmm5, xmm7
1856 punpckldq xmm6, xmm5
1857 pshufd xmm7, xmm6, 0x1E
1858 movdqa xmmword ptr [rsp+0x40], xmm12
1859 movdqa xmm5, xmmword ptr [rsp+0x30]
1860 movdqa xmm13, xmmword ptr [rsp+0x50]
1861 pshufd xmm6, xmm5, 0x0F
1862 shufps xmm5, xmm13, 214
1863 pshufd xmm12, xmm5, 0x39
1864 movdqa xmm5, xmm14
1865 shufps xmm5, xmm15, 250
1866 pand xmm6, xmmword ptr [PBLENDW_0x33_MASK+rip]
1867 pand xmm5, xmmword ptr [PBLENDW_0xCC_MASK+rip]
1868 por xmm6, xmm5
1869 movdqa xmm5, xmm15
1870 punpcklqdq xmm5, xmm13
1871 movdqa xmmword ptr [rsp+0x30], xmm2
1872 movdqa xmm2, xmm14
1873 pand xmm5, xmmword ptr [PBLENDW_0x3F_MASK+rip]
1874 pand xmm2, xmmword ptr [PBLENDW_0xC0_MASK+rip]
1875 por xmm5, xmm2
1876 movdqa xmm2, xmmword ptr [rsp+0x30]
1877 pshufd xmm5, xmm5, 0x78
1878 punpckhdq xmm13, xmm15
1879 punpckldq xmm14, xmm13
1880 pshufd xmm15, xmm14, 0x1E
1881 movdqa xmm13, xmm6
1882 movdqa xmm14, xmm5
1883 movdqa xmm5, xmmword ptr [rsp+0x20]
1884 movdqa xmm6, xmmword ptr [rsp+0x40]
1885 jmp 9b
1886 9:
1887 pxor xmm0, xmm2
1888 pxor xmm1, xmm3
1889 pxor xmm8, xmm10
1890 pxor xmm9, xmm11
1891 mov eax, r13d
1892 cmp rdx, r15
1893 jne 2b
1894 movups xmmword ptr [rbx], xmm0
1895 movups xmmword ptr [rbx+0x10], xmm1
1896 movups xmmword ptr [rbx+0x20], xmm8
1897 movups xmmword ptr [rbx+0x30], xmm9
1898 mov eax, dword ptr [rsp+0x130]
1899 neg eax
1900 mov r10d, dword ptr [rsp+0x110+8*rax]
1901 mov r11d, dword ptr [rsp+0x120+8*rax]
1902 mov dword ptr [rsp+0x110], r10d
1903 mov dword ptr [rsp+0x120], r11d
1904 add rdi, 16
1905 add rbx, 64
1906 sub rsi, 2
1907 3:
1908 test esi, 0x1
1909 je 4b
1910 movups xmm0, xmmword ptr [rcx]
1911 movups xmm1, xmmword ptr [rcx+0x10]
1912 movd xmm13, dword ptr [rsp+0x110]
1913 movd xmm14, dword ptr [rsp+0x120]
1914 punpckldq xmm13, xmm14
1915 mov r8, qword ptr [rdi]
1916 movzx eax, byte ptr [rbp+0x40]
1917 or eax, r13d
1918 xor edx, edx
1919 2:
1920 mov r14d, eax
1921 or eax, r12d
1922 add rdx, 64
1923 cmp rdx, r15
1924 cmovne eax, r14d
1925 movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
1926 shl rax, 32
1927 or rax, 64
1928 movq xmm12, rax
1929 movdqa xmm3, xmm13
1930 punpcklqdq xmm3, xmm12
1931 movups xmm4, xmmword ptr [r8+rdx-0x40]
1932 movups xmm5, xmmword ptr [r8+rdx-0x30]
1933 movaps xmm8, xmm4
1934 shufps xmm4, xmm5, 136
1935 shufps xmm8, xmm5, 221
1936 movaps xmm5, xmm8
1937 movups xmm6, xmmword ptr [r8+rdx-0x20]
1938 movups xmm7, xmmword ptr [r8+rdx-0x10]
1939 movaps xmm8, xmm6
1940 shufps xmm6, xmm7, 136
1941 pshufd xmm6, xmm6, 0x93
1942 shufps xmm8, xmm7, 221
1943 pshufd xmm7, xmm8, 0x93
1944 mov al, 7
1945 9:
1946 paddd xmm0, xmm4
1947 paddd xmm0, xmm1
1948 pxor xmm3, xmm0
1949 pshuflw xmm3, xmm3, 0xB1
1950 pshufhw xmm3, xmm3, 0xB1
1951 paddd xmm2, xmm3
1952 pxor xmm1, xmm2
1953 movdqa xmm11, xmm1
1954 pslld xmm1, 20
1955 psrld xmm11, 12
1956 por xmm1, xmm11
1957 paddd xmm0, xmm5
1958 paddd xmm0, xmm1
1959 pxor xmm3, xmm0
1960 movdqa xmm14, xmm3
1961 psrld xmm3, 8
1962 pslld xmm14, 24
1963 pxor xmm3, xmm14
1964 paddd xmm2, xmm3
1965 pxor xmm1, xmm2
1966 movdqa xmm11, xmm1
1967 pslld xmm1, 25
1968 psrld xmm11, 7
1969 por xmm1, xmm11
1970 pshufd xmm0, xmm0, 0x93
1971 pshufd xmm3, xmm3, 0x4E
1972 pshufd xmm2, xmm2, 0x39
1973 paddd xmm0, xmm6
1974 paddd xmm0, xmm1
1975 pxor xmm3, xmm0
1976 pshuflw xmm3, xmm3, 0xB1
1977 pshufhw xmm3, xmm3, 0xB1
1978 paddd xmm2, xmm3
1979 pxor xmm1, xmm2
1980 movdqa xmm11, xmm1
1981 pslld xmm1, 20
1982 psrld xmm11, 12
1983 por xmm1, xmm11
1984 paddd xmm0, xmm7
1985 paddd xmm0, xmm1
1986 pxor xmm3, xmm0
1987 movdqa xmm14, xmm3
1988 psrld xmm3, 8
1989 pslld xmm14, 24
1990 pxor xmm3, xmm14
1991 paddd xmm2, xmm3
1992 pxor xmm1, xmm2
1993 movdqa xmm11, xmm1
1994 pslld xmm1, 25
1995 psrld xmm11, 7
1996 por xmm1, xmm11
1997 pshufd xmm0, xmm0, 0x39
1998 pshufd xmm3, xmm3, 0x4E
1999 pshufd xmm2, xmm2, 0x93
2000 dec al
2001 jz 9f
2002 movdqa xmm8, xmm4
2003 shufps xmm8, xmm5, 214
2004 pshufd xmm9, xmm4, 0x0F
2005 pshufd xmm4, xmm8, 0x39
2006 movdqa xmm8, xmm6
2007 shufps xmm8, xmm7, 250
2008 pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip]
2009 pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip]
2010 por xmm9, xmm8
2011 movdqa xmm8, xmm7
2012 punpcklqdq xmm8, xmm5
2013 movdqa xmm10, xmm6
2014 pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
2015 pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
2016 por xmm8, xmm10
2017 pshufd xmm8, xmm8, 0x78
2018 punpckhdq xmm5, xmm7
2019 punpckldq xmm6, xmm5
2020 pshufd xmm7, xmm6, 0x1E
2021 movdqa xmm5, xmm9
2022 movdqa xmm6, xmm8
2023 jmp 9b
2024 9:
2025 pxor xmm0, xmm2
2026 pxor xmm1, xmm3
2027 mov eax, r13d
2028 cmp rdx, r15
2029 jne 2b
2030 movups xmmword ptr [rbx], xmm0
2031 movups xmmword ptr [rbx+0x10], xmm1
2032 jmp 4b
2033 SET_SIZE(zfs_blake3_hash_many_sse2)
2034
/*
 * zfs_blake3_compress_in_place_sse2
 *
 * Runs one 7-round BLAKE3 compression and writes the 32-byte result back
 * over the chaining value it started from.
 *
 * ABI: System V AMD64.  The C-level names below follow the BLAKE3
 * reference prototype, which is not visible in this file -- confirm
 * against the C declaration.
 *
 *   rdi  cv         32 bytes, read as state rows 0-1, overwritten on return
 *   rsi  block      64 bytes of message, read only
 *   dl   block_len  byte-sized; becomes state word 14
 *   rcx  counter    64-bit; becomes state words 12-13
 *   r8b  flags      byte-sized; becomes state word 15
 *
 * Clobbers: rax, rdx, xmm0-xmm11, xmm14, flags.  No stack is used.
 *
 * Register roles inside the round loop:
 *   xmm0..xmm3  state rows 0..3 (four 32-bit words each)
 *   xmm4..xmm7  the sixteen message words, pre-arranged for the four
 *               additions of one round
 *   xmm8..xmm11, xmm14  scratch
 */
ENTRY_ALIGN(zfs_blake3_compress_in_place_sse2, 64)
        ENDBR
        movups  xmm0, xmmword ptr [rdi]
        movups  xmm1, xmmword ptr [rdi+0x10]
        movaps  xmm2, xmmword ptr [BLAKE3_IV+rip]
        /*
         * Zero-extend the two byte-sized arguments before packing them.
         * The ABI leaves the upper register bits of narrow integer
         * arguments unspecified, and any stray bit would land in state
         * words 14/15.  zfs_blake3_compress_xof_sse2 normalises dl and
         * r8b the same way.  rax is scratch here (al is reloaded below).
         */
        movzx   eax, r8b
        movzx   edx, dl
        shl     rax, 32
        add     rdx, rax                /* rdx = block_len | flags << 32 */
        movq    xmm3, rcx
        movq    xmm4, rdx
        punpcklqdq xmm3, xmm4           /* row 3 = { counter, rdx } */
        /* Split the first 32 message bytes into even / odd words. */
        movups  xmm4, xmmword ptr [rsi]
        movups  xmm5, xmmword ptr [rsi+0x10]
        movaps  xmm8, xmm4
        shufps  xmm4, xmm5, 136         /* words 0, 2, 4, 6 */
        shufps  xmm8, xmm5, 221         /* words 1, 3, 5, 7 */
        movaps  xmm5, xmm8
        /* Same for the last 32 bytes, then rotate lanes up by one (0x93). */
        movups  xmm6, xmmword ptr [rsi+0x20]
        movups  xmm7, xmmword ptr [rsi+0x30]
        movaps  xmm8, xmm6
        shufps  xmm6, xmm7, 136
        pshufd  xmm6, xmm6, 0x93
        shufps  xmm8, xmm7, 221
        pshufd  xmm7, xmm8, 0x93
        mov     al, 7                   /* round counter */
9:
        /* Column step, first half: rotate row 3 by 16, row 1 by 12. */
        paddd   xmm0, xmm4
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshuflw xmm3, xmm3, 0xB1        /* swap 16-bit halves of each dword */
        pshufhw xmm3, xmm3, 0xB1
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 20
        psrld   xmm11, 12
        por     xmm1, xmm11
        /* Column step, second half: rotate row 3 by 8, row 1 by 7. */
        paddd   xmm0, xmm5
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        movdqa  xmm14, xmm3
        psrld   xmm3, 8
        pslld   xmm14, 24
        pxor    xmm3, xmm14
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 25
        psrld   xmm11, 7
        por     xmm1, xmm11
        /* Rotate rows 0, 3 and 2 so the diagonals line up as columns. */
        pshufd  xmm0, xmm0, 0x93
        pshufd  xmm3, xmm3, 0x4E
        pshufd  xmm2, xmm2, 0x39
        /* Diagonal step, first half. */
        paddd   xmm0, xmm6
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshuflw xmm3, xmm3, 0xB1
        pshufhw xmm3, xmm3, 0xB1
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 20
        psrld   xmm11, 12
        por     xmm1, xmm11
        /* Diagonal step, second half. */
        paddd   xmm0, xmm7
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        movdqa  xmm14, xmm3
        psrld   xmm3, 8
        pslld   xmm14, 24
        pxor    xmm3, xmm14
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 25
        psrld   xmm11, 7
        por     xmm1, xmm11
        /* Undo the row rotation. */
        pshufd  xmm0, xmm0, 0x39
        pshufd  xmm3, xmm3, 0x4E
        pshufd  xmm2, xmm2, 0x93
        dec     al
        jz      9f                      /* seventh round done */
        /*
         * Re-arrange xmm4..xmm7 for the next round.  Each pand/pand/por
         * triple blends two vectors through a PBLENDW_* mask.
         */
        movdqa  xmm8, xmm4
        shufps  xmm8, xmm5, 214
        pshufd  xmm9, xmm4, 0x0F
        pshufd  xmm4, xmm8, 0x39
        movdqa  xmm8, xmm6
        shufps  xmm8, xmm7, 250
        pand    xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip]
        pand    xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip]
        por     xmm9, xmm8
        movdqa  xmm8, xmm7
        punpcklqdq xmm8, xmm5
        movdqa  xmm10, xmm6
        pand    xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
        pand    xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
        por     xmm8, xmm10
        pshufd  xmm8, xmm8, 0x78
        punpckhdq xmm5, xmm7
        punpckldq xmm6, xmm5
        pshufd  xmm7, xmm6, 0x1E
        movdqa  xmm5, xmm9
        movdqa  xmm6, xmm8
        jmp     9b
9:
        /* New chaining value = rows 0-1 XOR rows 2-3; store it over [rdi]. */
        pxor    xmm0, xmm2
        pxor    xmm1, xmm3
        movups  xmmword ptr [rdi], xmm0
        movups  xmmword ptr [rdi+0x10], xmm1
        RET
SET_SIZE(zfs_blake3_compress_in_place_sse2)
2145
/*
 * zfs_blake3_compress_xof_sse2
 *
 * Runs one 7-round BLAKE3 compression and stores the full 64-byte state
 * to a separate output buffer; the input chaining value is not modified.
 *
 * ABI: System V AMD64.  The C-level names below follow the BLAKE3
 * reference prototype, which is not visible in this file -- confirm
 * against the C declaration.
 *
 *   rdi  cv         32 bytes, read only (loaded at entry and again at exit)
 *   rsi  block      64 bytes of message, read only
 *   dl   block_len  only the low byte is used; becomes state word 14
 *   rcx  counter    64-bit; becomes state words 12-13
 *   r8b  flags      only the low byte is used; becomes state word 15
 *   r9   out        64 bytes, written with unaligned stores
 *
 * Clobbers: rax, rdx, xmm0-xmm11, xmm14, flags.  No stack is used.
 *
 * Register roles inside the round loop:
 *   xmm0..xmm3  state rows 0..3 (four 32-bit words each)
 *   xmm4..xmm7  the sixteen message words, pre-arranged for the four
 *               additions of one round
 *   xmm8..xmm11, xmm14  scratch
 */
2146 ENTRY_ALIGN(zfs_blake3_compress_xof_sse2, 64)
2147 ENDBR
/* Rows 0-1 come from the chaining value, row 2 from the IV table. */
2148 movups xmm0, xmmword ptr [rdi]
2149 movups xmm1, xmmword ptr [rdi+0x10]
2150 movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
/* rdx = zero-extended dl | (zero-extended r8b << 32) */
2151 movzx eax, r8b
2152 movzx edx, dl
2153 shl rax, 32
2154 add rdx, rax
/* Row 3 = { rcx in the low 64 bits, rdx in the high 64 bits } */
2155 movq xmm3, rcx
2156 movq xmm4, rdx
2157 punpcklqdq xmm3, xmm4
/* Split the first 32 message bytes: xmm4 = words 0,2,4,6; xmm5 = words 1,3,5,7 */
2158 movups xmm4, xmmword ptr [rsi]
2159 movups xmm5, xmmword ptr [rsi+0x10]
2160 movaps xmm8, xmm4
2161 shufps xmm4, xmm5, 136
2162 shufps xmm8, xmm5, 221
2163 movaps xmm5, xmm8
/* Same even/odd split for the last 32 bytes, then rotate lanes up by one (0x93) */
2164 movups xmm6, xmmword ptr [rsi+0x20]
2165 movups xmm7, xmmword ptr [rsi+0x30]
2166 movaps xmm8, xmm6
2167 shufps xmm6, xmm7, 136
2168 pshufd xmm6, xmm6, 0x93
2169 shufps xmm8, xmm7, 221
2170 pshufd xmm7, xmm8, 0x93
/* al = round counter; the loop body below runs 7 times */
2171 mov al, 7
2172 9:
/* Column step, first half: row0 += msg + row1; row3 ^= row0, rotated by 16 */
2173 paddd xmm0, xmm4
2174 paddd xmm0, xmm1
2175 pxor xmm3, xmm0
2176 pshuflw xmm3, xmm3, 0xB1
2177 pshufhw xmm3, xmm3, 0xB1
/* row2 += row3; row1 ^= row2, rotated right by 12 */
2178 paddd xmm2, xmm3
2179 pxor xmm1, xmm2
2180 movdqa xmm11, xmm1
2181 pslld xmm1, 20
2182 psrld xmm11, 12
2183 por xmm1, xmm11
/* Column step, second half: row3 rotated right by 8, row1 by 7 */
2184 paddd xmm0, xmm5
2185 paddd xmm0, xmm1
2186 pxor xmm3, xmm0
2187 movdqa xmm14, xmm3
2188 psrld xmm3, 8
2189 pslld xmm14, 24
2190 pxor xmm3, xmm14
2191 paddd xmm2, xmm3
2192 pxor xmm1, xmm2
2193 movdqa xmm11, xmm1
2194 pslld xmm1, 25
2195 psrld xmm11, 7
2196 por xmm1, xmm11
/* Rotate rows 0, 3 and 2 (row 1 stays put) so the diagonals line up as columns */
2197 pshufd xmm0, xmm0, 0x93
2198 pshufd xmm3, xmm3, 0x4E
2199 pshufd xmm2, xmm2, 0x39
/* Diagonal step, first half (same arithmetic as the column step, using xmm6) */
2200 paddd xmm0, xmm6
2201 paddd xmm0, xmm1
2202 pxor xmm3, xmm0
2203 pshuflw xmm3, xmm3, 0xB1
2204 pshufhw xmm3, xmm3, 0xB1
2205 paddd xmm2, xmm3
2206 pxor xmm1, xmm2
2207 movdqa xmm11, xmm1
2208 pslld xmm1, 20
2209 psrld xmm11, 12
2210 por xmm1, xmm11
/* Diagonal step, second half (using xmm7) */
2211 paddd xmm0, xmm7
2212 paddd xmm0, xmm1
2213 pxor xmm3, xmm0
2214 movdqa xmm14, xmm3
2215 psrld xmm3, 8
2216 pslld xmm14, 24
2217 pxor xmm3, xmm14
2218 paddd xmm2, xmm3
2219 pxor xmm1, xmm2
2220 movdqa xmm11, xmm1
2221 pslld xmm1, 25
2222 psrld xmm11, 7
2223 por xmm1, xmm11
/* Undo the row rotation */
2224 pshufd xmm0, xmm0, 0x39
2225 pshufd xmm3, xmm3, 0x4E
2226 pshufd xmm2, xmm2, 0x93
/* Leave after the seventh round; the re-arrangement below is skipped then */
2227 dec al
2228 jz 9f
/*
 * Re-arrange xmm4..xmm7 for the next round.  Each pand/pand/por triple
 * blends two vectors through a PBLENDW_* mask.
 */
2229 movdqa xmm8, xmm4
2230 shufps xmm8, xmm5, 214
2231 pshufd xmm9, xmm4, 0x0F
2232 pshufd xmm4, xmm8, 0x39
2233 movdqa xmm8, xmm6
2234 shufps xmm8, xmm7, 250
2235 pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip]
2236 pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip]
2237 por xmm9, xmm8
2238 movdqa xmm8, xmm7
2239 punpcklqdq xmm8, xmm5
2240 movdqa xmm10, xmm6
2241 pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
2242 pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
2243 por xmm8, xmm10
2244 pshufd xmm8, xmm8, 0x78
2245 punpckhdq xmm5, xmm7
2246 punpckldq xmm6, xmm5
2247 pshufd xmm7, xmm6, 0x1E
2248 movdqa xmm5, xmm9
2249 movdqa xmm6, xmm8
2250 jmp 9b
2251 9:
/* Reload the untouched input chaining value for the upper half of the output */
2252 movdqu xmm4, xmmword ptr [rdi]
2253 movdqu xmm5, xmmword ptr [rdi+0x10]
/* out[0..31] = rows 0-1 XOR rows 2-3; out[32..63] = rows 2-3 XOR input cv */
2254 pxor xmm0, xmm2
2255 pxor xmm1, xmm3
2256 pxor xmm2, xmm4
2257 pxor xmm3, xmm5
2258 movups xmmword ptr [r9], xmm0
2259 movups xmmword ptr [r9+0x10], xmm1
2260 movups xmmword ptr [r9+0x20], xmm2
2261 movups xmmword ptr [r9+0x30], xmm3
2262 RET
2263 SET_SIZE(zfs_blake3_compress_xof_sse2)
2264
/*
 * Constant tables shared by the routines in this file.
 *
 * The block starts on a 64-byte boundary and every table is exactly
 * 16 bytes, so each label is 16-byte aligned.  The code depends on that:
 * the tables are read with movaps/movdqa and as direct memory operands
 * of pand/pxor, all of which require an aligned address.  Do not insert,
 * resize or reorder entries without preserving that property.
 *
 * SECTION_STATIC is not defined in this file (presumably it comes from
 * <sys/asm_linkage.h>).
 */
2265 SECTION_STATIC
2266 .p2align 6
/* The four IV words loaded as state row 2 by the single-block routines */
2267 BLAKE3_IV:
2268 .long 0x6A09E667, 0xBB67AE85
2269 .long 0x3C6EF372, 0xA54FF53A
/* Per-lane counter offsets; hash_many masks this with the broadcast, negated r9d at entry */
2270 ADD0:
2271 .long 0, 1, 2, 3
/* Per-lane counter step of 4, masked the same way and added after each 4-input batch */
2272 ADD1:
2273 .long 4, 4, 4, 4
/* Each of the four IV words above, broadcast to all four lanes */
2274 BLAKE3_IV_0:
2275 .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667
2276 BLAKE3_IV_1:
2277 .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85
2278 BLAKE3_IV_2:
2279 .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372
2280 BLAKE3_IV_3:
2281 .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A
/* The value 64 broadcast to all four lanes */
2282 BLAKE3_BLOCK_LEN:
2283 .long 64, 64, 64, 64
/*
 * Sign-bit mask.  XORing both operands with it before the signed pcmpgtd
 * turns that into an unsigned compare; hash_many uses this to detect
 * carry out of the low counter words.
 */
2284 CMP_MSB_MASK:
2285 .long 0x80000000, 0x80000000, 0x80000000, 0x80000000
/*
 * Dword select masks used in pand/pand/por blends.  Each name carries the
 * pblendw immediate that selects the same 16-bit words (0x33 and 0xCC are
 * complements, as are 0x3F and 0xC0).
 */
2286 PBLENDW_0x33_MASK:
2287 .long 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000
2288 PBLENDW_0xCC_MASK:
2289 .long 0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF
2290 PBLENDW_0x3F_MASK:
2291 .long 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000
2292 PBLENDW_0xC0_MASK:
2293 .long 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF
2294
2295 #endif /* HAVE_SSE2 */
2296
/*
 * Emit an empty .note.GNU-stack section on ELF targets; GNU toolchains
 * read it as "this object does not need an executable stack".  It sits
 * after the HAVE_SSE2 guard has been closed, so it is emitted even when
 * all of the code above is compiled out.
 */
2297 #ifdef __ELF__
2298 .section .note.GNU-stack,"",%progbits
2299 #endif
Cache object: 2cc3f8888a9a15c6ac16c2573da44315
|