# Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
#
# AES-NI-CTR+GHASH stitch.
#
# February 2013
#
# The OpenSSL GCM implementation is organized in such a way that its
# performance is rather close to the sum of its streamed components,
# in this context parallelized AES-NI CTR and modulo-scheduled
# PCLMULQDQ-enabled GHASH. Unfortunately, as no stitched implementation
# was observed to perform significantly better than the sum of the
# components on contemporary CPUs, the effort was deemed impossible to
# justify. This module is based on a combination of Intel submissions,
# [1] and [2], with a MOVBE twist suggested by Ilya Albrekht and Max
# Locktyukhin of Intel Corp., who verified that it reduces shuffle
# pressure with a notable relative improvement, achieving 1.0 cycle per
# byte processed with a 128-bit key on Haswell, 0.74 on Broadwell, and
# 0.63 on Skylake... [Mentioned results are raw profiled measurements
# for a favourable packet size, one divisible by 96. Applications using
# the EVP interface will observe a few percent worse performance.]
#
# Knights Landing processes 1 byte in 1.25 cycles (measured with EVP).
#
# [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
# [2] http://www.intel.com/content/dam/www/public/us/en/documents/software-support/enabling-high-performance-gcm.pdf
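#
# The 6x "stitched" loops below interleave, within each 96-byte iteration,
# the AES-CTR encryption of six counter blocks with the PCLMULQDQ-based
# GHASH folding of six ciphertext blocks, so the AES and carry-less
# multiply units work in parallel. A rough C sketch of that schedule,
# using hypothetical helper names that are not part of this file's
# interface, might look like:
#
#	for (size_t done = 0; done + 96 <= len; done += 96) {
#		/* vaesenc rounds and vpclmulqdq multiplies are interleaved */
#		aes_ctr_encrypt_6(key_schedule, counter, keystream);
#		ghash_fold_6(Htable, Xi, ciphertext_blocks);
#		xor_blocks_6(in + done, keystream, out + done);
#	}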

# Generated once from
# https://github.com/openssl/openssl/blob/5ffc3324/crypto/modes/asm/aesni-gcm-x86_64.pl
# and modified for ICP. Modifications are kept to a bare minimum to ease later
# upstream merges.

#if defined(__x86_64__) && defined(HAVE_AVX) && \
defined(HAVE_AES) && defined(HAVE_PCLMULQDQ)

#define _ASM
#include <sys/asm_linkage.h>

/* Windows userland links with OpenSSL */
#if !defined (_WIN32) || defined (_KERNEL)

.extern gcm_avx_can_use_movbe

.text

#ifdef HAVE_MOVBE
.balign 32
FUNCTION(_aesni_ctr32_ghash_6x)
.cfi_startproc
ENDBR
vmovdqu 32(%r11),%xmm2
subq $6,%rdx
vpxor %xmm4,%xmm4,%xmm4
vmovdqu 0-128(%rcx),%xmm15
vpaddb %xmm2,%xmm1,%xmm10
vpaddb %xmm2,%xmm10,%xmm11
vpaddb %xmm2,%xmm11,%xmm12
vpaddb %xmm2,%xmm12,%xmm13
vpaddb %xmm2,%xmm13,%xmm14
vpxor %xmm15,%xmm1,%xmm9
vmovdqu %xmm4,16+8(%rsp)
jmp .Loop6x

.balign 32
.Loop6x:
addl $100663296,%ebx
jc .Lhandle_ctr32
vmovdqu 0-32(%r9),%xmm3
vpaddb %xmm2,%xmm14,%xmm1
vpxor %xmm15,%xmm10,%xmm10
vpxor %xmm15,%xmm11,%xmm11

.Lresume_ctr32:
vmovdqu %xmm1,(%r8)
vpclmulqdq $0x10,%xmm3,%xmm7,%xmm5
vpxor %xmm15,%xmm12,%xmm12
vmovups 16-128(%rcx),%xmm2
vpclmulqdq $0x01,%xmm3,%xmm7,%xmm6
xorq %r12,%r12
cmpq %r14,%r15

vaesenc %xmm2,%xmm9,%xmm9
vmovdqu 48+8(%rsp),%xmm0
vpxor %xmm15,%xmm13,%xmm13
vpclmulqdq $0x00,%xmm3,%xmm7,%xmm1
vaesenc %xmm2,%xmm10,%xmm10
vpxor %xmm15,%xmm14,%xmm14
setnc %r12b
vpclmulqdq $0x11,%xmm3,%xmm7,%xmm7
vaesenc %xmm2,%xmm11,%xmm11
vmovdqu 16-32(%r9),%xmm3
negq %r12
vaesenc %xmm2,%xmm12,%xmm12
vpxor %xmm5,%xmm6,%xmm6
vpclmulqdq $0x00,%xmm3,%xmm0,%xmm5
vpxor %xmm4,%xmm8,%xmm8
vaesenc %xmm2,%xmm13,%xmm13
vpxor %xmm5,%xmm1,%xmm4
andq $0x60,%r12
vmovups 32-128(%rcx),%xmm15
vpclmulqdq $0x10,%xmm3,%xmm0,%xmm1
vaesenc %xmm2,%xmm14,%xmm14

vpclmulqdq $0x01,%xmm3,%xmm0,%xmm2
leaq (%r14,%r12,1),%r14
vaesenc %xmm15,%xmm9,%xmm9
vpxor 16+8(%rsp),%xmm8,%xmm8
vpclmulqdq $0x11,%xmm3,%xmm0,%xmm3
vmovdqu 64+8(%rsp),%xmm0
vaesenc %xmm15,%xmm10,%xmm10
movbeq 88(%r14),%r13
vaesenc %xmm15,%xmm11,%xmm11
movbeq 80(%r14),%r12
vaesenc %xmm15,%xmm12,%xmm12
movq %r13,32+8(%rsp)
vaesenc %xmm15,%xmm13,%xmm13
movq %r12,40+8(%rsp)
vmovdqu 48-32(%r9),%xmm5
vaesenc %xmm15,%xmm14,%xmm14

vmovups 48-128(%rcx),%xmm15
vpxor %xmm1,%xmm6,%xmm6
vpclmulqdq $0x00,%xmm5,%xmm0,%xmm1
vaesenc %xmm15,%xmm9,%xmm9
vpxor %xmm2,%xmm6,%xmm6
vpclmulqdq $0x10,%xmm5,%xmm0,%xmm2
vaesenc %xmm15,%xmm10,%xmm10
vpxor %xmm3,%xmm7,%xmm7
vpclmulqdq $0x01,%xmm5,%xmm0,%xmm3
vaesenc %xmm15,%xmm11,%xmm11
vpclmulqdq $0x11,%xmm5,%xmm0,%xmm5
vmovdqu 80+8(%rsp),%xmm0
vaesenc %xmm15,%xmm12,%xmm12
vaesenc %xmm15,%xmm13,%xmm13
vpxor %xmm1,%xmm4,%xmm4
vmovdqu 64-32(%r9),%xmm1
vaesenc %xmm15,%xmm14,%xmm14

vmovups 64-128(%rcx),%xmm15
vpxor %xmm2,%xmm6,%xmm6
vpclmulqdq $0x00,%xmm1,%xmm0,%xmm2
vaesenc %xmm15,%xmm9,%xmm9
vpxor %xmm3,%xmm6,%xmm6
vpclmulqdq $0x10,%xmm1,%xmm0,%xmm3
vaesenc %xmm15,%xmm10,%xmm10
movbeq 72(%r14),%r13
vpxor %xmm5,%xmm7,%xmm7
vpclmulqdq $0x01,%xmm1,%xmm0,%xmm5
vaesenc %xmm15,%xmm11,%xmm11
movbeq 64(%r14),%r12
vpclmulqdq $0x11,%xmm1,%xmm0,%xmm1
vmovdqu 96+8(%rsp),%xmm0
vaesenc %xmm15,%xmm12,%xmm12
movq %r13,48+8(%rsp)
vaesenc %xmm15,%xmm13,%xmm13
movq %r12,56+8(%rsp)
vpxor %xmm2,%xmm4,%xmm4
vmovdqu 96-32(%r9),%xmm2
vaesenc %xmm15,%xmm14,%xmm14

vmovups 80-128(%rcx),%xmm15
vpxor %xmm3,%xmm6,%xmm6
vpclmulqdq $0x00,%xmm2,%xmm0,%xmm3
vaesenc %xmm15,%xmm9,%xmm9
vpxor %xmm5,%xmm6,%xmm6
vpclmulqdq $0x10,%xmm2,%xmm0,%xmm5
vaesenc %xmm15,%xmm10,%xmm10
movbeq 56(%r14),%r13
vpxor %xmm1,%xmm7,%xmm7
vpclmulqdq $0x01,%xmm2,%xmm0,%xmm1
vpxor 112+8(%rsp),%xmm8,%xmm8
vaesenc %xmm15,%xmm11,%xmm11
movbeq 48(%r14),%r12
vpclmulqdq $0x11,%xmm2,%xmm0,%xmm2
vaesenc %xmm15,%xmm12,%xmm12
movq %r13,64+8(%rsp)
vaesenc %xmm15,%xmm13,%xmm13
movq %r12,72+8(%rsp)
vpxor %xmm3,%xmm4,%xmm4
vmovdqu 112-32(%r9),%xmm3
vaesenc %xmm15,%xmm14,%xmm14

vmovups 96-128(%rcx),%xmm15
vpxor %xmm5,%xmm6,%xmm6
vpclmulqdq $0x10,%xmm3,%xmm8,%xmm5
vaesenc %xmm15,%xmm9,%xmm9
vpxor %xmm1,%xmm6,%xmm6
vpclmulqdq $0x01,%xmm3,%xmm8,%xmm1
vaesenc %xmm15,%xmm10,%xmm10
movbeq 40(%r14),%r13
vpxor %xmm2,%xmm7,%xmm7
vpclmulqdq $0x00,%xmm3,%xmm8,%xmm2
vaesenc %xmm15,%xmm11,%xmm11
movbeq 32(%r14),%r12
vpclmulqdq $0x11,%xmm3,%xmm8,%xmm8
vaesenc %xmm15,%xmm12,%xmm12
movq %r13,80+8(%rsp)
vaesenc %xmm15,%xmm13,%xmm13
movq %r12,88+8(%rsp)
vpxor %xmm5,%xmm6,%xmm6
vaesenc %xmm15,%xmm14,%xmm14
vpxor %xmm1,%xmm6,%xmm6

vmovups 112-128(%rcx),%xmm15
vpslldq $8,%xmm6,%xmm5
vpxor %xmm2,%xmm4,%xmm4
vmovdqu 16(%r11),%xmm3

vaesenc %xmm15,%xmm9,%xmm9
vpxor %xmm8,%xmm7,%xmm7
vaesenc %xmm15,%xmm10,%xmm10
vpxor %xmm5,%xmm4,%xmm4
movbeq 24(%r14),%r13
vaesenc %xmm15,%xmm11,%xmm11
movbeq 16(%r14),%r12
vpalignr $8,%xmm4,%xmm4,%xmm0
vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4
movq %r13,96+8(%rsp)
vaesenc %xmm15,%xmm12,%xmm12
movq %r12,104+8(%rsp)
vaesenc %xmm15,%xmm13,%xmm13
vmovups 128-128(%rcx),%xmm1
vaesenc %xmm15,%xmm14,%xmm14

vaesenc %xmm1,%xmm9,%xmm9
vmovups 144-128(%rcx),%xmm15
vaesenc %xmm1,%xmm10,%xmm10
vpsrldq $8,%xmm6,%xmm6
vaesenc %xmm1,%xmm11,%xmm11
vpxor %xmm6,%xmm7,%xmm7
vaesenc %xmm1,%xmm12,%xmm12
vpxor %xmm0,%xmm4,%xmm4
movbeq 8(%r14),%r13
vaesenc %xmm1,%xmm13,%xmm13
movbeq 0(%r14),%r12
vaesenc %xmm1,%xmm14,%xmm14
vmovups 160-128(%rcx),%xmm1
cmpl $12,%ebp // ICP uses 10,12,14 not 9,11,13 for rounds.
jb .Lenc_tail

vaesenc %xmm15,%xmm9,%xmm9
vaesenc %xmm15,%xmm10,%xmm10
vaesenc %xmm15,%xmm11,%xmm11
vaesenc %xmm15,%xmm12,%xmm12
vaesenc %xmm15,%xmm13,%xmm13
vaesenc %xmm15,%xmm14,%xmm14

vaesenc %xmm1,%xmm9,%xmm9
vaesenc %xmm1,%xmm10,%xmm10
vaesenc %xmm1,%xmm11,%xmm11
vaesenc %xmm1,%xmm12,%xmm12
vaesenc %xmm1,%xmm13,%xmm13
vmovups 176-128(%rcx),%xmm15
vaesenc %xmm1,%xmm14,%xmm14
vmovups 192-128(%rcx),%xmm1
cmpl $14,%ebp // ICP does not zero key schedule.
jb .Lenc_tail

vaesenc %xmm15,%xmm9,%xmm9
vaesenc %xmm15,%xmm10,%xmm10
vaesenc %xmm15,%xmm11,%xmm11
vaesenc %xmm15,%xmm12,%xmm12
vaesenc %xmm15,%xmm13,%xmm13
vaesenc %xmm15,%xmm14,%xmm14

vaesenc %xmm1,%xmm9,%xmm9
vaesenc %xmm1,%xmm10,%xmm10
vaesenc %xmm1,%xmm11,%xmm11
vaesenc %xmm1,%xmm12,%xmm12
vaesenc %xmm1,%xmm13,%xmm13
vmovups 208-128(%rcx),%xmm15
vaesenc %xmm1,%xmm14,%xmm14
vmovups 224-128(%rcx),%xmm1
jmp .Lenc_tail

.balign 32
.Lhandle_ctr32:
vmovdqu (%r11),%xmm0
vpshufb %xmm0,%xmm1,%xmm6
vmovdqu 48(%r11),%xmm5
vpaddd 64(%r11),%xmm6,%xmm10
vpaddd %xmm5,%xmm6,%xmm11
vmovdqu 0-32(%r9),%xmm3
vpaddd %xmm5,%xmm10,%xmm12
vpshufb %xmm0,%xmm10,%xmm10
vpaddd %xmm5,%xmm11,%xmm13
vpshufb %xmm0,%xmm11,%xmm11
vpxor %xmm15,%xmm10,%xmm10
vpaddd %xmm5,%xmm12,%xmm14
vpshufb %xmm0,%xmm12,%xmm12
vpxor %xmm15,%xmm11,%xmm11
vpaddd %xmm5,%xmm13,%xmm1
vpshufb %xmm0,%xmm13,%xmm13
vpshufb %xmm0,%xmm14,%xmm14
vpshufb %xmm0,%xmm1,%xmm1
jmp .Lresume_ctr32

.balign 32
.Lenc_tail:
vaesenc %xmm15,%xmm9,%xmm9
vmovdqu %xmm7,16+8(%rsp)
vpalignr $8,%xmm4,%xmm4,%xmm8
vaesenc %xmm15,%xmm10,%xmm10
vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4
vpxor 0(%rdi),%xmm1,%xmm2
vaesenc %xmm15,%xmm11,%xmm11
vpxor 16(%rdi),%xmm1,%xmm0
vaesenc %xmm15,%xmm12,%xmm12
vpxor 32(%rdi),%xmm1,%xmm5
vaesenc %xmm15,%xmm13,%xmm13
vpxor 48(%rdi),%xmm1,%xmm6
vaesenc %xmm15,%xmm14,%xmm14
vpxor 64(%rdi),%xmm1,%xmm7
vpxor 80(%rdi),%xmm1,%xmm3
vmovdqu (%r8),%xmm1

vaesenclast %xmm2,%xmm9,%xmm9
vmovdqu 32(%r11),%xmm2
vaesenclast %xmm0,%xmm10,%xmm10
vpaddb %xmm2,%xmm1,%xmm0
movq %r13,112+8(%rsp)
leaq 96(%rdi),%rdi
vaesenclast %xmm5,%xmm11,%xmm11
vpaddb %xmm2,%xmm0,%xmm5
movq %r12,120+8(%rsp)
leaq 96(%rsi),%rsi
vmovdqu 0-128(%rcx),%xmm15
vaesenclast %xmm6,%xmm12,%xmm12
vpaddb %xmm2,%xmm5,%xmm6
vaesenclast %xmm7,%xmm13,%xmm13
vpaddb %xmm2,%xmm6,%xmm7
vaesenclast %xmm3,%xmm14,%xmm14
vpaddb %xmm2,%xmm7,%xmm3

addq $0x60,%r10
subq $0x6,%rdx
jc .L6x_done

vmovups %xmm9,-96(%rsi)
vpxor %xmm15,%xmm1,%xmm9
vmovups %xmm10,-80(%rsi)
vmovdqa %xmm0,%xmm10
vmovups %xmm11,-64(%rsi)
vmovdqa %xmm5,%xmm11
vmovups %xmm12,-48(%rsi)
vmovdqa %xmm6,%xmm12
vmovups %xmm13,-32(%rsi)
vmovdqa %xmm7,%xmm13
vmovups %xmm14,-16(%rsi)
vmovdqa %xmm3,%xmm14
vmovdqu 32+8(%rsp),%xmm7
jmp .Loop6x

.L6x_done:
vpxor 16+8(%rsp),%xmm8,%xmm8
vpxor %xmm4,%xmm8,%xmm8

RET
.cfi_endproc
SET_SIZE(_aesni_ctr32_ghash_6x)
#endif /* ifdef HAVE_MOVBE */

.balign 32
FUNCTION(_aesni_ctr32_ghash_no_movbe_6x)
.cfi_startproc
ENDBR
vmovdqu 32(%r11),%xmm2
subq $6,%rdx
vpxor %xmm4,%xmm4,%xmm4
vmovdqu 0-128(%rcx),%xmm15
vpaddb %xmm2,%xmm1,%xmm10
vpaddb %xmm2,%xmm10,%xmm11
vpaddb %xmm2,%xmm11,%xmm12
vpaddb %xmm2,%xmm12,%xmm13
vpaddb %xmm2,%xmm13,%xmm14
vpxor %xmm15,%xmm1,%xmm9
vmovdqu %xmm4,16+8(%rsp)
jmp .Loop6x_nmb

.balign 32
.Loop6x_nmb:
addl $100663296,%ebx
jc .Lhandle_ctr32_nmb
vmovdqu 0-32(%r9),%xmm3
vpaddb %xmm2,%xmm14,%xmm1
vpxor %xmm15,%xmm10,%xmm10
vpxor %xmm15,%xmm11,%xmm11

.Lresume_ctr32_nmb:
vmovdqu %xmm1,(%r8)
vpclmulqdq $0x10,%xmm3,%xmm7,%xmm5
vpxor %xmm15,%xmm12,%xmm12
vmovups 16-128(%rcx),%xmm2
vpclmulqdq $0x01,%xmm3,%xmm7,%xmm6
xorq %r12,%r12
cmpq %r14,%r15

vaesenc %xmm2,%xmm9,%xmm9
vmovdqu 48+8(%rsp),%xmm0
vpxor %xmm15,%xmm13,%xmm13
vpclmulqdq $0x00,%xmm3,%xmm7,%xmm1
vaesenc %xmm2,%xmm10,%xmm10
vpxor %xmm15,%xmm14,%xmm14
setnc %r12b
vpclmulqdq $0x11,%xmm3,%xmm7,%xmm7
vaesenc %xmm2,%xmm11,%xmm11
vmovdqu 16-32(%r9),%xmm3
negq %r12
vaesenc %xmm2,%xmm12,%xmm12
vpxor %xmm5,%xmm6,%xmm6
vpclmulqdq $0x00,%xmm3,%xmm0,%xmm5
vpxor %xmm4,%xmm8,%xmm8
vaesenc %xmm2,%xmm13,%xmm13
vpxor %xmm5,%xmm1,%xmm4
andq $0x60,%r12
vmovups 32-128(%rcx),%xmm15
vpclmulqdq $0x10,%xmm3,%xmm0,%xmm1
vaesenc %xmm2,%xmm14,%xmm14

vpclmulqdq $0x01,%xmm3,%xmm0,%xmm2
leaq (%r14,%r12,1),%r14
vaesenc %xmm15,%xmm9,%xmm9
vpxor 16+8(%rsp),%xmm8,%xmm8
vpclmulqdq $0x11,%xmm3,%xmm0,%xmm3
vmovdqu 64+8(%rsp),%xmm0
vaesenc %xmm15,%xmm10,%xmm10
movq 88(%r14),%r13
bswapq %r13
vaesenc %xmm15,%xmm11,%xmm11
movq 80(%r14),%r12
bswapq %r12
vaesenc %xmm15,%xmm12,%xmm12
movq %r13,32+8(%rsp)
vaesenc %xmm15,%xmm13,%xmm13
movq %r12,40+8(%rsp)
vmovdqu 48-32(%r9),%xmm5
vaesenc %xmm15,%xmm14,%xmm14

vmovups 48-128(%rcx),%xmm15
vpxor %xmm1,%xmm6,%xmm6
vpclmulqdq $0x00,%xmm5,%xmm0,%xmm1
vaesenc %xmm15,%xmm9,%xmm9
vpxor %xmm2,%xmm6,%xmm6
vpclmulqdq $0x10,%xmm5,%xmm0,%xmm2
vaesenc %xmm15,%xmm10,%xmm10
vpxor %xmm3,%xmm7,%xmm7
vpclmulqdq $0x01,%xmm5,%xmm0,%xmm3
vaesenc %xmm15,%xmm11,%xmm11
vpclmulqdq $0x11,%xmm5,%xmm0,%xmm5
vmovdqu 80+8(%rsp),%xmm0
vaesenc %xmm15,%xmm12,%xmm12
vaesenc %xmm15,%xmm13,%xmm13
vpxor %xmm1,%xmm4,%xmm4
vmovdqu 64-32(%r9),%xmm1
vaesenc %xmm15,%xmm14,%xmm14

vmovups 64-128(%rcx),%xmm15
vpxor %xmm2,%xmm6,%xmm6
vpclmulqdq $0x00,%xmm1,%xmm0,%xmm2
vaesenc %xmm15,%xmm9,%xmm9
vpxor %xmm3,%xmm6,%xmm6
vpclmulqdq $0x10,%xmm1,%xmm0,%xmm3
vaesenc %xmm15,%xmm10,%xmm10
movq 72(%r14),%r13
bswapq %r13
vpxor %xmm5,%xmm7,%xmm7
vpclmulqdq $0x01,%xmm1,%xmm0,%xmm5
vaesenc %xmm15,%xmm11,%xmm11
movq 64(%r14),%r12
bswapq %r12
vpclmulqdq $0x11,%xmm1,%xmm0,%xmm1
vmovdqu 96+8(%rsp),%xmm0
vaesenc %xmm15,%xmm12,%xmm12
movq %r13,48+8(%rsp)
vaesenc %xmm15,%xmm13,%xmm13
movq %r12,56+8(%rsp)
vpxor %xmm2,%xmm4,%xmm4
vmovdqu 96-32(%r9),%xmm2
vaesenc %xmm15,%xmm14,%xmm14

vmovups 80-128(%rcx),%xmm15
vpxor %xmm3,%xmm6,%xmm6
vpclmulqdq $0x00,%xmm2,%xmm0,%xmm3
vaesenc %xmm15,%xmm9,%xmm9
vpxor %xmm5,%xmm6,%xmm6
vpclmulqdq $0x10,%xmm2,%xmm0,%xmm5
vaesenc %xmm15,%xmm10,%xmm10
movq 56(%r14),%r13
bswapq %r13
vpxor %xmm1,%xmm7,%xmm7
vpclmulqdq $0x01,%xmm2,%xmm0,%xmm1
vpxor 112+8(%rsp),%xmm8,%xmm8
vaesenc %xmm15,%xmm11,%xmm11
movq 48(%r14),%r12
bswapq %r12
vpclmulqdq $0x11,%xmm2,%xmm0,%xmm2
vaesenc %xmm15,%xmm12,%xmm12
movq %r13,64+8(%rsp)
vaesenc %xmm15,%xmm13,%xmm13
movq %r12,72+8(%rsp)
vpxor %xmm3,%xmm4,%xmm4
vmovdqu 112-32(%r9),%xmm3
vaesenc %xmm15,%xmm14,%xmm14

vmovups 96-128(%rcx),%xmm15
vpxor %xmm5,%xmm6,%xmm6
vpclmulqdq $0x10,%xmm3,%xmm8,%xmm5
vaesenc %xmm15,%xmm9,%xmm9
vpxor %xmm1,%xmm6,%xmm6
vpclmulqdq $0x01,%xmm3,%xmm8,%xmm1
vaesenc %xmm15,%xmm10,%xmm10
movq 40(%r14),%r13
bswapq %r13
vpxor %xmm2,%xmm7,%xmm7
vpclmulqdq $0x00,%xmm3,%xmm8,%xmm2
vaesenc %xmm15,%xmm11,%xmm11
movq 32(%r14),%r12
bswapq %r12
vpclmulqdq $0x11,%xmm3,%xmm8,%xmm8
vaesenc %xmm15,%xmm12,%xmm12
movq %r13,80+8(%rsp)
vaesenc %xmm15,%xmm13,%xmm13
movq %r12,88+8(%rsp)
vpxor %xmm5,%xmm6,%xmm6
vaesenc %xmm15,%xmm14,%xmm14
vpxor %xmm1,%xmm6,%xmm6

vmovups 112-128(%rcx),%xmm15
vpslldq $8,%xmm6,%xmm5
vpxor %xmm2,%xmm4,%xmm4
vmovdqu 16(%r11),%xmm3

vaesenc %xmm15,%xmm9,%xmm9
vpxor %xmm8,%xmm7,%xmm7
vaesenc %xmm15,%xmm10,%xmm10
vpxor %xmm5,%xmm4,%xmm4
movq 24(%r14),%r13
bswapq %r13
vaesenc %xmm15,%xmm11,%xmm11
movq 16(%r14),%r12
bswapq %r12
vpalignr $8,%xmm4,%xmm4,%xmm0
vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4
movq %r13,96+8(%rsp)
vaesenc %xmm15,%xmm12,%xmm12
movq %r12,104+8(%rsp)
vaesenc %xmm15,%xmm13,%xmm13
vmovups 128-128(%rcx),%xmm1
vaesenc %xmm15,%xmm14,%xmm14

vaesenc %xmm1,%xmm9,%xmm9
vmovups 144-128(%rcx),%xmm15
vaesenc %xmm1,%xmm10,%xmm10
vpsrldq $8,%xmm6,%xmm6
vaesenc %xmm1,%xmm11,%xmm11
vpxor %xmm6,%xmm7,%xmm7
vaesenc %xmm1,%xmm12,%xmm12
vpxor %xmm0,%xmm4,%xmm4
movq 8(%r14),%r13
bswapq %r13
vaesenc %xmm1,%xmm13,%xmm13
movq 0(%r14),%r12
bswapq %r12
vaesenc %xmm1,%xmm14,%xmm14
vmovups 160-128(%rcx),%xmm1
cmpl $12,%ebp // ICP uses 10,12,14 not 9,11,13 for rounds.
jb .Lenc_tail_nmb

vaesenc %xmm15,%xmm9,%xmm9
vaesenc %xmm15,%xmm10,%xmm10
vaesenc %xmm15,%xmm11,%xmm11
vaesenc %xmm15,%xmm12,%xmm12
vaesenc %xmm15,%xmm13,%xmm13
vaesenc %xmm15,%xmm14,%xmm14

vaesenc %xmm1,%xmm9,%xmm9
vaesenc %xmm1,%xmm10,%xmm10
vaesenc %xmm1,%xmm11,%xmm11
vaesenc %xmm1,%xmm12,%xmm12
vaesenc %xmm1,%xmm13,%xmm13
vmovups 176-128(%rcx),%xmm15
vaesenc %xmm1,%xmm14,%xmm14
vmovups 192-128(%rcx),%xmm1
cmpl $14,%ebp // ICP does not zero key schedule.
jb .Lenc_tail_nmb

vaesenc %xmm15,%xmm9,%xmm9
vaesenc %xmm15,%xmm10,%xmm10
vaesenc %xmm15,%xmm11,%xmm11
vaesenc %xmm15,%xmm12,%xmm12
vaesenc %xmm15,%xmm13,%xmm13
vaesenc %xmm15,%xmm14,%xmm14

vaesenc %xmm1,%xmm9,%xmm9
vaesenc %xmm1,%xmm10,%xmm10
vaesenc %xmm1,%xmm11,%xmm11
vaesenc %xmm1,%xmm12,%xmm12
vaesenc %xmm1,%xmm13,%xmm13
vmovups 208-128(%rcx),%xmm15
vaesenc %xmm1,%xmm14,%xmm14
vmovups 224-128(%rcx),%xmm1
jmp .Lenc_tail_nmb

.balign 32
.Lhandle_ctr32_nmb:
vmovdqu (%r11),%xmm0
vpshufb %xmm0,%xmm1,%xmm6
vmovdqu 48(%r11),%xmm5
vpaddd 64(%r11),%xmm6,%xmm10
vpaddd %xmm5,%xmm6,%xmm11
vmovdqu 0-32(%r9),%xmm3
vpaddd %xmm5,%xmm10,%xmm12
vpshufb %xmm0,%xmm10,%xmm10
vpaddd %xmm5,%xmm11,%xmm13
vpshufb %xmm0,%xmm11,%xmm11
vpxor %xmm15,%xmm10,%xmm10
vpaddd %xmm5,%xmm12,%xmm14
vpshufb %xmm0,%xmm12,%xmm12
vpxor %xmm15,%xmm11,%xmm11
vpaddd %xmm5,%xmm13,%xmm1
vpshufb %xmm0,%xmm13,%xmm13
vpshufb %xmm0,%xmm14,%xmm14
vpshufb %xmm0,%xmm1,%xmm1
jmp .Lresume_ctr32_nmb

.balign 32
.Lenc_tail_nmb:
vaesenc %xmm15,%xmm9,%xmm9
vmovdqu %xmm7,16+8(%rsp)
vpalignr $8,%xmm4,%xmm4,%xmm8
vaesenc %xmm15,%xmm10,%xmm10
vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4
vpxor 0(%rdi),%xmm1,%xmm2
vaesenc %xmm15,%xmm11,%xmm11
vpxor 16(%rdi),%xmm1,%xmm0
vaesenc %xmm15,%xmm12,%xmm12
vpxor 32(%rdi),%xmm1,%xmm5
vaesenc %xmm15,%xmm13,%xmm13
vpxor 48(%rdi),%xmm1,%xmm6
vaesenc %xmm15,%xmm14,%xmm14
vpxor 64(%rdi),%xmm1,%xmm7
vpxor 80(%rdi),%xmm1,%xmm3
vmovdqu (%r8),%xmm1

vaesenclast %xmm2,%xmm9,%xmm9
vmovdqu 32(%r11),%xmm2
vaesenclast %xmm0,%xmm10,%xmm10
vpaddb %xmm2,%xmm1,%xmm0
movq %r13,112+8(%rsp)
leaq 96(%rdi),%rdi
vaesenclast %xmm5,%xmm11,%xmm11
vpaddb %xmm2,%xmm0,%xmm5
movq %r12,120+8(%rsp)
leaq 96(%rsi),%rsi
vmovdqu 0-128(%rcx),%xmm15
vaesenclast %xmm6,%xmm12,%xmm12
vpaddb %xmm2,%xmm5,%xmm6
vaesenclast %xmm7,%xmm13,%xmm13
vpaddb %xmm2,%xmm6,%xmm7
vaesenclast %xmm3,%xmm14,%xmm14
vpaddb %xmm2,%xmm7,%xmm3

addq $0x60,%r10
subq $0x6,%rdx
jc .L6x_done_nmb

vmovups %xmm9,-96(%rsi)
vpxor %xmm15,%xmm1,%xmm9
vmovups %xmm10,-80(%rsi)
vmovdqa %xmm0,%xmm10
vmovups %xmm11,-64(%rsi)
vmovdqa %xmm5,%xmm11
vmovups %xmm12,-48(%rsi)
vmovdqa %xmm6,%xmm12
vmovups %xmm13,-32(%rsi)
vmovdqa %xmm7,%xmm13
vmovups %xmm14,-16(%rsi)
vmovdqa %xmm3,%xmm14
vmovdqu 32+8(%rsp),%xmm7
jmp .Loop6x_nmb

.L6x_done_nmb:
vpxor 16+8(%rsp),%xmm8,%xmm8
vpxor %xmm4,%xmm8,%xmm8

RET
.cfi_endproc
SET_SIZE(_aesni_ctr32_ghash_no_movbe_6x)

ENTRY_ALIGN(aesni_gcm_decrypt, 32)
.cfi_startproc
ENDBR
xorq %r10,%r10
cmpq $0x60,%rdx
jb .Lgcm_dec_abort

leaq (%rsp),%rax
.cfi_def_cfa_register %rax
pushq %rbx
.cfi_offset %rbx,-16
pushq %rbp
.cfi_offset %rbp,-24
pushq %r12
.cfi_offset %r12,-32
pushq %r13
.cfi_offset %r13,-40
pushq %r14
.cfi_offset %r14,-48
pushq %r15
.cfi_offset %r15,-56
pushq %r9
.cfi_offset %r9,-64
vzeroupper

vmovdqu (%r8),%xmm1
addq $-128,%rsp
movl 12(%r8),%ebx
leaq .Lbswap_mask(%rip),%r11
leaq -128(%rcx),%r14
movq $0xf80,%r15
vmovdqu (%r9),%xmm8
andq $-128,%rsp
vmovdqu (%r11),%xmm0
leaq 128(%rcx),%rcx
movq 32(%r9),%r9
leaq 32(%r9),%r9
movl 504-128(%rcx),%ebp // ICP has a larger offset for rounds.
vpshufb %xmm0,%xmm8,%xmm8

andq %r15,%r14
andq %rsp,%r15
subq %r14,%r15
jc .Ldec_no_key_aliasing
cmpq $768,%r15
jnc .Ldec_no_key_aliasing
subq %r15,%rsp
.Ldec_no_key_aliasing:

vmovdqu 80(%rdi),%xmm7
leaq (%rdi),%r14
vmovdqu 64(%rdi),%xmm4
leaq -192(%rdi,%rdx,1),%r15
vmovdqu 48(%rdi),%xmm5
shrq $4,%rdx
xorq %r10,%r10
vmovdqu 32(%rdi),%xmm6
vpshufb %xmm0,%xmm7,%xmm7
vmovdqu 16(%rdi),%xmm2
vpshufb %xmm0,%xmm4,%xmm4
vmovdqu (%rdi),%xmm3
vpshufb %xmm0,%xmm5,%xmm5
vmovdqu %xmm4,48(%rsp)
vpshufb %xmm0,%xmm6,%xmm6
vmovdqu %xmm5,64(%rsp)
vpshufb %xmm0,%xmm2,%xmm2
vmovdqu %xmm6,80(%rsp)
vpshufb %xmm0,%xmm3,%xmm3
vmovdqu %xmm2,96(%rsp)
vmovdqu %xmm3,112(%rsp)

#ifdef HAVE_MOVBE
#ifdef _KERNEL
testl $1,gcm_avx_can_use_movbe(%rip)
#else
testl $1,gcm_avx_can_use_movbe@GOTPCREL(%rip)
#endif
jz 1f
call _aesni_ctr32_ghash_6x
jmp 2f
1:
#endif
call _aesni_ctr32_ghash_no_movbe_6x
2:
vmovups %xmm9,-96(%rsi)
vmovups %xmm10,-80(%rsi)
vmovups %xmm11,-64(%rsi)
vmovups %xmm12,-48(%rsi)
vmovups %xmm13,-32(%rsi)
vmovups %xmm14,-16(%rsi)

vpshufb (%r11),%xmm8,%xmm8
movq -56(%rax),%r9
.cfi_restore %r9
vmovdqu %xmm8,(%r9)

vzeroupper
movq -48(%rax),%r15
.cfi_restore %r15
movq -40(%rax),%r14
.cfi_restore %r14
movq -32(%rax),%r13
.cfi_restore %r13
movq -24(%rax),%r12
.cfi_restore %r12
movq -16(%rax),%rbp
.cfi_restore %rbp
movq -8(%rax),%rbx
.cfi_restore %rbx
leaq (%rax),%rsp
.cfi_def_cfa_register %rsp
.Lgcm_dec_abort:
movq %r10,%rax
RET
.cfi_endproc
SET_SIZE(aesni_gcm_decrypt)

.balign 32
FUNCTION(_aesni_ctr32_6x)
.cfi_startproc
ENDBR
vmovdqu 0-128(%rcx),%xmm4
vmovdqu 32(%r11),%xmm2
leaq -2(%rbp),%r13 // ICP uses 10,12,14 not 9,11,13 for rounds.
vmovups 16-128(%rcx),%xmm15
leaq 32-128(%rcx),%r12
vpxor %xmm4,%xmm1,%xmm9
addl $100663296,%ebx
jc .Lhandle_ctr32_2
vpaddb %xmm2,%xmm1,%xmm10
vpaddb %xmm2,%xmm10,%xmm11
vpxor %xmm4,%xmm10,%xmm10
vpaddb %xmm2,%xmm11,%xmm12
vpxor %xmm4,%xmm11,%xmm11
vpaddb %xmm2,%xmm12,%xmm13
vpxor %xmm4,%xmm12,%xmm12
vpaddb %xmm2,%xmm13,%xmm14
vpxor %xmm4,%xmm13,%xmm13
vpaddb %xmm2,%xmm14,%xmm1
vpxor %xmm4,%xmm14,%xmm14
jmp .Loop_ctr32

.balign 16
.Loop_ctr32:
vaesenc %xmm15,%xmm9,%xmm9
vaesenc %xmm15,%xmm10,%xmm10
vaesenc %xmm15,%xmm11,%xmm11
vaesenc %xmm15,%xmm12,%xmm12
vaesenc %xmm15,%xmm13,%xmm13
vaesenc %xmm15,%xmm14,%xmm14
vmovups (%r12),%xmm15
leaq 16(%r12),%r12
decl %r13d
jnz .Loop_ctr32

vmovdqu (%r12),%xmm3
vaesenc %xmm15,%xmm9,%xmm9
vpxor 0(%rdi),%xmm3,%xmm4
vaesenc %xmm15,%xmm10,%xmm10
vpxor 16(%rdi),%xmm3,%xmm5
vaesenc %xmm15,%xmm11,%xmm11
vpxor 32(%rdi),%xmm3,%xmm6
vaesenc %xmm15,%xmm12,%xmm12
vpxor 48(%rdi),%xmm3,%xmm8
vaesenc %xmm15,%xmm13,%xmm13
vpxor 64(%rdi),%xmm3,%xmm2
vaesenc %xmm15,%xmm14,%xmm14
vpxor 80(%rdi),%xmm3,%xmm3
leaq 96(%rdi),%rdi

vaesenclast %xmm4,%xmm9,%xmm9
vaesenclast %xmm5,%xmm10,%xmm10
vaesenclast %xmm6,%xmm11,%xmm11
vaesenclast %xmm8,%xmm12,%xmm12
vaesenclast %xmm2,%xmm13,%xmm13
vaesenclast %xmm3,%xmm14,%xmm14
vmovups %xmm9,0(%rsi)
vmovups %xmm10,16(%rsi)
vmovups %xmm11,32(%rsi)
vmovups %xmm12,48(%rsi)
vmovups %xmm13,64(%rsi)
vmovups %xmm14,80(%rsi)
leaq 96(%rsi),%rsi

RET
.balign 32
.Lhandle_ctr32_2:
vpshufb %xmm0,%xmm1,%xmm6
vmovdqu 48(%r11),%xmm5
vpaddd 64(%r11),%xmm6,%xmm10
vpaddd %xmm5,%xmm6,%xmm11
vpaddd %xmm5,%xmm10,%xmm12
vpshufb %xmm0,%xmm10,%xmm10
vpaddd %xmm5,%xmm11,%xmm13
vpshufb %xmm0,%xmm11,%xmm11
vpxor %xmm4,%xmm10,%xmm10
vpaddd %xmm5,%xmm12,%xmm14
vpshufb %xmm0,%xmm12,%xmm12
vpxor %xmm4,%xmm11,%xmm11
vpaddd %xmm5,%xmm13,%xmm1
vpshufb %xmm0,%xmm13,%xmm13
vpxor %xmm4,%xmm12,%xmm12
vpshufb %xmm0,%xmm14,%xmm14
vpxor %xmm4,%xmm13,%xmm13
vpshufb %xmm0,%xmm1,%xmm1
vpxor %xmm4,%xmm14,%xmm14
jmp .Loop_ctr32
.cfi_endproc
SET_SIZE(_aesni_ctr32_6x)

ENTRY_ALIGN(aesni_gcm_encrypt, 32)
.cfi_startproc
ENDBR
xorq %r10,%r10
cmpq $288,%rdx
jb .Lgcm_enc_abort

leaq (%rsp),%rax
.cfi_def_cfa_register %rax
pushq %rbx
.cfi_offset %rbx,-16
pushq %rbp
.cfi_offset %rbp,-24
pushq %r12
.cfi_offset %r12,-32
pushq %r13
.cfi_offset %r13,-40
pushq %r14
.cfi_offset %r14,-48
pushq %r15
.cfi_offset %r15,-56
pushq %r9
.cfi_offset %r9,-64
vzeroupper

vmovdqu (%r8),%xmm1
addq $-128,%rsp
movl 12(%r8),%ebx
leaq .Lbswap_mask(%rip),%r11
leaq -128(%rcx),%r14
movq $0xf80,%r15
leaq 128(%rcx),%rcx
vmovdqu (%r11),%xmm0
andq $-128,%rsp
movl 504-128(%rcx),%ebp // ICP has a larger offset for rounds.

andq %r15,%r14
andq %rsp,%r15
subq %r14,%r15
jc .Lenc_no_key_aliasing
cmpq $768,%r15
jnc .Lenc_no_key_aliasing
subq %r15,%rsp
.Lenc_no_key_aliasing:

leaq (%rsi),%r14
leaq -192(%rsi,%rdx,1),%r15
shrq $4,%rdx

call _aesni_ctr32_6x
vpshufb %xmm0,%xmm9,%xmm8
vpshufb %xmm0,%xmm10,%xmm2
vmovdqu %xmm8,112(%rsp)
vpshufb %xmm0,%xmm11,%xmm4
vmovdqu %xmm2,96(%rsp)
vpshufb %xmm0,%xmm12,%xmm5
vmovdqu %xmm4,80(%rsp)
vpshufb %xmm0,%xmm13,%xmm6
vmovdqu %xmm5,64(%rsp)
vpshufb %xmm0,%xmm14,%xmm7
vmovdqu %xmm6,48(%rsp)

call _aesni_ctr32_6x

vmovdqu (%r9),%xmm8
movq 32(%r9),%r9
leaq 32(%r9),%r9
subq $12,%rdx
movq $192,%r10
vpshufb %xmm0,%xmm8,%xmm8

#ifdef HAVE_MOVBE
#ifdef _KERNEL
testl $1,gcm_avx_can_use_movbe(%rip)
#else
testl $1,gcm_avx_can_use_movbe@GOTPCREL(%rip)
#endif
jz 1f
call _aesni_ctr32_ghash_6x
jmp 2f
1:
#endif
call _aesni_ctr32_ghash_no_movbe_6x
2:
vmovdqu 32(%rsp),%xmm7
vmovdqu (%r11),%xmm0
vmovdqu 0-32(%r9),%xmm3
vpunpckhqdq %xmm7,%xmm7,%xmm1
vmovdqu 32-32(%r9),%xmm15
vmovups %xmm9,-96(%rsi)
vpshufb %xmm0,%xmm9,%xmm9
vpxor %xmm7,%xmm1,%xmm1
vmovups %xmm10,-80(%rsi)
vpshufb %xmm0,%xmm10,%xmm10
vmovups %xmm11,-64(%rsi)
vpshufb %xmm0,%xmm11,%xmm11
vmovups %xmm12,-48(%rsi)
vpshufb %xmm0,%xmm12,%xmm12
vmovups %xmm13,-32(%rsi)
vpshufb %xmm0,%xmm13,%xmm13
vmovups %xmm14,-16(%rsi)
vpshufb %xmm0,%xmm14,%xmm14
vmovdqu %xmm9,16(%rsp)
vmovdqu 48(%rsp),%xmm6
vmovdqu 16-32(%r9),%xmm0
vpunpckhqdq %xmm6,%xmm6,%xmm2
vpclmulqdq $0x00,%xmm3,%xmm7,%xmm5
vpxor %xmm6,%xmm2,%xmm2
vpclmulqdq $0x11,%xmm3,%xmm7,%xmm7
vpclmulqdq $0x00,%xmm15,%xmm1,%xmm1

vmovdqu 64(%rsp),%xmm9
vpclmulqdq $0x00,%xmm0,%xmm6,%xmm4
vmovdqu 48-32(%r9),%xmm3
vpxor %xmm5,%xmm4,%xmm4
vpunpckhqdq %xmm9,%xmm9,%xmm5
vpclmulqdq $0x11,%xmm0,%xmm6,%xmm6
vpxor %xmm9,%xmm5,%xmm5
vpxor %xmm7,%xmm6,%xmm6
vpclmulqdq $0x10,%xmm15,%xmm2,%xmm2
vmovdqu 80-32(%r9),%xmm15
vpxor %xmm1,%xmm2,%xmm2

vmovdqu 80(%rsp),%xmm1
vpclmulqdq $0x00,%xmm3,%xmm9,%xmm7
vmovdqu 64-32(%r9),%xmm0
vpxor %xmm4,%xmm7,%xmm7
vpunpckhqdq %xmm1,%xmm1,%xmm4
vpclmulqdq $0x11,%xmm3,%xmm9,%xmm9
vpxor %xmm1,%xmm4,%xmm4
vpxor %xmm6,%xmm9,%xmm9
vpclmulqdq $0x00,%xmm15,%xmm5,%xmm5
vpxor %xmm2,%xmm5,%xmm5

vmovdqu 96(%rsp),%xmm2
vpclmulqdq $0x00,%xmm0,%xmm1,%xmm6
vmovdqu 96-32(%r9),%xmm3
vpxor %xmm7,%xmm6,%xmm6
vpunpckhqdq %xmm2,%xmm2,%xmm7
vpclmulqdq $0x11,%xmm0,%xmm1,%xmm1
vpxor %xmm2,%xmm7,%xmm7
vpxor %xmm9,%xmm1,%xmm1
vpclmulqdq $0x10,%xmm15,%xmm4,%xmm4
vmovdqu 128-32(%r9),%xmm15
vpxor %xmm5,%xmm4,%xmm4

vpxor 112(%rsp),%xmm8,%xmm8
vpclmulqdq $0x00,%xmm3,%xmm2,%xmm5
vmovdqu 112-32(%r9),%xmm0
vpunpckhqdq %xmm8,%xmm8,%xmm9
vpxor %xmm6,%xmm5,%xmm5
vpclmulqdq $0x11,%xmm3,%xmm2,%xmm2
vpxor %xmm8,%xmm9,%xmm9
vpxor %xmm1,%xmm2,%xmm2
vpclmulqdq $0x00,%xmm15,%xmm7,%xmm7
vpxor %xmm4,%xmm7,%xmm4

vpclmulqdq $0x00,%xmm0,%xmm8,%xmm6
vmovdqu 0-32(%r9),%xmm3
vpunpckhqdq %xmm14,%xmm14,%xmm1
vpclmulqdq $0x11,%xmm0,%xmm8,%xmm8
vpxor %xmm14,%xmm1,%xmm1
vpxor %xmm5,%xmm6,%xmm5
vpclmulqdq $0x10,%xmm15,%xmm9,%xmm9
vmovdqu 32-32(%r9),%xmm15
vpxor %xmm2,%xmm8,%xmm7
vpxor %xmm4,%xmm9,%xmm6

vmovdqu 16-32(%r9),%xmm0
vpxor %xmm5,%xmm7,%xmm9
vpclmulqdq $0x00,%xmm3,%xmm14,%xmm4
vpxor %xmm9,%xmm6,%xmm6
vpunpckhqdq %xmm13,%xmm13,%xmm2
vpclmulqdq $0x11,%xmm3,%xmm14,%xmm14
vpxor %xmm13,%xmm2,%xmm2
vpslldq $8,%xmm6,%xmm9
vpclmulqdq $0x00,%xmm15,%xmm1,%xmm1
vpxor %xmm9,%xmm5,%xmm8
vpsrldq $8,%xmm6,%xmm6
vpxor %xmm6,%xmm7,%xmm7

vpclmulqdq $0x00,%xmm0,%xmm13,%xmm5
vmovdqu 48-32(%r9),%xmm3
vpxor %xmm4,%xmm5,%xmm5
vpunpckhqdq %xmm12,%xmm12,%xmm9
vpclmulqdq $0x11,%xmm0,%xmm13,%xmm13
vpxor %xmm12,%xmm9,%xmm9
vpxor %xmm14,%xmm13,%xmm13
vpalignr $8,%xmm8,%xmm8,%xmm14
vpclmulqdq $0x10,%xmm15,%xmm2,%xmm2
vmovdqu 80-32(%r9),%xmm15
vpxor %xmm1,%xmm2,%xmm2

vpclmulqdq $0x00,%xmm3,%xmm12,%xmm4
vmovdqu 64-32(%r9),%xmm0
vpxor %xmm5,%xmm4,%xmm4
vpunpckhqdq %xmm11,%xmm11,%xmm1
vpclmulqdq $0x11,%xmm3,%xmm12,%xmm12
vpxor %xmm11,%xmm1,%xmm1
vpxor %xmm13,%xmm12,%xmm12
vxorps 16(%rsp),%xmm7,%xmm7
vpclmulqdq $0x00,%xmm15,%xmm9,%xmm9
vpxor %xmm2,%xmm9,%xmm9

vpclmulqdq $0x10,16(%r11),%xmm8,%xmm8
vxorps %xmm14,%xmm8,%xmm8

vpclmulqdq $0x00,%xmm0,%xmm11,%xmm5
vmovdqu 96-32(%r9),%xmm3
vpxor %xmm4,%xmm5,%xmm5
vpunpckhqdq %xmm10,%xmm10,%xmm2
vpclmulqdq $0x11,%xmm0,%xmm11,%xmm11
vpxor %xmm10,%xmm2,%xmm2
vpalignr $8,%xmm8,%xmm8,%xmm14
vpxor %xmm12,%xmm11,%xmm11
vpclmulqdq $0x10,%xmm15,%xmm1,%xmm1
vmovdqu 128-32(%r9),%xmm15
vpxor %xmm9,%xmm1,%xmm1

vxorps %xmm7,%xmm14,%xmm14
vpclmulqdq $0x10,16(%r11),%xmm8,%xmm8
vxorps %xmm14,%xmm8,%xmm8

vpclmulqdq $0x00,%xmm3,%xmm10,%xmm4
vmovdqu 112-32(%r9),%xmm0
vpxor %xmm5,%xmm4,%xmm4
vpunpckhqdq %xmm8,%xmm8,%xmm9
vpclmulqdq $0x11,%xmm3,%xmm10,%xmm10
vpxor %xmm8,%xmm9,%xmm9
vpxor %xmm11,%xmm10,%xmm10
vpclmulqdq $0x00,%xmm15,%xmm2,%xmm2
vpxor %xmm1,%xmm2,%xmm2

vpclmulqdq $0x00,%xmm0,%xmm8,%xmm5
vpclmulqdq $0x11,%xmm0,%xmm8,%xmm7
vpxor %xmm4,%xmm5,%xmm5
vpclmulqdq $0x10,%xmm15,%xmm9,%xmm6
vpxor %xmm10,%xmm7,%xmm7
vpxor %xmm2,%xmm6,%xmm6

vpxor %xmm5,%xmm7,%xmm4
vpxor %xmm4,%xmm6,%xmm6
vpslldq $8,%xmm6,%xmm1
vmovdqu 16(%r11),%xmm3
vpsrldq $8,%xmm6,%xmm6
vpxor %xmm1,%xmm5,%xmm8
vpxor %xmm6,%xmm7,%xmm7

vpalignr $8,%xmm8,%xmm8,%xmm2
vpclmulqdq $0x10,%xmm3,%xmm8,%xmm8
vpxor %xmm2,%xmm8,%xmm8

vpalignr $8,%xmm8,%xmm8,%xmm2
vpclmulqdq $0x10,%xmm3,%xmm8,%xmm8
vpxor %xmm7,%xmm2,%xmm2
vpxor %xmm2,%xmm8,%xmm8
vpshufb (%r11),%xmm8,%xmm8
movq -56(%rax),%r9
.cfi_restore %r9
vmovdqu %xmm8,(%r9)

vzeroupper
movq -48(%rax),%r15
.cfi_restore %r15
movq -40(%rax),%r14
.cfi_restore %r14
movq -32(%rax),%r13
.cfi_restore %r13
movq -24(%rax),%r12
.cfi_restore %r12
movq -16(%rax),%rbp
.cfi_restore %rbp
movq -8(%rax),%rbx
.cfi_restore %rbx
leaq (%rax),%rsp
.cfi_def_cfa_register %rsp
.Lgcm_enc_abort:
movq %r10,%rax
RET
.cfi_endproc
SET_SIZE(aesni_gcm_encrypt)

#endif /* !_WIN32 || _KERNEL */

/* Some utility routines */

/*
 * clear all fpu registers
 * void clear_fpu_regs_avx(void);
 */
ENTRY_ALIGN(clear_fpu_regs_avx, 32)
vzeroall
RET
SET_SIZE(clear_fpu_regs_avx)
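
/*
 * Illustrative C sketch of a caller (not part of this file's interface;
 * kfpu_begin()/kfpu_end() are assumed FPU save/restore helpers): callers
 * scrub SIMD state after handling key material, e.g.:
 *
 *	kfpu_begin();
 *	... AES/GCM work using the routines above ...
 *	clear_fpu_regs_avx();	// wipe xmm/ymm register contents
 *	kfpu_end();
 */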

/*
 * void gcm_xor_avx(const uint8_t *src, uint8_t *dst);
 *
 * XORs one pair of unaligned 128-bit blocks from `src' and `dst' and
 * stores the result at `dst'. The XOR is performed using FPU registers,
 * so make sure FPU state is saved when running this in the kernel.
 */
ENTRY_ALIGN(gcm_xor_avx, 32)
movdqu (%rdi), %xmm0
movdqu (%rsi), %xmm1
pxor %xmm1, %xmm0
movdqu %xmm0, (%rsi)
RET
SET_SIZE(gcm_xor_avx)
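
/*
 * Illustrative C sketch of a call site (hypothetical variable names and
 * assumed kfpu_begin()/kfpu_end() helpers, not part of this file): XOR a
 * 16-byte keystream block into a data block in place.
 *
 *	uint8_t keystream[16], block[16];
 *	kfpu_begin();
 *	gcm_xor_avx(keystream, block);	// block[i] ^= keystream[i], i = 0..15
 *	kfpu_end();
 */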

/*
 * Toggle a boolean_t value atomically and return the new value.
 * boolean_t atomic_toggle_boolean_nv(volatile boolean_t *);
 */
ENTRY_ALIGN(atomic_toggle_boolean_nv, 32)
xorl %eax, %eax
lock
xorl $1, (%rdi)
jz 1f
movl $1, %eax
1:
RET
SET_SIZE(atomic_toggle_boolean_nv)
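
/*
 * Roughly equivalent C for reference (illustrative only; the routine above
 * performs the flip with a single "lock xor" and assumes the value only
 * ever holds 0 or 1):
 *
 *	boolean_t
 *	atomic_toggle_boolean_nv(volatile boolean_t *b)
 *	{
 *		// __atomic_xor_fetch returns the post-XOR (new) value
 *		return (__atomic_xor_fetch((volatile uint32_t *)b, 1,
 *		    __ATOMIC_SEQ_CST) != 0 ? B_TRUE : B_FALSE);
 *	}
 */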

SECTION_STATIC

.balign 64
.Lbswap_mask:
.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
.Lpoly:
.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
.Lone_msb:
.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
.Ltwo_lsb:
.byte 2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
.Lone_lsb:
.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
.byte 65,69,83,45,78,73,32,71,67,77,32,109,111,100,117,108,101,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.balign 64

/* Mark the stack non-executable. */
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif

#endif /* defined(__x86_64__) && defined(HAVE_AVX) && defined(HAVE_AES) ... */