/*-
 * Copyright (c) 2016 The FreeBSD Foundation
 * Copyright (c) 2020 Ampere Computing
 * All rights reserved.
 *
 * This software was developed by Andrew Turner under
 * sponsorship from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * This file is derived from aesni_wrap.c:
 * Copyright (C) 2008 Damien Miller <djm@mindrot.org>
 * Copyright (c) 2010 Konstantin Belousov <kib@FreeBSD.org>
 * Copyright (c) 2010-2011 Pawel Jakub Dawidek <pawel@dawidek.net>
 * Copyright 2012-2013 John-Mark Gurney <jmg@FreeBSD.org>
 * Copyright (c) 2014 The FreeBSD Foundation
 */

/*
 * This code is built with floating-point enabled.  Callers must have
 * entered a floating-point context before calling any of these functions.
 */

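/*
 * A minimal caller sketch, for illustration only; the exact context
 * handling is up to the consumer.  On FreeBSD/arm64 the armv8crypto(4)
 * driver brackets calls into this file with the kernel FPU helpers,
 * roughly:
 *
 *	fpu_kern_enter(curthread, NULL, FPU_KERN_NORMAL | FPU_KERN_NOCTX);
 *	armv8_aes_encrypt_cbc(key, len, fromc, toc, iv);
 *	fpu_kern_leave(curthread, NULL);
 */
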
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/queue.h>

#include <opencrypto/cryptodev.h>
#include <opencrypto/gmac.h>
#include <crypto/rijndael/rijndael.h>
#include <crypto/armv8/armv8_crypto.h>

#include <arm_neon.h>

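/*
 * AES round helpers.  AESE/AESD combine AddRoundKey with SubBytes/ShiftRows
 * (or their inverses), so each loop iteration consumes two round keys and
 * applies (Inv)MixColumns via AESMC/AESIMC.  Calling convention: "rounds"
 * is the number of AES rounds minus one, and keysched must hold
 * aes_rounds + 1 round keys, the last of which is only XORed in.
 */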
static uint8x16_t
armv8_aes_enc(int rounds, const uint8x16_t *keysched, const uint8x16_t from)
{
	uint8x16_t tmp;
	int i;

	tmp = from;
	for (i = 0; i < rounds - 1; i += 2) {
		tmp = vaeseq_u8(tmp, keysched[i]);
		tmp = vaesmcq_u8(tmp);
		tmp = vaeseq_u8(tmp, keysched[i + 1]);
		tmp = vaesmcq_u8(tmp);
	}

	tmp = vaeseq_u8(tmp, keysched[rounds - 1]);
	tmp = vaesmcq_u8(tmp);
	tmp = vaeseq_u8(tmp, keysched[rounds]);
	tmp = veorq_u8(tmp, keysched[rounds + 1]);

	return (tmp);
}

static uint8x16_t
armv8_aes_dec(int rounds, const uint8x16_t *keysched, const uint8x16_t from)
{
	uint8x16_t tmp;
	int i;

	tmp = from;
	for (i = 0; i < rounds - 1; i += 2) {
		tmp = vaesdq_u8(tmp, keysched[i]);
		tmp = vaesimcq_u8(tmp);
		tmp = vaesdq_u8(tmp, keysched[i + 1]);
		tmp = vaesimcq_u8(tmp);
	}

	tmp = vaesdq_u8(tmp, keysched[rounds - 1]);
	tmp = vaesimcq_u8(tmp);
	tmp = vaesdq_u8(tmp, keysched[rounds]);
	tmp = veorq_u8(tmp, keysched[rounds + 1]);

	return (tmp);
}

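/*
 * CBC encryption.  The input and output crypto buffer cursors are walked a
 * segment at a time; a block that straddles a segment boundary is bounced
 * through the on-stack "block" buffer with copydata/copyback, while whole
 * blocks inside a segment are processed through the segment pointers and
 * the cursors advanced afterwards.
 */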
void
armv8_aes_encrypt_cbc(const AES_key_t *key, size_t len,
    struct crypto_buffer_cursor *fromc, struct crypto_buffer_cursor *toc,
    const uint8_t iv[static AES_BLOCK_LEN])
{
	uint8x16_t tot, ivreg, tmp;
	uint8_t block[AES_BLOCK_LEN], *from, *to;
	size_t fromseglen, oseglen, seglen, toseglen;

	KASSERT(len % AES_BLOCK_LEN == 0,
	    ("%s: length %zu not a multiple of the block size", __func__, len));

	ivreg = vld1q_u8(iv);
	for (; len > 0; len -= seglen) {
		from = crypto_cursor_segment(fromc, &fromseglen);
		to = crypto_cursor_segment(toc, &toseglen);

		seglen = ulmin(len, ulmin(fromseglen, toseglen));
		if (seglen < AES_BLOCK_LEN) {
			crypto_cursor_copydata(fromc, AES_BLOCK_LEN, block);
			tmp = vld1q_u8(block);
			tot = armv8_aes_enc(key->aes_rounds - 1,
			    (const void *)key->aes_key, veorq_u8(tmp, ivreg));
			ivreg = tot;
			vst1q_u8(block, tot);
			crypto_cursor_copyback(toc, AES_BLOCK_LEN, block);
			seglen = AES_BLOCK_LEN;
		} else {
			for (oseglen = seglen; seglen >= AES_BLOCK_LEN;
			    seglen -= AES_BLOCK_LEN) {
				tmp = vld1q_u8(from);
				tot = armv8_aes_enc(key->aes_rounds - 1,
				    (const void *)key->aes_key,
				    veorq_u8(tmp, ivreg));
				ivreg = tot;
				vst1q_u8(to, tot);
				from += AES_BLOCK_LEN;
				to += AES_BLOCK_LEN;
			}
			seglen = oseglen - seglen;
			crypto_cursor_advance(fromc, seglen);
			crypto_cursor_advance(toc, seglen);
		}
	}

	explicit_bzero(block, sizeof(block));
}

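/*
 * CBC decryption.  Each ciphertext block is saved in "nextiv" before the
 * plaintext is stored, so decrypting in place (output overlapping input)
 * is safe.
 */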
void
armv8_aes_decrypt_cbc(const AES_key_t *key, size_t len,
    struct crypto_buffer_cursor *fromc, struct crypto_buffer_cursor *toc,
    const uint8_t iv[static AES_BLOCK_LEN])
{
	uint8x16_t ivreg, nextiv, tmp;
	uint8_t block[AES_BLOCK_LEN], *from, *to;
	size_t fromseglen, oseglen, seglen, toseglen;

	KASSERT(len % AES_BLOCK_LEN == 0,
	    ("%s: length %zu not a multiple of the block size", __func__, len));

	ivreg = vld1q_u8(iv);
	for (; len > 0; len -= seglen) {
		from = crypto_cursor_segment(fromc, &fromseglen);
		to = crypto_cursor_segment(toc, &toseglen);

		seglen = ulmin(len, ulmin(fromseglen, toseglen));
		if (seglen < AES_BLOCK_LEN) {
			crypto_cursor_copydata(fromc, AES_BLOCK_LEN, block);
			nextiv = vld1q_u8(block);
			tmp = armv8_aes_dec(key->aes_rounds - 1,
			    (const void *)key->aes_key, nextiv);
			vst1q_u8(block, veorq_u8(tmp, ivreg));
			ivreg = nextiv;
			crypto_cursor_copyback(toc, AES_BLOCK_LEN, block);
			seglen = AES_BLOCK_LEN;
		} else {
			for (oseglen = seglen; seglen >= AES_BLOCK_LEN;
			    seglen -= AES_BLOCK_LEN) {
				nextiv = vld1q_u8(from);
				tmp = armv8_aes_dec(key->aes_rounds - 1,
				    (const void *)key->aes_key, nextiv);
				vst1q_u8(to, veorq_u8(tmp, ivreg));
				ivreg = nextiv;
				from += AES_BLOCK_LEN;
				to += AES_BLOCK_LEN;
			}
			crypto_cursor_advance(fromc, oseglen - seglen);
			crypto_cursor_advance(toc, oseglen - seglen);
			seglen = oseglen - seglen;
		}
	}

	explicit_bzero(block, sizeof(block));
}

#define	AES_XTS_BLOCKSIZE	16
#define	AES_XTS_IVSIZE		8
#define	AES_XTS_ALPHA		0x87	/* GF(2^128) generator polynomial */

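/*
 * Advance the XTS tweak by one block: multiply it by x in GF(2^128) modulo
 * x^128 + x^7 + x^2 + x + 1.  The 128-bit tweak is handled as four 32-bit
 * lanes: each lane is shifted left by one, and the bit shifted out of each
 * lane is reinjected into the next lane via the rotated, sign-extended
 * mask; the bit carried out of the top of the 128-bit value folds back
 * into the low lane as 0x87 (AES_XTS_ALPHA).
 *
 * A scalar, byte-wise equivalent, for illustration only:
 *
 *	carry = tweak[15] >> 7;
 *	for (i = 15; i > 0; i--)
 *		tweak[i] = (tweak[i] << 1) | (tweak[i - 1] >> 7);
 *	tweak[0] = (tweak[0] << 1) ^ (carry ? 0x87 : 0);
 */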
static inline int32x4_t
xts_crank_lfsr(int32x4_t inp)
{
	const int32x4_t alphamask = {AES_XTS_ALPHA, 1, 1, 1};
	int32x4_t xtweak, ret;

	/* set up xor mask */
	xtweak = vextq_s32(inp, inp, 3);
	xtweak = vshrq_n_s32(xtweak, 31);
	xtweak &= alphamask;

	/* next term */
	ret = vshlq_n_s32(inp, 1);
	ret ^= xtweak;

	return ret;
}

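/*
 * Process a single XTS block: C = E_K1(P ^ T) ^ T for encryption (the
 * analogous D_K1 form for decryption), then advance the tweak T for the
 * next block.
 */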
static void
armv8_aes_crypt_xts_block(int rounds, const uint8x16_t *key_schedule,
    uint8x16_t *tweak, const uint8_t *from, uint8_t *to, int do_encrypt)
{
	uint8x16_t block;

	block = vld1q_u8(from) ^ *tweak;

	if (do_encrypt)
		block = armv8_aes_enc(rounds - 1, key_schedule, block);
	else
		block = armv8_aes_dec(rounds - 1, key_schedule, block);

	vst1q_u8(to, block ^ *tweak);

	*tweak = vreinterpretq_u8_s32(xts_crank_lfsr(vreinterpretq_s32_u8(*tweak)));
}

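/*
 * XTS bulk routine shared by encryption and decryption.  The initial tweak
 * is computed as E_K2(IV), where the IV carries the 64-bit block (sector)
 * number in its first eight bytes.  The data is then walked segment by
 * segment, like the CBC routines above, bouncing any block that straddles
 * a segment boundary through a stack buffer.
 */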
static void
armv8_aes_crypt_xts(int rounds, const uint8x16_t *data_schedule,
    const uint8x16_t *tweak_schedule, size_t len,
    struct crypto_buffer_cursor *fromc, struct crypto_buffer_cursor *toc,
    const uint8_t iv[static AES_BLOCK_LEN], int do_encrypt)
{
	uint8x16_t tweakreg;
	uint8_t block[AES_XTS_BLOCKSIZE] __aligned(16);
	uint8_t tweak[AES_XTS_BLOCKSIZE] __aligned(16);
	uint8_t *from, *to;
	size_t fromseglen, oseglen, seglen, toseglen;

	KASSERT(len % AES_XTS_BLOCKSIZE == 0,
	    ("%s: length %zu not a multiple of the block size", __func__, len));

	/*
	 * Prepare tweak as E_k2(IV). IV is specified as LE representation
	 * of a 64-bit block number which we allow to be passed in directly.
	 */
#if BYTE_ORDER == LITTLE_ENDIAN
	bcopy(iv, tweak, AES_XTS_IVSIZE);
	/* Last 64 bits of IV are always zero. */
	bzero(tweak + AES_XTS_IVSIZE, AES_XTS_IVSIZE);
#else
#error Only LITTLE_ENDIAN architectures are supported.
#endif
	tweakreg = vld1q_u8(tweak);
	tweakreg = armv8_aes_enc(rounds - 1, tweak_schedule, tweakreg);

	for (; len > 0; len -= seglen) {
		from = crypto_cursor_segment(fromc, &fromseglen);
		to = crypto_cursor_segment(toc, &toseglen);

		seglen = ulmin(len, ulmin(fromseglen, toseglen));
		if (seglen < AES_XTS_BLOCKSIZE) {
			crypto_cursor_copydata(fromc, AES_XTS_BLOCKSIZE, block);
			armv8_aes_crypt_xts_block(rounds, data_schedule,
			    &tweakreg, block, block, do_encrypt);
			crypto_cursor_copyback(toc, AES_XTS_BLOCKSIZE, block);
			seglen = AES_XTS_BLOCKSIZE;
		} else {
			for (oseglen = seglen; seglen >= AES_XTS_BLOCKSIZE;
			    seglen -= AES_XTS_BLOCKSIZE) {
				armv8_aes_crypt_xts_block(rounds, data_schedule,
				    &tweakreg, from, to, do_encrypt);
				from += AES_XTS_BLOCKSIZE;
				to += AES_XTS_BLOCKSIZE;
			}
			seglen = oseglen - seglen;
			crypto_cursor_advance(fromc, seglen);
			crypto_cursor_advance(toc, seglen);
		}
	}

	explicit_bzero(block, sizeof(block));
}

void
armv8_aes_encrypt_xts(AES_key_t *data_schedule,
    const void *tweak_schedule, size_t len, struct crypto_buffer_cursor *fromc,
    struct crypto_buffer_cursor *toc, const uint8_t iv[static AES_BLOCK_LEN])
{
	armv8_aes_crypt_xts(data_schedule->aes_rounds,
	    (const void *)&data_schedule->aes_key, tweak_schedule, len, fromc,
	    toc, iv, 1);
}

void
armv8_aes_decrypt_xts(AES_key_t *data_schedule,
    const void *tweak_schedule, size_t len,
    struct crypto_buffer_cursor *fromc, struct crypto_buffer_cursor *toc,
    const uint8_t iv[static AES_BLOCK_LEN])
{
	armv8_aes_crypt_xts(data_schedule->aes_rounds,
	    (const void *)&data_schedule->aes_key, tweak_schedule, len, fromc,
	    toc, iv, 0);
}
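
/*
 * Increment the counter block, treating it as a 128-bit big-endian integer.
 * NIST GCM strictly increments only the low 32 bits, but for the block
 * counts GCM permits the difference is not observable.
 */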
#define	AES_INC_COUNTER(counter)				\
	do {							\
		for (int pos = AES_BLOCK_LEN - 1;		\
		     pos >= 0; pos--)				\
			if (++(counter)[pos])			\
				break;				\
	} while (0)

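/*
 * Per-request GCM/GMAC state: EK0 is E_K(J0), kept for the final tag XOR;
 * EKi holds the current keystream block; Xi is the running GHASH value;
 * lenblock is scratch space for the final length block; aes_counter is the
 * CTR-mode counter block.
 */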
struct armv8_gcm_state {
	__uint128_val_t EK0;
	__uint128_val_t EKi;
	__uint128_val_t Xi;
	__uint128_val_t lenblock;
	uint8_t aes_counter[AES_BLOCK_LEN];
};

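/*
 * Start a GCM/GMAC computation: build the initial counter block from the
 * 96-bit IV, encrypt it to obtain EK0 for the final tag, bump the counter
 * to 2 for the payload, and fold the AAD into the GHASH state, zero-padding
 * any trailing partial block.
 */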
static void
armv8_aes_gmac_setup(struct armv8_gcm_state *s, AES_key_t *aes_key,
    const uint8_t *authdata, size_t authdatalen,
    const uint8_t iv[static AES_GCM_IV_LEN], const __uint128_val_t *Htable)
{
	uint8_t block[AES_BLOCK_LEN];
	size_t trailer;

	bzero(s->aes_counter, AES_BLOCK_LEN);
	memcpy(s->aes_counter, iv, AES_GCM_IV_LEN);

	/* Set up the counter. */
	s->aes_counter[AES_BLOCK_LEN - 1] = 1;

	/* EK0 for a final GMAC round */
	aes_v8_encrypt(s->aes_counter, s->EK0.c, aes_key);

	/*
	 * GCM starts with 2 as the counter; counter value 1 is used for
	 * the final XOR of the tag.
	 */
	s->aes_counter[AES_BLOCK_LEN - 1] = 2;

	memset(s->Xi.c, 0, sizeof(s->Xi.c));
	trailer = authdatalen % AES_BLOCK_LEN;
	if (authdatalen - trailer > 0) {
		gcm_ghash_v8(s->Xi.u, Htable, authdata, authdatalen - trailer);
		authdata += authdatalen - trailer;
	}
	if (trailer > 0 || authdatalen == 0) {
		memset(block, 0, sizeof(block));
		memcpy(block, authdata, trailer);
		gcm_ghash_v8(s->Xi.u, Htable, block, AES_BLOCK_LEN);
	}
}

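/*
 * Finish the GHASH computation: hash the length block (big-endian bit
 * counts of the AAD and of the ciphertext) and XOR in EK0 to form the tag
 * in s->Xi.
 */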
static void
armv8_aes_gmac_finish(struct armv8_gcm_state *s, size_t len,
    size_t authdatalen, const __uint128_val_t *Htable)
{
	/* Lengths block */
	s->lenblock.u[0] = s->lenblock.u[1] = 0;
	s->lenblock.d[1] = htobe32(authdatalen * 8);
	s->lenblock.d[3] = htobe32(len * 8);
	gcm_ghash_v8(s->Xi.u, Htable, s->lenblock.c, AES_BLOCK_LEN);

	s->Xi.u[0] ^= s->EK0.u[0];
	s->Xi.u[1] ^= s->EK0.u[1];
}

static void
armv8_aes_encrypt_gcm_block(struct armv8_gcm_state *s, AES_key_t *aes_key,
    const uint64_t *from, uint64_t *to)
{
	aes_v8_encrypt(s->aes_counter, s->EKi.c, aes_key);
	AES_INC_COUNTER(s->aes_counter);
	to[0] = from[0] ^ s->EKi.u[0];
	to[1] = from[1] ^ s->EKi.u[1];
}

static void
armv8_aes_decrypt_gcm_block(struct armv8_gcm_state *s, AES_key_t *aes_key,
    const uint64_t *from, uint64_t *to)
{
	armv8_aes_encrypt_gcm_block(s, aes_key, from, to);
}

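/*
 * AES-GCM encryption.  Each block is encrypted in CTR mode and the
 * resulting ciphertext is fed into GHASH.  Full blocks within a segment
 * are processed through 64-bit pointers; a partial final block, or a block
 * straddling a segment boundary, is bounced through the zero-padded stack
 * buffer.
 */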
void
armv8_aes_encrypt_gcm(AES_key_t *aes_key, size_t len,
    struct crypto_buffer_cursor *fromc, struct crypto_buffer_cursor *toc,
    size_t authdatalen, const uint8_t *authdata,
    uint8_t tag[static GMAC_DIGEST_LEN],
    const uint8_t iv[static AES_GCM_IV_LEN],
    const __uint128_val_t *Htable)
{
	struct armv8_gcm_state s;
	uint8_t block[AES_BLOCK_LEN] __aligned(AES_BLOCK_LEN);
	uint64_t *from64, *to64;
	size_t fromseglen, i, olen, oseglen, seglen, toseglen;

	armv8_aes_gmac_setup(&s, aes_key, authdata, authdatalen, iv, Htable);

	for (olen = len; len > 0; len -= seglen) {
		from64 = crypto_cursor_segment(fromc, &fromseglen);
		to64 = crypto_cursor_segment(toc, &toseglen);

		seglen = ulmin(len, ulmin(fromseglen, toseglen));
		if (seglen < AES_BLOCK_LEN) {
			seglen = ulmin(len, AES_BLOCK_LEN);

			memset(block, 0, sizeof(block));
			crypto_cursor_copydata(fromc, (int)seglen, block);

			if (seglen == AES_BLOCK_LEN) {
				armv8_aes_encrypt_gcm_block(&s, aes_key,
				    (uint64_t *)block, (uint64_t *)block);
			} else {
				aes_v8_encrypt(s.aes_counter, s.EKi.c, aes_key);
				AES_INC_COUNTER(s.aes_counter);
				for (i = 0; i < seglen; i++)
					block[i] ^= s.EKi.c[i];
			}
			gcm_ghash_v8(s.Xi.u, Htable, block, seglen);

			crypto_cursor_copyback(toc, (int)seglen, block);
		} else {
			for (oseglen = seglen; seglen >= AES_BLOCK_LEN;
			    seglen -= AES_BLOCK_LEN) {
				armv8_aes_encrypt_gcm_block(&s, aes_key, from64,
				    to64);
				gcm_ghash_v8(s.Xi.u, Htable, (uint8_t *)to64,
				    AES_BLOCK_LEN);

				from64 += 2;
				to64 += 2;
			}

			seglen = oseglen - seglen;
			crypto_cursor_advance(fromc, seglen);
			crypto_cursor_advance(toc, seglen);
		}
	}

	armv8_aes_gmac_finish(&s, olen, authdatalen, Htable);
	memcpy(tag, s.Xi.c, GMAC_DIGEST_LEN);

	explicit_bzero(block, sizeof(block));
	explicit_bzero(&s, sizeof(s));
}

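/*
 * AES-GCM decryption.  The tag is verified before any plaintext is
 * released: a copy of the input cursor is used to run GHASH over the whole
 * ciphertext, and the result is compared against the supplied tag with
 * timingsafe_bcmp().  Only if that check passes is the ciphertext walked a
 * second time and decrypted in CTR mode.
 */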
int
armv8_aes_decrypt_gcm(AES_key_t *aes_key, size_t len,
    struct crypto_buffer_cursor *fromc, struct crypto_buffer_cursor *toc,
    size_t authdatalen, const uint8_t *authdata,
    const uint8_t tag[static GMAC_DIGEST_LEN],
    const uint8_t iv[static AES_GCM_IV_LEN],
    const __uint128_val_t *Htable)
{
	struct armv8_gcm_state s;
	struct crypto_buffer_cursor fromcc;
	uint8_t block[AES_BLOCK_LEN] __aligned(AES_BLOCK_LEN), *from;
	uint64_t *block64, *from64, *to64;
	size_t fromseglen, olen, oseglen, seglen, toseglen;
	int error;

	armv8_aes_gmac_setup(&s, aes_key, authdata, authdatalen, iv, Htable);

	crypto_cursor_copy(fromc, &fromcc);
	for (olen = len; len > 0; len -= seglen) {
		from = crypto_cursor_segment(&fromcc, &fromseglen);
		seglen = ulmin(len, fromseglen);
		seglen -= seglen % AES_BLOCK_LEN;
		if (seglen > 0) {
			gcm_ghash_v8(s.Xi.u, Htable, from, seglen);
			crypto_cursor_advance(&fromcc, seglen);
		} else {
			memset(block, 0, sizeof(block));
			seglen = ulmin(len, AES_BLOCK_LEN);
			crypto_cursor_copydata(&fromcc, seglen, block);
			gcm_ghash_v8(s.Xi.u, Htable, block, seglen);
		}
	}

	armv8_aes_gmac_finish(&s, olen, authdatalen, Htable);

	if (timingsafe_bcmp(tag, s.Xi.c, GMAC_DIGEST_LEN) != 0) {
		error = EBADMSG;
		goto out;
	}

	block64 = (uint64_t *)block;
	for (len = olen; len > 0; len -= seglen) {
		from64 = crypto_cursor_segment(fromc, &fromseglen);
		to64 = crypto_cursor_segment(toc, &toseglen);

		seglen = ulmin(len, ulmin(fromseglen, toseglen));
		if (seglen < AES_BLOCK_LEN) {
			seglen = ulmin(len, AES_BLOCK_LEN);

			memset(block, 0, sizeof(block));
			crypto_cursor_copydata(fromc, seglen, block);

			armv8_aes_decrypt_gcm_block(&s, aes_key, block64,
			    block64);

			crypto_cursor_copyback(toc, (int)seglen, block);
		} else {
			for (oseglen = seglen; seglen >= AES_BLOCK_LEN;
			    seglen -= AES_BLOCK_LEN) {
				armv8_aes_decrypt_gcm_block(&s, aes_key, from64,
				    to64);

				from64 += 2;
				to64 += 2;
			}

			seglen = oseglen - seglen;
			crypto_cursor_advance(fromc, seglen);
			crypto_cursor_advance(toc, seglen);
		}
	}

	error = 0;
out:
	explicit_bzero(block, sizeof(block));
	explicit_bzero(&s, sizeof(s));
	return (error);
}