1 /* $FreeBSD$ */
2 /* Do not modify. This file is auto-generated from vpaes-armv8.pl. */
3 .text
4
5 .type _vpaes_consts,%object
6 .align 7 // totally strategic alignment
7 _vpaes_consts:
8 .Lk_mc_forward: // mc_forward
9 .quad 0x0407060500030201, 0x0C0F0E0D080B0A09
10 .quad 0x080B0A0904070605, 0x000302010C0F0E0D
11 .quad 0x0C0F0E0D080B0A09, 0x0407060500030201
12 .quad 0x000302010C0F0E0D, 0x080B0A0904070605
13 .Lk_mc_backward: // mc_backward
14 .quad 0x0605040702010003, 0x0E0D0C0F0A09080B
15 .quad 0x020100030E0D0C0F, 0x0A09080B06050407
16 .quad 0x0E0D0C0F0A09080B, 0x0605040702010003
17 .quad 0x0A09080B06050407, 0x020100030E0D0C0F
18 .Lk_sr: // sr
19 .quad 0x0706050403020100, 0x0F0E0D0C0B0A0908
20 .quad 0x030E09040F0A0500, 0x0B06010C07020D08
21 .quad 0x0F060D040B020900, 0x070E050C030A0108
22 .quad 0x0B0E0104070A0D00, 0x0306090C0F020508
23
24 //
25 // "Hot" constants
26 //
27 .Lk_inv: // inv, inva
28 .quad 0x0E05060F0D080180, 0x040703090A0B0C02
29 .quad 0x01040A060F0B0780, 0x030D0E0C02050809
30 .Lk_ipt: // input transform (lo, hi)
31 .quad 0xC2B2E8985A2A7000, 0xCABAE09052227808
32 .quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
33 .Lk_sbo: // sbou, sbot
34 .quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878
35 .quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
36 .Lk_sb1: // sb1u, sb1t
37 .quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
38 .quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
39 .Lk_sb2: // sb2u, sb2t
40 .quad 0x69EB88400AE12900, 0xC2A163C8AB82234A
41 .quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD
42
43 //
44 // Decryption stuff
45 //
46 .Lk_dipt: // decryption input transform
47 .quad 0x0F505B040B545F00, 0x154A411E114E451A
48 .quad 0x86E383E660056500, 0x12771772F491F194
49 .Lk_dsbo: // decryption sbox final output
50 .quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
51 .quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C
52 .Lk_dsb9: // decryption sbox output *9*u, *9*t
53 .quad 0x851C03539A86D600, 0xCAD51F504F994CC9
54 .quad 0xC03B1789ECD74900, 0x725E2C9EB2FBA565
55 .Lk_dsbd: // decryption sbox output *D*u, *D*t
56 .quad 0x7D57CCDFE6B1A200, 0xF56E9B13882A4439
57 .quad 0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3
58 .Lk_dsbb: // decryption sbox output *B*u, *B*t
59 .quad 0xD022649296B44200, 0x602646F6B0F2D404
60 .quad 0xC19498A6CD596700, 0xF3FF0C3E3255AA6B
61 .Lk_dsbe: // decryption sbox output *E*u, *E*t
62 .quad 0x46F2929626D4D000, 0x2242600464B4F6B0
63 .quad 0x0C55A6CDFFAAC100, 0x9467F36B98593E32
64
65 //
66 // Key schedule constants
67 //
68 .Lk_dksd: // decryption key schedule: invskew x*D
69 .quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
70 .quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E
71 .Lk_dksb: // decryption key schedule: invskew x*B
72 .quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99
73 .quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
74 .Lk_dkse: // decryption key schedule: invskew x*E + 0x63
75 .quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086
76 .quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487
77 .Lk_dks9: // decryption key schedule: invskew x*9
78 .quad 0xB6116FC87ED9A700, 0x4AED933482255BFC
79 .quad 0x4576516227143300, 0x8BB89FACE9DAFDCE
80
81 .Lk_rcon: // rcon
82 .quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81
83
84 .Lk_opt: // output transform
85 .quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808
86 .quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
87 .Lk_deskew: // deskew tables: inverts the sbox's "skew"
88 .quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
89 .quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77
90
91 .byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,65,82,77,118,56,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0
92 .align 2
93 .size _vpaes_consts,.-_vpaes_consts
94 .align 6
##
## _vpaes_encrypt_preheat
##
## Fills x10 with the address of the constants (keeping the code
## position independent, as "%r10 -> .aes_consts" did in the x86_64
## original) and loads the encryption tables into v17-v27 (the x86_64
## original's %xmm9-%xmm15).
##
101 .type _vpaes_encrypt_preheat,%function
102 .align 4
103 _vpaes_encrypt_preheat:
104 adr x10, .Lk_inv
105 movi v17.16b, #0x0f
106 ld1 {v18.2d,v19.2d}, [x10],#32 // .Lk_inv
107 ld1 {v20.2d,v21.2d,v22.2d,v23.2d}, [x10],#64 // .Lk_ipt, .Lk_sbo
108 ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x10] // .Lk_sb1, .Lk_sb2
109 ret
110 .size _vpaes_encrypt_preheat,.-_vpaes_encrypt_preheat
111
##
## _vpaes_encrypt_core
##
## AES-encrypt one block.  (The // comments below quote the x86_64
## original, which worked on %xmm0.)
##
## Inputs:
##   v7      = input block                        (%xmm0)
##   v17-v27 = tables from _vpaes_encrypt_preheat (%xmm9-%xmm15)
##   x2      = scheduled keys                     ((%rdx))
##
## Output in v0 (%xmm0)
## Clobbers v1-v5, v16 and x8-x11 (%xmm1-%xmm5, %r9-%r11, %rax)
## Preserves v6-v7 so you get some local vectors
##
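##
## The central trick, used throughout this file: every 256-entry byte
## lookup is split into two 16-entry lookups on the low and high nibble
## of each byte, which map directly onto the TBL instruction.  As a
## rough C sketch (illustrative only, not assembled; tbl_lo/tbl_hi
## stand for a table pair such as the two halves of .Lk_ipt):
##
##	out[i] = tbl_lo[in[i] & 0x0f] ^ tbl_hi[in[i] >> 4];
##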
127 .type _vpaes_encrypt_core,%function
128 .align 4
129 _vpaes_encrypt_core:
130 mov x9, x2
131 ldr w8, [x2,#240] // pull rounds
132 adr x11, .Lk_mc_forward+16
133 // vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo
134 ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key
135 and v1.16b, v7.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
136 ushr v0.16b, v7.16b, #4 // vpsrlb $4, %xmm0, %xmm0
137 tbl v1.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1
138 // vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi
139 tbl v2.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2
140 eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0
141 eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
142 b .Lenc_entry
143
144 .align 4
145 .Lenc_loop:
146 // middle of middle round
147 add x10, x11, #0x40
148 tbl v4.16b, {v25.16b}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u
149 ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[]
150 tbl v0.16b, {v24.16b}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t
151 eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
152 tbl v5.16b, {v27.16b}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u
153 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A
154 tbl v2.16b, {v26.16b}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t
155 ld1 {v4.2d}, [x10] // vmovdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[]
156 tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B
157 eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A
158 tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D
159 eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B
160 tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C
161 eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D
162 and x11, x11, #~(1<<6) // and $0x30, %r11 # ... mod 4
163 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D
164 sub w8, w8, #1 // nr--
165
166 .Lenc_entry:
167 // top of round
168 and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k
169 ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i
170 tbl v5.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k
171 eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
172 tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
173 tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
174 eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
175 eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
176 tbl v2.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
177 tbl v3.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
178 eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io
179 eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
180 ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5
181 cbnz w8, .Lenc_loop
182
183 // middle of last round
184 add x10, x11, #0x80
185 // vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo
186 // vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16
187 tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
188 ld1 {v1.2d}, [x10] // vmovdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[]
189 tbl v0.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t
190 eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
191 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A
192 tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0
193 ret
194 .size _vpaes_encrypt_core,.-_vpaes_encrypt_core
195
196 .globl vpaes_encrypt
197 .type vpaes_encrypt,%function
198 .align 4
199 vpaes_encrypt:
200 .inst 0xd503233f // paciasp
201 stp x29,x30,[sp,#-16]!
202 add x29,sp,#0
203
204 ld1 {v7.16b}, [x0]
205 bl _vpaes_encrypt_preheat
206 bl _vpaes_encrypt_core
207 st1 {v0.16b}, [x1]
208
209 ldp x29,x30,[sp],#16
210 .inst 0xd50323bf // autiasp
211 ret
212 .size vpaes_encrypt,.-vpaes_encrypt
213
214 .type _vpaes_encrypt_2x,%function
215 .align 4
216 _vpaes_encrypt_2x:
217 mov x9, x2
218 ldr w8, [x2,#240] // pull rounds
219 adr x11, .Lk_mc_forward+16
220 // vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo
221 ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key
222 and v1.16b, v14.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
223 ushr v0.16b, v14.16b, #4 // vpsrlb $4, %xmm0, %xmm0
224 and v9.16b, v15.16b, v17.16b
225 ushr v8.16b, v15.16b, #4
226 tbl v1.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1
227 tbl v9.16b, {v20.16b}, v9.16b
228 // vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi
229 tbl v2.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2
230 tbl v10.16b, {v21.16b}, v8.16b
231 eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0
232 eor v8.16b, v9.16b, v16.16b
233 eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
234 eor v8.16b, v8.16b, v10.16b
235 b .Lenc_2x_entry
236
237 .align 4
238 .Lenc_2x_loop:
239 // middle of middle round
240 add x10, x11, #0x40
241 tbl v4.16b, {v25.16b}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u
242 tbl v12.16b, {v25.16b}, v10.16b
243 ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[]
244 tbl v0.16b, {v24.16b}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t
245 tbl v8.16b, {v24.16b}, v11.16b
246 eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
247 eor v12.16b, v12.16b, v16.16b
248 tbl v5.16b, {v27.16b}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u
249 tbl v13.16b, {v27.16b}, v10.16b
250 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A
251 eor v8.16b, v8.16b, v12.16b
252 tbl v2.16b, {v26.16b}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t
253 tbl v10.16b, {v26.16b}, v11.16b
254 ld1 {v4.2d}, [x10] // vmovdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[]
255 tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B
256 tbl v11.16b, {v8.16b}, v1.16b
257 eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A
258 eor v10.16b, v10.16b, v13.16b
259 tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D
260 tbl v8.16b, {v8.16b}, v4.16b
261 eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B
262 eor v11.16b, v11.16b, v10.16b
263 tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C
264 tbl v12.16b, {v11.16b},v1.16b
265 eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D
266 eor v8.16b, v8.16b, v11.16b
267 and x11, x11, #~(1<<6) // and $0x30, %r11 # ... mod 4
268 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D
269 eor v8.16b, v8.16b, v12.16b
270 sub w8, w8, #1 // nr--
271
272 .Lenc_2x_entry:
273 // top of round
274 and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k
275 ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i
276 and v9.16b, v8.16b, v17.16b
277 ushr v8.16b, v8.16b, #4
278 tbl v5.16b, {v19.16b},v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k
279 tbl v13.16b, {v19.16b},v9.16b
280 eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
281 eor v9.16b, v9.16b, v8.16b
282 tbl v3.16b, {v18.16b},v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
283 tbl v11.16b, {v18.16b},v8.16b
284 tbl v4.16b, {v18.16b},v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
285 tbl v12.16b, {v18.16b},v9.16b
286 eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
287 eor v11.16b, v11.16b, v13.16b
288 eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
289 eor v12.16b, v12.16b, v13.16b
290 tbl v2.16b, {v18.16b},v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
291 tbl v10.16b, {v18.16b},v11.16b
292 tbl v3.16b, {v18.16b},v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
293 tbl v11.16b, {v18.16b},v12.16b
294 eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io
295 eor v10.16b, v10.16b, v9.16b
296 eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
297 eor v11.16b, v11.16b, v8.16b
298 ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5
299 cbnz w8, .Lenc_2x_loop
300
301 // middle of last round
302 add x10, x11, #0x80
303 // vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo
304 // vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16
305 tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
306 tbl v12.16b, {v22.16b}, v10.16b
307 ld1 {v1.2d}, [x10] // vmovdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[]
308 tbl v0.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t
309 tbl v8.16b, {v23.16b}, v11.16b
310 eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
311 eor v12.16b, v12.16b, v16.16b
312 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A
313 eor v8.16b, v8.16b, v12.16b
314 tbl v0.16b, {v0.16b},v1.16b // vpshufb %xmm1, %xmm0, %xmm0
315 tbl v1.16b, {v8.16b},v1.16b
316 ret
317 .size _vpaes_encrypt_2x,.-_vpaes_encrypt_2x
318
319 .type _vpaes_decrypt_preheat,%function
320 .align 4
321 _vpaes_decrypt_preheat:
322 adr x10, .Lk_inv
323 movi v17.16b, #0x0f
324 adr x11, .Lk_dipt
325 ld1 {v18.2d,v19.2d}, [x10],#32 // .Lk_inv
326 ld1 {v20.2d,v21.2d,v22.2d,v23.2d}, [x11],#64 // .Lk_dipt, .Lk_dsbo
327 ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x11],#64 // .Lk_dsb9, .Lk_dsbd
328 ld1 {v28.2d,v29.2d,v30.2d,v31.2d}, [x11] // .Lk_dsbb, .Lk_dsbe
329 ret
330 .size _vpaes_decrypt_preheat,.-_vpaes_decrypt_preheat
331
332 ##
333 ## Decryption core
334 ##
335 ## Same API as encryption core.
336 ##
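##
## The decryption loop folds InvMixColumns into the sbox output tables:
## .Lk_dsb9/.Lk_dsbd/.Lk_dsbb/.Lk_dsbe hold the inverse-sbox output
## premultiplied by 9, D, B and E, and the partial results are combined
## through a rotating accumulator shuffle (v5, stepped by "ext ... #12"
## once per round).
##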
337 .type _vpaes_decrypt_core,%function
338 .align 4
339 _vpaes_decrypt_core:
340 mov x9, x2
341 ldr w8, [x2,#240] // pull rounds
342
343 // vmovdqa .Lk_dipt(%rip), %xmm2 # iptlo
344 lsl x11, x8, #4 // mov %rax, %r11; shl $4, %r11
345 eor x11, x11, #0x30 // xor $0x30, %r11
346 adr x10, .Lk_sr
347 and x11, x11, #0x30 // and $0x30, %r11
348 add x11, x11, x10
349 adr x10, .Lk_mc_forward+48
350
351 ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm4 # round0 key
352 and v1.16b, v7.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
353 ushr v0.16b, v7.16b, #4 // vpsrlb $4, %xmm0, %xmm0
354 tbl v2.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm2
355 ld1 {v5.2d}, [x10] // vmovdqa .Lk_mc_forward+48(%rip), %xmm5
356 // vmovdqa .Lk_dipt+16(%rip), %xmm1 # ipthi
357 tbl v0.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm1, %xmm0
358 eor v2.16b, v2.16b, v16.16b // vpxor %xmm4, %xmm2, %xmm2
359 eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
360 b .Ldec_entry
361
362 .align 4
363 .Ldec_loop:
364 //
365 // Inverse mix columns
366 //
367 // vmovdqa -0x20(%r10),%xmm4 # 4 : sb9u
368 // vmovdqa -0x10(%r10),%xmm1 # 0 : sb9t
369 tbl v4.16b, {v24.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u
370 tbl v1.16b, {v25.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t
371 eor v0.16b, v4.16b, v16.16b // vpxor %xmm4, %xmm0, %xmm0
372 // vmovdqa 0x00(%r10),%xmm4 # 4 : sbdu
373 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
374 // vmovdqa 0x10(%r10),%xmm1 # 0 : sbdt
375
376 tbl v4.16b, {v26.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu
377 tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
378 tbl v1.16b, {v27.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt
379 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
380 // vmovdqa 0x20(%r10), %xmm4 # 4 : sbbu
381 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
382 // vmovdqa 0x30(%r10), %xmm1 # 0 : sbbt
383
384 tbl v4.16b, {v28.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu
385 tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
386 tbl v1.16b, {v29.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt
387 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
388 // vmovdqa 0x40(%r10), %xmm4 # 4 : sbeu
389 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
390 // vmovdqa 0x50(%r10), %xmm1 # 0 : sbet
391
392 tbl v4.16b, {v30.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu
393 tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
394 tbl v1.16b, {v31.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet
395 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
396 ext v5.16b, v5.16b, v5.16b, #12 // vpalignr $12, %xmm5, %xmm5, %xmm5
397 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
398 sub w8, w8, #1 // sub $1,%rax # nr--
399
400 .Ldec_entry:
401 // top of round
402 and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k
403 ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i
404 tbl v2.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
405 eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
406 tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
407 tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
408 eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
409 eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
410 tbl v2.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
411 tbl v3.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
412 eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io
413 eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
414 ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm0
415 cbnz w8, .Ldec_loop
416
417 // middle of last round
418 // vmovdqa 0x60(%r10), %xmm4 # 3 : sbou
419 tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
420 // vmovdqa 0x70(%r10), %xmm1 # 0 : sbot
421 ld1 {v2.2d}, [x11] // vmovdqa -0x160(%r11), %xmm2 # .Lk_sr-.Lk_dsbd=-0x160
422 tbl v1.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t
423 eor v4.16b, v4.16b, v16.16b // vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k
424 eor v0.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm0 # 0 = A
425 tbl v0.16b, {v0.16b}, v2.16b // vpshufb %xmm2, %xmm0, %xmm0
426 ret
427 .size _vpaes_decrypt_core,.-_vpaes_decrypt_core
428
429 .globl vpaes_decrypt
430 .type vpaes_decrypt,%function
431 .align 4
432 vpaes_decrypt:
433 .inst 0xd503233f // paciasp
434 stp x29,x30,[sp,#-16]!
435 add x29,sp,#0
436
437 ld1 {v7.16b}, [x0]
438 bl _vpaes_decrypt_preheat
439 bl _vpaes_decrypt_core
440 st1 {v0.16b}, [x1]
441
442 ldp x29,x30,[sp],#16
443 .inst 0xd50323bf // autiasp
444 ret
445 .size vpaes_decrypt,.-vpaes_decrypt
446
447 // v14-v15 input, v0-v1 output
448 .type _vpaes_decrypt_2x,%function
449 .align 4
450 _vpaes_decrypt_2x:
451 mov x9, x2
452 ldr w8, [x2,#240] // pull rounds
453
454 // vmovdqa .Lk_dipt(%rip), %xmm2 # iptlo
455 lsl x11, x8, #4 // mov %rax, %r11; shl $4, %r11
456 eor x11, x11, #0x30 // xor $0x30, %r11
457 adr x10, .Lk_sr
458 and x11, x11, #0x30 // and $0x30, %r11
459 add x11, x11, x10
460 adr x10, .Lk_mc_forward+48
461
462 ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm4 # round0 key
463 and v1.16b, v14.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
464 ushr v0.16b, v14.16b, #4 // vpsrlb $4, %xmm0, %xmm0
465 and v9.16b, v15.16b, v17.16b
466 ushr v8.16b, v15.16b, #4
467 tbl v2.16b, {v20.16b},v1.16b // vpshufb %xmm1, %xmm2, %xmm2
468 tbl v10.16b, {v20.16b},v9.16b
469 ld1 {v5.2d}, [x10] // vmovdqa .Lk_mc_forward+48(%rip), %xmm5
470 // vmovdqa .Lk_dipt+16(%rip), %xmm1 # ipthi
471 tbl v0.16b, {v21.16b},v0.16b // vpshufb %xmm0, %xmm1, %xmm0
472 tbl v8.16b, {v21.16b},v8.16b
473 eor v2.16b, v2.16b, v16.16b // vpxor %xmm4, %xmm2, %xmm2
474 eor v10.16b, v10.16b, v16.16b
475 eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
476 eor v8.16b, v8.16b, v10.16b
477 b .Ldec_2x_entry
478
479 .align 4
480 .Ldec_2x_loop:
481 //
482 // Inverse mix columns
483 //
484 // vmovdqa -0x20(%r10),%xmm4 # 4 : sb9u
485 // vmovdqa -0x10(%r10),%xmm1 # 0 : sb9t
486 tbl v4.16b, {v24.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u
487 tbl v12.16b, {v24.16b}, v10.16b
488 tbl v1.16b, {v25.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t
489 tbl v9.16b, {v25.16b}, v11.16b
490 eor v0.16b, v4.16b, v16.16b // vpxor %xmm4, %xmm0, %xmm0
491 eor v8.16b, v12.16b, v16.16b
492 // vmovdqa 0x00(%r10),%xmm4 # 4 : sbdu
493 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
494 eor v8.16b, v8.16b, v9.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
495 // vmovdqa 0x10(%r10),%xmm1 # 0 : sbdt
496
497 tbl v4.16b, {v26.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu
498 tbl v12.16b, {v26.16b}, v10.16b
499 tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
500 tbl v8.16b, {v8.16b},v5.16b
501 tbl v1.16b, {v27.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt
502 tbl v9.16b, {v27.16b}, v11.16b
503 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
504 eor v8.16b, v8.16b, v12.16b
505 // vmovdqa 0x20(%r10), %xmm4 # 4 : sbbu
506 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
507 eor v8.16b, v8.16b, v9.16b
508 // vmovdqa 0x30(%r10), %xmm1 # 0 : sbbt
509
510 tbl v4.16b, {v28.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu
511 tbl v12.16b, {v28.16b}, v10.16b
512 tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
513 tbl v8.16b, {v8.16b},v5.16b
514 tbl v1.16b, {v29.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt
515 tbl v9.16b, {v29.16b}, v11.16b
516 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
517 eor v8.16b, v8.16b, v12.16b
518 // vmovdqa 0x40(%r10), %xmm4 # 4 : sbeu
519 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
520 eor v8.16b, v8.16b, v9.16b
521 // vmovdqa 0x50(%r10), %xmm1 # 0 : sbet
522
523 tbl v4.16b, {v30.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu
524 tbl v12.16b, {v30.16b}, v10.16b
525 tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
526 tbl v8.16b, {v8.16b},v5.16b
527 tbl v1.16b, {v31.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet
528 tbl v9.16b, {v31.16b}, v11.16b
529 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
530 eor v8.16b, v8.16b, v12.16b
531 ext v5.16b, v5.16b, v5.16b, #12 // vpalignr $12, %xmm5, %xmm5, %xmm5
532 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
533 eor v8.16b, v8.16b, v9.16b
534 sub w8, w8, #1 // sub $1,%rax # nr--
535
536 .Ldec_2x_entry:
537 // top of round
538 and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k
539 ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i
540 and v9.16b, v8.16b, v17.16b
541 ushr v8.16b, v8.16b, #4
542 tbl v2.16b, {v19.16b},v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
543 tbl v10.16b, {v19.16b},v9.16b
544 eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
545 eor v9.16b, v9.16b, v8.16b
546 tbl v3.16b, {v18.16b},v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
547 tbl v11.16b, {v18.16b},v8.16b
548 tbl v4.16b, {v18.16b},v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
549 tbl v12.16b, {v18.16b},v9.16b
550 eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
551 eor v11.16b, v11.16b, v10.16b
552 eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
553 eor v12.16b, v12.16b, v10.16b
554 tbl v2.16b, {v18.16b},v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
555 tbl v10.16b, {v18.16b},v11.16b
556 tbl v3.16b, {v18.16b},v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
557 tbl v11.16b, {v18.16b},v12.16b
558 eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io
559 eor v10.16b, v10.16b, v9.16b
560 eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
561 eor v11.16b, v11.16b, v8.16b
562 ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm0
563 cbnz w8, .Ldec_2x_loop
564
565 // middle of last round
566 // vmovdqa 0x60(%r10), %xmm4 # 3 : sbou
567 tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
568 tbl v12.16b, {v22.16b}, v10.16b
569 // vmovdqa 0x70(%r10), %xmm1 # 0 : sbot
570 tbl v1.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t
571 tbl v9.16b, {v23.16b}, v11.16b
572 ld1 {v2.2d}, [x11] // vmovdqa -0x160(%r11), %xmm2 # .Lk_sr-.Lk_dsbd=-0x160
573 eor v4.16b, v4.16b, v16.16b // vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k
574 eor v12.16b, v12.16b, v16.16b
575 eor v0.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm0 # 0 = A
576 eor v8.16b, v9.16b, v12.16b
577 tbl v0.16b, {v0.16b},v2.16b // vpshufb %xmm2, %xmm0, %xmm0
578 tbl v1.16b, {v8.16b},v2.16b
579 ret
580 .size _vpaes_decrypt_2x,.-_vpaes_decrypt_2x
581 ########################################################
582 ## ##
583 ## AES key schedule ##
584 ## ##
585 ########################################################
586 .type _vpaes_key_preheat,%function
587 .align 4
588 _vpaes_key_preheat:
589 adr x10, .Lk_inv
590 movi v16.16b, #0x5b // .Lk_s63
591 adr x11, .Lk_sb1
592 movi v17.16b, #0x0f // .Lk_s0F
593 ld1 {v18.2d,v19.2d,v20.2d,v21.2d}, [x10] // .Lk_inv, .Lk_ipt
594 adr x10, .Lk_dksd
595 ld1 {v22.2d,v23.2d}, [x11] // .Lk_sb1
596 adr x11, .Lk_mc_forward
597 ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x10],#64 // .Lk_dksd, .Lk_dksb
598 ld1 {v28.2d,v29.2d,v30.2d,v31.2d}, [x10],#64 // .Lk_dkse, .Lk_dks9
599 ld1 {v8.2d}, [x10] // .Lk_rcon
600 ld1 {v9.2d}, [x11] // .Lk_mc_forward[0]
601 ret
602 .size _vpaes_key_preheat,.-_vpaes_key_preheat
603
604 .type _vpaes_schedule_core,%function
605 .align 4
606 _vpaes_schedule_core:
607 .inst 0xd503233f // paciasp
608 stp x29, x30, [sp,#-16]!
609 add x29,sp,#0
610
611 bl _vpaes_key_preheat // load the tables
612
613 ld1 {v0.16b}, [x0],#16 // vmovdqu (%rdi), %xmm0 # load key (unaligned)
614
615 // input transform
616 mov v3.16b, v0.16b // vmovdqa %xmm0, %xmm3
617 bl _vpaes_schedule_transform
618 mov v7.16b, v0.16b // vmovdqa %xmm0, %xmm7
619
620 adr x10, .Lk_sr // lea .Lk_sr(%rip),%r10
621 add x8, x8, x10
622 cbnz w3, .Lschedule_am_decrypting
623
624 // encrypting, output zeroth round key after transform
625 st1 {v0.2d}, [x2] // vmovdqu %xmm0, (%rdx)
626 b .Lschedule_go
627
628 .Lschedule_am_decrypting:
629 // decrypting, output zeroth round key after shiftrows
630 ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1
631 tbl v3.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
632 st1 {v3.2d}, [x2] // vmovdqu %xmm3, (%rdx)
633 eor x8, x8, #0x30 // xor $0x30, %r8
634
635 .Lschedule_go:
636 cmp w1, #192 // cmp $192, %esi
637 b.hi .Lschedule_256
638 b.eq .Lschedule_192
	// 128: fall through
640
641 ##
642 ## .schedule_128
643 ##
644 ## 128-bit specific part of key schedule.
645 ##
646 ## This schedule is really simple, because all its parts
647 ## are accomplished by the subroutines.
648 ##
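##
## (Ten passes of the loop write round keys 1 through 10, the last one
## via .Lschedule_mangle_last; round key 0 was stored before
## .Lschedule_go.)
##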
649 .Lschedule_128:
650 mov x0, #10 // mov $10, %esi
651
652 .Loop_schedule_128:
653 sub x0, x0, #1 // dec %esi
654 bl _vpaes_schedule_round
655 cbz x0, .Lschedule_mangle_last
656 bl _vpaes_schedule_mangle // write output
657 b .Loop_schedule_128
658
659 ##
660 ## .aes_schedule_192
661 ##
662 ## 192-bit specific part of key schedule.
663 ##
664 ## The main body of this schedule is the same as the 128-bit
665 ## schedule, but with more smearing. The long, high side is
666 ## stored in %xmm7 as before, and the short, low side is in
667 ## the high bits of %xmm6.
668 ##
669 ## This schedule is somewhat nastier, however, because each
670 ## round produces 192 bits of key material, or 1.5 round keys.
671 ## Therefore, on each cycle we do 2 rounds and produce 3 round
672 ## keys.
673 ##
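##
## (Each pass of .Loop_schedule_192 writes three round keys, the last
## pass writing its third via .Lschedule_mangle_last, so four passes
## plus the zeroth key stored earlier give the 13 round keys of
## AES-192.)
##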
674 .align 4
675 .Lschedule_192:
676 sub x0, x0, #8
677 ld1 {v0.16b}, [x0] // vmovdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned)
678 bl _vpaes_schedule_transform // input transform
679 mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save short part
680 eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4 # clear 4
681 ins v6.d[0], v4.d[0] // vmovhlps %xmm4, %xmm6, %xmm6 # clobber low side with zeros
682 mov x0, #4 // mov $4, %esi
683
684 .Loop_schedule_192:
685 sub x0, x0, #1 // dec %esi
686 bl _vpaes_schedule_round
687 ext v0.16b, v6.16b, v0.16b, #8 // vpalignr $8,%xmm6,%xmm0,%xmm0
688 bl _vpaes_schedule_mangle // save key n
689 bl _vpaes_schedule_192_smear
690 bl _vpaes_schedule_mangle // save key n+1
691 bl _vpaes_schedule_round
692 cbz x0, .Lschedule_mangle_last
693 bl _vpaes_schedule_mangle // save key n+2
694 bl _vpaes_schedule_192_smear
695 b .Loop_schedule_192
696
697 ##
698 ## .aes_schedule_256
699 ##
700 ## 256-bit specific part of key schedule.
701 ##
702 ## The structure here is very similar to the 128-bit
703 ## schedule, but with an additional "low side" in
704 ## %xmm6. The low side's rounds are the same as the
705 ## high side's, except no rcon and no rotation.
706 ##
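##
## (Each pass of .Loop_schedule_256 writes two round keys, one for the
## high side and one for the low side; on the last pass the second is
## written by .Lschedule_mangle_last.  Seven passes plus the zeroth key
## give the 15 round keys of AES-256.)
##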
707 .align 4
708 .Lschedule_256:
709 ld1 {v0.16b}, [x0] // vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned)
710 bl _vpaes_schedule_transform // input transform
711 mov x0, #7 // mov $7, %esi
712
713 .Loop_schedule_256:
714 sub x0, x0, #1 // dec %esi
715 bl _vpaes_schedule_mangle // output low result
716 mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save cur_lo in xmm6
717
718 // high round
719 bl _vpaes_schedule_round
720 cbz x0, .Lschedule_mangle_last
721 bl _vpaes_schedule_mangle
722
723 // low round. swap xmm7 and xmm6
724 dup v0.4s, v0.s[3] // vpshufd $0xFF, %xmm0, %xmm0
725 movi v4.16b, #0
726 mov v5.16b, v7.16b // vmovdqa %xmm7, %xmm5
727 mov v7.16b, v6.16b // vmovdqa %xmm6, %xmm7
728 bl _vpaes_schedule_low_round
729 mov v7.16b, v5.16b // vmovdqa %xmm5, %xmm7
730
731 b .Loop_schedule_256
732
733 ##
734 ## .aes_schedule_mangle_last
735 ##
736 ## Mangler for last round of key schedule
737 ## Mangles %xmm0
738 ## when encrypting, outputs out(%xmm0) ^ 63
739 ## when decrypting, outputs unskew(%xmm0)
740 ##
741 ## Always called right before return... jumps to cleanup and exits
742 ##
743 .align 4
744 .Lschedule_mangle_last:
745 // schedule last round key from xmm0
746 adr x11, .Lk_deskew // lea .Lk_deskew(%rip),%r11 # prepare to deskew
747 cbnz w3, .Lschedule_mangle_last_dec
748
749 // encrypting
750 ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10),%xmm1
751 adr x11, .Lk_opt // lea .Lk_opt(%rip), %r11 # prepare to output transform
752 add x2, x2, #32 // add $32, %rdx
753 tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0 # output permute
754
755 .Lschedule_mangle_last_dec:
756 ld1 {v20.2d,v21.2d}, [x11] // reload constants
757 sub x2, x2, #16 // add $-16, %rdx
758 eor v0.16b, v0.16b, v16.16b // vpxor .Lk_s63(%rip), %xmm0, %xmm0
759 bl _vpaes_schedule_transform // output transform
760 st1 {v0.2d}, [x2] // vmovdqu %xmm0, (%rdx) # save last key
761
762 // cleanup
763 eor v0.16b, v0.16b, v0.16b // vpxor %xmm0, %xmm0, %xmm0
764 eor v1.16b, v1.16b, v1.16b // vpxor %xmm1, %xmm1, %xmm1
765 eor v2.16b, v2.16b, v2.16b // vpxor %xmm2, %xmm2, %xmm2
766 eor v3.16b, v3.16b, v3.16b // vpxor %xmm3, %xmm3, %xmm3
767 eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4
768 eor v5.16b, v5.16b, v5.16b // vpxor %xmm5, %xmm5, %xmm5
769 eor v6.16b, v6.16b, v6.16b // vpxor %xmm6, %xmm6, %xmm6
770 eor v7.16b, v7.16b, v7.16b // vpxor %xmm7, %xmm7, %xmm7
771 ldp x29, x30, [sp],#16
772 .inst 0xd50323bf // autiasp
773 ret
774 .size _vpaes_schedule_core,.-_vpaes_schedule_core
775
776 ##
777 ## .aes_schedule_192_smear
778 ##
779 ## Smear the short, low side in the 192-bit key schedule.
780 ##
781 ## Inputs:
782 ## %xmm7: high side, b a x y
783 ## %xmm6: low side, d c 0 0
784 ## %xmm13: 0
785 ##
786 ## Outputs:
787 ## %xmm6: b+c+d b+c 0 0
788 ## %xmm0: b+c+d b+c b a
789 ##
790 .type _vpaes_schedule_192_smear,%function
791 .align 4
792 _vpaes_schedule_192_smear:
793 movi v1.16b, #0
794 dup v0.4s, v7.s[3]
795 ins v1.s[3], v6.s[2] // vpshufd $0x80, %xmm6, %xmm1 # d c 0 0 -> c 0 0 0
796 ins v0.s[0], v7.s[2] // vpshufd $0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a
797 eor v6.16b, v6.16b, v1.16b // vpxor %xmm1, %xmm6, %xmm6 # -> c+d c 0 0
798 eor v1.16b, v1.16b, v1.16b // vpxor %xmm1, %xmm1, %xmm1
799 eor v6.16b, v6.16b, v0.16b // vpxor %xmm0, %xmm6, %xmm6 # -> b+c+d b+c b a
800 mov v0.16b, v6.16b // vmovdqa %xmm6, %xmm0
801 ins v6.d[0], v1.d[0] // vmovhlps %xmm1, %xmm6, %xmm6 # clobber low side with zeros
802 ret
803 .size _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear
804
805 ##
806 ## .aes_schedule_round
807 ##
808 ## Runs one main round of the key schedule on %xmm0, %xmm7
809 ##
810 ## Specifically, runs subbytes on the high dword of %xmm0
811 ## then rotates it by one byte and xors into the low dword of
812 ## %xmm7.
813 ##
814 ## Adds rcon from low byte of %xmm8, then rotates %xmm8 for
815 ## next rcon.
816 ##
817 ## Smears the dwords of %xmm7 by xoring the low into the
818 ## second low, result into third, result into highest.
819 ##
820 ## Returns results in %xmm7 = %xmm0.
821 ## Clobbers %xmm1-%xmm4, %r11.
822 ##
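##
## For reference, this is the standard FIPS-197 key-expansion step.  As
## a rough C sketch (illustrative only, not assembled; w[] is the
## previous round key as four 32-bit words):
##
##	w[0] ^= SubWord(RotWord(w[3])) ^ rcon;
##	w[1] ^= w[0];  w[2] ^= w[1];  w[3] ^= w[2];
##
## The low-round entry point (_vpaes_schedule_low_round) performs the
## same step without the rotation and without the rcon.
##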
823 .type _vpaes_schedule_round,%function
824 .align 4
825 _vpaes_schedule_round:
826 // extract rcon from xmm8
827 movi v4.16b, #0 // vpxor %xmm4, %xmm4, %xmm4
828 ext v1.16b, v8.16b, v4.16b, #15 // vpalignr $15, %xmm8, %xmm4, %xmm1
829 ext v8.16b, v8.16b, v8.16b, #15 // vpalignr $15, %xmm8, %xmm8, %xmm8
830 eor v7.16b, v7.16b, v1.16b // vpxor %xmm1, %xmm7, %xmm7
831
832 // rotate
833 dup v0.4s, v0.s[3] // vpshufd $0xFF, %xmm0, %xmm0
834 ext v0.16b, v0.16b, v0.16b, #1 // vpalignr $1, %xmm0, %xmm0, %xmm0
835
836 // fall through...
837
838 // low round: same as high round, but no rotation and no rcon.
839 _vpaes_schedule_low_round:
840 // smear xmm7
841 ext v1.16b, v4.16b, v7.16b, #12 // vpslldq $4, %xmm7, %xmm1
842 eor v7.16b, v7.16b, v1.16b // vpxor %xmm1, %xmm7, %xmm7
843 ext v4.16b, v4.16b, v7.16b, #8 // vpslldq $8, %xmm7, %xmm4
844
845 // subbytes
846 and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k
847 ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i
848 eor v7.16b, v7.16b, v4.16b // vpxor %xmm4, %xmm7, %xmm7
849 tbl v2.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
850 eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
851 tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
852 eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
853 tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
854 eor v7.16b, v7.16b, v16.16b // vpxor .Lk_s63(%rip), %xmm7, %xmm7
855 tbl v3.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm3 # 2 = 1/iak
856 eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
857 tbl v2.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm2 # 3 = 1/jak
858 eor v3.16b, v3.16b, v1.16b // vpxor %xmm1, %xmm3, %xmm3 # 2 = io
859 eor v2.16b, v2.16b, v0.16b // vpxor %xmm0, %xmm2, %xmm2 # 3 = jo
860 tbl v4.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm13, %xmm4 # 4 = sbou
861 tbl v1.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm12, %xmm1 # 0 = sb1t
862 eor v1.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm1 # 0 = sbox output
863
864 // add in smeared stuff
865 eor v0.16b, v1.16b, v7.16b // vpxor %xmm7, %xmm1, %xmm0
866 eor v7.16b, v1.16b, v7.16b // vmovdqa %xmm0, %xmm7
867 ret
868 .size _vpaes_schedule_round,.-_vpaes_schedule_round
869
870 ##
871 ## .aes_schedule_transform
872 ##
873 ## Linear-transform %xmm0 according to tables at (%r11)
874 ##
875 ## Requires that %xmm9 = 0x0F0F... as in preheat
876 ## Output in %xmm0
877 ## Clobbers %xmm1, %xmm2
878 ##
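##
## (This is the same low/high-nibble table-lookup pattern described
## above _vpaes_encrypt_core, applied with the table pair currently
## loaded in v20/v21.)
##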
879 .type _vpaes_schedule_transform,%function
880 .align 4
881 _vpaes_schedule_transform:
882 and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
883 ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0
884 // vmovdqa (%r11), %xmm2 # lo
885 tbl v2.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm2
886 // vmovdqa 16(%r11), %xmm1 # hi
887 tbl v0.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm1, %xmm0
888 eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
889 ret
890 .size _vpaes_schedule_transform,.-_vpaes_schedule_transform
891
892 ##
893 ## .aes_schedule_mangle
894 ##
895 ## Mangle xmm0 from (basis-transformed) standard version
896 ## to our version.
897 ##
898 ## On encrypt,
899 ## xor with 0x63
900 ## multiply by circulant 0,1,1,1
901 ## apply shiftrows transform
902 ##
903 ## On decrypt,
904 ## xor with 0x63
905 ## multiply by "inverse mixcolumns" circulant E,B,D,9
906 ## deskew
907 ## apply shiftrows transform
908 ##
909 ##
910 ## Writes out to (%rdx), and increments or decrements it
911 ## Keeps track of round number mod 4 in %r8
912 ## Preserves xmm0
913 ## Clobbers xmm1-xmm5
914 ##
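##
## (In the encrypting path below, the 0,1,1,1 circulant is produced by
## xoring together three successively rotated copies of the s63-xored
## key, using .Lk_mc_forward as the one-byte column rotation.)
##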
915 .type _vpaes_schedule_mangle,%function
916 .align 4
917 _vpaes_schedule_mangle:
918 mov v4.16b, v0.16b // vmovdqa %xmm0, %xmm4 # save xmm0 for later
919 // vmovdqa .Lk_mc_forward(%rip),%xmm5
920 cbnz w3, .Lschedule_mangle_dec
921
922 // encrypting
923 eor v4.16b, v0.16b, v16.16b // vpxor .Lk_s63(%rip), %xmm0, %xmm4
924 add x2, x2, #16 // add $16, %rdx
925 tbl v4.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm4
926 tbl v1.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm1
927 tbl v3.16b, {v1.16b}, v9.16b // vpshufb %xmm5, %xmm1, %xmm3
928 eor v4.16b, v4.16b, v1.16b // vpxor %xmm1, %xmm4, %xmm4
929 ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1
930 eor v3.16b, v3.16b, v4.16b // vpxor %xmm4, %xmm3, %xmm3
931
932 b .Lschedule_mangle_both
933 .align 4
934 .Lschedule_mangle_dec:
935 // inverse mix columns
936 // lea .Lk_dksd(%rip),%r11
937 ushr v1.16b, v4.16b, #4 // vpsrlb $4, %xmm4, %xmm1 # 1 = hi
938 and v4.16b, v4.16b, v17.16b // vpand %xmm9, %xmm4, %xmm4 # 4 = lo
939
940 // vmovdqa 0x00(%r11), %xmm2
941 tbl v2.16b, {v24.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2
942 // vmovdqa 0x10(%r11), %xmm3
943 tbl v3.16b, {v25.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
944 eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3
945 tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3
946
947 // vmovdqa 0x20(%r11), %xmm2
948 tbl v2.16b, {v26.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2
949 eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2
950 // vmovdqa 0x30(%r11), %xmm3
951 tbl v3.16b, {v27.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
952 eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3
953 tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3
954
955 // vmovdqa 0x40(%r11), %xmm2
956 tbl v2.16b, {v28.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2
957 eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2
958 // vmovdqa 0x50(%r11), %xmm3
959 tbl v3.16b, {v29.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
960 eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3
961
962 // vmovdqa 0x60(%r11), %xmm2
963 tbl v2.16b, {v30.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2
964 tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3
965 // vmovdqa 0x70(%r11), %xmm4
966 tbl v4.16b, {v31.16b}, v1.16b // vpshufb %xmm1, %xmm4, %xmm4
967 ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1
968 eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2
969 eor v3.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm3
970
971 sub x2, x2, #16 // add $-16, %rdx
972
973 .Lschedule_mangle_both:
974 tbl v3.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
975 add x8, x8, #64-16 // add $-16, %r8
976 and x8, x8, #~(1<<6) // and $0x30, %r8
977 st1 {v3.2d}, [x2] // vmovdqu %xmm3, (%rdx)
978 ret
979 .size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle
980
981 .globl vpaes_set_encrypt_key
982 .type vpaes_set_encrypt_key,%function
983 .align 4
984 vpaes_set_encrypt_key:
985 .inst 0xd503233f // paciasp
986 stp x29,x30,[sp,#-16]!
987 add x29,sp,#0
988 stp d8,d9,[sp,#-16]! // ABI spec says so
989
990 lsr w9, w1, #5 // shr $5,%eax
991 add w9, w9, #5 // $5,%eax
992 str w9, [x2,#240] // mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;
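	// (i.e. 9, 11 or 13 for 128/192/256-bit keys: the cores count only
	// the middle rounds, since the initial AddRoundKey is folded into
	// the input transform and the final round is handled after the loop)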
993
994 mov w3, #0 // mov $0,%ecx
995 mov x8, #0x30 // mov $0x30,%r8d
996 bl _vpaes_schedule_core
997 eor x0, x0, x0
998
999 ldp d8,d9,[sp],#16
1000 ldp x29,x30,[sp],#16
1001 .inst 0xd50323bf // autiasp
1002 ret
1003 .size vpaes_set_encrypt_key,.-vpaes_set_encrypt_key
1004
1005 .globl vpaes_set_decrypt_key
1006 .type vpaes_set_decrypt_key,%function
1007 .align 4
1008 vpaes_set_decrypt_key:
1009 .inst 0xd503233f // paciasp
1010 stp x29,x30,[sp,#-16]!
1011 add x29,sp,#0
1012 stp d8,d9,[sp,#-16]! // ABI spec says so
1013
1014 lsr w9, w1, #5 // shr $5,%eax
1015 add w9, w9, #5 // $5,%eax
1016 str w9, [x2,#240] // mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;
1017 lsl w9, w9, #4 // shl $4,%eax
1018 add x2, x2, #16 // lea 16(%rdx,%rax),%rdx
1019 add x2, x2, x9
1020
1021 mov w3, #1 // mov $1,%ecx
1022 lsr w8, w1, #1 // shr $1,%r8d
1023 and x8, x8, #32 // and $32,%r8d
1024 eor x8, x8, #32 // xor $32,%r8d # nbits==192?0:32
1025 bl _vpaes_schedule_core
1026
1027 ldp d8,d9,[sp],#16
1028 ldp x29,x30,[sp],#16
1029 .inst 0xd50323bf // autiasp
1030 ret
1031 .size vpaes_set_decrypt_key,.-vpaes_set_decrypt_key
1032 .globl vpaes_cbc_encrypt
1033 .type vpaes_cbc_encrypt,%function
1034 .align 4
1035 vpaes_cbc_encrypt:
1036 cbz x2, .Lcbc_abort
1037 cmp w5, #0 // check direction
1038 b.eq vpaes_cbc_decrypt
1039
1040 .inst 0xd503233f // paciasp
1041 stp x29,x30,[sp,#-16]!
1042 add x29,sp,#0
1043
1044 mov x17, x2 // reassign
1045 mov x2, x3 // reassign
1046
1047 ld1 {v0.16b}, [x4] // load ivec
1048 bl _vpaes_encrypt_preheat
1049 b .Lcbc_enc_loop
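	// CBC chaining, as implemented by the loop below.  In C terms (an
	// illustrative sketch, not assembled): c[i] = Encrypt(p[i] ^ c[i-1]),
	// with c[-1] = ivec; the final ciphertext block is written back to
	// the ivec buffer on exit.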
1050
1051 .align 4
1052 .Lcbc_enc_loop:
1053 ld1 {v7.16b}, [x0],#16 // load input
1054 eor v7.16b, v7.16b, v0.16b // xor with ivec
1055 bl _vpaes_encrypt_core
1056 st1 {v0.16b}, [x1],#16 // save output
1057 subs x17, x17, #16
1058 b.hi .Lcbc_enc_loop
1059
1060 st1 {v0.16b}, [x4] // write ivec
1061
1062 ldp x29,x30,[sp],#16
1063 .inst 0xd50323bf // autiasp
1064 .Lcbc_abort:
1065 ret
1066 .size vpaes_cbc_encrypt,.-vpaes_cbc_encrypt
1067
1068 .type vpaes_cbc_decrypt,%function
1069 .align 4
1070 vpaes_cbc_decrypt:
1071 .inst 0xd503233f // paciasp
1072 stp x29,x30,[sp,#-16]!
1073 add x29,sp,#0
1074 stp d8,d9,[sp,#-16]! // ABI spec says so
1075 stp d10,d11,[sp,#-16]!
1076 stp d12,d13,[sp,#-16]!
1077 stp d14,d15,[sp,#-16]!
1078
1079 mov x17, x2 // reassign
1080 mov x2, x3 // reassign
1081 ld1 {v6.16b}, [x4] // load ivec
1082 bl _vpaes_decrypt_preheat
1083 tst x17, #16
1084 b.eq .Lcbc_dec_loop2x
1085
1086 ld1 {v7.16b}, [x0], #16 // load input
1087 bl _vpaes_decrypt_core
1088 eor v0.16b, v0.16b, v6.16b // xor with ivec
1089 orr v6.16b, v7.16b, v7.16b // next ivec value
1090 st1 {v0.16b}, [x1], #16
1091 subs x17, x17, #16
1092 b.ls .Lcbc_dec_done
1093
1094 .align 4
1095 .Lcbc_dec_loop2x:
1096 ld1 {v14.16b,v15.16b}, [x0], #32
1097 bl _vpaes_decrypt_2x
1098 eor v0.16b, v0.16b, v6.16b // xor with ivec
1099 eor v1.16b, v1.16b, v14.16b
1100 orr v6.16b, v15.16b, v15.16b
1101 st1 {v0.16b,v1.16b}, [x1], #32
1102 subs x17, x17, #32
1103 b.hi .Lcbc_dec_loop2x
1104
1105 .Lcbc_dec_done:
1106 st1 {v6.16b}, [x4]
1107
1108 ldp d14,d15,[sp],#16
1109 ldp d12,d13,[sp],#16
1110 ldp d10,d11,[sp],#16
1111 ldp d8,d9,[sp],#16
1112 ldp x29,x30,[sp],#16
1113 .inst 0xd50323bf // autiasp
1114 ret
1115 .size vpaes_cbc_decrypt,.-vpaes_cbc_decrypt
1116 .globl vpaes_ecb_encrypt
1117 .type vpaes_ecb_encrypt,%function
1118 .align 4
1119 vpaes_ecb_encrypt:
1120 .inst 0xd503233f // paciasp
1121 stp x29,x30,[sp,#-16]!
1122 add x29,sp,#0
1123 stp d8,d9,[sp,#-16]! // ABI spec says so
1124 stp d10,d11,[sp,#-16]!
1125 stp d12,d13,[sp,#-16]!
1126 stp d14,d15,[sp,#-16]!
1127
1128 mov x17, x2
1129 mov x2, x3
1130 bl _vpaes_encrypt_preheat
1131 tst x17, #16
1132 b.eq .Lecb_enc_loop
1133
1134 ld1 {v7.16b}, [x0],#16
1135 bl _vpaes_encrypt_core
1136 st1 {v0.16b}, [x1],#16
1137 subs x17, x17, #16
1138 b.ls .Lecb_enc_done
1139
1140 .align 4
1141 .Lecb_enc_loop:
1142 ld1 {v14.16b,v15.16b}, [x0], #32
1143 bl _vpaes_encrypt_2x
1144 st1 {v0.16b,v1.16b}, [x1], #32
1145 subs x17, x17, #32
1146 b.hi .Lecb_enc_loop
1147
1148 .Lecb_enc_done:
1149 ldp d14,d15,[sp],#16
1150 ldp d12,d13,[sp],#16
1151 ldp d10,d11,[sp],#16
1152 ldp d8,d9,[sp],#16
1153 ldp x29,x30,[sp],#16
1154 .inst 0xd50323bf // autiasp
1155 ret
1156 .size vpaes_ecb_encrypt,.-vpaes_ecb_encrypt
1157
1158 .globl vpaes_ecb_decrypt
1159 .type vpaes_ecb_decrypt,%function
1160 .align 4
1161 vpaes_ecb_decrypt:
1162 .inst 0xd503233f // paciasp
1163 stp x29,x30,[sp,#-16]!
1164 add x29,sp,#0
1165 stp d8,d9,[sp,#-16]! // ABI spec says so
1166 stp d10,d11,[sp,#-16]!
1167 stp d12,d13,[sp,#-16]!
1168 stp d14,d15,[sp,#-16]!
1169
1170 mov x17, x2
1171 mov x2, x3
1172 bl _vpaes_decrypt_preheat
1173 tst x17, #16
1174 b.eq .Lecb_dec_loop
1175
1176 ld1 {v7.16b}, [x0],#16
	bl	_vpaes_decrypt_core
1178 st1 {v0.16b}, [x1],#16
1179 subs x17, x17, #16
1180 b.ls .Lecb_dec_done
1181
1182 .align 4
1183 .Lecb_dec_loop:
1184 ld1 {v14.16b,v15.16b}, [x0], #32
1185 bl _vpaes_decrypt_2x
1186 st1 {v0.16b,v1.16b}, [x1], #32
1187 subs x17, x17, #32
1188 b.hi .Lecb_dec_loop
1189
1190 .Lecb_dec_done:
1191 ldp d14,d15,[sp],#16
1192 ldp d12,d13,[sp],#16
1193 ldp d10,d11,[sp],#16
1194 ldp d8,d9,[sp],#16
1195 ldp x29,x30,[sp],#16
1196 .inst 0xd50323bf // autiasp
1197 ret
1198 .size vpaes_ecb_decrypt,.-vpaes_ecb_decrypt