1 /* $FreeBSD$ */
2 /* Do not modify. This file is auto-generated from ghashv8-armx.pl. */
3 #include "arm_arch.h"
4
5 #if __ARM_MAX_ARCH__>=7
6 .text
// gcm_init_v8: build the GHASH key table from the hash key H.
//
// In:   x0 = Htable, output buffer; six 16-byte entries (96 bytes) are written
//       x1 = pointer to the 16-byte input H
// Out:  Htable[0] = twisted H
//       Htable[1] = packed Karatsuba terms {H.lo^H.hi, H^2.lo^H^2.hi}
//       Htable[2] = H^2
//       Htable[3] = H^3
//       Htable[4] = packed Karatsuba terms {H^3.lo^H^3.hi, H^4.lo^H^4.hi}
//       Htable[5] = H^4
// Clobbers: v0-v7, v16-v22; x0 is left advanced by 48 bytes.
// NOTE(review): presumably called from C as gcm_init_v8(Htable, H); the
// prototype is not visible in this file -- confirm against the caller.
7 .globl gcm_init_v8
8 .type gcm_init_v8,%function
9 .align 4
10 gcm_init_v8:
11 ld1 {v17.2d},[x1] //load input H
12 movi v19.16b,#0xe1 //0xe1 in every byte
13 shl v19.2d,v19.2d,#57 //0xc2.0
14 ext v3.16b,v17.16b,v17.16b,#8 //swap the 64-bit halves of H
15 ushr v18.2d,v19.2d,#63 //1 in each 64-bit lane
16 dup v17.4s,v17.s[1] //replicate 32-bit word 1 of H
17 ext v16.16b,v18.16b,v19.16b,#8 //t0=0xc2....01
18 ushr v18.2d,v3.2d,#63 //bit shifted out of each half by the shl below
19 sshr v17.4s,v17.4s,#31 //broadcast carry bit
20 and v18.16b,v18.16b,v16.16b //keep only the low half's carry-out
21 shl v3.2d,v3.2d,#1
22 ext v18.16b,v18.16b,v18.16b,#8 //move that carry to the high half
23 and v16.16b,v16.16b,v17.16b //t0 if carry bit was set, else 0
24 orr v3.16b,v3.16b,v18.16b //H<<<=1
25 eor v20.16b,v3.16b,v16.16b //twisted H
26 st1 {v20.2d},[x0],#16 //store Htable[0]
27
28 //calculate H^2
29 ext v16.16b,v20.16b,v20.16b,#8 //Karatsuba pre-processing
30 pmull v0.1q,v20.1d,v20.1d //H.lo·H.lo
31 eor v16.16b,v16.16b,v20.16b //H.lo^H.hi in both halves
32 pmull2 v2.1q,v20.2d,v20.2d //H.hi·H.hi
33 pmull v1.1q,v16.1d,v16.1d //(H.lo+H.hi)·(H.lo+H.hi)
34
35 ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
36 eor v18.16b,v0.16b,v2.16b
37 eor v1.16b,v1.16b,v17.16b
38 eor v1.16b,v1.16b,v18.16b
39 pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
40
41 ins v2.d[0],v1.d[1]
42 ins v1.d[1],v0.d[0]
43 eor v0.16b,v1.16b,v18.16b
44
45 ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
46 pmull v0.1q,v0.1d,v19.1d
47 eor v18.16b,v18.16b,v2.16b
48 eor v22.16b,v0.16b,v18.16b //H^2
49
50 ext v17.16b,v22.16b,v22.16b,#8 //Karatsuba pre-processing
51 eor v17.16b,v17.16b,v22.16b //H^2.lo^H^2.hi in both halves
52 ext v21.16b,v16.16b,v17.16b,#8 //pack Karatsuba pre-processed
53 st1 {v21.2d,v22.2d},[x0],#32 //store Htable[1..2]
54 //calculate H^3 and H^4
//the two products (H·H^2 and H^2·H^2) are computed interleaved;
//v0/v1/v2 carry the H^3 product, v5/v6/v7 carry the H^4 product
55 pmull v0.1q,v20.1d, v22.1d
56 pmull v5.1q,v22.1d,v22.1d
57 pmull2 v2.1q,v20.2d, v22.2d
58 pmull2 v7.1q,v22.2d,v22.2d
59 pmull v1.1q,v16.1d,v17.1d
60 pmull v6.1q,v17.1d,v17.1d
61
62 ext v16.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
63 ext v17.16b,v5.16b,v7.16b,#8
64 eor v18.16b,v0.16b,v2.16b
65 eor v1.16b,v1.16b,v16.16b
66 eor v4.16b,v5.16b,v7.16b
67 eor v6.16b,v6.16b,v17.16b
68 eor v1.16b,v1.16b,v18.16b
69 pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
70 eor v6.16b,v6.16b,v4.16b
71 pmull v4.1q,v5.1d,v19.1d
72
73 ins v2.d[0],v1.d[1]
74 ins v7.d[0],v6.d[1]
75 ins v1.d[1],v0.d[0]
76 ins v6.d[1],v5.d[0]
77 eor v0.16b,v1.16b,v18.16b
78 eor v5.16b,v6.16b,v4.16b
79
80 ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
81 ext v4.16b,v5.16b,v5.16b,#8
82 pmull v0.1q,v0.1d,v19.1d
83 pmull v5.1q,v5.1d,v19.1d
84 eor v18.16b,v18.16b,v2.16b
85 eor v4.16b,v4.16b,v7.16b
86 eor v20.16b, v0.16b,v18.16b //H^3
87 eor v22.16b,v5.16b,v4.16b //H^4
88
89 ext v16.16b,v20.16b, v20.16b,#8 //Karatsuba pre-processing
90 ext v17.16b,v22.16b,v22.16b,#8
91 eor v16.16b,v16.16b,v20.16b
92 eor v17.16b,v17.16b,v22.16b
93 ext v21.16b,v16.16b,v17.16b,#8 //pack Karatsuba pre-processed
94 st1 {v20.2d,v21.2d,v22.2d},[x0] //store Htable[3..5]
95 ret
96 .size gcm_init_v8,.-gcm_init_v8
// gcm_gmult_v8: multiply Xi by H in place (a single GHASH multiplication).
//
// In:   x0 = pointer to the 16-byte Xi; read, then overwritten with the result
//       x1 = pointer to the key table; only the first 32 bytes are read
//            (twisted H followed by its packed Karatsuba term)
// Clobbers: v0-v3, v17-v21. No general-purpose register is modified.
// Unless __ARMEB__ is defined, Xi is byte-reversed within each 64-bit half
// after loading and again before storing.
97 .globl gcm_gmult_v8
98 .type gcm_gmult_v8,%function
99 .align 4
100 gcm_gmult_v8:
101 ld1 {v17.2d},[x0] //load Xi
102 movi v19.16b,#0xe1
103 ld1 {v20.2d,v21.2d},[x1] //load twisted H, ...
104 shl v19.2d,v19.2d,#57 //compose 0xc2.0 constant
105 #ifndef __ARMEB__
106 rev64 v17.16b,v17.16b
107 #endif
108 ext v3.16b,v17.16b,v17.16b,#8 //swap the 64-bit halves of Xi
109
110 pmull v0.1q,v20.1d,v3.1d //H.lo·Xi.lo
111 eor v17.16b,v17.16b,v3.16b //Karatsuba pre-processing
112 pmull2 v2.1q,v20.2d,v3.2d //H.hi·Xi.hi
113 pmull v1.1q,v21.1d,v17.1d //(H.lo+H.hi)·(Xi.lo+Xi.hi)
114
115 ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
116 eor v18.16b,v0.16b,v2.16b
117 eor v1.16b,v1.16b,v17.16b
118 eor v1.16b,v1.16b,v18.16b
119 pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
120
121 ins v2.d[0],v1.d[1]
122 ins v1.d[1],v0.d[0]
123 eor v0.16b,v1.16b,v18.16b
124
125 ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
126 pmull v0.1q,v0.1d,v19.1d
127 eor v18.16b,v18.16b,v2.16b
128 eor v0.16b,v0.16b,v18.16b //reduced product
129
130 #ifndef __ARMEB__
131 rev64 v0.16b,v0.16b
132 #endif
133 ext v0.16b,v0.16b,v0.16b,#8 //swap halves back for storing
134 st1 {v0.2d},[x0] //write out Xi
135
136 ret
137 .size gcm_gmult_v8,.-gcm_gmult_v8
// gcm_ghash_v8: fold a run of 16-byte input blocks into the GHASH state Xi.
//
// In:   x0 = pointer to the 16-byte Xi (read, then overwritten with the result)
//       x1 = pointer to the key table (twisted H, packed Karatsuba term, H^2)
//       x2 = input pointer
//       x3 = input length in bytes
// Lengths of 64 bytes or more are handed to the 4x code at .Lgcm_ghash_v8_4x.
// Shorter inputs are processed here two blocks (32 bytes) per loop iteration,
// with a single-block tail at .Lodd_tail_v8.
// Clobbers: v0-v7, v16-v22, x12, condition flags; x1, x2 and x3 are modified.
// NOTE(review): the length is not validated. It appears to be expected to be a
// nonzero multiple of 16: with x3 == 0 the code still loads one block from x2
// and hashes it via .Lodd_tail_v8 -- confirm callers never pass 0.
138 .globl gcm_ghash_v8
139 .type gcm_ghash_v8,%function
140 .align 4
141 gcm_ghash_v8:
142 cmp x3,#64
143 b.hs .Lgcm_ghash_v8_4x //64 bytes or more: use the 4-block code
144 ld1 {v0.2d},[x0] //load [rotated] Xi
145 //"[rotated]" means that
146 //loaded value would have
147 //to be rotated in order to
148 //make it appear as in
149 //algorithm specification
150 subs x3,x3,#32 //see if x3 is 32 or larger
151 mov x12,#16 //x12 is used as post-
152 //increment for input pointer;
153 //as loop is modulo-scheduled
154 //x12 is zeroed just in time
155 //to preclude overstepping
156 //inp[len], which means that
157 //last block[s] are actually
158 //loaded twice, but last
159 //copy is not processed
160 ld1 {v20.2d,v21.2d},[x1],#32 //load twisted H, ..., H^2
161 movi v19.16b,#0xe1
162 ld1 {v22.2d},[x1]
163 csel x12,xzr,x12,eq //is it time to zero x12?
164 ext v0.16b,v0.16b,v0.16b,#8 //rotate Xi
165 ld1 {v16.2d},[x2],#16 //load [rotated] I[0]
166 shl v19.2d,v19.2d,#57 //compose 0xc2.0 constant
167 #ifndef __ARMEB__
168 rev64 v16.16b,v16.16b
169 rev64 v0.16b,v0.16b
170 #endif
171 ext v3.16b,v16.16b,v16.16b,#8 //rotate I[0]
//flags tested below are still those of the subs above; none of the
//intervening instructions sets flags
172 b.lo .Lodd_tail_v8 //x3 was less than 32
173 ld1 {v17.2d},[x2],x12 //load [rotated] I[1]
174 #ifndef __ARMEB__
175 rev64 v17.16b,v17.16b
176 #endif
177 ext v7.16b,v17.16b,v17.16b,#8
178 eor v3.16b,v3.16b,v0.16b //I[i]^=Xi
179 pmull v4.1q,v20.1d,v7.1d //H·Ii+1
180 eor v17.16b,v17.16b,v7.16b //Karatsuba pre-processing
181 pmull2 v6.1q,v20.2d,v7.2d
182 b .Loop_mod2x_v8
183
184 .align 4
//Loop invariants on entry: v3 = Xi^I[i] (rotated), v4/v6 = low/high halves
//of H·I[i+1], v17 = Karatsuba-pre-processed I[i+1].
185 .Loop_mod2x_v8:
186 ext v18.16b,v3.16b,v3.16b,#8
187 subs x3,x3,#32 //is there more data?
188 pmull v0.1q,v22.1d,v3.1d //H^2.lo·Xi.lo
189 csel x12,xzr,x12,lo //is it time to zero x12?
190
191 pmull v5.1q,v21.1d,v17.1d
192 eor v18.16b,v18.16b,v3.16b //Karatsuba pre-processing
193 pmull2 v2.1q,v22.2d,v3.2d //H^2.hi·Xi.hi
194 eor v0.16b,v0.16b,v4.16b //accumulate
195 pmull2 v1.1q,v21.2d,v18.2d //(H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
196 ld1 {v16.2d},[x2],x12 //load [rotated] I[i+2]
197
198 eor v2.16b,v2.16b,v6.16b
199 csel x12,xzr,x12,eq //is it time to zero x12?
200 eor v1.16b,v1.16b,v5.16b
201
202 ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
203 eor v18.16b,v0.16b,v2.16b
204 eor v1.16b,v1.16b,v17.16b
205 ld1 {v17.2d},[x2],x12 //load [rotated] I[i+3]
206 #ifndef __ARMEB__
207 rev64 v16.16b,v16.16b
208 #endif
209 eor v1.16b,v1.16b,v18.16b
210 pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
211
212 #ifndef __ARMEB__
213 rev64 v17.16b,v17.16b
214 #endif
215 ins v2.d[0],v1.d[1]
216 ins v1.d[1],v0.d[0]
217 ext v7.16b,v17.16b,v17.16b,#8
218 ext v3.16b,v16.16b,v16.16b,#8
219 eor v0.16b,v1.16b,v18.16b
220 pmull v4.1q,v20.1d,v7.1d //H·Ii+1
221 eor v3.16b,v3.16b,v2.16b //accumulate v3.16b early
222
223 ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
224 pmull v0.1q,v0.1d,v19.1d
225 eor v3.16b,v3.16b,v18.16b
226 eor v17.16b,v17.16b,v7.16b //Karatsuba pre-processing
227 eor v3.16b,v3.16b,v0.16b
228 pmull2 v6.1q,v20.2d,v7.2d
//the branch below still tests the flags of the subs at the loop top
229 b.hs .Loop_mod2x_v8 //there were at least 32 more bytes
230
231 eor v2.16b,v2.16b,v18.16b
232 ext v3.16b,v16.16b,v16.16b,#8 //re-construct v3.16b
233 adds x3,x3,#32 //re-construct x3
234 eor v0.16b,v0.16b,v2.16b //re-construct v0.16b
235 b.eq .Ldone_v8 //is x3 zero?
//Single remaining block: v0 = current Xi, v16 = the block, v3 = rotated block.
236 .Lodd_tail_v8:
237 ext v18.16b,v0.16b,v0.16b,#8
238 eor v3.16b,v3.16b,v0.16b //inp^=Xi
239 eor v17.16b,v16.16b,v18.16b //v17.16b is rotated inp^Xi
240
241 pmull v0.1q,v20.1d,v3.1d //H.lo·Xi.lo
242 eor v17.16b,v17.16b,v3.16b //Karatsuba pre-processing
243 pmull2 v2.1q,v20.2d,v3.2d //H.hi·Xi.hi
244 pmull v1.1q,v21.1d,v17.1d //(H.lo+H.hi)·(Xi.lo+Xi.hi)
245
246 ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
247 eor v18.16b,v0.16b,v2.16b
248 eor v1.16b,v1.16b,v17.16b
249 eor v1.16b,v1.16b,v18.16b
250 pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
251
252 ins v2.d[0],v1.d[1]
253 ins v1.d[1],v0.d[0]
254 eor v0.16b,v1.16b,v18.16b
255
256 ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
257 pmull v0.1q,v0.1d,v19.1d
258 eor v18.16b,v18.16b,v2.16b
259 eor v0.16b,v0.16b,v18.16b
260
261 .Ldone_v8:
262 #ifndef __ARMEB__
263 rev64 v0.16b,v0.16b
264 #endif
265 ext v0.16b,v0.16b,v0.16b,#8 //undo the rotation before storing
266 st1 {v0.2d},[x0] //write out Xi
267
268 ret
269 .size gcm_ghash_v8,.-gcm_ghash_v8
// gcm_ghash_v8_4x: four-blocks-per-iteration GHASH path.
//
// No .globl is emitted for this symbol in this file; it is reached through
// the .Lgcm_ghash_v8_4x label from gcm_ghash_v8 when x3 >= 64.
// In:   x0 = pointer to the 16-byte Xi (read, then overwritten with the result)
//       x1 = pointer to the key table; six 16-byte entries are read
//            (H, packed term, H^2, H^3, packed term, H^4)
//       x2 = input pointer
//       x3 = input length in bytes (at least 64 when entered from gcm_ghash_v8)
// Clobbers: v0-v7, v16-v31, condition flags; x1, x2 and x3 are modified.
// Register roles: v19 = 0xc2.0 reduction constant; v29/v30/v31 accumulate the
// low/middle/high Karatsuba partial products of the trailing blocks; v0/v1/v2
// hold the low/middle/high partial products of the current total.
270 .type gcm_ghash_v8_4x,%function
271 .align 4
272 gcm_ghash_v8_4x:
273 .Lgcm_ghash_v8_4x:
274 ld1 {v0.2d},[x0] //load [rotated] Xi
275 ld1 {v20.2d,v21.2d,v22.2d},[x1],#48 //load twisted H, ..., H^2
276 movi v19.16b,#0xe1
277 ld1 {v26.2d,v27.2d,v28.2d},[x1] //load twisted H^3, ..., H^4
278 shl v19.2d,v19.2d,#57 //compose 0xc2.0 constant
279
280 ld1 {v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64 //load the first four input blocks
281 #ifndef __ARMEB__
282 rev64 v0.16b,v0.16b
283 rev64 v5.16b,v5.16b
284 rev64 v6.16b,v6.16b
285 rev64 v7.16b,v7.16b
286 rev64 v4.16b,v4.16b
287 #endif
288 ext v25.16b,v7.16b,v7.16b,#8
289 ext v24.16b,v6.16b,v6.16b,#8
290 ext v23.16b,v5.16b,v5.16b,#8
291
292 pmull v29.1q,v20.1d,v25.1d //H·Ii+3
293 eor v7.16b,v7.16b,v25.16b //Karatsuba pre-processing
294 pmull2 v31.1q,v20.2d,v25.2d
295 pmull v30.1q,v21.1d,v7.1d
296
297 pmull v16.1q,v22.1d,v24.1d //H^2·Ii+2
298 eor v6.16b,v6.16b,v24.16b //Karatsuba pre-processing
299 pmull2 v24.1q,v22.2d,v24.2d
300 pmull2 v6.1q,v21.2d,v6.2d
301
302 eor v29.16b,v29.16b,v16.16b //accumulate
303 eor v31.16b,v31.16b,v24.16b
304 eor v30.16b,v30.16b,v6.16b
305
306 pmull v7.1q,v26.1d,v23.1d //H^3·Ii+1
307 eor v5.16b,v5.16b,v23.16b //Karatsuba pre-processing
308 pmull2 v23.1q,v26.2d,v23.2d
309 pmull v5.1q,v27.1d,v5.1d
310
311 eor v29.16b,v29.16b,v7.16b //accumulate
312 eor v31.16b,v31.16b,v23.16b
313 eor v30.16b,v30.16b,v5.16b
314
//The loop body pre-loads the next 64 bytes, so it is entered only when at
//least 128 bytes were supplied; x3 is kept biased by -128 while looping.
315 subs x3,x3,#128
316 b.lo .Ltail4x
317
318 b .Loop4x
319
320 .align 4
321 .Loop4x:
322 eor v16.16b,v4.16b,v0.16b //Xi+Ii
323 ld1 {v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64 //load the next four input blocks
324 ext v3.16b,v16.16b,v16.16b,#8
325 #ifndef __ARMEB__
326 rev64 v5.16b,v5.16b
327 rev64 v6.16b,v6.16b
328 rev64 v7.16b,v7.16b
329 rev64 v4.16b,v4.16b
330 #endif
331
332 pmull v0.1q,v28.1d,v3.1d //H^4·(Xi+Ii)
333 eor v16.16b,v16.16b,v3.16b //Karatsuba pre-processing
334 pmull2 v2.1q,v28.2d,v3.2d
335 ext v25.16b,v7.16b,v7.16b,#8
336 pmull2 v1.1q,v27.2d,v16.2d
337
338 eor v0.16b,v0.16b,v29.16b //fold in the accumulated partial products
339 eor v2.16b,v2.16b,v31.16b
340 ext v24.16b,v6.16b,v6.16b,#8
341 eor v1.16b,v1.16b,v30.16b
342 ext v23.16b,v5.16b,v5.16b,#8
343
344 ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
345 eor v18.16b,v0.16b,v2.16b
346 pmull v29.1q,v20.1d,v25.1d //H·Ii+3
347 eor v7.16b,v7.16b,v25.16b
348 eor v1.16b,v1.16b,v17.16b
349 pmull2 v31.1q,v20.2d,v25.2d
350 eor v1.16b,v1.16b,v18.16b
351 pmull v30.1q,v21.1d,v7.1d
352
353 pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
354 ins v2.d[0],v1.d[1]
355 ins v1.d[1],v0.d[0]
356 pmull v16.1q,v22.1d,v24.1d //H^2·Ii+2
357 eor v6.16b,v6.16b,v24.16b
358 pmull2 v24.1q,v22.2d,v24.2d
359 eor v0.16b,v1.16b,v18.16b
360 pmull2 v6.1q,v21.2d,v6.2d
361
362 eor v29.16b,v29.16b,v16.16b
363 eor v31.16b,v31.16b,v24.16b
364 eor v30.16b,v30.16b,v6.16b
365
366 ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
367 pmull v0.1q,v0.1d,v19.1d
368 pmull v7.1q,v26.1d,v23.1d //H^3·Ii+1
369 eor v5.16b,v5.16b,v23.16b
370 eor v18.16b,v18.16b,v2.16b
371 pmull2 v23.1q,v26.2d,v23.2d
372 pmull v5.1q,v27.1d,v5.1d
373
374 eor v0.16b,v0.16b,v18.16b
375 eor v29.16b,v29.16b,v7.16b
376 eor v31.16b,v31.16b,v23.16b
377 ext v0.16b,v0.16b,v0.16b,#8
378 eor v30.16b,v30.16b,v5.16b
379
380 subs x3,x3,#64
381 b.hs .Loop4x //at least 64 more bytes beyond those already loaded
382
//Finish the four blocks already held in registers, leaving the product
//unreduced in v0/v1/v2, then dispatch on the number of bytes still unread.
383 .Ltail4x:
384 eor v16.16b,v4.16b,v0.16b //Xi+Ii
385 ext v3.16b,v16.16b,v16.16b,#8
386
387 pmull v0.1q,v28.1d,v3.1d //H^4·(Xi+Ii)
388 eor v16.16b,v16.16b,v3.16b //Karatsuba pre-processing
389 pmull2 v2.1q,v28.2d,v3.2d
390 pmull2 v1.1q,v27.2d,v16.2d
391
392 eor v0.16b,v0.16b,v29.16b
393 eor v2.16b,v2.16b,v31.16b
394 eor v1.16b,v1.16b,v30.16b
395
396 adds x3,x3,#64 //x3 = bytes still unread
397 b.eq .Ldone4x //none left
398
399 cmp x3,#32
400 b.lo .Lone //fewer than 32 bytes left
401 b.eq .Ltwo //exactly 32 bytes left
//otherwise fall through
402 .Lthree:
403 ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
404 eor v18.16b,v0.16b,v2.16b
405 eor v1.16b,v1.16b,v17.16b
406 ld1 {v4.2d,v5.2d,v6.2d},[x2] //load the last three blocks
407 eor v1.16b,v1.16b,v18.16b
408 #ifndef __ARMEB__
409 rev64 v5.16b,v5.16b
410 rev64 v6.16b,v6.16b
411 rev64 v4.16b,v4.16b
412 #endif
413
414 pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
415 ins v2.d[0],v1.d[1]
416 ins v1.d[1],v0.d[0]
417 ext v24.16b,v6.16b,v6.16b,#8
418 ext v23.16b,v5.16b,v5.16b,#8
419 eor v0.16b,v1.16b,v18.16b
420
421 pmull v29.1q,v20.1d,v24.1d //H·Ii+2
422 eor v6.16b,v6.16b,v24.16b
423
424 ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
425 pmull v0.1q,v0.1d,v19.1d
426 eor v18.16b,v18.16b,v2.16b
427 pmull2 v31.1q,v20.2d,v24.2d
428 pmull v30.1q,v21.1d,v6.1d
429 eor v0.16b,v0.16b,v18.16b
430 pmull v7.1q,v22.1d,v23.1d //H^2·Ii+1
431 eor v5.16b,v5.16b,v23.16b
432 ext v0.16b,v0.16b,v0.16b,#8
433
434 pmull2 v23.1q,v22.2d,v23.2d
435 eor v16.16b,v4.16b,v0.16b //Xi+Ii
436 pmull2 v5.1q,v21.2d,v5.2d
437 ext v3.16b,v16.16b,v16.16b,#8
438
439 eor v29.16b,v29.16b,v7.16b
440 eor v31.16b,v31.16b,v23.16b
441 eor v30.16b,v30.16b,v5.16b
442
443 pmull v0.1q,v26.1d,v3.1d //H^3·(Xi+Ii)
444 eor v16.16b,v16.16b,v3.16b
445 pmull2 v2.1q,v26.2d,v3.2d
446 pmull v1.1q,v27.1d,v16.1d
447
448 eor v0.16b,v0.16b,v29.16b
449 eor v2.16b,v2.16b,v31.16b
450 eor v1.16b,v1.16b,v30.16b
451 b .Ldone4x
452
453 .align 4
454 .Ltwo:
455 ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
456 eor v18.16b,v0.16b,v2.16b
457 eor v1.16b,v1.16b,v17.16b
458 ld1 {v4.2d,v5.2d},[x2] //load the last two blocks
459 eor v1.16b,v1.16b,v18.16b
460 #ifndef __ARMEB__
461 rev64 v5.16b,v5.16b
462 rev64 v4.16b,v4.16b
463 #endif
464
465 pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
466 ins v2.d[0],v1.d[1]
467 ins v1.d[1],v0.d[0]
468 ext v23.16b,v5.16b,v5.16b,#8
469 eor v0.16b,v1.16b,v18.16b
470
471 ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
472 pmull v0.1q,v0.1d,v19.1d
473 eor v18.16b,v18.16b,v2.16b
474 eor v0.16b,v0.16b,v18.16b
475 ext v0.16b,v0.16b,v0.16b,#8
476
477 pmull v29.1q,v20.1d,v23.1d //H·Ii+1
478 eor v5.16b,v5.16b,v23.16b
479
480 eor v16.16b,v4.16b,v0.16b //Xi+Ii
481 ext v3.16b,v16.16b,v16.16b,#8
482
483 pmull2 v31.1q,v20.2d,v23.2d
484 pmull v30.1q,v21.1d,v5.1d
485
486 pmull v0.1q,v22.1d,v3.1d //H^2·(Xi+Ii)
487 eor v16.16b,v16.16b,v3.16b
488 pmull2 v2.1q,v22.2d,v3.2d
489 pmull2 v1.1q,v21.2d,v16.2d
490
491 eor v0.16b,v0.16b,v29.16b
492 eor v2.16b,v2.16b,v31.16b
493 eor v1.16b,v1.16b,v30.16b
494 b .Ldone4x
495
496 .align 4
497 .Lone:
498 ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
499 eor v18.16b,v0.16b,v2.16b
500 eor v1.16b,v1.16b,v17.16b
501 ld1 {v4.2d},[x2] //load the last block
502 eor v1.16b,v1.16b,v18.16b
503 #ifndef __ARMEB__
504 rev64 v4.16b,v4.16b
505 #endif
506
507 pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
508 ins v2.d[0],v1.d[1]
509 ins v1.d[1],v0.d[0]
510 eor v0.16b,v1.16b,v18.16b
511
512 ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
513 pmull v0.1q,v0.1d,v19.1d
514 eor v18.16b,v18.16b,v2.16b
515 eor v0.16b,v0.16b,v18.16b
516 ext v0.16b,v0.16b,v0.16b,#8
517
518 eor v16.16b,v4.16b,v0.16b //Xi+Ii
519 ext v3.16b,v16.16b,v16.16b,#8
520
521 pmull v0.1q,v20.1d,v3.1d //H·(Xi+Ii)
522 eor v16.16b,v16.16b,v3.16b
523 pmull2 v2.1q,v20.2d,v3.2d
524 pmull v1.1q,v21.1d,v16.1d
525
//Final reduction of the unreduced product held in v0/v1/v2.
526 .Ldone4x:
527 ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
528 eor v18.16b,v0.16b,v2.16b
529 eor v1.16b,v1.16b,v17.16b
530 eor v1.16b,v1.16b,v18.16b
531
532 pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
533 ins v2.d[0],v1.d[1]
534 ins v1.d[1],v0.d[0]
535 eor v0.16b,v1.16b,v18.16b
536
537 ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
538 pmull v0.1q,v0.1d,v19.1d
539 eor v18.16b,v18.16b,v2.16b
540 eor v0.16b,v0.16b,v18.16b
541 ext v0.16b,v0.16b,v0.16b,#8
542
543 #ifndef __ARMEB__
544 rev64 v0.16b,v0.16b
545 #endif
546 st1 {v0.2d},[x0] //write out Xi
547
548 ret
549 .size gcm_ghash_v8_4x,.-gcm_ghash_v8_4x
// NUL-terminated ASCII identification string, emitted as raw bytes:
// "GHASH for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
550 .byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
551 .align 2
552 .align 2
553 #endif
Cache object: d2788a6c8bd3658bd98a0d8967ef226f
|