1 /* $FreeBSD$ */
2 /* Do not modify. This file is auto-generated from ghash-armv4.pl. */
3 #include "arm_arch.h"
4
5 .text
6 #if defined(__thumb2__) || defined(__clang__)
7 .syntax unified
8 #define ldrplb ldrbpl
9 #define ldrneb ldrbne
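@ Under unified syntax the condition code follows the size suffix, so map
@ the divided-syntax mnemonics used below onto their unified spellings.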
10 #endif
11 #if defined(__thumb2__)
12 .thumb
13 #else
14 .code 32
15 #endif
16
17 .type rem_4bit,%object
18 .align 5
19 rem_4bit:
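@ Reduction constants for the 4-bit table-driven GHASH: entry i holds the
@ carry-less product i*0x1c2, pre-shifted left by 4 bits, which is XORed
@ into the high word whenever a nibble is shifted out of the accumulator.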
20 .short 0x0000,0x1C20,0x3840,0x2460
21 .short 0x7080,0x6CA0,0x48C0,0x54E0
22 .short 0xE100,0xFD20,0xD940,0xC560
23 .short 0x9180,0x8DA0,0xA9C0,0xB5E0
24 .size rem_4bit,.-rem_4bit
25
26 .type rem_4bit_get,%function
27 rem_4bit_get:
28 #if defined(__thumb2__)
29 adr r2,rem_4bit
30 #else
31 sub r2,pc,#8+32 @ &rem_4bit
32 #endif
33 b .Lrem_4bit_got
34 nop
35 nop
36 .size rem_4bit_get,.-rem_4bit_get
37
38 .globl gcm_ghash_4bit
39 .type gcm_ghash_4bit,%function
40 .align 4
41 gcm_ghash_4bit:
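@ Arguments: r0 = Xi (128-bit hash value, updated in place), r1 = 4-bit
@ Htable, r2 = input, r3 = input length in bytes. Assumed C prototype:
@ void gcm_ghash_4bit(u64 Xi[2], const u128 Htable[16],
@                     const u8 *inp, size_t len);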
42 #if defined(__thumb2__)
43 adr r12,rem_4bit
44 #else
45 sub r12,pc,#8+48 @ &rem_4bit
46 #endif
47 add r3,r2,r3 @ r3 to point at the end
48 stmdb sp!,{r3,r4,r5,r6,r7,r8,r9,r10,r11,lr} @ save r3/end too
49
50 ldmia r12,{r4,r5,r6,r7,r8,r9,r10,r11} @ copy rem_4bit ...
51 stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11} @ ... to stack
52
53 ldrb r12,[r2,#15]
54 ldrb r14,[r0,#15]
55 .Louter:
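@ One 16-byte input block per iteration: Xi^inp is consumed one nibble at
@ a time, from byte 15 down to byte 0.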
56 eor r12,r12,r14
57 and r14,r12,#0xf0
58 and r12,r12,#0x0f
59 mov r3,#14
60
61 add r7,r1,r12,lsl#4
62 ldmia r7,{r4,r5,r6,r7} @ load Htbl[nlo]
63 add r11,r1,r14
64 ldrb r12,[r2,#14]
65
66 and r14,r4,#0xf @ rem
67 ldmia r11,{r8,r9,r10,r11} @ load Htbl[nhi]
68 add r14,r14,r14
69 eor r4,r8,r4,lsr#4
70 ldrh r8,[sp,r14] @ rem_4bit[rem]
71 eor r4,r4,r5,lsl#28
72 ldrb r14,[r0,#14]
73 eor r5,r9,r5,lsr#4
74 eor r5,r5,r6,lsl#28
75 eor r6,r10,r6,lsr#4
76 eor r6,r6,r7,lsl#28
77 eor r7,r11,r7,lsr#4
78 eor r12,r12,r14
79 and r14,r12,#0xf0
80 and r12,r12,#0x0f
81 eor r7,r7,r8,lsl#16
82
83 .Linner:
84 add r11,r1,r12,lsl#4
85 and r12,r4,#0xf @ rem
86 subs r3,r3,#1
87 add r12,r12,r12
88 ldmia r11,{r8,r9,r10,r11} @ load Htbl[nlo]
89 eor r4,r8,r4,lsr#4
90 eor r4,r4,r5,lsl#28
91 eor r5,r9,r5,lsr#4
92 eor r5,r5,r6,lsl#28
93 ldrh r8,[sp,r12] @ rem_4bit[rem]
94 eor r6,r10,r6,lsr#4
95 #ifdef __thumb2__
96 it pl
97 #endif
98 ldrplb r12,[r2,r3]
99 eor r6,r6,r7,lsl#28
100 eor r7,r11,r7,lsr#4
101
102 add r11,r1,r14
103 and r14,r4,#0xf @ rem
104 eor r7,r7,r8,lsl#16 @ ^= rem_4bit[rem]
105 add r14,r14,r14
106 ldmia r11,{r8,r9,r10,r11} @ load Htbl[nhi]
107 eor r4,r8,r4,lsr#4
108 #ifdef __thumb2__
109 it pl
110 #endif
111 ldrplb r8,[r0,r3]
112 eor r4,r4,r5,lsl#28
113 eor r5,r9,r5,lsr#4
114 ldrh r9,[sp,r14]
115 eor r5,r5,r6,lsl#28
116 eor r6,r10,r6,lsr#4
117 eor r6,r6,r7,lsl#28
118 #ifdef __thumb2__
119 it pl
120 #endif
121 eorpl r12,r12,r8
122 eor r7,r11,r7,lsr#4
123 #ifdef __thumb2__
124 itt pl
125 #endif
126 andpl r14,r12,#0xf0
127 andpl r12,r12,#0x0f
128 eor r7,r7,r9,lsl#16 @ ^= rem_4bit[rem]
129 bpl .Linner
130
131 ldr r3,[sp,#32] @ re-load r3/end
132 add r2,r2,#16
133 mov r14,r4
134 #if __ARM_ARCH__>=7 && defined(__ARMEL__)
135 rev r4,r4
136 str r4,[r0,#12]
137 #elif defined(__ARMEB__)
138 str r4,[r0,#12]
139 #else
140 mov r9,r4,lsr#8
141 strb r4,[r0,#12+3]
142 mov r10,r4,lsr#16
143 strb r9,[r0,#12+2]
144 mov r11,r4,lsr#24
145 strb r10,[r0,#12+1]
146 strb r11,[r0,#12]
147 #endif
148 cmp r2,r3
149 #if __ARM_ARCH__>=7 && defined(__ARMEL__)
150 rev r5,r5
151 str r5,[r0,#8]
152 #elif defined(__ARMEB__)
153 str r5,[r0,#8]
154 #else
155 mov r9,r5,lsr#8
156 strb r5,[r0,#8+3]
157 mov r10,r5,lsr#16
158 strb r9,[r0,#8+2]
159 mov r11,r5,lsr#24
160 strb r10,[r0,#8+1]
161 strb r11,[r0,#8]
162 #endif
163
164 #ifdef __thumb2__
165 it ne
166 #endif
167 ldrneb r12,[r2,#15]
168 #if __ARM_ARCH__>=7 && defined(__ARMEL__)
169 rev r6,r6
170 str r6,[r0,#4]
171 #elif defined(__ARMEB__)
172 str r6,[r0,#4]
173 #else
174 mov r9,r6,lsr#8
175 strb r6,[r0,#4+3]
176 mov r10,r6,lsr#16
177 strb r9,[r0,#4+2]
178 mov r11,r6,lsr#24
179 strb r10,[r0,#4+1]
180 strb r11,[r0,#4]
181 #endif
182
183 #if __ARM_ARCH__>=7 && defined(__ARMEL__)
184 rev r7,r7
185 str r7,[r0,#0]
186 #elif defined(__ARMEB__)
187 str r7,[r0,#0]
188 #else
189 mov r9,r7,lsr#8
190 strb r7,[r0,#0+3]
191 mov r10,r7,lsr#16
192 strb r9,[r0,#0+2]
193 mov r11,r7,lsr#24
194 strb r10,[r0,#0+1]
195 strb r11,[r0,#0]
196 #endif
197
198 bne .Louter
199
200 add sp,sp,#36
201 #if __ARM_ARCH__>=5
202 ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,pc}
203 #else
204 ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,lr}
205 tst lr,#1
206 moveq pc,lr @ be binary compatible with V4, yet
207 .word 0xe12fff1e @ interoperable with Thumb ISA:-)
208 #endif
209 .size gcm_ghash_4bit,.-gcm_ghash_4bit
210
211 .globl gcm_gmult_4bit
212 .type gcm_gmult_4bit,%function
213 gcm_gmult_4bit:
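@ Arguments: r0 = Xi (updated in place), r1 = 4-bit Htable. Assumed C
@ prototype: void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16]);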
214 stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11,lr}
215 ldrb r12,[r0,#15]
216 b rem_4bit_get
217 .Lrem_4bit_got:
218 and r14,r12,#0xf0
219 and r12,r12,#0x0f
220 mov r3,#14
221
222 add r7,r1,r12,lsl#4
223 ldmia r7,{r4,r5,r6,r7} @ load Htbl[nlo]
224 ldrb r12,[r0,#14]
225
226 add r11,r1,r14
227 and r14,r4,#0xf @ rem
228 ldmia r11,{r8,r9,r10,r11} @ load Htbl[nhi]
229 add r14,r14,r14
230 eor r4,r8,r4,lsr#4
231 ldrh r8,[r2,r14] @ rem_4bit[rem]
232 eor r4,r4,r5,lsl#28
233 eor r5,r9,r5,lsr#4
234 eor r5,r5,r6,lsl#28
235 eor r6,r10,r6,lsr#4
236 eor r6,r6,r7,lsl#28
237 eor r7,r11,r7,lsr#4
238 and r14,r12,#0xf0
239 eor r7,r7,r8,lsl#16
240 and r12,r12,#0x0f
241
242 .Loop:
243 add r11,r1,r12,lsl#4
244 and r12,r4,#0xf @ rem
245 subs r3,r3,#1
246 add r12,r12,r12
247 ldmia r11,{r8,r9,r10,r11} @ load Htbl[nlo]
248 eor r4,r8,r4,lsr#4
249 eor r4,r4,r5,lsl#28
250 eor r5,r9,r5,lsr#4
251 eor r5,r5,r6,lsl#28
252 ldrh r8,[r2,r12] @ rem_4bit[rem]
253 eor r6,r10,r6,lsr#4
254 #ifdef __thumb2__
255 it pl
256 #endif
257 ldrplb r12,[r0,r3]
258 eor r6,r6,r7,lsl#28
259 eor r7,r11,r7,lsr#4
260
261 add r11,r1,r14
262 and r14,r4,#0xf @ rem
263 eor r7,r7,r8,lsl#16 @ ^= rem_4bit[rem]
264 add r14,r14,r14
265 ldmia r11,{r8,r9,r10,r11} @ load Htbl[nhi]
266 eor r4,r8,r4,lsr#4
267 eor r4,r4,r5,lsl#28
268 eor r5,r9,r5,lsr#4
269 ldrh r8,[r2,r14] @ rem_4bit[rem]
270 eor r5,r5,r6,lsl#28
271 eor r6,r10,r6,lsr#4
272 eor r6,r6,r7,lsl#28
273 eor r7,r11,r7,lsr#4
274 #ifdef __thumb2__
275 itt pl
276 #endif
277 andpl r14,r12,#0xf0
278 andpl r12,r12,#0x0f
279 eor r7,r7,r8,lsl#16 @ ^= rem_4bit[rem]
280 bpl .Loop
281 #if __ARM_ARCH__>=7 && defined(__ARMEL__)
282 rev r4,r4
283 str r4,[r0,#12]
284 #elif defined(__ARMEB__)
285 str r4,[r0,#12]
286 #else
287 mov r9,r4,lsr#8
288 strb r4,[r0,#12+3]
289 mov r10,r4,lsr#16
290 strb r9,[r0,#12+2]
291 mov r11,r4,lsr#24
292 strb r10,[r0,#12+1]
293 strb r11,[r0,#12]
294 #endif
295
296 #if __ARM_ARCH__>=7 && defined(__ARMEL__)
297 rev r5,r5
298 str r5,[r0,#8]
299 #elif defined(__ARMEB__)
300 str r5,[r0,#8]
301 #else
302 mov r9,r5,lsr#8
303 strb r5,[r0,#8+3]
304 mov r10,r5,lsr#16
305 strb r9,[r0,#8+2]
306 mov r11,r5,lsr#24
307 strb r10,[r0,#8+1]
308 strb r11,[r0,#8]
309 #endif
310
311 #if __ARM_ARCH__>=7 && defined(__ARMEL__)
312 rev r6,r6
313 str r6,[r0,#4]
314 #elif defined(__ARMEB__)
315 str r6,[r0,#4]
316 #else
317 mov r9,r6,lsr#8
318 strb r6,[r0,#4+3]
319 mov r10,r6,lsr#16
320 strb r9,[r0,#4+2]
321 mov r11,r6,lsr#24
322 strb r10,[r0,#4+1]
323 strb r11,[r0,#4]
324 #endif
325
326 #if __ARM_ARCH__>=7 && defined(__ARMEL__)
327 rev r7,r7
328 str r7,[r0,#0]
329 #elif defined(__ARMEB__)
330 str r7,[r0,#0]
331 #else
332 mov r9,r7,lsr#8
333 strb r7,[r0,#0+3]
334 mov r10,r7,lsr#16
335 strb r9,[r0,#0+2]
336 mov r11,r7,lsr#24
337 strb r10,[r0,#0+1]
338 strb r11,[r0,#0]
339 #endif
340
341 #if __ARM_ARCH__>=5
342 ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,pc}
343 #else
344 ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,lr}
345 tst lr,#1
346 moveq pc,lr @ be binary compatible with V4, yet
347 .word 0xe12fff1e @ interoperable with Thumb ISA:-)
348 #endif
349 .size gcm_gmult_4bit,.-gcm_gmult_4bit
350 #if __ARM_MAX_ARCH__>=7
351 .arch armv7-a
352 .fpu neon
353
354 .globl gcm_init_neon
355 .type gcm_init_neon,%function
356 .align 4
357 gcm_init_neon:
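@ Arguments: r0 receives the "twisted" H, r1 points at H as computed by
@ the block cipher. Assumed C prototype:
@ void gcm_init_neon(u128 Htable[16], const u64 H[2]);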
358 vld1.64 d7,[r1]! @ load H
359 vmov.i8 q8,#0xe1
360 vld1.64 d6,[r1]
361 vshl.i64 d17,#57
362 vshr.u64 d16,#63 @ t0=0xc2....01
363 vdup.8 q9,d7[7]
364 vshr.u64 d26,d6,#63
365 vshr.s8 q9,#7 @ broadcast carry bit
366 vshl.i64 q3,q3,#1
367 vand q8,q8,q9
368 vorr d7,d26 @ H<<<=1
369 veor q3,q3,q8 @ twisted H
370 vstmia r0,{q3}
371
372 bx lr @ bx lr
373 .size gcm_init_neon,.-gcm_init_neon
374
375 .globl gcm_gmult_neon
376 .type gcm_gmult_neon,%function
377 .align 4
378 gcm_gmult_neon:
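@ Arguments: r0 = Xi (updated in place), r1 = twisted H from gcm_init_neon.
@ Assumed C prototype: void gcm_gmult_neon(u64 Xi[2], const u128 Htable[16]);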
379 vld1.64 d7,[r0]! @ load Xi
380 vld1.64 d6,[r0]!
381 vmov.i64 d29,#0x0000ffffffffffff
382 vldmia r1,{d26,d27} @ load twisted H
383 vmov.i64 d30,#0x00000000ffffffff
384 #ifdef __ARMEL__
385 vrev64.8 q3,q3
386 #endif
387 vmov.i64 d31,#0x000000000000ffff
388 veor d28,d26,d27 @ Karatsuba pre-processing
389 mov r3,#16
390 b .Lgmult_neon
391 .size gcm_gmult_neon,.-gcm_gmult_neon
392
393 .globl gcm_ghash_neon
394 .type gcm_ghash_neon,%function
395 .align 4
396 gcm_ghash_neon:
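@ Arguments: r0 = Xi (updated in place), r1 = twisted H from gcm_init_neon,
@ r2 = input, r3 = input length in bytes. Assumed C prototype:
@ void gcm_ghash_neon(u64 Xi[2], const u128 Htable[16],
@                     const u8 *inp, size_t len);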
397 vld1.64 d1,[r0]! @ load Xi
398 vld1.64 d0,[r0]!
399 vmov.i64 d29,#0x0000ffffffffffff
400 vldmia r1,{d26,d27} @ load twisted H
401 vmov.i64 d30,#0x00000000ffffffff
402 #ifdef __ARMEL__
403 vrev64.8 q0,q0
404 #endif
405 vmov.i64 d31,#0x000000000000ffff
406 veor d28,d26,d27 @ Karatsuba pre-processing
407
408 .Loop_neon:
409 vld1.64 d7,[r2]! @ load inp
410 vld1.64 d6,[r2]!
411 #ifdef __ARMEL__
412 vrev64.8 q3,q3
413 #endif
414 veor q3,q0 @ inp^=Xi
415 .Lgmult_neon:
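@ One GF(2^128) multiply of q3 (Xi^inp) by the twisted H: three 64x64-bit
@ carry-less products (low, Karatsuba middle term, high), each synthesized
@ from 8x8-bit vmull.p8 multiplies, followed by the reduction further down.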
416 vext.8 d16, d26, d26, #1 @ A1
417 vmull.p8 q8, d16, d6 @ F = A1*B
418 vext.8 d0, d6, d6, #1 @ B1
419 vmull.p8 q0, d26, d0 @ E = A*B1
420 vext.8 d18, d26, d26, #2 @ A2
421 vmull.p8 q9, d18, d6 @ H = A2*B
422 vext.8 d22, d6, d6, #2 @ B2
423 vmull.p8 q11, d26, d22 @ G = A*B2
424 vext.8 d20, d26, d26, #3 @ A3
425 veor q8, q8, q0 @ L = E + F
426 vmull.p8 q10, d20, d6 @ J = A3*B
427 vext.8 d0, d6, d6, #3 @ B3
428 veor q9, q9, q11 @ M = G + H
429 vmull.p8 q0, d26, d0 @ I = A*B3
430 veor d16, d16, d17 @ t0 = (L) (P0 + P1) << 8
431 vand d17, d17, d29
432 vext.8 d22, d6, d6, #4 @ B4
433 veor d18, d18, d19 @ t1 = (M) (P2 + P3) << 16
434 vand d19, d19, d30
435 vmull.p8 q11, d26, d22 @ K = A*B4
436 veor q10, q10, q0 @ N = I + J
437 veor d16, d16, d17
438 veor d18, d18, d19
439 veor d20, d20, d21 @ t2 = (N) (P4 + P5) << 24
440 vand d21, d21, d31
441 vext.8 q8, q8, q8, #15
442 veor d22, d22, d23 @ t3 = (K) (P6 + P7) << 32
443 vmov.i64 d23, #0
444 vext.8 q9, q9, q9, #14
445 veor d20, d20, d21
446 vmull.p8 q0, d26, d6 @ D = A*B
447 vext.8 q11, q11, q11, #12
448 vext.8 q10, q10, q10, #13
449 veor q8, q8, q9
450 veor q10, q10, q11
451 veor q0, q0, q8
452 veor q0, q0, q10
453 veor d6,d6,d7 @ Karatsuba pre-processing
454 vext.8 d16, d28, d28, #1 @ A1
455 vmull.p8 q8, d16, d6 @ F = A1*B
456 vext.8 d2, d6, d6, #1 @ B1
457 vmull.p8 q1, d28, d2 @ E = A*B1
458 vext.8 d18, d28, d28, #2 @ A2
459 vmull.p8 q9, d18, d6 @ H = A2*B
460 vext.8 d22, d6, d6, #2 @ B2
461 vmull.p8 q11, d28, d22 @ G = A*B2
462 vext.8 d20, d28, d28, #3 @ A3
463 veor q8, q8, q1 @ L = E + F
464 vmull.p8 q10, d20, d6 @ J = A3*B
465 vext.8 d2, d6, d6, #3 @ B3
466 veor q9, q9, q11 @ M = G + H
467 vmull.p8 q1, d28, d2 @ I = A*B3
468 veor d16, d16, d17 @ t0 = (L) (P0 + P1) << 8
469 vand d17, d17, d29
470 vext.8 d22, d6, d6, #4 @ B4
471 veor d18, d18, d19 @ t1 = (M) (P2 + P3) << 16
472 vand d19, d19, d30
473 vmull.p8 q11, d28, d22 @ K = A*B4
474 veor q10, q10, q1 @ N = I + J
475 veor d16, d16, d17
476 veor d18, d18, d19
477 veor d20, d20, d21 @ t2 = (N) (P4 + P5) << 24
478 vand d21, d21, d31
479 vext.8 q8, q8, q8, #15
480 veor d22, d22, d23 @ t3 = (K) (P6 + P7) << 32
481 vmov.i64 d23, #0
482 vext.8 q9, q9, q9, #14
483 veor d20, d20, d21
484 vmull.p8 q1, d28, d6 @ D = A*B
485 vext.8 q11, q11, q11, #12
486 vext.8 q10, q10, q10, #13
487 veor q8, q8, q9
488 veor q10, q10, q11
489 veor q1, q1, q8
490 veor q1, q1, q10
491 vext.8 d16, d27, d27, #1 @ A1
492 vmull.p8 q8, d16, d7 @ F = A1*B
493 vext.8 d4, d7, d7, #1 @ B1
494 vmull.p8 q2, d27, d4 @ E = A*B1
495 vext.8 d18, d27, d27, #2 @ A2
496 vmull.p8 q9, d18, d7 @ H = A2*B
497 vext.8 d22, d7, d7, #2 @ B2
498 vmull.p8 q11, d27, d22 @ G = A*B2
499 vext.8 d20, d27, d27, #3 @ A3
500 veor q8, q8, q2 @ L = E + F
501 vmull.p8 q10, d20, d7 @ J = A3*B
502 vext.8 d4, d7, d7, #3 @ B3
503 veor q9, q9, q11 @ M = G + H
504 vmull.p8 q2, d27, d4 @ I = A*B3
505 veor d16, d16, d17 @ t0 = (L) (P0 + P1) << 8
506 vand d17, d17, d29
507 vext.8 d22, d7, d7, #4 @ B4
508 veor d18, d18, d19 @ t1 = (M) (P2 + P3) << 16
509 vand d19, d19, d30
510 vmull.p8 q11, d27, d22 @ K = A*B4
511 veor q10, q10, q2 @ N = I + J
512 veor d16, d16, d17
513 veor d18, d18, d19
514 veor d20, d20, d21 @ t2 = (N) (P4 + P5) << 24
515 vand d21, d21, d31
516 vext.8 q8, q8, q8, #15
517 veor d22, d22, d23 @ t3 = (K) (P6 + P7) << 32
518 vmov.i64 d23, #0
519 vext.8 q9, q9, q9, #14
520 veor d20, d20, d21
521 vmull.p8 q2, d27, d7 @ D = A*B
522 vext.8 q11, q11, q11, #12
523 vext.8 q10, q10, q10, #13
524 veor q8, q8, q9
525 veor q10, q10, q11
526 veor q2, q2, q8
527 veor q2, q2, q10
528 veor q1,q1,q0 @ Karatsuba post-processing
529 veor q1,q1,q2
530 veor d1,d1,d2
531 veor d4,d4,d3 @ Xh|Xl - 256-bit result
532
533 @ equivalent of reduction_avx from ghash-x86_64.pl
534 vshl.i64 q9,q0,#57 @ 1st phase
535 vshl.i64 q10,q0,#62
536 veor q10,q10,q9 @
537 vshl.i64 q9,q0,#63
538 veor q10, q10, q9 @
539 veor d1,d1,d20 @
540 veor d4,d4,d21
541
542 vshr.u64 q10,q0,#1 @ 2nd phase
543 veor q2,q2,q0
544 veor q0,q0,q10 @
545 vshr.u64 q10,q10,#6
546 vshr.u64 q0,q0,#1 @
547 veor q0,q0,q2 @
548 veor q0,q0,q10 @
549
550 subs r3,#16
551 bne .Loop_neon
552
553 #ifdef __ARMEL__
554 vrev64.8 q0,q0
555 #endif
556 sub r0,#16
557 vst1.64 d1,[r0]! @ write out Xi
558 vst1.64 d0,[r0]
559
560 bx lr @ bx lr
561 .size gcm_ghash_neon,.-gcm_ghash_neon
562 #endif
563 .byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
564 .align 2
565 .align 2