/* $FreeBSD$ */
/* Do not modify. This file is auto-generated from armv8-mont.pl. */
.text

5 .globl bn_mul_mont
6 .type bn_mul_mont,%function
7 .align 5
8 bn_mul_mont:
9 tst x5,#7
10 b.eq __bn_sqr8x_mont
11 tst x5,#3
12 b.eq __bn_mul4x_mont
13 .Lmul_mont:
14 stp x29,x30,[sp,#-64]!
15 add x29,sp,#0
16 stp x19,x20,[sp,#16]
17 stp x21,x22,[sp,#32]
18 stp x23,x24,[sp,#48]
19
20 ldr x9,[x2],#8 // bp[0]
21 sub x22,sp,x5,lsl#3
22 ldp x7,x8,[x1],#16 // ap[0..1]
23 lsl x5,x5,#3
24 ldr x4,[x4] // *n0
25 and x22,x22,#-16 // ABI says so
26 ldp x13,x14,[x3],#16 // np[0..1]
27
28 mul x6,x7,x9 // ap[0]*bp[0]
29 sub x21,x5,#16 // j=num-2
30 umulh x7,x7,x9
31 mul x10,x8,x9 // ap[1]*bp[0]
32 umulh x11,x8,x9
33
34 mul x15,x6,x4 // "tp[0]"*n0
35 mov sp,x22 // alloca
36
37 // (*) mul x12,x13,x15 // np[0]*m1
38 umulh x13,x13,x15
39 mul x16,x14,x15 // np[1]*m1
40 // (*) adds x12,x12,x6 // discarded
41 // (*) As for removal of first multiplication and addition
42 // instructions. The outcome of first addition is
43 // guaranteed to be zero, which leaves two computationally
44 // significant outcomes: it either carries or not. Then
45 // question is when does it carry? Is there alternative
46 // way to deduce it? If you follow operations, you can
47 // observe that condition for carry is quite simple:
48 // x6 being non-zero. So that carry can be calculated
49 // by adding -1 to x6. That's what next instruction does.
50 subs xzr,x6,#1 // (*)
51 umulh x17,x14,x15
52 adc x13,x13,xzr
53 cbz x21,.L1st_skip
54
55 .L1st:
56 ldr x8,[x1],#8
57 adds x6,x10,x7
58 sub x21,x21,#8 // j--
59 adc x7,x11,xzr
60
61 ldr x14,[x3],#8
62 adds x12,x16,x13
63 mul x10,x8,x9 // ap[j]*bp[0]
64 adc x13,x17,xzr
65 umulh x11,x8,x9
66
67 adds x12,x12,x6
68 mul x16,x14,x15 // np[j]*m1
69 adc x13,x13,xzr
70 umulh x17,x14,x15
71 str x12,[x22],#8 // tp[j-1]
72 cbnz x21,.L1st
73
74 .L1st_skip:
75 adds x6,x10,x7
76 sub x1,x1,x5 // rewind x1
77 adc x7,x11,xzr
78
79 adds x12,x16,x13
80 sub x3,x3,x5 // rewind x3
81 adc x13,x17,xzr
82
83 adds x12,x12,x6
84 sub x20,x5,#8 // i=num-1
85 adcs x13,x13,x7
86
87 adc x19,xzr,xzr // upmost overflow bit
88 stp x12,x13,[x22]
89
90 .Louter:
91 ldr x9,[x2],#8 // bp[i]
92 ldp x7,x8,[x1],#16
93 ldr x23,[sp] // tp[0]
94 add x22,sp,#8
95
96 mul x6,x7,x9 // ap[0]*bp[i]
97 sub x21,x5,#16 // j=num-2
98 umulh x7,x7,x9
99 ldp x13,x14,[x3],#16
100 mul x10,x8,x9 // ap[1]*bp[i]
101 adds x6,x6,x23
102 umulh x11,x8,x9
103 adc x7,x7,xzr
104
105 mul x15,x6,x4
106 sub x20,x20,#8 // i--
107
108 // (*) mul x12,x13,x15 // np[0]*m1
109 umulh x13,x13,x15
110 mul x16,x14,x15 // np[1]*m1
111 // (*) adds x12,x12,x6
112 subs xzr,x6,#1 // (*)
113 umulh x17,x14,x15
114 cbz x21,.Linner_skip
115
116 .Linner:
117 ldr x8,[x1],#8
118 adc x13,x13,xzr
119 ldr x23,[x22],#8 // tp[j]
120 adds x6,x10,x7
121 sub x21,x21,#8 // j--
122 adc x7,x11,xzr
123
124 adds x12,x16,x13
125 ldr x14,[x3],#8
126 adc x13,x17,xzr
127
128 mul x10,x8,x9 // ap[j]*bp[i]
129 adds x6,x6,x23
130 umulh x11,x8,x9
131 adc x7,x7,xzr
132
133 mul x16,x14,x15 // np[j]*m1
134 adds x12,x12,x6
135 umulh x17,x14,x15
136 str x12,[x22,#-16] // tp[j-1]
137 cbnz x21,.Linner
138
139 .Linner_skip:
140 ldr x23,[x22],#8 // tp[j]
141 adc x13,x13,xzr
142 adds x6,x10,x7
143 sub x1,x1,x5 // rewind x1
144 adc x7,x11,xzr
145
146 adds x12,x16,x13
147 sub x3,x3,x5 // rewind x3
148 adcs x13,x17,x19
149 adc x19,xzr,xzr
150
151 adds x6,x6,x23
152 adc x7,x7,xzr
153
154 adds x12,x12,x6
155 adcs x13,x13,x7
156 adc x19,x19,xzr // upmost overflow bit
157 stp x12,x13,[x22,#-16]
158
159 cbnz x20,.Louter
160
161 // Final step. We see if result is larger than modulus, and
162 // if it is, subtract the modulus. But comparison implies
163 // subtraction. So we subtract modulus, see if it borrowed,
164 // and conditionally copy original value.
165 ldr x23,[sp] // tp[0]
166 add x22,sp,#8
167 ldr x14,[x3],#8 // np[0]
168 subs x21,x5,#8 // j=num-1 and clear borrow
169 mov x1,x0
170 .Lsub:
171 sbcs x8,x23,x14 // tp[j]-np[j]
172 ldr x23,[x22],#8
173 sub x21,x21,#8 // j--
174 ldr x14,[x3],#8
175 str x8,[x1],#8 // rp[j]=tp[j]-np[j]
176 cbnz x21,.Lsub
177
178 sbcs x8,x23,x14
179 sbcs x19,x19,xzr // did it borrow?
180 str x8,[x1],#8 // rp[num-1]
181
182 ldr x23,[sp] // tp[0]
183 add x22,sp,#8
184 ldr x8,[x0],#8 // rp[0]
185 sub x5,x5,#8 // num--
186 nop
187 .Lcond_copy:
188 sub x5,x5,#8 // num--
189 csel x14,x23,x8,lo // did it borrow?
190 ldr x23,[x22],#8
191 ldr x8,[x0],#8
192 str xzr,[x22,#-16] // wipe tp
193 str x14,[x0,#-16]
194 cbnz x5,.Lcond_copy
195
196 csel x14,x23,x8,lo
197 str xzr,[x22,#-8] // wipe tp
198 str x14,[x0,#-8]
199
200 ldp x19,x20,[x29,#16]
201 mov sp,x29
202 ldp x21,x22,[x29,#32]
203 mov x0,#1
204 ldp x23,x24,[x29,#48]
205 ldr x29,[sp],#64
206 ret
207 .size bn_mul_mont,.-bn_mul_mont
208 .type __bn_sqr8x_mont,%function
209 .align 5
210 __bn_sqr8x_mont:
211 cmp x1,x2
212 b.ne __bn_mul4x_mont
213 .Lsqr8x_mont:
214 .inst 0xd503233f // paciasp
215 stp x29,x30,[sp,#-128]!
216 add x29,sp,#0
217 stp x19,x20,[sp,#16]
218 stp x21,x22,[sp,#32]
219 stp x23,x24,[sp,#48]
220 stp x25,x26,[sp,#64]
221 stp x27,x28,[sp,#80]
222 stp x0,x3,[sp,#96] // offload rp and np
223
224 ldp x6,x7,[x1,#8*0]
225 ldp x8,x9,[x1,#8*2]
226 ldp x10,x11,[x1,#8*4]
227 ldp x12,x13,[x1,#8*6]
228
229 sub x2,sp,x5,lsl#4
230 lsl x5,x5,#3
231 ldr x4,[x4] // *n0
232 mov sp,x2 // alloca
233 sub x27,x5,#8*8
234 b .Lsqr8x_zero_start
235
236 .Lsqr8x_zero:
237 sub x27,x27,#8*8
238 stp xzr,xzr,[x2,#8*0]
239 stp xzr,xzr,[x2,#8*2]
240 stp xzr,xzr,[x2,#8*4]
241 stp xzr,xzr,[x2,#8*6]
242 .Lsqr8x_zero_start:
243 stp xzr,xzr,[x2,#8*8]
244 stp xzr,xzr,[x2,#8*10]
245 stp xzr,xzr,[x2,#8*12]
246 stp xzr,xzr,[x2,#8*14]
247 add x2,x2,#8*16
248 cbnz x27,.Lsqr8x_zero
249
250 add x3,x1,x5
251 add x1,x1,#8*8
252 mov x19,xzr
253 mov x20,xzr
254 mov x21,xzr
255 mov x22,xzr
256 mov x23,xzr
257 mov x24,xzr
258 mov x25,xzr
259 mov x26,xzr
260 mov x2,sp
261 str x4,[x29,#112] // offload n0
262
263 // Multiply everything but a[i]*a[i]
264 .align 4
265 .Lsqr8x_outer_loop:
266 // a[1]a[0] (i)
267 // a[2]a[0]
268 // a[3]a[0]
269 // a[4]a[0]
270 // a[5]a[0]
271 // a[6]a[0]
272 // a[7]a[0]
273 // a[2]a[1] (ii)
274 // a[3]a[1]
275 // a[4]a[1]
276 // a[5]a[1]
277 // a[6]a[1]
278 // a[7]a[1]
279 // a[3]a[2] (iii)
280 // a[4]a[2]
281 // a[5]a[2]
282 // a[6]a[2]
283 // a[7]a[2]
284 // a[4]a[3] (iv)
285 // a[5]a[3]
286 // a[6]a[3]
287 // a[7]a[3]
288 // a[5]a[4] (v)
289 // a[6]a[4]
290 // a[7]a[4]
291 // a[6]a[5] (vi)
292 // a[7]a[5]
293 // a[7]a[6] (vii)
294
295 mul x14,x7,x6 // lo(a[1..7]*a[0]) (i)
296 mul x15,x8,x6
297 mul x16,x9,x6
298 mul x17,x10,x6
299 adds x20,x20,x14 // t[1]+lo(a[1]*a[0])
300 mul x14,x11,x6
301 adcs x21,x21,x15
302 mul x15,x12,x6
303 adcs x22,x22,x16
304 mul x16,x13,x6
305 adcs x23,x23,x17
306 umulh x17,x7,x6 // hi(a[1..7]*a[0])
307 adcs x24,x24,x14
308 umulh x14,x8,x6
309 adcs x25,x25,x15
310 umulh x15,x9,x6
311 adcs x26,x26,x16
312 umulh x16,x10,x6
313 stp x19,x20,[x2],#8*2 // t[0..1]
314 adc x19,xzr,xzr // t[8]
315 adds x21,x21,x17 // t[2]+lo(a[1]*a[0])
316 umulh x17,x11,x6
317 adcs x22,x22,x14
318 umulh x14,x12,x6
319 adcs x23,x23,x15
320 umulh x15,x13,x6
321 adcs x24,x24,x16
322 mul x16,x8,x7 // lo(a[2..7]*a[1]) (ii)
323 adcs x25,x25,x17
324 mul x17,x9,x7
325 adcs x26,x26,x14
326 mul x14,x10,x7
327 adc x19,x19,x15
328
329 mul x15,x11,x7
330 adds x22,x22,x16
331 mul x16,x12,x7
332 adcs x23,x23,x17
333 mul x17,x13,x7
334 adcs x24,x24,x14
335 umulh x14,x8,x7 // hi(a[2..7]*a[1])
336 adcs x25,x25,x15
337 umulh x15,x9,x7
338 adcs x26,x26,x16
339 umulh x16,x10,x7
340 adcs x19,x19,x17
341 umulh x17,x11,x7
342 stp x21,x22,[x2],#8*2 // t[2..3]
343 adc x20,xzr,xzr // t[9]
344 adds x23,x23,x14
345 umulh x14,x12,x7
346 adcs x24,x24,x15
347 umulh x15,x13,x7
348 adcs x25,x25,x16
349 mul x16,x9,x8 // lo(a[3..7]*a[2]) (iii)
350 adcs x26,x26,x17
351 mul x17,x10,x8
352 adcs x19,x19,x14
353 mul x14,x11,x8
354 adc x20,x20,x15
355
356 mul x15,x12,x8
357 adds x24,x24,x16
358 mul x16,x13,x8
359 adcs x25,x25,x17
360 umulh x17,x9,x8 // hi(a[3..7]*a[2])
361 adcs x26,x26,x14
362 umulh x14,x10,x8
363 adcs x19,x19,x15
364 umulh x15,x11,x8
365 adcs x20,x20,x16
366 umulh x16,x12,x8
367 stp x23,x24,[x2],#8*2 // t[4..5]
368 adc x21,xzr,xzr // t[10]
369 adds x25,x25,x17
370 umulh x17,x13,x8
371 adcs x26,x26,x14
372 mul x14,x10,x9 // lo(a[4..7]*a[3]) (iv)
373 adcs x19,x19,x15
374 mul x15,x11,x9
375 adcs x20,x20,x16
376 mul x16,x12,x9
377 adc x21,x21,x17
378
379 mul x17,x13,x9
380 adds x26,x26,x14
381 umulh x14,x10,x9 // hi(a[4..7]*a[3])
382 adcs x19,x19,x15
383 umulh x15,x11,x9
384 adcs x20,x20,x16
385 umulh x16,x12,x9
386 adcs x21,x21,x17
387 umulh x17,x13,x9
388 stp x25,x26,[x2],#8*2 // t[6..7]
389 adc x22,xzr,xzr // t[11]
390 adds x19,x19,x14
391 mul x14,x11,x10 // lo(a[5..7]*a[4]) (v)
392 adcs x20,x20,x15
393 mul x15,x12,x10
394 adcs x21,x21,x16
395 mul x16,x13,x10
396 adc x22,x22,x17
397
398 umulh x17,x11,x10 // hi(a[5..7]*a[4])
399 adds x20,x20,x14
400 umulh x14,x12,x10
401 adcs x21,x21,x15
402 umulh x15,x13,x10
403 adcs x22,x22,x16
404 mul x16,x12,x11 // lo(a[6..7]*a[5]) (vi)
405 adc x23,xzr,xzr // t[12]
406 adds x21,x21,x17
407 mul x17,x13,x11
408 adcs x22,x22,x14
409 umulh x14,x12,x11 // hi(a[6..7]*a[5])
410 adc x23,x23,x15
411
412 umulh x15,x13,x11
413 adds x22,x22,x16
414 mul x16,x13,x12 // lo(a[7]*a[6]) (vii)
415 adcs x23,x23,x17
416 umulh x17,x13,x12 // hi(a[7]*a[6])
417 adc x24,xzr,xzr // t[13]
418 adds x23,x23,x14
419 sub x27,x3,x1 // done yet?
420 adc x24,x24,x15
421
422 adds x24,x24,x16
423 sub x14,x3,x5 // rewinded ap
424 adc x25,xzr,xzr // t[14]
425 add x25,x25,x17
426
427 cbz x27,.Lsqr8x_outer_break
428
429 mov x4,x6
430 ldp x6,x7,[x2,#8*0]
431 ldp x8,x9,[x2,#8*2]
432 ldp x10,x11,[x2,#8*4]
433 ldp x12,x13,[x2,#8*6]
434 adds x19,x19,x6
435 adcs x20,x20,x7
436 ldp x6,x7,[x1,#8*0]
437 adcs x21,x21,x8
438 adcs x22,x22,x9
439 ldp x8,x9,[x1,#8*2]
440 adcs x23,x23,x10
441 adcs x24,x24,x11
442 ldp x10,x11,[x1,#8*4]
443 adcs x25,x25,x12
444 mov x0,x1
445 adcs x26,xzr,x13
446 ldp x12,x13,[x1,#8*6]
447 add x1,x1,#8*8
448 //adc x28,xzr,xzr // moved below
449 mov x27,#-8*8
450
451 // a[8]a[0]
452 // a[9]a[0]
453 // a[a]a[0]
454 // a[b]a[0]
455 // a[c]a[0]
456 // a[d]a[0]
457 // a[e]a[0]
458 // a[f]a[0]
459 // a[8]a[1]
460 // a[f]a[1]........................
461 // a[8]a[2]
462 // a[f]a[2]........................
463 // a[8]a[3]
464 // a[f]a[3]........................
465 // a[8]a[4]
466 // a[f]a[4]........................
467 // a[8]a[5]
468 // a[f]a[5]........................
469 // a[8]a[6]
470 // a[f]a[6]........................
471 // a[8]a[7]
472 // a[f]a[7]........................
473 .Lsqr8x_mul:
474 mul x14,x6,x4
475 adc x28,xzr,xzr // carry bit, modulo-scheduled
476 mul x15,x7,x4
477 add x27,x27,#8
478 mul x16,x8,x4
479 mul x17,x9,x4
480 adds x19,x19,x14
481 mul x14,x10,x4
482 adcs x20,x20,x15
483 mul x15,x11,x4
484 adcs x21,x21,x16
485 mul x16,x12,x4
486 adcs x22,x22,x17
487 mul x17,x13,x4
488 adcs x23,x23,x14
489 umulh x14,x6,x4
490 adcs x24,x24,x15
491 umulh x15,x7,x4
492 adcs x25,x25,x16
493 umulh x16,x8,x4
494 adcs x26,x26,x17
495 umulh x17,x9,x4
496 adc x28,x28,xzr
497 str x19,[x2],#8
498 adds x19,x20,x14
499 umulh x14,x10,x4
500 adcs x20,x21,x15
501 umulh x15,x11,x4
502 adcs x21,x22,x16
503 umulh x16,x12,x4
504 adcs x22,x23,x17
505 umulh x17,x13,x4
506 ldr x4,[x0,x27]
507 adcs x23,x24,x14
508 adcs x24,x25,x15
509 adcs x25,x26,x16
510 adcs x26,x28,x17
511 //adc x28,xzr,xzr // moved above
512 cbnz x27,.Lsqr8x_mul
513 // note that carry flag is guaranteed
514 // to be zero at this point
515 cmp x1,x3 // done yet?
516 b.eq .Lsqr8x_break
517
518 ldp x6,x7,[x2,#8*0]
519 ldp x8,x9,[x2,#8*2]
520 ldp x10,x11,[x2,#8*4]
521 ldp x12,x13,[x2,#8*6]
522 adds x19,x19,x6
523 ldr x4,[x0,#-8*8]
524 adcs x20,x20,x7
525 ldp x6,x7,[x1,#8*0]
526 adcs x21,x21,x8
527 adcs x22,x22,x9
528 ldp x8,x9,[x1,#8*2]
529 adcs x23,x23,x10
530 adcs x24,x24,x11
531 ldp x10,x11,[x1,#8*4]
532 adcs x25,x25,x12
533 mov x27,#-8*8
534 adcs x26,x26,x13
535 ldp x12,x13,[x1,#8*6]
536 add x1,x1,#8*8
537 //adc x28,xzr,xzr // moved above
538 b .Lsqr8x_mul
539
540 .align 4
541 .Lsqr8x_break:
542 ldp x6,x7,[x0,#8*0]
543 add x1,x0,#8*8
544 ldp x8,x9,[x0,#8*2]
545 sub x14,x3,x1 // is it last iteration?
546 ldp x10,x11,[x0,#8*4]
547 sub x15,x2,x14
548 ldp x12,x13,[x0,#8*6]
549 cbz x14,.Lsqr8x_outer_loop
550
551 stp x19,x20,[x2,#8*0]
552 ldp x19,x20,[x15,#8*0]
553 stp x21,x22,[x2,#8*2]
554 ldp x21,x22,[x15,#8*2]
555 stp x23,x24,[x2,#8*4]
556 ldp x23,x24,[x15,#8*4]
557 stp x25,x26,[x2,#8*6]
558 mov x2,x15
559 ldp x25,x26,[x15,#8*6]
560 b .Lsqr8x_outer_loop
561
562 .align 4
563 .Lsqr8x_outer_break:
564 // Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
565 ldp x7,x9,[x14,#8*0] // recall that x14 is &a[0]
566 ldp x15,x16,[sp,#8*1]
567 ldp x11,x13,[x14,#8*2]
568 add x1,x14,#8*4
569 ldp x17,x14,[sp,#8*3]
570
571 stp x19,x20,[x2,#8*0]
572 mul x19,x7,x7
573 stp x21,x22,[x2,#8*2]
574 umulh x7,x7,x7
575 stp x23,x24,[x2,#8*4]
576 mul x8,x9,x9
577 stp x25,x26,[x2,#8*6]
578 mov x2,sp
579 umulh x9,x9,x9
580 adds x20,x7,x15,lsl#1
581 extr x15,x16,x15,#63
582 sub x27,x5,#8*4
583
584 .Lsqr4x_shift_n_add:
585 adcs x21,x8,x15
586 extr x16,x17,x16,#63
587 sub x27,x27,#8*4
588 adcs x22,x9,x16
589 ldp x15,x16,[x2,#8*5]
590 mul x10,x11,x11
591 ldp x7,x9,[x1],#8*2
592 umulh x11,x11,x11
593 mul x12,x13,x13
594 umulh x13,x13,x13
595 extr x17,x14,x17,#63
596 stp x19,x20,[x2,#8*0]
597 adcs x23,x10,x17
598 extr x14,x15,x14,#63
599 stp x21,x22,[x2,#8*2]
600 adcs x24,x11,x14
601 ldp x17,x14,[x2,#8*7]
602 extr x15,x16,x15,#63
603 adcs x25,x12,x15
604 extr x16,x17,x16,#63
605 adcs x26,x13,x16
606 ldp x15,x16,[x2,#8*9]
607 mul x6,x7,x7
608 ldp x11,x13,[x1],#8*2
609 umulh x7,x7,x7
610 mul x8,x9,x9
611 umulh x9,x9,x9
612 stp x23,x24,[x2,#8*4]
613 extr x17,x14,x17,#63
614 stp x25,x26,[x2,#8*6]
615 add x2,x2,#8*8
616 adcs x19,x6,x17
617 extr x14,x15,x14,#63
618 adcs x20,x7,x14
619 ldp x17,x14,[x2,#8*3]
620 extr x15,x16,x15,#63
621 cbnz x27,.Lsqr4x_shift_n_add
622 ldp x1,x4,[x29,#104] // pull np and n0
623
624 adcs x21,x8,x15
625 extr x16,x17,x16,#63
626 adcs x22,x9,x16
627 ldp x15,x16,[x2,#8*5]
628 mul x10,x11,x11
629 umulh x11,x11,x11
630 stp x19,x20,[x2,#8*0]
631 mul x12,x13,x13
632 umulh x13,x13,x13
633 stp x21,x22,[x2,#8*2]
634 extr x17,x14,x17,#63
635 adcs x23,x10,x17
636 extr x14,x15,x14,#63
637 ldp x19,x20,[sp,#8*0]
638 adcs x24,x11,x14
639 extr x15,x16,x15,#63
640 ldp x6,x7,[x1,#8*0]
641 adcs x25,x12,x15
642 extr x16,xzr,x16,#63
643 ldp x8,x9,[x1,#8*2]
644 adc x26,x13,x16
645 ldp x10,x11,[x1,#8*4]
646
647 // Reduce by 512 bits per iteration
648 mul x28,x4,x19 // t[0]*n0
649 ldp x12,x13,[x1,#8*6]
650 add x3,x1,x5
651 ldp x21,x22,[sp,#8*2]
652 stp x23,x24,[x2,#8*4]
653 ldp x23,x24,[sp,#8*4]
654 stp x25,x26,[x2,#8*6]
655 ldp x25,x26,[sp,#8*6]
656 add x1,x1,#8*8
657 mov x30,xzr // initial top-most carry
658 mov x2,sp
659 mov x27,#8
660
661 .Lsqr8x_reduction:
662 // (*) mul x14,x6,x28 // lo(n[0-7])*lo(t[0]*n0)
663 mul x15,x7,x28
664 sub x27,x27,#1
665 mul x16,x8,x28
666 str x28,[x2],#8 // put aside t[0]*n0 for tail processing
667 mul x17,x9,x28
668 // (*) adds xzr,x19,x14
669 subs xzr,x19,#1 // (*)
670 mul x14,x10,x28
671 adcs x19,x20,x15
672 mul x15,x11,x28
673 adcs x20,x21,x16
674 mul x16,x12,x28
675 adcs x21,x22,x17
676 mul x17,x13,x28
677 adcs x22,x23,x14
678 umulh x14,x6,x28 // hi(n[0-7])*lo(t[0]*n0)
679 adcs x23,x24,x15
680 umulh x15,x7,x28
681 adcs x24,x25,x16
682 umulh x16,x8,x28
683 adcs x25,x26,x17
684 umulh x17,x9,x28
685 adc x26,xzr,xzr
686 adds x19,x19,x14
687 umulh x14,x10,x28
688 adcs x20,x20,x15
689 umulh x15,x11,x28
690 adcs x21,x21,x16
691 umulh x16,x12,x28
692 adcs x22,x22,x17
693 umulh x17,x13,x28
694 mul x28,x4,x19 // next t[0]*n0
695 adcs x23,x23,x14
696 adcs x24,x24,x15
697 adcs x25,x25,x16
698 adc x26,x26,x17
699 cbnz x27,.Lsqr8x_reduction
700
701 ldp x14,x15,[x2,#8*0]
702 ldp x16,x17,[x2,#8*2]
703 mov x0,x2
704 sub x27,x3,x1 // done yet?
705 adds x19,x19,x14
706 adcs x20,x20,x15
707 ldp x14,x15,[x2,#8*4]
708 adcs x21,x21,x16
709 adcs x22,x22,x17
710 ldp x16,x17,[x2,#8*6]
711 adcs x23,x23,x14
712 adcs x24,x24,x15
713 adcs x25,x25,x16
714 adcs x26,x26,x17
715 //adc x28,xzr,xzr // moved below
716 cbz x27,.Lsqr8x8_post_condition
717
718 ldr x4,[x2,#-8*8]
719 ldp x6,x7,[x1,#8*0]
720 ldp x8,x9,[x1,#8*2]
721 ldp x10,x11,[x1,#8*4]
722 mov x27,#-8*8
723 ldp x12,x13,[x1,#8*6]
724 add x1,x1,#8*8
725
726 .Lsqr8x_tail:
727 mul x14,x6,x4
728 adc x28,xzr,xzr // carry bit, modulo-scheduled
729 mul x15,x7,x4
730 add x27,x27,#8
731 mul x16,x8,x4
732 mul x17,x9,x4
733 adds x19,x19,x14
734 mul x14,x10,x4
735 adcs x20,x20,x15
736 mul x15,x11,x4
737 adcs x21,x21,x16
738 mul x16,x12,x4
739 adcs x22,x22,x17
740 mul x17,x13,x4
741 adcs x23,x23,x14
742 umulh x14,x6,x4
743 adcs x24,x24,x15
744 umulh x15,x7,x4
745 adcs x25,x25,x16
746 umulh x16,x8,x4
747 adcs x26,x26,x17
748 umulh x17,x9,x4
749 adc x28,x28,xzr
750 str x19,[x2],#8
751 adds x19,x20,x14
752 umulh x14,x10,x4
753 adcs x20,x21,x15
754 umulh x15,x11,x4
755 adcs x21,x22,x16
756 umulh x16,x12,x4
757 adcs x22,x23,x17
758 umulh x17,x13,x4
759 ldr x4,[x0,x27]
760 adcs x23,x24,x14
761 adcs x24,x25,x15
762 adcs x25,x26,x16
763 adcs x26,x28,x17
764 //adc x28,xzr,xzr // moved above
765 cbnz x27,.Lsqr8x_tail
766 // note that carry flag is guaranteed
767 // to be zero at this point
768 ldp x6,x7,[x2,#8*0]
769 sub x27,x3,x1 // done yet?
770 sub x16,x3,x5 // rewinded np
771 ldp x8,x9,[x2,#8*2]
772 ldp x10,x11,[x2,#8*4]
773 ldp x12,x13,[x2,#8*6]
774 cbz x27,.Lsqr8x_tail_break
775
776 ldr x4,[x0,#-8*8]
777 adds x19,x19,x6
778 adcs x20,x20,x7
779 ldp x6,x7,[x1,#8*0]
780 adcs x21,x21,x8
781 adcs x22,x22,x9
782 ldp x8,x9,[x1,#8*2]
783 adcs x23,x23,x10
784 adcs x24,x24,x11
785 ldp x10,x11,[x1,#8*4]
786 adcs x25,x25,x12
787 mov x27,#-8*8
788 adcs x26,x26,x13
789 ldp x12,x13,[x1,#8*6]
790 add x1,x1,#8*8
791 //adc x28,xzr,xzr // moved above
792 b .Lsqr8x_tail
793
794 .align 4
795 .Lsqr8x_tail_break:
796 ldr x4,[x29,#112] // pull n0
797 add x27,x2,#8*8 // end of current t[num] window
798
799 subs xzr,x30,#1 // "move" top-most carry to carry bit
800 adcs x14,x19,x6
801 adcs x15,x20,x7
802 ldp x19,x20,[x0,#8*0]
803 adcs x21,x21,x8
804 ldp x6,x7,[x16,#8*0] // recall that x16 is &n[0]
805 adcs x22,x22,x9
806 ldp x8,x9,[x16,#8*2]
807 adcs x23,x23,x10
808 adcs x24,x24,x11
809 ldp x10,x11,[x16,#8*4]
810 adcs x25,x25,x12
811 adcs x26,x26,x13
812 ldp x12,x13,[x16,#8*6]
813 add x1,x16,#8*8
814 adc x30,xzr,xzr // top-most carry
815 mul x28,x4,x19
816 stp x14,x15,[x2,#8*0]
817 stp x21,x22,[x2,#8*2]
818 ldp x21,x22,[x0,#8*2]
819 stp x23,x24,[x2,#8*4]
820 ldp x23,x24,[x0,#8*4]
821 cmp x27,x29 // did we hit the bottom?
822 stp x25,x26,[x2,#8*6]
823 mov x2,x0 // slide the window
824 ldp x25,x26,[x0,#8*6]
825 mov x27,#8
826 b.ne .Lsqr8x_reduction
827
828 // Final step. We see if result is larger than modulus, and
829 // if it is, subtract the modulus. But comparison implies
830 // subtraction. So we subtract modulus, see if it borrowed,
831 // and conditionally copy original value.
832 ldr x0,[x29,#96] // pull rp
833 add x2,x2,#8*8
834 subs x14,x19,x6
835 sbcs x15,x20,x7
836 sub x27,x5,#8*8
837 mov x3,x0 // x0 copy
838
839 .Lsqr8x_sub:
840 sbcs x16,x21,x8
841 ldp x6,x7,[x1,#8*0]
842 sbcs x17,x22,x9
843 stp x14,x15,[x0,#8*0]
844 sbcs x14,x23,x10
845 ldp x8,x9,[x1,#8*2]
846 sbcs x15,x24,x11
847 stp x16,x17,[x0,#8*2]
848 sbcs x16,x25,x12
849 ldp x10,x11,[x1,#8*4]
850 sbcs x17,x26,x13
851 ldp x12,x13,[x1,#8*6]
852 add x1,x1,#8*8
853 ldp x19,x20,[x2,#8*0]
854 sub x27,x27,#8*8
855 ldp x21,x22,[x2,#8*2]
856 ldp x23,x24,[x2,#8*4]
857 ldp x25,x26,[x2,#8*6]
858 add x2,x2,#8*8
859 stp x14,x15,[x0,#8*4]
860 sbcs x14,x19,x6
861 stp x16,x17,[x0,#8*6]
862 add x0,x0,#8*8
863 sbcs x15,x20,x7
864 cbnz x27,.Lsqr8x_sub
865
866 sbcs x16,x21,x8
867 mov x2,sp
868 add x1,sp,x5
869 ldp x6,x7,[x3,#8*0]
870 sbcs x17,x22,x9
871 stp x14,x15,[x0,#8*0]
872 sbcs x14,x23,x10
873 ldp x8,x9,[x3,#8*2]
874 sbcs x15,x24,x11
875 stp x16,x17,[x0,#8*2]
876 sbcs x16,x25,x12
877 ldp x19,x20,[x1,#8*0]
878 sbcs x17,x26,x13
879 ldp x21,x22,[x1,#8*2]
880 sbcs xzr,x30,xzr // did it borrow?
881 ldr x30,[x29,#8] // pull return address
882 stp x14,x15,[x0,#8*4]
883 stp x16,x17,[x0,#8*6]
884
885 sub x27,x5,#8*4
886 .Lsqr4x_cond_copy:
887 sub x27,x27,#8*4
888 csel x14,x19,x6,lo
889 stp xzr,xzr,[x2,#8*0]
890 csel x15,x20,x7,lo
891 ldp x6,x7,[x3,#8*4]
892 ldp x19,x20,[x1,#8*4]
893 csel x16,x21,x8,lo
894 stp xzr,xzr,[x2,#8*2]
895 add x2,x2,#8*4
896 csel x17,x22,x9,lo
897 ldp x8,x9,[x3,#8*6]
898 ldp x21,x22,[x1,#8*6]
899 add x1,x1,#8*4
900 stp x14,x15,[x3,#8*0]
901 stp x16,x17,[x3,#8*2]
902 add x3,x3,#8*4
903 stp xzr,xzr,[x1,#8*0]
904 stp xzr,xzr,[x1,#8*2]
905 cbnz x27,.Lsqr4x_cond_copy
906
907 csel x14,x19,x6,lo
908 stp xzr,xzr,[x2,#8*0]
909 csel x15,x20,x7,lo
910 stp xzr,xzr,[x2,#8*2]
911 csel x16,x21,x8,lo
912 csel x17,x22,x9,lo
913 stp x14,x15,[x3,#8*0]
914 stp x16,x17,[x3,#8*2]
915
916 b .Lsqr8x_done
917
918 .align 4
919 .Lsqr8x8_post_condition:
920 adc x28,xzr,xzr
921 ldr x30,[x29,#8] // pull return address
922 // x19-7,x28 hold result, x6-7 hold modulus
923 subs x6,x19,x6
924 ldr x1,[x29,#96] // pull rp
925 sbcs x7,x20,x7
926 stp xzr,xzr,[sp,#8*0]
927 sbcs x8,x21,x8
928 stp xzr,xzr,[sp,#8*2]
929 sbcs x9,x22,x9
930 stp xzr,xzr,[sp,#8*4]
931 sbcs x10,x23,x10
932 stp xzr,xzr,[sp,#8*6]
933 sbcs x11,x24,x11
934 stp xzr,xzr,[sp,#8*8]
935 sbcs x12,x25,x12
936 stp xzr,xzr,[sp,#8*10]
937 sbcs x13,x26,x13
938 stp xzr,xzr,[sp,#8*12]
939 sbcs x28,x28,xzr // did it borrow?
940 stp xzr,xzr,[sp,#8*14]
941
942 // x6-7 hold result-modulus
943 csel x6,x19,x6,lo
944 csel x7,x20,x7,lo
945 csel x8,x21,x8,lo
946 csel x9,x22,x9,lo
947 stp x6,x7,[x1,#8*0]
948 csel x10,x23,x10,lo
949 csel x11,x24,x11,lo
950 stp x8,x9,[x1,#8*2]
951 csel x12,x25,x12,lo
952 csel x13,x26,x13,lo
953 stp x10,x11,[x1,#8*4]
954 stp x12,x13,[x1,#8*6]
955
956 .Lsqr8x_done:
957 ldp x19,x20,[x29,#16]
958 mov sp,x29
959 ldp x21,x22,[x29,#32]
960 mov x0,#1
961 ldp x23,x24,[x29,#48]
962 ldp x25,x26,[x29,#64]
963 ldp x27,x28,[x29,#80]
964 ldr x29,[sp],#128
965 .inst 0xd50323bf // autiasp
966 ret
967 .size __bn_sqr8x_mont,.-__bn_sqr8x_mont
968 .type __bn_mul4x_mont,%function
969 .align 5
970 __bn_mul4x_mont:
971 .inst 0xd503233f // paciasp
972 stp x29,x30,[sp,#-128]!
973 add x29,sp,#0
974 stp x19,x20,[sp,#16]
975 stp x21,x22,[sp,#32]
976 stp x23,x24,[sp,#48]
977 stp x25,x26,[sp,#64]
978 stp x27,x28,[sp,#80]
979
980 sub x26,sp,x5,lsl#3
981 lsl x5,x5,#3
982 ldr x4,[x4] // *n0
983 sub sp,x26,#8*4 // alloca
984
985 add x10,x2,x5
986 add x27,x1,x5
987 stp x0,x10,[x29,#96] // offload rp and &b[num]
988
989 ldr x24,[x2,#8*0] // b[0]
990 ldp x6,x7,[x1,#8*0] // a[0..3]
991 ldp x8,x9,[x1,#8*2]
992 add x1,x1,#8*4
993 mov x19,xzr
994 mov x20,xzr
995 mov x21,xzr
996 mov x22,xzr
997 ldp x14,x15,[x3,#8*0] // n[0..3]
998 ldp x16,x17,[x3,#8*2]
999 adds x3,x3,#8*4 // clear carry bit
1000 mov x0,xzr
1001 mov x28,#0
1002 mov x26,sp
1003
1004 .Loop_mul4x_1st_reduction:
1005 mul x10,x6,x24 // lo(a[0..3]*b[0])
1006 adc x0,x0,xzr // modulo-scheduled
1007 mul x11,x7,x24
1008 add x28,x28,#8
1009 mul x12,x8,x24
1010 and x28,x28,#31
1011 mul x13,x9,x24
1012 adds x19,x19,x10
1013 umulh x10,x6,x24 // hi(a[0..3]*b[0])
1014 adcs x20,x20,x11
1015 mul x25,x19,x4 // t[0]*n0
1016 adcs x21,x21,x12
1017 umulh x11,x7,x24
1018 adcs x22,x22,x13
1019 umulh x12,x8,x24
1020 adc x23,xzr,xzr
1021 umulh x13,x9,x24
1022 ldr x24,[x2,x28] // next b[i] (or b[0])
1023 adds x20,x20,x10
1024 // (*) mul x10,x14,x25 // lo(n[0..3]*t[0]*n0)
1025 str x25,[x26],#8 // put aside t[0]*n0 for tail processing
1026 adcs x21,x21,x11
1027 mul x11,x15,x25
1028 adcs x22,x22,x12
1029 mul x12,x16,x25
1030 adc x23,x23,x13 // can't overflow
1031 mul x13,x17,x25
1032 // (*) adds xzr,x19,x10
1033 subs xzr,x19,#1 // (*)
1034 umulh x10,x14,x25 // hi(n[0..3]*t[0]*n0)
1035 adcs x19,x20,x11
1036 umulh x11,x15,x25
1037 adcs x20,x21,x12
1038 umulh x12,x16,x25
1039 adcs x21,x22,x13
1040 umulh x13,x17,x25
1041 adcs x22,x23,x0
1042 adc x0,xzr,xzr
1043 adds x19,x19,x10
1044 sub x10,x27,x1
1045 adcs x20,x20,x11
1046 adcs x21,x21,x12
1047 adcs x22,x22,x13
1048 //adc x0,x0,xzr
1049 cbnz x28,.Loop_mul4x_1st_reduction
1050
1051 cbz x10,.Lmul4x4_post_condition
1052
1053 ldp x6,x7,[x1,#8*0] // a[4..7]
1054 ldp x8,x9,[x1,#8*2]
1055 add x1,x1,#8*4
1056 ldr x25,[sp] // a[0]*n0
1057 ldp x14,x15,[x3,#8*0] // n[4..7]
1058 ldp x16,x17,[x3,#8*2]
1059 add x3,x3,#8*4
1060
1061 .Loop_mul4x_1st_tail:
1062 mul x10,x6,x24 // lo(a[4..7]*b[i])
1063 adc x0,x0,xzr // modulo-scheduled
1064 mul x11,x7,x24
1065 add x28,x28,#8
1066 mul x12,x8,x24
1067 and x28,x28,#31
1068 mul x13,x9,x24
1069 adds x19,x19,x10
1070 umulh x10,x6,x24 // hi(a[4..7]*b[i])
1071 adcs x20,x20,x11
1072 umulh x11,x7,x24
1073 adcs x21,x21,x12
1074 umulh x12,x8,x24
1075 adcs x22,x22,x13
1076 umulh x13,x9,x24
1077 adc x23,xzr,xzr
1078 ldr x24,[x2,x28] // next b[i] (or b[0])
1079 adds x20,x20,x10
1080 mul x10,x14,x25 // lo(n[4..7]*a[0]*n0)
1081 adcs x21,x21,x11
1082 mul x11,x15,x25
1083 adcs x22,x22,x12
1084 mul x12,x16,x25
1085 adc x23,x23,x13 // can't overflow
1086 mul x13,x17,x25
1087 adds x19,x19,x10
1088 umulh x10,x14,x25 // hi(n[4..7]*a[0]*n0)
1089 adcs x20,x20,x11
1090 umulh x11,x15,x25
1091 adcs x21,x21,x12
1092 umulh x12,x16,x25
1093 adcs x22,x22,x13
1094 adcs x23,x23,x0
1095 umulh x13,x17,x25
1096 adc x0,xzr,xzr
1097 ldr x25,[sp,x28] // next t[0]*n0
1098 str x19,[x26],#8 // result!!!
1099 adds x19,x20,x10
1100 sub x10,x27,x1 // done yet?
1101 adcs x20,x21,x11
1102 adcs x21,x22,x12
1103 adcs x22,x23,x13
1104 //adc x0,x0,xzr
1105 cbnz x28,.Loop_mul4x_1st_tail
1106
1107 sub x11,x27,x5 // rewinded x1
1108 cbz x10,.Lmul4x_proceed
1109
1110 ldp x6,x7,[x1,#8*0]
1111 ldp x8,x9,[x1,#8*2]
1112 add x1,x1,#8*4
1113 ldp x14,x15,[x3,#8*0]
1114 ldp x16,x17,[x3,#8*2]
1115 add x3,x3,#8*4
1116 b .Loop_mul4x_1st_tail
1117
1118 .align 5
1119 .Lmul4x_proceed:
1120 ldr x24,[x2,#8*4]! // *++b
1121 adc x30,x0,xzr
1122 ldp x6,x7,[x11,#8*0] // a[0..3]
1123 sub x3,x3,x5 // rewind np
1124 ldp x8,x9,[x11,#8*2]
1125 add x1,x11,#8*4
1126
1127 stp x19,x20,[x26,#8*0] // result!!!
1128 ldp x19,x20,[sp,#8*4] // t[0..3]
1129 stp x21,x22,[x26,#8*2] // result!!!
1130 ldp x21,x22,[sp,#8*6]
1131
1132 ldp x14,x15,[x3,#8*0] // n[0..3]
1133 mov x26,sp
1134 ldp x16,x17,[x3,#8*2]
1135 adds x3,x3,#8*4 // clear carry bit
1136 mov x0,xzr
1137
1138 .align 4
1139 .Loop_mul4x_reduction:
1140 mul x10,x6,x24 // lo(a[0..3]*b[4])
1141 adc x0,x0,xzr // modulo-scheduled
1142 mul x11,x7,x24
1143 add x28,x28,#8
1144 mul x12,x8,x24
1145 and x28,x28,#31
1146 mul x13,x9,x24
1147 adds x19,x19,x10
1148 umulh x10,x6,x24 // hi(a[0..3]*b[4])
1149 adcs x20,x20,x11
1150 mul x25,x19,x4 // t[0]*n0
1151 adcs x21,x21,x12
1152 umulh x11,x7,x24
1153 adcs x22,x22,x13
1154 umulh x12,x8,x24
1155 adc x23,xzr,xzr
1156 umulh x13,x9,x24
1157 ldr x24,[x2,x28] // next b[i]
1158 adds x20,x20,x10
1159 // (*) mul x10,x14,x25
1160 str x25,[x26],#8 // put aside t[0]*n0 for tail processing
1161 adcs x21,x21,x11
1162 mul x11,x15,x25 // lo(n[0..3]*t[0]*n0
1163 adcs x22,x22,x12
1164 mul x12,x16,x25
1165 adc x23,x23,x13 // can't overflow
1166 mul x13,x17,x25
1167 // (*) adds xzr,x19,x10
1168 subs xzr,x19,#1 // (*)
1169 umulh x10,x14,x25 // hi(n[0..3]*t[0]*n0
1170 adcs x19,x20,x11
1171 umulh x11,x15,x25
1172 adcs x20,x21,x12
1173 umulh x12,x16,x25
1174 adcs x21,x22,x13
1175 umulh x13,x17,x25
1176 adcs x22,x23,x0
1177 adc x0,xzr,xzr
1178 adds x19,x19,x10
1179 adcs x20,x20,x11
1180 adcs x21,x21,x12
1181 adcs x22,x22,x13
1182 //adc x0,x0,xzr
1183 cbnz x28,.Loop_mul4x_reduction
1184
1185 adc x0,x0,xzr
1186 ldp x10,x11,[x26,#8*4] // t[4..7]
1187 ldp x12,x13,[x26,#8*6]
1188 ldp x6,x7,[x1,#8*0] // a[4..7]
1189 ldp x8,x9,[x1,#8*2]
1190 add x1,x1,#8*4
1191 adds x19,x19,x10
1192 adcs x20,x20,x11
1193 adcs x21,x21,x12
1194 adcs x22,x22,x13
1195 //adc x0,x0,xzr
1196
1197 ldr x25,[sp] // t[0]*n0
1198 ldp x14,x15,[x3,#8*0] // n[4..7]
1199 ldp x16,x17,[x3,#8*2]
1200 add x3,x3,#8*4
1201
1202 .align 4
1203 .Loop_mul4x_tail:
1204 mul x10,x6,x24 // lo(a[4..7]*b[4])
1205 adc x0,x0,xzr // modulo-scheduled
1206 mul x11,x7,x24
1207 add x28,x28,#8
1208 mul x12,x8,x24
1209 and x28,x28,#31
1210 mul x13,x9,x24
1211 adds x19,x19,x10
1212 umulh x10,x6,x24 // hi(a[4..7]*b[4])
1213 adcs x20,x20,x11
1214 umulh x11,x7,x24
1215 adcs x21,x21,x12
1216 umulh x12,x8,x24
1217 adcs x22,x22,x13
1218 umulh x13,x9,x24
1219 adc x23,xzr,xzr
1220 ldr x24,[x2,x28] // next b[i]
1221 adds x20,x20,x10
1222 mul x10,x14,x25 // lo(n[4..7]*t[0]*n0)
1223 adcs x21,x21,x11
1224 mul x11,x15,x25
1225 adcs x22,x22,x12
1226 mul x12,x16,x25
1227 adc x23,x23,x13 // can't overflow
1228 mul x13,x17,x25
1229 adds x19,x19,x10
1230 umulh x10,x14,x25 // hi(n[4..7]*t[0]*n0)
1231 adcs x20,x20,x11
1232 umulh x11,x15,x25
1233 adcs x21,x21,x12
1234 umulh x12,x16,x25
1235 adcs x22,x22,x13
1236 umulh x13,x17,x25
1237 adcs x23,x23,x0
1238 ldr x25,[sp,x28] // next a[0]*n0
1239 adc x0,xzr,xzr
1240 str x19,[x26],#8 // result!!!
1241 adds x19,x20,x10
1242 sub x10,x27,x1 // done yet?
1243 adcs x20,x21,x11
1244 adcs x21,x22,x12
1245 adcs x22,x23,x13
1246 //adc x0,x0,xzr
1247 cbnz x28,.Loop_mul4x_tail
1248
1249 sub x11,x3,x5 // rewinded np?
1250 adc x0,x0,xzr
1251 cbz x10,.Loop_mul4x_break
1252
1253 ldp x10,x11,[x26,#8*4]
1254 ldp x12,x13,[x26,#8*6]
1255 ldp x6,x7,[x1,#8*0]
1256 ldp x8,x9,[x1,#8*2]
1257 add x1,x1,#8*4
1258 adds x19,x19,x10
1259 adcs x20,x20,x11
1260 adcs x21,x21,x12
1261 adcs x22,x22,x13
1262 //adc x0,x0,xzr
1263 ldp x14,x15,[x3,#8*0]
1264 ldp x16,x17,[x3,#8*2]
1265 add x3,x3,#8*4
1266 b .Loop_mul4x_tail
1267
1268 .align 4
1269 .Loop_mul4x_break:
1270 ldp x12,x13,[x29,#96] // pull rp and &b[num]
1271 adds x19,x19,x30
1272 add x2,x2,#8*4 // bp++
1273 adcs x20,x20,xzr
1274 sub x1,x1,x5 // rewind ap
1275 adcs x21,x21,xzr
1276 stp x19,x20,[x26,#8*0] // result!!!
1277 adcs x22,x22,xzr
1278 ldp x19,x20,[sp,#8*4] // t[0..3]
1279 adc x30,x0,xzr
1280 stp x21,x22,[x26,#8*2] // result!!!
1281 cmp x2,x13 // done yet?
1282 ldp x21,x22,[sp,#8*6]
1283 ldp x14,x15,[x11,#8*0] // n[0..3]
1284 ldp x16,x17,[x11,#8*2]
1285 add x3,x11,#8*4
1286 b.eq .Lmul4x_post
1287
1288 ldr x24,[x2]
1289 ldp x6,x7,[x1,#8*0] // a[0..3]
1290 ldp x8,x9,[x1,#8*2]
1291 adds x1,x1,#8*4 // clear carry bit
1292 mov x0,xzr
1293 mov x26,sp
1294 b .Loop_mul4x_reduction
1295
1296 .align 4
1297 .Lmul4x_post:
1298 // Final step. We see if result is larger than modulus, and
1299 // if it is, subtract the modulus. But comparison implies
1300 // subtraction. So we subtract modulus, see if it borrowed,
1301 // and conditionally copy original value.
1302 mov x0,x12
1303 mov x27,x12 // x0 copy
1304 subs x10,x19,x14
1305 add x26,sp,#8*8
1306 sbcs x11,x20,x15
1307 sub x28,x5,#8*4
1308
1309 .Lmul4x_sub:
1310 sbcs x12,x21,x16
1311 ldp x14,x15,[x3,#8*0]
1312 sub x28,x28,#8*4
1313 ldp x19,x20,[x26,#8*0]
1314 sbcs x13,x22,x17
1315 ldp x16,x17,[x3,#8*2]
1316 add x3,x3,#8*4
1317 ldp x21,x22,[x26,#8*2]
1318 add x26,x26,#8*4
1319 stp x10,x11,[x0,#8*0]
1320 sbcs x10,x19,x14
1321 stp x12,x13,[x0,#8*2]
1322 add x0,x0,#8*4
1323 sbcs x11,x20,x15
1324 cbnz x28,.Lmul4x_sub
1325
1326 sbcs x12,x21,x16
1327 mov x26,sp
1328 add x1,sp,#8*4
1329 ldp x6,x7,[x27,#8*0]
1330 sbcs x13,x22,x17
1331 stp x10,x11,[x0,#8*0]
1332 ldp x8,x9,[x27,#8*2]
1333 stp x12,x13,[x0,#8*2]
1334 ldp x19,x20,[x1,#8*0]
1335 ldp x21,x22,[x1,#8*2]
1336 sbcs xzr,x30,xzr // did it borrow?
1337 ldr x30,[x29,#8] // pull return address
1338
1339 sub x28,x5,#8*4
1340 .Lmul4x_cond_copy:
1341 sub x28,x28,#8*4
1342 csel x10,x19,x6,lo
1343 stp xzr,xzr,[x26,#8*0]
1344 csel x11,x20,x7,lo
1345 ldp x6,x7,[x27,#8*4]
1346 ldp x19,x20,[x1,#8*4]
1347 csel x12,x21,x8,lo
1348 stp xzr,xzr,[x26,#8*2]
1349 add x26,x26,#8*4
1350 csel x13,x22,x9,lo
1351 ldp x8,x9,[x27,#8*6]
1352 ldp x21,x22,[x1,#8*6]
1353 add x1,x1,#8*4
1354 stp x10,x11,[x27,#8*0]
1355 stp x12,x13,[x27,#8*2]
1356 add x27,x27,#8*4
1357 cbnz x28,.Lmul4x_cond_copy
1358
1359 csel x10,x19,x6,lo
1360 stp xzr,xzr,[x26,#8*0]
1361 csel x11,x20,x7,lo
1362 stp xzr,xzr,[x26,#8*2]
1363 csel x12,x21,x8,lo
1364 stp xzr,xzr,[x26,#8*3]
1365 csel x13,x22,x9,lo
1366 stp xzr,xzr,[x26,#8*4]
1367 stp x10,x11,[x27,#8*0]
1368 stp x12,x13,[x27,#8*2]
1369
1370 b .Lmul4x_done
1371
1372 .align 4
1373 .Lmul4x4_post_condition:
1374 adc x0,x0,xzr
1375 ldr x1,[x29,#96] // pull rp
1376 // x19-3,x0 hold result, x14-7 hold modulus
1377 subs x6,x19,x14
1378 ldr x30,[x29,#8] // pull return address
1379 sbcs x7,x20,x15
1380 stp xzr,xzr,[sp,#8*0]
1381 sbcs x8,x21,x16
1382 stp xzr,xzr,[sp,#8*2]
1383 sbcs x9,x22,x17
1384 stp xzr,xzr,[sp,#8*4]
1385 sbcs xzr,x0,xzr // did it borrow?
1386 stp xzr,xzr,[sp,#8*6]
1387
1388 // x6-3 hold result-modulus
1389 csel x6,x19,x6,lo
1390 csel x7,x20,x7,lo
1391 csel x8,x21,x8,lo
1392 csel x9,x22,x9,lo
1393 stp x6,x7,[x1,#8*0]
1394 stp x8,x9,[x1,#8*2]
1395
1396 .Lmul4x_done:
1397 ldp x19,x20,[x29,#16]
1398 mov sp,x29
1399 ldp x21,x22,[x29,#32]
1400 mov x0,#1
1401 ldp x23,x24,[x29,#48]
1402 ldp x25,x26,[x29,#64]
1403 ldp x27,x28,[x29,#80]
1404 ldr x29,[sp],#128
1405 .inst 0xd50323bf // autiasp
1406 ret
1407 .size __bn_mul4x_mont,.-__bn_mul4x_mont
1408 .byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
1409 .align 2
1410 .align 4
// (removed source-browser scrape footer: "Cache object: 86bd6007e16e5b537367233db5289c37")