/* $FreeBSD$ */
/* Do not modify. This file is auto-generated from x86_64-mont.pl. */
.text


.globl	bn_mul_mont
.type	bn_mul_mont,@function
.align	16
#
# int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
#                 const BN_ULONG *np, const BN_ULONG *n0p, int num)
# SysV AMD64: rdi=rp, rsi=ap, rdx=bp, rcx=np, r8=n0p, r9d=num.
# Entry point also dispatches to the unrolled variants below when num
# qualifies; this body is the generic word-by-word Montgomery multiply.
# Always returns 1 in rax.
bn_mul_mont:
.cfi_startproc
	movl	%r9d,%r9d		# zero-extend num to 64 bits
	movq	%rsp,%rax		# rax = caller's rsp (restored on exit)
.cfi_def_cfa_register	%rax
	testl	$3,%r9d
	jnz	.Lmul_enter		# num not a multiple of 4 -> generic path
	cmpl	$8,%r9d
	jb	.Lmul_enter		# num < 8 -> generic path
	movl	OPENSSL_ia32cap_P+8(%rip),%r11d	# capability word, tested in .Lmul4x_enter
	cmpq	%rsi,%rdx
	jne	.Lmul4x_enter		# ap != bp -> 4-way multiply
	testl	$7,%r9d
	jz	.Lsqr8x_enter		# ap == bp and 8 | num -> dedicated squaring
	jmp	.Lmul4x_enter

.align	16
.Lmul_enter:
	pushq	%rbx			# save callee-saved registers
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56

	# Carve a scratch vector tp[] of num+2 qwords below rsp,
	# 1024-byte aligned (r10 = new stack bottom).
	negq	%r9
	movq	%rsp,%r11
	leaq	-16(%rsp,%r9,8),%r10
	negq	%r9
	andq	$-1024,%r10

	# Move rsp down one page at a time, touching each page, so the
	# OS guard page is hit in order even for very large num.
	subq	%r10,%r11
	andq	$-4096,%r11
	leaq	(%r10,%r11,1),%rsp
	movq	(%rsp),%r11		# probe current page
	cmpq	%r10,%rsp
	ja	.Lmul_page_walk
	jmp	.Lmul_page_walk_done

.align	16
.Lmul_page_walk:
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r11		# touch next page
	cmpq	%r10,%rsp
	ja	.Lmul_page_walk
.Lmul_page_walk_done:

	movq	%rax,8(%rsp,%r9,8)	# stash caller's rsp just above tp[num]
.cfi_escape	0x0f,0x0a,0x77,0x08,0x79,0x00,0x38,0x1e,0x22,0x06,0x23,0x08
.Lmul_body:
	# Register roles below:
	#   r12 = bp, rbx = bp[i], r8 = n0, rbp = m (Montgomery factor),
	#   r9 = num, r14 = outer index i, r15 = inner index j,
	#   (%rsp).. = tp[] running accumulator.
	movq	%rdx,%r12
	movq	(%r8),%r8		# r8 = n0 = *n0p
	movq	(%r12),%rbx		# rbx = bp[0]
	movq	(%rsi),%rax		# rax = ap[0]

	xorq	%r14,%r14		# i = 0
	xorq	%r15,%r15		# j = 0

	movq	%r8,%rbp
	mulq	%rbx			# ap[0] * bp[0]
	movq	%rax,%r10
	movq	(%rcx),%rax		# rax = np[0]

	imulq	%r10,%rbp		# m = lo64(tp[0] * n0)
	movq	%rdx,%r11

	mulq	%rbp			# np[0] * m; low half cancels tp[0]
	addq	%rax,%r10
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	%rdx,%r13

	leaq	1(%r15),%r15		# j = 1
	jmp	.L1st_enter

.align	16
.L1st:					# first outer round: tp[] = ap*bp[0] + np*m
	addq	%rax,%r13		# fold in np[j]*m (low)
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%r13		# fold in previous ap[j]*bp[0] (low)
	movq	%r10,%r11
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)	# tp[j-1]
	movq	%rdx,%r13

.L1st_enter:
	mulq	%rbx			# ap[j] * bp[0]
	addq	%rax,%r11
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	leaq	1(%r15),%r15		# j++
	movq	%rdx,%r10

	mulq	%rbp			# np[j] * m
	cmpq	%r9,%r15
	jne	.L1st

	addq	%rax,%r13		# epilogue of the loop body for j = num-1
	movq	(%rsi),%rax		# preload ap[0] for next outer round
	adcq	$0,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13
	movq	%r10,%r11

	xorq	%rdx,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r9,8)	# tp[num-1]
	movq	%rdx,(%rsp,%r9,8)	# tp[num] = carry-out word

	leaq	1(%r14),%r14		# i = 1
	jmp	.Louter
.align	16
.Louter:				# rounds i = 1..num-1: tp += ap*bp[i] + np*m
	movq	(%r12,%r14,8),%rbx	# rbx = bp[i]
	xorq	%r15,%r15		# j = 0
	movq	%r8,%rbp
	movq	(%rsp),%r10		# tp[0]
	mulq	%rbx			# ap[0] * bp[i]
	addq	%rax,%r10
	movq	(%rcx),%rax
	adcq	$0,%rdx

	imulq	%r10,%rbp		# m = lo64(tp[0] * n0)
	movq	%rdx,%r11

	mulq	%rbp			# np[0] * m annihilates tp[0]
	addq	%rax,%r10
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	8(%rsp),%r10		# tp[1]
	movq	%rdx,%r13

	leaq	1(%r15),%r15		# j = 1
	jmp	.Linner_enter

.align	16
.Linner:
	addq	%rax,%r13		# fold in np[j]*m (low)
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13		# fold in tp[j] + ap[j]*bp[i]
	movq	(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)	# tp[j-1]
	movq	%rdx,%r13

.Linner_enter:
	mulq	%rbx			# ap[j] * bp[i]
	addq	%rax,%r11
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%r10		# accumulate into tp[j]
	movq	%rdx,%r11
	adcq	$0,%r11
	leaq	1(%r15),%r15		# j++

	mulq	%rbp			# np[j] * m
	cmpq	%r9,%r15
	jne	.Linner

	addq	%rax,%r13		# epilogue for j = num-1
	movq	(%rsi),%rax		# preload ap[0]
	adcq	$0,%rdx
	addq	%r10,%r13
	movq	(%rsp,%r15,8),%r10	# previous round's carry word tp[num]
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13

	xorq	%rdx,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	addq	%r10,%r13		# fold in tp[num]
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r9,8)	# tp[num-1]
	movq	%rdx,(%rsp,%r9,8)	# tp[num] = new carry word

	leaq	1(%r14),%r14		# i++
	cmpq	%r9,%r14
	jb	.Louter

	# Final reduction: write tp - np into rp[], then constant-time
	# select between tp and the difference based on the borrow.
	xorq	%r14,%r14		# index = 0; also clears CF for first sbb
	movq	(%rsp),%rax		# rax = tp[0]
	movq	%r9,%r15		# loop counter = num

.align	16
.Lsub:	sbbq	(%rcx,%r14,8),%rax	# rp[j] = tp[j] - np[j] - borrow
	movq	%rax,(%rdi,%r14,8)
	movq	8(%rsp,%r14,8),%rax	# next tp word
	leaq	1(%r14),%r14
	decq	%r15
	jnz	.Lsub

	sbbq	$0,%rax			# subtract borrow from tp[num]: 0 or -1
	movq	$-1,%rbx
	xorq	%rax,%rbx		# rax = mask (keep tp), rbx = ~mask (keep diff)
	xorq	%r14,%r14
	movq	%r9,%r15		# loop counter = num

.Lcopy:					# constant-time select; also wipes tp[]
	movq	(%rdi,%r14,8),%rcx	# difference word (tp - np)
	movq	(%rsp,%r14,8),%rdx	# tp word
	andq	%rbx,%rcx
	andq	%rax,%rdx
	movq	%r9,(%rsp,%r14,8)	# scrub scratch (overwrite with num)
	orq	%rcx,%rdx
	movq	%rdx,(%rdi,%r14,8)
	leaq	1(%r14),%r14
	subq	$1,%r15
	jnz	.Lcopy

	movq	8(%rsp,%r9,8),%rsi	# recover caller's rsp
.cfi_def_cfa	%rsi,8
	movq	$1,%rax			# return value 1
	movq	-48(%rsi),%r15		# restore callee-saved registers
.cfi_restore	%r15
	movq	-40(%rsi),%r14
.cfi_restore	%r14
	movq	-32(%rsi),%r13
.cfi_restore	%r13
	movq	-24(%rsi),%r12
.cfi_restore	%r12
	movq	-16(%rsi),%rbp
.cfi_restore	%rbp
	movq	-8(%rsi),%rbx
.cfi_restore	%rbx
	leaq	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lmul_epilogue:
	.byte	0xf3,0xc3		# rep ret (branch-predictor-friendly return)
.cfi_endproc
.size	bn_mul_mont,.-bn_mul_mont
.type	bn_mul4x_mont,@function
.align	16
#
# Same contract as bn_mul_mont (rdi=rp, rsi=ap, rdx=bp, rcx=np, r8=n0p,
# r9d=num), but the inner loops handle four words per iteration.
# Entered at .Lmul4x_enter with r11d = OPENSSL_ia32cap_P word 2; if both
# BMI2 and ADX bits (0x80100) are set it tail-dispatches to bn_mulx4x_mont.
bn_mul4x_mont:
.cfi_startproc
	movl	%r9d,%r9d		# zero-extend num
	movq	%rsp,%rax		# rax = caller's rsp
.cfi_def_cfa_register	%rax
.Lmul4x_enter:
	andl	$0x80100,%r11d		# BMI2 | ADX feature bits
	cmpl	$0x80100,%r11d
	je	.Lmulx4x_enter		# use MULX/ADX code when available
	pushq	%rbx			# save callee-saved registers
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56

	# Scratch frame: tp[] plus saved rsp and saved rp, 1024-byte aligned.
	negq	%r9
	movq	%rsp,%r11
	leaq	-32(%rsp,%r9,8),%r10
	negq	%r9
	andq	$-1024,%r10

	subq	%r10,%r11
	andq	$-4096,%r11
	leaq	(%r10,%r11,1),%rsp
	movq	(%rsp),%r11		# probe page
	cmpq	%r10,%rsp
	ja	.Lmul4x_page_walk
	jmp	.Lmul4x_page_walk_done

.Lmul4x_page_walk:			# touch pages downward (stack guard)
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r11
	cmpq	%r10,%rsp
	ja	.Lmul4x_page_walk
.Lmul4x_page_walk_done:

	movq	%rax,8(%rsp,%r9,8)	# save caller's rsp above tp[num]
.cfi_escape	0x0f,0x0a,0x77,0x08,0x79,0x00,0x38,0x1e,0x22,0x06,0x23,0x08
.Lmul4x_body:
	# rdi is reused as a scratch accumulator below, so park rp first.
	#   r12 = bp, rbx = bp[i], r8 = n0, rbp = m, r14 = i, r15 = j.
	movq	%rdi,16(%rsp,%r9,8)	# save rp
	movq	%rdx,%r12
	movq	(%r8),%r8		# n0
	movq	(%r12),%rbx		# bp[0]
	movq	(%rsi),%rax		# ap[0]

	xorq	%r14,%r14		# i = 0
	xorq	%r15,%r15		# j = 0

	movq	%r8,%rbp
	mulq	%rbx			# ap[0] * bp[0]
	movq	%rax,%r10
	movq	(%rcx),%rax

	imulq	%r10,%rbp		# m = lo64(tp[0] * n0)
	movq	%rdx,%r11

	mulq	%rbp			# np[0] * m cancels tp[0]
	addq	%rax,%r10
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	%rdx,%rdi

	mulq	%rbx			# ap[1] * bp[0]
	addq	%rax,%r11
	movq	8(%rcx),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp			# np[1] * m
	addq	%rax,%rdi
	movq	16(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	leaq	4(%r15),%r15		# j = 4
	adcq	$0,%rdx
	movq	%rdi,(%rsp)		# tp[0]... wait: first stored word
	movq	%rdx,%r13
	jmp	.L1st4x
.align	16
.L1st4x:				# first outer round, 4 words per pass
	mulq	%rbx			# ap[j-2] * bp[0]
	addq	%rax,%r10
	movq	-16(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp			# np[j-2] * m
	addq	%rax,%r13
	movq	-8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%rsp,%r15,8)	# tp[j-3]
	movq	%rdx,%rdi

	mulq	%rbx			# ap[j-1] * bp[0]
	addq	%rax,%r11
	movq	-8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp			# np[j-1] * m
	addq	%rax,%rdi
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%rsp,%r15,8)	# tp[j-2]
	movq	%rdx,%r13

	mulq	%rbx			# ap[j] * bp[0]
	addq	%rax,%r10
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp			# np[j] * m
	addq	%rax,%r13
	movq	8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r15,8)	# tp[j-1]
	movq	%rdx,%rdi

	mulq	%rbx			# ap[j+1] * bp[0]
	addq	%rax,%r11
	movq	8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	leaq	4(%r15),%r15		# j += 4
	movq	%rdx,%r10

	mulq	%rbp			# np[j+1] * m
	addq	%rax,%rdi
	movq	-16(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-32(%rsp,%r15,8)	# tp[j]
	movq	%rdx,%r13
	cmpq	%r9,%r15
	jb	.L1st4x

	mulq	%rbx			# tail: last two word-pairs of round 0
	addq	%rax,%r10
	movq	-16(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi),%rax		# preload ap[0] for next outer round
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%rsp,%r15,8)
	movq	%rdx,%r13

	xorq	%rdi,%rdi
	addq	%r10,%r13
	adcq	$0,%rdi
	movq	%r13,-8(%rsp,%r15,8)	# tp[num-1]
	movq	%rdi,(%rsp,%r15,8)	# tp[num] = carry-out word

	leaq	1(%r14),%r14		# i = 1
.align	4
.Louter4x:				# rounds i = 1..num-1
	movq	(%r12,%r14,8),%rbx	# rbx = bp[i]
	xorq	%r15,%r15		# j = 0
	movq	(%rsp),%r10		# tp[0]
	movq	%r8,%rbp
	mulq	%rbx			# ap[0] * bp[i]
	addq	%rax,%r10
	movq	(%rcx),%rax
	adcq	$0,%rdx

	imulq	%r10,%rbp		# m = lo64(tp[0] * n0)
	movq	%rdx,%r11

	mulq	%rbp			# np[0] * m annihilates tp[0]
	addq	%rax,%r10
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	%rdx,%rdi

	mulq	%rbx			# ap[1] * bp[i]
	addq	%rax,%r11
	movq	8(%rcx),%rax
	adcq	$0,%rdx
	addq	8(%rsp),%r11		# + tp[1]
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp			# np[1] * m
	addq	%rax,%rdi
	movq	16(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	leaq	4(%r15),%r15		# j = 4
	adcq	$0,%rdx
	movq	%rdi,(%rsp)		# new tp[0]
	movq	%rdx,%r13
	jmp	.Linner4x
.align	16
.Linner4x:				# tp += ap*bp[i] + np*m, 4 words per pass
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	-16(%rsp,%r15,8),%r10	# + tp[j-2]
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%rsp,%r15,8)	# tp[j-3]
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	-8(%rsp,%r15,8),%r11	# + tp[j-1]
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%rsp,%r15,8)	# tp[j-2]
	movq	%rdx,%r13

	mulq	%rbx
	addq	%rax,%r10
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	(%rsp,%r15,8),%r10	# + tp[j]
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r15,8)	# tp[j-1]
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	8(%rsp,%r15,8),%r11	# + tp[j+1]
	adcq	$0,%rdx
	leaq	4(%r15),%r15		# j += 4
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	-16(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-32(%rsp,%r15,8)	# tp[j]
	movq	%rdx,%r13
	cmpq	%r9,%r15
	jb	.Linner4x

	mulq	%rbx			# tail: last two word-pairs of round i
	addq	%rax,%r10
	movq	-16(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	-16(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	-8(%rsp,%r15,8),%r11
	adcq	$0,%rdx
	leaq	1(%r14),%r14		# i++
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi),%rax		# preload ap[0]
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%rsp,%r15,8)
	movq	%rdx,%r13

	xorq	%rdi,%rdi
	addq	%r10,%r13
	adcq	$0,%rdi
	addq	(%rsp,%r9,8),%r13	# fold in previous carry word tp[num]
	adcq	$0,%rdi
	movq	%r13,-8(%rsp,%r15,8)	# tp[num-1]
	movq	%rdi,(%rsp,%r15,8)	# tp[num] = new carry word

	cmpq	%r9,%r14
	jb	.Louter4x
	# Final reduction: rp[] = tp - np (4 words/iteration), then
	# constant-time select between tp and the difference.
	movq	16(%rsp,%r9,8),%rdi	# recover rp
	leaq	-4(%r9),%r15
	movq	0(%rsp),%rax		# tp[0]
	movq	8(%rsp),%rdx		# tp[1]
	shrq	$2,%r15			# (num-4)/4 loop trips
	leaq	(%rsp),%rsi		# rsi = tp
	xorq	%r14,%r14		# index; also clears CF

	subq	0(%rcx),%rax
	movq	16(%rsi),%rbx
	movq	24(%rsi),%rbp
	sbbq	8(%rcx),%rdx

.Lsub4x:
	movq	%rax,0(%rdi,%r14,8)	# store difference words as they retire
	movq	%rdx,8(%rdi,%r14,8)
	sbbq	16(%rcx,%r14,8),%rbx
	movq	32(%rsi,%r14,8),%rax
	movq	40(%rsi,%r14,8),%rdx
	sbbq	24(%rcx,%r14,8),%rbp
	movq	%rbx,16(%rdi,%r14,8)
	movq	%rbp,24(%rdi,%r14,8)
	sbbq	32(%rcx,%r14,8),%rax
	movq	48(%rsi,%r14,8),%rbx
	movq	56(%rsi,%r14,8),%rbp
	sbbq	40(%rcx,%r14,8),%rdx
	leaq	4(%r14),%r14
	decq	%r15
	jnz	.Lsub4x

	movq	%rax,0(%rdi,%r14,8)	# drain the software pipeline
	movq	32(%rsi,%r14,8),%rax	# rax = tp[num] (carry word)
	sbbq	16(%rcx,%r14,8),%rbx
	movq	%rdx,8(%rdi,%r14,8)
	sbbq	24(%rcx,%r14,8),%rbp
	movq	%rbx,16(%rdi,%r14,8)

	sbbq	$0,%rax			# rax = 0 (no borrow) or -1 (borrow)
	movq	%rbp,24(%rdi,%r14,8)
	pxor	%xmm0,%xmm0		# zero for wiping tp[]
	.byte	102,72,15,110,224	# movq %rax,%xmm4 (select mask)
	pcmpeqd	%xmm5,%xmm5		# xmm5 = all ones
	pshufd	$0,%xmm4,%xmm4		# broadcast mask
	movq	%r9,%r15
	pxor	%xmm4,%xmm5		# xmm5 = ~mask
	shrq	$2,%r15			# num/4 trips, 32 bytes each
	xorl	%eax,%eax		# byte offset

	jmp	.Lcopy4x
.align	16
.Lcopy4x:				# constant-time select + scrub tp[]
	movdqa	(%rsp,%rax,1),%xmm1	# tp words
	movdqu	(%rdi,%rax,1),%xmm2	# difference words
	pand	%xmm4,%xmm1		# keep tp when borrow occurred
	pand	%xmm5,%xmm2		# keep difference otherwise
	movdqa	16(%rsp,%rax,1),%xmm3
	movdqa	%xmm0,(%rsp,%rax,1)	# wipe scratch
	por	%xmm2,%xmm1
	movdqu	16(%rdi,%rax,1),%xmm2
	movdqu	%xmm1,(%rdi,%rax,1)
	pand	%xmm4,%xmm3
	pand	%xmm5,%xmm2
	movdqa	%xmm0,16(%rsp,%rax,1)	# wipe scratch
	por	%xmm2,%xmm3
	movdqu	%xmm3,16(%rdi,%rax,1)
	leaq	32(%rax),%rax
	decq	%r15
	jnz	.Lcopy4x
	movq	8(%rsp,%r9,8),%rsi	# recover caller's rsp
.cfi_def_cfa	%rsi, 8
	movq	$1,%rax			# return value 1
	movq	-48(%rsi),%r15		# restore callee-saved registers
.cfi_restore	%r15
	movq	-40(%rsi),%r14
.cfi_restore	%r14
	movq	-32(%rsi),%r13
.cfi_restore	%r13
	movq	-24(%rsi),%r12
.cfi_restore	%r12
	movq	-16(%rsi),%rbp
.cfi_restore	%rbp
	movq	-8(%rsi),%rbx
.cfi_restore	%rbx
	leaq	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lmul4x_epilogue:
	.byte	0xf3,0xc3		# rep ret
.cfi_endproc
.size	bn_mul4x_mont,.-bn_mul4x_mont
694
695
696
.type	bn_sqr8x_mont,@function
.align	32
#
# Squaring path (ap == bp, 8 | num), same argument registers as
# bn_mul_mont.  Allocates a 2*num-qword frame, calls the out-of-line
# squaring kernel (MULX/ADX variant when available), then performs the
# final subtraction and constant-time copy here.
bn_sqr8x_mont:
.cfi_startproc
	movq	%rsp,%rax		# rax = caller's rsp
.cfi_def_cfa_register	%rax
.Lsqr8x_enter:
	pushq	%rbx			# save callee-saved registers
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56
.Lsqr8x_prologue:

	movl	%r9d,%r10d
	shll	$3,%r9d			# r9 = num in bytes
	shlq	$3+2,%r10		# r10 = num*32
	negq	%r9			# r9 = -num*8

	# Choose the frame base so its distance (mod 4096) from ap avoids
	# a bad relative alignment — presumably a cache/page-aliasing
	# avoidance heuristic; TODO confirm against the perlasm source.
	leaq	-64(%rsp,%r9,2),%r11	# candidate: 2*num qwords + 64
	movq	%rsp,%rbp
	movq	(%r8),%r8		# n0
	subq	%rsi,%r11
	andq	$4095,%r11		# (frame - ap) mod 4096
	cmpq	%r11,%r10
	jb	.Lsqr8x_sp_alt
	subq	%r11,%rbp		# nudge frame down by the aliasing distance
	leaq	-64(%rbp,%r9,2),%rbp
	jmp	.Lsqr8x_sp_done

.align	32
.Lsqr8x_sp_alt:				# small num: clamp the adjustment
	leaq	4096-64(,%r9,2),%r10
	leaq	-64(%rbp,%r9,2),%rbp
	subq	%r10,%r11
	movq	$0,%r10
	cmovcq	%r10,%r11
	subq	%r11,%rbp
.Lsqr8x_sp_done:
	andq	$-64,%rbp		# 64-byte align the frame
	movq	%rsp,%r11
	subq	%rbp,%r11
	andq	$-4096,%r11
	leaq	(%r11,%rbp,1),%rsp
	movq	(%rsp),%r10		# probe page
	cmpq	%rbp,%rsp
	ja	.Lsqr8x_page_walk
	jmp	.Lsqr8x_page_walk_done

.align	16
.Lsqr8x_page_walk:			# touch pages downward (stack guard)
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lsqr8x_page_walk
.Lsqr8x_page_walk_done:

	movq	%r9,%r10		# r10 = -num*8
	negq	%r9			# r9 = num*8

	movq	%r8,32(%rsp)		# 32(%rsp) = n0
	movq	%rax,40(%rsp)		# 40(%rsp) = caller's rsp
.cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
.Lsqr8x_body:

	# Park pointers in xmm regs so the internal kernel can clobber GPRs.
	.byte	102,72,15,110,209	# movq %rcx,%xmm2 (np)
	pxor	%xmm0,%xmm0
	.byte	102,72,15,110,207	# movq %rdi,%xmm1 (rp)
	.byte	102,73,15,110,218	# movq %r10,%xmm3 (-num*8)
	movl	OPENSSL_ia32cap_P+8(%rip),%eax
	andl	$0x80100,%eax		# BMI2 | ADX
	cmpl	$0x80100,%eax
	jne	.Lsqr8x_nox

	call	bn_sqrx8x_internal	# MULX/ADX squaring kernel




	# Register state here (rcx, r8, rbp) is defined by the contract of
	# bn_sqrx8x_internal (not visible in this file).
	leaq	(%r8,%rcx,1),%rbx	# rbx = top half of the product
	movq	%rcx,%r9
	movq	%rcx,%rdx
	.byte	102,72,15,126,207	# movq %xmm1,%rdi (recover rp)
	sarq	$3+2,%rcx		# negative trip count, 32 bytes/iteration
	jmp	.Lsqr8x_sub

.align	32
.Lsqr8x_nox:
	call	bn_sqr8x_internal	# classic MUL-based squaring kernel




	leaq	(%rdi,%r9,1),%rbx	# rbx = top half of the product
	movq	%r9,%rcx
	movq	%r9,%rdx
	.byte	102,72,15,126,207	# movq %xmm1,%rdi (recover rp)
	sarq	$3+2,%rcx		# negative trip count, 32 bytes/iteration
	jmp	.Lsqr8x_sub

.align	32
.Lsqr8x_sub:				# rp[] = top - np, 4 words per pass
	movq	0(%rbx),%r12
	movq	8(%rbx),%r13
	movq	16(%rbx),%r14
	movq	24(%rbx),%r15
	leaq	32(%rbx),%rbx
	sbbq	0(%rbp),%r12		# rbp tracks np here (set up by the kernel)
	sbbq	8(%rbp),%r13
	sbbq	16(%rbp),%r14
	sbbq	24(%rbp),%r15
	leaq	32(%rbp),%rbp
	movq	%r12,0(%rdi)
	movq	%r13,8(%rdi)
	movq	%r14,16(%rdi)
	movq	%r15,24(%rdi)
	leaq	32(%rdi),%rdi
	incq	%rcx
	jnz	.Lsqr8x_sub

	sbbq	$0,%rax			# rax -> 0 (no borrow) or -1 (borrow)
	leaq	(%rbx,%r9,1),%rbx	# rewind to start of top half
	leaq	(%rdi,%r9,1),%rdi	# rewind rp

	.byte	102,72,15,110,200	# movq %rax,%xmm1 (select mask)
	pxor	%xmm0,%xmm0
	pshufd	$0,%xmm1,%xmm1		# broadcast mask
	movq	40(%rsp),%rsi		# caller's rsp (frame no longer needed)
.cfi_def_cfa	%rsi,8
	jmp	.Lsqr8x_cond_copy

.align	32
.Lsqr8x_cond_copy:			# constant-time select + wipe both halves
	movdqa	0(%rbx),%xmm2		# unreduced top words
	movdqa	16(%rbx),%xmm3
	leaq	32(%rbx),%rbx
	movdqu	0(%rdi),%xmm4		# difference words
	movdqu	16(%rdi),%xmm5
	leaq	32(%rdi),%rdi
	movdqa	%xmm0,-32(%rbx)		# wipe lower-half scratch
	movdqa	%xmm0,-16(%rbx)
	movdqa	%xmm0,-32(%rbx,%rdx,1)	# wipe upper-half scratch
	movdqa	%xmm0,-16(%rbx,%rdx,1)
	pcmpeqd	%xmm1,%xmm0		# xmm0 = ~mask (all-ones iff mask == 0)
	pand	%xmm1,%xmm2		# keep unreduced value when borrow occurred
	pand	%xmm1,%xmm3
	pand	%xmm0,%xmm4		# keep difference otherwise
	pand	%xmm0,%xmm5
	pxor	%xmm0,%xmm0
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqu	%xmm4,-32(%rdi)
	movdqu	%xmm5,-16(%rdi)
	addq	$32,%r9			# r9 runs from -num*8 up to 0
	jnz	.Lsqr8x_cond_copy

	movq	$1,%rax			# return value 1
	movq	-48(%rsi),%r15		# restore callee-saved registers
.cfi_restore	%r15
	movq	-40(%rsi),%r14
.cfi_restore	%r14
	movq	-32(%rsi),%r13
.cfi_restore	%r13
	movq	-24(%rsi),%r12
.cfi_restore	%r12
	movq	-16(%rsi),%rbp
.cfi_restore	%rbp
	movq	-8(%rsi),%rbx
.cfi_restore	%rbx
	leaq	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lsqr8x_epilogue:
	.byte	0xf3,0xc3		# rep ret
.cfi_endproc
.size	bn_sqr8x_mont,.-bn_sqr8x_mont
.type	bn_mulx4x_mont,@function
.align	32
#
# BMI2/ADX variant of the 4-way Montgomery multiply (reached from
# .Lmul4x_enter when MULX and ADCX/ADOX are available).  Same argument
# registers as bn_mul_mont.  Uses two independent carry chains (CF via
# adcx, OF via adox) interleaved through each 4-word pass.
#
# Frame layout (after setup):
#    0(%rsp) = num*8        8(%rsp) = bp cursor    16(%rsp) = &bp[num]
#   24(%rsp) = n0          32(%rsp) = rp           40(%rsp) = caller rsp
#   48(%rsp) = inner-loop trip count               64(%rsp).. = tp[]
bn_mulx4x_mont:
.cfi_startproc
	movq	%rsp,%rax		# rax = caller's rsp
.cfi_def_cfa_register	%rax
.Lmulx4x_enter:
	pushq	%rbx			# save callee-saved registers
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56
.Lmulx4x_prologue:

	shll	$3,%r9d			# r9 = num in bytes
	xorq	%r10,%r10
	subq	%r9,%r10		# r10 = -num*8
	movq	(%r8),%r8		# n0
	leaq	-72(%rsp,%r10,1),%rbp	# frame: tp[] + 72-byte header
	andq	$-128,%rbp		# 128-byte align
	movq	%rsp,%r11
	subq	%rbp,%r11
	andq	$-4096,%r11
	leaq	(%r11,%rbp,1),%rsp
	movq	(%rsp),%r10		# probe page
	cmpq	%rbp,%rsp
	ja	.Lmulx4x_page_walk
	jmp	.Lmulx4x_page_walk_done

.align	16
.Lmulx4x_page_walk:			# touch pages downward (stack guard)
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lmulx4x_page_walk
.Lmulx4x_page_walk_done:

	leaq	(%rdx,%r9,1),%r10	# r10 = &bp[num] (end sentinel)

	movq	%r9,0(%rsp)		# num*8
	shrq	$5,%r9
	movq	%r10,16(%rsp)		# end of bp
	subq	$1,%r9
	movq	%r8,24(%rsp)		# n0
	movq	%rdi,32(%rsp)		# rp
	movq	%rax,40(%rsp)		# caller's rsp
.cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
	movq	%r9,48(%rsp)		# inner-loop trip count = num/4 - 1
	jmp	.Lmulx4x_body

.align	32
.Lmulx4x_body:
	# First outer round: tp = ap*bp[0] + np*m.
	#   rdx = multiplicand for mulx; r9 = saved bp[i]; r8 = m;
	#   rbx = tp write cursor; rbp = 0 (constant for carry folding).
	leaq	8(%rdx),%rdi		# rdi = bp cursor (next element)
	movq	(%rdx),%rdx		# rdx = bp[0]
	leaq	64+32(%rsp),%rbx
	movq	%rdx,%r9

	mulxq	0(%rsi),%r8,%rax	# ap[0] * bp[0]
	mulxq	8(%rsi),%r11,%r14	# ap[1] * bp[0]
	addq	%rax,%r11
	movq	%rdi,8(%rsp)		# save bp cursor
	mulxq	16(%rsi),%r12,%r13	# ap[2] * bp[0]
	adcq	%r14,%r12
	adcq	$0,%r13

	movq	%r8,%rdi		# rdi = tp[0] (to be annihilated)
	imulq	24(%rsp),%r8		# m = lo64(tp[0] * n0)
	xorq	%rbp,%rbp		# rbp = 0; clears CF and OF

	mulxq	24(%rsi),%rax,%r14	# ap[3] * bp[0]
	movq	%r8,%rdx		# switch multiplicand to m
	leaq	32(%rsi),%rsi
	adcxq	%rax,%r13
	adcxq	%rbp,%r14		# top word, no carry beyond

	mulxq	0(%rcx),%rax,%r10	# np[0] * m
	adcxq	%rax,%rdi		# cancels tp[0]; CF chain
	adoxq	%r11,%r10		# OF chain folds ap products
	mulxq	8(%rcx),%rax,%r11	# np[1] * m
	adcxq	%rax,%r10
	adoxq	%r12,%r11
	.byte	0xc4,0x62,0xfb,0xf6,0xa1,0x10,0x00,0x00,0x00	# mulxq 16(%rcx),%rax,%r12
	movq	48(%rsp),%rdi		# rdi = inner-loop trip count
	movq	%r10,-32(%rbx)		# tp[0]
	adcxq	%rax,%r11
	adoxq	%r13,%r12
	mulxq	24(%rcx),%rax,%r15	# np[3] * m
	movq	%r9,%rdx		# back to bp[0]
	movq	%r11,-24(%rbx)		# tp[1]
	adcxq	%rax,%r12
	adoxq	%rbp,%r15		# of=0
	leaq	32(%rcx),%rcx
	movq	%r12,-16(%rbx)		# tp[2]

	jmp	.Lmulx4x_1st

.align	32
.Lmulx4x_1st:				# remaining 4-word passes of round 0
	adcxq	%rbp,%r15		# absorb CF into r15 (cf=0 after)
	mulxq	0(%rsi),%r10,%rax	# ap[j..j+3] * bp[0]
	adcxq	%r14,%r10
	mulxq	8(%rsi),%r11,%r14
	adcxq	%rax,%r11
	mulxq	16(%rsi),%r12,%rax
	adcxq	%r14,%r12
	mulxq	24(%rsi),%r13,%r14
	.byte	0x67,0x67		# two addr-size prefixes: alignment padding
	movq	%r8,%rdx		# switch to m
	adcxq	%rax,%r13
	adcxq	%rbp,%r14		# cf=0
	leaq	32(%rsi),%rsi
	leaq	32(%rbx),%rbx

	adoxq	%r15,%r10		# fold previous top word via OF chain
	mulxq	0(%rcx),%rax,%r15	# np[j..j+3] * m
	adcxq	%rax,%r10
	adoxq	%r15,%r11
	mulxq	8(%rcx),%rax,%r15
	adcxq	%rax,%r11
	adoxq	%r15,%r12
	mulxq	16(%rcx),%rax,%r15
	movq	%r10,-40(%rbx)		# tp[j]
	adcxq	%rax,%r12
	movq	%r11,-32(%rbx)		# tp[j+1]
	adoxq	%r15,%r13
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx		# back to bp[0]
	movq	%r12,-24(%rbx)		# tp[j+2]
	adcxq	%rax,%r13
	adoxq	%rbp,%r15
	leaq	32(%rcx),%rcx
	movq	%r13,-16(%rbx)		# tp[j+3]

	decq	%rdi			# trip count
	jnz	.Lmulx4x_1st

	movq	0(%rsp),%rax		# rax = num*8
	movq	8(%rsp),%rdi		# rdi = bp cursor
	adcq	%rbp,%r15		# fold final CF
	addq	%r15,%r14
	sbbq	%r15,%r15		# r15 = -(carry out of tp[num])
	movq	%r14,-8(%rbx)		# tp[num-1]
	jmp	.Lmulx4x_outer

.align	32
.Lmulx4x_outer:				# rounds i = 1..num-1: tp += ap*bp[i] + np*m
	movq	(%rdi),%rdx		# rdx = bp[i]
	leaq	8(%rdi),%rdi		# advance cursor
	subq	%rax,%rsi		# rewind ap (rax = num*8)
	movq	%r15,(%rbx)		# tp[num] = carry word
	leaq	64+32(%rsp),%rbx
	subq	%rax,%rcx		# rewind np

	mulxq	0(%rsi),%r8,%r11	# ap[0..2] * bp[i]
	xorl	%ebp,%ebp		# rbp = 0; cf=0, of=0
	movq	%rdx,%r9
	mulxq	8(%rsi),%r14,%r12
	adoxq	-32(%rbx),%r8		# + tp[0..2] via OF chain
	adcxq	%r14,%r11
	mulxq	16(%rsi),%r15,%r13
	adoxq	-24(%rbx),%r11
	adcxq	%r15,%r12
	adoxq	-16(%rbx),%r12
	adcxq	%rbp,%r13
	adoxq	%rbp,%r13

	movq	%rdi,8(%rsp)		# save bp cursor
	movq	%r8,%r15		# r15 = tp[0] (to be annihilated)
	imulq	24(%rsp),%r8		# m = lo64(tp[0] * n0)
	xorl	%ebp,%ebp		# cf=0, of=0

	mulxq	24(%rsi),%rax,%r14	# ap[3] * bp[i]
	movq	%r8,%rdx		# switch to m
	adcxq	%rax,%r13
	adoxq	-8(%rbx),%r13		# + tp[3]
	adcxq	%rbp,%r14
	leaq	32(%rsi),%rsi
	adoxq	%rbp,%r14

	mulxq	0(%rcx),%rax,%r10	# np[0..3] * m
	adcxq	%rax,%r15		# cancels tp[0]
	adoxq	%r11,%r10
	mulxq	8(%rcx),%rax,%r11
	adcxq	%rax,%r10
	adoxq	%r12,%r11
	mulxq	16(%rcx),%rax,%r12
	movq	%r10,-32(%rbx)		# tp[0]
	adcxq	%rax,%r11
	adoxq	%r13,%r12
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx		# back to bp[i]
	movq	%r11,-24(%rbx)		# tp[1]
	leaq	32(%rcx),%rcx
	adcxq	%rax,%r12
	adoxq	%rbp,%r15		# of=0
	movq	48(%rsp),%rdi		# trip count
	movq	%r12,-16(%rbx)		# tp[2]

	jmp	.Lmulx4x_inner

.align	32
.Lmulx4x_inner:				# remaining 4-word passes of round i
	mulxq	0(%rsi),%r10,%rax	# ap[j..j+3] * bp[i]
	adcxq	%rbp,%r15		# absorb CF into r15 (cf=0)
	adoxq	%r14,%r10		# fold previous top word
	mulxq	8(%rsi),%r11,%r14
	adcxq	0(%rbx),%r10		# + tp[j..j+3] via CF chain
	adoxq	%rax,%r11
	mulxq	16(%rsi),%r12,%rax
	adcxq	8(%rbx),%r11
	adoxq	%r14,%r12
	mulxq	24(%rsi),%r13,%r14
	movq	%r8,%rdx		# switch to m
	adcxq	16(%rbx),%r12
	adoxq	%rax,%r13
	adcxq	24(%rbx),%r13
	adoxq	%rbp,%r14		# of=0
	leaq	32(%rsi),%rsi
	leaq	32(%rbx),%rbx
	adcxq	%rbp,%r14		# cf=0

	adoxq	%r15,%r10
	mulxq	0(%rcx),%rax,%r15	# np[j..j+3] * m
	adcxq	%rax,%r10
	adoxq	%r15,%r11
	mulxq	8(%rcx),%rax,%r15
	adcxq	%rax,%r11
	adoxq	%r15,%r12
	mulxq	16(%rcx),%rax,%r15
	movq	%r10,-40(%rbx)		# tp[j]
	adcxq	%rax,%r12
	adoxq	%r15,%r13
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx		# back to bp[i]
	movq	%r11,-32(%rbx)		# tp[j+1]
	movq	%r12,-24(%rbx)		# tp[j+2]
	adcxq	%rax,%r13
	adoxq	%rbp,%r15
	leaq	32(%rcx),%rcx
	movq	%r13,-16(%rbx)		# tp[j+3]

	decq	%rdi			# trip count
	jnz	.Lmulx4x_inner

	movq	0(%rsp),%rax		# rax = num*8
	movq	8(%rsp),%rdi		# rdi = bp cursor
	adcq	%rbp,%r15
	subq	0(%rbx),%rbp		# sets CF if previous carry word nonzero
	adcq	%r15,%r14
	sbbq	%r15,%r15		# r15 = -(carry out)
	movq	%r14,-8(%rbx)		# tp[num-1]

	cmpq	16(%rsp),%rdi		# reached &bp[num]?
	jne	.Lmulx4x_outer

	# Final reduction: rp[] = tp - np, then constant-time select.
	leaq	64(%rsp),%rbx		# rbx = tp
	subq	%rax,%rcx		# rewind np
	negq	%r15			# top carry into CF-compatible form
	movq	%rax,%rdx		# rdx = num*8 (byte count)
	shrq	$3+2,%rax		# rax = num/4 loop trips
	movq	32(%rsp),%rdi		# rdi = rp
	jmp	.Lmulx4x_sub

.align	32
.Lmulx4x_sub:				# rp[] = tp - np, 4 words per pass
	movq	0(%rbx),%r11
	movq	8(%rbx),%r12
	movq	16(%rbx),%r13
	movq	24(%rbx),%r14
	leaq	32(%rbx),%rbx
	sbbq	0(%rcx),%r11
	sbbq	8(%rcx),%r12
	sbbq	16(%rcx),%r13
	sbbq	24(%rcx),%r14
	leaq	32(%rcx),%rcx
	movq	%r11,0(%rdi)
	movq	%r12,8(%rdi)
	movq	%r13,16(%rdi)
	movq	%r14,24(%rdi)
	leaq	32(%rdi),%rdi
	decq	%rax
	jnz	.Lmulx4x_sub

	sbbq	$0,%r15			# r15 = 0 (no borrow) or -1 (borrow)
	leaq	64(%rsp),%rbx		# rewind tp
	subq	%rdx,%rdi		# rewind rp

	.byte	102,73,15,110,207	# movq %r15,%xmm1 (select mask)
	pxor	%xmm0,%xmm0		# zero for wiping tp[]
	pshufd	$0,%xmm1,%xmm1		# broadcast mask
	movq	40(%rsp),%rsi		# caller's rsp
.cfi_def_cfa	%rsi,8
	jmp	.Lmulx4x_cond_copy

.align	32
.Lmulx4x_cond_copy:			# constant-time select + scrub tp[]
	movdqa	0(%rbx),%xmm2		# tp words
	movdqa	16(%rbx),%xmm3
	leaq	32(%rbx),%rbx
	movdqu	0(%rdi),%xmm4		# difference words
	movdqu	16(%rdi),%xmm5
	leaq	32(%rdi),%rdi
	movdqa	%xmm0,-32(%rbx)		# wipe scratch
	movdqa	%xmm0,-16(%rbx)
	pcmpeqd	%xmm1,%xmm0		# xmm0 = ~mask (all-ones iff mask == 0)
	pand	%xmm1,%xmm2		# keep tp when borrow occurred
	pand	%xmm1,%xmm3
	pand	%xmm0,%xmm4		# keep difference otherwise
	pand	%xmm0,%xmm5
	pxor	%xmm0,%xmm0
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqu	%xmm4,-32(%rdi)
	movdqu	%xmm5,-16(%rdi)
	subq	$32,%rdx		# bytes remaining
	jnz	.Lmulx4x_cond_copy

	movq	%rdx,(%rbx)		# rdx == 0 here: scrub last scratch word

	movq	$1,%rax			# return value 1
	movq	-48(%rsi),%r15		# restore callee-saved registers
.cfi_restore	%r15
	movq	-40(%rsi),%r14
.cfi_restore	%r14
	movq	-32(%rsi),%r13
.cfi_restore	%r13
	movq	-24(%rsi),%r12
.cfi_restore	%r12
	movq	-16(%rsi),%rbp
.cfi_restore	%rbp
	movq	-8(%rsi),%rbx
.cfi_restore	%rbx
	leaq	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lmulx4x_epilogue:
	.byte	0xf3,0xc3		# rep ret
.cfi_endproc
.size	bn_mulx4x_mont,.-bn_mulx4x_mont
.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	16
/* Cache object: 2e8e0958f257f5d9d5c3a4d54b4d7567 */