FreeBSD/Linux Kernel Cross Reference
sys/amd64/amd64/trap.c
1 /*-
2 * Copyright (C) 1994, David Greenman
3 * Copyright (c) 1990, 1993
4 * The Regents of the University of California. All rights reserved.
5 *
6 * This code is derived from software contributed to Berkeley by
7 * the University of Utah, and William Jolitz.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in the
16 * documentation and/or other materials provided with the distribution.
17 * 3. All advertising materials mentioning features or use of this software
18 * must display the following acknowledgement:
19 * This product includes software developed by the University of
20 * California, Berkeley and its contributors.
21 * 4. Neither the name of the University nor the names of its contributors
22 * may be used to endorse or promote products derived from this software
23 * without specific prior written permission.
24 *
25 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
26 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
28 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
29 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
30 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
31 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
32 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
33 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
34 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35 * SUCH DAMAGE.
36 *
37 * from: @(#)trap.c 7.4 (Berkeley) 5/13/91
38 */
39
40 #include <sys/cdefs.h>
41 __FBSDID("$FreeBSD: releng/11.1/sys/amd64/amd64/trap.c 333371 2018-05-08 17:12:10Z gordon $");
42
43 /*
44 * AMD64 Trap and System call handling
45 */
46
47 #include "opt_clock.h"
48 #include "opt_compat.h"
49 #include "opt_cpu.h"
50 #include "opt_hwpmc_hooks.h"
51 #include "opt_isa.h"
52 #include "opt_kdb.h"
53 #include "opt_stack.h"
54
55 #include <sys/param.h>
56 #include <sys/bus.h>
57 #include <sys/systm.h>
58 #include <sys/proc.h>
59 #include <sys/pioctl.h>
60 #include <sys/ptrace.h>
61 #include <sys/kdb.h>
62 #include <sys/kernel.h>
63 #include <sys/ktr.h>
64 #include <sys/lock.h>
65 #include <sys/mutex.h>
66 #include <sys/resourcevar.h>
67 #include <sys/signalvar.h>
68 #include <sys/syscall.h>
69 #include <sys/sysctl.h>
70 #include <sys/sysent.h>
71 #include <sys/uio.h>
72 #include <sys/vmmeter.h>
73 #ifdef HWPMC_HOOKS
74 #include <sys/pmckern.h>
75 PMC_SOFT_DEFINE( , , page_fault, all);
76 PMC_SOFT_DEFINE( , , page_fault, read);
77 PMC_SOFT_DEFINE( , , page_fault, write);
78 #endif
79
80 #include <vm/vm.h>
81 #include <vm/vm_param.h>
82 #include <vm/pmap.h>
83 #include <vm/vm_kern.h>
84 #include <vm/vm_map.h>
85 #include <vm/vm_page.h>
86 #include <vm/vm_extern.h>
87
88 #include <machine/cpu.h>
89 #include <machine/intr_machdep.h>
90 #include <x86/mca.h>
91 #include <machine/md_var.h>
92 #include <machine/pcb.h>
93 #ifdef SMP
94 #include <machine/smp.h>
95 #endif
96 #include <machine/stack.h>
97 #include <machine/tss.h>
98
99 #ifdef KDTRACE_HOOKS
100 #include <sys/dtrace_bsd.h>
101 #endif
102
/* Assembler IDT entry points whose addresses the trap code compares against. */
extern inthand_t IDTVEC(bpt), IDTVEC(bpt_pti), IDTVEC(dbg),
	IDTVEC(fast_syscall), IDTVEC(fast_syscall_pti), IDTVEC(fast_syscall32),
	IDTVEC(int0x80_syscall_pti), IDTVEC(int0x80_syscall);


/* C-level entry points reached from the assembly trampolines. */
extern void __noinline trap(struct trapframe *frame);
extern void trap_check(struct trapframe *frame);
extern void syscall(struct trapframe *frame);
void dblfault_handler(struct trapframe *frame);

static int trap_pfault(struct trapframe *, int);
static void trap_fatal(struct trapframe *, vm_offset_t);

/*
 * Human-readable trap names, indexed by T_* trap number
 * (0..MAX_TRAP_MSG).  Empty strings mark unused trap numbers.
 */
#define MAX_TRAP_MSG		32
static char *trap_msg[] = {
	"",					/*  0 unused */
	"privileged instruction fault",		/*  1 T_PRIVINFLT */
	"",					/*  2 unused */
	"breakpoint instruction fault",		/*  3 T_BPTFLT */
	"",					/*  4 unused */
	"",					/*  5 unused */
	"arithmetic trap",			/*  6 T_ARITHTRAP */
	"",					/*  7 unused */
	"",					/*  8 unused */
	"general protection fault",		/*  9 T_PROTFLT */
	"trace trap",				/* 10 T_TRCTRAP */
	"",					/* 11 unused */
	"page fault",				/* 12 T_PAGEFLT */
	"",					/* 13 unused */
	"alignment fault",			/* 14 T_ALIGNFLT */
	"",					/* 15 unused */
	"",					/* 16 unused */
	"",					/* 17 unused */
	"integer divide fault",			/* 18 T_DIVIDE */
	"non-maskable interrupt trap",		/* 19 T_NMI */
	"overflow trap",			/* 20 T_OFLOW */
	"FPU bounds check fault",		/* 21 T_BOUND */
	"FPU device not available",		/* 22 T_DNA */
	"double fault",				/* 23 T_DOUBLEFLT */
	"FPU operand fetch fault",		/* 24 T_FPOPFLT */
	"invalid TSS fault",			/* 25 T_TSSFLT */
	"segment not present fault",		/* 26 T_SEGNPFLT */
	"stack fault",				/* 27 T_STKFLT */
	"machine check trap",			/* 28 T_MCHK */
	"SIMD floating-point exception",	/* 29 T_XMMFLT */
	"reserved (unknown) fault",		/* 30 T_RESERVED */
	"",					/* 31 unused (reserved) */
	"DTrace pid return trap",		/* 32 T_DTRACE_RET */
};

/* Tunable: how protection faults are translated into signals (see T_PAGEFLT). */
static int prot_fault_translation;
SYSCTL_INT(_machdep, OID_AUTO, prot_fault_translation, CTLFLAG_RWTUN,
	&prot_fault_translation, 0,
	"Select signal to deliver on protection fault");
/* Tunable: dump frame/instruction bytes to the ctty when delivering a trap signal. */
static int uprintf_signal;
SYSCTL_INT(_machdep, OID_AUTO, uprintf_signal, CTLFLAG_RWTUN,
	&uprintf_signal, 0,
	"Print debugging information on trap signal to ctty");
161
162 /*
163 * Exception, fault, and trap interface to the FreeBSD kernel.
164 * This common code is called from assembly language IDT gate entry
165 * routines that prepare a suitable stack frame, and restore this
166 * frame after the exception has been processed.
167 */
168
169 void
170 trap(struct trapframe *frame)
171 {
172 #ifdef KDTRACE_HOOKS
173 struct reg regs;
174 #endif
175 struct thread *td = curthread;
176 struct proc *p = td->td_proc;
177 #ifdef KDB
178 register_t dr6;
179 #endif
180 int i = 0, ucode = 0;
181 u_int type;
182 register_t addr = 0;
183 ksiginfo_t ksi;
184
185 PCPU_INC(cnt.v_trap);
186 type = frame->tf_trapno;
187
188 #ifdef SMP
189 /* Handler for NMI IPIs used for stopping CPUs. */
190 if (type == T_NMI) {
191 if (ipi_nmi_handler() == 0)
192 goto out;
193 }
194 #endif /* SMP */
195
196 #ifdef KDB
197 if (kdb_active) {
198 kdb_reenter();
199 goto out;
200 }
201 #endif
202
203 if (type == T_RESERVED) {
204 trap_fatal(frame, 0);
205 goto out;
206 }
207
208 if (type == T_NMI) {
209 #ifdef HWPMC_HOOKS
210 /*
211 * CPU PMCs interrupt using an NMI. If the PMC module is
212 * active, pass the 'rip' value to the PMC module's interrupt
213 * handler. A non-zero return value from the handler means that
214 * the NMI was consumed by it and we can return immediately.
215 */
216 if (pmc_intr != NULL &&
217 (*pmc_intr)(PCPU_GET(cpuid), frame) != 0)
218 goto out;
219 #endif
220
221 #ifdef STACK
222 if (stack_nmi_handler(frame) != 0)
223 goto out;
224 #endif
225 }
226
227 if ((frame->tf_rflags & PSL_I) == 0) {
228 /*
229 * Buggy application or kernel code has disabled
230 * interrupts and then trapped. Enabling interrupts
231 * now is wrong, but it is better than running with
232 * interrupts disabled until they are accidentally
233 * enabled later.
234 */
235 if (TRAPF_USERMODE(frame))
236 uprintf(
237 "pid %ld (%s): trap %d with interrupts disabled\n",
238 (long)curproc->p_pid, curthread->td_name, type);
239 else if (type != T_NMI && type != T_BPTFLT &&
240 type != T_TRCTRAP) {
241 /*
242 * XXX not quite right, since this may be for a
243 * multiple fault in user mode.
244 */
245 printf("kernel trap %d with interrupts disabled\n",
246 type);
247
248 /*
249 * We shouldn't enable interrupts while holding a
250 * spin lock.
251 */
252 if (td->td_md.md_spinlock_count == 0)
253 enable_intr();
254 }
255 }
256
257 if (TRAPF_USERMODE(frame)) {
258 /* user trap */
259
260 td->td_pticks = 0;
261 td->td_frame = frame;
262 addr = frame->tf_rip;
263 if (td->td_cowgen != p->p_cowgen)
264 thread_cow_update(td);
265
266 switch (type) {
267 case T_PRIVINFLT: /* privileged instruction fault */
268 i = SIGILL;
269 ucode = ILL_PRVOPC;
270 break;
271
272 case T_BPTFLT: /* bpt instruction fault */
273 case T_TRCTRAP: /* trace trap */
274 enable_intr();
275 #ifdef KDTRACE_HOOKS
276 if (type == T_BPTFLT) {
277 fill_frame_regs(frame, ®s);
278 if (dtrace_pid_probe_ptr != NULL &&
279 dtrace_pid_probe_ptr(®s) == 0)
280 goto out;
281 }
282 #endif
283 frame->tf_rflags &= ~PSL_T;
284 i = SIGTRAP;
285 ucode = (type == T_TRCTRAP ? TRAP_TRACE : TRAP_BRKPT);
286 break;
287
288 case T_ARITHTRAP: /* arithmetic trap */
289 ucode = fputrap_x87();
290 if (ucode == -1)
291 goto userout;
292 i = SIGFPE;
293 break;
294
295 case T_PROTFLT: /* general protection fault */
296 i = SIGBUS;
297 ucode = BUS_OBJERR;
298 break;
299 case T_STKFLT: /* stack fault */
300 case T_SEGNPFLT: /* segment not present fault */
301 i = SIGBUS;
302 ucode = BUS_ADRERR;
303 break;
304 case T_TSSFLT: /* invalid TSS fault */
305 i = SIGBUS;
306 ucode = BUS_OBJERR;
307 break;
308 case T_ALIGNFLT:
309 i = SIGBUS;
310 ucode = BUS_ADRALN;
311 break;
312 case T_DOUBLEFLT: /* double fault */
313 default:
314 i = SIGBUS;
315 ucode = BUS_OBJERR;
316 break;
317
318 case T_PAGEFLT: /* page fault */
319 /*
320 * Emulator can take care about this trap?
321 */
322 if (*p->p_sysent->sv_trap != NULL &&
323 (*p->p_sysent->sv_trap)(td) == 0)
324 goto userout;
325
326 addr = frame->tf_addr;
327 i = trap_pfault(frame, TRUE);
328 if (i == -1)
329 goto userout;
330 if (i == 0)
331 goto user;
332
333 if (i == SIGSEGV)
334 ucode = SEGV_MAPERR;
335 else {
336 if (prot_fault_translation == 0) {
337 /*
338 * Autodetect.
339 * This check also covers the images
340 * without the ABI-tag ELF note.
341 */
342 if (SV_CURPROC_ABI() == SV_ABI_FREEBSD
343 && p->p_osrel >= P_OSREL_SIGSEGV) {
344 i = SIGSEGV;
345 ucode = SEGV_ACCERR;
346 } else {
347 i = SIGBUS;
348 ucode = BUS_PAGE_FAULT;
349 }
350 } else if (prot_fault_translation == 1) {
351 /*
352 * Always compat mode.
353 */
354 i = SIGBUS;
355 ucode = BUS_PAGE_FAULT;
356 } else {
357 /*
358 * Always SIGSEGV mode.
359 */
360 i = SIGSEGV;
361 ucode = SEGV_ACCERR;
362 }
363 }
364 break;
365
366 case T_DIVIDE: /* integer divide fault */
367 ucode = FPE_INTDIV;
368 i = SIGFPE;
369 break;
370
371 #ifdef DEV_ISA
372 case T_NMI:
373 nmi_handle_intr(type, frame);
374 break;
375 #endif /* DEV_ISA */
376
377 case T_OFLOW: /* integer overflow fault */
378 ucode = FPE_INTOVF;
379 i = SIGFPE;
380 break;
381
382 case T_BOUND: /* bounds check fault */
383 ucode = FPE_FLTSUB;
384 i = SIGFPE;
385 break;
386
387 case T_DNA:
388 /* transparent fault (due to context switch "late") */
389 KASSERT(PCB_USER_FPU(td->td_pcb),
390 ("kernel FPU ctx has leaked"));
391 fpudna();
392 goto userout;
393
394 case T_FPOPFLT: /* FPU operand fetch fault */
395 ucode = ILL_COPROC;
396 i = SIGILL;
397 break;
398
399 case T_XMMFLT: /* SIMD floating-point exception */
400 ucode = fputrap_sse();
401 if (ucode == -1)
402 goto userout;
403 i = SIGFPE;
404 break;
405 #ifdef KDTRACE_HOOKS
406 case T_DTRACE_RET:
407 enable_intr();
408 fill_frame_regs(frame, ®s);
409 if (dtrace_return_probe_ptr != NULL &&
410 dtrace_return_probe_ptr(®s) == 0)
411 goto out;
412 break;
413 #endif
414 }
415 } else {
416 /* kernel trap */
417
418 KASSERT(cold || td->td_ucred != NULL,
419 ("kernel trap doesn't have ucred"));
420 switch (type) {
421 case T_PAGEFLT: /* page fault */
422 (void) trap_pfault(frame, FALSE);
423 goto out;
424
425 case T_DNA:
426 if (PCB_USER_FPU(td->td_pcb))
427 panic("Unregistered use of FPU in kernel");
428 fpudna();
429 goto out;
430
431 case T_ARITHTRAP: /* arithmetic trap */
432 case T_XMMFLT: /* SIMD floating-point exception */
433 case T_FPOPFLT: /* FPU operand fetch fault */
434 /*
435 * For now, supporting kernel handler
436 * registration for FPU traps is overkill.
437 */
438 trap_fatal(frame, 0);
439 goto out;
440
441 case T_STKFLT: /* stack fault */
442 case T_PROTFLT: /* general protection fault */
443 case T_SEGNPFLT: /* segment not present fault */
444 if (td->td_intr_nesting_level != 0)
445 break;
446
447 /*
448 * Invalid segment selectors and out of bounds
449 * %rip's and %rsp's can be set up in user mode.
450 * This causes a fault in kernel mode when the
451 * kernel tries to return to user mode. We want
452 * to get this fault so that we can fix the
453 * problem here and not have to check all the
454 * selectors and pointers when the user changes
455 * them.
456 *
457 * In case of PTI, the IRETQ faulted while the
458 * kernel used the pti stack, and exception
459 * frame records %rsp value pointing to that
460 * stack. If we return normally to
461 * doreti_iret_fault, the trapframe is
462 * reconstructed on pti stack, and calltrap()
463 * called on it as well. Due to the very
464 * limited pti stack size, kernel does not
465 * survive for too long. Switch to the normal
466 * thread stack for the trap handling.
467 *
468 * Magic '5' is the number of qwords occupied by
469 * the hardware trap frame.
470 */
471 if (frame->tf_rip == (long)doreti_iret) {
472 frame->tf_rip = (long)doreti_iret_fault;
473 if (pti && frame->tf_rsp == (uintptr_t)PCPU_PTR(
474 pti_stack) + (PC_PTI_STACK_SZ - 5) *
475 sizeof(register_t))
476 frame->tf_rsp = PCPU_GET(rsp0) - 5 *
477 sizeof(register_t);
478 goto out;
479 }
480 if (frame->tf_rip == (long)ld_ds) {
481 frame->tf_rip = (long)ds_load_fault;
482 goto out;
483 }
484 if (frame->tf_rip == (long)ld_es) {
485 frame->tf_rip = (long)es_load_fault;
486 goto out;
487 }
488 if (frame->tf_rip == (long)ld_fs) {
489 frame->tf_rip = (long)fs_load_fault;
490 goto out;
491 }
492 if (frame->tf_rip == (long)ld_gs) {
493 frame->tf_rip = (long)gs_load_fault;
494 goto out;
495 }
496 if (frame->tf_rip == (long)ld_gsbase) {
497 frame->tf_rip = (long)gsbase_load_fault;
498 goto out;
499 }
500 if (frame->tf_rip == (long)ld_fsbase) {
501 frame->tf_rip = (long)fsbase_load_fault;
502 goto out;
503 }
504 if (curpcb->pcb_onfault != NULL) {
505 frame->tf_rip = (long)curpcb->pcb_onfault;
506 goto out;
507 }
508 break;
509
510 case T_TSSFLT:
511 /*
512 * PSL_NT can be set in user mode and isn't cleared
513 * automatically when the kernel is entered. This
514 * causes a TSS fault when the kernel attempts to
515 * `iret' because the TSS link is uninitialized. We
516 * want to get this fault so that we can fix the
517 * problem here and not every time the kernel is
518 * entered.
519 */
520 if (frame->tf_rflags & PSL_NT) {
521 frame->tf_rflags &= ~PSL_NT;
522 goto out;
523 }
524 break;
525
526 case T_TRCTRAP: /* trace trap */
527 /*
528 * Ignore debug register trace traps due to
529 * accesses in the user's address space, which
530 * can happen under several conditions such as
531 * if a user sets a watchpoint on a buffer and
532 * then passes that buffer to a system call.
533 * We still want to get TRCTRAPS for addresses
534 * in kernel space because that is useful when
535 * debugging the kernel.
536 */
537 if (user_dbreg_trap()) {
538 /*
539 * Reset breakpoint bits because the
540 * processor doesn't
541 */
542 load_dr6(rdr6() & ~0xf);
543 goto out;
544 }
545
546 /*
547 * Malicious user code can configure a debug
548 * register watchpoint to trap on data access
549 * to the top of stack and then execute 'pop
550 * %ss; int 3'. Due to exception deferral for
551 * 'pop %ss', the CPU will not interrupt 'int
552 * 3' to raise the DB# exception for the debug
553 * register but will postpone the DB# until
554 * execution of the first instruction of the
555 * BP# handler (in kernel mode). Normally the
556 * previous check would ignore DB# exceptions
557 * for watchpoints on user addresses raised in
558 * kernel mode. However, some CPU errata
559 * include cases where DB# exceptions do not
560 * properly set bits in %dr6, e.g. Haswell
561 * HSD23 and Skylake-X SKZ24.
562 *
563 * A deferred DB# can also be raised on the
564 * first instructions of system call entry
565 * points or single-step traps via similar use
566 * of 'pop %ss' or 'mov xxx, %ss'.
567 */
568 if (pti) {
569 if (frame->tf_rip ==
570 (uintptr_t)IDTVEC(fast_syscall_pti) ||
571 #ifdef COMPAT_FREEBSD32
572 frame->tf_rip ==
573 (uintptr_t)IDTVEC(int0x80_syscall_pti) ||
574 #endif
575 frame->tf_rip == (uintptr_t)IDTVEC(bpt_pti))
576 return;
577 } else {
578 if (frame->tf_rip ==
579 (uintptr_t)IDTVEC(fast_syscall) ||
580 #ifdef COMPAT_FREEBSD32
581 frame->tf_rip ==
582 (uintptr_t)IDTVEC(int0x80_syscall) ||
583 #endif
584 frame->tf_rip == (uintptr_t)IDTVEC(bpt))
585 return;
586 }
587 if (frame->tf_rip == (uintptr_t)IDTVEC(dbg) ||
588 /* Needed for AMD. */
589 frame->tf_rip == (uintptr_t)IDTVEC(fast_syscall32))
590 return;
591 /*
592 * FALLTHROUGH (TRCTRAP kernel mode, kernel address)
593 */
594 case T_BPTFLT:
595 /*
596 * If KDB is enabled, let it handle the debugger trap.
597 * Otherwise, debugger traps "can't happen".
598 */
599 #ifdef KDB
600 /* XXX %dr6 is not quite reentrant. */
601 dr6 = rdr6();
602 load_dr6(dr6 & ~0x4000);
603 if (kdb_trap(type, dr6, frame))
604 goto out;
605 #endif
606 break;
607
608 #ifdef DEV_ISA
609 case T_NMI:
610 nmi_handle_intr(type, frame);
611 goto out;
612 #endif /* DEV_ISA */
613 }
614
615 trap_fatal(frame, 0);
616 goto out;
617 }
618
619 /* Translate fault for emulators (e.g. Linux) */
620 if (*p->p_sysent->sv_transtrap)
621 i = (*p->p_sysent->sv_transtrap)(i, type);
622
623 ksiginfo_init_trap(&ksi);
624 ksi.ksi_signo = i;
625 ksi.ksi_code = ucode;
626 ksi.ksi_trapno = type;
627 ksi.ksi_addr = (void *)addr;
628 if (uprintf_signal) {
629 uprintf("pid %d comm %s: signal %d err %lx code %d type %d "
630 "addr 0x%lx rsp 0x%lx rip 0x%lx "
631 "<%02x %02x %02x %02x %02x %02x %02x %02x>\n",
632 p->p_pid, p->p_comm, i, frame->tf_err, ucode, type, addr,
633 frame->tf_rsp, frame->tf_rip,
634 fubyte((void *)(frame->tf_rip + 0)),
635 fubyte((void *)(frame->tf_rip + 1)),
636 fubyte((void *)(frame->tf_rip + 2)),
637 fubyte((void *)(frame->tf_rip + 3)),
638 fubyte((void *)(frame->tf_rip + 4)),
639 fubyte((void *)(frame->tf_rip + 5)),
640 fubyte((void *)(frame->tf_rip + 6)),
641 fubyte((void *)(frame->tf_rip + 7)));
642 }
643 KASSERT((read_rflags() & PSL_I) != 0, ("interrupts disabled"));
644 trapsignal(td, &ksi);
645
646 user:
647 userret(td, frame);
648 KASSERT(PCB_USER_FPU(td->td_pcb),
649 ("Return from trap with kernel FPU ctx leaked"));
650 userout:
651 out:
652 return;
653 }
654
/*
 * Wrapper around trap() that first offers the fault to DTrace's own
 * fault handler, so faults induced by DTrace instrumentation are
 * swallowed instead of being processed as real traps.  This function
 * must itself never be instrumented.
 */
void
trap_check(struct trapframe *frame)
{

#ifdef KDTRACE_HOOKS
	if (dtrace_trap_func == NULL ||
	    (*dtrace_trap_func)(frame, frame->tf_trapno) == 0)
		trap(frame);
#else
	trap(frame);
#endif
}
670
671 static int
672 trap_pfault(frame, usermode)
673 struct trapframe *frame;
674 int usermode;
675 {
676 vm_offset_t va;
677 vm_map_t map;
678 int rv = 0;
679 vm_prot_t ftype;
680 struct thread *td = curthread;
681 struct proc *p = td->td_proc;
682 vm_offset_t eva = frame->tf_addr;
683
684 if (__predict_false((td->td_pflags & TDP_NOFAULTING) != 0)) {
685 /*
686 * Due to both processor errata and lazy TLB invalidation when
687 * access restrictions are removed from virtual pages, memory
688 * accesses that are allowed by the physical mapping layer may
689 * nonetheless cause one spurious page fault per virtual page.
690 * When the thread is executing a "no faulting" section that
691 * is bracketed by vm_fault_{disable,enable}_pagefaults(),
692 * every page fault is treated as a spurious page fault,
693 * unless it accesses the same virtual address as the most
694 * recent page fault within the same "no faulting" section.
695 */
696 if (td->td_md.md_spurflt_addr != eva ||
697 (td->td_pflags & TDP_RESETSPUR) != 0) {
698 /*
699 * Do nothing to the TLB. A stale TLB entry is
700 * flushed automatically by a page fault.
701 */
702 td->td_md.md_spurflt_addr = eva;
703 td->td_pflags &= ~TDP_RESETSPUR;
704 return (0);
705 }
706 } else {
707 /*
708 * If we get a page fault while in a critical section, then
709 * it is most likely a fatal kernel page fault. The kernel
710 * is already going to panic trying to get a sleep lock to
711 * do the VM lookup, so just consider it a fatal trap so the
712 * kernel can print out a useful trap message and even get
713 * to the debugger.
714 *
715 * If we get a page fault while holding a non-sleepable
716 * lock, then it is most likely a fatal kernel page fault.
717 * If WITNESS is enabled, then it's going to whine about
718 * bogus LORs with various VM locks, so just skip to the
719 * fatal trap handling directly.
720 */
721 if (td->td_critnest != 0 ||
722 WITNESS_CHECK(WARN_SLEEPOK | WARN_GIANTOK, NULL,
723 "Kernel page fault") != 0) {
724 trap_fatal(frame, eva);
725 return (-1);
726 }
727 }
728 va = trunc_page(eva);
729 if (va >= VM_MIN_KERNEL_ADDRESS) {
730 /*
731 * Don't allow user-mode faults in kernel address space.
732 */
733 if (usermode)
734 goto nogo;
735
736 map = kernel_map;
737 } else {
738 map = &p->p_vmspace->vm_map;
739
740 /*
741 * When accessing a usermode address, kernel must be
742 * ready to accept the page fault, and provide a
743 * handling routine. Since accessing the address
744 * without the handler is a bug, do not try to handle
745 * it normally, and panic immediately.
746 */
747 if (!usermode && (td->td_intr_nesting_level != 0 ||
748 curpcb->pcb_onfault == NULL)) {
749 trap_fatal(frame, eva);
750 return (-1);
751 }
752 }
753
754 /*
755 * If the trap was caused by errant bits in the PTE then panic.
756 */
757 if (frame->tf_err & PGEX_RSV) {
758 trap_fatal(frame, eva);
759 return (-1);
760 }
761
762 /*
763 * If nx protection of the usermode portion of kernel page
764 * tables caused trap, panic.
765 */
766 if (pti && usermode && pg_nx != 0 && (frame->tf_err & (PGEX_P | PGEX_W |
767 PGEX_U | PGEX_I)) == (PGEX_P | PGEX_U | PGEX_I) &&
768 (curpcb->pcb_saved_ucr3 & ~CR3_PCID_MASK)==
769 (PCPU_GET(curpmap)->pm_cr3 & ~CR3_PCID_MASK))
770 panic("PTI: pid %d comm %s tf_err %#lx\n", p->p_pid,
771 p->p_comm, frame->tf_err);
772
773 /*
774 * PGEX_I is defined only if the execute disable bit capability is
775 * supported and enabled.
776 */
777 if (frame->tf_err & PGEX_W)
778 ftype = VM_PROT_WRITE;
779 else if ((frame->tf_err & PGEX_I) && pg_nx != 0)
780 ftype = VM_PROT_EXECUTE;
781 else
782 ftype = VM_PROT_READ;
783
784 /* Fault in the page. */
785 rv = vm_fault(map, va, ftype, VM_FAULT_NORMAL);
786 if (rv == KERN_SUCCESS) {
787 #ifdef HWPMC_HOOKS
788 if (ftype == VM_PROT_READ || ftype == VM_PROT_WRITE) {
789 PMC_SOFT_CALL_TF( , , page_fault, all, frame);
790 if (ftype == VM_PROT_READ)
791 PMC_SOFT_CALL_TF( , , page_fault, read,
792 frame);
793 else
794 PMC_SOFT_CALL_TF( , , page_fault, write,
795 frame);
796 }
797 #endif
798 return (0);
799 }
800 nogo:
801 if (!usermode) {
802 if (td->td_intr_nesting_level == 0 &&
803 curpcb->pcb_onfault != NULL) {
804 frame->tf_rip = (long)curpcb->pcb_onfault;
805 return (0);
806 }
807 trap_fatal(frame, eva);
808 return (-1);
809 }
810 return ((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV);
811 }
812
813 static void
814 trap_fatal(frame, eva)
815 struct trapframe *frame;
816 vm_offset_t eva;
817 {
818 int code, ss;
819 u_int type;
820 struct soft_segment_descriptor softseg;
821 char *msg;
822
823 code = frame->tf_err;
824 type = frame->tf_trapno;
825 sdtossd(&gdt[NGDT * PCPU_GET(cpuid) + IDXSEL(frame->tf_cs & 0xffff)],
826 &softseg);
827
828 if (type <= MAX_TRAP_MSG)
829 msg = trap_msg[type];
830 else
831 msg = "UNKNOWN";
832 printf("\n\nFatal trap %d: %s while in %s mode\n", type, msg,
833 TRAPF_USERMODE(frame) ? "user" : "kernel");
834 #ifdef SMP
835 /* two separate prints in case of a trap on an unmapped page */
836 printf("cpuid = %d; ", PCPU_GET(cpuid));
837 printf("apic id = %02x\n", PCPU_GET(apic_id));
838 #endif
839 if (type == T_PAGEFLT) {
840 printf("fault virtual address = 0x%lx\n", eva);
841 printf("fault code = %s %s %s, %s\n",
842 code & PGEX_U ? "user" : "supervisor",
843 code & PGEX_W ? "write" : "read",
844 code & PGEX_I ? "instruction" : "data",
845 code & PGEX_RSV ? "reserved bits in PTE" :
846 code & PGEX_P ? "protection violation" : "page not present");
847 }
848 printf("instruction pointer = 0x%lx:0x%lx\n",
849 frame->tf_cs & 0xffff, frame->tf_rip);
850 ss = frame->tf_ss & 0xffff;
851 printf("stack pointer = 0x%x:0x%lx\n", ss, frame->tf_rsp);
852 printf("frame pointer = 0x%x:0x%lx\n", ss, frame->tf_rbp);
853 printf("code segment = base 0x%lx, limit 0x%lx, type 0x%x\n",
854 softseg.ssd_base, softseg.ssd_limit, softseg.ssd_type);
855 printf(" = DPL %d, pres %d, long %d, def32 %d, gran %d\n",
856 softseg.ssd_dpl, softseg.ssd_p, softseg.ssd_long, softseg.ssd_def32,
857 softseg.ssd_gran);
858 printf("processor eflags = ");
859 if (frame->tf_rflags & PSL_T)
860 printf("trace trap, ");
861 if (frame->tf_rflags & PSL_I)
862 printf("interrupt enabled, ");
863 if (frame->tf_rflags & PSL_NT)
864 printf("nested task, ");
865 if (frame->tf_rflags & PSL_RF)
866 printf("resume, ");
867 printf("IOPL = %ld\n", (frame->tf_rflags & PSL_IOPL) >> 12);
868 printf("current process = %d (%s)\n",
869 curproc->p_pid, curthread->td_name);
870
871 #ifdef KDB
872 if (debugger_on_panic || kdb_active)
873 if (kdb_trap(type, 0, frame))
874 return;
875 #endif
876 printf("trap number = %d\n", type);
877 if (type <= MAX_TRAP_MSG)
878 panic("%s", trap_msg[type]);
879 else
880 panic("unknown/reserved trap");
881 }
882
/*
 * Double fault handler. Called when a fault occurs while writing
 * a frame for a trap/exception onto the stack. This usually occurs
 * when the stack overflows (such is the case with infinite recursion,
 * for example).
 *
 * The machine state at this point cannot be trusted, so the handler
 * only reports what it can and then panics; it never returns.
 * NOTE(review): presumably entered on a dedicated IST stack set up by
 * the MD startup code — confirm against exception.S / machdep.c.
 */
void
dblfault_handler(struct trapframe *frame)
{
#ifdef KDTRACE_HOOKS
	/* Give DTrace a chance to back out if its probe caused the fault. */
	if (dtrace_doubletrap_func != NULL)
		(*dtrace_doubletrap_func)();
#endif
	printf("\nFatal double fault\n");
	printf("rip = 0x%lx\n", frame->tf_rip);
	printf("rsp = 0x%lx\n", frame->tf_rsp);
	printf("rbp = 0x%lx\n", frame->tf_rbp);
#ifdef SMP
	/* two separate prints in case of a trap on an unmapped page */
	printf("cpuid = %d; ", PCPU_GET(cpuid));
	printf("apic id = %02x\n", PCPU_GET(apic_id));
#endif
	panic("double fault");
}
907
908 int
909 cpu_fetch_syscall_args(struct thread *td, struct syscall_args *sa)
910 {
911 struct proc *p;
912 struct trapframe *frame;
913 register_t *argp;
914 caddr_t params;
915 int reg, regcnt, error;
916
917 p = td->td_proc;
918 frame = td->td_frame;
919 reg = 0;
920 regcnt = 6;
921
922 params = (caddr_t)frame->tf_rsp + sizeof(register_t);
923 sa->code = frame->tf_rax;
924
925 if (sa->code == SYS_syscall || sa->code == SYS___syscall) {
926 sa->code = frame->tf_rdi;
927 reg++;
928 regcnt--;
929 }
930 if (p->p_sysent->sv_mask)
931 sa->code &= p->p_sysent->sv_mask;
932
933 if (sa->code >= p->p_sysent->sv_size)
934 sa->callp = &p->p_sysent->sv_table[0];
935 else
936 sa->callp = &p->p_sysent->sv_table[sa->code];
937
938 sa->narg = sa->callp->sy_narg;
939 KASSERT(sa->narg <= sizeof(sa->args) / sizeof(sa->args[0]),
940 ("Too many syscall arguments!"));
941 error = 0;
942 argp = &frame->tf_rdi;
943 argp += reg;
944 bcopy(argp, sa->args, sizeof(sa->args[0]) * regcnt);
945 if (sa->narg > regcnt) {
946 KASSERT(params != NULL, ("copyin args with no params!"));
947 error = copyin(params, &sa->args[regcnt],
948 (sa->narg - regcnt) * sizeof(sa->args[0]));
949 }
950
951 if (error == 0) {
952 td->td_retval[0] = 0;
953 td->td_retval[1] = frame->tf_rdx;
954 }
955
956 return (error);
957 }
958
959 #include "../../kern/subr_syscall.c"
960
/*
 * System call handler for native binaries. The trap frame is already
 * set up by the assembler trampoline and a pointer to it is saved in
 * td_frame.
 *
 * 'traced' is non-zero when the syscall entry was single-stepped; in
 * that case PSL_T is cleared and a SIGTRAP is posted so the debugger
 * regains control after the call.
 */
void
amd64_syscall(struct thread *td, int traced)
{
	struct syscall_args sa;
	int error;
	ksiginfo_t ksi;

#ifdef DIAGNOSTIC
	/* System calls may only arrive from user mode. */
	if (!TRAPF_USERMODE(td->td_frame)) {
		panic("syscall");
		/* NOT REACHED */
	}
#endif
	/* Fetch arguments (cpu_fetch_syscall_args) and dispatch the call. */
	error = syscallenter(td, &sa);

	/*
	 * Traced syscall.
	 */
	if (__predict_false(traced)) {
		td->td_frame->tf_rflags &= ~PSL_T;
		ksiginfo_init_trap(&ksi);
		ksi.ksi_signo = SIGTRAP;
		ksi.ksi_code = TRAP_TRACE;
		ksi.ksi_addr = (void *)td->td_frame->tf_rip;
		trapsignal(td, &ksi);
	}

	/* Sanity-check per-thread FPU and pmap-invalidation state on exit. */
	KASSERT(PCB_USER_FPU(td->td_pcb),
	    ("System call %s returning with kernel FPU ctx leaked",
	    syscallname(td->td_proc, sa.code)));
	KASSERT(td->td_pcb->pcb_save == get_pcb_user_save_td(td),
	    ("System call %s returning with mangled pcb_save",
	    syscallname(td->td_proc, sa.code)));
	KASSERT(td->td_md.md_invl_gen.gen == 0,
	    ("System call %s returning with leaked invl_gen %lu",
	    syscallname(td->td_proc, sa.code), td->td_md.md_invl_gen.gen));

	syscallret(td, error, &sa);

	/*
	 * If the user-supplied value of %rip is not a canonical
	 * address, then some CPUs will trigger a ring 0 #GP during
	 * the sysret instruction. However, the fault handler would
	 * execute in ring 0 with the user's %gs and %rsp which would
	 * not be safe. Instead, use the full return path which
	 * catches the problem safely.
	 */
	if (td->td_frame->tf_rip >= VM_MAXUSER_ADDRESS)
		set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
}
Cache object: 151c005a564aa6e5fe34c0438aaf3fa2
|