FreeBSD/Linux Kernel Cross Reference
sys/amd64/amd64/trap.c
1 /*-
2 * Copyright (C) 1994, David Greenman
3 * Copyright (c) 1990, 1993
4 * The Regents of the University of California. All rights reserved.
5 *
6 * This code is derived from software contributed to Berkeley by
7 * the University of Utah, and William Jolitz.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in the
16 * documentation and/or other materials provided with the distribution.
17 * 3. All advertising materials mentioning features or use of this software
18 * must display the following acknowledgement:
19 * This product includes software developed by the University of
20 * California, Berkeley and its contributors.
21 * 4. Neither the name of the University nor the names of its contributors
22 * may be used to endorse or promote products derived from this software
23 * without specific prior written permission.
24 *
25 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
26 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
28 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
29 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
30 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
31 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
32 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
33 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
34 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35 * SUCH DAMAGE.
36 *
37 * from: @(#)trap.c 7.4 (Berkeley) 5/13/91
38 */
39
40 #include <sys/cdefs.h>
41 __FBSDID("$FreeBSD: releng/11.2/sys/amd64/amd64/trap.c 333369 2018-05-08 17:03:33Z emaste $");
42
43 /*
44 * AMD64 Trap and System call handling
45 */
46
47 #include "opt_clock.h"
48 #include "opt_compat.h"
49 #include "opt_cpu.h"
50 #include "opt_hwpmc_hooks.h"
51 #include "opt_isa.h"
52 #include "opt_kdb.h"
53 #include "opt_stack.h"
54
55 #include <sys/param.h>
56 #include <sys/bus.h>
57 #include <sys/systm.h>
58 #include <sys/proc.h>
59 #include <sys/pioctl.h>
60 #include <sys/ptrace.h>
61 #include <sys/kdb.h>
62 #include <sys/kernel.h>
63 #include <sys/ktr.h>
64 #include <sys/lock.h>
65 #include <sys/mutex.h>
66 #include <sys/resourcevar.h>
67 #include <sys/signalvar.h>
68 #include <sys/syscall.h>
69 #include <sys/sysctl.h>
70 #include <sys/sysent.h>
71 #include <sys/uio.h>
72 #include <sys/vmmeter.h>
73 #ifdef HWPMC_HOOKS
74 #include <sys/pmckern.h>
75 PMC_SOFT_DEFINE( , , page_fault, all);
76 PMC_SOFT_DEFINE( , , page_fault, read);
77 PMC_SOFT_DEFINE( , , page_fault, write);
78 #endif
79
80 #include <vm/vm.h>
81 #include <vm/vm_param.h>
82 #include <vm/pmap.h>
83 #include <vm/vm_kern.h>
84 #include <vm/vm_map.h>
85 #include <vm/vm_page.h>
86 #include <vm/vm_extern.h>
87
88 #include <machine/cpu.h>
89 #include <machine/intr_machdep.h>
90 #include <x86/mca.h>
91 #include <machine/md_var.h>
92 #include <machine/pcb.h>
93 #ifdef SMP
94 #include <machine/smp.h>
95 #endif
96 #include <machine/stack.h>
97 #include <machine/tss.h>
98
99 #ifdef KDTRACE_HOOKS
100 #include <sys/dtrace_bsd.h>
101 #endif
102
/*
 * Low-level entry points.  trap() compares their addresses against the
 * trapped %rip when deciding whether a kernel-mode trace trap can be
 * ignored.
 */
103 extern inthand_t IDTVEC(bpt), IDTVEC(bpt_pti), IDTVEC(dbg),
104 IDTVEC(fast_syscall), IDTVEC(fast_syscall_pti), IDTVEC(fast_syscall32),
105 IDTVEC(int0x80_syscall_pti), IDTVEC(int0x80_syscall);
106
/* Handlers with external linkage that are defined in this file. */
107 void __noinline trap(struct trapframe *frame);
108 void trap_check(struct trapframe *frame);
109 void dblfault_handler(struct trapframe *frame);
110
/* File-local helpers: page fault resolution and fatal trap reporting. */
111 static int trap_pfault(struct trapframe *, int);
112 static void trap_fatal(struct trapframe *, vm_offset_t);
113
/*
 * Human-readable trap names, indexed by trap number.  MAX_TRAP_MSG is the
 * highest valid index: trap_fatal() looks up trap_msg[type] only when
 * type <= MAX_TRAP_MSG, so the constant must match the last entry below.
 */
114 #define MAX_TRAP_MSG 32
115 static char *trap_msg[] = {
116 "", /* 0 unused */
117 "privileged instruction fault", /* 1 T_PRIVINFLT */
118 "", /* 2 unused */
119 "breakpoint instruction fault", /* 3 T_BPTFLT */
120 "", /* 4 unused */
121 "", /* 5 unused */
122 "arithmetic trap", /* 6 T_ARITHTRAP */
123 "", /* 7 unused */
124 "", /* 8 unused */
125 "general protection fault", /* 9 T_PROTFLT */
126 "trace trap", /* 10 T_TRCTRAP */
127 "", /* 11 unused */
128 "page fault", /* 12 T_PAGEFLT */
129 "", /* 13 unused */
130 "alignment fault", /* 14 T_ALIGNFLT */
131 "", /* 15 unused */
132 "", /* 16 unused */
133 "", /* 17 unused */
134 "integer divide fault", /* 18 T_DIVIDE */
135 "non-maskable interrupt trap", /* 19 T_NMI */
136 "overflow trap", /* 20 T_OFLOW */
137 "FPU bounds check fault", /* 21 T_BOUND */
138 "FPU device not available", /* 22 T_DNA */
139 "double fault", /* 23 T_DOUBLEFLT */
140 "FPU operand fetch fault", /* 24 T_FPOPFLT */
141 "invalid TSS fault", /* 25 T_TSSFLT */
142 "segment not present fault", /* 26 T_SEGNPFLT */
143 "stack fault", /* 27 T_STKFLT */
144 "machine check trap", /* 28 T_MCHK */
145 "SIMD floating-point exception", /* 29 T_XMMFLT */
146 "reserved (unknown) fault", /* 30 T_RESERVED */
147 "", /* 31 unused (reserved) */
148 "DTrace pid return trap", /* 32 T_DTRACE_RET */
149 };
150
/*
 * machdep.prot_fault_translation: selects the signal trap() posts when a
 * user page fault fails with a result other than SIGSEGV from
 * trap_pfault().  0 autodetects from the process ABI and p_osrel, 1 always
 * posts SIGBUS/BUS_PAGE_FAULT, any other value always posts
 * SIGSEGV/SEGV_ACCERR.
 */
151 static int prot_fault_translation;
152 SYSCTL_INT(_machdep, OID_AUTO, prot_fault_translation, CTLFLAG_RWTUN,
153 &prot_fault_translation, 0,
154 "Select signal to deliver on protection fault");
/*
 * machdep.uprintf_signal: when non-zero, trap() prints a diagnostic line
 * with uprintf() before posting a trap signal.
 */
155 static int uprintf_signal;
156 SYSCTL_INT(_machdep, OID_AUTO, uprintf_signal, CTLFLAG_RWTUN,
157 &uprintf_signal, 0,
158 "Print debugging information on trap signal to ctty");
159
160 /*
161 * Exception, fault, and trap interface to the FreeBSD kernel.
162 * This common code is called from assembly language IDT gate entry
163 * routines that prepare a suitable stack frame, and restore this
164 * frame after the exception has been processed.
 *
 * frame describes the interrupted context; frame->tf_trapno selects the
 * handling.  Traps taken from user mode are translated into a signal
 * number and code, posted with trapsignal(), and followed by userret().
 * Traps taken from kernel mode are either recovered in place (page
 * faults via trap_pfault(), faults at known fixup points, or through
 * curpcb->pcb_onfault) or end in trap_fatal().
165 */
166
167 void
168 trap(struct trapframe *frame)
169 {
170 ksiginfo_t ksi;
171 struct thread *td;
172 struct proc *p;
173 register_t addr;
174 #ifdef KDB
175 register_t dr6;
176 #endif
177 int signo, ucode;
178 u_int type;
179
180 td = curthread;
181 p = td->td_proc;
182 signo = 0;
183 ucode = 0;
184 addr = 0;
185
186 PCPU_INC(cnt.v_trap);
187 type = frame->tf_trapno;
188
189 #ifdef SMP
190 /* Handler for NMI IPIs used for stopping CPUs. */
191 if (type == T_NMI && ipi_nmi_handler() == 0)
192 return;
193 #endif
194
195 #ifdef KDB
196 if (kdb_active) {
197 kdb_reenter();
198 return;
199 }
200 #endif
201
/* A reserved trap number is never recoverable. */
202 if (type == T_RESERVED) {
203 trap_fatal(frame, 0);
204 return;
205 }
206
207 if (type == T_NMI) {
208 #ifdef HWPMC_HOOKS
209 /*
210 * CPU PMCs interrupt using an NMI. If the PMC module is
211 * active, pass the 'rip' value to the PMC module's interrupt
212 * handler. A non-zero return value from the handler means that
213 * the NMI was consumed by it and we can return immediately.
214 */
215 if (pmc_intr != NULL &&
216 (*pmc_intr)(PCPU_GET(cpuid), frame) != 0)
217 return;
218 #endif
219
220 #ifdef STACK
221 if (stack_nmi_handler(frame) != 0)
222 return;
223 #endif
224 }
225
226 if ((frame->tf_rflags & PSL_I) == 0) {
227 /*
228 * Buggy application or kernel code has disabled
229 * interrupts and then trapped. Enabling interrupts
230 * now is wrong, but it is better than running with
231 * interrupts disabled until they are accidentally
232 * enabled later.
233 */
234 if (TRAPF_USERMODE(frame))
235 uprintf(
236 "pid %ld (%s): trap %d with interrupts disabled\n",
237 (long)curproc->p_pid, curthread->td_name, type);
238 else if (type != T_NMI && type != T_BPTFLT &&
239 type != T_TRCTRAP) {
240 /*
241 * XXX not quite right, since this may be for a
242 * multiple fault in user mode.
243 */
244 printf("kernel trap %d with interrupts disabled\n",
245 type);
246
247 /*
248 * We shouldn't enable interrupts while holding a
249 * spin lock.
250 */
251 if (td->td_md.md_spinlock_count == 0)
252 enable_intr();
253 }
254 }
255
256 if (TRAPF_USERMODE(frame)) {
257 /* user trap */
258
259 td->td_pticks = 0;
260 td->td_frame = frame;
/*
 * Default signal address is the trapped %rip; the T_PAGEFLT case
 * below replaces it with the faulting address from tf_addr.
 */
261 addr = frame->tf_rip;
262 if (td->td_cowgen != p->p_cowgen)
263 thread_cow_update(td);
264
265 switch (type) {
266 case T_PRIVINFLT: /* privileged instruction fault */
267 signo = SIGILL;
268 ucode = ILL_PRVOPC;
269 break;
270
271 case T_BPTFLT: /* bpt instruction fault */
272 case T_TRCTRAP: /* trace trap */
273 enable_intr();
274 #ifdef KDTRACE_HOOKS
275 if (type == T_BPTFLT) {
276 if (dtrace_pid_probe_ptr != NULL &&
277 dtrace_pid_probe_ptr(frame) == 0)
278 return;
279 }
280 #endif
281 frame->tf_rflags &= ~PSL_T;
282 signo = SIGTRAP;
283 ucode = (type == T_TRCTRAP ? TRAP_TRACE : TRAP_BRKPT);
284 break;
285
286 case T_ARITHTRAP: /* arithmetic trap */
287 ucode = fputrap_x87();
288 if (ucode == -1)
289 return;
290 signo = SIGFPE;
291 break;
292
293 case T_PROTFLT: /* general protection fault */
294 signo = SIGBUS;
295 ucode = BUS_OBJERR;
296 break;
297 case T_STKFLT: /* stack fault */
298 case T_SEGNPFLT: /* segment not present fault */
299 signo = SIGBUS;
300 ucode = BUS_ADRERR;
301 break;
302 case T_TSSFLT: /* invalid TSS fault */
303 signo = SIGBUS;
304 ucode = BUS_OBJERR;
305 break;
306 case T_ALIGNFLT:
307 signo = SIGBUS;
308 ucode = BUS_ADRALN;
309 break;
310 case T_DOUBLEFLT: /* double fault */
/* Also reached for any trap type that has no case in this switch. */
311 default:
312 signo = SIGBUS;
313 ucode = BUS_OBJERR;
314 break;
315
316 case T_PAGEFLT: /* page fault */
317 /*
318 * Give the ABI's sv_trap hook, if one is set, the first
 * chance at this trap; a zero return means it was handled.
319 */
320 if (*p->p_sysent->sv_trap != NULL &&
321 (*p->p_sysent->sv_trap)(td) == 0)
322 return;
323
324 addr = frame->tf_addr;
/*
 * trap_pfault() returns -1 after it has called trap_fatal(),
 * 0 when the fault needs no signal, and otherwise SIGSEGV or
 * SIGBUS.  A SIGBUS result is mapped below according to
 * prot_fault_translation.
 */
325 signo = trap_pfault(frame, TRUE);
326 if (signo == -1)
327 return;
328 if (signo == 0)
329 goto userret;
330 if (signo == SIGSEGV) {
331 ucode = SEGV_MAPERR;
332 } else if (prot_fault_translation == 0) {
333 /*
334 * Autodetect. This check also covers
335 * the images without the ABI-tag ELF
336 * note.
337 */
338 if (SV_CURPROC_ABI() == SV_ABI_FREEBSD &&
339 p->p_osrel >= P_OSREL_SIGSEGV) {
340 signo = SIGSEGV;
341 ucode = SEGV_ACCERR;
342 } else {
343 signo = SIGBUS;
344 ucode = BUS_PAGE_FAULT;
345 }
346 } else if (prot_fault_translation == 1) {
347 /*
348 * Always compat mode.
349 */
350 signo = SIGBUS;
351 ucode = BUS_PAGE_FAULT;
352 } else {
353 /*
354 * Always SIGSEGV mode.
355 */
356 signo = SIGSEGV;
357 ucode = SEGV_ACCERR;
358 }
359 break;
360
361 case T_DIVIDE: /* integer divide fault */
362 ucode = FPE_INTDIV;
363 signo = SIGFPE;
364 break;
365
366 #ifdef DEV_ISA
367 case T_NMI:
368 nmi_handle_intr(type, frame);
369 return;
370 #endif
371
372 case T_OFLOW: /* integer overflow fault */
373 ucode = FPE_INTOVF;
374 signo = SIGFPE;
375 break;
376
377 case T_BOUND: /* bounds check fault */
378 ucode = FPE_FLTSUB;
379 signo = SIGFPE;
380 break;
381
382 case T_DNA:
383 /* transparent fault (due to context switch "late") */
384 KASSERT(PCB_USER_FPU(td->td_pcb),
385 ("kernel FPU ctx has leaked"));
386 fpudna();
387 return;
388
389 case T_FPOPFLT: /* FPU operand fetch fault */
390 ucode = ILL_COPROC;
391 signo = SIGILL;
392 break;
393
394 case T_XMMFLT: /* SIMD floating-point exception */
395 ucode = fputrap_sse();
396 if (ucode == -1)
397 return;
398 signo = SIGFPE;
399 break;
400 #ifdef KDTRACE_HOOKS
401 case T_DTRACE_RET:
402 enable_intr();
403 if (dtrace_return_probe_ptr != NULL)
404 dtrace_return_probe_ptr(frame);
405 return;
406 #endif
407 }
408 } else {
409 /* kernel trap */
410
411 KASSERT(cold || td->td_ucred != NULL,
412 ("kernel trap doesn't have ucred"));
413 switch (type) {
414 case T_PAGEFLT: /* page fault */
/*
 * The result is ignored: with usermode == FALSE trap_pfault()
 * returns only 0 (handled) or -1 (after calling trap_fatal()).
 */
415 (void) trap_pfault(frame, FALSE);
416 return;
417
418 case T_DNA:
419 if (PCB_USER_FPU(td->td_pcb))
420 panic("Unregistered use of FPU in kernel");
421 fpudna();
422 return;
423
424 case T_ARITHTRAP: /* arithmetic trap */
425 case T_XMMFLT: /* SIMD floating-point exception */
426 case T_FPOPFLT: /* FPU operand fetch fault */
427 /*
428 * For now, supporting kernel handler
429 * registration for FPU traps is overkill.
430 */
431 trap_fatal(frame, 0);
432 return;
433
434 case T_STKFLT: /* stack fault */
435 case T_PROTFLT: /* general protection fault */
436 case T_SEGNPFLT: /* segment not present fault */
/*
 * No recovery is attempted at a non-zero interrupt nesting
 * level; leaving the switch leads to trap_fatal() below.
 */
437 if (td->td_intr_nesting_level != 0)
438 break;
439
440 /*
441 * Invalid segment selectors and out of bounds
442 * %rip's and %rsp's can be set up in user mode.
443 * This causes a fault in kernel mode when the
444 * kernel tries to return to user mode. We want
445 * to get this fault so that we can fix the
446 * problem here and not have to check all the
447 * selectors and pointers when the user changes
448 * them.
449 *
450 * In case of PTI, the IRETQ faulted while the
451 * kernel used the pti stack, and exception
452 * frame records %rsp value pointing to that
453 * stack. If we return normally to
454 * doreti_iret_fault, the trapframe is
455 * reconstructed on pti stack, and calltrap()
456 * called on it as well. Due to the very
457 * limited pti stack size, kernel does not
458 * survive for too long. Switch to the normal
459 * thread stack for the trap handling.
460 *
461 * Magic '5' is the number of qwords occupied by
462 * the hardware trap frame.
463 */
464 if (frame->tf_rip == (long)doreti_iret) {
465 frame->tf_rip = (long)doreti_iret_fault;
466 if (pti && frame->tf_rsp == (uintptr_t)PCPU_PTR(
467 pti_stack) + (PC_PTI_STACK_SZ - 5) *
468 sizeof(register_t))
469 frame->tf_rsp = PCPU_GET(rsp0) - 5 *
470 sizeof(register_t);
471 return;
472 }
/*
 * A fault at one of the known segment or base register load
 * instructions resumes at the matching *_load_fault label.
 */
473 if (frame->tf_rip == (long)ld_ds) {
474 frame->tf_rip = (long)ds_load_fault;
475 return;
476 }
477 if (frame->tf_rip == (long)ld_es) {
478 frame->tf_rip = (long)es_load_fault;
479 return;
480 }
481 if (frame->tf_rip == (long)ld_fs) {
482 frame->tf_rip = (long)fs_load_fault;
483 return;
484 }
485 if (frame->tf_rip == (long)ld_gs) {
486 frame->tf_rip = (long)gs_load_fault;
487 return;
488 }
489 if (frame->tf_rip == (long)ld_gsbase) {
490 frame->tf_rip = (long)gsbase_load_fault;
491 return;
492 }
493 if (frame->tf_rip == (long)ld_fsbase) {
494 frame->tf_rip = (long)fsbase_load_fault;
495 return;
496 }
/* Otherwise resume at the registered pcb_onfault handler, if any. */
497 if (curpcb->pcb_onfault != NULL) {
498 frame->tf_rip = (long)curpcb->pcb_onfault;
499 return;
500 }
501 break;
502
503 case T_TSSFLT:
504 /*
505 * PSL_NT can be set in user mode and isn't cleared
506 * automatically when the kernel is entered. This
507 * causes a TSS fault when the kernel attempts to
508 * `iret' because the TSS link is uninitialized. We
509 * want to get this fault so that we can fix the
510 * problem here and not every time the kernel is
511 * entered.
512 */
513 if (frame->tf_rflags & PSL_NT) {
514 frame->tf_rflags &= ~PSL_NT;
515 return;
516 }
517 break;
518
519 case T_TRCTRAP: /* trace trap */
520 /*
521 * Ignore debug register trace traps due to
522 * accesses in the user's address space, which
523 * can happen under several conditions such as
524 * if a user sets a watchpoint on a buffer and
525 * then passes that buffer to a system call.
526 * We still want to get TRCTRAPS for addresses
527 * in kernel space because that is useful when
528 * debugging the kernel.
529 */
530 if (user_dbreg_trap()) {
531 /*
532 * Reset breakpoint bits because the
533 * processor doesn't
534 */
535 load_dr6(rdr6() & ~0xf);
536 return;
537 }
538
539 /*
540 * Malicious user code can configure a debug
541 * register watchpoint to trap on data access
542 * to the top of stack and then execute 'pop
543 * %ss; int 3'. Due to exception deferral for
544 * 'pop %ss', the CPU will not interrupt 'int
545 * 3' to raise the DB# exception for the debug
546 * register but will postpone the DB# until
547 * execution of the first instruction of the
548 * BP# handler (in kernel mode). Normally the
549 * previous check would ignore DB# exceptions
550 * for watchpoints on user addresses raised in
551 * kernel mode. However, some CPU errata
552 * include cases where DB# exceptions do not
553 * properly set bits in %dr6, e.g. Haswell
554 * HSD23 and Skylake-X SKZ24.
555 *
556 * A deferred DB# can also be raised on the
557 * first instructions of system call entry
558 * points or single-step traps via similar use
559 * of 'pop %ss' or 'mov xxx, %ss'.
560 */
561 if (pti) {
562 if (frame->tf_rip ==
563 (uintptr_t)IDTVEC(fast_syscall_pti) ||
564 #ifdef COMPAT_FREEBSD32
565 frame->tf_rip ==
566 (uintptr_t)IDTVEC(int0x80_syscall_pti) ||
567 #endif
568 frame->tf_rip == (uintptr_t)IDTVEC(bpt_pti))
569 return;
570 } else {
571 if (frame->tf_rip ==
572 (uintptr_t)IDTVEC(fast_syscall) ||
573 #ifdef COMPAT_FREEBSD32
574 frame->tf_rip ==
575 (uintptr_t)IDTVEC(int0x80_syscall) ||
576 #endif
577 frame->tf_rip == (uintptr_t)IDTVEC(bpt))
578 return;
579 }
580 if (frame->tf_rip == (uintptr_t)IDTVEC(dbg) ||
581 /* Needed for AMD. */
582 frame->tf_rip == (uintptr_t)IDTVEC(fast_syscall32))
583 return;
584 /*
585 * FALLTHROUGH (TRCTRAP kernel mode, kernel address)
586 */
587 case T_BPTFLT:
588 /*
589 * If KDB is enabled, let it handle the debugger trap.
590 * Otherwise, debugger traps "can't happen".
591 */
592 #ifdef KDB
593 /* XXX %dr6 is not quite reentrant. */
594 dr6 = rdr6();
595 load_dr6(dr6 & ~0x4000);
596 if (kdb_trap(type, dr6, frame))
597 return;
598 #endif
599 break;
600
601 #ifdef DEV_ISA
602 case T_NMI:
603 nmi_handle_intr(type, frame);
604 return;
605 #endif
606 }
607
/* Any kernel trap that was not handled and returned above is fatal. */
608 trap_fatal(frame, 0);
609 return;
610 }
611
612 /* Translate fault for emulators (e.g. Linux) */
613 if (*p->p_sysent->sv_transtrap != NULL)
614 signo = (*p->p_sysent->sv_transtrap)(signo, type);
615
616 ksiginfo_init_trap(&ksi);
617 ksi.ksi_signo = signo;
618 ksi.ksi_code = ucode;
619 ksi.ksi_trapno = type;
620 ksi.ksi_addr = (void *)addr;
/*
 * Optional diagnostic: the eight values printed in <...> are fetched
 * with fubyte() from the trapped %rip onwards.
 */
621 if (uprintf_signal) {
622 uprintf("pid %d comm %s: signal %d err %lx code %d type %d "
623 "addr 0x%lx rsp 0x%lx rip 0x%lx "
624 "<%02x %02x %02x %02x %02x %02x %02x %02x>\n",
625 p->p_pid, p->p_comm, signo, frame->tf_err, ucode, type,
626 addr, frame->tf_rsp, frame->tf_rip,
627 fubyte((void *)(frame->tf_rip + 0)),
628 fubyte((void *)(frame->tf_rip + 1)),
629 fubyte((void *)(frame->tf_rip + 2)),
630 fubyte((void *)(frame->tf_rip + 3)),
631 fubyte((void *)(frame->tf_rip + 4)),
632 fubyte((void *)(frame->tf_rip + 5)),
633 fubyte((void *)(frame->tf_rip + 6)),
634 fubyte((void *)(frame->tf_rip + 7)));
635 }
636 KASSERT((read_rflags() & PSL_I) != 0, ("interrupts disabled"));
637 trapsignal(td, &ksi);
/* Reached directly (no signal posted) when a user page fault was resolved. */
638 userret:
639 userret(td, frame);
640 KASSERT(PCB_USER_FPU(td->td_pcb),
641 ("Return from trap with kernel FPU ctx leaked"));
642 }
643
/*
 * Front end to trap().  When KDTRACE_HOOKS is configured and a
 * dtrace_trap_func hook is installed, the hook sees the trap first; a
 * non-zero return from it means the trap was consumed and trap() is not
 * called.  This keeps DTrace-induced faults away from the regular handler.
 * Per the original note, this function cannot be instrumented and so
 * cannot generate such faults itself.
 */
void
trap_check(struct trapframe *frame)
{

#ifdef KDTRACE_HOOKS
	if (dtrace_trap_func != NULL) {
		if ((*dtrace_trap_func)(frame, frame->tf_trapno) != 0)
			return;
	}
#endif
	trap(frame);
}
659
660 static int
661 trap_pfault(struct trapframe *frame, int usermode)
662 {
663 struct thread *td;
664 struct proc *p;
665 vm_map_t map;
666 vm_offset_t va;
667 int rv;
668 vm_prot_t ftype;
669 vm_offset_t eva;
670
671 td = curthread;
672 p = td->td_proc;
673 eva = frame->tf_addr;
674
675 if (__predict_false((td->td_pflags & TDP_NOFAULTING) != 0)) {
676 /*
677 * Due to both processor errata and lazy TLB invalidation when
678 * access restrictions are removed from virtual pages, memory
679 * accesses that are allowed by the physical mapping layer may
680 * nonetheless cause one spurious page fault per virtual page.
681 * When the thread is executing a "no faulting" section that
682 * is bracketed by vm_fault_{disable,enable}_pagefaults(),
683 * every page fault is treated as a spurious page fault,
684 * unless it accesses the same virtual address as the most
685 * recent page fault within the same "no faulting" section.
686 */
687 if (td->td_md.md_spurflt_addr != eva ||
688 (td->td_pflags & TDP_RESETSPUR) != 0) {
689 /*
690 * Do nothing to the TLB. A stale TLB entry is
691 * flushed automatically by a page fault.
692 */
693 td->td_md.md_spurflt_addr = eva;
694 td->td_pflags &= ~TDP_RESETSPUR;
695 return (0);
696 }
697 } else {
698 /*
699 * If we get a page fault while in a critical section, then
700 * it is most likely a fatal kernel page fault. The kernel
701 * is already going to panic trying to get a sleep lock to
702 * do the VM lookup, so just consider it a fatal trap so the
703 * kernel can print out a useful trap message and even get
704 * to the debugger.
705 *
706 * If we get a page fault while holding a non-sleepable
707 * lock, then it is most likely a fatal kernel page fault.
708 * If WITNESS is enabled, then it's going to whine about
709 * bogus LORs with various VM locks, so just skip to the
710 * fatal trap handling directly.
711 */
712 if (td->td_critnest != 0 ||
713 WITNESS_CHECK(WARN_SLEEPOK | WARN_GIANTOK, NULL,
714 "Kernel page fault") != 0) {
715 trap_fatal(frame, eva);
716 return (-1);
717 }
718 }
719 va = trunc_page(eva);
720 if (va >= VM_MIN_KERNEL_ADDRESS) {
721 /*
722 * Don't allow user-mode faults in kernel address space.
723 */
724 if (usermode)
725 return (SIGSEGV);
726
727 map = kernel_map;
728 } else {
729 map = &p->p_vmspace->vm_map;
730
731 /*
732 * When accessing a usermode address, kernel must be
733 * ready to accept the page fault, and provide a
734 * handling routine. Since accessing the address
735 * without the handler is a bug, do not try to handle
736 * it normally, and panic immediately.
737 */
738 if (!usermode && (td->td_intr_nesting_level != 0 ||
739 curpcb->pcb_onfault == NULL)) {
740 trap_fatal(frame, eva);
741 return (-1);
742 }
743 }
744
745 /*
746 * If the trap was caused by errant bits in the PTE then panic.
747 */
748 if (frame->tf_err & PGEX_RSV) {
749 trap_fatal(frame, eva);
750 return (-1);
751 }
752
753 /*
754 * If nx protection of the usermode portion of kernel page
755 * tables caused trap, panic.
756 */
757 if (pti && usermode && pg_nx != 0 && (frame->tf_err & (PGEX_P | PGEX_W |
758 PGEX_U | PGEX_I)) == (PGEX_P | PGEX_U | PGEX_I) &&
759 (curpcb->pcb_saved_ucr3 & ~CR3_PCID_MASK)==
760 (PCPU_GET(curpmap)->pm_cr3 & ~CR3_PCID_MASK))
761 panic("PTI: pid %d comm %s tf_err %#lx\n", p->p_pid,
762 p->p_comm, frame->tf_err);
763
764 /*
765 * PGEX_I is defined only if the execute disable bit capability is
766 * supported and enabled.
767 */
768 if (frame->tf_err & PGEX_W)
769 ftype = VM_PROT_WRITE;
770 else if ((frame->tf_err & PGEX_I) && pg_nx != 0)
771 ftype = VM_PROT_EXECUTE;
772 else
773 ftype = VM_PROT_READ;
774
775 /* Fault in the page. */
776 rv = vm_fault(map, va, ftype, VM_FAULT_NORMAL);
777 if (rv == KERN_SUCCESS) {
778 #ifdef HWPMC_HOOKS
779 if (ftype == VM_PROT_READ || ftype == VM_PROT_WRITE) {
780 PMC_SOFT_CALL_TF( , , page_fault, all, frame);
781 if (ftype == VM_PROT_READ)
782 PMC_SOFT_CALL_TF( , , page_fault, read,
783 frame);
784 else
785 PMC_SOFT_CALL_TF( , , page_fault, write,
786 frame);
787 }
788 #endif
789 return (0);
790 }
791 if (!usermode) {
792 if (td->td_intr_nesting_level == 0 &&
793 curpcb->pcb_onfault != NULL) {
794 frame->tf_rip = (long)curpcb->pcb_onfault;
795 return (0);
796 }
797 trap_fatal(frame, eva);
798 return (-1);
799 }
800 return ((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV);
801 }
802
803 static void
804 trap_fatal(frame, eva)
805 struct trapframe *frame;
806 vm_offset_t eva;
807 {
808 int code, ss;
809 u_int type;
810 struct soft_segment_descriptor softseg;
811 char *msg;
812 #ifdef KDB
813 bool handled;
814 #endif
815
816 code = frame->tf_err;
817 type = frame->tf_trapno;
818 sdtossd(&gdt[NGDT * PCPU_GET(cpuid) + IDXSEL(frame->tf_cs & 0xffff)],
819 &softseg);
820
821 if (type <= MAX_TRAP_MSG)
822 msg = trap_msg[type];
823 else
824 msg = "UNKNOWN";
825 printf("\n\nFatal trap %d: %s while in %s mode\n", type, msg,
826 TRAPF_USERMODE(frame) ? "user" : "kernel");
827 #ifdef SMP
828 /* two separate prints in case of a trap on an unmapped page */
829 printf("cpuid = %d; ", PCPU_GET(cpuid));
830 printf("apic id = %02x\n", PCPU_GET(apic_id));
831 #endif
832 if (type == T_PAGEFLT) {
833 printf("fault virtual address = 0x%lx\n", eva);
834 printf("fault code = %s %s %s, %s\n",
835 code & PGEX_U ? "user" : "supervisor",
836 code & PGEX_W ? "write" : "read",
837 code & PGEX_I ? "instruction" : "data",
838 code & PGEX_RSV ? "reserved bits in PTE" :
839 code & PGEX_P ? "protection violation" : "page not present");
840 }
841 printf("instruction pointer = 0x%lx:0x%lx\n",
842 frame->tf_cs & 0xffff, frame->tf_rip);
843 ss = frame->tf_ss & 0xffff;
844 printf("stack pointer = 0x%x:0x%lx\n", ss, frame->tf_rsp);
845 printf("frame pointer = 0x%x:0x%lx\n", ss, frame->tf_rbp);
846 printf("code segment = base 0x%lx, limit 0x%lx, type 0x%x\n",
847 softseg.ssd_base, softseg.ssd_limit, softseg.ssd_type);
848 printf(" = DPL %d, pres %d, long %d, def32 %d, gran %d\n",
849 softseg.ssd_dpl, softseg.ssd_p, softseg.ssd_long, softseg.ssd_def32,
850 softseg.ssd_gran);
851 printf("processor eflags = ");
852 if (frame->tf_rflags & PSL_T)
853 printf("trace trap, ");
854 if (frame->tf_rflags & PSL_I)
855 printf("interrupt enabled, ");
856 if (frame->tf_rflags & PSL_NT)
857 printf("nested task, ");
858 if (frame->tf_rflags & PSL_RF)
859 printf("resume, ");
860 printf("IOPL = %ld\n", (frame->tf_rflags & PSL_IOPL) >> 12);
861 printf("current process = %d (%s)\n",
862 curproc->p_pid, curthread->td_name);
863
864 #ifdef KDB
865 if (debugger_on_panic) {
866 kdb_why = KDB_WHY_TRAP;
867 handled = kdb_trap(type, 0, frame);
868 kdb_why = KDB_WHY_UNSET;
869 if (handled)
870 return;
871 }
872 #endif
873 printf("trap number = %d\n", type);
874 if (type <= MAX_TRAP_MSG)
875 panic("%s", trap_msg[type]);
876 else
877 panic("unknown/reserved trap");
878 }
879
880 /*
881 * Double fault handler. Called when a fault occurs while writing
882 * a frame for a trap/exception onto the stack. This usually occurs
883 * when the stack overflows (such is the case with infinite recursion,
884 * for example).
885 */
886 void
887 dblfault_handler(struct trapframe *frame)
888 {
889 #ifdef KDTRACE_HOOKS
890 if (dtrace_doubletrap_func != NULL)
891 (*dtrace_doubletrap_func)();
892 #endif
893 printf("\nFatal double fault\n"
894 "rip %#lx rsp %#lx rbp %#lx\n"
895 "rax %#lx rdx %#lx rbx %#lx\n"
896 "rcx %#lx rsi %#lx rdi %#lx\n"
897 "r8 %#lx r9 %#lx r10 %#lx\n"
898 "r11 %#lx r12 %#lx r13 %#lx\n"
899 "r14 %#lx r15 %#lx rflags %#lx\n"
900 "cs %#lx ss %#lx ds %#hx es %#hx fs %#hx gs %#hx\n"
901 "fsbase %#lx gsbase %#lx kgsbase %#lx\n",
902 frame->tf_rip, frame->tf_rsp, frame->tf_rbp,
903 frame->tf_rax, frame->tf_rdx, frame->tf_rbx,
904 frame->tf_rcx, frame->tf_rdi, frame->tf_rsi,
905 frame->tf_r8, frame->tf_r9, frame->tf_r10,
906 frame->tf_r11, frame->tf_r12, frame->tf_r13,
907 frame->tf_r14, frame->tf_r15, frame->tf_rflags,
908 frame->tf_cs, frame->tf_ss, frame->tf_ds, frame->tf_es,
909 frame->tf_fs, frame->tf_gs,
910 rdmsr(MSR_FSBASE), rdmsr(MSR_GSBASE), rdmsr(MSR_KGSBASE));
911 #ifdef SMP
912 /* two separate prints in case of a trap on an unmapped page */
913 printf("cpuid = %d; ", PCPU_GET(cpuid));
914 printf("apic id = %02x\n", PCPU_GET(apic_id));
915 #endif
916 panic("double fault");
917 }
918
919 int
920 cpu_fetch_syscall_args(struct thread *td)
921 {
922 struct proc *p;
923 struct trapframe *frame;
924 register_t *argp;
925 struct syscall_args *sa;
926 caddr_t params;
927 int reg, regcnt, error;
928
929 p = td->td_proc;
930 frame = td->td_frame;
931 sa = &td->td_sa;
932 reg = 0;
933 regcnt = 6;
934
935 params = (caddr_t)frame->tf_rsp + sizeof(register_t);
936 sa->code = frame->tf_rax;
937
938 if (sa->code == SYS_syscall || sa->code == SYS___syscall) {
939 sa->code = frame->tf_rdi;
940 reg++;
941 regcnt--;
942 }
943 if (p->p_sysent->sv_mask)
944 sa->code &= p->p_sysent->sv_mask;
945
946 if (sa->code >= p->p_sysent->sv_size)
947 sa->callp = &p->p_sysent->sv_table[0];
948 else
949 sa->callp = &p->p_sysent->sv_table[sa->code];
950
951 sa->narg = sa->callp->sy_narg;
952 KASSERT(sa->narg <= sizeof(sa->args) / sizeof(sa->args[0]),
953 ("Too many syscall arguments!"));
954 error = 0;
955 argp = &frame->tf_rdi;
956 argp += reg;
957 bcopy(argp, sa->args, sizeof(sa->args[0]) * regcnt);
958 if (sa->narg > regcnt) {
959 KASSERT(params != NULL, ("copyin args with no params!"));
960 error = copyin(params, &sa->args[regcnt],
961 (sa->narg - regcnt) * sizeof(sa->args[0]));
962 }
963
964 if (error == 0) {
965 td->td_retval[0] = 0;
966 td->td_retval[1] = frame->tf_rdx;
967 }
968
969 return (error);
970 }
971
972 #include "../../kern/subr_syscall.c"
973
974 /*
975 * System call handler for native binaries. The trap frame is already
976 * set up by the assembler trampoline and a pointer to it is saved in
977 * td_frame.
978 */
979 void
980 amd64_syscall(struct thread *td, int traced)
981 {
982 int error;
983 ksiginfo_t ksi;
984
985 #ifdef DIAGNOSTIC
986 if (!TRAPF_USERMODE(td->td_frame)) {
987 panic("syscall");
988 /* NOT REACHED */
989 }
990 #endif
991 error = syscallenter(td);
992
993 /*
994 * Traced syscall.
995 */
996 if (__predict_false(traced)) {
997 td->td_frame->tf_rflags &= ~PSL_T;
998 ksiginfo_init_trap(&ksi);
999 ksi.ksi_signo = SIGTRAP;
1000 ksi.ksi_code = TRAP_TRACE;
1001 ksi.ksi_addr = (void *)td->td_frame->tf_rip;
1002 trapsignal(td, &ksi);
1003 }
1004
1005 KASSERT(PCB_USER_FPU(td->td_pcb),
1006 ("System call %s returning with kernel FPU ctx leaked",
1007 syscallname(td->td_proc, td->td_sa.code)));
1008 KASSERT(td->td_pcb->pcb_save == get_pcb_user_save_td(td),
1009 ("System call %s returning with mangled pcb_save",
1010 syscallname(td->td_proc, td->td_sa.code)));
1011 KASSERT(td->td_md.md_invl_gen.gen == 0,
1012 ("System call %s returning with leaked invl_gen %lu",
1013 syscallname(td->td_proc, td->td_sa.code),
1014 td->td_md.md_invl_gen.gen));
1015
1016 syscallret(td, error);
1017
1018 /*
1019 * If the user-supplied value of %rip is not a canonical
1020 * address, then some CPUs will trigger a ring 0 #GP during
1021 * the sysret instruction. However, the fault handler would
1022 * execute in ring 0 with the user's %gs and %rsp which would
1023 * not be safe. Instead, use the full return path which
1024 * catches the problem safely.
1025 */
1026 if (__predict_false(td->td_frame->tf_rip >= VM_MAXUSER_ADDRESS))
1027 set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
1028 }
Cache object: 02a28db0a37be35013809a5595b90b6f
|