FreeBSD/Linux Kernel Cross Reference
sys/i386/i386/trap.c
1 /*-
2 * SPDX-License-Identifier: BSD-4-Clause
3 *
4 * Copyright (C) 1994, David Greenman
5 * Copyright (c) 1990, 1993
6 * The Regents of the University of California. All rights reserved.
7 *
8 * This code is derived from software contributed to Berkeley by
9 * the University of Utah, and William Jolitz.
10 *
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
13 * are met:
14 * 1. Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright
17 * notice, this list of conditions and the following disclaimer in the
18 * documentation and/or other materials provided with the distribution.
19 * 3. All advertising materials mentioning features or use of this software
20 * must display the following acknowledgement:
21 * This product includes software developed by the University of
22 * California, Berkeley and its contributors.
23 * 4. Neither the name of the University nor the names of its contributors
24 * may be used to endorse or promote products derived from this software
25 * without specific prior written permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
28 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
31 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
32 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
33 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
34 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
35 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
36 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
37 * SUCH DAMAGE.
38 *
39 * from: @(#)trap.c 7.4 (Berkeley) 5/13/91
40 */
41
42 #include <sys/cdefs.h>
43 __FBSDID("$FreeBSD$");
44
45 /*
46 * 386 Trap and System call handling
47 */
48
49 #include "opt_clock.h"
50 #include "opt_compat.h"
51 #include "opt_cpu.h"
52 #include "opt_hwpmc_hooks.h"
53 #include "opt_isa.h"
54 #include "opt_kdb.h"
55 #include "opt_trap.h"
56
57 #include <sys/param.h>
58 #include <sys/bus.h>
59 #include <sys/systm.h>
60 #include <sys/proc.h>
61 #include <sys/ptrace.h>
62 #include <sys/kdb.h>
63 #include <sys/kernel.h>
64 #include <sys/ktr.h>
65 #include <sys/lock.h>
66 #include <sys/mutex.h>
67 #include <sys/resourcevar.h>
68 #include <sys/signalvar.h>
69 #include <sys/syscall.h>
70 #include <sys/sysctl.h>
71 #include <sys/sysent.h>
72 #include <sys/uio.h>
73 #include <sys/vmmeter.h>
74 #ifdef HWPMC_HOOKS
75 #include <sys/pmckern.h>
76 PMC_SOFT_DEFINE( , , page_fault, all);
77 PMC_SOFT_DEFINE( , , page_fault, read);
78 PMC_SOFT_DEFINE( , , page_fault, write);
79 #endif
80 #include <security/audit/audit.h>
81
82 #include <vm/vm.h>
83 #include <vm/vm_param.h>
84 #include <vm/pmap.h>
85 #include <vm/vm_kern.h>
86 #include <vm/vm_map.h>
87 #include <vm/vm_page.h>
88 #include <vm/vm_extern.h>
89
90 #include <machine/cpu.h>
91 #include <machine/intr_machdep.h>
92 #include <x86/mca.h>
93 #include <machine/md_var.h>
94 #include <machine/pcb.h>
95 #ifdef SMP
96 #include <machine/smp.h>
97 #endif
98 #include <machine/stack.h>
99 #include <machine/trap.h>
100 #include <machine/tss.h>
101 #include <machine/vm86.h>
102
103 #ifdef POWERFAIL_NMI
104 #include <sys/syslog.h>
105 #include <machine/clock.h>
106 #endif
107
108 #ifdef KDTRACE_HOOKS
109 #include <sys/dtrace_bsd.h>
110 #endif
111
/* Entry points with external linkage that are defined in this file. */
void trap(struct trapframe *frame);
void syscall(struct trapframe *frame);

/* File-local helpers. */
static int trap_pfault(struct trapframe *, bool, vm_offset_t, int *, int *);
static void trap_fatal(struct trapframe *, vm_offset_t);
#ifdef KDTRACE_HOOKS
static bool trap_user_dtrace(struct trapframe *,
    int (**hook)(struct trapframe *));
#endif
void dblfault_handler(void);

/*
 * Symbols defined outside this file.  The IDTVEC() entry addresses are
 * compared against the trapping %eip in trap(); pg_nx is tested against
 * zero before the PGEX_I bit of a page-fault error code is interpreted.
 */
extern inthand_t IDTVEC(bpt), IDTVEC(dbg), IDTVEC(int0x80_syscall);
extern uint64_t pg_nx;
125
/*
 * Attributes of one trap type.  Instances live in the trap_data[] table,
 * which is indexed by the T_* trap number.
 */
struct trap_data {
	bool ei;		/* value reported by trap_enable_intr() */
	const char *msg;	/* description; NULL in slots with no entry */
};
130
/*
 * Table of known trap types, indexed by T_* trap number.  Slots that are
 * not listed here are zero-filled by the designated initializer, so their
 * msg is NULL; trap_enable_intr() and trap_msg() treat those as unknown.
 */
static const struct trap_data trap_data[] = {
	[T_PRIVINFLT] =	{ .ei = true,	.msg = "privileged instruction fault" },
	[T_BPTFLT] =	{ .ei = false,	.msg = "breakpoint instruction fault" },
	[T_ARITHTRAP] =	{ .ei = true,	.msg = "arithmetic trap" },
	[T_PROTFLT] =	{ .ei = true,	.msg = "general protection fault" },
	[T_TRCTRAP] =	{ .ei = false,	.msg = "debug exception" },
	[T_PAGEFLT] =	{ .ei = true,	.msg = "page fault" },
	[T_ALIGNFLT] =	{ .ei = true,	.msg = "alignment fault" },
	[T_DIVIDE] =	{ .ei = true,	.msg = "integer divide fault" },
	[T_NMI] =	{ .ei = false,	.msg = "non-maskable interrupt trap" },
	[T_OFLOW] =	{ .ei = true,	.msg = "overflow trap" },
	[T_BOUND] =	{ .ei = true,	.msg = "FPU bounds check fault" },
	[T_DNA] =	{ .ei = true,	.msg = "FPU device not available" },
	[T_DOUBLEFLT] =	{ .ei = false,	.msg = "double fault" },
	[T_FPOPFLT] =	{ .ei = true,	.msg = "FPU operand fetch fault" },
	[T_TSSFLT] =	{ .ei = true,	.msg = "invalid TSS fault" },
	[T_SEGNPFLT] =	{ .ei = true,	.msg = "segment not present fault" },
	[T_STKFLT] =	{ .ei = true,	.msg = "stack fault" },
	[T_MCHK] =	{ .ei = true,	.msg = "machine check trap" },
	[T_XMMFLT] =	{ .ei = true,	.msg = "SIMD floating-point exception" },
	[T_DTRACE_RET] ={ .ei = true,	.msg = "DTrace pid return trap" },
};
153
154 static bool
155 trap_enable_intr(int trapno)
156 {
157
158 MPASS(trapno > 0);
159 if (trapno < nitems(trap_data) && trap_data[trapno].msg != NULL)
160 return (trap_data[trapno].ei);
161 return (false);
162 }
163
164 static const char *
165 trap_msg(int trapno)
166 {
167 const char *res;
168 static const char unkn[] = "UNKNOWN";
169
170 res = NULL;
171 if (trapno < nitems(trap_data))
172 res = trap_data[trapno].msg;
173 if (res == NULL)
174 res = unkn;
175 return (res);
176 }
177
#if defined(I586_CPU) && !defined(NO_F00F_HACK)
/*
 * When non-zero, trap_pfault() reports a fault on &idt[6] as an illegal
 * instruction rather than as a page fault.
 */
int has_f00f_bug = 0;		/* Initialized so that it can be patched. */
#endif

/*
 * machdep.uprintf_signal: when non-zero, trap() prints the signal, trap
 * type, selected registers and the bytes at the faulting %eip via
 * uprintf() before it delivers a trap-generated signal.
 */
static int uprintf_signal;
SYSCTL_INT(_machdep, OID_AUTO, uprintf_signal, CTLFLAG_RW,
    &uprintf_signal, 0,
    "Print debugging information on trap signal to ctty");
186
187
188 #ifdef INVARIANTS
189 static __inline register_t
190 read_esp(void)
191 {
192 register_t res;
193
194 __asm __volatile("movl\t%%esp,%0" : "=r" (res));
195 return (res);
196 }
197
198 void
199 trap_check_kstack(void)
200 {
201 struct thread *td;
202 vm_offset_t stk;
203
204 td = curthread;
205 stk = read_esp();
206 if (stk >= PMAP_TRM_MIN_ADDRESS)
207 panic("td %p stack %#x in trampoline", td, stk);
208 if (stk < td->td_kstack || stk >= td->td_kstack +
209 ptoa(td->td_kstack_pages))
210 panic("td %p stack %#x not in kstack VA %#x %d",
211 td, stk, td->td_kstack, td->td_kstack_pages);
212 }
213 #endif
214
215 /*
216 * Exception, fault, and trap interface to the FreeBSD kernel.
217 * This common code is called from assembly language IDT gate entry
218 * routines that prepare a suitable stack frame, and restore this
219 * frame after the exception has been processed.
220 */
221
222 void
223 trap(struct trapframe *frame)
224 {
225 ksiginfo_t ksi;
226 struct thread *td;
227 struct proc *p;
228 int pf, signo, ucode;
229 u_int type;
230 register_t addr, dr6;
231 vm_offset_t eva;
232 #ifdef POWERFAIL_NMI
233 static int lastalert = 0;
234 #endif
235
236 td = curthread;
237 p = td->td_proc;
238 dr6 = 0;
239
240 VM_CNT_INC(v_trap);
241 type = frame->tf_trapno;
242
243 KASSERT((read_eflags() & PSL_I) == 0,
244 ("trap: interrupts enabled, type %d frame %p", type, frame));
245
246 #ifdef SMP
247 /* Handler for NMI IPIs used for stopping CPUs. */
248 if (type == T_NMI && ipi_nmi_handler() == 0)
249 return;
250 #endif /* SMP */
251
252 #ifdef KDB
253 if (kdb_active) {
254 kdb_reenter();
255 return;
256 }
257 #endif
258 trap_check_kstack();
259
260 if (type == T_RESERVED) {
261 trap_fatal(frame, 0);
262 return;
263 }
264
265 if (type == T_NMI) {
266 #ifdef HWPMC_HOOKS
267 /*
268 * CPU PMCs interrupt using an NMI so we check for that first.
269 * If the HWPMC module is active, 'pmc_hook' will point to
270 * the function to be called. A non-zero return value from the
271 * hook means that the NMI was consumed by it and that we can
272 * return immediately.
273 */
274 if (pmc_intr != NULL &&
275 (*pmc_intr)(frame) != 0)
276 return;
277 #endif
278 }
279
280 if (type == T_MCHK) {
281 mca_intr();
282 return;
283 }
284
285 #ifdef KDTRACE_HOOKS
286 /*
287 * A trap can occur while DTrace executes a probe. Before
288 * executing the probe, DTrace blocks re-scheduling and sets
289 * a flag in its per-cpu flags to indicate that it doesn't
290 * want to fault. On returning from the probe, the no-fault
291 * flag is cleared and finally re-scheduling is enabled.
292 */
293 if ((type == T_PROTFLT || type == T_PAGEFLT) &&
294 dtrace_trap_func != NULL && (*dtrace_trap_func)(frame, type))
295 return;
296 #endif
297
298 /*
299 * We must not allow context switches until %cr2 is read.
300 * Also, for some Cyrix CPUs, %cr2 is clobbered by interrupts.
301 * All faults use interrupt gates, so %cr2 can be safely read
302 * now, before optional enable of the interrupts below.
303 */
304 if (type == T_PAGEFLT)
305 eva = rcr2();
306
307 /*
308 * Buggy application or kernel code has disabled interrupts
309 * and then trapped. Enabling interrupts now is wrong, but it
310 * is better than running with interrupts disabled until they
311 * are accidentally enabled later.
312 */
313 if ((frame->tf_eflags & PSL_I) == 0 && TRAPF_USERMODE(frame) &&
314 (curpcb->pcb_flags & PCB_VM86CALL) == 0)
315 uprintf("pid %ld (%s): usermode trap %d (%s) with "
316 "interrupts disabled\n",
317 (long)curproc->p_pid, curthread->td_name, type,
318 trap_data[type].msg);
319
320 /*
321 * Conditionally reenable interrupts. If we hold a spin lock,
322 * then we must not reenable interrupts. This might be a
323 * spurious page fault.
324 */
325 if (trap_enable_intr(type) && td->td_md.md_spinlock_count == 0 &&
326 frame->tf_eip != (int)cpu_switch_load_gs)
327 enable_intr();
328
329 if (TRAPF_USERMODE(frame) && (curpcb->pcb_flags & PCB_VM86CALL) == 0) {
330 /* user trap */
331
332 td->td_pticks = 0;
333 td->td_frame = frame;
334 addr = frame->tf_eip;
335 if (td->td_cowgen != atomic_load_int(&p->p_cowgen))
336 thread_cow_update(td);
337
338 switch (type) {
339 case T_PRIVINFLT: /* privileged instruction fault */
340 signo = SIGILL;
341 ucode = ILL_PRVOPC;
342 break;
343
344 case T_BPTFLT: /* bpt instruction fault */
345 #ifdef KDTRACE_HOOKS
346 if (trap_user_dtrace(frame, &dtrace_pid_probe_ptr))
347 return;
348 #else
349 enable_intr();
350 #endif
351 signo = SIGTRAP;
352 ucode = TRAP_BRKPT;
353 break;
354
355 case T_TRCTRAP: /* debug exception */
356 enable_intr();
357 user_trctrap_out:
358 signo = SIGTRAP;
359 ucode = TRAP_TRACE;
360 dr6 = rdr6();
361 if ((dr6 & DBREG_DR6_BS) != 0) {
362 PROC_LOCK(td->td_proc);
363 if ((td->td_dbgflags & TDB_STEP) != 0) {
364 td->td_frame->tf_eflags &= ~PSL_T;
365 td->td_dbgflags &= ~TDB_STEP;
366 }
367 PROC_UNLOCK(td->td_proc);
368 }
369 break;
370
371 case T_ARITHTRAP: /* arithmetic trap */
372 ucode = npxtrap_x87();
373 if (ucode == -1)
374 return;
375 signo = SIGFPE;
376 break;
377
378 /*
379 * The following two traps can happen in vm86 mode,
380 * and, if so, we want to handle them specially.
381 */
382 case T_PROTFLT: /* general protection fault */
383 case T_STKFLT: /* stack fault */
384 if (frame->tf_eflags & PSL_VM) {
385 signo = vm86_emulate((struct vm86frame *)frame);
386 ucode = 0; /* XXXKIB: better code ? */
387 if (signo == SIGTRAP) {
388 load_dr6(rdr6() | 0x4000);
389 goto user_trctrap_out;
390 }
391 if (signo == 0)
392 goto user;
393 break;
394 }
395 signo = SIGBUS;
396 ucode = (type == T_PROTFLT) ? BUS_OBJERR : BUS_ADRERR;
397 break;
398 case T_SEGNPFLT: /* segment not present fault */
399 signo = SIGBUS;
400 ucode = BUS_ADRERR;
401 break;
402 case T_TSSFLT: /* invalid TSS fault */
403 signo = SIGBUS;
404 ucode = BUS_OBJERR;
405 break;
406 case T_ALIGNFLT:
407 signo = SIGBUS;
408 ucode = BUS_ADRALN;
409 break;
410 case T_DOUBLEFLT: /* double fault */
411 default:
412 signo = SIGBUS;
413 ucode = BUS_OBJERR;
414 break;
415
416 case T_PAGEFLT: /* page fault */
417 addr = eva;
418 pf = trap_pfault(frame, true, eva, &signo, &ucode);
419 #if defined(I586_CPU) && !defined(NO_F00F_HACK)
420 if (pf == -2) {
421 /*
422 * The f00f hack workaround has triggered, so
423 * treat the fault as an illegal instruction
424 * (T_PRIVINFLT) instead of a page fault.
425 */
426 type = frame->tf_trapno = T_PRIVINFLT;
427 break;
428 }
429 #endif
430 if (pf == -1)
431 return;
432 if (pf == 0)
433 goto user;
434 break;
435
436 case T_DIVIDE: /* integer divide fault */
437 ucode = FPE_INTDIV;
438 signo = SIGFPE;
439 break;
440
441 case T_NMI:
442 #ifdef POWERFAIL_NMI
443 #ifndef TIMER_FREQ
444 # define TIMER_FREQ 1193182
445 #endif
446 if (time_second - lastalert > 10) {
447 log(LOG_WARNING, "NMI: power fail\n");
448 sysbeep(880, SBT_1S);
449 lastalert = time_second;
450 }
451 return;
452 #else /* !POWERFAIL_NMI */
453 nmi_handle_intr(type, frame);
454 return;
455 #endif /* POWERFAIL_NMI */
456
457 case T_OFLOW: /* integer overflow fault */
458 ucode = FPE_INTOVF;
459 signo = SIGFPE;
460 break;
461
462 case T_BOUND: /* bounds check fault */
463 ucode = FPE_FLTSUB;
464 signo = SIGFPE;
465 break;
466
467 case T_DNA:
468 KASSERT(PCB_USER_FPU(td->td_pcb),
469 ("kernel FPU ctx has leaked"));
470 /* transparent fault (due to context switch "late") */
471 if (npxdna())
472 return;
473 uprintf("pid %d killed due to lack of floating point\n",
474 p->p_pid);
475 signo = SIGKILL;
476 ucode = 0;
477 break;
478
479 case T_FPOPFLT: /* FPU operand fetch fault */
480 ucode = ILL_COPROC;
481 signo = SIGILL;
482 break;
483
484 case T_XMMFLT: /* SIMD floating-point exception */
485 ucode = npxtrap_sse();
486 if (ucode == -1)
487 return;
488 signo = SIGFPE;
489 break;
490 #ifdef KDTRACE_HOOKS
491 case T_DTRACE_RET:
492 (void)trap_user_dtrace(frame, &dtrace_return_probe_ptr);
493 return;
494 #endif
495 }
496 } else {
497 /* kernel trap */
498
499 KASSERT(cold || td->td_ucred != NULL,
500 ("kernel trap doesn't have ucred"));
501 switch (type) {
502 case T_PAGEFLT: /* page fault */
503 (void)trap_pfault(frame, false, eva, NULL, NULL);
504 return;
505
506 case T_DNA:
507 if (PCB_USER_FPU(td->td_pcb))
508 panic("Unregistered use of FPU in kernel");
509 if (npxdna())
510 return;
511 break;
512
513 case T_ARITHTRAP: /* arithmetic trap */
514 case T_XMMFLT: /* SIMD floating-point exception */
515 case T_FPOPFLT: /* FPU operand fetch fault */
516 /*
517 * XXXKIB for now disable any FPU traps in kernel
518 * handler registration seems to be overkill
519 */
520 trap_fatal(frame, 0);
521 return;
522
523 /*
524 * The following two traps can happen in
525 * vm86 mode, and, if so, we want to handle
526 * them specially.
527 */
528 case T_PROTFLT: /* general protection fault */
529 case T_STKFLT: /* stack fault */
530 if (frame->tf_eflags & PSL_VM) {
531 signo = vm86_emulate((struct vm86frame *)frame);
532 if (signo == SIGTRAP) {
533 type = T_TRCTRAP;
534 load_dr6(rdr6() | 0x4000);
535 goto kernel_trctrap;
536 }
537 if (signo != 0)
538 /*
539 * returns to original process
540 */
541 vm86_trap((struct vm86frame *)frame);
542 return;
543 }
544 /* FALL THROUGH */
545 case T_SEGNPFLT: /* segment not present fault */
546 if (curpcb->pcb_flags & PCB_VM86CALL)
547 break;
548
549 /*
550 * Invalid %fs's and %gs's can be created using
551 * procfs or PT_SETREGS or by invalidating the
552 * underlying LDT entry. This causes a fault
553 * in kernel mode when the kernel attempts to
554 * switch contexts. Lose the bad context
555 * (XXX) so that we can continue, and generate
556 * a signal.
557 */
558 if (frame->tf_eip == (int)cpu_switch_load_gs) {
559 curpcb->pcb_gs = 0;
560 #if 0
561 PROC_LOCK(p);
562 kern_psignal(p, SIGBUS);
563 PROC_UNLOCK(p);
564 #endif
565 return;
566 }
567
568 if (td->td_intr_nesting_level != 0)
569 break;
570
571 /*
572 * Invalid segment selectors and out of bounds
573 * %eip's and %esp's can be set up in user mode.
574 * This causes a fault in kernel mode when the
575 * kernel tries to return to user mode. We want
576 * to get this fault so that we can fix the
577 * problem here and not have to check all the
578 * selectors and pointers when the user changes
579 * them.
580 *
581 * N.B. Comparing to long mode, 32-bit mode
582 * does not push %esp on the trap frame,
583 * because iretl faulted while in ring 0. As
584 * the consequence, there is no need to fixup
585 * the stack pointer for doreti_iret_fault,
586 * the fixup and the complimentary trap() call
587 * are executed on the main thread stack, not
588 * on the trampoline stack.
589 */
590 if (frame->tf_eip == (int)doreti_iret + setidt_disp) {
591 frame->tf_eip = (int)doreti_iret_fault +
592 setidt_disp;
593 return;
594 }
595 if (type == T_STKFLT)
596 break;
597
598 if (frame->tf_eip == (int)doreti_popl_ds +
599 setidt_disp) {
600 frame->tf_eip = (int)doreti_popl_ds_fault +
601 setidt_disp;
602 return;
603 }
604 if (frame->tf_eip == (int)doreti_popl_es +
605 setidt_disp) {
606 frame->tf_eip = (int)doreti_popl_es_fault +
607 setidt_disp;
608 return;
609 }
610 if (frame->tf_eip == (int)doreti_popl_fs +
611 setidt_disp) {
612 frame->tf_eip = (int)doreti_popl_fs_fault +
613 setidt_disp;
614 return;
615 }
616 if (curpcb->pcb_onfault != NULL) {
617 frame->tf_eip = (int)curpcb->pcb_onfault;
618 return;
619 }
620 break;
621
622 case T_TSSFLT:
623 /*
624 * PSL_NT can be set in user mode and isn't cleared
625 * automatically when the kernel is entered. This
626 * causes a TSS fault when the kernel attempts to
627 * `iret' because the TSS link is uninitialized. We
628 * want to get this fault so that we can fix the
629 * problem here and not every time the kernel is
630 * entered.
631 */
632 if (frame->tf_eflags & PSL_NT) {
633 frame->tf_eflags &= ~PSL_NT;
634 return;
635 }
636 break;
637
638 case T_TRCTRAP: /* debug exception */
639 kernel_trctrap:
640 /* Clear any pending debug events. */
641 dr6 = rdr6();
642 load_dr6(0);
643
644 /*
645 * Ignore debug register exceptions due to
646 * accesses in the user's address space, which
647 * can happen under several conditions such as
648 * if a user sets a watchpoint on a buffer and
649 * then passes that buffer to a system call.
650 * We still want to get TRCTRAPS for addresses
651 * in kernel space because that is useful when
652 * debugging the kernel.
653 */
654 if (user_dbreg_trap(dr6) &&
655 !(curpcb->pcb_flags & PCB_VM86CALL))
656 return;
657
658 /*
659 * Malicious user code can configure a debug
660 * register watchpoint to trap on data access
661 * to the top of stack and then execute 'pop
662 * %ss; int 3'. Due to exception deferral for
663 * 'pop %ss', the CPU will not interrupt 'int
664 * 3' to raise the DB# exception for the debug
665 * register but will postpone the DB# until
666 * execution of the first instruction of the
667 * BP# handler (in kernel mode). Normally the
668 * previous check would ignore DB# exceptions
669 * for watchpoints on user addresses raised in
670 * kernel mode. However, some CPU errata
671 * include cases where DB# exceptions do not
672 * properly set bits in %dr6, e.g. Haswell
673 * HSD23 and Skylake-X SKZ24.
674 *
675 * A deferred DB# can also be raised on the
676 * first instructions of system call entry
677 * points or single-step traps via similar use
678 * of 'pop %ss' or 'mov xxx, %ss'.
679 */
680 if (frame->tf_eip ==
681 (uintptr_t)IDTVEC(int0x80_syscall) + setidt_disp ||
682 frame->tf_eip == (uintptr_t)IDTVEC(bpt) +
683 setidt_disp ||
684 frame->tf_eip == (uintptr_t)IDTVEC(dbg) +
685 setidt_disp)
686 return;
687 /*
688 * FALLTHROUGH (TRCTRAP kernel mode, kernel address)
689 */
690 case T_BPTFLT:
691 /*
692 * If KDB is enabled, let it handle the debugger trap.
693 * Otherwise, debugger traps "can't happen".
694 */
695 #ifdef KDB
696 if (kdb_trap(type, dr6, frame))
697 return;
698 #endif
699 break;
700
701 case T_NMI:
702 #ifdef POWERFAIL_NMI
703 if (time_second - lastalert > 10) {
704 log(LOG_WARNING, "NMI: power fail\n");
705 sysbeep(880, SBT_1S);
706 lastalert = time_second;
707 }
708 return;
709 #else /* !POWERFAIL_NMI */
710 nmi_handle_intr(type, frame);
711 return;
712 #endif /* POWERFAIL_NMI */
713 }
714
715 trap_fatal(frame, eva);
716 return;
717 }
718
719 ksiginfo_init_trap(&ksi);
720 ksi.ksi_signo = signo;
721 ksi.ksi_code = ucode;
722 ksi.ksi_addr = (void *)addr;
723 ksi.ksi_trapno = type;
724 if (uprintf_signal) {
725 uprintf("pid %d comm %s: signal %d err %#x code %d type %d "
726 "addr %#x ss %#04x esp %#08x cs %#04x eip %#08x eax %#08x"
727 "<%02x %02x %02x %02x %02x %02x %02x %02x>\n",
728 p->p_pid, p->p_comm, signo, frame->tf_err, ucode, type,
729 addr, frame->tf_ss, frame->tf_esp, frame->tf_cs,
730 frame->tf_eip, frame->tf_eax,
731 fubyte((void *)(frame->tf_eip + 0)),
732 fubyte((void *)(frame->tf_eip + 1)),
733 fubyte((void *)(frame->tf_eip + 2)),
734 fubyte((void *)(frame->tf_eip + 3)),
735 fubyte((void *)(frame->tf_eip + 4)),
736 fubyte((void *)(frame->tf_eip + 5)),
737 fubyte((void *)(frame->tf_eip + 6)),
738 fubyte((void *)(frame->tf_eip + 7)));
739 }
740 KASSERT((read_eflags() & PSL_I) != 0, ("interrupts disabled"));
741 trapsignal(td, &ksi);
742
743 user:
744 userret(td, frame);
745 KASSERT(PCB_USER_FPU(td->td_pcb),
746 ("Return from trap with kernel FPU ctx leaked"));
747 }
748
749 /*
750 * Handle all details of a page fault.
751 * Returns:
752 * -2 if the fault was caused by triggered workaround for Intel Pentium
753 * 0xf00f bug.
754 * -1 if this fault was fatal, typically from kernel mode
755 * (cannot happen, but we need to return something).
756 * 0 if this fault was handled by updating either the user or kernel
757 * page table, execution can continue.
758 * 1 if this fault was from usermode and it was not handled, a synchronous
759 * signal should be delivered to the thread. *signo returns the signal
760 * number, *ucode gives si_code.
761 */
762 static int
763 trap_pfault(struct trapframe *frame, bool usermode, vm_offset_t eva,
764 int *signo, int *ucode)
765 {
766 struct thread *td;
767 struct proc *p;
768 vm_map_t map;
769 int rv;
770 vm_prot_t ftype;
771
772 MPASS(!usermode || (signo != NULL && ucode != NULL));
773
774 td = curthread;
775 p = td->td_proc;
776
777 if (__predict_false((td->td_pflags & TDP_NOFAULTING) != 0)) {
778 /*
779 * Due to both processor errata and lazy TLB invalidation when
780 * access restrictions are removed from virtual pages, memory
781 * accesses that are allowed by the physical mapping layer may
782 * nonetheless cause one spurious page fault per virtual page.
783 * When the thread is executing a "no faulting" section that
784 * is bracketed by vm_fault_{disable,enable}_pagefaults(),
785 * every page fault is treated as a spurious page fault,
786 * unless it accesses the same virtual address as the most
787 * recent page fault within the same "no faulting" section.
788 */
789 if (td->td_md.md_spurflt_addr != eva ||
790 (td->td_pflags & TDP_RESETSPUR) != 0) {
791 /*
792 * Do nothing to the TLB. A stale TLB entry is
793 * flushed automatically by a page fault.
794 */
795 td->td_md.md_spurflt_addr = eva;
796 td->td_pflags &= ~TDP_RESETSPUR;
797 return (0);
798 }
799 } else {
800 /*
801 * If we get a page fault while in a critical section, then
802 * it is most likely a fatal kernel page fault. The kernel
803 * is already going to panic trying to get a sleep lock to
804 * do the VM lookup, so just consider it a fatal trap so the
805 * kernel can print out a useful trap message and even get
806 * to the debugger.
807 *
808 * If we get a page fault while holding a non-sleepable
809 * lock, then it is most likely a fatal kernel page fault.
810 * If WITNESS is enabled, then it's going to whine about
811 * bogus LORs with various VM locks, so just skip to the
812 * fatal trap handling directly.
813 */
814 if (td->td_critnest != 0 ||
815 WITNESS_CHECK(WARN_SLEEPOK | WARN_GIANTOK, NULL,
816 "Kernel page fault") != 0) {
817 trap_fatal(frame, eva);
818 return (-1);
819 }
820 }
821 if (eva >= PMAP_TRM_MIN_ADDRESS) {
822 /*
823 * Don't allow user-mode faults in kernel address space.
824 * An exception: if the faulting address is the invalid
825 * instruction entry in the IDT, then the Intel Pentium
826 * F00F bug workaround was triggered, and we need to
827 * treat it is as an illegal instruction, and not a page
828 * fault.
829 */
830 #if defined(I586_CPU) && !defined(NO_F00F_HACK)
831 if ((eva == (unsigned int)&idt[6]) && has_f00f_bug) {
832 *ucode = ILL_PRVOPC;
833 *signo = SIGILL;
834 return (-2);
835 }
836 #endif
837 if (usermode) {
838 *signo = SIGSEGV;
839 *ucode = SEGV_MAPERR;
840 return (1);
841 }
842 trap_fatal(frame, eva);
843 return (-1);
844 } else {
845 map = usermode ? &p->p_vmspace->vm_map : kernel_map;
846
847 /*
848 * Kernel cannot access a user-space address directly
849 * because user pages are not mapped. Also, page
850 * faults must not be caused during the interrupts.
851 */
852 if (!usermode && td->td_intr_nesting_level != 0) {
853 trap_fatal(frame, eva);
854 return (-1);
855 }
856 }
857
858 /*
859 * If the trap was caused by errant bits in the PTE then panic.
860 */
861 if (frame->tf_err & PGEX_RSV) {
862 trap_fatal(frame, eva);
863 return (-1);
864 }
865
866 /*
867 * PGEX_I is defined only if the execute disable bit capability is
868 * supported and enabled.
869 */
870 if (frame->tf_err & PGEX_W)
871 ftype = VM_PROT_WRITE;
872 else if ((frame->tf_err & PGEX_I) && pg_nx != 0)
873 ftype = VM_PROT_EXECUTE;
874 else
875 ftype = VM_PROT_READ;
876
877 /* Fault in the page. */
878 rv = vm_fault_trap(map, eva, ftype, VM_FAULT_NORMAL, signo, ucode);
879 if (rv == KERN_SUCCESS) {
880 #ifdef HWPMC_HOOKS
881 if (ftype == VM_PROT_READ || ftype == VM_PROT_WRITE) {
882 PMC_SOFT_CALL_TF( , , page_fault, all, frame);
883 if (ftype == VM_PROT_READ)
884 PMC_SOFT_CALL_TF( , , page_fault, read,
885 frame);
886 else
887 PMC_SOFT_CALL_TF( , , page_fault, write,
888 frame);
889 }
890 #endif
891 return (0);
892 }
893 if (usermode)
894 return (1);
895 if (td->td_intr_nesting_level == 0 &&
896 curpcb->pcb_onfault != NULL) {
897 frame->tf_eip = (int)curpcb->pcb_onfault;
898 return (0);
899 }
900 trap_fatal(frame, eva);
901 return (-1);
902 }
903
904 static void
905 trap_fatal(struct trapframe *frame, vm_offset_t eva)
906 {
907 int code, ss, esp;
908 u_int type;
909 struct soft_segment_descriptor softseg;
910 #ifdef KDB
911 bool handled;
912 #endif
913
914 code = frame->tf_err;
915 type = frame->tf_trapno;
916 sdtossd(&gdt[IDXSEL(frame->tf_cs & 0xffff)].sd, &softseg);
917
918 printf("\n\nFatal trap %d: %s while in %s mode\n", type, trap_msg(type),
919 frame->tf_eflags & PSL_VM ? "vm86" :
920 ISPL(frame->tf_cs) == SEL_UPL ? "user" : "kernel");
921 #ifdef SMP
922 /* two separate prints in case of a trap on an unmapped page */
923 printf("cpuid = %d; ", PCPU_GET(cpuid));
924 printf("apic id = %02x\n", PCPU_GET(apic_id));
925 #endif
926 if (type == T_PAGEFLT) {
927 printf("fault virtual address = 0x%x\n", eva);
928 printf("fault code = %s %s%s, %s\n",
929 code & PGEX_U ? "user" : "supervisor",
930 code & PGEX_W ? "write" : "read",
931 pg_nx != 0 ?
932 (code & PGEX_I ? " instruction" : " data") :
933 "",
934 code & PGEX_RSV ? "reserved bits in PTE" :
935 code & PGEX_P ? "protection violation" : "page not present");
936 } else {
937 printf("error code = %#x\n", code);
938 }
939 printf("instruction pointer = 0x%x:0x%x\n",
940 frame->tf_cs & 0xffff, frame->tf_eip);
941 if (TF_HAS_STACKREGS(frame)) {
942 ss = frame->tf_ss & 0xffff;
943 esp = frame->tf_esp;
944 } else {
945 ss = GSEL(GDATA_SEL, SEL_KPL);
946 esp = (int)&frame->tf_esp;
947 }
948 printf("stack pointer = 0x%x:0x%x\n", ss, esp);
949 printf("frame pointer = 0x%x:0x%x\n", ss, frame->tf_ebp);
950 printf("code segment = base 0x%x, limit 0x%x, type 0x%x\n",
951 softseg.ssd_base, softseg.ssd_limit, softseg.ssd_type);
952 printf(" = DPL %d, pres %d, def32 %d, gran %d\n",
953 softseg.ssd_dpl, softseg.ssd_p, softseg.ssd_def32,
954 softseg.ssd_gran);
955 printf("processor eflags = ");
956 if (frame->tf_eflags & PSL_T)
957 printf("trace trap, ");
958 if (frame->tf_eflags & PSL_I)
959 printf("interrupt enabled, ");
960 if (frame->tf_eflags & PSL_NT)
961 printf("nested task, ");
962 if (frame->tf_eflags & PSL_RF)
963 printf("resume, ");
964 if (frame->tf_eflags & PSL_VM)
965 printf("vm86, ");
966 printf("IOPL = %d\n", (frame->tf_eflags & PSL_IOPL) >> 12);
967 printf("current process = %d (%s)\n",
968 curproc->p_pid, curthread->td_name);
969
970 #ifdef KDB
971 if (debugger_on_trap) {
972 kdb_why = KDB_WHY_TRAP;
973 frame->tf_err = eva; /* smuggle fault address to ddb */
974 handled = kdb_trap(type, 0, frame);
975 frame->tf_err = code; /* restore error code */
976 kdb_why = KDB_WHY_UNSET;
977 if (handled)
978 return;
979 }
980 #endif
981 printf("trap number = %d\n", type);
982 if (trap_msg(type) != NULL)
983 panic("%s", trap_msg(type));
984 else
985 panic("unknown/reserved trap");
986 }
987
988 #ifdef KDTRACE_HOOKS
989 /*
990 * Invoke a userspace DTrace hook. The hook pointer is cleared when no
991 * userspace probes are enabled, so we must synchronize with DTrace to ensure
992 * that a trapping thread is able to call the hook before it is cleared.
993 */
994 static bool
995 trap_user_dtrace(struct trapframe *frame, int (**hookp)(struct trapframe *))
996 {
997 int (*hook)(struct trapframe *);
998
999 hook = atomic_load_ptr(hookp);
1000 enable_intr();
1001 if (hook != NULL)
1002 return ((hook)(frame) == 0);
1003 return (false);
1004 }
1005 #endif
1006
1007 /*
1008 * Double fault handler. Called when a fault occurs while writing
1009 * a frame for a trap/exception onto the stack. This usually occurs
1010 * when the stack overflows (such is the case with infinite recursion,
1011 * for example).
1012 *
1013 * XXX Note that the current PTD gets replaced by IdlePTD when the
1014 * task switch occurs. This means that the stack that was active at
1015 * the time of the double fault is not available at <kstack> unless
1016 * the machine was idle when the double fault occurred. The downside
1017 * of this is that "trace <ebp>" in ddb won't work.
1018 */
1019 void
1020 dblfault_handler(void)
1021 {
1022 struct i386tss *t;
1023
1024 #ifdef KDTRACE_HOOKS
1025 if (dtrace_doubletrap_func != NULL)
1026 (*dtrace_doubletrap_func)();
1027 #endif
1028 printf("\nFatal double fault:\n");
1029 t = PCPU_GET(common_tssp);
1030 printf(
1031 "eip = %#08x esp = %#08x ebp = %#08x eax = %#08x\n"
1032 "edx = %#08x ecx = %#08x edi = %#08x esi = %#08x\n"
1033 "ebx = %#08x\n"
1034 "psl = %#08x cs = %#08x ss = %#08x ds = %#08x\n"
1035 "es = %#08x fs = %#08x gs = %#08x cr3 = %#08x\n",
1036 t->tss_eip, t->tss_esp, t->tss_ebp, t->tss_eax,
1037 t->tss_edx, t->tss_ecx, t->tss_edi, t->tss_esi,
1038 t->tss_ebx,
1039 t->tss_eflags, t->tss_cs, t->tss_ss, t->tss_ds,
1040 t->tss_es, t->tss_fs, t->tss_gs, t->tss_cr3);
1041 #ifdef SMP
1042 printf("cpuid = %d; apic id = %02x\n", PCPU_GET(cpuid),
1043 PCPU_GET(apic_id));
1044 #endif
1045 panic("double fault");
1046 }
1047
1048 int
1049 cpu_fetch_syscall_args(struct thread *td)
1050 {
1051 struct proc *p;
1052 struct trapframe *frame;
1053 struct syscall_args *sa;
1054 caddr_t params;
1055 long tmp;
1056 int error;
1057 #ifdef COMPAT_43
1058 u_int32_t eip;
1059 int cs;
1060 #endif
1061
1062 p = td->td_proc;
1063 frame = td->td_frame;
1064 sa = &td->td_sa;
1065
1066 #ifdef COMPAT_43
1067 if (__predict_false(frame->tf_cs == 7 && frame->tf_eip == 2)) {
1068 /*
1069 * In lcall $7,$0 after int $0x80. Convert the user
1070 * frame to what it would be for a direct int 0x80 instead
1071 * of lcall $7,$0, by popping the lcall return address.
1072 */
1073 error = fueword32((void *)frame->tf_esp, &eip);
1074 if (error == -1)
1075 return (EFAULT);
1076 cs = fuword16((void *)(frame->tf_esp + sizeof(u_int32_t)));
1077 if (cs == -1)
1078 return (EFAULT);
1079
1080 /*
1081 * Unwind in-kernel frame after all stack frame pieces
1082 * were successfully read.
1083 */
1084 frame->tf_eip = eip;
1085 frame->tf_cs = cs;
1086 frame->tf_esp += 2 * sizeof(u_int32_t);
1087 frame->tf_err = 7; /* size of lcall $7,$0 */
1088 }
1089 #endif
1090
1091 sa->code = frame->tf_eax;
1092 sa->original_code = sa->code;
1093 params = (caddr_t)frame->tf_esp + sizeof(uint32_t);
1094
1095 /*
1096 * Need to check if this is a 32 bit or 64 bit syscall.
1097 */
1098 if (sa->code == SYS_syscall) {
1099 /*
1100 * Code is first argument, followed by actual args.
1101 */
1102 error = fueword(params, &tmp);
1103 if (error == -1)
1104 return (EFAULT);
1105 sa->code = tmp;
1106 params += sizeof(uint32_t);
1107 } else if (sa->code == SYS___syscall) {
1108 /*
1109 * Like syscall, but code is a quad, so as to maintain
1110 * quad alignment for the rest of the arguments.
1111 */
1112 error = fueword(params, &tmp);
1113 if (error == -1)
1114 return (EFAULT);
1115 sa->code = tmp;
1116 params += sizeof(quad_t);
1117 }
1118
1119 if (sa->code >= p->p_sysent->sv_size)
1120 sa->callp = &p->p_sysent->sv_table[0];
1121 else
1122 sa->callp = &p->p_sysent->sv_table[sa->code];
1123
1124 if (params != NULL && sa->callp->sy_narg != 0)
1125 error = copyin(params, (caddr_t)sa->args,
1126 (u_int)(sa->callp->sy_narg * sizeof(uint32_t)));
1127 else
1128 error = 0;
1129
1130 if (error == 0) {
1131 td->td_retval[0] = 0;
1132 td->td_retval[1] = frame->tf_edx;
1133 }
1134
1135 return (error);
1136 }
1137
1138 #include "../../kern/subr_syscall.c"
1139
1140 /*
1141 * syscall - system call request C handler. A system call is
1142 * essentially treated as a trap by reusing the frame layout.
1143 */
1144 void
1145 syscall(struct trapframe *frame)
1146 {
1147 struct thread *td;
1148 register_t orig_tf_eflags;
1149 ksiginfo_t ksi;
1150
1151 #ifdef DIAGNOSTIC
1152 if (!(TRAPF_USERMODE(frame) &&
1153 (curpcb->pcb_flags & PCB_VM86CALL) == 0)) {
1154 panic("syscall");
1155 /* NOT REACHED */
1156 }
1157 #endif
1158 trap_check_kstack();
1159 orig_tf_eflags = frame->tf_eflags;
1160
1161 td = curthread;
1162 td->td_frame = frame;
1163
1164 syscallenter(td);
1165
1166 /*
1167 * Traced syscall.
1168 */
1169 if ((orig_tf_eflags & PSL_T) && !(orig_tf_eflags & PSL_VM)) {
1170 frame->tf_eflags &= ~PSL_T;
1171 ksiginfo_init_trap(&ksi);
1172 ksi.ksi_signo = SIGTRAP;
1173 ksi.ksi_code = TRAP_TRACE;
1174 ksi.ksi_addr = (void *)frame->tf_eip;
1175 trapsignal(td, &ksi);
1176 }
1177
1178 KASSERT(PCB_USER_FPU(td->td_pcb),
1179 ("System call %s returning with kernel FPU ctx leaked",
1180 syscallname(td->td_proc, td->td_sa.code)));
1181 KASSERT(td->td_pcb->pcb_save == get_pcb_user_save_td(td),
1182 ("System call %s returning with mangled pcb_save",
1183 syscallname(td->td_proc, td->td_sa.code)));
1184
1185 syscallret(td);
1186 }
Cache object: 1a9bb1b9ba29ffb8346862ba3a7befec
|