The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/amd64/amd64/trap.c

Version: -  FREEBSD  -  FREEBSD-13-STABLE  -  FREEBSD-13-0  -  FREEBSD-12-STABLE  -  FREEBSD-12-0  -  FREEBSD-11-STABLE  -  FREEBSD-11-0  -  FREEBSD-10-STABLE  -  FREEBSD-10-0  -  FREEBSD-9-STABLE  -  FREEBSD-9-0  -  FREEBSD-8-STABLE  -  FREEBSD-8-0  -  FREEBSD-7-STABLE  -  FREEBSD-7-0  -  FREEBSD-6-STABLE  -  FREEBSD-6-0  -  FREEBSD-5-STABLE  -  FREEBSD-5-0  -  FREEBSD-4-STABLE  -  FREEBSD-3-STABLE  -  FREEBSD22  -  l41  -  OPENBSD  -  linux-2.6  -  MK84  -  PLAN9  -  xnu-8792 
SearchContext: -  none  -  3  -  10 

    1 /*-
    2  * SPDX-License-Identifier: BSD-4-Clause
    3  *
    4  * Copyright (C) 1994, David Greenman
    5  * Copyright (c) 1990, 1993
    6  *      The Regents of the University of California.  All rights reserved.
    7  *
    8  * This code is derived from software contributed to Berkeley by
    9  * the University of Utah, and William Jolitz.
   10  *
   11  * Redistribution and use in source and binary forms, with or without
   12  * modification, are permitted provided that the following conditions
   13  * are met:
   14  * 1. Redistributions of source code must retain the above copyright
   15  *    notice, this list of conditions and the following disclaimer.
   16  * 2. Redistributions in binary form must reproduce the above copyright
   17  *    notice, this list of conditions and the following disclaimer in the
   18  *    documentation and/or other materials provided with the distribution.
   19  * 3. All advertising materials mentioning features or use of this software
   20  *    must display the following acknowledgement:
   21  *      This product includes software developed by the University of
   22  *      California, Berkeley and its contributors.
   23  * 4. Neither the name of the University nor the names of its contributors
   24  *    may be used to endorse or promote products derived from this software
   25  *    without specific prior written permission.
   26  *
   27  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   28  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   29  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   30  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   31  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   32  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   33  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   34  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   35  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   36  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   37  * SUCH DAMAGE.
   38  *
   39  *      from: @(#)trap.c        7.4 (Berkeley) 5/13/91
   40  */
   41 
   42 #include <sys/cdefs.h>
   43 __FBSDID("$FreeBSD: releng/12.0/sys/amd64/amd64/trap.c 339349 2018-10-13 21:18:31Z mjg $");
   44 
   45 /*
   46  * AMD64 Trap and System call handling
   47  */
   48 
   49 #include "opt_clock.h"
   50 #include "opt_compat.h"
   51 #include "opt_cpu.h"
   52 #include "opt_hwpmc_hooks.h"
   53 #include "opt_isa.h"
   54 #include "opt_kdb.h"
   55 #include "opt_stack.h"
   56 
   57 #include <sys/param.h>
   58 #include <sys/bus.h>
   59 #include <sys/systm.h>
   60 #include <sys/proc.h>
   61 #include <sys/pioctl.h>
   62 #include <sys/ptrace.h>
   63 #include <sys/kdb.h>
   64 #include <sys/kernel.h>
   65 #include <sys/ktr.h>
   66 #include <sys/lock.h>
   67 #include <sys/mutex.h>
   68 #include <sys/resourcevar.h>
   69 #include <sys/signalvar.h>
   70 #include <sys/syscall.h>
   71 #include <sys/sysctl.h>
   72 #include <sys/sysent.h>
   73 #include <sys/uio.h>
   74 #include <sys/vmmeter.h>
   75 #ifdef HWPMC_HOOKS
   76 #include <sys/pmckern.h>
      /*
       * Soft PMC definitions for page faults, one each for the "all", "read"
       * and "write" variants.  Only compiled when HWPMC_HOOKS is configured.
       * NOTE(review): the code that fires these events is not in the visible
       * part of this file (presumably trap_pfault()) -- confirm.
       */
   77 PMC_SOFT_DEFINE( , , page_fault, all);
   78 PMC_SOFT_DEFINE( , , page_fault, read);
   79 PMC_SOFT_DEFINE( , , page_fault, write);
   80 #endif
   81 
   82 #include <vm/vm.h>
   83 #include <vm/vm_param.h>
   84 #include <vm/pmap.h>
   85 #include <vm/vm_kern.h>
   86 #include <vm/vm_map.h>
   87 #include <vm/vm_page.h>
   88 #include <vm/vm_extern.h>
   89 
   90 #include <machine/cpu.h>
   91 #include <machine/intr_machdep.h>
   92 #include <x86/mca.h>
   93 #include <machine/md_var.h>
   94 #include <machine/pcb.h>
   95 #ifdef SMP
   96 #include <machine/smp.h>
   97 #endif
   98 #include <machine/stack.h>
   99 #include <machine/trap.h>
  100 #include <machine/tss.h>
  101 
  102 #ifdef KDTRACE_HOOKS
  103 #include <sys/dtrace_bsd.h>
  104 #endif
  105 
      /*
       * Low-level entry points.  trap() compares their addresses against the
       * saved %rip when deciding whether a kernel-mode debug exception can be
       * ignored.
       */
  106 extern inthand_t IDTVEC(bpt), IDTVEC(bpt_pti), IDTVEC(dbg),
  107     IDTVEC(fast_syscall), IDTVEC(fast_syscall_pti), IDTVEC(fast_syscall32),
  108     IDTVEC(int0x80_syscall_pti), IDTVEC(int0x80_syscall);
  109 
      /* Handlers with external linkage. */
  110 void __noinline trap(struct trapframe *frame);
  111 void trap_check(struct trapframe *frame);
  112 void dblfault_handler(struct trapframe *frame);
  113 
      /* File-local helpers used by trap(). */
  114 static int trap_pfault(struct trapframe *, int);
  115 static void trap_fatal(struct trapframe *, vm_offset_t);
  116 
      /*
       * Human-readable trap names, indexed by trap number; the comment beside
       * each entry gives its index and the matching T_* constant.  Slots with
       * no trap assigned hold an empty string.  MAX_TRAP_MSG equals the index
       * of the last entry in the table.
       */
  117 #define MAX_TRAP_MSG            32
  118 static char *trap_msg[] = {
  119         "",                                     /*  0 unused */
  120         "privileged instruction fault",         /*  1 T_PRIVINFLT */
  121         "",                                     /*  2 unused */
  122         "breakpoint instruction fault",         /*  3 T_BPTFLT */
  123         "",                                     /*  4 unused */
  124         "",                                     /*  5 unused */
  125         "arithmetic trap",                      /*  6 T_ARITHTRAP */
  126         "",                                     /*  7 unused */
  127         "",                                     /*  8 unused */
  128         "general protection fault",             /*  9 T_PROTFLT */
  129         "debug exception",                      /* 10 T_TRCTRAP */
  130         "",                                     /* 11 unused */
  131         "page fault",                           /* 12 T_PAGEFLT */
  132         "",                                     /* 13 unused */
  133         "alignment fault",                      /* 14 T_ALIGNFLT */
  134         "",                                     /* 15 unused */
  135         "",                                     /* 16 unused */
  136         "",                                     /* 17 unused */
  137         "integer divide fault",                 /* 18 T_DIVIDE */
  138         "non-maskable interrupt trap",          /* 19 T_NMI */
  139         "overflow trap",                        /* 20 T_OFLOW */
  140         "FPU bounds check fault",               /* 21 T_BOUND */
  141         "FPU device not available",             /* 22 T_DNA */
  142         "double fault",                         /* 23 T_DOUBLEFLT */
  143         "FPU operand fetch fault",              /* 24 T_FPOPFLT */
  144         "invalid TSS fault",                    /* 25 T_TSSFLT */
  145         "segment not present fault",            /* 26 T_SEGNPFLT */
  146         "stack fault",                          /* 27 T_STKFLT */
  147         "machine check trap",                   /* 28 T_MCHK */
  148         "SIMD floating-point exception",        /* 29 T_XMMFLT */
  149         "reserved (unknown) fault",             /* 30 T_RESERVED */
  150         "",                                     /* 31 unused (reserved) */
  151         "DTrace pid return trap",               /* 32 T_DTRACE_RET */
  152 };
  153 
      /*
       * machdep.prot_fault_translation: selects the signal trap() reports for
       * a user page fault that trap_pfault() did not already map to SIGSEGV:
       * 0 - autodetect from the process ABI and p_osrel;
       * 1 - always SIGBUS (compat mode);
       * any other value - always SIGSEGV.
       */
  154 static int prot_fault_translation;
  155 SYSCTL_INT(_machdep, OID_AUTO, prot_fault_translation, CTLFLAG_RWTUN,
  156     &prot_fault_translation, 0,
  157     "Select signal to deliver on protection fault");
      /*
       * machdep.uprintf_signal: when non-zero, trap() prints a one-line
       * diagnostic with uprintf() before posting a trap signal to a user
       * thread.
       */
  158 static int uprintf_signal;
  159 SYSCTL_INT(_machdep, OID_AUTO, uprintf_signal, CTLFLAG_RWTUN,
  160     &uprintf_signal, 0,
  161     "Print debugging information on trap signal to ctty");
  162 
  163 /*
  164  * Control L1D flush on return from NMI.
  165  *
  166  * Tunable can be set to the following values:
  167  * 0 - only enable flush on return from NMI if required by vmm.ko (default)
  168  * >1 - always flush on return from NMI.
  169  *
  170  * Post-boot, the sysctl indicates if flushing is currently enabled.
       *
       * NOTE(review): ">1" presumably means "any non-zero value"; the code that
       * reads this variable is not in this file -- confirm before relying on it.
  171  */
  172 int nmi_flush_l1d_sw;
  173 SYSCTL_INT(_machdep, OID_AUTO, nmi_flush_l1d_sw, CTLFLAG_RWTUN,
  174     &nmi_flush_l1d_sw, 0,
  175     "Flush L1 Data Cache on NMI exit, software bhyve L1TF mitigation assist");
  176 
  177 /*
  178  * Exception, fault, and trap interface to the FreeBSD kernel.
  179  * This common code is called from assembly language IDT gate entry
  180  * routines that prepare a suitable stack frame, and restore this
  181  * frame after the exception has been processed.
       *
       * frame: the saved register state; frame->tf_trapno selects the trap
       * type that drives all dispatch below.
       *
       * Order of processing:
       *  1. NMI IPIs (SMP), debugger re-entry (KDB) and T_RESERVED are handled
       *     before anything else.
       *  2. For T_NMI the optional PMC and stack-capture hooks may consume the
       *     NMI.
       *  3. A trap taken with interrupts disabled is reported; for most
       *     kernel traps interrupts are then re-enabled.
       *  4. User-mode traps are translated into a signal number and code and
       *     posted with trapsignal(), followed by userret().
       *  5. Kernel-mode traps are either fixed up in place (by rewriting the
       *     saved %rip or flags) or passed to trap_fatal().
  182  */
  183 
  184 void
  185 trap(struct trapframe *frame)
  186 {
  187         ksiginfo_t ksi;
  188         struct thread *td;
  189         struct proc *p;
  190         register_t addr, dr6;
  191         int signo, ucode;
  192         u_int type;
  193 
  194         td = curthread;
  195         p = td->td_proc;
  196         signo = 0;
  197         ucode = 0;
  198         addr = 0;
  199         dr6 = 0;
  200 
  201         VM_CNT_INC(v_trap);
  202         type = frame->tf_trapno;
  203 
  204 #ifdef SMP
  205         /* Handler for NMI IPIs used for stopping CPUs. */
  206         if (type == T_NMI && ipi_nmi_handler() == 0)
  207                 return;
  208 #endif
  209 
  210 #ifdef KDB
              /* A trap taken while the debugger is active re-enters it. */
  211         if (kdb_active) {
  212                 kdb_reenter();
  213                 return;
  214         }
  215 #endif
  216 
              /* A reserved trap number is handed straight to trap_fatal(). */
  217         if (type == T_RESERVED) {
  218                 trap_fatal(frame, 0);
  219                 return;
  220         }
  221 
  222         if (type == T_NMI) {
  223 #ifdef HWPMC_HOOKS
  224                 /*
  225                  * CPU PMCs interrupt using an NMI.  If the PMC module is
  226                  * active, pass the 'rip' value to the PMC module's interrupt
  227                  * handler.  A non-zero return value from the handler means that
  228                  * the NMI was consumed by it and we can return immediately.
  229                  */
  230                 if (pmc_intr != NULL &&
  231                     (*pmc_intr)(frame) != 0)
  232                         return;
  233 #endif
  234 
  235 #ifdef STACK
                      /* A non-zero return means the stack hook consumed the NMI. */
  236                 if (stack_nmi_handler(frame) != 0)
  237                         return;
  238 #endif
  239         }
  240 
  241         if ((frame->tf_rflags & PSL_I) == 0) {
  242                 /*
  243                  * Buggy application or kernel code has disabled
  244                  * interrupts and then trapped.  Enabling interrupts
  245                  * now is wrong, but it is better than running with
  246                  * interrupts disabled until they are accidentally
  247                  * enabled later.
                       *
                       * Note that the user-mode branch only reports the
                       * condition; enable_intr() is called only on the
                       * kernel-mode branch, and not for NMI, breakpoint or
                       * debug traps.
  248                  */
  249                 if (TRAPF_USERMODE(frame))
  250                         uprintf(
  251                             "pid %ld (%s): trap %d with interrupts disabled\n",
  252                             (long)curproc->p_pid, curthread->td_name, type);
  253                 else if (type != T_NMI && type != T_BPTFLT &&
  254                     type != T_TRCTRAP) {
  255                         /*
  256                          * XXX not quite right, since this may be for a
  257                          * multiple fault in user mode.
  258                          */
  259                         printf("kernel trap %d with interrupts disabled\n",
  260                             type);
  261 
  262                         /*
  263                          * We shouldn't enable interrupts while holding a
  264                          * spin lock.
  265                          */
  266                         if (td->td_md.md_spinlock_count == 0)
  267                                 enable_intr();
  268                 }
  269         }
  270 
  271         if (TRAPF_USERMODE(frame)) {
  272                 /* user trap */
  273 
  274                 td->td_pticks = 0;
  275                 td->td_frame = frame;
                      /*
                       * The address reported with the signal defaults to the
                       * faulting %rip; T_PAGEFLT replaces it with tf_addr.
                       */
  276                 addr = frame->tf_rip;
  277                 if (td->td_cowgen != p->p_cowgen)
  278                         thread_cow_update(td);
  279 
                      /*
                       * Each case either picks signo/ucode and breaks out to the
                       * common signal delivery code below, or returns directly
                       * when no signal is to be posted.
                       */
  280                 switch (type) {
  281                 case T_PRIVINFLT:       /* privileged instruction fault */
  282                         signo = SIGILL;
  283                         ucode = ILL_PRVOPC;
  284                         break;
  285 
  286                 case T_BPTFLT:          /* bpt instruction fault */
  287                         enable_intr();
  288 #ifdef KDTRACE_HOOKS
                              /* A zero return means DTrace handled the breakpoint. */
  289                         if (dtrace_pid_probe_ptr != NULL &&
  290                             dtrace_pid_probe_ptr(frame) == 0)
  291                                 return;
  292 #endif
  293                         signo = SIGTRAP;
  294                         ucode = TRAP_BRKPT;
  295                         break;
  296 
  297                 case T_TRCTRAP:         /* debug exception */
  298                         enable_intr();
  299                         signo = SIGTRAP;
  300                         ucode = TRAP_TRACE;
  301                         dr6 = rdr6();
                              /*
                               * If DR6 reports a single-step (BS) and the thread
                               * has TDB_STEP set, clear both the trace flag in
                               * the saved rflags and TDB_STEP, under the process
                               * lock.
                               */
  302                         if ((dr6 & DBREG_DR6_BS) != 0) {
  303                                 PROC_LOCK(td->td_proc);
  304                                 if ((td->td_dbgflags & TDB_STEP) != 0) {
  305                                         td->td_frame->tf_rflags &= ~PSL_T;
  306                                         td->td_dbgflags &= ~TDB_STEP;
  307                                 }
  308                                 PROC_UNLOCK(td->td_proc);
  309                         }
  310                         break;
  311 
  312                 case T_ARITHTRAP:       /* arithmetic trap */
                              /* fputrap_x87() returning -1: post no signal. */
  313                         ucode = fputrap_x87();
  314                         if (ucode == -1)
  315                                 return;
  316                         signo = SIGFPE;
  317                         break;
  318 
  319                 case T_PROTFLT:         /* general protection fault */
  320                         signo = SIGBUS;
  321                         ucode = BUS_OBJERR;
  322                         break;
  323                 case T_STKFLT:          /* stack fault */
  324                 case T_SEGNPFLT:        /* segment not present fault */
  325                         signo = SIGBUS;
  326                         ucode = BUS_ADRERR;
  327                         break;
  328                 case T_TSSFLT:          /* invalid TSS fault */
  329                         signo = SIGBUS;
  330                         ucode = BUS_OBJERR;
  331                         break;
  332                 case T_ALIGNFLT:
  333                         signo = SIGBUS;
  334                         ucode = BUS_ADRALN;
  335                         break;
                      /*
                       * The default label sits here, mid-switch: any trap type
                       * without its own case is reported like a double fault.
                       */
  336                 case T_DOUBLEFLT:       /* double fault */
  337                 default:
  338                         signo = SIGBUS;
  339                         ucode = BUS_OBJERR;
  340                         break;
  341 
  342                 case T_PAGEFLT:         /* page fault */
  343                         /*
  344                          * Emulator can take care about this trap?
  345                          */
  346                         if (*p->p_sysent->sv_trap != NULL &&
  347                             (*p->p_sysent->sv_trap)(td) == 0)
  348                                 return;
  349 
  350                         addr = frame->tf_addr;
                              /*
                               * trap_pfault() result: -1 returns immediately
                               * (userret() is skipped), 0 skips signal delivery
                               * and goes to userret, anything else is a signal
                               * number whose code is chosen below.
                               */
  351                         signo = trap_pfault(frame, TRUE);
  352                         if (signo == -1)
  353                                 return;
  354                         if (signo == 0)
  355                                 goto userret;
  356                         if (signo == SIGSEGV) {
  357                                 ucode = SEGV_MAPERR;
  358                         } else if (prot_fault_translation == 0) {
  359                                 /*
  360                                  * Autodetect.  This check also covers
  361                                  * the images without the ABI-tag ELF
  362                                  * note.
  363                                  */
  364                                 if (SV_CURPROC_ABI() == SV_ABI_FREEBSD &&
  365                                     p->p_osrel >= P_OSREL_SIGSEGV) {
  366                                         signo = SIGSEGV;
  367                                         ucode = SEGV_ACCERR;
  368                                 } else {
  369                                         signo = SIGBUS;
  370                                         ucode = T_PAGEFLT;
  371                                 }
  372                         } else if (prot_fault_translation == 1) {
  373                                 /*
  374                                  * Always compat mode.
  375                                  */
  376                                 signo = SIGBUS;
  377                                 ucode = T_PAGEFLT;
  378                         } else {
  379                                 /*
  380                                  * Always SIGSEGV mode.
  381                                  */
  382                                 signo = SIGSEGV;
  383                                 ucode = SEGV_ACCERR;
  384                         }
  385                         break;
  386 
  387                 case T_DIVIDE:          /* integer divide fault */
  388                         ucode = FPE_INTDIV;
  389                         signo = SIGFPE;
  390                         break;
  391 
                      /* Without DEV_ISA a user-mode T_NMI falls to the default case. */
  392 #ifdef DEV_ISA
  393                 case T_NMI:
  394                         nmi_handle_intr(type, frame);
  395                         return;
  396 #endif
  397 
  398                 case T_OFLOW:           /* integer overflow fault */
  399                         ucode = FPE_INTOVF;
  400                         signo = SIGFPE;
  401                         break;
  402 
  403                 case T_BOUND:           /* bounds check fault */
  404                         ucode = FPE_FLTSUB;
  405                         signo = SIGFPE;
  406                         break;
  407 
  408                 case T_DNA:
  409                         /* transparent fault (due to context switch "late") */
  410                         KASSERT(PCB_USER_FPU(td->td_pcb),
  411                             ("kernel FPU ctx has leaked"));
  412                         fpudna();
  413                         return;
  414 
  415                 case T_FPOPFLT:         /* FPU operand fetch fault */
  416                         ucode = ILL_COPROC;
  417                         signo = SIGILL;
  418                         break;
  419 
  420                 case T_XMMFLT:          /* SIMD floating-point exception */
                              /* fputrap_sse() returning -1: post no signal. */
  421                         ucode = fputrap_sse();
  422                         if (ucode == -1)
  423                                 return;
  424                         signo = SIGFPE;
  425                         break;
  426 #ifdef KDTRACE_HOOKS
  427                 case T_DTRACE_RET:
  428                         enable_intr();
  429                         if (dtrace_return_probe_ptr != NULL)
  430                                 dtrace_return_probe_ptr(frame);
  431                         return;
  432 #endif
  433                 }
  434         } else {
  435                 /* kernel trap */
  436 
  437                 KASSERT(cold || td->td_ucred != NULL,
  438                     ("kernel trap doesn't have ucred"));
                      /*
                       * Cases that can repair the situation return directly;
                       * a break falls out of the switch into trap_fatal().
                       */
  439                 switch (type) {
  440                 case T_PAGEFLT:                 /* page fault */
  441                         (void) trap_pfault(frame, FALSE);
  442                         return;
  443 
  444                 case T_DNA:
  445                         if (PCB_USER_FPU(td->td_pcb))
  446                                 panic("Unregistered use of FPU in kernel");
  447                         fpudna();
  448                         return;
  449 
  450                 case T_ARITHTRAP:       /* arithmetic trap */
  451                 case T_XMMFLT:          /* SIMD floating-point exception */
  452                 case T_FPOPFLT:         /* FPU operand fetch fault */
  453                         /*
  454                          * For now, supporting kernel handler
  455                          * registration for FPU traps is overkill.
  456                          */
  457                         trap_fatal(frame, 0);
  458                         return;
  459 
  460                 case T_STKFLT:          /* stack fault */
  461                 case T_PROTFLT:         /* general protection fault */
  462                 case T_SEGNPFLT:        /* segment not present fault */
                              /* No fixups are attempted at interrupt nesting level != 0. */
  463                         if (td->td_intr_nesting_level != 0)
  464                                 break;
  465 
  466                         /*
  467                          * Invalid segment selectors and out of bounds
  468                          * %rip's and %rsp's can be set up in user mode.
  469                          * This causes a fault in kernel mode when the
  470                          * kernel tries to return to user mode.  We want
  471                          * to get this fault so that we can fix the
  472                          * problem here and not have to check all the
  473                          * selectors and pointers when the user changes
  474                          * them.
  475                          *
  476                          * In case of PTI, the IRETQ faulted while the
  477                          * kernel used the pti stack, and exception
  478                          * frame records %rsp value pointing to that
  479                          * stack.  If we return normally to
  480                          * doreti_iret_fault, the trapframe is
  481                          * reconstructed on pti stack, and calltrap()
  482                          * called on it as well.  Due to the very
  483                          * limited pti stack size, kernel does not
  484                          * survive for too long.  Switch to the normal
  485                          * thread stack for the trap handling.
  486                          *
  487                          * Magic '5' is the number of qwords occupied by
  488                          * the hardware trap frame.
  489                          */
  490                         if (frame->tf_rip == (long)doreti_iret) {
  491                                 frame->tf_rip = (long)doreti_iret_fault;
  492                                 if ((PCPU_GET(curpmap)->pm_ucr3 !=
  493                                     PMAP_NO_CR3) &&
  494                                     (frame->tf_rsp == (uintptr_t)PCPU_GET(
  495                                     pti_rsp0) - 5 * sizeof(register_t))) {
  496                                         frame->tf_rsp = PCPU_GET(rsp0) - 5 *
  497                                             sizeof(register_t);
  498                                 }
  499                                 return;
  500                         }
                              /*
                               * A fault at one of the known segment/base load
                               * instructions resumes at its matching *_fault
                               * label.
                               */
  501                         if (frame->tf_rip == (long)ld_ds) {
  502                                 frame->tf_rip = (long)ds_load_fault;
  503                                 return;
  504                         }
  505                         if (frame->tf_rip == (long)ld_es) {
  506                                 frame->tf_rip = (long)es_load_fault;
  507                                 return;
  508                         }
  509                         if (frame->tf_rip == (long)ld_fs) {
  510                                 frame->tf_rip = (long)fs_load_fault;
  511                                 return;
  512                         }
  513                         if (frame->tf_rip == (long)ld_gs) {
  514                                 frame->tf_rip = (long)gs_load_fault;
  515                                 return;
  516                         }
  517                         if (frame->tf_rip == (long)ld_gsbase) {
  518                                 frame->tf_rip = (long)gsbase_load_fault;
  519                                 return;
  520                         }
  521                         if (frame->tf_rip == (long)ld_fsbase) {
  522                                 frame->tf_rip = (long)fsbase_load_fault;
  523                                 return;
  524                         }
                              /* Otherwise resume at the registered pcb_onfault, if any. */
  525                         if (curpcb->pcb_onfault != NULL) {
  526                                 frame->tf_rip = (long)curpcb->pcb_onfault;
  527                                 return;
  528                         }
  529                         break;
  530 
  531                 case T_TSSFLT:
  532                         /*
  533                          * PSL_NT can be set in user mode and isn't cleared
  534                          * automatically when the kernel is entered.  This
  535                          * causes a TSS fault when the kernel attempts to
  536                          * `iret' because the TSS link is uninitialized.  We
  537                          * want to get this fault so that we can fix the
  538                          * problem here and not every time the kernel is
  539                          * entered.
  540                          */
  541                         if (frame->tf_rflags & PSL_NT) {
  542                                 frame->tf_rflags &= ~PSL_NT;
  543                                 return;
  544                         }
  545                         break;
  546 
  547                 case T_TRCTRAP:  /* debug exception */
  548                         /* Clear any pending debug events. */
  549                         dr6 = rdr6();
  550                         load_dr6(0);
  551 
  552                         /*
  553                          * Ignore debug register exceptions due to
  554                          * accesses in the user's address space, which
  555                          * can happen under several conditions such as
  556                          * if a user sets a watchpoint on a buffer and
  557                          * then passes that buffer to a system call.
  558                          * We still want to get TRCTRAPS for addresses
  559                          * in kernel space because that is useful when
  560                          * debugging the kernel.
  561                          */
  562                         if (user_dbreg_trap(dr6))
  563                                 return;
  564 
  565                         /*
  566                          * Malicious user code can configure a debug
  567                          * register watchpoint to trap on data access
  568                          * to the top of stack and then execute 'pop
  569                          * %ss; int 3'.  Due to exception deferral for
  570                          * 'pop %ss', the CPU will not interrupt 'int
  571                          * 3' to raise the DB# exception for the debug
  572                          * register but will postpone the DB# until
  573                          * execution of the first instruction of the
  574                          * BP# handler (in kernel mode).  Normally the
  575                          * previous check would ignore DB# exceptions
  576                          * for watchpoints on user addresses raised in
  577                          * kernel mode.  However, some CPU errata
  578                          * include cases where DB# exceptions do not
  579                          * properly set bits in %dr6, e.g. Haswell
  580                          * HSD23 and Skylake-X SKZ24.
  581                          *
  582                          * A deferred DB# can also be raised on the
  583                          * first instructions of system call entry
  584                          * points or single-step traps via similar use
  585                          * of 'pop %ss' or 'mov xxx, %ss'.
  586                          */
                              /*
                               * The set of entry points checked depends on
                               * whether PTI is in use.
                               */
  587                         if (pti) {
  588                                 if (frame->tf_rip ==
  589                                     (uintptr_t)IDTVEC(fast_syscall_pti) ||
  590 #ifdef COMPAT_FREEBSD32
  591                                     frame->tf_rip ==
  592                                     (uintptr_t)IDTVEC(int0x80_syscall_pti) ||
  593 #endif
  594                                     frame->tf_rip == (uintptr_t)IDTVEC(bpt_pti))
  595                                         return;
  596                         } else {
  597                                 if (frame->tf_rip ==
  598                                     (uintptr_t)IDTVEC(fast_syscall) ||
  599 #ifdef COMPAT_FREEBSD32
  600                                     frame->tf_rip ==
  601                                     (uintptr_t)IDTVEC(int0x80_syscall) ||
  602 #endif
  603                                     frame->tf_rip == (uintptr_t)IDTVEC(bpt))
  604                                         return;
  605                         }
  606                         if (frame->tf_rip == (uintptr_t)IDTVEC(dbg) ||
  607                             /* Needed for AMD. */
  608                             frame->tf_rip == (uintptr_t)IDTVEC(fast_syscall32))
  609                                 return;
  610                         /*
  611                          * FALLTHROUGH (TRCTRAP kernel mode, kernel address)
  612                          */
  613                 case T_BPTFLT:
  614                         /*
  615                          * If KDB is enabled, let it handle the debugger trap.
  616                          * Otherwise, debugger traps "can't happen".
  617                          */
  618 #ifdef KDB
  619                         if (kdb_trap(type, dr6, frame))
  620                                 return;
  621 #endif
  622                         break;
  623 
  624 #ifdef DEV_ISA
  625                 case T_NMI:
  626                         nmi_handle_intr(type, frame);
  627                         return;
  628 #endif
  629                 }
  630 
                      /* Every kernel trap that was not handled above ends here. */
  631                 trap_fatal(frame, 0);
  632                 return;
  633         }
  634 
              /* Only user-mode traps that selected a signal reach this point. */
  635         /* Translate fault for emulators (e.g. Linux) */
  636         if (*p->p_sysent->sv_transtrap != NULL)
  637                 signo = (*p->p_sysent->sv_transtrap)(signo, type);
  638 
  639         ksiginfo_init_trap(&ksi);
  640         ksi.ksi_signo = signo;
  641         ksi.ksi_code = ucode;
  642         ksi.ksi_trapno = type;
  643         ksi.ksi_addr = (void *)addr;
              /*
               * Optional diagnostic (machdep.uprintf_signal): signal details
               * plus the eight bytes at the faulting %rip, read with fubyte().
               */
  644         if (uprintf_signal) {
  645                 uprintf("pid %d comm %s: signal %d err %lx code %d type %d "
  646                     "addr 0x%lx rsp 0x%lx rip 0x%lx "
  647                     "<%02x %02x %02x %02x %02x %02x %02x %02x>\n",
  648                     p->p_pid, p->p_comm, signo, frame->tf_err, ucode, type,
  649                     addr, frame->tf_rsp, frame->tf_rip,
  650                     fubyte((void *)(frame->tf_rip + 0)),
  651                     fubyte((void *)(frame->tf_rip + 1)),
  652                     fubyte((void *)(frame->tf_rip + 2)),
  653                     fubyte((void *)(frame->tf_rip + 3)),
  654                     fubyte((void *)(frame->tf_rip + 4)),
  655                     fubyte((void *)(frame->tf_rip + 5)),
  656                     fubyte((void *)(frame->tf_rip + 6)),
  657                     fubyte((void *)(frame->tf_rip + 7)));
  658         }
  659         KASSERT((read_rflags() & PSL_I) != 0, ("interrupts disabled"));
  660         trapsignal(td, &ksi);
  661 
      /* Common exit for user traps; also the target when no signal is needed. */
  662 userret:
  663         userret(td, frame);
  664         KASSERT(PCB_USER_FPU(td->td_pcb),
  665             ("Return from trap with kernel FPU ctx leaked"));
  666 }
  667 
  668 /*
  669  * Ensure that we ignore any DTrace-induced faults. This function cannot
  670  * be instrumented, so it cannot generate such faults itself.
  671  */
  672 void
  673 trap_check(struct trapframe *frame)
  674 {
  675 
  676 #ifdef KDTRACE_HOOKS
  677         if (dtrace_trap_func != NULL &&
  678             (*dtrace_trap_func)(frame, frame->tf_trapno) != 0)
  679                 return;
  680 #endif
  681         trap(frame);
  682 }
  683 
  684 static bool
  685 trap_is_smap(struct trapframe *frame)
  686 {
  687 
  688         /*
  689          * A page fault on a userspace address is classified as
  690          * SMAP-induced if:
  691          * - SMAP is supported;
  692          * - kernel mode accessed present data page;
  693          * - rflags.AC was cleared.
  694          * Kernel must never access user space with rflags.AC cleared
  695          * if SMAP is enabled.
  696          */
  697         return ((cpu_stdext_feature & CPUID_STDEXT_SMAP) != 0 &&
  698             (frame->tf_err & (PGEX_P | PGEX_U | PGEX_I | PGEX_RSV)) ==
  699             PGEX_P && (frame->tf_rflags & PSL_AC) == 0);
  700 }
  701 
  702 static bool
  703 trap_is_pti(struct trapframe *frame)
  704 {
  705 
  706         return (PCPU_GET(curpmap)->pm_ucr3 != PMAP_NO_CR3 &&
  707             pg_nx != 0 && (frame->tf_err & (PGEX_P | PGEX_W |
  708             PGEX_U | PGEX_I)) == (PGEX_P | PGEX_U | PGEX_I) &&
  709             (curpcb->pcb_saved_ucr3 & ~CR3_PCID_MASK) ==
  710             (PCPU_GET(curpmap)->pm_cr3 & ~CR3_PCID_MASK));
  711 }
  712 
  713 static int
  714 trap_pfault(struct trapframe *frame, int usermode)
  715 {
  716         struct thread *td;
  717         struct proc *p;
  718         vm_map_t map;
  719         vm_offset_t va;
  720         int rv;
  721         vm_prot_t ftype;
  722         vm_offset_t eva;
  723 
  724         td = curthread;
  725         p = td->td_proc;
  726         eva = frame->tf_addr;
  727 
  728         if (__predict_false((td->td_pflags & TDP_NOFAULTING) != 0)) {
  729                 /*
  730                  * Due to both processor errata and lazy TLB invalidation when
  731                  * access restrictions are removed from virtual pages, memory
  732                  * accesses that are allowed by the physical mapping layer may
  733                  * nonetheless cause one spurious page fault per virtual page. 
  734                  * When the thread is executing a "no faulting" section that
  735                  * is bracketed by vm_fault_{disable,enable}_pagefaults(),
  736                  * every page fault is treated as a spurious page fault,
  737                  * unless it accesses the same virtual address as the most
  738                  * recent page fault within the same "no faulting" section.
  739                  */
  740                 if (td->td_md.md_spurflt_addr != eva ||
  741                     (td->td_pflags & TDP_RESETSPUR) != 0) {
  742                         /*
  743                          * Do nothing to the TLB.  A stale TLB entry is
  744                          * flushed automatically by a page fault.
  745                          */
  746                         td->td_md.md_spurflt_addr = eva;
  747                         td->td_pflags &= ~TDP_RESETSPUR;
  748                         return (0);
  749                 }
  750         } else {
  751                 /*
  752                  * If we get a page fault while in a critical section, then
  753                  * it is most likely a fatal kernel page fault.  The kernel
  754                  * is already going to panic trying to get a sleep lock to
  755                  * do the VM lookup, so just consider it a fatal trap so the
  756                  * kernel can print out a useful trap message and even get
  757                  * to the debugger.
  758                  *
  759                  * If we get a page fault while holding a non-sleepable
  760                  * lock, then it is most likely a fatal kernel page fault.
  761                  * If WITNESS is enabled, then it's going to whine about
  762                  * bogus LORs with various VM locks, so just skip to the
  763                  * fatal trap handling directly.
  764                  */
  765                 if (td->td_critnest != 0 ||
  766                     WITNESS_CHECK(WARN_SLEEPOK | WARN_GIANTOK, NULL,
  767                     "Kernel page fault") != 0) {
  768                         trap_fatal(frame, eva);
  769                         return (-1);
  770                 }
  771         }
  772         va = trunc_page(eva);
  773         if (va >= VM_MIN_KERNEL_ADDRESS) {
  774                 /*
  775                  * Don't allow user-mode faults in kernel address space.
  776                  */
  777                 if (usermode)
  778                         return (SIGSEGV);
  779 
  780                 map = kernel_map;
  781         } else {
  782                 map = &p->p_vmspace->vm_map;
  783 
  784                 /*
  785                  * When accessing a usermode address, kernel must be
  786                  * ready to accept the page fault, and provide a
  787                  * handling routine.  Since accessing the address
  788                  * without the handler is a bug, do not try to handle
  789                  * it normally, and panic immediately.
  790                  *
  791                  * If SMAP is enabled, filter SMAP faults also,
  792                  * because illegal access might occur to the mapped
  793                  * user address, causing infinite loop.
  794                  */
  795                 if (!usermode && (td->td_intr_nesting_level != 0 ||
  796                     trap_is_smap(frame) || curpcb->pcb_onfault == NULL)) {
  797                         trap_fatal(frame, eva);
  798                         return (-1);
  799                 }
  800         }
  801 
  802         /*
  803          * If the trap was caused by errant bits in the PTE then panic.
  804          */
  805         if (frame->tf_err & PGEX_RSV) {
  806                 trap_fatal(frame, eva);
  807                 return (-1);
  808         }
  809 
  810         /*
  811          * If nx protection of the usermode portion of kernel page
  812          * tables caused trap, panic.
  813          */
  814         if (usermode && trap_is_pti(frame))
  815                 panic("PTI: pid %d comm %s tf_err %#lx", p->p_pid,
  816                     p->p_comm, frame->tf_err);
  817 
  818         /*
  819          * PGEX_I is defined only if the execute disable bit capability is
  820          * supported and enabled.
  821          */
  822         if (frame->tf_err & PGEX_W)
  823                 ftype = VM_PROT_WRITE;
  824         else if ((frame->tf_err & PGEX_I) && pg_nx != 0)
  825                 ftype = VM_PROT_EXECUTE;
  826         else
  827                 ftype = VM_PROT_READ;
  828 
  829         /* Fault in the page. */
  830         rv = vm_fault(map, va, ftype, VM_FAULT_NORMAL);
  831         if (rv == KERN_SUCCESS) {
  832 #ifdef HWPMC_HOOKS
  833                 if (ftype == VM_PROT_READ || ftype == VM_PROT_WRITE) {
  834                         PMC_SOFT_CALL_TF( , , page_fault, all, frame);
  835                         if (ftype == VM_PROT_READ)
  836                                 PMC_SOFT_CALL_TF( , , page_fault, read,
  837                                     frame);
  838                         else
  839                                 PMC_SOFT_CALL_TF( , , page_fault, write,
  840                                     frame);
  841                 }
  842 #endif
  843                 return (0);
  844         }
  845         if (!usermode) {
  846                 if (td->td_intr_nesting_level == 0 &&
  847                     curpcb->pcb_onfault != NULL) {
  848                         frame->tf_rip = (long)curpcb->pcb_onfault;
  849                         return (0);
  850                 }
  851                 trap_fatal(frame, eva);
  852                 return (-1);
  853         }
  854         return ((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV);
  855 }
  856 
  857 static void
  858 trap_fatal(frame, eva)
  859         struct trapframe *frame;
  860         vm_offset_t eva;
  861 {
  862         int code, ss;
  863         u_int type;
  864         struct soft_segment_descriptor softseg;
  865         char *msg;
  866 #ifdef KDB
  867         bool handled;
  868 #endif
  869 
  870         code = frame->tf_err;
  871         type = frame->tf_trapno;
  872         sdtossd(&gdt[NGDT * PCPU_GET(cpuid) + IDXSEL(frame->tf_cs & 0xffff)],
  873             &softseg);
  874 
  875         if (type <= MAX_TRAP_MSG)
  876                 msg = trap_msg[type];
  877         else
  878                 msg = "UNKNOWN";
  879         printf("\n\nFatal trap %d: %s while in %s mode\n", type, msg,
  880             TRAPF_USERMODE(frame) ? "user" : "kernel");
  881 #ifdef SMP
  882         /* two separate prints in case of a trap on an unmapped page */
  883         printf("cpuid = %d; ", PCPU_GET(cpuid));
  884         printf("apic id = %02x\n", PCPU_GET(apic_id));
  885 #endif
  886         if (type == T_PAGEFLT) {
  887                 printf("fault virtual address   = 0x%lx\n", eva);
  888                 printf("fault code              = %s %s %s, %s\n",
  889                         code & PGEX_U ? "user" : "supervisor",
  890                         code & PGEX_W ? "write" : "read",
  891                         code & PGEX_I ? "instruction" : "data",
  892                         code & PGEX_RSV ? "reserved bits in PTE" :
  893                         code & PGEX_P ? "protection violation" : "page not present");
  894         }
  895         printf("instruction pointer     = 0x%lx:0x%lx\n",
  896                frame->tf_cs & 0xffff, frame->tf_rip);
  897         ss = frame->tf_ss & 0xffff;
  898         printf("stack pointer           = 0x%x:0x%lx\n", ss, frame->tf_rsp);
  899         printf("frame pointer           = 0x%x:0x%lx\n", ss, frame->tf_rbp);
  900         printf("code segment            = base 0x%lx, limit 0x%lx, type 0x%x\n",
  901                softseg.ssd_base, softseg.ssd_limit, softseg.ssd_type);
  902         printf("                        = DPL %d, pres %d, long %d, def32 %d, gran %d\n",
  903                softseg.ssd_dpl, softseg.ssd_p, softseg.ssd_long, softseg.ssd_def32,
  904                softseg.ssd_gran);
  905         printf("processor eflags        = ");
  906         if (frame->tf_rflags & PSL_T)
  907                 printf("trace trap, ");
  908         if (frame->tf_rflags & PSL_I)
  909                 printf("interrupt enabled, ");
  910         if (frame->tf_rflags & PSL_NT)
  911                 printf("nested task, ");
  912         if (frame->tf_rflags & PSL_RF)
  913                 printf("resume, ");
  914         printf("IOPL = %ld\n", (frame->tf_rflags & PSL_IOPL) >> 12);
  915         printf("current process         = %d (%s)\n",
  916             curproc->p_pid, curthread->td_name);
  917 
  918 #ifdef KDB
  919         if (debugger_on_panic) {
  920                 kdb_why = KDB_WHY_TRAP;
  921                 handled = kdb_trap(type, 0, frame);
  922                 kdb_why = KDB_WHY_UNSET;
  923                 if (handled)
  924                         return;
  925         }
  926 #endif
  927         printf("trap number             = %d\n", type);
  928         if (type <= MAX_TRAP_MSG)
  929                 panic("%s", trap_msg[type]);
  930         else
  931                 panic("unknown/reserved trap");
  932 }
  933 
  934 /*
  935  * Double fault handler. Called when a fault occurs while writing
  936  * a frame for a trap/exception onto the stack. This usually occurs
  937  * when the stack overflows (such is the case with infinite recursion,
  938  * for example).
  939  */
  940 void
  941 dblfault_handler(struct trapframe *frame)
  942 {
  943 #ifdef KDTRACE_HOOKS
  944         if (dtrace_doubletrap_func != NULL)
  945                 (*dtrace_doubletrap_func)();
  946 #endif
  947         printf("\nFatal double fault\n"
  948             "rip %#lx rsp %#lx rbp %#lx\n"
  949             "rax %#lx rdx %#lx rbx %#lx\n"
  950             "rcx %#lx rsi %#lx rdi %#lx\n"
  951             "r8 %#lx r9 %#lx r10 %#lx\n"
  952             "r11 %#lx r12 %#lx r13 %#lx\n"
  953             "r14 %#lx r15 %#lx rflags %#lx\n"
  954             "cs %#lx ss %#lx ds %#hx es %#hx fs %#hx gs %#hx\n"
  955             "fsbase %#lx gsbase %#lx kgsbase %#lx\n",
  956             frame->tf_rip, frame->tf_rsp, frame->tf_rbp,
  957             frame->tf_rax, frame->tf_rdx, frame->tf_rbx,
  958             frame->tf_rcx, frame->tf_rdi, frame->tf_rsi,
  959             frame->tf_r8, frame->tf_r9, frame->tf_r10,
  960             frame->tf_r11, frame->tf_r12, frame->tf_r13,
  961             frame->tf_r14, frame->tf_r15, frame->tf_rflags,
  962             frame->tf_cs, frame->tf_ss, frame->tf_ds, frame->tf_es,
  963             frame->tf_fs, frame->tf_gs,
  964             rdmsr(MSR_FSBASE), rdmsr(MSR_GSBASE), rdmsr(MSR_KGSBASE));
  965 #ifdef SMP
  966         /* two separate prints in case of a trap on an unmapped page */
  967         printf("cpuid = %d; ", PCPU_GET(cpuid));
  968         printf("apic id = %02x\n", PCPU_GET(apic_id));
  969 #endif
  970         panic("double fault");
  971 }
  972 
  973 static int __noinline
  974 cpu_fetch_syscall_args_fallback(struct thread *td, struct syscall_args *sa)
  975 {
  976         struct proc *p;
  977         struct trapframe *frame;
  978         register_t *argp;
  979         caddr_t params;
  980         int reg, regcnt, error;
  981 
  982         p = td->td_proc;
  983         frame = td->td_frame;
  984         reg = 0;
  985         regcnt = NARGREGS;
  986 
  987         sa->code = frame->tf_rax;
  988 
  989         if (sa->code == SYS_syscall || sa->code == SYS___syscall) {
  990                 sa->code = frame->tf_rdi;
  991                 reg++;
  992                 regcnt--;
  993         }
  994         if (p->p_sysent->sv_mask)
  995                 sa->code &= p->p_sysent->sv_mask;
  996 
  997         if (sa->code >= p->p_sysent->sv_size)
  998                 sa->callp = &p->p_sysent->sv_table[0];
  999         else
 1000                 sa->callp = &p->p_sysent->sv_table[sa->code];
 1001 
 1002         sa->narg = sa->callp->sy_narg;
 1003         KASSERT(sa->narg <= nitems(sa->args), ("Too many syscall arguments!"));
 1004         argp = &frame->tf_rdi;
 1005         argp += reg;
 1006         memcpy(sa->args, argp, sizeof(sa->args[0]) * NARGREGS);
 1007         if (sa->narg > regcnt) {
 1008                 params = (caddr_t)frame->tf_rsp + sizeof(register_t);
 1009                 error = copyin(params, &sa->args[regcnt],
 1010                     (sa->narg - regcnt) * sizeof(sa->args[0]));
 1011                 if (__predict_false(error != 0))
 1012                         return (error);
 1013         }
 1014 
 1015         td->td_retval[0] = 0;
 1016         td->td_retval[1] = frame->tf_rdx;
 1017 
 1018         return (0);
 1019 }
 1020 
 1021 int
 1022 cpu_fetch_syscall_args(struct thread *td)
 1023 {
 1024         struct proc *p;
 1025         struct trapframe *frame;
 1026         struct syscall_args *sa;
 1027 
 1028         p = td->td_proc;
 1029         frame = td->td_frame;
 1030         sa = &td->td_sa;
 1031 
 1032         sa->code = frame->tf_rax;
 1033 
 1034         if (__predict_false(sa->code == SYS_syscall ||
 1035             sa->code == SYS___syscall ||
 1036             sa->code >= p->p_sysent->sv_size))
 1037                 return (cpu_fetch_syscall_args_fallback(td, sa));
 1038 
 1039         sa->callp = &p->p_sysent->sv_table[sa->code];
 1040         sa->narg = sa->callp->sy_narg;
 1041         KASSERT(sa->narg <= nitems(sa->args), ("Too many syscall arguments!"));
 1042 
 1043         if (p->p_sysent->sv_mask)
 1044                 sa->code &= p->p_sysent->sv_mask;
 1045 
 1046         if (__predict_false(sa->narg > NARGREGS))
 1047                 return (cpu_fetch_syscall_args_fallback(td, sa));
 1048 
 1049         memcpy(sa->args, &frame->tf_rdi, sizeof(sa->args[0]) * NARGREGS);
 1050 
 1051         td->td_retval[0] = 0;
 1052         td->td_retval[1] = frame->tf_rdx;
 1053 
 1054         return (0);
 1055 }
 1056 
 1057 #include "../../kern/subr_syscall.c"
 1058 
 1059 /*
 1060  * System call handler for native binaries.  The trap frame is already
 1061  * set up by the assembler trampoline and a pointer to it is saved in
 1062  * td_frame.
 1063  */
 1064 void
 1065 amd64_syscall(struct thread *td, int traced)
 1066 {
 1067         int error;
 1068         ksiginfo_t ksi;
 1069 
 1070 #ifdef DIAGNOSTIC
 1071         if (!TRAPF_USERMODE(td->td_frame)) {
 1072                 panic("syscall");
 1073                 /* NOT REACHED */
 1074         }
 1075 #endif
 1076         error = syscallenter(td);
 1077 
 1078         /*
 1079          * Traced syscall.
 1080          */
 1081         if (__predict_false(traced)) {
 1082                 td->td_frame->tf_rflags &= ~PSL_T;
 1083                 ksiginfo_init_trap(&ksi);
 1084                 ksi.ksi_signo = SIGTRAP;
 1085                 ksi.ksi_code = TRAP_TRACE;
 1086                 ksi.ksi_addr = (void *)td->td_frame->tf_rip;
 1087                 trapsignal(td, &ksi);
 1088         }
 1089 
 1090         KASSERT(PCB_USER_FPU(td->td_pcb),
 1091             ("System call %s returning with kernel FPU ctx leaked",
 1092              syscallname(td->td_proc, td->td_sa.code)));
 1093         KASSERT(td->td_pcb->pcb_save == get_pcb_user_save_td(td),
 1094             ("System call %s returning with mangled pcb_save",
 1095              syscallname(td->td_proc, td->td_sa.code)));
 1096         KASSERT(td->td_md.md_invl_gen.gen == 0,
 1097             ("System call %s returning with leaked invl_gen %lu",
 1098             syscallname(td->td_proc, td->td_sa.code),
 1099             td->td_md.md_invl_gen.gen));
 1100 
 1101         syscallret(td, error);
 1102 
 1103         /*
 1104          * If the user-supplied value of %rip is not a canonical
 1105          * address, then some CPUs will trigger a ring 0 #GP during
 1106          * the sysret instruction.  However, the fault handler would
 1107          * execute in ring 0 with the user's %gs and %rsp which would
 1108          * not be safe.  Instead, use the full return path which
 1109          * catches the problem safely.
 1110          */
 1111         if (__predict_false(td->td_frame->tf_rip >= VM_MAXUSER_ADDRESS))
 1112                 set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
 1113 }

Cache object: 384325b26a535edfb04b5abd1dc94bdf


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.