The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/amd64/amd64/machdep.c

Version: -  FREEBSD  -  FREEBSD-13-STABLE  -  FREEBSD-13-0  -  FREEBSD-12-STABLE  -  FREEBSD-12-0  -  FREEBSD-11-STABLE  -  FREEBSD-11-0  -  FREEBSD-10-STABLE  -  FREEBSD-10-0  -  FREEBSD-9-STABLE  -  FREEBSD-9-0  -  FREEBSD-8-STABLE  -  FREEBSD-8-0  -  FREEBSD-7-STABLE  -  FREEBSD-7-0  -  FREEBSD-6-STABLE  -  FREEBSD-6-0  -  FREEBSD-5-STABLE  -  FREEBSD-5-0  -  FREEBSD-4-STABLE  -  FREEBSD-3-STABLE  -  FREEBSD22  -  l41  -  OPENBSD  -  linux-2.6  -  MK84  -  PLAN9  -  xnu-8792 
SearchContext: -  none  -  3  -  10 

    1 /*-
    2  * Copyright (c) 2003 Peter Wemm.
    3  * Copyright (c) 1992 Terrence R. Lambert.
    4  * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
    5  * All rights reserved.
    6  *
    7  * This code is derived from software contributed to Berkeley by
    8  * William Jolitz.
    9  *
   10  * Redistribution and use in source and binary forms, with or without
   11  * modification, are permitted provided that the following conditions
   12  * are met:
   13  * 1. Redistributions of source code must retain the above copyright
   14  *    notice, this list of conditions and the following disclaimer.
   15  * 2. Redistributions in binary form must reproduce the above copyright
   16  *    notice, this list of conditions and the following disclaimer in the
   17  *    documentation and/or other materials provided with the distribution.
   18  * 3. All advertising materials mentioning features or use of this software
   19  *    must display the following acknowledgement:
   20  *      This product includes software developed by the University of
   21  *      California, Berkeley and its contributors.
   22  * 4. Neither the name of the University nor the names of its contributors
   23  *    may be used to endorse or promote products derived from this software
   24  *    without specific prior written permission.
   25  *
   26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   36  * SUCH DAMAGE.
   37  *
   38  *      from: @(#)machdep.c     7.4 (Berkeley) 6/3/91
   39  */
   40 
   41 #include <sys/cdefs.h>
   42 __FBSDID("$FreeBSD: releng/11.1/sys/amd64/amd64/machdep.c 338607 2018-09-12 05:08:49Z gordon $");
   43 
   44 #include "opt_atpic.h"
   45 #include "opt_compat.h"
   46 #include "opt_cpu.h"
   47 #include "opt_ddb.h"
   48 #include "opt_inet.h"
   49 #include "opt_isa.h"
   50 #include "opt_kstack_pages.h"
   51 #include "opt_maxmem.h"
   52 #include "opt_mp_watchdog.h"
   53 #include "opt_perfmon.h"
   54 #include "opt_platform.h"
   55 #include "opt_sched.h"
   56 
   57 #include <sys/param.h>
   58 #include <sys/proc.h>
   59 #include <sys/systm.h>
   60 #include <sys/bio.h>
   61 #include <sys/buf.h>
   62 #include <sys/bus.h>
   63 #include <sys/callout.h>
   64 #include <sys/cons.h>
   65 #include <sys/cpu.h>
   66 #include <sys/efi.h>
   67 #include <sys/eventhandler.h>
   68 #include <sys/exec.h>
   69 #include <sys/imgact.h>
   70 #include <sys/kdb.h>
   71 #include <sys/kernel.h>
   72 #include <sys/ktr.h>
   73 #include <sys/linker.h>
   74 #include <sys/lock.h>
   75 #include <sys/malloc.h>
   76 #include <sys/memrange.h>
   77 #include <sys/msgbuf.h>
   78 #include <sys/mutex.h>
   79 #include <sys/pcpu.h>
   80 #include <sys/ptrace.h>
   81 #include <sys/reboot.h>
   82 #include <sys/rwlock.h>
   83 #include <sys/sched.h>
   84 #include <sys/signalvar.h>
   85 #ifdef SMP
   86 #include <sys/smp.h>
   87 #endif
   88 #include <sys/syscallsubr.h>
   89 #include <sys/sysctl.h>
   90 #include <sys/sysent.h>
   91 #include <sys/sysproto.h>
   92 #include <sys/ucontext.h>
   93 #include <sys/vmmeter.h>
   94 
   95 #include <vm/vm.h>
   96 #include <vm/vm_extern.h>
   97 #include <vm/vm_kern.h>
   98 #include <vm/vm_page.h>
   99 #include <vm/vm_map.h>
  100 #include <vm/vm_object.h>
  101 #include <vm/vm_pager.h>
  102 #include <vm/vm_param.h>
  103 
  104 #ifdef DDB
  105 #ifndef KDB
  106 #error KDB must be enabled in order for DDB to work!
  107 #endif
  108 #include <ddb/ddb.h>
  109 #include <ddb/db_sym.h>
  110 #endif
  111 
  112 #include <net/netisr.h>
  113 
  114 #include <machine/clock.h>
  115 #include <machine/cpu.h>
  116 #include <machine/cputypes.h>
  117 #include <machine/frame.h>
  118 #include <machine/intr_machdep.h>
  119 #include <x86/mca.h>
  120 #include <machine/md_var.h>
  121 #include <machine/metadata.h>
  122 #include <machine/mp_watchdog.h>
  123 #include <machine/pc/bios.h>
  124 #include <machine/pcb.h>
  125 #include <machine/proc.h>
  126 #include <machine/reg.h>
  127 #include <machine/sigframe.h>
  128 #include <machine/specialreg.h>
  129 #ifdef PERFMON
  130 #include <machine/perfmon.h>
  131 #endif
  132 #include <machine/tss.h>
  133 #ifdef SMP
  134 #include <machine/smp.h>
  135 #endif
  136 #ifdef FDT
  137 #include <x86/fdt.h>
  138 #endif
  139 
  140 #ifdef DEV_ATPIC
  141 #include <x86/isa/icu.h>
  142 #else
  143 #include <x86/apicvar.h>
  144 #endif
  145 
  146 #include <isa/isareg.h>
  147 #include <isa/rtc.h>
  148 #include <x86/init.h>
  149 
  150 /* Sanity check for __curthread() */
  151 CTASSERT(offsetof(struct pcpu, pc_curthread) == 0);
  152 
  153 /*
  154  * The PTI trampoline stack needs enough space for a hardware trapframe and a
  155  * couple of scratch registers, as well as the trapframe left behind after an
  156  * iret fault.
  157  */
  158 CTASSERT(PC_PTI_STACK_SZ * sizeof(register_t) >= 2 * sizeof(struct pti_frame) -
  159     offsetof(struct pti_frame, pti_rip));
  160 
  161 extern u_int64_t hammer_time(u_int64_t, u_int64_t);
  162 
  163 #define CS_SECURE(cs)           (ISPL(cs) == SEL_UPL)
  164 #define EFL_SECURE(ef, oef)     ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
  165 
  166 static void cpu_startup(void *);
  167 static void get_fpcontext(struct thread *td, mcontext_t *mcp,
  168     char *xfpusave, size_t xfpusave_len);
  169 static int  set_fpcontext(struct thread *td, mcontext_t *mcp,
  170     char *xfpustate, size_t xfpustate_len);
  171 SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL);
  172 
  173 /* Preload data parse function */
  174 static caddr_t native_parse_preload_data(u_int64_t);
  175 
  176 /* Native function to fetch and parse the e820 map */
  177 static void native_parse_memmap(caddr_t, vm_paddr_t *, int *);
  178 
  179 /* Default init_ops implementation. */
  180 struct init_ops init_ops = {
  181         .parse_preload_data =   native_parse_preload_data,
  182         .early_clock_source_init =      i8254_init,
  183         .early_delay =                  i8254_delay,
  184         .parse_memmap =                 native_parse_memmap,
  185 #ifdef SMP
  186         .mp_bootaddress =               mp_bootaddress,
  187         .start_all_aps =                native_start_all_aps,
  188 #endif
  189         .msi_init =                     msi_init,
  190 };
  191 
  192 struct msgbuf *msgbufp;
  193 
  194 /*
  195  * Physical address of the EFI System Table. Stashed from the metadata hints
  196  * passed into the kernel and used by the EFI code to call runtime services.
  197  */
  198 vm_paddr_t efi_systbl_phys;
  199 
  200 /* Intel ICH registers */
  201 #define ICH_PMBASE      0x400
  202 #define ICH_SMI_EN      ICH_PMBASE + 0x30
  203 
  204 int     _udatasel, _ucodesel, _ucode32sel, _ufssel, _ugssel;
  205 
  206 int cold = 1;
  207 
  208 long Maxmem = 0;
  209 long realmem = 0;
  210 
  211 /*
  212  * The number of PHYSMAP entries must be one less than the number of
  213  * PHYSSEG entries because the PHYSMAP entry that spans the largest
  214  * physical address that is accessible by ISA DMA is split into two
  215  * PHYSSEG entries.
  216  */
  217 #define PHYSMAP_SIZE    (2 * (VM_PHYSSEG_MAX - 1))
  218 
  219 vm_paddr_t phys_avail[PHYSMAP_SIZE + 2];
  220 vm_paddr_t dump_avail[PHYSMAP_SIZE + 2];
  221 
  222 /* must be 2 less so 0 0 can signal end of chunks */
  223 #define PHYS_AVAIL_ARRAY_END (nitems(phys_avail) - 2)
  224 #define DUMP_AVAIL_ARRAY_END (nitems(dump_avail) - 2)
  225 
  226 struct kva_md_info kmi;
  227 
  228 static struct trapframe proc0_tf;
  229 struct region_descriptor r_gdt, r_idt;
  230 
  231 struct pcpu __pcpu[MAXCPU];
  232 
  233 struct mtx icu_lock;
  234 
  235 struct mem_range_softc mem_range_softc;
  236 
  237 struct mtx dt_lock;     /* lock for GDT and LDT */
  238 
  239 void (*vmm_resume_p)(void);
  240 
  241 static void
  242 cpu_startup(dummy)
  243         void *dummy;
  244 {
  245         uintmax_t memsize;
  246         char *sysenv;
  247 
  248         /*
  249          * On MacBooks, we need to disallow the legacy USB circuit to
  250          * generate an SMI# because this can cause several problems,
  251          * namely: incorrect CPU frequency detection and failure to
  252          * start the APs.
  253          * We do this by disabling a bit in the SMI_EN (SMI Control and
  254          * Enable register) of the Intel ICH LPC Interface Bridge. 
  255          */
  256         sysenv = kern_getenv("smbios.system.product");
  257         if (sysenv != NULL) {
  258                 if (strncmp(sysenv, "MacBook1,1", 10) == 0 ||
  259                     strncmp(sysenv, "MacBook3,1", 10) == 0 ||
  260                     strncmp(sysenv, "MacBook4,1", 10) == 0 ||
  261                     strncmp(sysenv, "MacBookPro1,1", 13) == 0 ||
  262                     strncmp(sysenv, "MacBookPro1,2", 13) == 0 ||
  263                     strncmp(sysenv, "MacBookPro3,1", 13) == 0 ||
  264                     strncmp(sysenv, "MacBookPro4,1", 13) == 0 ||
  265                     strncmp(sysenv, "Macmini1,1", 10) == 0) {
  266                         if (bootverbose)
  267                                 printf("Disabling LEGACY_USB_EN bit on "
  268                                     "Intel ICH.\n");
  269                         outl(ICH_SMI_EN, inl(ICH_SMI_EN) & ~0x8);
  270                 }
  271                 freeenv(sysenv);
  272         }
  273 
  274         /*
  275          * Good {morning,afternoon,evening,night}.
  276          */
  277         startrtclock();
  278         printcpuinfo();
  279 #ifdef PERFMON
  280         perfmon_init();
  281 #endif
  282 
  283         /*
  284          * Display physical memory if SMBIOS reports reasonable amount.
  285          */
  286         memsize = 0;
  287         sysenv = kern_getenv("smbios.memory.enabled");
  288         if (sysenv != NULL) {
  289                 memsize = (uintmax_t)strtoul(sysenv, (char **)NULL, 10) << 10;
  290                 freeenv(sysenv);
  291         }
  292         if (memsize < ptoa((uintmax_t)vm_cnt.v_free_count))
  293                 memsize = ptoa((uintmax_t)Maxmem);
  294         printf("real memory  = %ju (%ju MB)\n", memsize, memsize >> 20);
  295         realmem = atop(memsize);
  296 
  297         /*
  298          * Display any holes after the first chunk of extended memory.
  299          */
  300         if (bootverbose) {
  301                 int indx;
  302 
  303                 printf("Physical memory chunk(s):\n");
  304                 for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) {
  305                         vm_paddr_t size;
  306 
  307                         size = phys_avail[indx + 1] - phys_avail[indx];
  308                         printf(
  309                             "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n",
  310                             (uintmax_t)phys_avail[indx],
  311                             (uintmax_t)phys_avail[indx + 1] - 1,
  312                             (uintmax_t)size, (uintmax_t)size / PAGE_SIZE);
  313                 }
  314         }
  315 
  316         vm_ksubmap_init(&kmi);
  317 
  318         printf("avail memory = %ju (%ju MB)\n",
  319             ptoa((uintmax_t)vm_cnt.v_free_count),
  320             ptoa((uintmax_t)vm_cnt.v_free_count) / 1048576);
  321 
  322         /*
  323          * Set up buffers, so they can be used to read disk labels.
  324          */
  325         bufinit();
  326         vm_pager_bufferinit();
  327 
  328         cpu_setregs();
  329 }
  330 
/*
 * Send an interrupt (signal) to a process.
 *
 * Builds a struct sigframe on the user stack containing the saved
 * machine context and (optionally) extended FPU state, then rewrites
 * the trapframe so that, on return to user mode, execution resumes in
 * the signal trampoline with the handler's arguments already loaded
 * into the argument registers per the amd64 calling convention.
 *
 * Stack is set up to allow sigcode stored
 * at top to call routine, followed by call
 * to sigreturn routine below.  After sigreturn
 * resets the signal mask, the stack, and the
 * frame pointer, it returns to the user
 * specified pc, psl.
 *
 * Called with the proc lock and ps_mtx held (asserted below); both are
 * dropped around the copyout and reacquired before returning.
 */
void
sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
{
	struct sigframe sf, *sfp;
	struct pcb *pcb;
	struct proc *p;
	struct thread *td;
	struct sigacts *psp;
	char *sp;
	struct trapframe *regs;
	char *xfpusave;		/* scratch copy of extended FPU state */
	size_t xfpusave_len;
	int sig;
	int oonstack;		/* already running on the alternate stack? */

	td = curthread;
	pcb = td->td_pcb;
	p = td->td_proc;
	PROC_LOCK_ASSERT(p, MA_OWNED);
	sig = ksi->ksi_signo;
	psp = p->p_sigacts;
	mtx_assert(&psp->ps_mtx, MA_OWNED);
	regs = td->td_frame;
	oonstack = sigonstack(regs->tf_rsp);

	/*
	 * If XSAVE is in use and the extended state area exceeds the
	 * legacy savefpu, stage the extra state in a kernel stack
	 * buffer so it can be copied out after the sigframe.
	 */
	if (cpu_max_ext_state_size > sizeof(struct savefpu) && use_xsave) {
		xfpusave_len = cpu_max_ext_state_size - sizeof(struct savefpu);
		xfpusave = __builtin_alloca(xfpusave_len);
	} else {
		xfpusave_len = 0;
		xfpusave = NULL;
	}

	/* Save user context. */
	bzero(&sf, sizeof(sf));
	sf.sf_uc.uc_sigmask = *mask;
	sf.sf_uc.uc_stack = td->td_sigstk;
	sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
	    ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE;
	sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0;
	/* mc_rdi is the first register slot; the trapframe layout matches. */
	bcopy(regs, &sf.sf_uc.uc_mcontext.mc_rdi, sizeof(*regs));
	sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext); /* magic */
	get_fpcontext(td, &sf.sf_uc.uc_mcontext, xfpusave, xfpusave_len);
	fpstate_drop(td);
	sf.sf_uc.uc_mcontext.mc_fsbase = pcb->pcb_fsbase;
	sf.sf_uc.uc_mcontext.mc_gsbase = pcb->pcb_gsbase;
	bzero(sf.sf_uc.uc_mcontext.mc_spare,
	    sizeof(sf.sf_uc.uc_mcontext.mc_spare));
	bzero(sf.sf_uc.__spare__, sizeof(sf.sf_uc.__spare__));

	/* Allocate space for the signal handler context. */
	if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack &&
	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
		/* Use the top of the alternate signal stack. */
		sp = (char *)td->td_sigstk.ss_sp + td->td_sigstk.ss_size;
#if defined(COMPAT_43)
		td->td_sigstk.ss_flags |= SS_ONSTACK;
#endif
	} else
		/* Skip the 128-byte amd64 ABI red zone below %rsp. */
		sp = (char *)regs->tf_rsp - 128;
	if (xfpusave != NULL) {
		/* Reserve the extended FPU area, 64-byte aligned (~0x3F). */
		sp -= xfpusave_len;
		sp = (char *)((unsigned long)sp & ~0x3Ful);
		sf.sf_uc.uc_mcontext.mc_xfpustate = (register_t)sp;
	}
	sp -= sizeof(struct sigframe);
	/* Align to 16 bytes. */
	sfp = (struct sigframe *)((unsigned long)sp & ~0xFul);

	/* Build the argument list for the signal handler. */
	regs->tf_rdi = sig;			/* arg 1 in %rdi */
	regs->tf_rdx = (register_t)&sfp->sf_uc;	/* arg 3 in %rdx */
	bzero(&sf.sf_si, sizeof(sf.sf_si));
	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
		/* Signal handler installed with SA_SIGINFO. */
		regs->tf_rsi = (register_t)&sfp->sf_si;	/* arg 2 in %rsi */
		sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher;

		/* Fill in POSIX parts */
		sf.sf_si = ksi->ksi_info;
		sf.sf_si.si_signo = sig; /* maybe a translated signal */
		regs->tf_rcx = (register_t)ksi->ksi_addr; /* arg 4 in %rcx */
	} else {
		/* Old FreeBSD-style arguments. */
		regs->tf_rsi = ksi->ksi_code;	/* arg 2 in %rsi */
		regs->tf_rcx = (register_t)ksi->ksi_addr; /* arg 4 in %rcx */
		sf.sf_ahu.sf_handler = catcher;
	}
	/* Drop the locks before touching user memory. */
	mtx_unlock(&psp->ps_mtx);
	PROC_UNLOCK(p);

	/*
	 * Copy the sigframe out to the user's stack.
	 * An unwritable stack is unrecoverable: kill the process.
	 */
	if (copyout(&sf, sfp, sizeof(*sfp)) != 0 ||
	    (xfpusave != NULL && copyout(xfpusave,
	    (void *)sf.sf_uc.uc_mcontext.mc_xfpustate, xfpusave_len)
	    != 0)) {
#ifdef DEBUG
		printf("process %ld has trashed its stack\n", (long)p->p_pid);
#endif
		PROC_LOCK(p);
		sigexit(td, SIGILL);
	}

	/* Redirect user execution to the signal trampoline. */
	regs->tf_rsp = (long)sfp;
	regs->tf_rip = p->p_sysent->sv_sigcode_base;
	regs->tf_rflags &= ~(PSL_T | PSL_D);
	regs->tf_cs = _ucodesel;
	regs->tf_ds = _udatasel;
	regs->tf_ss = _udatasel;
	regs->tf_es = _udatasel;
	regs->tf_fs = _ufssel;
	regs->tf_gs = _ugssel;
	regs->tf_flags = TF_HASSEGS;
	set_pcb_flags(pcb, PCB_FULL_IRET);
	PROC_LOCK(p);
	mtx_lock(&psp->ps_mtx);
}
  459 
  460 /*
  461  * System call to cleanup state after a signal
  462  * has been taken.  Reset signal mask and
  463  * stack state from context left by sendsig (above).
  464  * Return to previous pc and psl as specified by
  465  * context left by sendsig. Check carefully to
  466  * make sure that the user has not modified the
  467  * state to gain improper privileges.
  468  *
  469  * MPSAFE
  470  */
  471 int
  472 sys_sigreturn(td, uap)
  473         struct thread *td;
  474         struct sigreturn_args /* {
  475                 const struct __ucontext *sigcntxp;
  476         } */ *uap;
  477 {
  478         ucontext_t uc;
  479         struct pcb *pcb;
  480         struct proc *p;
  481         struct trapframe *regs;
  482         ucontext_t *ucp;
  483         char *xfpustate;
  484         size_t xfpustate_len;
  485         long rflags;
  486         int cs, error, ret;
  487         ksiginfo_t ksi;
  488 
  489         pcb = td->td_pcb;
  490         p = td->td_proc;
  491 
  492         error = copyin(uap->sigcntxp, &uc, sizeof(uc));
  493         if (error != 0) {
  494                 uprintf("pid %d (%s): sigreturn copyin failed\n",
  495                     p->p_pid, td->td_name);
  496                 return (error);
  497         }
  498         ucp = &uc;
  499         if ((ucp->uc_mcontext.mc_flags & ~_MC_FLAG_MASK) != 0) {
  500                 uprintf("pid %d (%s): sigreturn mc_flags %x\n", p->p_pid,
  501                     td->td_name, ucp->uc_mcontext.mc_flags);
  502                 return (EINVAL);
  503         }
  504         regs = td->td_frame;
  505         rflags = ucp->uc_mcontext.mc_rflags;
  506         /*
  507          * Don't allow users to change privileged or reserved flags.
  508          */
  509         if (!EFL_SECURE(rflags, regs->tf_rflags)) {
  510                 uprintf("pid %d (%s): sigreturn rflags = 0x%lx\n", p->p_pid,
  511                     td->td_name, rflags);
  512                 return (EINVAL);
  513         }
  514 
  515         /*
  516          * Don't allow users to load a valid privileged %cs.  Let the
  517          * hardware check for invalid selectors, excess privilege in
  518          * other selectors, invalid %eip's and invalid %esp's.
  519          */
  520         cs = ucp->uc_mcontext.mc_cs;
  521         if (!CS_SECURE(cs)) {
  522                 uprintf("pid %d (%s): sigreturn cs = 0x%x\n", p->p_pid,
  523                     td->td_name, cs);
  524                 ksiginfo_init_trap(&ksi);
  525                 ksi.ksi_signo = SIGBUS;
  526                 ksi.ksi_code = BUS_OBJERR;
  527                 ksi.ksi_trapno = T_PROTFLT;
  528                 ksi.ksi_addr = (void *)regs->tf_rip;
  529                 trapsignal(td, &ksi);
  530                 return (EINVAL);
  531         }
  532 
  533         if ((uc.uc_mcontext.mc_flags & _MC_HASFPXSTATE) != 0) {
  534                 xfpustate_len = uc.uc_mcontext.mc_xfpustate_len;
  535                 if (xfpustate_len > cpu_max_ext_state_size -
  536                     sizeof(struct savefpu)) {
  537                         uprintf("pid %d (%s): sigreturn xfpusave_len = 0x%zx\n",
  538                             p->p_pid, td->td_name, xfpustate_len);
  539                         return (EINVAL);
  540                 }
  541                 xfpustate = __builtin_alloca(xfpustate_len);
  542                 error = copyin((const void *)uc.uc_mcontext.mc_xfpustate,
  543                     xfpustate, xfpustate_len);
  544                 if (error != 0) {
  545                         uprintf(
  546         "pid %d (%s): sigreturn copying xfpustate failed\n",
  547                             p->p_pid, td->td_name);
  548                         return (error);
  549                 }
  550         } else {
  551                 xfpustate = NULL;
  552                 xfpustate_len = 0;
  553         }
  554         ret = set_fpcontext(td, &ucp->uc_mcontext, xfpustate, xfpustate_len);
  555         if (ret != 0) {
  556                 uprintf("pid %d (%s): sigreturn set_fpcontext err %d\n",
  557                     p->p_pid, td->td_name, ret);
  558                 return (ret);
  559         }
  560         bcopy(&ucp->uc_mcontext.mc_rdi, regs, sizeof(*regs));
  561         pcb->pcb_fsbase = ucp->uc_mcontext.mc_fsbase;
  562         pcb->pcb_gsbase = ucp->uc_mcontext.mc_gsbase;
  563 
  564 #if defined(COMPAT_43)
  565         if (ucp->uc_mcontext.mc_onstack & 1)
  566                 td->td_sigstk.ss_flags |= SS_ONSTACK;
  567         else
  568                 td->td_sigstk.ss_flags &= ~SS_ONSTACK;
  569 #endif
  570 
  571         kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0);
  572         set_pcb_flags(pcb, PCB_FULL_IRET);
  573         return (EJUSTRETURN);
  574 }
  575 
  576 #ifdef COMPAT_FREEBSD4
  577 int
  578 freebsd4_sigreturn(struct thread *td, struct freebsd4_sigreturn_args *uap)
  579 {
  580  
  581         return sys_sigreturn(td, (struct sigreturn_args *)uap);
  582 }
  583 #endif
  584 
/*
 * Reset registers to default values on exec.
 *
 * Gives the newly exec'd image a pristine machine state: no user LDT,
 * zeroed segment bases and trapframe, default FPU control word, and
 * cleared debug registers.  The trapframe is set up so the process
 * starts at imgp->entry_addr with the stack pointer 16-byte aligned.
 */
void
exec_setregs(struct thread *td, struct image_params *imgp, u_long stack)
{
	struct trapframe *regs = td->td_frame;
	struct pcb *pcb = td->td_pcb;

	/*
	 * Free any user LDT left over from the previous image.
	 * NOTE(review): user_ldt_free() appears to release dt_lock
	 * itself, hence the asymmetric unlock — confirm against its
	 * definition before changing this locking.
	 */
	mtx_lock(&dt_lock);
	if (td->td_proc->p_md.md_ldt != NULL)
		user_ldt_free(td);
	else
		mtx_unlock(&dt_lock);

	pcb->pcb_fsbase = 0;
	pcb->pcb_gsbase = 0;
	clear_pcb_flags(pcb, PCB_32BIT);
	pcb->pcb_initial_fpucw = __INITIAL_FPUCW__;
	set_pcb_flags(pcb, PCB_FULL_IRET);

	bzero((char *)regs, sizeof(struct trapframe));
	regs->tf_rip = imgp->entry_addr;
	/* Align %rsp to 16 bytes, preserving the return-address slot. */
	regs->tf_rsp = ((stack - 8) & ~0xFul) + 8;
	regs->tf_rdi = stack;		/* argv */
	regs->tf_rflags = PSL_USER | (regs->tf_rflags & PSL_T);
	regs->tf_ss = _udatasel;
	regs->tf_cs = _ucodesel;
	regs->tf_ds = _udatasel;
	regs->tf_es = _udatasel;
	regs->tf_fs = _ufssel;
	regs->tf_gs = _ugssel;
	regs->tf_flags = TF_HASSEGS;
	td->td_retval[1] = 0;

	/*
	 * Reset the hardware debug registers if they were in use.
	 * They won't have any meaning for the newly exec'd process.
	 */
	if (pcb->pcb_flags & PCB_DBREGS) {
		pcb->pcb_dr0 = 0;
		pcb->pcb_dr1 = 0;
		pcb->pcb_dr2 = 0;
		pcb->pcb_dr3 = 0;
		pcb->pcb_dr6 = 0;
		pcb->pcb_dr7 = 0;
		if (pcb == curpcb) {
			/*
			 * Clear the debug registers on the running
			 * CPU, otherwise they will end up affecting
			 * the next process we switch to.
			 */
			reset_dbregs();
		}
		clear_pcb_flags(pcb, PCB_DBREGS);
	}

	/*
	 * Drop the FP state if we hold it, so that the process gets a
	 * clean FP state if it uses the FPU again.
	 */
	fpstate_drop(td);
}
  648 
  649 void
  650 cpu_setregs(void)
  651 {
  652         register_t cr0;
  653 
  654         cr0 = rcr0();
  655         /*
  656          * CR0_MP, CR0_NE and CR0_TS are also set by npx_probe() for the
  657          * BSP.  See the comments there about why we set them.
  658          */
  659         cr0 |= CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM;
  660         load_cr0(cr0);
  661 }
  662 
  663 /*
  664  * Initialize amd64 and configure to run kernel
  665  */
  666 
  667 /*
  668  * Initialize segments & interrupt table
  669  */
  670 
  671 struct user_segment_descriptor gdt[NGDT * MAXCPU];/* global descriptor tables */
  672 static struct gate_descriptor idt0[NIDT];
  673 struct gate_descriptor *idt = &idt0[0]; /* interrupt descriptor table */
  674 
  675 static char dblfault_stack[PAGE_SIZE] __aligned(16);
  676 static char mce0_stack[PAGE_SIZE] __aligned(16);
  677 static char nmi0_stack[PAGE_SIZE] __aligned(16);
  678 static char dbg0_stack[PAGE_SIZE] __aligned(16);
  679 CTASSERT(sizeof(struct nmi_pcpu) == 16);
  680 
  681 struct amd64tss common_tss[MAXCPU];
  682 
  683 /*
  684  * Software prototypes -- in more palatable form.
  685  *
  686  * Keep GUFS32, GUGS32, GUCODE32 and GUDATA at the same
  687  * slots as corresponding segments for i386 kernel.
  688  */
/*
 * Software prototypes -- in more palatable form.
 *
 * Keep GUFS32, GUGS32, GUCODE32 and GUDATA at the same
 * slots as corresponding segments for i386 kernel.
 *
 * Note: in long mode, system descriptors (TSS, LDT) are 16 bytes and
 * therefore occupy two consecutive GDT slots; the all-zero entries at
 * slots 10 and 12 are the upper halves of the preceding descriptors.
 */
struct soft_segment_descriptor gdt_segs[] = {
/* GNULL_SEL    0 Null Descriptor */
{       .ssd_base = 0x0,
        .ssd_limit = 0x0,
        .ssd_type = 0,
        .ssd_dpl = 0,
        .ssd_p = 0,
        .ssd_long = 0,
        .ssd_def32 = 0,
        .ssd_gran = 0           },
/* GNULL2_SEL   1 Null Descriptor */
{       .ssd_base = 0x0,
        .ssd_limit = 0x0,
        .ssd_type = 0,
        .ssd_dpl = 0,
        .ssd_p = 0,
        .ssd_long = 0,
        .ssd_def32 = 0,
        .ssd_gran = 0           },
/* GUFS32_SEL   2 32 bit %gs Descriptor for user */
{       .ssd_base = 0x0,
        .ssd_limit = 0xfffff,
        .ssd_type = SDT_MEMRWA,
        .ssd_dpl = SEL_UPL,
        .ssd_p = 1,
        .ssd_long = 0,
        .ssd_def32 = 1,
        .ssd_gran = 1           },
/* GUGS32_SEL   3 32 bit %fs Descriptor for user */
{       .ssd_base = 0x0,
        .ssd_limit = 0xfffff,
        .ssd_type = SDT_MEMRWA,
        .ssd_dpl = SEL_UPL,
        .ssd_p = 1,
        .ssd_long = 0,
        .ssd_def32 = 1,
        .ssd_gran = 1           },
/* GCODE_SEL    4 Code Descriptor for kernel */
{       .ssd_base = 0x0,
        .ssd_limit = 0xfffff,
        .ssd_type = SDT_MEMERA,
        .ssd_dpl = SEL_KPL,
        .ssd_p = 1,
        .ssd_long = 1,
        .ssd_def32 = 0,
        .ssd_gran = 1           },
/* GDATA_SEL    5 Data Descriptor for kernel */
{       .ssd_base = 0x0,
        .ssd_limit = 0xfffff,
        .ssd_type = SDT_MEMRWA,
        .ssd_dpl = SEL_KPL,
        .ssd_p = 1,
        .ssd_long = 1,
        .ssd_def32 = 0,
        .ssd_gran = 1           },
/* GUCODE32_SEL 6 32 bit Code Descriptor for user */
{       .ssd_base = 0x0,
        .ssd_limit = 0xfffff,
        .ssd_type = SDT_MEMERA,
        .ssd_dpl = SEL_UPL,
        .ssd_p = 1,
        .ssd_long = 0,
        .ssd_def32 = 1,
        .ssd_gran = 1           },
/* GUDATA_SEL   7 32/64 bit Data Descriptor for user */
{       .ssd_base = 0x0,
        .ssd_limit = 0xfffff,
        .ssd_type = SDT_MEMRWA,
        .ssd_dpl = SEL_UPL,
        .ssd_p = 1,
        .ssd_long = 0,
        .ssd_def32 = 1,
        .ssd_gran = 1           },
/* GUCODE_SEL   8 64 bit Code Descriptor for user */
{       .ssd_base = 0x0,
        .ssd_limit = 0xfffff,
        .ssd_type = SDT_MEMERA,
        .ssd_dpl = SEL_UPL,
        .ssd_p = 1,
        .ssd_long = 1,
        .ssd_def32 = 0,
        .ssd_gran = 1           },
/* GPROC0_SEL   9 Proc 0 Tss Descriptor */
{       .ssd_base = 0x0,
        .ssd_limit = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE - 1,
        .ssd_type = SDT_SYSTSS,
        .ssd_dpl = SEL_KPL,
        .ssd_p = 1,
        .ssd_long = 0,
        .ssd_def32 = 0,
        .ssd_gran = 0           },
/* 10: upper half of the double-size (16-byte) TSS system descriptor */
{       .ssd_base = 0x0,
        .ssd_limit = 0x0,
        .ssd_type = 0,
        .ssd_dpl = 0,
        .ssd_p = 0,
        .ssd_long = 0,
        .ssd_def32 = 0,
        .ssd_gran = 0           },
/* GUSERLDT_SEL 11 LDT Descriptor */
{       .ssd_base = 0x0,
        .ssd_limit = 0x0,
        .ssd_type = 0,
        .ssd_dpl = 0,
        .ssd_p = 0,
        .ssd_long = 0,
        .ssd_def32 = 0,
        .ssd_gran = 0           },
/* GUSERLDT_SEL 12 LDT Descriptor, double size (upper half of slot 11) */
{       .ssd_base = 0x0,
        .ssd_limit = 0x0,
        .ssd_type = 0,
        .ssd_dpl = 0,
        .ssd_p = 0,
        .ssd_long = 0,
        .ssd_def32 = 0,
        .ssd_gran = 0           },
};
  808 
  809 void
  810 setidt(int idx, inthand_t *func, int typ, int dpl, int ist)
  811 {
  812         struct gate_descriptor *ip;
  813 
  814         ip = idt + idx;
  815         ip->gd_looffset = (uintptr_t)func;
  816         ip->gd_selector = GSEL(GCODE_SEL, SEL_KPL);
  817         ip->gd_ist = ist;
  818         ip->gd_xx = 0;
  819         ip->gd_type = typ;
  820         ip->gd_dpl = dpl;
  821         ip->gd_p = 1;
  822         ip->gd_hioffset = ((uintptr_t)func)>>16 ;
  823 }
  824 
  825 extern inthand_t
  826         IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
  827         IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
  828         IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
  829         IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align),
  830         IDTVEC(xmm), IDTVEC(dblfault),
  831         IDTVEC(div_pti), IDTVEC(bpt_pti),
  832         IDTVEC(ofl_pti), IDTVEC(bnd_pti), IDTVEC(ill_pti), IDTVEC(dna_pti),
  833         IDTVEC(fpusegm_pti), IDTVEC(tss_pti), IDTVEC(missing_pti),
  834         IDTVEC(stk_pti), IDTVEC(prot_pti), IDTVEC(page_pti),
  835         IDTVEC(rsvd_pti), IDTVEC(fpu_pti), IDTVEC(align_pti),
  836         IDTVEC(xmm_pti),
  837 #ifdef KDTRACE_HOOKS
  838         IDTVEC(dtrace_ret), IDTVEC(dtrace_ret_pti),
  839 #endif
  840 #ifdef XENHVM
  841         IDTVEC(xen_intr_upcall), IDTVEC(xen_intr_upcall_pti),
  842 #endif
  843         IDTVEC(fast_syscall), IDTVEC(fast_syscall32),
  844         IDTVEC(fast_syscall_pti);
  845 
  846 #ifdef DDB
  847 /*
  848  * Display the index and function name of any IDT entries that don't use
  849  * the default 'rsvd' entry point.
  850  */
  851 DB_SHOW_COMMAND(idt, db_show_idt)
  852 {
  853         struct gate_descriptor *ip;
  854         int idx;
  855         uintptr_t func;
  856 
  857         ip = idt;
  858         for (idx = 0; idx < NIDT && !db_pager_quit; idx++) {
  859                 func = ((long)ip->gd_hioffset << 16 | ip->gd_looffset);
  860                 if (func != (uintptr_t)&IDTVEC(rsvd)) {
  861                         db_printf("%3d\t", idx);
  862                         db_printsym(func, DB_STGY_PROC);
  863                         db_printf("\n");
  864                 }
  865                 ip++;
  866         }
  867 }
  868 
  869 /* Show privileged registers. */
DB_SHOW_COMMAND(sysregs, db_show_sysregs)
{
	/* sidt/sgdt store a 10-byte limit+base image; must be unpadded. */
	struct {
		uint16_t limit;
		uint64_t base;
	} __packed idtr, gdtr;
	uint16_t ldt, tr;

	/* Descriptor table registers. */
	__asm __volatile("sidt %0" : "=m" (idtr));
	db_printf("idtr\t0x%016lx/%04x\n",
	    (u_long)idtr.base, (u_int)idtr.limit);
	__asm __volatile("sgdt %0" : "=m" (gdtr));
	db_printf("gdtr\t0x%016lx/%04x\n",
	    (u_long)gdtr.base, (u_int)gdtr.limit);
	/* LDT and task register selectors. */
	__asm __volatile("sldt %0" : "=r" (ldt));
	db_printf("ldtr\t0x%04x\n", ldt);
	__asm __volatile("str %0" : "=r" (tr));
	db_printf("tr\t0x%04x\n", tr);
	/* Control registers; xcr0 only exists when XSAVE is enabled. */
	db_printf("cr0\t0x%016lx\n", rcr0());
	db_printf("cr2\t0x%016lx\n", rcr2());
	db_printf("cr3\t0x%016lx\n", rcr3());
	db_printf("cr4\t0x%016lx\n", rcr4());
	if (rcr4() & CR4_XSAVE)
		db_printf("xcr0\t0x%016lx\n", rxcr(0));
	/* Selected MSRs; FEATURES_CTL only when VMX/SMX is advertised. */
	db_printf("EFER\t0x%016lx\n", rdmsr(MSR_EFER));
	if (cpu_feature2 & (CPUID2_VMX | CPUID2_SMX))
		db_printf("FEATURES_CTL\t%016lx\n",
		    rdmsr(MSR_IA32_FEATURE_CONTROL));
	db_printf("DEBUG_CTL\t0x%016lx\n", rdmsr(MSR_DEBUGCTLMSR));
	db_printf("PAT\t0x%016lx\n", rdmsr(MSR_PAT));
	db_printf("GSBASE\t0x%016lx\n", rdmsr(MSR_GSBASE));
}
  902 
/* Show hardware debug registers (dr4/dr5 are aliases and not shown). */
DB_SHOW_COMMAND(dbregs, db_show_dbregs)
{

	db_printf("dr0\t0x%016lx\n", rdr0());
	db_printf("dr1\t0x%016lx\n", rdr1());
	db_printf("dr2\t0x%016lx\n", rdr2());
	db_printf("dr3\t0x%016lx\n", rdr3());
	db_printf("dr6\t0x%016lx\n", rdr6());
	db_printf("dr7\t0x%016lx\n", rdr7());
}
  913 #endif
  914 
  915 void
  916 sdtossd(sd, ssd)
  917         struct user_segment_descriptor *sd;
  918         struct soft_segment_descriptor *ssd;
  919 {
  920 
  921         ssd->ssd_base  = (sd->sd_hibase << 24) | sd->sd_lobase;
  922         ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit;
  923         ssd->ssd_type  = sd->sd_type;
  924         ssd->ssd_dpl   = sd->sd_dpl;
  925         ssd->ssd_p     = sd->sd_p;
  926         ssd->ssd_long  = sd->sd_long;
  927         ssd->ssd_def32 = sd->sd_def32;
  928         ssd->ssd_gran  = sd->sd_gran;
  929 }
  930 
  931 void
  932 ssdtosd(ssd, sd)
  933         struct soft_segment_descriptor *ssd;
  934         struct user_segment_descriptor *sd;
  935 {
  936 
  937         sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
  938         sd->sd_hibase = (ssd->ssd_base >> 24) & 0xff;
  939         sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
  940         sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
  941         sd->sd_type  = ssd->ssd_type;
  942         sd->sd_dpl   = ssd->ssd_dpl;
  943         sd->sd_p     = ssd->ssd_p;
  944         sd->sd_long  = ssd->ssd_long;
  945         sd->sd_def32 = ssd->ssd_def32;
  946         sd->sd_gran  = ssd->ssd_gran;
  947 }
  948 
  949 void
  950 ssdtosyssd(ssd, sd)
  951         struct soft_segment_descriptor *ssd;
  952         struct system_segment_descriptor *sd;
  953 {
  954 
  955         sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
  956         sd->sd_hibase = (ssd->ssd_base >> 24) & 0xfffffffffful;
  957         sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
  958         sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
  959         sd->sd_type  = ssd->ssd_type;
  960         sd->sd_dpl   = ssd->ssd_dpl;
  961         sd->sd_p     = ssd->ssd_p;
  962         sd->sd_gran  = ssd->ssd_gran;
  963 }
  964 
  965 #if !defined(DEV_ATPIC) && defined(DEV_ISA)
  966 #include <isa/isavar.h>
  967 #include <isa/isareg.h>
  968 /*
  969  * Return a bitmap of the current interrupt requests.  This is 8259-specific
  970  * and is only suitable for use at probe time.
  971  * This is only here to pacify sio.  It is NOT FATAL if this doesn't work.
  972  * It shouldn't be here.  There should probably be an APIC centric
  973  * implementation in the apic driver code, if at all.
  974  */
  975 intrmask_t
  976 isa_irq_pending(void)
  977 {
  978         u_char irr1;
  979         u_char irr2;
  980 
  981         irr1 = inb(IO_ICU1);
  982         irr2 = inb(IO_ICU2);
  983         return ((irr2 << 8) | irr1);
  984 }
  985 #endif
  986 
  987 u_int basemem;
  988 
/*
 * Insert the range [base, base + length) into the physmap array of
 * (start, end) pairs, kept sorted by address.  Ranges adjacent to an
 * existing entry are merged into it; overlapping ranges are dropped.
 * Returns 1 on success (including a harmlessly ignored range) and 0
 * when the array is full.
 */
static int
add_physmap_entry(uint64_t base, uint64_t length, vm_paddr_t *physmap,
    int *physmap_idxp)
{
	int i, insert_idx, physmap_idx;

	physmap_idx = *physmap_idxp;

	/* An empty range carries no information. */
	if (length == 0)
		return (1);

	/*
	 * Find insertion point while checking for overlap.  Start off by
	 * assuming the new entry will be added to the end.
	 *
	 * NB: physmap_idx points to the next free slot.
	 */
	insert_idx = physmap_idx;
	for (i = 0; i <= physmap_idx; i += 2) {
		if (base < physmap[i + 1]) {
			if (base + length <= physmap[i]) {
				insert_idx = i;
				break;
			}
			if (boothowto & RB_VERBOSE)
				printf(
		    "Overlapping memory regions, ignoring second region\n");
			return (1);
		}
	}

	/* See if we can prepend to the next entry. */
	if (insert_idx <= physmap_idx && base + length == physmap[insert_idx]) {
		physmap[insert_idx] = base;
		return (1);
	}

	/* See if we can append to the previous entry. */
	if (insert_idx > 0 && base == physmap[insert_idx - 1]) {
		physmap[insert_idx - 1] += length;
		return (1);
	}

	/* No merge possible: a new pair is needed; fail if none is free. */
	physmap_idx += 2;
	*physmap_idxp = physmap_idx;
	if (physmap_idx == PHYSMAP_SIZE) {
		printf(
		"Too many segments in the physical address map, giving up\n");
		return (0);
	}

	/*
	 * Move the last 'N' entries down to make room for the new
	 * entry if needed.
	 */
	for (i = (physmap_idx - 2); i > insert_idx; i -= 2) {
		physmap[i] = physmap[i - 2];
		physmap[i + 1] = physmap[i - 1];
	}

	/* Insert the new entry. */
	physmap[insert_idx] = base;
	physmap[insert_idx + 1] = base + length;
	return (1);
}
 1054 
 1055 void
 1056 bios_add_smap_entries(struct bios_smap *smapbase, u_int32_t smapsize,
 1057                       vm_paddr_t *physmap, int *physmap_idx)
 1058 {
 1059         struct bios_smap *smap, *smapend;
 1060 
 1061         smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize);
 1062 
 1063         for (smap = smapbase; smap < smapend; smap++) {
 1064                 if (boothowto & RB_VERBOSE)
 1065                         printf("SMAP type=%02x base=%016lx len=%016lx\n",
 1066                             smap->type, smap->base, smap->length);
 1067 
 1068                 if (smap->type != SMAP_TYPE_MEMORY)
 1069                         continue;
 1070 
 1071                 if (!add_physmap_entry(smap->base, smap->length, physmap,
 1072                     physmap_idx))
 1073                         break;
 1074         }
 1075 }
 1076 
/*
 * Walk the UEFI memory map supplied by the loader and add each entry
 * that is usable as general-purpose RAM to the physmap array.  With
 * RB_VERBOSE, also print a human-readable dump of every descriptor.
 */
static void
add_efi_map_entries(struct efi_map_header *efihdr, vm_paddr_t *physmap,
    int *physmap_idx)
{
	struct efi_md *map, *p;
	const char *type;
	size_t efisz;
	int ndesc, i;

	/* Names indexed by EFI memory descriptor type, for verbose output. */
	static const char *types[] = {
		"Reserved",
		"LoaderCode",
		"LoaderData",
		"BootServicesCode",
		"BootServicesData",
		"RuntimeServicesCode",
		"RuntimeServicesData",
		"ConventionalMemory",
		"UnusableMemory",
		"ACPIReclaimMemory",
		"ACPIMemoryNVS",
		"MemoryMappedIO",
		"MemoryMappedIOPortSpace",
		"PalCode",
		"PersistentMemory"
	};

	/*
	 * Memory map data provided by UEFI via the GetMemoryMap
	 * Boot Services API.
	 */
	/* Descriptors follow the header, 16-byte aligned. */
	efisz = (sizeof(struct efi_map_header) + 0xf) & ~0xf;
	map = (struct efi_md *)((uint8_t *)efihdr + efisz);

	/* Guard the division below against a corrupt header. */
	if (efihdr->descriptor_size == 0)
		return;
	ndesc = efihdr->memory_size / efihdr->descriptor_size;

	if (boothowto & RB_VERBOSE)
		printf("%23s %12s %12s %8s %4s\n",
		    "Type", "Physical", "Virtual", "#Pages", "Attr");

	/*
	 * NB: descriptor_size may exceed sizeof(struct efi_md), so
	 * advance with efi_next_descriptor() rather than p++.
	 */
	for (i = 0, p = map; i < ndesc; i++,
	    p = efi_next_descriptor(p, efihdr->descriptor_size)) {
		if (boothowto & RB_VERBOSE) {
			if (p->md_type < nitems(types))
				type = types[p->md_type];
			else
				type = "<INVALID>";
			printf("%23s %012lx %12p %08lx ", type, p->md_phys,
			    p->md_virt, p->md_pages);
			/* Decode the attribute bitmask one flag at a time. */
			if (p->md_attr & EFI_MD_ATTR_UC)
				printf("UC ");
			if (p->md_attr & EFI_MD_ATTR_WC)
				printf("WC ");
			if (p->md_attr & EFI_MD_ATTR_WT)
				printf("WT ");
			if (p->md_attr & EFI_MD_ATTR_WB)
				printf("WB ");
			if (p->md_attr & EFI_MD_ATTR_UCE)
				printf("UCE ");
			if (p->md_attr & EFI_MD_ATTR_WP)
				printf("WP ");
			if (p->md_attr & EFI_MD_ATTR_RP)
				printf("RP ");
			if (p->md_attr & EFI_MD_ATTR_XP)
				printf("XP ");
			if (p->md_attr & EFI_MD_ATTR_NV)
				printf("NV ");
			if (p->md_attr & EFI_MD_ATTR_MORE_RELIABLE)
				printf("MORE_RELIABLE ");
			if (p->md_attr & EFI_MD_ATTR_RO)
				printf("RO ");
			if (p->md_attr & EFI_MD_ATTR_RT)
				printf("RUNTIME");
			printf("\n");
		}

		switch (p->md_type) {
		case EFI_MD_TYPE_CODE:
		case EFI_MD_TYPE_DATA:
		case EFI_MD_TYPE_BS_CODE:
		case EFI_MD_TYPE_BS_DATA:
		case EFI_MD_TYPE_FREE:
			/*
			 * We're allowed to use any entry with these types.
			 */
			break;
		default:
			continue;
		}

		/* Stop early if the physmap array is full. */
		if (!add_physmap_entry(p->md_phys, (p->md_pages * PAGE_SIZE),
		    physmap, physmap_idx))
			break;
	}
}
 1174 
 1175 static char bootmethod[16] = "";
 1176 SYSCTL_STRING(_machdep, OID_AUTO, bootmethod, CTLFLAG_RD, bootmethod, 0,
 1177     "System firmware boot method");
 1178 
 1179 static void
 1180 native_parse_memmap(caddr_t kmdp, vm_paddr_t *physmap, int *physmap_idx)
 1181 {
 1182         struct bios_smap *smap;
 1183         struct efi_map_header *efihdr;
 1184         u_int32_t size;
 1185 
 1186         /*
 1187          * Memory map from INT 15:E820.
 1188          *
 1189          * subr_module.c says:
 1190          * "Consumer may safely assume that size value precedes data."
 1191          * ie: an int32_t immediately precedes smap.
 1192          */
 1193 
 1194         efihdr = (struct efi_map_header *)preload_search_info(kmdp,
 1195             MODINFO_METADATA | MODINFOMD_EFI_MAP);
 1196         smap = (struct bios_smap *)preload_search_info(kmdp,
 1197             MODINFO_METADATA | MODINFOMD_SMAP);
 1198         if (efihdr == NULL && smap == NULL)
 1199                 panic("No BIOS smap or EFI map info from loader!");
 1200 
 1201         if (efihdr != NULL) {
 1202                 add_efi_map_entries(efihdr, physmap, physmap_idx);
 1203                 strlcpy(bootmethod, "UEFI", sizeof(bootmethod));
 1204         } else {
 1205                 size = *((u_int32_t *)smap - 1);
 1206                 bios_add_smap_entries(smap, size, physmap, physmap_idx);
 1207                 strlcpy(bootmethod, "BIOS", sizeof(bootmethod));
 1208         }
 1209 }
 1210 
 1211 #define PAGES_PER_GB    (1024 * 1024 * 1024 / PAGE_SIZE)
 1212 
 1213 /*
 1214  * Populate the (physmap) array with base/bound pairs describing the
 1215  * available physical memory in the system, then test this memory and
 1216  * build the phys_avail array describing the actually-available memory.
 1217  *
 1218  * Total memory size may be set by the kernel environment variable
 1219  * hw.physmem or the compile-time define MAXMEM.
 1220  *
 1221  * XXX first should be vm_paddr_t.
 1222  */
static void
getmemsize(caddr_t kmdp, u_int64_t first)
{
	int i, physmap_idx, pa_indx, da_indx;
	vm_paddr_t pa, physmap[PHYSMAP_SIZE];
	u_long physmem_start, physmem_tunable, memtest;
	pt_entry_t *pte;
	quad_t dcons_addr, dcons_size;
	int page_counter;

	bzero(physmap, sizeof(physmap));
	physmap_idx = 0;

	/* Fill physmap from the loader's BIOS/EFI memory map. */
	init_ops.parse_memmap(kmdp, physmap, &physmap_idx);
	/* Step back from the next-free slot to the last valid pair. */
	physmap_idx -= 2;

	/*
	 * Find the 'base memory' segment for SMP
	 */
	basemem = 0;
	for (i = 0; i <= physmap_idx; i += 2) {
		if (physmap[i] <= 0xA0000) {
			basemem = physmap[i + 1] / 1024;
			break;
		}
	}
	if (basemem == 0 || basemem > 640) {
		if (bootverbose)
			printf(
		"Memory map doesn't contain a basemem segment, faking it");
		basemem = 640;
	}

	/*
	 * Make hole for "AP -> long mode" bootstrap code.  The
	 * mp_bootaddress vector is only available when the kernel
	 * is configured to support APs and APs for the system start
	 * in 32bit mode (e.g. SMP bare metal).
	 */
	if (init_ops.mp_bootaddress) {
		/* AP trampoline must live in the first segment, below 4GB. */
		if (physmap[1] >= 0x100000000)
			panic(
	"Basemem segment is not suitable for AP bootstrap code!");
		physmap[1] = init_ops.mp_bootaddress(physmap[1] / 1024);
	}

	/*
	 * Maxmem isn't the "maximum memory", it's one larger than the
	 * highest page of the physical address space.  It should be
	 * called something like "Maxphyspage".  We may adjust this
	 * based on ``hw.physmem'' and the results of the memory test.
	 */
	Maxmem = atop(physmap[physmap_idx + 1]);

#ifdef MAXMEM
	/* Compile-time cap; MAXMEM is in KB, so /4 yields 4KB pages. */
	Maxmem = MAXMEM / 4;
#endif

	/* hw.physmem tunable overrides, in bytes. */
	if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable))
		Maxmem = atop(physmem_tunable);

	/*
	 * The boot memory test is disabled by default, as it takes a
	 * significant amount of time on large-memory systems, and is
	 * unfriendly to virtual machines as it unnecessarily touches all
	 * pages.
	 *
	 * A general name is used as the code may be extended to support
	 * additional tests beyond the current "page present" test.
	 */
	memtest = 0;
	TUNABLE_ULONG_FETCH("hw.memtest.tests", &memtest);

	/*
	 * Don't allow MAXMEM or hw.physmem to extend the amount of memory
	 * in the system.
	 */
	if (Maxmem > atop(physmap[physmap_idx + 1]))
		Maxmem = atop(physmap[physmap_idx + 1]);

	if (atop(physmap[physmap_idx + 1]) != Maxmem &&
	    (boothowto & RB_VERBOSE))
		printf("Physical memory use set to %ldK\n", Maxmem * 4);

	/* call pmap initialization to make new kernel address space */
	pmap_bootstrap(&first);

	/*
	 * Size up each available chunk of physical memory.
	 *
	 * XXX Some BIOSes corrupt low 64KB between suspend and resume.
	 * By default, mask off the first 16 pages unless we appear to be
	 * running in a VM.
	 */
	physmem_start = (vm_guest > VM_GUEST_NO ? 1 : 16) << PAGE_SHIFT;
	TUNABLE_ULONG_FETCH("hw.physmem.start", &physmem_start);
	if (physmap[0] < physmem_start) {
		if (physmem_start < PAGE_SIZE)
			physmap[0] = PAGE_SIZE;
		else if (physmem_start >= physmap[1])
			physmap[0] = round_page(physmap[1] - PAGE_SIZE);
		else
			physmap[0] = round_page(physmem_start);
	}
	pa_indx = 0;
	da_indx = 1;
	phys_avail[pa_indx++] = physmap[0];
	phys_avail[pa_indx] = physmap[0];
	dump_avail[da_indx] = physmap[0];
	/* CMAP1/CADDR1: scratch PTE and VA used to probe each page below. */
	pte = CMAP1;

	/*
	 * Get dcons buffer address
	 */
	if (getenv_quad("dcons.addr", &dcons_addr) == 0 ||
	    getenv_quad("dcons.size", &dcons_size) == 0)
		dcons_addr = 0;

	/*
	 * physmap is in bytes, so when converting to page boundaries,
	 * round up the start address and round down the end address.
	 */
	page_counter = 0;
	if (memtest != 0)
		printf("Testing system memory");
	for (i = 0; i <= physmap_idx; i += 2) {
		vm_paddr_t end;

		end = ptoa((vm_paddr_t)Maxmem);
		if (physmap[i + 1] < end)
			end = trunc_page(physmap[i + 1]);
		for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) {
			int tmp, page_bad, full;
			int *ptr = (int *)CADDR1;

			full = FALSE;
			/*
			 * block out kernel memory as not available.
			 */
			if (pa >= (vm_paddr_t)kernphys && pa < first)
				goto do_dump_avail;

			/*
			 * block out dcons buffer
			 */
			if (dcons_addr > 0
			    && pa >= trunc_page(dcons_addr)
			    && pa < dcons_addr + dcons_size)
				goto do_dump_avail;

			page_bad = FALSE;
			if (memtest == 0)
				goto skip_memtest;

			/*
			 * Print a "." every GB to show we're making
			 * progress.
			 */
			page_counter++;
			if ((page_counter % PAGES_PER_GB) == 0)
				printf(".");

			/*
			 * map page into kernel: valid, read/write,non-cacheable
			 */
			*pte = pa | PG_V | PG_RW | PG_NC_PWT | PG_NC_PCD;
			invltlb();

			tmp = *(int *)ptr;
			/*
			 * Test for alternating 1's and 0's
			 */
			*(volatile int *)ptr = 0xaaaaaaaa;
			if (*(volatile int *)ptr != 0xaaaaaaaa)
				page_bad = TRUE;
			/*
			 * Test for alternating 0's and 1's
			 */
			*(volatile int *)ptr = 0x55555555;
			if (*(volatile int *)ptr != 0x55555555)
				page_bad = TRUE;
			/*
			 * Test for all 1's
			 */
			*(volatile int *)ptr = 0xffffffff;
			if (*(volatile int *)ptr != 0xffffffff)
				page_bad = TRUE;
			/*
			 * Test for all 0's
			 */
			*(volatile int *)ptr = 0x0;
			if (*(volatile int *)ptr != 0x0)
				page_bad = TRUE;
			/*
			 * Restore original value.
			 */
			*(int *)ptr = tmp;

skip_memtest:
			/*
			 * Adjust array of valid/good pages.
			 */
			if (page_bad == TRUE)
				continue;
			/*
			 * If this good page is a continuation of the
			 * previous set of good pages, then just increase
			 * the end pointer. Otherwise start a new chunk.
			 * Note that "end" points one higher than end,
			 * making the range >= start and < end.
			 * If we're also doing a speculative memory
			 * test and we at or past the end, bump up Maxmem
			 * so that we keep going. The first bad page
			 * will terminate the loop.
			 */
			if (phys_avail[pa_indx] == pa) {
				phys_avail[pa_indx] += PAGE_SIZE;
			} else {
				pa_indx++;
				if (pa_indx == PHYS_AVAIL_ARRAY_END) {
					printf(
		"Too many holes in the physical address space, giving up\n");
					pa_indx--;
					full = TRUE;
					goto do_dump_avail;
				}
				phys_avail[pa_indx++] = pa;	/* start */
				phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */
			}
			physmem++;
do_dump_avail:
			/* dump_avail also covers pages excluded above. */
			if (dump_avail[da_indx] == pa) {
				dump_avail[da_indx] += PAGE_SIZE;
			} else {
				da_indx++;
				if (da_indx == DUMP_AVAIL_ARRAY_END) {
					da_indx--;
					goto do_next;
				}
				dump_avail[da_indx++] = pa; /* start */
				dump_avail[da_indx] = pa + PAGE_SIZE; /* end */
			}
do_next:
			if (full)
				break;
		}
	}
	/* Tear down the scratch mapping used by the memory test. */
	*pte = 0;
	invltlb();
	if (memtest != 0)
		printf("\n");

	/*
	 * XXX
	 * The last chunk must contain at least one page plus the message
	 * buffer to avoid complicating other code (message buffer address
	 * calculation, etc.).
	 */
	while (phys_avail[pa_indx - 1] + PAGE_SIZE +
	    round_page(msgbufsize) >= phys_avail[pa_indx]) {
		physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]);
		phys_avail[pa_indx--] = 0;
		phys_avail[pa_indx--] = 0;
	}

	Maxmem = atop(phys_avail[pa_indx]);

	/* Trim off space for the message buffer. */
	phys_avail[pa_indx] -= round_page(msgbufsize);

	/* Map the message buffer. */
	msgbufp = (struct msgbuf *)PHYS_TO_DMAP(phys_avail[pa_indx]);
}
 1496 
/*
 * Locate the loader-supplied preload metadata and initialize early
 * boot state derived from it: boot flags, the static kernel
 * environment, optional DDB symbol tables, and the EFI system table
 * address.  Returns the "elf kernel" module metadata pointer.
 */
static caddr_t
native_parse_preload_data(u_int64_t modulep)
{
	caddr_t kmdp;
	char *envp;
#ifdef DDB
	vm_offset_t ksym_start;
	vm_offset_t ksym_end;
#endif

	/* modulep is a physical address; +KERNBASE makes it a kernel VA. */
	preload_metadata = (caddr_t)(uintptr_t)(modulep + KERNBASE);
	preload_bootstrap_relocate(KERNBASE);
	kmdp = preload_search_by_type("elf kernel");
	if (kmdp == NULL)
		kmdp = preload_search_by_type("elf64 kernel");
	boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int);
	envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *);
	if (envp != NULL)
		envp += KERNBASE;	/* same physical -> virtual fixup */
	init_static_kenv(envp, 0);
#ifdef DDB
	/* Hand the kernel symbol table bounds to the debugger, if present. */
	ksym_start = MD_FETCH(kmdp, MODINFOMD_SSYM, uintptr_t);
	ksym_end = MD_FETCH(kmdp, MODINFOMD_ESYM, uintptr_t);
	db_fetch_ksymtab(ksym_start, ksym_end);
#endif
	efi_systbl_phys = MD_FETCH(kmdp, MODINFOMD_FW_HANDLE, vm_paddr_t);

	return (kmdp);
}
 1526 
 1527 static void
 1528 amd64_kdb_init(void)
 1529 {
 1530         kdb_init();
 1531 #ifdef KDB
 1532         if (boothowto & RB_KDB)
 1533                 kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger");
 1534 #endif
 1535 }
 1536 
 1537 /* Set up the fast syscall stuff */
 1538 void
 1539 amd64_conf_fast_syscall(void)
 1540 {
 1541         uint64_t msr;
 1542 
 1543         msr = rdmsr(MSR_EFER) | EFER_SCE;
 1544         wrmsr(MSR_EFER, msr);
 1545         wrmsr(MSR_LSTAR, pti ? (u_int64_t)IDTVEC(fast_syscall_pti) :
 1546             (u_int64_t)IDTVEC(fast_syscall));
 1547         wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32));
 1548         msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
 1549             ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48);
 1550         wrmsr(MSR_STAR, msr);
 1551         wrmsr(MSR_SF_MASK, PSL_NT | PSL_T | PSL_I | PSL_C | PSL_D);
 1552 }
 1553 
/*
 * Early machine-dependent bootstrap for the BSP, entered from locore
 * with the loader metadata pointer (modulep) and the first free
 * physical address (physfree).  Sets up thread0's stack, the GDT/IDT,
 * per-CPU data, TSS/IST stacks, fast syscall MSRs, console, memory
 * sizing and FPU state, and returns the address of thread0's PCB for
 * locore to use as the initial kernel stack pointer.
 */
u_int64_t
hammer_time(u_int64_t modulep, u_int64_t physfree)
{
        caddr_t kmdp;
        int gsel_tss, x;
        struct pcpu *pc;
        struct nmi_pcpu *np;
        struct xstate_hdr *xhdr;
        u_int64_t rsp0;
        char *env;
        size_t kstack0_sz;
        int late_console;

        /*
         * This may be done better later if it gets more high level
         * components in it. If so just link td->td_proc here.
         */
        proc_linkup0(&proc0, &thread0);

        /* Locate the preloaded module metadata passed by the loader. */
        kmdp = init_ops.parse_preload_data(modulep);

        identify_cpu1();

        /* Init basic tunables, hz etc */
        init_param1();

        /* Carve thread0's kernel stack out of free physical memory. */
        thread0.td_kstack = physfree + KERNBASE;
        thread0.td_kstack_pages = kstack_pages;
        kstack0_sz = thread0.td_kstack_pages * PAGE_SIZE;
        bzero((void *)thread0.td_kstack, kstack0_sz);
        physfree += kstack0_sz;

        /*
         * make gdt memory segments
         */
        for (x = 0; x < NGDT; x++) {
                /*
                 * Skip the system-segment slots (TSS, user LDT); they
                 * are 16-byte descriptors occupying two entries each.
                 */
                if (x != GPROC0_SEL && x != (GPROC0_SEL + 1) &&
                    x != GUSERLDT_SEL && x != (GUSERLDT_SEL) + 1)
                        ssdtosd(&gdt_segs[x], &gdt[x]);
        }
        gdt_segs[GPROC0_SEL].ssd_base = (uintptr_t)&common_tss[0];
        ssdtosyssd(&gdt_segs[GPROC0_SEL],
            (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);

        r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
        r_gdt.rd_base =  (long) gdt;
        lgdt(&r_gdt);
        pc = &__pcpu[0];

        wrmsr(MSR_FSBASE, 0);           /* User value */
        wrmsr(MSR_GSBASE, (u_int64_t)pc);
        wrmsr(MSR_KGSBASE, 0);          /* User value while in the kernel */

        pcpu_init(pc, 0, sizeof(struct pcpu));
        dpcpu_init((void *)(physfree + KERNBASE), 0);
        physfree += DPCPU_SIZE;
        PCPU_SET(prvspace, pc);
        PCPU_SET(curthread, &thread0);
        /* Non-late cninit() and printf() can be moved up to here. */
        PCPU_SET(tssp, &common_tss[0]);
        PCPU_SET(commontssp, &common_tss[0]);
        PCPU_SET(tss, (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);
        PCPU_SET(ldt, (struct system_segment_descriptor *)&gdt[GUSERLDT_SEL]);
        PCPU_SET(fs32p, &gdt[GUFS32_SEL]);
        PCPU_SET(gs32p, &gdt[GUGS32_SEL]);

        /*
         * Initialize mutexes.
         *
         * icu_lock: in order to allow an interrupt to occur in a critical
         *           section, to set pcpu->ipending (etc...) properly, we
         *           must be able to get the icu lock, so it can't be
         *           under witness.
         */
        mutex_init();
        mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS);
        mtx_init(&dt_lock, "descriptor tables", NULL, MTX_DEF);

        /* exceptions */
        pti = pti_get_default();
        TUNABLE_INT_FETCH("vm.pmap.pti", &pti);

        /* Point every vector at the reserved handler, then fill in. */
        for (x = 0; x < NIDT; x++)
                setidt(x, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd), SDT_SYSIGT,
                    SEL_KPL, 0);
        setidt(IDT_DE, pti ? &IDTVEC(div_pti) : &IDTVEC(div), SDT_SYSIGT,
            SEL_KPL, 0);
        /* DB#, NMI, DF# and MC# run on dedicated IST stacks (set below). */
        setidt(IDT_DB, &IDTVEC(dbg), SDT_SYSIGT, SEL_KPL, 4);
        setidt(IDT_NMI, &IDTVEC(nmi),  SDT_SYSIGT, SEL_KPL, 2);
        /* BP# is callable from user mode (int3). */
        setidt(IDT_BP, pti ? &IDTVEC(bpt_pti) : &IDTVEC(bpt), SDT_SYSIGT,
            SEL_UPL, 0);
        setidt(IDT_OF, pti ? &IDTVEC(ofl_pti) : &IDTVEC(ofl), SDT_SYSIGT,
            SEL_KPL, 0);
        setidt(IDT_BR, pti ? &IDTVEC(bnd_pti) : &IDTVEC(bnd), SDT_SYSIGT,
            SEL_KPL, 0);
        setidt(IDT_UD, pti ? &IDTVEC(ill_pti) : &IDTVEC(ill), SDT_SYSIGT,
            SEL_KPL, 0);
        setidt(IDT_NM, pti ? &IDTVEC(dna_pti) : &IDTVEC(dna), SDT_SYSIGT,
            SEL_KPL, 0);
        setidt(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1);
        setidt(IDT_FPUGP, pti ? &IDTVEC(fpusegm_pti) : &IDTVEC(fpusegm),
            SDT_SYSIGT, SEL_KPL, 0);
        setidt(IDT_TS, pti ? &IDTVEC(tss_pti) : &IDTVEC(tss), SDT_SYSIGT,
            SEL_KPL, 0);
        setidt(IDT_NP, pti ? &IDTVEC(missing_pti) : &IDTVEC(missing),
            SDT_SYSIGT, SEL_KPL, 0);
        setidt(IDT_SS, pti ? &IDTVEC(stk_pti) : &IDTVEC(stk), SDT_SYSIGT,
            SEL_KPL, 0);
        setidt(IDT_GP, pti ? &IDTVEC(prot_pti) : &IDTVEC(prot), SDT_SYSIGT,
            SEL_KPL, 0);
        setidt(IDT_PF, pti ? &IDTVEC(page_pti) : &IDTVEC(page), SDT_SYSIGT,
            SEL_KPL, 0);
        setidt(IDT_MF, pti ? &IDTVEC(fpu_pti) : &IDTVEC(fpu), SDT_SYSIGT,
            SEL_KPL, 0);
        setidt(IDT_AC, pti ? &IDTVEC(align_pti) : &IDTVEC(align), SDT_SYSIGT,
            SEL_KPL, 0);
        setidt(IDT_MC, &IDTVEC(mchk), SDT_SYSIGT, SEL_KPL, 3);
        setidt(IDT_XF, pti ? &IDTVEC(xmm_pti) : &IDTVEC(xmm), SDT_SYSIGT,
            SEL_KPL, 0);
#ifdef KDTRACE_HOOKS
        setidt(IDT_DTRACE_RET, pti ? &IDTVEC(dtrace_ret_pti) :
            &IDTVEC(dtrace_ret), SDT_SYSIGT, SEL_UPL, 0);
#endif
#ifdef XENHVM
        setidt(IDT_EVTCHN, pti ? &IDTVEC(xen_intr_upcall_pti) :
            &IDTVEC(xen_intr_upcall), SDT_SYSIGT, SEL_KPL, 0);
#endif
        r_idt.rd_limit = sizeof(idt0) - 1;
        r_idt.rd_base = (long) idt;
        lidt(&r_idt);

        /*
         * Initialize the clock before the console so that console
         * initialization can use DELAY().
         */
        clock_init();

        /*
         * Use vt(4) by default for UEFI boot (during the sc(4)/vt(4)
         * transition).
         * Once bootblocks have updated, we can test directly for
         * efi_systbl != NULL here...
         */
        if (preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_EFI_MAP)
            != NULL)
                vty_set_preferred(VTY_VT);

        finishidentcpu();       /* Final stage of CPU initialization */
        initializecpu();        /* Initialize CPU registers */
        initializecpucache();

        /* doublefault stack space, runs on ist1 */
        common_tss[0].tss_ist1 = (long)&dblfault_stack[sizeof(dblfault_stack)];

        /*
         * NMI stack, runs on ist2.  The pcpu pointer is stored just
         * above the start of the ist2 stack.
         */
        np = ((struct nmi_pcpu *) &nmi0_stack[sizeof(nmi0_stack)]) - 1;
        np->np_pcpu = (register_t) pc;
        common_tss[0].tss_ist2 = (long) np;

        /*
         * MC# stack, runs on ist3.  The pcpu pointer is stored just
         * above the start of the ist3 stack.
         */
        np = ((struct nmi_pcpu *) &mce0_stack[sizeof(mce0_stack)]) - 1;
        np->np_pcpu = (register_t) pc;
        common_tss[0].tss_ist3 = (long) np;

        /*
         * DB# stack, runs on ist4.
         */
        np = ((struct nmi_pcpu *) &dbg0_stack[sizeof(dbg0_stack)]) - 1;
        np->np_pcpu = (register_t) pc;
        common_tss[0].tss_ist4 = (long) np;

        /* Set the IO permission bitmap (empty due to tss seg limit) */
        common_tss[0].tss_iobase = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE;

        /* Load the task register with the BSP's TSS selector. */
        gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
        ltr(gsel_tss);

        amd64_conf_fast_syscall();

        /*
         * Temporary forge some valid pointer to PCB, for exception
         * handlers.  It is reinitialized properly below after FPU is
         * set up.  Also set up td_critnest to short-cut the page
         * fault handler.
         */
        cpu_max_ext_state_size = sizeof(struct savefpu);
        thread0.td_pcb = get_pcb_td(&thread0);
        thread0.td_critnest = 1;

        /*
         * The console and kdb should be initialized even earlier than here,
         * but some console drivers don't work until after getmemsize().
         * Default to late console initialization to support these drivers.
         * This loses mainly printf()s in getmemsize() and early debugging.
         */
        late_console = 1;
        TUNABLE_INT_FETCH("debug.late_console", &late_console);
        if (!late_console) {
                cninit();
                amd64_kdb_init();
        }

        getmemsize(kmdp, physfree);
        init_param2(physmem);

        /* now running on new page tables, configured,and u/iom is accessible */

        if (late_console)
                cninit();

#ifdef DEV_ISA
#ifdef DEV_ATPIC
        elcr_probe();
        atpic_startup();
#else
        /* Reset and mask the atpics and leave them shut down. */
        atpic_reset();

        /*
         * Point the ICU spurious interrupt vectors at the APIC spurious
         * interrupt handler.
         */
        setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
        setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
#endif
#else
#error "have you forgotten the isa device?";
#endif

        if (late_console)
                amd64_kdb_init();

        msgbufinit(msgbufp, msgbufsize);
        fpuinit();

        /*
         * Set up thread0 pcb after fpuinit calculated pcb + fpu save
         * area size.  Zero out the extended state header in fpu save
         * area.
         */
        thread0.td_pcb = get_pcb_td(&thread0);
        thread0.td_pcb->pcb_save = get_pcb_user_save_td(&thread0);
        bzero(get_pcb_user_save_td(&thread0), cpu_max_ext_state_size);
        if (use_xsave) {
                xhdr = (struct xstate_hdr *)(get_pcb_user_save_td(&thread0) +
                    1);
                xhdr->xstate_bv = xsave_mask;
        }
        /* make an initial tss so cpu can get interrupt stack on syscall! */
        rsp0 = (vm_offset_t)thread0.td_pcb;
        /* Ensure the stack is aligned to 16 bytes */
        rsp0 &= ~0xFul;
        /* With PTI, ring-0 entries land on the per-CPU PTI stack instead. */
        common_tss[0].tss_rsp0 = pti ? ((vm_offset_t)PCPU_PTR(pti_stack) +
            PC_PTI_STACK_SZ * sizeof(uint64_t)) & ~0xful : rsp0;
        PCPU_SET(rsp0, rsp0);
        PCPU_SET(curpcb, thread0.td_pcb);

        /* transfer to user mode */

        _ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
        _udatasel = GSEL(GUDATA_SEL, SEL_UPL);
        _ucode32sel = GSEL(GUCODE32_SEL, SEL_UPL);
        _ufssel = GSEL(GUFS32_SEL, SEL_UPL);
        _ugssel = GSEL(GUGS32_SEL, SEL_UPL);

        load_ds(_udatasel);
        load_es(_udatasel);
        load_fs(_ufssel);

        /* setup proc 0's pcb */
        thread0.td_pcb->pcb_flags = 0;
        thread0.td_frame = &proc0_tf;

        env = kern_getenv("kernelname");
        if (env != NULL)
                strlcpy(kernelname, env, sizeof(kernelname));

        cpu_probe_amdc1e();

#ifdef FDT
        x86_init_fdt();
#endif
        /* Early bootstrap is done; allow normal preemption again. */
        thread0.td_critnest = 0;

        TUNABLE_INT_FETCH("hw.ibrs_disable", &hw_ibrs_disable);

        /* Location of kernel stack for locore */
        return ((u_int64_t)thread0.td_pcb);
}
 1849 
 1850 void
 1851 cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size)
 1852 {
 1853 
 1854         pcpu->pc_acpi_id = 0xffffffff;
 1855 }
 1856 
 1857 static int
 1858 smap_sysctl_handler(SYSCTL_HANDLER_ARGS)
 1859 {
 1860         struct bios_smap *smapbase;
 1861         struct bios_smap_xattr smap;
 1862         caddr_t kmdp;
 1863         uint32_t *smapattr;
 1864         int count, error, i;
 1865 
 1866         /* Retrieve the system memory map from the loader. */
 1867         kmdp = preload_search_by_type("elf kernel");
 1868         if (kmdp == NULL)
 1869                 kmdp = preload_search_by_type("elf64 kernel");
 1870         smapbase = (struct bios_smap *)preload_search_info(kmdp,
 1871             MODINFO_METADATA | MODINFOMD_SMAP);
 1872         if (smapbase == NULL)
 1873                 return (0);
 1874         smapattr = (uint32_t *)preload_search_info(kmdp,
 1875             MODINFO_METADATA | MODINFOMD_SMAP_XATTR);
 1876         count = *((uint32_t *)smapbase - 1) / sizeof(*smapbase);
 1877         error = 0;
 1878         for (i = 0; i < count; i++) {
 1879                 smap.base = smapbase[i].base;
 1880                 smap.length = smapbase[i].length;
 1881                 smap.type = smapbase[i].type;
 1882                 if (smapattr != NULL)
 1883                         smap.xattr = smapattr[i];
 1884                 else
 1885                         smap.xattr = 0;
 1886                 error = SYSCTL_OUT(req, &smap, sizeof(smap));
 1887         }
 1888         return (error);
 1889 }
 1890 SYSCTL_PROC(_machdep, OID_AUTO, smap, CTLTYPE_OPAQUE|CTLFLAG_RD, NULL, 0,
 1891     smap_sysctl_handler, "S,bios_smap_xattr", "Raw BIOS SMAP data");
 1892 
 1893 static int
 1894 efi_map_sysctl_handler(SYSCTL_HANDLER_ARGS)
 1895 {
 1896         struct efi_map_header *efihdr;
 1897         caddr_t kmdp;
 1898         uint32_t efisize;
 1899 
 1900         kmdp = preload_search_by_type("elf kernel");
 1901         if (kmdp == NULL)
 1902                 kmdp = preload_search_by_type("elf64 kernel");
 1903         efihdr = (struct efi_map_header *)preload_search_info(kmdp,
 1904             MODINFO_METADATA | MODINFOMD_EFI_MAP);
 1905         if (efihdr == NULL)
 1906                 return (0);
 1907         efisize = *((uint32_t *)efihdr - 1);
 1908         return (SYSCTL_OUT(req, efihdr, efisize));
 1909 }
 1910 SYSCTL_PROC(_machdep, OID_AUTO, efi_map, CTLTYPE_OPAQUE|CTLFLAG_RD, NULL, 0,
 1911     efi_map_sysctl_handler, "S,efi_map_header", "Raw EFI Memory Map");
 1912 
 1913 void
 1914 spinlock_enter(void)
 1915 {
 1916         struct thread *td;
 1917         register_t flags;
 1918 
 1919         td = curthread;
 1920         if (td->td_md.md_spinlock_count == 0) {
 1921                 flags = intr_disable();
 1922                 td->td_md.md_spinlock_count = 1;
 1923                 td->td_md.md_saved_flags = flags;
 1924         } else
 1925                 td->td_md.md_spinlock_count++;
 1926         critical_enter();
 1927 }
 1928 
 1929 void
 1930 spinlock_exit(void)
 1931 {
 1932         struct thread *td;
 1933         register_t flags;
 1934 
 1935         td = curthread;
 1936         critical_exit();
 1937         flags = td->td_md.md_saved_flags;
 1938         td->td_md.md_spinlock_count--;
 1939         if (td->td_md.md_spinlock_count == 0)
 1940                 intr_restore(flags);
 1941 }
 1942 
 1943 /*
 1944  * Construct a PCB from a trapframe. This is called from kdb_trap() where
 1945  * we want to start a backtrace from the function that caused us to enter
 1946  * the debugger. We have the context in the trapframe, but base the trace
 1947  * on the PCB. The PCB doesn't have to be perfect, as long as it contains
 1948  * enough for a backtrace.
 1949  */
 1950 void
 1951 makectx(struct trapframe *tf, struct pcb *pcb)
 1952 {
 1953 
 1954         pcb->pcb_r12 = tf->tf_r12;
 1955         pcb->pcb_r13 = tf->tf_r13;
 1956         pcb->pcb_r14 = tf->tf_r14;
 1957         pcb->pcb_r15 = tf->tf_r15;
 1958         pcb->pcb_rbp = tf->tf_rbp;
 1959         pcb->pcb_rbx = tf->tf_rbx;
 1960         pcb->pcb_rip = tf->tf_rip;
 1961         pcb->pcb_rsp = tf->tf_rsp;
 1962 }
 1963 
 1964 int
 1965 ptrace_set_pc(struct thread *td, unsigned long addr)
 1966 {
 1967 
 1968         td->td_frame->tf_rip = addr;
 1969         set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
 1970         return (0);
 1971 }
 1972 
 1973 int
 1974 ptrace_single_step(struct thread *td)
 1975 {
 1976         td->td_frame->tf_rflags |= PSL_T;
 1977         return (0);
 1978 }
 1979 
 1980 int
 1981 ptrace_clear_single_step(struct thread *td)
 1982 {
 1983         td->td_frame->tf_rflags &= ~PSL_T;
 1984         return (0);
 1985 }
 1986 
 1987 int
 1988 fill_regs(struct thread *td, struct reg *regs)
 1989 {
 1990         struct trapframe *tp;
 1991 
 1992         tp = td->td_frame;
 1993         return (fill_frame_regs(tp, regs));
 1994 }
 1995 
 1996 int
 1997 fill_frame_regs(struct trapframe *tp, struct reg *regs)
 1998 {
 1999         regs->r_r15 = tp->tf_r15;
 2000         regs->r_r14 = tp->tf_r14;
 2001         regs->r_r13 = tp->tf_r13;
 2002         regs->r_r12 = tp->tf_r12;
 2003         regs->r_r11 = tp->tf_r11;
 2004         regs->r_r10 = tp->tf_r10;
 2005         regs->r_r9  = tp->tf_r9;
 2006         regs->r_r8  = tp->tf_r8;
 2007         regs->r_rdi = tp->tf_rdi;
 2008         regs->r_rsi = tp->tf_rsi;
 2009         regs->r_rbp = tp->tf_rbp;
 2010         regs->r_rbx = tp->tf_rbx;
 2011         regs->r_rdx = tp->tf_rdx;
 2012         regs->r_rcx = tp->tf_rcx;
 2013         regs->r_rax = tp->tf_rax;
 2014         regs->r_rip = tp->tf_rip;
 2015         regs->r_cs = tp->tf_cs;
 2016         regs->r_rflags = tp->tf_rflags;
 2017         regs->r_rsp = tp->tf_rsp;
 2018         regs->r_ss = tp->tf_ss;
 2019         if (tp->tf_flags & TF_HASSEGS) {
 2020                 regs->r_ds = tp->tf_ds;
 2021                 regs->r_es = tp->tf_es;
 2022                 regs->r_fs = tp->tf_fs;
 2023                 regs->r_gs = tp->tf_gs;
 2024         } else {
 2025                 regs->r_ds = 0;
 2026                 regs->r_es = 0;
 2027                 regs->r_fs = 0;
 2028                 regs->r_gs = 0;
 2029         }
 2030         return (0);
 2031 }
 2032 
 2033 int
 2034 set_regs(struct thread *td, struct reg *regs)
 2035 {
 2036         struct trapframe *tp;
 2037         register_t rflags;
 2038 
 2039         tp = td->td_frame;
 2040         rflags = regs->r_rflags & 0xffffffff;
 2041         if (!EFL_SECURE(rflags, tp->tf_rflags) || !CS_SECURE(regs->r_cs))
 2042                 return (EINVAL);
 2043         tp->tf_r15 = regs->r_r15;
 2044         tp->tf_r14 = regs->r_r14;
 2045         tp->tf_r13 = regs->r_r13;
 2046         tp->tf_r12 = regs->r_r12;
 2047         tp->tf_r11 = regs->r_r11;
 2048         tp->tf_r10 = regs->r_r10;
 2049         tp->tf_r9  = regs->r_r9;
 2050         tp->tf_r8  = regs->r_r8;
 2051         tp->tf_rdi = regs->r_rdi;
 2052         tp->tf_rsi = regs->r_rsi;
 2053         tp->tf_rbp = regs->r_rbp;
 2054         tp->tf_rbx = regs->r_rbx;
 2055         tp->tf_rdx = regs->r_rdx;
 2056         tp->tf_rcx = regs->r_rcx;
 2057         tp->tf_rax = regs->r_rax;
 2058         tp->tf_rip = regs->r_rip;
 2059         tp->tf_cs = regs->r_cs;
 2060         tp->tf_rflags = rflags;
 2061         tp->tf_rsp = regs->r_rsp;
 2062         tp->tf_ss = regs->r_ss;
 2063         if (0) {        /* XXXKIB */
 2064                 tp->tf_ds = regs->r_ds;
 2065                 tp->tf_es = regs->r_es;
 2066                 tp->tf_fs = regs->r_fs;
 2067                 tp->tf_gs = regs->r_gs;
 2068                 tp->tf_flags = TF_HASSEGS;
 2069         }
 2070         set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
 2071         return (0);
 2072 }
 2073 
 2074 /* XXX check all this stuff! */
 2075 /* externalize from sv_xmm */
 2076 static void
 2077 fill_fpregs_xmm(struct savefpu *sv_xmm, struct fpreg *fpregs)
 2078 {
 2079         struct envxmm *penv_fpreg = (struct envxmm *)&fpregs->fpr_env;
 2080         struct envxmm *penv_xmm = &sv_xmm->sv_env;
 2081         int i;
 2082 
 2083         /* pcb -> fpregs */
 2084         bzero(fpregs, sizeof(*fpregs));
 2085 
 2086         /* FPU control/status */
 2087         penv_fpreg->en_cw = penv_xmm->en_cw;
 2088         penv_fpreg->en_sw = penv_xmm->en_sw;
 2089         penv_fpreg->en_tw = penv_xmm->en_tw;
 2090         penv_fpreg->en_opcode = penv_xmm->en_opcode;
 2091         penv_fpreg->en_rip = penv_xmm->en_rip;
 2092         penv_fpreg->en_rdp = penv_xmm->en_rdp;
 2093         penv_fpreg->en_mxcsr = penv_xmm->en_mxcsr;
 2094         penv_fpreg->en_mxcsr_mask = penv_xmm->en_mxcsr_mask;
 2095 
 2096         /* FPU registers */
 2097         for (i = 0; i < 8; ++i)
 2098                 bcopy(sv_xmm->sv_fp[i].fp_acc.fp_bytes, fpregs->fpr_acc[i], 10);
 2099 
 2100         /* SSE registers */
 2101         for (i = 0; i < 16; ++i)
 2102                 bcopy(sv_xmm->sv_xmm[i].xmm_bytes, fpregs->fpr_xacc[i], 16);
 2103 }
 2104 
 2105 /* internalize from fpregs into sv_xmm */
 2106 static void
 2107 set_fpregs_xmm(struct fpreg *fpregs, struct savefpu *sv_xmm)
 2108 {
 2109         struct envxmm *penv_xmm = &sv_xmm->sv_env;
 2110         struct envxmm *penv_fpreg = (struct envxmm *)&fpregs->fpr_env;
 2111         int i;
 2112 
 2113         /* fpregs -> pcb */
 2114         /* FPU control/status */
 2115         penv_xmm->en_cw = penv_fpreg->en_cw;
 2116         penv_xmm->en_sw = penv_fpreg->en_sw;
 2117         penv_xmm->en_tw = penv_fpreg->en_tw;
 2118         penv_xmm->en_opcode = penv_fpreg->en_opcode;
 2119         penv_xmm->en_rip = penv_fpreg->en_rip;
 2120         penv_xmm->en_rdp = penv_fpreg->en_rdp;
 2121         penv_xmm->en_mxcsr = penv_fpreg->en_mxcsr;
 2122         penv_xmm->en_mxcsr_mask = penv_fpreg->en_mxcsr_mask & cpu_mxcsr_mask;
 2123 
 2124         /* FPU registers */
 2125         for (i = 0; i < 8; ++i)
 2126                 bcopy(fpregs->fpr_acc[i], sv_xmm->sv_fp[i].fp_acc.fp_bytes, 10);
 2127 
 2128         /* SSE registers */
 2129         for (i = 0; i < 16; ++i)
 2130                 bcopy(fpregs->fpr_xacc[i], sv_xmm->sv_xmm[i].xmm_bytes, 16);
 2131 }
 2132 
/* externalize from td->pcb */
int
fill_fpregs(struct thread *td, struct fpreg *fpregs)
{

	/* The target thread must be stopped unless it is ourselves. */
	KASSERT(td == curthread || TD_IS_SUSPENDED(td) ||
	    P_SHOULDSTOP(td->td_proc),
	    ("not suspended thread %p", td));
	/* Flush live FPU state into the PCB save area first. */
	fpugetregs(td);
	/* Then convert the save area into the exported layout. */
	fill_fpregs_xmm(get_pcb_user_save_td(td), fpregs);
	return (0);
}
 2145 
/* internalize to td->pcb */
int
set_fpregs(struct thread *td, struct fpreg *fpregs)
{

	/* Keep the update atomic with respect to context switches. */
	critical_enter();
	set_fpregs_xmm(fpregs, get_pcb_user_save_td(td));
	/* Mark the user FPU state as initialized from the PCB copy. */
	fpuuserinited(td);
	critical_exit();
	return (0);
}
 2157 
/*
 * Get machine context.
 *
 * Snapshot the thread's trapframe, FPU state and segment bases into
 * *mcp for signal delivery / getcontext(2).  GET_MC_CLEAR_RET clears
 * the syscall return registers so a restarted context re-issues the
 * syscall cleanly.
 */
int
get_mcontext(struct thread *td, mcontext_t *mcp, int flags)
{
	struct pcb *pcb;
	struct trapframe *tp;

	pcb = td->td_pcb;
	tp = td->td_frame;
	/* sigonstack() consults p_sigacts; take the proc lock around it. */
	PROC_LOCK(curthread->td_proc);
	mcp->mc_onstack = sigonstack(tp->tf_rsp);
	PROC_UNLOCK(curthread->td_proc);
	mcp->mc_r15 = tp->tf_r15;
	mcp->mc_r14 = tp->tf_r14;
	mcp->mc_r13 = tp->tf_r13;
	mcp->mc_r12 = tp->tf_r12;
	mcp->mc_r11 = tp->tf_r11;
	mcp->mc_r10 = tp->tf_r10;
	mcp->mc_r9  = tp->tf_r9;
	mcp->mc_r8  = tp->tf_r8;
	mcp->mc_rdi = tp->tf_rdi;
	mcp->mc_rsi = tp->tf_rsi;
	mcp->mc_rbp = tp->tf_rbp;
	mcp->mc_rbx = tp->tf_rbx;
	mcp->mc_rcx = tp->tf_rcx;
	mcp->mc_rflags = tp->tf_rflags;
	if (flags & GET_MC_CLEAR_RET) {
		/* Scrub syscall return values and carry (error) flag. */
		mcp->mc_rax = 0;
		mcp->mc_rdx = 0;
		mcp->mc_rflags &= ~PSL_C;
	} else {
		mcp->mc_rax = tp->tf_rax;
		mcp->mc_rdx = tp->tf_rdx;
	}
	mcp->mc_rip = tp->tf_rip;
	mcp->mc_cs = tp->tf_cs;
	mcp->mc_rsp = tp->tf_rsp;
	mcp->mc_ss = tp->tf_ss;
	mcp->mc_ds = tp->tf_ds;
	mcp->mc_es = tp->tf_es;
	mcp->mc_fs = tp->tf_fs;
	mcp->mc_gs = tp->tf_gs;
	mcp->mc_flags = tp->tf_flags;
	mcp->mc_len = sizeof(*mcp);
	/* Legacy FPU area only; no extended-state buffer supplied here. */
	get_fpcontext(td, mcp, NULL, 0);
	mcp->mc_fsbase = pcb->pcb_fsbase;
	mcp->mc_gsbase = pcb->pcb_gsbase;
	mcp->mc_xfpustate = 0;
	mcp->mc_xfpustate_len = 0;
	bzero(mcp->mc_spare, sizeof(mcp->mc_spare));
	return (0);
}
 2212 
/*
 * Set machine context.
 *
 * However, we don't set any but the user modifiable flags, and we won't
 * touch the cs selector.
 *
 * Validates mc_len and mc_flags, copies in any extended FPU state,
 * installs the FPU context, and only then commits the register set to
 * the trapframe so a failed validation leaves the thread untouched.
 */
int
set_mcontext(struct thread *td, mcontext_t *mcp)
{
	struct pcb *pcb;
	struct trapframe *tp;
	char *xfpustate;
	long rflags;
	int ret;

	pcb = td->td_pcb;
	tp = td->td_frame;
	if (mcp->mc_len != sizeof(*mcp) ||
	    (mcp->mc_flags & ~_MC_FLAG_MASK) != 0)
		return (EINVAL);
	/* Merge user-changeable rflags bits with the kernel-owned ones. */
	rflags = (mcp->mc_rflags & PSL_USERCHANGE) |
	    (tp->tf_rflags & ~PSL_USERCHANGE);
	if (mcp->mc_flags & _MC_HASFPXSTATE) {
		/* Length is bounded by the check above before alloca. */
		if (mcp->mc_xfpustate_len > cpu_max_ext_state_size -
		    sizeof(struct savefpu))
			return (EINVAL);
		xfpustate = __builtin_alloca(mcp->mc_xfpustate_len);
		ret = copyin((void *)mcp->mc_xfpustate, xfpustate,
		    mcp->mc_xfpustate_len);
		if (ret != 0)
			return (ret);
	} else
		xfpustate = NULL;
	ret = set_fpcontext(td, mcp, xfpustate, mcp->mc_xfpustate_len);
	if (ret != 0)
		return (ret);
	tp->tf_r15 = mcp->mc_r15;
	tp->tf_r14 = mcp->mc_r14;
	tp->tf_r13 = mcp->mc_r13;
	tp->tf_r12 = mcp->mc_r12;
	tp->tf_r11 = mcp->mc_r11;
	tp->tf_r10 = mcp->mc_r10;
	tp->tf_r9  = mcp->mc_r9;
	tp->tf_r8  = mcp->mc_r8;
	tp->tf_rdi = mcp->mc_rdi;
	tp->tf_rsi = mcp->mc_rsi;
	tp->tf_rbp = mcp->mc_rbp;
	tp->tf_rbx = mcp->mc_rbx;
	tp->tf_rdx = mcp->mc_rdx;
	tp->tf_rcx = mcp->mc_rcx;
	tp->tf_rax = mcp->mc_rax;
	tp->tf_rip = mcp->mc_rip;
	tp->tf_rflags = rflags;
	tp->tf_rsp = mcp->mc_rsp;
	tp->tf_ss = mcp->mc_ss;
	tp->tf_flags = mcp->mc_flags;
	if (tp->tf_flags & TF_HASSEGS) {
		tp->tf_ds = mcp->mc_ds;
		tp->tf_es = mcp->mc_es;
		tp->tf_fs = mcp->mc_fs;
		tp->tf_gs = mcp->mc_gs;
	}
	if (mcp->mc_flags & _MC_HASBASES) {
		pcb->pcb_fsbase = mcp->mc_fsbase;
		pcb->pcb_gsbase = mcp->mc_gsbase;
	}
	/* Force a full iret so the new frame is fully reloaded. */
	set_pcb_flags(pcb, PCB_FULL_IRET);
	return (0);
}
 2282 
 2283 static void
 2284 get_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpusave,
 2285     size_t xfpusave_len)
 2286 {
 2287         size_t max_len, len;
 2288 
 2289         mcp->mc_ownedfp = fpugetregs(td);
 2290         bcopy(get_pcb_user_save_td(td), &mcp->mc_fpstate[0],
 2291             sizeof(mcp->mc_fpstate));
 2292         mcp->mc_fpformat = fpuformat();
 2293         if (!use_xsave || xfpusave_len == 0)
 2294                 return;
 2295         max_len = cpu_max_ext_state_size - sizeof(struct savefpu);
 2296         len = xfpusave_len;
 2297         if (len > max_len) {
 2298                 len = max_len;
 2299                 bzero(xfpusave + max_len, len - max_len);
 2300         }
 2301         mcp->mc_flags |= _MC_HASFPXSTATE;
 2302         mcp->mc_xfpustate_len = len;
 2303         bcopy(get_pcb_user_save_td(td) + 1, xfpusave, len);
 2304 }
 2305 
 2306 static int
 2307 set_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpustate,
 2308     size_t xfpustate_len)
 2309 {
 2310         struct savefpu *fpstate;
 2311         int error;
 2312 
 2313         if (mcp->mc_fpformat == _MC_FPFMT_NODEV)
 2314                 return (0);
 2315         else if (mcp->mc_fpformat != _MC_FPFMT_XMM)
 2316                 return (EINVAL);
 2317         else if (mcp->mc_ownedfp == _MC_FPOWNED_NONE) {
 2318                 /* We don't care what state is left in the FPU or PCB. */
 2319                 fpstate_drop(td);
 2320                 error = 0;
 2321         } else if (mcp->mc_ownedfp == _MC_FPOWNED_FPU ||
 2322             mcp->mc_ownedfp == _MC_FPOWNED_PCB) {
 2323                 fpstate = (struct savefpu *)&mcp->mc_fpstate;
 2324                 fpstate->sv_env.en_mxcsr &= cpu_mxcsr_mask;
 2325                 error = fpusetregs(td, fpstate, xfpustate, xfpustate_len);
 2326         } else
 2327                 return (EINVAL);
 2328         return (error);
 2329 }
 2330 
/*
 * Discard the thread's pending FPU state: release hardware ownership
 * if this thread holds it and clear the initialized-state flags so the
 * next FPU use starts from a clean slate.
 */
void
fpstate_drop(struct thread *td)
{

	KASSERT(PCB_USER_FPU(td->td_pcb), ("fpstate_drop: kernel-owned fpu"));
	critical_enter();
	if (PCPU_GET(fpcurthread) == td)
		fpudrop();
	/*
	 * XXX force a full drop of the fpu.  The above only drops it if we
	 * owned it.
	 *
	 * XXX I don't much like fpugetuserregs()'s semantics of doing a full
	 * drop.  Dropping only to the pcb matches fnsave's behaviour.
	 * We only need to drop to !PCB_INITDONE in sendsig().  But
	 * sendsig() is the only caller of fpugetuserregs()... perhaps we just
	 * have too many layers.
	 */
	clear_pcb_flags(curthread->td_pcb,
	    PCB_FPUINITDONE | PCB_USERFPUINITDONE);
	critical_exit();
}
 2353 
 2354 int
 2355 fill_dbregs(struct thread *td, struct dbreg *dbregs)
 2356 {
 2357         struct pcb *pcb;
 2358 
 2359         if (td == NULL) {
 2360                 dbregs->dr[0] = rdr0();
 2361                 dbregs->dr[1] = rdr1();
 2362                 dbregs->dr[2] = rdr2();
 2363                 dbregs->dr[3] = rdr3();
 2364                 dbregs->dr[6] = rdr6();
 2365                 dbregs->dr[7] = rdr7();
 2366         } else {
 2367                 pcb = td->td_pcb;
 2368                 dbregs->dr[0] = pcb->pcb_dr0;
 2369                 dbregs->dr[1] = pcb->pcb_dr1;
 2370                 dbregs->dr[2] = pcb->pcb_dr2;
 2371                 dbregs->dr[3] = pcb->pcb_dr3;
 2372                 dbregs->dr[6] = pcb->pcb_dr6;
 2373                 dbregs->dr[7] = pcb->pcb_dr7;
 2374         }
 2375         dbregs->dr[4] = 0;
 2376         dbregs->dr[5] = 0;
 2377         dbregs->dr[8] = 0;
 2378         dbregs->dr[9] = 0;
 2379         dbregs->dr[10] = 0;
 2380         dbregs->dr[11] = 0;
 2381         dbregs->dr[12] = 0;
 2382         dbregs->dr[13] = 0;
 2383         dbregs->dr[14] = 0;
 2384         dbregs->dr[15] = 0;
 2385         return (0);
 2386 }
 2387 
 2388 int
 2389 set_dbregs(struct thread *td, struct dbreg *dbregs)
 2390 {
 2391         struct pcb *pcb;
 2392         int i;
 2393 
 2394         if (td == NULL) {
 2395                 load_dr0(dbregs->dr[0]);
 2396                 load_dr1(dbregs->dr[1]);
 2397                 load_dr2(dbregs->dr[2]);
 2398                 load_dr3(dbregs->dr[3]);
 2399                 load_dr6(dbregs->dr[6]);
 2400                 load_dr7(dbregs->dr[7]);
 2401         } else {
 2402                 /*
 2403                  * Don't let an illegal value for dr7 get set.  Specifically,
 2404                  * check for undefined settings.  Setting these bit patterns
 2405                  * result in undefined behaviour and can lead to an unexpected
 2406                  * TRCTRAP or a general protection fault right here.
 2407                  * Upper bits of dr6 and dr7 must not be set
 2408                  */
 2409                 for (i = 0; i < 4; i++) {
 2410                         if (DBREG_DR7_ACCESS(dbregs->dr[7], i) == 0x02)
 2411                                 return (EINVAL);
 2412                         if (td->td_frame->tf_cs == _ucode32sel &&
 2413                             DBREG_DR7_LEN(dbregs->dr[7], i) == DBREG_DR7_LEN_8)
 2414                                 return (EINVAL);
 2415                 }
 2416                 if ((dbregs->dr[6] & 0xffffffff00000000ul) != 0 ||
 2417                     (dbregs->dr[7] & 0xffffffff00000000ul) != 0)
 2418                         return (EINVAL);
 2419 
 2420                 pcb = td->td_pcb;
 2421 
 2422                 /*
 2423                  * Don't let a process set a breakpoint that is not within the
 2424                  * process's address space.  If a process could do this, it
 2425                  * could halt the system by setting a breakpoint in the kernel
 2426                  * (if ddb was enabled).  Thus, we need to check to make sure
 2427                  * that no breakpoints are being enabled for addresses outside
 2428                  * process's address space.
 2429                  *
 2430                  * XXX - what about when the watched area of the user's
 2431                  * address space is written into from within the kernel
 2432                  * ... wouldn't that still cause a breakpoint to be generated
 2433                  * from within kernel mode?
 2434                  */
 2435 
 2436                 if (DBREG_DR7_ENABLED(dbregs->dr[7], 0)) {
 2437                         /* dr0 is enabled */
 2438                         if (dbregs->dr[0] >= VM_MAXUSER_ADDRESS)
 2439                                 return (EINVAL);
 2440                 }
 2441                 if (DBREG_DR7_ENABLED(dbregs->dr[7], 1)) {
 2442                         /* dr1 is enabled */
 2443                         if (dbregs->dr[1] >= VM_MAXUSER_ADDRESS)
 2444                                 return (EINVAL);
 2445                 }
 2446                 if (DBREG_DR7_ENABLED(dbregs->dr[7], 2)) {
 2447                         /* dr2 is enabled */
 2448                         if (dbregs->dr[2] >= VM_MAXUSER_ADDRESS)
 2449                                 return (EINVAL);
 2450                 }
 2451                 if (DBREG_DR7_ENABLED(dbregs->dr[7], 3)) {
 2452                         /* dr3 is enabled */
 2453                         if (dbregs->dr[3] >= VM_MAXUSER_ADDRESS)
 2454                                 return (EINVAL);
 2455                 }
 2456 
 2457                 pcb->pcb_dr0 = dbregs->dr[0];
 2458                 pcb->pcb_dr1 = dbregs->dr[1];
 2459                 pcb->pcb_dr2 = dbregs->dr[2];
 2460                 pcb->pcb_dr3 = dbregs->dr[3];
 2461                 pcb->pcb_dr6 = dbregs->dr[6];
 2462                 pcb->pcb_dr7 = dbregs->dr[7];
 2463 
 2464                 set_pcb_flags(pcb, PCB_DBREGS);
 2465         }
 2466 
 2467         return (0);
 2468 }
 2469 
 2470 void
 2471 reset_dbregs(void)
 2472 {
 2473 
 2474         load_dr7(0);    /* Turn off the control bits first */
 2475         load_dr0(0);
 2476         load_dr1(0);
 2477         load_dr2(0);
 2478         load_dr3(0);
 2479         load_dr6(0);
 2480 }
 2481 
 2482 /*
 2483  * Return > 0 if a hardware breakpoint has been hit, and the
 2484  * breakpoint was in user space.  Return 0, otherwise.
 2485  */
 2486 int
 2487 user_dbreg_trap(void)
 2488 {
 2489         u_int64_t dr7, dr6; /* debug registers dr6 and dr7 */
 2490         u_int64_t bp;       /* breakpoint bits extracted from dr6 */
 2491         int nbp;            /* number of breakpoints that triggered */
 2492         caddr_t addr[4];    /* breakpoint addresses */
 2493         int i;
 2494         
 2495         dr7 = rdr7();
 2496         if ((dr7 & 0x000000ff) == 0) {
 2497                 /*
 2498                  * all GE and LE bits in the dr7 register are zero,
 2499                  * thus the trap couldn't have been caused by the
 2500                  * hardware debug registers
 2501                  */
 2502                 return 0;
 2503         }
 2504 
 2505         nbp = 0;
 2506         dr6 = rdr6();
 2507         bp = dr6 & 0x0000000f;
 2508 
 2509         if (!bp) {
 2510                 /*
 2511                  * None of the breakpoint bits are set meaning this
 2512                  * trap was not caused by any of the debug registers
 2513                  */
 2514                 return 0;
 2515         }
 2516 
 2517         /*
 2518          * at least one of the breakpoints were hit, check to see
 2519          * which ones and if any of them are user space addresses
 2520          */
 2521 
 2522         if (bp & 0x01) {
 2523                 addr[nbp++] = (caddr_t)rdr0();
 2524         }
 2525         if (bp & 0x02) {
 2526                 addr[nbp++] = (caddr_t)rdr1();
 2527         }
 2528         if (bp & 0x04) {
 2529                 addr[nbp++] = (caddr_t)rdr2();
 2530         }
 2531         if (bp & 0x08) {
 2532                 addr[nbp++] = (caddr_t)rdr3();
 2533         }
 2534 
 2535         for (i = 0; i < nbp; i++) {
 2536                 if (addr[i] < (caddr_t)VM_MAXUSER_ADDRESS) {
 2537                         /*
 2538                          * addr[i] is in user space
 2539                          */
 2540                         return nbp;
 2541                 }
 2542         }
 2543 
 2544         /*
 2545          * None of the breakpoints are in user space.
 2546          */
 2547         return 0;
 2548 }
 2549 
 2550 #ifdef KDB
 2551 
 2552 /*
 2553  * Provide inb() and outb() as functions.  They are normally only available as
 2554  * inline functions, thus cannot be called from the debugger.
 2555  */
 2556 
 2557 /* silence compiler warnings */
 2558 u_char inb_(u_short);
 2559 void outb_(u_short, u_char);
 2560 
 2561 u_char
 2562 inb_(u_short port)
 2563 {
 2564         return inb(port);
 2565 }
 2566 
 2567 void
 2568 outb_(u_short port, u_char data)
 2569 {
 2570         outb(port, data);
 2571 }
 2572 
 2573 #endif /* KDB */

Cache object: e98599797ce99f24365138f01f82b9ee


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.