Now available: The Design and Implementation of the FreeBSD Operating System, Second Edition


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/amd64/amd64/machdep.c

Version: -  FREEBSD  -  FREEBSD-12-STABLE  -  FREEBSD-12-0  -  FREEBSD-11-STABLE  -  FREEBSD-11-2  -  FREEBSD-11-1  -  FREEBSD-11-0  -  FREEBSD-10-STABLE  -  FREEBSD-10-4  -  FREEBSD-10-3  -  FREEBSD-10-2  -  FREEBSD-10-1  -  FREEBSD-10-0  -  FREEBSD-9-STABLE  -  FREEBSD-9-3  -  FREEBSD-9-2  -  FREEBSD-9-1  -  FREEBSD-9-0  -  FREEBSD-8-STABLE  -  FREEBSD-8-4  -  FREEBSD-8-3  -  FREEBSD-8-2  -  FREEBSD-8-1  -  FREEBSD-8-0  -  FREEBSD-7-STABLE  -  FREEBSD-7-4  -  FREEBSD-7-3  -  FREEBSD-7-2  -  FREEBSD-7-1  -  FREEBSD-7-0  -  FREEBSD-6-STABLE  -  FREEBSD-6-4  -  FREEBSD-6-3  -  FREEBSD-6-2  -  FREEBSD-6-1  -  FREEBSD-6-0  -  FREEBSD-5-STABLE  -  FREEBSD-5-5  -  FREEBSD-5-4  -  FREEBSD-5-3  -  FREEBSD-5-2  -  FREEBSD-5-1  -  FREEBSD-5-0  -  FREEBSD-4-STABLE  -  FREEBSD-3-STABLE  -  FREEBSD22  -  linux-2.6  -  linux-2.4.22  -  MK83  -  MK84  -  PLAN9  -  DFBSD  -  NETBSD  -  NETBSD5  -  NETBSD4  -  NETBSD3  -  NETBSD20  -  OPENBSD  -  xnu-517  -  xnu-792  -  xnu-792.6.70  -  xnu-1228  -  xnu-1456.1.26  -  xnu-1699.24.8  -  xnu-2050.18.24  -  OPENSOLARIS  -  minix-3-1-1 
SearchContext: -  none  -  3  -  10 

    1 /*-
    2  * Copyright (c) 2003 Peter Wemm.
    3  * Copyright (c) 1992 Terrence R. Lambert.
    4  * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
    5  * All rights reserved.
    6  *
    7  * This code is derived from software contributed to Berkeley by
    8  * William Jolitz.
    9  *
   10  * Redistribution and use in source and binary forms, with or without
   11  * modification, are permitted provided that the following conditions
   12  * are met:
   13  * 1. Redistributions of source code must retain the above copyright
   14  *    notice, this list of conditions and the following disclaimer.
   15  * 2. Redistributions in binary form must reproduce the above copyright
   16  *    notice, this list of conditions and the following disclaimer in the
   17  *    documentation and/or other materials provided with the distribution.
   18  * 3. All advertising materials mentioning features or use of this software
   19  *    must display the following acknowledgement:
   20  *      This product includes software developed by the University of
   21  *      California, Berkeley and its contributors.
   22  * 4. Neither the name of the University nor the names of its contributors
   23  *    may be used to endorse or promote products derived from this software
   24  *    without specific prior written permission.
   25  *
   26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   36  * SUCH DAMAGE.
   37  *
   38  *      from: @(#)machdep.c     7.4 (Berkeley) 6/3/91
   39  */
   40 
   41 #include <sys/cdefs.h>
   42 __FBSDID("$FreeBSD: releng/11.2/sys/amd64/amd64/machdep.c 338607 2018-09-12 05:08:49Z gordon $");
   43 
   44 #include "opt_atpic.h"
   45 #include "opt_compat.h"
   46 #include "opt_cpu.h"
   47 #include "opt_ddb.h"
   48 #include "opt_inet.h"
   49 #include "opt_isa.h"
   50 #include "opt_kstack_pages.h"
   51 #include "opt_maxmem.h"
   52 #include "opt_mp_watchdog.h"
   53 #include "opt_perfmon.h"
   54 #include "opt_platform.h"
   55 #include "opt_sched.h"
   56 
   57 #include <sys/param.h>
   58 #include <sys/proc.h>
   59 #include <sys/systm.h>
   60 #include <sys/bio.h>
   61 #include <sys/buf.h>
   62 #include <sys/bus.h>
   63 #include <sys/callout.h>
   64 #include <sys/cons.h>
   65 #include <sys/cpu.h>
   66 #include <sys/efi.h>
   67 #include <sys/eventhandler.h>
   68 #include <sys/exec.h>
   69 #include <sys/imgact.h>
   70 #include <sys/kdb.h>
   71 #include <sys/kernel.h>
   72 #include <sys/ktr.h>
   73 #include <sys/linker.h>
   74 #include <sys/lock.h>
   75 #include <sys/malloc.h>
   76 #include <sys/memrange.h>
   77 #include <sys/msgbuf.h>
   78 #include <sys/mutex.h>
   79 #include <sys/pcpu.h>
   80 #include <sys/ptrace.h>
   81 #include <sys/reboot.h>
   82 #include <sys/rwlock.h>
   83 #include <sys/sched.h>
   84 #include <sys/signalvar.h>
   85 #ifdef SMP
   86 #include <sys/smp.h>
   87 #endif
   88 #include <sys/syscallsubr.h>
   89 #include <sys/sysctl.h>
   90 #include <sys/sysent.h>
   91 #include <sys/sysproto.h>
   92 #include <sys/ucontext.h>
   93 #include <sys/vmmeter.h>
   94 
   95 #include <vm/vm.h>
   96 #include <vm/vm_extern.h>
   97 #include <vm/vm_kern.h>
   98 #include <vm/vm_page.h>
   99 #include <vm/vm_map.h>
  100 #include <vm/vm_object.h>
  101 #include <vm/vm_pager.h>
  102 #include <vm/vm_param.h>
  103 
  104 #ifdef DDB
  105 #ifndef KDB
  106 #error KDB must be enabled in order for DDB to work!
  107 #endif
  108 #include <ddb/ddb.h>
  109 #include <ddb/db_sym.h>
  110 #endif
  111 
  112 #include <net/netisr.h>
  113 
  114 #include <machine/clock.h>
  115 #include <machine/cpu.h>
  116 #include <machine/cputypes.h>
  117 #include <machine/frame.h>
  118 #include <machine/intr_machdep.h>
  119 #include <x86/mca.h>
  120 #include <machine/md_var.h>
  121 #include <machine/metadata.h>
  122 #include <machine/mp_watchdog.h>
  123 #include <machine/pc/bios.h>
  124 #include <machine/pcb.h>
  125 #include <machine/proc.h>
  126 #include <machine/reg.h>
  127 #include <machine/sigframe.h>
  128 #include <machine/specialreg.h>
  129 #ifdef PERFMON
  130 #include <machine/perfmon.h>
  131 #endif
  132 #include <machine/tss.h>
  133 #ifdef SMP
  134 #include <machine/smp.h>
  135 #endif
  136 #ifdef FDT
  137 #include <x86/fdt.h>
  138 #endif
  139 
  140 #ifdef DEV_ATPIC
  141 #include <x86/isa/icu.h>
  142 #else
  143 #include <x86/apicvar.h>
  144 #endif
  145 
  146 #include <isa/isareg.h>
  147 #include <isa/rtc.h>
  148 #include <x86/init.h>
  149 
  150 /* Sanity check for __curthread() */
  151 CTASSERT(offsetof(struct pcpu, pc_curthread) == 0);
  152 
  153 /*
  154  * The PTI trampoline stack needs enough space for a hardware trapframe and a
  155  * couple of scratch registers, as well as the trapframe left behind after an
  156  * iret fault.
  157  */
  158 CTASSERT(PC_PTI_STACK_SZ * sizeof(register_t) >= 2 * sizeof(struct pti_frame) -
  159     offsetof(struct pti_frame, pti_rip));
  160 
  161 extern u_int64_t hammer_time(u_int64_t, u_int64_t);
  162 
  163 #define CS_SECURE(cs)           (ISPL(cs) == SEL_UPL)
  164 #define EFL_SECURE(ef, oef)     ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
  165 
  166 static void cpu_startup(void *);
  167 static void get_fpcontext(struct thread *td, mcontext_t *mcp,
  168     char *xfpusave, size_t xfpusave_len);
  169 static int  set_fpcontext(struct thread *td, mcontext_t *mcp,
  170     char *xfpustate, size_t xfpustate_len);
  171 SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL);
  172 
  173 /* Preload data parse function */
  174 static caddr_t native_parse_preload_data(u_int64_t);
  175 
  176 /* Native function to fetch and parse the e820 map */
  177 static void native_parse_memmap(caddr_t, vm_paddr_t *, int *);
  178 
  179 /* Default init_ops implementation. */
  180 struct init_ops init_ops = {
  181         .parse_preload_data =   native_parse_preload_data,
  182         .early_clock_source_init =      i8254_init,
  183         .early_delay =                  i8254_delay,
  184         .parse_memmap =                 native_parse_memmap,
  185 #ifdef SMP
  186         .mp_bootaddress =               mp_bootaddress,
  187         .start_all_aps =                native_start_all_aps,
  188 #endif
  189         .msi_init =                     msi_init,
  190 };
  191 
  192 struct msgbuf *msgbufp;
  193 
  194 /*
  195  * Physical address of the EFI System Table. Stashed from the metadata hints
  196  * passed into the kernel and used by the EFI code to call runtime services.
  197  */
  198 vm_paddr_t efi_systbl_phys;
  199 
  200 /* Intel ICH registers */
  201 #define ICH_PMBASE      0x400
  202 #define ICH_SMI_EN      ICH_PMBASE + 0x30
  203 
  204 int     _udatasel, _ucodesel, _ucode32sel, _ufssel, _ugssel;
  205 
  206 int cold = 1;
  207 
  208 long Maxmem = 0;
  209 long realmem = 0;
  210 
  211 /*
  212  * The number of PHYSMAP entries must be one less than the number of
  213  * PHYSSEG entries because the PHYSMAP entry that spans the largest
  214  * physical address that is accessible by ISA DMA is split into two
  215  * PHYSSEG entries.
  216  */
  217 #define PHYSMAP_SIZE    (2 * (VM_PHYSSEG_MAX - 1))
  218 
  219 vm_paddr_t phys_avail[PHYSMAP_SIZE + 2];
  220 vm_paddr_t dump_avail[PHYSMAP_SIZE + 2];
  221 
  222 /* must be 2 less so 0 0 can signal end of chunks */
  223 #define PHYS_AVAIL_ARRAY_END (nitems(phys_avail) - 2)
  224 #define DUMP_AVAIL_ARRAY_END (nitems(dump_avail) - 2)
  225 
  226 struct kva_md_info kmi;
  227 
  228 static struct trapframe proc0_tf;
  229 struct region_descriptor r_gdt, r_idt;
  230 
  231 struct pcpu __pcpu[MAXCPU];
  232 
  233 struct mtx icu_lock;
  234 
  235 struct mem_range_softc mem_range_softc;
  236 
  237 struct mtx dt_lock;     /* lock for GDT and LDT */
  238 
  239 void (*vmm_resume_p)(void);
  240 
  241 static void
  242 cpu_startup(dummy)
  243         void *dummy;
  244 {
  245         uintmax_t memsize;
  246         char *sysenv;
  247 
  248         /*
  249          * On MacBooks, we need to disallow the legacy USB circuit to
  250          * generate an SMI# because this can cause several problems,
  251          * namely: incorrect CPU frequency detection and failure to
  252          * start the APs.
  253          * We do this by disabling a bit in the SMI_EN (SMI Control and
  254          * Enable register) of the Intel ICH LPC Interface Bridge. 
  255          */
  256         sysenv = kern_getenv("smbios.system.product");
  257         if (sysenv != NULL) {
  258                 if (strncmp(sysenv, "MacBook1,1", 10) == 0 ||
  259                     strncmp(sysenv, "MacBook3,1", 10) == 0 ||
  260                     strncmp(sysenv, "MacBook4,1", 10) == 0 ||
  261                     strncmp(sysenv, "MacBookPro1,1", 13) == 0 ||
  262                     strncmp(sysenv, "MacBookPro1,2", 13) == 0 ||
  263                     strncmp(sysenv, "MacBookPro3,1", 13) == 0 ||
  264                     strncmp(sysenv, "MacBookPro4,1", 13) == 0 ||
  265                     strncmp(sysenv, "Macmini1,1", 10) == 0) {
  266                         if (bootverbose)
  267                                 printf("Disabling LEGACY_USB_EN bit on "
  268                                     "Intel ICH.\n");
  269                         outl(ICH_SMI_EN, inl(ICH_SMI_EN) & ~0x8);
  270                 }
  271                 freeenv(sysenv);
  272         }
  273 
  274         /*
  275          * Good {morning,afternoon,evening,night}.
  276          */
  277         startrtclock();
  278         printcpuinfo();
  279 #ifdef PERFMON
  280         perfmon_init();
  281 #endif
  282 
  283         /*
  284          * Display physical memory if SMBIOS reports reasonable amount.
  285          */
  286         memsize = 0;
  287         sysenv = kern_getenv("smbios.memory.enabled");
  288         if (sysenv != NULL) {
  289                 memsize = (uintmax_t)strtoul(sysenv, (char **)NULL, 10) << 10;
  290                 freeenv(sysenv);
  291         }
  292         if (memsize < ptoa((uintmax_t)vm_cnt.v_free_count))
  293                 memsize = ptoa((uintmax_t)Maxmem);
  294         printf("real memory  = %ju (%ju MB)\n", memsize, memsize >> 20);
  295         realmem = atop(memsize);
  296 
  297         /*
  298          * Display any holes after the first chunk of extended memory.
  299          */
  300         if (bootverbose) {
  301                 int indx;
  302 
  303                 printf("Physical memory chunk(s):\n");
  304                 for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) {
  305                         vm_paddr_t size;
  306 
  307                         size = phys_avail[indx + 1] - phys_avail[indx];
  308                         printf(
  309                             "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n",
  310                             (uintmax_t)phys_avail[indx],
  311                             (uintmax_t)phys_avail[indx + 1] - 1,
  312                             (uintmax_t)size, (uintmax_t)size / PAGE_SIZE);
  313                 }
  314         }
  315 
  316         vm_ksubmap_init(&kmi);
  317 
  318         printf("avail memory = %ju (%ju MB)\n",
  319             ptoa((uintmax_t)vm_cnt.v_free_count),
  320             ptoa((uintmax_t)vm_cnt.v_free_count) / 1048576);
  321 
  322         /*
  323          * Set up buffers, so they can be used to read disk labels.
  324          */
  325         bufinit();
  326         vm_pager_bufferinit();
  327 
  328         cpu_setregs();
  329 }
  330 
  331 /*
  332  * Send an interrupt to process.
  333  *
  334  * Stack is set up to allow sigcode stored
  335  * at top to call routine, followed by call
  336  * to sigreturn routine below.  After sigreturn
  337  * resets the signal mask, the stack, and the
  338  * frame pointer, it returns to the user
  339  * specified pc, psl.
  340  */
/*
 * Send an interrupt to process.
 *
 * Stack is set up to allow sigcode stored
 * at top to call routine, followed by call
 * to sigreturn routine below.  After sigreturn
 * resets the signal mask, the stack, and the
 * frame pointer, it returns to the user
 * specified pc, psl.
 *
 * Called with the proc lock and the sigacts mutex held; both are
 * dropped around the copyout to user space and reacquired before
 * returning.
 */
void
sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
{
        struct sigframe sf, *sfp;
        struct pcb *pcb;
        struct proc *p;
        struct thread *td;
        struct sigacts *psp;
        char *sp;
        struct trapframe *regs;
        char *xfpusave;
        size_t xfpusave_len;
        int sig;
        int oonstack;

        td = curthread;
        pcb = td->td_pcb;
        p = td->td_proc;
        PROC_LOCK_ASSERT(p, MA_OWNED);
        sig = ksi->ksi_signo;
        psp = p->p_sigacts;
        mtx_assert(&psp->ps_mtx, MA_OWNED);
        regs = td->td_frame;
        oonstack = sigonstack(regs->tf_rsp);

        /*
         * If xsave is in use and the extended state area is larger than
         * the legacy FPU save area, stage the extra state in a kernel
         * stack buffer so it can be copied out with the sigframe.
         */
        if (cpu_max_ext_state_size > sizeof(struct savefpu) && use_xsave) {
                xfpusave_len = cpu_max_ext_state_size - sizeof(struct savefpu);
                xfpusave = __builtin_alloca(xfpusave_len);
        } else {
                xfpusave_len = 0;
                xfpusave = NULL;
        }

        /* Save user context. */
        bzero(&sf, sizeof(sf));
        sf.sf_uc.uc_sigmask = *mask;
        sf.sf_uc.uc_stack = td->td_sigstk;
        sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
            ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE;
        sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0;
        /* mc_rdi is the first trapframe-shaped field of the mcontext. */
        bcopy(regs, &sf.sf_uc.uc_mcontext.mc_rdi, sizeof(*regs));
        sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext); /* magic */
        get_fpcontext(td, &sf.sf_uc.uc_mcontext, xfpusave, xfpusave_len);
        fpstate_drop(td);
        update_pcb_bases(pcb);
        sf.sf_uc.uc_mcontext.mc_fsbase = pcb->pcb_fsbase;
        sf.sf_uc.uc_mcontext.mc_gsbase = pcb->pcb_gsbase;
        bzero(sf.sf_uc.uc_mcontext.mc_spare,
            sizeof(sf.sf_uc.uc_mcontext.mc_spare));
        bzero(sf.sf_uc.__spare__, sizeof(sf.sf_uc.__spare__));

        /* Allocate space for the signal handler context. */
        if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack &&
            SIGISMEMBER(psp->ps_sigonstack, sig)) {
                sp = (char *)td->td_sigstk.ss_sp + td->td_sigstk.ss_size;
#if defined(COMPAT_43)
                td->td_sigstk.ss_flags |= SS_ONSTACK;
#endif
        } else
                /* Skip the 128-byte amd64 ABI red zone below %rsp. */
                sp = (char *)regs->tf_rsp - 128;
        if (xfpusave != NULL) {
                /* Extended FPU state must be 64-byte aligned for xsave. */
                sp -= xfpusave_len;
                sp = (char *)((unsigned long)sp & ~0x3Ful);
                sf.sf_uc.uc_mcontext.mc_xfpustate = (register_t)sp;
        }
        sp -= sizeof(struct sigframe);
        /* Align to 16 bytes. */
        sfp = (struct sigframe *)((unsigned long)sp & ~0xFul);

        /* Build the argument list for the signal handler. */
        regs->tf_rdi = sig;                     /* arg 1 in %rdi */
        regs->tf_rdx = (register_t)&sfp->sf_uc; /* arg 3 in %rdx */
        bzero(&sf.sf_si, sizeof(sf.sf_si));
        if (SIGISMEMBER(psp->ps_siginfo, sig)) {
                /* Signal handler installed with SA_SIGINFO. */
                regs->tf_rsi = (register_t)&sfp->sf_si; /* arg 2 in %rsi */
                sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher;

                /* Fill in POSIX parts */
                sf.sf_si = ksi->ksi_info;
                sf.sf_si.si_signo = sig; /* maybe a translated signal */
                regs->tf_rcx = (register_t)ksi->ksi_addr; /* arg 4 in %rcx */
        } else {
                /* Old FreeBSD-style arguments. */
                regs->tf_rsi = ksi->ksi_code;   /* arg 2 in %rsi */
                regs->tf_rcx = (register_t)ksi->ksi_addr; /* arg 4 in %rcx */
                sf.sf_ahu.sf_handler = catcher;
        }
        /* Drop locks across the copyout; reacquired below. */
        mtx_unlock(&psp->ps_mtx);
        PROC_UNLOCK(p);

        /*
         * Copy the sigframe out to the user's stack.
         */
        if (copyout(&sf, sfp, sizeof(*sfp)) != 0 ||
            (xfpusave != NULL && copyout(xfpusave,
            (void *)sf.sf_uc.uc_mcontext.mc_xfpustate, xfpusave_len)
            != 0)) {
#ifdef DEBUG
                printf("process %ld has trashed its stack\n", (long)p->p_pid);
#endif
                /* Unwritable stack: kill the process; sigexit does not return. */
                PROC_LOCK(p);
                sigexit(td, SIGILL);
        }

        /* Redirect the thread to the signal trampoline. */
        regs->tf_rsp = (long)sfp;
        regs->tf_rip = p->p_sysent->sv_sigcode_base;
        regs->tf_rflags &= ~(PSL_T | PSL_D);
        regs->tf_cs = _ucodesel;
        regs->tf_ds = _udatasel;
        regs->tf_ss = _udatasel;
        regs->tf_es = _udatasel;
        regs->tf_fs = _ufssel;
        regs->tf_gs = _ugssel;
        regs->tf_flags = TF_HASSEGS;
        PROC_LOCK(p);
        mtx_lock(&psp->ps_mtx);
}
  459 
  460 /*
  461  * System call to cleanup state after a signal
  462  * has been taken.  Reset signal mask and
  463  * stack state from context left by sendsig (above).
  464  * Return to previous pc and psl as specified by
  465  * context left by sendsig. Check carefully to
  466  * make sure that the user has not modified the
  467  * state to gain improper privileges.
  468  *
  469  * MPSAFE
  470  */
  471 int
  472 sys_sigreturn(td, uap)
  473         struct thread *td;
  474         struct sigreturn_args /* {
  475                 const struct __ucontext *sigcntxp;
  476         } */ *uap;
  477 {
  478         ucontext_t uc;
  479         struct pcb *pcb;
  480         struct proc *p;
  481         struct trapframe *regs;
  482         ucontext_t *ucp;
  483         char *xfpustate;
  484         size_t xfpustate_len;
  485         long rflags;
  486         int cs, error, ret;
  487         ksiginfo_t ksi;
  488 
  489         pcb = td->td_pcb;
  490         p = td->td_proc;
  491 
  492         error = copyin(uap->sigcntxp, &uc, sizeof(uc));
  493         if (error != 0) {
  494                 uprintf("pid %d (%s): sigreturn copyin failed\n",
  495                     p->p_pid, td->td_name);
  496                 return (error);
  497         }
  498         ucp = &uc;
  499         if ((ucp->uc_mcontext.mc_flags & ~_MC_FLAG_MASK) != 0) {
  500                 uprintf("pid %d (%s): sigreturn mc_flags %x\n", p->p_pid,
  501                     td->td_name, ucp->uc_mcontext.mc_flags);
  502                 return (EINVAL);
  503         }
  504         regs = td->td_frame;
  505         rflags = ucp->uc_mcontext.mc_rflags;
  506         /*
  507          * Don't allow users to change privileged or reserved flags.
  508          */
  509         if (!EFL_SECURE(rflags, regs->tf_rflags)) {
  510                 uprintf("pid %d (%s): sigreturn rflags = 0x%lx\n", p->p_pid,
  511                     td->td_name, rflags);
  512                 return (EINVAL);
  513         }
  514 
  515         /*
  516          * Don't allow users to load a valid privileged %cs.  Let the
  517          * hardware check for invalid selectors, excess privilege in
  518          * other selectors, invalid %eip's and invalid %esp's.
  519          */
  520         cs = ucp->uc_mcontext.mc_cs;
  521         if (!CS_SECURE(cs)) {
  522                 uprintf("pid %d (%s): sigreturn cs = 0x%x\n", p->p_pid,
  523                     td->td_name, cs);
  524                 ksiginfo_init_trap(&ksi);
  525                 ksi.ksi_signo = SIGBUS;
  526                 ksi.ksi_code = BUS_OBJERR;
  527                 ksi.ksi_trapno = T_PROTFLT;
  528                 ksi.ksi_addr = (void *)regs->tf_rip;
  529                 trapsignal(td, &ksi);
  530                 return (EINVAL);
  531         }
  532 
  533         if ((uc.uc_mcontext.mc_flags & _MC_HASFPXSTATE) != 0) {
  534                 xfpustate_len = uc.uc_mcontext.mc_xfpustate_len;
  535                 if (xfpustate_len > cpu_max_ext_state_size -
  536                     sizeof(struct savefpu)) {
  537                         uprintf("pid %d (%s): sigreturn xfpusave_len = 0x%zx\n",
  538                             p->p_pid, td->td_name, xfpustate_len);
  539                         return (EINVAL);
  540                 }
  541                 xfpustate = __builtin_alloca(xfpustate_len);
  542                 error = copyin((const void *)uc.uc_mcontext.mc_xfpustate,
  543                     xfpustate, xfpustate_len);
  544                 if (error != 0) {
  545                         uprintf(
  546         "pid %d (%s): sigreturn copying xfpustate failed\n",
  547                             p->p_pid, td->td_name);
  548                         return (error);
  549                 }
  550         } else {
  551                 xfpustate = NULL;
  552                 xfpustate_len = 0;
  553         }
  554         ret = set_fpcontext(td, &ucp->uc_mcontext, xfpustate, xfpustate_len);
  555         if (ret != 0) {
  556                 uprintf("pid %d (%s): sigreturn set_fpcontext err %d\n",
  557                     p->p_pid, td->td_name, ret);
  558                 return (ret);
  559         }
  560         bcopy(&ucp->uc_mcontext.mc_rdi, regs, sizeof(*regs));
  561         update_pcb_bases(pcb);
  562         pcb->pcb_fsbase = ucp->uc_mcontext.mc_fsbase;
  563         pcb->pcb_gsbase = ucp->uc_mcontext.mc_gsbase;
  564 
  565 #if defined(COMPAT_43)
  566         if (ucp->uc_mcontext.mc_onstack & 1)
  567                 td->td_sigstk.ss_flags |= SS_ONSTACK;
  568         else
  569                 td->td_sigstk.ss_flags &= ~SS_ONSTACK;
  570 #endif
  571 
  572         kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0);
  573         return (EJUSTRETURN);
  574 }
  575 
  576 #ifdef COMPAT_FREEBSD4
  577 int
  578 freebsd4_sigreturn(struct thread *td, struct freebsd4_sigreturn_args *uap)
  579 {
  580  
  581         return sys_sigreturn(td, (struct sigreturn_args *)uap);
  582 }
  583 #endif
  584 
  585 /*
  586  * Reset registers to default values on exec.
  587  */
/*
 * Reset registers to default values on exec.
 *
 * Gives the new image a clean trapframe (entry point in %rip, user
 * segment selectors, stack pointer aligned per the amd64 ABI), clears
 * per-process LDT, segment bases, debug registers and FPU state.
 */
void
exec_setregs(struct thread *td, struct image_params *imgp, u_long stack)
{
        struct trapframe *regs = td->td_frame;
        struct pcb *pcb = td->td_pcb;

        /* Drop any private LDT; user_ldt_free() releases dt_lock itself. */
        mtx_lock(&dt_lock);
        if (td->td_proc->p_md.md_ldt != NULL)
                user_ldt_free(td);
        else
                mtx_unlock(&dt_lock);
        
        update_pcb_bases(pcb);
        pcb->pcb_fsbase = 0;
        pcb->pcb_gsbase = 0;
        clear_pcb_flags(pcb, PCB_32BIT);
        /* Default x87 control word for the new image. */
        pcb->pcb_initial_fpucw = __INITIAL_FPUCW__;

        bzero((char *)regs, sizeof(struct trapframe));
        regs->tf_rip = imgp->entry_addr;
        /* 16-byte align %rsp as if a return address had just been pushed. */
        regs->tf_rsp = ((stack - 8) & ~0xFul) + 8;
        regs->tf_rdi = stack;           /* argv */
        regs->tf_rflags = PSL_USER | (regs->tf_rflags & PSL_T);
        regs->tf_ss = _udatasel;
        regs->tf_cs = _ucodesel;
        regs->tf_ds = _udatasel;
        regs->tf_es = _udatasel;
        regs->tf_fs = _ufssel;
        regs->tf_gs = _ugssel;
        regs->tf_flags = TF_HASSEGS;
        td->td_retval[1] = 0;

        /*
         * Reset the hardware debug registers if they were in use.
         * They won't have any meaning for the newly exec'd process.
         */
        if (pcb->pcb_flags & PCB_DBREGS) {
                pcb->pcb_dr0 = 0;
                pcb->pcb_dr1 = 0;
                pcb->pcb_dr2 = 0;
                pcb->pcb_dr3 = 0;
                pcb->pcb_dr6 = 0;
                pcb->pcb_dr7 = 0;
                if (pcb == curpcb) {
                        /*
                         * Clear the debug registers on the running
                         * CPU, otherwise they will end up affecting
                         * the next process we switch to.
                         */
                        reset_dbregs();
                }
                clear_pcb_flags(pcb, PCB_DBREGS);
        }

        /*
         * Drop the FP state if we hold it, so that the process gets a
         * clean FP state if it uses the FPU again.
         */
        fpstate_drop(td);
}
  648 
  649 void
  650 cpu_setregs(void)
  651 {
  652         register_t cr0;
  653 
  654         cr0 = rcr0();
  655         /*
  656          * CR0_MP, CR0_NE and CR0_TS are also set by npx_probe() for the
  657          * BSP.  See the comments there about why we set them.
  658          */
  659         cr0 |= CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM;
  660         load_cr0(cr0);
  661 }
  662 
  663 /*
  664  * Initialize amd64 and configure to run kernel
  665  */
  666 
  667 /*
  668  * Initialize segments & interrupt table
  669  */
  670 
  671 struct user_segment_descriptor gdt[NGDT * MAXCPU];/* global descriptor tables */
  672 static struct gate_descriptor idt0[NIDT];
  673 struct gate_descriptor *idt = &idt0[0]; /* interrupt descriptor table */
  674 
  675 static char dblfault_stack[PAGE_SIZE] __aligned(16);
  676 static char mce0_stack[PAGE_SIZE] __aligned(16);
  677 static char nmi0_stack[PAGE_SIZE] __aligned(16);
  678 static char dbg0_stack[PAGE_SIZE] __aligned(16);
  679 CTASSERT(sizeof(struct nmi_pcpu) == 16);
  680 
  681 struct amd64tss common_tss[MAXCPU];
  682 
  683 /*
  684  * Software prototypes -- in more palatable form.
  685  *
  686  * Keep GUFS32, GUGS32, GUCODE32 and GUDATA at the same
  687  * slots as corresponding segments for i386 kernel.
  688  */
  689 struct soft_segment_descriptor gdt_segs[] = {
  690 /* GNULL_SEL    0 Null Descriptor */
  691 {       .ssd_base = 0x0,
  692         .ssd_limit = 0x0,
  693         .ssd_type = 0,
  694         .ssd_dpl = 0,
  695         .ssd_p = 0,
  696         .ssd_long = 0,
  697         .ssd_def32 = 0,
  698         .ssd_gran = 0           },
  699 /* GNULL2_SEL   1 Null Descriptor */
  700 {       .ssd_base = 0x0,
  701         .ssd_limit = 0x0,
  702         .ssd_type = 0,
  703         .ssd_dpl = 0,
  704         .ssd_p = 0,
  705         .ssd_long = 0,
  706         .ssd_def32 = 0,
  707         .ssd_gran = 0           },
  708 /* GUFS32_SEL   2 32 bit %gs Descriptor for user */
  709 {       .ssd_base = 0x0,
  710         .ssd_limit = 0xfffff,
  711         .ssd_type = SDT_MEMRWA,
  712         .ssd_dpl = SEL_UPL,
  713         .ssd_p = 1,
  714         .ssd_long = 0,
  715         .ssd_def32 = 1,
  716         .ssd_gran = 1           },
  717 /* GUGS32_SEL   3 32 bit %fs Descriptor for user */
  718 {       .ssd_base = 0x0,
  719         .ssd_limit = 0xfffff,
  720         .ssd_type = SDT_MEMRWA,
  721         .ssd_dpl = SEL_UPL,
  722         .ssd_p = 1,
  723         .ssd_long = 0,
  724         .ssd_def32 = 1,
  725         .ssd_gran = 1           },
  726 /* GCODE_SEL    4 Code Descriptor for kernel */
  727 {       .ssd_base = 0x0,
  728         .ssd_limit = 0xfffff,
  729         .ssd_type = SDT_MEMERA,
  730         .ssd_dpl = SEL_KPL,
  731         .ssd_p = 1,
  732         .ssd_long = 1,
  733         .ssd_def32 = 0,
  734         .ssd_gran = 1           },
  735 /* GDATA_SEL    5 Data Descriptor for kernel */
  736 {       .ssd_base = 0x0,
  737         .ssd_limit = 0xfffff,
  738         .ssd_type = SDT_MEMRWA,
  739         .ssd_dpl = SEL_KPL,
  740         .ssd_p = 1,
  741         .ssd_long = 1,
  742         .ssd_def32 = 0,
  743         .ssd_gran = 1           },
  744 /* GUCODE32_SEL 6 32 bit Code Descriptor for user */
  745 {       .ssd_base = 0x0,
  746         .ssd_limit = 0xfffff,
  747         .ssd_type = SDT_MEMERA,
  748         .ssd_dpl = SEL_UPL,
  749         .ssd_p = 1,
  750         .ssd_long = 0,
  751         .ssd_def32 = 1,
  752         .ssd_gran = 1           },
  753 /* GUDATA_SEL   7 32/64 bit Data Descriptor for user */
  754 {       .ssd_base = 0x0,
  755         .ssd_limit = 0xfffff,
  756         .ssd_type = SDT_MEMRWA,
  757         .ssd_dpl = SEL_UPL,
  758         .ssd_p = 1,
  759         .ssd_long = 0,
  760         .ssd_def32 = 1,
  761         .ssd_gran = 1           },
  762 /* GUCODE_SEL   8 64 bit Code Descriptor for user */
  763 {       .ssd_base = 0x0,
  764         .ssd_limit = 0xfffff,
  765         .ssd_type = SDT_MEMERA,
  766         .ssd_dpl = SEL_UPL,
  767         .ssd_p = 1,
  768         .ssd_long = 1,
  769         .ssd_def32 = 0,
  770         .ssd_gran = 1           },
  771 /* GPROC0_SEL   9 Proc 0 Tss Descriptor */
  772 {       .ssd_base = 0x0,
  773         .ssd_limit = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE - 1,
  774         .ssd_type = SDT_SYSTSS,
  775         .ssd_dpl = SEL_KPL,
  776         .ssd_p = 1,
  777         .ssd_long = 0,
  778         .ssd_def32 = 0,
  779         .ssd_gran = 0           },
  780 /* Actually, the TSS is a system descriptor which is double size */
  781 {       .ssd_base = 0x0,
  782         .ssd_limit = 0x0,
  783         .ssd_type = 0,
  784         .ssd_dpl = 0,
  785         .ssd_p = 0,
  786         .ssd_long = 0,
  787         .ssd_def32 = 0,
  788         .ssd_gran = 0           },
  789 /* GUSERLDT_SEL 11 LDT Descriptor */
  790 {       .ssd_base = 0x0,
  791         .ssd_limit = 0x0,
  792         .ssd_type = 0,
  793         .ssd_dpl = 0,
  794         .ssd_p = 0,
  795         .ssd_long = 0,
  796         .ssd_def32 = 0,
  797         .ssd_gran = 0           },
  798 /* GUSERLDT_SEL 12 LDT Descriptor, double size */
  799 {       .ssd_base = 0x0,
  800         .ssd_limit = 0x0,
  801         .ssd_type = 0,
  802         .ssd_dpl = 0,
  803         .ssd_p = 0,
  804         .ssd_long = 0,
  805         .ssd_def32 = 0,
  806         .ssd_gran = 0           },
  807 };
  808 
  809 void
  810 setidt(int idx, inthand_t *func, int typ, int dpl, int ist)
  811 {
  812         struct gate_descriptor *ip;
  813 
  814         ip = idt + idx;
  815         ip->gd_looffset = (uintptr_t)func;
  816         ip->gd_selector = GSEL(GCODE_SEL, SEL_KPL);
  817         ip->gd_ist = ist;
  818         ip->gd_xx = 0;
  819         ip->gd_type = typ;
  820         ip->gd_dpl = dpl;
  821         ip->gd_p = 1;
  822         ip->gd_hioffset = ((uintptr_t)func)>>16 ;
  823 }
  824 
/*
 * Low-level exception and interrupt entry points, implemented in
 * assembly.  The "_pti" variants are the alternate entry points used
 * when page-table isolation (PTI) is enabled (see the `pti' test in
 * amd64_conf_fast_syscall() below).
 */
extern inthand_t
	IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
	IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
	IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
	IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align),
	IDTVEC(xmm), IDTVEC(dblfault),
	IDTVEC(div_pti), IDTVEC(bpt_pti),
	IDTVEC(ofl_pti), IDTVEC(bnd_pti), IDTVEC(ill_pti), IDTVEC(dna_pti),
	IDTVEC(fpusegm_pti), IDTVEC(tss_pti), IDTVEC(missing_pti),
	IDTVEC(stk_pti), IDTVEC(prot_pti), IDTVEC(page_pti),
	IDTVEC(rsvd_pti), IDTVEC(fpu_pti), IDTVEC(align_pti),
	IDTVEC(xmm_pti),
#ifdef KDTRACE_HOOKS
	IDTVEC(dtrace_ret), IDTVEC(dtrace_ret_pti),
#endif
#ifdef XENHVM
	IDTVEC(xen_intr_upcall), IDTVEC(xen_intr_upcall_pti),
#endif
	IDTVEC(fast_syscall), IDTVEC(fast_syscall32),
	IDTVEC(fast_syscall_pti);
  845 
  846 #ifdef DDB
  847 /*
  848  * Display the index and function name of any IDT entries that don't use
  849  * the default 'rsvd' entry point.
  850  */
  851 DB_SHOW_COMMAND(idt, db_show_idt)
  852 {
  853         struct gate_descriptor *ip;
  854         int idx;
  855         uintptr_t func;
  856 
  857         ip = idt;
  858         for (idx = 0; idx < NIDT && !db_pager_quit; idx++) {
  859                 func = ((long)ip->gd_hioffset << 16 | ip->gd_looffset);
  860                 if (func != (uintptr_t)&IDTVEC(rsvd)) {
  861                         db_printf("%3d\t", idx);
  862                         db_printsym(func, DB_STGY_PROC);
  863                         db_printf("\n");
  864                 }
  865                 ip++;
  866         }
  867 }
  868 
/*
 * "show sysregs": dump the privileged machine state — descriptor-table
 * registers, task/LDT selectors, control registers, and a handful of
 * architectural MSRs.
 */
DB_SHOW_COMMAND(sysregs, db_show_sysregs)
{
	/* Layout must match the 10-byte memory operand of sidt/sgdt. */
	struct {
		uint16_t limit;
		uint64_t base;
	} __packed idtr, gdtr;
	uint16_t ldt, tr;

	__asm __volatile("sidt %0" : "=m" (idtr));
	db_printf("idtr\t0x%016lx/%04x\n",
	    (u_long)idtr.base, (u_int)idtr.limit);
	__asm __volatile("sgdt %0" : "=m" (gdtr));
	db_printf("gdtr\t0x%016lx/%04x\n",
	    (u_long)gdtr.base, (u_int)gdtr.limit);
	__asm __volatile("sldt %0" : "=r" (ldt));
	db_printf("ldtr\t0x%04x\n", ldt);
	__asm __volatile("str %0" : "=r" (tr));
	db_printf("tr\t0x%04x\n", tr);
	db_printf("cr0\t0x%016lx\n", rcr0());
	db_printf("cr2\t0x%016lx\n", rcr2());
	db_printf("cr3\t0x%016lx\n", rcr3());
	db_printf("cr4\t0x%016lx\n", rcr4());
	/* XCR0 can only be read when CR4.OSXSAVE is enabled. */
	if (rcr4() & CR4_XSAVE)
		db_printf("xcr0\t0x%016lx\n", rxcr(0));
	db_printf("EFER\t0x%016lx\n", rdmsr(MSR_EFER));
	/* Feature-control MSR exists only with VMX or SMX support. */
	if (cpu_feature2 & (CPUID2_VMX | CPUID2_SMX))
		db_printf("FEATURES_CTL\t%016lx\n",
		    rdmsr(MSR_IA32_FEATURE_CONTROL));
	db_printf("DEBUG_CTL\t0x%016lx\n", rdmsr(MSR_DEBUGCTLMSR));
	db_printf("PAT\t0x%016lx\n", rdmsr(MSR_PAT));
	db_printf("GSBASE\t0x%016lx\n", rdmsr(MSR_GSBASE));
}
  902 
/* "show dbregs": dump the hardware debug registers dr0-dr3, dr6, dr7. */
DB_SHOW_COMMAND(dbregs, db_show_dbregs)
{

	db_printf("dr0\t0x%016lx\n", rdr0());
	db_printf("dr1\t0x%016lx\n", rdr1());
	db_printf("dr2\t0x%016lx\n", rdr2());
	db_printf("dr3\t0x%016lx\n", rdr3());
	db_printf("dr6\t0x%016lx\n", rdr6());
	db_printf("dr7\t0x%016lx\n", rdr7());
}
  913 #endif
  914 
  915 void
  916 sdtossd(sd, ssd)
  917         struct user_segment_descriptor *sd;
  918         struct soft_segment_descriptor *ssd;
  919 {
  920 
  921         ssd->ssd_base  = (sd->sd_hibase << 24) | sd->sd_lobase;
  922         ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit;
  923         ssd->ssd_type  = sd->sd_type;
  924         ssd->ssd_dpl   = sd->sd_dpl;
  925         ssd->ssd_p     = sd->sd_p;
  926         ssd->ssd_long  = sd->sd_long;
  927         ssd->ssd_def32 = sd->sd_def32;
  928         ssd->ssd_gran  = sd->sd_gran;
  929 }
  930 
  931 void
  932 ssdtosd(ssd, sd)
  933         struct soft_segment_descriptor *ssd;
  934         struct user_segment_descriptor *sd;
  935 {
  936 
  937         sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
  938         sd->sd_hibase = (ssd->ssd_base >> 24) & 0xff;
  939         sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
  940         sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
  941         sd->sd_type  = ssd->ssd_type;
  942         sd->sd_dpl   = ssd->ssd_dpl;
  943         sd->sd_p     = ssd->ssd_p;
  944         sd->sd_long  = ssd->ssd_long;
  945         sd->sd_def32 = ssd->ssd_def32;
  946         sd->sd_gran  = ssd->ssd_gran;
  947 }
  948 
  949 void
  950 ssdtosyssd(ssd, sd)
  951         struct soft_segment_descriptor *ssd;
  952         struct system_segment_descriptor *sd;
  953 {
  954 
  955         sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
  956         sd->sd_hibase = (ssd->ssd_base >> 24) & 0xfffffffffful;
  957         sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
  958         sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
  959         sd->sd_type  = ssd->ssd_type;
  960         sd->sd_dpl   = ssd->ssd_dpl;
  961         sd->sd_p     = ssd->ssd_p;
  962         sd->sd_gran  = ssd->ssd_gran;
  963 }
  964 
  965 #if !defined(DEV_ATPIC) && defined(DEV_ISA)
  966 #include <isa/isavar.h>
  967 #include <isa/isareg.h>
  968 /*
  969  * Return a bitmap of the current interrupt requests.  This is 8259-specific
  970  * and is only suitable for use at probe time.
  971  * This is only here to pacify sio.  It is NOT FATAL if this doesn't work.
  972  * It shouldn't be here.  There should probably be an APIC centric
  973  * implementation in the apic driver code, if at all.
  974  */
  975 intrmask_t
  976 isa_irq_pending(void)
  977 {
  978         u_char irr1;
  979         u_char irr2;
  980 
  981         irr1 = inb(IO_ICU1);
  982         irr2 = inb(IO_ICU2);
  983         return ((irr2 << 8) | irr1);
  984 }
  985 #endif
  986 
/*
 * Size of conventional ("base") memory in KB, derived in getmemsize()
 * from the memory-map segment below 0xA0000 and clamped to 640.
 */
u_int basemem;
  988 
/*
 * Insert the range [base, base + length) into the sorted physmap[]
 * array of base/bound pairs.  Ranges that abut an existing entry are
 * merged into it; ranges that overlap an existing entry are dropped.
 * Returns 1 on success (including the benign drop cases) and 0 only
 * when physmap[] has no room left.
 */
static int
add_physmap_entry(uint64_t base, uint64_t length, vm_paddr_t *physmap,
    int *physmap_idxp)
{
	int i, insert_idx, physmap_idx;

	physmap_idx = *physmap_idxp;

	/* An empty range carries no information. */
	if (length == 0)
		return (1);

	/*
	 * Find insertion point while checking for overlap.  Start off by
	 * assuming the new entry will be added to the end.
	 *
	 * NB: physmap_idx points to the next free slot.
	 */
	insert_idx = physmap_idx;
	for (i = 0; i <= physmap_idx; i += 2) {
		if (base < physmap[i + 1]) {
			if (base + length <= physmap[i]) {
				/* Falls entirely before entry i. */
				insert_idx = i;
				break;
			}
			if (boothowto & RB_VERBOSE)
				printf(
		    "Overlapping memory regions, ignoring second region\n");
			return (1);
		}
	}

	/* See if we can prepend to the next entry. */
	if (insert_idx <= physmap_idx && base + length == physmap[insert_idx]) {
		physmap[insert_idx] = base;
		return (1);
	}

	/* See if we can append to the previous entry. */
	if (insert_idx > 0 && base == physmap[insert_idx - 1]) {
		physmap[insert_idx - 1] += length;
		return (1);
	}

	physmap_idx += 2;
	*physmap_idxp = physmap_idx;
	if (physmap_idx == PHYSMAP_SIZE) {
		printf(
		"Too many segments in the physical address map, giving up\n");
		return (0);
	}

	/*
	 * Move the last 'N' entries down to make room for the new
	 * entry if needed.
	 */
	for (i = (physmap_idx - 2); i > insert_idx; i -= 2) {
		physmap[i] = physmap[i - 2];
		physmap[i + 1] = physmap[i - 1];
	}

	/* Insert the new entry. */
	physmap[insert_idx] = base;
	physmap[insert_idx + 1] = base + length;
	return (1);
}
 1054 
 1055 void
 1056 bios_add_smap_entries(struct bios_smap *smapbase, u_int32_t smapsize,
 1057                       vm_paddr_t *physmap, int *physmap_idx)
 1058 {
 1059         struct bios_smap *smap, *smapend;
 1060 
 1061         smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize);
 1062 
 1063         for (smap = smapbase; smap < smapend; smap++) {
 1064                 if (boothowto & RB_VERBOSE)
 1065                         printf("SMAP type=%02x base=%016lx len=%016lx\n",
 1066                             smap->type, smap->base, smap->length);
 1067 
 1068                 if (smap->type != SMAP_TYPE_MEMORY)
 1069                         continue;
 1070 
 1071                 if (!add_physmap_entry(smap->base, smap->length, physmap,
 1072                     physmap_idx))
 1073                         break;
 1074         }
 1075 }
 1076 
 1077 static void
 1078 add_efi_map_entries(struct efi_map_header *efihdr, vm_paddr_t *physmap,
 1079     int *physmap_idx)
 1080 {
 1081         struct efi_md *map, *p;
 1082         const char *type;
 1083         size_t efisz;
 1084         int ndesc, i;
 1085 
 1086         static const char *types[] = {
 1087                 "Reserved",
 1088                 "LoaderCode",
 1089                 "LoaderData",
 1090                 "BootServicesCode",
 1091                 "BootServicesData",
 1092                 "RuntimeServicesCode",
 1093                 "RuntimeServicesData",
 1094                 "ConventionalMemory",
 1095                 "UnusableMemory",
 1096                 "ACPIReclaimMemory",
 1097                 "ACPIMemoryNVS",
 1098                 "MemoryMappedIO",
 1099                 "MemoryMappedIOPortSpace",
 1100                 "PalCode",
 1101                 "PersistentMemory"
 1102         };
 1103 
 1104         /*
 1105          * Memory map data provided by UEFI via the GetMemoryMap
 1106          * Boot Services API.
 1107          */
 1108         efisz = (sizeof(struct efi_map_header) + 0xf) & ~0xf;
 1109         map = (struct efi_md *)((uint8_t *)efihdr + efisz);
 1110 
 1111         if (efihdr->descriptor_size == 0)
 1112                 return;
 1113         ndesc = efihdr->memory_size / efihdr->descriptor_size;
 1114 
 1115         if (boothowto & RB_VERBOSE)
 1116                 printf("%23s %12s %12s %8s %4s\n",
 1117                     "Type", "Physical", "Virtual", "#Pages", "Attr");
 1118 
 1119         for (i = 0, p = map; i < ndesc; i++,
 1120             p = efi_next_descriptor(p, efihdr->descriptor_size)) {
 1121                 if (boothowto & RB_VERBOSE) {
 1122                         if (p->md_type < nitems(types))
 1123                                 type = types[p->md_type];
 1124                         else
 1125                                 type = "<INVALID>";
 1126                         printf("%23s %012lx %12p %08lx ", type, p->md_phys,
 1127                             p->md_virt, p->md_pages);
 1128                         if (p->md_attr & EFI_MD_ATTR_UC)
 1129                                 printf("UC ");
 1130                         if (p->md_attr & EFI_MD_ATTR_WC)
 1131                                 printf("WC ");
 1132                         if (p->md_attr & EFI_MD_ATTR_WT)
 1133                                 printf("WT ");
 1134                         if (p->md_attr & EFI_MD_ATTR_WB)
 1135                                 printf("WB ");
 1136                         if (p->md_attr & EFI_MD_ATTR_UCE)
 1137                                 printf("UCE ");
 1138                         if (p->md_attr & EFI_MD_ATTR_WP)
 1139                                 printf("WP ");
 1140                         if (p->md_attr & EFI_MD_ATTR_RP)
 1141                                 printf("RP ");
 1142                         if (p->md_attr & EFI_MD_ATTR_XP)
 1143                                 printf("XP ");
 1144                         if (p->md_attr & EFI_MD_ATTR_NV)
 1145                                 printf("NV ");
 1146                         if (p->md_attr & EFI_MD_ATTR_MORE_RELIABLE)
 1147                                 printf("MORE_RELIABLE ");
 1148                         if (p->md_attr & EFI_MD_ATTR_RO)
 1149                                 printf("RO ");
 1150                         if (p->md_attr & EFI_MD_ATTR_RT)
 1151                                 printf("RUNTIME");
 1152                         printf("\n");
 1153                 }
 1154 
 1155                 switch (p->md_type) {
 1156                 case EFI_MD_TYPE_CODE:
 1157                 case EFI_MD_TYPE_DATA:
 1158                 case EFI_MD_TYPE_BS_CODE:
 1159                 case EFI_MD_TYPE_BS_DATA:
 1160                 case EFI_MD_TYPE_FREE:
 1161                         /*
 1162                          * We're allowed to use any entry with these types.
 1163                          */
 1164                         break;
 1165                 default:
 1166                         continue;
 1167                 }
 1168 
 1169                 if (!add_physmap_entry(p->md_phys, (p->md_pages * PAGE_SIZE),
 1170                     physmap, physmap_idx))
 1171                         break;
 1172         }
 1173 }
 1174 
/* "BIOS" or "UEFI", set by native_parse_memmap(); exported read-only. */
static char bootmethod[16] = "";
SYSCTL_STRING(_machdep, OID_AUTO, bootmethod, CTLFLAG_RD, bootmethod, 0,
    "System firmware boot method");
 1178 
 1179 static void
 1180 native_parse_memmap(caddr_t kmdp, vm_paddr_t *physmap, int *physmap_idx)
 1181 {
 1182         struct bios_smap *smap;
 1183         struct efi_map_header *efihdr;
 1184         u_int32_t size;
 1185 
 1186         /*
 1187          * Memory map from INT 15:E820.
 1188          *
 1189          * subr_module.c says:
 1190          * "Consumer may safely assume that size value precedes data."
 1191          * ie: an int32_t immediately precedes smap.
 1192          */
 1193 
 1194         efihdr = (struct efi_map_header *)preload_search_info(kmdp,
 1195             MODINFO_METADATA | MODINFOMD_EFI_MAP);
 1196         smap = (struct bios_smap *)preload_search_info(kmdp,
 1197             MODINFO_METADATA | MODINFOMD_SMAP);
 1198         if (efihdr == NULL && smap == NULL)
 1199                 panic("No BIOS smap or EFI map info from loader!");
 1200 
 1201         if (efihdr != NULL) {
 1202                 add_efi_map_entries(efihdr, physmap, physmap_idx);
 1203                 strlcpy(bootmethod, "UEFI", sizeof(bootmethod));
 1204         } else {
 1205                 size = *((u_int32_t *)smap - 1);
 1206                 bios_add_smap_entries(smap, size, physmap, physmap_idx);
 1207                 strlcpy(bootmethod, "BIOS", sizeof(bootmethod));
 1208         }
 1209 }
 1210 
 1211 #define PAGES_PER_GB    (1024 * 1024 * 1024 / PAGE_SIZE)
 1212 
 1213 /*
 1214  * Populate the (physmap) array with base/bound pairs describing the
 1215  * available physical memory in the system, then test this memory and
 1216  * build the phys_avail array describing the actually-available memory.
 1217  *
 1218  * Total memory size may be set by the kernel environment variable
 1219  * hw.physmem or the compile-time define MAXMEM.
 1220  *
 1221  * XXX first should be vm_paddr_t.
 1222  */
static void
getmemsize(caddr_t kmdp, u_int64_t first)
{
	int i, physmap_idx, pa_indx, da_indx;
	vm_paddr_t pa, physmap[PHYSMAP_SIZE];
	u_long physmem_start, physmem_tunable, memtest;
	pt_entry_t *pte;
	quad_t dcons_addr, dcons_size;
	int page_counter;

	bzero(physmap, sizeof(physmap));
	physmap_idx = 0;

	init_ops.parse_memmap(kmdp, physmap, &physmap_idx);
	/* Back up from "next free slot" to the index of the last pair. */
	physmap_idx -= 2;

	/*
	 * Find the 'base memory' segment for SMP
	 */
	basemem = 0;
	for (i = 0; i <= physmap_idx; i += 2) {
		if (physmap[i] <= 0xA0000) {
			/* basemem is kept in kilobytes. */
			basemem = physmap[i + 1] / 1024;
			break;
		}
	}
	if (basemem == 0 || basemem > 640) {
		if (bootverbose)
			printf(
		"Memory map doesn't contain a basemem segment, faking it");
		basemem = 640;
	}

	/*
	 * Make hole for "AP -> long mode" bootstrap code.  The
	 * mp_bootaddress vector is only available when the kernel
	 * is configured to support APs and APs for the system start
	 * in 32bit mode (e.g. SMP bare metal).
	 */
	if (init_ops.mp_bootaddress) {
		if (physmap[1] >= 0x100000000)
			panic(
	"Basemem segment is not suitable for AP bootstrap code!");
		physmap[1] = init_ops.mp_bootaddress(physmap[1] / 1024);
	}

	/*
	 * Maxmem isn't the "maximum memory", it's one larger than the
	 * highest page of the physical address space.  It should be
	 * called something like "Maxphyspage".  We may adjust this
	 * based on ``hw.physmem'' and the results of the memory test.
	 */
	Maxmem = atop(physmap[physmap_idx + 1]);

#ifdef MAXMEM
	/* Compile-time override; MAXMEM is in KB, hence the / 4. */
	Maxmem = MAXMEM / 4;
#endif

	/* Run-time override via the loader environment. */
	if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable))
		Maxmem = atop(physmem_tunable);

	/*
	 * The boot memory test is disabled by default, as it takes a
	 * significant amount of time on large-memory systems, and is
	 * unfriendly to virtual machines as it unnecessarily touches all
	 * pages.
	 *
	 * A general name is used as the code may be extended to support
	 * additional tests beyond the current "page present" test.
	 */
	memtest = 0;
	TUNABLE_ULONG_FETCH("hw.memtest.tests", &memtest);

	/*
	 * Don't allow MAXMEM or hw.physmem to extend the amount of memory
	 * in the system.
	 */
	if (Maxmem > atop(physmap[physmap_idx + 1]))
		Maxmem = atop(physmap[physmap_idx + 1]);

	if (atop(physmap[physmap_idx + 1]) != Maxmem &&
	    (boothowto & RB_VERBOSE))
		printf("Physical memory use set to %ldK\n", Maxmem * 4);

	/* call pmap initialization to make new kernel address space */
	pmap_bootstrap(&first);

	/*
	 * Size up each available chunk of physical memory.
	 *
	 * XXX Some BIOSes corrupt low 64KB between suspend and resume.
	 * By default, mask off the first 16 pages unless we appear to be
	 * running in a VM.
	 */
	physmem_start = (vm_guest > VM_GUEST_NO ? 1 : 16) << PAGE_SHIFT;
	TUNABLE_ULONG_FETCH("hw.physmem.start", &physmem_start);
	if (physmap[0] < physmem_start) {
		if (physmem_start < PAGE_SIZE)
			physmap[0] = PAGE_SIZE;
		else if (physmem_start >= physmap[1])
			physmap[0] = round_page(physmap[1] - PAGE_SIZE);
		else
			physmap[0] = round_page(physmem_start);
	}
	pa_indx = 0;
	da_indx = 1;
	phys_avail[pa_indx++] = physmap[0];
	phys_avail[pa_indx] = physmap[0];
	dump_avail[da_indx] = physmap[0];
	/* Scratch PTE used to temporarily map each page under test. */
	pte = CMAP1;

	/*
	 * Get dcons buffer address
	 */
	if (getenv_quad("dcons.addr", &dcons_addr) == 0 ||
	    getenv_quad("dcons.size", &dcons_size) == 0)
		dcons_addr = 0;

	/*
	 * physmap is in bytes, so when converting to page boundaries,
	 * round up the start address and round down the end address.
	 */
	page_counter = 0;
	if (memtest != 0)
		printf("Testing system memory");
	for (i = 0; i <= physmap_idx; i += 2) {
		vm_paddr_t end;

		end = ptoa((vm_paddr_t)Maxmem);
		if (physmap[i + 1] < end)
			end = trunc_page(physmap[i + 1]);
		for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) {
			int tmp, page_bad, full;
			int *ptr = (int *)CADDR1;

			full = FALSE;
			/*
			 * block out kernel memory as not available.
			 */
			if (pa >= (vm_paddr_t)kernphys && pa < first)
				goto do_dump_avail;

			/*
			 * block out dcons buffer
			 */
			if (dcons_addr > 0
			    && pa >= trunc_page(dcons_addr)
			    && pa < dcons_addr + dcons_size)
				goto do_dump_avail;

			page_bad = FALSE;
			if (memtest == 0)
				goto skip_memtest;

			/*
			 * Print a "." every GB to show we're making
			 * progress.
			 */
			page_counter++;
			if ((page_counter % PAGES_PER_GB) == 0)
				printf(".");

			/*
			 * map page into kernel: valid, read/write,non-cacheable
			 */
			*pte = pa | PG_V | PG_RW | PG_NC_PWT | PG_NC_PCD;
			invltlb();

			/* Save the page's contents across the test. */
			tmp = *(int *)ptr;
			/*
			 * Test for alternating 1's and 0's
			 */
			*(volatile int *)ptr = 0xaaaaaaaa;
			if (*(volatile int *)ptr != 0xaaaaaaaa)
				page_bad = TRUE;
			/*
			 * Test for alternating 0's and 1's
			 */
			*(volatile int *)ptr = 0x55555555;
			if (*(volatile int *)ptr != 0x55555555)
				page_bad = TRUE;
			/*
			 * Test for all 1's
			 */
			*(volatile int *)ptr = 0xffffffff;
			if (*(volatile int *)ptr != 0xffffffff)
				page_bad = TRUE;
			/*
			 * Test for all 0's
			 */
			*(volatile int *)ptr = 0x0;
			if (*(volatile int *)ptr != 0x0)
				page_bad = TRUE;
			/*
			 * Restore original value.
			 */
			*(int *)ptr = tmp;

skip_memtest:
			/*
			 * Adjust array of valid/good pages.
			 */
			if (page_bad == TRUE)
				continue;
			/*
			 * If this good page is a continuation of the
			 * previous set of good pages, then just increase
			 * the end pointer. Otherwise start a new chunk.
			 * Note that "end" points one higher than end,
			 * making the range >= start and < end.
			 * If we're also doing a speculative memory
			 * test and we at or past the end, bump up Maxmem
			 * so that we keep going. The first bad page
			 * will terminate the loop.
			 */
			if (phys_avail[pa_indx] == pa) {
				phys_avail[pa_indx] += PAGE_SIZE;
			} else {
				pa_indx++;
				if (pa_indx == PHYS_AVAIL_ARRAY_END) {
					printf(
		"Too many holes in the physical address space, giving up\n");
					pa_indx--;
					full = TRUE;
					goto do_dump_avail;
				}
				phys_avail[pa_indx++] = pa;	/* start */
				phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */
			}
			physmem++;
do_dump_avail:
			/* dump_avail also covers pages excluded above. */
			if (dump_avail[da_indx] == pa) {
				dump_avail[da_indx] += PAGE_SIZE;
			} else {
				da_indx++;
				if (da_indx == DUMP_AVAIL_ARRAY_END) {
					da_indx--;
					goto do_next;
				}
				dump_avail[da_indx++] = pa; /* start */
				dump_avail[da_indx] = pa + PAGE_SIZE; /* end */
			}
do_next:
			if (full)
				break;
		}
	}
	/* Tear down the scratch mapping used for the memory test. */
	*pte = 0;
	invltlb();
	if (memtest != 0)
		printf("\n");

	/*
	 * XXX
	 * The last chunk must contain at least one page plus the message
	 * buffer to avoid complicating other code (message buffer address
	 * calculation, etc.).
	 */
	while (phys_avail[pa_indx - 1] + PAGE_SIZE +
	    round_page(msgbufsize) >= phys_avail[pa_indx]) {
		physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]);
		phys_avail[pa_indx--] = 0;
		phys_avail[pa_indx--] = 0;
	}

	Maxmem = atop(phys_avail[pa_indx]);

	/* Trim off space for the message buffer. */
	phys_avail[pa_indx] -= round_page(msgbufsize);

	/* Map the message buffer. */
	msgbufp = (struct msgbuf *)PHYS_TO_DMAP(phys_avail[pa_indx]);
}
 1496 
/*
 * Locate and consume the module metadata passed in by the loader:
 * relocate the preload area into KVA, set boothowto and the static
 * kernel environment, hand the symbol table to DDB, and record the
 * EFI system table address.  Returns the kernel module's metadata
 * pointer for later preload_search_info() queries.
 */
static caddr_t
native_parse_preload_data(u_int64_t modulep)
{
	caddr_t kmdp;
	char *envp;
#ifdef DDB
	vm_offset_t ksym_start;
	vm_offset_t ksym_end;
#endif

	/* modulep is a physical address; convert to a KVA pointer. */
	preload_metadata = (caddr_t)(uintptr_t)(modulep + KERNBASE);
	preload_bootstrap_relocate(KERNBASE);
	kmdp = preload_search_by_type("elf kernel");
	if (kmdp == NULL)
		kmdp = preload_search_by_type("elf64 kernel");
	boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int);
	envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *);
	/* The environment pointer needs the same relocation. */
	if (envp != NULL)
		envp += KERNBASE;
	init_static_kenv(envp, 0);
#ifdef DDB
	ksym_start = MD_FETCH(kmdp, MODINFOMD_SSYM, uintptr_t);
	ksym_end = MD_FETCH(kmdp, MODINFOMD_ESYM, uintptr_t);
	db_fetch_ksymtab(ksym_start, ksym_end);
#endif
	efi_systbl_phys = MD_FETCH(kmdp, MODINFOMD_FW_HANDLE, vm_paddr_t);

	return (kmdp);
}
 1526 
/*
 * Initialize the kernel debugger framework and, if RB_KDB was set in
 * the boot flags, drop into the debugger immediately.
 */
static void
amd64_kdb_init(void)
{
	kdb_init();
#ifdef KDB
	if (boothowto & RB_KDB)
		kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger");
#endif
}
 1536 
/*
 * Set up the fast syscall (SYSCALL/SYSRET) machinery: enable it in
 * EFER and program the entry points, selectors and flag mask in the
 * STAR family of MSRs.
 */
void
amd64_conf_fast_syscall(void)
{
	uint64_t msr;

	/* Enable the SYSCALL/SYSRET instructions (EFER.SCE). */
	msr = rdmsr(MSR_EFER) | EFER_SCE;
	wrmsr(MSR_EFER, msr);
	/*
	 * 64-bit SYSCALL entry point; use the PTI variant when
	 * page-table isolation is enabled.
	 */
	wrmsr(MSR_LSTAR, pti ? (u_int64_t)IDTVEC(fast_syscall_pti) :
	    (u_int64_t)IDTVEC(fast_syscall));
	/* Entry point for SYSCALL issued from 32-bit compat mode. */
	wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32));
	/* Kernel and user segment selector bases for SYSCALL/SYSRET. */
	msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
	    ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48);
	wrmsr(MSR_STAR, msr);
	/* RFLAGS bits to clear on syscall entry. */
	wrmsr(MSR_SF_MASK, PSL_NT | PSL_T | PSL_I | PSL_C | PSL_D);
}
 1553 
/*
 * amd64 machine-dependent bootstrap, called from locore with the
 * preloaded-module pointer and the first free physical address.
 * Brings up the boot CPU: GDT/IDT, per-CPU data, TSS interrupt
 * stacks, fast syscall MSRs, console and debugger, memory sizing,
 * and thread0's pcb.  Returns the address of thread0's pcb, which
 * locore uses as the initial kernel stack pointer.
 */
u_int64_t
hammer_time(u_int64_t modulep, u_int64_t physfree)
{
	caddr_t kmdp;
	int gsel_tss, x;
	struct pcpu *pc;
	struct nmi_pcpu *np;
	struct xstate_hdr *xhdr;
	u_int64_t rsp0;
	char *env;
	size_t kstack0_sz;
	int late_console;

	kmdp = init_ops.parse_preload_data(modulep);

	/* Early CPU identification; finished by finishidentcpu() below. */
	identify_cpu1();
	identify_hypervisor();
	/*
	 * hw.cpu_stdext_disable is ignored by the call, it will be
	 * re-evaluted by the below call to finishidentcpu().
	 */
	identify_cpu2();

	link_elf_ireloc(kmdp);

	/*
	 * This may be done better later if it gets more high level
	 * components in it. If so just link td->td_proc here.
	 */
	proc_linkup0(&proc0, &thread0);

	/* Init basic tunables, hz etc */
	init_param1();

	/* Carve thread0's kernel stack out of the free physical memory. */
	thread0.td_kstack = physfree + KERNBASE;
	thread0.td_kstack_pages = kstack_pages;
	kstack0_sz = thread0.td_kstack_pages * PAGE_SIZE;
	bzero((void *)thread0.td_kstack, kstack0_sz);
	physfree += kstack0_sz;

	/*
	 * make gdt memory segments; the TSS and LDT slots are system
	 * descriptors (two GDT entries each) and are handled separately.
	 */
	for (x = 0; x < NGDT; x++) {
		if (x != GPROC0_SEL && x != (GPROC0_SEL + 1) &&
		    x != GUSERLDT_SEL && x != (GUSERLDT_SEL) + 1)
			ssdtosd(&gdt_segs[x], &gdt[x]);
	}
	gdt_segs[GPROC0_SEL].ssd_base = (uintptr_t)&common_tss[0];
	ssdtosyssd(&gdt_segs[GPROC0_SEL],
	    (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);

	r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
	r_gdt.rd_base =  (long) gdt;
	lgdt(&r_gdt);
	pc = &__pcpu[0];

	/* Point %gs at the boot CPU's pcpu area. */
	wrmsr(MSR_FSBASE, 0);		/* User value */
	wrmsr(MSR_GSBASE, (u_int64_t)pc);
	wrmsr(MSR_KGSBASE, 0);		/* User value while in the kernel */

	pcpu_init(pc, 0, sizeof(struct pcpu));
	dpcpu_init((void *)(physfree + KERNBASE), 0);
	physfree += DPCPU_SIZE;
	PCPU_SET(prvspace, pc);
	PCPU_SET(curthread, &thread0);
	/* Non-late cninit() and printf() can be moved up to here. */
	PCPU_SET(tssp, &common_tss[0]);
	PCPU_SET(commontssp, &common_tss[0]);
	PCPU_SET(tss, (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);
	PCPU_SET(ldt, (struct system_segment_descriptor *)&gdt[GUSERLDT_SEL]);
	PCPU_SET(fs32p, &gdt[GUFS32_SEL]);
	PCPU_SET(gs32p, &gdt[GUGS32_SEL]);

	/*
	 * Initialize mutexes.
	 *
	 * icu_lock: in order to allow an interrupt to occur in a critical
	 *           section, to set pcpu->ipending (etc...) properly, we
	 *           must be able to get the icu lock, so it can't be
	 *           under witness.
	 */
	mutex_init();
	mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS);
	mtx_init(&dt_lock, "descriptor tables", NULL, MTX_DEF);

	/*
	 * exceptions: decide on page-table isolation first, since each
	 * vector below has a PTI trampoline variant.
	 */
	pti = pti_get_default();
	TUNABLE_INT_FETCH("vm.pmap.pti", &pti);

	/* Fill the whole IDT with the reserved-vector handler first. */
	for (x = 0; x < NIDT; x++)
		setidt(x, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd), SDT_SYSIGT,
		    SEL_KPL, 0);
	setidt(IDT_DE, pti ? &IDTVEC(div_pti) : &IDTVEC(div), SDT_SYSIGT,
	    SEL_KPL, 0);
	/* DB#, NMI, DF# and MC# run on dedicated IST stacks (4, 2, 1, 3). */
	setidt(IDT_DB, &IDTVEC(dbg), SDT_SYSIGT, SEL_KPL, 4);
	setidt(IDT_NMI, &IDTVEC(nmi),  SDT_SYSIGT, SEL_KPL, 2);
	/* BP# and OF# are SEL_UPL so userland may raise them directly. */
	setidt(IDT_BP, pti ? &IDTVEC(bpt_pti) : &IDTVEC(bpt), SDT_SYSIGT,
	    SEL_UPL, 0);
	setidt(IDT_OF, pti ? &IDTVEC(ofl_pti) : &IDTVEC(ofl), SDT_SYSIGT,
	    SEL_UPL, 0);
	setidt(IDT_BR, pti ? &IDTVEC(bnd_pti) : &IDTVEC(bnd), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_UD, pti ? &IDTVEC(ill_pti) : &IDTVEC(ill), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_NM, pti ? &IDTVEC(dna_pti) : &IDTVEC(dna), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1);
	setidt(IDT_FPUGP, pti ? &IDTVEC(fpusegm_pti) : &IDTVEC(fpusegm),
	    SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_TS, pti ? &IDTVEC(tss_pti) : &IDTVEC(tss), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_NP, pti ? &IDTVEC(missing_pti) : &IDTVEC(missing),
	    SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_SS, pti ? &IDTVEC(stk_pti) : &IDTVEC(stk), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_GP, pti ? &IDTVEC(prot_pti) : &IDTVEC(prot), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_PF, pti ? &IDTVEC(page_pti) : &IDTVEC(page), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_MF, pti ? &IDTVEC(fpu_pti) : &IDTVEC(fpu), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_AC, pti ? &IDTVEC(align_pti) : &IDTVEC(align), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_MC, &IDTVEC(mchk), SDT_SYSIGT, SEL_KPL, 3);
	setidt(IDT_XF, pti ? &IDTVEC(xmm_pti) : &IDTVEC(xmm), SDT_SYSIGT,
	    SEL_KPL, 0);
#ifdef KDTRACE_HOOKS
	setidt(IDT_DTRACE_RET, pti ? &IDTVEC(dtrace_ret_pti) :
	    &IDTVEC(dtrace_ret), SDT_SYSIGT, SEL_UPL, 0);
#endif
#ifdef XENHVM
	setidt(IDT_EVTCHN, pti ? &IDTVEC(xen_intr_upcall_pti) :
	    &IDTVEC(xen_intr_upcall), SDT_SYSIGT, SEL_KPL, 0);
#endif
	r_idt.rd_limit = sizeof(idt0) - 1;
	r_idt.rd_base = (long) idt;
	lidt(&r_idt);

	/*
	 * Initialize the clock before the console so that console
	 * initialization can use DELAY().
	 */
	clock_init();

	/*
	 * Use vt(4) by default for UEFI boot (during the sc(4)/vt(4)
	 * transition).
	 * Once bootblocks have updated, we can test directly for
	 * efi_systbl != NULL here...
	 */
	if (preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_EFI_MAP)
	    != NULL)
		vty_set_preferred(VTY_VT);

	finishidentcpu();	/* Final stage of CPU initialization */
	initializecpu();	/* Initialize CPU registers */
	initializecpucache();

	/* doublefault stack space, runs on ist1 */
	common_tss[0].tss_ist1 = (long)&dblfault_stack[sizeof(dblfault_stack)];

	/*
	 * NMI stack, runs on ist2.  The pcpu pointer is stored just
	 * above the start of the ist2 stack.
	 */
	np = ((struct nmi_pcpu *) &nmi0_stack[sizeof(nmi0_stack)]) - 1;
	np->np_pcpu = (register_t) pc;
	common_tss[0].tss_ist2 = (long) np;

	/*
	 * MC# stack, runs on ist3.  The pcpu pointer is stored just
	 * above the start of the ist3 stack.
	 */
	np = ((struct nmi_pcpu *) &mce0_stack[sizeof(mce0_stack)]) - 1;
	np->np_pcpu = (register_t) pc;
	common_tss[0].tss_ist3 = (long) np;

	/*
	 * DB# stack, runs on ist4.
	 */
	np = ((struct nmi_pcpu *) &dbg0_stack[sizeof(dbg0_stack)]) - 1;
	np->np_pcpu = (register_t) pc;
	common_tss[0].tss_ist4 = (long) np;

	/* Set the IO permission bitmap (empty due to tss seg limit) */
	common_tss[0].tss_iobase = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE;

	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
	ltr(gsel_tss);

	amd64_conf_fast_syscall();

	/*
	 * Temporary forge some valid pointer to PCB, for exception
	 * handlers.  It is reinitialized properly below after FPU is
	 * set up.  Also set up td_critnest to short-cut the page
	 * fault handler.
	 */
	cpu_max_ext_state_size = sizeof(struct savefpu);
	thread0.td_pcb = get_pcb_td(&thread0);
	thread0.td_critnest = 1;

	/*
	 * The console and kdb should be initialized even earlier than here,
	 * but some console drivers don't work until after getmemsize().
	 * Default to late console initialization to support these drivers.
	 * This loses mainly printf()s in getmemsize() and early debugging.
	 */
	late_console = 1;
	TUNABLE_INT_FETCH("debug.late_console", &late_console);
	if (!late_console) {
		cninit();
		amd64_kdb_init();
	}

	getmemsize(kmdp, physfree);
	init_param2(physmem);

	/* now running on new page tables, configured,and u/iom is accessible */

	if (late_console)
		cninit();

#ifdef DEV_ISA
#ifdef DEV_ATPIC
	elcr_probe();
	atpic_startup();
#else
	/* Reset and mask the atpics and leave them shut down. */
	atpic_reset();

	/*
	 * Point the ICU spurious interrupt vectors at the APIC spurious
	 * interrupt handler.
	 */
	setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
#endif
#else
#error "have you forgotten the isa device?";
#endif

	if (late_console)
		amd64_kdb_init();

	msgbufinit(msgbufp, msgbufsize);
	fpuinit();

	/*
	 * Set up thread0 pcb after fpuinit calculated pcb + fpu save
	 * area size.  Zero out the extended state header in fpu save
	 * area.
	 */
	thread0.td_pcb = get_pcb_td(&thread0);
	thread0.td_pcb->pcb_save = get_pcb_user_save_td(&thread0);
	bzero(get_pcb_user_save_td(&thread0), cpu_max_ext_state_size);
	if (use_xsave) {
		xhdr = (struct xstate_hdr *)(get_pcb_user_save_td(&thread0) +
		    1);
		xhdr->xstate_bv = xsave_mask;
	}
	/* make an initial tss so cpu can get interrupt stack on syscall! */
	rsp0 = (vm_offset_t)thread0.td_pcb;
	/* Ensure the stack is aligned to 16 bytes */
	rsp0 &= ~0xFul;
	/* With PTI the CPU lands on the per-CPU PTI stack first. */
	common_tss[0].tss_rsp0 = pti ? ((vm_offset_t)PCPU_PTR(pti_stack) +
	    PC_PTI_STACK_SZ * sizeof(uint64_t)) & ~0xful : rsp0;
	PCPU_SET(rsp0, rsp0);
	PCPU_SET(curpcb, thread0.td_pcb);

	/* transfer to user mode */

	_ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
	_udatasel = GSEL(GUDATA_SEL, SEL_UPL);
	_ucode32sel = GSEL(GUCODE32_SEL, SEL_UPL);
	_ufssel = GSEL(GUFS32_SEL, SEL_UPL);
	_ugssel = GSEL(GUGS32_SEL, SEL_UPL);

	load_ds(_udatasel);
	load_es(_udatasel);
	load_fs(_ufssel);

	/* setup proc 0's pcb */
	thread0.td_pcb->pcb_flags = 0;
	thread0.td_frame = &proc0_tf;

	env = kern_getenv("kernelname");
	if (env != NULL)
		strlcpy(kernelname, env, sizeof(kernelname));

	cpu_probe_amdc1e();

#ifdef FDT
	x86_init_fdt();
#endif
	thread0.td_critnest = 0;

	/* Speculative-execution mitigation tunables. */
	TUNABLE_INT_FETCH("hw.ibrs_disable", &hw_ibrs_disable);
	TUNABLE_INT_FETCH("hw.spec_store_bypass_disable", &hw_ssb_disable);

	/* Location of kernel stack for locore */
	return ((u_int64_t)thread0.td_pcb);
}
 1858 
/*
 * Machine-dependent per-CPU data initialization hook.  Marks the ACPI
 * CPU id as unset (0xffffffff); presumably filled in later by platform
 * enumeration code — not visible here.
 */
void
cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size)
{

	pcpu->pc_acpi_id = 0xffffffff;
}
 1865 
 1866 static int
 1867 smap_sysctl_handler(SYSCTL_HANDLER_ARGS)
 1868 {
 1869         struct bios_smap *smapbase;
 1870         struct bios_smap_xattr smap;
 1871         caddr_t kmdp;
 1872         uint32_t *smapattr;
 1873         int count, error, i;
 1874 
 1875         /* Retrieve the system memory map from the loader. */
 1876         kmdp = preload_search_by_type("elf kernel");
 1877         if (kmdp == NULL)
 1878                 kmdp = preload_search_by_type("elf64 kernel");
 1879         smapbase = (struct bios_smap *)preload_search_info(kmdp,
 1880             MODINFO_METADATA | MODINFOMD_SMAP);
 1881         if (smapbase == NULL)
 1882                 return (0);
 1883         smapattr = (uint32_t *)preload_search_info(kmdp,
 1884             MODINFO_METADATA | MODINFOMD_SMAP_XATTR);
 1885         count = *((uint32_t *)smapbase - 1) / sizeof(*smapbase);
 1886         error = 0;
 1887         for (i = 0; i < count; i++) {
 1888                 smap.base = smapbase[i].base;
 1889                 smap.length = smapbase[i].length;
 1890                 smap.type = smapbase[i].type;
 1891                 if (smapattr != NULL)
 1892                         smap.xattr = smapattr[i];
 1893                 else
 1894                         smap.xattr = 0;
 1895                 error = SYSCTL_OUT(req, &smap, sizeof(smap));
 1896         }
 1897         return (error);
 1898 }
 1899 SYSCTL_PROC(_machdep, OID_AUTO, smap, CTLTYPE_OPAQUE|CTLFLAG_RD, NULL, 0,
 1900     smap_sysctl_handler, "S,bios_smap_xattr", "Raw BIOS SMAP data");
 1901 
 1902 static int
 1903 efi_map_sysctl_handler(SYSCTL_HANDLER_ARGS)
 1904 {
 1905         struct efi_map_header *efihdr;
 1906         caddr_t kmdp;
 1907         uint32_t efisize;
 1908 
 1909         kmdp = preload_search_by_type("elf kernel");
 1910         if (kmdp == NULL)
 1911                 kmdp = preload_search_by_type("elf64 kernel");
 1912         efihdr = (struct efi_map_header *)preload_search_info(kmdp,
 1913             MODINFO_METADATA | MODINFOMD_EFI_MAP);
 1914         if (efihdr == NULL)
 1915                 return (0);
 1916         efisize = *((uint32_t *)efihdr - 1);
 1917         return (SYSCTL_OUT(req, efihdr, efisize));
 1918 }
 1919 SYSCTL_PROC(_machdep, OID_AUTO, efi_map, CTLTYPE_OPAQUE|CTLFLAG_RD, NULL, 0,
 1920     efi_map_sysctl_handler, "S,efi_map_header", "Raw EFI Memory Map");
 1921 
/*
 * Enter a spinlock section: disable interrupts on the outermost
 * acquire (saving the previous rflags so spinlock_exit() can restore
 * them) and track nesting in the per-thread spinlock count.  A
 * critical section is entered in all cases.
 */
void
spinlock_enter(void)
{
	struct thread *td;
	register_t flags;

	td = curthread;
	if (td->td_md.md_spinlock_count == 0) {
		/* Outermost acquire: disable interrupts, remember state. */
		flags = intr_disable();
		td->td_md.md_spinlock_count = 1;
		td->td_md.md_saved_flags = flags;
	} else
		td->td_md.md_spinlock_count++;
	critical_enter();
}
 1937 
/*
 * Leave a spinlock section: mirror of spinlock_enter().  The saved
 * rflags (and hence the interrupt state) are restored only when the
 * nesting count drops back to zero.
 */
void
spinlock_exit(void)
{
	struct thread *td;
	register_t flags;

	td = curthread;
	critical_exit();
	/* Read the saved flags before dropping the count. */
	flags = td->td_md.md_saved_flags;
	td->td_md.md_spinlock_count--;
	if (td->td_md.md_spinlock_count == 0)
		intr_restore(flags);
}
 1951 
 1952 /*
 1953  * Construct a PCB from a trapframe. This is called from kdb_trap() where
 1954  * we want to start a backtrace from the function that caused us to enter
 1955  * the debugger. We have the context in the trapframe, but base the trace
 1956  * on the PCB. The PCB doesn't have to be perfect, as long as it contains
 1957  * enough for a backtrace.
 1958  */
 1959 void
 1960 makectx(struct trapframe *tf, struct pcb *pcb)
 1961 {
 1962 
 1963         pcb->pcb_r12 = tf->tf_r12;
 1964         pcb->pcb_r13 = tf->tf_r13;
 1965         pcb->pcb_r14 = tf->tf_r14;
 1966         pcb->pcb_r15 = tf->tf_r15;
 1967         pcb->pcb_rbp = tf->tf_rbp;
 1968         pcb->pcb_rbx = tf->tf_rbx;
 1969         pcb->pcb_rip = tf->tf_rip;
 1970         pcb->pcb_rsp = tf->tf_rsp;
 1971 }
 1972 
/*
 * ptrace support: set the traced thread's instruction pointer and
 * request the full iret return path (PCB_FULL_IRET) so the modified
 * frame takes effect on return to user mode.  Always returns 0.
 */
int
ptrace_set_pc(struct thread *td, unsigned long addr)
{

	td->td_frame->tf_rip = addr;
	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
	return (0);
}
 1981 
/*
 * ptrace support: arm single-stepping by setting the trap flag (PSL_T)
 * in the thread's saved rflags.  Always returns 0.
 */
int
ptrace_single_step(struct thread *td)
{
	td->td_frame->tf_rflags |= PSL_T;
	return (0);
}
 1988 
/*
 * ptrace support: disarm single-stepping by clearing the trap flag
 * (PSL_T) in the thread's saved rflags.  Always returns 0.
 */
int
ptrace_clear_single_step(struct thread *td)
{
	td->td_frame->tf_rflags &= ~PSL_T;
	return (0);
}
 1995 
 1996 int
 1997 fill_regs(struct thread *td, struct reg *regs)
 1998 {
 1999         struct trapframe *tp;
 2000 
 2001         tp = td->td_frame;
 2002         return (fill_frame_regs(tp, regs));
 2003 }
 2004 
 2005 int
 2006 fill_frame_regs(struct trapframe *tp, struct reg *regs)
 2007 {
 2008         regs->r_r15 = tp->tf_r15;
 2009         regs->r_r14 = tp->tf_r14;
 2010         regs->r_r13 = tp->tf_r13;
 2011         regs->r_r12 = tp->tf_r12;
 2012         regs->r_r11 = tp->tf_r11;
 2013         regs->r_r10 = tp->tf_r10;
 2014         regs->r_r9  = tp->tf_r9;
 2015         regs->r_r8  = tp->tf_r8;
 2016         regs->r_rdi = tp->tf_rdi;
 2017         regs->r_rsi = tp->tf_rsi;
 2018         regs->r_rbp = tp->tf_rbp;
 2019         regs->r_rbx = tp->tf_rbx;
 2020         regs->r_rdx = tp->tf_rdx;
 2021         regs->r_rcx = tp->tf_rcx;
 2022         regs->r_rax = tp->tf_rax;
 2023         regs->r_rip = tp->tf_rip;
 2024         regs->r_cs = tp->tf_cs;
 2025         regs->r_rflags = tp->tf_rflags;
 2026         regs->r_rsp = tp->tf_rsp;
 2027         regs->r_ss = tp->tf_ss;
 2028         if (tp->tf_flags & TF_HASSEGS) {
 2029                 regs->r_ds = tp->tf_ds;
 2030                 regs->r_es = tp->tf_es;
 2031                 regs->r_fs = tp->tf_fs;
 2032                 regs->r_gs = tp->tf_gs;
 2033         } else {
 2034                 regs->r_ds = 0;
 2035                 regs->r_es = 0;
 2036                 regs->r_fs = 0;
 2037                 regs->r_gs = 0;
 2038         }
 2039         return (0);
 2040 }
 2041 
/*
 * Install a user-supplied register set into a thread's trapframe.
 * The new rflags and %cs are validated first (EFL_SECURE/CS_SECURE)
 * so userland cannot gain privileged flag bits or a kernel code
 * selector; returns EINVAL on rejection, 0 on success.
 */
int
set_regs(struct thread *td, struct reg *regs)
{
	struct trapframe *tp;
	register_t rflags;

	tp = td->td_frame;
	/* Only the low 32 bits of rflags are architecturally defined. */
	rflags = regs->r_rflags & 0xffffffff;
	if (!EFL_SECURE(rflags, tp->tf_rflags) || !CS_SECURE(regs->r_cs))
		return (EINVAL);
	tp->tf_r15 = regs->r_r15;
	tp->tf_r14 = regs->r_r14;
	tp->tf_r13 = regs->r_r13;
	tp->tf_r12 = regs->r_r12;
	tp->tf_r11 = regs->r_r11;
	tp->tf_r10 = regs->r_r10;
	tp->tf_r9  = regs->r_r9;
	tp->tf_r8  = regs->r_r8;
	tp->tf_rdi = regs->r_rdi;
	tp->tf_rsi = regs->r_rsi;
	tp->tf_rbp = regs->r_rbp;
	tp->tf_rbx = regs->r_rbx;
	tp->tf_rdx = regs->r_rdx;
	tp->tf_rcx = regs->r_rcx;
	tp->tf_rax = regs->r_rax;
	tp->tf_rip = regs->r_rip;
	tp->tf_cs = regs->r_cs;
	tp->tf_rflags = rflags;
	tp->tf_rsp = regs->r_rsp;
	tp->tf_ss = regs->r_ss;
	/* Segment register updates deliberately disabled. */
	if (0) {	/* XXXKIB */
		tp->tf_ds = regs->r_ds;
		tp->tf_es = regs->r_es;
		tp->tf_fs = regs->r_fs;
		tp->tf_gs = regs->r_gs;
		tp->tf_flags = TF_HASSEGS;
	}
	/* Force the full iret path so the new frame is honored. */
	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
	return (0);
}
 2082 
 2083 /* XXX check all this stuff! */
 2084 /* externalize from sv_xmm */
 2085 static void
 2086 fill_fpregs_xmm(struct savefpu *sv_xmm, struct fpreg *fpregs)
 2087 {
 2088         struct envxmm *penv_fpreg = (struct envxmm *)&fpregs->fpr_env;
 2089         struct envxmm *penv_xmm = &sv_xmm->sv_env;
 2090         int i;
 2091 
 2092         /* pcb -> fpregs */
 2093         bzero(fpregs, sizeof(*fpregs));
 2094 
 2095         /* FPU control/status */
 2096         penv_fpreg->en_cw = penv_xmm->en_cw;
 2097         penv_fpreg->en_sw = penv_xmm->en_sw;
 2098         penv_fpreg->en_tw = penv_xmm->en_tw;
 2099         penv_fpreg->en_opcode = penv_xmm->en_opcode;
 2100         penv_fpreg->en_rip = penv_xmm->en_rip;
 2101         penv_fpreg->en_rdp = penv_xmm->en_rdp;
 2102         penv_fpreg->en_mxcsr = penv_xmm->en_mxcsr;
 2103         penv_fpreg->en_mxcsr_mask = penv_xmm->en_mxcsr_mask;
 2104 
 2105         /* FPU registers */
 2106         for (i = 0; i < 8; ++i)
 2107                 bcopy(sv_xmm->sv_fp[i].fp_acc.fp_bytes, fpregs->fpr_acc[i], 10);
 2108 
 2109         /* SSE registers */
 2110         for (i = 0; i < 16; ++i)
 2111                 bcopy(sv_xmm->sv_xmm[i].xmm_bytes, fpregs->fpr_xacc[i], 16);
 2112 }
 2113 
 2114 /* internalize from fpregs into sv_xmm */
 2115 static void
 2116 set_fpregs_xmm(struct fpreg *fpregs, struct savefpu *sv_xmm)
 2117 {
 2118         struct envxmm *penv_xmm = &sv_xmm->sv_env;
 2119         struct envxmm *penv_fpreg = (struct envxmm *)&fpregs->fpr_env;
 2120         int i;
 2121 
 2122         /* fpregs -> pcb */
 2123         /* FPU control/status */
 2124         penv_xmm->en_cw = penv_fpreg->en_cw;
 2125         penv_xmm->en_sw = penv_fpreg->en_sw;
 2126         penv_xmm->en_tw = penv_fpreg->en_tw;
 2127         penv_xmm->en_opcode = penv_fpreg->en_opcode;
 2128         penv_xmm->en_rip = penv_fpreg->en_rip;
 2129         penv_xmm->en_rdp = penv_fpreg->en_rdp;
 2130         penv_xmm->en_mxcsr = penv_fpreg->en_mxcsr;
 2131         penv_xmm->en_mxcsr_mask = penv_fpreg->en_mxcsr_mask & cpu_mxcsr_mask;
 2132 
 2133         /* FPU registers */
 2134         for (i = 0; i < 8; ++i)
 2135                 bcopy(fpregs->fpr_acc[i], sv_xmm->sv_fp[i].fp_acc.fp_bytes, 10);
 2136 
 2137         /* SSE registers */
 2138         for (i = 0; i < 16; ++i)
 2139                 bcopy(fpregs->fpr_xacc[i], sv_xmm->sv_xmm[i].xmm_bytes, 16);
 2140 }
 2141 
/*
 * Externalize FPU state from td->pcb into *fpregs.  The target thread
 * must be stopped (curthread, suspended, or its process stopping) so
 * its FPU state is stable while we copy it.  Always returns 0.
 */
int
fill_fpregs(struct thread *td, struct fpreg *fpregs)
{

	KASSERT(td == curthread || TD_IS_SUSPENDED(td) ||
	    P_SHOULDSTOP(td->td_proc),
	    ("not suspended thread %p", td));
	/* Flush live FPU contents into the pcb save area first. */
	fpugetregs(td);
	fill_fpregs_xmm(get_pcb_user_save_td(td), fpregs);
	return (0);
}
 2154 
/*
 * Internalize FPU state from *fpregs into td->pcb.  Done inside a
 * critical section so the save area is not reloaded mid-update.
 * Always returns 0.
 */
int
set_fpregs(struct thread *td, struct fpreg *fpregs)
{

	critical_enter();
	set_fpregs_xmm(fpregs, get_pcb_user_save_td(td));
	/* Mark the user FPU state as initialized from the new contents. */
	fpuuserinited(td);
	critical_exit();
	return (0);
}
 2166 
/*
 * Get machine context.
 *
 * Snapshot the thread's trapframe, FPU state and segment bases into
 * *mcp.  With GET_MC_CLEAR_RET, the syscall return registers
 * (rax/rdx) and the carry flag are cleared in the copy.  Always
 * returns 0.
 */
int
get_mcontext(struct thread *td, mcontext_t *mcp, int flags)
{
	struct pcb *pcb;
	struct trapframe *tp;

	pcb = td->td_pcb;
	tp = td->td_frame;
	/* sigonstack() consults the process sigaltstack state. */
	PROC_LOCK(curthread->td_proc);
	mcp->mc_onstack = sigonstack(tp->tf_rsp);
	PROC_UNLOCK(curthread->td_proc);
	mcp->mc_r15 = tp->tf_r15;
	mcp->mc_r14 = tp->tf_r14;
	mcp->mc_r13 = tp->tf_r13;
	mcp->mc_r12 = tp->tf_r12;
	mcp->mc_r11 = tp->tf_r11;
	mcp->mc_r10 = tp->tf_r10;
	mcp->mc_r9  = tp->tf_r9;
	mcp->mc_r8  = tp->tf_r8;
	mcp->mc_rdi = tp->tf_rdi;
	mcp->mc_rsi = tp->tf_rsi;
	mcp->mc_rbp = tp->tf_rbp;
	mcp->mc_rbx = tp->tf_rbx;
	mcp->mc_rcx = tp->tf_rcx;
	mcp->mc_rflags = tp->tf_rflags;
	if (flags & GET_MC_CLEAR_RET) {
		/* Hide the interrupted syscall's return values. */
		mcp->mc_rax = 0;
		mcp->mc_rdx = 0;
		mcp->mc_rflags &= ~PSL_C;
	} else {
		mcp->mc_rax = tp->tf_rax;
		mcp->mc_rdx = tp->tf_rdx;
	}
	mcp->mc_rip = tp->tf_rip;
	mcp->mc_cs = tp->tf_cs;
	mcp->mc_rsp = tp->tf_rsp;
	mcp->mc_ss = tp->tf_ss;
	mcp->mc_ds = tp->tf_ds;
	mcp->mc_es = tp->tf_es;
	mcp->mc_fs = tp->tf_fs;
	mcp->mc_gs = tp->tf_gs;
	mcp->mc_flags = tp->tf_flags;
	mcp->mc_len = sizeof(*mcp);
	/* Base FPU state only; no extended xsave area requested here. */
	get_fpcontext(td, mcp, NULL, 0);
	update_pcb_bases(pcb);
	mcp->mc_fsbase = pcb->pcb_fsbase;
	mcp->mc_gsbase = pcb->pcb_gsbase;
	mcp->mc_xfpustate = 0;
	mcp->mc_xfpustate_len = 0;
	bzero(mcp->mc_spare, sizeof(mcp->mc_spare));
	return (0);
}
 2222 
/*
 * Set machine context.
 *
 * However, we don't set any but the user modifiable flags, and we won't
 * touch the cs selector.  Validates mc_len and mc_flags, copies in any
 * extended FPU state, then installs the registers into the trapframe.
 * Returns 0 on success, EINVAL on malformed input, or a copyin/FPU
 * error.
 */
int
set_mcontext(struct thread *td, mcontext_t *mcp)
{
	struct pcb *pcb;
	struct trapframe *tp;
	char *xfpustate;
	long rflags;
	int ret;

	pcb = td->td_pcb;
	tp = td->td_frame;
	if (mcp->mc_len != sizeof(*mcp) ||
	    (mcp->mc_flags & ~_MC_FLAG_MASK) != 0)
		return (EINVAL);
	/* Keep privileged rflags bits from the current frame. */
	rflags = (mcp->mc_rflags & PSL_USERCHANGE) |
	    (tp->tf_rflags & ~PSL_USERCHANGE);
	if (mcp->mc_flags & _MC_HASFPXSTATE) {
		if (mcp->mc_xfpustate_len > cpu_max_ext_state_size -
		    sizeof(struct savefpu))
			return (EINVAL);
		/* Bounded by the length check above. */
		xfpustate = __builtin_alloca(mcp->mc_xfpustate_len);
		ret = copyin((void *)mcp->mc_xfpustate, xfpustate,
		    mcp->mc_xfpustate_len);
		if (ret != 0)
			return (ret);
	} else
		xfpustate = NULL;
	/* Install FPU state before touching the trapframe. */
	ret = set_fpcontext(td, mcp, xfpustate, mcp->mc_xfpustate_len);
	if (ret != 0)
		return (ret);
	tp->tf_r15 = mcp->mc_r15;
	tp->tf_r14 = mcp->mc_r14;
	tp->tf_r13 = mcp->mc_r13;
	tp->tf_r12 = mcp->mc_r12;
	tp->tf_r11 = mcp->mc_r11;
	tp->tf_r10 = mcp->mc_r10;
	tp->tf_r9  = mcp->mc_r9;
	tp->tf_r8  = mcp->mc_r8;
	tp->tf_rdi = mcp->mc_rdi;
	tp->tf_rsi = mcp->mc_rsi;
	tp->tf_rbp = mcp->mc_rbp;
	tp->tf_rbx = mcp->mc_rbx;
	tp->tf_rdx = mcp->mc_rdx;
	tp->tf_rcx = mcp->mc_rcx;
	tp->tf_rax = mcp->mc_rax;
	tp->tf_rip = mcp->mc_rip;
	tp->tf_rflags = rflags;
	tp->tf_rsp = mcp->mc_rsp;
	tp->tf_ss = mcp->mc_ss;
	tp->tf_flags = mcp->mc_flags;
	if (tp->tf_flags & TF_HASSEGS) {
		tp->tf_ds = mcp->mc_ds;
		tp->tf_es = mcp->mc_es;
		tp->tf_fs = mcp->mc_fs;
		tp->tf_gs = mcp->mc_gs;
	}
	/* Force the full iret path so the new frame is honored. */
	set_pcb_flags(pcb, PCB_FULL_IRET);
	if (mcp->mc_flags & _MC_HASBASES) {
		pcb->pcb_fsbase = mcp->mc_fsbase;
		pcb->pcb_gsbase = mcp->mc_gsbase;
	}
	return (0);
}
 2292 
 2293 static void
 2294 get_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpusave,
 2295     size_t xfpusave_len)
 2296 {
 2297         size_t max_len, len;
 2298 
 2299         mcp->mc_ownedfp = fpugetregs(td);
 2300         bcopy(get_pcb_user_save_td(td), &mcp->mc_fpstate[0],
 2301             sizeof(mcp->mc_fpstate));
 2302         mcp->mc_fpformat = fpuformat();
 2303         if (!use_xsave || xfpusave_len == 0)
 2304                 return;
 2305         max_len = cpu_max_ext_state_size - sizeof(struct savefpu);
 2306         len = xfpusave_len;
 2307         if (len > max_len) {
 2308                 len = max_len;
 2309                 bzero(xfpusave + max_len, len - max_len);
 2310         }
 2311         mcp->mc_flags |= _MC_HASFPXSTATE;
 2312         mcp->mc_xfpustate_len = len;
 2313         bcopy(get_pcb_user_save_td(td) + 1, xfpusave, len);
 2314 }
 2315 
/*
 * Install FPU state from mcp (plus optional xsave extension area in
 * xfpustate) into the thread.  _MC_FPFMT_NODEV means no state to
 * restore; only the XMM (fxsave) format is accepted otherwise.
 * Returns 0, EINVAL on bad format/ownership, or an fpusetregs error.
 */
static int
set_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpustate,
    size_t xfpustate_len)
{
	int error;

	if (mcp->mc_fpformat == _MC_FPFMT_NODEV)
		return (0);
	else if (mcp->mc_fpformat != _MC_FPFMT_XMM)
		return (EINVAL);
	else if (mcp->mc_ownedfp == _MC_FPOWNED_NONE) {
		/* We don't care what state is left in the FPU or PCB. */
		fpstate_drop(td);
		error = 0;
	} else if (mcp->mc_ownedfp == _MC_FPOWNED_FPU ||
	    mcp->mc_ownedfp == _MC_FPOWNED_PCB) {
		error = fpusetregs(td, (struct savefpu *)&mcp->mc_fpstate,
		    xfpustate, xfpustate_len);
	} else
		return (EINVAL);
	return (error);
}
 2338 
/*
 * Discard the thread's user FPU state: release the FPU if this thread
 * owns it, then clear the init-done flags so the state is treated as
 * uninitialized.  The thread must own the user FPU (KASSERT).
 */
void
fpstate_drop(struct thread *td)
{

	KASSERT(PCB_USER_FPU(td->td_pcb), ("fpstate_drop: kernel-owned fpu"));
	critical_enter();
	if (PCPU_GET(fpcurthread) == td)
		fpudrop();
	/*
	 * XXX force a full drop of the fpu.  The above only drops it if we
	 * owned it.
	 *
	 * XXX I don't much like fpugetuserregs()'s semantics of doing a full
	 * drop.  Dropping only to the pcb matches fnsave's behaviour.
	 * We only need to drop to !PCB_INITDONE in sendsig().  But
	 * sendsig() is the only caller of fpugetuserregs()... perhaps we just
	 * have too many layers.
	 */
	clear_pcb_flags(curthread->td_pcb,
	    PCB_FPUINITDONE | PCB_USERFPUINITDONE);
	critical_exit();
}
 2361 
 2362 int
 2363 fill_dbregs(struct thread *td, struct dbreg *dbregs)
 2364 {
 2365         struct pcb *pcb;
 2366 
 2367         if (td == NULL) {
 2368                 dbregs->dr[0] = rdr0();
 2369                 dbregs->dr[1] = rdr1();
 2370                 dbregs->dr[2] = rdr2();
 2371                 dbregs->dr[3] = rdr3();
 2372                 dbregs->dr[6] = rdr6();
 2373                 dbregs->dr[7] = rdr7();
 2374         } else {
 2375                 pcb = td->td_pcb;
 2376                 dbregs->dr[0] = pcb->pcb_dr0;
 2377                 dbregs->dr[1] = pcb->pcb_dr1;
 2378                 dbregs->dr[2] = pcb->pcb_dr2;
 2379                 dbregs->dr[3] = pcb->pcb_dr3;
 2380                 dbregs->dr[6] = pcb->pcb_dr6;
 2381                 dbregs->dr[7] = pcb->pcb_dr7;
 2382         }
 2383         dbregs->dr[4] = 0;
 2384         dbregs->dr[5] = 0;
 2385         dbregs->dr[8] = 0;
 2386         dbregs->dr[9] = 0;
 2387         dbregs->dr[10] = 0;
 2388         dbregs->dr[11] = 0;
 2389         dbregs->dr[12] = 0;
 2390         dbregs->dr[13] = 0;
 2391         dbregs->dr[14] = 0;
 2392         dbregs->dr[15] = 0;
 2393         return (0);
 2394 }
 2395 
 2396 int
 2397 set_dbregs(struct thread *td, struct dbreg *dbregs)
 2398 {
 2399         struct pcb *pcb;
 2400         int i;
 2401 
 2402         if (td == NULL) {
 2403                 load_dr0(dbregs->dr[0]);
 2404                 load_dr1(dbregs->dr[1]);
 2405                 load_dr2(dbregs->dr[2]);
 2406                 load_dr3(dbregs->dr[3]);
 2407                 load_dr6(dbregs->dr[6]);
 2408                 load_dr7(dbregs->dr[7]);
 2409         } else {
 2410                 /*
 2411                  * Don't let an illegal value for dr7 get set.  Specifically,
 2412                  * check for undefined settings.  Setting these bit patterns
 2413                  * result in undefined behaviour and can lead to an unexpected
 2414                  * TRCTRAP or a general protection fault right here.
 2415                  * Upper bits of dr6 and dr7 must not be set
 2416                  */
 2417                 for (i = 0; i < 4; i++) {
 2418                         if (DBREG_DR7_ACCESS(dbregs->dr[7], i) == 0x02)
 2419                                 return (EINVAL);
 2420                         if (td->td_frame->tf_cs == _ucode32sel &&
 2421                             DBREG_DR7_LEN(dbregs->dr[7], i) == DBREG_DR7_LEN_8)
 2422                                 return (EINVAL);
 2423                 }
 2424                 if ((dbregs->dr[6] & 0xffffffff00000000ul) != 0 ||
 2425                     (dbregs->dr[7] & 0xffffffff00000000ul) != 0)
 2426                         return (EINVAL);
 2427 
 2428                 pcb = td->td_pcb;
 2429 
 2430                 /*
 2431                  * Don't let a process set a breakpoint that is not within the
 2432                  * process's address space.  If a process could do this, it
 2433                  * could halt the system by setting a breakpoint in the kernel
 2434                  * (if ddb was enabled).  Thus, we need to check to make sure
 2435                  * that no breakpoints are being enabled for addresses outside
 2436                  * process's address space.
 2437                  *
 2438                  * XXX - what about when the watched area of the user's
 2439                  * address space is written into from within the kernel
 2440                  * ... wouldn't that still cause a breakpoint to be generated
 2441                  * from within kernel mode?
 2442                  */
 2443 
 2444                 if (DBREG_DR7_ENABLED(dbregs->dr[7], 0)) {
 2445                         /* dr0 is enabled */
 2446                         if (dbregs->dr[0] >= VM_MAXUSER_ADDRESS)
 2447                                 return (EINVAL);
 2448                 }
 2449                 if (DBREG_DR7_ENABLED(dbregs->dr[7], 1)) {
 2450                         /* dr1 is enabled */
 2451                         if (dbregs->dr[1] >= VM_MAXUSER_ADDRESS)
 2452                                 return (EINVAL);
 2453                 }
 2454                 if (DBREG_DR7_ENABLED(dbregs->dr[7], 2)) {
 2455                         /* dr2 is enabled */
 2456                         if (dbregs->dr[2] >= VM_MAXUSER_ADDRESS)
 2457                                 return (EINVAL);
 2458                 }
 2459                 if (DBREG_DR7_ENABLED(dbregs->dr[7], 3)) {
 2460                         /* dr3 is enabled */
 2461                         if (dbregs->dr[3] >= VM_MAXUSER_ADDRESS)
 2462                                 return (EINVAL);
 2463                 }
 2464 
 2465                 pcb->pcb_dr0 = dbregs->dr[0];
 2466                 pcb->pcb_dr1 = dbregs->dr[1];
 2467                 pcb->pcb_dr2 = dbregs->dr[2];
 2468                 pcb->pcb_dr3 = dbregs->dr[3];
 2469                 pcb->pcb_dr6 = dbregs->dr[6];
 2470                 pcb->pcb_dr7 = dbregs->dr[7];
 2471 
 2472                 set_pcb_flags(pcb, PCB_DBREGS);
 2473         }
 2474 
 2475         return (0);
 2476 }
 2477 
 2478 void
 2479 reset_dbregs(void)
 2480 {
 2481 
 2482         load_dr7(0);    /* Turn off the control bits first */
 2483         load_dr0(0);
 2484         load_dr1(0);
 2485         load_dr2(0);
 2486         load_dr3(0);
 2487         load_dr6(0);
 2488 }
 2489 
 2490 /*
 2491  * Return > 0 if a hardware breakpoint has been hit, and the
 2492  * breakpoint was in user space.  Return 0, otherwise.
 2493  */
 2494 int
 2495 user_dbreg_trap(void)
 2496 {
 2497         u_int64_t dr7, dr6; /* debug registers dr6 and dr7 */
 2498         u_int64_t bp;       /* breakpoint bits extracted from dr6 */
 2499         int nbp;            /* number of breakpoints that triggered */
 2500         caddr_t addr[4];    /* breakpoint addresses */
 2501         int i;
 2502         
 2503         dr7 = rdr7();
 2504         if ((dr7 & 0x000000ff) == 0) {
 2505                 /*
 2506                  * all GE and LE bits in the dr7 register are zero,
 2507                  * thus the trap couldn't have been caused by the
 2508                  * hardware debug registers
 2509                  */
 2510                 return 0;
 2511         }
 2512 
 2513         nbp = 0;
 2514         dr6 = rdr6();
 2515         bp = dr6 & 0x0000000f;
 2516 
 2517         if (!bp) {
 2518                 /*
 2519                  * None of the breakpoint bits are set meaning this
 2520                  * trap was not caused by any of the debug registers
 2521                  */
 2522                 return 0;
 2523         }
 2524 
 2525         /*
 2526          * at least one of the breakpoints were hit, check to see
 2527          * which ones and if any of them are user space addresses
 2528          */
 2529 
 2530         if (bp & 0x01) {
 2531                 addr[nbp++] = (caddr_t)rdr0();
 2532         }
 2533         if (bp & 0x02) {
 2534                 addr[nbp++] = (caddr_t)rdr1();
 2535         }
 2536         if (bp & 0x04) {
 2537                 addr[nbp++] = (caddr_t)rdr2();
 2538         }
 2539         if (bp & 0x08) {
 2540                 addr[nbp++] = (caddr_t)rdr3();
 2541         }
 2542 
 2543         for (i = 0; i < nbp; i++) {
 2544                 if (addr[i] < (caddr_t)VM_MAXUSER_ADDRESS) {
 2545                         /*
 2546                          * addr[i] is in user space
 2547                          */
 2548                         return nbp;
 2549                 }
 2550         }
 2551 
 2552         /*
 2553          * None of the breakpoints are in user space.
 2554          */
 2555         return 0;
 2556 }
 2557 
 2558 /*
 2559  * The pcb_flags is only modified by current thread, or by other threads
 2560  * when current thread is stopped.  However, current thread may change it
 2561  * from the interrupt context in cpu_switch(), or in the trap handler.
 2562  * When we read-modify-write pcb_flags from C sources, compiler may generate
 2563  * code that is not atomic regarding the interrupt handler.  If a trap or
 2564  * interrupt happens and any flag is modified from the handler, it can be
 2565  * clobbered with the cached value later.  Therefore, we implement setting
 2566  * and clearing flags with single-instruction functions, which do not race
 2567  * with possible modification of the flags from the trap or interrupt context,
 2568  * because traps and interrupts are executed only on instruction boundary.
 2569  */
void
set_pcb_flags_raw(struct pcb *pcb, const u_int flags)
{

        /*
         * OR the flags in with a single read-modify-write instruction
         * so the update cannot be torn apart by a trap or interrupt
         * handler that also touches pcb_flags (see the comment above).
         */
        __asm __volatile("orl %1,%0"
            : "=m" (pcb->pcb_flags) : "ir" (flags), "m" (pcb->pcb_flags)
            : "cc", "memory");

}
 2579 
 2580 /*
 2581  * The support for RDFSBASE, WRFSBASE and similar instructions for %gs
 2582  * base requires that kernel saves MSR_FSBASE and MSR_{K,}GSBASE into
 2583  * pcb if user space modified the bases.  We must save on the context
 2584  * switch or if the return to usermode happens through the doreti.
 2585  *
 2586  * Tracking of both events is performed by the pcb flag PCB_FULL_IRET,
 2587  * which have a consequence that the base MSRs must be saved each time
 2588  * the PCB_FULL_IRET flag is set.  We disable interrupts to sync with
 2589  * context switches.
 2590  */
 2591 void
 2592 set_pcb_flags(struct pcb *pcb, const u_int flags)
 2593 {
 2594         register_t r;
 2595 
 2596         if (curpcb == pcb &&
 2597             (flags & PCB_FULL_IRET) != 0 &&
 2598             (pcb->pcb_flags & PCB_FULL_IRET) == 0 &&
 2599             (cpu_stdext_feature & CPUID_STDEXT_FSGSBASE) != 0) {
 2600                 r = intr_disable();
 2601                 if ((pcb->pcb_flags & PCB_FULL_IRET) == 0) {
 2602                         if (rfs() == _ufssel)
 2603                                 pcb->pcb_fsbase = rdfsbase();
 2604                         if (rgs() == _ugssel)
 2605                                 pcb->pcb_gsbase = rdmsr(MSR_KGSBASE);
 2606                 }
 2607                 set_pcb_flags_raw(pcb, flags);
 2608                 intr_restore(r);
 2609         } else {
 2610                 set_pcb_flags_raw(pcb, flags);
 2611         }
 2612 }
 2613 
/*
 * Clear the given pcb flags with a single read-modify-write instruction
 * so the update cannot race with flag modifications made from trap or
 * interrupt context (see the comment above set_pcb_flags_raw()).
 */
void
clear_pcb_flags(struct pcb *pcb, const u_int flags)
{

        __asm __volatile("andl %1,%0"
            : "=m" (pcb->pcb_flags) : "ir" (~flags), "m" (pcb->pcb_flags)
            : "cc", "memory");
}
 2622 
 2623 #ifdef KDB
 2624 
 2625 /*
 2626  * Provide inb() and outb() as functions.  They are normally only available as
 2627  * inline functions, thus cannot be called from the debugger.
 2628  */
 2629 
 2630 /* silence compiler warnings */
 2631 u_char inb_(u_short);
 2632 void outb_(u_short, u_char);
 2633 
 2634 u_char
 2635 inb_(u_short port)
 2636 {
 2637         return inb(port);
 2638 }
 2639 
 2640 void
 2641 outb_(u_short port, u_char data)
 2642 {
 2643         outb(port, data);
 2644 }
 2645 
 2646 #endif /* KDB */

Cache object: 5029c98b02dc472c115295764a378a47


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.