FreeBSD/Linux Kernel Cross Reference
sys/amd64/amd64/machdep.c


    1 /*-
    2  * Copyright (c) 2003 Peter Wemm.
    3  * Copyright (c) 1992 Terrence R. Lambert.
    4  * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
    5  * All rights reserved.
    6  *
    7  * This code is derived from software contributed to Berkeley by
    8  * William Jolitz.
    9  *
   10  * Redistribution and use in source and binary forms, with or without
   11  * modification, are permitted provided that the following conditions
   12  * are met:
   13  * 1. Redistributions of source code must retain the above copyright
   14  *    notice, this list of conditions and the following disclaimer.
   15  * 2. Redistributions in binary form must reproduce the above copyright
   16  *    notice, this list of conditions and the following disclaimer in the
   17  *    documentation and/or other materials provided with the distribution.
   18  * 3. All advertising materials mentioning features or use of this software
   19  *    must display the following acknowledgement:
   20  *      This product includes software developed by the University of
   21  *      California, Berkeley and its contributors.
   22  * 4. Neither the name of the University nor the names of its contributors
   23  *    may be used to endorse or promote products derived from this software
   24  *    without specific prior written permission.
   25  *
   26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   36  * SUCH DAMAGE.
   37  *
   38  *      from: @(#)machdep.c     7.4 (Berkeley) 6/3/91
   39  */
   40 
   41 #include <sys/cdefs.h>
   42 __FBSDID("$FreeBSD: stable/11/sys/amd64/amd64/machdep.c 354764 2019-11-16 00:52:04Z scottl $");
   43 
   44 #include "opt_atpic.h"
   45 #include "opt_compat.h"
   46 #include "opt_cpu.h"
   47 #include "opt_ddb.h"
   48 #include "opt_inet.h"
   49 #include "opt_isa.h"
   50 #include "opt_kstack_pages.h"
   51 #include "opt_maxmem.h"
   52 #include "opt_mp_watchdog.h"
   53 #include "opt_perfmon.h"
   54 #include "opt_platform.h"
   55 #include "opt_sched.h"
   56 
   57 #include <sys/param.h>
   58 #include <sys/proc.h>
   59 #include <sys/systm.h>
   60 #include <sys/bio.h>
   61 #include <sys/buf.h>
   62 #include <sys/bus.h>
   63 #include <sys/callout.h>
   64 #include <sys/cons.h>
   65 #include <sys/cpu.h>
   66 #include <sys/efi.h>
   67 #include <sys/eventhandler.h>
   68 #include <sys/exec.h>
   69 #include <sys/imgact.h>
   70 #include <sys/kdb.h>
   71 #include <sys/kernel.h>
   72 #include <sys/ktr.h>
   73 #include <sys/linker.h>
   74 #include <sys/lock.h>
   75 #include <sys/malloc.h>
   76 #include <sys/memrange.h>
   77 #include <sys/msgbuf.h>
   78 #include <sys/mutex.h>
   79 #include <sys/pcpu.h>
   80 #include <sys/ptrace.h>
   81 #include <sys/reboot.h>
   82 #include <sys/rwlock.h>
   83 #include <sys/sched.h>
   84 #include <sys/signalvar.h>
   85 #ifdef SMP
   86 #include <sys/smp.h>
   87 #endif
   88 #include <sys/syscallsubr.h>
   89 #include <sys/sysctl.h>
   90 #include <sys/sysent.h>
   91 #include <sys/sysproto.h>
   92 #include <sys/ucontext.h>
   93 #include <sys/vmmeter.h>
   94 
   95 #include <vm/vm.h>
   96 #include <vm/vm_extern.h>
   97 #include <vm/vm_kern.h>
   98 #include <vm/vm_page.h>
   99 #include <vm/vm_map.h>
  100 #include <vm/vm_object.h>
  101 #include <vm/vm_pager.h>
  102 #include <vm/vm_param.h>
  103 #include <vm/vm_phys.h>
  104 
  105 #ifdef DDB
  106 #ifndef KDB
  107 #error KDB must be enabled in order for DDB to work!
  108 #endif
  109 #include <ddb/ddb.h>
  110 #include <ddb/db_sym.h>
  111 #endif
  112 
  113 #include <net/netisr.h>
  114 
  115 #include <machine/clock.h>
  116 #include <machine/cpu.h>
  117 #include <machine/cputypes.h>
  118 #include <machine/frame.h>
  119 #include <machine/intr_machdep.h>
  120 #include <x86/mca.h>
  121 #include <machine/md_var.h>
  122 #include <machine/metadata.h>
  123 #include <machine/mp_watchdog.h>
  124 #include <machine/pc/bios.h>
  125 #include <machine/pcb.h>
  126 #include <machine/proc.h>
  127 #include <machine/reg.h>
  128 #include <machine/sigframe.h>
  129 #include <machine/specialreg.h>
  130 #ifdef PERFMON
  131 #include <machine/perfmon.h>
  132 #endif
  133 #include <machine/tss.h>
  134 #include <x86/ucode.h>
  135 #ifdef SMP
  136 #include <machine/smp.h>
  137 #endif
  138 #ifdef FDT
  139 #include <x86/fdt.h>
  140 #endif
  141 
  142 #ifdef DEV_ATPIC
  143 #include <x86/isa/icu.h>
  144 #else
  145 #include <x86/apicvar.h>
  146 #endif
  147 
  148 #include <isa/isareg.h>
  149 #include <isa/rtc.h>
  150 #include <x86/init.h>
  151 
  152 /* Sanity check for __curthread() */
  153 CTASSERT(offsetof(struct pcpu, pc_curthread) == 0);
  154 
  155 /*
  156  * The PTI trampoline stack needs enough space for a hardware trapframe and a
  157  * couple of scratch registers, as well as the trapframe left behind after an
  158  * iret fault.
  159  */
  160 CTASSERT(PC_PTI_STACK_SZ * sizeof(register_t) >= 2 * sizeof(struct pti_frame) -
  161     offsetof(struct pti_frame, pti_rip));
  162 
  163 extern u_int64_t hammer_time(u_int64_t, u_int64_t);
  164 
  165 #define CS_SECURE(cs)           (ISPL(cs) == SEL_UPL)
  166 #define EFL_SECURE(ef, oef)     ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
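
/*
 * Editor's gloss: CS_SECURE() accepts a code selector only when its
 * privilege level is user (SEL_UPL), and EFL_SECURE() accepts a new
 * rflags value only when every bit that differs from the old value
 * lies within PSL_USERCHANGE, the set of flags a user may legally
 * modify.  For example, a sigreturn() attempt to set PSL_IOPL flips a
 * bit outside PSL_USERCHANGE, so the XOR/mask test is nonzero and the
 * call is rejected below.
 */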
  167 
  168 static void cpu_startup(void *);
  169 static void get_fpcontext(struct thread *td, mcontext_t *mcp,
  170     char *xfpusave, size_t xfpusave_len);
  171 static int  set_fpcontext(struct thread *td, mcontext_t *mcp,
  172     char *xfpustate, size_t xfpustate_len);
  173 SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL);
  174 
  175 /* Preload data parse function */
  176 static caddr_t native_parse_preload_data(u_int64_t);
  177 
  178 /* Native function to fetch and parse the e820 map */
  179 static void native_parse_memmap(caddr_t, vm_paddr_t *, int *);
  180 
  181 /* Default init_ops implementation. */
  182 struct init_ops init_ops = {
  183         .parse_preload_data =   native_parse_preload_data,
  184         .early_clock_source_init =      i8254_init,
  185         .early_delay =                  i8254_delay,
  186         .parse_memmap =                 native_parse_memmap,
  187 #ifdef SMP
  188         .mp_bootaddress =               mp_bootaddress,
  189         .start_all_aps =                native_start_all_aps,
  190 #endif
  191         .msi_init =                     msi_init,
  192 };
  193 
  194 struct msgbuf *msgbufp;
  195 
  196 /*
  197  * Physical address of the EFI System Table. Stashed from the metadata hints
  198  * passed into the kernel and used by the EFI code to call runtime services.
  199  */
  200 vm_paddr_t efi_systbl_phys;
  201 
  202 /* Intel ICH registers */
  203 #define ICH_PMBASE      0x400
   204 #define ICH_SMI_EN      (ICH_PMBASE + 0x30)
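
/*
 * Editor's note (assumption flagged): judging from its use in
 * cpu_startup() below, bit 3 (0x8) of SMI_EN is the LEGACY_USB_EN
 * enable; clearing it stops the legacy USB circuit from raising SMI#.
 * ICH_PMBASE is a hardcoded power-management I/O base rather than one
 * probed from the chipset.
 */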
  205 
  206 int     _udatasel, _ucodesel, _ucode32sel, _ufssel, _ugssel;
  207 
  208 int cold = 1;
  209 
  210 long Maxmem = 0;
  211 long realmem = 0;
  212 
  213 /*
  214  * The number of PHYSMAP entries must be one less than the number of
  215  * PHYSSEG entries because the PHYSMAP entry that spans the largest
  216  * physical address that is accessible by ISA DMA is split into two
  217  * PHYSSEG entries.
  218  */
  219 #define PHYSMAP_SIZE    (2 * (VM_PHYSSEG_MAX - 1))
  220 
  221 vm_paddr_t phys_avail[PHYSMAP_SIZE + 2];
  222 vm_paddr_t dump_avail[PHYSMAP_SIZE + 2];
  223 
   224 /* must be 2 less so that a 0/0 pair can signal the end of the chunks */
  225 #define PHYS_AVAIL_ARRAY_END (nitems(phys_avail) - 2)
  226 #define DUMP_AVAIL_ARRAY_END (nitems(dump_avail) - 2)
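
/*
 * Editor's worked example (illustrative numbers): if VM_PHYSSEG_MAX
 * were 63, PHYSMAP_SIZE would be 2 * 62 = 124 entries (62 base/bound
 * pairs) and phys_avail[]/dump_avail[] would hold 126 entries each.
 * The *_ARRAY_END constants then stop scans two entries early so a
 * terminating 0/0 pair always fits.
 */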
  227 
  228 struct kva_md_info kmi;
  229 
  230 static struct trapframe proc0_tf;
  231 struct region_descriptor r_gdt, r_idt;
  232 
  233 struct pcpu __pcpu[MAXCPU];
  234 
  235 struct mtx icu_lock;
  236 
  237 struct mem_range_softc mem_range_softc;
  238 
  239 struct mtx dt_lock;     /* lock for GDT and LDT */
  240 
  241 void (*vmm_resume_p)(void);
  242 
  243 static void
  244 cpu_startup(dummy)
  245         void *dummy;
  246 {
  247         uintmax_t memsize;
  248         char *sysenv;
  249 
   250         /*
   251          * On MacBooks, we need to prevent the legacy USB circuit from
   252          * generating an SMI#, because this can cause several problems,
   253          * namely: incorrect CPU frequency detection and failure to
   254          * start the APs.
   255          * We do this by disabling a bit in the SMI_EN (SMI Control and
   256          * Enable) register of the Intel ICH LPC Interface Bridge.
   257          */
  258         sysenv = kern_getenv("smbios.system.product");
  259         if (sysenv != NULL) {
  260                 if (strncmp(sysenv, "MacBook1,1", 10) == 0 ||
  261                     strncmp(sysenv, "MacBook3,1", 10) == 0 ||
  262                     strncmp(sysenv, "MacBook4,1", 10) == 0 ||
  263                     strncmp(sysenv, "MacBookPro1,1", 13) == 0 ||
  264                     strncmp(sysenv, "MacBookPro1,2", 13) == 0 ||
  265                     strncmp(sysenv, "MacBookPro3,1", 13) == 0 ||
  266                     strncmp(sysenv, "MacBookPro4,1", 13) == 0 ||
  267                     strncmp(sysenv, "Macmini1,1", 10) == 0) {
  268                         if (bootverbose)
  269                                 printf("Disabling LEGACY_USB_EN bit on "
  270                                     "Intel ICH.\n");
  271                         outl(ICH_SMI_EN, inl(ICH_SMI_EN) & ~0x8);
  272                 }
  273                 freeenv(sysenv);
  274         }
  275 
  276         /*
  277          * Good {morning,afternoon,evening,night}.
  278          */
  279         startrtclock();
  280         printcpuinfo();
  281 #ifdef PERFMON
  282         perfmon_init();
  283 #endif
  284 
  285         /*
  286          * Display physical memory if SMBIOS reports reasonable amount.
  287          */
  288         memsize = 0;
  289         sysenv = kern_getenv("smbios.memory.enabled");
  290         if (sysenv != NULL) {
  291                 memsize = (uintmax_t)strtoul(sysenv, (char **)NULL, 10) << 10;
  292                 freeenv(sysenv);
  293         }
  294         if (memsize < ptoa((uintmax_t)vm_cnt.v_free_count))
  295                 memsize = ptoa((uintmax_t)Maxmem);
  296         printf("real memory  = %ju (%ju MB)\n", memsize, memsize >> 20);
  297         realmem = atop(memsize);
  298 
  299         /*
  300          * Display any holes after the first chunk of extended memory.
  301          */
  302         if (bootverbose) {
  303                 int indx;
  304 
  305                 printf("Physical memory chunk(s):\n");
  306                 for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) {
  307                         vm_paddr_t size;
  308 
  309                         size = phys_avail[indx + 1] - phys_avail[indx];
  310                         printf(
  311                             "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n",
  312                             (uintmax_t)phys_avail[indx],
  313                             (uintmax_t)phys_avail[indx + 1] - 1,
  314                             (uintmax_t)size, (uintmax_t)size / PAGE_SIZE);
  315                 }
  316         }
  317 
  318         vm_ksubmap_init(&kmi);
  319 
  320         printf("avail memory = %ju (%ju MB)\n",
  321             ptoa((uintmax_t)vm_cnt.v_free_count),
  322             ptoa((uintmax_t)vm_cnt.v_free_count) / 1048576);
  323 
  324         /*
  325          * Set up buffers, so they can be used to read disk labels.
  326          */
  327         bufinit();
  328         vm_pager_bufferinit();
  329 
  330         cpu_setregs();
  331 }
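
/*
 * Editor's note on the unit conversions above: the smbios.memory.enabled
 * value is treated as kilobytes, hence the "<< 10" to bytes; "memsize
 * >> 20" converts bytes to megabytes; and atop()/ptoa() convert between
 * byte quantities and page counts.
 */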
  332 
   333 /*
   334  * Send an interrupt (signal) to a process.
   335  *
   336  * The stack is set up so that the sigcode stored at its top
   337  * calls the handler routine, followed by a call to the
   338  * sigreturn routine below.  After the sigreturn routine has
   339  * reset the signal mask, the stack, and the frame pointer,
   340  * it returns to the pc and psl specified by the user
   341  * context.
   342  */
  343 void
  344 sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
  345 {
  346         struct sigframe sf, *sfp;
  347         struct pcb *pcb;
  348         struct proc *p;
  349         struct thread *td;
  350         struct sigacts *psp;
  351         char *sp;
  352         struct trapframe *regs;
  353         char *xfpusave;
  354         size_t xfpusave_len;
  355         int sig;
  356         int oonstack;
  357 
  358         td = curthread;
  359         pcb = td->td_pcb;
  360         p = td->td_proc;
  361         PROC_LOCK_ASSERT(p, MA_OWNED);
  362         sig = ksi->ksi_signo;
  363         psp = p->p_sigacts;
  364         mtx_assert(&psp->ps_mtx, MA_OWNED);
  365         regs = td->td_frame;
  366         oonstack = sigonstack(regs->tf_rsp);
  367 
  368         if (cpu_max_ext_state_size > sizeof(struct savefpu) && use_xsave) {
  369                 xfpusave_len = cpu_max_ext_state_size - sizeof(struct savefpu);
  370                 xfpusave = __builtin_alloca(xfpusave_len);
  371         } else {
  372                 xfpusave_len = 0;
  373                 xfpusave = NULL;
  374         }
  375 
  376         /* Save user context. */
  377         bzero(&sf, sizeof(sf));
  378         sf.sf_uc.uc_sigmask = *mask;
  379         sf.sf_uc.uc_stack = td->td_sigstk;
  380         sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
  381             ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE;
  382         sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0;
  383         bcopy(regs, &sf.sf_uc.uc_mcontext.mc_rdi, sizeof(*regs));
  384         sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext); /* magic */
  385         get_fpcontext(td, &sf.sf_uc.uc_mcontext, xfpusave, xfpusave_len);
  386         fpstate_drop(td);
  387         update_pcb_bases(pcb);
  388         sf.sf_uc.uc_mcontext.mc_fsbase = pcb->pcb_fsbase;
  389         sf.sf_uc.uc_mcontext.mc_gsbase = pcb->pcb_gsbase;
  390         bzero(sf.sf_uc.uc_mcontext.mc_spare,
  391             sizeof(sf.sf_uc.uc_mcontext.mc_spare));
  392         bzero(sf.sf_uc.__spare__, sizeof(sf.sf_uc.__spare__));
  393 
  394         /* Allocate space for the signal handler context. */
  395         if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack &&
  396             SIGISMEMBER(psp->ps_sigonstack, sig)) {
  397                 sp = (char *)td->td_sigstk.ss_sp + td->td_sigstk.ss_size;
  398 #if defined(COMPAT_43)
  399                 td->td_sigstk.ss_flags |= SS_ONSTACK;
  400 #endif
  401         } else
  402                 sp = (char *)regs->tf_rsp - 128;
  403         if (xfpusave != NULL) {
  404                 sp -= xfpusave_len;
  405                 sp = (char *)((unsigned long)sp & ~0x3Ful);
  406                 sf.sf_uc.uc_mcontext.mc_xfpustate = (register_t)sp;
  407         }
  408         sp -= sizeof(struct sigframe);
  409         /* Align to 16 bytes. */
  410         sfp = (struct sigframe *)((unsigned long)sp & ~0xFul);
  411 
  412         /* Build the argument list for the signal handler. */
  413         regs->tf_rdi = sig;                     /* arg 1 in %rdi */
  414         regs->tf_rdx = (register_t)&sfp->sf_uc; /* arg 3 in %rdx */
  415         bzero(&sf.sf_si, sizeof(sf.sf_si));
  416         if (SIGISMEMBER(psp->ps_siginfo, sig)) {
  417                 /* Signal handler installed with SA_SIGINFO. */
  418                 regs->tf_rsi = (register_t)&sfp->sf_si; /* arg 2 in %rsi */
  419                 sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher;
  420 
  421                 /* Fill in POSIX parts */
  422                 sf.sf_si = ksi->ksi_info;
  423                 sf.sf_si.si_signo = sig; /* maybe a translated signal */
  424                 regs->tf_rcx = (register_t)ksi->ksi_addr; /* arg 4 in %rcx */
  425         } else {
  426                 /* Old FreeBSD-style arguments. */
  427                 regs->tf_rsi = ksi->ksi_code;   /* arg 2 in %rsi */
  428                 regs->tf_rcx = (register_t)ksi->ksi_addr; /* arg 4 in %rcx */
  429                 sf.sf_ahu.sf_handler = catcher;
  430         }
  431         mtx_unlock(&psp->ps_mtx);
  432         PROC_UNLOCK(p);
  433 
  434         /*
  435          * Copy the sigframe out to the user's stack.
  436          */
  437         if (copyout(&sf, sfp, sizeof(*sfp)) != 0 ||
  438             (xfpusave != NULL && copyout(xfpusave,
  439             (void *)sf.sf_uc.uc_mcontext.mc_xfpustate, xfpusave_len)
  440             != 0)) {
  441 #ifdef DEBUG
  442                 printf("process %ld has trashed its stack\n", (long)p->p_pid);
  443 #endif
  444                 PROC_LOCK(p);
  445                 sigexit(td, SIGILL);
  446         }
  447 
  448         regs->tf_rsp = (long)sfp;
  449         regs->tf_rip = p->p_sysent->sv_sigcode_base;
  450         regs->tf_rflags &= ~(PSL_T | PSL_D);
  451         regs->tf_cs = _ucodesel;
  452         regs->tf_ds = _udatasel;
  453         regs->tf_ss = _udatasel;
  454         regs->tf_es = _udatasel;
  455         regs->tf_fs = _ufssel;
  456         regs->tf_gs = _ugssel;
  457         regs->tf_flags = TF_HASSEGS;
  458         PROC_LOCK(p);
  459         mtx_lock(&psp->ps_mtx);
  460 }
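
/*
 * Editor's sketch of the stack carved out by sendsig() above
 * (a minimal illustration, not authoritative documentation):
 *
 *	old %rsp (or alternate-stack top) -------------------------+
 *	128-byte red zone, skipped when staying on the old stack   |
 *	xfpusave area, 64-byte aligned (& ~0x3F), if xsave is used |
 *	struct sigframe, 16-byte aligned (& ~0xF)  <- sfp, new %rsp
 *
 * The handler is then entered with %rdi = sig, %rsi = siginfo or code,
 * %rdx = &sfp->sf_uc and %rcx = fault address, as set up above.
 */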
  461 
   462 /*
   463  * System call to clean up state after a signal
   464  * has been taken.  Reset the signal mask and
   465  * stack state from the context left by sendsig
   466  * (above).  Return to the previous pc and psl as
   467  * specified by that context.  Check carefully to
   468  * make sure that the user has not modified the
   469  * state to gain improper privileges.
   470  *
   471  * MPSAFE
   472  */
  473 int
  474 sys_sigreturn(td, uap)
  475         struct thread *td;
  476         struct sigreturn_args /* {
  477                 const struct __ucontext *sigcntxp;
  478         } */ *uap;
  479 {
  480         ucontext_t uc;
  481         struct pcb *pcb;
  482         struct proc *p;
  483         struct trapframe *regs;
  484         ucontext_t *ucp;
  485         char *xfpustate;
  486         size_t xfpustate_len;
  487         long rflags;
  488         int cs, error, ret;
  489         ksiginfo_t ksi;
  490 
  491         pcb = td->td_pcb;
  492         p = td->td_proc;
  493 
  494         error = copyin(uap->sigcntxp, &uc, sizeof(uc));
  495         if (error != 0) {
  496                 uprintf("pid %d (%s): sigreturn copyin failed\n",
  497                     p->p_pid, td->td_name);
  498                 return (error);
  499         }
  500         ucp = &uc;
  501         if ((ucp->uc_mcontext.mc_flags & ~_MC_FLAG_MASK) != 0) {
  502                 uprintf("pid %d (%s): sigreturn mc_flags %x\n", p->p_pid,
  503                     td->td_name, ucp->uc_mcontext.mc_flags);
  504                 return (EINVAL);
  505         }
  506         regs = td->td_frame;
  507         rflags = ucp->uc_mcontext.mc_rflags;
  508         /*
  509          * Don't allow users to change privileged or reserved flags.
  510          */
  511         if (!EFL_SECURE(rflags, regs->tf_rflags)) {
  512                 uprintf("pid %d (%s): sigreturn rflags = 0x%lx\n", p->p_pid,
  513                     td->td_name, rflags);
  514                 return (EINVAL);
  515         }
  516 
  517         /*
  518          * Don't allow users to load a valid privileged %cs.  Let the
  519          * hardware check for invalid selectors, excess privilege in
  520          * other selectors, invalid %eip's and invalid %esp's.
  521          */
  522         cs = ucp->uc_mcontext.mc_cs;
  523         if (!CS_SECURE(cs)) {
  524                 uprintf("pid %d (%s): sigreturn cs = 0x%x\n", p->p_pid,
  525                     td->td_name, cs);
  526                 ksiginfo_init_trap(&ksi);
  527                 ksi.ksi_signo = SIGBUS;
  528                 ksi.ksi_code = BUS_OBJERR;
  529                 ksi.ksi_trapno = T_PROTFLT;
  530                 ksi.ksi_addr = (void *)regs->tf_rip;
  531                 trapsignal(td, &ksi);
  532                 return (EINVAL);
  533         }
  534 
  535         if ((uc.uc_mcontext.mc_flags & _MC_HASFPXSTATE) != 0) {
  536                 xfpustate_len = uc.uc_mcontext.mc_xfpustate_len;
  537                 if (xfpustate_len > cpu_max_ext_state_size -
  538                     sizeof(struct savefpu)) {
  539                         uprintf("pid %d (%s): sigreturn xfpusave_len = 0x%zx\n",
  540                             p->p_pid, td->td_name, xfpustate_len);
  541                         return (EINVAL);
  542                 }
  543                 xfpustate = __builtin_alloca(xfpustate_len);
  544                 error = copyin((const void *)uc.uc_mcontext.mc_xfpustate,
  545                     xfpustate, xfpustate_len);
  546                 if (error != 0) {
  547                         uprintf(
  548         "pid %d (%s): sigreturn copying xfpustate failed\n",
  549                             p->p_pid, td->td_name);
  550                         return (error);
  551                 }
  552         } else {
  553                 xfpustate = NULL;
  554                 xfpustate_len = 0;
  555         }
  556         ret = set_fpcontext(td, &ucp->uc_mcontext, xfpustate, xfpustate_len);
  557         if (ret != 0) {
  558                 uprintf("pid %d (%s): sigreturn set_fpcontext err %d\n",
  559                     p->p_pid, td->td_name, ret);
  560                 return (ret);
  561         }
  562         bcopy(&ucp->uc_mcontext.mc_rdi, regs, sizeof(*regs));
  563         update_pcb_bases(pcb);
  564         pcb->pcb_fsbase = ucp->uc_mcontext.mc_fsbase;
  565         pcb->pcb_gsbase = ucp->uc_mcontext.mc_gsbase;
  566 
  567 #if defined(COMPAT_43)
  568         if (ucp->uc_mcontext.mc_onstack & 1)
  569                 td->td_sigstk.ss_flags |= SS_ONSTACK;
  570         else
  571                 td->td_sigstk.ss_flags &= ~SS_ONSTACK;
  572 #endif
  573 
  574         kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0);
  575         return (EJUSTRETURN);
  576 }
  577 
  578 #ifdef COMPAT_FREEBSD4
  579 int
  580 freebsd4_sigreturn(struct thread *td, struct freebsd4_sigreturn_args *uap)
  581 {
   582 
   583         return (sys_sigreturn(td, (struct sigreturn_args *)uap));
  584 }
  585 #endif
  586 
  587 /*
  588  * Reset registers to default values on exec.
  589  */
  590 void
  591 exec_setregs(struct thread *td, struct image_params *imgp, u_long stack)
  592 {
  593         struct trapframe *regs;
  594         struct pcb *pcb;
  595         register_t saved_rflags;
  596 
  597         regs = td->td_frame;
  598         pcb = td->td_pcb;
  599 
  600         mtx_lock(&dt_lock);
  601         if (td->td_proc->p_md.md_ldt != NULL)
  602                 user_ldt_free(td);
  603         else
  604                 mtx_unlock(&dt_lock);
  605         
  606         update_pcb_bases(pcb);
  607         pcb->pcb_fsbase = 0;
  608         pcb->pcb_gsbase = 0;
  609         clear_pcb_flags(pcb, PCB_32BIT);
  610         pcb->pcb_initial_fpucw = __INITIAL_FPUCW__;
  611 
  612         saved_rflags = regs->tf_rflags & PSL_T;
  613         bzero((char *)regs, sizeof(struct trapframe));
  614         regs->tf_rip = imgp->entry_addr;
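        /*
         * Editor's note: the next line leaves %rsp congruent to 8
         * modulo 16, i.e. 16-byte aligned after one more 8-byte push,
         * which is the alignment a function body observes immediately
         * after a call instruction.
         */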
  615         regs->tf_rsp = ((stack - 8) & ~0xFul) + 8;
  616         regs->tf_rdi = stack;           /* argv */
  617         regs->tf_rflags = PSL_USER | saved_rflags;
  618         regs->tf_ss = _udatasel;
  619         regs->tf_cs = _ucodesel;
  620         regs->tf_ds = _udatasel;
  621         regs->tf_es = _udatasel;
  622         regs->tf_fs = _ufssel;
  623         regs->tf_gs = _ugssel;
  624         regs->tf_flags = TF_HASSEGS;
  625         td->td_retval[1] = 0;
  626 
  627         /*
  628          * Reset the hardware debug registers if they were in use.
  629          * They won't have any meaning for the newly exec'd process.
  630          */
  631         if (pcb->pcb_flags & PCB_DBREGS) {
  632                 pcb->pcb_dr0 = 0;
  633                 pcb->pcb_dr1 = 0;
  634                 pcb->pcb_dr2 = 0;
  635                 pcb->pcb_dr3 = 0;
  636                 pcb->pcb_dr6 = 0;
  637                 pcb->pcb_dr7 = 0;
  638                 if (pcb == curpcb) {
  639                         /*
  640                          * Clear the debug registers on the running
  641                          * CPU, otherwise they will end up affecting
  642                          * the next process we switch to.
  643                          */
  644                         reset_dbregs();
  645                 }
  646                 clear_pcb_flags(pcb, PCB_DBREGS);
  647         }
  648 
  649         /*
  650          * Drop the FP state if we hold it, so that the process gets a
  651          * clean FP state if it uses the FPU again.
  652          */
  653         fpstate_drop(td);
  654 }
  655 
  656 void
  657 cpu_setregs(void)
  658 {
  659         register_t cr0;
  660 
  661         cr0 = rcr0();
  662         /*
  663          * CR0_MP, CR0_NE and CR0_TS are also set by npx_probe() for the
  664          * BSP.  See the comments there about why we set them.
  665          */
  666         cr0 |= CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM;
  667         load_cr0(cr0);
  668 }
  669 
  670 /*
  671  * Initialize amd64 and configure to run kernel
  672  */
  673 
  674 /*
  675  * Initialize segments & interrupt table
  676  */
  677 
  678 struct user_segment_descriptor gdt[NGDT * MAXCPU];/* global descriptor tables */
  679 static struct gate_descriptor idt0[NIDT];
  680 struct gate_descriptor *idt = &idt0[0]; /* interrupt descriptor table */
  681 
  682 static char dblfault_stack[PAGE_SIZE] __aligned(16);
  683 static char mce0_stack[PAGE_SIZE] __aligned(16);
  684 static char nmi0_stack[PAGE_SIZE] __aligned(16);
  685 static char dbg0_stack[PAGE_SIZE] __aligned(16);
  686 CTASSERT(sizeof(struct nmi_pcpu) == 16);
  687 
  688 struct amd64tss common_tss[MAXCPU];
  689 
  690 /*
  691  * Software prototypes -- in more palatable form.
  692  *
  693  * Keep GUFS32, GUGS32, GUCODE32 and GUDATA at the same
  694  * slots as corresponding segments for i386 kernel.
  695  */
  696 struct soft_segment_descriptor gdt_segs[] = {
  697 /* GNULL_SEL    0 Null Descriptor */
  698 {       .ssd_base = 0x0,
  699         .ssd_limit = 0x0,
  700         .ssd_type = 0,
  701         .ssd_dpl = 0,
  702         .ssd_p = 0,
  703         .ssd_long = 0,
  704         .ssd_def32 = 0,
  705         .ssd_gran = 0           },
  706 /* GNULL2_SEL   1 Null Descriptor */
  707 {       .ssd_base = 0x0,
  708         .ssd_limit = 0x0,
  709         .ssd_type = 0,
  710         .ssd_dpl = 0,
  711         .ssd_p = 0,
  712         .ssd_long = 0,
  713         .ssd_def32 = 0,
  714         .ssd_gran = 0           },
   715 /* GUFS32_SEL   2 32 bit %fs Descriptor for user */
  716 {       .ssd_base = 0x0,
  717         .ssd_limit = 0xfffff,
  718         .ssd_type = SDT_MEMRWA,
  719         .ssd_dpl = SEL_UPL,
  720         .ssd_p = 1,
  721         .ssd_long = 0,
  722         .ssd_def32 = 1,
  723         .ssd_gran = 1           },
   724 /* GUGS32_SEL   3 32 bit %gs Descriptor for user */
  725 {       .ssd_base = 0x0,
  726         .ssd_limit = 0xfffff,
  727         .ssd_type = SDT_MEMRWA,
  728         .ssd_dpl = SEL_UPL,
  729         .ssd_p = 1,
  730         .ssd_long = 0,
  731         .ssd_def32 = 1,
  732         .ssd_gran = 1           },
  733 /* GCODE_SEL    4 Code Descriptor for kernel */
  734 {       .ssd_base = 0x0,
  735         .ssd_limit = 0xfffff,
  736         .ssd_type = SDT_MEMERA,
  737         .ssd_dpl = SEL_KPL,
  738         .ssd_p = 1,
  739         .ssd_long = 1,
  740         .ssd_def32 = 0,
  741         .ssd_gran = 1           },
  742 /* GDATA_SEL    5 Data Descriptor for kernel */
  743 {       .ssd_base = 0x0,
  744         .ssd_limit = 0xfffff,
  745         .ssd_type = SDT_MEMRWA,
  746         .ssd_dpl = SEL_KPL,
  747         .ssd_p = 1,
  748         .ssd_long = 1,
  749         .ssd_def32 = 0,
  750         .ssd_gran = 1           },
  751 /* GUCODE32_SEL 6 32 bit Code Descriptor for user */
  752 {       .ssd_base = 0x0,
  753         .ssd_limit = 0xfffff,
  754         .ssd_type = SDT_MEMERA,
  755         .ssd_dpl = SEL_UPL,
  756         .ssd_p = 1,
  757         .ssd_long = 0,
  758         .ssd_def32 = 1,
  759         .ssd_gran = 1           },
  760 /* GUDATA_SEL   7 32/64 bit Data Descriptor for user */
  761 {       .ssd_base = 0x0,
  762         .ssd_limit = 0xfffff,
  763         .ssd_type = SDT_MEMRWA,
  764         .ssd_dpl = SEL_UPL,
  765         .ssd_p = 1,
  766         .ssd_long = 0,
  767         .ssd_def32 = 1,
  768         .ssd_gran = 1           },
  769 /* GUCODE_SEL   8 64 bit Code Descriptor for user */
  770 {       .ssd_base = 0x0,
  771         .ssd_limit = 0xfffff,
  772         .ssd_type = SDT_MEMERA,
  773         .ssd_dpl = SEL_UPL,
  774         .ssd_p = 1,
  775         .ssd_long = 1,
  776         .ssd_def32 = 0,
  777         .ssd_gran = 1           },
  778 /* GPROC0_SEL   9 Proc 0 Tss Descriptor */
  779 {       .ssd_base = 0x0,
  780         .ssd_limit = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE - 1,
  781         .ssd_type = SDT_SYSTSS,
  782         .ssd_dpl = SEL_KPL,
  783         .ssd_p = 1,
  784         .ssd_long = 0,
  785         .ssd_def32 = 0,
  786         .ssd_gran = 0           },
   787 /* Slot 10: upper half of the TSS, which is a double-size system descriptor */
  788 {       .ssd_base = 0x0,
  789         .ssd_limit = 0x0,
  790         .ssd_type = 0,
  791         .ssd_dpl = 0,
  792         .ssd_p = 0,
  793         .ssd_long = 0,
  794         .ssd_def32 = 0,
  795         .ssd_gran = 0           },
  796 /* GUSERLDT_SEL 11 LDT Descriptor */
  797 {       .ssd_base = 0x0,
  798         .ssd_limit = 0x0,
  799         .ssd_type = 0,
  800         .ssd_dpl = 0,
  801         .ssd_p = 0,
  802         .ssd_long = 0,
  803         .ssd_def32 = 0,
  804         .ssd_gran = 0           },
  805 /* GUSERLDT_SEL 12 LDT Descriptor, double size */
  806 {       .ssd_base = 0x0,
  807         .ssd_limit = 0x0,
  808         .ssd_type = 0,
  809         .ssd_dpl = 0,
  810         .ssd_p = 0,
  811         .ssd_long = 0,
  812         .ssd_def32 = 0,
  813         .ssd_gran = 0           },
  814 };
  815 
  816 void
  817 setidt(int idx, inthand_t *func, int typ, int dpl, int ist)
  818 {
  819         struct gate_descriptor *ip;
  820 
  821         ip = idt + idx;
  822         ip->gd_looffset = (uintptr_t)func;
  823         ip->gd_selector = GSEL(GCODE_SEL, SEL_KPL);
  824         ip->gd_ist = ist;
  825         ip->gd_xx = 0;
  826         ip->gd_type = typ;
  827         ip->gd_dpl = dpl;
  828         ip->gd_p = 1;
  829         ip->gd_hioffset = ((uintptr_t)func)>>16 ;
  830 }
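
/*
 * Editor's usage sketch (illustrative, mirroring the pattern used when
 * this file populates the IDT):
 *
 *	setidt(IDT_DE, &IDTVEC(div), SDT_SYSIGT, SEL_KPL, 0);
 *
 * i.e. vector index, handler from the IDTVEC() list below, a 64-bit
 * interrupt-gate type, the privilege level allowed to invoke it, and
 * an IST stack index (nonzero for handlers such as NMI and double
 * fault that need a known-good stack).
 */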
  831 
  832 extern inthand_t
  833         IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
  834         IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
  835         IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
  836         IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align),
  837         IDTVEC(xmm), IDTVEC(dblfault),
  838         IDTVEC(div_pti), IDTVEC(bpt_pti),
  839         IDTVEC(ofl_pti), IDTVEC(bnd_pti), IDTVEC(ill_pti), IDTVEC(dna_pti),
  840         IDTVEC(fpusegm_pti), IDTVEC(tss_pti), IDTVEC(missing_pti),
  841         IDTVEC(stk_pti), IDTVEC(prot_pti), IDTVEC(page_pti),
  842         IDTVEC(rsvd_pti), IDTVEC(fpu_pti), IDTVEC(align_pti),
  843         IDTVEC(xmm_pti),
  844 #ifdef KDTRACE_HOOKS
  845         IDTVEC(dtrace_ret), IDTVEC(dtrace_ret_pti),
  846 #endif
  847 #ifdef XENHVM
  848         IDTVEC(xen_intr_upcall), IDTVEC(xen_intr_upcall_pti),
  849 #endif
  850         IDTVEC(fast_syscall), IDTVEC(fast_syscall32),
  851         IDTVEC(fast_syscall_pti);
  852 
  853 #ifdef DDB
  854 /*
  855  * Display the index and function name of any IDT entries that don't use
  856  * the default 'rsvd' entry point.
  857  */
  858 DB_SHOW_COMMAND(idt, db_show_idt)
  859 {
  860         struct gate_descriptor *ip;
  861         int idx;
  862         uintptr_t func;
  863 
  864         ip = idt;
  865         for (idx = 0; idx < NIDT && !db_pager_quit; idx++) {
  866                 func = ((long)ip->gd_hioffset << 16 | ip->gd_looffset);
  867                 if (func != (uintptr_t)&IDTVEC(rsvd)) {
  868                         db_printf("%3d\t", idx);
  869                         db_printsym(func, DB_STGY_PROC);
  870                         db_printf("\n");
  871                 }
  872                 ip++;
  873         }
  874 }
  875 
  876 /* Show privileged registers. */
  877 DB_SHOW_COMMAND(sysregs, db_show_sysregs)
  878 {
  879         struct {
  880                 uint16_t limit;
  881                 uint64_t base;
  882         } __packed idtr, gdtr;
  883         uint16_t ldt, tr;
  884 
  885         __asm __volatile("sidt %0" : "=m" (idtr));
  886         db_printf("idtr\t0x%016lx/%04x\n",
  887             (u_long)idtr.base, (u_int)idtr.limit);
  888         __asm __volatile("sgdt %0" : "=m" (gdtr));
  889         db_printf("gdtr\t0x%016lx/%04x\n",
  890             (u_long)gdtr.base, (u_int)gdtr.limit);
  891         __asm __volatile("sldt %0" : "=r" (ldt));
  892         db_printf("ldtr\t0x%04x\n", ldt);
  893         __asm __volatile("str %0" : "=r" (tr));
  894         db_printf("tr\t0x%04x\n", tr);
  895         db_printf("cr0\t0x%016lx\n", rcr0());
  896         db_printf("cr2\t0x%016lx\n", rcr2());
  897         db_printf("cr3\t0x%016lx\n", rcr3());
  898         db_printf("cr4\t0x%016lx\n", rcr4());
  899         if (rcr4() & CR4_XSAVE)
  900                 db_printf("xcr0\t0x%016lx\n", rxcr(0));
  901         db_printf("EFER\t0x%016lx\n", rdmsr(MSR_EFER));
  902         if (cpu_feature2 & (CPUID2_VMX | CPUID2_SMX))
  903                 db_printf("FEATURES_CTL\t%016lx\n",
  904                     rdmsr(MSR_IA32_FEATURE_CONTROL));
  905         db_printf("DEBUG_CTL\t0x%016lx\n", rdmsr(MSR_DEBUGCTLMSR));
  906         db_printf("PAT\t0x%016lx\n", rdmsr(MSR_PAT));
  907         db_printf("GSBASE\t0x%016lx\n", rdmsr(MSR_GSBASE));
  908 }
  909 
  910 DB_SHOW_COMMAND(dbregs, db_show_dbregs)
  911 {
  912 
  913         db_printf("dr0\t0x%016lx\n", rdr0());
  914         db_printf("dr1\t0x%016lx\n", rdr1());
  915         db_printf("dr2\t0x%016lx\n", rdr2());
  916         db_printf("dr3\t0x%016lx\n", rdr3());
  917         db_printf("dr6\t0x%016lx\n", rdr6());
  918         db_printf("dr7\t0x%016lx\n", rdr7());   
  919 }
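
/*
 * Editor's note: from the in-kernel debugger these commands are
 * invoked as "show idt", "show sysregs" and "show dbregs".
 */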
  920 #endif
  921 
  922 void
  923 sdtossd(sd, ssd)
  924         struct user_segment_descriptor *sd;
  925         struct soft_segment_descriptor *ssd;
  926 {
  927 
  928         ssd->ssd_base  = (sd->sd_hibase << 24) | sd->sd_lobase;
  929         ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit;
  930         ssd->ssd_type  = sd->sd_type;
  931         ssd->ssd_dpl   = sd->sd_dpl;
  932         ssd->ssd_p     = sd->sd_p;
  933         ssd->ssd_long  = sd->sd_long;
  934         ssd->ssd_def32 = sd->sd_def32;
  935         ssd->ssd_gran  = sd->sd_gran;
  936 }
  937 
  938 void
  939 ssdtosd(ssd, sd)
  940         struct soft_segment_descriptor *ssd;
  941         struct user_segment_descriptor *sd;
  942 {
  943 
  944         sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
  945         sd->sd_hibase = (ssd->ssd_base >> 24) & 0xff;
  946         sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
  947         sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
  948         sd->sd_type  = ssd->ssd_type;
  949         sd->sd_dpl   = ssd->ssd_dpl;
  950         sd->sd_p     = ssd->ssd_p;
  951         sd->sd_long  = ssd->ssd_long;
  952         sd->sd_def32 = ssd->ssd_def32;
  953         sd->sd_gran  = ssd->ssd_gran;
  954 }
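
/*
 * Editor's worked example of the split encoding above: a soft
 * descriptor with ssd_base = 0x12345678 stores sd_lobase = 0x345678
 * and sd_hibase = 0x12, and a limit of 0xfffff stores sd_lolimit =
 * 0xffff and sd_hilimit = 0xf, matching the hardware descriptor
 * layout.
 */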
  955 
  956 void
  957 ssdtosyssd(ssd, sd)
  958         struct soft_segment_descriptor *ssd;
  959         struct system_segment_descriptor *sd;
  960 {
  961 
  962         sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
  963         sd->sd_hibase = (ssd->ssd_base >> 24) & 0xfffffffffful;
  964         sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
  965         sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
  966         sd->sd_type  = ssd->ssd_type;
  967         sd->sd_dpl   = ssd->ssd_dpl;
  968         sd->sd_p     = ssd->ssd_p;
  969         sd->sd_gran  = ssd->ssd_gran;
  970 }
  971 
  972 #if !defined(DEV_ATPIC) && defined(DEV_ISA)
  973 #include <isa/isavar.h>
  974 #include <isa/isareg.h>
  975 /*
  976  * Return a bitmap of the current interrupt requests.  This is 8259-specific
  977  * and is only suitable for use at probe time.
  978  * This is only here to pacify sio.  It is NOT FATAL if this doesn't work.
  979  * It shouldn't be here.  There should probably be an APIC centric
  980  * implementation in the apic driver code, if at all.
  981  */
  982 intrmask_t
  983 isa_irq_pending(void)
  984 {
  985         u_char irr1;
  986         u_char irr2;
  987 
  988         irr1 = inb(IO_ICU1);
  989         irr2 = inb(IO_ICU2);
  990         return ((irr2 << 8) | irr1);
  991 }
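
/*
 * Editor's note: the low byte is the master 8259's interrupt request
 * register (IRQ 0-7) and the high byte the slave's (IRQ 8-15), so bit
 * N of the result means "IRQ N is currently requesting".
 */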
  992 #endif
  993 
  994 u_int basemem;
  995 
  996 static int
  997 add_physmap_entry(uint64_t base, uint64_t length, vm_paddr_t *physmap,
  998     int *physmap_idxp)
  999 {
 1000         int i, insert_idx, physmap_idx;
 1001 
 1002         physmap_idx = *physmap_idxp;
 1003 
 1004         if (length == 0)
 1005                 return (1);
 1006 
 1007         /*
 1008          * Find insertion point while checking for overlap.  Start off by
 1009          * assuming the new entry will be added to the end.
 1010          *
 1011          * NB: physmap_idx points to the next free slot.
 1012          */
 1013         insert_idx = physmap_idx;
 1014         for (i = 0; i <= physmap_idx; i += 2) {
 1015                 if (base < physmap[i + 1]) {
 1016                         if (base + length <= physmap[i]) {
 1017                                 insert_idx = i;
 1018                                 break;
 1019                         }
 1020                         if (boothowto & RB_VERBOSE)
 1021                                 printf(
 1022                     "Overlapping memory regions, ignoring second region\n");
 1023                         return (1);
 1024                 }
 1025         }
 1026 
 1027         /* See if we can prepend to the next entry. */
 1028         if (insert_idx <= physmap_idx && base + length == physmap[insert_idx]) {
 1029                 physmap[insert_idx] = base;
 1030                 return (1);
 1031         }
 1032 
 1033         /* See if we can append to the previous entry. */
 1034         if (insert_idx > 0 && base == physmap[insert_idx - 1]) {
 1035                 physmap[insert_idx - 1] += length;
 1036                 return (1);
 1037         }
 1038 
 1039         physmap_idx += 2;
 1040         *physmap_idxp = physmap_idx;
 1041         if (physmap_idx == PHYSMAP_SIZE) {
 1042                 printf(
 1043                 "Too many segments in the physical address map, giving up\n");
 1044                 return (0);
 1045         }
 1046 
 1047         /*
 1048          * Move the last 'N' entries down to make room for the new
 1049          * entry if needed.
 1050          */
 1051         for (i = (physmap_idx - 2); i > insert_idx; i -= 2) {
 1052                 physmap[i] = physmap[i - 2];
 1053                 physmap[i + 1] = physmap[i - 1];
 1054         }
 1055 
 1056         /* Insert the new entry. */
 1057         physmap[insert_idx] = base;
 1058         physmap[insert_idx + 1] = base + length;
 1059         return (1);
 1060 }
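
/*
 * Editor's worked example: if physmap holds the single range
 * [0x1000, 0x9f000), adding base 0x9f000/length 0x1000 takes the
 * "append to the previous entry" path and extends the bound to
 * 0xa0000, while adding base 0x0/length 0x1000 takes the "prepend to
 * the next entry" path and lowers that entry's base to 0.  Overlapping
 * additions are ignored with a warning.
 */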
 1061 
 1062 void
 1063 bios_add_smap_entries(struct bios_smap *smapbase, u_int32_t smapsize,
 1064                       vm_paddr_t *physmap, int *physmap_idx)
 1065 {
 1066         struct bios_smap *smap, *smapend;
 1067 
 1068         smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize);
 1069 
 1070         for (smap = smapbase; smap < smapend; smap++) {
 1071                 if (boothowto & RB_VERBOSE)
 1072                         printf("SMAP type=%02x base=%016lx len=%016lx\n",
 1073                             smap->type, smap->base, smap->length);
 1074 
 1075                 if (smap->type != SMAP_TYPE_MEMORY)
 1076                         continue;
 1077 
 1078                 if (!add_physmap_entry(smap->base, smap->length, physmap,
 1079                     physmap_idx))
 1080                         break;
 1081         }
 1082 }
 1083 
 1084 static void
 1085 add_efi_map_entries(struct efi_map_header *efihdr, vm_paddr_t *physmap,
 1086     int *physmap_idx)
 1087 {
 1088         struct efi_md *map, *p;
 1089         const char *type;
 1090         size_t efisz;
 1091         int ndesc, i;
 1092 
 1093         static const char *types[] = {
 1094                 "Reserved",
 1095                 "LoaderCode",
 1096                 "LoaderData",
 1097                 "BootServicesCode",
 1098                 "BootServicesData",
 1099                 "RuntimeServicesCode",
 1100                 "RuntimeServicesData",
 1101                 "ConventionalMemory",
 1102                 "UnusableMemory",
 1103                 "ACPIReclaimMemory",
 1104                 "ACPIMemoryNVS",
 1105                 "MemoryMappedIO",
 1106                 "MemoryMappedIOPortSpace",
 1107                 "PalCode",
 1108                 "PersistentMemory"
 1109         };
 1110 
 1111         /*
 1112          * Memory map data provided by UEFI via the GetMemoryMap
 1113          * Boot Services API.
 1114          */
 1115         efisz = (sizeof(struct efi_map_header) + 0xf) & ~0xf;
 1116         map = (struct efi_md *)((uint8_t *)efihdr + efisz);
 1117 
 1118         if (efihdr->descriptor_size == 0)
 1119                 return;
 1120         ndesc = efihdr->memory_size / efihdr->descriptor_size;
 1121 
 1122         if (boothowto & RB_VERBOSE)
 1123                 printf("%23s %12s %12s %8s %4s\n",
 1124                     "Type", "Physical", "Virtual", "#Pages", "Attr");
 1125 
 1126         for (i = 0, p = map; i < ndesc; i++,
 1127             p = efi_next_descriptor(p, efihdr->descriptor_size)) {
 1128                 if (boothowto & RB_VERBOSE) {
 1129                         if (p->md_type < nitems(types))
 1130                                 type = types[p->md_type];
 1131                         else
 1132                                 type = "<INVALID>";
 1133                         printf("%23s %012lx %12p %08lx ", type, p->md_phys,
 1134                             p->md_virt, p->md_pages);
 1135                         if (p->md_attr & EFI_MD_ATTR_UC)
 1136                                 printf("UC ");
 1137                         if (p->md_attr & EFI_MD_ATTR_WC)
 1138                                 printf("WC ");
 1139                         if (p->md_attr & EFI_MD_ATTR_WT)
 1140                                 printf("WT ");
 1141                         if (p->md_attr & EFI_MD_ATTR_WB)
 1142                                 printf("WB ");
 1143                         if (p->md_attr & EFI_MD_ATTR_UCE)
 1144                                 printf("UCE ");
 1145                         if (p->md_attr & EFI_MD_ATTR_WP)
 1146                                 printf("WP ");
 1147                         if (p->md_attr & EFI_MD_ATTR_RP)
 1148                                 printf("RP ");
 1149                         if (p->md_attr & EFI_MD_ATTR_XP)
 1150                                 printf("XP ");
 1151                         if (p->md_attr & EFI_MD_ATTR_NV)
 1152                                 printf("NV ");
 1153                         if (p->md_attr & EFI_MD_ATTR_MORE_RELIABLE)
 1154                                 printf("MORE_RELIABLE ");
 1155                         if (p->md_attr & EFI_MD_ATTR_RO)
 1156                                 printf("RO ");
 1157                         if (p->md_attr & EFI_MD_ATTR_RT)
 1158                                 printf("RUNTIME");
 1159                         printf("\n");
 1160                 }
 1161 
 1162                 switch (p->md_type) {
 1163                 case EFI_MD_TYPE_CODE:
 1164                 case EFI_MD_TYPE_DATA:
 1165                 case EFI_MD_TYPE_BS_CODE:
 1166                 case EFI_MD_TYPE_BS_DATA:
 1167                 case EFI_MD_TYPE_FREE:
 1168                         /*
 1169                          * We're allowed to use any entry with these types.
 1170                          */
 1171                         break;
 1172                 default:
 1173                         continue;
 1174                 }
 1175 
 1176                 if (!add_physmap_entry(p->md_phys, (p->md_pages * PAGE_SIZE),
 1177                     physmap, physmap_idx))
 1178                         break;
 1179         }
 1180 }
 1181 
 1182 static char bootmethod[16] = "";
 1183 SYSCTL_STRING(_machdep, OID_AUTO, bootmethod, CTLFLAG_RD, bootmethod, 0,
 1184     "System firmware boot method");
 1185 
 1186 static void
 1187 native_parse_memmap(caddr_t kmdp, vm_paddr_t *physmap, int *physmap_idx)
 1188 {
 1189         struct bios_smap *smap;
 1190         struct efi_map_header *efihdr;
 1191         u_int32_t size;
 1192 
 1193         /*
 1194          * Memory map from INT 15:E820.
 1195          *
 1196          * subr_module.c says:
 1197          * "Consumer may safely assume that size value precedes data."
  1198          * i.e. a u_int32_t size immediately precedes the smap data.
 1199          */
 1200 
 1201         efihdr = (struct efi_map_header *)preload_search_info(kmdp,
 1202             MODINFO_METADATA | MODINFOMD_EFI_MAP);
 1203         smap = (struct bios_smap *)preload_search_info(kmdp,
 1204             MODINFO_METADATA | MODINFOMD_SMAP);
 1205         if (efihdr == NULL && smap == NULL)
 1206                 panic("No BIOS smap or EFI map info from loader!");
 1207 
 1208         if (efihdr != NULL) {
 1209                 add_efi_map_entries(efihdr, physmap, physmap_idx);
 1210                 strlcpy(bootmethod, "UEFI", sizeof(bootmethod));
 1211         } else {
 1212                 size = *((u_int32_t *)smap - 1);
 1213                 bios_add_smap_entries(smap, size, physmap, physmap_idx);
 1214                 strlcpy(bootmethod, "BIOS", sizeof(bootmethod));
 1215         }
 1216 }
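
/*
 * Editor's sketch of the loader metadata layout relied on above:
 *
 *	... | u_int32_t size | smap entries (size bytes) | ...
 *	                     ^ preload_search_info() returns here
 *
 * hence the "*((u_int32_t *)smap - 1)" back-reference used to recover
 * the size of the SMAP data.
 */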
 1217 
 1218 #define PAGES_PER_GB    (1024 * 1024 * 1024 / PAGE_SIZE)
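/* Editor's note: with 4 KB pages this evaluates to 262144 pages per GB. */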
 1219 
 1220 /*
 1221  * Populate the (physmap) array with base/bound pairs describing the
 1222  * available physical memory in the system, then test this memory and
 1223  * build the phys_avail array describing the actually-available memory.
 1224  *
 1225  * Total memory size may be set by the kernel environment variable
 1226  * hw.physmem or the compile-time define MAXMEM.
 1227  *
 1228  * XXX first should be vm_paddr_t.
 1229  */
 1230 static void
 1231 getmemsize(caddr_t kmdp, u_int64_t first)
 1232 {
 1233         int i, physmap_idx, pa_indx, da_indx;
 1234         vm_paddr_t pa, physmap[PHYSMAP_SIZE];
 1235         u_long physmem_start, physmem_tunable, memtest;
 1236         pt_entry_t *pte;
 1237         quad_t dcons_addr, dcons_size;
 1238         int page_counter;
 1239 
 1240         /*
 1241          * Tell the physical memory allocator about pages used to store
 1242          * the kernel and preloaded data.  See kmem_bootstrap_free().
 1243          */
 1244         vm_phys_add_seg((vm_paddr_t)kernphys, trunc_page(first));
 1245 
 1246         bzero(physmap, sizeof(physmap));
 1247         physmap_idx = 0;
 1248 
 1249         init_ops.parse_memmap(kmdp, physmap, &physmap_idx);
 1250         physmap_idx -= 2;
 1251 
 1252         /*
 1253          * Find the 'base memory' segment for SMP
 1254          */
 1255         basemem = 0;
 1256         for (i = 0; i <= physmap_idx; i += 2) {
 1257                 if (physmap[i] <= 0xA0000) {
 1258                         basemem = physmap[i + 1] / 1024;
 1259                         break;
 1260                 }
 1261         }
 1262         if (basemem == 0 || basemem > 640) {
 1263                 if (bootverbose)
  1264                         printf(
  1265                 "Memory map doesn't contain a basemem segment, faking it\n");
 1266                 basemem = 640;
 1267         }
 1268 
  1269         /*
  1270          * Make a hole for the "AP -> long mode" bootstrap code.  The
  1271          * mp_bootaddress vector is only available when the kernel is
  1272          * configured to support APs and the system's APs start in
  1273          * 32-bit mode (e.g., bare-metal SMP).
  1274          */
 1275         if (init_ops.mp_bootaddress) {
 1276                 if (physmap[1] >= 0x100000000)
 1277                         panic(
 1278         "Basemem segment is not suitable for AP bootstrap code!");
 1279                 physmap[1] = init_ops.mp_bootaddress(physmap[1] / 1024);
 1280         }
 1281 
 1282         /*
 1283          * Maxmem isn't the "maximum memory", it's one larger than the
 1284          * highest page of the physical address space.  It should be
 1285          * called something like "Maxphyspage".  We may adjust this
 1286          * based on ``hw.physmem'' and the results of the memory test.
 1287          */
 1288         Maxmem = atop(physmap[physmap_idx + 1]);
 1289 
 1290 #ifdef MAXMEM
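        /*
         * Editor's note (assumption based on Maxmem being a page
         * count): option MAXMEM is given in kilobytes, so dividing by
         * 4 converts it to a count of 4 KB pages.
         */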
 1291         Maxmem = MAXMEM / 4;
 1292 #endif
 1293 
 1294         if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable))
 1295                 Maxmem = atop(physmem_tunable);
 1296 
 1297         /*
 1298          * The boot memory test is disabled by default, as it takes a
 1299          * significant amount of time on large-memory systems, and is
 1300          * unfriendly to virtual machines as it unnecessarily touches all
 1301          * pages.
 1302          *
 1303          * A general name is used as the code may be extended to support
 1304          * additional tests beyond the current "page present" test.
 1305          */
 1306         memtest = 0;
 1307         TUNABLE_ULONG_FETCH("hw.memtest.tests", &memtest);
 1308 
 1309         /*
 1310          * Don't allow MAXMEM or hw.physmem to extend the amount of memory
 1311          * in the system.
 1312          */
 1313         if (Maxmem > atop(physmap[physmap_idx + 1]))
 1314                 Maxmem = atop(physmap[physmap_idx + 1]);
 1315 
 1316         if (atop(physmap[physmap_idx + 1]) != Maxmem &&
 1317             (boothowto & RB_VERBOSE))
 1318                 printf("Physical memory use set to %ldK\n", Maxmem * 4);
 1319 
 1320         /* call pmap initialization to make new kernel address space */
 1321         pmap_bootstrap(&first);
 1322 
 1323         /*
 1324          * Size up each available chunk of physical memory.
 1325          *
 1326          * XXX Some BIOSes corrupt low 64KB between suspend and resume.
 1327          * By default, mask off the first 16 pages unless we appear to be
 1328          * running in a VM.
 1329          */
 1330         physmem_start = (vm_guest > VM_GUEST_NO ? 1 : 16) << PAGE_SHIFT;
 1331         TUNABLE_ULONG_FETCH("hw.physmem.start", &physmem_start);
 1332         if (physmap[0] < physmem_start) {
 1333                 if (physmem_start < PAGE_SIZE)
 1334                         physmap[0] = PAGE_SIZE;
 1335                 else if (physmem_start >= physmap[1])
 1336                         physmap[0] = round_page(physmap[1] - PAGE_SIZE);
 1337                 else
 1338                         physmap[0] = round_page(physmem_start);
 1339         }
 1340         pa_indx = 0;
 1341         da_indx = 1;
 1342         phys_avail[pa_indx++] = physmap[0];
 1343         phys_avail[pa_indx] = physmap[0];
 1344         dump_avail[da_indx] = physmap[0];
 1345         pte = CMAP1;
 1346 
 1347         /*
 1348          * Get dcons buffer address
 1349          */
 1350         if (getenv_quad("dcons.addr", &dcons_addr) == 0 ||
 1351             getenv_quad("dcons.size", &dcons_size) == 0)
 1352                 dcons_addr = 0;
 1353 
 1354         /*
 1355          * physmap is in bytes, so when converting to page boundaries,
 1356          * round up the start address and round down the end address.
 1357          */
 1358         page_counter = 0;
 1359         if (memtest != 0)
 1360                 printf("Testing system memory");
 1361         for (i = 0; i <= physmap_idx; i += 2) {
 1362                 vm_paddr_t end;
 1363 
 1364                 end = ptoa((vm_paddr_t)Maxmem);
 1365                 if (physmap[i + 1] < end)
 1366                         end = trunc_page(physmap[i + 1]);
 1367                 for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) {
 1368                         int tmp, page_bad, full;
 1369                         int *ptr = (int *)CADDR1;
 1370 
 1371                         full = FALSE;
 1372                         /*
 1373                          * block out kernel memory as not available.
 1374                          */
 1375                         if (pa >= (vm_paddr_t)kernphys && pa < first)
 1376                                 goto do_dump_avail;
 1377 
 1378                         /*
 1379                          * block out dcons buffer
 1380                          */
 1381                         if (dcons_addr > 0
 1382                             && pa >= trunc_page(dcons_addr)
 1383                             && pa < dcons_addr + dcons_size)
 1384                                 goto do_dump_avail;
 1385 
 1386                         page_bad = FALSE;
 1387                         if (memtest == 0)
 1388                                 goto skip_memtest;
 1389 
 1390                         /*
 1391                          * Print a "." every GB to show we're making
 1392                          * progress.
 1393                          */
 1394                         page_counter++;
 1395                         if ((page_counter % PAGES_PER_GB) == 0)
 1396                                 printf(".");
 1397 
 1398                         /*
 1399                          * map page into kernel: valid, read/write,non-cacheable
 1400                          */
 1401                         *pte = pa | PG_V | PG_RW | PG_NC_PWT | PG_NC_PCD;
 1402                         invltlb();
 1403 
 1404                         tmp = *(int *)ptr;
 1405                         /*
 1406                          * Test for alternating 1's and 0's
 1407                          */
 1408                         *(volatile int *)ptr = 0xaaaaaaaa;
 1409                         if (*(volatile int *)ptr != 0xaaaaaaaa)
 1410                                 page_bad = TRUE;
 1411                         /*
 1412                          * Test for alternating 0's and 1's
 1413                          */
 1414                         *(volatile int *)ptr = 0x55555555;
 1415                         if (*(volatile int *)ptr != 0x55555555)
 1416                                 page_bad = TRUE;
 1417                         /*
 1418                          * Test for all 1's
 1419                          */
 1420                         *(volatile int *)ptr = 0xffffffff;
 1421                         if (*(volatile int *)ptr != 0xffffffff)
 1422                                 page_bad = TRUE;
 1423                         /*
 1424                          * Test for all 0's
 1425                          */
 1426                         *(volatile int *)ptr = 0x0;
 1427                         if (*(volatile int *)ptr != 0x0)
 1428                                 page_bad = TRUE;
 1429                         /*
 1430                          * Restore original value.
 1431                          */
 1432                         *(int *)ptr = tmp;
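                        /*
                         * Editorial note (not in the original source):
                         * the alternating 0xaaaaaaaa / 0x55555555
                         * patterns catch adjacent data lines shorted
                         * together, while the all-ones and all-zeroes
                         * writes catch bits stuck high or low.
                         */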
 1433 
 1434 skip_memtest:
 1435                         /*
 1436                          * Adjust array of valid/good pages.
 1437                          */
 1438                         if (page_bad == TRUE)
 1439                                 continue;
 1440                         /*
 1441                          * If this good page is a continuation of the
 1442                          * previous set of good pages, then just increase
 1443                          * the end pointer. Otherwise start a new chunk.
 1444                          * Note that "end" points one past the last page,
 1445                          * making the range >= start and < end.
 1446                          * If we're also doing a speculative memory
 1447                          * test and we are at or past the end, bump up
 1448                          * Maxmem so that we keep going. The first bad
 1449                          * page will terminate the loop.
 1450                          */
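                        /*
                         * Editorial sketch (not in the original source):
                         * phys_avail[] holds (start, end) pairs, so after
                         * a scan it might look like (addresses invented):
                         *
                         *      phys_avail[0] = 0x0000000000010000  start 0
                         *      phys_avail[1] = 0x000000000009f000  end 0
                         *      phys_avail[2] = 0x0000000000100000  start 1
                         *      ...
                         *
                         * A continuation bumps the current end entry by
                         * PAGE_SIZE; a hole starts a new pair.
                         */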
 1451                         if (phys_avail[pa_indx] == pa) {
 1452                                 phys_avail[pa_indx] += PAGE_SIZE;
 1453                         } else {
 1454                                 pa_indx++;
 1455                                 if (pa_indx == PHYS_AVAIL_ARRAY_END) {
 1456                                         printf(
 1457                 "Too many holes in the physical address space, giving up\n");
 1458                                         pa_indx--;
 1459                                         full = TRUE;
 1460                                         goto do_dump_avail;
 1461                                 }
 1462                                 phys_avail[pa_indx++] = pa;     /* start */
 1463                                 phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */
 1464                         }
 1465                         physmem++;
 1466 do_dump_avail:
 1467                         if (dump_avail[da_indx] == pa) {
 1468                                 dump_avail[da_indx] += PAGE_SIZE;
 1469                         } else {
 1470                                 da_indx++;
 1471                                 if (da_indx == DUMP_AVAIL_ARRAY_END) {
 1472                                         da_indx--;
 1473                                         goto do_next;
 1474                                 }
 1475                                 dump_avail[da_indx++] = pa; /* start */
 1476                                 dump_avail[da_indx] = pa + PAGE_SIZE; /* end */
 1477                         }
 1478 do_next:
 1479                         if (full)
 1480                                 break;
 1481                 }
 1482         }
 1483         *pte = 0;
 1484         invltlb();
 1485         if (memtest != 0)
 1486                 printf("\n");
 1487 
 1488         /*
 1489          * XXX
 1490          * The last chunk must contain at least one page plus the message
 1491          * buffer to avoid complicating other code (message buffer address
 1492          * calculation, etc.).
 1493          */
 1494         while (phys_avail[pa_indx - 1] + PAGE_SIZE +
 1495             round_page(msgbufsize) >= phys_avail[pa_indx]) {
 1496                 physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]);
 1497                 phys_avail[pa_indx--] = 0;
 1498                 phys_avail[pa_indx--] = 0;
 1499         }
 1500 
 1501         Maxmem = atop(phys_avail[pa_indx]);
 1502 
 1503         /* Trim off space for the message buffer. */
 1504         phys_avail[pa_indx] -= round_page(msgbufsize);
 1505 
 1506         /* Map the message buffer. */
 1507         msgbufp = (struct msgbuf *)PHYS_TO_DMAP(phys_avail[pa_indx]);
 1508 }
 1509 
 1510 static caddr_t
 1511 native_parse_preload_data(u_int64_t modulep)
 1512 {
 1513         caddr_t kmdp;
 1514         char *envp;
 1515 #ifdef DDB
 1516         vm_offset_t ksym_start;
 1517         vm_offset_t ksym_end;
 1518 #endif
 1519 
 1520         preload_metadata = (caddr_t)(uintptr_t)(modulep + KERNBASE);
 1521         preload_bootstrap_relocate(KERNBASE);
 1522         kmdp = preload_search_by_type("elf kernel");
 1523         if (kmdp == NULL)
 1524                 kmdp = preload_search_by_type("elf64 kernel");
 1525         boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int);
 1526         envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *);
 1527         if (envp != NULL)
 1528                 envp += KERNBASE;
 1529         init_static_kenv(envp, 0);
 1530 #ifdef DDB
 1531         ksym_start = MD_FETCH(kmdp, MODINFOMD_SSYM, uintptr_t);
 1532         ksym_end = MD_FETCH(kmdp, MODINFOMD_ESYM, uintptr_t);
 1533         db_fetch_ksymtab(ksym_start, ksym_end);
 1534 #endif
 1535         efi_systbl_phys = MD_FETCH(kmdp, MODINFOMD_FW_HANDLE, vm_paddr_t);
 1536 
 1537         return (kmdp);
 1538 }
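/*
 * Editorial note (not in the original source): modulep is a physical
 * address handed over by the loader, and the kernel is mapped at
 * KERNBASE, so adding KERNBASE converts such physical addresses into
 * usable kernel virtual addresses; envp above is relocated the same way.
 */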
 1539 
 1540 static void
 1541 amd64_kdb_init(void)
 1542 {
 1543         kdb_init();
 1544 #ifdef KDB
 1545         if (boothowto & RB_KDB)
 1546                 kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger");
 1547 #endif
 1548 }
 1549 
 1550 /* Set up the fast syscall stuff */
 1551 void
 1552 amd64_conf_fast_syscall(void)
 1553 {
 1554         uint64_t msr;
 1555 
 1556         msr = rdmsr(MSR_EFER) | EFER_SCE;
 1557         wrmsr(MSR_EFER, msr);
 1558         wrmsr(MSR_LSTAR, pti ? (u_int64_t)IDTVEC(fast_syscall_pti) :
 1559             (u_int64_t)IDTVEC(fast_syscall));
 1560         wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32));
 1561         msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
 1562             ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48);
 1563         wrmsr(MSR_STAR, msr);
 1564         wrmsr(MSR_SF_MASK, PSL_NT | PSL_T | PSL_I | PSL_C | PSL_D);
 1565 }
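/*
 * Editorial note (not in the original source): MSR_STAR packs the selector
 * bases used by SYSCALL/SYSRET.  Architecturally,
 *
 *      SYSCALL: CS <- STAR[47:32]       SS <- STAR[47:32] + 8
 *      SYSRET:  CS <- STAR[63:48] + 16  SS <- STAR[63:48] + 8  (64-bit)
 *
 * which is why the kernel code selector is written at bit 32 and the
 * 32-bit user code selector at bit 48 above.  MSR_SF_MASK holds the
 * rflags bits the CPU clears on SYSCALL entry, e.g. PSL_I so the kernel
 * starts with interrupts disabled.
 */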
 1566 
 1567 u_int64_t
 1568 hammer_time(u_int64_t modulep, u_int64_t physfree)
 1569 {
 1570         caddr_t kmdp;
 1571         int gsel_tss, x;
 1572         struct pcpu *pc;
 1573         struct nmi_pcpu *np;
 1574         struct xstate_hdr *xhdr;
 1575         u_int64_t rsp0;
 1576         char *env;
 1577         size_t kstack0_sz;
 1578         int late_console;
 1579 
 1580         kmdp = init_ops.parse_preload_data(modulep);
 1581 
 1582         physfree += ucode_load_bsp(physfree + KERNBASE);
 1583         physfree = roundup2(physfree, PAGE_SIZE);
 1584 
 1585         identify_cpu1();
 1586         identify_hypervisor();
 1587         /*
 1588          * hw.cpu_stdext_disable is ignored by this call; it will be
 1589          * re-evaluated by the call to finishidentcpu() below.
 1590          */
 1591         identify_cpu2();
 1592 
 1593         link_elf_ireloc(kmdp);
 1594 
 1595         /*
 1596          * This may be done better later if it gets more high-level
 1597          * components in it. If so, just link td->td_proc here.
 1598          */
 1599         proc_linkup0(&proc0, &thread0);
 1600 
 1601         /* Init basic tunables, hz, etc. */
 1602         init_param1();
 1603 
 1604         thread0.td_kstack = physfree + KERNBASE;
 1605         thread0.td_kstack_pages = kstack_pages;
 1606         kstack0_sz = thread0.td_kstack_pages * PAGE_SIZE;
 1607         bzero((void *)thread0.td_kstack, kstack0_sz);
 1608         physfree += kstack0_sz;
 1609 
 1610         /*
 1611          * make gdt memory segments
 1612          */
 1613         for (x = 0; x < NGDT; x++) {
 1614                 if (x != GPROC0_SEL && x != (GPROC0_SEL + 1) &&
 1615                     x != GUSERLDT_SEL && x != (GUSERLDT_SEL + 1))
 1616                         ssdtosd(&gdt_segs[x], &gdt[x]);
 1617         }
 1618         gdt_segs[GPROC0_SEL].ssd_base = (uintptr_t)&common_tss[0];
 1619         ssdtosyssd(&gdt_segs[GPROC0_SEL],
 1620             (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);
 1621 
 1622         r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
 1623         r_gdt.rd_base =  (long) gdt;
 1624         lgdt(&r_gdt);
 1625         pc = &__pcpu[0];
 1626 
 1627         wrmsr(MSR_FSBASE, 0);           /* User value */
 1628         wrmsr(MSR_GSBASE, (u_int64_t)pc);
 1629         wrmsr(MSR_KGSBASE, 0);          /* User value while in the kernel */
 1630 
 1631         pcpu_init(pc, 0, sizeof(struct pcpu));
 1632         dpcpu_init((void *)(physfree + KERNBASE), 0);
 1633         physfree += DPCPU_SIZE;
 1634         PCPU_SET(prvspace, pc);
 1635         PCPU_SET(curthread, &thread0);
 1636         /* Non-late cninit() and printf() can be moved up to here. */
 1637         PCPU_SET(tssp, &common_tss[0]);
 1638         PCPU_SET(commontssp, &common_tss[0]);
 1639         PCPU_SET(tss, (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);
 1640         PCPU_SET(ldt, (struct system_segment_descriptor *)&gdt[GUSERLDT_SEL]);
 1641         PCPU_SET(fs32p, &gdt[GUFS32_SEL]);
 1642         PCPU_SET(gs32p, &gdt[GUGS32_SEL]);
 1643 
 1644         /*
 1645          * Initialize mutexes.
 1646          *
 1647          * icu_lock: in order to allow an interrupt to occur in a critical
 1648          *           section, to set pcpu->ipending (etc...) properly, we
 1649          *           must be able to get the icu lock, so it can't be
 1650          *           under witness.
 1651          */
 1652         mutex_init();
 1653         mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS);
 1654         mtx_init(&dt_lock, "descriptor tables", NULL, MTX_DEF);
 1655 
 1656         /* exceptions */
 1657         pti = pti_get_default();
 1658         TUNABLE_INT_FETCH("vm.pmap.pti", &pti);
 1659 
 1660         for (x = 0; x < NIDT; x++)
 1661                 setidt(x, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd), SDT_SYSIGT,
 1662                     SEL_KPL, 0);
 1663         setidt(IDT_DE, pti ? &IDTVEC(div_pti) : &IDTVEC(div), SDT_SYSIGT,
 1664             SEL_KPL, 0);
 1665         setidt(IDT_DB, &IDTVEC(dbg), SDT_SYSIGT, SEL_KPL, 4);
 1666         setidt(IDT_NMI, &IDTVEC(nmi),  SDT_SYSIGT, SEL_KPL, 2);
 1667         setidt(IDT_BP, pti ? &IDTVEC(bpt_pti) : &IDTVEC(bpt), SDT_SYSIGT,
 1668             SEL_UPL, 0);
 1669         setidt(IDT_OF, pti ? &IDTVEC(ofl_pti) : &IDTVEC(ofl), SDT_SYSIGT,
 1670             SEL_UPL, 0);
 1671         setidt(IDT_BR, pti ? &IDTVEC(bnd_pti) : &IDTVEC(bnd), SDT_SYSIGT,
 1672             SEL_KPL, 0);
 1673         setidt(IDT_UD, pti ? &IDTVEC(ill_pti) : &IDTVEC(ill), SDT_SYSIGT,
 1674             SEL_KPL, 0);
 1675         setidt(IDT_NM, pti ? &IDTVEC(dna_pti) : &IDTVEC(dna), SDT_SYSIGT,
 1676             SEL_KPL, 0);
 1677         setidt(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1);
 1678         setidt(IDT_FPUGP, pti ? &IDTVEC(fpusegm_pti) : &IDTVEC(fpusegm),
 1679             SDT_SYSIGT, SEL_KPL, 0);
 1680         setidt(IDT_TS, pti ? &IDTVEC(tss_pti) : &IDTVEC(tss), SDT_SYSIGT,
 1681             SEL_KPL, 0);
 1682         setidt(IDT_NP, pti ? &IDTVEC(missing_pti) : &IDTVEC(missing),
 1683             SDT_SYSIGT, SEL_KPL, 0);
 1684         setidt(IDT_SS, pti ? &IDTVEC(stk_pti) : &IDTVEC(stk), SDT_SYSIGT,
 1685             SEL_KPL, 0);
 1686         setidt(IDT_GP, pti ? &IDTVEC(prot_pti) : &IDTVEC(prot), SDT_SYSIGT,
 1687             SEL_KPL, 0);
 1688         setidt(IDT_PF, pti ? &IDTVEC(page_pti) : &IDTVEC(page), SDT_SYSIGT,
 1689             SEL_KPL, 0);
 1690         setidt(IDT_MF, pti ? &IDTVEC(fpu_pti) : &IDTVEC(fpu), SDT_SYSIGT,
 1691             SEL_KPL, 0);
 1692         setidt(IDT_AC, pti ? &IDTVEC(align_pti) : &IDTVEC(align), SDT_SYSIGT,
 1693             SEL_KPL, 0);
 1694         setidt(IDT_MC, &IDTVEC(mchk), SDT_SYSIGT, SEL_KPL, 3);
 1695         setidt(IDT_XF, pti ? &IDTVEC(xmm_pti) : &IDTVEC(xmm), SDT_SYSIGT,
 1696             SEL_KPL, 0);
 1697 #ifdef KDTRACE_HOOKS
 1698         setidt(IDT_DTRACE_RET, pti ? &IDTVEC(dtrace_ret_pti) :
 1699             &IDTVEC(dtrace_ret), SDT_SYSIGT, SEL_UPL, 0);
 1700 #endif
 1701 #ifdef XENHVM
 1702         setidt(IDT_EVTCHN, pti ? &IDTVEC(xen_intr_upcall_pti) :
 1703             &IDTVEC(xen_intr_upcall), SDT_SYSIGT, SEL_KPL, 0);
 1704 #endif
 1705         r_idt.rd_limit = sizeof(idt0) - 1;
 1706         r_idt.rd_base = (long) idt;
 1707         lidt(&r_idt);
 1708 
 1709         /*
 1710          * Initialize the clock before the console so that console
 1711          * initialization can use DELAY().
 1712          */
 1713         clock_init();
 1714 
 1715         /*
 1716          * Use vt(4) by default for UEFI boot (during the sc(4)/vt(4)
 1717          * transition).
 1718          * Once the bootblocks have been updated, we can test directly
 1719          * for efi_systbl != NULL here...
 1720          */
 1721         if (preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_EFI_MAP)
 1722             != NULL)
 1723                 vty_set_preferred(VTY_VT);
 1724 
 1725         TUNABLE_INT_FETCH("hw.ibrs_disable", &hw_ibrs_disable);
 1726         TUNABLE_INT_FETCH("hw.spec_store_bypass_disable", &hw_ssb_disable);
 1727         TUNABLE_INT_FETCH("hw.mds_disable", &hw_mds_disable);
 1728         TUNABLE_INT_FETCH("machdep.mitigations.taa.enable", &x86_taa_enable);
 1729 
 1730         finishidentcpu();       /* Final stage of CPU initialization */
 1731         initializecpu();        /* Initialize CPU registers */
 1732         initializecpucache();
 1733 
 1734         /* doublefault stack space, runs on ist1 */
 1735         common_tss[0].tss_ist1 = (long)&dblfault_stack[sizeof(dblfault_stack)];
 1736 
 1737         /*
 1738          * NMI stack, runs on ist2.  The pcpu pointer is stored just
 1739          * above the start of the ist2 stack.
 1740          */
 1741         np = ((struct nmi_pcpu *) &nmi0_stack[sizeof(nmi0_stack)]) - 1;
 1742         np->np_pcpu = (register_t) pc;
 1743         common_tss[0].tss_ist2 = (long) np;
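        /*
         * Editorial sketch (not in the original source): the pointer
         * arithmetic above reserves the top of the stack:
         *
         *      top = &nmi0_stack[sizeof(nmi0_stack)];  (one past the end)
         *      np  = (struct nmi_pcpu *)top - 1;       (highest slot)
         *
         * so the NMI handler can recover its pcpu pointer from a fixed
         * location just above the ist2 stack it runs on.
         */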
 1744 
 1745         /*
 1746          * MC# stack, runs on ist3.  The pcpu pointer is stored just
 1747          * above the start of the ist3 stack.
 1748          */
 1749         np = ((struct nmi_pcpu *) &mce0_stack[sizeof(mce0_stack)]) - 1;
 1750         np->np_pcpu = (register_t) pc;
 1751         common_tss[0].tss_ist3 = (long) np;
 1752 
 1753         /*
 1754          * DB# stack, runs on ist4.
 1755          */
 1756         np = ((struct nmi_pcpu *) &dbg0_stack[sizeof(dbg0_stack)]) - 1;
 1757         np->np_pcpu = (register_t) pc;
 1758         common_tss[0].tss_ist4 = (long) np;
 1759         
 1760         /* Set the IO permission bitmap (empty due to tss seg limit) */
 1761         common_tss[0].tss_iobase = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE;
 1762 
 1763         gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
 1764         ltr(gsel_tss);
 1765 
 1766         amd64_conf_fast_syscall();
 1767 
 1768         /*
 1769          * Temporarily forge a valid pointer to the PCB, for the
 1770          * exception handlers.  It is reinitialized properly below
 1771          * after the FPU is set up.  Also set td_critnest to
 1772          * short-cut the page fault handler.
 1773          */
 1774         cpu_max_ext_state_size = sizeof(struct savefpu);
 1775         thread0.td_pcb = get_pcb_td(&thread0);
 1776         thread0.td_critnest = 1;
 1777 
 1778         /*
 1779          * The console and kdb should be initialized even earlier than here,
 1780          * but some console drivers don't work until after getmemsize().
 1781          * Default to late console initialization to support these drivers.
 1782          * This loses mainly printf()s in getmemsize() and early debugging.
 1783          */
 1784         late_console = 1;
 1785         TUNABLE_INT_FETCH("debug.late_console", &late_console);
 1786         if (!late_console) {
 1787                 cninit();
 1788                 amd64_kdb_init();
 1789         }
 1790 
 1791         getmemsize(kmdp, physfree);
 1792         init_param2(physmem);
 1793 
 1794         /* now running on new page tables, configured, and u/iom is accessible */
 1795 
 1796         if (late_console)
 1797                 cninit();
 1798 
 1799 #ifdef DEV_ISA
 1800 #ifdef DEV_ATPIC
 1801         elcr_probe();
 1802         atpic_startup();
 1803 #else
 1804         /* Reset and mask the atpics and leave them shut down. */
 1805         atpic_reset();
 1806 
 1807         /*
 1808          * Point the ICU spurious interrupt vectors at the APIC spurious
 1809          * interrupt handler.
 1810          */
 1811         setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
 1812         setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
 1813 #endif
 1814 #else
 1815 #error "have you forgotten the isa device?"
 1816 #endif
 1817 
 1818         if (late_console)
 1819                 amd64_kdb_init();
 1820 
 1821         msgbufinit(msgbufp, msgbufsize);
 1822         fpuinit();
 1823 
 1824         /*
 1825          * Set up thread0 pcb after fpuinit calculated pcb + fpu save
 1826          * area size.  Zero out the extended state header in fpu save
 1827          * area.
 1828          */
 1829         thread0.td_pcb = get_pcb_td(&thread0);
 1830         thread0.td_pcb->pcb_save = get_pcb_user_save_td(&thread0);
 1831         bzero(get_pcb_user_save_td(&thread0), cpu_max_ext_state_size);
 1832         if (use_xsave) {
 1833                 xhdr = (struct xstate_hdr *)(get_pcb_user_save_td(&thread0) +
 1834                     1);
 1835                 xhdr->xstate_bv = xsave_mask;
 1836         }
 1837         /* make an initial tss so the cpu can get an interrupt stack on syscall! */
 1838         rsp0 = (vm_offset_t)thread0.td_pcb;
 1839         /* Ensure the stack is aligned to 16 bytes */
 1840         rsp0 &= ~0xFul;
 1841         common_tss[0].tss_rsp0 = rsp0;
 1842         PCPU_SET(rsp0, rsp0);
 1843         PCPU_SET(pti_rsp0, ((vm_offset_t)PCPU_PTR(pti_stack) +
 1844             PC_PTI_STACK_SZ * sizeof(uint64_t)) & ~0xful);
 1845         PCPU_SET(curpcb, thread0.td_pcb);
 1846 
 1847         /* transfer to user mode */
 1848 
 1849         _ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
 1850         _udatasel = GSEL(GUDATA_SEL, SEL_UPL);
 1851         _ucode32sel = GSEL(GUCODE32_SEL, SEL_UPL);
 1852         _ufssel = GSEL(GUFS32_SEL, SEL_UPL);
 1853         _ugssel = GSEL(GUGS32_SEL, SEL_UPL);
 1854 
 1855         load_ds(_udatasel);
 1856         load_es(_udatasel);
 1857         load_fs(_ufssel);
 1858 
 1859         /* setup proc 0's pcb */
 1860         thread0.td_pcb->pcb_flags = 0;
 1861         thread0.td_frame = &proc0_tf;
 1862 
 1863         env = kern_getenv("kernelname");
 1864         if (env != NULL)
 1865                 strlcpy(kernelname, env, sizeof(kernelname));
 1866 
 1867         cpu_probe_amdc1e();
 1868 
 1869 #ifdef FDT
 1870         x86_init_fdt();
 1871 #endif
 1872         thread0.td_critnest = 0;
 1873 
 1874         TUNABLE_INT_FETCH("hw.ibrs_disable", &hw_ibrs_disable);
 1875         TUNABLE_INT_FETCH("hw.spec_store_bypass_disable", &hw_ssb_disable);
 1876         TUNABLE_INT_FETCH("hw.mds_disable", &hw_mds_disable);
 1877 
 1878         /* Location of kernel stack for locore */
 1879         return ((u_int64_t)thread0.td_pcb);
 1880 }
 1881 
 1882 void
 1883 cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size)
 1884 {
 1885 
 1886         pcpu->pc_acpi_id = 0xffffffff;
 1887 }
 1888 
 1889 static int
 1890 smap_sysctl_handler(SYSCTL_HANDLER_ARGS)
 1891 {
 1892         struct bios_smap *smapbase;
 1893         struct bios_smap_xattr smap;
 1894         caddr_t kmdp;
 1895         uint32_t *smapattr;
 1896         int count, error, i;
 1897 
 1898         /* Retrieve the system memory map from the loader. */
 1899         kmdp = preload_search_by_type("elf kernel");
 1900         if (kmdp == NULL)
 1901                 kmdp = preload_search_by_type("elf64 kernel");
 1902         smapbase = (struct bios_smap *)preload_search_info(kmdp,
 1903             MODINFO_METADATA | MODINFOMD_SMAP);
 1904         if (smapbase == NULL)
 1905                 return (0);
 1906         smapattr = (uint32_t *)preload_search_info(kmdp,
 1907             MODINFO_METADATA | MODINFOMD_SMAP_XATTR);
 1908         count = *((uint32_t *)smapbase - 1) / sizeof(*smapbase);
 1909         error = 0;
 1910         for (i = 0; i < count; i++) {
 1911                 smap.base = smapbase[i].base;
 1912                 smap.length = smapbase[i].length;
 1913                 smap.type = smapbase[i].type;
 1914                 if (smapattr != NULL)
 1915                         smap.xattr = smapattr[i];
 1916                 else
 1917                         smap.xattr = 0;
 1918                 error = SYSCTL_OUT(req, &smap, sizeof(smap));
 1919         }
 1920         return (error);
 1921 }
 1922 SYSCTL_PROC(_machdep, OID_AUTO, smap, CTLTYPE_OPAQUE|CTLFLAG_RD, NULL, 0,
 1923     smap_sysctl_handler, "S,bios_smap_xattr", "Raw BIOS SMAP data");
 1924 
 1925 static int
 1926 efi_map_sysctl_handler(SYSCTL_HANDLER_ARGS)
 1927 {
 1928         struct efi_map_header *efihdr;
 1929         caddr_t kmdp;
 1930         uint32_t efisize;
 1931 
 1932         kmdp = preload_search_by_type("elf kernel");
 1933         if (kmdp == NULL)
 1934                 kmdp = preload_search_by_type("elf64 kernel");
 1935         efihdr = (struct efi_map_header *)preload_search_info(kmdp,
 1936             MODINFO_METADATA | MODINFOMD_EFI_MAP);
 1937         if (efihdr == NULL)
 1938                 return (0);
 1939         efisize = *((uint32_t *)efihdr - 1);
 1940         return (SYSCTL_OUT(req, efihdr, efisize));
 1941 }
 1942 SYSCTL_PROC(_machdep, OID_AUTO, efi_map, CTLTYPE_OPAQUE|CTLFLAG_RD, NULL, 0,
 1943     efi_map_sysctl_handler, "S,efi_map_header", "Raw EFI Memory Map");
 1944 
 1945 void
 1946 spinlock_enter(void)
 1947 {
 1948         struct thread *td;
 1949         register_t flags;
 1950 
 1951         td = curthread;
 1952         if (td->td_md.md_spinlock_count == 0) {
 1953                 flags = intr_disable();
 1954                 td->td_md.md_spinlock_count = 1;
 1955                 td->td_md.md_saved_flags = flags;
 1956         } else
 1957                 td->td_md.md_spinlock_count++;
 1958         critical_enter();
 1959 }
 1960 
 1961 void
 1962 spinlock_exit(void)
 1963 {
 1964         struct thread *td;
 1965         register_t flags;
 1966 
 1967         td = curthread;
 1968         critical_exit();
 1969         flags = td->td_md.md_saved_flags;
 1970         td->td_md.md_spinlock_count--;
 1971         if (td->td_md.md_spinlock_count == 0)
 1972                 intr_restore(flags);
 1973 }
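/*
 * Editorial note (not in the original source): only the outermost
 * spinlock_enter() disables interrupts and records the previous state;
 * nested calls merely bump md_spinlock_count, and spinlock_exit()
 * restores the saved flags only when the count drops back to zero, so
 * interrupts stay disabled across the entire nest.
 */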
 1974 
 1975 /*
 1976  * Construct a PCB from a trapframe. This is called from kdb_trap() where
 1977  * we want to start a backtrace from the function that caused us to enter
 1978  * the debugger. We have the context in the trapframe, but base the trace
 1979  * on the PCB. The PCB doesn't have to be perfect, as long as it contains
 1980  * enough for a backtrace.
 1981  */
 1982 void
 1983 makectx(struct trapframe *tf, struct pcb *pcb)
 1984 {
 1985 
 1986         pcb->pcb_r12 = tf->tf_r12;
 1987         pcb->pcb_r13 = tf->tf_r13;
 1988         pcb->pcb_r14 = tf->tf_r14;
 1989         pcb->pcb_r15 = tf->tf_r15;
 1990         pcb->pcb_rbp = tf->tf_rbp;
 1991         pcb->pcb_rbx = tf->tf_rbx;
 1992         pcb->pcb_rip = tf->tf_rip;
 1993         pcb->pcb_rsp = tf->tf_rsp;
 1994 }
 1995 
 1996 int
 1997 ptrace_set_pc(struct thread *td, unsigned long addr)
 1998 {
 1999 
 2000         td->td_frame->tf_rip = addr;
 2001         set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
 2002         return (0);
 2003 }
 2004 
 2005 int
 2006 ptrace_single_step(struct thread *td)
 2007 {
 2008 
 2009         PROC_LOCK_ASSERT(td->td_proc, MA_OWNED);
 2010         if ((td->td_frame->tf_rflags & PSL_T) == 0) {
 2011                 td->td_frame->tf_rflags |= PSL_T;
 2012                 td->td_dbgflags |= TDB_STEP;
 2013         }
 2014         return (0);
 2015 }
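/*
 * Editorial note (not in the original source): PSL_T is the x86 trap flag
 * (bit 8 of rflags); while it is set the CPU delivers a #DB exception
 * after each instruction, which is what implements single-stepping here.
 * TDB_STEP appears to record that the step was requested through the
 * debugger interface so it can be distinguished from other traps.
 */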
 2016 
 2017 int
 2018 ptrace_clear_single_step(struct thread *td)
 2019 {
 2020         PROC_LOCK_ASSERT(td->td_proc, MA_OWNED);
 2021         td->td_frame->tf_rflags &= ~PSL_T;
 2022         td->td_dbgflags &= ~TDB_STEP;
 2023         return (0);
 2024 }
 2025 
 2026 int
 2027 fill_regs(struct thread *td, struct reg *regs)
 2028 {
 2029         struct trapframe *tp;
 2030 
 2031         tp = td->td_frame;
 2032         return (fill_frame_regs(tp, regs));
 2033 }
 2034 
 2035 int
 2036 fill_frame_regs(struct trapframe *tp, struct reg *regs)
 2037 {
 2038 
 2039         regs->r_r15 = tp->tf_r15;
 2040         regs->r_r14 = tp->tf_r14;
 2041         regs->r_r13 = tp->tf_r13;
 2042         regs->r_r12 = tp->tf_r12;
 2043         regs->r_r11 = tp->tf_r11;
 2044         regs->r_r10 = tp->tf_r10;
 2045         regs->r_r9  = tp->tf_r9;
 2046         regs->r_r8  = tp->tf_r8;
 2047         regs->r_rdi = tp->tf_rdi;
 2048         regs->r_rsi = tp->tf_rsi;
 2049         regs->r_rbp = tp->tf_rbp;
 2050         regs->r_rbx = tp->tf_rbx;
 2051         regs->r_rdx = tp->tf_rdx;
 2052         regs->r_rcx = tp->tf_rcx;
 2053         regs->r_rax = tp->tf_rax;
 2054         regs->r_rip = tp->tf_rip;
 2055         regs->r_cs = tp->tf_cs;
 2056         regs->r_rflags = tp->tf_rflags;
 2057         regs->r_rsp = tp->tf_rsp;
 2058         regs->r_ss = tp->tf_ss;
 2059         if (tp->tf_flags & TF_HASSEGS) {
 2060                 regs->r_ds = tp->tf_ds;
 2061                 regs->r_es = tp->tf_es;
 2062                 regs->r_fs = tp->tf_fs;
 2063                 regs->r_gs = tp->tf_gs;
 2064         } else {
 2065                 regs->r_ds = 0;
 2066                 regs->r_es = 0;
 2067                 regs->r_fs = 0;
 2068                 regs->r_gs = 0;
 2069         }
 2070         regs->r_err = 0;
 2071         regs->r_trapno = 0;
 2072         return (0);
 2073 }
 2074 
 2075 int
 2076 set_regs(struct thread *td, struct reg *regs)
 2077 {
 2078         struct trapframe *tp;
 2079         register_t rflags;
 2080 
 2081         tp = td->td_frame;
 2082         rflags = regs->r_rflags & 0xffffffff;
 2083         if (!EFL_SECURE(rflags, tp->tf_rflags) || !CS_SECURE(regs->r_cs))
 2084                 return (EINVAL);
 2085         tp->tf_r15 = regs->r_r15;
 2086         tp->tf_r14 = regs->r_r14;
 2087         tp->tf_r13 = regs->r_r13;
 2088         tp->tf_r12 = regs->r_r12;
 2089         tp->tf_r11 = regs->r_r11;
 2090         tp->tf_r10 = regs->r_r10;
 2091         tp->tf_r9  = regs->r_r9;
 2092         tp->tf_r8  = regs->r_r8;
 2093         tp->tf_rdi = regs->r_rdi;
 2094         tp->tf_rsi = regs->r_rsi;
 2095         tp->tf_rbp = regs->r_rbp;
 2096         tp->tf_rbx = regs->r_rbx;
 2097         tp->tf_rdx = regs->r_rdx;
 2098         tp->tf_rcx = regs->r_rcx;
 2099         tp->tf_rax = regs->r_rax;
 2100         tp->tf_rip = regs->r_rip;
 2101         tp->tf_cs = regs->r_cs;
 2102         tp->tf_rflags = rflags;
 2103         tp->tf_rsp = regs->r_rsp;
 2104         tp->tf_ss = regs->r_ss;
 2105         if (0) {        /* XXXKIB */
 2106                 tp->tf_ds = regs->r_ds;
 2107                 tp->tf_es = regs->r_es;
 2108                 tp->tf_fs = regs->r_fs;
 2109                 tp->tf_gs = regs->r_gs;
 2110                 tp->tf_flags = TF_HASSEGS;
 2111         }
 2112         set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
 2113         return (0);
 2114 }
 2115 
 2116 /* XXX check all this stuff! */
 2117 /* externalize from sv_xmm */
 2118 static void
 2119 fill_fpregs_xmm(struct savefpu *sv_xmm, struct fpreg *fpregs)
 2120 {
 2121         struct envxmm *penv_fpreg = (struct envxmm *)&fpregs->fpr_env;
 2122         struct envxmm *penv_xmm = &sv_xmm->sv_env;
 2123         int i;
 2124 
 2125         /* pcb -> fpregs */
 2126         bzero(fpregs, sizeof(*fpregs));
 2127 
 2128         /* FPU control/status */
 2129         penv_fpreg->en_cw = penv_xmm->en_cw;
 2130         penv_fpreg->en_sw = penv_xmm->en_sw;
 2131         penv_fpreg->en_tw = penv_xmm->en_tw;
 2132         penv_fpreg->en_opcode = penv_xmm->en_opcode;
 2133         penv_fpreg->en_rip = penv_xmm->en_rip;
 2134         penv_fpreg->en_rdp = penv_xmm->en_rdp;
 2135         penv_fpreg->en_mxcsr = penv_xmm->en_mxcsr;
 2136         penv_fpreg->en_mxcsr_mask = penv_xmm->en_mxcsr_mask;
 2137 
 2138         /* FPU registers */
 2139         for (i = 0; i < 8; ++i)
 2140                 bcopy(sv_xmm->sv_fp[i].fp_acc.fp_bytes, fpregs->fpr_acc[i], 10);
 2141 
 2142         /* SSE registers */
 2143         for (i = 0; i < 16; ++i)
 2144                 bcopy(sv_xmm->sv_xmm[i].xmm_bytes, fpregs->fpr_xacc[i], 16);
 2145 }
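/*
 * Editorial note (not in the original source): the 10-byte copies above
 * match the 80-bit x87 register format and the 16-byte copies the 128-bit
 * XMM registers; struct fpreg and struct savefpu lay these out
 * differently, hence the field-by-field conversion.
 */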
 2146 
 2147 /* internalize from fpregs into sv_xmm */
 2148 static void
 2149 set_fpregs_xmm(struct fpreg *fpregs, struct savefpu *sv_xmm)
 2150 {
 2151         struct envxmm *penv_xmm = &sv_xmm->sv_env;
 2152         struct envxmm *penv_fpreg = (struct envxmm *)&fpregs->fpr_env;
 2153         int i;
 2154 
 2155         /* fpregs -> pcb */
 2156         /* FPU control/status */
 2157         penv_xmm->en_cw = penv_fpreg->en_cw;
 2158         penv_xmm->en_sw = penv_fpreg->en_sw;
 2159         penv_xmm->en_tw = penv_fpreg->en_tw;
 2160         penv_xmm->en_opcode = penv_fpreg->en_opcode;
 2161         penv_xmm->en_rip = penv_fpreg->en_rip;
 2162         penv_xmm->en_rdp = penv_fpreg->en_rdp;
 2163         penv_xmm->en_mxcsr = penv_fpreg->en_mxcsr;
 2164         penv_xmm->en_mxcsr_mask = penv_fpreg->en_mxcsr_mask & cpu_mxcsr_mask;
 2165 
 2166         /* FPU registers */
 2167         for (i = 0; i < 8; ++i)
 2168                 bcopy(fpregs->fpr_acc[i], sv_xmm->sv_fp[i].fp_acc.fp_bytes, 10);
 2169 
 2170         /* SSE registers */
 2171         for (i = 0; i < 16; ++i)
 2172                 bcopy(fpregs->fpr_xacc[i], sv_xmm->sv_xmm[i].xmm_bytes, 16);
 2173 }
 2174 
 2175 /* externalize from td->pcb */
 2176 int
 2177 fill_fpregs(struct thread *td, struct fpreg *fpregs)
 2178 {
 2179 
 2180         KASSERT(td == curthread || TD_IS_SUSPENDED(td) ||
 2181             P_SHOULDSTOP(td->td_proc),
 2182             ("not suspended thread %p", td));
 2183         fpugetregs(td);
 2184         fill_fpregs_xmm(get_pcb_user_save_td(td), fpregs);
 2185         return (0);
 2186 }
 2187 
 2188 /* internalize to td->pcb */
 2189 int
 2190 set_fpregs(struct thread *td, struct fpreg *fpregs)
 2191 {
 2192 
 2193         critical_enter();
 2194         set_fpregs_xmm(fpregs, get_pcb_user_save_td(td));
 2195         fpuuserinited(td);
 2196         critical_exit();
 2197         return (0);
 2198 }
 2199 
 2200 /*
 2201  * Get machine context.
 2202  */
 2203 int
 2204 get_mcontext(struct thread *td, mcontext_t *mcp, int flags)
 2205 {
 2206         struct pcb *pcb;
 2207         struct trapframe *tp;
 2208 
 2209         pcb = td->td_pcb;
 2210         tp = td->td_frame;
 2211         PROC_LOCK(curthread->td_proc);
 2212         mcp->mc_onstack = sigonstack(tp->tf_rsp);
 2213         PROC_UNLOCK(curthread->td_proc);
 2214         mcp->mc_r15 = tp->tf_r15;
 2215         mcp->mc_r14 = tp->tf_r14;
 2216         mcp->mc_r13 = tp->tf_r13;
 2217         mcp->mc_r12 = tp->tf_r12;
 2218         mcp->mc_r11 = tp->tf_r11;
 2219         mcp->mc_r10 = tp->tf_r10;
 2220         mcp->mc_r9  = tp->tf_r9;
 2221         mcp->mc_r8  = tp->tf_r8;
 2222         mcp->mc_rdi = tp->tf_rdi;
 2223         mcp->mc_rsi = tp->tf_rsi;
 2224         mcp->mc_rbp = tp->tf_rbp;
 2225         mcp->mc_rbx = tp->tf_rbx;
 2226         mcp->mc_rcx = tp->tf_rcx;
 2227         mcp->mc_rflags = tp->tf_rflags;
 2228         if (flags & GET_MC_CLEAR_RET) {
 2229                 mcp->mc_rax = 0;
 2230                 mcp->mc_rdx = 0;
 2231                 mcp->mc_rflags &= ~PSL_C;
 2232         } else {
 2233                 mcp->mc_rax = tp->tf_rax;
 2234                 mcp->mc_rdx = tp->tf_rdx;
 2235         }
 2236         mcp->mc_rip = tp->tf_rip;
 2237         mcp->mc_cs = tp->tf_cs;
 2238         mcp->mc_rsp = tp->tf_rsp;
 2239         mcp->mc_ss = tp->tf_ss;
 2240         mcp->mc_ds = tp->tf_ds;
 2241         mcp->mc_es = tp->tf_es;
 2242         mcp->mc_fs = tp->tf_fs;
 2243         mcp->mc_gs = tp->tf_gs;
 2244         mcp->mc_flags = tp->tf_flags;
 2245         mcp->mc_len = sizeof(*mcp);
 2246         get_fpcontext(td, mcp, NULL, 0);
 2247         update_pcb_bases(pcb);
 2248         mcp->mc_fsbase = pcb->pcb_fsbase;
 2249         mcp->mc_gsbase = pcb->pcb_gsbase;
 2250         mcp->mc_xfpustate = 0;
 2251         mcp->mc_xfpustate_len = 0;
 2252         bzero(mcp->mc_spare, sizeof(mcp->mc_spare));
 2253         return (0);
 2254 }
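/*
 * Editorial note (not in the original source): GET_MC_CLEAR_RET zeroes
 * %rax, %rdx and the carry flag in the captured context.  This appears
 * intended for contexts taken inside a system call, so that resuming the
 * context does not replay the interrupted call's return value.
 */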
 2255 
 2256 /*
 2257  * Set machine context.
 2258  *
 2259  * Note that we set only the user-modifiable flags, and we won't
 2260  * touch the %cs selector.
 2261  */
 2262 int
 2263 set_mcontext(struct thread *td, mcontext_t *mcp)
 2264 {
 2265         struct pcb *pcb;
 2266         struct trapframe *tp;
 2267         char *xfpustate;
 2268         long rflags;
 2269         int ret;
 2270 
 2271         pcb = td->td_pcb;
 2272         tp = td->td_frame;
 2273         if (mcp->mc_len != sizeof(*mcp) ||
 2274             (mcp->mc_flags & ~_MC_FLAG_MASK) != 0)
 2275                 return (EINVAL);
 2276         rflags = (mcp->mc_rflags & PSL_USERCHANGE) |
 2277             (tp->tf_rflags & ~PSL_USERCHANGE);
 2278         if (mcp->mc_flags & _MC_HASFPXSTATE) {
 2279                 if (mcp->mc_xfpustate_len > cpu_max_ext_state_size -
 2280                     sizeof(struct savefpu))
 2281                         return (EINVAL);
 2282                 xfpustate = __builtin_alloca(mcp->mc_xfpustate_len);
 2283                 ret = copyin((void *)mcp->mc_xfpustate, xfpustate,
 2284                     mcp->mc_xfpustate_len);
 2285                 if (ret != 0)
 2286                         return (ret);
 2287         } else
 2288                 xfpustate = NULL;
 2289         ret = set_fpcontext(td, mcp, xfpustate, mcp->mc_xfpustate_len);
 2290         if (ret != 0)
 2291                 return (ret);
 2292         tp->tf_r15 = mcp->mc_r15;
 2293         tp->tf_r14 = mcp->mc_r14;
 2294         tp->tf_r13 = mcp->mc_r13;
 2295         tp->tf_r12 = mcp->mc_r12;
 2296         tp->tf_r11 = mcp->mc_r11;
 2297         tp->tf_r10 = mcp->mc_r10;
 2298         tp->tf_r9  = mcp->mc_r9;
 2299         tp->tf_r8  = mcp->mc_r8;
 2300         tp->tf_rdi = mcp->mc_rdi;
 2301         tp->tf_rsi = mcp->mc_rsi;
 2302         tp->tf_rbp = mcp->mc_rbp;
 2303         tp->tf_rbx = mcp->mc_rbx;
 2304         tp->tf_rdx = mcp->mc_rdx;
 2305         tp->tf_rcx = mcp->mc_rcx;
 2306         tp->tf_rax = mcp->mc_rax;
 2307         tp->tf_rip = mcp->mc_rip;
 2308         tp->tf_rflags = rflags;
 2309         tp->tf_rsp = mcp->mc_rsp;
 2310         tp->tf_ss = mcp->mc_ss;
 2311         tp->tf_flags = mcp->mc_flags;
 2312         if (tp->tf_flags & TF_HASSEGS) {
 2313                 tp->tf_ds = mcp->mc_ds;
 2314                 tp->tf_es = mcp->mc_es;
 2315                 tp->tf_fs = mcp->mc_fs;
 2316                 tp->tf_gs = mcp->mc_gs;
 2317         }
 2318         set_pcb_flags(pcb, PCB_FULL_IRET);
 2319         if (mcp->mc_flags & _MC_HASBASES) {
 2320                 pcb->pcb_fsbase = mcp->mc_fsbase;
 2321                 pcb->pcb_gsbase = mcp->mc_gsbase;
 2322         }
 2323         return (0);
 2324 }
 2325 
 2326 static void
 2327 get_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpusave,
 2328     size_t xfpusave_len)
 2329 {
 2330         size_t max_len, len;
 2331 
 2332         mcp->mc_ownedfp = fpugetregs(td);
 2333         bcopy(get_pcb_user_save_td(td), &mcp->mc_fpstate[0],
 2334             sizeof(mcp->mc_fpstate));
 2335         mcp->mc_fpformat = fpuformat();
 2336         if (!use_xsave || xfpusave_len == 0)
 2337                 return;
 2338         max_len = cpu_max_ext_state_size - sizeof(struct savefpu);
 2339         len = xfpusave_len;
 2340         if (len > max_len) {
 2341                 bzero(xfpusave + max_len, len - max_len);
 2342                 len = max_len;
 2343         }
 2344         mcp->mc_flags |= _MC_HASFPXSTATE;
 2345         mcp->mc_xfpustate_len = len;
 2346         bcopy(get_pcb_user_save_td(td) + 1, xfpusave, len);
 2347 }
 2348 
 2349 static int
 2350 set_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpustate,
 2351     size_t xfpustate_len)
 2352 {
 2353         int error;
 2354 
 2355         if (mcp->mc_fpformat == _MC_FPFMT_NODEV)
 2356                 return (0);
 2357         else if (mcp->mc_fpformat != _MC_FPFMT_XMM)
 2358                 return (EINVAL);
 2359         else if (mcp->mc_ownedfp == _MC_FPOWNED_NONE) {
 2360                 /* We don't care what state is left in the FPU or PCB. */
 2361                 fpstate_drop(td);
 2362                 error = 0;
 2363         } else if (mcp->mc_ownedfp == _MC_FPOWNED_FPU ||
 2364             mcp->mc_ownedfp == _MC_FPOWNED_PCB) {
 2365                 error = fpusetregs(td, (struct savefpu *)&mcp->mc_fpstate,
 2366                     xfpustate, xfpustate_len);
 2367         } else
 2368                 return (EINVAL);
 2369         return (error);
 2370 }
 2371 
 2372 void
 2373 fpstate_drop(struct thread *td)
 2374 {
 2375 
 2376         KASSERT(PCB_USER_FPU(td->td_pcb), ("fpstate_drop: kernel-owned fpu"));
 2377         critical_enter();
 2378         if (PCPU_GET(fpcurthread) == td)
 2379                 fpudrop();
 2380         /*
 2381          * XXX force a full drop of the fpu.  The above only drops it if we
 2382          * owned it.
 2383          *
 2384          * XXX I don't much like fpugetuserregs()'s semantics of doing a full
 2385          * drop.  Dropping only to the pcb matches fnsave's behaviour.
 2386          * We only need to drop to !PCB_INITDONE in sendsig().  But
 2387          * sendsig() is the only caller of fpugetuserregs()... perhaps we just
 2388          * have too many layers.
 2389          */
 2390         clear_pcb_flags(curthread->td_pcb,
 2391             PCB_FPUINITDONE | PCB_USERFPUINITDONE);
 2392         critical_exit();
 2393 }
 2394 
 2395 int
 2396 fill_dbregs(struct thread *td, struct dbreg *dbregs)
 2397 {
 2398         struct pcb *pcb;
 2399 
 2400         if (td == NULL) {
 2401                 dbregs->dr[0] = rdr0();
 2402                 dbregs->dr[1] = rdr1();
 2403                 dbregs->dr[2] = rdr2();
 2404                 dbregs->dr[3] = rdr3();
 2405                 dbregs->dr[6] = rdr6();
 2406                 dbregs->dr[7] = rdr7();
 2407         } else {
 2408                 pcb = td->td_pcb;
 2409                 dbregs->dr[0] = pcb->pcb_dr0;
 2410                 dbregs->dr[1] = pcb->pcb_dr1;
 2411                 dbregs->dr[2] = pcb->pcb_dr2;
 2412                 dbregs->dr[3] = pcb->pcb_dr3;
 2413                 dbregs->dr[6] = pcb->pcb_dr6;
 2414                 dbregs->dr[7] = pcb->pcb_dr7;
 2415         }
 2416         dbregs->dr[4] = 0;
 2417         dbregs->dr[5] = 0;
 2418         dbregs->dr[8] = 0;
 2419         dbregs->dr[9] = 0;
 2420         dbregs->dr[10] = 0;
 2421         dbregs->dr[11] = 0;
 2422         dbregs->dr[12] = 0;
 2423         dbregs->dr[13] = 0;
 2424         dbregs->dr[14] = 0;
 2425         dbregs->dr[15] = 0;
 2426         return (0);
 2427 }
 2428 
 2429 int
 2430 set_dbregs(struct thread *td, struct dbreg *dbregs)
 2431 {
 2432         struct pcb *pcb;
 2433         int i;
 2434 
 2435         if (td == NULL) {
 2436                 load_dr0(dbregs->dr[0]);
 2437                 load_dr1(dbregs->dr[1]);
 2438                 load_dr2(dbregs->dr[2]);
 2439                 load_dr3(dbregs->dr[3]);
 2440                 load_dr6(dbregs->dr[6]);
 2441                 load_dr7(dbregs->dr[7]);
 2442         } else {
 2443                 /*
 2444                  * Don't let an illegal value for dr7 get set.  Specifically,
 2445                  * check for undefined settings.  Setting these bit patterns
 2446                  * results in undefined behaviour and can lead to an unexpected
 2447                  * TRCTRAP or a general protection fault right here.
 2448                  * The upper bits of dr6 and dr7 must not be set.
 2449                  */
 2450                 for (i = 0; i < 4; i++) {
 2451                         if (DBREG_DR7_ACCESS(dbregs->dr[7], i) == 0x02)
 2452                                 return (EINVAL);
 2453                         if (td->td_frame->tf_cs == _ucode32sel &&
 2454                             DBREG_DR7_LEN(dbregs->dr[7], i) == DBREG_DR7_LEN_8)
 2455                                 return (EINVAL);
 2456                 }
 2457                 if ((dbregs->dr[6] & 0xffffffff00000000ul) != 0 ||
 2458                     (dbregs->dr[7] & 0xffffffff00000000ul) != 0)
 2459                         return (EINVAL);
 2460 
 2461                 pcb = td->td_pcb;
 2462 
 2463                 /*
 2464                  * Don't let a process set a breakpoint that is not within the
 2465                  * process's address space.  If a process could do this, it
 2466                  * could halt the system by setting a breakpoint in the kernel
 2467                  * (if ddb was enabled).  Thus, we need to check to make sure
 2468                  * that no breakpoints are being enabled for addresses outside
 2469                  * the process's address space.
 2470                  *
 2471                  * XXX - what about when the watched area of the user's
 2472                  * address space is written into from within the kernel
 2473                  * ... wouldn't that still cause a breakpoint to be generated
 2474                  * from within kernel mode?
 2475                  */
 2476 
 2477                 if (DBREG_DR7_ENABLED(dbregs->dr[7], 0)) {
 2478                         /* dr0 is enabled */
 2479                         if (dbregs->dr[0] >= VM_MAXUSER_ADDRESS)
 2480                                 return (EINVAL);
 2481                 }
 2482                 if (DBREG_DR7_ENABLED(dbregs->dr[7], 1)) {
 2483                         /* dr1 is enabled */
 2484                         if (dbregs->dr[1] >= VM_MAXUSER_ADDRESS)
 2485                                 return (EINVAL);
 2486                 }
 2487                 if (DBREG_DR7_ENABLED(dbregs->dr[7], 2)) {
 2488                         /* dr2 is enabled */
 2489                         if (dbregs->dr[2] >= VM_MAXUSER_ADDRESS)
 2490                                 return (EINVAL);
 2491                 }
 2492                 if (DBREG_DR7_ENABLED(dbregs->dr[7], 3)) {
 2493                         /* dr3 is enabled */
 2494                         if (dbregs->dr[3] >= VM_MAXUSER_ADDRESS)
 2495                                 return (EINVAL);
 2496                 }
 2497 
 2498                 pcb->pcb_dr0 = dbregs->dr[0];
 2499                 pcb->pcb_dr1 = dbregs->dr[1];
 2500                 pcb->pcb_dr2 = dbregs->dr[2];
 2501                 pcb->pcb_dr3 = dbregs->dr[3];
 2502                 pcb->pcb_dr6 = dbregs->dr[6];
 2503                 pcb->pcb_dr7 = dbregs->dr[7];
 2504 
 2505                 set_pcb_flags(pcb, PCB_DBREGS);
 2506         }
 2507 
 2508         return (0);
 2509 }
 2510 
 2511 void
 2512 reset_dbregs(void)
 2513 {
 2514 
 2515         load_dr7(0);    /* Turn off the control bits first */
 2516         load_dr0(0);
 2517         load_dr1(0);
 2518         load_dr2(0);
 2519         load_dr3(0);
 2520         load_dr6(0);
 2521 }
 2522 
 2523 /*
 2524  * Return > 0 if a hardware breakpoint has been hit, and the
 2525  * breakpoint was in user space.  Return 0, otherwise.
 2526  */
 2527 int
 2528 user_dbreg_trap(register_t dr6)
 2529 {
 2530         u_int64_t dr7;
 2531         u_int64_t bp;       /* breakpoint bits extracted from dr6 */
 2532         int nbp;            /* number of breakpoints that triggered */
 2533         caddr_t addr[4];    /* breakpoint addresses */
 2534         int i;
 2535 
 2536         bp = dr6 & DBREG_DR6_BMASK;
 2537         if (bp == 0) {
 2538                 /*
 2539                  * None of the breakpoint bits are set, meaning this
 2540                  * trap was not caused by any of the debug registers.
 2541                  */
 2542                 return (0);
 2543         }
 2544 
 2545         dr7 = rdr7();
 2546         if ((dr7 & 0x000000ff) == 0) {
 2547                 /*
 2548                  * All of the local and global enable bits in dr7
 2549                  * are zero, thus the trap couldn't have been caused
 2550                  * by the hardware debug registers.
 2551                  */
 2552                 return (0);
 2553         }
 2554 
 2555         nbp = 0;
 2556 
 2557         /*
 2558          * At least one of the breakpoints was hit; check to see
 2559          * which ones, and whether any of them are user-space addresses.
 2560          */
 2561 
 2562         if (bp & 0x01) {
 2563                 addr[nbp++] = (caddr_t)rdr0();
 2564         }
 2565         if (bp & 0x02) {
 2566                 addr[nbp++] = (caddr_t)rdr1();
 2567         }
 2568         if (bp & 0x04) {
 2569                 addr[nbp++] = (caddr_t)rdr2();
 2570         }
 2571         if (bp & 0x08) {
 2572                 addr[nbp++] = (caddr_t)rdr3();
 2573         }
 2574 
 2575         for (i = 0; i < nbp; i++) {
 2576                 if (addr[i] < (caddr_t)VM_MAXUSER_ADDRESS) {
 2577                         /*
 2578                          * addr[i] is in user space
 2579                          */
 2580                         return (nbp);
 2581                 }
 2582         }
 2583 
 2584         /*
 2585          * None of the breakpoints are in user space.
 2586          */
 2587         return (0);
 2588 }
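/*
 * Editorial note (not in the original source): DBREG_DR6_BMASK covers the
 * B0..B3 bits in the low nibble of %dr6, one per debug register; bit i
 * set means hardware breakpoint i fired, which is why bp is tested
 * against 0x01, 0x02, 0x04 and 0x08 above.
 */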
 2589 
 2590 /*
 2591  * The pcb_flags field is only modified by the current thread, or by other
 2592  * threads when the current thread is stopped.  However, the current thread
 2593  * may change it from the interrupt context in cpu_switch(), or in the trap
 2594  * handler.  When we read-modify-write pcb_flags from C sources, the compiler
 2595  * may generate code that is not atomic with respect to the interrupt
 2596  * handler.  If a trap or interrupt happens and any flag is modified from
 2597  * the handler, it can be clobbered with the cached value later.  Therefore,
 2598  * we implement setting and clearing flags with single-instruction functions,
 2599  * which do not race with modification from the trap or interrupt context,
 2600  * because traps and interrupts are taken only on instruction boundaries.
 2601  */
 2602 void
 2603 set_pcb_flags_raw(struct pcb *pcb, const u_int flags)
 2604 {
 2605 
 2606         __asm __volatile("orl %1,%0"
 2607             : "=m" (pcb->pcb_flags) : "ir" (flags), "m" (pcb->pcb_flags)
 2608             : "cc", "memory");
 2609 
 2610 }
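/*
 * Editorial sketch (not in the original source): the plain C equivalent
 *
 *      pcb->pcb_flags |= flags;
 *
 * may compile to a separate load, or, and store.  An interrupt taken
 * between the load and the store that itself changes pcb_flags would be
 * undone by the stale store.  The single "orl" above is one instruction,
 * and interrupts are only recognized on instruction boundaries, so that
 * window does not exist.
 */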
 2611 
 2612 /*
 2613  * The support for RDFSBASE, WRFSBASE and similar instructions for the
 2614  * %gs base requires that the kernel save MSR_FSBASE and MSR_{K,}GSBASE
 2615  * into the pcb if user space modified the bases.  We must save them on
 2616  * a context switch or if the return to usermode happens through doreti.
 2617  *
 2618  * Tracking of both events is performed by the pcb flag PCB_FULL_IRET,
 2619  * which has the consequence that the base MSRs must be saved each time
 2620  * the PCB_FULL_IRET flag is set.  We disable interrupts to sync with
 2621  * context switches.
 2622  */
 2623 void
 2624 set_pcb_flags(struct pcb *pcb, const u_int flags)
 2625 {
 2626         register_t r;
 2627 
 2628         if (curpcb == pcb &&
 2629             (flags & PCB_FULL_IRET) != 0 &&
 2630             (pcb->pcb_flags & PCB_FULL_IRET) == 0 &&
 2631             (cpu_stdext_feature & CPUID_STDEXT_FSGSBASE) != 0) {
 2632                 r = intr_disable();
 2633                 if ((pcb->pcb_flags & PCB_FULL_IRET) == 0) {
 2634                         if (rfs() == _ufssel)
 2635                                 pcb->pcb_fsbase = rdfsbase();
 2636                         if (rgs() == _ugssel)
 2637                                 pcb->pcb_gsbase = rdmsr(MSR_KGSBASE);
 2638                 }
 2639                 set_pcb_flags_raw(pcb, flags);
 2640                 intr_restore(r);
 2641         } else {
 2642                 set_pcb_flags_raw(pcb, flags);
 2643         }
 2644 }
 2645 
 2646 void
 2647 clear_pcb_flags(struct pcb *pcb, const u_int flags)
 2648 {
 2649 
 2650         __asm __volatile("andl %1,%0"
 2651             : "=m" (pcb->pcb_flags) : "ir" (~flags), "m" (pcb->pcb_flags)
 2652             : "cc", "memory");
 2653 }
 2654 
 2655 #ifdef KDB
 2656 
 2657 /*
 2658  * Provide inb() and outb() as functions.  They are normally only available
 2659  * as inline functions, and thus cannot be called from the debugger.
 2660  */
 2661 
 2662 /* silence compiler warnings */
 2663 u_char inb_(u_short);
 2664 void outb_(u_short, u_char);
 2665 
 2666 u_char
 2667 inb_(u_short port)
 2668 {
 2669         return (inb(port));
 2670 }
 2671 
 2672 void
 2673 outb_(u_short port, u_char data)
 2674 {
 2675         outb(port, data);
 2676 }
 2677 
 2678 #endif /* KDB */
