The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/amd64/amd64/machdep.c

Version: -  FREEBSD  -  FREEBSD-12-STABLE  -  FREEBSD-12-0  -  FREEBSD-11-STABLE  -  FREEBSD-11-2  -  FREEBSD-11-1  -  FREEBSD-11-0  -  FREEBSD-10-STABLE  -  FREEBSD-10-4  -  FREEBSD-10-3  -  FREEBSD-10-2  -  FREEBSD-10-1  -  FREEBSD-10-0  -  FREEBSD-9-STABLE  -  FREEBSD-9-3  -  FREEBSD-9-2  -  FREEBSD-9-1  -  FREEBSD-9-0  -  FREEBSD-8-STABLE  -  FREEBSD-8-4  -  FREEBSD-8-3  -  FREEBSD-8-2  -  FREEBSD-8-1  -  FREEBSD-8-0  -  FREEBSD-7-STABLE  -  FREEBSD-7-4  -  FREEBSD-7-3  -  FREEBSD-7-2  -  FREEBSD-7-1  -  FREEBSD-7-0  -  FREEBSD-6-STABLE  -  FREEBSD-6-4  -  FREEBSD-6-3  -  FREEBSD-6-2  -  FREEBSD-6-1  -  FREEBSD-6-0  -  FREEBSD-5-STABLE  -  FREEBSD-5-5  -  FREEBSD-5-4  -  FREEBSD-5-3  -  FREEBSD-5-2  -  FREEBSD-5-1  -  FREEBSD-5-0  -  FREEBSD-4-STABLE  -  FREEBSD-3-STABLE  -  FREEBSD22  -  linux-2.6  -  linux-2.4.22  -  MK83  -  MK84  -  PLAN9  -  DFBSD  -  NETBSD  -  NETBSD5  -  NETBSD4  -  NETBSD3  -  NETBSD20  -  OPENBSD  -  xnu-517  -  xnu-792  -  xnu-792.6.70  -  xnu-1228  -  xnu-1456.1.26  -  xnu-1699.24.8  -  xnu-2050.18.24  -  OPENSOLARIS  -  minix-3-1-1 
SearchContext: -  none  -  3  -  10 

    1 /*-
    2  * Copyright (c) 2003 Peter Wemm.
    3  * Copyright (c) 1992 Terrence R. Lambert.
    4  * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
    5  * All rights reserved.
    6  *
    7  * This code is derived from software contributed to Berkeley by
    8  * William Jolitz.
    9  *
   10  * Redistribution and use in source and binary forms, with or without
   11  * modification, are permitted provided that the following conditions
   12  * are met:
   13  * 1. Redistributions of source code must retain the above copyright
   14  *    notice, this list of conditions and the following disclaimer.
   15  * 2. Redistributions in binary form must reproduce the above copyright
   16  *    notice, this list of conditions and the following disclaimer in the
   17  *    documentation and/or other materials provided with the distribution.
   18  * 3. All advertising materials mentioning features or use of this software
   19  *    must display the following acknowledgement:
   20  *      This product includes software developed by the University of
   21  *      California, Berkeley and its contributors.
   22  * 4. Neither the name of the University nor the names of its contributors
   23  *    may be used to endorse or promote products derived from this software
   24  *    without specific prior written permission.
   25  *
   26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   36  * SUCH DAMAGE.
   37  *
   38  *      from: @(#)machdep.c     7.4 (Berkeley) 6/3/91
   39  */
   40 
   41 #include <sys/cdefs.h>
   42 __FBSDID("$FreeBSD: stable/11/sys/amd64/amd64/machdep.c 341491 2018-12-04 19:07:10Z markj $");
   43 
   44 #include "opt_atpic.h"
   45 #include "opt_compat.h"
   46 #include "opt_cpu.h"
   47 #include "opt_ddb.h"
   48 #include "opt_inet.h"
   49 #include "opt_isa.h"
   50 #include "opt_kstack_pages.h"
   51 #include "opt_maxmem.h"
   52 #include "opt_mp_watchdog.h"
   53 #include "opt_perfmon.h"
   54 #include "opt_platform.h"
   55 #include "opt_sched.h"
   56 
   57 #include <sys/param.h>
   58 #include <sys/proc.h>
   59 #include <sys/systm.h>
   60 #include <sys/bio.h>
   61 #include <sys/buf.h>
   62 #include <sys/bus.h>
   63 #include <sys/callout.h>
   64 #include <sys/cons.h>
   65 #include <sys/cpu.h>
   66 #include <sys/efi.h>
   67 #include <sys/eventhandler.h>
   68 #include <sys/exec.h>
   69 #include <sys/imgact.h>
   70 #include <sys/kdb.h>
   71 #include <sys/kernel.h>
   72 #include <sys/ktr.h>
   73 #include <sys/linker.h>
   74 #include <sys/lock.h>
   75 #include <sys/malloc.h>
   76 #include <sys/memrange.h>
   77 #include <sys/msgbuf.h>
   78 #include <sys/mutex.h>
   79 #include <sys/pcpu.h>
   80 #include <sys/ptrace.h>
   81 #include <sys/reboot.h>
   82 #include <sys/rwlock.h>
   83 #include <sys/sched.h>
   84 #include <sys/signalvar.h>
   85 #ifdef SMP
   86 #include <sys/smp.h>
   87 #endif
   88 #include <sys/syscallsubr.h>
   89 #include <sys/sysctl.h>
   90 #include <sys/sysent.h>
   91 #include <sys/sysproto.h>
   92 #include <sys/ucontext.h>
   93 #include <sys/vmmeter.h>
   94 
   95 #include <vm/vm.h>
   96 #include <vm/vm_extern.h>
   97 #include <vm/vm_kern.h>
   98 #include <vm/vm_page.h>
   99 #include <vm/vm_map.h>
  100 #include <vm/vm_object.h>
  101 #include <vm/vm_pager.h>
  102 #include <vm/vm_param.h>
  103 #include <vm/vm_phys.h>
  104 
  105 #ifdef DDB
  106 #ifndef KDB
  107 #error KDB must be enabled in order for DDB to work!
  108 #endif
  109 #include <ddb/ddb.h>
  110 #include <ddb/db_sym.h>
  111 #endif
  112 
  113 #include <net/netisr.h>
  114 
  115 #include <machine/clock.h>
  116 #include <machine/cpu.h>
  117 #include <machine/cputypes.h>
  118 #include <machine/frame.h>
  119 #include <machine/intr_machdep.h>
  120 #include <x86/mca.h>
  121 #include <machine/md_var.h>
  122 #include <machine/metadata.h>
  123 #include <machine/mp_watchdog.h>
  124 #include <machine/pc/bios.h>
  125 #include <machine/pcb.h>
  126 #include <machine/proc.h>
  127 #include <machine/reg.h>
  128 #include <machine/sigframe.h>
  129 #include <machine/specialreg.h>
  130 #ifdef PERFMON
  131 #include <machine/perfmon.h>
  132 #endif
  133 #include <machine/tss.h>
  134 #ifdef SMP
  135 #include <machine/smp.h>
  136 #endif
  137 #ifdef FDT
  138 #include <x86/fdt.h>
  139 #endif
  140 
  141 #ifdef DEV_ATPIC
  142 #include <x86/isa/icu.h>
  143 #else
  144 #include <x86/apicvar.h>
  145 #endif
  146 
  147 #include <isa/isareg.h>
  148 #include <isa/rtc.h>
  149 #include <x86/init.h>
  150 
   151 /* Sanity check for __curthread() */
   152 CTASSERT(offsetof(struct pcpu, pc_curthread) == 0);
   153 
   154 /*
   155  * The PTI trampoline stack needs enough space for a hardware trapframe and a
   156  * couple of scratch registers, as well as the trapframe left behind after an
   157  * iret fault.
   158  */
   159 CTASSERT(PC_PTI_STACK_SZ * sizeof(register_t) >= 2 * sizeof(struct pti_frame) -
   160     offsetof(struct pti_frame, pti_rip));
   161 
       /* Early machine-dependent boot entry (definition not visible in this chunk). */
   162 extern u_int64_t hammer_time(u_int64_t, u_int64_t);
   163 
       /*
        * Validation of user-supplied register state (used by sys_sigreturn()):
        * CS_SECURE accepts only user-privilege %cs selectors; EFL_SECURE accepts
        * a new rflags value only if it differs from the old value solely in
        * user-changeable bits.
        */
   164 #define CS_SECURE(cs)           (ISPL(cs) == SEL_UPL)
   165 #define EFL_SECURE(ef, oef)     ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
   166 
   167 static void cpu_startup(void *);
   168 static void get_fpcontext(struct thread *td, mcontext_t *mcp,
   169     char *xfpusave, size_t xfpusave_len);
   170 static int  set_fpcontext(struct thread *td, mcontext_t *mcp,
   171     char *xfpustate, size_t xfpustate_len);
       /* Run cpu_startup() first within the SI_SUB_CPU stage of boot. */
   172 SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL);
   173 
   174 /* Preload data parse function */
   175 static caddr_t native_parse_preload_data(u_int64_t);
   176 
   177 /* Native function to fetch and parse the e820 map */
   178 static void native_parse_memmap(caddr_t, vm_paddr_t *, int *);
   179 
   180 /* Default init_ops implementation. */
   181 struct init_ops init_ops = {
   182         .parse_preload_data =   native_parse_preload_data,
   183         .early_clock_source_init =      i8254_init,
   184         .early_delay =                  i8254_delay,
   185         .parse_memmap =                 native_parse_memmap,
   186 #ifdef SMP
   187         .mp_bootaddress =               mp_bootaddress,
   188         .start_all_aps =                native_start_all_aps,
   189 #endif
   190         .msi_init =                     msi_init,
   191 };
   192 
   193 struct msgbuf *msgbufp;
   194 
   195 /*
   196  * Physical address of the EFI System Table. Stashed from the metadata hints
   197  * passed into the kernel and used by the EFI code to call runtime services.
   198  */
   199 vm_paddr_t efi_systbl_phys;
   200 
   201 /* Intel ICH registers */
   202 #define ICH_PMBASE      0x400
   203 #define ICH_SMI_EN      ICH_PMBASE + 0x30
   204 
       /*
        * User-mode segment selector values, stored into new trapframes by
        * exec_setregs() and sendsig().  NOTE(review): initialized elsewhere,
        * presumably during GDT setup — not visible in this chunk.
        */
   205 int     _udatasel, _ucodesel, _ucode32sel, _ufssel, _ugssel;
   206 
       /* Nonzero while the system is still booting (cleared outside this chunk). */
   207 int cold = 1;
   208 
       /*
        * Physical memory accounting, in pages: Maxmem is the top of usable
        * physical memory (converted with ptoa() in cpu_startup()); realmem is
        * set by cpu_startup() from the SMBIOS-reported/derived memory size.
        */
   209 long Maxmem = 0;
   210 long realmem = 0;
   211 
   212 /*
   213  * The number of PHYSMAP entries must be one less than the number of
   214  * PHYSSEG entries because the PHYSMAP entry that spans the largest
   215  * physical address that is accessible by ISA DMA is split into two
   216  * PHYSSEG entries.
   217  */
   218 #define PHYSMAP_SIZE    (2 * (VM_PHYSSEG_MAX - 1))
   219 
       /* Start/end physical-address pairs; a 0,0 pair terminates each list. */
   220 vm_paddr_t phys_avail[PHYSMAP_SIZE + 2];
   221 vm_paddr_t dump_avail[PHYSMAP_SIZE + 2];
   222 
   223 /* must be 2 less so 0 0 can signal end of chunks */
   224 #define PHYS_AVAIL_ARRAY_END (nitems(phys_avail) - 2)
   225 #define DUMP_AVAIL_ARRAY_END (nitems(dump_avail) - 2)
   226 
   227 struct kva_md_info kmi;
   228 
   229 static struct trapframe proc0_tf;
       /* Pseudo-descriptors loaded with lgdt/lidt (set up outside this chunk). */
   230 struct region_descriptor r_gdt, r_idt;
   231 
       /* Per-CPU data; pc_curthread must stay at offset 0 (see CTASSERT above). */
   232 struct pcpu __pcpu[MAXCPU];
   233 
   234 struct mtx icu_lock;
   235 
   236 struct mem_range_softc mem_range_softc;
   237 
   238 struct mtx dt_lock;     /* lock for GDT and LDT */
   239 
       /*
        * Resume hook pointer.  NOTE(review): presumably installed by a
        * hypervisor module (vmm) — the setter is not visible in this chunk.
        */
   240 void (*vmm_resume_p)(void);
       /*
        * SYSINIT hook (SI_SUB_CPU, first): apply the MacBook SMI quirk, start
        * the real-time clock, print CPU and physical-memory information,
        * initialize the kernel VM submaps and the buffer cache, and finally
        * set the CPU control registers via cpu_setregs().
        */
   242 static void
   243 cpu_startup(dummy)
   244         void *dummy;
   245 {
   246         uintmax_t memsize;
   247         char *sysenv;
   248 
   249         /*
   250          * On MacBooks, we need to disallow the legacy USB circuit to
   251          * generate an SMI# because this can cause several problems,
   252          * namely: incorrect CPU frequency detection and failure to
   253          * start the APs.
   254          * We do this by disabling a bit in the SMI_EN (SMI Control and
   255          * Enable register) of the Intel ICH LPC Interface Bridge. 
   256          */
   257         sysenv = kern_getenv("smbios.system.product");
   258         if (sysenv != NULL) {
   259                 if (strncmp(sysenv, "MacBook1,1", 10) == 0 ||
   260                     strncmp(sysenv, "MacBook3,1", 10) == 0 ||
   261                     strncmp(sysenv, "MacBook4,1", 10) == 0 ||
   262                     strncmp(sysenv, "MacBookPro1,1", 13) == 0 ||
   263                     strncmp(sysenv, "MacBookPro1,2", 13) == 0 ||
   264                     strncmp(sysenv, "MacBookPro3,1", 13) == 0 ||
   265                     strncmp(sysenv, "MacBookPro4,1", 13) == 0 ||
   266                     strncmp(sysenv, "Macmini1,1", 10) == 0) {
   267                         if (bootverbose)
   268                                 printf("Disabling LEGACY_USB_EN bit on "
   269                                     "Intel ICH.\n");
                               /* Clear bit 3 (LEGACY_USB_EN) of SMI_EN. */
   270                         outl(ICH_SMI_EN, inl(ICH_SMI_EN) & ~0x8);
   271                 }
   272                 freeenv(sysenv);
   273         }
   274 
   275         /*
   276          * Good {morning,afternoon,evening,night}.
   277          */
   278         startrtclock();
   279         printcpuinfo();
   280 #ifdef PERFMON
   281         perfmon_init();
   282 #endif
   283 
   284         /*
   285          * Display physical memory if SMBIOS reports reasonable amount.
   286          */
   287         memsize = 0;
   288         sysenv = kern_getenv("smbios.memory.enabled");
   289         if (sysenv != NULL) {
                       /* SMBIOS reports KiB; shift by 10 to get bytes. */
   290                 memsize = (uintmax_t)strtoul(sysenv, (char **)NULL, 10) << 10;
   291                 freeenv(sysenv);
   292         }
               /*
                * Distrust an SMBIOS value smaller than the current free page
                * count and fall back to the Maxmem-derived size instead.
                */
   293         if (memsize < ptoa((uintmax_t)vm_cnt.v_free_count))
   294                 memsize = ptoa((uintmax_t)Maxmem);
   295         printf("real memory  = %ju (%ju MB)\n", memsize, memsize >> 20);
   296         realmem = atop(memsize);
   297 
   298         /*
   299          * Display any holes after the first chunk of extended memory.
   300          */
   301         if (bootverbose) {
   302                 int indx;
   303 
   304                 printf("Physical memory chunk(s):\n");
   305                 for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) {
   306                         vm_paddr_t size;
   307 
   308                         size = phys_avail[indx + 1] - phys_avail[indx];
   309                         printf(
   310                             "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n",
   311                             (uintmax_t)phys_avail[indx],
   312                             (uintmax_t)phys_avail[indx + 1] - 1,
   313                             (uintmax_t)size, (uintmax_t)size / PAGE_SIZE);
   314                 }
   315         }
   316 
   317         vm_ksubmap_init(&kmi);
   318 
   319         printf("avail memory = %ju (%ju MB)\n",
   320             ptoa((uintmax_t)vm_cnt.v_free_count),
   321             ptoa((uintmax_t)vm_cnt.v_free_count) / 1048576);
   322 
   323         /*
   324          * Set up buffers, so they can be used to read disk labels.
   325          */
   326         bufinit();
   327         vm_pager_bufferinit();
   328 
   329         cpu_setregs();
   330 }
  331 
   332 /*
   333  * Send an interrupt to process.
   334  *
   335  * Stack is set up to allow sigcode stored
   336  * at top to call routine, followed by call
   337  * to sigreturn routine below.  After sigreturn
   338  * resets the signal mask, the stack, and the
   339  * frame pointer, it returns to the user
   340  * specified pc, psl.
   341  */
   342 void
   343 sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
   344 {
   345         struct sigframe sf, *sfp;
   346         struct pcb *pcb;
   347         struct proc *p;
   348         struct thread *td;
   349         struct sigacts *psp;
   350         char *sp;
   351         struct trapframe *regs;
   352         char *xfpusave;
   353         size_t xfpusave_len;
   354         int sig;
   355         int oonstack;
   356 
   357         td = curthread;
   358         pcb = td->td_pcb;
   359         p = td->td_proc;
   360         PROC_LOCK_ASSERT(p, MA_OWNED);
   361         sig = ksi->ksi_signo;
   362         psp = p->p_sigacts;
   363         mtx_assert(&psp->ps_mtx, MA_OWNED);
   364         regs = td->td_frame;
   365         oonstack = sigonstack(regs->tf_rsp);
   366 
               /*
                * If XSAVE is in use, reserve kernel stack space for the
                * extended FPU state beyond the legacy struct savefpu.
                */
   367         if (cpu_max_ext_state_size > sizeof(struct savefpu) && use_xsave) {
   368                 xfpusave_len = cpu_max_ext_state_size - sizeof(struct savefpu);
   369                 xfpusave = __builtin_alloca(xfpusave_len);
   370         } else {
   371                 xfpusave_len = 0;
   372                 xfpusave = NULL;
   373         }
   374 
   375         /* Save user context. */
   376         bzero(&sf, sizeof(sf));
   377         sf.sf_uc.uc_sigmask = *mask;
   378         sf.sf_uc.uc_stack = td->td_sigstk;
   379         sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
   380             ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE;
   381         sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0;
               /* mc_rdi starts the trapframe-layout region of the mcontext. */
   382         bcopy(regs, &sf.sf_uc.uc_mcontext.mc_rdi, sizeof(*regs));
   383         sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext); /* magic */
   384         get_fpcontext(td, &sf.sf_uc.uc_mcontext, xfpusave, xfpusave_len);
   385         fpstate_drop(td);
   386         update_pcb_bases(pcb);
   387         sf.sf_uc.uc_mcontext.mc_fsbase = pcb->pcb_fsbase;
   388         sf.sf_uc.uc_mcontext.mc_gsbase = pcb->pcb_gsbase;
   389         bzero(sf.sf_uc.uc_mcontext.mc_spare,
   390             sizeof(sf.sf_uc.uc_mcontext.mc_spare));
   391         bzero(sf.sf_uc.__spare__, sizeof(sf.sf_uc.__spare__));
   392 
   393         /* Allocate space for the signal handler context. */
   394         if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack &&
   395             SIGISMEMBER(psp->ps_sigonstack, sig)) {
   396                 sp = (char *)td->td_sigstk.ss_sp + td->td_sigstk.ss_size;
   397 #if defined(COMPAT_43)
   398                 td->td_sigstk.ss_flags |= SS_ONSTACK;
   399 #endif
   400         } else
                       /* Skip the 128-byte amd64 ABI red zone below %rsp. */
   401                 sp = (char *)regs->tf_rsp - 128;
               /*
                * Carve out and 64-byte-align the user-stack copy of the
                * extended FPU state, recording its address in the mcontext.
                */
   402         if (xfpusave != NULL) {
   403                 sp -= xfpusave_len;
   404                 sp = (char *)((unsigned long)sp & ~0x3Ful);
   405                 sf.sf_uc.uc_mcontext.mc_xfpustate = (register_t)sp;
   406         }
   407         sp -= sizeof(struct sigframe);
   408         /* Align to 16 bytes. */
   409         sfp = (struct sigframe *)((unsigned long)sp & ~0xFul);
   410 
   411         /* Build the argument list for the signal handler. */
   412         regs->tf_rdi = sig;                     /* arg 1 in %rdi */
   413         regs->tf_rdx = (register_t)&sfp->sf_uc; /* arg 3 in %rdx */
   414         bzero(&sf.sf_si, sizeof(sf.sf_si));
   415         if (SIGISMEMBER(psp->ps_siginfo, sig)) {
   416                 /* Signal handler installed with SA_SIGINFO. */
   417                 regs->tf_rsi = (register_t)&sfp->sf_si; /* arg 2 in %rsi */
   418                 sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher;
   419 
   420                 /* Fill in POSIX parts */
   421                 sf.sf_si = ksi->ksi_info;
   422                 sf.sf_si.si_signo = sig; /* maybe a translated signal */
   423                 regs->tf_rcx = (register_t)ksi->ksi_addr; /* arg 4 in %rcx */
   424         } else {
   425                 /* Old FreeBSD-style arguments. */
   426                 regs->tf_rsi = ksi->ksi_code;   /* arg 2 in %rsi */
   427                 regs->tf_rcx = (register_t)ksi->ksi_addr; /* arg 4 in %rcx */
   428                 sf.sf_ahu.sf_handler = catcher;
   429         }
               /* Drop the locks around the user-memory copyout below. */
   430         mtx_unlock(&psp->ps_mtx);
   431         PROC_UNLOCK(p);
   432 
   433         /*
   434          * Copy the sigframe out to the user's stack.
   435          */
   436         if (copyout(&sf, sfp, sizeof(*sfp)) != 0 ||
   437             (xfpusave != NULL && copyout(xfpusave,
   438             (void *)sf.sf_uc.uc_mcontext.mc_xfpustate, xfpusave_len)
   439             != 0)) {
   440 #ifdef DEBUG
   441                 printf("process %ld has trashed its stack\n", (long)p->p_pid);
   442 #endif
                       /* Unwritable signal stack is fatal to the process. */
   443                 PROC_LOCK(p);
   444                 sigexit(td, SIGILL);
   445         }
   446 
               /* Redirect the thread to the signal trampoline. */
   447         regs->tf_rsp = (long)sfp;
   448         regs->tf_rip = p->p_sysent->sv_sigcode_base;
   449         regs->tf_rflags &= ~(PSL_T | PSL_D);
   450         regs->tf_cs = _ucodesel;
   451         regs->tf_ds = _udatasel;
   452         regs->tf_ss = _udatasel;
   453         regs->tf_es = _udatasel;
   454         regs->tf_fs = _ufssel;
   455         regs->tf_gs = _ugssel;
   456         regs->tf_flags = TF_HASSEGS;
               /* Re-take the locks the caller expects us to hold on return. */
   457         PROC_LOCK(p);
   458         mtx_lock(&psp->ps_mtx);
   459 }
  460 
   461 /*
   462  * System call to cleanup state after a signal
   463  * has been taken.  Reset signal mask and
   464  * stack state from context left by sendsig (above).
   465  * Return to previous pc and psl as specified by
   466  * context left by sendsig. Check carefully to
   467  * make sure that the user has not modified the
   468  * state to gain improper privileges.
   469  *
   470  * MPSAFE
   471  */
   472 int
   473 sys_sigreturn(td, uap)
   474         struct thread *td;
   475         struct sigreturn_args /* {
   476                 const struct __ucontext *sigcntxp;
   477         } */ *uap;
   478 {
   479         ucontext_t uc;
   480         struct pcb *pcb;
   481         struct proc *p;
   482         struct trapframe *regs;
   483         ucontext_t *ucp;
   484         char *xfpustate;
   485         size_t xfpustate_len;
   486         long rflags;
   487         int cs, error, ret;
   488         ksiginfo_t ksi;
   489 
   490         pcb = td->td_pcb;
   491         p = td->td_proc;
   492 
               /* Pull the whole user-supplied ucontext into kernel memory. */
   493         error = copyin(uap->sigcntxp, &uc, sizeof(uc));
   494         if (error != 0) {
   495                 uprintf("pid %d (%s): sigreturn copyin failed\n",
   496                     p->p_pid, td->td_name);
   497                 return (error);
   498         }
   499         ucp = &uc;
               /* Reject mcontext flag bits we do not understand. */
   500         if ((ucp->uc_mcontext.mc_flags & ~_MC_FLAG_MASK) != 0) {
   501                 uprintf("pid %d (%s): sigreturn mc_flags %x\n", p->p_pid,
   502                     td->td_name, ucp->uc_mcontext.mc_flags);
   503                 return (EINVAL);
   504         }
   505         regs = td->td_frame;
   506         rflags = ucp->uc_mcontext.mc_rflags;
   507         /*
   508          * Don't allow users to change privileged or reserved flags.
   509          */
   510         if (!EFL_SECURE(rflags, regs->tf_rflags)) {
   511                 uprintf("pid %d (%s): sigreturn rflags = 0x%lx\n", p->p_pid,
   512                     td->td_name, rflags);
   513                 return (EINVAL);
   514         }
   515 
   516         /*
   517          * Don't allow users to load a valid privileged %cs.  Let the
   518          * hardware check for invalid selectors, excess privilege in
   519          * other selectors, invalid %eip's and invalid %esp's.
   520          */
   521         cs = ucp->uc_mcontext.mc_cs;
   522         if (!CS_SECURE(cs)) {
   523                 uprintf("pid %d (%s): sigreturn cs = 0x%x\n", p->p_pid,
   524                     td->td_name, cs);
                       /* Deliver SIGBUS as if a protection fault had occurred. */
   525                 ksiginfo_init_trap(&ksi);
   526                 ksi.ksi_signo = SIGBUS;
   527                 ksi.ksi_code = BUS_OBJERR;
   528                 ksi.ksi_trapno = T_PROTFLT;
   529                 ksi.ksi_addr = (void *)regs->tf_rip;
   530                 trapsignal(td, &ksi);
   531                 return (EINVAL);
   532         }
   533 
               /*
                * Optionally copy in the extended FPU state, bounding the
                * user-supplied length by the CPU's maximum extended-state size.
                */
   534         if ((uc.uc_mcontext.mc_flags & _MC_HASFPXSTATE) != 0) {
   535                 xfpustate_len = uc.uc_mcontext.mc_xfpustate_len;
   536                 if (xfpustate_len > cpu_max_ext_state_size -
   537                     sizeof(struct savefpu)) {
   538                         uprintf("pid %d (%s): sigreturn xfpusave_len = 0x%zx\n",
   539                             p->p_pid, td->td_name, xfpustate_len);
   540                         return (EINVAL);
   541                 }
   542                 xfpustate = __builtin_alloca(xfpustate_len);
   543                 error = copyin((const void *)uc.uc_mcontext.mc_xfpustate,
   544                     xfpustate, xfpustate_len);
   545                 if (error != 0) {
   546                         uprintf(
   547         "pid %d (%s): sigreturn copying xfpustate failed\n",
   548                             p->p_pid, td->td_name);
   549                         return (error);
   550                 }
   551         } else {
   552                 xfpustate = NULL;
   553                 xfpustate_len = 0;
   554         }
   555         ret = set_fpcontext(td, &ucp->uc_mcontext, xfpustate, xfpustate_len);
   556         if (ret != 0) {
   557                 uprintf("pid %d (%s): sigreturn set_fpcontext err %d\n",
   558                     p->p_pid, td->td_name, ret);
   559                 return (ret);
   560         }
               /* mc_rdi begins the trapframe-layout region (see sendsig()). */
   561         bcopy(&ucp->uc_mcontext.mc_rdi, regs, sizeof(*regs));
   562         update_pcb_bases(pcb);
   563         pcb->pcb_fsbase = ucp->uc_mcontext.mc_fsbase;
   564         pcb->pcb_gsbase = ucp->uc_mcontext.mc_gsbase;
   565 
   566 #if defined(COMPAT_43)
   567         if (ucp->uc_mcontext.mc_onstack & 1)
   568                 td->td_sigstk.ss_flags |= SS_ONSTACK;
   569         else
   570                 td->td_sigstk.ss_flags &= ~SS_ONSTACK;
   571 #endif
   572 
   573         kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0);
               /* EJUSTRETURN: leave the freshly restored register state alone. */
   574         return (EJUSTRETURN);
   575 }
  576 
  577 #ifdef COMPAT_FREEBSD4
  578 int
  579 freebsd4_sigreturn(struct thread *td, struct freebsd4_sigreturn_args *uap)
  580 {
  581  
  582         return sys_sigreturn(td, (struct sigreturn_args *)uap);
  583 }
  584 #endif
  585 
   586 /*
   587  * Reset registers to default values on exec.
   588  */
   589 void
   590 exec_setregs(struct thread *td, struct image_params *imgp, u_long stack)
   591 {
   592         struct trapframe *regs;
   593         struct pcb *pcb;
   594         register_t saved_rflags;
   595 
   596         regs = td->td_frame;
   597         pcb = td->td_pcb;
   598 
               /*
                * Tear down any per-process LDT.  NOTE(review): user_ldt_free()
                * appears to be responsible for dropping dt_lock — note the
                * explicit unlock only on the else path; confirm in sys_machdep.c.
                */
   599         mtx_lock(&dt_lock);
   600         if (td->td_proc->p_md.md_ldt != NULL)
   601                 user_ldt_free(td);
   602         else
   603                 mtx_unlock(&dt_lock);
   604         
   605         update_pcb_bases(pcb);
   606         pcb->pcb_fsbase = 0;
   607         pcb->pcb_gsbase = 0;
   608         clear_pcb_flags(pcb, PCB_32BIT);
   609         pcb->pcb_initial_fpucw = __INITIAL_FPUCW__;
   610 
               /* Carry only the trace flag (PSL_T) over into the new image. */
   611         saved_rflags = regs->tf_rflags & PSL_T;
   612         bzero((char *)regs, sizeof(struct trapframe));
   613         regs->tf_rip = imgp->entry_addr;
               /*
                * Set %rsp congruent to 8 mod 16, as if a return address had
                * just been pushed onto a 16-byte-aligned stack.
                */
   614         regs->tf_rsp = ((stack - 8) & ~0xFul) + 8;
   615         regs->tf_rdi = stack;           /* argv */
   616         regs->tf_rflags = PSL_USER | saved_rflags;
   617         regs->tf_ss = _udatasel;
   618         regs->tf_cs = _ucodesel;
   619         regs->tf_ds = _udatasel;
   620         regs->tf_es = _udatasel;
   621         regs->tf_fs = _ufssel;
   622         regs->tf_gs = _ugssel;
   623         regs->tf_flags = TF_HASSEGS;
   624         td->td_retval[1] = 0;
   625 
   626         /*
   627          * Reset the hardware debug registers if they were in use.
   628          * They won't have any meaning for the newly exec'd process.
   629          */
   630         if (pcb->pcb_flags & PCB_DBREGS) {
   631                 pcb->pcb_dr0 = 0;
   632                 pcb->pcb_dr1 = 0;
   633                 pcb->pcb_dr2 = 0;
   634                 pcb->pcb_dr3 = 0;
   635                 pcb->pcb_dr6 = 0;
   636                 pcb->pcb_dr7 = 0;
   637                 if (pcb == curpcb) {
   638                         /*
   639                          * Clear the debug registers on the running
   640                          * CPU, otherwise they will end up affecting
   641                          * the next process we switch to.
   642                          */
   643                         reset_dbregs();
   644                 }
   645                 clear_pcb_flags(pcb, PCB_DBREGS);
   646         }
   647 
   648         /*
   649          * Drop the FP state if we hold it, so that the process gets a
   650          * clean FP state if it uses the FPU again.
   651          */
   652         fpstate_drop(td);
   653 }
  654 
  655 void
  656 cpu_setregs(void)
  657 {
  658         register_t cr0;
  659 
  660         cr0 = rcr0();
  661         /*
  662          * CR0_MP, CR0_NE and CR0_TS are also set by npx_probe() for the
  663          * BSP.  See the comments there about why we set them.
  664          */
  665         cr0 |= CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM;
  666         load_cr0(cr0);
  667 }
  668 
   669 /*
   670  * Initialize amd64 and configure to run kernel
   671  */
   672 
   673 /*
   674  * Initialize segments & interrupt table
   675  */
   676 
   677 struct user_segment_descriptor gdt[NGDT * MAXCPU];/* global descriptor tables */
   678 static struct gate_descriptor idt0[NIDT];
   679 struct gate_descriptor *idt = &idt0[0]; /* interrupt descriptor table */
   680 
       /*
        * Dedicated exception stacks for the BSP: double fault, machine check,
        * NMI and debug.  NOTE(review): presumably wired into the TSS IST slots
        * (cf. setidt()'s ist argument) — the wiring is not visible in this chunk.
        */
   681 static char dblfault_stack[PAGE_SIZE] __aligned(16);
   682 static char mce0_stack[PAGE_SIZE] __aligned(16);
   683 static char nmi0_stack[PAGE_SIZE] __aligned(16);
   684 static char dbg0_stack[PAGE_SIZE] __aligned(16);
   685 CTASSERT(sizeof(struct nmi_pcpu) == 16);
   686 
       /* One hardware task-state segment per CPU. */
   687 struct amd64tss common_tss[MAXCPU];
  688 
   689 /*
   690  * Software prototypes -- in more palatable form.
   691  *
   692  * Keep GUFS32, GUGS32, GUCODE32 and GUDATA at the same
   693  * slots as corresponding segments for i386 kernel.
   694  */
   695 struct soft_segment_descriptor gdt_segs[] = {
   696 /* GNULL_SEL    0 Null Descriptor */
   697 {       .ssd_base = 0x0,
   698         .ssd_limit = 0x0,
   699         .ssd_type = 0,
   700         .ssd_dpl = 0,
   701         .ssd_p = 0,
   702         .ssd_long = 0,
   703         .ssd_def32 = 0,
   704         .ssd_gran = 0           },
   705 /* GNULL2_SEL   1 Null Descriptor */
   706 {       .ssd_base = 0x0,
   707         .ssd_limit = 0x0,
   708         .ssd_type = 0,
   709         .ssd_dpl = 0,
   710         .ssd_p = 0,
   711         .ssd_long = 0,
   712         .ssd_def32 = 0,
   713         .ssd_gran = 0           },
       /*
        * NOTE(review): the %gs/%fs mentions in the next two slot comments look
        * swapped relative to the GUFS32/GUGS32 names — confirm against the
        * selector users before "fixing" either.
        */
   714 /* GUFS32_SEL   2 32 bit %gs Descriptor for user */
   715 {       .ssd_base = 0x0,
   716         .ssd_limit = 0xfffff,
   717         .ssd_type = SDT_MEMRWA,
   718         .ssd_dpl = SEL_UPL,
   719         .ssd_p = 1,
   720         .ssd_long = 0,
   721         .ssd_def32 = 1,
   722         .ssd_gran = 1           },
   723 /* GUGS32_SEL   3 32 bit %fs Descriptor for user */
   724 {       .ssd_base = 0x0,
   725         .ssd_limit = 0xfffff,
   726         .ssd_type = SDT_MEMRWA,
   727         .ssd_dpl = SEL_UPL,
   728         .ssd_p = 1,
   729         .ssd_long = 0,
   730         .ssd_def32 = 1,
   731         .ssd_gran = 1           },
   732 /* GCODE_SEL    4 Code Descriptor for kernel */
   733 {       .ssd_base = 0x0,
   734         .ssd_limit = 0xfffff,
   735         .ssd_type = SDT_MEMERA,
   736         .ssd_dpl = SEL_KPL,
   737         .ssd_p = 1,
   738         .ssd_long = 1,
   739         .ssd_def32 = 0,
   740         .ssd_gran = 1           },
   741 /* GDATA_SEL    5 Data Descriptor for kernel */
   742 {       .ssd_base = 0x0,
   743         .ssd_limit = 0xfffff,
   744         .ssd_type = SDT_MEMRWA,
   745         .ssd_dpl = SEL_KPL,
   746         .ssd_p = 1,
   747         .ssd_long = 1,
   748         .ssd_def32 = 0,
   749         .ssd_gran = 1           },
   750 /* GUCODE32_SEL 6 32 bit Code Descriptor for user */
   751 {       .ssd_base = 0x0,
   752         .ssd_limit = 0xfffff,
   753         .ssd_type = SDT_MEMERA,
   754         .ssd_dpl = SEL_UPL,
   755         .ssd_p = 1,
   756         .ssd_long = 0,
   757         .ssd_def32 = 1,
   758         .ssd_gran = 1           },
   759 /* GUDATA_SEL   7 32/64 bit Data Descriptor for user */
   760 {       .ssd_base = 0x0,
   761         .ssd_limit = 0xfffff,
   762         .ssd_type = SDT_MEMRWA,
   763         .ssd_dpl = SEL_UPL,
   764         .ssd_p = 1,
   765         .ssd_long = 0,
   766         .ssd_def32 = 1,
   767         .ssd_gran = 1           },
   768 /* GUCODE_SEL   8 64 bit Code Descriptor for user */
   769 {       .ssd_base = 0x0,
   770         .ssd_limit = 0xfffff,
   771         .ssd_type = SDT_MEMERA,
   772         .ssd_dpl = SEL_UPL,
   773         .ssd_p = 1,
   774         .ssd_long = 1,
   775         .ssd_def32 = 0,
   776         .ssd_gran = 1           },
   777 /* GPROC0_SEL   9 Proc 0 Tss Descriptor */
   778 {       .ssd_base = 0x0,
   779         .ssd_limit = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE - 1,
   780         .ssd_type = SDT_SYSTSS,
   781         .ssd_dpl = SEL_KPL,
   782         .ssd_p = 0,
   783         .ssd_long = 0,
   784         .ssd_def32 = 0,
   785         .ssd_gran = 0           },
   786 /* Actually, the TSS is a system descriptor which is double size */
       /* Slot 10: upper half of the preceding 16-byte long-mode TSS descriptor. */
   787 {       .ssd_base = 0x0,
   788         .ssd_limit = 0x0,
   789         .ssd_type = 0,
   790         .ssd_dpl = 0,
   791         .ssd_p = 0,
   792         .ssd_long = 0,
   793         .ssd_def32 = 0,
   794         .ssd_gran = 0           },
   795 /* GUSERLDT_SEL 11 LDT Descriptor */
   796 {       .ssd_base = 0x0,
   797         .ssd_limit = 0x0,
   798         .ssd_type = 0,
   799         .ssd_dpl = 0,
   800         .ssd_p = 0,
   801         .ssd_long = 0,
   802         .ssd_def32 = 0,
   803         .ssd_gran = 0           },
   804 /* GUSERLDT_SEL 12 LDT Descriptor, double size */
   805 {       .ssd_base = 0x0,
   806         .ssd_limit = 0x0,
   807         .ssd_type = 0,
   808         .ssd_dpl = 0,
   809         .ssd_p = 0,
   810         .ssd_long = 0,
   811         .ssd_def32 = 0,
   812         .ssd_gran = 0           },
   813 };
  814 
  815 void
  816 setidt(int idx, inthand_t *func, int typ, int dpl, int ist)
  817 {
  818         struct gate_descriptor *ip;
  819 
  820         ip = idt + idx;
  821         ip->gd_looffset = (uintptr_t)func;
  822         ip->gd_selector = GSEL(GCODE_SEL, SEL_KPL);
  823         ip->gd_ist = ist;
  824         ip->gd_xx = 0;
  825         ip->gd_type = typ;
  826         ip->gd_dpl = dpl;
  827         ip->gd_p = 1;
  828         ip->gd_hioffset = ((uintptr_t)func)>>16 ;
  829 }
  830 
/*
 * Low-level trap/interrupt/syscall entry points.  The *_pti variants
 * are the entry points used when page-table isolation is enabled (see
 * the 'pti' selection in amd64_conf_fast_syscall()).
 */
extern inthand_t
	IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
	IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
	IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
	IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align),
	IDTVEC(xmm), IDTVEC(dblfault),
	IDTVEC(div_pti), IDTVEC(bpt_pti),
	IDTVEC(ofl_pti), IDTVEC(bnd_pti), IDTVEC(ill_pti), IDTVEC(dna_pti),
	IDTVEC(fpusegm_pti), IDTVEC(tss_pti), IDTVEC(missing_pti),
	IDTVEC(stk_pti), IDTVEC(prot_pti), IDTVEC(page_pti),
	IDTVEC(rsvd_pti), IDTVEC(fpu_pti), IDTVEC(align_pti),
	IDTVEC(xmm_pti),
#ifdef KDTRACE_HOOKS
	IDTVEC(dtrace_ret), IDTVEC(dtrace_ret_pti),
#endif
#ifdef XENHVM
	IDTVEC(xen_intr_upcall), IDTVEC(xen_intr_upcall_pti),
#endif
	IDTVEC(fast_syscall), IDTVEC(fast_syscall32),
	IDTVEC(fast_syscall_pti);
  851 
  852 #ifdef DDB
  853 /*
  854  * Display the index and function name of any IDT entries that don't use
  855  * the default 'rsvd' entry point.
  856  */
  857 DB_SHOW_COMMAND(idt, db_show_idt)
  858 {
  859         struct gate_descriptor *ip;
  860         int idx;
  861         uintptr_t func;
  862 
  863         ip = idt;
  864         for (idx = 0; idx < NIDT && !db_pager_quit; idx++) {
  865                 func = ((long)ip->gd_hioffset << 16 | ip->gd_looffset);
  866                 if (func != (uintptr_t)&IDTVEC(rsvd)) {
  867                         db_printf("%3d\t", idx);
  868                         db_printsym(func, DB_STGY_PROC);
  869                         db_printf("\n");
  870                 }
  871                 ip++;
  872         }
  873 }
  874 
/*
 * Show privileged registers: descriptor-table bases/limits, the task
 * and LDT selectors, the control registers and a few interesting MSRs.
 */
DB_SHOW_COMMAND(sysregs, db_show_sysregs)
{
	/* Matches the in-memory layout stored by SIDT/SGDT. */
	struct {
		uint16_t limit;
		uint64_t base;
	} __packed idtr, gdtr;
	uint16_t ldt, tr;

	__asm __volatile("sidt %0" : "=m" (idtr));
	db_printf("idtr\t0x%016lx/%04x\n",
	    (u_long)idtr.base, (u_int)idtr.limit);
	__asm __volatile("sgdt %0" : "=m" (gdtr));
	db_printf("gdtr\t0x%016lx/%04x\n",
	    (u_long)gdtr.base, (u_int)gdtr.limit);
	__asm __volatile("sldt %0" : "=r" (ldt));
	db_printf("ldtr\t0x%04x\n", ldt);
	__asm __volatile("str %0" : "=r" (tr));
	db_printf("tr\t0x%04x\n", tr);
	db_printf("cr0\t0x%016lx\n", rcr0());
	db_printf("cr2\t0x%016lx\n", rcr2());
	db_printf("cr3\t0x%016lx\n", rcr3());
	db_printf("cr4\t0x%016lx\n", rcr4());
	/* XCR0 can only be read when CR4.OSXSAVE is set. */
	if (rcr4() & CR4_XSAVE)
		db_printf("xcr0\t0x%016lx\n", rxcr(0));
	db_printf("EFER\t0x%016lx\n", rdmsr(MSR_EFER));
	/* The feature-control MSR exists only with VMX or SMX support. */
	if (cpu_feature2 & (CPUID2_VMX | CPUID2_SMX))
		db_printf("FEATURES_CTL\t%016lx\n",
		    rdmsr(MSR_IA32_FEATURE_CONTROL));
	db_printf("DEBUG_CTL\t0x%016lx\n", rdmsr(MSR_DEBUGCTLMSR));
	db_printf("PAT\t0x%016lx\n", rdmsr(MSR_PAT));
	db_printf("GSBASE\t0x%016lx\n", rdmsr(MSR_GSBASE));
}
  908 
  909 DB_SHOW_COMMAND(dbregs, db_show_dbregs)
  910 {
  911 
  912         db_printf("dr0\t0x%016lx\n", rdr0());
  913         db_printf("dr1\t0x%016lx\n", rdr1());
  914         db_printf("dr2\t0x%016lx\n", rdr2());
  915         db_printf("dr3\t0x%016lx\n", rdr3());
  916         db_printf("dr6\t0x%016lx\n", rdr6());
  917         db_printf("dr7\t0x%016lx\n", rdr7());   
  918 }
  919 #endif
  920 
  921 void
  922 sdtossd(sd, ssd)
  923         struct user_segment_descriptor *sd;
  924         struct soft_segment_descriptor *ssd;
  925 {
  926 
  927         ssd->ssd_base  = (sd->sd_hibase << 24) | sd->sd_lobase;
  928         ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit;
  929         ssd->ssd_type  = sd->sd_type;
  930         ssd->ssd_dpl   = sd->sd_dpl;
  931         ssd->ssd_p     = sd->sd_p;
  932         ssd->ssd_long  = sd->sd_long;
  933         ssd->ssd_def32 = sd->sd_def32;
  934         ssd->ssd_gran  = sd->sd_gran;
  935 }
  936 
  937 void
  938 ssdtosd(ssd, sd)
  939         struct soft_segment_descriptor *ssd;
  940         struct user_segment_descriptor *sd;
  941 {
  942 
  943         sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
  944         sd->sd_hibase = (ssd->ssd_base >> 24) & 0xff;
  945         sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
  946         sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
  947         sd->sd_type  = ssd->ssd_type;
  948         sd->sd_dpl   = ssd->ssd_dpl;
  949         sd->sd_p     = ssd->ssd_p;
  950         sd->sd_long  = ssd->ssd_long;
  951         sd->sd_def32 = ssd->ssd_def32;
  952         sd->sd_gran  = ssd->ssd_gran;
  953 }
  954 
  955 void
  956 ssdtosyssd(ssd, sd)
  957         struct soft_segment_descriptor *ssd;
  958         struct system_segment_descriptor *sd;
  959 {
  960 
  961         sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
  962         sd->sd_hibase = (ssd->ssd_base >> 24) & 0xfffffffffful;
  963         sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
  964         sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
  965         sd->sd_type  = ssd->ssd_type;
  966         sd->sd_dpl   = ssd->ssd_dpl;
  967         sd->sd_p     = ssd->ssd_p;
  968         sd->sd_gran  = ssd->ssd_gran;
  969 }
  970 
  971 #if !defined(DEV_ATPIC) && defined(DEV_ISA)
  972 #include <isa/isavar.h>
  973 #include <isa/isareg.h>
  974 /*
  975  * Return a bitmap of the current interrupt requests.  This is 8259-specific
  976  * and is only suitable for use at probe time.
  977  * This is only here to pacify sio.  It is NOT FATAL if this doesn't work.
  978  * It shouldn't be here.  There should probably be an APIC centric
  979  * implementation in the apic driver code, if at all.
  980  */
  981 intrmask_t
  982 isa_irq_pending(void)
  983 {
  984         u_char irr1;
  985         u_char irr2;
  986 
  987         irr1 = inb(IO_ICU1);
  988         irr2 = inb(IO_ICU2);
  989         return ((irr2 << 8) | irr1);
  990 }
  991 #endif
  992 
/* Base ("conventional") memory in KB; set from the memory map in getmemsize(). */
u_int basemem;
  994 
/*
 * Insert the range [base, base + length) into the sorted (physmap)
 * array of base/bound pairs, coalescing with an adjacent existing
 * range when possible.  Returns 1 on success (including the cases
 * where the range is empty or dropped due to overlap) and 0 when the
 * physmap array is full, signalling the caller to stop adding entries.
 */
static int
add_physmap_entry(uint64_t base, uint64_t length, vm_paddr_t *physmap,
    int *physmap_idxp)
{
	int i, insert_idx, physmap_idx;

	physmap_idx = *physmap_idxp;

	if (length == 0)
		return (1);

	/*
	 * Find insertion point while checking for overlap.  Start off by
	 * assuming the new entry will be added to the end.
	 *
	 * NB: physmap_idx points to the next free slot.
	 */
	insert_idx = physmap_idx;
	for (i = 0; i <= physmap_idx; i += 2) {
		if (base < physmap[i + 1]) {
			/* New range ends before this one starts: insert here. */
			if (base + length <= physmap[i]) {
				insert_idx = i;
				break;
			}
			/* Ranges overlap; keep the first, drop the new one. */
			if (boothowto & RB_VERBOSE)
				printf(
		    "Overlapping memory regions, ignoring second region\n");
			return (1);
		}
	}

	/* See if we can prepend to the next entry. */
	if (insert_idx <= physmap_idx && base + length == physmap[insert_idx]) {
		physmap[insert_idx] = base;
		return (1);
	}

	/* See if we can append to the previous entry. */
	if (insert_idx > 0 && base == physmap[insert_idx - 1]) {
		physmap[insert_idx - 1] += length;
		return (1);
	}

	/* No coalescing possible; a new pair of slots is needed. */
	physmap_idx += 2;
	*physmap_idxp = physmap_idx;
	if (physmap_idx == PHYSMAP_SIZE) {
		printf(
		"Too many segments in the physical address map, giving up\n");
		return (0);
	}

	/*
	 * Move the last 'N' entries down to make room for the new
	 * entry if needed.
	 */
	for (i = (physmap_idx - 2); i > insert_idx; i -= 2) {
		physmap[i] = physmap[i - 2];
		physmap[i + 1] = physmap[i - 1];
	}

	/* Insert the new entry. */
	physmap[insert_idx] = base;
	physmap[insert_idx + 1] = base + length;
	return (1);
}
 1060 
 1061 void
 1062 bios_add_smap_entries(struct bios_smap *smapbase, u_int32_t smapsize,
 1063                       vm_paddr_t *physmap, int *physmap_idx)
 1064 {
 1065         struct bios_smap *smap, *smapend;
 1066 
 1067         smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize);
 1068 
 1069         for (smap = smapbase; smap < smapend; smap++) {
 1070                 if (boothowto & RB_VERBOSE)
 1071                         printf("SMAP type=%02x base=%016lx len=%016lx\n",
 1072                             smap->type, smap->base, smap->length);
 1073 
 1074                 if (smap->type != SMAP_TYPE_MEMORY)
 1075                         continue;
 1076 
 1077                 if (!add_physmap_entry(smap->base, smap->length, physmap,
 1078                     physmap_idx))
 1079                         break;
 1080         }
 1081 }
 1082 
/*
 * Walk the UEFI memory map supplied by the loader and add every usable
 * range to the physmap array.  With verbose boot, also pretty-print
 * each descriptor with its type name and attribute flags.
 */
static void
add_efi_map_entries(struct efi_map_header *efihdr, vm_paddr_t *physmap,
    int *physmap_idx)
{
	struct efi_md *map, *p;
	const char *type;
	size_t efisz;
	int ndesc, i;

	/* Human-readable names indexed by EFI memory descriptor type. */
	static const char *types[] = {
		"Reserved",
		"LoaderCode",
		"LoaderData",
		"BootServicesCode",
		"BootServicesData",
		"RuntimeServicesCode",
		"RuntimeServicesData",
		"ConventionalMemory",
		"UnusableMemory",
		"ACPIReclaimMemory",
		"ACPIMemoryNVS",
		"MemoryMappedIO",
		"MemoryMappedIOPortSpace",
		"PalCode",
		"PersistentMemory"
	};

	/*
	 * Memory map data provided by UEFI via the GetMemoryMap
	 * Boot Services API.  The descriptors follow the header,
	 * rounded up to a 16-byte boundary.
	 */
	efisz = (sizeof(struct efi_map_header) + 0xf) & ~0xf;
	map = (struct efi_md *)((uint8_t *)efihdr + efisz);

	/* Guard against a corrupt header before dividing by it. */
	if (efihdr->descriptor_size == 0)
		return;
	ndesc = efihdr->memory_size / efihdr->descriptor_size;

	if (boothowto & RB_VERBOSE)
		printf("%23s %12s %12s %8s %4s\n",
		    "Type", "Physical", "Virtual", "#Pages", "Attr");

	/*
	 * Firmware descriptors may be larger than struct efi_md, so
	 * advance by the reported descriptor_size, not sizeof(*p).
	 */
	for (i = 0, p = map; i < ndesc; i++,
	    p = efi_next_descriptor(p, efihdr->descriptor_size)) {
		if (boothowto & RB_VERBOSE) {
			if (p->md_type < nitems(types))
				type = types[p->md_type];
			else
				type = "<INVALID>";
			printf("%23s %012lx %12p %08lx ", type, p->md_phys,
			    p->md_virt, p->md_pages);
			if (p->md_attr & EFI_MD_ATTR_UC)
				printf("UC ");
			if (p->md_attr & EFI_MD_ATTR_WC)
				printf("WC ");
			if (p->md_attr & EFI_MD_ATTR_WT)
				printf("WT ");
			if (p->md_attr & EFI_MD_ATTR_WB)
				printf("WB ");
			if (p->md_attr & EFI_MD_ATTR_UCE)
				printf("UCE ");
			if (p->md_attr & EFI_MD_ATTR_WP)
				printf("WP ");
			if (p->md_attr & EFI_MD_ATTR_RP)
				printf("RP ");
			if (p->md_attr & EFI_MD_ATTR_XP)
				printf("XP ");
			if (p->md_attr & EFI_MD_ATTR_NV)
				printf("NV ");
			if (p->md_attr & EFI_MD_ATTR_MORE_RELIABLE)
				printf("MORE_RELIABLE ");
			if (p->md_attr & EFI_MD_ATTR_RO)
				printf("RO ");
			if (p->md_attr & EFI_MD_ATTR_RT)
				printf("RUNTIME");
			printf("\n");
		}

		switch (p->md_type) {
		case EFI_MD_TYPE_CODE:
		case EFI_MD_TYPE_DATA:
		case EFI_MD_TYPE_BS_CODE:
		case EFI_MD_TYPE_BS_DATA:
		case EFI_MD_TYPE_FREE:
			/*
			 * We're allowed to use any entry with these types.
			 */
			break;
		default:
			continue;
		}

		if (!add_physmap_entry(p->md_phys, (p->md_pages * PAGE_SIZE),
		    physmap, physmap_idx))
			break;
	}
}
 1180 
/* "BIOS" or "UEFI"; filled in by native_parse_memmap(). */
static char bootmethod[16] = "";
SYSCTL_STRING(_machdep, OID_AUTO, bootmethod, CTLFLAG_RD, bootmethod, 0,
    "System firmware boot method");
 1184 
 1185 static void
 1186 native_parse_memmap(caddr_t kmdp, vm_paddr_t *physmap, int *physmap_idx)
 1187 {
 1188         struct bios_smap *smap;
 1189         struct efi_map_header *efihdr;
 1190         u_int32_t size;
 1191 
 1192         /*
 1193          * Memory map from INT 15:E820.
 1194          *
 1195          * subr_module.c says:
 1196          * "Consumer may safely assume that size value precedes data."
 1197          * ie: an int32_t immediately precedes smap.
 1198          */
 1199 
 1200         efihdr = (struct efi_map_header *)preload_search_info(kmdp,
 1201             MODINFO_METADATA | MODINFOMD_EFI_MAP);
 1202         smap = (struct bios_smap *)preload_search_info(kmdp,
 1203             MODINFO_METADATA | MODINFOMD_SMAP);
 1204         if (efihdr == NULL && smap == NULL)
 1205                 panic("No BIOS smap or EFI map info from loader!");
 1206 
 1207         if (efihdr != NULL) {
 1208                 add_efi_map_entries(efihdr, physmap, physmap_idx);
 1209                 strlcpy(bootmethod, "UEFI", sizeof(bootmethod));
 1210         } else {
 1211                 size = *((u_int32_t *)smap - 1);
 1212                 bios_add_smap_entries(smap, size, physmap, physmap_idx);
 1213                 strlcpy(bootmethod, "BIOS", sizeof(bootmethod));
 1214         }
 1215 }
 1216 
 1217 #define PAGES_PER_GB    (1024 * 1024 * 1024 / PAGE_SIZE)
 1218 
 1219 /*
 1220  * Populate the (physmap) array with base/bound pairs describing the
 1221  * available physical memory in the system, then test this memory and
 1222  * build the phys_avail array describing the actually-available memory.
 1223  *
 1224  * Total memory size may be set by the kernel environment variable
 1225  * hw.physmem or the compile-time define MAXMEM.
 1226  *
 1227  * XXX first should be vm_paddr_t.
 1228  */
static void
getmemsize(caddr_t kmdp, u_int64_t first)
{
	int i, physmap_idx, pa_indx, da_indx;
	vm_paddr_t pa, physmap[PHYSMAP_SIZE];
	u_long physmem_start, physmem_tunable, memtest;
	pt_entry_t *pte;
	quad_t dcons_addr, dcons_size;
	int page_counter;

	/*
	 * Tell the physical memory allocator about pages used to store
	 * the kernel and preloaded data.  See kmem_bootstrap_free().
	 */
	vm_phys_add_seg((vm_paddr_t)kernphys, trunc_page(first));

	bzero(physmap, sizeof(physmap));
	physmap_idx = 0;

	init_ops.parse_memmap(kmdp, physmap, &physmap_idx);
	/*
	 * parse_memmap() leaves physmap_idx pointing at the next free
	 * slot; step back so it indexes the last base/bound pair for
	 * the "i <= physmap_idx" loops below.
	 */
	physmap_idx -= 2;

	/*
	 * Find the 'base memory' segment for SMP
	 */
	basemem = 0;
	for (i = 0; i <= physmap_idx; i += 2) {
		if (physmap[i] <= 0xA0000) {
			/* basemem is kept in KB. */
			basemem = physmap[i + 1] / 1024;
			break;
		}
	}
	if (basemem == 0 || basemem > 640) {
		if (bootverbose)
			printf(
		"Memory map doesn't contain a basemem segment, faking it");
		basemem = 640;
	}

	/*
	 * Make hole for "AP -> long mode" bootstrap code.  The
	 * mp_bootaddress vector is only available when the kernel
	 * is configured to support APs and APs for the system start
	 * in 32bit mode (e.g. SMP bare metal).
	 */
	if (init_ops.mp_bootaddress) {
		if (physmap[1] >= 0x100000000)
			panic(
	"Basemem segment is not suitable for AP bootstrap code!");
		physmap[1] = init_ops.mp_bootaddress(physmap[1] / 1024);
	}

	/*
	 * Maxmem isn't the "maximum memory", it's one larger than the
	 * highest page of the physical address space.  It should be
	 * called something like "Maxphyspage".  We may adjust this
	 * based on ``hw.physmem'' and the results of the memory test.
	 */
	Maxmem = atop(physmap[physmap_idx + 1]);

#ifdef MAXMEM
	/* MAXMEM is specified in KB; convert to 4 KB pages. */
	Maxmem = MAXMEM / 4;
#endif

	if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable))
		Maxmem = atop(physmem_tunable);

	/*
	 * The boot memory test is disabled by default, as it takes a
	 * significant amount of time on large-memory systems, and is
	 * unfriendly to virtual machines as it unnecessarily touches all
	 * pages.
	 *
	 * A general name is used as the code may be extended to support
	 * additional tests beyond the current "page present" test.
	 */
	memtest = 0;
	TUNABLE_ULONG_FETCH("hw.memtest.tests", &memtest);

	/*
	 * Don't allow MAXMEM or hw.physmem to extend the amount of memory
	 * in the system.
	 */
	if (Maxmem > atop(physmap[physmap_idx + 1]))
		Maxmem = atop(physmap[physmap_idx + 1]);

	if (atop(physmap[physmap_idx + 1]) != Maxmem &&
	    (boothowto & RB_VERBOSE))
		printf("Physical memory use set to %ldK\n", Maxmem * 4);

	/* call pmap initialization to make new kernel address space */
	pmap_bootstrap(&first);

	/*
	 * Size up each available chunk of physical memory.
	 *
	 * XXX Some BIOSes corrupt low 64KB between suspend and resume.
	 * By default, mask off the first 16 pages unless we appear to be
	 * running in a VM.
	 */
	physmem_start = (vm_guest > VM_GUEST_NO ? 1 : 16) << PAGE_SHIFT;
	TUNABLE_ULONG_FETCH("hw.physmem.start", &physmem_start);
	if (physmap[0] < physmem_start) {
		if (physmem_start < PAGE_SIZE)
			physmap[0] = PAGE_SIZE;
		else if (physmem_start >= physmap[1])
			physmap[0] = round_page(physmap[1] - PAGE_SIZE);
		else
			physmap[0] = round_page(physmem_start);
	}
	pa_indx = 0;
	da_indx = 1;
	phys_avail[pa_indx++] = physmap[0];
	phys_avail[pa_indx] = physmap[0];
	dump_avail[da_indx] = physmap[0];
	/* Scratch PTE used to temporarily map each page under test. */
	pte = CMAP1;

	/*
	 * Get dcons buffer address
	 */
	if (getenv_quad("dcons.addr", &dcons_addr) == 0 ||
	    getenv_quad("dcons.size", &dcons_size) == 0)
		dcons_addr = 0;

	/*
	 * physmap is in bytes, so when converting to page boundaries,
	 * round up the start address and round down the end address.
	 */
	page_counter = 0;
	if (memtest != 0)
		printf("Testing system memory");
	for (i = 0; i <= physmap_idx; i += 2) {
		vm_paddr_t end;

		end = ptoa((vm_paddr_t)Maxmem);
		if (physmap[i + 1] < end)
			end = trunc_page(physmap[i + 1]);
		for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) {
			int tmp, page_bad, full;
			int *ptr = (int *)CADDR1;

			full = FALSE;
			/*
			 * block out kernel memory as not available.
			 */
			if (pa >= (vm_paddr_t)kernphys && pa < first)
				goto do_dump_avail;

			/*
			 * block out dcons buffer
			 */
			if (dcons_addr > 0
			    && pa >= trunc_page(dcons_addr)
			    && pa < dcons_addr + dcons_size)
				goto do_dump_avail;

			page_bad = FALSE;
			if (memtest == 0)
				goto skip_memtest;

			/*
			 * Print a "." every GB to show we're making
			 * progress.
			 */
			page_counter++;
			if ((page_counter % PAGES_PER_GB) == 0)
				printf(".");

			/*
			 * map page into kernel: valid, read/write,non-cacheable
			 */
			*pte = pa | PG_V | PG_RW | PG_NC_PWT | PG_NC_PCD;
			invltlb();

			/* Save the page contents to restore after the test. */
			tmp = *(int *)ptr;
			/*
			 * Test for alternating 1's and 0's
			 */
			*(volatile int *)ptr = 0xaaaaaaaa;
			if (*(volatile int *)ptr != 0xaaaaaaaa)
				page_bad = TRUE;
			/*
			 * Test for alternating 0's and 1's
			 */
			*(volatile int *)ptr = 0x55555555;
			if (*(volatile int *)ptr != 0x55555555)
				page_bad = TRUE;
			/*
			 * Test for all 1's
			 */
			*(volatile int *)ptr = 0xffffffff;
			if (*(volatile int *)ptr != 0xffffffff)
				page_bad = TRUE;
			/*
			 * Test for all 0's
			 */
			*(volatile int *)ptr = 0x0;
			if (*(volatile int *)ptr != 0x0)
				page_bad = TRUE;
			/*
			 * Restore original value.
			 */
			*(int *)ptr = tmp;

skip_memtest:
			/*
			 * Adjust array of valid/good pages.
			 */
			if (page_bad == TRUE)
				continue;
			/*
			 * If this good page is a continuation of the
			 * previous set of good pages, then just increase
			 * the end pointer. Otherwise start a new chunk.
			 * Note that "end" points one higher than end,
			 * making the range >= start and < end.
			 * If we're also doing a speculative memory
			 * test and we at or past the end, bump up Maxmem
			 * so that we keep going. The first bad page
			 * will terminate the loop.
			 */
			if (phys_avail[pa_indx] == pa) {
				phys_avail[pa_indx] += PAGE_SIZE;
			} else {
				pa_indx++;
				if (pa_indx == PHYS_AVAIL_ARRAY_END) {
					printf(
		"Too many holes in the physical address space, giving up\n");
					pa_indx--;
					full = TRUE;
					goto do_dump_avail;
				}
				phys_avail[pa_indx++] = pa;	/* start */
				phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */
			}
			physmem++;
do_dump_avail:
			/*
			 * dump_avail also covers ranges (kernel, dcons)
			 * excluded from phys_avail above.
			 */
			if (dump_avail[da_indx] == pa) {
				dump_avail[da_indx] += PAGE_SIZE;
			} else {
				da_indx++;
				if (da_indx == DUMP_AVAIL_ARRAY_END) {
					da_indx--;
					goto do_next;
				}
				dump_avail[da_indx++] = pa; /* start */
				dump_avail[da_indx] = pa + PAGE_SIZE; /* end */
			}
do_next:
			if (full)
				break;
		}
	}
	/* Tear down the scratch mapping used by the memory test. */
	*pte = 0;
	invltlb();
	if (memtest != 0)
		printf("\n");

	/*
	 * XXX
	 * The last chunk must contain at least one page plus the message
	 * buffer to avoid complicating other code (message buffer address
	 * calculation, etc.).
	 */
	while (phys_avail[pa_indx - 1] + PAGE_SIZE +
	    round_page(msgbufsize) >= phys_avail[pa_indx]) {
		physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]);
		phys_avail[pa_indx--] = 0;
		phys_avail[pa_indx--] = 0;
	}

	Maxmem = atop(phys_avail[pa_indx]);

	/* Trim off space for the message buffer. */
	phys_avail[pa_indx] -= round_page(msgbufsize);

	/* Map the message buffer. */
	msgbufp = (struct msgbuf *)PHYS_TO_DMAP(phys_avail[pa_indx]);
}
 1508 
 1509 static caddr_t
 1510 native_parse_preload_data(u_int64_t modulep)
 1511 {
 1512         caddr_t kmdp;
 1513         char *envp;
 1514 #ifdef DDB
 1515         vm_offset_t ksym_start;
 1516         vm_offset_t ksym_end;
 1517 #endif
 1518 
 1519         preload_metadata = (caddr_t)(uintptr_t)(modulep + KERNBASE);
 1520         preload_bootstrap_relocate(KERNBASE);
 1521         kmdp = preload_search_by_type("elf kernel");
 1522         if (kmdp == NULL)
 1523                 kmdp = preload_search_by_type("elf64 kernel");
 1524         boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int);
 1525         envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *);
 1526         if (envp != NULL)
 1527                 envp += KERNBASE;
 1528         init_static_kenv(envp, 0);
 1529 #ifdef DDB
 1530         ksym_start = MD_FETCH(kmdp, MODINFOMD_SSYM, uintptr_t);
 1531         ksym_end = MD_FETCH(kmdp, MODINFOMD_ESYM, uintptr_t);
 1532         db_fetch_ksymtab(ksym_start, ksym_end);
 1533 #endif
 1534         efi_systbl_phys = MD_FETCH(kmdp, MODINFOMD_FW_HANDLE, vm_paddr_t);
 1535 
 1536         return (kmdp);
 1537 }
 1538 
 1539 static void
 1540 amd64_kdb_init(void)
 1541 {
 1542         kdb_init();
 1543 #ifdef KDB
 1544         if (boothowto & RB_KDB)
 1545                 kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger");
 1546 #endif
 1547 }
 1548 
 1549 /* Set up the fast syscall stuff */
 1550 void
 1551 amd64_conf_fast_syscall(void)
 1552 {
 1553         uint64_t msr;
 1554 
 1555         msr = rdmsr(MSR_EFER) | EFER_SCE;
 1556         wrmsr(MSR_EFER, msr);
 1557         wrmsr(MSR_LSTAR, pti ? (u_int64_t)IDTVEC(fast_syscall_pti) :
 1558             (u_int64_t)IDTVEC(fast_syscall));
 1559         wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32));
 1560         msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
 1561             ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48);
 1562         wrmsr(MSR_STAR, msr);
 1563         wrmsr(MSR_SF_MASK, PSL_NT | PSL_T | PSL_I | PSL_C | PSL_D);
 1564 }
 1565 
/*
 * Early, single-threaded machine-dependent kernel initialization,
 * called from locore with the physical address of the preloaded module
 * metadata (modulep) and the first free physical page (physfree).
 * Sets up descriptor tables, per-CPU data, the IDT/TSS, the FPU and
 * thread0, then returns the address of thread0's PCB for locore to use
 * as the initial kernel stack pointer.  Statement order here is
 * load-bearing throughout.
 */
u_int64_t
hammer_time(u_int64_t modulep, u_int64_t physfree)
{
        caddr_t kmdp;
        int gsel_tss, x;
        struct pcpu *pc;
        struct nmi_pcpu *np;
        struct xstate_hdr *xhdr;
        u_int64_t rsp0;
        char *env;
        size_t kstack0_sz;
        int late_console;

        /* Parse the metadata handed over by the boot loader. */
        kmdp = init_ops.parse_preload_data(modulep);

        identify_cpu1();
        identify_hypervisor();
        /*
         * hw.cpu_stdext_disable is ignored by the call, it will be
         * re-evaluated by the below call to finishidentcpu().
         */
        identify_cpu2();

        link_elf_ireloc(kmdp);

        /*
         * This may be done better later if it gets more high level
         * components in it. If so just link td->td_proc here.
         */
        proc_linkup0(&proc0, &thread0);

        /* Init basic tunables, hz etc */
        init_param1();

        /* Carve thread0's kernel stack out of early physical memory. */
        thread0.td_kstack = physfree + KERNBASE;
        thread0.td_kstack_pages = kstack_pages;
        kstack0_sz = thread0.td_kstack_pages * PAGE_SIZE;
        bzero((void *)thread0.td_kstack, kstack0_sz);
        physfree += kstack0_sz;

        /*
         * Make GDT memory segments.  The TSS and LDT slots are system
         * descriptors (two GDT entries each) and are filled in
         * separately below.
         */
        for (x = 0; x < NGDT; x++) {
                if (x != GPROC0_SEL && x != (GPROC0_SEL + 1) &&
                    x != GUSERLDT_SEL && x != (GUSERLDT_SEL) + 1)
                        ssdtosd(&gdt_segs[x], &gdt[x]);
        }
        gdt_segs[GPROC0_SEL].ssd_base = (uintptr_t)&common_tss[0];
        ssdtosyssd(&gdt_segs[GPROC0_SEL],
            (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);

        r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
        r_gdt.rd_base =  (long) gdt;
        lgdt(&r_gdt);
        pc = &__pcpu[0];

        wrmsr(MSR_FSBASE, 0);           /* User value */
        wrmsr(MSR_GSBASE, (u_int64_t)pc);
        wrmsr(MSR_KGSBASE, 0);          /* User value while in the kernel */

        pcpu_init(pc, 0, sizeof(struct pcpu));
        dpcpu_init((void *)(physfree + KERNBASE), 0);
        physfree += DPCPU_SIZE;
        PCPU_SET(prvspace, pc);
        PCPU_SET(curthread, &thread0);
        /* Non-late cninit() and printf() can be moved up to here. */
        PCPU_SET(tssp, &common_tss[0]);
        PCPU_SET(commontssp, &common_tss[0]);
        PCPU_SET(tss, (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);
        PCPU_SET(ldt, (struct system_segment_descriptor *)&gdt[GUSERLDT_SEL]);
        PCPU_SET(fs32p, &gdt[GUFS32_SEL]);
        PCPU_SET(gs32p, &gdt[GUGS32_SEL]);

        /*
         * Initialize mutexes.
         *
         * icu_lock: in order to allow an interrupt to occur in a critical
         *           section, to set pcpu->ipending (etc...) properly, we
         *           must be able to get the icu lock, so it can't be
         *           under witness.
         */
        mutex_init();
        mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS);
        mtx_init(&dt_lock, "descriptor tables", NULL, MTX_DEF);

        /* exceptions */
        pti = pti_get_default();
        TUNABLE_INT_FETCH("vm.pmap.pti", &pti);

        /* Point every IDT vector at the reserved handler first. */
        for (x = 0; x < NIDT; x++)
                setidt(x, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd), SDT_SYSIGT,
                    SEL_KPL, 0);
        setidt(IDT_DE, pti ? &IDTVEC(div_pti) : &IDTVEC(div), SDT_SYSIGT,
            SEL_KPL, 0);
        /* DB#, NMI, DF# and MC# run on dedicated IST stacks (set below). */
        setidt(IDT_DB, &IDTVEC(dbg), SDT_SYSIGT, SEL_KPL, 4);
        setidt(IDT_NMI, &IDTVEC(nmi),  SDT_SYSIGT, SEL_KPL, 2);
        /* BP# and OF# are reachable from user mode (SEL_UPL). */
        setidt(IDT_BP, pti ? &IDTVEC(bpt_pti) : &IDTVEC(bpt), SDT_SYSIGT,
            SEL_UPL, 0);
        setidt(IDT_OF, pti ? &IDTVEC(ofl_pti) : &IDTVEC(ofl), SDT_SYSIGT,
            SEL_UPL, 0);
        setidt(IDT_BR, pti ? &IDTVEC(bnd_pti) : &IDTVEC(bnd), SDT_SYSIGT,
            SEL_KPL, 0);
        setidt(IDT_UD, pti ? &IDTVEC(ill_pti) : &IDTVEC(ill), SDT_SYSIGT,
            SEL_KPL, 0);
        setidt(IDT_NM, pti ? &IDTVEC(dna_pti) : &IDTVEC(dna), SDT_SYSIGT,
            SEL_KPL, 0);
        setidt(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1);
        setidt(IDT_FPUGP, pti ? &IDTVEC(fpusegm_pti) : &IDTVEC(fpusegm),
            SDT_SYSIGT, SEL_KPL, 0);
        setidt(IDT_TS, pti ? &IDTVEC(tss_pti) : &IDTVEC(tss), SDT_SYSIGT,
            SEL_KPL, 0);
        setidt(IDT_NP, pti ? &IDTVEC(missing_pti) : &IDTVEC(missing),
            SDT_SYSIGT, SEL_KPL, 0);
        setidt(IDT_SS, pti ? &IDTVEC(stk_pti) : &IDTVEC(stk), SDT_SYSIGT,
            SEL_KPL, 0);
        setidt(IDT_GP, pti ? &IDTVEC(prot_pti) : &IDTVEC(prot), SDT_SYSIGT,
            SEL_KPL, 0);
        setidt(IDT_PF, pti ? &IDTVEC(page_pti) : &IDTVEC(page), SDT_SYSIGT,
            SEL_KPL, 0);
        setidt(IDT_MF, pti ? &IDTVEC(fpu_pti) : &IDTVEC(fpu), SDT_SYSIGT,
            SEL_KPL, 0);
        setidt(IDT_AC, pti ? &IDTVEC(align_pti) : &IDTVEC(align), SDT_SYSIGT,
            SEL_KPL, 0);
        setidt(IDT_MC, &IDTVEC(mchk), SDT_SYSIGT, SEL_KPL, 3);
        setidt(IDT_XF, pti ? &IDTVEC(xmm_pti) : &IDTVEC(xmm), SDT_SYSIGT,
            SEL_KPL, 0);
#ifdef KDTRACE_HOOKS
        setidt(IDT_DTRACE_RET, pti ? &IDTVEC(dtrace_ret_pti) :
            &IDTVEC(dtrace_ret), SDT_SYSIGT, SEL_UPL, 0);
#endif
#ifdef XENHVM
        setidt(IDT_EVTCHN, pti ? &IDTVEC(xen_intr_upcall_pti) :
            &IDTVEC(xen_intr_upcall), SDT_SYSIGT, SEL_KPL, 0);
#endif
        r_idt.rd_limit = sizeof(idt0) - 1;
        r_idt.rd_base = (long) idt;
        lidt(&r_idt);

        /*
         * Initialize the clock before the console so that console
         * initialization can use DELAY().
         */
        clock_init();

        /*
         * Use vt(4) by default for UEFI boot (during the sc(4)/vt(4)
         * transition).
         * Once bootblocks have updated, we can test directly for
         * efi_systbl != NULL here...
         */
        if (preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_EFI_MAP)
            != NULL)
                vty_set_preferred(VTY_VT);

        finishidentcpu();       /* Final stage of CPU initialization */
        initializecpu();        /* Initialize CPU registers */
        initializecpucache();

        /* doublefault stack space, runs on ist1 */
        common_tss[0].tss_ist1 = (long)&dblfault_stack[sizeof(dblfault_stack)];

        /*
         * NMI stack, runs on ist2.  The pcpu pointer is stored just
         * above the start of the ist2 stack.
         */
        np = ((struct nmi_pcpu *) &nmi0_stack[sizeof(nmi0_stack)]) - 1;
        np->np_pcpu = (register_t) pc;
        common_tss[0].tss_ist2 = (long) np;

        /*
         * MC# stack, runs on ist3.  The pcpu pointer is stored just
         * above the start of the ist3 stack.
         */
        np = ((struct nmi_pcpu *) &mce0_stack[sizeof(mce0_stack)]) - 1;
        np->np_pcpu = (register_t) pc;
        common_tss[0].tss_ist3 = (long) np;

        /*
         * DB# stack, runs on ist4.
         */
        np = ((struct nmi_pcpu *) &dbg0_stack[sizeof(dbg0_stack)]) - 1;
        np->np_pcpu = (register_t) pc;
        common_tss[0].tss_ist4 = (long) np;

        /* Set the IO permission bitmap (empty due to tss seg limit) */
        common_tss[0].tss_iobase = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE;

        gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
        ltr(gsel_tss);

        amd64_conf_fast_syscall();

        /*
         * Temporary forge some valid pointer to PCB, for exception
         * handlers.  It is reinitialized properly below after FPU is
         * set up.  Also set up td_critnest to short-cut the page
         * fault handler.
         */
        cpu_max_ext_state_size = sizeof(struct savefpu);
        thread0.td_pcb = get_pcb_td(&thread0);
        thread0.td_critnest = 1;

        /*
         * The console and kdb should be initialized even earlier than here,
         * but some console drivers don't work until after getmemsize().
         * Default to late console initialization to support these drivers.
         * This loses mainly printf()s in getmemsize() and early debugging.
         */
        late_console = 1;
        TUNABLE_INT_FETCH("debug.late_console", &late_console);
        if (!late_console) {
                cninit();
                amd64_kdb_init();
        }

        getmemsize(kmdp, physfree);
        init_param2(physmem);

        /* Now running on new page tables, configured, and u/iom is accessible. */

        if (late_console)
                cninit();

#ifdef DEV_ISA
#ifdef DEV_ATPIC
        elcr_probe();
        atpic_startup();
#else
        /* Reset and mask the atpics and leave them shut down. */
        atpic_reset();

        /*
         * Point the ICU spurious interrupt vectors at the APIC spurious
         * interrupt handler.
         */
        setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
        setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
#endif
#else
#error "have you forgotten the isa device?";
#endif

        if (late_console)
                amd64_kdb_init();

        msgbufinit(msgbufp, msgbufsize);
        fpuinit();

        /*
         * Set up thread0 pcb after fpuinit calculated pcb + fpu save
         * area size.  Zero out the extended state header in fpu save
         * area.
         */
        thread0.td_pcb = get_pcb_td(&thread0);
        thread0.td_pcb->pcb_save = get_pcb_user_save_td(&thread0);
        bzero(get_pcb_user_save_td(&thread0), cpu_max_ext_state_size);
        if (use_xsave) {
                xhdr = (struct xstate_hdr *)(get_pcb_user_save_td(&thread0) +
                    1);
                xhdr->xstate_bv = xsave_mask;
        }
        /* make an initial tss so cpu can get interrupt stack on syscall! */
        rsp0 = (vm_offset_t)thread0.td_pcb;
        /* Ensure the stack is aligned to 16 bytes */
        rsp0 &= ~0xFul;
        common_tss[0].tss_rsp0 = rsp0;
        PCPU_SET(rsp0, rsp0);
        PCPU_SET(pti_rsp0, ((vm_offset_t)PCPU_PTR(pti_stack) +
            PC_PTI_STACK_SZ * sizeof(uint64_t)) & ~0xful);
        PCPU_SET(curpcb, thread0.td_pcb);

        /* transfer to user mode */

        _ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
        _udatasel = GSEL(GUDATA_SEL, SEL_UPL);
        _ucode32sel = GSEL(GUCODE32_SEL, SEL_UPL);
        _ufssel = GSEL(GUFS32_SEL, SEL_UPL);
        _ugssel = GSEL(GUGS32_SEL, SEL_UPL);

        load_ds(_udatasel);
        load_es(_udatasel);
        load_fs(_ufssel);

        /* setup proc 0's pcb */
        thread0.td_pcb->pcb_flags = 0;
        thread0.td_frame = &proc0_tf;

        env = kern_getenv("kernelname");
        if (env != NULL)
                strlcpy(kernelname, env, sizeof(kernelname));

        cpu_probe_amdc1e();

#ifdef FDT
        x86_init_fdt();
#endif
        thread0.td_critnest = 0;

        /* Speculative-execution mitigation tunables. */
        TUNABLE_INT_FETCH("hw.ibrs_disable", &hw_ibrs_disable);
        TUNABLE_INT_FETCH("hw.spec_store_bypass_disable", &hw_ssb_disable);

        /* Location of kernel stack for locore */
        return ((u_int64_t)thread0.td_pcb);
}
 1871 
/*
 * MD hook run when a per-CPU area is initialized.  Marks the ACPI CPU
 * id as unknown (0xffffffff); presumably it is assigned later by the
 * ACPI/MADT code — confirm against the callers of pc_acpi_id.
 */
void
cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size)
{

        pcpu->pc_acpi_id = 0xffffffff;
}
 1878 
 1879 static int
 1880 smap_sysctl_handler(SYSCTL_HANDLER_ARGS)
 1881 {
 1882         struct bios_smap *smapbase;
 1883         struct bios_smap_xattr smap;
 1884         caddr_t kmdp;
 1885         uint32_t *smapattr;
 1886         int count, error, i;
 1887 
 1888         /* Retrieve the system memory map from the loader. */
 1889         kmdp = preload_search_by_type("elf kernel");
 1890         if (kmdp == NULL)
 1891                 kmdp = preload_search_by_type("elf64 kernel");
 1892         smapbase = (struct bios_smap *)preload_search_info(kmdp,
 1893             MODINFO_METADATA | MODINFOMD_SMAP);
 1894         if (smapbase == NULL)
 1895                 return (0);
 1896         smapattr = (uint32_t *)preload_search_info(kmdp,
 1897             MODINFO_METADATA | MODINFOMD_SMAP_XATTR);
 1898         count = *((uint32_t *)smapbase - 1) / sizeof(*smapbase);
 1899         error = 0;
 1900         for (i = 0; i < count; i++) {
 1901                 smap.base = smapbase[i].base;
 1902                 smap.length = smapbase[i].length;
 1903                 smap.type = smapbase[i].type;
 1904                 if (smapattr != NULL)
 1905                         smap.xattr = smapattr[i];
 1906                 else
 1907                         smap.xattr = 0;
 1908                 error = SYSCTL_OUT(req, &smap, sizeof(smap));
 1909         }
 1910         return (error);
 1911 }
 1912 SYSCTL_PROC(_machdep, OID_AUTO, smap, CTLTYPE_OPAQUE|CTLFLAG_RD, NULL, 0,
 1913     smap_sysctl_handler, "S,bios_smap_xattr", "Raw BIOS SMAP data");
 1914 
 1915 static int
 1916 efi_map_sysctl_handler(SYSCTL_HANDLER_ARGS)
 1917 {
 1918         struct efi_map_header *efihdr;
 1919         caddr_t kmdp;
 1920         uint32_t efisize;
 1921 
 1922         kmdp = preload_search_by_type("elf kernel");
 1923         if (kmdp == NULL)
 1924                 kmdp = preload_search_by_type("elf64 kernel");
 1925         efihdr = (struct efi_map_header *)preload_search_info(kmdp,
 1926             MODINFO_METADATA | MODINFOMD_EFI_MAP);
 1927         if (efihdr == NULL)
 1928                 return (0);
 1929         efisize = *((uint32_t *)efihdr - 1);
 1930         return (SYSCTL_OUT(req, efihdr, efisize));
 1931 }
 1932 SYSCTL_PROC(_machdep, OID_AUTO, efi_map, CTLTYPE_OPAQUE|CTLFLAG_RD, NULL, 0,
 1933     efi_map_sysctl_handler, "S,efi_map_header", "Raw EFI Memory Map");
 1934 
 1935 void
 1936 spinlock_enter(void)
 1937 {
 1938         struct thread *td;
 1939         register_t flags;
 1940 
 1941         td = curthread;
 1942         if (td->td_md.md_spinlock_count == 0) {
 1943                 flags = intr_disable();
 1944                 td->td_md.md_spinlock_count = 1;
 1945                 td->td_md.md_saved_flags = flags;
 1946         } else
 1947                 td->td_md.md_spinlock_count++;
 1948         critical_enter();
 1949 }
 1950 
 1951 void
 1952 spinlock_exit(void)
 1953 {
 1954         struct thread *td;
 1955         register_t flags;
 1956 
 1957         td = curthread;
 1958         critical_exit();
 1959         flags = td->td_md.md_saved_flags;
 1960         td->td_md.md_spinlock_count--;
 1961         if (td->td_md.md_spinlock_count == 0)
 1962                 intr_restore(flags);
 1963 }
 1964 
 1965 /*
 1966  * Construct a PCB from a trapframe. This is called from kdb_trap() where
 1967  * we want to start a backtrace from the function that caused us to enter
 1968  * the debugger. We have the context in the trapframe, but base the trace
 1969  * on the PCB. The PCB doesn't have to be perfect, as long as it contains
 1970  * enough for a backtrace.
 1971  */
 1972 void
 1973 makectx(struct trapframe *tf, struct pcb *pcb)
 1974 {
 1975 
 1976         pcb->pcb_r12 = tf->tf_r12;
 1977         pcb->pcb_r13 = tf->tf_r13;
 1978         pcb->pcb_r14 = tf->tf_r14;
 1979         pcb->pcb_r15 = tf->tf_r15;
 1980         pcb->pcb_rbp = tf->tf_rbp;
 1981         pcb->pcb_rbx = tf->tf_rbx;
 1982         pcb->pcb_rip = tf->tf_rip;
 1983         pcb->pcb_rsp = tf->tf_rsp;
 1984 }
 1985 
 1986 int
 1987 ptrace_set_pc(struct thread *td, unsigned long addr)
 1988 {
 1989 
 1990         td->td_frame->tf_rip = addr;
 1991         set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
 1992         return (0);
 1993 }
 1994 
 1995 int
 1996 ptrace_single_step(struct thread *td)
 1997 {
 1998 
 1999         PROC_LOCK_ASSERT(td->td_proc, MA_OWNED);
 2000         if ((td->td_frame->tf_rflags & PSL_T) == 0) {
 2001                 td->td_frame->tf_rflags |= PSL_T;
 2002                 td->td_dbgflags |= TDB_STEP;
 2003         }
 2004         return (0);
 2005 }
 2006 
 2007 int
 2008 ptrace_clear_single_step(struct thread *td)
 2009 {
 2010         PROC_LOCK_ASSERT(td->td_proc, MA_OWNED);
 2011         td->td_frame->tf_rflags &= ~PSL_T;
 2012         td->td_dbgflags &= ~TDB_STEP;
 2013         return (0);
 2014 }
 2015 
 2016 int
 2017 fill_regs(struct thread *td, struct reg *regs)
 2018 {
 2019         struct trapframe *tp;
 2020 
 2021         tp = td->td_frame;
 2022         return (fill_frame_regs(tp, regs));
 2023 }
 2024 
 2025 int
 2026 fill_frame_regs(struct trapframe *tp, struct reg *regs)
 2027 {
 2028 
 2029         regs->r_r15 = tp->tf_r15;
 2030         regs->r_r14 = tp->tf_r14;
 2031         regs->r_r13 = tp->tf_r13;
 2032         regs->r_r12 = tp->tf_r12;
 2033         regs->r_r11 = tp->tf_r11;
 2034         regs->r_r10 = tp->tf_r10;
 2035         regs->r_r9  = tp->tf_r9;
 2036         regs->r_r8  = tp->tf_r8;
 2037         regs->r_rdi = tp->tf_rdi;
 2038         regs->r_rsi = tp->tf_rsi;
 2039         regs->r_rbp = tp->tf_rbp;
 2040         regs->r_rbx = tp->tf_rbx;
 2041         regs->r_rdx = tp->tf_rdx;
 2042         regs->r_rcx = tp->tf_rcx;
 2043         regs->r_rax = tp->tf_rax;
 2044         regs->r_rip = tp->tf_rip;
 2045         regs->r_cs = tp->tf_cs;
 2046         regs->r_rflags = tp->tf_rflags;
 2047         regs->r_rsp = tp->tf_rsp;
 2048         regs->r_ss = tp->tf_ss;
 2049         if (tp->tf_flags & TF_HASSEGS) {
 2050                 regs->r_ds = tp->tf_ds;
 2051                 regs->r_es = tp->tf_es;
 2052                 regs->r_fs = tp->tf_fs;
 2053                 regs->r_gs = tp->tf_gs;
 2054         } else {
 2055                 regs->r_ds = 0;
 2056                 regs->r_es = 0;
 2057                 regs->r_fs = 0;
 2058                 regs->r_gs = 0;
 2059         }
 2060         regs->r_err = 0;
 2061         regs->r_trapno = 0;
 2062         return (0);
 2063 }
 2064 
/*
 * Install a user-supplied register set into td's trapframe, after
 * validating that the new rflags and %cs cannot be used to gain
 * privilege.  Returns EINVAL on a rejected rflags/%cs combination,
 * 0 otherwise.
 */
int
set_regs(struct thread *td, struct reg *regs)
{
        struct trapframe *tp;
        register_t rflags;

        tp = td->td_frame;
        /* Only the low 32 bits of rflags are considered. */
        rflags = regs->r_rflags & 0xffffffff;
        /*
         * Reject flag changes outside the user-modifiable set and any
         * %cs that does not pass the user code selector check.
         */
        if (!EFL_SECURE(rflags, tp->tf_rflags) || !CS_SECURE(regs->r_cs))
                return (EINVAL);
        tp->tf_r15 = regs->r_r15;
        tp->tf_r14 = regs->r_r14;
        tp->tf_r13 = regs->r_r13;
        tp->tf_r12 = regs->r_r12;
        tp->tf_r11 = regs->r_r11;
        tp->tf_r10 = regs->r_r10;
        tp->tf_r9  = regs->r_r9;
        tp->tf_r8  = regs->r_r8;
        tp->tf_rdi = regs->r_rdi;
        tp->tf_rsi = regs->r_rsi;
        tp->tf_rbp = regs->r_rbp;
        tp->tf_rbx = regs->r_rbx;
        tp->tf_rdx = regs->r_rdx;
        tp->tf_rcx = regs->r_rcx;
        tp->tf_rax = regs->r_rax;
        tp->tf_rip = regs->r_rip;
        tp->tf_cs = regs->r_cs;
        tp->tf_rflags = rflags;
        tp->tf_rsp = regs->r_rsp;
        tp->tf_ss = regs->r_ss;
        /* Segment register writes are deliberately disabled (XXXKIB). */
        if (0) {        /* XXXKIB */
                tp->tf_ds = regs->r_ds;
                tp->tf_es = regs->r_es;
                tp->tf_fs = regs->r_fs;
                tp->tf_gs = regs->r_gs;
                tp->tf_flags = TF_HASSEGS;
        }
        /* Return via the full iret path so all fields take effect. */
        set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
        return (0);
}
 2105 
/* XXX check all this stuff! */
/*
 * Externalize from sv_xmm: convert the in-kernel FXSAVE image into the
 * ptrace/core-dump struct fpreg layout.
 */
static void
fill_fpregs_xmm(struct savefpu *sv_xmm, struct fpreg *fpregs)
{
        struct envxmm *penv_fpreg = (struct envxmm *)&fpregs->fpr_env;
        struct envxmm *penv_xmm = &sv_xmm->sv_env;
        int i;

        /* pcb -> fpregs: start from a zeroed destination. */
        bzero(fpregs, sizeof(*fpregs));

        /* FPU control/status */
        penv_fpreg->en_cw = penv_xmm->en_cw;
        penv_fpreg->en_sw = penv_xmm->en_sw;
        penv_fpreg->en_tw = penv_xmm->en_tw;
        penv_fpreg->en_opcode = penv_xmm->en_opcode;
        penv_fpreg->en_rip = penv_xmm->en_rip;
        penv_fpreg->en_rdp = penv_xmm->en_rdp;
        penv_fpreg->en_mxcsr = penv_xmm->en_mxcsr;
        penv_fpreg->en_mxcsr_mask = penv_xmm->en_mxcsr_mask;

        /* FPU registers: 8 x87 registers, 10 significant bytes each. */
        for (i = 0; i < 8; ++i)
                bcopy(sv_xmm->sv_fp[i].fp_acc.fp_bytes, fpregs->fpr_acc[i], 10);

        /* SSE registers: 16 XMM registers, 16 bytes each. */
        for (i = 0; i < 16; ++i)
                bcopy(sv_xmm->sv_xmm[i].xmm_bytes, fpregs->fpr_xacc[i], 16);
}
 2136 
/*
 * Internalize from fpregs into sv_xmm: convert the ptrace struct fpreg
 * layout back into the in-kernel FXSAVE image.
 */
static void
set_fpregs_xmm(struct fpreg *fpregs, struct savefpu *sv_xmm)
{
        struct envxmm *penv_xmm = &sv_xmm->sv_env;
        struct envxmm *penv_fpreg = (struct envxmm *)&fpregs->fpr_env;
        int i;

        /* fpregs -> pcb */
        /* FPU control/status */
        penv_xmm->en_cw = penv_fpreg->en_cw;
        penv_xmm->en_sw = penv_fpreg->en_sw;
        penv_xmm->en_tw = penv_fpreg->en_tw;
        penv_xmm->en_opcode = penv_fpreg->en_opcode;
        penv_xmm->en_rip = penv_fpreg->en_rip;
        penv_xmm->en_rdp = penv_fpreg->en_rdp;
        penv_xmm->en_mxcsr = penv_fpreg->en_mxcsr;
        /* Clamp the MXCSR mask to the bits this CPU supports. */
        penv_xmm->en_mxcsr_mask = penv_fpreg->en_mxcsr_mask & cpu_mxcsr_mask;

        /* FPU registers: 8 x87 registers, 10 significant bytes each. */
        for (i = 0; i < 8; ++i)
                bcopy(fpregs->fpr_acc[i], sv_xmm->sv_fp[i].fp_acc.fp_bytes, 10);

        /* SSE registers: 16 XMM registers, 16 bytes each. */
        for (i = 0; i < 16; ++i)
                bcopy(fpregs->fpr_xacc[i], sv_xmm->sv_xmm[i].xmm_bytes, 16);
}
 2164 
/*
 * Externalize from td->pcb: flush td's live FPU state to its save area
 * and copy it into *fpregs.  The target must be the current thread or
 * stopped/suspended, per the KASSERT.  Always returns 0.
 */
int
fill_fpregs(struct thread *td, struct fpreg *fpregs)
{

        KASSERT(td == curthread || TD_IS_SUSPENDED(td) ||
            P_SHOULDSTOP(td->td_proc),
            ("not suspended thread %p", td));
        /* Make sure the save area holds the thread's current state. */
        fpugetregs(td);
        fill_fpregs_xmm(get_pcb_user_save_td(td), fpregs);
        return (0);
}
 2177 
 2178 /* internalize to td->pcb */
 2179 int
 2180 set_fpregs(struct thread *td, struct fpreg *fpregs)
 2181 {
 2182 
 2183         critical_enter();
 2184         set_fpregs_xmm(fpregs, get_pcb_user_save_td(td));
 2185         fpuuserinited(td);
 2186         critical_exit();
 2187         return (0);
 2188 }
 2189 
/*
 * Get machine context: snapshot td's user register state, FPU state
 * and segment bases into *mcp.  With GET_MC_CLEAR_RET the syscall
 * return registers (%rax, %rdx) and the carry flag are reported as
 * cleared instead of their frame values.  Always returns 0.
 */
int
get_mcontext(struct thread *td, mcontext_t *mcp, int flags)
{
        struct pcb *pcb;
        struct trapframe *tp;

        pcb = td->td_pcb;
        tp = td->td_frame;
        /* Sample the signal-stack state under the proc lock. */
        PROC_LOCK(curthread->td_proc);
        mcp->mc_onstack = sigonstack(tp->tf_rsp);
        PROC_UNLOCK(curthread->td_proc);
        mcp->mc_r15 = tp->tf_r15;
        mcp->mc_r14 = tp->tf_r14;
        mcp->mc_r13 = tp->tf_r13;
        mcp->mc_r12 = tp->tf_r12;
        mcp->mc_r11 = tp->tf_r11;
        mcp->mc_r10 = tp->tf_r10;
        mcp->mc_r9  = tp->tf_r9;
        mcp->mc_r8  = tp->tf_r8;
        mcp->mc_rdi = tp->tf_rdi;
        mcp->mc_rsi = tp->tf_rsi;
        mcp->mc_rbp = tp->tf_rbp;
        mcp->mc_rbx = tp->tf_rbx;
        mcp->mc_rcx = tp->tf_rcx;
        mcp->mc_rflags = tp->tf_rflags;
        if (flags & GET_MC_CLEAR_RET) {
                /* Report the syscall return registers as cleared. */
                mcp->mc_rax = 0;
                mcp->mc_rdx = 0;
                mcp->mc_rflags &= ~PSL_C;
        } else {
                mcp->mc_rax = tp->tf_rax;
                mcp->mc_rdx = tp->tf_rdx;
        }
        mcp->mc_rip = tp->tf_rip;
        mcp->mc_cs = tp->tf_cs;
        mcp->mc_rsp = tp->tf_rsp;
        mcp->mc_ss = tp->tf_ss;
        mcp->mc_ds = tp->tf_ds;
        mcp->mc_es = tp->tf_es;
        mcp->mc_fs = tp->tf_fs;
        mcp->mc_gs = tp->tf_gs;
        mcp->mc_flags = tp->tf_flags;
        mcp->mc_len = sizeof(*mcp);
        /* No external buffer: extended FPU state is not exported here. */
        get_fpcontext(td, mcp, NULL, 0);
        update_pcb_bases(pcb);
        mcp->mc_fsbase = pcb->pcb_fsbase;
        mcp->mc_gsbase = pcb->pcb_gsbase;
        mcp->mc_xfpustate = 0;
        mcp->mc_xfpustate_len = 0;
        bzero(mcp->mc_spare, sizeof(mcp->mc_spare));
        return (0);
}
 2245 
/*
 * Set machine context.
 *
 * However, we don't set any but the user modifiable flags, and we won't
 * touch the cs selector.  Validates mc_len and mc_flags, restores any
 * extended FPU state supplied by the caller, then installs the general
 * registers and (when _MC_HASBASES is set) the fs/gs bases.  Returns
 * EINVAL on malformed input or a copyin/fpusetregs error code.
 */
int
set_mcontext(struct thread *td, mcontext_t *mcp)
{
        struct pcb *pcb;
        struct trapframe *tp;
        char *xfpustate;
        long rflags;
        int ret;

        pcb = td->td_pcb;
        tp = td->td_frame;
        if (mcp->mc_len != sizeof(*mcp) ||
            (mcp->mc_flags & ~_MC_FLAG_MASK) != 0)
                return (EINVAL);
        /* Merge only the user-changeable bits into the current rflags. */
        rflags = (mcp->mc_rflags & PSL_USERCHANGE) |
            (tp->tf_rflags & ~PSL_USERCHANGE);
        if (mcp->mc_flags & _MC_HASFPXSTATE) {
                if (mcp->mc_xfpustate_len > cpu_max_ext_state_size -
                    sizeof(struct savefpu))
                        return (EINVAL);
                /* Length is bounded by the check above, so alloca is safe. */
                xfpustate = __builtin_alloca(mcp->mc_xfpustate_len);
                ret = copyin((void *)mcp->mc_xfpustate, xfpustate,
                    mcp->mc_xfpustate_len);
                if (ret != 0)
                        return (ret);
        } else
                xfpustate = NULL;
        ret = set_fpcontext(td, mcp, xfpustate, mcp->mc_xfpustate_len);
        if (ret != 0)
                return (ret);
        tp->tf_r15 = mcp->mc_r15;
        tp->tf_r14 = mcp->mc_r14;
        tp->tf_r13 = mcp->mc_r13;
        tp->tf_r12 = mcp->mc_r12;
        tp->tf_r11 = mcp->mc_r11;
        tp->tf_r10 = mcp->mc_r10;
        tp->tf_r9  = mcp->mc_r9;
        tp->tf_r8  = mcp->mc_r8;
        tp->tf_rdi = mcp->mc_rdi;
        tp->tf_rsi = mcp->mc_rsi;
        tp->tf_rbp = mcp->mc_rbp;
        tp->tf_rbx = mcp->mc_rbx;
        tp->tf_rdx = mcp->mc_rdx;
        tp->tf_rcx = mcp->mc_rcx;
        tp->tf_rax = mcp->mc_rax;
        tp->tf_rip = mcp->mc_rip;
        tp->tf_rflags = rflags;
        tp->tf_rsp = mcp->mc_rsp;
        tp->tf_ss = mcp->mc_ss;
        tp->tf_flags = mcp->mc_flags;
        if (tp->tf_flags & TF_HASSEGS) {
                tp->tf_ds = mcp->mc_ds;
                tp->tf_es = mcp->mc_es;
                tp->tf_fs = mcp->mc_fs;
                tp->tf_gs = mcp->mc_gs;
        }
        /* Return via the full iret path so all fields take effect. */
        set_pcb_flags(pcb, PCB_FULL_IRET);
        if (mcp->mc_flags & _MC_HASBASES) {
                pcb->pcb_fsbase = mcp->mc_fsbase;
                pcb->pcb_gsbase = mcp->mc_gsbase;
        }
        return (0);
}
 2315 
 2316 static void
 2317 get_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpusave,
 2318     size_t xfpusave_len)
 2319 {
 2320         size_t max_len, len;
 2321 
 2322         mcp->mc_ownedfp = fpugetregs(td);
 2323         bcopy(get_pcb_user_save_td(td), &mcp->mc_fpstate[0],
 2324             sizeof(mcp->mc_fpstate));
 2325         mcp->mc_fpformat = fpuformat();
 2326         if (!use_xsave || xfpusave_len == 0)
 2327                 return;
 2328         max_len = cpu_max_ext_state_size - sizeof(struct savefpu);
 2329         len = xfpusave_len;
 2330         if (len > max_len) {
 2331                 len = max_len;
 2332                 bzero(xfpusave + max_len, len - max_len);
 2333         }
 2334         mcp->mc_flags |= _MC_HASFPXSTATE;
 2335         mcp->mc_xfpustate_len = len;
 2336         bcopy(get_pcb_user_save_td(td) + 1, xfpusave, len);
 2337 }
 2338 
 2339 static int
 2340 set_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpustate,
 2341     size_t xfpustate_len)
 2342 {
 2343         int error;
 2344 
 2345         if (mcp->mc_fpformat == _MC_FPFMT_NODEV)
 2346                 return (0);
 2347         else if (mcp->mc_fpformat != _MC_FPFMT_XMM)
 2348                 return (EINVAL);
 2349         else if (mcp->mc_ownedfp == _MC_FPOWNED_NONE) {
 2350                 /* We don't care what state is left in the FPU or PCB. */
 2351                 fpstate_drop(td);
 2352                 error = 0;
 2353         } else if (mcp->mc_ownedfp == _MC_FPOWNED_FPU ||
 2354             mcp->mc_ownedfp == _MC_FPOWNED_PCB) {
 2355                 error = fpusetregs(td, (struct savefpu *)&mcp->mc_fpstate,
 2356                     xfpustate, xfpustate_len);
 2357         } else
 2358                 return (EINVAL);
 2359         return (error);
 2360 }
 2361 
/*
 * Discard the thread's pending user FPU state: release the FPU if this
 * thread currently owns it and clear the "state initialized" PCB flags
 * so subsequent FPU use starts fresh.  Only valid on user-owned FPU
 * state, per the KASSERT.
 */
void
fpstate_drop(struct thread *td)
{

        KASSERT(PCB_USER_FPU(td->td_pcb), ("fpstate_drop: kernel-owned fpu"));
        critical_enter();
        if (PCPU_GET(fpcurthread) == td)
                fpudrop();
        /*
         * XXX force a full drop of the fpu.  The above only drops it if we
         * owned it.
         *
         * XXX I don't much like fpugetuserregs()'s semantics of doing a full
         * drop.  Dropping only to the pcb matches fnsave's behaviour.
         * We only need to drop to !PCB_INITDONE in sendsig().  But
         * sendsig() is the only caller of fpugetuserregs()... perhaps we just
         * have too many layers.
         */
        clear_pcb_flags(curthread->td_pcb,
            PCB_FPUINITDONE | PCB_USERFPUINITDONE);
        critical_exit();
}
 2384 
 2385 int
 2386 fill_dbregs(struct thread *td, struct dbreg *dbregs)
 2387 {
 2388         struct pcb *pcb;
 2389 
 2390         if (td == NULL) {
 2391                 dbregs->dr[0] = rdr0();
 2392                 dbregs->dr[1] = rdr1();
 2393                 dbregs->dr[2] = rdr2();
 2394                 dbregs->dr[3] = rdr3();
 2395                 dbregs->dr[6] = rdr6();
 2396                 dbregs->dr[7] = rdr7();
 2397         } else {
 2398                 pcb = td->td_pcb;
 2399                 dbregs->dr[0] = pcb->pcb_dr0;
 2400                 dbregs->dr[1] = pcb->pcb_dr1;
 2401                 dbregs->dr[2] = pcb->pcb_dr2;
 2402                 dbregs->dr[3] = pcb->pcb_dr3;
 2403                 dbregs->dr[6] = pcb->pcb_dr6;
 2404                 dbregs->dr[7] = pcb->pcb_dr7;
 2405         }
 2406         dbregs->dr[4] = 0;
 2407         dbregs->dr[5] = 0;
 2408         dbregs->dr[8] = 0;
 2409         dbregs->dr[9] = 0;
 2410         dbregs->dr[10] = 0;
 2411         dbregs->dr[11] = 0;
 2412         dbregs->dr[12] = 0;
 2413         dbregs->dr[13] = 0;
 2414         dbregs->dr[14] = 0;
 2415         dbregs->dr[15] = 0;
 2416         return (0);
 2417 }
 2418 
 2419 int
 2420 set_dbregs(struct thread *td, struct dbreg *dbregs)
 2421 {
 2422         struct pcb *pcb;
 2423         int i;
 2424 
 2425         if (td == NULL) {
 2426                 load_dr0(dbregs->dr[0]);
 2427                 load_dr1(dbregs->dr[1]);
 2428                 load_dr2(dbregs->dr[2]);
 2429                 load_dr3(dbregs->dr[3]);
 2430                 load_dr6(dbregs->dr[6]);
 2431                 load_dr7(dbregs->dr[7]);
 2432         } else {
 2433                 /*
 2434                  * Don't let an illegal value for dr7 get set.  Specifically,
 2435                  * check for undefined settings.  Setting these bit patterns
 2436                  * result in undefined behaviour and can lead to an unexpected
 2437                  * TRCTRAP or a general protection fault right here.
 2438                  * Upper bits of dr6 and dr7 must not be set
 2439                  */
 2440                 for (i = 0; i < 4; i++) {
 2441                         if (DBREG_DR7_ACCESS(dbregs->dr[7], i) == 0x02)
 2442                                 return (EINVAL);
 2443                         if (td->td_frame->tf_cs == _ucode32sel &&
 2444                             DBREG_DR7_LEN(dbregs->dr[7], i) == DBREG_DR7_LEN_8)
 2445                                 return (EINVAL);
 2446                 }
 2447                 if ((dbregs->dr[6] & 0xffffffff00000000ul) != 0 ||
 2448                     (dbregs->dr[7] & 0xffffffff00000000ul) != 0)
 2449                         return (EINVAL);
 2450 
 2451                 pcb = td->td_pcb;
 2452 
 2453                 /*
 2454                  * Don't let a process set a breakpoint that is not within the
 2455                  * process's address space.  If a process could do this, it
 2456                  * could halt the system by setting a breakpoint in the kernel
 2457                  * (if ddb was enabled).  Thus, we need to check to make sure
 2458                  * that no breakpoints are being enabled for addresses outside
 2459                  * process's address space.
 2460                  *
 2461                  * XXX - what about when the watched area of the user's
 2462                  * address space is written into from within the kernel
 2463                  * ... wouldn't that still cause a breakpoint to be generated
 2464                  * from within kernel mode?
 2465                  */
 2466 
 2467                 if (DBREG_DR7_ENABLED(dbregs->dr[7], 0)) {
 2468                         /* dr0 is enabled */
 2469                         if (dbregs->dr[0] >= VM_MAXUSER_ADDRESS)
 2470                                 return (EINVAL);
 2471                 }
 2472                 if (DBREG_DR7_ENABLED(dbregs->dr[7], 1)) {
 2473                         /* dr1 is enabled */
 2474                         if (dbregs->dr[1] >= VM_MAXUSER_ADDRESS)
 2475                                 return (EINVAL);
 2476                 }
 2477                 if (DBREG_DR7_ENABLED(dbregs->dr[7], 2)) {
 2478                         /* dr2 is enabled */
 2479                         if (dbregs->dr[2] >= VM_MAXUSER_ADDRESS)
 2480                                 return (EINVAL);
 2481                 }
 2482                 if (DBREG_DR7_ENABLED(dbregs->dr[7], 3)) {
 2483                         /* dr3 is enabled */
 2484                         if (dbregs->dr[3] >= VM_MAXUSER_ADDRESS)
 2485                                 return (EINVAL);
 2486                 }
 2487 
 2488                 pcb->pcb_dr0 = dbregs->dr[0];
 2489                 pcb->pcb_dr1 = dbregs->dr[1];
 2490                 pcb->pcb_dr2 = dbregs->dr[2];
 2491                 pcb->pcb_dr3 = dbregs->dr[3];
 2492                 pcb->pcb_dr6 = dbregs->dr[6];
 2493                 pcb->pcb_dr7 = dbregs->dr[7];
 2494 
 2495                 set_pcb_flags(pcb, PCB_DBREGS);
 2496         }
 2497 
 2498         return (0);
 2499 }
 2500 
/*
 * Clear all hardware debug registers on the current CPU.  dr7 is
 * cleared first so no breakpoint can trigger while the address
 * registers still hold stale values.
 */
void
reset_dbregs(void)
{

        load_dr7(0);    /* Turn off the control bits first */
        load_dr0(0);
        load_dr1(0);
        load_dr2(0);
        load_dr3(0);
        load_dr6(0);
}
 2512 
 2513 /*
 2514  * Return > 0 if a hardware breakpoint has been hit, and the
 2515  * breakpoint was in user space.  Return 0, otherwise.
 2516  */
 2517 int
 2518 user_dbreg_trap(register_t dr6)
 2519 {
 2520         u_int64_t dr7;
 2521         u_int64_t bp;       /* breakpoint bits extracted from dr6 */
 2522         int nbp;            /* number of breakpoints that triggered */
 2523         caddr_t addr[4];    /* breakpoint addresses */
 2524         int i;
 2525 
 2526         bp = dr6 & DBREG_DR6_BMASK;
 2527         if (bp == 0) {
 2528                 /*
 2529                  * None of the breakpoint bits are set meaning this
 2530                  * trap was not caused by any of the debug registers
 2531                  */
 2532                 return 0;
 2533         }
 2534 
 2535         dr7 = rdr7();
 2536         if ((dr7 & 0x000000ff) == 0) {
 2537                 /*
 2538                  * all GE and LE bits in the dr7 register are zero,
 2539                  * thus the trap couldn't have been caused by the
 2540                  * hardware debug registers
 2541                  */
 2542                 return 0;
 2543         }
 2544 
 2545         nbp = 0;
 2546 
 2547         /*
 2548          * at least one of the breakpoints were hit, check to see
 2549          * which ones and if any of them are user space addresses
 2550          */
 2551 
 2552         if (bp & 0x01) {
 2553                 addr[nbp++] = (caddr_t)rdr0();
 2554         }
 2555         if (bp & 0x02) {
 2556                 addr[nbp++] = (caddr_t)rdr1();
 2557         }
 2558         if (bp & 0x04) {
 2559                 addr[nbp++] = (caddr_t)rdr2();
 2560         }
 2561         if (bp & 0x08) {
 2562                 addr[nbp++] = (caddr_t)rdr3();
 2563         }
 2564 
 2565         for (i = 0; i < nbp; i++) {
 2566                 if (addr[i] < (caddr_t)VM_MAXUSER_ADDRESS) {
 2567                         /*
 2568                          * addr[i] is in user space
 2569                          */
 2570                         return nbp;
 2571                 }
 2572         }
 2573 
 2574         /*
 2575          * None of the breakpoints are in user space.
 2576          */
 2577         return 0;
 2578 }
 2579 
 2580 /*
 2581  * The pcb_flags is only modified by current thread, or by other threads
 2582  * when current thread is stopped.  However, current thread may change it
 2583  * from the interrupt context in cpu_switch(), or in the trap handler.
 2584  * When we read-modify-write pcb_flags from C sources, compiler may generate
 2585  * code that is not atomic regarding the interrupt handler.  If a trap or
 2586  * interrupt happens and any flag is modified from the handler, it can be
 2587  * clobbered with the cached value later.  Therefore, we implement setting
 2588  * and clearing flags with single-instruction functions, which do not race
 2589  * with possible modification of the flags from the trap or interrupt context,
 2590  * because traps and interrupts are executed only on instruction boundary.
 2591  */
/*
 * Set bits in pcb_flags with a single read-modify-write "orl"
 * instruction, which cannot be torn by a trap or interrupt on the same
 * CPU (see the atomicity discussion in the comment above).
 */
void
set_pcb_flags_raw(struct pcb *pcb, const u_int flags)
{

        __asm __volatile("orl %1,%0"
            : "=m" (pcb->pcb_flags) : "ir" (flags), "m" (pcb->pcb_flags)
            : "cc", "memory");

}
 2601 
 2602 /*
 2603  * The support for RDFSBASE, WRFSBASE and similar instructions for %gs
 2604  * base requires that kernel saves MSR_FSBASE and MSR_{K,}GSBASE into
 2605  * pcb if user space modified the bases.  We must save on the context
 2606  * switch or if the return to usermode happens through the doreti.
 2607  *
 2608  * Tracking of both events is performed by the pcb flag PCB_FULL_IRET,
 2609  * which have a consequence that the base MSRs must be saved each time
 2610  * the PCB_FULL_IRET flag is set.  We disable interrupts to sync with
 2611  * context switches.
 2612  */
void
set_pcb_flags(struct pcb *pcb, const u_int flags)
{
        register_t r;

        /*
         * First setting of PCB_FULL_IRET on the current pcb, on a CPU
         * with FSGSBASE: user space may have changed the bases with
         * WRFSBASE/WRGSBASE, so snapshot them into the pcb before the
         * flag takes effect (see the comment above this function).
         */
        if (curpcb == pcb &&
            (flags & PCB_FULL_IRET) != 0 &&
            (pcb->pcb_flags & PCB_FULL_IRET) == 0 &&
            (cpu_stdext_feature & CPUID_STDEXT_FSGSBASE) != 0) {
                /* Interrupts off: sync with context switches. */
                r = intr_disable();
                /* Re-check the flag now that we cannot be preempted. */
                if ((pcb->pcb_flags & PCB_FULL_IRET) == 0) {
                        /* Only save bases still holding user selectors. */
                        if (rfs() == _ufssel)
                                pcb->pcb_fsbase = rdfsbase();
                        if (rgs() == _ugssel)
                                pcb->pcb_gsbase = rdmsr(MSR_KGSBASE);
                }
                set_pcb_flags_raw(pcb, flags);
                intr_restore(r);
        } else {
                set_pcb_flags_raw(pcb, flags);
        }
}
 2635 
/*
 * Clear bits in pcb_flags with a single "andl" of the complemented
 * mask; like set_pcb_flags_raw(), this cannot race a trap or interrupt
 * on the same CPU.
 */
void
clear_pcb_flags(struct pcb *pcb, const u_int flags)
{

        __asm __volatile("andl %1,%0"
            : "=m" (pcb->pcb_flags) : "ir" (~flags), "m" (pcb->pcb_flags)
            : "cc", "memory");
}
 2644 
 2645 #ifdef KDB
 2646 
 2647 /*
 2648  * Provide inb() and outb() as functions.  They are normally only available as
 2649  * inline functions, thus cannot be called from the debugger.
 2650  */
 2651 
 2652 /* silence compiler warnings */
 2653 u_char inb_(u_short);
 2654 void outb_(u_short, u_char);
 2655 
 2656 u_char
 2657 inb_(u_short port)
 2658 {
 2659         return inb(port);
 2660 }
 2661 
 2662 void
 2663 outb_(u_short port, u_char data)
 2664 {
 2665         outb(port, data);
 2666 }
 2667 
 2668 #endif /* KDB */

Cache object: 82046fd8e8b0a26f5503a2d9433a07a9


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.