The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/amd64/amd64/machdep.c

Version: -  FREEBSD  -  FREEBSD-12-STABLE  -  FREEBSD-12-0  -  FREEBSD-11-STABLE  -  FREEBSD-11-2  -  FREEBSD-11-1  -  FREEBSD-11-0  -  FREEBSD-10-STABLE  -  FREEBSD-10-4  -  FREEBSD-10-3  -  FREEBSD-10-2  -  FREEBSD-10-1  -  FREEBSD-10-0  -  FREEBSD-9-STABLE  -  FREEBSD-9-3  -  FREEBSD-9-2  -  FREEBSD-9-1  -  FREEBSD-9-0  -  FREEBSD-8-STABLE  -  FREEBSD-8-4  -  FREEBSD-8-3  -  FREEBSD-8-2  -  FREEBSD-8-1  -  FREEBSD-8-0  -  FREEBSD-7-STABLE  -  FREEBSD-7-4  -  FREEBSD-7-3  -  FREEBSD-7-2  -  FREEBSD-7-1  -  FREEBSD-7-0  -  FREEBSD-6-STABLE  -  FREEBSD-6-4  -  FREEBSD-6-3  -  FREEBSD-6-2  -  FREEBSD-6-1  -  FREEBSD-6-0  -  FREEBSD-5-STABLE  -  FREEBSD-5-5  -  FREEBSD-5-4  -  FREEBSD-5-3  -  FREEBSD-5-2  -  FREEBSD-5-1  -  FREEBSD-5-0  -  FREEBSD-4-STABLE  -  FREEBSD-3-STABLE  -  FREEBSD22  -  linux-2.6  -  linux-2.4.22  -  MK83  -  MK84  -  PLAN9  -  DFBSD  -  NETBSD  -  NETBSD5  -  NETBSD4  -  NETBSD3  -  NETBSD20  -  OPENBSD  -  xnu-517  -  xnu-792  -  xnu-792.6.70  -  xnu-1228  -  xnu-1456.1.26  -  xnu-1699.24.8  -  xnu-2050.18.24  -  OPENSOLARIS  -  minix-3-1-1 
SearchContext: -  none  -  3  -  10 

    1 /*-
    2  * Copyright (c) 2003 Peter Wemm.
    3  * Copyright (c) 1992 Terrence R. Lambert.
    4  * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
    5  * All rights reserved.
    6  *
    7  * This code is derived from software contributed to Berkeley by
    8  * William Jolitz.
    9  *
   10  * Redistribution and use in source and binary forms, with or without
   11  * modification, are permitted provided that the following conditions
   12  * are met:
   13  * 1. Redistributions of source code must retain the above copyright
   14  *    notice, this list of conditions and the following disclaimer.
   15  * 2. Redistributions in binary form must reproduce the above copyright
   16  *    notice, this list of conditions and the following disclaimer in the
   17  *    documentation and/or other materials provided with the distribution.
   18  * 3. All advertising materials mentioning features or use of this software
   19  *    must display the following acknowledgement:
   20  *      This product includes software developed by the University of
   21  *      California, Berkeley and its contributors.
   22  * 4. Neither the name of the University nor the names of its contributors
   23  *    may be used to endorse or promote products derived from this software
   24  *    without specific prior written permission.
   25  *
   26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   36  * SUCH DAMAGE.
   37  *
   38  *      from: @(#)machdep.c     7.4 (Berkeley) 6/3/91
   39  */
   40 
   41 #include <sys/cdefs.h>
   42 __FBSDID("$FreeBSD: stable/11/sys/amd64/amd64/machdep.c 362383 2020-06-19 13:48:23Z kib $");
   43 
   44 #include "opt_atpic.h"
   45 #include "opt_compat.h"
   46 #include "opt_cpu.h"
   47 #include "opt_ddb.h"
   48 #include "opt_inet.h"
   49 #include "opt_isa.h"
   50 #include "opt_kstack_pages.h"
   51 #include "opt_maxmem.h"
   52 #include "opt_mp_watchdog.h"
   53 #include "opt_perfmon.h"
   54 #include "opt_platform.h"
   55 #include "opt_sched.h"
   56 
   57 #include <sys/param.h>
   58 #include <sys/proc.h>
   59 #include <sys/systm.h>
   60 #include <sys/bio.h>
   61 #include <sys/buf.h>
   62 #include <sys/bus.h>
   63 #include <sys/callout.h>
   64 #include <sys/cons.h>
   65 #include <sys/cpu.h>
   66 #include <sys/efi.h>
   67 #include <sys/eventhandler.h>
   68 #include <sys/exec.h>
   69 #include <sys/imgact.h>
   70 #include <sys/kdb.h>
   71 #include <sys/kernel.h>
   72 #include <sys/ktr.h>
   73 #include <sys/linker.h>
   74 #include <sys/lock.h>
   75 #include <sys/malloc.h>
   76 #include <sys/memrange.h>
   77 #include <sys/msgbuf.h>
   78 #include <sys/mutex.h>
   79 #include <sys/pcpu.h>
   80 #include <sys/ptrace.h>
   81 #include <sys/reboot.h>
   82 #include <sys/rwlock.h>
   83 #include <sys/sched.h>
   84 #include <sys/signalvar.h>
   85 #ifdef SMP
   86 #include <sys/smp.h>
   87 #endif
   88 #include <sys/syscallsubr.h>
   89 #include <sys/sysctl.h>
   90 #include <sys/sysent.h>
   91 #include <sys/sysproto.h>
   92 #include <sys/ucontext.h>
   93 #include <sys/vmmeter.h>
   94 
   95 #include <vm/vm.h>
   96 #include <vm/vm_extern.h>
   97 #include <vm/vm_kern.h>
   98 #include <vm/vm_page.h>
   99 #include <vm/vm_map.h>
  100 #include <vm/vm_object.h>
  101 #include <vm/vm_pager.h>
  102 #include <vm/vm_param.h>
  103 #include <vm/vm_phys.h>
  104 
  105 #ifdef DDB
  106 #ifndef KDB
  107 #error KDB must be enabled in order for DDB to work!
  108 #endif
  109 #include <ddb/ddb.h>
  110 #include <ddb/db_sym.h>
  111 #endif
  112 
  113 #include <net/netisr.h>
  114 
  115 #include <machine/clock.h>
  116 #include <machine/cpu.h>
  117 #include <machine/cputypes.h>
  118 #include <machine/frame.h>
  119 #include <machine/intr_machdep.h>
  120 #include <x86/mca.h>
  121 #include <machine/md_var.h>
  122 #include <machine/metadata.h>
  123 #include <machine/mp_watchdog.h>
  124 #include <machine/pc/bios.h>
  125 #include <machine/pcb.h>
  126 #include <machine/proc.h>
  127 #include <machine/reg.h>
  128 #include <machine/sigframe.h>
  129 #include <machine/specialreg.h>
  130 #ifdef PERFMON
  131 #include <machine/perfmon.h>
  132 #endif
  133 #include <machine/tss.h>
  134 #include <x86/ucode.h>
  135 #ifdef SMP
  136 #include <machine/smp.h>
  137 #endif
  138 #ifdef FDT
  139 #include <x86/fdt.h>
  140 #endif
  141 
  142 #ifdef DEV_ATPIC
  143 #include <x86/isa/icu.h>
  144 #else
  145 #include <x86/apicvar.h>
  146 #endif
  147 
  148 #include <isa/isareg.h>
  149 #include <isa/rtc.h>
  150 #include <x86/init.h>
  151 
/* Sanity check for __curthread() */
CTASSERT(offsetof(struct pcpu, pc_curthread) == 0);

/*
 * The PTI trampoline stack needs enough space for a hardware trapframe and a
 * couple of scratch registers, as well as the trapframe left behind after an
 * iret fault.
 */
CTASSERT(PC_PTI_STACK_SZ * sizeof(register_t) >= 2 * sizeof(struct pti_frame) -
    offsetof(struct pti_frame, pti_rip));

/* Early MD boot entry point; definition is not visible in this file chunk. */
extern u_int64_t hammer_time(u_int64_t, u_int64_t);

/* True when the code selector's privilege level is user (ring 3). */
#define CS_SECURE(cs)           (ISPL(cs) == SEL_UPL)
/* True when ef and oef differ only in user-changeable rflags bits. */
#define EFL_SECURE(ef, oef)     ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)

static void cpu_startup(void *);
/* Capture / restore extended FPU state for an mcontext (see sendsig below). */
static void get_fpcontext(struct thread *td, mcontext_t *mcp,
    char *xfpusave, size_t xfpusave_len);
static int  set_fpcontext(struct thread *td, mcontext_t *mcp,
    char *xfpustate, size_t xfpustate_len);
/* Run cpu_startup() first within the SI_SUB_CPU boot stage. */
SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL);

/* Preload data parse function */
static caddr_t native_parse_preload_data(u_int64_t);

/* Native function to fetch and parse the e820 map */
static void native_parse_memmap(caddr_t, vm_paddr_t *, int *);
/*
 * Default init_ops implementation.  NOTE(review): the "native_" names
 * suggest these are the bare-metal handlers and that platform/hypervisor
 * code may override this table — confirm against x86/init.h users.
 */
struct init_ops init_ops = {
        .parse_preload_data =   native_parse_preload_data,
        .early_clock_source_init =      i8254_init,
        .early_delay =                  i8254_delay,
        .parse_memmap =                 native_parse_memmap,
#ifdef SMP
        .mp_bootaddress =               mp_bootaddress,
        .start_all_aps =                native_start_all_aps,
#endif
        .msi_init =                     msi_init,
};
  193 
/* Kernel message buffer (see sys/msgbuf.h). */
struct msgbuf *msgbufp;

/*
 * Physical address of the EFI System Table. Stashed from the metadata hints
 * passed into the kernel and used by the EFI code to call runtime services.
 */
vm_paddr_t efi_systbl_phys;
  202 /* Intel ICH registers */
  203 #define ICH_PMBASE      0x400
  204 #define ICH_SMI_EN      ICH_PMBASE + 0x30
  205 
/* User-mode segment selectors, filled in during GDT setup. */
int     _udatasel, _ucodesel, _ucode32sel, _ufssel, _ugssel;

/* Nonzero while the system is still in early boot ("cold"). */
int cold = 1;

long Maxmem = 0;        /* highest usable physical page count */
long realmem = 0;       /* real (reported) memory, in pages */

/*
 * The number of PHYSMAP entries must be one less than the number of
 * PHYSSEG entries because the PHYSMAP entry that spans the largest
 * physical address that is accessible by ISA DMA is split into two
 * PHYSSEG entries.
 */
#define PHYSMAP_SIZE    (2 * (VM_PHYSSEG_MAX - 1))

/* (start, end) physical address pairs; a 0,0 pair terminates the list. */
vm_paddr_t phys_avail[PHYSMAP_SIZE + 2];
vm_paddr_t dump_avail[PHYSMAP_SIZE + 2];

/* must be 2 less so 0 0 can signal end of chunks */
#define PHYS_AVAIL_ARRAY_END (nitems(phys_avail) - 2)
#define DUMP_AVAIL_ARRAY_END (nitems(dump_avail) - 2)

struct kva_md_info kmi;

static struct trapframe proc0_tf;
/* Pseudo-descriptors loaded into GDTR/IDTR. */
struct region_descriptor r_gdt, r_idt;

struct pcpu __pcpu[MAXCPU];

struct mtx icu_lock;

struct mem_range_softc mem_range_softc;

struct mtx dt_lock;     /* lock for GDT and LDT */

/* Hook invoked on resume when a hypervisor module (vmm) is loaded. */
void (*vmm_resume_p)(void);
  243 static void
  244 cpu_startup(dummy)
  245         void *dummy;
  246 {
  247         uintmax_t memsize;
  248         char *sysenv;
  249 
  250         /*
  251          * On MacBooks, we need to disallow the legacy USB circuit to
  252          * generate an SMI# because this can cause several problems,
  253          * namely: incorrect CPU frequency detection and failure to
  254          * start the APs.
  255          * We do this by disabling a bit in the SMI_EN (SMI Control and
  256          * Enable register) of the Intel ICH LPC Interface Bridge. 
  257          */
  258         sysenv = kern_getenv("smbios.system.product");
  259         if (sysenv != NULL) {
  260                 if (strncmp(sysenv, "MacBook1,1", 10) == 0 ||
  261                     strncmp(sysenv, "MacBook3,1", 10) == 0 ||
  262                     strncmp(sysenv, "MacBook4,1", 10) == 0 ||
  263                     strncmp(sysenv, "MacBookPro1,1", 13) == 0 ||
  264                     strncmp(sysenv, "MacBookPro1,2", 13) == 0 ||
  265                     strncmp(sysenv, "MacBookPro3,1", 13) == 0 ||
  266                     strncmp(sysenv, "MacBookPro4,1", 13) == 0 ||
  267                     strncmp(sysenv, "Macmini1,1", 10) == 0) {
  268                         if (bootverbose)
  269                                 printf("Disabling LEGACY_USB_EN bit on "
  270                                     "Intel ICH.\n");
  271                         outl(ICH_SMI_EN, inl(ICH_SMI_EN) & ~0x8);
  272                 }
  273                 freeenv(sysenv);
  274         }
  275 
  276         /*
  277          * Good {morning,afternoon,evening,night}.
  278          */
  279         startrtclock();
  280         printcpuinfo();
  281 #ifdef PERFMON
  282         perfmon_init();
  283 #endif
  284 
  285         /*
  286          * Display physical memory if SMBIOS reports reasonable amount.
  287          */
  288         memsize = 0;
  289         sysenv = kern_getenv("smbios.memory.enabled");
  290         if (sysenv != NULL) {
  291                 memsize = (uintmax_t)strtoul(sysenv, (char **)NULL, 10) << 10;
  292                 freeenv(sysenv);
  293         }
  294         if (memsize < ptoa((uintmax_t)vm_cnt.v_free_count))
  295                 memsize = ptoa((uintmax_t)Maxmem);
  296         printf("real memory  = %ju (%ju MB)\n", memsize, memsize >> 20);
  297         realmem = atop(memsize);
  298 
  299         /*
  300          * Display any holes after the first chunk of extended memory.
  301          */
  302         if (bootverbose) {
  303                 int indx;
  304 
  305                 printf("Physical memory chunk(s):\n");
  306                 for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) {
  307                         vm_paddr_t size;
  308 
  309                         size = phys_avail[indx + 1] - phys_avail[indx];
  310                         printf(
  311                             "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n",
  312                             (uintmax_t)phys_avail[indx],
  313                             (uintmax_t)phys_avail[indx + 1] - 1,
  314                             (uintmax_t)size, (uintmax_t)size / PAGE_SIZE);
  315                 }
  316         }
  317 
  318         vm_ksubmap_init(&kmi);
  319 
  320         printf("avail memory = %ju (%ju MB)\n",
  321             ptoa((uintmax_t)vm_cnt.v_free_count),
  322             ptoa((uintmax_t)vm_cnt.v_free_count) / 1048576);
  323 
  324         /*
  325          * Set up buffers, so they can be used to read disk labels.
  326          */
  327         bufinit();
  328         vm_pager_bufferinit();
  329 
  330         cpu_setregs();
  331 }
  332 
/*
 * Send an interrupt to process.
 *
 * Stack is set up to allow sigcode stored
 * at top to call routine, followed by call
 * to sigreturn routine below.  After sigreturn
 * resets the signal mask, the stack, and the
 * frame pointer, it returns to the user
 * specified pc, psl.
 *
 * Called with the proc lock and ps_mtx held (asserted below); both are
 * dropped around the copyout and reacquired before returning.
 */
void
sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
{
        struct sigframe sf, *sfp;
        struct pcb *pcb;
        struct proc *p;
        struct thread *td;
        struct sigacts *psp;
        char *sp;
        struct trapframe *regs;
        char *xfpusave;
        size_t xfpusave_len;
        int sig;
        int oonstack;

        td = curthread;
        pcb = td->td_pcb;
        p = td->td_proc;
        PROC_LOCK_ASSERT(p, MA_OWNED);
        sig = ksi->ksi_signo;
        psp = p->p_sigacts;
        mtx_assert(&psp->ps_mtx, MA_OWNED);
        regs = td->td_frame;
        oonstack = sigonstack(regs->tf_rsp);

        /*
         * When XSAVE is in use, reserve kernel stack space for the
         * extended FPU state beyond the legacy savefpu area so it can
         * be copied out alongside the sigframe.
         */
        if (cpu_max_ext_state_size > sizeof(struct savefpu) && use_xsave) {
                xfpusave_len = cpu_max_ext_state_size - sizeof(struct savefpu);
                xfpusave = __builtin_alloca(xfpusave_len);
        } else {
                xfpusave_len = 0;
                xfpusave = NULL;
        }

        /* Save user context. */
        bzero(&sf, sizeof(sf));
        sf.sf_uc.uc_sigmask = *mask;
        sf.sf_uc.uc_stack = td->td_sigstk;
        sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
            ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE;
        sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0;
        /* The trapframe layout matches mcontext starting at mc_rdi. */
        bcopy(regs, &sf.sf_uc.uc_mcontext.mc_rdi, sizeof(*regs));
        sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext); /* magic */
        get_fpcontext(td, &sf.sf_uc.uc_mcontext, xfpusave, xfpusave_len);
        fpstate_drop(td);
        update_pcb_bases(pcb);
        sf.sf_uc.uc_mcontext.mc_fsbase = pcb->pcb_fsbase;
        sf.sf_uc.uc_mcontext.mc_gsbase = pcb->pcb_gsbase;
        bzero(sf.sf_uc.uc_mcontext.mc_spare,
            sizeof(sf.sf_uc.uc_mcontext.mc_spare));
        bzero(sf.sf_uc.__spare__, sizeof(sf.sf_uc.__spare__));

        /* Allocate space for the signal handler context. */
        if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack &&
            SIGISMEMBER(psp->ps_sigonstack, sig)) {
                sp = (char *)td->td_sigstk.ss_sp + td->td_sigstk.ss_size;
#if defined(COMPAT_43)
                td->td_sigstk.ss_flags |= SS_ONSTACK;
#endif
        } else
                /* Skip 128 bytes below %rsp (the amd64 ABI red zone). */
                sp = (char *)regs->tf_rsp - 128;
        if (xfpusave != NULL) {
                sp -= xfpusave_len;
                /* The extended FPU save area must be 64-byte aligned. */
                sp = (char *)((unsigned long)sp & ~0x3Ful);
                sf.sf_uc.uc_mcontext.mc_xfpustate = (register_t)sp;
        }
        sp -= sizeof(struct sigframe);
        /* Align to 16 bytes. */
        sfp = (struct sigframe *)((unsigned long)sp & ~0xFul);

        /* Build the argument list for the signal handler. */
        regs->tf_rdi = sig;                     /* arg 1 in %rdi */
        regs->tf_rdx = (register_t)&sfp->sf_uc; /* arg 3 in %rdx */
        bzero(&sf.sf_si, sizeof(sf.sf_si));
        if (SIGISMEMBER(psp->ps_siginfo, sig)) {
                /* Signal handler installed with SA_SIGINFO. */
                regs->tf_rsi = (register_t)&sfp->sf_si; /* arg 2 in %rsi */
                sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher;

                /* Fill in POSIX parts */
                sf.sf_si = ksi->ksi_info;
                sf.sf_si.si_signo = sig; /* maybe a translated signal */
                regs->tf_rcx = (register_t)ksi->ksi_addr; /* arg 4 in %rcx */
        } else {
                /* Old FreeBSD-style arguments. */
                regs->tf_rsi = ksi->ksi_code;   /* arg 2 in %rsi */
                regs->tf_rcx = (register_t)ksi->ksi_addr; /* arg 4 in %rcx */
                sf.sf_ahu.sf_handler = catcher;
        }
        /* Drop the locks across the (sleepable) copyout. */
        mtx_unlock(&psp->ps_mtx);
        PROC_UNLOCK(p);

        /*
         * Copy the sigframe out to the user's stack.
         * A failure here means the stack is unusable; kill the process.
         */
        if (copyout(&sf, sfp, sizeof(*sfp)) != 0 ||
            (xfpusave != NULL && copyout(xfpusave,
            (void *)sf.sf_uc.uc_mcontext.mc_xfpustate, xfpusave_len)
            != 0)) {
#ifdef DEBUG
                printf("process %ld has trashed its stack\n", (long)p->p_pid);
#endif
                PROC_LOCK(p);
                sigexit(td, SIGILL);
        }

        /* Redirect userland to the signal trampoline with a clean frame. */
        regs->tf_rsp = (long)sfp;
        regs->tf_rip = p->p_sysent->sv_sigcode_base;
        regs->tf_rflags &= ~(PSL_T | PSL_D);
        regs->tf_cs = _ucodesel;
        regs->tf_ds = _udatasel;
        regs->tf_ss = _udatasel;
        regs->tf_es = _udatasel;
        regs->tf_fs = _ufssel;
        regs->tf_gs = _ugssel;
        regs->tf_flags = TF_HASSEGS;
        PROC_LOCK(p);
        mtx_lock(&psp->ps_mtx);
}
  461 
  462 /*
  463  * System call to cleanup state after a signal
  464  * has been taken.  Reset signal mask and
  465  * stack state from context left by sendsig (above).
  466  * Return to previous pc and psl as specified by
  467  * context left by sendsig. Check carefully to
  468  * make sure that the user has not modified the
  469  * state to gain improper privileges.
  470  *
  471  * MPSAFE
  472  */
  473 int
  474 sys_sigreturn(td, uap)
  475         struct thread *td;
  476         struct sigreturn_args /* {
  477                 const struct __ucontext *sigcntxp;
  478         } */ *uap;
  479 {
  480         ucontext_t uc;
  481         struct pcb *pcb;
  482         struct proc *p;
  483         struct trapframe *regs;
  484         ucontext_t *ucp;
  485         char *xfpustate;
  486         size_t xfpustate_len;
  487         long rflags;
  488         int cs, error, ret;
  489         ksiginfo_t ksi;
  490 
  491         pcb = td->td_pcb;
  492         p = td->td_proc;
  493 
  494         error = copyin(uap->sigcntxp, &uc, sizeof(uc));
  495         if (error != 0) {
  496                 uprintf("pid %d (%s): sigreturn copyin failed\n",
  497                     p->p_pid, td->td_name);
  498                 return (error);
  499         }
  500         ucp = &uc;
  501         if ((ucp->uc_mcontext.mc_flags & ~_MC_FLAG_MASK) != 0) {
  502                 uprintf("pid %d (%s): sigreturn mc_flags %x\n", p->p_pid,
  503                     td->td_name, ucp->uc_mcontext.mc_flags);
  504                 return (EINVAL);
  505         }
  506         regs = td->td_frame;
  507         rflags = ucp->uc_mcontext.mc_rflags;
  508         /*
  509          * Don't allow users to change privileged or reserved flags.
  510          */
  511         if (!EFL_SECURE(rflags, regs->tf_rflags)) {
  512                 uprintf("pid %d (%s): sigreturn rflags = 0x%lx\n", p->p_pid,
  513                     td->td_name, rflags);
  514                 return (EINVAL);
  515         }
  516 
  517         /*
  518          * Don't allow users to load a valid privileged %cs.  Let the
  519          * hardware check for invalid selectors, excess privilege in
  520          * other selectors, invalid %eip's and invalid %esp's.
  521          */
  522         cs = ucp->uc_mcontext.mc_cs;
  523         if (!CS_SECURE(cs)) {
  524                 uprintf("pid %d (%s): sigreturn cs = 0x%x\n", p->p_pid,
  525                     td->td_name, cs);
  526                 ksiginfo_init_trap(&ksi);
  527                 ksi.ksi_signo = SIGBUS;
  528                 ksi.ksi_code = BUS_OBJERR;
  529                 ksi.ksi_trapno = T_PROTFLT;
  530                 ksi.ksi_addr = (void *)regs->tf_rip;
  531                 trapsignal(td, &ksi);
  532                 return (EINVAL);
  533         }
  534 
  535         if ((uc.uc_mcontext.mc_flags & _MC_HASFPXSTATE) != 0) {
  536                 xfpustate_len = uc.uc_mcontext.mc_xfpustate_len;
  537                 if (xfpustate_len > cpu_max_ext_state_size -
  538                     sizeof(struct savefpu)) {
  539                         uprintf("pid %d (%s): sigreturn xfpusave_len = 0x%zx\n",
  540                             p->p_pid, td->td_name, xfpustate_len);
  541                         return (EINVAL);
  542                 }
  543                 xfpustate = __builtin_alloca(xfpustate_len);
  544                 error = copyin((const void *)uc.uc_mcontext.mc_xfpustate,
  545                     xfpustate, xfpustate_len);
  546                 if (error != 0) {
  547                         uprintf(
  548         "pid %d (%s): sigreturn copying xfpustate failed\n",
  549                             p->p_pid, td->td_name);
  550                         return (error);
  551                 }
  552         } else {
  553                 xfpustate = NULL;
  554                 xfpustate_len = 0;
  555         }
  556         ret = set_fpcontext(td, &ucp->uc_mcontext, xfpustate, xfpustate_len);
  557         if (ret != 0) {
  558                 uprintf("pid %d (%s): sigreturn set_fpcontext err %d\n",
  559                     p->p_pid, td->td_name, ret);
  560                 return (ret);
  561         }
  562         bcopy(&ucp->uc_mcontext.mc_rdi, regs, sizeof(*regs));
  563         update_pcb_bases(pcb);
  564         pcb->pcb_fsbase = ucp->uc_mcontext.mc_fsbase;
  565         pcb->pcb_gsbase = ucp->uc_mcontext.mc_gsbase;
  566 
  567 #if defined(COMPAT_43)
  568         if (ucp->uc_mcontext.mc_onstack & 1)
  569                 td->td_sigstk.ss_flags |= SS_ONSTACK;
  570         else
  571                 td->td_sigstk.ss_flags &= ~SS_ONSTACK;
  572 #endif
  573 
  574         kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0);
  575         return (EJUSTRETURN);
  576 }
  577 
  578 #ifdef COMPAT_FREEBSD4
  579 int
  580 freebsd4_sigreturn(struct thread *td, struct freebsd4_sigreturn_args *uap)
  581 {
  582  
  583         return sys_sigreturn(td, (struct sigreturn_args *)uap);
  584 }
  585 #endif
  586 
/*
 * Reset registers to default values on exec.
 *
 * Frees any per-process LDT, clears the pcb segment bases and FPU
 * setup, and rebuilds the trapframe so the new image starts at its
 * entry point with a clean user register state.
 */
void
exec_setregs(struct thread *td, struct image_params *imgp, u_long stack)
{
        struct trapframe *regs;
        struct pcb *pcb;
        register_t saved_rflags;

        regs = td->td_frame;
        pcb = td->td_pcb;

        /*
         * NOTE(review): the unlock is asymmetric on purpose —
         * user_ldt_free() appears to drop dt_lock itself; confirm
         * against its definition before restructuring.
         */
        mtx_lock(&dt_lock);
        if (td->td_proc->p_md.md_ldt != NULL)
                user_ldt_free(td);
        else
                mtx_unlock(&dt_lock);

        update_pcb_bases(pcb);
        pcb->pcb_fsbase = 0;
        pcb->pcb_gsbase = 0;
        clear_pcb_flags(pcb, PCB_32BIT);
        pcb->pcb_initial_fpucw = __INITIAL_FPUCW__;

        /* Preserve only the trace flag (PSL_T) across the exec. */
        saved_rflags = regs->tf_rflags & PSL_T;
        bzero((char *)regs, sizeof(struct trapframe));
        regs->tf_rip = imgp->entry_addr;
        /* %rsp is 8 below a 16-byte boundary, as at a function call. */
        regs->tf_rsp = ((stack - 8) & ~0xFul) + 8;
        regs->tf_rdi = stack;           /* argv */
        regs->tf_rflags = PSL_USER | saved_rflags;
        regs->tf_ss = _udatasel;
        regs->tf_cs = _ucodesel;
        regs->tf_ds = _udatasel;
        regs->tf_es = _udatasel;
        regs->tf_fs = _ufssel;
        regs->tf_gs = _ugssel;
        regs->tf_flags = TF_HASSEGS;
        td->td_retval[1] = 0;

        /*
         * Reset the hardware debug registers if they were in use.
         * They won't have any meaning for the newly exec'd process.
         */
        if (pcb->pcb_flags & PCB_DBREGS) {
                pcb->pcb_dr0 = 0;
                pcb->pcb_dr1 = 0;
                pcb->pcb_dr2 = 0;
                pcb->pcb_dr3 = 0;
                pcb->pcb_dr6 = 0;
                pcb->pcb_dr7 = 0;
                if (pcb == curpcb) {
                        /*
                         * Clear the debug registers on the running
                         * CPU, otherwise they will end up affecting
                         * the next process we switch to.
                         */
                        reset_dbregs();
                }
                clear_pcb_flags(pcb, PCB_DBREGS);
        }

        /*
         * Drop the FP state if we hold it, so that the process gets a
         * clean FP state if it uses the FPU again.
         */
        fpstate_drop(td);
}
  655 
  656 void
  657 cpu_setregs(void)
  658 {
  659         register_t cr0;
  660 
  661         cr0 = rcr0();
  662         /*
  663          * CR0_MP, CR0_NE and CR0_TS are also set by npx_probe() for the
  664          * BSP.  See the comments there about why we set them.
  665          */
  666         cr0 |= CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM;
  667         load_cr0(cr0);
  668 }
  669 
  670 /*
  671  * Initialize amd64 and configure to run kernel
  672  */
  673 
  674 /*
  675  * Initialize segments & interrupt table
  676  */
  677 
struct user_segment_descriptor gdt[NGDT * MAXCPU];/* global descriptor tables */
static struct gate_descriptor idt0[NIDT];
struct gate_descriptor *idt = &idt0[0]; /* interrupt descriptor table */

/* Dedicated IST stacks for faults that must not reuse the thread stack. */
static char dblfault_stack[PAGE_SIZE] __aligned(16);
static char mce0_stack[PAGE_SIZE] __aligned(16);
static char nmi0_stack[PAGE_SIZE] __aligned(16);
static char dbg0_stack[PAGE_SIZE] __aligned(16);
/* The NMI per-CPU save area layout assumed by assembly is 16 bytes. */
CTASSERT(sizeof(struct nmi_pcpu) == 16);

struct amd64tss common_tss[MAXCPU];
  689 
/*
 * Software prototypes -- in more palatable form.
 *
 * Keep GUFS32, GUGS32, GUCODE32 and GUDATA at the same
 * slots as corresponding segments for i386 kernel.
 *
 * NOTE(review): the GUFS32/GUGS32 entry comments below each name the
 * opposite register (%gs for GUFS32, %fs for GUGS32).  This matches the
 * upstream source; verify whether it reflects an intentional fs/gs swap
 * in 32-bit compat before "fixing" the labels.
 */
struct soft_segment_descriptor gdt_segs[] = {
/* GNULL_SEL    0 Null Descriptor */
{       .ssd_base = 0x0,
        .ssd_limit = 0x0,
        .ssd_type = 0,
        .ssd_dpl = 0,
        .ssd_p = 0,
        .ssd_long = 0,
        .ssd_def32 = 0,
        .ssd_gran = 0           },
/* GNULL2_SEL   1 Null Descriptor */
{       .ssd_base = 0x0,
        .ssd_limit = 0x0,
        .ssd_type = 0,
        .ssd_dpl = 0,
        .ssd_p = 0,
        .ssd_long = 0,
        .ssd_def32 = 0,
        .ssd_gran = 0           },
/* GUFS32_SEL   2 32 bit %gs Descriptor for user */
{       .ssd_base = 0x0,
        .ssd_limit = 0xfffff,
        .ssd_type = SDT_MEMRWA,
        .ssd_dpl = SEL_UPL,
        .ssd_p = 1,
        .ssd_long = 0,
        .ssd_def32 = 1,
        .ssd_gran = 1           },
/* GUGS32_SEL   3 32 bit %fs Descriptor for user */
{       .ssd_base = 0x0,
        .ssd_limit = 0xfffff,
        .ssd_type = SDT_MEMRWA,
        .ssd_dpl = SEL_UPL,
        .ssd_p = 1,
        .ssd_long = 0,
        .ssd_def32 = 1,
        .ssd_gran = 1           },
/* GCODE_SEL    4 Code Descriptor for kernel */
{       .ssd_base = 0x0,
        .ssd_limit = 0xfffff,
        .ssd_type = SDT_MEMERA,
        .ssd_dpl = SEL_KPL,
        .ssd_p = 1,
        .ssd_long = 1,
        .ssd_def32 = 0,
        .ssd_gran = 1           },
/* GDATA_SEL    5 Data Descriptor for kernel */
{       .ssd_base = 0x0,
        .ssd_limit = 0xfffff,
        .ssd_type = SDT_MEMRWA,
        .ssd_dpl = SEL_KPL,
        .ssd_p = 1,
        .ssd_long = 1,
        .ssd_def32 = 0,
        .ssd_gran = 1           },
/* GUCODE32_SEL 6 32 bit Code Descriptor for user */
{       .ssd_base = 0x0,
        .ssd_limit = 0xfffff,
        .ssd_type = SDT_MEMERA,
        .ssd_dpl = SEL_UPL,
        .ssd_p = 1,
        .ssd_long = 0,
        .ssd_def32 = 1,
        .ssd_gran = 1           },
/* GUDATA_SEL   7 32/64 bit Data Descriptor for user */
{       .ssd_base = 0x0,
        .ssd_limit = 0xfffff,
        .ssd_type = SDT_MEMRWA,
        .ssd_dpl = SEL_UPL,
        .ssd_p = 1,
        .ssd_long = 0,
        .ssd_def32 = 1,
        .ssd_gran = 1           },
/* GUCODE_SEL   8 64 bit Code Descriptor for user */
{       .ssd_base = 0x0,
        .ssd_limit = 0xfffff,
        .ssd_type = SDT_MEMERA,
        .ssd_dpl = SEL_UPL,
        .ssd_p = 1,
        .ssd_long = 1,
        .ssd_def32 = 0,
        .ssd_gran = 1           },
/* GPROC0_SEL   9 Proc 0 Tss Descriptor */
{       .ssd_base = 0x0,
        .ssd_limit = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE - 1,
        .ssd_type = SDT_SYSTSS,
        .ssd_dpl = SEL_KPL,
        .ssd_p = 1,
        .ssd_long = 0,
        .ssd_def32 = 0,
        .ssd_gran = 0           },
/* Actually, the TSS is a system descriptor which is double size */
{       .ssd_base = 0x0,
        .ssd_limit = 0x0,
        .ssd_type = 0,
        .ssd_dpl = 0,
        .ssd_p = 0,
        .ssd_long = 0,
        .ssd_def32 = 0,
        .ssd_gran = 0           },
/* GUSERLDT_SEL 11 LDT Descriptor */
{       .ssd_base = 0x0,
        .ssd_limit = 0x0,
        .ssd_type = 0,
        .ssd_dpl = 0,
        .ssd_p = 0,
        .ssd_long = 0,
        .ssd_def32 = 0,
        .ssd_gran = 0           },
/* GUSERLDT_SEL 12 LDT Descriptor, double size */
{       .ssd_base = 0x0,
        .ssd_limit = 0x0,
        .ssd_type = 0,
        .ssd_dpl = 0,
        .ssd_p = 0,
        .ssd_long = 0,
        .ssd_def32 = 0,
        .ssd_gran = 0           },
};
  815 
  816 void
  817 setidt(int idx, inthand_t *func, int typ, int dpl, int ist)
  818 {
  819         struct gate_descriptor *ip;
  820 
  821         ip = idt + idx;
  822         ip->gd_looffset = (uintptr_t)func;
  823         ip->gd_selector = GSEL(GCODE_SEL, SEL_KPL);
  824         ip->gd_ist = ist;
  825         ip->gd_xx = 0;
  826         ip->gd_type = typ;
  827         ip->gd_dpl = dpl;
  828         ip->gd_p = 1;
  829         ip->gd_hioffset = ((uintptr_t)func)>>16 ;
  830 }
  831 
/*
 * Assembler entry points for exception, trap, and fast-syscall
 * handlers.  The "_pti" variants are selected when kernel page-table
 * isolation is enabled (see e.g. the pti check in
 * amd64_conf_fast_syscall()).
 */
extern inthand_t
	IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
	IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
	IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
	IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align),
	IDTVEC(xmm), IDTVEC(dblfault),
	IDTVEC(div_pti), IDTVEC(bpt_pti),
	IDTVEC(ofl_pti), IDTVEC(bnd_pti), IDTVEC(ill_pti), IDTVEC(dna_pti),
	IDTVEC(fpusegm_pti), IDTVEC(tss_pti), IDTVEC(missing_pti),
	IDTVEC(stk_pti), IDTVEC(prot_pti), IDTVEC(page_pti),
	IDTVEC(rsvd_pti), IDTVEC(fpu_pti), IDTVEC(align_pti),
	IDTVEC(xmm_pti),
#ifdef KDTRACE_HOOKS
	IDTVEC(dtrace_ret), IDTVEC(dtrace_ret_pti),
#endif
#ifdef XENHVM
	IDTVEC(xen_intr_upcall), IDTVEC(xen_intr_upcall_pti),
#endif
	IDTVEC(fast_syscall), IDTVEC(fast_syscall32),
	IDTVEC(fast_syscall_pti);
  852 
  853 #ifdef DDB
  854 /*
  855  * Display the index and function name of any IDT entries that don't use
  856  * the default 'rsvd' entry point.
  857  */
  858 DB_SHOW_COMMAND(idt, db_show_idt)
  859 {
  860         struct gate_descriptor *ip;
  861         int idx;
  862         uintptr_t func;
  863 
  864         ip = idt;
  865         for (idx = 0; idx < NIDT && !db_pager_quit; idx++) {
  866                 func = ((long)ip->gd_hioffset << 16 | ip->gd_looffset);
  867                 if (func != (uintptr_t)&IDTVEC(rsvd)) {
  868                         db_printf("%3d\t", idx);
  869                         db_printsym(func, DB_STGY_PROC);
  870                         db_printf("\n");
  871                 }
  872                 ip++;
  873         }
  874 }
  875 
/* Show privileged registers. */
DB_SHOW_COMMAND(sysregs, db_show_sysregs)
{
	/* sidt/sgdt store a 16-bit limit followed by a 64-bit base. */
	struct {
		uint16_t limit;
		uint64_t base;
	} __packed idtr, gdtr;
	uint16_t ldt, tr;

	__asm __volatile("sidt %0" : "=m" (idtr));
	db_printf("idtr\t0x%016lx/%04x\n",
	    (u_long)idtr.base, (u_int)idtr.limit);
	__asm __volatile("sgdt %0" : "=m" (gdtr));
	db_printf("gdtr\t0x%016lx/%04x\n",
	    (u_long)gdtr.base, (u_int)gdtr.limit);
	__asm __volatile("sldt %0" : "=r" (ldt));
	db_printf("ldtr\t0x%04x\n", ldt);
	__asm __volatile("str %0" : "=r" (tr));
	db_printf("tr\t0x%04x\n", tr);
	db_printf("cr0\t0x%016lx\n", rcr0());
	db_printf("cr2\t0x%016lx\n", rcr2());
	db_printf("cr3\t0x%016lx\n", rcr3());
	db_printf("cr4\t0x%016lx\n", rcr4());
	/* XCR0 is only readable when CR4.OSXSAVE is set. */
	if (rcr4() & CR4_XSAVE)
		db_printf("xcr0\t0x%016lx\n", rxcr(0));
	db_printf("EFER\t0x%016lx\n", rdmsr(MSR_EFER));
	/* The feature-control MSR exists only with VMX or SMX support. */
	if (cpu_feature2 & (CPUID2_VMX | CPUID2_SMX))
		db_printf("FEATURES_CTL\t%016lx\n",
		    rdmsr(MSR_IA32_FEATURE_CONTROL));
	db_printf("DEBUG_CTL\t0x%016lx\n", rdmsr(MSR_DEBUGCTLMSR));
	db_printf("PAT\t0x%016lx\n", rdmsr(MSR_PAT));
	db_printf("GSBASE\t0x%016lx\n", rdmsr(MSR_GSBASE));
}
  909 
/* DDB "show dbregs": dump the hardware debug registers DR0-DR3, DR6, DR7. */
DB_SHOW_COMMAND(dbregs, db_show_dbregs)
{

	db_printf("dr0\t0x%016lx\n", rdr0());
	db_printf("dr1\t0x%016lx\n", rdr1());
	db_printf("dr2\t0x%016lx\n", rdr2());
	db_printf("dr3\t0x%016lx\n", rdr3());
	db_printf("dr6\t0x%016lx\n", rdr6());
	db_printf("dr7\t0x%016lx\n", rdr7());
}
  920 #endif
  921 
  922 void
  923 sdtossd(sd, ssd)
  924         struct user_segment_descriptor *sd;
  925         struct soft_segment_descriptor *ssd;
  926 {
  927 
  928         ssd->ssd_base  = (sd->sd_hibase << 24) | sd->sd_lobase;
  929         ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit;
  930         ssd->ssd_type  = sd->sd_type;
  931         ssd->ssd_dpl   = sd->sd_dpl;
  932         ssd->ssd_p     = sd->sd_p;
  933         ssd->ssd_long  = sd->sd_long;
  934         ssd->ssd_def32 = sd->sd_def32;
  935         ssd->ssd_gran  = sd->sd_gran;
  936 }
  937 
  938 void
  939 ssdtosd(ssd, sd)
  940         struct soft_segment_descriptor *ssd;
  941         struct user_segment_descriptor *sd;
  942 {
  943 
  944         sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
  945         sd->sd_hibase = (ssd->ssd_base >> 24) & 0xff;
  946         sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
  947         sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
  948         sd->sd_type  = ssd->ssd_type;
  949         sd->sd_dpl   = ssd->ssd_dpl;
  950         sd->sd_p     = ssd->ssd_p;
  951         sd->sd_long  = ssd->ssd_long;
  952         sd->sd_def32 = ssd->ssd_def32;
  953         sd->sd_gran  = ssd->ssd_gran;
  954 }
  955 
  956 void
  957 ssdtosyssd(ssd, sd)
  958         struct soft_segment_descriptor *ssd;
  959         struct system_segment_descriptor *sd;
  960 {
  961 
  962         sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
  963         sd->sd_hibase = (ssd->ssd_base >> 24) & 0xfffffffffful;
  964         sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
  965         sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
  966         sd->sd_type  = ssd->ssd_type;
  967         sd->sd_dpl   = ssd->ssd_dpl;
  968         sd->sd_p     = ssd->ssd_p;
  969         sd->sd_gran  = ssd->ssd_gran;
  970 }
  971 
  972 #if !defined(DEV_ATPIC) && defined(DEV_ISA)
  973 #include <isa/isavar.h>
  974 #include <isa/isareg.h>
  975 /*
  976  * Return a bitmap of the current interrupt requests.  This is 8259-specific
  977  * and is only suitable for use at probe time.
  978  * This is only here to pacify sio.  It is NOT FATAL if this doesn't work.
  979  * It shouldn't be here.  There should probably be an APIC centric
  980  * implementation in the apic driver code, if at all.
  981  */
  982 intrmask_t
  983 isa_irq_pending(void)
  984 {
  985         u_char irr1;
  986         u_char irr2;
  987 
  988         irr1 = inb(IO_ICU1);
  989         irr2 = inb(IO_ICU2);
  990         return ((irr2 << 8) | irr1);
  991 }
  992 #endif
  993 
/* Size in KB of the conventional ("base") memory segment below 640K. */
u_int basemem;
  995 
/*
 * Insert the physical range [base, base + length) into the sorted
 * (physmap) array of base/bound pairs, coalescing with an adjacent
 * entry when the ranges touch.  *physmap_idxp indexes the next free
 * slot pair and is advanced when a new pair is consumed.
 *
 * Returns 1 on success (including when a zero-length or overlapping
 * range is silently ignored), 0 when the physmap array is full.
 */
static int
add_physmap_entry(uint64_t base, uint64_t length, vm_paddr_t *physmap,
    int *physmap_idxp)
{
	int i, insert_idx, physmap_idx;

	physmap_idx = *physmap_idxp;

	/* Nothing to record for an empty range. */
	if (length == 0)
		return (1);

	/*
	 * Find insertion point while checking for overlap.  Start off by
	 * assuming the new entry will be added to the end.
	 *
	 * NB: physmap_idx points to the next free slot.
	 */
	insert_idx = physmap_idx;
	for (i = 0; i <= physmap_idx; i += 2) {
		if (base < physmap[i + 1]) {
			/* New range lies entirely before entry i: insert here. */
			if (base + length <= physmap[i]) {
				insert_idx = i;
				break;
			}
			/* Otherwise it overlaps entry i: drop it. */
			if (boothowto & RB_VERBOSE)
				printf(
		    "Overlapping memory regions, ignoring second region\n");
			return (1);
		}
	}

	/* See if we can prepend to the next entry. */
	if (insert_idx <= physmap_idx && base + length == physmap[insert_idx]) {
		physmap[insert_idx] = base;
		return (1);
	}

	/* See if we can append to the previous entry. */
	if (insert_idx > 0 && base == physmap[insert_idx - 1]) {
		physmap[insert_idx - 1] += length;
		return (1);
	}

	/* No coalescing possible; a fresh pair of slots is needed. */
	physmap_idx += 2;
	*physmap_idxp = physmap_idx;
	if (physmap_idx == PHYSMAP_SIZE) {
		printf(
		"Too many segments in the physical address map, giving up\n");
		return (0);
	}

	/*
	 * Move the last 'N' entries down to make room for the new
	 * entry if needed.
	 */
	for (i = (physmap_idx - 2); i > insert_idx; i -= 2) {
		physmap[i] = physmap[i - 2];
		physmap[i + 1] = physmap[i - 1];
	}

	/* Insert the new entry. */
	physmap[insert_idx] = base;
	physmap[insert_idx + 1] = base + length;
	return (1);
}
 1061 
 1062 void
 1063 bios_add_smap_entries(struct bios_smap *smapbase, u_int32_t smapsize,
 1064                       vm_paddr_t *physmap, int *physmap_idx)
 1065 {
 1066         struct bios_smap *smap, *smapend;
 1067 
 1068         smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize);
 1069 
 1070         for (smap = smapbase; smap < smapend; smap++) {
 1071                 if (boothowto & RB_VERBOSE)
 1072                         printf("SMAP type=%02x base=%016lx len=%016lx\n",
 1073                             smap->type, smap->base, smap->length);
 1074 
 1075                 if (smap->type != SMAP_TYPE_MEMORY)
 1076                         continue;
 1077 
 1078                 if (!add_physmap_entry(smap->base, smap->length, physmap,
 1079                     physmap_idx))
 1080                         break;
 1081         }
 1082 }
 1083 
 1084 static void
 1085 add_efi_map_entries(struct efi_map_header *efihdr, vm_paddr_t *physmap,
 1086     int *physmap_idx)
 1087 {
 1088         struct efi_md *map, *p;
 1089         const char *type;
 1090         size_t efisz;
 1091         int ndesc, i;
 1092 
 1093         static const char *types[] = {
 1094                 "Reserved",
 1095                 "LoaderCode",
 1096                 "LoaderData",
 1097                 "BootServicesCode",
 1098                 "BootServicesData",
 1099                 "RuntimeServicesCode",
 1100                 "RuntimeServicesData",
 1101                 "ConventionalMemory",
 1102                 "UnusableMemory",
 1103                 "ACPIReclaimMemory",
 1104                 "ACPIMemoryNVS",
 1105                 "MemoryMappedIO",
 1106                 "MemoryMappedIOPortSpace",
 1107                 "PalCode",
 1108                 "PersistentMemory"
 1109         };
 1110 
 1111         /*
 1112          * Memory map data provided by UEFI via the GetMemoryMap
 1113          * Boot Services API.
 1114          */
 1115         efisz = (sizeof(struct efi_map_header) + 0xf) & ~0xf;
 1116         map = (struct efi_md *)((uint8_t *)efihdr + efisz);
 1117 
 1118         if (efihdr->descriptor_size == 0)
 1119                 return;
 1120         ndesc = efihdr->memory_size / efihdr->descriptor_size;
 1121 
 1122         if (boothowto & RB_VERBOSE)
 1123                 printf("%23s %12s %12s %8s %4s\n",
 1124                     "Type", "Physical", "Virtual", "#Pages", "Attr");
 1125 
 1126         for (i = 0, p = map; i < ndesc; i++,
 1127             p = efi_next_descriptor(p, efihdr->descriptor_size)) {
 1128                 if (boothowto & RB_VERBOSE) {
 1129                         if (p->md_type < nitems(types))
 1130                                 type = types[p->md_type];
 1131                         else
 1132                                 type = "<INVALID>";
 1133                         printf("%23s %012lx %12p %08lx ", type, p->md_phys,
 1134                             p->md_virt, p->md_pages);
 1135                         if (p->md_attr & EFI_MD_ATTR_UC)
 1136                                 printf("UC ");
 1137                         if (p->md_attr & EFI_MD_ATTR_WC)
 1138                                 printf("WC ");
 1139                         if (p->md_attr & EFI_MD_ATTR_WT)
 1140                                 printf("WT ");
 1141                         if (p->md_attr & EFI_MD_ATTR_WB)
 1142                                 printf("WB ");
 1143                         if (p->md_attr & EFI_MD_ATTR_UCE)
 1144                                 printf("UCE ");
 1145                         if (p->md_attr & EFI_MD_ATTR_WP)
 1146                                 printf("WP ");
 1147                         if (p->md_attr & EFI_MD_ATTR_RP)
 1148                                 printf("RP ");
 1149                         if (p->md_attr & EFI_MD_ATTR_XP)
 1150                                 printf("XP ");
 1151                         if (p->md_attr & EFI_MD_ATTR_NV)
 1152                                 printf("NV ");
 1153                         if (p->md_attr & EFI_MD_ATTR_MORE_RELIABLE)
 1154                                 printf("MORE_RELIABLE ");
 1155                         if (p->md_attr & EFI_MD_ATTR_RO)
 1156                                 printf("RO ");
 1157                         if (p->md_attr & EFI_MD_ATTR_RT)
 1158                                 printf("RUNTIME");
 1159                         printf("\n");
 1160                 }
 1161 
 1162                 switch (p->md_type) {
 1163                 case EFI_MD_TYPE_CODE:
 1164                 case EFI_MD_TYPE_DATA:
 1165                 case EFI_MD_TYPE_BS_CODE:
 1166                 case EFI_MD_TYPE_BS_DATA:
 1167                 case EFI_MD_TYPE_FREE:
 1168                         /*
 1169                          * We're allowed to use any entry with these types.
 1170                          */
 1171                         break;
 1172                 default:
 1173                         continue;
 1174                 }
 1175 
 1176                 if (!add_physmap_entry(p->md_phys, (p->md_pages * PAGE_SIZE),
 1177                     physmap, physmap_idx))
 1178                         break;
 1179         }
 1180 }
 1181 
/* Firmware interface used to boot: "BIOS" or "UEFI"; set by native_parse_memmap(). */
static char bootmethod[16] = "";
SYSCTL_STRING(_machdep, OID_AUTO, bootmethod, CTLFLAG_RD, bootmethod, 0,
    "System firmware boot method");
 1185 
 1186 static void
 1187 native_parse_memmap(caddr_t kmdp, vm_paddr_t *physmap, int *physmap_idx)
 1188 {
 1189         struct bios_smap *smap;
 1190         struct efi_map_header *efihdr;
 1191         u_int32_t size;
 1192 
 1193         /*
 1194          * Memory map from INT 15:E820.
 1195          *
 1196          * subr_module.c says:
 1197          * "Consumer may safely assume that size value precedes data."
 1198          * ie: an int32_t immediately precedes smap.
 1199          */
 1200 
 1201         efihdr = (struct efi_map_header *)preload_search_info(kmdp,
 1202             MODINFO_METADATA | MODINFOMD_EFI_MAP);
 1203         smap = (struct bios_smap *)preload_search_info(kmdp,
 1204             MODINFO_METADATA | MODINFOMD_SMAP);
 1205         if (efihdr == NULL && smap == NULL)
 1206                 panic("No BIOS smap or EFI map info from loader!");
 1207 
 1208         if (efihdr != NULL) {
 1209                 add_efi_map_entries(efihdr, physmap, physmap_idx);
 1210                 strlcpy(bootmethod, "UEFI", sizeof(bootmethod));
 1211         } else {
 1212                 size = *((u_int32_t *)smap - 1);
 1213                 bios_add_smap_entries(smap, size, physmap, physmap_idx);
 1214                 strlcpy(bootmethod, "BIOS", sizeof(bootmethod));
 1215         }
 1216 }
 1217 
/* Pages per gigabyte; used for memory-test progress dots. */
#define PAGES_PER_GB    (1024 * 1024 * 1024 / PAGE_SIZE)

/*
 * Populate the (physmap) array with base/bound pairs describing the
 * available physical memory in the system, then test this memory and
 * build the phys_avail array describing the actually-available memory.
 *
 * Total memory size may be set by the kernel environment variable
 * hw.physmem or the compile-time define MAXMEM.
 *
 * XXX first should be vm_paddr_t.
 */
static void
getmemsize(caddr_t kmdp, u_int64_t first)
{
	int i, physmap_idx, pa_indx, da_indx;
	vm_paddr_t pa, physmap[PHYSMAP_SIZE];
	u_long physmem_start, physmem_tunable, memtest;
	pt_entry_t *pte;
	quad_t dcons_addr, dcons_size;
	int page_counter;

	/*
	 * Tell the physical memory allocator about pages used to store
	 * the kernel and preloaded data.  See kmem_bootstrap_free().
	 */
	vm_phys_add_seg((vm_paddr_t)kernphys, trunc_page(first));

	bzero(physmap, sizeof(physmap));
	physmap_idx = 0;

	init_ops.parse_memmap(kmdp, physmap, &physmap_idx);
	/* Step back so physmap_idx indexes the last stored pair. */
	physmap_idx -= 2;

	/*
	 * Find the 'base memory' segment for SMP
	 */
	basemem = 0;
	for (i = 0; i <= physmap_idx; i += 2) {
		if (physmap[i] <= 0xA0000) {
			basemem = physmap[i + 1] / 1024;
			break;
		}
	}
	if (basemem == 0 || basemem > 640) {
		if (bootverbose)
			printf(
		"Memory map doesn't contain a basemem segment, faking it");
		basemem = 640;
	}

	/*
	 * Make hole for "AP -> long mode" bootstrap code.  The
	 * mp_bootaddress vector is only available when the kernel
	 * is configured to support APs and APs for the system start
	 * in 32bit mode (e.g. SMP bare metal).
	 */
	if (init_ops.mp_bootaddress) {
		if (physmap[1] >= 0x100000000)
			panic(
	"Basemem segment is not suitable for AP bootstrap code!");
		physmap[1] = init_ops.mp_bootaddress(physmap[1] / 1024);
	}

	/*
	 * Maxmem isn't the "maximum memory", it's one larger than the
	 * highest page of the physical address space.  It should be
	 * called something like "Maxphyspage".  We may adjust this
	 * based on ``hw.physmem'' and the results of the memory test.
	 */
	Maxmem = atop(physmap[physmap_idx + 1]);

#ifdef MAXMEM
	/* MAXMEM is specified in kilobytes; Maxmem counts 4KB pages. */
	Maxmem = MAXMEM / 4;
#endif

	/* hw.physmem is in bytes; convert to pages. */
	if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable))
		Maxmem = atop(physmem_tunable);

	/*
	 * The boot memory test is disabled by default, as it takes a
	 * significant amount of time on large-memory systems, and is
	 * unfriendly to virtual machines as it unnecessarily touches all
	 * pages.
	 *
	 * A general name is used as the code may be extended to support
	 * additional tests beyond the current "page present" test.
	 */
	memtest = 0;
	TUNABLE_ULONG_FETCH("hw.memtest.tests", &memtest);

	/*
	 * Don't allow MAXMEM or hw.physmem to extend the amount of memory
	 * in the system.
	 */
	if (Maxmem > atop(physmap[physmap_idx + 1]))
		Maxmem = atop(physmap[physmap_idx + 1]);

	if (atop(physmap[physmap_idx + 1]) != Maxmem &&
	    (boothowto & RB_VERBOSE))
		printf("Physical memory use set to %ldK\n", Maxmem * 4);

	/* call pmap initialization to make new kernel address space */
	pmap_bootstrap(&first);

	/*
	 * Size up each available chunk of physical memory.
	 *
	 * XXX Some BIOSes corrupt low 64KB between suspend and resume.
	 * By default, mask off the first 16 pages unless we appear to be
	 * running in a VM.
	 */
	physmem_start = (vm_guest > VM_GUEST_NO ? 1 : 16) << PAGE_SHIFT;
	TUNABLE_ULONG_FETCH("hw.physmem.start", &physmem_start);
	if (physmap[0] < physmem_start) {
		if (physmem_start < PAGE_SIZE)
			physmap[0] = PAGE_SIZE;
		else if (physmem_start >= physmap[1])
			physmap[0] = round_page(physmap[1] - PAGE_SIZE);
		else
			physmap[0] = round_page(physmem_start);
	}
	pa_indx = 0;
	/* dump_avail pairs start at index 1; dump_avail[0] stays 0. */
	da_indx = 1;
	phys_avail[pa_indx++] = physmap[0];
	phys_avail[pa_indx] = physmap[0];
	dump_avail[da_indx] = physmap[0];
	/* CMAP1/CADDR1 provide the scratch mapping used by the memory test. */
	pte = CMAP1;

	/*
	 * Get dcons buffer address
	 */
	if (getenv_quad("dcons.addr", &dcons_addr) == 0 ||
	    getenv_quad("dcons.size", &dcons_size) == 0)
		dcons_addr = 0;

	/*
	 * physmap is in bytes, so when converting to page boundaries,
	 * round up the start address and round down the end address.
	 */
	page_counter = 0;
	if (memtest != 0)
		printf("Testing system memory");
	for (i = 0; i <= physmap_idx; i += 2) {
		vm_paddr_t end;

		/* Clip the chunk to the Maxmem limit. */
		end = ptoa((vm_paddr_t)Maxmem);
		if (physmap[i + 1] < end)
			end = trunc_page(physmap[i + 1]);
		for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) {
			int tmp, page_bad, full;
			int *ptr = (int *)CADDR1;

			full = FALSE;
			/*
			 * block out kernel memory as not available.
			 */
			if (pa >= (vm_paddr_t)kernphys && pa < first)
				goto do_dump_avail;

			/*
			 * block out dcons buffer
			 */
			if (dcons_addr > 0
			    && pa >= trunc_page(dcons_addr)
			    && pa < dcons_addr + dcons_size)
				goto do_dump_avail;

			page_bad = FALSE;
			if (memtest == 0)
				goto skip_memtest;

			/*
			 * Print a "." every GB to show we're making
			 * progress.
			 */
			page_counter++;
			if ((page_counter % PAGES_PER_GB) == 0)
				printf(".");

			/*
			 * map page into kernel: valid, read/write,non-cacheable
			 */
			*pte = pa | PG_V | PG_RW | PG_NC_PWT | PG_NC_PCD;
			invltlb();

			/* Save the page's original contents. */
			tmp = *(int *)ptr;
			/*
			 * Test for alternating 1's and 0's
			 */
			*(volatile int *)ptr = 0xaaaaaaaa;
			if (*(volatile int *)ptr != 0xaaaaaaaa)
				page_bad = TRUE;
			/*
			 * Test for alternating 0's and 1's
			 */
			*(volatile int *)ptr = 0x55555555;
			if (*(volatile int *)ptr != 0x55555555)
				page_bad = TRUE;
			/*
			 * Test for all 1's
			 */
			*(volatile int *)ptr = 0xffffffff;
			if (*(volatile int *)ptr != 0xffffffff)
				page_bad = TRUE;
			/*
			 * Test for all 0's
			 */
			*(volatile int *)ptr = 0x0;
			if (*(volatile int *)ptr != 0x0)
				page_bad = TRUE;
			/*
			 * Restore original value.
			 */
			*(int *)ptr = tmp;

skip_memtest:
			/*
			 * Adjust array of valid/good pages.
			 */
			if (page_bad == TRUE)
				continue;
			/*
			 * If this good page is a continuation of the
			 * previous set of good pages, then just increase
			 * the end pointer. Otherwise start a new chunk.
			 * Note that "end" points one higher than end,
			 * making the range >= start and < end.
			 * If we're also doing a speculative memory
			 * test and we at or past the end, bump up Maxmem
			 * so that we keep going. The first bad page
			 * will terminate the loop.
			 */
			if (phys_avail[pa_indx] == pa) {
				phys_avail[pa_indx] += PAGE_SIZE;
			} else {
				pa_indx++;
				if (pa_indx == PHYS_AVAIL_ARRAY_END) {
					printf(
		"Too many holes in the physical address space, giving up\n");
					pa_indx--;
					full = TRUE;
					goto do_dump_avail;
				}
				phys_avail[pa_indx++] = pa;	/* start */
				phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */
			}
			physmem++;
do_dump_avail:
			/* Dumpable memory includes kernel/dcons pages too. */
			if (dump_avail[da_indx] == pa) {
				dump_avail[da_indx] += PAGE_SIZE;
			} else {
				da_indx++;
				if (da_indx == DUMP_AVAIL_ARRAY_END) {
					da_indx--;
					goto do_next;
				}
				dump_avail[da_indx++] = pa; /* start */
				dump_avail[da_indx] = pa + PAGE_SIZE; /* end */
			}
do_next:
			if (full)
				break;
		}
	}
	/* Tear down the scratch mapping used by the test. */
	*pte = 0;
	invltlb();
	if (memtest != 0)
		printf("\n");

	/*
	 * XXX
	 * The last chunk must contain at least one page plus the message
	 * buffer to avoid complicating other code (message buffer address
	 * calculation, etc.).
	 */
	while (phys_avail[pa_indx - 1] + PAGE_SIZE +
	    round_page(msgbufsize) >= phys_avail[pa_indx]) {
		physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]);
		phys_avail[pa_indx--] = 0;
		phys_avail[pa_indx--] = 0;
	}

	Maxmem = atop(phys_avail[pa_indx]);

	/* Trim off space for the message buffer. */
	phys_avail[pa_indx] -= round_page(msgbufsize);

	/* Map the message buffer. */
	msgbufp = (struct msgbuf *)PHYS_TO_DMAP(phys_avail[pa_indx]);
}
 1509 
 1510 static caddr_t
 1511 native_parse_preload_data(u_int64_t modulep)
 1512 {
 1513         caddr_t kmdp;
 1514         char *envp;
 1515 #ifdef DDB
 1516         vm_offset_t ksym_start;
 1517         vm_offset_t ksym_end;
 1518 #endif
 1519 
 1520         preload_metadata = (caddr_t)(uintptr_t)(modulep + KERNBASE);
 1521         preload_bootstrap_relocate(KERNBASE);
 1522         kmdp = preload_search_by_type("elf kernel");
 1523         if (kmdp == NULL)
 1524                 kmdp = preload_search_by_type("elf64 kernel");
 1525         boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int);
 1526         envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *);
 1527         if (envp != NULL)
 1528                 envp += KERNBASE;
 1529         init_static_kenv(envp, 0);
 1530 #ifdef DDB
 1531         ksym_start = MD_FETCH(kmdp, MODINFOMD_SSYM, uintptr_t);
 1532         ksym_end = MD_FETCH(kmdp, MODINFOMD_ESYM, uintptr_t);
 1533         db_fetch_ksymtab(ksym_start, ksym_end);
 1534 #endif
 1535         efi_systbl_phys = MD_FETCH(kmdp, MODINFOMD_FW_HANDLE, vm_paddr_t);
 1536 
 1537         return (kmdp);
 1538 }
 1539 
/*
 * Initialize the in-kernel debugger framework and, if the boot flags
 * request it (RB_KDB), enter the debugger immediately.
 */
static void
amd64_kdb_init(void)
{
	kdb_init();
#ifdef KDB
	if (boothowto & RB_KDB)
		kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger");
#endif
}
 1549 
/* Set up the fast syscall stuff */
void
amd64_conf_fast_syscall(void)
{
	uint64_t msr;

	/* Enable the SYSCALL/SYSRET instructions (EFER.SCE). */
	msr = rdmsr(MSR_EFER) | EFER_SCE;
	wrmsr(MSR_EFER, msr);
	/*
	 * 64-bit SYSCALL entry point; use the PTI trampoline variant
	 * when page-table isolation is enabled.
	 */
	wrmsr(MSR_LSTAR, pti ? (u_int64_t)IDTVEC(fast_syscall_pti) :
	    (u_int64_t)IDTVEC(fast_syscall));
	/* Entry point for SYSCALL issued from 32-bit compat mode. */
	wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32));
	/*
	 * STAR: kernel code selector for SYSCALL in bits 47:32, user
	 * 32-bit code selector base for SYSRET in bits 63:48.
	 */
	msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
	    ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48);
	wrmsr(MSR_STAR, msr);
	/* rflags bits to clear on syscall entry (mask off traps, IF, etc). */
	wrmsr(MSR_SF_MASK, PSL_NT | PSL_T | PSL_I | PSL_C | PSL_D);
}
 1566 
/*
 * Early machine-dependent bootstrap for the BSP.  Consumes the
 * loader-provided module metadata (modulep) and the first free physical
 * address (physfree); sets up the GDT/IDT/TSS, per-CPU data, locks,
 * clocks, console, debugger, and the FPU save area for thread0.
 * Returns thread0's PCB address, which locore uses as the initial
 * kernel stack location (see the comment at the return).
 *
 * The statement order in this function is load-bearing: e.g. the clock
 * must precede the console (DELAY()), and the PCB is forged before the
 * FPU save-area size is known, then redone after fpuinit().
 */
u_int64_t
hammer_time(u_int64_t modulep, u_int64_t physfree)
{
	caddr_t kmdp;
	int gsel_tss, x;
	struct pcpu *pc;
	struct nmi_pcpu *np;
	struct xstate_hdr *xhdr;
	u_int64_t rsp0;
	char *env;
	size_t kstack0_sz;
	int late_console;

	kmdp = init_ops.parse_preload_data(modulep);

	/* Apply any BSP microcode update; keep physfree page-aligned. */
	physfree += ucode_load_bsp(physfree + KERNBASE);
	physfree = roundup2(physfree, PAGE_SIZE);

	identify_cpu1();
	identify_hypervisor();
	/*
	 * hw.cpu_stdext_disable is ignored by the call, it will be
	 * re-evaluted by the below call to finishidentcpu().
	 */
	identify_cpu2();

	link_elf_ireloc(kmdp);

	/*
	 * This may be done better later if it gets more high level
	 * components in it. If so just link td->td_proc here.
	 */
	proc_linkup0(&proc0, &thread0);

	/* Init basic tunables, hz etc */
	init_param1();

	/* Carve thread0's kernel stack out of free physical memory. */
	thread0.td_kstack = physfree + KERNBASE;
	thread0.td_kstack_pages = kstack_pages;
	kstack0_sz = thread0.td_kstack_pages * PAGE_SIZE;
	bzero((void *)thread0.td_kstack, kstack0_sz);
	physfree += kstack0_sz;

	/*
	 * make gdt memory segments
	 */
	for (x = 0; x < NGDT; x++) {
		/*
		 * TSS and LDT descriptors occupy two GDT slots each and
		 * are installed separately (see ssdtosyssd() below).
		 */
		if (x != GPROC0_SEL && x != (GPROC0_SEL + 1) &&
		    x != GUSERLDT_SEL && x != (GUSERLDT_SEL) + 1)
			ssdtosd(&gdt_segs[x], &gdt[x]);
	}
	gdt_segs[GPROC0_SEL].ssd_base = (uintptr_t)&common_tss[0];
	ssdtosyssd(&gdt_segs[GPROC0_SEL],
	    (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);

	r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
	r_gdt.rd_base =  (long) gdt;
	lgdt(&r_gdt);
	pc = &__pcpu[0];

	wrmsr(MSR_FSBASE, 0);		/* User value */
	wrmsr(MSR_GSBASE, (u_int64_t)pc);	/* Kernel %gs -> pcpu */
	wrmsr(MSR_KGSBASE, 0);		/* User value while in the kernel */

	pcpu_init(pc, 0, sizeof(struct pcpu));
	dpcpu_init((void *)(physfree + KERNBASE), 0);
	physfree += DPCPU_SIZE;
	PCPU_SET(prvspace, pc);
	PCPU_SET(curthread, &thread0);
	/* Non-late cninit() and printf() can be moved up to here. */
	PCPU_SET(tssp, &common_tss[0]);
	PCPU_SET(commontssp, &common_tss[0]);
	PCPU_SET(tss, (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);
	PCPU_SET(ldt, (struct system_segment_descriptor *)&gdt[GUSERLDT_SEL]);
	PCPU_SET(fs32p, &gdt[GUFS32_SEL]);
	PCPU_SET(gs32p, &gdt[GUGS32_SEL]);

	/*
	 * Initialize mutexes.
	 *
	 * icu_lock: in order to allow an interrupt to occur in a critical
	 *           section, to set pcpu->ipending (etc...) properly, we
	 *           must be able to get the icu lock, so it can't be
	 *           under witness.
	 */
	mutex_init();
	mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS);
	mtx_init(&dt_lock, "descriptor tables", NULL, MTX_DEF);

	/* exceptions */
	pti = pti_get_default();
	TUNABLE_INT_FETCH("vm.pmap.pti", &pti);

	/* Default every vector to the reserved handler, then override. */
	for (x = 0; x < NIDT; x++)
		setidt(x, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd), SDT_SYSIGT,
		    SEL_KPL, 0);
	setidt(IDT_DE, pti ? &IDTVEC(div_pti) : &IDTVEC(div), SDT_SYSIGT,
	    SEL_KPL, 0);
	/* DB#, NMI, DF#, MC# run on dedicated IST stacks (4, 2, 1, 3). */
	setidt(IDT_DB, &IDTVEC(dbg), SDT_SYSIGT, SEL_KPL, 4);
	setidt(IDT_NMI, &IDTVEC(nmi),  SDT_SYSIGT, SEL_KPL, 2);
	setidt(IDT_BP, pti ? &IDTVEC(bpt_pti) : &IDTVEC(bpt), SDT_SYSIGT,
	    SEL_UPL, 0);
	setidt(IDT_OF, pti ? &IDTVEC(ofl_pti) : &IDTVEC(ofl), SDT_SYSIGT,
	    SEL_UPL, 0);
	setidt(IDT_BR, pti ? &IDTVEC(bnd_pti) : &IDTVEC(bnd), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_UD, pti ? &IDTVEC(ill_pti) : &IDTVEC(ill), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_NM, pti ? &IDTVEC(dna_pti) : &IDTVEC(dna), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1);
	setidt(IDT_FPUGP, pti ? &IDTVEC(fpusegm_pti) : &IDTVEC(fpusegm),
	    SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_TS, pti ? &IDTVEC(tss_pti) : &IDTVEC(tss), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_NP, pti ? &IDTVEC(missing_pti) : &IDTVEC(missing),
	    SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_SS, pti ? &IDTVEC(stk_pti) : &IDTVEC(stk), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_GP, pti ? &IDTVEC(prot_pti) : &IDTVEC(prot), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_PF, pti ? &IDTVEC(page_pti) : &IDTVEC(page), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_MF, pti ? &IDTVEC(fpu_pti) : &IDTVEC(fpu), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_AC, pti ? &IDTVEC(align_pti) : &IDTVEC(align), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_MC, &IDTVEC(mchk), SDT_SYSIGT, SEL_KPL, 3);
	setidt(IDT_XF, pti ? &IDTVEC(xmm_pti) : &IDTVEC(xmm), SDT_SYSIGT,
	    SEL_KPL, 0);
#ifdef KDTRACE_HOOKS
	setidt(IDT_DTRACE_RET, pti ? &IDTVEC(dtrace_ret_pti) :
	    &IDTVEC(dtrace_ret), SDT_SYSIGT, SEL_UPL, 0);
#endif
#ifdef XENHVM
	setidt(IDT_EVTCHN, pti ? &IDTVEC(xen_intr_upcall_pti) :
	    &IDTVEC(xen_intr_upcall), SDT_SYSIGT, SEL_KPL, 0);
#endif
	r_idt.rd_limit = sizeof(idt0) - 1;
	r_idt.rd_base = (long) idt;
	lidt(&r_idt);

	/*
	 * Initialize the clock before the console so that console
	 * initialization can use DELAY().
	 */
	clock_init();

	/*
	 * Use vt(4) by default for UEFI boot (during the sc(4)/vt(4)
	 * transition).
	 * Once bootblocks have updated, we can test directly for
	 * efi_systbl != NULL here...
	 */
	if (preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_EFI_MAP)
	    != NULL)
		vty_set_preferred(VTY_VT);

	/* Speculative-execution mitigation knobs from the loader. */
	TUNABLE_INT_FETCH("hw.ibrs_disable", &hw_ibrs_disable);
	TUNABLE_INT_FETCH("hw.spec_store_bypass_disable", &hw_ssb_disable);
	TUNABLE_INT_FETCH("hw.mds_disable", &hw_mds_disable);
	TUNABLE_INT_FETCH("machdep.mitigations.taa.enable", &x86_taa_enable);

	TUNABLE_INT_FETCH("machdep.mitigations.rndgs.enable",
	    &x86_rngds_mitg_enable);

	finishidentcpu();	/* Final stage of CPU initialization */
	initializecpu();	/* Initialize CPU registers */
	initializecpucache();

	/* doublefault stack space, runs on ist1 */
	common_tss[0].tss_ist1 = (long)&dblfault_stack[sizeof(dblfault_stack)];

	/*
	 * NMI stack, runs on ist2.  The pcpu pointer is stored just
	 * above the start of the ist2 stack.
	 */
	np = ((struct nmi_pcpu *) &nmi0_stack[sizeof(nmi0_stack)]) - 1;
	np->np_pcpu = (register_t) pc;
	common_tss[0].tss_ist2 = (long) np;

	/*
	 * MC# stack, runs on ist3.  The pcpu pointer is stored just
	 * above the start of the ist3 stack.
	 */
	np = ((struct nmi_pcpu *) &mce0_stack[sizeof(mce0_stack)]) - 1;
	np->np_pcpu = (register_t) pc;
	common_tss[0].tss_ist3 = (long) np;

	/*
	 * DB# stack, runs on ist4.
	 */
	np = ((struct nmi_pcpu *) &dbg0_stack[sizeof(dbg0_stack)]) - 1;
	np->np_pcpu = (register_t) pc;
	common_tss[0].tss_ist4 = (long) np;

	/* Set the IO permission bitmap (empty due to tss seg limit) */
	common_tss[0].tss_iobase = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE;

	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
	ltr(gsel_tss);

	amd64_conf_fast_syscall();

	/*
	 * Temporary forge some valid pointer to PCB, for exception
	 * handlers.  It is reinitialized properly below after FPU is
	 * set up.  Also set up td_critnest to short-cut the page
	 * fault handler.
	 */
	cpu_max_ext_state_size = sizeof(struct savefpu);
	thread0.td_pcb = get_pcb_td(&thread0);
	thread0.td_critnest = 1;

	/*
	 * The console and kdb should be initialized even earlier than here,
	 * but some console drivers don't work until after getmemsize().
	 * Default to late console initialization to support these drivers.
	 * This loses mainly printf()s in getmemsize() and early debugging.
	 */
	late_console = 1;
	TUNABLE_INT_FETCH("debug.late_console", &late_console);
	if (!late_console) {
		cninit();
		amd64_kdb_init();
	}

	getmemsize(kmdp, physfree);
	init_param2(physmem);

	/* now running on new page tables, configured,and u/iom is accessible */

	if (late_console)
		cninit();

#ifdef DEV_ISA
#ifdef DEV_ATPIC
	elcr_probe();
	atpic_startup();
#else
	/* Reset and mask the atpics and leave them shut down. */
	atpic_reset();

	/*
	 * Point the ICU spurious interrupt vectors at the APIC spurious
	 * interrupt handler.
	 */
	setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
#endif
#else
#error "have you forgotten the isa device?";
#endif

	if (late_console)
		amd64_kdb_init();

	msgbufinit(msgbufp, msgbufsize);
	fpuinit();

	/*
	 * Set up thread0 pcb after fpuinit calculated pcb + fpu save
	 * area size.  Zero out the extended state header in fpu save
	 * area.
	 */
	thread0.td_pcb = get_pcb_td(&thread0);
	thread0.td_pcb->pcb_save = get_pcb_user_save_td(&thread0);
	bzero(get_pcb_user_save_td(&thread0), cpu_max_ext_state_size);
	if (use_xsave) {
		/* Extended state header follows the legacy save area. */
		xhdr = (struct xstate_hdr *)(get_pcb_user_save_td(&thread0) +
		    1);
		xhdr->xstate_bv = xsave_mask;
	}
	/* make an initial tss so cpu can get interrupt stack on syscall! */
	rsp0 = (vm_offset_t)thread0.td_pcb;
	/* Ensure the stack is aligned to 16 bytes */
	rsp0 &= ~0xFul;
	common_tss[0].tss_rsp0 = rsp0;
	PCPU_SET(rsp0, rsp0);
	PCPU_SET(pti_rsp0, ((vm_offset_t)PCPU_PTR(pti_stack) +
	    PC_PTI_STACK_SZ * sizeof(uint64_t)) & ~0xful);
	PCPU_SET(curpcb, thread0.td_pcb);

	/* transfer to user mode */

	_ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
	_udatasel = GSEL(GUDATA_SEL, SEL_UPL);
	_ucode32sel = GSEL(GUCODE32_SEL, SEL_UPL);
	_ufssel = GSEL(GUFS32_SEL, SEL_UPL);
	_ugssel = GSEL(GUGS32_SEL, SEL_UPL);

	load_ds(_udatasel);
	load_es(_udatasel);
	load_fs(_ufssel);

	/* setup proc 0's pcb */
	thread0.td_pcb->pcb_flags = 0;
	thread0.td_frame = &proc0_tf;

	env = kern_getenv("kernelname");
	if (env != NULL)
		strlcpy(kernelname, env, sizeof(kernelname));

	cpu_probe_amdc1e();

#ifdef FDT
	x86_init_fdt();
#endif
	thread0.td_critnest = 0;

	/*
	 * NOTE(review): hw.ibrs_disable, hw.spec_store_bypass_disable and
	 * hw.mds_disable were already fetched earlier in this function;
	 * this refetch looks redundant -- confirm before removing.
	 */
	TUNABLE_INT_FETCH("hw.ibrs_disable", &hw_ibrs_disable);
	TUNABLE_INT_FETCH("hw.spec_store_bypass_disable", &hw_ssb_disable);
	TUNABLE_INT_FETCH("hw.mds_disable", &hw_mds_disable);

	/* Location of kernel stack for locore */
	return ((u_int64_t)thread0.td_pcb);
}
 1884 
 1885 void
 1886 cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size)
 1887 {
 1888 
 1889         pcpu->pc_acpi_id = 0xffffffff;
 1890 }
 1891 
 1892 static int
 1893 smap_sysctl_handler(SYSCTL_HANDLER_ARGS)
 1894 {
 1895         struct bios_smap *smapbase;
 1896         struct bios_smap_xattr smap;
 1897         caddr_t kmdp;
 1898         uint32_t *smapattr;
 1899         int count, error, i;
 1900 
 1901         /* Retrieve the system memory map from the loader. */
 1902         kmdp = preload_search_by_type("elf kernel");
 1903         if (kmdp == NULL)
 1904                 kmdp = preload_search_by_type("elf64 kernel");
 1905         smapbase = (struct bios_smap *)preload_search_info(kmdp,
 1906             MODINFO_METADATA | MODINFOMD_SMAP);
 1907         if (smapbase == NULL)
 1908                 return (0);
 1909         smapattr = (uint32_t *)preload_search_info(kmdp,
 1910             MODINFO_METADATA | MODINFOMD_SMAP_XATTR);
 1911         count = *((uint32_t *)smapbase - 1) / sizeof(*smapbase);
 1912         error = 0;
 1913         for (i = 0; i < count; i++) {
 1914                 smap.base = smapbase[i].base;
 1915                 smap.length = smapbase[i].length;
 1916                 smap.type = smapbase[i].type;
 1917                 if (smapattr != NULL)
 1918                         smap.xattr = smapattr[i];
 1919                 else
 1920                         smap.xattr = 0;
 1921                 error = SYSCTL_OUT(req, &smap, sizeof(smap));
 1922         }
 1923         return (error);
 1924 }
 1925 SYSCTL_PROC(_machdep, OID_AUTO, smap, CTLTYPE_OPAQUE|CTLFLAG_RD, NULL, 0,
 1926     smap_sysctl_handler, "S,bios_smap_xattr", "Raw BIOS SMAP data");
 1927 
 1928 static int
 1929 efi_map_sysctl_handler(SYSCTL_HANDLER_ARGS)
 1930 {
 1931         struct efi_map_header *efihdr;
 1932         caddr_t kmdp;
 1933         uint32_t efisize;
 1934 
 1935         kmdp = preload_search_by_type("elf kernel");
 1936         if (kmdp == NULL)
 1937                 kmdp = preload_search_by_type("elf64 kernel");
 1938         efihdr = (struct efi_map_header *)preload_search_info(kmdp,
 1939             MODINFO_METADATA | MODINFOMD_EFI_MAP);
 1940         if (efihdr == NULL)
 1941                 return (0);
 1942         efisize = *((uint32_t *)efihdr - 1);
 1943         return (SYSCTL_OUT(req, efihdr, efisize));
 1944 }
 1945 SYSCTL_PROC(_machdep, OID_AUTO, efi_map, CTLTYPE_OPAQUE|CTLFLAG_RD, NULL, 0,
 1946     efi_map_sysctl_handler, "S,efi_map_header", "Raw EFI Memory Map");
 1947 
 1948 void
 1949 spinlock_enter(void)
 1950 {
 1951         struct thread *td;
 1952         register_t flags;
 1953 
 1954         td = curthread;
 1955         if (td->td_md.md_spinlock_count == 0) {
 1956                 flags = intr_disable();
 1957                 td->td_md.md_spinlock_count = 1;
 1958                 td->td_md.md_saved_flags = flags;
 1959         } else
 1960                 td->td_md.md_spinlock_count++;
 1961         critical_enter();
 1962 }
 1963 
 1964 void
 1965 spinlock_exit(void)
 1966 {
 1967         struct thread *td;
 1968         register_t flags;
 1969 
 1970         td = curthread;
 1971         critical_exit();
 1972         flags = td->td_md.md_saved_flags;
 1973         td->td_md.md_spinlock_count--;
 1974         if (td->td_md.md_spinlock_count == 0)
 1975                 intr_restore(flags);
 1976 }
 1977 
 1978 /*
 1979  * Construct a PCB from a trapframe. This is called from kdb_trap() where
 1980  * we want to start a backtrace from the function that caused us to enter
 1981  * the debugger. We have the context in the trapframe, but base the trace
 1982  * on the PCB. The PCB doesn't have to be perfect, as long as it contains
 1983  * enough for a backtrace.
 1984  */
 1985 void
 1986 makectx(struct trapframe *tf, struct pcb *pcb)
 1987 {
 1988 
 1989         pcb->pcb_r12 = tf->tf_r12;
 1990         pcb->pcb_r13 = tf->tf_r13;
 1991         pcb->pcb_r14 = tf->tf_r14;
 1992         pcb->pcb_r15 = tf->tf_r15;
 1993         pcb->pcb_rbp = tf->tf_rbp;
 1994         pcb->pcb_rbx = tf->tf_rbx;
 1995         pcb->pcb_rip = tf->tf_rip;
 1996         pcb->pcb_rsp = tf->tf_rsp;
 1997 }
 1998 
 1999 int
 2000 ptrace_set_pc(struct thread *td, unsigned long addr)
 2001 {
 2002 
 2003         td->td_frame->tf_rip = addr;
 2004         set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
 2005         return (0);
 2006 }
 2007 
 2008 int
 2009 ptrace_single_step(struct thread *td)
 2010 {
 2011 
 2012         PROC_LOCK_ASSERT(td->td_proc, MA_OWNED);
 2013         if ((td->td_frame->tf_rflags & PSL_T) == 0) {
 2014                 td->td_frame->tf_rflags |= PSL_T;
 2015                 td->td_dbgflags |= TDB_STEP;
 2016         }
 2017         return (0);
 2018 }
 2019 
 2020 int
 2021 ptrace_clear_single_step(struct thread *td)
 2022 {
 2023         PROC_LOCK_ASSERT(td->td_proc, MA_OWNED);
 2024         td->td_frame->tf_rflags &= ~PSL_T;
 2025         td->td_dbgflags &= ~TDB_STEP;
 2026         return (0);
 2027 }
 2028 
 2029 int
 2030 fill_regs(struct thread *td, struct reg *regs)
 2031 {
 2032         struct trapframe *tp;
 2033 
 2034         tp = td->td_frame;
 2035         return (fill_frame_regs(tp, regs));
 2036 }
 2037 
 2038 int
 2039 fill_frame_regs(struct trapframe *tp, struct reg *regs)
 2040 {
 2041 
 2042         regs->r_r15 = tp->tf_r15;
 2043         regs->r_r14 = tp->tf_r14;
 2044         regs->r_r13 = tp->tf_r13;
 2045         regs->r_r12 = tp->tf_r12;
 2046         regs->r_r11 = tp->tf_r11;
 2047         regs->r_r10 = tp->tf_r10;
 2048         regs->r_r9  = tp->tf_r9;
 2049         regs->r_r8  = tp->tf_r8;
 2050         regs->r_rdi = tp->tf_rdi;
 2051         regs->r_rsi = tp->tf_rsi;
 2052         regs->r_rbp = tp->tf_rbp;
 2053         regs->r_rbx = tp->tf_rbx;
 2054         regs->r_rdx = tp->tf_rdx;
 2055         regs->r_rcx = tp->tf_rcx;
 2056         regs->r_rax = tp->tf_rax;
 2057         regs->r_rip = tp->tf_rip;
 2058         regs->r_cs = tp->tf_cs;
 2059         regs->r_rflags = tp->tf_rflags;
 2060         regs->r_rsp = tp->tf_rsp;
 2061         regs->r_ss = tp->tf_ss;
 2062         if (tp->tf_flags & TF_HASSEGS) {
 2063                 regs->r_ds = tp->tf_ds;
 2064                 regs->r_es = tp->tf_es;
 2065                 regs->r_fs = tp->tf_fs;
 2066                 regs->r_gs = tp->tf_gs;
 2067         } else {
 2068                 regs->r_ds = 0;
 2069                 regs->r_es = 0;
 2070                 regs->r_fs = 0;
 2071                 regs->r_gs = 0;
 2072         }
 2073         regs->r_err = 0;
 2074         regs->r_trapno = 0;
 2075         return (0);
 2076 }
 2077 
 2078 int
 2079 set_regs(struct thread *td, struct reg *regs)
 2080 {
 2081         struct trapframe *tp;
 2082         register_t rflags;
 2083 
 2084         tp = td->td_frame;
 2085         rflags = regs->r_rflags & 0xffffffff;
 2086         if (!EFL_SECURE(rflags, tp->tf_rflags) || !CS_SECURE(regs->r_cs))
 2087                 return (EINVAL);
 2088         tp->tf_r15 = regs->r_r15;
 2089         tp->tf_r14 = regs->r_r14;
 2090         tp->tf_r13 = regs->r_r13;
 2091         tp->tf_r12 = regs->r_r12;
 2092         tp->tf_r11 = regs->r_r11;
 2093         tp->tf_r10 = regs->r_r10;
 2094         tp->tf_r9  = regs->r_r9;
 2095         tp->tf_r8  = regs->r_r8;
 2096         tp->tf_rdi = regs->r_rdi;
 2097         tp->tf_rsi = regs->r_rsi;
 2098         tp->tf_rbp = regs->r_rbp;
 2099         tp->tf_rbx = regs->r_rbx;
 2100         tp->tf_rdx = regs->r_rdx;
 2101         tp->tf_rcx = regs->r_rcx;
 2102         tp->tf_rax = regs->r_rax;
 2103         tp->tf_rip = regs->r_rip;
 2104         tp->tf_cs = regs->r_cs;
 2105         tp->tf_rflags = rflags;
 2106         tp->tf_rsp = regs->r_rsp;
 2107         tp->tf_ss = regs->r_ss;
 2108         if (0) {        /* XXXKIB */
 2109                 tp->tf_ds = regs->r_ds;
 2110                 tp->tf_es = regs->r_es;
 2111                 tp->tf_fs = regs->r_fs;
 2112                 tp->tf_gs = regs->r_gs;
 2113                 tp->tf_flags = TF_HASSEGS;
 2114         }
 2115         set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
 2116         return (0);
 2117 }
 2118 
 2119 /* XXX check all this stuff! */
 2120 /* externalize from sv_xmm */
 2121 static void
 2122 fill_fpregs_xmm(struct savefpu *sv_xmm, struct fpreg *fpregs)
 2123 {
 2124         struct envxmm *penv_fpreg = (struct envxmm *)&fpregs->fpr_env;
 2125         struct envxmm *penv_xmm = &sv_xmm->sv_env;
 2126         int i;
 2127 
 2128         /* pcb -> fpregs */
 2129         bzero(fpregs, sizeof(*fpregs));
 2130 
 2131         /* FPU control/status */
 2132         penv_fpreg->en_cw = penv_xmm->en_cw;
 2133         penv_fpreg->en_sw = penv_xmm->en_sw;
 2134         penv_fpreg->en_tw = penv_xmm->en_tw;
 2135         penv_fpreg->en_opcode = penv_xmm->en_opcode;
 2136         penv_fpreg->en_rip = penv_xmm->en_rip;
 2137         penv_fpreg->en_rdp = penv_xmm->en_rdp;
 2138         penv_fpreg->en_mxcsr = penv_xmm->en_mxcsr;
 2139         penv_fpreg->en_mxcsr_mask = penv_xmm->en_mxcsr_mask;
 2140 
 2141         /* FPU registers */
 2142         for (i = 0; i < 8; ++i)
 2143                 bcopy(sv_xmm->sv_fp[i].fp_acc.fp_bytes, fpregs->fpr_acc[i], 10);
 2144 
 2145         /* SSE registers */
 2146         for (i = 0; i < 16; ++i)
 2147                 bcopy(sv_xmm->sv_xmm[i].xmm_bytes, fpregs->fpr_xacc[i], 16);
 2148 }
 2149 
 2150 /* internalize from fpregs into sv_xmm */
 2151 static void
 2152 set_fpregs_xmm(struct fpreg *fpregs, struct savefpu *sv_xmm)
 2153 {
 2154         struct envxmm *penv_xmm = &sv_xmm->sv_env;
 2155         struct envxmm *penv_fpreg = (struct envxmm *)&fpregs->fpr_env;
 2156         int i;
 2157 
 2158         /* fpregs -> pcb */
 2159         /* FPU control/status */
 2160         penv_xmm->en_cw = penv_fpreg->en_cw;
 2161         penv_xmm->en_sw = penv_fpreg->en_sw;
 2162         penv_xmm->en_tw = penv_fpreg->en_tw;
 2163         penv_xmm->en_opcode = penv_fpreg->en_opcode;
 2164         penv_xmm->en_rip = penv_fpreg->en_rip;
 2165         penv_xmm->en_rdp = penv_fpreg->en_rdp;
 2166         penv_xmm->en_mxcsr = penv_fpreg->en_mxcsr;
 2167         penv_xmm->en_mxcsr_mask = penv_fpreg->en_mxcsr_mask & cpu_mxcsr_mask;
 2168 
 2169         /* FPU registers */
 2170         for (i = 0; i < 8; ++i)
 2171                 bcopy(fpregs->fpr_acc[i], sv_xmm->sv_fp[i].fp_acc.fp_bytes, 10);
 2172 
 2173         /* SSE registers */
 2174         for (i = 0; i < 16; ++i)
 2175                 bcopy(fpregs->fpr_xacc[i], sv_xmm->sv_xmm[i].xmm_bytes, 16);
 2176 }
 2177 
 2178 /* externalize from td->pcb */
 2179 int
 2180 fill_fpregs(struct thread *td, struct fpreg *fpregs)
 2181 {
 2182 
 2183         KASSERT(td == curthread || TD_IS_SUSPENDED(td) ||
 2184             P_SHOULDSTOP(td->td_proc),
 2185             ("not suspended thread %p", td));
 2186         fpugetregs(td);
 2187         fill_fpregs_xmm(get_pcb_user_save_td(td), fpregs);
 2188         return (0);
 2189 }
 2190 
 2191 /* internalize to td->pcb */
 2192 int
 2193 set_fpregs(struct thread *td, struct fpreg *fpregs)
 2194 {
 2195 
 2196         critical_enter();
 2197         set_fpregs_xmm(fpregs, get_pcb_user_save_td(td));
 2198         fpuuserinited(td);
 2199         critical_exit();
 2200         return (0);
 2201 }
 2202 
/*
 * Get machine context.
 *
 * Snapshot the thread's user-visible register state from its trapframe
 * into *mcp, along with the FPU state and %fs/%gs bases.  With
 * GET_MC_CLEAR_RET in flags, the syscall return registers (%rax, %rdx)
 * and the carry flag are scrubbed from the snapshot.  Always returns 0.
 */
int
get_mcontext(struct thread *td, mcontext_t *mcp, int flags)
{
	struct pcb *pcb;
	struct trapframe *tp;

	pcb = td->td_pcb;
	tp = td->td_frame;
	/* sigonstack() is evaluated under the proc lock. */
	PROC_LOCK(curthread->td_proc);
	mcp->mc_onstack = sigonstack(tp->tf_rsp);
	PROC_UNLOCK(curthread->td_proc);
	mcp->mc_r15 = tp->tf_r15;
	mcp->mc_r14 = tp->tf_r14;
	mcp->mc_r13 = tp->tf_r13;
	mcp->mc_r12 = tp->tf_r12;
	mcp->mc_r11 = tp->tf_r11;
	mcp->mc_r10 = tp->tf_r10;
	mcp->mc_r9  = tp->tf_r9;
	mcp->mc_r8  = tp->tf_r8;
	mcp->mc_rdi = tp->tf_rdi;
	mcp->mc_rsi = tp->tf_rsi;
	mcp->mc_rbp = tp->tf_rbp;
	mcp->mc_rbx = tp->tf_rbx;
	mcp->mc_rcx = tp->tf_rcx;
	mcp->mc_rflags = tp->tf_rflags;
	if (flags & GET_MC_CLEAR_RET) {
		/* Hide the system call return values when requested. */
		mcp->mc_rax = 0;
		mcp->mc_rdx = 0;
		mcp->mc_rflags &= ~PSL_C;
	} else {
		mcp->mc_rax = tp->tf_rax;
		mcp->mc_rdx = tp->tf_rdx;
	}
	mcp->mc_rip = tp->tf_rip;
	mcp->mc_cs = tp->tf_cs;
	mcp->mc_rsp = tp->tf_rsp;
	mcp->mc_ss = tp->tf_ss;
	mcp->mc_ds = tp->tf_ds;
	mcp->mc_es = tp->tf_es;
	mcp->mc_fs = tp->tf_fs;
	mcp->mc_gs = tp->tf_gs;
	mcp->mc_flags = tp->tf_flags;
	mcp->mc_len = sizeof(*mcp);
	/* No extended FPU state buffer is supplied in this path. */
	get_fpcontext(td, mcp, NULL, 0);
	/*
	 * update_pcb_bases() presumably syncs pcb_{fs,gs}base with the
	 * live base registers before they are copied out -- confirm.
	 */
	update_pcb_bases(pcb);
	mcp->mc_fsbase = pcb->pcb_fsbase;
	mcp->mc_gsbase = pcb->pcb_gsbase;
	mcp->mc_xfpustate = 0;
	mcp->mc_xfpustate_len = 0;
	bzero(mcp->mc_spare, sizeof(mcp->mc_spare));
	return (0);
}
 2258 
/*
 * Set machine context.
 *
 * However, we don't set any but the user modifiable flags, and we won't
 * touch the cs selector.
 *
 * Returns EINVAL for a malformed context, or the error from copying in
 * or installing the extended FPU state; the trapframe is only modified
 * after the FPU state has been accepted.
 */
int
set_mcontext(struct thread *td, mcontext_t *mcp)
{
	struct pcb *pcb;
	struct trapframe *tp;
	char *xfpustate;
	long rflags;
	int ret;

	pcb = td->td_pcb;
	tp = td->td_frame;
	/* Reject wrong-sized contexts and unknown flag bits. */
	if (mcp->mc_len != sizeof(*mcp) ||
	    (mcp->mc_flags & ~_MC_FLAG_MASK) != 0)
		return (EINVAL);
	/* Merge only the user-changeable rflags bits into the frame. */
	rflags = (mcp->mc_rflags & PSL_USERCHANGE) |
	    (tp->tf_rflags & ~PSL_USERCHANGE);
	if (mcp->mc_flags & _MC_HASFPXSTATE) {
		/* Bound the length before the stack allocation below. */
		if (mcp->mc_xfpustate_len > cpu_max_ext_state_size -
		    sizeof(struct savefpu))
			return (EINVAL);
		xfpustate = __builtin_alloca(mcp->mc_xfpustate_len);
		ret = copyin((void *)mcp->mc_xfpustate, xfpustate,
		    mcp->mc_xfpustate_len);
		if (ret != 0)
			return (ret);
	} else
		xfpustate = NULL;
	/* Install the FPU state first; bail before touching the frame. */
	ret = set_fpcontext(td, mcp, xfpustate, mcp->mc_xfpustate_len);
	if (ret != 0)
		return (ret);
	tp->tf_r15 = mcp->mc_r15;
	tp->tf_r14 = mcp->mc_r14;
	tp->tf_r13 = mcp->mc_r13;
	tp->tf_r12 = mcp->mc_r12;
	tp->tf_r11 = mcp->mc_r11;
	tp->tf_r10 = mcp->mc_r10;
	tp->tf_r9  = mcp->mc_r9;
	tp->tf_r8  = mcp->mc_r8;
	tp->tf_rdi = mcp->mc_rdi;
	tp->tf_rsi = mcp->mc_rsi;
	tp->tf_rbp = mcp->mc_rbp;
	tp->tf_rbx = mcp->mc_rbx;
	tp->tf_rdx = mcp->mc_rdx;
	tp->tf_rcx = mcp->mc_rcx;
	tp->tf_rax = mcp->mc_rax;
	tp->tf_rip = mcp->mc_rip;
	tp->tf_rflags = rflags;
	tp->tf_rsp = mcp->mc_rsp;
	tp->tf_ss = mcp->mc_ss;
	tp->tf_flags = mcp->mc_flags;
	if (tp->tf_flags & TF_HASSEGS) {
		tp->tf_ds = mcp->mc_ds;
		tp->tf_es = mcp->mc_es;
		tp->tf_fs = mcp->mc_fs;
		tp->tf_gs = mcp->mc_gs;
	}
	set_pcb_flags(pcb, PCB_FULL_IRET);
	if (mcp->mc_flags & _MC_HASBASES) {
		pcb->pcb_fsbase = mcp->mc_fsbase;
		pcb->pcb_gsbase = mcp->mc_gsbase;
	}
	return (0);
}
 2328 
 2329 static void
 2330 get_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpusave,
 2331     size_t xfpusave_len)
 2332 {
 2333         size_t max_len, len;
 2334 
 2335         mcp->mc_ownedfp = fpugetregs(td);
 2336         bcopy(get_pcb_user_save_td(td), &mcp->mc_fpstate[0],
 2337             sizeof(mcp->mc_fpstate));
 2338         mcp->mc_fpformat = fpuformat();
 2339         if (!use_xsave || xfpusave_len == 0)
 2340                 return;
 2341         max_len = cpu_max_ext_state_size - sizeof(struct savefpu);
 2342         len = xfpusave_len;
 2343         if (len > max_len) {
 2344                 len = max_len;
 2345                 bzero(xfpusave + max_len, len - max_len);
 2346         }
 2347         mcp->mc_flags |= _MC_HASFPXSTATE;
 2348         mcp->mc_xfpustate_len = len;
 2349         bcopy(get_pcb_user_save_td(td) + 1, xfpusave, len);
 2350 }
 2351 
 2352 static int
 2353 set_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpustate,
 2354     size_t xfpustate_len)
 2355 {
 2356         int error;
 2357 
 2358         if (mcp->mc_fpformat == _MC_FPFMT_NODEV)
 2359                 return (0);
 2360         else if (mcp->mc_fpformat != _MC_FPFMT_XMM)
 2361                 return (EINVAL);
 2362         else if (mcp->mc_ownedfp == _MC_FPOWNED_NONE) {
 2363                 /* We don't care what state is left in the FPU or PCB. */
 2364                 fpstate_drop(td);
 2365                 error = 0;
 2366         } else if (mcp->mc_ownedfp == _MC_FPOWNED_FPU ||
 2367             mcp->mc_ownedfp == _MC_FPOWNED_PCB) {
 2368                 error = fpusetregs(td, (struct savefpu *)&mcp->mc_fpstate,
 2369                     xfpustate, xfpustate_len);
 2370         } else
 2371                 return (EINVAL);
 2372         return (error);
 2373 }
 2374 
/*
 * Discard the thread's FPU state: release hardware ownership if this
 * CPU's FPU currently belongs to td, then mark the pcb's FPU state as
 * uninitialized so the next FPU use starts from a clean slate.  Must be
 * called on a user-owned FPU context (asserted below).
 */
void
fpstate_drop(struct thread *td)
{

	KASSERT(PCB_USER_FPU(td->td_pcb), ("fpstate_drop: kernel-owned fpu"));
	/* Block context switches while we inspect/clear per-CPU FPU state. */
	critical_enter();
	if (PCPU_GET(fpcurthread) == td)
		fpudrop();
	/*
	 * XXX force a full drop of the fpu.  The above only drops it if we
	 * owned it.
	 *
	 * XXX I don't much like fpugetuserregs()'s semantics of doing a full
	 * drop.  Dropping only to the pcb matches fnsave's behaviour.
	 * We only need to drop to !PCB_INITDONE in sendsig().  But
	 * sendsig() is the only caller of fpugetuserregs()... perhaps we just
	 * have too many layers.
	 */
	clear_pcb_flags(curthread->td_pcb,
	    PCB_FPUINITDONE | PCB_USERFPUINITDONE);
	critical_exit();
}
 2397 
 2398 int
 2399 fill_dbregs(struct thread *td, struct dbreg *dbregs)
 2400 {
 2401         struct pcb *pcb;
 2402 
 2403         if (td == NULL) {
 2404                 dbregs->dr[0] = rdr0();
 2405                 dbregs->dr[1] = rdr1();
 2406                 dbregs->dr[2] = rdr2();
 2407                 dbregs->dr[3] = rdr3();
 2408                 dbregs->dr[6] = rdr6();
 2409                 dbregs->dr[7] = rdr7();
 2410         } else {
 2411                 pcb = td->td_pcb;
 2412                 dbregs->dr[0] = pcb->pcb_dr0;
 2413                 dbregs->dr[1] = pcb->pcb_dr1;
 2414                 dbregs->dr[2] = pcb->pcb_dr2;
 2415                 dbregs->dr[3] = pcb->pcb_dr3;
 2416                 dbregs->dr[6] = pcb->pcb_dr6;
 2417                 dbregs->dr[7] = pcb->pcb_dr7;
 2418         }
 2419         dbregs->dr[4] = 0;
 2420         dbregs->dr[5] = 0;
 2421         dbregs->dr[8] = 0;
 2422         dbregs->dr[9] = 0;
 2423         dbregs->dr[10] = 0;
 2424         dbregs->dr[11] = 0;
 2425         dbregs->dr[12] = 0;
 2426         dbregs->dr[13] = 0;
 2427         dbregs->dr[14] = 0;
 2428         dbregs->dr[15] = 0;
 2429         return (0);
 2430 }
 2431 
 2432 int
 2433 set_dbregs(struct thread *td, struct dbreg *dbregs)
 2434 {
 2435         struct pcb *pcb;
 2436         int i;
 2437 
 2438         if (td == NULL) {
 2439                 load_dr0(dbregs->dr[0]);
 2440                 load_dr1(dbregs->dr[1]);
 2441                 load_dr2(dbregs->dr[2]);
 2442                 load_dr3(dbregs->dr[3]);
 2443                 load_dr6(dbregs->dr[6]);
 2444                 load_dr7(dbregs->dr[7]);
 2445         } else {
 2446                 /*
 2447                  * Don't let an illegal value for dr7 get set.  Specifically,
 2448                  * check for undefined settings.  Setting these bit patterns
 2449                  * result in undefined behaviour and can lead to an unexpected
 2450                  * TRCTRAP or a general protection fault right here.
 2451                  * Upper bits of dr6 and dr7 must not be set
 2452                  */
 2453                 for (i = 0; i < 4; i++) {
 2454                         if (DBREG_DR7_ACCESS(dbregs->dr[7], i) == 0x02)
 2455                                 return (EINVAL);
 2456                         if (td->td_frame->tf_cs == _ucode32sel &&
 2457                             DBREG_DR7_LEN(dbregs->dr[7], i) == DBREG_DR7_LEN_8)
 2458                                 return (EINVAL);
 2459                 }
 2460                 if ((dbregs->dr[6] & 0xffffffff00000000ul) != 0 ||
 2461                     (dbregs->dr[7] & 0xffffffff00000000ul) != 0)
 2462                         return (EINVAL);
 2463 
 2464                 pcb = td->td_pcb;
 2465 
 2466                 /*
 2467                  * Don't let a process set a breakpoint that is not within the
 2468                  * process's address space.  If a process could do this, it
 2469                  * could halt the system by setting a breakpoint in the kernel
 2470                  * (if ddb was enabled).  Thus, we need to check to make sure
 2471                  * that no breakpoints are being enabled for addresses outside
 2472                  * process's address space.
 2473                  *
 2474                  * XXX - what about when the watched area of the user's
 2475                  * address space is written into from within the kernel
 2476                  * ... wouldn't that still cause a breakpoint to be generated
 2477                  * from within kernel mode?
 2478                  */
 2479 
 2480                 if (DBREG_DR7_ENABLED(dbregs->dr[7], 0)) {
 2481                         /* dr0 is enabled */
 2482                         if (dbregs->dr[0] >= VM_MAXUSER_ADDRESS)
 2483                                 return (EINVAL);
 2484                 }
 2485                 if (DBREG_DR7_ENABLED(dbregs->dr[7], 1)) {
 2486                         /* dr1 is enabled */
 2487                         if (dbregs->dr[1] >= VM_MAXUSER_ADDRESS)
 2488                                 return (EINVAL);
 2489                 }
 2490                 if (DBREG_DR7_ENABLED(dbregs->dr[7], 2)) {
 2491                         /* dr2 is enabled */
 2492                         if (dbregs->dr[2] >= VM_MAXUSER_ADDRESS)
 2493                                 return (EINVAL);
 2494                 }
 2495                 if (DBREG_DR7_ENABLED(dbregs->dr[7], 3)) {
 2496                         /* dr3 is enabled */
 2497                         if (dbregs->dr[3] >= VM_MAXUSER_ADDRESS)
 2498                                 return (EINVAL);
 2499                 }
 2500 
 2501                 pcb->pcb_dr0 = dbregs->dr[0];
 2502                 pcb->pcb_dr1 = dbregs->dr[1];
 2503                 pcb->pcb_dr2 = dbregs->dr[2];
 2504                 pcb->pcb_dr3 = dbregs->dr[3];
 2505                 pcb->pcb_dr6 = dbregs->dr[6];
 2506                 pcb->pcb_dr7 = dbregs->dr[7];
 2507 
 2508                 set_pcb_flags(pcb, PCB_DBREGS);
 2509         }
 2510 
 2511         return (0);
 2512 }
 2513 
/*
 * Clear all hardware debug registers on the current CPU.  dr7 is
 * cleared first so every breakpoint is disabled before the address
 * registers change.
 */
void
reset_dbregs(void)
{

	load_dr7(0);	/* Turn off the control bits first */
	load_dr0(0);
	load_dr1(0);
	load_dr2(0);
	load_dr3(0);
	load_dr6(0);
}
 2525 
 2526 /*
 2527  * Return > 0 if a hardware breakpoint has been hit, and the
 2528  * breakpoint was in user space.  Return 0, otherwise.
 2529  */
 2530 int
 2531 user_dbreg_trap(register_t dr6)
 2532 {
 2533         u_int64_t dr7;
 2534         u_int64_t bp;       /* breakpoint bits extracted from dr6 */
 2535         int nbp;            /* number of breakpoints that triggered */
 2536         caddr_t addr[4];    /* breakpoint addresses */
 2537         int i;
 2538 
 2539         bp = dr6 & DBREG_DR6_BMASK;
 2540         if (bp == 0) {
 2541                 /*
 2542                  * None of the breakpoint bits are set meaning this
 2543                  * trap was not caused by any of the debug registers
 2544                  */
 2545                 return 0;
 2546         }
 2547 
 2548         dr7 = rdr7();
 2549         if ((dr7 & 0x000000ff) == 0) {
 2550                 /*
 2551                  * all GE and LE bits in the dr7 register are zero,
 2552                  * thus the trap couldn't have been caused by the
 2553                  * hardware debug registers
 2554                  */
 2555                 return 0;
 2556         }
 2557 
 2558         nbp = 0;
 2559 
 2560         /*
 2561          * at least one of the breakpoints were hit, check to see
 2562          * which ones and if any of them are user space addresses
 2563          */
 2564 
 2565         if (bp & 0x01) {
 2566                 addr[nbp++] = (caddr_t)rdr0();
 2567         }
 2568         if (bp & 0x02) {
 2569                 addr[nbp++] = (caddr_t)rdr1();
 2570         }
 2571         if (bp & 0x04) {
 2572                 addr[nbp++] = (caddr_t)rdr2();
 2573         }
 2574         if (bp & 0x08) {
 2575                 addr[nbp++] = (caddr_t)rdr3();
 2576         }
 2577 
 2578         for (i = 0; i < nbp; i++) {
 2579                 if (addr[i] < (caddr_t)VM_MAXUSER_ADDRESS) {
 2580                         /*
 2581                          * addr[i] is in user space
 2582                          */
 2583                         return nbp;
 2584                 }
 2585         }
 2586 
 2587         /*
 2588          * None of the breakpoints are in user space.
 2589          */
 2590         return 0;
 2591 }
 2592 
 2593 /*
 2594  * The pcb_flags is only modified by current thread, or by other threads
 2595  * when current thread is stopped.  However, current thread may change it
 2596  * from the interrupt context in cpu_switch(), or in the trap handler.
 2597  * When we read-modify-write pcb_flags from C sources, compiler may generate
 2598  * code that is not atomic regarding the interrupt handler.  If a trap or
 2599  * interrupt happens and any flag is modified from the handler, it can be
 2600  * clobbered with the cached value later.  Therefore, we implement setting
 2601  * and clearing flags with single-instruction functions, which do not race
 2602  * with possible modification of the flags from the trap or interrupt context,
 2603  * because traps and interrupts are executed only on instruction boundary.
 2604  */
/*
 * OR 'flags' into pcb->pcb_flags with a single "orl" instruction so the
 * read-modify-write cannot be split by a trap or interrupt that also
 * modifies the flags (see the rationale in the comment block above).
 */
void
set_pcb_flags_raw(struct pcb *pcb, const u_int flags)
{

	__asm __volatile("orl %1,%0"
	    : "=m" (pcb->pcb_flags) : "ir" (flags), "m" (pcb->pcb_flags)
	    : "cc", "memory");

}
 2614 
 2615 /*
 2616  * The support for RDFSBASE, WRFSBASE and similar instructions for %gs
 2617  * base requires that kernel saves MSR_FSBASE and MSR_{K,}GSBASE into
 2618  * pcb if user space modified the bases.  We must save on the context
 2619  * switch or if the return to usermode happens through the doreti.
 2620  *
 2621  * Tracking of both events is performed by the pcb flag PCB_FULL_IRET,
 2622  * which have a consequence that the base MSRs must be saved each time
 2623  * the PCB_FULL_IRET flag is set.  We disable interrupts to sync with
 2624  * context switches.
 2625  */
/*
 * Set pcb flags atomically with respect to traps/interrupts.  When
 * PCB_FULL_IRET is being newly set on the current thread's pcb and the
 * CPU supports FSGSBASE, the user-visible %fs/%gs base values are saved
 * into the pcb first (with interrupts disabled to sync with context
 * switches), since user space may have changed them directly via
 * WRFSBASE/WRGSBASE (see the comment block above).
 */
void
set_pcb_flags(struct pcb *pcb, const u_int flags)
{
	register_t r;

	if (curpcb == pcb &&
	    (flags & PCB_FULL_IRET) != 0 &&
	    (pcb->pcb_flags & PCB_FULL_IRET) == 0 &&
	    (cpu_stdext_feature & CPUID_STDEXT_FSGSBASE) != 0) {
		r = intr_disable();
		/* Re-check: an interrupt may have set the flag already. */
		if ((pcb->pcb_flags & PCB_FULL_IRET) == 0) {
			/* Only save bases still holding user selectors. */
			if (rfs() == _ufssel)
				pcb->pcb_fsbase = rdfsbase();
			if (rgs() == _ugssel)
				/*
				 * NOTE(review): while in the kernel the
				 * user %gs base is presumably held in
				 * MSR_KGSBASE (post-swapgs) — confirm
				 * against the exception entry code.
				 */
				pcb->pcb_gsbase = rdmsr(MSR_KGSBASE);
		}
		set_pcb_flags_raw(pcb, flags);
		intr_restore(r);
	} else {
		set_pcb_flags_raw(pcb, flags);
	}
}
 2648 
/*
 * Clear 'flags' in pcb->pcb_flags with a single "andl" of the
 * complement, so the read-modify-write cannot be split by a trap or
 * interrupt that also modifies the flags.
 */
void
clear_pcb_flags(struct pcb *pcb, const u_int flags)
{

	__asm __volatile("andl %1,%0"
	    : "=m" (pcb->pcb_flags) : "ir" (~flags), "m" (pcb->pcb_flags)
	    : "cc", "memory");
}
 2657 
 2658 #ifdef KDB
 2659 
 2660 /*
 2661  * Provide inb() and outb() as functions.  They are normally only available as
 2662  * inline functions, thus cannot be called from the debugger.
 2663  */
 2664 
 2665 /* silence compiler warnings */
 2666 u_char inb_(u_short);
 2667 void outb_(u_short, u_char);
 2668 
 2669 u_char
 2670 inb_(u_short port)
 2671 {
 2672         return inb(port);
 2673 }
 2674 
/* Debugger-callable wrapper: write byte 'data' to I/O port 'port'. */
void
outb_(u_short port, u_char data)
{
	outb(port, data);
}
 2680 
 2681 #endif /* KDB */

Cache object: 82046fd8e8b0a26f5503a2d9433a07a9


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.