The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/i386/i386/machdep.c

Version: -  FREEBSD  -  FREEBSD11  -  FREEBSD10  -  FREEBSD9  -  FREEBSD92  -  FREEBSD91  -  FREEBSD90  -  FREEBSD8  -  FREEBSD82  -  FREEBSD81  -  FREEBSD80  -  FREEBSD7  -  FREEBSD74  -  FREEBSD73  -  FREEBSD72  -  FREEBSD71  -  FREEBSD70  -  FREEBSD6  -  FREEBSD64  -  FREEBSD63  -  FREEBSD62  -  FREEBSD61  -  FREEBSD60  -  FREEBSD5  -  FREEBSD55  -  FREEBSD54  -  FREEBSD53  -  FREEBSD52  -  FREEBSD51  -  FREEBSD50  -  FREEBSD4  -  FREEBSD3  -  FREEBSD22  -  linux-2.6  -  linux-2.4.22  -  MK83  -  MK84  -  PLAN9  -  DFBSD  -  NETBSD  -  NETBSD5  -  NETBSD4  -  NETBSD3  -  NETBSD20  -  OPENBSD  -  xnu-517  -  xnu-792  -  xnu-792.6.70  -  xnu-1228  -  xnu-1456.1.26  -  xnu-1699.24.8  -  xnu-2050.18.24  -  OPENSOLARIS  -  minix-3-1-1 
SearchContext: -  none  -  3  -  10 

    1 /*-
    2  * Copyright (c) 1992 Terrence R. Lambert.
    3  * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
    4  * All rights reserved.
    5  *
    6  * This code is derived from software contributed to Berkeley by
    7  * William Jolitz.
    8  *
    9  * Redistribution and use in source and binary forms, with or without
   10  * modification, are permitted provided that the following conditions
   11  * are met:
   12  * 1. Redistributions of source code must retain the above copyright
   13  *    notice, this list of conditions and the following disclaimer.
   14  * 2. Redistributions in binary form must reproduce the above copyright
   15  *    notice, this list of conditions and the following disclaimer in the
   16  *    documentation and/or other materials provided with the distribution.
   17  * 3. All advertising materials mentioning features or use of this software
   18  *    must display the following acknowledgement:
   19  *      This product includes software developed by the University of
   20  *      California, Berkeley and its contributors.
   21  * 4. Neither the name of the University nor the names of its contributors
   22  *    may be used to endorse or promote products derived from this software
   23  *    without specific prior written permission.
   24  *
   25  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   26  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   27  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   28  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   29  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   30  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   31  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   32  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   33  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   34  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   35  * SUCH DAMAGE.
   36  *
   37  *      from: @(#)machdep.c     7.4 (Berkeley) 6/3/91
   38  */
   39 
   40 #include <sys/cdefs.h>
   41 __FBSDID("$FreeBSD: stable/10/sys/i386/i386/machdep.c 337245 2018-08-03 14:12:37Z kib $");
   42 
   43 #include "opt_apic.h"
   44 #include "opt_atalk.h"
   45 #include "opt_atpic.h"
   46 #include "opt_compat.h"
   47 #include "opt_cpu.h"
   48 #include "opt_ddb.h"
   49 #include "opt_inet.h"
   50 #include "opt_ipx.h"
   51 #include "opt_isa.h"
   52 #include "opt_kstack_pages.h"
   53 #include "opt_maxmem.h"
   54 #include "opt_mp_watchdog.h"
   55 #include "opt_npx.h"
   56 #include "opt_perfmon.h"
   57 #include "opt_platform.h"
   58 #include "opt_xbox.h"
   59 #include "opt_kdtrace.h"
   60 
   61 #include <sys/param.h>
   62 #include <sys/proc.h>
   63 #include <sys/systm.h>
   64 #include <sys/bio.h>
   65 #include <sys/buf.h>
   66 #include <sys/bus.h>
   67 #include <sys/callout.h>
   68 #include <sys/cons.h>
   69 #include <sys/cpu.h>
   70 #include <sys/eventhandler.h>
   71 #include <sys/exec.h>
   72 #include <sys/imgact.h>
   73 #include <sys/kdb.h>
   74 #include <sys/kernel.h>
   75 #include <sys/ktr.h>
   76 #include <sys/linker.h>
   77 #include <sys/lock.h>
   78 #include <sys/malloc.h>
   79 #include <sys/memrange.h>
   80 #include <sys/msgbuf.h>
   81 #include <sys/mutex.h>
   82 #include <sys/pcpu.h>
   83 #include <sys/ptrace.h>
   84 #include <sys/reboot.h>
   85 #include <sys/rwlock.h>
   86 #include <sys/sched.h>
   87 #include <sys/signalvar.h>
   88 #ifdef SMP
   89 #include <sys/smp.h>
   90 #endif
   91 #include <sys/syscallsubr.h>
   92 #include <sys/sysctl.h>
   93 #include <sys/sysent.h>
   94 #include <sys/sysproto.h>
   95 #include <sys/ucontext.h>
   96 #include <sys/vmmeter.h>
   97 
   98 #include <vm/vm.h>
   99 #include <vm/vm_extern.h>
  100 #include <vm/vm_kern.h>
  101 #include <vm/vm_page.h>
  102 #include <vm/vm_map.h>
  103 #include <vm/vm_object.h>
  104 #include <vm/vm_pager.h>
  105 #include <vm/vm_param.h>
  106 
  107 #ifdef DDB
  108 #ifndef KDB
  109 #error KDB must be enabled in order for DDB to work!
  110 #endif
  111 #include <ddb/ddb.h>
  112 #include <ddb/db_sym.h>
  113 #endif
  114 
  115 #ifdef PC98
  116 #include <pc98/pc98/pc98_machdep.h>
  117 #else
  118 #include <isa/rtc.h>
  119 #endif
  120 
  121 #include <net/netisr.h>
  122 
  123 #include <machine/bootinfo.h>
  124 #include <machine/clock.h>
  125 #include <machine/cpu.h>
  126 #include <machine/cputypes.h>
  127 #include <machine/intr_machdep.h>
  128 #include <x86/mca.h>
  129 #include <machine/md_var.h>
  130 #include <machine/metadata.h>
  131 #include <machine/mp_watchdog.h>
  132 #include <machine/pc/bios.h>
  133 #include <machine/pcb.h>
  134 #include <machine/pcb_ext.h>
  135 #include <machine/proc.h>
  136 #include <machine/reg.h>
  137 #include <machine/sigframe.h>
  138 #include <machine/specialreg.h>
  139 #include <machine/vm86.h>
  140 #ifdef PERFMON
  141 #include <machine/perfmon.h>
  142 #endif
  143 #ifdef SMP
  144 #include <machine/smp.h>
  145 #endif
  146 #ifdef FDT
  147 #include <x86/fdt.h>
  148 #endif
  149 
  150 #ifdef DEV_APIC
  151 #include <machine/apicvar.h>
  152 #endif
  153 
  154 #ifdef DEV_ISA
  155 #include <x86/isa/icu.h>
  156 #endif
  157 
  158 #ifdef XBOX
  159 #include <machine/xbox.h>
  160 
  161 int arch_i386_is_xbox = 0;
  162 uint32_t arch_i386_xbox_memsize = 0;
  163 #endif
  164 
  165 #ifdef XEN
  166 /* XEN includes */
  167 #include <xen/xen-os.h>
  168 #include <xen/hypervisor.h>
  169 #include <machine/xen/xenvar.h>
  170 #include <machine/xen/xenfunc.h>
  171 #include <xen/xen_intr.h>
  172 
  173 void Xhypervisor_callback(void);
  174 void failsafe_callback(void);
  175 
  176 extern trap_info_t trap_table[];
  177 struct proc_ldt default_proc_ldt;
  178 extern int init_first;
  179 int running_xen = 1;
  180 extern unsigned long physfree;
  181 #endif /* XEN */
  182 
  183 /* Sanity check for __curthread() */
  184 CTASSERT(offsetof(struct pcpu, pc_curthread) == 0);
  185 
  186 extern register_t init386(int first);
  187 extern void dblfault_handler(void);
  188 
  189 #define CS_SECURE(cs)           (ISPL(cs) == SEL_UPL)
  190 #define EFL_SECURE(ef, oef)     ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
  191 
  192 #if !defined(CPU_DISABLE_SSE) && defined(I686_CPU)
  193 #define CPU_ENABLE_SSE
  194 #endif
  195 
  196 static void cpu_startup(void *);
  197 static void fpstate_drop(struct thread *td);
  198 static void get_fpcontext(struct thread *td, mcontext_t *mcp,
  199     char *xfpusave, size_t xfpusave_len);
  200 static int  set_fpcontext(struct thread *td, mcontext_t *mcp,
  201     char *xfpustate, size_t xfpustate_len);
  202 #ifdef CPU_ENABLE_SSE
  203 static void set_fpregs_xmm(struct save87 *, struct savexmm *);
  204 static void fill_fpregs_xmm(struct savexmm *, struct save87 *);
  205 #endif /* CPU_ENABLE_SSE */
  206 SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL);
  207 
  208 #ifdef DDB
  209 extern vm_offset_t ksym_start, ksym_end;
  210 #endif
  211 
  212 /* Intel ICH registers */
  213 #define ICH_PMBASE      0x400
  214 #define ICH_SMI_EN      ICH_PMBASE + 0x30
  215 
  216 int     _udatasel, _ucodesel;
  217 u_int   basemem;
  218 
  219 #ifdef PC98
  220 int     need_pre_dma_flush;     /* If 1, use wbinvd before DMA transfer. */
  221 int     need_post_dma_flush;    /* If 1, use invd after DMA transfer. */
  222 
  223 static int      ispc98 = 1;
  224 SYSCTL_INT(_machdep, OID_AUTO, ispc98, CTLFLAG_RD, &ispc98, 0, "");
  225 #endif
  226 
  227 int cold = 1;
  228 
  229 #ifdef COMPAT_43
  230 static void osendsig(sig_t catcher, ksiginfo_t *, sigset_t *mask);
  231 #endif
  232 #ifdef COMPAT_FREEBSD4
  233 static void freebsd4_sendsig(sig_t catcher, ksiginfo_t *, sigset_t *mask);
  234 #endif
  235 
  236 long Maxmem = 0;
  237 long realmem = 0;
  238 
  239 #ifdef PAE
  240 FEATURE(pae, "Physical Address Extensions");
  241 #endif
  242 
  243 /*
  244  * The number of PHYSMAP entries must be one less than the number of
  245  * PHYSSEG entries because the PHYSMAP entry that spans the largest
  246  * physical address that is accessible by ISA DMA is split into two
  247  * PHYSSEG entries.
  248  */
  249 #define PHYSMAP_SIZE    (2 * (VM_PHYSSEG_MAX - 1))
  250 
  251 vm_paddr_t phys_avail[PHYSMAP_SIZE + 2];
  252 vm_paddr_t dump_avail[PHYSMAP_SIZE + 2];
  253 
  254 /* must be 2 less so 0 0 can signal end of chunks */
  255 #define PHYS_AVAIL_ARRAY_END ((sizeof(phys_avail) / sizeof(phys_avail[0])) - 2)
  256 #define DUMP_AVAIL_ARRAY_END ((sizeof(dump_avail) / sizeof(dump_avail[0])) - 2)
  257 
  258 struct kva_md_info kmi;
  259 
  260 static struct trapframe proc0_tf;
  261 struct pcpu __pcpu[MAXCPU];
  262 
  263 struct mtx icu_lock;
  264 
  265 struct mem_range_softc mem_range_softc;
  266 
  267 static void
  268 cpu_startup(dummy)
  269         void *dummy;
  270 {
  271         uintmax_t memsize;
  272         char *sysenv;
  273 
  274 #ifndef PC98
  275         /*
  276          * On MacBooks, we need to disallow the legacy USB circuit to
  277          * generate an SMI# because this can cause several problems,
  278          * namely: incorrect CPU frequency detection and failure to
  279          * start the APs.
  280          * We do this by disabling a bit in the SMI_EN (SMI Control and
  281          * Enable register) of the Intel ICH LPC Interface Bridge.
  282          */
  283         sysenv = getenv("smbios.system.product");
  284         if (sysenv != NULL) {
  285                 if (strncmp(sysenv, "MacBook1,1", 10) == 0 ||
  286                     strncmp(sysenv, "MacBook3,1", 10) == 0 ||
  287                     strncmp(sysenv, "MacBook4,1", 10) == 0 ||
  288                     strncmp(sysenv, "MacBookPro1,1", 13) == 0 ||
  289                     strncmp(sysenv, "MacBookPro1,2", 13) == 0 ||
  290                     strncmp(sysenv, "MacBookPro3,1", 13) == 0 ||
  291                     strncmp(sysenv, "MacBookPro4,1", 13) == 0 ||
  292                     strncmp(sysenv, "Macmini1,1", 10) == 0) {
  293                         if (bootverbose)
  294                                 printf("Disabling LEGACY_USB_EN bit on "
  295                                     "Intel ICH.\n");
  296                         outl(ICH_SMI_EN, inl(ICH_SMI_EN) & ~0x8);
  297                 }
  298                 freeenv(sysenv);
  299         }
  300 #endif /* !PC98 */
  301 
  302         /*
  303          * Good {morning,afternoon,evening,night}.
  304          */
  305         startrtclock();
  306         printcpuinfo();
  307         panicifcpuunsupported();
  308 #ifdef PERFMON
  309         perfmon_init();
  310 #endif
  311 
  312         /*
  313          * Display physical memory if SMBIOS reports reasonable amount.
  314          */
  315         memsize = 0;
  316         sysenv = getenv("smbios.memory.enabled");
  317         if (sysenv != NULL) {
  318                 memsize = (uintmax_t)strtoul(sysenv, (char **)NULL, 10) << 10;
  319                 freeenv(sysenv);
  320         }
  321         if (memsize < ptoa((uintmax_t)cnt.v_free_count))
  322                 memsize = ptoa((uintmax_t)Maxmem);
  323         printf("real memory  = %ju (%ju MB)\n", memsize, memsize >> 20);
  324         realmem = atop(memsize);
  325 
  326         /*
  327          * Display any holes after the first chunk of extended memory.
  328          */
  329         if (bootverbose) {
  330                 int indx;
  331 
  332                 printf("Physical memory chunk(s):\n");
  333                 for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) {
  334                         vm_paddr_t size;
  335 
  336                         size = phys_avail[indx + 1] - phys_avail[indx];
  337                         printf(
  338                             "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n",
  339                             (uintmax_t)phys_avail[indx],
  340                             (uintmax_t)phys_avail[indx + 1] - 1,
  341                             (uintmax_t)size, (uintmax_t)size / PAGE_SIZE);
  342                 }
  343         }
  344 
  345         vm_ksubmap_init(&kmi);
  346 
  347         printf("avail memory = %ju (%ju MB)\n",
  348             ptoa((uintmax_t)cnt.v_free_count),
  349             ptoa((uintmax_t)cnt.v_free_count) / 1048576);
  350 
  351         /*
  352          * Set up buffers, so they can be used to read disk labels.
  353          */
  354         bufinit();
  355         vm_pager_bufferinit();
  356 #ifndef XEN
  357         cpu_setregs();
  358 #endif
  359 }
  360 
  361 /*
  362  * Send an interrupt to process.
  363  *
  364  * Stack is set up to allow sigcode stored
  365  * at top to call routine, followed by call
  366  * to sigreturn routine below.  After sigreturn
  367  * resets the signal mask, the stack, and the
  368  * frame pointer, it returns to the user
  369  * specified pc, psl.
  370  */
  371 #ifdef COMPAT_43
  372 static void
  373 osendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
  374 {
  375         struct osigframe sf, *fp;
  376         struct proc *p;
  377         struct thread *td;
  378         struct sigacts *psp;
  379         struct trapframe *regs;
  380         int sig;
  381         int oonstack;
  382 
  383         td = curthread;
  384         p = td->td_proc;
  385         PROC_LOCK_ASSERT(p, MA_OWNED);
  386         sig = ksi->ksi_signo;
  387         psp = p->p_sigacts;
  388         mtx_assert(&psp->ps_mtx, MA_OWNED);
  389         regs = td->td_frame;
  390         oonstack = sigonstack(regs->tf_esp);
  391 
  392         /* Allocate space for the signal handler context. */
  393         if ((td->td_pflags & TDP_ALTSTACK) && !oonstack &&
  394             SIGISMEMBER(psp->ps_sigonstack, sig)) {
  395                 fp = (struct osigframe *)(td->td_sigstk.ss_sp +
  396                     td->td_sigstk.ss_size - sizeof(struct osigframe));
  397 #if defined(COMPAT_43)
  398                 td->td_sigstk.ss_flags |= SS_ONSTACK;
  399 #endif
  400         } else
  401                 fp = (struct osigframe *)regs->tf_esp - 1;
  402 
  403         /* Build the argument list for the signal handler. */
  404         sf.sf_signum = sig;
  405         sf.sf_scp = (register_t)&fp->sf_siginfo.si_sc;
  406         bzero(&sf.sf_siginfo, sizeof(sf.sf_siginfo));
  407         if (SIGISMEMBER(psp->ps_siginfo, sig)) {
  408                 /* Signal handler installed with SA_SIGINFO. */
  409                 sf.sf_arg2 = (register_t)&fp->sf_siginfo;
  410                 sf.sf_siginfo.si_signo = sig;
  411                 sf.sf_siginfo.si_code = ksi->ksi_code;
  412                 sf.sf_ahu.sf_action = (__osiginfohandler_t *)catcher;
  413                 sf.sf_addr = 0;
  414         } else {
  415                 /* Old FreeBSD-style arguments. */
  416                 sf.sf_arg2 = ksi->ksi_code;
  417                 sf.sf_addr = (register_t)ksi->ksi_addr;
  418                 sf.sf_ahu.sf_handler = catcher;
  419         }
  420         mtx_unlock(&psp->ps_mtx);
  421         PROC_UNLOCK(p);
  422 
  423         /* Save most if not all of trap frame. */
  424         sf.sf_siginfo.si_sc.sc_eax = regs->tf_eax;
  425         sf.sf_siginfo.si_sc.sc_ebx = regs->tf_ebx;
  426         sf.sf_siginfo.si_sc.sc_ecx = regs->tf_ecx;
  427         sf.sf_siginfo.si_sc.sc_edx = regs->tf_edx;
  428         sf.sf_siginfo.si_sc.sc_esi = regs->tf_esi;
  429         sf.sf_siginfo.si_sc.sc_edi = regs->tf_edi;
  430         sf.sf_siginfo.si_sc.sc_cs = regs->tf_cs;
  431         sf.sf_siginfo.si_sc.sc_ds = regs->tf_ds;
  432         sf.sf_siginfo.si_sc.sc_ss = regs->tf_ss;
  433         sf.sf_siginfo.si_sc.sc_es = regs->tf_es;
  434         sf.sf_siginfo.si_sc.sc_fs = regs->tf_fs;
  435         sf.sf_siginfo.si_sc.sc_gs = rgs();
  436         sf.sf_siginfo.si_sc.sc_isp = regs->tf_isp;
  437 
  438         /* Build the signal context to be used by osigreturn(). */
  439         sf.sf_siginfo.si_sc.sc_onstack = (oonstack) ? 1 : 0;
  440         SIG2OSIG(*mask, sf.sf_siginfo.si_sc.sc_mask);
  441         sf.sf_siginfo.si_sc.sc_sp = regs->tf_esp;
  442         sf.sf_siginfo.si_sc.sc_fp = regs->tf_ebp;
  443         sf.sf_siginfo.si_sc.sc_pc = regs->tf_eip;
  444         sf.sf_siginfo.si_sc.sc_ps = regs->tf_eflags;
  445         sf.sf_siginfo.si_sc.sc_trapno = regs->tf_trapno;
  446         sf.sf_siginfo.si_sc.sc_err = regs->tf_err;
  447 
  448         /*
  449          * If we're a vm86 process, we want to save the segment registers.
  450          * We also change eflags to be our emulated eflags, not the actual
  451          * eflags.
  452          */
  453         if (regs->tf_eflags & PSL_VM) {
  454                 /* XXX confusing names: `tf' isn't a trapframe; `regs' is. */
  455                 struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
  456                 struct vm86_kernel *vm86 = &td->td_pcb->pcb_ext->ext_vm86;
  457 
  458                 sf.sf_siginfo.si_sc.sc_gs = tf->tf_vm86_gs;
  459                 sf.sf_siginfo.si_sc.sc_fs = tf->tf_vm86_fs;
  460                 sf.sf_siginfo.si_sc.sc_es = tf->tf_vm86_es;
  461                 sf.sf_siginfo.si_sc.sc_ds = tf->tf_vm86_ds;
  462 
  463                 if (vm86->vm86_has_vme == 0)
  464                         sf.sf_siginfo.si_sc.sc_ps =
  465                             (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) |
  466                             (vm86->vm86_eflags & (PSL_VIF | PSL_VIP));
  467 
  468                 /* See sendsig() for comments. */
  469                 tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP);
  470         }
  471 
  472         /*
  473          * Copy the sigframe out to the user's stack.
  474          */
  475         if (copyout(&sf, fp, sizeof(*fp)) != 0) {
  476 #ifdef DEBUG
  477                 printf("process %ld has trashed its stack\n", (long)p->p_pid);
  478 #endif
  479                 PROC_LOCK(p);
  480                 sigexit(td, SIGILL);
  481         }
  482 
  483         regs->tf_esp = (int)fp;
  484         if (p->p_sysent->sv_sigcode_base != 0) {
  485                 regs->tf_eip = p->p_sysent->sv_sigcode_base + szsigcode -
  486                     szosigcode;
  487         } else {
  488                 /* a.out sysentvec does not use shared page */
  489                 regs->tf_eip = p->p_sysent->sv_psstrings - szosigcode;
  490         }
  491         regs->tf_eflags &= ~(PSL_T | PSL_D);
  492         regs->tf_cs = _ucodesel;
  493         regs->tf_ds = _udatasel;
  494         regs->tf_es = _udatasel;
  495         regs->tf_fs = _udatasel;
  496         load_gs(_udatasel);
  497         regs->tf_ss = _udatasel;
  498         PROC_LOCK(p);
  499         mtx_lock(&psp->ps_mtx);
  500 }
  501 #endif /* COMPAT_43 */
  502 
  503 #ifdef COMPAT_FREEBSD4
  504 static void
  505 freebsd4_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
  506 {
  507         struct sigframe4 sf, *sfp;
  508         struct proc *p;
  509         struct thread *td;
  510         struct sigacts *psp;
  511         struct trapframe *regs;
  512         int sig;
  513         int oonstack;
  514 
  515         td = curthread;
  516         p = td->td_proc;
  517         PROC_LOCK_ASSERT(p, MA_OWNED);
  518         sig = ksi->ksi_signo;
  519         psp = p->p_sigacts;
  520         mtx_assert(&psp->ps_mtx, MA_OWNED);
  521         regs = td->td_frame;
  522         oonstack = sigonstack(regs->tf_esp);
  523 
  524         /* Save user context. */
  525         bzero(&sf, sizeof(sf));
  526         sf.sf_uc.uc_sigmask = *mask;
  527         sf.sf_uc.uc_stack = td->td_sigstk;
  528         sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
  529             ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE;
  530         sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0;
  531         sf.sf_uc.uc_mcontext.mc_gs = rgs();
  532         bcopy(regs, &sf.sf_uc.uc_mcontext.mc_fs, sizeof(*regs));
  533         bzero(sf.sf_uc.uc_mcontext.mc_fpregs,
  534             sizeof(sf.sf_uc.uc_mcontext.mc_fpregs));
  535         bzero(sf.sf_uc.uc_mcontext.__spare__,
  536             sizeof(sf.sf_uc.uc_mcontext.__spare__));
  537         bzero(sf.sf_uc.__spare__, sizeof(sf.sf_uc.__spare__));
  538 
  539         /* Allocate space for the signal handler context. */
  540         if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack &&
  541             SIGISMEMBER(psp->ps_sigonstack, sig)) {
  542                 sfp = (struct sigframe4 *)(td->td_sigstk.ss_sp +
  543                     td->td_sigstk.ss_size - sizeof(struct sigframe4));
  544 #if defined(COMPAT_43)
  545                 td->td_sigstk.ss_flags |= SS_ONSTACK;
  546 #endif
  547         } else
  548                 sfp = (struct sigframe4 *)regs->tf_esp - 1;
  549 
  550         /* Build the argument list for the signal handler. */
  551         sf.sf_signum = sig;
  552         sf.sf_ucontext = (register_t)&sfp->sf_uc;
  553         bzero(&sf.sf_si, sizeof(sf.sf_si));
  554         if (SIGISMEMBER(psp->ps_siginfo, sig)) {
  555                 /* Signal handler installed with SA_SIGINFO. */
  556                 sf.sf_siginfo = (register_t)&sfp->sf_si;
  557                 sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher;
  558 
  559                 /* Fill in POSIX parts */
  560                 sf.sf_si.si_signo = sig;
  561                 sf.sf_si.si_code = ksi->ksi_code;
  562                 sf.sf_si.si_addr = ksi->ksi_addr;
  563         } else {
  564                 /* Old FreeBSD-style arguments. */
  565                 sf.sf_siginfo = ksi->ksi_code;
  566                 sf.sf_addr = (register_t)ksi->ksi_addr;
  567                 sf.sf_ahu.sf_handler = catcher;
  568         }
  569         mtx_unlock(&psp->ps_mtx);
  570         PROC_UNLOCK(p);
  571 
  572         /*
  573          * If we're a vm86 process, we want to save the segment registers.
  574          * We also change eflags to be our emulated eflags, not the actual
  575          * eflags.
  576          */
  577         if (regs->tf_eflags & PSL_VM) {
  578                 struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
  579                 struct vm86_kernel *vm86 = &td->td_pcb->pcb_ext->ext_vm86;
  580 
  581                 sf.sf_uc.uc_mcontext.mc_gs = tf->tf_vm86_gs;
  582                 sf.sf_uc.uc_mcontext.mc_fs = tf->tf_vm86_fs;
  583                 sf.sf_uc.uc_mcontext.mc_es = tf->tf_vm86_es;
  584                 sf.sf_uc.uc_mcontext.mc_ds = tf->tf_vm86_ds;
  585 
  586                 if (vm86->vm86_has_vme == 0)
  587                         sf.sf_uc.uc_mcontext.mc_eflags =
  588                             (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) |
  589                             (vm86->vm86_eflags & (PSL_VIF | PSL_VIP));
  590 
  591                 /*
  592                  * Clear PSL_NT to inhibit T_TSSFLT faults on return from
  593                  * syscalls made by the signal handler.  This just avoids
  594                  * wasting time for our lazy fixup of such faults.  PSL_NT
  595                  * does nothing in vm86 mode, but vm86 programs can set it
  596                  * almost legitimately in probes for old cpu types.
  597                  */
  598                 tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP);
  599         }
  600 
  601         /*
  602          * Copy the sigframe out to the user's stack.
  603          */
  604         if (copyout(&sf, sfp, sizeof(*sfp)) != 0) {
  605 #ifdef DEBUG
  606                 printf("process %ld has trashed its stack\n", (long)p->p_pid);
  607 #endif
  608                 PROC_LOCK(p);
  609                 sigexit(td, SIGILL);
  610         }
  611 
  612         regs->tf_esp = (int)sfp;
  613         regs->tf_eip = p->p_sysent->sv_sigcode_base + szsigcode -
  614             szfreebsd4_sigcode;
  615         regs->tf_eflags &= ~(PSL_T | PSL_D);
  616         regs->tf_cs = _ucodesel;
  617         regs->tf_ds = _udatasel;
  618         regs->tf_es = _udatasel;
  619         regs->tf_fs = _udatasel;
  620         regs->tf_ss = _udatasel;
  621         PROC_LOCK(p);
  622         mtx_lock(&psp->ps_mtx);
  623 }
  624 #endif  /* COMPAT_FREEBSD4 */
  625 
  626 void
  627 sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
  628 {
  629         struct sigframe sf, *sfp;
  630         struct proc *p;
  631         struct thread *td;
  632         struct sigacts *psp;
  633         char *sp;
  634         struct trapframe *regs;
  635         struct segment_descriptor *sdp;
  636         char *xfpusave;
  637         size_t xfpusave_len;
  638         int sig;
  639         int oonstack;
  640 
  641         td = curthread;
  642         p = td->td_proc;
  643         PROC_LOCK_ASSERT(p, MA_OWNED);
  644         sig = ksi->ksi_signo;
  645         psp = p->p_sigacts;
  646         mtx_assert(&psp->ps_mtx, MA_OWNED);
  647 #ifdef COMPAT_FREEBSD4
  648         if (SIGISMEMBER(psp->ps_freebsd4, sig)) {
  649                 freebsd4_sendsig(catcher, ksi, mask);
  650                 return;
  651         }
  652 #endif
  653 #ifdef COMPAT_43
  654         if (SIGISMEMBER(psp->ps_osigset, sig)) {
  655                 osendsig(catcher, ksi, mask);
  656                 return;
  657         }
  658 #endif
  659         regs = td->td_frame;
  660         oonstack = sigonstack(regs->tf_esp);
  661 
  662 #ifdef CPU_ENABLE_SSE
  663         if (cpu_max_ext_state_size > sizeof(union savefpu) && use_xsave) {
  664                 xfpusave_len = cpu_max_ext_state_size - sizeof(union savefpu);
  665                 xfpusave = __builtin_alloca(xfpusave_len);
  666         } else {
  667 #else
  668         {
  669 #endif
  670                 xfpusave_len = 0;
  671                 xfpusave = NULL;
  672         }
  673 
  674         /* Save user context. */
  675         bzero(&sf, sizeof(sf));
  676         sf.sf_uc.uc_sigmask = *mask;
  677         sf.sf_uc.uc_stack = td->td_sigstk;
  678         sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
  679             ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE;
  680         sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0;
  681         sf.sf_uc.uc_mcontext.mc_gs = rgs();
  682         bcopy(regs, &sf.sf_uc.uc_mcontext.mc_fs, sizeof(*regs));
  683         sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext); /* magic */
  684         get_fpcontext(td, &sf.sf_uc.uc_mcontext, xfpusave, xfpusave_len);
  685         fpstate_drop(td);
  686         /*
  687          * Unconditionally fill the fsbase and gsbase into the mcontext.
  688          */
  689         sdp = &td->td_pcb->pcb_fsd;
  690         sf.sf_uc.uc_mcontext.mc_fsbase = sdp->sd_hibase << 24 |
  691             sdp->sd_lobase;
  692         sdp = &td->td_pcb->pcb_gsd;
  693         sf.sf_uc.uc_mcontext.mc_gsbase = sdp->sd_hibase << 24 |
  694             sdp->sd_lobase;
  695         bzero(sf.sf_uc.uc_mcontext.mc_spare2,
  696             sizeof(sf.sf_uc.uc_mcontext.mc_spare2));
  697         bzero(sf.sf_uc.__spare__, sizeof(sf.sf_uc.__spare__));
  698 
  699         /* Allocate space for the signal handler context. */
  700         if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack &&
  701             SIGISMEMBER(psp->ps_sigonstack, sig)) {
  702                 sp = td->td_sigstk.ss_sp + td->td_sigstk.ss_size;
  703 #if defined(COMPAT_43)
  704                 td->td_sigstk.ss_flags |= SS_ONSTACK;
  705 #endif
  706         } else
  707                 sp = (char *)regs->tf_esp - 128;
  708         if (xfpusave != NULL) {
  709                 sp -= xfpusave_len;
  710                 sp = (char *)((unsigned int)sp & ~0x3F);
  711                 sf.sf_uc.uc_mcontext.mc_xfpustate = (register_t)sp;
  712         }
  713         sp -= sizeof(struct sigframe);
  714 
  715         /* Align to 16 bytes. */
  716         sfp = (struct sigframe *)((unsigned int)sp & ~0xF);
  717 
  718         /* Translate the signal if appropriate. */
  719         if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize)
  720                 sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
  721 
  722         /* Build the argument list for the signal handler. */
  723         sf.sf_signum = sig;
  724         sf.sf_ucontext = (register_t)&sfp->sf_uc;
  725         bzero(&sf.sf_si, sizeof(sf.sf_si));
  726         if (SIGISMEMBER(psp->ps_siginfo, sig)) {
  727                 /* Signal handler installed with SA_SIGINFO. */
  728                 sf.sf_siginfo = (register_t)&sfp->sf_si;
  729                 sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher;
  730 
  731                 /* Fill in POSIX parts */
  732                 sf.sf_si = ksi->ksi_info;
  733                 sf.sf_si.si_signo = sig; /* maybe a translated signal */
  734         } else {
  735                 /* Old FreeBSD-style arguments. */
  736                 sf.sf_siginfo = ksi->ksi_code;
  737                 sf.sf_addr = (register_t)ksi->ksi_addr;
  738                 sf.sf_ahu.sf_handler = catcher;
  739         }
  740         mtx_unlock(&psp->ps_mtx);
  741         PROC_UNLOCK(p);
  742 
  743         /*
  744          * If we're a vm86 process, we want to save the segment registers.
  745          * We also change eflags to be our emulated eflags, not the actual
  746          * eflags.
  747          */
  748         if (regs->tf_eflags & PSL_VM) {
  749                 struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
  750                 struct vm86_kernel *vm86 = &td->td_pcb->pcb_ext->ext_vm86;
  751 
  752                 sf.sf_uc.uc_mcontext.mc_gs = tf->tf_vm86_gs;
  753                 sf.sf_uc.uc_mcontext.mc_fs = tf->tf_vm86_fs;
  754                 sf.sf_uc.uc_mcontext.mc_es = tf->tf_vm86_es;
  755                 sf.sf_uc.uc_mcontext.mc_ds = tf->tf_vm86_ds;
  756 
  757                 if (vm86->vm86_has_vme == 0)
  758                         sf.sf_uc.uc_mcontext.mc_eflags =
  759                             (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) |
  760                             (vm86->vm86_eflags & (PSL_VIF | PSL_VIP));
  761 
  762                 /*
  763                  * Clear PSL_NT to inhibit T_TSSFLT faults on return from
  764                  * syscalls made by the signal handler.  This just avoids
  765                  * wasting time for our lazy fixup of such faults.  PSL_NT
  766                  * does nothing in vm86 mode, but vm86 programs can set it
  767                  * almost legitimately in probes for old cpu types.
  768                  */
  769                 tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP);
  770         }
  771 
  772         /*
  773          * Copy the sigframe out to the user's stack.
  774          */
  775         if (copyout(&sf, sfp, sizeof(*sfp)) != 0 ||
  776             (xfpusave != NULL && copyout(xfpusave,
  777             (void *)sf.sf_uc.uc_mcontext.mc_xfpustate, xfpusave_len)
  778             != 0)) {
  779 #ifdef DEBUG
  780                 printf("process %ld has trashed its stack\n", (long)p->p_pid);
  781 #endif
  782                 PROC_LOCK(p);
  783                 sigexit(td, SIGILL);
  784         }
  785 
  786         regs->tf_esp = (int)sfp;
  787         regs->tf_eip = p->p_sysent->sv_sigcode_base;
  788         if (regs->tf_eip == 0)
  789                 regs->tf_eip = p->p_sysent->sv_psstrings - szsigcode;
  790         regs->tf_eflags &= ~(PSL_T | PSL_D);
  791         regs->tf_cs = _ucodesel;
  792         regs->tf_ds = _udatasel;
  793         regs->tf_es = _udatasel;
  794         regs->tf_fs = _udatasel;
  795         regs->tf_ss = _udatasel;
  796         PROC_LOCK(p);
  797         mtx_lock(&psp->ps_mtx);
  798 }
  799 
  800 /*
  801  * System call to cleanup state after a signal
  802  * has been taken.  Reset signal mask and
  803  * stack state from context left by sendsig (above).
  804  * Return to previous pc and psl as specified by
  805  * context left by sendsig. Check carefully to
  806  * make sure that the user has not modified the
  807  * state to gain improper privileges.
  808  *
  809  * MPSAFE
  810  */
  811 #ifdef COMPAT_43
  812 int
  813 osigreturn(td, uap)
  814         struct thread *td;
  815         struct osigreturn_args /* {
  816                 struct osigcontext *sigcntxp;
  817         } */ *uap;
  818 {
  819         struct osigcontext sc;
  820         struct trapframe *regs;
  821         struct osigcontext *scp;
  822         int eflags, error;
  823         ksiginfo_t ksi;
  824 
  825         regs = td->td_frame;
  826         error = copyin(uap->sigcntxp, &sc, sizeof(sc));
  827         if (error != 0)
  828                 return (error);
  829         scp = &sc;
  830         eflags = scp->sc_ps;
  831         if (eflags & PSL_VM) {
  832                 struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
  833                 struct vm86_kernel *vm86;
  834 
  835                 /*
  836                  * if pcb_ext == 0 or vm86_inited == 0, the user hasn't
  837                  * set up the vm86 area, and we can't enter vm86 mode.
  838                  */
  839                 if (td->td_pcb->pcb_ext == 0)
  840                         return (EINVAL);
  841                 vm86 = &td->td_pcb->pcb_ext->ext_vm86;
  842                 if (vm86->vm86_inited == 0)
  843                         return (EINVAL);
  844 
  845                 /* Go back to user mode if both flags are set. */
  846                 if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) {
  847                         ksiginfo_init_trap(&ksi);
  848                         ksi.ksi_signo = SIGBUS;
  849                         ksi.ksi_code = BUS_OBJERR;
  850                         ksi.ksi_addr = (void *)regs->tf_eip;
  851                         trapsignal(td, &ksi);
  852                 }
  853 
  854                 if (vm86->vm86_has_vme) {
  855                         eflags = (tf->tf_eflags & ~VME_USERCHANGE) |
  856                             (eflags & VME_USERCHANGE) | PSL_VM;
  857                 } else {
  858                         vm86->vm86_eflags = eflags;     /* save VIF, VIP */
  859                         eflags = (tf->tf_eflags & ~VM_USERCHANGE) |
  860                             (eflags & VM_USERCHANGE) | PSL_VM;
  861                 }
  862                 tf->tf_vm86_ds = scp->sc_ds;
  863                 tf->tf_vm86_es = scp->sc_es;
  864                 tf->tf_vm86_fs = scp->sc_fs;
  865                 tf->tf_vm86_gs = scp->sc_gs;
  866                 tf->tf_ds = _udatasel;
  867                 tf->tf_es = _udatasel;
  868                 tf->tf_fs = _udatasel;
  869         } else {
  870                 /*
  871                  * Don't allow users to change privileged or reserved flags.
  872                  */
  873                 if (!EFL_SECURE(eflags, regs->tf_eflags)) {
  874                         return (EINVAL);
  875                 }
  876 
  877                 /*
  878                  * Don't allow users to load a valid privileged %cs.  Let the
  879                  * hardware check for invalid selectors, excess privilege in
  880                  * other selectors, invalid %eip's and invalid %esp's.
  881                  */
  882                 if (!CS_SECURE(scp->sc_cs)) {
  883                         ksiginfo_init_trap(&ksi);
  884                         ksi.ksi_signo = SIGBUS;
  885                         ksi.ksi_code = BUS_OBJERR;
  886                         ksi.ksi_trapno = T_PROTFLT;
  887                         ksi.ksi_addr = (void *)regs->tf_eip;
  888                         trapsignal(td, &ksi);
  889                         return (EINVAL);
  890                 }
  891                 regs->tf_ds = scp->sc_ds;
  892                 regs->tf_es = scp->sc_es;
  893                 regs->tf_fs = scp->sc_fs;
  894         }
  895 
  896         /* Restore remaining registers. */
  897         regs->tf_eax = scp->sc_eax;
  898         regs->tf_ebx = scp->sc_ebx;
  899         regs->tf_ecx = scp->sc_ecx;
  900         regs->tf_edx = scp->sc_edx;
  901         regs->tf_esi = scp->sc_esi;
  902         regs->tf_edi = scp->sc_edi;
  903         regs->tf_cs = scp->sc_cs;
  904         regs->tf_ss = scp->sc_ss;
  905         regs->tf_isp = scp->sc_isp;
  906         regs->tf_ebp = scp->sc_fp;
  907         regs->tf_esp = scp->sc_sp;
  908         regs->tf_eip = scp->sc_pc;
  909         regs->tf_eflags = eflags;
  910 
  911 #if defined(COMPAT_43)
  912         if (scp->sc_onstack & 1)
  913                 td->td_sigstk.ss_flags |= SS_ONSTACK;
  914         else
  915                 td->td_sigstk.ss_flags &= ~SS_ONSTACK;
  916 #endif
  917         kern_sigprocmask(td, SIG_SETMASK, (sigset_t *)&scp->sc_mask, NULL,
  918             SIGPROCMASK_OLD);
  919         return (EJUSTRETURN);
  920 }
  921 #endif /* COMPAT_43 */
  922 
  923 #ifdef COMPAT_FREEBSD4
  924 /*
  925  * MPSAFE
  926  */
  927 int
  928 freebsd4_sigreturn(td, uap)
  929         struct thread *td;
  930         struct freebsd4_sigreturn_args /* {
  931                 const ucontext4 *sigcntxp;
  932         } */ *uap;
  933 {
  934         struct ucontext4 uc;
  935         struct trapframe *regs;
  936         struct ucontext4 *ucp;
  937         int cs, eflags, error;
  938         ksiginfo_t ksi;
  939 
  940         error = copyin(uap->sigcntxp, &uc, sizeof(uc));
  941         if (error != 0)
  942                 return (error);
  943         ucp = &uc;
  944         regs = td->td_frame;
  945         eflags = ucp->uc_mcontext.mc_eflags;
  946         if (eflags & PSL_VM) {
  947                 struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
  948                 struct vm86_kernel *vm86;
  949 
  950                 /*
  951                  * if pcb_ext == 0 or vm86_inited == 0, the user hasn't
  952                  * set up the vm86 area, and we can't enter vm86 mode.
  953                  */
  954                 if (td->td_pcb->pcb_ext == 0)
  955                         return (EINVAL);
  956                 vm86 = &td->td_pcb->pcb_ext->ext_vm86;
  957                 if (vm86->vm86_inited == 0)
  958                         return (EINVAL);
  959 
  960                 /* Go back to user mode if both flags are set. */
  961                 if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) {
  962                         ksiginfo_init_trap(&ksi);
  963                         ksi.ksi_signo = SIGBUS;
  964                         ksi.ksi_code = BUS_OBJERR;
  965                         ksi.ksi_addr = (void *)regs->tf_eip;
  966                         trapsignal(td, &ksi);
  967                 }
  968                 if (vm86->vm86_has_vme) {
  969                         eflags = (tf->tf_eflags & ~VME_USERCHANGE) |
  970                             (eflags & VME_USERCHANGE) | PSL_VM;
  971                 } else {
  972                         vm86->vm86_eflags = eflags;     /* save VIF, VIP */
  973                         eflags = (tf->tf_eflags & ~VM_USERCHANGE) |
  974                             (eflags & VM_USERCHANGE) | PSL_VM;
  975                 }
  976                 bcopy(&ucp->uc_mcontext.mc_fs, tf, sizeof(struct trapframe));
  977                 tf->tf_eflags = eflags;
  978                 tf->tf_vm86_ds = tf->tf_ds;
  979                 tf->tf_vm86_es = tf->tf_es;
  980                 tf->tf_vm86_fs = tf->tf_fs;
  981                 tf->tf_vm86_gs = ucp->uc_mcontext.mc_gs;
  982                 tf->tf_ds = _udatasel;
  983                 tf->tf_es = _udatasel;
  984                 tf->tf_fs = _udatasel;
  985         } else {
  986                 /*
  987                  * Don't allow users to change privileged or reserved flags.
  988                  */
  989                 if (!EFL_SECURE(eflags, regs->tf_eflags)) {
  990                         uprintf("pid %d (%s): freebsd4_sigreturn eflags = 0x%x\n",
  991                             td->td_proc->p_pid, td->td_name, eflags);
  992                         return (EINVAL);
  993                 }
  994 
  995                 /*
  996                  * Don't allow users to load a valid privileged %cs.  Let the
  997                  * hardware check for invalid selectors, excess privilege in
  998                  * other selectors, invalid %eip's and invalid %esp's.
  999                  */
 1000                 cs = ucp->uc_mcontext.mc_cs;
 1001                 if (!CS_SECURE(cs)) {
 1002                         uprintf("pid %d (%s): freebsd4_sigreturn cs = 0x%x\n",
 1003                             td->td_proc->p_pid, td->td_name, cs);
 1004                         ksiginfo_init_trap(&ksi);
 1005                         ksi.ksi_signo = SIGBUS;
 1006                         ksi.ksi_code = BUS_OBJERR;
 1007                         ksi.ksi_trapno = T_PROTFLT;
 1008                         ksi.ksi_addr = (void *)regs->tf_eip;
 1009                         trapsignal(td, &ksi);
 1010                         return (EINVAL);
 1011                 }
 1012 
 1013                 bcopy(&ucp->uc_mcontext.mc_fs, regs, sizeof(*regs));
 1014         }
 1015 
 1016 #if defined(COMPAT_43)
 1017         if (ucp->uc_mcontext.mc_onstack & 1)
 1018                 td->td_sigstk.ss_flags |= SS_ONSTACK;
 1019         else
 1020                 td->td_sigstk.ss_flags &= ~SS_ONSTACK;
 1021 #endif
 1022         kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0);
 1023         return (EJUSTRETURN);
 1024 }
 1025 #endif  /* COMPAT_FREEBSD4 */
 1026 
 1027 /*
 1028  * MPSAFE
 1029  */
 1030 int
 1031 sys_sigreturn(td, uap)
 1032         struct thread *td;
 1033         struct sigreturn_args /* {
 1034                 const struct __ucontext *sigcntxp;
 1035         } */ *uap;
 1036 {
 1037         ucontext_t uc;
 1038         struct proc *p;
 1039         struct trapframe *regs;
 1040         ucontext_t *ucp;
 1041         char *xfpustate;
 1042         size_t xfpustate_len;
 1043         int cs, eflags, error, ret;
 1044         ksiginfo_t ksi;
 1045 
 1046         p = td->td_proc;
 1047 
 1048         error = copyin(uap->sigcntxp, &uc, sizeof(uc));
 1049         if (error != 0)
 1050                 return (error);
 1051         ucp = &uc;
 1052         if ((ucp->uc_mcontext.mc_flags & ~_MC_FLAG_MASK) != 0) {
 1053                 uprintf("pid %d (%s): sigreturn mc_flags %x\n", p->p_pid,
 1054                     td->td_name, ucp->uc_mcontext.mc_flags);
 1055                 return (EINVAL);
 1056         }
 1057         regs = td->td_frame;
 1058         eflags = ucp->uc_mcontext.mc_eflags;
 1059         if (eflags & PSL_VM) {
 1060                 struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
 1061                 struct vm86_kernel *vm86;
 1062 
 1063                 /*
 1064                  * if pcb_ext == 0 or vm86_inited == 0, the user hasn't
 1065                  * set up the vm86 area, and we can't enter vm86 mode.
 1066                  */
 1067                 if (td->td_pcb->pcb_ext == 0)
 1068                         return (EINVAL);
 1069                 vm86 = &td->td_pcb->pcb_ext->ext_vm86;
 1070                 if (vm86->vm86_inited == 0)
 1071                         return (EINVAL);
 1072 
 1073                 /* Go back to user mode if both flags are set. */
 1074                 if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) {
 1075                         ksiginfo_init_trap(&ksi);
 1076                         ksi.ksi_signo = SIGBUS;
 1077                         ksi.ksi_code = BUS_OBJERR;
 1078                         ksi.ksi_addr = (void *)regs->tf_eip;
 1079                         trapsignal(td, &ksi);
 1080                 }
 1081 
 1082                 if (vm86->vm86_has_vme) {
 1083                         eflags = (tf->tf_eflags & ~VME_USERCHANGE) |
 1084                             (eflags & VME_USERCHANGE) | PSL_VM;
 1085                 } else {
 1086                         vm86->vm86_eflags = eflags;     /* save VIF, VIP */
 1087                         eflags = (tf->tf_eflags & ~VM_USERCHANGE) |
 1088                             (eflags & VM_USERCHANGE) | PSL_VM;
 1089                 }
 1090                 bcopy(&ucp->uc_mcontext.mc_fs, tf, sizeof(struct trapframe));
 1091                 tf->tf_eflags = eflags;
 1092                 tf->tf_vm86_ds = tf->tf_ds;
 1093                 tf->tf_vm86_es = tf->tf_es;
 1094                 tf->tf_vm86_fs = tf->tf_fs;
 1095                 tf->tf_vm86_gs = ucp->uc_mcontext.mc_gs;
 1096                 tf->tf_ds = _udatasel;
 1097                 tf->tf_es = _udatasel;
 1098                 tf->tf_fs = _udatasel;
 1099         } else {
 1100                 /*
 1101                  * Don't allow users to change privileged or reserved flags.
 1102                  */
 1103                 if (!EFL_SECURE(eflags, regs->tf_eflags)) {
 1104                         uprintf("pid %d (%s): sigreturn eflags = 0x%x\n",
 1105                             td->td_proc->p_pid, td->td_name, eflags);
 1106                         return (EINVAL);
 1107                 }
 1108 
 1109                 /*
 1110                  * Don't allow users to load a valid privileged %cs.  Let the
 1111                  * hardware check for invalid selectors, excess privilege in
 1112                  * other selectors, invalid %eip's and invalid %esp's.
 1113                  */
 1114                 cs = ucp->uc_mcontext.mc_cs;
 1115                 if (!CS_SECURE(cs)) {
 1116                         uprintf("pid %d (%s): sigreturn cs = 0x%x\n",
 1117                             td->td_proc->p_pid, td->td_name, cs);
 1118                         ksiginfo_init_trap(&ksi);
 1119                         ksi.ksi_signo = SIGBUS;
 1120                         ksi.ksi_code = BUS_OBJERR;
 1121                         ksi.ksi_trapno = T_PROTFLT;
 1122                         ksi.ksi_addr = (void *)regs->tf_eip;
 1123                         trapsignal(td, &ksi);
 1124                         return (EINVAL);
 1125                 }
 1126 
 1127                 if ((uc.uc_mcontext.mc_flags & _MC_HASFPXSTATE) != 0) {
 1128                         xfpustate_len = uc.uc_mcontext.mc_xfpustate_len;
 1129                         if (xfpustate_len > cpu_max_ext_state_size -
 1130                             sizeof(union savefpu)) {
 1131                                 uprintf(
 1132                             "pid %d (%s): sigreturn xfpusave_len = 0x%zx\n",
 1133                                     p->p_pid, td->td_name, xfpustate_len);
 1134                                 return (EINVAL);
 1135                         }
 1136                         xfpustate = __builtin_alloca(xfpustate_len);
 1137                         error = copyin((const void *)uc.uc_mcontext.mc_xfpustate,
 1138                             xfpustate, xfpustate_len);
 1139                         if (error != 0) {
 1140                                 uprintf(
 1141         "pid %d (%s): sigreturn copying xfpustate failed\n",
 1142                                     p->p_pid, td->td_name);
 1143                                 return (error);
 1144                         }
 1145                 } else {
 1146                         xfpustate = NULL;
 1147                         xfpustate_len = 0;
 1148                 }
 1149                 ret = set_fpcontext(td, &ucp->uc_mcontext, xfpustate,
 1150                     xfpustate_len);
 1151                 if (ret != 0)
 1152                         return (ret);
 1153                 bcopy(&ucp->uc_mcontext.mc_fs, regs, sizeof(*regs));
 1154         }
 1155 
 1156 #if defined(COMPAT_43)
 1157         if (ucp->uc_mcontext.mc_onstack & 1)
 1158                 td->td_sigstk.ss_flags |= SS_ONSTACK;
 1159         else
 1160                 td->td_sigstk.ss_flags &= ~SS_ONSTACK;
 1161 #endif
 1162 
 1163         kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0);
 1164         return (EJUSTRETURN);
 1165 }
 1166 
 1167 /*
 1168  * Machine dependent boot() routine
 1169  *
 1170  * I haven't seen anything to put here yet
 1171  * Possibly some stuff might be grafted back here from boot()
 1172  */
 1173 void
 1174 cpu_boot(int howto)
 1175 {
 1176 }
 1177 
 1178 /*
 1179  * Flush the D-cache for non-DMA I/O so that the I-cache can
 1180  * be made coherent later.
 1181  */
 1182 void
 1183 cpu_flush_dcache(void *ptr, size_t len)
 1184 {
 1185         /* Not applicable */
 1186 }
 1187 
 1188 /* Get current clock frequency for the given cpu id. */
 1189 int
 1190 cpu_est_clockrate(int cpu_id, uint64_t *rate)
 1191 {
 1192         uint64_t tsc1, tsc2;
 1193         uint64_t acnt, mcnt, perf;
 1194         register_t reg;
 1195 
 1196         if (pcpu_find(cpu_id) == NULL || rate == NULL)
 1197                 return (EINVAL);
 1198         if ((cpu_feature & CPUID_TSC) == 0)
 1199                 return (EOPNOTSUPP);
 1200 
 1201         /*
 1202          * If TSC is P-state invariant and APERF/MPERF MSRs do not exist,
 1203          * DELAY(9) based logic fails.
 1204          */
 1205         if (tsc_is_invariant && !tsc_perf_stat)
 1206                 return (EOPNOTSUPP);
 1207 
 1208 #ifdef SMP
 1209         if (smp_cpus > 1) {
 1210                 /* Schedule ourselves on the indicated cpu. */
 1211                 thread_lock(curthread);
 1212                 sched_bind(curthread, cpu_id);
 1213                 thread_unlock(curthread);
 1214         }
 1215 #endif
 1216 
 1217         /* Calibrate by measuring a short delay. */
 1218         reg = intr_disable();
 1219         if (tsc_is_invariant) {
 1220                 wrmsr(MSR_MPERF, 0);
 1221                 wrmsr(MSR_APERF, 0);
 1222                 tsc1 = rdtsc();
 1223                 DELAY(1000);
 1224                 mcnt = rdmsr(MSR_MPERF);
 1225                 acnt = rdmsr(MSR_APERF);
 1226                 tsc2 = rdtsc();
 1227                 intr_restore(reg);
 1228                 perf = 1000 * acnt / mcnt;
 1229                 *rate = (tsc2 - tsc1) * perf;
 1230         } else {
 1231                 tsc1 = rdtsc();
 1232                 DELAY(1000);
 1233                 tsc2 = rdtsc();
 1234                 intr_restore(reg);
 1235                 *rate = (tsc2 - tsc1) * 1000;
 1236         }
 1237 
 1238 #ifdef SMP
 1239         if (smp_cpus > 1) {
 1240                 thread_lock(curthread);
 1241                 sched_unbind(curthread);
 1242                 thread_unlock(curthread);
 1243         }
 1244 #endif
 1245 
 1246         return (0);
 1247 }
 1248 
 1249 #ifdef XEN
 1250 
 1251 static void
 1252 idle_block(void)
 1253 {
 1254 
 1255         HYPERVISOR_sched_op(SCHEDOP_block, 0);
 1256 }
 1257 
 1258 void
 1259 cpu_halt(void)
 1260 {
 1261         HYPERVISOR_shutdown(SHUTDOWN_poweroff);
 1262 }
 1263 
 1264 int scheduler_running;
 1265 
 1266 static void
 1267 cpu_idle_hlt(sbintime_t sbt)
 1268 {
 1269 
 1270         scheduler_running = 1;
 1271         enable_intr();
 1272         idle_block();
 1273 }
 1274 
 1275 #else
 1276 /*
 1277  * Shutdown the CPU as much as possible
 1278  */
 1279 void
 1280 cpu_halt(void)
 1281 {
 1282         for (;;)
 1283                 halt();
 1284 }
 1285 
 1286 #endif
 1287 
 1288 void (*cpu_idle_hook)(sbintime_t) = NULL;       /* ACPI idle hook. */
 1289 static int      cpu_ident_amdc1e = 0;   /* AMD C1E supported. */
 1290 static int      idle_mwait = 1;         /* Use MONITOR/MWAIT for short idle. */
 1291 TUNABLE_INT("machdep.idle_mwait", &idle_mwait);
 1292 SYSCTL_INT(_machdep, OID_AUTO, idle_mwait, CTLFLAG_RW, &idle_mwait,
 1293     0, "Use MONITOR/MWAIT for short idle");
 1294 
 1295 #define STATE_RUNNING   0x0
 1296 #define STATE_MWAIT     0x1
 1297 #define STATE_SLEEPING  0x2
 1298 
 1299 #ifndef PC98
 1300 static void
 1301 cpu_idle_acpi(sbintime_t sbt)
 1302 {
 1303         int *state;
 1304 
 1305         state = (int *)PCPU_PTR(monitorbuf);
 1306         *state = STATE_SLEEPING;
 1307 
 1308         /* See comments in cpu_idle_hlt(). */
 1309         disable_intr();
 1310         if (sched_runnable())
 1311                 enable_intr();
 1312         else if (cpu_idle_hook)
 1313                 cpu_idle_hook(sbt);
 1314         else
 1315                 __asm __volatile("sti; hlt");
 1316         *state = STATE_RUNNING;
 1317 }
 1318 #endif /* !PC98 */
 1319 
 1320 #ifndef XEN
 1321 static void
 1322 cpu_idle_hlt(sbintime_t sbt)
 1323 {
 1324         int *state;
 1325 
 1326         state = (int *)PCPU_PTR(monitorbuf);
 1327         *state = STATE_SLEEPING;
 1328 
 1329         /*
 1330          * Since we may be in a critical section from cpu_idle(), if
 1331          * an interrupt fires during that critical section we may have
 1332          * a pending preemption.  If the CPU halts, then that thread
 1333          * may not execute until a later interrupt awakens the CPU.
 1334          * To handle this race, check for a runnable thread after
 1335          * disabling interrupts and immediately return if one is
 1336          * found.  Also, we must absolutely guarentee that hlt is
 1337          * the next instruction after sti.  This ensures that any
 1338          * interrupt that fires after the call to disable_intr() will
 1339          * immediately awaken the CPU from hlt.  Finally, please note
 1340          * that on x86 this works fine because of interrupts enabled only
 1341          * after the instruction following sti takes place, while IF is set
 1342          * to 1 immediately, allowing hlt instruction to acknowledge the
 1343          * interrupt.
 1344          */
 1345         disable_intr();
 1346         if (sched_runnable())
 1347                 enable_intr();
 1348         else
 1349                 __asm __volatile("sti; hlt");
 1350         *state = STATE_RUNNING;
 1351 }
 1352 #endif
 1353 
 1354 static void
 1355 cpu_idle_mwait(sbintime_t sbt)
 1356 {
 1357         int *state;
 1358 
 1359         state = (int *)PCPU_PTR(monitorbuf);
 1360         *state = STATE_MWAIT;
 1361 
 1362         /* See comments in cpu_idle_hlt(). */
 1363         disable_intr();
 1364         if (sched_runnable()) {
 1365                 enable_intr();
 1366                 *state = STATE_RUNNING;
 1367                 return;
 1368         }
 1369         cpu_monitor(state, 0, 0);
 1370         if (*state == STATE_MWAIT)
 1371                 __asm __volatile("sti; mwait" : : "a" (MWAIT_C1), "c" (0));
 1372         else
 1373                 enable_intr();
 1374         *state = STATE_RUNNING;
 1375 }
 1376 
 1377 static void
 1378 cpu_idle_spin(sbintime_t sbt)
 1379 {
 1380         int *state;
 1381         int i;
 1382 
 1383         state = (int *)PCPU_PTR(monitorbuf);
 1384         *state = STATE_RUNNING;
 1385 
 1386         /*
 1387          * The sched_runnable() call is racy but as long as there is
 1388          * a loop missing it one time will have just a little impact if any 
 1389          * (and it is much better than missing the check at all).
 1390          */
 1391         for (i = 0; i < 1000; i++) {
 1392                 if (sched_runnable())
 1393                         return;
 1394                 cpu_spinwait();
 1395         }
 1396 }
 1397 
 1398 /*
 1399  * C1E renders the local APIC timer dead, so we disable it by
 1400  * reading the Interrupt Pending Message register and clearing
 1401  * both C1eOnCmpHalt (bit 28) and SmiOnCmpHalt (bit 27).
 1402  * 
 1403  * Reference:
 1404  *   "BIOS and Kernel Developer's Guide for AMD NPT Family 0Fh Processors"
 1405  *   #32559 revision 3.00+
 1406  */
 1407 #define MSR_AMDK8_IPM           0xc0010055
 1408 #define AMDK8_SMIONCMPHALT      (1ULL << 27)
 1409 #define AMDK8_C1EONCMPHALT      (1ULL << 28)
 1410 #define AMDK8_CMPHALT           (AMDK8_SMIONCMPHALT | AMDK8_C1EONCMPHALT)
 1411 
 1412 static void
 1413 cpu_probe_amdc1e(void)
 1414 {
 1415 
 1416         /*
 1417          * Detect the presence of C1E capability mostly on latest
 1418          * dual-cores (or future) k8 family.
 1419          */
 1420         if (cpu_vendor_id == CPU_VENDOR_AMD &&
 1421             (cpu_id & 0x00000f00) == 0x00000f00 &&
 1422             (cpu_id & 0x0fff0000) >=  0x00040000) {
 1423                 cpu_ident_amdc1e = 1;
 1424         }
 1425 }
 1426 
 1427 #if defined(PC98) || defined(XEN)
 1428 void (*cpu_idle_fn)(sbintime_t) = cpu_idle_hlt;
 1429 #else
 1430 void (*cpu_idle_fn)(sbintime_t) = cpu_idle_acpi;
 1431 #endif
 1432 
 1433 void
 1434 cpu_idle(int busy)
 1435 {
 1436 #ifndef XEN
 1437         uint64_t msr;
 1438 #endif
 1439         sbintime_t sbt = -1;
 1440 
 1441         CTR2(KTR_SPARE2, "cpu_idle(%d) at %d",
 1442             busy, curcpu);
 1443 #if defined(MP_WATCHDOG) && !defined(XEN)
 1444         ap_watchdog(PCPU_GET(cpuid));
 1445 #endif
 1446 #ifndef XEN
 1447         /* If we are busy - try to use fast methods. */
 1448         if (busy) {
 1449                 if ((cpu_feature2 & CPUID2_MON) && idle_mwait) {
 1450                         cpu_idle_mwait(busy);
 1451                         goto out;
 1452                 }
 1453         }
 1454 #endif
 1455 
 1456         /* If we have time - switch timers into idle mode. */
 1457         if (!busy) {
 1458                 critical_enter();
 1459                 sbt = cpu_idleclock();
 1460         }
 1461 
 1462 #ifndef XEN
 1463         /* Apply AMD APIC timer C1E workaround. */
 1464         if (cpu_ident_amdc1e && cpu_disable_c3_sleep) {
 1465                 msr = rdmsr(MSR_AMDK8_IPM);
 1466                 if (msr & AMDK8_CMPHALT)
 1467                         wrmsr(MSR_AMDK8_IPM, msr & ~AMDK8_CMPHALT);
 1468         }
 1469 #endif
 1470 
 1471         /* Call main idle method. */
 1472         cpu_idle_fn(sbt);
 1473 
 1474         /* Switch timers mack into active mode. */
 1475         if (!busy) {
 1476                 cpu_activeclock();
 1477                 critical_exit();
 1478         }
 1479 #ifndef XEN
 1480 out:
 1481 #endif
 1482         CTR2(KTR_SPARE2, "cpu_idle(%d) at %d done",
 1483             busy, curcpu);
 1484 }
 1485 
 1486 int
 1487 cpu_idle_wakeup(int cpu)
 1488 {
 1489         struct pcpu *pcpu;
 1490         int *state;
 1491 
 1492         pcpu = pcpu_find(cpu);
 1493         state = (int *)pcpu->pc_monitorbuf;
 1494         /*
 1495          * This doesn't need to be atomic since missing the race will
 1496          * simply result in unnecessary IPIs.
 1497          */
 1498         if (*state == STATE_SLEEPING)
 1499                 return (0);
 1500         if (*state == STATE_MWAIT)
 1501                 *state = STATE_RUNNING;
 1502         return (1);
 1503 }
 1504 
 1505 /*
 1506  * Ordered by speed/power consumption.
 1507  */
 1508 struct {
 1509         void    *id_fn;
 1510         char    *id_name;
 1511 } idle_tbl[] = {
 1512         { cpu_idle_spin, "spin" },
 1513         { cpu_idle_mwait, "mwait" },
 1514         { cpu_idle_hlt, "hlt" },
 1515 #ifndef PC98
 1516         { cpu_idle_acpi, "acpi" },
 1517 #endif
 1518         { NULL, NULL }
 1519 };
 1520 
 1521 static int
 1522 idle_sysctl_available(SYSCTL_HANDLER_ARGS)
 1523 {
 1524         char *avail, *p;
 1525         int error;
 1526         int i;
 1527 
 1528         avail = malloc(256, M_TEMP, M_WAITOK);
 1529         p = avail;
 1530         for (i = 0; idle_tbl[i].id_name != NULL; i++) {
 1531                 if (strstr(idle_tbl[i].id_name, "mwait") &&
 1532                     (cpu_feature2 & CPUID2_MON) == 0)
 1533                         continue;
 1534 #ifndef PC98
 1535                 if (strcmp(idle_tbl[i].id_name, "acpi") == 0 &&
 1536                     cpu_idle_hook == NULL)
 1537                         continue;
 1538 #endif
 1539                 p += sprintf(p, "%s%s", p != avail ? ", " : "",
 1540                     idle_tbl[i].id_name);
 1541         }
 1542         error = sysctl_handle_string(oidp, avail, 0, req);
 1543         free(avail, M_TEMP);
 1544         return (error);
 1545 }
 1546 
 1547 SYSCTL_PROC(_machdep, OID_AUTO, idle_available, CTLTYPE_STRING | CTLFLAG_RD,
 1548     0, 0, idle_sysctl_available, "A", "list of available idle functions");
 1549 
 1550 static int
 1551 idle_sysctl(SYSCTL_HANDLER_ARGS)
 1552 {
 1553         char buf[16];
 1554         int error;
 1555         char *p;
 1556         int i;
 1557 
 1558         p = "unknown";
 1559         for (i = 0; idle_tbl[i].id_name != NULL; i++) {
 1560                 if (idle_tbl[i].id_fn == cpu_idle_fn) {
 1561                         p = idle_tbl[i].id_name;
 1562                         break;
 1563                 }
 1564         }
 1565         strncpy(buf, p, sizeof(buf));
 1566         error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
 1567         if (error != 0 || req->newptr == NULL)
 1568                 return (error);
 1569         for (i = 0; idle_tbl[i].id_name != NULL; i++) {
 1570                 if (strstr(idle_tbl[i].id_name, "mwait") &&
 1571                     (cpu_feature2 & CPUID2_MON) == 0)
 1572                         continue;
 1573 #ifndef PC98
 1574                 if (strcmp(idle_tbl[i].id_name, "acpi") == 0 &&
 1575                     cpu_idle_hook == NULL)
 1576                         continue;
 1577 #endif
 1578                 if (strcmp(idle_tbl[i].id_name, buf))
 1579                         continue;
 1580                 cpu_idle_fn = idle_tbl[i].id_fn;
 1581                 return (0);
 1582         }
 1583         return (EINVAL);
 1584 }
 1585 
 1586 SYSCTL_PROC(_machdep, OID_AUTO, idle, CTLTYPE_STRING | CTLFLAG_RW, 0, 0,
 1587     idle_sysctl, "A", "currently selected idle function");
 1588 
 1589 /*
 1590  * Reset registers to default values on exec.
 1591  */
 1592 void
 1593 exec_setregs(struct thread *td, struct image_params *imgp, u_long stack)
 1594 {
 1595         struct trapframe *regs = td->td_frame;
 1596         struct pcb *pcb = td->td_pcb;
 1597 
 1598         /* Reset pc->pcb_gs and %gs before possibly invalidating it. */
 1599         pcb->pcb_gs = _udatasel;
 1600         load_gs(_udatasel);
 1601 
 1602         mtx_lock_spin(&dt_lock);
 1603         if (td->td_proc->p_md.md_ldt)
 1604                 user_ldt_free(td);
 1605         else
 1606                 mtx_unlock_spin(&dt_lock);
 1607   
 1608         bzero((char *)regs, sizeof(struct trapframe));
 1609         regs->tf_eip = imgp->entry_addr;
 1610         regs->tf_esp = stack;
 1611         regs->tf_eflags = PSL_USER | (regs->tf_eflags & PSL_T);
 1612         regs->tf_ss = _udatasel;
 1613         regs->tf_ds = _udatasel;
 1614         regs->tf_es = _udatasel;
 1615         regs->tf_fs = _udatasel;
 1616         regs->tf_cs = _ucodesel;
 1617 
 1618         /* PS_STRINGS value for BSD/OS binaries.  It is 0 for non-BSD/OS. */
 1619         regs->tf_ebx = imgp->ps_strings;
 1620 
 1621         /*
 1622          * Reset the hardware debug registers if they were in use.
 1623          * They won't have any meaning for the newly exec'd process.  
 1624          */
 1625         if (pcb->pcb_flags & PCB_DBREGS) {
 1626                 pcb->pcb_dr0 = 0;
 1627                 pcb->pcb_dr1 = 0;
 1628                 pcb->pcb_dr2 = 0;
 1629                 pcb->pcb_dr3 = 0;
 1630                 pcb->pcb_dr6 = 0;
 1631                 pcb->pcb_dr7 = 0;
 1632                 if (pcb == curpcb) {
 1633                         /*
 1634                          * Clear the debug registers on the running
 1635                          * CPU, otherwise they will end up affecting
 1636                          * the next process we switch to.
 1637                          */
 1638                         reset_dbregs();
 1639                 }
 1640                 pcb->pcb_flags &= ~PCB_DBREGS;
 1641         }
 1642 
 1643         pcb->pcb_initial_npxcw = __INITIAL_NPXCW__;
 1644 
 1645         /*
 1646          * Drop the FP state if we hold it, so that the process gets a
 1647          * clean FP state if it uses the FPU again.
 1648          */
 1649         fpstate_drop(td);
 1650 
 1651         /*
 1652          * XXX - Linux emulator
 1653          * Make sure sure edx is 0x0 on entry. Linux binaries depend
 1654          * on it.
 1655          */
 1656         td->td_retval[1] = 0;
 1657 }
 1658 
 1659 void
 1660 cpu_setregs(void)
 1661 {
 1662         unsigned int cr0;
 1663 
 1664         cr0 = rcr0();
 1665 
 1666         /*
 1667          * CR0_MP, CR0_NE and CR0_TS are set for NPX (FPU) support:
 1668          *
 1669          * Prepare to trap all ESC (i.e., NPX) instructions and all WAIT
 1670          * instructions.  We must set the CR0_MP bit and use the CR0_TS
 1671          * bit to control the trap, because setting the CR0_EM bit does
 1672          * not cause WAIT instructions to trap.  It's important to trap
 1673          * WAIT instructions - otherwise the "wait" variants of no-wait
 1674          * control instructions would degenerate to the "no-wait" variants
 1675          * after FP context switches but work correctly otherwise.  It's
 1676          * particularly important to trap WAITs when there is no NPX -
 1677          * otherwise the "wait" variants would always degenerate.
 1678          *
 1679          * Try setting CR0_NE to get correct error reporting on 486DX's.
 1680          * Setting it should fail or do nothing on lesser processors.
 1681          */
 1682         cr0 |= CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM;
 1683         load_cr0(cr0);
 1684         load_gs(_udatasel);
 1685 }
 1686 
      /*
       * Boot device value, exported read-only as the "guessed_bootdev" node
       * under the machdep sysctl tree.
       */
 1687 u_long bootdev;         /* not a struct cdev * - the encoding is different */
 1688 SYSCTL_ULONG(_machdep, OID_AUTO, guessed_bootdev,
 1689         CTLFLAG_RD, &bootdev, 0, "Maybe the Boot device (not in struct cdev *format)");
 1690 
      /*
       * Firmware boot method name, initialized to "BIOS" and exported
       * read-only as the "bootmethod" node under the machdep sysctl tree.
       */
 1691 static char bootmethod[16] = "BIOS";
 1692 SYSCTL_STRING(_machdep, OID_AUTO, bootmethod, CTLFLAG_RD, bootmethod, 0,
 1693     "System firmware boot method");
 1694 
 1695 /*
 1696  * Initialize 386 and configure to run kernel
 1697  */
 1698 
 1699 /*
 1700  * Initialize segments & interrupt table
 1701  */
 1702 
 1703 int _default_ldt;
 1704 
      /*
       * Descriptor table storage.  Under XEN only pointers are declared here
       * (the tables themselves presumably live elsewhere - confirm against the
       * XEN code); otherwise the GDT is a static array of NGDT * MAXCPU
       * descriptors and the LDT a static array of NLDT descriptors.
       */
 1705 #ifdef XEN
 1706 union descriptor *gdt;
 1707 union descriptor *ldt;
 1708 #else
 1709 union descriptor gdt[NGDT * MAXCPU];    /* global descriptor table */
 1710 union descriptor ldt[NLDT];             /* local descriptor table */
 1711 #endif
 1712 static struct gate_descriptor idt0[NIDT];       /* backing store for idt */
 1713 struct gate_descriptor *idt = &idt0[0]; /* interrupt descriptor table */
 1714 struct region_descriptor r_gdt, r_idt;  /* table descriptors */
 1715 struct mtx dt_lock;                     /* lock for GDT and LDT */
 1716 
      /*
       * TSS referenced by the GPANIC_SEL entry of gdt_segs, and a one-page
       * buffer that is presumably the stack used while running on that TSS
       * (its use is not visible in this part of the file).
       */
 1717 static struct i386tss dblfault_tss;
 1718 static char dblfault_stack[PAGE_SIZE];
 1719 
 1720 extern  vm_offset_t     proc0kstack;
 1721 
 1722 
 1723 /*
 1724  * software prototypes -- in more palatable form.
 1725  *
 1726  * GCODE_SEL through GUDATA_SEL must be in this order for syscall/sysret
 1727  * GUFS_SEL and GUGS_SEL must be in this order (swtch.s knows it)
       *
       * Each initializer's position in the array is the GDT slot number given
       * in the comment that precedes it, so entries must not be reordered,
       * inserted or removed without updating the matching selector constants.
       * Everything from GPROC0_SEL onward is compiled only when XEN is not
       * defined.
 1728  */
 1729 struct soft_segment_descriptor gdt_segs[] = {
 1730 /* GNULL_SEL    0 Null Descriptor */
 1731 {       .ssd_base = 0x0,
 1732         .ssd_limit = 0x0,
 1733         .ssd_type = 0,
 1734         .ssd_dpl = SEL_KPL,
 1735         .ssd_p = 0,
 1736         .ssd_xx = 0, .ssd_xx1 = 0,
 1737         .ssd_def32 = 0,
 1738         .ssd_gran = 0           },
 1739 /* GPRIV_SEL    1 SMP Per-Processor Private Data Descriptor */
 1740 {       .ssd_base = 0x0,
 1741         .ssd_limit = 0xfffff,
 1742         .ssd_type = SDT_MEMRWA,
 1743         .ssd_dpl = SEL_KPL,
 1744         .ssd_p = 1,
 1745         .ssd_xx = 0, .ssd_xx1 = 0,
 1746         .ssd_def32 = 1,
 1747         .ssd_gran = 1           },
 1748 /* GUFS_SEL     2 %fs Descriptor for user */
 1749 {       .ssd_base = 0x0,
 1750         .ssd_limit = 0xfffff,
 1751         .ssd_type = SDT_MEMRWA,
 1752         .ssd_dpl = SEL_UPL,
 1753         .ssd_p = 1,
 1754         .ssd_xx = 0, .ssd_xx1 = 0,
 1755         .ssd_def32 = 1,
 1756         .ssd_gran = 1           },
 1757 /* GUGS_SEL     3 %gs Descriptor for user */
 1758 {       .ssd_base = 0x0,
 1759         .ssd_limit = 0xfffff,
 1760         .ssd_type = SDT_MEMRWA,
 1761         .ssd_dpl = SEL_UPL,
 1762         .ssd_p = 1,
 1763         .ssd_xx = 0, .ssd_xx1 = 0,
 1764         .ssd_def32 = 1,
 1765         .ssd_gran = 1           },
 1766 /* GCODE_SEL    4 Code Descriptor for kernel */
 1767 {       .ssd_base = 0x0,
 1768         .ssd_limit = 0xfffff,
 1769         .ssd_type = SDT_MEMERA,
 1770         .ssd_dpl = SEL_KPL,
 1771         .ssd_p = 1,
 1772         .ssd_xx = 0, .ssd_xx1 = 0,
 1773         .ssd_def32 = 1,
 1774         .ssd_gran = 1           },
 1775 /* GDATA_SEL    5 Data Descriptor for kernel */
 1776 {       .ssd_base = 0x0,
 1777         .ssd_limit = 0xfffff,
 1778         .ssd_type = SDT_MEMRWA,
 1779         .ssd_dpl = SEL_KPL,
 1780         .ssd_p = 1,
 1781         .ssd_xx = 0, .ssd_xx1 = 0,
 1782         .ssd_def32 = 1,
 1783         .ssd_gran = 1           },
 1784 /* GUCODE_SEL   6 Code Descriptor for user */
 1785 {       .ssd_base = 0x0,
 1786         .ssd_limit = 0xfffff,
 1787         .ssd_type = SDT_MEMERA,
 1788         .ssd_dpl = SEL_UPL,
 1789         .ssd_p = 1,
 1790         .ssd_xx = 0, .ssd_xx1 = 0,
 1791         .ssd_def32 = 1,
 1792         .ssd_gran = 1           },
 1793 /* GUDATA_SEL   7 Data Descriptor for user */
 1794 {       .ssd_base = 0x0,
 1795         .ssd_limit = 0xfffff,
 1796         .ssd_type = SDT_MEMRWA,
 1797         .ssd_dpl = SEL_UPL,
 1798         .ssd_p = 1,
 1799         .ssd_xx = 0, .ssd_xx1 = 0,
 1800         .ssd_def32 = 1,
 1801         .ssd_gran = 1           },
 1802 /* GBIOSLOWMEM_SEL 8 BIOS access to realmode segment 0x40, must be #8 in GDT */
 1803 {       .ssd_base = 0x400,
 1804         .ssd_limit = 0xfffff,
 1805         .ssd_type = SDT_MEMRWA,
 1806         .ssd_dpl = SEL_KPL,
 1807         .ssd_p = 1,
 1808         .ssd_xx = 0, .ssd_xx1 = 0,
 1809         .ssd_def32 = 1,
 1810         .ssd_gran = 1           },
 1811 #ifndef XEN
 1812 /* GPROC0_SEL   9 Proc 0 Tss Descriptor */
      /* The base is 0 in this template; presumably filled in at run time. */
 1813 {
 1814         .ssd_base = 0x0,
 1815         .ssd_limit = sizeof(struct i386tss)-1,
 1816         .ssd_type = SDT_SYS386TSS,
 1817         .ssd_dpl = 0,
 1818         .ssd_p = 1,
 1819         .ssd_xx = 0, .ssd_xx1 = 0,
 1820         .ssd_def32 = 0,
 1821         .ssd_gran = 0           },
 1822 /* GLDT_SEL     10 LDT Descriptor */
 1823 {       .ssd_base = (int) ldt,
 1824         .ssd_limit = sizeof(ldt)-1,
 1825         .ssd_type = SDT_SYSLDT,
 1826         .ssd_dpl = SEL_UPL,
 1827         .ssd_p = 1,
 1828         .ssd_xx = 0, .ssd_xx1 = 0,
 1829         .ssd_def32 = 0,
 1830         .ssd_gran = 0           },
 1831 /* GUSERLDT_SEL 11 User LDT Descriptor per process */
 1832 {       .ssd_base = (int) ldt,
 1833         .ssd_limit = (512 * sizeof(union descriptor)-1),
 1834         .ssd_type = SDT_SYSLDT,
 1835         .ssd_dpl = 0,
 1836         .ssd_p = 1,
 1837         .ssd_xx = 0, .ssd_xx1 = 0,
 1838         .ssd_def32 = 0,
 1839         .ssd_gran = 0           },
 1840 /* GPANIC_SEL   12 Panic Tss Descriptor */
 1841 {       .ssd_base = (int) &dblfault_tss,
 1842         .ssd_limit = sizeof(struct i386tss)-1,
 1843         .ssd_type = SDT_SYS386TSS,
 1844         .ssd_dpl = 0,
 1845         .ssd_p = 1,
 1846         .ssd_xx = 0, .ssd_xx1 = 0,
 1847         .ssd_def32 = 0,
 1848         .ssd_gran = 0           },
 1849 /* GBIOSCODE32_SEL 13 BIOS 32-bit interface (32bit Code) */
 1850 {       .ssd_base = 0,
 1851         .ssd_limit = 0xfffff,
 1852         .ssd_type = SDT_MEMERA,
 1853         .ssd_dpl = 0,
 1854         .ssd_p = 1,
 1855         .ssd_xx = 0, .ssd_xx1 = 0,
 1856         .ssd_def32 = 0,
 1857         .ssd_gran = 1           },
 1858 /* GBIOSCODE16_SEL 14 BIOS 32-bit interface (16bit Code) */
 1859 {       .ssd_base = 0,
 1860         .ssd_limit = 0xfffff,
 1861         .ssd_type = SDT_MEMERA,
 1862         .ssd_dpl = 0,
 1863         .ssd_p = 1,
 1864         .ssd_xx = 0, .ssd_xx1 = 0,
 1865         .ssd_def32 = 0,
 1866         .ssd_gran = 1           },
 1867 /* GBIOSDATA_SEL 15 BIOS 32-bit interface (Data) */
 1868 {       .ssd_base = 0,
 1869         .ssd_limit = 0xfffff,
 1870         .ssd_type = SDT_MEMRWA,
 1871         .ssd_dpl = 0,
 1872         .ssd_p = 1,
 1873         .ssd_xx = 0, .ssd_xx1 = 0,
 1874         .ssd_def32 = 1,
 1875         .ssd_gran = 1           },
 1876 /* GBIOSUTIL_SEL 16 BIOS 16-bit interface (Utility) */
 1877 {       .ssd_base = 0,
 1878         .ssd_limit = 0xfffff,
 1879         .ssd_type = SDT_MEMRWA,
 1880         .ssd_dpl = 0,
 1881         .ssd_p = 1,
 1882         .ssd_xx = 0, .ssd_xx1 = 0,
 1883         .ssd_def32 = 0,
 1884         .ssd_gran = 1           },
 1885 /* GBIOSARGS_SEL 17 BIOS 16-bit interface (Arguments) */
 1886 {       .ssd_base = 0,
 1887         .ssd_limit = 0xfffff,
 1888         .ssd_type = SDT_MEMRWA,
 1889         .ssd_dpl = 0,
 1890         .ssd_p = 1,
 1891         .ssd_xx = 0, .ssd_xx1 = 0,
 1892         .ssd_def32 = 0,
 1893         .ssd_gran = 1           },
 1894 /* GNDIS_SEL    18 NDIS Descriptor */
      /* Not-present placeholder: every field is zero in this template. */
 1895 {       .ssd_base = 0x0,
 1896         .ssd_limit = 0x0,
 1897         .ssd_type = 0,
 1898         .ssd_dpl = 0,
 1899         .ssd_p = 0,
 1900         .ssd_xx = 0, .ssd_xx1 = 0,
 1901         .ssd_def32 = 0,
 1902         .ssd_gran = 0           },
 1903 #endif /* !XEN */
 1904 };
 1905 
      /*
       * Software-form template for the LDT: six slots.  Slots 3 and 5 are flat
       * user-privilege code and data segments; slots 0, 1, 2 and 4 are
       * not-present placeholders which, per their comments, are overwritten
       * by call gates later.
       */
 1906 static struct soft_segment_descriptor ldt_segs[] = {
 1907         /* Null Descriptor - overwritten by call gate */
 1908 {       .ssd_base = 0x0,
 1909         .ssd_limit = 0x0,
 1910         .ssd_type = 0,
 1911         .ssd_dpl = 0,
 1912         .ssd_p = 0,
 1913         .ssd_xx = 0, .ssd_xx1 = 0,
 1914         .ssd_def32 = 0,
 1915         .ssd_gran = 0           },
 1916         /* Null Descriptor - overwritten by call gate */
 1917 {       .ssd_base = 0x0,
 1918         .ssd_limit = 0x0,
 1919         .ssd_type = 0,
 1920         .ssd_dpl = 0,
 1921         .ssd_p = 0,
 1922         .ssd_xx = 0, .ssd_xx1 = 0,
 1923         .ssd_def32 = 0,
 1924         .ssd_gran = 0           },
 1925         /* Null Descriptor - overwritten by call gate */
 1926 {       .ssd_base = 0x0,
 1927         .ssd_limit = 0x0,
 1928         .ssd_type = 0,
 1929         .ssd_dpl = 0,
 1930         .ssd_p = 0,
 1931         .ssd_xx = 0, .ssd_xx1 = 0,
 1932         .ssd_def32 = 0,
 1933         .ssd_gran = 0           },
 1934         /* Code Descriptor for user */
 1935 {       .ssd_base = 0x0,
 1936         .ssd_limit = 0xfffff,
 1937         .ssd_type = SDT_MEMERA,
 1938         .ssd_dpl = SEL_UPL,
 1939         .ssd_p = 1,
 1940         .ssd_xx = 0, .ssd_xx1 = 0,
 1941         .ssd_def32 = 1,
 1942         .ssd_gran = 1           },
 1943         /* Null Descriptor - overwritten by call gate */
 1944 {       .ssd_base = 0x0,
 1945         .ssd_limit = 0x0,
 1946         .ssd_type = 0,
 1947         .ssd_dpl = 0,
 1948         .ssd_p = 0,
 1949         .ssd_xx = 0, .ssd_xx1 = 0,
 1950         .ssd_def32 = 0,
 1951         .ssd_gran = 0           },
 1952         /* Data Descriptor for user */
 1953 {       .ssd_base = 0x0,
 1954         .ssd_limit = 0xfffff,
 1955         .ssd_type = SDT_MEMRWA,
 1956         .ssd_dpl = SEL_UPL,
 1957         .ssd_p = 1,
 1958         .ssd_xx = 0, .ssd_xx1 = 0,
 1959         .ssd_def32 = 1,
 1960         .ssd_gran = 1           },
 1961 };
 1962 
 1963 void
 1964 setidt(idx, func, typ, dpl, selec)
 1965         int idx;
 1966         inthand_t *func;
 1967         int typ;
 1968         int dpl;
 1969         int selec;
 1970 {
 1971         struct gate_descriptor *ip;
 1972 
 1973         ip = idt + idx;
 1974         ip->gd_looffset = (int)func;
 1975         ip->gd_selector = selec;
 1976         ip->gd_stkcpy = 0;
 1977         ip->gd_xx = 0;
 1978         ip->gd_type = typ;
 1979         ip->gd_dpl = dpl;
 1980         ip->gd_p = 1;
 1981         ip->gd_hioffset = ((int)func)>>16 ;
 1982 }
 1983 
      /*
       * Interrupt/trap entry points named through IDTVEC() and defined outside
       * this file (presumably assembly stubs - not visible here).  The
       * dtrace_ret and xen_intr_upcall entries are declared only when
       * KDTRACE_HOOKS and XENHVM, respectively, are configured.
       */
 1984 extern inthand_t
 1985         IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
 1986         IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
 1987         IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
 1988         IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align),
 1989         IDTVEC(xmm),
 1990 #ifdef KDTRACE_HOOKS
 1991         IDTVEC(dtrace_ret),
 1992 #endif
 1993 #ifdef XENHVM
 1994         IDTVEC(xen_intr_upcall),
 1995 #endif
 1996         IDTVEC(lcall_syscall), IDTVEC(int0x80_syscall);
 1997 
 1998 #ifdef DDB
 1999 /*
 2000  * Display the index and function name of any IDT entries that don't use
 2001  * the default 'rsvd' entry point.
 2002  */
 2003 DB_SHOW_COMMAND(idt, db_show_idt)
 2004 {
 2005         struct gate_descriptor *ip;
 2006         int idx;
 2007         uintptr_t func;
 2008 
 2009         ip = idt;
 2010         for (idx = 0; idx < NIDT && !db_pager_quit; idx++) {
 2011                 func = (ip->gd_hioffset << 16 | ip->gd_looffset);
 2012                 if (func != (uintptr_t)&IDTVEC(rsvd)) {
 2013                         db_printf("%3d\t", idx);
 2014                         db_printsym(func, DB_STGY_PROC);
 2015                         db_printf("\n");
 2016                 }
 2017                 ip++;
 2018         }
 2019 }
 2020 
 2021 /* Show privileged registers. */
 2022 DB_SHOW_COMMAND(sysregs, db_show_sysregs)
 2023 {
 2024         uint64_t idtr, gdtr;
 2025 
 2026         idtr = ridt();
 2027         db_printf("idtr\t0x%08x/%04x\n",
 2028             (u_int)(idtr >> 16), (u_int)idtr & 0xffff);
 2029         gdtr = rgdt();
 2030         db_printf("gdtr\t0x%08x/%04x\n",
 2031             (u_int)(gdtr >> 16), (u_int)gdtr & 0xffff);
 2032         db_printf("ldtr\t0x%04x\n", rldt());
 2033         db_printf("tr\t0x%04x\n", rtr());
 2034         db_printf("cr0\t0x%08x\n", rcr0());
 2035         db_printf("cr2\t0x%08x\n", rcr2());
 2036         db_printf("cr3\t0x%08x\n", rcr3());
 2037         db_printf("cr4\t0x%08x\n", rcr4());
 2038         if (rcr4() & CR4_XSAVE)
 2039                 db_printf("xcr0\t0x%016llx\n", rxcr(0));
 2040         if (amd_feature & (AMDID_NX | AMDID_LM))
 2041                 db_printf("EFER\t0x%016llx\n", rdmsr(MSR_EFER));
 2042         if (cpu_feature2 & (CPUID2_VMX | CPUID2_SMX))
 2043                 db_printf("FEATURES_CTL\t0x%016llx\n",
 2044                     rdmsr(MSR_IA32_FEATURE_CONTROL));
 2045         if ((cpu_vendor_id == CPU_VENDOR_INTEL ||
 2046             cpu_vendor_id == CPU_VENDOR_AMD) && CPUID_TO_FAMILY(cpu_id) >= 6)
 2047                 db_printf("DEBUG_CTL\t0x%016llx\n", rdmsr(MSR_DEBUGCTLMSR));
 2048         if (cpu_feature & CPUID_PAT)
 2049                 db_printf("PAT\t0x%016llx\n", rdmsr(MSR_PAT));
 2050 }
 2051 
 2052 DB_SHOW_COMMAND(dbregs, db_show_dbregs)
 2053 {
 2054 
 2055         db_printf("dr0\t0x%08x\n", rdr0());
 2056         db_printf("dr1\t0x%08x\n", rdr1());
 2057         db_printf("dr2\t0x%08x\n", rdr2());
 2058         db_printf("dr3\t0x%08x\n", rdr3());
 2059         db_printf("dr6\t0x%08x\n", rdr6());
 2060         db_printf("dr7\t0x%08x\n", rdr7());     
 2061 }
 2062 #endif
 2063 
 2064 void
 2065 sdtossd(sd, ssd)
 2066         struct segment_descriptor *sd;
 2067         struct soft_segment_descriptor *ssd;
 2068 {
 2069         ssd->ssd_base  = (sd->sd_hibase << 24) | sd->sd_lobase;
 2070         ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit;
 2071         ssd->ssd_type  = sd->sd_type;
 2072         ssd->ssd_dpl   = sd->sd_dpl;
 2073         ssd->ssd_p     = sd->sd_p;
 2074         ssd->ssd_def32 = sd->sd_def32;
 2075         ssd->ssd_gran  = sd->sd_gran;
 2076 }
 2077 
 2078 #if !defined(PC98) && !defined(XEN)
 2079 static int
 2080 add_smap_entry(struct bios_smap *smap, vm_paddr_t *physmap, int *physmap_idxp)
 2081 {
 2082         int i, insert_idx, physmap_idx;
 2083 
 2084         physmap_idx = *physmap_idxp;
 2085         
 2086         if (boothowto & RB_VERBOSE)
 2087                 printf("SMAP type=%02x base=%016llx len=%016llx\n",
 2088                     smap->type, smap->base, smap->length);
 2089 
 2090         if (smap->type != SMAP_TYPE_MEMORY)
 2091                 return (1);
 2092 
 2093         if (smap->length == 0)
 2094                 return (1);
 2095 
 2096 #ifndef PAE
 2097         if (smap->base > 0xffffffff) {
 2098                 printf("%uK of memory above 4GB ignored\n",
 2099                     (u_int)(smap->length / 1024));
 2100                 return (1);
 2101         }
 2102 #endif
 2103 
 2104         /*
 2105          * Find insertion point while checking for overlap.  Start off by
 2106          * assuming the new entry will be added to the end.
 2107          */
 2108         insert_idx = physmap_idx + 2;
 2109         for (i = 0; i <= physmap_idx; i += 2) {
 2110                 if (smap->base < physmap[i + 1]) {
 2111                         if (smap->base + smap->length <= physmap[i]) {
 2112                                 insert_idx = i;
 2113                                 break;
 2114                         }
 2115                         if (boothowto & RB_VERBOSE)
 2116                                 printf(
 2117                     "Overlapping memory regions, ignoring second region\n");
 2118                         return (1);
 2119                 }
 2120         }
 2121 
 2122         /* See if we can prepend to the next entry. */
 2123         if (insert_idx <= physmap_idx &&
 2124             smap->base + smap->length == physmap[insert_idx]) {
 2125                 physmap[insert_idx] = smap->base;
 2126                 return (1);
 2127         }
 2128 
 2129         /* See if we can append to the previous entry. */
 2130         if (insert_idx > 0 && smap->base == physmap[insert_idx - 1]) {
 2131                 physmap[insert_idx - 1] += smap->length;
 2132                 return (1);
 2133         }
 2134 
 2135         physmap_idx += 2;
 2136         *physmap_idxp = physmap_idx;
 2137         if (physmap_idx == PHYSMAP_SIZE) {
 2138                 printf(
 2139                 "Too many segments in the physical address map, giving up\n");
 2140                 return (0);
 2141         }
 2142 
 2143         /*
 2144          * Move the last 'N' entries down to make room for the new
 2145          * entry if needed.
 2146          */
 2147         for (i = physmap_idx; i > insert_idx; i -= 2) {
 2148                 physmap[i] = physmap[i - 2];
 2149                 physmap[i + 1] = physmap[i - 1];
 2150         }
 2151 
 2152         /* Insert the new entry. */
 2153         physmap[insert_idx] = smap->base;
 2154         physmap[insert_idx + 1] = smap->base + smap->length;
 2155         return (1);
 2156 }
 2157 #endif /* !PC98 && !XEN */
 2158 
 2159 #ifndef XEN
 2160 static void
 2161 basemem_setup(void)
 2162 {
 2163         vm_paddr_t pa;
 2164         pt_entry_t *pte;
 2165         int i;
 2166 
 2167         if (basemem > 640) {
 2168                 printf("Preposterous BIOS basemem of %uK, truncating to 640K\n",
 2169                         basemem);
 2170                 basemem = 640;
 2171         }
 2172 
 2173         /*
 2174          * XXX if biosbasemem is now < 640, there is a `hole'
 2175          * between the end of base memory and the start of
 2176          * ISA memory.  The hole may be empty or it may
 2177          * contain BIOS code or data.  Map it read/write so
 2178          * that the BIOS can write to it.  (Memory from 0 to
 2179          * the physical end of the kernel is mapped read-only
 2180          * to begin with and then parts of it are remapped.
 2181          * The parts that aren't remapped form holes that
 2182          * remain read-only and are unused by the kernel.
 2183          * The base memory area is below the physical end of
 2184          * the kernel and right now forms a read-only hole.
 2185          * The part of it from PAGE_SIZE to
 2186          * (trunc_page(biosbasemem * 1024) - 1) will be
 2187          * remapped and used by the kernel later.)
 2188          *
 2189          * This code is similar to the code used in
 2190          * pmap_mapdev, but since no memory needs to be
 2191          * allocated we simply change the mapping.
 2192          */
 2193         for (pa = trunc_page(basemem * 1024);
 2194              pa < ISA_HOLE_START; pa += PAGE_SIZE)
 2195                 pmap_kenter(KERNBASE + pa, pa);
 2196 
 2197         /*
 2198          * Map pages between basemem and ISA_HOLE_START, if any, r/w into
 2199          * the vm86 page table so that vm86 can scribble on them using
 2200          * the vm86 map too.  XXX: why 2 ways for this and only 1 way for
 2201          * page 0, at least as initialized here?
 2202          */
 2203         pte = (pt_entry_t *)vm86paddr;
 2204         for (i = basemem / 4; i < 160; i++)
 2205                 pte[i] = (i << PAGE_SHIFT) | PG_V | PG_RW | PG_U;
 2206 }
 2207 #endif /* !XEN */
 2208 
 2209 /*
 2210  * Populate the (physmap) array with base/bound pairs describing the
 2211  * available physical memory in the system, then test this memory and
 2212  * build the phys_avail array describing the actually-available memory.
 2213  *
 2214  * If we cannot accurately determine the physical memory map, then use
 2215  * value from the 0xE801 call, and failing that, the RTC.
 2216  *
 2217  * Total memory size may be set by the kernel environment variable
 2218  * hw.physmem or the compile-time define MAXMEM.
 2219  *
 2220  * XXX first should be vm_paddr_t.
 2221  */
 2222 #ifdef PC98
 2223 static void
 2224 getmemsize(int first)
 2225 {
 2226         int off, physmap_idx, pa_indx, da_indx;
 2227         u_long physmem_tunable, memtest;
 2228         vm_paddr_t physmap[PHYSMAP_SIZE];
 2229         pt_entry_t *pte;
 2230         quad_t dcons_addr, dcons_size;
 2231         int i;
 2232         int pg_n;
 2233         u_int extmem;
 2234         u_int under16;
 2235         vm_paddr_t pa;
 2236 
 2237         bzero(physmap, sizeof(physmap));
 2238 
 2239         /* XXX - some of EPSON machines can't use PG_N */
 2240         pg_n = PG_N;
 2241         if (pc98_machine_type & M_EPSON_PC98) {
 2242                 switch (epson_machine_id) {
 2243 #ifdef WB_CACHE
 2244                 default:
 2245 #endif
 2246                 case EPSON_PC486_HX:
 2247                 case EPSON_PC486_HG:
 2248                 case EPSON_PC486_HA:
 2249                         pg_n = 0;
 2250                         break;
 2251                 }
 2252         }
 2253 
 2254         under16 = pc98_getmemsize(&basemem, &extmem);
 2255         basemem_setup();
 2256 
 2257         physmap[0] = 0;
 2258         physmap[1] = basemem * 1024;
 2259         physmap_idx = 2;
 2260         physmap[physmap_idx] = 0x100000;
 2261         physmap[physmap_idx + 1] = physmap[physmap_idx] + extmem * 1024;
 2262 
 2263         /*
 2264          * Now, physmap contains a map of physical memory.
 2265          */
 2266 
 2267 #ifdef SMP
 2268         /* make hole for AP bootstrap code */
 2269         physmap[1] = mp_bootaddress(physmap[1]);
 2270 #endif
 2271 
 2272         /*
 2273          * Maxmem isn't the "maximum memory", it's one larger than the
 2274          * highest page of the physical address space.  It should be
 2275          * called something like "Maxphyspage".  We may adjust this 
 2276          * based on ``hw.physmem'' and the results of the memory test.
 2277          */
 2278         Maxmem = atop(physmap[physmap_idx + 1]);
 2279 
 2280 #ifdef MAXMEM
 2281         Maxmem = MAXMEM / 4;
 2282 #endif
 2283 
 2284         if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable))
 2285                 Maxmem = atop(physmem_tunable);
 2286 
 2287         /*
 2288          * By default keep the memtest enabled.  Use a general name so that
 2289          * one could eventually do more with the code than just disable it.
 2290          */
 2291         memtest = 1;
 2292         TUNABLE_ULONG_FETCH("hw.memtest.tests", &memtest);
 2293 
 2294         if (atop(physmap[physmap_idx + 1]) != Maxmem &&
 2295             (boothowto & RB_VERBOSE))
 2296                 printf("Physical memory use set to %ldK\n", Maxmem * 4);
 2297 
 2298         /*
 2299          * If Maxmem has been increased beyond what the system has detected,
 2300          * extend the last memory segment to the new limit.
 2301          */ 
 2302         if (atop(physmap[physmap_idx + 1]) < Maxmem)
 2303                 physmap[physmap_idx + 1] = ptoa((vm_paddr_t)Maxmem);
 2304 
 2305         /*
 2306          * We need to divide chunk if Maxmem is larger than 16MB and
 2307          * under 16MB area is not full of memory.
 2308          * (1) system area (15-16MB region) is cut off
 2309          * (2) extended memory is only over 16MB area (ex. Melco "HYPERMEMORY")
 2310          */
 2311         if ((under16 != 16 * 1024) && (extmem > 15 * 1024)) {
 2312                 /* 15M - 16M region is cut off, so need to divide chunk */
 2313                 physmap[physmap_idx + 1] = under16 * 1024;
 2314                 physmap_idx += 2;
 2315                 physmap[physmap_idx] = 0x1000000;
 2316                 physmap[physmap_idx + 1] = physmap[2] + extmem * 1024;
 2317         }
 2318 
 2319         /* call pmap initialization to make new kernel address space */
 2320         pmap_bootstrap(first);
 2321 
 2322         /*
 2323          * Size up each available chunk of physical memory.
 2324          */
 2325         physmap[0] = PAGE_SIZE;         /* mask off page 0 */
 2326         pa_indx = 0;
 2327         da_indx = 1;
 2328         phys_avail[pa_indx++] = physmap[0];
 2329         phys_avail[pa_indx] = physmap[0];
 2330         dump_avail[da_indx] = physmap[0];
 2331         pte = CMAP3;
 2332 
 2333         /*
 2334          * Get dcons buffer address
 2335          */
 2336         if (getenv_quad("dcons.addr", &dcons_addr) == 0 ||
 2337             getenv_quad("dcons.size", &dcons_size) == 0)
 2338                 dcons_addr = 0;
 2339 
 2340         /*
 2341          * physmap is in bytes, so when converting to page boundaries,
 2342          * round up the start address and round down the end address.
 2343          */
 2344         for (i = 0; i <= physmap_idx; i += 2) {
 2345                 vm_paddr_t end;
 2346 
 2347                 end = ptoa((vm_paddr_t)Maxmem);
 2348                 if (physmap[i + 1] < end)
 2349                         end = trunc_page(physmap[i + 1]);
 2350                 for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) {
 2351                         int tmp, page_bad, full;
 2352                         int *ptr = (int *)CADDR3;
 2353 
 2354                         full = FALSE;
 2355                         /*
 2356                          * block out kernel memory as not available.
 2357                          */
 2358                         if (pa >= KERNLOAD && pa < first)
 2359                                 goto do_dump_avail;
 2360 
 2361                         /*
 2362                          * block out dcons buffer
 2363                          */
 2364                         if (dcons_addr > 0
 2365                             && pa >= trunc_page(dcons_addr)
 2366                             && pa < dcons_addr + dcons_size)
 2367                                 goto do_dump_avail;
 2368 
 2369                         page_bad = FALSE;
 2370                         if (memtest == 0)
 2371                                 goto skip_memtest;
 2372 
 2373                         /*
 2374                          * map page into kernel: valid, read/write,non-cacheable
 2375                          */
 2376                         *pte = pa | PG_V | PG_RW | pg_n;
 2377                         invltlb();
 2378 
 2379                         tmp = *(int *)ptr;
 2380                         /*
 2381                          * Test for alternating 1's and 0's
 2382                          */
 2383                         *(volatile int *)ptr = 0xaaaaaaaa;
 2384                         if (*(volatile int *)ptr != 0xaaaaaaaa)
 2385                                 page_bad = TRUE;
 2386                         /*
 2387                          * Test for alternating 0's and 1's
 2388                          */
 2389                         *(volatile int *)ptr = 0x55555555;
 2390                         if (*(volatile int *)ptr != 0x55555555)
 2391                                 page_bad = TRUE;
 2392                         /*
 2393                          * Test for all 1's
 2394                          */
 2395                         *(volatile int *)ptr = 0xffffffff;
 2396                         if (*(volatile int *)ptr != 0xffffffff)
 2397                                 page_bad = TRUE;
 2398                         /*
 2399                          * Test for all 0's
 2400                          */
 2401                         *(volatile int *)ptr = 0x0;
 2402                         if (*(volatile int *)ptr != 0x0)
 2403                                 page_bad = TRUE;
 2404                         /*
 2405                          * Restore original value.
 2406                          */
 2407                         *(int *)ptr = tmp;
 2408 
 2409 skip_memtest:
 2410                         /*
 2411                          * Adjust array of valid/good pages.
 2412                          */
 2413                         if (page_bad == TRUE)
 2414                                 continue;
 2415                         /*
 2416                          * If this good page is a continuation of the
 2417                          * previous set of good pages, then just increase
 2418                          * the end pointer. Otherwise start a new chunk.
 2419                          * Note that "end" points one higher than end,
 2420                          * making the range >= start and < end.
 2421                          * If we're also doing a speculative memory
 2422                          * test and we at or past the end, bump up Maxmem
 2423                          * so that we keep going. The first bad page
 2424                          * will terminate the loop.
 2425                          */
 2426                         if (phys_avail[pa_indx] == pa) {
 2427                                 phys_avail[pa_indx] += PAGE_SIZE;
 2428                         } else {
 2429                                 pa_indx++;
 2430                                 if (pa_indx == PHYS_AVAIL_ARRAY_END) {
 2431                                         printf(
 2432                 "Too many holes in the physical address space, giving up\n");
 2433                                         pa_indx--;
 2434                                         full = TRUE;
 2435                                         goto do_dump_avail;
 2436                                 }
 2437                                 phys_avail[pa_indx++] = pa;     /* start */
 2438                                 phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */
 2439                         }
 2440                         physmem++;
 2441 do_dump_avail:
 2442                         if (dump_avail[da_indx] == pa) {
 2443                                 dump_avail[da_indx] += PAGE_SIZE;
 2444                         } else {
 2445                                 da_indx++;
 2446                                 if (da_indx == DUMP_AVAIL_ARRAY_END) {
 2447                                         da_indx--;
 2448                                         goto do_next;
 2449                                 }
 2450                                 dump_avail[da_indx++] = pa;     /* start */
 2451                                 dump_avail[da_indx] = pa + PAGE_SIZE; /* end */
 2452                         }
 2453 do_next:
 2454                         if (full)
 2455                                 break;
 2456                 }
 2457         }
 2458         *pte = 0;
 2459         invltlb();
 2460         
 2461         /*
 2462          * XXX
 2463          * The last chunk must contain at least one page plus the message
 2464          * buffer to avoid complicating other code (message buffer address
 2465          * calculation, etc.).
 2466          */
 2467         while (phys_avail[pa_indx - 1] + PAGE_SIZE +
 2468             round_page(msgbufsize) >= phys_avail[pa_indx]) {
 2469                 physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]);
 2470                 phys_avail[pa_indx--] = 0;
 2471                 phys_avail[pa_indx--] = 0;
 2472         }
 2473 
 2474         Maxmem = atop(phys_avail[pa_indx]);
 2475 
 2476         /* Trim off space for the message buffer. */
 2477         phys_avail[pa_indx] -= round_page(msgbufsize);
 2478 
 2479         /* Map the message buffer. */
 2480         for (off = 0; off < round_page(msgbufsize); off += PAGE_SIZE)
 2481                 pmap_kenter((vm_offset_t)msgbufp + off, phys_avail[pa_indx] +
 2482                     off);
 2483 
 2484         PT_UPDATES_FLUSH();
 2485 }
 2486 #else /* PC98 */
 /*
  * getmemsize() for the non-PC98 build: size physical memory at boot.
  *
  * Fills the local physmap[] array with (start, end) byte-address pairs
  * taken from, in order of preference, a loader-supplied SMAP, INT 15:E820,
  * or an INT 15:E801 / RTC fallback (XEN and Xbox use fixed values instead).
  * It then derives Maxmem, calls pmap_bootstrap(first), and (non-XEN) walks
  * every page of every segment to build phys_avail[] and dump_avail[] and
  * to count physmem, optionally running a small write/read-back test on
  * each page.  Finally it carves round_page(msgbufsize) bytes off the end
  * of the last phys_avail[] chunk and maps them at msgbufp.
  *
  * first: passed to pmap_bootstrap(); pages in [KERNLOAD, first) are kept
  *        out of phys_avail[] but still recorded in dump_avail[].
  */
 2487 static void
 2488 getmemsize(int first)
 2489 {
 2490         int has_smap, off, physmap_idx, pa_indx, da_indx;
 2491         u_long physmem_tunable, memtest;
 2492         vm_paddr_t physmap[PHYSMAP_SIZE];
 2493         pt_entry_t *pte;
 2494         quad_t dcons_addr, dcons_size;
 2495 #ifndef XEN
 2496         int hasbrokenint12, i, res;
 2497         u_int extmem;
 2498         struct vm86frame vmf;
 2499         struct vm86context vmc;
 2500         vm_paddr_t pa;
 2501         struct bios_smap *smap, *smapbase, *smapend;
 2502         u_int32_t smapsize;
 2503         caddr_t kmdp;
 2504 #endif
 2505 
              /*
               * physmap[] is used as (start, end) pairs; physmap_idx is the
               * index of the start of the last pair, so physmap[physmap_idx + 1]
               * is the end of the highest segment.
               */
 2506         has_smap = 0;
 2507 #if defined(XEN)
 2508         Maxmem = xen_start_info->nr_pages - init_first;
 2509         physmem = Maxmem;
 2510         basemem = 0;
 2511         physmap[0] = init_first << PAGE_SHIFT;
 2512         physmap[1] = ptoa(Maxmem) - round_page(msgbufsize);
 2513         physmap_idx = 0;
 2514 #else
 2515 #ifdef XBOX
 2516         if (arch_i386_is_xbox) {
 2517                 /*
 2518                  * We queried the memory size before, so chop off 4MB for
 2519                  * the framebuffer and inform the OS of this.
 2520                  */
 2521                 physmap[0] = 0;
 2522                 physmap[1] = (arch_i386_xbox_memsize * 1024 * 1024) - XBOX_FB_SIZE;
 2523                 physmap_idx = 0;
 2524                 goto physmap_done;
 2525         }
 2526 #endif
 2527         bzero(&vmf, sizeof(vmf));
 2528         bzero(physmap, sizeof(physmap));
 2529         basemem = 0;
 2530 
 2531         /*
 2532          * Check if the loader supplied an SMAP memory map.  If so,
 2533          * use that and do not make any VM86 calls.
 2534          */
 2535         physmap_idx = 0;
 2536         smapbase = NULL;
 2537         kmdp = preload_search_by_type("elf kernel");
 2538         if (kmdp == NULL)
 2539                 kmdp = preload_search_by_type("elf32 kernel");
 2540         if (kmdp != NULL)
 2541                 smapbase = (struct bios_smap *)preload_search_info(kmdp,
 2542                     MODINFO_METADATA | MODINFOMD_SMAP);
 2543         if (smapbase != NULL) {
 2544                 /*
 2545                  * subr_module.c says:
 2546                  * "Consumer may safely assume that size value precedes data."
 2547                  * ie: an int32_t immediately precedes SMAP.
 2548                  */
 2549                 smapsize = *((u_int32_t *)smapbase - 1);
 2550                 smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize);
 2551                 has_smap = 1;
 2552 
                      /* Stop early as soon as add_smap_entry() returns 0. */
 2553                 for (smap = smapbase; smap < smapend; smap++)
 2554                         if (!add_smap_entry(smap, physmap, &physmap_idx))
 2555                                 break;
 2556                 goto have_smap;
 2557         }
 2558 
 2559         /*
 2560          * Some newer BIOSes have a broken INT 12H implementation
 2561          * which causes a kernel panic immediately.  In this case, we
 2562          * need use the SMAP to determine the base memory size.
 2563          */
 2564         hasbrokenint12 = 0;
 2565         TUNABLE_INT_FETCH("hw.hasbrokenint12", &hasbrokenint12);
 2566         if (hasbrokenint12 == 0) {
 2567                 /* Use INT12 to determine base memory size. */
 2568                 vm86_intcall(0x12, &vmf);
 2569                 basemem = vmf.vmf_ax;
 2570                 basemem_setup();
 2571         }
 2572 
 2573         /*
 2574          * Fetch the memory map with INT 15:E820.  Map page 1 R/W into
 2575          * the kernel page table so we can use it as a buffer.  The
 2576          * kernel will unmap this page later.
 2577          */
 2578         pmap_kenter(KERNBASE + (1 << PAGE_SHIFT), 1 << PAGE_SHIFT);
 2579         vmc.npages = 0;
 2580         smap = (void *)vm86_addpage(&vmc, 1, KERNBASE + (1 << PAGE_SHIFT));
 2581         res = vm86_getptr(&vmc, (vm_offset_t)smap, &vmf.vmf_es, &vmf.vmf_di);
 2582         KASSERT(res != 0, ("vm86_getptr() failed: address not found"));
 2583 
              /*
               * Iterate the BIOS map: %ebx starts at 0 and the loop ends when
               * the call fails, %eax does not come back as SMAP_SIG,
               * add_smap_entry() returns 0, or %ebx returns to 0.
               */
 2584         vmf.vmf_ebx = 0;
 2585         do {
 2586                 vmf.vmf_eax = 0xE820;
 2587                 vmf.vmf_edx = SMAP_SIG;
 2588                 vmf.vmf_ecx = sizeof(struct bios_smap);
 2589                 i = vm86_datacall(0x15, &vmf, &vmc);
 2590                 if (i || vmf.vmf_eax != SMAP_SIG)
 2591                         break;
 2592                 has_smap = 1;
 2593                 if (!add_smap_entry(smap, physmap, &physmap_idx))
 2594                         break;
 2595         } while (vmf.vmf_ebx != 0);
 2596 
 2597 have_smap:
 2598         /*
 2599          * If we didn't fetch the "base memory" size from INT12,
 2600          * figure it out from the SMAP (or just guess).
 2601          */
 2602         if (basemem == 0) {
                      /* Use the end of the segment that starts at address 0, in KB. */
 2603                 for (i = 0; i <= physmap_idx; i += 2) {
 2604                         if (physmap[i] == 0x00000000) {
 2605                                 basemem = physmap[i + 1] / 1024;
 2606                                 break;
 2607                         }
 2608                 }
 2609 
 2610                 /* XXX: If we couldn't find basemem from SMAP, just guess. */
 2611                 if (basemem == 0)
 2612                         basemem = 640;
 2613                 basemem_setup();
 2614         }
 2615 
              /*
               * physmap[] was zeroed above, so a non-zero end for the first
               * segment means one of the SMAP paths produced a map.
               */
 2616         if (physmap[1] != 0)
 2617                 goto physmap_done;
 2618 
 2619         /*
 2620          * If we failed to find an SMAP, figure out the extended
 2621          * memory size.  We will then build a simple memory map with
 2622          * two segments, one for "base memory" and the second for
 2623          * "extended memory".  Note that "extended memory" starts at a
 2624          * physical address of 1MB and that both basemem and extmem
 2625          * are in units of 1KB.
 2626          *
 2627          * First, try to fetch the extended memory size via INT 15:E801.
 2628          */
 2629         vmf.vmf_ax = 0xE801;
 2630         if (vm86_intcall(0x15, &vmf) == 0) {
                      /* NOTE(review): the * 64 suggests %dx is in 64KB units - confirm. */
 2631                 extmem = vmf.vmf_cx + vmf.vmf_dx * 64;
 2632         } else {
 2633                 /*
 2634                  * If INT15:E801 fails, this is our last ditch effort
 2635                  * to determine the extended memory size.  Currently
 2636                  * we prefer the RTC value over INT15:88.
 2637                  */
 2638 #if 0
 2639                 vmf.vmf_ah = 0x88;
 2640                 vm86_intcall(0x15, &vmf);
 2641                 extmem = vmf.vmf_ax;
 2642 #else
 2643                 extmem = rtcin(RTC_EXTLO) + (rtcin(RTC_EXTHI) << 8);
 2644 #endif
 2645         }
 2646 
 2647         /*
 2648          * Special hack for chipsets that still remap the 384k hole when
 2649          * there's 16MB of memory - this really confuses people that
 2650          * are trying to use bus mastering ISA controllers with the
 2651          * "16MB limit"; they only have 16MB, but the remapping puts
 2652          * them beyond the limit.
 2653          *
 2654          * If extended memory is between 15-16MB (16-17MB phys address range),
 2655          *      chop it to 15MB.
 2656          */
 2657         if ((extmem > 15 * 1024) && (extmem < 16 * 1024))
 2658                 extmem = 15 * 1024;
 2659 
              /* Two-segment map: [0, basemem KB) and [1MB, 1MB + extmem KB). */
 2660         physmap[0] = 0;
 2661         physmap[1] = basemem * 1024;
 2662         physmap_idx = 2;
 2663         physmap[physmap_idx] = 0x100000;
 2664         physmap[physmap_idx + 1] = physmap[physmap_idx] + extmem * 1024;
 2665 
 2666 physmap_done:
 2667 #endif  
 2668         /*
 2669          * Now, physmap contains a map of physical memory.
 2670          */
 2671 
 2672 #ifdef SMP
 2673         /* make hole for AP bootstrap code */
 2674         physmap[1] = mp_bootaddress(physmap[1]);
 2675 #endif
 2676 
 2677         /*
 2678          * Maxmem isn't the "maximum memory", it's one larger than the
 2679          * highest page of the physical address space.  It should be
 2680          * called something like "Maxphyspage".  We may adjust this 
 2681          * based on ``hw.physmem'' and the results of the memory test.
 2682          */
 2683         Maxmem = atop(physmap[physmap_idx + 1]);
 2684 
 2685 #ifdef MAXMEM
              /* NOTE(review): presumably MAXMEM is in KB and a page is 4KB - confirm. */
 2686         Maxmem = MAXMEM / 4;
 2687 #endif
 2688 
 2689         if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable))
 2690                 Maxmem = atop(physmem_tunable);
 2691 
 2692         /*
 2693          * If we have an SMAP, don't allow MAXMEM or hw.physmem to extend
 2694          * the amount of memory in the system.
 2695          */
 2696         if (has_smap && Maxmem > atop(physmap[physmap_idx + 1]))
 2697                 Maxmem = atop(physmap[physmap_idx + 1]);
 2698 
 2699         /*
 2700          * By default enable the memory test on real hardware, and disable
 2701          * it if we appear to be running in a VM.  This avoids touching all
 2702          * pages unnecessarily, which doesn't matter on real hardware but is
 2703          * bad for shared VM hosts.  Use a general name so that
 2704          * one could eventually do more with the code than just disable it.
 2705          */
 2706         memtest = (vm_guest > VM_GUEST_NO) ? 0 : 1;
 2707         TUNABLE_ULONG_FETCH("hw.memtest.tests", &memtest);
 2708 
 2709         if (atop(physmap[physmap_idx + 1]) != Maxmem &&
 2710             (boothowto & RB_VERBOSE))
 2711                 printf("Physical memory use set to %ldK\n", Maxmem * 4);
 2712 
 2713         /*
 2714          * If Maxmem has been increased beyond what the system has detected,
 2715          * extend the last memory segment to the new limit.
 2716          */ 
 2717         if (atop(physmap[physmap_idx + 1]) < Maxmem)
 2718                 physmap[physmap_idx + 1] = ptoa((vm_paddr_t)Maxmem);
 2719 
 2720         /* call pmap initialization to make new kernel address space */
 2721         pmap_bootstrap(first);
 2722 
 2723         /*
 2724          * Size up each available chunk of physical memory.
 2725          */
 2726         physmap[0] = PAGE_SIZE;         /* mask off page 0 */
              /*
               * Seed the first phys_avail[] chunk as the empty range
               * [PAGE_SIZE, PAGE_SIZE) and set the end of the first
               * dump_avail[] chunk to PAGE_SIZE; dump_avail[0] is not written
               * here.  The page loop below grows these ends one page at a time.
               */
 2727         pa_indx = 0;
 2728         da_indx = 1;
 2729         phys_avail[pa_indx++] = physmap[0];
 2730         phys_avail[pa_indx] = physmap[0];
 2731         dump_avail[da_indx] = physmap[0];
 2732         pte = CMAP3;
 2733 
 2734         /*
 2735          * Get dcons buffer address
 2736          */
 2737         if (getenv_quad("dcons.addr", &dcons_addr) == 0 ||
 2738             getenv_quad("dcons.size", &dcons_size) == 0)
 2739                 dcons_addr = 0;
 2740 
 2741 #ifndef XEN
 2742         /*
 2743          * physmap is in bytes, so when converting to page boundaries,
 2744          * round up the start address and round down the end address.
 2745          */
 2746         for (i = 0; i <= physmap_idx; i += 2) {
 2747                 vm_paddr_t end;
 2748 
                      /* Never walk past Maxmem, even if the segment extends further. */
 2749                 end = ptoa((vm_paddr_t)Maxmem);
 2750                 if (physmap[i + 1] < end)
 2751                         end = trunc_page(physmap[i + 1]);
 2752                 for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) {
 2753                         int tmp, page_bad, full;
 2754                         int *ptr = (int *)CADDR3;
 2755 
 2756                         full = FALSE;
 2757                         /*
 2758                          * block out kernel memory as not available.
                               * (The page is still added to dump_avail[] below.)
 2759                          */
 2760                         if (pa >= KERNLOAD && pa < first)
 2761                                 goto do_dump_avail;
 2762 
 2763                         /*
 2764                          * block out dcons buffer
 2765                          */
 2766                         if (dcons_addr > 0
 2767                             && pa >= trunc_page(dcons_addr)
 2768                             && pa < dcons_addr + dcons_size)
 2769                                 goto do_dump_avail;
 2770 
 2771                         page_bad = FALSE;
 2772                         if (memtest == 0)
 2773                                 goto skip_memtest;
 2774 
 2775                         /*
 2776                          * map page into kernel: valid, read/write,non-cacheable
 2777                          */
 2778                         *pte = pa | PG_V | PG_RW | PG_N;
 2779                         invltlb();
 2780 
                              /*
                               * Only the single int at ptr is written and read
                               * back; its original contents are saved in tmp.
                               */
 2781                         tmp = *(int *)ptr;
 2782                         /*
 2783                          * Test for alternating 1's and 0's
 2784                          */
 2785                         *(volatile int *)ptr = 0xaaaaaaaa;
 2786                         if (*(volatile int *)ptr != 0xaaaaaaaa)
 2787                                 page_bad = TRUE;
 2788                         /*
 2789                          * Test for alternating 0's and 1's
 2790                          */
 2791                         *(volatile int *)ptr = 0x55555555;
 2792                         if (*(volatile int *)ptr != 0x55555555)
 2793                                 page_bad = TRUE;
 2794                         /*
 2795                          * Test for all 1's
 2796                          */
 2797                         *(volatile int *)ptr = 0xffffffff;
 2798                         if (*(volatile int *)ptr != 0xffffffff)
 2799                                 page_bad = TRUE;
 2800                         /*
 2801                          * Test for all 0's
 2802                          */
 2803                         *(volatile int *)ptr = 0x0;
 2804                         if (*(volatile int *)ptr != 0x0)
 2805                                 page_bad = TRUE;
 2806                         /*
 2807                          * Restore original value.
 2808                          */
 2809                         *(int *)ptr = tmp;
 2810 
 2811 skip_memtest:
 2812                         /*
 2813                          * Adjust array of valid/good pages.
                               * A bad page is left out of both phys_avail[]
                               * and dump_avail[].
 2814                          */
 2815                         if (page_bad == TRUE)
 2816                                 continue;
 2817                         /*
 2818                          * If this good page is a continuation of the
 2819                          * previous set of good pages, then just increase
 2820                          * the end pointer. Otherwise start a new chunk.
 2821                          * Note that "end" points one higher than end,
 2822                          * making the range >= start and < end.
 2823                          * If we're also doing a speculative memory
 2824                          * test and we are at or past the end, bump up Maxmem
 2825                          * so that we keep going. The first bad page
 2826                          * will terminate the loop.
                               *
                               * NOTE(review): nothing in this loop modifies
                               * Maxmem and a bad page only skips to the next
                               * one, so the last two sentences look stale.
 2827                          */
 2828                         if (phys_avail[pa_indx] == pa) {
 2829                                 phys_avail[pa_indx] += PAGE_SIZE;
 2830                         } else {
 2831                                 pa_indx++;
 2832                                 if (pa_indx == PHYS_AVAIL_ARRAY_END) {
                                              /*
                                               * Out of slots: undo the increment,
                                               * still record this page for dumps,
                                               * then leave the page loop at do_next.
                                               */
 2833                                         printf(
 2834                 "Too many holes in the physical address space, giving up\n");
 2835                                         pa_indx--;
 2836                                         full = TRUE;
 2837                                         goto do_dump_avail;
 2838                                 }
 2839                                 phys_avail[pa_indx++] = pa;     /* start */
 2840                                 phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */
 2841                         }
 2842                         physmem++;
 2843 do_dump_avail:
                              /* Same extend-or-start-new-chunk logic for dump_avail[]. */
 2844                         if (dump_avail[da_indx] == pa) {
 2845                                 dump_avail[da_indx] += PAGE_SIZE;
 2846                         } else {
 2847                                 da_indx++;
 2848                                 if (da_indx == DUMP_AVAIL_ARRAY_END) {
                                              /* dump_avail[] is full: drop the page silently. */
 2849                                         da_indx--;
 2850                                         goto do_next;
 2851                                 }
 2852                                 dump_avail[da_indx++] = pa;     /* start */
 2853                                 dump_avail[da_indx] = pa + PAGE_SIZE; /* end */
 2854                         }
 2855 do_next:
 2856                         if (full)
 2857                                 break;
 2858                 }
 2859         }
              /* Tear down the temporary test mapping. */
 2860         *pte = 0;
 2861         invltlb();
 2862 #else
              /* XEN: one chunk from physfree up to nr_pages pages; no page walk. */
 2863         phys_avail[0] = physfree;
 2864         phys_avail[1] = xen_start_info->nr_pages*PAGE_SIZE;
 2865         dump_avail[0] = 0;      
 2866         dump_avail[1] = xen_start_info->nr_pages*PAGE_SIZE;
 2867         
 2868 #endif
 2869         
 2870         /*
 2871          * XXX
 2872          * The last chunk must contain at least one page plus the message
 2873          * buffer to avoid complicating other code (message buffer address
 2874          * calculation, etc.).
               * Chunks that are too small are dropped from the end of
               * phys_avail[] and their pages subtracted from physmem.
 2875          */
 2876         while (phys_avail[pa_indx - 1] + PAGE_SIZE +
 2877             round_page(msgbufsize) >= phys_avail[pa_indx]) {
 2878                 physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]);
 2879                 phys_avail[pa_indx--] = 0;
 2880                 phys_avail[pa_indx--] = 0;
 2881         }
 2882 
              /* Taken before the trim below, so Maxmem still covers the message buffer. */
 2883         Maxmem = atop(phys_avail[pa_indx]);
 2884 
 2885         /* Trim off space for the message buffer. */
 2886         phys_avail[pa_indx] -= round_page(msgbufsize);
 2887 
 2888         /* Map the message buffer. */
 2889         for (off = 0; off < round_page(msgbufsize); off += PAGE_SIZE)
 2890                 pmap_kenter((vm_offset_t)msgbufp + off, phys_avail[pa_indx] +
 2891                     off);
 2892 
 2893         PT_UPDATES_FLUSH();
 2894 }
 2895 #endif /* PC98 */
 2896 
 2897 #ifdef XEN
 2898 #define MTOPSIZE (1<<(14 + PAGE_SHIFT))   /* 2^14 pages' worth of bytes; added to HYPERVISOR_VIRT_START for the segment limits in init386() */
 2899 
 /*
  * init386(), XEN variant: early machine-dependent startup.
  *
  * In order, this links proc0/thread0, picks up preload metadata and the
  * kernel environment from xen_start_info, builds the GDT and registers it
  * with the hypervisor, installs the trap table and the event/failsafe
  * callbacks, sets up CPU 0's pcpu and dpcpu areas, builds the LDT,
  * initializes the console, prepares the double-fault TSS, sizes memory
  * with getmemsize(), initializes the message buffer and finishes
  * thread0's pcb.
  *
  * first: used as a physical address; DPCPU_SIZE bytes starting there are
  *        mapped at first + KERNBASE for dpcpu data, and the advanced value
  *        is passed to getmemsize().
  *
  * Returns thread0.td_pcb cast to register_t.
  */
 2900 register_t
 2901 init386(first)
 2902         int first;
 2903 {
 2904         unsigned long gdtmachpfn;
 2905         int error, gsel_tss, metadata_missing, x, pa;
 2906         struct pcpu *pc;
 2907 #ifdef CPU_ENABLE_SSE
 2908         struct xstate_hdr *xhdr;
 2909 #endif
 2910         struct callback_register event = {
 2911                 .type = CALLBACKTYPE_event,
 2912                 .address = {GSEL(GCODE_SEL, SEL_KPL), (unsigned long)Xhypervisor_callback },
 2913         };
 2914         struct callback_register failsafe = {
 2915                 .type = CALLBACKTYPE_failsafe,
 2916                 .address = {GSEL(GCODE_SEL, SEL_KPL), (unsigned long)failsafe_callback },
 2917         };
 2918 
 2919         thread0.td_kstack = proc0kstack;
 2920         thread0.td_kstack_pages = KSTACK_PAGES;
 2921 
 2922         /*
 2923          * This may be done better later if it gets more high level
 2924          * components in it. If so just link td->td_proc here.
 2925          */
 2926         proc_linkup0(&proc0, &thread0);
 2927 
              /*
               * Preload metadata comes from xen_start_info->mod_start; its
               * absence is only reported later, once the console is up.
               */
 2928         metadata_missing = 0;
 2929         if (xen_start_info->mod_start) {
 2930                 preload_metadata = (caddr_t)xen_start_info->mod_start;
 2931                 preload_bootstrap_relocate(KERNBASE);
 2932         } else {
 2933                 metadata_missing = 1;
 2934         }
 2935         if (envmode == 1)
 2936                 kern_envp = static_env;
 2937         else if ((caddr_t)xen_start_info->cmd_line)
 2938                 kern_envp = xen_setbootenv((caddr_t)xen_start_info->cmd_line);
 2939 
 2940         boothowto |= xen_boothowto(kern_envp);
 2941         
 2942         /* Init basic tunables, hz etc */
 2943         init_param1();
 2944 
 2945         /*
 2946          * XEN occupies a portion of the upper virtual address space 
 2947          * At its base it manages an array mapping machine page frames 
 2948          * to physical page frames - hence we need to be able to 
 2949          * access 4GB - (64MB  - 4MB + 64k) 
 2950          */
 2951         gdt_segs[GPRIV_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE);
 2952         gdt_segs[GUFS_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE);
 2953         gdt_segs[GUGS_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE);
 2954         gdt_segs[GCODE_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE);
 2955         gdt_segs[GDATA_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE);
 2956         gdt_segs[GUCODE_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE);
 2957         gdt_segs[GUDATA_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE);
 2958         gdt_segs[GBIOSLOWMEM_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE);
 2959 
              /* GPRIV and GPROC0 are based at CPU 0's pcpu area and its common TSS. */
 2960         pc = &__pcpu[0];
 2961         gdt_segs[GPRIV_SEL].ssd_base = (int) pc;
 2962         gdt_segs[GPROC0_SEL].ssd_base = (int) &pc->pc_common_tss;
 2963 
              /* Map gdt with PG_RW while it is cleared and filled in. */
 2964         PT_SET_MA(gdt, xpmap_ptom(VTOP(gdt)) | PG_V | PG_RW);
 2965         bzero(gdt, PAGE_SIZE);
 2966         for (x = 0; x < NGDT; x++)
 2967                 ssdtosd(&gdt_segs[x], &gdt[x].sd);
 2968 
 2969         mtx_init(&dt_lock, "descriptor tables", NULL, MTX_SPIN);
 2970 
              /*
               * Remap gdt without PG_RW, then register its machine frame with
               * the hypervisor.  NOTE(review): 512 is presumably the entry
               * count - confirm against the hypercall interface.
               */
 2971         gdtmachpfn = vtomach(gdt) >> PAGE_SHIFT;
 2972         PT_SET_MA(gdt, xpmap_ptom(VTOP(gdt)) | PG_V);
 2973         PANIC_IF(HYPERVISOR_set_gdt(&gdtmachpfn, 512) != 0);    
 2974         lgdt(&r_gdt);
 2975         gdtset = 1;
 2976 
 2977         if ((error = HYPERVISOR_set_trap_table(trap_table)) != 0) {
 2978                 panic("set_trap_table failed - error %d\n", error);
 2979         }
 2980         
              /*
               * Register the event callback and, only if that returned 0, the
               * failsafe callback.
               */
 2981         error = HYPERVISOR_callback_op(CALLBACKOP_register, &event);
 2982         if (error == 0)
 2983                 error = HYPERVISOR_callback_op(CALLBACKOP_register, &failsafe);
 2984 #if     CONFIG_XEN_COMPAT <= 0x030002
              /*
               * Fall back to HYPERVISOR_set_callbacks() when callback_op
               * reported -ENOXENSYS.  NOTE(review): any other non-zero error
               * is not checked here.
               */
 2985         if (error == -ENOXENSYS)
 2986                 HYPERVISOR_set_callbacks(GSEL(GCODE_SEL, SEL_KPL),
 2987                     (unsigned long)Xhypervisor_callback,
 2988                     GSEL(GCODE_SEL, SEL_KPL), (unsigned long)failsafe_callback);
 2989 #endif
 2990         pcpu_init(pc, 0, sizeof(struct pcpu));
              /*
               * Map DPCPU_SIZE bytes starting at first to first + KERNBASE,
               * hand them to dpcpu_init(), then advance first, physfree and
               * init_first past them.  init_first is advanced in pages (it is
               * shifted by PAGE_SHIFT in getmemsize()), hence the division.
               */
 2991         for (pa = first; pa < first + DPCPU_SIZE; pa += PAGE_SIZE)
 2992                 pmap_kenter(pa + KERNBASE, pa);
 2993         dpcpu_init((void *)(first + KERNBASE), 0);
 2994         first += DPCPU_SIZE;
 2995         physfree += DPCPU_SIZE;
 2996         init_first += DPCPU_SIZE / PAGE_SIZE;
 2997 
 2998         PCPU_SET(prvspace, pc);
 2999         PCPU_SET(curthread, &thread0);
 3000 
 3001         /*
 3002          * Initialize mutexes.
 3003          *
 3004          * icu_lock: in order to allow an interrupt to occur in a critical
 3005          *           section, to set pcpu->ipending (etc...) properly, we
 3006          *           must be able to get the icu lock, so it can't be
 3007          *           under witness.
 3008          */
 3009         mutex_init();
 3010         mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS | MTX_NOPROFILE);
 3011 
 3012         /* make ldt memory segments */
 3013         PT_SET_MA(ldt, xpmap_ptom(VTOP(ldt)) | PG_V | PG_RW);
 3014         bzero(ldt, PAGE_SIZE);
 3015         ldt_segs[LUCODE_SEL].ssd_limit = atop(0 - 1);
 3016         ldt_segs[LUDATA_SEL].ssd_limit = atop(0 - 1);
 3017         for (x = 0; x < sizeof ldt_segs / sizeof ldt_segs[0]; x++)
 3018                 ssdtosd(&ldt_segs[x], &ldt[x].sd);
 3019 
 3020         default_proc_ldt.ldt_base = (caddr_t)ldt;
 3021         default_proc_ldt.ldt_len = 6;
 3022         _default_ldt = (int)&default_proc_ldt;
 3023         PCPU_SET(currentldt, _default_ldt);
              /* Clear PG_RW on the ldt mapping before handing it to xen_set_ldt(). */
 3024         PT_SET_MA(ldt, *vtopte((unsigned long)ldt) & ~PG_RW);
 3025         xen_set_ldt((unsigned long) ldt, (sizeof ldt_segs / sizeof ldt_segs[0]));
 3026         
 3027 #if defined(XEN_PRIVILEGED)
 3028         /*
 3029          * Initialize the i8254 before the console so that console
 3030          * initialization can use DELAY().
 3031          */
 3032         i8254_init();
 3033 #endif
 3034         
 3035         /*
 3036          * Initialize the console before we print anything out.
 3037          */
 3038         cninit();
 3039 
 3040         if (metadata_missing)
 3041                 printf("WARNING: loader(8) metadata is missing!\n");
 3042 
 3043 #ifdef DEV_ISA
 3044 #ifdef DEV_ATPIC
 3045         elcr_probe();
 3046         atpic_startup();
 3047 #else
 3048         /* Reset and mask the atpics and leave them shut down. */
 3049         atpic_reset();
 3050 
 3051         /*
 3052          * Point the ICU spurious interrupt vectors at the APIC spurious
 3053          * interrupt handler.
 3054          */
 3055         setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYS386IGT, SEL_KPL,
 3056             GSEL(GCODE_SEL, SEL_KPL));
 3057         setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYS386IGT, SEL_KPL,
 3058             GSEL(GCODE_SEL, SEL_KPL));
 3059 #endif
 3060 #endif
 3061 
 3062 #ifdef DDB
 3063         ksym_start = bootinfo.bi_symtab;
 3064         ksym_end = bootinfo.bi_esymtab;
 3065 #endif
 3066 
 3067         kdb_init();
 3068 
 3069 #ifdef KDB
 3070         if (boothowto & RB_KDB)
 3071                 kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger");
 3072 #endif
 3073 
 3074         finishidentcpu();       /* Final stage of CPU initialization */
 3075         setidt(IDT_UD, &IDTVEC(ill),  SDT_SYS386TGT, SEL_KPL,
 3076             GSEL(GCODE_SEL, SEL_KPL));
 3077         setidt(IDT_GP, &IDTVEC(prot),  SDT_SYS386TGT, SEL_KPL,
 3078             GSEL(GCODE_SEL, SEL_KPL));
 3079         initializecpu();        /* Initialize CPU registers */
 3080         initializecpucache();
 3081 
 3082         /* pointer to selector slot for %fs/%gs */
 3083         PCPU_SET(fsgs_gdt, &gdt[GUFS_SEL].sd);
 3084 
              /*
               * Fill in the double-fault TSS: all four stack pointers refer to
               * the address just past dblfault_stack (assuming byte-sized
               * elements - confirm), and execution starts at dblfault_handler.
               */
 3085         dblfault_tss.tss_esp = dblfault_tss.tss_esp0 = dblfault_tss.tss_esp1 =
 3086             dblfault_tss.tss_esp2 = (int)&dblfault_stack[sizeof(dblfault_stack)];
 3087         dblfault_tss.tss_ss = dblfault_tss.tss_ss0 = dblfault_tss.tss_ss1 =
 3088             dblfault_tss.tss_ss2 = GSEL(GDATA_SEL, SEL_KPL);
 3089 #if defined(PAE) || defined(PAE_TABLES)
 3090         dblfault_tss.tss_cr3 = (int)IdlePDPT;
 3091 #else
 3092         dblfault_tss.tss_cr3 = (int)IdlePTD;
 3093 #endif
 3094         dblfault_tss.tss_eip = (int)dblfault_handler;
 3095         dblfault_tss.tss_eflags = PSL_KERNEL;
 3096         dblfault_tss.tss_ds = dblfault_tss.tss_es =
 3097             dblfault_tss.tss_gs = GSEL(GDATA_SEL, SEL_KPL);
 3098         dblfault_tss.tss_fs = GSEL(GPRIV_SEL, SEL_KPL);
 3099         dblfault_tss.tss_cs = GSEL(GCODE_SEL, SEL_KPL);
 3100         dblfault_tss.tss_ldt = GSEL(GLDT_SEL, SEL_KPL);
 3101 
              /* Size memory (first was advanced past the dpcpu area above). */
 3102         vm86_initialize();
 3103         getmemsize(first);
 3104         init_param2(physmem);
 3105 
 3106         /* now running on new page tables, configured,and u/iom is accessible */
 3107 
 3108         msgbufinit(msgbufp, msgbufsize);
 3109 #ifdef DEV_NPX
 3110         npxinit(true);
 3111 #endif
 3112         /*
 3113          * Set up thread0 pcb after npxinit calculated pcb + fpu save
 3114          * area size.  Zero out the extended state header in fpu save
 3115          * area.
 3116          */
 3117         thread0.td_pcb = get_pcb_td(&thread0);
 3118         bzero(get_pcb_user_save_td(&thread0), cpu_max_ext_state_size);
 3119 #ifdef CPU_ENABLE_SSE
 3120         if (use_xsave) {
                      /* The xstate header sits one save-area object past the start. */
 3121                 xhdr = (struct xstate_hdr *)(get_pcb_user_save_td(&thread0) +
 3122                     1);
 3123                 xhdr->xstate_bv = xsave_mask;
 3124         }
 3125 #endif
 3126         PCPU_SET(curpcb, thread0.td_pcb);
 3127         /* make an initial tss so cpu can get interrupt stack on syscall! */
 3128         /* Note: -16 is so we can grow the trapframe if we came from vm86 */
 3129         PCPU_SET(common_tss.tss_esp0, (vm_offset_t)thread0.td_pcb - 16);
 3130         PCPU_SET(common_tss.tss_ss0, GSEL(GDATA_SEL, SEL_KPL));
              /* NOTE(review): gsel_tss is assigned but not read again in this function. */
 3131         gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
              /* Pass the same ss0/esp0 values to the hypervisor. */
 3132         HYPERVISOR_stack_switch(GSEL(GDATA_SEL, SEL_KPL),
 3133             PCPU_GET(common_tss.tss_esp0));
 3134         
 3135         /* transfer to user mode */
 3136 
 3137         _ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
 3138         _udatasel = GSEL(GUDATA_SEL, SEL_UPL);
 3139 
 3140         /* setup proc 0's pcb */
 3141         thread0.td_pcb->pcb_flags = 0;
 3142 #if defined(PAE) || defined(PAE_TABLES)
 3143         thread0.td_pcb->pcb_cr3 = (int)IdlePDPT;
 3144 #else
 3145         thread0.td_pcb->pcb_cr3 = (int)IdlePTD;
 3146 #endif
 3147         thread0.td_pcb->pcb_ext = 0;
 3148         thread0.td_frame = &proc0_tf;
 3149         thread0.td_pcb->pcb_fsd = PCPU_GET(fsgs_gdt)[0];
 3150         thread0.td_pcb->pcb_gsd = PCPU_GET(fsgs_gdt)[1];
 3151 
 3152         cpu_probe_amdc1e();
 3153 
 3154         /* Location of kernel stack for locore */
 3155         return ((register_t)thread0.td_pcb);
 3156 }
 3157 
 3158 #else
 3159 register_t
 3160 init386(int first)
 3161 {
 3162         struct gate_descriptor *gdp;
 3163         int gsel_tss, metadata_missing, x, pa;
 3164         struct pcpu *pc;
 3165 #ifdef CPU_ENABLE_SSE
 3166         struct xstate_hdr *xhdr;
 3167 #endif
 3168 
 3169         thread0.td_kstack = proc0kstack;
 3170         thread0.td_kstack_pages = TD0_KSTACK_PAGES;
 3171 
 3172         /*
 3173          * This may be done better later if it gets more high level
 3174          * components in it. If so just link td->td_proc here.
 3175          */
 3176         proc_linkup0(&proc0, &thread0);
 3177 
 3178 #ifdef PC98
 3179         /*
 3180          * Initialize DMAC
 3181          */
 3182         pc98_init_dmac();
 3183 #endif
 3184 
 3185         metadata_missing = 0;
 3186         if (bootinfo.bi_modulep) {
 3187                 preload_metadata = (caddr_t)bootinfo.bi_modulep + KERNBASE;
 3188                 preload_bootstrap_relocate(KERNBASE);
 3189         } else {
 3190                 metadata_missing = 1;
 3191         }
 3192 
 3193         if (bootinfo.bi_envp)
 3194                 init_static_kenv((caddr_t)bootinfo.bi_envp + KERNBASE, 0);
 3195         else
 3196                 init_static_kenv(NULL, 0);
 3197 
 3198 #ifndef XEN
 3199         identify_hypervisor();
 3200 #endif
 3201 
 3202         /* Init basic tunables, hz etc */
 3203         init_param1();
 3204 
 3205         /*
 3206          * Make gdt memory segments.  All segments cover the full 4GB
 3207          * of address space and permissions are enforced at page level.
 3208          */
 3209         gdt_segs[GCODE_SEL].ssd_limit = atop(0 - 1);
 3210         gdt_segs[GDATA_SEL].ssd_limit = atop(0 - 1);
 3211         gdt_segs[GUCODE_SEL].ssd_limit = atop(0 - 1);
 3212         gdt_segs[GUDATA_SEL].ssd_limit = atop(0 - 1);
 3213         gdt_segs[GUFS_SEL].ssd_limit = atop(0 - 1);
 3214         gdt_segs[GUGS_SEL].ssd_limit = atop(0 - 1);
 3215 
 3216         pc = &__pcpu[0];
 3217         gdt_segs[GPRIV_SEL].ssd_limit = atop(0 - 1);
 3218         gdt_segs[GPRIV_SEL].ssd_base = (int) pc;
 3219         gdt_segs[GPROC0_SEL].ssd_base = (int) &pc->pc_common_tss;
 3220 
 3221         for (x = 0; x < NGDT; x++)
 3222                 ssdtosd(&gdt_segs[x], &gdt[x].sd);
 3223 
 3224         r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
 3225         r_gdt.rd_base =  (int) gdt;
 3226         mtx_init(&dt_lock, "descriptor tables", NULL, MTX_SPIN);
 3227         lgdt(&r_gdt);
 3228 
 3229         pcpu_init(pc, 0, sizeof(struct pcpu));
 3230         for (pa = first; pa < first + DPCPU_SIZE; pa += PAGE_SIZE)
 3231                 pmap_kenter(pa + KERNBASE, pa);
 3232         dpcpu_init((void *)(first + KERNBASE), 0);
 3233         first += DPCPU_SIZE;
 3234         PCPU_SET(prvspace, pc);
 3235         PCPU_SET(curthread, &thread0);
 3236 
 3237         /*
 3238          * Initialize mutexes.
 3239          *
 3240          * icu_lock: in order to allow an interrupt to occur in a critical
 3241          *           section, to set pcpu->ipending (etc...) properly, we
 3242          *           must be able to get the icu lock, so it can't be
 3243          *           under witness.
 3244          */
 3245         mutex_init();
 3246         mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS | MTX_NOPROFILE);
 3247 
 3248         /* make ldt memory segments */
 3249         ldt_segs[LUCODE_SEL].ssd_limit = atop(0 - 1);
 3250         ldt_segs[LUDATA_SEL].ssd_limit = atop(0 - 1);
 3251         for (x = 0; x < sizeof ldt_segs / sizeof ldt_segs[0]; x++)
 3252                 ssdtosd(&ldt_segs[x], &ldt[x].sd);
 3253 
 3254         _default_ldt = GSEL(GLDT_SEL, SEL_KPL);
 3255         lldt(_default_ldt);
 3256         PCPU_SET(currentldt, _default_ldt);
 3257 
 3258         /* exceptions */
 3259         for (x = 0; x < NIDT; x++)
 3260                 setidt(x, &IDTVEC(rsvd), SDT_SYS386TGT, SEL_KPL,
 3261                     GSEL(GCODE_SEL, SEL_KPL));
 3262         setidt(IDT_DE, &IDTVEC(div),  SDT_SYS386TGT, SEL_KPL,
 3263             GSEL(GCODE_SEL, SEL_KPL));
 3264         setidt(IDT_DB, &IDTVEC(dbg),  SDT_SYS386IGT, SEL_KPL,
 3265             GSEL(GCODE_SEL, SEL_KPL));
 3266         setidt(IDT_NMI, &IDTVEC(nmi),  SDT_SYS386IGT, SEL_KPL,
 3267             GSEL(GCODE_SEL, SEL_KPL));
 3268         setidt(IDT_BP, &IDTVEC(bpt),  SDT_SYS386IGT, SEL_UPL,
 3269             GSEL(GCODE_SEL, SEL_KPL));
 3270         setidt(IDT_OF, &IDTVEC(ofl),  SDT_SYS386TGT, SEL_UPL,
 3271             GSEL(GCODE_SEL, SEL_KPL));
 3272         setidt(IDT_BR, &IDTVEC(bnd),  SDT_SYS386TGT, SEL_KPL,
 3273             GSEL(GCODE_SEL, SEL_KPL));
 3274         setidt(IDT_UD, &IDTVEC(ill),  SDT_SYS386TGT, SEL_KPL,
 3275             GSEL(GCODE_SEL, SEL_KPL));
 3276         setidt(IDT_NM, &IDTVEC(dna),  SDT_SYS386TGT, SEL_KPL
 3277             , GSEL(GCODE_SEL, SEL_KPL));
 3278         setidt(IDT_DF, 0,  SDT_SYSTASKGT, SEL_KPL, GSEL(GPANIC_SEL, SEL_KPL));
 3279         setidt(IDT_FPUGP, &IDTVEC(fpusegm),  SDT_SYS386TGT, SEL_KPL,
 3280             GSEL(GCODE_SEL, SEL_KPL));
 3281         setidt(IDT_TS, &IDTVEC(tss),  SDT_SYS386TGT, SEL_KPL,
 3282             GSEL(GCODE_SEL, SEL_KPL));
 3283         setidt(IDT_NP, &IDTVEC(missing),  SDT_SYS386TGT, SEL_KPL,
 3284             GSEL(GCODE_SEL, SEL_KPL));
 3285         setidt(IDT_SS, &IDTVEC(stk),  SDT_SYS386TGT, SEL_KPL,
 3286             GSEL(GCODE_SEL, SEL_KPL));
 3287         setidt(IDT_GP, &IDTVEC(prot),  SDT_SYS386TGT, SEL_KPL,
 3288             GSEL(GCODE_SEL, SEL_KPL));
 3289         setidt(IDT_PF, &IDTVEC(page),  SDT_SYS386IGT, SEL_KPL,
 3290             GSEL(GCODE_SEL, SEL_KPL));
 3291         setidt(IDT_MF, &IDTVEC(fpu),  SDT_SYS386TGT, SEL_KPL,
 3292             GSEL(GCODE_SEL, SEL_KPL));
 3293         setidt(IDT_AC, &IDTVEC(align), SDT_SYS386TGT, SEL_KPL,
 3294             GSEL(GCODE_SEL, SEL_KPL));
 3295         setidt(IDT_MC, &IDTVEC(mchk),  SDT_SYS386TGT, SEL_KPL,
 3296             GSEL(GCODE_SEL, SEL_KPL));
 3297         setidt(IDT_XF, &IDTVEC(xmm), SDT_SYS386TGT, SEL_KPL,
 3298             GSEL(GCODE_SEL, SEL_KPL));
 3299         setidt(IDT_SYSCALL, &IDTVEC(int0x80_syscall), SDT_SYS386TGT, SEL_UPL,
 3300             GSEL(GCODE_SEL, SEL_KPL));
 3301 #ifdef KDTRACE_HOOKS
 3302         setidt(IDT_DTRACE_RET, &IDTVEC(dtrace_ret), SDT_SYS386TGT, SEL_UPL,
 3303             GSEL(GCODE_SEL, SEL_KPL));
 3304 #endif
 3305 #ifdef XENHVM
 3306         setidt(IDT_EVTCHN, &IDTVEC(xen_intr_upcall), SDT_SYS386IGT, SEL_UPL,
 3307             GSEL(GCODE_SEL, SEL_KPL));
 3308 #endif
 3309 
 3310         r_idt.rd_limit = sizeof(idt0) - 1;
 3311         r_idt.rd_base = (int) idt;
 3312         lidt(&r_idt);
 3313 
 3314 #ifdef XBOX
 3315         /*
 3316          * The following code queries the PCI ID of 0:0:0. For the XBOX,
 3317          * This should be 0x10de / 0x02a5.
 3318          *
 3319          * This is exactly what Linux does.
 3320          */
 3321         outl(0xcf8, 0x80000000);
 3322         if (inl(0xcfc) == 0x02a510de) {
 3323                 arch_i386_is_xbox = 1;
 3324                 pic16l_setled(XBOX_LED_GREEN);
 3325 
 3326                 /*
 3327                  * We are an XBOX, but we may have either 64MB or 128MB of
 3328                  * memory. The PCI host bridge should be programmed for this,
 3329                  * so we just query it. 
 3330                  */
 3331                 outl(0xcf8, 0x80000084);
 3332                 arch_i386_xbox_memsize = (inl(0xcfc) == 0x7FFFFFF) ? 128 : 64;
 3333         }
 3334 #endif /* XBOX */
 3335 
 3336         /*
 3337          * Initialize the i8254 before the console so that console
 3338          * initialization can use DELAY().
 3339          */
 3340         i8254_init();
 3341 
 3342         finishidentcpu();       /* Final stage of CPU initialization */
 3343         setidt(IDT_UD, &IDTVEC(ill),  SDT_SYS386TGT, SEL_KPL,
 3344             GSEL(GCODE_SEL, SEL_KPL));
 3345         setidt(IDT_GP, &IDTVEC(prot),  SDT_SYS386TGT, SEL_KPL,
 3346             GSEL(GCODE_SEL, SEL_KPL));
 3347         initializecpu();        /* Initialize CPU registers */
 3348         initializecpucache();
 3349 
 3350         /* pointer to selector slot for %fs/%gs */
 3351         PCPU_SET(fsgs_gdt, &gdt[GUFS_SEL].sd);
 3352 
 3353         dblfault_tss.tss_esp = dblfault_tss.tss_esp0 = dblfault_tss.tss_esp1 =
 3354             dblfault_tss.tss_esp2 = (int)&dblfault_stack[sizeof(dblfault_stack)];
 3355         dblfault_tss.tss_ss = dblfault_tss.tss_ss0 = dblfault_tss.tss_ss1 =
 3356             dblfault_tss.tss_ss2 = GSEL(GDATA_SEL, SEL_KPL);
 3357 #if defined(PAE) || defined(PAE_TABLES)
 3358         dblfault_tss.tss_cr3 = (int)IdlePDPT;
 3359 #else
 3360         dblfault_tss.tss_cr3 = (int)IdlePTD;
 3361 #endif
 3362         dblfault_tss.tss_eip = (int)dblfault_handler;
 3363         dblfault_tss.tss_eflags = PSL_KERNEL;
 3364         dblfault_tss.tss_ds = dblfault_tss.tss_es =
 3365             dblfault_tss.tss_gs = GSEL(GDATA_SEL, SEL_KPL);
 3366         dblfault_tss.tss_fs = GSEL(GPRIV_SEL, SEL_KPL);
 3367         dblfault_tss.tss_cs = GSEL(GCODE_SEL, SEL_KPL);
 3368         dblfault_tss.tss_ldt = GSEL(GLDT_SEL, SEL_KPL);
 3369 
 3370         vm86_initialize();
 3371         getmemsize(first);
 3372         init_param2(physmem);
 3373 
 3374         /* now running on new page tables, configured,and u/iom is accessible */
 3375 
 3376         /*
 3377          * Initialize the console before we print anything out.
 3378          */
 3379         cninit();
 3380 
 3381         if (metadata_missing)
 3382                 printf("WARNING: loader(8) metadata is missing!\n");
 3383 
 3384 #ifdef DEV_ISA
 3385 #ifdef DEV_ATPIC
 3386 #ifndef PC98
 3387         elcr_probe();
 3388 #endif
 3389         atpic_startup();
 3390 #else
 3391         /* Reset and mask the atpics and leave them shut down. */
 3392         atpic_reset();
 3393 
 3394         /*
 3395          * Point the ICU spurious interrupt vectors at the APIC spurious
 3396          * interrupt handler.
 3397          */
 3398         setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYS386IGT, SEL_KPL,
 3399             GSEL(GCODE_SEL, SEL_KPL));
 3400         setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYS386IGT, SEL_KPL,
 3401             GSEL(GCODE_SEL, SEL_KPL));
 3402 #endif
 3403 #endif
 3404 
 3405 #ifdef DDB
 3406         ksym_start = bootinfo.bi_symtab;
 3407         ksym_end = bootinfo.bi_esymtab;
 3408 #endif
 3409 
 3410         kdb_init();
 3411 
 3412 #ifdef KDB
 3413         if (boothowto & RB_KDB)
 3414                 kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger");
 3415 #endif
 3416 
 3417         msgbufinit(msgbufp, msgbufsize);
 3418 #ifdef DEV_NPX
 3419         npxinit(true);
 3420 #endif
 3421         /*
 3422          * Set up thread0 pcb after npxinit calculated pcb + fpu save
 3423          * area size.  Zero out the extended state header in fpu save
 3424          * area.
 3425          */
 3426         thread0.td_pcb = get_pcb_td(&thread0);
 3427         thread0.td_pcb->pcb_save = get_pcb_user_save_td(&thread0);
 3428         bzero(get_pcb_user_save_td(&thread0), cpu_max_ext_state_size);
 3429 #ifdef CPU_ENABLE_SSE
 3430         if (use_xsave) {
 3431                 xhdr = (struct xstate_hdr *)(get_pcb_user_save_td(&thread0) +
 3432                     1);
 3433                 xhdr->xstate_bv = xsave_mask;
 3434         }
 3435 #endif
 3436         PCPU_SET(curpcb, thread0.td_pcb);
 3437         /* make an initial tss so cpu can get interrupt stack on syscall! */
 3438         /* Note: -16 is so we can grow the trapframe if we came from vm86 */
 3439         PCPU_SET(common_tss.tss_esp0, (vm_offset_t)thread0.td_pcb - 16);
 3440         PCPU_SET(common_tss.tss_ss0, GSEL(GDATA_SEL, SEL_KPL));
 3441         gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
 3442         PCPU_SET(tss_gdt, &gdt[GPROC0_SEL].sd);
 3443         PCPU_SET(common_tssd, *PCPU_GET(tss_gdt));
 3444         PCPU_SET(common_tss.tss_ioopt, (sizeof (struct i386tss)) << 16);
 3445         ltr(gsel_tss);
 3446 
 3447         /* make a call gate to reenter kernel with */
 3448         gdp = &ldt[LSYS5CALLS_SEL].gd;
 3449 
 3450         x = (int) &IDTVEC(lcall_syscall);
 3451         gdp->gd_looffset = x;
 3452         gdp->gd_selector = GSEL(GCODE_SEL,SEL_KPL);
 3453         gdp->gd_stkcpy = 1;
 3454         gdp->gd_type = SDT_SYS386CGT;
 3455         gdp->gd_dpl = SEL_UPL;
 3456         gdp->gd_p = 1;
 3457         gdp->gd_hioffset = x >> 16;
 3458 
 3459         /* XXX does this work? */
 3460         /* XXX yes! */
 3461         ldt[LBSDICALLS_SEL] = ldt[LSYS5CALLS_SEL];
 3462         ldt[LSOL26CALLS_SEL] = ldt[LSYS5CALLS_SEL];
 3463 
 3464         /* transfer to user mode */
 3465 
 3466         _ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
 3467         _udatasel = GSEL(GUDATA_SEL, SEL_UPL);
 3468 
 3469         /* setup proc 0's pcb */
 3470         thread0.td_pcb->pcb_flags = 0;
 3471 #if defined(PAE) || defined(PAE_TABLES)
 3472         thread0.td_pcb->pcb_cr3 = (int)IdlePDPT;
 3473 #else
 3474         thread0.td_pcb->pcb_cr3 = (int)IdlePTD;
 3475 #endif
 3476         thread0.td_pcb->pcb_ext = 0;
 3477         thread0.td_frame = &proc0_tf;
 3478 
 3479         cpu_probe_amdc1e();
 3480 
 3481 #ifdef FDT
 3482         x86_init_fdt();
 3483 #endif
 3484 
 3485         /* Location of kernel stack for locore */
 3486         return ((register_t)thread0.td_pcb);
 3487 }
 3488 #endif
 3489 
 3490 void
 3491 cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size)
 3492 {
 3493 
 3494         pcpu->pc_acpi_id = 0xffffffff;
 3495 }
 3496 
 3497 #ifndef PC98
 3498 static int
 3499 smap_sysctl_handler(SYSCTL_HANDLER_ARGS)
 3500 {
 3501         struct bios_smap *smapbase;
 3502         struct bios_smap_xattr smap;
 3503         caddr_t kmdp;
 3504         uint32_t *smapattr;
 3505         int count, error, i;
 3506 
 3507         /* Retrieve the system memory map from the loader. */
 3508         kmdp = preload_search_by_type("elf kernel");
 3509         if (kmdp == NULL)
 3510                 kmdp = preload_search_by_type("elf32 kernel");
 3511         if (kmdp == NULL)
 3512                 return (0);
 3513         smapbase = (struct bios_smap *)preload_search_info(kmdp,
 3514             MODINFO_METADATA | MODINFOMD_SMAP);
 3515         if (smapbase == NULL)
 3516                 return (0);
 3517         smapattr = (uint32_t *)preload_search_info(kmdp,
 3518             MODINFO_METADATA | MODINFOMD_SMAP_XATTR);
 3519         count = *((u_int32_t *)smapbase - 1) / sizeof(*smapbase);
 3520         error = 0;
 3521         for (i = 0; i < count; i++) {
 3522                 smap.base = smapbase[i].base;
 3523                 smap.length = smapbase[i].length;
 3524                 smap.type = smapbase[i].type;
 3525                 if (smapattr != NULL)
 3526                         smap.xattr = smapattr[i];
 3527                 else
 3528                         smap.xattr = 0;
 3529                 error = SYSCTL_OUT(req, &smap, sizeof(smap));
 3530         }
 3531         return (error);
 3532 }
 3533 SYSCTL_PROC(_machdep, OID_AUTO, smap, CTLTYPE_OPAQUE|CTLFLAG_RD, NULL, 0,
 3534     smap_sysctl_handler, "S,bios_smap_xattr", "Raw BIOS SMAP data");
 3535 #endif /* !PC98 */
 3536 
 3537 void
 3538 spinlock_enter(void)
 3539 {
 3540         struct thread *td;
 3541         register_t flags;
 3542 
 3543         td = curthread;
 3544         if (td->td_md.md_spinlock_count == 0) {
 3545                 flags = intr_disable();
 3546                 td->td_md.md_spinlock_count = 1;
 3547                 td->td_md.md_saved_flags = flags;
 3548         } else
 3549                 td->td_md.md_spinlock_count++;
 3550         critical_enter();
 3551 }
 3552 
 3553 void
 3554 spinlock_exit(void)
 3555 {
 3556         struct thread *td;
 3557         register_t flags;
 3558 
 3559         td = curthread;
 3560         critical_exit();
 3561         flags = td->td_md.md_saved_flags;
 3562         td->td_md.md_spinlock_count--;
 3563         if (td->td_md.md_spinlock_count == 0)
 3564                 intr_restore(flags);
 3565 }
 3566 
 3567 #if defined(I586_CPU) && !defined(NO_F00F_HACK)
 3568 static void f00f_hack(void *unused);
 3569 SYSINIT(f00f_hack, SI_SUB_INTRINSIC, SI_ORDER_FIRST, f00f_hack, NULL);
 3570 
 3571 static void
 3572 f00f_hack(void *unused)
 3573 {
 3574         struct gate_descriptor *new_idt;
 3575         vm_offset_t tmp;
 3576 
 3577         if (!has_f00f_bug)
 3578                 return;
 3579 
 3580         GIANT_REQUIRED;
 3581 
 3582         printf("Intel Pentium detected, installing workaround for F00F bug\n");
 3583 
 3584         tmp = kmem_malloc(kernel_arena, PAGE_SIZE * 2, M_WAITOK | M_ZERO);
 3585         if (tmp == 0)
 3586                 panic("kmem_malloc returned 0");
 3587 
 3588         /* Put the problematic entry (#6) at the end of the lower page. */
 3589         new_idt = (struct gate_descriptor*)
 3590             (tmp + PAGE_SIZE - 7 * sizeof(struct gate_descriptor));
 3591         bcopy(idt, new_idt, sizeof(idt0));
 3592         r_idt.rd_base = (u_int)new_idt;
 3593         lidt(&r_idt);
 3594         idt = new_idt;
 3595         pmap_protect(kernel_pmap, tmp, tmp + PAGE_SIZE, VM_PROT_READ);
 3596 }
 3597 #endif /* defined(I586_CPU) && !NO_F00F_HACK */
 3598 
 3599 /*
 3600  * Construct a PCB from a trapframe. This is called from kdb_trap() where
 3601  * we want to start a backtrace from the function that caused us to enter
 3602  * the debugger. We have the context in the trapframe, but base the trace
 3603  * on the PCB. The PCB doesn't have to be perfect, as long as it contains
 3604  * enough for a backtrace.
 3605  */
 3606 void
 3607 makectx(struct trapframe *tf, struct pcb *pcb)
 3608 {
 3609 
 3610         pcb->pcb_edi = tf->tf_edi;
 3611         pcb->pcb_esi = tf->tf_esi;
 3612         pcb->pcb_ebp = tf->tf_ebp;
 3613         pcb->pcb_ebx = tf->tf_ebx;
 3614         pcb->pcb_eip = tf->tf_eip;
 3615         pcb->pcb_esp = (ISPL(tf->tf_cs)) ? tf->tf_esp : (int)(tf + 1) - 8;
 3616         pcb->pcb_gs = rgs();
 3617 }
 3618 
 3619 int
 3620 ptrace_set_pc(struct thread *td, u_long addr)
 3621 {
 3622 
 3623         td->td_frame->tf_eip = addr;
 3624         return (0);
 3625 }
 3626 
 3627 int
 3628 ptrace_single_step(struct thread *td)
 3629 {
 3630         td->td_frame->tf_eflags |= PSL_T;
 3631         return (0);
 3632 }
 3633 
 3634 int
 3635 ptrace_clear_single_step(struct thread *td)
 3636 {
 3637         td->td_frame->tf_eflags &= ~PSL_T;
 3638         return (0);
 3639 }
 3640 
 3641 int
 3642 fill_regs(struct thread *td, struct reg *regs)
 3643 {
 3644         struct pcb *pcb;
 3645         struct trapframe *tp;
 3646 
 3647         tp = td->td_frame;
 3648         pcb = td->td_pcb;
 3649         regs->r_gs = pcb->pcb_gs;
 3650         return (fill_frame_regs(tp, regs));
 3651 }
 3652 
 3653 int
 3654 fill_frame_regs(struct trapframe *tp, struct reg *regs)
 3655 {
 3656         regs->r_fs = tp->tf_fs;
 3657         regs->r_es = tp->tf_es;
 3658         regs->r_ds = tp->tf_ds;
 3659         regs->r_edi = tp->tf_edi;
 3660         regs->r_esi = tp->tf_esi;
 3661         regs->r_ebp = tp->tf_ebp;
 3662         regs->r_ebx = tp->tf_ebx;
 3663         regs->r_edx = tp->tf_edx;
 3664         regs->r_ecx = tp->tf_ecx;
 3665         regs->r_eax = tp->tf_eax;
 3666         regs->r_eip = tp->tf_eip;
 3667         regs->r_cs = tp->tf_cs;
 3668         regs->r_eflags = tp->tf_eflags;
 3669         regs->r_esp = tp->tf_esp;
 3670         regs->r_ss = tp->tf_ss;
 3671         return (0);
 3672 }
 3673 
 3674 int
 3675 set_regs(struct thread *td, struct reg *regs)
 3676 {
 3677         struct pcb *pcb;
 3678         struct trapframe *tp;
 3679 
 3680         tp = td->td_frame;
 3681         if (!EFL_SECURE(regs->r_eflags, tp->tf_eflags) ||
 3682             !CS_SECURE(regs->r_cs))
 3683                 return (EINVAL);
 3684         pcb = td->td_pcb;
 3685         tp->tf_fs = regs->r_fs;
 3686         tp->tf_es = regs->r_es;
 3687         tp->tf_ds = regs->r_ds;
 3688         tp->tf_edi = regs->r_edi;
 3689         tp->tf_esi = regs->r_esi;
 3690         tp->tf_ebp = regs->r_ebp;
 3691         tp->tf_ebx = regs->r_ebx;
 3692         tp->tf_edx = regs->r_edx;
 3693         tp->tf_ecx = regs->r_ecx;
 3694         tp->tf_eax = regs->r_eax;
 3695         tp->tf_eip = regs->r_eip;
 3696         tp->tf_cs = regs->r_cs;
 3697         tp->tf_eflags = regs->r_eflags;
 3698         tp->tf_esp = regs->r_esp;
 3699         tp->tf_ss = regs->r_ss;
 3700         pcb->pcb_gs = regs->r_gs;
 3701         return (0);
 3702 }
 3703 
 3704 #ifdef CPU_ENABLE_SSE
 3705 static void
 3706 fill_fpregs_xmm(sv_xmm, sv_87)
 3707         struct savexmm *sv_xmm;
 3708         struct save87 *sv_87;
 3709 {
 3710         register struct env87 *penv_87 = &sv_87->sv_env;
 3711         register struct envxmm *penv_xmm = &sv_xmm->sv_env;
 3712         int i;
 3713 
 3714         bzero(sv_87, sizeof(*sv_87));
 3715 
 3716         /* FPU control/status */
 3717         penv_87->en_cw = penv_xmm->en_cw;
 3718         penv_87->en_sw = penv_xmm->en_sw;
 3719         penv_87->en_tw = penv_xmm->en_tw;
 3720         penv_87->en_fip = penv_xmm->en_fip;
 3721         penv_87->en_fcs = penv_xmm->en_fcs;
 3722         penv_87->en_opcode = penv_xmm->en_opcode;
 3723         penv_87->en_foo = penv_xmm->en_foo;
 3724         penv_87->en_fos = penv_xmm->en_fos;
 3725 
 3726         /* FPU registers */
 3727         for (i = 0; i < 8; ++i)
 3728                 sv_87->sv_ac[i] = sv_xmm->sv_fp[i].fp_acc;
 3729 }
 3730 
 3731 static void
 3732 set_fpregs_xmm(sv_87, sv_xmm)
 3733         struct save87 *sv_87;
 3734         struct savexmm *sv_xmm;
 3735 {
 3736         register struct env87 *penv_87 = &sv_87->sv_env;
 3737         register struct envxmm *penv_xmm = &sv_xmm->sv_env;
 3738         int i;
 3739 
 3740         /* FPU control/status */
 3741         penv_xmm->en_cw = penv_87->en_cw;
 3742         penv_xmm->en_sw = penv_87->en_sw;
 3743         penv_xmm->en_tw = penv_87->en_tw;
 3744         penv_xmm->en_fip = penv_87->en_fip;
 3745         penv_xmm->en_fcs = penv_87->en_fcs;
 3746         penv_xmm->en_opcode = penv_87->en_opcode;
 3747         penv_xmm->en_foo = penv_87->en_foo;
 3748         penv_xmm->en_fos = penv_87->en_fos;
 3749 
 3750         /* FPU registers */
 3751         for (i = 0; i < 8; ++i)
 3752                 sv_xmm->sv_fp[i].fp_acc = sv_87->sv_ac[i];
 3753 }
 3754 #endif /* CPU_ENABLE_SSE */
 3755 
 3756 int
 3757 fill_fpregs(struct thread *td, struct fpreg *fpregs)
 3758 {
 3759 
 3760         KASSERT(td == curthread || TD_IS_SUSPENDED(td) ||
 3761             P_SHOULDSTOP(td->td_proc),
 3762             ("not suspended thread %p", td));
 3763 #ifdef DEV_NPX
 3764         npxgetregs(td);
 3765 #else
 3766         bzero(fpregs, sizeof(*fpregs));
 3767 #endif
 3768 #ifdef CPU_ENABLE_SSE
 3769         if (cpu_fxsr)
 3770                 fill_fpregs_xmm(&get_pcb_user_save_td(td)->sv_xmm,
 3771                     (struct save87 *)fpregs);
 3772         else
 3773 #endif /* CPU_ENABLE_SSE */
 3774                 bcopy(&get_pcb_user_save_td(td)->sv_87, fpregs,
 3775                     sizeof(*fpregs));
 3776         return (0);
 3777 }
 3778 
 3779 int
 3780 set_fpregs(struct thread *td, struct fpreg *fpregs)
 3781 {
 3782 
 3783         critical_enter();
 3784 #ifdef CPU_ENABLE_SSE
 3785         if (cpu_fxsr)
 3786                 set_fpregs_xmm((struct save87 *)fpregs,
 3787                     &get_pcb_user_save_td(td)->sv_xmm);
 3788         else
 3789 #endif /* CPU_ENABLE_SSE */
 3790                 bcopy(fpregs, &get_pcb_user_save_td(td)->sv_87,
 3791                     sizeof(*fpregs));
 3792 #ifdef DEV_NPX
 3793         npxuserinited(td);
 3794 #endif
 3795         critical_exit();
 3796         return (0);
 3797 }
 3798 
 3799 /*
 3800  * Get machine context.
 3801  */
 3802 int
 3803 get_mcontext(struct thread *td, mcontext_t *mcp, int flags)
 3804 {
 3805         struct trapframe *tp;
 3806         struct segment_descriptor *sdp;
 3807 
 3808         tp = td->td_frame;
 3809 
 3810         PROC_LOCK(curthread->td_proc);
 3811         mcp->mc_onstack = sigonstack(tp->tf_esp);
 3812         PROC_UNLOCK(curthread->td_proc);
 3813         mcp->mc_gs = td->td_pcb->pcb_gs;
 3814         mcp->mc_fs = tp->tf_fs;
 3815         mcp->mc_es = tp->tf_es;
 3816         mcp->mc_ds = tp->tf_ds;
 3817         mcp->mc_edi = tp->tf_edi;
 3818         mcp->mc_esi = tp->tf_esi;
 3819         mcp->mc_ebp = tp->tf_ebp;
 3820         mcp->mc_isp = tp->tf_isp;
 3821         mcp->mc_eflags = tp->tf_eflags;
 3822         if (flags & GET_MC_CLEAR_RET) {
 3823                 mcp->mc_eax = 0;
 3824                 mcp->mc_edx = 0;
 3825                 mcp->mc_eflags &= ~PSL_C;
 3826         } else {
 3827                 mcp->mc_eax = tp->tf_eax;
 3828                 mcp->mc_edx = tp->tf_edx;
 3829         }
 3830         mcp->mc_ebx = tp->tf_ebx;
 3831         mcp->mc_ecx = tp->tf_ecx;
 3832         mcp->mc_eip = tp->tf_eip;
 3833         mcp->mc_cs = tp->tf_cs;
 3834         mcp->mc_esp = tp->tf_esp;
 3835         mcp->mc_ss = tp->tf_ss;
 3836         mcp->mc_len = sizeof(*mcp);
 3837         get_fpcontext(td, mcp, NULL, 0);
 3838         sdp = &td->td_pcb->pcb_fsd;
 3839         mcp->mc_fsbase = sdp->sd_hibase << 24 | sdp->sd_lobase;
 3840         sdp = &td->td_pcb->pcb_gsd;
 3841         mcp->mc_gsbase = sdp->sd_hibase << 24 | sdp->sd_lobase;
 3842         mcp->mc_flags = 0;
 3843         mcp->mc_xfpustate = 0;
 3844         mcp->mc_xfpustate_len = 0;
 3845         bzero(mcp->mc_spare2, sizeof(mcp->mc_spare2));
 3846         return (0);
 3847 }
 3848 
 3849 /*
 3850  * Set machine context.
 3851  *
 3852  * However, we don't set any but the user modifiable flags, and we won't
 3853  * touch the cs selector.
 3854  */
 3855 int
 3856 set_mcontext(struct thread *td, mcontext_t *mcp)
 3857 {
 3858         struct trapframe *tp;
 3859         char *xfpustate;
 3860         int eflags, ret;
 3861 
 3862         tp = td->td_frame;
 3863         if (mcp->mc_len != sizeof(*mcp) ||
 3864             (mcp->mc_flags & ~_MC_FLAG_MASK) != 0)
 3865                 return (EINVAL);
 3866         eflags = (mcp->mc_eflags & PSL_USERCHANGE) |
 3867             (tp->tf_eflags & ~PSL_USERCHANGE);
 3868         if (mcp->mc_flags & _MC_HASFPXSTATE) {
 3869                 if (mcp->mc_xfpustate_len > cpu_max_ext_state_size -
 3870                     sizeof(union savefpu))
 3871                         return (EINVAL);
 3872                 xfpustate = __builtin_alloca(mcp->mc_xfpustate_len);
 3873                 ret = copyin((void *)mcp->mc_xfpustate, xfpustate,
 3874                     mcp->mc_xfpustate_len);
 3875                 if (ret != 0)
 3876                         return (ret);
 3877         } else
 3878                 xfpustate = NULL;
 3879         ret = set_fpcontext(td, mcp, xfpustate, mcp->mc_xfpustate_len);
 3880         if (ret != 0)
 3881                 return (ret);
 3882         tp->tf_fs = mcp->mc_fs;
 3883         tp->tf_es = mcp->mc_es;
 3884         tp->tf_ds = mcp->mc_ds;
 3885         tp->tf_edi = mcp->mc_edi;
 3886         tp->tf_esi = mcp->mc_esi;
 3887         tp->tf_ebp = mcp->mc_ebp;
 3888         tp->tf_ebx = mcp->mc_ebx;
 3889         tp->tf_edx = mcp->mc_edx;
 3890         tp->tf_ecx = mcp->mc_ecx;
 3891         tp->tf_eax = mcp->mc_eax;
 3892         tp->tf_eip = mcp->mc_eip;
 3893         tp->tf_eflags = eflags;
 3894         tp->tf_esp = mcp->mc_esp;
 3895         tp->tf_ss = mcp->mc_ss;
 3896         td->td_pcb->pcb_gs = mcp->mc_gs;
 3897         return (0);
 3898 }
 3899 
 3900 static void
 3901 get_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpusave,
 3902     size_t xfpusave_len)
 3903 {
 3904 #ifdef CPU_ENABLE_SSE
 3905         size_t max_len, len;
 3906 #endif
 3907 
 3908 #ifndef DEV_NPX
 3909         mcp->mc_fpformat = _MC_FPFMT_NODEV;
 3910         mcp->mc_ownedfp = _MC_FPOWNED_NONE;
 3911         bzero(mcp->mc_fpstate, sizeof(mcp->mc_fpstate));
 3912 #else
 3913         mcp->mc_ownedfp = npxgetregs(td);
 3914         bcopy(get_pcb_user_save_td(td), &mcp->mc_fpstate[0],
 3915             sizeof(mcp->mc_fpstate));
 3916         mcp->mc_fpformat = npxformat();
 3917 #ifdef CPU_ENABLE_SSE
 3918         if (!use_xsave || xfpusave_len == 0)
 3919                 return;
 3920         max_len = cpu_max_ext_state_size - sizeof(union savefpu);
 3921         len = xfpusave_len;
 3922         if (len > max_len) {
 3923                 len = max_len;
 3924                 bzero(xfpusave + max_len, len - max_len);
 3925         }
 3926         mcp->mc_flags |= _MC_HASFPXSTATE;
 3927         mcp->mc_xfpustate_len = len;
 3928         bcopy(get_pcb_user_save_td(td) + 1, xfpusave, len);
 3929 #endif
 3930 #endif
 3931 }
 3932 
 3933 static int
 3934 set_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpustate,
 3935     size_t xfpustate_len)
 3936 {
 3937         int error;
 3938 
 3939         if (mcp->mc_fpformat == _MC_FPFMT_NODEV)
 3940                 return (0);
 3941         else if (mcp->mc_fpformat != _MC_FPFMT_387 &&
 3942             mcp->mc_fpformat != _MC_FPFMT_XMM)
 3943                 return (EINVAL);
 3944         else if (mcp->mc_ownedfp == _MC_FPOWNED_NONE) {
 3945                 /* We don't care what state is left in the FPU or PCB. */
 3946                 fpstate_drop(td);
 3947                 error = 0;
 3948         } else if (mcp->mc_ownedfp == _MC_FPOWNED_FPU ||
 3949             mcp->mc_ownedfp == _MC_FPOWNED_PCB) {
 3950 #ifdef DEV_NPX
 3951                 error = npxsetregs(td, (union savefpu *)&mcp->mc_fpstate,
 3952                     xfpustate, xfpustate_len);
 3953 #else
 3954                 error = EINVAL;
 3955 #endif
 3956         } else
 3957                 return (EINVAL);
 3958         return (error);
 3959 }
 3960 
 3961 static void
 3962 fpstate_drop(struct thread *td)
 3963 {
 3964 
 3965         KASSERT(PCB_USER_FPU(td->td_pcb), ("fpstate_drop: kernel-owned fpu"));
 3966         critical_enter();
 3967 #ifdef DEV_NPX
 3968         if (PCPU_GET(fpcurthread) == td)
 3969                 npxdrop();
 3970 #endif
 3971         /*
 3972          * XXX force a full drop of the npx.  The above only drops it if we
 3973          * owned it.  npxgetregs() has the same bug in the !cpu_fxsr case.
 3974          *
 3975          * XXX I don't much like npxgetregs()'s semantics of doing a full
 3976          * drop.  Dropping only to the pcb matches fnsave's behaviour.
 3977          * We only need to drop to !PCB_INITDONE in sendsig().  But
 3978          * sendsig() is the only caller of npxgetregs()... perhaps we just
 3979          * have too many layers.
 3980          */
 3981         curthread->td_pcb->pcb_flags &= ~(PCB_NPXINITDONE |
 3982             PCB_NPXUSERINITDONE);
 3983         critical_exit();
 3984 }
 3985 
 3986 int
 3987 fill_dbregs(struct thread *td, struct dbreg *dbregs)
 3988 {
 3989         struct pcb *pcb;
 3990 
 3991         if (td == NULL) {
 3992                 dbregs->dr[0] = rdr0();
 3993                 dbregs->dr[1] = rdr1();
 3994                 dbregs->dr[2] = rdr2();
 3995                 dbregs->dr[3] = rdr3();
 3996                 dbregs->dr[4] = rdr4();
 3997                 dbregs->dr[5] = rdr5();
 3998                 dbregs->dr[6] = rdr6();
 3999                 dbregs->dr[7] = rdr7();
 4000         } else {
 4001                 pcb = td->td_pcb;
 4002                 dbregs->dr[0] = pcb->pcb_dr0;
 4003                 dbregs->dr[1] = pcb->pcb_dr1;
 4004                 dbregs->dr[2] = pcb->pcb_dr2;
 4005                 dbregs->dr[3] = pcb->pcb_dr3;
 4006                 dbregs->dr[4] = 0;
 4007                 dbregs->dr[5] = 0;
 4008                 dbregs->dr[6] = pcb->pcb_dr6;
 4009                 dbregs->dr[7] = pcb->pcb_dr7;
 4010         }
 4011         return (0);
 4012 }
 4013 
 4014 int
 4015 set_dbregs(struct thread *td, struct dbreg *dbregs)
 4016 {
 4017         struct pcb *pcb;
 4018         int i;
 4019 
 4020         if (td == NULL) {
 4021                 load_dr0(dbregs->dr[0]);
 4022                 load_dr1(dbregs->dr[1]);
 4023                 load_dr2(dbregs->dr[2]);
 4024                 load_dr3(dbregs->dr[3]);
 4025                 load_dr4(dbregs->dr[4]);
 4026                 load_dr5(dbregs->dr[5]);
 4027                 load_dr6(dbregs->dr[6]);
 4028                 load_dr7(dbregs->dr[7]);
 4029         } else {
 4030                 /*
 4031                  * Don't let an illegal value for dr7 get set.  Specifically,
 4032                  * check for undefined settings.  Setting these bit patterns
 4033                  * result in undefined behaviour and can lead to an unexpected
 4034                  * TRCTRAP.
 4035                  */
 4036                 for (i = 0; i < 4; i++) {
 4037                         if (DBREG_DR7_ACCESS(dbregs->dr[7], i) == 0x02)
 4038                                 return (EINVAL);
 4039                         if (DBREG_DR7_LEN(dbregs->dr[7], i) == 0x02)
 4040                                 return (EINVAL);
 4041                 }
 4042                 
 4043                 pcb = td->td_pcb;
 4044                 
 4045                 /*
 4046                  * Don't let a process set a breakpoint that is not within the
 4047                  * process's address space.  If a process could do this, it
 4048                  * could halt the system by setting a breakpoint in the kernel
 4049                  * (if ddb was enabled).  Thus, we need to check to make sure
 4050                  * that no breakpoints are being enabled for addresses outside
 4051                  * process's address space.
 4052                  *
 4053                  * XXX - what about when the watched area of the user's
 4054                  * address space is written into from within the kernel
 4055                  * ... wouldn't that still cause a breakpoint to be generated
 4056                  * from within kernel mode?
 4057                  */
 4058 
 4059                 if (DBREG_DR7_ENABLED(dbregs->dr[7], 0)) {
 4060                         /* dr0 is enabled */
 4061                         if (dbregs->dr[0] >= VM_MAXUSER_ADDRESS)
 4062                                 return (EINVAL);
 4063                 }
 4064                         
 4065                 if (DBREG_DR7_ENABLED(dbregs->dr[7], 1)) {
 4066                         /* dr1 is enabled */
 4067                         if (dbregs->dr[1] >= VM_MAXUSER_ADDRESS)
 4068                                 return (EINVAL);
 4069                 }
 4070                         
 4071                 if (DBREG_DR7_ENABLED(dbregs->dr[7], 2)) {
 4072                         /* dr2 is enabled */
 4073                         if (dbregs->dr[2] >= VM_MAXUSER_ADDRESS)
 4074                                 return (EINVAL);
 4075                 }
 4076                         
 4077                 if (DBREG_DR7_ENABLED(dbregs->dr[7], 3)) {
 4078                         /* dr3 is enabled */
 4079                         if (dbregs->dr[3] >= VM_MAXUSER_ADDRESS)
 4080                                 return (EINVAL);
 4081                 }
 4082 
 4083                 pcb->pcb_dr0 = dbregs->dr[0];
 4084                 pcb->pcb_dr1 = dbregs->dr[1];
 4085                 pcb->pcb_dr2 = dbregs->dr[2];
 4086                 pcb->pcb_dr3 = dbregs->dr[3];
 4087                 pcb->pcb_dr6 = dbregs->dr[6];
 4088                 pcb->pcb_dr7 = dbregs->dr[7];
 4089 
 4090                 pcb->pcb_flags |= PCB_DBREGS;
 4091         }
 4092 
 4093         return (0);
 4094 }
 4095 
 4096 /*
 4097  * Return > 0 if a hardware breakpoint has been hit, and the
 4098  * breakpoint was in user space.  Return 0, otherwise.
 4099  */
 4100 int
 4101 user_dbreg_trap(void)
 4102 {
 4103         u_int32_t dr7, dr6; /* debug registers dr6 and dr7 */
 4104         u_int32_t bp;       /* breakpoint bits extracted from dr6 */
 4105         int nbp;            /* number of breakpoints that triggered */
 4106         caddr_t addr[4];    /* breakpoint addresses */
 4107         int i;
 4108         
 4109         dr7 = rdr7();
 4110         if ((dr7 & 0x000000ff) == 0) {
 4111                 /*
 4112                  * all GE and LE bits in the dr7 register are zero,
 4113                  * thus the trap couldn't have been caused by the
 4114                  * hardware debug registers
 4115                  */
 4116                 return 0;
 4117         }
 4118 
 4119         nbp = 0;
 4120         dr6 = rdr6();
 4121         bp = dr6 & 0x0000000f;
 4122 
 4123         if (!bp) {
 4124                 /*
 4125                  * None of the breakpoint bits are set meaning this
 4126                  * trap was not caused by any of the debug registers
 4127                  */
 4128                 return 0;
 4129         }
 4130 
 4131         /*
 4132          * at least one of the breakpoints were hit, check to see
 4133          * which ones and if any of them are user space addresses
 4134          */
 4135 
 4136         if (bp & 0x01) {
 4137                 addr[nbp++] = (caddr_t)rdr0();
 4138         }
 4139         if (bp & 0x02) {
 4140                 addr[nbp++] = (caddr_t)rdr1();
 4141         }
 4142         if (bp & 0x04) {
 4143                 addr[nbp++] = (caddr_t)rdr2();
 4144         }
 4145         if (bp & 0x08) {
 4146                 addr[nbp++] = (caddr_t)rdr3();
 4147         }
 4148 
 4149         for (i = 0; i < nbp; i++) {
 4150                 if (addr[i] < (caddr_t)VM_MAXUSER_ADDRESS) {
 4151                         /*
 4152                          * addr[i] is in user space
 4153                          */
 4154                         return nbp;
 4155                 }
 4156         }
 4157 
 4158         /*
 4159          * None of the breakpoints are in user space.
 4160          */
 4161         return 0;
 4162 }
 4163 
 4164 #ifdef KDB
 4165 
 4166 /*
 4167  * Provide inb() and outb() as functions.  They are normally only available as
 4168  * inline functions, thus cannot be called from the debugger.
 4169  */
 4170 
 4171 /* silence compiler warnings */
 4172 u_char inb_(u_short);
 4173 void outb_(u_short, u_char);
 4174 
 4175 u_char
 4176 inb_(u_short port)
 4177 {
 4178         return inb(port);
 4179 }
 4180 
 4181 void
 4182 outb_(u_short port, u_char data)
 4183 {
 4184         outb(port, data);
 4185 }
 4186 
 4187 #endif /* KDB */

Cache object: ec7ae5f4a357c2c13b9680805ca459e4


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.