FreeBSD/Linux Kernel Cross Reference
sys/i386/i386/machdep.c (FreeBSD releng/8.3)

    1 /*-
    2  * Copyright (c) 1992 Terrence R. Lambert.
    3  * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
    4  * All rights reserved.
    5  *
    6  * This code is derived from software contributed to Berkeley by
    7  * William Jolitz.
    8  *
    9  * Redistribution and use in source and binary forms, with or without
   10  * modification, are permitted provided that the following conditions
   11  * are met:
   12  * 1. Redistributions of source code must retain the above copyright
   13  *    notice, this list of conditions and the following disclaimer.
   14  * 2. Redistributions in binary form must reproduce the above copyright
   15  *    notice, this list of conditions and the following disclaimer in the
   16  *    documentation and/or other materials provided with the distribution.
   17  * 3. All advertising materials mentioning features or use of this software
   18  *    must display the following acknowledgement:
   19  *      This product includes software developed by the University of
   20  *      California, Berkeley and its contributors.
   21  * 4. Neither the name of the University nor the names of its contributors
   22  *    may be used to endorse or promote products derived from this software
   23  *    without specific prior written permission.
   24  *
   25  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   26  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   27  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   28  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   29  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   30  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   31  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   32  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   33  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   34  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   35  * SUCH DAMAGE.
   36  *
   37  *      from: @(#)machdep.c     7.4 (Berkeley) 6/3/91
   38  */
   39 
   40 #include <sys/cdefs.h>
   41 __FBSDID("$FreeBSD: releng/8.3/sys/i386/i386/machdep.c 230472 2012-01-22 21:25:47Z gavin $");
   42 
   43 #include "opt_apic.h"
   44 #include "opt_atalk.h"
   45 #include "opt_compat.h"
   46 #include "opt_cpu.h"
   47 #include "opt_ddb.h"
   48 #include "opt_inet.h"
   49 #include "opt_ipx.h"
   50 #include "opt_isa.h"
   51 #include "opt_kstack_pages.h"
   52 #include "opt_maxmem.h"
   53 #include "opt_npx.h"
   54 #include "opt_perfmon.h"
   55 #include "opt_xbox.h"
   56 #include "opt_kdtrace.h"
   57 
   58 #include <sys/param.h>
   59 #include <sys/proc.h>
   60 #include <sys/systm.h>
   61 #include <sys/bio.h>
   62 #include <sys/buf.h>
   63 #include <sys/bus.h>
   64 #include <sys/callout.h>
   65 #include <sys/cons.h>
   66 #include <sys/cpu.h>
   67 #include <sys/eventhandler.h>
   68 #include <sys/exec.h>
   69 #include <sys/imgact.h>
   70 #include <sys/kdb.h>
   71 #include <sys/kernel.h>
   72 #include <sys/ktr.h>
   73 #include <sys/linker.h>
   74 #include <sys/lock.h>
   75 #include <sys/malloc.h>
   76 #include <sys/msgbuf.h>
   77 #include <sys/mutex.h>
   78 #include <sys/pcpu.h>
   79 #include <sys/ptrace.h>
   80 #include <sys/reboot.h>
   81 #include <sys/sched.h>
   82 #include <sys/signalvar.h>
   83 #include <sys/sysctl.h>
   84 #include <sys/sysent.h>
   85 #include <sys/sysproto.h>
   86 #include <sys/ucontext.h>
   87 #include <sys/vmmeter.h>
   88 
   89 #include <vm/vm.h>
   90 #include <vm/vm_extern.h>
   91 #include <vm/vm_kern.h>
   92 #include <vm/vm_page.h>
   93 #include <vm/vm_map.h>
   94 #include <vm/vm_object.h>
   95 #include <vm/vm_pager.h>
   96 #include <vm/vm_param.h>
   97 
   98 #ifdef DDB
   99 #ifndef KDB
  100 #error KDB must be enabled in order for DDB to work!
  101 #endif
  102 #include <ddb/ddb.h>
  103 #include <ddb/db_sym.h>
  104 #endif
  105 
  106 #include <isa/rtc.h>
  107 
  108 #include <net/netisr.h>
  109 
  110 #include <machine/bootinfo.h>
  111 #include <machine/clock.h>
  112 #include <machine/cpu.h>
  113 #include <machine/cputypes.h>
  114 #include <machine/intr_machdep.h>
  115 #include <machine/mca.h>
  116 #include <machine/md_var.h>
  117 #include <machine/metadata.h>
  118 #include <machine/pc/bios.h>
  119 #include <machine/pcb.h>
  120 #include <machine/pcb_ext.h>
  121 #include <machine/proc.h>
  122 #include <machine/reg.h>
  123 #include <machine/sigframe.h>
  124 #include <machine/specialreg.h>
  125 #include <machine/vm86.h>
  126 #ifdef PERFMON
  127 #include <machine/perfmon.h>
  128 #endif
  129 #ifdef SMP
  130 #include <machine/smp.h>
  131 #endif
  132 
  133 #ifdef DEV_ISA
  134 #include <i386/isa/icu.h>
  135 #endif
  136 
  137 #ifdef XBOX
  138 #include <machine/xbox.h>
  139 
  140 int arch_i386_is_xbox = 0;
  141 uint32_t arch_i386_xbox_memsize = 0;
  142 #endif
  143 
  144 #ifdef XEN
  145 /* XEN includes */
  146 #include <machine/xen/xen-os.h>
  147 #include <xen/hypervisor.h>
  148 #include <machine/xen/xen-os.h>
  149 #include <machine/xen/xenvar.h>
  150 #include <machine/xen/xenfunc.h>
  151 #include <xen/xen_intr.h>
  152 
  153 void Xhypervisor_callback(void);
  154 void failsafe_callback(void);
  155 
  156 extern trap_info_t trap_table[];
  157 struct proc_ldt default_proc_ldt;
  158 extern int init_first;
  159 int running_xen = 1;
  160 extern unsigned long physfree;
  161 #endif /* XEN */
  162 
  163 /* Sanity check for __curthread() */
  164 CTASSERT(offsetof(struct pcpu, pc_curthread) == 0);
  165 
  166 extern void init386(int first);
  167 extern void dblfault_handler(void);
  168 
  169 extern void printcpuinfo(void); /* XXX header file */
  170 extern void finishidentcpu(void);
  171 extern void panicifcpuunsupported(void);
  172 extern void initializecpu(void);
  173 
  174 #define CS_SECURE(cs)           (ISPL(cs) == SEL_UPL)
  175 #define EFL_SECURE(ef, oef)     ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
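
/*
 * Illustrative example: EFL_SECURE() XORs the proposed and current
 * eflags and masks off the user-changeable bits (PSL_USERCHANGE); any
 * nonzero remainder means a privileged or reserved bit would change.
 * A sigreturn() context that tries to raise IOPL, for instance, differs
 * from the live eflags outside PSL_USERCHANGE, so the check fails and
 * the syscall is rejected with EINVAL.
 */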
  176 
  177 #if !defined(CPU_DISABLE_SSE) && defined(I686_CPU)
  178 #define CPU_ENABLE_SSE
  179 #endif
  180 
  181 static void cpu_startup(void *);
  182 static void fpstate_drop(struct thread *td);
  183 static void get_fpcontext(struct thread *td, mcontext_t *mcp);
  184 static int  set_fpcontext(struct thread *td, const mcontext_t *mcp);
  185 #ifdef CPU_ENABLE_SSE
  186 static void set_fpregs_xmm(struct save87 *, struct savexmm *);
  187 static void fill_fpregs_xmm(struct savexmm *, struct save87 *);
  188 #endif /* CPU_ENABLE_SSE */
  189 SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL);
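
/*
 * Illustrative note: the SYSINIT() above arranges for cpu_startup() to
 * run automatically during boot at the SI_SUB_CPU stage, ahead of other
 * initializers of that stage (SI_ORDER_FIRST); nothing calls it directly.
 */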
  190 
  191 #ifdef DDB
  192 extern vm_offset_t ksym_start, ksym_end;
  193 #endif
  194 
  195 /* Intel ICH registers */
  196 #define ICH_PMBASE      0x400
   197 #define ICH_SMI_EN      (ICH_PMBASE + 0x30)
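
/*
 * Illustrative note: SMI_EN sits at offset 0x30 from the ICH
 * power-management base.  cpu_startup() below disables legacy-USB SMIs
 * with a read-modify-write that clears bit 3:
 *
 *      outl(ICH_SMI_EN, inl(ICH_SMI_EN) & ~0x8);
 */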
  198 
  199 int     _udatasel, _ucodesel;
  200 u_int   basemem;
  201 
  202 int cold = 1;
  203 
  204 #ifdef COMPAT_43
  205 static void osendsig(sig_t catcher, ksiginfo_t *, sigset_t *mask);
  206 #endif
  207 #ifdef COMPAT_FREEBSD4
  208 static void freebsd4_sendsig(sig_t catcher, ksiginfo_t *, sigset_t *mask);
  209 #endif
  210 
  211 long Maxmem = 0;
  212 long realmem = 0;
  213 
  214 #ifdef PAE
  215 FEATURE(pae, "Physical Address Extensions");
  216 #endif
  217 
  218 /*
  219  * The number of PHYSMAP entries must be one less than the number of
  220  * PHYSSEG entries because the PHYSMAP entry that spans the largest
  221  * physical address that is accessible by ISA DMA is split into two
  222  * PHYSSEG entries.
  223  */
  224 #define PHYSMAP_SIZE    (2 * (VM_PHYSSEG_MAX - 1))
  225 
  226 vm_paddr_t phys_avail[PHYSMAP_SIZE + 2];
  227 vm_paddr_t dump_avail[PHYSMAP_SIZE + 2];
  228 
   229 /* must be 2 less so that a terminating 0/0 pair can mark the end of chunks */
  230 #define PHYS_AVAIL_ARRAY_END ((sizeof(phys_avail) / sizeof(phys_avail[0])) - 2)
  231 #define DUMP_AVAIL_ARRAY_END ((sizeof(dump_avail) / sizeof(dump_avail[0])) - 2)
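
/*
 * Worked example (hypothetical value): if VM_PHYSSEG_MAX were 17, then
 * PHYSMAP_SIZE = 2 * (17 - 1) = 32, each array above holds 32 + 2 = 34
 * entries, and PHYS_AVAIL_ARRAY_END = 34 - 2 = 32, leaving indices 32
 * and 33 for the terminating zero pair.
 */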
  232 
  233 struct kva_md_info kmi;
  234 
  235 static struct trapframe proc0_tf;
  236 struct pcpu __pcpu[MAXCPU];
  237 
  238 struct mtx icu_lock;
  239 
  240 static void
  241 cpu_startup(dummy)
  242         void *dummy;
  243 {
  244         uintmax_t memsize;
  245         char *sysenv;
  246         
   247         /*
   248          * On MacBooks, we need to prevent the legacy USB circuit from
   249          * generating an SMI#, because this can cause several problems,
   250          * namely: incorrect CPU frequency detection and failure to
   251          * start the APs.
   252          * We do this by clearing a bit in the SMI_EN (SMI Control and
   253          * Enable) register of the Intel ICH LPC Interface Bridge.
   254          */
  255         sysenv = getenv("smbios.system.product");
  256         if (sysenv != NULL) {
  257                 if (strncmp(sysenv, "MacBook1,1", 10) == 0 ||
  258                     strncmp(sysenv, "MacBook3,1", 10) == 0 ||
  259                     strncmp(sysenv, "MacBookPro1,1", 13) == 0 ||
  260                     strncmp(sysenv, "MacBookPro1,2", 13) == 0 ||
  261                     strncmp(sysenv, "MacBookPro3,1", 13) == 0 ||
  262                     strncmp(sysenv, "Macmini1,1", 10) == 0) {
  263                         if (bootverbose)
  264                                 printf("Disabling LEGACY_USB_EN bit on "
  265                                     "Intel ICH.\n");
  266                         outl(ICH_SMI_EN, inl(ICH_SMI_EN) & ~0x8);
  267                 }
  268                 freeenv(sysenv);
  269         }
  270 
  271         /*
  272          * Good {morning,afternoon,evening,night}.
  273          */
  274         startrtclock();
  275         printcpuinfo();
  276         panicifcpuunsupported();
  277 #ifdef PERFMON
  278         perfmon_init();
  279 #endif
  280         realmem = Maxmem;
  281 
  282         /*
  283          * Display physical memory if SMBIOS reports reasonable amount.
  284          */
  285         memsize = 0;
  286         sysenv = getenv("smbios.memory.enabled");
  287         if (sysenv != NULL) {
  288                 memsize = (uintmax_t)strtoul(sysenv, (char **)NULL, 10) << 10;
  289                 freeenv(sysenv);
  290         }
  291         if (memsize < ptoa((uintmax_t)cnt.v_free_count))
  292                 memsize = ptoa((uintmax_t)Maxmem);
  293         printf("real memory  = %ju (%ju MB)\n", memsize, memsize >> 20);
  294 
  295         /*
  296          * Display any holes after the first chunk of extended memory.
  297          */
  298         if (bootverbose) {
  299                 int indx;
  300 
  301                 printf("Physical memory chunk(s):\n");
  302                 for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) {
  303                         vm_paddr_t size;
  304 
  305                         size = phys_avail[indx + 1] - phys_avail[indx];
  306                         printf(
  307                             "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n",
  308                             (uintmax_t)phys_avail[indx],
  309                             (uintmax_t)phys_avail[indx + 1] - 1,
  310                             (uintmax_t)size, (uintmax_t)size / PAGE_SIZE);
  311                 }
  312         }
  313 
  314         vm_ksubmap_init(&kmi);
  315 
  316         printf("avail memory = %ju (%ju MB)\n",
  317             ptoa((uintmax_t)cnt.v_free_count),
  318             ptoa((uintmax_t)cnt.v_free_count) / 1048576);
  319 
  320         /*
  321          * Set up buffers, so they can be used to read disk labels.
  322          */
  323         bufinit();
  324         vm_pager_bufferinit();
  325 #ifndef XEN
  326         cpu_setregs();
  327 #endif
  328 }
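
/*
 * Worked example (hypothetical value): smbios.memory.enabled is in
 * kilobytes, so cpu_startup() shifts it left by 10 to get bytes; a
 * BIOS reporting "4194304" (4 GB in kB) yields 4194304 << 10 =
 * 4294967296, printed as "real memory  = 4294967296 (4096 MB)".
 */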
  329 
  330 /*
  331  * Send an interrupt to process.
  332  *
  333  * Stack is set up to allow sigcode stored
  334  * at top to call routine, followed by kcall
  335  * to sigreturn routine below.  After sigreturn
  336  * resets the signal mask, the stack, and the
  337  * frame pointer, it returns to the user
  338  * specified pc, psl.
   339  */
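
/*
 * Illustrative sketch (not part of the original file): approximate
 * user-stack layout after sendsig() in the common, non-sigaltstack
 * case:
 *
 *      PS_STRINGS ->  +---------------------+   high addresses
 *                     |       sigcode       |   trampoline; %eip is set
 *                     +---------------------+   to PS_STRINGS - szsigcode
 *                              . . .
 *      sfp, %esp ->   +---------------------+   (sendsig() also aligns
 *                     |   struct sigframe   |   sfp to 16 bytes)
 *                     |  signum, siginfo /  |
 *                     |  ucontext, handler, |
 *                     |   saved context     |
 *                     +---------------------+
 *                     |  interrupted user   |
 *                     |        stack        |   low addresses
 *
 * The handler returns into sigcode, which invokes sigreturn() to
 * restore the saved machine context.
 */
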
  340 #ifdef COMPAT_43
  341 static void
  342 osendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
  343 {
  344         struct osigframe sf, *fp;
  345         struct proc *p;
  346         struct thread *td;
  347         struct sigacts *psp;
  348         struct trapframe *regs;
  349         int sig;
  350         int oonstack;
  351 
  352         td = curthread;
  353         p = td->td_proc;
  354         PROC_LOCK_ASSERT(p, MA_OWNED);
  355         sig = ksi->ksi_signo;
  356         psp = p->p_sigacts;
  357         mtx_assert(&psp->ps_mtx, MA_OWNED);
  358         regs = td->td_frame;
  359         oonstack = sigonstack(regs->tf_esp);
  360 
  361         /* Allocate space for the signal handler context. */
  362         if ((td->td_pflags & TDP_ALTSTACK) && !oonstack &&
  363             SIGISMEMBER(psp->ps_sigonstack, sig)) {
  364                 fp = (struct osigframe *)(td->td_sigstk.ss_sp +
  365                     td->td_sigstk.ss_size - sizeof(struct osigframe));
  366 #if defined(COMPAT_43)
  367                 td->td_sigstk.ss_flags |= SS_ONSTACK;
  368 #endif
  369         } else
  370                 fp = (struct osigframe *)regs->tf_esp - 1;
  371 
  372         /* Translate the signal if appropriate. */
  373         if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize)
  374                 sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
  375 
  376         /* Build the argument list for the signal handler. */
  377         sf.sf_signum = sig;
  378         sf.sf_scp = (register_t)&fp->sf_siginfo.si_sc;
  379         bzero(&sf.sf_siginfo, sizeof(sf.sf_siginfo));
  380         if (SIGISMEMBER(psp->ps_siginfo, sig)) {
  381                 /* Signal handler installed with SA_SIGINFO. */
  382                 sf.sf_arg2 = (register_t)&fp->sf_siginfo;
  383                 sf.sf_siginfo.si_signo = sig;
  384                 sf.sf_siginfo.si_code = ksi->ksi_code;
  385                 sf.sf_ahu.sf_action = (__osiginfohandler_t *)catcher;
  386                 sf.sf_addr = 0;
  387         } else {
  388                 /* Old FreeBSD-style arguments. */
  389                 sf.sf_arg2 = ksi->ksi_code;
  390                 sf.sf_addr = (register_t)ksi->ksi_addr;
  391                 sf.sf_ahu.sf_handler = catcher;
  392         }
  393         mtx_unlock(&psp->ps_mtx);
  394         PROC_UNLOCK(p);
  395 
  396         /* Save most if not all of trap frame. */
  397         sf.sf_siginfo.si_sc.sc_eax = regs->tf_eax;
  398         sf.sf_siginfo.si_sc.sc_ebx = regs->tf_ebx;
  399         sf.sf_siginfo.si_sc.sc_ecx = regs->tf_ecx;
  400         sf.sf_siginfo.si_sc.sc_edx = regs->tf_edx;
  401         sf.sf_siginfo.si_sc.sc_esi = regs->tf_esi;
  402         sf.sf_siginfo.si_sc.sc_edi = regs->tf_edi;
  403         sf.sf_siginfo.si_sc.sc_cs = regs->tf_cs;
  404         sf.sf_siginfo.si_sc.sc_ds = regs->tf_ds;
  405         sf.sf_siginfo.si_sc.sc_ss = regs->tf_ss;
  406         sf.sf_siginfo.si_sc.sc_es = regs->tf_es;
  407         sf.sf_siginfo.si_sc.sc_fs = regs->tf_fs;
  408         sf.sf_siginfo.si_sc.sc_gs = rgs();
  409         sf.sf_siginfo.si_sc.sc_isp = regs->tf_isp;
  410 
  411         /* Build the signal context to be used by osigreturn(). */
  412         sf.sf_siginfo.si_sc.sc_onstack = (oonstack) ? 1 : 0;
  413         SIG2OSIG(*mask, sf.sf_siginfo.si_sc.sc_mask);
  414         sf.sf_siginfo.si_sc.sc_sp = regs->tf_esp;
  415         sf.sf_siginfo.si_sc.sc_fp = regs->tf_ebp;
  416         sf.sf_siginfo.si_sc.sc_pc = regs->tf_eip;
  417         sf.sf_siginfo.si_sc.sc_ps = regs->tf_eflags;
  418         sf.sf_siginfo.si_sc.sc_trapno = regs->tf_trapno;
  419         sf.sf_siginfo.si_sc.sc_err = regs->tf_err;
  420 
  421         /*
  422          * If we're a vm86 process, we want to save the segment registers.
  423          * We also change eflags to be our emulated eflags, not the actual
  424          * eflags.
  425          */
  426         if (regs->tf_eflags & PSL_VM) {
  427                 /* XXX confusing names: `tf' isn't a trapframe; `regs' is. */
  428                 struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
  429                 struct vm86_kernel *vm86 = &td->td_pcb->pcb_ext->ext_vm86;
  430 
  431                 sf.sf_siginfo.si_sc.sc_gs = tf->tf_vm86_gs;
  432                 sf.sf_siginfo.si_sc.sc_fs = tf->tf_vm86_fs;
  433                 sf.sf_siginfo.si_sc.sc_es = tf->tf_vm86_es;
  434                 sf.sf_siginfo.si_sc.sc_ds = tf->tf_vm86_ds;
  435 
  436                 if (vm86->vm86_has_vme == 0)
  437                         sf.sf_siginfo.si_sc.sc_ps =
  438                             (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) |
  439                             (vm86->vm86_eflags & (PSL_VIF | PSL_VIP));
  440 
  441                 /* See sendsig() for comments. */
  442                 tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP);
  443         }
  444 
  445         /*
  446          * Copy the sigframe out to the user's stack.
  447          */
  448         if (copyout(&sf, fp, sizeof(*fp)) != 0) {
  449 #ifdef DEBUG
  450                 printf("process %ld has trashed its stack\n", (long)p->p_pid);
  451 #endif
  452                 PROC_LOCK(p);
  453                 sigexit(td, SIGILL);
  454         }
  455 
  456         regs->tf_esp = (int)fp;
  457         regs->tf_eip = PS_STRINGS - szosigcode;
  458         regs->tf_eflags &= ~(PSL_T | PSL_D);
  459         regs->tf_cs = _ucodesel;
  460         regs->tf_ds = _udatasel;
  461         regs->tf_es = _udatasel;
  462         regs->tf_fs = _udatasel;
  463         load_gs(_udatasel);
  464         regs->tf_ss = _udatasel;
  465         PROC_LOCK(p);
  466         mtx_lock(&psp->ps_mtx);
  467 }
  468 #endif /* COMPAT_43 */
  469 
  470 #ifdef COMPAT_FREEBSD4
  471 static void
  472 freebsd4_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
  473 {
  474         struct sigframe4 sf, *sfp;
  475         struct proc *p;
  476         struct thread *td;
  477         struct sigacts *psp;
  478         struct trapframe *regs;
  479         int sig;
  480         int oonstack;
  481 
  482         td = curthread;
  483         p = td->td_proc;
  484         PROC_LOCK_ASSERT(p, MA_OWNED);
  485         sig = ksi->ksi_signo;
  486         psp = p->p_sigacts;
  487         mtx_assert(&psp->ps_mtx, MA_OWNED);
  488         regs = td->td_frame;
  489         oonstack = sigonstack(regs->tf_esp);
  490 
  491         /* Save user context. */
  492         bzero(&sf, sizeof(sf));
  493         sf.sf_uc.uc_sigmask = *mask;
  494         sf.sf_uc.uc_stack = td->td_sigstk;
  495         sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
  496             ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE;
  497         sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0;
  498         sf.sf_uc.uc_mcontext.mc_gs = rgs();
  499         bcopy(regs, &sf.sf_uc.uc_mcontext.mc_fs, sizeof(*regs));
  500         bzero(sf.sf_uc.uc_mcontext.mc_fpregs,
  501             sizeof(sf.sf_uc.uc_mcontext.mc_fpregs));
  502         bzero(sf.sf_uc.uc_mcontext.__spare__,
  503             sizeof(sf.sf_uc.uc_mcontext.__spare__));
  504         bzero(sf.sf_uc.__spare__, sizeof(sf.sf_uc.__spare__));
  505 
  506         /* Allocate space for the signal handler context. */
  507         if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack &&
  508             SIGISMEMBER(psp->ps_sigonstack, sig)) {
  509                 sfp = (struct sigframe4 *)(td->td_sigstk.ss_sp +
  510                     td->td_sigstk.ss_size - sizeof(struct sigframe4));
  511 #if defined(COMPAT_43)
  512                 td->td_sigstk.ss_flags |= SS_ONSTACK;
  513 #endif
  514         } else
  515                 sfp = (struct sigframe4 *)regs->tf_esp - 1;
  516 
  517         /* Translate the signal if appropriate. */
  518         if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize)
  519                 sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
  520 
  521         /* Build the argument list for the signal handler. */
  522         sf.sf_signum = sig;
  523         sf.sf_ucontext = (register_t)&sfp->sf_uc;
  524         bzero(&sf.sf_si, sizeof(sf.sf_si));
  525         if (SIGISMEMBER(psp->ps_siginfo, sig)) {
  526                 /* Signal handler installed with SA_SIGINFO. */
  527                 sf.sf_siginfo = (register_t)&sfp->sf_si;
  528                 sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher;
  529 
  530                 /* Fill in POSIX parts */
  531                 sf.sf_si.si_signo = sig;
  532                 sf.sf_si.si_code = ksi->ksi_code;
  533                 sf.sf_si.si_addr = ksi->ksi_addr;
  534         } else {
  535                 /* Old FreeBSD-style arguments. */
  536                 sf.sf_siginfo = ksi->ksi_code;
  537                 sf.sf_addr = (register_t)ksi->ksi_addr;
  538                 sf.sf_ahu.sf_handler = catcher;
  539         }
  540         mtx_unlock(&psp->ps_mtx);
  541         PROC_UNLOCK(p);
  542 
  543         /*
  544          * If we're a vm86 process, we want to save the segment registers.
  545          * We also change eflags to be our emulated eflags, not the actual
  546          * eflags.
  547          */
  548         if (regs->tf_eflags & PSL_VM) {
  549                 struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
  550                 struct vm86_kernel *vm86 = &td->td_pcb->pcb_ext->ext_vm86;
  551 
  552                 sf.sf_uc.uc_mcontext.mc_gs = tf->tf_vm86_gs;
  553                 sf.sf_uc.uc_mcontext.mc_fs = tf->tf_vm86_fs;
  554                 sf.sf_uc.uc_mcontext.mc_es = tf->tf_vm86_es;
  555                 sf.sf_uc.uc_mcontext.mc_ds = tf->tf_vm86_ds;
  556 
  557                 if (vm86->vm86_has_vme == 0)
  558                         sf.sf_uc.uc_mcontext.mc_eflags =
  559                             (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) |
  560                             (vm86->vm86_eflags & (PSL_VIF | PSL_VIP));
  561 
  562                 /*
  563                  * Clear PSL_NT to inhibit T_TSSFLT faults on return from
  564                  * syscalls made by the signal handler.  This just avoids
  565                  * wasting time for our lazy fixup of such faults.  PSL_NT
  566                  * does nothing in vm86 mode, but vm86 programs can set it
  567                  * almost legitimately in probes for old cpu types.
  568                  */
  569                 tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP);
  570         }
  571 
  572         /*
  573          * Copy the sigframe out to the user's stack.
  574          */
  575         if (copyout(&sf, sfp, sizeof(*sfp)) != 0) {
  576 #ifdef DEBUG
  577                 printf("process %ld has trashed its stack\n", (long)p->p_pid);
  578 #endif
  579                 PROC_LOCK(p);
  580                 sigexit(td, SIGILL);
  581         }
  582 
  583         regs->tf_esp = (int)sfp;
  584         regs->tf_eip = PS_STRINGS - szfreebsd4_sigcode;
  585         regs->tf_eflags &= ~(PSL_T | PSL_D);
  586         regs->tf_cs = _ucodesel;
  587         regs->tf_ds = _udatasel;
  588         regs->tf_es = _udatasel;
  589         regs->tf_fs = _udatasel;
  590         regs->tf_ss = _udatasel;
  591         PROC_LOCK(p);
  592         mtx_lock(&psp->ps_mtx);
  593 }
  594 #endif  /* COMPAT_FREEBSD4 */
  595 
  596 void
  597 sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
  598 {
  599         struct sigframe sf, *sfp;
  600         struct proc *p;
  601         struct thread *td;
  602         struct sigacts *psp;
  603         char *sp;
  604         struct trapframe *regs;
  605         struct segment_descriptor *sdp;
  606         int sig;
  607         int oonstack;
  608 
  609         td = curthread;
  610         p = td->td_proc;
  611         PROC_LOCK_ASSERT(p, MA_OWNED);
  612         sig = ksi->ksi_signo;
  613         psp = p->p_sigacts;
  614         mtx_assert(&psp->ps_mtx, MA_OWNED);
  615 #ifdef COMPAT_FREEBSD4
  616         if (SIGISMEMBER(psp->ps_freebsd4, sig)) {
  617                 freebsd4_sendsig(catcher, ksi, mask);
  618                 return;
  619         }
  620 #endif
  621 #ifdef COMPAT_43
  622         if (SIGISMEMBER(psp->ps_osigset, sig)) {
  623                 osendsig(catcher, ksi, mask);
  624                 return;
  625         }
  626 #endif
  627         regs = td->td_frame;
  628         oonstack = sigonstack(regs->tf_esp);
  629 
  630         /* Save user context. */
  631         bzero(&sf, sizeof(sf));
  632         sf.sf_uc.uc_sigmask = *mask;
  633         sf.sf_uc.uc_stack = td->td_sigstk;
  634         sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
  635             ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE;
  636         sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0;
  637         sf.sf_uc.uc_mcontext.mc_gs = rgs();
  638         bcopy(regs, &sf.sf_uc.uc_mcontext.mc_fs, sizeof(*regs));
  639         sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext); /* magic */
  640         get_fpcontext(td, &sf.sf_uc.uc_mcontext);
  641         fpstate_drop(td);
  642         /*
  643          * Unconditionally fill the fsbase and gsbase into the mcontext.
  644          */
  645         sdp = &td->td_pcb->pcb_fsd;
  646         sf.sf_uc.uc_mcontext.mc_fsbase = sdp->sd_hibase << 24 |
  647             sdp->sd_lobase;
  648         sdp = &td->td_pcb->pcb_gsd;
  649         sf.sf_uc.uc_mcontext.mc_gsbase = sdp->sd_hibase << 24 |
  650             sdp->sd_lobase;
  651         bzero(sf.sf_uc.uc_mcontext.mc_spare1,
  652             sizeof(sf.sf_uc.uc_mcontext.mc_spare1));
  653         bzero(sf.sf_uc.uc_mcontext.mc_spare2,
  654             sizeof(sf.sf_uc.uc_mcontext.mc_spare2));
  655         bzero(sf.sf_uc.__spare__, sizeof(sf.sf_uc.__spare__));
  656 
  657         /* Allocate space for the signal handler context. */
  658         if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack &&
  659             SIGISMEMBER(psp->ps_sigonstack, sig)) {
  660                 sp = td->td_sigstk.ss_sp +
  661                     td->td_sigstk.ss_size - sizeof(struct sigframe);
  662 #if defined(COMPAT_43)
  663                 td->td_sigstk.ss_flags |= SS_ONSTACK;
  664 #endif
  665         } else
  666                 sp = (char *)regs->tf_esp - sizeof(struct sigframe);
  667         /* Align to 16 bytes. */
  668         sfp = (struct sigframe *)((unsigned int)sp & ~0xF);
  669 
  670         /* Translate the signal if appropriate. */
  671         if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize)
  672                 sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
  673 
  674         /* Build the argument list for the signal handler. */
  675         sf.sf_signum = sig;
  676         sf.sf_ucontext = (register_t)&sfp->sf_uc;
  677         bzero(&sf.sf_si, sizeof(sf.sf_si));
  678         if (SIGISMEMBER(psp->ps_siginfo, sig)) {
  679                 /* Signal handler installed with SA_SIGINFO. */
  680                 sf.sf_siginfo = (register_t)&sfp->sf_si;
  681                 sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher;
  682 
  683                 /* Fill in POSIX parts */
  684                 sf.sf_si = ksi->ksi_info;
  685                 sf.sf_si.si_signo = sig; /* maybe a translated signal */
  686         } else {
  687                 /* Old FreeBSD-style arguments. */
  688                 sf.sf_siginfo = ksi->ksi_code;
  689                 sf.sf_addr = (register_t)ksi->ksi_addr;
  690                 sf.sf_ahu.sf_handler = catcher;
  691         }
  692         mtx_unlock(&psp->ps_mtx);
  693         PROC_UNLOCK(p);
  694 
  695         /*
  696          * If we're a vm86 process, we want to save the segment registers.
  697          * We also change eflags to be our emulated eflags, not the actual
  698          * eflags.
  699          */
  700         if (regs->tf_eflags & PSL_VM) {
  701                 struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
  702                 struct vm86_kernel *vm86 = &td->td_pcb->pcb_ext->ext_vm86;
  703 
  704                 sf.sf_uc.uc_mcontext.mc_gs = tf->tf_vm86_gs;
  705                 sf.sf_uc.uc_mcontext.mc_fs = tf->tf_vm86_fs;
  706                 sf.sf_uc.uc_mcontext.mc_es = tf->tf_vm86_es;
  707                 sf.sf_uc.uc_mcontext.mc_ds = tf->tf_vm86_ds;
  708 
  709                 if (vm86->vm86_has_vme == 0)
  710                         sf.sf_uc.uc_mcontext.mc_eflags =
  711                             (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) |
  712                             (vm86->vm86_eflags & (PSL_VIF | PSL_VIP));
  713 
  714                 /*
  715                  * Clear PSL_NT to inhibit T_TSSFLT faults on return from
  716                  * syscalls made by the signal handler.  This just avoids
  717                  * wasting time for our lazy fixup of such faults.  PSL_NT
  718                  * does nothing in vm86 mode, but vm86 programs can set it
  719                  * almost legitimately in probes for old cpu types.
  720                  */
  721                 tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP);
  722         }
  723 
  724         /*
  725          * Copy the sigframe out to the user's stack.
  726          */
  727         if (copyout(&sf, sfp, sizeof(*sfp)) != 0) {
  728 #ifdef DEBUG
  729                 printf("process %ld has trashed its stack\n", (long)p->p_pid);
  730 #endif
  731                 PROC_LOCK(p);
  732                 sigexit(td, SIGILL);
  733         }
  734 
  735         regs->tf_esp = (int)sfp;
  736         regs->tf_eip = PS_STRINGS - *(p->p_sysent->sv_szsigcode);
  737         regs->tf_eflags &= ~(PSL_T | PSL_D);
  738         regs->tf_cs = _ucodesel;
  739         regs->tf_ds = _udatasel;
  740         regs->tf_es = _udatasel;
  741         regs->tf_fs = _udatasel;
  742         regs->tf_ss = _udatasel;
  743         PROC_LOCK(p);
  744         mtx_lock(&psp->ps_mtx);
  745 }
  746 
  747 /*
  748  * System call to cleanup state after a signal
  749  * has been taken.  Reset signal mask and
  750  * stack state from context left by sendsig (above).
  751  * Return to previous pc and psl as specified by
  752  * context left by sendsig. Check carefully to
  753  * make sure that the user has not modified the
  754  * state to gain improper privileges.
  755  *
  756  * MPSAFE
  757  */
  758 #ifdef COMPAT_43
  759 int
  760 osigreturn(td, uap)
  761         struct thread *td;
  762         struct osigreturn_args /* {
  763                 struct osigcontext *sigcntxp;
  764         } */ *uap;
  765 {
  766         struct osigcontext sc;
  767         struct trapframe *regs;
  768         struct osigcontext *scp;
  769         int eflags, error;
  770         ksiginfo_t ksi;
  771 
  772         regs = td->td_frame;
  773         error = copyin(uap->sigcntxp, &sc, sizeof(sc));
  774         if (error != 0)
  775                 return (error);
  776         scp = &sc;
  777         eflags = scp->sc_ps;
  778         if (eflags & PSL_VM) {
  779                 struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
  780                 struct vm86_kernel *vm86;
  781 
  782                 /*
  783                  * if pcb_ext == 0 or vm86_inited == 0, the user hasn't
  784                  * set up the vm86 area, and we can't enter vm86 mode.
  785                  */
  786                 if (td->td_pcb->pcb_ext == 0)
  787                         return (EINVAL);
  788                 vm86 = &td->td_pcb->pcb_ext->ext_vm86;
  789                 if (vm86->vm86_inited == 0)
  790                         return (EINVAL);
  791 
  792                 /* Go back to user mode if both flags are set. */
  793                 if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) {
  794                         ksiginfo_init_trap(&ksi);
  795                         ksi.ksi_signo = SIGBUS;
  796                         ksi.ksi_code = BUS_OBJERR;
  797                         ksi.ksi_addr = (void *)regs->tf_eip;
  798                         trapsignal(td, &ksi);
  799                 }
  800 
  801                 if (vm86->vm86_has_vme) {
  802                         eflags = (tf->tf_eflags & ~VME_USERCHANGE) |
  803                             (eflags & VME_USERCHANGE) | PSL_VM;
  804                 } else {
  805                         vm86->vm86_eflags = eflags;     /* save VIF, VIP */
  806                         eflags = (tf->tf_eflags & ~VM_USERCHANGE) |
  807                             (eflags & VM_USERCHANGE) | PSL_VM;
  808                 }
  809                 tf->tf_vm86_ds = scp->sc_ds;
  810                 tf->tf_vm86_es = scp->sc_es;
  811                 tf->tf_vm86_fs = scp->sc_fs;
  812                 tf->tf_vm86_gs = scp->sc_gs;
  813                 tf->tf_ds = _udatasel;
  814                 tf->tf_es = _udatasel;
  815                 tf->tf_fs = _udatasel;
  816         } else {
  817                 /*
  818                  * Don't allow users to change privileged or reserved flags.
  819                  */
  820                 /*
  821                  * XXX do allow users to change the privileged flag PSL_RF.
  822                  * The cpu sets PSL_RF in tf_eflags for faults.  Debuggers
  823                  * should sometimes set it there too.  tf_eflags is kept in
  824                  * the signal context during signal handling and there is no
  825                  * other place to remember it, so the PSL_RF bit may be
  826                  * corrupted by the signal handler without us knowing.
  827                  * Corruption of the PSL_RF bit at worst causes one more or
  828                  * one less debugger trap, so allowing it is fairly harmless.
  829                  */
  830                 if (!EFL_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF)) {
  831                         return (EINVAL);
  832                 }
  833 
  834                 /*
  835                  * Don't allow users to load a valid privileged %cs.  Let the
  836                  * hardware check for invalid selectors, excess privilege in
  837                  * other selectors, invalid %eip's and invalid %esp's.
  838                  */
  839                 if (!CS_SECURE(scp->sc_cs)) {
  840                         ksiginfo_init_trap(&ksi);
  841                         ksi.ksi_signo = SIGBUS;
  842                         ksi.ksi_code = BUS_OBJERR;
  843                         ksi.ksi_trapno = T_PROTFLT;
  844                         ksi.ksi_addr = (void *)regs->tf_eip;
  845                         trapsignal(td, &ksi);
  846                         return (EINVAL);
  847                 }
  848                 regs->tf_ds = scp->sc_ds;
  849                 regs->tf_es = scp->sc_es;
  850                 regs->tf_fs = scp->sc_fs;
  851         }
  852 
  853         /* Restore remaining registers. */
  854         regs->tf_eax = scp->sc_eax;
  855         regs->tf_ebx = scp->sc_ebx;
  856         regs->tf_ecx = scp->sc_ecx;
  857         regs->tf_edx = scp->sc_edx;
  858         regs->tf_esi = scp->sc_esi;
  859         regs->tf_edi = scp->sc_edi;
  860         regs->tf_cs = scp->sc_cs;
  861         regs->tf_ss = scp->sc_ss;
  862         regs->tf_isp = scp->sc_isp;
  863         regs->tf_ebp = scp->sc_fp;
  864         regs->tf_esp = scp->sc_sp;
  865         regs->tf_eip = scp->sc_pc;
  866         regs->tf_eflags = eflags;
  867 
  868 #if defined(COMPAT_43)
  869         if (scp->sc_onstack & 1)
  870                 td->td_sigstk.ss_flags |= SS_ONSTACK;
  871         else
  872                 td->td_sigstk.ss_flags &= ~SS_ONSTACK;
  873 #endif
  874         kern_sigprocmask(td, SIG_SETMASK, (sigset_t *)&scp->sc_mask, NULL,
  875             SIGPROCMASK_OLD);
  876         return (EJUSTRETURN);
  877 }
  878 #endif /* COMPAT_43 */
  879 
  880 #ifdef COMPAT_FREEBSD4
  881 /*
  882  * MPSAFE
  883  */
  884 int
  885 freebsd4_sigreturn(td, uap)
  886         struct thread *td;
  887         struct freebsd4_sigreturn_args /* {
  888                 const ucontext4 *sigcntxp;
  889         } */ *uap;
  890 {
  891         struct ucontext4 uc;
  892         struct trapframe *regs;
  893         struct ucontext4 *ucp;
  894         int cs, eflags, error;
  895         ksiginfo_t ksi;
  896 
  897         error = copyin(uap->sigcntxp, &uc, sizeof(uc));
  898         if (error != 0)
  899                 return (error);
  900         ucp = &uc;
  901         regs = td->td_frame;
  902         eflags = ucp->uc_mcontext.mc_eflags;
  903         if (eflags & PSL_VM) {
  904                 struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
  905                 struct vm86_kernel *vm86;
  906 
  907                 /*
  908                  * if pcb_ext == 0 or vm86_inited == 0, the user hasn't
  909                  * set up the vm86 area, and we can't enter vm86 mode.
  910                  */
  911                 if (td->td_pcb->pcb_ext == 0)
  912                         return (EINVAL);
  913                 vm86 = &td->td_pcb->pcb_ext->ext_vm86;
  914                 if (vm86->vm86_inited == 0)
  915                         return (EINVAL);
  916 
  917                 /* Go back to user mode if both flags are set. */
  918                 if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) {
  919                         ksiginfo_init_trap(&ksi);
  920                         ksi.ksi_signo = SIGBUS;
  921                         ksi.ksi_code = BUS_OBJERR;
  922                         ksi.ksi_addr = (void *)regs->tf_eip;
  923                         trapsignal(td, &ksi);
  924                 }
  925                 if (vm86->vm86_has_vme) {
  926                         eflags = (tf->tf_eflags & ~VME_USERCHANGE) |
  927                             (eflags & VME_USERCHANGE) | PSL_VM;
  928                 } else {
  929                         vm86->vm86_eflags = eflags;     /* save VIF, VIP */
  930                         eflags = (tf->tf_eflags & ~VM_USERCHANGE) |
  931                             (eflags & VM_USERCHANGE) | PSL_VM;
  932                 }
  933                 bcopy(&ucp->uc_mcontext.mc_fs, tf, sizeof(struct trapframe));
  934                 tf->tf_eflags = eflags;
  935                 tf->tf_vm86_ds = tf->tf_ds;
  936                 tf->tf_vm86_es = tf->tf_es;
  937                 tf->tf_vm86_fs = tf->tf_fs;
  938                 tf->tf_vm86_gs = ucp->uc_mcontext.mc_gs;
  939                 tf->tf_ds = _udatasel;
  940                 tf->tf_es = _udatasel;
  941                 tf->tf_fs = _udatasel;
  942         } else {
  943                 /*
  944                  * Don't allow users to change privileged or reserved flags.
  945                  */
  946                 /*
  947                  * XXX do allow users to change the privileged flag PSL_RF.
  948                  * The cpu sets PSL_RF in tf_eflags for faults.  Debuggers
  949                  * should sometimes set it there too.  tf_eflags is kept in
  950                  * the signal context during signal handling and there is no
  951                  * other place to remember it, so the PSL_RF bit may be
  952                  * corrupted by the signal handler without us knowing.
  953                  * Corruption of the PSL_RF bit at worst causes one more or
  954                  * one less debugger trap, so allowing it is fairly harmless.
  955                  */
  956                 if (!EFL_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF)) {
  957                         uprintf("pid %d (%s): freebsd4_sigreturn eflags = 0x%x\n",
  958                             td->td_proc->p_pid, td->td_name, eflags);
  959                         return (EINVAL);
  960                 }
  961 
  962                 /*
  963                  * Don't allow users to load a valid privileged %cs.  Let the
  964                  * hardware check for invalid selectors, excess privilege in
  965                  * other selectors, invalid %eip's and invalid %esp's.
  966                  */
  967                 cs = ucp->uc_mcontext.mc_cs;
  968                 if (!CS_SECURE(cs)) {
  969                         uprintf("pid %d (%s): freebsd4_sigreturn cs = 0x%x\n",
  970                             td->td_proc->p_pid, td->td_name, cs);
  971                         ksiginfo_init_trap(&ksi);
  972                         ksi.ksi_signo = SIGBUS;
  973                         ksi.ksi_code = BUS_OBJERR;
  974                         ksi.ksi_trapno = T_PROTFLT;
  975                         ksi.ksi_addr = (void *)regs->tf_eip;
  976                         trapsignal(td, &ksi);
  977                         return (EINVAL);
  978                 }
  979 
  980                 bcopy(&ucp->uc_mcontext.mc_fs, regs, sizeof(*regs));
  981         }
  982 
  983 #if defined(COMPAT_43)
  984         if (ucp->uc_mcontext.mc_onstack & 1)
  985                 td->td_sigstk.ss_flags |= SS_ONSTACK;
  986         else
  987                 td->td_sigstk.ss_flags &= ~SS_ONSTACK;
  988 #endif
  989         kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0);
  990         return (EJUSTRETURN);
  991 }
  992 #endif  /* COMPAT_FREEBSD4 */
  993 
  994 /*
  995  * MPSAFE
  996  */
  997 int
  998 sigreturn(td, uap)
  999         struct thread *td;
 1000         struct sigreturn_args /* {
 1001                 const struct __ucontext *sigcntxp;
 1002         } */ *uap;
 1003 {
 1004         ucontext_t uc;
 1005         struct trapframe *regs;
 1006         ucontext_t *ucp;
 1007         int cs, eflags, error, ret;
 1008         ksiginfo_t ksi;
 1009 
 1010         error = copyin(uap->sigcntxp, &uc, sizeof(uc));
 1011         if (error != 0)
 1012                 return (error);
 1013         ucp = &uc;
 1014         regs = td->td_frame;
 1015         eflags = ucp->uc_mcontext.mc_eflags;
 1016         if (eflags & PSL_VM) {
 1017                 struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
 1018                 struct vm86_kernel *vm86;
 1019 
 1020                 /*
 1021                  * if pcb_ext == 0 or vm86_inited == 0, the user hasn't
 1022                  * set up the vm86 area, and we can't enter vm86 mode.
 1023                  */
 1024                 if (td->td_pcb->pcb_ext == 0)
 1025                         return (EINVAL);
 1026                 vm86 = &td->td_pcb->pcb_ext->ext_vm86;
 1027                 if (vm86->vm86_inited == 0)
 1028                         return (EINVAL);
 1029 
 1030                 /* Go back to user mode if both flags are set. */
 1031                 if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) {
 1032                         ksiginfo_init_trap(&ksi);
 1033                         ksi.ksi_signo = SIGBUS;
 1034                         ksi.ksi_code = BUS_OBJERR;
 1035                         ksi.ksi_addr = (void *)regs->tf_eip;
 1036                         trapsignal(td, &ksi);
 1037                 }
 1038 
 1039                 if (vm86->vm86_has_vme) {
 1040                         eflags = (tf->tf_eflags & ~VME_USERCHANGE) |
 1041                             (eflags & VME_USERCHANGE) | PSL_VM;
 1042                 } else {
 1043                         vm86->vm86_eflags = eflags;     /* save VIF, VIP */
 1044                         eflags = (tf->tf_eflags & ~VM_USERCHANGE) |
 1045                             (eflags & VM_USERCHANGE) | PSL_VM;
 1046                 }
 1047                 bcopy(&ucp->uc_mcontext.mc_fs, tf, sizeof(struct trapframe));
 1048                 tf->tf_eflags = eflags;
 1049                 tf->tf_vm86_ds = tf->tf_ds;
 1050                 tf->tf_vm86_es = tf->tf_es;
 1051                 tf->tf_vm86_fs = tf->tf_fs;
 1052                 tf->tf_vm86_gs = ucp->uc_mcontext.mc_gs;
 1053                 tf->tf_ds = _udatasel;
 1054                 tf->tf_es = _udatasel;
 1055                 tf->tf_fs = _udatasel;
 1056         } else {
 1057                 /*
 1058                  * Don't allow users to change privileged or reserved flags.
 1059                  */
 1060                 /*
 1061                  * XXX do allow users to change the privileged flag PSL_RF.
 1062                  * The cpu sets PSL_RF in tf_eflags for faults.  Debuggers
 1063                  * should sometimes set it there too.  tf_eflags is kept in
 1064                  * the signal context during signal handling and there is no
 1065                  * other place to remember it, so the PSL_RF bit may be
 1066                  * corrupted by the signal handler without us knowing.
 1067                  * Corruption of the PSL_RF bit at worst causes one more or
 1068                  * one less debugger trap, so allowing it is fairly harmless.
 1069                  */
 1070                 if (!EFL_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF)) {
 1071                         uprintf("pid %d (%s): sigreturn eflags = 0x%x\n",
 1072                             td->td_proc->p_pid, td->td_name, eflags);
 1073                         return (EINVAL);
 1074                 }
 1075 
 1076                 /*
 1077                  * Don't allow users to load a valid privileged %cs.  Let the
 1078                  * hardware check for invalid selectors, excess privilege in
 1079                  * other selectors, invalid %eip's and invalid %esp's.
 1080                  */
 1081                 cs = ucp->uc_mcontext.mc_cs;
 1082                 if (!CS_SECURE(cs)) {
 1083                         uprintf("pid %d (%s): sigreturn cs = 0x%x\n",
 1084                             td->td_proc->p_pid, td->td_name, cs);
 1085                         ksiginfo_init_trap(&ksi);
 1086                         ksi.ksi_signo = SIGBUS;
 1087                         ksi.ksi_code = BUS_OBJERR;
 1088                         ksi.ksi_trapno = T_PROTFLT;
 1089                         ksi.ksi_addr = (void *)regs->tf_eip;
 1090                         trapsignal(td, &ksi);
 1091                         return (EINVAL);
 1092                 }
 1093 
 1094                 ret = set_fpcontext(td, &ucp->uc_mcontext);
 1095                 if (ret != 0)
 1096                         return (ret);
 1097                 bcopy(&ucp->uc_mcontext.mc_fs, regs, sizeof(*regs));
 1098         }
 1099 
 1100 #if defined(COMPAT_43)
 1101         if (ucp->uc_mcontext.mc_onstack & 1)
 1102                 td->td_sigstk.ss_flags |= SS_ONSTACK;
 1103         else
 1104                 td->td_sigstk.ss_flags &= ~SS_ONSTACK;
 1105 #endif
 1106 
 1107         kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0);
 1108         return (EJUSTRETURN);
 1109 }
 1110 
 1111 /*
 1112  * Machine dependent boot() routine
 1113  *
 1114  * I haven't seen anything to put here yet
 1115  * Possibly some stuff might be grafted back here from boot()
 1116  */
 1117 void
 1118 cpu_boot(int howto)
 1119 {
 1120 }
 1121 
 1122 /*
 1123  * Flush the D-cache for non-DMA I/O so that the I-cache can
 1124  * be made coherent later.
 1125  */
 1126 void
 1127 cpu_flush_dcache(void *ptr, size_t len)
 1128 {
 1129         /* Not applicable */
 1130 }
 1131 
 1132 /* Get current clock frequency for the given cpu id. */
 1133 int
 1134 cpu_est_clockrate(int cpu_id, uint64_t *rate)
 1135 {
 1136         register_t reg;
 1137         uint64_t tsc1, tsc2;
 1138 
 1139         if (pcpu_find(cpu_id) == NULL || rate == NULL)
 1140                 return (EINVAL);
 1141         if (!tsc_present)
 1142                 return (EOPNOTSUPP);
 1143 
 1144         /* If we're booting, trust the rate calibrated moments ago. */
 1145         if (cold) {
 1146                 *rate = tsc_freq;
 1147                 return (0);
 1148         }
 1149 
 1150 #ifdef SMP
 1151         /* Schedule ourselves on the indicated cpu. */
 1152         thread_lock(curthread);
 1153         sched_bind(curthread, cpu_id);
 1154         thread_unlock(curthread);
 1155 #endif
 1156 
 1157         /* Calibrate by measuring a short delay. */
 1158         reg = intr_disable();
 1159         tsc1 = rdtsc();
 1160         DELAY(1000);
 1161         tsc2 = rdtsc();
 1162         intr_restore(reg);
 1163 
 1164 #ifdef SMP
 1165         thread_lock(curthread);
 1166         sched_unbind(curthread);
 1167         thread_unlock(curthread);
 1168 #endif
 1169 
  1170         /*
  1171          * Calculate the difference in readings, convert it to Hz (the delay
  1172          * was one millisecond), and subtract 0.5% of the total; empirically,
  1173          * overhead in DELAY() works out to approximately this value.
  1174          */
 1175         tsc2 -= tsc1;
 1176         *rate = tsc2 * 1000 - tsc2 * 5;
 1177         return (0);
 1178 }
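
/*
 * Worked example (hypothetical readings): if tsc2 - tsc1 == 2,000,000
 * ticks over the 1 ms delay, the raw rate is 2,000,000 * 1000 = 2.0
 * GHz; subtracting 2,000,000 * 5 (0.5%, i.e. 5/1000) gives *rate =
 * 1,990,000,000 Hz.
 */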
 1179 
 1180 
 1181 void (*cpu_idle_hook)(void) = NULL;     /* ACPI idle hook. */
 1182 
 1183 #ifdef XEN
 1184 
 1185 void
 1186 cpu_halt(void)
 1187 {
 1188         HYPERVISOR_shutdown(SHUTDOWN_poweroff);
 1189 }
 1190 
 1191 int scheduler_running;
 1192 
 1193 static void
 1194 cpu_idle_hlt(int busy)
 1195 {
 1196 
 1197         scheduler_running = 1;
 1198         enable_intr();
 1199         idle_block();
 1200 }
 1201 
 1202 #else
 1203 /*
 1204  * Shutdown the CPU as much as possible
 1205  */
 1206 void
 1207 cpu_halt(void)
 1208 {
 1209         for (;;)
 1210                 __asm__ ("hlt");
 1211 }
 1212 
 1213 static void
 1214 cpu_idle_hlt(int busy)
 1215 {
  1216         /*
  1217          * We must absolutely guarantee that hlt is the next instruction after sti,
  1218          * which defers interrupts by one instruction, or we introduce a timing window.
  1219          */
 1220         disable_intr();
 1221         if (sched_runnable())
 1222                 enable_intr();
 1223         else
 1224                 __asm __volatile("sti; hlt");
 1225 }
 1226 #endif
 1227 
 1228 static void
 1229 cpu_idle_acpi(int busy)
 1230 {
 1231         disable_intr();
 1232         if (sched_runnable())
 1233                 enable_intr();
 1234         else if (cpu_idle_hook)
 1235                 cpu_idle_hook();
 1236         else
 1237                 __asm __volatile("sti; hlt");
 1238 }
 1239 
 1240 static int cpu_ident_amdc1e = 0;
 1241 
 1242 #if !defined(XEN) || defined(XEN_PRIVILEGED)
 1243 static int
 1244 cpu_probe_amdc1e(void)
 1245 { 
 1246 #ifdef DEV_APIC
 1247         int i;
 1248 
 1249         /*
 1250          * Forget it, if we're not using local APIC timer.
 1251          */
 1252         if (resource_disabled("apic", 0) ||
 1253             (resource_int_value("apic", 0, "clock", &i) == 0 && i == 0))
 1254                 return (0);
 1255 
 1256         /*
 1257          * Detect the presence of C1E capability mostly on latest
 1258          * dual-cores (or future) k8 family.
 1259          */
 1260         if (cpu_vendor_id == CPU_VENDOR_AMD &&
 1261             (cpu_id & 0x00000f00) == 0x00000f00 &&
 1262             (cpu_id & 0x0fff0000) >=  0x00040000) {
 1263                 cpu_ident_amdc1e = 1;
 1264                 return (1);
 1265         }
 1266 #endif
 1267         return (0);
 1268 }
 1269 #endif
 1270 
 1271 /*
 1272  * C1E renders the local APIC timer dead, so we disable it by
 1273  * reading the Interrupt Pending Message register and clearing
 1274  * both C1eOnCmpHalt (bit 28) and SmiOnCmpHalt (bit 27).
 1275  * 
 1276  * Reference:
 1277  *   "BIOS and Kernel Developer's Guide for AMD NPT Family 0Fh Processors"
 1278  *   #32559 revision 3.00+
 1279  */
 1280 #define MSR_AMDK8_IPM           0xc0010055
 1281 #define AMDK8_SMIONCMPHALT      (1ULL << 27)
 1282 #define AMDK8_C1EONCMPHALT      (1ULL << 28)
 1283 #define AMDK8_CMPHALT           (AMDK8_SMIONCMPHALT | AMDK8_C1EONCMPHALT)
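
/*
 * Illustrative note: AMDK8_CMPHALT == (1ULL << 27) | (1ULL << 28) ==
 * 0x18000000ULL, so cpu_idle_amdc1e() below clears both halt-triggered
 * bits of MSR 0xc0010055 with a single wrmsr().
 */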
 1284 
 1285 static void
 1286 cpu_idle_amdc1e(int busy)
 1287 {
 1288 
 1289         disable_intr();
 1290         if (sched_runnable())
 1291                 enable_intr();
 1292         else {
 1293                 uint64_t msr;
 1294 
 1295                 msr = rdmsr(MSR_AMDK8_IPM);
 1296                 if (msr & AMDK8_CMPHALT)
 1297                         wrmsr(MSR_AMDK8_IPM, msr & ~AMDK8_CMPHALT);
 1298 
 1299                 if (cpu_idle_hook)
 1300                         cpu_idle_hook();
 1301                 else
 1302                         __asm __volatile("sti; hlt");
 1303         }
 1304 }
 1305 
 1306 static void
 1307 cpu_idle_spin(int busy)
 1308 {
 1309         return;
 1310 }
 1311 
 1312 #ifdef XEN
 1313 void (*cpu_idle_fn)(int) = cpu_idle_hlt;
 1314 #else
 1315 void (*cpu_idle_fn)(int) = cpu_idle_acpi;
 1316 #endif
 1317 
 1318 void
 1319 cpu_idle(int busy)
 1320 {
 1321 #if defined(SMP) && !defined(XEN)
 1322         if (mp_grab_cpu_hlt())
 1323                 return;
 1324 #endif
 1325         cpu_idle_fn(busy);
 1326 }
 1327 
 1328 /*
 1329  * mwait cpu power states.  Lower 4 bits are sub-states.
 1330  */
 1331 #define MWAIT_C0        0xf0
 1332 #define MWAIT_C1        0x00
 1333 #define MWAIT_C2        0x10
 1334 #define MWAIT_C3        0x20
 1335 #define MWAIT_C4        0x30
 1336 
 1337 #define MWAIT_DISABLED  0x0
 1338 #define MWAIT_WOKEN     0x1
 1339 #define MWAIT_WAITING   0x2
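
/*
 * Illustrative note: the per-CPU monitorbuf word implements a small
 * wakeup handshake.  The idle loop stores MWAIT_WAITING, arms the
 * address with cpu_monitor(), and sleeps in cpu_mwait(); a remote
 * cpu_idle_wakeup() stores MWAIT_WOKEN to the same word, and that
 * store is what satisfies the armed monitor and ends the mwait.
 */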
 1340 
 1341 static void
 1342 cpu_idle_mwait(int busy)
 1343 {
 1344         int *mwait;
 1345 
 1346         mwait = (int *)PCPU_PTR(monitorbuf);
 1347         *mwait = MWAIT_WAITING;
 1348         if (sched_runnable())
 1349                 return;
 1350         cpu_monitor(mwait, 0, 0);
 1351         if (*mwait == MWAIT_WAITING)
 1352                 cpu_mwait(0, MWAIT_C1);
 1353 }
 1354 
 1355 static void
 1356 cpu_idle_mwait_hlt(int busy)
 1357 {
 1358         int *mwait;
 1359 
 1360         mwait = (int *)PCPU_PTR(monitorbuf);
 1361         if (busy == 0) {
 1362                 *mwait = MWAIT_DISABLED;
 1363                 cpu_idle_hlt(busy);
 1364                 return;
 1365         }
 1366         *mwait = MWAIT_WAITING;
 1367         if (sched_runnable())
 1368                 return;
 1369         cpu_monitor(mwait, 0, 0);
 1370         if (*mwait == MWAIT_WAITING)
 1371                 cpu_mwait(0, MWAIT_C1);
 1372 }
 1373 
 1374 int
 1375 cpu_idle_wakeup(int cpu)
 1376 {
 1377         struct pcpu *pcpu;
 1378         int *mwait;
 1379 
 1380         if (cpu_idle_fn == cpu_idle_spin)
 1381                 return (1);
 1382         if (cpu_idle_fn != cpu_idle_mwait && cpu_idle_fn != cpu_idle_mwait_hlt)
 1383                 return (0);
 1384         pcpu = pcpu_find(cpu);
 1385         mwait = (int *)pcpu->pc_monitorbuf;
 1386         /*
 1387          * This doesn't need to be atomic since missing the race will
 1388          * simply result in unnecessary IPIs.
 1389          */
 1390         if (cpu_idle_fn == cpu_idle_mwait_hlt && *mwait == MWAIT_DISABLED)
 1391                 return (0);
 1392         *mwait = MWAIT_WOKEN;
 1393 
 1394         return (1);
 1395 }
 1396 
 1397 /*
 1398  * Ordered by speed/power consumption.
 1399  */
 1400 struct {
 1401         void    *id_fn;
 1402         char    *id_name;
 1403 } idle_tbl[] = {
 1404         { cpu_idle_spin, "spin" },
 1405         { cpu_idle_mwait, "mwait" },
 1406         { cpu_idle_mwait_hlt, "mwait_hlt" },
 1407         { cpu_idle_amdc1e, "amdc1e" },
 1408         { cpu_idle_hlt, "hlt" },
 1409         { cpu_idle_acpi, "acpi" },
 1410         { NULL, NULL }
 1411 };
 1412 
 1413 static int
 1414 idle_sysctl_available(SYSCTL_HANDLER_ARGS)
 1415 {
 1416         char *avail, *p;
 1417         int error;
 1418         int i;
 1419 
 1420         avail = malloc(256, M_TEMP, M_WAITOK);
 1421         p = avail;
 1422         for (i = 0; idle_tbl[i].id_name != NULL; i++) {
 1423                 if (strstr(idle_tbl[i].id_name, "mwait") &&
 1424                     (cpu_feature2 & CPUID2_MON) == 0)
 1425                         continue;
 1426                 if (strcmp(idle_tbl[i].id_name, "amdc1e") == 0 &&
 1427                     cpu_ident_amdc1e == 0)
 1428                         continue;
 1429                 p += sprintf(p, "%s, ", idle_tbl[i].id_name);
 1430         }
 1431         error = sysctl_handle_string(oidp, avail, 0, req);
 1432         free(avail, M_TEMP);
 1433         return (error);
 1434 }
 1435 
 1436 static int
 1437 idle_sysctl(SYSCTL_HANDLER_ARGS)
 1438 {
 1439         char buf[16];
 1440         int error;
 1441         char *p;
 1442         int i;
 1443 
 1444         p = "unknown";
 1445         for (i = 0; idle_tbl[i].id_name != NULL; i++) {
 1446                 if (idle_tbl[i].id_fn == cpu_idle_fn) {
 1447                         p = idle_tbl[i].id_name;
 1448                         break;
 1449                 }
 1450         }
 1451         strncpy(buf, p, sizeof(buf));
 1452         error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
 1453         if (error != 0 || req->newptr == NULL)
 1454                 return (error);
 1455         for (i = 0; idle_tbl[i].id_name != NULL; i++) {
 1456                 if (strstr(idle_tbl[i].id_name, "mwait") &&
 1457                     (cpu_feature2 & CPUID2_MON) == 0)
 1458                         continue;
 1459                 if (strcmp(idle_tbl[i].id_name, "amdc1e") == 0 &&
 1460                     cpu_ident_amdc1e == 0)
 1461                         continue;
 1462                 if (strcmp(idle_tbl[i].id_name, buf))
 1463                         continue;
 1464                 cpu_idle_fn = idle_tbl[i].id_fn;
 1465                 return (0);
 1466         }
 1467         return (EINVAL);
 1468 }
 1469 
 1470 SYSCTL_PROC(_machdep, OID_AUTO, idle_available, CTLTYPE_STRING | CTLFLAG_RD,
 1471     0, 0, idle_sysctl_available, "A", "list of available idle functions");
 1472 
 1473 SYSCTL_PROC(_machdep, OID_AUTO, idle, CTLTYPE_STRING | CTLFLAG_RW, 0, 0,
 1474     idle_sysctl, "A", "currently selected idle function");
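/*
 * Editor's note: illustrative use of the two sysctls above from
 * userland; the available list varies with CPU features (note the
 * trailing separator produced by the handler):
 *
 *	# sysctl machdep.idle_available
 *	machdep.idle_available: spin, mwait, mwait_hlt, amdc1e, hlt, acpi,
 *	# sysctl machdep.idle=hlt
 *	machdep.idle: acpi -> hlt
 */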
 1475 
 1476 /*
 1477  * Reset registers to default values on exec.
 1478  */
 1479 void
 1480 exec_setregs(td, entry, stack, ps_strings)
 1481         struct thread *td;
 1482         u_long entry;
 1483         u_long stack;
 1484         u_long ps_strings;
 1485 {
 1486         struct trapframe *regs = td->td_frame;
 1487         struct pcb *pcb = td->td_pcb;
 1488 
 1489         /* Reset pcb->pcb_gs and %gs before possibly invalidating it. */
 1490         pcb->pcb_gs = _udatasel;
 1491         load_gs(_udatasel);
 1492 
 1493         mtx_lock_spin(&dt_lock);
 1494         if (td->td_proc->p_md.md_ldt)
 1495                 user_ldt_free(td);
 1496         else
 1497                 mtx_unlock_spin(&dt_lock);
 1498   
 1499         bzero((char *)regs, sizeof(struct trapframe));
 1500         regs->tf_eip = entry;
 1501         regs->tf_esp = stack;
 1502         regs->tf_eflags = PSL_USER | (regs->tf_eflags & PSL_T);
 1503         regs->tf_ss = _udatasel;
 1504         regs->tf_ds = _udatasel;
 1505         regs->tf_es = _udatasel;
 1506         regs->tf_fs = _udatasel;
 1507         regs->tf_cs = _ucodesel;
 1508 
 1509         /* PS_STRINGS value for BSD/OS binaries.  It is 0 for non-BSD/OS. */
 1510         regs->tf_ebx = ps_strings;
 1511 
 1512         /*
 1513          * Reset the hardware debug registers if they were in use.
 1514          * They won't have any meaning for the newly exec'd process.  
 1515          */
 1516         if (pcb->pcb_flags & PCB_DBREGS) {
 1517                 pcb->pcb_dr0 = 0;
 1518                 pcb->pcb_dr1 = 0;
 1519                 pcb->pcb_dr2 = 0;
 1520                 pcb->pcb_dr3 = 0;
 1521                 pcb->pcb_dr6 = 0;
 1522                 pcb->pcb_dr7 = 0;
 1523                 if (pcb == PCPU_GET(curpcb)) {
 1524                         /*
 1525                          * Clear the debug registers on the running
 1526                          * CPU, otherwise they will end up affecting
 1527                          * the next process we switch to.
 1528                          */
 1529                         reset_dbregs();
 1530                 }
 1531                 pcb->pcb_flags &= ~PCB_DBREGS;
 1532         }
 1533 
 1534         /*
 1535          * Initialize the math emulator (if any) for the current process.
 1536          * Actually, just clear the bit that says that the emulator has
 1537          * been initialized.  Initialization is delayed until the process
 1538          * traps to the emulator (if it is done at all) mainly because
 1539          * emulators don't provide an entry point for initialization.
 1540          */
 1541         td->td_pcb->pcb_flags &= ~FP_SOFTFP;
 1542         pcb->pcb_initial_npxcw = __INITIAL_NPXCW__;
 1543 
 1544         /*
 1545          * Drop the FP state if we hold it, so that the process gets a
 1546          * clean FP state if it uses the FPU again.
 1547          */
 1548         fpstate_drop(td);
 1549 
 1550         /*
 1551          * XXX - Linux emulator
 1552          * Make sure edx is 0x0 on entry.  Linux binaries depend
 1553          * on it.
 1554          */
 1555         td->td_retval[1] = 0;
 1556 }
 1557 
 1558 void
 1559 cpu_setregs(void)
 1560 {
 1561         unsigned int cr0;
 1562 
 1563         cr0 = rcr0();
 1564 
 1565         /*
 1566          * CR0_MP, CR0_NE and CR0_TS are set for NPX (FPU) support:
 1567          *
 1568          * Prepare to trap all ESC (i.e., NPX) instructions and all WAIT
 1569          * instructions.  We must set the CR0_MP bit and use the CR0_TS
 1570          * bit to control the trap, because setting the CR0_EM bit does
 1571          * not cause WAIT instructions to trap.  It's important to trap
 1572          * WAIT instructions - otherwise the "wait" variants of no-wait
 1573          * control instructions would degenerate to the "no-wait" variants
 1574          * after FP context switches but work correctly otherwise.  It's
 1575          * particularly important to trap WAITs when there is no NPX -
 1576          * otherwise the "wait" variants would always degenerate.
 1577          *
 1578          * Try setting CR0_NE to get correct error reporting on 486DX's.
 1579          * Setting it should fail or do nothing on lesser processors.
 1580          */
 1581         cr0 |= CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM;
 1582         load_cr0(cr0);
 1583         load_gs(_udatasel);
 1584 }
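/*
 * Editor's note: a worked example of the mask built above, using the
 * architectural CR0 bit values (CR0_MP 0x2, CR0_TS 0x8, CR0_NE 0x20,
 * CR0_WP 0x10000, CR0_AM 0x40000):
 */
#if 0
	/* CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM == 0x0005002a */
	KASSERT((rcr0() & 0x0005002a) == 0x0005002a,
	    ("cpu_setregs: expected CR0 bits not set"));
#endif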
 1585 
 1586 u_long bootdev;         /* not a struct cdev *- encoding is different */
 1587 SYSCTL_ULONG(_machdep, OID_AUTO, guessed_bootdev,
 1588         CTLFLAG_RD, &bootdev, 0, "Maybe the Boot device (not in struct cdev *format)");
 1589 
 1590 /*
 1591  * Initialize 386 and configure to run kernel
 1592  */
 1593 
 1594 /*
 1595  * Initialize segments & interrupt table
 1596  */
 1597 
 1598 int _default_ldt;
 1599 
 1600 #ifdef XEN
 1601 union descriptor *gdt;
 1602 union descriptor *ldt;
 1603 #else
 1604 union descriptor gdt[NGDT * MAXCPU];    /* global descriptor table */
 1605 union descriptor ldt[NLDT];             /* local descriptor table */
 1606 #endif
 1607 static struct gate_descriptor idt0[NIDT];
 1608 struct gate_descriptor *idt = &idt0[0]; /* interrupt descriptor table */
 1609 struct region_descriptor r_gdt, r_idt;  /* table descriptors */
 1610 struct mtx dt_lock;                     /* lock for GDT and LDT */
 1611 
 1612 #if defined(I586_CPU) && !defined(NO_F00F_HACK)
 1613 extern int has_f00f_bug;
 1614 #endif
 1615 
 1616 static struct i386tss dblfault_tss;
 1617 static char dblfault_stack[PAGE_SIZE];
 1618 
 1619 extern  vm_offset_t     proc0kstack;
 1620 
 1621 
 1622 /*
 1623  * software prototypes -- in more palatable form.
 1624  *
 1625  * GCODE_SEL through GUDATA_SEL must be in this order for syscall/sysret
 1626  * GUFS_SEL and GUGS_SEL must be in this order (swtch.s knows it)
 1627  */
 1628 struct soft_segment_descriptor gdt_segs[] = {
 1629 /* GNULL_SEL    0 Null Descriptor */
 1630 {       .ssd_base = 0x0,
 1631         .ssd_limit = 0x0,
 1632         .ssd_type = 0,
 1633         .ssd_dpl = SEL_KPL,
 1634         .ssd_p = 0,
 1635         .ssd_xx = 0, .ssd_xx1 = 0,
 1636         .ssd_def32 = 0,
 1637         .ssd_gran = 0           },
 1638 /* GPRIV_SEL    1 SMP Per-Processor Private Data Descriptor */
 1639 {       .ssd_base = 0x0,
 1640         .ssd_limit = 0xfffff,
 1641         .ssd_type = SDT_MEMRWA,
 1642         .ssd_dpl = SEL_KPL,
 1643         .ssd_p = 1,
 1644         .ssd_xx = 0, .ssd_xx1 = 0,
 1645         .ssd_def32 = 1,
 1646         .ssd_gran = 1           },
 1647 /* GUFS_SEL     2 %fs Descriptor for user */
 1648 {       .ssd_base = 0x0,
 1649         .ssd_limit = 0xfffff,
 1650         .ssd_type = SDT_MEMRWA,
 1651         .ssd_dpl = SEL_UPL,
 1652         .ssd_p = 1,
 1653         .ssd_xx = 0, .ssd_xx1 = 0,
 1654         .ssd_def32 = 1,
 1655         .ssd_gran = 1           },
 1656 /* GUGS_SEL     3 %gs Descriptor for user */
 1657 {       .ssd_base = 0x0,
 1658         .ssd_limit = 0xfffff,
 1659         .ssd_type = SDT_MEMRWA,
 1660         .ssd_dpl = SEL_UPL,
 1661         .ssd_p = 1,
 1662         .ssd_xx = 0, .ssd_xx1 = 0,
 1663         .ssd_def32 = 1,
 1664         .ssd_gran = 1           },
 1665 /* GCODE_SEL    4 Code Descriptor for kernel */
 1666 {       .ssd_base = 0x0,
 1667         .ssd_limit = 0xfffff,
 1668         .ssd_type = SDT_MEMERA,
 1669         .ssd_dpl = SEL_KPL,
 1670         .ssd_p = 1,
 1671         .ssd_xx = 0, .ssd_xx1 = 0,
 1672         .ssd_def32 = 1,
 1673         .ssd_gran = 1           },
 1674 /* GDATA_SEL    5 Data Descriptor for kernel */
 1675 {       .ssd_base = 0x0,
 1676         .ssd_limit = 0xfffff,
 1677         .ssd_type = SDT_MEMRWA,
 1678         .ssd_dpl = SEL_KPL,
 1679         .ssd_p = 1,
 1680         .ssd_xx = 0, .ssd_xx1 = 0,
 1681         .ssd_def32 = 1,
 1682         .ssd_gran = 1           },
 1683 /* GUCODE_SEL   6 Code Descriptor for user */
 1684 {       .ssd_base = 0x0,
 1685         .ssd_limit = 0xfffff,
 1686         .ssd_type = SDT_MEMERA,
 1687         .ssd_dpl = SEL_UPL,
 1688         .ssd_p = 1,
 1689         .ssd_xx = 0, .ssd_xx1 = 0,
 1690         .ssd_def32 = 1,
 1691         .ssd_gran = 1           },
 1692 /* GUDATA_SEL   7 Data Descriptor for user */
 1693 {       .ssd_base = 0x0,
 1694         .ssd_limit = 0xfffff,
 1695         .ssd_type = SDT_MEMRWA,
 1696         .ssd_dpl = SEL_UPL,
 1697         .ssd_p = 1,
 1698         .ssd_xx = 0, .ssd_xx1 = 0,
 1699         .ssd_def32 = 1,
 1700         .ssd_gran = 1           },
 1701 /* GBIOSLOWMEM_SEL 8 BIOS access to realmode segment 0x40, must be #8 in GDT */
 1702 {       .ssd_base = 0x400,
 1703         .ssd_limit = 0xfffff,
 1704         .ssd_type = SDT_MEMRWA,
 1705         .ssd_dpl = SEL_KPL,
 1706         .ssd_p = 1,
 1707         .ssd_xx = 0, .ssd_xx1 = 0,
 1708         .ssd_def32 = 1,
 1709         .ssd_gran = 1           },
 1710 #ifndef XEN
 1711 /* GPROC0_SEL   9 Proc 0 Tss Descriptor */
 1712 {
 1713         .ssd_base = 0x0,
 1714         .ssd_limit = sizeof(struct i386tss)-1,
 1715         .ssd_type = SDT_SYS386TSS,
 1716         .ssd_dpl = 0,
 1717         .ssd_p = 1,
 1718         .ssd_xx = 0, .ssd_xx1 = 0,
 1719         .ssd_def32 = 0,
 1720         .ssd_gran = 0           },
 1721 /* GLDT_SEL     10 LDT Descriptor */
 1722 {       .ssd_base = (int) ldt,
 1723         .ssd_limit = sizeof(ldt)-1,
 1724         .ssd_type = SDT_SYSLDT,
 1725         .ssd_dpl = SEL_UPL,
 1726         .ssd_p = 1,
 1727         .ssd_xx = 0, .ssd_xx1 = 0,
 1728         .ssd_def32 = 0,
 1729         .ssd_gran = 0           },
 1730 /* GUSERLDT_SEL 11 User LDT Descriptor per process */
 1731 {       .ssd_base = (int) ldt,
 1732         .ssd_limit = (512 * sizeof(union descriptor)-1),
 1733         .ssd_type = SDT_SYSLDT,
 1734         .ssd_dpl = 0,
 1735         .ssd_p = 1,
 1736         .ssd_xx = 0, .ssd_xx1 = 0,
 1737         .ssd_def32 = 0,
 1738         .ssd_gran = 0           },
 1739 /* GPANIC_SEL   12 Panic Tss Descriptor */
 1740 {       .ssd_base = (int) &dblfault_tss,
 1741         .ssd_limit = sizeof(struct i386tss)-1,
 1742         .ssd_type = SDT_SYS386TSS,
 1743         .ssd_dpl = 0,
 1744         .ssd_p = 1,
 1745         .ssd_xx = 0, .ssd_xx1 = 0,
 1746         .ssd_def32 = 0,
 1747         .ssd_gran = 0           },
 1748 /* GBIOSCODE32_SEL 13 BIOS 32-bit interface (32bit Code) */
 1749 {       .ssd_base = 0,
 1750         .ssd_limit = 0xfffff,
 1751         .ssd_type = SDT_MEMERA,
 1752         .ssd_dpl = 0,
 1753         .ssd_p = 1,
 1754         .ssd_xx = 0, .ssd_xx1 = 0,
 1755         .ssd_def32 = 0,
 1756         .ssd_gran = 1           },
 1757 /* GBIOSCODE16_SEL 14 BIOS 32-bit interface (16bit Code) */
 1758 {       .ssd_base = 0,
 1759         .ssd_limit = 0xfffff,
 1760         .ssd_type = SDT_MEMERA,
 1761         .ssd_dpl = 0,
 1762         .ssd_p = 1,
 1763         .ssd_xx = 0, .ssd_xx1 = 0,
 1764         .ssd_def32 = 0,
 1765         .ssd_gran = 1           },
 1766 /* GBIOSDATA_SEL 15 BIOS 32-bit interface (Data) */
 1767 {       .ssd_base = 0,
 1768         .ssd_limit = 0xfffff,
 1769         .ssd_type = SDT_MEMRWA,
 1770         .ssd_dpl = 0,
 1771         .ssd_p = 1,
 1772         .ssd_xx = 0, .ssd_xx1 = 0,
 1773         .ssd_def32 = 1,
 1774         .ssd_gran = 1           },
 1775 /* GBIOSUTIL_SEL 16 BIOS 16-bit interface (Utility) */
 1776 {       .ssd_base = 0,
 1777         .ssd_limit = 0xfffff,
 1778         .ssd_type = SDT_MEMRWA,
 1779         .ssd_dpl = 0,
 1780         .ssd_p = 1,
 1781         .ssd_xx = 0, .ssd_xx1 = 0,
 1782         .ssd_def32 = 0,
 1783         .ssd_gran = 1           },
 1784 /* GBIOSARGS_SEL 17 BIOS 16-bit interface (Arguments) */
 1785 {       .ssd_base = 0,
 1786         .ssd_limit = 0xfffff,
 1787         .ssd_type = SDT_MEMRWA,
 1788         .ssd_dpl = 0,
 1789         .ssd_p = 1,
 1790         .ssd_xx = 0, .ssd_xx1 = 0,
 1791         .ssd_def32 = 0,
 1792         .ssd_gran = 1           },
 1793 /* GNDIS_SEL    18 NDIS Descriptor */
 1794 {       .ssd_base = 0x0,
 1795         .ssd_limit = 0x0,
 1796         .ssd_type = 0,
 1797         .ssd_dpl = 0,
 1798         .ssd_p = 0,
 1799         .ssd_xx = 0, .ssd_xx1 = 0,
 1800         .ssd_def32 = 0,
 1801         .ssd_gran = 0           },
 1802 #endif /* !XEN */
 1803 };
 1804 
 1805 static struct soft_segment_descriptor ldt_segs[] = {
 1806         /* Null Descriptor - overwritten by call gate */
 1807 {       .ssd_base = 0x0,
 1808         .ssd_limit = 0x0,
 1809         .ssd_type = 0,
 1810         .ssd_dpl = 0,
 1811         .ssd_p = 0,
 1812         .ssd_xx = 0, .ssd_xx1 = 0,
 1813         .ssd_def32 = 0,
 1814         .ssd_gran = 0           },
 1815         /* Null Descriptor - overwritten by call gate */
 1816 {       .ssd_base = 0x0,
 1817         .ssd_limit = 0x0,
 1818         .ssd_type = 0,
 1819         .ssd_dpl = 0,
 1820         .ssd_p = 0,
 1821         .ssd_xx = 0, .ssd_xx1 = 0,
 1822         .ssd_def32 = 0,
 1823         .ssd_gran = 0           },
 1824         /* Null Descriptor - overwritten by call gate */
 1825 {       .ssd_base = 0x0,
 1826         .ssd_limit = 0x0,
 1827         .ssd_type = 0,
 1828         .ssd_dpl = 0,
 1829         .ssd_p = 0,
 1830         .ssd_xx = 0, .ssd_xx1 = 0,
 1831         .ssd_def32 = 0,
 1832         .ssd_gran = 0           },
 1833         /* Code Descriptor for user */
 1834 {       .ssd_base = 0x0,
 1835         .ssd_limit = 0xfffff,
 1836         .ssd_type = SDT_MEMERA,
 1837         .ssd_dpl = SEL_UPL,
 1838         .ssd_p = 1,
 1839         .ssd_xx = 0, .ssd_xx1 = 0,
 1840         .ssd_def32 = 1,
 1841         .ssd_gran = 1           },
 1842         /* Null Descriptor - overwritten by call gate */
 1843 {       .ssd_base = 0x0,
 1844         .ssd_limit = 0x0,
 1845         .ssd_type = 0,
 1846         .ssd_dpl = 0,
 1847         .ssd_p = 0,
 1848         .ssd_xx = 0, .ssd_xx1 = 0,
 1849         .ssd_def32 = 0,
 1850         .ssd_gran = 0           },
 1851         /* Data Descriptor for user */
 1852 {       .ssd_base = 0x0,
 1853         .ssd_limit = 0xfffff,
 1854         .ssd_type = SDT_MEMRWA,
 1855         .ssd_dpl = SEL_UPL,
 1856         .ssd_p = 1,
 1857         .ssd_xx = 0, .ssd_xx1 = 0,
 1858         .ssd_def32 = 1,
 1859         .ssd_gran = 1           },
 1860 };
 1861 
 1862 void
 1863 setidt(idx, func, typ, dpl, selec)
 1864         int idx;
 1865         inthand_t *func;
 1866         int typ;
 1867         int dpl;
 1868         int selec;
 1869 {
 1870         struct gate_descriptor *ip;
 1871 
 1872         ip = idt + idx;
 1873         ip->gd_looffset = (int)func;
 1874         ip->gd_selector = selec;
 1875         ip->gd_stkcpy = 0;
 1876         ip->gd_xx = 0;
 1877         ip->gd_type = typ;
 1878         ip->gd_dpl = dpl;
 1879         ip->gd_p = 1;
 1880         ip->gd_hioffset = ((int)func)>>16;
 1881 }
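/*
 * Editor's note: the handler offset is split across two 16-bit fields;
 * a consumer reassembles it exactly as db_show_idt() does below.  A
 * minimal round-trip sketch using symbols from this file:
 */
#if 0
	struct gate_descriptor *ip;

	setidt(IDT_UD, &IDTVEC(ill), SDT_SYS386TGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	ip = idt + IDT_UD;
	KASSERT(((ip->gd_hioffset << 16) | ip->gd_looffset) ==
	    (int)&IDTVEC(ill), ("setidt: offset round-trip failed"));
#endif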
 1882 
 1883 extern inthand_t
 1884         IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
 1885         IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
 1886         IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
 1887         IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align),
 1888         IDTVEC(xmm),
 1889 #ifdef KDTRACE_HOOKS
 1890         IDTVEC(dtrace_ret),
 1891 #endif
 1892         IDTVEC(lcall_syscall), IDTVEC(int0x80_syscall);
 1893 
 1894 #ifdef DDB
 1895 /*
 1896  * Display the index and function name of any IDT entries that don't use
 1897  * the default 'rsvd' entry point.
 1898  */
 1899 DB_SHOW_COMMAND(idt, db_show_idt)
 1900 {
 1901         struct gate_descriptor *ip;
 1902         int idx;
 1903         uintptr_t func;
 1904 
 1905         ip = idt;
 1906         for (idx = 0; idx < NIDT && !db_pager_quit; idx++) {
 1907                 func = (ip->gd_hioffset << 16 | ip->gd_looffset);
 1908                 if (func != (uintptr_t)&IDTVEC(rsvd)) {
 1909                         db_printf("%3d\t", idx);
 1910                         db_printsym(func, DB_STGY_PROC);
 1911                         db_printf("\n");
 1912                 }
 1913                 ip++;
 1914         }
 1915 }
 1916 
 1917 /* Show privileged registers. */
 1918 DB_SHOW_COMMAND(sysregs, db_show_sysregs)
 1919 {
 1920         uint64_t idtr, gdtr;
 1921 
 1922         idtr = ridt();
 1923         db_printf("idtr\t0x%08x/%04x\n",
 1924             (u_int)(idtr >> 16), (u_int)idtr & 0xffff);
 1925         gdtr = rgdt();
 1926         db_printf("gdtr\t0x%08x/%04x\n",
 1927             (u_int)(gdtr >> 16), (u_int)gdtr & 0xffff);
 1928         db_printf("ldtr\t0x%04x\n", rldt());
 1929         db_printf("tr\t0x%04x\n", rtr());
 1930         db_printf("cr0\t0x%08x\n", rcr0());
 1931         db_printf("cr2\t0x%08x\n", rcr2());
 1932         db_printf("cr3\t0x%08x\n", rcr3());
 1933         db_printf("cr4\t0x%08x\n", rcr4());
 1934 }
 1935 #endif
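/*
 * Editor's note: DB_SHOW_COMMAND(name, ...) registers "show name", so
 * with DDB compiled in the two commands above are invoked from the
 * debugger prompt as:
 *
 *	db> show idt
 *	db> show sysregs
 */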
 1936 
 1937 void
 1938 sdtossd(sd, ssd)
 1939         struct segment_descriptor *sd;
 1940         struct soft_segment_descriptor *ssd;
 1941 {
 1942         ssd->ssd_base  = (sd->sd_hibase << 24) | sd->sd_lobase;
 1943         ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit;
 1944         ssd->ssd_type  = sd->sd_type;
 1945         ssd->ssd_dpl   = sd->sd_dpl;
 1946         ssd->ssd_p     = sd->sd_p;
 1947         ssd->ssd_def32 = sd->sd_def32;
 1948         ssd->ssd_gran  = sd->sd_gran;
 1949 }
 1950 
 1951 #ifndef XEN
 1952 static int
 1953 add_smap_entry(struct bios_smap *smap, vm_paddr_t *physmap, int *physmap_idxp)
 1954 {
 1955         int i, insert_idx, physmap_idx;
 1956 
 1957         physmap_idx = *physmap_idxp;
 1958         
 1959         if (boothowto & RB_VERBOSE)
 1960                 printf("SMAP type=%02x base=%016llx len=%016llx\n",
 1961                     smap->type, smap->base, smap->length);
 1962 
 1963         if (smap->type != SMAP_TYPE_MEMORY)
 1964                 return (1);
 1965 
 1966         if (smap->length == 0)
 1967                 return (1);
 1968 
 1969 #ifndef PAE
 1970         if (smap->base > 0xffffffff) {
 1971                 printf("%uK of memory above 4GB ignored\n",
 1972                     (u_int)(smap->length / 1024));
 1973                 return (1);
 1974         }
 1975 #endif
 1976 
 1977         /*
 1978          * Find insertion point while checking for overlap.  Start off by
 1979          * assuming the new entry will be added to the end.
 1980          */
 1981         insert_idx = physmap_idx + 2;
 1982         for (i = 0; i <= physmap_idx; i += 2) {
 1983                 if (smap->base < physmap[i + 1]) {
 1984                         if (smap->base + smap->length <= physmap[i]) {
 1985                                 insert_idx = i;
 1986                                 break;
 1987                         }
 1988                         if (boothowto & RB_VERBOSE)
 1989                                 printf(
 1990                     "Overlapping memory regions, ignoring second region\n");
 1991                         return (1);
 1992                 }
 1993         }
 1994 
 1995         /* See if we can prepend to the next entry. */
 1996         if (insert_idx <= physmap_idx &&
 1997             smap->base + smap->length == physmap[insert_idx]) {
 1998                 physmap[insert_idx] = smap->base;
 1999                 return (1);
 2000         }
 2001 
 2002         /* See if we can append to the previous entry. */
 2003         if (insert_idx > 0 && smap->base == physmap[insert_idx - 1]) {
 2004                 physmap[insert_idx - 1] += smap->length;
 2005                 return (1);
 2006         }
 2007 
 2008         physmap_idx += 2;
 2009         *physmap_idxp = physmap_idx;
 2010         if (physmap_idx == PHYSMAP_SIZE) {
 2011                 printf(
 2012                 "Too many segments in the physical address map, giving up\n");
 2013                 return (0);
 2014         }
 2015 
 2016         /*
 2017          * Move the last 'N' entries down to make room for the new
 2018          * entry if needed.
 2019          */
 2020         for (i = physmap_idx; i > insert_idx; i -= 2) {
 2021                 physmap[i] = physmap[i - 2];
 2022                 physmap[i + 1] = physmap[i - 1];
 2023         }
 2024 
 2025         /* Insert the new entry. */
 2026         physmap[insert_idx] = smap->base;
 2027         physmap[insert_idx + 1] = smap->base + smap->length;
 2028         return (1);
 2029 }
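/*
 * Editor's note: an illustrative trace of the logic above (hypothetical
 * SMAP values).  physmap[] holds base/bound pairs; adjacent regions are
 * coalesced rather than inserted:
 *
 *	base=0x0000000 len=0x009f000 -> physmap[0..1] = { 0x0, 0x9f000 }
 *	base=0x0100000 len=0x7f00000 -> physmap[2..3] = { 0x100000, 0x8000000 }
 *	base=0x8000000 len=0x1000000 -> physmap[3] = 0x9000000 (appended)
 */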
 2030 
 2031 static void
 2032 basemem_setup(void)
 2033 {
 2034         vm_paddr_t pa;
 2035         pt_entry_t *pte;
 2036         int i;
 2037 
 2038         if (basemem > 640) {
 2039                 printf("Preposterous BIOS basemem of %uK, truncating to 640K\n",
 2040                         basemem);
 2041                 basemem = 640;
 2042         }
 2043 
 2044         /*
 2045          * XXX if biosbasemem is now < 640, there is a `hole'
 2046          * between the end of base memory and the start of
 2047          * ISA memory.  The hole may be empty or it may
 2048          * contain BIOS code or data.  Map it read/write so
 2049          * that the BIOS can write to it.  (Memory from 0 to
 2050          * the physical end of the kernel is mapped read-only
 2051          * to begin with and then parts of it are remapped.
 2052          * The parts that aren't remapped form holes that
 2053          * remain read-only and are unused by the kernel.
 2054          * The base memory area is below the physical end of
 2055          * the kernel and right now forms a read-only hole.
 2056          * The part of it from PAGE_SIZE to
 2057          * (trunc_page(biosbasemem * 1024) - 1) will be
 2058          * remapped and used by the kernel later.)
 2059          *
 2060          * This code is similar to the code used in
 2061          * pmap_mapdev, but since no memory needs to be
 2062          * allocated we simply change the mapping.
 2063          */
 2064         for (pa = trunc_page(basemem * 1024);
 2065              pa < ISA_HOLE_START; pa += PAGE_SIZE)
 2066                 pmap_kenter(KERNBASE + pa, pa);
 2067 
 2068         /*
 2069          * Map pages between basemem and ISA_HOLE_START, if any, r/w into
 2070          * the vm86 page table so that vm86 can scribble on them using
 2071          * the vm86 map too.  XXX: why 2 ways for this and only 1 way for
 2072          * page 0, at least as initialized here?
 2073          */
 2074         pte = (pt_entry_t *)vm86paddr;
 2075         for (i = basemem / 4; i < 160; i++)
 2076                 pte[i] = (i << PAGE_SHIFT) | PG_V | PG_RW | PG_U;
 2077 }
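/*
 * Editor's note: a worked example of the loop above, assuming the BIOS
 * reported basemem = 512 (512K).  basemem / 4 = 128, so PTEs 128..159,
 * covering physical 512K up to the 640K ISA hole, are entered into the
 * vm86 page table; with a full 640K of base memory the loop body never
 * runs.
 */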
 2078 #endif
 2079 
 2080 /*
 2081  * Populate the (physmap) array with base/bound pairs describing the
 2082  * available physical memory in the system, then test this memory and
 2083  * build the phys_avail array describing the actually-available memory.
 2084  *
 2085  * If we cannot accurately determine the physical memory map, then use
 2086  * value from the 0xE801 call, and failing that, the RTC.
 2087  *
 2088  * Total memory size may be set by the kernel environment variable
 2089  * hw.physmem or the compile-time define MAXMEM.
 2090  *
 2091  * XXX first should be vm_paddr_t.
 2092  */
 2093 static void
 2094 getmemsize(int first)
 2095 {
 2096         int has_smap, off, physmap_idx, pa_indx, da_indx;
 2097         u_long physmem_tunable, memtest;
 2098         vm_paddr_t physmap[PHYSMAP_SIZE];
 2099         pt_entry_t *pte;
 2100         quad_t dcons_addr, dcons_size;
 2101 #ifndef XEN
 2102         int hasbrokenint12, i;
 2103         u_int extmem;
 2104         struct vm86frame vmf;
 2105         struct vm86context vmc;
 2106         vm_paddr_t pa;
 2107         struct bios_smap *smap, *smapbase, *smapend;
 2108         u_int32_t smapsize;
 2109         caddr_t kmdp;
 2110 #endif
 2111 
 2112         has_smap = 0;
 2113 #if defined(XEN)
 2114         Maxmem = xen_start_info->nr_pages - init_first;
 2115         physmem = Maxmem;
 2116         basemem = 0;
 2117         physmap[0] = init_first << PAGE_SHIFT;
 2118         physmap[1] = ptoa(Maxmem) - round_page(msgbufsize);
 2119         physmap_idx = 0;
 2120 #else
 2121 #ifdef XBOX
 2122         if (arch_i386_is_xbox) {
 2123                 /*
 2124                  * We queried the memory size before, so chop off 4MB for
 2125                  * the framebuffer and inform the OS of this.
 2126                  */
 2127                 physmap[0] = 0;
 2128                 physmap[1] = (arch_i386_xbox_memsize * 1024 * 1024) - XBOX_FB_SIZE;
 2129                 physmap_idx = 0;
 2130                 goto physmap_done;
 2131         }
 2132 #endif
 2133         bzero(&vmf, sizeof(vmf));
 2134         bzero(physmap, sizeof(physmap));
 2135         basemem = 0;
 2136 
 2137         /*
 2138          * Check if the loader supplied an SMAP memory map.  If so,
 2139          * use that and do not make any VM86 calls.
 2140          */
 2141         physmap_idx = 0;
 2142         smapbase = NULL;
 2143         kmdp = preload_search_by_type("elf kernel");
 2144         if (kmdp == NULL)
 2145                 kmdp = preload_search_by_type("elf32 kernel");
 2146         if (kmdp != NULL)
 2147                 smapbase = (struct bios_smap *)preload_search_info(kmdp,
 2148                     MODINFO_METADATA | MODINFOMD_SMAP);
 2149         if (smapbase != NULL) {
 2150                 /*
 2151                  * subr_module.c says:
 2152                  * "Consumer may safely assume that size value precedes data."
 2153                  * ie: an int32_t immediately precedes SMAP.
 2154                  */
 2155                 smapsize = *((u_int32_t *)smapbase - 1);
 2156                 smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize);
 2157                 has_smap = 1;
 2158 
 2159                 for (smap = smapbase; smap < smapend; smap++)
 2160                         if (!add_smap_entry(smap, physmap, &physmap_idx))
 2161                                 break;
 2162                 goto have_smap;
 2163         }
 2164 
 2165         /*
 2166          * Some newer BIOSes have a broken INT 12H implementation
 2167          * which causes a kernel panic immediately.  In this case, we
 2168          * need to use the SMAP to determine the base memory size.
 2169          */
 2170         hasbrokenint12 = 0;
 2171         TUNABLE_INT_FETCH("hw.hasbrokenint12", &hasbrokenint12);
 2172         if (hasbrokenint12 == 0) {
 2173                 /* Use INT12 to determine base memory size. */
 2174                 vm86_intcall(0x12, &vmf);
 2175                 basemem = vmf.vmf_ax;
 2176                 basemem_setup();
 2177         }
 2178 
 2179         /*
 2180          * Fetch the memory map with INT 15:E820.  Map page 1 R/W into
 2181          * the kernel page table so we can use it as a buffer.  The
 2182          * kernel will unmap this page later.
 2183          */
 2184         pmap_kenter(KERNBASE + (1 << PAGE_SHIFT), 1 << PAGE_SHIFT);
 2185         vmc.npages = 0;
 2186         smap = (void *)vm86_addpage(&vmc, 1, KERNBASE + (1 << PAGE_SHIFT));
 2187         vm86_getptr(&vmc, (vm_offset_t)smap, &vmf.vmf_es, &vmf.vmf_di);
 2188 
 2189         vmf.vmf_ebx = 0;
 2190         do {
 2191                 vmf.vmf_eax = 0xE820;
 2192                 vmf.vmf_edx = SMAP_SIG;
 2193                 vmf.vmf_ecx = sizeof(struct bios_smap);
 2194                 i = vm86_datacall(0x15, &vmf, &vmc);
 2195                 if (i || vmf.vmf_eax != SMAP_SIG)
 2196                         break;
 2197                 has_smap = 1;
 2198                 if (!add_smap_entry(smap, physmap, &physmap_idx))
 2199                         break;
 2200         } while (vmf.vmf_ebx != 0);
 2201 
 2202 have_smap:
 2203         /*
 2204          * If we didn't fetch the "base memory" size from INT12,
 2205          * figure it out from the SMAP (or just guess).
 2206          */
 2207         if (basemem == 0) {
 2208                 for (i = 0; i <= physmap_idx; i += 2) {
 2209                         if (physmap[i] == 0x00000000) {
 2210                                 basemem = physmap[i + 1] / 1024;
 2211                                 break;
 2212                         }
 2213                 }
 2214 
 2215                 /* XXX: If we couldn't find basemem from SMAP, just guess. */
 2216                 if (basemem == 0)
 2217                         basemem = 640;
 2218                 basemem_setup();
 2219         }
 2220 
 2221         if (physmap[1] != 0)
 2222                 goto physmap_done;
 2223 
 2224         /*
 2225          * If we failed to find an SMAP, figure out the extended
 2226          * memory size.  We will then build a simple memory map with
 2227          * two segments, one for "base memory" and the second for
 2228          * "extended memory".  Note that "extended memory" starts at a
 2229          * physical address of 1MB and that both basemem and extmem
 2230          * are in units of 1KB.
 2231          *
 2232          * First, try to fetch the extended memory size via INT 15:E801.
 2233          */
 2234         vmf.vmf_ax = 0xE801;
 2235         if (vm86_intcall(0x15, &vmf) == 0) {
 2236                 extmem = vmf.vmf_cx + vmf.vmf_dx * 64;
 2237         } else {
 2238                 /*
 2239                  * If INT15:E801 fails, this is our last ditch effort
 2240                  * to determine the extended memory size.  Currently
 2241                  * we prefer the RTC value over INT15:88.
 2242                  */
 2243 #if 0
 2244                 vmf.vmf_ah = 0x88;
 2245                 vm86_intcall(0x15, &vmf);
 2246                 extmem = vmf.vmf_ax;
 2247 #else
 2248                 extmem = rtcin(RTC_EXTLO) + (rtcin(RTC_EXTHI) << 8);
 2249 #endif
 2250         }
 2251 
 2252         /*
 2253          * Special hack for chipsets that still remap the 384k hole when
 2254          * there's 16MB of memory - this really confuses people that
 2255          * are trying to use bus mastering ISA controllers with the
 2256          * "16MB limit"; they only have 16MB, but the remapping puts
 2257          * them beyond the limit.
 2258          *
 2259          * If extended memory is between 15-16MB (16-17MB phys address range),
 2260          *      chop it to 15MB.
 2261          */
 2262         if ((extmem > 15 * 1024) && (extmem < 16 * 1024))
 2263                 extmem = 15 * 1024;
 2264 
 2265         physmap[0] = 0;
 2266         physmap[1] = basemem * 1024;
 2267         physmap_idx = 2;
 2268         physmap[physmap_idx] = 0x100000;
 2269         physmap[physmap_idx + 1] = physmap[physmap_idx] + extmem * 1024;
 2270 
 2271 physmap_done:
 2272 #endif  
 2273         /*
 2274          * Now, physmap contains a map of physical memory.
 2275          */
 2276 
 2277 #ifdef SMP
 2278         /* make hole for AP bootstrap code */
 2279         physmap[1] = mp_bootaddress(physmap[1]);
 2280 #endif
 2281 
 2282         /*
 2283          * Maxmem isn't the "maximum memory", it's one larger than the
 2284          * highest page of the physical address space.  It should be
 2285          * called something like "Maxphyspage".  We may adjust this 
 2286          * based on ``hw.physmem'' and the results of the memory test.
 2287          */
 2288         Maxmem = atop(physmap[physmap_idx + 1]);
 2289 
 2290 #ifdef MAXMEM
 2291         Maxmem = MAXMEM / 4;
 2292 #endif
 2293 
 2294         if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable))
 2295                 Maxmem = atop(physmem_tunable);
 2296 
 2297         /*
 2298          * If we have an SMAP, don't allow MAXMEM or hw.physmem to extend
 2299          * the amount of memory in the system.
 2300          */
 2301         if (has_smap && Maxmem > atop(physmap[physmap_idx + 1]))
 2302                 Maxmem = atop(physmap[physmap_idx + 1]);
 2303 
 2304         /*
 2305          * By default enable the memory test on real hardware, and disable
 2306          * it if we appear to be running in a VM.  This avoids touching all
 2307          * pages unnecessarily, which doesn't matter on real hardware but is
 2308          * bad for shared VM hosts.  Use a general name so that
 2309          * one could eventually do more with the code than just disable it.
 2310          */
 2311         memtest = (vm_guest > VM_GUEST_NO) ? 0 : 1;
 2312         TUNABLE_ULONG_FETCH("hw.memtest.tests", &memtest);
 2313 
 2314         if (atop(physmap[physmap_idx + 1]) != Maxmem &&
 2315             (boothowto & RB_VERBOSE))
 2316                 printf("Physical memory use set to %ldK\n", Maxmem * 4);
 2317 
 2318         /*
 2319          * If Maxmem has been increased beyond what the system has detected,
 2320          * extend the last memory segment to the new limit.
 2321          */ 
 2322         if (atop(physmap[physmap_idx + 1]) < Maxmem)
 2323                 physmap[physmap_idx + 1] = ptoa((vm_paddr_t)Maxmem);
 2324 
 2325         /* call pmap initialization to make new kernel address space */
 2326         pmap_bootstrap(first);
 2327 
 2328         /*
 2329          * Size up each available chunk of physical memory.
 2330          */
 2331         physmap[0] = PAGE_SIZE;         /* mask off page 0 */
 2332         pa_indx = 0;
 2333         da_indx = 1;
 2334         phys_avail[pa_indx++] = physmap[0];
 2335         phys_avail[pa_indx] = physmap[0];
 2336         dump_avail[da_indx] = physmap[0];
 2337         pte = CMAP1;
 2338 
 2339         /*
 2340          * Get dcons buffer address
 2341          */
 2342         if (getenv_quad("dcons.addr", &dcons_addr) == 0 ||
 2343             getenv_quad("dcons.size", &dcons_size) == 0)
 2344                 dcons_addr = 0;
 2345 
 2346 #ifndef XEN
 2347         /*
 2348          * physmap is in bytes, so when converting to page boundaries,
 2349          * round up the start address and round down the end address.
 2350          */
 2351         for (i = 0; i <= physmap_idx; i += 2) {
 2352                 vm_paddr_t end;
 2353 
 2354                 end = ptoa((vm_paddr_t)Maxmem);
 2355                 if (physmap[i + 1] < end)
 2356                         end = trunc_page(physmap[i + 1]);
 2357                 for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) {
 2358                         int tmp, page_bad, full;
 2359                         int *ptr = (int *)CADDR1;
 2360 
 2361                         full = FALSE;
 2362                         /*
 2363                          * block out kernel memory as not available.
 2364                          */
 2365                         if (pa >= KERNLOAD && pa < first)
 2366                                 goto do_dump_avail;
 2367 
 2368                         /*
 2369                          * block out dcons buffer
 2370                          */
 2371                         if (dcons_addr > 0
 2372                             && pa >= trunc_page(dcons_addr)
 2373                             && pa < dcons_addr + dcons_size)
 2374                                 goto do_dump_avail;
 2375 
 2376                         page_bad = FALSE;
 2377                         if (memtest == 0)
 2378                                 goto skip_memtest;
 2379 
 2380                         /*
 2381                          * map page into kernel: valid, read/write,non-cacheable
 2382                          */
 2383                         *pte = pa | PG_V | PG_RW | PG_N;
 2384                         invltlb();
 2385 
 2386                         tmp = *(int *)ptr;
 2387                         /*
 2388                          * Test for alternating 1's and 0's
 2389                          */
 2390                         *(volatile int *)ptr = 0xaaaaaaaa;
 2391                         if (*(volatile int *)ptr != 0xaaaaaaaa)
 2392                                 page_bad = TRUE;
 2393                         /*
 2394                          * Test for alternating 0's and 1's
 2395                          */
 2396                         *(volatile int *)ptr = 0x55555555;
 2397                         if (*(volatile int *)ptr != 0x55555555)
 2398                                 page_bad = TRUE;
 2399                         /*
 2400                          * Test for all 1's
 2401                          */
 2402                         *(volatile int *)ptr = 0xffffffff;
 2403                         if (*(volatile int *)ptr != 0xffffffff)
 2404                                 page_bad = TRUE;
 2405                         /*
 2406                          * Test for all 0's
 2407                          */
 2408                         *(volatile int *)ptr = 0x0;
 2409                         if (*(volatile int *)ptr != 0x0)
 2410                                 page_bad = TRUE;
 2411                         /*
 2412                          * Restore original value.
 2413                          */
 2414                         *(int *)ptr = tmp;
 2415 
 2416 skip_memtest:
 2417                         /*
 2418                          * Adjust array of valid/good pages.
 2419                          */
 2420                         if (page_bad == TRUE)
 2421                                 continue;
 2422                         /*
 2423                          * If this good page is a continuation of the
 2424                          * previous set of good pages, then just increase
 2425                          * the end pointer. Otherwise start a new chunk.
 2426                          * Note that "end" points one page past the
 2427                          * last page of the chunk, making the range
 2428                          * >= start and < end.  If we're also doing a
 2429                          * speculative memory test and we are at or past
 2430                          * the end, bump up Maxmem so that we keep going.
 2431                          * The first bad page will terminate the loop.
 2432                          */
 2433                         if (phys_avail[pa_indx] == pa) {
 2434                                 phys_avail[pa_indx] += PAGE_SIZE;
 2435                         } else {
 2436                                 pa_indx++;
 2437                                 if (pa_indx == PHYS_AVAIL_ARRAY_END) {
 2438                                         printf(
 2439                 "Too many holes in the physical address space, giving up\n");
 2440                                         pa_indx--;
 2441                                         full = TRUE;
 2442                                         goto do_dump_avail;
 2443                                 }
 2444                                 phys_avail[pa_indx++] = pa;     /* start */
 2445                                 phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */
 2446                         }
 2447                         physmem++;
 2448 do_dump_avail:
 2449                         if (dump_avail[da_indx] == pa) {
 2450                                 dump_avail[da_indx] += PAGE_SIZE;
 2451                         } else {
 2452                                 da_indx++;
 2453                                 if (da_indx == DUMP_AVAIL_ARRAY_END) {
 2454                                         da_indx--;
 2455                                         goto do_next;
 2456                                 }
 2457                                 dump_avail[da_indx++] = pa;     /* start */
 2458                                 dump_avail[da_indx] = pa + PAGE_SIZE; /* end */
 2459                         }
 2460 do_next:
 2461                         if (full)
 2462                                 break;
 2463                 }
 2464         }
 2465         *pte = 0;
 2466         invltlb();
 2467 #else
 2468         phys_avail[0] = physfree;
 2469         phys_avail[1] = xen_start_info->nr_pages*PAGE_SIZE;
 2470         dump_avail[0] = 0;      
 2471         dump_avail[1] = xen_start_info->nr_pages*PAGE_SIZE;
 2472         
 2473 #endif
 2474         
 2475         /*
 2476          * XXX
 2477          * The last chunk must contain at least one page plus the message
 2478          * buffer to avoid complicating other code (message buffer address
 2479          * calculation, etc.).
 2480          */
 2481         while (phys_avail[pa_indx - 1] + PAGE_SIZE +
 2482             round_page(msgbufsize) >= phys_avail[pa_indx]) {
 2483                 physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]);
 2484                 phys_avail[pa_indx--] = 0;
 2485                 phys_avail[pa_indx--] = 0;
 2486         }
 2487 
 2488         Maxmem = atop(phys_avail[pa_indx]);
 2489 
 2490         /* Trim off space for the message buffer. */
 2491         phys_avail[pa_indx] -= round_page(msgbufsize);
 2492 
 2493         /* Map the message buffer. */
 2494         for (off = 0; off < round_page(msgbufsize); off += PAGE_SIZE)
 2495                 pmap_kenter((vm_offset_t)msgbufp + off, phys_avail[pa_indx] +
 2496                     off);
 2497 
 2498         PT_UPDATES_FLUSH();
 2499 }
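/*
 * Editor's note: an illustrative final layout (hypothetical 128MB
 * machine, 64K message buffer; real chunk boundaries depend on the
 * kernel load address and any SMAP holes):
 *
 *	phys_avail[0..1] = { 0x1000, 0x9f000 }     page 1 .. base memory
 *	phys_avail[2..3] = { first, 0x7ff0000 }    end of kernel data ..
 *						   Maxmem less msgbuf pages
 */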
 2500 
 2501 #ifdef XEN
 2502 #define MTOPSIZE (1<<(14 + PAGE_SHIFT))
 2503 
 2504 void
 2505 init386(first)
 2506         int first;
 2507 {
 2508         unsigned long gdtmachpfn;
 2509         int error, gsel_tss, metadata_missing, x, pa;
 2510         struct pcpu *pc;
 2511         struct callback_register event = {
 2512                 .type = CALLBACKTYPE_event,
 2513                 .address = {GSEL(GCODE_SEL, SEL_KPL), (unsigned long)Xhypervisor_callback },
 2514         };
 2515         struct callback_register failsafe = {
 2516                 .type = CALLBACKTYPE_failsafe,
 2517                 .address = {GSEL(GCODE_SEL, SEL_KPL), (unsigned long)failsafe_callback },
 2518         };
 2519 
 2520         thread0.td_kstack = proc0kstack;
 2521         thread0.td_pcb = (struct pcb *)
 2522            (thread0.td_kstack + KSTACK_PAGES * PAGE_SIZE) - 1;
 2523 
 2524         /*
 2525          * This may be done better later if it gets more high level
 2526          * components in it. If so just link td->td_proc here.
 2527          */
 2528         proc_linkup0(&proc0, &thread0);
 2529 
 2530         metadata_missing = 0;
 2531         if (xen_start_info->mod_start) {
 2532                 preload_metadata = (caddr_t)xen_start_info->mod_start;
 2533                 preload_bootstrap_relocate(KERNBASE);
 2534         } else {
 2535                 metadata_missing = 1;
 2536         }
 2537         if (envmode == 1)
 2538                 kern_envp = static_env;
 2539         else if ((caddr_t)xen_start_info->cmd_line)
 2540                 kern_envp = xen_setbootenv((caddr_t)xen_start_info->cmd_line);
 2541 
 2542         boothowto |= xen_boothowto(kern_envp);
 2543         
 2544         /* Init basic tunables, hz etc */
 2545         init_param1();
 2546 
 2547         /*
 2548          * XEN occupies a portion of the upper virtual address space.
 2549          * At its base it manages an array mapping machine page frames
 2550          * to physical page frames - hence we need to be able to
 2551          * access 4GB - (64MB - 4MB + 64k).
 2552          */
 2553         gdt_segs[GPRIV_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE);
 2554         gdt_segs[GUFS_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE);
 2555         gdt_segs[GUGS_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE);
 2556         gdt_segs[GCODE_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE);
 2557         gdt_segs[GDATA_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE);
 2558         gdt_segs[GUCODE_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE);
 2559         gdt_segs[GUDATA_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE);
 2560         gdt_segs[GBIOSLOWMEM_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE);
 2561 
 2562         pc = &__pcpu[0];
 2563         gdt_segs[GPRIV_SEL].ssd_base = (int) pc;
 2564         gdt_segs[GPROC0_SEL].ssd_base = (int) &pc->pc_common_tss;
 2565 
 2566         PT_SET_MA(gdt, xpmap_ptom(VTOP(gdt)) | PG_V | PG_RW);
 2567         bzero(gdt, PAGE_SIZE);
 2568         for (x = 0; x < NGDT; x++)
 2569                 ssdtosd(&gdt_segs[x], &gdt[x].sd);
 2570 
 2571         mtx_init(&dt_lock, "descriptor tables", NULL, MTX_SPIN);
 2572 
 2573         gdtmachpfn = vtomach(gdt) >> PAGE_SHIFT;
 2574         PT_SET_MA(gdt, xpmap_ptom(VTOP(gdt)) | PG_V);
 2575         PANIC_IF(HYPERVISOR_set_gdt(&gdtmachpfn, 512) != 0);    
 2576         lgdt(&r_gdt);
 2577         gdtset = 1;
 2578 
 2579         if ((error = HYPERVISOR_set_trap_table(trap_table)) != 0) {
 2580                 panic("set_trap_table failed - error %d\n", error);
 2581         }
 2582         
 2583         error = HYPERVISOR_callback_op(CALLBACKOP_register, &event);
 2584         if (error == 0)
 2585                 error = HYPERVISOR_callback_op(CALLBACKOP_register, &failsafe);
 2586 #if     CONFIG_XEN_COMPAT <= 0x030002
 2587         if (error == -ENOXENSYS)
 2588                 HYPERVISOR_set_callbacks(GSEL(GCODE_SEL, SEL_KPL),
 2589                     (unsigned long)Xhypervisor_callback,
 2590                     GSEL(GCODE_SEL, SEL_KPL), (unsigned long)failsafe_callback);
 2591 #endif
 2592         pcpu_init(pc, 0, sizeof(struct pcpu));
 2593         for (pa = first; pa < first + DPCPU_SIZE; pa += PAGE_SIZE)
 2594                 pmap_kenter(pa + KERNBASE, pa);
 2595         dpcpu_init((void *)(first + KERNBASE), 0);
 2596         first += DPCPU_SIZE;
 2597         physfree += DPCPU_SIZE;
 2598         init_first += DPCPU_SIZE / PAGE_SIZE;
 2599 
 2600         PCPU_SET(prvspace, pc);
 2601         PCPU_SET(curthread, &thread0);
 2602         PCPU_SET(curpcb, thread0.td_pcb);
 2603 
 2604         /*
 2605          * Initialize mutexes.
 2606          *
 2607          * icu_lock: in order to allow an interrupt to occur in a critical
 2608          *           section, to set pcpu->ipending (etc...) properly, we
 2609          *           must be able to get the icu lock, so it can't be
 2610          *           under witness.
 2611          */
 2612         mutex_init();
 2613         mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS | MTX_NOPROFILE);
 2614 
 2615         /* make ldt memory segments */
 2616         PT_SET_MA(ldt, xpmap_ptom(VTOP(ldt)) | PG_V | PG_RW);
 2617         bzero(ldt, PAGE_SIZE);
 2618         ldt_segs[LUCODE_SEL].ssd_limit = atop(0 - 1);
 2619         ldt_segs[LUDATA_SEL].ssd_limit = atop(0 - 1);
 2620         for (x = 0; x < sizeof ldt_segs / sizeof ldt_segs[0]; x++)
 2621                 ssdtosd(&ldt_segs[x], &ldt[x].sd);
 2622 
 2623         default_proc_ldt.ldt_base = (caddr_t)ldt;
 2624         default_proc_ldt.ldt_len = 6;
 2625         _default_ldt = (int)&default_proc_ldt;
 2626         PCPU_SET(currentldt, _default_ldt);
 2627         PT_SET_MA(ldt, *vtopte((unsigned long)ldt) & ~PG_RW);
 2628         xen_set_ldt((unsigned long) ldt, (sizeof ldt_segs / sizeof ldt_segs[0]));
 2629         
 2630 #if defined(XEN_PRIVILEGED)
 2631         /*
 2632          * Initialize the i8254 before the console so that console
 2633          * initialization can use DELAY().
 2634          */
 2635         i8254_init();
 2636 #endif
 2637         
 2638         /*
 2639          * Initialize the console before we print anything out.
 2640          */
 2641         cninit();
 2642 
 2643         if (metadata_missing)
 2644                 printf("WARNING: loader(8) metadata is missing!\n");
 2645 
 2646 #ifdef DEV_ISA
 2647         elcr_probe();
 2648         atpic_startup();
 2649 #endif
 2650 
 2651 #ifdef DDB
 2652         ksym_start = bootinfo.bi_symtab;
 2653         ksym_end = bootinfo.bi_esymtab;
 2654 #endif
 2655 
 2656         kdb_init();
 2657 
 2658 #ifdef KDB
 2659         if (boothowto & RB_KDB)
 2660                 kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger");
 2661 #endif
 2662 
 2663         finishidentcpu();       /* Final stage of CPU initialization */
 2664         setidt(IDT_UD, &IDTVEC(ill),  SDT_SYS386TGT, SEL_KPL,
 2665             GSEL(GCODE_SEL, SEL_KPL));
 2666         setidt(IDT_GP, &IDTVEC(prot),  SDT_SYS386TGT, SEL_KPL,
 2667             GSEL(GCODE_SEL, SEL_KPL));
 2668         initializecpu();        /* Initialize CPU registers */
 2669 
 2670         /* make an initial tss so cpu can get interrupt stack on syscall! */
 2671         /* Note: -16 is so we can grow the trapframe if we came from vm86 */
 2672         PCPU_SET(common_tss.tss_esp0, thread0.td_kstack +
 2673             KSTACK_PAGES * PAGE_SIZE - sizeof(struct pcb) - 16);
 2674         PCPU_SET(common_tss.tss_ss0, GSEL(GDATA_SEL, SEL_KPL));
 2675         gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
 2676         HYPERVISOR_stack_switch(GSEL(GDATA_SEL, SEL_KPL),
 2677             PCPU_GET(common_tss.tss_esp0));
 2678         
 2679         /* pointer to selector slot for %fs/%gs */
 2680         PCPU_SET(fsgs_gdt, &gdt[GUFS_SEL].sd);
 2681 
 2682         dblfault_tss.tss_esp = dblfault_tss.tss_esp0 = dblfault_tss.tss_esp1 =
 2683             dblfault_tss.tss_esp2 = (int)&dblfault_stack[sizeof(dblfault_stack)];
 2684         dblfault_tss.tss_ss = dblfault_tss.tss_ss0 = dblfault_tss.tss_ss1 =
 2685             dblfault_tss.tss_ss2 = GSEL(GDATA_SEL, SEL_KPL);
 2686 #ifdef PAE
 2687         dblfault_tss.tss_cr3 = (int)IdlePDPT;
 2688 #else
 2689         dblfault_tss.tss_cr3 = (int)IdlePTD;
 2690 #endif
 2691         dblfault_tss.tss_eip = (int)dblfault_handler;
 2692         dblfault_tss.tss_eflags = PSL_KERNEL;
 2693         dblfault_tss.tss_ds = dblfault_tss.tss_es =
 2694             dblfault_tss.tss_gs = GSEL(GDATA_SEL, SEL_KPL);
 2695         dblfault_tss.tss_fs = GSEL(GPRIV_SEL, SEL_KPL);
 2696         dblfault_tss.tss_cs = GSEL(GCODE_SEL, SEL_KPL);
 2697         dblfault_tss.tss_ldt = GSEL(GLDT_SEL, SEL_KPL);
 2698 
 2699         vm86_initialize();
 2700         getmemsize(first);
 2701         init_param2(physmem);
 2702 
 2703         /* now running on new page tables, configured, and u/iom is accessible */
 2704 
 2705         msgbufinit(msgbufp, msgbufsize);
 2706         /* transfer to user mode */
 2707 
 2708         _ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
 2709         _udatasel = GSEL(GUDATA_SEL, SEL_UPL);
 2710 
 2711         /* setup proc 0's pcb */
 2712         thread0.td_pcb->pcb_flags = 0;
 2713 #ifdef PAE
 2714         thread0.td_pcb->pcb_cr3 = (int)IdlePDPT;
 2715 #else
 2716         thread0.td_pcb->pcb_cr3 = (int)IdlePTD;
 2717 #endif
 2718         thread0.td_pcb->pcb_ext = 0;
 2719         thread0.td_frame = &proc0_tf;
 2720         thread0.td_pcb->pcb_fsd = PCPU_GET(fsgs_gdt)[0];
 2721         thread0.td_pcb->pcb_gsd = PCPU_GET(fsgs_gdt)[1];
 2722 
 2723 #if defined(XEN_PRIVILEGED)
 2724         if (cpu_probe_amdc1e())
 2725                 cpu_idle_fn = cpu_idle_amdc1e;
 2726 #endif
 2727 }
 2728 
 2729 #else
 2730 void
 2731 init386(first)
 2732         int first;
 2733 {
 2734         struct gate_descriptor *gdp;
 2735         int gsel_tss, metadata_missing, x, pa;
 2736         struct pcpu *pc;
 2737 
 2738         thread0.td_kstack = proc0kstack;
 2739         thread0.td_pcb = (struct pcb *)
 2740            (thread0.td_kstack + KSTACK_PAGES * PAGE_SIZE) - 1;
 2741 
 2742         /*
 2743          * This may be done better later if it gets more high level
 2744          * components in it. If so just link td->td_proc here.
 2745          */
 2746         proc_linkup0(&proc0, &thread0);
 2747 
 2748         metadata_missing = 0;
 2749         if (bootinfo.bi_modulep) {
 2750                 preload_metadata = (caddr_t)bootinfo.bi_modulep + KERNBASE;
 2751                 preload_bootstrap_relocate(KERNBASE);
 2752         } else {
 2753                 metadata_missing = 1;
 2754         }
 2755         if (envmode == 1)
 2756                 kern_envp = static_env;
 2757         else if (bootinfo.bi_envp)
 2758                 kern_envp = (caddr_t)bootinfo.bi_envp + KERNBASE;
 2759 
 2760         /* Init basic tunables, hz etc */
 2761         init_param1();
 2762 
 2763         /*
 2764          * Make gdt memory segments.  All segments cover the full 4GB
 2765          * of address space and permissions are enforced at page level.
 2766          */
 2767         gdt_segs[GCODE_SEL].ssd_limit = atop(0 - 1);
 2768         gdt_segs[GDATA_SEL].ssd_limit = atop(0 - 1);
 2769         gdt_segs[GUCODE_SEL].ssd_limit = atop(0 - 1);
 2770         gdt_segs[GUDATA_SEL].ssd_limit = atop(0 - 1);
 2771         gdt_segs[GUFS_SEL].ssd_limit = atop(0 - 1);
 2772         gdt_segs[GUGS_SEL].ssd_limit = atop(0 - 1);
 2773 
 2774         pc = &__pcpu[0];
 2775         gdt_segs[GPRIV_SEL].ssd_limit = atop(0 - 1);
 2776         gdt_segs[GPRIV_SEL].ssd_base = (int) pc;
 2777         gdt_segs[GPROC0_SEL].ssd_base = (int) &pc->pc_common_tss;
 2778 
 2779         for (x = 0; x < NGDT; x++)
 2780                 ssdtosd(&gdt_segs[x], &gdt[x].sd);
 2781 
 2782         r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
 2783         r_gdt.rd_base =  (int) gdt;
 2784         mtx_init(&dt_lock, "descriptor tables", NULL, MTX_SPIN);
 2785         lgdt(&r_gdt);
 2786 
 2787         pcpu_init(pc, 0, sizeof(struct pcpu));
 2788         for (pa = first; pa < first + DPCPU_SIZE; pa += PAGE_SIZE)
 2789                 pmap_kenter(pa + KERNBASE, pa);
 2790         dpcpu_init((void *)(first + KERNBASE), 0);
 2791         first += DPCPU_SIZE;
 2792         PCPU_SET(prvspace, pc);
 2793         PCPU_SET(curthread, &thread0);
 2794         PCPU_SET(curpcb, thread0.td_pcb);
 2795 
 2796         /*
 2797          * Initialize mutexes.
 2798          *
 2799          * icu_lock: in order to allow an interrupt to occur in a critical
 2800          *           section, to set pcpu->ipending (etc...) properly, we
 2801          *           must be able to get the icu lock, so it can't be
 2802          *           under witness.
 2803          */
 2804         mutex_init();
 2805         mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS | MTX_NOPROFILE);
 2806 
 2807         /* make ldt memory segments */
 2808         ldt_segs[LUCODE_SEL].ssd_limit = atop(0 - 1);
 2809         ldt_segs[LUDATA_SEL].ssd_limit = atop(0 - 1);
 2810         for (x = 0; x < sizeof ldt_segs / sizeof ldt_segs[0]; x++)
 2811                 ssdtosd(&ldt_segs[x], &ldt[x].sd);
 2812 
 2813         _default_ldt = GSEL(GLDT_SEL, SEL_KPL);
 2814         lldt(_default_ldt);
 2815         PCPU_SET(currentldt, _default_ldt);
 2816 
 2817         /* exceptions */
 2818         for (x = 0; x < NIDT; x++)
 2819                 setidt(x, &IDTVEC(rsvd), SDT_SYS386TGT, SEL_KPL,
 2820                     GSEL(GCODE_SEL, SEL_KPL));
 2821         setidt(IDT_DE, &IDTVEC(div),  SDT_SYS386TGT, SEL_KPL,
 2822             GSEL(GCODE_SEL, SEL_KPL));
 2823         setidt(IDT_DB, &IDTVEC(dbg),  SDT_SYS386IGT, SEL_KPL,
 2824             GSEL(GCODE_SEL, SEL_KPL));
 2825         setidt(IDT_NMI, &IDTVEC(nmi),  SDT_SYS386IGT, SEL_KPL,
 2826             GSEL(GCODE_SEL, SEL_KPL));
 2827         setidt(IDT_BP, &IDTVEC(bpt),  SDT_SYS386IGT, SEL_UPL,
 2828             GSEL(GCODE_SEL, SEL_KPL));
 2829         setidt(IDT_OF, &IDTVEC(ofl),  SDT_SYS386TGT, SEL_UPL,
 2830             GSEL(GCODE_SEL, SEL_KPL));
 2831         setidt(IDT_BR, &IDTVEC(bnd),  SDT_SYS386TGT, SEL_KPL,
 2832             GSEL(GCODE_SEL, SEL_KPL));
 2833         setidt(IDT_UD, &IDTVEC(ill),  SDT_SYS386TGT, SEL_KPL,
 2834             GSEL(GCODE_SEL, SEL_KPL));
 2835         setidt(IDT_NM, &IDTVEC(dna),  SDT_SYS386TGT, SEL_KPL,
 2836             GSEL(GCODE_SEL, SEL_KPL));
 2837         setidt(IDT_DF, 0,  SDT_SYSTASKGT, SEL_KPL, GSEL(GPANIC_SEL, SEL_KPL));
 2838         setidt(IDT_FPUGP, &IDTVEC(fpusegm),  SDT_SYS386TGT, SEL_KPL,
 2839             GSEL(GCODE_SEL, SEL_KPL));
 2840         setidt(IDT_TS, &IDTVEC(tss),  SDT_SYS386TGT, SEL_KPL,
 2841             GSEL(GCODE_SEL, SEL_KPL));
 2842         setidt(IDT_NP, &IDTVEC(missing),  SDT_SYS386TGT, SEL_KPL,
 2843             GSEL(GCODE_SEL, SEL_KPL));
 2844         setidt(IDT_SS, &IDTVEC(stk),  SDT_SYS386TGT, SEL_KPL,
 2845             GSEL(GCODE_SEL, SEL_KPL));
 2846         setidt(IDT_GP, &IDTVEC(prot),  SDT_SYS386TGT, SEL_KPL,
 2847             GSEL(GCODE_SEL, SEL_KPL));
 2848         setidt(IDT_PF, &IDTVEC(page),  SDT_SYS386IGT, SEL_KPL,
 2849             GSEL(GCODE_SEL, SEL_KPL));
 2850         setidt(IDT_MF, &IDTVEC(fpu),  SDT_SYS386TGT, SEL_KPL,
 2851             GSEL(GCODE_SEL, SEL_KPL));
 2852         setidt(IDT_AC, &IDTVEC(align), SDT_SYS386TGT, SEL_KPL,
 2853             GSEL(GCODE_SEL, SEL_KPL));
 2854         setidt(IDT_MC, &IDTVEC(mchk),  SDT_SYS386TGT, SEL_KPL,
 2855             GSEL(GCODE_SEL, SEL_KPL));
 2856         setidt(IDT_XF, &IDTVEC(xmm), SDT_SYS386TGT, SEL_KPL,
 2857             GSEL(GCODE_SEL, SEL_KPL));
 2858         setidt(IDT_SYSCALL, &IDTVEC(int0x80_syscall), SDT_SYS386TGT, SEL_UPL,
 2859             GSEL(GCODE_SEL, SEL_KPL));
 2860 #ifdef KDTRACE_HOOKS
 2861         setidt(IDT_DTRACE_RET, &IDTVEC(dtrace_ret), SDT_SYS386TGT, SEL_UPL,
 2862             GSEL(GCODE_SEL, SEL_KPL));
 2863 #endif
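
        /*
         * Editor's note (not in the original source): setidt(vector,
         * handler, gate type, DPL, selector) fills in one IDT entry.  Trap
         * gates (SDT_SYS386TGT) leave interrupts enabled on entry, while
         * interrupt gates (SDT_SYS386IGT) clear PSL_I.  Entries installed
         * with DPL SEL_UPL (IDT_BP, IDT_OF and IDT_SYSCALL above) may be
         * raised from user mode with software interrupts:
         */
#if 0
        /* user-space sketch: both enter the kernel through the IDT */
        __asm__ __volatile__("int3");           /* vector 3, IDT_BP */
        __asm__ __volatile__("int $0x80");      /* IDT_SYSCALL */
#endif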
 2864 
 2865         r_idt.rd_limit = sizeof(idt0) - 1;
 2866         r_idt.rd_base = (int) idt;
 2867         lidt(&r_idt);
 2868 
 2869 #ifdef XBOX
 2870         /*
 2871          * The following code queries the PCI ID of device 0:0:0.  For
 2872          * the XBOX, this should be 0x10de / 0x02a5 (an NVIDIA host bridge).
 2873          *
 2874          * This is exactly what Linux does.
 2875          */
 2876         outl(0xcf8, 0x80000000);
 2877         if (inl(0xcfc) == 0x02a510de) {
 2878                 arch_i386_is_xbox = 1;
 2879                 pic16l_setled(XBOX_LED_GREEN);
 2880 
 2881                 /*
 2882                  * We are an XBOX, but we may have either 64MB or 128MB of
 2883                  * memory. The PCI host bridge should be programmed for this,
 2884                  * so we just query it. 
 2885                  */
 2886                 outl(0xcf8, 0x80000084);
 2887                 arch_i386_xbox_memsize = (inl(0xcfc) == 0x7FFFFFF) ? 128 : 64;
 2888         }
 2889 #endif /* XBOX */
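
        /*
         * Editor's aside (hypothetical file-scope helper, not in the
         * original source): the 0xcf8/0xcfc accesses above are PCI
         * configuration mechanism #1.  A generic 32-bit config read would
         * look something like:
         */
#if 0
static uint32_t
pci_cfg_read32(int bus, int slot, int func, int reg)
{
        outl(0xcf8, 0x80000000u | (bus << 16) | (slot << 11) |
            (func << 8) | (reg & 0xfc));
        return (inl(0xcfc));
}
#endif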
 2890 
 2891         /*
 2892          * Initialize the i8254 before the console so that console
 2893          * initialization can use DELAY().
 2894          */
 2895         i8254_init();
 2896 
 2897         /*
 2898          * Initialize the console before we print anything out.
 2899          */
 2900         cninit();
 2901 
 2902         if (metadata_missing)
 2903                 printf("WARNING: loader(8) metadata is missing!\n");
 2904 
 2905 #ifdef DEV_ISA
 2906         elcr_probe();
 2907         atpic_startup();
 2908 #endif
 2909 
 2910 #ifdef DDB
 2911         ksym_start = bootinfo.bi_symtab;
 2912         ksym_end = bootinfo.bi_esymtab;
 2913 #endif
 2914 
 2915         kdb_init();
 2916 
 2917 #ifdef KDB
 2918         if (boothowto & RB_KDB)
 2919                 kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger");
 2920 #endif
 2921 
 2922         finishidentcpu();       /* Final stage of CPU initialization */
 2923         setidt(IDT_UD, &IDTVEC(ill),  SDT_SYS386TGT, SEL_KPL,
 2924             GSEL(GCODE_SEL, SEL_KPL));
 2925         setidt(IDT_GP, &IDTVEC(prot),  SDT_SYS386TGT, SEL_KPL,
 2926             GSEL(GCODE_SEL, SEL_KPL));
 2927         initializecpu();        /* Initialize CPU registers */
 2928 
 2929         /* Make an initial TSS so the CPU can get the interrupt stack on syscall. */
 2930         /* Note: -16 leaves room to grow the trapframe if we came from vm86 mode. */
 2931         PCPU_SET(common_tss.tss_esp0, thread0.td_kstack +
 2932             KSTACK_PAGES * PAGE_SIZE - sizeof(struct pcb) - 16);
 2933         PCPU_SET(common_tss.tss_ss0, GSEL(GDATA_SEL, SEL_KPL));
 2934         gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
 2935         PCPU_SET(tss_gdt, &gdt[GPROC0_SEL].sd);
 2936         PCPU_SET(common_tssd, *PCPU_GET(tss_gdt));
 2937         PCPU_SET(common_tss.tss_ioopt, (sizeof (struct i386tss)) << 16);
 2938         ltr(gsel_tss);
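
        /*
         * Editor's note (not in the original source): tss_esp0 is the
         * stack pointer the CPU loads on a user-to-kernel transition.  It
         * points just below thread0's PCB at the top of the kernel stack;
         * the extra 16 bytes leave room for the additional segment
         * registers a trapframe grows when returning from vm86 mode.
         * Spelled out:
         */
#if 0
        vm_offset_t top  = thread0.td_kstack + KSTACK_PAGES * PAGE_SIZE;
        vm_offset_t esp0 = top - sizeof(struct pcb) - 16;
#endif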
 2939 
 2940         /* pointer to selector slot for %fs/%gs */
 2941         PCPU_SET(fsgs_gdt, &gdt[GUFS_SEL].sd);
 2942 
 2943         dblfault_tss.tss_esp = dblfault_tss.tss_esp0 = dblfault_tss.tss_esp1 =
 2944             dblfault_tss.tss_esp2 = (int)&dblfault_stack[sizeof(dblfault_stack)];
 2945         dblfault_tss.tss_ss = dblfault_tss.tss_ss0 = dblfault_tss.tss_ss1 =
 2946             dblfault_tss.tss_ss2 = GSEL(GDATA_SEL, SEL_KPL);
 2947 #ifdef PAE
 2948         dblfault_tss.tss_cr3 = (int)IdlePDPT;
 2949 #else
 2950         dblfault_tss.tss_cr3 = (int)IdlePTD;
 2951 #endif
 2952         dblfault_tss.tss_eip = (int)dblfault_handler;
 2953         dblfault_tss.tss_eflags = PSL_KERNEL;
 2954         dblfault_tss.tss_ds = dblfault_tss.tss_es =
 2955             dblfault_tss.tss_gs = GSEL(GDATA_SEL, SEL_KPL);
 2956         dblfault_tss.tss_fs = GSEL(GPRIV_SEL, SEL_KPL);
 2957         dblfault_tss.tss_cs = GSEL(GCODE_SEL, SEL_KPL);
 2958         dblfault_tss.tss_ldt = GSEL(GLDT_SEL, SEL_KPL);
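
        /*
         * Editor's note (not in the original source): double faults are
         * handled via a hardware task switch.  IDT_DF was installed above
         * as a task gate (SDT_SYSTASKGT) through GPANIC_SEL, so on a
         * double fault the CPU loads this whole TSS, giving
         * dblfault_handler() a known-good stack (dblfault_stack) even when
         * the faulting context's stack is unusable.
         */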
 2959 
 2960         vm86_initialize();
 2961         getmemsize(first);
 2962         init_param2(physmem);
 2963 
 2964         /* now running on new page tables, configured, and u/iom is accessible */
 2965 
 2966         msgbufinit(msgbufp, msgbufsize);
 2967 
 2968         /* make a call gate to reenter kernel with */
 2969         gdp = &ldt[LSYS5CALLS_SEL].gd;
 2970 
 2971         x = (int) &IDTVEC(lcall_syscall);
 2972         gdp->gd_looffset = x;
 2973         gdp->gd_selector = GSEL(GCODE_SEL,SEL_KPL);
 2974         gdp->gd_stkcpy = 1;
 2975         gdp->gd_type = SDT_SYS386CGT;
 2976         gdp->gd_dpl = SEL_UPL;
 2977         gdp->gd_p = 1;
 2978         gdp->gd_hioffset = x >> 16;
 2979 
 2980         /* XXX does this work? */
 2981         /* XXX yes! */
 2982         ldt[LBSDICALLS_SEL] = ldt[LSYS5CALLS_SEL];
 2983         ldt[LSOL26CALLS_SEL] = ldt[LSYS5CALLS_SEL];
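
        /*
         * Editor's aside (user-space sketch, not in the original source):
         * the call gate built above implements the historical
         * lcall-through-LDT system call ABI, and the copies just made
         * reuse it for the BSDI and Solaris 2.6 entry points.  Selector 7
         * is LDT entry 0 (LSYS5CALLS_SEL) with TI=1 and RPL=3:
         */
#if 0
        __asm__ __volatile__("lcall $7, $0");   /* enters lcall_syscall */
#endif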
 2984 
 2985         /* transfer to user mode */
 2986 
 2987         _ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
 2988         _udatasel = GSEL(GUDATA_SEL, SEL_UPL);
 2989 
 2990         /* setup proc 0's pcb */
 2991         thread0.td_pcb->pcb_flags = 0;
 2992 #ifdef PAE
 2993         thread0.td_pcb->pcb_cr3 = (int)IdlePDPT;
 2994 #else
 2995         thread0.td_pcb->pcb_cr3 = (int)IdlePTD;
 2996 #endif
 2997         thread0.td_pcb->pcb_ext = 0;
 2998         thread0.td_frame = &proc0_tf;
 2999 
 3000         if (cpu_probe_amdc1e())
 3001                 cpu_idle_fn = cpu_idle_amdc1e;
 3002 }
 3003 #endif
 3004 
 3005 void
 3006 cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size)
 3007 {
 3008 
 3009         pcpu->pc_acpi_id = 0xffffffff;
 3010 }
 3011 
 3012 void
 3013 spinlock_enter(void)
 3014 {
 3015         struct thread *td;
 3016         register_t flags;
 3017 
 3018         td = curthread;
 3019         if (td->td_md.md_spinlock_count == 0) {
 3020                 flags = intr_disable();
 3021                 td->td_md.md_spinlock_count = 1;
 3022                 td->td_md.md_saved_flags = flags;
 3023         } else
 3024                 td->td_md.md_spinlock_count++;
 3025         critical_enter();
 3026 }
 3027 
 3028 void
 3029 spinlock_exit(void)
 3030 {
 3031         struct thread *td;
 3032         register_t flags;
 3033 
 3034         td = curthread;
 3035         critical_exit();
 3036         flags = td->td_md.md_saved_flags;
 3037         td->td_md.md_spinlock_count--;
 3038         if (td->td_md.md_spinlock_count == 0)
 3039                 intr_restore(flags);
 3040 }
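
/*
 * Editor's aside (usage sketch, not in the original source): the pair above
 * nests.  Only the outermost spinlock_enter() disables interrupts, and only
 * the matching outermost spinlock_exit() restores the saved flags:
 */
#if 0
        spinlock_enter();       /* count 0 -> 1: interrupts disabled */
        spinlock_enter();       /* count 1 -> 2: no hardware effect */
        spinlock_exit();        /* count 2 -> 1: still disabled */
        spinlock_exit();        /* count 1 -> 0: saved eflags restored */
#endif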
 3041 
 3042 #if defined(I586_CPU) && !defined(NO_F00F_HACK)
 3043 static void f00f_hack(void *unused);
 3044 SYSINIT(f00f_hack, SI_SUB_INTRINSIC, SI_ORDER_FIRST, f00f_hack, NULL);
 3045 
 3046 static void
 3047 f00f_hack(void *unused)
 3048 {
 3049         struct gate_descriptor *new_idt;
 3050         vm_offset_t tmp;
 3051 
 3052         if (!has_f00f_bug)
 3053                 return;
 3054 
 3055         GIANT_REQUIRED;
 3056 
 3057         printf("Intel Pentium detected, installing workaround for F00F bug\n");
 3058 
 3059         tmp = kmem_alloc(kernel_map, PAGE_SIZE * 2);
 3060         if (tmp == 0)
 3061                 panic("kmem_alloc returned 0");
 3062 
 3063         /* Put the problematic entry (#6) at the end of the lower page. */
 3064         new_idt = (struct gate_descriptor*)
 3065             (tmp + PAGE_SIZE - 7 * sizeof(struct gate_descriptor));
 3066         bcopy(idt, new_idt, sizeof(idt0));
 3067         r_idt.rd_base = (u_int)new_idt;
 3068         lidt(&r_idt);
 3069         idt = new_idt;
 3070         if (vm_map_protect(kernel_map, tmp, tmp + PAGE_SIZE,
 3071                            VM_PROT_READ, FALSE) != KERN_SUCCESS)
 3072                 panic("vm_map_protect failed");
 3073 }
 3074 #endif /* defined(I586_CPU) && !defined(NO_F00F_HACK) */
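
/*
 * Editor's note (not in the original source): the "F00F" erratum lets an
 * unprivileged, invalidly encoded "lock cmpxchg8b" (bytes F0 0F C7 C8) hang
 * original Pentiums.  The workaround above moves the IDT so that the
 * affected descriptor, entry #6 (#UD), ends exactly at the boundary of a
 * page mapped read-only; the fault that results is then recognized and
 * handled in trap() instead of locking up the CPU.  Layout check:
 */
#if 0
        /* new_idt[0..6] occupy the last seven slots of the protected page */
        KASSERT((vm_offset_t)&new_idt[7] == tmp + PAGE_SIZE,
            ("IDT entry 6 must end at the page boundary"));
#endif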
 3075 
 3076 /*
 3077  * Construct a PCB from a trapframe. This is called from kdb_trap() where
 3078  * we want to start a backtrace from the function that caused us to enter
 3079  * the debugger. We have the context in the trapframe, but base the trace
 3080  * on the PCB. The PCB doesn't have to be perfect, as long as it contains
 3081  * enough for a backtrace.
 3082  */
 3083 void
 3084 makectx(struct trapframe *tf, struct pcb *pcb)
 3085 {
 3086 
 3087         pcb->pcb_edi = tf->tf_edi;
 3088         pcb->pcb_esi = tf->tf_esi;
 3089         pcb->pcb_ebp = tf->tf_ebp;
 3090         pcb->pcb_ebx = tf->tf_ebx;
 3091         pcb->pcb_eip = tf->tf_eip;
 3092         pcb->pcb_esp = (ISPL(tf->tf_cs)) ? tf->tf_esp : (int)(tf + 1) - 8;
 3093 }
 3094 
 3095 int
 3096 ptrace_set_pc(struct thread *td, u_long addr)
 3097 {
 3098 
 3099         td->td_frame->tf_eip = addr;
 3100         return (0);
 3101 }
 3102 
 3103 int
 3104 ptrace_single_step(struct thread *td)
 3105 {
 3106         td->td_frame->tf_eflags |= PSL_T;
 3107         return (0);
 3108 }
 3109 
 3110 int
 3111 ptrace_clear_single_step(struct thread *td)
 3112 {
 3113         td->td_frame->tf_eflags &= ~PSL_T;
 3114         return (0);
 3115 }
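
/*
 * Editor's aside (user-space sketch, not in the original source): the two
 * functions above implement ptrace(2) single-stepping by toggling the trap
 * flag (PSL_T) in the thread's saved eflags.  A debugger drives them with
 * something like:
 */
#if 0
        ptrace(PT_STEP, pid, (caddr_t)1, 0);    /* set PSL_T and resume */
        waitpid(pid, &status, 0);               /* stops after one insn */
#endif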
 3116 
 3117 int
 3118 fill_regs(struct thread *td, struct reg *regs)
 3119 {
 3120         struct pcb *pcb;
 3121         struct trapframe *tp;
 3122 
 3123         tp = td->td_frame;
 3124         pcb = td->td_pcb;
 3125         regs->r_gs = pcb->pcb_gs;
 3126         return (fill_frame_regs(tp, regs));
 3127 }
 3128 
 3129 int
 3130 fill_frame_regs(struct trapframe *tp, struct reg *regs)
 3131 {
 3132         regs->r_fs = tp->tf_fs;
 3133         regs->r_es = tp->tf_es;
 3134         regs->r_ds = tp->tf_ds;
 3135         regs->r_edi = tp->tf_edi;
 3136         regs->r_esi = tp->tf_esi;
 3137         regs->r_ebp = tp->tf_ebp;
 3138         regs->r_ebx = tp->tf_ebx;
 3139         regs->r_edx = tp->tf_edx;
 3140         regs->r_ecx = tp->tf_ecx;
 3141         regs->r_eax = tp->tf_eax;
 3142         regs->r_eip = tp->tf_eip;
 3143         regs->r_cs = tp->tf_cs;
 3144         regs->r_eflags = tp->tf_eflags;
 3145         regs->r_esp = tp->tf_esp;
 3146         regs->r_ss = tp->tf_ss;
 3147         return (0);
 3148 }
 3149 
 3150 int
 3151 set_regs(struct thread *td, struct reg *regs)
 3152 {
 3153         struct pcb *pcb;
 3154         struct trapframe *tp;
 3155 
 3156         tp = td->td_frame;
 3157         if (!EFL_SECURE(regs->r_eflags, tp->tf_eflags) ||
 3158             !CS_SECURE(regs->r_cs))
 3159                 return (EINVAL);
 3160         pcb = td->td_pcb;
 3161         tp->tf_fs = regs->r_fs;
 3162         tp->tf_es = regs->r_es;
 3163         tp->tf_ds = regs->r_ds;
 3164         tp->tf_edi = regs->r_edi;
 3165         tp->tf_esi = regs->r_esi;
 3166         tp->tf_ebp = regs->r_ebp;
 3167         tp->tf_ebx = regs->r_ebx;
 3168         tp->tf_edx = regs->r_edx;
 3169         tp->tf_ecx = regs->r_ecx;
 3170         tp->tf_eax = regs->r_eax;
 3171         tp->tf_eip = regs->r_eip;
 3172         tp->tf_cs = regs->r_cs;
 3173         tp->tf_eflags = regs->r_eflags;
 3174         tp->tf_esp = regs->r_esp;
 3175         tp->tf_ss = regs->r_ss;
 3176         pcb->pcb_gs = regs->r_gs;
 3177         return (0);
 3178 }
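
/*
 * Editor's note (not in the original source): the EFL_SECURE()/CS_SECURE()
 * checks keep a debugger from granting its target kernel privileges: the
 * new %cs must remain a user-privilege selector, and only the
 * user-modifiable eflags bits (PSL_USERCHANGE) may differ.  The macros,
 * defined earlier in this file, amount to:
 */
#if 0
#define CS_SECURE(cs)           (ISPL(cs) == SEL_UPL)
#define EFL_SECURE(ef, oef)     ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
#endif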
 3179 
 3180 #ifdef CPU_ENABLE_SSE
 3181 static void
 3182 fill_fpregs_xmm(struct savexmm *sv_xmm, struct save87 *sv_87)
 3183 {
 3184         struct env87 *penv_87 = &sv_87->sv_env;
 3185         struct envxmm *penv_xmm = &sv_xmm->sv_env;
 3188         int i;
 3189 
 3190         bzero(sv_87, sizeof(*sv_87));
 3191 
 3192         /* FPU control/status */
 3193         penv_87->en_cw = penv_xmm->en_cw;
 3194         penv_87->en_sw = penv_xmm->en_sw;
 3195         penv_87->en_tw = penv_xmm->en_tw;
 3196         penv_87->en_fip = penv_xmm->en_fip;
 3197         penv_87->en_fcs = penv_xmm->en_fcs;
 3198         penv_87->en_opcode = penv_xmm->en_opcode;
 3199         penv_87->en_foo = penv_xmm->en_foo;
 3200         penv_87->en_fos = penv_xmm->en_fos;
 3201 
 3202         /* FPU registers */
 3203         for (i = 0; i < 8; ++i)
 3204                 sv_87->sv_ac[i] = sv_xmm->sv_fp[i].fp_acc;
 3205 }
 3206 
 3207 static void
 3208 set_fpregs_xmm(struct save87 *sv_87, struct savexmm *sv_xmm)
 3209 {
 3210         struct env87 *penv_87 = &sv_87->sv_env;
 3211         struct envxmm *penv_xmm = &sv_xmm->sv_env;
 3214         int i;
 3215 
 3216         /* FPU control/status */
 3217         penv_xmm->en_cw = penv_87->en_cw;
 3218         penv_xmm->en_sw = penv_87->en_sw;
 3219         penv_xmm->en_tw = penv_87->en_tw;
 3220         penv_xmm->en_fip = penv_87->en_fip;
 3221         penv_xmm->en_fcs = penv_87->en_fcs;
 3222         penv_xmm->en_opcode = penv_87->en_opcode;
 3223         penv_xmm->en_foo = penv_87->en_foo;
 3224         penv_xmm->en_fos = penv_87->en_fos;
 3225 
 3226         /* FPU registers */
 3227         for (i = 0; i < 8; ++i)
 3228                 sv_xmm->sv_fp[i].fp_acc = sv_87->sv_ac[i];
 3229 }
 3230 #endif /* CPU_ENABLE_SSE */
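
/*
 * Editor's note (not in the original source): the two converters above
 * bridge the 94-byte FNSAVE image (struct save87) and the 512-byte FXSAVE
 * image (struct savexmm) so fpreg consumers always see the classic x87
 * layout.  One caveat: FXSAVE keeps only an abridged one-bit-per-register
 * tag word, so the en_tw copied here is an approximation of the full
 * two-bit-per-register FNSAVE tag word.
 */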
 3231 
 3232 int
 3233 fill_fpregs(struct thread *td, struct fpreg *fpregs)
 3234 {
 3235 
 3236         KASSERT(td == curthread || TD_IS_SUSPENDED(td) ||
 3237             P_SHOULDSTOP(td->td_proc),
 3238             ("not suspended thread %p", td));
 3239 #ifdef DEV_NPX
 3240         npxgetregs(td);
 3241 #else
 3242         bzero(fpregs, sizeof(*fpregs));
 3243 #endif
 3244 #ifdef CPU_ENABLE_SSE
 3245         if (cpu_fxsr)
 3246                 fill_fpregs_xmm(&td->td_pcb->pcb_user_save.sv_xmm,
 3247                     (struct save87 *)fpregs);
 3248         else
 3249 #endif /* CPU_ENABLE_SSE */
 3250                 bcopy(&td->td_pcb->pcb_user_save.sv_87, fpregs,
 3251                     sizeof(*fpregs));
 3252         return (0);
 3253 }
 3254 
 3255 int
 3256 set_fpregs(struct thread *td, struct fpreg *fpregs)
 3257 {
 3258 
 3259 #ifdef CPU_ENABLE_SSE
 3260         if (cpu_fxsr)
 3261                 set_fpregs_xmm((struct save87 *)fpregs,
 3262                     &td->td_pcb->pcb_user_save.sv_xmm);
 3263         else
 3264 #endif /* CPU_ENABLE_SSE */
 3265                 bcopy(fpregs, &td->td_pcb->pcb_user_save.sv_87,
 3266                     sizeof(*fpregs));
 3267 #ifdef DEV_NPX
 3268         npxuserinited(td);
 3269 #endif
 3270         return (0);
 3271 }
 3272 
 3273 /*
 3274  * Get machine context.
 3275  */
 3276 int
 3277 get_mcontext(struct thread *td, mcontext_t *mcp, int flags)
 3278 {
 3279         struct trapframe *tp;
 3280         struct segment_descriptor *sdp;
 3281 
 3282         tp = td->td_frame;
 3283 
 3284         PROC_LOCK(curthread->td_proc);
 3285         mcp->mc_onstack = sigonstack(tp->tf_esp);
 3286         PROC_UNLOCK(curthread->td_proc);
 3287         mcp->mc_gs = td->td_pcb->pcb_gs;
 3288         mcp->mc_fs = tp->tf_fs;
 3289         mcp->mc_es = tp->tf_es;
 3290         mcp->mc_ds = tp->tf_ds;
 3291         mcp->mc_edi = tp->tf_edi;
 3292         mcp->mc_esi = tp->tf_esi;
 3293         mcp->mc_ebp = tp->tf_ebp;
 3294         mcp->mc_isp = tp->tf_isp;
 3295         mcp->mc_eflags = tp->tf_eflags;
 3296         if (flags & GET_MC_CLEAR_RET) {
 3297                 mcp->mc_eax = 0;
 3298                 mcp->mc_edx = 0;
 3299                 mcp->mc_eflags &= ~PSL_C;
 3300         } else {
 3301                 mcp->mc_eax = tp->tf_eax;
 3302                 mcp->mc_edx = tp->tf_edx;
 3303         }
 3304         mcp->mc_ebx = tp->tf_ebx;
 3305         mcp->mc_ecx = tp->tf_ecx;
 3306         mcp->mc_eip = tp->tf_eip;
 3307         mcp->mc_cs = tp->tf_cs;
 3308         mcp->mc_esp = tp->tf_esp;
 3309         mcp->mc_ss = tp->tf_ss;
 3310         mcp->mc_len = sizeof(*mcp);
 3311         get_fpcontext(td, mcp);
 3312         sdp = &td->td_pcb->pcb_fsd;
 3313         mcp->mc_fsbase = sdp->sd_hibase << 24 | sdp->sd_lobase;
 3314         sdp = &td->td_pcb->pcb_gsd;
 3315         mcp->mc_gsbase = sdp->sd_hibase << 24 | sdp->sd_lobase;
 3316         bzero(mcp->mc_spare1, sizeof(mcp->mc_spare1));
 3317         bzero(mcp->mc_spare2, sizeof(mcp->mc_spare2));
 3318         return (0);
 3319 }
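
/*
 * Editor's aside (illustrative, not in the original source): an i386
 * segment descriptor splits its 32-bit base into a 24-bit low field and an
 * 8-bit high field, which is why get_mcontext() reassembles the %fs/%gs
 * bases as (sd_hibase << 24 | sd_lobase).  The inverse split would be:
 */
#if 0
        sdp->sd_lobase = base & 0xffffff;       /* low 24 bits */
        sdp->sd_hibase = base >> 24;            /* high 8 bits */
#endif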
 3320 
 3321 /*
 3322  * Set machine context.
 3323  *
 3324  * However, we don't set any but the user modifiable flags, and we won't
 3325  * touch the cs selector.
 3326  */
 3327 int
 3328 set_mcontext(struct thread *td, const mcontext_t *mcp)
 3329 {
 3330         struct trapframe *tp;
 3331         int eflags, ret;
 3332 
 3333         tp = td->td_frame;
 3334         if (mcp->mc_len != sizeof(*mcp))
 3335                 return (EINVAL);
 3336         eflags = (mcp->mc_eflags & PSL_USERCHANGE) |
 3337             (tp->tf_eflags & ~PSL_USERCHANGE);
 3338         if ((ret = set_fpcontext(td, mcp)) == 0) {
 3339                 tp->tf_fs = mcp->mc_fs;
 3340                 tp->tf_es = mcp->mc_es;
 3341                 tp->tf_ds = mcp->mc_ds;
 3342                 tp->tf_edi = mcp->mc_edi;
 3343                 tp->tf_esi = mcp->mc_esi;
 3344                 tp->tf_ebp = mcp->mc_ebp;
 3345                 tp->tf_ebx = mcp->mc_ebx;
 3346                 tp->tf_edx = mcp->mc_edx;
 3347                 tp->tf_ecx = mcp->mc_ecx;
 3348                 tp->tf_eax = mcp->mc_eax;
 3349                 tp->tf_eip = mcp->mc_eip;
 3350                 tp->tf_eflags = eflags;
 3351                 tp->tf_esp = mcp->mc_esp;
 3352                 tp->tf_ss = mcp->mc_ss;
 3353                 td->td_pcb->pcb_gs = mcp->mc_gs;
 3354                 ret = 0;
 3355         }
 3356         return (ret);
 3357 }
 3358 
 3359 static void
 3360 get_fpcontext(struct thread *td, mcontext_t *mcp)
 3361 {
 3362 
 3363 #ifndef DEV_NPX
 3364         mcp->mc_fpformat = _MC_FPFMT_NODEV;
 3365         mcp->mc_ownedfp = _MC_FPOWNED_NONE;
 3366         bzero(mcp->mc_fpstate, sizeof(mcp->mc_fpstate));
 3367 #else
 3368         mcp->mc_ownedfp = npxgetregs(td);
 3369         bcopy(&td->td_pcb->pcb_user_save, &mcp->mc_fpstate,
 3370             sizeof(mcp->mc_fpstate));
 3371         mcp->mc_fpformat = npxformat();
 3372 #endif
 3373 }
 3374 
 3375 static int
 3376 set_fpcontext(struct thread *td, const mcontext_t *mcp)
 3377 {
 3378 
 3379         if (mcp->mc_fpformat == _MC_FPFMT_NODEV)
 3380                 return (0);
 3381         else if (mcp->mc_fpformat != _MC_FPFMT_387 &&
 3382             mcp->mc_fpformat != _MC_FPFMT_XMM)
 3383                 return (EINVAL);
 3384         else if (mcp->mc_ownedfp == _MC_FPOWNED_NONE)
 3385                 /* We don't care what state is left in the FPU or PCB. */
 3386                 fpstate_drop(td);
 3387         else if (mcp->mc_ownedfp == _MC_FPOWNED_FPU ||
 3388             mcp->mc_ownedfp == _MC_FPOWNED_PCB) {
 3389 #ifdef DEV_NPX
 3390 #ifdef CPU_ENABLE_SSE
 3391                 if (cpu_fxsr)
 3392                         ((union savefpu *)&mcp->mc_fpstate)->sv_xmm.sv_env.
 3393                             en_mxcsr &= cpu_mxcsr_mask;
 3394 #endif
 3395                 npxsetregs(td, (union savefpu *)&mcp->mc_fpstate);
 3396 #endif
 3397         } else
 3398                 return (EINVAL);
 3399         return (0);
 3400 }
 3401 
 3402 static void
 3403 fpstate_drop(struct thread *td)
 3404 {
 3405 
 3406         KASSERT(PCB_USER_FPU(td->td_pcb), ("fpstate_drop: kernel-owned fpu"));
 3407         critical_enter();
 3408 #ifdef DEV_NPX
 3409         if (PCPU_GET(fpcurthread) == td)
 3410                 npxdrop();
 3411 #endif
 3412         /*
 3413          * XXX force a full drop of the npx.  The above only drops it if we
 3414          * owned it.  npxgetregs() has the same bug in the !cpu_fxsr case.
 3415          *
 3416          * XXX I don't much like npxgetregs()'s semantics of doing a full
 3417          * drop.  Dropping only to the pcb matches fnsave's behaviour.
 3418          * We only need to drop to !PCB_INITDONE in sendsig().  But
 3419          * sendsig() is the only caller of npxgetregs()... perhaps we just
 3420          * have too many layers.
 3421          */
 3422         curthread->td_pcb->pcb_flags &= ~(PCB_NPXINITDONE |
 3423             PCB_NPXUSERINITDONE);
 3424         critical_exit();
 3425 }
 3426 
 3427 int
 3428 fill_dbregs(struct thread *td, struct dbreg *dbregs)
 3429 {
 3430         struct pcb *pcb;
 3431 
 3432         if (td == NULL) {
 3433                 dbregs->dr[0] = rdr0();
 3434                 dbregs->dr[1] = rdr1();
 3435                 dbregs->dr[2] = rdr2();
 3436                 dbregs->dr[3] = rdr3();
 3437                 dbregs->dr[4] = rdr4();
 3438                 dbregs->dr[5] = rdr5();
 3439                 dbregs->dr[6] = rdr6();
 3440                 dbregs->dr[7] = rdr7();
 3441         } else {
 3442                 pcb = td->td_pcb;
 3443                 dbregs->dr[0] = pcb->pcb_dr0;
 3444                 dbregs->dr[1] = pcb->pcb_dr1;
 3445                 dbregs->dr[2] = pcb->pcb_dr2;
 3446                 dbregs->dr[3] = pcb->pcb_dr3;
 3447                 dbregs->dr[4] = 0;
 3448                 dbregs->dr[5] = 0;
 3449                 dbregs->dr[6] = pcb->pcb_dr6;
 3450                 dbregs->dr[7] = pcb->pcb_dr7;
 3451         }
 3452         return (0);
 3453 }
 3454 
 3455 int
 3456 set_dbregs(struct thread *td, struct dbreg *dbregs)
 3457 {
 3458         struct pcb *pcb;
 3459         int i;
 3460 
 3461         if (td == NULL) {
 3462                 load_dr0(dbregs->dr[0]);
 3463                 load_dr1(dbregs->dr[1]);
 3464                 load_dr2(dbregs->dr[2]);
 3465                 load_dr3(dbregs->dr[3]);
 3466                 load_dr4(dbregs->dr[4]);
 3467                 load_dr5(dbregs->dr[5]);
 3468                 load_dr6(dbregs->dr[6]);
 3469                 load_dr7(dbregs->dr[7]);
 3470         } else {
 3471                 /*
 3472                  * Don't let an illegal value for dr7 get set.  Specifically,
 3473                  * check for undefined settings.  Setting these bit patterns
 3474                  * results in undefined behaviour and can lead to an unexpected
 3475                  * TRCTRAP.
 3476                  */
 3477                 for (i = 0; i < 4; i++) {
 3478                         if (DBREG_DR7_ACCESS(dbregs->dr[7], i) == 0x02)
 3479                                 return (EINVAL);
 3480                         if (DBREG_DR7_LEN(dbregs->dr[7], i) == 0x02)
 3481                                 return (EINVAL);
 3482                 }
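                /*
                 * Editor's note (not in the original source): each DR7
                 * breakpoint has a 2-bit R/W field (00 execute, 01 write,
                 * 11 read/write; 10 selects I/O breakpoints and is
                 * undefined here) and a 2-bit LEN field (00/01/11 for
                 * 1/2/4 bytes; 10 would mean 8 bytes, undefined on i386),
                 * hence the two 0x02 checks above.
                 */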
 3483                 
 3484                 pcb = td->td_pcb;
 3485                 
 3486                 /*
 3487                  * Don't let a process set a breakpoint that is not within the
 3488                  * process's address space.  If a process could do this, it
 3489                  * could halt the system by setting a breakpoint in the kernel
 3490                  * (if ddb was enabled).  Thus, we need to check to make sure
 3491                  * that no breakpoints are being enabled for addresses outside
 3492          * the process's address space.
 3493                  *
 3494                  * XXX - what about when the watched area of the user's
 3495                  * address space is written into from within the kernel
 3496                  * ... wouldn't that still cause a breakpoint to be generated
 3497                  * from within kernel mode?
 3498                  */
 3499 
 3500                 if (DBREG_DR7_ENABLED(dbregs->dr[7], 0)) {
 3501                         /* dr0 is enabled */
 3502                         if (dbregs->dr[0] >= VM_MAXUSER_ADDRESS)
 3503                                 return (EINVAL);
 3504                 }
 3505                         
 3506                 if (DBREG_DR7_ENABLED(dbregs->dr[7], 1)) {
 3507                         /* dr1 is enabled */
 3508                         if (dbregs->dr[1] >= VM_MAXUSER_ADDRESS)
 3509                                 return (EINVAL);
 3510                 }
 3511                         
 3512                 if (DBREG_DR7_ENABLED(dbregs->dr[7], 2)) {
 3513                         /* dr2 is enabled */
 3514                         if (dbregs->dr[2] >= VM_MAXUSER_ADDRESS)
 3515                                 return (EINVAL);
 3516                 }
 3517                         
 3518                 if (DBREG_DR7_ENABLED(dbregs->dr[7], 3)) {
 3519                         /* dr3 is enabled */
 3520                         if (dbregs->dr[3] >= VM_MAXUSER_ADDRESS)
 3521                                 return (EINVAL);
 3522                 }
 3523 
 3524                 pcb->pcb_dr0 = dbregs->dr[0];
 3525                 pcb->pcb_dr1 = dbregs->dr[1];
 3526                 pcb->pcb_dr2 = dbregs->dr[2];
 3527                 pcb->pcb_dr3 = dbregs->dr[3];
 3528                 pcb->pcb_dr6 = dbregs->dr[6];
 3529                 pcb->pcb_dr7 = dbregs->dr[7];
 3530 
 3531                 pcb->pcb_flags |= PCB_DBREGS;
 3532         }
 3533 
 3534         return (0);
 3535 }
 3536 
 3537 /*
 3538  * Return > 0 if a hardware breakpoint has been hit, and the
 3539  * breakpoint was in user space.  Return 0, otherwise.
 3540  */
 3541 int
 3542 user_dbreg_trap(void)
 3543 {
 3544         u_int32_t dr7, dr6; /* debug registers dr6 and dr7 */
 3545         u_int32_t bp;       /* breakpoint bits extracted from dr6 */
 3546         int nbp;            /* number of breakpoints that triggered */
 3547         caddr_t addr[4];    /* breakpoint addresses */
 3548         int i;
 3549         
 3550         dr7 = rdr7();
 3551         if ((dr7 & 0x000000ff) == 0) {
 3552                 /*
 3553                  * None of the local (L0-L3) or global (G0-G3) enable
 3554                  * bits in dr7 are set, so the trap cannot have been
 3555                  * caused by the hardware debug registers.
 3556                  */
 3557                 return (0);
 3558         }
 3559 
 3560         nbp = 0;
 3561         dr6 = rdr6();
 3562         bp = dr6 & 0x0000000f;
 3563 
 3564         if (!bp) {
 3565                 /*
 3566                  * None of the breakpoint bits are set, meaning this
 3567                  * trap was not caused by any of the debug registers.
 3568                  */
 3569                 return (0);
 3570         }
 3571 
 3572         /*
 3573          * At least one of the breakpoints was hit; check to see
 3574          * which ones and whether any of them are user-space addresses.
 3575          */
 3576 
 3577         if (bp & 0x01) {
 3578                 addr[nbp++] = (caddr_t)rdr0();
 3579         }
 3580         if (bp & 0x02) {
 3581                 addr[nbp++] = (caddr_t)rdr1();
 3582         }
 3583         if (bp & 0x04) {
 3584                 addr[nbp++] = (caddr_t)rdr2();
 3585         }
 3586         if (bp & 0x08) {
 3587                 addr[nbp++] = (caddr_t)rdr3();
 3588         }
 3589 
 3590         for (i = 0; i < nbp; i++) {
 3591                 if (addr[i] < (caddr_t)VM_MAXUSER_ADDRESS) {
 3592                         /*
 3593                          * addr[i] is in user space
 3594                          */
 3595                         return (nbp);
 3596                 }
 3597         }
 3598 
 3599         /*
 3600          * None of the breakpoints are in user space.
 3601          */
 3602         return (0);
 3603 }
 3604 
 3605 #ifndef DEV_APIC
 3606 #include <machine/apicvar.h>
 3607 
 3608 /*
 3609  * Provide stub functions so that the MADT APIC enumerator in the acpi
 3610  * kernel module will link against a kernel without 'device apic'.
 3611  *
 3612  * XXX - This is a gross hack.
 3613  */
 3614 void
 3615 apic_register_enumerator(struct apic_enumerator *enumerator)
 3616 {
 3617 }
 3618 
 3619 void *
 3620 ioapic_create(vm_paddr_t addr, int32_t apic_id, int intbase)
 3621 {
 3622         return (NULL);
 3623 }
 3624 
 3625 int
 3626 ioapic_disable_pin(void *cookie, u_int pin)
 3627 {
 3628         return (ENXIO);
 3629 }
 3630 
 3631 int
 3632 ioapic_get_vector(void *cookie, u_int pin)
 3633 {
 3634         return (-1);
 3635 }
 3636 
 3637 void
 3638 ioapic_register(void *cookie)
 3639 {
 3640 }
 3641 
 3642 int
 3643 ioapic_remap_vector(void *cookie, u_int pin, int vector)
 3644 {
 3645         return (ENXIO);
 3646 }
 3647 
 3648 int
 3649 ioapic_set_extint(void *cookie, u_int pin)
 3650 {
 3651         return (ENXIO);
 3652 }
 3653 
 3654 int
 3655 ioapic_set_nmi(void *cookie, u_int pin)
 3656 {
 3657         return (ENXIO);
 3658 }
 3659 
 3660 int
 3661 ioapic_set_polarity(void *cookie, u_int pin, enum intr_polarity pol)
 3662 {
 3663         return (ENXIO);
 3664 }
 3665 
 3666 int
 3667 ioapic_set_triggermode(void *cookie, u_int pin, enum intr_trigger trigger)
 3668 {
 3669         return (ENXIO);
 3670 }
 3671 
 3672 void
 3673 lapic_create(u_int apic_id, int boot_cpu)
 3674 {
 3675 }
 3676 
 3677 void
 3678 lapic_init(vm_paddr_t addr)
 3679 {
 3680 }
 3681 
 3682 int
 3683 lapic_set_lvt_mode(u_int apic_id, u_int lvt, u_int32_t mode)
 3684 {
 3685         return (ENXIO);
 3686 }
 3687 
 3688 int
 3689 lapic_set_lvt_polarity(u_int apic_id, u_int lvt, enum intr_polarity pol)
 3690 {
 3691         return (ENXIO);
 3692 }
 3693 
 3694 int
 3695 lapic_set_lvt_triggermode(u_int apic_id, u_int lvt, enum intr_trigger trigger)
 3696 {
 3697         return (ENXIO);
 3698 }
 3699 #endif
 3700 
 3701 #ifdef KDB
 3702 
 3703 /*
 3704  * Provide inb() and outb() as functions.  They are normally only available as
 3705  * inline functions, thus cannot be called from the debugger.
 3706  */
 3707 
 3708 /* silence compiler warnings */
 3709 u_char inb_(u_short);
 3710 void outb_(u_short, u_char);
 3711 
 3712 u_char
 3713 inb_(u_short port)
 3714 {
 3715         return (inb(port));
 3716 }
 3717 
 3718 void
 3719 outb_(u_short port, u_char data)
 3720 {
 3721         outb(port, data);
 3722 }
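
/*
 * Editor's aside (usage sketch, not in the original source): since these
 * are real functions, they can be called from the ddb prompt, e.g.:
 *
 *      db> call inb_(0x80)
 *      db> call outb_(0x80, 0x41)
 */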
 3723 
 3724 #endif /* KDB */
