FreeBSD/Linux Kernel Cross Reference
sys/amd64/amd64/machdep.c


    1 /*-
    2  * Copyright (c) 2003 Peter Wemm.
    3  * Copyright (c) 1992 Terrence R. Lambert.
    4  * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
    5  * All rights reserved.
    6  *
    7  * This code is derived from software contributed to Berkeley by
    8  * William Jolitz.
    9  *
   10  * Redistribution and use in source and binary forms, with or without
   11  * modification, are permitted provided that the following conditions
   12  * are met:
   13  * 1. Redistributions of source code must retain the above copyright
   14  *    notice, this list of conditions and the following disclaimer.
   15  * 2. Redistributions in binary form must reproduce the above copyright
   16  *    notice, this list of conditions and the following disclaimer in the
   17  *    documentation and/or other materials provided with the distribution.
   18  * 3. All advertising materials mentioning features or use of this software
   19  *    must display the following acknowledgement:
   20  *      This product includes software developed by the University of
   21  *      California, Berkeley and its contributors.
   22  * 4. Neither the name of the University nor the names of its contributors
   23  *    may be used to endorse or promote products derived from this software
   24  *    without specific prior written permission.
   25  *
   26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   36  * SUCH DAMAGE.
   37  *
   38  *      from: @(#)machdep.c     7.4 (Berkeley) 6/3/91
   39  */
   40 
   41 #include <sys/cdefs.h>
   42 __FBSDID("$FreeBSD: releng/6.0/sys/amd64/amd64/machdep.c 150834 2005-10-02 16:38:12Z cperciva $");
   43 
   44 #include "opt_atalk.h"
   45 #include "opt_atpic.h"
   46 #include "opt_compat.h"
   47 #include "opt_cpu.h"
   48 #include "opt_ddb.h"
   49 #include "opt_inet.h"
   50 #include "opt_ipx.h"
   51 #include "opt_isa.h"
   52 #include "opt_kstack_pages.h"
   53 #include "opt_maxmem.h"
   54 #include "opt_msgbuf.h"
   55 #include "opt_perfmon.h"
   56 
   57 #include <sys/param.h>
   58 #include <sys/proc.h>
   59 #include <sys/systm.h>
   60 #include <sys/bio.h>
   61 #include <sys/buf.h>
   62 #include <sys/bus.h>
   63 #include <sys/callout.h>
   64 #include <sys/cons.h>
   65 #include <sys/cpu.h>
   66 #include <sys/eventhandler.h>
   67 #include <sys/exec.h>
   68 #include <sys/imgact.h>
   69 #include <sys/kdb.h>
   70 #include <sys/kernel.h>
   71 #include <sys/ktr.h>
   72 #include <sys/linker.h>
   73 #include <sys/lock.h>
   74 #include <sys/malloc.h>
   75 #include <sys/memrange.h>
   76 #include <sys/msgbuf.h>
   77 #include <sys/mutex.h>
   78 #include <sys/pcpu.h>
   79 #include <sys/ptrace.h>
   80 #include <sys/reboot.h>
   81 #include <sys/sched.h>
   82 #include <sys/signalvar.h>
   83 #include <sys/sysctl.h>
   84 #include <sys/sysent.h>
   85 #include <sys/sysproto.h>
   86 #include <sys/ucontext.h>
   87 #include <sys/vmmeter.h>
   88 
   89 #include <vm/vm.h>
   90 #include <vm/vm_extern.h>
   91 #include <vm/vm_kern.h>
   92 #include <vm/vm_page.h>
   93 #include <vm/vm_map.h>
   94 #include <vm/vm_object.h>
   95 #include <vm/vm_pager.h>
   96 #include <vm/vm_param.h>
   97 
   98 #ifdef DDB
   99 #ifndef KDB
  100 #error KDB must be enabled in order for DDB to work!
  101 #endif
  102 #endif
  103 #include <ddb/ddb.h>
  104 
  105 #include <net/netisr.h>
  106 
  107 #include <machine/clock.h>
  108 #include <machine/cpu.h>
  109 #include <machine/cputypes.h>
  110 #include <machine/intr_machdep.h>
  111 #include <machine/md_var.h>
  112 #include <machine/metadata.h>
  113 #include <machine/pc/bios.h>
  114 #include <machine/pcb.h>
  115 #include <machine/proc.h>
  116 #include <machine/reg.h>
  117 #include <machine/sigframe.h>
  118 #include <machine/specialreg.h>
  119 #ifdef PERFMON
  120 #include <machine/perfmon.h>
  121 #endif
  122 #include <machine/tss.h>
  123 #ifdef SMP
  124 #include <machine/smp.h>
  125 #endif
  126 
  127 #include <amd64/isa/icu.h>
  128 
  129 #include <isa/isareg.h>
  130 #include <isa/rtc.h>
  131 
  132 /* Sanity check for __curthread() */
  133 CTASSERT(offsetof(struct pcpu, pc_curthread) == 0);
  134 
  135 extern u_int64_t hammer_time(u_int64_t, u_int64_t);
  136 extern void dblfault_handler(void);
  137 
  138 extern void printcpuinfo(void); /* XXX header file */
  139 extern void identify_cpu(void);
  140 extern void panicifcpuunsupported(void);
  141 
  142 #define CS_SECURE(cs)           (ISPL(cs) == SEL_UPL)
  143 #define EFL_SECURE(ef, oef)     ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
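/*
 * Note: CS_SECURE() accepts a code selector only when its privilege
 * level is user (ring 3), and EFL_SECURE() accepts a new %rflags value
 * only when every bit that differs from the old value lies within the
 * user-modifiable PSL_USERCHANGE set.  sigreturn() and set_regs()
 * below rely on both checks to keep a process from smuggling
 * privileged state in through a saved context.
 */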
  144 
  145 static void cpu_startup(void *);
  146 static void get_fpcontext(struct thread *td, mcontext_t *mcp);
  147 static int  set_fpcontext(struct thread *td, const mcontext_t *mcp);
  148 SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL)
  149 
  150 #ifdef DDB
  151 extern vm_offset_t ksym_start, ksym_end;
  152 #endif
  153 
  154 int     _udatasel, _ucodesel, _ucode32sel;
  155 
  156 int cold = 1;
  157 
  158 long Maxmem = 0;
  159 long realmem = 0;
  160 
  161 vm_paddr_t phys_avail[20];
  162 vm_paddr_t dump_avail[20];
  163 
   164 /* must be 2 less so that a 0,0 pair can signal the end of chunks */
  165 #define PHYS_AVAIL_ARRAY_END ((sizeof(phys_avail) / sizeof(phys_avail[0])) - 2)
  166 #define DUMP_AVAIL_ARRAY_END ((sizeof(dump_avail) / sizeof(dump_avail[0])) - 2)
  167 
  168 struct kva_md_info kmi;
  169 
  170 static struct trapframe proc0_tf;
  171 struct region_descriptor r_gdt, r_idt;
  172 
  173 struct pcpu __pcpu[MAXCPU];
  174 
  175 struct mtx icu_lock;
  176 
  177 struct mem_range_softc mem_range_softc;
  178 
  179 static void
  180 cpu_startup(dummy)
  181         void *dummy;
  182 {
  183         /*
  184          * Good {morning,afternoon,evening,night}.
  185          */
  186         startrtclock();
  187         printcpuinfo();
  188         panicifcpuunsupported();
  189 #ifdef PERFMON
  190         perfmon_init();
  191 #endif
  192         printf("real memory  = %ju (%ju MB)\n", ptoa((uintmax_t)Maxmem),
  193             ptoa((uintmax_t)Maxmem) / 1048576);
  194         realmem = Maxmem;
  195         /*
  196          * Display any holes after the first chunk of extended memory.
  197          */
  198         if (bootverbose) {
  199                 int indx;
  200 
  201                 printf("Physical memory chunk(s):\n");
  202                 for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) {
  203                         vm_paddr_t size;
  204 
  205                         size = phys_avail[indx + 1] - phys_avail[indx];
  206                         printf(
  207                             "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n",
  208                             (uintmax_t)phys_avail[indx],
  209                             (uintmax_t)phys_avail[indx + 1] - 1,
  210                             (uintmax_t)size, (uintmax_t)size / PAGE_SIZE);
  211                 }
  212         }
  213 
  214         vm_ksubmap_init(&kmi);
  215 
  216         printf("avail memory = %ju (%ju MB)\n",
  217             ptoa((uintmax_t)cnt.v_free_count),
  218             ptoa((uintmax_t)cnt.v_free_count) / 1048576);
  219 
  220         /*
  221          * Set up buffers, so they can be used to read disk labels.
  222          */
  223         bufinit();
  224         vm_pager_bufferinit();
  225 
  226         cpu_setregs();
  227 }
  228 
  229 /*
   230  * Send a signal to the process.
   231  *
   232  * The stack is set up so that the signal trampoline (sigcode),
   233  * stored at the top of the stack, calls the handler and then
   234  * invokes the sigreturn system call below.  After sigreturn
   235  * resets the signal mask, the stack, and the frame pointer,
   236  * it returns to the pc and psl specified in the user
   237  * context.
  238  */
  239 void
  240 sendsig(catcher, sig, mask, code)
  241         sig_t catcher;
  242         int sig;
  243         sigset_t *mask;
  244         u_long code;
  245 {
  246         struct sigframe sf, *sfp;
  247         struct proc *p;
  248         struct thread *td;
  249         struct sigacts *psp;
  250         char *sp;
  251         struct trapframe *regs;
  252         int oonstack;
  253 
  254         td = curthread;
  255         p = td->td_proc;
  256         PROC_LOCK_ASSERT(p, MA_OWNED);
  257         psp = p->p_sigacts;
  258         mtx_assert(&psp->ps_mtx, MA_OWNED);
  259         regs = td->td_frame;
  260         oonstack = sigonstack(regs->tf_rsp);
  261 
  262         /* Save user context. */
  263         bzero(&sf, sizeof(sf));
  264         sf.sf_uc.uc_sigmask = *mask;
  265         sf.sf_uc.uc_stack = td->td_sigstk;
  266         sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
  267             ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE;
  268         sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0;
  269         bcopy(regs, &sf.sf_uc.uc_mcontext.mc_rdi, sizeof(*regs));
  270         sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext); /* magic */
  271         get_fpcontext(td, &sf.sf_uc.uc_mcontext);
  272         fpstate_drop(td);
  273 
  274         /* Allocate space for the signal handler context. */
  275         if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack &&
  276             SIGISMEMBER(psp->ps_sigonstack, sig)) {
  277                 sp = td->td_sigstk.ss_sp +
  278                     td->td_sigstk.ss_size - sizeof(struct sigframe);
  279 #if defined(COMPAT_43)
  280                 td->td_sigstk.ss_flags |= SS_ONSTACK;
  281 #endif
  282         } else
  283                 sp = (char *)regs->tf_rsp - sizeof(struct sigframe) - 128;
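                /*
                 * Skipping 128 bytes here keeps the sigframe clear of
                 * the amd64 ABI red zone below the interrupted %rsp.
                 */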
  284         /* Align to 16 bytes. */
  285         sfp = (struct sigframe *)((unsigned long)sp & ~0xFul);
  286 
  287         /* Translate the signal if appropriate. */
  288         if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize)
  289                 sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
  290 
  291         /* Build the argument list for the signal handler. */
  292         regs->tf_rdi = sig;                     /* arg 1 in %rdi */
  293         regs->tf_rdx = (register_t)&sfp->sf_uc; /* arg 3 in %rdx */
  294         if (SIGISMEMBER(psp->ps_siginfo, sig)) {
  295                 /* Signal handler installed with SA_SIGINFO. */
  296                 regs->tf_rsi = (register_t)&sfp->sf_si; /* arg 2 in %rsi */
  297                 sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher;
  298 
  299                 /* Fill in POSIX parts */
  300                 sf.sf_si.si_signo = sig;
  301                 sf.sf_si.si_code = code;
  302                 regs->tf_rcx = regs->tf_addr;   /* arg 4 in %rcx */
  303         } else {
  304                 /* Old FreeBSD-style arguments. */
  305                 regs->tf_rsi = code;            /* arg 2 in %rsi */
  306                 regs->tf_rcx = regs->tf_addr;   /* arg 4 in %rcx */
  307                 sf.sf_ahu.sf_handler = catcher;
  308         }
  309         mtx_unlock(&psp->ps_mtx);
  310         PROC_UNLOCK(p);
  311 
  312         /*
  313          * Copy the sigframe out to the user's stack.
  314          */
  315         if (copyout(&sf, sfp, sizeof(*sfp)) != 0) {
  316 #ifdef DEBUG
  317                 printf("process %ld has trashed its stack\n", (long)p->p_pid);
  318 #endif
  319                 PROC_LOCK(p);
  320                 sigexit(td, SIGILL);
  321         }
  322 
  323         regs->tf_rsp = (long)sfp;
  324         regs->tf_rip = PS_STRINGS - *(p->p_sysent->sv_szsigcode);
  325         regs->tf_rflags &= ~PSL_T;
  326         regs->tf_cs = _ucodesel;
  327         PROC_LOCK(p);
  328         mtx_lock(&psp->ps_mtx);
  329 }
  330 
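/*
 * A minimal userland sketch (not from this file) of the convention
 * sendsig() establishes above: %rdi carries the signal number, %rsi
 * the siginfo_t pointer and %rdx the ucontext_t pointer, which is the
 * standard SA_SIGINFO handler signature as seen from C:
 *
 *      #include <signal.h>
 *      #include <stdio.h>
 *      #include <string.h>
 *
 *      static void
 *      handler(int sig, siginfo_t *si, void *ucp)
 *      {
 *              // sig arrived in %rdi, si in %rsi, ucp in %rdx
 *              printf("signal %d, si_addr %p\n", sig, si->si_addr);
 *      }
 *
 *      int
 *      main(void)
 *      {
 *              struct sigaction sa;
 *
 *              memset(&sa, 0, sizeof(sa));
 *              sa.sa_sigaction = handler;
 *              sa.sa_flags = SA_SIGINFO;
 *              sigaction(SIGUSR1, &sa, NULL);
 *              raise(SIGUSR1);
 *              return (0);
 *      }
 */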
  331 /*
  332  * Build siginfo_t for SA thread
  333  */
  334 void
  335 cpu_thread_siginfo(int sig, u_long code, siginfo_t *si)
  336 {
  337         struct proc *p;
  338         struct thread *td;
  339         struct trapframe *regs;
  340 
  341         td = curthread;
  342         p = td->td_proc;
  343         regs = td->td_frame;
  344         PROC_LOCK_ASSERT(p, MA_OWNED);
  345 
  346         bzero(si, sizeof(*si));
  347         si->si_signo = sig;
  348         si->si_code = code;
  349         si->si_addr = (void *)regs->tf_addr;
  350         /* XXXKSE fill other fields */
  351 }
  352 
  353 /*
  354  * System call to cleanup state after a signal
  355  * has been taken.  Reset signal mask and
  356  * stack state from context left by sendsig (above).
  357  * Return to previous pc and psl as specified by
  358  * context left by sendsig. Check carefully to
  359  * make sure that the user has not modified the
  360  * state to gain improper privileges.
  361  *
  362  * MPSAFE
  363  */
  364 int
  365 sigreturn(td, uap)
  366         struct thread *td;
  367         struct sigreturn_args /* {
  368                 const __ucontext *sigcntxp;
  369         } */ *uap;
  370 {
  371         ucontext_t uc;
  372         struct proc *p = td->td_proc;
  373         struct trapframe *regs;
  374         const ucontext_t *ucp;
  375         long rflags;
  376         int cs, error, ret;
  377 
  378         error = copyin(uap->sigcntxp, &uc, sizeof(uc));
  379         if (error != 0)
  380                 return (error);
  381         ucp = &uc;
  382         regs = td->td_frame;
  383         rflags = ucp->uc_mcontext.mc_rflags;
  384         /*
  385          * Don't allow users to change privileged or reserved flags.
  386          */
  387         /*
  388          * XXX do allow users to change the privileged flag PSL_RF.
  389          * The cpu sets PSL_RF in tf_rflags for faults.  Debuggers
  390          * should sometimes set it there too.  tf_rflags is kept in
  391          * the signal context during signal handling and there is no
  392          * other place to remember it, so the PSL_RF bit may be
  393          * corrupted by the signal handler without us knowing.
  394          * Corruption of the PSL_RF bit at worst causes one more or
  395          * one less debugger trap, so allowing it is fairly harmless.
  396          */
  397         if (!EFL_SECURE(rflags & ~PSL_RF, regs->tf_rflags & ~PSL_RF)) {
  398                 printf("sigreturn: rflags = 0x%lx\n", rflags);
  399                 return (EINVAL);
  400         }
  401 
  402         /*
  403          * Don't allow users to load a valid privileged %cs.  Let the
  404          * hardware check for invalid selectors, excess privilege in
  405          * other selectors, invalid %eip's and invalid %esp's.
  406          */
  407         cs = ucp->uc_mcontext.mc_cs;
  408         if (!CS_SECURE(cs)) {
  409                 printf("sigreturn: cs = 0x%x\n", cs);
  410                 trapsignal(td, SIGBUS, T_PROTFLT);
  411                 return (EINVAL);
  412         }
  413 
  414         ret = set_fpcontext(td, &ucp->uc_mcontext);
  415         if (ret != 0)
  416                 return (ret);
  417         bcopy(&ucp->uc_mcontext.mc_rdi, regs, sizeof(*regs));
  418 
  419         PROC_LOCK(p);
  420 #if defined(COMPAT_43)
  421         if (ucp->uc_mcontext.mc_onstack & 1)
  422                 td->td_sigstk.ss_flags |= SS_ONSTACK;
  423         else
  424                 td->td_sigstk.ss_flags &= ~SS_ONSTACK;
  425 #endif
  426 
  427         td->td_sigmask = ucp->uc_sigmask;
  428         SIG_CANTMASK(td->td_sigmask);
  429         signotify(td);
  430         PROC_UNLOCK(p);
  431         td->td_pcb->pcb_flags |= PCB_FULLCTX;
  432         return (EJUSTRETURN);
  433 }
  434 
  435 #ifdef COMPAT_FREEBSD4
  436 int
  437 freebsd4_sigreturn(struct thread *td, struct freebsd4_sigreturn_args *uap)
  438 {
  439  
  440         return sigreturn(td, (struct sigreturn_args *)uap);
  441 }
  442 #endif
  443 
  444 
  445 /*
  446  * Machine dependent boot() routine
  447  *
  448  * I haven't seen anything to put here yet
  449  * Possibly some stuff might be grafted back here from boot()
  450  */
  451 void
  452 cpu_boot(int howto)
  453 {
  454 }
  455 
  456 /* Get current clock frequency for the given cpu id. */
  457 int
  458 cpu_est_clockrate(int cpu_id, uint64_t *rate)
  459 {
  460         register_t reg;
  461         uint64_t tsc1, tsc2;
  462 
  463         if (pcpu_find(cpu_id) == NULL || rate == NULL)
  464                 return (EINVAL);
  465 
  466         /* If we're booting, trust the rate calibrated moments ago. */
  467         if (cold) {
  468                 *rate = tsc_freq;
  469                 return (0);
  470         }
  471 
  472 #ifdef SMP
  473         /* Schedule ourselves on the indicated cpu. */
  474         mtx_lock_spin(&sched_lock);
  475         sched_bind(curthread, cpu_id);
  476         mtx_unlock_spin(&sched_lock);
  477 #endif
  478 
  479         /* Calibrate by measuring a short delay. */
  480         reg = intr_disable();
  481         tsc1 = rdtsc();
  482         DELAY(1000);
  483         tsc2 = rdtsc();
  484         intr_restore(reg);
  485 
  486 #ifdef SMP
  487         mtx_lock_spin(&sched_lock);
  488         sched_unbind(curthread);
  489         mtx_unlock_spin(&sched_lock);
  490 #endif
  491 
  492         /*
   493          * Calculate the difference in readings, convert it to Hz, and
  494          * subtract 0.5% of the total.  Empirical testing has shown that
  495          * overhead in DELAY() works out to approximately this value.
  496          */
  497         tsc2 -= tsc1;
  498         *rate = tsc2 * 1000 - tsc2 * 5;
  499         return (0);
  500 }
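/*
 * Worked example of the arithmetic above: DELAY(1000) spans 1000 us,
 * so tsc2 ends up holding ticks per millisecond.  tsc2 * 1000 rescales
 * that to ticks per second (Hz), and tsc2 * 5 is 0.5% of that total.
 * With a 2.0 GHz TSC, tsc2 is roughly 2,000,000 and *rate comes out
 * near 2,000,000 * 995 = 1,990,000,000.
 */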
  501 
  502 /*
  503  * Shutdown the CPU as much as possible
  504  */
  505 void
  506 cpu_halt(void)
  507 {
  508         for (;;)
  509                 __asm__ ("hlt");
  510 }
  511 
  512 /*
  513  * Hook to idle the CPU when possible.  In the SMP case we default to
  514  * off because a halted cpu will not currently pick up a new thread in the
  515  * run queue until the next timer tick.  If turned on this will result in
  516  * approximately a 4.2% loss in real time performance in buildworld tests
  517  * (but improves user and sys times oddly enough), and saves approximately
  518  * 5% in power consumption on an idle machine (tests w/2xCPU 1.1GHz P3).
  519  *
  520  * XXX we need to have a cpu mask of idle cpus and generate an IPI or
  521  * otherwise generate some sort of interrupt to wake up cpus sitting in HLT.
  522  * Then we can have our cake and eat it too.
  523  *
  524  * XXX I'm turning it on for SMP as well by default for now.  It seems to
  525  * help lock contention somewhat, and this is critical for HTT. -Peter
  526  */
  527 static int      cpu_idle_hlt = 1;
  528 SYSCTL_INT(_machdep, OID_AUTO, cpu_idle_hlt, CTLFLAG_RW,
  529     &cpu_idle_hlt, 0, "Idle loop HLT enable");
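/* The knob can be flipped at runtime, e.g.: sysctl machdep.cpu_idle_hlt=0 */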
  530 
  531 static void
  532 cpu_idle_default(void)
  533 {
  534         /*
   535          * We must absolutely guarantee that hlt is the very next
   536          * instruction after sti: sti defers interrupt delivery by
   537          * one instruction, so any other ordering opens a timing window.
  538          */
  539         __asm __volatile("sti; hlt");
  540 }
  541 
  542 /*
  543  * Note that we have to be careful here to avoid a race between checking
  544  * sched_runnable() and actually halting.  If we don't do this, we may waste
  545  * the time between calling hlt and the next interrupt even though there
  546  * is a runnable process.
  547  */
  548 void
  549 cpu_idle(void)
  550 {
  551 
  552 #ifdef SMP
  553         if (mp_grab_cpu_hlt())
  554                 return;
  555 #endif
  556         if (cpu_idle_hlt) {
  557                 disable_intr();
  558                 if (sched_runnable())
  559                         enable_intr();
  560                 else
  561                         (*cpu_idle_hook)();
  562         }
  563 }
  564 
  565 /* Other subsystems (e.g., ACPI) can hook this later. */
  566 void (*cpu_idle_hook)(void) = cpu_idle_default;
  567 
  568 /*
  569  * Clear registers on exec
  570  */
  571 void
  572 exec_setregs(td, entry, stack, ps_strings)
  573         struct thread *td;
  574         u_long entry;
  575         u_long stack;
  576         u_long ps_strings;
  577 {
  578         struct trapframe *regs = td->td_frame;
  579         struct pcb *pcb = td->td_pcb;
  580         
  581         wrmsr(MSR_FSBASE, 0);
  582         wrmsr(MSR_KGSBASE, 0);  /* User value while we're in the kernel */
  583         pcb->pcb_fsbase = 0;
  584         pcb->pcb_gsbase = 0;
  585         load_ds(_udatasel);
  586         load_es(_udatasel);
  587         load_fs(_udatasel);
  588         load_gs(_udatasel);
  589         pcb->pcb_ds = _udatasel;
  590         pcb->pcb_es = _udatasel;
  591         pcb->pcb_fs = _udatasel;
  592         pcb->pcb_gs = _udatasel;
  593 
  594         bzero((char *)regs, sizeof(struct trapframe));
  595         regs->tf_rip = entry;
  596         regs->tf_rsp = ((stack - 8) & ~0xFul) + 8;
  597         regs->tf_rdi = stack;           /* argv */
  598         regs->tf_rflags = PSL_USER | (regs->tf_rflags & PSL_T);
  599         regs->tf_ss = _udatasel;
  600         regs->tf_cs = _ucodesel;
  601 
  602         /*
  603          * Reset the hardware debug registers if they were in use.
  604          * They won't have any meaning for the newly exec'd process.
  605          */
  606         if (pcb->pcb_flags & PCB_DBREGS) {
  607                 pcb->pcb_dr0 = 0;
  608                 pcb->pcb_dr1 = 0;
  609                 pcb->pcb_dr2 = 0;
  610                 pcb->pcb_dr3 = 0;
  611                 pcb->pcb_dr6 = 0;
  612                 pcb->pcb_dr7 = 0;
  613                 if (pcb == PCPU_GET(curpcb)) {
  614                         /*
  615                          * Clear the debug registers on the running
  616                          * CPU, otherwise they will end up affecting
  617                          * the next process we switch to.
  618                          */
  619                         reset_dbregs();
  620                 }
  621                 pcb->pcb_flags &= ~PCB_DBREGS;
  622         }
  623 
  624         /*
  625          * Drop the FP state if we hold it, so that the process gets a
  626          * clean FP state if it uses the FPU again.
  627          */
  628         fpstate_drop(td);
  629 }
  630 
  631 void
  632 cpu_setregs(void)
  633 {
  634         register_t cr0;
  635 
  636         cr0 = rcr0();
  637         /*
  638          * CR0_MP, CR0_NE and CR0_TS are also set by npx_probe() for the
  639          * BSP.  See the comments there about why we set them.
  640          */
  641         cr0 |= CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM;
  642         load_cr0(cr0);
  643 }
  644 
  645 static int
  646 sysctl_machdep_adjkerntz(SYSCTL_HANDLER_ARGS)
  647 {
  648         int error;
  649         error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2,
  650                 req);
  651         if (!error && req->newptr)
  652                 resettodr();
  653         return (error);
  654 }
  655 
  656 SYSCTL_PROC(_machdep, CPU_ADJKERNTZ, adjkerntz, CTLTYPE_INT|CTLFLAG_RW,
  657         &adjkerntz, 0, sysctl_machdep_adjkerntz, "I", "");
  658 
  659 SYSCTL_INT(_machdep, CPU_DISRTCSET, disable_rtc_set,
  660         CTLFLAG_RW, &disable_rtc_set, 0, "");
  661 
  662 SYSCTL_INT(_machdep, CPU_WALLCLOCK, wall_cmos_clock,
  663         CTLFLAG_RW, &wall_cmos_clock, 0, "");
  664 
  665 /*
  666  * Initialize amd64 and configure to run kernel
  667  */
  668 
  669 /*
  670  * Initialize segments & interrupt table
  671  */
  672 
  673 struct user_segment_descriptor gdt[NGDT * MAXCPU];/* global descriptor table */
  674 static struct gate_descriptor idt0[NIDT];
  675 struct gate_descriptor *idt = &idt0[0]; /* interrupt descriptor table */
  676 
  677 static char dblfault_stack[PAGE_SIZE] __aligned(16);
  678 
  679 struct amd64tss common_tss[MAXCPU];
  680 
  681 /* software prototypes -- in more palatable form */
  682 struct soft_segment_descriptor gdt_segs[] = {
  683 /* GNULL_SEL    0 Null Descriptor */
  684 {       0x0,                    /* segment base address  */
  685         0x0,                    /* length */
  686         0,                      /* segment type */
  687         0,                      /* segment descriptor priority level */
  688         0,                      /* segment descriptor present */
  689         0,                      /* long */
  690         0,                      /* default 32 vs 16 bit size */
  691         0                       /* limit granularity (byte/page units)*/ },
  692 /* GCODE_SEL    1 Code Descriptor for kernel */
  693 {       0x0,                    /* segment base address  */
  694         0xfffff,                /* length - all address space */
  695         SDT_MEMERA,             /* segment type */
  696         SEL_KPL,                /* segment descriptor priority level */
  697         1,                      /* segment descriptor present */
  698         1,                      /* long */
  699         0,                      /* default 32 vs 16 bit size */
  700         1                       /* limit granularity (byte/page units)*/ },
  701 /* GDATA_SEL    2 Data Descriptor for kernel */
  702 {       0x0,                    /* segment base address  */
  703         0xfffff,                /* length - all address space */
  704         SDT_MEMRWA,             /* segment type */
  705         SEL_KPL,                /* segment descriptor priority level */
  706         1,                      /* segment descriptor present */
  707         1,                      /* long */
  708         0,                      /* default 32 vs 16 bit size */
  709         1                       /* limit granularity (byte/page units)*/ },
  710 /* GUCODE32_SEL 3 32 bit Code Descriptor for user */
  711 {       0x0,                    /* segment base address  */
  712         0xfffff,                /* length - all address space */
  713         SDT_MEMERA,             /* segment type */
  714         SEL_UPL,                /* segment descriptor priority level */
  715         1,                      /* segment descriptor present */
  716         0,                      /* long */
  717         1,                      /* default 32 vs 16 bit size */
  718         1                       /* limit granularity (byte/page units)*/ },
  719 /* GUDATA_SEL   4 32/64 bit Data Descriptor for user */
  720 {       0x0,                    /* segment base address  */
  721         0xfffff,                /* length - all address space */
  722         SDT_MEMRWA,             /* segment type */
  723         SEL_UPL,                /* segment descriptor priority level */
  724         1,                      /* segment descriptor present */
  725         0,                      /* long */
  726         1,                      /* default 32 vs 16 bit size */
  727         1                       /* limit granularity (byte/page units)*/ },
  728 /* GUCODE_SEL   5 64 bit Code Descriptor for user */
  729 {       0x0,                    /* segment base address  */
  730         0xfffff,                /* length - all address space */
  731         SDT_MEMERA,             /* segment type */
  732         SEL_UPL,                /* segment descriptor priority level */
  733         1,                      /* segment descriptor present */
  734         1,                      /* long */
  735         0,                      /* default 32 vs 16 bit size */
  736         1                       /* limit granularity (byte/page units)*/ },
  737 /* GPROC0_SEL   6 Proc 0 Tss Descriptor */
  738 {
  739         0x0,                    /* segment base address */
  740         sizeof(struct amd64tss)-1,/* length - all address space */
  741         SDT_SYSTSS,             /* segment type */
  742         SEL_KPL,                /* segment descriptor priority level */
  743         1,                      /* segment descriptor present */
  744         0,                      /* long */
  745         0,                      /* unused - default 32 vs 16 bit size */
  746         0                       /* limit granularity (byte/page units)*/ },
  747 /* Actually, the TSS is a system descriptor which is double size */
  748 {       0x0,                    /* segment base address  */
  749         0x0,                    /* length */
  750         0,                      /* segment type */
  751         0,                      /* segment descriptor priority level */
  752         0,                      /* segment descriptor present */
  753         0,                      /* long */
  754         0,                      /* default 32 vs 16 bit size */
  755         0                       /* limit granularity (byte/page units)*/ },
  756 };
  757 
  758 void
  759 setidt(idx, func, typ, dpl, ist)
  760         int idx;
  761         inthand_t *func;
  762         int typ;
  763         int dpl;
  764         int ist;
  765 {
  766         struct gate_descriptor *ip;
  767 
  768         ip = idt + idx;
  769         ip->gd_looffset = (uintptr_t)func;
  770         ip->gd_selector = GSEL(GCODE_SEL, SEL_KPL);
  771         ip->gd_ist = ist;
  772         ip->gd_xx = 0;
  773         ip->gd_type = typ;
  774         ip->gd_dpl = dpl;
  775         ip->gd_p = 1;
  776         ip->gd_hioffset = ((uintptr_t)func)>>16 ;
  777 }
  778 
  779 #define IDTVEC(name)    __CONCAT(X,name)
  780 
  781 extern inthand_t
  782         IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
  783         IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
  784         IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
  785         IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align),
  786         IDTVEC(xmm), IDTVEC(dblfault),
  787         IDTVEC(fast_syscall), IDTVEC(fast_syscall32);
  788 
  789 void
  790 sdtossd(sd, ssd)
  791         struct user_segment_descriptor *sd;
  792         struct soft_segment_descriptor *ssd;
  793 {
  794 
  795         ssd->ssd_base  = (sd->sd_hibase << 24) | sd->sd_lobase;
  796         ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit;
  797         ssd->ssd_type  = sd->sd_type;
  798         ssd->ssd_dpl   = sd->sd_dpl;
  799         ssd->ssd_p     = sd->sd_p;
  800         ssd->ssd_long  = sd->sd_long;
  801         ssd->ssd_def32 = sd->sd_def32;
  802         ssd->ssd_gran  = sd->sd_gran;
  803 }
  804 
  805 void
  806 ssdtosd(ssd, sd)
  807         struct soft_segment_descriptor *ssd;
  808         struct user_segment_descriptor *sd;
  809 {
  810 
  811         sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
  812         sd->sd_hibase = (ssd->ssd_base >> 24) & 0xff;
  813         sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
  814         sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
  815         sd->sd_type  = ssd->ssd_type;
  816         sd->sd_dpl   = ssd->ssd_dpl;
  817         sd->sd_p     = ssd->ssd_p;
  818         sd->sd_long  = ssd->ssd_long;
  819         sd->sd_def32 = ssd->ssd_def32;
  820         sd->sd_gran  = ssd->ssd_gran;
  821 }
  822 
  823 void
  824 ssdtosyssd(ssd, sd)
  825         struct soft_segment_descriptor *ssd;
  826         struct system_segment_descriptor *sd;
  827 {
  828 
  829         sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
  830         sd->sd_hibase = (ssd->ssd_base >> 24) & 0xfffffffffful;
  831         sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
  832         sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
  833         sd->sd_type  = ssd->ssd_type;
  834         sd->sd_dpl   = ssd->ssd_dpl;
  835         sd->sd_p     = ssd->ssd_p;
  836         sd->sd_gran  = ssd->ssd_gran;
  837 }
  838 
  839 #if !defined(DEV_ATPIC) && defined(DEV_ISA)
  840 #include <isa/isavar.h>
  841 u_int
  842 isa_irq_pending(void)
  843 {
  844 
  845         return (0);
  846 }
  847 #endif
  848 
  849 #define PHYSMAP_SIZE    (2 * 8)
  850 
  851 u_int basemem;
  852 
  853 /*
  854  * Populate the (physmap) array with base/bound pairs describing the
  855  * available physical memory in the system, then test this memory and
  856  * build the phys_avail array describing the actually-available memory.
  857  *
  858  * If we cannot accurately determine the physical memory map, then use
  859  * value from the 0xE801 call, and failing that, the RTC.
  860  *
  861  * Total memory size may be set by the kernel environment variable
  862  * hw.physmem or the compile-time define MAXMEM.
  863  *
  864  * XXX first should be vm_paddr_t.
  865  */
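/*
 * Sketch of the layout built below, with hypothetical addresses:
 *
 *      physmap[0..1] = { 0x0000000000001000, 0x000000000009f000 }  base
 *      physmap[2..3] = { 0x0000000000100000, 0x00000000bfff0000 }  extended
 *
 * Even slots hold the start of a usable region and odd slots its end;
 * phys_avail[] is then derived pairwise by the page-by-page test below
 * and is terminated by a pair of zero entries.
 */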
  866 static void
  867 getmemsize(caddr_t kmdp, u_int64_t first)
  868 {
  869         int i, physmap_idx, pa_indx, da_indx;
  870         vm_paddr_t pa, physmap[PHYSMAP_SIZE];
  871         u_long physmem_tunable;
  872         pt_entry_t *pte;
  873         struct bios_smap *smapbase, *smap, *smapend;
  874         u_int32_t smapsize;
  875         quad_t dcons_addr, dcons_size;
  876 
  877         bzero(physmap, sizeof(physmap));
  878         basemem = 0;
  879         physmap_idx = 0;
  880 
  881         /*
  882          * get memory map from INT 15:E820, kindly supplied by the loader.
  883          *
  884          * subr_module.c says:
  885          * "Consumer may safely assume that size value precedes data."
  886          * ie: an int32_t immediately precedes smap.
  887          */
  888         smapbase = (struct bios_smap *)preload_search_info(kmdp,
  889             MODINFO_METADATA | MODINFOMD_SMAP);
  890         if (smapbase == NULL)
  891                 panic("No BIOS smap info from loader!");
  892 
  893         smapsize = *((u_int32_t *)smapbase - 1);
  894         smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize);
  895 
  896         for (smap = smapbase; smap < smapend; smap++) {
  897                 if (boothowto & RB_VERBOSE)
  898                         printf("SMAP type=%02x base=%016lx len=%016lx\n",
  899                             smap->type, smap->base, smap->length);
  900 
  901                 if (smap->type != 0x01)
  902                         continue;
  903 
  904                 if (smap->length == 0)
  905                         continue;
  906 
   907                 for (i = 0; i <= physmap_idx; i += 2)
   908                         if (smap->base < physmap[i + 1])
   909                                 break;
   910                 if (i <= physmap_idx) {
   911                         if (boothowto & RB_VERBOSE)
   912                                 printf("Overlapping or non-monotonic memory region, ignoring second region\n");
   913                         continue;
   914                 }
  915 
  916                 if (smap->base == physmap[physmap_idx + 1]) {
  917                         physmap[physmap_idx + 1] += smap->length;
  918                         continue;
  919                 }
  920 
  921                 physmap_idx += 2;
  922                 if (physmap_idx == PHYSMAP_SIZE) {
  923                         printf(
  924                 "Too many segments in the physical address map, giving up\n");
  925                         break;
  926                 }
  927                 physmap[physmap_idx] = smap->base;
  928                 physmap[physmap_idx + 1] = smap->base + smap->length;
  929         }
  930 
  931         /*
  932          * Find the 'base memory' segment for SMP
  933          */
  934         basemem = 0;
  935         for (i = 0; i <= physmap_idx; i += 2) {
  936                 if (physmap[i] == 0x00000000) {
  937                         basemem = physmap[i + 1] / 1024;
  938                         break;
  939                 }
  940         }
  941         if (basemem == 0)
  942                 panic("BIOS smap did not include a basemem segment!");
  943 
  944 #ifdef SMP
  945         /* make hole for AP bootstrap code */
  946         physmap[1] = mp_bootaddress(physmap[1] / 1024);
  947 #endif
  948 
  949         /*
  950          * Maxmem isn't the "maximum memory", it's one larger than the
  951          * highest page of the physical address space.  It should be
  952          * called something like "Maxphyspage".  We may adjust this
  953          * based on ``hw.physmem'' and the results of the memory test.
  954          */
  955         Maxmem = atop(physmap[physmap_idx + 1]);
  956 
  957 #ifdef MAXMEM
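        /*
         * MAXMEM (opt_maxmem.h) is specified in kilobytes; dividing by
         * 4 converts it to 4 KB pages (cf. the "%ldK" report below).
         */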
  958         Maxmem = MAXMEM / 4;
  959 #endif
  960 
  961         if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable))
  962                 Maxmem = atop(physmem_tunable);
  963 
  964         if (atop(physmap[physmap_idx + 1]) != Maxmem &&
  965             (boothowto & RB_VERBOSE))
  966                 printf("Physical memory use set to %ldK\n", Maxmem * 4);
  967 
  968         /*
  969          * If Maxmem has been increased beyond what the system has detected,
  970          * extend the last memory segment to the new limit.
  971          */
  972         if (atop(physmap[physmap_idx + 1]) < Maxmem)
  973                 physmap[physmap_idx + 1] = ptoa((vm_paddr_t)Maxmem);
  974 
  975         /* call pmap initialization to make new kernel address space */
  976         pmap_bootstrap(&first);
  977 
  978         /*
  979          * Size up each available chunk of physical memory.
  980          */
  981         physmap[0] = PAGE_SIZE;         /* mask off page 0 */
  982         pa_indx = 0;
  983         da_indx = 1;
  984         phys_avail[pa_indx++] = physmap[0];
  985         phys_avail[pa_indx] = physmap[0];
  986         dump_avail[da_indx] = physmap[0];
  987         pte = CMAP1;
  988 
  989         /*
  990          * Get dcons buffer address
  991          */
  992         if (getenv_quad("dcons.addr", &dcons_addr) == 0 ||
  993             getenv_quad("dcons.size", &dcons_size) == 0)
  994                 dcons_addr = 0;
  995 
  996         /*
  997          * physmap is in bytes, so when converting to page boundaries,
  998          * round up the start address and round down the end address.
  999          */
 1000         for (i = 0; i <= physmap_idx; i += 2) {
 1001                 vm_paddr_t end;
 1002 
 1003                 end = ptoa((vm_paddr_t)Maxmem);
 1004                 if (physmap[i + 1] < end)
 1005                         end = trunc_page(physmap[i + 1]);
 1006                 for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) {
 1007                         int tmp, page_bad, full;
 1008                         int *ptr = (int *)CADDR1;
 1009 
 1010                         full = FALSE;
 1011                         /*
 1012                          * block out kernel memory as not available.
 1013                          */
 1014                         if (pa >= 0x100000 && pa < first)
 1015                                 goto do_dump_avail;
 1016 
 1017                         /*
 1018                          * block out dcons buffer
 1019                          */
 1020                         if (dcons_addr > 0
 1021                             && pa >= trunc_page(dcons_addr)
 1022                             && pa < dcons_addr + dcons_size)
 1023                                 goto do_dump_avail;
 1024 
 1025                         page_bad = FALSE;
 1026 
 1027                         /*
 1028                          * map page into kernel: valid, read/write,non-cacheable
 1029                          */
 1030                         *pte = pa | PG_V | PG_RW | PG_N;
 1031                         invltlb();
 1032 
 1033                         tmp = *(int *)ptr;
 1034                         /*
 1035                          * Test for alternating 1's and 0's
 1036                          */
 1037                         *(volatile int *)ptr = 0xaaaaaaaa;
 1038                         if (*(volatile int *)ptr != 0xaaaaaaaa)
 1039                                 page_bad = TRUE;
 1040                         /*
 1041                          * Test for alternating 0's and 1's
 1042                          */
 1043                         *(volatile int *)ptr = 0x55555555;
 1044                         if (*(volatile int *)ptr != 0x55555555)
 1045                                 page_bad = TRUE;
 1046                         /*
 1047                          * Test for all 1's
 1048                          */
 1049                         *(volatile int *)ptr = 0xffffffff;
 1050                         if (*(volatile int *)ptr != 0xffffffff)
 1051                                 page_bad = TRUE;
 1052                         /*
 1053                          * Test for all 0's
 1054                          */
 1055                         *(volatile int *)ptr = 0x0;
 1056                         if (*(volatile int *)ptr != 0x0)
 1057                                 page_bad = TRUE;
 1058                         /*
 1059                          * Restore original value.
 1060                          */
 1061                         *(int *)ptr = tmp;
 1062 
 1063                         /*
 1064                          * Adjust array of valid/good pages.
 1065                          */
 1066                         if (page_bad == TRUE)
 1067                                 continue;
 1068                         /*
 1069                          * If this good page is a continuation of the
 1070                          * previous set of good pages, then just increase
 1071                          * the end pointer. Otherwise start a new chunk.
  1072                          * Note that "end" points one page past the
  1073                          * last valid page, making the range >= start
  1074                          * and < end.  If we're also doing a speculative
  1075                          * memory test and we're at or past the end,
  1076                          * bump up Maxmem so that we keep going.  The
  1077                          * first bad page will terminate the loop.
 1078                          */
 1079                         if (phys_avail[pa_indx] == pa) {
 1080                                 phys_avail[pa_indx] += PAGE_SIZE;
 1081                         } else {
 1082                                 pa_indx++;
 1083                                 if (pa_indx == PHYS_AVAIL_ARRAY_END) {
 1084                                         printf(
 1085                 "Too many holes in the physical address space, giving up\n");
 1086                                         pa_indx--;
 1087                                         full = TRUE;
 1088                                         goto do_dump_avail;
 1089                                 }
 1090                                 phys_avail[pa_indx++] = pa;     /* start */
 1091                                 phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */
 1092                         }
 1093                         physmem++;
 1094 do_dump_avail:
 1095                         if (dump_avail[da_indx] == pa) {
 1096                                 dump_avail[da_indx] += PAGE_SIZE;
 1097                         } else {
 1098                                 da_indx++;
 1099                                 if (da_indx == DUMP_AVAIL_ARRAY_END) {
 1100                                         da_indx--;
 1101                                         goto do_next;
 1102                                 }
 1103                                 dump_avail[da_indx++] = pa; /* start */
 1104                                 dump_avail[da_indx] = pa + PAGE_SIZE; /* end */
 1105                         }
 1106 do_next:
 1107                         if (full)
 1108                                 break;
 1109                 }
 1110         }
 1111         *pte = 0;
 1112         invltlb();
 1113 
 1114         /*
 1115          * XXX
 1116          * The last chunk must contain at least one page plus the message
 1117          * buffer to avoid complicating other code (message buffer address
 1118          * calculation, etc.).
 1119          */
 1120         while (phys_avail[pa_indx - 1] + PAGE_SIZE +
 1121             round_page(MSGBUF_SIZE) >= phys_avail[pa_indx]) {
 1122                 physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]);
 1123                 phys_avail[pa_indx--] = 0;
 1124                 phys_avail[pa_indx--] = 0;
 1125         }
 1126 
 1127         Maxmem = atop(phys_avail[pa_indx]);
 1128 
 1129         /* Trim off space for the message buffer. */
 1130         phys_avail[pa_indx] -= round_page(MSGBUF_SIZE);
 1131 
 1132         avail_end = phys_avail[pa_indx];
 1133 }
 1134 
 1135 u_int64_t
 1136 hammer_time(u_int64_t modulep, u_int64_t physfree)
 1137 {
 1138         caddr_t kmdp;
 1139         int gsel_tss, off, x;
 1140         struct pcpu *pc;
 1141         u_int64_t msr;
 1142         char *env;
 1143 
 1144 #ifdef DEV_ISA
 1145         /* Preemptively mask the atpics and leave them shut down */
 1146         outb(IO_ICU1 + ICU_IMR_OFFSET, 0xff);
 1147         outb(IO_ICU2 + ICU_IMR_OFFSET, 0xff);
 1148 #else
  1149 #error "have you forgotten the isa device?"
 1150 #endif
 1151 
 1152         thread0.td_kstack = physfree + KERNBASE;
 1153         bzero((void *)thread0.td_kstack, KSTACK_PAGES * PAGE_SIZE);
 1154         physfree += KSTACK_PAGES * PAGE_SIZE;
 1155         thread0.td_pcb = (struct pcb *)
 1156            (thread0.td_kstack + KSTACK_PAGES * PAGE_SIZE) - 1;
 1157 
 1158         /*
 1159          * This may be done better later if it gets more high level
 1160          * components in it. If so just link td->td_proc here.
 1161          */
 1162         proc_linkup(&proc0, &ksegrp0, &thread0);
 1163 
 1164         preload_metadata = (caddr_t)(uintptr_t)(modulep + KERNBASE);
 1165         preload_bootstrap_relocate(KERNBASE);
 1166         kmdp = preload_search_by_type("elf kernel");
 1167         if (kmdp == NULL)
 1168                 kmdp = preload_search_by_type("elf64 kernel");
 1169         boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int);
 1170         kern_envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *) + KERNBASE;
 1171 #ifdef DDB
 1172         ksym_start = MD_FETCH(kmdp, MODINFOMD_SSYM, uintptr_t);
 1173         ksym_end = MD_FETCH(kmdp, MODINFOMD_ESYM, uintptr_t);
 1174 #endif
 1175 
 1176         /* Init basic tunables, hz etc */
 1177         init_param1();
 1178 
 1179         /*
 1180          * make gdt memory segments
 1181          */
 1182         gdt_segs[GPROC0_SEL].ssd_base = (uintptr_t)&common_tss[0];
 1183 
 1184         for (x = 0; x < NGDT; x++) {
 1185                 if (x != GPROC0_SEL && x != (GPROC0_SEL + 1))
 1186                         ssdtosd(&gdt_segs[x], &gdt[x]);
 1187         }
 1188         ssdtosyssd(&gdt_segs[GPROC0_SEL],
 1189             (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);
 1190 
 1191         r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
 1192         r_gdt.rd_base =  (long) gdt;
 1193         lgdt(&r_gdt);
 1194         pc = &__pcpu[0];
 1195 
 1196         wrmsr(MSR_FSBASE, 0);           /* User value */
 1197         wrmsr(MSR_GSBASE, (u_int64_t)pc);
 1198         wrmsr(MSR_KGSBASE, 0);          /* User value while in the kernel */
 1199 
 1200         pcpu_init(pc, 0, sizeof(struct pcpu));
 1201         PCPU_SET(prvspace, pc);
 1202         PCPU_SET(curthread, &thread0);
 1203         PCPU_SET(curpcb, thread0.td_pcb);
 1204         PCPU_SET(tssp, &common_tss[0]);
 1205 
 1206         /*
 1207          * Initialize mutexes.
 1208          *
 1209          * icu_lock: in order to allow an interrupt to occur in a critical
 1210          *           section, to set pcpu->ipending (etc...) properly, we
 1211          *           must be able to get the icu lock, so it can't be
 1212          *           under witness.
 1213          */
 1214         mutex_init();
 1215         mtx_init(&clock_lock, "clk", NULL, MTX_SPIN);
 1216         mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS);
 1217 
 1218         /* exceptions */
 1219         for (x = 0; x < NIDT; x++)
 1220                 setidt(x, &IDTVEC(rsvd), SDT_SYSIGT, SEL_KPL, 0);
 1221         setidt(IDT_DE, &IDTVEC(div),  SDT_SYSIGT, SEL_KPL, 0);
 1222         setidt(IDT_DB, &IDTVEC(dbg),  SDT_SYSIGT, SEL_KPL, 0);
 1223         setidt(IDT_NMI, &IDTVEC(nmi),  SDT_SYSIGT, SEL_KPL, 0);
 1224         setidt(IDT_BP, &IDTVEC(bpt),  SDT_SYSIGT, SEL_UPL, 0);
 1225         setidt(IDT_OF, &IDTVEC(ofl),  SDT_SYSIGT, SEL_KPL, 0);
 1226         setidt(IDT_BR, &IDTVEC(bnd),  SDT_SYSIGT, SEL_KPL, 0);
 1227         setidt(IDT_UD, &IDTVEC(ill),  SDT_SYSIGT, SEL_KPL, 0);
 1228         setidt(IDT_NM, &IDTVEC(dna),  SDT_SYSIGT, SEL_KPL, 0);
 1229         setidt(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1);
 1230         setidt(IDT_FPUGP, &IDTVEC(fpusegm),  SDT_SYSIGT, SEL_KPL, 0);
 1231         setidt(IDT_TS, &IDTVEC(tss),  SDT_SYSIGT, SEL_KPL, 0);
 1232         setidt(IDT_NP, &IDTVEC(missing),  SDT_SYSIGT, SEL_KPL, 0);
 1233         setidt(IDT_SS, &IDTVEC(stk),  SDT_SYSIGT, SEL_KPL, 0);
 1234         setidt(IDT_GP, &IDTVEC(prot),  SDT_SYSIGT, SEL_KPL, 0);
 1235         setidt(IDT_PF, &IDTVEC(page),  SDT_SYSIGT, SEL_KPL, 0);
 1236         setidt(IDT_MF, &IDTVEC(fpu),  SDT_SYSIGT, SEL_KPL, 0);
 1237         setidt(IDT_AC, &IDTVEC(align), SDT_SYSIGT, SEL_KPL, 0);
 1238         setidt(IDT_MC, &IDTVEC(mchk),  SDT_SYSIGT, SEL_KPL, 0);
 1239         setidt(IDT_XF, &IDTVEC(xmm), SDT_SYSIGT, SEL_KPL, 0);
 1240 
 1241         r_idt.rd_limit = sizeof(idt0) - 1;
 1242         r_idt.rd_base = (long) idt;
 1243         lidt(&r_idt);
 1244 
 1245         /*
 1246          * Initialize the console before we print anything out.
 1247          */
 1248         cninit();
 1249 
 1250 #ifdef DEV_ATPIC
 1251         elcr_probe();
 1252         atpic_startup();
 1253 #endif
 1254 
 1255         kdb_init();
 1256 
 1257 #ifdef KDB
 1258         if (boothowto & RB_KDB)
 1259                 kdb_enter("Boot flags requested debugger");
 1260 #endif
 1261 
 1262         identify_cpu();         /* Final stage of CPU initialization */
 1263         initializecpu();        /* Initialize CPU registers */
 1264 
 1265         /* make an initial tss so cpu can get interrupt stack on syscall! */
  1266         common_tss[0].tss_rsp0 = thread0.td_kstack +
 1267             KSTACK_PAGES * PAGE_SIZE - sizeof(struct pcb);
 1268         /* Ensure the stack is aligned to 16 bytes */
 1269         common_tss[0].tss_rsp0 &= ~0xFul;
 1270         PCPU_SET(rsp0, common_tss[0].tss_rsp0);
 1271 
 1272         /* doublefault stack space, runs on ist1 */
 1273         common_tss[0].tss_ist1 = (long)&dblfault_stack[sizeof(dblfault_stack)];
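        /*
         * Routing IDT_DF through IST slot 1 (see setidt(IDT_DF, ...)
         * above) makes the CPU switch to this known-good stack on a
         * double fault, even when %rsp itself is what went bad.
         */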
 1274 
 1275         /* Set the IO permission bitmap (empty due to tss seg limit) */
 1276         common_tss[0].tss_iobase = sizeof(struct amd64tss);
 1277 
 1278         gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
 1279         ltr(gsel_tss);
 1280 
 1281         /* Set up the fast syscall stuff */
 1282         msr = rdmsr(MSR_EFER) | EFER_SCE;
 1283         wrmsr(MSR_EFER, msr);
 1284         wrmsr(MSR_LSTAR, (u_int64_t)IDTVEC(fast_syscall));
 1285         wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32));
 1286         msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
 1287               ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48);
 1288         wrmsr(MSR_STAR, msr);
 1289         wrmsr(MSR_SF_MASK, PSL_NT|PSL_T|PSL_I|PSL_C|PSL_D);
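        /*
         * MSR_STAR packs the syscall kernel %cs selector into bits
         * 47:32 and the sysret user selector base into bits 63:48;
         * sysret loads 64-bit user %cs as base + 16 and %ss as
         * base + 8, which is why GUCODE32_SEL (not GUCODE_SEL) is
         * written into the upper field here.
         */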
 1290 
 1291         getmemsize(kmdp, physfree);
 1292         init_param2(physmem);
 1293 
  1294         /* Now running on new page tables, configured, and u/iom is accessible. */
 1295 
 1296         /* Map the message buffer. */
 1297         for (off = 0; off < round_page(MSGBUF_SIZE); off += PAGE_SIZE)
 1298                 pmap_kenter((vm_offset_t)msgbufp + off, avail_end + off);
 1299 
 1300         msgbufinit(msgbufp, MSGBUF_SIZE);
 1301         fpuinit();
 1302 
 1303         /* transfer to user mode */
 1304 
 1305         _ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
 1306         _udatasel = GSEL(GUDATA_SEL, SEL_UPL);
 1307         _ucode32sel = GSEL(GUCODE32_SEL, SEL_UPL);
 1308 
 1309         /* setup proc 0's pcb */
 1310         thread0.td_pcb->pcb_flags = 0; /* XXXKSE */
 1311         thread0.td_pcb->pcb_cr3 = KPML4phys;
 1312         thread0.td_frame = &proc0_tf;
 1313 
 1314         env = getenv("kernelname");
 1315         if (env != NULL)
 1316                 strlcpy(kernelname, env, sizeof(kernelname));
 1317 
 1318         /* Location of kernel stack for locore */
 1319         return ((u_int64_t)thread0.td_pcb);
 1320 }
 1321 
 1322 void
 1323 cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size)
 1324 {
 1325 
 1326         pcpu->pc_acpi_id = 0xffffffff;
 1327 }
 1328 
 1329 void
 1330 spinlock_enter(void)
 1331 {
 1332         struct thread *td;
 1333 
 1334         td = curthread;
 1335         if (td->td_md.md_spinlock_count == 0)
 1336                 td->td_md.md_saved_flags = intr_disable();
 1337         td->td_md.md_spinlock_count++;
 1338         critical_enter();
 1339 }
 1340 
 1341 void
 1342 spinlock_exit(void)
 1343 {
 1344         struct thread *td;
 1345 
 1346         td = curthread;
 1347         critical_exit();
 1348         td->td_md.md_spinlock_count--;
 1349         if (td->td_md.md_spinlock_count == 0)
 1350                 intr_restore(td->td_md.md_saved_flags);
 1351 }
 1352 
 1353 /*
 1354  * Construct a PCB from a trapframe. This is called from kdb_trap() where
 1355  * we want to start a backtrace from the function that caused us to enter
 1356  * the debugger. We have the context in the trapframe, but base the trace
 1357  * on the PCB. The PCB doesn't have to be perfect, as long as it contains
 1358  * enough for a backtrace.
 1359  */
 1360 void
 1361 makectx(struct trapframe *tf, struct pcb *pcb)
 1362 {
 1363 
 1364         pcb->pcb_r12 = tf->tf_r12;
 1365         pcb->pcb_r13 = tf->tf_r13;
 1366         pcb->pcb_r14 = tf->tf_r14;
 1367         pcb->pcb_r15 = tf->tf_r15;
 1368         pcb->pcb_rbp = tf->tf_rbp;
 1369         pcb->pcb_rbx = tf->tf_rbx;
 1370         pcb->pcb_rip = tf->tf_rip;
 1371         pcb->pcb_rsp = (ISPL(tf->tf_cs)) ? tf->tf_rsp : (long)(tf + 1) - 8;
 1372 }
 1373 
 1374 int
 1375 ptrace_set_pc(struct thread *td, unsigned long addr)
 1376 {
 1377         td->td_frame->tf_rip = addr;
 1378         return (0);
 1379 }
 1380 
 1381 int
 1382 ptrace_single_step(struct thread *td)
 1383 {
 1384         td->td_frame->tf_rflags |= PSL_T;
 1385         return (0);
 1386 }
 1387 
 1388 int
 1389 ptrace_clear_single_step(struct thread *td)
 1390 {
 1391         td->td_frame->tf_rflags &= ~PSL_T;
 1392         return (0);
 1393 }
 1394 
 1395 int
 1396 fill_regs(struct thread *td, struct reg *regs)
 1397 {
 1398         struct trapframe *tp;
 1399 
 1400         tp = td->td_frame;
 1401         regs->r_r15 = tp->tf_r15;
 1402         regs->r_r14 = tp->tf_r14;
 1403         regs->r_r13 = tp->tf_r13;
 1404         regs->r_r12 = tp->tf_r12;
 1405         regs->r_r11 = tp->tf_r11;
 1406         regs->r_r10 = tp->tf_r10;
 1407         regs->r_r9  = tp->tf_r9;
 1408         regs->r_r8  = tp->tf_r8;
 1409         regs->r_rdi = tp->tf_rdi;
 1410         regs->r_rsi = tp->tf_rsi;
 1411         regs->r_rbp = tp->tf_rbp;
 1412         regs->r_rbx = tp->tf_rbx;
 1413         regs->r_rdx = tp->tf_rdx;
 1414         regs->r_rcx = tp->tf_rcx;
 1415         regs->r_rax = tp->tf_rax;
 1416         regs->r_rip = tp->tf_rip;
 1417         regs->r_cs = tp->tf_cs;
 1418         regs->r_rflags = tp->tf_rflags;
 1419         regs->r_rsp = tp->tf_rsp;
 1420         regs->r_ss = tp->tf_ss;
 1421         return (0);
 1422 }
 1423 
 1424 int
 1425 set_regs(struct thread *td, struct reg *regs)
 1426 {
 1427         struct trapframe *tp;
 1428         register_t rflags;
 1429 
 1430         tp = td->td_frame;
 1431         rflags = regs->r_rflags & 0xffffffff;
 1432         if (!EFL_SECURE(rflags, tp->tf_rflags) || !CS_SECURE(regs->r_cs))
 1433                 return (EINVAL);
 1434         tp->tf_r15 = regs->r_r15;
 1435         tp->tf_r14 = regs->r_r14;
 1436         tp->tf_r13 = regs->r_r13;
 1437         tp->tf_r12 = regs->r_r12;
 1438         tp->tf_r11 = regs->r_r11;
 1439         tp->tf_r10 = regs->r_r10;
 1440         tp->tf_r9  = regs->r_r9;
 1441         tp->tf_r8  = regs->r_r8;
 1442         tp->tf_rdi = regs->r_rdi;
 1443         tp->tf_rsi = regs->r_rsi;
 1444         tp->tf_rbp = regs->r_rbp;
 1445         tp->tf_rbx = regs->r_rbx;
 1446         tp->tf_rdx = regs->r_rdx;
 1447         tp->tf_rcx = regs->r_rcx;
 1448         tp->tf_rax = regs->r_rax;
 1449         tp->tf_rip = regs->r_rip;
 1450         tp->tf_cs = regs->r_cs;
 1451         tp->tf_rflags = rflags;
 1452         tp->tf_rsp = regs->r_rsp;
 1453         tp->tf_ss = regs->r_ss;
 1454         td->td_pcb->pcb_flags |= PCB_FULLCTX;
 1455         return (0);
 1456 }
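
/*
 * For reference: EFL_SECURE() and CS_SECURE() are defined earlier in
 * this file, outside this excerpt.  They should amount to the
 * following reconstruction -- the new rflags may differ from the old
 * only in the user-changeable PSL_USERCHANGE bits, and the new %cs
 * must remain at user privilege:
 *
 *	#define CS_SECURE(cs)		(ISPL(cs) == SEL_UPL)
 *	#define EFL_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
 */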
 1457 
 1458 /* XXX check all this stuff! */
 1459 /* externalize from sv_xmm */
 1460 static void
 1461 fill_fpregs_xmm(struct savefpu *sv_xmm, struct fpreg *fpregs)
 1462 {
 1463         struct envxmm *penv_fpreg = (struct envxmm *)&fpregs->fpr_env;
 1464         struct envxmm *penv_xmm = &sv_xmm->sv_env;
 1465         int i;
 1466 
 1467         /* pcb -> fpregs */
 1468         bzero(fpregs, sizeof(*fpregs));
 1469 
 1470         /* FPU control/status */
 1471         penv_fpreg->en_cw = penv_xmm->en_cw;
 1472         penv_fpreg->en_sw = penv_xmm->en_sw;
 1473         penv_fpreg->en_tw = penv_xmm->en_tw;
 1474         penv_fpreg->en_opcode = penv_xmm->en_opcode;
 1475         penv_fpreg->en_rip = penv_xmm->en_rip;
 1476         penv_fpreg->en_rdp = penv_xmm->en_rdp;
 1477         penv_fpreg->en_mxcsr = penv_xmm->en_mxcsr;
 1478         penv_fpreg->en_mxcsr_mask = penv_xmm->en_mxcsr_mask;
 1479 
 1480         /* FPU registers */
 1481         for (i = 0; i < 8; ++i)
 1482                 bcopy(sv_xmm->sv_fp[i].fp_acc.fp_bytes, fpregs->fpr_acc[i], 10);
 1483 
 1484         /* SSE registers */
 1485         for (i = 0; i < 16; ++i)
 1486                 bcopy(sv_xmm->sv_xmm[i].xmm_bytes, fpregs->fpr_xacc[i], 16);
 1487 }
 1488 
 1489 /* internalize from fpregs into sv_xmm */
 1490 static void
 1491 set_fpregs_xmm(struct fpreg *fpregs, struct savefpu *sv_xmm)
 1492 {
 1493         struct envxmm *penv_xmm = &sv_xmm->sv_env;
 1494         struct envxmm *penv_fpreg = (struct envxmm *)&fpregs->fpr_env;
 1495         int i;
 1496 
 1497         /* fpregs -> pcb */
 1498         /* FPU control/status */
 1499         penv_xmm->en_cw = penv_fpreg->en_cw;
 1500         penv_xmm->en_sw = penv_fpreg->en_sw;
 1501         penv_xmm->en_tw = penv_fpreg->en_tw;
 1502         penv_xmm->en_opcode = penv_fpreg->en_opcode;
 1503         penv_xmm->en_rip = penv_fpreg->en_rip;
 1504         penv_xmm->en_rdp = penv_fpreg->en_rdp;
 1505         penv_xmm->en_mxcsr = penv_fpreg->en_mxcsr;
 1506         penv_xmm->en_mxcsr_mask = penv_fpreg->en_mxcsr_mask;
 1507 
 1508         /* FPU registers */
 1509         for (i = 0; i < 8; ++i)
 1510                 bcopy(fpregs->fpr_acc[i], sv_xmm->sv_fp[i].fp_acc.fp_bytes, 10);
 1511 
 1512         /* SSE registers */
 1513         for (i = 0; i < 16; ++i)
 1514                 bcopy(fpregs->fpr_xacc[i], sv_xmm->sv_xmm[i].xmm_bytes, 16);
 1515 }
 1516 
 1517 /* externalize from td->pcb */
 1518 int
 1519 fill_fpregs(struct thread *td, struct fpreg *fpregs)
 1520 {
 1521 
 1522         fill_fpregs_xmm(&td->td_pcb->pcb_save, fpregs);
 1523         return (0);
 1524 }
 1525 
 1526 /* internalize to td->pcb */
 1527 int
 1528 set_fpregs(struct thread *td, struct fpreg *fpregs)
 1529 {
 1530 
 1531         set_fpregs_xmm(fpregs, &td->td_pcb->pcb_save);
 1532         return (0);
 1533 }
 1534 
 1535 /*
 1536  * Get machine context.
 1537  */
 1538 int
 1539 get_mcontext(struct thread *td, mcontext_t *mcp, int flags)
 1540 {
 1541         struct trapframe *tp;
 1542 
 1543         tp = td->td_frame;
 1544         PROC_LOCK(curthread->td_proc);
 1545         mcp->mc_onstack = sigonstack(tp->tf_rsp);
 1546         PROC_UNLOCK(curthread->td_proc);
 1547         mcp->mc_r15 = tp->tf_r15;
 1548         mcp->mc_r14 = tp->tf_r14;
 1549         mcp->mc_r13 = tp->tf_r13;
 1550         mcp->mc_r12 = tp->tf_r12;
 1551         mcp->mc_r11 = tp->tf_r11;
 1552         mcp->mc_r10 = tp->tf_r10;
 1553         mcp->mc_r9  = tp->tf_r9;
 1554         mcp->mc_r8  = tp->tf_r8;
 1555         mcp->mc_rdi = tp->tf_rdi;
 1556         mcp->mc_rsi = tp->tf_rsi;
 1557         mcp->mc_rbp = tp->tf_rbp;
 1558         mcp->mc_rbx = tp->tf_rbx;
 1559         mcp->mc_rcx = tp->tf_rcx;
 1560         if (flags & GET_MC_CLEAR_RET) {
 1561                 mcp->mc_rax = 0;
 1562                 mcp->mc_rdx = 0;
 1563         } else {
 1564                 mcp->mc_rax = tp->tf_rax;
 1565                 mcp->mc_rdx = tp->tf_rdx;
 1566         }
 1567         mcp->mc_rip = tp->tf_rip;
 1568         mcp->mc_cs = tp->tf_cs;
 1569         mcp->mc_rflags = tp->tf_rflags;
 1570         mcp->mc_rsp = tp->tf_rsp;
 1571         mcp->mc_ss = tp->tf_ss;
 1572         mcp->mc_len = sizeof(*mcp);
 1573         get_fpcontext(td, mcp);
 1574         return (0);
 1575 }
 1576 
 1577 /*
 1578  * Set machine context.
 1579  *
 1580  * Note that we set only the user-modifiable rflags bits, and we never
 1581  * touch the cs selector.
 1582  */
 1583 int
 1584 set_mcontext(struct thread *td, const mcontext_t *mcp)
 1585 {
 1586         struct trapframe *tp;
 1587         long rflags;
 1588         int ret;
 1589 
 1590         tp = td->td_frame;
 1591         if (mcp->mc_len != sizeof(*mcp))
 1592                 return (EINVAL);
 1593         rflags = (mcp->mc_rflags & PSL_USERCHANGE) |
 1594             (tp->tf_rflags & ~PSL_USERCHANGE);
 1595         ret = set_fpcontext(td, mcp);
 1596         if (ret != 0)
 1597                 return (ret);
 1598         tp->tf_r15 = mcp->mc_r15;
 1599         tp->tf_r14 = mcp->mc_r14;
 1600         tp->tf_r13 = mcp->mc_r13;
 1601         tp->tf_r12 = mcp->mc_r12;
 1602         tp->tf_r11 = mcp->mc_r11;
 1603         tp->tf_r10 = mcp->mc_r10;
 1604         tp->tf_r9  = mcp->mc_r9;
 1605         tp->tf_r8  = mcp->mc_r8;
 1606         tp->tf_rdi = mcp->mc_rdi;
 1607         tp->tf_rsi = mcp->mc_rsi;
 1608         tp->tf_rbp = mcp->mc_rbp;
 1609         tp->tf_rbx = mcp->mc_rbx;
 1610         tp->tf_rdx = mcp->mc_rdx;
 1611         tp->tf_rcx = mcp->mc_rcx;
 1612         tp->tf_rax = mcp->mc_rax;
 1613         tp->tf_rip = mcp->mc_rip;
 1614         tp->tf_rflags = rflags;
 1615         tp->tf_rsp = mcp->mc_rsp;
 1616         tp->tf_ss = mcp->mc_ss;
 1617         td->td_pcb->pcb_flags |= PCB_FULLCTX;
 1618         return (0);
 1619 }
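
/*
 * For illustration only (not part of machdep.c): the userland view.
 * The getcontext(2)/setcontext(2) family is serviced by get_mcontext()
 * and set_mcontext() above.  A minimal sketch:
 */
#include <stdio.h>
#include <ucontext.h>

static ucontext_t main_ctx, func_ctx;
static char func_stack[64 * 1024];

static void
hello(void)
{
	printf("in hello()\n");		/* returning resumes uc_link */
}

int
main(void)
{
	getcontext(&func_ctx);			/* snapshot -> get_mcontext() */
	func_ctx.uc_stack.ss_sp = func_stack;
	func_ctx.uc_stack.ss_size = sizeof(func_stack);
	func_ctx.uc_link = &main_ctx;
	makecontext(&func_ctx, hello, 0);
	swapcontext(&main_ctx, &func_ctx);	/* reload -> set_mcontext() */
	printf("back in main()\n");
	return (0);
}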
 1620 
 1621 static void
 1622 get_fpcontext(struct thread *td, mcontext_t *mcp)
 1623 {
 1624 
 1625         mcp->mc_ownedfp = fpugetregs(td, (struct savefpu *)&mcp->mc_fpstate);
 1626         mcp->mc_fpformat = fpuformat();
 1627 }
 1628 
 1629 static int
 1630 set_fpcontext(struct thread *td, const mcontext_t *mcp)
 1631 {
 1632 
 1633         if (mcp->mc_fpformat == _MC_FPFMT_NODEV)
 1634                 return (0);
 1635         else if (mcp->mc_fpformat != _MC_FPFMT_XMM)
 1636                 return (EINVAL);
 1637         else if (mcp->mc_ownedfp == _MC_FPOWNED_NONE)
 1638                 /* We don't care what state is left in the FPU or PCB. */
 1639                 fpstate_drop(td);
 1640         else if (mcp->mc_ownedfp == _MC_FPOWNED_FPU ||
 1641             mcp->mc_ownedfp == _MC_FPOWNED_PCB) {
 1642                 /*
 1643                  * XXX we violate the dubious requirement that fpusetregs()
 1644                  * be called with interrupts disabled.
 1645                  * XXX obsolete on trap-16 systems?
 1646                  */
 1647                 fpusetregs(td, (struct savefpu *)&mcp->mc_fpstate);
 1648         } else
 1649                 return (EINVAL);
 1650         return (0);
 1651 }
 1652 
 1653 void
 1654 fpstate_drop(struct thread *td)
 1655 {
 1656         register_t s;
 1657 
 1658         s = intr_disable();
 1659         if (PCPU_GET(fpcurthread) == td)
 1660                 fpudrop();
 1661         /*
 1662          * XXX force a full drop of the fpu.  The above only drops it if we
 1663          * owned it.
 1664          *
 1665          * XXX I don't much like fpugetregs()'s semantics of doing a full
 1666          * drop.  Dropping only to the pcb matches fnsave's behaviour.
 1667          * We only need to drop to !PCB_FPUINITDONE in sendsig().  But
 1668          * sendsig() is the only caller of fpugetregs()... perhaps we just
 1669          * have too many layers.
 1670          */
 1671         curthread->td_pcb->pcb_flags &= ~PCB_FPUINITDONE;
 1672         intr_restore(s);
 1673 }
 1674 
 1675 int
 1676 fill_dbregs(struct thread *td, struct dbreg *dbregs)
 1677 {
 1678         struct pcb *pcb;
 1679 
 1680         if (td == NULL) {
 1681                 dbregs->dr[0] = rdr0();
 1682                 dbregs->dr[1] = rdr1();
 1683                 dbregs->dr[2] = rdr2();
 1684                 dbregs->dr[3] = rdr3();
 1685                 dbregs->dr[6] = rdr6();
 1686                 dbregs->dr[7] = rdr7();
 1687         } else {
 1688                 pcb = td->td_pcb;
 1689                 dbregs->dr[0] = pcb->pcb_dr0;
 1690                 dbregs->dr[1] = pcb->pcb_dr1;
 1691                 dbregs->dr[2] = pcb->pcb_dr2;
 1692                 dbregs->dr[3] = pcb->pcb_dr3;
 1693                 dbregs->dr[6] = pcb->pcb_dr6;
 1694                 dbregs->dr[7] = pcb->pcb_dr7;
 1695         }
 1696         dbregs->dr[4] = 0;
 1697         dbregs->dr[5] = 0;
 1698         dbregs->dr[8] = 0;
 1699         dbregs->dr[9] = 0;
 1700         dbregs->dr[10] = 0;
 1701         dbregs->dr[11] = 0;
 1702         dbregs->dr[12] = 0;
 1703         dbregs->dr[13] = 0;
 1704         dbregs->dr[14] = 0;
 1705         dbregs->dr[15] = 0;
 1706         return (0);
 1707 }
 1708 
 1709 int
 1710 set_dbregs(struct thread *td, struct dbreg *dbregs)
 1711 {
 1712         struct pcb *pcb;
 1713         int i;
 1714         u_int64_t mask1, mask2;
 1715 
 1716         if (td == NULL) {
 1717                 load_dr0(dbregs->dr[0]);
 1718                 load_dr1(dbregs->dr[1]);
 1719                 load_dr2(dbregs->dr[2]);
 1720                 load_dr3(dbregs->dr[3]);
 1721                 load_dr6(dbregs->dr[6]);
 1722                 load_dr7(dbregs->dr[7]);
 1723         } else {
 1724                 /*
 1725                  * Don't let an illegal value for dr7 get set.  Specifically,
 1726                  * check for undefined settings.  Setting these bit patterns
 1727                  * results in undefined behaviour and can lead to an unexpected
 1728                  * TRCTRAP or a general protection fault right here.
 1729                  * The upper 32 bits of dr6 and dr7 must not be set.
 1730                  */
 1731                 for (i = 0, mask1 = 0x3<<16, mask2 = 0x2<<16; i < 8;
 1732                      i++, mask1 <<= 2, mask2 <<= 2)
 1733                         if ((dbregs->dr[7] & mask1) == mask2)
 1734                                 return (EINVAL);
 1735                 if ((dbregs->dr[6] & 0xffffffff00000000ul) != 0 ||
 1736                     (dbregs->dr[7] & 0xffffffff00000000ul) != 0)
 1737                         return (EINVAL);
 1738 
 1739                 pcb = td->td_pcb;
 1740 
 1741                 /*
 1742                  * Don't let a process set a breakpoint that is not within the
 1743                  * process's address space.  If a process could do this, it
 1744                  * could halt the system by setting a breakpoint in the kernel
 1745                  * (if ddb was enabled).  Thus, we need to check to make sure
 1746                  * that no breakpoints are being enabled for addresses outside
 1747                  * the process's address space, unless, perhaps, we were called by
 1748                  * uid 0.
 1749                  *
 1750                  * XXX - what about when the watched area of the user's
 1751                  * address space is written into from within the kernel
 1752                  * ... wouldn't that still cause a breakpoint to be generated
 1753                  * from within kernel mode?
 1754                  */
 1755 
 1756                 if (suser(td) != 0) {
 1757                         if (dbregs->dr[7] & 0x3) {
 1758                                 /* dr0 is enabled */
 1759                                 if (dbregs->dr[0] >= VM_MAXUSER_ADDRESS)
 1760                                         return (EINVAL);
 1761                         }
 1762                         if (dbregs->dr[7] & 0x3<<2) {
 1763                                 /* dr1 is enabled */
 1764                                 if (dbregs->dr[1] >= VM_MAXUSER_ADDRESS)
 1765                                         return (EINVAL);
 1766                         }
 1767                         if (dbregs->dr[7] & 0x3<<4) {
 1768                                 /* dr2 is enabled */
 1769                                 if (dbregs->dr[2] >= VM_MAXUSER_ADDRESS)
 1770                                         return (EINVAL);
 1771                         }
 1772                         if (dbregs->dr[7] & 0x3<<6) {
 1773                                 /* dr3 is enabled */
 1774                                 if (dbregs->dr[3] >= VM_MAXUSER_ADDRESS)
 1775                                         return (EINVAL);
 1776                         }
 1777                 }
 1778 
 1779                 pcb->pcb_dr0 = dbregs->dr[0];
 1780                 pcb->pcb_dr1 = dbregs->dr[1];
 1781                 pcb->pcb_dr2 = dbregs->dr[2];
 1782                 pcb->pcb_dr3 = dbregs->dr[3];
 1783                 pcb->pcb_dr6 = dbregs->dr[6];
 1784                 pcb->pcb_dr7 = dbregs->dr[7];
 1785 
 1786                 pcb->pcb_flags |= PCB_DBREGS;
 1787         }
 1788 
 1789         return (0);
 1790 }
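
/*
 * For illustration only (not part of machdep.c): arming a hardware
 * watchpoint from userland with FreeBSD's PT_GETDBREGS/PT_SETDBREGS
 * ptrace(2) requests.  A sketch; the dr7 encoding follows the layout
 * checked by set_dbregs() above, and error handling is elided.
 */
#include <sys/types.h>
#include <sys/ptrace.h>
#include <machine/reg.h>

static int
set_write_watchpoint(pid_t pid, unsigned long addr)
{
	struct dbreg dbr;

	if (ptrace(PT_GETDBREGS, pid, (caddr_t)&dbr, 0) == -1)
		return (-1);
	dbr.dr[0] = addr;		/* linear address to watch */
	dbr.dr[7] |= 0x1;		/* L0: locally enable dr0 */
	dbr.dr[7] |= 0x1UL << 16;	/* R/W0 = 01: break on data writes */
	dbr.dr[7] |= 0x3UL << 18;	/* LEN0 = 11: 4-byte watchpoint */
	/* set_dbregs() rejects this unless addr is a user-space address. */
	return (ptrace(PT_SETDBREGS, pid, (caddr_t)&dbr, 0));
}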
 1791 
 1792 void
 1793 reset_dbregs(void)
 1794 {
 1795 
 1796         load_dr7(0);    /* Turn off the control bits first */
 1797         load_dr0(0);
 1798         load_dr1(0);
 1799         load_dr2(0);
 1800         load_dr3(0);
 1801         load_dr6(0);
 1802 }
 1803 
 1804 /*
 1805  * Return > 0 if a hardware breakpoint has been hit, and the
 1806  * breakpoint was in user space.  Return 0 otherwise.
 1807  */
 1808 int
 1809 user_dbreg_trap(void)
 1810 {
 1811         u_int64_t dr7, dr6; /* debug registers dr6 and dr7 */
 1812         u_int64_t bp;       /* breakpoint bits extracted from dr6 */
 1813         int nbp;            /* number of breakpoints that triggered */
 1814         caddr_t addr[4];    /* breakpoint addresses */
 1815         int i;
 1816         
 1817         dr7 = rdr7();
 1818         if ((dr7 & 0x000000ff) == 0) {
 1819                 /*
 1820                  * None of the enable bits (L0-L3, G0-G3) in the dr7
 1821                  * register are set, so the trap cannot have been
 1822                  * caused by the hardware debug registers.
 1823                  */
 1824                 return (0);
 1825         }
 1826 
 1827         nbp = 0;
 1828         dr6 = rdr6();
 1829         bp = dr6 & 0x0000000f;
 1830 
 1831         if (!bp) {
 1832                 /*
 1833                  * None of the breakpoint bits are set, meaning this
 1834                  * trap was not caused by any of the debug registers.
 1835                  */
 1836                 return (0);
 1837         }
 1838 
 1839         /*
 1840          * At least one of the breakpoints was hit; check which
 1841          * ones, and whether any of them are user-space addresses.
 1842          */
 1843 
 1844         if (bp & 0x01) {
 1845                 addr[nbp++] = (caddr_t)rdr0();
 1846         }
 1847         if (bp & 0x02) {
 1848                 addr[nbp++] = (caddr_t)rdr1();
 1849         }
 1850         if (bp & 0x04) {
 1851                 addr[nbp++] = (caddr_t)rdr2();
 1852         }
 1853         if (bp & 0x08) {
 1854                 addr[nbp++] = (caddr_t)rdr3();
 1855         }
 1856 
 1857         for (i = 0; i < nbp; i++) {
 1858                 if (addr[i] <
 1859                     (caddr_t)VM_MAXUSER_ADDRESS) {
 1860                         /*
 1861                          * addr[i] is in user space
 1862                          */
 1863                         return (nbp);
 1864                 }
 1865         }
 1866 
 1867         /*
 1868          * None of the breakpoints are in user space.
 1869          */
 1870         return (0);
 1871 }
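
/*
 * Hypothetical helpers (not part of this file) spelling out the
 * debug-register layout that the masks above rely on: dr6 bits 0-3
 * report which breakpoint fired, and dr7 packs two enable bits
 * (local and global) per breakpoint into bits 0-7.
 *
 *	#define DR6_FIRED(dr6, i)	(((dr6) >> (i)) & 0x1)
 *	#define DR7_ENABLED(dr7, i)	(((dr7) >> ((i) * 2)) & 0x3)
 */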
 1872 
 1873 #ifdef KDB
 1874 
 1875 /*
 1876  * Provide inb() and outb() as functions.  They are normally only
 1877  * available as macros calling inlined functions, thus cannot be
 1878  * called from the debugger.
 1879  *
 1880  * The actual code is stolen from <machine/cpufunc.h>, and de-inlined.
 1881  */
 1882 
 1883 #undef inb
 1884 #undef outb
 1885 
 1886 /* silence compiler warnings */
 1887 u_char inb(u_int);
 1888 void outb(u_int, u_char);
 1889 
 1890 u_char
 1891 inb(u_int port)
 1892 {
 1893         u_char  data;
 1894         /*
 1895          * We use %%dx and not %1 here because i/o is done at %dx and not at
 1896          * %edx, while gcc generates inferior code (movw instead of movl)
 1897          * if we tell it to load (u_short) port.
 1898          */
 1899         __asm __volatile("inb %%dx,%0" : "=a" (data) : "d" (port));
 1900         return (data);
 1901 }
 1902 
 1903 void
 1904 outb(u_int port, u_char data)
 1905 {
 1906         u_char  al;
 1907         /*
 1908          * Use an unnecessary assignment to help gcc's register allocator.
 1909          * This makes a large difference for gcc-1.40 and a tiny difference
 1910          * for gcc-2.6.0.  For gcc-1.40, al had to be ``asm("ax")'' for
 1911          * best results.  gcc-2.6.0 can't handle this.
 1912          */
 1913         al = data;
 1914         __asm __volatile("outb %0,%%dx" : : "a" (al), "d" (port));
 1915 }
 1916 
 1917 #endif /* KDB */
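
/*
 * The payoff of de-inlining: with real function symbols, these can be
 * invoked from the ddb prompt, e.g. (illustrative):
 *
 *	db> call inb(0x60)
 *
 * to read a byte from an I/O port while sitting in the debugger.
 */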
