/*-
 * Copyright (c) 1992 Terrence R. Lambert.
 * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * William Jolitz.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	from: @(#)machdep.c	7.4 (Berkeley) 6/3/91
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_apic.h"
#include "opt_atalk.h"
#include "opt_compat.h"
#include "opt_cpu.h"
#include "opt_ddb.h"
#include "opt_inet.h"
#include "opt_ipx.h"
#include "opt_isa.h"
#include "opt_kstack_pages.h"
#include "opt_maxmem.h"
#include "opt_msgbuf.h"
#include "opt_npx.h"
#include "opt_perfmon.h"
#include "opt_xbox.h"

#include <sys/param.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/bus.h>
#include <sys/callout.h>
#include <sys/clock.h>
#include <sys/cons.h>
#include <sys/cpu.h>
#include <sys/eventhandler.h>
#include <sys/exec.h>
#include <sys/imgact.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/linker.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/memrange.h>
#include <sys/msgbuf.h>
#include <sys/mutex.h>
#include <sys/pcpu.h>
#include <sys/ptrace.h>
#include <sys/reboot.h>
#include <sys/sched.h>
#include <sys/signalvar.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/sysproto.h>
#include <sys/ucontext.h>
#include <sys/vmmeter.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_kern.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_pager.h>
#include <vm/vm_param.h>

#ifdef DDB
#ifndef KDB
#error KDB must be enabled in order for DDB to work!
#endif
#include <ddb/ddb.h>
#include <ddb/db_sym.h>
#endif

#include <isa/rtc.h>

#include <net/netisr.h>

#include <machine/bootinfo.h>
#include <machine/clock.h>
#include <machine/cpu.h>
#include <machine/cputypes.h>
#include <machine/intr_machdep.h>
#include <machine/mca.h>
#include <machine/md_var.h>
#include <machine/metadata.h>
#include <machine/pc/bios.h>
#include <machine/pcb.h>
#include <machine/pcb_ext.h>
#include <machine/proc.h>
#include <machine/reg.h>
#include <machine/sigframe.h>
#include <machine/specialreg.h>
#include <machine/vm86.h>
#ifdef PERFMON
#include <machine/perfmon.h>
#endif
#ifdef SMP
#include <machine/smp.h>
#endif

#ifdef DEV_ISA
#include <i386/isa/icu.h>
#endif

#ifdef XBOX
#include <machine/xbox.h>

int arch_i386_is_xbox = 0;
uint32_t arch_i386_xbox_memsize = 0;
#endif

/* Sanity check for __curthread() */
CTASSERT(offsetof(struct pcpu, pc_curthread) == 0);
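/*
 * (The assertion matters because __curthread() fetches pc_curthread
 * with a single %fs-relative load at offset 0, so the field must stay
 * first in struct pcpu.)
 */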

extern void init386(int first);
extern void dblfault_handler(void);

extern void printcpuinfo(void);	/* XXX header file */
extern void finishidentcpu(void);
extern void panicifcpuunsupported(void);
extern void initializecpu(void);

#define	CS_SECURE(cs)		(ISPL(cs) == SEL_UPL)
#define	EFL_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
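/*
 * CS_SECURE() accepts a %cs selector only if its privilege level is
 * user (SEL_UPL); EFL_SECURE() accepts a new eflags value only if all
 * bits outside PSL_USERCHANGE are unchanged from the old value.
 */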

#if !defined(CPU_DISABLE_SSE) && defined(I686_CPU)
#define CPU_ENABLE_SSE
#endif

static void cpu_startup(void *);
static void fpstate_drop(struct thread *td);
static void get_fpcontext(struct thread *td, mcontext_t *mcp);
static int  set_fpcontext(struct thread *td, const mcontext_t *mcp);
#ifdef CPU_ENABLE_SSE
static void set_fpregs_xmm(struct save87 *, struct savexmm *);
static void fill_fpregs_xmm(struct savexmm *, struct save87 *);
#endif /* CPU_ENABLE_SSE */
SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL);

#ifdef DDB
extern vm_offset_t ksym_start, ksym_end;
#endif

/* Intel ICH registers */
#define ICH_PMBASE	0x400
#define ICH_SMI_EN	(ICH_PMBASE + 0x30)

int	_udatasel, _ucodesel;
u_int	basemem;

int cold = 1;

#ifdef COMPAT_43
static void osendsig(sig_t catcher, ksiginfo_t *, sigset_t *mask);
#endif
#ifdef COMPAT_FREEBSD4
static void freebsd4_sendsig(sig_t catcher, ksiginfo_t *, sigset_t *mask);
#endif

long Maxmem = 0;
long realmem = 0;

#ifdef PAE
FEATURE(pae, "Physical Address Extensions");
#endif

/*
 * The number of PHYSMAP entries must be one less than the number of
 * PHYSSEG entries because the PHYSMAP entry that spans the largest
 * physical address that is accessible by ISA DMA is split into two
 * PHYSSEG entries.
 */
#define	PHYSMAP_SIZE	(2 * (VM_PHYSSEG_MAX - 1))
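/*
 * (Worked example with a hypothetical VM_PHYSSEG_MAX of 17: PHYSMAP_SIZE
 * is then 32 entries, i.e. 16 base/bound pairs, one fewer than the 17
 * PHYSSEG entries possible once the segment spanning the ISA DMA limit
 * is split in two.)
 */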

vm_paddr_t phys_avail[PHYSMAP_SIZE + 2];
vm_paddr_t dump_avail[PHYSMAP_SIZE + 2];

/* must be 2 less so that a 0/0 pair can signal the end of the chunk list */
#define PHYS_AVAIL_ARRAY_END ((sizeof(phys_avail) / sizeof(phys_avail[0])) - 2)
#define DUMP_AVAIL_ARRAY_END ((sizeof(dump_avail) / sizeof(dump_avail[0])) - 2)
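
/*
 * Both arrays hold base/bound pairs: entries [0]/[1] bound the first
 * usable chunk, and a 0/0 pair terminates the list, which is what the
 * two spare slots above are reserved for.
 */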

struct kva_md_info kmi;

static struct trapframe proc0_tf;
struct pcpu __pcpu[MAXCPU];

struct mtx icu_lock;

struct mem_range_softc mem_range_softc;

static void
cpu_startup(dummy)
	void *dummy;
{
	char *sysenv;

	/*
	 * On MacBooks, we have to prevent the legacy USB circuit from
	 * generating an SMI#, because this can cause several problems,
	 * namely: incorrect CPU frequency detection and failure to
	 * start the APs.
	 * We do this by clearing a bit in the SMI_EN (SMI Control and
	 * Enable) register of the Intel ICH LPC Interface Bridge.
	 */
	sysenv = getenv("smbios.system.product");
	if (sysenv != NULL) {
		if (strncmp(sysenv, "MacBook", 7) == 0) {
			if (bootverbose)
				printf("Disabling LEGACY_USB_EN bit on "
				    "Intel ICH.\n");
			outl(ICH_SMI_EN, inl(ICH_SMI_EN) & ~0x8);
		}
		freeenv(sysenv);
	}

	/*
	 * Good {morning,afternoon,evening,night}.
	 */
	startrtclock();
	printcpuinfo();
	panicifcpuunsupported();
#ifdef PERFMON
	perfmon_init();
#endif
	printf("real memory  = %ju (%ju MB)\n", ptoa((uintmax_t)Maxmem),
	    ptoa((uintmax_t)Maxmem) / 1048576);
	realmem = Maxmem;
	/*
	 * Display any holes after the first chunk of extended memory.
	 */
	if (bootverbose) {
		int indx;

		printf("Physical memory chunk(s):\n");
		for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) {
			vm_paddr_t size;

			size = phys_avail[indx + 1] - phys_avail[indx];
			printf(
			    "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n",
			    (uintmax_t)phys_avail[indx],
			    (uintmax_t)phys_avail[indx + 1] - 1,
			    (uintmax_t)size, (uintmax_t)size / PAGE_SIZE);
		}
	}

	vm_ksubmap_init(&kmi);

	printf("avail memory = %ju (%ju MB)\n",
	    ptoa((uintmax_t)cnt.v_free_count),
	    ptoa((uintmax_t)cnt.v_free_count) / 1048576);

	/*
	 * Set up buffers, so they can be used to read disk labels.
	 */
	bufinit();
	vm_pager_bufferinit();

	cpu_setregs();
}

/*
 * Send an interrupt (signal) to a process.
 *
 * The stack is set up so that the sigcode stored at its top can call
 * the handler routine, followed by a kernel call to the sigreturn
 * routine below.  After sigreturn resets the signal mask, the stack,
 * and the frame pointer, it returns to the user-specified pc and psl.
 */
#ifdef COMPAT_43
static void
osendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
{
	struct osigframe sf, *fp;
	struct proc *p;
	struct thread *td;
	struct sigacts *psp;
	struct trapframe *regs;
	int sig;
	int oonstack;

	td = curthread;
	p = td->td_proc;
	PROC_LOCK_ASSERT(p, MA_OWNED);
	sig = ksi->ksi_signo;
	psp = p->p_sigacts;
	mtx_assert(&psp->ps_mtx, MA_OWNED);
	regs = td->td_frame;
	oonstack = sigonstack(regs->tf_esp);

	/* Allocate space for the signal handler context. */
	if ((td->td_pflags & TDP_ALTSTACK) && !oonstack &&
	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
		fp = (struct osigframe *)(td->td_sigstk.ss_sp +
		    td->td_sigstk.ss_size - sizeof(struct osigframe));
#if defined(COMPAT_43)
		td->td_sigstk.ss_flags |= SS_ONSTACK;
#endif
	} else
		fp = (struct osigframe *)regs->tf_esp - 1;

	/* Translate the signal if appropriate. */
	if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize)
		sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];

	/* Build the argument list for the signal handler. */
	sf.sf_signum = sig;
	sf.sf_scp = (register_t)&fp->sf_siginfo.si_sc;
	bzero(&sf.sf_siginfo, sizeof(sf.sf_siginfo));
	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
		/* Signal handler installed with SA_SIGINFO. */
		sf.sf_arg2 = (register_t)&fp->sf_siginfo;
		sf.sf_siginfo.si_signo = sig;
		sf.sf_siginfo.si_code = ksi->ksi_code;
		sf.sf_ahu.sf_action = (__osiginfohandler_t *)catcher;
		sf.sf_addr = 0;
	} else {
		/* Old FreeBSD-style arguments. */
		sf.sf_arg2 = ksi->ksi_code;
		sf.sf_addr = (register_t)ksi->ksi_addr;
		sf.sf_ahu.sf_handler = catcher;
	}
	mtx_unlock(&psp->ps_mtx);
	PROC_UNLOCK(p);

	/* Save most if not all of trap frame. */
	sf.sf_siginfo.si_sc.sc_eax = regs->tf_eax;
	sf.sf_siginfo.si_sc.sc_ebx = regs->tf_ebx;
	sf.sf_siginfo.si_sc.sc_ecx = regs->tf_ecx;
	sf.sf_siginfo.si_sc.sc_edx = regs->tf_edx;
	sf.sf_siginfo.si_sc.sc_esi = regs->tf_esi;
	sf.sf_siginfo.si_sc.sc_edi = regs->tf_edi;
	sf.sf_siginfo.si_sc.sc_cs = regs->tf_cs;
	sf.sf_siginfo.si_sc.sc_ds = regs->tf_ds;
	sf.sf_siginfo.si_sc.sc_ss = regs->tf_ss;
	sf.sf_siginfo.si_sc.sc_es = regs->tf_es;
	sf.sf_siginfo.si_sc.sc_fs = regs->tf_fs;
	sf.sf_siginfo.si_sc.sc_gs = rgs();
	sf.sf_siginfo.si_sc.sc_isp = regs->tf_isp;

	/* Build the signal context to be used by osigreturn(). */
	sf.sf_siginfo.si_sc.sc_onstack = (oonstack) ? 1 : 0;
	SIG2OSIG(*mask, sf.sf_siginfo.si_sc.sc_mask);
	sf.sf_siginfo.si_sc.sc_sp = regs->tf_esp;
	sf.sf_siginfo.si_sc.sc_fp = regs->tf_ebp;
	sf.sf_siginfo.si_sc.sc_pc = regs->tf_eip;
	sf.sf_siginfo.si_sc.sc_ps = regs->tf_eflags;
	sf.sf_siginfo.si_sc.sc_trapno = regs->tf_trapno;
	sf.sf_siginfo.si_sc.sc_err = regs->tf_err;

	/*
	 * If we're a vm86 process, we want to save the segment registers.
	 * We also change eflags to be our emulated eflags, not the actual
	 * eflags.
	 */
	if (regs->tf_eflags & PSL_VM) {
		/* XXX confusing names: `tf' isn't a trapframe; `regs' is. */
		struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
		struct vm86_kernel *vm86 = &td->td_pcb->pcb_ext->ext_vm86;

		sf.sf_siginfo.si_sc.sc_gs = tf->tf_vm86_gs;
		sf.sf_siginfo.si_sc.sc_fs = tf->tf_vm86_fs;
		sf.sf_siginfo.si_sc.sc_es = tf->tf_vm86_es;
		sf.sf_siginfo.si_sc.sc_ds = tf->tf_vm86_ds;

		if (vm86->vm86_has_vme == 0)
			sf.sf_siginfo.si_sc.sc_ps =
			    (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) |
			    (vm86->vm86_eflags & (PSL_VIF | PSL_VIP));

		/* See sendsig() for comments. */
		tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP);
	}

	/*
	 * Copy the sigframe out to the user's stack.
	 */
	if (copyout(&sf, fp, sizeof(*fp)) != 0) {
#ifdef DEBUG
		printf("process %ld has trashed its stack\n", (long)p->p_pid);
#endif
		PROC_LOCK(p);
		sigexit(td, SIGILL);
	}

	regs->tf_esp = (int)fp;
	regs->tf_eip = PS_STRINGS - szosigcode;
	regs->tf_eflags &= ~(PSL_T | PSL_D);
	regs->tf_cs = _ucodesel;
	regs->tf_ds = _udatasel;
	regs->tf_es = _udatasel;
	regs->tf_fs = _udatasel;
	load_gs(_udatasel);
	regs->tf_ss = _udatasel;
	PROC_LOCK(p);
	mtx_lock(&psp->ps_mtx);
}
#endif /* COMPAT_43 */

#ifdef COMPAT_FREEBSD4
static void
freebsd4_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
{
	struct sigframe4 sf, *sfp;
	struct proc *p;
	struct thread *td;
	struct sigacts *psp;
	struct trapframe *regs;
	int sig;
	int oonstack;

	td = curthread;
	p = td->td_proc;
	PROC_LOCK_ASSERT(p, MA_OWNED);
	sig = ksi->ksi_signo;
	psp = p->p_sigacts;
	mtx_assert(&psp->ps_mtx, MA_OWNED);
	regs = td->td_frame;
	oonstack = sigonstack(regs->tf_esp);

	/* Save user context. */
	bzero(&sf, sizeof(sf));
	sf.sf_uc.uc_sigmask = *mask;
	sf.sf_uc.uc_stack = td->td_sigstk;
	sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
	    ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE;
	sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0;
	sf.sf_uc.uc_mcontext.mc_gs = rgs();
	bcopy(regs, &sf.sf_uc.uc_mcontext.mc_fs, sizeof(*regs));
	bzero(sf.sf_uc.uc_mcontext.mc_fpregs,
	    sizeof(sf.sf_uc.uc_mcontext.mc_fpregs));
	bzero(sf.sf_uc.uc_mcontext.__spare__,
	    sizeof(sf.sf_uc.uc_mcontext.__spare__));
	bzero(sf.sf_uc.__spare__, sizeof(sf.sf_uc.__spare__));

	/* Allocate space for the signal handler context. */
	if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack &&
	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
		sfp = (struct sigframe4 *)(td->td_sigstk.ss_sp +
		    td->td_sigstk.ss_size - sizeof(struct sigframe4));
#if defined(COMPAT_43)
		td->td_sigstk.ss_flags |= SS_ONSTACK;
#endif
	} else
		sfp = (struct sigframe4 *)regs->tf_esp - 1;

	/* Translate the signal if appropriate. */
	if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize)
		sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];

	/* Build the argument list for the signal handler. */
	sf.sf_signum = sig;
	sf.sf_ucontext = (register_t)&sfp->sf_uc;
	bzero(&sf.sf_si, sizeof(sf.sf_si));
	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
		/* Signal handler installed with SA_SIGINFO. */
		sf.sf_siginfo = (register_t)&sfp->sf_si;
		sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher;

		/* Fill in POSIX parts */
		sf.sf_si.si_signo = sig;
		sf.sf_si.si_code = ksi->ksi_code;
		sf.sf_si.si_addr = ksi->ksi_addr;
	} else {
		/* Old FreeBSD-style arguments. */
		sf.sf_siginfo = ksi->ksi_code;
		sf.sf_addr = (register_t)ksi->ksi_addr;
		sf.sf_ahu.sf_handler = catcher;
	}
	mtx_unlock(&psp->ps_mtx);
	PROC_UNLOCK(p);

	/*
	 * If we're a vm86 process, we want to save the segment registers.
	 * We also change eflags to be our emulated eflags, not the actual
	 * eflags.
	 */
	if (regs->tf_eflags & PSL_VM) {
		struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
		struct vm86_kernel *vm86 = &td->td_pcb->pcb_ext->ext_vm86;

		sf.sf_uc.uc_mcontext.mc_gs = tf->tf_vm86_gs;
		sf.sf_uc.uc_mcontext.mc_fs = tf->tf_vm86_fs;
		sf.sf_uc.uc_mcontext.mc_es = tf->tf_vm86_es;
		sf.sf_uc.uc_mcontext.mc_ds = tf->tf_vm86_ds;

		if (vm86->vm86_has_vme == 0)
			sf.sf_uc.uc_mcontext.mc_eflags =
			    (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) |
			    (vm86->vm86_eflags & (PSL_VIF | PSL_VIP));

		/*
		 * Clear PSL_NT to inhibit T_TSSFLT faults on return from
		 * syscalls made by the signal handler.  This just avoids
		 * wasting time for our lazy fixup of such faults.  PSL_NT
		 * does nothing in vm86 mode, but vm86 programs can set it
		 * almost legitimately in probes for old cpu types.
		 */
		tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP);
	}

	/*
	 * Copy the sigframe out to the user's stack.
	 */
	if (copyout(&sf, sfp, sizeof(*sfp)) != 0) {
#ifdef DEBUG
		printf("process %ld has trashed its stack\n", (long)p->p_pid);
#endif
		PROC_LOCK(p);
		sigexit(td, SIGILL);
	}

	regs->tf_esp = (int)sfp;
	regs->tf_eip = PS_STRINGS - szfreebsd4_sigcode;
	regs->tf_eflags &= ~(PSL_T | PSL_D);
	regs->tf_cs = _ucodesel;
	regs->tf_ds = _udatasel;
	regs->tf_es = _udatasel;
	regs->tf_fs = _udatasel;
	regs->tf_ss = _udatasel;
	PROC_LOCK(p);
	mtx_lock(&psp->ps_mtx);
}
#endif /* COMPAT_FREEBSD4 */

void
sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
{
	struct sigframe sf, *sfp;
	struct proc *p;
	struct thread *td;
	struct sigacts *psp;
	char *sp;
	struct trapframe *regs;
	int sig;
	int oonstack;

	td = curthread;
	p = td->td_proc;
	PROC_LOCK_ASSERT(p, MA_OWNED);
	sig = ksi->ksi_signo;
	psp = p->p_sigacts;
	mtx_assert(&psp->ps_mtx, MA_OWNED);
#ifdef COMPAT_FREEBSD4
	if (SIGISMEMBER(psp->ps_freebsd4, sig)) {
		freebsd4_sendsig(catcher, ksi, mask);
		return;
	}
#endif
#ifdef COMPAT_43
	if (SIGISMEMBER(psp->ps_osigset, sig)) {
		osendsig(catcher, ksi, mask);
		return;
	}
#endif
	regs = td->td_frame;
	oonstack = sigonstack(regs->tf_esp);

	/* Save user context. */
	bzero(&sf, sizeof(sf));
	sf.sf_uc.uc_sigmask = *mask;
	sf.sf_uc.uc_stack = td->td_sigstk;
	sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
	    ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE;
	sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0;
	sf.sf_uc.uc_mcontext.mc_gs = rgs();
	bcopy(regs, &sf.sf_uc.uc_mcontext.mc_fs, sizeof(*regs));
	sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext); /* magic */
	get_fpcontext(td, &sf.sf_uc.uc_mcontext);
	fpstate_drop(td);
	bzero(sf.sf_uc.uc_mcontext.mc_spare1,
	    sizeof(sf.sf_uc.uc_mcontext.mc_spare1));
	bzero(sf.sf_uc.uc_mcontext.mc_spare2,
	    sizeof(sf.sf_uc.uc_mcontext.mc_spare2));
	bzero(sf.sf_uc.__spare__, sizeof(sf.sf_uc.__spare__));

	/* Allocate space for the signal handler context. */
	if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack &&
	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
		sp = td->td_sigstk.ss_sp +
		    td->td_sigstk.ss_size - sizeof(struct sigframe);
#if defined(COMPAT_43)
		td->td_sigstk.ss_flags |= SS_ONSTACK;
#endif
	} else
		sp = (char *)regs->tf_esp - sizeof(struct sigframe);
	/* Align to 16 bytes. */
	sfp = (struct sigframe *)((unsigned int)sp & ~0xF);

	/* Translate the signal if appropriate. */
	if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize)
		sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];

	/* Build the argument list for the signal handler. */
	sf.sf_signum = sig;
	sf.sf_ucontext = (register_t)&sfp->sf_uc;
	bzero(&sf.sf_si, sizeof(sf.sf_si));
	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
		/* Signal handler installed with SA_SIGINFO. */
		sf.sf_siginfo = (register_t)&sfp->sf_si;
		sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher;

		/* Fill in POSIX parts */
		sf.sf_si = ksi->ksi_info;
		sf.sf_si.si_signo = sig; /* maybe a translated signal */
	} else {
		/* Old FreeBSD-style arguments. */
		sf.sf_siginfo = ksi->ksi_code;
		sf.sf_addr = (register_t)ksi->ksi_addr;
		sf.sf_ahu.sf_handler = catcher;
	}
	mtx_unlock(&psp->ps_mtx);
	PROC_UNLOCK(p);

	/*
	 * If we're a vm86 process, we want to save the segment registers.
	 * We also change eflags to be our emulated eflags, not the actual
	 * eflags.
	 */
	if (regs->tf_eflags & PSL_VM) {
		struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
		struct vm86_kernel *vm86 = &td->td_pcb->pcb_ext->ext_vm86;

		sf.sf_uc.uc_mcontext.mc_gs = tf->tf_vm86_gs;
		sf.sf_uc.uc_mcontext.mc_fs = tf->tf_vm86_fs;
		sf.sf_uc.uc_mcontext.mc_es = tf->tf_vm86_es;
		sf.sf_uc.uc_mcontext.mc_ds = tf->tf_vm86_ds;

		if (vm86->vm86_has_vme == 0)
			sf.sf_uc.uc_mcontext.mc_eflags =
			    (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) |
			    (vm86->vm86_eflags & (PSL_VIF | PSL_VIP));

		/*
		 * Clear PSL_NT to inhibit T_TSSFLT faults on return from
		 * syscalls made by the signal handler.  This just avoids
		 * wasting time for our lazy fixup of such faults.  PSL_NT
		 * does nothing in vm86 mode, but vm86 programs can set it
		 * almost legitimately in probes for old cpu types.
		 */
		tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP);
	}

	/*
	 * Copy the sigframe out to the user's stack.
	 */
	if (copyout(&sf, sfp, sizeof(*sfp)) != 0) {
#ifdef DEBUG
		printf("process %ld has trashed its stack\n", (long)p->p_pid);
#endif
		PROC_LOCK(p);
		sigexit(td, SIGILL);
	}

	regs->tf_esp = (int)sfp;
	regs->tf_eip = PS_STRINGS - *(p->p_sysent->sv_szsigcode);
	regs->tf_eflags &= ~(PSL_T | PSL_D);
	regs->tf_cs = _ucodesel;
	regs->tf_ds = _udatasel;
	regs->tf_es = _udatasel;
	regs->tf_fs = _udatasel;
	regs->tf_ss = _udatasel;
	PROC_LOCK(p);
	mtx_lock(&psp->ps_mtx);
}
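
/*
 * A sketch (derived from the code above, SA_SIGINFO case) of what the
 * handler sees on entry: %esp points at sfp, so sf_signum, sf_siginfo
 * and sf_ucontext act as the three handler arguments, sf_si holds the
 * copied siginfo and sf_uc the saved user context, while %eip starts
 * in the sigcode trampoline just below PS_STRINGS, which calls the
 * handler and then invokes the matching sigreturn() below.
 */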

/*
 * System call to clean up state after a signal
 * has been taken.  Reset the signal mask and stack
 * state from the context left by sendsig (above).
 * Return to the previous pc and psl as specified by
 * the context left by sendsig.  Check carefully to
 * make sure that the user has not modified the
 * state to gain improper privileges.
 *
 * MPSAFE
 */
#ifdef COMPAT_43
int
osigreturn(td, uap)
	struct thread *td;
	struct osigreturn_args /* {
		struct osigcontext *sigcntxp;
	} */ *uap;
{
	struct osigcontext sc;
	struct trapframe *regs;
	struct osigcontext *scp;
	struct proc *p = td->td_proc;
	int eflags, error;
	ksiginfo_t ksi;

	regs = td->td_frame;
	error = copyin(uap->sigcntxp, &sc, sizeof(sc));
	if (error != 0)
		return (error);
	scp = &sc;
	eflags = scp->sc_ps;
	if (eflags & PSL_VM) {
		struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
		struct vm86_kernel *vm86;

		/*
		 * if pcb_ext == 0 or vm86_inited == 0, the user hasn't
		 * set up the vm86 area, and we can't enter vm86 mode.
		 */
		if (td->td_pcb->pcb_ext == 0)
			return (EINVAL);
		vm86 = &td->td_pcb->pcb_ext->ext_vm86;
		if (vm86->vm86_inited == 0)
			return (EINVAL);

		/* Go back to user mode if both flags are set. */
		if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) {
			ksiginfo_init_trap(&ksi);
			ksi.ksi_signo = SIGBUS;
			ksi.ksi_code = BUS_OBJERR;
			ksi.ksi_addr = (void *)regs->tf_eip;
			trapsignal(td, &ksi);
		}

		if (vm86->vm86_has_vme) {
			eflags = (tf->tf_eflags & ~VME_USERCHANGE) |
			    (eflags & VME_USERCHANGE) | PSL_VM;
		} else {
			vm86->vm86_eflags = eflags;	/* save VIF, VIP */
			eflags = (tf->tf_eflags & ~VM_USERCHANGE) |
			    (eflags & VM_USERCHANGE) | PSL_VM;
		}
		tf->tf_vm86_ds = scp->sc_ds;
		tf->tf_vm86_es = scp->sc_es;
		tf->tf_vm86_fs = scp->sc_fs;
		tf->tf_vm86_gs = scp->sc_gs;
		tf->tf_ds = _udatasel;
		tf->tf_es = _udatasel;
		tf->tf_fs = _udatasel;
	} else {
		/*
		 * Don't allow users to change privileged or reserved flags.
		 */
		/*
		 * XXX do allow users to change the privileged flag PSL_RF.
		 * The cpu sets PSL_RF in tf_eflags for faults.  Debuggers
		 * should sometimes set it there too.  tf_eflags is kept in
		 * the signal context during signal handling and there is no
		 * other place to remember it, so the PSL_RF bit may be
		 * corrupted by the signal handler without us knowing.
		 * Corruption of the PSL_RF bit at worst causes one more or
		 * one less debugger trap, so allowing it is fairly harmless.
		 */
		if (!EFL_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF)) {
			return (EINVAL);
		}

		/*
		 * Don't allow users to load a valid privileged %cs.  Let the
		 * hardware check for invalid selectors, excess privilege in
		 * other selectors, invalid %eip's and invalid %esp's.
		 */
		if (!CS_SECURE(scp->sc_cs)) {
			ksiginfo_init_trap(&ksi);
			ksi.ksi_signo = SIGBUS;
			ksi.ksi_code = BUS_OBJERR;
			ksi.ksi_trapno = T_PROTFLT;
			ksi.ksi_addr = (void *)regs->tf_eip;
			trapsignal(td, &ksi);
			return (EINVAL);
		}
		regs->tf_ds = scp->sc_ds;
		regs->tf_es = scp->sc_es;
		regs->tf_fs = scp->sc_fs;
	}

	/* Restore remaining registers. */
	regs->tf_eax = scp->sc_eax;
	regs->tf_ebx = scp->sc_ebx;
	regs->tf_ecx = scp->sc_ecx;
	regs->tf_edx = scp->sc_edx;
	regs->tf_esi = scp->sc_esi;
	regs->tf_edi = scp->sc_edi;
	regs->tf_cs = scp->sc_cs;
	regs->tf_ss = scp->sc_ss;
	regs->tf_isp = scp->sc_isp;
	regs->tf_ebp = scp->sc_fp;
	regs->tf_esp = scp->sc_sp;
	regs->tf_eip = scp->sc_pc;
	regs->tf_eflags = eflags;

	PROC_LOCK(p);
#if defined(COMPAT_43)
	if (scp->sc_onstack & 1)
		td->td_sigstk.ss_flags |= SS_ONSTACK;
	else
		td->td_sigstk.ss_flags &= ~SS_ONSTACK;
#endif
	SIGSETOLD(td->td_sigmask, scp->sc_mask);
	SIG_CANTMASK(td->td_sigmask);
	signotify(td);
	PROC_UNLOCK(p);
	return (EJUSTRETURN);
}
#endif /* COMPAT_43 */

#ifdef COMPAT_FREEBSD4
/*
 * MPSAFE
 */
int
freebsd4_sigreturn(td, uap)
	struct thread *td;
	struct freebsd4_sigreturn_args /* {
		const ucontext4 *sigcntxp;
	} */ *uap;
{
	struct ucontext4 uc;
	struct proc *p = td->td_proc;
	struct trapframe *regs;
	const struct ucontext4 *ucp;
	int cs, eflags, error;
	ksiginfo_t ksi;

	error = copyin(uap->sigcntxp, &uc, sizeof(uc));
	if (error != 0)
		return (error);
	ucp = &uc;
	regs = td->td_frame;
	eflags = ucp->uc_mcontext.mc_eflags;
	if (eflags & PSL_VM) {
		struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
		struct vm86_kernel *vm86;

		/*
		 * if pcb_ext == 0 or vm86_inited == 0, the user hasn't
		 * set up the vm86 area, and we can't enter vm86 mode.
		 */
		if (td->td_pcb->pcb_ext == 0)
			return (EINVAL);
		vm86 = &td->td_pcb->pcb_ext->ext_vm86;
		if (vm86->vm86_inited == 0)
			return (EINVAL);

		/* Go back to user mode if both flags are set. */
		if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) {
			ksiginfo_init_trap(&ksi);
			ksi.ksi_signo = SIGBUS;
			ksi.ksi_code = BUS_OBJERR;
			ksi.ksi_addr = (void *)regs->tf_eip;
			trapsignal(td, &ksi);
		}
		if (vm86->vm86_has_vme) {
			eflags = (tf->tf_eflags & ~VME_USERCHANGE) |
			    (eflags & VME_USERCHANGE) | PSL_VM;
		} else {
			vm86->vm86_eflags = eflags;	/* save VIF, VIP */
			eflags = (tf->tf_eflags & ~VM_USERCHANGE) |
			    (eflags & VM_USERCHANGE) | PSL_VM;
		}
		bcopy(&ucp->uc_mcontext.mc_fs, tf, sizeof(struct trapframe));
		tf->tf_eflags = eflags;
		tf->tf_vm86_ds = tf->tf_ds;
		tf->tf_vm86_es = tf->tf_es;
		tf->tf_vm86_fs = tf->tf_fs;
		tf->tf_vm86_gs = ucp->uc_mcontext.mc_gs;
		tf->tf_ds = _udatasel;
		tf->tf_es = _udatasel;
		tf->tf_fs = _udatasel;
	} else {
		/*
		 * Don't allow users to change privileged or reserved flags.
		 */
		/*
		 * XXX do allow users to change the privileged flag PSL_RF.
		 * The cpu sets PSL_RF in tf_eflags for faults.  Debuggers
		 * should sometimes set it there too.  tf_eflags is kept in
		 * the signal context during signal handling and there is no
		 * other place to remember it, so the PSL_RF bit may be
		 * corrupted by the signal handler without us knowing.
		 * Corruption of the PSL_RF bit at worst causes one more or
		 * one less debugger trap, so allowing it is fairly harmless.
		 */
		if (!EFL_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF)) {
			printf("freebsd4_sigreturn: eflags = 0x%x\n", eflags);
			return (EINVAL);
		}

		/*
		 * Don't allow users to load a valid privileged %cs.  Let the
		 * hardware check for invalid selectors, excess privilege in
		 * other selectors, invalid %eip's and invalid %esp's.
		 */
		cs = ucp->uc_mcontext.mc_cs;
		if (!CS_SECURE(cs)) {
			printf("freebsd4_sigreturn: cs = 0x%x\n", cs);
			ksiginfo_init_trap(&ksi);
			ksi.ksi_signo = SIGBUS;
			ksi.ksi_code = BUS_OBJERR;
			ksi.ksi_trapno = T_PROTFLT;
			ksi.ksi_addr = (void *)regs->tf_eip;
			trapsignal(td, &ksi);
			return (EINVAL);
		}

		bcopy(&ucp->uc_mcontext.mc_fs, regs, sizeof(*regs));
	}

	PROC_LOCK(p);
#if defined(COMPAT_43)
	if (ucp->uc_mcontext.mc_onstack & 1)
		td->td_sigstk.ss_flags |= SS_ONSTACK;
	else
		td->td_sigstk.ss_flags &= ~SS_ONSTACK;
#endif

	td->td_sigmask = ucp->uc_sigmask;
	SIG_CANTMASK(td->td_sigmask);
	signotify(td);
	PROC_UNLOCK(p);
	return (EJUSTRETURN);
}
#endif /* COMPAT_FREEBSD4 */

/*
 * MPSAFE
 */
int
sigreturn(td, uap)
	struct thread *td;
	struct sigreturn_args /* {
		const struct __ucontext *sigcntxp;
	} */ *uap;
{
	ucontext_t uc;
	struct proc *p = td->td_proc;
	struct trapframe *regs;
	const ucontext_t *ucp;
	int cs, eflags, error, ret;
	ksiginfo_t ksi;

	error = copyin(uap->sigcntxp, &uc, sizeof(uc));
	if (error != 0)
		return (error);
	ucp = &uc;
	regs = td->td_frame;
	eflags = ucp->uc_mcontext.mc_eflags;
	if (eflags & PSL_VM) {
		struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
		struct vm86_kernel *vm86;

		/*
		 * if pcb_ext == 0 or vm86_inited == 0, the user hasn't
		 * set up the vm86 area, and we can't enter vm86 mode.
		 */
		if (td->td_pcb->pcb_ext == 0)
			return (EINVAL);
		vm86 = &td->td_pcb->pcb_ext->ext_vm86;
		if (vm86->vm86_inited == 0)
			return (EINVAL);

		/* Go back to user mode if both flags are set. */
		if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) {
			ksiginfo_init_trap(&ksi);
			ksi.ksi_signo = SIGBUS;
			ksi.ksi_code = BUS_OBJERR;
			ksi.ksi_addr = (void *)regs->tf_eip;
			trapsignal(td, &ksi);
		}

		if (vm86->vm86_has_vme) {
			eflags = (tf->tf_eflags & ~VME_USERCHANGE) |
			    (eflags & VME_USERCHANGE) | PSL_VM;
		} else {
			vm86->vm86_eflags = eflags;	/* save VIF, VIP */
			eflags = (tf->tf_eflags & ~VM_USERCHANGE) |
			    (eflags & VM_USERCHANGE) | PSL_VM;
		}
		bcopy(&ucp->uc_mcontext.mc_fs, tf, sizeof(struct trapframe));
		tf->tf_eflags = eflags;
		tf->tf_vm86_ds = tf->tf_ds;
		tf->tf_vm86_es = tf->tf_es;
		tf->tf_vm86_fs = tf->tf_fs;
		tf->tf_vm86_gs = ucp->uc_mcontext.mc_gs;
		tf->tf_ds = _udatasel;
		tf->tf_es = _udatasel;
		tf->tf_fs = _udatasel;
	} else {
		/*
		 * Don't allow users to change privileged or reserved flags.
		 */
		/*
		 * XXX do allow users to change the privileged flag PSL_RF.
		 * The cpu sets PSL_RF in tf_eflags for faults.  Debuggers
		 * should sometimes set it there too.  tf_eflags is kept in
		 * the signal context during signal handling and there is no
		 * other place to remember it, so the PSL_RF bit may be
		 * corrupted by the signal handler without us knowing.
		 * Corruption of the PSL_RF bit at worst causes one more or
		 * one less debugger trap, so allowing it is fairly harmless.
		 */
		if (!EFL_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF)) {
			printf("sigreturn: eflags = 0x%x\n", eflags);
			return (EINVAL);
		}

		/*
		 * Don't allow users to load a valid privileged %cs.  Let the
		 * hardware check for invalid selectors, excess privilege in
		 * other selectors, invalid %eip's and invalid %esp's.
		 */
		cs = ucp->uc_mcontext.mc_cs;
		if (!CS_SECURE(cs)) {
			printf("sigreturn: cs = 0x%x\n", cs);
			ksiginfo_init_trap(&ksi);
			ksi.ksi_signo = SIGBUS;
			ksi.ksi_code = BUS_OBJERR;
			ksi.ksi_trapno = T_PROTFLT;
			ksi.ksi_addr = (void *)regs->tf_eip;
			trapsignal(td, &ksi);
			return (EINVAL);
		}

		ret = set_fpcontext(td, &ucp->uc_mcontext);
		if (ret != 0)
			return (ret);
		bcopy(&ucp->uc_mcontext.mc_fs, regs, sizeof(*regs));
	}

	PROC_LOCK(p);
#if defined(COMPAT_43)
	if (ucp->uc_mcontext.mc_onstack & 1)
		td->td_sigstk.ss_flags |= SS_ONSTACK;
	else
		td->td_sigstk.ss_flags &= ~SS_ONSTACK;
#endif

	td->td_sigmask = ucp->uc_sigmask;
	SIG_CANTMASK(td->td_sigmask);
	signotify(td);
	PROC_UNLOCK(p);
	return (EJUSTRETURN);
}
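
/*
 * EJUSTRETURN tells the syscall return path to leave the trapframe
 * alone (no error conversion, no %eax clobber), so the register state
 * installed above reaches user mode intact.
 */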

/*
 * Machine dependent boot() routine
 *
 * I haven't seen anything to put here yet
 * Possibly some stuff might be grafted back here from boot()
 */
void
cpu_boot(int howto)
{
}

/* Get current clock frequency for the given cpu id. */
int
cpu_est_clockrate(int cpu_id, uint64_t *rate)
{
	register_t reg;
	uint64_t tsc1, tsc2;

	if (pcpu_find(cpu_id) == NULL || rate == NULL)
		return (EINVAL);
	if (!tsc_present)
		return (EOPNOTSUPP);

	/* If we're booting, trust the rate calibrated moments ago. */
	if (cold) {
		*rate = tsc_freq;
		return (0);
	}

#ifdef SMP
	/* Schedule ourselves on the indicated cpu. */
	thread_lock(curthread);
	sched_bind(curthread, cpu_id);
	thread_unlock(curthread);
#endif

	/* Calibrate by measuring a short delay. */
	reg = intr_disable();
	tsc1 = rdtsc();
	DELAY(1000);
	tsc2 = rdtsc();
	intr_restore(reg);

#ifdef SMP
	thread_lock(curthread);
	sched_unbind(curthread);
	thread_unlock(curthread);
#endif

	/*
	 * Calculate the difference in readings, convert the result to
	 * Hz, and subtract 0.5% of the total.  Empirical testing has
	 * shown that overhead in DELAY() works out to approximately
	 * this value.
	 */
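	/*
	 * Example: 2,000,000 ticks over the 1000us DELAY() gives
	 * 2,000,000 * 1000 - 2,000,000 * 5 = 1,990,000,000 Hz, i.e.
	 * 2 GHz less the 0.5% fudge.
	 */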
	tsc2 -= tsc1;
	*rate = tsc2 * 1000 - tsc2 * 5;
	return (0);
}

/*
 * Shutdown the CPU as much as possible
 */
void
cpu_halt(void)
{
	for (;;)
		__asm__ ("hlt");
}

/*
 * Hook to idle the CPU when possible.  In the SMP case we default to
 * off because a halted cpu will not currently pick up a new thread in the
 * run queue until the next timer tick.  If turned on this will result in
 * approximately a 4.2% loss in real time performance in buildworld tests
 * (but improves user and sys times oddly enough), and saves approximately
 * 5% in power consumption on an idle machine (tests w/2xCPU 1.1GHz P3).
 *
 * XXX we need to have a cpu mask of idle cpus and generate an IPI or
 * otherwise generate some sort of interrupt to wake up cpus sitting in HLT.
 * Then we can have our cake and eat it too.
 *
 * XXX I'm turning it on for SMP as well by default for now.  It seems to
 * help lock contention somewhat, and this is critical for HTT. -Peter
 */
static int	cpu_idle_hlt = 1;
TUNABLE_INT("machdep.cpu_idle_hlt", &cpu_idle_hlt);
SYSCTL_INT(_machdep, OID_AUTO, cpu_idle_hlt, CTLFLAG_RW,
    &cpu_idle_hlt, 0, "Idle loop HLT enable");

static void
cpu_idle_default(void)
{
	/*
	 * We must guarantee that hlt is exactly the next instruction
	 * after sti, or else we introduce a timing window.
	 */
	__asm __volatile("sti; hlt");
}
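
/*
 * The "sti; hlt" pairing relies on sti's one-instruction interrupt
 * shadow: an interrupt arriving between the sched_runnable() check in
 * cpu_idle() and the halt is held off until hlt has started, so the
 * wakeup cannot be lost.
 */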

/*
 * Note that we have to be careful here to avoid a race between checking
 * sched_runnable() and actually halting.  If we don't do this, we may waste
 * the time between calling hlt and the next interrupt even though there
 * is a runnable process.
 */
void
cpu_idle(void)
{

#ifdef SMP
	if (mp_grab_cpu_hlt())
		return;
#endif

	if (cpu_idle_hlt) {
		disable_intr();
		if (sched_runnable())
			enable_intr();
		else
			(*cpu_idle_hook)();
	}
}

/* Other subsystems (e.g., ACPI) can hook this later. */
void (*cpu_idle_hook)(void) = cpu_idle_default;

/*
 * Reset registers to default values on exec.
 */
void
exec_setregs(td, entry, stack, ps_strings)
	struct thread *td;
	u_long entry;
	u_long stack;
	u_long ps_strings;
{
	struct trapframe *regs = td->td_frame;
	struct pcb *pcb = td->td_pcb;

	/* Reset pcb->pcb_gs and %gs before possibly invalidating it. */
	pcb->pcb_gs = _udatasel;
	load_gs(_udatasel);

	mtx_lock_spin(&dt_lock);
	if (td->td_proc->p_md.md_ldt)
		user_ldt_free(td);
	else
		mtx_unlock_spin(&dt_lock);

	bzero((char *)regs, sizeof(struct trapframe));
	regs->tf_eip = entry;
	regs->tf_esp = stack;
	regs->tf_eflags = PSL_USER | (regs->tf_eflags & PSL_T);
	regs->tf_ss = _udatasel;
	regs->tf_ds = _udatasel;
	regs->tf_es = _udatasel;
	regs->tf_fs = _udatasel;
	regs->tf_cs = _ucodesel;

	/* PS_STRINGS value for BSD/OS binaries.  It is 0 for non-BSD/OS. */
	regs->tf_ebx = ps_strings;

	/*
	 * Reset the hardware debug registers if they were in use.
	 * They won't have any meaning for the newly exec'd process.
	 */
	if (pcb->pcb_flags & PCB_DBREGS) {
		pcb->pcb_dr0 = 0;
		pcb->pcb_dr1 = 0;
		pcb->pcb_dr2 = 0;
		pcb->pcb_dr3 = 0;
		pcb->pcb_dr6 = 0;
		pcb->pcb_dr7 = 0;
		if (pcb == PCPU_GET(curpcb)) {
			/*
			 * Clear the debug registers on the running
			 * CPU, otherwise they will end up affecting
			 * the next process we switch to.
			 */
			reset_dbregs();
		}
		pcb->pcb_flags &= ~PCB_DBREGS;
	}

	/*
	 * Initialize the math emulator (if any) for the current process.
	 * Actually, just clear the bit that says that the emulator has
	 * been initialized.  Initialization is delayed until the process
	 * traps to the emulator (if it is done at all) mainly because
	 * emulators don't provide an entry point for initialization.
	 */
	td->td_pcb->pcb_flags &= ~FP_SOFTFP;
	pcb->pcb_initial_npxcw = __INITIAL_NPXCW__;

	/*
	 * Drop the FP state if we hold it, so that the process gets a
	 * clean FP state if it uses the FPU again.
	 */
	fpstate_drop(td);

	/*
	 * XXX - Linux emulator
	 * Make sure %edx is 0x0 on entry.  Linux binaries depend
	 * on it.
	 */
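	/*
	 * (td_retval[1] is copied into tf_edx on the way back to user
	 * mode, which is how the zero below reaches %edx.)
	 */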
	td->td_retval[1] = 0;
}

void
cpu_setregs(void)
{
	unsigned int cr0;

	cr0 = rcr0();

	/*
	 * CR0_MP, CR0_NE and CR0_TS are set for NPX (FPU) support:
	 *
	 * Prepare to trap all ESC (i.e., NPX) instructions and all WAIT
	 * instructions.  We must set the CR0_MP bit and use the CR0_TS
	 * bit to control the trap, because setting the CR0_EM bit does
	 * not cause WAIT instructions to trap.  It's important to trap
	 * WAIT instructions - otherwise the "wait" variants of no-wait
	 * control instructions would degenerate to the "no-wait" variants
	 * after FP context switches but work correctly otherwise.  It's
	 * particularly important to trap WAITs when there is no NPX -
	 * otherwise the "wait" variants would always degenerate.
	 *
	 * Try setting CR0_NE to get correct error reporting on 486DX's.
	 * Setting it should fail or do nothing on lesser processors.
	 */
	cr0 |= CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM;
	load_cr0(cr0);
	load_gs(_udatasel);
}

u_long bootdev;		/* not a struct cdev * - encoding is different */
SYSCTL_ULONG(_machdep, OID_AUTO, guessed_bootdev,
	CTLFLAG_RD, &bootdev, 0, "Maybe the Boot device (not in struct cdev *format)");

/*
 * Initialize 386 and configure to run kernel
 */

/*
 * Initialize segments & interrupt table
 */

int _default_ldt;
union descriptor gdt[NGDT * MAXCPU];	/* global descriptor table */
static struct gate_descriptor idt0[NIDT];
struct gate_descriptor *idt = &idt0[0];	/* interrupt descriptor table */
union descriptor ldt[NLDT];		/* local descriptor table */
struct region_descriptor r_gdt, r_idt;	/* table descriptors */
struct mtx dt_lock;			/* lock for GDT and LDT */

#if defined(I586_CPU) && !defined(NO_F00F_HACK)
extern int has_f00f_bug;
#endif

static struct i386tss dblfault_tss;
static char dblfault_stack[PAGE_SIZE];

extern vm_offset_t proc0kstack;


/*
 * software prototypes -- in more palatable form.
 *
 * GCODE_SEL through GUDATA_SEL must be in this order for syscall/sysret
 * GUFS_SEL and GUGS_SEL must be in this order (swtch.s knows it)
 */
struct soft_segment_descriptor gdt_segs[] = {
/* GNULL_SEL	0 Null Descriptor */
{	0x0,			/* segment base address  */
	0x0,			/* length */
	0,			/* segment type */
	0,			/* segment descriptor priority level */
	0,			/* segment descriptor present */
	0, 0,
	0,			/* default 32 vs 16 bit size */
	0			/* limit granularity (byte/page units)*/ },
/* GPRIV_SEL	1 SMP Per-Processor Private Data Descriptor */
{	0x0,			/* segment base address  */
	0xfffff,		/* length - all address space */
	SDT_MEMRWA,		/* segment type */
	0,			/* segment descriptor priority level */
	1,			/* segment descriptor present */
	0, 0,
	1,			/* default 32 vs 16 bit size */
	1			/* limit granularity (byte/page units)*/ },
/* GUFS_SEL	2 %fs Descriptor for user */
{	0x0,			/* segment base address  */
	0xfffff,		/* length - all address space */
	SDT_MEMRWA,		/* segment type */
	SEL_UPL,		/* segment descriptor priority level */
	1,			/* segment descriptor present */
	0, 0,
	1,			/* default 32 vs 16 bit size */
	1			/* limit granularity (byte/page units)*/ },
/* GUGS_SEL	3 %gs Descriptor for user */
{	0x0,			/* segment base address  */
	0xfffff,		/* length - all address space */
	SDT_MEMRWA,		/* segment type */
	SEL_UPL,		/* segment descriptor priority level */
	1,			/* segment descriptor present */
	0, 0,
	1,			/* default 32 vs 16 bit size */
	1			/* limit granularity (byte/page units)*/ },
/* GCODE_SEL	4 Code Descriptor for kernel */
{	0x0,			/* segment base address  */
	0xfffff,		/* length - all address space */
	SDT_MEMERA,		/* segment type */
	0,			/* segment descriptor priority level */
	1,			/* segment descriptor present */
	0, 0,
	1,			/* default 32 vs 16 bit size */
	1			/* limit granularity (byte/page units)*/ },
/* GDATA_SEL	5 Data Descriptor for kernel */
{	0x0,			/* segment base address  */
	0xfffff,		/* length - all address space */
	SDT_MEMRWA,		/* segment type */
	0,			/* segment descriptor priority level */
	1,			/* segment descriptor present */
	0, 0,
	1,			/* default 32 vs 16 bit size */
	1			/* limit granularity (byte/page units)*/ },
/* GUCODE_SEL	6 Code Descriptor for user */
{	0x0,			/* segment base address  */
	0xfffff,		/* length - all address space */
	SDT_MEMERA,		/* segment type */
	SEL_UPL,		/* segment descriptor priority level */
	1,			/* segment descriptor present */
	0, 0,
	1,			/* default 32 vs 16 bit size */
	1			/* limit granularity (byte/page units)*/ },
/* GUDATA_SEL	7 Data Descriptor for user */
{	0x0,			/* segment base address  */
	0xfffff,		/* length - all address space */
	SDT_MEMRWA,		/* segment type */
	SEL_UPL,		/* segment descriptor priority level */
	1,			/* segment descriptor present */
	0, 0,
	1,			/* default 32 vs 16 bit size */
	1			/* limit granularity (byte/page units)*/ },
/* GBIOSLOWMEM_SEL 8 BIOS access to realmode segment 0x40, must be #8 in GDT */
{	0x400,			/* segment base address */
	0xfffff,		/* length */
	SDT_MEMRWA,		/* segment type */
	0,			/* segment descriptor priority level */
	1,			/* segment descriptor present */
	0, 0,
	1,			/* default 32 vs 16 bit size */
	1			/* limit granularity (byte/page units)*/ },
/* GPROC0_SEL	9 Proc 0 Tss Descriptor */
{
	0x0,			/* segment base address */
	sizeof(struct i386tss)-1,/* length  */
	SDT_SYS386TSS,		/* segment type */
	0,			/* segment descriptor priority level */
	1,			/* segment descriptor present */
	0, 0,
	0,			/* unused - default 32 vs 16 bit size */
	0			/* limit granularity (byte/page units)*/ },
/* GLDT_SEL	10 LDT Descriptor */
{	(int) ldt,		/* segment base address  */
	sizeof(ldt)-1,		/* length - all address space */
	SDT_SYSLDT,		/* segment type */
	SEL_UPL,		/* segment descriptor priority level */
	1,			/* segment descriptor present */
	0, 0,
	0,			/* unused - default 32 vs 16 bit size */
	0			/* limit granularity (byte/page units)*/ },
/* GUSERLDT_SEL	11 User LDT Descriptor per process */
{	(int) ldt,		/* segment base address  */
	(512 * sizeof(union descriptor)-1),		/* length */
	SDT_SYSLDT,		/* segment type */
	0,			/* segment descriptor priority level */
	1,			/* segment descriptor present */
	0, 0,
	0,			/* unused - default 32 vs 16 bit size */
	0			/* limit granularity (byte/page units)*/ },
/* GPANIC_SEL	12 Panic Tss Descriptor */
{	(int) &dblfault_tss,	/* segment base address  */
	sizeof(struct i386tss)-1,/* length - all address space */
	SDT_SYS386TSS,		/* segment type */
	0,			/* segment descriptor priority level */
	1,			/* segment descriptor present */
	0, 0,
	0,			/* unused - default 32 vs 16 bit size */
	0			/* limit granularity (byte/page units)*/ },
/* GBIOSCODE32_SEL 13 BIOS 32-bit interface (32bit Code) */
{	0,			/* segment base address (overwritten)  */
	0xfffff,		/* length */
	SDT_MEMERA,		/* segment type */
	0,			/* segment descriptor priority level */
	1,			/* segment descriptor present */
	0, 0,
	0,			/* default 32 vs 16 bit size */
	1			/* limit granularity (byte/page units)*/ },
/* GBIOSCODE16_SEL 14 BIOS 32-bit interface (16bit Code) */
{	0,			/* segment base address (overwritten)  */
	0xfffff,		/* length */
	SDT_MEMERA,		/* segment type */
	0,			/* segment descriptor priority level */
	1,			/* segment descriptor present */
	0, 0,
	0,			/* default 32 vs 16 bit size */
	1			/* limit granularity (byte/page units)*/ },
/* GBIOSDATA_SEL 15 BIOS 32-bit interface (Data) */
{	0,			/* segment base address (overwritten) */
	0xfffff,		/* length */
	SDT_MEMRWA,		/* segment type */
	0,			/* segment descriptor priority level */
	1,			/* segment descriptor present */
	0, 0,
	1,			/* default 32 vs 16 bit size */
	1			/* limit granularity (byte/page units)*/ },
/* GBIOSUTIL_SEL 16 BIOS 16-bit interface (Utility) */
{	0,			/* segment base address (overwritten) */
	0xfffff,		/* length */
	SDT_MEMRWA,		/* segment type */
	0,			/* segment descriptor priority level */
	1,			/* segment descriptor present */
	0, 0,
	0,			/* default 32 vs 16 bit size */
	1			/* limit granularity (byte/page units)*/ },
/* GBIOSARGS_SEL 17 BIOS 16-bit interface (Arguments) */
{	0,			/* segment base address (overwritten) */
	0xfffff,		/* length */
	SDT_MEMRWA,		/* segment type */
	0,			/* segment descriptor priority level */
	1,			/* segment descriptor present */
	0, 0,
	0,			/* default 32 vs 16 bit size */
	1			/* limit granularity (byte/page units)*/ },
/* GNDIS_SEL	18 NDIS Descriptor */
{	0x0,			/* segment base address  */
	0x0,			/* length */
	0,			/* segment type */
	0,			/* segment descriptor priority level */
	0,			/* segment descriptor present */
	0, 0,
	0,			/* default 32 vs 16 bit size */
	0			/* limit granularity (byte/page units)*/ },
};

static struct soft_segment_descriptor ldt_segs[] = {
/* Null Descriptor - overwritten by call gate */
{	0x0,			/* segment base address  */
	0x0,			/* length - all address space */
	0,			/* segment type */
	0,			/* segment descriptor priority level */
	0,			/* segment descriptor present */
	0, 0,
	0,			/* default 32 vs 16 bit size */
	0			/* limit granularity (byte/page units)*/ },
/* Null Descriptor - overwritten by call gate */
{	0x0,			/* segment base address  */
	0x0,			/* length - all address space */
	0,			/* segment type */
	0,			/* segment descriptor priority level */
	0,			/* segment descriptor present */
	0, 0,
	0,			/* default 32 vs 16 bit size */
	0			/* limit granularity (byte/page units)*/ },
/* Null Descriptor - overwritten by call gate */
{	0x0,			/* segment base address  */
	0x0,			/* length - all address space */
	0,			/* segment type */
	0,			/* segment descriptor priority level */
	0,			/* segment descriptor present */
	0, 0,
	0,			/* default 32 vs 16 bit size */
	0			/* limit granularity (byte/page units)*/ },
/* Code Descriptor for user */
{	0x0,			/* segment base address  */
	0xfffff,		/* length - all address space */
	SDT_MEMERA,		/* segment type */
	SEL_UPL,		/* segment descriptor priority level */
	1,			/* segment descriptor present */
	0, 0,
	1,			/* default 32 vs 16 bit size */
	1			/* limit granularity (byte/page units)*/ },
/* Null Descriptor - overwritten by call gate */
{	0x0,			/* segment base address  */
	0x0,			/* length - all address space */
	0,			/* segment type */
	0,			/* segment descriptor priority level */
	0,			/* segment descriptor present */
	0, 0,
	0,			/* default 32 vs 16 bit size */
	0			/* limit granularity (byte/page units)*/ },
/* Data Descriptor for user */
{	0x0,			/* segment base address  */
	0xfffff,		/* length - all address space */
	SDT_MEMRWA,		/* segment type */
	SEL_UPL,		/* segment descriptor priority level */
	1,			/* segment descriptor present */
	0, 0,
	1,			/* default 32 vs 16 bit size */
	1			/* limit granularity (byte/page units)*/ },
};

void
setidt(idx, func, typ, dpl, selec)
	int idx;
	inthand_t *func;
	int typ;
	int dpl;
	int selec;
{
	struct gate_descriptor *ip;

	ip = idt + idx;
	ip->gd_looffset = (int)func;
	ip->gd_selector = selec;
	ip->gd_stkcpy = 0;
	ip->gd_xx = 0;
	ip->gd_type = typ;
	ip->gd_dpl = dpl;
	ip->gd_p = 1;
	ip->gd_hioffset = ((int)func)>>16;
}
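
/*
 * A representative call, of the kind init386() makes for each exception
 * vector (the gate type and selector here are illustrative):
 *
 *	setidt(IDT_DE, &IDTVEC(div), SDT_SYS386TGT, SEL_KPL,
 *	    GSEL(GCODE_SEL, SEL_KPL));
 */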

extern inthand_t
	IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
	IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
	IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
	IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align),
	IDTVEC(xmm), IDTVEC(lcall_syscall), IDTVEC(int0x80_syscall);

#ifdef DDB
/*
 * Display the index and function name of any IDT entries that don't use
 * the default 'rsvd' entry point.
 */
DB_SHOW_COMMAND(idt, db_show_idt)
{
	struct gate_descriptor *ip;
	int idx;
	uintptr_t func;

	ip = idt;
	for (idx = 0; idx < NIDT && !db_pager_quit; idx++) {
		func = (ip->gd_hioffset << 16 | ip->gd_looffset);
		if (func != (uintptr_t)&IDTVEC(rsvd)) {
			db_printf("%3d\t", idx);
			db_printsym(func, DB_STGY_PROC);
			db_printf("\n");
		}
		ip++;
	}
}

/* Show privileged registers. */
DB_SHOW_COMMAND(sysregs, db_show_sysregs)
{
	uint64_t idtr, gdtr;

	idtr = ridt();
	db_printf("idtr\t0x%08x/%04x\n",
	    (u_int)(idtr >> 16), (u_int)idtr & 0xffff);
	gdtr = rgdt();
	db_printf("gdtr\t0x%08x/%04x\n",
	    (u_int)(gdtr >> 16), (u_int)gdtr & 0xffff);
	db_printf("ldtr\t0x%04x\n", rldt());
	db_printf("tr\t0x%04x\n", rtr());
	db_printf("cr0\t0x%08x\n", rcr0());
	db_printf("cr2\t0x%08x\n", rcr2());
	db_printf("cr3\t0x%08x\n", rcr3());
	db_printf("cr4\t0x%08x\n", rcr4());
}
#endif

void
sdtossd(sd, ssd)
	struct segment_descriptor *sd;
	struct soft_segment_descriptor *ssd;
{
	ssd->ssd_base = (sd->sd_hibase << 24) | sd->sd_lobase;
	ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit;
	ssd->ssd_type = sd->sd_type;
	ssd->ssd_dpl = sd->sd_dpl;
	ssd->ssd_p = sd->sd_p;
	ssd->ssd_def32 = sd->sd_def32;
	ssd->ssd_gran = sd->sd_gran;
}

static int
add_smap_entry(struct bios_smap *smap, vm_paddr_t *physmap, int *physmap_idxp)
{
	int i, insert_idx, physmap_idx;

	physmap_idx = *physmap_idxp;

	if (boothowto & RB_VERBOSE)
		printf("SMAP type=%02x base=%016llx len=%016llx\n",
		    smap->type, smap->base, smap->length);

	if (smap->type != SMAP_TYPE_MEMORY)
		return (1);

	if (smap->length == 0)
		return (1);

#ifndef PAE
	if (smap->base >= 0xffffffff) {
		printf("%uK of memory above 4GB ignored\n",
		    (u_int)(smap->length / 1024));
		return (1);
	}
#endif

	/*
	 * Find insertion point while checking for overlap.  Start off by
	 * assuming the new entry will be added to the end.
	 */
	insert_idx = physmap_idx + 2;
	for (i = 0; i <= physmap_idx; i += 2) {
		if (smap->base < physmap[i + 1]) {
			if (smap->base + smap->length <= physmap[i]) {
				insert_idx = i;
				break;
			}
			if (boothowto & RB_VERBOSE)
				printf(
		    "Overlapping memory regions, ignoring second region\n");
			return (1);
		}
	}

	/* See if we can prepend to the next entry. */
	if (insert_idx <= physmap_idx &&
	    smap->base + smap->length == physmap[insert_idx]) {
		physmap[insert_idx] = smap->base;
		return (1);
	}

	/* See if we can append to the previous entry. */
	if (insert_idx > 0 && smap->base == physmap[insert_idx - 1]) {
		physmap[insert_idx - 1] += smap->length;
		return (1);
	}

	physmap_idx += 2;
	*physmap_idxp = physmap_idx;
	if (physmap_idx == PHYSMAP_SIZE) {
		printf(
		"Too many segments in the physical address map, giving up\n");
		return (0);
	}

	/*
	 * Move the last 'N' entries down to make room for the new
	 * entry if needed.
	 */
	for (i = physmap_idx; i > insert_idx; i -= 2) {
		physmap[i] = physmap[i - 2];
		physmap[i + 1] = physmap[i - 1];
	}

	/* Insert the new entry. */
	physmap[insert_idx] = smap->base;
	physmap[insert_idx + 1] = smap->base + smap->length;
	return (1);
}
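
/*
 * For example (hypothetical values): an SMAP entry [0x100000, 0x7ff00000)
 * arriving after [0, 0xa0000) is appended as a new base/bound pair, while
 * an entry whose base equals an existing bound simply extends that pair
 * through the append case above.
 */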
1746
1747 static void
1748 basemem_setup(void)
1749 {
1750 vm_paddr_t pa;
1751 pt_entry_t *pte;
1752 int i;
1753
1754 if (basemem > 640) {
1755 printf("Preposterous BIOS basemem of %uK, truncating to 640K\n",
1756 basemem);
1757 basemem = 640;
1758 }
1759
1760 /*
1761 * XXX if biosbasemem is now < 640, there is a `hole'
1762 * between the end of base memory and the start of
1763 * ISA memory. The hole may be empty or it may
1764 * contain BIOS code or data. Map it read/write so
1765 * that the BIOS can write to it. (Memory from 0 to
1766 * the physical end of the kernel is mapped read-only
1767 * to begin with and then parts of it are remapped.
1768 * The parts that aren't remapped form holes that
1769 * remain read-only and are unused by the kernel.
1770 * The base memory area is below the physical end of
1771 * the kernel and right now forms a read-only hole.
1772 * The part of it from PAGE_SIZE to
1773 * (trunc_page(biosbasemem * 1024) - 1) will be
1774 * remapped and used by the kernel later.)
1775 *
1776 * This code is similar to the code used in
1777 * pmap_mapdev, but since no memory needs to be
1778 * allocated we simply change the mapping.
1779 */
1780 for (pa = trunc_page(basemem * 1024);
1781 pa < ISA_HOLE_START; pa += PAGE_SIZE)
1782 pmap_kenter(KERNBASE + pa, pa);
1783
1784 /*
1785 * Map pages between basemem and ISA_HOLE_START, if any, r/w into
1786 * the vm86 page table so that vm86 can scribble on them using
1787 * the vm86 map too. XXX: why 2 ways for this and only 1 way for
1788 * page 0, at least as initialized here?
1789 */
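
	/*
	 * Loop bounds sketch: basemem is in KB and each PTE maps a 4KB
	 * page, so basemem / 4 is the first page frame past base memory,
	 * and 160 * 4KB == 640KB == ISA_HOLE_START is where the mapping
	 * below stops.
	 */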
1790 pte = (pt_entry_t *)vm86paddr;
1791 for (i = basemem / 4; i < 160; i++)
1792 pte[i] = (i << PAGE_SHIFT) | PG_V | PG_RW | PG_U;
1793 }
1794
1795 /*
1796 * Populate the (physmap) array with base/bound pairs describing the
1797 * available physical memory in the system, then test this memory and
1798 * build the phys_avail array describing the actually-available memory.
1799 *
 * If we cannot accurately determine the physical memory map, then use
 * the value from the 0xE801 call, and failing that, the RTC.
1802 *
1803 * Total memory size may be set by the kernel environment variable
1804 * hw.physmem or the compile-time define MAXMEM.
1805 *
1806 * XXX first should be vm_paddr_t.
1807 */
1808 static void
1809 getmemsize(int first)
1810 {
1811 int has_smap, off, physmap_idx, pa_indx, da_indx;
1812 u_long physmem_tunable, memtest;
1813 vm_paddr_t physmap[PHYSMAP_SIZE];
1814 pt_entry_t *pte;
1815 quad_t dcons_addr, dcons_size;
1816 int hasbrokenint12, i;
1817 u_int extmem;
1818 struct vm86frame vmf;
1819 struct vm86context vmc;
1820 vm_paddr_t pa;
1821 struct bios_smap *smap, *smapbase, *smapend;
1822 u_int32_t smapsize;
1823 caddr_t kmdp;
1824
1825 has_smap = 0;
1826 #ifdef XBOX
1827 if (arch_i386_is_xbox) {
1828 /*
1829 * We queried the memory size before, so chop off 4MB for
1830 * the framebuffer and inform the OS of this.
1831 */
1832 physmap[0] = 0;
1833 physmap[1] = (arch_i386_xbox_memsize * 1024 * 1024) - XBOX_FB_SIZE;
1834 physmap_idx = 0;
1835 goto physmap_done;
1836 }
1837 #endif
1838 bzero(&vmf, sizeof(vmf));
1839 bzero(physmap, sizeof(physmap));
1840 basemem = 0;
1841
1842 /*
1843 * Check if the loader supplied an SMAP memory map. If so,
1844 * use that and do not make any VM86 calls.
1845 */
1846 physmap_idx = 0;
1847 smapbase = NULL;
1848 kmdp = preload_search_by_type("elf kernel");
1849 if (kmdp == NULL)
1850 kmdp = preload_search_by_type("elf32 kernel");
1851 if (kmdp != NULL)
1852 smapbase = (struct bios_smap *)preload_search_info(kmdp,
1853 MODINFO_METADATA | MODINFOMD_SMAP);
1854 if (smapbase != NULL) {
1855 /*
1856 * subr_module.c says:
1857 * "Consumer may safely assume that size value precedes data."
	 * i.e., a u_int32_t size immediately precedes the SMAP data.
1859 */
1860 smapsize = *((u_int32_t *)smapbase - 1);
1861 smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize);
1862 has_smap = 1;
1863
1864 for (smap = smapbase; smap < smapend; smap++)
1865 if (!add_smap_entry(smap, physmap, &physmap_idx))
1866 break;
1867 goto have_smap;
1868 }
1869
1870 /*
1871 * Some newer BIOSes have a broken INT 12H implementation
1872 * which causes a kernel panic immediately. In this case, we
	 * need to use the SMAP to determine the base memory size.
1874 */
1875 hasbrokenint12 = 0;
1876 TUNABLE_INT_FETCH("hw.hasbrokenint12", &hasbrokenint12);
1877 if (hasbrokenint12 == 0) {
1878 /* Use INT12 to determine base memory size. */
1879 vm86_intcall(0x12, &vmf);
1880 basemem = vmf.vmf_ax;
1881 basemem_setup();
1882 }
1883
1884 /*
1885 * Fetch the memory map with INT 15:E820. Map page 1 R/W into
1886 * the kernel page table so we can use it as a buffer. The
1887 * kernel will unmap this page later.
1888 */
1889 pmap_kenter(KERNBASE + (1 << PAGE_SHIFT), 1 << PAGE_SHIFT);
1890 vmc.npages = 0;
1891 smap = (void *)vm86_addpage(&vmc, 1, KERNBASE + (1 << PAGE_SHIFT));
1892 vm86_getptr(&vmc, (vm_offset_t)smap, &vmf.vmf_es, &vmf.vmf_di);
1893
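	/*
	 * INT 15h/E820 calling sketch (standard BIOS interface): %ebx
	 * carries a continuation value that is zero for the first call
	 * and is updated by the BIOS on each return; a returned %ebx of
	 * zero marks the final entry.  On success the BIOS echoes the
	 * 'SMAP' signature back in %eax, which is what the loop below
	 * checks for.
	 */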
1894 vmf.vmf_ebx = 0;
1895 do {
1896 vmf.vmf_eax = 0xE820;
1897 vmf.vmf_edx = SMAP_SIG;
1898 vmf.vmf_ecx = sizeof(struct bios_smap);
1899 i = vm86_datacall(0x15, &vmf, &vmc);
1900 if (i || vmf.vmf_eax != SMAP_SIG)
1901 break;
1902 has_smap = 1;
1903 if (!add_smap_entry(smap, physmap, &physmap_idx))
1904 break;
1905 } while (vmf.vmf_ebx != 0);
1906
1907 have_smap:
1908 /*
1909 * If we didn't fetch the "base memory" size from INT12,
1910 * figure it out from the SMAP (or just guess).
1911 */
1912 if (basemem == 0) {
1913 for (i = 0; i <= physmap_idx; i += 2) {
1914 if (physmap[i] == 0x00000000) {
1915 basemem = physmap[i + 1] / 1024;
1916 break;
1917 }
1918 }
1919
1920 /* XXX: If we couldn't find basemem from SMAP, just guess. */
1921 if (basemem == 0)
1922 basemem = 640;
1923 basemem_setup();
1924 }
1925
1926 if (physmap[1] != 0)
1927 goto physmap_done;
1928
1929 /*
1930 * If we failed to find an SMAP, figure out the extended
1931 * memory size. We will then build a simple memory map with
1932 * two segments, one for "base memory" and the second for
1933 * "extended memory". Note that "extended memory" starts at a
1934 * physical address of 1MB and that both basemem and extmem
1935 * are in units of 1KB.
1936 *
1937 * First, try to fetch the extended memory size via INT 15:E801.
1938 */
1939 vmf.vmf_ax = 0xE801;
1940 if (vm86_intcall(0x15, &vmf) == 0) {
1941 extmem = vmf.vmf_cx + vmf.vmf_dx * 64;
1942 } else {
1943 /*
1944 * If INT15:E801 fails, this is our last ditch effort
1945 * to determine the extended memory size. Currently
1946 * we prefer the RTC value over INT15:88.
1947 */
1948 #if 0
1949 vmf.vmf_ah = 0x88;
1950 vm86_intcall(0x15, &vmf);
1951 extmem = vmf.vmf_ax;
1952 #else
1953 extmem = rtcin(RTC_EXTLO) + (rtcin(RTC_EXTHI) << 8);
1954 #endif
1955 }
1956
1957 /*
1958 * Special hack for chipsets that still remap the 384k hole when
	 * there's 16MB of memory - this really confuses people who
1960 * are trying to use bus mastering ISA controllers with the
1961 * "16MB limit"; they only have 16MB, but the remapping puts
1962 * them beyond the limit.
1963 *
1964 * If extended memory is between 15-16MB (16-17MB phys address range),
1965 * chop it to 15MB.
1966 */
1967 if ((extmem > 15 * 1024) && (extmem < 16 * 1024))
1968 extmem = 15 * 1024;
1969
1970 physmap[0] = 0;
1971 physmap[1] = basemem * 1024;
1972 physmap_idx = 2;
1973 physmap[physmap_idx] = 0x100000;
1974 physmap[physmap_idx + 1] = physmap[physmap_idx] + extmem * 1024;
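
	/*
	 * Example of the resulting fallback map (illustrative numbers
	 * only): with basemem == 640 and extmem == 64512 (63MB),
	 *	physmap[0] = 0x00000000	physmap[1] = 0x000a0000
	 *	physmap[2] = 0x00100000	physmap[3] = 0x04000000
	 */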
1975
1976 physmap_done:
1977 /*
1978 * Now, physmap contains a map of physical memory.
1979 */
1980
1981 #ifdef SMP
1982 /* make hole for AP bootstrap code */
1983 physmap[1] = mp_bootaddress(physmap[1]);
1984 #endif
1985
1986 /*
	 * Maxmem isn't the "maximum memory"; it's one larger than the
1988 * highest page of the physical address space. It should be
1989 * called something like "Maxphyspage". We may adjust this
1990 * based on ``hw.physmem'' and the results of the memory test.
1991 */
1992 Maxmem = atop(physmap[physmap_idx + 1]);
1993
1994 #ifdef MAXMEM
1995 Maxmem = MAXMEM / 4;
1996 #endif
1997
1998 if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable))
1999 Maxmem = atop(physmem_tunable);
2000
2001 /*
2002 * If we have an SMAP, don't allow MAXMEM or hw.physmem to extend
2003 * the amount of memory in the system.
2004 */
2005 if (has_smap && Maxmem > atop(physmap[physmap_idx + 1]))
2006 Maxmem = atop(physmap[physmap_idx + 1]);
2007
2008 /*
2009 * By default keep the memtest enabled. Use a general name so that
2010 * one could eventually do more with the code than just disable it.
2011 */
2012 memtest = 1;
2013 TUNABLE_ULONG_FETCH("hw.memtest.tests", &memtest);
2014
2015 if (atop(physmap[physmap_idx + 1]) != Maxmem &&
2016 (boothowto & RB_VERBOSE))
2017 printf("Physical memory use set to %ldK\n", Maxmem * 4);
2018
2019 /*
2020 * If Maxmem has been increased beyond what the system has detected,
2021 * extend the last memory segment to the new limit.
2022 */
2023 if (atop(physmap[physmap_idx + 1]) < Maxmem)
2024 physmap[physmap_idx + 1] = ptoa((vm_paddr_t)Maxmem);
2025
2026 /* call pmap initialization to make new kernel address space */
2027 pmap_bootstrap(first);
2028
2029 /*
2030 * Size up each available chunk of physical memory.
2031 */
2032 physmap[0] = PAGE_SIZE; /* mask off page 0 */
2033 pa_indx = 0;
2034 da_indx = 1;
2035 phys_avail[pa_indx++] = physmap[0];
2036 phys_avail[pa_indx] = physmap[0];
2037 dump_avail[da_indx] = physmap[0];
2038 pte = CMAP1;
2039
2040 /*
2041 * Get dcons buffer address
2042 */
2043 if (getenv_quad("dcons.addr", &dcons_addr) == 0 ||
2044 getenv_quad("dcons.size", &dcons_size) == 0)
2045 dcons_addr = 0;
2046
2047 /*
2048 * physmap is in bytes, so when converting to page boundaries,
2049 * round up the start address and round down the end address.
2050 */
2051 for (i = 0; i <= physmap_idx; i += 2) {
2052 vm_paddr_t end;
2053
2054 end = ptoa((vm_paddr_t)Maxmem);
2055 if (physmap[i + 1] < end)
2056 end = trunc_page(physmap[i + 1]);
2057 for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) {
2058 int tmp, page_bad, full;
2059 int *ptr = (int *)CADDR1;
2060
2061 full = FALSE;
2062 /*
2063 * block out kernel memory as not available.
2064 */
2065 if (pa >= KERNLOAD && pa < first)
2066 goto do_dump_avail;
2067
2068 /*
2069 * block out dcons buffer
2070 */
2071 if (dcons_addr > 0
2072 && pa >= trunc_page(dcons_addr)
2073 && pa < dcons_addr + dcons_size)
2074 goto do_dump_avail;
2075
2076 page_bad = FALSE;
2077 if (memtest == 0)
2078 goto skip_memtest;
2079
2080 /*
			 * map page into kernel: valid, read/write, non-cacheable
2082 */
2083 *pte = pa | PG_V | PG_RW | PG_N;
2084 invltlb();
2085
2086 tmp = *(int *)ptr;
2087 /*
2088 * Test for alternating 1's and 0's
2089 */
2090 *(volatile int *)ptr = 0xaaaaaaaa;
2091 if (*(volatile int *)ptr != 0xaaaaaaaa)
2092 page_bad = TRUE;
2093 /*
2094 * Test for alternating 0's and 1's
2095 */
2096 *(volatile int *)ptr = 0x55555555;
2097 if (*(volatile int *)ptr != 0x55555555)
2098 page_bad = TRUE;
2099 /*
2100 * Test for all 1's
2101 */
2102 *(volatile int *)ptr = 0xffffffff;
2103 if (*(volatile int *)ptr != 0xffffffff)
2104 page_bad = TRUE;
2105 /*
2106 * Test for all 0's
2107 */
2108 *(volatile int *)ptr = 0x0;
2109 if (*(volatile int *)ptr != 0x0)
2110 page_bad = TRUE;
2111 /*
2112 * Restore original value.
2113 */
2114 *(int *)ptr = tmp;
2115
2116 skip_memtest:
2117 /*
2118 * Adjust array of valid/good pages.
2119 */
2120 if (page_bad == TRUE)
2121 continue;
2122 /*
2123 * If this good page is a continuation of the
2124 * previous set of good pages, then just increase
2125 * the end pointer. Otherwise start a new chunk.
			 * Note that the recorded end pointer points one
			 * page past the last good page, making the range
			 * >= start and < end.
			 * If we're also doing a speculative memory
			 * test and we are at or past the end, bump up
			 * Maxmem so that we keep going.  The first bad page
2131 * will terminate the loop.
2132 */
2133 if (phys_avail[pa_indx] == pa) {
2134 phys_avail[pa_indx] += PAGE_SIZE;
2135 } else {
2136 pa_indx++;
2137 if (pa_indx == PHYS_AVAIL_ARRAY_END) {
2138 printf(
2139 "Too many holes in the physical address space, giving up\n");
2140 pa_indx--;
2141 full = TRUE;
2142 goto do_dump_avail;
2143 }
2144 phys_avail[pa_indx++] = pa; /* start */
2145 phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */
2146 }
2147 physmem++;
2148 do_dump_avail:
2149 if (dump_avail[da_indx] == pa) {
2150 dump_avail[da_indx] += PAGE_SIZE;
2151 } else {
2152 da_indx++;
2153 if (da_indx == DUMP_AVAIL_ARRAY_END) {
2154 da_indx--;
2155 goto do_next;
2156 }
2157 dump_avail[da_indx++] = pa; /* start */
2158 dump_avail[da_indx] = pa + PAGE_SIZE; /* end */
2159 }
2160 do_next:
2161 if (full)
2162 break;
2163 }
2164 }
2165 *pte = 0;
2166 invltlb();
2167
2168 /*
2169 * XXX
2170 * The last chunk must contain at least one page plus the message
2171 * buffer to avoid complicating other code (message buffer address
2172 * calculation, etc.).
2173 */
2174 while (phys_avail[pa_indx - 1] + PAGE_SIZE +
2175 round_page(MSGBUF_SIZE) >= phys_avail[pa_indx]) {
2176 physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]);
2177 phys_avail[pa_indx--] = 0;
2178 phys_avail[pa_indx--] = 0;
2179 }
2180
2181 Maxmem = atop(phys_avail[pa_indx]);
2182
2183 /* Trim off space for the message buffer. */
2184 phys_avail[pa_indx] -= round_page(MSGBUF_SIZE);
2185
2186 /* Map the message buffer. */
2187 for (off = 0; off < round_page(MSGBUF_SIZE); off += PAGE_SIZE)
2188 pmap_kenter((vm_offset_t)msgbufp + off, phys_avail[pa_indx] +
2189 off);
2190 }
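
/*
 * Sketch of the result (addresses are only an example): phys_avail[]
 * ends up holding start/end pairs of tested, usable memory such as
 *	phys_avail[0] = 0x00001000	phys_avail[1] = 0x0009f000
 *	phys_avail[2] = 0x00100000	phys_avail[3] = 0x3fefc000
 * where the final end has already been trimmed by round_page(MSGBUF_SIZE)
 * for the message buffer mapped above, while dump_avail[] keeps the wider
 * set of pages that are safe to include in a crash dump.
 */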
2191
2192 void
2193 init386(first)
2194 int first;
2195 {
2196 struct gate_descriptor *gdp;
2197 int gsel_tss, metadata_missing, x;
2198 struct pcpu *pc;
2199
2200 thread0.td_kstack = proc0kstack;
2201 thread0.td_pcb = (struct pcb *)
2202 (thread0.td_kstack + KSTACK_PAGES * PAGE_SIZE) - 1;
2203
2204 /*
	 * This may be done better later if it grows more high-level
	 * components.  If so, just link td->td_proc here.
2207 */
2208 proc_linkup0(&proc0, &thread0);
2209
2210 metadata_missing = 0;
2211 if (bootinfo.bi_modulep) {
2212 preload_metadata = (caddr_t)bootinfo.bi_modulep + KERNBASE;
2213 preload_bootstrap_relocate(KERNBASE);
2214 } else {
2215 metadata_missing = 1;
2216 }
2217 if (envmode == 1)
2218 kern_envp = static_env;
2219 else if (bootinfo.bi_envp)
2220 kern_envp = (caddr_t)bootinfo.bi_envp + KERNBASE;
2221
2222 /* Init basic tunables, hz etc */
2223 init_param1();
2224
2225 /*
2226 * Make gdt memory segments. All segments cover the full 4GB
2227 * of address space and permissions are enforced at page level.
2228 */
2229 gdt_segs[GCODE_SEL].ssd_limit = atop(0 - 1);
2230 gdt_segs[GDATA_SEL].ssd_limit = atop(0 - 1);
2231 gdt_segs[GUCODE_SEL].ssd_limit = atop(0 - 1);
2232 gdt_segs[GUDATA_SEL].ssd_limit = atop(0 - 1);
2233 gdt_segs[GUFS_SEL].ssd_limit = atop(0 - 1);
2234 gdt_segs[GUGS_SEL].ssd_limit = atop(0 - 1);
2235
2236 pc = &__pcpu[0];
2237 gdt_segs[GPRIV_SEL].ssd_limit = atop(0 - 1);
2238 gdt_segs[GPRIV_SEL].ssd_base = (int) pc;
2239 gdt_segs[GPROC0_SEL].ssd_base = (int) &pc->pc_common_tss;
2240
2241 for (x = 0; x < NGDT; x++)
2242 ssdtosd(&gdt_segs[x], &gdt[x].sd);
2243
2244 r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
2245 r_gdt.rd_base = (int) gdt;
2246 mtx_init(&dt_lock, "descriptor tables", NULL, MTX_SPIN);
2247 lgdt(&r_gdt);
2248
2249 pcpu_init(pc, 0, sizeof(struct pcpu));
2250 PCPU_SET(prvspace, pc);
2251 PCPU_SET(curthread, &thread0);
2252 PCPU_SET(curpcb, thread0.td_pcb);
2253
2254 /*
2255 * Initialize mutexes.
2256 *
2257 * icu_lock: in order to allow an interrupt to occur in a critical
2258 * section, to set pcpu->ipending (etc...) properly, we
2259 * must be able to get the icu lock, so it can't be
2260 * under witness.
2261 */
2262 mutex_init();
2263 mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS | MTX_NOPROFILE);
2264
2265 /* make ldt memory segments */
2266 ldt_segs[LUCODE_SEL].ssd_limit = atop(0 - 1);
2267 ldt_segs[LUDATA_SEL].ssd_limit = atop(0 - 1);
2268 for (x = 0; x < sizeof ldt_segs / sizeof ldt_segs[0]; x++)
2269 ssdtosd(&ldt_segs[x], &ldt[x].sd);
2270
2271 _default_ldt = GSEL(GLDT_SEL, SEL_KPL);
2272 lldt(_default_ldt);
2273 PCPU_SET(currentldt, _default_ldt);
2274
2275 /* exceptions */
2276 for (x = 0; x < NIDT; x++)
2277 setidt(x, &IDTVEC(rsvd), SDT_SYS386TGT, SEL_KPL,
2278 GSEL(GCODE_SEL, SEL_KPL));
2279 setidt(IDT_DE, &IDTVEC(div), SDT_SYS386TGT, SEL_KPL,
2280 GSEL(GCODE_SEL, SEL_KPL));
2281 setidt(IDT_DB, &IDTVEC(dbg), SDT_SYS386IGT, SEL_KPL,
2282 GSEL(GCODE_SEL, SEL_KPL));
2283 setidt(IDT_NMI, &IDTVEC(nmi), SDT_SYS386IGT, SEL_KPL,
2284 GSEL(GCODE_SEL, SEL_KPL));
2285 setidt(IDT_BP, &IDTVEC(bpt), SDT_SYS386IGT, SEL_UPL,
2286 GSEL(GCODE_SEL, SEL_KPL));
2287 setidt(IDT_OF, &IDTVEC(ofl), SDT_SYS386TGT, SEL_UPL,
2288 GSEL(GCODE_SEL, SEL_KPL));
2289 setidt(IDT_BR, &IDTVEC(bnd), SDT_SYS386TGT, SEL_KPL,
2290 GSEL(GCODE_SEL, SEL_KPL));
2291 setidt(IDT_UD, &IDTVEC(ill), SDT_SYS386TGT, SEL_KPL,
2292 GSEL(GCODE_SEL, SEL_KPL));
	setidt(IDT_NM, &IDTVEC(dna), SDT_SYS386TGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
2295 setidt(IDT_DF, 0, SDT_SYSTASKGT, SEL_KPL, GSEL(GPANIC_SEL, SEL_KPL));
2296 setidt(IDT_FPUGP, &IDTVEC(fpusegm), SDT_SYS386TGT, SEL_KPL,
2297 GSEL(GCODE_SEL, SEL_KPL));
2298 setidt(IDT_TS, &IDTVEC(tss), SDT_SYS386TGT, SEL_KPL,
2299 GSEL(GCODE_SEL, SEL_KPL));
2300 setidt(IDT_NP, &IDTVEC(missing), SDT_SYS386TGT, SEL_KPL,
2301 GSEL(GCODE_SEL, SEL_KPL));
2302 setidt(IDT_SS, &IDTVEC(stk), SDT_SYS386TGT, SEL_KPL,
2303 GSEL(GCODE_SEL, SEL_KPL));
2304 setidt(IDT_GP, &IDTVEC(prot), SDT_SYS386TGT, SEL_KPL,
2305 GSEL(GCODE_SEL, SEL_KPL));
2306 setidt(IDT_PF, &IDTVEC(page), SDT_SYS386IGT, SEL_KPL,
2307 GSEL(GCODE_SEL, SEL_KPL));
2308 setidt(IDT_MF, &IDTVEC(fpu), SDT_SYS386TGT, SEL_KPL,
2309 GSEL(GCODE_SEL, SEL_KPL));
2310 setidt(IDT_AC, &IDTVEC(align), SDT_SYS386TGT, SEL_KPL,
2311 GSEL(GCODE_SEL, SEL_KPL));
2312 setidt(IDT_MC, &IDTVEC(mchk), SDT_SYS386TGT, SEL_KPL,
2313 GSEL(GCODE_SEL, SEL_KPL));
2314 setidt(IDT_XF, &IDTVEC(xmm), SDT_SYS386TGT, SEL_KPL,
2315 GSEL(GCODE_SEL, SEL_KPL));
2316 setidt(IDT_SYSCALL, &IDTVEC(int0x80_syscall), SDT_SYS386TGT, SEL_UPL,
2317 GSEL(GCODE_SEL, SEL_KPL));
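
	/*
	 * Gate-type note: SDT_SYS386IGT (interrupt gate) clears PSL_I on
	 * entry and is used where an interrupt window would be unsafe
	 * (debug, NMI, breakpoint, page fault), while SDT_SYS386TGT
	 * (trap gate) leaves PSL_I unchanged.  Gates installed with
	 * SEL_UPL (breakpoint, overflow, int 0x80) may be raised
	 * directly from user mode with an int instruction.
	 */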
2318
2319 r_idt.rd_limit = sizeof(idt0) - 1;
2320 r_idt.rd_base = (int) idt;
2321 lidt(&r_idt);
2322
2323 #ifdef XBOX
2324 /*
2325 * The following code queries the PCI ID of 0:0:0. For the XBOX,
	 * this should be 0x10de / 0x02a5.
2327 *
2328 * This is exactly what Linux does.
2329 */
2330 outl(0xcf8, 0x80000000);
2331 if (inl(0xcfc) == 0x02a510de) {
2332 arch_i386_is_xbox = 1;
2333 pic16l_setled(XBOX_LED_GREEN);
2334
2335 /*
2336 * We are an XBOX, but we may have either 64MB or 128MB of
2337 * memory. The PCI host bridge should be programmed for this,
2338 * so we just query it.
2339 */
2340 outl(0xcf8, 0x80000084);
2341 arch_i386_xbox_memsize = (inl(0xcfc) == 0x7FFFFFF) ? 128 : 64;
2342 }
2343 #endif /* XBOX */
2344
2345 /*
2346 * Initialize the i8254 before the console so that console
2347 * initialization can use DELAY().
2348 */
2349 i8254_init();
2350
2351 /*
2352 * Initialize the console before we print anything out.
2353 */
2354 cninit();
2355
2356 if (metadata_missing)
2357 printf("WARNING: loader(8) metadata is missing!\n");
2358
2359 #ifdef DEV_ISA
2360 elcr_probe();
2361 atpic_startup();
2362 #endif
2363
2364 #ifdef DDB
2365 ksym_start = bootinfo.bi_symtab;
2366 ksym_end = bootinfo.bi_esymtab;
2367 #endif
2368
2369 kdb_init();
2370
2371 #ifdef KDB
2372 if (boothowto & RB_KDB)
2373 kdb_enter_why(KDB_WHY_BOOTFLAGS,
2374 "Boot flags requested debugger");
2375 #endif
2376
2377 finishidentcpu(); /* Final stage of CPU initialization */
2378 setidt(IDT_UD, &IDTVEC(ill), SDT_SYS386TGT, SEL_KPL,
2379 GSEL(GCODE_SEL, SEL_KPL));
2380 setidt(IDT_GP, &IDTVEC(prot), SDT_SYS386TGT, SEL_KPL,
2381 GSEL(GCODE_SEL, SEL_KPL));
2382 initializecpu(); /* Initialize CPU registers */
2383
2384 /* make an initial tss so cpu can get interrupt stack on syscall! */
2385 /* Note: -16 is so we can grow the trapframe if we came from vm86 */
2386 PCPU_SET(common_tss.tss_esp0, thread0.td_kstack +
2387 KSTACK_PAGES * PAGE_SIZE - sizeof(struct pcb) - 16);
2388 PCPU_SET(common_tss.tss_ss0, GSEL(GDATA_SEL, SEL_KPL));
2389 gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
2390 PCPU_SET(tss_gdt, &gdt[GPROC0_SEL].sd);
2391 PCPU_SET(common_tssd, *PCPU_GET(tss_gdt));
2392 PCPU_SET(common_tss.tss_ioopt, (sizeof (struct i386tss)) << 16);
2393 ltr(gsel_tss);
2394
2395 /* pointer to selector slot for %fs/%gs */
2396 PCPU_SET(fsgs_gdt, &gdt[GUFS_SEL].sd);
2397
2398 dblfault_tss.tss_esp = dblfault_tss.tss_esp0 = dblfault_tss.tss_esp1 =
2399 dblfault_tss.tss_esp2 = (int)&dblfault_stack[sizeof(dblfault_stack)];
2400 dblfault_tss.tss_ss = dblfault_tss.tss_ss0 = dblfault_tss.tss_ss1 =
2401 dblfault_tss.tss_ss2 = GSEL(GDATA_SEL, SEL_KPL);
2402 #ifdef PAE
2403 dblfault_tss.tss_cr3 = (int)IdlePDPT;
2404 #else
2405 dblfault_tss.tss_cr3 = (int)IdlePTD;
2406 #endif
2407 dblfault_tss.tss_eip = (int)dblfault_handler;
2408 dblfault_tss.tss_eflags = PSL_KERNEL;
2409 dblfault_tss.tss_ds = dblfault_tss.tss_es =
2410 dblfault_tss.tss_gs = GSEL(GDATA_SEL, SEL_KPL);
2411 dblfault_tss.tss_fs = GSEL(GPRIV_SEL, SEL_KPL);
2412 dblfault_tss.tss_cs = GSEL(GCODE_SEL, SEL_KPL);
2413 dblfault_tss.tss_ldt = GSEL(GLDT_SEL, SEL_KPL);
2414
2415 vm86_initialize();
2416 getmemsize(first);
2417 init_param2(physmem);
2418
	/* now running on new page tables, configured, and u/iom is accessible */
2420
2421 msgbufinit(msgbufp, MSGBUF_SIZE);
2422
2423 /* make a call gate to reenter kernel with */
2424 gdp = &ldt[LSYS5CALLS_SEL].gd;
2425
2426 x = (int) &IDTVEC(lcall_syscall);
2427 gdp->gd_looffset = x;
2428 gdp->gd_selector = GSEL(GCODE_SEL,SEL_KPL);
2429 gdp->gd_stkcpy = 1;
2430 gdp->gd_type = SDT_SYS386CGT;
2431 gdp->gd_dpl = SEL_UPL;
2432 gdp->gd_p = 1;
2433 gdp->gd_hioffset = x >> 16;
2434
2435 /* XXX does this work? */
2436 /* XXX yes! */
2437 ldt[LBSDICALLS_SEL] = ldt[LSYS5CALLS_SEL];
2438 ldt[LSOL26CALLS_SEL] = ldt[LSYS5CALLS_SEL];
2439
2440 /* transfer to user mode */
2441
2442 _ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
2443 _udatasel = GSEL(GUDATA_SEL, SEL_UPL);
2444
2445 /* setup proc 0's pcb */
2446 thread0.td_pcb->pcb_flags = 0;
2447 #ifdef PAE
2448 thread0.td_pcb->pcb_cr3 = (int)IdlePDPT;
2449 #else
2450 thread0.td_pcb->pcb_cr3 = (int)IdlePTD;
2451 #endif
2452 thread0.td_pcb->pcb_ext = 0;
2453 thread0.td_frame = &proc0_tf;
2454 }
2455
2456 void
2457 cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size)
2458 {
2459
2460 pcpu->pc_acpi_id = 0xffffffff;
2461 }
2462
2463 void
2464 spinlock_enter(void)
2465 {
2466 struct thread *td;
2467 register_t flags;
2468
2469 td = curthread;
2470 if (td->td_md.md_spinlock_count == 0) {
2471 flags = intr_disable();
2472 td->td_md.md_spinlock_count = 1;
2473 td->td_md.md_saved_flags = flags;
2474 } else
2475 td->td_md.md_spinlock_count++;
2476 critical_enter();
2477 }
2478
2479 void
2480 spinlock_exit(void)
2481 {
2482 struct thread *td;
2483 register_t flags;
2484
2485 td = curthread;
2486 critical_exit();
2487 flags = td->td_md.md_saved_flags;
2488 td->td_md.md_spinlock_count--;
2489 if (td->td_md.md_spinlock_count == 0)
2490 intr_restore(flags);
2491 }
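
/*
 * Usage sketch: the pair nests, and each call also enters/exits a
 * critical section.  Interrupts are disabled on the outermost enter and
 * the saved flags are restored only when the count returns to zero:
 *
 *	spinlock_enter();	interrupts off, count 0 -> 1
 *	spinlock_enter();	nested, count 1 -> 2
 *	spinlock_exit();	count 2 -> 1, still disabled
 *	spinlock_exit();	count 1 -> 0, saved flags restored
 */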
2492
2493 #if defined(I586_CPU) && !defined(NO_F00F_HACK)
2494 static void f00f_hack(void *unused);
2495 SYSINIT(f00f_hack, SI_SUB_INTRINSIC, SI_ORDER_FIRST, f00f_hack, NULL);
2496
2497 static void
2498 f00f_hack(void *unused)
2499 {
2500 struct gate_descriptor *new_idt;
2501 vm_offset_t tmp;
2502
2503 if (!has_f00f_bug)
2504 return;
2505
2506 GIANT_REQUIRED;
2507
2508 printf("Intel Pentium detected, installing workaround for F00F bug\n");
2509
2510 tmp = kmem_alloc(kernel_map, PAGE_SIZE * 2);
2511 if (tmp == 0)
2512 panic("kmem_alloc returned 0");
2513
2514 /* Put the problematic entry (#6) at the end of the lower page. */
2515 new_idt = (struct gate_descriptor*)
2516 (tmp + PAGE_SIZE - 7 * sizeof(struct gate_descriptor));
2517 bcopy(idt, new_idt, sizeof(idt0));
2518 r_idt.rd_base = (u_int)new_idt;
2519 lidt(&r_idt);
2520 idt = new_idt;
2521 if (vm_map_protect(kernel_map, tmp, tmp + PAGE_SIZE,
2522 VM_PROT_READ, FALSE) != KERN_SUCCESS)
2523 panic("vm_map_protect failed");
2524 }
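
/*
 * Placement sketch for the relocation above: gate descriptors are 8
 * bytes, so new_idt starts 7 * 8 == 56 bytes before the end of the
 * lower page; entries 0-6 occupy those last 56 bytes and descriptor 6
 * (#UD, the vector raised by the F00F sequence) ends exactly at the
 * page boundary, while the rest of the table continues on the upper,
 * still-writable page.
 */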
2525 #endif /* defined(I586_CPU) && !NO_F00F_HACK */
2526
2527 /*
2528 * Construct a PCB from a trapframe. This is called from kdb_trap() where
2529 * we want to start a backtrace from the function that caused us to enter
2530 * the debugger. We have the context in the trapframe, but base the trace
2531 * on the PCB. The PCB doesn't have to be perfect, as long as it contains
2532 * enough for a backtrace.
2533 */
2534 void
2535 makectx(struct trapframe *tf, struct pcb *pcb)
2536 {
2537
2538 pcb->pcb_edi = tf->tf_edi;
2539 pcb->pcb_esi = tf->tf_esi;
2540 pcb->pcb_ebp = tf->tf_ebp;
2541 pcb->pcb_ebx = tf->tf_ebx;
2542 pcb->pcb_eip = tf->tf_eip;
2543 pcb->pcb_esp = (ISPL(tf->tf_cs)) ? tf->tf_esp : (int)(tf + 1) - 8;
2544 }
2545
2546 int
2547 ptrace_set_pc(struct thread *td, u_long addr)
2548 {
2549
2550 td->td_frame->tf_eip = addr;
2551 return (0);
2552 }
2553
2554 int
2555 ptrace_single_step(struct thread *td)
2556 {
2557 td->td_frame->tf_eflags |= PSL_T;
2558 return (0);
2559 }
2560
2561 int
2562 ptrace_clear_single_step(struct thread *td)
2563 {
2564 td->td_frame->tf_eflags &= ~PSL_T;
2565 return (0);
2566 }
2567
2568 int
2569 fill_regs(struct thread *td, struct reg *regs)
2570 {
2571 struct pcb *pcb;
2572 struct trapframe *tp;
2573
2574 tp = td->td_frame;
2575 pcb = td->td_pcb;
2576 regs->r_fs = tp->tf_fs;
2577 regs->r_es = tp->tf_es;
2578 regs->r_ds = tp->tf_ds;
2579 regs->r_edi = tp->tf_edi;
2580 regs->r_esi = tp->tf_esi;
2581 regs->r_ebp = tp->tf_ebp;
2582 regs->r_ebx = tp->tf_ebx;
2583 regs->r_edx = tp->tf_edx;
2584 regs->r_ecx = tp->tf_ecx;
2585 regs->r_eax = tp->tf_eax;
2586 regs->r_eip = tp->tf_eip;
2587 regs->r_cs = tp->tf_cs;
2588 regs->r_eflags = tp->tf_eflags;
2589 regs->r_esp = tp->tf_esp;
2590 regs->r_ss = tp->tf_ss;
2591 regs->r_gs = pcb->pcb_gs;
2592 return (0);
2593 }
2594
2595 int
2596 set_regs(struct thread *td, struct reg *regs)
2597 {
2598 struct pcb *pcb;
2599 struct trapframe *tp;
2600
2601 tp = td->td_frame;
2602 if (!EFL_SECURE(regs->r_eflags, tp->tf_eflags) ||
2603 !CS_SECURE(regs->r_cs))
2604 return (EINVAL);
2605 pcb = td->td_pcb;
2606 tp->tf_fs = regs->r_fs;
2607 tp->tf_es = regs->r_es;
2608 tp->tf_ds = regs->r_ds;
2609 tp->tf_edi = regs->r_edi;
2610 tp->tf_esi = regs->r_esi;
2611 tp->tf_ebp = regs->r_ebp;
2612 tp->tf_ebx = regs->r_ebx;
2613 tp->tf_edx = regs->r_edx;
2614 tp->tf_ecx = regs->r_ecx;
2615 tp->tf_eax = regs->r_eax;
2616 tp->tf_eip = regs->r_eip;
2617 tp->tf_cs = regs->r_cs;
2618 tp->tf_eflags = regs->r_eflags;
2619 tp->tf_esp = regs->r_esp;
2620 tp->tf_ss = regs->r_ss;
2621 pcb->pcb_gs = regs->r_gs;
2622 return (0);
2623 }
2624
2625 #ifdef CPU_ENABLE_SSE
2626 static void
2627 fill_fpregs_xmm(sv_xmm, sv_87)
2628 struct savexmm *sv_xmm;
2629 struct save87 *sv_87;
2630 {
2631 register struct env87 *penv_87 = &sv_87->sv_env;
2632 register struct envxmm *penv_xmm = &sv_xmm->sv_env;
2633 int i;
2634
2635 bzero(sv_87, sizeof(*sv_87));
2636
2637 /* FPU control/status */
2638 penv_87->en_cw = penv_xmm->en_cw;
2639 penv_87->en_sw = penv_xmm->en_sw;
2640 penv_87->en_tw = penv_xmm->en_tw;
2641 penv_87->en_fip = penv_xmm->en_fip;
2642 penv_87->en_fcs = penv_xmm->en_fcs;
2643 penv_87->en_opcode = penv_xmm->en_opcode;
2644 penv_87->en_foo = penv_xmm->en_foo;
2645 penv_87->en_fos = penv_xmm->en_fos;
2646
2647 /* FPU registers */
2648 for (i = 0; i < 8; ++i)
2649 sv_87->sv_ac[i] = sv_xmm->sv_fp[i].fp_acc;
2650 }
2651
2652 static void
2653 set_fpregs_xmm(sv_87, sv_xmm)
2654 struct save87 *sv_87;
2655 struct savexmm *sv_xmm;
2656 {
2657 register struct env87 *penv_87 = &sv_87->sv_env;
2658 register struct envxmm *penv_xmm = &sv_xmm->sv_env;
2659 int i;
2660
2661 /* FPU control/status */
2662 penv_xmm->en_cw = penv_87->en_cw;
2663 penv_xmm->en_sw = penv_87->en_sw;
2664 penv_xmm->en_tw = penv_87->en_tw;
2665 penv_xmm->en_fip = penv_87->en_fip;
2666 penv_xmm->en_fcs = penv_87->en_fcs;
2667 penv_xmm->en_opcode = penv_87->en_opcode;
2668 penv_xmm->en_foo = penv_87->en_foo;
2669 penv_xmm->en_fos = penv_87->en_fos;
2670
2671 /* FPU registers */
2672 for (i = 0; i < 8; ++i)
2673 sv_xmm->sv_fp[i].fp_acc = sv_87->sv_ac[i];
2674 }
2675 #endif /* CPU_ENABLE_SSE */
2676
2677 int
2678 fill_fpregs(struct thread *td, struct fpreg *fpregs)
2679 {
2680 #ifdef CPU_ENABLE_SSE
2681 if (cpu_fxsr) {
2682 fill_fpregs_xmm(&td->td_pcb->pcb_save.sv_xmm,
2683 (struct save87 *)fpregs);
2684 return (0);
2685 }
2686 #endif /* CPU_ENABLE_SSE */
2687 bcopy(&td->td_pcb->pcb_save.sv_87, fpregs, sizeof *fpregs);
2688 return (0);
2689 }
2690
2691 int
2692 set_fpregs(struct thread *td, struct fpreg *fpregs)
2693 {
2694 #ifdef CPU_ENABLE_SSE
2695 if (cpu_fxsr) {
2696 set_fpregs_xmm((struct save87 *)fpregs,
2697 &td->td_pcb->pcb_save.sv_xmm);
2698 return (0);
2699 }
2700 #endif /* CPU_ENABLE_SSE */
2701 bcopy(fpregs, &td->td_pcb->pcb_save.sv_87, sizeof *fpregs);
2702 return (0);
2703 }
2704
2705 /*
2706 * Get machine context.
2707 */
2708 int
2709 get_mcontext(struct thread *td, mcontext_t *mcp, int flags)
2710 {
2711 struct trapframe *tp;
2712
2713 tp = td->td_frame;
2714
2715 PROC_LOCK(curthread->td_proc);
2716 mcp->mc_onstack = sigonstack(tp->tf_esp);
2717 PROC_UNLOCK(curthread->td_proc);
2718 mcp->mc_gs = td->td_pcb->pcb_gs;
2719 mcp->mc_fs = tp->tf_fs;
2720 mcp->mc_es = tp->tf_es;
2721 mcp->mc_ds = tp->tf_ds;
2722 mcp->mc_edi = tp->tf_edi;
2723 mcp->mc_esi = tp->tf_esi;
2724 mcp->mc_ebp = tp->tf_ebp;
2725 mcp->mc_isp = tp->tf_isp;
2726 mcp->mc_eflags = tp->tf_eflags;
2727 if (flags & GET_MC_CLEAR_RET) {
2728 mcp->mc_eax = 0;
2729 mcp->mc_edx = 0;
2730 mcp->mc_eflags &= ~PSL_C;
2731 } else {
2732 mcp->mc_eax = tp->tf_eax;
2733 mcp->mc_edx = tp->tf_edx;
2734 }
2735 mcp->mc_ebx = tp->tf_ebx;
2736 mcp->mc_ecx = tp->tf_ecx;
2737 mcp->mc_eip = tp->tf_eip;
2738 mcp->mc_cs = tp->tf_cs;
2739 mcp->mc_esp = tp->tf_esp;
2740 mcp->mc_ss = tp->tf_ss;
2741 mcp->mc_len = sizeof(*mcp);
2742 get_fpcontext(td, mcp);
2743 bzero(mcp->mc_spare1, sizeof(mcp->mc_spare1));
2744 bzero(mcp->mc_spare2, sizeof(mcp->mc_spare2));
2745 return (0);
2746 }
2747
2748 /*
2749 * Set machine context.
2750 *
2751 * However, we don't set any but the user modifiable flags, and we won't
2752 * touch the cs selector.
2753 */
2754 int
2755 set_mcontext(struct thread *td, const mcontext_t *mcp)
2756 {
2757 struct trapframe *tp;
2758 int eflags, ret;
2759
2760 tp = td->td_frame;
2761 if (mcp->mc_len != sizeof(*mcp))
2762 return (EINVAL);
2763 eflags = (mcp->mc_eflags & PSL_USERCHANGE) |
2764 (tp->tf_eflags & ~PSL_USERCHANGE);
2765 if ((ret = set_fpcontext(td, mcp)) == 0) {
2766 tp->tf_fs = mcp->mc_fs;
2767 tp->tf_es = mcp->mc_es;
2768 tp->tf_ds = mcp->mc_ds;
2769 tp->tf_edi = mcp->mc_edi;
2770 tp->tf_esi = mcp->mc_esi;
2771 tp->tf_ebp = mcp->mc_ebp;
2772 tp->tf_ebx = mcp->mc_ebx;
2773 tp->tf_edx = mcp->mc_edx;
2774 tp->tf_ecx = mcp->mc_ecx;
2775 tp->tf_eax = mcp->mc_eax;
2776 tp->tf_eip = mcp->mc_eip;
2777 tp->tf_eflags = eflags;
2778 tp->tf_esp = mcp->mc_esp;
2779 tp->tf_ss = mcp->mc_ss;
2780 td->td_pcb->pcb_gs = mcp->mc_gs;
2781 ret = 0;
2782 }
2783 return (ret);
2784 }
2785
2786 static void
2787 get_fpcontext(struct thread *td, mcontext_t *mcp)
2788 {
2789 #ifndef DEV_NPX
2790 mcp->mc_fpformat = _MC_FPFMT_NODEV;
2791 mcp->mc_ownedfp = _MC_FPOWNED_NONE;
2792 bzero(mcp->mc_fpstate, sizeof(mcp->mc_fpstate));
2793 #else
2794 union savefpu *addr;
2795
2796 /*
2797 * XXX mc_fpstate might be misaligned, since its declaration is not
2798 * unportabilized using __attribute__((aligned(16))) like the
2799 * declaration of struct savemm, and anyway, alignment doesn't work
2800 * for auto variables since we don't use gcc's pessimal stack
2801 * alignment. Work around this by abusing the spare fields after
2802 * mcp->mc_fpstate.
2803 *
2804 * XXX unpessimize most cases by only aligning when fxsave might be
2805 * called, although this requires knowing too much about
2806 * npxgetregs()'s internals.
2807 */
2808 addr = (union savefpu *)&mcp->mc_fpstate;
2809 if (td == PCPU_GET(fpcurthread) &&
2810 #ifdef CPU_ENABLE_SSE
2811 cpu_fxsr &&
2812 #endif
2813 ((uintptr_t)(void *)addr & 0xF)) {
2814 do
2815 addr = (void *)((char *)addr + 4);
2816 while ((uintptr_t)(void *)addr & 0xF);
2817 }
2818 mcp->mc_ownedfp = npxgetregs(td, addr);
2819 if (addr != (union savefpu *)&mcp->mc_fpstate) {
2820 bcopy(addr, &mcp->mc_fpstate, sizeof(mcp->mc_fpstate));
2821 bzero(&mcp->mc_spare2, sizeof(mcp->mc_spare2));
2822 }
2823 mcp->mc_fpformat = npxformat();
2824 #endif
2825 }
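
/*
 * Alignment sketch: fxsave needs a 16-byte aligned save area, so when
 * mc_fpstate is misaligned the loop above bumps addr in 4-byte steps
 * until its low four bits clear.  The state is then saved at most 12
 * bytes past mc_fpstate, spilling into the spare fields, which is why
 * it is copied back and the borrowed mc_spare2 area re-zeroed.
 */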
2826
2827 static int
2828 set_fpcontext(struct thread *td, const mcontext_t *mcp)
2829 {
2830 union savefpu *addr;
2831
2832 if (mcp->mc_fpformat == _MC_FPFMT_NODEV)
2833 return (0);
2834 else if (mcp->mc_fpformat != _MC_FPFMT_387 &&
2835 mcp->mc_fpformat != _MC_FPFMT_XMM)
2836 return (EINVAL);
2837 else if (mcp->mc_ownedfp == _MC_FPOWNED_NONE)
2838 /* We don't care what state is left in the FPU or PCB. */
2839 fpstate_drop(td);
2840 else if (mcp->mc_ownedfp == _MC_FPOWNED_FPU ||
2841 mcp->mc_ownedfp == _MC_FPOWNED_PCB) {
2842 /* XXX align as above. */
2843 addr = (union savefpu *)&mcp->mc_fpstate;
2844 if (td == PCPU_GET(fpcurthread) &&
2845 #ifdef CPU_ENABLE_SSE
2846 cpu_fxsr &&
2847 #endif
2848 ((uintptr_t)(void *)addr & 0xF)) {
2849 do
2850 addr = (void *)((char *)addr + 4);
2851 while ((uintptr_t)(void *)addr & 0xF);
2852 bcopy(&mcp->mc_fpstate, addr, sizeof(mcp->mc_fpstate));
2853 }
2854 #ifdef DEV_NPX
2855 #ifdef CPU_ENABLE_SSE
2856 if (cpu_fxsr)
2857 addr->sv_xmm.sv_env.en_mxcsr &= cpu_mxcsr_mask;
2858 #endif
2859 /*
2860 * XXX we violate the dubious requirement that npxsetregs()
2861 * be called with interrupts disabled.
2862 */
2863 npxsetregs(td, addr);
2864 #endif
2865 /*
2866 * Don't bother putting things back where they were in the
2867 * misaligned case, since we know that the caller won't use
2868 * them again.
2869 */
2870 } else
2871 return (EINVAL);
2872 return (0);
2873 }
2874
2875 static void
2876 fpstate_drop(struct thread *td)
2877 {
2878 register_t s;
2879
2880 s = intr_disable();
2881 #ifdef DEV_NPX
2882 if (PCPU_GET(fpcurthread) == td)
2883 npxdrop();
2884 #endif
2885 /*
2886 * XXX force a full drop of the npx. The above only drops it if we
2887 * owned it. npxgetregs() has the same bug in the !cpu_fxsr case.
2888 *
2889 * XXX I don't much like npxgetregs()'s semantics of doing a full
2890 * drop. Dropping only to the pcb matches fnsave's behaviour.
2891 * We only need to drop to !PCB_INITDONE in sendsig(). But
2892 * sendsig() is the only caller of npxgetregs()... perhaps we just
2893 * have too many layers.
2894 */
2895 curthread->td_pcb->pcb_flags &= ~PCB_NPXINITDONE;
2896 intr_restore(s);
2897 }
2898
2899 int
2900 fill_dbregs(struct thread *td, struct dbreg *dbregs)
2901 {
2902 struct pcb *pcb;
2903
2904 if (td == NULL) {
2905 dbregs->dr[0] = rdr0();
2906 dbregs->dr[1] = rdr1();
2907 dbregs->dr[2] = rdr2();
2908 dbregs->dr[3] = rdr3();
2909 dbregs->dr[4] = rdr4();
2910 dbregs->dr[5] = rdr5();
2911 dbregs->dr[6] = rdr6();
2912 dbregs->dr[7] = rdr7();
2913 } else {
2914 pcb = td->td_pcb;
2915 dbregs->dr[0] = pcb->pcb_dr0;
2916 dbregs->dr[1] = pcb->pcb_dr1;
2917 dbregs->dr[2] = pcb->pcb_dr2;
2918 dbregs->dr[3] = pcb->pcb_dr3;
2919 dbregs->dr[4] = 0;
2920 dbregs->dr[5] = 0;
2921 dbregs->dr[6] = pcb->pcb_dr6;
2922 dbregs->dr[7] = pcb->pcb_dr7;
2923 }
2924 return (0);
2925 }
2926
2927 int
2928 set_dbregs(struct thread *td, struct dbreg *dbregs)
2929 {
2930 struct pcb *pcb;
2931 int i;
2932
2933 if (td == NULL) {
2934 load_dr0(dbregs->dr[0]);
2935 load_dr1(dbregs->dr[1]);
2936 load_dr2(dbregs->dr[2]);
2937 load_dr3(dbregs->dr[3]);
2938 load_dr4(dbregs->dr[4]);
2939 load_dr5(dbregs->dr[5]);
2940 load_dr6(dbregs->dr[6]);
2941 load_dr7(dbregs->dr[7]);
2942 } else {
2943 /*
2944 * Don't let an illegal value for dr7 get set. Specifically,
2945 * check for undefined settings. Setting these bit patterns
		 * results in undefined behaviour and can lead to an unexpected
2947 * TRCTRAP.
2948 */
2949 for (i = 0; i < 4; i++) {
2950 if (DBREG_DR7_ACCESS(dbregs->dr[7], i) == 0x02)
2951 return (EINVAL);
2952 if (DBREG_DR7_LEN(dbregs->dr[7], i) == 0x02)
2953 return (EINVAL);
2954 }
2955
2956 pcb = td->td_pcb;
2957
2958 /*
2959 * Don't let a process set a breakpoint that is not within the
2960 * process's address space. If a process could do this, it
2961 * could halt the system by setting a breakpoint in the kernel
2962 * (if ddb was enabled). Thus, we need to check to make sure
2963 * that no breakpoints are being enabled for addresses outside
		 * the process's address space.
2965 *
2966 * XXX - what about when the watched area of the user's
2967 * address space is written into from within the kernel
2968 * ... wouldn't that still cause a breakpoint to be generated
2969 * from within kernel mode?
2970 */
2971
2972 if (DBREG_DR7_ENABLED(dbregs->dr[7], 0)) {
2973 /* dr0 is enabled */
2974 if (dbregs->dr[0] >= VM_MAXUSER_ADDRESS)
2975 return (EINVAL);
2976 }
2977
2978 if (DBREG_DR7_ENABLED(dbregs->dr[7], 1)) {
2979 /* dr1 is enabled */
2980 if (dbregs->dr[1] >= VM_MAXUSER_ADDRESS)
2981 return (EINVAL);
2982 }
2983
2984 if (DBREG_DR7_ENABLED(dbregs->dr[7], 2)) {
2985 /* dr2 is enabled */
2986 if (dbregs->dr[2] >= VM_MAXUSER_ADDRESS)
2987 return (EINVAL);
2988 }
2989
2990 if (DBREG_DR7_ENABLED(dbregs->dr[7], 3)) {
2991 /* dr3 is enabled */
2992 if (dbregs->dr[3] >= VM_MAXUSER_ADDRESS)
2993 return (EINVAL);
2994 }
2995
2996 pcb->pcb_dr0 = dbregs->dr[0];
2997 pcb->pcb_dr1 = dbregs->dr[1];
2998 pcb->pcb_dr2 = dbregs->dr[2];
2999 pcb->pcb_dr3 = dbregs->dr[3];
3000 pcb->pcb_dr6 = dbregs->dr[6];
3001 pcb->pcb_dr7 = dbregs->dr[7];
3002
3003 pcb->pcb_flags |= PCB_DBREGS;
3004 }
3005
3006 return (0);
3007 }
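
/*
 * DR7 layout relied on above (Intel-defined): bits 0-7 hold the
 * local/global enable pairs that DBREG_DR7_ENABLED() tests, and bits
 * 16-31 hold a 2-bit access type plus a 2-bit length field for each of
 * the four breakpoints; the 0x02 encodings rejected above are the
 * undefined ones.
 */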
3008
3009 /*
3010 * Return > 0 if a hardware breakpoint has been hit, and the
3011 * breakpoint was in user space. Return 0, otherwise.
3012 */
3013 int
3014 user_dbreg_trap(void)
3015 {
3016 u_int32_t dr7, dr6; /* debug registers dr6 and dr7 */
3017 u_int32_t bp; /* breakpoint bits extracted from dr6 */
3018 int nbp; /* number of breakpoints that triggered */
3019 caddr_t addr[4]; /* breakpoint addresses */
3020 int i;
3021
3022 dr7 = rdr7();
3023 if ((dr7 & 0x000000ff) == 0) {
3024 /*
3025 * all GE and LE bits in the dr7 register are zero,
3026 * thus the trap couldn't have been caused by the
3027 * hardware debug registers
3028 */
		return (0);
3030 }
3031
3032 nbp = 0;
3033 dr6 = rdr6();
3034 bp = dr6 & 0x0000000f;
3035
3036 if (!bp) {
3037 /*
		 * None of the breakpoint bits are set, meaning this
3039 * trap was not caused by any of the debug registers
3040 */
		return (0);
3042 }
3043
3044 /*
	 * at least one of the breakpoints was hit; check to see
	 * which ones, and whether any of them are user-space addresses
3047 */
3048
3049 if (bp & 0x01) {
3050 addr[nbp++] = (caddr_t)rdr0();
3051 }
3052 if (bp & 0x02) {
3053 addr[nbp++] = (caddr_t)rdr1();
3054 }
3055 if (bp & 0x04) {
3056 addr[nbp++] = (caddr_t)rdr2();
3057 }
3058 if (bp & 0x08) {
3059 addr[nbp++] = (caddr_t)rdr3();
3060 }
3061
3062 for (i = 0; i < nbp; i++) {
3063 if (addr[i] < (caddr_t)VM_MAXUSER_ADDRESS) {
3064 /*
3065 * addr[i] is in user space
3066 */
			return (nbp);
3068 }
3069 }
3070
3071 /*
3072 * None of the breakpoints are in user space.
3073 */
	return (0);
3075 }
3076
3077 #ifndef DEV_APIC
3078 #include <machine/apicvar.h>
3079
3080 /*
3081 * Provide stub functions so that the MADT APIC enumerator in the acpi
3082 * kernel module will link against a kernel without 'device apic'.
3083 *
3084 * XXX - This is a gross hack.
3085 */
3086 void
3087 apic_register_enumerator(struct apic_enumerator *enumerator)
3088 {
3089 }
3090
3091 void *
3092 ioapic_create(vm_paddr_t addr, int32_t apic_id, int intbase)
3093 {
3094 return (NULL);
3095 }
3096
3097 int
3098 ioapic_disable_pin(void *cookie, u_int pin)
3099 {
3100 return (ENXIO);
3101 }
3102
3103 int
3104 ioapic_get_vector(void *cookie, u_int pin)
3105 {
3106 return (-1);
3107 }
3108
3109 void
3110 ioapic_register(void *cookie)
3111 {
3112 }
3113
3114 int
3115 ioapic_remap_vector(void *cookie, u_int pin, int vector)
3116 {
3117 return (ENXIO);
3118 }
3119
3120 int
3121 ioapic_set_extint(void *cookie, u_int pin)
3122 {
3123 return (ENXIO);
3124 }
3125
3126 int
3127 ioapic_set_nmi(void *cookie, u_int pin)
3128 {
3129 return (ENXIO);
3130 }
3131
3132 int
3133 ioapic_set_polarity(void *cookie, u_int pin, enum intr_polarity pol)
3134 {
3135 return (ENXIO);
3136 }
3137
3138 int
3139 ioapic_set_triggermode(void *cookie, u_int pin, enum intr_trigger trigger)
3140 {
3141 return (ENXIO);
3142 }
3143
3144 void
3145 lapic_create(u_int apic_id, int boot_cpu)
3146 {
3147 }
3148
3149 void
3150 lapic_init(vm_paddr_t addr)
3151 {
3152 }
3153
3154 int
3155 lapic_set_lvt_mode(u_int apic_id, u_int lvt, u_int32_t mode)
3156 {
3157 return (ENXIO);
3158 }
3159
3160 int
3161 lapic_set_lvt_polarity(u_int apic_id, u_int lvt, enum intr_polarity pol)
3162 {
3163 return (ENXIO);
3164 }
3165
3166 int
3167 lapic_set_lvt_triggermode(u_int apic_id, u_int lvt, enum intr_trigger trigger)
3168 {
3169 return (ENXIO);
3170 }
3171 #endif
3172
3173 #ifdef KDB
3174
3175 /*
3176 * Provide inb() and outb() as functions. They are normally only
 * available as macros calling inlined functions, and thus cannot be
3178 * called from the debugger.
3179 *
3180 * The actual code is stolen from <machine/cpufunc.h>, and de-inlined.
3181 */
3182
3183 #undef inb
3184 #undef outb
3185
3186 /* silence compiler warnings */
3187 u_char inb(u_int);
3188 void outb(u_int, u_char);
3189
3190 u_char
3191 inb(u_int port)
3192 {
3193 u_char data;
3194 /*
3195 * We use %%dx and not %1 here because i/o is done at %dx and not at
3196 * %edx, while gcc generates inferior code (movw instead of movl)
3197 * if we tell it to load (u_short) port.
3198 */
3199 __asm __volatile("inb %%dx,%0" : "=a" (data) : "d" (port));
3200 return (data);
3201 }
3202
3203 void
3204 outb(u_int port, u_char data)
3205 {
3206 u_char al;
3207 /*
3208 * Use an unnecessary assignment to help gcc's register allocator.
	 * This makes a large difference for gcc-1.40 and a tiny difference
3210 * for gcc-2.6.0. For gcc-1.40, al had to be ``asm("ax")'' for
3211 * best results. gcc-2.6.0 can't handle this.
3212 */
3213 al = data;
3214 __asm __volatile("outb %0,%%dx" : : "a" (al), "d" (port));
3215 }
3216
3217 #endif /* KDB */