/*-
 * Copyright (c) 2003 Peter Wemm.
 * Copyright (c) 1992 Terrence R. Lambert.
 * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * William Jolitz.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	from: @(#)machdep.c	7.4 (Berkeley) 6/3/91
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: releng/5.3/sys/amd64/amd64/machdep.c 144698 2005-04-06 01:06:44Z cperciva $");

#include "opt_atalk.h"
#include "opt_atpic.h"
#include "opt_compat.h"
#include "opt_cpu.h"
#include "opt_ddb.h"
#include "opt_inet.h"
#include "opt_ipx.h"
#include "opt_isa.h"
#include "opt_kstack_pages.h"
#include "opt_maxmem.h"
#include "opt_msgbuf.h"
#include "opt_perfmon.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/signalvar.h>
#include <sys/imgact.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/linker.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/memrange.h>
#include <sys/mutex.h>
#include <sys/pcpu.h>
#include <sys/proc.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/reboot.h>
#include <sys/callout.h>
#include <sys/msgbuf.h>
#include <sys/sched.h>
#include <sys/sysent.h>
#include <sys/sysctl.h>
#include <sys/ucontext.h>
#include <sys/vmmeter.h>
#include <sys/bus.h>
#include <sys/eventhandler.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_pager.h>
#include <vm/vm_extern.h>

#include <sys/user.h>
#include <sys/exec.h>
#include <sys/cons.h>

#ifdef DDB
#ifndef KDB
#error KDB must be enabled in order for DDB to work!
#endif
#endif
#include <ddb/ddb.h>

#include <net/netisr.h>

#include <machine/cpu.h>
#include <machine/cputypes.h>
#include <machine/reg.h>
#include <machine/clock.h>
#include <machine/specialreg.h>
#include <machine/intr_machdep.h>
#include <machine/md_var.h>
#include <machine/metadata.h>
#include <machine/proc.h>
#ifdef PERFMON
#include <machine/perfmon.h>
#endif
#include <machine/tss.h>
#ifdef SMP
#include <machine/smp.h>
#endif

#include <amd64/isa/icu.h>

#include <isa/isareg.h>
#include <isa/rtc.h>
#include <sys/ptrace.h>
#include <machine/sigframe.h>

/* Sanity check for __curthread() */
CTASSERT(offsetof(struct pcpu, pc_curthread) == 0);
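/*
 * __curthread() in <machine/pcpu.h> fetches curthread with a single
 * "movq %gs:0,%reg"; the assertion above keeps that hard-wired zero
 * offset honest if struct pcpu is ever rearranged.
 */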

extern u_int64_t hammer_time(u_int64_t, u_int64_t);
extern void dblfault_handler(void);

extern void printcpuinfo(void);	/* XXX header file */
extern void identify_cpu(void);
extern void panicifcpuunsupported(void);

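/*
 * CS_SECURE() accepts only code selectors whose privilege level is
 * user (ring 3).  EFL_SECURE() accepts a new %rflags value only if
 * every bit outside PSL_USERCHANGE matches the old value, e.g. a
 * process may toggle PSL_T but can never grant itself PSL_IOPL.
 */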
#define	CS_SECURE(cs)		(ISPL(cs) == SEL_UPL)
#define	EFL_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)

static void cpu_startup(void *);
static void get_fpcontext(struct thread *td, mcontext_t *mcp);
static int set_fpcontext(struct thread *td, const mcontext_t *mcp);
SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL)

#ifdef DDB
extern vm_offset_t ksym_start, ksym_end;
#endif

int	_udatasel, _ucodesel, _ucode32sel;

int cold = 1;

long Maxmem = 0;

vm_paddr_t phys_avail[20];

/* must be 2 less so 0 0 can signal end of chunks */
#define	PHYS_AVAIL_ARRAY_END ((sizeof(phys_avail) / sizeof(vm_offset_t)) - 2)

struct kva_md_info kmi;

static struct trapframe proc0_tf;
struct region_descriptor r_gdt, r_idt;

struct pcpu __pcpu[MAXCPU];

struct mtx icu_lock;

struct mem_range_softc mem_range_softc;

static void
cpu_startup(dummy)
	void *dummy;
{
	/*
	 * Good {morning,afternoon,evening,night}.
	 */
	startrtclock();
	printcpuinfo();
	panicifcpuunsupported();
#ifdef PERFMON
	perfmon_init();
#endif
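	/*
	 * ptoa() converts a page count to bytes; with 4 kB pages the
	 * figure below is simply Maxmem << PAGE_SHIFT.
	 */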
	printf("real memory  = %ju (%ju MB)\n", ptoa((uintmax_t)Maxmem),
	    ptoa((uintmax_t)Maxmem) / 1048576);
	/*
	 * Display any holes after the first chunk of extended memory.
	 */
	if (bootverbose) {
		int indx;

		printf("Physical memory chunk(s):\n");
		for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) {
			vm_paddr_t size;

			size = phys_avail[indx + 1] - phys_avail[indx];
			printf(
			    "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n",
			    (uintmax_t)phys_avail[indx],
			    (uintmax_t)phys_avail[indx + 1] - 1,
			    (uintmax_t)size, (uintmax_t)size / PAGE_SIZE);
		}
	}

	vm_ksubmap_init(&kmi);

	printf("avail memory = %ju (%ju MB)\n",
	    ptoa((uintmax_t)cnt.v_free_count),
	    ptoa((uintmax_t)cnt.v_free_count) / 1048576);

	/*
	 * Set up buffers, so they can be used to read disk labels.
	 */
	bufinit();
	vm_pager_bufferinit();

	cpu_setregs();
}

/*
 * Send an interrupt to process.
 *
 * Stack is set up to allow sigcode stored
 * at top to call routine, followed by kcall
 * to sigreturn routine below.  After sigreturn
 * resets the signal mask, the stack, and the
 * frame pointer, it returns to the user
 * specified pc, psl.
 */
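/*
 * Illustrative layout of the user stack as sendsig() leaves it
 * (highest address first); this is a sketch, the authoritative
 * definition is struct sigframe in <machine/sigframe.h>:
 *
 *	interrupted frame's red zone / previous stack contents
 *	struct sigframe (ucontext, siginfo, handler pointer)  <- new %rsp
 *
 * %rdi, %rsi, %rdx and %rcx carry the handler arguments, and %rip is
 * aimed at the signal trampoline that exec copied out just below
 * PS_STRINGS.
 */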
void
sendsig(catcher, sig, mask, code)
	sig_t catcher;
	int sig;
	sigset_t *mask;
	u_long code;
{
	struct sigframe sf, *sfp;
	struct proc *p;
	struct thread *td;
	struct sigacts *psp;
	char *sp;
	struct trapframe *regs;
	int oonstack;

	td = curthread;
	p = td->td_proc;
	PROC_LOCK_ASSERT(p, MA_OWNED);
	psp = p->p_sigacts;
	mtx_assert(&psp->ps_mtx, MA_OWNED);
	regs = td->td_frame;
	oonstack = sigonstack(regs->tf_rsp);

	/* Save user context. */
	bzero(&sf, sizeof(sf));
	sf.sf_uc.uc_sigmask = *mask;
	sf.sf_uc.uc_stack = td->td_sigstk;
	sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
	    ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE;
	sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0;
	bcopy(regs, &sf.sf_uc.uc_mcontext.mc_rdi, sizeof(*regs));
	sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext); /* magic */
	get_fpcontext(td, &sf.sf_uc.uc_mcontext);
	fpstate_drop(td);

	/* Allocate space for the signal handler context. */
	if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack &&
	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
		sp = td->td_sigstk.ss_sp +
		    td->td_sigstk.ss_size - sizeof(struct sigframe);
#if defined(COMPAT_43)
		td->td_sigstk.ss_flags |= SS_ONSTACK;
#endif
	} else
		sp = (char *)regs->tf_rsp - sizeof(struct sigframe) - 128;
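	/*
	 * The 128 bytes subtracted above step over the amd64 ABI red
	 * zone below the interrupted %rsp, which signal delivery must
	 * not clobber.
	 */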
	/* Align to 16 bytes. */
	sfp = (struct sigframe *)((unsigned long)sp & ~0xFul);

	/* Translate the signal if appropriate. */
	if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize)
		sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];

	/* Build the argument list for the signal handler. */
	regs->tf_rdi = sig;			/* arg 1 in %rdi */
	regs->tf_rdx = (register_t)&sfp->sf_uc;	/* arg 3 in %rdx */
	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
		/* Signal handler installed with SA_SIGINFO. */
		regs->tf_rsi = (register_t)&sfp->sf_si;	/* arg 2 in %rsi */
		sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher;

		/* Fill in POSIX parts */
		sf.sf_si.si_signo = sig;
		sf.sf_si.si_code = code;
		regs->tf_rcx = regs->tf_addr;	/* arg 4 in %rcx */
	} else {
		/* Old FreeBSD-style arguments. */
		regs->tf_rsi = code;		/* arg 2 in %rsi */
		regs->tf_rcx = regs->tf_addr;	/* arg 4 in %rcx */
		sf.sf_ahu.sf_handler = catcher;
	}
	mtx_unlock(&psp->ps_mtx);
	PROC_UNLOCK(p);

	/*
	 * Copy the sigframe out to the user's stack.
	 */
	if (copyout(&sf, sfp, sizeof(*sfp)) != 0) {
#ifdef DEBUG
		printf("process %ld has trashed its stack\n", (long)p->p_pid);
#endif
		PROC_LOCK(p);
		sigexit(td, SIGILL);
	}

	regs->tf_rsp = (long)sfp;
	regs->tf_rip = PS_STRINGS - *(p->p_sysent->sv_szsigcode);
	regs->tf_rflags &= ~PSL_T;
	regs->tf_cs = _ucodesel;
	PROC_LOCK(p);
	mtx_lock(&psp->ps_mtx);
}

/*
 * Build siginfo_t for SA thread
 */
void
cpu_thread_siginfo(int sig, u_long code, siginfo_t *si)
{
	struct proc *p;
	struct thread *td;
	struct trapframe *regs;

	td = curthread;
	p = td->td_proc;
	regs = td->td_frame;
	PROC_LOCK_ASSERT(p, MA_OWNED);

	bzero(si, sizeof(*si));
	si->si_signo = sig;
	si->si_code = code;
	si->si_addr = (void *)regs->tf_addr;
	/* XXXKSE fill other fields */
}

/*
 * System call to cleanup state after a signal
 * has been taken.  Reset signal mask and
 * stack state from context left by sendsig (above).
 * Return to previous pc and psl as specified by
 * context left by sendsig.  Check carefully to
 * make sure that the user has not modified the
 * state to gain improper privileges.
 *
 * MPSAFE
 */
int
sigreturn(td, uap)
	struct thread *td;
	struct sigreturn_args /* {
		const __ucontext *sigcntxp;
	} */ *uap;
{
	ucontext_t uc;
	struct proc *p = td->td_proc;
	struct trapframe *regs;
	const ucontext_t *ucp;
	long rflags;
	int cs, error, ret;

	error = copyin(uap->sigcntxp, &uc, sizeof(uc));
	if (error != 0)
		return (error);
	ucp = &uc;
	regs = td->td_frame;
	rflags = ucp->uc_mcontext.mc_rflags;
	/*
	 * Don't allow users to change privileged or reserved flags.
	 */
	/*
	 * XXX do allow users to change the privileged flag PSL_RF.
	 * The cpu sets PSL_RF in tf_rflags for faults.  Debuggers
	 * should sometimes set it there too.  tf_rflags is kept in
	 * the signal context during signal handling and there is no
	 * other place to remember it, so the PSL_RF bit may be
	 * corrupted by the signal handler without us knowing.
	 * Corruption of the PSL_RF bit at worst causes one more or
	 * one less debugger trap, so allowing it is fairly harmless.
	 */
	if (!EFL_SECURE(rflags & ~PSL_RF, regs->tf_rflags & ~PSL_RF)) {
		printf("sigreturn: rflags = 0x%lx\n", rflags);
		return (EINVAL);
	}

	/*
	 * Don't allow users to load a valid privileged %cs.  Let the
	 * hardware check for invalid selectors, excess privilege in
	 * other selectors, invalid %rip's and invalid %rsp's.
	 */
	cs = ucp->uc_mcontext.mc_cs;
	if (!CS_SECURE(cs)) {
		printf("sigreturn: cs = 0x%x\n", cs);
		trapsignal(td, SIGBUS, T_PROTFLT);
		return (EINVAL);
	}

	ret = set_fpcontext(td, &ucp->uc_mcontext);
	if (ret != 0)
		return (ret);
	bcopy(&ucp->uc_mcontext.mc_rdi, regs, sizeof(*regs));

	PROC_LOCK(p);
#if defined(COMPAT_43)
	if (ucp->uc_mcontext.mc_onstack & 1)
		td->td_sigstk.ss_flags |= SS_ONSTACK;
	else
		td->td_sigstk.ss_flags &= ~SS_ONSTACK;
#endif

	td->td_sigmask = ucp->uc_sigmask;
	SIG_CANTMASK(td->td_sigmask);
	signotify(td);
	PROC_UNLOCK(p);
	td->td_pcb->pcb_flags |= PCB_FULLCTX;
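	/*
	 * EJUSTRETURN tells the syscall return path to leave the
	 * trapframe alone: the context copied in above, rather than
	 * the usual error/return-value convention, is what reaches
	 * user mode.  PCB_FULLCTX requests a full register reload on
	 * the way out, including registers a plain syscall return
	 * would not restore.
	 */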
	return (EJUSTRETURN);
}

#ifdef COMPAT_FREEBSD4
int
freebsd4_sigreturn(struct thread *td, struct freebsd4_sigreturn_args *uap)
{

	return sigreturn(td, (struct sigreturn_args *)uap);
}
#endif


/*
 * Machine dependent boot() routine
 *
 * I haven't seen anything to put here yet
 * Possibly some stuff might be grafted back here from boot()
 */
void
cpu_boot(int howto)
{
}

/*
 * Shutdown the CPU as much as possible
 */
void
cpu_halt(void)
{
	for (;;)
		__asm__ ("hlt");
}

/*
 * Hook to idle the CPU when possible.  In the SMP case we default to
 * off because a halted cpu will not currently pick up a new thread in the
 * run queue until the next timer tick.  If turned on this will result in
 * approximately a 4.2% loss in real time performance in buildworld tests
 * (but improves user and sys times oddly enough), and saves approximately
 * 5% in power consumption on an idle machine (tests w/2xCPU 1.1GHz P3).
 *
 * XXX we need to have a cpu mask of idle cpus and generate an IPI or
 * otherwise generate some sort of interrupt to wake up cpus sitting in HLT.
 * Then we can have our cake and eat it too.
 *
 * XXX I'm turning it on for SMP as well by default for now.  It seems to
 * help lock contention somewhat, and this is critical for HTT. -Peter
 */
static int	cpu_idle_hlt = 1;
SYSCTL_INT(_machdep, OID_AUTO, cpu_idle_hlt, CTLFLAG_RW,
    &cpu_idle_hlt, 0, "Idle loop HLT enable");

static void
cpu_idle_default(void)
{
	/*
	 * We must guarantee that hlt is the very next instruction
	 * after sti, or we introduce a timing window.
	 */
	__asm __volatile("sti; hlt");
}

/*
 * Note that we have to be careful here to avoid a race between checking
 * sched_runnable() and actually halting.  If we don't do this, we may waste
 * the time between calling hlt and the next interrupt even though there
 * is a runnable process.
 */
void
cpu_idle(void)
{

	if (cpu_idle_hlt) {
		disable_intr();
		if (sched_runnable())
			enable_intr();
		else
			(*cpu_idle_hook)();
	}
}

/* Other subsystems (e.g., ACPI) can hook this later. */
void (*cpu_idle_hook)(void) = cpu_idle_default;
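
/*
 * The default hook depends on the x86 rule that "sti" enables
 * interrupts only after the following instruction has executed, so an
 * interrupt posted between the sched_runnable() check and "hlt" is
 * delivered at the hlt itself and wakes the CPU.  ACPI, for one,
 * later replaces the hook with a C-state aware idle routine.
 */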

/*
 * Clear registers on exec
 */
void
exec_setregs(td, entry, stack, ps_strings)
	struct thread *td;
	u_long entry;
	u_long stack;
	u_long ps_strings;
{
	struct trapframe *regs = td->td_frame;
	struct pcb *pcb = td->td_pcb;

	wrmsr(MSR_FSBASE, 0);
	wrmsr(MSR_KGSBASE, 0);	/* User value while we're in the kernel */
	pcb->pcb_fsbase = 0;
	pcb->pcb_gsbase = 0;
	load_ds(_udatasel);
	load_es(_udatasel);
	load_fs(_udatasel);
	load_gs(_udatasel);
	pcb->pcb_ds = _udatasel;
	pcb->pcb_es = _udatasel;
	pcb->pcb_fs = _udatasel;
	pcb->pcb_gs = _udatasel;

	bzero((char *)regs, sizeof(struct trapframe));
	regs->tf_rip = entry;
	regs->tf_rsp = ((stack - 8) & ~0xFul) + 8;
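	/*
	 * The expression above leaves %rsp == 8 (mod 16), presumably
	 * the alignment a function observes right after a "call",
	 * which is what ABI-conforming prologues expect at the entry
	 * point.
	 */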
	regs->tf_rdi = stack;		/* argv */
	regs->tf_rflags = PSL_USER | (regs->tf_rflags & PSL_T);
	regs->tf_ss = _udatasel;
	regs->tf_cs = _ucodesel;

	/*
	 * Reset the hardware debug registers if they were in use.
	 * They won't have any meaning for the newly exec'd process.
	 */
	if (pcb->pcb_flags & PCB_DBREGS) {
		pcb->pcb_dr0 = 0;
		pcb->pcb_dr1 = 0;
		pcb->pcb_dr2 = 0;
		pcb->pcb_dr3 = 0;
		pcb->pcb_dr6 = 0;
		pcb->pcb_dr7 = 0;
		if (pcb == PCPU_GET(curpcb)) {
			/*
			 * Clear the debug registers on the running
			 * CPU, otherwise they will end up affecting
			 * the next process we switch to.
			 */
			reset_dbregs();
		}
		pcb->pcb_flags &= ~PCB_DBREGS;
	}

	/*
	 * Drop the FP state if we hold it, so that the process gets a
	 * clean FP state if it uses the FPU again.
	 */
	fpstate_drop(td);
}

void
cpu_setregs(void)
{
	register_t cr0;

	cr0 = rcr0();
	/*
	 * CR0_MP, CR0_NE and CR0_TS are also set by npx_probe() for the
	 * BSP.  See the comments there about why we set them.
	 */
	cr0 |= CR0_MP | CR0_NE | CR0_TS;
	cr0 |= CR0_WP | CR0_AM;
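	/*
	 * CR0_TS makes the first FPU instruction trap so state can be
	 * restored lazily, CR0_WP makes the kernel honor read-only
	 * user pages (required for copy-on-write), and CR0_AM arms
	 * alignment checking for user code running with PSL_AC set.
	 */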
	load_cr0(cr0);
}

static int
sysctl_machdep_adjkerntz(SYSCTL_HANDLER_ARGS)
{
	int error;

	error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2,
	    req);
	if (!error && req->newptr)
		resettodr();
	return (error);
}

SYSCTL_PROC(_machdep, CPU_ADJKERNTZ, adjkerntz, CTLTYPE_INT|CTLFLAG_RW,
    &adjkerntz, 0, sysctl_machdep_adjkerntz, "I", "");

SYSCTL_INT(_machdep, CPU_DISRTCSET, disable_rtc_set,
    CTLFLAG_RW, &disable_rtc_set, 0, "");

SYSCTL_INT(_machdep, CPU_WALLCLOCK, wall_cmos_clock,
    CTLFLAG_RW, &wall_cmos_clock, 0, "");

/*
 * Initialize amd64 and configure to run kernel
 */

/*
 * Initialize segments & interrupt table
 */

struct user_segment_descriptor gdt[NGDT * MAXCPU];/* global descriptor table */
static struct gate_descriptor idt0[NIDT];
struct gate_descriptor *idt = &idt0[0];	/* interrupt descriptor table */

static char dblfault_stack[PAGE_SIZE] __aligned(16);

struct amd64tss common_tss[MAXCPU];

/* software prototypes -- in more palatable form */
struct soft_segment_descriptor gdt_segs[] = {
/* GNULL_SEL	0 Null Descriptor */
{	0x0,			/* segment base address */
	0x0,			/* length */
	0,			/* segment type */
	0,			/* segment descriptor priority level */
	0,			/* segment descriptor present */
	0,			/* long */
	0,			/* default 32 vs 16 bit size */
	0			/* limit granularity (byte/page units)*/ },
/* GCODE_SEL	1 Code Descriptor for kernel */
{	0x0,			/* segment base address */
	0xfffff,		/* length - all address space */
	SDT_MEMERA,		/* segment type */
	SEL_KPL,		/* segment descriptor priority level */
	1,			/* segment descriptor present */
	1,			/* long */
	0,			/* default 32 vs 16 bit size */
	1			/* limit granularity (byte/page units)*/ },
/* GDATA_SEL	2 Data Descriptor for kernel */
{	0x0,			/* segment base address */
	0xfffff,		/* length - all address space */
	SDT_MEMRWA,		/* segment type */
	SEL_KPL,		/* segment descriptor priority level */
	1,			/* segment descriptor present */
	1,			/* long */
	0,			/* default 32 vs 16 bit size */
	1			/* limit granularity (byte/page units)*/ },
/* GUCODE32_SEL	3 32 bit Code Descriptor for user */
{	0x0,			/* segment base address */
	0xfffff,		/* length - all address space */
	SDT_MEMERA,		/* segment type */
	SEL_UPL,		/* segment descriptor priority level */
	1,			/* segment descriptor present */
	0,			/* long */
	1,			/* default 32 vs 16 bit size */
	1			/* limit granularity (byte/page units)*/ },
/* GUDATA_SEL	4 32/64 bit Data Descriptor for user */
{	0x0,			/* segment base address */
	0xfffff,		/* length - all address space */
	SDT_MEMRWA,		/* segment type */
	SEL_UPL,		/* segment descriptor priority level */
	1,			/* segment descriptor present */
	0,			/* long */
	1,			/* default 32 vs 16 bit size */
	1			/* limit granularity (byte/page units)*/ },
/* GUCODE_SEL	5 64 bit Code Descriptor for user */
{	0x0,			/* segment base address */
	0xfffff,		/* length - all address space */
	SDT_MEMERA,		/* segment type */
	SEL_UPL,		/* segment descriptor priority level */
	1,			/* segment descriptor present */
	1,			/* long */
	0,			/* default 32 vs 16 bit size */
	1			/* limit granularity (byte/page units)*/ },
/* GPROC0_SEL	6 Proc 0 Tss Descriptor */
{
	0x0,			/* segment base address */
	sizeof(struct amd64tss)-1,/* length - all address space */
	SDT_SYSTSS,		/* segment type */
	SEL_KPL,		/* segment descriptor priority level */
	1,			/* segment descriptor present */
	0,			/* long */
	0,			/* unused - default 32 vs 16 bit size */
	0			/* limit granularity (byte/page units)*/ },
/* Actually, the TSS is a system descriptor which is double size */
{	0x0,			/* segment base address */
	0x0,			/* length */
	0,			/* segment type */
	0,			/* segment descriptor priority level */
	0,			/* segment descriptor present */
	0,			/* long */
	0,			/* default 32 vs 16 bit size */
	0			/* limit granularity (byte/page units)*/ },
};
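
/*
 * In long mode the CPU ignores the base and limit of code and data
 * descriptors (addressing is flat); the lengths above matter only to
 * the 32 bit compatibility segments such as GUCODE32_SEL.  The TSS
 * descriptor still carries a live base and limit, which is why it is
 * double size and consumes the slot after GPROC0_SEL.
 */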

void
setidt(idx, func, typ, dpl, ist)
	int idx;
	inthand_t *func;
	int typ;
	int dpl;
	int ist;
{
	struct gate_descriptor *ip;

	ip = idt + idx;
	ip->gd_looffset = (uintptr_t)func;
	ip->gd_selector = GSEL(GCODE_SEL, SEL_KPL);
	ip->gd_ist = ist;
	ip->gd_xx = 0;
	ip->gd_type = typ;
	ip->gd_dpl = dpl;
	ip->gd_p = 1;
	ip->gd_hioffset = ((uintptr_t)func)>>16;
}
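
/*
 * Example from hammer_time() below:
 *
 *	setidt(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1);
 *
 * The trailing 1 selects IST slot 1, so the double fault handler runs
 * on dblfault_stack even if the normal kernel stack is gone.
 */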

#define	IDTVEC(name)	__CONCAT(X,name)

extern inthand_t
	IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
	IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
	IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
	IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align),
	IDTVEC(xmm), IDTVEC(dblfault),
	IDTVEC(fast_syscall), IDTVEC(fast_syscall32);

void
sdtossd(sd, ssd)
	struct user_segment_descriptor *sd;
	struct soft_segment_descriptor *ssd;
{

	ssd->ssd_base = (sd->sd_hibase << 24) | sd->sd_lobase;
	ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit;
	ssd->ssd_type = sd->sd_type;
	ssd->ssd_dpl = sd->sd_dpl;
	ssd->ssd_p = sd->sd_p;
	ssd->ssd_long = sd->sd_long;
	ssd->ssd_def32 = sd->sd_def32;
	ssd->ssd_gran = sd->sd_gran;
}

void
ssdtosd(ssd, sd)
	struct soft_segment_descriptor *ssd;
	struct user_segment_descriptor *sd;
{

	sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xff;
	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
	sd->sd_type = ssd->ssd_type;
	sd->sd_dpl = ssd->ssd_dpl;
	sd->sd_p = ssd->ssd_p;
	sd->sd_long = ssd->ssd_long;
	sd->sd_def32 = ssd->ssd_def32;
	sd->sd_gran = ssd->ssd_gran;
}

void
ssdtosyssd(ssd, sd)
	struct soft_segment_descriptor *ssd;
	struct system_segment_descriptor *sd;
{

	sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xfffffffffful;
	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
	sd->sd_type = ssd->ssd_type;
	sd->sd_dpl = ssd->ssd_dpl;
	sd->sd_p = ssd->ssd_p;
	sd->sd_gran = ssd->ssd_gran;
}

#if !defined(DEV_ATPIC) && defined(DEV_ISA)
#include <isa/isavar.h>
u_int
isa_irq_pending(void)
{

	return (0);
}
#endif

#define	PHYSMAP_SIZE	(2 * 8)

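/*
 * One INT 15h/E820 entry as the loader hands it to us; type 0x01
 * marks usable RAM and is the only type getmemsize() keeps.
 */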
struct bios_smap {
	u_int64_t base;
	u_int64_t length;
	u_int32_t type;
} __packed;

u_int basemem;

/*
 * Populate the (physmap) array with base/bound pairs describing the
 * available physical memory in the system, then test this memory and
 * build the phys_avail array describing the actually-available memory.
 *
 * If we cannot accurately determine the physical memory map, then use
 * the value from the 0xE801 call, and failing that, the RTC.
 *
 * Total memory size may be set by the kernel environment variable
 * hw.physmem or the compile-time define MAXMEM.
 *
 * XXX first should be vm_paddr_t.
 */
static void
getmemsize(caddr_t kmdp, u_int64_t first)
{
	int i, physmap_idx, pa_indx;
	vm_paddr_t pa, physmap[PHYSMAP_SIZE];
	pt_entry_t *pte;
	char *cp;
	struct bios_smap *smapbase, *smap, *smapend;
	u_int32_t smapsize;

	bzero(physmap, sizeof(physmap));
	basemem = 0;
	physmap_idx = 0;

	/*
	 * get memory map from INT 15:E820, kindly supplied by the loader.
	 *
	 * subr_module.c says:
	 * "Consumer may safely assume that size value precedes data."
	 * i.e., an int32_t immediately precedes smap.
	 */
	smapbase = (struct bios_smap *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_SMAP);
	if (smapbase == NULL)
		panic("No BIOS smap info from loader!");

	smapsize = *((u_int32_t *)smapbase - 1);
	smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize);

	for (smap = smapbase; smap < smapend; smap++) {
		if (boothowto & RB_VERBOSE)
			printf("SMAP type=%02x base=%016lx len=%016lx\n",
			    smap->type, smap->base, smap->length);

		if (smap->type != 0x01)
			continue;

		if (smap->length == 0)
			continue;

		for (i = 0; i <= physmap_idx; i += 2) {
			if (smap->base < physmap[i + 1]) {
				if (boothowto & RB_VERBOSE)
					printf(
862 "Overlapping or non-montonic memory region, ignoring second region\n");
				goto next_run;
			}
		}

		if (smap->base == physmap[physmap_idx + 1]) {
			physmap[physmap_idx + 1] += smap->length;
next_run:
			continue;
		}

		physmap_idx += 2;
		if (physmap_idx == PHYSMAP_SIZE) {
			printf(
			"Too many segments in the physical address map, giving up\n");
			break;
		}
		physmap[physmap_idx] = smap->base;
		physmap[physmap_idx + 1] = smap->base + smap->length;
	}

	/*
	 * Find the 'base memory' segment for SMP
	 */
	basemem = 0;
	for (i = 0; i <= physmap_idx; i += 2) {
		if (physmap[i] == 0x00000000) {
			basemem = physmap[i + 1] / 1024;
			break;
		}
	}
	if (basemem == 0)
		panic("BIOS smap did not include a basemem segment!");

#ifdef SMP
	/* make hole for AP bootstrap code */
	physmap[1] = mp_bootaddress(physmap[1] / 1024);
#endif

	/*
	 * Maxmem isn't the "maximum memory", it's one larger than the
	 * highest page of the physical address space.  It should be
	 * called something like "Maxphyspage".  We may adjust this
	 * based on ``hw.physmem'' and the results of the memory test.
	 */
	Maxmem = atop(physmap[physmap_idx + 1]);

#ifdef MAXMEM
	Maxmem = MAXMEM / 4;
#endif

	/*
	 * hw.physmem is a size in bytes; we also allow k, m, and g suffixes
	 * for the appropriate modifiers.  This overrides MAXMEM.
	 */
	cp = getenv("hw.physmem");
	if (cp != NULL) {
		u_int64_t AllowMem, sanity;
		char *ep;

		sanity = AllowMem = strtouq(cp, &ep, 0);
		if ((ep != cp) && (*ep != 0)) {
			switch(*ep) {
			case 'g':
			case 'G':
				AllowMem <<= 10;
			case 'm':
			case 'M':
				AllowMem <<= 10;
			case 'k':
			case 'K':
				AllowMem <<= 10;
				break;
			default:
				AllowMem = sanity = 0;
			}
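			/*
			 * The case labels above fall through
			 * deliberately: 'g' shifts by 10 three times,
			 * 'm' twice and 'k' once.
			 */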
			if (AllowMem < sanity)
				AllowMem = 0;
		}
		if (AllowMem == 0)
			printf("Ignoring invalid memory size of '%s'\n", cp);
		else
			Maxmem = atop(AllowMem);
		freeenv(cp);
	}

	if (atop(physmap[physmap_idx + 1]) != Maxmem &&
	    (boothowto & RB_VERBOSE))
		printf("Physical memory use set to %ldK\n", Maxmem * 4);

	/*
	 * If Maxmem has been increased beyond what the system has detected,
	 * extend the last memory segment to the new limit.
	 */
	if (atop(physmap[physmap_idx + 1]) < Maxmem)
		physmap[physmap_idx + 1] = ptoa((vm_paddr_t)Maxmem);

	/* call pmap initialization to make new kernel address space */
	pmap_bootstrap(&first);

	/*
	 * Size up each available chunk of physical memory.
	 */
	physmap[0] = PAGE_SIZE;		/* mask off page 0 */
	pa_indx = 0;
	phys_avail[pa_indx++] = physmap[0];
	phys_avail[pa_indx] = physmap[0];
	pte = CMAP1;

	/*
	 * physmap is in bytes, so when converting to page boundaries,
	 * round up the start address and round down the end address.
	 */
	for (i = 0; i <= physmap_idx; i += 2) {
		vm_paddr_t end;

		end = ptoa((vm_paddr_t)Maxmem);
		if (physmap[i + 1] < end)
			end = trunc_page(physmap[i + 1]);
		for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) {
			int tmp, page_bad;
			int *ptr = (int *)CADDR1;

			/*
			 * block out kernel memory as not available.
			 */
			if (pa >= 0x100000 && pa < first)
				continue;

			page_bad = FALSE;

			/*
			 * map page into kernel: valid, read/write,
			 * non-cacheable
			 */
			*pte = pa | PG_V | PG_RW | PG_N;
			invltlb();

			tmp = *(int *)ptr;
			/*
			 * Test for alternating 1's and 0's
			 */
			*(volatile int *)ptr = 0xaaaaaaaa;
			if (*(volatile int *)ptr != 0xaaaaaaaa)
				page_bad = TRUE;
			/*
			 * Test for alternating 0's and 1's
			 */
			*(volatile int *)ptr = 0x55555555;
			if (*(volatile int *)ptr != 0x55555555)
				page_bad = TRUE;
			/*
			 * Test for all 1's
			 */
			*(volatile int *)ptr = 0xffffffff;
			if (*(volatile int *)ptr != 0xffffffff)
				page_bad = TRUE;
			/*
			 * Test for all 0's
			 */
			*(volatile int *)ptr = 0x0;
			if (*(volatile int *)ptr != 0x0)
				page_bad = TRUE;
			/*
			 * Restore original value.
			 */
			*(int *)ptr = tmp;

			/*
			 * Adjust array of valid/good pages.
			 */
			if (page_bad == TRUE)
				continue;
			/*
			 * If this good page is a continuation of the
			 * previous set of good pages, then just increase
			 * the end pointer.  Otherwise start a new chunk.
			 * Note that "end" points one past the last valid
			 * address, making the range >= start and < end.
			 * If we're also doing a speculative memory test
			 * and we are at or past the end, bump up Maxmem
			 * so that we keep going.  The first bad page
			 * will terminate the loop.
			 */
			if (phys_avail[pa_indx] == pa) {
				phys_avail[pa_indx] += PAGE_SIZE;
			} else {
				pa_indx++;
				if (pa_indx == PHYS_AVAIL_ARRAY_END) {
					printf(
		"Too many holes in the physical address space, giving up\n");
					pa_indx--;
					break;
				}
				phys_avail[pa_indx++] = pa;	/* start */
				phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */
			}
			physmem++;
		}
	}
	*pte = 0;
	invltlb();

	/*
	 * XXX
	 * The last chunk must contain at least one page plus the message
	 * buffer to avoid complicating other code (message buffer address
	 * calculation, etc.).
	 */
	while (phys_avail[pa_indx - 1] + PAGE_SIZE +
	    round_page(MSGBUF_SIZE) >= phys_avail[pa_indx]) {
		physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]);
		phys_avail[pa_indx--] = 0;
		phys_avail[pa_indx--] = 0;
	}

	Maxmem = atop(phys_avail[pa_indx]);

	/* Trim off space for the message buffer. */
	phys_avail[pa_indx] -= round_page(MSGBUF_SIZE);

	avail_end = phys_avail[pa_indx];
}

u_int64_t
hammer_time(u_int64_t modulep, u_int64_t physfree)
{
	caddr_t kmdp;
	int gsel_tss, off, x;
	struct pcpu *pc;
	u_int64_t msr;
	char *env;

#ifdef DEV_ISA
	/* Preemptively mask the atpics and leave them shut down */
	outb(IO_ICU1 + ICU_IMR_OFFSET, 0xff);
	outb(IO_ICU2 + ICU_IMR_OFFSET, 0xff);
#else
#error "have you forgotten the isa device?"
#endif

	proc0.p_uarea = (struct user *)(physfree + KERNBASE);
	bzero(proc0.p_uarea, UAREA_PAGES * PAGE_SIZE);
	physfree += UAREA_PAGES * PAGE_SIZE;
	thread0.td_kstack = physfree + KERNBASE;
	bzero((void *)thread0.td_kstack, KSTACK_PAGES * PAGE_SIZE);
	physfree += KSTACK_PAGES * PAGE_SIZE;
	thread0.td_pcb = (struct pcb *)
	    (thread0.td_kstack + KSTACK_PAGES * PAGE_SIZE) - 1;

	/*
	 * This may be done better later if it gets more high level
	 * components in it.  If so just link td->td_proc here.
	 */
	proc_linkup(&proc0, &ksegrp0, &thread0);

	preload_metadata = (caddr_t)(uintptr_t)(modulep + KERNBASE);
	preload_bootstrap_relocate(KERNBASE);
	kmdp = preload_search_by_type("elf kernel");
	if (kmdp == NULL)
		kmdp = preload_search_by_type("elf64 kernel");
	boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int);
	kern_envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *) + KERNBASE;
#ifdef DDB
	ksym_start = MD_FETCH(kmdp, MODINFOMD_SSYM, uintptr_t);
	ksym_end = MD_FETCH(kmdp, MODINFOMD_ESYM, uintptr_t);
#endif

	/* Init basic tunables, hz etc */
	init_param1();

	/*
	 * make gdt memory segments
	 */
	gdt_segs[GPROC0_SEL].ssd_base = (uintptr_t)&common_tss[0];

	for (x = 0; x < NGDT; x++) {
		if (x != GPROC0_SEL && x != (GPROC0_SEL + 1))
			ssdtosd(&gdt_segs[x], &gdt[x]);
	}
	ssdtosyssd(&gdt_segs[GPROC0_SEL],
	    (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);

	r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
	r_gdt.rd_base = (long) gdt;
	lgdt(&r_gdt);
	pc = &__pcpu[0];

	wrmsr(MSR_FSBASE, 0);		/* User value */
	wrmsr(MSR_GSBASE, (u_int64_t)pc);
	wrmsr(MSR_KGSBASE, 0);		/* User value while in the kernel */

	pcpu_init(pc, 0, sizeof(struct pcpu));
	PCPU_SET(prvspace, pc);
	PCPU_SET(curthread, &thread0);
	PCPU_SET(curpcb, thread0.td_pcb);
	PCPU_SET(tssp, &common_tss[0]);

	/*
	 * Initialize mutexes.
	 *
	 * icu_lock: in order to allow an interrupt to occur in a critical
	 *	     section, to set pcpu->ipending (etc...) properly, we
	 *	     must be able to get the icu lock, so it can't be
	 *	     under witness.
	 */
	mutex_init();
	mtx_init(&clock_lock, "clk", NULL, MTX_SPIN);
	mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS);

	/* exceptions */
	for (x = 0; x < NIDT; x++)
		setidt(x, &IDTVEC(rsvd), SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_DE, &IDTVEC(div), SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_DB, &IDTVEC(dbg), SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_NMI, &IDTVEC(nmi), SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_BP, &IDTVEC(bpt), SDT_SYSIGT, SEL_UPL, 0);
	setidt(IDT_OF, &IDTVEC(ofl), SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_BR, &IDTVEC(bnd), SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_UD, &IDTVEC(ill), SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_NM, &IDTVEC(dna), SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1);
	setidt(IDT_FPUGP, &IDTVEC(fpusegm), SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_TS, &IDTVEC(tss), SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_NP, &IDTVEC(missing), SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_SS, &IDTVEC(stk), SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_GP, &IDTVEC(prot), SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_PF, &IDTVEC(page), SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_MF, &IDTVEC(fpu), SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_AC, &IDTVEC(align), SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_MC, &IDTVEC(mchk), SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_XF, &IDTVEC(xmm), SDT_SYSIGT, SEL_KPL, 0);

	r_idt.rd_limit = sizeof(idt0) - 1;
	r_idt.rd_base = (long) idt;
	lidt(&r_idt);

	/*
	 * Initialize the console before we print anything out.
	 */
	cninit();

#ifdef DEV_ATPIC
	atpic_startup();
#endif

	kdb_init();

#ifdef KDB
	if (boothowto & RB_KDB)
		kdb_enter("Boot flags requested debugger");
#endif

	identify_cpu();		/* Final stage of CPU initialization */
	initializecpu();	/* Initialize CPU registers */

	/* make an initial tss so cpu can get interrupt stack on syscall! */
	common_tss[0].tss_rsp0 = thread0.td_kstack +
	    KSTACK_PAGES * PAGE_SIZE - sizeof(struct pcb);
	/* Ensure the stack is aligned to 16 bytes */
	common_tss[0].tss_rsp0 &= ~0xFul;
	PCPU_SET(rsp0, common_tss[0].tss_rsp0);

	/* doublefault stack space, runs on ist1 */
	common_tss[0].tss_ist1 = (long)&dblfault_stack[sizeof(dblfault_stack)];

	/* Set the IO permission bitmap (empty due to tss seg limit) */
	common_tss[0].tss_iobase = sizeof(struct amd64tss);
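	/*
	 * Pointing the I/O map base past the TSS limit means every
	 * userland in/out instruction faults with #GP instead of
	 * consulting a permission bitmap.
	 */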

	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
	ltr(gsel_tss);

	/* Set up the fast syscall stuff */
	msr = rdmsr(MSR_EFER) | EFER_SCE;
	wrmsr(MSR_EFER, msr);
	wrmsr(MSR_LSTAR, (u_int64_t)IDTVEC(fast_syscall));
	wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32));
	msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
	    ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48);
	wrmsr(MSR_STAR, msr);
	wrmsr(MSR_SF_MASK, PSL_NT|PSL_T|PSL_I|PSL_C|PSL_D);
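	/*
	 * STAR[47:32] is the kernel %cs/%ss pair SYSCALL installs;
	 * STAR[63:48] is the base SYSRET derives the user selectors
	 * from.  The SF_MASK bits are cleared in %rflags on entry, so
	 * listing PSL_I means the handler starts with interrupts
	 * disabled until it has switched to a kernel stack.
	 */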

	getmemsize(kmdp, physfree);
	init_param2(physmem);

	/* now running on new page tables, configured, and u/iom is accessible */

	/* Map the message buffer. */
	for (off = 0; off < round_page(MSGBUF_SIZE); off += PAGE_SIZE)
		pmap_kenter((vm_offset_t)msgbufp + off, avail_end + off);

	msgbufinit(msgbufp, MSGBUF_SIZE);
	fpuinit();

	/* transfer to user mode */

	_ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
	_udatasel = GSEL(GUDATA_SEL, SEL_UPL);
	_ucode32sel = GSEL(GUCODE32_SEL, SEL_UPL);

	/* setup proc 0's pcb */
	thread0.td_pcb->pcb_flags = 0;	/* XXXKSE */
	thread0.td_pcb->pcb_cr3 = KPML4phys;
	thread0.td_frame = &proc0_tf;

	env = getenv("kernelname");
	if (env != NULL)
		strlcpy(kernelname, env, sizeof(kernelname));

	/* Location of kernel stack for locore */
	return ((u_int64_t)thread0.td_pcb);
}

void
cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size)
{

	pcpu->pc_acpi_id = 0xffffffff;
}

/*
 * Construct a PCB from a trapframe.  This is called from kdb_trap() where
 * we want to start a backtrace from the function that caused us to enter
 * the debugger.  We have the context in the trapframe, but base the trace
 * on the PCB.  The PCB doesn't have to be perfect, as long as it contains
 * enough for a backtrace.
 */
void
makectx(struct trapframe *tf, struct pcb *pcb)
{

	pcb->pcb_r12 = tf->tf_r12;
	pcb->pcb_r13 = tf->tf_r13;
	pcb->pcb_r14 = tf->tf_r14;
	pcb->pcb_r15 = tf->tf_r15;
	pcb->pcb_rbp = tf->tf_rbp;
	pcb->pcb_rbx = tf->tf_rbx;
	pcb->pcb_rip = tf->tf_rip;
	pcb->pcb_rsp = (ISPL(tf->tf_cs)) ? tf->tf_rsp : (long)(tf + 1) - 8;
}

int
ptrace_set_pc(struct thread *td, unsigned long addr)
{
	td->td_frame->tf_rip = addr;
	return (0);
}

int
ptrace_single_step(struct thread *td)
{
	td->td_frame->tf_rflags |= PSL_T;
	return (0);
}

int
ptrace_clear_single_step(struct thread *td)
{
	td->td_frame->tf_rflags &= ~PSL_T;
	return (0);
}

int
fill_regs(struct thread *td, struct reg *regs)
{
	struct pcb *pcb;
	struct trapframe *tp;

	tp = td->td_frame;
	regs->r_r15 = tp->tf_r15;
	regs->r_r14 = tp->tf_r14;
	regs->r_r13 = tp->tf_r13;
	regs->r_r12 = tp->tf_r12;
	regs->r_r11 = tp->tf_r11;
	regs->r_r10 = tp->tf_r10;
	regs->r_r9 = tp->tf_r9;
	regs->r_r8 = tp->tf_r8;
	regs->r_rdi = tp->tf_rdi;
	regs->r_rsi = tp->tf_rsi;
	regs->r_rbp = tp->tf_rbp;
	regs->r_rbx = tp->tf_rbx;
	regs->r_rdx = tp->tf_rdx;
	regs->r_rcx = tp->tf_rcx;
	regs->r_rax = tp->tf_rax;
	regs->r_rip = tp->tf_rip;
	regs->r_cs = tp->tf_cs;
	regs->r_rflags = tp->tf_rflags;
	regs->r_rsp = tp->tf_rsp;
	regs->r_ss = tp->tf_ss;
	pcb = td->td_pcb;
	return (0);
}

int
set_regs(struct thread *td, struct reg *regs)
{
	struct pcb *pcb;
	struct trapframe *tp;
	register_t rflags;

	tp = td->td_frame;
	rflags = regs->r_rflags & 0xffffffff;
	if (!EFL_SECURE(rflags, tp->tf_rflags) || !CS_SECURE(regs->r_cs))
		return (EINVAL);
	tp->tf_r15 = regs->r_r15;
	tp->tf_r14 = regs->r_r14;
	tp->tf_r13 = regs->r_r13;
	tp->tf_r12 = regs->r_r12;
	tp->tf_r11 = regs->r_r11;
	tp->tf_r10 = regs->r_r10;
	tp->tf_r9 = regs->r_r9;
	tp->tf_r8 = regs->r_r8;
	tp->tf_rdi = regs->r_rdi;
	tp->tf_rsi = regs->r_rsi;
	tp->tf_rbp = regs->r_rbp;
	tp->tf_rbx = regs->r_rbx;
	tp->tf_rdx = regs->r_rdx;
	tp->tf_rcx = regs->r_rcx;
	tp->tf_rax = regs->r_rax;
	tp->tf_rip = regs->r_rip;
	tp->tf_cs = regs->r_cs;
	tp->tf_rflags = rflags;
	tp->tf_rsp = regs->r_rsp;
	tp->tf_ss = regs->r_ss;
	pcb = td->td_pcb;
	return (0);
}

/* XXX check all this stuff! */
/* externalize from sv_xmm */
static void
fill_fpregs_xmm(struct savefpu *sv_xmm, struct fpreg *fpregs)
{
	struct envxmm *penv_fpreg = (struct envxmm *)&fpregs->fpr_env;
	struct envxmm *penv_xmm = &sv_xmm->sv_env;
	int i;

	/* pcb -> fpregs */
	bzero(fpregs, sizeof(*fpregs));

	/* FPU control/status */
	penv_fpreg->en_cw = penv_xmm->en_cw;
	penv_fpreg->en_sw = penv_xmm->en_sw;
	penv_fpreg->en_tw = penv_xmm->en_tw;
	penv_fpreg->en_opcode = penv_xmm->en_opcode;
	penv_fpreg->en_rip = penv_xmm->en_rip;
	penv_fpreg->en_rdp = penv_xmm->en_rdp;
	penv_fpreg->en_mxcsr = penv_xmm->en_mxcsr;
	penv_fpreg->en_mxcsr_mask = penv_xmm->en_mxcsr_mask;

	/* FPU registers */
	for (i = 0; i < 8; ++i)
		bcopy(sv_xmm->sv_fp[i].fp_acc.fp_bytes, fpregs->fpr_acc[i], 10);

	/* SSE registers */
	for (i = 0; i < 16; ++i)
		bcopy(sv_xmm->sv_xmm[i].xmm_bytes, fpregs->fpr_xacc[i], 16);
}

/* internalize from fpregs into sv_xmm */
static void
set_fpregs_xmm(struct fpreg *fpregs, struct savefpu *sv_xmm)
{
	struct envxmm *penv_xmm = &sv_xmm->sv_env;
	struct envxmm *penv_fpreg = (struct envxmm *)&fpregs->fpr_env;
	int i;

	/* fpregs -> pcb */
	/* FPU control/status */
	penv_xmm->en_cw = penv_fpreg->en_cw;
	penv_xmm->en_sw = penv_fpreg->en_sw;
	penv_xmm->en_tw = penv_fpreg->en_tw;
	penv_xmm->en_opcode = penv_fpreg->en_opcode;
	penv_xmm->en_rip = penv_fpreg->en_rip;
	penv_xmm->en_rdp = penv_fpreg->en_rdp;
	penv_xmm->en_mxcsr = penv_fpreg->en_mxcsr;
	penv_xmm->en_mxcsr_mask = penv_fpreg->en_mxcsr_mask;

	/* FPU registers */
	for (i = 0; i < 8; ++i)
		bcopy(fpregs->fpr_acc[i], sv_xmm->sv_fp[i].fp_acc.fp_bytes, 10);

	/* SSE registers */
	for (i = 0; i < 16; ++i)
		bcopy(fpregs->fpr_xacc[i], sv_xmm->sv_xmm[i].xmm_bytes, 16);
}

/* externalize from td->pcb */
int
fill_fpregs(struct thread *td, struct fpreg *fpregs)
{

	fill_fpregs_xmm(&td->td_pcb->pcb_save, fpregs);
	return (0);
}

/* internalize to td->pcb */
int
set_fpregs(struct thread *td, struct fpreg *fpregs)
{

	set_fpregs_xmm(fpregs, &td->td_pcb->pcb_save);
	return (0);
}

/*
 * Get machine context.
 */
int
get_mcontext(struct thread *td, mcontext_t *mcp, int flags)
{
	struct trapframe *tp;

	tp = td->td_frame;
	PROC_LOCK(curthread->td_proc);
	mcp->mc_onstack = sigonstack(tp->tf_rsp);
	PROC_UNLOCK(curthread->td_proc);
	mcp->mc_r15 = tp->tf_r15;
	mcp->mc_r14 = tp->tf_r14;
	mcp->mc_r13 = tp->tf_r13;
	mcp->mc_r12 = tp->tf_r12;
	mcp->mc_r11 = tp->tf_r11;
	mcp->mc_r10 = tp->tf_r10;
	mcp->mc_r9 = tp->tf_r9;
	mcp->mc_r8 = tp->tf_r8;
	mcp->mc_rdi = tp->tf_rdi;
	mcp->mc_rsi = tp->tf_rsi;
	mcp->mc_rbp = tp->tf_rbp;
	mcp->mc_rbx = tp->tf_rbx;
	mcp->mc_rcx = tp->tf_rcx;
	if (flags & GET_MC_CLEAR_RET) {
		mcp->mc_rax = 0;
		mcp->mc_rdx = 0;
	} else {
		mcp->mc_rax = tp->tf_rax;
		mcp->mc_rdx = tp->tf_rdx;
	}
	mcp->mc_rip = tp->tf_rip;
	mcp->mc_cs = tp->tf_cs;
	mcp->mc_rflags = tp->tf_rflags;
	mcp->mc_rsp = tp->tf_rsp;
	mcp->mc_ss = tp->tf_ss;
	mcp->mc_len = sizeof(*mcp);
	get_fpcontext(td, mcp);
	return (0);
}

/*
 * Set machine context.
 *
 * However, we don't set any but the user modifiable flags, and we won't
 * touch the cs selector.
 */
int
set_mcontext(struct thread *td, const mcontext_t *mcp)
{
	struct trapframe *tp;
	long rflags;
	int ret;

	tp = td->td_frame;
	if (mcp->mc_len != sizeof(*mcp))
		return (EINVAL);
	rflags = (mcp->mc_rflags & PSL_USERCHANGE) |
	    (tp->tf_rflags & ~PSL_USERCHANGE);
	ret = set_fpcontext(td, mcp);
	if (ret != 0)
		return (ret);
	tp->tf_r15 = mcp->mc_r15;
	tp->tf_r14 = mcp->mc_r14;
	tp->tf_r13 = mcp->mc_r13;
	tp->tf_r12 = mcp->mc_r12;
	tp->tf_r11 = mcp->mc_r11;
	tp->tf_r10 = mcp->mc_r10;
	tp->tf_r9 = mcp->mc_r9;
	tp->tf_r8 = mcp->mc_r8;
	tp->tf_rdi = mcp->mc_rdi;
	tp->tf_rsi = mcp->mc_rsi;
	tp->tf_rbp = mcp->mc_rbp;
	tp->tf_rbx = mcp->mc_rbx;
	tp->tf_rdx = mcp->mc_rdx;
	tp->tf_rcx = mcp->mc_rcx;
	tp->tf_rax = mcp->mc_rax;
	tp->tf_rip = mcp->mc_rip;
	tp->tf_rflags = rflags;
	tp->tf_rsp = mcp->mc_rsp;
	tp->tf_ss = mcp->mc_ss;
	td->td_pcb->pcb_flags |= PCB_FULLCTX;
	return (0);
}

static void
get_fpcontext(struct thread *td, mcontext_t *mcp)
{

	mcp->mc_ownedfp = fpugetregs(td, (struct savefpu *)&mcp->mc_fpstate);
	mcp->mc_fpformat = fpuformat();
}

static int
set_fpcontext(struct thread *td, const mcontext_t *mcp)
{

	if (mcp->mc_fpformat == _MC_FPFMT_NODEV)
		return (0);
	else if (mcp->mc_fpformat != _MC_FPFMT_XMM)
		return (EINVAL);
	else if (mcp->mc_ownedfp == _MC_FPOWNED_NONE)
		/* We don't care what state is left in the FPU or PCB. */
		fpstate_drop(td);
	else if (mcp->mc_ownedfp == _MC_FPOWNED_FPU ||
	    mcp->mc_ownedfp == _MC_FPOWNED_PCB) {
		/*
		 * XXX we violate the dubious requirement that fpusetregs()
		 * be called with interrupts disabled.
		 * XXX obsolete on trap-16 systems?
		 */
		fpusetregs(td, (struct savefpu *)&mcp->mc_fpstate);
	} else
		return (EINVAL);
	return (0);
}

void
fpstate_drop(struct thread *td)
{
	register_t s;

	s = intr_disable();
	if (PCPU_GET(fpcurthread) == td)
		fpudrop();
	/*
	 * XXX force a full drop of the fpu.  The above only drops it if we
	 * owned it.
	 *
	 * XXX I don't much like fpugetregs()'s semantics of doing a full
	 * drop.  Dropping only to the pcb matches fnsave's behaviour.
	 * We only need to drop to !PCB_INITDONE in sendsig().  But
	 * sendsig() is the only caller of fpugetregs()... perhaps we just
	 * have too many layers.
	 */
	curthread->td_pcb->pcb_flags &= ~PCB_FPUINITDONE;
	intr_restore(s);
}

int
fill_dbregs(struct thread *td, struct dbreg *dbregs)
{
	struct pcb *pcb;

	if (td == NULL) {
		dbregs->dr[0] = rdr0();
		dbregs->dr[1] = rdr1();
		dbregs->dr[2] = rdr2();
		dbregs->dr[3] = rdr3();
		dbregs->dr[6] = rdr6();
		dbregs->dr[7] = rdr7();
	} else {
		pcb = td->td_pcb;
		dbregs->dr[0] = pcb->pcb_dr0;
		dbregs->dr[1] = pcb->pcb_dr1;
		dbregs->dr[2] = pcb->pcb_dr2;
		dbregs->dr[3] = pcb->pcb_dr3;
		dbregs->dr[6] = pcb->pcb_dr6;
		dbregs->dr[7] = pcb->pcb_dr7;
	}
	dbregs->dr[4] = 0;
	dbregs->dr[5] = 0;
	dbregs->dr[8] = 0;
	dbregs->dr[9] = 0;
	dbregs->dr[10] = 0;
	dbregs->dr[11] = 0;
	dbregs->dr[12] = 0;
	dbregs->dr[13] = 0;
	dbregs->dr[14] = 0;
	dbregs->dr[15] = 0;
	return (0);
}

int
set_dbregs(struct thread *td, struct dbreg *dbregs)
{
	struct pcb *pcb;
	int i;
	u_int64_t mask1, mask2;

	if (td == NULL) {
		load_dr0(dbregs->dr[0]);
		load_dr1(dbregs->dr[1]);
		load_dr2(dbregs->dr[2]);
		load_dr3(dbregs->dr[3]);
		load_dr6(dbregs->dr[6]);
		load_dr7(dbregs->dr[7]);
	} else {
		/*
		 * Don't let an illegal value for dr7 get set.  Specifically,
		 * check for undefined settings.  Setting these bit patterns
		 * results in undefined behaviour and can lead to an
		 * unexpected TRCTRAP or a general protection fault right
		 * here.
		 */
		for (i = 0, mask1 = 0x3<<16, mask2 = 0x2<<16; i < 8;
		     i++, mask1 <<= 2, mask2 <<= 2)
			if ((dbregs->dr[7] & mask1) == mask2)
				return (EINVAL);
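		/*
		 * Each 2-bit R/W and LEN field in dr7[31:16] is
		 * rejected above if it holds the 10b pattern; for the
		 * R/W fields that encoding selects I/O breakpoints,
		 * which are undefined unless CR4.DE is enabled.
		 */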

		pcb = td->td_pcb;

		/*
		 * Don't let a process set a breakpoint that is not within the
		 * process's address space.  If a process could do this, it
		 * could halt the system by setting a breakpoint in the kernel
		 * (if ddb was enabled).  Thus, we need to check to make sure
		 * that no breakpoints are being enabled for addresses outside
		 * process's address space, unless, perhaps, we were called by
		 * uid 0.
		 *
		 * XXX - what about when the watched area of the user's
		 * address space is written into from within the kernel
		 * ... wouldn't that still cause a breakpoint to be generated
		 * from within kernel mode?
		 */

		if (suser(td) != 0) {
			if (dbregs->dr[7] & 0x3) {
				/* dr0 is enabled */
				if (dbregs->dr[0] >= VM_MAXUSER_ADDRESS)
					return (EINVAL);
			}
			if (dbregs->dr[7] & 0x3<<2) {
				/* dr1 is enabled */
				if (dbregs->dr[1] >= VM_MAXUSER_ADDRESS)
					return (EINVAL);
			}
			if (dbregs->dr[7] & 0x3<<4) {
				/* dr2 is enabled */
				if (dbregs->dr[2] >= VM_MAXUSER_ADDRESS)
					return (EINVAL);
			}
			if (dbregs->dr[7] & 0x3<<6) {
				/* dr3 is enabled */
				if (dbregs->dr[3] >= VM_MAXUSER_ADDRESS)
					return (EINVAL);
			}
		}

		pcb->pcb_dr0 = dbregs->dr[0];
		pcb->pcb_dr1 = dbregs->dr[1];
		pcb->pcb_dr2 = dbregs->dr[2];
		pcb->pcb_dr3 = dbregs->dr[3];
		pcb->pcb_dr6 = dbregs->dr[6];
		pcb->pcb_dr7 = dbregs->dr[7];

		pcb->pcb_flags |= PCB_DBREGS;
	}

	return (0);
}

void
reset_dbregs(void)
{

	load_dr7(0);	/* Turn off the control bits first */
	load_dr0(0);
	load_dr1(0);
	load_dr2(0);
	load_dr3(0);
	load_dr6(0);
}

/*
 * Return > 0 if a hardware breakpoint has been hit, and the
 * breakpoint was in user space.  Return 0, otherwise.
 */
int
user_dbreg_trap(void)
{
	u_int64_t dr7, dr6;	/* debug registers dr6 and dr7 */
	u_int64_t bp;		/* breakpoint bits extracted from dr6 */
	int nbp;		/* number of breakpoints that triggered */
	caddr_t addr[4];	/* breakpoint addresses */
	int i;

	dr7 = rdr7();
	if ((dr7 & 0x000000ff) == 0) {
		/*
		 * none of the local or global breakpoint-enable bits
		 * in the dr7 register are set, thus the trap couldn't
		 * have been caused by the hardware debug registers
		 */
		return 0;
	}

	nbp = 0;
	dr6 = rdr6();
	bp = dr6 & 0x0000000f;

	if (!bp) {
		/*
		 * None of the breakpoint bits are set meaning this
		 * trap was not caused by any of the debug registers
		 */
		return 0;
	}

	/*
	 * at least one of the breakpoints was hit, check to see
	 * which ones and if any of them are user space addresses
	 */

	if (bp & 0x01) {
		addr[nbp++] = (caddr_t)rdr0();
	}
	if (bp & 0x02) {
		addr[nbp++] = (caddr_t)rdr1();
	}
	if (bp & 0x04) {
		addr[nbp++] = (caddr_t)rdr2();
	}
	if (bp & 0x08) {
		addr[nbp++] = (caddr_t)rdr3();
	}

	for (i = 0; i < nbp; i++) {
		if (addr[i] < (caddr_t)VM_MAXUSER_ADDRESS) {
			/*
			 * addr[i] is in user space
			 */
			return nbp;
		}
	}

	/*
	 * None of the breakpoints are in user space.
	 */
	return 0;
}

#ifdef KDB

/*
 * Provide inb() and outb() as functions.  They are normally only
 * available as macros calling inlined functions, thus cannot be
 * called from the debugger.
 *
 * The actual code is stolen from <machine/cpufunc.h>, and de-inlined.
 */

#undef inb
#undef outb

/* silence compiler warnings */
u_char inb(u_int);
void outb(u_int, u_char);

u_char
inb(u_int port)
{
	u_char data;
	/*
	 * We use %%dx and not %1 here because i/o is done at %dx and not at
	 * %edx, while gcc generates inferior code (movw instead of movl)
	 * if we tell it to load (u_short) port.
	 */
	__asm __volatile("inb %%dx,%0" : "=a" (data) : "d" (port));
	return (data);
}

void
outb(u_int port, u_char data)
{
	u_char al;
	/*
	 * Use an unnecessary assignment to help gcc's register allocator.
	 * This makes a large difference for gcc-1.40 and a tiny difference
	 * for gcc-2.6.0.  For gcc-1.40, al had to be ``asm("ax")'' for
	 * best results.  gcc-2.6.0 can't handle this.
	 */
	al = data;
	__asm __volatile("outb %0,%%dx" : : "a" (al), "d" (port));
}

#endif /* KDB */