1 /*-
2 * Copyright (c) 2003 Peter Wemm.
3 * Copyright (c) 1992 Terrence R. Lambert.
4 * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to Berkeley by
8 * William Jolitz.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. All advertising materials mentioning features or use of this software
19 * must display the following acknowledgement:
20 * This product includes software developed by the University of
21 * California, Berkeley and its contributors.
22 * 4. Neither the name of the University nor the names of its contributors
23 * may be used to endorse or promote products derived from this software
24 * without specific prior written permission.
25 *
26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36 * SUCH DAMAGE.
37 *
38 * from: @(#)machdep.c 7.4 (Berkeley) 6/3/91
39 */
40
41 #include <sys/cdefs.h>
42 __FBSDID("$FreeBSD: releng/11.1/sys/amd64/amd64/machdep.c 338607 2018-09-12 05:08:49Z gordon $");
43
44 #include "opt_atpic.h"
45 #include "opt_compat.h"
46 #include "opt_cpu.h"
47 #include "opt_ddb.h"
48 #include "opt_inet.h"
49 #include "opt_isa.h"
50 #include "opt_kstack_pages.h"
51 #include "opt_maxmem.h"
52 #include "opt_mp_watchdog.h"
53 #include "opt_perfmon.h"
54 #include "opt_platform.h"
55 #include "opt_sched.h"
56
57 #include <sys/param.h>
58 #include <sys/proc.h>
59 #include <sys/systm.h>
60 #include <sys/bio.h>
61 #include <sys/buf.h>
62 #include <sys/bus.h>
63 #include <sys/callout.h>
64 #include <sys/cons.h>
65 #include <sys/cpu.h>
66 #include <sys/efi.h>
67 #include <sys/eventhandler.h>
68 #include <sys/exec.h>
69 #include <sys/imgact.h>
70 #include <sys/kdb.h>
71 #include <sys/kernel.h>
72 #include <sys/ktr.h>
73 #include <sys/linker.h>
74 #include <sys/lock.h>
75 #include <sys/malloc.h>
76 #include <sys/memrange.h>
77 #include <sys/msgbuf.h>
78 #include <sys/mutex.h>
79 #include <sys/pcpu.h>
80 #include <sys/ptrace.h>
81 #include <sys/reboot.h>
82 #include <sys/rwlock.h>
83 #include <sys/sched.h>
84 #include <sys/signalvar.h>
85 #ifdef SMP
86 #include <sys/smp.h>
87 #endif
88 #include <sys/syscallsubr.h>
89 #include <sys/sysctl.h>
90 #include <sys/sysent.h>
91 #include <sys/sysproto.h>
92 #include <sys/ucontext.h>
93 #include <sys/vmmeter.h>
94
95 #include <vm/vm.h>
96 #include <vm/vm_extern.h>
97 #include <vm/vm_kern.h>
98 #include <vm/vm_page.h>
99 #include <vm/vm_map.h>
100 #include <vm/vm_object.h>
101 #include <vm/vm_pager.h>
102 #include <vm/vm_param.h>
103
104 #ifdef DDB
105 #ifndef KDB
106 #error KDB must be enabled in order for DDB to work!
107 #endif
108 #include <ddb/ddb.h>
109 #include <ddb/db_sym.h>
110 #endif
111
112 #include <net/netisr.h>
113
114 #include <machine/clock.h>
115 #include <machine/cpu.h>
116 #include <machine/cputypes.h>
117 #include <machine/frame.h>
118 #include <machine/intr_machdep.h>
119 #include <x86/mca.h>
120 #include <machine/md_var.h>
121 #include <machine/metadata.h>
122 #include <machine/mp_watchdog.h>
123 #include <machine/pc/bios.h>
124 #include <machine/pcb.h>
125 #include <machine/proc.h>
126 #include <machine/reg.h>
127 #include <machine/sigframe.h>
128 #include <machine/specialreg.h>
129 #ifdef PERFMON
130 #include <machine/perfmon.h>
131 #endif
132 #include <machine/tss.h>
133 #ifdef SMP
134 #include <machine/smp.h>
135 #endif
136 #ifdef FDT
137 #include <x86/fdt.h>
138 #endif
139
140 #ifdef DEV_ATPIC
141 #include <x86/isa/icu.h>
142 #else
143 #include <x86/apicvar.h>
144 #endif
145
146 #include <isa/isareg.h>
147 #include <isa/rtc.h>
148 #include <x86/init.h>
149
/* Sanity check for __curthread() */
CTASSERT(offsetof(struct pcpu, pc_curthread) == 0);

/*
 * The PTI trampoline stack needs enough space for a hardware trapframe and a
 * couple of scratch registers, as well as the trapframe left behind after an
 * iret fault.
 */
CTASSERT(PC_PTI_STACK_SZ * sizeof(register_t) >= 2 * sizeof(struct pti_frame) -
    offsetof(struct pti_frame, pti_rip));

/* Entry point from locore; performs machine-dependent early initialization. */
extern u_int64_t hammer_time(u_int64_t, u_int64_t);

/* True when the given %cs selector is a user-privilege selector. */
#define	CS_SECURE(cs)		(ISPL(cs) == SEL_UPL)
/* True when new rflags differ from old only in user-changeable bits. */
#define	EFL_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)

static void cpu_startup(void *);
static void get_fpcontext(struct thread *td, mcontext_t *mcp,
    char *xfpusave, size_t xfpusave_len);
static int set_fpcontext(struct thread *td, mcontext_t *mcp,
    char *xfpustate, size_t xfpustate_len);
SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL);

/* Preload data parse function */
static caddr_t native_parse_preload_data(u_int64_t);

/* Native function to fetch and parse the e820 map */
static void native_parse_memmap(caddr_t, vm_paddr_t *, int *);

/* Default init_ops implementation. */
struct init_ops init_ops = {
	.parse_preload_data = native_parse_preload_data,
	.early_clock_source_init = i8254_init,
	.early_delay = i8254_delay,
	.parse_memmap = native_parse_memmap,
#ifdef SMP
	.mp_bootaddress = mp_bootaddress,
	.start_all_aps = native_start_all_aps,
#endif
	.msi_init = msi_init,
};

struct msgbuf *msgbufp;

/*
 * Physical address of the EFI System Table. Stashed from the metadata hints
 * passed into the kernel and used by the EFI code to call runtime services.
 */
vm_paddr_t efi_systbl_phys;

/* Intel ICH registers */
#define ICH_PMBASE	0x400
/*
 * NOTE(review): expansion is unparenthesized; safe at the current call sites
 * (used only as a whole function argument) but should be parenthesized if
 * ever reused in an expression.
 */
#define ICH_SMI_EN	ICH_PMBASE + 0x30

/* User-mode segment selectors, initialized during GDT setup. */
int	_udatasel, _ucodesel, _ucode32sel, _ufssel, _ugssel;

/* Nonzero until the system has finished autoconfiguration. */
int cold = 1;

long Maxmem = 0;		/* highest usable physical page number */
long realmem = 0;		/* total physical pages, per SMBIOS/probe */

/*
 * The number of PHYSMAP entries must be one less than the number of
 * PHYSSEG entries because the PHYSMAP entry that spans the largest
 * physical address that is accessible by ISA DMA is split into two
 * PHYSSEG entries.
 */
#define PHYSMAP_SIZE	(2 * (VM_PHYSSEG_MAX - 1))

vm_paddr_t phys_avail[PHYSMAP_SIZE + 2];
vm_paddr_t dump_avail[PHYSMAP_SIZE + 2];

/* must be 2 less so 0 0 can signal end of chunks */
#define PHYS_AVAIL_ARRAY_END (nitems(phys_avail) - 2)
#define DUMP_AVAIL_ARRAY_END (nitems(dump_avail) - 2)

struct kva_md_info kmi;

static struct trapframe proc0_tf;
struct region_descriptor r_gdt, r_idt;

struct pcpu __pcpu[MAXCPU];

struct mtx icu_lock;

struct mem_range_softc mem_range_softc;

struct mtx dt_lock;	/* lock for GDT and LDT */

/* Hook set by vmm.ko so the resume path can restore VMX/SVM state. */
void (*vmm_resume_p)(void);
240
241 static void
242 cpu_startup(dummy)
243 void *dummy;
244 {
245 uintmax_t memsize;
246 char *sysenv;
247
248 /*
249 * On MacBooks, we need to disallow the legacy USB circuit to
250 * generate an SMI# because this can cause several problems,
251 * namely: incorrect CPU frequency detection and failure to
252 * start the APs.
253 * We do this by disabling a bit in the SMI_EN (SMI Control and
254 * Enable register) of the Intel ICH LPC Interface Bridge.
255 */
256 sysenv = kern_getenv("smbios.system.product");
257 if (sysenv != NULL) {
258 if (strncmp(sysenv, "MacBook1,1", 10) == 0 ||
259 strncmp(sysenv, "MacBook3,1", 10) == 0 ||
260 strncmp(sysenv, "MacBook4,1", 10) == 0 ||
261 strncmp(sysenv, "MacBookPro1,1", 13) == 0 ||
262 strncmp(sysenv, "MacBookPro1,2", 13) == 0 ||
263 strncmp(sysenv, "MacBookPro3,1", 13) == 0 ||
264 strncmp(sysenv, "MacBookPro4,1", 13) == 0 ||
265 strncmp(sysenv, "Macmini1,1", 10) == 0) {
266 if (bootverbose)
267 printf("Disabling LEGACY_USB_EN bit on "
268 "Intel ICH.\n");
269 outl(ICH_SMI_EN, inl(ICH_SMI_EN) & ~0x8);
270 }
271 freeenv(sysenv);
272 }
273
274 /*
275 * Good {morning,afternoon,evening,night}.
276 */
277 startrtclock();
278 printcpuinfo();
279 #ifdef PERFMON
280 perfmon_init();
281 #endif
282
283 /*
284 * Display physical memory if SMBIOS reports reasonable amount.
285 */
286 memsize = 0;
287 sysenv = kern_getenv("smbios.memory.enabled");
288 if (sysenv != NULL) {
289 memsize = (uintmax_t)strtoul(sysenv, (char **)NULL, 10) << 10;
290 freeenv(sysenv);
291 }
292 if (memsize < ptoa((uintmax_t)vm_cnt.v_free_count))
293 memsize = ptoa((uintmax_t)Maxmem);
294 printf("real memory = %ju (%ju MB)\n", memsize, memsize >> 20);
295 realmem = atop(memsize);
296
297 /*
298 * Display any holes after the first chunk of extended memory.
299 */
300 if (bootverbose) {
301 int indx;
302
303 printf("Physical memory chunk(s):\n");
304 for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) {
305 vm_paddr_t size;
306
307 size = phys_avail[indx + 1] - phys_avail[indx];
308 printf(
309 "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n",
310 (uintmax_t)phys_avail[indx],
311 (uintmax_t)phys_avail[indx + 1] - 1,
312 (uintmax_t)size, (uintmax_t)size / PAGE_SIZE);
313 }
314 }
315
316 vm_ksubmap_init(&kmi);
317
318 printf("avail memory = %ju (%ju MB)\n",
319 ptoa((uintmax_t)vm_cnt.v_free_count),
320 ptoa((uintmax_t)vm_cnt.v_free_count) / 1048576);
321
322 /*
323 * Set up buffers, so they can be used to read disk labels.
324 */
325 bufinit();
326 vm_pager_bufferinit();
327
328 cpu_setregs();
329 }
330
331 /*
332 * Send an interrupt to process.
333 *
334 * Stack is set up to allow sigcode stored
335 * at top to call routine, followed by call
336 * to sigreturn routine below. After sigreturn
337 * resets the signal mask, the stack, and the
338 * frame pointer, it returns to the user
339 * specified pc, psl.
340 */
void
sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
{
	struct sigframe sf, *sfp;
	struct pcb *pcb;
	struct proc *p;
	struct thread *td;
	struct sigacts *psp;
	char *sp;
	struct trapframe *regs;
	char *xfpusave;
	size_t xfpusave_len;
	int sig;
	int oonstack;

	td = curthread;
	pcb = td->td_pcb;
	p = td->td_proc;
	/* Caller must hold both the proc lock and the sigacts mutex. */
	PROC_LOCK_ASSERT(p, MA_OWNED);
	sig = ksi->ksi_signo;
	psp = p->p_sigacts;
	mtx_assert(&psp->ps_mtx, MA_OWNED);
	regs = td->td_frame;
	oonstack = sigonstack(regs->tf_rsp);

	/*
	 * If XSAVE provides extended FPU state beyond the legacy
	 * struct savefpu, stage the extra area on the kernel stack.
	 */
	if (cpu_max_ext_state_size > sizeof(struct savefpu) && use_xsave) {
		xfpusave_len = cpu_max_ext_state_size - sizeof(struct savefpu);
		xfpusave = __builtin_alloca(xfpusave_len);
	} else {
		xfpusave_len = 0;
		xfpusave = NULL;
	}

	/* Save user context. */
	bzero(&sf, sizeof(sf));
	sf.sf_uc.uc_sigmask = *mask;
	sf.sf_uc.uc_stack = td->td_sigstk;
	sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
	    ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE;
	sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0;
	/* Trapframe register block copied wholesale starting at mc_rdi. */
	bcopy(regs, &sf.sf_uc.uc_mcontext.mc_rdi, sizeof(*regs));
	sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext); /* magic */
	get_fpcontext(td, &sf.sf_uc.uc_mcontext, xfpusave, xfpusave_len);
	fpstate_drop(td);
	sf.sf_uc.uc_mcontext.mc_fsbase = pcb->pcb_fsbase;
	sf.sf_uc.uc_mcontext.mc_gsbase = pcb->pcb_gsbase;
	bzero(sf.sf_uc.uc_mcontext.mc_spare,
	    sizeof(sf.sf_uc.uc_mcontext.mc_spare));
	bzero(sf.sf_uc.__spare__, sizeof(sf.sf_uc.__spare__));

	/* Allocate space for the signal handler context. */
	if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack &&
	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
		/* Use the alternate signal stack, starting at its top. */
		sp = (char *)td->td_sigstk.ss_sp + td->td_sigstk.ss_size;
#if defined(COMPAT_43)
		td->td_sigstk.ss_flags |= SS_ONSTACK;
#endif
	} else
		/* Skip the 128-byte red zone required by the amd64 ABI. */
		sp = (char *)regs->tf_rsp - 128;
	if (xfpusave != NULL) {
		sp -= xfpusave_len;
		/* Extended FPU state must be 64-byte aligned. */
		sp = (char *)((unsigned long)sp & ~0x3Ful);
		sf.sf_uc.uc_mcontext.mc_xfpustate = (register_t)sp;
	}
	sp -= sizeof(struct sigframe);
	/* Align to 16 bytes. */
	sfp = (struct sigframe *)((unsigned long)sp & ~0xFul);

	/* Build the argument list for the signal handler. */
	regs->tf_rdi = sig;			/* arg 1 in %rdi */
	regs->tf_rdx = (register_t)&sfp->sf_uc;	/* arg 3 in %rdx */
	bzero(&sf.sf_si, sizeof(sf.sf_si));
	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
		/* Signal handler installed with SA_SIGINFO. */
		regs->tf_rsi = (register_t)&sfp->sf_si;	/* arg 2 in %rsi */
		sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher;

		/* Fill in POSIX parts */
		sf.sf_si = ksi->ksi_info;
		sf.sf_si.si_signo = sig; /* maybe a translated signal */
		regs->tf_rcx = (register_t)ksi->ksi_addr; /* arg 4 in %rcx */
	} else {
		/* Old FreeBSD-style arguments. */
		regs->tf_rsi = ksi->ksi_code;	/* arg 2 in %rsi */
		regs->tf_rcx = (register_t)ksi->ksi_addr; /* arg 4 in %rcx */
		sf.sf_ahu.sf_handler = catcher;
	}
	/* Drop both locks across the copyout; they are retaken below. */
	mtx_unlock(&psp->ps_mtx);
	PROC_UNLOCK(p);

	/*
	 * Copy the sigframe out to the user's stack.
	 */
	if (copyout(&sf, sfp, sizeof(*sfp)) != 0 ||
	    (xfpusave != NULL && copyout(xfpusave,
	    (void *)sf.sf_uc.uc_mcontext.mc_xfpustate, xfpusave_len)
	    != 0)) {
#ifdef DEBUG
		printf("process %ld has trashed its stack\n", (long)p->p_pid);
#endif
		/* Unwritable signal stack is fatal; sigexit() never returns. */
		PROC_LOCK(p);
		sigexit(td, SIGILL);
	}

	/* Redirect the thread to the signal trampoline in user mode. */
	regs->tf_rsp = (long)sfp;
	regs->tf_rip = p->p_sysent->sv_sigcode_base;
	regs->tf_rflags &= ~(PSL_T | PSL_D);
	regs->tf_cs = _ucodesel;
	regs->tf_ds = _udatasel;
	regs->tf_ss = _udatasel;
	regs->tf_es = _udatasel;
	regs->tf_fs = _ufssel;
	regs->tf_gs = _ugssel;
	regs->tf_flags = TF_HASSEGS;
	set_pcb_flags(pcb, PCB_FULL_IRET);
	/* Re-acquire the locks the caller expects to still hold. */
	PROC_LOCK(p);
	mtx_lock(&psp->ps_mtx);
}
459
460 /*
461 * System call to cleanup state after a signal
462 * has been taken. Reset signal mask and
463 * stack state from context left by sendsig (above).
464 * Return to previous pc and psl as specified by
465 * context left by sendsig. Check carefully to
466 * make sure that the user has not modified the
467 * state to gain improper privileges.
468 *
469 * MPSAFE
470 */
471 int
472 sys_sigreturn(td, uap)
473 struct thread *td;
474 struct sigreturn_args /* {
475 const struct __ucontext *sigcntxp;
476 } */ *uap;
477 {
478 ucontext_t uc;
479 struct pcb *pcb;
480 struct proc *p;
481 struct trapframe *regs;
482 ucontext_t *ucp;
483 char *xfpustate;
484 size_t xfpustate_len;
485 long rflags;
486 int cs, error, ret;
487 ksiginfo_t ksi;
488
489 pcb = td->td_pcb;
490 p = td->td_proc;
491
492 error = copyin(uap->sigcntxp, &uc, sizeof(uc));
493 if (error != 0) {
494 uprintf("pid %d (%s): sigreturn copyin failed\n",
495 p->p_pid, td->td_name);
496 return (error);
497 }
498 ucp = &uc;
499 if ((ucp->uc_mcontext.mc_flags & ~_MC_FLAG_MASK) != 0) {
500 uprintf("pid %d (%s): sigreturn mc_flags %x\n", p->p_pid,
501 td->td_name, ucp->uc_mcontext.mc_flags);
502 return (EINVAL);
503 }
504 regs = td->td_frame;
505 rflags = ucp->uc_mcontext.mc_rflags;
506 /*
507 * Don't allow users to change privileged or reserved flags.
508 */
509 if (!EFL_SECURE(rflags, regs->tf_rflags)) {
510 uprintf("pid %d (%s): sigreturn rflags = 0x%lx\n", p->p_pid,
511 td->td_name, rflags);
512 return (EINVAL);
513 }
514
515 /*
516 * Don't allow users to load a valid privileged %cs. Let the
517 * hardware check for invalid selectors, excess privilege in
518 * other selectors, invalid %eip's and invalid %esp's.
519 */
520 cs = ucp->uc_mcontext.mc_cs;
521 if (!CS_SECURE(cs)) {
522 uprintf("pid %d (%s): sigreturn cs = 0x%x\n", p->p_pid,
523 td->td_name, cs);
524 ksiginfo_init_trap(&ksi);
525 ksi.ksi_signo = SIGBUS;
526 ksi.ksi_code = BUS_OBJERR;
527 ksi.ksi_trapno = T_PROTFLT;
528 ksi.ksi_addr = (void *)regs->tf_rip;
529 trapsignal(td, &ksi);
530 return (EINVAL);
531 }
532
533 if ((uc.uc_mcontext.mc_flags & _MC_HASFPXSTATE) != 0) {
534 xfpustate_len = uc.uc_mcontext.mc_xfpustate_len;
535 if (xfpustate_len > cpu_max_ext_state_size -
536 sizeof(struct savefpu)) {
537 uprintf("pid %d (%s): sigreturn xfpusave_len = 0x%zx\n",
538 p->p_pid, td->td_name, xfpustate_len);
539 return (EINVAL);
540 }
541 xfpustate = __builtin_alloca(xfpustate_len);
542 error = copyin((const void *)uc.uc_mcontext.mc_xfpustate,
543 xfpustate, xfpustate_len);
544 if (error != 0) {
545 uprintf(
546 "pid %d (%s): sigreturn copying xfpustate failed\n",
547 p->p_pid, td->td_name);
548 return (error);
549 }
550 } else {
551 xfpustate = NULL;
552 xfpustate_len = 0;
553 }
554 ret = set_fpcontext(td, &ucp->uc_mcontext, xfpustate, xfpustate_len);
555 if (ret != 0) {
556 uprintf("pid %d (%s): sigreturn set_fpcontext err %d\n",
557 p->p_pid, td->td_name, ret);
558 return (ret);
559 }
560 bcopy(&ucp->uc_mcontext.mc_rdi, regs, sizeof(*regs));
561 pcb->pcb_fsbase = ucp->uc_mcontext.mc_fsbase;
562 pcb->pcb_gsbase = ucp->uc_mcontext.mc_gsbase;
563
564 #if defined(COMPAT_43)
565 if (ucp->uc_mcontext.mc_onstack & 1)
566 td->td_sigstk.ss_flags |= SS_ONSTACK;
567 else
568 td->td_sigstk.ss_flags &= ~SS_ONSTACK;
569 #endif
570
571 kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0);
572 set_pcb_flags(pcb, PCB_FULL_IRET);
573 return (EJUSTRETURN);
574 }
575
#ifdef COMPAT_FREEBSD4
/*
 * FreeBSD 4.x compatibility entry point: forwards directly to the native
 * sys_sigreturn() with the argument struct recast.
 *
 * Parenthesized the return expression for consistency with the
 * return-value style used throughout the rest of this file.
 */
int
freebsd4_sigreturn(struct thread *td, struct freebsd4_sigreturn_args *uap)
{

	return (sys_sigreturn(td, (struct sigreturn_args *)uap));
}
#endif
584
585 /*
586 * Reset registers to default values on exec.
587 */
void
exec_setregs(struct thread *td, struct image_params *imgp, u_long stack)
{
	struct trapframe *regs = td->td_frame;
	struct pcb *pcb = td->td_pcb;

	/*
	 * Release any custom LDT.  Note the asymmetric unlock:
	 * user_ldt_free() appears to consume dt_lock, so we only drop it
	 * ourselves on the no-LDT path.  NOTE(review): confirm against
	 * user_ldt_free()'s locking contract.
	 */
	mtx_lock(&dt_lock);
	if (td->td_proc->p_md.md_ldt != NULL)
		user_ldt_free(td);
	else
		mtx_unlock(&dt_lock);

	/* Reset per-thread segment bases and FPU control word defaults. */
	pcb->pcb_fsbase = 0;
	pcb->pcb_gsbase = 0;
	clear_pcb_flags(pcb, PCB_32BIT);
	pcb->pcb_initial_fpucw = __INITIAL_FPUCW__;
	set_pcb_flags(pcb, PCB_FULL_IRET);

	/* Build a fresh user trapframe for the new image. */
	bzero((char *)regs, sizeof(struct trapframe));
	regs->tf_rip = imgp->entry_addr;
	/* 16-byte align the stack, preserving the 8-byte return slot. */
	regs->tf_rsp = ((stack - 8) & ~0xFul) + 8;
	regs->tf_rdi = stack;		/* argv */
	regs->tf_rflags = PSL_USER | (regs->tf_rflags & PSL_T);
	regs->tf_ss = _udatasel;
	regs->tf_cs = _ucodesel;
	regs->tf_ds = _udatasel;
	regs->tf_es = _udatasel;
	regs->tf_fs = _ufssel;
	regs->tf_gs = _ugssel;
	regs->tf_flags = TF_HASSEGS;
	td->td_retval[1] = 0;

	/*
	 * Reset the hardware debug registers if they were in use.
	 * They won't have any meaning for the newly exec'd process.
	 */
	if (pcb->pcb_flags & PCB_DBREGS) {
		pcb->pcb_dr0 = 0;
		pcb->pcb_dr1 = 0;
		pcb->pcb_dr2 = 0;
		pcb->pcb_dr3 = 0;
		pcb->pcb_dr6 = 0;
		pcb->pcb_dr7 = 0;
		if (pcb == curpcb) {
			/*
			 * Clear the debug registers on the running
			 * CPU, otherwise they will end up affecting
			 * the next process we switch to.
			 */
			reset_dbregs();
		}
		clear_pcb_flags(pcb, PCB_DBREGS);
	}

	/*
	 * Drop the FP state if we hold it, so that the process gets a
	 * clean FP state if it uses the FPU again.
	 */
	fpstate_drop(td);
}
648
649 void
650 cpu_setregs(void)
651 {
652 register_t cr0;
653
654 cr0 = rcr0();
655 /*
656 * CR0_MP, CR0_NE and CR0_TS are also set by npx_probe() for the
657 * BSP. See the comments there about why we set them.
658 */
659 cr0 |= CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM;
660 load_cr0(cr0);
661 }
662
/*
 * Initialize amd64 and configure to run kernel
 */

/*
 * Initialize segments & interrupt table
 */

struct user_segment_descriptor gdt[NGDT * MAXCPU];/* global descriptor tables */
static struct gate_descriptor idt0[NIDT];
struct gate_descriptor *idt = &idt0[0];	/* interrupt descriptor table */

/* Dedicated stacks for faults that must not reuse the thread stack. */
static char dblfault_stack[PAGE_SIZE] __aligned(16);
static char mce0_stack[PAGE_SIZE] __aligned(16);
static char nmi0_stack[PAGE_SIZE] __aligned(16);
static char dbg0_stack[PAGE_SIZE] __aligned(16);
/* The NMI stack layout code assumes a fixed 16-byte nmi_pcpu record. */
CTASSERT(sizeof(struct nmi_pcpu) == 16);

struct amd64tss common_tss[MAXCPU];
682
/*
 * Software prototypes -- in more palatable form.
 *
 * Keep GUFS32, GUGS32, GUCODE32 and GUDATA at the same
 * slots as corresponding segments for i386 kernel.
 *
 * NOTE(review): the %fs/%gs labels in the two 32-bit base descriptors
 * below look swapped relative to the GUFS32/GUGS32 names -- verify
 * against <machine/segments.h> before relying on the comments.
 */
struct soft_segment_descriptor gdt_segs[] = {
/* GNULL_SEL	0 Null Descriptor */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* GNULL2_SEL	1 Null Descriptor */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* GUFS32_SEL	2 32 bit %gs Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GUGS32_SEL	3 32 bit %fs Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GCODE_SEL	4 Code Descriptor for kernel */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMERA,
	.ssd_dpl = SEL_KPL,
	.ssd_p = 1,
	.ssd_long = 1,
	.ssd_def32 = 0,
	.ssd_gran = 1		},
/* GDATA_SEL	5 Data Descriptor for kernel */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_KPL,
	.ssd_p = 1,
	.ssd_long = 1,
	.ssd_def32 = 0,
	.ssd_gran = 1		},
/* GUCODE32_SEL	6 32 bit Code Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMERA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GUDATA_SEL	7 32/64 bit Data Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GUCODE_SEL	8 64 bit Code Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMERA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 1,
	.ssd_def32 = 0,
	.ssd_gran = 1		},
/* GPROC0_SEL	9 Proc 0 Tss Descriptor */
{	.ssd_base = 0x0,
	.ssd_limit = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE - 1,
	.ssd_type = SDT_SYSTSS,
	.ssd_dpl = SEL_KPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* Actually, the TSS is a system descriptor which is double size */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* GUSERLDT_SEL	11 LDT Descriptor */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* GUSERLDT_SEL	12 LDT Descriptor, double size */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
};
808
809 void
810 setidt(int idx, inthand_t *func, int typ, int dpl, int ist)
811 {
812 struct gate_descriptor *ip;
813
814 ip = idt + idx;
815 ip->gd_looffset = (uintptr_t)func;
816 ip->gd_selector = GSEL(GCODE_SEL, SEL_KPL);
817 ip->gd_ist = ist;
818 ip->gd_xx = 0;
819 ip->gd_type = typ;
820 ip->gd_dpl = dpl;
821 ip->gd_p = 1;
822 ip->gd_hioffset = ((uintptr_t)func)>>16 ;
823 }
824
/*
 * Low-level exception and syscall entry points, defined in assembly
 * (exception.S).  The *_pti variants are the Page Table Isolation
 * trampoline versions.
 */
extern inthand_t
	IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
	IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
	IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
	IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align),
	IDTVEC(xmm), IDTVEC(dblfault),
	IDTVEC(div_pti), IDTVEC(bpt_pti),
	IDTVEC(ofl_pti), IDTVEC(bnd_pti), IDTVEC(ill_pti), IDTVEC(dna_pti),
	IDTVEC(fpusegm_pti), IDTVEC(tss_pti), IDTVEC(missing_pti),
	IDTVEC(stk_pti), IDTVEC(prot_pti), IDTVEC(page_pti),
	IDTVEC(rsvd_pti), IDTVEC(fpu_pti), IDTVEC(align_pti),
	IDTVEC(xmm_pti),
#ifdef KDTRACE_HOOKS
	IDTVEC(dtrace_ret), IDTVEC(dtrace_ret_pti),
#endif
#ifdef XENHVM
	IDTVEC(xen_intr_upcall), IDTVEC(xen_intr_upcall_pti),
#endif
	IDTVEC(fast_syscall), IDTVEC(fast_syscall32),
	IDTVEC(fast_syscall_pti);
845
846 #ifdef DDB
847 /*
848 * Display the index and function name of any IDT entries that don't use
849 * the default 'rsvd' entry point.
850 */
851 DB_SHOW_COMMAND(idt, db_show_idt)
852 {
853 struct gate_descriptor *ip;
854 int idx;
855 uintptr_t func;
856
857 ip = idt;
858 for (idx = 0; idx < NIDT && !db_pager_quit; idx++) {
859 func = ((long)ip->gd_hioffset << 16 | ip->gd_looffset);
860 if (func != (uintptr_t)&IDTVEC(rsvd)) {
861 db_printf("%3d\t", idx);
862 db_printsym(func, DB_STGY_PROC);
863 db_printf("\n");
864 }
865 ip++;
866 }
867 }
868
/* Show privileged registers. */
DB_SHOW_COMMAND(sysregs, db_show_sysregs)
{
	/* Pseudo-descriptor layout used by sidt/sgdt (limit:base). */
	struct {
		uint16_t limit;
		uint64_t base;
	} __packed idtr, gdtr;
	uint16_t ldt, tr;

	__asm __volatile("sidt %0" : "=m" (idtr));
	db_printf("idtr\t0x%016lx/%04x\n",
	    (u_long)idtr.base, (u_int)idtr.limit);
	__asm __volatile("sgdt %0" : "=m" (gdtr));
	db_printf("gdtr\t0x%016lx/%04x\n",
	    (u_long)gdtr.base, (u_int)gdtr.limit);
	__asm __volatile("sldt %0" : "=r" (ldt));
	db_printf("ldtr\t0x%04x\n", ldt);
	__asm __volatile("str %0" : "=r" (tr));
	db_printf("tr\t0x%04x\n", tr);
	db_printf("cr0\t0x%016lx\n", rcr0());
	db_printf("cr2\t0x%016lx\n", rcr2());
	db_printf("cr3\t0x%016lx\n", rcr3());
	db_printf("cr4\t0x%016lx\n", rcr4());
	/* %xcr0 only exists when CR4.OSXSAVE is set; guard the read. */
	if (rcr4() & CR4_XSAVE)
		db_printf("xcr0\t0x%016lx\n", rxcr(0));
	db_printf("EFER\t0x%016lx\n", rdmsr(MSR_EFER));
	/* Feature-control MSR is only meaningful with VMX or SMX support. */
	if (cpu_feature2 & (CPUID2_VMX | CPUID2_SMX))
		db_printf("FEATURES_CTL\t%016lx\n",
		    rdmsr(MSR_IA32_FEATURE_CONTROL));
	db_printf("DEBUG_CTL\t0x%016lx\n", rdmsr(MSR_DEBUGCTLMSR));
	db_printf("PAT\t0x%016lx\n", rdmsr(MSR_PAT));
	db_printf("GSBASE\t0x%016lx\n", rdmsr(MSR_GSBASE));
}
902
/* Show the hardware debug registers (dr4/dr5 are aliases and skipped). */
DB_SHOW_COMMAND(dbregs, db_show_dbregs)
{

	db_printf("dr0\t0x%016lx\n", rdr0());
	db_printf("dr1\t0x%016lx\n", rdr1());
	db_printf("dr2\t0x%016lx\n", rdr2());
	db_printf("dr3\t0x%016lx\n", rdr3());
	db_printf("dr6\t0x%016lx\n", rdr6());
	db_printf("dr7\t0x%016lx\n", rdr7());
}
913 #endif
914
915 void
916 sdtossd(sd, ssd)
917 struct user_segment_descriptor *sd;
918 struct soft_segment_descriptor *ssd;
919 {
920
921 ssd->ssd_base = (sd->sd_hibase << 24) | sd->sd_lobase;
922 ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit;
923 ssd->ssd_type = sd->sd_type;
924 ssd->ssd_dpl = sd->sd_dpl;
925 ssd->ssd_p = sd->sd_p;
926 ssd->ssd_long = sd->sd_long;
927 ssd->ssd_def32 = sd->sd_def32;
928 ssd->ssd_gran = sd->sd_gran;
929 }
930
931 void
932 ssdtosd(ssd, sd)
933 struct soft_segment_descriptor *ssd;
934 struct user_segment_descriptor *sd;
935 {
936
937 sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
938 sd->sd_hibase = (ssd->ssd_base >> 24) & 0xff;
939 sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
940 sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
941 sd->sd_type = ssd->ssd_type;
942 sd->sd_dpl = ssd->ssd_dpl;
943 sd->sd_p = ssd->ssd_p;
944 sd->sd_long = ssd->ssd_long;
945 sd->sd_def32 = ssd->ssd_def32;
946 sd->sd_gran = ssd->ssd_gran;
947 }
948
949 void
950 ssdtosyssd(ssd, sd)
951 struct soft_segment_descriptor *ssd;
952 struct system_segment_descriptor *sd;
953 {
954
955 sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
956 sd->sd_hibase = (ssd->ssd_base >> 24) & 0xfffffffffful;
957 sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
958 sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
959 sd->sd_type = ssd->ssd_type;
960 sd->sd_dpl = ssd->ssd_dpl;
961 sd->sd_p = ssd->ssd_p;
962 sd->sd_gran = ssd->ssd_gran;
963 }
964
965 #if !defined(DEV_ATPIC) && defined(DEV_ISA)
966 #include <isa/isavar.h>
967 #include <isa/isareg.h>
968 /*
969 * Return a bitmap of the current interrupt requests. This is 8259-specific
970 * and is only suitable for use at probe time.
971 * This is only here to pacify sio. It is NOT FATAL if this doesn't work.
972 * It shouldn't be here. There should probably be an APIC centric
973 * implementation in the apic driver code, if at all.
974 */
975 intrmask_t
976 isa_irq_pending(void)
977 {
978 u_char irr1;
979 u_char irr2;
980
981 irr1 = inb(IO_ICU1);
982 irr2 = inb(IO_ICU2);
983 return ((irr2 << 8) | irr1);
984 }
985 #endif
986
987 u_int basemem;
988
/*
 * Insert the physical range [base, base + length) into the sorted
 * physmap table (pairs of start/end addresses), coalescing with an
 * adjacent existing entry when possible.  Returns 1 on success --
 * including benign rejections such as zero length or an overlap with an
 * existing region -- and 0 only when the table is full and the caller
 * should stop adding entries.
 */
static int
add_physmap_entry(uint64_t base, uint64_t length, vm_paddr_t *physmap,
    int *physmap_idxp)
{
	int i, insert_idx, physmap_idx;

	physmap_idx = *physmap_idxp;

	if (length == 0)
		return (1);

	/*
	 * Find insertion point while checking for overlap.  Start off by
	 * assuming the new entry will be added to the end.
	 *
	 * NB: physmap_idx points to the next free slot.
	 */
	insert_idx = physmap_idx;
	for (i = 0; i <= physmap_idx; i += 2) {
		if (base < physmap[i + 1]) {
			/* New range ends before this entry starts: insert here. */
			if (base + length <= physmap[i]) {
				insert_idx = i;
				break;
			}
			if (boothowto & RB_VERBOSE)
				printf(
		    "Overlapping memory regions, ignoring second region\n");
			return (1);
		}
	}

	/* See if we can prepend to the next entry. */
	if (insert_idx <= physmap_idx && base + length == physmap[insert_idx]) {
		physmap[insert_idx] = base;
		return (1);
	}

	/* See if we can append to the previous entry. */
	if (insert_idx > 0 && base == physmap[insert_idx - 1]) {
		physmap[insert_idx - 1] += length;
		return (1);
	}

	/* No coalescing possible: claim a new pair of slots. */
	physmap_idx += 2;
	*physmap_idxp = physmap_idx;
	if (physmap_idx == PHYSMAP_SIZE) {
		printf(
		"Too many segments in the physical address map, giving up\n");
		return (0);
	}

	/*
	 * Move the last 'N' entries down to make room for the new
	 * entry if needed.
	 */
	for (i = (physmap_idx - 2); i > insert_idx; i -= 2) {
		physmap[i] = physmap[i - 2];
		physmap[i + 1] = physmap[i - 1];
	}

	/* Insert the new entry. */
	physmap[insert_idx] = base;
	physmap[insert_idx + 1] = base + length;
	return (1);
}
1054
1055 void
1056 bios_add_smap_entries(struct bios_smap *smapbase, u_int32_t smapsize,
1057 vm_paddr_t *physmap, int *physmap_idx)
1058 {
1059 struct bios_smap *smap, *smapend;
1060
1061 smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize);
1062
1063 for (smap = smapbase; smap < smapend; smap++) {
1064 if (boothowto & RB_VERBOSE)
1065 printf("SMAP type=%02x base=%016lx len=%016lx\n",
1066 smap->type, smap->base, smap->length);
1067
1068 if (smap->type != SMAP_TYPE_MEMORY)
1069 continue;
1070
1071 if (!add_physmap_entry(smap->base, smap->length, physmap,
1072 physmap_idx))
1073 break;
1074 }
1075 }
1076
/*
 * Translate the UEFI memory map supplied by the loader into physmap
 * base/bound pairs.  Only descriptor types that denote usable RAM are
 * forwarded to add_physmap_entry(); everything else is skipped.  With
 * RB_VERBOSE the whole map is printed, one descriptor per line.
 */
static void
add_efi_map_entries(struct efi_map_header *efihdr, vm_paddr_t *physmap,
    int *physmap_idx)
{
	struct efi_md *map, *p;
	const char *type;
	size_t efisz;
	int ndesc, i;

	/* Printable names indexed by EFI memory descriptor type. */
	static const char *types[] = {
		"Reserved",
		"LoaderCode",
		"LoaderData",
		"BootServicesCode",
		"BootServicesData",
		"RuntimeServicesCode",
		"RuntimeServicesData",
		"ConventionalMemory",
		"UnusableMemory",
		"ACPIReclaimMemory",
		"ACPIMemoryNVS",
		"MemoryMappedIO",
		"MemoryMappedIOPortSpace",
		"PalCode",
		"PersistentMemory"
	};

	/*
	 * Memory map data provided by UEFI via the GetMemoryMap
	 * Boot Services API.
	 */
	/* Descriptors follow the header, rounded up to a 16-byte boundary. */
	efisz = (sizeof(struct efi_map_header) + 0xf) & ~0xf;
	map = (struct efi_md *)((uint8_t *)efihdr + efisz);

	/* Guard the division below against corrupt header data. */
	if (efihdr->descriptor_size == 0)
		return;
	ndesc = efihdr->memory_size / efihdr->descriptor_size;

	if (boothowto & RB_VERBOSE)
		printf("%23s %12s %12s %8s %4s\n",
		    "Type", "Physical", "Virtual", "#Pages", "Attr");

	/*
	 * Walk the map with efi_next_descriptor(): the firmware's
	 * descriptor stride may exceed sizeof(struct efi_md).
	 */
	for (i = 0, p = map; i < ndesc; i++,
	    p = efi_next_descriptor(p, efihdr->descriptor_size)) {
		if (boothowto & RB_VERBOSE) {
			if (p->md_type < nitems(types))
				type = types[p->md_type];
			else
				type = "<INVALID>";
			printf("%23s %012lx %12p %08lx ", type, p->md_phys,
			    p->md_virt, p->md_pages);
			/* Decode the attribute bits into short tags. */
			if (p->md_attr & EFI_MD_ATTR_UC)
				printf("UC ");
			if (p->md_attr & EFI_MD_ATTR_WC)
				printf("WC ");
			if (p->md_attr & EFI_MD_ATTR_WT)
				printf("WT ");
			if (p->md_attr & EFI_MD_ATTR_WB)
				printf("WB ");
			if (p->md_attr & EFI_MD_ATTR_UCE)
				printf("UCE ");
			if (p->md_attr & EFI_MD_ATTR_WP)
				printf("WP ");
			if (p->md_attr & EFI_MD_ATTR_RP)
				printf("RP ");
			if (p->md_attr & EFI_MD_ATTR_XP)
				printf("XP ");
			if (p->md_attr & EFI_MD_ATTR_NV)
				printf("NV ");
			if (p->md_attr & EFI_MD_ATTR_MORE_RELIABLE)
				printf("MORE_RELIABLE ");
			if (p->md_attr & EFI_MD_ATTR_RO)
				printf("RO ");
			if (p->md_attr & EFI_MD_ATTR_RT)
				printf("RUNTIME");
			printf("\n");
		}

		switch (p->md_type) {
		case EFI_MD_TYPE_CODE:
		case EFI_MD_TYPE_DATA:
		case EFI_MD_TYPE_BS_CODE:
		case EFI_MD_TYPE_BS_DATA:
		case EFI_MD_TYPE_FREE:
			/*
			 * We're allowed to use any entry with these types.
			 */
			break;
		default:
			continue;
		}

		/* add_physmap_entry() returns 0 when physmap is full. */
		if (!add_physmap_entry(p->md_phys, (p->md_pages * PAGE_SIZE),
		    physmap, physmap_idx))
			break;
	}
}
1174
/*
 * Firmware boot method, "BIOS" or "UEFI", filled in by
 * native_parse_memmap() and exported read-only as machdep.bootmethod.
 */
static char bootmethod[16] = "";
SYSCTL_STRING(_machdep, OID_AUTO, bootmethod, CTLFLAG_RD, bootmethod, 0,
    "System firmware boot method");
1178
1179 static void
1180 native_parse_memmap(caddr_t kmdp, vm_paddr_t *physmap, int *physmap_idx)
1181 {
1182 struct bios_smap *smap;
1183 struct efi_map_header *efihdr;
1184 u_int32_t size;
1185
1186 /*
1187 * Memory map from INT 15:E820.
1188 *
1189 * subr_module.c says:
1190 * "Consumer may safely assume that size value precedes data."
1191 * ie: an int32_t immediately precedes smap.
1192 */
1193
1194 efihdr = (struct efi_map_header *)preload_search_info(kmdp,
1195 MODINFO_METADATA | MODINFOMD_EFI_MAP);
1196 smap = (struct bios_smap *)preload_search_info(kmdp,
1197 MODINFO_METADATA | MODINFOMD_SMAP);
1198 if (efihdr == NULL && smap == NULL)
1199 panic("No BIOS smap or EFI map info from loader!");
1200
1201 if (efihdr != NULL) {
1202 add_efi_map_entries(efihdr, physmap, physmap_idx);
1203 strlcpy(bootmethod, "UEFI", sizeof(bootmethod));
1204 } else {
1205 size = *((u_int32_t *)smap - 1);
1206 bios_add_smap_entries(smap, size, physmap, physmap_idx);
1207 strlcpy(bootmethod, "BIOS", sizeof(bootmethod));
1208 }
1209 }
1210
#define PAGES_PER_GB (1024 * 1024 * 1024 / PAGE_SIZE)

/*
 * Populate the (physmap) array with base/bound pairs describing the
 * available physical memory in the system, then test this memory and
 * build the phys_avail array describing the actually-available memory.
 *
 * Total memory size may be set by the kernel environment variable
 * hw.physmem or the compile-time define MAXMEM.
 *
 * XXX first should be vm_paddr_t.
 */
static void
getmemsize(caddr_t kmdp, u_int64_t first)
{
	int i, physmap_idx, pa_indx, da_indx;
	vm_paddr_t pa, physmap[PHYSMAP_SIZE];
	u_long physmem_start, physmem_tunable, memtest;
	pt_entry_t *pte;
	quad_t dcons_addr, dcons_size;
	int page_counter;

	bzero(physmap, sizeof(physmap));
	physmap_idx = 0;

	init_ops.parse_memmap(kmdp, physmap, &physmap_idx);
	/* Step back so physmap_idx indexes the base of the last pair. */
	physmap_idx -= 2;

	/*
	 * Find the 'base memory' segment for SMP
	 */
	basemem = 0;
	for (i = 0; i <= physmap_idx; i += 2) {
		if (physmap[i] <= 0xA0000) {
			/* basemem is kept in kilobytes. */
			basemem = physmap[i + 1] / 1024;
			break;
		}
	}
	if (basemem == 0 || basemem > 640) {
		if (bootverbose)
			printf(
		"Memory map doesn't contain a basemem segment, faking it");
		basemem = 640;
	}

	/*
	 * Make hole for "AP -> long mode" bootstrap code.  The
	 * mp_bootaddress vector is only available when the kernel
	 * is configured to support APs and APs for the system start
	 * in 32bit mode (e.g. SMP bare metal).
	 */
	if (init_ops.mp_bootaddress) {
		/* The trampoline must live in 32-bit reachable memory. */
		if (physmap[1] >= 0x100000000)
			panic(
	"Basemem segment is not suitable for AP bootstrap code!");
		physmap[1] = init_ops.mp_bootaddress(physmap[1] / 1024);
	}

	/*
	 * Maxmem isn't the "maximum memory", it's one larger than the
	 * highest page of the physical address space.  It should be
	 * called something like "Maxphyspage".  We may adjust this
	 * based on ``hw.physmem'' and the results of the memory test.
	 */
	Maxmem = atop(physmap[physmap_idx + 1]);

#ifdef MAXMEM
	/* MAXMEM appears to be in kilobytes (/4 -> 4K pages); confirm. */
	Maxmem = MAXMEM / 4;
#endif

	if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable))
		Maxmem = atop(physmem_tunable);

	/*
	 * The boot memory test is disabled by default, as it takes a
	 * significant amount of time on large-memory systems, and is
	 * unfriendly to virtual machines as it unnecessarily touches all
	 * pages.
	 *
	 * A general name is used as the code may be extended to support
	 * additional tests beyond the current "page present" test.
	 */
	memtest = 0;
	TUNABLE_ULONG_FETCH("hw.memtest.tests", &memtest);

	/*
	 * Don't allow MAXMEM or hw.physmem to extend the amount of memory
	 * in the system.
	 */
	if (Maxmem > atop(physmap[physmap_idx + 1]))
		Maxmem = atop(physmap[physmap_idx + 1]);

	if (atop(physmap[physmap_idx + 1]) != Maxmem &&
	    (boothowto & RB_VERBOSE))
		printf("Physical memory use set to %ldK\n", Maxmem * 4);

	/* call pmap initialization to make new kernel address space */
	pmap_bootstrap(&first);

	/*
	 * Size up each available chunk of physical memory.
	 *
	 * XXX Some BIOSes corrupt low 64KB between suspend and resume.
	 * By default, mask off the first 16 pages unless we appear to be
	 * running in a VM.
	 */
	physmem_start = (vm_guest > VM_GUEST_NO ? 1 : 16) << PAGE_SHIFT;
	TUNABLE_ULONG_FETCH("hw.physmem.start", &physmem_start);
	if (physmap[0] < physmem_start) {
		/* Never hand out page 0; clamp within the first segment. */
		if (physmem_start < PAGE_SIZE)
			physmap[0] = PAGE_SIZE;
		else if (physmem_start >= physmap[1])
			physmap[0] = round_page(physmap[1] - PAGE_SIZE);
		else
			physmap[0] = round_page(physmem_start);
	}
	pa_indx = 0;
	da_indx = 1;
	phys_avail[pa_indx++] = physmap[0];
	phys_avail[pa_indx] = physmap[0];
	dump_avail[da_indx] = physmap[0];
	pte = CMAP1;

	/*
	 * Get dcons buffer address; both addr and size must be set for
	 * the buffer to be excluded below.
	 */
	if (getenv_quad("dcons.addr", &dcons_addr) == 0 ||
	    getenv_quad("dcons.size", &dcons_size) == 0)
		dcons_addr = 0;

	/*
	 * physmap is in bytes, so when converting to page boundaries,
	 * round up the start address and round down the end address.
	 */
	page_counter = 0;
	if (memtest != 0)
		printf("Testing system memory");
	for (i = 0; i <= physmap_idx; i += 2) {
		vm_paddr_t end;

		end = ptoa((vm_paddr_t)Maxmem);
		if (physmap[i + 1] < end)
			end = trunc_page(physmap[i + 1]);
		for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) {
			int tmp, page_bad, full;
			int *ptr = (int *)CADDR1;

			full = FALSE;
			/*
			 * block out kernel memory as not available.
			 */
			if (pa >= (vm_paddr_t)kernphys && pa < first)
				goto do_dump_avail;

			/*
			 * block out dcons buffer
			 */
			if (dcons_addr > 0
			    && pa >= trunc_page(dcons_addr)
			    && pa < dcons_addr + dcons_size)
				goto do_dump_avail;

			page_bad = FALSE;
			if (memtest == 0)
				goto skip_memtest;

			/*
			 * Print a "." every GB to show we're making
			 * progress.
			 */
			page_counter++;
			if ((page_counter % PAGES_PER_GB) == 0)
				printf(".");

			/*
			 * map page into kernel: valid, read/write,non-cacheable
			 */
			*pte = pa | PG_V | PG_RW | PG_NC_PWT | PG_NC_PCD;
			invltlb();

			/* Save the word so it can be restored afterwards. */
			tmp = *(int *)ptr;
			/*
			 * Test for alternating 1's and 0's
			 */
			*(volatile int *)ptr = 0xaaaaaaaa;
			if (*(volatile int *)ptr != 0xaaaaaaaa)
				page_bad = TRUE;
			/*
			 * Test for alternating 0's and 1's
			 */
			*(volatile int *)ptr = 0x55555555;
			if (*(volatile int *)ptr != 0x55555555)
				page_bad = TRUE;
			/*
			 * Test for all 1's
			 */
			*(volatile int *)ptr = 0xffffffff;
			if (*(volatile int *)ptr != 0xffffffff)
				page_bad = TRUE;
			/*
			 * Test for all 0's
			 */
			*(volatile int *)ptr = 0x0;
			if (*(volatile int *)ptr != 0x0)
				page_bad = TRUE;
			/*
			 * Restore original value.
			 */
			*(int *)ptr = tmp;

skip_memtest:
			/*
			 * Adjust array of valid/good pages.
			 */
			if (page_bad == TRUE)
				continue;
			/*
			 * If this good page is a continuation of the
			 * previous set of good pages, then just increase
			 * the end pointer.  Otherwise start a new chunk.
			 * Note that "end" points one higher than end,
			 * making the range >= start and < end.
			 * If we're also doing a speculative memory
			 * test and we at or past the end, bump up Maxmem
			 * so that we keep going.  The first bad page
			 * will terminate the loop.
			 */
			if (phys_avail[pa_indx] == pa) {
				phys_avail[pa_indx] += PAGE_SIZE;
			} else {
				pa_indx++;
				if (pa_indx == PHYS_AVAIL_ARRAY_END) {
					printf(
		"Too many holes in the physical address space, giving up\n");
					pa_indx--;
					full = TRUE;
					goto do_dump_avail;
				}
				phys_avail[pa_indx++] = pa;	/* start */
				phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */
			}
			physmem++;
do_dump_avail:
			/* dump_avail tracks dumpable (not just usable) RAM. */
			if (dump_avail[da_indx] == pa) {
				dump_avail[da_indx] += PAGE_SIZE;
			} else {
				da_indx++;
				if (da_indx == DUMP_AVAIL_ARRAY_END) {
					da_indx--;
					goto do_next;
				}
				dump_avail[da_indx++] = pa;	/* start */
				dump_avail[da_indx] = pa + PAGE_SIZE; /* end */
			}
do_next:
			if (full)
				break;
		}
	}
	/* Tear down the temporary test mapping. */
	*pte = 0;
	invltlb();
	if (memtest != 0)
		printf("\n");

	/*
	 * XXX
	 * The last chunk must contain at least one page plus the message
	 * buffer to avoid complicating other code (message buffer address
	 * calculation, etc.).
	 */
	while (phys_avail[pa_indx - 1] + PAGE_SIZE +
	    round_page(msgbufsize) >= phys_avail[pa_indx]) {
		physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]);
		phys_avail[pa_indx--] = 0;
		phys_avail[pa_indx--] = 0;
	}

	Maxmem = atop(phys_avail[pa_indx]);

	/* Trim off space for the message buffer. */
	phys_avail[pa_indx] -= round_page(msgbufsize);

	/* Map the message buffer. */
	msgbufp = (struct msgbuf *)PHYS_TO_DMAP(phys_avail[pa_indx]);
}
1496
/*
 * Digest the module metadata handed over by the loader: relocate it
 * into the kernel map, extract the boot flags and static kernel
 * environment, hook up the debugger symbol table (DDB) and record the
 * EFI firmware handle.  Returns the kernel's metadata handle, which
 * later stages (e.g. getmemsize()) use to look up more records.
 */
static caddr_t
native_parse_preload_data(u_int64_t modulep)
{
	caddr_t kmdp;
	char *envp;
#ifdef DDB
	vm_offset_t ksym_start;
	vm_offset_t ksym_end;
#endif

	/* Loader addresses are rebased through KERNBASE into the KVA. */
	preload_metadata = (caddr_t)(uintptr_t)(modulep + KERNBASE);
	preload_bootstrap_relocate(KERNBASE);
	kmdp = preload_search_by_type("elf kernel");
	if (kmdp == NULL)
		kmdp = preload_search_by_type("elf64 kernel");
	boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int);
	envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *);
	if (envp != NULL)
		envp += KERNBASE;	/* rebase the env pointer as well */
	init_static_kenv(envp, 0);
#ifdef DDB
	ksym_start = MD_FETCH(kmdp, MODINFOMD_SSYM, uintptr_t);
	ksym_end = MD_FETCH(kmdp, MODINFOMD_ESYM, uintptr_t);
	db_fetch_ksymtab(ksym_start, ksym_end);
#endif
	efi_systbl_phys = MD_FETCH(kmdp, MODINFOMD_FW_HANDLE, vm_paddr_t);

	return (kmdp);
}
1526
/*
 * Initialize the kernel debugger framework and, when the boot flags
 * ask for it (RB_KDB), drop into the debugger right away.
 */
static void
amd64_kdb_init(void)
{
	kdb_init();
#ifdef KDB
	if (boothowto & RB_KDB)
		kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger");
#endif
}
1536
/*
 * Set up the fast syscall stuff: program the MSRs that control the
 * SYSCALL/SYSRET fast system call mechanism on this CPU.
 */
void
amd64_conf_fast_syscall(void)
{
	uint64_t msr;

	/* Turn on the SYSCALL extension in EFER. */
	msr = rdmsr(MSR_EFER) | EFER_SCE;
	wrmsr(MSR_EFER, msr);
	/*
	 * 64-bit SYSCALL entry point; the _pti variant is installed when
	 * page table isolation is enabled.
	 */
	wrmsr(MSR_LSTAR, pti ? (u_int64_t)IDTVEC(fast_syscall_pti) :
	    (u_int64_t)IDTVEC(fast_syscall));
	/* Entry point for SYSCALL issued from 32-bit compat mode. */
	wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32));
	/*
	 * STAR holds the kernel CS (bits 32-47) and the 32-bit user CS
	 * base (bits 48-63) used by SYSCALL/SYSRET selector loading.
	 */
	msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
	    ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48);
	wrmsr(MSR_STAR, msr);
	/* rflags bits masked off (cleared) on every SYSCALL entry. */
	wrmsr(MSR_SF_MASK, PSL_NT | PSL_T | PSL_I | PSL_C | PSL_D);
}
1553
/*
 * Machine-dependent early startup for the bootstrap processor.  Sets up
 * thread0's stack, the GDT/IDT/TSS and per-CPU data, early mutexes,
 * the clock and console, sizes physical memory, and initializes the
 * FPU and thread0's PCB.  physfree is the first free physical address;
 * it is advanced as early allocations are carved out of it.  Returns
 * the address of thread0's PCB, which locore uses as the initial
 * kernel stack.
 */
u_int64_t
hammer_time(u_int64_t modulep, u_int64_t physfree)
{
	caddr_t kmdp;
	int gsel_tss, x;
	struct pcpu *pc;
	struct nmi_pcpu *np;
	struct xstate_hdr *xhdr;
	u_int64_t rsp0;
	char *env;
	size_t kstack0_sz;
	int late_console;

	/*
	 * This may be done better later if it gets more high level
	 * components in it.  If so just link td->td_proc here.
	 */
	proc_linkup0(&proc0, &thread0);

	kmdp = init_ops.parse_preload_data(modulep);

	identify_cpu1();

	/* Init basic tunables, hz etc */
	init_param1();

	/* Carve thread0's kernel stack out of the free physical space. */
	thread0.td_kstack = physfree + KERNBASE;
	thread0.td_kstack_pages = kstack_pages;
	kstack0_sz = thread0.td_kstack_pages * PAGE_SIZE;
	bzero((void *)thread0.td_kstack, kstack0_sz);
	physfree += kstack0_sz;

	/*
	 * make gdt memory segments; GPROC0 (TSS) and GUSERLDT are system
	 * descriptors occupying two slots each and are handled separately.
	 */
	for (x = 0; x < NGDT; x++) {
		if (x != GPROC0_SEL && x != (GPROC0_SEL + 1) &&
		    x != GUSERLDT_SEL && x != (GUSERLDT_SEL) + 1)
			ssdtosd(&gdt_segs[x], &gdt[x]);
	}
	gdt_segs[GPROC0_SEL].ssd_base = (uintptr_t)&common_tss[0];
	ssdtosyssd(&gdt_segs[GPROC0_SEL],
	    (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);

	r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
	r_gdt.rd_base = (long) gdt;
	lgdt(&r_gdt);
	pc = &__pcpu[0];

	wrmsr(MSR_FSBASE, 0);		/* User value */
	wrmsr(MSR_GSBASE, (u_int64_t)pc);	/* kernel %gs -> pcpu */
	wrmsr(MSR_KGSBASE, 0);		/* User value while in the kernel */

	pcpu_init(pc, 0, sizeof(struct pcpu));
	dpcpu_init((void *)(physfree + KERNBASE), 0);
	physfree += DPCPU_SIZE;
	PCPU_SET(prvspace, pc);
	PCPU_SET(curthread, &thread0);
	/* Non-late cninit() and printf() can be moved up to here. */
	PCPU_SET(tssp, &common_tss[0]);
	PCPU_SET(commontssp, &common_tss[0]);
	PCPU_SET(tss, (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);
	PCPU_SET(ldt, (struct system_segment_descriptor *)&gdt[GUSERLDT_SEL]);
	PCPU_SET(fs32p, &gdt[GUFS32_SEL]);
	PCPU_SET(gs32p, &gdt[GUGS32_SEL]);

	/*
	 * Initialize mutexes.
	 *
	 * icu_lock: in order to allow an interrupt to occur in a critical
	 *	      section, to set pcpu->ipending (etc...) properly, we
	 *	      must be able to get the icu lock, so it can't be
	 *	      under witness.
	 */
	mutex_init();
	mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS);
	mtx_init(&dt_lock, "descriptor tables", NULL, MTX_DEF);

	/* exceptions */
	pti = pti_get_default();
	TUNABLE_INT_FETCH("vm.pmap.pti", &pti);

	/* Default every vector to the reserved handler first. */
	for (x = 0; x < NIDT; x++)
		setidt(x, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd), SDT_SYSIGT,
		    SEL_KPL, 0);
	setidt(IDT_DE, pti ? &IDTVEC(div_pti) : &IDTVEC(div), SDT_SYSIGT,
	    SEL_KPL, 0);
	/* DB#, NMI, DF# and MC# run on dedicated IST stacks (4, 2, 1, 3). */
	setidt(IDT_DB, &IDTVEC(dbg), SDT_SYSIGT, SEL_KPL, 4);
	setidt(IDT_NMI, &IDTVEC(nmi), SDT_SYSIGT, SEL_KPL, 2);
	/* Breakpoint and overflow may be raised from user mode (SEL_UPL). */
	setidt(IDT_BP, pti ? &IDTVEC(bpt_pti) : &IDTVEC(bpt), SDT_SYSIGT,
	    SEL_UPL, 0);
	setidt(IDT_OF, pti ? &IDTVEC(ofl_pti) : &IDTVEC(ofl), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_BR, pti ? &IDTVEC(bnd_pti) : &IDTVEC(bnd), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_UD, pti ? &IDTVEC(ill_pti) : &IDTVEC(ill), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_NM, pti ? &IDTVEC(dna_pti) : &IDTVEC(dna), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1);
	setidt(IDT_FPUGP, pti ? &IDTVEC(fpusegm_pti) : &IDTVEC(fpusegm),
	    SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_TS, pti ? &IDTVEC(tss_pti) : &IDTVEC(tss), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_NP, pti ? &IDTVEC(missing_pti) : &IDTVEC(missing),
	    SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_SS, pti ? &IDTVEC(stk_pti) : &IDTVEC(stk), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_GP, pti ? &IDTVEC(prot_pti) : &IDTVEC(prot), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_PF, pti ? &IDTVEC(page_pti) : &IDTVEC(page), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_MF, pti ? &IDTVEC(fpu_pti) : &IDTVEC(fpu), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_AC, pti ? &IDTVEC(align_pti) : &IDTVEC(align), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_MC, &IDTVEC(mchk), SDT_SYSIGT, SEL_KPL, 3);
	setidt(IDT_XF, pti ? &IDTVEC(xmm_pti) : &IDTVEC(xmm), SDT_SYSIGT,
	    SEL_KPL, 0);
#ifdef KDTRACE_HOOKS
	setidt(IDT_DTRACE_RET, pti ? &IDTVEC(dtrace_ret_pti) :
	    &IDTVEC(dtrace_ret), SDT_SYSIGT, SEL_UPL, 0);
#endif
#ifdef XENHVM
	setidt(IDT_EVTCHN, pti ? &IDTVEC(xen_intr_upcall_pti) :
	    &IDTVEC(xen_intr_upcall), SDT_SYSIGT, SEL_KPL, 0);
#endif
	r_idt.rd_limit = sizeof(idt0) - 1;
	r_idt.rd_base = (long) idt;
	lidt(&r_idt);

	/*
	 * Initialize the clock before the console so that console
	 * initialization can use DELAY().
	 */
	clock_init();

	/*
	 * Use vt(4) by default for UEFI boot (during the sc(4)/vt(4)
	 * transition).
	 * Once bootblocks have updated, we can test directly for
	 * efi_systbl != NULL here...
	 */
	if (preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_EFI_MAP)
	    != NULL)
		vty_set_preferred(VTY_VT);

	finishidentcpu();	/* Final stage of CPU initialization */
	initializecpu();	/* Initialize CPU registers */
	initializecpucache();

	/* doublefault stack space, runs on ist1 */
	common_tss[0].tss_ist1 = (long)&dblfault_stack[sizeof(dblfault_stack)];

	/*
	 * NMI stack, runs on ist2.  The pcpu pointer is stored just
	 * above the start of the ist2 stack.
	 */
	np = ((struct nmi_pcpu *) &nmi0_stack[sizeof(nmi0_stack)]) - 1;
	np->np_pcpu = (register_t) pc;
	common_tss[0].tss_ist2 = (long) np;

	/*
	 * MC# stack, runs on ist3.  The pcpu pointer is stored just
	 * above the start of the ist3 stack.
	 */
	np = ((struct nmi_pcpu *) &mce0_stack[sizeof(mce0_stack)]) - 1;
	np->np_pcpu = (register_t) pc;
	common_tss[0].tss_ist3 = (long) np;

	/*
	 * DB# stack, runs on ist4.
	 */
	np = ((struct nmi_pcpu *) &dbg0_stack[sizeof(dbg0_stack)]) - 1;
	np->np_pcpu = (register_t) pc;
	common_tss[0].tss_ist4 = (long) np;

	/* Set the IO permission bitmap (empty due to tss seg limit) */
	common_tss[0].tss_iobase = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE;

	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
	ltr(gsel_tss);

	amd64_conf_fast_syscall();

	/*
	 * Temporary forge some valid pointer to PCB, for exception
	 * handlers.  It is reinitialized properly below after FPU is
	 * set up.  Also set up td_critnest to short-cut the page
	 * fault handler.
	 */
	cpu_max_ext_state_size = sizeof(struct savefpu);
	thread0.td_pcb = get_pcb_td(&thread0);
	thread0.td_critnest = 1;

	/*
	 * The console and kdb should be initialized even earlier than here,
	 * but some console drivers don't work until after getmemsize().
	 * Default to late console initialization to support these drivers.
	 * This loses mainly printf()s in getmemsize() and early debugging.
	 */
	late_console = 1;
	TUNABLE_INT_FETCH("debug.late_console", &late_console);
	if (!late_console) {
		cninit();
		amd64_kdb_init();
	}

	getmemsize(kmdp, physfree);
	init_param2(physmem);

	/* now running on new page tables, configured,and u/iom is accessible */

	if (late_console)
		cninit();

#ifdef DEV_ISA
#ifdef DEV_ATPIC
	elcr_probe();
	atpic_startup();
#else
	/* Reset and mask the atpics and leave them shut down. */
	atpic_reset();

	/*
	 * Point the ICU spurious interrupt vectors at the APIC spurious
	 * interrupt handler.
	 */
	setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
#endif
#else
#error "have you forgotten the isa device?";
#endif

	if (late_console)
		amd64_kdb_init();

	msgbufinit(msgbufp, msgbufsize);
	fpuinit();

	/*
	 * Set up thread0 pcb after fpuinit calculated pcb + fpu save
	 * area size.  Zero out the extended state header in fpu save
	 * area.
	 */
	thread0.td_pcb = get_pcb_td(&thread0);
	thread0.td_pcb->pcb_save = get_pcb_user_save_td(&thread0);
	bzero(get_pcb_user_save_td(&thread0), cpu_max_ext_state_size);
	if (use_xsave) {
		xhdr = (struct xstate_hdr *)(get_pcb_user_save_td(&thread0) +
		    1);
		xhdr->xstate_bv = xsave_mask;
	}
	/* make an initial tss so cpu can get interrupt stack on syscall! */
	rsp0 = (vm_offset_t)thread0.td_pcb;
	/* Ensure the stack is aligned to 16 bytes */
	rsp0 &= ~0xFul;
	/* With PTI the syscall entry stack is the per-CPU PTI stack. */
	common_tss[0].tss_rsp0 = pti ? ((vm_offset_t)PCPU_PTR(pti_stack) +
	    PC_PTI_STACK_SZ * sizeof(uint64_t)) & ~0xful : rsp0;
	PCPU_SET(rsp0, rsp0);
	PCPU_SET(curpcb, thread0.td_pcb);

	/* transfer to user mode */

	_ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
	_udatasel = GSEL(GUDATA_SEL, SEL_UPL);
	_ucode32sel = GSEL(GUCODE32_SEL, SEL_UPL);
	_ufssel = GSEL(GUFS32_SEL, SEL_UPL);
	_ugssel = GSEL(GUGS32_SEL, SEL_UPL);

	load_ds(_udatasel);
	load_es(_udatasel);
	load_fs(_ufssel);

	/* setup proc 0's pcb */
	thread0.td_pcb->pcb_flags = 0;
	thread0.td_frame = &proc0_tf;

	env = kern_getenv("kernelname");
	if (env != NULL)
		strlcpy(kernelname, env, sizeof(kernelname));

	cpu_probe_amdc1e();

#ifdef FDT
	x86_init_fdt();
#endif
	/* Drop the temporary critical nesting set up above. */
	thread0.td_critnest = 0;

	TUNABLE_INT_FETCH("hw.ibrs_disable", &hw_ibrs_disable);

	/* Location of kernel stack for locore */
	return ((u_int64_t)thread0.td_pcb);
}
1849
/*
 * Machine-dependent per-CPU data initialization hook.  Only marks the
 * ACPI id as "unset" (0xffffffff); presumably real ids are filled in
 * later by ACPI/MADT processing — confirm against the ACPI attach code.
 */
void
cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size)
{

	pcpu->pc_acpi_id = 0xffffffff;
}
1856
1857 static int
1858 smap_sysctl_handler(SYSCTL_HANDLER_ARGS)
1859 {
1860 struct bios_smap *smapbase;
1861 struct bios_smap_xattr smap;
1862 caddr_t kmdp;
1863 uint32_t *smapattr;
1864 int count, error, i;
1865
1866 /* Retrieve the system memory map from the loader. */
1867 kmdp = preload_search_by_type("elf kernel");
1868 if (kmdp == NULL)
1869 kmdp = preload_search_by_type("elf64 kernel");
1870 smapbase = (struct bios_smap *)preload_search_info(kmdp,
1871 MODINFO_METADATA | MODINFOMD_SMAP);
1872 if (smapbase == NULL)
1873 return (0);
1874 smapattr = (uint32_t *)preload_search_info(kmdp,
1875 MODINFO_METADATA | MODINFOMD_SMAP_XATTR);
1876 count = *((uint32_t *)smapbase - 1) / sizeof(*smapbase);
1877 error = 0;
1878 for (i = 0; i < count; i++) {
1879 smap.base = smapbase[i].base;
1880 smap.length = smapbase[i].length;
1881 smap.type = smapbase[i].type;
1882 if (smapattr != NULL)
1883 smap.xattr = smapattr[i];
1884 else
1885 smap.xattr = 0;
1886 error = SYSCTL_OUT(req, &smap, sizeof(smap));
1887 }
1888 return (error);
1889 }
1890 SYSCTL_PROC(_machdep, OID_AUTO, smap, CTLTYPE_OPAQUE|CTLFLAG_RD, NULL, 0,
1891 smap_sysctl_handler, "S,bios_smap_xattr", "Raw BIOS SMAP data");
1892
1893 static int
1894 efi_map_sysctl_handler(SYSCTL_HANDLER_ARGS)
1895 {
1896 struct efi_map_header *efihdr;
1897 caddr_t kmdp;
1898 uint32_t efisize;
1899
1900 kmdp = preload_search_by_type("elf kernel");
1901 if (kmdp == NULL)
1902 kmdp = preload_search_by_type("elf64 kernel");
1903 efihdr = (struct efi_map_header *)preload_search_info(kmdp,
1904 MODINFO_METADATA | MODINFOMD_EFI_MAP);
1905 if (efihdr == NULL)
1906 return (0);
1907 efisize = *((uint32_t *)efihdr - 1);
1908 return (SYSCTL_OUT(req, efihdr, efisize));
1909 }
1910 SYSCTL_PROC(_machdep, OID_AUTO, efi_map, CTLTYPE_OPAQUE|CTLFLAG_RD, NULL, 0,
1911 efi_map_sysctl_handler, "S,efi_map_header", "Raw EFI Memory Map");
1912
/*
 * Per-thread spinlock entry: disable interrupts (outermost level only,
 * saving the previous state for spinlock_exit()) and enter a critical
 * section.  Calls nest via md_spinlock_count.
 */
void
spinlock_enter(void)
{
	struct thread *td;
	register_t flags;

	td = curthread;
	if (td->td_md.md_spinlock_count == 0) {
		/* Outermost acquisition: save and disable interrupts. */
		flags = intr_disable();
		td->td_md.md_spinlock_count = 1;
		td->td_md.md_saved_flags = flags;
	} else
		td->td_md.md_spinlock_count++;
	critical_enter();
}
1928
/*
 * Per-thread spinlock exit: leave the critical section and, when the
 * outermost nesting level is released, restore the interrupt state
 * saved by spinlock_enter().  Mirror-image ordering of spinlock_enter().
 */
void
spinlock_exit(void)
{
	struct thread *td;
	register_t flags;

	td = curthread;
	critical_exit();
	flags = td->td_md.md_saved_flags;
	td->td_md.md_spinlock_count--;
	/* Only re-enable interrupts once the last nested level is gone. */
	if (td->td_md.md_spinlock_count == 0)
		intr_restore(flags);
}
1942
1943 /*
1944 * Construct a PCB from a trapframe. This is called from kdb_trap() where
1945 * we want to start a backtrace from the function that caused us to enter
1946 * the debugger. We have the context in the trapframe, but base the trace
1947 * on the PCB. The PCB doesn't have to be perfect, as long as it contains
1948 * enough for a backtrace.
1949 */
1950 void
1951 makectx(struct trapframe *tf, struct pcb *pcb)
1952 {
1953
1954 pcb->pcb_r12 = tf->tf_r12;
1955 pcb->pcb_r13 = tf->tf_r13;
1956 pcb->pcb_r14 = tf->tf_r14;
1957 pcb->pcb_r15 = tf->tf_r15;
1958 pcb->pcb_rbp = tf->tf_rbp;
1959 pcb->pcb_rbx = tf->tf_rbx;
1960 pcb->pcb_rip = tf->tf_rip;
1961 pcb->pcb_rsp = tf->tf_rsp;
1962 }
1963
/*
 * Redirect a traced thread's execution to addr by rewriting the saved
 * instruction pointer.  PCB_FULL_IRET forces the full iret return path
 * so the modified frame takes effect on return to user mode.
 */
int
ptrace_set_pc(struct thread *td, unsigned long addr)
{

	td->td_frame->tf_rip = addr;
	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
	return (0);
}
1972
/*
 * Arrange for the traced thread to stop after one instruction by
 * setting the trace flag (PSL_T) in its saved rflags.
 */
int
ptrace_single_step(struct thread *td)
{
	td->td_frame->tf_rflags |= PSL_T;
	return (0);
}
1979
/*
 * Undo ptrace_single_step(): clear the trace flag (PSL_T) in the
 * thread's saved rflags.
 */
int
ptrace_clear_single_step(struct thread *td)
{
	td->td_frame->tf_rflags &= ~PSL_T;
	return (0);
}
1986
1987 int
1988 fill_regs(struct thread *td, struct reg *regs)
1989 {
1990 struct trapframe *tp;
1991
1992 tp = td->td_frame;
1993 return (fill_frame_regs(tp, regs));
1994 }
1995
1996 int
1997 fill_frame_regs(struct trapframe *tp, struct reg *regs)
1998 {
1999 regs->r_r15 = tp->tf_r15;
2000 regs->r_r14 = tp->tf_r14;
2001 regs->r_r13 = tp->tf_r13;
2002 regs->r_r12 = tp->tf_r12;
2003 regs->r_r11 = tp->tf_r11;
2004 regs->r_r10 = tp->tf_r10;
2005 regs->r_r9 = tp->tf_r9;
2006 regs->r_r8 = tp->tf_r8;
2007 regs->r_rdi = tp->tf_rdi;
2008 regs->r_rsi = tp->tf_rsi;
2009 regs->r_rbp = tp->tf_rbp;
2010 regs->r_rbx = tp->tf_rbx;
2011 regs->r_rdx = tp->tf_rdx;
2012 regs->r_rcx = tp->tf_rcx;
2013 regs->r_rax = tp->tf_rax;
2014 regs->r_rip = tp->tf_rip;
2015 regs->r_cs = tp->tf_cs;
2016 regs->r_rflags = tp->tf_rflags;
2017 regs->r_rsp = tp->tf_rsp;
2018 regs->r_ss = tp->tf_ss;
2019 if (tp->tf_flags & TF_HASSEGS) {
2020 regs->r_ds = tp->tf_ds;
2021 regs->r_es = tp->tf_es;
2022 regs->r_fs = tp->tf_fs;
2023 regs->r_gs = tp->tf_gs;
2024 } else {
2025 regs->r_ds = 0;
2026 regs->r_es = 0;
2027 regs->r_fs = 0;
2028 regs->r_gs = 0;
2029 }
2030 return (0);
2031 }
2032
/*
 * Install a new general register set for a thread from *regs (e.g. for
 * a debugger).  The caller-supplied rflags and %cs are vetted first so
 * the target cannot be granted unsafe flag bits or a non-user code
 * selector; returns EINVAL when the check fails, 0 otherwise.
 */
int
set_regs(struct thread *td, struct reg *regs)
{
	struct trapframe *tp;
	register_t rflags;

	tp = td->td_frame;
	rflags = regs->r_rflags & 0xffffffff;
	if (!EFL_SECURE(rflags, tp->tf_rflags) || !CS_SECURE(regs->r_cs))
		return (EINVAL);
	tp->tf_r15 = regs->r_r15;
	tp->tf_r14 = regs->r_r14;
	tp->tf_r13 = regs->r_r13;
	tp->tf_r12 = regs->r_r12;
	tp->tf_r11 = regs->r_r11;
	tp->tf_r10 = regs->r_r10;
	tp->tf_r9  = regs->r_r9;
	tp->tf_r8  = regs->r_r8;
	tp->tf_rdi = regs->r_rdi;
	tp->tf_rsi = regs->r_rsi;
	tp->tf_rbp = regs->r_rbp;
	tp->tf_rbx = regs->r_rbx;
	tp->tf_rdx = regs->r_rdx;
	tp->tf_rcx = regs->r_rcx;
	tp->tf_rax = regs->r_rax;
	tp->tf_rip = regs->r_rip;
	tp->tf_cs = regs->r_cs;
	tp->tf_rflags = rflags;
	tp->tf_rsp = regs->r_rsp;
	tp->tf_ss = regs->r_ss;
	/* Segment register updates are deliberately disabled. */
	if (0) {	/* XXXKIB */
		tp->tf_ds = regs->r_ds;
		tp->tf_es = regs->r_es;
		tp->tf_fs = regs->r_fs;
		tp->tf_gs = regs->r_gs;
		tp->tf_flags = TF_HASSEGS;
	}
	/* Force the full iret path so the new frame takes effect. */
	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
	return (0);
}
2073
/* XXX check all this stuff! */
/*
 * externalize from sv_xmm: convert the in-kernel FXSAVE-layout FPU
 * save area into the exported struct fpreg representation.
 */
static void
fill_fpregs_xmm(struct savefpu *sv_xmm, struct fpreg *fpregs)
{
	struct envxmm *penv_fpreg = (struct envxmm *)&fpregs->fpr_env;
	struct envxmm *penv_xmm = &sv_xmm->sv_env;
	int i;

	/* pcb -> fpregs */
	bzero(fpregs, sizeof(*fpregs));

	/* FPU control/status */
	penv_fpreg->en_cw = penv_xmm->en_cw;
	penv_fpreg->en_sw = penv_xmm->en_sw;
	penv_fpreg->en_tw = penv_xmm->en_tw;
	penv_fpreg->en_opcode = penv_xmm->en_opcode;
	penv_fpreg->en_rip = penv_xmm->en_rip;
	penv_fpreg->en_rdp = penv_xmm->en_rdp;
	penv_fpreg->en_mxcsr = penv_xmm->en_mxcsr;
	penv_fpreg->en_mxcsr_mask = penv_xmm->en_mxcsr_mask;

	/* FPU registers: 8 accumulators of 10 bytes (80-bit) each. */
	for (i = 0; i < 8; ++i)
		bcopy(sv_xmm->sv_fp[i].fp_acc.fp_bytes, fpregs->fpr_acc[i], 10);

	/* SSE registers: 16 XMM registers of 16 bytes each. */
	for (i = 0; i < 16; ++i)
		bcopy(sv_xmm->sv_xmm[i].xmm_bytes, fpregs->fpr_xacc[i], 16);
}
2104
/*
 * internalize from fpregs into sv_xmm: convert the exported struct
 * fpreg representation back into the in-kernel FXSAVE-layout save area.
 */
static void
set_fpregs_xmm(struct fpreg *fpregs, struct savefpu *sv_xmm)
{
	struct envxmm *penv_xmm = &sv_xmm->sv_env;
	struct envxmm *penv_fpreg = (struct envxmm *)&fpregs->fpr_env;
	int i;

	/* fpregs -> pcb */
	/* FPU control/status */
	penv_xmm->en_cw = penv_fpreg->en_cw;
	penv_xmm->en_sw = penv_fpreg->en_sw;
	penv_xmm->en_tw = penv_fpreg->en_tw;
	penv_xmm->en_opcode = penv_fpreg->en_opcode;
	penv_xmm->en_rip = penv_fpreg->en_rip;
	penv_xmm->en_rdp = penv_fpreg->en_rdp;
	penv_xmm->en_mxcsr = penv_fpreg->en_mxcsr;
	/* Sanitize: only accept mxcsr mask bits this CPU supports. */
	penv_xmm->en_mxcsr_mask = penv_fpreg->en_mxcsr_mask & cpu_mxcsr_mask;

	/* FPU registers: 8 accumulators of 10 bytes (80-bit) each. */
	for (i = 0; i < 8; ++i)
		bcopy(fpregs->fpr_acc[i], sv_xmm->sv_fp[i].fp_acc.fp_bytes, 10);

	/* SSE registers: 16 XMM registers of 16 bytes each. */
	for (i = 0; i < 16; ++i)
		bcopy(fpregs->fpr_xacc[i], sv_xmm->sv_xmm[i].xmm_bytes, 16);
}
2132
/*
 * externalize from td->pcb: copy a thread's FPU state into *fpregs.
 * The thread must be stopped (or be curthread), as the KASSERT checks.
 */
int
fill_fpregs(struct thread *td, struct fpreg *fpregs)
{

	KASSERT(td == curthread || TD_IS_SUSPENDED(td) ||
	    P_SHOULDSTOP(td->td_proc),
	    ("not suspended thread %p", td));
	/* Flush any live FPU state into the save area before copying. */
	fpugetregs(td);
	fill_fpregs_xmm(get_pcb_user_save_td(td), fpregs);
	return (0);
}
2145
/*
 * internalize to td->pcb: install the FPU state in *fpregs as the
 * thread's user FPU state.
 */
int
set_fpregs(struct thread *td, struct fpreg *fpregs)
{

	/* Keep the update atomic with respect to context switches. */
	critical_enter();
	set_fpregs_xmm(fpregs, get_pcb_user_save_td(td));
	fpuuserinited(td);
	critical_exit();
	return (0);
}
2157
/*
 * Get machine context: snapshot the thread's trapframe, FPU state and
 * segment bases into *mcp.  Always returns 0.
 */
int
get_mcontext(struct thread *td, mcontext_t *mcp, int flags)
{
	struct pcb *pcb;
	struct trapframe *tp;

	pcb = td->td_pcb;
	tp = td->td_frame;
	/* sigonstack() consults sigaltstack state; hold the proc lock. */
	PROC_LOCK(curthread->td_proc);
	mcp->mc_onstack = sigonstack(tp->tf_rsp);
	PROC_UNLOCK(curthread->td_proc);
	mcp->mc_r15 = tp->tf_r15;
	mcp->mc_r14 = tp->tf_r14;
	mcp->mc_r13 = tp->tf_r13;
	mcp->mc_r12 = tp->tf_r12;
	mcp->mc_r11 = tp->tf_r11;
	mcp->mc_r10 = tp->tf_r10;
	mcp->mc_r9 = tp->tf_r9;
	mcp->mc_r8 = tp->tf_r8;
	mcp->mc_rdi = tp->tf_rdi;
	mcp->mc_rsi = tp->tf_rsi;
	mcp->mc_rbp = tp->tf_rbp;
	mcp->mc_rbx = tp->tf_rbx;
	mcp->mc_rcx = tp->tf_rcx;
	mcp->mc_rflags = tp->tf_rflags;
	if (flags & GET_MC_CLEAR_RET) {
		/*
		 * Caller wants the syscall-return registers cleared:
		 * zero rax/rdx and the carry (error) flag.
		 */
		mcp->mc_rax = 0;
		mcp->mc_rdx = 0;
		mcp->mc_rflags &= ~PSL_C;
	} else {
		mcp->mc_rax = tp->tf_rax;
		mcp->mc_rdx = tp->tf_rdx;
	}
	mcp->mc_rip = tp->tf_rip;
	mcp->mc_cs = tp->tf_cs;
	mcp->mc_rsp = tp->tf_rsp;
	mcp->mc_ss = tp->tf_ss;
	mcp->mc_ds = tp->tf_ds;
	mcp->mc_es = tp->tf_es;
	mcp->mc_fs = tp->tf_fs;
	mcp->mc_gs = tp->tf_gs;
	mcp->mc_flags = tp->tf_flags;
	/* mc_len lets set_mcontext() reject mismatched structure layouts. */
	mcp->mc_len = sizeof(*mcp);
	get_fpcontext(td, mcp, NULL, 0);
	mcp->mc_fsbase = pcb->pcb_fsbase;
	mcp->mc_gsbase = pcb->pcb_gsbase;
	mcp->mc_xfpustate = 0;
	mcp->mc_xfpustate_len = 0;
	/* Don't leak kernel stack bytes through the spare fields. */
	bzero(mcp->mc_spare, sizeof(mcp->mc_spare));
	return (0);
}
2212
/*
 * Set machine context.
 *
 * However, we don't set any but the user modifiable flags, and we won't
 * touch the cs selector.
 *
 * Returns EINVAL on layout/flag mismatch, a copyin/set_fpcontext error,
 * or 0 on success.
 */
int
set_mcontext(struct thread *td, mcontext_t *mcp)
{
	struct pcb *pcb;
	struct trapframe *tp;
	char *xfpustate;
	long rflags;
	int ret;

	pcb = td->td_pcb;
	tp = td->td_frame;
	if (mcp->mc_len != sizeof(*mcp) ||
	    (mcp->mc_flags & ~_MC_FLAG_MASK) != 0)
		return (EINVAL);
	/* Merge only the user-changeable rflags bits (PSL_USERCHANGE). */
	rflags = (mcp->mc_rflags & PSL_USERCHANGE) |
	    (tp->tf_rflags & ~PSL_USERCHANGE);
	if (mcp->mc_flags & _MC_HASFPXSTATE) {
		/*
		 * Bound the user-supplied length before alloca()ing it on
		 * the kernel stack; cpu_max_ext_state_size keeps this small.
		 */
		if (mcp->mc_xfpustate_len > cpu_max_ext_state_size -
		    sizeof(struct savefpu))
			return (EINVAL);
		xfpustate = __builtin_alloca(mcp->mc_xfpustate_len);
		ret = copyin((void *)mcp->mc_xfpustate, xfpustate,
		    mcp->mc_xfpustate_len);
		if (ret != 0)
			return (ret);
	} else
		xfpustate = NULL;
	/* Install FPU state before touching the trapframe. */
	ret = set_fpcontext(td, mcp, xfpustate, mcp->mc_xfpustate_len);
	if (ret != 0)
		return (ret);
	tp->tf_r15 = mcp->mc_r15;
	tp->tf_r14 = mcp->mc_r14;
	tp->tf_r13 = mcp->mc_r13;
	tp->tf_r12 = mcp->mc_r12;
	tp->tf_r11 = mcp->mc_r11;
	tp->tf_r10 = mcp->mc_r10;
	tp->tf_r9 = mcp->mc_r9;
	tp->tf_r8 = mcp->mc_r8;
	tp->tf_rdi = mcp->mc_rdi;
	tp->tf_rsi = mcp->mc_rsi;
	tp->tf_rbp = mcp->mc_rbp;
	tp->tf_rbx = mcp->mc_rbx;
	tp->tf_rdx = mcp->mc_rdx;
	tp->tf_rcx = mcp->mc_rcx;
	tp->tf_rax = mcp->mc_rax;
	tp->tf_rip = mcp->mc_rip;
	tp->tf_rflags = rflags;
	tp->tf_rsp = mcp->mc_rsp;
	tp->tf_ss = mcp->mc_ss;
	tp->tf_flags = mcp->mc_flags;
	/* Segment registers are only restored if the frame carries them. */
	if (tp->tf_flags & TF_HASSEGS) {
		tp->tf_ds = mcp->mc_ds;
		tp->tf_es = mcp->mc_es;
		tp->tf_fs = mcp->mc_fs;
		tp->tf_gs = mcp->mc_gs;
	}
	if (mcp->mc_flags & _MC_HASBASES) {
		pcb->pcb_fsbase = mcp->mc_fsbase;
		pcb->pcb_gsbase = mcp->mc_gsbase;
	}
	/* Force the slow return path so segment/base changes take effect. */
	set_pcb_flags(pcb, PCB_FULL_IRET);
	return (0);
}
2282
2283 static void
2284 get_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpusave,
2285 size_t xfpusave_len)
2286 {
2287 size_t max_len, len;
2288
2289 mcp->mc_ownedfp = fpugetregs(td);
2290 bcopy(get_pcb_user_save_td(td), &mcp->mc_fpstate[0],
2291 sizeof(mcp->mc_fpstate));
2292 mcp->mc_fpformat = fpuformat();
2293 if (!use_xsave || xfpusave_len == 0)
2294 return;
2295 max_len = cpu_max_ext_state_size - sizeof(struct savefpu);
2296 len = xfpusave_len;
2297 if (len > max_len) {
2298 len = max_len;
2299 bzero(xfpusave + max_len, len - max_len);
2300 }
2301 mcp->mc_flags |= _MC_HASFPXSTATE;
2302 mcp->mc_xfpustate_len = len;
2303 bcopy(get_pcb_user_save_td(td) + 1, xfpusave, len);
2304 }
2305
2306 static int
2307 set_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpustate,
2308 size_t xfpustate_len)
2309 {
2310 struct savefpu *fpstate;
2311 int error;
2312
2313 if (mcp->mc_fpformat == _MC_FPFMT_NODEV)
2314 return (0);
2315 else if (mcp->mc_fpformat != _MC_FPFMT_XMM)
2316 return (EINVAL);
2317 else if (mcp->mc_ownedfp == _MC_FPOWNED_NONE) {
2318 /* We don't care what state is left in the FPU or PCB. */
2319 fpstate_drop(td);
2320 error = 0;
2321 } else if (mcp->mc_ownedfp == _MC_FPOWNED_FPU ||
2322 mcp->mc_ownedfp == _MC_FPOWNED_PCB) {
2323 fpstate = (struct savefpu *)&mcp->mc_fpstate;
2324 fpstate->sv_env.en_mxcsr &= cpu_mxcsr_mask;
2325 error = fpusetregs(td, fpstate, xfpustate, xfpustate_len);
2326 } else
2327 return (EINVAL);
2328 return (error);
2329 }
2330
/*
 * Discard the thread's user FPU state: release the FPU if this thread
 * currently owns it and mark the PCB state uninitialized.
 */
void
fpstate_drop(struct thread *td)
{

	KASSERT(PCB_USER_FPU(td->td_pcb), ("fpstate_drop: kernel-owned fpu"));
	critical_enter();
	if (PCPU_GET(fpcurthread) == td)
		fpudrop();
	/*
	 * XXX force a full drop of the fpu. The above only drops it if we
	 * owned it.
	 *
	 * XXX I don't much like fpugetuserregs()'s semantics of doing a full
	 * drop. Dropping only to the pcb matches fnsave's behaviour.
	 * We only need to drop to !PCB_INITDONE in sendsig(). But
	 * sendsig() is the only caller of fpugetuserregs()... perhaps we just
	 * have too many layers.
	 */
	clear_pcb_flags(curthread->td_pcb,
	    PCB_FPUINITDONE | PCB_USERFPUINITDONE);
	critical_exit();
}
2353
2354 int
2355 fill_dbregs(struct thread *td, struct dbreg *dbregs)
2356 {
2357 struct pcb *pcb;
2358
2359 if (td == NULL) {
2360 dbregs->dr[0] = rdr0();
2361 dbregs->dr[1] = rdr1();
2362 dbregs->dr[2] = rdr2();
2363 dbregs->dr[3] = rdr3();
2364 dbregs->dr[6] = rdr6();
2365 dbregs->dr[7] = rdr7();
2366 } else {
2367 pcb = td->td_pcb;
2368 dbregs->dr[0] = pcb->pcb_dr0;
2369 dbregs->dr[1] = pcb->pcb_dr1;
2370 dbregs->dr[2] = pcb->pcb_dr2;
2371 dbregs->dr[3] = pcb->pcb_dr3;
2372 dbregs->dr[6] = pcb->pcb_dr6;
2373 dbregs->dr[7] = pcb->pcb_dr7;
2374 }
2375 dbregs->dr[4] = 0;
2376 dbregs->dr[5] = 0;
2377 dbregs->dr[8] = 0;
2378 dbregs->dr[9] = 0;
2379 dbregs->dr[10] = 0;
2380 dbregs->dr[11] = 0;
2381 dbregs->dr[12] = 0;
2382 dbregs->dr[13] = 0;
2383 dbregs->dr[14] = 0;
2384 dbregs->dr[15] = 0;
2385 return (0);
2386 }
2387
2388 int
2389 set_dbregs(struct thread *td, struct dbreg *dbregs)
2390 {
2391 struct pcb *pcb;
2392 int i;
2393
2394 if (td == NULL) {
2395 load_dr0(dbregs->dr[0]);
2396 load_dr1(dbregs->dr[1]);
2397 load_dr2(dbregs->dr[2]);
2398 load_dr3(dbregs->dr[3]);
2399 load_dr6(dbregs->dr[6]);
2400 load_dr7(dbregs->dr[7]);
2401 } else {
2402 /*
2403 * Don't let an illegal value for dr7 get set. Specifically,
2404 * check for undefined settings. Setting these bit patterns
2405 * result in undefined behaviour and can lead to an unexpected
2406 * TRCTRAP or a general protection fault right here.
2407 * Upper bits of dr6 and dr7 must not be set
2408 */
2409 for (i = 0; i < 4; i++) {
2410 if (DBREG_DR7_ACCESS(dbregs->dr[7], i) == 0x02)
2411 return (EINVAL);
2412 if (td->td_frame->tf_cs == _ucode32sel &&
2413 DBREG_DR7_LEN(dbregs->dr[7], i) == DBREG_DR7_LEN_8)
2414 return (EINVAL);
2415 }
2416 if ((dbregs->dr[6] & 0xffffffff00000000ul) != 0 ||
2417 (dbregs->dr[7] & 0xffffffff00000000ul) != 0)
2418 return (EINVAL);
2419
2420 pcb = td->td_pcb;
2421
2422 /*
2423 * Don't let a process set a breakpoint that is not within the
2424 * process's address space. If a process could do this, it
2425 * could halt the system by setting a breakpoint in the kernel
2426 * (if ddb was enabled). Thus, we need to check to make sure
2427 * that no breakpoints are being enabled for addresses outside
2428 * process's address space.
2429 *
2430 * XXX - what about when the watched area of the user's
2431 * address space is written into from within the kernel
2432 * ... wouldn't that still cause a breakpoint to be generated
2433 * from within kernel mode?
2434 */
2435
2436 if (DBREG_DR7_ENABLED(dbregs->dr[7], 0)) {
2437 /* dr0 is enabled */
2438 if (dbregs->dr[0] >= VM_MAXUSER_ADDRESS)
2439 return (EINVAL);
2440 }
2441 if (DBREG_DR7_ENABLED(dbregs->dr[7], 1)) {
2442 /* dr1 is enabled */
2443 if (dbregs->dr[1] >= VM_MAXUSER_ADDRESS)
2444 return (EINVAL);
2445 }
2446 if (DBREG_DR7_ENABLED(dbregs->dr[7], 2)) {
2447 /* dr2 is enabled */
2448 if (dbregs->dr[2] >= VM_MAXUSER_ADDRESS)
2449 return (EINVAL);
2450 }
2451 if (DBREG_DR7_ENABLED(dbregs->dr[7], 3)) {
2452 /* dr3 is enabled */
2453 if (dbregs->dr[3] >= VM_MAXUSER_ADDRESS)
2454 return (EINVAL);
2455 }
2456
2457 pcb->pcb_dr0 = dbregs->dr[0];
2458 pcb->pcb_dr1 = dbregs->dr[1];
2459 pcb->pcb_dr2 = dbregs->dr[2];
2460 pcb->pcb_dr3 = dbregs->dr[3];
2461 pcb->pcb_dr6 = dbregs->dr[6];
2462 pcb->pcb_dr7 = dbregs->dr[7];
2463
2464 set_pcb_flags(pcb, PCB_DBREGS);
2465 }
2466
2467 return (0);
2468 }
2469
/*
 * Clear all hardware debug registers.  dr7 is cleared first so no
 * breakpoint can fire while the address registers still hold old values.
 */
void
reset_dbregs(void)
{

	load_dr7(0);	/* Turn off the control bits first */
	load_dr0(0);
	load_dr1(0);
	load_dr2(0);
	load_dr3(0);
	load_dr6(0);
}
2481
2482 /*
2483 * Return > 0 if a hardware breakpoint has been hit, and the
2484 * breakpoint was in user space. Return 0, otherwise.
2485 */
2486 int
2487 user_dbreg_trap(void)
2488 {
2489 u_int64_t dr7, dr6; /* debug registers dr6 and dr7 */
2490 u_int64_t bp; /* breakpoint bits extracted from dr6 */
2491 int nbp; /* number of breakpoints that triggered */
2492 caddr_t addr[4]; /* breakpoint addresses */
2493 int i;
2494
2495 dr7 = rdr7();
2496 if ((dr7 & 0x000000ff) == 0) {
2497 /*
2498 * all GE and LE bits in the dr7 register are zero,
2499 * thus the trap couldn't have been caused by the
2500 * hardware debug registers
2501 */
2502 return 0;
2503 }
2504
2505 nbp = 0;
2506 dr6 = rdr6();
2507 bp = dr6 & 0x0000000f;
2508
2509 if (!bp) {
2510 /*
2511 * None of the breakpoint bits are set meaning this
2512 * trap was not caused by any of the debug registers
2513 */
2514 return 0;
2515 }
2516
2517 /*
2518 * at least one of the breakpoints were hit, check to see
2519 * which ones and if any of them are user space addresses
2520 */
2521
2522 if (bp & 0x01) {
2523 addr[nbp++] = (caddr_t)rdr0();
2524 }
2525 if (bp & 0x02) {
2526 addr[nbp++] = (caddr_t)rdr1();
2527 }
2528 if (bp & 0x04) {
2529 addr[nbp++] = (caddr_t)rdr2();
2530 }
2531 if (bp & 0x08) {
2532 addr[nbp++] = (caddr_t)rdr3();
2533 }
2534
2535 for (i = 0; i < nbp; i++) {
2536 if (addr[i] < (caddr_t)VM_MAXUSER_ADDRESS) {
2537 /*
2538 * addr[i] is in user space
2539 */
2540 return nbp;
2541 }
2542 }
2543
2544 /*
2545 * None of the breakpoints are in user space.
2546 */
2547 return 0;
2548 }
2549
2550 #ifdef KDB
2551
2552 /*
2553 * Provide inb() and outb() as functions. They are normally only available as
2554 * inline functions, thus cannot be called from the debugger.
2555 */
2556
2557 /* silence compiler warnings */
2558 u_char inb_(u_short);
2559 void outb_(u_short, u_char);
2560
/* Out-of-line wrapper so the debugger can call the normally-inline inb(). */
u_char
inb_(u_short port)
{
	return inb(port);
}
2566
/* Out-of-line wrapper so the debugger can call the normally-inline outb(). */
void
outb_(u_short port, u_char data)
{
	outb(port, data);
}
2572
2573 #endif /* KDB */
Cache object: e98599797ce99f24365138f01f82b9ee
|