1 /*-
2 * SPDX-License-Identifier: BSD-4-Clause
3 *
4 * Copyright (c) 2003 Peter Wemm.
5 * Copyright (c) 1992 Terrence R. Lambert.
6 * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
7 * All rights reserved.
8 *
9 * This code is derived from software contributed to Berkeley by
10 * William Jolitz.
11 *
12 * Redistribution and use in source and binary forms, with or without
13 * modification, are permitted provided that the following conditions
14 * are met:
15 * 1. Redistributions of source code must retain the above copyright
16 * notice, this list of conditions and the following disclaimer.
17 * 2. Redistributions in binary form must reproduce the above copyright
18 * notice, this list of conditions and the following disclaimer in the
19 * documentation and/or other materials provided with the distribution.
20 * 3. All advertising materials mentioning features or use of this software
21 * must display the following acknowledgement:
22 * This product includes software developed by the University of
23 * California, Berkeley and its contributors.
24 * 4. Neither the name of the University nor the names of its contributors
25 * may be used to endorse or promote products derived from this software
26 * without specific prior written permission.
27 *
28 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
29 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
30 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
31 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
32 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
33 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
34 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
35 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
37 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
38 * SUCH DAMAGE.
39 *
40 * from: @(#)machdep.c 7.4 (Berkeley) 6/3/91
41 */
42
43 #include <sys/cdefs.h>
44 __FBSDID("$FreeBSD: releng/12.0/sys/amd64/amd64/machdep.c 341603 2018-12-05 19:16:12Z gjb $");
45
46 #include "opt_atpic.h"
47 #include "opt_cpu.h"
48 #include "opt_ddb.h"
49 #include "opt_inet.h"
50 #include "opt_isa.h"
51 #include "opt_kstack_pages.h"
52 #include "opt_maxmem.h"
53 #include "opt_mp_watchdog.h"
54 #include "opt_pci.h"
55 #include "opt_platform.h"
56 #include "opt_sched.h"
57
58 #include <sys/param.h>
59 #include <sys/proc.h>
60 #include <sys/systm.h>
61 #include <sys/bio.h>
62 #include <sys/buf.h>
63 #include <sys/bus.h>
64 #include <sys/callout.h>
65 #include <sys/cons.h>
66 #include <sys/cpu.h>
67 #include <sys/efi.h>
68 #include <sys/eventhandler.h>
69 #include <sys/exec.h>
70 #include <sys/imgact.h>
71 #include <sys/kdb.h>
72 #include <sys/kernel.h>
73 #include <sys/ktr.h>
74 #include <sys/linker.h>
75 #include <sys/lock.h>
76 #include <sys/malloc.h>
77 #include <sys/memrange.h>
78 #include <sys/msgbuf.h>
79 #include <sys/mutex.h>
80 #include <sys/pcpu.h>
81 #include <sys/ptrace.h>
82 #include <sys/reboot.h>
83 #include <sys/rwlock.h>
84 #include <sys/sched.h>
85 #include <sys/signalvar.h>
86 #ifdef SMP
87 #include <sys/smp.h>
88 #endif
89 #include <sys/syscallsubr.h>
90 #include <sys/sysctl.h>
91 #include <sys/sysent.h>
92 #include <sys/sysproto.h>
93 #include <sys/ucontext.h>
94 #include <sys/vmmeter.h>
95
96 #include <vm/vm.h>
97 #include <vm/vm_extern.h>
98 #include <vm/vm_kern.h>
99 #include <vm/vm_page.h>
100 #include <vm/vm_map.h>
101 #include <vm/vm_object.h>
102 #include <vm/vm_pager.h>
103 #include <vm/vm_param.h>
104 #include <vm/vm_phys.h>
105
106 #ifdef DDB
107 #ifndef KDB
108 #error KDB must be enabled in order for DDB to work!
109 #endif
110 #include <ddb/ddb.h>
111 #include <ddb/db_sym.h>
112 #endif
113
114 #include <net/netisr.h>
115
116 #include <machine/clock.h>
117 #include <machine/cpu.h>
118 #include <machine/cputypes.h>
119 #include <machine/frame.h>
120 #include <machine/intr_machdep.h>
121 #include <x86/mca.h>
122 #include <machine/md_var.h>
123 #include <machine/metadata.h>
124 #include <machine/mp_watchdog.h>
125 #include <machine/pc/bios.h>
126 #include <machine/pcb.h>
127 #include <machine/proc.h>
128 #include <machine/reg.h>
129 #include <machine/sigframe.h>
130 #include <machine/specialreg.h>
131 #include <machine/trap.h>
132 #include <machine/tss.h>
133 #include <x86/ucode.h>
134 #include <x86/ifunc.h>
135 #ifdef SMP
136 #include <machine/smp.h>
137 #endif
138 #ifdef FDT
139 #include <x86/fdt.h>
140 #endif
141
142 #ifdef DEV_ATPIC
143 #include <x86/isa/icu.h>
144 #else
145 #include <x86/apicvar.h>
146 #endif
147
148 #include <isa/isareg.h>
149 #include <isa/rtc.h>
150 #include <x86/init.h>
151
152 /* Sanity check for __curthread() */
153 CTASSERT(offsetof(struct pcpu, pc_curthread) == 0);
154
155 /*
156 * The PTI trampoline stack needs enough space for a hardware trapframe and a
157 * couple of scratch registers, as well as the trapframe left behind after an
158 * iret fault.
159 */
160 CTASSERT(PC_PTI_STACK_SZ * sizeof(register_t) >= 2 * sizeof(struct pti_frame) -
161 offsetof(struct pti_frame, pti_rip));
162
163 extern u_int64_t hammer_time(u_int64_t, u_int64_t);
164
165 #define CS_SECURE(cs) (ISPL(cs) == SEL_UPL)
166 #define EFL_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
167
168 static void cpu_startup(void *);
169 static void get_fpcontext(struct thread *td, mcontext_t *mcp,
170 char *xfpusave, size_t xfpusave_len);
171 static int set_fpcontext(struct thread *td, mcontext_t *mcp,
172 char *xfpustate, size_t xfpustate_len);
173 SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL);
174
175 /* Preload data parse function */
176 static caddr_t native_parse_preload_data(u_int64_t);
177
178 /* Native function to fetch and parse the e820 map */
179 static void native_parse_memmap(caddr_t, vm_paddr_t *, int *);
180
181 /* Default init_ops implementation. */
182 struct init_ops init_ops = {
183 .parse_preload_data = native_parse_preload_data,
184 .early_clock_source_init = i8254_init,
185 .early_delay = i8254_delay,
186 .parse_memmap = native_parse_memmap,
187 #ifdef SMP
188 .mp_bootaddress = mp_bootaddress,
189 .start_all_aps = native_start_all_aps,
190 #endif
191 #ifdef DEV_PCI
192 .msi_init = msi_init,
193 #endif
194 };
195
196 /*
197 * Physical address of the EFI System Table. Stashed from the metadata hints
198 * passed into the kernel and used by the EFI code to call runtime services.
199 */
200 vm_paddr_t efi_systbl_phys;
201
/*
 * Intel ICH (LPC interface bridge) registers, used by cpu_startup() to
 * mask the legacy-USB SMI# on MacBook-class machines.
 */
#define ICH_PMBASE	0x400
#define ICH_SMI_EN	(ICH_PMBASE + 0x30)	/* parenthesized: safe in any expression */
205
206 int _udatasel, _ucodesel, _ucode32sel, _ufssel, _ugssel;
207
208 int cold = 1;
209
210 long Maxmem = 0;
211 long realmem = 0;
212
213 /*
214 * The number of PHYSMAP entries must be one less than the number of
215 * PHYSSEG entries because the PHYSMAP entry that spans the largest
216 * physical address that is accessible by ISA DMA is split into two
217 * PHYSSEG entries.
218 */
219 #define PHYSMAP_SIZE (2 * (VM_PHYSSEG_MAX - 1))
220
221 vm_paddr_t phys_avail[PHYSMAP_SIZE + 2];
222 vm_paddr_t dump_avail[PHYSMAP_SIZE + 2];
223
224 /* must be 2 less so 0 0 can signal end of chunks */
225 #define PHYS_AVAIL_ARRAY_END (nitems(phys_avail) - 2)
226 #define DUMP_AVAIL_ARRAY_END (nitems(dump_avail) - 2)
227
228 struct kva_md_info kmi;
229
230 static struct trapframe proc0_tf;
231 struct region_descriptor r_gdt, r_idt;
232
233 struct pcpu __pcpu[MAXCPU];
234
235 struct mtx icu_lock;
236
237 struct mem_range_softc mem_range_softc;
238
239 struct mtx dt_lock; /* lock for GDT and LDT */
240
241 void (*vmm_resume_p)(void);
242
243 static void
244 cpu_startup(dummy)
245 void *dummy;
246 {
247 uintmax_t memsize;
248 char *sysenv;
249
250 /*
251 * On MacBooks, we need to disallow the legacy USB circuit to
252 * generate an SMI# because this can cause several problems,
253 * namely: incorrect CPU frequency detection and failure to
254 * start the APs.
255 * We do this by disabling a bit in the SMI_EN (SMI Control and
256 * Enable register) of the Intel ICH LPC Interface Bridge.
257 */
258 sysenv = kern_getenv("smbios.system.product");
259 if (sysenv != NULL) {
260 if (strncmp(sysenv, "MacBook1,1", 10) == 0 ||
261 strncmp(sysenv, "MacBook3,1", 10) == 0 ||
262 strncmp(sysenv, "MacBook4,1", 10) == 0 ||
263 strncmp(sysenv, "MacBookPro1,1", 13) == 0 ||
264 strncmp(sysenv, "MacBookPro1,2", 13) == 0 ||
265 strncmp(sysenv, "MacBookPro3,1", 13) == 0 ||
266 strncmp(sysenv, "MacBookPro4,1", 13) == 0 ||
267 strncmp(sysenv, "Macmini1,1", 10) == 0) {
268 if (bootverbose)
269 printf("Disabling LEGACY_USB_EN bit on "
270 "Intel ICH.\n");
271 outl(ICH_SMI_EN, inl(ICH_SMI_EN) & ~0x8);
272 }
273 freeenv(sysenv);
274 }
275
276 /*
277 * Good {morning,afternoon,evening,night}.
278 */
279 startrtclock();
280 printcpuinfo();
281
282 /*
283 * Display physical memory if SMBIOS reports reasonable amount.
284 */
285 memsize = 0;
286 sysenv = kern_getenv("smbios.memory.enabled");
287 if (sysenv != NULL) {
288 memsize = (uintmax_t)strtoul(sysenv, (char **)NULL, 10) << 10;
289 freeenv(sysenv);
290 }
291 if (memsize < ptoa((uintmax_t)vm_free_count()))
292 memsize = ptoa((uintmax_t)Maxmem);
293 printf("real memory = %ju (%ju MB)\n", memsize, memsize >> 20);
294 realmem = atop(memsize);
295
296 /*
297 * Display any holes after the first chunk of extended memory.
298 */
299 if (bootverbose) {
300 int indx;
301
302 printf("Physical memory chunk(s):\n");
303 for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) {
304 vm_paddr_t size;
305
306 size = phys_avail[indx + 1] - phys_avail[indx];
307 printf(
308 "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n",
309 (uintmax_t)phys_avail[indx],
310 (uintmax_t)phys_avail[indx + 1] - 1,
311 (uintmax_t)size, (uintmax_t)size / PAGE_SIZE);
312 }
313 }
314
315 vm_ksubmap_init(&kmi);
316
317 printf("avail memory = %ju (%ju MB)\n",
318 ptoa((uintmax_t)vm_free_count()),
319 ptoa((uintmax_t)vm_free_count()) / 1048576);
320 #ifdef DEV_PCI
321 if (bootverbose && intel_graphics_stolen_base != 0)
322 printf("intel stolen mem: base %#jx size %ju MB\n",
323 (uintmax_t)intel_graphics_stolen_base,
324 (uintmax_t)intel_graphics_stolen_size / 1024 / 1024);
325 #endif
326
327 /*
328 * Set up buffers, so they can be used to read disk labels.
329 */
330 bufinit();
331 vm_pager_bufferinit();
332
333 cpu_setregs();
334 }
335
336 /*
337 * Send an interrupt to process.
338 *
339 * Stack is set up to allow sigcode stored
340 * at top to call routine, followed by call
341 * to sigreturn routine below. After sigreturn
342 * resets the signal mask, the stack, and the
343 * frame pointer, it returns to the user
344 * specified pc, psl.
345 */
void
sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
{
	struct sigframe sf, *sfp;
	struct pcb *pcb;
	struct proc *p;
	struct thread *td;
	struct sigacts *psp;
	char *sp;
	struct trapframe *regs;
	char *xfpusave;
	size_t xfpusave_len;
	int sig;
	int oonstack;

	td = curthread;
	pcb = td->td_pcb;
	p = td->td_proc;
	PROC_LOCK_ASSERT(p, MA_OWNED);
	sig = ksi->ksi_signo;
	psp = p->p_sigacts;
	mtx_assert(&psp->ps_mtx, MA_OWNED);
	regs = td->td_frame;
	/* Is the thread already running on its alternate signal stack? */
	oonstack = sigonstack(regs->tf_rsp);

	/*
	 * If XSAVE is in use and there is extended FPU state beyond the
	 * legacy savefpu area, stage it in a kernel-stack scratch buffer
	 * so it can be copied out to the user frame below.
	 */
	if (cpu_max_ext_state_size > sizeof(struct savefpu) && use_xsave) {
		xfpusave_len = cpu_max_ext_state_size - sizeof(struct savefpu);
		xfpusave = __builtin_alloca(xfpusave_len);
	} else {
		xfpusave_len = 0;
		xfpusave = NULL;
	}

	/* Save user context. */
	bzero(&sf, sizeof(sf));
	sf.sf_uc.uc_sigmask = *mask;
	sf.sf_uc.uc_stack = td->td_sigstk;
	sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
	    ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE;
	sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0;
	/* The mcontext register block starts at mc_rdi; copy the trapframe. */
	bcopy(regs, &sf.sf_uc.uc_mcontext.mc_rdi, sizeof(*regs));
	sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext); /* magic */
	get_fpcontext(td, &sf.sf_uc.uc_mcontext, xfpusave, xfpusave_len);
	fpstate_drop(td);
	update_pcb_bases(pcb);
	sf.sf_uc.uc_mcontext.mc_fsbase = pcb->pcb_fsbase;
	sf.sf_uc.uc_mcontext.mc_gsbase = pcb->pcb_gsbase;
	/* Zero the spare fields so no kernel stack bytes leak to userland. */
	bzero(sf.sf_uc.uc_mcontext.mc_spare,
	    sizeof(sf.sf_uc.uc_mcontext.mc_spare));
	bzero(sf.sf_uc.__spare__, sizeof(sf.sf_uc.__spare__));

	/* Allocate space for the signal handler context. */
	if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack &&
	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
		/* Use the alternate stack for this signal. */
		sp = (char *)td->td_sigstk.ss_sp + td->td_sigstk.ss_size;
#if defined(COMPAT_43)
		td->td_sigstk.ss_flags |= SS_ONSTACK;
#endif
	} else
		/* Skip the 128-byte amd64 ABI red zone below %rsp. */
		sp = (char *)regs->tf_rsp - 128;
	if (xfpusave != NULL) {
		/* Extended FPU state goes below the frame, 64-byte aligned. */
		sp -= xfpusave_len;
		sp = (char *)((unsigned long)sp & ~0x3Ful);
		sf.sf_uc.uc_mcontext.mc_xfpustate = (register_t)sp;
	}
	sp -= sizeof(struct sigframe);
	/* Align to 16 bytes. */
	sfp = (struct sigframe *)((unsigned long)sp & ~0xFul);

	/* Build the argument list for the signal handler. */
	regs->tf_rdi = sig;			/* arg 1 in %rdi */
	regs->tf_rdx = (register_t)&sfp->sf_uc;	/* arg 3 in %rdx */
	bzero(&sf.sf_si, sizeof(sf.sf_si));
	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
		/* Signal handler installed with SA_SIGINFO. */
		regs->tf_rsi = (register_t)&sfp->sf_si;	/* arg 2 in %rsi */
		sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher;

		/* Fill in POSIX parts */
		sf.sf_si = ksi->ksi_info;
		sf.sf_si.si_signo = sig; /* maybe a translated signal */
		regs->tf_rcx = (register_t)ksi->ksi_addr; /* arg 4 in %rcx */
	} else {
		/* Old FreeBSD-style arguments. */
		regs->tf_rsi = ksi->ksi_code;	/* arg 2 in %rsi */
		regs->tf_rcx = (register_t)ksi->ksi_addr; /* arg 4 in %rcx */
		sf.sf_ahu.sf_handler = catcher;
	}
	/* Drop locks before sleeping in copyout(); reacquired on exit. */
	mtx_unlock(&psp->ps_mtx);
	PROC_UNLOCK(p);

	/*
	 * Copy the sigframe out to the user's stack.
	 */
	if (copyout(&sf, sfp, sizeof(*sfp)) != 0 ||
	    (xfpusave != NULL && copyout(xfpusave,
	    (void *)sf.sf_uc.uc_mcontext.mc_xfpustate, xfpusave_len)
	    != 0)) {
#ifdef DEBUG
		printf("process %ld has trashed its stack\n", (long)p->p_pid);
#endif
		/* Unwritable stack: the process cannot take signals; kill it. */
		PROC_LOCK(p);
		sigexit(td, SIGILL);
	}

	/* Redirect the thread to the handler trampoline in user mode. */
	regs->tf_rsp = (long)sfp;
	regs->tf_rip = p->p_sysent->sv_sigcode_base;
	regs->tf_rflags &= ~(PSL_T | PSL_D);
	regs->tf_cs = _ucodesel;
	regs->tf_ds = _udatasel;
	regs->tf_ss = _udatasel;
	regs->tf_es = _udatasel;
	regs->tf_fs = _ufssel;
	regs->tf_gs = _ugssel;
	regs->tf_flags = TF_HASSEGS;
	PROC_LOCK(p);
	mtx_lock(&psp->ps_mtx);
}
464
465 /*
466 * System call to cleanup state after a signal
467 * has been taken. Reset signal mask and
468 * stack state from context left by sendsig (above).
469 * Return to previous pc and psl as specified by
470 * context left by sendsig. Check carefully to
471 * make sure that the user has not modified the
472 * state to gain improper privileges.
473 *
474 * MPSAFE
475 */
476 int
477 sys_sigreturn(td, uap)
478 struct thread *td;
479 struct sigreturn_args /* {
480 const struct __ucontext *sigcntxp;
481 } */ *uap;
482 {
483 ucontext_t uc;
484 struct pcb *pcb;
485 struct proc *p;
486 struct trapframe *regs;
487 ucontext_t *ucp;
488 char *xfpustate;
489 size_t xfpustate_len;
490 long rflags;
491 int cs, error, ret;
492 ksiginfo_t ksi;
493
494 pcb = td->td_pcb;
495 p = td->td_proc;
496
497 error = copyin(uap->sigcntxp, &uc, sizeof(uc));
498 if (error != 0) {
499 uprintf("pid %d (%s): sigreturn copyin failed\n",
500 p->p_pid, td->td_name);
501 return (error);
502 }
503 ucp = &uc;
504 if ((ucp->uc_mcontext.mc_flags & ~_MC_FLAG_MASK) != 0) {
505 uprintf("pid %d (%s): sigreturn mc_flags %x\n", p->p_pid,
506 td->td_name, ucp->uc_mcontext.mc_flags);
507 return (EINVAL);
508 }
509 regs = td->td_frame;
510 rflags = ucp->uc_mcontext.mc_rflags;
511 /*
512 * Don't allow users to change privileged or reserved flags.
513 */
514 if (!EFL_SECURE(rflags, regs->tf_rflags)) {
515 uprintf("pid %d (%s): sigreturn rflags = 0x%lx\n", p->p_pid,
516 td->td_name, rflags);
517 return (EINVAL);
518 }
519
520 /*
521 * Don't allow users to load a valid privileged %cs. Let the
522 * hardware check for invalid selectors, excess privilege in
523 * other selectors, invalid %eip's and invalid %esp's.
524 */
525 cs = ucp->uc_mcontext.mc_cs;
526 if (!CS_SECURE(cs)) {
527 uprintf("pid %d (%s): sigreturn cs = 0x%x\n", p->p_pid,
528 td->td_name, cs);
529 ksiginfo_init_trap(&ksi);
530 ksi.ksi_signo = SIGBUS;
531 ksi.ksi_code = BUS_OBJERR;
532 ksi.ksi_trapno = T_PROTFLT;
533 ksi.ksi_addr = (void *)regs->tf_rip;
534 trapsignal(td, &ksi);
535 return (EINVAL);
536 }
537
538 if ((uc.uc_mcontext.mc_flags & _MC_HASFPXSTATE) != 0) {
539 xfpustate_len = uc.uc_mcontext.mc_xfpustate_len;
540 if (xfpustate_len > cpu_max_ext_state_size -
541 sizeof(struct savefpu)) {
542 uprintf("pid %d (%s): sigreturn xfpusave_len = 0x%zx\n",
543 p->p_pid, td->td_name, xfpustate_len);
544 return (EINVAL);
545 }
546 xfpustate = __builtin_alloca(xfpustate_len);
547 error = copyin((const void *)uc.uc_mcontext.mc_xfpustate,
548 xfpustate, xfpustate_len);
549 if (error != 0) {
550 uprintf(
551 "pid %d (%s): sigreturn copying xfpustate failed\n",
552 p->p_pid, td->td_name);
553 return (error);
554 }
555 } else {
556 xfpustate = NULL;
557 xfpustate_len = 0;
558 }
559 ret = set_fpcontext(td, &ucp->uc_mcontext, xfpustate, xfpustate_len);
560 if (ret != 0) {
561 uprintf("pid %d (%s): sigreturn set_fpcontext err %d\n",
562 p->p_pid, td->td_name, ret);
563 return (ret);
564 }
565 bcopy(&ucp->uc_mcontext.mc_rdi, regs, sizeof(*regs));
566 update_pcb_bases(pcb);
567 pcb->pcb_fsbase = ucp->uc_mcontext.mc_fsbase;
568 pcb->pcb_gsbase = ucp->uc_mcontext.mc_gsbase;
569
570 #if defined(COMPAT_43)
571 if (ucp->uc_mcontext.mc_onstack & 1)
572 td->td_sigstk.ss_flags |= SS_ONSTACK;
573 else
574 td->td_sigstk.ss_flags &= ~SS_ONSTACK;
575 #endif
576
577 kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0);
578 return (EJUSTRETURN);
579 }
580
581 #ifdef COMPAT_FREEBSD4
/*
 * FreeBSD 4 compatibility sigreturn(2).  The old and current ucontext
 * layouts are compatible on amd64, so simply forward to sys_sigreturn().
 */
int
freebsd4_sigreturn(struct thread *td, struct freebsd4_sigreturn_args *uap)
{

	return (sys_sigreturn(td, (struct sigreturn_args *)uap));
}
588 #endif
589
590 /*
591 * Reset registers to default values on exec.
592 */
593 void
594 exec_setregs(struct thread *td, struct image_params *imgp, u_long stack)
595 {
596 struct trapframe *regs;
597 struct pcb *pcb;
598 register_t saved_rflags;
599
600 regs = td->td_frame;
601 pcb = td->td_pcb;
602
603 if (td->td_proc->p_md.md_ldt != NULL)
604 user_ldt_free(td);
605
606 update_pcb_bases(pcb);
607 pcb->pcb_fsbase = 0;
608 pcb->pcb_gsbase = 0;
609 clear_pcb_flags(pcb, PCB_32BIT);
610 pcb->pcb_initial_fpucw = __INITIAL_FPUCW__;
611
612 saved_rflags = regs->tf_rflags & PSL_T;
613 bzero((char *)regs, sizeof(struct trapframe));
614 regs->tf_rip = imgp->entry_addr;
615 regs->tf_rsp = ((stack - 8) & ~0xFul) + 8;
616 regs->tf_rdi = stack; /* argv */
617 regs->tf_rflags = PSL_USER | saved_rflags;
618 regs->tf_ss = _udatasel;
619 regs->tf_cs = _ucodesel;
620 regs->tf_ds = _udatasel;
621 regs->tf_es = _udatasel;
622 regs->tf_fs = _ufssel;
623 regs->tf_gs = _ugssel;
624 regs->tf_flags = TF_HASSEGS;
625
626 /*
627 * Reset the hardware debug registers if they were in use.
628 * They won't have any meaning for the newly exec'd process.
629 */
630 if (pcb->pcb_flags & PCB_DBREGS) {
631 pcb->pcb_dr0 = 0;
632 pcb->pcb_dr1 = 0;
633 pcb->pcb_dr2 = 0;
634 pcb->pcb_dr3 = 0;
635 pcb->pcb_dr6 = 0;
636 pcb->pcb_dr7 = 0;
637 if (pcb == curpcb) {
638 /*
639 * Clear the debug registers on the running
640 * CPU, otherwise they will end up affecting
641 * the next process we switch to.
642 */
643 reset_dbregs();
644 }
645 clear_pcb_flags(pcb, PCB_DBREGS);
646 }
647
648 /*
649 * Drop the FP state if we hold it, so that the process gets a
650 * clean FP state if it uses the FPU again.
651 */
652 fpstate_drop(td);
653 }
654
655 void
656 cpu_setregs(void)
657 {
658 register_t cr0;
659
660 cr0 = rcr0();
661 /*
662 * CR0_MP, CR0_NE and CR0_TS are also set by npx_probe() for the
663 * BSP. See the comments there about why we set them.
664 */
665 cr0 |= CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM;
666 load_cr0(cr0);
667 }
668
669 /*
670 * Initialize amd64 and configure to run kernel
671 */
672
673 /*
674 * Initialize segments & interrupt table
675 */
676
677 struct user_segment_descriptor gdt[NGDT * MAXCPU];/* global descriptor tables */
678 static struct gate_descriptor idt0[NIDT];
679 struct gate_descriptor *idt = &idt0[0]; /* interrupt descriptor table */
680
681 static char dblfault_stack[PAGE_SIZE] __aligned(16);
682 static char mce0_stack[PAGE_SIZE] __aligned(16);
683 static char nmi0_stack[PAGE_SIZE] __aligned(16);
684 static char dbg0_stack[PAGE_SIZE] __aligned(16);
685 CTASSERT(sizeof(struct nmi_pcpu) == 16);
686
687 struct amd64tss common_tss[MAXCPU];
688
689 /*
690 * Software prototypes -- in more palatable form.
691 *
692 * Keep GUFS32, GUGS32, GUCODE32 and GUDATA at the same
693 * slots as corresponding segments for i386 kernel.
694 */
struct soft_segment_descriptor gdt_segs[] = {
/* GNULL_SEL	0 Null Descriptor */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* GNULL2_SEL	1 Null Descriptor */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* GUFS32_SEL	2 32 bit %gs Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GUGS32_SEL	3 32 bit %fs Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GCODE_SEL	4 Code Descriptor for kernel (long mode) */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMERA,
	.ssd_dpl = SEL_KPL,
	.ssd_p = 1,
	.ssd_long = 1,
	.ssd_def32 = 0,
	.ssd_gran = 1		},
/* GDATA_SEL	5 Data Descriptor for kernel */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_KPL,
	.ssd_p = 1,
	.ssd_long = 1,
	.ssd_def32 = 0,
	.ssd_gran = 1		},
/* GUCODE32_SEL	6 32 bit Code Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMERA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GUDATA_SEL	7 32/64 bit Data Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GUCODE_SEL	8 64 bit Code Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMERA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 1,
	.ssd_def32 = 0,
	.ssd_gran = 1		},
/* GPROC0_SEL	9 Proc 0 Tss Descriptor */
{	.ssd_base = 0x0,
	.ssd_limit = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE - 1,
	.ssd_type = SDT_SYSTSS,
	.ssd_dpl = SEL_KPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* Actually, the TSS is a system descriptor which is double size */
/* Slot 10: upper half of the 16-byte long-mode TSS descriptor above */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* GUSERLDT_SEL	11 LDT Descriptor */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* GUSERLDT_SEL	12 LDT Descriptor, double size (upper half, like the TSS) */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
};
814
815 void
816 setidt(int idx, inthand_t *func, int typ, int dpl, int ist)
817 {
818 struct gate_descriptor *ip;
819
820 ip = idt + idx;
821 ip->gd_looffset = (uintptr_t)func;
822 ip->gd_selector = GSEL(GCODE_SEL, SEL_KPL);
823 ip->gd_ist = ist;
824 ip->gd_xx = 0;
825 ip->gd_type = typ;
826 ip->gd_dpl = dpl;
827 ip->gd_p = 1;
828 ip->gd_hioffset = ((uintptr_t)func)>>16 ;
829 }
830
831 extern inthand_t
832 IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
833 IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
834 IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
835 IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align),
836 IDTVEC(xmm), IDTVEC(dblfault),
837 IDTVEC(div_pti), IDTVEC(bpt_pti),
838 IDTVEC(ofl_pti), IDTVEC(bnd_pti), IDTVEC(ill_pti), IDTVEC(dna_pti),
839 IDTVEC(fpusegm_pti), IDTVEC(tss_pti), IDTVEC(missing_pti),
840 IDTVEC(stk_pti), IDTVEC(prot_pti), IDTVEC(page_pti),
841 IDTVEC(rsvd_pti), IDTVEC(fpu_pti), IDTVEC(align_pti),
842 IDTVEC(xmm_pti),
843 #ifdef KDTRACE_HOOKS
844 IDTVEC(dtrace_ret), IDTVEC(dtrace_ret_pti),
845 #endif
846 #ifdef XENHVM
847 IDTVEC(xen_intr_upcall), IDTVEC(xen_intr_upcall_pti),
848 #endif
849 IDTVEC(fast_syscall), IDTVEC(fast_syscall32),
850 IDTVEC(fast_syscall_pti);
851
852 #ifdef DDB
853 /*
854 * Display the index and function name of any IDT entries that don't use
855 * the default 'rsvd' entry point.
856 */
857 DB_SHOW_COMMAND(idt, db_show_idt)
858 {
859 struct gate_descriptor *ip;
860 int idx;
861 uintptr_t func;
862
863 ip = idt;
864 for (idx = 0; idx < NIDT && !db_pager_quit; idx++) {
865 func = ((long)ip->gd_hioffset << 16 | ip->gd_looffset);
866 if (func != (uintptr_t)&IDTVEC(rsvd)) {
867 db_printf("%3d\t", idx);
868 db_printsym(func, DB_STGY_PROC);
869 db_printf("\n");
870 }
871 ip++;
872 }
873 }
874
875 /* Show privileged registers. */
/* Show privileged registers. */
DB_SHOW_COMMAND(sysregs, db_show_sysregs)
{
	struct {
		uint16_t limit;
		uint64_t base;
	} __packed idtr, gdtr;
	uint16_t ldt, tr;

	/* Descriptor-table registers are read with store instructions. */
	__asm __volatile("sidt %0" : "=m" (idtr));
	db_printf("idtr\t0x%016lx/%04x\n",
	    (u_long)idtr.base, (u_int)idtr.limit);
	__asm __volatile("sgdt %0" : "=m" (gdtr));
	db_printf("gdtr\t0x%016lx/%04x\n",
	    (u_long)gdtr.base, (u_int)gdtr.limit);
	__asm __volatile("sldt %0" : "=r" (ldt));
	db_printf("ldtr\t0x%04x\n", ldt);
	__asm __volatile("str %0" : "=r" (tr));
	db_printf("tr\t0x%04x\n", tr);
	db_printf("cr0\t0x%016lx\n", rcr0());
	db_printf("cr2\t0x%016lx\n", rcr2());
	db_printf("cr3\t0x%016lx\n", rcr3());
	db_printf("cr4\t0x%016lx\n", rcr4());
	/* %xcr0 only exists (and is readable) when CR4.OSXSAVE is set. */
	if (rcr4() & CR4_XSAVE)
		db_printf("xcr0\t0x%016lx\n", rxcr(0));
	db_printf("EFER\t0x%016lx\n", rdmsr(MSR_EFER));
	/* The feature-control MSR is only defined on VMX/SMX-capable CPUs. */
	if (cpu_feature2 & (CPUID2_VMX | CPUID2_SMX))
		db_printf("FEATURES_CTL\t%016lx\n",
		    rdmsr(MSR_IA32_FEATURE_CONTROL));
	db_printf("DEBUG_CTL\t0x%016lx\n", rdmsr(MSR_DEBUGCTLMSR));
	db_printf("PAT\t0x%016lx\n", rdmsr(MSR_PAT));
	db_printf("GSBASE\t0x%016lx\n", rdmsr(MSR_GSBASE));
}
908
/* Show the hardware debug registers (dr4/dr5 are aliases and omitted). */
DB_SHOW_COMMAND(dbregs, db_show_dbregs)
{

	db_printf("dr0\t0x%016lx\n", rdr0());
	db_printf("dr1\t0x%016lx\n", rdr1());
	db_printf("dr2\t0x%016lx\n", rdr2());
	db_printf("dr3\t0x%016lx\n", rdr3());
	db_printf("dr6\t0x%016lx\n", rdr6());
	db_printf("dr7\t0x%016lx\n", rdr7());
}
919 #endif
920
921 void
922 sdtossd(sd, ssd)
923 struct user_segment_descriptor *sd;
924 struct soft_segment_descriptor *ssd;
925 {
926
927 ssd->ssd_base = (sd->sd_hibase << 24) | sd->sd_lobase;
928 ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit;
929 ssd->ssd_type = sd->sd_type;
930 ssd->ssd_dpl = sd->sd_dpl;
931 ssd->ssd_p = sd->sd_p;
932 ssd->ssd_long = sd->sd_long;
933 ssd->ssd_def32 = sd->sd_def32;
934 ssd->ssd_gran = sd->sd_gran;
935 }
936
937 void
938 ssdtosd(ssd, sd)
939 struct soft_segment_descriptor *ssd;
940 struct user_segment_descriptor *sd;
941 {
942
943 sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
944 sd->sd_hibase = (ssd->ssd_base >> 24) & 0xff;
945 sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
946 sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
947 sd->sd_type = ssd->ssd_type;
948 sd->sd_dpl = ssd->ssd_dpl;
949 sd->sd_p = ssd->ssd_p;
950 sd->sd_long = ssd->ssd_long;
951 sd->sd_def32 = ssd->ssd_def32;
952 sd->sd_gran = ssd->ssd_gran;
953 }
954
955 void
956 ssdtosyssd(ssd, sd)
957 struct soft_segment_descriptor *ssd;
958 struct system_segment_descriptor *sd;
959 {
960
961 sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
962 sd->sd_hibase = (ssd->ssd_base >> 24) & 0xfffffffffful;
963 sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
964 sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
965 sd->sd_type = ssd->ssd_type;
966 sd->sd_dpl = ssd->ssd_dpl;
967 sd->sd_p = ssd->ssd_p;
968 sd->sd_gran = ssd->ssd_gran;
969 }
970
971 #if !defined(DEV_ATPIC) && defined(DEV_ISA)
972 #include <isa/isavar.h>
973 #include <isa/isareg.h>
974 /*
975 * Return a bitmap of the current interrupt requests. This is 8259-specific
976 * and is only suitable for use at probe time.
977 * This is only here to pacify sio. It is NOT FATAL if this doesn't work.
978 * It shouldn't be here. There should probably be an APIC centric
979 * implementation in the apic driver code, if at all.
980 */
981 intrmask_t
982 isa_irq_pending(void)
983 {
984 u_char irr1;
985 u_char irr2;
986
987 irr1 = inb(IO_ICU1);
988 irr2 = inb(IO_ICU2);
989 return ((irr2 << 8) | irr1);
990 }
991 #endif
992
993 u_int basemem;
994
/*
 * Insert the physical memory range [base, base + length) into the
 * physmap array (pairs of start/end addresses, kept sorted and
 * non-overlapping), coalescing with adjacent entries when possible.
 *
 * Returns 1 on success (including the benign zero-length and overlap
 * cases, which are ignored) and 0 when the map is full and no further
 * entries can be accepted.
 */
static int
add_physmap_entry(uint64_t base, uint64_t length, vm_paddr_t *physmap,
    int *physmap_idxp)
{
	int i, insert_idx, physmap_idx;

	physmap_idx = *physmap_idxp;

	if (length == 0)
		return (1);

	/*
	 * Find insertion point while checking for overlap.  Start off by
	 * assuming the new entry will be added to the end.
	 *
	 * NB: physmap_idx points to the next free slot.
	 */
	insert_idx = physmap_idx;
	for (i = 0; i <= physmap_idx; i += 2) {
		if (base < physmap[i + 1]) {
			if (base + length <= physmap[i]) {
				/* Entirely below entry i: insert here. */
				insert_idx = i;
				break;
			}
			/* Ranges intersect: keep the first one seen. */
			if (boothowto & RB_VERBOSE)
				printf(
		    "Overlapping memory regions, ignoring second region\n");
			return (1);
		}
	}

	/* See if we can prepend to the next entry. */
	if (insert_idx <= physmap_idx && base + length == physmap[insert_idx]) {
		physmap[insert_idx] = base;
		return (1);
	}

	/* See if we can append to the previous entry. */
	if (insert_idx > 0 && base == physmap[insert_idx - 1]) {
		physmap[insert_idx - 1] += length;
		return (1);
	}

	/* No coalescing possible; a new pair of slots is needed. */
	physmap_idx += 2;
	*physmap_idxp = physmap_idx;
	if (physmap_idx == PHYSMAP_SIZE) {
		printf(
		"Too many segments in the physical address map, giving up\n");
		return (0);
	}

	/*
	 * Move the last 'N' entries down to make room for the new
	 * entry if needed.
	 */
	for (i = (physmap_idx - 2); i > insert_idx; i -= 2) {
		physmap[i] = physmap[i - 2];
		physmap[i + 1] = physmap[i - 1];
	}

	/* Insert the new entry. */
	physmap[insert_idx] = base;
	physmap[insert_idx + 1] = base + length;
	return (1);
}
1060
1061 void
1062 bios_add_smap_entries(struct bios_smap *smapbase, u_int32_t smapsize,
1063 vm_paddr_t *physmap, int *physmap_idx)
1064 {
1065 struct bios_smap *smap, *smapend;
1066
1067 smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize);
1068
1069 for (smap = smapbase; smap < smapend; smap++) {
1070 if (boothowto & RB_VERBOSE)
1071 printf("SMAP type=%02x base=%016lx len=%016lx\n",
1072 smap->type, smap->base, smap->length);
1073
1074 if (smap->type != SMAP_TYPE_MEMORY)
1075 continue;
1076
1077 if (!add_physmap_entry(smap->base, smap->length, physmap,
1078 physmap_idx))
1079 break;
1080 }
1081 }
1082
/*
 * Walk the array of UEFI memory descriptors handed to the kernel by the
 * loader and add every range the kernel may treat as ordinary RAM to the
 * physmap[] base/bound array.  On a verbose boot, also print a
 * human-readable dump of every descriptor and its attribute bits.
 */
static void
add_efi_map_entries(struct efi_map_header *efihdr, vm_paddr_t *physmap,
    int *physmap_idx)
{
	struct efi_md *map, *p;
	const char *type;
	size_t efisz;
	int ndesc, i;

	/* Printable names indexed by the EFI_MD_TYPE_* descriptor type. */
	static const char *types[] = {
		"Reserved",
		"LoaderCode",
		"LoaderData",
		"BootServicesCode",
		"BootServicesData",
		"RuntimeServicesCode",
		"RuntimeServicesData",
		"ConventionalMemory",
		"UnusableMemory",
		"ACPIReclaimMemory",
		"ACPIMemoryNVS",
		"MemoryMappedIO",
		"MemoryMappedIOPortSpace",
		"PalCode",
		"PersistentMemory"
	};

	/*
	 * Memory map data provided by UEFI via the GetMemoryMap
	 * Boot Services API.
	 */
	/* The descriptor array follows the header, rounded up to 16 bytes. */
	efisz = (sizeof(struct efi_map_header) + 0xf) & ~0xf;
	map = (struct efi_md *)((uint8_t *)efihdr + efisz);

	/* Guard the division below against a corrupt/empty header. */
	if (efihdr->descriptor_size == 0)
		return;
	ndesc = efihdr->memory_size / efihdr->descriptor_size;

	if (boothowto & RB_VERBOSE)
		printf("%23s %12s %12s %8s %4s\n",
		    "Type", "Physical", "Virtual", "#Pages", "Attr");

	/*
	 * Step by efihdr->descriptor_size, not sizeof(*p): firmware may use
	 * a larger descriptor than the struct we know about.
	 */
	for (i = 0, p = map; i < ndesc; i++,
	    p = efi_next_descriptor(p, efihdr->descriptor_size)) {
		if (boothowto & RB_VERBOSE) {
			if (p->md_type < nitems(types))
				type = types[p->md_type];
			else
				type = "<INVALID>";
			printf("%23s %012lx %12p %08lx ", type, p->md_phys,
			    p->md_virt, p->md_pages);
			if (p->md_attr & EFI_MD_ATTR_UC)
				printf("UC ");
			if (p->md_attr & EFI_MD_ATTR_WC)
				printf("WC ");
			if (p->md_attr & EFI_MD_ATTR_WT)
				printf("WT ");
			if (p->md_attr & EFI_MD_ATTR_WB)
				printf("WB ");
			if (p->md_attr & EFI_MD_ATTR_UCE)
				printf("UCE ");
			if (p->md_attr & EFI_MD_ATTR_WP)
				printf("WP ");
			if (p->md_attr & EFI_MD_ATTR_RP)
				printf("RP ");
			if (p->md_attr & EFI_MD_ATTR_XP)
				printf("XP ");
			if (p->md_attr & EFI_MD_ATTR_NV)
				printf("NV ");
			if (p->md_attr & EFI_MD_ATTR_MORE_RELIABLE)
				printf("MORE_RELIABLE ");
			if (p->md_attr & EFI_MD_ATTR_RO)
				printf("RO ");
			if (p->md_attr & EFI_MD_ATTR_RT)
				printf("RUNTIME");
			printf("\n");
		}

		switch (p->md_type) {
		case EFI_MD_TYPE_CODE:
		case EFI_MD_TYPE_DATA:
		case EFI_MD_TYPE_BS_CODE:
		case EFI_MD_TYPE_BS_DATA:
		case EFI_MD_TYPE_FREE:
			/*
			 * We're allowed to use any entry with these types.
			 */
			break;
		default:
			continue;
		}

		if (!add_physmap_entry(p->md_phys, (p->md_pages * PAGE_SIZE),
		    physmap, physmap_idx))
			break;
	}
}
1180
/*
 * "BIOS" or "UEFI", recorded by native_parse_memmap(); exported
 * read-only as machdep.bootmethod.
 */
static char bootmethod[16] = "";
SYSCTL_STRING(_machdep, OID_AUTO, bootmethod, CTLFLAG_RD, bootmethod, 0,
    "System firmware boot method");
1184
1185 static void
1186 native_parse_memmap(caddr_t kmdp, vm_paddr_t *physmap, int *physmap_idx)
1187 {
1188 struct bios_smap *smap;
1189 struct efi_map_header *efihdr;
1190 u_int32_t size;
1191
1192 /*
1193 * Memory map from INT 15:E820.
1194 *
1195 * subr_module.c says:
1196 * "Consumer may safely assume that size value precedes data."
1197 * ie: an int32_t immediately precedes smap.
1198 */
1199
1200 efihdr = (struct efi_map_header *)preload_search_info(kmdp,
1201 MODINFO_METADATA | MODINFOMD_EFI_MAP);
1202 smap = (struct bios_smap *)preload_search_info(kmdp,
1203 MODINFO_METADATA | MODINFOMD_SMAP);
1204 if (efihdr == NULL && smap == NULL)
1205 panic("No BIOS smap or EFI map info from loader!");
1206
1207 if (efihdr != NULL) {
1208 add_efi_map_entries(efihdr, physmap, physmap_idx);
1209 strlcpy(bootmethod, "UEFI", sizeof(bootmethod));
1210 } else {
1211 size = *((u_int32_t *)smap - 1);
1212 bios_add_smap_entries(smap, size, physmap, physmap_idx);
1213 strlcpy(bootmethod, "BIOS", sizeof(bootmethod));
1214 }
1215 }
1216
/* Physical pages per gigabyte; used for the memtest progress dots. */
#define PAGES_PER_GB (1024 * 1024 * 1024 / PAGE_SIZE)
1218
1219 /*
1220 * Populate the (physmap) array with base/bound pairs describing the
1221 * available physical memory in the system, then test this memory and
1222 * build the phys_avail array describing the actually-available memory.
1223 *
1224 * Total memory size may be set by the kernel environment variable
1225 * hw.physmem or the compile-time define MAXMEM.
1226 *
1227 * XXX first should be vm_paddr_t.
1228 */
/*
 * See the block comment above: build physmap[] from the firmware memory
 * map, apply MAXMEM / hw.physmem limits, optionally memory-test each
 * page, and produce phys_avail[] / dump_avail[], Maxmem and physmem.
 */
static void
getmemsize(caddr_t kmdp, u_int64_t first)
{
	int i, physmap_idx, pa_indx, da_indx;
	vm_paddr_t pa, physmap[PHYSMAP_SIZE];
	u_long physmem_start, physmem_tunable, memtest;
	pt_entry_t *pte;
	quad_t dcons_addr, dcons_size;
	int page_counter;

	/*
	 * Tell the physical memory allocator about pages used to store
	 * the kernel and preloaded data. See kmem_bootstrap_free().
	 */
	vm_phys_add_seg((vm_paddr_t)kernphys, trunc_page(first));

	bzero(physmap, sizeof(physmap));
	physmap_idx = 0;

	init_ops.parse_memmap(kmdp, physmap, &physmap_idx);
	/*
	 * parse_memmap leaves physmap_idx at the next free slot; step
	 * back so it indexes the base of the last base/bound pair.
	 */
	physmap_idx -= 2;

	/*
	 * Find the 'base memory' segment for SMP
	 */
	basemem = 0;
	for (i = 0; i <= physmap_idx; i += 2) {
		if (physmap[i] <= 0xA0000) {
			/* physmap entries are bytes; basemem is in KB. */
			basemem = physmap[i + 1] / 1024;
			break;
		}
	}
	if (basemem == 0 || basemem > 640) {
		if (bootverbose)
			printf(
		"Memory map doesn't contain a basemem segment, faking it");
		basemem = 640;
	}

	/*
	 * Maxmem isn't the "maximum memory", it's one larger than the
	 * highest page of the physical address space. It should be
	 * called something like "Maxphyspage". We may adjust this
	 * based on ``hw.physmem'' and the results of the memory test.
	 */
	Maxmem = atop(physmap[physmap_idx + 1]);

#ifdef MAXMEM
	/* MAXMEM is specified in kilobytes; convert to pages. */
	Maxmem = MAXMEM / 4;
#endif

	if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable))
		Maxmem = atop(physmem_tunable);

	/*
	 * The boot memory test is disabled by default, as it takes a
	 * significant amount of time on large-memory systems, and is
	 * unfriendly to virtual machines as it unnecessarily touches all
	 * pages.
	 *
	 * A general name is used as the code may be extended to support
	 * additional tests beyond the current "page present" test.
	 */
	memtest = 0;
	TUNABLE_ULONG_FETCH("hw.memtest.tests", &memtest);

	/*
	 * Don't allow MAXMEM or hw.physmem to extend the amount of memory
	 * in the system.
	 */
	if (Maxmem > atop(physmap[physmap_idx + 1]))
		Maxmem = atop(physmap[physmap_idx + 1]);

	if (atop(physmap[physmap_idx + 1]) != Maxmem &&
	    (boothowto & RB_VERBOSE))
		printf("Physical memory use set to %ldK\n", Maxmem * 4);

	/*
	 * Make hole for "AP -> long mode" bootstrap code. The
	 * mp_bootaddress vector is only available when the kernel
	 * is configured to support APs and APs for the system start
	 * in real mode mode (e.g. SMP bare metal).
	 */
	if (init_ops.mp_bootaddress)
		init_ops.mp_bootaddress(physmap, &physmap_idx);

	/* call pmap initialization to make new kernel address space */
	pmap_bootstrap(&first);

	/*
	 * Size up each available chunk of physical memory.
	 *
	 * XXX Some BIOSes corrupt low 64KB between suspend and resume.
	 * By default, mask off the first 16 pages unless we appear to be
	 * running in a VM.
	 */
	physmem_start = (vm_guest > VM_GUEST_NO ? 1 : 16) << PAGE_SHIFT;
	TUNABLE_ULONG_FETCH("hw.physmem.start", &physmem_start);
	if (physmap[0] < physmem_start) {
		if (physmem_start < PAGE_SIZE)
			/* Never hand out page 0. */
			physmap[0] = PAGE_SIZE;
		else if (physmem_start >= physmap[1])
			/* The whole first segment is masked; keep one page. */
			physmap[0] = round_page(physmap[1] - PAGE_SIZE);
		else
			physmap[0] = round_page(physmem_start);
	}
	pa_indx = 0;
	da_indx = 1;
	phys_avail[pa_indx++] = physmap[0];
	phys_avail[pa_indx] = physmap[0];
	dump_avail[da_indx] = physmap[0];
	/* Scratch PTE used to map each page under test at CADDR1. */
	pte = CMAP1;

	/*
	 * Get dcons buffer address
	 */
	if (getenv_quad("dcons.addr", &dcons_addr) == 0 ||
	    getenv_quad("dcons.size", &dcons_size) == 0)
		dcons_addr = 0;

	/*
	 * physmap is in bytes, so when converting to page boundaries,
	 * round up the start address and round down the end address.
	 */
	page_counter = 0;
	if (memtest != 0)
		printf("Testing system memory");
	for (i = 0; i <= physmap_idx; i += 2) {
		vm_paddr_t end;

		end = ptoa((vm_paddr_t)Maxmem);
		if (physmap[i + 1] < end)
			end = trunc_page(physmap[i + 1]);
		for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) {
			int tmp, page_bad, full;
			int *ptr = (int *)CADDR1;

			full = FALSE;
			/*
			 * block out kernel memory as not available.
			 */
			if (pa >= (vm_paddr_t)kernphys && pa < first)
				goto do_dump_avail;

			/*
			 * block out dcons buffer
			 */
			if (dcons_addr > 0
			    && pa >= trunc_page(dcons_addr)
			    && pa < dcons_addr + dcons_size)
				goto do_dump_avail;

			page_bad = FALSE;
			if (memtest == 0)
				goto skip_memtest;

			/*
			 * Print a "." every GB to show we're making
			 * progress.
			 */
			page_counter++;
			if ((page_counter % PAGES_PER_GB) == 0)
				printf(".");

			/*
			 * map page into kernel: valid, read/write,non-cacheable
			 */
			*pte = pa | PG_V | PG_RW | PG_NC_PWT | PG_NC_PCD;
			invltlb();

			tmp = *(int *)ptr;
			/*
			 * Test for alternating 1's and 0's
			 */
			*(volatile int *)ptr = 0xaaaaaaaa;
			if (*(volatile int *)ptr != 0xaaaaaaaa)
				page_bad = TRUE;
			/*
			 * Test for alternating 0's and 1's
			 */
			*(volatile int *)ptr = 0x55555555;
			if (*(volatile int *)ptr != 0x55555555)
				page_bad = TRUE;
			/*
			 * Test for all 1's
			 */
			*(volatile int *)ptr = 0xffffffff;
			if (*(volatile int *)ptr != 0xffffffff)
				page_bad = TRUE;
			/*
			 * Test for all 0's
			 */
			*(volatile int *)ptr = 0x0;
			if (*(volatile int *)ptr != 0x0)
				page_bad = TRUE;
			/*
			 * Restore original value.
			 */
			*(int *)ptr = tmp;

skip_memtest:
			/*
			 * Adjust array of valid/good pages.
			 */
			if (page_bad == TRUE)
				continue;
			/*
			 * If this good page is a continuation of the
			 * previous set of good pages, then just increase
			 * the end pointer. Otherwise start a new chunk.
			 * Note that "end" points one higher than end,
			 * making the range >= start and < end.
			 * If we're also doing a speculative memory
			 * test and we at or past the end, bump up Maxmem
			 * so that we keep going. The first bad page
			 * will terminate the loop.
			 */
			if (phys_avail[pa_indx] == pa) {
				phys_avail[pa_indx] += PAGE_SIZE;
			} else {
				pa_indx++;
				if (pa_indx == PHYS_AVAIL_ARRAY_END) {
					printf(
		"Too many holes in the physical address space, giving up\n");
					pa_indx--;
					full = TRUE;
					goto do_dump_avail;
				}
				phys_avail[pa_indx++] = pa;	/* start */
				phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */
			}
			physmem++;
do_dump_avail:
			/* dump_avail also covers pages withheld from VM use. */
			if (dump_avail[da_indx] == pa) {
				dump_avail[da_indx] += PAGE_SIZE;
			} else {
				da_indx++;
				if (da_indx == DUMP_AVAIL_ARRAY_END) {
					da_indx--;
					goto do_next;
				}
				dump_avail[da_indx++] = pa; /* start */
				dump_avail[da_indx] = pa + PAGE_SIZE; /* end */
			}
do_next:
			if (full)
				break;
		}
	}
	/* Tear down the scratch mapping used by the memory test. */
	*pte = 0;
	invltlb();
	if (memtest != 0)
		printf("\n");

	/*
	 * XXX
	 * The last chunk must contain at least one page plus the message
	 * buffer to avoid complicating other code (message buffer address
	 * calculation, etc.).
	 */
	while (phys_avail[pa_indx - 1] + PAGE_SIZE +
	    round_page(msgbufsize) >= phys_avail[pa_indx]) {
		physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]);
		phys_avail[pa_indx--] = 0;
		phys_avail[pa_indx--] = 0;
	}

	Maxmem = atop(phys_avail[pa_indx]);

	/* Trim off space for the message buffer. */
	phys_avail[pa_indx] -= round_page(msgbufsize);

	/* Map the message buffer. */
	msgbufp = (struct msgbuf *)PHYS_TO_DMAP(phys_avail[pa_indx]);
}
1504
/*
 * Locate the loader-supplied preload metadata and use it to initialize
 * boothowto, the static kernel environment, the DDB symbol table (when
 * compiled in) and the EFI system table pointer.  Returns the "elf
 * kernel" module metadata handle for later queries.
 */
static caddr_t
native_parse_preload_data(u_int64_t modulep)
{
	caddr_t kmdp;
	char *envp;
#ifdef DDB
	vm_offset_t ksym_start;
	vm_offset_t ksym_end;
#endif

	/* The loader passes a physical address; rebase into the KVA. */
	preload_metadata = (caddr_t)(uintptr_t)(modulep + KERNBASE);
	preload_bootstrap_relocate(KERNBASE);
	kmdp = preload_search_by_type("elf kernel");
	if (kmdp == NULL)
		kmdp = preload_search_by_type("elf64 kernel");
	boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int);
	envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *);
	if (envp != NULL)
		envp += KERNBASE;	/* same physical -> KVA rebase */
	init_static_kenv(envp, 0);
#ifdef DDB
	ksym_start = MD_FETCH(kmdp, MODINFOMD_SSYM, uintptr_t);
	ksym_end = MD_FETCH(kmdp, MODINFOMD_ESYM, uintptr_t);
	db_fetch_ksymtab(ksym_start, ksym_end);
#endif
	efi_systbl_phys = MD_FETCH(kmdp, MODINFOMD_FW_HANDLE, vm_paddr_t);

	return (kmdp);
}
1534
/*
 * Initialize the kernel debugger framework and, if the boot flags
 * requested it (boot -d), drop into the debugger immediately.
 */
static void
amd64_kdb_init(void)
{
	kdb_init();
#ifdef KDB
	if (boothowto & RB_KDB)
		kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger");
#endif
}
1544
1545 /* Set up the fast syscall stuff */
1546 void
1547 amd64_conf_fast_syscall(void)
1548 {
1549 uint64_t msr;
1550
1551 msr = rdmsr(MSR_EFER) | EFER_SCE;
1552 wrmsr(MSR_EFER, msr);
1553 wrmsr(MSR_LSTAR, pti ? (u_int64_t)IDTVEC(fast_syscall_pti) :
1554 (u_int64_t)IDTVEC(fast_syscall));
1555 wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32));
1556 msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
1557 ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48);
1558 wrmsr(MSR_STAR, msr);
1559 wrmsr(MSR_SF_MASK, PSL_NT | PSL_T | PSL_I | PSL_C | PSL_D | PSL_AC);
1560 }
1561
/*
 * amd64 early machine-dependent initialization, called from locore
 * with the module pointer and the first free physical address.  Sets
 * up the CPU (GDT/IDT/TSS, fast syscall MSRs), per-CPU data, thread0,
 * the memory map, the console and the debugger.  The sequence below is
 * strictly order-dependent.  Returns the initial kernel stack pointer
 * (thread0's PCB address) for locore to switch onto.
 */
u_int64_t
hammer_time(u_int64_t modulep, u_int64_t physfree)
{
	caddr_t kmdp;
	int gsel_tss, x;
	struct pcpu *pc;
	struct nmi_pcpu *np;
	struct xstate_hdr *xhdr;
	u_int64_t rsp0;
	char *env;
	size_t kstack0_sz;
	int late_console;

	TSRAW(&thread0, TS_ENTER, __func__, NULL);

	kmdp = init_ops.parse_preload_data(modulep);

	/* Load microcode before anything queries CPU features. */
	physfree += ucode_load_bsp(physfree + KERNBASE);
	physfree = roundup2(physfree, PAGE_SIZE);

	identify_cpu1();
	identify_hypervisor();
	identify_cpu2();
	initializecpucache();

	/*
	 * Check for pti, pcid, and invpcid before ifuncs are
	 * resolved, to correctly select the implementation for
	 * pmap_activate_sw_mode().
	 */
	pti = pti_get_default();
	TUNABLE_INT_FETCH("vm.pmap.pti", &pti);
	TUNABLE_INT_FETCH("vm.pmap.pcid_enabled", &pmap_pcid_enabled);
	if ((cpu_feature2 & CPUID2_PCID) != 0 && pmap_pcid_enabled) {
		invpcid_works = (cpu_stdext_feature &
		    CPUID_STDEXT_INVPCID) != 0;
	} else {
		pmap_pcid_enabled = 0;
	}

	link_elf_ireloc(kmdp);

	/*
	 * This may be done better later if it gets more high level
	 * components in it. If so just link td->td_proc here.
	 */
	proc_linkup0(&proc0, &thread0);

	/* Init basic tunables, hz etc */
	init_param1();

	/* Carve thread0's kernel stack out of the free physical space. */
	thread0.td_kstack = physfree + KERNBASE;
	thread0.td_kstack_pages = kstack_pages;
	kstack0_sz = thread0.td_kstack_pages * PAGE_SIZE;
	bzero((void *)thread0.td_kstack, kstack0_sz);
	physfree += kstack0_sz;

	/*
	 * make gdt memory segments
	 */
	for (x = 0; x < NGDT; x++) {
		/* System (TSS/LDT) descriptors are 16 bytes; skip them. */
		if (x != GPROC0_SEL && x != (GPROC0_SEL + 1) &&
		    x != GUSERLDT_SEL && x != (GUSERLDT_SEL) + 1)
			ssdtosd(&gdt_segs[x], &gdt[x]);
	}
	gdt_segs[GPROC0_SEL].ssd_base = (uintptr_t)&common_tss[0];
	ssdtosyssd(&gdt_segs[GPROC0_SEL],
	    (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);

	r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
	r_gdt.rd_base = (long) gdt;
	lgdt(&r_gdt);
	pc = &__pcpu[0];

	wrmsr(MSR_FSBASE, 0);		/* User value */
	wrmsr(MSR_GSBASE, (u_int64_t)pc);
	wrmsr(MSR_KGSBASE, 0);		/* User value while in the kernel */

	pcpu_init(pc, 0, sizeof(struct pcpu));
	dpcpu_init((void *)(physfree + KERNBASE), 0);
	physfree += DPCPU_SIZE;
	PCPU_SET(prvspace, pc);
	PCPU_SET(curthread, &thread0);
	/* Non-late cninit() and printf() can be moved up to here. */
	PCPU_SET(tssp, &common_tss[0]);
	PCPU_SET(commontssp, &common_tss[0]);
	PCPU_SET(tss, (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);
	PCPU_SET(ldt, (struct system_segment_descriptor *)&gdt[GUSERLDT_SEL]);
	PCPU_SET(fs32p, &gdt[GUFS32_SEL]);
	PCPU_SET(gs32p, &gdt[GUGS32_SEL]);

	/*
	 * Initialize mutexes.
	 *
	 * icu_lock: in order to allow an interrupt to occur in a critical
	 *	     section, to set pcpu->ipending (etc...) properly, we
	 *	     must be able to get the icu lock, so it can't be
	 *	     under witness.
	 */
	mutex_init();
	mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS);
	mtx_init(&dt_lock, "descriptor tables", NULL, MTX_DEF);

	/* exceptions */
	/* Default every vector to "reserved", then fill in the real ones. */
	for (x = 0; x < NIDT; x++)
		setidt(x, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd), SDT_SYSIGT,
		    SEL_KPL, 0);
	setidt(IDT_DE, pti ? &IDTVEC(div_pti) : &IDTVEC(div), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_DB, &IDTVEC(dbg), SDT_SYSIGT, SEL_KPL, 4);
	setidt(IDT_NMI, &IDTVEC(nmi), SDT_SYSIGT, SEL_KPL, 2);
	setidt(IDT_BP, pti ? &IDTVEC(bpt_pti) : &IDTVEC(bpt), SDT_SYSIGT,
	    SEL_UPL, 0);
	setidt(IDT_OF, pti ? &IDTVEC(ofl_pti) : &IDTVEC(ofl), SDT_SYSIGT,
	    SEL_UPL, 0);
	setidt(IDT_BR, pti ? &IDTVEC(bnd_pti) : &IDTVEC(bnd), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_UD, pti ? &IDTVEC(ill_pti) : &IDTVEC(ill), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_NM, pti ? &IDTVEC(dna_pti) : &IDTVEC(dna), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1);
	setidt(IDT_FPUGP, pti ? &IDTVEC(fpusegm_pti) : &IDTVEC(fpusegm),
	    SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_TS, pti ? &IDTVEC(tss_pti) : &IDTVEC(tss), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_NP, pti ? &IDTVEC(missing_pti) : &IDTVEC(missing),
	    SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_SS, pti ? &IDTVEC(stk_pti) : &IDTVEC(stk), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_GP, pti ? &IDTVEC(prot_pti) : &IDTVEC(prot), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_PF, pti ? &IDTVEC(page_pti) : &IDTVEC(page), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_MF, pti ? &IDTVEC(fpu_pti) : &IDTVEC(fpu), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_AC, pti ? &IDTVEC(align_pti) : &IDTVEC(align), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_MC, &IDTVEC(mchk), SDT_SYSIGT, SEL_KPL, 3);
	setidt(IDT_XF, pti ? &IDTVEC(xmm_pti) : &IDTVEC(xmm), SDT_SYSIGT,
	    SEL_KPL, 0);
#ifdef KDTRACE_HOOKS
	setidt(IDT_DTRACE_RET, pti ? &IDTVEC(dtrace_ret_pti) :
	    &IDTVEC(dtrace_ret), SDT_SYSIGT, SEL_UPL, 0);
#endif
#ifdef XENHVM
	setidt(IDT_EVTCHN, pti ? &IDTVEC(xen_intr_upcall_pti) :
	    &IDTVEC(xen_intr_upcall), SDT_SYSIGT, SEL_KPL, 0);
#endif
	r_idt.rd_limit = sizeof(idt0) - 1;
	r_idt.rd_base = (long) idt;
	lidt(&r_idt);

	/*
	 * Initialize the clock before the console so that console
	 * initialization can use DELAY().
	 */
	clock_init();

	/*
	 * Use vt(4) by default for UEFI boot (during the sc(4)/vt(4)
	 * transition).
	 * Once bootblocks have updated, we can test directly for
	 * efi_systbl != NULL here...
	 */
	if (preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_EFI_MAP)
	    != NULL)
		vty_set_preferred(VTY_VT);

	finishidentcpu();	/* Final stage of CPU initialization */
	initializecpu();	/* Initialize CPU registers */

	/* doublefault stack space, runs on ist1 */
	common_tss[0].tss_ist1 = (long)&dblfault_stack[sizeof(dblfault_stack)];

	/*
	 * NMI stack, runs on ist2.  The pcpu pointer is stored just
	 * above the start of the ist2 stack.
	 */
	np = ((struct nmi_pcpu *) &nmi0_stack[sizeof(nmi0_stack)]) - 1;
	np->np_pcpu = (register_t) pc;
	common_tss[0].tss_ist2 = (long) np;

	/*
	 * MC# stack, runs on ist3.  The pcpu pointer is stored just
	 * above the start of the ist3 stack.
	 */
	np = ((struct nmi_pcpu *) &mce0_stack[sizeof(mce0_stack)]) - 1;
	np->np_pcpu = (register_t) pc;
	common_tss[0].tss_ist3 = (long) np;

	/*
	 * DB# stack, runs on ist4.
	 */
	np = ((struct nmi_pcpu *) &dbg0_stack[sizeof(dbg0_stack)]) - 1;
	np->np_pcpu = (register_t) pc;
	common_tss[0].tss_ist4 = (long) np;

	/* Set the IO permission bitmap (empty due to tss seg limit) */
	common_tss[0].tss_iobase = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE;

	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
	ltr(gsel_tss);

	amd64_conf_fast_syscall();

	/*
	 * Temporary forge some valid pointer to PCB, for exception
	 * handlers.  It is reinitialized properly below after FPU is
	 * set up.  Also set up td_critnest to short-cut the page
	 * fault handler.
	 */
	cpu_max_ext_state_size = sizeof(struct savefpu);
	thread0.td_pcb = get_pcb_td(&thread0);
	thread0.td_critnest = 1;

	/*
	 * The console and kdb should be initialized even earlier than here,
	 * but some console drivers don't work until after getmemsize().
	 * Default to late console initialization to support these drivers.
	 * This loses mainly printf()s in getmemsize() and early debugging.
	 */
	late_console = 1;
	TUNABLE_INT_FETCH("debug.late_console", &late_console);
	if (!late_console) {
		cninit();
		amd64_kdb_init();
	}

	getmemsize(kmdp, physfree);
	init_param2(physmem);

	/* now running on new page tables, configured, and u/iom is accessible */

#ifdef DEV_PCI
	/* This call might adjust phys_avail[]. */
	pci_early_quirks();
#endif

	if (late_console)
		cninit();

#ifdef DEV_ISA
#ifdef DEV_ATPIC
	elcr_probe();
	atpic_startup();
#else
	/* Reset and mask the atpics and leave them shut down. */
	atpic_reset();

	/*
	 * Point the ICU spurious interrupt vectors at the APIC spurious
	 * interrupt handler.
	 */
	setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
#endif
#else
#error "have you forgotten the isa device?";
#endif

	if (late_console)
		amd64_kdb_init();

	msgbufinit(msgbufp, msgbufsize);
	fpuinit();

	/*
	 * Set up thread0 pcb after fpuinit calculated pcb + fpu save
	 * area size.  Zero out the extended state header in fpu save
	 * area.
	 */
	thread0.td_pcb = get_pcb_td(&thread0);
	thread0.td_pcb->pcb_save = get_pcb_user_save_td(&thread0);
	bzero(get_pcb_user_save_td(&thread0), cpu_max_ext_state_size);
	if (use_xsave) {
		xhdr = (struct xstate_hdr *)(get_pcb_user_save_td(&thread0) +
		    1);
		xhdr->xstate_bv = xsave_mask;
	}
	/* make an initial tss so cpu can get interrupt stack on syscall! */
	rsp0 = (vm_offset_t)thread0.td_pcb;
	/* Ensure the stack is aligned to 16 bytes */
	rsp0 &= ~0xFul;
	common_tss[0].tss_rsp0 = rsp0;
	PCPU_SET(rsp0, rsp0);
	PCPU_SET(pti_rsp0, ((vm_offset_t)PCPU_PTR(pti_stack) +
	    PC_PTI_STACK_SZ * sizeof(uint64_t)) & ~0xful);
	PCPU_SET(curpcb, thread0.td_pcb);

	/* transfer to user mode */

	_ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
	_udatasel = GSEL(GUDATA_SEL, SEL_UPL);
	_ucode32sel = GSEL(GUCODE32_SEL, SEL_UPL);
	_ufssel = GSEL(GUFS32_SEL, SEL_UPL);
	_ugssel = GSEL(GUGS32_SEL, SEL_UPL);

	load_ds(_udatasel);
	load_es(_udatasel);
	load_fs(_ufssel);

	/* setup proc 0's pcb */
	thread0.td_pcb->pcb_flags = 0;
	thread0.td_frame = &proc0_tf;

	env = kern_getenv("kernelname");
	if (env != NULL)
		strlcpy(kernelname, env, sizeof(kernelname));

	cpu_probe_amdc1e();

#ifdef FDT
	x86_init_fdt();
#endif
	thread0.td_critnest = 0;

	/* Speculative-execution mitigation knobs. */
	TUNABLE_INT_FETCH("hw.ibrs_disable", &hw_ibrs_disable);
	TUNABLE_INT_FETCH("hw.spec_store_bypass_disable", &hw_ssb_disable);

	TSEXIT();

	/* Location of kernel stack for locore */
	return ((u_int64_t)thread0.td_pcb);
}
1887
/*
 * Machine-dependent per-CPU structure initialization.  The ACPI CPU id
 * is not known yet at this point, so seed it with an "invalid" marker;
 * it is presumably filled in later by the ACPI/platform code.
 */
void
cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size)
{

	pcpu->pc_acpi_id = 0xffffffff;
}
1894
1895 static int
1896 smap_sysctl_handler(SYSCTL_HANDLER_ARGS)
1897 {
1898 struct bios_smap *smapbase;
1899 struct bios_smap_xattr smap;
1900 caddr_t kmdp;
1901 uint32_t *smapattr;
1902 int count, error, i;
1903
1904 /* Retrieve the system memory map from the loader. */
1905 kmdp = preload_search_by_type("elf kernel");
1906 if (kmdp == NULL)
1907 kmdp = preload_search_by_type("elf64 kernel");
1908 smapbase = (struct bios_smap *)preload_search_info(kmdp,
1909 MODINFO_METADATA | MODINFOMD_SMAP);
1910 if (smapbase == NULL)
1911 return (0);
1912 smapattr = (uint32_t *)preload_search_info(kmdp,
1913 MODINFO_METADATA | MODINFOMD_SMAP_XATTR);
1914 count = *((uint32_t *)smapbase - 1) / sizeof(*smapbase);
1915 error = 0;
1916 for (i = 0; i < count; i++) {
1917 smap.base = smapbase[i].base;
1918 smap.length = smapbase[i].length;
1919 smap.type = smapbase[i].type;
1920 if (smapattr != NULL)
1921 smap.xattr = smapattr[i];
1922 else
1923 smap.xattr = 0;
1924 error = SYSCTL_OUT(req, &smap, sizeof(smap));
1925 }
1926 return (error);
1927 }
1928 SYSCTL_PROC(_machdep, OID_AUTO, smap, CTLTYPE_OPAQUE|CTLFLAG_RD, NULL, 0,
1929 smap_sysctl_handler, "S,bios_smap_xattr", "Raw BIOS SMAP data");
1930
1931 static int
1932 efi_map_sysctl_handler(SYSCTL_HANDLER_ARGS)
1933 {
1934 struct efi_map_header *efihdr;
1935 caddr_t kmdp;
1936 uint32_t efisize;
1937
1938 kmdp = preload_search_by_type("elf kernel");
1939 if (kmdp == NULL)
1940 kmdp = preload_search_by_type("elf64 kernel");
1941 efihdr = (struct efi_map_header *)preload_search_info(kmdp,
1942 MODINFO_METADATA | MODINFOMD_EFI_MAP);
1943 if (efihdr == NULL)
1944 return (0);
1945 efisize = *((uint32_t *)efihdr - 1);
1946 return (SYSCTL_OUT(req, efihdr, efisize));
1947 }
1948 SYSCTL_PROC(_machdep, OID_AUTO, efi_map, CTLTYPE_OPAQUE|CTLFLAG_RD, NULL, 0,
1949 efi_map_sysctl_handler, "S,efi_map_header", "Raw EFI Memory Map");
1950
1951 void
1952 spinlock_enter(void)
1953 {
1954 struct thread *td;
1955 register_t flags;
1956
1957 td = curthread;
1958 if (td->td_md.md_spinlock_count == 0) {
1959 flags = intr_disable();
1960 td->td_md.md_spinlock_count = 1;
1961 td->td_md.md_saved_flags = flags;
1962 critical_enter();
1963 } else
1964 td->td_md.md_spinlock_count++;
1965 }
1966
1967 void
1968 spinlock_exit(void)
1969 {
1970 struct thread *td;
1971 register_t flags;
1972
1973 td = curthread;
1974 flags = td->td_md.md_saved_flags;
1975 td->td_md.md_spinlock_count--;
1976 if (td->td_md.md_spinlock_count == 0) {
1977 critical_exit();
1978 intr_restore(flags);
1979 }
1980 }
1981
1982 /*
1983 * Construct a PCB from a trapframe. This is called from kdb_trap() where
1984 * we want to start a backtrace from the function that caused us to enter
1985 * the debugger. We have the context in the trapframe, but base the trace
1986 * on the PCB. The PCB doesn't have to be perfect, as long as it contains
1987 * enough for a backtrace.
1988 */
1989 void
1990 makectx(struct trapframe *tf, struct pcb *pcb)
1991 {
1992
1993 pcb->pcb_r12 = tf->tf_r12;
1994 pcb->pcb_r13 = tf->tf_r13;
1995 pcb->pcb_r14 = tf->tf_r14;
1996 pcb->pcb_r15 = tf->tf_r15;
1997 pcb->pcb_rbp = tf->tf_rbp;
1998 pcb->pcb_rbx = tf->tf_rbx;
1999 pcb->pcb_rip = tf->tf_rip;
2000 pcb->pcb_rsp = tf->tf_rsp;
2001 }
2002
/*
 * Set the instruction pointer of a traced thread and request the full
 * register restore path (PCB_FULL_IRET) so the modified frame takes
 * effect on return to user mode.  Always returns 0.
 */
int
ptrace_set_pc(struct thread *td, unsigned long addr)
{

	td->td_frame->tf_rip = addr;
	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
	return (0);
}
2011
2012 int
2013 ptrace_single_step(struct thread *td)
2014 {
2015
2016 PROC_LOCK_ASSERT(td->td_proc, MA_OWNED);
2017 if ((td->td_frame->tf_rflags & PSL_T) == 0) {
2018 td->td_frame->tf_rflags |= PSL_T;
2019 td->td_dbgflags |= TDB_STEP;
2020 }
2021 return (0);
2022 }
2023
2024 int
2025 ptrace_clear_single_step(struct thread *td)
2026 {
2027
2028 PROC_LOCK_ASSERT(td->td_proc, MA_OWNED);
2029 td->td_frame->tf_rflags &= ~PSL_T;
2030 td->td_dbgflags &= ~TDB_STEP;
2031 return (0);
2032 }
2033
2034 int
2035 fill_regs(struct thread *td, struct reg *regs)
2036 {
2037 struct trapframe *tp;
2038
2039 tp = td->td_frame;
2040 return (fill_frame_regs(tp, regs));
2041 }
2042
2043 int
2044 fill_frame_regs(struct trapframe *tp, struct reg *regs)
2045 {
2046
2047 regs->r_r15 = tp->tf_r15;
2048 regs->r_r14 = tp->tf_r14;
2049 regs->r_r13 = tp->tf_r13;
2050 regs->r_r12 = tp->tf_r12;
2051 regs->r_r11 = tp->tf_r11;
2052 regs->r_r10 = tp->tf_r10;
2053 regs->r_r9 = tp->tf_r9;
2054 regs->r_r8 = tp->tf_r8;
2055 regs->r_rdi = tp->tf_rdi;
2056 regs->r_rsi = tp->tf_rsi;
2057 regs->r_rbp = tp->tf_rbp;
2058 regs->r_rbx = tp->tf_rbx;
2059 regs->r_rdx = tp->tf_rdx;
2060 regs->r_rcx = tp->tf_rcx;
2061 regs->r_rax = tp->tf_rax;
2062 regs->r_rip = tp->tf_rip;
2063 regs->r_cs = tp->tf_cs;
2064 regs->r_rflags = tp->tf_rflags;
2065 regs->r_rsp = tp->tf_rsp;
2066 regs->r_ss = tp->tf_ss;
2067 if (tp->tf_flags & TF_HASSEGS) {
2068 regs->r_ds = tp->tf_ds;
2069 regs->r_es = tp->tf_es;
2070 regs->r_fs = tp->tf_fs;
2071 regs->r_gs = tp->tf_gs;
2072 } else {
2073 regs->r_ds = 0;
2074 regs->r_es = 0;
2075 regs->r_fs = 0;
2076 regs->r_gs = 0;
2077 }
2078 regs->r_err = 0;
2079 regs->r_trapno = 0;
2080 return (0);
2081 }
2082
2083 int
2084 set_regs(struct thread *td, struct reg *regs)
2085 {
2086 struct trapframe *tp;
2087 register_t rflags;
2088
2089 tp = td->td_frame;
2090 rflags = regs->r_rflags & 0xffffffff;
2091 if (!EFL_SECURE(rflags, tp->tf_rflags) || !CS_SECURE(regs->r_cs))
2092 return (EINVAL);
2093 tp->tf_r15 = regs->r_r15;
2094 tp->tf_r14 = regs->r_r14;
2095 tp->tf_r13 = regs->r_r13;
2096 tp->tf_r12 = regs->r_r12;
2097 tp->tf_r11 = regs->r_r11;
2098 tp->tf_r10 = regs->r_r10;
2099 tp->tf_r9 = regs->r_r9;
2100 tp->tf_r8 = regs->r_r8;
2101 tp->tf_rdi = regs->r_rdi;
2102 tp->tf_rsi = regs->r_rsi;
2103 tp->tf_rbp = regs->r_rbp;
2104 tp->tf_rbx = regs->r_rbx;
2105 tp->tf_rdx = regs->r_rdx;
2106 tp->tf_rcx = regs->r_rcx;
2107 tp->tf_rax = regs->r_rax;
2108 tp->tf_rip = regs->r_rip;
2109 tp->tf_cs = regs->r_cs;
2110 tp->tf_rflags = rflags;
2111 tp->tf_rsp = regs->r_rsp;
2112 tp->tf_ss = regs->r_ss;
2113 if (0) { /* XXXKIB */
2114 tp->tf_ds = regs->r_ds;
2115 tp->tf_es = regs->r_es;
2116 tp->tf_fs = regs->r_fs;
2117 tp->tf_gs = regs->r_gs;
2118 tp->tf_flags = TF_HASSEGS;
2119 }
2120 set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
2121 return (0);
2122 }
2123
2124 /* XXX check all this stuff! */
2125 /* externalize from sv_xmm */
2126 static void
2127 fill_fpregs_xmm(struct savefpu *sv_xmm, struct fpreg *fpregs)
2128 {
2129 struct envxmm *penv_fpreg = (struct envxmm *)&fpregs->fpr_env;
2130 struct envxmm *penv_xmm = &sv_xmm->sv_env;
2131 int i;
2132
2133 /* pcb -> fpregs */
2134 bzero(fpregs, sizeof(*fpregs));
2135
2136 /* FPU control/status */
2137 penv_fpreg->en_cw = penv_xmm->en_cw;
2138 penv_fpreg->en_sw = penv_xmm->en_sw;
2139 penv_fpreg->en_tw = penv_xmm->en_tw;
2140 penv_fpreg->en_opcode = penv_xmm->en_opcode;
2141 penv_fpreg->en_rip = penv_xmm->en_rip;
2142 penv_fpreg->en_rdp = penv_xmm->en_rdp;
2143 penv_fpreg->en_mxcsr = penv_xmm->en_mxcsr;
2144 penv_fpreg->en_mxcsr_mask = penv_xmm->en_mxcsr_mask;
2145
2146 /* FPU registers */
2147 for (i = 0; i < 8; ++i)
2148 bcopy(sv_xmm->sv_fp[i].fp_acc.fp_bytes, fpregs->fpr_acc[i], 10);
2149
2150 /* SSE registers */
2151 for (i = 0; i < 16; ++i)
2152 bcopy(sv_xmm->sv_xmm[i].xmm_bytes, fpregs->fpr_xacc[i], 16);
2153 }
2154
2155 /* internalize from fpregs into sv_xmm */
2156 static void
2157 set_fpregs_xmm(struct fpreg *fpregs, struct savefpu *sv_xmm)
2158 {
2159 struct envxmm *penv_xmm = &sv_xmm->sv_env;
2160 struct envxmm *penv_fpreg = (struct envxmm *)&fpregs->fpr_env;
2161 int i;
2162
2163 /* fpregs -> pcb */
2164 /* FPU control/status */
2165 penv_xmm->en_cw = penv_fpreg->en_cw;
2166 penv_xmm->en_sw = penv_fpreg->en_sw;
2167 penv_xmm->en_tw = penv_fpreg->en_tw;
2168 penv_xmm->en_opcode = penv_fpreg->en_opcode;
2169 penv_xmm->en_rip = penv_fpreg->en_rip;
2170 penv_xmm->en_rdp = penv_fpreg->en_rdp;
2171 penv_xmm->en_mxcsr = penv_fpreg->en_mxcsr;
2172 penv_xmm->en_mxcsr_mask = penv_fpreg->en_mxcsr_mask & cpu_mxcsr_mask;
2173
2174 /* FPU registers */
2175 for (i = 0; i < 8; ++i)
2176 bcopy(fpregs->fpr_acc[i], sv_xmm->sv_fp[i].fp_acc.fp_bytes, 10);
2177
2178 /* SSE registers */
2179 for (i = 0; i < 16; ++i)
2180 bcopy(fpregs->fpr_xacc[i], sv_xmm->sv_xmm[i].xmm_bytes, 16);
2181 }
2182
/*
 * Externalize the FPU state from td's pcb into *fpregs (ptrace/core
 * dump interface).  The target thread must be stopped (or be the
 * caller itself) so its state cannot change underfoot.  Always
 * returns 0.
 */
int
fill_fpregs(struct thread *td, struct fpreg *fpregs)
{

	KASSERT(td == curthread || TD_IS_SUSPENDED(td) ||
	    P_SHOULDSTOP(td->td_proc),
	    ("not suspended thread %p", td));
	/* Flush the latest hardware FPU state into the pcb save area. */
	fpugetregs(td);
	/* Convert the save-area layout into struct fpreg. */
	fill_fpregs_xmm(get_pcb_user_save_td(td), fpregs);
	return (0);
}
2195
/*
 * Internalize *fpregs into td's pcb user FPU save area (ptrace
 * PT_SETFPREGS interface).  Always returns 0.
 */
int
set_fpregs(struct thread *td, struct fpreg *fpregs)
{

	/*
	 * Critical section: prevent preemption while the save area and
	 * the FPU-initialized state are being updated together.
	 */
	critical_enter();
	set_fpregs_xmm(fpregs, get_pcb_user_save_td(td));
	fpuuserinited(td);
	critical_exit();
	return (0);
}
2207
2208 /*
2209 * Get machine context.
2210 */
2211 int
2212 get_mcontext(struct thread *td, mcontext_t *mcp, int flags)
2213 {
2214 struct pcb *pcb;
2215 struct trapframe *tp;
2216
2217 pcb = td->td_pcb;
2218 tp = td->td_frame;
2219 PROC_LOCK(curthread->td_proc);
2220 mcp->mc_onstack = sigonstack(tp->tf_rsp);
2221 PROC_UNLOCK(curthread->td_proc);
2222 mcp->mc_r15 = tp->tf_r15;
2223 mcp->mc_r14 = tp->tf_r14;
2224 mcp->mc_r13 = tp->tf_r13;
2225 mcp->mc_r12 = tp->tf_r12;
2226 mcp->mc_r11 = tp->tf_r11;
2227 mcp->mc_r10 = tp->tf_r10;
2228 mcp->mc_r9 = tp->tf_r9;
2229 mcp->mc_r8 = tp->tf_r8;
2230 mcp->mc_rdi = tp->tf_rdi;
2231 mcp->mc_rsi = tp->tf_rsi;
2232 mcp->mc_rbp = tp->tf_rbp;
2233 mcp->mc_rbx = tp->tf_rbx;
2234 mcp->mc_rcx = tp->tf_rcx;
2235 mcp->mc_rflags = tp->tf_rflags;
2236 if (flags & GET_MC_CLEAR_RET) {
2237 mcp->mc_rax = 0;
2238 mcp->mc_rdx = 0;
2239 mcp->mc_rflags &= ~PSL_C;
2240 } else {
2241 mcp->mc_rax = tp->tf_rax;
2242 mcp->mc_rdx = tp->tf_rdx;
2243 }
2244 mcp->mc_rip = tp->tf_rip;
2245 mcp->mc_cs = tp->tf_cs;
2246 mcp->mc_rsp = tp->tf_rsp;
2247 mcp->mc_ss = tp->tf_ss;
2248 mcp->mc_ds = tp->tf_ds;
2249 mcp->mc_es = tp->tf_es;
2250 mcp->mc_fs = tp->tf_fs;
2251 mcp->mc_gs = tp->tf_gs;
2252 mcp->mc_flags = tp->tf_flags;
2253 mcp->mc_len = sizeof(*mcp);
2254 get_fpcontext(td, mcp, NULL, 0);
2255 update_pcb_bases(pcb);
2256 mcp->mc_fsbase = pcb->pcb_fsbase;
2257 mcp->mc_gsbase = pcb->pcb_gsbase;
2258 mcp->mc_xfpustate = 0;
2259 mcp->mc_xfpustate_len = 0;
2260 bzero(mcp->mc_spare, sizeof(mcp->mc_spare));
2261 return (0);
2262 }
2263
2264 /*
2265 * Set machine context.
2266 *
2267 * However, we don't set any but the user modifiable flags, and we won't
2268 * touch the cs selector.
2269 */
2270 int
2271 set_mcontext(struct thread *td, mcontext_t *mcp)
2272 {
2273 struct pcb *pcb;
2274 struct trapframe *tp;
2275 char *xfpustate;
2276 long rflags;
2277 int ret;
2278
2279 pcb = td->td_pcb;
2280 tp = td->td_frame;
2281 if (mcp->mc_len != sizeof(*mcp) ||
2282 (mcp->mc_flags & ~_MC_FLAG_MASK) != 0)
2283 return (EINVAL);
2284 rflags = (mcp->mc_rflags & PSL_USERCHANGE) |
2285 (tp->tf_rflags & ~PSL_USERCHANGE);
2286 if (mcp->mc_flags & _MC_HASFPXSTATE) {
2287 if (mcp->mc_xfpustate_len > cpu_max_ext_state_size -
2288 sizeof(struct savefpu))
2289 return (EINVAL);
2290 xfpustate = __builtin_alloca(mcp->mc_xfpustate_len);
2291 ret = copyin((void *)mcp->mc_xfpustate, xfpustate,
2292 mcp->mc_xfpustate_len);
2293 if (ret != 0)
2294 return (ret);
2295 } else
2296 xfpustate = NULL;
2297 ret = set_fpcontext(td, mcp, xfpustate, mcp->mc_xfpustate_len);
2298 if (ret != 0)
2299 return (ret);
2300 tp->tf_r15 = mcp->mc_r15;
2301 tp->tf_r14 = mcp->mc_r14;
2302 tp->tf_r13 = mcp->mc_r13;
2303 tp->tf_r12 = mcp->mc_r12;
2304 tp->tf_r11 = mcp->mc_r11;
2305 tp->tf_r10 = mcp->mc_r10;
2306 tp->tf_r9 = mcp->mc_r9;
2307 tp->tf_r8 = mcp->mc_r8;
2308 tp->tf_rdi = mcp->mc_rdi;
2309 tp->tf_rsi = mcp->mc_rsi;
2310 tp->tf_rbp = mcp->mc_rbp;
2311 tp->tf_rbx = mcp->mc_rbx;
2312 tp->tf_rdx = mcp->mc_rdx;
2313 tp->tf_rcx = mcp->mc_rcx;
2314 tp->tf_rax = mcp->mc_rax;
2315 tp->tf_rip = mcp->mc_rip;
2316 tp->tf_rflags = rflags;
2317 tp->tf_rsp = mcp->mc_rsp;
2318 tp->tf_ss = mcp->mc_ss;
2319 tp->tf_flags = mcp->mc_flags;
2320 if (tp->tf_flags & TF_HASSEGS) {
2321 tp->tf_ds = mcp->mc_ds;
2322 tp->tf_es = mcp->mc_es;
2323 tp->tf_fs = mcp->mc_fs;
2324 tp->tf_gs = mcp->mc_gs;
2325 }
2326 set_pcb_flags(pcb, PCB_FULL_IRET);
2327 if (mcp->mc_flags & _MC_HASBASES) {
2328 pcb->pcb_fsbase = mcp->mc_fsbase;
2329 pcb->pcb_gsbase = mcp->mc_gsbase;
2330 }
2331 return (0);
2332 }
2333
2334 static void
2335 get_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpusave,
2336 size_t xfpusave_len)
2337 {
2338 size_t max_len, len;
2339
2340 mcp->mc_ownedfp = fpugetregs(td);
2341 bcopy(get_pcb_user_save_td(td), &mcp->mc_fpstate[0],
2342 sizeof(mcp->mc_fpstate));
2343 mcp->mc_fpformat = fpuformat();
2344 if (!use_xsave || xfpusave_len == 0)
2345 return;
2346 max_len = cpu_max_ext_state_size - sizeof(struct savefpu);
2347 len = xfpusave_len;
2348 if (len > max_len) {
2349 len = max_len;
2350 bzero(xfpusave + max_len, len - max_len);
2351 }
2352 mcp->mc_flags |= _MC_HASFPXSTATE;
2353 mcp->mc_xfpustate_len = len;
2354 bcopy(get_pcb_user_save_td(td) + 1, xfpusave, len);
2355 }
2356
2357 static int
2358 set_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpustate,
2359 size_t xfpustate_len)
2360 {
2361 int error;
2362
2363 if (mcp->mc_fpformat == _MC_FPFMT_NODEV)
2364 return (0);
2365 else if (mcp->mc_fpformat != _MC_FPFMT_XMM)
2366 return (EINVAL);
2367 else if (mcp->mc_ownedfp == _MC_FPOWNED_NONE) {
2368 /* We don't care what state is left in the FPU or PCB. */
2369 fpstate_drop(td);
2370 error = 0;
2371 } else if (mcp->mc_ownedfp == _MC_FPOWNED_FPU ||
2372 mcp->mc_ownedfp == _MC_FPOWNED_PCB) {
2373 error = fpusetregs(td, (struct savefpu *)&mcp->mc_fpstate,
2374 xfpustate, xfpustate_len);
2375 } else
2376 return (EINVAL);
2377 return (error);
2378 }
2379
2380 void
2381 fpstate_drop(struct thread *td)
2382 {
2383
2384 KASSERT(PCB_USER_FPU(td->td_pcb), ("fpstate_drop: kernel-owned fpu"));
2385 critical_enter();
2386 if (PCPU_GET(fpcurthread) == td)
2387 fpudrop();
2388 /*
2389 * XXX force a full drop of the fpu. The above only drops it if we
2390 * owned it.
2391 *
2392 * XXX I don't much like fpugetuserregs()'s semantics of doing a full
2393 * drop. Dropping only to the pcb matches fnsave's behaviour.
2394 * We only need to drop to !PCB_INITDONE in sendsig(). But
2395 * sendsig() is the only caller of fpugetuserregs()... perhaps we just
2396 * have too many layers.
2397 */
2398 clear_pcb_flags(curthread->td_pcb,
2399 PCB_FPUINITDONE | PCB_USERFPUINITDONE);
2400 critical_exit();
2401 }
2402
2403 int
2404 fill_dbregs(struct thread *td, struct dbreg *dbregs)
2405 {
2406 struct pcb *pcb;
2407
2408 if (td == NULL) {
2409 dbregs->dr[0] = rdr0();
2410 dbregs->dr[1] = rdr1();
2411 dbregs->dr[2] = rdr2();
2412 dbregs->dr[3] = rdr3();
2413 dbregs->dr[6] = rdr6();
2414 dbregs->dr[7] = rdr7();
2415 } else {
2416 pcb = td->td_pcb;
2417 dbregs->dr[0] = pcb->pcb_dr0;
2418 dbregs->dr[1] = pcb->pcb_dr1;
2419 dbregs->dr[2] = pcb->pcb_dr2;
2420 dbregs->dr[3] = pcb->pcb_dr3;
2421 dbregs->dr[6] = pcb->pcb_dr6;
2422 dbregs->dr[7] = pcb->pcb_dr7;
2423 }
2424 dbregs->dr[4] = 0;
2425 dbregs->dr[5] = 0;
2426 dbregs->dr[8] = 0;
2427 dbregs->dr[9] = 0;
2428 dbregs->dr[10] = 0;
2429 dbregs->dr[11] = 0;
2430 dbregs->dr[12] = 0;
2431 dbregs->dr[13] = 0;
2432 dbregs->dr[14] = 0;
2433 dbregs->dr[15] = 0;
2434 return (0);
2435 }
2436
2437 int
2438 set_dbregs(struct thread *td, struct dbreg *dbregs)
2439 {
2440 struct pcb *pcb;
2441 int i;
2442
2443 if (td == NULL) {
2444 load_dr0(dbregs->dr[0]);
2445 load_dr1(dbregs->dr[1]);
2446 load_dr2(dbregs->dr[2]);
2447 load_dr3(dbregs->dr[3]);
2448 load_dr6(dbregs->dr[6]);
2449 load_dr7(dbregs->dr[7]);
2450 } else {
2451 /*
2452 * Don't let an illegal value for dr7 get set. Specifically,
2453 * check for undefined settings. Setting these bit patterns
2454 * result in undefined behaviour and can lead to an unexpected
2455 * TRCTRAP or a general protection fault right here.
2456 * Upper bits of dr6 and dr7 must not be set
2457 */
2458 for (i = 0; i < 4; i++) {
2459 if (DBREG_DR7_ACCESS(dbregs->dr[7], i) == 0x02)
2460 return (EINVAL);
2461 if (td->td_frame->tf_cs == _ucode32sel &&
2462 DBREG_DR7_LEN(dbregs->dr[7], i) == DBREG_DR7_LEN_8)
2463 return (EINVAL);
2464 }
2465 if ((dbregs->dr[6] & 0xffffffff00000000ul) != 0 ||
2466 (dbregs->dr[7] & 0xffffffff00000000ul) != 0)
2467 return (EINVAL);
2468
2469 pcb = td->td_pcb;
2470
2471 /*
2472 * Don't let a process set a breakpoint that is not within the
2473 * process's address space. If a process could do this, it
2474 * could halt the system by setting a breakpoint in the kernel
2475 * (if ddb was enabled). Thus, we need to check to make sure
2476 * that no breakpoints are being enabled for addresses outside
2477 * process's address space.
2478 *
2479 * XXX - what about when the watched area of the user's
2480 * address space is written into from within the kernel
2481 * ... wouldn't that still cause a breakpoint to be generated
2482 * from within kernel mode?
2483 */
2484
2485 if (DBREG_DR7_ENABLED(dbregs->dr[7], 0)) {
2486 /* dr0 is enabled */
2487 if (dbregs->dr[0] >= VM_MAXUSER_ADDRESS)
2488 return (EINVAL);
2489 }
2490 if (DBREG_DR7_ENABLED(dbregs->dr[7], 1)) {
2491 /* dr1 is enabled */
2492 if (dbregs->dr[1] >= VM_MAXUSER_ADDRESS)
2493 return (EINVAL);
2494 }
2495 if (DBREG_DR7_ENABLED(dbregs->dr[7], 2)) {
2496 /* dr2 is enabled */
2497 if (dbregs->dr[2] >= VM_MAXUSER_ADDRESS)
2498 return (EINVAL);
2499 }
2500 if (DBREG_DR7_ENABLED(dbregs->dr[7], 3)) {
2501 /* dr3 is enabled */
2502 if (dbregs->dr[3] >= VM_MAXUSER_ADDRESS)
2503 return (EINVAL);
2504 }
2505
2506 pcb->pcb_dr0 = dbregs->dr[0];
2507 pcb->pcb_dr1 = dbregs->dr[1];
2508 pcb->pcb_dr2 = dbregs->dr[2];
2509 pcb->pcb_dr3 = dbregs->dr[3];
2510 pcb->pcb_dr6 = dbregs->dr[6];
2511 pcb->pcb_dr7 = dbregs->dr[7];
2512
2513 set_pcb_flags(pcb, PCB_DBREGS);
2514 }
2515
2516 return (0);
2517 }
2518
/*
 * Clear all hardware debug registers.  dr7 (the control register) is
 * cleared first so no breakpoint can fire while the address registers
 * still hold stale values.
 */
void
reset_dbregs(void)
{

	load_dr7(0);	/* Turn off the control bits first */
	load_dr0(0);
	load_dr1(0);
	load_dr2(0);
	load_dr3(0);
	load_dr6(0);
}
2530
2531 /*
2532 * Return > 0 if a hardware breakpoint has been hit, and the
2533 * breakpoint was in user space. Return 0, otherwise.
2534 */
2535 int
2536 user_dbreg_trap(register_t dr6)
2537 {
2538 u_int64_t dr7;
2539 u_int64_t bp; /* breakpoint bits extracted from dr6 */
2540 int nbp; /* number of breakpoints that triggered */
2541 caddr_t addr[4]; /* breakpoint addresses */
2542 int i;
2543
2544 bp = dr6 & DBREG_DR6_BMASK;
2545 if (bp == 0) {
2546 /*
2547 * None of the breakpoint bits are set meaning this
2548 * trap was not caused by any of the debug registers
2549 */
2550 return 0;
2551 }
2552
2553 dr7 = rdr7();
2554 if ((dr7 & 0x000000ff) == 0) {
2555 /*
2556 * all GE and LE bits in the dr7 register are zero,
2557 * thus the trap couldn't have been caused by the
2558 * hardware debug registers
2559 */
2560 return 0;
2561 }
2562
2563 nbp = 0;
2564
2565 /*
2566 * at least one of the breakpoints were hit, check to see
2567 * which ones and if any of them are user space addresses
2568 */
2569
2570 if (bp & 0x01) {
2571 addr[nbp++] = (caddr_t)rdr0();
2572 }
2573 if (bp & 0x02) {
2574 addr[nbp++] = (caddr_t)rdr1();
2575 }
2576 if (bp & 0x04) {
2577 addr[nbp++] = (caddr_t)rdr2();
2578 }
2579 if (bp & 0x08) {
2580 addr[nbp++] = (caddr_t)rdr3();
2581 }
2582
2583 for (i = 0; i < nbp; i++) {
2584 if (addr[i] < (caddr_t)VM_MAXUSER_ADDRESS) {
2585 /*
2586 * addr[i] is in user space
2587 */
2588 return nbp;
2589 }
2590 }
2591
2592 /*
2593 * None of the breakpoints are in user space.
2594 */
2595 return 0;
2596 }
2597
2598 /*
2599 * The pcb_flags is only modified by current thread, or by other threads
2600 * when current thread is stopped. However, current thread may change it
2601 * from the interrupt context in cpu_switch(), or in the trap handler.
2602 * When we read-modify-write pcb_flags from C sources, compiler may generate
2603 * code that is not atomic regarding the interrupt handler. If a trap or
2604 * interrupt happens and any flag is modified from the handler, it can be
2605 * clobbered with the cached value later. Therefore, we implement setting
2606 * and clearing flags with single-instruction functions, which do not race
2607 * with possible modification of the flags from the trap or interrupt context,
2608 * because traps and interrupts are executed only on instruction boundary.
2609 */
/*
 * OR 'flags' into pcb->pcb_flags with one instruction, so the update
 * cannot be torn by a trap or interrupt (see the comment above).
 */
void
set_pcb_flags_raw(struct pcb *pcb, const u_int flags)
{

	/* Single read-modify-write instruction; no C-level RMW window. */
	__asm __volatile("orl %1,%0"
	    : "=m" (pcb->pcb_flags) : "ir" (flags), "m" (pcb->pcb_flags)
	    : "cc", "memory");

}
2619
2620 /*
2621 * The support for RDFSBASE, WRFSBASE and similar instructions for %gs
2622 * base requires that kernel saves MSR_FSBASE and MSR_{K,}GSBASE into
2623 * pcb if user space modified the bases. We must save on the context
2624 * switch or if the return to usermode happens through the doreti.
2625 *
2626 * Tracking of both events is performed by the pcb flag PCB_FULL_IRET,
2627 * which have a consequence that the base MSRs must be saved each time
2628 * the PCB_FULL_IRET flag is set. We disable interrupts to sync with
2629 * context switches.
2630 */
void
set_pcb_flags(struct pcb *pcb, const u_int flags)
{
	register_t r;

	/*
	 * When PCB_FULL_IRET is being newly set on the current pcb and the
	 * CPU supports FSGSBASE, first capture possibly user-modified
	 * segment bases into the pcb (see the comment above).  Interrupts
	 * are disabled so a context switch cannot intervene between the
	 * check and the flag update.
	 */
	if (curpcb == pcb &&
	    (flags & PCB_FULL_IRET) != 0 &&
	    (pcb->pcb_flags & PCB_FULL_IRET) == 0 &&
	    (cpu_stdext_feature & CPUID_STDEXT_FSGSBASE) != 0) {
		r = intr_disable();
		/* Re-check under disabled interrupts: a trap may have set it. */
		if ((pcb->pcb_flags & PCB_FULL_IRET) == 0) {
			if (rfs() == _ufssel)
				pcb->pcb_fsbase = rdfsbase();
			if (rgs() == _ugssel)
				pcb->pcb_gsbase = rdmsr(MSR_KGSBASE);
		}
		set_pcb_flags_raw(pcb, flags);
		intr_restore(r);
	} else {
		set_pcb_flags_raw(pcb, flags);
	}
}
2653
/*
 * Clear 'flags' in pcb->pcb_flags with a single instruction, immune to
 * interruption mid-update (see the comment above set_pcb_flags_raw()).
 */
void
clear_pcb_flags(struct pcb *pcb, const u_int flags)
{

	__asm __volatile("andl %1,%0"
	    : "=m" (pcb->pcb_flags) : "ir" (~flags), "m" (pcb->pcb_flags)
	    : "cc", "memory");
}
2662
2663 #ifdef KDB
2664
2665 /*
2666 * Provide inb() and outb() as functions. They are normally only available as
2667 * inline functions, thus cannot be called from the debugger.
2668 */
2669
2670 /* silence compiler warnings */
2671 u_char inb_(u_short);
2672 void outb_(u_short, u_char);
2673
u_char
inb_(u_short port)
{
	/* Out-of-line wrapper so the debugger can call the inline inb(). */
	return inb(port);
}
2679
void
outb_(u_short port, u_char data)
{
	/* Out-of-line wrapper so the debugger can call the inline outb(). */
	outb(port, data);
}
2685
2686 #endif /* KDB */
2687
2688 #undef memset
2689 #undef memmove
2690 #undef memcpy
2691
void *memset_std(void *buf, int c, size_t len);
void *memset_erms(void *buf, int c, size_t len);
/*
 * ifunc resolver: pick the ERMS-based memset when the CPU advertises
 * Enhanced REP MOVSB/STOSB, otherwise the generic implementation.
 */
DEFINE_IFUNC(, void *, memset, (void *, int, size_t), static)
{

	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
	    memset_erms : memset_std);
}
2700
void *memmove_std(void * _Nonnull dst, const void * _Nonnull src, size_t len);
void *memmove_erms(void * _Nonnull dst, const void * _Nonnull src, size_t len);
/*
 * ifunc resolver: pick the ERMS-based memmove when the CPU advertises
 * Enhanced REP MOVSB/STOSB, otherwise the generic implementation.
 */
DEFINE_IFUNC(, void *, memmove, (void * _Nonnull, const void * _Nonnull, size_t), static)
{

	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
	    memmove_erms : memmove_std);
}
2709
void *memcpy_std(void * _Nonnull dst, const void * _Nonnull src, size_t len);
void *memcpy_erms(void * _Nonnull dst, const void * _Nonnull src, size_t len);
/*
 * ifunc resolver: pick the ERMS-based memcpy when the CPU advertises
 * Enhanced REP MOVSB/STOSB, otherwise the generic implementation.
 */
DEFINE_IFUNC(, void *, memcpy, (void * _Nonnull, const void * _Nonnull, size_t), static)
{

	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
	    memcpy_erms : memcpy_std);
}
2718
void pagezero_std(void *addr);
void pagezero_erms(void *addr);
/*
 * ifunc resolver: pick the ERMS-based page-zeroing routine when the
 * CPU advertises Enhanced REP MOVSB/STOSB, otherwise the generic one.
 */
DEFINE_IFUNC(, void , pagezero, (void *), static)
{

	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
	    pagezero_erms : pagezero_std);
}
Cache object: 43bb14add3851bc6e1fa21517d4bc28d
|