1 /*-
2 * SPDX-License-Identifier: BSD-4-Clause
3 *
4 * Copyright (c) 2003 Peter Wemm.
5 * Copyright (c) 1992 Terrence R. Lambert.
6 * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
7 * All rights reserved.
8 *
9 * This code is derived from software contributed to Berkeley by
10 * William Jolitz.
11 *
12 * Redistribution and use in source and binary forms, with or without
13 * modification, are permitted provided that the following conditions
14 * are met:
15 * 1. Redistributions of source code must retain the above copyright
16 * notice, this list of conditions and the following disclaimer.
17 * 2. Redistributions in binary form must reproduce the above copyright
18 * notice, this list of conditions and the following disclaimer in the
19 * documentation and/or other materials provided with the distribution.
20 * 3. All advertising materials mentioning features or use of this software
21 * must display the following acknowledgement:
22 * This product includes software developed by the University of
23 * California, Berkeley and its contributors.
24 * 4. Neither the name of the University nor the names of its contributors
25 * may be used to endorse or promote products derived from this software
26 * without specific prior written permission.
27 *
28 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
29 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
30 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
31 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
32 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
33 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
34 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
35 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
37 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
38 * SUCH DAMAGE.
39 *
40 * from: @(#)machdep.c 7.4 (Berkeley) 6/3/91
41 */
42
43 #include <sys/cdefs.h>
44 __FBSDID("$FreeBSD$");
45
46 #include "opt_atpic.h"
47 #include "opt_cpu.h"
48 #include "opt_ddb.h"
49 #include "opt_inet.h"
50 #include "opt_isa.h"
51 #include "opt_kstack_pages.h"
52 #include "opt_maxmem.h"
53 #include "opt_mp_watchdog.h"
54 #include "opt_pci.h"
55 #include "opt_platform.h"
56 #include "opt_sched.h"
57
58 #include <sys/param.h>
59 #include <sys/proc.h>
60 #include <sys/systm.h>
61 #include <sys/bio.h>
62 #include <sys/buf.h>
63 #include <sys/bus.h>
64 #include <sys/callout.h>
65 #include <sys/cons.h>
66 #include <sys/cpu.h>
67 #include <sys/efi.h>
68 #include <sys/eventhandler.h>
69 #include <sys/exec.h>
70 #include <sys/imgact.h>
71 #include <sys/kdb.h>
72 #include <sys/kernel.h>
73 #include <sys/ktr.h>
74 #include <sys/linker.h>
75 #include <sys/lock.h>
76 #include <sys/malloc.h>
77 #include <sys/memrange.h>
78 #include <sys/msgbuf.h>
79 #include <sys/mutex.h>
80 #include <sys/pcpu.h>
81 #include <sys/ptrace.h>
82 #include <sys/reboot.h>
83 #include <sys/rwlock.h>
84 #include <sys/sched.h>
85 #include <sys/signalvar.h>
86 #ifdef SMP
87 #include <sys/smp.h>
88 #endif
89 #include <sys/syscallsubr.h>
90 #include <sys/sysctl.h>
91 #include <sys/sysent.h>
92 #include <sys/sysproto.h>
93 #include <sys/ucontext.h>
94 #include <sys/vmmeter.h>
95
96 #include <vm/vm.h>
97 #include <vm/vm_extern.h>
98 #include <vm/vm_kern.h>
99 #include <vm/vm_page.h>
100 #include <vm/vm_map.h>
101 #include <vm/vm_object.h>
102 #include <vm/vm_pager.h>
103 #include <vm/vm_param.h>
104 #include <vm/vm_phys.h>
105
106 #ifdef DDB
107 #ifndef KDB
108 #error KDB must be enabled in order for DDB to work!
109 #endif
110 #include <ddb/ddb.h>
111 #include <ddb/db_sym.h>
112 #endif
113
114 #include <net/netisr.h>
115
116 #include <machine/clock.h>
117 #include <machine/cpu.h>
118 #include <machine/cputypes.h>
119 #include <machine/frame.h>
120 #include <machine/intr_machdep.h>
121 #include <x86/mca.h>
122 #include <machine/md_var.h>
123 #include <machine/metadata.h>
124 #include <machine/mp_watchdog.h>
125 #include <machine/pc/bios.h>
126 #include <machine/pcb.h>
127 #include <machine/proc.h>
128 #include <machine/reg.h>
129 #include <machine/sigframe.h>
130 #include <machine/specialreg.h>
131 #include <machine/trap.h>
132 #include <machine/tss.h>
133 #include <x86/ucode.h>
134 #include <x86/ifunc.h>
135 #ifdef SMP
136 #include <machine/smp.h>
137 #endif
138 #ifdef FDT
139 #include <x86/fdt.h>
140 #endif
141
142 #ifdef DEV_ATPIC
143 #include <x86/isa/icu.h>
144 #else
145 #include <x86/apicvar.h>
146 #endif
147
148 #include <isa/isareg.h>
149 #include <isa/rtc.h>
150 #include <x86/init.h>
151
152 /* Sanity check for __curthread() */
153 CTASSERT(offsetof(struct pcpu, pc_curthread) == 0);
154
155 /*
156 * The PTI trampoline stack needs enough space for a hardware trapframe and a
157 * couple of scratch registers, as well as the trapframe left behind after an
158 * iret fault.
159 */
160 CTASSERT(PC_PTI_STACK_SZ * sizeof(register_t) >= 2 * sizeof(struct pti_frame) -
161 offsetof(struct pti_frame, pti_rip));
162
163 extern u_int64_t hammer_time(u_int64_t, u_int64_t);
164
165 #define CS_SECURE(cs) (ISPL(cs) == SEL_UPL)
166 #define EFL_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
167
168 static void cpu_startup(void *);
169 static void get_fpcontext(struct thread *td, mcontext_t *mcp,
170 char *xfpusave, size_t xfpusave_len);
171 static int set_fpcontext(struct thread *td, mcontext_t *mcp,
172 char *xfpustate, size_t xfpustate_len);
173 SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL);
174
175 /* Preload data parse function */
176 static caddr_t native_parse_preload_data(u_int64_t);
177
178 /* Native function to fetch and parse the e820 map */
179 static void native_parse_memmap(caddr_t, vm_paddr_t *, int *);
180
181 /* Default init_ops implementation. */
182 struct init_ops init_ops = {
183 .parse_preload_data = native_parse_preload_data,
184 .early_clock_source_init = i8254_init,
185 .early_delay = i8254_delay,
186 .parse_memmap = native_parse_memmap,
187 #ifdef SMP
188 .mp_bootaddress = mp_bootaddress,
189 .start_all_aps = native_start_all_aps,
190 #endif
191 #ifdef DEV_PCI
192 .msi_init = msi_init,
193 #endif
194 };
195
196 /*
197 * Physical address of the EFI System Table. Stashed from the metadata hints
198 * passed into the kernel and used by the EFI code to call runtime services.
199 */
200 vm_paddr_t efi_systbl_phys;
201
/* Intel ICH registers */
#define ICH_PMBASE	0x400
/*
 * Parenthesize the expansion so the macro is safe in any expression
 * context (the unparenthesized form mis-evaluates under operators of
 * higher precedence, e.g. multiplication).
 */
#define ICH_SMI_EN	(ICH_PMBASE + 0x30)
205
/* User-mode segment selector values, loaded into user trapframes below. */
int _udatasel, _ucodesel, _ucode32sel, _ufssel, _ugssel;

/* Early-boot flag; nonzero until the system is up (cleared elsewhere). */
int cold = 1;

long Maxmem = 0;		/* top of usable physical memory, in pages */
long realmem = 0;		/* real memory size, in pages (set in cpu_startup()) */

/*
 * The number of PHYSMAP entries must be one less than the number of
 * PHYSSEG entries because the PHYSMAP entry that spans the largest
 * physical address that is accessible by ISA DMA is split into two
 * PHYSSEG entries.
 */
#define PHYSMAP_SIZE	(2 * (VM_PHYSSEG_MAX - 1))

vm_paddr_t phys_avail[PHYSMAP_SIZE + 2];
vm_paddr_t dump_avail[PHYSMAP_SIZE + 2];

/* must be 2 less so 0 0 can signal end of chunks */
#define PHYS_AVAIL_ARRAY_END (nitems(phys_avail) - 2)
#define DUMP_AVAIL_ARRAY_END (nitems(dump_avail) - 2)

struct kva_md_info kmi;		/* kernel submap layout, filled by vm_ksubmap_init() */

static struct trapframe proc0_tf;	/* initial trapframe for proc0 */
struct region_descriptor r_idt;		/* limit/base pair loaded with lidt */

struct pcpu *__pcpu;
struct pcpu temp_bsp_pcpu;	/* BSP per-CPU data used before the real one exists */

struct mtx icu_lock;		/* protects interrupt controller access */

struct mem_range_softc mem_range_softc;

struct mtx dt_lock;	/* lock for GDT and LDT */

/* Hook invoked on resume; presumably set by vmm(4) when loaded — set externally. */
void (*vmm_resume_p)(void);
243
244 static void
245 cpu_startup(dummy)
246 void *dummy;
247 {
248 uintmax_t memsize;
249 char *sysenv;
250
251 /*
252 * On MacBooks, we need to disallow the legacy USB circuit to
253 * generate an SMI# because this can cause several problems,
254 * namely: incorrect CPU frequency detection and failure to
255 * start the APs.
256 * We do this by disabling a bit in the SMI_EN (SMI Control and
257 * Enable register) of the Intel ICH LPC Interface Bridge.
258 */
259 sysenv = kern_getenv("smbios.system.product");
260 if (sysenv != NULL) {
261 if (strncmp(sysenv, "MacBook1,1", 10) == 0 ||
262 strncmp(sysenv, "MacBook3,1", 10) == 0 ||
263 strncmp(sysenv, "MacBook4,1", 10) == 0 ||
264 strncmp(sysenv, "MacBookPro1,1", 13) == 0 ||
265 strncmp(sysenv, "MacBookPro1,2", 13) == 0 ||
266 strncmp(sysenv, "MacBookPro3,1", 13) == 0 ||
267 strncmp(sysenv, "MacBookPro4,1", 13) == 0 ||
268 strncmp(sysenv, "Macmini1,1", 10) == 0) {
269 if (bootverbose)
270 printf("Disabling LEGACY_USB_EN bit on "
271 "Intel ICH.\n");
272 outl(ICH_SMI_EN, inl(ICH_SMI_EN) & ~0x8);
273 }
274 freeenv(sysenv);
275 }
276
277 /*
278 * Good {morning,afternoon,evening,night}.
279 */
280 startrtclock();
281 printcpuinfo();
282
283 /*
284 * Display physical memory if SMBIOS reports reasonable amount.
285 */
286 memsize = 0;
287 sysenv = kern_getenv("smbios.memory.enabled");
288 if (sysenv != NULL) {
289 memsize = (uintmax_t)strtoul(sysenv, (char **)NULL, 10) << 10;
290 freeenv(sysenv);
291 }
292 if (memsize < ptoa((uintmax_t)vm_free_count()))
293 memsize = ptoa((uintmax_t)Maxmem);
294 printf("real memory = %ju (%ju MB)\n", memsize, memsize >> 20);
295 realmem = atop(memsize);
296
297 /*
298 * Display any holes after the first chunk of extended memory.
299 */
300 if (bootverbose) {
301 int indx;
302
303 printf("Physical memory chunk(s):\n");
304 for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) {
305 vm_paddr_t size;
306
307 size = phys_avail[indx + 1] - phys_avail[indx];
308 printf(
309 "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n",
310 (uintmax_t)phys_avail[indx],
311 (uintmax_t)phys_avail[indx + 1] - 1,
312 (uintmax_t)size, (uintmax_t)size / PAGE_SIZE);
313 }
314 }
315
316 vm_ksubmap_init(&kmi);
317
318 printf("avail memory = %ju (%ju MB)\n",
319 ptoa((uintmax_t)vm_free_count()),
320 ptoa((uintmax_t)vm_free_count()) / 1048576);
321 #ifdef DEV_PCI
322 if (bootverbose && intel_graphics_stolen_base != 0)
323 printf("intel stolen mem: base %#jx size %ju MB\n",
324 (uintmax_t)intel_graphics_stolen_base,
325 (uintmax_t)intel_graphics_stolen_size / 1024 / 1024);
326 #endif
327
328 /*
329 * Set up buffers, so they can be used to read disk labels.
330 */
331 bufinit();
332 vm_pager_bufferinit();
333
334 cpu_setregs();
335 }
336
337 /*
338 * Send an interrupt to process.
339 *
340 * Stack is set up to allow sigcode stored
341 * at top to call routine, followed by call
342 * to sigreturn routine below. After sigreturn
343 * resets the signal mask, the stack, and the
344 * frame pointer, it returns to the user
345 * specified pc, psl.
346 */
void
sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
{
	struct sigframe sf, *sfp;
	struct pcb *pcb;
	struct proc *p;
	struct thread *td;
	struct sigacts *psp;
	char *sp;
	struct trapframe *regs;
	char *xfpusave;
	size_t xfpusave_len;
	int sig;
	int oonstack;

	td = curthread;
	pcb = td->td_pcb;
	p = td->td_proc;
	PROC_LOCK_ASSERT(p, MA_OWNED);
	sig = ksi->ksi_signo;
	psp = p->p_sigacts;
	mtx_assert(&psp->ps_mtx, MA_OWNED);
	regs = td->td_frame;
	oonstack = sigonstack(regs->tf_rsp);

	/*
	 * If the CPU has extended FPU state beyond the legacy savefpu
	 * area and XSAVE is in use, stage that extra state in a kernel
	 * stack buffer so it can be copied out after the sigframe.
	 */
	if (cpu_max_ext_state_size > sizeof(struct savefpu) && use_xsave) {
		xfpusave_len = cpu_max_ext_state_size - sizeof(struct savefpu);
		xfpusave = __builtin_alloca(xfpusave_len);
	} else {
		xfpusave_len = 0;
		xfpusave = NULL;
	}

	/* Save user context. */
	bzero(&sf, sizeof(sf));
	sf.sf_uc.uc_sigmask = *mask;
	sf.sf_uc.uc_stack = td->td_sigstk;
	sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
	    ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE;
	sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0;
	/* The trapframe is stored starting at mc_rdi in the mcontext. */
	bcopy(regs, &sf.sf_uc.uc_mcontext.mc_rdi, sizeof(*regs));
	sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext); /* magic */
	get_fpcontext(td, &sf.sf_uc.uc_mcontext, xfpusave, xfpusave_len);
	fpstate_drop(td);
	update_pcb_bases(pcb);
	sf.sf_uc.uc_mcontext.mc_fsbase = pcb->pcb_fsbase;
	sf.sf_uc.uc_mcontext.mc_gsbase = pcb->pcb_gsbase;
	/* Don't leak kernel stack garbage through the spare fields. */
	bzero(sf.sf_uc.uc_mcontext.mc_spare,
	    sizeof(sf.sf_uc.uc_mcontext.mc_spare));
	bzero(sf.sf_uc.__spare__, sizeof(sf.sf_uc.__spare__));

	/* Allocate space for the signal handler context. */
	if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack &&
	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
		sp = (char *)td->td_sigstk.ss_sp + td->td_sigstk.ss_size;
#if defined(COMPAT_43)
		td->td_sigstk.ss_flags |= SS_ONSTACK;
#endif
	} else
		/* Skip 128 bytes below %rsp (the ABI red zone). */
		sp = (char *)regs->tf_rsp - 128;
	if (xfpusave != NULL) {
		sp -= xfpusave_len;
		/* Extended FPU save area must be 64-byte aligned. */
		sp = (char *)((unsigned long)sp & ~0x3Ful);
		sf.sf_uc.uc_mcontext.mc_xfpustate = (register_t)sp;
	}
	sp -= sizeof(struct sigframe);
	/* Align to 16 bytes. */
	sfp = (struct sigframe *)((unsigned long)sp & ~0xFul);

	/* Build the argument list for the signal handler. */
	regs->tf_rdi = sig;			/* arg 1 in %rdi */
	regs->tf_rdx = (register_t)&sfp->sf_uc;	/* arg 3 in %rdx */
	bzero(&sf.sf_si, sizeof(sf.sf_si));
	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
		/* Signal handler installed with SA_SIGINFO. */
		regs->tf_rsi = (register_t)&sfp->sf_si;	/* arg 2 in %rsi */
		sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher;

		/* Fill in POSIX parts */
		sf.sf_si = ksi->ksi_info;
		sf.sf_si.si_signo = sig; /* maybe a translated signal */
		regs->tf_rcx = (register_t)ksi->ksi_addr; /* arg 4 in %rcx */
	} else {
		/* Old FreeBSD-style arguments. */
		regs->tf_rsi = ksi->ksi_code;	/* arg 2 in %rsi */
		regs->tf_rcx = (register_t)ksi->ksi_addr; /* arg 4 in %rcx */
		sf.sf_ahu.sf_handler = catcher;
	}
	/* Drop the locks before copyout(); it may fault on user memory. */
	mtx_unlock(&psp->ps_mtx);
	PROC_UNLOCK(p);

	/*
	 * Copy the sigframe out to the user's stack.
	 */
	if (copyout(&sf, sfp, sizeof(*sfp)) != 0 ||
	    (xfpusave != NULL && copyout(xfpusave,
	    (void *)sf.sf_uc.uc_mcontext.mc_xfpustate, xfpusave_len)
	    != 0)) {
#ifdef DEBUG
		printf("process %ld has trashed its stack\n", (long)p->p_pid);
#endif
		/* Unwritable signal stack: the process cannot continue. */
		PROC_LOCK(p);
		sigexit(td, SIGILL);
	}

	/* Redirect the thread to the signal trampoline/handler. */
	regs->tf_rsp = (long)sfp;
	regs->tf_rip = p->p_sysent->sv_sigcode_base;
	regs->tf_rflags &= ~(PSL_T | PSL_D);
	regs->tf_cs = _ucodesel;
	regs->tf_ds = _udatasel;
	regs->tf_ss = _udatasel;
	regs->tf_es = _udatasel;
	regs->tf_fs = _ufssel;
	regs->tf_gs = _ugssel;
	regs->tf_flags = TF_HASSEGS;
	/* Reacquire the locks the caller expects to still hold. */
	PROC_LOCK(p);
	mtx_lock(&psp->ps_mtx);
}
465
466 /*
467 * System call to cleanup state after a signal
468 * has been taken. Reset signal mask and
469 * stack state from context left by sendsig (above).
470 * Return to previous pc and psl as specified by
471 * context left by sendsig. Check carefully to
472 * make sure that the user has not modified the
473 * state to gain improper privileges.
474 *
475 * MPSAFE
476 */
477 int
478 sys_sigreturn(td, uap)
479 struct thread *td;
480 struct sigreturn_args /* {
481 const struct __ucontext *sigcntxp;
482 } */ *uap;
483 {
484 ucontext_t uc;
485 struct pcb *pcb;
486 struct proc *p;
487 struct trapframe *regs;
488 ucontext_t *ucp;
489 char *xfpustate;
490 size_t xfpustate_len;
491 long rflags;
492 int cs, error, ret;
493 ksiginfo_t ksi;
494
495 pcb = td->td_pcb;
496 p = td->td_proc;
497
498 error = copyin(uap->sigcntxp, &uc, sizeof(uc));
499 if (error != 0) {
500 uprintf("pid %d (%s): sigreturn copyin failed\n",
501 p->p_pid, td->td_name);
502 return (error);
503 }
504 ucp = &uc;
505 if ((ucp->uc_mcontext.mc_flags & ~_MC_FLAG_MASK) != 0) {
506 uprintf("pid %d (%s): sigreturn mc_flags %x\n", p->p_pid,
507 td->td_name, ucp->uc_mcontext.mc_flags);
508 return (EINVAL);
509 }
510 regs = td->td_frame;
511 rflags = ucp->uc_mcontext.mc_rflags;
512 /*
513 * Don't allow users to change privileged or reserved flags.
514 */
515 if (!EFL_SECURE(rflags, regs->tf_rflags)) {
516 uprintf("pid %d (%s): sigreturn rflags = 0x%lx\n", p->p_pid,
517 td->td_name, rflags);
518 return (EINVAL);
519 }
520
521 /*
522 * Don't allow users to load a valid privileged %cs. Let the
523 * hardware check for invalid selectors, excess privilege in
524 * other selectors, invalid %eip's and invalid %esp's.
525 */
526 cs = ucp->uc_mcontext.mc_cs;
527 if (!CS_SECURE(cs)) {
528 uprintf("pid %d (%s): sigreturn cs = 0x%x\n", p->p_pid,
529 td->td_name, cs);
530 ksiginfo_init_trap(&ksi);
531 ksi.ksi_signo = SIGBUS;
532 ksi.ksi_code = BUS_OBJERR;
533 ksi.ksi_trapno = T_PROTFLT;
534 ksi.ksi_addr = (void *)regs->tf_rip;
535 trapsignal(td, &ksi);
536 return (EINVAL);
537 }
538
539 if ((uc.uc_mcontext.mc_flags & _MC_HASFPXSTATE) != 0) {
540 xfpustate_len = uc.uc_mcontext.mc_xfpustate_len;
541 if (xfpustate_len > cpu_max_ext_state_size -
542 sizeof(struct savefpu)) {
543 uprintf("pid %d (%s): sigreturn xfpusave_len = 0x%zx\n",
544 p->p_pid, td->td_name, xfpustate_len);
545 return (EINVAL);
546 }
547 xfpustate = __builtin_alloca(xfpustate_len);
548 error = copyin((const void *)uc.uc_mcontext.mc_xfpustate,
549 xfpustate, xfpustate_len);
550 if (error != 0) {
551 uprintf(
552 "pid %d (%s): sigreturn copying xfpustate failed\n",
553 p->p_pid, td->td_name);
554 return (error);
555 }
556 } else {
557 xfpustate = NULL;
558 xfpustate_len = 0;
559 }
560 ret = set_fpcontext(td, &ucp->uc_mcontext, xfpustate, xfpustate_len);
561 if (ret != 0) {
562 uprintf("pid %d (%s): sigreturn set_fpcontext err %d\n",
563 p->p_pid, td->td_name, ret);
564 return (ret);
565 }
566 bcopy(&ucp->uc_mcontext.mc_rdi, regs, sizeof(*regs));
567 update_pcb_bases(pcb);
568 pcb->pcb_fsbase = ucp->uc_mcontext.mc_fsbase;
569 pcb->pcb_gsbase = ucp->uc_mcontext.mc_gsbase;
570
571 #if defined(COMPAT_43)
572 if (ucp->uc_mcontext.mc_onstack & 1)
573 td->td_sigstk.ss_flags |= SS_ONSTACK;
574 else
575 td->td_sigstk.ss_flags &= ~SS_ONSTACK;
576 #endif
577
578 kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0);
579 return (EJUSTRETURN);
580 }
581
582 #ifdef COMPAT_FREEBSD4
/*
 * FreeBSD 4.x compatibility shim: the 4.x ucontext layout is accepted
 * by the native sigreturn path, so simply forward the call.
 * Parenthesized the return value per style(9), matching the rest of
 * this file.
 */
int
freebsd4_sigreturn(struct thread *td, struct freebsd4_sigreturn_args *uap)
{

	return (sys_sigreturn(td, (struct sigreturn_args *)uap));
}
589 #endif
590
591 /*
592 * Reset the hardware debug registers if they were in use.
593 * They won't have any meaning for the newly exec'd process.
594 */
595 void
596 x86_clear_dbregs(struct pcb *pcb)
597 {
598 if ((pcb->pcb_flags & PCB_DBREGS) == 0)
599 return;
600
601 pcb->pcb_dr0 = 0;
602 pcb->pcb_dr1 = 0;
603 pcb->pcb_dr2 = 0;
604 pcb->pcb_dr3 = 0;
605 pcb->pcb_dr6 = 0;
606 pcb->pcb_dr7 = 0;
607
608 if (pcb == curpcb) {
609 /*
610 * Clear the debug registers on the running CPU,
611 * otherwise they will end up affecting the next
612 * process we switch to.
613 */
614 reset_dbregs();
615 }
616 clear_pcb_flags(pcb, PCB_DBREGS);
617 }
618
619 /*
620 * Reset registers to default values on exec.
621 */
void
exec_setregs(struct thread *td, struct image_params *imgp, u_long stack)
{
	struct trapframe *regs;
	struct pcb *pcb;
	register_t saved_rflags;

	regs = td->td_frame;
	pcb = td->td_pcb;

	/* Discard any per-process LDT inherited from the previous image. */
	if (td->td_proc->p_md.md_ldt != NULL)
		user_ldt_free(td);

	update_pcb_bases(pcb);
	pcb->pcb_fsbase = 0;
	pcb->pcb_gsbase = 0;
	clear_pcb_flags(pcb, PCB_32BIT);
	pcb->pcb_initial_fpucw = __INITIAL_FPUCW__;

	/* Carry only PSL_T (trace flag) over from the old frame. */
	saved_rflags = regs->tf_rflags & PSL_T;
	bzero((char *)regs, sizeof(struct trapframe));
	regs->tf_rip = imgp->entry_addr;
	/*
	 * (stack - 8) & ~0xF rounds down to 16, then +8 leaves
	 * %rsp == 8 (mod 16) — presumably mimicking a post-call frame;
	 * confirm against the amd64 ABI if changing.
	 */
	regs->tf_rsp = ((stack - 8) & ~0xFul) + 8;
	regs->tf_rdi = stack;		/* argv */
	regs->tf_rflags = PSL_USER | saved_rflags;
	regs->tf_ss = _udatasel;
	regs->tf_cs = _ucodesel;
	regs->tf_ds = _udatasel;
	regs->tf_es = _udatasel;
	regs->tf_fs = _ufssel;
	regs->tf_gs = _ugssel;
	regs->tf_flags = TF_HASSEGS;

	x86_clear_dbregs(pcb);

	/*
	 * Drop the FP state if we hold it, so that the process gets a
	 * clean FP state if it uses the FPU again.
	 */
	fpstate_drop(td);
}
663
664 void
665 cpu_setregs(void)
666 {
667 register_t cr0;
668
669 cr0 = rcr0();
670 /*
671 * CR0_MP, CR0_NE and CR0_TS are also set by npx_probe() for the
672 * BSP. See the comments there about why we set them.
673 */
674 cr0 |= CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM;
675 load_cr0(cr0);
676 }
677
678 /*
679 * Initialize amd64 and configure to run kernel
680 */
681
682 /*
683 * Initialize segments & interrupt table
684 */
685
struct user_segment_descriptor gdt[NGDT * MAXCPU];/* global descriptor tables */
static struct gate_descriptor idt0[NIDT];
struct gate_descriptor *idt = &idt0[0];	/* interrupt descriptor table */

/* Dedicated stacks for the double-fault, MC, NMI and debug handlers. */
static char dblfault_stack[DBLFAULT_STACK_SIZE] __aligned(16);
static char mce0_stack[MCE_STACK_SIZE] __aligned(16);
static char nmi0_stack[NMI_STACK_SIZE] __aligned(16);
static char dbg0_stack[DBG_STACK_SIZE] __aligned(16);
CTASSERT(sizeof(struct nmi_pcpu) == 16);

struct amd64tss common_tss[MAXCPU];	/* one TSS slot per CPU */
697
698 /*
699 * Software prototypes -- in more palatable form.
700 *
701 * Keep GUFS32, GUGS32, GUCODE32 and GUDATA at the same
702 * slots as corresponding segments for i386 kernel.
703 */
704 struct soft_segment_descriptor gdt_segs[] = {
705 /* GNULL_SEL 0 Null Descriptor */
706 { .ssd_base = 0x0,
707 .ssd_limit = 0x0,
708 .ssd_type = 0,
709 .ssd_dpl = 0,
710 .ssd_p = 0,
711 .ssd_long = 0,
712 .ssd_def32 = 0,
713 .ssd_gran = 0 },
714 /* GNULL2_SEL 1 Null Descriptor */
715 { .ssd_base = 0x0,
716 .ssd_limit = 0x0,
717 .ssd_type = 0,
718 .ssd_dpl = 0,
719 .ssd_p = 0,
720 .ssd_long = 0,
721 .ssd_def32 = 0,
722 .ssd_gran = 0 },
723 /* GUFS32_SEL 2 32 bit %gs Descriptor for user */
724 { .ssd_base = 0x0,
725 .ssd_limit = 0xfffff,
726 .ssd_type = SDT_MEMRWA,
727 .ssd_dpl = SEL_UPL,
728 .ssd_p = 1,
729 .ssd_long = 0,
730 .ssd_def32 = 1,
731 .ssd_gran = 1 },
732 /* GUGS32_SEL 3 32 bit %fs Descriptor for user */
733 { .ssd_base = 0x0,
734 .ssd_limit = 0xfffff,
735 .ssd_type = SDT_MEMRWA,
736 .ssd_dpl = SEL_UPL,
737 .ssd_p = 1,
738 .ssd_long = 0,
739 .ssd_def32 = 1,
740 .ssd_gran = 1 },
741 /* GCODE_SEL 4 Code Descriptor for kernel */
742 { .ssd_base = 0x0,
743 .ssd_limit = 0xfffff,
744 .ssd_type = SDT_MEMERA,
745 .ssd_dpl = SEL_KPL,
746 .ssd_p = 1,
747 .ssd_long = 1,
748 .ssd_def32 = 0,
749 .ssd_gran = 1 },
750 /* GDATA_SEL 5 Data Descriptor for kernel */
751 { .ssd_base = 0x0,
752 .ssd_limit = 0xfffff,
753 .ssd_type = SDT_MEMRWA,
754 .ssd_dpl = SEL_KPL,
755 .ssd_p = 1,
756 .ssd_long = 1,
757 .ssd_def32 = 0,
758 .ssd_gran = 1 },
759 /* GUCODE32_SEL 6 32 bit Code Descriptor for user */
760 { .ssd_base = 0x0,
761 .ssd_limit = 0xfffff,
762 .ssd_type = SDT_MEMERA,
763 .ssd_dpl = SEL_UPL,
764 .ssd_p = 1,
765 .ssd_long = 0,
766 .ssd_def32 = 1,
767 .ssd_gran = 1 },
768 /* GUDATA_SEL 7 32/64 bit Data Descriptor for user */
769 { .ssd_base = 0x0,
770 .ssd_limit = 0xfffff,
771 .ssd_type = SDT_MEMRWA,
772 .ssd_dpl = SEL_UPL,
773 .ssd_p = 1,
774 .ssd_long = 0,
775 .ssd_def32 = 1,
776 .ssd_gran = 1 },
777 /* GUCODE_SEL 8 64 bit Code Descriptor for user */
778 { .ssd_base = 0x0,
779 .ssd_limit = 0xfffff,
780 .ssd_type = SDT_MEMERA,
781 .ssd_dpl = SEL_UPL,
782 .ssd_p = 1,
783 .ssd_long = 1,
784 .ssd_def32 = 0,
785 .ssd_gran = 1 },
786 /* GPROC0_SEL 9 Proc 0 Tss Descriptor */
787 { .ssd_base = 0x0,
788 .ssd_limit = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE - 1,
789 .ssd_type = SDT_SYSTSS,
790 .ssd_dpl = SEL_KPL,
791 .ssd_p = 1,
792 .ssd_long = 0,
793 .ssd_def32 = 0,
794 .ssd_gran = 0 },
795 /* Actually, the TSS is a system descriptor which is double size */
796 { .ssd_base = 0x0,
797 .ssd_limit = 0x0,
798 .ssd_type = 0,
799 .ssd_dpl = 0,
800 .ssd_p = 0,
801 .ssd_long = 0,
802 .ssd_def32 = 0,
803 .ssd_gran = 0 },
804 /* GUSERLDT_SEL 11 LDT Descriptor */
805 { .ssd_base = 0x0,
806 .ssd_limit = 0x0,
807 .ssd_type = 0,
808 .ssd_dpl = 0,
809 .ssd_p = 0,
810 .ssd_long = 0,
811 .ssd_def32 = 0,
812 .ssd_gran = 0 },
813 /* GUSERLDT_SEL 12 LDT Descriptor, double size */
814 { .ssd_base = 0x0,
815 .ssd_limit = 0x0,
816 .ssd_type = 0,
817 .ssd_dpl = 0,
818 .ssd_p = 0,
819 .ssd_long = 0,
820 .ssd_def32 = 0,
821 .ssd_gran = 0 },
822 };
823 _Static_assert(nitems(gdt_segs) == NGDT, "Stale NGDT");
824
825 void
826 setidt(int idx, inthand_t *func, int typ, int dpl, int ist)
827 {
828 struct gate_descriptor *ip;
829
830 ip = idt + idx;
831 ip->gd_looffset = (uintptr_t)func;
832 ip->gd_selector = GSEL(GCODE_SEL, SEL_KPL);
833 ip->gd_ist = ist;
834 ip->gd_xx = 0;
835 ip->gd_type = typ;
836 ip->gd_dpl = dpl;
837 ip->gd_p = 1;
838 ip->gd_hioffset = ((uintptr_t)func)>>16 ;
839 }
840
841 extern inthand_t
842 IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
843 IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
844 IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
845 IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align),
846 IDTVEC(xmm), IDTVEC(dblfault),
847 IDTVEC(div_pti), IDTVEC(bpt_pti),
848 IDTVEC(ofl_pti), IDTVEC(bnd_pti), IDTVEC(ill_pti), IDTVEC(dna_pti),
849 IDTVEC(fpusegm_pti), IDTVEC(tss_pti), IDTVEC(missing_pti),
850 IDTVEC(stk_pti), IDTVEC(prot_pti), IDTVEC(page_pti),
851 IDTVEC(rsvd_pti), IDTVEC(fpu_pti), IDTVEC(align_pti),
852 IDTVEC(xmm_pti),
853 #ifdef KDTRACE_HOOKS
854 IDTVEC(dtrace_ret), IDTVEC(dtrace_ret_pti),
855 #endif
856 #ifdef XENHVM
857 IDTVEC(xen_intr_upcall), IDTVEC(xen_intr_upcall_pti),
858 #endif
859 IDTVEC(fast_syscall), IDTVEC(fast_syscall32),
860 IDTVEC(fast_syscall_pti);
861
862 #ifdef DDB
863 /*
864 * Display the index and function name of any IDT entries that don't use
865 * the default 'rsvd' entry point.
866 */
867 DB_SHOW_COMMAND(idt, db_show_idt)
868 {
869 struct gate_descriptor *ip;
870 int idx;
871 uintptr_t func;
872
873 ip = idt;
874 for (idx = 0; idx < NIDT && !db_pager_quit; idx++) {
875 func = ((long)ip->gd_hioffset << 16 | ip->gd_looffset);
876 if (func != (uintptr_t)&IDTVEC(rsvd)) {
877 db_printf("%3d\t", idx);
878 db_printsym(func, DB_STGY_PROC);
879 db_printf("\n");
880 }
881 ip++;
882 }
883 }
884
885 /* Show privileged registers. */
DB_SHOW_COMMAND(sysregs, db_show_sysregs)
{
	/* sidt/sgdt store a 10-byte limit:base pair; keep it unpadded. */
	struct {
		uint16_t limit;
		uint64_t base;
	} __packed idtr, gdtr;
	uint16_t ldt, tr;

	__asm __volatile("sidt %0" : "=m" (idtr));
	db_printf("idtr\t0x%016lx/%04x\n",
	    (u_long)idtr.base, (u_int)idtr.limit);
	__asm __volatile("sgdt %0" : "=m" (gdtr));
	db_printf("gdtr\t0x%016lx/%04x\n",
	    (u_long)gdtr.base, (u_int)gdtr.limit);
	__asm __volatile("sldt %0" : "=r" (ldt));
	db_printf("ldtr\t0x%04x\n", ldt);
	__asm __volatile("str %0" : "=r" (tr));
	db_printf("tr\t0x%04x\n", tr);
	db_printf("cr0\t0x%016lx\n", rcr0());
	db_printf("cr2\t0x%016lx\n", rcr2());
	db_printf("cr3\t0x%016lx\n", rcr3());
	db_printf("cr4\t0x%016lx\n", rcr4());
	/* XCR0 can only be read when CR4.OSXSAVE is enabled. */
	if (rcr4() & CR4_XSAVE)
		db_printf("xcr0\t0x%016lx\n", rxcr(0));
	db_printf("EFER\t0x%016lx\n", rdmsr(MSR_EFER));
	/* FEATURE_CONTROL exists only with VMX or SMX support. */
	if (cpu_feature2 & (CPUID2_VMX | CPUID2_SMX))
		db_printf("FEATURES_CTL\t%016lx\n",
		    rdmsr(MSR_IA32_FEATURE_CONTROL));
	db_printf("DEBUG_CTL\t0x%016lx\n", rdmsr(MSR_DEBUGCTLMSR));
	db_printf("PAT\t0x%016lx\n", rdmsr(MSR_PAT));
	db_printf("GSBASE\t0x%016lx\n", rdmsr(MSR_GSBASE));
}
918
/* Dump the hardware debug registers of the current CPU. */
DB_SHOW_COMMAND(dbregs, db_show_dbregs)
{

	db_printf("dr0\t0x%016lx\n", rdr0());
	db_printf("dr1\t0x%016lx\n", rdr1());
	db_printf("dr2\t0x%016lx\n", rdr2());
	db_printf("dr3\t0x%016lx\n", rdr3());
	db_printf("dr6\t0x%016lx\n", rdr6());
	db_printf("dr7\t0x%016lx\n", rdr7());
}
929 #endif
930
931 void
932 sdtossd(sd, ssd)
933 struct user_segment_descriptor *sd;
934 struct soft_segment_descriptor *ssd;
935 {
936
937 ssd->ssd_base = (sd->sd_hibase << 24) | sd->sd_lobase;
938 ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit;
939 ssd->ssd_type = sd->sd_type;
940 ssd->ssd_dpl = sd->sd_dpl;
941 ssd->ssd_p = sd->sd_p;
942 ssd->ssd_long = sd->sd_long;
943 ssd->ssd_def32 = sd->sd_def32;
944 ssd->ssd_gran = sd->sd_gran;
945 }
946
947 void
948 ssdtosd(ssd, sd)
949 struct soft_segment_descriptor *ssd;
950 struct user_segment_descriptor *sd;
951 {
952
953 sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
954 sd->sd_hibase = (ssd->ssd_base >> 24) & 0xff;
955 sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
956 sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
957 sd->sd_type = ssd->ssd_type;
958 sd->sd_dpl = ssd->ssd_dpl;
959 sd->sd_p = ssd->ssd_p;
960 sd->sd_long = ssd->ssd_long;
961 sd->sd_def32 = ssd->ssd_def32;
962 sd->sd_gran = ssd->ssd_gran;
963 }
964
965 void
966 ssdtosyssd(ssd, sd)
967 struct soft_segment_descriptor *ssd;
968 struct system_segment_descriptor *sd;
969 {
970
971 sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
972 sd->sd_hibase = (ssd->ssd_base >> 24) & 0xfffffffffful;
973 sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
974 sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
975 sd->sd_type = ssd->ssd_type;
976 sd->sd_dpl = ssd->ssd_dpl;
977 sd->sd_p = ssd->ssd_p;
978 sd->sd_gran = ssd->ssd_gran;
979 }
980
981 #if !defined(DEV_ATPIC) && defined(DEV_ISA)
982 #include <isa/isavar.h>
983 #include <isa/isareg.h>
984 /*
985 * Return a bitmap of the current interrupt requests. This is 8259-specific
986 * and is only suitable for use at probe time.
987 * This is only here to pacify sio. It is NOT FATAL if this doesn't work.
988 * It shouldn't be here. There should probably be an APIC centric
989 * implementation in the apic driver code, if at all.
990 */
991 intrmask_t
992 isa_irq_pending(void)
993 {
994 u_char irr1;
995 u_char irr2;
996
997 irr1 = inb(IO_ICU1);
998 irr2 = inb(IO_ICU2);
999 return ((irr2 << 8) | irr1);
1000 }
1001 #endif
1002
1003 u_int basemem;
1004
/*
 * Insert the physical range [base, base + length) into the sorted
 * physmap array, coalescing with an adjacent entry where possible.
 * *physmap_idxp is the index of the next free (even) slot and is
 * advanced when a brand-new entry is created.  Returns 1 when the
 * caller should keep scanning (inserted, merged, or ignored), 0 when
 * the table is full and scanning should stop.
 */
static int
add_physmap_entry(uint64_t base, uint64_t length, vm_paddr_t *physmap,
    int *physmap_idxp)
{
	int i, insert_idx, physmap_idx;

	physmap_idx = *physmap_idxp;

	/* Empty ranges are silently ignored. */
	if (length == 0)
		return (1);

	/*
	 * Find insertion point while checking for overlap.  Start off by
	 * assuming the new entry will be added to the end.
	 *
	 * NB: physmap_idx points to the next free slot.
	 */
	insert_idx = physmap_idx;
	for (i = 0; i <= physmap_idx; i += 2) {
		if (base < physmap[i + 1]) {
			/* Entirely before entry i: insert here. */
			if (base + length <= physmap[i]) {
				insert_idx = i;
				break;
			}
			if (boothowto & RB_VERBOSE)
				printf(
		    "Overlapping memory regions, ignoring second region\n");
			return (1);
		}
	}

	/* See if we can prepend to the next entry. */
	if (insert_idx <= physmap_idx && base + length == physmap[insert_idx]) {
		physmap[insert_idx] = base;
		return (1);
	}

	/* See if we can append to the previous entry. */
	if (insert_idx > 0 && base == physmap[insert_idx - 1]) {
		physmap[insert_idx - 1] += length;
		return (1);
	}

	/*
	 * No merge possible: claim a new slot pair.  Note the advanced
	 * index is written back even when the table turns out to be full.
	 */
	physmap_idx += 2;
	*physmap_idxp = physmap_idx;
	if (physmap_idx == PHYSMAP_SIZE) {
		printf(
		"Too many segments in the physical address map, giving up\n");
		return (0);
	}

	/*
	 * Move the last 'N' entries down to make room for the new
	 * entry if needed.
	 */
	for (i = (physmap_idx - 2); i > insert_idx; i -= 2) {
		physmap[i] = physmap[i - 2];
		physmap[i + 1] = physmap[i - 1];
	}

	/* Insert the new entry. */
	physmap[insert_idx] = base;
	physmap[insert_idx + 1] = base + length;
	return (1);
}
1070
1071 void
1072 bios_add_smap_entries(struct bios_smap *smapbase, u_int32_t smapsize,
1073 vm_paddr_t *physmap, int *physmap_idx)
1074 {
1075 struct bios_smap *smap, *smapend;
1076
1077 smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize);
1078
1079 for (smap = smapbase; smap < smapend; smap++) {
1080 if (boothowto & RB_VERBOSE)
1081 printf("SMAP type=%02x base=%016lx len=%016lx\n",
1082 smap->type, smap->base, smap->length);
1083
1084 if (smap->type != SMAP_TYPE_MEMORY)
1085 continue;
1086
1087 if (!add_physmap_entry(smap->base, smap->length, physmap,
1088 physmap_idx))
1089 break;
1090 }
1091 }
1092
/*
 * Feed the loader-provided UEFI GetMemoryMap() data into the physmap
 * array.  Only conventional/loader/boot-services memory descriptors
 * are added; everything else (runtime services, MMIO, ACPI, ...) is
 * skipped.  With RB_VERBOSE, the full descriptor table is printed.
 */
static void
add_efi_map_entries(struct efi_map_header *efihdr, vm_paddr_t *physmap,
    int *physmap_idx)
{
	struct efi_md *map, *p;
	const char *type;
	size_t efisz;
	int ndesc, i;

	/* Human-readable names indexed by EFI memory descriptor type. */
	static const char *types[] = {
		"Reserved",
		"LoaderCode",
		"LoaderData",
		"BootServicesCode",
		"BootServicesData",
		"RuntimeServicesCode",
		"RuntimeServicesData",
		"ConventionalMemory",
		"UnusableMemory",
		"ACPIReclaimMemory",
		"ACPIMemoryNVS",
		"MemoryMappedIO",
		"MemoryMappedIOPortSpace",
		"PalCode",
		"PersistentMemory"
	};

	/*
	 * Memory map data provided by UEFI via the GetMemoryMap
	 * Boot Services API.  The descriptor array follows the header,
	 * padded out to a 16-byte boundary.
	 */
	efisz = (sizeof(struct efi_map_header) + 0xf) & ~0xf;
	map = (struct efi_md *)((uint8_t *)efihdr + efisz);

	/* Guard against division by zero on a malformed header. */
	if (efihdr->descriptor_size == 0)
		return;
	ndesc = efihdr->memory_size / efihdr->descriptor_size;

	if (boothowto & RB_VERBOSE)
		printf("%23s %12s %12s %8s %4s\n",
		    "Type", "Physical", "Virtual", "#Pages", "Attr");

	/*
	 * Walk descriptors by descriptor_size (which may be larger than
	 * sizeof(struct efi_md)) rather than by array indexing.
	 */
	for (i = 0, p = map; i < ndesc; i++,
	    p = efi_next_descriptor(p, efihdr->descriptor_size)) {
		if (boothowto & RB_VERBOSE) {
			if (p->md_type < nitems(types))
				type = types[p->md_type];
			else
				type = "<INVALID>";
			printf("%23s %012lx %12p %08lx ", type, p->md_phys,
			    p->md_virt, p->md_pages);
			if (p->md_attr & EFI_MD_ATTR_UC)
				printf("UC ");
			if (p->md_attr & EFI_MD_ATTR_WC)
				printf("WC ");
			if (p->md_attr & EFI_MD_ATTR_WT)
				printf("WT ");
			if (p->md_attr & EFI_MD_ATTR_WB)
				printf("WB ");
			if (p->md_attr & EFI_MD_ATTR_UCE)
				printf("UCE ");
			if (p->md_attr & EFI_MD_ATTR_WP)
				printf("WP ");
			if (p->md_attr & EFI_MD_ATTR_RP)
				printf("RP ");
			if (p->md_attr & EFI_MD_ATTR_XP)
				printf("XP ");
			if (p->md_attr & EFI_MD_ATTR_NV)
				printf("NV ");
			if (p->md_attr & EFI_MD_ATTR_MORE_RELIABLE)
				printf("MORE_RELIABLE ");
			if (p->md_attr & EFI_MD_ATTR_RO)
				printf("RO ");
			if (p->md_attr & EFI_MD_ATTR_RT)
				printf("RUNTIME");
			printf("\n");
		}

		switch (p->md_type) {
		case EFI_MD_TYPE_CODE:
		case EFI_MD_TYPE_DATA:
		case EFI_MD_TYPE_BS_CODE:
		case EFI_MD_TYPE_BS_DATA:
		case EFI_MD_TYPE_FREE:
			/*
			 * We're allowed to use any entry with these types.
			 */
			break;
		default:
			continue;
		}

		/* md_pages is a page count; convert to a byte length. */
		if (!add_physmap_entry(p->md_phys, (p->md_pages * PAGE_SIZE),
		    physmap, physmap_idx))
			break;
	}
}
1190
/*
 * Firmware boot method, "BIOS" or "UEFI", set by native_parse_memmap()
 * and exported read-only as machdep.bootmethod.
 */
static char bootmethod[16] = "";
SYSCTL_STRING(_machdep, OID_AUTO, bootmethod, CTLFLAG_RD, bootmethod, 0,
    "System firmware boot method");
1194
1195 static void
1196 native_parse_memmap(caddr_t kmdp, vm_paddr_t *physmap, int *physmap_idx)
1197 {
1198 struct bios_smap *smap;
1199 struct efi_map_header *efihdr;
1200 u_int32_t size;
1201
1202 /*
1203 * Memory map from INT 15:E820.
1204 *
1205 * subr_module.c says:
1206 * "Consumer may safely assume that size value precedes data."
1207 * ie: an int32_t immediately precedes smap.
1208 */
1209
1210 efihdr = (struct efi_map_header *)preload_search_info(kmdp,
1211 MODINFO_METADATA | MODINFOMD_EFI_MAP);
1212 smap = (struct bios_smap *)preload_search_info(kmdp,
1213 MODINFO_METADATA | MODINFOMD_SMAP);
1214 if (efihdr == NULL && smap == NULL)
1215 panic("No BIOS smap or EFI map info from loader!");
1216
1217 if (efihdr != NULL) {
1218 add_efi_map_entries(efihdr, physmap, physmap_idx);
1219 strlcpy(bootmethod, "UEFI", sizeof(bootmethod));
1220 } else {
1221 size = *((u_int32_t *)smap - 1);
1222 bios_add_smap_entries(smap, size, physmap, physmap_idx);
1223 strlcpy(bootmethod, "BIOS", sizeof(bootmethod));
1224 }
1225 }
1226
/* Number of pages per gigabyte, used for memory-test progress dots. */
#define PAGES_PER_GB (1024 * 1024 * 1024 / PAGE_SIZE)

/*
 * Populate the (physmap) array with base/bound pairs describing the
 * available physical memory in the system, then test this memory and
 * build the phys_avail array describing the actually-available memory.
 *
 * Total memory size may be set by the kernel environment variable
 * hw.physmem or the compile-time define MAXMEM.
 *
 * XXX first should be vm_paddr_t.
 */
static void
getmemsize(caddr_t kmdp, u_int64_t first)
{
	int i, physmap_idx, pa_indx, da_indx;
	vm_paddr_t pa, physmap[PHYSMAP_SIZE];
	u_long physmem_start, physmem_tunable, memtest;
	pt_entry_t *pte;
	quad_t dcons_addr, dcons_size;
	int page_counter;

	/*
	 * Tell the physical memory allocator about pages used to store
	 * the kernel and preloaded data. See kmem_bootstrap_free().
	 */
	vm_phys_add_seg((vm_paddr_t)kernphys, trunc_page(first));

	bzero(physmap, sizeof(physmap));
	physmap_idx = 0;

	init_ops.parse_memmap(kmdp, physmap, &physmap_idx);
	/*
	 * parse_memmap leaves physmap_idx pointing at the next free
	 * slot; back it up so it indexes the base of the last entry
	 * (loops below iterate with i <= physmap_idx).
	 */
	physmap_idx -= 2;

	/*
	 * Find the 'base memory' segment for SMP
	 */
	basemem = 0;
	for (i = 0; i <= physmap_idx; i += 2) {
		if (physmap[i] <= 0xA0000) {
			basemem = physmap[i + 1] / 1024;
			break;
		}
	}
	if (basemem == 0 || basemem > 640) {
		if (bootverbose)
			printf(
		"Memory map doesn't contain a basemem segment, faking it");
		basemem = 640;
	}

	/*
	 * Maxmem isn't the "maximum memory", it's one larger than the
	 * highest page of the physical address space. It should be
	 * called something like "Maxphyspage". We may adjust this
	 * based on ``hw.physmem'' and the results of the memory test.
	 */
	Maxmem = atop(physmap[physmap_idx + 1]);

#ifdef MAXMEM
	/* MAXMEM is in kilobytes; /4 converts to 4K page count. */
	Maxmem = MAXMEM / 4;
#endif

	if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable))
		Maxmem = atop(physmem_tunable);

	/*
	 * The boot memory test is disabled by default, as it takes a
	 * significant amount of time on large-memory systems, and is
	 * unfriendly to virtual machines as it unnecessarily touches all
	 * pages.
	 *
	 * A general name is used as the code may be extended to support
	 * additional tests beyond the current "page present" test.
	 */
	memtest = 0;
	TUNABLE_ULONG_FETCH("hw.memtest.tests", &memtest);

	/*
	 * Don't allow MAXMEM or hw.physmem to extend the amount of memory
	 * in the system.
	 */
	if (Maxmem > atop(physmap[physmap_idx + 1]))
		Maxmem = atop(physmap[physmap_idx + 1]);

	if (atop(physmap[physmap_idx + 1]) != Maxmem &&
	    (boothowto & RB_VERBOSE))
		printf("Physical memory use set to %ldK\n", Maxmem * 4);

	/*
	 * Make hole for "AP -> long mode" bootstrap code. The
	 * mp_bootaddress vector is only available when the kernel
	 * is configured to support APs and APs for the system start
	 * in real mode mode (e.g. SMP bare metal).
	 */
	if (init_ops.mp_bootaddress)
		init_ops.mp_bootaddress(physmap, &physmap_idx);

	/* call pmap initialization to make new kernel address space */
	pmap_bootstrap(&first);

	/*
	 * Size up each available chunk of physical memory.
	 *
	 * XXX Some BIOSes corrupt low 64KB between suspend and resume.
	 * By default, mask off the first 16 pages unless we appear to be
	 * running in a VM.
	 */
	physmem_start = (vm_guest > VM_GUEST_NO ? 1 : 16) << PAGE_SHIFT;
	TUNABLE_ULONG_FETCH("hw.physmem.start", &physmem_start);
	if (physmap[0] < physmem_start) {
		if (physmem_start < PAGE_SIZE)
			physmap[0] = PAGE_SIZE;
		else if (physmem_start >= physmap[1])
			physmap[0] = round_page(physmap[1] - PAGE_SIZE);
		else
			physmap[0] = round_page(physmem_start);
	}
	pa_indx = 0;
	da_indx = 1;
	phys_avail[pa_indx++] = physmap[0];
	phys_avail[pa_indx] = physmap[0];
	dump_avail[da_indx] = physmap[0];
	/* CMAP1/CADDR1 serve as the scratch mapping for probing pages. */
	pte = CMAP1;

	/*
	 * Get dcons buffer address
	 */
	if (getenv_quad("dcons.addr", &dcons_addr) == 0 ||
	    getenv_quad("dcons.size", &dcons_size) == 0)
		dcons_addr = 0;

	/*
	 * physmap is in bytes, so when converting to page boundaries,
	 * round up the start address and round down the end address.
	 */
	page_counter = 0;
	if (memtest != 0)
		printf("Testing system memory");
	for (i = 0; i <= physmap_idx; i += 2) {
		vm_paddr_t end;

		end = ptoa((vm_paddr_t)Maxmem);
		if (physmap[i + 1] < end)
			end = trunc_page(physmap[i + 1]);
		for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) {
			int tmp, page_bad, full;
			int *ptr = (int *)CADDR1;

			full = FALSE;
			/*
			 * block out kernel memory as not available.
			 */
			if (pa >= (vm_paddr_t)kernphys && pa < first)
				goto do_dump_avail;

			/*
			 * block out dcons buffer
			 */
			if (dcons_addr > 0
			    && pa >= trunc_page(dcons_addr)
			    && pa < dcons_addr + dcons_size)
				goto do_dump_avail;

			page_bad = FALSE;
			if (memtest == 0)
				goto skip_memtest;

			/*
			 * Print a "." every GB to show we're making
			 * progress.
			 */
			page_counter++;
			if ((page_counter % PAGES_PER_GB) == 0)
				printf(".");

			/*
			 * map page into kernel: valid, read/write,non-cacheable
			 */
			*pte = pa | PG_V | PG_RW | PG_NC_PWT | PG_NC_PCD;
			invltlb();

			/* Preserve the page's contents across the test. */
			tmp = *(int *)ptr;
			/*
			 * Test for alternating 1's and 0's
			 */
			*(volatile int *)ptr = 0xaaaaaaaa;
			if (*(volatile int *)ptr != 0xaaaaaaaa)
				page_bad = TRUE;
			/*
			 * Test for alternating 0's and 1's
			 */
			*(volatile int *)ptr = 0x55555555;
			if (*(volatile int *)ptr != 0x55555555)
				page_bad = TRUE;
			/*
			 * Test for all 1's
			 */
			*(volatile int *)ptr = 0xffffffff;
			if (*(volatile int *)ptr != 0xffffffff)
				page_bad = TRUE;
			/*
			 * Test for all 0's
			 */
			*(volatile int *)ptr = 0x0;
			if (*(volatile int *)ptr != 0x0)
				page_bad = TRUE;
			/*
			 * Restore original value.
			 */
			*(int *)ptr = tmp;

skip_memtest:
			/*
			 * Adjust array of valid/good pages.
			 */
			if (page_bad == TRUE)
				continue;
			/*
			 * If this good page is a continuation of the
			 * previous set of good pages, then just increase
			 * the end pointer. Otherwise start a new chunk.
			 * Note that "end" points one higher than end,
			 * making the range >= start and < end.
			 * If we're also doing a speculative memory
			 * test and we at or past the end, bump up Maxmem
			 * so that we keep going. The first bad page
			 * will terminate the loop.
			 */
			if (phys_avail[pa_indx] == pa) {
				phys_avail[pa_indx] += PAGE_SIZE;
			} else {
				pa_indx++;
				if (pa_indx == PHYS_AVAIL_ARRAY_END) {
					printf(
		"Too many holes in the physical address space, giving up\n");
					pa_indx--;
					full = TRUE;
					goto do_dump_avail;
				}
				phys_avail[pa_indx++] = pa;	/* start */
				phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */
			}
			physmem++;
do_dump_avail:
			/* dump_avail also covers kernel/dcons pages. */
			if (dump_avail[da_indx] == pa) {
				dump_avail[da_indx] += PAGE_SIZE;
			} else {
				da_indx++;
				if (da_indx == DUMP_AVAIL_ARRAY_END) {
					da_indx--;
					goto do_next;
				}
				dump_avail[da_indx++] = pa; /* start */
				dump_avail[da_indx] = pa + PAGE_SIZE; /* end */
			}
do_next:
			if (full)
				break;
		}
	}
	/* Tear down the scratch mapping used for the test. */
	*pte = 0;
	invltlb();
	if (memtest != 0)
		printf("\n");

	/*
	 * XXX
	 * The last chunk must contain at least one page plus the message
	 * buffer to avoid complicating other code (message buffer address
	 * calculation, etc.).
	 */
	while (phys_avail[pa_indx - 1] + PAGE_SIZE +
	    round_page(msgbufsize) >= phys_avail[pa_indx]) {
		physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]);
		phys_avail[pa_indx--] = 0;
		phys_avail[pa_indx--] = 0;
	}

	Maxmem = atop(phys_avail[pa_indx]);

	/* Trim off space for the message buffer. */
	phys_avail[pa_indx] -= round_page(msgbufsize);

	/* Map the message buffer. */
	msgbufp = (struct msgbuf *)PHYS_TO_DMAP(phys_avail[pa_indx]);
}
1514
/*
 * Locate the module metadata passed in by the loader and extract the
 * early-boot parameters: boot flags, the static kernel environment,
 * the debugger symbol table bounds (DDB only) and the EFI system
 * table address.  Returns the "elf kernel" metadata handle used by
 * later preload_search_info() lookups.
 */
static caddr_t
native_parse_preload_data(u_int64_t modulep)
{
	caddr_t kmdp;
	char *envp;
#ifdef DDB
	vm_offset_t ksym_start;
	vm_offset_t ksym_end;
#endif

	preload_metadata = (caddr_t)(uintptr_t)(modulep + KERNBASE);
	preload_bootstrap_relocate(KERNBASE);
	kmdp = preload_search_by_type("elf kernel");
	if (kmdp == NULL)
		kmdp = preload_search_by_type("elf64 kernel");
	boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int);
	envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *);
	if (envp != NULL)
		/* Relocate the loader-provided pointer into the kernel map. */
		envp += KERNBASE;
	init_static_kenv(envp, 0);
#ifdef DDB
	ksym_start = MD_FETCH(kmdp, MODINFOMD_SSYM, uintptr_t);
	ksym_end = MD_FETCH(kmdp, MODINFOMD_ESYM, uintptr_t);
	db_fetch_ksymtab(ksym_start, ksym_end);
#endif
	efi_systbl_phys = MD_FETCH(kmdp, MODINFOMD_FW_HANDLE, vm_paddr_t);

	return (kmdp);
}
1544
/*
 * Initialize the kernel debugger framework and, when KDB is compiled
 * in and the boot flags request it, drop into the debugger at boot.
 */
static void
amd64_kdb_init(void)
{
	kdb_init();
#ifdef KDB
	if (boothowto & RB_KDB)
		kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger");
#endif
}
1554
/*
 * Set up the fast syscall stuff: enable SYSCALL/SYSRET in EFER and
 * program the entry points and flag mask in the STAR family of MSRs.
 */
void
amd64_conf_fast_syscall(void)
{
	uint64_t msr;

	/* Turn on the syscall extension. */
	msr = rdmsr(MSR_EFER) | EFER_SCE;
	wrmsr(MSR_EFER, msr);
	/* 64-bit syscall entry; PTI builds use the trampoline variant. */
	wrmsr(MSR_LSTAR, pti ? (u_int64_t)IDTVEC(fast_syscall_pti) :
	    (u_int64_t)IDTVEC(fast_syscall));
	/* Entry point for 32-bit compat syscalls. */
	wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32));
	/* Kernel and 32-bit user code segment selectors. */
	msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
	    ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48);
	wrmsr(MSR_STAR, msr);
	/* rflags bits cleared on syscall entry. */
	wrmsr(MSR_SF_MASK, PSL_NT | PSL_T | PSL_I | PSL_C | PSL_D | PSL_AC);
}
1571
/*
 * First-stage BSP per-CPU initialization: point the pcpu at its
 * private space, thread0, the shared TSS and the BSP's GDT descriptor
 * slots.  Runs before the IST stacks and rsp0 are configured.
 */
void
amd64_bsp_pcpu_init1(struct pcpu *pc)
{

	PCPU_SET(prvspace, pc);
	PCPU_SET(curthread, &thread0);
	PCPU_SET(tssp, &common_tss[0]);
	PCPU_SET(commontssp, &common_tss[0]);
	PCPU_SET(tss, (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);
	PCPU_SET(ldt, (struct system_segment_descriptor *)&gdt[GUSERLDT_SEL]);
	PCPU_SET(fs32p, &gdt[GUFS32_SEL]);
	PCPU_SET(gs32p, &gdt[GUGS32_SEL]);
}
1585
/*
 * Second-stage BSP per-CPU initialization: record the kernel stack
 * pointer, the 16-byte-aligned top of the per-CPU PTI trampoline
 * stack, and thread0's pcb as the current pcb.
 */
void
amd64_bsp_pcpu_init2(uint64_t rsp0)
{

	PCPU_SET(rsp0, rsp0);
	PCPU_SET(pti_rsp0, ((vm_offset_t)PCPU_PTR(pti_stack) +
	    PC_PTI_STACK_SZ * sizeof(uint64_t)) & ~0xful);
	PCPU_SET(curpcb, thread0.td_pcb);
}
1595
/*
 * Wire up the BSP's interrupt stack table (IST) entries in the common
 * TSS: dedicated stacks for double fault, NMI, machine check and
 * debug exceptions.  For the stacks that need it, the pcpu pointer is
 * stashed in a struct nmi_pcpu at the top of the stack.
 */
void
amd64_bsp_ist_init(struct pcpu *pc)
{
	struct nmi_pcpu *np;

	/* doublefault stack space, runs on ist1 */
	common_tss[0].tss_ist1 = (long)&dblfault_stack[sizeof(dblfault_stack)];

	/*
	 * NMI stack, runs on ist2. The pcpu pointer is stored just
	 * above the start of the ist2 stack.
	 */
	np = ((struct nmi_pcpu *)&nmi0_stack[sizeof(nmi0_stack)]) - 1;
	np->np_pcpu = (register_t)pc;
	common_tss[0].tss_ist2 = (long)np;

	/*
	 * MC# stack, runs on ist3. The pcpu pointer is stored just
	 * above the start of the ist3 stack.
	 */
	np = ((struct nmi_pcpu *)&mce0_stack[sizeof(mce0_stack)]) - 1;
	np->np_pcpu = (register_t)pc;
	common_tss[0].tss_ist3 = (long)np;

	/*
	 * DB# stack, runs on ist4.
	 */
	np = ((struct nmi_pcpu *)&dbg0_stack[sizeof(dbg0_stack)]) - 1;
	np->np_pcpu = (register_t)pc;
	common_tss[0].tss_ist4 = (long)np;
}
1627
/*
 * Machine-dependent bootstrap for the BSP, called from locore with
 * the preloaded module pointer and the first free physical address.
 * Identifies the CPU, builds the GDT/IDT/TSS and per-CPU data, sizes
 * physical memory, and brings up the console and debugger.  Returns
 * the kernel stack base for locore to switch onto.  The ordering of
 * the steps below is significant; do not reorder casually.
 */
u_int64_t
hammer_time(u_int64_t modulep, u_int64_t physfree)
{
	caddr_t kmdp;
	int gsel_tss, x;
	struct pcpu *pc;
	struct xstate_hdr *xhdr;
	u_int64_t rsp0;
	char *env;
	struct region_descriptor r_gdt;
	size_t kstack0_sz;
	int late_console;

	TSRAW(&thread0, TS_ENTER, __func__, NULL);

	kmdp = init_ops.parse_preload_data(modulep);

	/* Apply any CPU microcode update and account for its memory. */
	physfree += ucode_load_bsp(physfree + KERNBASE);
	physfree = roundup2(physfree, PAGE_SIZE);

	identify_cpu1();
	identify_hypervisor();
	identify_cpu_fixup_bsp();
	identify_cpu2();
	initializecpucache();

	/*
	 * Check for pti, pcid, and invpcid before ifuncs are
	 * resolved, to correctly select the implementation for
	 * pmap_activate_sw_mode().
	 */
	pti = pti_get_default();
	TUNABLE_INT_FETCH("vm.pmap.pti", &pti);
	TUNABLE_INT_FETCH("vm.pmap.pcid_enabled", &pmap_pcid_enabled);
	if ((cpu_feature2 & CPUID2_PCID) != 0 && pmap_pcid_enabled) {
		invpcid_works = (cpu_stdext_feature &
		    CPUID_STDEXT_INVPCID) != 0;
	} else {
		pmap_pcid_enabled = 0;
	}

	link_elf_ireloc(kmdp);

	/*
	 * This may be done better later if it gets more high level
	 * components in it. If so just link td->td_proc here.
	 */
	proc_linkup0(&proc0, &thread0);

	/* Init basic tunables, hz etc */
	init_param1();

	/* Carve thread0's kernel stack out of the free physical space. */
	thread0.td_kstack = physfree + KERNBASE;
	thread0.td_kstack_pages = kstack_pages;
	kstack0_sz = thread0.td_kstack_pages * PAGE_SIZE;
	bzero((void *)thread0.td_kstack, kstack0_sz);
	physfree += kstack0_sz;

	/*
	 * Initialize enough of thread0 for delayed invalidation to
	 * work very early. Rely on thread0.td_base_pri
	 * zero-initialization, it is reset to PVM at proc0_init().
	 */
	pmap_thread_init_invl_gen(&thread0);

	/*
	 * make gdt memory segments
	 */
	for (x = 0; x < NGDT; x++) {
		/* System descriptors occupy two slots; skip them here. */
		if (x != GPROC0_SEL && x != (GPROC0_SEL + 1) &&
		    x != GUSERLDT_SEL && x != (GUSERLDT_SEL) + 1)
			ssdtosd(&gdt_segs[x], &gdt[x]);
	}
	gdt_segs[GPROC0_SEL].ssd_base = (uintptr_t)&common_tss[0];
	ssdtosyssd(&gdt_segs[GPROC0_SEL],
	    (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);

	r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
	r_gdt.rd_base = (long) gdt;
	lgdt(&r_gdt);
	pc = &temp_bsp_pcpu;

	wrmsr(MSR_FSBASE, 0);		/* User value */
	wrmsr(MSR_GSBASE, (u_int64_t)pc);
	wrmsr(MSR_KGSBASE, 0);		/* User value while in the kernel */

	pcpu_init(pc, 0, sizeof(struct pcpu));
	dpcpu_init((void *)(physfree + KERNBASE), 0);
	physfree += DPCPU_SIZE;
	amd64_bsp_pcpu_init1(pc);
	/* Non-late cninit() and printf() can be moved up to here. */

	/*
	 * Initialize mutexes.
	 *
	 * icu_lock: in order to allow an interrupt to occur in a critical
	 *	     section, to set pcpu->ipending (etc...) properly, we
	 *	     must be able to get the icu lock, so it can't be
	 *	     under witness.
	 */
	mutex_init();
	mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS);
	mtx_init(&dt_lock, "descriptor tables", NULL, MTX_DEF);

	/* exceptions */
	for (x = 0; x < NIDT; x++)
		setidt(x, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd), SDT_SYSIGT,
		    SEL_KPL, 0);
	setidt(IDT_DE, pti ? &IDTVEC(div_pti) : &IDTVEC(div), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_DB, &IDTVEC(dbg), SDT_SYSIGT, SEL_KPL, 4);
	setidt(IDT_NMI, &IDTVEC(nmi), SDT_SYSIGT, SEL_KPL, 2);
	setidt(IDT_BP, pti ? &IDTVEC(bpt_pti) : &IDTVEC(bpt), SDT_SYSIGT,
	    SEL_UPL, 0);
	setidt(IDT_OF, pti ? &IDTVEC(ofl_pti) : &IDTVEC(ofl), SDT_SYSIGT,
	    SEL_UPL, 0);
	setidt(IDT_BR, pti ? &IDTVEC(bnd_pti) : &IDTVEC(bnd), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_UD, pti ? &IDTVEC(ill_pti) : &IDTVEC(ill), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_NM, pti ? &IDTVEC(dna_pti) : &IDTVEC(dna), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1);
	setidt(IDT_FPUGP, pti ? &IDTVEC(fpusegm_pti) : &IDTVEC(fpusegm),
	    SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_TS, pti ? &IDTVEC(tss_pti) : &IDTVEC(tss), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_NP, pti ? &IDTVEC(missing_pti) : &IDTVEC(missing),
	    SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_SS, pti ? &IDTVEC(stk_pti) : &IDTVEC(stk), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_GP, pti ? &IDTVEC(prot_pti) : &IDTVEC(prot), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_PF, pti ? &IDTVEC(page_pti) : &IDTVEC(page), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_MF, pti ? &IDTVEC(fpu_pti) : &IDTVEC(fpu), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_AC, pti ? &IDTVEC(align_pti) : &IDTVEC(align), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_MC, &IDTVEC(mchk), SDT_SYSIGT, SEL_KPL, 3);
	setidt(IDT_XF, pti ? &IDTVEC(xmm_pti) : &IDTVEC(xmm), SDT_SYSIGT,
	    SEL_KPL, 0);
#ifdef KDTRACE_HOOKS
	setidt(IDT_DTRACE_RET, pti ? &IDTVEC(dtrace_ret_pti) :
	    &IDTVEC(dtrace_ret), SDT_SYSIGT, SEL_UPL, 0);
#endif
#ifdef XENHVM
	setidt(IDT_EVTCHN, pti ? &IDTVEC(xen_intr_upcall_pti) :
	    &IDTVEC(xen_intr_upcall), SDT_SYSIGT, SEL_KPL, 0);
#endif
	r_idt.rd_limit = sizeof(idt0) - 1;
	r_idt.rd_base = (long) idt;
	lidt(&r_idt);

	/*
	 * Initialize the clock before the console so that console
	 * initialization can use DELAY().
	 */
	clock_init();

	/*
	 * Use vt(4) by default for UEFI boot (during the sc(4)/vt(4)
	 * transition).
	 * Once bootblocks have updated, we can test directly for
	 * efi_systbl != NULL here...
	 */
	if (preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_EFI_MAP)
	    != NULL)
		vty_set_preferred(VTY_VT);

	/* Fetch speculative-execution mitigation tunables. */
	TUNABLE_INT_FETCH("hw.ibrs_disable", &hw_ibrs_disable);
	TUNABLE_INT_FETCH("hw.spec_store_bypass_disable", &hw_ssb_disable);
	TUNABLE_INT_FETCH("machdep.syscall_ret_l1d_flush",
	    &syscall_ret_l1d_flush_mode);
	TUNABLE_INT_FETCH("hw.mds_disable", &hw_mds_disable);
	TUNABLE_INT_FETCH("machdep.mitigations.taa.enable", &x86_taa_enable);

	TUNABLE_INT_FETCH("machdep.mitigations.rndgs.enable",
	    &x86_rngds_mitg_enable);

	finishidentcpu();	/* Final stage of CPU initialization */
	initializecpu();	/* Initialize CPU registers */

	amd64_bsp_ist_init(pc);

	/* Set the IO permission bitmap (empty due to tss seg limit) */
	common_tss[0].tss_iobase = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE;

	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
	ltr(gsel_tss);

	amd64_conf_fast_syscall();

	/*
	 * We initialize the PCB pointer early so that exception
	 * handlers will work. Also set up td_critnest to short-cut
	 * the page fault handler.
	 */
	cpu_max_ext_state_size = sizeof(struct savefpu);
	set_top_of_stack_td(&thread0);
	thread0.td_pcb = get_pcb_td(&thread0);
	thread0.td_critnest = 1;

	/*
	 * The console and kdb should be initialized even earlier than here,
	 * but some console drivers don't work until after getmemsize().
	 * Default to late console initialization to support these drivers.
	 * This loses mainly printf()s in getmemsize() and early debugging.
	 */
	late_console = 1;
	TUNABLE_INT_FETCH("debug.late_console", &late_console);
	if (!late_console) {
		cninit();
		amd64_kdb_init();
	}

	getmemsize(kmdp, physfree);
	init_param2(physmem);

	/* now running on new page tables, configured,and u/iom is accessible */

#ifdef DEV_PCI
	/* This call might adjust phys_avail[]. */
	pci_early_quirks();
#endif

	if (late_console)
		cninit();

#ifdef DEV_ISA
#ifdef DEV_ATPIC
	elcr_probe();
	atpic_startup();
#else
	/* Reset and mask the atpics and leave them shut down. */
	atpic_reset();

	/*
	 * Point the ICU spurious interrupt vectors at the APIC spurious
	 * interrupt handler.
	 */
	setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
#endif
#else
#error "have you forgotten the isa device?";
#endif

	if (late_console)
		amd64_kdb_init();

	msgbufinit(msgbufp, msgbufsize);
	fpuinit();

	/*
	 * Set up thread0 pcb save area after fpuinit calculated fpu save
	 * area size. Zero out the extended state header in fpu save
	 * area.
	 */
	thread0.td_pcb->pcb_save = get_pcb_user_save_td(&thread0);
	bzero(get_pcb_user_save_td(&thread0), cpu_max_ext_state_size);
	if (use_xsave) {
		xhdr = (struct xstate_hdr *)(get_pcb_user_save_td(&thread0) +
		    1);
		xhdr->xstate_bv = xsave_mask;
	}
	/* make an initial tss so cpu can get interrupt stack on syscall! */
	rsp0 = thread0.td_md.md_stack_base;
	/* Ensure the stack is aligned to 16 bytes */
	rsp0 &= ~0xFul;
	common_tss[0].tss_rsp0 = rsp0;
	amd64_bsp_pcpu_init2(rsp0);

	/* transfer to user mode */

	_ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
	_udatasel = GSEL(GUDATA_SEL, SEL_UPL);
	_ucode32sel = GSEL(GUCODE32_SEL, SEL_UPL);
	_ufssel = GSEL(GUFS32_SEL, SEL_UPL);
	_ugssel = GSEL(GUGS32_SEL, SEL_UPL);

	load_ds(_udatasel);
	load_es(_udatasel);
	load_fs(_ufssel);

	/* setup proc 0's pcb */
	thread0.td_pcb->pcb_flags = 0;
	thread0.td_frame = &proc0_tf;

	env = kern_getenv("kernelname");
	if (env != NULL)
		strlcpy(kernelname, env, sizeof(kernelname));

#ifdef FDT
	x86_init_fdt();
#endif
	thread0.td_critnest = 0;

	TSEXIT();

	/* Location of kernel stack for locore */
	return (thread0.td_md.md_stack_base);
}
1931
/*
 * Machine-dependent per-CPU structure initialization.  The ACPI CPU
 * id starts as the 0xffffffff "unknown" sentinel; presumably the ACPI
 * code fills in the real id later (not visible in this file).
 */
void
cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size)
{

	pcpu->pc_acpi_id = 0xffffffff;
}
1938
1939 static int
1940 smap_sysctl_handler(SYSCTL_HANDLER_ARGS)
1941 {
1942 struct bios_smap *smapbase;
1943 struct bios_smap_xattr smap;
1944 caddr_t kmdp;
1945 uint32_t *smapattr;
1946 int count, error, i;
1947
1948 /* Retrieve the system memory map from the loader. */
1949 kmdp = preload_search_by_type("elf kernel");
1950 if (kmdp == NULL)
1951 kmdp = preload_search_by_type("elf64 kernel");
1952 smapbase = (struct bios_smap *)preload_search_info(kmdp,
1953 MODINFO_METADATA | MODINFOMD_SMAP);
1954 if (smapbase == NULL)
1955 return (0);
1956 smapattr = (uint32_t *)preload_search_info(kmdp,
1957 MODINFO_METADATA | MODINFOMD_SMAP_XATTR);
1958 count = *((uint32_t *)smapbase - 1) / sizeof(*smapbase);
1959 error = 0;
1960 for (i = 0; i < count; i++) {
1961 smap.base = smapbase[i].base;
1962 smap.length = smapbase[i].length;
1963 smap.type = smapbase[i].type;
1964 if (smapattr != NULL)
1965 smap.xattr = smapattr[i];
1966 else
1967 smap.xattr = 0;
1968 error = SYSCTL_OUT(req, &smap, sizeof(smap));
1969 }
1970 return (error);
1971 }
/* Export the raw BIOS SMAP (machdep.smap), read-only, via the handler above. */
SYSCTL_PROC(_machdep, OID_AUTO, smap, CTLTYPE_OPAQUE|CTLFLAG_RD, NULL, 0,
    smap_sysctl_handler, "S,bios_smap_xattr", "Raw BIOS SMAP data");
1974
1975 static int
1976 efi_map_sysctl_handler(SYSCTL_HANDLER_ARGS)
1977 {
1978 struct efi_map_header *efihdr;
1979 caddr_t kmdp;
1980 uint32_t efisize;
1981
1982 kmdp = preload_search_by_type("elf kernel");
1983 if (kmdp == NULL)
1984 kmdp = preload_search_by_type("elf64 kernel");
1985 efihdr = (struct efi_map_header *)preload_search_info(kmdp,
1986 MODINFO_METADATA | MODINFOMD_EFI_MAP);
1987 if (efihdr == NULL)
1988 return (0);
1989 efisize = *((uint32_t *)efihdr - 1);
1990 return (SYSCTL_OUT(req, efihdr, efisize));
1991 }
/* Export the raw EFI memory map (machdep.efi_map), read-only. */
SYSCTL_PROC(_machdep, OID_AUTO, efi_map, CTLTYPE_OPAQUE|CTLFLAG_RD, NULL, 0,
    efi_map_sysctl_handler, "S,efi_map_header", "Raw EFI Memory Map");
1994
/*
 * Enter a spinlock section: on the first (outermost) entry, disable
 * interrupts, remember the prior interrupt state and enter a critical
 * section.  Nested entries only bump the per-thread count.
 */
void
spinlock_enter(void)
{
	struct thread *td;
	register_t flags;

	td = curthread;
	if (td->td_md.md_spinlock_count == 0) {
		/* Interrupts must be off before we record the count. */
		flags = intr_disable();
		td->td_md.md_spinlock_count = 1;
		td->td_md.md_saved_flags = flags;
		critical_enter();
	} else
		td->td_md.md_spinlock_count++;
}
2010
/*
 * Leave a spinlock section: the outermost exit leaves the critical
 * section and restores the interrupt state saved by the matching
 * spinlock_enter().
 */
void
spinlock_exit(void)
{
	struct thread *td;
	register_t flags;

	td = curthread;
	/* Read the saved flags before dropping the count. */
	flags = td->td_md.md_saved_flags;
	td->td_md.md_spinlock_count--;
	if (td->td_md.md_spinlock_count == 0) {
		critical_exit();
		intr_restore(flags);
	}
}
2025
2026 /*
2027 * Construct a PCB from a trapframe. This is called from kdb_trap() where
2028 * we want to start a backtrace from the function that caused us to enter
2029 * the debugger. We have the context in the trapframe, but base the trace
2030 * on the PCB. The PCB doesn't have to be perfect, as long as it contains
2031 * enough for a backtrace.
2032 */
2033 void
2034 makectx(struct trapframe *tf, struct pcb *pcb)
2035 {
2036
2037 pcb->pcb_r12 = tf->tf_r12;
2038 pcb->pcb_r13 = tf->tf_r13;
2039 pcb->pcb_r14 = tf->tf_r14;
2040 pcb->pcb_r15 = tf->tf_r15;
2041 pcb->pcb_rbp = tf->tf_rbp;
2042 pcb->pcb_rbx = tf->tf_rbx;
2043 pcb->pcb_rip = tf->tf_rip;
2044 pcb->pcb_rsp = tf->tf_rsp;
2045 }
2046
/*
 * ptrace support: set a traced thread's program counter and mark the
 * pcb for a full trapframe restore (PCB_FULL_IRET) on return to user
 * mode.  Always succeeds.
 */
int
ptrace_set_pc(struct thread *td, unsigned long addr)
{

	td->td_frame->tf_rip = addr;
	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
	return (0);
}
2055
/*
 * ptrace support: arm single-stepping by setting the trap flag (PSL_T)
 * in the thread's user rflags, remembering via TDB_STEP that the flag
 * was set by the debugger (so only then is it cleared later).
 */
int
ptrace_single_step(struct thread *td)
{

	PROC_LOCK_ASSERT(td->td_proc, MA_OWNED);
	if ((td->td_frame->tf_rflags & PSL_T) == 0) {
		td->td_frame->tf_rflags |= PSL_T;
		td->td_dbgflags |= TDB_STEP;
	}
	return (0);
}
2067
/*
 * ptrace support: disarm single-stepping by clearing the trap flag and
 * the TDB_STEP marker.  Always succeeds.
 */
int
ptrace_clear_single_step(struct thread *td)
{

	PROC_LOCK_ASSERT(td->td_proc, MA_OWNED);
	td->td_frame->tf_rflags &= ~PSL_T;
	td->td_dbgflags &= ~TDB_STEP;
	return (0);
}
2077
2078 int
2079 fill_regs(struct thread *td, struct reg *regs)
2080 {
2081 struct trapframe *tp;
2082
2083 tp = td->td_frame;
2084 return (fill_frame_regs(tp, regs));
2085 }
2086
2087 int
2088 fill_frame_regs(struct trapframe *tp, struct reg *regs)
2089 {
2090
2091 regs->r_r15 = tp->tf_r15;
2092 regs->r_r14 = tp->tf_r14;
2093 regs->r_r13 = tp->tf_r13;
2094 regs->r_r12 = tp->tf_r12;
2095 regs->r_r11 = tp->tf_r11;
2096 regs->r_r10 = tp->tf_r10;
2097 regs->r_r9 = tp->tf_r9;
2098 regs->r_r8 = tp->tf_r8;
2099 regs->r_rdi = tp->tf_rdi;
2100 regs->r_rsi = tp->tf_rsi;
2101 regs->r_rbp = tp->tf_rbp;
2102 regs->r_rbx = tp->tf_rbx;
2103 regs->r_rdx = tp->tf_rdx;
2104 regs->r_rcx = tp->tf_rcx;
2105 regs->r_rax = tp->tf_rax;
2106 regs->r_rip = tp->tf_rip;
2107 regs->r_cs = tp->tf_cs;
2108 regs->r_rflags = tp->tf_rflags;
2109 regs->r_rsp = tp->tf_rsp;
2110 regs->r_ss = tp->tf_ss;
2111 if (tp->tf_flags & TF_HASSEGS) {
2112 regs->r_ds = tp->tf_ds;
2113 regs->r_es = tp->tf_es;
2114 regs->r_fs = tp->tf_fs;
2115 regs->r_gs = tp->tf_gs;
2116 } else {
2117 regs->r_ds = 0;
2118 regs->r_es = 0;
2119 regs->r_fs = 0;
2120 regs->r_gs = 0;
2121 }
2122 regs->r_err = 0;
2123 regs->r_trapno = 0;
2124 return (0);
2125 }
2126
/*
 * Install a new general-purpose register set into a thread's
 * trapframe.  The proposed rflags and %cs values are validated with
 * EFL_SECURE()/CS_SECURE() first; EINVAL is returned if they would
 * not be safe to load for a user thread.  Segment registers are
 * currently never written (the XXXKIB-disabled branch below).
 */
int
set_regs(struct thread *td, struct reg *regs)
{
	struct trapframe *tp;
	register_t rflags;

	tp = td->td_frame;
	/* Only the low 32 bits of rflags are considered. */
	rflags = regs->r_rflags & 0xffffffff;
	if (!EFL_SECURE(rflags, tp->tf_rflags) || !CS_SECURE(regs->r_cs))
		return (EINVAL);
	tp->tf_r15 = regs->r_r15;
	tp->tf_r14 = regs->r_r14;
	tp->tf_r13 = regs->r_r13;
	tp->tf_r12 = regs->r_r12;
	tp->tf_r11 = regs->r_r11;
	tp->tf_r10 = regs->r_r10;
	tp->tf_r9  = regs->r_r9;
	tp->tf_r8  = regs->r_r8;
	tp->tf_rdi = regs->r_rdi;
	tp->tf_rsi = regs->r_rsi;
	tp->tf_rbp = regs->r_rbp;
	tp->tf_rbx = regs->r_rbx;
	tp->tf_rdx = regs->r_rdx;
	tp->tf_rcx = regs->r_rcx;
	tp->tf_rax = regs->r_rax;
	tp->tf_rip = regs->r_rip;
	tp->tf_cs = regs->r_cs;
	tp->tf_rflags = rflags;
	tp->tf_rsp = regs->r_rsp;
	tp->tf_ss = regs->r_ss;
	if (0) {	/* XXXKIB */
		tp->tf_ds = regs->r_ds;
		tp->tf_es = regs->r_es;
		tp->tf_fs = regs->r_fs;
		tp->tf_gs = regs->r_gs;
		tp->tf_flags = TF_HASSEGS;
	}
	/* Force a full trapframe restore on return to user mode. */
	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
	return (0);
}
2167
2168 /* XXX check all this stuff! */
2169 /* externalize from sv_xmm */
2170 static void
2171 fill_fpregs_xmm(struct savefpu *sv_xmm, struct fpreg *fpregs)
2172 {
2173 struct envxmm *penv_fpreg = (struct envxmm *)&fpregs->fpr_env;
2174 struct envxmm *penv_xmm = &sv_xmm->sv_env;
2175 int i;
2176
2177 /* pcb -> fpregs */
2178 bzero(fpregs, sizeof(*fpregs));
2179
2180 /* FPU control/status */
2181 penv_fpreg->en_cw = penv_xmm->en_cw;
2182 penv_fpreg->en_sw = penv_xmm->en_sw;
2183 penv_fpreg->en_tw = penv_xmm->en_tw;
2184 penv_fpreg->en_opcode = penv_xmm->en_opcode;
2185 penv_fpreg->en_rip = penv_xmm->en_rip;
2186 penv_fpreg->en_rdp = penv_xmm->en_rdp;
2187 penv_fpreg->en_mxcsr = penv_xmm->en_mxcsr;
2188 penv_fpreg->en_mxcsr_mask = penv_xmm->en_mxcsr_mask;
2189
2190 /* FPU registers */
2191 for (i = 0; i < 8; ++i)
2192 bcopy(sv_xmm->sv_fp[i].fp_acc.fp_bytes, fpregs->fpr_acc[i], 10);
2193
2194 /* SSE registers */
2195 for (i = 0; i < 16; ++i)
2196 bcopy(sv_xmm->sv_xmm[i].xmm_bytes, fpregs->fpr_xacc[i], 16);
2197 }
2198
2199 /* internalize from fpregs into sv_xmm */
2200 static void
2201 set_fpregs_xmm(struct fpreg *fpregs, struct savefpu *sv_xmm)
2202 {
2203 struct envxmm *penv_xmm = &sv_xmm->sv_env;
2204 struct envxmm *penv_fpreg = (struct envxmm *)&fpregs->fpr_env;
2205 int i;
2206
2207 /* fpregs -> pcb */
2208 /* FPU control/status */
2209 penv_xmm->en_cw = penv_fpreg->en_cw;
2210 penv_xmm->en_sw = penv_fpreg->en_sw;
2211 penv_xmm->en_tw = penv_fpreg->en_tw;
2212 penv_xmm->en_opcode = penv_fpreg->en_opcode;
2213 penv_xmm->en_rip = penv_fpreg->en_rip;
2214 penv_xmm->en_rdp = penv_fpreg->en_rdp;
2215 penv_xmm->en_mxcsr = penv_fpreg->en_mxcsr;
2216 penv_xmm->en_mxcsr_mask = penv_fpreg->en_mxcsr_mask & cpu_mxcsr_mask;
2217
2218 /* FPU registers */
2219 for (i = 0; i < 8; ++i)
2220 bcopy(fpregs->fpr_acc[i], sv_xmm->sv_fp[i].fp_acc.fp_bytes, 10);
2221
2222 /* SSE registers */
2223 for (i = 0; i < 16; ++i)
2224 bcopy(fpregs->fpr_xacc[i], sv_xmm->sv_xmm[i].xmm_bytes, 16);
2225 }
2226
/* externalize from td->pcb */
int
fill_fpregs(struct thread *td, struct fpreg *fpregs)
{

	/*
	 * The target must be the current thread or be stopped, so its
	 * FPU state is stable while we copy it out.
	 */
	KASSERT(td == curthread || TD_IS_SUSPENDED(td) ||
	    P_SHOULDSTOP(td->td_proc),
	    ("not suspended thread %p", td));
	/* Flush live FPU contents into the pcb user save area first. */
	fpugetregs(td);
	fill_fpregs_xmm(get_pcb_user_save_td(td), fpregs);
	return (0);
}
2239
/* internalize to td->pcb */
int
set_fpregs(struct thread *td, struct fpreg *fpregs)
{

	/*
	 * Enter a critical section so the update of the user save area
	 * is not interleaved with a context switch of the FPU state.
	 */
	critical_enter();
	set_fpregs_xmm(fpregs, get_pcb_user_save_td(td));
	fpuuserinited(td);
	critical_exit();
	return (0);
}
2251
2252 /*
2253 * Get machine context.
2254 */
2255 int
2256 get_mcontext(struct thread *td, mcontext_t *mcp, int flags)
2257 {
2258 struct pcb *pcb;
2259 struct trapframe *tp;
2260
2261 pcb = td->td_pcb;
2262 tp = td->td_frame;
2263 PROC_LOCK(curthread->td_proc);
2264 mcp->mc_onstack = sigonstack(tp->tf_rsp);
2265 PROC_UNLOCK(curthread->td_proc);
2266 mcp->mc_r15 = tp->tf_r15;
2267 mcp->mc_r14 = tp->tf_r14;
2268 mcp->mc_r13 = tp->tf_r13;
2269 mcp->mc_r12 = tp->tf_r12;
2270 mcp->mc_r11 = tp->tf_r11;
2271 mcp->mc_r10 = tp->tf_r10;
2272 mcp->mc_r9 = tp->tf_r9;
2273 mcp->mc_r8 = tp->tf_r8;
2274 mcp->mc_rdi = tp->tf_rdi;
2275 mcp->mc_rsi = tp->tf_rsi;
2276 mcp->mc_rbp = tp->tf_rbp;
2277 mcp->mc_rbx = tp->tf_rbx;
2278 mcp->mc_rcx = tp->tf_rcx;
2279 mcp->mc_rflags = tp->tf_rflags;
2280 if (flags & GET_MC_CLEAR_RET) {
2281 mcp->mc_rax = 0;
2282 mcp->mc_rdx = 0;
2283 mcp->mc_rflags &= ~PSL_C;
2284 } else {
2285 mcp->mc_rax = tp->tf_rax;
2286 mcp->mc_rdx = tp->tf_rdx;
2287 }
2288 mcp->mc_rip = tp->tf_rip;
2289 mcp->mc_cs = tp->tf_cs;
2290 mcp->mc_rsp = tp->tf_rsp;
2291 mcp->mc_ss = tp->tf_ss;
2292 mcp->mc_ds = tp->tf_ds;
2293 mcp->mc_es = tp->tf_es;
2294 mcp->mc_fs = tp->tf_fs;
2295 mcp->mc_gs = tp->tf_gs;
2296 mcp->mc_flags = tp->tf_flags;
2297 mcp->mc_len = sizeof(*mcp);
2298 get_fpcontext(td, mcp, NULL, 0);
2299 update_pcb_bases(pcb);
2300 mcp->mc_fsbase = pcb->pcb_fsbase;
2301 mcp->mc_gsbase = pcb->pcb_gsbase;
2302 mcp->mc_xfpustate = 0;
2303 mcp->mc_xfpustate_len = 0;
2304 bzero(mcp->mc_spare, sizeof(mcp->mc_spare));
2305 return (0);
2306 }
2307
2308 /*
2309 * Set machine context.
2310 *
2311 * However, we don't set any but the user modifiable flags, and we won't
2312 * touch the cs selector.
2313 */
2314 int
2315 set_mcontext(struct thread *td, mcontext_t *mcp)
2316 {
2317 struct pcb *pcb;
2318 struct trapframe *tp;
2319 char *xfpustate;
2320 long rflags;
2321 int ret;
2322
2323 pcb = td->td_pcb;
2324 tp = td->td_frame;
2325 if (mcp->mc_len != sizeof(*mcp) ||
2326 (mcp->mc_flags & ~_MC_FLAG_MASK) != 0)
2327 return (EINVAL);
2328 rflags = (mcp->mc_rflags & PSL_USERCHANGE) |
2329 (tp->tf_rflags & ~PSL_USERCHANGE);
2330 if (mcp->mc_flags & _MC_HASFPXSTATE) {
2331 if (mcp->mc_xfpustate_len > cpu_max_ext_state_size -
2332 sizeof(struct savefpu))
2333 return (EINVAL);
2334 xfpustate = __builtin_alloca(mcp->mc_xfpustate_len);
2335 ret = copyin((void *)mcp->mc_xfpustate, xfpustate,
2336 mcp->mc_xfpustate_len);
2337 if (ret != 0)
2338 return (ret);
2339 } else
2340 xfpustate = NULL;
2341 ret = set_fpcontext(td, mcp, xfpustate, mcp->mc_xfpustate_len);
2342 if (ret != 0)
2343 return (ret);
2344 tp->tf_r15 = mcp->mc_r15;
2345 tp->tf_r14 = mcp->mc_r14;
2346 tp->tf_r13 = mcp->mc_r13;
2347 tp->tf_r12 = mcp->mc_r12;
2348 tp->tf_r11 = mcp->mc_r11;
2349 tp->tf_r10 = mcp->mc_r10;
2350 tp->tf_r9 = mcp->mc_r9;
2351 tp->tf_r8 = mcp->mc_r8;
2352 tp->tf_rdi = mcp->mc_rdi;
2353 tp->tf_rsi = mcp->mc_rsi;
2354 tp->tf_rbp = mcp->mc_rbp;
2355 tp->tf_rbx = mcp->mc_rbx;
2356 tp->tf_rdx = mcp->mc_rdx;
2357 tp->tf_rcx = mcp->mc_rcx;
2358 tp->tf_rax = mcp->mc_rax;
2359 tp->tf_rip = mcp->mc_rip;
2360 tp->tf_rflags = rflags;
2361 tp->tf_rsp = mcp->mc_rsp;
2362 tp->tf_ss = mcp->mc_ss;
2363 tp->tf_flags = mcp->mc_flags;
2364 if (tp->tf_flags & TF_HASSEGS) {
2365 tp->tf_ds = mcp->mc_ds;
2366 tp->tf_es = mcp->mc_es;
2367 tp->tf_fs = mcp->mc_fs;
2368 tp->tf_gs = mcp->mc_gs;
2369 }
2370 set_pcb_flags(pcb, PCB_FULL_IRET);
2371 if (mcp->mc_flags & _MC_HASBASES) {
2372 pcb->pcb_fsbase = mcp->mc_fsbase;
2373 pcb->pcb_gsbase = mcp->mc_gsbase;
2374 }
2375 return (0);
2376 }
2377
2378 static void
2379 get_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpusave,
2380 size_t xfpusave_len)
2381 {
2382 size_t max_len, len;
2383
2384 mcp->mc_ownedfp = fpugetregs(td);
2385 bcopy(get_pcb_user_save_td(td), &mcp->mc_fpstate[0],
2386 sizeof(mcp->mc_fpstate));
2387 mcp->mc_fpformat = fpuformat();
2388 if (!use_xsave || xfpusave_len == 0)
2389 return;
2390 max_len = cpu_max_ext_state_size - sizeof(struct savefpu);
2391 len = xfpusave_len;
2392 if (len > max_len) {
2393 len = max_len;
2394 bzero(xfpusave + max_len, len - max_len);
2395 }
2396 mcp->mc_flags |= _MC_HASFPXSTATE;
2397 mcp->mc_xfpustate_len = len;
2398 bcopy(get_pcb_user_save_td(td) + 1, xfpusave, len);
2399 }
2400
2401 static int
2402 set_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpustate,
2403 size_t xfpustate_len)
2404 {
2405 int error;
2406
2407 if (mcp->mc_fpformat == _MC_FPFMT_NODEV)
2408 return (0);
2409 else if (mcp->mc_fpformat != _MC_FPFMT_XMM)
2410 return (EINVAL);
2411 else if (mcp->mc_ownedfp == _MC_FPOWNED_NONE) {
2412 /* We don't care what state is left in the FPU or PCB. */
2413 fpstate_drop(td);
2414 error = 0;
2415 } else if (mcp->mc_ownedfp == _MC_FPOWNED_FPU ||
2416 mcp->mc_ownedfp == _MC_FPOWNED_PCB) {
2417 error = fpusetregs(td, (struct savefpu *)&mcp->mc_fpstate,
2418 xfpustate, xfpustate_len);
2419 } else
2420 return (EINVAL);
2421 return (error);
2422 }
2423
/*
 * Drop the thread's claim on the FPU so that its next FPU use starts
 * from freshly initialized state.  The thread must own user FPU
 * context (not an in-kernel FPU session).
 */
void
fpstate_drop(struct thread *td)
{

	KASSERT(PCB_USER_FPU(td->td_pcb), ("fpstate_drop: kernel-owned fpu"));
	critical_enter();
	if (PCPU_GET(fpcurthread) == td)
		fpudrop();
	/*
	 * XXX force a full drop of the fpu.  The above only drops it if we
	 * owned it.
	 *
	 * XXX I don't much like fpugetuserregs()'s semantics of doing a full
	 * drop.  Dropping only to the pcb matches fnsave's behaviour.
	 * We only need to drop to !PCB_INITDONE in sendsig().  But
	 * sendsig() is the only caller of fpugetuserregs()... perhaps we just
	 * have too many layers.
	 */
	clear_pcb_flags(curthread->td_pcb,
	    PCB_FPUINITDONE | PCB_USERFPUINITDONE);
	critical_exit();
}
2446
2447 int
2448 fill_dbregs(struct thread *td, struct dbreg *dbregs)
2449 {
2450 struct pcb *pcb;
2451
2452 if (td == NULL) {
2453 dbregs->dr[0] = rdr0();
2454 dbregs->dr[1] = rdr1();
2455 dbregs->dr[2] = rdr2();
2456 dbregs->dr[3] = rdr3();
2457 dbregs->dr[6] = rdr6();
2458 dbregs->dr[7] = rdr7();
2459 } else {
2460 pcb = td->td_pcb;
2461 dbregs->dr[0] = pcb->pcb_dr0;
2462 dbregs->dr[1] = pcb->pcb_dr1;
2463 dbregs->dr[2] = pcb->pcb_dr2;
2464 dbregs->dr[3] = pcb->pcb_dr3;
2465 dbregs->dr[6] = pcb->pcb_dr6;
2466 dbregs->dr[7] = pcb->pcb_dr7;
2467 }
2468 dbregs->dr[4] = 0;
2469 dbregs->dr[5] = 0;
2470 dbregs->dr[8] = 0;
2471 dbregs->dr[9] = 0;
2472 dbregs->dr[10] = 0;
2473 dbregs->dr[11] = 0;
2474 dbregs->dr[12] = 0;
2475 dbregs->dr[13] = 0;
2476 dbregs->dr[14] = 0;
2477 dbregs->dr[15] = 0;
2478 return (0);
2479 }
2480
/*
 * Install debug registers: directly into the hardware when td is NULL,
 * otherwise into the thread's pcb after validating that the request
 * cannot harm the kernel.
 */
int
set_dbregs(struct thread *td, struct dbreg *dbregs)
{
	struct pcb *pcb;
	int i;

	if (td == NULL) {
		load_dr0(dbregs->dr[0]);
		load_dr1(dbregs->dr[1]);
		load_dr2(dbregs->dr[2]);
		load_dr3(dbregs->dr[3]);
		load_dr6(dbregs->dr[6]);
		load_dr7(dbregs->dr[7]);
	} else {
		/*
		 * Don't let an illegal value for dr7 get set.  Specifically,
		 * check for undefined settings.  Setting these bit patterns
		 * result in undefined behaviour and can lead to an unexpected
		 * TRCTRAP or a general protection fault right here.
		 * Upper bits of dr6 and dr7 must not be set
		 */
		for (i = 0; i < 4; i++) {
			/* R/W field 0b10 is reserved on each breakpoint. */
			if (DBREG_DR7_ACCESS(dbregs->dr[7], i) == 0x02)
				return (EINVAL);
			/* 8-byte length is invalid for 32-bit code. */
			if (td->td_frame->tf_cs == _ucode32sel &&
			    DBREG_DR7_LEN(dbregs->dr[7], i) == DBREG_DR7_LEN_8)
				return (EINVAL);
		}
		if ((dbregs->dr[6] & 0xffffffff00000000ul) != 0 ||
		    (dbregs->dr[7] & 0xffffffff00000000ul) != 0)
			return (EINVAL);

		pcb = td->td_pcb;

		/*
		 * Don't let a process set a breakpoint that is not within the
		 * process's address space.  If a process could do this, it
		 * could halt the system by setting a breakpoint in the kernel
		 * (if ddb was enabled).  Thus, we need to check to make sure
		 * that no breakpoints are being enabled for addresses outside
		 * process's address space.
		 *
		 * XXX - what about when the watched area of the user's
		 * address space is written into from within the kernel
		 * ... wouldn't that still cause a breakpoint to be generated
		 * from within kernel mode?
		 */

		if (DBREG_DR7_ENABLED(dbregs->dr[7], 0)) {
			/* dr0 is enabled */
			if (dbregs->dr[0] >= VM_MAXUSER_ADDRESS)
				return (EINVAL);
		}
		if (DBREG_DR7_ENABLED(dbregs->dr[7], 1)) {
			/* dr1 is enabled */
			if (dbregs->dr[1] >= VM_MAXUSER_ADDRESS)
				return (EINVAL);
		}
		if (DBREG_DR7_ENABLED(dbregs->dr[7], 2)) {
			/* dr2 is enabled */
			if (dbregs->dr[2] >= VM_MAXUSER_ADDRESS)
				return (EINVAL);
		}
		if (DBREG_DR7_ENABLED(dbregs->dr[7], 3)) {
			/* dr3 is enabled */
			if (dbregs->dr[3] >= VM_MAXUSER_ADDRESS)
				return (EINVAL);
		}

		pcb->pcb_dr0 = dbregs->dr[0];
		pcb->pcb_dr1 = dbregs->dr[1];
		pcb->pcb_dr2 = dbregs->dr[2];
		pcb->pcb_dr3 = dbregs->dr[3];
		pcb->pcb_dr6 = dbregs->dr[6];
		pcb->pcb_dr7 = dbregs->dr[7];

		/* Mark the pcb so context switches restore these values. */
		set_pcb_flags(pcb, PCB_DBREGS);
	}

	return (0);
}
2562
/*
 * Clear all hardware debug registers.  %dr7 is cleared first so that
 * no breakpoint can fire while the address registers still hold
 * stale values.
 */
void
reset_dbregs(void)
{

	load_dr7(0);	/* Turn off the control bits first */
	load_dr0(0);
	load_dr1(0);
	load_dr2(0);
	load_dr3(0);
	load_dr6(0);
}
2574
2575 /*
2576 * Return > 0 if a hardware breakpoint has been hit, and the
2577 * breakpoint was in user space. Return 0, otherwise.
2578 */
2579 int
2580 user_dbreg_trap(register_t dr6)
2581 {
2582 u_int64_t dr7;
2583 u_int64_t bp; /* breakpoint bits extracted from dr6 */
2584 int nbp; /* number of breakpoints that triggered */
2585 caddr_t addr[4]; /* breakpoint addresses */
2586 int i;
2587
2588 bp = dr6 & DBREG_DR6_BMASK;
2589 if (bp == 0) {
2590 /*
2591 * None of the breakpoint bits are set meaning this
2592 * trap was not caused by any of the debug registers
2593 */
2594 return 0;
2595 }
2596
2597 dr7 = rdr7();
2598 if ((dr7 & 0x000000ff) == 0) {
2599 /*
2600 * all GE and LE bits in the dr7 register are zero,
2601 * thus the trap couldn't have been caused by the
2602 * hardware debug registers
2603 */
2604 return 0;
2605 }
2606
2607 nbp = 0;
2608
2609 /*
2610 * at least one of the breakpoints were hit, check to see
2611 * which ones and if any of them are user space addresses
2612 */
2613
2614 if (bp & 0x01) {
2615 addr[nbp++] = (caddr_t)rdr0();
2616 }
2617 if (bp & 0x02) {
2618 addr[nbp++] = (caddr_t)rdr1();
2619 }
2620 if (bp & 0x04) {
2621 addr[nbp++] = (caddr_t)rdr2();
2622 }
2623 if (bp & 0x08) {
2624 addr[nbp++] = (caddr_t)rdr3();
2625 }
2626
2627 for (i = 0; i < nbp; i++) {
2628 if (addr[i] < (caddr_t)VM_MAXUSER_ADDRESS) {
2629 /*
2630 * addr[i] is in user space
2631 */
2632 return nbp;
2633 }
2634 }
2635
2636 /*
2637 * None of the breakpoints are in user space.
2638 */
2639 return 0;
2640 }
2641
2642 /*
2643 * The pcb_flags is only modified by current thread, or by other threads
2644 * when current thread is stopped. However, current thread may change it
2645 * from the interrupt context in cpu_switch(), or in the trap handler.
2646 * When we read-modify-write pcb_flags from C sources, compiler may generate
2647 * code that is not atomic regarding the interrupt handler. If a trap or
2648 * interrupt happens and any flag is modified from the handler, it can be
2649 * clobbered with the cached value later. Therefore, we implement setting
2650 * and clearing flags with single-instruction functions, which do not race
2651 * with possible modification of the flags from the trap or interrupt context,
2652 * because traps and interrupts are executed only on instruction boundary.
2653 */
2654 void
2655 set_pcb_flags_raw(struct pcb *pcb, const u_int flags)
2656 {
2657
2658 __asm __volatile("orl %1,%0"
2659 : "=m" (pcb->pcb_flags) : "ir" (flags), "m" (pcb->pcb_flags)
2660 : "cc", "memory");
2661
2662 }
2663
2664 /*
2665 * The support for RDFSBASE, WRFSBASE and similar instructions for %gs
2666 * base requires that kernel saves MSR_FSBASE and MSR_{K,}GSBASE into
2667 * pcb if user space modified the bases. We must save on the context
2668 * switch or if the return to usermode happens through the doreti.
2669 *
2670 * Tracking of both events is performed by the pcb flag PCB_FULL_IRET,
2671 * which have a consequence that the base MSRs must be saved each time
2672 * the PCB_FULL_IRET flag is set. We disable interrupts to sync with
2673 * context switches.
2674 */
2675 static void
2676 set_pcb_flags_fsgsbase(struct pcb *pcb, const u_int flags)
2677 {
2678 register_t r;
2679
2680 if (curpcb == pcb &&
2681 (flags & PCB_FULL_IRET) != 0 &&
2682 (pcb->pcb_flags & PCB_FULL_IRET) == 0) {
2683 r = intr_disable();
2684 if ((pcb->pcb_flags & PCB_FULL_IRET) == 0) {
2685 if (rfs() == _ufssel)
2686 pcb->pcb_fsbase = rdfsbase();
2687 if (rgs() == _ugssel)
2688 pcb->pcb_gsbase = rdmsr(MSR_KGSBASE);
2689 }
2690 set_pcb_flags_raw(pcb, flags);
2691 intr_restore(r);
2692 } else {
2693 set_pcb_flags_raw(pcb, flags);
2694 }
2695 }
2696
/*
 * Resolve set_pcb_flags once at boot: CPUs with FSGSBASE need the
 * variant that saves the user segment bases when PCB_FULL_IRET is
 * first set; all others use the plain atomic-or implementation.
 */
DEFINE_IFUNC(, void, set_pcb_flags, (struct pcb *, const u_int), static)
{

	return ((cpu_stdext_feature & CPUID_STDEXT_FSGSBASE) != 0 ?
	    set_pcb_flags_fsgsbase : set_pcb_flags_raw);
}
2703
/*
 * Atomically (with respect to traps and interrupts) clear bits in
 * pcb_flags; see the comment above set_pcb_flags_raw().
 */
void
clear_pcb_flags(struct pcb *pcb, const u_int flags)
{

	__asm __volatile("andl %1,%0"
	    : "=m" (pcb->pcb_flags) : "ir" (~flags), "m" (pcb->pcb_flags)
	    : "cc", "memory");
}
2712
2713 #ifdef KDB
2714
2715 /*
2716 * Provide inb() and outb() as functions. They are normally only available as
2717 * inline functions, thus cannot be called from the debugger.
2718 */
2719
2720 /* silence compiler warnings */
2721 u_char inb_(u_short);
2722 void outb_(u_short, u_char);
2723
2724 u_char
2725 inb_(u_short port)
2726 {
2727 return inb(port);
2728 }
2729
2730 void
2731 outb_(u_short port, u_char data)
2732 {
2733 outb(port, data);
2734 }
2735
2736 #endif /* KDB */
2737
2738 #undef memset
2739 #undef memmove
2740 #undef memcpy
2741
2742 void *memset_std(void *buf, int c, size_t len);
2743 void *memset_erms(void *buf, int c, size_t len);
2744 DEFINE_IFUNC(, void *, memset, (void *, int, size_t), static)
2745 {
2746
2747 return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
2748 memset_erms : memset_std);
2749 }
2750
2751 void *memmove_std(void * _Nonnull dst, const void * _Nonnull src,
2752 size_t len);
2753 void *memmove_erms(void * _Nonnull dst, const void * _Nonnull src,
2754 size_t len);
2755 DEFINE_IFUNC(, void *, memmove, (void * _Nonnull, const void * _Nonnull,
2756 size_t), static)
2757 {
2758
2759 return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
2760 memmove_erms : memmove_std);
2761 }
2762
2763 void *memcpy_std(void * _Nonnull dst, const void * _Nonnull src,
2764 size_t len);
2765 void *memcpy_erms(void * _Nonnull dst, const void * _Nonnull src,
2766 size_t len);
2767 DEFINE_IFUNC(, void *, memcpy, (void * _Nonnull, const void * _Nonnull,size_t),
2768 static)
2769 {
2770
2771 return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
2772 memcpy_erms : memcpy_std);
2773 }
2774
2775 void pagezero_std(void *addr);
2776 void pagezero_erms(void *addr);
2777 DEFINE_IFUNC(, void , pagezero, (void *), static)
2778 {
2779
2780 return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
2781 pagezero_erms : pagezero_std);
2782 }