The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/amd64/amd64/machdep.c

Version: -  FREEBSD  -  FREEBSD-13-STABLE  -  FREEBSD-13-0  -  FREEBSD-12-STABLE  -  FREEBSD-12-0  -  FREEBSD-11-STABLE  -  FREEBSD-11-0  -  FREEBSD-10-STABLE  -  FREEBSD-10-0  -  FREEBSD-9-STABLE  -  FREEBSD-9-0  -  FREEBSD-8-STABLE  -  FREEBSD-8-0  -  FREEBSD-7-STABLE  -  FREEBSD-7-0  -  FREEBSD-6-STABLE  -  FREEBSD-6-0  -  FREEBSD-5-STABLE  -  FREEBSD-5-0  -  FREEBSD-4-STABLE  -  FREEBSD-3-STABLE  -  FREEBSD22  -  l41  -  OPENBSD  -  linux-2.6  -  MK84  -  PLAN9  -  xnu-8792 
SearchContext: -  none  -  3  -  10 

    1 /*-
    2  * SPDX-License-Identifier: BSD-4-Clause
    3  *
    4  * Copyright (c) 2003 Peter Wemm.
    5  * Copyright (c) 1992 Terrence R. Lambert.
    6  * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
    7  * All rights reserved.
    8  *
    9  * This code is derived from software contributed to Berkeley by
   10  * William Jolitz.
   11  *
   12  * Redistribution and use in source and binary forms, with or without
   13  * modification, are permitted provided that the following conditions
   14  * are met:
   15  * 1. Redistributions of source code must retain the above copyright
   16  *    notice, this list of conditions and the following disclaimer.
   17  * 2. Redistributions in binary form must reproduce the above copyright
   18  *    notice, this list of conditions and the following disclaimer in the
   19  *    documentation and/or other materials provided with the distribution.
   20  * 3. All advertising materials mentioning features or use of this software
   21  *    must display the following acknowledgement:
   22  *      This product includes software developed by the University of
   23  *      California, Berkeley and its contributors.
   24  * 4. Neither the name of the University nor the names of its contributors
   25  *    may be used to endorse or promote products derived from this software
   26  *    without specific prior written permission.
   27  *
   28  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   29  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   30  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   31  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   32  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   33  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   34  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   35  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   36  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   37  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   38  * SUCH DAMAGE.
   39  *
   40  *      from: @(#)machdep.c     7.4 (Berkeley) 6/3/91
   41  */
   42 
   43 #include <sys/cdefs.h>
   44 __FBSDID("$FreeBSD$");
   45 
   46 #include "opt_atpic.h"
   47 #include "opt_cpu.h"
   48 #include "opt_ddb.h"
   49 #include "opt_inet.h"
   50 #include "opt_isa.h"
   51 #include "opt_kstack_pages.h"
   52 #include "opt_maxmem.h"
   53 #include "opt_pci.h"
   54 #include "opt_platform.h"
   55 #include "opt_sched.h"
   56 
   57 #include <sys/param.h>
   58 #include <sys/proc.h>
   59 #include <sys/systm.h>
   60 #include <sys/asan.h>
   61 #include <sys/bio.h>
   62 #include <sys/buf.h>
   63 #include <sys/bus.h>
   64 #include <sys/callout.h>
   65 #include <sys/cons.h>
   66 #include <sys/cpu.h>
   67 #include <sys/csan.h>
   68 #include <sys/efi.h>
   69 #include <sys/eventhandler.h>
   70 #include <sys/exec.h>
   71 #include <sys/imgact.h>
   72 #include <sys/kdb.h>
   73 #include <sys/kernel.h>
   74 #include <sys/ktr.h>
   75 #include <sys/linker.h>
   76 #include <sys/lock.h>
   77 #include <sys/malloc.h>
   78 #include <sys/memrange.h>
   79 #include <sys/msan.h>
   80 #include <sys/msgbuf.h>
   81 #include <sys/mutex.h>
   82 #include <sys/pcpu.h>
   83 #include <sys/ptrace.h>
   84 #include <sys/reboot.h>
   85 #include <sys/reg.h>
   86 #include <sys/rwlock.h>
   87 #include <sys/sched.h>
   88 #include <sys/signalvar.h>
   89 #ifdef SMP
   90 #include <sys/smp.h>
   91 #endif
   92 #include <sys/syscallsubr.h>
   93 #include <sys/sysctl.h>
   94 #include <sys/sysent.h>
   95 #include <sys/sysproto.h>
   96 #include <sys/ucontext.h>
   97 #include <sys/vmmeter.h>
   98 
   99 #include <vm/vm.h>
  100 #include <vm/vm_param.h>
  101 #include <vm/vm_extern.h>
  102 #include <vm/vm_kern.h>
  103 #include <vm/vm_page.h>
  104 #include <vm/vm_map.h>
  105 #include <vm/vm_object.h>
  106 #include <vm/vm_pager.h>
  107 #include <vm/vm_phys.h>
  108 #include <vm/vm_dumpset.h>
  109 
  110 #ifdef DDB
  111 #ifndef KDB
  112 #error KDB must be enabled in order for DDB to work!
  113 #endif
  114 #include <ddb/ddb.h>
  115 #include <ddb/db_sym.h>
  116 #endif
  117 
  118 #include <net/netisr.h>
  119 
  120 #include <machine/clock.h>
  121 #include <machine/cpu.h>
  122 #include <machine/cputypes.h>
  123 #include <machine/frame.h>
  124 #include <machine/intr_machdep.h>
  125 #include <x86/mca.h>
  126 #include <machine/md_var.h>
  127 #include <machine/metadata.h>
  128 #include <machine/pc/bios.h>
  129 #include <machine/pcb.h>
  130 #include <machine/proc.h>
  131 #include <machine/sigframe.h>
  132 #include <machine/specialreg.h>
  133 #include <machine/trap.h>
  134 #include <machine/tss.h>
  135 #include <x86/ucode.h>
  136 #include <x86/ifunc.h>
  137 #ifdef SMP
  138 #include <machine/smp.h>
  139 #endif
  140 #ifdef FDT
  141 #include <x86/fdt.h>
  142 #endif
  143 
  144 #ifdef DEV_ATPIC
  145 #include <x86/isa/icu.h>
  146 #else
  147 #include <x86/apicvar.h>
  148 #endif
  149 
  150 #include <isa/isareg.h>
  151 #include <isa/rtc.h>
  152 #include <x86/init.h>
  153 
  154 /* Sanity check for __curthread() */
  155 CTASSERT(offsetof(struct pcpu, pc_curthread) == 0);
  156 
  157 /*
  158  * The PTI trampoline stack needs enough space for a hardware trapframe and a
  159  * couple of scratch registers, as well as the trapframe left behind after an
  160  * iret fault.
  161  */
  162 CTASSERT(PC_PTI_STACK_SZ * sizeof(register_t) >= 2 * sizeof(struct pti_frame) -
  163     offsetof(struct pti_frame, pti_rip));
  164 
  165 extern u_int64_t hammer_time(u_int64_t, u_int64_t);
  166 
  167 static void cpu_startup(void *);
  168 SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL);
  169 
  170 /* Probe 8254 PIT and TSC. */
  171 static void native_clock_source_init(void);
  172 
  173 /* Preload data parse function */
  174 static caddr_t native_parse_preload_data(u_int64_t);
  175 
  176 /* Native function to fetch and parse the e820 map */
  177 static void native_parse_memmap(caddr_t, vm_paddr_t *, int *);
  178 
/*
 * Default init_ops implementation, wired to the native entry points
 * declared above (preload-data parsing, 8254/TSC clock probing, 8254
 * early delay, and E820/EFI memory-map parsing).  Presumably alternate
 * boot environments override these pointers — not visible in this file.
 */
struct init_ops init_ops = {
	.parse_preload_data =		native_parse_preload_data,
	.early_clock_source_init =	native_clock_source_init,
	.early_delay =			i8254_delay,
	.parse_memmap =			native_parse_memmap,
};
  186 
  187 /*
  188  * Physical address of the EFI System Table. Stashed from the metadata hints
  189  * passed into the kernel and used by the EFI code to call runtime services.
  190  */
  191 vm_paddr_t efi_systbl_phys;
  192 
  193 /* Intel ICH registers */
  194 #define ICH_PMBASE      0x400
  195 #define ICH_SMI_EN      ICH_PMBASE + 0x30
  196 
/* User segment selector values — initialized elsewhere during boot. */
int	_udatasel, _ucodesel, _ucode32sel, _ufssel, _ugssel;

/* Nonzero while the system is still booting; cleared elsewhere. */
int cold = 1;

long Maxmem = 0;	/* in pages: cpu_startup() prints ptoa(Maxmem) */
long realmem = 0;	/* in pages: set via atop() in cpu_startup() */
int late_console = 1;	/* presumably defers console init — set/used outside this view */

struct kva_md_info kmi;	/* filled by vm_ksubmap_init() in cpu_startup() */

struct region_descriptor r_idt;	/* presumably the IDT base/limit for lidt — loaded elsewhere */

struct pcpu *__pcpu;		/* per-CPU data (pc_curthread at offset 0, per CTASSERT above) */
struct pcpu temp_bsp_pcpu;	/* presumably a temporary BSP pcpu for early boot — confirm */

struct mtx icu_lock;		/* NOTE(review): presumably protects interrupt controller state */

struct mem_range_softc mem_range_softc;

struct mtx dt_lock;	/* lock for GDT and LDT */

void (*vmm_resume_p)(void);	/* optional hypervisor resume hook — registered elsewhere */

bool efi_boot;		/* presumably true when booted via UEFI — set outside this view */
  221 
  222 static void
  223 cpu_startup(dummy)
  224         void *dummy;
  225 {
  226         uintmax_t memsize;
  227         char *sysenv;
  228 
  229         /*
  230          * On MacBooks, we need to disallow the legacy USB circuit to
  231          * generate an SMI# because this can cause several problems,
  232          * namely: incorrect CPU frequency detection and failure to
  233          * start the APs.
  234          * We do this by disabling a bit in the SMI_EN (SMI Control and
  235          * Enable register) of the Intel ICH LPC Interface Bridge. 
  236          */
  237         sysenv = kern_getenv("smbios.system.product");
  238         if (sysenv != NULL) {
  239                 if (strncmp(sysenv, "MacBook1,1", 10) == 0 ||
  240                     strncmp(sysenv, "MacBook3,1", 10) == 0 ||
  241                     strncmp(sysenv, "MacBook4,1", 10) == 0 ||
  242                     strncmp(sysenv, "MacBookPro1,1", 13) == 0 ||
  243                     strncmp(sysenv, "MacBookPro1,2", 13) == 0 ||
  244                     strncmp(sysenv, "MacBookPro3,1", 13) == 0 ||
  245                     strncmp(sysenv, "MacBookPro4,1", 13) == 0 ||
  246                     strncmp(sysenv, "Macmini1,1", 10) == 0) {
  247                         if (bootverbose)
  248                                 printf("Disabling LEGACY_USB_EN bit on "
  249                                     "Intel ICH.\n");
  250                         outl(ICH_SMI_EN, inl(ICH_SMI_EN) & ~0x8);
  251                 }
  252                 freeenv(sysenv);
  253         }
  254 
  255         /*
  256          * Good {morning,afternoon,evening,night}.
  257          */
  258         startrtclock();
  259         printcpuinfo();
  260 
  261         /*
  262          * Display physical memory if SMBIOS reports reasonable amount.
  263          */
  264         memsize = 0;
  265         sysenv = kern_getenv("smbios.memory.enabled");
  266         if (sysenv != NULL) {
  267                 memsize = (uintmax_t)strtoul(sysenv, (char **)NULL, 10) << 10;
  268                 freeenv(sysenv);
  269         }
  270         if (memsize < ptoa((uintmax_t)vm_free_count()))
  271                 memsize = ptoa((uintmax_t)Maxmem);
  272         printf("real memory  = %ju (%ju MB)\n", memsize, memsize >> 20);
  273         realmem = atop(memsize);
  274 
  275         /*
  276          * Display any holes after the first chunk of extended memory.
  277          */
  278         if (bootverbose) {
  279                 int indx;
  280 
  281                 printf("Physical memory chunk(s):\n");
  282                 for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) {
  283                         vm_paddr_t size;
  284 
  285                         size = phys_avail[indx + 1] - phys_avail[indx];
  286                         printf(
  287                             "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n",
  288                             (uintmax_t)phys_avail[indx],
  289                             (uintmax_t)phys_avail[indx + 1] - 1,
  290                             (uintmax_t)size, (uintmax_t)size / PAGE_SIZE);
  291                 }
  292         }
  293 
  294         vm_ksubmap_init(&kmi);
  295 
  296         printf("avail memory = %ju (%ju MB)\n",
  297             ptoa((uintmax_t)vm_free_count()),
  298             ptoa((uintmax_t)vm_free_count()) / 1048576);
  299 #ifdef DEV_PCI
  300         if (bootverbose && intel_graphics_stolen_base != 0)
  301                 printf("intel stolen mem: base %#jx size %ju MB\n",
  302                     (uintmax_t)intel_graphics_stolen_base,
  303                     (uintmax_t)intel_graphics_stolen_size / 1024 / 1024);
  304 #endif
  305 
  306         /*
  307          * Set up buffers, so they can be used to read disk labels.
  308          */
  309         bufinit();
  310         vm_pager_bufferinit();
  311 
  312         cpu_setregs();
  313 }
  314 
/*
 * Run link_elf_late_ireloc() at SI_ORDER_ANY in SI_SUB_CPU, i.e. after
 * cpu_startup() (SI_ORDER_FIRST) — presumably so late ifunc resolution
 * sees the final CPU feature state; confirm against the linker code.
 */
static void
late_ifunc_resolve(void *dummy __unused)
{
	link_elf_late_ireloc();
}
SYSINIT(late_ifunc_resolve, SI_SUB_CPU, SI_ORDER_ANY, late_ifunc_resolve, NULL);
  321 
  322 
  323 void
  324 cpu_setregs(void)
  325 {
  326         register_t cr0;
  327 
  328         cr0 = rcr0();
  329         /*
  330          * CR0_MP, CR0_NE and CR0_TS are also set by npx_probe() for the
  331          * BSP.  See the comments there about why we set them.
  332          */
  333         cr0 |= CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM;
  334         load_cr0(cr0);
  335 }
  336 
  337 /*
  338  * Initialize amd64 and configure to run kernel
  339  */
  340 
  341 /*
  342  * Initialize segments & interrupt table
  343  */
/* The IDT proper and the pointer through which setidt() indexes it. */
static struct gate_descriptor idt0[NIDT];
struct gate_descriptor *idt = &idt0[0]; /* interrupt descriptor table */

/*
 * Dedicated exception stacks — per the names, used for double fault,
 * machine check, NMI and debug exceptions (presumably wired into the
 * TSS/IST slots elsewhere in this file; confirm at the IST setup code).
 */
static char dblfault_stack[DBLFAULT_STACK_SIZE] __aligned(16);
static char mce0_stack[MCE_STACK_SIZE] __aligned(16);
static char nmi0_stack[NMI_STACK_SIZE] __aligned(16);
static char dbg0_stack[DBG_STACK_SIZE] __aligned(16);
CTASSERT(sizeof(struct nmi_pcpu) == 16);
  352 
  353 /*
  354  * Software prototypes -- in more palatable form.
  355  *
  356  * Keep GUFS32, GUGS32, GUCODE32 and GUDATA at the same
  357  * slots as corresponding segments for i386 kernel.
  358  */
/*
 * GDT contents in software (unpacked) form; ssdtosd()/ssdtosyssd()
 * convert these into hardware descriptors.  Entries 10 and 12 are the
 * upper halves of the preceding 16-byte system descriptors (TSS, LDT).
 */
struct soft_segment_descriptor gdt_segs[] = {
/* GNULL_SEL    0 Null Descriptor */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* GNULL2_SEL   1 Null Descriptor */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* GUFS32_SEL   2 32 bit %gs Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GUGS32_SEL   3 32 bit %fs Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GCODE_SEL    4 Code Descriptor for kernel */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMERA,
	.ssd_dpl = SEL_KPL,
	.ssd_p = 1,
	.ssd_long = 1,
	.ssd_def32 = 0,
	.ssd_gran = 1		},
/* GDATA_SEL    5 Data Descriptor for kernel */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_KPL,
	.ssd_p = 1,
	.ssd_long = 1,
	.ssd_def32 = 0,
	.ssd_gran = 1		},
/* GUCODE32_SEL 6 32 bit Code Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMERA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GUDATA_SEL   7 32/64 bit Data Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GUCODE_SEL   8 64 bit Code Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMERA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 1,
	.ssd_def32 = 0,
	.ssd_gran = 1		},
/* GPROC0_SEL   9 Proc 0 Tss Descriptor */
{	.ssd_base = 0x0,
	.ssd_limit = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE - 1,
	.ssd_type = SDT_SYSTSS,
	.ssd_dpl = SEL_KPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* Actually, the TSS is a system descriptor which is double size */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* GUSERLDT_SEL 11 LDT Descriptor */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* GUSERLDT_SEL 12 LDT Descriptor, double size */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
};
_Static_assert(nitems(gdt_segs) == NGDT, "Stale NGDT");
  479 
  480 void
  481 setidt(int idx, inthand_t *func, int typ, int dpl, int ist)
  482 {
  483         struct gate_descriptor *ip;
  484 
  485         ip = idt + idx;
  486         ip->gd_looffset = (uintptr_t)func;
  487         ip->gd_selector = GSEL(GCODE_SEL, SEL_KPL);
  488         ip->gd_ist = ist;
  489         ip->gd_xx = 0;
  490         ip->gd_type = typ;
  491         ip->gd_dpl = dpl;
  492         ip->gd_p = 1;
  493         ip->gd_hioffset = ((uintptr_t)func)>>16 ;
  494 }
  495 
  496 extern inthand_t
  497         IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
  498         IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
  499         IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
  500         IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align),
  501         IDTVEC(xmm), IDTVEC(dblfault),
  502         IDTVEC(div_pti), IDTVEC(bpt_pti),
  503         IDTVEC(ofl_pti), IDTVEC(bnd_pti), IDTVEC(ill_pti), IDTVEC(dna_pti),
  504         IDTVEC(fpusegm_pti), IDTVEC(tss_pti), IDTVEC(missing_pti),
  505         IDTVEC(stk_pti), IDTVEC(prot_pti), IDTVEC(page_pti),
  506         IDTVEC(rsvd_pti), IDTVEC(fpu_pti), IDTVEC(align_pti),
  507         IDTVEC(xmm_pti),
  508 #ifdef KDTRACE_HOOKS
  509         IDTVEC(dtrace_ret), IDTVEC(dtrace_ret_pti),
  510 #endif
  511 #ifdef XENHVM
  512         IDTVEC(xen_intr_upcall), IDTVEC(xen_intr_upcall_pti),
  513 #endif
  514         IDTVEC(fast_syscall), IDTVEC(fast_syscall32),
  515         IDTVEC(fast_syscall_pti);
  516 
  517 #ifdef DDB
  518 /*
  519  * Display the index and function name of any IDT entries that don't use
  520  * the default 'rsvd' entry point.
  521  */
  522 DB_SHOW_COMMAND_FLAGS(idt, db_show_idt, DB_CMD_MEMSAFE)
  523 {
  524         struct gate_descriptor *ip;
  525         int idx;
  526         uintptr_t func;
  527 
  528         ip = idt;
  529         for (idx = 0; idx < NIDT && !db_pager_quit; idx++) {
  530                 func = ((long)ip->gd_hioffset << 16 | ip->gd_looffset);
  531                 if (func != (uintptr_t)&IDTVEC(rsvd)) {
  532                         db_printf("%3d\t", idx);
  533                         db_printsym(func, DB_STGY_PROC);
  534                         db_printf("\n");
  535                 }
  536                 ip++;
  537         }
  538 }
  539 
/* Show privileged registers. */
DB_SHOW_COMMAND_FLAGS(sysregs, db_show_sysregs, DB_CMD_MEMSAFE)
{
	/* 10-byte pseudo-descriptor layout used by sidt/sgdt. */
	struct {
		uint16_t limit;
		uint64_t base;
	} __packed idtr, gdtr;
	uint16_t ldt, tr;

	/* Descriptor-table registers: IDTR, GDTR, LDTR, TR. */
	__asm __volatile("sidt %0" : "=m" (idtr));
	db_printf("idtr\t0x%016lx/%04x\n",
	    (u_long)idtr.base, (u_int)idtr.limit);
	__asm __volatile("sgdt %0" : "=m" (gdtr));
	db_printf("gdtr\t0x%016lx/%04x\n",
	    (u_long)gdtr.base, (u_int)gdtr.limit);
	__asm __volatile("sldt %0" : "=r" (ldt));
	db_printf("ldtr\t0x%04x\n", ldt);
	__asm __volatile("str %0" : "=r" (tr));
	db_printf("tr\t0x%04x\n", tr);
	/* Control registers, plus XCR0 only when CR4 says XSAVE is on. */
	db_printf("cr0\t0x%016lx\n", rcr0());
	db_printf("cr2\t0x%016lx\n", rcr2());
	db_printf("cr3\t0x%016lx\n", rcr3());
	db_printf("cr4\t0x%016lx\n", rcr4());
	if (rcr4() & CR4_XSAVE)
		db_printf("xcr0\t0x%016lx\n", rxcr(0));
	/* Assorted MSRs; FEATURES_CTL only if VMX/SMX is advertised. */
	db_printf("EFER\t0x%016lx\n", rdmsr(MSR_EFER));
	if (cpu_feature2 & (CPUID2_VMX | CPUID2_SMX))
		db_printf("FEATURES_CTL\t%016lx\n",
		    rdmsr(MSR_IA32_FEATURE_CONTROL));
	db_printf("DEBUG_CTL\t0x%016lx\n", rdmsr(MSR_DEBUGCTLMSR));
	db_printf("PAT\t0x%016lx\n", rdmsr(MSR_PAT));
	db_printf("GSBASE\t0x%016lx\n", rdmsr(MSR_GSBASE));
}
  573 
/* Show the hardware debug registers (DR0-DR3, DR6 status, DR7 control). */
DB_SHOW_COMMAND_FLAGS(dbregs, db_show_dbregs, DB_CMD_MEMSAFE)
{

	db_printf("dr0\t0x%016lx\n", rdr0());
	db_printf("dr1\t0x%016lx\n", rdr1());
	db_printf("dr2\t0x%016lx\n", rdr2());
	db_printf("dr3\t0x%016lx\n", rdr3());
	db_printf("dr6\t0x%016lx\n", rdr6());
	db_printf("dr7\t0x%016lx\n", rdr7());
}
  584 #endif
  585 
  586 void
  587 sdtossd(sd, ssd)
  588         struct user_segment_descriptor *sd;
  589         struct soft_segment_descriptor *ssd;
  590 {
  591 
  592         ssd->ssd_base  = (sd->sd_hibase << 24) | sd->sd_lobase;
  593         ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit;
  594         ssd->ssd_type  = sd->sd_type;
  595         ssd->ssd_dpl   = sd->sd_dpl;
  596         ssd->ssd_p     = sd->sd_p;
  597         ssd->ssd_long  = sd->sd_long;
  598         ssd->ssd_def32 = sd->sd_def32;
  599         ssd->ssd_gran  = sd->sd_gran;
  600 }
  601 
  602 void
  603 ssdtosd(ssd, sd)
  604         struct soft_segment_descriptor *ssd;
  605         struct user_segment_descriptor *sd;
  606 {
  607 
  608         sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
  609         sd->sd_hibase = (ssd->ssd_base >> 24) & 0xff;
  610         sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
  611         sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
  612         sd->sd_type  = ssd->ssd_type;
  613         sd->sd_dpl   = ssd->ssd_dpl;
  614         sd->sd_p     = ssd->ssd_p;
  615         sd->sd_long  = ssd->ssd_long;
  616         sd->sd_def32 = ssd->ssd_def32;
  617         sd->sd_gran  = ssd->ssd_gran;
  618 }
  619 
  620 void
  621 ssdtosyssd(ssd, sd)
  622         struct soft_segment_descriptor *ssd;
  623         struct system_segment_descriptor *sd;
  624 {
  625 
  626         sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
  627         sd->sd_hibase = (ssd->ssd_base >> 24) & 0xfffffffffful;
  628         sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
  629         sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
  630         sd->sd_type  = ssd->ssd_type;
  631         sd->sd_dpl   = ssd->ssd_dpl;
  632         sd->sd_p     = ssd->ssd_p;
  633         sd->sd_gran  = ssd->ssd_gran;
  634 }
  635 
  636 u_int basemem;
  637 
/*
 * Insert the physical range [base, base + length) into 'physmap', an
 * address-sorted array of base/boundary pairs.  Ranges adjacent to an
 * existing entry are coalesced; ranges overlapping an existing entry
 * are dropped with a warning.
 *
 * Returns 1 on success (including the benign drop cases) and 0 when
 * the map is full, signalling the caller to stop adding entries.
 */
static int
add_physmap_entry(uint64_t base, uint64_t length, vm_paddr_t *physmap,
    int *physmap_idxp)
{
	int i, insert_idx, physmap_idx;

	physmap_idx = *physmap_idxp;

	/* An empty range carries no information. */
	if (length == 0)
		return (1);

	/*
	 * Find insertion point while checking for overlap.  Start off by
	 * assuming the new entry will be added to the end.
	 *
	 * NB: physmap_idx points to the next free slot.
	 */
	insert_idx = physmap_idx;
	for (i = 0; i <= physmap_idx; i += 2) {
		if (base < physmap[i + 1]) {
			/* Entirely below entry i: this is the slot. */
			if (base + length <= physmap[i]) {
				insert_idx = i;
				break;
			}
			/* Otherwise it intersects entry i: drop it. */
			if (boothowto & RB_VERBOSE)
				printf(
		    "Overlapping memory regions, ignoring second region\n");
			return (1);
		}
	}

	/* See if we can prepend to the next entry. */
	if (insert_idx <= physmap_idx && base + length == physmap[insert_idx]) {
		physmap[insert_idx] = base;
		return (1);
	}

	/* See if we can append to the previous entry. */
	if (insert_idx > 0 && base == physmap[insert_idx - 1]) {
		physmap[insert_idx - 1] += length;
		return (1);
	}

	/* A fresh pair is needed; make sure there is room for it. */
	physmap_idx += 2;
	*physmap_idxp = physmap_idx;
	if (physmap_idx == PHYS_AVAIL_ENTRIES) {
		printf(
		"Too many segments in the physical address map, giving up\n");
		return (0);
	}

	/*
	 * Move the last 'N' entries down to make room for the new
	 * entry if needed.
	 */
	for (i = (physmap_idx - 2); i > insert_idx; i -= 2) {
		physmap[i] = physmap[i - 2];
		physmap[i + 1] = physmap[i - 1];
	}

	/* Insert the new entry. */
	physmap[insert_idx] = base;
	physmap[insert_idx + 1] = base + length;
	return (1);
}
  703 
  704 void
  705 bios_add_smap_entries(struct bios_smap *smapbase, u_int32_t smapsize,
  706                       vm_paddr_t *physmap, int *physmap_idx)
  707 {
  708         struct bios_smap *smap, *smapend;
  709 
  710         smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize);
  711 
  712         for (smap = smapbase; smap < smapend; smap++) {
  713                 if (boothowto & RB_VERBOSE)
  714                         printf("SMAP type=%02x base=%016lx len=%016lx\n",
  715                             smap->type, smap->base, smap->length);
  716 
  717                 if (smap->type != SMAP_TYPE_MEMORY)
  718                         continue;
  719 
  720                 if (!add_physmap_entry(smap->base, smap->length, physmap,
  721                     physmap_idx))
  722                         break;
  723         }
  724 }
  725 
  726 static void
  727 add_efi_map_entries(struct efi_map_header *efihdr, vm_paddr_t *physmap,
  728     int *physmap_idx)
  729 {
  730         struct efi_md *map, *p;
  731         const char *type;
  732         size_t efisz;
  733         int ndesc, i;
  734 
  735         static const char *types[] = {
  736                 "Reserved",
  737                 "LoaderCode",
  738                 "LoaderData",
  739                 "BootServicesCode",
  740                 "BootServicesData",
  741                 "RuntimeServicesCode",
  742                 "RuntimeServicesData",
  743                 "ConventionalMemory",
  744                 "UnusableMemory",
  745                 "ACPIReclaimMemory",
  746                 "ACPIMemoryNVS",
  747                 "MemoryMappedIO",
  748                 "MemoryMappedIOPortSpace",
  749                 "PalCode",
  750                 "PersistentMemory"
  751         };
  752 
  753         /*
  754          * Memory map data provided by UEFI via the GetMemoryMap
  755          * Boot Services API.
  756          */
  757         efisz = (sizeof(struct efi_map_header) + 0xf) & ~0xf;
  758         map = (struct efi_md *)((uint8_t *)efihdr + efisz);
  759 
  760         if (efihdr->descriptor_size == 0)
  761                 return;
  762         ndesc = efihdr->memory_size / efihdr->descriptor_size;
  763 
  764         if (boothowto & RB_VERBOSE)
  765                 printf("%23s %12s %12s %8s %4s\n",
  766                     "Type", "Physical", "Virtual", "#Pages", "Attr");
  767 
  768         for (i = 0, p = map; i < ndesc; i++,
  769             p = efi_next_descriptor(p, efihdr->descriptor_size)) {
  770                 if (boothowto & RB_VERBOSE) {
  771                         if (p->md_type < nitems(types))
  772                                 type = types[p->md_type];
  773                         else
  774                                 type = "<INVALID>";
  775                         printf("%23s %012lx %012lx %08lx ", type, p->md_phys,
  776                             p->md_virt, p->md_pages);
  777                         if (p->md_attr & EFI_MD_ATTR_UC)
  778                                 printf("UC ");
  779                         if (p->md_attr & EFI_MD_ATTR_WC)
  780                                 printf("WC ");
  781                         if (p->md_attr & EFI_MD_ATTR_WT)
  782                                 printf("WT ");
  783                         if (p->md_attr & EFI_MD_ATTR_WB)
  784                                 printf("WB ");
  785                         if (p->md_attr & EFI_MD_ATTR_UCE)
  786                                 printf("UCE ");
  787                         if (p->md_attr & EFI_MD_ATTR_WP)
  788                                 printf("WP ");
  789                         if (p->md_attr & EFI_MD_ATTR_RP)
  790                                 printf("RP ");
  791                         if (p->md_attr & EFI_MD_ATTR_XP)
  792                                 printf("XP ");
  793                         if (p->md_attr & EFI_MD_ATTR_NV)
  794                                 printf("NV ");
  795                         if (p->md_attr & EFI_MD_ATTR_MORE_RELIABLE)
  796                                 printf("MORE_RELIABLE ");
  797                         if (p->md_attr & EFI_MD_ATTR_RO)
  798                                 printf("RO ");
  799                         if (p->md_attr & EFI_MD_ATTR_RT)
  800                                 printf("RUNTIME");
  801                         printf("\n");
  802                 }
  803 
  804                 switch (p->md_type) {
  805                 case EFI_MD_TYPE_CODE:
  806                 case EFI_MD_TYPE_DATA:
  807                 case EFI_MD_TYPE_BS_CODE:
  808                 case EFI_MD_TYPE_BS_DATA:
  809                 case EFI_MD_TYPE_FREE:
  810                         /*
  811                          * We're allowed to use any entry with these types.
  812                          */
  813                         break;
  814                 default:
  815                         continue;
  816                 }
  817 
  818                 if (!add_physmap_entry(p->md_phys, p->md_pages * EFI_PAGE_SIZE,
  819                     physmap, physmap_idx))
  820                         break;
  821         }
  822 }
  823 
  824 static void
  825 native_parse_memmap(caddr_t kmdp, vm_paddr_t *physmap, int *physmap_idx)
  826 {
  827         struct bios_smap *smap;
  828         struct efi_map_header *efihdr;
  829         u_int32_t size;
  830 
  831         /*
  832          * Memory map from INT 15:E820.
  833          *
  834          * subr_module.c says:
  835          * "Consumer may safely assume that size value precedes data."
  836          * ie: an int32_t immediately precedes smap.
  837          */
  838 
  839         efihdr = (struct efi_map_header *)preload_search_info(kmdp,
  840             MODINFO_METADATA | MODINFOMD_EFI_MAP);
  841         smap = (struct bios_smap *)preload_search_info(kmdp,
  842             MODINFO_METADATA | MODINFOMD_SMAP);
  843         if (efihdr == NULL && smap == NULL)
  844                 panic("No BIOS smap or EFI map info from loader!");
  845 
  846         if (efihdr != NULL) {
  847                 add_efi_map_entries(efihdr, physmap, physmap_idx);
  848                 strlcpy(bootmethod, "UEFI", sizeof(bootmethod));
  849         } else {
  850                 size = *((u_int32_t *)smap - 1);
  851                 bios_add_smap_entries(smap, size, physmap, physmap_idx);
  852                 strlcpy(bootmethod, "BIOS", sizeof(bootmethod));
  853         }
  854 }
  855 
#define PAGES_PER_GB    (1024 * 1024 * 1024 / PAGE_SIZE)

/*
 * Populate the (physmap) array with base/bound pairs describing the
 * available physical memory in the system, then test this memory and
 * build the phys_avail array describing the actually-available memory.
 *
 * Total memory size may be set by the kernel environment variable
 * hw.physmem or the compile-time define MAXMEM.
 *
 * XXX first should be vm_paddr_t.
 */
static void
getmemsize(caddr_t kmdp, u_int64_t first)
{
	int i, physmap_idx, pa_indx, da_indx;
	vm_paddr_t pa, physmap[PHYS_AVAIL_ENTRIES];
	u_long physmem_start, physmem_tunable, memtest;
	pt_entry_t *pte;
	quad_t dcons_addr, dcons_size;
	int page_counter;

	/*
	 * Tell the physical memory allocator about pages used to store
	 * the kernel and preloaded data.  See kmem_bootstrap_free().
	 */
	vm_phys_early_add_seg((vm_paddr_t)kernphys, trunc_page(first));

	bzero(physmap, sizeof(physmap));
	physmap_idx = 0;

	init_ops.parse_memmap(kmdp, physmap, &physmap_idx);
	/* physmap_idx now indexes the last base/bound pair. */
	physmap_idx -= 2;

	/*
	 * Find the 'base memory' segment for SMP
	 */
	basemem = 0;
	for (i = 0; i <= physmap_idx; i += 2) {
		if (physmap[i] <= 0xA0000) {
			basemem = physmap[i + 1] / 1024;
			break;
		}
	}
	if (basemem == 0 || basemem > 640) {
		if (bootverbose)
			printf(
		"Memory map doesn't contain a basemem segment, faking it");
		basemem = 640;
	}

	/*
	 * Maxmem isn't the "maximum memory", it's one larger than the
	 * highest page of the physical address space.  It should be
	 * called something like "Maxphyspage".  We may adjust this
	 * based on ``hw.physmem'' and the results of the memory test.
	 */
	Maxmem = atop(physmap[physmap_idx + 1]);

#ifdef MAXMEM
	/* MAXMEM is in KB; Maxmem counts 4K pages, hence the /4. */
	Maxmem = MAXMEM / 4;
#endif

	if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable))
		Maxmem = atop(physmem_tunable);

	/*
	 * The boot memory test is disabled by default, as it takes a
	 * significant amount of time on large-memory systems, and is
	 * unfriendly to virtual machines as it unnecessarily touches all
	 * pages.
	 *
	 * A general name is used as the code may be extended to support
	 * additional tests beyond the current "page present" test.
	 */
	memtest = 0;
	TUNABLE_ULONG_FETCH("hw.memtest.tests", &memtest);

	/*
	 * Don't allow MAXMEM or hw.physmem to extend the amount of memory
	 * in the system.
	 */
	if (Maxmem > atop(physmap[physmap_idx + 1]))
		Maxmem = atop(physmap[physmap_idx + 1]);

	if (atop(physmap[physmap_idx + 1]) != Maxmem &&
	    (boothowto & RB_VERBOSE))
		printf("Physical memory use set to %ldK\n", Maxmem * 4);

	/* call pmap initialization to make new kernel address space */
	pmap_bootstrap(&first);

	/*
	 * Size up each available chunk of physical memory.
	 *
	 * XXX Some BIOSes corrupt low 64KB between suspend and resume.
	 * By default, mask off the first 16 pages unless we appear to be
	 * running in a VM.
	 */
	physmem_start = (vm_guest > VM_GUEST_NO ? 1 : 16) << PAGE_SHIFT;
	TUNABLE_ULONG_FETCH("hw.physmem.start", &physmem_start);
	if (physmap[0] < physmem_start) {
		if (physmem_start < PAGE_SIZE)
			physmap[0] = PAGE_SIZE;
		else if (physmem_start >= physmap[1])
			physmap[0] = round_page(physmap[1] - PAGE_SIZE);
		else
			physmap[0] = round_page(physmem_start);
	}
	pa_indx = 0;
	da_indx = 1;
	phys_avail[pa_indx++] = physmap[0];
	phys_avail[pa_indx] = physmap[0];
	dump_avail[da_indx] = physmap[0];
	/* CMAP1/CADDR1 provide a scratch mapping for probing each page. */
	pte = CMAP1;

	/*
	 * Get dcons buffer address
	 */
	if (getenv_quad("dcons.addr", &dcons_addr) == 0 ||
	    getenv_quad("dcons.size", &dcons_size) == 0)
		dcons_addr = 0;

	/*
	 * physmap is in bytes, so when converting to page boundaries,
	 * round up the start address and round down the end address.
	 */
	page_counter = 0;
	if (memtest != 0)
		printf("Testing system memory");
	for (i = 0; i <= physmap_idx; i += 2) {
		vm_paddr_t end;

		end = ptoa((vm_paddr_t)Maxmem);
		if (physmap[i + 1] < end)
			end = trunc_page(physmap[i + 1]);
		for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) {
			int tmp, page_bad, full;
			int *ptr = (int *)CADDR1;

			full = FALSE;
			/*
			 * block out kernel memory as not available.
			 */
			if (pa >= (vm_paddr_t)kernphys && pa < first)
				goto do_dump_avail;

			/*
			 * block out dcons buffer
			 */
			if (dcons_addr > 0
			    && pa >= trunc_page(dcons_addr)
			    && pa < dcons_addr + dcons_size)
				goto do_dump_avail;

			page_bad = FALSE;
			if (memtest == 0)
				goto skip_memtest;

			/*
			 * Print a "." every GB to show we're making
			 * progress.
			 */
			page_counter++;
			if ((page_counter % PAGES_PER_GB) == 0)
				printf(".");

			/*
			 * map page into kernel: valid, read/write,non-cacheable
			 */
			*pte = pa | PG_V | PG_RW | PG_NC_PWT | PG_NC_PCD;
			invltlb();

			tmp = *(int *)ptr;
			/*
			 * Test for alternating 1's and 0's
			 */
			*(volatile int *)ptr = 0xaaaaaaaa;
			if (*(volatile int *)ptr != 0xaaaaaaaa)
				page_bad = TRUE;
			/*
			 * Test for alternating 0's and 1's
			 */
			*(volatile int *)ptr = 0x55555555;
			if (*(volatile int *)ptr != 0x55555555)
				page_bad = TRUE;
			/*
			 * Test for all 1's
			 */
			*(volatile int *)ptr = 0xffffffff;
			if (*(volatile int *)ptr != 0xffffffff)
				page_bad = TRUE;
			/*
			 * Test for all 0's
			 */
			*(volatile int *)ptr = 0x0;
			if (*(volatile int *)ptr != 0x0)
				page_bad = TRUE;
			/*
			 * Restore original value.
			 */
			*(int *)ptr = tmp;

skip_memtest:
			/*
			 * Adjust array of valid/good pages.
			 */
			if (page_bad == TRUE)
				continue;
			/*
			 * If this good page is a continuation of the
			 * previous set of good pages, then just increase
			 * the end pointer. Otherwise start a new chunk.
			 * Note that "end" points one higher than end,
			 * making the range >= start and < end.
			 * If we're also doing a speculative memory
			 * test and we at or past the end, bump up Maxmem
			 * so that we keep going. The first bad page
			 * will terminate the loop.
			 */
			if (phys_avail[pa_indx] == pa) {
				phys_avail[pa_indx] += PAGE_SIZE;
			} else {
				pa_indx++;
				if (pa_indx == PHYS_AVAIL_ENTRIES) {
					printf(
		"Too many holes in the physical address space, giving up\n");
					pa_indx--;
					full = TRUE;
					goto do_dump_avail;
				}
				phys_avail[pa_indx++] = pa;	/* start */
				phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */
			}
			physmem++;
do_dump_avail:
			/*
			 * dump_avail tracks all RAM (including pages
			 * excluded from phys_avail above) for kernel dumps.
			 */
			if (dump_avail[da_indx] == pa) {
				dump_avail[da_indx] += PAGE_SIZE;
			} else {
				da_indx++;
				if (da_indx == PHYS_AVAIL_ENTRIES) {
					da_indx--;
					goto do_next;
				}
				dump_avail[da_indx++] = pa; /* start */
				dump_avail[da_indx] = pa + PAGE_SIZE; /* end */
			}
do_next:
			if (full)
				break;
		}
	}
	/* Tear down the scratch mapping used by the memory test. */
	*pte = 0;
	invltlb();
	if (memtest != 0)
		printf("\n");

	/*
	 * XXX
	 * The last chunk must contain at least one page plus the message
	 * buffer to avoid complicating other code (message buffer address
	 * calculation, etc.).
	 */
	while (phys_avail[pa_indx - 1] + PAGE_SIZE +
	    round_page(msgbufsize) >= phys_avail[pa_indx]) {
		physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]);
		phys_avail[pa_indx--] = 0;
		phys_avail[pa_indx--] = 0;
	}

	Maxmem = atop(phys_avail[pa_indx]);

	/* Trim off space for the message buffer. */
	phys_avail[pa_indx] -= round_page(msgbufsize);

	/* Map the message buffer. */
	msgbufp = (struct msgbuf *)PHYS_TO_DMAP(phys_avail[pa_indx]);
}
 1134 
/*
 * Parse the metadata handed over by the loader: locate the kernel's
 * preload module entry, fetch boothowto and the static kernel
 * environment, load ddb symbol table bounds, and record the EFI
 * system table address.  Returns the kernel module metadata pointer.
 */
static caddr_t
native_parse_preload_data(u_int64_t modulep)
{
	caddr_t kmdp;
	char *envp;
#ifdef DDB
	vm_offset_t ksym_start;
	vm_offset_t ksym_end;
#endif

	/* Relocate the loader-provided metadata pointer into KVA. */
	preload_metadata = (caddr_t)(uintptr_t)(modulep + KERNBASE);
	preload_bootstrap_relocate(KERNBASE);
	kmdp = preload_search_by_type("elf kernel");
	if (kmdp == NULL)
		kmdp = preload_search_by_type("elf64 kernel");
	boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int);
	envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *);
	if (envp != NULL)
		envp += KERNBASE;	/* relocate the env pointer too */
	init_static_kenv(envp, 0);
#ifdef DDB
	ksym_start = MD_FETCH(kmdp, MODINFOMD_SSYM, uintptr_t);
	ksym_end = MD_FETCH(kmdp, MODINFOMD_ESYM, uintptr_t);
	db_fetch_ksymtab(ksym_start, ksym_end, 0);
#endif
	efi_systbl_phys = MD_FETCH(kmdp, MODINFOMD_FW_HANDLE, vm_paddr_t);

	return (kmdp);
}
 1164 
/*
 * Bare-metal clock source bring-up: program the i8254 interval timer.
 * NOTE(review): presumably reachable through init_ops so other
 * platforms can substitute their own clock init — confirm.
 */
static void
native_clock_source_init(void)
{
	i8254_init();
}
 1170 
/*
 * Initialize the kernel debugger framework and, if the boot flags
 * requested it, drop into the debugger immediately.
 */
static void
amd64_kdb_init(void)
{
	kdb_init();
#ifdef KDB
	if (boothowto & RB_KDB)
		kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger");
#endif
}
 1180 
/* Set up the fast syscall stuff */
void
amd64_conf_fast_syscall(void)
{
	uint64_t msr;

	/* Enable the SYSCALL/SYSRET instructions. */
	msr = rdmsr(MSR_EFER) | EFER_SCE;
	wrmsr(MSR_EFER, msr);
	/*
	 * 64-bit SYSCALL entry point; use the PTI trampoline variant
	 * when page-table isolation is enabled.
	 */
	wrmsr(MSR_LSTAR, pti ? (u_int64_t)IDTVEC(fast_syscall_pti) :
	    (u_int64_t)IDTVEC(fast_syscall));
	/* Entry point for SYSCALL issued from 32-bit compat mode. */
	wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32));
	/*
	 * STAR holds the kernel CS/SS selector base for syscall entry
	 * (bits 47:32) and the selector base for sysret (bits 63:48).
	 */
	msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
	    ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48);
	wrmsr(MSR_STAR, msr);
	/* RFLAGS bits masked (cleared) by the CPU on syscall entry. */
	wrmsr(MSR_SF_MASK, PSL_NT | PSL_T | PSL_I | PSL_C | PSL_D | PSL_AC);
}
 1197 
/*
 * First-stage BSP per-CPU data initialization: wire up the pcpu
 * pointers that reference the GDT descriptors, TSS, LDT and thread0.
 */
void
amd64_bsp_pcpu_init1(struct pcpu *pc)
{
	struct user_segment_descriptor *gdt;

	PCPU_SET(prvspace, pc);
	gdt = *PCPU_PTR(gdt);
	PCPU_SET(curthread, &thread0);
	PCPU_SET(tssp, PCPU_PTR(common_tss));
	/* TSS and LDT are system descriptors within the GDT. */
	PCPU_SET(tss, (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);
	PCPU_SET(ldt, (struct system_segment_descriptor *)&gdt[GUSERLDT_SEL]);
	PCPU_SET(fs32p, &gdt[GUFS32_SEL]);
	PCPU_SET(gs32p, &gdt[GUGS32_SEL]);
	PCPU_SET(ucr3_load_mask, PMAP_UCR3_NOMASK);
	PCPU_SET(smp_tlb_gen, 1);
}
 1214 
/*
 * Second-stage BSP per-CPU initialization, once thread0's stack is
 * known: record the kernel stack top (rsp0), the PTI trampoline
 * stack top, and the current PCB.
 */
void
amd64_bsp_pcpu_init2(uint64_t rsp0)
{

	PCPU_SET(rsp0, rsp0);
	/* Top of the per-CPU PTI stack, aligned down to 16 bytes. */
	PCPU_SET(pti_rsp0, ((vm_offset_t)PCPU_PTR(pti_stack) +
	    PC_PTI_STACK_SZ * sizeof(uint64_t)) & ~0xful);
	PCPU_SET(curpcb, thread0.td_pcb);
}
 1224 
 1225 void
 1226 amd64_bsp_ist_init(struct pcpu *pc)
 1227 {
 1228         struct nmi_pcpu *np;
 1229         struct amd64tss *tssp;
 1230 
 1231         tssp = &pc->pc_common_tss;
 1232 
 1233         /* doublefault stack space, runs on ist1 */
 1234         np = ((struct nmi_pcpu *)&dblfault_stack[sizeof(dblfault_stack)]) - 1;
 1235         np->np_pcpu = (register_t)pc;
 1236         tssp->tss_ist1 = (long)np;
 1237 
 1238         /*
 1239          * NMI stack, runs on ist2.  The pcpu pointer is stored just
 1240          * above the start of the ist2 stack.
 1241          */
 1242         np = ((struct nmi_pcpu *)&nmi0_stack[sizeof(nmi0_stack)]) - 1;
 1243         np->np_pcpu = (register_t)pc;
 1244         tssp->tss_ist2 = (long)np;
 1245 
 1246         /*
 1247          * MC# stack, runs on ist3.  The pcpu pointer is stored just
 1248          * above the start of the ist3 stack.
 1249          */
 1250         np = ((struct nmi_pcpu *)&mce0_stack[sizeof(mce0_stack)]) - 1;
 1251         np->np_pcpu = (register_t)pc;
 1252         tssp->tss_ist3 = (long)np;
 1253 
 1254         /*
 1255          * DB# stack, runs on ist4.
 1256          */
 1257         np = ((struct nmi_pcpu *)&dbg0_stack[sizeof(dbg0_stack)]) - 1;
 1258         np->np_pcpu = (register_t)pc;
 1259         tssp->tss_ist4 = (long)np;
 1260 }
 1261 
 1262 /*
 1263  * Calculate the kernel load address by inspecting page table created by loader.
 1264  * The assumptions:
 1265  * - kernel is mapped at KERNBASE, backed by contiguous phys memory
 1266  *   aligned at 2M, below 4G (the latter is important for AP startup)
 1267  * - there is a 2M hole at KERNBASE (KERNSTART = KERNBASE + 2M)
 1268  * - kernel is mapped with 2M superpages
 1269  * - all participating memory, i.e. kernel, modules, metadata,
 1270  *   page table is accessible by pre-created 1:1 mapping
 1271  *   (right now loader creates 1:1 mapping for lower 4G, and all
 1272  *   memory is from there)
 1273  * - there is a usable memory block right after the end of the
 1274  *   mapped kernel and all modules/metadata, pointed to by
 1275  *   physfree, for early allocations
 1276  */
 1277 vm_paddr_t __nosanitizeaddress __nosanitizememory
 1278 amd64_loadaddr(void)
 1279 {
 1280         pml4_entry_t *pml4e;
 1281         pdp_entry_t *pdpe;
 1282         pd_entry_t *pde;
 1283         uint64_t cr3;
 1284 
 1285         cr3 = rcr3();
 1286         pml4e = (pml4_entry_t *)cr3 + pmap_pml4e_index(KERNSTART);
 1287         pdpe = (pdp_entry_t *)(*pml4e & PG_FRAME) + pmap_pdpe_index(KERNSTART);
 1288         pde = (pd_entry_t *)(*pdpe & PG_FRAME) + pmap_pde_index(KERNSTART);
 1289         return (*pde & PG_FRAME);
 1290 }
 1291 
/*
 * Early amd64 machine-dependent initialization.  Brings the BSP from
 * the loader-provided state to the point where the machine-independent
 * startup can run: CPU identification, GDT/IDT/TSS setup, memory map
 * parsing, pmap bootstrap, console and debugger initialization.
 * Returns the top of thread0's kernel stack for locore to switch to.
 */
u_int64_t
hammer_time(u_int64_t modulep, u_int64_t physfree)
{
	caddr_t kmdp;
	int gsel_tss, x;
	struct pcpu *pc;
	uint64_t rsp0;
	char *env;
	struct user_segment_descriptor *gdt;
	struct region_descriptor r_gdt;
	size_t kstack0_sz;

	TSRAW(&thread0, TS_ENTER, __func__, NULL);

	/* Physical address the kernel was loaded at (see amd64_loadaddr). */
	kernphys = amd64_loadaddr();

	/* NOTE(review): physfree appears to arrive load-relative; make
	 * it an absolute physical address. */
	physfree += kernphys;

	kmdp = init_ops.parse_preload_data(modulep);

	/* Presence of an EFI memory map means we were booted via UEFI. */
	efi_boot = preload_search_info(kmdp, MODINFO_METADATA |
	    MODINFOMD_EFI_MAP) != NULL;

	if (!efi_boot) {
		/* Tell the bios to warmboot next time */
		atomic_store_short((u_short *)0x472, 0x1234);
	}

	/* Load CPU microcode updates before identifying the CPU. */
	physfree += ucode_load_bsp(physfree - kernphys + KERNSTART);
	physfree = roundup2(physfree, PAGE_SIZE);

	identify_cpu1();
	identify_hypervisor();
	identify_cpu_fixup_bsp();
	identify_cpu2();
	initializecpucache();

	/*
	 * Check for pti, pcid, and invpcid before ifuncs are
	 * resolved, to correctly select the implementation for
	 * pmap_activate_sw_mode().
	 */
	pti = pti_get_default();
	TUNABLE_INT_FETCH("vm.pmap.pti", &pti);
	TUNABLE_INT_FETCH("vm.pmap.pcid_enabled", &pmap_pcid_enabled);
	if ((cpu_feature2 & CPUID2_PCID) != 0 && pmap_pcid_enabled) {
		invpcid_works = (cpu_stdext_feature &
		    CPUID_STDEXT_INVPCID) != 0;
	} else {
		pmap_pcid_enabled = 0;
	}

	/*
	 * Now we can do small core initialization, after the PCID
	 * CPU features and user knobs are evaluated.
	 */
	TUNABLE_INT_FETCH("vm.pmap.pcid_invlpg_workaround",
	    &pmap_pcid_invlpg_workaround_uena);
	cpu_init_small_core();

	link_elf_ireloc(kmdp);

	/*
	 * This may be done better later if it gets more high level
	 * components in it. If so just link td->td_proc here.
	 */
	proc_linkup0(&proc0, &thread0);

	/* Init basic tunables, hz etc */
	init_param1();

	/* Carve thread0's kernel stack out of the free area. */
	thread0.td_kstack = physfree - kernphys + KERNSTART;
	thread0.td_kstack_pages = kstack_pages;
	kstack0_sz = thread0.td_kstack_pages * PAGE_SIZE;
	bzero((void *)thread0.td_kstack, kstack0_sz);
	physfree += kstack0_sz;

	/*
	 * Initialize enough of thread0 for delayed invalidation to
	 * work very early.  Rely on thread0.td_base_pri
	 * zero-initialization, it is reset to PVM at proc0_init().
	 */
	pmap_thread_init_invl_gen(&thread0);

	pc = &temp_bsp_pcpu;
	pcpu_init(pc, 0, sizeof(struct pcpu));
	gdt = &temp_bsp_pcpu.pc_gdt[0];

	/*
	 * make gdt memory segments
	 */
	for (x = 0; x < NGDT; x++) {
		/* Skip the double-slot system descriptors (TSS, LDT). */
		if (x != GPROC0_SEL && x != (GPROC0_SEL + 1) &&
		    x != GUSERLDT_SEL && x != (GUSERLDT_SEL) + 1)
			ssdtosd(&gdt_segs[x], &gdt[x]);
	}
	gdt_segs[GPROC0_SEL].ssd_base = (uintptr_t)&pc->pc_common_tss;
	ssdtosyssd(&gdt_segs[GPROC0_SEL],
	    (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);

	r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
	r_gdt.rd_base = (long)gdt;
	lgdt(&r_gdt);

	wrmsr(MSR_FSBASE, 0);		/* User value */
	wrmsr(MSR_GSBASE, (u_int64_t)pc);
	wrmsr(MSR_KGSBASE, 0);		/* User value while in the kernel */

	dpcpu_init((void *)(physfree - kernphys + KERNSTART), 0);
	physfree += DPCPU_SIZE;
	amd64_bsp_pcpu_init1(pc);
	/* Non-late cninit() and printf() can be moved up to here. */

	/*
	 * Initialize mutexes.
	 *
	 * icu_lock: in order to allow an interrupt to occur in a critical
	 *           section, to set pcpu->ipending (etc...) properly, we
	 *           must be able to get the icu lock, so it can't be
	 *           under witness.
	 */
	mutex_init();
	mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS);
	mtx_init(&dt_lock, "descriptor tables", NULL, MTX_DEF);

	/* exceptions */
	for (x = 0; x < NIDT; x++)
		setidt(x, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd), SDT_SYSIGT,
		    SEL_KPL, 0);
	setidt(IDT_DE, pti ? &IDTVEC(div_pti) : &IDTVEC(div), SDT_SYSIGT,
	    SEL_KPL, 0);
	/* DB#, NMI, DF#, MC# run on dedicated IST stacks (see
	 * amd64_bsp_ist_init()); the final setidt argument selects one. */
	setidt(IDT_DB, &IDTVEC(dbg), SDT_SYSIGT, SEL_KPL, 4);
	setidt(IDT_NMI, &IDTVEC(nmi),  SDT_SYSIGT, SEL_KPL, 2);
	setidt(IDT_BP, pti ? &IDTVEC(bpt_pti) : &IDTVEC(bpt), SDT_SYSIGT,
	    SEL_UPL, 0);
	setidt(IDT_OF, pti ? &IDTVEC(ofl_pti) : &IDTVEC(ofl), SDT_SYSIGT,
	    SEL_UPL, 0);
	setidt(IDT_BR, pti ? &IDTVEC(bnd_pti) : &IDTVEC(bnd), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_UD, pti ? &IDTVEC(ill_pti) : &IDTVEC(ill), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_NM, pti ? &IDTVEC(dna_pti) : &IDTVEC(dna), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1);
	setidt(IDT_FPUGP, pti ? &IDTVEC(fpusegm_pti) : &IDTVEC(fpusegm),
	    SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_TS, pti ? &IDTVEC(tss_pti) : &IDTVEC(tss), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_NP, pti ? &IDTVEC(missing_pti) : &IDTVEC(missing),
	    SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_SS, pti ? &IDTVEC(stk_pti) : &IDTVEC(stk), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_GP, pti ? &IDTVEC(prot_pti) : &IDTVEC(prot), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_PF, pti ? &IDTVEC(page_pti) : &IDTVEC(page), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_MF, pti ? &IDTVEC(fpu_pti) : &IDTVEC(fpu), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_AC, pti ? &IDTVEC(align_pti) : &IDTVEC(align), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_MC, &IDTVEC(mchk), SDT_SYSIGT, SEL_KPL, 3);
	setidt(IDT_XF, pti ? &IDTVEC(xmm_pti) : &IDTVEC(xmm), SDT_SYSIGT,
	    SEL_KPL, 0);
#ifdef KDTRACE_HOOKS
	setidt(IDT_DTRACE_RET, pti ? &IDTVEC(dtrace_ret_pti) :
	    &IDTVEC(dtrace_ret), SDT_SYSIGT, SEL_UPL, 0);
#endif
#ifdef XENHVM
	setidt(IDT_EVTCHN, pti ? &IDTVEC(xen_intr_upcall_pti) :
	    &IDTVEC(xen_intr_upcall), SDT_SYSIGT, SEL_KPL, 0);
#endif
	r_idt.rd_limit = sizeof(idt0) - 1;
	r_idt.rd_base = (long) idt;
	lidt(&r_idt);

	/*
	 * Use vt(4) by default for UEFI boot (during the sc(4)/vt(4)
	 * transition).
	 * Once bootblocks have updated, we can test directly for
	 * efi_systbl != NULL here...
	 */
	if (efi_boot)
		vty_set_preferred(VTY_VT);

	/* CPU-vulnerability mitigation knobs (old and new spellings). */
	TUNABLE_INT_FETCH("hw.ibrs_disable", &hw_ibrs_disable);
	TUNABLE_INT_FETCH("machdep.mitigations.ibrs.disable", &hw_ibrs_disable);

	TUNABLE_INT_FETCH("hw.spec_store_bypass_disable", &hw_ssb_disable);
	TUNABLE_INT_FETCH("machdep.mitigations.ssb.disable", &hw_ssb_disable);

	TUNABLE_INT_FETCH("machdep.syscall_ret_l1d_flush",
	    &syscall_ret_l1d_flush_mode);

	TUNABLE_INT_FETCH("hw.mds_disable", &hw_mds_disable);
	TUNABLE_INT_FETCH("machdep.mitigations.mds.disable", &hw_mds_disable);

	TUNABLE_INT_FETCH("machdep.mitigations.taa.enable", &x86_taa_enable);

	TUNABLE_INT_FETCH("machdep.mitigations.rndgs.enable",
	    &x86_rngds_mitg_enable);

	finishidentcpu();	/* Final stage of CPU initialization */

	/*
	 * Initialize the clock before the console so that console
	 * initialization can use DELAY().
	 */
	clock_init();

	initializecpu();	/* Initialize CPU registers */

	amd64_bsp_ist_init(pc);

	/* Set the IO permission bitmap (empty due to tss seg limit) */
	pc->pc_common_tss.tss_iobase = sizeof(struct amd64tss) +
	    IOPERM_BITMAP_SIZE;

	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
	ltr(gsel_tss);

	amd64_conf_fast_syscall();

	/*
	 * We initialize the PCB pointer early so that exception
	 * handlers will work.  Also set up td_critnest to short-cut
	 * the page fault handler.
	 */
	cpu_max_ext_state_size = sizeof(struct savefpu);
	set_top_of_stack_td(&thread0);
	thread0.td_pcb = get_pcb_td(&thread0);
	thread0.td_critnest = 1;

	/*
	 * The console and kdb should be initialized even earlier than here,
	 * but some console drivers don't work until after getmemsize().
	 * Default to late console initialization to support these drivers.
	 * This loses mainly printf()s in getmemsize() and early debugging.
	 */
	TUNABLE_INT_FETCH("debug.late_console", &late_console);
	if (!late_console) {
		cninit();
		amd64_kdb_init();
	}

	getmemsize(kmdp, physfree);
	init_param2(physmem);

	/* now running on new page tables, configured, and u/iom is accessible */

#ifdef DEV_PCI
	/* This call might adjust phys_avail[]. */
	pci_early_quirks();
#endif

	if (late_console)
		cninit();

	/*
	 * Dump the boot metadata. We have to wait for cninit() since console
	 * output is required. If it's grossly incorrect the kernel will never
	 * make it this far.
	 */
	if (getenv_is_true("debug.dump_modinfo_at_boot"))
		preload_dump();

#ifdef DEV_ISA
#ifdef DEV_ATPIC
	elcr_probe();
	atpic_startup();
#else
	/* Reset and mask the atpics and leave them shut down. */
	atpic_reset();

	/*
	 * Point the ICU spurious interrupt vectors at the APIC spurious
	 * interrupt handler.
	 */
	setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
#endif
#else
#error "have you forgotten the isa device?"
#endif

	if (late_console)
		amd64_kdb_init();

	msgbufinit(msgbufp, msgbufsize);
	fpuinit();

	/* make an initial tss so cpu can get interrupt stack on syscall! */
	rsp0 = thread0.td_md.md_stack_base;
	/* Ensure the stack is aligned to 16 bytes */
	rsp0 &= ~0xFul;
	PCPU_PTR(common_tss)->tss_rsp0 = rsp0;
	amd64_bsp_pcpu_init2(rsp0);

	/* transfer to user mode */

	_ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
	_udatasel = GSEL(GUDATA_SEL, SEL_UPL);
	_ucode32sel = GSEL(GUCODE32_SEL, SEL_UPL);
	_ufssel = GSEL(GUFS32_SEL, SEL_UPL);
	_ugssel = GSEL(GUGS32_SEL, SEL_UPL);

	load_ds(_udatasel);
	load_es(_udatasel);
	load_fs(_ufssel);

	/* setup proc 0's pcb */
	thread0.td_pcb->pcb_flags = 0;

	env = kern_getenv("kernelname");
	if (env != NULL)
		strlcpy(kernelname, env, sizeof(kernelname));

	kcsan_cpu_init(0);

#ifdef FDT
	x86_init_fdt();
#endif
	thread0.td_critnest = 0;

	kasan_init();
	kmsan_init();

	TSEXIT();

	/* Location of kernel stack for locore */
	return (thread0.td_md.md_stack_base);
}
 1623 
/*
 * Machine-dependent per-CPU structure initialization.  Marks the ACPI
 * processor id as invalid (all-ones); presumably overwritten later by
 * the ACPI/MADT enumeration code — verify against callers.
 */
void
cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size)
{

	pcpu->pc_acpi_id = 0xffffffff;
}
 1630 
 1631 static int
 1632 smap_sysctl_handler(SYSCTL_HANDLER_ARGS)
 1633 {
 1634         struct bios_smap *smapbase;
 1635         struct bios_smap_xattr smap;
 1636         caddr_t kmdp;
 1637         uint32_t *smapattr;
 1638         int count, error, i;
 1639 
 1640         /* Retrieve the system memory map from the loader. */
 1641         kmdp = preload_search_by_type("elf kernel");
 1642         if (kmdp == NULL)
 1643                 kmdp = preload_search_by_type("elf64 kernel");
 1644         smapbase = (struct bios_smap *)preload_search_info(kmdp,
 1645             MODINFO_METADATA | MODINFOMD_SMAP);
 1646         if (smapbase == NULL)
 1647                 return (0);
 1648         smapattr = (uint32_t *)preload_search_info(kmdp,
 1649             MODINFO_METADATA | MODINFOMD_SMAP_XATTR);
 1650         count = *((uint32_t *)smapbase - 1) / sizeof(*smapbase);
 1651         error = 0;
 1652         for (i = 0; i < count; i++) {
 1653                 smap.base = smapbase[i].base;
 1654                 smap.length = smapbase[i].length;
 1655                 smap.type = smapbase[i].type;
 1656                 if (smapattr != NULL)
 1657                         smap.xattr = smapattr[i];
 1658                 else
 1659                         smap.xattr = 0;
 1660                 error = SYSCTL_OUT(req, &smap, sizeof(smap));
 1661         }
 1662         return (error);
 1663 }
 1664 SYSCTL_PROC(_machdep, OID_AUTO, smap,
 1665     CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
 1666     smap_sysctl_handler, "S,bios_smap_xattr",
 1667     "Raw BIOS SMAP data");
 1668 
 1669 static int
 1670 efi_map_sysctl_handler(SYSCTL_HANDLER_ARGS)
 1671 {
 1672         struct efi_map_header *efihdr;
 1673         caddr_t kmdp;
 1674         uint32_t efisize;
 1675 
 1676         kmdp = preload_search_by_type("elf kernel");
 1677         if (kmdp == NULL)
 1678                 kmdp = preload_search_by_type("elf64 kernel");
 1679         efihdr = (struct efi_map_header *)preload_search_info(kmdp,
 1680             MODINFO_METADATA | MODINFOMD_EFI_MAP);
 1681         if (efihdr == NULL)
 1682                 return (0);
 1683         efisize = *((uint32_t *)efihdr - 1);
 1684         return (SYSCTL_OUT(req, efihdr, efisize));
 1685 }
 1686 SYSCTL_PROC(_machdep, OID_AUTO, efi_map,
 1687     CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
 1688     efi_map_sysctl_handler, "S,efi_map_header",
 1689     "Raw EFI Memory Map");
 1690 
/*
 * Enter a spinlock section: on the first (outermost) entry, disable
 * interrupts, record the previous interrupt state for spinlock_exit(),
 * and pin the thread with critical_enter().  Nested entries only bump
 * the per-thread count.
 */
void
spinlock_enter(void)
{
	struct thread *td;
	register_t flags;

	td = curthread;
	if (td->td_md.md_spinlock_count == 0) {
		/*
		 * Interrupts must be disabled before the count becomes
		 * non-zero so an interrupt cannot observe inconsistent
		 * spinlock state.
		 */
		flags = intr_disable();
		td->td_md.md_spinlock_count = 1;
		td->td_md.md_saved_flags = flags;
		critical_enter();
	} else
		td->td_md.md_spinlock_count++;
}
 1706 
/*
 * Leave a spinlock section.  Only the outermost exit (count reaching
 * zero) leaves the critical section and restores the interrupt state
 * saved by the matching spinlock_enter().
 */
void
spinlock_exit(void)
{
	struct thread *td;
	register_t flags;

	td = curthread;
	/* Read the saved flags before dropping the count. */
	flags = td->td_md.md_saved_flags;
	td->td_md.md_spinlock_count--;
	if (td->td_md.md_spinlock_count == 0) {
		critical_exit();
		intr_restore(flags);
	}
}
 1721 
 1722 /*
 1723  * Construct a PCB from a trapframe. This is called from kdb_trap() where
 1724  * we want to start a backtrace from the function that caused us to enter
 1725  * the debugger. We have the context in the trapframe, but base the trace
 1726  * on the PCB. The PCB doesn't have to be perfect, as long as it contains
 1727  * enough for a backtrace.
 1728  */
 1729 void
 1730 makectx(struct trapframe *tf, struct pcb *pcb)
 1731 {
 1732 
 1733         pcb->pcb_r12 = tf->tf_r12;
 1734         pcb->pcb_r13 = tf->tf_r13;
 1735         pcb->pcb_r14 = tf->tf_r14;
 1736         pcb->pcb_r15 = tf->tf_r15;
 1737         pcb->pcb_rbp = tf->tf_rbp;
 1738         pcb->pcb_rbx = tf->tf_rbx;
 1739         pcb->pcb_rip = tf->tf_rip;
 1740         pcb->pcb_rsp = tf->tf_rsp;
 1741 }
 1742 
 1743 /*
 1744  * The pcb_flags is only modified by current thread, or by other threads
 1745  * when current thread is stopped.  However, current thread may change it
 1746  * from the interrupt context in cpu_switch(), or in the trap handler.
 1747  * When we read-modify-write pcb_flags from C sources, compiler may generate
 1748  * code that is not atomic regarding the interrupt handler.  If a trap or
 1749  * interrupt happens and any flag is modified from the handler, it can be
 1750  * clobbered with the cached value later.  Therefore, we implement setting
 1751  * and clearing flags with single-instruction functions, which do not race
 1752  * with possible modification of the flags from the trap or interrupt context,
 1753  * because traps and interrupts are executed only on instruction boundary.
 1754  */
/*
 * Set bits in pcb_flags with a single "orl" instruction so the
 * read-modify-write cannot be split by a trap or interrupt (see the
 * comment above for why this atomicity matters).
 */
void
set_pcb_flags_raw(struct pcb *pcb, const u_int flags)
{

	__asm __volatile("orl %1,%0"
	    : "=m" (pcb->pcb_flags) : "ir" (flags), "m" (pcb->pcb_flags)
	    : "cc", "memory");

}
 1764 
 1765 /*
 1766  * The support for RDFSBASE, WRFSBASE and similar instructions for %gs
 1767  * base requires that kernel saves MSR_FSBASE and MSR_{K,}GSBASE into
 1768  * pcb if user space modified the bases.  We must save on the context
 1769  * switch or if the return to usermode happens through the doreti.
 1770  *
 1771  * Tracking of both events is performed by the pcb flag PCB_FULL_IRET,
 1772  * which have a consequence that the base MSRs must be saved each time
 1773  * the PCB_FULL_IRET flag is set.  We disable interrupts to sync with
 1774  * context switches.
 1775  */
static void
set_pcb_flags_fsgsbase(struct pcb *pcb, const u_int flags)
{
	register_t r;

	/*
	 * Only the transition of PCB_FULL_IRET from clear to set on the
	 * current thread's own PCB requires the base-MSR save; all other
	 * cases fall through to the raw single-instruction update.
	 */
	if (curpcb == pcb &&
	    (flags & PCB_FULL_IRET) != 0 &&
	    (pcb->pcb_flags & PCB_FULL_IRET) == 0) {
		/*
		 * Disable interrupts and re-check under that protection:
		 * a context switch between the test above and here could
		 * have set the flag already.
		 */
		r = intr_disable();
		if ((pcb->pcb_flags & PCB_FULL_IRET) == 0) {
			/*
			 * Save the user bases only if the selectors still
			 * hold the user values, i.e. the bases were not
			 * already reloaded by the kernel.
			 */
			if (rfs() == _ufssel)
				pcb->pcb_fsbase = rdfsbase();
			if (rgs() == _ugssel)
				pcb->pcb_gsbase = rdmsr(MSR_KGSBASE);
		}
		set_pcb_flags_raw(pcb, flags);
		intr_restore(r);
	} else {
		set_pcb_flags_raw(pcb, flags);
	}
}
 1797 
 1798 DEFINE_IFUNC(, void, set_pcb_flags, (struct pcb *, const u_int))
 1799 {
 1800 
 1801         return ((cpu_stdext_feature & CPUID_STDEXT_FSGSBASE) != 0 ?
 1802             set_pcb_flags_fsgsbase : set_pcb_flags_raw);
 1803 }
 1804 
/*
 * Clear bits in pcb_flags with a single "andl" instruction, for the
 * same interrupt-atomicity reasons as set_pcb_flags_raw() above.
 */
void
clear_pcb_flags(struct pcb *pcb, const u_int flags)
{

	__asm __volatile("andl %1,%0"
	    : "=m" (pcb->pcb_flags) : "ir" (~flags), "m" (pcb->pcb_flags)
	    : "cc", "memory");
}
 1813 
 1814 #ifdef KDB
 1815 
 1816 /*
 1817  * Provide inb() and outb() as functions.  They are normally only available as
 1818  * inline functions, thus cannot be called from the debugger.
 1819  */
 1820 
 1821 /* silence compiler warnings */
 1822 u_char inb_(u_short);
 1823 void outb_(u_short, u_char);
 1824 
 1825 u_char
 1826 inb_(u_short port)
 1827 {
 1828         return inb(port);
 1829 }
 1830 
 1831 void
 1832 outb_(u_short port, u_char data)
 1833 {
 1834         outb(port, data);
 1835 }
 1836 
 1837 #endif /* KDB */
 1838 
/*
 * Drop any macro definitions of the string routines so the ifunc (or
 * KCSAN wrapper) definitions below bind to the real names.
 */
#undef memset
#undef memmove
#undef memcpy

/* Assembly implementations; the *_erms variants use "rep movsb/stosb". */
void	*memset_std(void *buf, int c, size_t len);
void	*memset_erms(void *buf, int c, size_t len);
void	*memmove_std(void * _Nonnull dst, const void * _Nonnull src,
	    size_t len);
void	*memmove_erms(void * _Nonnull dst, const void * _Nonnull src,
	    size_t len);
void	*memcpy_std(void * _Nonnull dst, const void * _Nonnull src,
	    size_t len);
void	*memcpy_erms(void * _Nonnull dst, const void * _Nonnull src,
	    size_t len);
 1853 
 1854 #ifdef KCSAN
 1855 /*
 1856  * These fail to build as ifuncs when used with KCSAN.
 1857  */
 1858 void *
 1859 memset(void *buf, int c, size_t len)
 1860 {
 1861 
 1862         return (memset_std(buf, c, len));
 1863 }
 1864 
 1865 void *
 1866 memmove(void * _Nonnull dst, const void * _Nonnull src, size_t len)
 1867 {
 1868 
 1869         return (memmove_std(dst, src, len));
 1870 }
 1871 
 1872 void *
 1873 memcpy(void * _Nonnull dst, const void * _Nonnull src, size_t len)
 1874 {
 1875 
 1876         return (memcpy_std(dst, src, len));
 1877 }
 1878 #else
 1879 DEFINE_IFUNC(, void *, memset, (void *, int, size_t))
 1880 {
 1881 
 1882         return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
 1883             memset_erms : memset_std);
 1884 }
 1885 
 1886 DEFINE_IFUNC(, void *, memmove, (void * _Nonnull, const void * _Nonnull,
 1887     size_t))
 1888 {
 1889 
 1890         return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
 1891             memmove_erms : memmove_std);
 1892 }
 1893 
 1894 DEFINE_IFUNC(, void *, memcpy, (void * _Nonnull, const void * _Nonnull,size_t))
 1895 {
 1896 
 1897         return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
 1898             memcpy_erms : memcpy_std);
 1899 }
 1900 #endif
 1901 
 1902 void    pagezero_std(void *addr);
 1903 void    pagezero_erms(void *addr);
 1904 DEFINE_IFUNC(, void , pagezero, (void *))
 1905 {
 1906 
 1907         return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
 1908             pagezero_erms : pagezero_std);
 1909 }

Cache object: d2a7034d5824baaf8c285f19ca662e32


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.