FreeBSD/Linux Kernel Cross Reference
sys/x86/x86/cpu_machdep.c


    1 /*-
    2  * Copyright (c) 2003 Peter Wemm.
    3  * Copyright (c) 1992 Terrence R. Lambert.
    4  * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
    5  * All rights reserved.
    6  *
    7  * This code is derived from software contributed to Berkeley by
    8  * William Jolitz.
    9  *
   10  * Redistribution and use in source and binary forms, with or without
   11  * modification, are permitted provided that the following conditions
   12  * are met:
   13  * 1. Redistributions of source code must retain the above copyright
   14  *    notice, this list of conditions and the following disclaimer.
   15  * 2. Redistributions in binary form must reproduce the above copyright
   16  *    notice, this list of conditions and the following disclaimer in the
   17  *    documentation and/or other materials provided with the distribution.
   18  * 3. All advertising materials mentioning features or use of this software
   19  *    must display the following acknowledgement:
   20  *      This product includes software developed by the University of
   21  *      California, Berkeley and its contributors.
   22  * 4. Neither the name of the University nor the names of its contributors
   23  *    may be used to endorse or promote products derived from this software
   24  *    without specific prior written permission.
   25  *
   26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   36  * SUCH DAMAGE.
   37  *
   38  *      from: @(#)machdep.c     7.4 (Berkeley) 6/3/91
   39  */
   40 
   41 #include <sys/cdefs.h>
   42 __FBSDID("$FreeBSD$");
   43 
   44 #include "opt_atpic.h"
   45 #include "opt_compat.h"
   46 #include "opt_cpu.h"
   47 #include "opt_ddb.h"
   48 #include "opt_inet.h"
   49 #include "opt_isa.h"
   50 #include "opt_kdb.h"
   51 #include "opt_kstack_pages.h"
   52 #include "opt_maxmem.h"
   53 #include "opt_mp_watchdog.h"
   54 #include "opt_perfmon.h"
   55 #include "opt_platform.h"
   56 #ifdef __i386__
   57 #include "opt_apic.h"
   58 #include "opt_xbox.h"
   59 #endif
   60 
   61 #include <sys/param.h>
   62 #include <sys/proc.h>
   63 #include <sys/systm.h>
   64 #include <sys/bus.h>
   65 #include <sys/cpu.h>
   66 #include <sys/kdb.h>
   67 #include <sys/kernel.h>
   68 #include <sys/ktr.h>
   69 #include <sys/lock.h>
   70 #include <sys/malloc.h>
   71 #include <sys/mutex.h>
   72 #include <sys/pcpu.h>
   73 #include <sys/rwlock.h>
   74 #include <sys/sched.h>
   75 #include <sys/smp.h>
   76 #include <sys/sysctl.h>
   77 
   78 #include <machine/clock.h>
   79 #include <machine/cpu.h>
   80 #include <machine/cputypes.h>
   81 #include <machine/specialreg.h>
   82 #include <machine/md_var.h>
   83 #include <machine/mp_watchdog.h>
   84 #ifdef PERFMON
   85 #include <machine/perfmon.h>
   86 #endif
   87 #include <machine/tss.h>
   88 #ifdef SMP
   89 #include <machine/smp.h>
   90 #endif
   91 #ifdef CPU_ELAN
   92 #include <machine/elan_mmcr.h>
   93 #endif
   94 #include <x86/acpica_machdep.h>
   95 
   96 #include <vm/vm.h>
   97 #include <vm/vm_extern.h>
   98 #include <vm/vm_kern.h>
   99 #include <vm/vm_page.h>
  100 #include <vm/vm_map.h>
  101 #include <vm/vm_object.h>
  102 #include <vm/vm_pager.h>
  103 #include <vm/vm_param.h>
  104 
  105 #ifndef PC98
  106 #include <isa/isareg.h>
  107 #endif
  108 
  109 #define STATE_RUNNING   0x0
  110 #define STATE_MWAIT     0x1
  111 #define STATE_SLEEPING  0x2
  112 
  113 #ifdef SMP
  114 static u_int    cpu_reset_proxyid;
  115 static volatile u_int   cpu_reset_proxy_active;
  116 #endif
  117 
  118 struct msr_op_arg {
  119         u_int msr;
  120         int op;
  121         uint64_t arg1;
  122 };
  123 
  124 static void
  125 x86_msr_op_one(void *argp)
  126 {
  127         struct msr_op_arg *a;
  128         uint64_t v;
  129 
  130         a = argp;
  131         switch (a->op) {
  132         case MSR_OP_ANDNOT:
  133                 v = rdmsr(a->msr);
  134                 v &= ~a->arg1;
  135                 wrmsr(a->msr, v);
  136                 break;
  137         case MSR_OP_OR:
  138                 v = rdmsr(a->msr);
  139                 v |= a->arg1;
  140                 wrmsr(a->msr, v);
  141                 break;
  142         case MSR_OP_WRITE:
  143                 wrmsr(a->msr, a->arg1);
  144                 break;
  145         }
  146 }
  147 
  148 #define MSR_OP_EXMODE_MASK      0xf0000000
  149 #define MSR_OP_OP_MASK          0x000000ff
  150 
  151 void
  152 x86_msr_op(u_int msr, u_int op, uint64_t arg1)
  153 {
  154         struct thread *td;
  155         struct msr_op_arg a;
  156         u_int exmode;
  157         int bound_cpu, i, is_bound;
  158 
  159         a.op = op & MSR_OP_OP_MASK;
  160         MPASS(a.op == MSR_OP_ANDNOT || a.op == MSR_OP_OR ||
  161             a.op == MSR_OP_WRITE);
  162         exmode = op & MSR_OP_EXMODE_MASK;
  163         MPASS(exmode == MSR_OP_LOCAL || exmode == MSR_OP_SCHED ||
  164             exmode == MSR_OP_RENDEZVOUS);
  165         a.msr = msr;
  166         a.arg1 = arg1;
  167         switch (exmode) {
  168         case MSR_OP_LOCAL:
  169                 x86_msr_op_one(&a);
  170                 break;
  171         case MSR_OP_SCHED:
  172                 td = curthread;
  173                 thread_lock(td);
  174                 is_bound = sched_is_bound(td);
  175                 bound_cpu = td->td_oncpu;
  176                 CPU_FOREACH(i) {
  177                         sched_bind(td, i);
  178                         x86_msr_op_one(&a);
  179                 }
  180                 if (is_bound)
  181                         sched_bind(td, bound_cpu);
  182                 else
  183                         sched_unbind(td);
  184                 thread_unlock(td);
  185                 break;
  186         case MSR_OP_RENDEZVOUS:
  187                 smp_rendezvous(NULL, x86_msr_op_one, NULL, &a);
  188                 break;
  189         }
  190 }
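
/*
 * Callers combine one operation (MSR_OP_ANDNOT, MSR_OP_OR or
 * MSR_OP_WRITE) with one execution mode (MSR_OP_LOCAL, MSR_OP_SCHED or
 * MSR_OP_RENDEZVOUS) in the 'op' argument.  A minimal sketch,
 * mirroring the calls made by the mitigation code later in this file:
 * set the IBRS bit in IA32_SPEC_CTRL on all CPUs via rendezvous, then
 * clear it on the local CPU only:
 *
 *	x86_msr_op(MSR_IA32_SPEC_CTRL, MSR_OP_RENDEZVOUS | MSR_OP_OR,
 *	    IA32_SPEC_CTRL_IBRS);
 *	x86_msr_op(MSR_IA32_SPEC_CTRL, MSR_OP_LOCAL | MSR_OP_ANDNOT,
 *	    IA32_SPEC_CTRL_IBRS);
 */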
  191 
   192 /*
   193  * Machine-dependent boot() routine.
   194  *
   195  * Nothing to do here yet; some functionality may eventually be
   196  * grafted back here from boot().
   197  */
  198 void
  199 cpu_boot(int howto)
  200 {
  201 }
  202 
  203 /*
  204  * Flush the D-cache for non-DMA I/O so that the I-cache can
  205  * be made coherent later.
  206  */
  207 void
  208 cpu_flush_dcache(void *ptr, size_t len)
  209 {
  210         /* Not applicable */
  211 }
  212 
  213 void
  214 acpi_cpu_c1(void)
  215 {
  216 
  217         __asm __volatile("sti; hlt");
  218 }
  219 
  220 /*
  221  * Use mwait to pause execution while waiting for an interrupt or
  222  * another thread to signal that there is more work.
  223  *
  224  * NOTE: Interrupts will cause a wakeup; however, this function does
   225  * not enable interrupt handling.  The caller is responsible for
   226  * enabling interrupts.
  227  */
  228 void
  229 acpi_cpu_idle_mwait(uint32_t mwait_hint)
  230 {
  231         int *state;
  232         uint64_t v;
  233 
  234         /*
   235          * A comment in a Linux patch claims that 'CPUs run faster with
  236          * speculation protection disabled. All CPU threads in a core
  237          * must disable speculation protection for it to be
  238          * disabled. Disable it while we are idle so the other
  239          * hyperthread can run fast.'
  240          *
  241          * XXXKIB.  Software coordination mode should be supported,
  242          * but all Intel CPUs provide hardware coordination.
  243          */
  244 
  245         state = (int *)PCPU_PTR(monitorbuf);
  246         KASSERT(atomic_load_int(state) == STATE_SLEEPING,
  247             ("cpu_mwait_cx: wrong monitorbuf state"));
  248         atomic_store_int(state, STATE_MWAIT);
  249         if (PCPU_GET(ibpb_set) || hw_ssb_active) {
  250                 v = rdmsr(MSR_IA32_SPEC_CTRL);
  251                 wrmsr(MSR_IA32_SPEC_CTRL, v & ~(IA32_SPEC_CTRL_IBRS |
  252                     IA32_SPEC_CTRL_STIBP | IA32_SPEC_CTRL_SSBD));
  253         } else {
  254                 v = 0;
  255         }
  256         cpu_monitor(state, 0, 0);
  257         if (atomic_load_int(state) == STATE_MWAIT)
  258                 cpu_mwait(MWAIT_INTRBREAK, mwait_hint);
  259 
  260         /*
   261          * SSB cannot be disabled while we sleep; or rather, if it
   262          * is disabled, the sysctl thread will bind to our CPU to
   263          * tweak the MSR.
  264          */
  265         if (v != 0)
  266                 wrmsr(MSR_IA32_SPEC_CTRL, v);
  267 
  268         /*
  269          * We should exit on any event that interrupts mwait, because
  270          * that event might be a wanted interrupt.
  271          */
  272         atomic_store_int(state, STATE_RUNNING);
  273 }
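
/*
 * A sketch of the caller contract, assuming the ACPI idle driver as
 * the consumer (illustrative only): the per-CPU monitorbuf state must
 * be STATE_SLEEPING on entry, as the KASSERT above checks, and
 * interrupts are re-enabled by the caller afterwards:
 *
 *	int *state = (int *)PCPU_PTR(monitorbuf);
 *
 *	atomic_store_int(state, STATE_SLEEPING);
 *	acpi_cpu_idle_mwait(mwait_hint);
 *	enable_intr();
 */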
  274 
  275 /* Get current clock frequency for the given cpu id. */
  276 int
  277 cpu_est_clockrate(int cpu_id, uint64_t *rate)
  278 {
  279         uint64_t tsc1, tsc2;
  280         uint64_t acnt, mcnt, perf;
  281         register_t reg;
  282 
  283         if (pcpu_find(cpu_id) == NULL || rate == NULL)
  284                 return (EINVAL);
  285 #ifdef __i386__
  286         if ((cpu_feature & CPUID_TSC) == 0)
  287                 return (EOPNOTSUPP);
  288 #endif
  289 
  290         /*
  291          * If TSC is P-state invariant and APERF/MPERF MSRs do not exist,
  292          * DELAY(9) based logic fails.
  293          */
  294         if (tsc_is_invariant && !tsc_perf_stat)
  295                 return (EOPNOTSUPP);
  296 
  297 #ifdef SMP
  298         if (smp_cpus > 1) {
  299                 /* Schedule ourselves on the indicated cpu. */
  300                 thread_lock(curthread);
  301                 sched_bind(curthread, cpu_id);
  302                 thread_unlock(curthread);
  303         }
  304 #endif
  305 
  306         /* Calibrate by measuring a short delay. */
  307         reg = intr_disable();
  308         if (tsc_is_invariant) {
  309                 wrmsr(MSR_MPERF, 0);
  310                 wrmsr(MSR_APERF, 0);
  311                 tsc1 = rdtsc();
  312                 DELAY(1000);
  313                 mcnt = rdmsr(MSR_MPERF);
  314                 acnt = rdmsr(MSR_APERF);
  315                 tsc2 = rdtsc();
  316                 intr_restore(reg);
  317                 perf = 1000 * acnt / mcnt;
  318                 *rate = (tsc2 - tsc1) * perf;
  319         } else {
  320                 tsc1 = rdtsc();
  321                 DELAY(1000);
  322                 tsc2 = rdtsc();
  323                 intr_restore(reg);
  324                 *rate = (tsc2 - tsc1) * 1000;
  325         }
  326 
  327 #ifdef SMP
  328         if (smp_cpus > 1) {
  329                 thread_lock(curthread);
  330                 sched_unbind(curthread);
  331                 thread_unlock(curthread);
  332         }
  333 #endif
  334 
  335         return (0);
  336 }
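
/*
 * A worked example of the estimate above (numbers are illustrative):
 * DELAY(1000) busy-waits for 1000 us, so the TSC delta is counts per
 * millisecond and is scaled by 1000 to yield Hz.  A delta of
 * 3,000,000 counts therefore estimates a 3 GHz clock.  With
 * APERF/MPERF, perf is the effective-to-nominal ratio scaled by 1000;
 * acnt = 1,000,000 and mcnt = 2,000,000 give perf = 500, so the same
 * delta estimates an effective rate of 3,000,000 * 500 = 1.5 GHz on a
 * CPU throttled to half speed.
 */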
  337 
  338 /*
   339  * Shut down the CPU as much as possible
  340  */
  341 void
  342 cpu_halt(void)
  343 {
  344         for (;;)
  345                 halt();
  346 }
  347 
  348 static void
  349 cpu_reset_real(void)
  350 {
  351         struct region_descriptor null_idt;
  352 #ifndef PC98
  353         int b;
  354 #endif
  355 
  356         disable_intr();
  357 #ifdef CPU_ELAN
  358         if (elan_mmcr != NULL)
  359                 elan_mmcr->RESCFG = 1;
  360 #endif
  361 #ifdef __i386__
  362         if (cpu == CPU_GEODE1100) {
  363                 /* Attempt Geode's own reset */
  364                 outl(0xcf8, 0x80009044ul);
  365                 outl(0xcfc, 0xf);
  366         }
  367 #endif
  368 #ifdef PC98
  369         /*
  370          * Attempt to do a CPU reset via CPU reset port.
  371          */
  372         if ((inb(0x35) & 0xa0) != 0xa0) {
  373                 outb(0x37, 0x0f);               /* SHUT0 = 0. */
  374                 outb(0x37, 0x0b);               /* SHUT1 = 0. */
  375         }
  376         outb(0xf0, 0x00);                       /* Reset. */
  377 #else
  378 #if !defined(BROKEN_KEYBOARD_RESET)
  379         /*
   380  * Attempt to do a CPU reset via the keyboard controller.
   381  * Do not turn off GateA20, as any machine that fails
   382  * to do the reset here would then end up in no man's land.
  383          */
  384         outb(IO_KBD + 4, 0xFE);
  385         DELAY(500000);  /* wait 0.5 sec to see if that did it */
  386 #endif
  387 
  388         /*
  389          * Attempt to force a reset via the Reset Control register at
  390          * I/O port 0xcf9.  Bit 2 forces a system reset when it
  391          * transitions from 0 to 1.  Bit 1 selects the type of reset
  392          * to attempt: 0 selects a "soft" reset, and 1 selects a
  393          * "hard" reset.  We try a "hard" reset.  The first write sets
  394          * bit 1 to select a "hard" reset and clears bit 2.  The
  395          * second write forces a 0 -> 1 transition in bit 2 to trigger
  396          * a reset.
  397          */
  398         outb(0xcf9, 0x2);
  399         outb(0xcf9, 0x6);
  400         DELAY(500000);  /* wait 0.5 sec to see if that did it */
  401 
  402         /*
  403          * Attempt to force a reset via the Fast A20 and Init register
  404          * at I/O port 0x92.  Bit 1 serves as an alternate A20 gate.
  405          * Bit 0 asserts INIT# when set to 1.  We are careful to only
  406          * preserve bit 1 while setting bit 0.  We also must clear bit
  407          * 0 before setting it if it isn't already clear.
  408          */
  409         b = inb(0x92);
  410         if (b != 0xff) {
  411                 if ((b & 0x1) != 0)
  412                         outb(0x92, b & 0xfe);
  413                 outb(0x92, b | 0x1);
  414                 DELAY(500000);  /* wait 0.5 sec to see if that did it */
  415         }
  416 #endif /* PC98 */
  417 
  418         printf("No known reset method worked, attempting CPU shutdown\n");
  419         DELAY(1000000); /* wait 1 sec for printf to complete */
  420 
  421         /* Wipe the IDT. */
  422         null_idt.rd_limit = 0;
  423         null_idt.rd_base = 0;
  424         lidt(&null_idt);
  425 
  426         /* "good night, sweet prince .... <THUNK!>" */
  427         breakpoint();
  428 
  429         /* NOTREACHED */
  430         while(1);
  431 }
  432 
  433 #ifdef SMP
  434 static void
  435 cpu_reset_proxy(void)
  436 {
  437 
  438         cpu_reset_proxy_active = 1;
  439         while (cpu_reset_proxy_active == 1)
  440                 ia32_pause(); /* Wait for other cpu to see that we've started */
  441 
  442         printf("cpu_reset_proxy: Stopped CPU %d\n", cpu_reset_proxyid);
  443         DELAY(1000000);
  444         cpu_reset_real();
  445 }
  446 #endif
  447 
  448 void
  449 cpu_reset(void)
  450 {
  451 #ifdef SMP
  452         cpuset_t map;
  453         u_int cnt;
  454 
  455         if (smp_started) {
  456                 map = all_cpus;
  457                 CPU_CLR(PCPU_GET(cpuid), &map);
  458                 CPU_NAND(&map, &stopped_cpus);
  459                 if (!CPU_EMPTY(&map)) {
  460                         printf("cpu_reset: Stopping other CPUs\n");
  461                         stop_cpus(map);
  462                 }
  463 
  464                 if (PCPU_GET(cpuid) != 0) {
  465                         cpu_reset_proxyid = PCPU_GET(cpuid);
  466                         cpustop_restartfunc = cpu_reset_proxy;
  467                         cpu_reset_proxy_active = 0;
  468                         printf("cpu_reset: Restarting BSP\n");
  469 
  470                         /* Restart CPU #0. */
  471                         CPU_SETOF(0, &started_cpus);
  472                         wmb();
  473 
  474                         cnt = 0;
  475                         while (cpu_reset_proxy_active == 0 && cnt < 10000000) {
  476                                 ia32_pause();
  477                                 cnt++;  /* Wait for BSP to announce restart */
  478                         }
  479                         if (cpu_reset_proxy_active == 0) {
  480                                 printf("cpu_reset: Failed to restart BSP\n");
  481                         } else {
  482                                 cpu_reset_proxy_active = 2;
  483                                 while (1)
  484                                         ia32_pause();
  485                                 /* NOTREACHED */
  486                         }
  487                 }
  488 
  489                 DELAY(1000000);
  490         }
  491 #endif
  492         cpu_reset_real();
  493         /* NOTREACHED */
  494 }
  495 
  496 bool
  497 cpu_mwait_usable(void)
  498 {
  499 
  500         return ((cpu_feature2 & CPUID2_MON) != 0 && ((cpu_mon_mwait_flags &
  501             (CPUID5_MON_MWAIT_EXT | CPUID5_MWAIT_INTRBREAK)) ==
  502             (CPUID5_MON_MWAIT_EXT | CPUID5_MWAIT_INTRBREAK)));
  503 }
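
/*
 * cpu_mwait_usable() requires the MONITOR/MWAIT feature bit from
 * CPUID leaf 1 plus the enumeration-extension and interrupt-break
 * bits from CPUID leaf 5.  A minimal sketch of how the leaf-5 bits
 * are read, assuming the usual do_cpuid() helper (the kernel's CPU
 * identification code fills cpu_mon_mwait_flags; illustrative only):
 *
 *	u_int regs[4];
 *
 *	do_cpuid(5, regs);
 *	cpu_mon_mwait_flags = regs[2];	(%ecx of CPUID leaf 5, holding
 *	CPUID5_MON_MWAIT_EXT and CPUID5_MWAIT_INTRBREAK)
 */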
  504 
  505 void (*cpu_idle_hook)(sbintime_t) = NULL;       /* ACPI idle hook. */
  506 static int      cpu_ident_amdc1e = 0;   /* AMD C1E supported. */
  507 static int      idle_mwait = 1;         /* Use MONITOR/MWAIT for short idle. */
  508 SYSCTL_INT(_machdep, OID_AUTO, idle_mwait, CTLFLAG_RWTUN, &idle_mwait,
  509     0, "Use MONITOR/MWAIT for short idle");
  510 
  511 #ifndef PC98
  512 static void
  513 cpu_idle_acpi(sbintime_t sbt)
  514 {
  515         int *state;
  516 
  517         state = (int *)PCPU_PTR(monitorbuf);
  518         atomic_store_int(state, STATE_SLEEPING);
  519 
  520         /* See comments in cpu_idle_hlt(). */
  521         disable_intr();
  522         if (sched_runnable())
  523                 enable_intr();
  524         else if (cpu_idle_hook)
  525                 cpu_idle_hook(sbt);
  526         else
  527                 acpi_cpu_c1();
  528         atomic_store_int(state, STATE_RUNNING);
  529 }
  530 #endif /* !PC98 */
  531 
  532 static void
  533 cpu_idle_hlt(sbintime_t sbt)
  534 {
  535         int *state;
  536 
  537         state = (int *)PCPU_PTR(monitorbuf);
  538         atomic_store_int(state, STATE_SLEEPING);
  539 
  540         /*
  541          * Since we may be in a critical section from cpu_idle(), if
  542          * an interrupt fires during that critical section we may have
  543          * a pending preemption.  If the CPU halts, then that thread
  544          * may not execute until a later interrupt awakens the CPU.
  545          * To handle this race, check for a runnable thread after
  546          * disabling interrupts and immediately return if one is
   547  * found.  Also, we must absolutely guarantee that hlt is
   548  * the next instruction after sti.  This ensures that any
   549  * interrupt that fires after the call to disable_intr() will
   550  * immediately awaken the CPU from hlt.  Finally, note that
   551  * on x86 this works because interrupts are recognized only
   552  * after the instruction following sti completes, while IF is
   553  * set to 1 immediately, allowing the hlt instruction to
   554  * acknowledge the interrupt.
  555          */
  556         disable_intr();
  557         if (sched_runnable())
  558                 enable_intr();
  559         else
  560                 acpi_cpu_c1();
  561         atomic_store_int(state, STATE_RUNNING);
  562 }
  563 
  564 static void
  565 cpu_idle_mwait(sbintime_t sbt)
  566 {
  567         int *state;
  568 
  569         state = (int *)PCPU_PTR(monitorbuf);
  570         atomic_store_int(state, STATE_MWAIT);
  571 
  572         /* See comments in cpu_idle_hlt(). */
  573         disable_intr();
  574         if (sched_runnable()) {
  575                 atomic_store_int(state, STATE_RUNNING);
  576                 enable_intr();
  577                 return;
  578         }
  579 
  580         cpu_monitor(state, 0, 0);
  581         if (atomic_load_int(state) == STATE_MWAIT)
  582                 __asm __volatile("sti; mwait" : : "a" (MWAIT_C1), "c" (0));
  583         else
  584                 enable_intr();
  585         atomic_store_int(state, STATE_RUNNING);
  586 }
  587 
  588 static void
  589 cpu_idle_spin(sbintime_t sbt)
  590 {
  591         int *state;
  592         int i;
  593 
  594         state = (int *)PCPU_PTR(monitorbuf);
  595         atomic_store_int(state, STATE_RUNNING);
  596 
  597         /*
   598  * The sched_runnable() call is racy, but since this runs in
   599  * a loop, missing it once has little impact, if any (and it
   600  * is much better than not checking at all).
  601          */
  602         for (i = 0; i < 1000; i++) {
  603                 if (sched_runnable())
  604                         return;
  605                 cpu_spinwait();
  606         }
  607 }
  608 
  609 /*
  610  * C1E renders the local APIC timer dead, so we disable it by
  611  * reading the Interrupt Pending Message register and clearing
  612  * both C1eOnCmpHalt (bit 28) and SmiOnCmpHalt (bit 27).
  613  * 
  614  * Reference:
  615  *   "BIOS and Kernel Developer's Guide for AMD NPT Family 0Fh Processors"
  616  *   #32559 revision 3.00+
  617  */
  618 #define MSR_AMDK8_IPM           0xc0010055
  619 #define AMDK8_SMIONCMPHALT      (1ULL << 27)
  620 #define AMDK8_C1EONCMPHALT      (1ULL << 28)
  621 #define AMDK8_CMPHALT           (AMDK8_SMIONCMPHALT | AMDK8_C1EONCMPHALT)
  622 
  623 void
  624 cpu_probe_amdc1e(void)
  625 {
  626 
  627         /*
   628          * Detect the presence of the C1E capability, mostly on
   629          * recent dual-core (and later) K8-family CPUs.
  630          */
  631         if (cpu_vendor_id == CPU_VENDOR_AMD &&
  632             (cpu_id & 0x00000f00) == 0x00000f00 &&
  633             (cpu_id & 0x0fff0000) >=  0x00040000) {
  634                 cpu_ident_amdc1e = 1;
  635         }
  636 }
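
/*
 * A worked reading of the test above, using the standard CPUID
 * signature layout: (cpu_id & 0x00000f00) == 0x00000f00 selects base
 * family 0xf (K8), and (cpu_id & 0x0fff0000) >= 0x00040000 requires
 * the extended model/family bits to be at least 0x4, i.e. revision F
 * or later parts, which are the ones implementing C1E.  For example,
 * a hypothetical revision-F part with cpu_id 0x40f12 matches:
 *
 *	0x40f12 & 0x00000f00 = 0x00000f00
 *	0x40f12 & 0x0fff0000 = 0x00040000 >= 0x00040000
 */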
  637 
  638 #if defined(__i386__) && defined(PC98)
  639 void (*cpu_idle_fn)(sbintime_t) = cpu_idle_hlt;
  640 #else
  641 void (*cpu_idle_fn)(sbintime_t) = cpu_idle_acpi;
  642 #endif
  643 
  644 void
  645 cpu_idle(int busy)
  646 {
  647         uint64_t msr;
  648         sbintime_t sbt = -1;
  649 
  650         CTR2(KTR_SPARE2, "cpu_idle(%d) at %d",
  651             busy, curcpu);
  652 #ifdef MP_WATCHDOG
  653         ap_watchdog(PCPU_GET(cpuid));
  654 #endif
  655 
   656         /* If we are busy, try to use fast methods. */
  657         if (busy) {
  658                 if ((cpu_feature2 & CPUID2_MON) && idle_mwait) {
  659                         cpu_idle_mwait(busy);
  660                         goto out;
  661                 }
  662         }
  663 
   664         /* If we have time, switch timers into idle mode. */
  665         if (!busy) {
  666                 critical_enter();
  667                 sbt = cpu_idleclock();
  668         }
  669 
  670         /* Apply AMD APIC timer C1E workaround. */
  671         if (cpu_ident_amdc1e && cpu_disable_c3_sleep) {
  672                 msr = rdmsr(MSR_AMDK8_IPM);
  673                 if (msr & AMDK8_CMPHALT)
  674                         wrmsr(MSR_AMDK8_IPM, msr & ~AMDK8_CMPHALT);
  675         }
  676 
  677         /* Call main idle method. */
  678         cpu_idle_fn(sbt);
  679 
  680         /* Switch timers back into active mode. */
  681         if (!busy) {
  682                 cpu_activeclock();
  683                 critical_exit();
  684         }
  685 out:
  686         CTR2(KTR_SPARE2, "cpu_idle(%d) at %d done",
  687             busy, curcpu);
  688 }
  689 
  690 static int cpu_idle_apl31_workaround;
  691 SYSCTL_INT(_machdep, OID_AUTO, idle_apl31, CTLFLAG_RW,
  692     &cpu_idle_apl31_workaround, 0,
  693     "Apollo Lake APL31 MWAIT bug workaround");
  694 
  695 int
  696 cpu_idle_wakeup(int cpu)
  697 {
  698         int *state;
  699 
  700         state = (int *)pcpu_find(cpu)->pc_monitorbuf;
  701         switch (atomic_load_int(state)) {
  702         case STATE_SLEEPING:
  703                 return (0);
  704         case STATE_MWAIT:
  705                 atomic_store_int(state, STATE_RUNNING);
  706                 return (cpu_idle_apl31_workaround ? 0 : 1);
  707         case STATE_RUNNING:
  708                 return (1);
  709         default:
  710                 panic("bad monitor state");
  711                 return (1);
  712         }
  713 }
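
/*
 * The return value tells the caller whether the target CPU will
 * notice new work on its own: a store to the monitored address wakes
 * a CPU in STATE_MWAIT (unless the APL31 workaround is active), while
 * a CPU sleeping in hlt needs an interrupt.  A sketch of a
 * hypothetical scheduler-side caller (the real wakeup path lives in
 * the scheduler; illustrative only):
 *
 *	if (!cpu_idle_wakeup(cpu))
 *		ipi_cpu(cpu, IPI_AST);
 */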
  714 
  715 /*
  716  * Ordered by speed/power consumption.
  717  */
  718 static struct {
  719         void    *id_fn;
  720         char    *id_name;
  721         int     id_cpuid2_flag;
  722 } idle_tbl[] = {
  723         { .id_fn = cpu_idle_spin, .id_name = "spin" },
  724         { .id_fn = cpu_idle_mwait, .id_name = "mwait",
  725             .id_cpuid2_flag = CPUID2_MON },
  726         { .id_fn = cpu_idle_hlt, .id_name = "hlt" },
  727 #if !defined(__i386__) || !defined(PC98)
  728         { .id_fn = cpu_idle_acpi, .id_name = "acpi" },
  729 #endif
  730 };
  731 
  732 static int
  733 idle_sysctl_available(SYSCTL_HANDLER_ARGS)
  734 {
  735         char *avail, *p;
  736         int error;
  737         int i;
  738 
  739         avail = malloc(256, M_TEMP, M_WAITOK);
  740         p = avail;
  741         for (i = 0; i < nitems(idle_tbl); i++) {
  742                 if (idle_tbl[i].id_cpuid2_flag != 0 &&
  743                     (cpu_feature2 & idle_tbl[i].id_cpuid2_flag) == 0)
  744                         continue;
  745 #if !defined(__i386__) || !defined(PC98)
  746                 if (strcmp(idle_tbl[i].id_name, "acpi") == 0 &&
  747                     cpu_idle_hook == NULL)
  748                         continue;
  749 #endif
  750                 p += sprintf(p, "%s%s", p != avail ? ", " : "",
  751                     idle_tbl[i].id_name);
  752         }
  753         error = sysctl_handle_string(oidp, avail, 0, req);
  754         free(avail, M_TEMP);
  755         return (error);
  756 }
  757 
  758 SYSCTL_PROC(_machdep, OID_AUTO, idle_available, CTLTYPE_STRING | CTLFLAG_RD,
  759     0, 0, idle_sysctl_available, "A", "list of available idle functions");
  760 
  761 static bool
  762 cpu_idle_selector(const char *new_idle_name)
  763 {
  764         int i;
  765 
  766         for (i = 0; i < nitems(idle_tbl); i++) {
  767                 if (idle_tbl[i].id_cpuid2_flag != 0 &&
  768                     (cpu_feature2 & idle_tbl[i].id_cpuid2_flag) == 0)
  769                         continue;
  770 #if !defined(__i386__) || !defined(PC98)
  771                 if (strcmp(idle_tbl[i].id_name, "acpi") == 0 &&
  772                     cpu_idle_hook == NULL)
  773                         continue;
  774 #endif
  775                 if (strcmp(idle_tbl[i].id_name, new_idle_name))
  776                         continue;
  777                 cpu_idle_fn = idle_tbl[i].id_fn;
  778                 if (bootverbose)
  779                         printf("CPU idle set to %s\n", idle_tbl[i].id_name);
  780                 return (true);
  781         }
  782         return (false);
  783 }
  784 
  785 static int
  786 cpu_idle_sysctl(SYSCTL_HANDLER_ARGS)
  787 {
  788         char buf[16], *p;
  789         int error, i;
  790 
  791         p = "unknown";
  792         for (i = 0; i < nitems(idle_tbl); i++) {
  793                 if (idle_tbl[i].id_fn == cpu_idle_fn) {
  794                         p = idle_tbl[i].id_name;
  795                         break;
  796                 }
  797         }
  798         strncpy(buf, p, sizeof(buf));
  799         error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
  800         if (error != 0 || req->newptr == NULL)
  801                 return (error);
  802         return (cpu_idle_selector(buf) ? 0 : EINVAL);
  803 }
  804 
  805 SYSCTL_PROC(_machdep, OID_AUTO, idle, CTLTYPE_STRING | CTLFLAG_RW, 0, 0,
  806     cpu_idle_sysctl, "A", "currently selected idle function");
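
/*
 * The handler above accepts a new idle-method name by string.  A
 * minimal userland sketch using sysctlbyname(3), equivalent to
 * "sysctl machdep.idle=hlt" (illustrative only; needs <sys/types.h>
 * and <sys/sysctl.h>):
 *
 *	char cur[16];
 *	size_t len = sizeof(cur);
 *
 *	sysctlbyname("machdep.idle", cur, &len, NULL, 0);
 *	sysctlbyname("machdep.idle", NULL, NULL, "hlt", sizeof("hlt"));
 */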
  807 
  808 static void
  809 cpu_idle_tun(void *unused __unused)
  810 {
  811         char tunvar[16];
  812 
  813         if (TUNABLE_STR_FETCH("machdep.idle", tunvar, sizeof(tunvar)))
  814                 cpu_idle_selector(tunvar);
  815         else if (cpu_vendor_id == CPU_VENDOR_AMD &&
  816             CPUID_TO_FAMILY(cpu_id) == 0x17 && CPUID_TO_MODEL(cpu_id) == 0x1) {
   817                 /* Ryzen errata 1057 and 1109. */
  818                 cpu_idle_selector("hlt");
  819                 idle_mwait = 0;
  820         }
  821 
  822         if (cpu_vendor_id == CPU_VENDOR_INTEL && cpu_id == 0x506c9) {
  823                 /*
  824                  * Apollo Lake errata APL31 (public errata APL30).
  825                  * Stores to the armed address range may not trigger
  826                  * MWAIT to resume execution.  OS needs to use
   827                  * MWAIT to resume execution.  The OS needs to use
  828                  * sleep states.
  829                  */
  830                 cpu_idle_apl31_workaround = 1;
  831         }
  832         TUNABLE_INT_FETCH("machdep.idle_apl31", &cpu_idle_apl31_workaround);
  833 }
  834 SYSINIT(cpu_idle_tun, SI_SUB_CPU, SI_ORDER_MIDDLE, cpu_idle_tun, NULL);
  835 
  836 static int panic_on_nmi = 1;
  837 SYSCTL_INT(_machdep, OID_AUTO, panic_on_nmi, CTLFLAG_RWTUN,
  838     &panic_on_nmi, 0,
  839     "Panic on NMI raised by hardware failure");
  840 int nmi_is_broadcast = 1;
  841 SYSCTL_INT(_machdep, OID_AUTO, nmi_is_broadcast, CTLFLAG_RWTUN,
  842     &nmi_is_broadcast, 0,
  843     "Chipset NMI is broadcast");
  844 #ifdef KDB
  845 int kdb_on_nmi = 1;
  846 SYSCTL_INT(_machdep, OID_AUTO, kdb_on_nmi, CTLFLAG_RWTUN,
  847     &kdb_on_nmi, 0,
  848     "Go to KDB on NMI with unknown source");
  849 #endif
  850 
  851 void
  852 nmi_call_kdb(u_int cpu, u_int type, struct trapframe *frame)
  853 {
  854         bool claimed = false;
  855 
  856 #ifdef DEV_ISA
  857         /* machine/parity/power fail/"kitchen sink" faults */
  858         if (isa_nmi(frame->tf_err)) {
  859                 claimed = true;
  860                 if (panic_on_nmi)
  861                         panic("NMI indicates hardware failure");
  862         }
  863 #endif /* DEV_ISA */
  864 #ifdef KDB
  865         if (!claimed && kdb_on_nmi) {
  866                 /*
  867                  * NMI can be hooked up to a pushbutton for debugging.
  868                  */
  869                 printf("NMI/cpu%d ... going to debugger\n", cpu);
  870                 kdb_trap(type, 0, frame);
  871         }
  872 #endif /* KDB */
  873 }
  874 
  875 void
  876 nmi_handle_intr(u_int type, struct trapframe *frame)
  877 {
  878 
  879 #ifdef SMP
  880         if (nmi_is_broadcast) {
  881                 nmi_call_kdb_smp(type, frame);
  882                 return;
  883         }
  884 #endif
  885         nmi_call_kdb(PCPU_GET(cpuid), type, frame);
  886 }
  887 
  888 static int hw_ibrs_active;
  889 int hw_ibrs_ibpb_active;
  890 int hw_ibrs_disable = 1;
  891 
  892 SYSCTL_INT(_hw, OID_AUTO, ibrs_active, CTLFLAG_RD, &hw_ibrs_active, 0,
  893     "Indirect Branch Restricted Speculation active");
  894 
  895 void
  896 hw_ibrs_recalculate(bool for_all_cpus)
  897 {
  898         if ((cpu_ia32_arch_caps & IA32_ARCH_CAP_IBRS_ALL) != 0) {
  899                 x86_msr_op(MSR_IA32_SPEC_CTRL, (for_all_cpus ?
  900                     MSR_OP_RENDEZVOUS : MSR_OP_LOCAL) |
  901                     (hw_ibrs_disable != 0 ? MSR_OP_ANDNOT : MSR_OP_OR),
  902                     IA32_SPEC_CTRL_IBRS);
  903                 hw_ibrs_active = hw_ibrs_disable == 0;
  904                 hw_ibrs_ibpb_active = 0;
  905         } else {
  906                 hw_ibrs_active = hw_ibrs_ibpb_active = (cpu_stdext_feature3 &
  907                     CPUID_STDEXT3_IBPB) != 0 && !hw_ibrs_disable;
  908         }
  909 }
  910 
  911 static int
  912 hw_ibrs_disable_handler(SYSCTL_HANDLER_ARGS)
  913 {
  914         int error, val;
  915 
  916         val = hw_ibrs_disable;
  917         error = sysctl_handle_int(oidp, &val, 0, req);
  918         if (error != 0 || req->newptr == NULL)
  919                 return (error);
  920         hw_ibrs_disable = val != 0;
  921         hw_ibrs_recalculate(true);
  922         return (0);
  923 }
  924 SYSCTL_PROC(_hw, OID_AUTO, ibrs_disable, CTLTYPE_INT | CTLFLAG_RWTUN |
  925     CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, NULL, 0, hw_ibrs_disable_handler, "I",
  926     "Disable Indirect Branch Restricted Speculation");
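
/*
 * Because the handler re-runs hw_ibrs_recalculate() on each write,
 * the IBRS mitigation can be toggled at runtime.  A userland sketch,
 * equivalent to "sysctl hw.ibrs_disable=0" followed by reading
 * hw.ibrs_active (illustrative only):
 *
 *	int zero = 0, active;
 *	size_t len = sizeof(active);
 *
 *	sysctlbyname("hw.ibrs_disable", NULL, NULL, &zero, sizeof(zero));
 *	sysctlbyname("hw.ibrs_active", &active, &len, NULL, 0);
 */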
  927 
  928 int hw_ssb_active;
  929 int hw_ssb_disable;
  930 
  931 SYSCTL_INT(_hw, OID_AUTO, spec_store_bypass_disable_active, CTLFLAG_RD,
  932     &hw_ssb_active, 0,
  933     "Speculative Store Bypass Disable active");
  934 
  935 static void
  936 hw_ssb_set(bool enable, bool for_all_cpus)
  937 {
  938 
  939         if ((cpu_stdext_feature3 & CPUID_STDEXT3_SSBD) == 0) {
  940                 hw_ssb_active = 0;
  941                 return;
  942         }
  943         hw_ssb_active = enable;
  944         x86_msr_op(MSR_IA32_SPEC_CTRL,
  945             (enable ? MSR_OP_OR : MSR_OP_ANDNOT) |
  946             (for_all_cpus ? MSR_OP_SCHED : MSR_OP_LOCAL), IA32_SPEC_CTRL_SSBD);
  947 }
  948 
  949 void
  950 hw_ssb_recalculate(bool all_cpus)
  951 {
  952 
  953         switch (hw_ssb_disable) {
  954         default:
  955                 hw_ssb_disable = 0;
  956                 /* FALLTHROUGH */
  957         case 0: /* off */
  958                 hw_ssb_set(false, all_cpus);
  959                 break;
  960         case 1: /* on */
  961                 hw_ssb_set(true, all_cpus);
  962                 break;
  963         case 2: /* auto */
  964                 hw_ssb_set((cpu_ia32_arch_caps & IA32_ARCH_CAP_SSB_NO) != 0 ?
  965                     false : true, all_cpus);
  966                 break;
  967         }
  968 }
  969 
  970 static int
  971 hw_ssb_disable_handler(SYSCTL_HANDLER_ARGS)
  972 {
  973         int error, val;
  974 
  975         val = hw_ssb_disable;
  976         error = sysctl_handle_int(oidp, &val, 0, req);
  977         if (error != 0 || req->newptr == NULL)
  978                 return (error);
  979         hw_ssb_disable = val;
  980         hw_ssb_recalculate(true);
  981         return (0);
  982 }
  983 SYSCTL_PROC(_hw, OID_AUTO, spec_store_bypass_disable, CTLTYPE_INT |
  984     CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, NULL, 0,
  985     hw_ssb_disable_handler, "I",
   986     "Speculative Store Bypass Disable (0 - off, 1 - on, 2 - auto)");
  987 
  988 int hw_mds_disable;
  989 
  990 /*
   991  * Handler for Microarchitectural Data Sampling issues.  Not really a
   992  * pointer to a C function: on amd64 the code must not change any CPU
   993  * architectural state except possibly %rflags.  Also, it is always
  994  * called with interrupts disabled.
  995  */
  996 void mds_handler_void(void);
  997 void mds_handler_verw(void);
  998 void mds_handler_ivb(void);
  999 void mds_handler_bdw(void);
 1000 void mds_handler_skl_sse(void);
 1001 void mds_handler_skl_avx(void);
 1002 void mds_handler_skl_avx512(void);
 1003 void mds_handler_silvermont(void);
 1004 void (*mds_handler)(void) = mds_handler_void;
 1005 
 1006 static int
 1007 sysctl_hw_mds_disable_state_handler(SYSCTL_HANDLER_ARGS)
 1008 {
 1009         const char *state;
 1010 
 1011         if (mds_handler == mds_handler_void)
 1012                 state = "inactive";
 1013         else if (mds_handler == mds_handler_verw)
 1014                 state = "VERW";
 1015         else if (mds_handler == mds_handler_ivb)
 1016                 state = "software IvyBridge";
 1017         else if (mds_handler == mds_handler_bdw)
 1018                 state = "software Broadwell";
 1019         else if (mds_handler == mds_handler_skl_sse)
 1020                 state = "software Skylake SSE";
 1021         else if (mds_handler == mds_handler_skl_avx)
 1022                 state = "software Skylake AVX";
 1023         else if (mds_handler == mds_handler_skl_avx512)
 1024                 state = "software Skylake AVX512";
 1025         else if (mds_handler == mds_handler_silvermont)
 1026                 state = "software Silvermont";
 1027         else
 1028                 state = "unknown";
 1029         return (SYSCTL_OUT(req, state, strlen(state)));
 1030 }
 1031 
 1032 SYSCTL_PROC(_hw, OID_AUTO, mds_disable_state,
 1033     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
 1034     sysctl_hw_mds_disable_state_handler, "A",
 1035     "Microarchitectural Data Sampling Mitigation state");
 1036 
 1037 _Static_assert(__offsetof(struct pcpu, pc_mds_tmp) % 64 == 0, "MDS AVX512");
 1038 
 1039 void
 1040 hw_mds_recalculate(void)
 1041 {
 1042         struct pcpu *pc;
 1043         vm_offset_t b64;
 1044         u_long xcr0;
 1045         int i;
 1046 
 1047         /*
  1048          * Allow the user to force the VERW variant even if MD_CLEAR
  1049          * is not reported.  For instance, a hypervisor might
  1050          * unknowingly filter the cap out.
  1051          * For similar reasons, and for testing, allow enabling the
  1052          * mitigation even when the MDS_NO cap is set.
 1053          */
 1054         if (cpu_vendor_id != CPU_VENDOR_INTEL || hw_mds_disable == 0 ||
 1055             ((cpu_ia32_arch_caps & IA32_ARCH_CAP_MDS_NO) != 0 &&
 1056             hw_mds_disable == 3)) {
 1057                 mds_handler = mds_handler_void;
 1058         } else if (((cpu_stdext_feature3 & CPUID_STDEXT3_MD_CLEAR) != 0 &&
 1059             hw_mds_disable == 3) || hw_mds_disable == 1) {
 1060                 mds_handler = mds_handler_verw;
 1061         } else if (CPUID_TO_FAMILY(cpu_id) == 0x6 &&
 1062             (CPUID_TO_MODEL(cpu_id) == 0x2e || CPUID_TO_MODEL(cpu_id) == 0x1e ||
 1063             CPUID_TO_MODEL(cpu_id) == 0x1f || CPUID_TO_MODEL(cpu_id) == 0x1a ||
 1064             CPUID_TO_MODEL(cpu_id) == 0x2f || CPUID_TO_MODEL(cpu_id) == 0x25 ||
 1065             CPUID_TO_MODEL(cpu_id) == 0x2c || CPUID_TO_MODEL(cpu_id) == 0x2d ||
 1066             CPUID_TO_MODEL(cpu_id) == 0x2a || CPUID_TO_MODEL(cpu_id) == 0x3e ||
 1067             CPUID_TO_MODEL(cpu_id) == 0x3a) &&
 1068             (hw_mds_disable == 2 || hw_mds_disable == 3)) {
 1069                 /*
 1070                  * Nehalem, SandyBridge, IvyBridge
 1071                  */
 1072                 CPU_FOREACH(i) {
 1073                         pc = pcpu_find(i);
 1074                         if (pc->pc_mds_buf == NULL) {
 1075                                 pc->pc_mds_buf = malloc(672, M_TEMP,
 1076                                     M_WAITOK);
 1077                                 bzero(pc->pc_mds_buf, 16);
 1078                         }
 1079                 }
 1080                 mds_handler = mds_handler_ivb;
 1081         } else if (CPUID_TO_FAMILY(cpu_id) == 0x6 &&
 1082             (CPUID_TO_MODEL(cpu_id) == 0x3f || CPUID_TO_MODEL(cpu_id) == 0x3c ||
 1083             CPUID_TO_MODEL(cpu_id) == 0x45 || CPUID_TO_MODEL(cpu_id) == 0x46 ||
 1084             CPUID_TO_MODEL(cpu_id) == 0x56 || CPUID_TO_MODEL(cpu_id) == 0x4f ||
 1085             CPUID_TO_MODEL(cpu_id) == 0x47 || CPUID_TO_MODEL(cpu_id) == 0x3d) &&
 1086             (hw_mds_disable == 2 || hw_mds_disable == 3)) {
 1087                 /*
 1088                  * Haswell, Broadwell
 1089                  */
 1090                 CPU_FOREACH(i) {
 1091                         pc = pcpu_find(i);
 1092                         if (pc->pc_mds_buf == NULL) {
 1093                                 pc->pc_mds_buf = malloc(1536, M_TEMP,
 1094                                     M_WAITOK);
 1095                                 bzero(pc->pc_mds_buf, 16);
 1096                         }
 1097                 }
 1098                 mds_handler = mds_handler_bdw;
 1099         } else if (CPUID_TO_FAMILY(cpu_id) == 0x6 &&
 1100             ((CPUID_TO_MODEL(cpu_id) == 0x55 && (cpu_id &
 1101             CPUID_STEPPING) <= 5) ||
 1102             CPUID_TO_MODEL(cpu_id) == 0x4e || CPUID_TO_MODEL(cpu_id) == 0x5e ||
 1103             (CPUID_TO_MODEL(cpu_id) == 0x8e && (cpu_id &
 1104             CPUID_STEPPING) <= 0xb) ||
 1105             (CPUID_TO_MODEL(cpu_id) == 0x9e && (cpu_id &
 1106             CPUID_STEPPING) <= 0xc)) &&
 1107             (hw_mds_disable == 2 || hw_mds_disable == 3)) {
 1108                 /*
 1109                  * Skylake, KabyLake, CoffeeLake, WhiskeyLake,
 1110                  * CascadeLake
 1111                  */
 1112                 CPU_FOREACH(i) {
 1113                         pc = pcpu_find(i);
 1114                         if (pc->pc_mds_buf == NULL) {
 1115                                 pc->pc_mds_buf = malloc(6 * 1024,
 1116                                     M_TEMP, M_WAITOK);
 1117                                 b64 = (vm_offset_t)malloc(64 + 63,
 1118                                     M_TEMP, M_WAITOK);
 1119                                 pc->pc_mds_buf64 = (void *)roundup2(b64, 64);
 1120                                 bzero(pc->pc_mds_buf64, 64);
 1121                         }
 1122                 }
 1123                 xcr0 = rxcr(0);
 1124                 if ((xcr0 & XFEATURE_ENABLED_ZMM_HI256) != 0 &&
 1125                     (cpu_stdext_feature2 & CPUID_STDEXT_AVX512DQ) != 0)
 1126                         mds_handler = mds_handler_skl_avx512;
 1127                 else if ((xcr0 & XFEATURE_ENABLED_AVX) != 0 &&
 1128                     (cpu_feature2 & CPUID2_AVX) != 0)
 1129                         mds_handler = mds_handler_skl_avx;
 1130                 else
 1131                         mds_handler = mds_handler_skl_sse;
 1132         } else if (CPUID_TO_FAMILY(cpu_id) == 0x6 &&
 1133             ((CPUID_TO_MODEL(cpu_id) == 0x37 ||
 1134             CPUID_TO_MODEL(cpu_id) == 0x4a ||
 1135             CPUID_TO_MODEL(cpu_id) == 0x4c ||
 1136             CPUID_TO_MODEL(cpu_id) == 0x4d ||
 1137             CPUID_TO_MODEL(cpu_id) == 0x5a ||
 1138             CPUID_TO_MODEL(cpu_id) == 0x5d ||
 1139             CPUID_TO_MODEL(cpu_id) == 0x6e ||
 1140             CPUID_TO_MODEL(cpu_id) == 0x65 ||
 1141             CPUID_TO_MODEL(cpu_id) == 0x75 ||
 1142             CPUID_TO_MODEL(cpu_id) == 0x1c ||
 1143             CPUID_TO_MODEL(cpu_id) == 0x26 ||
 1144             CPUID_TO_MODEL(cpu_id) == 0x27 ||
 1145             CPUID_TO_MODEL(cpu_id) == 0x35 ||
 1146             CPUID_TO_MODEL(cpu_id) == 0x36 ||
 1147             CPUID_TO_MODEL(cpu_id) == 0x7a))) {
 1148                 /* Silvermont, Airmont */
 1149                 CPU_FOREACH(i) {
 1150                         pc = pcpu_find(i);
 1151                         if (pc->pc_mds_buf == NULL)
 1152                                 pc->pc_mds_buf = malloc(256, M_TEMP, M_WAITOK);
 1153                 }
 1154                 mds_handler = mds_handler_silvermont;
 1155         } else {
 1156                 hw_mds_disable = 0;
 1157                 mds_handler = mds_handler_void;
 1158         }
 1159 }
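
/*
 * The model checks above rely on the standard x86 CPUID signature
 * layout: bits 0-3 of cpu_id are the stepping, bits 4-7 the base
 * model, bits 8-11 the base family, and bits 16-19 the extended model
 * (folded in for family 0x6 and 0xf parts).  A worked example with
 * the Apollo Lake id 0x506c9 used by cpu_idle_tun() above:
 *
 *	stepping   = 0x506c9 & CPUID_STEPPING = 0x9
 *	base model = (0x506c9 >> 4) & 0xf     = 0xc
 *	ext model  = (0x506c9 >> 16) & 0xf    = 0x5
 *	CPUID_TO_MODEL(0x506c9)  = (0x5 << 4) | 0xc = 0x5c
 *	CPUID_TO_FAMILY(0x506c9) = 0x6
 */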
 1160 
 1161 static void
 1162 hw_mds_recalculate_boot(void *arg __unused)
 1163 {
 1164 
 1165         hw_mds_recalculate();
 1166 }
 1167 SYSINIT(mds_recalc, SI_SUB_SMP, SI_ORDER_ANY, hw_mds_recalculate_boot, NULL);
 1168 
 1169 static int
 1170 sysctl_mds_disable_handler(SYSCTL_HANDLER_ARGS)
 1171 {
 1172         int error, val;
 1173 
 1174         val = hw_mds_disable;
 1175         error = sysctl_handle_int(oidp, &val, 0, req);
 1176         if (error != 0 || req->newptr == NULL)
 1177                 return (error);
 1178         if (val < 0 || val > 3)
 1179                 return (EINVAL);
 1180         hw_mds_disable = val;
 1181         hw_mds_recalculate();
 1182         return (0);
 1183 }
 1184 
 1185 SYSCTL_PROC(_hw, OID_AUTO, mds_disable, CTLTYPE_INT |
 1186     CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, NULL, 0,
 1187     sysctl_mds_disable_handler, "I",
 1188     "Microarchitectural Data Sampling Mitigation "
  1189     "(0 - off, 1 - on VERW, 2 - on SW, 3 - on AUTO)");
 1190 
 1191 
 1192 /*
 1193  * Intel Transactional Memory Asynchronous Abort Mitigation
 1194  * CVE-2019-11135
 1195  */
 1196 int x86_taa_enable;
 1197 int x86_taa_state;
 1198 enum {
 1199         TAA_NONE        = 0,    /* No mitigation enabled */
 1200         TAA_TSX_DISABLE = 1,    /* Disable TSX via MSR */
 1201         TAA_VERW        = 2,    /* Use VERW mitigation */
 1202         TAA_AUTO        = 3,    /* Automatically select the mitigation */
 1203 
 1204         /* The states below are not selectable by the operator */
 1205 
 1206         TAA_TAA_UC      = 4,    /* Mitigation present in microcode */
 1207         TAA_NOT_PRESENT = 5     /* TSX is not present */
 1208 };
 1209 
 1210 static void
 1211 taa_set(bool enable, bool all)
 1212 {
 1213 
 1214         x86_msr_op(MSR_IA32_TSX_CTRL,
 1215             (enable ? MSR_OP_OR : MSR_OP_ANDNOT) |
 1216             (all ? MSR_OP_RENDEZVOUS : MSR_OP_LOCAL),
 1217             IA32_TSX_CTRL_RTM_DISABLE | IA32_TSX_CTRL_TSX_CPUID_CLEAR);
 1218 }
 1219 
 1220 void
 1221 x86_taa_recalculate(void)
 1222 {
 1223         static int taa_saved_mds_disable = 0;
 1224         int taa_need = 0, taa_state = 0;
 1225         int mds_disable = 0, need_mds_recalc = 0;
 1226 
 1227         /* Check CPUID.07h.EBX.HLE and RTM for the presence of TSX */
 1228         if ((cpu_stdext_feature & CPUID_STDEXT_HLE) == 0 ||
 1229             (cpu_stdext_feature & CPUID_STDEXT_RTM) == 0) {
 1230                 /* TSX is not present */
 1231                 x86_taa_state = TAA_NOT_PRESENT;
 1232                 return;
 1233         }
 1234 
 1235         /* Check to see what mitigation options the CPU gives us */
 1236         if (cpu_ia32_arch_caps & IA32_ARCH_CAP_TAA_NO) {
  1237                 /* CPU is not susceptible to TAA */
 1238                 taa_need = TAA_TAA_UC;
 1239         } else if (cpu_ia32_arch_caps & IA32_ARCH_CAP_TSX_CTRL) {
 1240                 /*
 1241                  * CPU can turn off TSX.  This is the next best option
  1242                  * if the TAA_NO hardware mitigation isn't present.
 1243                  */
 1244                 taa_need = TAA_TSX_DISABLE;
 1245         } else {
 1246                 /* No TSX/TAA specific remedies are available. */
 1247                 if (x86_taa_enable == TAA_TSX_DISABLE) {
 1248                         if (bootverbose)
 1249                                 printf("TSX control not available\n");
 1250                         return;
 1251                 } else
 1252                         taa_need = TAA_VERW;
 1253         }
 1254 
 1255         /* Can we automatically take action, or are we being forced? */
 1256         if (x86_taa_enable == TAA_AUTO)
 1257                 taa_state = taa_need;
 1258         else
 1259                 taa_state = x86_taa_enable;
 1260 
 1261         /* No state change, nothing to do */
 1262         if (taa_state == x86_taa_state) {
 1263                 if (bootverbose)
 1264                         printf("No TSX change made\n");
 1265                 return;
 1266         }
 1267 
 1268         /* Does the MSR need to be turned on or off? */
 1269         if (taa_state == TAA_TSX_DISABLE)
 1270                 taa_set(true, true);
 1271         else if (x86_taa_state == TAA_TSX_DISABLE)
 1272                 taa_set(false, true);
 1273 
 1274         /* Does MDS need to be set to turn on VERW? */
 1275         if (taa_state == TAA_VERW) {
 1276                 taa_saved_mds_disable = hw_mds_disable;
 1277                 mds_disable = hw_mds_disable = 1;
 1278                 need_mds_recalc = 1;
 1279         } else if (x86_taa_state == TAA_VERW) {
 1280                 mds_disable = hw_mds_disable = taa_saved_mds_disable;
 1281                 need_mds_recalc = 1;
 1282         }
 1283         if (need_mds_recalc) {
 1284                 hw_mds_recalculate();
 1285                 if (mds_disable != hw_mds_disable) {
 1286                         if (bootverbose)
 1287                                 printf("Cannot change MDS state for TAA\n");
 1288                         /* Don't update our state */
 1289                         return;
 1290                 }
 1291         }
 1292 
 1293         x86_taa_state = taa_state;
 1294         return;
 1295 }
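
/*
 * A worked trace of the logic above, assuming a CPU that advertises
 * TSX_CTRL but not TAA_NO, with the operator setting x86_taa_enable
 * to TAA_AUTO (illustrative only): taa_need resolves to
 * TAA_TSX_DISABLE, taa_state takes that value, and since it differs
 * from the previous x86_taa_state, taa_set(true, true) sets
 * IA32_TSX_CTRL_RTM_DISABLE and IA32_TSX_CTRL_TSX_CPUID_CLEAR on all
 * CPUs via rendezvous; x86_taa_state then becomes TAA_TSX_DISABLE.
 */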
 1296 
 1297 static void
 1298 taa_recalculate_boot(void * arg __unused)
 1299 {
 1300 
 1301         x86_taa_recalculate();
 1302 }
 1303 SYSINIT(taa_recalc, SI_SUB_SMP, SI_ORDER_ANY, taa_recalculate_boot, NULL);
 1304 
 1305 SYSCTL_NODE(_machdep_mitigations, OID_AUTO, taa, CTLFLAG_RW, 0,
  1306     "TSX Asynchronous Abort Mitigation");
 1307 
 1308 static int
 1309 sysctl_taa_handler(SYSCTL_HANDLER_ARGS)
 1310 {
 1311         int error, val;
 1312 
 1313         val = x86_taa_enable;
 1314         error = sysctl_handle_int(oidp, &val, 0, req);
 1315         if (error != 0 || req->newptr == NULL)
 1316                 return (error);
 1317         if (val < TAA_NONE || val > TAA_AUTO)
 1318                 return (EINVAL);
 1319         x86_taa_enable = val;
 1320         x86_taa_recalculate();
 1321         return (0);
 1322 }
 1323 
 1324 SYSCTL_PROC(_machdep_mitigations_taa, OID_AUTO, enable, CTLTYPE_INT |
 1325     CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, NULL, 0,
 1326     sysctl_taa_handler, "I",
 1327     "TAA Mitigation enablement control "
  1328     "(0 - off, 1 - disable TSX, 2 - VERW, 3 - on AUTO)");
 1329 
 1330 static int
 1331 sysctl_taa_state_handler(SYSCTL_HANDLER_ARGS)
 1332 {
 1333         const char *state;
 1334 
 1335         switch (x86_taa_state) {
 1336         case TAA_NONE:
 1337                 state = "inactive";
 1338                 break;
 1339         case TAA_TSX_DISABLE:
 1340                 state = "TSX disabled";
 1341                 break;
 1342         case TAA_VERW:
 1343                 state = "VERW";
 1344                 break;
 1345         case TAA_TAA_UC:
 1346                 state = "Mitigated in microcode";
 1347                 break;
 1348         case TAA_NOT_PRESENT:
 1349                 state = "TSX not present";
 1350                 break;
 1351         default:
 1352                 state = "unknown";
 1353         }
 1354 
 1355         return (SYSCTL_OUT(req, state, strlen(state)));
 1356 }
 1357 
 1358 SYSCTL_PROC(_machdep_mitigations_taa, OID_AUTO, state,
 1359     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
 1360     sysctl_taa_state_handler, "A",
 1361     "TAA Mitigation state");
 1362 
 1363 int __read_frequently cpu_flush_rsb_ctxsw;
 1364 SYSCTL_INT(_machdep_mitigations, OID_AUTO, flush_rsb_ctxsw,
 1365     CTLFLAG_RW | CTLFLAG_NOFETCH, &cpu_flush_rsb_ctxsw, 0,
 1366     "Flush Return Stack Buffer on context switch");
 1367 
 1368 SYSCTL_NODE(_machdep_mitigations, OID_AUTO, rngds,
 1369     CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
 1370     "MCU Optimization, disable RDSEED mitigation");
 1371 
 1372 int x86_rngds_mitg_enable = 1;
 1373 void
 1374 x86_rngds_mitg_recalculate(bool all_cpus)
 1375 {
 1376         if ((cpu_stdext_feature3 & CPUID_STDEXT3_MCUOPT) == 0)
 1377                 return;
 1378         x86_msr_op(MSR_IA32_MCU_OPT_CTRL,
 1379             (x86_rngds_mitg_enable ? MSR_OP_OR : MSR_OP_ANDNOT) |
 1380             (all_cpus ? MSR_OP_RENDEZVOUS : MSR_OP_LOCAL),
 1381             IA32_RNGDS_MITG_DIS);
 1382 }
 1383 
 1384 static int
 1385 sysctl_rngds_mitg_enable_handler(SYSCTL_HANDLER_ARGS)
 1386 {
 1387         int error, val;
 1388 
 1389         val = x86_rngds_mitg_enable;
 1390         error = sysctl_handle_int(oidp, &val, 0, req);
 1391         if (error != 0 || req->newptr == NULL)
 1392                 return (error);
 1393         x86_rngds_mitg_enable = val;
 1394         x86_rngds_mitg_recalculate(true);
 1395         return (0);
 1396 }
 1397 SYSCTL_PROC(_machdep_mitigations_rngds, OID_AUTO, enable, CTLTYPE_INT |
 1398     CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, NULL, 0,
 1399     sysctl_rngds_mitg_enable_handler, "I",
 1400     "MCU Optimization, disabling RDSEED mitigation control "
  1401     "(0 - mitigation disabled (RDSEED optimized), 1 - mitigation enabled)");
 1402 
 1403 static int
 1404 sysctl_rngds_state_handler(SYSCTL_HANDLER_ARGS)
 1405 {
 1406         const char *state;
 1407 
 1408         if ((cpu_stdext_feature3 & CPUID_STDEXT3_MCUOPT) == 0) {
 1409                 state = "Not applicable";
 1410         } else if (x86_rngds_mitg_enable == 0) {
 1411                 state = "RDSEED not serialized";
 1412         } else {
 1413                 state = "Mitigated";
 1414         }
 1415         return (SYSCTL_OUT(req, state, strlen(state)));
 1416 }
 1417 SYSCTL_PROC(_machdep_mitigations_rngds, OID_AUTO, state,
 1418     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
 1419     sysctl_rngds_state_handler, "A",
 1420     "MCU Optimization state");
