FreeBSD/Linux Kernel Cross Reference
sys/x86/x86/cpu_machdep.c


    1 /*-
    2  * Copyright (c) 2003 Peter Wemm.
    3  * Copyright (c) 1992 Terrence R. Lambert.
    4  * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
    5  * All rights reserved.
    6  *
    7  * This code is derived from software contributed to Berkeley by
    8  * William Jolitz.
    9  *
   10  * Redistribution and use in source and binary forms, with or without
   11  * modification, are permitted provided that the following conditions
   12  * are met:
   13  * 1. Redistributions of source code must retain the above copyright
   14  *    notice, this list of conditions and the following disclaimer.
   15  * 2. Redistributions in binary form must reproduce the above copyright
   16  *    notice, this list of conditions and the following disclaimer in the
   17  *    documentation and/or other materials provided with the distribution.
   18  * 3. All advertising materials mentioning features or use of this software
   19  *    must display the following acknowledgement:
   20  *      This product includes software developed by the University of
   21  *      California, Berkeley and its contributors.
   22  * 4. Neither the name of the University nor the names of its contributors
   23  *    may be used to endorse or promote products derived from this software
   24  *    without specific prior written permission.
   25  *
   26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   36  * SUCH DAMAGE.
   37  *
   38  *      from: @(#)machdep.c     7.4 (Berkeley) 6/3/91
   39  */
   40 
   41 #include <sys/cdefs.h>
   42 __FBSDID("$FreeBSD$");
   43 
   44 #include "opt_acpi.h"
   45 #include "opt_atpic.h"
   46 #include "opt_cpu.h"
   47 #include "opt_ddb.h"
   48 #include "opt_inet.h"
   49 #include "opt_isa.h"
   50 #include "opt_kdb.h"
   51 #include "opt_kstack_pages.h"
   52 #include "opt_maxmem.h"
   53 #include "opt_platform.h"
   54 #include "opt_sched.h"
   55 #ifdef __i386__
   56 #include "opt_apic.h"
   57 #endif
   58 
   59 #include <sys/param.h>
   60 #include <sys/proc.h>
   61 #include <sys/systm.h>
   62 #include <sys/bus.h>
   63 #include <sys/cpu.h>
   64 #include <sys/domainset.h>
   65 #include <sys/kdb.h>
   66 #include <sys/kernel.h>
   67 #include <sys/ktr.h>
   68 #include <sys/lock.h>
   69 #include <sys/malloc.h>
   70 #include <sys/mutex.h>
   71 #include <sys/pcpu.h>
   72 #include <sys/rwlock.h>
   73 #include <sys/sched.h>
   74 #include <sys/smp.h>
   75 #include <sys/sysctl.h>
   76 
   77 #include <machine/clock.h>
   78 #include <machine/cpu.h>
   79 #include <machine/cpufunc.h>
   80 #include <machine/cputypes.h>
   81 #include <machine/specialreg.h>
   82 #include <machine/md_var.h>
   83 #include <machine/tss.h>
   84 #ifdef SMP
   85 #include <machine/smp.h>
   86 #endif
   87 #ifdef CPU_ELAN
   88 #include <machine/elan_mmcr.h>
   89 #endif
   90 #include <x86/acpica_machdep.h>
   91 #include <x86/ifunc.h>
   92 
   93 #include <vm/vm.h>
   94 #include <vm/vm_extern.h>
   95 #include <vm/vm_kern.h>
   96 #include <vm/vm_page.h>
   97 #include <vm/vm_map.h>
   98 #include <vm/vm_object.h>
   99 #include <vm/vm_pager.h>
  100 #include <vm/vm_param.h>
  101 
  102 #include <isa/isareg.h>
  103 
  104 #include <contrib/dev/acpica/include/acpi.h>
  105 
  106 #define STATE_RUNNING   0x0
  107 #define STATE_MWAIT     0x1
  108 #define STATE_SLEEPING  0x2
  109 
  110 #ifdef SMP
  111 static u_int    cpu_reset_proxyid;
  112 static volatile u_int   cpu_reset_proxy_active;
  113 #endif
  114 
  115 char bootmethod[16];
  116 SYSCTL_STRING(_machdep, OID_AUTO, bootmethod, CTLFLAG_RD, bootmethod, 0,
  117     "System firmware boot method");
  118 
  119 struct msr_op_arg {
  120         u_int msr;
  121         int op;
  122         uint64_t arg1;
  123         uint64_t *res;
  124 };
  125 
  126 static void
  127 x86_msr_op_one(void *argp)
  128 {
  129         struct msr_op_arg *a;
  130         uint64_t v;
  131 
  132         a = argp;
  133         switch (a->op) {
  134         case MSR_OP_ANDNOT:
  135                 v = rdmsr(a->msr);
  136                 v &= ~a->arg1;
  137                 wrmsr(a->msr, v);
  138                 break;
  139         case MSR_OP_OR:
  140                 v = rdmsr(a->msr);
  141                 v |= a->arg1;
  142                 wrmsr(a->msr, v);
  143                 break;
  144         case MSR_OP_WRITE:
  145                 wrmsr(a->msr, a->arg1);
  146                 break;
  147         case MSR_OP_READ:
  148                 v = rdmsr(a->msr);
  149                 *a->res = v;
  150                 break;
  151         }
  152 }
  153 
  154 #define MSR_OP_EXMODE_MASK      0xf0000000
  155 #define MSR_OP_OP_MASK          0x000000ff
  156 #define MSR_OP_GET_CPUID(x)     (((x) & ~MSR_OP_EXMODE_MASK) >> 8)
  157 
  158 void
  159 x86_msr_op(u_int msr, u_int op, uint64_t arg1, uint64_t *res)
  160 {
  161         struct thread *td;
  162         struct msr_op_arg a;
  163         cpuset_t set;
  164         u_int exmode;
  165         int bound_cpu, cpu, i, is_bound;
  166 
  167         a.op = op & MSR_OP_OP_MASK;
  168         MPASS(a.op == MSR_OP_ANDNOT || a.op == MSR_OP_OR ||
  169             a.op == MSR_OP_WRITE || a.op == MSR_OP_READ);
  170         exmode = op & MSR_OP_EXMODE_MASK;
  171         MPASS(exmode == MSR_OP_LOCAL || exmode == MSR_OP_SCHED_ALL ||
  172             exmode == MSR_OP_SCHED_ONE || exmode == MSR_OP_RENDEZVOUS_ALL ||
  173             exmode == MSR_OP_RENDEZVOUS_ONE);
  174         a.msr = msr;
  175         a.arg1 = arg1;
  176         a.res = res;
  177         switch (exmode) {
  178         case MSR_OP_LOCAL:
  179                 x86_msr_op_one(&a);
  180                 break;
  181         case MSR_OP_SCHED_ALL:
  182                 td = curthread;
  183                 thread_lock(td);
  184                 is_bound = sched_is_bound(td);
  185                 bound_cpu = td->td_oncpu;
  186                 CPU_FOREACH(i) {
  187                         sched_bind(td, i);
  188                         x86_msr_op_one(&a);
  189                 }
  190                 if (is_bound)
  191                         sched_bind(td, bound_cpu);
  192                 else
  193                         sched_unbind(td);
  194                 thread_unlock(td);
  195                 break;
  196         case MSR_OP_SCHED_ONE:
  197                 td = curthread;
  198                 cpu = MSR_OP_GET_CPUID(op);
  199                 thread_lock(td);
  200                 is_bound = sched_is_bound(td);
  201                 bound_cpu = td->td_oncpu;
  202                 if (!is_bound || bound_cpu != cpu)
  203                         sched_bind(td, cpu);
  204                 x86_msr_op_one(&a);
  205                 if (is_bound) {
  206                         if (bound_cpu != cpu)
  207                                 sched_bind(td, bound_cpu);
  208                 } else {
  209                         sched_unbind(td);
  210                 }
  211                 thread_unlock(td);
  212                 break;
  213         case MSR_OP_RENDEZVOUS_ALL:
  214                 smp_rendezvous(smp_no_rendezvous_barrier, x86_msr_op_one,
  215                     smp_no_rendezvous_barrier, &a);
  216                 break;
  217         case MSR_OP_RENDEZVOUS_ONE:
  218                 cpu = MSR_OP_GET_CPUID(op);
  219                 CPU_SETOF(cpu, &set);
  220                 smp_rendezvous_cpus(set, smp_no_rendezvous_barrier,
  221                     x86_msr_op_one, smp_no_rendezvous_barrier, &a);
  222                 break;
  223         }
  224 }
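
/*
 * Editor's note: the sketch below is illustrative and not part of the
 * original file.  It shows how a caller might combine an MSR operation with
 * an execution mode.  The MSR and flag names are the ones used elsewhere in
 * this file; the explicit "cpu << 8" encoding simply mirrors the
 * MSR_OP_GET_CPUID() macro above and stands in for whatever encoding helper
 * the headers actually provide.
 */
#if 0	/* example only */
static void
x86_msr_op_example(void)
{
	uint64_t val;

	/* Set the SSBD bit on every CPU via an SMP rendezvous. */
	x86_msr_op(MSR_IA32_SPEC_CTRL, MSR_OP_OR | MSR_OP_RENDEZVOUS_ALL,
	    IA32_SPEC_CTRL_SSBD, NULL);

	/* Read the MSR on CPU 2 by temporarily binding the current thread. */
	x86_msr_op(MSR_IA32_SPEC_CTRL,
	    MSR_OP_READ | MSR_OP_SCHED_ONE | (2u << 8), 0, &val);
}
#endif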
  225 
  226 /*
   227  * Initialized automatically based on CPU model errata in cpu_idle_tun() below.
  228  */
  229 bool mwait_cpustop_broken = false;
  230 SYSCTL_BOOL(_machdep, OID_AUTO, mwait_cpustop_broken, CTLFLAG_RDTUN,
  231     &mwait_cpustop_broken, 0,
  232     "Can not reliably wake MONITOR/MWAIT cpus without interrupts");
  233 
  234 /*
  235  * Flush the D-cache for non-DMA I/O so that the I-cache can
  236  * be made coherent later.
  237  */
  238 void
  239 cpu_flush_dcache(void *ptr, size_t len)
  240 {
  241         /* Not applicable */
  242 }
  243 
  244 void
  245 acpi_cpu_c1(void)
  246 {
  247 
  248         __asm __volatile("sti; hlt");
  249 }
  250 
  251 /*
  252  * Use mwait to pause execution while waiting for an interrupt or
  253  * another thread to signal that there is more work.
  254  *
  255  * NOTE: Interrupts will cause a wakeup; however, this function does
   256  * not enable interrupt handling.  The caller is responsible for
   257  * enabling interrupts.
  258  */
  259 void
  260 acpi_cpu_idle_mwait(uint32_t mwait_hint)
  261 {
  262         int *state;
  263         uint64_t v;
  264 
  265         /*
   266          * A comment in a Linux patch claims that 'CPUs run faster with
  267          * speculation protection disabled. All CPU threads in a core
  268          * must disable speculation protection for it to be
  269          * disabled. Disable it while we are idle so the other
  270          * hyperthread can run fast.'
  271          *
  272          * XXXKIB.  Software coordination mode should be supported,
  273          * but all Intel CPUs provide hardware coordination.
  274          */
  275 
  276         state = &PCPU_PTR(monitorbuf)->idle_state;
  277         KASSERT(atomic_load_int(state) == STATE_SLEEPING,
  278             ("cpu_mwait_cx: wrong monitorbuf state"));
  279         atomic_store_int(state, STATE_MWAIT);
  280         if (PCPU_GET(ibpb_set) || hw_ssb_active) {
  281                 v = rdmsr(MSR_IA32_SPEC_CTRL);
  282                 wrmsr(MSR_IA32_SPEC_CTRL, v & ~(IA32_SPEC_CTRL_IBRS |
  283                     IA32_SPEC_CTRL_STIBP | IA32_SPEC_CTRL_SSBD));
  284         } else {
  285                 v = 0;
  286         }
  287         cpu_monitor(state, 0, 0);
  288         if (atomic_load_int(state) == STATE_MWAIT)
  289                 cpu_mwait(MWAIT_INTRBREAK, mwait_hint);
  290 
  291         /*
   292          * SSB cannot be disabled while we sleep; or rather, if it was
   293          * disabled, the sysctl thread will bind to our CPU to tweak the
   294          * MSR.
  295          */
  296         if (v != 0)
  297                 wrmsr(MSR_IA32_SPEC_CTRL, v);
  298 
  299         /*
  300          * We should exit on any event that interrupts mwait, because
  301          * that event might be a wanted interrupt.
  302          */
  303         atomic_store_int(state, STATE_RUNNING);
  304 }
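
/*
 * Editor's note on acpi_cpu_idle_mwait() above (not part of the original
 * file): per the Intel SDM, the mwait_hint passed in EAX selects the target
 * C-state in bits 7:4 and a sub-state in bits 3:0, while the MWAIT_INTRBREAK
 * extension bit in ECX asks the CPU to treat interrupts as break events even
 * though IF is clear here.  That is why the function can be entered and left
 * with interrupts disabled, as the NOTE above describes.
 */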
  305 
  306 /* Get current clock frequency for the given cpu id. */
  307 int
  308 cpu_est_clockrate(int cpu_id, uint64_t *rate)
  309 {
  310         uint64_t tsc1, tsc2;
  311         uint64_t acnt, mcnt, perf;
  312         register_t reg;
  313 
  314         if (pcpu_find(cpu_id) == NULL || rate == NULL)
  315                 return (EINVAL);
  316 #ifdef __i386__
  317         if ((cpu_feature & CPUID_TSC) == 0)
  318                 return (EOPNOTSUPP);
  319 #endif
  320 
  321         /*
  322          * If TSC is P-state invariant and APERF/MPERF MSRs do not exist,
  323          * DELAY(9) based logic fails.
  324          */
  325         if (tsc_is_invariant && !tsc_perf_stat)
  326                 return (EOPNOTSUPP);
  327 
  328 #ifdef SMP
  329         if (smp_cpus > 1) {
  330                 /* Schedule ourselves on the indicated cpu. */
  331                 thread_lock(curthread);
  332                 sched_bind(curthread, cpu_id);
  333                 thread_unlock(curthread);
  334         }
  335 #endif
  336 
  337         /* Calibrate by measuring a short delay. */
  338         reg = intr_disable();
  339         if (tsc_is_invariant) {
  340                 wrmsr(MSR_MPERF, 0);
  341                 wrmsr(MSR_APERF, 0);
  342                 tsc1 = rdtsc();
  343                 DELAY(1000);
  344                 mcnt = rdmsr(MSR_MPERF);
  345                 acnt = rdmsr(MSR_APERF);
  346                 tsc2 = rdtsc();
  347                 intr_restore(reg);
  348                 perf = 1000 * acnt / mcnt;
  349                 *rate = (tsc2 - tsc1) * perf;
  350         } else {
  351                 tsc1 = rdtsc();
  352                 DELAY(1000);
  353                 tsc2 = rdtsc();
  354                 intr_restore(reg);
  355                 *rate = (tsc2 - tsc1) * 1000;
  356         }
  357 
  358 #ifdef SMP
  359         if (smp_cpus > 1) {
  360                 thread_lock(curthread);
  361                 sched_unbind(curthread);
  362                 thread_unlock(curthread);
  363         }
  364 #endif
  365 
  366         return (0);
  367 }
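
/*
 * Editor's note (illustrative numbers, not from the original file): with
 * DELAY(1000) the measurement window is roughly 1 ms, so (tsc2 - tsc1) * 1000
 * approximates TSC ticks per second.  An invariant TSC ticks at the nominal
 * rate regardless of the current P-state, so the APERF/MPERF ratio scales the
 * result to the effective frequency.  For example, if the TSC advances
 * 3,000,000 counts in the window (3.0 GHz nominal) while APERF = 2,000,000
 * and MPERF = 3,000,000 (the core ran at 2/3 of nominal), then
 * perf = 1000 * 2,000,000 / 3,000,000 = 666 and
 * *rate = 3,000,000 * 666, roughly 2.0 GHz.
 */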
  368 
  369 /*
  370  * Shutdown the CPU as much as possible
  371  */
  372 void
  373 cpu_halt(void)
  374 {
  375         for (;;)
  376                 halt();
  377 }
  378 
  379 static void
  380 cpu_reset_real(void)
  381 {
  382         struct region_descriptor null_idt;
  383         int b;
  384 
  385         disable_intr();
  386 #ifdef CPU_ELAN
  387         if (elan_mmcr != NULL)
  388                 elan_mmcr->RESCFG = 1;
  389 #endif
  390 #ifdef __i386__
  391         if (cpu == CPU_GEODE1100) {
  392                 /* Attempt Geode's own reset */
  393                 outl(0xcf8, 0x80009044ul);
  394                 outl(0xcfc, 0xf);
  395         }
  396 #endif
  397 #if !defined(BROKEN_KEYBOARD_RESET)
  398         /*
  399          * Attempt to do a CPU reset via the keyboard controller,
  400          * do not turn off GateA20, as any machine that fails
  401          * to do the reset here would then end up in no man's land.
  402          */
  403         outb(IO_KBD + 4, 0xFE);
  404         DELAY(500000);  /* wait 0.5 sec to see if that did it */
  405 #endif
  406 
  407         /*
  408          * Attempt to force a reset via the Reset Control register at
  409          * I/O port 0xcf9.  Bit 2 forces a system reset when it
  410          * transitions from 0 to 1.  Bit 1 selects the type of reset
  411          * to attempt: 0 selects a "soft" reset, and 1 selects a
  412          * "hard" reset.  We try a "hard" reset.  The first write sets
  413          * bit 1 to select a "hard" reset and clears bit 2.  The
  414          * second write forces a 0 -> 1 transition in bit 2 to trigger
  415          * a reset.
  416          */
  417         outb(0xcf9, 0x2);
  418         outb(0xcf9, 0x6);
  419         DELAY(500000);  /* wait 0.5 sec to see if that did it */
  420 
  421         /*
  422          * Attempt to force a reset via the Fast A20 and Init register
  423          * at I/O port 0x92.  Bit 1 serves as an alternate A20 gate.
  424          * Bit 0 asserts INIT# when set to 1.  We are careful to only
  425          * preserve bit 1 while setting bit 0.  We also must clear bit
  426          * 0 before setting it if it isn't already clear.
  427          */
  428         b = inb(0x92);
  429         if (b != 0xff) {
  430                 if ((b & 0x1) != 0)
  431                         outb(0x92, b & 0xfe);
  432                 outb(0x92, b | 0x1);
  433                 DELAY(500000);  /* wait 0.5 sec to see if that did it */
  434         }
  435 
  436         printf("No known reset method worked, attempting CPU shutdown\n");
  437         DELAY(1000000); /* wait 1 sec for printf to complete */
  438 
  439         /* Wipe the IDT. */
  440         null_idt.rd_limit = 0;
  441         null_idt.rd_base = 0;
  442         lidt(&null_idt);
  443 
  444         /* "good night, sweet prince .... <THUNK!>" */
  445         breakpoint();
  446 
  447         /* NOTREACHED */
  448         while(1);
  449 }
  450 
  451 #ifdef SMP
  452 static void
  453 cpu_reset_proxy(void)
  454 {
  455 
  456         cpu_reset_proxy_active = 1;
  457         while (cpu_reset_proxy_active == 1)
  458                 ia32_pause(); /* Wait for other cpu to see that we've started */
  459 
  460         printf("cpu_reset_proxy: Stopped CPU %d\n", cpu_reset_proxyid);
  461         DELAY(1000000);
  462         cpu_reset_real();
  463 }
  464 #endif
  465 
  466 void
  467 cpu_reset(void)
  468 {
  469 #ifdef SMP
  470         struct monitorbuf *mb;
  471         cpuset_t map;
  472         u_int cnt;
  473 
  474         if (smp_started) {
  475                 map = all_cpus;
  476                 CPU_CLR(PCPU_GET(cpuid), &map);
  477                 CPU_ANDNOT(&map, &map, &stopped_cpus);
  478                 if (!CPU_EMPTY(&map)) {
  479                         printf("cpu_reset: Stopping other CPUs\n");
  480                         stop_cpus(map);
  481                 }
  482 
  483                 if (PCPU_GET(cpuid) != 0) {
  484                         cpu_reset_proxyid = PCPU_GET(cpuid);
  485                         cpustop_restartfunc = cpu_reset_proxy;
  486                         cpu_reset_proxy_active = 0;
  487                         printf("cpu_reset: Restarting BSP\n");
  488 
  489                         /* Restart CPU #0. */
  490                         CPU_SETOF(0, &started_cpus);
  491                         mb = &pcpu_find(0)->pc_monitorbuf;
  492                         atomic_store_int(&mb->stop_state,
  493                             MONITOR_STOPSTATE_RUNNING);
  494 
  495                         cnt = 0;
  496                         while (cpu_reset_proxy_active == 0 && cnt < 10000000) {
  497                                 ia32_pause();
  498                                 cnt++;  /* Wait for BSP to announce restart */
  499                         }
  500                         if (cpu_reset_proxy_active == 0) {
  501                                 printf("cpu_reset: Failed to restart BSP\n");
  502                         } else {
  503                                 cpu_reset_proxy_active = 2;
  504                                 while (1)
  505                                         ia32_pause();
  506                                 /* NOTREACHED */
  507                         }
  508                 }
  509         }
  510 #endif
  511         cpu_reset_real();
  512         /* NOTREACHED */
  513 }
  514 
  515 bool
  516 cpu_mwait_usable(void)
  517 {
  518 
  519         return ((cpu_feature2 & CPUID2_MON) != 0 && ((cpu_mon_mwait_flags &
  520             (CPUID5_MON_MWAIT_EXT | CPUID5_MWAIT_INTRBREAK)) ==
  521             (CPUID5_MON_MWAIT_EXT | CPUID5_MWAIT_INTRBREAK)));
  522 }
  523 
  524 void (*cpu_idle_hook)(sbintime_t) = NULL;       /* ACPI idle hook. */
  525 
  526 int cpu_amdc1e_bug = 0;                 /* AMD C1E APIC workaround required. */
  527 
  528 static int      idle_mwait = 1;         /* Use MONITOR/MWAIT for short idle. */
  529 SYSCTL_INT(_machdep, OID_AUTO, idle_mwait, CTLFLAG_RWTUN, &idle_mwait,
  530     0, "Use MONITOR/MWAIT for short idle");
  531 
  532 static bool
  533 cpu_idle_enter(int *statep, int newstate)
  534 {
  535         KASSERT(atomic_load_int(statep) == STATE_RUNNING,
  536             ("%s: state %d", __func__, atomic_load_int(statep)));
  537 
  538         /*
  539          * A fence is needed to prevent reordering of the load in
  540          * sched_runnable() with this store to the idle state word.  Without it,
  541          * cpu_idle_wakeup() can observe the state as STATE_RUNNING after having
  542          * added load to the queue, and elide an IPI.  Then, sched_runnable()
  543          * can observe tdq_load == 0, so the CPU ends up idling with pending
  544          * work.  tdq_notify() similarly ensures that a prior update to tdq_load
  545          * is visible before calling cpu_idle_wakeup().
  546          */
  547         atomic_store_int(statep, newstate);
  548 #if defined(SCHED_ULE) && defined(SMP)
  549         atomic_thread_fence_seq_cst();
  550 #endif
  551 
  552         /*
  553          * Since we may be in a critical section from cpu_idle(), if
  554          * an interrupt fires during that critical section we may have
  555          * a pending preemption.  If the CPU halts, then that thread
  556          * may not execute until a later interrupt awakens the CPU.
  557          * To handle this race, check for a runnable thread after
  558          * disabling interrupts and immediately return if one is
   559          * found.  Also, we must absolutely guarantee that hlt is
  560          * the next instruction after sti.  This ensures that any
  561          * interrupt that fires after the call to disable_intr() will
   562          * immediately awaken the CPU from hlt.  Finally, note that
   563          * this works on x86 because interrupt delivery is deferred
   564          * until the instruction following sti (here, hlt) executes,
   565          * while IF is set to 1 immediately, so the hlt instruction
   566          * can still acknowledge the interrupt.
  567          */
  568         disable_intr();
  569         if (sched_runnable()) {
  570                 enable_intr();
  571                 atomic_store_int(statep, STATE_RUNNING);
  572                 return (false);
  573         } else {
  574                 return (true);
  575         }
  576 }
  577 
  578 static void
  579 cpu_idle_exit(int *statep)
  580 {
  581         atomic_store_int(statep, STATE_RUNNING);
  582 }
  583 
  584 static void
  585 cpu_idle_acpi(sbintime_t sbt)
  586 {
  587         int *state;
  588 
  589         state = &PCPU_PTR(monitorbuf)->idle_state;
  590         if (cpu_idle_enter(state, STATE_SLEEPING)) {
  591                 if (cpu_idle_hook)
  592                         cpu_idle_hook(sbt);
  593                 else
  594                         acpi_cpu_c1();
  595                 cpu_idle_exit(state);
  596         }
  597 }
  598 
  599 static void
  600 cpu_idle_hlt(sbintime_t sbt)
  601 {
  602         int *state;
  603 
  604         state = &PCPU_PTR(monitorbuf)->idle_state;
  605         if (cpu_idle_enter(state, STATE_SLEEPING)) {
  606                 acpi_cpu_c1();
  607                 atomic_store_int(state, STATE_RUNNING);
  608         }
  609 }
  610 
  611 static void
  612 cpu_idle_mwait(sbintime_t sbt)
  613 {
  614         int *state;
  615 
  616         state = &PCPU_PTR(monitorbuf)->idle_state;
  617         if (cpu_idle_enter(state, STATE_MWAIT)) {
  618                 cpu_monitor(state, 0, 0);
  619                 if (atomic_load_int(state) == STATE_MWAIT)
  620                         __asm __volatile("sti; mwait" : : "a" (MWAIT_C1), "c" (0));
  621                 else
  622                         enable_intr();
  623                 cpu_idle_exit(state);
  624         }
  625 }
  626 
  627 static void
  628 cpu_idle_spin(sbintime_t sbt)
  629 {
  630         int *state;
  631         int i;
  632 
  633         state = &PCPU_PTR(monitorbuf)->idle_state;
  634         atomic_store_int(state, STATE_RUNNING);
  635 
  636         /*
   637          * The sched_runnable() call is racy, but since we are in a loop,
   638          * missing it one time will have little impact, if any
   639          * (and it is much better than not checking at all).
  640          */
  641         for (i = 0; i < 1000; i++) {
  642                 if (sched_runnable())
  643                         return;
  644                 cpu_spinwait();
  645         }
  646 }
  647 
  648 void (*cpu_idle_fn)(sbintime_t) = cpu_idle_acpi;
  649 
  650 void
  651 cpu_idle(int busy)
  652 {
  653         uint64_t msr;
  654         sbintime_t sbt = -1;
  655 
  656         CTR1(KTR_SPARE2, "cpu_idle(%d)", busy);
  657 
  658         /* If we are busy - try to use fast methods. */
  659         if (busy) {
  660                 if ((cpu_feature2 & CPUID2_MON) && idle_mwait) {
  661                         cpu_idle_mwait(busy);
  662                         goto out;
  663                 }
  664         }
  665 
  666         /* If we have time - switch timers into idle mode. */
  667         if (!busy) {
  668                 critical_enter();
  669                 sbt = cpu_idleclock();
  670         }
  671 
  672         /* Apply AMD APIC timer C1E workaround. */
  673         if (cpu_amdc1e_bug && cpu_disable_c3_sleep) {
  674                 msr = rdmsr(MSR_AMDK8_IPM);
  675                 if ((msr & (AMDK8_SMIONCMPHALT | AMDK8_C1EONCMPHALT)) != 0)
  676                         wrmsr(MSR_AMDK8_IPM, msr & ~(AMDK8_SMIONCMPHALT |
  677                             AMDK8_C1EONCMPHALT));
  678         }
  679 
  680         /* Call main idle method. */
  681         cpu_idle_fn(sbt);
  682 
  683         /* Switch timers back into active mode. */
  684         if (!busy) {
  685                 cpu_activeclock();
  686                 critical_exit();
  687         }
  688 out:
  689         CTR1(KTR_SPARE2, "cpu_idle(%d) done", busy);
  690 }
  691 
  692 static int cpu_idle_apl31_workaround;
  693 SYSCTL_INT(_machdep, OID_AUTO, idle_apl31, CTLFLAG_RW,
  694     &cpu_idle_apl31_workaround, 0,
  695     "Apollo Lake APL31 MWAIT bug workaround");
  696 
  697 int
  698 cpu_idle_wakeup(int cpu)
  699 {
  700         struct monitorbuf *mb;
  701         int *state;
  702 
  703         mb = &pcpu_find(cpu)->pc_monitorbuf;
  704         state = &mb->idle_state;
  705         switch (atomic_load_int(state)) {
  706         case STATE_SLEEPING:
  707                 return (0);
  708         case STATE_MWAIT:
  709                 atomic_store_int(state, STATE_RUNNING);
  710                 return (cpu_idle_apl31_workaround ? 0 : 1);
  711         case STATE_RUNNING:
  712                 return (1);
  713         default:
  714                 panic("bad monitor state");
  715                 return (1);
  716         }
  717 }
  718 
  719 /*
  720  * Ordered by speed/power consumption.
  721  */
  722 static struct {
  723         void    *id_fn;
  724         char    *id_name;
  725         int     id_cpuid2_flag;
  726 } idle_tbl[] = {
  727         { .id_fn = cpu_idle_spin, .id_name = "spin" },
  728         { .id_fn = cpu_idle_mwait, .id_name = "mwait",
  729             .id_cpuid2_flag = CPUID2_MON },
  730         { .id_fn = cpu_idle_hlt, .id_name = "hlt" },
  731         { .id_fn = cpu_idle_acpi, .id_name = "acpi" },
  732 };
  733 
  734 static int
  735 idle_sysctl_available(SYSCTL_HANDLER_ARGS)
  736 {
  737         char *avail, *p;
  738         int error;
  739         int i;
  740 
  741         avail = malloc(256, M_TEMP, M_WAITOK);
  742         p = avail;
  743         for (i = 0; i < nitems(idle_tbl); i++) {
  744                 if (idle_tbl[i].id_cpuid2_flag != 0 &&
  745                     (cpu_feature2 & idle_tbl[i].id_cpuid2_flag) == 0)
  746                         continue;
  747                 if (strcmp(idle_tbl[i].id_name, "acpi") == 0 &&
  748                     cpu_idle_hook == NULL)
  749                         continue;
  750                 p += sprintf(p, "%s%s", p != avail ? ", " : "",
  751                     idle_tbl[i].id_name);
  752         }
  753         error = sysctl_handle_string(oidp, avail, 0, req);
  754         free(avail, M_TEMP);
  755         return (error);
  756 }
  757 
  758 SYSCTL_PROC(_machdep, OID_AUTO, idle_available,
  759     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE,
  760     0, 0, idle_sysctl_available, "A",
  761     "list of available idle functions");
  762 
  763 static bool
  764 cpu_idle_selector(const char *new_idle_name)
  765 {
  766         int i;
  767 
  768         for (i = 0; i < nitems(idle_tbl); i++) {
  769                 if (idle_tbl[i].id_cpuid2_flag != 0 &&
  770                     (cpu_feature2 & idle_tbl[i].id_cpuid2_flag) == 0)
  771                         continue;
  772                 if (strcmp(idle_tbl[i].id_name, "acpi") == 0 &&
  773                     cpu_idle_hook == NULL)
  774                         continue;
  775                 if (strcmp(idle_tbl[i].id_name, new_idle_name))
  776                         continue;
  777                 cpu_idle_fn = idle_tbl[i].id_fn;
  778                 if (bootverbose)
  779                         printf("CPU idle set to %s\n", idle_tbl[i].id_name);
  780                 return (true);
  781         }
  782         return (false);
  783 }
  784 
  785 static int
  786 cpu_idle_sysctl(SYSCTL_HANDLER_ARGS)
  787 {
  788         char buf[16], *p;
  789         int error, i;
  790 
  791         p = "unknown";
  792         for (i = 0; i < nitems(idle_tbl); i++) {
  793                 if (idle_tbl[i].id_fn == cpu_idle_fn) {
  794                         p = idle_tbl[i].id_name;
  795                         break;
  796                 }
  797         }
  798         strncpy(buf, p, sizeof(buf));
  799         error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
  800         if (error != 0 || req->newptr == NULL)
  801                 return (error);
  802         return (cpu_idle_selector(buf) ? 0 : EINVAL);
  803 }
  804 
  805 SYSCTL_PROC(_machdep, OID_AUTO, idle,
  806     CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE,
  807     0, 0, cpu_idle_sysctl, "A",
  808     "currently selected idle function");
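
/*
 * Editor's note: the userland sketch below is illustrative and not part of
 * this kernel file.  It exercises the machdep.idle knob defined above through
 * sysctlbyname(3); it is equivalent to "sysctl machdep.idle=hlt" from the
 * shell and requires root to set a new value.
 */
#if 0	/* example only (userland) */
#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>
#include <string.h>

int
main(void)
{
	char cur[16];
	size_t len = sizeof(cur);

	/* Query the currently selected idle method. */
	if (sysctlbyname("machdep.idle", cur, &len, NULL, 0) == 0)
		printf("current idle method: %s\n", cur);

	/* Switch to the hlt-based idle method. */
	if (sysctlbyname("machdep.idle", NULL, NULL, "hlt",
	    strlen("hlt") + 1) != 0)
		perror("sysctlbyname");
	return (0);
}
#endif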
  809 
  810 static void
  811 cpu_idle_tun(void *unused __unused)
  812 {
  813         char tunvar[16];
  814 
  815         if (TUNABLE_STR_FETCH("machdep.idle", tunvar, sizeof(tunvar)))
  816                 cpu_idle_selector(tunvar);
  817         else if (cpu_vendor_id == CPU_VENDOR_AMD &&
  818             CPUID_TO_FAMILY(cpu_id) == 0x17 && CPUID_TO_MODEL(cpu_id) == 0x1) {
   819                 /* Ryzen errata 1057, 1109. */
  820                 cpu_idle_selector("hlt");
  821                 idle_mwait = 0;
  822                 mwait_cpustop_broken = true;
  823         }
  824 
  825         if (cpu_vendor_id == CPU_VENDOR_INTEL &&
  826             CPUID_TO_FAMILY(cpu_id) == 0x6 && CPUID_TO_MODEL(cpu_id) == 0x5c) {
  827                 /*
  828                  * Apollo Lake errata APL31 (public errata APL30).
  829                  * Stores to the armed address range may not trigger
  830                  * MWAIT to resume execution.  OS needs to use
  831                  * interrupts to wake processors from MWAIT-induced
  832                  * sleep states.
  833                  */
  834                 cpu_idle_apl31_workaround = 1;
  835                 mwait_cpustop_broken = true;
  836         }
  837         TUNABLE_INT_FETCH("machdep.idle_apl31", &cpu_idle_apl31_workaround);
  838 }
  839 SYSINIT(cpu_idle_tun, SI_SUB_CPU, SI_ORDER_MIDDLE, cpu_idle_tun, NULL);
  840 
  841 static int panic_on_nmi = 0xff;
  842 SYSCTL_INT(_machdep, OID_AUTO, panic_on_nmi, CTLFLAG_RWTUN,
  843     &panic_on_nmi, 0,
  844     "Panic on NMI: 1 = H/W failure; 2 = unknown; 0xff = all");
  845 int nmi_is_broadcast = 1;
  846 SYSCTL_INT(_machdep, OID_AUTO, nmi_is_broadcast, CTLFLAG_RWTUN,
  847     &nmi_is_broadcast, 0,
  848     "Chipset NMI is broadcast");
  849 int (*apei_nmi)(void);
  850 
  851 void
  852 nmi_call_kdb(u_int cpu, u_int type, struct trapframe *frame)
  853 {
  854         bool claimed = false;
  855 
  856 #ifdef DEV_ISA
  857         /* machine/parity/power fail/"kitchen sink" faults */
  858         if (isa_nmi(frame->tf_err)) {
  859                 claimed = true;
  860                 if ((panic_on_nmi & 1) != 0)
  861                         panic("NMI indicates hardware failure");
  862         }
  863 #endif /* DEV_ISA */
  864 
  865         /* ACPI Platform Error Interfaces callback. */
  866         if (apei_nmi != NULL && (*apei_nmi)())
  867                 claimed = true;
  868 
  869         /*
  870          * NMIs can be useful for debugging.  They can be hooked up to a
  871          * pushbutton, usually on an ISA, PCI, or PCIe card.  They can also be
  872          * generated by an IPMI BMC, either manually or in response to a
  873          * watchdog timeout.  For example, see the "power diag" command in
  874          * ports/sysutils/ipmitool.  They can also be generated by a
  875          * hypervisor; see "bhyvectl --inject-nmi".
  876          */
  877 
  878 #ifdef KDB
  879         if (!claimed && (panic_on_nmi & 2) != 0) {
  880                 if (debugger_on_panic) {
  881                         printf("NMI/cpu%d ... going to debugger\n", cpu);
  882                         claimed = kdb_trap(type, 0, frame);
  883                 }
  884         }
  885 #endif /* KDB */
  886 
  887         if (!claimed && panic_on_nmi != 0)
  888                 panic("NMI");
  889 }
  890 
  891 void
  892 nmi_handle_intr(u_int type, struct trapframe *frame)
  893 {
  894 
  895 #ifdef SMP
  896         if (nmi_is_broadcast) {
  897                 nmi_call_kdb_smp(type, frame);
  898                 return;
  899         }
  900 #endif
  901         nmi_call_kdb(PCPU_GET(cpuid), type, frame);
  902 }
  903 
  904 static int hw_ibrs_active;
  905 int hw_ibrs_ibpb_active;
  906 int hw_ibrs_disable = 1;
  907 
  908 SYSCTL_INT(_hw, OID_AUTO, ibrs_active, CTLFLAG_RD, &hw_ibrs_active, 0,
  909     "Indirect Branch Restricted Speculation active");
  910 
  911 SYSCTL_NODE(_machdep_mitigations, OID_AUTO, ibrs,
  912     CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
  913     "Indirect Branch Restricted Speculation active");
  914 
  915 SYSCTL_INT(_machdep_mitigations_ibrs, OID_AUTO, active, CTLFLAG_RD,
  916     &hw_ibrs_active, 0, "Indirect Branch Restricted Speculation active");
  917 
  918 void
  919 hw_ibrs_recalculate(bool for_all_cpus)
  920 {
  921         if ((cpu_ia32_arch_caps & IA32_ARCH_CAP_IBRS_ALL) != 0) {
  922                 x86_msr_op(MSR_IA32_SPEC_CTRL, (for_all_cpus ?
  923                     MSR_OP_RENDEZVOUS_ALL : MSR_OP_LOCAL) |
  924                     (hw_ibrs_disable != 0 ? MSR_OP_ANDNOT : MSR_OP_OR),
  925                     IA32_SPEC_CTRL_IBRS, NULL);
  926                 hw_ibrs_active = hw_ibrs_disable == 0;
  927                 hw_ibrs_ibpb_active = 0;
  928         } else {
  929                 hw_ibrs_active = hw_ibrs_ibpb_active = (cpu_stdext_feature3 &
  930                     CPUID_STDEXT3_IBPB) != 0 && !hw_ibrs_disable;
  931         }
  932 }
  933 
  934 static int
  935 hw_ibrs_disable_handler(SYSCTL_HANDLER_ARGS)
  936 {
  937         int error, val;
  938 
  939         val = hw_ibrs_disable;
  940         error = sysctl_handle_int(oidp, &val, 0, req);
  941         if (error != 0 || req->newptr == NULL)
  942                 return (error);
  943         hw_ibrs_disable = val != 0;
  944         hw_ibrs_recalculate(true);
  945         return (0);
  946 }
  947 SYSCTL_PROC(_hw, OID_AUTO, ibrs_disable, CTLTYPE_INT | CTLFLAG_RWTUN |
  948     CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, NULL, 0, hw_ibrs_disable_handler, "I",
  949     "Disable Indirect Branch Restricted Speculation");
  950 
  951 SYSCTL_PROC(_machdep_mitigations_ibrs, OID_AUTO, disable, CTLTYPE_INT |
  952     CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, NULL, 0,
  953     hw_ibrs_disable_handler, "I",
  954     "Disable Indirect Branch Restricted Speculation");
  955 
  956 int hw_ssb_active;
  957 int hw_ssb_disable;
  958 
  959 SYSCTL_INT(_hw, OID_AUTO, spec_store_bypass_disable_active, CTLFLAG_RD,
  960     &hw_ssb_active, 0,
  961     "Speculative Store Bypass Disable active");
  962 
  963 SYSCTL_NODE(_machdep_mitigations, OID_AUTO, ssb,
  964     CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
  965     "Speculative Store Bypass Disable active");
  966 
  967 SYSCTL_INT(_machdep_mitigations_ssb, OID_AUTO, active, CTLFLAG_RD,
  968     &hw_ssb_active, 0, "Speculative Store Bypass Disable active");
  969 
  970 static void
  971 hw_ssb_set(bool enable, bool for_all_cpus)
  972 {
  973 
  974         if ((cpu_stdext_feature3 & CPUID_STDEXT3_SSBD) == 0) {
  975                 hw_ssb_active = 0;
  976                 return;
  977         }
  978         hw_ssb_active = enable;
  979         x86_msr_op(MSR_IA32_SPEC_CTRL,
  980             (enable ? MSR_OP_OR : MSR_OP_ANDNOT) |
  981             (for_all_cpus ? MSR_OP_SCHED_ALL : MSR_OP_LOCAL),
  982             IA32_SPEC_CTRL_SSBD, NULL);
  983 }
  984 
  985 void
  986 hw_ssb_recalculate(bool all_cpus)
  987 {
  988 
  989         switch (hw_ssb_disable) {
  990         default:
  991                 hw_ssb_disable = 0;
  992                 /* FALLTHROUGH */
  993         case 0: /* off */
  994                 hw_ssb_set(false, all_cpus);
  995                 break;
  996         case 1: /* on */
  997                 hw_ssb_set(true, all_cpus);
  998                 break;
  999         case 2: /* auto */
 1000                 hw_ssb_set((cpu_ia32_arch_caps & IA32_ARCH_CAP_SSB_NO) != 0 ?
 1001                     false : true, all_cpus);
 1002                 break;
 1003         }
 1004 }
 1005 
 1006 static int
 1007 hw_ssb_disable_handler(SYSCTL_HANDLER_ARGS)
 1008 {
 1009         int error, val;
 1010 
 1011         val = hw_ssb_disable;
 1012         error = sysctl_handle_int(oidp, &val, 0, req);
 1013         if (error != 0 || req->newptr == NULL)
 1014                 return (error);
 1015         hw_ssb_disable = val;
 1016         hw_ssb_recalculate(true);
 1017         return (0);
 1018 }
 1019 SYSCTL_PROC(_hw, OID_AUTO, spec_store_bypass_disable, CTLTYPE_INT |
 1020     CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, NULL, 0,
 1021     hw_ssb_disable_handler, "I",
 1022     "Speculative Store Bypass Disable (0 - off, 1 - on, 2 - auto)");
 1023 
 1024 SYSCTL_PROC(_machdep_mitigations_ssb, OID_AUTO, disable, CTLTYPE_INT |
 1025     CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, NULL, 0,
 1026     hw_ssb_disable_handler, "I",
 1027     "Speculative Store Bypass Disable (0 - off, 1 - on, 2 - auto)");
 1028 
 1029 int hw_mds_disable;
 1030 
 1031 /*
 1032  * Handler for Microarchitectural Data Sampling issues.  Really not a
  1033  * pointer to a C function: on amd64 the code must not change any CPU
 1034  * architectural state except possibly %rflags. Also, it is always
 1035  * called with interrupts disabled.
 1036  */
 1037 void mds_handler_void(void);
 1038 void mds_handler_verw(void);
 1039 void mds_handler_ivb(void);
 1040 void mds_handler_bdw(void);
 1041 void mds_handler_skl_sse(void);
 1042 void mds_handler_skl_avx(void);
 1043 void mds_handler_skl_avx512(void);
 1044 void mds_handler_silvermont(void);
 1045 void (*mds_handler)(void) = mds_handler_void;
 1046 
 1047 static int
 1048 sysctl_hw_mds_disable_state_handler(SYSCTL_HANDLER_ARGS)
 1049 {
 1050         const char *state;
 1051 
 1052         if (mds_handler == mds_handler_void)
 1053                 state = "inactive";
 1054         else if (mds_handler == mds_handler_verw)
 1055                 state = "VERW";
 1056         else if (mds_handler == mds_handler_ivb)
 1057                 state = "software IvyBridge";
 1058         else if (mds_handler == mds_handler_bdw)
 1059                 state = "software Broadwell";
 1060         else if (mds_handler == mds_handler_skl_sse)
 1061                 state = "software Skylake SSE";
 1062         else if (mds_handler == mds_handler_skl_avx)
 1063                 state = "software Skylake AVX";
 1064         else if (mds_handler == mds_handler_skl_avx512)
 1065                 state = "software Skylake AVX512";
 1066         else if (mds_handler == mds_handler_silvermont)
 1067                 state = "software Silvermont";
 1068         else
 1069                 state = "unknown";
 1070         return (SYSCTL_OUT(req, state, strlen(state)));
 1071 }
 1072 
 1073 SYSCTL_PROC(_hw, OID_AUTO, mds_disable_state,
 1074     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
 1075     sysctl_hw_mds_disable_state_handler, "A",
 1076     "Microarchitectural Data Sampling Mitigation state");
 1077 
 1078 SYSCTL_NODE(_machdep_mitigations, OID_AUTO, mds,
 1079     CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
 1080     "Microarchitectural Data Sampling Mitigation state");
 1081 
 1082 SYSCTL_PROC(_machdep_mitigations_mds, OID_AUTO, state,
 1083     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
 1084     sysctl_hw_mds_disable_state_handler, "A",
 1085     "Microarchitectural Data Sampling Mitigation state");
 1086 
 1087 _Static_assert(__offsetof(struct pcpu, pc_mds_tmp) % 64 == 0, "MDS AVX512");
 1088 
 1089 void
 1090 hw_mds_recalculate(void)
 1091 {
 1092         struct pcpu *pc;
 1093         vm_offset_t b64;
 1094         u_long xcr0;
 1095         int i;
 1096 
 1097         /*
 1098          * Allow user to force VERW variant even if MD_CLEAR is not
  1099          * reported.  For instance, a hypervisor might unknowingly
  1100          * filter the cap out.
  1101          * For similar reasons, and for testing, allow enabling the
  1102          * mitigation even when the MDS_NO cap is set.
 1103          */
 1104         if (cpu_vendor_id != CPU_VENDOR_INTEL || hw_mds_disable == 0 ||
 1105             ((cpu_ia32_arch_caps & IA32_ARCH_CAP_MDS_NO) != 0 &&
 1106             hw_mds_disable == 3)) {
 1107                 mds_handler = mds_handler_void;
 1108         } else if (((cpu_stdext_feature3 & CPUID_STDEXT3_MD_CLEAR) != 0 &&
 1109             hw_mds_disable == 3) || hw_mds_disable == 1) {
 1110                 mds_handler = mds_handler_verw;
 1111         } else if (CPUID_TO_FAMILY(cpu_id) == 0x6 &&
 1112             (CPUID_TO_MODEL(cpu_id) == 0x2e || CPUID_TO_MODEL(cpu_id) == 0x1e ||
 1113             CPUID_TO_MODEL(cpu_id) == 0x1f || CPUID_TO_MODEL(cpu_id) == 0x1a ||
 1114             CPUID_TO_MODEL(cpu_id) == 0x2f || CPUID_TO_MODEL(cpu_id) == 0x25 ||
 1115             CPUID_TO_MODEL(cpu_id) == 0x2c || CPUID_TO_MODEL(cpu_id) == 0x2d ||
 1116             CPUID_TO_MODEL(cpu_id) == 0x2a || CPUID_TO_MODEL(cpu_id) == 0x3e ||
 1117             CPUID_TO_MODEL(cpu_id) == 0x3a) &&
 1118             (hw_mds_disable == 2 || hw_mds_disable == 3)) {
 1119                 /*
 1120                  * Nehalem, SandyBridge, IvyBridge
 1121                  */
 1122                 CPU_FOREACH(i) {
 1123                         pc = pcpu_find(i);
 1124                         if (pc->pc_mds_buf == NULL) {
 1125                                 pc->pc_mds_buf = malloc_domainset(672, M_TEMP,
 1126                                     DOMAINSET_PREF(pc->pc_domain), M_WAITOK);
 1127                                 bzero(pc->pc_mds_buf, 16);
 1128                         }
 1129                 }
 1130                 mds_handler = mds_handler_ivb;
 1131         } else if (CPUID_TO_FAMILY(cpu_id) == 0x6 &&
 1132             (CPUID_TO_MODEL(cpu_id) == 0x3f || CPUID_TO_MODEL(cpu_id) == 0x3c ||
 1133             CPUID_TO_MODEL(cpu_id) == 0x45 || CPUID_TO_MODEL(cpu_id) == 0x46 ||
 1134             CPUID_TO_MODEL(cpu_id) == 0x56 || CPUID_TO_MODEL(cpu_id) == 0x4f ||
 1135             CPUID_TO_MODEL(cpu_id) == 0x47 || CPUID_TO_MODEL(cpu_id) == 0x3d) &&
 1136             (hw_mds_disable == 2 || hw_mds_disable == 3)) {
 1137                 /*
 1138                  * Haswell, Broadwell
 1139                  */
 1140                 CPU_FOREACH(i) {
 1141                         pc = pcpu_find(i);
 1142                         if (pc->pc_mds_buf == NULL) {
 1143                                 pc->pc_mds_buf = malloc_domainset(1536, M_TEMP,
 1144                                     DOMAINSET_PREF(pc->pc_domain), M_WAITOK);
 1145                                 bzero(pc->pc_mds_buf, 16);
 1146                         }
 1147                 }
 1148                 mds_handler = mds_handler_bdw;
 1149         } else if (CPUID_TO_FAMILY(cpu_id) == 0x6 &&
 1150             ((CPUID_TO_MODEL(cpu_id) == 0x55 && (cpu_id &
 1151             CPUID_STEPPING) <= 5) ||
 1152             CPUID_TO_MODEL(cpu_id) == 0x4e || CPUID_TO_MODEL(cpu_id) == 0x5e ||
 1153             (CPUID_TO_MODEL(cpu_id) == 0x8e && (cpu_id &
 1154             CPUID_STEPPING) <= 0xb) ||
 1155             (CPUID_TO_MODEL(cpu_id) == 0x9e && (cpu_id &
 1156             CPUID_STEPPING) <= 0xc)) &&
 1157             (hw_mds_disable == 2 || hw_mds_disable == 3)) {
 1158                 /*
 1159                  * Skylake, KabyLake, CoffeeLake, WhiskeyLake,
 1160                  * CascadeLake
 1161                  */
 1162                 CPU_FOREACH(i) {
 1163                         pc = pcpu_find(i);
 1164                         if (pc->pc_mds_buf == NULL) {
 1165                                 pc->pc_mds_buf = malloc_domainset(6 * 1024,
 1166                                     M_TEMP, DOMAINSET_PREF(pc->pc_domain),
 1167                                     M_WAITOK);
 1168                                 b64 = (vm_offset_t)malloc_domainset(64 + 63,
 1169                                     M_TEMP, DOMAINSET_PREF(pc->pc_domain),
 1170                                     M_WAITOK);
 1171                                 pc->pc_mds_buf64 = (void *)roundup2(b64, 64);
 1172                                 bzero(pc->pc_mds_buf64, 64);
 1173                         }
 1174                 }
 1175                 xcr0 = rxcr(0);
 1176                 if ((xcr0 & XFEATURE_ENABLED_ZMM_HI256) != 0 &&
 1177                     (cpu_stdext_feature & CPUID_STDEXT_AVX512DQ) != 0)
 1178                         mds_handler = mds_handler_skl_avx512;
 1179                 else if ((xcr0 & XFEATURE_ENABLED_AVX) != 0 &&
 1180                     (cpu_feature2 & CPUID2_AVX) != 0)
 1181                         mds_handler = mds_handler_skl_avx;
 1182                 else
 1183                         mds_handler = mds_handler_skl_sse;
 1184         } else if (CPUID_TO_FAMILY(cpu_id) == 0x6 &&
 1185             ((CPUID_TO_MODEL(cpu_id) == 0x37 ||
 1186             CPUID_TO_MODEL(cpu_id) == 0x4a ||
 1187             CPUID_TO_MODEL(cpu_id) == 0x4c ||
 1188             CPUID_TO_MODEL(cpu_id) == 0x4d ||
 1189             CPUID_TO_MODEL(cpu_id) == 0x5a ||
 1190             CPUID_TO_MODEL(cpu_id) == 0x5d ||
 1191             CPUID_TO_MODEL(cpu_id) == 0x6e ||
 1192             CPUID_TO_MODEL(cpu_id) == 0x65 ||
 1193             CPUID_TO_MODEL(cpu_id) == 0x75 ||
 1194             CPUID_TO_MODEL(cpu_id) == 0x1c ||
 1195             CPUID_TO_MODEL(cpu_id) == 0x26 ||
 1196             CPUID_TO_MODEL(cpu_id) == 0x27 ||
 1197             CPUID_TO_MODEL(cpu_id) == 0x35 ||
 1198             CPUID_TO_MODEL(cpu_id) == 0x36 ||
 1199             CPUID_TO_MODEL(cpu_id) == 0x7a))) {
 1200                 /* Silvermont, Airmont */
 1201                 CPU_FOREACH(i) {
 1202                         pc = pcpu_find(i);
 1203                         if (pc->pc_mds_buf == NULL)
 1204                                 pc->pc_mds_buf = malloc(256, M_TEMP, M_WAITOK);
 1205                 }
 1206                 mds_handler = mds_handler_silvermont;
 1207         } else {
 1208                 hw_mds_disable = 0;
 1209                 mds_handler = mds_handler_void;
 1210         }
 1211 }
 1212 
 1213 static void
 1214 hw_mds_recalculate_boot(void *arg __unused)
 1215 {
 1216 
 1217         hw_mds_recalculate();
 1218 }
 1219 SYSINIT(mds_recalc, SI_SUB_SMP, SI_ORDER_ANY, hw_mds_recalculate_boot, NULL);
 1220 
 1221 static int
 1222 sysctl_mds_disable_handler(SYSCTL_HANDLER_ARGS)
 1223 {
 1224         int error, val;
 1225 
 1226         val = hw_mds_disable;
 1227         error = sysctl_handle_int(oidp, &val, 0, req);
 1228         if (error != 0 || req->newptr == NULL)
 1229                 return (error);
 1230         if (val < 0 || val > 3)
 1231                 return (EINVAL);
 1232         hw_mds_disable = val;
 1233         hw_mds_recalculate();
 1234         return (0);
 1235 }
 1236 
 1237 SYSCTL_PROC(_hw, OID_AUTO, mds_disable, CTLTYPE_INT |
 1238     CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, NULL, 0,
 1239     sysctl_mds_disable_handler, "I",
 1240     "Microarchitectural Data Sampling Mitigation "
 1241     "(0 - off, 1 - on VERW, 2 - on SW, 3 - on AUTO)");
 1242 
 1243 SYSCTL_PROC(_machdep_mitigations_mds, OID_AUTO, disable, CTLTYPE_INT |
 1244     CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, NULL, 0,
 1245     sysctl_mds_disable_handler, "I",
 1246     "Microarchitectural Data Sampling Mitigation "
 1247     "(0 - off, 1 - on VERW, 2 - on SW, 3 - on AUTO)");
 1248 
 1249 /*
 1250  * Intel Transactional Memory Asynchronous Abort Mitigation
 1251  * CVE-2019-11135
 1252  */
 1253 int x86_taa_enable;
 1254 int x86_taa_state;
 1255 enum {
 1256         TAA_NONE        = 0,    /* No mitigation enabled */
 1257         TAA_TSX_DISABLE = 1,    /* Disable TSX via MSR */
 1258         TAA_VERW        = 2,    /* Use VERW mitigation */
 1259         TAA_AUTO        = 3,    /* Automatically select the mitigation */
 1260 
 1261         /* The states below are not selectable by the operator */
 1262 
 1263         TAA_TAA_UC      = 4,    /* Mitigation present in microcode */
 1264         TAA_NOT_PRESENT = 5     /* TSX is not present */
 1265 };
 1266 
 1267 static void
 1268 taa_set(bool enable, bool all)
 1269 {
 1270 
 1271         x86_msr_op(MSR_IA32_TSX_CTRL,
 1272             (enable ? MSR_OP_OR : MSR_OP_ANDNOT) |
 1273             (all ? MSR_OP_RENDEZVOUS_ALL : MSR_OP_LOCAL),
 1274             IA32_TSX_CTRL_RTM_DISABLE | IA32_TSX_CTRL_TSX_CPUID_CLEAR,
 1275             NULL);
 1276 }
 1277 
 1278 void
 1279 x86_taa_recalculate(void)
 1280 {
 1281         static int taa_saved_mds_disable = 0;
 1282         int taa_need = 0, taa_state = 0;
 1283         int mds_disable = 0, need_mds_recalc = 0;
 1284 
 1285         /* Check CPUID.07h.EBX.HLE and RTM for the presence of TSX */
 1286         if ((cpu_stdext_feature & CPUID_STDEXT_HLE) == 0 ||
 1287             (cpu_stdext_feature & CPUID_STDEXT_RTM) == 0) {
 1288                 /* TSX is not present */
 1289                 x86_taa_state = TAA_NOT_PRESENT;
 1290                 return;
 1291         }
 1292 
 1293         /* Check to see what mitigation options the CPU gives us */
 1294         if (cpu_ia32_arch_caps & IA32_ARCH_CAP_TAA_NO) {
  1295                 /* CPU is not susceptible to TAA */
 1296                 taa_need = TAA_TAA_UC;
 1297         } else if (cpu_ia32_arch_caps & IA32_ARCH_CAP_TSX_CTRL) {
 1298                 /*
 1299                  * CPU can turn off TSX.  This is the next best option
 1300                  * if TAA_NO hardware mitigation isn't present
 1301                  */
 1302                 taa_need = TAA_TSX_DISABLE;
 1303         } else {
 1304                 /* No TSX/TAA specific remedies are available. */
 1305                 if (x86_taa_enable == TAA_TSX_DISABLE) {
 1306                         if (bootverbose)
 1307                                 printf("TSX control not available\n");
 1308                         return;
 1309                 } else
 1310                         taa_need = TAA_VERW;
 1311         }
 1312 
 1313         /* Can we automatically take action, or are we being forced? */
 1314         if (x86_taa_enable == TAA_AUTO)
 1315                 taa_state = taa_need;
 1316         else
 1317                 taa_state = x86_taa_enable;
 1318 
 1319         /* No state change, nothing to do */
 1320         if (taa_state == x86_taa_state) {
 1321                 if (bootverbose)
 1322                         printf("No TSX change made\n");
 1323                 return;
 1324         }
 1325 
 1326         /* Does the MSR need to be turned on or off? */
 1327         if (taa_state == TAA_TSX_DISABLE)
 1328                 taa_set(true, true);
 1329         else if (x86_taa_state == TAA_TSX_DISABLE)
 1330                 taa_set(false, true);
 1331 
 1332         /* Does MDS need to be set to turn on VERW? */
 1333         if (taa_state == TAA_VERW) {
 1334                 taa_saved_mds_disable = hw_mds_disable;
 1335                 mds_disable = hw_mds_disable = 1;
 1336                 need_mds_recalc = 1;
 1337         } else if (x86_taa_state == TAA_VERW) {
 1338                 mds_disable = hw_mds_disable = taa_saved_mds_disable;
 1339                 need_mds_recalc = 1;
 1340         }
 1341         if (need_mds_recalc) {
 1342                 hw_mds_recalculate();
 1343                 if (mds_disable != hw_mds_disable) {
 1344                         if (bootverbose)
 1345                                 printf("Cannot change MDS state for TAA\n");
 1346                         /* Don't update our state */
 1347                         return;
 1348                 }
 1349         }
 1350 
 1351         x86_taa_state = taa_state;
 1352         return;
 1353 }
 1354 
 1355 static void
 1356 taa_recalculate_boot(void * arg __unused)
 1357 {
 1358 
 1359         x86_taa_recalculate();
 1360 }
 1361 SYSINIT(taa_recalc, SI_SUB_SMP, SI_ORDER_ANY, taa_recalculate_boot, NULL);
 1362 
 1363 SYSCTL_NODE(_machdep_mitigations, OID_AUTO, taa,
 1364     CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
 1365     "TSX Asynchronous Abort Mitigation");
 1366 
 1367 static int
 1368 sysctl_taa_handler(SYSCTL_HANDLER_ARGS)
 1369 {
 1370         int error, val;
 1371 
 1372         val = x86_taa_enable;
 1373         error = sysctl_handle_int(oidp, &val, 0, req);
 1374         if (error != 0 || req->newptr == NULL)
 1375                 return (error);
 1376         if (val < TAA_NONE || val > TAA_AUTO)
 1377                 return (EINVAL);
 1378         x86_taa_enable = val;
 1379         x86_taa_recalculate();
 1380         return (0);
 1381 }
 1382 
 1383 SYSCTL_PROC(_machdep_mitigations_taa, OID_AUTO, enable, CTLTYPE_INT |
 1384     CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, NULL, 0,
 1385     sysctl_taa_handler, "I",
 1386     "TAA Mitigation enablement control "
 1387     "(0 - off, 1 - disable TSX, 2 - VERW, 3 - on AUTO)");
 1388 
 1389 static int
 1390 sysctl_taa_state_handler(SYSCTL_HANDLER_ARGS)
 1391 {
 1392         const char *state;
 1393 
 1394         switch (x86_taa_state) {
 1395         case TAA_NONE:
 1396                 state = "inactive";
 1397                 break;
 1398         case TAA_TSX_DISABLE:
 1399                 state = "TSX disabled";
 1400                 break;
 1401         case TAA_VERW:
 1402                 state = "VERW";
 1403                 break;
 1404         case TAA_TAA_UC:
 1405                 state = "Mitigated in microcode";
 1406                 break;
 1407         case TAA_NOT_PRESENT:
 1408                 state = "TSX not present";
 1409                 break;
 1410         default:
 1411                 state = "unknown";
 1412         }
 1413 
 1414         return (SYSCTL_OUT(req, state, strlen(state)));
 1415 }
 1416 
 1417 SYSCTL_PROC(_machdep_mitigations_taa, OID_AUTO, state,
 1418     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
 1419     sysctl_taa_state_handler, "A",
 1420     "TAA Mitigation state");
 1421 
 1422 int __read_frequently cpu_flush_rsb_ctxsw;
 1423 SYSCTL_INT(_machdep_mitigations, OID_AUTO, flush_rsb_ctxsw,
 1424     CTLFLAG_RW | CTLFLAG_NOFETCH, &cpu_flush_rsb_ctxsw, 0,
 1425     "Flush Return Stack Buffer on context switch");
 1426 
 1427 SYSCTL_NODE(_machdep_mitigations, OID_AUTO, rngds,
 1428     CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
 1429     "MCU Optimization, disable RDSEED mitigation");
 1430 
 1431 int x86_rngds_mitg_enable = 1;
 1432 void
 1433 x86_rngds_mitg_recalculate(bool all_cpus)
 1434 {
 1435         if ((cpu_stdext_feature3 & CPUID_STDEXT3_MCUOPT) == 0)
 1436                 return;
 1437         x86_msr_op(MSR_IA32_MCU_OPT_CTRL,
 1438             (x86_rngds_mitg_enable ? MSR_OP_OR : MSR_OP_ANDNOT) |
 1439             (all_cpus ? MSR_OP_RENDEZVOUS_ALL : MSR_OP_LOCAL),
 1440             IA32_RNGDS_MITG_DIS, NULL);
 1441 }
 1442 
 1443 static int
 1444 sysctl_rngds_mitg_enable_handler(SYSCTL_HANDLER_ARGS)
 1445 {
 1446         int error, val;
 1447 
 1448         val = x86_rngds_mitg_enable;
 1449         error = sysctl_handle_int(oidp, &val, 0, req);
 1450         if (error != 0 || req->newptr == NULL)
 1451                 return (error);
 1452         x86_rngds_mitg_enable = val;
 1453         x86_rngds_mitg_recalculate(true);
 1454         return (0);
 1455 }
 1456 SYSCTL_PROC(_machdep_mitigations_rngds, OID_AUTO, enable, CTLTYPE_INT |
 1457     CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, NULL, 0,
 1458     sysctl_rngds_mitg_enable_handler, "I",
 1459     "MCU Optimization, disabling RDSEED mitigation control "
 1460     "(0 - mitigation disabled (RDSEED optimized), 1 - mitigation enabled)");
 1461 
 1462 static int
 1463 sysctl_rngds_state_handler(SYSCTL_HANDLER_ARGS)
 1464 {
 1465         const char *state;
 1466 
 1467         if ((cpu_stdext_feature3 & CPUID_STDEXT3_MCUOPT) == 0) {
 1468                 state = "Not applicable";
 1469         } else if (x86_rngds_mitg_enable == 0) {
 1470                 state = "RDSEED not serialized";
 1471         } else {
 1472                 state = "Mitigated";
 1473         }
 1474         return (SYSCTL_OUT(req, state, strlen(state)));
 1475 }
 1476 SYSCTL_PROC(_machdep_mitigations_rngds, OID_AUTO, state,
 1477     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
 1478     sysctl_rngds_state_handler, "A",
 1479     "MCU Optimization state");
 1480 
 1481 /*
 1482  * Enable and restore kernel text write permissions.
  1483  * Callers must ensure that disable_wp()/restore_wp() are executed on
  1484  * the same core, without rescheduling in between.
 1485  */
 1486 bool
 1487 disable_wp(void)
 1488 {
 1489         u_int cr0;
 1490 
 1491         cr0 = rcr0();
 1492         if ((cr0 & CR0_WP) == 0)
 1493                 return (false);
 1494         load_cr0(cr0 & ~CR0_WP);
 1495         return (true);
 1496 }
 1497 
 1498 void
 1499 restore_wp(bool old_wp)
 1500 {
 1501 
 1502         if (old_wp)
 1503                 load_cr0(rcr0() | CR0_WP);
 1504 }
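
/*
 * Editor's note: the sketch below is illustrative and not part of the
 * original file.  It shows the intended pairing of disable_wp() and
 * restore_wp(); a critical section is one way to keep the thread on a single
 * core so the saved CR0.WP state is restored on the same CPU.  The patch
 * target and value are placeholders.
 */
#if 0	/* example only */
static void
patch_kernel_text_byte(uint8_t *target, uint8_t newval)
{
	bool wp;

	critical_enter();
	wp = disable_wp();	/* true iff WP was set and has been cleared */
	*target = newval;	/* write to otherwise read-only kernel text */
	restore_wp(wp);		/* re-set WP only if we cleared it */
	critical_exit();
}
#endif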
 1505 
 1506 bool
 1507 acpi_get_fadt_bootflags(uint16_t *flagsp)
 1508 {
 1509 #ifdef DEV_ACPI
 1510         ACPI_TABLE_FADT *fadt;
 1511         vm_paddr_t physaddr;
 1512 
 1513         physaddr = acpi_find_table(ACPI_SIG_FADT);
 1514         if (physaddr == 0)
 1515                 return (false);
 1516         fadt = acpi_map_table(physaddr, ACPI_SIG_FADT);
 1517         if (fadt == NULL)
 1518                 return (false);
 1519         *flagsp = fadt->BootFlags;
 1520         acpi_unmap_table(fadt);
 1521         return (true);
 1522 #else
 1523         return (false);
 1524 #endif
 1525 }
 1526 
 1527 DEFINE_IFUNC(, uint64_t, rdtsc_ordered, (void))
 1528 {
 1529         bool cpu_is_amd = cpu_vendor_id == CPU_VENDOR_AMD ||
 1530             cpu_vendor_id == CPU_VENDOR_HYGON;
 1531 
 1532         if ((amd_feature & AMDID_RDTSCP) != 0)
 1533                 return (rdtscp);
 1534         else if ((cpu_feature & CPUID_SSE2) != 0)
 1535                 return (cpu_is_amd ? rdtsc_ordered_mfence :
 1536                     rdtsc_ordered_lfence);
 1537         else
 1538                 return (rdtsc);
 1539 }
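
/*
 * Editor's note: the sketch below is illustrative and not part of the
 * original file.  Once the ifunc above has been resolved, rdtsc_ordered() is
 * called like an ordinary function; the serialized variants keep the
 * timestamp reads from being reordered around the measured region.
 */
#if 0	/* example only */
static uint64_t
measure_cycles(void (*fn)(void))
{
	uint64_t start, end;

	start = rdtsc_ordered();
	fn();
	end = rdtsc_ordered();
	return (end - start);
}
#endif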

This page is part of the FreeBSD/Linux Kernel Cross-Reference and was automatically generated using a modified version of the LXR engine.