/*-
 * Copyright (c) 2003 Peter Wemm.
 * Copyright (c) 1992 Terrence R. Lambert.
 * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * William Jolitz.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	from: @(#)machdep.c	7.4 (Berkeley) 6/3/91
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_atpic.h"
#include "opt_compat.h"
#include "opt_cpu.h"
#include "opt_ddb.h"
#include "opt_inet.h"
#include "opt_isa.h"
#include "opt_kdb.h"
#include "opt_kstack_pages.h"
#include "opt_maxmem.h"
#include "opt_mp_watchdog.h"
#include "opt_perfmon.h"
#include "opt_platform.h"
#ifdef __i386__
#include "opt_apic.h"
#include "opt_xbox.h"
#endif

#include <sys/param.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <sys/bus.h>
#include <sys/cpu.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/pcpu.h>
#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/sysctl.h>

#include <machine/clock.h>
#include <machine/cpu.h>
#include <machine/cputypes.h>
#include <machine/specialreg.h>
#include <machine/md_var.h>
#include <machine/mp_watchdog.h>
#ifdef PERFMON
#include <machine/perfmon.h>
#endif
#include <machine/tss.h>
#ifdef SMP
#include <machine/smp.h>
#endif
#ifdef CPU_ELAN
#include <machine/elan_mmcr.h>
#endif
#include <x86/acpica_machdep.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_kern.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_pager.h>
#include <vm/vm_param.h>

#ifndef PC98
#include <isa/isareg.h>
#endif

#define	STATE_RUNNING	0x0
#define	STATE_MWAIT	0x1
#define	STATE_SLEEPING	0x2

#ifdef SMP
static u_int	cpu_reset_proxyid;
static volatile u_int	cpu_reset_proxy_active;
#endif

struct msr_op_arg {
	u_int msr;
	int op;
	uint64_t arg1;
};

static void
x86_msr_op_one(void *argp)
{
	struct msr_op_arg *a;
	uint64_t v;

	a = argp;
	switch (a->op) {
	case MSR_OP_ANDNOT:
		v = rdmsr(a->msr);
		v &= ~a->arg1;
		wrmsr(a->msr, v);
		break;
	case MSR_OP_OR:
		v = rdmsr(a->msr);
		v |= a->arg1;
		wrmsr(a->msr, v);
		break;
	case MSR_OP_WRITE:
		wrmsr(a->msr, a->arg1);
		break;
	}
}

#define	MSR_OP_EXMODE_MASK	0xf0000000
#define	MSR_OP_OP_MASK		0x000000ff

void
x86_msr_op(u_int msr, u_int op, uint64_t arg1)
{
	struct thread *td;
	struct msr_op_arg a;
	u_int exmode;
	int bound_cpu, i, is_bound;

	a.op = op & MSR_OP_OP_MASK;
	MPASS(a.op == MSR_OP_ANDNOT || a.op == MSR_OP_OR ||
	    a.op == MSR_OP_WRITE);
	exmode = op & MSR_OP_EXMODE_MASK;
	MPASS(exmode == MSR_OP_LOCAL || exmode == MSR_OP_SCHED ||
	    exmode == MSR_OP_RENDEZVOUS);
	a.msr = msr;
	a.arg1 = arg1;
	switch (exmode) {
	case MSR_OP_LOCAL:
		x86_msr_op_one(&a);
		break;
	case MSR_OP_SCHED:
		td = curthread;
		thread_lock(td);
		is_bound = sched_is_bound(td);
		bound_cpu = td->td_oncpu;
		CPU_FOREACH(i) {
			sched_bind(td, i);
			x86_msr_op_one(&a);
		}
		if (is_bound)
			sched_bind(td, bound_cpu);
		else
			sched_unbind(td);
		thread_unlock(td);
		break;
	case MSR_OP_RENDEZVOUS:
		smp_rendezvous(NULL, x86_msr_op_one, NULL, &a);
		break;
	}
}
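
/*
 * Illustrative usage (not called from here): callers combine one MSR_OP_*
 * operation with one MSR_OP_* execution mode, e.g. setting the SSBD bit of
 * IA32_SPEC_CTRL on every CPU by binding to each one in turn:
 *
 *	x86_msr_op(MSR_IA32_SPEC_CTRL, MSR_OP_OR | MSR_OP_SCHED,
 *	    IA32_SPEC_CTRL_SSBD);
 *
 * See hw_ssb_set() and taa_set() below for the real callers.
 */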

/*
 * Machine-dependent boot() routine.
 *
 * Nothing to do here yet; some functionality may eventually be grafted
 * back here from boot().
 */
void
cpu_boot(int howto)
{
}

/*
 * Flush the D-cache for non-DMA I/O so that the I-cache can
 * be made coherent later.
 */
void
cpu_flush_dcache(void *ptr, size_t len)
{
	/* Not applicable */
}

void
acpi_cpu_c1(void)
{

	__asm __volatile("sti; hlt");
}

/*
 * Use mwait to pause execution while waiting for an interrupt or
 * another thread to signal that there is more work.
 *
 * NOTE: Interrupts will cause a wakeup; however, this function does
 * not enable interrupt handling. The caller is responsible for
 * enabling interrupts.
 */
void
acpi_cpu_idle_mwait(uint32_t mwait_hint)
{
	int *state;
	uint64_t v;

	/*
	 * A comment in a Linux patch claims that 'CPUs run faster with
	 * speculation protection disabled. All CPU threads in a core
	 * must disable speculation protection for it to be
	 * disabled. Disable it while we are idle so the other
	 * hyperthread can run fast.'
	 *
	 * XXXKIB. Software coordination mode should be supported,
	 * but all Intel CPUs provide hardware coordination.
	 */

	state = (int *)PCPU_PTR(monitorbuf);
	KASSERT(atomic_load_int(state) == STATE_SLEEPING,
	    ("cpu_mwait_cx: wrong monitorbuf state"));
	atomic_store_int(state, STATE_MWAIT);
	if (PCPU_GET(ibpb_set) || hw_ssb_active) {
		v = rdmsr(MSR_IA32_SPEC_CTRL);
		wrmsr(MSR_IA32_SPEC_CTRL, v & ~(IA32_SPEC_CTRL_IBRS |
		    IA32_SPEC_CTRL_STIBP | IA32_SPEC_CTRL_SSBD));
	} else {
		v = 0;
	}
	cpu_monitor(state, 0, 0);
	if (atomic_load_int(state) == STATE_MWAIT)
		cpu_mwait(MWAIT_INTRBREAK, mwait_hint);

	/*
	 * SSB cannot be disabled while we sleep, or rather, if it was
	 * disabled, the sysctl thread will bind to our CPU to tweak
	 * the MSR.
	 */
	if (v != 0)
		wrmsr(MSR_IA32_SPEC_CTRL, v);

	/*
	 * We should exit on any event that interrupts mwait, because
	 * that event might be a wanted interrupt.
	 */
	atomic_store_int(state, STATE_RUNNING);
}

/* Get current clock frequency for the given cpu id. */
int
cpu_est_clockrate(int cpu_id, uint64_t *rate)
{
	uint64_t tsc1, tsc2;
	uint64_t acnt, mcnt, perf;
	register_t reg;

	if (pcpu_find(cpu_id) == NULL || rate == NULL)
		return (EINVAL);
#ifdef __i386__
	if ((cpu_feature & CPUID_TSC) == 0)
		return (EOPNOTSUPP);
#endif

	/*
	 * If TSC is P-state invariant and APERF/MPERF MSRs do not exist,
	 * DELAY(9) based logic fails.
	 */
	if (tsc_is_invariant && !tsc_perf_stat)
		return (EOPNOTSUPP);

#ifdef SMP
	if (smp_cpus > 1) {
		/* Schedule ourselves on the indicated cpu. */
		thread_lock(curthread);
		sched_bind(curthread, cpu_id);
		thread_unlock(curthread);
	}
#endif

	/* Calibrate by measuring a short delay. */
	reg = intr_disable();
	if (tsc_is_invariant) {
		wrmsr(MSR_MPERF, 0);
		wrmsr(MSR_APERF, 0);
		tsc1 = rdtsc();
		DELAY(1000);
		mcnt = rdmsr(MSR_MPERF);
		acnt = rdmsr(MSR_APERF);
		tsc2 = rdtsc();
		intr_restore(reg);
		perf = 1000 * acnt / mcnt;
		*rate = (tsc2 - tsc1) * perf;
	} else {
		tsc1 = rdtsc();
		DELAY(1000);
		tsc2 = rdtsc();
		intr_restore(reg);
		*rate = (tsc2 - tsc1) * 1000;
	}

#ifdef SMP
	if (smp_cpus > 1) {
		thread_lock(curthread);
		sched_unbind(curthread);
		thread_unlock(curthread);
	}
#endif

	return (0);
}
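
/*
 * Worked example of the estimate above (illustrative numbers): DELAY(1000)
 * waits roughly one millisecond, so the plain-TSC path scales the observed
 * TSC delta by 1000 to obtain ticks per second, e.g. a delta of 3,000,000
 * yields an estimate of 3.0 GHz. On the invariant-TSC path the same delta
 * is instead scaled by 1000 * APERF/MPERF, so if the core actually ran at
 * half its reference clock during the delay (APERF/MPERF == 1/2), the
 * estimate becomes 1.5 GHz.
 */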

/*
 * Shutdown the CPU as much as possible
 */
void
cpu_halt(void)
{
	for (;;)
		halt();
}

static void
cpu_reset_real(void)
{
	struct region_descriptor null_idt;
#ifndef PC98
	int b;
#endif

	disable_intr();
#ifdef CPU_ELAN
	if (elan_mmcr != NULL)
		elan_mmcr->RESCFG = 1;
#endif
#ifdef __i386__
	if (cpu == CPU_GEODE1100) {
		/* Attempt Geode's own reset */
		outl(0xcf8, 0x80009044ul);
		outl(0xcfc, 0xf);
	}
#endif
#ifdef PC98
	/*
	 * Attempt to do a CPU reset via CPU reset port.
	 */
	if ((inb(0x35) & 0xa0) != 0xa0) {
		outb(0x37, 0x0f);	/* SHUT0 = 0. */
		outb(0x37, 0x0b);	/* SHUT1 = 0. */
	}
	outb(0xf0, 0x00);		/* Reset. */
#else
#if !defined(BROKEN_KEYBOARD_RESET)
	/*
	 * Attempt to do a CPU reset via the keyboard controller,
	 * do not turn off GateA20, as any machine that fails
	 * to do the reset here would then end up in no man's land.
	 */
	outb(IO_KBD + 4, 0xFE);
	DELAY(500000);	/* wait 0.5 sec to see if that did it */
#endif

	/*
	 * Attempt to force a reset via the Reset Control register at
	 * I/O port 0xcf9. Bit 2 forces a system reset when it
	 * transitions from 0 to 1. Bit 1 selects the type of reset
	 * to attempt: 0 selects a "soft" reset, and 1 selects a
	 * "hard" reset. We try a "hard" reset. The first write sets
	 * bit 1 to select a "hard" reset and clears bit 2. The
	 * second write forces a 0 -> 1 transition in bit 2 to trigger
	 * a reset.
	 */
	outb(0xcf9, 0x2);
	outb(0xcf9, 0x6);
	DELAY(500000);	/* wait 0.5 sec to see if that did it */

	/*
	 * Attempt to force a reset via the Fast A20 and Init register
	 * at I/O port 0x92. Bit 1 serves as an alternate A20 gate.
	 * Bit 0 asserts INIT# when set to 1. We are careful to only
	 * preserve bit 1 while setting bit 0. We also must clear bit
	 * 0 before setting it if it isn't already clear.
	 */
	b = inb(0x92);
	if (b != 0xff) {
		if ((b & 0x1) != 0)
			outb(0x92, b & 0xfe);
		outb(0x92, b | 0x1);
		DELAY(500000);	/* wait 0.5 sec to see if that did it */
	}
#endif /* PC98 */

	printf("No known reset method worked, attempting CPU shutdown\n");
	DELAY(1000000);	/* wait 1 sec for printf to complete */

	/* Wipe the IDT. */
	null_idt.rd_limit = 0;
	null_idt.rd_base = 0;
	lidt(&null_idt);

	/* "good night, sweet prince .... <THUNK!>" */
	breakpoint();

	/* NOTREACHED */
	while (1);
}

#ifdef SMP
static void
cpu_reset_proxy(void)
{

	cpu_reset_proxy_active = 1;
	while (cpu_reset_proxy_active == 1)
		ia32_pause();	/* Wait for other cpu to see that we've started */

	printf("cpu_reset_proxy: Stopped CPU %d\n", cpu_reset_proxyid);
	DELAY(1000000);
	cpu_reset_real();
}
#endif

void
cpu_reset(void)
{
#ifdef SMP
	cpuset_t map;
	u_int cnt;

	if (smp_started) {
		map = all_cpus;
		CPU_CLR(PCPU_GET(cpuid), &map);
		CPU_NAND(&map, &stopped_cpus);
		if (!CPU_EMPTY(&map)) {
			printf("cpu_reset: Stopping other CPUs\n");
			stop_cpus(map);
		}

		if (PCPU_GET(cpuid) != 0) {
			cpu_reset_proxyid = PCPU_GET(cpuid);
			cpustop_restartfunc = cpu_reset_proxy;
			cpu_reset_proxy_active = 0;
			printf("cpu_reset: Restarting BSP\n");

			/* Restart CPU #0. */
			CPU_SETOF(0, &started_cpus);
			wmb();

			cnt = 0;
			while (cpu_reset_proxy_active == 0 && cnt < 10000000) {
				ia32_pause();
				cnt++;	/* Wait for BSP to announce restart */
			}
			if (cpu_reset_proxy_active == 0) {
				printf("cpu_reset: Failed to restart BSP\n");
			} else {
				cpu_reset_proxy_active = 2;
				while (1)
					ia32_pause();
				/* NOTREACHED */
			}
		}

		DELAY(1000000);
	}
#endif
	cpu_reset_real();
	/* NOTREACHED */
}

bool
cpu_mwait_usable(void)
{

	return ((cpu_feature2 & CPUID2_MON) != 0 && ((cpu_mon_mwait_flags &
	    (CPUID5_MON_MWAIT_EXT | CPUID5_MWAIT_INTRBREAK)) ==
	    (CPUID5_MON_MWAIT_EXT | CPUID5_MWAIT_INTRBREAK)));
}

void (*cpu_idle_hook)(sbintime_t) = NULL;	/* ACPI idle hook. */
static int	cpu_ident_amdc1e = 0;	/* AMD C1E supported. */
static int	idle_mwait = 1;		/* Use MONITOR/MWAIT for short idle. */
SYSCTL_INT(_machdep, OID_AUTO, idle_mwait, CTLFLAG_RWTUN, &idle_mwait,
    0, "Use MONITOR/MWAIT for short idle");

#ifndef PC98
static void
cpu_idle_acpi(sbintime_t sbt)
{
	int *state;

	state = (int *)PCPU_PTR(monitorbuf);
	atomic_store_int(state, STATE_SLEEPING);

	/* See comments in cpu_idle_hlt(). */
	disable_intr();
	if (sched_runnable())
		enable_intr();
	else if (cpu_idle_hook)
		cpu_idle_hook(sbt);
	else
		acpi_cpu_c1();
	atomic_store_int(state, STATE_RUNNING);
}
#endif /* !PC98 */

static void
cpu_idle_hlt(sbintime_t sbt)
{
	int *state;

	state = (int *)PCPU_PTR(monitorbuf);
	atomic_store_int(state, STATE_SLEEPING);

	/*
	 * Since we may be in a critical section from cpu_idle(), if
	 * an interrupt fires during that critical section we may have
	 * a pending preemption. If the CPU halts, then that thread
	 * may not execute until a later interrupt awakens the CPU.
	 * To handle this race, check for a runnable thread after
	 * disabling interrupts and immediately return if one is
	 * found. Also, we must absolutely guarantee that hlt is
	 * the next instruction after sti. This ensures that any
	 * interrupt that fires after the call to disable_intr() will
	 * immediately awaken the CPU from hlt. Finally, please note
	 * that on x86 this works fine because interrupts are enabled
	 * only after the instruction following sti executes, while IF
	 * is set immediately, allowing the hlt instruction to
	 * acknowledge the interrupt.
	 */
	disable_intr();
	if (sched_runnable())
		enable_intr();
	else
		acpi_cpu_c1();
	atomic_store_int(state, STATE_RUNNING);
}

static void
cpu_idle_mwait(sbintime_t sbt)
{
	int *state;

	state = (int *)PCPU_PTR(monitorbuf);
	atomic_store_int(state, STATE_MWAIT);

	/* See comments in cpu_idle_hlt(). */
	disable_intr();
	if (sched_runnable()) {
		atomic_store_int(state, STATE_RUNNING);
		enable_intr();
		return;
	}

	cpu_monitor(state, 0, 0);
	if (atomic_load_int(state) == STATE_MWAIT)
		__asm __volatile("sti; mwait" : : "a" (MWAIT_C1), "c" (0));
	else
		enable_intr();
	atomic_store_int(state, STATE_RUNNING);
}

static void
cpu_idle_spin(sbintime_t sbt)
{
	int *state;
	int i;

	state = (int *)PCPU_PTR(monitorbuf);
	atomic_store_int(state, STATE_RUNNING);

	/*
	 * The sched_runnable() call is racy, but since we loop,
	 * missing it once has little impact, if any (and it is much
	 * better than not checking at all).
	 */
	for (i = 0; i < 1000; i++) {
		if (sched_runnable())
			return;
		cpu_spinwait();
	}
}

/*
 * C1E renders the local APIC timer dead, so we disable it by
 * reading the Interrupt Pending Message register and clearing
 * both C1eOnCmpHalt (bit 28) and SmiOnCmpHalt (bit 27).
 *
 * Reference:
 * "BIOS and Kernel Developer's Guide for AMD NPT Family 0Fh Processors"
 * #32559 revision 3.00+
 */
#define	MSR_AMDK8_IPM		0xc0010055
#define	AMDK8_SMIONCMPHALT	(1ULL << 27)
#define	AMDK8_C1EONCMPHALT	(1ULL << 28)
#define	AMDK8_CMPHALT		(AMDK8_SMIONCMPHALT | AMDK8_C1EONCMPHALT)

void
cpu_probe_amdc1e(void)
{

	/*
	 * Detect the presence of the C1E capability, mostly on the
	 * later dual-core (and newer) K8 family.
	 */
	if (cpu_vendor_id == CPU_VENDOR_AMD &&
	    (cpu_id & 0x00000f00) == 0x00000f00 &&
	    (cpu_id & 0x0fff0000) >= 0x00040000) {
		cpu_ident_amdc1e = 1;
	}
}

#if defined(__i386__) && defined(PC98)
void (*cpu_idle_fn)(sbintime_t) = cpu_idle_hlt;
#else
void (*cpu_idle_fn)(sbintime_t) = cpu_idle_acpi;
#endif

void
cpu_idle(int busy)
{
	uint64_t msr;
	sbintime_t sbt = -1;

	CTR2(KTR_SPARE2, "cpu_idle(%d) at %d",
	    busy, curcpu);
#ifdef MP_WATCHDOG
	ap_watchdog(PCPU_GET(cpuid));
#endif

	/* If we are busy - try to use fast methods. */
	if (busy) {
		if ((cpu_feature2 & CPUID2_MON) && idle_mwait) {
			cpu_idle_mwait(busy);
			goto out;
		}
	}

	/* If we have time - switch timers into idle mode. */
	if (!busy) {
		critical_enter();
		sbt = cpu_idleclock();
	}

	/* Apply AMD APIC timer C1E workaround. */
	if (cpu_ident_amdc1e && cpu_disable_c3_sleep) {
		msr = rdmsr(MSR_AMDK8_IPM);
		if (msr & AMDK8_CMPHALT)
			wrmsr(MSR_AMDK8_IPM, msr & ~AMDK8_CMPHALT);
	}

	/* Call main idle method. */
	cpu_idle_fn(sbt);

	/* Switch timers back into active mode. */
	if (!busy) {
		cpu_activeclock();
		critical_exit();
	}
out:
	CTR2(KTR_SPARE2, "cpu_idle(%d) at %d done",
	    busy, curcpu);
}

static int cpu_idle_apl31_workaround;
SYSCTL_INT(_machdep, OID_AUTO, idle_apl31, CTLFLAG_RW,
    &cpu_idle_apl31_workaround, 0,
    "Apollo Lake APL31 MWAIT bug workaround");

int
cpu_idle_wakeup(int cpu)
{
	int *state;

	state = (int *)pcpu_find(cpu)->pc_monitorbuf;
	switch (atomic_load_int(state)) {
	case STATE_SLEEPING:
		return (0);
	case STATE_MWAIT:
		atomic_store_int(state, STATE_RUNNING);
		return (cpu_idle_apl31_workaround ? 0 : 1);
	case STATE_RUNNING:
		return (1);
	default:
		panic("bad monitor state");
		return (1);
	}
}

/*
 * Ordered by speed/power consumption.
 */
static struct {
	void	*id_fn;
	char	*id_name;
	int	id_cpuid2_flag;
} idle_tbl[] = {
	{ .id_fn = cpu_idle_spin, .id_name = "spin" },
	{ .id_fn = cpu_idle_mwait, .id_name = "mwait",
	    .id_cpuid2_flag = CPUID2_MON },
	{ .id_fn = cpu_idle_hlt, .id_name = "hlt" },
#if !defined(__i386__) || !defined(PC98)
	{ .id_fn = cpu_idle_acpi, .id_name = "acpi" },
#endif
};

static int
idle_sysctl_available(SYSCTL_HANDLER_ARGS)
{
	char *avail, *p;
	int error;
	int i;

	avail = malloc(256, M_TEMP, M_WAITOK);
	p = avail;
	for (i = 0; i < nitems(idle_tbl); i++) {
		if (idle_tbl[i].id_cpuid2_flag != 0 &&
		    (cpu_feature2 & idle_tbl[i].id_cpuid2_flag) == 0)
			continue;
#if !defined(__i386__) || !defined(PC98)
		if (strcmp(idle_tbl[i].id_name, "acpi") == 0 &&
		    cpu_idle_hook == NULL)
			continue;
#endif
		p += sprintf(p, "%s%s", p != avail ? ", " : "",
		    idle_tbl[i].id_name);
	}
	error = sysctl_handle_string(oidp, avail, 0, req);
	free(avail, M_TEMP);
	return (error);
}

SYSCTL_PROC(_machdep, OID_AUTO, idle_available, CTLTYPE_STRING | CTLFLAG_RD,
    0, 0, idle_sysctl_available, "A", "list of available idle functions");

static bool
cpu_idle_selector(const char *new_idle_name)
{
	int i;

	for (i = 0; i < nitems(idle_tbl); i++) {
		if (idle_tbl[i].id_cpuid2_flag != 0 &&
		    (cpu_feature2 & idle_tbl[i].id_cpuid2_flag) == 0)
			continue;
#if !defined(__i386__) || !defined(PC98)
		if (strcmp(idle_tbl[i].id_name, "acpi") == 0 &&
		    cpu_idle_hook == NULL)
			continue;
#endif
		if (strcmp(idle_tbl[i].id_name, new_idle_name))
			continue;
		cpu_idle_fn = idle_tbl[i].id_fn;
		if (bootverbose)
			printf("CPU idle set to %s\n", idle_tbl[i].id_name);
		return (true);
	}
	return (false);
}

static int
cpu_idle_sysctl(SYSCTL_HANDLER_ARGS)
{
	char buf[16], *p;
	int error, i;

	p = "unknown";
	for (i = 0; i < nitems(idle_tbl); i++) {
		if (idle_tbl[i].id_fn == cpu_idle_fn) {
			p = idle_tbl[i].id_name;
			break;
		}
	}
	strncpy(buf, p, sizeof(buf));
	error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
	if (error != 0 || req->newptr == NULL)
		return (error);
	return (cpu_idle_selector(buf) ? 0 : EINVAL);
}

SYSCTL_PROC(_machdep, OID_AUTO, idle, CTLTYPE_STRING | CTLFLAG_RW, 0, 0,
    cpu_idle_sysctl, "A", "currently selected idle function");
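
/*
 * The idle method can thus be inspected and changed at runtime, e.g.
 * (illustrative commands):
 *
 *	sysctl machdep.idle_available
 *	sysctl machdep.idle=hlt
 *
 * or selected at boot via the machdep.idle loader tunable, which
 * cpu_idle_tun() below fetches with TUNABLE_STR_FETCH().
 */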

static void
cpu_idle_tun(void *unused __unused)
{
	char tunvar[16];

	if (TUNABLE_STR_FETCH("machdep.idle", tunvar, sizeof(tunvar)))
		cpu_idle_selector(tunvar);
	else if (cpu_vendor_id == CPU_VENDOR_AMD &&
	    CPUID_TO_FAMILY(cpu_id) == 0x17 && CPUID_TO_MODEL(cpu_id) == 0x1) {
		/* Ryzen errata 1057, 1109. */
		cpu_idle_selector("hlt");
		idle_mwait = 0;
	}

	if (cpu_vendor_id == CPU_VENDOR_INTEL && cpu_id == 0x506c9) {
		/*
		 * Apollo Lake errata APL31 (public errata APL30).
		 * Stores to the armed address range may not trigger
		 * MWAIT to resume execution. OS needs to use
		 * interrupts to wake processors from MWAIT-induced
		 * sleep states.
		 */
		cpu_idle_apl31_workaround = 1;
	}
	TUNABLE_INT_FETCH("machdep.idle_apl31", &cpu_idle_apl31_workaround);
}
SYSINIT(cpu_idle_tun, SI_SUB_CPU, SI_ORDER_MIDDLE, cpu_idle_tun, NULL);

static int panic_on_nmi = 1;
SYSCTL_INT(_machdep, OID_AUTO, panic_on_nmi, CTLFLAG_RWTUN,
    &panic_on_nmi, 0,
    "Panic on NMI raised by hardware failure");
int nmi_is_broadcast = 1;
SYSCTL_INT(_machdep, OID_AUTO, nmi_is_broadcast, CTLFLAG_RWTUN,
    &nmi_is_broadcast, 0,
    "Chipset NMI is broadcast");
#ifdef KDB
int kdb_on_nmi = 1;
SYSCTL_INT(_machdep, OID_AUTO, kdb_on_nmi, CTLFLAG_RWTUN,
    &kdb_on_nmi, 0,
    "Go to KDB on NMI with unknown source");
#endif

void
nmi_call_kdb(u_int cpu, u_int type, struct trapframe *frame)
{
	bool claimed = false;

#ifdef DEV_ISA
	/* machine/parity/power fail/"kitchen sink" faults */
	if (isa_nmi(frame->tf_err)) {
		claimed = true;
		if (panic_on_nmi)
			panic("NMI indicates hardware failure");
	}
#endif /* DEV_ISA */
#ifdef KDB
	if (!claimed && kdb_on_nmi) {
		/*
		 * NMI can be hooked up to a pushbutton for debugging.
		 */
		printf("NMI/cpu%d ... going to debugger\n", cpu);
		kdb_trap(type, 0, frame);
	}
#endif /* KDB */
}

void
nmi_handle_intr(u_int type, struct trapframe *frame)
{

#ifdef SMP
	if (nmi_is_broadcast) {
		nmi_call_kdb_smp(type, frame);
		return;
	}
#endif
	nmi_call_kdb(PCPU_GET(cpuid), type, frame);
}

static int hw_ibrs_active;
int hw_ibrs_ibpb_active;
int hw_ibrs_disable = 1;

SYSCTL_INT(_hw, OID_AUTO, ibrs_active, CTLFLAG_RD, &hw_ibrs_active, 0,
    "Indirect Branch Restricted Speculation active");

void
hw_ibrs_recalculate(bool for_all_cpus)
{
	if ((cpu_ia32_arch_caps & IA32_ARCH_CAP_IBRS_ALL) != 0) {
		x86_msr_op(MSR_IA32_SPEC_CTRL, (for_all_cpus ?
		    MSR_OP_RENDEZVOUS : MSR_OP_LOCAL) |
		    (hw_ibrs_disable != 0 ? MSR_OP_ANDNOT : MSR_OP_OR),
		    IA32_SPEC_CTRL_IBRS);
		hw_ibrs_active = hw_ibrs_disable == 0;
		hw_ibrs_ibpb_active = 0;
	} else {
		hw_ibrs_active = hw_ibrs_ibpb_active = (cpu_stdext_feature3 &
		    CPUID_STDEXT3_IBPB) != 0 && !hw_ibrs_disable;
	}
}

static int
hw_ibrs_disable_handler(SYSCTL_HANDLER_ARGS)
{
	int error, val;

	val = hw_ibrs_disable;
	error = sysctl_handle_int(oidp, &val, 0, req);
	if (error != 0 || req->newptr == NULL)
		return (error);
	hw_ibrs_disable = val != 0;
	hw_ibrs_recalculate(true);
	return (0);
}
SYSCTL_PROC(_hw, OID_AUTO, ibrs_disable, CTLTYPE_INT | CTLFLAG_RWTUN |
    CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, NULL, 0, hw_ibrs_disable_handler, "I",
    "Disable Indirect Branch Restricted Speculation");

int hw_ssb_active;
int hw_ssb_disable;

SYSCTL_INT(_hw, OID_AUTO, spec_store_bypass_disable_active, CTLFLAG_RD,
    &hw_ssb_active, 0,
    "Speculative Store Bypass Disable active");

static void
hw_ssb_set(bool enable, bool for_all_cpus)
{

	if ((cpu_stdext_feature3 & CPUID_STDEXT3_SSBD) == 0) {
		hw_ssb_active = 0;
		return;
	}
	hw_ssb_active = enable;
	x86_msr_op(MSR_IA32_SPEC_CTRL,
	    (enable ? MSR_OP_OR : MSR_OP_ANDNOT) |
	    (for_all_cpus ? MSR_OP_SCHED : MSR_OP_LOCAL), IA32_SPEC_CTRL_SSBD);
}

void
hw_ssb_recalculate(bool all_cpus)
{

	switch (hw_ssb_disable) {
	default:
		hw_ssb_disable = 0;
		/* FALLTHROUGH */
	case 0: /* off */
		hw_ssb_set(false, all_cpus);
		break;
	case 1: /* on */
		hw_ssb_set(true, all_cpus);
		break;
	case 2: /* auto */
		hw_ssb_set((cpu_ia32_arch_caps & IA32_ARCH_CAP_SSB_NO) != 0 ?
		    false : true, all_cpus);
		break;
	}
}

static int
hw_ssb_disable_handler(SYSCTL_HANDLER_ARGS)
{
	int error, val;

	val = hw_ssb_disable;
	error = sysctl_handle_int(oidp, &val, 0, req);
	if (error != 0 || req->newptr == NULL)
		return (error);
	hw_ssb_disable = val;
	hw_ssb_recalculate(true);
	return (0);
}
SYSCTL_PROC(_hw, OID_AUTO, spec_store_bypass_disable, CTLTYPE_INT |
    CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, NULL, 0,
    hw_ssb_disable_handler, "I",
    "Speculative Store Bypass Disable (0 - off, 1 - on, 2 - auto)");

int hw_mds_disable;

/*
 * Handler for Microarchitectural Data Sampling issues. Really not a
 * pointer to C function: on amd64 the code must not change any CPU
 * architectural state except possibly %rflags. Also, it is always
 * called with interrupts disabled.
 */
void mds_handler_void(void);
void mds_handler_verw(void);
void mds_handler_ivb(void);
void mds_handler_bdw(void);
void mds_handler_skl_sse(void);
void mds_handler_skl_avx(void);
void mds_handler_skl_avx512(void);
void mds_handler_silvermont(void);
void (*mds_handler)(void) = mds_handler_void;

static int
sysctl_hw_mds_disable_state_handler(SYSCTL_HANDLER_ARGS)
{
	const char *state;

	if (mds_handler == mds_handler_void)
		state = "inactive";
	else if (mds_handler == mds_handler_verw)
		state = "VERW";
	else if (mds_handler == mds_handler_ivb)
		state = "software IvyBridge";
	else if (mds_handler == mds_handler_bdw)
		state = "software Broadwell";
	else if (mds_handler == mds_handler_skl_sse)
		state = "software Skylake SSE";
	else if (mds_handler == mds_handler_skl_avx)
		state = "software Skylake AVX";
	else if (mds_handler == mds_handler_skl_avx512)
		state = "software Skylake AVX512";
	else if (mds_handler == mds_handler_silvermont)
		state = "software Silvermont";
	else
		state = "unknown";
	return (SYSCTL_OUT(req, state, strlen(state)));
}

SYSCTL_PROC(_hw, OID_AUTO, mds_disable_state,
    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
    sysctl_hw_mds_disable_state_handler, "A",
    "Microarchitectural Data Sampling Mitigation state");

_Static_assert(__offsetof(struct pcpu, pc_mds_tmp) % 64 == 0, "MDS AVX512");

void
hw_mds_recalculate(void)
{
	struct pcpu *pc;
	vm_offset_t b64;
	u_long xcr0;
	int i;

	/*
	 * Allow the user to force the VERW variant even if MD_CLEAR is
	 * not reported. For instance, a hypervisor might unknowingly
	 * filter the capability out.
	 * For similar reasons, and for testing, allow enabling the
	 * mitigation even when the MDS_NO capability is set.
	 */
	if (cpu_vendor_id != CPU_VENDOR_INTEL || hw_mds_disable == 0 ||
	    ((cpu_ia32_arch_caps & IA32_ARCH_CAP_MDS_NO) != 0 &&
	    hw_mds_disable == 3)) {
		mds_handler = mds_handler_void;
	} else if (((cpu_stdext_feature3 & CPUID_STDEXT3_MD_CLEAR) != 0 &&
	    hw_mds_disable == 3) || hw_mds_disable == 1) {
		mds_handler = mds_handler_verw;
	} else if (CPUID_TO_FAMILY(cpu_id) == 0x6 &&
	    (CPUID_TO_MODEL(cpu_id) == 0x2e || CPUID_TO_MODEL(cpu_id) == 0x1e ||
	    CPUID_TO_MODEL(cpu_id) == 0x1f || CPUID_TO_MODEL(cpu_id) == 0x1a ||
	    CPUID_TO_MODEL(cpu_id) == 0x2f || CPUID_TO_MODEL(cpu_id) == 0x25 ||
	    CPUID_TO_MODEL(cpu_id) == 0x2c || CPUID_TO_MODEL(cpu_id) == 0x2d ||
	    CPUID_TO_MODEL(cpu_id) == 0x2a || CPUID_TO_MODEL(cpu_id) == 0x3e ||
	    CPUID_TO_MODEL(cpu_id) == 0x3a) &&
	    (hw_mds_disable == 2 || hw_mds_disable == 3)) {
		/*
		 * Nehalem, SandyBridge, IvyBridge
		 */
		CPU_FOREACH(i) {
			pc = pcpu_find(i);
			if (pc->pc_mds_buf == NULL) {
				pc->pc_mds_buf = malloc(672, M_TEMP,
				    M_WAITOK);
				bzero(pc->pc_mds_buf, 16);
			}
		}
		mds_handler = mds_handler_ivb;
	} else if (CPUID_TO_FAMILY(cpu_id) == 0x6 &&
	    (CPUID_TO_MODEL(cpu_id) == 0x3f || CPUID_TO_MODEL(cpu_id) == 0x3c ||
	    CPUID_TO_MODEL(cpu_id) == 0x45 || CPUID_TO_MODEL(cpu_id) == 0x46 ||
	    CPUID_TO_MODEL(cpu_id) == 0x56 || CPUID_TO_MODEL(cpu_id) == 0x4f ||
	    CPUID_TO_MODEL(cpu_id) == 0x47 || CPUID_TO_MODEL(cpu_id) == 0x3d) &&
	    (hw_mds_disable == 2 || hw_mds_disable == 3)) {
		/*
		 * Haswell, Broadwell
		 */
		CPU_FOREACH(i) {
			pc = pcpu_find(i);
			if (pc->pc_mds_buf == NULL) {
				pc->pc_mds_buf = malloc(1536, M_TEMP,
				    M_WAITOK);
				bzero(pc->pc_mds_buf, 16);
			}
		}
		mds_handler = mds_handler_bdw;
	} else if (CPUID_TO_FAMILY(cpu_id) == 0x6 &&
	    ((CPUID_TO_MODEL(cpu_id) == 0x55 && (cpu_id &
	    CPUID_STEPPING) <= 5) ||
	    CPUID_TO_MODEL(cpu_id) == 0x4e || CPUID_TO_MODEL(cpu_id) == 0x5e ||
	    (CPUID_TO_MODEL(cpu_id) == 0x8e && (cpu_id &
	    CPUID_STEPPING) <= 0xb) ||
	    (CPUID_TO_MODEL(cpu_id) == 0x9e && (cpu_id &
	    CPUID_STEPPING) <= 0xc)) &&
	    (hw_mds_disable == 2 || hw_mds_disable == 3)) {
		/*
		 * Skylake, KabyLake, CoffeeLake, WhiskeyLake,
		 * CascadeLake
		 */
		CPU_FOREACH(i) {
			pc = pcpu_find(i);
			if (pc->pc_mds_buf == NULL) {
				pc->pc_mds_buf = malloc(6 * 1024,
				    M_TEMP, M_WAITOK);
				b64 = (vm_offset_t)malloc(64 + 63,
				    M_TEMP, M_WAITOK);
				pc->pc_mds_buf64 = (void *)roundup2(b64, 64);
				bzero(pc->pc_mds_buf64, 64);
			}
		}
		xcr0 = rxcr(0);
		if ((xcr0 & XFEATURE_ENABLED_ZMM_HI256) != 0 &&
		    (cpu_stdext_feature2 & CPUID_STDEXT_AVX512DQ) != 0)
			mds_handler = mds_handler_skl_avx512;
		else if ((xcr0 & XFEATURE_ENABLED_AVX) != 0 &&
		    (cpu_feature2 & CPUID2_AVX) != 0)
			mds_handler = mds_handler_skl_avx;
		else
			mds_handler = mds_handler_skl_sse;
	} else if (CPUID_TO_FAMILY(cpu_id) == 0x6 &&
	    ((CPUID_TO_MODEL(cpu_id) == 0x37 ||
	    CPUID_TO_MODEL(cpu_id) == 0x4a ||
	    CPUID_TO_MODEL(cpu_id) == 0x4c ||
	    CPUID_TO_MODEL(cpu_id) == 0x4d ||
	    CPUID_TO_MODEL(cpu_id) == 0x5a ||
	    CPUID_TO_MODEL(cpu_id) == 0x5d ||
	    CPUID_TO_MODEL(cpu_id) == 0x6e ||
	    CPUID_TO_MODEL(cpu_id) == 0x65 ||
	    CPUID_TO_MODEL(cpu_id) == 0x75 ||
	    CPUID_TO_MODEL(cpu_id) == 0x1c ||
	    CPUID_TO_MODEL(cpu_id) == 0x26 ||
	    CPUID_TO_MODEL(cpu_id) == 0x27 ||
	    CPUID_TO_MODEL(cpu_id) == 0x35 ||
	    CPUID_TO_MODEL(cpu_id) == 0x36 ||
	    CPUID_TO_MODEL(cpu_id) == 0x7a))) {
		/* Silvermont, Airmont */
		CPU_FOREACH(i) {
			pc = pcpu_find(i);
			if (pc->pc_mds_buf == NULL)
				pc->pc_mds_buf = malloc(256, M_TEMP, M_WAITOK);
		}
		mds_handler = mds_handler_silvermont;
	} else {
		hw_mds_disable = 0;
		mds_handler = mds_handler_void;
	}
}

static void
hw_mds_recalculate_boot(void *arg __unused)
{

	hw_mds_recalculate();
}
SYSINIT(mds_recalc, SI_SUB_SMP, SI_ORDER_ANY, hw_mds_recalculate_boot, NULL);

static int
sysctl_mds_disable_handler(SYSCTL_HANDLER_ARGS)
{
	int error, val;

	val = hw_mds_disable;
	error = sysctl_handle_int(oidp, &val, 0, req);
	if (error != 0 || req->newptr == NULL)
		return (error);
	if (val < 0 || val > 3)
		return (EINVAL);
	hw_mds_disable = val;
	hw_mds_recalculate();
	return (0);
}

SYSCTL_PROC(_hw, OID_AUTO, mds_disable, CTLTYPE_INT |
    CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, NULL, 0,
    sysctl_mds_disable_handler, "I",
    "Microarchitectural Data Sampling Mitigation "
    "(0 - off, 1 - on VERW, 2 - on SW, 3 - on AUTO)");

/*
 * Intel Transactional Memory Asynchronous Abort Mitigation
 * CVE-2019-11135
 */
int x86_taa_enable;
int x86_taa_state;
enum {
	TAA_NONE	= 0,	/* No mitigation enabled */
	TAA_TSX_DISABLE	= 1,	/* Disable TSX via MSR */
	TAA_VERW	= 2,	/* Use VERW mitigation */
	TAA_AUTO	= 3,	/* Automatically select the mitigation */

	/* The states below are not selectable by the operator */

	TAA_TAA_UC	= 4,	/* Mitigation present in microcode */
	TAA_NOT_PRESENT	= 5	/* TSX is not present */
};

static void
taa_set(bool enable, bool all)
{

	x86_msr_op(MSR_IA32_TSX_CTRL,
	    (enable ? MSR_OP_OR : MSR_OP_ANDNOT) |
	    (all ? MSR_OP_RENDEZVOUS : MSR_OP_LOCAL),
	    IA32_TSX_CTRL_RTM_DISABLE | IA32_TSX_CTRL_TSX_CPUID_CLEAR);
}

void
x86_taa_recalculate(void)
{
	static int taa_saved_mds_disable = 0;
	int taa_need = 0, taa_state = 0;
	int mds_disable = 0, need_mds_recalc = 0;

	/* Check the CPUID.07h.EBX HLE and RTM bits for the presence of TSX. */
	if ((cpu_stdext_feature & CPUID_STDEXT_HLE) == 0 ||
	    (cpu_stdext_feature & CPUID_STDEXT_RTM) == 0) {
		/* TSX is not present */
		x86_taa_state = TAA_NOT_PRESENT;
		return;
	}

	/* Check to see what mitigation options the CPU gives us. */
	if (cpu_ia32_arch_caps & IA32_ARCH_CAP_TAA_NO) {
		/* CPU is not susceptible to TAA */
		taa_need = TAA_TAA_UC;
	} else if (cpu_ia32_arch_caps & IA32_ARCH_CAP_TSX_CTRL) {
		/*
		 * CPU can turn off TSX. This is the next best option
		 * if TAA_NO hardware mitigation isn't present.
		 */
		taa_need = TAA_TSX_DISABLE;
	} else {
		/* No TSX/TAA specific remedies are available. */
		if (x86_taa_enable == TAA_TSX_DISABLE) {
			if (bootverbose)
				printf("TSX control not available\n");
			return;
		} else
			taa_need = TAA_VERW;
	}

	/* Can we automatically take action, or are we being forced? */
	if (x86_taa_enable == TAA_AUTO)
		taa_state = taa_need;
	else
		taa_state = x86_taa_enable;

	/* No state change, nothing to do. */
	if (taa_state == x86_taa_state) {
		if (bootverbose)
			printf("No TSX change made\n");
		return;
	}

	/* Does the MSR need to be turned on or off? */
	if (taa_state == TAA_TSX_DISABLE)
		taa_set(true, true);
	else if (x86_taa_state == TAA_TSX_DISABLE)
		taa_set(false, true);

	/* Does MDS need to be set to turn on VERW? */
	if (taa_state == TAA_VERW) {
		taa_saved_mds_disable = hw_mds_disable;
		mds_disable = hw_mds_disable = 1;
		need_mds_recalc = 1;
	} else if (x86_taa_state == TAA_VERW) {
		mds_disable = hw_mds_disable = taa_saved_mds_disable;
		need_mds_recalc = 1;
	}
	if (need_mds_recalc) {
		hw_mds_recalculate();
		if (mds_disable != hw_mds_disable) {
			if (bootverbose)
				printf("Cannot change MDS state for TAA\n");
			/* Don't update our state. */
			return;
		}
	}

	x86_taa_state = taa_state;
	return;
}

static void
taa_recalculate_boot(void * arg __unused)
{

	x86_taa_recalculate();
}
SYSINIT(taa_recalc, SI_SUB_SMP, SI_ORDER_ANY, taa_recalculate_boot, NULL);

SYSCTL_NODE(_machdep_mitigations, OID_AUTO, taa, CTLFLAG_RW, 0,
    "TSX Asynchronous Abort Mitigation");

static int
sysctl_taa_handler(SYSCTL_HANDLER_ARGS)
{
	int error, val;

	val = x86_taa_enable;
	error = sysctl_handle_int(oidp, &val, 0, req);
	if (error != 0 || req->newptr == NULL)
		return (error);
	if (val < TAA_NONE || val > TAA_AUTO)
		return (EINVAL);
	x86_taa_enable = val;
	x86_taa_recalculate();
	return (0);
}

SYSCTL_PROC(_machdep_mitigations_taa, OID_AUTO, enable, CTLTYPE_INT |
    CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, NULL, 0,
    sysctl_taa_handler, "I",
    "TAA Mitigation enablement control "
    "(0 - off, 1 - disable TSX, 2 - VERW, 3 - on AUTO)");

static int
sysctl_taa_state_handler(SYSCTL_HANDLER_ARGS)
{
	const char *state;

	switch (x86_taa_state) {
	case TAA_NONE:
		state = "inactive";
		break;
	case TAA_TSX_DISABLE:
		state = "TSX disabled";
		break;
	case TAA_VERW:
		state = "VERW";
		break;
	case TAA_TAA_UC:
		state = "Mitigated in microcode";
		break;
	case TAA_NOT_PRESENT:
		state = "TSX not present";
		break;
	default:
		state = "unknown";
	}

	return (SYSCTL_OUT(req, state, strlen(state)));
}

SYSCTL_PROC(_machdep_mitigations_taa, OID_AUTO, state,
    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
    sysctl_taa_state_handler, "A",
    "TAA Mitigation state");

int __read_frequently cpu_flush_rsb_ctxsw;
SYSCTL_INT(_machdep_mitigations, OID_AUTO, flush_rsb_ctxsw,
    CTLFLAG_RW | CTLFLAG_NOFETCH, &cpu_flush_rsb_ctxsw, 0,
    "Flush Return Stack Buffer on context switch");

SYSCTL_NODE(_machdep_mitigations, OID_AUTO, rngds,
    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "MCU Optimization, disable RDSEED mitigation");

int x86_rngds_mitg_enable = 1;

void
x86_rngds_mitg_recalculate(bool all_cpus)
{
	if ((cpu_stdext_feature3 & CPUID_STDEXT3_MCUOPT) == 0)
		return;
	x86_msr_op(MSR_IA32_MCU_OPT_CTRL,
	    (x86_rngds_mitg_enable ? MSR_OP_OR : MSR_OP_ANDNOT) |
	    (all_cpus ? MSR_OP_RENDEZVOUS : MSR_OP_LOCAL),
	    IA32_RNGDS_MITG_DIS);
}

static int
sysctl_rngds_mitg_enable_handler(SYSCTL_HANDLER_ARGS)
{
	int error, val;

	val = x86_rngds_mitg_enable;
	error = sysctl_handle_int(oidp, &val, 0, req);
	if (error != 0 || req->newptr == NULL)
		return (error);
	x86_rngds_mitg_enable = val;
	x86_rngds_mitg_recalculate(true);
	return (0);
}
SYSCTL_PROC(_machdep_mitigations_rngds, OID_AUTO, enable, CTLTYPE_INT |
    CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, NULL, 0,
    sysctl_rngds_mitg_enable_handler, "I",
    "MCU Optimization, disabling RDSEED mitigation control "
    "(0 - mitigation disabled (RDSEED optimized), 1 - mitigation enabled)");

static int
sysctl_rngds_state_handler(SYSCTL_HANDLER_ARGS)
{
	const char *state;

	if ((cpu_stdext_feature3 & CPUID_STDEXT3_MCUOPT) == 0) {
		state = "Not applicable";
	} else if (x86_rngds_mitg_enable == 0) {
		state = "RDSEED not serialized";
	} else {
		state = "Mitigated";
	}
	return (SYSCTL_OUT(req, state, strlen(state)));
}
SYSCTL_PROC(_machdep_mitigations_rngds, OID_AUTO, state,
    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
    sysctl_rngds_state_handler, "A",
    "MCU Optimization state");