FreeBSD/Linux Kernel Cross Reference
sys/x86/x86/mp_x86.c
1 /*-
2 * Copyright (c) 1996, by Steve Passe
3 * Copyright (c) 2003, by Peter Wemm
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * 2. The name of the developer may NOT be used to endorse or promote products
12 * derived from this software without specific prior written permission.
13 *
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 */
26
27 #include <sys/cdefs.h>
28 __FBSDID("$FreeBSD$");
29
30 #ifdef __i386__
31 #include "opt_apic.h"
32 #endif
33 #include "opt_cpu.h"
34 #include "opt_kstack_pages.h"
35 #include "opt_pmap.h"
36 #include "opt_sched.h"
37 #include "opt_smp.h"
38
39 #include <sys/param.h>
40 #include <sys/systm.h>
41 #include <sys/bus.h>
42 #include <sys/cons.h> /* cngetc() */
43 #include <sys/cpuset.h>
44 #ifdef GPROF
45 #include <sys/gmon.h>
46 #endif
47 #include <sys/kernel.h>
48 #include <sys/ktr.h>
49 #include <sys/lock.h>
50 #include <sys/malloc.h>
51 #include <sys/memrange.h>
52 #include <sys/mutex.h>
53 #include <sys/pcpu.h>
54 #include <sys/proc.h>
55 #include <sys/sched.h>
56 #include <sys/smp.h>
57 #include <sys/sysctl.h>
58
59 #include <vm/vm.h>
60 #include <vm/vm_param.h>
61 #include <vm/pmap.h>
62 #include <vm/vm_kern.h>
63 #include <vm/vm_extern.h>
64 #include <vm/vm_map.h>
65
66 #include <x86/apicreg.h>
67 #include <machine/clock.h>
68 #include <machine/cpu.h>
69 #include <machine/cputypes.h>
70 #include <x86/mca.h>
71 #include <machine/md_var.h>
72 #include <machine/pcb.h>
73 #include <machine/psl.h>
74 #include <machine/smp.h>
75 #include <machine/specialreg.h>
76 #include <x86/ucode.h>
77
78 /* lock region used by kernel profiling */
79 int mcount_lock;
80
81 int mp_naps; /* # of Application processors */
82 int boot_cpu_id = -1; /* designated BSP */
83
84 extern struct pcpu __pcpu[];
85
86 /* AP uses this during bootstrap. Do not staticize. */
87 char *bootSTK;
88 int bootAP;
89
90 /* Free these after use */
91 void *bootstacks[MAXCPU];
92 void *dpcpu;
93
94 struct pcb stoppcbs[MAXCPU];
95 struct susppcb **susppcbs;
96
97 #ifdef COUNT_IPIS
98 /* Interrupt counts. */
99 static u_long *ipi_preempt_counts[MAXCPU];
100 static u_long *ipi_ast_counts[MAXCPU];
101 u_long *ipi_invltlb_counts[MAXCPU];
102 u_long *ipi_invlrng_counts[MAXCPU];
103 u_long *ipi_invlpg_counts[MAXCPU];
104 u_long *ipi_invlcache_counts[MAXCPU];
105 u_long *ipi_rendezvous_counts[MAXCPU];
106 static u_long *ipi_hardclock_counts[MAXCPU];
107 #endif
108
109 /* Default cpu_ops implementation. */
110 struct cpu_ops cpu_ops;
111
112 /*
113 * Local data and functions.
114 */
115
116 static volatile cpuset_t ipi_stop_nmi_pending;
117
118 volatile cpuset_t resuming_cpus;
119 volatile cpuset_t toresume_cpus;
120
121 /* used to hold the APs until we are ready to release them */
122 struct mtx ap_boot_mtx;
123
124 /* Set to 1 once we're ready to let the APs out of the pen. */
125 volatile int aps_ready = 0;
126
127 /*
128 * Store data from cpu_add() until later in the boot when we actually setup
129 * the APs.
130 */
131 struct cpu_info cpu_info[MAX_APIC_ID + 1];
132 int apic_cpuids[MAX_APIC_ID + 1];
133 int cpu_apic_ids[MAXCPU];
134
135 /* Holds pending bitmap based IPIs per CPU */
136 volatile u_int cpu_ipi_pending[MAXCPU];
137
138 static void release_aps(void *dummy);
139 static void cpustop_handler_post(u_int cpu);
140
141 static int hyperthreading_allowed = 1;
142 SYSCTL_INT(_machdep, OID_AUTO, hyperthreading_allowed, CTLFLAG_RDTUN,
143 &hyperthreading_allowed, 0, "Use Intel HTT logical CPUs");
144
145 static struct topo_node topo_root;
146
147 static int pkg_id_shift;
148 static int core_id_shift;
149 static int disabled_cpus;
150
151 struct cache_info {
152 int id_shift;
153 int present;
154 } static caches[MAX_CACHE_LEVELS];
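/*
 * The per-level ID shifts discovered below describe how an initial APIC
 * ID decomposes into topology components: dropping the low id_shift bits
 * of an APIC ID yields the hardware ID of the enclosing package, core or
 * cache domain.  As an illustrative example (values assumed here, typical
 * of an 8-core/16-thread package): with pkg_id_shift == 4 and
 * core_id_shift == 1, APIC ID 0x0b belongs to package 0x0b >> 4 == 0,
 * to core node 0x0b >> 1 == 5, and is SMT thread 0x0b & 1 == 1 within
 * that core.
 */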
155
156 void
157 mem_range_AP_init(void)
158 {
159
160 if (mem_range_softc.mr_op && mem_range_softc.mr_op->initAP)
161 mem_range_softc.mr_op->initAP(&mem_range_softc);
162 }
163
164 /*
165 * Round up to the next power of two, if necessary, and then
166 * take log2.
167 * Returns -1 if argument is zero.
168 */
169 static __inline int
170 mask_width(u_int x)
171 {
172
173 return (fls(x << (1 - powerof2(x))) - 1);
174 }
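/*
 * Illustrative examples: mask_width(1) == 0 and mask_width(8) == 3,
 * since those arguments are already powers of two, while mask_width(6)
 * rounds 6 up to 8 and also returns 3 (powerof2(6) == 0, so
 * 6 << 1 == 12, fls(12) == 4, and 4 - 1 == 3).  mask_width(0) yields
 * fls(0) - 1 == -1, as noted above.
 */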
175
176 /*
177 * Add a cache level to the cache topology description.
178 */
179 static int
180 add_deterministic_cache(int type, int level, int share_count)
181 {
182
183 if (type == 0)
184 return (0);
185 if (type > 3) {
186 printf("unexpected cache type %d\n", type);
187 return (1);
188 }
189 if (type == 2) /* ignore instruction cache */
190 return (1);
191 if (level == 0 || level > MAX_CACHE_LEVELS) {
192 printf("unexpected cache level %d\n", type);
193 return (1);
194 }
195
196 if (caches[level - 1].present) {
197 printf("WARNING: multiple entries for L%u data cache\n", level);
198 printf("%u => %u\n", caches[level - 1].id_shift,
199 mask_width(share_count));
200 }
201 caches[level - 1].id_shift = mask_width(share_count);
202 caches[level - 1].present = 1;
203
204 if (caches[level - 1].id_shift > pkg_id_shift) {
205 printf("WARNING: L%u data cache covers more "
206 "APIC IDs than a package\n", level);
207 printf("%u > %u\n", caches[level - 1].id_shift, pkg_id_shift);
208 caches[level - 1].id_shift = pkg_id_shift;
209 }
210 if (caches[level - 1].id_shift < core_id_shift) {
211 printf("WARNING: L%u data cache covers less "
212 "APIC IDs than a core\n", level);
213 printf("%u < %u\n", caches[level - 1].id_shift, core_id_shift);
214 caches[level - 1].id_shift = core_id_shift;
215 }
216
217 return (1);
218 }
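/*
 * Both callers of add_deterministic_cache() (AMD leaf 0x8000001d and
 * Intel leaf 4 below) decode the same CPUID EAX fields: the cache type
 * from bits 4:0, the cache level from bits 7:5, and the number of APIC
 * IDs sharing the cache from bits 25:14 plus one.  For instance, a data
 * cache shared by share_count == 2 logical CPUs gets
 * id_shift == mask_width(2) == 1.
 */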
219
220 /*
221 * Determine topology of processing units and caches for AMD CPUs.
222 * See:
223 * - AMD CPUID Specification (Publication # 25481)
224 * - BKDG for AMD NPT Family 0Fh Processors (Publication # 32559)
225 * - BKDG For AMD Family 10h Processors (Publication # 31116)
226 * - BKDG For AMD Family 15h Models 00h-0Fh Processors (Publication # 42301)
227 * - BKDG For AMD Family 16h Models 00h-0Fh Processors (Publication # 48751)
228 * - PPR For AMD Family 17h Models 00h-0Fh Processors (Publication # 54945)
229 */
230 static void
231 topo_probe_amd(void)
232 {
233 u_int p[4];
234 uint64_t v;
235 int level;
236 int nodes_per_socket;
237 int share_count;
238 int type;
239 int i;
240
241 /* No multi-core capability. */
242 if ((amd_feature2 & AMDID2_CMP) == 0)
243 return;
244
245 /* For families 10h and newer. */
246 pkg_id_shift = (cpu_procinfo2 & AMDID_COREID_SIZE) >>
247 AMDID_COREID_SIZE_SHIFT;
248
249 /* For 0Fh family. */
250 if (pkg_id_shift == 0)
251 pkg_id_shift =
252 mask_width((cpu_procinfo2 & AMDID_CMP_CORES) + 1);
253
254 /*
255 * Families prior to 16h define the following value as
256 * cores per compute unit and we don't really care about the AMD
257 * compute units at the moment. Perhaps we should treat them as
258 * cores and cores within the compute units as hardware threads,
259 * but that's up for debate.
260 * Later families define the value as threads per compute unit,
261 * so we are following AMD's nomenclature here.
262 */
263 if ((amd_feature2 & AMDID2_TOPOLOGY) != 0 &&
264 CPUID_TO_FAMILY(cpu_id) >= 0x16) {
265 cpuid_count(0x8000001e, 0, p);
266 share_count = ((p[1] >> 8) & 0xff) + 1;
267 core_id_shift = mask_width(share_count);
268 }
269
270 if ((amd_feature2 & AMDID2_TOPOLOGY) != 0) {
271 for (i = 0; ; i++) {
272 cpuid_count(0x8000001d, i, p);
273 type = p[0] & 0x1f;
274 level = (p[0] >> 5) & 0x7;
275 share_count = 1 + ((p[0] >> 14) & 0xfff);
276
277 if (!add_deterministic_cache(type, level, share_count))
278 break;
279 }
280 } else {
281 if (cpu_exthigh >= 0x80000005) {
282 cpuid_count(0x80000005, 0, p);
283 if (((p[2] >> 24) & 0xff) != 0) {
284 caches[0].id_shift = 0;
285 caches[0].present = 1;
286 }
287 }
288 if (cpu_exthigh >= 0x80000006) {
289 cpuid_count(0x80000006, 0, p);
290 if (((p[2] >> 16) & 0xffff) != 0) {
291 caches[1].id_shift = 0;
292 caches[1].present = 1;
293 }
294 if (((p[3] >> 18) & 0x3fff) != 0) {
295 nodes_per_socket = 1;
296 if ((amd_feature2 & AMDID2_NODE_ID) != 0) {
297 /*
298 * Handle multi-node processors that
299 * have multiple chips, each with its
300 * own L3 cache, on the same die.
301 */
302 v = rdmsr(0xc001100c);
303 nodes_per_socket = 1 + ((v >> 3) & 0x7);
304 }
305 caches[2].id_shift =
306 pkg_id_shift - mask_width(nodes_per_socket);
307 caches[2].present = 1;
308 }
309 }
310 }
311 }
312
313 /*
314 * Determine topology of processing units for Intel CPUs
315 * using CPUID Leaf 1 and Leaf 4, if supported.
316 * See:
317 * - Intel 64 Architecture Processor Topology Enumeration
318 * - Intel 64 and IA-32 Architectures Software Developer’s Manual,
319 * Volume 3A: System Programming Guide, PROGRAMMING CONSIDERATIONS
320 * FOR HARDWARE MULTI-THREADING CAPABLE PROCESSORS
321 */
322 static void
323 topo_probe_intel_0x4(void)
324 {
325 u_int p[4];
326 int max_cores;
327 int max_logical;
328
329 /* Both zero and one here mean one logical processor per package. */
330 max_logical = (cpu_feature & CPUID_HTT) != 0 ?
331 (cpu_procinfo & CPUID_HTT_CORES) >> 16 : 1;
332 if (max_logical <= 1)
333 return;
334
335 if (cpu_high >= 0x4) {
336 cpuid_count(0x04, 0, p);
337 max_cores = ((p[0] >> 26) & 0x3f) + 1;
338 } else
339 max_cores = 1;
340
341 core_id_shift = mask_width(max_logical/max_cores);
342 KASSERT(core_id_shift >= 0,
343 ("intel topo: max_cores > max_logical\n"));
344 pkg_id_shift = core_id_shift + mask_width(max_cores);
345 }
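/*
 * For example, a package reporting max_logical == 16 in
 * CPUID.1:EBX[23:16] and max_cores == 8 in leaf 4 yields
 * core_id_shift == mask_width(16 / 8) == 1 and
 * pkg_id_shift == 1 + mask_width(8) == 4.
 */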
346
347 /*
348 * Determine topology of processing units for Intel CPUs
349 * using CPUID Leaf 11, if supported.
350 * See:
351 * - Intel 64 Architecture Processor Topology Enumeration
352 * - Intel 64 and IA-32 Architectures Software Developer’s Manual,
353 * Volume 3A: System Programming Guide, PROGRAMMING CONSIDERATIONS
354 * FOR HARDWARE MULTI-THREADING CAPABLE PROCESSORS
355 */
356 static void
357 topo_probe_intel_0xb(void)
358 {
359 u_int p[4];
360 int bits;
361 int type;
362 int i;
363
364 /* Fall back if CPU leaf 11 doesn't really exist. */
365 cpuid_count(0x0b, 0, p);
366 if (p[1] == 0) {
367 topo_probe_intel_0x4();
368 return;
369 }
370
371 /* We only support three levels for now. */
372 for (i = 0; ; i++) {
373 cpuid_count(0x0b, i, p);
374
375 bits = p[0] & 0x1f;
376 type = (p[2] >> 8) & 0xff;
377
378 if (type == 0)
379 break;
380
381 /* TODO: check for duplicate (re-)assignment */
382 if (type == CPUID_TYPE_SMT)
383 core_id_shift = bits;
384 else if (type == CPUID_TYPE_CORE)
385 pkg_id_shift = bits;
386 else
387 printf("unknown CPU level type %d\n", type);
388 }
389
390 if (pkg_id_shift < core_id_shift) {
391 printf("WARNING: core covers more APIC IDs than a package\n");
392 core_id_shift = pkg_id_shift;
393 }
394 }
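/*
 * For example, a part whose leaf 0xb reports an SMT level with a shift
 * of 1 (sub-leaf 0) and a core level with a shift of 4 (sub-leaf 1)
 * ends up with core_id_shift == 1 and pkg_id_shift == 4, matching the
 * leaf 1/leaf 4 example above.
 */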
395
396 /*
397 * Determine topology of caches for Intel CPUs.
398 * See:
399 * - Intel 64 Architecture Processor Topology Enumeration
400 * - Intel 64 and IA-32 Architectures Software Developer’s Manual
401 * Volume 2A: Instruction Set Reference, A-M,
402 * CPUID instruction
403 */
404 static void
405 topo_probe_intel_caches(void)
406 {
407 u_int p[4];
408 int level;
409 int share_count;
410 int type;
411 int i;
412
413 if (cpu_high < 0x4) {
414 /*
415 * Available cache level and sizes can be determined
416 * via CPUID leaf 2, but that requires a huge table of hardcoded
417 * values, so for now just assume L1 and L2 caches potentially
418 * shared only by HTT processing units, if HTT is present.
419 */
420 caches[0].id_shift = pkg_id_shift;
421 caches[0].present = 1;
422 caches[1].id_shift = pkg_id_shift;
423 caches[1].present = 1;
424 return;
425 }
426
427 for (i = 0; ; i++) {
428 cpuid_count(0x4, i, p);
429 type = p[0] & 0x1f;
430 level = (p[0] >> 5) & 0x7;
431 share_count = 1 + ((p[0] >> 14) & 0xfff);
432
433 if (!add_deterministic_cache(type, level, share_count))
434 break;
435 }
436 }
437
438 /*
439 * Determine topology of processing units and caches for Intel CPUs.
440 * See:
441 * - Intel 64 Architecture Processor Topology Enumeration
442 */
443 static void
444 topo_probe_intel(void)
445 {
446
447 /*
448 * Note that the 0x1 <= cpu_high < 4 case should be
449 * compatible with the topo_probe_intel_0x4() logic when
450 * CPUID.1:EBX[23:16] > 0 (cpu_cores will be 1)
451 * or it should trigger the fallback otherwise.
452 */
453 if (cpu_high >= 0xb)
454 topo_probe_intel_0xb();
455 else if (cpu_high >= 0x1)
456 topo_probe_intel_0x4();
457
458 topo_probe_intel_caches();
459 }
460
461 /*
462 * Topology information is queried only on the BSP, on which this
463 * code runs and for which it can query CPUID information.
464 * The topology is then extrapolated to all packages using the
465 * assumption that the APIC ID to hardware component ID mapping is
466 * homogeneous.
467 * That doesn't necessarily imply that the topology is uniform.
468 */
469 void
470 topo_probe(void)
471 {
472 static int cpu_topo_probed = 0;
473 struct x86_topo_layer {
474 int type;
475 int subtype;
476 int id_shift;
477 } topo_layers[MAX_CACHE_LEVELS + 3];
478 struct topo_node *parent;
479 struct topo_node *node;
480 int layer;
481 int nlayers;
482 int node_id;
483 int i;
484
485 if (cpu_topo_probed)
486 return;
487
488 CPU_ZERO(&logical_cpus_mask);
489
490 if (mp_ncpus <= 1)
491 ; /* nothing */
492 else if (cpu_vendor_id == CPU_VENDOR_AMD)
493 topo_probe_amd();
494 else if (cpu_vendor_id == CPU_VENDOR_INTEL)
495 topo_probe_intel();
496
497 KASSERT(pkg_id_shift >= core_id_shift,
498 ("bug in APIC topology discovery"));
499
500 nlayers = 0;
501 bzero(topo_layers, sizeof(topo_layers));
502
503 topo_layers[nlayers].type = TOPO_TYPE_PKG;
504 topo_layers[nlayers].id_shift = pkg_id_shift;
505 if (bootverbose)
506 printf("Package ID shift: %u\n", topo_layers[nlayers].id_shift);
507 nlayers++;
508
509 /*
510 * Consider all caches to be within a package/chip
511 * and "in front" of all sub-components like
512 * cores and hardware threads.
513 */
514 for (i = MAX_CACHE_LEVELS - 1; i >= 0; --i) {
515 if (caches[i].present) {
516 KASSERT(caches[i].id_shift <= pkg_id_shift,
517 ("bug in APIC topology discovery"));
518 KASSERT(caches[i].id_shift >= core_id_shift,
519 ("bug in APIC topology discovery"));
520
521 topo_layers[nlayers].type = TOPO_TYPE_CACHE;
522 topo_layers[nlayers].subtype = i + 1;
523 topo_layers[nlayers].id_shift = caches[i].id_shift;
524 if (bootverbose)
525 printf("L%u cache ID shift: %u\n",
526 topo_layers[nlayers].subtype,
527 topo_layers[nlayers].id_shift);
528 nlayers++;
529 }
530 }
531
532 if (pkg_id_shift > core_id_shift) {
533 topo_layers[nlayers].type = TOPO_TYPE_CORE;
534 topo_layers[nlayers].id_shift = core_id_shift;
535 if (bootverbose)
536 printf("Core ID shift: %u\n",
537 topo_layers[nlayers].id_shift);
538 nlayers++;
539 }
540
541 topo_layers[nlayers].type = TOPO_TYPE_PU;
542 topo_layers[nlayers].id_shift = 0;
543 nlayers++;
544
545 topo_init_root(&topo_root);
546 for (i = 0; i <= MAX_APIC_ID; ++i) {
547 if (!cpu_info[i].cpu_present)
548 continue;
549
550 parent = &topo_root;
551 for (layer = 0; layer < nlayers; ++layer) {
552 node_id = i >> topo_layers[layer].id_shift;
553 parent = topo_add_node_by_hwid(parent, node_id,
554 topo_layers[layer].type,
555 topo_layers[layer].subtype);
556 }
557 }
558
559 parent = &topo_root;
560 for (layer = 0; layer < nlayers; ++layer) {
561 node_id = boot_cpu_id >> topo_layers[layer].id_shift;
562 node = topo_find_node_by_hwid(parent, node_id,
563 topo_layers[layer].type,
564 topo_layers[layer].subtype);
565 topo_promote_child(node);
566 parent = node;
567 }
568
569 cpu_topo_probed = 1;
570 }
571
572 /*
573 * Assign logical CPU IDs to local APICs.
574 */
575 void
576 assign_cpu_ids(void)
577 {
578 struct topo_node *node;
579 u_int smt_mask;
580
581 smt_mask = (1u << core_id_shift) - 1;
582
583 /*
584 * Assign CPU IDs to local APIC IDs and disable any CPUs
585 * beyond MAXCPU. CPU 0 is always assigned to the BSP.
586 */
587 mp_ncpus = 0;
588 TOPO_FOREACH(node, &topo_root) {
589 if (node->type != TOPO_TYPE_PU)
590 continue;
591
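/*
 * A PU whose SMT-thread index within its core (hwid & smt_mask)
 * differs from the BSP's is reported as a hyperthread; the BSP's
 * own thread index is treated as the primary thread of every core.
 */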
592 if ((node->hwid & smt_mask) != (boot_cpu_id & smt_mask))
593 cpu_info[node->hwid].cpu_hyperthread = 1;
594
595 if (resource_disabled("lapic", node->hwid)) {
596 if (node->hwid != boot_cpu_id)
597 cpu_info[node->hwid].cpu_disabled = 1;
598 else
599 printf("Cannot disable BSP, APIC ID = %d\n",
600 node->hwid);
601 }
602
603 if (!hyperthreading_allowed &&
604 cpu_info[node->hwid].cpu_hyperthread)
605 cpu_info[node->hwid].cpu_disabled = 1;
606
607 if (mp_ncpus >= MAXCPU)
608 cpu_info[node->hwid].cpu_disabled = 1;
609
610 if (cpu_info[node->hwid].cpu_disabled) {
611 disabled_cpus++;
612 continue;
613 }
614
615 cpu_apic_ids[mp_ncpus] = node->hwid;
616 apic_cpuids[node->hwid] = mp_ncpus;
617 topo_set_pu_id(node, mp_ncpus);
618 mp_ncpus++;
619 }
620
621 KASSERT(mp_maxid >= mp_ncpus - 1,
622 ("%s: counters out of sync: max %d, count %d", __func__, mp_maxid,
623 mp_ncpus));
624 }
625
626 /*
627 * Print various information about the SMP system hardware and setup.
628 */
629 void
630 cpu_mp_announce(void)
631 {
632 struct topo_node *node;
633 const char *hyperthread;
634 int pkg_count;
635 int cores_per_pkg;
636 int thrs_per_core;
637
638 printf("FreeBSD/SMP: ");
639 if (topo_analyze(&topo_root, 1, &pkg_count,
640 &cores_per_pkg, &thrs_per_core)) {
641 printf("%d package(s)", pkg_count);
642 if (cores_per_pkg > 0)
643 printf(" x %d core(s)", cores_per_pkg);
644 if (thrs_per_core > 1)
645 printf(" x %d hardware threads", thrs_per_core);
646 } else {
647 printf("Non-uniform topology");
648 }
649 printf("\n");
650
651 if (disabled_cpus) {
652 printf("FreeBSD/SMP Online: ");
653 if (topo_analyze(&topo_root, 0, &pkg_count,
654 &cores_per_pkg, &thrs_per_core)) {
655 printf("%d package(s)", pkg_count);
656 if (cores_per_pkg > 0)
657 printf(" x %d core(s)", cores_per_pkg);
658 if (thrs_per_core > 1)
659 printf(" x %d hardware threads", thrs_per_core);
660 } else {
661 printf("Non-uniform topology");
662 }
663 printf("\n");
664 }
665
666 if (!bootverbose)
667 return;
668
669 TOPO_FOREACH(node, &topo_root) {
670 switch (node->type) {
671 case TOPO_TYPE_PKG:
672 printf("Package HW ID = %u (%#x)\n",
673 node->hwid, node->hwid);
674 break;
675 case TOPO_TYPE_CORE:
676 printf("\tCore HW ID = %u (%#x)\n",
677 node->hwid, node->hwid);
678 break;
679 case TOPO_TYPE_PU:
680 if (cpu_info[node->hwid].cpu_hyperthread)
681 hyperthread = "/HT";
682 else
683 hyperthread = "";
684
685 if (node->subtype == 0)
686 printf("\t\tCPU (AP%s): APIC ID: %u (%#x)"
687 "(disabled)\n", hyperthread, node->hwid,
688 node->hwid);
689 else if (node->id == 0)
690 printf("\t\tCPU0 (BSP): APIC ID: %u (%#x)\n",
691 node->hwid, node->hwid);
692 else
693 printf("\t\tCPU%u (AP%s): APIC ID: %u (%#x)\n",
694 node->id, hyperthread, node->hwid,
695 node->hwid);
696 break;
697 default:
698 /* ignored */
699 break;
700 }
701 }
702 }
703
704 /*
705 * Add a scheduling group, a group of logical processors sharing
706 * a particular cache (and, thus having an affinity), to the scheduling
707 * topology.
708 * This function recursively works on lower level caches.
709 */
710 static void
711 x86topo_add_sched_group(struct topo_node *root, struct cpu_group *cg_root)
712 {
713 struct topo_node *node;
714 int nchildren;
715 int ncores;
716 int i;
717
718 KASSERT(root->type == TOPO_TYPE_SYSTEM || root->type == TOPO_TYPE_CACHE,
719 ("x86topo_add_sched_group: bad type: %u", root->type));
720 CPU_COPY(&root->cpuset, &cg_root->cg_mask);
721 cg_root->cg_count = root->cpu_count;
722 if (root->type == TOPO_TYPE_SYSTEM)
723 cg_root->cg_level = CG_SHARE_NONE;
724 else
725 cg_root->cg_level = root->subtype;
726
727 /*
728 * Check how many core nodes we have under the given root node.
729 * If we have multiple logical processors, but not multiple
730 * cores, then those processors must be hardware threads.
731 */
732 ncores = 0;
733 node = root;
734 while (node != NULL) {
735 if (node->type != TOPO_TYPE_CORE) {
736 node = topo_next_node(root, node);
737 continue;
738 }
739
740 ncores++;
741 node = topo_next_nonchild_node(root, node);
742 }
743
744 if (cg_root->cg_level != CG_SHARE_NONE &&
745 root->cpu_count > 1 && ncores < 2)
746 cg_root->cg_flags = CG_FLAG_SMT;
747
748 /*
749 * Find out how many cache nodes we have under the given root node.
750 * We ignore cache nodes that cover all the same processors as the
751 * root node. Also, we do not descend below found cache nodes.
752 * That is, we count top-level "non-redundant" caches under the root
753 * node.
754 */
755 nchildren = 0;
756 node = root;
757 while (node != NULL) {
758 if (node->type != TOPO_TYPE_CACHE ||
759 (root->type != TOPO_TYPE_SYSTEM &&
760 CPU_CMP(&node->cpuset, &root->cpuset) == 0)) {
761 node = topo_next_node(root, node);
762 continue;
763 }
764 nchildren++;
765 node = topo_next_nonchild_node(root, node);
766 }
767
768 cg_root->cg_child = smp_topo_alloc(nchildren);
769 cg_root->cg_children = nchildren;
770
771 /*
772 * Now find again the same cache nodes as above and recursively
773 * build scheduling topologies for them.
774 */
775 node = root;
776 i = 0;
777 while (node != NULL) {
778 if (node->type != TOPO_TYPE_CACHE ||
779 (root->type != TOPO_TYPE_SYSTEM &&
780 CPU_CMP(&node->cpuset, &root->cpuset) == 0)) {
781 node = topo_next_node(root, node);
782 continue;
783 }
784 cg_root->cg_child[i].cg_parent = cg_root;
785 x86topo_add_sched_group(node, &cg_root->cg_child[i]);
786 i++;
787 node = topo_next_nonchild_node(root, node);
788 }
789 }
790
791 /*
792 * Build the MI scheduling topology from the discovered hardware topology.
793 */
794 struct cpu_group *
795 cpu_topo(void)
796 {
797 struct cpu_group *cg_root;
798
799 if (mp_ncpus <= 1)
800 return (smp_topo_none());
801
802 cg_root = smp_topo_alloc(1);
803 x86topo_add_sched_group(&topo_root, cg_root);
804 return (cg_root);
805 }
806
807
808 /*
809 * Add a logical CPU to the topology.
810 */
811 void
812 cpu_add(u_int apic_id, char boot_cpu)
813 {
814
815 if (apic_id > MAX_APIC_ID) {
816 panic("SMP: APIC ID %d too high", apic_id);
817 return;
818 }
819 KASSERT(cpu_info[apic_id].cpu_present == 0, ("CPU %d added twice",
820 apic_id));
821 cpu_info[apic_id].cpu_present = 1;
822 if (boot_cpu) {
823 KASSERT(boot_cpu_id == -1,
824 ("CPU %d claims to be BSP, but CPU %d already is", apic_id,
825 boot_cpu_id));
826 boot_cpu_id = apic_id;
827 cpu_info[apic_id].cpu_bsp = 1;
828 }
829 if (mp_ncpus < MAXCPU) {
830 mp_ncpus++;
831 mp_maxid = mp_ncpus - 1;
832 }
833 if (bootverbose)
834 printf("SMP: Added CPU %d (%s)\n", apic_id, boot_cpu ? "BSP" :
835 "AP");
836 }
837
838 void
839 cpu_mp_setmaxid(void)
840 {
841
842 /*
843 * mp_ncpus and mp_maxid should be already set by calls to cpu_add().
844 * If there were no calls to cpu_add() assume this is a UP system.
845 */
846 if (mp_ncpus == 0)
847 mp_ncpus = 1;
848 }
849
850 int
851 cpu_mp_probe(void)
852 {
853
854 /*
855 * Always record BSP in CPU map so that the mbuf init code works
856 * correctly.
857 */
858 CPU_SETOF(0, &all_cpus);
859 return (mp_ncpus > 1);
860 }
861
862 /*
863 * AP CPUs call this to initialize themselves.
864 */
865 void
866 init_secondary_tail(void)
867 {
868 u_int cpuid;
869
870 pmap_activate_boot(vmspace_pmap(proc0.p_vmspace));
871
872 /*
873 * On real hardware, switch to x2apic mode if possible. Do it
874 * after aps_ready was signalled, to avoid manipulating the
875 * mode while BSP might still want to send some IPI to us
876 * (second startup IPI is ignored on modern hardware etc).
877 */
878 lapic_xapic_mode();
879
880 /* Initialize the PAT MSR. */
881 pmap_init_pat();
882
883 /* set up CPU registers and state */
884 cpu_setregs();
885
886 /* set up SSE/NX */
887 initializecpu();
888
889 /* set up FPU state on the AP */
890 #ifdef __amd64__
891 fpuinit();
892 #else
893 npxinit(false);
894 #endif
895
896 if (cpu_ops.cpu_init)
897 cpu_ops.cpu_init();
898
899 /* A quick check from sanity claus */
900 cpuid = PCPU_GET(cpuid);
901 if (PCPU_GET(apic_id) != lapic_id()) {
902 printf("SMP: cpuid = %d\n", cpuid);
903 printf("SMP: actual apic_id = %d\n", lapic_id());
904 printf("SMP: correct apic_id = %d\n", PCPU_GET(apic_id));
905 panic("cpuid mismatch! boom!!");
906 }
907
908 /* Initialize curthread. */
909 KASSERT(PCPU_GET(idlethread) != NULL, ("no idle thread"));
910 PCPU_SET(curthread, PCPU_GET(idlethread));
911
912 mtx_lock_spin(&ap_boot_mtx);
913
914 mca_init();
915
916 /* Init local apic for irq's */
917 lapic_setup(1);
918
919 /* Set memory range attributes for this CPU to match the BSP */
920 mem_range_AP_init();
921
922 smp_cpus++;
923
924 CTR1(KTR_SMP, "SMP: AP CPU #%d Launched", cpuid);
925 printf("SMP: AP CPU #%d Launched!\n", cpuid);
926
927 /* Determine if we are a logical CPU. */
928 if (cpu_info[PCPU_GET(apic_id)].cpu_hyperthread)
929 CPU_SET(cpuid, &logical_cpus_mask);
930
931 if (bootverbose)
932 lapic_dump("AP");
933
934 if (smp_cpus == mp_ncpus) {
935 /* enable IPI's, tlb shootdown, freezes etc */
936 atomic_store_rel_int(&smp_started, 1);
937 }
938
939 #ifdef __amd64__
940 /*
941 * Enable global pages TLB extension
942 * This also implicitly flushes the TLB
943 */
944 load_cr4(rcr4() | CR4_PGE);
945 if (pmap_pcid_enabled)
946 load_cr4(rcr4() | CR4_PCIDE);
947 load_ds(_udatasel);
948 load_es(_udatasel);
949 load_fs(_ufssel);
950 #endif
951
952 mtx_unlock_spin(&ap_boot_mtx);
953
954 /* Wait until all the APs are up. */
955 while (atomic_load_acq_int(&smp_started) == 0)
956 ia32_pause();
957
958 #ifndef EARLY_AP_STARTUP
959 /* Start per-CPU event timers. */
960 cpu_initclocks_ap();
961 #endif
962
963 sched_throw(NULL);
964
965 panic("scheduler returned us to %s", __func__);
966 /* NOTREACHED */
967 }
968
969 /*******************************************************************
970 * local functions and data
971 */
972
973 /*
974 * We tell the I/O APIC code about all the CPUs that we want to receive
975 * interrupts. If we don't want certain CPUs to receive IRQs we
976 * can simply not tell the I/O APIC code about them in this function.
977 * We also do not tell it about the BSP since it tells itself about
978 * the BSP internally to work with UP kernels and on UP machines.
979 */
980 void
981 set_interrupt_apic_ids(void)
982 {
983 u_int i, apic_id;
984
985 for (i = 0; i < MAXCPU; i++) {
986 apic_id = cpu_apic_ids[i];
987 if (apic_id == -1)
988 continue;
989 if (cpu_info[apic_id].cpu_bsp)
990 continue;
991 if (cpu_info[apic_id].cpu_disabled)
992 continue;
993
994 /* Don't let hyperthreads service interrupts. */
995 if (cpu_info[apic_id].cpu_hyperthread)
996 continue;
997
998 intr_add_cpu(i);
999 }
1000 }
1001
1002
1003 #ifdef COUNT_XINVLTLB_HITS
1004 u_int xhits_gbl[MAXCPU];
1005 u_int xhits_pg[MAXCPU];
1006 u_int xhits_rng[MAXCPU];
1007 static SYSCTL_NODE(_debug, OID_AUTO, xhits, CTLFLAG_RW, 0, "");
1008 SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, global, CTLFLAG_RW, &xhits_gbl,
1009 sizeof(xhits_gbl), "IU", "");
1010 SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, page, CTLFLAG_RW, &xhits_pg,
1011 sizeof(xhits_pg), "IU", "");
1012 SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, range, CTLFLAG_RW, &xhits_rng,
1013 sizeof(xhits_rng), "IU", "");
1014
1015 u_int ipi_global;
1016 u_int ipi_page;
1017 u_int ipi_range;
1018 u_int ipi_range_size;
1019 SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_global, CTLFLAG_RW, &ipi_global, 0, "");
1020 SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_page, CTLFLAG_RW, &ipi_page, 0, "");
1021 SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range, CTLFLAG_RW, &ipi_range, 0, "");
1022 SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range_size, CTLFLAG_RW, &ipi_range_size,
1023 0, "");
1024 #endif /* COUNT_XINVLTLB_HITS */
1025
1026 /*
1027 * Init and startup IPI.
1028 */
1029 void
1030 ipi_startup(int apic_id, int vector)
1031 {
1032
1033 /*
1034 * This attempts to follow the algorithm described in the
1035 * Intel Multiprocessor Specification v1.4 in section B.4.
1036 * For each IPI, we allow the local APIC ~20us to deliver the
1037 * IPI. If that times out, we panic.
1038 */
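/*
 * The STARTUP vector supplied by the caller selects where the AP
 * begins execution: the AP starts in real mode at physical address
 * (vector << 12), i.e. at the 4 KB page identified by the vector.
 */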
1039
1040 /*
1041 * first we do an INIT IPI: this INIT IPI might be run, resetting
1042 * and running the target CPU. OR this INIT IPI might be latched (P5
1043 * bug), CPU waiting for STARTUP IPI. OR this INIT IPI might be
1044 * ignored.
1045 */
1046 lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_LEVEL |
1047 APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, apic_id);
1048 lapic_ipi_wait(100);
1049
1050 /* Explicitly deassert the INIT IPI. */
1051 lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_LEVEL |
1052 APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT,
1053 apic_id);
1054
1055 DELAY(10000); /* wait ~10ms */
1056
1057 /*
1058 * next we do a STARTUP IPI: the previous INIT IPI might still be
1059 * latched (P5 bug); this 1st STARTUP would then terminate
1060 * immediately, and the previously started INIT IPI would continue. OR
1061 * the previous INIT IPI has already run, and this STARTUP IPI will
1062 * run. OR the previous INIT IPI was ignored, and this STARTUP IPI
1063 * will run.
1064 */
1065 lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
1066 APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP |
1067 vector, apic_id);
1068 if (!lapic_ipi_wait(100))
1069 panic("Failed to deliver first STARTUP IPI to APIC %d",
1070 apic_id);
1071 DELAY(200); /* wait ~200us */
1072
1073 /*
1074 * finally we do a 2nd STARTUP IPI: this 2nd STARTUP IPI should run IF
1075 * the previous STARTUP IPI was cancelled by a latched INIT IPI. OR
1076 * this STARTUP IPI will be ignored, as only ONE STARTUP IPI is
1077 * recognized after hardware RESET or INIT IPI.
1078 */
1079 lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
1080 APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP |
1081 vector, apic_id);
1082 if (!lapic_ipi_wait(100))
1083 panic("Failed to deliver second STARTUP IPI to APIC %d",
1084 apic_id);
1085
1086 DELAY(200); /* wait ~200us */
1087 }
1088
1089 /*
1090 * Send an IPI to the specified CPU, handling the bitmap logic.
1091 */
1092 void
1093 ipi_send_cpu(int cpu, u_int ipi)
1094 {
1095 u_int bitmap, old_pending, new_pending;
1096
1097 KASSERT(cpu_apic_ids[cpu] != -1, ("IPI to non-existent CPU %d", cpu));
1098
1099 if (IPI_IS_BITMAPED(ipi)) {
1100 bitmap = 1 << ipi;
1101 ipi = IPI_BITMAP_VECTOR;
1102 do {
1103 old_pending = cpu_ipi_pending[cpu];
1104 new_pending = old_pending | bitmap;
1105 } while (!atomic_cmpset_int(&cpu_ipi_pending[cpu],
1106 old_pending, new_pending));
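/*
 * If other bitmapped IPIs were already pending for this CPU,
 * the IPI_BITMAP_VECTOR interrupt has already been sent and not
 * yet handled, so its handler will pick up the new bit as well;
 * avoid sending a redundant vector.
 */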
1107 if (old_pending)
1108 return;
1109 }
1110 lapic_ipi_vectored(ipi, cpu_apic_ids[cpu]);
1111 }
1112
1113 void
1114 ipi_bitmap_handler(struct trapframe frame)
1115 {
1116 struct trapframe *oldframe;
1117 struct thread *td;
1118 int cpu = PCPU_GET(cpuid);
1119 u_int ipi_bitmap;
1120
1121 critical_enter();
1122 td = curthread;
1123 td->td_intr_nesting_level++;
1124 oldframe = td->td_intr_frame;
1125 td->td_intr_frame = &frame;
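/* Atomically fetch and clear all bitmapped IPIs pending for this CPU. */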
1126 ipi_bitmap = atomic_readandclear_int(&cpu_ipi_pending[cpu]);
1127 if (ipi_bitmap & (1 << IPI_PREEMPT)) {
1128 #ifdef COUNT_IPIS
1129 (*ipi_preempt_counts[cpu])++;
1130 #endif
1131 sched_preempt(td);
1132 }
1133 if (ipi_bitmap & (1 << IPI_AST)) {
1134 #ifdef COUNT_IPIS
1135 (*ipi_ast_counts[cpu])++;
1136 #endif
1137 /* Nothing to do for AST */
1138 }
1139 if (ipi_bitmap & (1 << IPI_HARDCLOCK)) {
1140 #ifdef COUNT_IPIS
1141 (*ipi_hardclock_counts[cpu])++;
1142 #endif
1143 hardclockintr();
1144 }
1145 td->td_intr_frame = oldframe;
1146 td->td_intr_nesting_level--;
1147 critical_exit();
1148 }
1149
1150 /*
1151 * send an IPI to a set of CPUs.
1152 */
1153 void
1154 ipi_selected(cpuset_t cpus, u_int ipi)
1155 {
1156 int cpu;
1157
1158 /*
1159 * IPI_STOP_HARD maps to an NMI and the trap handler needs a bit
1160 * of help in order to understand what the source is.
1161 * Set the mask of receiving CPUs for this purpose.
1162 */
1163 if (ipi == IPI_STOP_HARD)
1164 CPU_OR_ATOMIC(&ipi_stop_nmi_pending, &cpus);
1165
1166 while ((cpu = CPU_FFS(&cpus)) != 0) {
1167 cpu--;
1168 CPU_CLR(cpu, &cpus);
1169 CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi);
1170 ipi_send_cpu(cpu, ipi);
1171 }
1172 }
1173
1174 /*
1175 * send an IPI to a specific CPU.
1176 */
1177 void
1178 ipi_cpu(int cpu, u_int ipi)
1179 {
1180
1181 /*
1182 * IPI_STOP_HARD maps to an NMI and the trap handler needs a bit
1183 * of help in order to understand what the source is.
1184 * Set the mask of receiving CPUs for this purpose.
1185 */
1186 if (ipi == IPI_STOP_HARD)
1187 CPU_SET_ATOMIC(cpu, &ipi_stop_nmi_pending);
1188
1189 CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi);
1190 ipi_send_cpu(cpu, ipi);
1191 }
1192
1193 /*
1194 * send an IPI to all CPUs EXCEPT myself
1195 */
1196 void
1197 ipi_all_but_self(u_int ipi)
1198 {
1199 cpuset_t other_cpus;
1200
1201 other_cpus = all_cpus;
1202 CPU_CLR(PCPU_GET(cpuid), &other_cpus);
1203 if (IPI_IS_BITMAPED(ipi)) {
1204 ipi_selected(other_cpus, ipi);
1205 return;
1206 }
1207
1208 /*
1209 * IPI_STOP_HARD maps to an NMI and the trap handler needs a bit
1210 * of help in order to understand what the source is.
1211 * Set the mask of receiving CPUs for this purpose.
1212 */
1213 if (ipi == IPI_STOP_HARD)
1214 CPU_OR_ATOMIC(&ipi_stop_nmi_pending, &other_cpus);
1215
1216 CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi);
1217 lapic_ipi_vectored(ipi, APIC_IPI_DEST_OTHERS);
1218 }
1219
1220 int
1221 ipi_nmi_handler(void)
1222 {
1223 u_int cpuid;
1224
1225 /*
1226 * As long as there is not a simple way to know about an NMI's
1227 * source, if the bitmask for the current CPU is present in
1228 * the global pending bitword, an IPI_STOP_HARD has been issued
1229 * and should be handled.
1230 */
1231 cpuid = PCPU_GET(cpuid);
1232 if (!CPU_ISSET(cpuid, &ipi_stop_nmi_pending))
1233 return (1);
1234
1235 CPU_CLR_ATOMIC(cpuid, &ipi_stop_nmi_pending);
1236 cpustop_handler();
1237 return (0);
1238 }
1239
1240 int nmi_kdb_lock;
1241
1242 void
1243 nmi_call_kdb_smp(u_int type, struct trapframe *frame)
1244 {
1245 int cpu;
1246 bool call_post;
1247
1248 cpu = PCPU_GET(cpuid);
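/*
 * The first CPU to claim nmi_kdb_lock enters the debugger directly;
 * the others save their context, mark themselves stopped, and wait
 * for the lock before running the usual IPI_STOP restart processing.
 */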
1249 if (atomic_cmpset_acq_int(&nmi_kdb_lock, 0, 1)) {
1250 nmi_call_kdb(cpu, type, frame);
1251 call_post = false;
1252 } else {
1253 savectx(&stoppcbs[cpu]);
1254 CPU_SET_ATOMIC(cpu, &stopped_cpus);
1255 while (!atomic_cmpset_acq_int(&nmi_kdb_lock, 0, 1))
1256 ia32_pause();
1257 call_post = true;
1258 }
1259 atomic_store_rel_int(&nmi_kdb_lock, 0);
1260 if (call_post)
1261 cpustop_handler_post(cpu);
1262 }
1263
1264 /*
1265 * Handle an IPI_STOP by saving our current context and spinning until we
1266 * are resumed.
1267 */
1268 void
1269 cpustop_handler(void)
1270 {
1271 u_int cpu;
1272
1273 cpu = PCPU_GET(cpuid);
1274
1275 savectx(&stoppcbs[cpu]);
1276
1277 /* Indicate that we are stopped */
1278 CPU_SET_ATOMIC(cpu, &stopped_cpus);
1279
1280 /* Wait for restart */
1281 while (!CPU_ISSET(cpu, &started_cpus))
1282 ia32_pause();
1283
1284 cpustop_handler_post(cpu);
1285 }
1286
1287 static void
1288 cpustop_handler_post(u_int cpu)
1289 {
1290
1291 CPU_CLR_ATOMIC(cpu, &started_cpus);
1292 CPU_CLR_ATOMIC(cpu, &stopped_cpus);
1293
1294 #if defined(__amd64__) && defined(DDB)
1295 amd64_db_resume_dbreg();
1296 #endif
1297
1298 if (cpu == 0 && cpustop_restartfunc != NULL) {
1299 cpustop_restartfunc();
1300 cpustop_restartfunc = NULL;
1301 }
1302 }
1303
1304 /*
1305 * Handle an IPI_SUSPEND by saving our current context and spinning until we
1306 * are resumed.
1307 */
1308 void
1309 cpususpend_handler(void)
1310 {
1311 u_int cpu;
1312
1313 mtx_assert(&smp_ipi_mtx, MA_NOTOWNED);
1314
1315 cpu = PCPU_GET(cpuid);
1316 if (savectx(&susppcbs[cpu]->sp_pcb)) {
1317 #ifdef __amd64__
1318 fpususpend(susppcbs[cpu]->sp_fpususpend);
1319 #else
1320 npxsuspend(susppcbs[cpu]->sp_fpususpend);
1321 #endif
1322 /*
1323 * suspended_cpus is cleared shortly after each AP is restarted
1324 * by a Startup IPI, so that the BSP can proceed to restarting
1325 * the next AP.
1326 *
1327 * resuming_cpus gets cleared when the AP completes
1328 * initialization after having been released by the BSP.
1329 * resuming_cpus is probably not the best name for the
1330 * variable, because it is actually a set of processors that
1331 * haven't resumed yet and haven't necessarily started resuming.
1332 *
1333 * Note that suspended_cpus is meaningful only for ACPI suspend
1334 * as it's not really used for Xen suspend since the APs are
1335 * automatically restored to the running state and the correct
1336 * context. For the same reason resumectx is never called in
1337 * that case.
1338 */
1339 CPU_SET_ATOMIC(cpu, &suspended_cpus);
1340 CPU_SET_ATOMIC(cpu, &resuming_cpus);
1341
1342 /*
1343 * Invalidate the cache after setting the global status bits.
1344 * The last AP to set its bit may end up being an Owner of the
1345 * corresponding cache line in MOESI protocol. The AP may be
1346 * stopped before the cache line is written to the main memory.
1347 */
1348 wbinvd();
1349 } else {
1350 #ifdef __amd64__
1351 fpuresume(susppcbs[cpu]->sp_fpususpend);
1352 #else
1353 npxresume(susppcbs[cpu]->sp_fpususpend);
1354 #endif
1355 pmap_init_pat();
1356 initializecpu();
1357 PCPU_SET(switchtime, 0);
1358 PCPU_SET(switchticks, ticks);
1359
1360 /* Indicate that we have restarted and restored the context. */
1361 CPU_CLR_ATOMIC(cpu, &suspended_cpus);
1362 }
1363
1364 /* Wait for resume directive */
1365 while (!CPU_ISSET(cpu, &toresume_cpus))
1366 ia32_pause();
1367
1368 /* Re-apply microcode updates. */
1369 ucode_reload();
1370
1371 if (cpu_ops.cpu_resume)
1372 cpu_ops.cpu_resume();
1373 #ifdef __amd64__
1374 if (vmm_resume_p)
1375 vmm_resume_p();
1376 #endif
1377
1378 /* Resume MCA and local APIC */
1379 lapic_xapic_mode();
1380 mca_resume();
1381 lapic_setup(0);
1382
1383 /* Indicate that we are resumed */
1384 CPU_CLR_ATOMIC(cpu, &resuming_cpus);
1385 CPU_CLR_ATOMIC(cpu, &suspended_cpus);
1386 CPU_CLR_ATOMIC(cpu, &toresume_cpus);
1387 }
1388
1389
1390 void
1391 invlcache_handler(void)
1392 {
1393 uint32_t generation;
1394
1395 #ifdef COUNT_IPIS
1396 (*ipi_invlcache_counts[PCPU_GET(cpuid)])++;
1397 #endif /* COUNT_IPIS */
1398
1399 /*
1400 * Reading the generation here allows greater parallelism
1401 * since wbinvd is a serializing instruction. Without the
1402 * temporary, we'd wait for wbinvd to complete, then the read
1403 * would execute, then the dependent write, which must then
1404 * complete before return from interrupt.
1405 */
1406 generation = smp_tlb_generation;
1407 wbinvd();
1408 PCPU_SET(smp_tlb_done, generation);
1409 }
1410
1411 /*
1412 * This is called once the rest of the system is up and running and we're
1413 * ready to let the APs out of the pen.
1414 */
1415 static void
1416 release_aps(void *dummy __unused)
1417 {
1418
1419 if (mp_ncpus == 1)
1420 return;
1421 atomic_store_rel_int(&aps_ready, 1);
1422 while (smp_started == 0)
1423 ia32_pause();
1424 }
1425 SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL);
1426
1427 #ifdef COUNT_IPIS
1428 /*
1429 * Setup interrupt counters for IPI handlers.
1430 */
1431 static void
1432 mp_ipi_intrcnt(void *dummy)
1433 {
1434 char buf[64];
1435 int i;
1436
1437 CPU_FOREACH(i) {
1438 snprintf(buf, sizeof(buf), "cpu%d:invltlb", i);
1439 intrcnt_add(buf, &ipi_invltlb_counts[i]);
1440 snprintf(buf, sizeof(buf), "cpu%d:invlrng", i);
1441 intrcnt_add(buf, &ipi_invlrng_counts[i]);
1442 snprintf(buf, sizeof(buf), "cpu%d:invlpg", i);
1443 intrcnt_add(buf, &ipi_invlpg_counts[i]);
1444 snprintf(buf, sizeof(buf), "cpu%d:invlcache", i);
1445 intrcnt_add(buf, &ipi_invlcache_counts[i]);
1446 snprintf(buf, sizeof(buf), "cpu%d:preempt", i);
1447 intrcnt_add(buf, &ipi_preempt_counts[i]);
1448 snprintf(buf, sizeof(buf), "cpu%d:ast", i);
1449 intrcnt_add(buf, &ipi_ast_counts[i]);
1450 snprintf(buf, sizeof(buf), "cpu%d:rendezvous", i);
1451 intrcnt_add(buf, &ipi_rendezvous_counts[i]);
1452 snprintf(buf, sizeof(buf), "cpu%d:hardclock", i);
1453 intrcnt_add(buf, &ipi_hardclock_counts[i]);
1454 }
1455 }
1456 SYSINIT(mp_ipi_intrcnt, SI_SUB_INTR, SI_ORDER_MIDDLE, mp_ipi_intrcnt, NULL);
1457 #endif
1458
1459 /*
1460 * Flush the TLB on other CPUs
1461 */
1462
1463 /* Variables needed for SMP tlb shootdown. */
1464 vm_offset_t smp_tlb_addr1, smp_tlb_addr2;
1465 pmap_t smp_tlb_pmap;
1466 volatile uint32_t smp_tlb_generation;
1467
1468 #ifdef __amd64__
1469 #define read_eflags() read_rflags()
1470 #endif
1471
1472 static void
1473 smp_targeted_tlb_shootdown(cpuset_t mask, u_int vector, pmap_t pmap,
1474 vm_offset_t addr1, vm_offset_t addr2)
1475 {
1476 cpuset_t other_cpus;
1477 volatile uint32_t *p_cpudone;
1478 uint32_t generation;
1479 int cpu;
1480
1481 /*
1482 * Check for other CPUs. Return if none.
1483 */
1484 if (CPU_ISFULLSET(&mask)) {
1485 if (mp_ncpus <= 1)
1486 return;
1487 } else {
1488 CPU_CLR(PCPU_GET(cpuid), &mask);
1489 if (CPU_EMPTY(&mask))
1490 return;
1491 }
1492
1493 if (!(read_eflags() & PSL_I))
1494 panic("%s: interrupts disabled", __func__);
1495 mtx_lock_spin(&smp_ipi_mtx);
1496 smp_tlb_addr1 = addr1;
1497 smp_tlb_addr2 = addr2;
1498 smp_tlb_pmap = pmap;
1499 generation = ++smp_tlb_generation;
1500 if (CPU_ISFULLSET(&mask)) {
1501 ipi_all_but_self(vector);
1502 other_cpus = all_cpus;
1503 CPU_CLR(PCPU_GET(cpuid), &other_cpus);
1504 } else {
1505 other_cpus = mask;
1506 while ((cpu = CPU_FFS(&mask)) != 0) {
1507 cpu--;
1508 CPU_CLR(cpu, &mask);
1509 CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__,
1510 cpu, vector);
1511 ipi_send_cpu(cpu, vector);
1512 }
1513 }
1514 while ((cpu = CPU_FFS(&other_cpus)) != 0) {
1515 cpu--;
1516 CPU_CLR(cpu, &other_cpus);
1517 p_cpudone = &cpuid_to_pcpu[cpu]->pc_smp_tlb_done;
1518 while (*p_cpudone != generation)
1519 ia32_pause();
1520 }
1521 mtx_unlock_spin(&smp_ipi_mtx);
1522 }
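/*
 * Completion handshake: under smp_ipi_mtx the initiator publishes the
 * operands, bumps smp_tlb_generation and sends the IPIs, then spins
 * until every target CPU's pc_smp_tlb_done reaches the new generation.
 * Each handler (invltlb_handler() and friends) reads the generation,
 * performs its invalidation and stores the generation to its per-CPU
 * smp_tlb_done, signalling completion.
 */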
1523
1524 void
1525 smp_masked_invltlb(cpuset_t mask, pmap_t pmap)
1526 {
1527
1528 if (smp_started) {
1529 smp_targeted_tlb_shootdown(mask, IPI_INVLTLB, pmap, 0, 0);
1530 #ifdef COUNT_XINVLTLB_HITS
1531 ipi_global++;
1532 #endif
1533 }
1534 }
1535
1536 void
1537 smp_masked_invlpg(cpuset_t mask, vm_offset_t addr, pmap_t pmap)
1538 {
1539
1540 if (smp_started) {
1541 smp_targeted_tlb_shootdown(mask, IPI_INVLPG, pmap, addr, 0);
1542 #ifdef COUNT_XINVLTLB_HITS
1543 ipi_page++;
1544 #endif
1545 }
1546 }
1547
1548 void
1549 smp_masked_invlpg_range(cpuset_t mask, vm_offset_t addr1, vm_offset_t addr2,
1550 pmap_t pmap)
1551 {
1552
1553 if (smp_started) {
1554 smp_targeted_tlb_shootdown(mask, IPI_INVLRNG, pmap,
1555 addr1, addr2);
1556 #ifdef COUNT_XINVLTLB_HITS
1557 ipi_range++;
1558 ipi_range_size += (addr2 - addr1) / PAGE_SIZE;
1559 #endif
1560 }
1561 }
1562
1563 void
1564 smp_cache_flush(void)
1565 {
1566
1567 if (smp_started) {
1568 smp_targeted_tlb_shootdown(all_cpus, IPI_INVLCACHE, NULL,
1569 0, 0);
1570 }
1571 }
1572
1573 /*
1574 * Handlers for TLB related IPIs
1575 */
1576 void
1577 invltlb_handler(void)
1578 {
1579 uint32_t generation;
1580
1581 #ifdef COUNT_XINVLTLB_HITS
1582 xhits_gbl[PCPU_GET(cpuid)]++;
1583 #endif /* COUNT_XINVLTLB_HITS */
1584 #ifdef COUNT_IPIS
1585 (*ipi_invltlb_counts[PCPU_GET(cpuid)])++;
1586 #endif /* COUNT_IPIS */
1587
1588 /*
1589 * Reading the generation here allows greater parallelism
1590 * since invalidating the TLB is a serializing operation.
1591 */
1592 generation = smp_tlb_generation;
1593 if (smp_tlb_pmap == kernel_pmap)
1594 invltlb_glob();
1595 else
1596 invltlb();
1597 PCPU_SET(smp_tlb_done, generation);
1598 }
1599
1600 void
1601 invlpg_handler(void)
1602 {
1603 uint32_t generation;
1604
1605 #ifdef COUNT_XINVLTLB_HITS
1606 xhits_pg[PCPU_GET(cpuid)]++;
1607 #endif /* COUNT_XINVLTLB_HITS */
1608 #ifdef COUNT_IPIS
1609 (*ipi_invlpg_counts[PCPU_GET(cpuid)])++;
1610 #endif /* COUNT_IPIS */
1611
1612 generation = smp_tlb_generation; /* Overlap with serialization */
1613 invlpg(smp_tlb_addr1);
1614 PCPU_SET(smp_tlb_done, generation);
1615 }
1616
1617 void
1618 invlrng_handler(void)
1619 {
1620 vm_offset_t addr, addr2;
1621 uint32_t generation;
1622
1623 #ifdef COUNT_XINVLTLB_HITS
1624 xhits_rng[PCPU_GET(cpuid)]++;
1625 #endif /* COUNT_XINVLTLB_HITS */
1626 #ifdef COUNT_IPIS
1627 (*ipi_invlrng_counts[PCPU_GET(cpuid)])++;
1628 #endif /* COUNT_IPIS */
1629
1630 addr = smp_tlb_addr1;
1631 addr2 = smp_tlb_addr2;
1632 generation = smp_tlb_generation; /* Overlap with serialization */
1633 do {
1634 invlpg(addr);
1635 addr += PAGE_SIZE;
1636 } while (addr < addr2);
1637
1638 PCPU_SET(smp_tlb_done, generation);
1639 }